The Rabin-Karp algorithm is a string searching algorithm that uses hashing to find a sub string in a text.in this algorithm if two strings are equal, their hash values are also equal.
lets discuss using an example
T[ ] = a b c d a a d e d a
P[ ] = a a d
Σ = alphabet
Σ = {a, b, c, d, e }
a = 1 prime = 3
b = 2
c = 3
d = 4
e = 5
P[ ] hash value = 1 + 1x(3^1) + 4x(3^2)
= 40
- Now considering hash values of the text
First 3 items
a b c d a a d e d a
{a b c} = 1 + 2x(3^1) + 3x(3^2)
= 34
hash values does not match
shift by one and consider next 3 items
{b c d} = 2 + 3x3 +4x9
= 47
hash does not match
- There is a easy way to fined hashes after finding first one
old hash = v
new hash = n
last index of the pattern = l
value new comparing texts last item = u
value old texts first item =w
prime = p
n = (v - w)/p + u(p^l)
{c d a} = (47-2) +1(3^2)
3
= 24
hashes does not match
{ d a a} = 24-3 + 1x9
3
= 16
hashes does not match
{ a a d} = 16-4 + 4x9
3
= 40
Now hash values of the text and pattern matches.
lets discuss using an example
T[ ] = a b c d a a d e d a
P[ ] = a a d
Σ = alphabet
Σ = {a, b, c, d, e }
a = 1 prime = 3
b = 2
c = 3
d = 4
e = 5
P[ ] hash value = 1 + 1x(3^1) + 4x(3^2)
= 40
- Now considering hash values of the text
{a b c} = 1 + 2x(3^1) + 3x(3^2)
= 34
hash values does not match
shift by one and consider next 3 items
{b c d} = 2 + 3x3 +4x9
= 47
hash does not match
- There is a easy way to fined hashes after finding first one
old hash = v
new hash = n
last index of the pattern = l
value new comparing texts last item = u
value old texts first item =w
prime = p
n = (v - w)/p + u(p^l)
{c d a} = (47-2) +1(3^2)
3
= 24
hashes does not match
{ d a a} = 24-3 + 1x9
3
= 16
hashes does not match
{ a a d} = 16-4 + 4x9
3
= 40
Now hash values of the text and pattern matches.
pseudo code
// correctly calculates a mod b even if a < 0
function int_mod(int a, int b)
{
return (a % b + b) % b;
}
function Rabin_Karp(text[], pattern[])
{
// let n be the size of the text, m the size of the
// pattern, B - the base of the numeral system,
// and M - a big enough prime number
if(n < m) return; // no match is possible
// calculate the hash value of the pattern
hp = 0;
for(i = 0; i < m; i++)
hp = int_mod(hp * B + pattern[i], M);
// calculate the hash value of the first segment
// of the text of length m
ht = 0;
for(i = 0; i < m; i++)
ht = int_mod(ht * B + text[i], M);
if(ht == hp) check character by character if the first
segment of the text matches the pattern;
// start the "rolling hash" - for every next character in
// the text calculate the hash value of the new segment
// of length m; E = (Bm-1) modulo M
for(i = m; i < n; i++) {
ht = int_mod(ht - int_mod(text[i - m] * E, M), M);
ht = int_mod(ht * B, M);
ht = int_mod(ht + text[i], M);
if(ht == hp) check character by character if the
current segment of the text matches
the pattern;
}
}
function int_mod(int a, int b)
{
return (a % b + b) % b;
}
function Rabin_Karp(text[], pattern[])
{
// let n be the size of the text, m the size of the
// pattern, B - the base of the numeral system,
// and M - a big enough prime number
if(n < m) return; // no match is possible
// calculate the hash value of the pattern
hp = 0;
for(i = 0; i < m; i++)
hp = int_mod(hp * B + pattern[i], M);
// calculate the hash value of the first segment
// of the text of length m
ht = 0;
for(i = 0; i < m; i++)
ht = int_mod(ht * B + text[i], M);
if(ht == hp) check character by character if the first
segment of the text matches the pattern;
// start the "rolling hash" - for every next character in
// the text calculate the hash value of the new segment
// of length m; E = (Bm-1) modulo M
for(i = m; i < n; i++) {
ht = int_mod(ht - int_mod(text[i - m] * E, M), M);
ht = int_mod(ht * B, M);
ht = int_mod(ht + text[i], M);
if(ht == hp) check character by character if the
current segment of the text matches
the pattern;
}
}
Implementation in python
class RollingHash:
def __init__(self, text, sizeWord):
self.text = text
self.hash = 0
self.sizeWord = sizeWord
for i in range(0, sizeWord):
#ord maps the character to a number
#subtract out the ASCII value of "a" to start the indexing at zero
self.hash += (ord(self.text[i]) - ord("a")+1)*(26**(sizeWord - i -1))
#start index of current window
self.window_start = 0
#end of index window
self.window_end = sizeWord
def move_window(self):
if self.window_end <= len(self.text) - 1:
#remove left letter from hash value
self.hash -= (ord(self.text[self.window_start]) - ord("a")+1)*26**(self.sizeWord-1)
self.hash *= 26
self.hash += ord(self.text[self.window_end])- ord("a")+1
self.window_start += 1
self.window_end += 1
def window_text(self):
return self.text[self.window_start:self.window_end]
def rabin_karp(word, text):
if word == "" or text == "":
return None
if len(word) > len(text):
return None
rolling_hash = RollingHash(text, len(word))
word_hash = RollingHash(word, len(word))
#word_hash.move_window()
for i in range(len(text) - len(word) + 1):
if rolling_hash.hash == word_hash.hash:
if rolling_hash.window_text() == word:
return i
rolling_hash.move_window()
return None
#print rabin_karp("a", "abcdefgh")
#print rabin_karp("d", "abcdefgh")
#print rabin_karp("cupcakes", "balloonsandcupcakes")
def __init__(self, text, sizeWord):
self.text = text
self.hash = 0
self.sizeWord = sizeWord
for i in range(0, sizeWord):
#ord maps the character to a number
#subtract out the ASCII value of "a" to start the indexing at zero
self.hash += (ord(self.text[i]) - ord("a")+1)*(26**(sizeWord - i -1))
#start index of current window
self.window_start = 0
#end of index window
self.window_end = sizeWord
def move_window(self):
if self.window_end <= len(self.text) - 1:
#remove left letter from hash value
self.hash -= (ord(self.text[self.window_start]) - ord("a")+1)*26**(self.sizeWord-1)
self.hash *= 26
self.hash += ord(self.text[self.window_end])- ord("a")+1
self.window_start += 1
self.window_end += 1
def window_text(self):
return self.text[self.window_start:self.window_end]
def rabin_karp(word, text):
if word == "" or text == "":
return None
if len(word) > len(text):
return None
rolling_hash = RollingHash(text, len(word))
word_hash = RollingHash(word, len(word))
#word_hash.move_window()
for i in range(len(text) - len(word) + 1):
if rolling_hash.hash == word_hash.hash:
if rolling_hash.window_text() == word:
return i
rolling_hash.move_window()
return None
#print rabin_karp("a", "abcdefgh")
#print rabin_karp("d", "abcdefgh")
#print rabin_karp("cupcakes", "balloonsandcupcakes")
Implementation in c
#include<stdio.h>
#include<string.h>
// d is the number of characters in input alphabet
#define d 256
/* pat -> pattern
txt -> text
q -> A prime number
*/
void search(char pat[], char txt[], int q)
{
int M = strlen(pat);
int N = strlen(txt);
int i, j;
int p = 0; // hash value for pattern
int t = 0; // hash value for txt
int h = 1;
// The value of h would be "pow(d, M-1)%q"
for (i = 0; i < M-1; i++)
h = (h*d)%q;
// Calculate the hash value of pattern and first
// window of text
for (i = 0; i < M; i++)
{
p = (d*p + pat[i])%q;
t = (d*t + txt[i])%q;
// Slide the pattern over text one by one
for (i = 0; i <= N - M; i++)
{
// Check the hash values of current window of text
// and pattern. If the hash values match then only
// check for characters on by one
if ( p == t )
{
/* Check for characters one by one */
for (j = 0; j < M; j++)
{
if (txt[i+j] != pat[j])
break;
}
// if p == t and pat[0...M-1] = txt[i, i+1, ...i+M-1]
if (j == M)
printf("Pattern found at index %d \n", i);
}
// Calculate hash value for next window of text: Remove
// leading digit, add trailing digit
if ( i < N-M )
{
t = (d*(t - txt[i]*h) + txt[i+M])%q;
// We might get negative value of t, converting it
// to positive
if (t < 0)
t = (t + q);
}
}
}
/* Driver program to test above function */
int main()
{
char txt[] = "sritechviews";
char pat[] = "chvi";
int q = 101; // A prime number
search(pat, txt, q);
return 0;
}
#include<stdio.h>
#include<string.h>
// d is the number of characters in input alphabet
#define d 256
/* pat -> pattern
txt -> text
q -> A prime number
*/
void search(char pat[], char txt[], int q)
{
int M = strlen(pat);
int N = strlen(txt);
int i, j;
int p = 0; // hash value for pattern
int t = 0; // hash value for txt
int h = 1;
// The value of h would be "pow(d, M-1)%q"
for (i = 0; i < M-1; i++)
h = (h*d)%q;
// Calculate the hash value of pattern and first
// window of text
for (i = 0; i < M; i++)
{
p = (d*p + pat[i])%q;
t = (d*t + txt[i])%q;
// Slide the pattern over text one by one
for (i = 0; i <= N - M; i++)
{
// Check the hash values of current window of text
// and pattern. If the hash values match then only
// check for characters on by one
if ( p == t )
{
/* Check for characters one by one */
for (j = 0; j < M; j++)
{
if (txt[i+j] != pat[j])
break;
}
// if p == t and pat[0...M-1] = txt[i, i+1, ...i+M-1]
if (j == M)
printf("Pattern found at index %d \n", i);
}
// Calculate hash value for next window of text: Remove
// leading digit, add trailing digit
if ( i < N-M )
{
t = (d*(t - txt[i]*h) + txt[i+M])%q;
// We might get negative value of t, converting it
// to positive
if (t < 0)
t = (t + q);
}
}
}
/* Driver program to test above function */
int main()
{
char txt[] = "sritechviews";
char pat[] = "chvi";
int q = 101; // A prime number
search(pat, txt, q);
return 0;
}
0 Comments
Thanks for the feedback