I have recently learned about string hashing and about Rabin-Karp algorithm for string matching from here. Here's the code taken directly from cp-algorithms site:
vector<int> rabin_karp(string const& s, string const& t) {
const int p = 31;
const int m = 1e9 + 9;
int S = s.size(), T = t.size();
vector<long long> p_pow(max(S, T));
p_pow[0] = 1;
for (int i = 1; i < (int)p_pow.size(); i++)
p_pow[i] = (p_pow[i-1] * p) % m;
vector<long long> h(T + 1, 0);
for (int i = 0; i < T; i++)
h[i+1] = (h[i] + (t[i] - 'a' + 1) * p_pow[i]) % m;
long long h_s = 0;
for (int i = 0; i < S; i++)
h_s = (h_s + (s[i] - 'a' + 1) * p_pow[i]) % m;
vector<int> occurences;
for (int i = 0; i + S - 1 < T; i++) {
long long cur_h = (h[i+S] + m - h[i]) % m;
if (cur_h == h_s * p_pow[i] % m)
occurences.push_back(i);
}
return occurences;
}
But, the above algorithm fails for one particular test-case:
S: lNoYhXmlwOscxnkTWjsyNJNhgvzMFbxFnbiWuBAGjZQlCRQHjTUX
T:ZtonpqnFzlpvUKZrBbRlNoYhXmlwOscxnkTWjsyNJNhgvzMFbxFnbiWuBAGjZQlCRQHjTUXxtHmTxoLuMbRYsvSpxhtrlvABBlFYmndFzHypOmJyFxjHEPlNoYhXmlwOscxnkTWjsyNJNhgvzMFbxFnbiWuBAGjZQlCRQHjTUXbDiEAvtPlNoYhXmlwOscxnkTWjsyNJNhgvzMFbxFnbiWuBAGjZQlCRQHjTUXRRNoBCUMQVOlNoYhXmlwOscxnkTWjsyNJNhgvzMFbxFnbiWuBAGjZQlCRQHjTUXRLKlNoYhXmlwOscxnkTWjsyNJNhgvzMFbxFnbiWuBAGjZQlCRQHjTUXAYPDKWtVpShhclNoYhXmlwOscxnkTWjsyNJNhgvzMFbxFnbiWuBAGjZQlCRQHjTUXOJlUlNoYhXmlwOscxnkTWjsyNJNhgvzMFbxFnbiWuBAGjZQlCRQHjTUXglmlNoYhXmlwOscxnkTWjsyNJNhgvzMFbxFnbiWuBAGjZQlCRQHjTUXuaOibGlVrwghvNTgLfltIbEdBlgjelFjQkBeFrdEV
It outputs indices of occurrence of string s
as 361 472
, whereas the actual correct output is: 19 118 178 241 296 361 417 472
Can anyone point out the reason why it is failing? I have been stuck at this for hours.
I believe this implementation assumes that the input string contains only lower case latin letters ('a'-'z').
I would recommend to read this article first: https://cp-algorithms.com/string/string-hashing.html