Use rolling hash function for RabinKarp.

This commit is contained in:
Oleksii Trekhleb 2018-08-09 05:57:53 +03:00
parent 650e3099e5
commit d303d83673
3 changed files with 112 additions and 29 deletions

View File

@ -5,11 +5,42 @@ is a string searching algorithm created by Richard M. Karp and
Michael O. Rabin (1987) that uses hashing to find any one of a set
of pattern strings in a text.
## Algorithm
The Rabin–Karp algorithm seeks to speed up the testing of equality of
the pattern to the substrings in the text by using a hash function. A
hash function is a function which converts every string into a numeric
value, called its hash value; for example, we might
have `hash('hello') = 5`. The algorithm exploits the fact
that if two strings are equal, their hash values are also equal. Thus,
string matching is reduced (almost) to computing the hash value of the
search pattern and then looking for substrings of the input string with
that hash value.
However, there are two problems with this approach. First, because there
are so many different strings and so few hash values, some differing
strings will have the same hash value. If the hash values match, the
pattern and the substring may not match; consequently, the potential
match of search pattern and the substring must be confirmed by comparing
them; that comparison can take a long time for long substrings.
Luckily, a good hash function on reasonable strings usually does not
have many collisions, so the expected search time will be acceptable.
## Hash Function Used
The key to the Rabin–Karp algorithm's performance is the efficient computation
of hash values of the successive substrings of the text.
The **Rabin fingerprint** is a popular and effective rolling hash function.
The **polynomial hash function** described in this example is not a Rabin
fingerprint, but it works equally well. It treats every substring as a
number in some base, the base being usually a large prime.
## Complexity
For text of length `n` and `p` patterns
of combined length `m`, its average and best case running time is
`O(n + m)` in space `O(p)`, but its worst-case time is `O(n * m)`.
For text of length `n` and `p` patterns of combined length `m`, its average
and best case running time is `O(n + m)` in space `O(p)`, but its
worst-case time is `O(n * m)`.
## Application

View File

@ -13,8 +13,30 @@ describe('rabinKarp', () => {
expect(rabinKarp('abcxabcdabxaabcdabcabcdabcdabcy', 'abcdabca')).toBe(12);
expect(rabinKarp('abcxabcdabxaabaabaaaabcdabcdabcy', 'aabaabaaa')).toBe(11);
expect(rabinKarp('^ !/\'#\'pp', ' !/\'#\'pp')).toBe(1);
});
it('should work with bigger texts', () => {
  // The fixture is assembled from the fragments in order. Note that the
  // fragment ending in 'desktop' has no trailing space: the expected
  // indices below depend on that exact text, so it must stay as is.
  const text = [
    'Lorem Ipsum is simply dummy text of the printing and ',
    'typesetting industry. Lorem Ipsum has been the industry\'s standard ',
    'dummy text ever since the 1500s, when an unknown printer took a ',
    'galley of type and scrambled it to make a type specimen book. It ',
    'has survived not only five centuries, but also the leap into ',
    'electronic typesetting, remaining essentially unchanged. It was ',
    'popularised in the 1960s with the release of Letraset sheets ',
    'containing Lorem Ipsum passages, and more recently with desktop',
    'publishing software like Aldus PageMaker including versions of Lorem ',
    'Ipsum.',
  ].join('');

  // Pairs of [searched word, expected index of its first occurrence];
  // -1 is expected when the word does not occur in the text.
  const expectations = [
    ['Lorem', 0],
    ['versions', 549],
    ['versions of Lorem Ipsum.', 549],
    ['versions of Lorem Ipsum:', -1],
    ['Lorem Ipsum passages, and more recently with', 446],
  ];

  expectations.forEach(([word, expectedIndex]) => {
    expect(rabinKarp(text, word)).toBe(expectedIndex);
  });
});
// Searches for characters with very small and very large code points.
it('should work with UTF symbols', () => {
// U+FFFF: the last code point of the Basic Multilingual Plane.
expect(rabinKarp('a\u{ffff}', '\u{ffff}')).toBe(1);
// U+10000: the first code point beyond the BMP (a surrogate pair in a JS string).
expect(rabinKarp('a\u{10000}', '\u{10000}')).toBe(1);
// NUL characters around 耀 (U+8000); the match is expected at index 1.
expect(rabinKarp('\u0000耀\u0000', '耀\u0000')).toBe(1);
// NOTE(review): the line below is a commented-out duplicate of the U+10000
// assertion above; presumably only one of the two is meant to remain — confirm.
// expect(rabinKarp('a\u{10000}', '\u{10000}')).toBe(1);
});
});

View File

@ -1,33 +1,63 @@
import RabinFingerprint from '../../../utils/hash/rolling/Rabin_Fingerprint';
import PolynomialHash from '../../cryptography/polynomial-hash/PolynomialHash';
/**
* @param {string} text
* @param {string} word
* @return {number}
* Checks if two strings are equal.
*
* We may simply compare (string1 === string2) but for the
* purpose of analyzing algorithm time complexity let's do
* it character by character.
*
* @param {string} string1
* @param {string} string2
*/
export default function rabinKarp(text, word) {
const toNum = function toNum(character) {
const surrogate = character.codePointAt(1);
return ((surrogate === undefined) ? 0 : surrogate) + (character.codePointAt(0) * (2 ** 16));
};
const arrEq = (a1, a2) => ((a1.length === a2.length) && a1.every((val, idx) => val === a2[idx]));
const wordArr = [...word].map(toNum);
const textArr = [...text].map(toNum);
// The prime generation function could depend on the inputs for collision guarantees.
const hasher = new RabinFingerprint(() => 229);
const cmpVal = hasher.init(wordArr);
let currHash = hasher.init(textArr.slice(0, wordArr.length));
if ((currHash === cmpVal) && arrEq(wordArr, textArr.slice(0, wordArr.length))) {
return 0;
/**
 * Checks whether two strings are equal.
 *
 * A plain `string1 === string2` gives the same answer; the explicit
 * character-by-character loop is kept so that the cost of the comparison
 * stays visible when analyzing the algorithm's time complexity.
 *
 * @param {string} string1 - First string to compare.
 * @param {string} string2 - Second string to compare.
 * @return {boolean} - True when both strings have the same length and the
 *   same character at every index, false otherwise.
 */
function stringsAreEqual(string1, string2) {
  // Strings of different lengths can never be equal, so skip the scan.
  if (string1.length !== string2.length) {
    return false;
  }

  // Stop at the first position where the strings differ.
  for (let charIndex = 0; charIndex < string1.length; charIndex += 1) {
    if (string1[charIndex] !== string2[charIndex]) {
      return false;
    }
  }

  return true;
}
/**
* @param {string} text - Text that may contain the searchable word.
* @param {string} word - Word that is being searched in text.
* @return {number} - Position of the word in text.
*/
export default function rabinKarp(text, word) {
const hasher = new PolynomialHash();
// Calculate word hash that we will use for comparison with other substring hashes.
const wordHash = hasher.hash(word);
let prevFrame = null;
let currentFrameHash = null;
// Go through all substring of the text that may match.
for (let charIndex = 0; charIndex <= (text.length - word.length); charIndex += 1) {
const currentFrame = text.substring(charIndex, charIndex + word.length);
// Calculate the hash of current substring.
if (currentFrameHash === null) {
currentFrameHash = hasher.hash(currentFrame);
} else {
currentFrameHash = hasher.roll(currentFrameHash, prevFrame, currentFrame);
}
prevFrame = currentFrame;
// Compare the hash of current substring and seeking string.
// In case if hashes match let's check substring char by char.
if (
wordHash === currentFrameHash
&& stringsAreEqual(text.substr(charIndex, word.length), word)
) {
return charIndex;
}
}