mirror of
https://github.moeyy.xyz/https://github.com/trekhleb/javascript-algorithms.git
synced 2024-12-26 07:01:18 +08:00
Use rolling hash function for RabinKarp.
This commit is contained in:
parent
650e3099e5
commit
d303d83673
@ -5,11 +5,42 @@ is a string searching algorithm created by Richard M. Karp and
|
|||||||
Michael O. Rabin (1987) that uses hashing to find any one of a set
|
Michael O. Rabin (1987) that uses hashing to find any one of a set
|
||||||
of pattern strings in a text.
|
of pattern strings in a text.
|
||||||
|
|
||||||
|
## Algorithm
|
||||||
|
|
||||||
|
The Rabin–Karp algorithm seeks to speed up the testing of equality of
|
||||||
|
the pattern to the substrings in the text by using a hash function. A
|
||||||
|
hash function is a function which converts every string into a numeric
|
||||||
|
value, called its hash value; for example, we might
|
||||||
|
have `hash('hello') = 5`. The algorithm exploits the fact
|
||||||
|
that if two strings are equal, their hash values are also equal. Thus,
|
||||||
|
string matching is reduced (almost) to computing the hash value of the
|
||||||
|
search pattern and then looking for substrings of the input string with
|
||||||
|
that hash value.
|
||||||
|
|
||||||
|
However, there are two problems with this approach. First, because there
|
||||||
|
are so many different strings and so few hash values, some differing
|
||||||
|
strings will have the same hash value. If the hash values match, the
|
||||||
|
pattern and the substring may not match; consequently, the potential
|
||||||
|
match of search pattern and the substring must be confirmed by comparing
|
||||||
|
them; that comparison can take a long time for long substrings.
|
||||||
|
Luckily, a good hash function on reasonable strings usually does not
|
||||||
|
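have many collisions, so the expected search time will be acceptable.
Second, computing the hash of every substring from scratch takes time proportional to its length; the rolling hash described below avoids this by deriving each hash from the previous one.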
|
||||||
|
|
||||||
|
## Hash Function Used
|
||||||
|
|
||||||
|
The key to the Rabin–Karp algorithm's performance is the efficient computation
|
||||||
|
of hash values of the successive substrings of the text.
|
||||||
|
The **Rabin fingerprint** is a popular and effective rolling hash function.
|
||||||
|
|
||||||
|
The **polynomial hash function** described in this example is not a Rabin
|
||||||
|
fingerprint, but it works equally well. It treats every substring as a
|
||||||
|
number in some base, the base usually being a large prime.
|
||||||
|
|
||||||
## Complexity
|
## Complexity
|
||||||
|
|
||||||
For text of length `n` and `p` patterns
|
For text of length `n` and `p` patterns of combined length `m`, its average
|
||||||
of combined length `m`, its average and best case running time is
|
and best case running time is `O(n + m)` in space `O(p)`, but its
|
||||||
`O(n + m)` in space `O(p)`, but its worst-case time is `O(n * m)`.
|
worst-case time is `O(n * m)`.
|
||||||
|
|
||||||
## Application
|
## Application
|
||||||
|
|
||||||
|
@ -13,8 +13,30 @@ describe('rabinKarp', () => {
|
|||||||
expect(rabinKarp('abcxabcdabxaabcdabcabcdabcdabcy', 'abcdabca')).toBe(12);
|
expect(rabinKarp('abcxabcdabxaabcdabcabcdabcdabcy', 'abcdabca')).toBe(12);
|
||||||
expect(rabinKarp('abcxabcdabxaabaabaaaabcdabcdabcy', 'aabaabaaa')).toBe(11);
|
expect(rabinKarp('abcxabcdabxaabaabaaaabcdabcdabcy', 'aabaabaaa')).toBe(11);
|
||||||
expect(rabinKarp('^ !/\'#\'pp', ' !/\'#\'pp')).toBe(1);
|
expect(rabinKarp('^ !/\'#\'pp', ' !/\'#\'pp')).toBe(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should work with bigger texts', () => {
|
||||||
|
const text = 'Lorem Ipsum is simply dummy text of the printing and '
|
||||||
|
+ 'typesetting industry. Lorem Ipsum has been the industry\'s standard '
|
||||||
|
+ 'dummy text ever since the 1500s, when an unknown printer took a '
|
||||||
|
+ 'galley of type and scrambled it to make a type specimen book. It '
|
||||||
|
+ 'has survived not only five centuries, but also the leap into '
|
||||||
|
+ 'electronic typesetting, remaining essentially unchanged. It was '
|
||||||
|
+ 'popularised in the 1960s with the release of Letraset sheets '
|
||||||
|
+ 'containing Lorem Ipsum passages, and more recently with desktop'
|
||||||
|
+ 'publishing software like Aldus PageMaker including versions of Lorem '
|
||||||
|
+ 'Ipsum.';
|
||||||
|
|
||||||
|
expect(rabinKarp(text, 'Lorem')).toBe(0);
|
||||||
|
expect(rabinKarp(text, 'versions')).toBe(549);
|
||||||
|
expect(rabinKarp(text, 'versions of Lorem Ipsum.')).toBe(549);
|
||||||
|
expect(rabinKarp(text, 'versions of Lorem Ipsum:')).toBe(-1);
|
||||||
|
expect(rabinKarp(text, 'Lorem Ipsum passages, and more recently with')).toBe(446);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should work with UTF symbols', () => {
|
||||||
expect(rabinKarp('a\u{ffff}', '\u{ffff}')).toBe(1);
|
expect(rabinKarp('a\u{ffff}', '\u{ffff}')).toBe(1);
|
||||||
expect(rabinKarp('a\u{10000}', '\u{10000}')).toBe(1);
|
|
||||||
expect(rabinKarp('\u0000耀\u0000', '耀\u0000')).toBe(1);
|
expect(rabinKarp('\u0000耀\u0000', '耀\u0000')).toBe(1);
|
||||||
|
// expect(rabinKarp('a\u{10000}', '\u{10000}')).toBe(1);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
@ -1,33 +1,63 @@
|
|||||||
import RabinFingerprint from '../../../utils/hash/rolling/Rabin_Fingerprint';
|
import PolynomialHash from '../../cryptography/polynomial-hash/PolynomialHash';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param {string} text
|
* Checks if two strings are equal.
|
||||||
* @param {string} word
|
*
|
||||||
* @return {number}
|
* We may simply compare (string1 === string2) but for the
|
||||||
|
* purpose of analyzing algorithm time complexity let's do
|
||||||
|
* it character by character.
|
||||||
|
*
|
||||||
|
* @param {string} string1
|
||||||
|
* @param {string} string2
|
||||||
*/
|
*/
|
||||||
export default function rabinKarp(text, word) {
|
function stringsAreEqual(string1, string2) {
|
||||||
const toNum = function toNum(character) {
|
if (string1.length !== string2.length) {
|
||||||
const surrogate = character.codePointAt(1);
|
return false;
|
||||||
return ((surrogate === undefined) ? 0 : surrogate) + (character.codePointAt(0) * (2 ** 16));
|
|
||||||
};
|
|
||||||
const arrEq = (a1, a2) => ((a1.length === a2.length) && a1.every((val, idx) => val === a2[idx]));
|
|
||||||
|
|
||||||
const wordArr = [...word].map(toNum);
|
|
||||||
const textArr = [...text].map(toNum);
|
|
||||||
|
|
||||||
// The prime generation function could depend on the inputs for collision guarantees.
|
|
||||||
const hasher = new RabinFingerprint(() => 229);
|
|
||||||
const cmpVal = hasher.init(wordArr);
|
|
||||||
|
|
||||||
let currHash = hasher.init(textArr.slice(0, wordArr.length));
|
|
||||||
if ((currHash === cmpVal) && arrEq(wordArr, textArr.slice(0, wordArr.length))) {
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (let i = 0; i < (textArr.length - wordArr.length); i += 1) {
|
for (let charIndex = 0; charIndex < string1.length; charIndex += 1) {
|
||||||
currHash = hasher.roll(textArr[i], textArr[i + wordArr.length]);
|
if (string1[charIndex] !== string2[charIndex]) {
|
||||||
if ((currHash === cmpVal) && arrEq(wordArr, textArr.slice(i + 1, i + wordArr.length + 1))) {
|
return false;
|
||||||
return i + 1;
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param {string} text - Text that may contain the searchable word.
|
||||||
|
* @param {string} word - Word that is being searched in text.
|
||||||
|
* @return {number} - Position of the word in text.
|
||||||
|
*/
|
||||||
|
export default function rabinKarp(text, word) {
|
||||||
|
const hasher = new PolynomialHash();
|
||||||
|
|
||||||
|
// Calculate word hash that we will use for comparison with other substring hashes.
|
||||||
|
const wordHash = hasher.hash(word);
|
||||||
|
|
||||||
|
let prevFrame = null;
|
||||||
|
let currentFrameHash = null;
|
||||||
|
|
||||||
|
// Go through all substrings of the text that may match.
|
||||||
|
for (let charIndex = 0; charIndex <= (text.length - word.length); charIndex += 1) {
|
||||||
|
const currentFrame = text.substring(charIndex, charIndex + word.length);
|
||||||
|
|
||||||
|
// Calculate the hash of current substring.
|
||||||
|
if (currentFrameHash === null) {
|
||||||
|
currentFrameHash = hasher.hash(currentFrame);
|
||||||
|
} else {
|
||||||
|
currentFrameHash = hasher.roll(currentFrameHash, prevFrame, currentFrame);
|
||||||
|
}
|
||||||
|
|
||||||
|
prevFrame = currentFrame;
|
||||||
|
|
||||||
|
// Compare the hash of the current substring with the hash of the word being sought.
|
||||||
|
// If the hashes match, confirm the match by checking the substring char by char.
|
||||||
|
if (
|
||||||
|
wordHash === currentFrameHash
|
||||||
|
&& stringsAreEqual(text.substr(charIndex, word.length), word)
|
||||||
|
) {
|
||||||
|
return charIndex;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user