mirror of
https://github.moeyy.xyz/https://github.com/trekhleb/javascript-algorithms.git
synced 2024-12-26 07:01:18 +08:00
Use rolling hash function for RabinKarp.
This commit is contained in:
parent
650e3099e5
commit
d303d83673
@ -5,11 +5,42 @@ is a string searching algorithm created by Richard M. Karp and
|
||||
Michael O. Rabin (1987) that uses hashing to find any one of a set
|
||||
of pattern strings in a text.
|
||||
|
||||
## Algorithm
|
||||
|
||||
The Rabin–Karp algorithm seeks to speed up the testing of equality of
|
||||
the pattern to the substrings in the text by using a hash function. A
|
||||
hash function is a function which converts every string into a numeric
|
||||
value, called its hash value; for example, we might
|
||||
have `hash('hello') = 5`. The algorithm exploits the fact
|
||||
that if two strings are equal, their hash values are also equal. Thus,
|
||||
string matching is reduced (almost) to computing the hash value of the
|
||||
search pattern and then looking for substrings of the input string with
|
||||
that hash value.
|
||||
|
||||
However, there are two problems with this approach. First, because there
|
||||
are so many different strings and so few hash values, some differing
|
||||
strings will have the same hash value. If the hash values match, the
|
||||
pattern and the substring may not match; consequently, the potential
|
||||
match of search pattern and the substring must be confirmed by comparing
|
||||
them; that comparison can take a long time for long substrings.
|
||||
Luckily, a good hash function on reasonable strings usually does not
|
||||
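have many collisions, so the expected search time will be acceptable. Second,
the hash of every successive substring must itself be cheap to compute, which
is why a rolling hash function is used.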
|
||||
|
||||
## Hash Function Used
|
||||
|
||||
The key to the Rabin–Karp algorithm's performance is the efficient computation
|
||||
of hash values of the successive substrings of the text.
|
||||
The **Rabin fingerprint** is a popular and effective rolling hash function.
|
||||
|
||||
The **polynomial hash function** described in this example is not a Rabin
|
||||
fingerprint, but it works equally well. It treats every substring as a
|
||||
number in some base, the base usually being a large prime.
|
||||
|
||||
## Complexity
|
||||
|
||||
For text of length `n` and `p` patterns
|
||||
of combined length `m`, its average and best case running time is
|
||||
`O(n + m)` in space `O(p)`, but its worst-case time is `O(n * m)`.
|
||||
For text of length `n` and `p` patterns of combined length `m`, its average
|
||||
and best case running time is `O(n + m)` in space `O(p)`, but its
|
||||
worst-case time is `O(n * m)`.
|
||||
|
||||
## Application
|
||||
|
||||
|
@ -13,8 +13,30 @@ describe('rabinKarp', () => {
|
||||
expect(rabinKarp('abcxabcdabxaabcdabcabcdabcdabcy', 'abcdabca')).toBe(12);
|
||||
expect(rabinKarp('abcxabcdabxaabaabaaaabcdabcdabcy', 'aabaabaaa')).toBe(11);
|
||||
expect(rabinKarp('^ !/\'#\'pp', ' !/\'#\'pp')).toBe(1);
|
||||
});
|
||||
|
||||
it('should work with bigger texts', () => {
|
||||
const text = 'Lorem Ipsum is simply dummy text of the printing and '
|
||||
+ 'typesetting industry. Lorem Ipsum has been the industry\'s standard '
|
||||
+ 'dummy text ever since the 1500s, when an unknown printer took a '
|
||||
+ 'galley of type and scrambled it to make a type specimen book. It '
|
||||
+ 'has survived not only five centuries, but also the leap into '
|
||||
+ 'electronic typesetting, remaining essentially unchanged. It was '
|
||||
+ 'popularised in the 1960s with the release of Letraset sheets '
|
||||
+ 'containing Lorem Ipsum passages, and more recently with desktop'
|
||||
+ 'publishing software like Aldus PageMaker including versions of Lorem '
|
||||
+ 'Ipsum.';
|
||||
|
||||
expect(rabinKarp(text, 'Lorem')).toBe(0);
|
||||
expect(rabinKarp(text, 'versions')).toBe(549);
|
||||
expect(rabinKarp(text, 'versions of Lorem Ipsum.')).toBe(549);
|
||||
expect(rabinKarp(text, 'versions of Lorem Ipsum:')).toBe(-1);
|
||||
expect(rabinKarp(text, 'Lorem Ipsum passages, and more recently with')).toBe(446);
|
||||
});
|
||||
|
||||
it('should work with UTF symbols', () => {
|
||||
expect(rabinKarp('a\u{ffff}', '\u{ffff}')).toBe(1);
|
||||
expect(rabinKarp('a\u{10000}', '\u{10000}')).toBe(1);
|
||||
expect(rabinKarp('\u0000耀\u0000', '耀\u0000')).toBe(1);
|
||||
// expect(rabinKarp('a\u{10000}', '\u{10000}')).toBe(1);
|
||||
});
|
||||
});
|
||||
|
@ -1,33 +1,63 @@
|
||||
import RabinFingerprint from '../../../utils/hash/rolling/Rabin_Fingerprint';
|
||||
import PolynomialHash from '../../cryptography/polynomial-hash/PolynomialHash';
|
||||
|
||||
/**
|
||||
* @param {string} text
|
||||
* @param {string} word
|
||||
* @return {number}
|
||||
* Checks if two strings are equal.
|
||||
*
|
||||
* We may simply compare (string1 === string2) but for the
|
||||
* purpose of analyzing algorithm time complexity let's do
|
||||
* it character by character.
|
||||
*
|
||||
* @param {string} string1
|
||||
* @param {string} string2
|
||||
*/
|
||||
export default function rabinKarp(text, word) {
|
||||
const toNum = function toNum(character) {
|
||||
const surrogate = character.codePointAt(1);
|
||||
return ((surrogate === undefined) ? 0 : surrogate) + (character.codePointAt(0) * (2 ** 16));
|
||||
};
|
||||
const arrEq = (a1, a2) => ((a1.length === a2.length) && a1.every((val, idx) => val === a2[idx]));
|
||||
|
||||
const wordArr = [...word].map(toNum);
|
||||
const textArr = [...text].map(toNum);
|
||||
|
||||
// The prime generation function could depend on the inputs for collision guarantees.
|
||||
const hasher = new RabinFingerprint(() => 229);
|
||||
const cmpVal = hasher.init(wordArr);
|
||||
|
||||
let currHash = hasher.init(textArr.slice(0, wordArr.length));
|
||||
if ((currHash === cmpVal) && arrEq(wordArr, textArr.slice(0, wordArr.length))) {
|
||||
return 0;
|
||||
function stringsAreEqual(string1, string2) {
|
||||
if (string1.length !== string2.length) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (let i = 0; i < (textArr.length - wordArr.length); i += 1) {
|
||||
currHash = hasher.roll(textArr[i], textArr[i + wordArr.length]);
|
||||
if ((currHash === cmpVal) && arrEq(wordArr, textArr.slice(i + 1, i + wordArr.length + 1))) {
|
||||
return i + 1;
|
||||
for (let charIndex = 0; charIndex < string1.length; charIndex += 1) {
|
||||
if (string1[charIndex] !== string2[charIndex]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {string} text - Text that may contain the searchable word.
|
||||
* @param {string} word - Word that is being searched in text.
|
||||
* @return {number} - Position of the word in text, or -1 if it is not found.
|
||||
*/
|
||||
export default function rabinKarp(text, word) {
|
||||
const hasher = new PolynomialHash();
|
||||
|
||||
// Calculate word hash that we will use for comparison with other substring hashes.
|
||||
const wordHash = hasher.hash(word);
|
||||
|
||||
let prevFrame = null;
|
||||
let currentFrameHash = null;
|
||||
|
||||
// Go through all substrings of the text that may match.
|
||||
for (let charIndex = 0; charIndex <= (text.length - word.length); charIndex += 1) {
|
||||
const currentFrame = text.substring(charIndex, charIndex + word.length);
|
||||
|
||||
// Calculate the hash of current substring.
|
||||
if (currentFrameHash === null) {
|
||||
currentFrameHash = hasher.hash(currentFrame);
|
||||
} else {
|
||||
currentFrameHash = hasher.roll(currentFrameHash, prevFrame, currentFrame);
|
||||
}
|
||||
|
||||
prevFrame = currentFrame;
|
||||
|
||||
// Compare the hash of current substring and seeking string.
|
||||
// If the hashes match, check the substring char by char.
|
||||
if (
|
||||
wordHash === currentFrameHash
|
||||
&& stringsAreEqual(text.substr(charIndex, word.length), word)
|
||||
) {
|
||||
return charIndex;
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user