mirror of
https://github.moeyy.xyz/https://github.com/trekhleb/javascript-algorithms.git
synced 2024-11-10 11:09:43 +08:00
Add bloom filter (#84)
This commit is contained in:
parent
b33f1d52dc
commit
41a6430532
@ -38,6 +38,7 @@ the data.
|
||||
* `A` [Fenwick Tree](src/data-structures/tree/fenwick-tree) (Binary Indexed Tree)
|
||||
* `A` [Graph](src/data-structures/graph) (both directed and undirected)
|
||||
* `A` [Disjoint Set](src/data-structures/disjoint-set)
|
||||
* `A` [Bloom Filter](src/data-structures/bloom-filter)
|
||||
|
||||
## Algorithms
|
||||
|
||||
@ -231,6 +232,7 @@ Below is the list of some of the most used Big O notations and their performance
|
||||
| **B-Tree** | log(n) | log(n) | log(n) | log(n) | |
|
||||
| **Red-Black Tree** | log(n) | log(n) | log(n) | log(n) | |
|
||||
| **AVL Tree** | log(n) | log(n) | log(n) | log(n) | |
|
||||
| **Bloom Filter** | | 1 | 1 | | |
|
||||
|
||||
### Array Sorting Algorithms Complexity
|
||||
|
||||
|
127
src/data-structures/bloom-filter/BloomFilter.js
Normal file
127
src/data-structures/bloom-filter/BloomFilter.js
Normal file
@ -0,0 +1,127 @@
|
||||
/**
 * Probabilistic set-membership structure.
 *
 * Every inserted item is run through three hash functions and the
 * resulting slots are flagged. A lookup answers either "definitely not
 * inserted" (at least one slot is unset) or "possibly inserted" (all
 * slots are set). Items cannot be removed.
 */
export default class BloomFilter {
  /**
   * @param {number} size - Number of slots in the filter (positive integer).
   * @throws {RangeError} When size is not a positive integer. Such a size
   *   would make the hash functions produce NaN or fractional indexes.
   */
  constructor(size = 100) {
    if (!Number.isInteger(size) || size <= 0) {
      throw new RangeError('Bloom filter size must be a positive integer');
    }

    // Bloom filter size directly affects the likelihood of false positives.
    // The bigger the size the lower the likelihood of false positives.
    this.size = size;
    this.storage = this.createStore(size);
  }

  /**
   * Flags every slot that the hash functions select for the item.
   *
   * @param {string} item
   */
  insert(item) {
    this.getHashValues(item).forEach((index) => this.storage.setValue(index));
  }

  /**
   * Checks whether the item could have been inserted before.
   *
   * @param {string} item
   * @return {boolean} `false` when the item was definitely not inserted,
   *   `true` when it may or may not have been inserted.
   */
  mayContain(item) {
    // A single unset slot proves the item was never inserted.
    return this.getHashValues(item).every((index) => this.storage.getValue(index));
  }

  /**
   * Creates the data store for our filter.
   * The backing array lives only in this method's closure, so the rest
   * of the class can reach it solely through the returned accessors.
   *
   * @param {number} size - Number of slots, all initialised to `false`.
   * @return {Object} Object exposing `getValue(index)` and `setValue(index)`.
   */
  createStore(size) {
    const storage = Array.from({ length: size }, () => false);

    return {
      getValue(index) {
        return storage[index];
      },
      setValue(index) {
        storage[index] = true;
      },
    };
  }

  /**
   * First hash: `hash * 33 + charCode` per character, truncated to a
   * 32-bit integer and made non-negative on every step.
   *
   * @param {string} str
   * @return {number} Index in the range [0, size).
   */
  hash1(str) {
    let hash = 0;

    for (let i = 0; i < str.length; i += 1) {
      const charCode = str.charCodeAt(i);
      // `| 0` converts the sum to a 32-bit integer before taking the absolute value.
      hash = Math.abs(((hash << 5) + hash + charCode) | 0);
    }

    return hash % this.size;
  }

  /**
   * Second hash: `hash * 33 + charCode` per character, seeded with 5381.
   *
   * @param {string} str
   * @return {number} Index in the range [0, size).
   */
  hash2(str) {
    let hash = 5381;

    for (let i = 0; i < str.length; i += 1) {
      const charCode = str.charCodeAt(i);
      hash = (hash << 5) + hash + charCode; /* hash * 33 + c */
    }

    // The shift wraps to 32 bits, so the hash may be negative; the
    // remainder keeps that sign and has to be normalised to be an index.
    return Math.abs(hash % this.size);
  }

  /**
   * Third hash: `hash * 31 + charCode` per character, truncated to a
   * 32-bit integer on every step.
   *
   * @param {string} str
   * @return {number} Index in the range [0, size).
   */
  hash3(str) {
    let hash = 0;

    for (let i = 0; i < str.length; i += 1) {
      const charCode = str.charCodeAt(i);
      // `| 0` converts the sum to a 32-bit integer.
      hash = ((hash << 5) - hash + charCode) | 0;
    }

    // A negative 32-bit hash yields a negative remainder; normalise it.
    return Math.abs(hash % this.size);
  }

  /**
   * Runs all 3 hash functions on the input and returns an array of results.
   *
   * @param {string} item
   * @return {number[]} Three indexes, each in the range [0, size).
   */
  getHashValues(item) {
    return [this.hash1(item), this.hash2(item), this.hash3(item)];
  }
}
|
104
src/data-structures/bloom-filter/README.md
Normal file
104
src/data-structures/bloom-filter/README.md
Normal file
@ -0,0 +1,104 @@
|
||||
# Bloom Filter
|
||||
|
||||
A bloom filter is a data structure designed to
|
||||
test whether an element is present in a set. It
|
||||
is designed to be blazingly fast and use minimal
|
||||
memory at the cost of potential false positives.
|
||||
|
||||
![Bloom Filter](https://upload.wikimedia.org/wikipedia/commons/a/ac/Bloom_filter.svg)
|
||||
|
||||
## Operations
|
||||
|
||||
There are two main operations a bloom filter can
|
||||
perform: insertion and search. Search may result in
|
||||
false positives. Deletion is not possible.
|
||||
|
||||
In other words, the filter can take in items. When
|
||||
we go to check if an item has previously been
|
||||
inserted, it can tell us either "no" or "maybe".
|
||||
|
||||
Both insertion and search are O(1) operations.
|
||||
|
||||
## Making the filter
|
||||
|
||||
A bloom filter is created by allotting a certain size.
|
||||
In our example, we use 100 as a default length. All
|
||||
locations are initialized to `false`.
|
||||
|
||||
### Insertion
|
||||
|
||||
During insertion, a number of hash functions,
|
||||
in our case 3 hash functions, are used to create
|
||||
hashes of the input. These hash functions output
|
||||
indexes. At every index received, we simply change
|
||||
the value in our bloom filter to `true`.
|
||||
|
||||
### Search
|
||||
|
||||
During a search, the same hash functions are called
|
||||
and used to hash the input. We then check if the
|
||||
indexes received _all_ have a value of `true` inside
|
||||
our bloom filter. If they _all_ have a value of
|
||||
`true`, we know that the bloom filter may have had
|
||||
the value previously inserted.
|
||||
|
||||
However, it's not certain, because it's possible
|
||||
that other values previously inserted flipped the
|
||||
values to `true`. The values aren't necessarily
|
||||
`true` due to the item currently being searched for.
|
||||
Absolute certainty is impossible unless only a single
|
||||
item has previously been inserted.
|
||||
|
||||
While checking the bloom filter for the indexes
|
||||
returned by our hash functions, if even one of them
|
||||
has a value of `false`, we definitively know that the
|
||||
item was not previously inserted.
|
||||
|
||||
## False Positives
|
||||
|
||||
The probability of false positives is determined by
|
||||
three factors: the size of the bloom filter, the
|
||||
number of hash functions we use, and the number
|
||||
of items that have been inserted into the filter.
|
||||
|
||||
The formula to calculate the probability of a false positive is:
|
||||
|
||||
( 1 - e <sup>-kn/m</sup> ) <sup>k</sup>
|
||||
|
||||
k = # hash functions
|
||||
|
||||
m = size
|
||||
|
||||
n = # items inserted
|
||||
|
||||
These variables, k, m, and n, should be picked based
|
||||
on how acceptable false positives are. If the values
|
||||
are picked and the resulting probability is too high,
|
||||
the values should be tweaked and the probability
|
||||
re-calculated.
|
||||
|
||||
## Applications
|
||||
|
||||
A bloom filter can be used on a blogging website. If
|
||||
the goal is to show readers only articles that they
|
||||
have never seen before, a bloom filter is perfect.
|
||||
It can store hashed values based on the articles. After
|
||||
a user reads a few articles, they can be inserted into
|
||||
the filter. The next time the user visits the site,
|
||||
those articles can be filtered out of the results.
|
||||
|
||||
Some articles will inevitably be filtered out by mistake,
|
||||
but the cost is acceptable. It's ok if a user never sees
|
||||
a few articles as long as they have other, brand new ones
|
||||
to see every time they visit the site.
|
||||
|
||||
The popular blog site Medium does a version of this.
|
||||
Feel free to read [their article](https://blog.medium.com/what-are-bloom-filters-1ec2a50c68ff).
|
||||
|
||||
## References
|
||||
|
||||
- [Wikipedia](https://en.wikipedia.org/wiki/Bloom_filter)
|
||||
- [Tutorial](http://llimllib.github.io/bloomfilter-tutorial/)
|
||||
- [Calculating false positive probability](https://hur.st/bloomfilter/?n=4&p=&m=18&k=3)
|
||||
- [Medium blog](https://blog.medium.com/what-are-bloom-filters-1ec2a50c68ff)
|
||||
- [YouTube](https://www.youtube.com/watch?v=bEmBh1HtYrw)
|
@ -0,0 +1,39 @@
|
||||
import BloomFilter from '../BloomFilter';
|
||||
|
||||
// Unit tests for the public surface of BloomFilter.
describe('Bloom Filter', () => {
  const people = ['Bruce Wayne', 'Clark Kent', 'Barry Allen'];
  let bloomFilter;

  // Each test starts from a fresh, empty filter of the default size.
  beforeEach(() => {
    bloomFilter = new BloomFilter();
  });

  it('Should have methods named "insert" and "mayContain"', () => {
    const insertType = typeof bloomFilter.insert;
    const mayContainType = typeof bloomFilter.mayContain;

    expect(insertType).toBe('function');
    expect(mayContainType).toBe('function');
  });

  it('Should create a new filter store with the appropriate methods', () => {
    const { getValue, setValue } = bloomFilter.createStore(18);

    expect(typeof getValue).toBe('function');
    expect(typeof setValue).toBe('function');
  });

  it('Should hash deterministically with all 3 hash functions', () => {
    const input = 'abc';

    // Hashing the same input twice must give the same index.
    expect(bloomFilter.hash1(input)).toEqual(bloomFilter.hash1(input));
    expect(bloomFilter.hash2(input)).toEqual(bloomFilter.hash2(input));
    expect(bloomFilter.hash3(input)).toEqual(bloomFilter.hash3(input));
  });

  it('Should create an array with 3 hash values', () => {
    const hashValues = bloomFilter.getHashValues('abc');

    expect(hashValues.length).toEqual(3);
  });

  it('Should insert strings correctly and return true when checking for inserted values', () => {
    for (const person of people) {
      bloomFilter.insert(person);
    }

    expect(bloomFilter.mayContain('Bruce Wayne')).toBe(true);
    expect(bloomFilter.mayContain('Clark Kent')).toBe(true);
    expect(bloomFilter.mayContain('Barry Allen')).toBe(true);
  });
});
|
@ -0,0 +1,86 @@
|
||||
import BloomFilter from '../BloomFilter';
|
||||
|
||||
// Adapted from http://stackoverflow.com/questions/1349404/generate-random-string-characters-in-javascript
|
||||
/**
 * Builds a random identifier of 10 ASCII letters.
 *
 * @return {string} Ten characters drawn from A-Z and a-z.
 */
function makeID() {
  const possible = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';

  // One random draw per character, concatenated in draw order.
  const letters = Array.from({ length: 10 }, () => {
    const position = Math.floor(Math.random() * possible.length);
    return possible.charAt(position);
  });

  return letters.join('');
}
|
||||
|
||||
/**
 * Fills a default-sized filter with 10 random IDs, then probes it with
 * fresh random IDs and counts how many probes the filter reports as
 * possibly present.
 *
 * @param {number} numRandomTests - Number of random probes (1000 by default).
 * @return {number} Number of probes for which `mayContain` returned true.
 */
function run10kTrials(numRandomTests = 1000) {
  const filter = new BloomFilter();

  // Generate all the known IDs before inserting any of them.
  const knownIDs = Array.from({ length: 10 }, () => makeID());
  for (const knownID of knownIDs) {
    filter.insert(knownID);
  }

  let falsePositiveCount = 0;
  for (let probe = 0; probe < numRandomTests; probe += 1) {
    falsePositiveCount += filter.mayContain(makeID()) ? 1 : 0;
  }

  return falsePositiveCount;
}
|
||||
|
||||
/**
 * Repeats the false-positive experiment and averages its results.
 *
 * @param {number} numTrials - How many times to run the experiment.
 * @return {number} Mean number of false positives per experiment.
 */
function testFilter(numTrials = 100) {
  let totalFalsePositives = 0;

  for (let trial = 0; trial < numTrials; trial += 1) {
    totalFalsePositives += run10kTrials();
  }

  return totalFalsePositives / numTrials;
}
|
||||
|
||||
// Statistical check: the measured false-positive rate of the filter should
// be close to the theoretical rate (1 - e^(-kn/m))^k for k = 3, m = 100, n = 10.
describe('Bloom filter false positives', () => {
  const falsePositiveProbability = 0.0174;
  const expectedFalsePositives = falsePositiveProbability * 1000;
  // Computed while the suite is being defined because the measured value
  // is interpolated into the test title below.
  const avgFalsePositives = testFilter();

  // The calculator URL in the title carries the same k, m and n that are
  // listed next to it (n=10, m=100, k=3).
  it(`Should keep false positives close to an expected value:

    # trials = 1000
    k = 3 (hash functions)
    m = 100 (size)
    n = 10 (items inserted)

    Using k, m, and n, plugged into https://hur.st/bloomfilter/?n=10&p=&m=100&k=3
    Chance of false positive = 0.017

    Expected false positives = # trials * chance of false positive
    Expected false positives => 1000 * ${falsePositiveProbability}
    Expected false positives => ${expectedFalsePositives}

    **************************
    EXPECTED = ${expectedFalsePositives}
    ACTUAL AVG = ${avgFalsePositives}
    **************************

    If the expected and actual numbers are far off, something is wrong.
    Inspect manually.`, () => {
    // We give it a large range to avoid unnecessary failures.
    // If it's working correctly, the value should definitely
    // fall within this range.

    // In over 1,000 test runs, none of them ever come close
    // to falling outside of this range.
    const upperLimit = expectedFalsePositives + 5;
    const lowerLimit = expectedFalsePositives - 5;
    expect(avgFalsePositives).toBeGreaterThan(lowerLimit);
    expect(avgFalsePositives).toBeLessThan(upperLimit);
  });
});
|
Loading…
Reference in New Issue
Block a user