Add bloom filter (#84)

This commit is contained in:
arnav-aggarwal 2018-06-30 10:07:19 -07:00 committed by Oleksii Trekhleb
parent b33f1d52dc
commit 41a6430532
5 changed files with 358 additions and 0 deletions

View File

@ -38,6 +38,7 @@ the data.
* `A` [Fenwick Tree](src/data-structures/tree/fenwick-tree) (Binary Indexed Tree)
* `A` [Graph](src/data-structures/graph) (both directed and undirected)
* `A` [Disjoint Set](src/data-structures/disjoint-set)
* `A` [Bloom Filter](src/data-structures/bloom-filter)
## Algorithms
@ -231,6 +232,7 @@ Below is the list of some of the most used Big O notations and their performance
| **B-Tree** | log(n) | log(n) | log(n) | log(n) | |
| **Red-Black Tree** | log(n) | log(n) | log(n) | log(n) | |
| **AVL Tree** | log(n) | log(n) | log(n) | log(n) | |
| **Bloom Filter** | | 1 | 1 | | |
### Array Sorting Algorithms Complexity

View File

@ -0,0 +1,127 @@
/**
 * A probabilistic set-membership structure.
 *
 * Items can be inserted and later looked up. A lookup answers either
 * "definitely not inserted" (false) or "possibly inserted" (true).
 */
export default class BloomFilter {
  /**
   * @param {number} size - Number of slots in the underlying store.
   */
  constructor(size = 100) {
    // Bloom filter size directly affects the likelihood of false positives.
    // The bigger the size the lower the likelihood of false positives.
    this.size = size;
    this.storage = this.createStore(size);
  }

  /**
   * Marks every slot that the item hashes to.
   *
   * @param {string} item
   */
  insert(item) {
    this.getHashValues(item).forEach((index) => this.storage.setValue(index));
  }

  /**
   * Checks whether the item could have been inserted.
   *
   * @param {string} item
   * @return {boolean} false if the item was definitely not inserted,
   *   true if it may or may not have been inserted.
   */
  mayContain(item) {
    // A single unset slot proves that the item was never inserted;
    // `every` stops at the first such slot.
    return this.getHashValues(item).every((index) => this.storage.getValue(index));
  }

  /**
   * Creates the data store for our filter.
   * We use this method to generate the store in order to
   * encapsulate the data itself and only provide access
   * to the necessary methods.
   *
   * @param {number} size
   * @return {Object} an object exposing `getValue(index)` and `setValue(index)`.
   */
  createStore(size) {
    const storage = [];

    // Initialize all indexes to false
    for (let i = 0; i < size; i += 1) {
      storage.push(false);
    }

    return {
      getValue(index) {
        return storage[index];
      },
      setValue(index) {
        storage[index] = true;
      },
    };
  }

  /**
   * First hash function: a "hash * 33 + c" accumulation that is forced
   * back to a non-negative value after every character.
   *
   * @param {string} str
   * @return {number} a non-negative index below `this.size`.
   */
  hash1(str) {
    let hash = 0;

    for (let i = 0; i < str.length; i += 1) {
      const char = str.charCodeAt(i);
      hash = (hash << 5) + hash + char;
      hash &= hash; // Convert to 32bit integer
      hash = Math.abs(hash);
    }

    return hash % this.size;
  }

  /**
   * Second hash function: "hash * 33 + c" starting from 5381.
   *
   * @param {string} str
   * @return {number} a non-negative index below `this.size`.
   */
  hash2(str) {
    let hash = 5381;

    for (let i = 0; i < str.length; i += 1) {
      const char = str.charCodeAt(i);
      hash = (hash << 5) + hash + char; /* hash * 33 + c */
    }

    // The shift works on signed 32-bit integers, so `hash` may be negative
    // and `%` keeps the sign of the dividend. Normalize to a valid index.
    return Math.abs(hash % this.size);
  }

  /**
   * Third hash function: "hash * 31 + c" truncated to 32 bits.
   *
   * @param {string} str
   * @return {number} a non-negative index below `this.size`.
   */
  hash3(str) {
    let hash = 0;

    for (let i = 0; i < str.length; i += 1) {
      const char = str.charCodeAt(i);
      hash = (hash << 5) - hash;
      hash += char;
      hash &= hash; // Convert to 32bit integer
    }

    // The 32-bit value may be negative; normalize to a valid index.
    return Math.abs(hash % this.size);
  }

  /**
   * Runs all 3 hash functions on the input and returns an array of results
   *
   * @param {string} item
   * @return {number[]}
   */
  getHashValues(item) {
    return [this.hash1(item), this.hash2(item), this.hash3(item)];
  }
}

View File

@ -0,0 +1,104 @@
# Bloom Filter
A bloom filter is a data structure designed to
test whether an element is present in a set. It
is designed to be blazingly fast and use minimal
memory at the cost of potential false positives.
![Bloom Filter](https://upload.wikimedia.org/wikipedia/commons/a/ac/Bloom_filter.svg)
## Operations
There are two main operations a bloom filter can
perform: insertion and search. Search may result in
false positives. Deletion is not possible.
In other words, the filter can take in items. When
we go to check if an item has previously been
inserted, it can tell us either "no" or "maybe".
Both insertion and search are O(1) operations.
## Making the filter
A bloom filter is created by allotting a certain size.
In our example, we use 100 as a default length. All
locations are initialized to `false`.
### Insertion
During insertion, a number of hash functions,
in our case 3 hash functions, are used to create
hashes of the input. These hash functions output
indexes. At every index received, we simply change
the value in our bloom filter to `true`.
### Search
During a search, the same hash functions are called
and used to hash the input. We then check if the
indexes received _all_ have a value of `true` inside
our bloom filter. If they _all_ have a value of
`true`, we know that the bloom filter may have had
the value previously inserted.
However, it's not certain, because it's possible
that other values previously inserted flipped the
values to `true`. The values aren't necessarily
`true` due to the item currently being searched for.
Absolute certainty is impossible: even if only a single
item has been inserted, a different item that hashes to
the same indexes would still be reported as a possible match.
While checking the bloom filter for the indexes
returned by our hash functions, if even one of them
has a value of `false`, we definitively know that the
item was not previously inserted.
## False Positives
The probability of false positives is determined by
three factors: the size of the bloom filter, the
number of hash functions we use, and the number
of items that have been inserted into the filter.
The formula to calculate the probability of a false positive is:
( 1 - e <sup>-kn/m</sup> ) <sup>k</sup>
k = # hash functions
m = size
n = # items inserted
These variables, k, m, and n, should be picked based
on how acceptable false positives are. If the values
are picked and the resulting probability is too high,
the values should be tweaked and the probability
re-calculated.
## Applications
A bloom filter can be used on a blogging website. If
the goal is to show readers only articles that they
have never seen before, a bloom filter is perfect.
It can store hashed values based on the articles. After
a user reads a few articles, they can be inserted into
the filter. The next time the user visits the site,
those articles can be filtered out of the results.
Some articles will inevitably be filtered out by mistake,
but the cost is acceptable. It's ok if a user never sees
a few articles as long as they have other, brand new ones
to see every time they visit the site.
The popular blog site Medium does a version of this.
Feel free to read [their article](https://blog.medium.com/what-are-bloom-filters-1ec2a50c68ff).
## References
- [Wikipedia](https://en.wikipedia.org/wiki/Bloom_filter)
- [Tutorial](http://llimllib.github.io/bloomfilter-tutorial/)
- [Calculating false positive probability](https://hur.st/bloomfilter/?n=4&p=&m=18&k=3)
- [Medium blog](https://blog.medium.com/what-are-bloom-filters-1ec2a50c68ff)
- [YouTube](https://www.youtube.com/watch?v=bEmBh1HtYrw)

View File

@ -0,0 +1,39 @@
import BloomFilter from '../BloomFilter';
describe('Bloom Filter', () => {
  const people = ['Bruce Wayne', 'Clark Kent', 'Barry Allen'];
  let bloomFilter;

  // Each spec works against its own freshly created filter.
  beforeEach(() => {
    bloomFilter = new BloomFilter();
  });

  it('Should have methods named "insert" and "mayContain"', () => {
    const { insert, mayContain } = bloomFilter;

    expect(typeof insert).toBe('function');
    expect(typeof mayContain).toBe('function');
  });

  it('Should create a new filter store with the appropriate methods', () => {
    const { getValue, setValue } = bloomFilter.createStore(18);

    expect(typeof getValue).toBe('function');
    expect(typeof setValue).toBe('function');
  });

  it('Should hash deterministically with all 3 hash functions', () => {
    const input = 'abc';

    // Hashing the same input twice must give the same result.
    const firstPass = [
      bloomFilter.hash1(input),
      bloomFilter.hash2(input),
      bloomFilter.hash3(input),
    ];

    expect(firstPass[0]).toEqual(bloomFilter.hash1(input));
    expect(firstPass[1]).toEqual(bloomFilter.hash2(input));
    expect(firstPass[2]).toEqual(bloomFilter.hash3(input));
  });

  it('Should create an array with 3 hash values', () => {
    const hashValues = bloomFilter.getHashValues('abc');

    expect(hashValues.length).toEqual(3);
  });

  it('Should insert strings correctly and return true when checking for inserted values', () => {
    for (const person of people) {
      bloomFilter.insert(person);
    }

    // Every inserted name must be reported as possibly present.
    for (const person of people) {
      expect(bloomFilter.mayContain(person)).toBe(true);
    }
  });
});

View File

@ -0,0 +1,86 @@
import BloomFilter from '../BloomFilter';
// Adapted from http://stackoverflow.com/questions/1349404/generate-random-string-characters-in-javascript
/**
 * Builds a random identifier made of 10 ASCII letters.
 *
 * @return {string}
 */
function makeID() {
  const possible = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
  const letters = [];

  // Draw one uniformly chosen letter per iteration until we have 10.
  while (letters.length < 10) {
    const position = Math.floor(Math.random() * possible.length);
    letters.push(possible.charAt(position));
  }

  return letters.join('');
}
/**
 * Fills a default-sized filter with 10 random IDs, then probes it with
 * fresh random IDs and counts how many are reported as possibly present.
 *
 * @param {number} numRandomTests - how many random IDs to probe with.
 * @return {number} number of probes for which the filter answered "maybe".
 */
function run10kTrials(numRandomTests = 1000) {
  const filter = new BloomFilter();

  // Generate the 10 IDs first, then insert them all.
  const insertedIDs = Array.from({ length: 10 }, () => makeID());
  for (const id of insertedIDs) {
    filter.insert(id);
  }

  let hits = 0;
  for (let probe = 0; probe < numRandomTests; probe += 1) {
    hits += filter.mayContain(makeID()) ? 1 : 0;
  }

  return hits;
}
/**
 * Repeats the random-probe experiment and averages the outcome.
 *
 * @param {number} numTrials - how many times to run the experiment.
 * @return {number} mean number of positives per experiment.
 */
function testFilter(numTrials = 100) {
  let total = 0;

  for (let trial = 0; trial < numTrials; trial += 1) {
    total += run10kTrials();
  }

  return total / numTrials;
}
describe('Bloom filter false positives', () => {
  // (1 - e^(-kn/m))^k with k = 3, m = 100, n = 10 is roughly 0.0174.
  const falsePositiveProbability = 0.0174;
  const expectedFalsePositives = falsePositiveProbability * 1000;

  // Computed while the suite is being defined so that the measured
  // average can be interpolated into the test title below.
  const avgFalsePositives = testFilter();

  it(`Should keep false positives close to an expected value:
# trials = 1000
k = 3 (hash functions)
m = 100 (size)
n = 10 (items inserted)
Using k, m, and n, plugged into https://hur.st/bloomfilter/?n=10&p=&m=100&k=3
Chance of false positive = 0.017
Expected false positives = # trials * chance of false positive
Expected false positives => 1000 * ${falsePositiveProbability}
Expected false positives => ${expectedFalsePositives}
**************************
EXPECTED = ${expectedFalsePositives}
ACTUAL AVG = ${avgFalsePositives}
**************************
If the expected and actual numbers are far off, something is wrong.
Inspect manually.`, () => {
    // We give it a large range to avoid unnecessary failures.
    // If it's working correctly, the value should definitely
    // fall within this range.
    // In over 1,000 test runs, none of them ever come close
    // to falling outside of this range.
    const upperLimit = expectedFalsePositives + 5;
    const lowerLimit = expectedFalsePositives - 5;

    expect(avgFalsePositives).toBeGreaterThan(lowerLimit);
    expect(avgFalsePositives).toBeLessThan(upperLimit);
  });
});