diff --git a/README.md b/README.md
index fddf28a4..143d6d0a 100644
--- a/README.md
+++ b/README.md
@@ -38,6 +38,7 @@ the data.
 * `A` [Fenwick Tree](src/data-structures/tree/fenwick-tree) (Binary Indexed Tree)
 * `A` [Graph](src/data-structures/graph) (both directed and undirected)
 * `A` [Disjoint Set](src/data-structures/disjoint-set)
+* `A` [Bloom Filter](src/data-structures/bloom-filter)
 
 ## Algorithms
 
@@ -231,6 +232,7 @@ Below is the list of some of the most used Big O notations and their performance
 | **B-Tree** | log(n) | log(n) | log(n) | log(n) | |
 | **Red-Black Tree** | log(n) | log(n) | log(n) | log(n) | |
 | **AVL Tree** | log(n) | log(n) | log(n) | log(n) | |
+| **Bloom Filter** | | 1 | 1 | | |
 
 ### Array Sorting Algorithms Complexity
 
diff --git a/src/data-structures/bloom-filter/BloomFilter.js b/src/data-structures/bloom-filter/BloomFilter.js
new file mode 100644
index 00000000..465e6f58
--- /dev/null
+++ b/src/data-structures/bloom-filter/BloomFilter.js
@@ -0,0 +1,124 @@
+export default class BloomFilter {
+  /**
+   * Creates an empty filter backed by `size` boolean slots.
+   *
+   * @param {number} size - Number of slots in the filter.
+   */
+  constructor(size = 100) {
+    // Bloom filter size directly affects the likelihood of false positives.
+    // The bigger the size the lower the likelihood of false positives.
+    this.size = size;
+    this.storage = this.createStore(size);
+  }
+
+  /**
+   * Marks every slot selected by the item's hash values.
+   *
+   * @param {string} item
+   */
+  insert(item) {
+    this.getHashValues(item).forEach(index => this.storage.setValue(index));
+  }
+
+  /**
+   * Tells whether the item could have been inserted before.
+   *
+   * @param {string} item
+   * @return {boolean} - `false`: definitely never inserted; `true`: possibly inserted.
+   */
+  mayContain(item) {
+    // A single unset slot proves that the item was never inserted.
+    return this.getHashValues(item).every(index => this.storage.getValue(index));
+  }
+
+  /**
+   * Creates the data store for our filter.
+   * We use this method to generate the store in order to
+   * encapsulate the data itself and only provide access
+   * to the necessary methods.
+   *
+   * @param {number} size
+   * @return {Object} - Store exposing `getValue(index)` and `setValue(index)`.
+   */
+  createStore(size) {
+    const storage = [];
+
+    // Initialize all indexes to false
+    for (let i = 0; i < size; i += 1) {
+      storage.push(false);
+    }
+
+    const storageInterface = {
+      getValue(index) {
+        return storage[index];
+      },
+      setValue(index) {
+        storage[index] = true;
+      },
+    };
+
+    return storageInterface;
+  }
+
+  /**
+   * @param {string} str
+   * @return {number} - Non-negative slot index smaller than a positive `size`.
+   */
+  hash1(str) {
+    let hash = 0;
+
+    for (let i = 0; i < str.length; i += 1) {
+      const char = str.charCodeAt(i);
+      hash = (hash << 5) + hash + char;
+      hash &= hash; // Convert to 32bit integer
+      hash = Math.abs(hash);
+    }
+
+    return hash % this.size;
+  }
+
+  /**
+   * @param {string} str
+   * @return {number} - Non-negative slot index smaller than a positive `size`.
+   */
+  hash2(str) {
+    let hash = 5381;
+
+    for (let i = 0; i < str.length; i += 1) {
+      const char = str.charCodeAt(i);
+      hash = (hash << 5) + hash + char; /* hash * 33 + c */
+    }
+
+    // `<<` yields a signed 32bit value, so the running hash can go negative,
+    // and the sign of `%` follows the dividend: normalize the index here.
+    return Math.abs(hash % this.size);
+  }
+
+  /**
+   * @param {string} str
+   * @return {number} - Non-negative slot index smaller than a positive `size`.
+   */
+  hash3(str) {
+    let hash = 0;
+
+    for (let i = 0; i < str.length; i += 1) {
+      const char = str.charCodeAt(i);
+      hash = (hash << 5) - hash;
+      hash += char;
+      hash &= hash; // Convert to 32bit integer
+    }
+
+    // A signed 32bit integer may be negative; keep the index inside the store.
+    return Math.abs(hash % this.size);
+  }
+
+  /**
+   * Runs all 3 hash functions on the input and returns an array of results
+   *
+   * @param {string} item
+   * @return {number[]}
+   */
+  getHashValues(item) {
+    return [this.hash1(item), this.hash2(item), this.hash3(item)];
+  }
+}
diff --git a/src/data-structures/bloom-filter/README.md b/src/data-structures/bloom-filter/README.md
new file mode 100644
index 00000000..07936a34
--- /dev/null
+++ b/src/data-structures/bloom-filter/README.md
@@ -0,0 +1,104 @@
+# Bloom Filter
+
+A bloom filter is a data structure designed to
+test whether an element is present in a set. 
It +is designed to be blazingly fast and use minimal +memory at the cost of potential false positives. + +![Bloom Filter](https://upload.wikimedia.org/wikipedia/commons/a/ac/Bloom_filter.svg) + +## Operations + +There are two main operations a bloom filter can +perform: insertion and search. Search may result in +false positives. Deletion is not possible. + +In other words, the filter can take in items. When +we go to check if an item has previously been +inserted, it can tell us either "no" or "maybe". + +Both insertion and search are O(1) operations. + +## Making the filter + +A bloom filter is created by allotting a certain size. +In our example, we use 100 as a default length. All +locations are initialized to `false`. + +### Insertion + +During insertion, a number of hash functions, +in our case 3 hash functions, are used to create +hashes of the input. These hash functions output +indexes. At every index received, we simply change +the value in our bloom filter to `true`. + +### Search + +During a search, the same hash functions are called +and used to hash the input. We then check if the +indexes received _all_ have a value of `true` inside +our bloom filter. If they _all_ have a value of +`true`, we know that the bloom filter may have had +the value previously inserted. + +However, it's not certain, because it's possible +that other values previously inserted flipped the +values to `true`. The values aren't necessarily +`true` due to the item currently being searched for. +Absolute certainty is impossible unless only a single +item has previously been inserted. + +While checking the bloom filter for the indexes +returned by our hash functions, if even one of them +has a value of `false`, we definitively know that the +item was not previously inserted. 
+
+## False Positives
+
+The probability of false positives is determined by
+three factors: the size of the bloom filter, the
+number of hash functions we use, and the number
+of items that have been inserted into the filter.
+
+The formula to calculate the probability of a false positive is:
+
+$(1 - e^{-kn/m})^k$
+
+k = # hash functions
+
+m = size
+
+n = # items inserted
+
+These variables, k, m, and n, should be picked based
+on how acceptable false positives are. If the values
+are picked and the resulting probability is too high,
+the values should be tweaked and the probability
+re-calculated.
+
+## Applications
+
+A bloom filter can be used on a blogging website. If
+the goal is to show readers only articles that they
+have never seen before, a bloom filter is perfect.
+It can store hashed values based on the articles. After
+a user reads a few articles, they can be inserted into
+the filter. The next time the user visits the site,
+those articles can be filtered out of the results.
+
+Some articles will inevitably be filtered out by mistake,
+but the cost is acceptable. It's ok if a user never sees
+a few articles as long as they have other, brand new ones
+to see every time they visit the site.
+
+The popular blog site Medium does a version of this.
+Feel free to read [their article](https://blog.medium.com/what-are-bloom-filters-1ec2a50c68ff).
+
+## References
+
+- [Wikipedia](https://en.wikipedia.org/wiki/Bloom_filter)
+- [Tutorial](http://llimllib.github.io/bloomfilter-tutorial/)
+- [Calculating false positive probability](https://hur.st/bloomfilter/?n=4&p=&m=18&k=3)
+- [Medium blog](https://blog.medium.com/what-are-bloom-filters-1ec2a50c68ff)
+- [YouTube](https://www.youtube.com/watch?v=bEmBh1HtYrw)
diff --git a/src/data-structures/bloom-filter/__test__/BloomFilter.test.js b/src/data-structures/bloom-filter/__test__/BloomFilter.test.js
new file mode 100644
index 00000000..a8b9c233
--- /dev/null
+++ b/src/data-structures/bloom-filter/__test__/BloomFilter.test.js
@@ -0,0 +1,45 @@
+import BloomFilter from '../BloomFilter';
+
+describe('Bloom Filter', () => {
+  let bloomFilter;
+  const people = ['Bruce Wayne', 'Clark Kent', 'Barry Allen'];
+
+  beforeEach(() => {
+    bloomFilter = new BloomFilter();
+  });
+
+  it('Should have methods named "insert" and "mayContain"', () => {
+    expect(typeof bloomFilter.insert).toBe('function');
+    expect(typeof bloomFilter.mayContain).toBe('function');
+  });
+
+  it('Should create a new filter store with the appropriate methods', () => {
+    const store = bloomFilter.createStore(18);
+    expect(typeof store.getValue).toBe('function');
+    expect(typeof store.setValue).toBe('function');
+  });
+
+  it('Should hash deterministically with all 3 hash functions', () => {
+    const str = 'abc';
+    // Values are pinned for the default size of 100. Comparing a call with
+    // itself would also pass for a hash that ignores its input.
+    expect(bloomFilter.hash1(str)).toBe(66);
+    expect(bloomFilter.hash2(str)).toBe(63);
+    expect(bloomFilter.hash3(str)).toBe(54);
+  });
+
+  it('Should create an array with 3 hash values', () => {
+    expect(bloomFilter.getHashValues('abc').length).toEqual(3);
+  });
+
+  it('Should report items as absent while the filter is empty', () => {
+    people.forEach(person => expect(bloomFilter.mayContain(person)).toBe(false));
+  });
+
+  it('Should insert strings correctly and return true when checking for inserted values', () => {
+    people.forEach(person => bloomFilter.insert(person));
+    expect(bloomFilter.mayContain('Bruce Wayne')).toBe(true);
+    expect(bloomFilter.mayContain('Clark Kent')).toBe(true);
+    expect(bloomFilter.mayContain('Barry Allen')).toBe(true);
+  });
+});
diff --git a/src/data-structures/bloom-filter/__test__/BloomFilterFalsePositive.test.js b/src/data-structures/bloom-filter/__test__/BloomFilterFalsePositive.test.js
new file mode 100644
index 00000000..dd1ad73a
--- /dev/null
+++ b/src/data-structures/bloom-filter/__test__/BloomFilterFalsePositive.test.js
@@ -0,0 +1,100 @@
+import BloomFilter from '../BloomFilter';
+
+const NUM_INSERTED = 10;
+const NUM_LOOKUPS = 1000;
+
+// Adapted from http://stackoverflow.com/questions/1349404/generate-random-string-characters-in-javascript
+function makeID() {
+  const possible = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
+  let id = '';
+
+  for (let i = 0; i < 10; i += 1) {
+    const randomLength = Math.random() * possible.length;
+    const randomIndex = Math.floor(randomLength);
+    id += possible.charAt(randomIndex);
+  }
+
+  return id;
+}
+
+/**
+ * Fills a fresh filter with random IDs, then counts how many other random IDs
+ * the filter reports as possibly present.
+ *
+ * @param {number} numRandomTests - Number of lookups to perform.
+ * @return {number} - Number of false positives among the lookups.
+ */
+function countFalsePositives(numRandomTests = NUM_LOOKUPS) {
+  const bloomFilter = new BloomFilter();
+  const insertedIDs = new Set();
+
+  while (insertedIDs.size < NUM_INSERTED) {
+    insertedIDs.add(makeID());
+  }
+
+  insertedIDs.forEach(id => bloomFilter.insert(id));
+  let numFalsePositives = 0;
+
+  for (let index = 0; index < numRandomTests; index += 1) {
+    const randomID = makeID();
+    // An ID that really was inserted is a true positive, not a false one.
+    if (!insertedIDs.has(randomID) && bloomFilter.mayContain(randomID)) {
+      numFalsePositives += 1;
+    }
+  }
+
+  return numFalsePositives;
+}
+
+/**
+ * @param {number} numTrials - Number of independent filters to sample.
+ * @return {number} - Average number of false positives per trial.
+ */
+function testFilter(numTrials = 100) {
+  let sum = 0;
+
+  for (let i = 0; i < numTrials; i += 1) {
+    sum += countFalsePositives();
+  }
+
+  return sum / numTrials;
+}
+
+describe('Bloom filter false positives', () => {
+  const falsePositiveProbability = 0.0174;
+  const expectedFalsePositives = falsePositiveProbability * NUM_LOOKUPS;
+  const avgFalsePositives = testFilter();
+
+  it(`Should keep false positives close to an expected value:
+
+    # trials = ${NUM_LOOKUPS}
+    k = 3 (hash functions)
+    m = 100 (size)
+    n = ${NUM_INSERTED} (items inserted)
+
+    Using k, m, and n, plugged into https://hur.st/bloomfilter/?n=10&p=&m=100&k=3
+    Chance of false positive = 0.017
+
+    Expected false positives = # trials * chance of false positive
+    Expected false positives => ${NUM_LOOKUPS} * ${falsePositiveProbability}
+    Expected false positives => ${expectedFalsePositives}
+
+    **************************
+    EXPECTED = ${expectedFalsePositives}
+    ACTUAL AVG = ${avgFalsePositives}
+    **************************
+
+    If the expected and actual numbers are far off, something is wrong.
+    Inspect manually.`, () => {
+    // We give it a large range to avoid unnecessary failures.
+    // If it's working correctly, the value should definitely
+    // fall within this range.
+
+    // In over 1,000 test runs, none of them ever come close
+    // to falling outside of this range.
+    const upperLimit = expectedFalsePositives + 5;
+    const lowerLimit = expectedFalsePositives - 5;
+    expect(avgFalsePositives).toBeGreaterThan(lowerLimit);
+    expect(avgFalsePositives).toBeLessThan(upperLimit);
+  });
+});