Add bloom filter (#84)

2024-09-20 07:43:04 +08:00 · 2018-06-30 10:07:19 -07:00 · 2018-06-30 10:07:19 -07:00 · 41a6430532
commit 41a6430532
parent b33f1d52dc
5 changed files with 358 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -38,6 +38,7 @@ the data.
    * `A` [Fenwick Tree](src/data-structures/tree/fenwick-tree) (Binary Indexed Tree)
 * `A` [Graph](src/data-structures/graph) (both directed and undirected)
 * `A` [Disjoint Set](src/data-structures/disjoint-set)
+* `A` [Bloom Filter](src/data-structures/bloom-filter)

 ## Algorithms

@ -231,6 +232,7 @@ Below is the list of some of the most used Big O notations and their performance
 | **B-Tree**              | log(n)    | log(n)    | log(n)    | log(n)    |           |
 | **Red-Black Tree**      | log(n)    | log(n)    | log(n)    | log(n)    |           |
 | **AVL Tree**            | log(n)    | log(n)    | log(n)    | log(n)    |           |
+| **Bloom Filter**        |           | 1         | 1         |           |           |

 ### Array Sorting Algorithms Complexity

--- a/src/data-structures/bloom-filter/BloomFilter.js
+++ b/src/data-structures/bloom-filter/BloomFilter.js
@ -0,0 +1,127 @@
+export default class BloomFilter {
+  /**
+   * @param {number} size
+   */
+  constructor(size = 100) {
+    // Bloom filter size directly affects the likelihood of false positives.
+    // The bigger the size the lower the likelihood of false positives.
+    this.size = size;
+    this.storage = this.createStore(size);
+  }
+
+  /**
+   * @param {string} item
+   */
+  insert(item) {
+    const hashValues = this.getHashValues(item);
+
+    // Set each hashValue index to true
+    hashValues.forEach(val => this.storage.setValue(val));
+  }
+
+  /**
+   * @param {string} item
+   * @return {boolean}
+   */
+  mayContain(item) {
+    const hashValues = this.getHashValues(item);
+
+    for (let i = 0; i < hashValues.length; i += 1) {
+      if (!this.storage.getValue(hashValues[i])) {
+        // We know that the item was definitely not inserted.
+        return false;
+      }
+    }
+
+    // The item may or may not have been inserted.
+    return true;
+  }
+
+  /**
+   * Creates the data store for our filter.
+   * We use this method to generate the store in order to
+   * encapsulate the data itself and only provide access
+   * to the necessary methods.
+   *
+   * @param {number} size
+   * @return {Object}
+   */
+  createStore(size) {
+    const storage = [];
+
+    // Initialize all indexes to false
+    for (let i = 0; i < size; i += 1) {
+      storage.push(false);
+    }
+
+    const storageInterface = {
+      getValue(index) {
+        return storage[index];
+      },
+      setValue(index) {
+        storage[index] = true;
+      },
+    };
+
+    return storageInterface;
+  }
+
+  /**
+   * @param {string} str
+   * @return {number}
+   */
+  hash1(str) {
+    let hash = 0;
+
+    for (let i = 0; i < str.length; i += 1) {
+      const char = str.charCodeAt(i);
+      hash = (hash << 5) + hash + char;
+      hash &= hash; // Convert to 32bit integer
+      hash = Math.abs(hash);
+    }
+
+    return hash % this.size;
+  }
+
+  /**
+   * @param {string} str
+   * @return {number}
+   */
+  hash2(str) {
+    let hash = 5381;
+
+    for (let i = 0; i < str.length; i += 1) {
+      const char = str.charCodeAt(i);
+      hash = (hash << 5) + hash + char; /* hash * 33 + c */
+    }
+
+    return hash % this.size;
+  }
+
+  /**
+   * @param {string} str
+   * @return {number}
+   */
+  hash3(str) {
+    let hash = 0;
+
+    for (let i = 0; i < str.length; i += 1) {
+      const char = str.charCodeAt(i);
+      hash = (hash << 5) - hash;
+      hash += char;
+      hash &= hash; // Convert to 32bit integer
+    }
+
+    return hash % this.size;
+  }
+
+  /**
+   * Runs all 3 hash functions on the input and returns an array of results
+   *
+   * @param {string} str
+   * @return {number[]}
+   */
+  getHashValues(item) {
+    return [this.hash1(item), Math.abs(this.hash2(item)), Math.abs(this.hash3(item))];
+  }
+}
--- a/src/data-structures/bloom-filter/README.md
+++ b/src/data-structures/bloom-filter/README.md
@ -0,0 +1,104 @@
+# Bloom Filter
+
+A bloom filter is a data structure designed to
+test whether an element is present in a set. It
+is designed to be blazingly fast and use minimal
+memory at the cost of potential false positives.
+
+![Bloom Filter](https://upload.wikimedia.org/wikipedia/commons/a/ac/Bloom_filter.svg)
+
+## Operations
+
+There are two main operations a bloom filter can
+perform: insertion and search. Search may result in
+false positives. Deletion is not possible.
+
+In other words, the filter can take in items. When
+we go to check if an item has previously been
+inserted, it can tell us either "no" or "maybe".
+
+Both insertion and search are O(1) operations.
+
+## Making the filter
+
+A bloom filter is created by allotting a certain size.
+In our example, we use 100 as a default length. All
+locations are initialized to `false`.
+
+### Insertion
+
+During insertion, a number of hash functions,
+in our case 3 hash functions, are used to create
+hashes of the input. These hash functions output
+indexes. At every index received, we simply change
+the value in our bloom filter to `true`.
+
+### Search
+
+During a search, the same hash functions are called
+and used to hash the input. We then check if the
+indexes received _all_ have a value of `true` inside
+our bloom filter. If they _all_ have a value of
+`true`, we know that the bloom filter may have had
+the value previously inserted.
+
+However, it's not certain, because it's possible
+that other values previously inserted flipped the
+values to `true`. The values aren't necessarily
+`true` due to the item currently being searched for.
+Absolute certainty is impossible unless only a single
+item has previously been inserted.
+
+While checking the bloom filter for the indexes
+returned by our hash functions, if even one of them
+has a value of `false`, we definitively know that the
+item was not previously inserted.
+
+## False Positives
+
+The probability of false positives is determined by
+three factors: the size of the bloom filter, the
+number of hash functions we use, and the number
+of items that have been inserted into the filter.
+
+The formula to calculate the probability of a false positive is:
+
+( 1 - e <sup>-kn/m</sup> ) <sup>k</sup>
+
+k = # hash functions
+
+m = size
+
+n = # items inserted
+
+These variables, k, m, and n, should be picked based
+on how acceptable false positives are. If the values
+are picked and the resulting probability is too high,
+the values should be tweaked and the probability
+re-calculated.
+
+## Applications
+
+A bloom filter can be used on a blogging website. If
+the goal is to show readers only articles that they
+have never seen before, a bloom filter is perfect.
+It can store hashed values based on the articles. After
+a user reads a few articles, they can be inserted into
+the filter. The next time the user visits the site,
+those articles can be filtered out of the results.
+
+Some articles will inevitably be filtered out by mistake,
+but the cost is acceptable. It's ok if a user never sees
+a few articles as long as they have other, brand new ones
+to see every time they visit the site.
+
+The popular blog site Medium does a version of this.
+Feel free to read [their article](https://blog.medium.com/what-are-bloom-filters-1ec2a50c68ff).
+
+## References
+
+- [Wikipedia](https://en.wikipedia.org/wiki/Bloom_filter)
+- [Tutorial](http://llimllib.github.io/bloomfilter-tutorial/)
+- [Calculating false positive probability](https://hur.st/bloomfilter/?n=4&p=&m=18&k=3)
+- [Medium blog](https://blog.medium.com/what-are-bloom-filters-1ec2a50c68ff)
+- [YouTube](https://www.youtube.com/watch?v=bEmBh1HtYrw)
--- a/src/data-structures/bloom-filter/test/BloomFilter.test.js
+++ b/src/data-structures/bloom-filter/test/BloomFilter.test.js
@ -0,0 +1,39 @@
+import BloomFilter from '../BloomFilter';
+
+describe('Bloom Filter', () => {
+  let bloomFilter;
+  const people = ['Bruce Wayne', 'Clark Kent', 'Barry Allen'];
+
+  beforeEach(() => {
+    bloomFilter = new BloomFilter();
+  });
+
+  it('Should have methods named "insert" and "mayContain"', () => {
+    expect(typeof bloomFilter.insert).toBe('function');
+    expect(typeof bloomFilter.mayContain).toBe('function');
+  });
+
+  it('Should create a new filter store with the appropriate methods', () => {
+    const store = bloomFilter.createStore(18);
+    expect(typeof store.getValue).toBe('function');
+    expect(typeof store.setValue).toBe('function');
+  });
+
+  it('Should hash deterministically with all 3 hash functions', () => {
+    const str = 'abc';
+    expect(bloomFilter.hash1(str)).toEqual(bloomFilter.hash1(str));
+    expect(bloomFilter.hash2(str)).toEqual(bloomFilter.hash2(str));
+    expect(bloomFilter.hash3(str)).toEqual(bloomFilter.hash3(str));
+  });
+
+  it('Should create an array with 3 hash values', () => {
+    expect(bloomFilter.getHashValues('abc').length).toEqual(3);
+  });
+
+  it('Should insert strings correctly and return true when checking for inserted values', () => {
+    people.forEach(person => bloomFilter.insert(person));
+    expect(bloomFilter.mayContain('Bruce Wayne')).toBe(true);
+    expect(bloomFilter.mayContain('Clark Kent')).toBe(true);
+    expect(bloomFilter.mayContain('Barry Allen')).toBe(true);
+  });
+});
--- a/src/data-structures/bloom-filter/test/BloomFilterFalsePositive.test.js
+++ b/src/data-structures/bloom-filter/test/BloomFilterFalsePositive.test.js
@ -0,0 +1,86 @@
+import BloomFilter from '../BloomFilter';
+
+// Adapted from http://stackoverflow.com/questions/1349404/generate-random-string-characters-in-javascript
+function makeID() {
+  const possible = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
+  let id = '';
+
+  for (let i = 0; i < 10; i += 1) {
+    const randomLength = Math.random() * possible.length;
+    const randomIndex = Math.floor(randomLength);
+    id += possible.charAt(randomIndex);
+  }
+
+  return id;
+}
+
+function run10kTrials(numRandomTests = 1000) {
+  const bloomFilter = new BloomFilter();
+  const mockPeopleIDs = [];
+
+  for (let i = 0; i < 10; i += 1) {
+    mockPeopleIDs.push(makeID());
+  }
+
+  mockPeopleIDs.forEach(id => bloomFilter.insert(id));
+  let numFalsePositives = 0;
+
+  for (let index = 0; index < numRandomTests; index += 1) {
+    const randomID = makeID();
+    if (bloomFilter.mayContain(randomID)) {
+      numFalsePositives += 1;
+    }
+  }
+
+  return numFalsePositives;
+}
+
+function testFilter(numTrials = 100) {
+  const results = [];
+
+  for (let i = 0; i < numTrials; i += 1) {
+    results.push(run10kTrials());
+  }
+
+  const sum = results.reduce((cumulative, next) => cumulative + next, 0);
+  return sum / numTrials;
+}
+
+describe('Bloom filter false positives', () => {
+  const falsePositiveProbability = 0.0174;
+  const expectedFalsePositives = falsePositiveProbability * 1000;
+  const avgFalsePositives = testFilter();
+
+  it(`Should keep false positives close to an expected value:
+  
+  # trials = 1000
+  k = 3    (hash functions)
+  m = 100  (size)
+  n = 10   (items inserted)
+  
+  Using k, m, and n, plugged into https://hur.st/bloomfilter/?n=3&p=&m=18&k=3
+  Chance of false positive = 0.017
+  
+  Expected false positives    =  # trials * chance of false positive
+  Expected false positives    => 1000 * ${falsePositiveProbability}
+  Expected false positives    => ${expectedFalsePositives}
+  
+  **************************
+  EXPECTED   = ${expectedFalsePositives}
+  ACTUAL AVG = ${avgFalsePositives}
+  **************************
+  
+  If the expected and actual numbers are far off, something is wrong.
+  Inspect manually.`, () => {
+    // We give it a large range to avoid unnecessary failures.
+    // If it's working correctly, the value should definitely
+    // fall within this range.
+
+    // In over 1,000 test runs, none of them ever come close
+    // to falling outside of this range.
+    const upperLimit = expectedFalsePositives + 5;
+    const lowerLimit = expectedFalsePositives - 5;
+    expect(avgFalsePositives).toBeGreaterThan(lowerLimit);
+    expect(avgFalsePositives).toBeLessThan(upperLimit);
+  });
+});