37 implement a quotient filter (#111)

* basic quotient filter, to be expanded upon * basic bit array implementation
barrust · Jan 4, 2024 · a62e9ab · a62e9ab
1 parent 4544c6a
commit a62e9ab
Show file tree

Hide file tree

Showing 16 changed files with 642 additions and 32 deletions.
diff --git a/.pylintrc b/.pylintrc
@@ -535,7 +535,7 @@ function-naming-style=snake_case
 #function-rgx=
 
 # Good variable names which should always be accepted, separated by a comma.
-good-names=i,j,k,b,f,v,m,n,p,d,hh,st,ex,Run,_
+good-names=i,j,k,b,f,v,m,n,p,d,hh,st,ex,Run,_,r,q
 
 # Good variable names regexes, separated by a comma. If names match any regex,
 # they will always be accepted

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # PyProbables Changelog
 
+### Version 0.6.0
+
+* Add `QuotientFilter` implementation; [see issue #37](https://github.com/barrust/pyprobables/issues/37)
+* Add `bitarray` implementation
+
 ### Version 0.5.9
 
 * Add `py.typed` files so that mypy will find type annotations

diff --git a/docs/source/code.rst b/docs/source/code.rst
@@ -19,6 +19,7 @@ operations. Bloom Filters guarantee a zero percent false negative rate
 and a predetermined false positive rate. Once the number of elements inserted
 exceeds the estimated elements, the false positive rate will increase over the
 desired amount.
+
 `Further Reading <https://en.wikipedia.org/wiki/Bloom_filter>`__
 
 
@@ -69,6 +70,7 @@ membership testing. Cuckoo filters support insertion, deletion, and lookup of
 elements with low overhead and few false positive results. The name is derived
 from the `cuckoo hashing <https://en.wikipedia.org/wiki/Cuckoo_hashing>`__
 strategy used to resolve conflicts.
+
 `Further Reading <https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf>`__
 
 CuckooFilter
@@ -92,6 +94,7 @@ data elements. The result is a probabilistic count of elements inserted into
 the data structure. It will always provide the **maximum** number of times a
 data element was encountered. Notice that the result may be **more** than the
 true number of times it was inserted, but never fewer.
+
 `Further Reading <https://en.wikipedia.org/wiki/Count%E2%80%93min_sketch>`__
 
 
@@ -137,6 +140,38 @@ StreamThreshold
 
 For more information of all methods and properties, see `CountMinSketch`_.
 
+QuotientFilter
+------------------
+
+Quotient filters are an approximate membership query filter (AMQ) that is both
+space efficient and returns a zero false negative rate and a probabilistic false
+positive rate. Unlike Bloom filters, the quotient filter only requires a single
+hash of the element to insert. The upper **q** bits denote the location within the
+filter while the lower **r** bits are stored in the filter.
+
+Quotient filters provide some useful benefits over Bloom filters including:
+
+* Merging of two filters (not union)
+* Resizing of the filter
+* Ability to remove elements
+
+`Further Reading <https://en.wikipedia.org/wiki/Quotient_filter>`__
+
+QuotientFilter
++++++++++++++++++++++++++++++++
+
+.. autoclass:: probables.QuotientFilter
+    :members:
+
+
+Utilities
+------------------
+
+Bitarray
++++++++++++++++++++++++++++++++
+
+.. autoclass:: probables.utilities.Bitarray
+    :members:
 
 Exceptions
 ============================

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -103,7 +103,14 @@
 # further.  For a list of options available for each theme, see the
 # documentation.
 #
-# html_theme_options = {}
+
+html_theme_options = {
+    # "collapse_navigation": True,
+    # "sticky_navigation": True,
+    # "navigation_depth": 4,
+    # "includehidden": True,
+    # "titles_only": False,
+}
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -1,6 +1,7 @@
 .. _home:
 .. include:: ../../README.rst
 
+
 .. toctree::
 
     code

diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
@@ -253,6 +253,34 @@ The counting cuckoo filter is similar to the standard filter except that it
 tracks the number of times a fingerprint has been added to the filter.
 
 
+Quotient Filters
+----------------
+
+Quotient Filters provide set operations of large datasets while being relatively
+small in memory footprint. They provide a zero percent false negative rate and a
+small false positive rate.
+`more information <https://en.wikipedia.org/wiki/Quotient_filter>`__
+
+
+Import, Initialize, and Train
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: python3
+
+    >>> qf = QuotientFilter(quotient=22)
+    >>> with open('war_and_peace.txt', 'r') as fp:
+    >>>     for line in fp:
+    >>>         for word in line.split():
+    >>>             qf.add(word.lower())  # add each word to the quotient filter!
+
+
+Query the Quotient Filter
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code:: python3
+
+    >>> words_to_check = ['borzoi', 'diametrically', 'fleches', 'rain', 'foo']
+    >>> for word in words_to_check:
+    >>>     print(qf.check(word))  # prints: True, True, True, True, False
+
 Custom Hashing Functions
 ----------------------------------
 In many instances, to get the best raw performance out of the data structures,

diff --git a/probables/__init__.py b/probables/__init__.py
@@ -1,19 +1,7 @@
 """ pyprobables module """
 
-from .blooms import (
-    BloomFilter,
-    BloomFilterOnDisk,
-    CountingBloomFilter,
-    ExpandingBloomFilter,
-    RotatingBloomFilter,
-)
-from .countminsketch import (
-    CountMeanMinSketch,
-    CountMeanSketch,
-    CountMinSketch,
-    HeavyHitters,
-    StreamThreshold,
-)
+from .blooms import BloomFilter, BloomFilterOnDisk, CountingBloomFilter, ExpandingBloomFilter, RotatingBloomFilter
+from .countminsketch import CountMeanMinSketch, CountMeanSketch, CountMinSketch, HeavyHitters, StreamThreshold
 from .cuckoo import CountingCuckooFilter, CuckooFilter
 from .exceptions import (
     CuckooFilterFullError,
@@ -22,6 +10,8 @@
     ProbablesBaseException,
     RotatingBloomFilterError,
 )
+from .quotientfilter import QuotientFilter
+from .utilities import Bitarray
 
 __author__ = "Tyler Barrus"
 __maintainer__ = "Tyler Barrus"
@@ -50,4 +40,6 @@
     "ExpandingBloomFilter",
     "RotatingBloomFilter",
     "RotatingBloomFilterError",
+    "QuotientFilter",
+    "Bitarray",
 ]
diff --git a/probables/blooms/bloom.py b/probables/blooms/bloom.py
@@ -286,7 +286,7 @@ def export(self, file: Union[Path, str, IOBase, mmap]) -> None:
         """Export the Bloom Filter to disk
 
         Args:
-            filename (str): The filename to which the Bloom Filter will be written."""
+            file (str): The file or filepath to which the Bloom Filter will be written."""
         if not isinstance(file, (IOBase, mmap)):
             file = resolve_path(file)
             with open(file, "wb") as filepointer:
@@ -658,23 +658,23 @@ def close(self) -> None:
             self.__file_pointer.close()
             self.__file_pointer = None
 
-    def export(self, filename: Union[str, Path]) -> None:  # type: ignore
+    def export(self, file: Union[str, Path]) -> None:  # type: ignore
         """Export to disk if a different location
 
         Args:
-            filename (str): The filename to which the Bloom Filter will be exported
+            file (str|Path): The filename to which the Bloom Filter will be exported
         Note:
             Only exported if the filename is not the original filename"""
         self.__update()
-        if filename and Path(filename) != self._filepath:
-            copyfile(self._filepath.name, str(filename))
+        if file and Path(file) != self._filepath:
+            copyfile(self._filepath.name, str(file))
         # otherwise, nothing to do!
 
-    def _load(self, filepath: Union[str, Path], hash_function: Union[HashFuncT, None] = None):  # type: ignore
+    def _load(self, file: Union[str, Path], hash_function: Union[HashFuncT, None] = None):  # type: ignore
         """load the Bloom Filter on disk"""
         # read the file, set the optimal params
         # mmap everything
-        file = resolve_path(filepath)
+        file = resolve_path(file)
         with open(file, "r+b") as filepointer:
             offset = self._FOOTER_STRUCT.size
             filepointer.seek(offset * -1, os.SEEK_END)
@@ -683,7 +683,7 @@ def _load(self, filepath: Union[str, Path], hash_function: Union[HashFuncT, None
             fpr, n_hashes, n_bits = self._get_optimized_params(est_els, fpr)
             self._set_values(est_els, fpr, n_hashes, n_bits, hash_function)
         # setup a few additional items
-        self.__file_pointer = open(filepath, "r+b")  # type: ignore
+        self.__file_pointer = open(file, "r+b")  # type: ignore
         self._bloom = mmap(self.__file_pointer.fileno(), 0)  # type: ignore
         self._on_disk = True
 

diff --git a/probables/hashes.py b/probables/hashes.py
@@ -5,7 +5,7 @@
 from struct import unpack
 from typing import Callable, List, Union
 
-from .constants import UINT64_T_MAX
+from .constants import UINT32_T_MAX, UINT64_T_MAX
 
 KeyT = Union[str, bytes]
 SimpleHashT = Callable[[KeyT, int], int]
@@ -103,6 +103,26 @@ def fnv_1a(key: KeyT, seed: int = 0) -> int:
     return hval
 
 
+def fnv_1a_32(key: KeyT, seed: int = 0) -> int:
+    """Pure python implementation of the 32 bit fnv-1a hash
+    Args:
+        key (str|bytes): The element to be hashed; a str is hashed per character (ord), not per encoded byte
+        seed (int): Add a seed to the initial starting point (0 means no seed)
+    Returns:
+        int: 32-bit hashed representation of key
+    Note:
+        Uses the lower 32 bits when overflows occur"""
+    max32mod = UINT32_T_MAX + 1  # modulus used to wrap every intermediate value
+    hval = (0x811C9DC5 + (31 * seed)) % max32mod  # starting value, shifted by 31 * seed and wrapped
+    fnv_32_prime = 0x01000193
+    tmp = list(key) if not isinstance(key, str) else list(map(ord, key))  # str -> ord() of each char; otherwise the key's own items
+    for t_str in tmp:
+        hval ^= t_str  # xor the element in first...
+        hval *= fnv_32_prime  # ...then multiply by the prime
+        hval %= max32mod  # wrap after every step so hval never grows past the modulus
+    return hval
+
+
 @hash_with_depth_bytes
 def default_md5(key: KeyT, *args, **kwargs) -> bytes:
     """The default md5 hashing routine

diff --git a/probables/quotientfilter/__init__.py b/probables/quotientfilter/__init__.py
@@ -0,0 +1,6 @@
+""" Quotient Filters """
+
+
+from .quotientfilter import QuotientFilter
+
+__all__ = ["QuotientFilter"]
diff --git a/probables/quotientfilter/py.typed b/probables/quotientfilter/py.typed