Skip to content

Commit

Permalink
parameterize quotient filter
Browse files Browse the repository at this point in the history
  • Loading branch information
barrust committed Jan 3, 2024
1 parent 52e6fd9 commit a25ce32
Show file tree
Hide file tree
Showing 5 changed files with 179 additions and 86 deletions.
2 changes: 1 addition & 1 deletion .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -535,7 +535,7 @@ function-naming-style=snake_case
#function-rgx=

# Good variable names which should always be accepted, separated by a comma.
good-names=i,j,k,b,f,v,m,n,p,d,hh,st,ex,Run,_
good-names=i,j,k,b,f,v,m,n,p,d,hh,st,ex,Run,_,r,q

# Good variable names regexes, separated by a comma. If names match any regex,
# they will always be accepted
Expand Down
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# PyProbables Changelog

### Version 0.6.0

* Add `QuotientFilter` implementation; [see issue #37](https://github.com/barrust/pyprobables/issues/37)
* Add `bitarray` implementation

### Version 0.5.9

* Add `py.typed` files so that mypy will find type annotations
Expand Down
240 changes: 156 additions & 84 deletions probables/quotientfilter/quotientfilter.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,134 @@
# https://mecha-mind.medium.com/membership-queries-with-big-data-9e5046d3270f
""" Quotient Filter, python implementation
License: MIT
Author: Tyler Barrus ([email protected])
"""

from array import array

from probables.hashes import KeyT, fnv_1a_32
from probables.hashes import HashFuncT, KeyT, fnv_1a_32
from probables.utilities import Bitarray


def get_hash(x: KeyT, m: int):
return fnv_1a_32(x, 0) & ((1 << m) - 1)


class QuotientFilter:
def __init__(self): # needs to be parameterized
self._q = 24
self._r = 8
"""Simple Quotient Filter implementation
Args:
quotient (int): The size of the quotient to use
hash_function (function): Hashing strategy function to use `hf(key, number)`
Returns:
QuotientFilter: The initialized filter
Raises:
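ValueError: If `quotient` is less than 3 or greater than 31
Note:
The size of the QuotientFilter will be 2**quotient slots; each remainder uses (32 - quotient) bits"""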

__slots__ = (
"_q",
"_r",
"_size",
"_elements_added",
"_hash_func",
"_int_type_code",
"_bits_per_elm",
"_is_occupied",
"_is_continuation",
"_is_shifted",
"_filter",
)

def __init__(self, quotient: int = 20, hash_function: HashFuncT = None): # needs to be parameterized
if quotient < 3 or quotient > 31:
raise ValueError(
f"Invalid q setting for Quotient filter; q must be between 3 and 31; {quotient} was provided"
)
self._q = quotient
self._r = 32 - quotient
self._size = 1 << self._q # same as 2**q
self._elements_added = 0
self._hash_func = fnv_1a_32 if hash_function is None else hash_function

# ensure we use the smallest type possible to reduce memory wastage
if self._r <= 8:
self._int_type_code = "B"
self._bits_per_elm = 8
elif self._r <= 16:
self._int_type_code = "I"
self._bits_per_elm = 16
else:
self._int_type_code = "L"
self._bits_per_elm = 32

self._is_occupied = Bitarray(self._size)
self._is_continuation = Bitarray(self._size)
self._is_shifted = Bitarray(self._size)
self._filter = array(self._int_type_code, [0]) * self._size

def __contains__(self, val: KeyT) -> bool:
"""setup the `in` keyword"""
return self.contains(val)

@property
def quotient(self) -> int:
"""int: The size of the quotient, in bits"""
return self._q

@property
def remainder(self) -> int:
"""int: The size of the remainder, in bits"""
return self._r

@property
def num_elements(self) -> int:
"""int: The total size of the filter"""
return self._size

@property
def elements_added(self) -> int:
"""int: The number of elements added to the filter"""
return self._elements_added

@property
def bits_per_elm(self):
"""int: The number of bits used per element"""
return self._bits_per_elm

def _get_hash(self, key: KeyT, m: int = 32):
return self._hash_func(key, 0) & ((1 << m) - 1) # ensure that we only get 32 bits

# External properties
def add(self, key: KeyT) -> None:
_hash = self._hash_func(key)
key_quotient = _hash >> self._r
key_remainder = _hash & ((1 << self._r) - 1)

self.is_occupied_arr = Bitarray(self._size)
self.is_continuation_arr = Bitarray(self._size)
self.is_shifted_arr = Bitarray(self._size)
self._filter = array("I", [0]) * self._size
if not self._contains(key_quotient, key_remainder):
# TODO, add it here
self._add(key_quotient, key_remainder)

# TODO: Add properties
def contains(self, key: KeyT) -> bool:
_hash = self._hash_func(key)
key_quotient = _hash >> self._r
key_remainder = _hash & ((1 << self._r) - 1)
return self._contains(key_quotient, key_remainder)

def shift_insert(self, k, v, start, j, flag):
if self.is_occupied_arr[j] == 0 and self.is_continuation_arr[j] == 0 and self.is_shifted_arr[j] == 0:
def _shift_insert(self, k, v, start, j, flag):
if self._is_occupied[j] == 0 and self._is_continuation[j] == 0 and self._is_shifted[j] == 0:
self._filter[j] = v
self.is_occupied_arr[k] = 1
self.is_continuation_arr[j] = 1 if j != start else 0
self.is_shifted_arr[j] = 1 if j != k else 0
self._is_occupied[k] = 1
self._is_continuation[j] = 1 if j != start else 0
self._is_shifted[j] = 1 if j != k else 0

else:
# print("using shift insert")
i = (j + 1) & (self._size - 1)

while True:
f = self.is_occupied_arr[i] + self.is_continuation_arr[i] + self.is_shifted_arr[i]
f = self._is_occupied[i] + self._is_continuation[i] + self._is_shifted[i]

temp = self.is_continuation_arr[i]
self.is_continuation_arr[i] = self.is_continuation_arr[j]
self.is_continuation_arr[j] = temp
temp = self._is_continuation[i]
self._is_continuation[i] = self._is_continuation[j]
self._is_continuation[j] = temp

self.is_shifted_arr[i] = 1
self._is_shifted[i] = 1

temp = self._filter[i]
self._filter[i] = self._filter[j]
Expand All @@ -54,28 +140,28 @@ def shift_insert(self, k, v, start, j, flag):
i = (i + 1) & (self._size - 1)

self._filter[j] = v
self.is_occupied_arr[k] = 1
self.is_continuation_arr[j] = 1 if j != start else 0
self.is_shifted_arr[j] = 1 if j != k else 0
self._is_occupied[k] = 1
self._is_continuation[j] = 1 if j != start else 0
self._is_shifted[j] = 1 if j != k else 0

if flag == 1:
self.is_continuation_arr[(j + 1) & (self._size - 1)] = 1
self._is_continuation[(j + 1) & (self._size - 1)] = 1

def get_start_index(self, k):
def _get_start_index(self, k):
j = k
cnts = 0

while True:
if j == k or self.is_occupied_arr[j] == 1:
if j == k or self._is_occupied[j] == 1:
cnts += 1

if self.is_shifted_arr[j] == 1:
if self._is_shifted[j] == 1:
j = (j - 1) & (self._size - 1)
else:
break

while True:
if self.is_continuation_arr[j] == 0:
if self._is_continuation[j] == 0:
if cnts == 1:
break
cnts -= 1
Expand All @@ -84,70 +170,56 @@ def get_start_index(self, k):

return j

def add(self, key: KeyT):
if self.contains(key) is False:
_hash = get_hash(key, self._q + self._r)
key_quotient = _hash >> self._r
key_remainder = _hash & ((1 << self._r) - 1)

if (
self.is_occupied_arr[key_quotient] == 0
and self.is_continuation_arr[key_quotient] == 0
and self.is_shifted_arr[key_quotient] == 0
):
self._filter[key_quotient] = key_remainder
self.is_occupied_arr[key_quotient] = 1
def _add(self, q: int, r: int):
if self._is_occupied[q] == 0 and self._is_continuation[q] == 0 and self._is_shifted[q] == 0:
self._filter[q] = r
self._is_occupied[q] = 1

else:
j = self.get_start_index(key_quotient)
else:
j = self._get_start_index(q)

if self.is_occupied_arr[key_quotient] == 0:
self.shift_insert(key_quotient, key_remainder, j, j, 0)
if self._is_occupied[q] == 0:
self._shift_insert(q, r, j, j, 0)

else:
u = j
starts = 0
f = self.is_occupied_arr[j] + self.is_continuation_arr[j] + self.is_shifted_arr[j]

while starts == 0 and f != 0 and key_remainder > self._filter[j]:
j = (j + 1) & (self._size - 1)
else:
u = j
starts = 0
f = self._is_occupied[j] + self._is_continuation[j] + self._is_shifted[j]

if self.is_continuation_arr[j] == 0:
starts += 1
while starts == 0 and f != 0 and r > self._filter[j]:
j = (j + 1) & (self._size - 1)

f = self.is_occupied_arr[j] + self.is_continuation_arr[j] + self.is_shifted_arr[j]
if self._is_continuation[j] == 0:
starts += 1

if starts == 1:
self.shift_insert(key_quotient, key_remainder, u, j, 0)
else:
self.shift_insert(key_quotient, key_remainder, u, j, 1)
self._elements_added += 1
f = self._is_occupied[j] + self._is_continuation[j] + self._is_shifted[j]

def contains(self, key: KeyT):
_hash = get_hash(key, self._q + self._r)
key_quotient = _hash >> self._r
key_remainder = _hash & ((1 << self._r) - 1)
if starts == 1:
self._shift_insert(q, r, u, j, 0)
else:
self._shift_insert(q, r, u, j, 1)
self._elements_added += 1

if self.is_occupied_arr[key_quotient] == 0:
def _contains(self, q: int, r: int) -> bool:
if self._is_occupied[q] == 0:
return False

else:
j = self.get_start_index(key_quotient)
j = self._get_start_index(q)

starts = 0
f = self.is_occupied_arr[j] + self.is_continuation_arr[j] + self.is_shifted_arr[j]
starts = 0
f = self._is_occupied[j] + self._is_continuation[j] + self._is_shifted[j]

while f != 0:
if self.is_continuation_arr[j] == 0:
starts += 1
while f != 0:
if self._is_continuation[j] == 0:
starts += 1

if starts == 2 or self._filter[j] > key_remainder:
break
if starts == 2 or self._filter[j] > r:
break

if self._filter[j] == key_remainder:
return True
if self._filter[j] == r:
return True

j = (j + 1) & (self._size - 1)
f = self.is_occupied_arr[j] + self.is_continuation_arr[j] + self.is_shifted_arr[j]
j = (j + 1) & (self._size - 1)
f = self._is_occupied[j] + self._is_continuation[j] + self._is_shifted[j]

return False
return False
17 changes: 16 additions & 1 deletion probables/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,15 @@ def read(self, n: int = -1) -> bytes:


class Bitarray:
"""Simplified, pure python bitarray implementation using as little memory as possible"""
"""Simplified, pure python bitarray implementation using as little memory as possible
Args:
size (int): The number of bits in the bitarray
Returns:
Bitarray: A bitarray
Raises:
TypeError:
ValueError:"""

def __init__(self, size: int):
if not isinstance(size, int):
Expand Down Expand Up @@ -179,3 +187,10 @@ def as_string(self):
Returns:
str: Bitarray representation as a string"""
return "".join([str(self.check_bit(x)) for x in range(self._size)])

def num_bits_set(self) -> int:
"""Number of bits set in the bitarray
Returns:
int: Number of bits set"""
return sum([self.check_bit(x) for x in range(self._size)])
1 change: 1 addition & 0 deletions tests/test_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ def test_bitarray(self):
ba.as_string(),
"1001001001001001001001001001001001001001001001001001001001001001001001001001001001001001001001001000",
)
self.assertEqual(ba.num_bits_set(), 33)
self.assertTrue(ba.is_bit_set(3))
self.assertFalse(ba.is_bit_set(4))
self.assertEqual(ba[0], 1)
Expand Down

0 comments on commit a25ce32

Please sign in to comment.