Skip to content
Merged
Changes from 1 commit
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
173ab0e
Bloom filter with tests
isidroas Apr 6, 2023
08bc970
has functions constant
isidroas Apr 6, 2023
0448109
fix type
isidroas Apr 6, 2023
486dcbc
isort
isidroas Apr 6, 2023
4111807
passing ruff
isidroas Apr 6, 2023
e6ce098
type hints
isidroas Apr 6, 2023
e4d39db
type hints
isidroas Apr 6, 2023
7629686
from fail to erro
isidroas Apr 6, 2023
3926167
captital leter
isidroas Apr 6, 2023
280ffa0
type hints requested by boot
isidroas Apr 6, 2023
5d460aa
descriptive name for m
isidroas Apr 6, 2023
cc54095
more descriptibe arguments II
isidroas Apr 6, 2023
78d19fd
moved movies_test to doctest
isidroas Apr 7, 2023
8b1bec0
commented doctest
isidroas Apr 7, 2023
28e6691
removed test_probability
isidroas Apr 7, 2023
2fd7196
estimated error
isidroas Apr 7, 2023
314237d
added types
isidroas Apr 7, 2023
9b01472
again hash_
isidroas Apr 7, 2023
c132d50
Update data_structures/hashing/bloom_filter.py
isidroas Apr 8, 2023
313c80c
from b to bloom
isidroas Apr 8, 2023
18e0dde
Update data_structures/hashing/bloom_filter.py
isidroas Apr 8, 2023
54041ff
Update data_structures/hashing/bloom_filter.py
isidroas Apr 8, 2023
483a2a0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 8, 2023
174ce08
syntax error in dict comprehension
isidroas Apr 8, 2023
00cc60e
from goodfather to godfather
isidroas Apr 8, 2023
35fa5f5
removed Interestellar
isidroas Apr 8, 2023
5cd20ea
forgot the last Godfather
isidroas Apr 8, 2023
7617143
Revert "removed Interestellar"
isidroas Apr 8, 2023
799171a
pretty dict
isidroas Apr 8, 2023
1a71f4c
Apply suggestions from code review
cclauss Apr 8, 2023
4e0263f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 8, 2023
e746746
Update bloom_filter.py
cclauss Apr 8, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Bloom filter with tests
  • Loading branch information
isidroas committed Apr 7, 2023
commit 173ab0ea96b4969b51f4d23f033a45242fe7e80a
103 changes: 103 additions & 0 deletions data_structures/hashing/bloom_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
"""
See https://en.wikipedia.org/wiki/Bloom_filter
"""
from hashlib import sha256, md5
from random import randint, choices
import string


class Bloom:
    """
    Bloom filter that stores its bits in a single Python integer.

    Each value is hashed with every function in ``HASH_FUNCTIONS``; each digest
    selects one bit position in ``range(size)``. A value therefore sets at most
    ``len(HASH_FUNCTIONS)`` bits (fewer when two digests pick the same position).
    Membership tests may report false positives but never false negatives.

    >>> bloom = Bloom(size=8)
    >>> bloom.format_bin(bloom.bitstring)
    '00000000'
    >>> bloom.exists("titanic")
    False
    >>> bloom.add("titanic")
    >>> bloom.exists("titanic")
    True
    >>> "titanic" in bloom
    True
    """

    # Hash constructors applied to every value; one bit is derived from each.
    HASH_FUNCTIONS = (sha256, md5)

    def __init__(self, size: int = 8) -> None:
        """
        Create an empty filter with ``size`` bits.

        Raises:
            ValueError: if ``size`` is smaller than 1. A zero size would make
                the modulo in ``hash`` divide by zero, and a negative size
                would produce negative bit positions.
        """
        if size < 1:
            raise ValueError("size must be a positive integer")
        self.bitstring = 0b0
        self.size = size

    def add(self, value: str) -> None:
        """Set the bits selected by ``value`` in the filter."""
        self.bitstring |= self.hash(value)

    def exists(self, value: str) -> bool:
        """
        Return True if every bit selected by ``value`` is set in the filter.

        True means "possibly added"; False means "definitely not added".
        """
        mask = self.hash(value)
        return (mask & self.bitstring) == mask

    def __contains__(self, value: str) -> bool:
        """Support ``value in bloom`` as an alias for ``exists``."""
        return self.exists(value)

    def format_bin(self, value: int) -> str:
        """Return ``value`` in binary, left-padded with zeros to ``size`` digits."""
        return bin(value)[2:].zfill(self.size)

    def hash(self, value: str) -> int:
        """
        Return the bit mask for ``value``.

        The mask has one bit set per hash function, at the position given by the
        digest (read as a little-endian integer) modulo ``size``.
        """
        mask = 0b0
        for func in self.HASH_FUNCTIONS:
            digest = func(value.encode()).digest()
            position = int.from_bytes(digest, "little") % self.size
            mask |= 1 << position
        return mask


def test_movies():
    """Smoke-test the filter: titles that were added must be reported present."""
    bloom = Bloom()
    stored = ("titanic", "avatar")
    for title in stored:
        bloom.add(title)

    for title in stored:
        assert bloom.exists(title)

    # Titles that were never added may come back either way, because false
    # positives are allowed; these assertions only exercise the lookup.
    for title in ("the goodfather", "interstellar", "Parasite", "Pulp fiction"):
        assert bloom.exists(title) in (True, False)


def random_string(size):
    """Return ``size`` random characters drawn from lowercase letters and space."""
    alphabet = string.ascii_lowercase + " "
    picked = choices(alphabet, k=size)
    return "".join(picked)


def test_probability(m=64, n=20):
    """
    Compare the measured false-positive rate with the rate estimated from how
    full the filter is.

    ``m`` is the filter size in bits and ``n`` is how many random strings are
    generated for insertion.
    """
    bloom = Bloom(size=m)

    for member in {random_string(10) for _ in range(n)}:
        bloom.add(member)

    # number of hash functions is fixed
    k = 2

    # Estimate from the actual fill level: all k probed bits must already be set.
    n_ones = bin(bloom.bitstring).count("1")
    expected_probability = (n_ones / m) ** k

    # Closed-form estimate, computed only so it can be printed for comparison.
    expected_probability_wikipedia = (1 - (1 - 1 / m) ** (k * n)) ** k

    not_added = {random_string(10) for _ in range(1000)}
    fails = sum(1 for candidate in not_added if bloom.exists(candidate))
    fail_rate = fails / len(not_added)

    print(f"total = {len(not_added)}, fails = {fails}, fail_rate = {fail_rate}")
    print(f"{expected_probability=}")
    print(f"{expected_probability_wikipedia=}")

    # 5% margin calculated experimentally
    deviation = abs(expected_probability - fail_rate)
    assert deviation <= 0.05


if __name__ == "__main__":
    # Run both checks, in order, when the module is executed as a script.
    for check in (test_movies, test_probability):
        check()