-
Notifications
You must be signed in to change notification settings - Fork 8
Feature extend binary format #505
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
e77c624
fa8d267
101f404
cb9e6cc
6dc218c
56af9c8
f2257c1
312038f
ffb93b5
4ea1e60
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -41,11 +41,12 @@ def binary_format(encoding_size): | |
|
|
||
| The binary format string can be understood as: | ||
| - "!" Use network byte order (big-endian). | ||
| - "I" store the entity ID as an unsigned int | ||
| - "<encoding size>s" Store the n (e.g. 128) raw bytes of the bitarray | ||
|
|
||
| https://docs.python.org/3/library/struct.html | ||
| """ | ||
| bit_packing_fmt = f"!{encoding_size}s" | ||
| bit_packing_fmt = f"!I{encoding_size}s" | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We are just going to pretend the old format doesn't exist aren't we? Should we take the opportunity to add a version byte to the format?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's an internal format, so yes, I pretend that it never existed. I'm old, I forget quickly.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was meaning a header for the file rather than for each encoding. But sure not worth it right now. |
||
| bit_packing_struct = struct.Struct(bit_packing_fmt) | ||
| return bit_packing_struct | ||
|
|
||
|
|
@@ -54,31 +55,41 @@ def binary_pack_filters(filters, encoding_size): | |
| """Efficient packing of bloomfilters. | ||
|
|
||
| :param filters: | ||
| An iterable of bytes as produced by deserialize_bytes. | ||
| An iterable of tuples, with | ||
| - first element is the entity ID as an unsigned int | ||
| - second element is 'encoding_size' number of bytes as produced by deserialize_bytes. | ||
|
|
||
| :return: | ||
| An iterable of bytes. | ||
| """ | ||
| bit_packing_struct = binary_format(encoding_size) | ||
|
|
||
| for hash_bytes in filters: | ||
| yield bit_packing_struct.pack(hash_bytes) | ||
| yield bit_packing_struct.pack(*hash_bytes) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ⭐️ |
||
|
|
||
|
|
||
| def binary_unpack_one(data, bit_packing_struct): | ||
| clk_bytes, = bit_packing_struct.unpack(data) | ||
| return clk_bytes | ||
| entity_id, clk_bytes, = bit_packing_struct.unpack(data) | ||
| return entity_id, clk_bytes | ||
|
|
||
|
|
||
| def binary_unpack_filters(streamable_data, max_bytes=None, encoding_size=None): | ||
| def binary_unpack_filters(data_iterable, max_bytes=None, encoding_size=None): | ||
| """ | ||
| Unpack filters that were packed with the 'binary_pack_filters' method. | ||
|
|
||
| :param data_iterable: an iterable of binary packed filters. | ||
| :param max_bytes: if present, only read up to 'max_bytes' bytes. | ||
| :param encoding_size: the encoding size of one filter, excluding the entity ID info | ||
|
||
| :return: list of filters with their corresponding entity IDs as a list of tuples. | ||
| """ | ||
| assert encoding_size is not None | ||
| bit_packed_element = binary_format(encoding_size) | ||
| bit_packed_element_size = bit_packed_element.size | ||
| filters = [] | ||
| bytes_consumed = 0 | ||
|
|
||
| logger.info(f"Unpacking stream of encodings with size {encoding_size} - packed as {bit_packed_element_size}") | ||
| for raw_bytes in streamable_data.stream(bit_packed_element_size): | ||
| logger.info(f"Iterating over encodings of size {encoding_size} - packed as {bit_packed_element_size}") | ||
| for raw_bytes in data_iterable: | ||
| filters.append(binary_unpack_one(raw_bytes, bit_packed_element)) | ||
|
|
||
| bytes_consumed += bit_packed_element_size | ||
|
|
@@ -185,6 +196,6 @@ def get_chunk_from_object_store(chunk_info, encoding_size=128): | |
| bit_packed_element_size * chunk_range_start, | ||
| chunk_bytes) | ||
|
|
||
| chunk_data = binary_unpack_filters(chunk_stream, chunk_bytes, encoding_size) | ||
| chunk_data = binary_unpack_filters(chunk_stream.stream(bit_packed_element_size), chunk_bytes, encoding_size) | ||
|
|
||
| return chunk_data, chunk_length | ||
Uh oh!
There was an error while loading. Please reload this page.