Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
lint fixes
  • Loading branch information
rambleraptor committed Dec 8, 2025
commit 71dd92510dee5785a1fe6a9e1eec804806ee6b29
15 changes: 8 additions & 7 deletions pyiceberg/table/puffin.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
import io
import math
import zlib
from typing import TYPE_CHECKING, Dict, Iterable, List, Literal, Optional
from collections.abc import Iterable
from typing import TYPE_CHECKING, Literal

from pydantic import Field
from pyroaring import BitMap, FrozenBitMap
Expand Down Expand Up @@ -65,9 +66,9 @@ def _deserialize_bitmap(pl: bytes) -> list[BitMap]:
return bitmaps


def _serialize_bitmaps(bitmaps: Dict[int, BitMap]) -> bytes:
def _serialize_bitmaps(bitmaps: dict[int, BitMap]) -> bytes:
"""
Serializes a dictionary of bitmaps into a byte array.
Serialize a dictionary of bitmaps into a byte array.

The format is:
- 8 bytes: number of bitmaps (little-endian)
Expand Down Expand Up @@ -149,8 +150,8 @@ def to_vector(self) -> dict[str, "pa.ChunkedArray"]:


class PuffinWriter:
_blobs: List[PuffinBlobMetadata]
_blob_payloads: List[bytes]
_blobs: list[PuffinBlobMetadata]
_blob_payloads: list[bytes]

def __init__(self) -> None:
self._blobs = []
Expand All @@ -162,7 +163,7 @@ def add(
referenced_data_file: str,
) -> None:
# 1. Create bitmaps from positions
bitmaps: Dict[int, BitMap] = {}
bitmaps: dict[int, BitMap] = {}
cardinality = 0
for pos in positions:
cardinality += 1
Copy link
Contributor

@geruh geruh Dec 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cardinality could be incorrect when duplicate positions are passed in; we can probably use the pyroaring stats to get this instead.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I made the proper change; let me know if you're thinking differently.

Expand Down Expand Up @@ -219,7 +220,7 @@ def finish(self) -> bytes:
for blob_payload in self._blob_payloads:
payload_buffer.write(blob_payload)

updated_blobs_metadata: List[PuffinBlobMetadata] = []
updated_blobs_metadata: list[PuffinBlobMetadata] = []
current_offset = 4 # Start after file magic (4 bytes)
for i, blob_payload in enumerate(self._blob_payloads):
original_metadata_dict = self._blobs[i].model_dump(by_alias=True, exclude_none=True)
Expand Down
14 changes: 7 additions & 7 deletions tests/table/test_puffin.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import pytest
from pyroaring import BitMap

from pyiceberg.table.puffin import _deserialize_bitmap, PuffinFile, PuffinWriter, PROPERTY_REFERENCED_DATA_FILE
from pyiceberg.table.puffin import PROPERTY_REFERENCED_DATA_FILE, PuffinFile, PuffinWriter, _deserialize_bitmap


def _open_file(file: str) -> bytes:
Expand Down Expand Up @@ -73,10 +73,10 @@ def test_map_high_vals() -> None:
_ = _deserialize_bitmap(puffin)


def test_puffin_round_trip():
def test_puffin_round_trip() -> None:
# Define some deletion positions for multiple files
deletions1 = [10, 20, 30]
deletions2 = [5, (1 << 32) + 1] # Test with a high-bit position
deletions2 = [5, (1 << 32) + 1] # Test with a high-bit position

file1_path = "path/to/data1.parquet"
file2_path = "path/to/data2.parquet"
Expand All @@ -92,7 +92,7 @@ def test_puffin_round_trip():

# Assert footer metadata
assert len(reader.footer.blobs) == 2

blob1_meta = reader.footer.blobs[0]
assert blob1_meta.properties[PROPERTY_REFERENCED_DATA_FILE] == file1_path
assert blob1_meta.properties["cardinality"] == str(len(deletions1))
Expand All @@ -103,15 +103,15 @@ def test_puffin_round_trip():

# Assert the content of deletion vectors
read_vectors = reader.to_vector()

assert file1_path in read_vectors
assert file2_path in read_vectors

assert read_vectors[file1_path].to_pylist() == sorted(deletions1)
assert read_vectors[file2_path].to_pylist() == sorted(deletions2)


def test_write_and_read_puffin_file():
def test_write_and_read_puffin_file() -> None:
writer = PuffinWriter()
writer.add(positions=[1, 2, 3], referenced_data_file="file1.parquet")
writer.add(positions=[4, 5, 6], referenced_data_file="file2.parquet")
Expand Down Expand Up @@ -139,7 +139,7 @@ def test_write_and_read_puffin_file():
assert vectors["file2.parquet"].to_pylist() == [4, 5, 6]


def test_puffin_file_with_no_blobs():
def test_puffin_file_with_no_blobs() -> None:
writer = PuffinWriter()
puffin_bytes = writer.finish()

Expand Down