support partitions metadata table
sungwy committed Apr 13, 2024
commit 6f4f0f33d7d6ebd20944717ab915108fa5978c8c
21 changes: 14 additions & 7 deletions pyiceberg/table/__init__.py
@@ -3416,12 +3416,7 @@ def partitions(self) -> "pa.Table":

from pyiceberg.io.pyarrow import schema_to_pyarrow

partition_record = self.tbl.metadata.specs_struct()
pa_record_struct = schema_to_pyarrow(partition_record)

partitions_schema = pa.schema([
pa.field('partition', pa_record_struct, nullable=False),
pa.field('spec_id', pa.int32(), nullable=False),
table_schema = pa.schema([
pa.field('record_count', pa.int64(), nullable=False),
pa.field('file_count', pa.int32(), nullable=False),
pa.field('total_data_file_size_in_bytes', pa.int64(), nullable=False),
@@ -3433,6 +3428,18 @@ def partitions(self) -> "pa.Table":
pa.field('last_updated_snapshot_id', pa.int64(), nullable=True),
])

partition_record = self.tbl.metadata.specs_struct()
has_partitions = len(partition_record.fields) > 0

if has_partitions:
pa_record_struct = schema_to_pyarrow(partition_record)
partitions_schema = pa.schema([
pa.field('partition', pa_record_struct, nullable=False),
pa.field('spec_id', pa.int32(), nullable=False),
])

table_schema = pa.unify_schemas([partitions_schema, table_schema])

def update_partitions_map(
partitions_map: Dict[Tuple[str, Any], Any],
file: DataFile,
@@ -3489,7 +3496,7 @@ def update_partitions_map(

return pa.Table.from_pylist(
partitions_map.values(),
schema=partitions_schema,
schema=table_schema,
)


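For context on the schema handling above: with this change, an unpartitioned table reports only the metrics columns, while a partitioned table gets `partition` and `spec_id` prepended by merging the two schemas with `pa.unify_schemas`. A minimal sketch of that merge, using a hypothetical `dt_month` partition field that is not taken from the PR:

```python
import pyarrow as pa

# Metrics columns shared by every table (subset shown for brevity).
table_schema = pa.schema([
    pa.field('record_count', pa.int64(), nullable=False),
    pa.field('file_count', pa.int32(), nullable=False),
])

# Hypothetical partition struct for a table partitioned by months(dt).
partition_struct = pa.struct([pa.field('dt_month', pa.int32())])
partitions_schema = pa.schema([
    pa.field('partition', partition_struct, nullable=False),
    pa.field('spec_id', pa.int32(), nullable=False),
])

# unify_schemas keeps the fields of the first schema first, so the partition
# columns lead the result: ['partition', 'spec_id', 'record_count', 'file_count']
print(pa.unify_schemas([partitions_schema, table_schema]).names)
```

For an unpartitioned table the `if has_partitions:` branch is skipped and `table_schema` is used unchanged, which is what the unpartitioned test below asserts.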
121 changes: 118 additions & 3 deletions tests/integration/test_inspect_table.py
@@ -164,9 +164,6 @@ def test_inspect_entries(
for value in df[int_column]:
assert isinstance(value.as_py(), int)

for snapshot_id in df['snapshot_id']:
assert isinstance(snapshot_id.as_py(), int)

lhs = df.to_pandas()
rhs = spark.table(f"{identifier}.entries").toPandas()
for column in df.column_names:
@@ -266,3 +263,121 @@ def test_inspect_entries_partitioned(spark: SparkSession, session_catalog: Catal

assert df.to_pydict()['data_file'][0]['partition'] == {'dt_day': date(2021, 2, 1), 'dt_month': None}
assert df.to_pydict()['data_file'][1]['partition'] == {'dt_day': None, 'dt_month': 612}


@pytest.mark.integration
@pytest.mark.parametrize("format_version", [1, 2])
def test_inspect_partitions_unpartitioned(
spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
) -> None:
identifier = "default.table_metadata_partitions_unpartitioned"
tbl = _create_table(session_catalog, identifier, properties={"format-version": format_version})

# Write some data through multiple commits
tbl.append(arrow_table_with_null)
tbl.append(arrow_table_with_null)

df = tbl.inspect.partitions()
assert df.column_names == [
'record_count',
'file_count',
'total_data_file_size_in_bytes',
'position_delete_record_count',
'position_delete_file_count',
'equality_delete_record_count',
'equality_delete_file_count',
'last_updated_at',
'last_updated_snapshot_id',
]
for last_updated_at in df['last_updated_at']:
assert isinstance(last_updated_at.as_py(), datetime)

int_cols = [
'record_count',
'file_count',
'total_data_file_size_in_bytes',
'position_delete_record_count',
'position_delete_file_count',
'equality_delete_record_count',
'equality_delete_file_count',
'last_updated_snapshot_id',
]
for column in int_cols:
for value in df[column]:
assert isinstance(value.as_py(), int)
lhs = df.to_pandas()
rhs = spark.table(f"{identifier}.partitions").toPandas()
for column in df.column_names:
for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
assert left == right, f"Difference in column {column}: {left} != {right}"


@pytest.mark.integration
@pytest.mark.parametrize("format_version", [1, 2])
def test_inspect_partitions_partitioned(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None:
identifier = "default.table_metadata_partitions_partitioned"
try:
session_catalog.drop_table(identifier=identifier)
except NoSuchTableError:
pass

spark.sql(
f"""
CREATE TABLE {identifier} (
name string,
dt date
)
PARTITIONED BY (months(dt))
"""
)

spark.sql(
f"""
INSERT INTO {identifier} VALUES ('John', CAST('2021-01-01' AS date))
"""
)

spark.sql(
f"""
INSERT INTO {identifier} VALUES ('Doe', CAST('2021-01-05' AS date))
"""
)

spark.sql(
f"""
ALTER TABLE {identifier}
REPLACE PARTITION FIELD dt_month WITH days(dt)
"""
)

spark.sql(
f"""
INSERT INTO {identifier} VALUES ('Jenny', CAST('2021-02-01' AS date))
"""
)

spark.sql(
f"""
ALTER TABLE {identifier}
DROP PARTITION FIELD dt_day
"""
)

spark.sql(
f"""
INSERT INTO {identifier} VALUES ('James', CAST('2021-02-01' AS date))
"""
)

df = session_catalog.load_table(identifier).inspect.partitions()

lhs = df.to_pandas()
rhs = spark.table(f"{identifier}.partitions").toPandas()

lhs.sort_values('spec_id', inplace=True)
rhs.sort_values('spec_id', inplace=True)
for column in df.column_names:
for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
if column == "partition":
right = right.asDict()
assert left == right, f"Difference in column {column}: {left} != {right}"