Merged

Changes from 1 commit
review comments
Gowthami03B committed Apr 29, 2024
commit f7aee5e0d3cb737371c16c6002d8a4c218173d2b
71 changes: 70 additions & 1 deletion mkdocs/docs/api.md
@@ -619,6 +619,7 @@ pyarrow.Table
content: int8 not null
file_path: string not null
file_format: string not null
spec_id: int32 not null
record_count: int64 not null
file_size_in_bytes: int64 not null
column_sizes: map<int32, int64>
@@ -650,10 +651,34 @@ split_offsets: list<item: int64>
child 0, item: int64
equality_ids: list<item: int32>
child 0, item: int32
sort_order_id: int32 not null
readable_metrics: struct<city: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: string, upper_bound: string> not null, lat: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double> not null, long: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double> not null>
child 0, city: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: string, upper_bound: string> not null
child 0, column_size: int64
child 1, value_count: int64
child 2, null_value_count: int64
child 3, nan_value_count: int64
child 4, lower_bound: string
child 5, upper_bound: string
child 1, lat: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double> not null
child 0, column_size: int64
child 1, value_count: int64
child 2, null_value_count: int64
child 3, nan_value_count: int64
child 4, lower_bound: double
child 5, upper_bound: double
child 2, long: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double> not null
child 0, column_size: int64
child 1, value_count: int64
child 2, null_value_count: int64
child 3, nan_value_count: int64
child 4, lower_bound: double
child 5, upper_bound: double
----
content: [[0,0]]
file_path: [["s3://warehouse/default/table_metadata_files/data/00000-0-9ea7d222-6457-467f-bad5-6fb125c9aa5f.parquet","s3://warehouse/default/table_metadata_files/data/00000-0-afa8893c-de71-4710-97c9-6b01590d0c44.parquet"]]
file_format: [["PARQUET","PARQUET"]]
spec_id: [[0,0]]
record_count: [[3,3]]
file_size_in_bytes: [[5459,5459]]
column_sizes: [[keys:[1,2,3,4,5,...,8,9,10,11,12]values:[49,78,128,94,118,...,118,118,94,78,109],keys:[1,2,3,4,5,...,8,9,10,11,12]values:[49,78,128,94,118,...,118,118,94,78,109]]]
@@ -665,7 +690,51 @@ upper_bounds:[[keys:[1,2,3,4,5,...,8,9,10,11,12]values:[00,61,616161616161616161
key_metadata: [[0100,0100]]
split_offsets:[[[],[]]]
equality_ids:[[[],[]]]

sort_order_id:[[[],[]]]
readable_metrics: [
-- is_valid: all not null
-- child 0 type: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: string, upper_bound: string>
-- is_valid: all not null
-- child 0 type: int64
[140]
-- child 1 type: int64
[4]
-- child 2 type: int64
[0]
-- child 3 type: int64
[null]
-- child 4 type: string
["Amsterdam"]
-- child 5 type: string
["San Francisco"]
-- child 1 type: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double>
-- is_valid: all not null
-- child 0 type: int64
[135]
-- child 1 type: int64
[4]
-- child 2 type: int64
[0]
-- child 3 type: int64
[null]
-- child 4 type: double
[37.773972]
-- child 5 type: double
[53.11254]
-- child 2 type: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double>
-- is_valid: all not null
-- child 0 type: int64
[135]
-- child 1 type: int64
[4]
-- child 2 type: int64
[0]
-- child 3 type: int64
[null]
-- child 4 type: double
[-122.431297]
-- child 5 type: double
[6.0989]]
```

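The listing above comes from the table-inspection API this PR extends. For orientation, a minimal call-site sketch, assuming a catalog named `default` is configured and reusing the demo table from the output above (the `snapshot_id` argument is the parameter added in this change):

```python
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")
tbl = catalog.load_table("default.table_metadata_files")

# Data files reachable from the current snapshot:
files_df = tbl.inspect.files()

# Pin the listing to a specific snapshot via the new parameter:
files_df = tbl.inspect.files(snapshot_id=tbl.current_snapshot().snapshot_id)

print(files_df.column_names)
```
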
## Add Files
54 changes: 51 additions & 3 deletions pyiceberg/table/__init__.py
@@ -3537,13 +3537,35 @@ def update_partitions_map(
            schema=table_schema,
        )

-    def files(self) -> "pa.Table":
+    def files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
        import pyarrow as pa

        from pyiceberg.io.pyarrow import schema_to_pyarrow

        schema = self.tbl.metadata.schema()
        readable_metrics_struct = []

        def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
            pa_bound_type = schema_to_pyarrow(bound_type)
            return pa.struct([
                pa.field("column_size", pa.int64(), nullable=True),
                pa.field("value_count", pa.int64(), nullable=True),
                pa.field("null_value_count", pa.int64(), nullable=True),
                pa.field("nan_value_count", pa.int64(), nullable=True),
                pa.field("lower_bound", pa_bound_type, nullable=True),
                pa.field("upper_bound", pa_bound_type, nullable=True),
            ])

        for field in self.tbl.metadata.schema().fields:
            readable_metrics_struct.append(
                pa.field(schema.find_column_name(field.field_id), _readable_metrics_struct(field.field_type), nullable=False)
            )

        files_schema = pa.schema([
            pa.field('content', pa.int8(), nullable=False),
            pa.field('file_path', pa.string(), nullable=False),
-            pa.field('file_format', pa.string(), nullable=False),
+            pa.field('file_format', pa.dictionary(pa.int32(), pa.string()), nullable=False),
            pa.field('spec_id', pa.int32(), nullable=False),
            pa.field('record_count', pa.int64(), nullable=False),
            pa.field('file_size_in_bytes', pa.int64(), nullable=False),
            pa.field('column_sizes', pa.map_(pa.int32(), pa.int64()), nullable=True),
@@ -3555,22 +3577,46 @@ def files(self) -> "pa.Table":
            pa.field('key_metadata', pa.binary(), nullable=True),
            pa.field('split_offsets', pa.list_(pa.int64()), nullable=True),
            pa.field('equality_ids', pa.list_(pa.int32()), nullable=True),
            pa.field('sort_order_id', pa.int32(), nullable=True),
            pa.field('readable_metrics', pa.struct(readable_metrics_struct), nullable=True),
        ])

        files = []

-        snapshot = self.tbl.current_snapshot()
+        snapshot = self._get_snapshot(snapshot_id)
        if not snapshot:
            return pa.Table.from_pylist([], schema=files_schema)

        io = self.tbl.io
        for manifest_list in snapshot.manifests(io):
            for manifest_entry in manifest_list.fetch_manifest_entry(io):
                data_file = manifest_entry.data_file
                column_sizes = data_file.column_sizes or {}
                value_counts = data_file.value_counts or {}
                null_value_counts = data_file.null_value_counts or {}
                nan_value_counts = data_file.nan_value_counts or {}
                lower_bounds = data_file.lower_bounds or {}
                upper_bounds = data_file.upper_bounds or {}
                readable_metrics = {
                    schema.find_column_name(field.field_id): {
                        "column_size": column_sizes.get(field.field_id),
                        "value_count": value_counts.get(field.field_id),
                        "null_value_count": null_value_counts.get(field.field_id),
                        "nan_value_count": nan_value_counts.get(field.field_id),
                        "lower_bound": from_bytes(field.field_type, lower_bound)
                        if (lower_bound := lower_bounds.get(field.field_id))
                        else None,
                        "upper_bound": from_bytes(field.field_type, upper_bound)
                        if (upper_bound := upper_bounds.get(field.field_id))
                        else None,
                    }
                    for field in self.tbl.metadata.schema().fields
                }
                files.append({
                    'content': data_file.content,
                    'file_path': data_file.file_path,
                    'file_format': data_file.file_format,
                    'spec_id': data_file.spec_id,
                    'record_count': data_file.record_count,
                    'file_size_in_bytes': data_file.file_size_in_bytes,
                    'column_sizes': dict(data_file.column_sizes),
@@ -3582,6 +3628,8 @@ def files(self) -> "pa.Table":
                    'key_metadata': data_file.key_metadata,
                    'split_offsets': data_file.split_offsets,
                    'equality_ids': data_file.equality_ids,
                    'sort_order_id': data_file.sort_order_id,
                    'readable_metrics': readable_metrics,
                })

        return pa.Table.from_pylist(
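For context on the `readable_metrics` construction above: the lower and upper bounds arrive as raw bytes from the manifest and are decoded with `pyiceberg.conversions.from_bytes` according to each field's Iceberg type. A self-contained sketch of that conversion (the byte payloads are constructed inline for illustration, not read from a real manifest):

```python
import struct

from pyiceberg.conversions import from_bytes
from pyiceberg.types import DoubleType, StringType

# Iceberg's single-value serialization stores numeric bounds little-endian,
# so a double bound round-trips through struct.pack("<d", ...).
payload = struct.pack("<d", 52.371807)
assert from_bytes(DoubleType(), payload) == 52.371807

# String bounds are UTF-8 bytes and decode back to str.
assert from_bytes(StringType(), b"Amsterdam") == "Amsterdam"
```
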
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -2060,7 +2060,7 @@ def spark() -> "SparkSession":
.config("spark.sql.catalog.hive.warehouse", "s3://warehouse/hive/")
.config("spark.sql.catalog.hive.s3.endpoint", "http://localhost:9000")
.config("spark.sql.catalog.hive.s3.path-style-access", "true")
.config("spark.sql.execution.arrow.pyspark.enabled", "false")
.config("spark.sql.execution.arrow.pyspark.enabled", "true")
.getOrCreate()
)

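Flipping this flag matters for the integration test below: with Arrow execution enabled, Spark's `toPandas()` converts via Arrow and preserves types faithfully enough to compare against PyIceberg's Arrow output. A minimal sketch of a session with just this setting (the app name is hypothetical, and the catalog configs from the fixture are elided):

```python
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("inspect-files-test")  # hypothetical app name
    # Arrow-backed DataFrame.toPandas(); defaults to "false" in PySpark
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)

# Spark's built-in files metadata table, used as the reference side of the test:
pdf = spark.table("default.table_metadata_files.files").toPandas()
```
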
56 changes: 49 additions & 7 deletions tests/integration/test_inspect_table.py
@@ -453,6 +453,7 @@ def test_inspect_files(
    spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
) -> None:
    identifier = "default.table_metadata_files"

    tbl = _create_table(session_catalog, identifier, properties={"format-version": format_version})

    tbl.overwrite(arrow_table_with_null)
@@ -466,6 +467,7 @@
        'content',
        'file_path',
        'file_format',
        'spec_id',
        'record_count',
        'file_size_in_bytes',
        'column_sizes',
@@ -477,10 +479,14 @@
        'key_metadata',
        'split_offsets',
        'equality_ids',
        'sort_order_id',
        'readable_metrics',
    ]

-    for file_size_in_bytes in df['file_size_in_bytes']:
-        assert isinstance(file_size_in_bytes.as_py(), int)
+    # make sure the non-nullable fields are filled
+    for int_column in ['content', 'spec_id', 'record_count', 'file_size_in_bytes']:
+        for value in df[int_column]:
+            assert isinstance(value.as_py(), int)

    for split_offsets in df['split_offsets']:
        assert isinstance(split_offsets.as_py(), list)
@@ -491,10 +497,13 @@
    for file_path in df['file_path']:
        assert file_path.as_py().startswith("s3://")

-    lhs = spark.table(f"{identifier}.files").toPandas()
-    rhs = df.to_pandas()
+    lhs = df.to_pandas()
+    rhs = spark.table(f"{identifier}.files").toPandas()
    for column in df.column_names:
        for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
            if isinstance(left, float) and math.isnan(left) and isinstance(right, float) and math.isnan(right):
                # NaN != NaN in Python
                continue
            if column in [
                'column_sizes',
                'value_counts',
@@ -504,6 +513,39 @@
                'upper_bounds',
            ]:
                # Arrow returns a list of tuples, instead of a dict
-                right = dict(right)
-
-            assert left == right, f"Difference in column {column}: {left} != {right}"
+                left = dict(left)
            elif column == 'readable_metrics':
                assert list(left.keys()) == [
                    'bool',
                    'string',
                    'string_long',
                    'int',
                    'long',
                    'float',
                    'double',
                    'timestamp',
                    'timestamptz',
                    'date',
                    'binary',
                    'fixed',
                ]
                assert left.keys() == right.asDict().keys()

                for rm_column in left.keys():
                    rm_lhs = left[rm_column]
                    rm_rhs = right[rm_column].asDict()

                    assert rm_lhs['column_size'] == rm_rhs['column_size']
                    assert rm_lhs['value_count'] == rm_rhs['value_count']
                    assert rm_lhs['null_value_count'] == rm_rhs['null_value_count']
                    assert rm_lhs['nan_value_count'] == rm_rhs['nan_value_count']

                    if rm_column == 'timestamptz':
                        # PySpark does not correctly set the timestamptz
                        rm_rhs['lower_bound'] = rm_rhs['lower_bound'].replace(tzinfo=pytz.utc)
                        rm_rhs['upper_bound'] = rm_rhs['upper_bound'].replace(tzinfo=pytz.utc)

                    assert rm_lhs['lower_bound'] == rm_rhs['lower_bound']
                    assert rm_lhs['upper_bound'] == rm_rhs['upper_bound']
            else:
                assert left == right, f"Difference in column {column}: {left} != {right}"
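One note on the NaN guard near the top of the comparison loop: IEEE 754 NaN compares unequal to itself, so a plain equality assertion would fail whenever both sides hold NaN metric values. A quick illustration:

```python
import math

left, right = float("nan"), float("nan")
assert left != right                           # NaN != NaN in Python
assert math.isnan(left) and math.isnan(right)  # hence the isnan check before comparing
```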