Merged

Changes from 1 commit
review comments
Gowthami03B committed Apr 29, 2024
commit f7aee5e0d3cb737371c16c6002d8a4c218173d2b
71 changes: 70 additions & 1 deletion mkdocs/docs/api.md
@@ -619,6 +619,7 @@ pyarrow.Table
content: int8 not null
file_path: string not null
file_format: string not null
spec_id: int32 not null
record_count: int64 not null
file_size_in_bytes: int64 not null
column_sizes: map<int32, int64>
@@ -650,10 +651,34 @@ split_offsets: list<item: int64>
child 0, item: int64
equality_ids: list<item: int32>
child 0, item: int32
sort_order_id: int32 not null
readable_metrics: struct<city: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: string, upper_bound: string> not null, lat: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double> not null, long: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double> not null>
child 0, city: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: string, upper_bound: string> not null
child 0, column_size: int64
child 1, value_count: int64
child 2, null_value_count: int64
child 3, nan_value_count: int64
child 4, lower_bound: string
child 5, upper_bound: string
child 1, lat: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double> not null
child 0, column_size: int64
child 1, value_count: int64
child 2, null_value_count: int64
child 3, nan_value_count: int64
child 4, lower_bound: double
child 5, upper_bound: double
child 2, long: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double> not null
child 0, column_size: int64
child 1, value_count: int64
child 2, null_value_count: int64
child 3, nan_value_count: int64
child 4, lower_bound: double
child 5, upper_bound: double
----
content: [[0,0]]
file_path: [["s3://warehouse/default/table_metadata_files/data/00000-0-9ea7d222-6457-467f-bad5-6fb125c9aa5f.parquet","s3://warehouse/default/table_metadata_files/data/00000-0-afa8893c-de71-4710-97c9-6b01590d0c44.parquet"]]
file_format: [["PARQUET","PARQUET"]]
spec_id: [[0,0]]
record_count: [[3,3]]
file_size_in_bytes: [[5459,5459]]
column_sizes: [[keys:[1,2,3,4,5,...,8,9,10,11,12]values:[49,78,128,94,118,...,118,118,94,78,109],keys:[1,2,3,4,5,...,8,9,10,11,12]values:[49,78,128,94,118,...,118,118,94,78,109]]]
@@ -665,7 +690,51 @@ upper_bounds:[[keys:[1,2,3,4,5,...,8,9,10,11,12]values:[00,61,616161616161616161
key_metadata: [[0100,0100]]
split_offsets:[[[],[]]]
equality_ids:[[[],[]]]

sort_order_id:[[[],[]]]
readable_metrics: [
-- is_valid: all not null
-- child 0 type: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: string, upper_bound: string>
-- is_valid: all not null
-- child 0 type: int64
[140]
-- child 1 type: int64
[4]
-- child 2 type: int64
[0]
-- child 3 type: int64
[null]
-- child 4 type: string
["Amsterdam"]
-- child 5 type: string
["San Francisco"]
-- child 1 type: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double>
-- is_valid: all not null
-- child 0 type: int64
[135]
-- child 1 type: int64
[4]
-- child 2 type: int64
[0]
-- child 3 type: int64
[null]
-- child 4 type: double
[37.773972]
-- child 5 type: double
[53.11254]
-- child 2 type: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double>
-- is_valid: all not null
-- child 0 type: int64
[135]
-- child 1 type: int64
[4]
-- child 2 type: int64
[0]
-- child 3 type: int64
[null]
-- child 4 type: double
[-122.431297]
-- child 5 type: double
[6.0989]]
```

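The listing above comes from the table-inspection API this PR extends. For orientation, a minimal call-site sketch, assuming a catalog named `default` is configured and reusing the demo table from the output above (the `snapshot_id` argument is the parameter added in this change):

```python
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")
tbl = catalog.load_table("default.table_metadata_files")

# Data files reachable from the current snapshot:
files_df = tbl.inspect.files()

# Pin the listing to a specific snapshot via the new parameter:
files_df = tbl.inspect.files(snapshot_id=tbl.current_snapshot().snapshot_id)

print(files_df.column_names)
```
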
## Add Files
54 changes: 51 additions & 3 deletions pyiceberg/table/__init__.py
@@ -3537,13 +3537,35 @@ def update_partitions_map(
            schema=table_schema,
        )

-    def files(self) -> "pa.Table":
+    def files(self, snapshot_id: Optional[int] = None) -> "pa.Table":
        import pyarrow as pa

        from pyiceberg.io.pyarrow import schema_to_pyarrow

        schema = self.tbl.metadata.schema()
        readable_metrics_struct = []

        def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType:
            pa_bound_type = schema_to_pyarrow(bound_type)
            return pa.struct([
                pa.field("column_size", pa.int64(), nullable=True),
                pa.field("value_count", pa.int64(), nullable=True),
                pa.field("null_value_count", pa.int64(), nullable=True),
                pa.field("nan_value_count", pa.int64(), nullable=True),
                pa.field("lower_bound", pa_bound_type, nullable=True),
                pa.field("upper_bound", pa_bound_type, nullable=True),
            ])

        for field in self.tbl.metadata.schema().fields:
            readable_metrics_struct.append(
                pa.field(schema.find_column_name(field.field_id), _readable_metrics_struct(field.field_type), nullable=False)
            )

        files_schema = pa.schema([
            pa.field('content', pa.int8(), nullable=False),
            pa.field('file_path', pa.string(), nullable=False),
-            pa.field('file_format', pa.string(), nullable=False),
+            pa.field('file_format', pa.dictionary(pa.int32(), pa.string()), nullable=False),
            pa.field('spec_id', pa.int32(), nullable=False),
            pa.field('record_count', pa.int64(), nullable=False),
            pa.field('file_size_in_bytes', pa.int64(), nullable=False),
            pa.field('column_sizes', pa.map_(pa.int32(), pa.int64()), nullable=True),
@@ -3555,22 +3577,46 @@ def files(self) -> "pa.Table":
            pa.field('key_metadata', pa.binary(), nullable=True),
            pa.field('split_offsets', pa.list_(pa.int64()), nullable=True),
            pa.field('equality_ids', pa.list_(pa.int32()), nullable=True),
            pa.field('sort_order_id', pa.int32(), nullable=True),
            pa.field('readable_metrics', pa.struct(readable_metrics_struct), nullable=True),
        ])

        files = []

-        snapshot = self.tbl.current_snapshot()
+        snapshot = self._get_snapshot(snapshot_id)
        if not snapshot:
            return pa.Table.from_pylist([], schema=files_schema)

        io = self.tbl.io
        for manifest_list in snapshot.manifests(io):
            for manifest_entry in manifest_list.fetch_manifest_entry(io):
                data_file = manifest_entry.data_file
                column_sizes = data_file.column_sizes or {}
                value_counts = data_file.value_counts or {}
                null_value_counts = data_file.null_value_counts or {}
                nan_value_counts = data_file.nan_value_counts or {}
                lower_bounds = data_file.lower_bounds or {}
                upper_bounds = data_file.upper_bounds or {}
                readable_metrics = {
                    schema.find_column_name(field.field_id): {
                        "column_size": column_sizes.get(field.field_id),
                        "value_count": value_counts.get(field.field_id),
                        "null_value_count": null_value_counts.get(field.field_id),
                        "nan_value_count": nan_value_counts.get(field.field_id),
                        "lower_bound": from_bytes(field.field_type, lower_bound)
                        if (lower_bound := lower_bounds.get(field.field_id))
                        else None,
                        "upper_bound": from_bytes(field.field_type, upper_bound)
                        if (upper_bound := upper_bounds.get(field.field_id))
                        else None,
                    }
                    for field in self.tbl.metadata.schema().fields
                }
                files.append({
                    'content': data_file.content,
                    'file_path': data_file.file_path,
                    'file_format': data_file.file_format,
                    'spec_id': data_file.spec_id,
                    'record_count': data_file.record_count,
                    'file_size_in_bytes': data_file.file_size_in_bytes,
                    'column_sizes': dict(data_file.column_sizes),
@@ -3582,6 +3628,8 @@ def files(self) -> "pa.Table":
                    'key_metadata': data_file.key_metadata,
                    'split_offsets': data_file.split_offsets,
                    'equality_ids': data_file.equality_ids,
                    'sort_order_id': data_file.sort_order_id,
                    'readable_metrics': readable_metrics,
                })

        return pa.Table.from_pylist(
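For context on the `readable_metrics` construction above: the lower and upper bounds arrive as raw bytes from the manifest and are decoded with `pyiceberg.conversions.from_bytes` according to each field's Iceberg type. A self-contained sketch of that conversion (the byte payloads are constructed inline for illustration, not read from a real manifest):

```python
import struct

from pyiceberg.conversions import from_bytes
from pyiceberg.types import DoubleType, StringType

# Iceberg's single-value serialization stores numeric bounds little-endian,
# so a double bound round-trips through struct.pack("<d", ...).
payload = struct.pack("<d", 52.371807)
assert from_bytes(DoubleType(), payload) == 52.371807

# String bounds are UTF-8 bytes and decode back to str.
assert from_bytes(StringType(), b"Amsterdam") == "Amsterdam"
```
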
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -2060,7 +2060,7 @@ def spark() -> "SparkSession":
.config("spark.sql.catalog.hive.warehouse", "s3://warehouse/hive/")
.config("spark.sql.catalog.hive.s3.endpoint", "http://localhost:9000")
.config("spark.sql.catalog.hive.s3.path-style-access", "true")
.config("spark.sql.execution.arrow.pyspark.enabled", "false")
.config("spark.sql.execution.arrow.pyspark.enabled", "true")
.getOrCreate()
)

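Flipping this flag matters for the integration test below: with Arrow execution enabled, Spark's `toPandas()` converts via Arrow and preserves types faithfully enough to compare against PyIceberg's Arrow output. A minimal sketch of a session with just this setting (the app name is hypothetical, and the catalog configs from the fixture are elided):

```python
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("inspect-files-test")  # hypothetical app name
    # Arrow-backed DataFrame.toPandas(); defaults to "false" in PySpark
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)

# Spark's built-in files metadata table, used as the reference side of the test:
pdf = spark.table("default.table_metadata_files.files").toPandas()
```
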
56 changes: 49 additions & 7 deletions tests/integration/test_inspect_table.py
@@ -453,6 +453,7 @@ def test_inspect_files(
    spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
) -> None:
    identifier = "default.table_metadata_files"

    tbl = _create_table(session_catalog, identifier, properties={"format-version": format_version})

    tbl.overwrite(arrow_table_with_null)
@@ -466,6 +467,7 @@
        'content',
        'file_path',
        'file_format',
        'spec_id',
        'record_count',
        'file_size_in_bytes',
        'column_sizes',
@@ -477,10 +479,14 @@
        'key_metadata',
        'split_offsets',
        'equality_ids',
        'sort_order_id',
        'readable_metrics',
    ]

-    for file_size_in_bytes in df['file_size_in_bytes']:
-        assert isinstance(file_size_in_bytes.as_py(), int)
+    # make sure the non-nullable fields are filled
+    for int_column in ['content', 'spec_id', 'record_count', 'file_size_in_bytes']:
+        for value in df[int_column]:
+            assert isinstance(value.as_py(), int)

    for split_offsets in df['split_offsets']:
        assert isinstance(split_offsets.as_py(), list)
@@ -491,10 +497,13 @@
    for file_path in df['file_path']:
        assert file_path.as_py().startswith("s3://")

-    lhs = spark.table(f"{identifier}.files").toPandas()
-    rhs = df.to_pandas()
+    lhs = df.to_pandas()
+    rhs = spark.table(f"{identifier}.files").toPandas()
    for column in df.column_names:
        for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
            if isinstance(left, float) and math.isnan(left) and isinstance(right, float) and math.isnan(right):
                # NaN != NaN in Python
                continue
            if column in [
                'column_sizes',
                'value_counts',
@@ -504,6 +513,39 @@
                'upper_bounds',
            ]:
                # Arrow returns a list of tuples, instead of a dict
-                right = dict(right)
-
-            assert left == right, f"Difference in column {column}: {left} != {right}"
+                left = dict(left)
            elif column == 'readable_metrics':
                assert list(left.keys()) == [
                    'bool',
                    'string',
                    'string_long',
                    'int',
                    'long',
                    'float',
                    'double',
                    'timestamp',
                    'timestamptz',
                    'date',
                    'binary',
                    'fixed',
                ]
                assert left.keys() == right.asDict().keys()

                for rm_column in left.keys():
                    rm_lhs = left[rm_column]
                    rm_rhs = right[rm_column].asDict()

                    assert rm_lhs['column_size'] == rm_rhs['column_size']
                    assert rm_lhs['value_count'] == rm_rhs['value_count']
                    assert rm_lhs['null_value_count'] == rm_rhs['null_value_count']
                    assert rm_lhs['nan_value_count'] == rm_rhs['nan_value_count']

                    if rm_column == 'timestamptz':
                        # PySpark does not correctly set the timestamptz
                        rm_rhs['lower_bound'] = rm_rhs['lower_bound'].replace(tzinfo=pytz.utc)
                        rm_rhs['upper_bound'] = rm_rhs['upper_bound'].replace(tzinfo=pytz.utc)

                    assert rm_lhs['lower_bound'] == rm_rhs['lower_bound']
                    assert rm_lhs['upper_bound'] == rm_rhs['upper_bound']
            else:
                assert left == right, f"Difference in column {column}: {left} != {right}"
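One note on the NaN guard near the top of the comparison loop: IEEE 754 NaN compares unequal to itself, so a plain equality assertion would fail whenever both sides hold NaN metric values. A quick illustration:

```python
import math

left, right = float("nan"), float("nan")
assert left != right                           # NaN != NaN in Python
assert math.isnan(left) and math.isnan(right)  # hence the isnan check before comparing
```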