62 changes: 62 additions & 0 deletions mkdocs/docs/api.md
@@ -606,6 +606,68 @@ min_snapshots_to_keep: [[null,10]]
max_snapshot_age_in_ms: [[null,604800000]]
```

### Files

Inspect the data files in the current snapshot of the table:

```python
table.inspect.files()
```

```
pyarrow.Table
content: int8 not null
file_path: string not null
file_format: string not null
record_count: int64 not null
file_size_in_bytes: int64 not null
column_sizes: map<int32, int64>
child 0, entries: struct<key: int32 not null, value: int64> not null
child 0, key: int32 not null
child 1, value: int64
value_counts: map<int32, int64>
child 0, entries: struct<key: int32 not null, value: int64> not null
child 0, key: int32 not null
child 1, value: int64
null_value_counts: map<int32, int64>
child 0, entries: struct<key: int32 not null, value: int64> not null
child 0, key: int32 not null
child 1, value: int64
nan_value_counts: map<int32, int64>
child 0, entries: struct<key: int32 not null, value: int64> not null
child 0, key: int32 not null
child 1, value: int64
lower_bounds: map<int32, binary>
child 0, entries: struct<key: int32 not null, value: binary> not null
child 0, key: int32 not null
child 1, value: binary
upper_bounds: map<int32, binary>
child 0, entries: struct<key: int32 not null, value: binary> not null
child 0, key: int32 not null
child 1, value: binary
key_metadata: binary
split_offsets: list<item: int64>
child 0, item: int64
equality_ids: list<item: int32>
child 0, item: int32
----
content: [[0,0]]
file_path: [["s3://warehouse/default/table_metadata_files/data/00000-0-9ea7d222-6457-467f-bad5-6fb125c9aa5f.parquet","s3://warehouse/default/table_metadata_files/data/00000-0-afa8893c-de71-4710-97c9-6b01590d0c44.parquet"]]
file_format: [["PARQUET","PARQUET"]]
record_count: [[3,3]]
file_size_in_bytes: [[5459,5459]]
column_sizes: [[keys:[1,2,3,4,5,...,8,9,10,11,12]values:[49,78,128,94,118,...,118,118,94,78,109],keys:[1,2,3,4,5,...,8,9,10,11,12]values:[49,78,128,94,118,...,118,118,94,78,109]]]
value_counts: [[keys:[1,2,3,4,5,...,8,9,10,11,12]values:[3,3,3,3,3,...,3,3,3,3,3],keys:[1,2,3,4,5,...,8,9,10,11,12]values:[3,3,3,3,3,...,3,3,3,3,3]]]
null_value_counts: [[keys:[1,2,3,4,5,...,8,9,10,11,12]values:[1,1,1,1,1,...,1,1,1,1,1],keys:[1,2,3,4,5,...,8,9,10,11,12]values:[1,1,1,1,1,...,1,1,1,1,1]]]
nan_value_counts: [[keys:[]values:[],keys:[]values:[]]]
lower_bounds: [[keys:[1,2,3,4,5,...,8,9,10,11,12]values:[00,61,61616161616161616161616161616161,01000000,0100000000000000,...,009B6ACA38F10500,009B6ACA38F10500,9E4B0000,01,00000000000000000000000000000000],keys:[1,2,3,4,5,...,8,9,10,11,12]values:[00,61,61616161616161616161616161616161,01000000,0100000000000000,...,009B6ACA38F10500,009B6ACA38F10500,9E4B0000,01,00000000000000000000000000000000]]]
upper_bounds: [[keys:[1,2,3,4,5,...,8,9,10,11,12]values:[00,61,61616161616161616161616161616161,01000000,0100000000000000,...,009B6ACA38F10500,009B6ACA38F10500,9E4B0000,01,00000000000000000000000000000000],keys:[1,2,3,4,5,...,8,9,10,11,12]values:[00,61,61616161616161616161616161616161,01000000,0100000000000000,...,009B6ACA38F10500,009B6ACA38F10500,9E4B0000,01,00000000000000000000000000000000]]]
key_metadata: [[0100,0100]]
split_offsets: [[[],[]]]
equality_ids: [[[],[]]]

```
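
Since `inspect.files()` returns a regular `pyarrow.Table`, the result can be post-processed with standard Arrow tooling. As a small illustrative sketch (the aggregation below is only an example, not part of the inspect API), the total size of the data files in the current snapshot could be computed like this:

```python
import pyarrow.compute as pc

files = table.inspect.files()

# Sum the per-file sizes reported by the files metadata table
total_bytes = pc.sum(files["file_size_in_bytes"]).as_py()
print(f"{files.num_rows} data files, {total_bytes} bytes in total")
```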

## Add Files

Expert Iceberg users may choose to commit existing parquet files to the Iceberg table as data files, without rewriting them.
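
For reference, a minimal sketch of that workflow is shown below; the catalog name, table identifier, and file paths are placeholders, and the exact `add_files` signature should be checked against the PyIceberg version in use:

```python
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")
tbl = catalog.load_table("default.my_table")

# Register Parquet files that already exist in object storage as data files
# of the table, without rewriting their contents.
tbl.add_files(file_paths=["s3://warehouse/default/existing/data-file-1.parquet"])
```
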
52 changes: 52 additions & 0 deletions pyiceberg/table/__init__.py
@@ -3537,6 +3537,58 @@ def update_partitions_map(
schema=table_schema,
)

    def files(self) -> "pa.Table":
        import pyarrow as pa

        # Mirrors the fields of the Iceberg DataFile struct
        files_schema = pa.schema([
            pa.field('content', pa.int8(), nullable=False),
            pa.field('file_path', pa.string(), nullable=False),
            pa.field('file_format', pa.string(), nullable=False),
            pa.field('record_count', pa.int64(), nullable=False),
            pa.field('file_size_in_bytes', pa.int64(), nullable=False),
            pa.field('column_sizes', pa.map_(pa.int32(), pa.int64()), nullable=True),
            pa.field('value_counts', pa.map_(pa.int32(), pa.int64()), nullable=True),
            pa.field('null_value_counts', pa.map_(pa.int32(), pa.int64()), nullable=True),
            pa.field('nan_value_counts', pa.map_(pa.int32(), pa.int64()), nullable=True),
            pa.field('lower_bounds', pa.map_(pa.int32(), pa.binary()), nullable=True),
            pa.field('upper_bounds', pa.map_(pa.int32(), pa.binary()), nullable=True),
            pa.field('key_metadata', pa.binary(), nullable=True),
            pa.field('split_offsets', pa.list_(pa.int64()), nullable=True),
            pa.field('equality_ids', pa.list_(pa.int32()), nullable=True),
        ])

        files = []

        snapshot = self.tbl.current_snapshot()
        if not snapshot:
            # No snapshot yet: return an empty table with the expected schema
            return pa.Table.from_pylist([], schema=files_schema)

        io = self.tbl.io
        for manifest in snapshot.manifests(io):
            for manifest_entry in manifest.fetch_manifest_entry(io):
                data_file = manifest_entry.data_file
                files.append({
                    'content': data_file.content,
                    'file_path': data_file.file_path,
                    'file_format': data_file.file_format,
                    'record_count': data_file.record_count,
                    'file_size_in_bytes': data_file.file_size_in_bytes,
                    # Per-column metrics are optional in the spec; guard against missing values
                    'column_sizes': dict(data_file.column_sizes) if data_file.column_sizes is not None else None,
                    'value_counts': dict(data_file.value_counts) if data_file.value_counts is not None else None,
                    'null_value_counts': dict(data_file.null_value_counts) if data_file.null_value_counts is not None else None,
                    'nan_value_counts': dict(data_file.nan_value_counts) if data_file.nan_value_counts is not None else None,
                    'lower_bounds': dict(data_file.lower_bounds) if data_file.lower_bounds is not None else None,
                    'upper_bounds': dict(data_file.upper_bounds) if data_file.upper_bounds is not None else None,
                    'key_metadata': data_file.key_metadata,
                    'split_offsets': data_file.split_offsets,
                    'equality_ids': data_file.equality_ids,
                })

        return pa.Table.from_pylist(
            files,
            schema=files_schema,
        )


@dataclass(frozen=True)
class TablePartition:
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -2060,7 +2060,7 @@ def spark() -> "SparkSession":
.config("spark.sql.catalog.hive.warehouse", "s3://warehouse/hive/")
.config("spark.sql.catalog.hive.s3.endpoint", "http://localhost:9000")
.config("spark.sql.catalog.hive.s3.path-style-access", "true")
.config("spark.sql.execution.arrow.pyspark.enabled", "true")
.config("spark.sql.execution.arrow.pyspark.enabled", "false")
.getOrCreate()
)

62 changes: 62 additions & 0 deletions tests/integration/test_inspect_table.py
@@ -445,3 +445,65 @@ def check_pyiceberg_df_equals_spark_df(df: pa.Table, spark_df: DataFrame) -> None:
df = tbl.inspect.partitions(snapshot_id=snapshot.snapshot_id)
spark_df = spark.sql(f"SELECT * FROM {identifier}.partitions VERSION AS OF {snapshot.snapshot_id}")
check_pyiceberg_df_equals_spark_df(df, spark_df)


@pytest.mark.integration
@pytest.mark.parametrize("format_version", [1, 2])
def test_inspect_files(
    spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int
) -> None:
    identifier = "default.table_metadata_files"
    tbl = _create_table(session_catalog, identifier, properties={"format-version": format_version})

    tbl.overwrite(arrow_table_with_null)

    # append more data
    tbl.append(arrow_table_with_null)

    df = tbl.refresh().inspect.files()

    assert df.column_names == [
        'content',
        'file_path',
        'file_format',
        'record_count',
        'file_size_in_bytes',
        'column_sizes',
        'value_counts',
        'null_value_counts',
        'nan_value_counts',
        'lower_bounds',
        'upper_bounds',
        'key_metadata',
        'split_offsets',
        'equality_ids',
    ]

    for file_size_in_bytes in df['file_size_in_bytes']:
        assert isinstance(file_size_in_bytes.as_py(), int)

    for split_offsets in df['split_offsets']:
        assert isinstance(split_offsets.as_py(), list)

    for file_format in df['file_format']:
        assert file_format.as_py() == "PARQUET"

    for file_path in df['file_path']:
        assert file_path.as_py().startswith("s3://")

    lhs = spark.table(f"{identifier}.files").toPandas()
    rhs = df.to_pandas()
    for column in df.column_names:
        for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
            if column in [
                'column_sizes',
                'value_counts',
                'null_value_counts',
                'nan_value_counts',
                'lower_bounds',
                'upper_bounds',
            ]:
                # Arrow returns a list of tuples, instead of a dict
                right = dict(right)

            assert left == right, f"Difference in column {column}: {left} != {right}"