Merged
Changes from 1 commit
Commits
59 commits
fccb74b
test
mattmartin14 Jan 14, 2025
7298589
unit testing
mattmartin14 Jan 14, 2025
25bc9cf
adding unit tests
mattmartin14 Jan 14, 2025
af6c868
adding unit tests
mattmartin14 Jan 14, 2025
94be807
adding unit tests
mattmartin14 Jan 14, 2025
269d9f5
adding unit tests
mattmartin14 Jan 15, 2025
f44c61a
adding unit tests
mattmartin14 Jan 15, 2025
a96fdf9
finished unit tests
mattmartin14 Jan 16, 2025
fa5ab35
removed unnecesary return
mattmartin14 Jan 16, 2025
cfa2277
updated poetry manifest list for datafusion package dependency
mattmartin14 Jan 17, 2025
35f29be
added license headers, cleaned up dead code
mattmartin14 Jan 22, 2025
6c68d0d
updated the merge function to use bools for matched and not matched rows
mattmartin14 Jan 29, 2025
2d1e8ae
incorporated changes for boolExpression. It simplified the filters a lot
mattmartin14 Jan 31, 2025
f988f25
moved the filter build function to a separate function to accomodate …
mattmartin14 Jan 31, 2025
43393b4
removed unneccessary comment
mattmartin14 Jan 31, 2025
9a561b4
removed test files
mattmartin14 Jan 31, 2025
9ef39a6
bug fixes and removed some more dependency on datafusion
mattmartin14 Feb 3, 2025
2ba1ed6
updated various items including adding a dataclass return result
mattmartin14 Feb 4, 2025
a42eecd
updated merge_rows to remove dependency from datafusion! wahoo
mattmartin14 Feb 4, 2025
1305f58
renamed merge_rows to upsert, removed unnecessary code. will put in f…
mattmartin14 Feb 5, 2025
b2be3db
adding params to unit testing for pytest; having some errors
mattmartin14 Feb 5, 2025
f5688ad
fixed bugs on unit testing; added context wrapper for txn; fixed vari…
mattmartin14 Feb 5, 2025
7d55a4e
bug fixes
mattmartin14 Feb 6, 2025
2e14767
updated some error throwing items
mattmartin14 Feb 6, 2025
85c5848
moved datafusion to just a dev dependency in poetry toml
mattmartin14 Feb 6, 2025
6472071
updated UpsertRow class to be recognized in the return statement
mattmartin14 Feb 6, 2025
51c34da
removed some spaces and streamlined assert statements in unit testing
mattmartin14 Feb 6, 2025
862a69a
updated test cases to use an InMemory catalog
mattmartin14 Feb 7, 2025
3731b86
updated some formatting; added more commentary on the rows_to_update …
mattmartin14 Feb 7, 2025
bbb35d6
rebased poetry lock file and pyproject.toml file; removed sf repo info
mattmartin14 Feb 10, 2025
c8189c9
Merge branch 'main' into main
mattmartin14 Feb 10, 2025
02af4d4
updated equality checks with not instead of == false
mattmartin14 Feb 10, 2025
cc75192
ran ruff check --fix
mattmartin14 Feb 10, 2025
998d98b
manually added lint fixes and updated poetry toml and lock files. tha…
mattmartin14 Feb 11, 2025
513c839
added formatting fices
mattmartin14 Feb 11, 2025
0fd6446
remove the node_modules
mattmartin14 Feb 11, 2025
5fc3478
updated code for another round of fixes
mattmartin14 Feb 11, 2025
6cef789
removed npm uneeded files
mattmartin14 Feb 11, 2025
40b69b8
fixed formatting on upsert function for docs build
mattmartin14 Feb 12, 2025
804c526
Merge branch 'main' into main
mattmartin14 Feb 12, 2025
09e0347
rebased for poetry lock files
mattmartin14 Feb 12, 2025
ca2d904
updated lock files. thanks kevin
mattmartin14 Feb 12, 2025
77375fb
fixed other changes
mattmartin14 Feb 12, 2025
ba4db49
fixed gitignore file
mattmartin14 Feb 12, 2025
622e66c
no whitespace
mattmartin14 Feb 12, 2025
9e79dad
fixed vendor fb file from kevins changes
mattmartin14 Feb 12, 2025
4cbf3e3
reverting vendor changes
mattmartin14 Feb 12, 2025
5333a1e
removing node modules
mattmartin14 Feb 12, 2025
11a25be
updating vendor files
mattmartin14 Feb 12, 2025
03a8d10
Update vendor/fb303/FacebookService.py
mattmartin14 Feb 12, 2025
8a2143c
updated vendor files
mattmartin14 Feb 12, 2025
e719cf8
updated vendor files
mattmartin14 Feb 12, 2025
245b4a9
attempting to update poetry files
mattmartin14 Feb 12, 2025
e3e9611
Merge branch 'main' into main
mattmartin14 Feb 12, 2025
e575b3c
restore vendor/
kevinjqliu Feb 13, 2025
e4e530f
resetore pyproject.toml
kevinjqliu Feb 13, 2025
2ff2083
poetry lock
kevinjqliu Feb 13, 2025
8585d2d
add datafusion to tool.mypy.overrides
kevinjqliu Feb 13, 2025
f673b70
Merge remote-tracking branch 'apache/main' into StateFarmIns/main
kevinjqliu Feb 13, 2025
updated various items including adding a dataclass return result
mattmartin14 committed Feb 4, 2025
commit 2ba1ed68ef772d6d1ba4705b1291200de9a33c1e
16 changes: 13 additions & 3 deletions pyiceberg/table/__init__.py
@@ -137,6 +137,8 @@
from pyiceberg.utils.config import Config
from pyiceberg.utils.properties import property_as_bool

from dataclasses import dataclass

if TYPE_CHECKING:
import daft
import pandas as pd
@@ -1065,10 +1067,18 @@ def name_mapping(self) -> Optional[NameMapping]:
"""Return the table's field-id NameMapping."""
return self.metadata.name_mapping()

@dataclass
class MergeResult:
"""docstring"""
rows_updated: int
rows_inserted: int
info_msgs: str
error_msgs: str

def merge_rows(self, df: pa.Table, join_cols: list
, when_matched_update_all: bool = True
, when_not_matched_insert_all: bool = True
) -> Dict:
) -> MergeResult:
"""
Shorthand API for performing an upsert/merge to an iceberg table.

@@ -1098,7 +1108,7 @@ def merge_rows(self, df: pa.Table, join_cols: list
target_table_name = "target"

if when_matched_update_all == False and when_not_matched_insert_all == False:
return {'rows_updated': 0, 'rows_inserted': 0, 'msg': 'no merge options selected...exiting'}
return {'rows_updated': 0, 'rows_inserted': 0, 'info_msgs': 'no merge options selected...exiting'}

missing_columns = merge_rows_util.do_join_columns_exist(df, self, join_cols)

@@ -1143,9 +1153,9 @@ def merge_rows(self, df: pa.Table, join_cols: list

txn.overwrite(update_recs, overwrite_filter=overwrite_filter)

# Insert the new records

if when_not_matched_insert_all:

insert_recs_sql = merge_rows_util.get_rows_to_insert_sql(source_table_name, target_table_name, join_cols, source_col_list, target_col_list)

insert_recs = ctx.sql(insert_recs_sql).to_arrow_table()
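For context, a minimal usage sketch of merge_rows as it stands in this commit. The catalog name, table identifier, and source data below are hypothetical, and the result is indexed dict-style because that is how the tests in this PR read it; the declared return type is the MergeResult dataclass added above.

```python
# Hypothetical usage sketch of the merge_rows API from this commit.
# Catalog name, table identifier, and source rows are made up for illustration.
import pyarrow as pa
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")          # assumes a configured catalog
tbl = catalog.load_table("db.orders")      # hypothetical target Iceberg table

source = pa.table({
    "order_id": pa.array([1, 2, 3], type=pa.int64()),
    "status": ["shipped", "pending", "new"],
})

# Update rows that match on order_id and insert the ones that don't.
res = tbl.merge_rows(source, join_cols=["order_id"])
print(res["rows_updated"], res["rows_inserted"])
```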
8 changes: 5 additions & 3 deletions pyiceberg/table/merge_rows_util.py
@@ -48,10 +48,10 @@ def get_filter_list(df: pyarrow_table, join_cols: list) -> BooleanExpression:
return pred


def get_table_column_list_pa(df: pyarrow_table) -> list:
def get_table_column_list_pa(df: pyarrow_table) -> set:
return set(col for col in df.column_names)

def get_table_column_list_iceberg(table: pyiceberg_table) -> list:
def get_table_column_list_iceberg(table: pyiceberg_table) -> set:
return set(col for col in table.schema().column_names)

def dups_check_in_source(df: pyarrow_table, join_cols: list) -> bool:
@@ -69,7 +69,6 @@ def dups_check_in_source(df: pyarrow_table, join_cols: list) -> bool:

return source_dup_count > 0


def do_join_columns_exist(source_df: pyarrow_table, target_iceberg_table: pyiceberg_table, join_cols: list) -> bool:

"""
@@ -89,6 +88,8 @@ def do_join_columns_exist(source_df: pyarrow_table, target_iceberg_table: pyiceb

return missing_columns



def get_rows_to_update_sql(source_table_name: str, target_table_name: str
, join_cols: list
, source_cols_list: set
@@ -99,6 +100,7 @@ def get_rows_to_update_sql(source_table_name: str, target_table_name: str
"""

# Determine non-join columns that exist in both tables


non_join_cols = source_cols_list.intersection(target_cols_list) - set(join_cols)

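get_filter_list returns a pyiceberg BooleanExpression that merge_rows passes as the overwrite_filter, but its body is not part of this hunk. A rough sketch of how such a predicate could be assembled from the source's join-key values, using pyiceberg's expression classes, purely to illustrate the idea:

```python
# Illustrative sketch only: OR together one key-equality predicate per source row,
# using an AND across columns when the key is composite. The actual get_filter_list
# implementation in this PR may differ.
import pyarrow as pa
from pyiceberg.expressions import And, Or, EqualTo, AlwaysFalse, BooleanExpression


def build_overwrite_filter(df: pa.Table, join_cols: list) -> BooleanExpression:
    pred: BooleanExpression = AlwaysFalse()
    for row in df.select(join_cols).to_pylist():
        row_pred: BooleanExpression = EqualTo(join_cols[0], row[join_cols[0]])
        for col in join_cols[1:]:
            row_pred = And(row_pred, EqualTo(col, row[col]))
        pred = Or(pred, row_pred)
    return pred
```

A filter of this shape lets the transaction's overwrite call delete exactly the target rows whose keys appear in the source before the updated records are written.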
49 changes: 16 additions & 33 deletions tests/table/test_merge_rows.py
@@ -85,11 +85,8 @@ def gen_source_dataset(start_row: int, end_row: int, composite_key: bool, add_du
{dup_row}
"""

#print(sql)

df = ctx.sql(sql).to_arrow_table()


return df

def gen_target_iceberg_table(start_row: int, end_row: int, composite_key: bool, ctx: SessionContext):
@@ -110,7 +107,7 @@ def gen_target_iceberg_table(start_row: int, end_row: int, composite_key: bool,

return table

def test_merge_scenario_1a_simple():
def test_merge_scenario_single_ins_upd():

"""
tests a single insert and update
@@ -130,9 +127,8 @@ def test_merge_scenario_1a_simple():
assert res['rows_inserted'] == rows_inserted_should_be, f"rows inserted should be {rows_inserted_should_be}, but got {res['rows_inserted']}"

purge_warehouse()
print('merge rows: test scenario 1a pass')

def test_merge_scenario_1b_simple():
def test_merge_scenario_skip_upd_row():

"""
tests a single insert and update; skips a row that does not need to be updated
@@ -168,10 +164,8 @@ def test_merge_scenario_1b_simple():
assert res['rows_inserted'] == rows_inserted_should_be, f"rows inserted should be {rows_inserted_should_be}, but got {res['rows_inserted']}"

purge_warehouse()
print('merge rows: test scenario 1b (skip 1 row) pass')


def test_merge_scenario_1c_simple():
def test_merge_scenario_date_as_key():

"""
tests a single insert and update; primary key is a date column
@@ -207,9 +201,8 @@ def test_merge_scenario_1c_simple():
assert res['rows_inserted'] == rows_inserted_should_be, f"rows inserted should be {rows_inserted_should_be}, but got {res['rows_inserted']}"

purge_warehouse()
print('merge rows: test scenario 1c (date as key column) pass')

def test_merge_scenario_1d_simple():
def test_merge_scenario_string_as_key():

"""
tests a single insert and update; primary key is a string column
@@ -245,9 +238,8 @@ def test_merge_scenario_1d_simple():
assert res['rows_inserted'] == rows_inserted_should_be, f"rows inserted should be {rows_inserted_should_be}, but got {res['rows_inserted']}"

purge_warehouse()
print('merge rows: test scenario 1d (string as key column) pass')

def test_merge_scenario_2_10k_rows():
def test_merge_scenario_10k_rows():

"""
tests merging 10000 rows on a single key to simulate larger workload
@@ -268,9 +260,8 @@ def test_merge_scenario_2_10k_rows():
assert res['rows_inserted'] == rows_inserted_should_be, f"rows inserted should be {rows_inserted_should_be}, but got {res['rows_inserted']}"

purge_warehouse()
print('merge rows: test scenario 2 pass')

def test_merge_scenario_3_composite_key():
def test_merge_scenario_composite_key():

"""
tests merging 200 rows with a composite key
@@ -291,7 +282,6 @@ def test_merge_scenario_3_composite_key():
assert res['rows_inserted'] == rows_inserted_should_be, f"rows inserted should be {rows_inserted_should_be}, but got {res['rows_inserted']}"

purge_warehouse()
print('merge rows: composite keys test pass')

def test_merge_update_only():

@@ -313,7 +303,6 @@ def test_merge_update_only():
assert res['rows_inserted'] == rows_inserted_should_be, f"rows inserted should be {rows_inserted_should_be}, but got {res['rows_inserted']}"

purge_warehouse()
print('merge rows: update only pass')

def test_merge_insert_only():
"""
@@ -334,7 +323,6 @@ def test_merge_insert_only():
assert res['rows_inserted'] == rows_inserted_should_be, f"rows inserted should be {rows_inserted_should_be}, but got {res['rows_inserted']}"

purge_warehouse()
print('merge rows: insert only pass')

def test_merge_source_dups():

@@ -354,7 +342,6 @@ def test_merge_source_dups():
assert 'Duplicate rows found in source dataset' in error_msgs, f"error message should contain 'Duplicate rows found in source dataset', but got {error_msgs}"

purge_warehouse()
print('merge rows: source dups test pass')

def test_key_cols_misaligned():

@@ -381,17 +368,13 @@ def test_key_cols_misaligned():

purge_warehouse()

print('merge rows: key cols misaligned test pass')

if __name__ == "__main__":

test_merge_scenario_1a_simple()
test_merge_scenario_1b_simple()
test_merge_scenario_1c_simple()
test_merge_scenario_1d_simple()
test_merge_scenario_2_10k_rows()
test_merge_scenario_3_composite_key()
test_merge_update_only()
test_merge_insert_only()
test_merge_source_dups()
test_key_cols_misaligned()
test_merge_scenario_single_ins_upd
test_merge_scenario_skip_upd_row
test_merge_scenario_date_as_key
test_merge_scenario_string_as_key
test_merge_scenario_10k_rows()
test_merge_scenario_composite_key()
test_merge_update_only()
test_merge_insert_only()
test_merge_source_dups()
test_key_cols_misaligned()
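As committed, the first four calls in the renamed __main__ block above drop their parentheses, and the assertions still index the result like a dict even though the declared return type is now MergeResult. A hedged sketch of what the block could look like once the dataclass is returned on every code path:

```python
# Hypothetical tidy-up, assuming merge_rows eventually returns MergeResult
# on every code path: invoke each test and use attribute access on the result.
if __name__ == "__main__":
    test_merge_scenario_single_ins_upd()
    test_merge_scenario_skip_upd_row()
    test_merge_scenario_date_as_key()
    test_merge_scenario_string_as_key()
    test_merge_scenario_10k_rows()
    test_merge_scenario_composite_key()
    test_merge_update_only()
    test_merge_insert_only()
    test_merge_source_dups()
    test_key_cols_misaligned()

# Inside a test, attribute access would then replace dict indexing, e.g.:
#   res = table.merge_rows(source_df, join_cols=["order_id"])  # hypothetical key column
#   assert res.rows_updated == rows_updated_should_be
#   assert res.rows_inserted == rows_inserted_should_be
```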