Merged
Changes from 1 commit (of 59 commits)

Commits
fccb74b
test
mattmartin14 Jan 14, 2025
7298589
unit testing
mattmartin14 Jan 14, 2025
25bc9cf
adding unit tests
mattmartin14 Jan 14, 2025
af6c868
adding unit tests
mattmartin14 Jan 14, 2025
94be807
adding unit tests
mattmartin14 Jan 14, 2025
269d9f5
adding unit tests
mattmartin14 Jan 15, 2025
f44c61a
adding unit tests
mattmartin14 Jan 15, 2025
a96fdf9
finished unit tests
mattmartin14 Jan 16, 2025
fa5ab35
removed unnecesary return
mattmartin14 Jan 16, 2025
cfa2277
updated poetry manifest list for datafusion package dependency
mattmartin14 Jan 17, 2025
35f29be
added license headers, cleaned up dead code
mattmartin14 Jan 22, 2025
6c68d0d
updated the merge function to use bools for matched and not matched rows
mattmartin14 Jan 29, 2025
2d1e8ae
incorporated changes for boolExpression. It simplified the filters a lot
mattmartin14 Jan 31, 2025
f988f25
moved the filter build function to a separate function to accomodate …
mattmartin14 Jan 31, 2025
43393b4
removed unneccessary comment
mattmartin14 Jan 31, 2025
9a561b4
removed test files
mattmartin14 Jan 31, 2025
9ef39a6
bug fixes and removed some more dependency on datafusion
mattmartin14 Feb 3, 2025
2ba1ed6
updated various items including adding a dataclass return result
mattmartin14 Feb 4, 2025
a42eecd
updated merge_rows to remove dependency from datafusion! wahoo
mattmartin14 Feb 4, 2025
1305f58
renamed merge_rows to upsert, removed unnecessary code. will put in f…
mattmartin14 Feb 5, 2025
b2be3db
adding params to unit testing for pytest; having some errors
mattmartin14 Feb 5, 2025
f5688ad
fixed bugs on unit testing; added context wrapper for txn; fixed vari…
mattmartin14 Feb 5, 2025
7d55a4e
bug fixes
mattmartin14 Feb 6, 2025
2e14767
updated some error throwing items
mattmartin14 Feb 6, 2025
85c5848
moved datafusion to just a dev dependency in poetry toml
mattmartin14 Feb 6, 2025
6472071
updated UpsertRow class to be recognized in the return statement
mattmartin14 Feb 6, 2025
51c34da
removed some spaces and streamlined assert statements in unit testing
mattmartin14 Feb 6, 2025
862a69a
updated test cases to use an InMemory catalog
mattmartin14 Feb 7, 2025
3731b86
updated some formatting; added more commentary on the rows_to_update …
mattmartin14 Feb 7, 2025
bbb35d6
rebased poetry lock file and pyproject.toml file; removed sf repo info
mattmartin14 Feb 10, 2025
c8189c9
Merge branch 'main' into main
mattmartin14 Feb 10, 2025
02af4d4
updated equality checks with not instead of == false
mattmartin14 Feb 10, 2025
cc75192
ran ruff check --fix
mattmartin14 Feb 10, 2025
998d98b
manually added lint fixes and updated poetry toml and lock files. tha…
mattmartin14 Feb 11, 2025
513c839
added formatting fices
mattmartin14 Feb 11, 2025
0fd6446
remove the node_modules
mattmartin14 Feb 11, 2025
5fc3478
updated code for another round of fixes
mattmartin14 Feb 11, 2025
6cef789
removed npm uneeded files
mattmartin14 Feb 11, 2025
40b69b8
fixed formatting on upsert function for docs build
mattmartin14 Feb 12, 2025
804c526
Merge branch 'main' into main
mattmartin14 Feb 12, 2025
09e0347
rebased for poetry lock files
mattmartin14 Feb 12, 2025
ca2d904
updated lock files. thanks kevin
mattmartin14 Feb 12, 2025
77375fb
fixed other changes
mattmartin14 Feb 12, 2025
ba4db49
fixed gitignore file
mattmartin14 Feb 12, 2025
622e66c
no whitespace
mattmartin14 Feb 12, 2025
9e79dad
fixed vendor fb file from kevins changes
mattmartin14 Feb 12, 2025
4cbf3e3
reverting vendor changes
mattmartin14 Feb 12, 2025
5333a1e
removing node modules
mattmartin14 Feb 12, 2025
11a25be
updating vendor files
mattmartin14 Feb 12, 2025
03a8d10
Update vendor/fb303/FacebookService.py
mattmartin14 Feb 12, 2025
8a2143c
updated vendor files
mattmartin14 Feb 12, 2025
e719cf8
updated vendor files
mattmartin14 Feb 12, 2025
245b4a9
attempting to update poetry files
mattmartin14 Feb 12, 2025
e3e9611
Merge branch 'main' into main
mattmartin14 Feb 12, 2025
e575b3c
restore vendor/
kevinjqliu Feb 13, 2025
e4e530f
resetore pyproject.toml
kevinjqliu Feb 13, 2025
2ff2083
poetry lock
kevinjqliu Feb 13, 2025
8585d2d
add datafusion to tool.mypy.overrides
kevinjqliu Feb 13, 2025
f673b70
Merge remote-tracking branch 'apache/main' into StateFarmIns/main
kevinjqliu Feb 13, 2025
bug fixes
mattmartin14 committed Feb 6, 2025
commit 7d55a4eb988addff6fc51738fe1b6cd53434e2c3
56 changes: 37 additions & 19 deletions pyiceberg/table/__init__.py
@@ -52,7 +52,6 @@
     IsNull,
     Or,
     Reference,
-    In,
 )
 from pyiceberg.expressions.visitors import (
     _InclusiveMetricsEvaluator,
@@ -1072,8 +1071,6 @@ class UpsertResult:
     """Summary the upsert operation"""
     rows_updated: int = 0
     rows_inserted: int = 0
-    info_msgs: Optional[str] = None
-    error_msgs: Optional[str] = None
 
 def upsert(self, df: pa.Table, join_cols: list
     , when_matched_update_all: bool = True
@@ -1083,27 +1080,46 @@ def upsert(self, df: pa.Table, join_cols: list
     Shorthand API for performing an upsert to an iceberg table.
 
     Args:
+        self: the target Iceberg table to execute the upsert on
         df: The input dataframe to upsert with the table's data.
-        join_cols: The columns to join on.
+        join_cols: The columns to join on. These are essentially analogous to primary keys
+        when_matched_update_all: Bool indicating to update rows that are matched but require an update due to a value in a non-key column changing
+        when_not_matched_insert_all: Bool indicating new rows to be inserted that do not match any existing rows in the table
 
-    Returns: a UpsertResult class
+    Example Use Cases:
+        Case 1: Both Parameters = True (Full Upsert)
+            Existing row found → Update it
+            New row found → Insert it
+
+        Case 2: when_matched_update_all = False, when_not_matched_insert_all = True
+            Existing row found → Do nothing (no updates)
+            New row found → Insert it
+
+        Case 3: when_matched_update_all = True, when_not_matched_insert_all = False
+            Existing row found → Update it
+            New row found → Do nothing (no inserts)
+
+        Case 4: Both Parameters = False (No Merge Effect)
+            Existing row found → Do nothing
+            New row found → Do nothing
+            (Function effectively does nothing)
+
+    Returns: a UpsertResult class (contains details of rows updated and inserted)
     """
 
     from pyiceberg.table import upsert_util
 
     if when_matched_update_all == False and when_not_matched_insert_all == False:
-        return {'rows_updated': 0, 'rows_inserted': 0, 'info_msgs': 'no upsert options selected...exiting'}
-        #return UpsertResult(info_msgs='no upsert options selected...exiting')
+        raise Exception('no upsert options selected...exiting')
 
-    if upsert_util.dups_check_in_source(df, join_cols):
+    if upsert_util.has_duplicate_rows(df, join_cols):
 
-        return {'error_msgs': 'Duplicate rows found in source dataset based on the key columns. No upsert executed'}
+        raise Exception('Duplicate rows found in source dataset based on the key columns. No upsert executed')
 
     #get list of rows that exist so we don't have to load the entire target table
-    pred = upsert_util.get_filter_list(df, join_cols)
-    iceberg_table_trimmed = self.scan(row_filter=pred).to_arrow()
+    matched_predicate = upsert_util.create_match_filter(df, join_cols)
+    matched_iceberg_table = self.scan(row_filter=matched_predicate).to_arrow()
 
     update_row_cnt = 0
     insert_row_cnt = 0
@@ -1113,23 +1129,25 @@ def upsert(self, df: pa.Table, join_cols: list
     with self.transaction() as txn:
 
         if when_matched_update_all:
 
-            update_recs = upsert_util.get_rows_to_update(df, iceberg_table_trimmed, join_cols)
+            #function get_rows_to_update is doing a check on non-key columns to see if any of the values have actually changed
+            rows_to_update = upsert_util.get_rows_to_update(df, matched_iceberg_table, join_cols)
 
-            update_row_cnt = len(update_recs)
+            update_row_cnt = len(rows_to_update)
 
-            overwrite_filter = upsert_util.get_filter_list(update_recs, join_cols)
+            #build the match predicate filter
+            overwrite_mask_predicate = upsert_util.create_match_filter(rows_to_update, join_cols)
 
-            txn.overwrite(update_recs, overwrite_filter=overwrite_filter)
+            txn.overwrite(rows_to_update, overwrite_filter=overwrite_mask_predicate)
 
         if when_not_matched_insert_all:
 
-            insert_recs = upsert_util.get_rows_to_insert(df, iceberg_table_trimmed, join_cols)
+            rows_to_insert = upsert_util.get_rows_to_insert(df, matched_iceberg_table, join_cols)
 
-            insert_row_cnt = len(insert_recs)
+            insert_row_cnt = len(rows_to_insert)
 
-            txn.append(insert_recs)
+            txn.append(rows_to_insert)
 
     return {
         "rows_updated": update_row_cnt,
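Reviewer note, not part of the diff: as of this commit, upsert raises on bad input instead of smuggling messages through the result, and it still returns a plain dict (the UpsertResult return named in the docstring is wired up in a later commit). A minimal sketch of the resulting call-site behavior, assuming an already-loaded Iceberg table handle tbl keyed on order_id (tbl and the sample rows are hypothetical):

    import pyarrow as pa

    # Hypothetical source batch; `tbl` stands in for a loaded Iceberg table.
    source = pa.table({"order_id": [1, 2], "order_type": ["A", "B"]})

    # Case 1 from the docstring: full upsert (both flags default to True).
    res = tbl.upsert(df=source, join_cols=["order_id"])
    print(res["rows_updated"], res["rows_inserted"])  # dict keys as of this commit

    # Case 4: both flags False now raises instead of returning an info dict.
    try:
        tbl.upsert(df=source, join_cols=["order_id"],
                   when_matched_update_all=False,
                   when_not_matched_insert_all=False)
    except Exception as exc:
        print(exc)  # no upsert options selected...exiting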
14 changes: 5 additions & 9 deletions pyiceberg/table/upsert_util.py
@@ -28,26 +28,22 @@
     In,
 )
 
-def get_filter_list(df: pyarrow_table, join_cols: list) -> BooleanExpression:
+def create_match_filter(df: pyarrow_table, join_cols: list) -> BooleanExpression:
 
     unique_keys = df.select(join_cols).group_by(join_cols).aggregate([])
 
-    pred = None
-
     if len(join_cols) == 1:
-        pred = In(join_cols[0], unique_keys[0].to_pylist())
+        return In(join_cols[0], unique_keys[0].to_pylist())
     else:
-        pred = Or(*[
+        return Or(*[
             And(*[
                 EqualTo(col, row[col])
                 for col in join_cols
             ])
             for row in unique_keys.to_pylist()
         ])
-
-    return pred
 
-def dups_check_in_source(df: pyarrow_table, join_cols: list) -> bool:
+def has_duplicate_rows(df: pyarrow_table, join_cols: list) -> bool:
     """
     This function checks if there are duplicate rows in the source table based on the join columns.
     It returns True if there are duplicate rows in the source table, otherwise it returns False.
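For context (not part of the diff): create_match_filter builds its predicate from pyiceberg expression classes already imported in this module. A small illustration with hypothetical keys, showing the two shapes it can return:

    import pyarrow as pa
    from pyiceberg.expressions import And, EqualTo, In, Or

    df = pa.table({"order_id": [1, 2], "order_line_id": [10, 20]})

    # Single join column -> one In(...) over the distinct key values:
    single_key_pred = In("order_id", df["order_id"].to_pylist())

    # Composite key -> Or of one And(EqualTo(...), ...) term per source row:
    composite_key_pred = Or(
        And(EqualTo("order_id", 1), EqualTo("order_line_id", 10)),
        And(EqualTo("order_id", 2), EqualTo("order_line_id", 20)),
    )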
@@ -144,7 +140,7 @@ def get_rows_to_insert(source_table: pa.Table, target_table: pa.Table, join_cols
             source_filter_expr = expr
         else:
             source_filter_expr = source_filter_expr & expr
-
+
     non_matching_expr = ~source_filter_expr
 
     source_columns = set(source_table.column_names)
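The hunk above shows only the tail of get_rows_to_insert: one filter expression per join column is accumulated with &, then negated, so the rows kept are those with no key match in the target. A rough standalone sketch of that pattern (simplified here to per-column membership tests, which is weaker than true composite-key matching; assumes a pyarrow version whose Table.filter accepts compute expressions):

    import pyarrow as pa
    import pyarrow.compute as pc

    source = pa.table({"order_id": [1, 2, 3], "order_type": ["A", "B", "C"]})
    target = pa.table({"order_id": [2, 3], "order_type": ["B", "C"]})

    # Accumulate one membership test per join column, mirroring the loop above.
    source_filter_expr = None
    for col in ["order_id"]:
        expr = pc.field(col).isin(target[col].to_pylist())
        if source_filter_expr is None:
            source_filter_expr = expr
        else:
            source_filter_expr = source_filter_expr & expr

    # Negate: rows whose keys are absent from the target are the inserts.
    non_matching_expr = ~source_filter_expr
    print(source.filter(non_matching_expr)["order_id"].to_pylist())  # [1]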
27 changes: 9 additions & 18 deletions tests/table/test_upsert.py
@@ -77,7 +77,7 @@ def gen_source_dataset(start_row: int, end_row: int, composite_key: bool, add_du
 
     return df
 
-def gen_target_iceberg_table_v2(start_row: int, end_row: int, composite_key: bool, ctx: SessionContext, catalog: SqlCatalog, namespace: str):
+def gen_target_iceberg_table(start_row: int, end_row: int, composite_key: bool, ctx: SessionContext, catalog: SqlCatalog, namespace: str):
 
     additional_columns = ", t.order_id + 1000 as order_line_id" if composite_key else ""
 
@@ -107,7 +107,7 @@ def catalog_conn():
         },
     )
 
-    catalog.create_namespace(namespace="test_ns")
+    catalog.create_namespace(namespace=_TEST_NAMESPACE)
 
     yield catalog
 
@@ -128,7 +128,7 @@ def test_merge_rows(catalog_conn, join_cols, src_start_row, src_end_row, target_
     catalog = catalog_conn
 
     source_df = gen_source_dataset(src_start_row, src_end_row, False, False, ctx)
-    ice_table = gen_target_iceberg_table_v2(target_start_row, target_end_row, False, ctx, catalog, _TEST_NAMESPACE)
+    ice_table = gen_target_iceberg_table(target_start_row, target_end_row, False, ctx, catalog, _TEST_NAMESPACE)
     res = ice_table.upsert(df=source_df, join_cols=join_cols, when_matched_update_all=when_matched_update_all, when_not_matched_insert_all=when_not_matched_insert_all)
 
     assert res['rows_updated'] == expected_updated, f"rows updated should be {expected_updated}, but got {res['rows_updated']}"
@@ -147,7 +146,6 @@ def test_merge_scenario_skip_upd_row(catalog_conn):
     tests a single insert and update; skips a row that does not need to be updated
     """
 
-
     ctx = SessionContext()
 
     df = ctx.sql(f"""
@@ -262,7 +261,7 @@ def test_merge_scenario_composite_key(catalog_conn):
     ctx = SessionContext()
 
     catalog = catalog_conn
-    table = gen_target_iceberg_table_v2(1, 200, True, ctx, catalog, _TEST_NAMESPACE)
+    table = gen_target_iceberg_table(1, 200, True, ctx, catalog, _TEST_NAMESPACE)
     source_df = gen_source_dataset(101, 300, True, False, ctx)
 
@@ -286,14 +285,12 @@ def test_merge_source_dups(catalog_conn):
 
     catalog = catalog_conn
-    table = gen_target_iceberg_table_v2(1, 10, False, ctx, catalog, _TEST_NAMESPACE)
+    table = gen_target_iceberg_table(1, 10, False, ctx, catalog, _TEST_NAMESPACE)
     source_df = gen_source_dataset(5, 15, False, True, ctx)
 
-    res = table.upsert(df=source_df, join_cols=["order_id"])
-
-    error_msgs = res['error_msgs']
+    with pytest.raises(Exception, match="Duplicate rows found in source dataset based on the key columns. No upsert executed"):
+        table.upsert(df=source_df, join_cols=["order_id"])
 
-    assert 'Duplicate rows found in source dataset' in error_msgs, f"error message should contain 'Duplicate rows found in source dataset', but got {error_msgs}"
 
     catalog.drop_table(f"{_TEST_NAMESPACE}.target")
 
@@ -314,14 +311,8 @@ def test_key_cols_misaligned(catalog_conn):
 
     df_src = ctx.sql("select 1 as item_id, date '2021-05-01' as order_date, 'B' as order_type").to_arrow_table()
 
-    try:
-
-        res = table.upsert(df=df_src, join_cols=['order_id'])
-
-    except KeyError as e:
-        error_msgs = str(e)
-
-    assert 'Field "order_id" does not exist in schema' in error_msgs, f"""error message should contain 'Field "order_id" does not exist in schema', but got {error_msgs}"""
+    with pytest.raises(Exception, match=r"""Field ".*" does not exist in schema"""):
+        table.upsert(df=df_src, join_cols=['order_id'])
 
     catalog.drop_table(f"{_TEST_NAMESPACE}.target")
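One more reviewer note: the last two hunks swap manual try/except bookkeeping for pytest.raises, which fails the test when nothing raises and regex-matches the exception message (re.search semantics, so a substring pattern is enough, and KeyError is caught by the Exception base class). A toy standalone version of the idiom, with a stub in place of table.upsert:

    import pytest

    def upsert_stub():
        # Stand-in for an upsert call that rejects duplicate source keys.
        raise Exception("Duplicate rows found in source dataset based on the key columns. No upsert executed")

    def test_duplicate_keys_rejected():
        with pytest.raises(Exception, match="Duplicate rows found in source dataset"):
            upsert_stub()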