Merged
Changes from 1 commit (of 59 commits)

Commits
fccb74b
test
mattmartin14 Jan 14, 2025
7298589
unit testing
mattmartin14 Jan 14, 2025
25bc9cf
adding unit tests
mattmartin14 Jan 14, 2025
af6c868
adding unit tests
mattmartin14 Jan 14, 2025
94be807
adding unit tests
mattmartin14 Jan 14, 2025
269d9f5
adding unit tests
mattmartin14 Jan 15, 2025
f44c61a
adding unit tests
mattmartin14 Jan 15, 2025
a96fdf9
finished unit tests
mattmartin14 Jan 16, 2025
fa5ab35
removed unnecesary return
mattmartin14 Jan 16, 2025
cfa2277
updated poetry manifest list for datafusion package dependency
mattmartin14 Jan 17, 2025
35f29be
added license headers, cleaned up dead code
mattmartin14 Jan 22, 2025
6c68d0d
updated the merge function to use bools for matched and not matched rows
mattmartin14 Jan 29, 2025
2d1e8ae
incorporated changes for boolExpression. It simplified the filters a lot
mattmartin14 Jan 31, 2025
f988f25
moved the filter build function to a separate function to accomodate …
mattmartin14 Jan 31, 2025
43393b4
removed unneccessary comment
mattmartin14 Jan 31, 2025
9a561b4
removed test files
mattmartin14 Jan 31, 2025
9ef39a6
bug fixes and removed some more dependency on datafusion
mattmartin14 Feb 3, 2025
2ba1ed6
updated various items including adding a dataclass return result
mattmartin14 Feb 4, 2025
a42eecd
updated merge_rows to remove dependency from datafusion! wahoo
mattmartin14 Feb 4, 2025
1305f58
renamed merge_rows to upsert, removed unnecessary code. will put in f…
mattmartin14 Feb 5, 2025
b2be3db
adding params to unit testing for pytest; having some errors
mattmartin14 Feb 5, 2025
f5688ad
fixed bugs on unit testing; added context wrapper for txn; fixed vari…
mattmartin14 Feb 5, 2025
7d55a4e
bug fixes
mattmartin14 Feb 6, 2025
2e14767
updated some error throwing items
mattmartin14 Feb 6, 2025
85c5848
moved datafusion to just a dev dependency in poetry toml
mattmartin14 Feb 6, 2025
6472071
updated UpsertRow class to be recognized in the return statement
mattmartin14 Feb 6, 2025
51c34da
removed some spaces and streamlined assert statements in unit testing
mattmartin14 Feb 6, 2025
862a69a
updated test cases to use an InMemory catalog
mattmartin14 Feb 7, 2025
3731b86
updated some formatting; added more commentary on the rows_to_update …
mattmartin14 Feb 7, 2025
bbb35d6
rebased poetry lock file and pyproject.toml file; removed sf repo info
mattmartin14 Feb 10, 2025
c8189c9
Merge branch 'main' into main
mattmartin14 Feb 10, 2025
02af4d4
updated equality checks with not instead of == false
mattmartin14 Feb 10, 2025
cc75192
ran ruff check --fix
mattmartin14 Feb 10, 2025
998d98b
manually added lint fixes and updated poetry toml and lock files. tha…
mattmartin14 Feb 11, 2025
513c839
added formatting fices
mattmartin14 Feb 11, 2025
0fd6446
remove the node_modules
mattmartin14 Feb 11, 2025
5fc3478
updated code for another round of fixes
mattmartin14 Feb 11, 2025
6cef789
removed npm uneeded files
mattmartin14 Feb 11, 2025
40b69b8
fixed formatting on upsert function for docs build
mattmartin14 Feb 12, 2025
804c526
Merge branch 'main' into main
mattmartin14 Feb 12, 2025
09e0347
rebased for poetry lock files
mattmartin14 Feb 12, 2025
ca2d904
updated lock files. thanks kevin
mattmartin14 Feb 12, 2025
77375fb
fixed other changes
mattmartin14 Feb 12, 2025
ba4db49
fixed gitignore file
mattmartin14 Feb 12, 2025
622e66c
no whitespace
mattmartin14 Feb 12, 2025
9e79dad
fixed vendor fb file from kevins changes
mattmartin14 Feb 12, 2025
4cbf3e3
reverting vendor changes
mattmartin14 Feb 12, 2025
5333a1e
removing node modules
mattmartin14 Feb 12, 2025
11a25be
updating vendor files
mattmartin14 Feb 12, 2025
03a8d10
Update vendor/fb303/FacebookService.py
mattmartin14 Feb 12, 2025
8a2143c
updated vendor files
mattmartin14 Feb 12, 2025
e719cf8
updated vendor files
mattmartin14 Feb 12, 2025
245b4a9
attempting to update poetry files
mattmartin14 Feb 12, 2025
e3e9611
Merge branch 'main' into main
mattmartin14 Feb 12, 2025
e575b3c
restore vendor/
kevinjqliu Feb 13, 2025
e4e530f
resetore pyproject.toml
kevinjqliu Feb 13, 2025
2ff2083
poetry lock
kevinjqliu Feb 13, 2025
8585d2d
add datafusion to tool.mypy.overrides
kevinjqliu Feb 13, 2025
f673b70
Merge remote-tracking branch 'apache/main' into StateFarmIns/main
kevinjqliu Feb 13, 2025
bug fixes
mattmartin14 committed Feb 6, 2025
commit 7d55a4eb988addff6fc51738fe1b6cd53434e2c3
56 changes: 37 additions & 19 deletions pyiceberg/table/__init__.py
@@ -52,7 +52,6 @@
     IsNull,
     Or,
     Reference,
-    In,
 )
 from pyiceberg.expressions.visitors import (
     _InclusiveMetricsEvaluator,
@@ -1072,8 +1071,6 @@ class UpsertResult:
     """Summary the upsert operation"""
     rows_updated: int = 0
     rows_inserted: int = 0
-    info_msgs: Optional[str] = None
-    error_msgs: Optional[str] = None
 
 def upsert(self, df: pa.Table, join_cols: list
     , when_matched_update_all: bool = True
@@ -1083,27 +1080,46 @@ def upsert(self, df: pa.Table, join_cols: list
     Shorthand API for performing an upsert to an iceberg table.
 
     Args:
+        self: the target Iceberg table to execute the upsert on
         df: The input dataframe to upsert with the table's data.
-        join_cols: The columns to join on.
+        join_cols: The columns to join on. These are essentially analogous to primary keys
+        when_matched_update_all: Bool indicating to update rows that are matched but require an update due to a value in a non-key column changing
+        when_not_matched_insert_all: Bool indicating new rows to be inserted that do not match any existing rows in the table
 
-    Returns: a UpsertResult class
+    Example Use Cases:
+        Case 1: Both Parameters = True (Full Upsert)
+            Existing row found → Update it
+            New row found → Insert it
+
+        Case 2: when_matched_update_all = False, when_not_matched_insert_all = True
+            Existing row found → Do nothing (no updates)
+            New row found → Insert it
+
+        Case 3: when_matched_update_all = True, when_not_matched_insert_all = False
+            Existing row found → Update it
+            New row found → Do nothing (no inserts)
+
+        Case 4: Both Parameters = False (No Merge Effect)
+            Existing row found → Do nothing
+            New row found → Do nothing
+            (Function effectively does nothing)
+
+    Returns: a UpsertResult class (contains details of rows updated and inserted)
     """
 
     from pyiceberg.table import upsert_util
 
     if when_matched_update_all == False and when_not_matched_insert_all == False:
-        return {'rows_updated': 0, 'rows_inserted': 0, 'info_msgs': 'no upsert options selected...exiting'}
-        #return UpsertResult(info_msgs='no upsert options selected...exiting')
+        raise Exception('no upsert options selected...exiting')
 
-    if upsert_util.dups_check_in_source(df, join_cols):
+    if upsert_util.has_duplicate_rows(df, join_cols):
 
-        return {'error_msgs': 'Duplicate rows found in source dataset based on the key columns. No upsert executed'}
+        raise Exception('Duplicate rows found in source dataset based on the key columns. No upsert executed')
 
     #get list of rows that exist so we don't have to load the entire target table
-    pred = upsert_util.get_filter_list(df, join_cols)
-    iceberg_table_trimmed = self.scan(row_filter=pred).to_arrow()
+    matched_predicate = upsert_util.create_match_filter(df, join_cols)
+    matched_iceberg_table = self.scan(row_filter=matched_predicate).to_arrow()
 
     update_row_cnt = 0
     insert_row_cnt = 0
@@ -1113,23 +1129,25 @@ def upsert(self, df: pa.Table, join_cols: list
     with self.transaction() as txn:
 
         if when_matched_update_all:
 
-            update_recs = upsert_util.get_rows_to_update(df, iceberg_table_trimmed, join_cols)
+            #function get_rows_to_update is doing a check on non-key columns to see if any of the values have actually changed
+            rows_to_update = upsert_util.get_rows_to_update(df, matched_iceberg_table, join_cols)
 
-            update_row_cnt = len(update_recs)
+            update_row_cnt = len(rows_to_update)
 
-            overwrite_filter = upsert_util.get_filter_list(update_recs, join_cols)
+            #build the match predicate filter
+            overwrite_mask_predicate = upsert_util.create_match_filter(rows_to_update, join_cols)
 
-            txn.overwrite(update_recs, overwrite_filter=overwrite_filter)
+            txn.overwrite(rows_to_update, overwrite_filter=overwrite_mask_predicate)
 
         if when_not_matched_insert_all:
 
-            insert_recs = upsert_util.get_rows_to_insert(df, iceberg_table_trimmed, join_cols)
+            rows_to_insert = upsert_util.get_rows_to_insert(df, matched_iceberg_table, join_cols)
 
-            insert_row_cnt = len(insert_recs)
+            insert_row_cnt = len(rows_to_insert)
 
-            txn.append(insert_recs)
+            txn.append(rows_to_insert)
 
     return {
         "rows_updated": update_row_cnt,
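Reviewer note, not part of the diff: as of this commit, upsert raises on bad input instead of smuggling messages through the result, and it still returns a plain dict (the UpsertResult return named in the docstring is wired up in a later commit). A minimal sketch of the resulting call-site behavior, assuming an already-loaded Iceberg table handle tbl keyed on order_id (tbl and the sample rows are hypothetical):

    import pyarrow as pa

    # Hypothetical source batch; `tbl` stands in for a loaded Iceberg table.
    source = pa.table({"order_id": [1, 2], "order_type": ["A", "B"]})

    # Case 1 from the docstring: full upsert (both flags default to True).
    res = tbl.upsert(df=source, join_cols=["order_id"])
    print(res["rows_updated"], res["rows_inserted"])  # dict keys as of this commit

    # Case 4: both flags False now raises instead of returning an info dict.
    try:
        tbl.upsert(df=source, join_cols=["order_id"],
                   when_matched_update_all=False,
                   when_not_matched_insert_all=False)
    except Exception as exc:
        print(exc)  # no upsert options selected...exiting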
14 changes: 5 additions & 9 deletions pyiceberg/table/upsert_util.py
@@ -28,26 +28,22 @@
     In,
 )
 
-def get_filter_list(df: pyarrow_table, join_cols: list) -> BooleanExpression:
+def create_match_filter(df: pyarrow_table, join_cols: list) -> BooleanExpression:
 
     unique_keys = df.select(join_cols).group_by(join_cols).aggregate([])
 
-    pred = None
-
     if len(join_cols) == 1:
-        pred = In(join_cols[0], unique_keys[0].to_pylist())
+        return In(join_cols[0], unique_keys[0].to_pylist())
     else:
-        pred = Or(*[
+        return Or(*[
             And(*[
                 EqualTo(col, row[col])
                 for col in join_cols
             ])
             for row in unique_keys.to_pylist()
         ])
-
-    return pred
 
-def dups_check_in_source(df: pyarrow_table, join_cols: list) -> bool:
+def has_duplicate_rows(df: pyarrow_table, join_cols: list) -> bool:
     """
     This function checks if there are duplicate rows in the source table based on the join columns.
     It returns True if there are duplicate rows in the source table, otherwise it returns False.
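For context (not part of the diff): create_match_filter builds its predicate from pyiceberg expression classes already imported in this module. A small illustration with hypothetical keys, showing the two shapes it can return:

    import pyarrow as pa
    from pyiceberg.expressions import And, EqualTo, In, Or

    df = pa.table({"order_id": [1, 2], "order_line_id": [10, 20]})

    # Single join column -> one In(...) over the distinct key values:
    single_key_pred = In("order_id", df["order_id"].to_pylist())

    # Composite key -> Or of one And(EqualTo(...), ...) term per source row:
    composite_key_pred = Or(
        And(EqualTo("order_id", 1), EqualTo("order_line_id", 10)),
        And(EqualTo("order_id", 2), EqualTo("order_line_id", 20)),
    )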
@@ -144,7 +140,7 @@ def get_rows_to_insert(source_table: pa.Table, target_table: pa.Table, join_cols
             source_filter_expr = expr
         else:
             source_filter_expr = source_filter_expr & expr
-
+
     non_matching_expr = ~source_filter_expr
 
     source_columns = set(source_table.column_names)
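The hunk above shows only the tail of get_rows_to_insert: one filter expression per join column is accumulated with &, then negated, so the rows kept are those with no key match in the target. A rough standalone sketch of that pattern (simplified here to per-column membership tests, which is weaker than true composite-key matching; assumes a pyarrow version whose Table.filter accepts compute expressions):

    import pyarrow as pa
    import pyarrow.compute as pc

    source = pa.table({"order_id": [1, 2, 3], "order_type": ["A", "B", "C"]})
    target = pa.table({"order_id": [2, 3], "order_type": ["B", "C"]})

    # Accumulate one membership test per join column, mirroring the loop above.
    source_filter_expr = None
    for col in ["order_id"]:
        expr = pc.field(col).isin(target[col].to_pylist())
        if source_filter_expr is None:
            source_filter_expr = expr
        else:
            source_filter_expr = source_filter_expr & expr

    # Negate: rows whose keys are absent from the target are the inserts.
    non_matching_expr = ~source_filter_expr
    print(source.filter(non_matching_expr)["order_id"].to_pylist())  # [1]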
27 changes: 9 additions & 18 deletions tests/table/test_upsert.py
@@ -77,7 +77,7 @@ def gen_source_dataset(start_row: int, end_row: int, composite_key: bool, add_du
 
     return df
 
-def gen_target_iceberg_table_v2(start_row: int, end_row: int, composite_key: bool, ctx: SessionContext, catalog: SqlCatalog, namespace: str):
+def gen_target_iceberg_table(start_row: int, end_row: int, composite_key: bool, ctx: SessionContext, catalog: SqlCatalog, namespace: str):
 
     additional_columns = ", t.order_id + 1000 as order_line_id" if composite_key else ""
 
@@ -107,7 +107,7 @@ def catalog_conn():
         },
     )
 
-    catalog.create_namespace(namespace="test_ns")
+    catalog.create_namespace(namespace=_TEST_NAMESPACE)
 
     yield catalog
 
@@ -128,7 +128,7 @@ def test_merge_rows(catalog_conn, join_cols, src_start_row, src_end_row, target_
     catalog = catalog_conn
 
     source_df = gen_source_dataset(src_start_row, src_end_row, False, False, ctx)
-    ice_table = gen_target_iceberg_table_v2(target_start_row, target_end_row, False, ctx, catalog, _TEST_NAMESPACE)
+    ice_table = gen_target_iceberg_table(target_start_row, target_end_row, False, ctx, catalog, _TEST_NAMESPACE)
     res = ice_table.upsert(df=source_df, join_cols=join_cols, when_matched_update_all=when_matched_update_all, when_not_matched_insert_all=when_not_matched_insert_all)
 
     assert res['rows_updated'] == expected_updated, f"rows updated should be {expected_updated}, but got {res['rows_updated']}"
@@ -147,7 +146,6 @@ def test_merge_scenario_skip_upd_row(catalog_conn):
     tests a single insert and update; skips a row that does not need to be updated
     """
 
-
     ctx = SessionContext()
 
     df = ctx.sql(f"""
@@ -262,7 +261,7 @@ def test_merge_scenario_composite_key(catalog_conn):
     ctx = SessionContext()
 
     catalog = catalog_conn
-    table = gen_target_iceberg_table_v2(1, 200, True, ctx, catalog, _TEST_NAMESPACE)
+    table = gen_target_iceberg_table(1, 200, True, ctx, catalog, _TEST_NAMESPACE)
     source_df = gen_source_dataset(101, 300, True, False, ctx)
 
@@ -286,14 +285,12 @@ def test_merge_source_dups(catalog_conn):
 
     catalog = catalog_conn
-    table = gen_target_iceberg_table_v2(1, 10, False, ctx, catalog, _TEST_NAMESPACE)
+    table = gen_target_iceberg_table(1, 10, False, ctx, catalog, _TEST_NAMESPACE)
     source_df = gen_source_dataset(5, 15, False, True, ctx)
 
-    res = table.upsert(df=source_df, join_cols=["order_id"])
-
-    error_msgs = res['error_msgs']
+    with pytest.raises(Exception, match="Duplicate rows found in source dataset based on the key columns. No upsert executed"):
+        table.upsert(df=source_df, join_cols=["order_id"])
 
-    assert 'Duplicate rows found in source dataset' in error_msgs, f"error message should contain 'Duplicate rows found in source dataset', but got {error_msgs}"
 
     catalog.drop_table(f"{_TEST_NAMESPACE}.target")
 
@@ -314,14 +311,8 @@ def test_key_cols_misaligned(catalog_conn):
 
     df_src = ctx.sql("select 1 as item_id, date '2021-05-01' as order_date, 'B' as order_type").to_arrow_table()
 
-    try:
-
-        res = table.upsert(df=df_src, join_cols=['order_id'])
-
-    except KeyError as e:
-        error_msgs = str(e)
-
-    assert 'Field "order_id" does not exist in schema' in error_msgs, f"""error message should contain 'Field "order_id" does not exist in schema', but got {error_msgs}"""
+    with pytest.raises(Exception, match=r"""Field ".*" does not exist in schema"""):
+        table.upsert(df=df_src, join_cols=['order_id'])
 
     catalog.drop_table(f"{_TEST_NAMESPACE}.target")
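One more reviewer note: the last two hunks swap manual try/except bookkeeping for pytest.raises, which fails the test when nothing raises and regex-matches the exception message (re.search semantics, so a substring pattern is enough, and KeyError is caught by the Exception base class). A toy standalone version of the idiom, with a stub in place of table.upsert:

    import pytest

    def upsert_stub():
        # Stand-in for an upsert call that rejects duplicate source keys.
        raise Exception("Duplicate rows found in source dataset based on the key columns. No upsert executed")

    def test_duplicate_keys_rejected():
        with pytest.raises(Exception, match="Duplicate rows found in source dataset"):
            upsert_stub()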