Changes from 1 commit

41 commits
2e0b308
initial commit of cogroup
d80tb7 Jun 20, 2019
64ff5ac
minor tidy up
d80tb7 Jun 20, 2019
6d039e3
removed incorrect test
d80tb7 Jun 21, 2019
d8a5c5d
tidies up test, fixed output cols
d80tb7 Jun 25, 2019
73188f6
removed incorrect file
d80tb7 Jun 25, 2019
690fa14
Revert: removed incorrect test
d80tb7 Jun 25, 2019
c86b2bf
Merge branch 'master' of https://github.com/d80tb7/spark into SPARK-2…
d80tb7 Jun 25, 2019
e3b66ac
fix for resolving key cols
d80tb7 Jun 25, 2019
8007fa6
common trait for grouped pandas udfs
d80tb7 Jun 27, 2019
d4cf6d0
poc using arrow streams
d80tb7 Jun 27, 2019
87aeb92
more unit tests for cogroup
d80tb7 Jun 27, 2019
e7528d0
argspec includes grouping key
d80tb7 Jul 2, 2019
b85ec75
fixed tests und
d80tb7 Jul 2, 2019
6a8ecff
keys now handled properly. Validation of udf. More tests
d80tb7 Jul 2, 2019
d2da787
formatting
d80tb7 Jul 2, 2019
7321141
fixed scalastyle errors
d80tb7 Jul 2, 2019
6bbe31c
updated grouped map to new args format
d80tb7 Jul 2, 2019
b444ff7
Merge branch 'master' of https://github.com/apache/spark into SPARK-2…
d80tb7 Jul 2, 2019
94be574
some code review fixes
d80tb7 Jul 11, 2019
9241639
Merge branch 'master' of https://github.com/apache/spark into SPARK-2…
d80tb7 Jul 11, 2019
3de551f
more code review fixes
d80tb7 Jul 11, 2019
300b53a
more code review fixes
d80tb7 Jul 11, 2019
7d161ba
fix comment on PandasCogroupSerializer
d80tb7 Jul 11, 2019
d1a6366
formatting
d80tb7 Jul 11, 2019
a201161
Merge branch 'master' of https://github.com/apache/spark into SPARK-2…
d80tb7 Jul 19, 2019
3e4bc95
python style fixes
d80tb7 Jul 19, 2019
307e664
added doc
d80tb7 Jul 19, 2019
7558b8d
Merge branch 'master' of https://github.com/apache/spark into SPARK-2…
d80tb7 Jul 23, 2019
19360c4
minor formatting
d80tb7 Jul 23, 2019
28493b4
a couple more unit tests
d80tb7 Jul 23, 2019
d6d11e4
minor formatting
d80tb7 Jul 23, 2019
a62a1e3
more doc
d80tb7 Jul 25, 2019
ec78284
added comment to cogroup func
d80tb7 Jul 25, 2019
1a9ff58
fixed python style
d80tb7 Jul 25, 2019
c0d2919
review comments
d80tb7 Aug 20, 2019
4cd5c70
review comments scala
d80tb7 Aug 20, 2019
e025375
Merge branch 'master' of https://github.com/apache/spark into SPARK-2…
d80tb7 Aug 20, 2019
dd1ffaf
python formatting
d80tb7 Aug 20, 2019
733b592
review comments (mainly formatting)
d80tb7 Sep 8, 2019
51dcbdc
Merge branch 'master' of https://github.com/apache/spark into SPARK-2…
d80tb7 Sep 8, 2019
1b966fd
couple more format changes
d80tb7 Sep 15, 2019
review comments
d80tb7 committed Aug 20, 2019
commit c0d291961316950d0390ab588222d18160fc9b12
51 changes: 17 additions & 34 deletions python/pyspark/serializers.py
@@ -356,33 +356,6 @@ def __repr__(self):
        return "ArrowStreamPandasSerializer"


-class InterleavedArrowReader(object):
-
-    def __init__(self, stream):
-        self._stream = stream
-
-    def __iter__(self):
-        return self
-
-    def __next__(self):
-        dataframes_in_group = read_int(self._stream)
-        if dataframes_in_group == 2:
-            return self._read_df(), self._read_df()
-        elif dataframes_in_group == 0:
-            raise StopIteration
-        else:
-            raise ValueError(
-                'Received Invalid number of dataframes in group {0}'.format(dataframes_in_group))
-
-    def next(self):
-        return self.__next__()
-
-    def _read_df(self):
-        import pyarrow as pa
-        reader = pa.ipc.open_stream(self._stream)
-        return [b for b in reader]

class ArrowStreamPandasUDFSerializer(ArrowStreamPandasSerializer):
    """
    Serializer used by Python worker to evaluate Pandas UDFs
@@ -428,21 +401,31 @@ def __repr__(self):
        return "ArrowStreamPandasUDFSerializer"


-class PandasCogroupSerializer(ArrowStreamPandasUDFSerializer):
+class CogroupUDFSerializer(ArrowStreamPandasUDFSerializer):

    def __init__(self, timezone, safecheck, assign_cols_by_name):
-        super(PandasCogroupSerializer, self).__init__(timezone, safecheck, assign_cols_by_name)
+        super(CogroupUDFSerializer, self).__init__(timezone, safecheck, assign_cols_by_name)
Review comment (Member): nit: seems like this only calls the super constructor. I don't think we need this.

    def load_stream(self, stream):
        """
        Deserialize Cogrouped ArrowRecordBatches to a tuple of Arrow tables and return as a two
Review comment (Member): grammar: as a -> as; change: return -> yield

        lists of pandas.Series.
        """
-        reader = InterleavedArrowReader(stream)
-        for batch1, batch2 in reader:
-            import pyarrow as pa
-            yield ([self.arrow_to_pandas(c) for c in pa.Table.from_batches(batch1).itercolumns()],
-                   [self.arrow_to_pandas(c) for c in pa.Table.from_batches(batch2).itercolumns()])
+        import pyarrow as pa
+        dataframes_in_group = None
+
+        while dataframes_in_group is None or dataframes_in_group > 0:
+            dataframes_in_group = read_int(stream)
+
+            if dataframes_in_group == 2:
+                batch1 = [batch for batch in ArrowStreamSerializer.load_stream(self, stream)]
+                batch2 = [batch for batch in ArrowStreamSerializer.load_stream(self, stream)]
+                yield ([self.arrow_to_pandas(c) for c in pa.Table.from_batches(batch1).itercolumns()],
+                       [self.arrow_to_pandas(c) for c in pa.Table.from_batches(batch2).itercolumns()])
+
+            elif dataframes_in_group != 0:
+                raise ValueError(
+                    'Received Invalid number of dataframes in group {0}'.format(dataframes_in_group))


class BatchedSerializer(Serializer):
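For orientation, the framing that CogroupUDFSerializer.load_stream consumes can be sketched from the writer's side. This is a hypothetical helper, not part of the PR: the real writer lives on the JVM side, and the use of pyarrow's RecordBatchStreamWriter and pyspark's write_int here is an assumption based on the wire format implied by the reader above.

import pyarrow as pa
from pyspark.serializers import write_int

def write_cogroups(stream, groups):
    # groups: iterable of (left_batches, right_batches), each a non-empty
    # list of pyarrow.RecordBatch sharing one schema.
    for left, right in groups:
        write_int(2, stream)  # two dataframes follow in this group
        for batches in (left, right):
            writer = pa.RecordBatchStreamWriter(stream, batches[0].schema)
            for batch in batches:
                writer.write_batch(batch)
            writer.close()  # emits the Arrow end-of-stream marker
    write_int(0, stream)  # no more groups: load_stream's while-loop exits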
2 changes: 1 addition & 1 deletion python/pyspark/sql/cogroup.py
@@ -44,7 +44,7 @@ def apply(self, udf):
        as a `DataFrame`.

        The user-defined function should take two `pandas.DataFrame` and return another
-        `pandas.DataFrame`. For each side of the cogroup, all columns are passed together
+        ``pandas.DataFrame``. For each side of the cogroup, all columns are passed together
        as a `pandas.DataFrame` to the user-function and the returned `pandas.DataFrame`
        are combined as a :class:`DataFrame`.

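Since the docstring above describes the call pattern only in prose, here is a compact usage sketch. The names df1, df2 and the result schema are illustrative; the decorator form matches the test file later in this diff.

import pandas as pd
from pyspark.sql.functions import pandas_udf, PandasUDFType

@pandas_udf('id long, v1 long, v2 long', PandasUDFType.COGROUPED_MAP)
def merge_sides(l, r):
    # One pandas.DataFrame per side of the cogroup; return one DataFrame.
    return pd.merge(l, r, on='id')

result = df1.groupby('id').cogroup(df2.groupby('id')).apply(merge_sides)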
8 changes: 4 additions & 4 deletions python/pyspark/sql/tests/test_pandas_udf_cogrouped_map.py
@@ -96,20 +96,20 @@ def test_complex_group_by(self):
            'v2': [90, 100, 110]
        })

-        left_df = self.spark\
+        left_gdf = self.spark\
            .createDataFrame(left)\
            .groupby(col('id') % 2 == 0)

-        right_df = self.spark \
+        right_gdf = self.spark \
            .createDataFrame(right) \
            .groupby(col('id') % 2 == 0)

        @pandas_udf('k long, v long, v2 long', PandasUDFType.COGROUPED_MAP)
        def merge_pandas(l, r):
            return pd.merge(l[['k', 'v']], r[['k', 'v2']], on=['k'])

-        result = left_df \
-            .cogroup(right_df) \
+        result = left_gdf \
+            .cogroup(right_gdf) \
            .apply(merge_pandas) \
            .sort(['k']) \
            .toPandas()
16 changes: 8 additions & 8 deletions python/pyspark/worker.py
@@ -39,7 +39,7 @@
from pyspark.rdd import PythonEvalType
from pyspark.serializers import write_with_length, write_int, read_long, read_bool, \
    write_long, read_int, SpecialLengths, UTF8Deserializer, PickleSerializer, \
-    BatchedSerializer, ArrowStreamPandasUDFSerializer, PandasCogroupSerializer
+    BatchedSerializer, ArrowStreamPandasUDFSerializer, CogroupUDFSerializer
from pyspark.sql.types import to_arrow_type, StructType
from pyspark.util import _get_argspec, fail_on_stopiteration
from pyspark import shuffle
@@ -314,7 +314,7 @@ def read_udfs(pickleSer, infile, eval_type):
        # Scalar Pandas UDF handles struct type arguments as pandas DataFrames instead of
        # pandas Series. See SPARK-27240.
Review comment (Member): This comment should be moved into the following else-clause?

        if eval_type == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
-            ser = PandasCogroupSerializer(timezone, safecheck, assign_cols_by_name)
+            ser = CogroupUDFSerializer(timezone, safecheck, assign_cols_by_name)
        else:
            df_for_struct = (eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF or
                             eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF or
Expand Down Expand Up @@ -418,8 +418,8 @@ def extract_key_value_indexes():
            pickleSer, infile, eval_type, runner_conf, udf_index=0)
        udfs['f'] = udf
        parsed_offsets = extract_key_value_indexes()
-        keys = ["a[%d]" % o for o in parsed_offsets[0][0]]
-        vals = ["a[%d]" % o for o in parsed_offsets[0][1]]
+        keys = ["a[%d]" % (o,) for o in parsed_offsets[0][0]]
+        vals = ["a[%d]" % (o, ) for o in parsed_offsets[0][1]]
        mapper_str = "lambda a: f([%s], [%s])" % (", ".join(keys), ", ".join(vals))
    elif eval_type == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
        # We assume there is only one UDF here because cogrouped map doesn't
@@ -429,10 +429,10 @@
            pickleSer, infile, eval_type, runner_conf, udf_index=0)
        udfs['f'] = udf
        parsed_offsets = extract_key_value_indexes()
-        df1_keys = ["a[0][%d]" % o for o in parsed_offsets[0][0]]
-        df1_vals = ["a[0][%d]" % o for o in parsed_offsets[0][1]]
-        df2_keys = ["a[1][%d]" % o for o in parsed_offsets[1][0]]
-        df2_vals = ["a[1][%d]" % o for o in parsed_offsets[1][1]]
+        df1_keys = ["a[0][%d]" % (o, ) for o in parsed_offsets[0][0]]
+        df1_vals = ["a[0][%d]" % (o, )for o in parsed_offsets[0][1]]
Review comment (Member): space before for

+        df2_keys = ["a[1][%d]" % (o, ) for o in parsed_offsets[1][0]]
+        df2_vals = ["a[1][%d]" % (o, ) for o in parsed_offsets[1][1]]
        mapper_str = "lambda a: f([%s], [%s], [%s], [%s])" % (
            ", ".join(df1_keys), ", ".join(df1_vals), ", ".join(df2_keys), ", ".join(df2_vals))
    else:
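To make the string templating in read_udfs concrete, here is a worked example with illustrative offsets (not taken from the diff):

parsed_offsets = [[[0], [1, 2]], [[0], [1]]]
# The cogrouped-map branch above then builds:
# mapper_str == "lambda a: f([a[0][0]], [a[0][1], a[0][2]], [a[1][0]], [a[1][1]])"
# i.e. for each side the UDF receives its grouping-key columns and its
# value columns as separate lists of pandas.Series.
# The single-sided grouped-map branch builds the analogous
# "lambda a: f([a[0]], [a[1]])" for parsed_offsets == [[[0], [1]]].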