[SPARK-47845][SQL][PYTHON][CONNECT] Support Column type in split function for scala and python #46045
Changes from all commits: 32ab7eb, 29347d8, 9ca7e0f, c3c7474, 46355bb
New golden file (explain result, `split` with a Column pattern):
@@ -0,0 +1,2 @@
Project [split(g#0, g#0, -1) AS split(g, g, -1)#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
New golden file (explain result, `split` with a Column limit):
@@ -0,0 +1,2 @@
Project [split(g#0, ;, a#0) AS split(g, ;, a)#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
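For context, here is a minimal PySpark sketch of calls that would produce analyzed plans like the two golden files above. This is not the PR's own test code (the golden files are generated from the Scala Connect client test suites), and it assumes a simplified schema with just a string column g and an int column a:

import pyspark.sql.functions as sf
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a;b;c", 2)], ["g", "a"])

# Column pattern; limit defaults to -1, matching split(g, g, -1) above.
df.select(sf.split(df.g, df.g)).explain(extended=True)

# String pattern with a Column limit, matching split(g, ;, a) above.
df.select(sf.split(df.g, ";", df.a)).explain(extended=True)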
New golden file (Connect proto plan, `split` with a Column pattern):
@@ -0,0 +1,29 @@
{
  "common": {
    "planId": "1"
  },
  "project": {
    "input": {
      "common": {
        "planId": "0"
      },
      "localRelation": {
        "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e"
      }
    },
    "expressions": [{
      "unresolvedFunction": {
        "functionName": "split",
        "arguments": [{
          "unresolvedAttribute": {
            "unparsedIdentifier": "g"
          }
        }, {
          "unresolvedAttribute": {
            "unparsedIdentifier": "g"
          }
        }]
      }
    }]
  }
}
New golden file (Connect proto plan, `split` with a Column limit):
@@ -0,0 +1,33 @@
{
  "common": {
    "planId": "1"
  },
  "project": {
    "input": {
      "common": {
        "planId": "0"
      },
      "localRelation": {
        "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e"
      }
    },
    "expressions": [{
      "unresolvedFunction": {
        "functionName": "split",
        "arguments": [{
          "unresolvedAttribute": {
            "unparsedIdentifier": "g"
          }
        }, {
          "literal": {
            "string": ";"
          }
        }, {
          "unresolvedAttribute": {
            "unparsedIdentifier": "a"
          }
        }]
      }
    }]
  }
}
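The proto messages mirror how the client serializes each argument: a Column becomes an unresolvedAttribute, while a plain string pattern stays a string literal. A hedged sketch for inspecting this from the Python Connect client follows; it assumes a reachable Spark Connect server at sc://localhost, and `_plan.to_proto` is an internal API that may change:

import pyspark.sql.functions as sf
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote("sc://localhost").getOrCreate()
df = spark.createDataFrame([("a;b;c", 2)], ["g", "a"])

# split(g, ";", a): one attribute, one string literal, one attribute,
# matching the arguments list in the second golden file above.
plan = df.select(sf.split(df.g, ";", df.a))
print(plan._plan.to_proto(spark.client))  # internal API, illustration only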
Spark Connect Python client (`pyspark.sql.connect.functions`):
@@ -2476,8 +2476,13 @@ def repeat(col: "ColumnOrName", n: Union["ColumnOrName", int]) -> Column:
 repeat.__doc__ = pysparkfuncs.repeat.__doc__


-def split(str: "ColumnOrName", pattern: str, limit: int = -1) -> Column:
-    return _invoke_function("split", _to_col(str), lit(pattern), lit(limit))
+def split(
+    str: "ColumnOrName",
+    pattern: Union[Column, str],
+    limit: Union["ColumnOrName", int] = -1,
+) -> Column:
+    limit = lit(limit) if isinstance(limit, int) else _to_col(limit)
+    return _invoke_function("split", _to_col(str), lit(pattern), limit)


 split.__doc__ = pysparkfuncs.split.__doc__
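The key design point in this change: an int limit is wrapped with lit(), while anything else is routed through _to_col, so both Column objects and column names work for `limit`; the pattern, by contrast, always goes through lit(), so a bare string keeps meaning a regex rather than a column name. A usage sketch (assumes a Spark Connect server at sc://localhost; the data is illustrative only):

import pyspark.sql.functions as sf
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote("sc://localhost").getOrCreate()
df = spark.createDataFrame([("one;two;three", ";", 2)], ["s", "sep", "n"])

df.select(sf.split(df.s, df.sep)).show()     # Column pattern, default limit -1
df.select(sf.split(df.s, ";", 2)).show()     # str pattern + int limit still work
df.select(sf.split(df.s, ";", df.n)).show()  # Column limit
df.select(sf.split(df.s, ";", "n")).show()   # column-name limit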
PySpark functions module (`pyspark.sql.functions`):
@@ -10944,7 +10944,11 @@ def repeat(col: "ColumnOrName", n: Union["ColumnOrName", int]) -> Column:


 @_try_remote_functions
-def split(str: "ColumnOrName", pattern: str, limit: int = -1) -> Column:
+def split(
+    str: "ColumnOrName",
+    pattern: Union[Column, str],
+    limit: Union["ColumnOrName", int] = -1,
+) -> Column:
     """
     Splits str around matches of the given pattern.

@@ -10957,10 +10961,10 @@ def split(str: "ColumnOrName", pattern: str, limit: int = -1) -> Column:
     ----------
     str : :class:`~pyspark.sql.Column` or str
         a string expression to split
-    pattern : str
+    pattern : :class:`~pyspark.sql.Column` or str
         a string representing a regular expression. The regex string should be
         a Java regular expression.
-    limit : int, optional
+    limit : :class:`~pyspark.sql.Column` or str or int
         an integer which controls the number of times `pattern` is applied.

     * ``limit > 0``: The resulting array's length will not be more than `limit`, and the
@@ -10972,20 +10976,65 @@ def split(str: "ColumnOrName", pattern: str, limit: int = -1) -> Column:
     .. versionchanged:: 3.0
        `split` now takes an optional `limit` field. If not provided, default limit value is -1.

+    .. versionchanged:: 4.0.0
+        `pattern` now also accepts a column. It does not accept a column name, since a
+        string is still interpreted as a regular expression, for backwards compatibility.
+        In addition to int, `limit` now accepts a column and a column name.
+
     Returns
     -------
     :class:`~pyspark.sql.Column`
         array of separated strings.

     Examples
     --------
+    >>> import pyspark.sql.functions as sf
     >>> df = spark.createDataFrame([('oneAtwoBthreeC',)], ['s',])
-    >>> df.select(split(df.s, '[ABC]', 2).alias('s')).collect()
-    [Row(s=['one', 'twoBthreeC'])]
-    >>> df.select(split(df.s, '[ABC]', -1).alias('s')).collect()
-    [Row(s=['one', 'two', 'three', ''])]
+    >>> df.select(sf.split(df.s, '[ABC]', 2).alias('s')).show()
+    +-----------------+
+    |                s|
+    +-----------------+
+    |[one, twoBthreeC]|
+    +-----------------+
+
+    >>> import pyspark.sql.functions as sf
+    >>> df = spark.createDataFrame([('oneAtwoBthreeC',)], ['s',])
+    >>> df.select(sf.split(df.s, '[ABC]', -1).alias('s')).show()
+    +-------------------+
+    |                  s|
+    +-------------------+
+    |[one, two, three, ]|
+    +-------------------+
+
+    >>> import pyspark.sql.functions as sf
+    >>> df = spark.createDataFrame(
+    ...     [('oneAtwoBthreeC', '[ABC]'), ('1A2B3C', '[1-9]+'), ('aa2bb3cc4', '[1-9]+')],
+    ...     ['s', 'pattern']
+    ... )
+    >>> df.select(sf.split(df.s, df.pattern).alias('s')).show()
+    +-------------------+
+    |                  s|
+    +-------------------+
+    |[one, two, three, ]|
+    |        [, A, B, C]|
+    |     [aa, bb, cc, ]|
+    +-------------------+
+
+    >>> import pyspark.sql.functions as sf
+    >>> df = spark.createDataFrame(
+    ...     [('oneAtwoBthreeC', '[ABC]', 2), ('1A2B3C', '[1-9]+', -1)],
+    ...     ['s', 'pattern', 'expected_parts']
+    ... )
+    >>> df.select(sf.split(df.s, df.pattern, df.expected_parts).alias('s')).show()
+    +-----------------+
+    |                s|
+    +-----------------+
+    |[one, twoBthreeC]|
+    |      [, A, B, C]|
+    +-----------------+
     """
-    return _invoke_function("split", _to_java_column(str), pattern, limit)
+    limit = lit(limit) if isinstance(limit, int) else limit
+    return _invoke_function_over_columns("split", str, lit(pattern), limit)


 @_try_remote_functions
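One subtlety worth calling out from the versionchanged note above: a string pattern is always a regex, never a column name, so only passing a Column object switches to per-row patterns. A small self-contained example of the difference (classic PySpark session assumed; the data is illustrative):

import pyspark.sql.functions as sf
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("aXbXc", "X")], ["s", "sep"])

df.select(sf.split(df.s, "sep")).show()   # string is a regex: no match, [aXbXc]
df.select(sf.split(df.s, df.sep)).show()  # Column: splits on "X", [a, b, c]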
Review comments:

Reviewer: Let's add some new test cases for the connect module in PlanGenerationTestSuite.

Reviewer: @CTCC1, you also need to run `connect/testOnly org.apache.spark.sql.connect.ProtoToParsedPlanTestSuite` to generate the golden files needed for reverse validation testing.

Author: Thanks for the pointer, fixed.