From 7d85f1ccd89d20fb0327847e8d8713a8f8907b14 Mon Sep 17 00:00:00 2001 From: Ron Serruya Date: Wed, 5 Jun 2024 16:22:29 +0300 Subject: [PATCH 01/10] Support usage of columns as parameters to more pyspark functions --- python/pyspark/sql/functions/builtin.py | 51 ++++++++++++++++++++----- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 07dfcaf2e2b7..b81501a4e7cd 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -10915,7 +10915,7 @@ def sentences( @_try_remote_functions -def substring(str: "ColumnOrName", pos: int, len: int) -> Column: +def substring(str: "ColumnOrName", pos: Union["ColumnOrName", int], len: Union["ColumnOrName", int]) -> Column: """ Substring starts at `pos` and is of length `len` when str is String type or returns the slice of byte array that starts at `pos` in byte and is of length `len` @@ -10934,9 +10934,9 @@ def substring(str: "ColumnOrName", pos: int, len: int) -> Column: ---------- str : :class:`~pyspark.sql.Column` or str target column to work on. - pos : int + pos : :class:`~pyspark.sql.Column` or str or int starting position in str. - len : int + len : :class:`~pyspark.sql.Column` or str or int length of chars. Returns @@ -10949,14 +10949,20 @@ def substring(str: "ColumnOrName", pos: int, len: int) -> Column: >>> df = spark.createDataFrame([('abcd',)], ['s',]) >>> df.select(substring(df.s, 1, 2).alias('s')).collect() [Row(s='ab')] + + >>> df = spark.createDataFrame([('abcd', 2, 3)], ['s', 'start', 'len']) + >>> df.select(substring(df.s, df.start, df.len).alias('s')).collect() + [Row(s='bcd')] """ from pyspark.sql.classic.column import _to_java_column + pos = _to_java_column(pos) if isinstance(pos, (str, Column)) else pos + len = _to_java_column(len) if isinstance(pos, (str, Column)) else len return _invoke_function("substring", _to_java_column(str), pos, len) @_try_remote_functions -def substring_index(str: "ColumnOrName", delim: str, count: int) -> Column: +def substring_index(str: "ColumnOrName", delim: Union[Column, str], count: Union["ColumnOrName", int]) -> Column: """ Returns the substring from string str before count occurrences of the delimiter delim. If count is positive, everything the left of the final delimiter (counting from left) is @@ -10972,9 +10978,9 @@ def substring_index(str: "ColumnOrName", delim: str, count: int) -> Column: ---------- str : :class:`~pyspark.sql.Column` or str target column to work on. - delim : str + delim : :class:`~pyspark.sql.Column` or str delimiter of values. - count : int + count : :class:`~pyspark.sql.Column` or str or int number of occurrences. Returns @@ -10992,6 +10998,8 @@ def substring_index(str: "ColumnOrName", delim: str, count: int) -> Column: """ from pyspark.sql.classic.column import _to_java_column + delim = delim._jc if isinstance(delim, Column) else delim + count = _to_java_column(count) if isinstance(count, (str, Column)) else count return _invoke_function("substring_index", _to_java_column(str), delim, count) @@ -13969,7 +13977,7 @@ def array_position(col: "ColumnOrName", value: Any) -> Column: col : :class:`~pyspark.sql.Column` or str target column to work on. value : Any - value to look for. + value or a :class:`~pyspark.sql.Column` expression to look for. Returns ------- @@ -14034,9 +14042,21 @@ def array_position(col: "ColumnOrName", value: Any) -> Column: +-----------------------+ | 3| +-----------------------+ + + Example 6: Finding the position of a column's value in an array of integers + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([([10, 20, 30], 20)], ['data', 'col']) + >>> df.select(sf.array_position(df.data, df.col)).show() + +-------------------------+ + |array_position(data, col)| + +-------------------------+ + | 2 | + +-------------------------+ """ from pyspark.sql.classic.column import _to_java_column + value = value._jc if isinstance(value, Column) else value return _invoke_function("array_position", _to_java_column(col), value) @@ -14402,7 +14422,7 @@ def array_remove(col: "ColumnOrName", element: Any) -> Column: col : :class:`~pyspark.sql.Column` or str name of column containing array element : - element to be removed from the array + element or a :class:`~pyspark.sql.Column` expression to be removed from the array Returns ------- @@ -14470,9 +14490,21 @@ def array_remove(col: "ColumnOrName", element: Any) -> Column: +---------------------+ | []| +---------------------+ + + Example 6: Removing a column's value from a simple array + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([([1, 2, 3, 1, 1], 1)], ['data', 'col']) + >>> df.select(sf.array_remove(df.data, df.col)).show() + +-----------------------+ + |array_remove(data, col)| + +-----------------------+ + | [2, 3]| + +-----------------------+ """ from pyspark.sql.classic.column import _to_java_column + element = element._jc if isinstance(element, Column) else element return _invoke_function("array_remove", _to_java_column(col), element) @@ -17237,7 +17269,7 @@ def map_contains_key(col: "ColumnOrName", value: Any) -> Column: col : :class:`~pyspark.sql.Column` or str The name of the column or an expression that represents the map. value : - A literal value. + A literal value, or a :class:`~pyspark.sql.Column` expression. Returns ------- @@ -17270,6 +17302,7 @@ def map_contains_key(col: "ColumnOrName", value: Any) -> Column: """ from pyspark.sql.classic.column import _to_java_column + value = value._jc if isinstance(value, Column) else value return _invoke_function("map_contains_key", _to_java_column(col), value) From d690698656456383c35aeb9fb1914309d5a25b96 Mon Sep 17 00:00:00 2001 From: Ron Serruya Date: Wed, 5 Jun 2024 22:40:04 +0300 Subject: [PATCH 02/10] allow usage of column params in substring functions --- python/pyspark/sql/functions/builtin.py | 25 ++++++++++++------- .../expressions/stringExpressions.scala | 4 +++ .../org/apache/spark/sql/functions.scala | 4 +-- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index b81501a4e7cd..928902cf5269 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -10949,15 +10949,16 @@ def substring(str: "ColumnOrName", pos: Union["ColumnOrName", int], len: Union[" >>> df = spark.createDataFrame([('abcd',)], ['s',]) >>> df.select(substring(df.s, 1, 2).alias('s')).collect() [Row(s='ab')] - - >>> df = spark.createDataFrame([('abcd', 2, 3)], ['s', 'start', 'len']) - >>> df.select(substring(df.s, df.start, df.len).alias('s')).collect() - [Row(s='bcd')] + >>> df = spark.createDataFrame([('abcd', 1, 2)], ['s','p', 'l']) + >>> df.select(substring(df.s, df.p, df.l).alias('s')).collect() + [Row(s='ab')] """ + # `str` is shadowed by the function's param + from builtins import str as StrType from pyspark.sql.classic.column import _to_java_column - pos = _to_java_column(pos) if isinstance(pos, (str, Column)) else pos - len = _to_java_column(len) if isinstance(pos, (str, Column)) else len + pos = _to_java_column(pos) if isinstance(pos, (StrType, Column)) else pos + len = _to_java_column(len) if isinstance(pos, (StrType, Column)) else len return _invoke_function("substring", _to_java_column(str), pos, len) @@ -10995,11 +10996,16 @@ def substring_index(str: "ColumnOrName", delim: Union[Column, str], count: Union [Row(s='a.b')] >>> df.select(substring_index(df.s, '.', -3).alias('s')).collect() [Row(s='b.c.d')] + >>> df = spark.createDataFrame([('a.b.c.d', '.', -3)], ['s', 'd', 'c']) + >>> df.select(substring_index(df.s, df.d, df.c).alias('s')).collect() + [Row(s='b.c.d')] """ + # `str` is shadowed by the function's param + from builtins import str as StrType from pyspark.sql.classic.column import _to_java_column - + delim = delim._jc if isinstance(delim, Column) else delim - count = _to_java_column(count) if isinstance(count, (str, Column)) else count + count = _to_java_column(count) if isinstance(count, (StrType, Column)) else count return _invoke_function("substring_index", _to_java_column(str), delim, count) @@ -14051,8 +14057,9 @@ def array_position(col: "ColumnOrName", value: Any) -> Column: +-------------------------+ |array_position(data, col)| +-------------------------+ - | 2 | + | 2| +-------------------------+ + """ from pyspark.sql.classic.column import _to_java_column diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 09ec501311ad..35cfb92b7672 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -1422,6 +1422,8 @@ case class StringInstr(str: Expression, substr: Expression) Examples: > SELECT _FUNC_('www.apache.org', '.', 2); www.apache + > SELECT _FUNC_(col1, col2, col3) from values('www.apache.org', '.', 2); + www.apache """, since = "1.5.0", group = "string_funcs") @@ -1967,6 +1969,8 @@ case class StringSpace(child: Expression) k > SELECT _FUNC_(encode('Spark SQL', 'utf-8'), 5); k SQL + > SELECT _FUNC_(col1, col2) from values('Spark SQL', 5); + k SQL """, since = "1.5.0", group = "string_funcs") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 52733611e42a..7f12e15a2f9c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -4231,7 +4231,7 @@ object functions { * @group string_funcs * @since 1.5.0 */ - def substring(str: Column, pos: Int, len: Int): Column = + def substring(str: Column, pos: Any, len: Any): Column = Column.fn("substring", str, lit(pos), lit(len)) /** @@ -4242,7 +4242,7 @@ object functions { * * @group string_funcs */ - def substring_index(str: Column, delim: String, count: Int): Column = + def substring_index(str: Column, delim: Any, count: Any): Column = Column.fn("substring_index", str, lit(delim), lit(count)) /** From ce954456e7a900716b33ae90f2ba5b525cbba955 Mon Sep 17 00:00:00 2001 From: Ron Serruya Date: Thu, 6 Jun 2024 03:42:24 +0300 Subject: [PATCH 03/10] revert support of column params in substring functions --- python/pyspark/sql/functions/builtin.py | 28 +++++-------------- .../expressions/stringExpressions.scala | 4 --- .../org/apache/spark/sql/functions.scala | 4 +-- 3 files changed, 9 insertions(+), 27 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 928902cf5269..7b99a24a96eb 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -10915,7 +10915,7 @@ def sentences( @_try_remote_functions -def substring(str: "ColumnOrName", pos: Union["ColumnOrName", int], len: Union["ColumnOrName", int]) -> Column: +def substring(str: "ColumnOrName", pos: int, len: int) -> Column: """ Substring starts at `pos` and is of length `len` when str is String type or returns the slice of byte array that starts at `pos` in byte and is of length `len` @@ -10934,9 +10934,9 @@ def substring(str: "ColumnOrName", pos: Union["ColumnOrName", int], len: Union[" ---------- str : :class:`~pyspark.sql.Column` or str target column to work on. - pos : :class:`~pyspark.sql.Column` or str or int + pos : int starting position in str. - len : :class:`~pyspark.sql.Column` or str or int + len : int length of chars. Returns @@ -10949,21 +10949,14 @@ def substring(str: "ColumnOrName", pos: Union["ColumnOrName", int], len: Union[" >>> df = spark.createDataFrame([('abcd',)], ['s',]) >>> df.select(substring(df.s, 1, 2).alias('s')).collect() [Row(s='ab')] - >>> df = spark.createDataFrame([('abcd', 1, 2)], ['s','p', 'l']) - >>> df.select(substring(df.s, df.p, df.l).alias('s')).collect() - [Row(s='ab')] """ - # `str` is shadowed by the function's param - from builtins import str as StrType from pyspark.sql.classic.column import _to_java_column - pos = _to_java_column(pos) if isinstance(pos, (StrType, Column)) else pos - len = _to_java_column(len) if isinstance(pos, (StrType, Column)) else len return _invoke_function("substring", _to_java_column(str), pos, len) @_try_remote_functions -def substring_index(str: "ColumnOrName", delim: Union[Column, str], count: Union["ColumnOrName", int]) -> Column: +def substring_index(str: "ColumnOrName", delim: str, count: int) -> Column: """ Returns the substring from string str before count occurrences of the delimiter delim. If count is positive, everything the left of the final delimiter (counting from left) is @@ -10979,9 +10972,9 @@ def substring_index(str: "ColumnOrName", delim: Union[Column, str], count: Union ---------- str : :class:`~pyspark.sql.Column` or str target column to work on. - delim : :class:`~pyspark.sql.Column` or str + delim : str delimiter of values. - count : :class:`~pyspark.sql.Column` or str or int + count : int number of occurrences. Returns @@ -10996,16 +10989,9 @@ def substring_index(str: "ColumnOrName", delim: Union[Column, str], count: Union [Row(s='a.b')] >>> df.select(substring_index(df.s, '.', -3).alias('s')).collect() [Row(s='b.c.d')] - >>> df = spark.createDataFrame([('a.b.c.d', '.', -3)], ['s', 'd', 'c']) - >>> df.select(substring_index(df.s, df.d, df.c).alias('s')).collect() - [Row(s='b.c.d')] """ - # `str` is shadowed by the function's param - from builtins import str as StrType from pyspark.sql.classic.column import _to_java_column - - delim = delim._jc if isinstance(delim, Column) else delim - count = _to_java_column(count) if isinstance(count, (StrType, Column)) else count + return _invoke_function("substring_index", _to_java_column(str), delim, count) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 35cfb92b7672..09ec501311ad 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -1422,8 +1422,6 @@ case class StringInstr(str: Expression, substr: Expression) Examples: > SELECT _FUNC_('www.apache.org', '.', 2); www.apache - > SELECT _FUNC_(col1, col2, col3) from values('www.apache.org', '.', 2); - www.apache """, since = "1.5.0", group = "string_funcs") @@ -1969,8 +1967,6 @@ case class StringSpace(child: Expression) k > SELECT _FUNC_(encode('Spark SQL', 'utf-8'), 5); k SQL - > SELECT _FUNC_(col1, col2) from values('Spark SQL', 5); - k SQL """, since = "1.5.0", group = "string_funcs") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 7f12e15a2f9c..52733611e42a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -4231,7 +4231,7 @@ object functions { * @group string_funcs * @since 1.5.0 */ - def substring(str: Column, pos: Any, len: Any): Column = + def substring(str: Column, pos: Int, len: Int): Column = Column.fn("substring", str, lit(pos), lit(len)) /** @@ -4242,7 +4242,7 @@ object functions { * * @group string_funcs */ - def substring_index(str: Column, delim: Any, count: Any): Column = + def substring_index(str: Column, delim: String, count: Int): Column = Column.fn("substring_index", str, lit(delim), lit(count)) /** From 3a3c2ac1ce802a947eac3f7e5e443b397d0c598d Mon Sep 17 00:00:00 2001 From: Ron Serruya Date: Thu, 6 Jun 2024 15:38:55 +0300 Subject: [PATCH 04/10] Add support for columns as params in the substring function --- .../org/apache/spark/sql/functions.scala | 39 ++++++++++++++++ .../spark/sql/PlanGenerationTestSuite.scala | 12 +++++ .../function_substring_using_len_column.json | 33 ++++++++++++++ ...ction_substring_using_len_column.proto.bin | Bin 0 -> 190 bytes ..._substring_using_start_and_len_column.json | 33 ++++++++++++++ ...tring_using_start_and_len_column.proto.bin | Bin 0 -> 192 bytes ...function_substring_using_start_column.json | 33 ++++++++++++++ ...ion_substring_using_start_column.proto.bin | Bin 0 -> 190 bytes .../function_substring_with_columns.json | 33 ++++++++++++++ .../function_substring_with_columns.proto.bin | Bin 0 -> 190 bytes python/pyspark/sql/functions/builtin.py | 42 ++++++++++++++++-- .../org/apache/spark/sql/functions.scala | 39 ++++++++++++++++ .../spark/sql/StringFunctionsSuite.scala | 7 +++ 13 files changed, 267 insertions(+), 4 deletions(-) create mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.json create mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.proto.bin create mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_and_len_column.json create mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_and_len_column.proto.bin create mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.json create mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.proto.bin create mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.json create mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.proto.bin diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala index c287e3469108..8984466c2229 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala @@ -4276,6 +4276,45 @@ object functions { def substring(str: Column, pos: Int, len: Int): Column = Column.fn("substring", str, lit(pos), lit(len)) + /** + * Substring starts at `pos` and is of length `len` when str is String type or returns the slice + * of byte array that starts at `pos` in byte and is of length `len` when str is Binary type + * + * @note + * The position is not zero based, but 1 based index. + * + * @group string_funcs + * @since 4.0.0 + */ + def substring(str: Column, pos: Column, len: Int): Column = + Column.fn("substring", str, pos, lit(len)) + + /** + * Substring starts at `pos` and is of length `len` when str is String type or returns the slice + * of byte array that starts at `pos` in byte and is of length `len` when str is Binary type + * + * @note + * The position is not zero based, but 1 based index. + * + * @group string_funcs + * @since 4.0.0 + */ + def substring(str: Column, pos: Int, len: Column): Column = + Column.fn("substring", str, lit(pos), len) + + /** + * Substring starts at `pos` and is of length `len` when str is String type or returns the slice + * of byte array that starts at `pos` in byte and is of length `len` when str is Binary type + * + * @note + * The position is not zero based, but 1 based index. + * + * @group string_funcs + * @since 4.0.0 + */ + def substring(str: Column, pos: Column, len: Column): Column = + Column.fn("substring", str, pos, len) + /** * Returns the substring from string str before count occurrences of the delimiter delim. If * count is positive, everything the left of the final delimiter (counting from left) is diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala index e0ad8f7078ca..bed1cdcfe976 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala @@ -1780,6 +1780,18 @@ class PlanGenerationTestSuite fn.substring(fn.col("g"), 4, 5) } + functionTest("substring using start column") { + fn.substring(fn.col("g"), lit(4), 5) + } + + functionTest("substring using len column") { + fn.substring(fn.col("g"), 4, lit(5)) + } + + functionTest("substring using start and len column") { + fn.substring(fn.col("g"), fn.col("a"), fn.col("b")) + } + functionTest("substring_index") { fn.substring_index(fn.col("g"), ";", 5) } diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.json b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.json new file mode 100644 index 000000000000..84a70cf1c023 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "substring", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "integer": 4 + } + }, { + "literal": { + "integer": 5 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.proto.bin new file mode 100644 index 0000000000000000000000000000000000000000..d302cd95c7434095bd8ebcf2511786058de52274 GIT binary patch literal 190 zcmd;L5@3|t&d4>1k&8)yA*!2EsDrV%q^LBx#3nPvDk(EPGp|G^(F#N+S*7HcCgr5+ zq*xJ9VW*R7l~`1iSZM>)XQz{9m77>#1Jsk5m##xdtDR0d$atVqJ1GSzIWErP(j*99 Uh*gN0i!oh@g^S661x&I60M)`e=Kufz literal 0 HcmV?d00001 diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_and_len_column.json b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_and_len_column.json new file mode 100644 index 000000000000..ba28b1c7f570 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_and_len_column.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "substring", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_and_len_column.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_and_len_column.proto.bin new file mode 100644 index 0000000000000000000000000000000000000000..f14b44ef5a501da5c419619ada7b5bff665490a1 GIT binary patch literal 192 zcmd;L5@3|t$;dT{k&8)yA*!2EsDrV%q^LBx#3nPvDk(EPGp|G^(F#N+S*7HcCgr5+ zq*xJ9VW*R7l~`1iSZM>)XQz{9m77>#1Jsk5m##xdtDR0d$atVqJ1He81uo9w(j*99 Qh*gN0i!mKSB|@kq08dFfaR2}S literal 0 HcmV?d00001 diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.json b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.json new file mode 100644 index 000000000000..84a70cf1c023 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "substring", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "integer": 4 + } + }, { + "literal": { + "integer": 5 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.proto.bin new file mode 100644 index 0000000000000000000000000000000000000000..d302cd95c7434095bd8ebcf2511786058de52274 GIT binary patch literal 190 zcmd;L5@3|t&d4>1k&8)yA*!2EsDrV%q^LBx#3nPvDk(EPGp|G^(F#N+S*7HcCgr5+ zq*xJ9VW*R7l~`1iSZM>)XQz{9m77>#1Jsk5m##xdtDR0d$atVqJ1GSzIWErP(j*99 Uh*gN0i!oh@g^S661x&I60M)`e=Kufz literal 0 HcmV?d00001 diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.json b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.json new file mode 100644 index 000000000000..84a70cf1c023 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "substring", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "integer": 4 + } + }, { + "literal": { + "integer": 5 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.proto.bin new file mode 100644 index 0000000000000000000000000000000000000000..d302cd95c7434095bd8ebcf2511786058de52274 GIT binary patch literal 190 zcmd;L5@3|t&d4>1k&8)yA*!2EsDrV%q^LBx#3nPvDk(EPGp|G^(F#N+S*7HcCgr5+ zq*xJ9VW*R7l~`1iSZM>)XQz{9m77>#1Jsk5m##xdtDR0d$atVqJ1GSzIWErP(j*99 Uh*gN0i!oh@g^S661x&I60M)`e=Kufz literal 0 HcmV?d00001 diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 7b99a24a96eb..aa158da2da00 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -10915,7 +10915,7 @@ def sentences( @_try_remote_functions -def substring(str: "ColumnOrName", pos: int, len: int) -> Column: +def substring(str: "ColumnOrName", pos: Union["ColumnOrName", int], len: Union["ColumnOrName", int]) -> Column: """ Substring starts at `pos` and is of length `len` when str is String type or returns the slice of byte array that starts at `pos` in byte and is of length `len` @@ -10926,6 +10926,9 @@ def substring(str: "ColumnOrName", pos: int, len: int) -> Column: .. versionchanged:: 3.4.0 Supports Spark Connect. + .. versionchanged:: 4.0.0 + `pos` and `len` now also accept Columns or names of Columns. + Notes ----- The position is not zero based, but 1 based index. @@ -10934,9 +10937,9 @@ def substring(str: "ColumnOrName", pos: int, len: int) -> Column: ---------- str : :class:`~pyspark.sql.Column` or str target column to work on. - pos : int + pos : :class:`~pyspark.sql.Column` or str or int starting position in str. - len : int + len : :class:`~pyspark.sql.Column` or str or int length of chars. Returns @@ -10949,9 +10952,20 @@ def substring(str: "ColumnOrName", pos: int, len: int) -> Column: >>> df = spark.createDataFrame([('abcd',)], ['s',]) >>> df.select(substring(df.s, 1, 2).alias('s')).collect() [Row(s='ab')] - """ + >>> df = spark.createDataFrame([('Spark', 2, 3)], ['s', 'p', 'l']) + >>> df.select(substring(df.s, 2, df.l).alias('s')).collect() + [Row(s='par')] + >>> df.select(substring(df.s, df.p, 3).alias('s')).collect() + [Row(s='par')] + >>> df.select(substring(df.s, df.p, df.l).alias('s')).collect() + [Row(s='par')] + """ + # the native str type is shadowed by the function's `str` param + from builtins import str as StrType from pyspark.sql.classic.column import _to_java_column + pos = _to_java_column(pos) if isinstance(pos, (StrType, Column)) else pos + len = _to_java_column(len) if isinstance(len, (StrType, Column)) else len return _invoke_function("substring", _to_java_column(str), pos, len) @@ -13959,6 +13973,9 @@ def array_position(col: "ColumnOrName", value: Any) -> Column: .. versionchanged:: 3.4.0 Supports Spark Connect. + .. versionchanged:: 4.0.0 + `value` now also accepts a Column type. + Notes ----- The position is not zero based, but 1 based index. Returns 0 if the given @@ -14410,6 +14427,9 @@ def array_remove(col: "ColumnOrName", element: Any) -> Column: .. versionchanged:: 3.4.0 Supports Spark Connect. + .. versionchanged:: 4.0.0 + `element` now also accepts a Column type. + Parameters ---------- col : :class:`~pyspark.sql.Column` or str @@ -17257,6 +17277,9 @@ def map_contains_key(col: "ColumnOrName", value: Any) -> Column: .. versionchanged:: 3.4.0 Supports Spark Connect. + .. versionchanged:: 4.0.0 + `value` now also accepts a Column type. + Parameters ---------- col : :class:`~pyspark.sql.Column` or str @@ -17292,6 +17315,17 @@ def map_contains_key(col: "ColumnOrName", value: Any) -> Column: +--------------------------+ | false| +--------------------------+ + + Example 3: Check for key using a column + + >>> from pyspark.sql import functions as sf + >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data, 1 as key") + >>> df.select(sf.map_contains_key("data", sf.col("key"))).show() + +---------------------------+ + |map_contains_key(data, key)| + +---------------------------+ + | true| + +---------------------------+ """ from pyspark.sql.classic.column import _to_java_column diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 52733611e42a..11b5d2561583 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -4234,6 +4234,45 @@ object functions { def substring(str: Column, pos: Int, len: Int): Column = Column.fn("substring", str, lit(pos), lit(len)) + /** + * Substring starts at `pos` and is of length `len` when str is String type or + * returns the slice of byte array that starts at `pos` in byte and is of length `len` + * when str is Binary type + * + * @note The position is not zero based, but 1 based index. + * + * @group string_funcs + * @since 4.0.0 + */ + def substring(str: Column, pos: Column, len: Int): Column = + Column.fn("substring", str, pos, lit(len)) + + /** + * Substring starts at `pos` and is of length `len` when str is String type or + * returns the slice of byte array that starts at `pos` in byte and is of length `len` + * when str is Binary type + * + * @note The position is not zero based, but 1 based index. + * + * @group string_funcs + * @since 4.0.0 + */ + def substring(str: Column, pos: Int, len: Column): Column = + Column.fn("substring", str, lit(pos), len) + + /** + * Substring starts at `pos` and is of length `len` when str is String type or + * returns the slice of byte array that starts at `pos` in byte and is of length `len` + * when str is Binary type + * + * @note The position is not zero based, but 1 based index. + * + * @group string_funcs + * @since 4.0.0 + */ + def substring(str: Column, pos: Column, len: Column): Column = + Column.fn("substring", str, pos, len) + /** * Returns the substring from string str before count occurrences of the delimiter delim. * If count is positive, everything the left of the final delimiter (counting from left) is diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index 3fc0b572d80b..dca8d904fe50 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -332,6 +332,13 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { // scalastyle:on } + test("string substring function using columns") { + val df = Seq(("Spark", 2, 3)).toDF("a", "b", "c") + checkAnswer(df.select(substring($"a", $"b", 3)), Row("par")) + checkAnswer(df.select(substring($"a", 2, $"c")), Row("par")) + checkAnswer(df.select(substring($"a", $"b", $"c")), Row("par")) + } + test("string encode/decode function") { val bytes = Array[Byte](-27, -92, -89, -27, -115, -125, -28, -72, -106, -25, -107, -116) // scalastyle:off From 08e0088ee22e23b47363c9cc56cc156cc75c470f Mon Sep 17 00:00:00 2001 From: Ron Serruya Date: Thu, 6 Jun 2024 16:54:12 +0300 Subject: [PATCH 05/10] reformat pyspark functions --- python/pyspark/sql/functions/builtin.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index aa158da2da00..cea88282fc85 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -10915,7 +10915,9 @@ def sentences( @_try_remote_functions -def substring(str: "ColumnOrName", pos: Union["ColumnOrName", int], len: Union["ColumnOrName", int]) -> Column: +def substring( + str: "ColumnOrName", pos: Union["ColumnOrName", int], len: Union["ColumnOrName", int] +) -> Column: """ Substring starts at `pos` and is of length `len` when str is String type or returns the slice of byte array that starts at `pos` in byte and is of length `len` From 377e6ba99f7d4ab894ae5c8b42cd905255cdc743 Mon Sep 17 00:00:00 2001 From: Ron Serruya Date: Thu, 6 Jun 2024 17:04:12 +0300 Subject: [PATCH 06/10] add substring test golden files --- .../explain-results/function_substring_using_len_column.explain | 2 ++ .../function_substring_using_start_and_len_column.explain | 2 ++ .../function_substring_using_start_column.explain | 2 ++ .../explain-results/function_substring_with_columns.explain | 2 ++ 4 files changed, 8 insertions(+) create mode 100644 connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_len_column.explain create mode 100644 connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_and_len_column.explain create mode 100644 connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_column.explain create mode 100644 connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_with_columns.explain diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_len_column.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_len_column.explain new file mode 100644 index 000000000000..fe07244fc9ce --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_len_column.explain @@ -0,0 +1,2 @@ +Project [substring(g#0, 4, 5) AS substring(g, 4, 5)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_and_len_column.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_and_len_column.explain new file mode 100644 index 000000000000..3050d15d9754 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_and_len_column.explain @@ -0,0 +1,2 @@ +Project [substring(g#0, a#0, cast(b#0 as int)) AS substring(g, a, b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_column.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_column.explain new file mode 100644 index 000000000000..fe07244fc9ce --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_column.explain @@ -0,0 +1,2 @@ +Project [substring(g#0, 4, 5) AS substring(g, 4, 5)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_with_columns.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_with_columns.explain new file mode 100644 index 000000000000..fe07244fc9ce --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_with_columns.explain @@ -0,0 +1,2 @@ +Project [substring(g#0, 4, 5) AS substring(g, 4, 5)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] From a1de1f10dd1e6cad9f297601db485a769024eae0 Mon Sep 17 00:00:00 2001 From: Ron Serruya Date: Wed, 12 Jun 2024 20:01:35 +0300 Subject: [PATCH 07/10] Remove substrings override with columns and ints --- .../org/apache/spark/sql/functions.scala | 26 -------------- .../spark/sql/PlanGenerationTestSuite.scala | 10 +----- ...unction_substring_using_len_column.explain | 2 -- ...bstring_using_start_and_len_column.explain | 2 -- ...ction_substring_using_start_column.explain | 2 -- ... => function_substring_using_columns.json} | 0 ...unction_substring_using_columns.proto.bin} | Bin .../function_substring_using_len_column.json | 33 ------------------ ...ction_substring_using_len_column.proto.bin | Bin 190 -> 0 bytes ...function_substring_using_start_column.json | 33 ------------------ ...ion_substring_using_start_column.proto.bin | Bin 190 -> 0 bytes .../function_substring_with_columns.json | 33 ------------------ .../function_substring_with_columns.proto.bin | Bin 190 -> 0 bytes 13 files changed, 1 insertion(+), 140 deletions(-) delete mode 100644 connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_len_column.explain delete mode 100644 connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_and_len_column.explain delete mode 100644 connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_column.explain rename connector/connect/common/src/test/resources/query-tests/queries/{function_substring_using_start_and_len_column.json => function_substring_using_columns.json} (100%) rename connector/connect/common/src/test/resources/query-tests/queries/{function_substring_using_start_and_len_column.proto.bin => function_substring_using_columns.proto.bin} (100%) delete mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.json delete mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.proto.bin delete mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.json delete mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.proto.bin delete mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.json delete mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.proto.bin diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala index 8984466c2229..eae239a25589 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala @@ -4276,32 +4276,6 @@ object functions { def substring(str: Column, pos: Int, len: Int): Column = Column.fn("substring", str, lit(pos), lit(len)) - /** - * Substring starts at `pos` and is of length `len` when str is String type or returns the slice - * of byte array that starts at `pos` in byte and is of length `len` when str is Binary type - * - * @note - * The position is not zero based, but 1 based index. - * - * @group string_funcs - * @since 4.0.0 - */ - def substring(str: Column, pos: Column, len: Int): Column = - Column.fn("substring", str, pos, lit(len)) - - /** - * Substring starts at `pos` and is of length `len` when str is String type or returns the slice - * of byte array that starts at `pos` in byte and is of length `len` when str is Binary type - * - * @note - * The position is not zero based, but 1 based index. - * - * @group string_funcs - * @since 4.0.0 - */ - def substring(str: Column, pos: Int, len: Column): Column = - Column.fn("substring", str, lit(pos), len) - /** * Substring starts at `pos` and is of length `len` when str is String type or returns the slice * of byte array that starts at `pos` in byte and is of length `len` when str is Binary type diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala index bed1cdcfe976..987a50e13645 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala @@ -1780,15 +1780,7 @@ class PlanGenerationTestSuite fn.substring(fn.col("g"), 4, 5) } - functionTest("substring using start column") { - fn.substring(fn.col("g"), lit(4), 5) - } - - functionTest("substring using len column") { - fn.substring(fn.col("g"), 4, lit(5)) - } - - functionTest("substring using start and len column") { + functionTest("substring using columns") { fn.substring(fn.col("g"), fn.col("a"), fn.col("b")) } diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_len_column.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_len_column.explain deleted file mode 100644 index fe07244fc9ce..000000000000 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_len_column.explain +++ /dev/null @@ -1,2 +0,0 @@ -Project [substring(g#0, 4, 5) AS substring(g, 4, 5)#0] -+- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_and_len_column.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_and_len_column.explain deleted file mode 100644 index 3050d15d9754..000000000000 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_and_len_column.explain +++ /dev/null @@ -1,2 +0,0 @@ -Project [substring(g#0, a#0, cast(b#0 as int)) AS substring(g, a, b)#0] -+- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_column.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_column.explain deleted file mode 100644 index fe07244fc9ce..000000000000 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_column.explain +++ /dev/null @@ -1,2 +0,0 @@ -Project [substring(g#0, 4, 5) AS substring(g, 4, 5)#0] -+- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_and_len_column.json b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_columns.json similarity index 100% rename from connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_and_len_column.json rename to connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_columns.json diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_and_len_column.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_columns.proto.bin similarity index 100% rename from connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_and_len_column.proto.bin rename to connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_columns.proto.bin diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.json b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.json deleted file mode 100644 index 84a70cf1c023..000000000000 --- a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "common": { - "planId": "1" - }, - "project": { - "input": { - "common": { - "planId": "0" - }, - "localRelation": { - "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" - } - }, - "expressions": [{ - "unresolvedFunction": { - "functionName": "substring", - "arguments": [{ - "unresolvedAttribute": { - "unparsedIdentifier": "g" - } - }, { - "literal": { - "integer": 4 - } - }, { - "literal": { - "integer": 5 - } - }] - } - }] - } -} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.proto.bin deleted file mode 100644 index d302cd95c7434095bd8ebcf2511786058de52274..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 190 zcmd;L5@3|t&d4>1k&8)yA*!2EsDrV%q^LBx#3nPvDk(EPGp|G^(F#N+S*7HcCgr5+ zq*xJ9VW*R7l~`1iSZM>)XQz{9m77>#1Jsk5m##xdtDR0d$atVqJ1GSzIWErP(j*99 Uh*gN0i!oh@g^S661x&I60M)`e=Kufz diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.json b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.json deleted file mode 100644 index 84a70cf1c023..000000000000 --- a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "common": { - "planId": "1" - }, - "project": { - "input": { - "common": { - "planId": "0" - }, - "localRelation": { - "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" - } - }, - "expressions": [{ - "unresolvedFunction": { - "functionName": "substring", - "arguments": [{ - "unresolvedAttribute": { - "unparsedIdentifier": "g" - } - }, { - "literal": { - "integer": 4 - } - }, { - "literal": { - "integer": 5 - } - }] - } - }] - } -} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.proto.bin deleted file mode 100644 index d302cd95c7434095bd8ebcf2511786058de52274..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 190 zcmd;L5@3|t&d4>1k&8)yA*!2EsDrV%q^LBx#3nPvDk(EPGp|G^(F#N+S*7HcCgr5+ zq*xJ9VW*R7l~`1iSZM>)XQz{9m77>#1Jsk5m##xdtDR0d$atVqJ1GSzIWErP(j*99 Uh*gN0i!oh@g^S661x&I60M)`e=Kufz diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.json b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.json deleted file mode 100644 index 84a70cf1c023..000000000000 --- a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "common": { - "planId": "1" - }, - "project": { - "input": { - "common": { - "planId": "0" - }, - "localRelation": { - "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" - } - }, - "expressions": [{ - "unresolvedFunction": { - "functionName": "substring", - "arguments": [{ - "unresolvedAttribute": { - "unparsedIdentifier": "g" - } - }, { - "literal": { - "integer": 4 - } - }, { - "literal": { - "integer": 5 - } - }] - } - }] - } -} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.proto.bin deleted file mode 100644 index d302cd95c7434095bd8ebcf2511786058de52274..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 190 zcmd;L5@3|t&d4>1k&8)yA*!2EsDrV%q^LBx#3nPvDk(EPGp|G^(F#N+S*7HcCgr5+ zq*xJ9VW*R7l~`1iSZM>)XQz{9m77>#1Jsk5m##xdtDR0d$atVqJ1GSzIWErP(j*99 Uh*gN0i!oh@g^S661x&I60M)`e=Kufz From f46983c8e4632a26580105fc58bc7fcc37de01a7 Mon Sep 17 00:00:00 2001 From: Ron Serruya Date: Wed, 12 Jun 2024 20:01:51 +0300 Subject: [PATCH 08/10] Remove substring overrides with columns and ints --- python/pyspark/sql/functions/builtin.py | 12 ++++----- .../org/apache/spark/sql/functions.scala | 26 ------------------- .../spark/sql/StringFunctionsSuite.scala | 2 -- 3 files changed, 5 insertions(+), 35 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index cea88282fc85..8796000d7df2 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -10962,12 +10962,10 @@ def substring( >>> df.select(substring(df.s, df.p, df.l).alias('s')).collect() [Row(s='par')] """ - # the native str type is shadowed by the function's `str` param - from builtins import str as StrType from pyspark.sql.classic.column import _to_java_column - pos = _to_java_column(pos) if isinstance(pos, (StrType, Column)) else pos - len = _to_java_column(len) if isinstance(len, (StrType, Column)) else len + pos = _to_java_column(lit(pos) if isinstance(pos, int) else pos) + len = _to_java_column(lit(len) if isinstance(len, int) else len) return _invoke_function("substring", _to_java_column(str), pos, len) @@ -14068,7 +14066,7 @@ def array_position(col: "ColumnOrName", value: Any) -> Column: """ from pyspark.sql.classic.column import _to_java_column - value = value._jc if isinstance(value, Column) else value + value = _to_java_column(value) if isinstance(value, Column) else value return _invoke_function("array_position", _to_java_column(col), value) @@ -14519,7 +14517,7 @@ def array_remove(col: "ColumnOrName", element: Any) -> Column: """ from pyspark.sql.classic.column import _to_java_column - element = element._jc if isinstance(element, Column) else element + element = _to_java_column(element) if isinstance(element, Column) else element return _invoke_function("array_remove", _to_java_column(col), element) @@ -17331,7 +17329,7 @@ def map_contains_key(col: "ColumnOrName", value: Any) -> Column: """ from pyspark.sql.classic.column import _to_java_column - value = value._jc if isinstance(value, Column) else value + value = _to_java_column(value) if isinstance(value, Column) else value return _invoke_function("map_contains_key", _to_java_column(col), value) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 11b5d2561583..882918eb78c7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -4234,32 +4234,6 @@ object functions { def substring(str: Column, pos: Int, len: Int): Column = Column.fn("substring", str, lit(pos), lit(len)) - /** - * Substring starts at `pos` and is of length `len` when str is String type or - * returns the slice of byte array that starts at `pos` in byte and is of length `len` - * when str is Binary type - * - * @note The position is not zero based, but 1 based index. - * - * @group string_funcs - * @since 4.0.0 - */ - def substring(str: Column, pos: Column, len: Int): Column = - Column.fn("substring", str, pos, lit(len)) - - /** - * Substring starts at `pos` and is of length `len` when str is String type or - * returns the slice of byte array that starts at `pos` in byte and is of length `len` - * when str is Binary type - * - * @note The position is not zero based, but 1 based index. - * - * @group string_funcs - * @since 4.0.0 - */ - def substring(str: Column, pos: Int, len: Column): Column = - Column.fn("substring", str, lit(pos), len) - /** * Substring starts at `pos` and is of length `len` when str is String type or * returns the slice of byte array that starts at `pos` in byte and is of length `len` diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index dca8d904fe50..31c1cac9fb71 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -334,8 +334,6 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { test("string substring function using columns") { val df = Seq(("Spark", 2, 3)).toDF("a", "b", "c") - checkAnswer(df.select(substring($"a", $"b", 3)), Row("par")) - checkAnswer(df.select(substring($"a", 2, $"c")), Row("par")) checkAnswer(df.select(substring($"a", $"b", $"c")), Row("par")) } From f33ea8803bb9d9a3db2506f6a4014349a6cbc81d Mon Sep 17 00:00:00 2001 From: Ron Serruya Date: Wed, 12 Jun 2024 22:43:17 +0300 Subject: [PATCH 09/10] add missing golden file --- .../explain-results/function_substring_using_columns.explain | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_columns.explain diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_columns.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_columns.explain new file mode 100644 index 000000000000..3050d15d9754 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_columns.explain @@ -0,0 +1,2 @@ +Project [substring(g#0, a#0, cast(b#0 as int)) AS substring(g, a, b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] From 6d08684cfe9b262d2239e9324c0606b1434bb6b7 Mon Sep 17 00:00:00 2001 From: Ron Serruya Date: Sat, 15 Jun 2024 01:19:49 +0300 Subject: [PATCH 10/10] Document parameter changes under parameters header in docstring --- python/pyspark/sql/functions/builtin.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 8796000d7df2..2edbc9f5abe1 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -10928,9 +10928,6 @@ def substring( .. versionchanged:: 3.4.0 Supports Spark Connect. - .. versionchanged:: 4.0.0 - `pos` and `len` now also accept Columns or names of Columns. - Notes ----- The position is not zero based, but 1 based index. @@ -10944,6 +10941,9 @@ def substring( len : :class:`~pyspark.sql.Column` or str or int length of chars. + .. versionchanged:: 4.0.0 + `pos` and `len` now also accept Columns or names of Columns. + Returns ------- :class:`~pyspark.sql.Column` @@ -13973,9 +13973,6 @@ def array_position(col: "ColumnOrName", value: Any) -> Column: .. versionchanged:: 3.4.0 Supports Spark Connect. - .. versionchanged:: 4.0.0 - `value` now also accepts a Column type. - Notes ----- The position is not zero based, but 1 based index. Returns 0 if the given @@ -13988,6 +13985,9 @@ def array_position(col: "ColumnOrName", value: Any) -> Column: value : Any value or a :class:`~pyspark.sql.Column` expression to look for. + .. versionchanged:: 4.0.0 + `value` now also accepts a Column type. + Returns ------- :class:`~pyspark.sql.Column` @@ -14427,9 +14427,6 @@ def array_remove(col: "ColumnOrName", element: Any) -> Column: .. versionchanged:: 3.4.0 Supports Spark Connect. - .. versionchanged:: 4.0.0 - `element` now also accepts a Column type. - Parameters ---------- col : :class:`~pyspark.sql.Column` or str @@ -14437,6 +14434,9 @@ def array_remove(col: "ColumnOrName", element: Any) -> Column: element : element or a :class:`~pyspark.sql.Column` expression to be removed from the array + .. versionchanged:: 4.0.0 + `element` now also accepts a Column type. + Returns ------- :class:`~pyspark.sql.Column` @@ -17277,9 +17277,6 @@ def map_contains_key(col: "ColumnOrName", value: Any) -> Column: .. versionchanged:: 3.4.0 Supports Spark Connect. - .. versionchanged:: 4.0.0 - `value` now also accepts a Column type. - Parameters ---------- col : :class:`~pyspark.sql.Column` or str @@ -17287,6 +17284,9 @@ def map_contains_key(col: "ColumnOrName", value: Any) -> Column: value : A literal value, or a :class:`~pyspark.sql.Column` expression. + .. versionchanged:: 4.0.0 + `value` now also accepts a Column type. + Returns ------- :class:`~pyspark.sql.Column`