From 7d85f1ccd89d20fb0327847e8d8713a8f8907b14 Mon Sep 17 00:00:00 2001
From: Ron Serruya <ron@lusha.com>
Date: Wed, 5 Jun 2024 16:22:29 +0300
Subject: [PATCH 01/10] Support usage of columns as parameters to more pyspark
 functions

---
 python/pyspark/sql/functions/builtin.py | 51 ++++++++++++++++++++-----
 1 file changed, 42 insertions(+), 9 deletions(-)

diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index 07dfcaf2e2b7..b81501a4e7cd 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -10915,7 +10915,7 @@ def sentences(
 
 
 @_try_remote_functions
-def substring(str: "ColumnOrName", pos: int, len: int) -> Column:
+def substring(str: "ColumnOrName", pos: Union["ColumnOrName", int], len: Union["ColumnOrName", int]) -> Column:
     """
     Substring starts at `pos` and is of length `len` when str is String type or
     returns the slice of byte array that starts at `pos` in byte and is of length `len`
@@ -10934,9 +10934,9 @@ def substring(str: "ColumnOrName", pos: int, len: int) -> Column:
     ----------
     str : :class:`~pyspark.sql.Column` or str
         target column to work on.
-    pos : int
+    pos : :class:`~pyspark.sql.Column` or str or int
         starting position in str.
-    len : int
+    len : :class:`~pyspark.sql.Column` or str or int
         length of chars.
 
     Returns
@@ -10949,14 +10949,20 @@ def substring(str: "ColumnOrName", pos: int, len: int) -> Column:
     >>> df = spark.createDataFrame([('abcd',)], ['s',])
     >>> df.select(substring(df.s, 1, 2).alias('s')).collect()
     [Row(s='ab')]
+
+    >>> df = spark.createDataFrame([('abcd', 2, 3)], ['s', 'start', 'len'])
+    >>> df.select(substring(df.s, df.start, df.len).alias('s')).collect()
+    [Row(s='bcd')]
     """
     from pyspark.sql.classic.column import _to_java_column
 
+    pos = _to_java_column(pos) if isinstance(pos, (str, Column)) else pos
+    len = _to_java_column(len) if isinstance(pos, (str, Column)) else len
     return _invoke_function("substring", _to_java_column(str), pos, len)
 
 
 @_try_remote_functions
-def substring_index(str: "ColumnOrName", delim: str, count: int) -> Column:
+def substring_index(str: "ColumnOrName", delim: Union[Column, str], count: Union["ColumnOrName", int]) -> Column:
     """
     Returns the substring from string str before count occurrences of the delimiter delim.
     If count is positive, everything the left of the final delimiter (counting from left) is
@@ -10972,9 +10978,9 @@ def substring_index(str: "ColumnOrName", delim: str, count: int) -> Column:
     ----------
     str : :class:`~pyspark.sql.Column` or str
         target column to work on.
-    delim : str
+    delim : :class:`~pyspark.sql.Column` or str
         delimiter of values.
-    count : int
+    count : :class:`~pyspark.sql.Column` or str or int
         number of occurrences.
 
     Returns
@@ -10992,6 +10998,8 @@ def substring_index(str: "ColumnOrName", delim: str, count: int) -> Column:
     """
     from pyspark.sql.classic.column import _to_java_column
 
+    delim = delim._jc if isinstance(delim, Column) else delim
+    count = _to_java_column(count) if isinstance(count, (str, Column)) else count
     return _invoke_function("substring_index", _to_java_column(str), delim, count)
 
 
@@ -13969,7 +13977,7 @@ def array_position(col: "ColumnOrName", value: Any) -> Column:
     col : :class:`~pyspark.sql.Column` or str
         target column to work on.
     value : Any
-        value to look for.
+        value or a :class:`~pyspark.sql.Column` expression to look for.
 
     Returns
     -------
@@ -14034,9 +14042,21 @@ def array_position(col: "ColumnOrName", value: Any) -> Column:
     +-----------------------+
     |                      3|
     +-----------------------+
+
+    Example 6: Finding the position of a column's value in an array of integers
+
+    >>> from pyspark.sql import functions as sf
+    >>> df = spark.createDataFrame([([10, 20, 30], 20)], ['data', 'col'])
+    >>> df.select(sf.array_position(df.data, df.col)).show()
+    +-------------------------+
+    |array_position(data, col)|
+    +-------------------------+
+    |                      2  |
+    +-------------------------+
     """
     from pyspark.sql.classic.column import _to_java_column
 
+    value = value._jc if isinstance(value, Column) else value
     return _invoke_function("array_position", _to_java_column(col), value)
 
 
@@ -14402,7 +14422,7 @@ def array_remove(col: "ColumnOrName", element: Any) -> Column:
     col : :class:`~pyspark.sql.Column` or str
         name of column containing array
     element :
-        element to be removed from the array
+        element or a :class:`~pyspark.sql.Column` expression to be removed from the array
 
     Returns
     -------
@@ -14470,9 +14490,21 @@ def array_remove(col: "ColumnOrName", element: Any) -> Column:
     +---------------------+
     |                   []|
     +---------------------+
+
+    Example 6: Removing a column's value from a simple array
+
+    >>> from pyspark.sql import functions as sf
+    >>> df = spark.createDataFrame([([1, 2, 3, 1, 1], 1)], ['data', 'col'])
+    >>> df.select(sf.array_remove(df.data, df.col)).show()
+    +-----------------------+
+    |array_remove(data, col)|
+    +-----------------------+
+    |                 [2, 3]|
+    +-----------------------+
     """
     from pyspark.sql.classic.column import _to_java_column
 
+    element = element._jc if isinstance(element, Column) else element
     return _invoke_function("array_remove", _to_java_column(col), element)
 
 
@@ -17237,7 +17269,7 @@ def map_contains_key(col: "ColumnOrName", value: Any) -> Column:
     col : :class:`~pyspark.sql.Column` or str
         The name of the column or an expression that represents the map.
     value :
-        A literal value.
+        A literal value, or a :class:`~pyspark.sql.Column` expression.
 
     Returns
     -------
@@ -17270,6 +17302,7 @@ def map_contains_key(col: "ColumnOrName", value: Any) -> Column:
     """
     from pyspark.sql.classic.column import _to_java_column
 
+    value = value._jc if isinstance(value, Column) else value
     return _invoke_function("map_contains_key", _to_java_column(col), value)
 
 
From d690698656456383c35aeb9fb1914309d5a25b96 Mon Sep 17 00:00:00 2001
From: Ron Serruya <ron@lusha.com>
Date: Wed, 5 Jun 2024 22:40:04 +0300
Subject: [PATCH 02/10] allow usage of column params in substring functions

---
 python/pyspark/sql/functions/builtin.py       | 25 ++++++++++++-------
 .../expressions/stringExpressions.scala       |  4 +++
 .../org/apache/spark/sql/functions.scala      |  4 +--
 3 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index b81501a4e7cd..928902cf5269 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -10949,15 +10949,16 @@ def substring(str: "ColumnOrName", pos: Union["ColumnOrName", int], len: Union["
     >>> df = spark.createDataFrame([('abcd',)], ['s',])
     >>> df.select(substring(df.s, 1, 2).alias('s')).collect()
     [Row(s='ab')]
-
-    >>> df = spark.createDataFrame([('abcd', 2, 3)], ['s', 'start', 'len'])
-    >>> df.select(substring(df.s, df.start, df.len).alias('s')).collect()
-    [Row(s='bcd')]
+    >>> df = spark.createDataFrame([('abcd', 1, 2)], ['s','p', 'l'])
+    >>> df.select(substring(df.s, df.p, df.l).alias('s')).collect()
+    [Row(s='ab')]
     """
+    # `str` is shadowed by the function's param
+    from builtins import str as StrType
     from pyspark.sql.classic.column import _to_java_column
 
-    pos = _to_java_column(pos) if isinstance(pos, (str, Column)) else pos
-    len = _to_java_column(len) if isinstance(pos, (str, Column)) else len
+    pos = _to_java_column(pos) if isinstance(pos, (StrType, Column)) else pos
+    len = _to_java_column(len) if isinstance(pos, (StrType, Column)) else len
     return _invoke_function("substring", _to_java_column(str), pos, len)
 
 
@@ -10995,11 +10996,16 @@ def substring_index(str: "ColumnOrName", delim: Union[Column, str], count: Union
     [Row(s='a.b')]
     >>> df.select(substring_index(df.s, '.', -3).alias('s')).collect()
     [Row(s='b.c.d')]
+    >>> df = spark.createDataFrame([('a.b.c.d', '.', -3)], ['s', 'd', 'c'])
+    >>> df.select(substring_index(df.s, df.d, df.c).alias('s')).collect()
+    [Row(s='b.c.d')]
     """
+    # `str` is shadowed by the function's param
+    from builtins import str as StrType
     from pyspark.sql.classic.column import _to_java_column
-
+    
     delim = delim._jc if isinstance(delim, Column) else delim
-    count = _to_java_column(count) if isinstance(count, (str, Column)) else count
+    count = _to_java_column(count) if isinstance(count, (StrType, Column)) else count
     return _invoke_function("substring_index", _to_java_column(str), delim, count)
 
 
@@ -14051,8 +14057,9 @@ def array_position(col: "ColumnOrName", value: Any) -> Column:
     +-------------------------+
     |array_position(data, col)|
     +-------------------------+
-    |                      2  |
+    |                        2|
     +-------------------------+
+
     """
     from pyspark.sql.classic.column import _to_java_column
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 09ec501311ad..35cfb92b7672 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -1422,6 +1422,8 @@ case class StringInstr(str: Expression, substr: Expression)
     Examples:
       > SELECT _FUNC_('www.apache.org', '.', 2);
        www.apache
+      > SELECT _FUNC_(col1, col2, col3) from values('www.apache.org', '.', 2);
+       www.apache
   """,
   since = "1.5.0",
   group = "string_funcs")
@@ -1967,6 +1969,8 @@ case class StringSpace(child: Expression)
        k
       > SELECT _FUNC_(encode('Spark SQL', 'utf-8'), 5);
        k SQL
+      > SELECT _FUNC_(col1, col2) from values('Spark SQL', 5);
+       k SQL
   """,
   since = "1.5.0",
   group = "string_funcs")
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 52733611e42a..7f12e15a2f9c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -4231,7 +4231,7 @@ object functions {
    * @group string_funcs
    * @since 1.5.0
    */
-  def substring(str: Column, pos: Int, len: Int): Column =
+  def substring(str: Column, pos: Any, len: Any): Column =
     Column.fn("substring", str, lit(pos), lit(len))
 
   /**
@@ -4242,7 +4242,7 @@ object functions {
    *
    * @group string_funcs
    */
-  def substring_index(str: Column, delim: String, count: Int): Column =
+  def substring_index(str: Column, delim: Any, count: Any): Column =
     Column.fn("substring_index", str, lit(delim), lit(count))
 
   /**

From ce954456e7a900716b33ae90f2ba5b525cbba955 Mon Sep 17 00:00:00 2001
From: Ron Serruya <ron@lusha.com>
Date: Thu, 6 Jun 2024 03:42:24 +0300
Subject: [PATCH 03/10] revert support of column params in substring functions

---
 python/pyspark/sql/functions/builtin.py       | 28 +++++--------------
 .../expressions/stringExpressions.scala       |  4 ---
 .../org/apache/spark/sql/functions.scala      |  4 +--
 3 files changed, 9 insertions(+), 27 deletions(-)

diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index 928902cf5269..7b99a24a96eb 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -10915,7 +10915,7 @@ def sentences(
 
 
 @_try_remote_functions
-def substring(str: "ColumnOrName", pos: Union["ColumnOrName", int], len: Union["ColumnOrName", int]) -> Column:
+def substring(str: "ColumnOrName", pos: int, len: int) -> Column:
     """
     Substring starts at `pos` and is of length `len` when str is String type or
     returns the slice of byte array that starts at `pos` in byte and is of length `len`
@@ -10934,9 +10934,9 @@ def substring(str: "ColumnOrName", pos: Union["ColumnOrName", int], len: Union["
     ----------
     str : :class:`~pyspark.sql.Column` or str
         target column to work on.
-    pos : :class:`~pyspark.sql.Column` or str or int
+    pos : int
         starting position in str.
-    len : :class:`~pyspark.sql.Column` or str or int
+    len : int
         length of chars.
 
     Returns
@@ -10949,21 +10949,14 @@ def substring(str: "ColumnOrName", pos: Union["ColumnOrName", int], len: Union["
     >>> df = spark.createDataFrame([('abcd',)], ['s',])
     >>> df.select(substring(df.s, 1, 2).alias('s')).collect()
     [Row(s='ab')]
-    >>> df = spark.createDataFrame([('abcd', 1, 2)], ['s','p', 'l'])
-    >>> df.select(substring(df.s, df.p, df.l).alias('s')).collect()
-    [Row(s='ab')]
     """
-    # `str` is shadowed by the function's param
-    from builtins import str as StrType
     from pyspark.sql.classic.column import _to_java_column
 
-    pos = _to_java_column(pos) if isinstance(pos, (StrType, Column)) else pos
-    len = _to_java_column(len) if isinstance(pos, (StrType, Column)) else len
     return _invoke_function("substring", _to_java_column(str), pos, len)
 
 
 @_try_remote_functions
-def substring_index(str: "ColumnOrName", delim: Union[Column, str], count: Union["ColumnOrName", int]) -> Column:
+def substring_index(str: "ColumnOrName", delim: str, count: int) -> Column:
     """
     Returns the substring from string str before count occurrences of the delimiter delim.
     If count is positive, everything the left of the final delimiter (counting from left) is
@@ -10979,9 +10972,9 @@ def substring_index(str: "ColumnOrName", delim: Union[Column, str], count: Union
     ----------
     str : :class:`~pyspark.sql.Column` or str
         target column to work on.
-    delim : :class:`~pyspark.sql.Column` or str
+    delim : str
         delimiter of values.
-    count : :class:`~pyspark.sql.Column` or str or int
+    count : int
         number of occurrences.
 
     Returns
@@ -10996,16 +10989,9 @@ def substring_index(str: "ColumnOrName", delim: Union[Column, str], count: Union
     [Row(s='a.b')]
     >>> df.select(substring_index(df.s, '.', -3).alias('s')).collect()
     [Row(s='b.c.d')]
-    >>> df = spark.createDataFrame([('a.b.c.d', '.', -3)], ['s', 'd', 'c'])
-    >>> df.select(substring_index(df.s, df.d, df.c).alias('s')).collect()
-    [Row(s='b.c.d')]
     """
-    # `str` is shadowed by the function's param
-    from builtins import str as StrType
     from pyspark.sql.classic.column import _to_java_column
-    
-    delim = delim._jc if isinstance(delim, Column) else delim
-    count = _to_java_column(count) if isinstance(count, (StrType, Column)) else count
+
     return _invoke_function("substring_index", _to_java_column(str), delim, count)
 
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 35cfb92b7672..09ec501311ad 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -1422,8 +1422,6 @@ case class StringInstr(str: Expression, substr: Expression)
     Examples:
       > SELECT _FUNC_('www.apache.org', '.', 2);
        www.apache
-      > SELECT _FUNC_(col1, col2, col3) from values('www.apache.org', '.', 2);
-       www.apache
   """,
   since = "1.5.0",
   group = "string_funcs")
@@ -1969,8 +1967,6 @@ case class StringSpace(child: Expression)
        k
       > SELECT _FUNC_(encode('Spark SQL', 'utf-8'), 5);
        k SQL
-      > SELECT _FUNC_(col1, col2) from values('Spark SQL', 5);
-       k SQL
   """,
   since = "1.5.0",
   group = "string_funcs")
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 7f12e15a2f9c..52733611e42a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -4231,7 +4231,7 @@ object functions {
    * @group string_funcs
    * @since 1.5.0
    */
-  def substring(str: Column, pos: Any, len: Any): Column =
+  def substring(str: Column, pos: Int, len: Int): Column =
     Column.fn("substring", str, lit(pos), lit(len))
 
   /**
@@ -4242,7 +4242,7 @@ object functions {
    *
    * @group string_funcs
    */
-  def substring_index(str: Column, delim: Any, count: Any): Column =
+  def substring_index(str: Column, delim: String, count: Int): Column =
     Column.fn("substring_index", str, lit(delim), lit(count))
 
   /**

From 3a3c2ac1ce802a947eac3f7e5e443b397d0c598d Mon Sep 17 00:00:00 2001
From: Ron Serruya <ron@lusha.com>
Date: Thu, 6 Jun 2024 15:38:55 +0300
Subject: [PATCH 04/10] Add support for columns as params in the substring
 function

---
 .../org/apache/spark/sql/functions.scala      |  39 ++++++++++++++++
 .../spark/sql/PlanGenerationTestSuite.scala   |  12 +++++
 .../function_substring_using_len_column.json  |  33 ++++++++++++++
 ...ction_substring_using_len_column.proto.bin | Bin 0 -> 190 bytes
 ..._substring_using_start_and_len_column.json |  33 ++++++++++++++
 ...tring_using_start_and_len_column.proto.bin | Bin 0 -> 192 bytes
 ...function_substring_using_start_column.json |  33 ++++++++++++++
 ...ion_substring_using_start_column.proto.bin | Bin 0 -> 190 bytes
 .../function_substring_with_columns.json      |  33 ++++++++++++++
 .../function_substring_with_columns.proto.bin | Bin 0 -> 190 bytes
 python/pyspark/sql/functions/builtin.py       |  42 ++++++++++++++++--
 .../org/apache/spark/sql/functions.scala      |  39 ++++++++++++++++
 .../spark/sql/StringFunctionsSuite.scala      |   7 +++
 13 files changed, 267 insertions(+), 4 deletions(-)
 create mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.json
 create mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.proto.bin
 create mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_and_len_column.json
 create mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_and_len_column.proto.bin
 create mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.json
 create mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.proto.bin
 create mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.json
 create mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.proto.bin

diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
index c287e3469108..8984466c2229 100644
--- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
@@ -4276,6 +4276,45 @@ object functions {
   def substring(str: Column, pos: Int, len: Int): Column =
     Column.fn("substring", str, lit(pos), lit(len))
 
+  /**
+   * Substring starts at `pos` and is of length `len` when str is String type or returns the slice
+   * of byte array that starts at `pos` in byte and is of length `len` when str is Binary type
+   *
+   * @note
+   *   The position is not zero based, but 1 based index.
+   *
+   * @group string_funcs
+   * @since 4.0.0
+   */
+  def substring(str: Column, pos: Column, len: Int): Column =
+    Column.fn("substring", str, pos, lit(len))
+
+  /**
+   * Substring starts at `pos` and is of length `len` when str is String type or returns the slice
+   * of byte array that starts at `pos` in byte and is of length `len` when str is Binary type
+   *
+   * @note
+   *   The position is not zero based, but 1 based index.
+   *
+   * @group string_funcs
+   * @since 4.0.0
+   */
+  def substring(str: Column, pos: Int, len: Column): Column =
+    Column.fn("substring", str, lit(pos), len)
+
+  /**
+   * Substring starts at `pos` and is of length `len` when str is String type or returns the slice
+   * of byte array that starts at `pos` in byte and is of length `len` when str is Binary type
+   *
+   * @note
+   *   The position is not zero based, but 1 based index.
+   *
+   * @group string_funcs
+   * @since 4.0.0
+   */
+  def substring(str: Column, pos: Column, len: Column): Column =
+    Column.fn("substring", str, pos, len)
+
   /**
    * Returns the substring from string str before count occurrences of the delimiter delim. If
    * count is positive, everything the left of the final delimiter (counting from left) is
diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
index e0ad8f7078ca..bed1cdcfe976 100644
--- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
+++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
@@ -1780,6 +1780,18 @@ class PlanGenerationTestSuite
     fn.substring(fn.col("g"), 4, 5)
   }
 
+  functionTest("substring using start column") {
+    fn.substring(fn.col("g"), lit(4), 5)
+  }
+
+  functionTest("substring using len column") {
+    fn.substring(fn.col("g"), 4, lit(5))
+  }
+
+  functionTest("substring using start and len column") {
+    fn.substring(fn.col("g"), fn.col("a"), fn.col("b"))
+  }
+
   functionTest("substring_index") {
     fn.substring_index(fn.col("g"), ";", 5)
   }
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.json b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.json
new file mode 100644
index 000000000000..84a70cf1c023
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.json
@@ -0,0 +1,33 @@
+{
+  "common": {
+    "planId": "1"
+  },
+  "project": {
+    "input": {
+      "common": {
+        "planId": "0"
+      },
+      "localRelation": {
+        "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e"
+      }
+    },
+    "expressions": [{
+      "unresolvedFunction": {
+        "functionName": "substring",
+        "arguments": [{
+          "unresolvedAttribute": {
+            "unparsedIdentifier": "g"
+          }
+        }, {
+          "literal": {
+            "integer": 4
+          }
+        }, {
+          "literal": {
+            "integer": 5
+          }
+        }]
+      }
+    }]
+  }
+}
\ No newline at end of file
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.proto.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d302cd95c7434095bd8ebcf2511786058de52274
GIT binary patch
literal 190
zcmd;L5@3|t&d4>1k&8)yA*!2EsDrV%q^LBx#3nPvDk(EPGp|G^(F#N+S*7HcCgr5+
zq*xJ9VW*R7l~`1iSZM>)XQz{9m77>#1Jsk5m##xdtDR0d$atVqJ1GSzIWErP(j*99
Uh*gN0i!oh@g^S661x&I60M)`e=Kufz

literal 0
HcmV?d00001

diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_and_len_column.json b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_and_len_column.json
new file mode 100644
index 000000000000..ba28b1c7f570
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_and_len_column.json
@@ -0,0 +1,33 @@
+{
+  "common": {
+    "planId": "1"
+  },
+  "project": {
+    "input": {
+      "common": {
+        "planId": "0"
+      },
+      "localRelation": {
+        "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e"
+      }
+    },
+    "expressions": [{
+      "unresolvedFunction": {
+        "functionName": "substring",
+        "arguments": [{
+          "unresolvedAttribute": {
+            "unparsedIdentifier": "g"
+          }
+        }, {
+          "unresolvedAttribute": {
+            "unparsedIdentifier": "a"
+          }
+        }, {
+          "unresolvedAttribute": {
+            "unparsedIdentifier": "b"
+          }
+        }]
+      }
+    }]
+  }
+}
\ No newline at end of file
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_and_len_column.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_and_len_column.proto.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f14b44ef5a501da5c419619ada7b5bff665490a1
GIT binary patch
literal 192
zcmd;L5@3|t$;dT{k&8)yA*!2EsDrV%q^LBx#3nPvDk(EPGp|G^(F#N+S*7HcCgr5+
zq*xJ9VW*R7l~`1iSZM>)XQz{9m77>#1Jsk5m##xdtDR0d$atVqJ1He81uo9w(j*99
Qh*gN0i!mKSB|@kq08dFfaR2}S

literal 0
HcmV?d00001

diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.json b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.json
new file mode 100644
index 000000000000..84a70cf1c023
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.json
@@ -0,0 +1,33 @@
+{
+  "common": {
+    "planId": "1"
+  },
+  "project": {
+    "input": {
+      "common": {
+        "planId": "0"
+      },
+      "localRelation": {
+        "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e"
+      }
+    },
+    "expressions": [{
+      "unresolvedFunction": {
+        "functionName": "substring",
+        "arguments": [{
+          "unresolvedAttribute": {
+            "unparsedIdentifier": "g"
+          }
+        }, {
+          "literal": {
+            "integer": 4
+          }
+        }, {
+          "literal": {
+            "integer": 5
+          }
+        }]
+      }
+    }]
+  }
+}
\ No newline at end of file
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.proto.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d302cd95c7434095bd8ebcf2511786058de52274
GIT binary patch
literal 190
zcmd;L5@3|t&d4>1k&8)yA*!2EsDrV%q^LBx#3nPvDk(EPGp|G^(F#N+S*7HcCgr5+
zq*xJ9VW*R7l~`1iSZM>)XQz{9m77>#1Jsk5m##xdtDR0d$atVqJ1GSzIWErP(j*99
Uh*gN0i!oh@g^S661x&I60M)`e=Kufz

literal 0
HcmV?d00001

diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.json b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.json
new file mode 100644
index 000000000000..84a70cf1c023
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.json
@@ -0,0 +1,33 @@
+{
+  "common": {
+    "planId": "1"
+  },
+  "project": {
+    "input": {
+      "common": {
+        "planId": "0"
+      },
+      "localRelation": {
+        "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e"
+      }
+    },
+    "expressions": [{
+      "unresolvedFunction": {
+        "functionName": "substring",
+        "arguments": [{
+          "unresolvedAttribute": {
+            "unparsedIdentifier": "g"
+          }
+        }, {
+          "literal": {
+            "integer": 4
+          }
+        }, {
+          "literal": {
+            "integer": 5
+          }
+        }]
+      }
+    }]
+  }
+}
\ No newline at end of file
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.proto.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d302cd95c7434095bd8ebcf2511786058de52274
GIT binary patch
literal 190
zcmd;L5@3|t&d4>1k&8)yA*!2EsDrV%q^LBx#3nPvDk(EPGp|G^(F#N+S*7HcCgr5+
zq*xJ9VW*R7l~`1iSZM>)XQz{9m77>#1Jsk5m##xdtDR0d$atVqJ1GSzIWErP(j*99
Uh*gN0i!oh@g^S661x&I60M)`e=Kufz

literal 0
HcmV?d00001

diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index 7b99a24a96eb..aa158da2da00 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -10915,7 +10915,7 @@ def sentences(
 
 
 @_try_remote_functions
-def substring(str: "ColumnOrName", pos: int, len: int) -> Column:
+def substring(str: "ColumnOrName", pos: Union["ColumnOrName", int], len: Union["ColumnOrName", int]) -> Column:
     """
     Substring starts at `pos` and is of length `len` when str is String type or
     returns the slice of byte array that starts at `pos` in byte and is of length `len`
@@ -10926,6 +10926,9 @@ def substring(str: "ColumnOrName", pos: int, len: int) -> Column:
     .. versionchanged:: 3.4.0
         Supports Spark Connect.
 
+    .. versionchanged:: 4.0.0
+        `pos` and `len` now also accept Columns or names of Columns.
+
     Notes
     -----
     The position is not zero based, but 1 based index.
@@ -10934,9 +10937,9 @@ def substring(str: "ColumnOrName", pos: int, len: int) -> Column:
     ----------
     str : :class:`~pyspark.sql.Column` or str
         target column to work on.
-    pos : int
+    pos : :class:`~pyspark.sql.Column` or str or int
         starting position in str.
-    len : int
+    len : :class:`~pyspark.sql.Column` or str or int
         length of chars.
 
     Returns
@@ -10949,9 +10952,20 @@ def substring(str: "ColumnOrName", pos: int, len: int) -> Column:
     >>> df = spark.createDataFrame([('abcd',)], ['s',])
     >>> df.select(substring(df.s, 1, 2).alias('s')).collect()
     [Row(s='ab')]
-    """
+    >>> df = spark.createDataFrame([('Spark', 2, 3)], ['s', 'p', 'l'])
+    >>> df.select(substring(df.s, 2, df.l).alias('s')).collect()
+    [Row(s='par')]
+    >>> df.select(substring(df.s, df.p, 3).alias('s')).collect()
+    [Row(s='par')]
+    >>> df.select(substring(df.s, df.p, df.l).alias('s')).collect()
+    [Row(s='par')]
+    """
+    # the native str type is shadowed by the function's `str` param
+    from builtins import str as StrType
     from pyspark.sql.classic.column import _to_java_column
 
+    pos = _to_java_column(pos) if isinstance(pos, (StrType, Column)) else pos
+    len = _to_java_column(len) if isinstance(len, (StrType, Column)) else len
     return _invoke_function("substring", _to_java_column(str), pos, len)
 
 
@@ -13959,6 +13973,9 @@ def array_position(col: "ColumnOrName", value: Any) -> Column:
     .. versionchanged:: 3.4.0
         Supports Spark Connect.
 
+    .. versionchanged:: 4.0.0
+        `value` now also accepts a Column type.
+
     Notes
     -----
     The position is not zero based, but 1 based index. Returns 0 if the given
@@ -14410,6 +14427,9 @@ def array_remove(col: "ColumnOrName", element: Any) -> Column:
     .. versionchanged:: 3.4.0
         Supports Spark Connect.
 
+    .. versionchanged:: 4.0.0
+        `element` now also accepts a Column type.
+
     Parameters
     ----------
     col : :class:`~pyspark.sql.Column` or str
@@ -17257,6 +17277,9 @@ def map_contains_key(col: "ColumnOrName", value: Any) -> Column:
     .. versionchanged:: 3.4.0
         Supports Spark Connect.
 
+    .. versionchanged:: 4.0.0
+        `value` now also accepts a Column type.
+
     Parameters
     ----------
     col : :class:`~pyspark.sql.Column` or str
@@ -17292,6 +17315,17 @@ def map_contains_key(col: "ColumnOrName", value: Any) -> Column:
     +--------------------------+
     |                     false|
     +--------------------------+
+
+    Example 3: Check for key using a column
+
+    >>> from pyspark.sql import functions as sf
+    >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data, 1 as key")
+    >>> df.select(sf.map_contains_key("data", sf.col("key"))).show()
+    +---------------------------+
+    |map_contains_key(data, key)|
+    +---------------------------+
+    |                       true|
+    +---------------------------+
     """
     from pyspark.sql.classic.column import _to_java_column
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 52733611e42a..11b5d2561583 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -4234,6 +4234,45 @@ object functions {
   def substring(str: Column, pos: Int, len: Int): Column =
     Column.fn("substring", str, lit(pos), lit(len))
 
+  /**
+   * Substring starts at `pos` and is of length `len` when str is String type or
+   * returns the slice of byte array that starts at `pos` in byte and is of length `len`
+   * when str is Binary type
+   *
+   * @note The position is not zero based, but 1 based index.
+   *
+   * @group string_funcs
+   * @since 4.0.0
+   */
+  def substring(str: Column, pos: Column, len: Int): Column =
+    Column.fn("substring", str, pos, lit(len))
+
+  /**
+   * Substring starts at `pos` and is of length `len` when str is String type or
+   * returns the slice of byte array that starts at `pos` in byte and is of length `len`
+   * when str is Binary type
+   *
+   * @note The position is not zero based, but 1 based index.
+   *
+   * @group string_funcs
+   * @since 4.0.0
+   */
+  def substring(str: Column, pos: Int, len: Column): Column =
+    Column.fn("substring", str, lit(pos), len)
+
+  /**
+   * Substring starts at `pos` and is of length `len` when str is String type or
+   * returns the slice of byte array that starts at `pos` in byte and is of length `len`
+   * when str is Binary type
+   *
+   * @note The position is not zero based, but 1 based index.
+   *
+   * @group string_funcs
+   * @since 4.0.0
+   */
+  def substring(str: Column, pos: Column, len: Column): Column =
+    Column.fn("substring", str, pos, len)
+
   /**
    * Returns the substring from string str before count occurrences of the delimiter delim.
    * If count is positive, everything the left of the final delimiter (counting from left) is
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 3fc0b572d80b..dca8d904fe50 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -332,6 +332,13 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
     // scalastyle:on
   }
 
+  test("string substring function using columns") {
+    val df = Seq(("Spark", 2, 3)).toDF("a", "b", "c")
+    checkAnswer(df.select(substring($"a", $"b", 3)), Row("par"))
+    checkAnswer(df.select(substring($"a", 2, $"c")), Row("par"))
+    checkAnswer(df.select(substring($"a", $"b", $"c")), Row("par"))
+  }
+
   test("string encode/decode function") {
     val bytes = Array[Byte](-27, -92, -89, -27, -115, -125, -28, -72, -106, -25, -107, -116)
     // scalastyle:off

From 08e0088ee22e23b47363c9cc56cc156cc75c470f Mon Sep 17 00:00:00 2001
From: Ron Serruya <ron@lusha.com>
Date: Thu, 6 Jun 2024 16:54:12 +0300
Subject: [PATCH 05/10] reformat pyspark functions

---
 python/pyspark/sql/functions/builtin.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index aa158da2da00..cea88282fc85 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -10915,7 +10915,9 @@ def sentences(
 
 
 @_try_remote_functions
-def substring(str: "ColumnOrName", pos: Union["ColumnOrName", int], len: Union["ColumnOrName", int]) -> Column:
+def substring(
+    str: "ColumnOrName", pos: Union["ColumnOrName", int], len: Union["ColumnOrName", int]
+) -> Column:
     """
     Substring starts at `pos` and is of length `len` when str is String type or
     returns the slice of byte array that starts at `pos` in byte and is of length `len`

From 377e6ba99f7d4ab894ae5c8b42cd905255cdc743 Mon Sep 17 00:00:00 2001
From: Ron Serruya <ron@lusha.com>
Date: Thu, 6 Jun 2024 17:04:12 +0300
Subject: [PATCH 06/10] add substring test golden files

---
 .../explain-results/function_substring_using_len_column.explain | 2 ++
 .../function_substring_using_start_and_len_column.explain       | 2 ++
 .../function_substring_using_start_column.explain               | 2 ++
 .../explain-results/function_substring_with_columns.explain     | 2 ++
 4 files changed, 8 insertions(+)
 create mode 100644 connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_len_column.explain
 create mode 100644 connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_and_len_column.explain
 create mode 100644 connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_column.explain
 create mode 100644 connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_with_columns.explain

diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_len_column.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_len_column.explain
new file mode 100644
index 000000000000..fe07244fc9ce
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_len_column.explain
@@ -0,0 +1,2 @@
+Project [substring(g#0, 4, 5) AS substring(g, 4, 5)#0]
++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_and_len_column.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_and_len_column.explain
new file mode 100644
index 000000000000..3050d15d9754
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_and_len_column.explain
@@ -0,0 +1,2 @@
+Project [substring(g#0, a#0, cast(b#0 as int)) AS substring(g, a, b)#0]
++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_column.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_column.explain
new file mode 100644
index 000000000000..fe07244fc9ce
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_column.explain
@@ -0,0 +1,2 @@
+Project [substring(g#0, 4, 5) AS substring(g, 4, 5)#0]
++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_with_columns.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_with_columns.explain
new file mode 100644
index 000000000000..fe07244fc9ce
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_with_columns.explain
@@ -0,0 +1,2 @@
+Project [substring(g#0, 4, 5) AS substring(g, 4, 5)#0]
++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]

From a1de1f10dd1e6cad9f297601db485a769024eae0 Mon Sep 17 00:00:00 2001
From: Ron Serruya <ron@lusha.com>
Date: Wed, 12 Jun 2024 20:01:35 +0300
Subject: [PATCH 07/10] Remove substrings override with columns and ints

---
 .../org/apache/spark/sql/functions.scala      |  26 --------------
 .../spark/sql/PlanGenerationTestSuite.scala   |  10 +-----
 ...unction_substring_using_len_column.explain |   2 --
 ...bstring_using_start_and_len_column.explain |   2 --
 ...ction_substring_using_start_column.explain |   2 --
 ... => function_substring_using_columns.json} |   0
 ...unction_substring_using_columns.proto.bin} | Bin
 .../function_substring_using_len_column.json  |  33 ------------------
 ...ction_substring_using_len_column.proto.bin | Bin 190 -> 0 bytes
 ...function_substring_using_start_column.json |  33 ------------------
 ...ion_substring_using_start_column.proto.bin | Bin 190 -> 0 bytes
 .../function_substring_with_columns.json      |  33 ------------------
 .../function_substring_with_columns.proto.bin | Bin 190 -> 0 bytes
 13 files changed, 1 insertion(+), 140 deletions(-)
 delete mode 100644 connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_len_column.explain
 delete mode 100644 connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_and_len_column.explain
 delete mode 100644 connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_column.explain
 rename connector/connect/common/src/test/resources/query-tests/queries/{function_substring_using_start_and_len_column.json => function_substring_using_columns.json} (100%)
 rename connector/connect/common/src/test/resources/query-tests/queries/{function_substring_using_start_and_len_column.proto.bin => function_substring_using_columns.proto.bin} (100%)
 delete mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.json
 delete mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.proto.bin
 delete mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.json
 delete mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.proto.bin
 delete mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.json
 delete mode 100644 connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.proto.bin

diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
index 8984466c2229..eae239a25589 100644
--- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
@@ -4276,32 +4276,6 @@ object functions {
   def substring(str: Column, pos: Int, len: Int): Column =
     Column.fn("substring", str, lit(pos), lit(len))
 
-  /**
-   * Substring starts at `pos` and is of length `len` when str is String type or returns the slice
-   * of byte array that starts at `pos` in byte and is of length `len` when str is Binary type
-   *
-   * @note
-   *   The position is not zero based, but 1 based index.
-   *
-   * @group string_funcs
-   * @since 4.0.0
-   */
-  def substring(str: Column, pos: Column, len: Int): Column =
-    Column.fn("substring", str, pos, lit(len))
-
-  /**
-   * Substring starts at `pos` and is of length `len` when str is String type or returns the slice
-   * of byte array that starts at `pos` in byte and is of length `len` when str is Binary type
-   *
-   * @note
-   *   The position is not zero based, but 1 based index.
-   *
-   * @group string_funcs
-   * @since 4.0.0
-   */
-  def substring(str: Column, pos: Int, len: Column): Column =
-    Column.fn("substring", str, lit(pos), len)
-
   /**
    * Substring starts at `pos` and is of length `len` when str is String type or returns the slice
    * of byte array that starts at `pos` in byte and is of length `len` when str is Binary type
diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
index bed1cdcfe976..987a50e13645 100644
--- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
+++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
@@ -1780,15 +1780,7 @@ class PlanGenerationTestSuite
     fn.substring(fn.col("g"), 4, 5)
   }
 
-  functionTest("substring using start column") {
-    fn.substring(fn.col("g"), lit(4), 5)
-  }
-
-  functionTest("substring using len column") {
-    fn.substring(fn.col("g"), 4, lit(5))
-  }
-
-  functionTest("substring using start and len column") {
+  functionTest("substring using columns") {
     fn.substring(fn.col("g"), fn.col("a"), fn.col("b"))
   }
 
diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_len_column.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_len_column.explain
deleted file mode 100644
index fe07244fc9ce..000000000000
--- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_len_column.explain
+++ /dev/null
@@ -1,2 +0,0 @@
-Project [substring(g#0, 4, 5) AS substring(g, 4, 5)#0]
-+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_and_len_column.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_and_len_column.explain
deleted file mode 100644
index 3050d15d9754..000000000000
--- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_and_len_column.explain
+++ /dev/null
@@ -1,2 +0,0 @@
-Project [substring(g#0, a#0, cast(b#0 as int)) AS substring(g, a, b)#0]
-+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_column.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_column.explain
deleted file mode 100644
index fe07244fc9ce..000000000000
--- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_start_column.explain
+++ /dev/null
@@ -1,2 +0,0 @@
-Project [substring(g#0, 4, 5) AS substring(g, 4, 5)#0]
-+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_and_len_column.json b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_columns.json
similarity index 100%
rename from connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_and_len_column.json
rename to connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_columns.json
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_and_len_column.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_columns.proto.bin
similarity index 100%
rename from connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_and_len_column.proto.bin
rename to connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_columns.proto.bin
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.json b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.json
deleted file mode 100644
index 84a70cf1c023..000000000000
--- a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.json
+++ /dev/null
@@ -1,33 +0,0 @@
-{
-  "common": {
-    "planId": "1"
-  },
-  "project": {
-    "input": {
-      "common": {
-        "planId": "0"
-      },
-      "localRelation": {
-        "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e"
-      }
-    },
-    "expressions": [{
-      "unresolvedFunction": {
-        "functionName": "substring",
-        "arguments": [{
-          "unresolvedAttribute": {
-            "unparsedIdentifier": "g"
-          }
-        }, {
-          "literal": {
-            "integer": 4
-          }
-        }, {
-          "literal": {
-            "integer": 5
-          }
-        }]
-      }
-    }]
-  }
-}
\ No newline at end of file
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_len_column.proto.bin
deleted file mode 100644
index d302cd95c7434095bd8ebcf2511786058de52274..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 190
zcmd;L5@3|t&d4>1k&8)yA*!2EsDrV%q^LBx#3nPvDk(EPGp|G^(F#N+S*7HcCgr5+
zq*xJ9VW*R7l~`1iSZM>)XQz{9m77>#1Jsk5m##xdtDR0d$atVqJ1GSzIWErP(j*99
Uh*gN0i!oh@g^S661x&I60M)`e=Kufz

diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.json b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.json
deleted file mode 100644
index 84a70cf1c023..000000000000
--- a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.json
+++ /dev/null
@@ -1,33 +0,0 @@
-{
-  "common": {
-    "planId": "1"
-  },
-  "project": {
-    "input": {
-      "common": {
-        "planId": "0"
-      },
-      "localRelation": {
-        "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e"
-      }
-    },
-    "expressions": [{
-      "unresolvedFunction": {
-        "functionName": "substring",
-        "arguments": [{
-          "unresolvedAttribute": {
-            "unparsedIdentifier": "g"
-          }
-        }, {
-          "literal": {
-            "integer": 4
-          }
-        }, {
-          "literal": {
-            "integer": 5
-          }
-        }]
-      }
-    }]
-  }
-}
\ No newline at end of file
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_using_start_column.proto.bin
deleted file mode 100644
index d302cd95c7434095bd8ebcf2511786058de52274..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 190
zcmd;L5@3|t&d4>1k&8)yA*!2EsDrV%q^LBx#3nPvDk(EPGp|G^(F#N+S*7HcCgr5+
zq*xJ9VW*R7l~`1iSZM>)XQz{9m77>#1Jsk5m##xdtDR0d$atVqJ1GSzIWErP(j*99
Uh*gN0i!oh@g^S661x&I60M)`e=Kufz

diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.json b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.json
deleted file mode 100644
index 84a70cf1c023..000000000000
--- a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.json
+++ /dev/null
@@ -1,33 +0,0 @@
-{
-  "common": {
-    "planId": "1"
-  },
-  "project": {
-    "input": {
-      "common": {
-        "planId": "0"
-      },
-      "localRelation": {
-        "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e"
-      }
-    },
-    "expressions": [{
-      "unresolvedFunction": {
-        "functionName": "substring",
-        "arguments": [{
-          "unresolvedAttribute": {
-            "unparsedIdentifier": "g"
-          }
-        }, {
-          "literal": {
-            "integer": 4
-          }
-        }, {
-          "literal": {
-            "integer": 5
-          }
-        }]
-      }
-    }]
-  }
-}
\ No newline at end of file
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_with_columns.proto.bin
deleted file mode 100644
index d302cd95c7434095bd8ebcf2511786058de52274..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 190
zcmd;L5@3|t&d4>1k&8)yA*!2EsDrV%q^LBx#3nPvDk(EPGp|G^(F#N+S*7HcCgr5+
zq*xJ9VW*R7l~`1iSZM>)XQz{9m77>#1Jsk5m##xdtDR0d$atVqJ1GSzIWErP(j*99
Uh*gN0i!oh@g^S661x&I60M)`e=Kufz


From f46983c8e4632a26580105fc58bc7fcc37de01a7 Mon Sep 17 00:00:00 2001
From: Ron Serruya <ron@lusha.com>
Date: Wed, 12 Jun 2024 20:01:51 +0300
Subject: [PATCH 08/10] Remove substring overrides with columns and ints

---
 python/pyspark/sql/functions/builtin.py       | 12 ++++-----
 .../org/apache/spark/sql/functions.scala      | 26 -------------------
 .../spark/sql/StringFunctionsSuite.scala      |  2 --
 3 files changed, 5 insertions(+), 35 deletions(-)

diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index cea88282fc85..8796000d7df2 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -10962,12 +10962,10 @@ def substring(
     >>> df.select(substring(df.s, df.p, df.l).alias('s')).collect()
     [Row(s='par')]
     """
-    # the native str type is shadowed by the function's `str` param
-    from builtins import str as StrType
     from pyspark.sql.classic.column import _to_java_column
 
-    pos = _to_java_column(pos) if isinstance(pos, (StrType, Column)) else pos
-    len = _to_java_column(len) if isinstance(len, (StrType, Column)) else len
+    pos = _to_java_column(lit(pos) if isinstance(pos, int) else pos)
+    len = _to_java_column(lit(len) if isinstance(len, int) else len)
     return _invoke_function("substring", _to_java_column(str), pos, len)
 
 
@@ -14068,7 +14066,7 @@ def array_position(col: "ColumnOrName", value: Any) -> Column:
     """
     from pyspark.sql.classic.column import _to_java_column
 
-    value = value._jc if isinstance(value, Column) else value
+    value = _to_java_column(value) if isinstance(value, Column) else value
     return _invoke_function("array_position", _to_java_column(col), value)
 
 
@@ -14519,7 +14517,7 @@ def array_remove(col: "ColumnOrName", element: Any) -> Column:
     """
     from pyspark.sql.classic.column import _to_java_column
 
-    element = element._jc if isinstance(element, Column) else element
+    element = _to_java_column(element) if isinstance(element, Column) else element
     return _invoke_function("array_remove", _to_java_column(col), element)
 
 
@@ -17331,7 +17329,7 @@ def map_contains_key(col: "ColumnOrName", value: Any) -> Column:
     """
     from pyspark.sql.classic.column import _to_java_column
 
-    value = value._jc if isinstance(value, Column) else value
+    value = _to_java_column(value) if isinstance(value, Column) else value
     return _invoke_function("map_contains_key", _to_java_column(col), value)
 
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 11b5d2561583..882918eb78c7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -4234,32 +4234,6 @@ object functions {
   def substring(str: Column, pos: Int, len: Int): Column =
     Column.fn("substring", str, lit(pos), lit(len))
 
-  /**
-   * Substring starts at `pos` and is of length `len` when str is String type or
-   * returns the slice of byte array that starts at `pos` in byte and is of length `len`
-   * when str is Binary type
-   *
-   * @note The position is not zero based, but 1 based index.
-   *
-   * @group string_funcs
-   * @since 4.0.0
-   */
-  def substring(str: Column, pos: Column, len: Int): Column =
-    Column.fn("substring", str, pos, lit(len))
-
-  /**
-   * Substring starts at `pos` and is of length `len` when str is String type or
-   * returns the slice of byte array that starts at `pos` in byte and is of length `len`
-   * when str is Binary type
-   *
-   * @note The position is not zero based, but 1 based index.
-   *
-   * @group string_funcs
-   * @since 4.0.0
-   */
-  def substring(str: Column, pos: Int, len: Column): Column =
-    Column.fn("substring", str, lit(pos), len)
-
   /**
    * Substring starts at `pos` and is of length `len` when str is String type or
    * returns the slice of byte array that starts at `pos` in byte and is of length `len`
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index dca8d904fe50..31c1cac9fb71 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -334,8 +334,6 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
 
   test("string substring function using columns") {
     val df = Seq(("Spark", 2, 3)).toDF("a", "b", "c")
-    checkAnswer(df.select(substring($"a", $"b", 3)), Row("par"))
-    checkAnswer(df.select(substring($"a", 2, $"c")), Row("par"))
     checkAnswer(df.select(substring($"a", $"b", $"c")), Row("par"))
   }
 

From f33ea8803bb9d9a3db2506f6a4014349a6cbc81d Mon Sep 17 00:00:00 2001
From: Ron Serruya <ron@lusha.com>
Date: Wed, 12 Jun 2024 22:43:17 +0300
Subject: [PATCH 09/10] add missing golden file

---
 .../explain-results/function_substring_using_columns.explain    | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_columns.explain

diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_columns.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_columns.explain
new file mode 100644
index 000000000000..3050d15d9754
--- /dev/null
+++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_using_columns.explain
@@ -0,0 +1,2 @@
+Project [substring(g#0, a#0, cast(b#0 as int)) AS substring(g, a, b)#0]
++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]

From 6d08684cfe9b262d2239e9324c0606b1434bb6b7 Mon Sep 17 00:00:00 2001
From: Ron Serruya <ron@lusha.com>
Date: Sat, 15 Jun 2024 01:19:49 +0300
Subject: [PATCH 10/10] Document parameter changes under parameters header in
 docstring

---
 python/pyspark/sql/functions/builtin.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index 8796000d7df2..2edbc9f5abe1 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -10928,9 +10928,6 @@ def substring(
     .. versionchanged:: 3.4.0
         Supports Spark Connect.
 
-    .. versionchanged:: 4.0.0
-        `pos` and `len` now also accept Columns or names of Columns.
-
     Notes
     -----
     The position is not zero based, but 1 based index.
@@ -10944,6 +10941,9 @@ def substring(
     len : :class:`~pyspark.sql.Column` or str or int
         length of chars.
 
+        .. versionchanged:: 4.0.0
+            `pos` and `len` now also accept Columns or names of Columns.
+
     Returns
     -------
     :class:`~pyspark.sql.Column`
@@ -13973,9 +13973,6 @@ def array_position(col: "ColumnOrName", value: Any) -> Column:
     .. versionchanged:: 3.4.0
         Supports Spark Connect.
 
-    .. versionchanged:: 4.0.0
-        `value` now also accepts a Column type.
-
     Notes
     -----
     The position is not zero based, but 1 based index. Returns 0 if the given
@@ -13988,6 +13985,9 @@ def array_position(col: "ColumnOrName", value: Any) -> Column:
     value : Any
         value or a :class:`~pyspark.sql.Column` expression to look for.
 
+        .. versionchanged:: 4.0.0
+            `value` now also accepts a Column type.
+
     Returns
     -------
     :class:`~pyspark.sql.Column`
@@ -14427,9 +14427,6 @@ def array_remove(col: "ColumnOrName", element: Any) -> Column:
     .. versionchanged:: 3.4.0
         Supports Spark Connect.
 
-    .. versionchanged:: 4.0.0
-        `element` now also accepts a Column type.
-
     Parameters
     ----------
     col : :class:`~pyspark.sql.Column` or str
@@ -14437,6 +14434,9 @@ def array_remove(col: "ColumnOrName", element: Any) -> Column:
     element :
         element or a :class:`~pyspark.sql.Column` expression to be removed from the array
 
+        .. versionchanged:: 4.0.0
+            `element` now also accepts a Column type.
+
     Returns
     -------
     :class:`~pyspark.sql.Column`
@@ -17277,9 +17277,6 @@ def map_contains_key(col: "ColumnOrName", value: Any) -> Column:
     .. versionchanged:: 3.4.0
         Supports Spark Connect.
 
-    .. versionchanged:: 4.0.0
-        `value` now also accepts a Column type.
-
     Parameters
     ----------
     col : :class:`~pyspark.sql.Column` or str
@@ -17287,6 +17284,9 @@ def map_contains_key(col: "ColumnOrName", value: Any) -> Column:
     value :
         A literal value, or a :class:`~pyspark.sql.Column` expression.
 
+        .. versionchanged:: 4.0.0
+            `value` now also accepts a Column type.
+
     Returns
     -------
     :class:`~pyspark.sql.Column`