diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 0d40368c9cd6..39815497f395 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -2185,6 +2185,13 @@ def udf(f=None, returnType=StringType()):
         duplicate invocations may be eliminated or the function may even be invoked
         more times than it is present in the query.
 
+    .. note:: The user-defined functions do not support conditional execution when used with SQL
+        conditional expressions such as `when` or `if`; the functions are still applied to all
+        rows, whether or not the condition is met. The output is therefore correct only if the
+        functions can run on every row without failure. If a function can fail at runtime on rows
+        that do not satisfy the condition, the suggested workaround is to incorporate the
+        condition logic into the function itself.
+
     :param f: python function if used as a standalone function
     :param returnType: a :class:`pyspark.sql.types.DataType` object
 
@@ -2278,6 +2285,13 @@ def pandas_udf(f=None, returnType=StringType()):
     .. seealso:: :meth:`pyspark.sql.GroupedData.apply`
 
     .. note:: The user-defined function must be deterministic.
+
+    .. note:: The user-defined functions do not support conditional execution when used with SQL
+        conditional expressions such as `when` or `if`; the functions are still applied to all
+        rows, whether or not the condition is met. The output is therefore correct only if the
+        functions can run on every row without failure. If a function can fail at runtime on rows
+        that do not satisfy the condition, the suggested workaround is to incorporate the
+        condition logic into the function itself.
     """
     return _create_udf(f, returnType=returnType, pythonUdfType=PythonUdfType.PANDAS_UDF)
 
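
For reference, here is a minimal sketch of the behaviour the new note documents. The
DataFrame, the column name `x`, and the `inverse`/`safe_inverse` UDF names are illustrative
assumptions, not part of the patch. Guarding a UDF with `when` does not stop Spark from
evaluating it on the filtered-out rows, so the zero check has to live inside the function:

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import udf, when, col
    from pyspark.sql.types import DoubleType

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(0.0,), (4.0,)], ["x"])

    # Risky: `when` does not short-circuit the UDF, so `inverse` may still be
    # evaluated on rows where x == 0.0 and raise ZeroDivisionError.
    inverse = udf(lambda x: 1.0 / x, DoubleType())
    risky = df.select(when(col("x") == 0.0, 0.0).otherwise(inverse(col("x"))))

    # Workaround from the note: incorporate the condition into the function itself.
    safe_inverse = udf(lambda x: 0.0 if x == 0.0 else 1.0 / x, DoubleType())
    df.select(safe_inverse(col("x")).alias("inv_x")).show()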