Support schema_of_csv in PySpark

MaxGekk · MaxGekk · Sep 16, 2018 · Sep 21, 2018 · Sep 21, 2018 · Sep 21, 2018
commit a2322d1ae21f9f89a9ac09dd418bc4f72b50f711
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
@@ -2348,6 +2348,27 @@ def schema_of_json(col):
     return Column(jc)
 
 
+@ignore_unicode_prefix
+@since(2.5)
+def schema_of_csv(col, options=None):
+    """
+    Parses a column containing a CSV string and infers its schema in DDL format.
+
+    :param col: string column in CSV format
+    :param options: dict of parsing options (same as the CSV datasource); None means no options
+
+    >>> from pyspark.sql.types import *
+    >>> data = [(1, '1|a')]
+    >>> df = spark.createDataFrame(data, ("key", "value"))
+    >>> df.select(schema_of_csv(df.value, {'sep':'|'}).alias("csv")).collect()
+    [Row(csv=u'struct<_c0:int,_c1:string>')]
+    """
+    options = {} if options is None else options  # never share a mutable default dict
+    sc = SparkContext._active_spark_context
+    jc = sc._jvm.functions.schema_of_csv(_to_java_column(col), options)
+    return Column(jc)
+
+
 @since(1.5)
 def size(col):
     """