Skip to content
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Support schema_of_csv in PySpark
  • Loading branch information
MaxGekk committed Sep 21, 2018
commit a2322d1ae21f9f89a9ac09dd418bc4f72b50f711
21 changes: 21 additions & 0 deletions python/pyspark/sql/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2348,6 +2348,27 @@ def schema_of_json(col):
return Column(jc)


@ignore_unicode_prefix
@since(2.5)
def schema_of_csv(col, options=None):
    """
    Parses a column containing a CSV string and infers its schema in DDL format.

    :param col: string column in CSV format
    :param options: options to control parsing. accepts the same options as the CSV datasource.
                    When omitted (``None``), no options are passed.
    :return: a :class:`Column` wrapping the result of the JVM-side ``schema_of_csv`` call

    >>> from pyspark.sql.types import *
    >>> data = [(1, '1|a')]
    >>> df = spark.createDataFrame(data, ("key", "value"))
    >>> df.select(schema_of_csv(df.value, {'sep':'|'}).alias("csv")).collect()
    [Row(csv=u'struct<_c0:int,_c1:string>')]
    """
    # Use None as the default instead of a dict literal: a `{}` default is created
    # once at definition time and shared by every call, so any mutation would leak
    # between callers.
    if options is None:
        options = {}

    sc = SparkContext._active_spark_context
    # Delegate to the JVM implementation, handing over the column and the option map.
    jc = sc._jvm.functions.schema_of_csv(_to_java_column(col), options)
    return Column(jc)


@since(1.5)
def size(col):
"""
Expand Down