22 changes: 21 additions & 1 deletion python/pyspark/sql/readwriter.py
@@ -336,6 +336,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
``inferSchema`` option or specify the schema explicitly using ``schema``.

:param path: string, or list of strings, for input path(s).
Member: nit: . -> ,

Contributor Author: ok thanks :)

or RDD of Strings storing CSV rows.
:param schema: an optional :class:`pyspark.sql.types.StructType` for the input schema
or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``).
:param sep: sets the single character as a separator for each field and value.
@@ -408,6 +409,10 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
>>> df = spark.read.csv('python/test_support/sql/ages.csv')
>>> df.dtypes
[('_c0', 'string'), ('_c1', 'string')]
>>> rdd = sc.textFile('python/test_support/sql/ages.csv')
>>> df2 = spark.read.csv(rdd)
>>> df2.dtypes
[('_c0', 'string'), ('_c1', 'string')]
"""
self._set_opts(
schema=schema, sep=sep, encoding=encoding, quote=quote, escape=escape, comment=comment,
@@ -420,7 +425,22 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
columnNameOfCorruptRecord=columnNameOfCorruptRecord, multiLine=multiLine)
if isinstance(path, basestring):
path = [path]
return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))
if type(path) == list:
return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))
elif isinstance(path, RDD):
def func(iterator):
for x in iterator:
if not isinstance(x, basestring):
x = unicode(x)
if isinstance(x, unicode):
x = x.encode("utf-8")
yield x
keyed = path.mapPartitions(func)
keyed._bypass_serializer = True
jrdd = keyed._jrdd.map(self._spark._jvm.BytesToString())
Member: I tried a way within Python and this seems to work:

diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 1ed452d895b..0f54065b3ee 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -438,7 +438,10 @@ class DataFrameReader(OptionUtils):
             keyed = path.mapPartitions(func)
             keyed._bypass_serializer = True
             jrdd = keyed._jrdd.map(self._spark._jvm.BytesToString())
-            return self._df(self._jreader.csv(jrdd))
+            jdataset = self._spark._jsqlContext.createDataset(
+                jrdd.rdd(),
+                self._spark._sc._jvm.Encoders.STRING())
+            return self._df(self._jreader.csv(jdataset))
         else:
             raise TypeError("path can be only string, list or RDD")

Member: @goldmedal, it'd be great if you could double-check whether this really works and whether it can be shortened or made cleaner. This was just my rough try to reach the goal, so I am not sure it is the best way.

goldmedal (Contributor Author), Sep 26, 2017: OK, this way looks good. I'll try it. Thanks for your suggestion.

return self._df(self._jreader.csv(jrdd))
else:
raise TypeError("path can be only string, list or RDD")

@since(1.5)
def orc(self, path):
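To make the user-facing behavior of the Python change concrete, here is a minimal sketch of the new API (assuming a PySpark build that includes this patch; the app name and sample rows are placeholders):

    # Sketch: reading CSV rows from an RDD of strings via the API added above.
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("csv-from-rdd-demo").getOrCreate()
    sc = spark.sparkContext

    # One CSV row per record, mirroring the ages.csv doctest in the diff.
    rdd = sc.parallelize(["Joe,20", "Tom,30"])

    # Without a schema, every column defaults to string, as in the doctest.
    df = spark.read.csv(rdd)
    print(df.dtypes)  # [('_c0', 'string'), ('_c1', 'string')]

    # The schema parameter also accepts a DDL-formatted string (see the
    # docstring above), so types can be applied directly.
    df2 = spark.read.csv(rdd, schema="name STRING, age INT")
    df2.show()
    # +----+---+
    # |name|age|
    # +----+---+
    # | Joe| 20|
    # | Tom| 30|
    # +----+---+

    spark.stop()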
34 changes: 34 additions & 0 deletions sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -455,6 +455,40 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
csv(Seq(path): _*)
}

/**
* Loads a `JavaRDD[String]` storing CSV rows and returns the result as a `DataFrame`.
*
* If the schema is not specified using `schema` function and `inferSchema` option is enabled,
* this function goes through the input once to determine the input schema.
*
* If the schema is not specified using `schema` function and `inferSchema` option is disabled,
* it determines the columns as string types and it reads only the first line to determine the
* names and the number of fields.
*
* @param csvRDD input RDD with one CSV row per record
* @since 2.2.0
*/
@deprecated("Use csv(Dataset[String]) instead.", "2.2.0")
def csv(csvRDD: JavaRDD[String]): DataFrame = csv(csvRDD.rdd)

/**
* Loads an `RDD[String]` storing CSV rows and returns the result as a `DataFrame`.
*
* If the schema is not specified using `schema` function and `inferSchema` option is enabled,
* this function goes through the input once to determine the input schema.
*
* If the schema is not specified using `schema` function and `inferSchema` option is disabled,
* it determines the columns as string types and it reads only the first line to determine the
* names and the number of fields.
*
* @param csvRDD input RDD with one CSV row per record
* @since 2.2.0
*/
@deprecated("Use csv(Dataset[String]) instead.", "2.2.0")
def csv(csvRDD: RDD[String]): DataFrame = {
HyukjinKwon (Member), Sep 25, 2017: Wait ... I think we shouldn't introduce an RDD API on the Scala side. I was thinking of doing this on the Python side, or maybe adding a private wrapper on the Scala side if required. Will take a closer look tomorrow (KST).

Contributor Author: Thanks for your review :)
umm.. I followed spark.read.json's approach when adding these. Although json(jsonRDD: RDD[String]) has been deprecated, PySpark still uses it to create a DataFrame. I think adding a private wrapper in Scala may be better, because not only PySpark but also SparkR may need it.
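For context, the json precedent mentioned here is the existing PySpark path where spark.read.json accepts an RDD of JSON strings; a minimal sketch (assuming a live SparkSession bound to `spark` and its SparkContext as `sc`):

    # Sketch of the json precedent: spark.read.json already accepts an RDD
    # of strings, one JSON document per record.
    rdd = sc.parallelize(['{"name": "Joe", "age": 20}',
                          '{"name": "Tom", "age": 30}'])
    df = spark.read.json(rdd)
    df.printSchema()
    # root
    #  |-- age: long (nullable = true)
    #  |-- name: string (nullable = true)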

Member: Yep. +1 for @HyukjinKwon's advice. We cannot add a deprecated method that never existed in 2.2.0 at all.

Member: Yeah... it's weird to add an already-deprecated method. :) We should either add a special wrapper for this purpose or do this on the Python side, if that is possible and not too complicated.

csv(sparkSession.createDataset(csvRDD)(Encoders.STRING))
}

/**
* Loads a `Dataset[String]` storing CSV rows and returns the result as a `DataFrame`.
*
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -146,6 +146,22 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
verifyCars(carsWithoutHeader, withHeader = false, checkTypes = false)
}

test("simple csv test with string RDD") {
val csvRDD = spark.sparkContext.textFile(testFile(carsFile))
val cars = spark.read
.option("header", "true")
.option("inferSchema", "true")
.csv(csvRDD)

verifyCars(cars, withHeader = true, checkTypes = true)

val carsWithoutHeader = spark.read
.option("header", "false")
.csv(csvRDD)

verifyCars(carsWithoutHeader, withHeader = false, checkTypes = false)
}
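A rough PySpark counterpart of this Scala test could look like the following sketch (the test-data path, `spark`, and `sc` are assumptions, not part of this patch's test suite):

    # Sketch: exercising the same header/inferSchema options from Python.
    csv_rdd = sc.textFile("sql/core/src/test/resources/test-data/cars.csv")

    cars = (spark.read
            .option("header", "true")
            .option("inferSchema", "true")
            .csv(csv_rdd))

    cars_without_header = (spark.read
                           .option("header", "false")
                           .csv(csv_rdd))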

test("test inferring booleans") {
val result = spark.read
.format("csv")