apache · WeichenXu123 · Sep 4, 2018 · Sep 4, 2018 · Sep 5, 2018 · Sep 5, 2018
diff --git a/...es/images/kittens/29.5.a_b_EGDP022204.jpg → ...es/origin/kittens/29.5.a_b_EGDP022204.jpg b/...es/images/kittens/29.5.a_b_EGDP022204.jpg → ...es/origin/kittens/29.5.a_b_EGDP022204.jpg
diff --git a/data/mllib/images/images/kittens/54893.jpg → data/mllib/images/origin/kittens/54893.jpg b/data/mllib/images/images/kittens/54893.jpg → data/mllib/images/origin/kittens/54893.jpg
diff --git a/.../mllib/images/images/kittens/DP153539.jpg → .../mllib/images/origin/kittens/DP153539.jpg b/.../mllib/images/images/kittens/DP153539.jpg → .../mllib/images/origin/kittens/DP153539.jpg
diff --git a/.../mllib/images/images/kittens/DP802813.jpg → .../mllib/images/origin/kittens/DP802813.jpg b/.../mllib/images/images/kittens/DP802813.jpg → .../mllib/images/origin/kittens/DP802813.jpg
diff --git a/...mllib/images/images/kittens/not-image.txt → ...mllib/images/origin/kittens/not-image.txt b/...mllib/images/images/kittens/not-image.txt → ...mllib/images/origin/kittens/not-image.txt
diff --git a/data/mllib/images/images/license.txt → data/mllib/images/origin/license.txt b/data/mllib/images/images/license.txt → data/mllib/images/origin/license.txt
diff --git a/...llib/images/images/multi-channel/BGRA.png → ...llib/images/origin/multi-channel/BGRA.png b/...llib/images/images/multi-channel/BGRA.png → ...llib/images/origin/multi-channel/BGRA.png
diff --git a/...es/images/multi-channel/BGRA_alpha_60.png → ...es/origin/multi-channel/BGRA_alpha_60.png b/...es/images/multi-channel/BGRA_alpha_60.png → ...es/origin/multi-channel/BGRA_alpha_60.png
diff --git a/...ages/images/multi-channel/chr30.4.184.jpg → ...ages/origin/multi-channel/chr30.4.184.jpg b/...ages/images/multi-channel/chr30.4.184.jpg → ...ages/origin/multi-channel/chr30.4.184.jpg
diff --git a/...images/images/multi-channel/grayscale.jpg → ...images/origin/multi-channel/grayscale.jpg b/...images/images/multi-channel/grayscale.jpg → ...images/origin/multi-channel/grayscale.jpg
diff --git a/...tens/date=2018-01/29.5.a_b_EGDP022204.jpg → ...tens/date=2018-01/29.5.a_b_EGDP022204.jpg b/...tens/date=2018-01/29.5.a_b_EGDP022204.jpg → ...tens/date=2018-01/29.5.a_b_EGDP022204.jpg
diff --git a/...ns/cls=kittens/date=2018-01/not-image.txt → ...ed/cls=kittens/date=2018-01/not-image.txt b/...ns/cls=kittens/date=2018-01/not-image.txt → ...ed/cls=kittens/date=2018-01/not-image.txt
diff --git a/...itions/cls=kittens/date=2018-02/54893.jpg → ...tioned/cls=kittens/date=2018-02/54893.jpg b/...itions/cls=kittens/date=2018-02/54893.jpg → ...tioned/cls=kittens/date=2018-02/54893.jpg
diff --git a/...ons/cls=kittens/date=2018-02/DP153539.jpg → ...ned/cls=kittens/date=2018-02/DP153539.jpg b/...ons/cls=kittens/date=2018-02/DP153539.jpg → ...ned/cls=kittens/date=2018-02/DP153539.jpg
diff --git a/...ons/cls=kittens/date=2018-02/DP802813.jpg → ...ned/cls=kittens/date=2018-02/DP802813.jpg b/...ons/cls=kittens/date=2018-02/DP802813.jpg → ...ned/cls=kittens/date=2018-02/DP802813.jpg
diff --git a/...ns/cls=multichannel/date=2018-01/BGRA.png → ...ed/cls=multichannel/date=2018-01/BGRA.png b/...ns/cls=multichannel/date=2018-01/BGRA.png → ...ed/cls=multichannel/date=2018-01/BGRA.png
diff --git a/...ltichannel/date=2018-01/BGRA_alpha_60.png → ...ltichannel/date=2018-01/BGRA_alpha_60.png b/...ltichannel/date=2018-01/BGRA_alpha_60.png → ...ltichannel/date=2018-01/BGRA_alpha_60.png
diff --git a/...multichannel/date=2018-02/chr30.4.184.jpg → ...multichannel/date=2018-02/chr30.4.184.jpg b/...multichannel/date=2018-02/chr30.4.184.jpg → ...multichannel/date=2018-02/chr30.4.184.jpg
diff --git a/...s=multichannel/date=2018-02/grayscale.jpg → ...s=multichannel/date=2018-02/grayscale.jpg b/...s=multichannel/date=2018-02/grayscale.jpg → ...s=multichannel/date=2018-02/grayscale.jpg
diff --git a/mllib/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/mllib/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
@@ -1,2 +1,2 @@
 org.apache.spark.ml.source.libsvm.LibSVMFileFormat
-org.apache.spark.ml.source.image.ImageFileFormat
+org.apache.spark.ml.source.image.ImageFileFormat
diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/image/ImageDataSource.scala b/mllib/src/main/scala/org/apache/spark/ml/source/image/ImageDataSource.scala
@@ -29,7 +29,7 @@ package org.apache.spark.ml.source.image
  *  - data: BinaryType (Image bytes in OpenCV-compatible order: row-wise BGR in most cases)
  *
  * To use IMAGE data source, you need to set "image" as the format in `DataFrameReader` and
- * optionally specify options, for example:
+ * optionally specify the data source options, for example:
  * {{{
  *   // Scala
  *   val df = spark.read.format("image")
@@ -45,6 +45,8 @@ package org.apache.spark.ml.source.image
  * IMAGE data source supports the following options:
  *  - "dropImageFailures": Whether to drop the files that are not valid images from the result.
  *
+ * @note This IMAGE data source does not support "write".
+ *
  * @note This class is public for documentation purpose. Please don't use this class directly.
  * Rather, use the data source API as illustrated above.
  */

diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/image/ImageFileFormat.scala b/mllib/src/main/scala/org/apache/spark/ml/source/image/ImageFileFormat.scala
@@ -33,15 +33,6 @@ import org.apache.spark.sql.sources.{DataSourceRegister, Filter}
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.util.SerializableConfiguration
 
-
-private[image] class ImageFileFormatOptions(
-    @transient private val parameters: CaseInsensitiveMap[String]) extends Serializable {
-
-  def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters))
-
-  val dropImageFailures = parameters.getOrElse("dropImageFailures", "false").toBoolean
-}
-
 private[image] class ImageFileFormat extends FileFormat with DataSourceRegister {
 
   override def inferSchema(
@@ -53,8 +44,7 @@ private[image] class ImageFileFormat extends FileFormat with DataSourceRegister
       sparkSession: SparkSession,
       job: Job, options: Map[String, String],
       dataSchema: StructType): OutputWriterFactory = {
-    throw new UnsupportedOperationException(
-      s"prepareWrite is not supported for image data source")
+    throw new UnsupportedOperationException("Write is not supported for image data source")
   }
 
   override def shortName(): String = "image"
@@ -74,7 +64,7 @@ private[image] class ImageFileFormat extends FileFormat with DataSourceRegister
     val broadcastedHadoopConf =
       sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf))
 
-    val imageSourceOptions = new ImageFileFormatOptions(options)
+    val imageSourceOptions = new ImageOptions(options)
 
     (file: PartitionedFile) => {
       val emptyUnsafeRow = new UnsafeRow(0)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/image/ImageOptions.scala b/mllib/src/main/scala/org/apache/spark/ml/source/image/ImageOptions.scala
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.source.image
+
+import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
+
+private[image] class ImageOptions(
+    @transient private val parameters: CaseInsensitiveMap[String]) extends Serializable {
+
+  def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters))
+
+  val dropImageFailures = parameters.getOrElse("dropImageFailures", "false").toBoolean
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/image/ImageSchemaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/image/ImageSchemaSuite.scala
@@ -28,7 +28,7 @@ import org.apache.spark.sql.types._
 
 class ImageSchemaSuite extends SparkFunSuite with MLlibTestSparkContext {
   // Single column of images named "image"
-  private lazy val imagePath = "../data/mllib/images/images"
+  private lazy val imagePath = "../data/mllib/images/origin"
 
   test("Smoke test: create basic ImageSchema dataframe") {
     val origin = "path"

diff --git a/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala
@@ -28,7 +28,7 @@ import org.apache.spark.sql.functions.{col, substring_index}
 class ImageFileFormatSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   // Single column of images named "image"
-  private lazy val imagePath = "../data/mllib/images/imagesWithPartitions"
+  private lazy val imagePath = "../data/mllib/images/partitioned"
 
   test("image datasource count test") {
     val df1 = spark.read.format("image").load(imagePath)
@@ -82,27 +82,26 @@ class ImageFileFormatSuite extends SparkFunSuite with MLlibTestSparkContext {
 
   // Images with the different number of channels
   test("readImages pixel values test") {
-
     val images = spark.read.format("image").option("dropImageFailures", "true")
       .load(imagePath + "/cls=multichannel/").collect()
 
-    val firstBytes20Map = images.map { rrow =>
+    val firstBytes20Set = images.map { rrow =>
       val row = rrow.getAs[Row]("image")
       val filename = Paths.get(getOrigin(row)).getFileName().toString()
       val mode = getMode(row)
       val bytes20 = getData(row).slice(0, 20).toList
       filename -> Tuple2(mode, bytes20)
-    }.toMap
+    }.toSet
 
-    assert(firstBytes20Map === expectedFirstBytes20Map)
+    assert(firstBytes20Set === expectedFirstBytes20Set)
   }
 
   // number of channels and first 20 bytes of OpenCV representation
   // - default representation for 3-channel RGB images is BGR row-wise:
   //   (B00, G00, R00,      B10, G10, R10,      ...)
   // - default representation for 4-channel RGB images is BGRA row-wise:
   //   (B00, G00, R00, A00, B10, G10, R10, A10, ...)
-  private val expectedFirstBytes20Map = Map(
+  private val expectedFirstBytes20Set = Set(
     "grayscale.jpg" ->
       ((0, List[Byte](-2, -33, -61, -60, -59, -59, -64, -59, -66, -67, -73, -73, -62,
         -57, -60, -63, -53, -49, -55, -69))),

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
@@ -592,7 +592,6 @@ object DataSource extends Logging {
       "org.apache.spark.sql.execution.datasources.orc" -> nativeOrc,
       "org.apache.spark.ml.source.libsvm.DefaultSource" -> libsvm,
       "org.apache.spark.ml.source.libsvm" -> libsvm,
-      "org.apache.spark.ml.source.image.ImageFileFormat.DefaultSource" -> image,
       "org.apache.spark.ml.source.image.ImageFileFormat" -> image,
       "com.databricks.spark.csv" -> csv,
       "org.apache.spark.sql.execution.streaming.TextSocketSourceProvider" -> socket,