
Commit 45685e4

Merge remote-tracking branch 'origin/master' into os/inmemoryrelation-str
2 parents cf2eae2 + f59de52

File tree

72 files changed (+1616, -816 lines)


bin/docker-image-tool.sh

Lines changed: 1 addition & 1 deletion
@@ -135,7 +135,7 @@ BASEDOCKERFILE=
 PYDOCKERFILE=
 NOCACHEARG=
 BUILD_PARAMS=
-while getopts f:mr:t:n:b: option
+while getopts f:p:mr:t:n:b: option
 do
  case "${option}"
  in

docs/ml-features.md

Lines changed: 22 additions & 3 deletions
@@ -585,7 +585,11 @@ for more details on the API.
 ## StringIndexer

 `StringIndexer` encodes a string column of labels to a column of label indices.
-The indices are in `[0, numLabels)`, ordered by label frequencies, so the most frequent label gets index `0`.
+The indices are in `[0, numLabels)`, and four ordering options are supported:
+"frequencyDesc": descending order by label frequency (most frequent label assigned 0),
+"frequencyAsc": ascending order by label frequency (least frequent label assigned 0),
+"alphabetDesc": descending alphabetical order, and "alphabetAsc": ascending alphabetical order
+(default = "frequencyDesc").
 The unseen labels will be put at index numLabels if user chooses to keep them.
 If the input column is numeric, we cast it to string and index the string
 values. When downstream pipeline components such as `Estimator` or
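For context, a minimal sketch of the new ordering behaviour (not part of this commit), assuming a `SparkSession` named `spark` and the `setStringOrderType` setter on `org.apache.spark.ml.feature.StringIndexer`:

```scala
import org.apache.spark.ml.feature.StringIndexer

// Toy data: 'b' occurs three times, 'a' twice, 'c' once.
val df = spark.createDataFrame(
  Seq((0, "b"), (1, "a"), (2, "b"), (3, "a"), (4, "c"), (5, "b"))
).toDF("id", "category")

// "alphabetAsc" assigns index 0 to the alphabetically first label ('a');
// the default "frequencyDesc" would assign index 0 to the most frequent label ('b').
val indexer = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")
  .setStringOrderType("alphabetAsc")

indexer.fit(df).transform(df).show()
```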
@@ -1593,10 +1597,25 @@ Suppose `a` and `b` are double columns, we use the following simple examples to
 * `y ~ a + b + a:b - 1` means model `y ~ w1 * a + w2 * b + w3 * a * b` where `w1, w2, w3` are coefficients.

 `RFormula` produces a vector column of features and a double or string column of label.
-Like when formulas are used in R for linear regression, string input columns will be one-hot encoded, and numeric columns will be cast to doubles.
-If the label column is of type string, it will be first transformed to double with `StringIndexer`.
+Like when formulas are used in R for linear regression, numeric columns will be cast to doubles.
+As to string input columns, they will first be transformed with [StringIndexer](ml-features.html#stringindexer) using ordering determined by `stringOrderType`,
+and the last category after ordering is dropped, then the doubles will be one-hot encoded.
+
+Suppose a string feature column containing values `{'b', 'a', 'b', 'a', 'c', 'b'}`, we set `stringOrderType` to control the encoding:
+~~~
+stringOrderType | Category mapped to 0 by StringIndexer | Category dropped by RFormula
+----------------|---------------------------------------|---------------------------------
+'frequencyDesc' | most frequent category ('b')          | least frequent category ('c')
+'frequencyAsc'  | least frequent category ('c')         | most frequent category ('b')
+'alphabetDesc'  | last alphabetical category ('c')      | first alphabetical category ('a')
+'alphabetAsc'   | first alphabetical category ('a')     | last alphabetical category ('c')
+~~~
+
+If the label column is of type string, it will be first transformed to double with [StringIndexer](ml-features.html#stringindexer) using `frequencyDesc` ordering.
 If the label column does not exist in the DataFrame, the output label column will be created from the specified response variable in the formula.

+**Note:** The ordering option `stringOrderType` is NOT used for the label column. When the label column is indexed, it uses the default descending frequency ordering in `StringIndexer`.
+
 **Examples**

 Assume that we have a DataFrame with the columns `id`, `country`, `hour`, and `clicked`:
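Stepping out of the diff for a moment, a hypothetical sketch (not part of this commit) of `stringOrderType` on `RFormula`, again assuming a `SparkSession` named `spark` and a `setStringOrderType` setter:

```scala
import org.apache.spark.ml.feature.RFormula

// Toy data mirroring the {'b', 'a', 'b', 'a', 'c', 'b'} example above.
val df = spark.createDataFrame(
  Seq((1.0, "b"), (2.0, "a"), (3.0, "b"), (4.0, "a"), (5.0, "c"), (6.0, "b"))
).toDF("label", "str")

// With "alphabetDesc", StringIndexer maps 'c' to index 0 and RFormula drops the
// first alphabetical category ('a') during one-hot encoding.
val formula = new RFormula()
  .setFormula("label ~ str")
  .setStringOrderType("alphabetDesc")

formula.fit(df).transform(df).select("str", "features", "label").show()
```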

docs/sql-programming-guide.md

Lines changed: 7 additions & 0 deletions
@@ -1407,6 +1407,13 @@ the following case-insensitive options:
     This is a JDBC writer related option. When <code>SaveMode.Overwrite</code> is enabled, this option causes Spark to truncate an existing table instead of dropping and recreating it. This can be more efficient, and prevents the table metadata (e.g., indices) from being removed. However, it will not work in some cases, such as when the new data has a different schema. It defaults to <code>false</code>. This option applies only to writing.
    </td>
  </tr>
+
+  <tr>
+    <td><code>cascadeTruncate</code></td>
+    <td>
+      This is a JDBC writer related option. If enabled and supported by the JDBC database (PostgreSQL and Oracle at the moment), this option allows execution of a <code>TRUNCATE TABLE t CASCADE</code> (in the case of PostgreSQL a <code>TRUNCATE TABLE ONLY t CASCADE</code> is executed to prevent inadvertently truncating descendant tables). This will affect other tables, and thus should be used with care. This option applies only to writing. It defaults to the default cascading truncate behaviour of the JDBC database in question, specified in the <code>isCascadeTruncate</code> in each JDBCDialect.
+    </td>
+  </tr>

   <tr>
     <td><code>createTableOptions</code></td>
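For context, a minimal writer sketch (not part of this commit) combining the new option with `truncate`; the DataFrame `df`, URL, and credentials are placeholders:

```scala
import org.apache.spark.sql.SaveMode

// Overwrite by truncating the existing table; where the dialect supports it,
// cascadeTruncate issues TRUNCATE TABLE ... CASCADE instead.
df.write
  .format("jdbc")
  .option("url", "jdbc:postgresql://localhost:5432/testdb")
  .option("dbtable", "public.events")
  .option("user", "username")
  .option("password", "password")
  .option("truncate", "true")
  .option("cascadeTruncate", "true")
  .mode(SaveMode.Overwrite)
  .save()
```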

external/avro/pom.xml

Lines changed: 5 additions & 0 deletions
@@ -61,6 +61,11 @@
       <type>test-jar</type>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.scalacheck</groupId>
+      <artifactId>scalacheck_${scala.binary.version}</artifactId>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-tags_${scala.binary.version}</artifactId>
Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import org.apache.avro.Schema
+import org.apache.avro.generic.GenericDatumReader
+import org.apache.avro.io.{BinaryDecoder, DecoderFactory}
+
+import org.apache.spark.sql.avro.{AvroDeserializer, SchemaConverters}
+import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, UnaryExpression}
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode}
+import org.apache.spark.sql.types.{AbstractDataType, BinaryType, DataType}
+
+case class AvroDataToCatalyst(child: Expression, jsonFormatSchema: String)
+  extends UnaryExpression with ExpectsInputTypes {
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType)
+
+  override lazy val dataType: DataType = SchemaConverters.toSqlType(avroSchema).dataType
+
+  override def nullable: Boolean = true
+
+  @transient private lazy val avroSchema = new Schema.Parser().parse(jsonFormatSchema)
+
+  @transient private lazy val reader = new GenericDatumReader[Any](avroSchema)
+
+  @transient private lazy val deserializer = new AvroDeserializer(avroSchema, dataType)
+
+  @transient private var decoder: BinaryDecoder = _
+
+  @transient private var result: Any = _
+
+  override def nullSafeEval(input: Any): Any = {
+    val binary = input.asInstanceOf[Array[Byte]]
+    decoder = DecoderFactory.get().binaryDecoder(binary, 0, binary.length, decoder)
+    result = reader.read(result, decoder)
+    deserializer.deserialize(result)
+  }
+
+  override def simpleString: String = {
+    s"from_avro(${child.sql}, ${dataType.simpleString})"
+  }
+
+  override def sql: String = {
+    s"from_avro(${child.sql}, ${dataType.catalogString})"
+  }
+
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    val expr = ctx.addReferenceObj("this", this)
+    defineCodeGen(ctx, ev, input =>
+      s"(${CodeGenerator.boxedType(dataType)})$expr.nullSafeEval($input)")
+  }
+}
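A usage sketch (not part of this commit), assuming the public `Column(expr)` constructor is available and that a DataFrame `df` carries Avro-encoded bytes in a binary column `value`; the wrapper name `fromAvro` is illustrative:

```scala
import org.apache.spark.sql.{AvroDataToCatalyst, Column}

// Illustrative writer schema for the encoded payload.
val jsonFormatSchema =
  """{"type": "record", "name": "User", "fields": [
    |  {"name": "name", "type": "string"}, {"name": "age", "type": "int"}]}""".stripMargin

// Wrap the new expression so it can be used in DataFrame selects.
def fromAvro(data: Column, schema: String): Column =
  new Column(AvroDataToCatalyst(data.expr, schema))

// val decoded = df.select(fromAvro(df("value"), jsonFormatSchema).as("user"))
```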

external/avro/src/main/scala/org/apache/spark/sql/avro/AvroFileFormat.scala

Lines changed: 13 additions & 22 deletions
@@ -58,21 +58,19 @@ private[avro] class AvroFileFormat extends FileFormat with DataSourceRegister {
       options: Map[String, String],
       files: Seq[FileStatus]): Option[StructType] = {
     val conf = spark.sparkContext.hadoopConfiguration
-    val parsedOptions = new AvroOptions(options)
+    val parsedOptions = new AvroOptions(options, conf)

     // Schema evolution is not supported yet. Here we only pick a single random sample file to
     // figure out the schema of the whole dataset.
     val sampleFile =
-      if (AvroFileFormat.ignoreFilesWithoutExtensions(conf)) {
-        files.find(_.getPath.getName.endsWith(".avro")).getOrElse {
-          throw new FileNotFoundException(
-            "No Avro files found. Hadoop option \"avro.mapred.ignore.inputs.without.extension\" " +
-            " is set to true. Do all input files have \".avro\" extension?"
-          )
+      if (parsedOptions.ignoreExtension) {
+        files.headOption.getOrElse {
+          throw new FileNotFoundException("Files for schema inferring have been not found.")
         }
       } else {
-        files.headOption.getOrElse {
-          throw new FileNotFoundException("No Avro files found.")
+        files.find(_.getPath.getName.endsWith(".avro")).getOrElse {
+          throw new FileNotFoundException(
+            "No Avro files found. If files don't have .avro extension, set ignoreExtension to true")
         }
       }

@@ -115,7 +113,7 @@ private[avro] class AvroFileFormat extends FileFormat with DataSourceRegister {
       job: Job,
       options: Map[String, String],
       dataSchema: StructType): OutputWriterFactory = {
-    val parsedOptions = new AvroOptions(options)
+    val parsedOptions = new AvroOptions(options, spark.sessionState.newHadoopConf())
     val outputAvroSchema = SchemaConverters.toAvroType(
       dataSchema, nullable = false, parsedOptions.recordName, parsedOptions.recordNamespace)

@@ -146,7 +144,7 @@ private[avro] class AvroFileFormat extends FileFormat with DataSourceRegister {
         log.error(s"unsupported compression codec $unknown")
     }

-    new AvroOutputWriterFactory(dataSchema, new SerializableSchema(outputAvroSchema))
+    new AvroOutputWriterFactory(dataSchema, outputAvroSchema.toString)
   }

   override def buildReader(
@@ -160,7 +158,7 @@ private[avro] class AvroFileFormat extends FileFormat with DataSourceRegister {

     val broadcastedConf =
       spark.sparkContext.broadcast(new AvroFileFormat.SerializableConfiguration(hadoopConf))
-    val parsedOptions = new AvroOptions(options)
+    val parsedOptions = new AvroOptions(options, hadoopConf)

     (file: PartitionedFile) => {
       val log = LoggerFactory.getLogger(classOf[AvroFileFormat])
@@ -171,9 +169,7 @@ private[avro] class AvroFileFormat extends FileFormat with DataSourceRegister {
       // Doing input file filtering is improper because we may generate empty tasks that process no
       // input files but stress the scheduler. We should probably add a more general input file
       // filtering mechanism for `FileFormat` data sources. See SPARK-16317.
-      if (AvroFileFormat.ignoreFilesWithoutExtensions(conf) && !file.filePath.endsWith(".avro")) {
-        Iterator.empty
-      } else {
+      if (parsedOptions.ignoreExtension || file.filePath.endsWith(".avro")) {
        val reader = {
          val in = new FsInput(new Path(new URI(file.filePath)), conf)
          try {
@@ -228,6 +224,8 @@ private[avro] class AvroFileFormat extends FileFormat with DataSourceRegister {
            deserializer.deserialize(record).asInstanceOf[InternalRow]
          }
        }
+      } else {
+        Iterator.empty
       }
     }
   }
@@ -274,11 +272,4 @@ private[avro] object AvroFileFormat {
       value.readFields(new DataInputStream(in))
     }
   }
-
-  def ignoreFilesWithoutExtensions(conf: Configuration): Boolean = {
-    // Files without .avro extensions are not ignored by default
-    val defaultValue = false
-
-    conf.getBoolean(AvroFileFormat.IgnoreFilesWithoutExtensionProperty, defaultValue)
-  }
 }

external/avro/src/main/scala/org/apache/spark/sql/avro/AvroOptions.scala

Lines changed: 26 additions & 3 deletions
@@ -17,16 +17,21 @@

 package org.apache.spark.sql.avro

+import org.apache.hadoop.conf.Configuration
+
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap

 /**
  * Options for Avro Reader and Writer stored in case insensitive manner.
  */
-class AvroOptions(@transient val parameters: CaseInsensitiveMap[String])
-  extends Logging with Serializable {
+class AvroOptions(
+    @transient val parameters: CaseInsensitiveMap[String],
+    @transient val conf: Configuration) extends Logging with Serializable {

-  def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters))
+  def this(parameters: Map[String, String], conf: Configuration) = {
+    this(CaseInsensitiveMap(parameters), conf)
+  }

   /**
    * Optional schema provided by an user in JSON format.
@@ -45,4 +50,22 @@ class AvroOptions(@transient val parameters: CaseInsensitiveMap[String])
    * See Avro spec for details: https://avro.apache.org/docs/1.8.2/spec.html#schema_record .
    */
   val recordNamespace: String = parameters.getOrElse("recordNamespace", "")
+
+  /**
+   * The `ignoreExtension` option controls ignoring of files without `.avro` extensions in read.
+   * If the option is enabled, all files (with and without `.avro` extension) are loaded.
+   * If the option is not set, the Hadoop's config `avro.mapred.ignore.inputs.without.extension`
+   * is taken into account. If the former one is not set too, file extensions are ignored.
+   */
+  val ignoreExtension: Boolean = {
+    val ignoreFilesWithoutExtensionByDefault = false
+    val ignoreFilesWithoutExtension = conf.getBoolean(
+      AvroFileFormat.IgnoreFilesWithoutExtensionProperty,
+      ignoreFilesWithoutExtensionByDefault)
+
+    parameters
+      .get("ignoreExtension")
+      .map(_.toBoolean)
+      .getOrElse(!ignoreFilesWithoutExtension)
+  }
 }
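A reader sketch (not part of this commit) showing how the new `ignoreExtension` option is meant to be used, assuming a `SparkSession` named `spark` and the `avro` data source registered by this module:

```scala
// Load every file in the directory, regardless of extension. When the option is
// absent, the Hadoop property avro.mapred.ignore.inputs.without.extension is
// consulted instead, as documented in AvroOptions.ignoreExtension above.
val df = spark.read
  .format("avro")
  .option("ignoreExtension", "true")
  .load("/path/to/avro/dir")
```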

external/avro/src/main/scala/org/apache/spark/sql/avro/AvroOutputWriterFactory.scala

Lines changed: 11 additions & 3 deletions
@@ -17,21 +17,29 @@

 package org.apache.spark.sql.avro

+import org.apache.avro.Schema
 import org.apache.hadoop.mapreduce.TaskAttemptContext

 import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory}
 import org.apache.spark.sql.types.StructType

+/**
+ * A factory that produces [[AvroOutputWriter]].
+ * @param catalystSchema Catalyst schema of input data.
+ * @param avroSchemaAsJsonString Avro schema of output result, in JSON string format.
+ */
 private[avro] class AvroOutputWriterFactory(
-    schema: StructType,
-    avroSchema: SerializableSchema) extends OutputWriterFactory {
+    catalystSchema: StructType,
+    avroSchemaAsJsonString: String) extends OutputWriterFactory {
+
+  private lazy val avroSchema = new Schema.Parser().parse(avroSchemaAsJsonString)

   override def getFileExtension(context: TaskAttemptContext): String = ".avro"

   override def newInstance(
       path: String,
       dataSchema: StructType,
       context: TaskAttemptContext): OutputWriter = {
-    new AvroOutputWriter(path, context, schema, avroSchema.value)
+    new AvroOutputWriter(path, context, catalystSchema, avroSchema)
   }
 }
Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql
+
+import java.io.ByteArrayOutputStream
+
+import org.apache.avro.generic.GenericDatumWriter
+import org.apache.avro.io.{BinaryEncoder, EncoderFactory}
+
+import org.apache.spark.sql.avro.{AvroSerializer, SchemaConverters}
+import org.apache.spark.sql.catalyst.expressions.{Expression, UnaryExpression}
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode}
+import org.apache.spark.sql.types.{BinaryType, DataType}
+
+case class CatalystDataToAvro(child: Expression) extends UnaryExpression {
+
+  override def dataType: DataType = BinaryType
+
+  @transient private lazy val avroType =
+    SchemaConverters.toAvroType(child.dataType, child.nullable)
+
+  @transient private lazy val serializer =
+    new AvroSerializer(child.dataType, avroType, child.nullable)
+
+  @transient private lazy val writer =
+    new GenericDatumWriter[Any](avroType)
+
+  @transient private var encoder: BinaryEncoder = _
+
+  @transient private lazy val out = new ByteArrayOutputStream
+
+  override def nullSafeEval(input: Any): Any = {
+    out.reset()
+    encoder = EncoderFactory.get().directBinaryEncoder(out, encoder)
+    val avroData = serializer.serialize(input)
+    writer.write(avroData, encoder)
+    encoder.flush()
+    out.toByteArray
+  }
+
+  override def simpleString: String = {
+    s"to_avro(${child.sql}, ${child.dataType.simpleString})"
+  }
+
+  override def sql: String = {
+    s"to_avro(${child.sql}, ${child.dataType.catalogString})"
+  }
+
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    val expr = ctx.addReferenceObj("this", this)
+    defineCodeGen(ctx, ev, input =>
+      s"(byte[]) $expr.nullSafeEval($input)")
+  }
+}
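And the mirror-image sketch (not part of this commit) for serializing Catalyst data back to Avro bytes, again assuming the public `Column(expr)` constructor; the wrapper name `toAvro` is illustrative:

```scala
import org.apache.spark.sql.{CatalystDataToAvro, Column}
import org.apache.spark.sql.functions.struct

// Wrap the new expression so any Column can be serialized to Avro binary.
def toAvro(data: Column): Column = new Column(CatalystDataToAvro(data.expr))

// E.g. pack two columns into a struct and encode it, say before writing to Kafka:
// val encoded = df.select(toAvro(struct(df("name"), df("age"))).as("value"))
```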
