Changes from 1 commit
Commits (36)
685fd07  use UTF8String instead of String for StringType  (Mar 31, 2015)
21f67c6  cleanup  (Mar 31, 2015)
4699c3a  use Array[Byte] in UTF8String  (Mar 31, 2015)
d32abd1  fix utf8 for python api  (Mar 31, 2015)
a85fb27  refactor  (Mar 31, 2015)
6b499ac  fix style  (Apr 1, 2015)
5f9e120  fix sql tests  (Apr 1, 2015)
38c303e  fix python sql tests  (Apr 1, 2015)
c7dd4d2  fix some catalyst tests  (Apr 1, 2015)
bb52e44  fix scala style  (Apr 1, 2015)
8b45864  fix codegen with UTF8String  (Apr 1, 2015)
23a766c  refactor  (Apr 1, 2015)
9dc32d1  fix some hive tests  (Apr 2, 2015)
73e4363  Merge branch 'master' of github.com:apache/spark into string  (Apr 2, 2015)
956b0a4  fix hive tests  (Apr 2, 2015)
9f4c194  convert data type for data source  (Apr 2, 2015)
537631c  some comment about Date  (Apr 2, 2015)
28d6f32  refactor  (Apr 2, 2015)
28f3d81  Merge branch 'master' of github.com:apache/spark into string  (Apr 3, 2015)
e5fa5b8  remove clone in UTF8String  (Apr 3, 2015)
8d17f21  fix hive compatibility tests  (Apr 3, 2015)
fd11364  optimize UTF8String  (Apr 3, 2015)
ac18ae6  address comment  (Apr 3, 2015)
2089d24  add hashcode check back  (Apr 3, 2015)
13d9d42  Merge branch 'master' of github.com:apache/spark into string  (Apr 3, 2015)
867bf50  fix String filter push down  (Apr 4, 2015)
1314a37  address comments from Yin  (Apr 8, 2015)
5116b43  rollback unrelated changes  (Apr 8, 2015)
08d897b  Merge branch 'master' of github.com:apache/spark into string  (Apr 9, 2015)
b04a19c  add comment for getString/setString  (Apr 10, 2015)
744788f  Merge branch 'master' of github.com:apache/spark into string  (Apr 13, 2015)
341ec2c  turn off scala style check in UTF8StringSuite  (Apr 13, 2015)
59025c8  address comments from @marmbrus  (Apr 15, 2015)
6d776a9  Merge branch 'master' of github.com:apache/spark into string  (Apr 15, 2015)
2772f0d  fix new test failure  (Apr 15, 2015)
3b7bfa8  fix schema of AddJar  (Apr 15, 2015)
refactor
Davies Liu committed Apr 2, 2015
commit 28d6f32eda151ed51f35117eb5beb1ec6b6882d1
@@ -78,10 +78,28 @@ trait ScalaReflection {
case (d: BigDecimal, _) => Decimal(d)
case (d: java.math.BigDecimal, _) => Decimal(d)
case (d: java.sql.Date, _) => DateUtils.fromJavaDate(d)
case (s: String, st: StringType) => UTF8String(s)
case (s: String, _) => UTF8String(s)
case (other, _) => other
}

/**
* Converts Scala objects to catalyst rows / types.
* Note: This should be called before doing evaluation on a Row
* (It does not support UDT)
*/
Contributor comment: It will be good to make it clear when we need this function.

def convertToCatalyst(a: Any): Any = a match {
case s: String => UTF8String(s)
case d: java.sql.Date => DateUtils.fromJavaDate(d)
case d: BigDecimal => Decimal(d)
case d: java.math.BigDecimal => Decimal(d)
case seq: Seq[Any] => seq.map(convertToCatalyst)
case r: Row => Row(r.toSeq.map(convertToCatalyst): _*)
case arr: Array[Any] => arr.toSeq.map(convertToCatalyst).toArray
case m: Map[Any, Any] =>
m.map { case (k, v) => (convertToCatalyst(k), convertToCatalyst(v)) }.toMap
case other => other
}

/** Converts Catalyst types used internally in rows to standard Scala types */
def convertToScala(a: Any, dataType: DataType): Any = (a, dataType) match {
// Check UDT first since UDTs can override other types
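Since the review asks when convertToCatalyst is needed, here is a minimal, dependency-free sketch of the recursive conversion pattern the new method implements. CatalystString is a hypothetical stand-in for Spark's internal UTF8String; only the recursion structure is illustrated, not the real Catalyst types.

```scala
object ConvertSketch {
  // Hypothetical stand-in for org.apache.spark.sql.types.UTF8String.
  final case class CatalystString(bytes: Array[Byte])

  def convert(a: Any): Any = a match {
    case s: String       => CatalystString(s.getBytes("UTF-8"))               // String -> internal form
    case seq: Seq[Any]   => seq.map(convert)                                  // recurse into sequences
    case arr: Array[Any] => arr.map(convert)                                  // recurse into object arrays
    case m: Map[_, _]    => m.map { case (k, v) => convert(k) -> convert(v) } // recurse into maps
    case other           => other                                             // primitives pass through
  }

  def main(args: Array[String]): Unit = {
    // Nested values are converted element by element, mirroring the cases above.
    println(convert(Seq("a", 1, Map("k" -> "v"))))
  }
}
```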
@@ -128,7 +128,7 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
}"""
case other =>
q"""
override def ${accessorForType(dataType)}(i: Int):${termForType(dataType)} = {
override def ${accessorForType(dataType)}(i: Int): ${termForType(dataType)} = {
..$ifStatements;
$accessorFailure
}"""
@@ -148,13 +148,13 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
dataType match {
case StringType =>
q"""
override def setString(i: Int, value: String): Unit = {
override def setString(i: Int, value: String) {
..$ifStatements;
$accessorFailure
}"""
case other =>
q"""
override def ${mutatorForType(dataType)}(i: Int, value: ${termForType(dataType)}):Unit = {
override def ${mutatorForType(dataType)}(i: Int, value: ${termForType(dataType)}) {
..$ifStatements;
$accessorFailure
}"""
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.expressions

import java.sql.{Date, Timestamp}

import org.apache.spark.sql.catalyst.ScalaReflection
import org.apache.spark.sql.types._

object Literal {
@@ -42,20 +43,9 @@ object Literal {
throw new RuntimeException("Unsupported literal type " + v.getClass + " " + v)
}

/**
* convert String in `v` as UTF8String
*/
def convertToUTF8String(v: Any): Any = v match {
case s: String => UTF8String(s)
case seq: Seq[Any] => seq.map(convertToUTF8String)
case r: Row => Row(r.toSeq.map(convertToUTF8String): _*)
case arr: Array[Any] => arr.toSeq.map(convertToUTF8String).toArray
case m: Map[Any, Any] =>
m.map { case (k, v) => (convertToUTF8String(k), convertToUTF8String(v)) }.toMap
case other => other
def create(v: Any, dataType: DataType): Literal = {
Literal(ScalaReflection.convertToCatalyst(v), dataType)
}

def create(v: Any, dataType: DataType): Literal = Literal(convertToUTF8String(v), dataType)
}

/**
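A hedged usage sketch of the reworked Literal.create, assuming this branch's Catalyst classes are on the classpath: the value is routed through ScalaReflection.convertToCatalyst before the Literal is built, so a Scala String is stored in its internal UTF8String form.

```scala
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.StringType

object LiteralCreateSketch {
  def main(args: Array[String]): Unit = {
    // The String is converted to Catalyst's internal representation first.
    val lit = Literal.create("hello", StringType)
    println(lit.value)    // internal UTF8String form of "hello", not java.lang.String
    println(lit.dataType) // StringType
  }
}
```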
@@ -196,10 +196,7 @@ class GenericMutableRow(v: Array[Any]) extends GenericRow(v) with MutableRow {
override def setFloat(ordinal: Int, value: Float): Unit = { values(ordinal) = value }
override def setInt(ordinal: Int, value: Int): Unit = { values(ordinal) = value }
override def setLong(ordinal: Int, value: Long): Unit = { values(ordinal) = value }
override def setString(ordinal: Int, value: String): Unit = {
// TODO(davies): need this?
values(ordinal) = UTF8String(value)
}
override def setString(ordinal: Int, value: String) { values(ordinal) = UTF8String(value) }
override def setNullAt(i: Int): Unit = { values(i) = null }

override def setShort(ordinal: Int, value: Short): Unit = { values(ordinal) = value }
@@ -19,9 +19,6 @@ package org.apache.spark.sql.catalyst.expressions

import java.util.regex.Pattern

import scala.collection.IndexedSeqOptimized


import org.apache.spark.sql.catalyst.analysis.UnresolvedException
import org.apache.spark.sql.types._

@@ -226,8 +223,7 @@ case class Substring(str: Expression, pos: Expression, len: Expression) extends
override def children: Seq[Expression] = str :: pos :: len :: Nil

@inline
def slice[T, C <: Any](str: C, startPos: Int, sliceLen: Int)
(implicit ev: (C=>IndexedSeqOptimized[T,_])): Any = {
def slicePos(startPos: Int, sliceLen: Int, length: () => Int): (Int, Int) = {
// Hive and SQL use one-based indexing for SUBSTR arguments but also accept zero and
// negative indices for start positions. If a start index i is greater than 0, it
// refers to element i-1 in the sequence. If a start index i is less than 0, it refers
@@ -236,29 +232,7 @@ case class Substring(str: Expression, pos: Expression, len: Expression) extends

val start = startPos match {
case pos if pos > 0 => pos - 1
case neg if neg < 0 => str.length + neg
case _ => 0
}

val end = sliceLen match {
case max if max == Integer.MAX_VALUE => max
case x => start + x
}

str.slice(start, end)
}

@inline
def slice(str: UTF8String, startPos: Int, sliceLen: Int): Any = {
// Hive and SQL use one-based indexing for SUBSTR arguments but also accept zero and
// negative indices for start positions. If a start index i is greater than 0, it
// refers to element i-1 in the sequence. If a start index i is less than 0, it refers
// to the -ith element before the end of the sequence. If a start index i is 0, it
// refers to the first element.

val start = startPos match {
case pos if pos > 0 => pos - 1
case neg if neg < 0 => str.length + neg
case neg if neg < 0 => length() + neg
case _ => 0
}

@@ -267,24 +241,26 @@ case class Substring(str: Expression, pos: Expression, len: Expression) extends
case x => start + x
}

str.slice(start, end)
(start, end)
}

override def eval(input: Row): Any = {
val string = str.eval(input)

val po = pos.eval(input)
val ln = len.eval(input)

if ((string == null) || (po == null) || (ln == null)) {
null
} else {
val start = po.asInstanceOf[Int]
val length = ln.asInstanceOf[Int]

val length = ln.asInstanceOf[Int]
string match {
case ba: Array[Byte] => slice(ba, start, length)
case s: UTF8String => slice(s, start, length)
case ba: Array[Byte] =>
val (st, end) = slicePos(start, length, () => ba.length)
ba.slice(st, end)
case s: UTF8String =>
val (st, end) = slicePos(start, length, () => s.length)
s.slice(st, end)
}
}
}
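To make the indexing rules in the comment above concrete, here is a standalone sketch of slicePos with two worked SUBSTR examples; it is plain Scala with no Spark dependencies and mirrors the helper in this diff rather than calling it.

```scala
object SubstrSketch {
  // Maps SQL-style one-based (pos, len) arguments to zero-based (start, end) offsets.
  def slicePos(startPos: Int, sliceLen: Int, length: () => Int): (Int, Int) = {
    val start = startPos match {
      case pos if pos > 0 => pos - 1         // one-based index -> zero-based
      case neg if neg < 0 => length() + neg  // negative index counts back from the end
      case _              => 0               // 0 behaves like 1
    }
    val end = sliceLen match {
      case max if max == Integer.MAX_VALUE => max
      case x                               => start + x
    }
    (start, end)
  }

  def main(args: Array[String]): Unit = {
    val s = "Spark SQL"
    val (s1, e1) = slicePos(1, 5, () => s.length)
    println(s.slice(s1, e1)) // "Spark"  -- SUBSTR('Spark SQL', 1, 5)
    val (s2, e2) = slicePos(-3, 3, () => s.length)
    println(s.slice(s2, e2)) // "SQL"    -- SUBSTR('Spark SQL', -3, 3)
  }
}
```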
@@ -25,6 +25,7 @@ import org.scalactic.TripleEqualsSupport.Spread
import org.scalatest.FunSuite
import org.scalatest.Matchers._

import org.apache.spark.sql.catalyst.ScalaReflection
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.analysis.UnresolvedGetField
import org.apache.spark.sql.types._
@@ -60,7 +61,7 @@ class ExpressionEvaluationBaseSuite extends FunSuite {
class ExpressionEvaluationSuite extends ExpressionEvaluationBaseSuite {

def create_row(values: Array[Any]): Row = {
Contributor comment: Could we just use Row.apply here?

Contributor (author) reply: Row.apply will call ScalaReflection.convertToCatalyst(); we still need a wrapper to do that.

new GenericRow(values.toSeq.map(Literal.convertToUTF8String).toArray)
new GenericRow(values.toSeq.map(ScalaReflection.convertToCatalyst).toArray)
}

test("literals") {
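For context on the exchange above, a rough, dependency-free sketch of what the create_row wrapper does: every plain Scala value is converted to its Catalyst representation before the row is constructed. The row and string classes here are hypothetical stand-ins, not Spark's.

```scala
object CreateRowSketch {
  final case class CatalystString(s: String)    // stand-in for UTF8String
  final case class RowStandIn(values: Seq[Any]) // stand-in for GenericRow

  // Simplified stand-in for ScalaReflection.convertToCatalyst.
  def convertToCatalyst(a: Any): Any = a match {
    case s: String => CatalystString(s)
    case other     => other
  }

  // Mirrors create_row in the diff: convert the values, then build the row.
  def createRow(values: Array[Any]): RowStandIn =
    RowStandIn(values.toSeq.map(convertToCatalyst))

  def main(args: Array[String]): Unit = {
    // Strings are already in internal form when expressions are evaluated against the row.
    println(createRow(Array[Any]("abc", 1, null)))
  }
}
```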
6 changes: 3 additions & 3 deletions sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala
@@ -396,11 +396,11 @@ class SQLContext(@transient val sparkContext: SparkContext)
// schema differs from the existing schema on any field data type.
def needsConversion(dt: DataType): Boolean = dt match {
case StringType => true
case DateType => true
case DecimalType() => true
case dt: ArrayType => needsConversion(dt.elementType)
case dt: MapType => needsConversion(dt.keyType) || needsConversion(dt.valueType)
case dt: StructType =>
!dt.fields.forall(f => !needsConversion(f.dataType))
// TODO(davies): check other types and values
case dt: StructType => !dt.fields.forall(f => !needsConversion(f.dataType))
case other => false
}
val convertedRdd = if (needsConversion(schema)) {
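A hedged illustration of the kind of schema that takes the conversion path above, assuming Spark SQL's types are on the classpath; needsConversion itself is a local helper, so this only shows the inputs it would flag.

```scala
import org.apache.spark.sql.types._

object NeedsConversionSketch {
  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(
      StructField("name",  StringType),  // String -> UTF8String: needs conversion
      StructField("born",  DateType),    // java.sql.Date -> internal days: needs conversion
      StructField("score", DoubleType))) // primitive: passes through unchanged
    // For a schema like this, needsConversion(schema) is true, so the input rows
    // are converted before the DataFrame is built from them.
    println(schema)
  }
}
```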
@@ -230,7 +230,6 @@ case class BatchPythonEvaluation(udf: PythonUDF, output: Seq[Attribute], child:

def execute(): RDD[Row] = {
// TODO: Clean up after ourselves?
// TODO(davies): convert internal type to Scala Type
val childResults = child.execute().map(_.copy()).cache()

val parent = childResults.mapPartitions { iter =>
@@ -450,7 +450,7 @@ private[sql] object JsonRDD extends Logging {
private[sql] def rowToJSON(rowSchema: StructType, gen: JsonGenerator)(row: Row) = {
def valWriter: (DataType, Any) => Unit = {
case (_, null) | (NullType, _) => gen.writeNull()
case (StringType, v: String) => gen.writeString(v.toString)
case (StringType, v: String) => gen.writeString(v)
case (TimestampType, v: java.sql.Timestamp) => gen.writeString(v.toString)
case (IntegerType, v: Int) => gen.writeNumber(v)
case (ShortType, v: Short) => gen.writeNumber(v)
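A brief sketch of the JsonGenerator calls that valWriter dispatches to, assuming Jackson 2.x on the classpath; the field names are hypothetical, and after this change the String case passes v straight through instead of calling toString.

```scala
import java.io.StringWriter

import com.fasterxml.jackson.core.JsonFactory

object JsonWriterSketch {
  def main(args: Array[String]): Unit = {
    val out = new StringWriter()
    val gen = new JsonFactory().createGenerator(out)
    gen.writeStartObject()
    gen.writeFieldName("name"); gen.writeString("alice") // (StringType, v: String)   => gen.writeString(v)
    gen.writeFieldName("age");  gen.writeNumber(42)      // (IntegerType, v: Int)     => gen.writeNumber(v)
    gen.writeFieldName("note"); gen.writeNull()          // (_, null) | (NullType, _) => gen.writeNull()
    gen.writeEndObject()
    gen.close()
    println(out.toString) // {"name":"alice","age":42,"note":null}
  }
}
```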
@@ -20,7 +20,7 @@ import scala.collection.mutable.ArrayBuffer

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.parquet.ParquetRelation2._
import org.apache.spark.sql.test.TestSQLContext
import org.apache.spark.sql.types._
@@ -17,25 +17,22 @@

package org.apache.spark.sql.hive

import org.apache.spark.sql.catalyst.expressions.Row

import scala.collection.JavaConversions._

import org.apache.spark.annotation.Experimental
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.ScalaReflection
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.{Row, _}
import org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate
import org.apache.spark.sql.catalyst.planning._
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.sources.DescribeCommand
import org.apache.spark.sql.execution.{DescribeCommand => RunnableDescribeCommand}
import org.apache.spark.sql.execution._
import org.apache.spark.sql.execution.{DescribeCommand => RunnableDescribeCommand, _}
import org.apache.spark.sql.hive.execution._
import org.apache.spark.sql.parquet.ParquetRelation
import org.apache.spark.sql.sources.{CreateTableUsingAsSelect, CreateTableUsing}
import org.apache.spark.sql.types.{UTF8String, StringType}
import org.apache.spark.sql.sources.{CreateTableUsing, CreateTableUsingAsSelect, DescribeCommand}
import org.apache.spark.sql.types.StringType


private[hive] trait HiveStrategies {
@@ -131,10 +128,7 @@ private[hive] trait HiveStrategies {
val partitionValues = part.getValues
var i = 0
while (i < partitionValues.size()) {
inputData(i) = partitionValues(i) match {
case s: String => UTF8String(s)
case other => other
}
inputData(i) = ScalaReflection.convertToCatalyst(partitionValues(i))
i += 1
}
pruningCondition(inputData)
@@ -23,12 +23,14 @@ import java.util.{Properties, ArrayList => JArrayList}
import scala.collection.JavaConversions._
import scala.language.implicitConversions

import com.esotericsoftware.kryo.Kryo
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hive.common.StatsSetupConst
import org.apache.hadoop.hive.common.`type`.HiveDecimal
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.ql.Context
import org.apache.hadoop.hive.ql.exec.{UDF, Utilities}
import org.apache.hadoop.hive.ql.metadata.{Hive, Partition, Table}
import org.apache.hadoop.hive.ql.plan.{CreateTableDesc, FileSinkDesc, TableDesc}
import org.apache.hadoop.hive.ql.processors.CommandProcessorFactory
@@ -45,7 +47,6 @@ import org.apache.hadoop.{io => hadoopIo}
import org.apache.spark.Logging
import org.apache.spark.sql.types.{Decimal, DecimalType, UTF8String}


/**
* This class provides the UDF creation and also the UDF instance serialization and
* de-serialization cross process boundary.
@@ -60,19 +61,14 @@ private[hive] case class HiveFunctionWrapper(var functionClassName: String)
// for Serialization
def this() = this(null)

import java.io.{InputStream, OutputStream}

import com.esotericsoftware.kryo.Kryo
import org.apache.hadoop.hive.ql.exec.{UDF, Utilities}

import org.apache.spark.util.Utils._
import org.apache.spark.util.Utils._

@transient
private val methodDeSerialize = {
val method = classOf[Utilities].getDeclaredMethod(
"deserializeObjectByKryo",
classOf[Kryo],
classOf[InputStream],
classOf[java.io.InputStream],
classOf[Class[_]])
method.setAccessible(true)

@@ -85,7 +81,7 @@ import org.apache.spark.util.Utils._
"serializeObjectByKryo",
classOf[Kryo],
classOf[Object],
classOf[OutputStream])
classOf[java.io.OutputStream])
method.setAccessible(true)

method
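The serialize/deserialize helpers above rely on a common reflection idiom: look up the method with getDeclaredMethod, call setAccessible(true), cache it, and invoke it later. A self-contained sketch of that idiom, using java.lang.Integer.parseInt as a hypothetical stand-in for Hive's Utilities methods:

```scala
object ReflectionSketch {
  // Cache the Method once, as the @transient private vals above do.
  private val parseIntMethod = {
    val m = classOf[java.lang.Integer].getDeclaredMethod("parseInt", classOf[String])
    m.setAccessible(true) // permit invocation regardless of the member's declared visibility
    m
  }

  def main(args: Array[String]): Unit = {
    // The receiver is null because the target method is static.
    val result = parseIntMethod.invoke(null, "42")
    println(result) // 42
  }
}
```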