Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Adds fixes from review:
* orders imports in stringOperations.scala
* Substring.dataType throws exception if children are unresolved
* inlines Substring.slice (~11.5% performance improvement on microbenchmark runs)
* adds a special `toString` case for two-argument SUBSTR expressions
* removes spurious I_ prefix to SUBSTR(ING) in HiveQL.scala

Thanks to @concretevitamin for prompt and useful feedback!
  • Loading branch information
willb committed Jul 14, 2014
commit ec35c8016c63dccc7793d1cedb3b11e1c4dc1214
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,9 @@ import java.util.regex.Pattern

import scala.collection.IndexedSeqOptimized

import org.apache.spark.sql.catalyst.types.DataType
import org.apache.spark.sql.catalyst.types.StringType
import org.apache.spark.sql.catalyst.types.BinaryType
import org.apache.spark.sql.catalyst.types.BooleanType

import org.apache.spark.sql.catalyst.analysis.UnresolvedException
import org.apache.spark.sql.catalyst.types.{BinaryType, BooleanType, DataType, StringType}

trait StringRegexExpression {
self: BinaryExpression =>
Expand Down Expand Up @@ -219,13 +218,17 @@ case class Substring(str: Expression, pos: Expression, len: Expression) extends

def nullable: Boolean = true
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nullable could be str.nullable || pos.nullable || len.nullable ?

def dataType: DataType = {
if (!resolved) {
throw new UnresolvedException(this, s"Cannot resolve since $children are not resolved")
}
if (str.dataType == BinaryType) str.dataType else StringType
}

def references = children.flatMap(_.references).toSet

override def children = str :: pos :: len :: Nil

@inline
def slice[T, C <% IndexedSeqOptimized[T,_]](str: C, startPos: Int, sliceLen: Int): Any = {
val len = str.length
// Hive and SQL use one-based indexing for SUBSTR arguments but also accept zero and
Expand Down Expand Up @@ -267,5 +270,8 @@ case class Substring(str: Expression, pos: Expression, len: Expression) extends
}
}

override def toString = s"SUBSTR($str, $pos, $len)"
override def toString = len match {
case max if max == Integer.MAX_VALUE => s"SUBSTR($str, $pos)"
case _ => s"SUBSTR($str, $pos, $len)"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -860,7 +860,7 @@ private[hive] object HiveQl {
val BETWEEN = "(?i)BETWEEN".r
val WHEN = "(?i)WHEN".r
val CASE = "(?i)CASE".r
val SUBSTR = "(?i)I_SUBSTR(?:ING)?".r
val SUBSTR = "(?i)SUBSTR(?:ING)?".r

protected def nodeToExpr(node: Node): Expression = node match {
/* Attribute References */
Expand Down