Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
73 commits
Select commit Hold shift + click to select a range
e14b545
[SPARK-7977] [BUILD] Disallowing println
jonalter Jul 10, 2015
11e22b7
[SPARK-7944] [SPARK-8013] Remove most of the Spark REPL fork for Scal…
dragos Jul 10, 2015
5dd45bd
[SPARK-8958] Dynamic allocation: change cached timeout to infinity
Jul 10, 2015
db6d57f
[CORE] [MINOR] change the log level to info
chenghao-intel Jul 10, 2015
c185f3a
[SPARK-8675] Executors created by LocalBackend won't get the same cla…
coderplay Jul 10, 2015
05ac023
[HOTFIX] fix flaky test in PySpark SQL
Jul 10, 2015
0772026
[SPARK-8923] [DOCUMENTATION, MLLIB] Add @since tags to mllib.fpm
rahulpalamuttam Jul 10, 2015
fb8807c
[SPARK-7078] [SPARK-7079] Binary processing sort for Spark SQL
JoshRosen Jul 10, 2015
857e325
[SPARK-8990] [SQL] SPARK-8990 DataFrameReader.parquet() should respec…
liancheng Jul 10, 2015
b6fc0ad
add inline comment for python tests
davies Jul 11, 2015
3363088
[SPARK-8961] [SQL] Makes BaseWriterContainer.outputWriterForRow accep…
liancheng Jul 11, 2015
6e1c7e2
[SPARK-7735] [PYSPARK] Raise Exception on non-zero exit from pipe com…
megatron-me-uk Jul 11, 2015
9c50757
[SPARK-8598] [MLLIB] Implementation of 1-sample, two-sided, Kolmogoro…
Jul 11, 2015
7f6be1f
[SPARK-6487] [MLLIB] Add sequential pattern mining algorithm PrefixSp…
zhangjiajin Jul 11, 2015
0c5207c
[SPARK-8994] [ML] tiny cleanups to Params, Pipeline
jkbradley Jul 11, 2015
c472eb1
[SPARK-8970][SQL] remove unnecessary abstraction for ExtractValue
cloud-fan Jul 11, 2015
3009088
[SPARK-8880] Fix confusing Stage.attemptId member variable
kayousterhout Jul 13, 2015
20b4743
[SPARK-9006] [PYSPARK] fix microsecond loss in Python 3
Jul 13, 2015
92540d2
[SPARK-8203] [SPARK-8204] [SQL] conditional function: least/greatest
adrian-wang Jul 13, 2015
6b89943
[SPARK-8944][SQL] Support casting between IntervalType and StringType
cloud-fan Jul 13, 2015
a5bc803
[SPARK-8596] Add module for rstudio link to spark
koaning Jul 13, 2015
7f487c8
[SPARK-6797] [SPARKR] Add support for YARN cluster mode.
Jul 13, 2015
9b62e93
[SPARK-8706] [PYSPARK] [PROJECT INFRA] Add pylint checks to PySpark
MechCoder Jul 13, 2015
5ca26fb
[SPARK-8950] [WEBUI] Correct the calculation of SchedulerDelay in Sta…
carsonwang Jul 13, 2015
79c3582
Revert "[SPARK-8706] [PYSPARK] [PROJECT INFRA] Add pylint checks to P…
davies Jul 13, 2015
5c41691
[SPARK-8954] [BUILD] Remove unneeded deb repository from Dockerfile t…
yongtang Jul 13, 2015
714fc55
[SPARK-8991] [ML] Update SharedParamsCodeGen's Generated Documentation
Jul 13, 2015
4c797f2
[SPARK-8636] [SQL] Fix equalNullSafe comparison
Jul 13, 2015
0aed38e
[SPARK-8533] [STREAMING] Upgrade Flume to 1.6.0
harishreedharan Jul 13, 2015
b7bcbe2
[SPARK-8743] [STREAMING] Deregister Codahale metrics for streaming wh…
Jul 13, 2015
408b384
[SPARK-6910] [SQL] Support for pushing predicates down to metastore f…
Jul 14, 2015
20c1434
[SPARK-9001] Fixing errors in javadocs that lead to failed build/sbt doc
jegonzal Jul 14, 2015
c1feebd
[SPARK-9010] [DOCUMENTATION] Improve the Spark Configuration document…
stanzhai Jul 14, 2015
257236c
[SPARK-6851] [SQL] function least/greatest follow up
adrian-wang Jul 14, 2015
59d820a
[SPARK-9029] [SQL] shortcut CaseKeyWhen if key is null
cloud-fan Jul 14, 2015
37f2d96
[SPARK-9027] [SQL] Generalize metastore predicate pushdown
marmbrus Jul 14, 2015
c4e98ff
[SPARK-8933] [BUILD] Provide a --force flag to build/mvn that always …
Jul 14, 2015
8fb3a65
[SPARK-8911] Fix local mode endless heartbeats
Jul 14, 2015
d267c28
[SPARK-9031] Merge BlockObjectWriter and DiskBlockObject writer to re…
JoshRosen Jul 14, 2015
0a4071e
[SPARK-8718] [GRAPHX] Improve EdgePartition2D for non perfect square …
aray Jul 14, 2015
fb1d06f
[SPARK-4072] [CORE] Display Streaming blocks in Streaming UI
zsxwing Jul 14, 2015
4b5cfc9
[SPARK-8800] [SQL] Fix inaccurate precision/scale of Decimal division…
viirya Jul 14, 2015
740b034
[SPARK-4362] [MLLIB] Make prediction probability available in NaiveBa…
srowen Jul 14, 2015
11e5c37
[SPARK-8962] Add Scalastyle rule to ban direct use of Class.forName; …
JoshRosen Jul 14, 2015
e965a79
[SPARK-9045] Fix Scala 2.11 build break in UnsafeExternalRowSorter
JoshRosen Jul 15, 2015
cc57d70
[SPARK-9050] [SQL] Remove unused newOrdering argument from Exchange (…
JoshRosen Jul 15, 2015
f957796
[SPARK-8820] [STREAMING] Add a configuration to set checkpoint dir.
SaintBacchus Jul 15, 2015
bb870e7
[SPARK-5523] [CORE] [STREAMING] Add a cache for hostname in TaskMetri…
jerryshao Jul 15, 2015
5572fd0
[HOTFIX] Adding new names to known contributors
pwendell Jul 15, 2015
f650a00
[SPARK-8808] [SPARKR] Fix assignments in SparkR.
Jul 15, 2015
f23a721
[SPARK-8993][SQL] More comprehensive type checking in expressions.
rxin Jul 15, 2015
c6b1a9e
Revert SPARK-6910 and SPARK-9027
marmbrus Jul 15, 2015
4692769
[SPARK-6259] [MLLIB] Python API for LDA
yu-iskw Jul 15, 2015
3f6296f
[SPARK-8018] [MLLIB] KMeans should accept initial cluster centers as …
FlytxtRnD Jul 15, 2015
f0e1297
[SPARK-8279][SQL]Add math function round
yjshen Jul 15, 2015
1bb8acc
[SPARK-8997] [MLLIB] Performance improvements in LocalPrefixSpan
Jul 15, 2015
14935d8
[HOTFIX][SQL] Unit test breaking.
rxin Jul 15, 2015
adb33d3
[SPARK-9012] [WEBUI] Escape Accumulators in the task table
zsxwing Jul 15, 2015
20bb10f
[SPARK-8706] [PYSPARK] [PROJECT INFRA] Add pylint checks to PySpark
MechCoder Jul 15, 2015
6f69025
[SPARK-8840] [SPARKR] Add float coercion on SparkR
viirya Jul 15, 2015
fa4ec36
[SPARK-9020][SQL] Support mutable state in code gen expressions
cloud-fan Jul 15, 2015
a938527
[SPARK-8221][SQL]Add pmod function
zhichao-li Jul 15, 2015
9716a72
[Minor][SQL] Allow spaces in the beginning and ending of string for I…
viirya Jul 15, 2015
303c120
[SPARK-7555] [DOCS] Add doc for elastic net in ml-guide and mllib-guide
coderxiang Jul 15, 2015
ec9b621
SPARK-9070 JavaDataFrameSuite teardown NPEs if setup failed
steveloughran Jul 15, 2015
536533c
[SPARK-9005] [MLLIB] Fix RegressionMetrics computation of explainedVa…
Jul 15, 2015
b9a922e
[SPARK-6602][Core]Replace Akka Serialization with Spark Serializer
zsxwing Jul 15, 2015
674eb2a
[SPARK-8974] Catch exceptions in allocation schedule task.
Jul 15, 2015
affbe32
[SPARK-9071][SQL] MonotonicallyIncreasingID and SparkPartitionID shou…
rxin Jul 15, 2015
5599cc4
Predicate pushdown to hive metastore
Jul 15, 2015
b3cb5af
Synchronize getPartitionsByFilter
Jul 17, 2015
acf96d1
Synchronize on hive client
Jul 17, 2015
f897087
Synchronize on this
Jul 17, 2015
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
[SPARK-9020][SQL] Support mutable state in code gen expressions
We can keep expressions' mutable states in generated class(like `SpecificProjection`) as member variables, so that we can read and modify them inside codegened expressions.

Author: Wenchen Fan <[email protected]>

Closes apache#7392 from cloud-fan/mutable-state and squashes the following commits:

eb3a221 [Wenchen Fan] fix order
73144d8 [Wenchen Fan] naming improvement
318f41d [Wenchen Fan] address more comments
d43b65d [Wenchen Fan] address comments
fd45c7a [Wenchen Fan] Support mutable state in code gen expressions
  • Loading branch information
cloud-fan authored and rxin committed Jul 15, 2015
commit fa4ec3606a965238423f977808163983c9d56e0a
15 changes: 14 additions & 1 deletion core/src/main/scala/org/apache/spark/TaskContext.scala
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,20 @@ object TaskContext {
*/
def get(): TaskContext = taskContext.get

private val taskContext: ThreadLocal[TaskContext] = new ThreadLocal[TaskContext]
/**
* Returns the partition id of currently active TaskContext. It will return 0
* if there is no active TaskContext for cases like local execution.
*/
def getPartitionId(): Int = {
val tc = taskContext.get()
if (tc == null) {
0
} else {
tc.partitionId()
}
}

private[this] val taskContext: ThreadLocal[TaskContext] = new ThreadLocal[TaskContext]

// Note: protected[spark] instead of private[spark] to prevent the following two from
// showing up in JavaDoc.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,18 @@ class CodeGenContext {
*/
val references: mutable.ArrayBuffer[Expression] = new mutable.ArrayBuffer[Expression]()

/**
* Holding expressions' mutable states like `MonotonicallyIncreasingID.count` as a
* 3-tuple: java type, variable name, code to init it.
* They will be kept as member variables in generated classes like `SpecificProjection`.
*/
val mutableStates: mutable.ArrayBuffer[(String, String, String)] =
mutable.ArrayBuffer.empty[(String, String, String)]

def addMutableState(javaType: String, variableName: String, initialValue: String): Unit = {
mutableStates += ((javaType, variableName, initialValue))
}

val stringType: String = classOf[UTF8String].getName
val decimalType: String = classOf[Decimal].getName

Expand Down Expand Up @@ -203,7 +215,10 @@ class CodeGenContext {
def isPrimitiveType(dt: DataType): Boolean = isPrimitiveType(javaType(dt))
}


/**
* A wrapper for generated class, defines a `generate` method so that we can pass extra objects
* into generated class.
*/
abstract class GeneratedClass {
def generate(expressions: Array[Expression]): Any
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu
${ctx.setColumn("mutableRow", e.dataType, i, evaluationCode.primitive)};
"""
}.mkString("\n")
val mutableStates = ctx.mutableStates.map { case (javaType, variableName, initialValue) =>
s"private $javaType $variableName = $initialValue;"
}.mkString("\n ")
val code = s"""
public Object generate($exprType[] expr) {
return new SpecificProjection(expr);
Expand All @@ -55,6 +58,7 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], () => Mu

private $exprType[] expressions = null;
private $mutableRowType mutableRow = null;
$mutableStates

public SpecificProjection($exprType[] expr) {
expressions = expr;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,30 +46,47 @@ object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[InternalR
protected def create(ordering: Seq[SortOrder]): Ordering[InternalRow] = {
val ctx = newCodeGenContext()

val comparisons = ordering.zipWithIndex.map { case (order, i) =>
val evalA = order.child.gen(ctx)
val evalB = order.child.gen(ctx)
val comparisons = ordering.map { order =>
val eval = order.child.gen(ctx)
val asc = order.direction == Ascending
val isNullA = ctx.freshName("isNullA")
val primitiveA = ctx.freshName("primitiveA")
val isNullB = ctx.freshName("isNullB")
val primitiveB = ctx.freshName("primitiveB")
s"""
i = a;
${evalA.code}
boolean $isNullA;
${ctx.javaType(order.child.dataType)} $primitiveA;
{
${eval.code}
$isNullA = ${eval.isNull};
$primitiveA = ${eval.primitive};
}
i = b;
${evalB.code}
if (${evalA.isNull} && ${evalB.isNull}) {
boolean $isNullB;
${ctx.javaType(order.child.dataType)} $primitiveB;
{
${eval.code}
$isNullB = ${eval.isNull};
$primitiveB = ${eval.primitive};
}
if ($isNullA && $isNullB) {
// Nothing
} else if (${evalA.isNull}) {
} else if ($isNullA) {
return ${if (order.direction == Ascending) "-1" else "1"};
} else if (${evalB.isNull}) {
} else if ($isNullB) {
return ${if (order.direction == Ascending) "1" else "-1"};
} else {
int comp = ${ctx.genComp(order.child.dataType, evalA.primitive, evalB.primitive)};
int comp = ${ctx.genComp(order.child.dataType, primitiveA, primitiveB)};
if (comp != 0) {
return ${if (asc) "comp" else "-comp"};
}
}
"""
}.mkString("\n")

val mutableStates = ctx.mutableStates.map { case (javaType, variableName, initialValue) =>
s"private $javaType $variableName = $initialValue;"
}.mkString("\n ")
val code = s"""
public SpecificOrdering generate($exprType[] expr) {
return new SpecificOrdering(expr);
Expand All @@ -78,6 +95,7 @@ object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[InternalR
class SpecificOrdering extends ${classOf[BaseOrdering].getName} {

private $exprType[] expressions = null;
$mutableStates

public SpecificOrdering($exprType[] expr) {
expressions = expr;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,17 @@ object GeneratePredicate extends CodeGenerator[Expression, (InternalRow) => Bool
protected def create(predicate: Expression): ((InternalRow) => Boolean) = {
val ctx = newCodeGenContext()
val eval = predicate.gen(ctx)
val mutableStates = ctx.mutableStates.map { case (javaType, variableName, initialValue) =>
s"private $javaType $variableName = $initialValue;"
}.mkString("\n ")
val code = s"""
public SpecificPredicate generate($exprType[] expr) {
return new SpecificPredicate(expr);
}

class SpecificPredicate extends ${classOf[Predicate].getName} {
private final $exprType[] expressions;
$mutableStates
public SpecificPredicate($exprType[] expr) {
expressions = expr;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,79 +151,84 @@ object GenerateProjection extends CodeGenerator[Seq[Expression], Projection] {
s"""if (!nullBits[$i]) arr[$i] = c$i;"""
}.mkString("\n ")

val mutableStates = ctx.mutableStates.map { case (javaType, variableName, initialValue) =>
s"private $javaType $variableName = $initialValue;"
}.mkString("\n ")

val code = s"""
public SpecificProjection generate($exprType[] expr) {
return new SpecificProjection(expr);
}

class SpecificProjection extends ${classOf[BaseProject].getName} {
private $exprType[] expressions = null;
$mutableStates

public SpecificProjection($exprType[] expr) {
expressions = expr;
}

@Override
public Object apply(Object r) {
return new SpecificRow(expressions, (InternalRow) r);
return new SpecificRow((InternalRow) r);
}
}

final class SpecificRow extends ${classOf[MutableRow].getName} {
final class SpecificRow extends ${classOf[MutableRow].getName} {

$columns
$columns

public SpecificRow($exprType[] expressions, InternalRow i) {
$initColumns
}
public SpecificRow(InternalRow i) {
$initColumns
}

public int length() { return ${expressions.length};}
protected boolean[] nullBits = new boolean[${expressions.length}];
public void setNullAt(int i) { nullBits[i] = true; }
public boolean isNullAt(int i) { return nullBits[i]; }
public int length() { return ${expressions.length};}
protected boolean[] nullBits = new boolean[${expressions.length}];
public void setNullAt(int i) { nullBits[i] = true; }
public boolean isNullAt(int i) { return nullBits[i]; }

public Object get(int i) {
if (isNullAt(i)) return null;
switch (i) {
$getCases
public Object get(int i) {
if (isNullAt(i)) return null;
switch (i) {
$getCases
}
return null;
}
return null;
}
public void update(int i, Object value) {
if (value == null) {
setNullAt(i);
return;
public void update(int i, Object value) {
if (value == null) {
setNullAt(i);
return;
}
nullBits[i] = false;
switch (i) {
$updateCases
}
}
nullBits[i] = false;
switch (i) {
$updateCases
$specificAccessorFunctions
$specificMutatorFunctions

@Override
public int hashCode() {
int result = 37;
$hashUpdates
return result;
}
}
$specificAccessorFunctions
$specificMutatorFunctions

@Override
public int hashCode() {
int result = 37;
$hashUpdates
return result;
}

@Override
public boolean equals(Object other) {
if (other instanceof SpecificRow) {
SpecificRow row = (SpecificRow) other;
$columnChecks
return true;
@Override
public boolean equals(Object other) {
if (other instanceof SpecificRow) {
SpecificRow row = (SpecificRow) other;
$columnChecks
return true;
}
return super.equals(other);
}
return super.equals(other);
}

@Override
public InternalRow copy() {
Object[] arr = new Object[${expressions.length}];
${copyColumns}
return new ${classOf[GenericInternalRow].getName}(arr);
@Override
public InternalRow copy() {
Object[] arr = new Object[${expressions.length}];
${copyColumns}
return new ${classOf[GenericInternalRow].getName}(arr);
}
}
}
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions
import org.apache.spark.TaskContext
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
import org.apache.spark.sql.types.{DataType, DoubleType}
import org.apache.spark.util.Utils
import org.apache.spark.util.random.XORShiftRandom
Expand All @@ -38,11 +39,7 @@ abstract class RDG(seed: Long) extends LeafExpression with Serializable {
* Record ID within each partition. By being transient, the Random Number Generator is
* reset every time we serialize and deserialize it.
*/
@transient protected lazy val partitionId = TaskContext.get() match {
case null => 0
case _ => TaskContext.get().partitionId()
}
@transient protected lazy val rng = new XORShiftRandom(seed + partitionId)
@transient protected lazy val rng = new XORShiftRandom(seed + TaskContext.getPartitionId)

override def deterministic: Boolean = false

Expand All @@ -61,6 +58,17 @@ case class Rand(seed: Long) extends RDG(seed) {
case IntegerLiteral(s) => s
case _ => throw new AnalysisException("Input argument to rand must be an integer literal.")
})

override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
val rngTerm = ctx.freshName("rng")
val className = classOf[XORShiftRandom].getCanonicalName
ctx.addMutableState(className, rngTerm,
s"new $className($seed + org.apache.spark.TaskContext.getPartitionId())")
ev.isNull = "false"
s"""
final ${ctx.javaType(dataType)} ${ev.primitive} = $rngTerm.nextDouble();
"""
}
}

/** Generate a random column with i.i.d. gaussian random distribution. */
Expand All @@ -73,4 +81,15 @@ case class Randn(seed: Long) extends RDG(seed) {
case IntegerLiteral(s) => s
case _ => throw new AnalysisException("Input argument to rand must be an integer literal.")
})

override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
val rngTerm = ctx.freshName("rng")
val className = classOf[XORShiftRandom].getCanonicalName
ctx.addMutableState(className, rngTerm,
s"new $className($seed + org.apache.spark.TaskContext.getPartitionId())")
ev.isNull = "false"
s"""
final ${ctx.javaType(dataType)} ${ev.primitive} = $rngTerm.nextGaussian();
"""
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.expressions
import org.apache.spark.TaskContext
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.LeafExpression
import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
import org.apache.spark.sql.types.{LongType, DataType}

/**
Expand All @@ -40,13 +41,29 @@ private[sql] case class MonotonicallyIncreasingID() extends LeafExpression {
*/
@transient private[this] var count: Long = 0L

@transient private lazy val partitionMask = TaskContext.getPartitionId.toLong << 33

override def nullable: Boolean = false

override def dataType: DataType = LongType

override def eval(input: InternalRow): Long = {
val currentCount = count
count += 1
(TaskContext.get().partitionId().toLong << 33) + currentCount
partitionMask + currentCount
}

override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
val countTerm = ctx.freshName("count")
val partitionMaskTerm = ctx.freshName("partitionMask")
ctx.addMutableState(ctx.JAVA_LONG, countTerm, "0L")
ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm,
"((long) org.apache.spark.TaskContext.getPartitionId()) << 33")

ev.isNull = "false"
s"""
final ${ctx.javaType(dataType)} ${ev.primitive} = $partitionMaskTerm + $countTerm;
$countTerm++;
"""
}
}
Loading