-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-17528][SQL] data should be copied properly before saving into InternalRow #18483
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,7 +18,9 @@ | |
| package org.apache.spark.sql.catalyst | ||
|
|
||
| import org.apache.spark.sql.catalyst.expressions._ | ||
| import org.apache.spark.sql.catalyst.util.{ArrayData, MapData} | ||
| import org.apache.spark.sql.types.{DataType, Decimal, StructType} | ||
| import org.apache.spark.unsafe.types.UTF8String | ||
|
|
||
| /** | ||
| * An abstract class for row used internally in Spark SQL, which only contains the columns as | ||
|
|
@@ -33,6 +35,10 @@ abstract class InternalRow extends SpecializedGetters with Serializable { | |
|
|
||
| def setNullAt(i: Int): Unit | ||
|
|
||
| /** | ||
| * Updates the value at column `i`. Note that after updating, the given value will be kept in this | ||
| * row, and the caller side should guarantee that this value won't be changed afterwards. | ||
| */ | ||
| def update(i: Int, value: Any): Unit | ||
|
|
||
| // default implementation (slow) | ||
|
|
@@ -58,7 +64,15 @@ abstract class InternalRow extends SpecializedGetters with Serializable { | |
| def copy(): InternalRow | ||
|
|
||
| /** Returns true if there are any NULL values in this row. */ | ||
| def anyNull: Boolean | ||
| def anyNull: Boolean = { | ||
| val len = numFields | ||
| var i = 0 | ||
| while (i < len) { | ||
| if (isNullAt(i)) { return true } | ||
| i += 1 | ||
| } | ||
| false | ||
| } | ||
|
|
||
| /* ---------------------- utility methods for Scala ---------------------- */ | ||
|
|
||
|
|
@@ -94,4 +108,21 @@ object InternalRow { | |
|
|
||
| /** Returns an empty [[InternalRow]]. */ | ||
| val empty = apply() | ||
|
|
||
| /** | ||
| * Copies the given value if it's string/struct/array/map type. | ||
| */ | ||
| def copyValue(value: Any): Any = { | ||
|
||
| if (value.isInstanceOf[UTF8String]) { | ||
|
||
| value.asInstanceOf[UTF8String].copy() | ||
| } else if (value.isInstanceOf[InternalRow]) { | ||
| value.asInstanceOf[InternalRow].copy() | ||
| } else if (value.isInstanceOf[ArrayData]) { | ||
| value.asInstanceOf[ArrayData].copy() | ||
| } else if (value.isInstanceOf[MapData]) { | ||
| value.asInstanceOf[MapData].copy() | ||
| } else { | ||
|
||
| value | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1047,7 +1047,7 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String | |
| final $rowClass $result = new $rowClass(${fieldsCasts.length}); | ||
| final InternalRow $tmpRow = $c; | ||
| $fieldsEvalCode | ||
| $evPrim = $result.copy(); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This copy is not needed, because we already copy each value when setting columns on this row. |
||
| $evPrim = $result; | ||
| """ | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -131,8 +131,6 @@ object GenerateSafeProjection extends CodeGenerator[Seq[Expression], Projection] | |
| case s: StructType => createCodeForStruct(ctx, input, s) | ||
| case ArrayType(elementType, _) => createCodeForArray(ctx, input, elementType) | ||
| case MapType(keyType, valueType, _) => createCodeForMap(ctx, input, keyType, valueType) | ||
| // UTF8String act as a pointer if it's inside UnsafeRow, so copy it to make it safe. | ||
| case StringType => ExprCode("", "false", s"$input.clone()") | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This copy is not needed, as we copy a value before updating the row with it. |
||
| case udt: UserDefinedType[_] => convertToSafe(ctx, input, udt.sqlType) | ||
| case _ => ExprCode("", "false", input) | ||
| } | ||
|
|
||
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -172,4 +172,40 @@ class GeneratedProjectionSuite extends SparkFunSuite { | |
| assert(unsafe1 === unsafe3) | ||
| assert(unsafe1.getStruct(1, 7) === unsafe3.getStruct(1, 7)) | ||
| } | ||
|
|
||
| test("MutableProjection should not cache content from the input row") { | ||
| val mutableProj = GenerateMutableProjection.generate( | ||
| Seq(BoundReference(0, new StructType().add("i", StringType), true))) | ||
| val row = new GenericInternalRow(1) | ||
| mutableProj.target(row) | ||
|
|
||
| val unsafeProj = GenerateUnsafeProjection.generate( | ||
| Seq(BoundReference(0, new StructType().add("i", StringType), true))) | ||
| val unsafeRow = unsafeProj.apply(InternalRow(InternalRow(UTF8String.fromString("a")))) | ||
|
|
||
| mutableProj.apply(unsafeRow) | ||
| assert(row.getStruct(0, 1).getString(0) == "a") | ||
|
|
||
| // Even if the input row of the mutable projection has been changed, the target mutable row | ||
| // should keep same. | ||
| unsafeProj.apply(InternalRow(InternalRow(UTF8String.fromString("b")))) | ||
| assert(row.getStruct(0, 1).getString(0).toString == "a") | ||
| } | ||
|
|
||
| test("SafeProjection should not cache content from the input row") { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This has always worked, right?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yea, I add this test to make sure my change to the |
||
| val safeProj = GenerateSafeProjection.generate( | ||
| Seq(BoundReference(0, new StructType().add("i", StringType), true))) | ||
|
|
||
| val unsafeProj = GenerateUnsafeProjection.generate( | ||
| Seq(BoundReference(0, new StructType().add("i", StringType), true))) | ||
| val unsafeRow = unsafeProj.apply(InternalRow(InternalRow(UTF8String.fromString("a")))) | ||
|
|
||
| val row = safeProj.apply(unsafeRow) | ||
| assert(row.getStruct(0, 1).getString(0) == "a") | ||
|
|
||
| // Even if the input row of the mutable projection has been changed, the target mutable row | ||
| // should keep same. | ||
| unsafeProj.apply(InternalRow(InternalRow(UTF8String.fromString("b")))) | ||
| assert(row.getStruct(0, 1).getString(0).toString == "a") | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is because `clone()` doesn't always make a copy, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps we should just make clone make an actual copy...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`UTF8String` is public to users, so I'm hesitant to change the `clone` method.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, let's leave it then