Skip to content

Commit a83e868

Browse files
sarutakdongjoon-hyun
authored andcommitted
[SPARK-31416][SQL] Check more strictly that a field name can be used as a valid Java identifier for codegen
### What changes were proposed in this pull request? Check more strictly that a field name can be used as a valid Java identifier in `ScalaReflection.serializerFor` To check that, `SourceVersion` is used so that we need not add reserved keywords to be checked manually for the future Java versions (e.g, underscore, var, yield), . ### Why are the changes needed? In the current implementation, `enum` is not checked even though it's a reserved keyword. Also, there are lots of characters and sequences of character including numeric literals but they are not checked. So we can't get better error message with following code. ``` case class Data(`0`: Int) Seq(Data(1)).toDF.show 20/04/11 03:24:24 ERROR CodeGenerator: failed to compile: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 43, Column 1: Expression "value_0 = value_3" is not a type org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 43, Column 1: Expression "value_0 = value_3" is not a type ... ``` ### Does this PR introduce any user-facing change? Yes. With this change and the code example above, we can get following error message. ``` java.lang.UnsupportedOperationException: `0` is not a valid identifier of Java and cannot be used as field name - root class: "Data" ... ``` ### How was this patch tested? Add another assertion to existing test case. Closes apache#28184 from sarutak/improve-identifier-check. Authored-by: Kousuke Saruta <[email protected]> Signed-off-by: Dongjoon Hyun <[email protected]> (cherry picked from commit 6cd0bef) Signed-off-by: Dongjoon Hyun <[email protected]>
1 parent 36b0767 commit a83e868

File tree

4 files changed

+32
-22
lines changed

4 files changed

+32
-22
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717

1818
package org.apache.spark.sql.catalyst
1919

20+
import javax.lang.model.SourceVersion
21+
2022
import org.apache.commons.lang3.reflect.ConstructorUtils
2123

2224
import org.apache.spark.internal.Logging
@@ -539,9 +541,10 @@ object ScalaReflection extends ScalaReflection {
539541

540542
val params = getConstructorParameters(t)
541543
val fields = params.map { case (fieldName, fieldType) =>
542-
if (javaKeywords.contains(fieldName)) {
543-
throw new UnsupportedOperationException(s"`$fieldName` is a reserved keyword and " +
544-
"cannot be used as field name\n" + walkedTypePath)
544+
if (SourceVersion.isKeyword(fieldName) ||
545+
!SourceVersion.isIdentifier(encodeFieldNameToIdentifier(fieldName))) {
546+
throw new UnsupportedOperationException(s"`$fieldName` is not a valid identifier of " +
547+
"Java and cannot be used as field name\n" + walkedTypePath)
545548
}
546549

547550
// SPARK-26730 inputObject won't be null with If's guard below. And KnownNotNul
@@ -782,13 +785,6 @@ object ScalaReflection extends ScalaReflection {
782785
}
783786
}
784787

785-
private val javaKeywords = Set("abstract", "assert", "boolean", "break", "byte", "case", "catch",
786-
"char", "class", "const", "continue", "default", "do", "double", "else", "extends", "false",
787-
"final", "finally", "float", "for", "goto", "if", "implements", "import", "instanceof", "int",
788-
"interface", "long", "native", "new", "null", "package", "private", "protected", "public",
789-
"return", "short", "static", "strictfp", "super", "switch", "synchronized", "this", "throw",
790-
"throws", "transient", "true", "try", "void", "volatile", "while")
791-
792788
val typeJavaMapping = Map[DataType, Class[_]](
793789
BooleanType -> classOf[Boolean],
794790
ByteType -> classOf[Byte],
@@ -847,6 +843,10 @@ object ScalaReflection extends ScalaReflection {
847843
Seq.empty
848844
}
849845
}
846+
847+
def encodeFieldNameToIdentifier(fieldName: String): String = {
848+
TermName(fieldName).encodedName.toString
849+
}
850850
}
851851

852852
/**

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ import org.apache.spark.{SparkConf, SparkEnv}
2828
import org.apache.spark.serializer._
2929
import org.apache.spark.sql.Row
3030
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, ScalaReflection}
31-
import org.apache.spark.sql.catalyst.ScalaReflection.universe.TermName
3231
import org.apache.spark.sql.catalyst.encoders.RowEncoder
3332
import org.apache.spark.sql.catalyst.expressions._
3433
import org.apache.spark.sql.catalyst.expressions.codegen._
@@ -311,7 +310,7 @@ case class Invoke(
311310
override def nullable: Boolean = targetObject.nullable || needNullCheck || returnNullable
312311
override def children: Seq[Expression] = targetObject +: arguments
313312

314-
private lazy val encodedFunctionName = TermName(functionName).encodedName.toString
313+
private lazy val encodedFunctionName = ScalaReflection.encodeFieldNameToIdentifier(functionName)
315314

316315
@transient lazy val method = targetObject.dataType match {
317316
case ObjectType(cls) =>

sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1222,14 +1222,6 @@ class DatasetSuite extends QueryTest
12221222
assert(result == Set(ClassData("a", 1) -> null, ClassData("b", 2) -> ClassData("x", 2)))
12231223
}
12241224

1225-
test("better error message when use java reserved keyword as field name") {
1226-
val e = intercept[UnsupportedOperationException] {
1227-
Seq(InvalidInJava(1)).toDS()
1228-
}
1229-
assert(e.getMessage.contains(
1230-
"`abstract` is a reserved keyword and cannot be used as field name"))
1231-
}
1232-
12331225
test("Dataset should support flat input object to be null") {
12341226
checkDataset(Seq("a", null).toDS(), "a", null)
12351227
}
@@ -1949,8 +1941,6 @@ case class ClassNullableData(a: String, b: Integer)
19491941
case class NestedStruct(f: ClassData)
19501942
case class DeepNestedStruct(f: NestedStruct)
19511943

1952-
case class InvalidInJava(`abstract`: Int)
1953-
19541944
/**
19551945
* A class used to test serialization using encoders. This class throws exceptions when using
19561946
* Java serialization -- so the only way it can be "serialized" is through our encoders.

sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,9 +74,14 @@ case class ComplexReflectData(
7474
mapFieldContainsNull: Map[Int, Option[Long]],
7575
dataField: Data)
7676

77+
case class InvalidInJava(`abstract`: Int)
78+
7779
class ScalaReflectionRelationSuite extends SparkFunSuite with SharedSparkSession {
7880
import testImplicits._
7981

82+
// To avoid syntax error thrown by genjavadoc, make this case class non-top level and private.
83+
private case class InvalidInJava2(`0`: Int)
84+
8085
test("query case class RDD") {
8186
val data = ReflectData("a", 1, 1L, 1.toFloat, 1.toDouble, 1.toShort, 1.toByte, true,
8287
new java.math.BigDecimal(1), Date.valueOf("1970-01-01"), new Timestamp(12345), Seq(1, 2, 3),
@@ -142,4 +147,20 @@ class ScalaReflectionRelationSuite extends SparkFunSuite with SharedSparkSession
142147
Map(10 -> 100L, 20 -> 200L, 30 -> null),
143148
Row(null, "abc"))))
144149
}
150+
151+
test("better error message when use java reserved keyword as field name") {
152+
val e = intercept[UnsupportedOperationException] {
153+
Seq(InvalidInJava(1)).toDS()
154+
}
155+
assert(e.getMessage.contains(
156+
"`abstract` is not a valid identifier of Java and cannot be used as field name"))
157+
}
158+
159+
test("better error message when use invalid java identifier as field name") {
160+
val e1 = intercept[UnsupportedOperationException] {
161+
Seq(InvalidInJava2(1)).toDS()
162+
}
163+
assert(e1.getMessage.contains(
164+
"`0` is not a valid identifier of Java and cannot be used as field name"))
165+
}
145166
}

0 commit comments

Comments
 (0)