-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-8777] [SQL] Add random data generator test utilities to Spark SQL #7176
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
d2b4a4a
ab76cbd
5acdd5c
b55875a
0c20905
89d86b1
e0d7d49
f71634d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,46 +17,68 @@ | |
|
|
||
| package org.apache.spark.sql | ||
|
|
||
| import org.scalacheck.Prop.{exists, forAll, secure} | ||
| import org.scalatest.prop.Checkers | ||
|
|
||
| import org.apache.spark.SparkFunSuite | ||
| import org.apache.spark.sql.catalyst.CatalystTypeConverters | ||
| import org.apache.spark.sql.types._ | ||
|
|
||
| /** | ||
| * Tests of [[RandomDataGenerator]]. | ||
| */ | ||
| class RandomDataGeneratorSuite extends SparkFunSuite { | ||
| class RandomDataGeneratorSuite extends SparkFunSuite with Checkers { | ||
|
|
||
| /** | ||
| * Tests random data generation for the given type by using it to generate random values then | ||
| * converting those values into their Catalyst equivalents using CatalystTypeConverters. | ||
| */ | ||
| def testRandomDataGeneration(dataType: DataType, nullable: Boolean = true): Unit = { | ||
| val toCatalyst = CatalystTypeConverters.createToCatalystConverter(dataType) | ||
| RandomDataGenerator.forType(dataType, nullable, Some(42L)).foreach { generator => | ||
| for (_ <- 1 to 10) { | ||
| val generatedValue = generator() | ||
| val convertedValue = toCatalyst(generatedValue) | ||
| if (!nullable) { | ||
| assert(convertedValue !== null) | ||
| } | ||
| } | ||
| val generator = RandomDataGenerator.forType(dataType, nullable).getOrElse { | ||
| fail(s"Random data generator was not defined for $dataType") | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should we throw an exception if no generator is defined for the given `dataType`?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good idea; this uncovered the fact that I forgot to implement a generator for Timestamp. |
||
|
|
||
| if (nullable) { | ||
| check(exists(generator) { _ == null }) | ||
| } | ||
| if (!nullable) { | ||
| check(forAll(generator) { _ != null }) | ||
| } | ||
| check(secure(forAll(generator) { v => { toCatalyst(v); true } })) | ||
| } | ||
|
|
||
| // Basic types: | ||
|
|
||
| (DataTypeTestUtils.atomicTypes ++ DataTypeTestUtils.atomicArrayTypes).foreach { dataType => | ||
| test(s"$dataType") { | ||
| for ( | ||
| dataType <- DataTypeTestUtils.atomicTypes; | ||
| nullable <- Seq(true, false) | ||
| if !dataType.isInstanceOf[DecimalType] || | ||
| dataType.asInstanceOf[DecimalType].precisionInfo.isEmpty | ||
| ) { | ||
| test(s"$dataType (nullable=$nullable)") { | ||
| testRandomDataGeneration(dataType) | ||
| } | ||
| } | ||
|
|
||
| // Complex types: | ||
| for ( | ||
| arrayType <- DataTypeTestUtils.atomicArrayTypes | ||
| if RandomDataGenerator.forType(arrayType.elementType, arrayType.containsNull).isDefined | ||
| ) { | ||
| test(s"$arrayType") { | ||
| testRandomDataGeneration(arrayType) | ||
| } | ||
| } | ||
|
|
||
| val atomicTypesWithDataGenerators = | ||
| DataTypeTestUtils.atomicTypes.filter(RandomDataGenerator.forType(_).isDefined) | ||
|
|
||
| // Complex types: | ||
| for ( | ||
| keyType <- DataTypeTestUtils.atomicTypes; | ||
| valueType <- DataTypeTestUtils.atomicTypes | ||
| keyType <- atomicTypesWithDataGenerators; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably use |
||
| valueType <- atomicTypesWithDataGenerators | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Value type of a map can be any type.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I'm going to come back and fix this.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It might be worth deferring this to a followup patch; I think that it should work but don't want to necessarily test all maps-of-maps or maps-of-arrays here. |
||
| // Scala's BigDecimal.hashCode can lead to OutOfMemoryError on Scala 2.10 (see SI-6173) and | ||
| // Spark can hit NumberFormatException errors when converting certain BigDecimals (SPARK-8802). | ||
| // For these reasons, we don't support generation of maps with decimal keys. | ||
| if !keyType.isInstanceOf[DecimalType] | ||
| ) { | ||
| val mapType = MapType(keyType, valueType) | ||
| test(s"$mapType") { | ||
|
|
@@ -65,8 +87,8 @@ class RandomDataGeneratorSuite extends SparkFunSuite { | |
| } | ||
|
|
||
| for ( | ||
| colOneType <- DataTypeTestUtils.atomicTypes; | ||
| colTwoType <- DataTypeTestUtils.atomicTypes | ||
| colOneType <- atomicTypesWithDataGenerators; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Remove the trailing `;`.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We need the semicolons when we're doing a for comprehension over multiple inputs.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh I didn't notice you were using
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hm, to me it is less clear to drop the `;` here, although I don't have a strong preference.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't have a super strong preference either. |
||
| colTwoType <- atomicTypesWithDataGenerators | ||
| ) { | ||
| val structType = StructType(StructField("a", colOneType) :: StructField("b", colTwoType) :: Nil) | ||
| test(s"$structType") { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note that I mapped a list into an Array instead of using
`Arbitrary.arbitrary[Array[Byte]]` because the latter seems to run into a possible ScalaCheck NPE bug when we wrap the resulting generator to sometimes return nulls. I've filed typelevel/scalacheck#177 to try to investigate this upstream. AFAIK the only downside to the workaround here is that we lose some of the failing test case minimization benefits that ScalaCheck provides.