-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-21274][SQL] Add a new generator function replicate_rows to support EXCEPT ALL and INTERSECT ALL #21240
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen._ | |
| import org.apache.spark.sql.catalyst.util.{ArrayData, MapData} | ||
| import org.apache.spark.sql.types._ | ||
|
|
||
|
|
||
|
||
| /** | ||
| * An expression that produces zero or more rows given a single input row. | ||
| * | ||
|
|
@@ -222,6 +223,51 @@ case class Stack(children: Seq[Expression]) extends Generator { | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * Replicate the row N times. N is specified as the first argument to the function. | ||
|
||
| * {{{ | ||
| * SELECT replicate_rows(2, "val1", "val2") -> | ||
| * 2 val1 val2 | ||
| * 2 val1 val2 | ||
| * }}} | ||
| */ | ||
| @ExpressionDescription( | ||
| usage = "_FUNC_(n, expr1, ..., exprk) - Replicates `expr1`, ..., `exprk` into `n` rows.", | ||
|
||
| examples = """ | ||
| Examples: | ||
| > SELECT _FUNC_(2, "val1", "val2"); | ||
| 2 val1 val2 | ||
| 2 val1 val2 | ||
| """) | ||
| case class ReplicateRows(children: Seq[Expression]) extends Generator with CodegenFallback { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This can be easily implemented in codegen, so we don't need `CodegenFallback`.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @viirya If you don't mind, I would like to do it in a follow-up. |
||
| override def checkInputDataTypes(): TypeCheckResult = { | ||
| if (children.length < 2) { | ||
| TypeCheckResult.TypeCheckFailure(s"$prettyName requires at least 2 arguments.") | ||
| } else if (children.head.dataType != LongType) { | ||
| TypeCheckResult.TypeCheckFailure("The number of rows must be a positive long value.") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about this message? |
||
| } else { | ||
| TypeCheckResult.TypeCheckSuccess | ||
| } | ||
| } | ||
|
|
||
| override def elementSchema: StructType = | ||
| StructType(children.zipWithIndex.map { | ||
| case (e, index) => StructField(s"col$index", e.dataType) | ||
| }) | ||
|
|
||
| override def eval(input: InternalRow): TraversableOnce[InternalRow] = { | ||
| val numRows = children.head.eval(input).asInstanceOf[Long] | ||
| val values = children.map(_.eval(input)).toArray | ||
|
||
| Range.Long(0, numRows, 1).map { i => | ||
| val fields = new Array[Any](children.length) | ||
| for (col <- 0 until children.length) { | ||
| fields.update(col, values(col)) | ||
| } | ||
| InternalRow(fields: _*) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Wrapper around another generator to specify outer behavior. This is used to implement functions | ||
| * such as explode_outer. This expression gets replaced during analysis. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1353,6 +1353,31 @@ class TypeCoercionSuite extends AnalysisTest { | |
| SpecifiedWindowFrame(RangeFrame, CurrentRow, UnboundedFollowing)) | ||
| ) | ||
| } | ||
|
|
||
| test("type coercion for ReplicateRows") { | ||
|
||
| val rule = TypeCoercion.ReplicateRowsCoercion | ||
| // Cast is set up to promote the first expression to Long | ||
| // for numeric types. | ||
| ruleTest(rule, | ||
| ReplicateRows(Seq(1.toShort, Literal("rowdata"))), | ||
| ReplicateRows(Seq(Cast(1.toShort, LongType), Literal("rowdata")))) | ||
| ruleTest(rule, | ||
| ReplicateRows(Seq(1, Literal("rowdata"))), | ||
| ReplicateRows(Seq(Cast(1, LongType), Literal("rowdata")))) | ||
| ruleTest(rule, | ||
| ReplicateRows(Seq(1.toByte, Literal("rowdata"))), | ||
| ReplicateRows(Seq(Cast(1.toByte, LongType), Literal("rowdata")))) | ||
|
|
||
| // No cast here since the expected type is Long. | ||
| ruleTest(rule, | ||
| ReplicateRows(Seq(1L, Literal("rowdata"))), | ||
| ReplicateRows(Seq(1L, Literal("rowdata")))) | ||
|
|
||
| // No type coercion when first expression is a non numeric type. | ||
| ruleTest(rule, | ||
| ReplicateRows(Seq(Literal("invalid"), Literal("rowdata"))), | ||
| ReplicateRows(Seq(Literal("invalid"), Literal("rowdata")))) | ||
| } | ||
| } | ||
|
|
||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,29 @@ | ||
| CREATE TEMPORARY VIEW tab1 AS SELECT * FROM VALUES | ||
| (1, 'row1', 1.1), | ||
| (2, 'row2', 2.2), | ||
| (0, 'row3', 3.3), | ||
| (-1,'row4', 4.4), | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is the current behaviour for the negative-value case the same as Hive's? |
||
| (null,'row5', 5.5), | ||
| (3, 'row6', null) | ||
| AS tab1(c1, c2, c3); | ||
|
|
||
| -- c1, c2 replicated c1 times | ||
| SELECT replicate_rows(c1, c2) FROM tab1; | ||
|
|
||
| -- c1, c2, c2 replicated c1 times | ||
| SELECT replicate_rows(c1, c2, c2) FROM tab1; | ||
|
|
||
| -- c1, c2, c2, c2, c3 replicated c1 times | ||
| SELECT replicate_rows(c1, c2, c2, c2, c3) FROM tab1; | ||
|
|
||
| -- Used as a derived table in FROM clause. | ||
| SELECT c2, c1 | ||
| FROM ( | ||
| SELECT replicate_rows(c1, c2) AS (c1, c2) FROM tab1 | ||
| ); | ||
|
|
||
| -- column expression. | ||
| SELECT replicate_rows(c1, concat(c2, '...'), c2) FROM tab1; | ||
|
|
||
| -- Clean-up | ||
| DROP VIEW IF EXISTS tab1; | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,93 @@ | ||
| -- Automatically generated by SQLQueryTestSuite | ||
| -- Number of queries: 7 | ||
|
|
||
|
|
||
| -- !query 0 | ||
| CREATE TEMPORARY VIEW tab1 AS SELECT * FROM VALUES | ||
| (1, 'row1', 1.1), | ||
| (2, 'row2', 2.2), | ||
| (0, 'row3', 3.3), | ||
| (-1,'row4', 4.4), | ||
| (null,'row5', 5.5), | ||
| (3, 'row6', null) | ||
| AS tab1(c1, c2, c3) | ||
| -- !query 0 schema | ||
| struct<> | ||
| -- !query 0 output | ||
|
|
||
|
|
||
|
|
||
| -- !query 1 | ||
| SELECT replicate_rows(c1, c2) FROM tab1 | ||
| -- !query 1 schema | ||
| struct<col0:bigint,col1:string> | ||
| -- !query 1 output | ||
| 1 row1 | ||
| 2 row2 | ||
| 2 row2 | ||
| 3 row6 | ||
| 3 row6 | ||
| 3 row6 | ||
|
|
||
|
|
||
| -- !query 2 | ||
| SELECT replicate_rows(c1, c2, c2) FROM tab1 | ||
| -- !query 2 schema | ||
| struct<col0:bigint,col1:string,col2:string> | ||
| -- !query 2 output | ||
| 1 row1 row1 | ||
| 2 row2 row2 | ||
| 2 row2 row2 | ||
| 3 row6 row6 | ||
| 3 row6 row6 | ||
| 3 row6 row6 | ||
|
|
||
|
|
||
| -- !query 3 | ||
| SELECT replicate_rows(c1, c2, c2, c2, c3) FROM tab1 | ||
| -- !query 3 schema | ||
| struct<col0:bigint,col1:string,col2:string,col3:string,col4:decimal(2,1)> | ||
| -- !query 3 output | ||
| 1 row1 row1 row1 1.1 | ||
| 2 row2 row2 row2 2.2 | ||
| 2 row2 row2 row2 2.2 | ||
| 3 row6 row6 row6 NULL | ||
| 3 row6 row6 row6 NULL | ||
| 3 row6 row6 row6 NULL | ||
|
|
||
|
|
||
| -- !query 4 | ||
| SELECT c2, c1 | ||
| FROM ( | ||
| SELECT replicate_rows(c1, c2) AS (c1, c2) FROM tab1 | ||
| ) | ||
| -- !query 4 schema | ||
| struct<c2:string,c1:bigint> | ||
| -- !query 4 output | ||
| row1 1 | ||
| row2 2 | ||
| row2 2 | ||
| row6 3 | ||
| row6 3 | ||
| row6 3 | ||
|
|
||
|
|
||
| -- !query 5 | ||
| SELECT replicate_rows(c1, concat(c2, '...'), c2) FROM tab1 | ||
| -- !query 5 schema | ||
| struct<col0:bigint,col1:string,col2:string> | ||
| -- !query 5 output | ||
| 1 row1... row1 | ||
| 2 row2... row2 | ||
| 2 row2... row2 | ||
| 3 row6... row6 | ||
| 3 row6... row6 | ||
| 3 row6... row6 | ||
|
|
||
|
|
||
| -- !query 6 | ||
| DROP VIEW IF EXISTS tab1 | ||
| -- !query 6 schema | ||
| struct<> | ||
| -- !query 6 output | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -307,6 +307,37 @@ class GeneratorFunctionSuite extends QueryTest with SharedSQLContext { | |
| sql("select * from values 1, 2 lateral view outer empty_gen() a as b"), | ||
| Row(1, null) :: Row(2, null) :: Nil) | ||
| } | ||
|
|
||
| test("ReplicateRows generator") { | ||
|
||
| val df = spark.range(1) | ||
|
|
||
| // An empty DataFrame suppresses the result generation | ||
| checkAnswer(spark.emptyDataFrame.selectExpr("replicate_rows(1, 1, 2, 3)"), Nil) | ||
|
|
||
| checkAnswer(df.selectExpr("replicate_rows(1, 2.5)"), Row(1, 2.5) :: Nil) | ||
| checkAnswer(df.selectExpr("replicate_rows(1, null)"), Row(1, null) :: Nil) | ||
| checkAnswer(df.selectExpr("replicate_rows(3, 'row1')"), | ||
| Row(3, "row1") :: Row(3, "row1") :: Row(3, "row1") :: Nil) | ||
| checkAnswer(df.selectExpr("replicate_rows(-1, 2.5)"), Nil) | ||
|
|
||
| // At least two arguments are required. | ||
|
||
| val msg1 = intercept[AnalysisException] { | ||
| df.selectExpr("replicate_rows(1)") | ||
| }.getMessage | ||
| assert(msg1.contains("requires at least 2 arguments")) | ||
|
|
||
| // The number of rows (first argument) must be a long value. | ||
| val msg2 = intercept[AnalysisException] { | ||
| df.selectExpr("replicate_rows('a', 1)") | ||
| }.getMessage | ||
| assert(msg2.contains("The number of rows must be a positive long value.")) | ||
|
|
||
| val msg3 = intercept[AnalysisException] { | ||
| df.selectExpr("replicate_rows(null, 1)") | ||
| }.getMessage | ||
| assert(msg3.contains("The number of rows must be a positive long value.")) | ||
|
|
||
| } | ||
| } | ||
|
|
||
| case class EmptyGenerator() extends Generator { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit:
`LongType` does not seem to be necessary here. We can avoid re-entering the following pattern matching if it is already long type.