Skip to content

Commit f94247e

Browse files
gatorsmile authored and jiangxb1987 committed
[SPARK-27770][SQL][PART 1] Port AGGREGATES.sql
## What changes were proposed in this pull request?

This PR ports AGGREGATES.sql from the PostgreSQL regression tests: https://github.com/postgres/postgres/blob/02ddd499322ab6f2f0d58692955dc9633c2150fc/src/test/regress/sql/aggregates.sql#L1-L143

The expected results can be found at: https://github.com/postgres/postgres/blob/master/src/test/regress/expected/aggregates.out

While porting the test cases, we found three PostgreSQL-specific features that do not exist in Spark SQL:
- https://issues.apache.org/jira/browse/SPARK-27765: Type Casts: expression::type
- https://issues.apache.org/jira/browse/SPARK-27766: Data type: POINT(x, y)
- https://issues.apache.org/jira/browse/SPARK-27767: Built-in function: generate_series

We also found two bugs:
- https://issues.apache.org/jira/browse/SPARK-27768: Infinity, -Infinity, NaN should be recognized in a case insensitive manner
- https://issues.apache.org/jira/browse/SPARK-27769: Handling of sublinks within outer-level aggregates.

This PR also fixes the error message shown when a column can't be resolved.

To run the regression tests, this PR also adds three tables, `aggtest`, `onek` and `tenk1`, from the PostgreSQL data sets: https://github.com/postgres/postgres/tree/02ddd499322ab6f2f0d58692955dc9633c2150fc/src/test/regress/data

## How was this patch tested?

N/A

Closes apache#24640 from gatorsmile/addTestCase.

Authored-by: gatorsmile <[email protected]>
Signed-off-by: Xingbo Jiang <[email protected]>
1 parent c1e5557 commit f94247e

File tree

10 files changed

+11620
-3
lines changed

10 files changed

+11620
-3
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ trait CheckAnalysis extends PredicateHelper {
109109

110110
operator transformExpressionsUp {
111111
case a: Attribute if !a.resolved =>
112-
val from = operator.inputSet.map(_.qualifiedName).mkString(", ")
112+
val from = operator.inputSet.toSeq.map(_.qualifiedName).mkString(", ")
113113
a.failAnalysis(s"cannot resolve '${a.sql}' given input columns: [$from]")
114114

115115
case e: Expression if e.checkInputDataTypes().isFailure =>

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ class AnalysisErrorSuite extends AnalysisTest {
223223
errorTest(
224224
"sorting by attributes are not from grouping expressions",
225225
testRelation2.groupBy('a, 'c)('a, 'c, count('a).as("a3")).orderBy('b.asc),
226-
"cannot resolve" :: "'`b`'" :: "given input columns" :: "[a, c, a3]" :: Nil)
226+
"cannot resolve" :: "'`b`'" :: "given input columns" :: "[a, a3, c]" :: Nil)
227227

228228
errorTest(
229229
"non-boolean filters",
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
--
2+
-- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
3+
--
4+
--
5+
-- AGGREGATES [Part 1]
6+
-- https://github.com/postgres/postgres/blob/02ddd499322ab6f2f0d58692955dc9633c2150fc/src/test/regress/sql/aggregates.sql#L1-L143
7+
8+
-- avoid bit-exact output here because operations may not be bit-exact.
9+
-- SET extra_float_digits = 0;
10+
11+
SELECT avg(four) AS avg_1 FROM onek;
12+
13+
SELECT avg(a) AS avg_32 FROM aggtest WHERE a < 100;
14+
15+
-- In PostgreSQL 7.1, avg(float4) is computed using float8 arithmetic.
16+
-- Round the result to 3 digits to avoid platform-specific results.
17+
18+
select CAST(avg(b) AS Decimal(10,3)) AS avg_107_943 FROM aggtest;
19+
-- `student` has a column with data type POINT, which is not supported by Spark [SPARK-27766]
20+
-- SELECT avg(gpa) AS avg_3_4 FROM ONLY student;
21+
22+
SELECT sum(four) AS sum_1500 FROM onek;
23+
SELECT sum(a) AS sum_198 FROM aggtest;
24+
SELECT sum(b) AS avg_431_773 FROM aggtest;
25+
-- `student` has a column with data type POINT, which is not supported by Spark [SPARK-27766]
26+
-- SELECT sum(gpa) AS avg_6_8 FROM ONLY student;
27+
28+
SELECT max(four) AS max_3 FROM onek;
29+
SELECT max(a) AS max_100 FROM aggtest;
30+
SELECT max(aggtest.b) AS max_324_78 FROM aggtest;
31+
-- `student` has a column with data type POINT, which is not supported by Spark [SPARK-27766]
32+
-- SELECT max(student.gpa) AS max_3_7 FROM student;
33+
34+
SELECT stddev_pop(b) FROM aggtest;
35+
SELECT stddev_samp(b) FROM aggtest;
36+
SELECT var_pop(b) FROM aggtest;
37+
SELECT var_samp(b) FROM aggtest;
38+
39+
SELECT stddev_pop(CAST(b AS Decimal(38,0))) FROM aggtest;
40+
SELECT stddev_samp(CAST(b AS Decimal(38,0))) FROM aggtest;
41+
SELECT var_pop(CAST(b AS Decimal(38,0))) FROM aggtest;
42+
SELECT var_samp(CAST(b AS Decimal(38,0))) FROM aggtest;
43+
44+
-- population variance is defined for a single tuple, sample variance
45+
-- is not
46+
SELECT var_pop(1.0), var_samp(2.0);
47+
SELECT stddev_pop(CAST(3.0 AS Decimal(38,0))), stddev_samp(CAST(4.0 AS Decimal(38,0)));
48+
49+
50+
-- verify correct results for null and NaN inputs
51+
select sum(CAST(null AS int)) from range(1,4);
52+
select sum(CAST(null AS long)) from range(1,4);
53+
select sum(CAST(null AS Decimal(38,0))) from range(1,4);
54+
select sum(CAST(null AS DOUBLE)) from range(1,4);
55+
select avg(CAST(null AS int)) from range(1,4);
56+
select avg(CAST(null AS long)) from range(1,4);
57+
select avg(CAST(null AS Decimal(38,0))) from range(1,4);
58+
select avg(CAST(null AS DOUBLE)) from range(1,4);
59+
select sum(CAST('NaN' AS DOUBLE)) from range(1,4);
60+
select avg(CAST('NaN' AS DOUBLE)) from range(1,4);
61+
62+
-- [SPARK-27768] verify correct results for infinite inputs
63+
SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE))
64+
FROM (VALUES (CAST('1' AS DOUBLE)), (CAST('Infinity' AS DOUBLE))) v(x);
65+
SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE))
66+
FROM (VALUES ('Infinity'), ('1')) v(x);
67+
SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE))
68+
FROM (VALUES ('Infinity'), ('Infinity')) v(x);
69+
SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE))
70+
FROM (VALUES ('-Infinity'), ('Infinity')) v(x);
71+
72+
73+
-- test accuracy with a large input offset
74+
SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE))
75+
FROM (VALUES (100000003), (100000004), (100000006), (100000007)) v(x);
76+
SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE))
77+
FROM (VALUES (7000000000005), (7000000000007)) v(x);
78+
79+
-- SQL2003 binary aggregates [SPARK-23907]
80+
-- SELECT regr_count(b, a) FROM aggtest;
81+
-- SELECT regr_sxx(b, a) FROM aggtest;
82+
-- SELECT regr_syy(b, a) FROM aggtest;
83+
-- SELECT regr_sxy(b, a) FROM aggtest;
84+
-- SELECT regr_avgx(b, a), regr_avgy(b, a) FROM aggtest;
85+
-- SELECT regr_r2(b, a) FROM aggtest;
86+
-- SELECT regr_slope(b, a), regr_intercept(b, a) FROM aggtest;
87+
SELECT covar_pop(b, a), covar_samp(b, a) FROM aggtest;
88+
SELECT corr(b, a) FROM aggtest;
89+
90+
91+
-- test accum and combine functions directly [SPARK-23907]
92+
-- CREATE TABLE regr_test (x float8, y float8);
93+
-- INSERT INTO regr_test VALUES (10,150),(20,250),(30,350),(80,540),(100,200);
94+
-- SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x)
95+
-- FROM regr_test WHERE x IN (10,20,30,80);
96+
-- SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x)
97+
-- FROM regr_test;
98+
-- SELECT float8_accum('{4,140,2900}'::float8[], 100);
99+
-- SELECT float8_regr_accum('{4,140,2900,1290,83075,15050}'::float8[], 200, 100);
100+
-- SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x)
101+
-- FROM regr_test WHERE x IN (10,20,30);
102+
-- SELECT count(*), sum(x), regr_sxx(y,x), sum(y),regr_syy(y,x), regr_sxy(y,x)
103+
-- FROM regr_test WHERE x IN (80,100);
104+
-- SELECT float8_combine('{3,60,200}'::float8[], '{0,0,0}'::float8[]);
105+
-- SELECT float8_combine('{0,0,0}'::float8[], '{2,180,200}'::float8[]);
106+
-- SELECT float8_combine('{3,60,200}'::float8[], '{2,180,200}'::float8[]);
107+
-- SELECT float8_regr_combine('{3,60,200,750,20000,2000}'::float8[],
108+
-- '{0,0,0,0,0,0}'::float8[]);
109+
-- SELECT float8_regr_combine('{0,0,0,0,0,0}'::float8[],
110+
-- '{2,180,200,740,57800,-3400}'::float8[]);
111+
-- SELECT float8_regr_combine('{3,60,200,750,20000,2000}'::float8[],
112+
-- '{2,180,200,740,57800,-3400}'::float8[]);
113+
-- DROP TABLE regr_test;
114+
115+
116+
-- test count, distinct
117+
SELECT count(four) AS cnt_1000 FROM onek;
118+
SELECT count(DISTINCT four) AS cnt_4 FROM onek;
119+
120+
select ten, count(*), sum(four) from onek
121+
group by ten order by ten;
122+
123+
select ten, count(four), sum(DISTINCT four) from onek
124+
group by ten order by ten;
125+
126+
-- user-defined aggregates
127+
-- SELECT newavg(four) AS avg_1 FROM onek;
128+
-- SELECT newsum(four) AS sum_1500 FROM onek;
129+
-- SELECT newcnt(four) AS cnt_1000 FROM onek;
130+
-- SELECT newcnt(*) AS cnt_1000 FROM onek;
131+
-- SELECT oldcnt(*) AS cnt_1000 FROM onek;
132+
-- SELECT sum2(q1,q2) FROM int8_tbl;
133+
134+
-- test for outer-level aggregates
135+
136+
-- this should work
137+
select ten, sum(distinct four) from onek a
138+
group by ten
139+
having exists (select 1 from onek b where sum(distinct a.four) = b.four);
140+
141+
-- this should fail because the subquery has an aggregate of its own in its WHERE clause
142+
select ten, sum(distinct four) from onek a
143+
group by ten
144+
having exists (select 1 from onek b
145+
where sum(distinct a.four + b.four) = b.four);
146+
147+
-- [SPARK-27769] Test handling of sublinks within outer-level aggregates.
148+
-- Per bug report from Daniel Grace.
149+
select
150+
(select max((select i.unique2 from tenk1 i where i.unique1 = o.unique1)))
151+
from tenk1 o;

0 commit comments

Comments
 (0)