@@ -63,7 +63,7 @@ import org.apache.spark.sql.types._
> SELECT _FUNC_(10.0, array(0.5, 0.4, 0.1), 100);
[10.0,10.0,10.0]
> SELECT _FUNC_(10.0, 0.5, 100);
- 10.0
+ 10
""",
since = "2.1.0")
case class ApproximatePercentile(
@@ -223,7 +223,7 @@ case class VarianceSamp(child: Expression) extends CentralMomentAgg(child) {
examples = """
Examples:
> SELECT _FUNC_(col) FROM VALUES (-10), (-20), (100), (1000) AS tab(col);
- 1.1135657469022013
+ 1.1135657469022011
> SELECT _FUNC_(col) FROM VALUES (-1000), (-100), (10), (20) AS tab(col);
-1.1135657469022011
""",
@@ -245,9 +245,9 @@ case class Skewness(child: Expression) extends CentralMomentAgg(child) {
examples = """
Examples:
> SELECT _FUNC_(col) FROM VALUES (-10), (-20), (100), (1000) AS tab(col);
- -0.7014368047529618
+ -0.7014368047529627
> SELECT _FUNC_(col) FROM VALUES (1), (10), (100), (10), (1) as tab(col);
- 0.19432323191698986
+ 0.19432323191699075
""",
since = "1.6.0")
case class Kurtosis(child: Expression) extends CentralMomentAgg(child) {
@@ -422,9 +422,9 @@ case class CreateNamedStructUnsafe(children: Seq[Expression]) extends CreateName
examples = """
Examples:
> SELECT _FUNC_('a:1,b:2,c:3', ',', ':');
- map("a":"1","b":"2","c":"3")
+ {"a":"1","b":"2","c":"3"}
> SELECT _FUNC_('a');
- map("a":null)
+ {"a":null}
""")
// scalastyle:on line.size.limit
case class StringToMap(text: Expression, pairDelim: Expression, keyValueDelim: Expression)
@@ -40,7 +40,7 @@ import org.apache.spark.unsafe.types.UTF8String
examples = """
Examples:
> SELECT _FUNC_('1, 0.8', 'a INT, b DOUBLE');
- {"a":1, "b":0.8}
+ {"a":1,"b":0.8}
> SELECT _FUNC_('26/08/2015', 'time Timestamp', map('timestampFormat', 'dd/MM/yyyy'))
{"time":2015-08-26 00:00:00.0}
""",
@@ -199,7 +199,7 @@ case class SchemaOfCsv(
> SELECT _FUNC_(named_struct('a', 1, 'b', 2));
1,2
> SELECT _FUNC_(named_struct('time', to_timestamp('2015-08-26', 'yyyy-MM-dd')), map('timestampFormat', 'dd/MM/yyyy'));
- "26/08/2015"
+ 26/08/2015
""",
since = "3.0.0")
// scalastyle:on line.size.limit
@@ -631,7 +631,7 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti
examples = """
Examples:
> SELECT _FUNC_('2016-04-08', 'yyyy-MM-dd');
- 1460041200
+ 1460098800
""",
since = "1.6.0")
case class ToUnixTimestamp(
@@ -842,7 +842,7 @@ abstract class UnixTime extends ToTimestamp {
examples = """
Examples:
> SELECT _FUNC_(0, 'yyyy-MM-dd HH:mm:ss');
- 1970-01-01 00:00:00
+ 1969-12-31 16:00:00

Member:
Oh, surprising.

Member Author (@MaxGekk, Sep 26, 2019):
The yyyy-MM-dd HH:mm:ss pattern does not contain a time zone sub-pattern. If you add one, you will see something like:

spark-sql> SELECT from_unixtime(0, 'yyyy-MM-dd HH:mm:ssXXX');
1970-01-01 03:00:00+03:00

Member Author:
And you can change your current time zone to UTC to see 1970-01-01 00:00:00:

spark-sql> set spark.sql.session.timeZone=UTC;
spark.sql.session.timeZone	UTC
spark-sql> SELECT from_unixtime(0, 'yyyy-MM-dd HH:mm:ssXXX');
1970-01-01 00:00:00Z

Member:
Yeah. The time zone issue will cause failures on machines in different time zones.

Member Author:
But the time zone is forcibly set to "America/Los_Angeles" in tests:

TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"))

.createWithDefaultFunction(() => TimeZone.getDefault.getID)

""",
since = "1.5.0")
case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[String] = None)
@@ -1766,7 +1766,7 @@ case class MakeDate(year: Expression, month: Expression, day: Expression)
> SELECT _FUNC_(2014, 12, 28, 6, 30, 45.887);
2014-12-28 06:30:45.887
> SELECT _FUNC_(2014, 12, 28, 6, 30, 45.887, 'CET');
- 2014-12-28 10:30:45.887
+ 2014-12-27 21:30:45.887
> SELECT _FUNC_(2019, 6, 30, 23, 59, 60)
2019-07-01 00:00:00
> SELECT _FUNC_(2019, 13, 1, 10, 11, 12, 13);
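
Both corrected outputs above follow from rendering the same instant in the America/Los_Angeles time zone pinned by the test harness: epoch second 0 is 1969-12-31 16:00:00 there, and 06:30 CET on 2014-12-28 is 21:30 on 2014-12-27 there. A minimal sketch with plain java.time (an editorial illustration, not Spark's internal formatter code) reproducing the from_unixtime(0, ...) case:

import java.time.{Instant, ZoneId}
import java.time.format.DateTimeFormatter

object FromUnixTimeZeroExample {
  def main(args: Array[String]): Unit = {
    val fmt = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")
    val epoch = Instant.ofEpochSecond(0)
    // Epoch second 0 is 1970-01-01 00:00:00 when rendered in UTC ...
    println(fmt.format(epoch.atZone(ZoneId.of("UTC"))))
    // ... and 1969-12-31 16:00:00 in America/Los_Angeles (UTC-8 at that instant),
    // which is the value now shown in the from_unixtime example.
    println(fmt.format(epoch.atZone(ZoneId.of("America/Los_Angeles"))))
  }
}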
@@ -463,7 +463,7 @@ case class ArrayExists(
> SELECT _FUNC_(array(1, null, 3), x -> x % 2 == 0);
false
> SELECT _FUNC_(array(2, null, 8), x -> x % 2 == 0);
- null
+ NULL
""",
since = "3.0.0")
case class ArrayForAll(
@@ -331,15 +331,15 @@ case class GetJsonObject(json: Expression, path: Expression)
  }
}

-// scalastyle:off line.size.limit
+// scalastyle:off
@ExpressionDescription(
usage = "_FUNC_(jsonStr, p1, p2, ..., pn) - Returns a tuple like the function get_json_object, but it takes multiple names. All the input parameters and output column types are string.",
examples = """
Examples:
> SELECT _FUNC_('{"a":1, "b":2}', 'a', 'b');
- 1 2
+ 1	2
""")
-// scalastyle:on line.size.limit
+// scalastyle:on
case class JsonTuple(children: Seq[Expression])
extends Generator with CodegenFallback {

@@ -502,9 +502,9 @@ case class JsonTuple(children: Seq[Expression])
examples = """
Examples:
> SELECT _FUNC_('{"a":1, "b":0.8}', 'a INT, b DOUBLE');
- {"a":1, "b":0.8}
+ {"a":1,"b":0.8}
> SELECT _FUNC_('{"time":"26/08/2015"}', 'time Timestamp', map('timestampFormat', 'dd/MM/yyyy'));
- {"time":"2015-08-26 00:00:00.0"}
+ {"time":2015-08-26 00:00:00.0}
""",
since = "2.2.0")
// scalastyle:on line.size.limit
@@ -1291,7 +1291,7 @@ abstract class RoundBase(child: Expression, scale: Expression,
examples = """
Examples:
> SELECT _FUNC_(2.5, 0);
- 3.0
+ 3
""")
// scalastyle:on line.size.limit
case class Round(child: Expression, scale: Expression)
@@ -1311,7 +1311,7 @@ case class Round(child: Expression, scale: Expression)
examples = """
Examples:
> SELECT _FUNC_(2.5, 0);
- 2.0
+ 2
""")
// scalastyle:on line.size.limit
case class BRound(child: Expression, scale: Expression)
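
The two corrected outputs differ because round uses the HALF_UP rounding mode while bround uses HALF_EVEN (banker's rounding), so 2.5 at scale 0 becomes 3 and 2 respectively. A small stand-alone illustration with java.math (an editorial sketch, not Spark's RoundBase code):

import java.math.RoundingMode
import java.math.{BigDecimal => JBigDecimal}

object RoundingModeExample {
  def main(args: Array[String]): Unit = {
    val x = new JBigDecimal("2.5")
    println(x.setScale(0, RoundingMode.HALF_UP))   // 3 -- matches round(2.5, 0)
    println(x.setScale(0, RoundingMode.HALF_EVEN)) // 2 -- matches bround(2.5, 0)
  }
}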
@@ -847,7 +847,7 @@ object StringTrimLeft {
usage = """
_FUNC_(str) - Removes the leading space characters from `str`.

- _FUNC_(trimStr, str) - Removes the leading string contains the characters from the trim string
+ _FUNC_(str, trimStr) - Removes the leading string contains the characters from the trim string
""",
arguments = """
Arguments:
@@ -858,7 +858,7 @@ object StringTrimLeft {
Examples:
> SELECT _FUNC_(' SparkSQL ');
SparkSQL
- > SELECT _FUNC_('Sp', 'SSparkSQLS');
+ > SELECT _FUNC_('SparkSQLS', 'Sp');
arkSQLS
""",
since = "1.5.0")
@@ -949,7 +949,7 @@ object StringTrimRight {
usage = """
_FUNC_(str) - Removes the trailing space characters from `str`.

- _FUNC_(trimStr, str) - Removes the trailing string which contains the characters from the trim string from the `str`
+ _FUNC_(str, trimStr) - Removes the trailing string which contains the characters from the trim string from the `str`
""",
arguments = """
Arguments:
@@ -960,7 +960,7 @@ object StringTrimRight {
Examples:
> SELECT _FUNC_(' SparkSQL ');
SparkSQL
- > SELECT _FUNC_('LQSa', 'SSparkSQLS');
+ > SELECT _FUNC_('SSparkSQLS', 'SQLS');
SSpark
""",
since = "1.5.0")
@@ -194,7 +194,7 @@ case class XPathString(xml: Expression, path: Expression) extends XPathExtract {
examples = """
Examples:
> SELECT _FUNC_('<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>','a/b/text()');
- ['b1','b2','b3']
+ ["b1","b2","b3"]
""")
// scalastyle:on line.size.limit
case class XPathList(xml: Expression, path: Expression) extends XPathExtract {
50 changes: 50 additions & 0 deletions sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -26,6 +26,7 @@ import org.apache.spark.{AccumulatorSuite, SparkException}
import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart}
import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation
import org.apache.spark.sql.catalyst.util.StringUtils
+ import org.apache.spark.sql.execution.HiveResult.hiveResultString
import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, SortAggregateExec}
import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec
import org.apache.spark.sql.execution.datasources.v2.BatchScanExec
@@ -140,6 +141,55 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession {
}
}

test("check outputs of expression examples") {
Member:
The fixes are good. How long does this take to run, BTW? Just want to make sure it's not huge to rerun this every time, though I agree testing examples is useful.

Member Author:
~15 seconds on my laptop.

Member:
Could we do it in parallel?

Member Author:
I will do that.

Member Author:
Running the test in parallel takes ~5-6 seconds on my laptop now (a sketch of one way to parallelize the check appears after the test body below).

val exampleRe = ">(.+);\n(.+)".r
val ignoreSet = Set(
// One of examples shows getting the current timestamp
"org.apache.spark.sql.catalyst.expressions.UnixTimestamp",
// Random output without a seed
"org.apache.spark.sql.catalyst.expressions.Rand",
"org.apache.spark.sql.catalyst.expressions.Randn",
"org.apache.spark.sql.catalyst.expressions.Shuffle",
"org.apache.spark.sql.catalyst.expressions.Uuid",
"org.apache.spark.sql.catalyst.expressions.CallMethodViaReflection",
// TODO: handle multiline output, look at the DOTALL flag
"org.apache.spark.sql.catalyst.expressions.GroupingID",
"org.apache.spark.sql.catalyst.expressions.Stack",
"org.apache.spark.sql.catalyst.expressions.PosExplode",
"org.apache.spark.sql.catalyst.expressions.Explode",
"org.apache.spark.sql.catalyst.expressions.Cube",
"org.apache.spark.sql.catalyst.expressions.Inline",
"org.apache.spark.sql.catalyst.expressions.Rollup",
"org.apache.spark.sql.catalyst.expressions.Grouping",
// Fails on parsing `SELECT 2 mod 1.8`:
Member Author:
What should I do about all the exceptions? @dongjoon-hyun @srowen Open a separate JIRA ticket for each case?

Member:
I think it's fine to just fix what you have so far. I think it's fine to fix additional ones here, too. I don't think you need to fix each of them individually unless you feel they're logically distinct.

Member Author:
I have fixed 3 out of 4. I will open a ticket for the last one.

Member Author (@MaxGekk, Sep 26, 2019):
Fixed the last one as well.

// org.apache.spark.sql.catalyst.parser.ParseException:
// extraneous input '1.8' expecting <EOF>(line 1, pos 14)
"org.apache.spark.sql.catalyst.expressions.Remainder",
// Fails on `SELECT make_timestamp(2019, 13, 1, 10, 11, 12, 13)`:
// Invalid ID for region-based ZoneId, invalid format: 13
// java.time.DateTimeException: Invalid ID for region-based ZoneId, invalid format: 13
"org.apache.spark.sql.catalyst.expressions.MakeTimestamp")

withSQLConf(SQLConf.UTC_TIMESTAMP_FUNC_ENABLED.key -> "true") {
spark.sessionState.functionRegistry.listFunction().foreach { funcId =>
val info = spark.sessionState.catalog.lookupFunctionInfo(funcId)
val className = info.getClassName
if (!ignoreSet.contains(className)) {
withClue(s"Function '${info.getName}', Expression class '$className'") {
exampleRe.findAllIn(info.getExamples).toList.foreach(_ match {
case exampleRe(sql, output) =>
val df = spark.sql(sql)
val actual = hiveResultString(df.queryExecution.executedPlan).mkString("\n").trim
val expected = output.trim
assert(actual === expected)
case other => throw new IllegalArgumentException(other)
})
}
}
}
}
}
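
For reference, the parallel variant mentioned in the review thread could look roughly like the sketch below. It reuses exampleRe, ignoreSet, and hiveResultString from the test body above; the .par call and the CollectionConverters import are assumptions about the build (parallel collections are built into Scala 2.12 and a separate module on 2.13), and this is not the PR's final code.

// Hypothetical parallel form of the loop in the test above.
import scala.collection.parallel.CollectionConverters._ // only needed on Scala 2.13

spark.sessionState.functionRegistry.listFunction().par.foreach { funcId =>
  val info = spark.sessionState.catalog.lookupFunctionInfo(funcId)
  if (!ignoreSet.contains(info.getClassName)) {
    exampleRe.findAllIn(info.getExamples).matchData.foreach { m =>
      // group(1) is the example SQL statement, group(2) the documented output
      val plan = spark.sql(m.group(1)).queryExecution.executedPlan
      assert(hiveResultString(plan).mkString("\n").trim === m.group(2).trim)
    }
  }
}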

test("SPARK-6743: no columns from cache") {
Seq(
(83, 0, 38),