Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
)).toDF("id", "items")
val model = new FPGrowth().setMinSupport(0.7).fit(dataset)
val prediction = model.transform(df)
assert(prediction.select("prediction").where("id=3").first().getSeq[String](0).isEmpty)
assert(prediction.where("id=3").select("prediction").first().getSeq[String](0).isEmpty)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm worried that existing Spark applications may already rely on this pattern in their code, so whether it's a bug or not, it has effectively become a feature now and we can't break it...

}

test("FPGrowth prediction should not contain duplicates") {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1023,8 +1023,6 @@ class Analyzer(
* clause. This rule detects such queries and adds the required attributes to the original
* projection, so that they will be available during sorting. Another projection is added to
* remove these attributes after sorting.
*
* The HAVING clause could also used a grouping columns that is not presented in the SELECT.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is by design.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For example,

select type, avg (price)
from titles
group by type
having sum (total_sales) > 10000

This example is copied from Sybase ASE. I believe this is part of Transact-SQL

Copy link
Member Author

@viirya viirya May 7, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is by design.

For the case where the HAVING clause uses grouping columns that are not present in the SELECT, yes.

For other general cases, I doubt it.

We have another rule handling this (a HAVING clause referring to grouping columns). That is why the tests still pass after this rule is removed. The above query also works without this rule.

Copy link
Member

@gatorsmile gatorsmile May 7, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we introduced this by accident, I do not think we can remove it now. It could break the applications that are built on it. cc @rxin @cloud-fan @marmbrus

*/
object ResolveMissingReferences extends Rule[LogicalPlan] {
def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators {
Expand All @@ -1051,26 +1049,6 @@ class Analyzer(
// in Sort
case ae: AnalysisException => s
}

case f @ Filter(cond, child) if child.resolved =>
try {
val newCond = resolveExpressionRecursively(cond, child)
val requiredAttrs = newCond.references.filter(_.resolved)
val missingAttrs = requiredAttrs -- child.outputSet
if (missingAttrs.nonEmpty) {
// Add missing attributes and then project them away.
Project(child.output,
Filter(newCond, addMissingAttr(child, missingAttrs)))
} else if (newCond != cond) {
f.copy(condition = newCond)
} else {
f
}
} catch {
// Attempting to resolve it might fail. When this happens, return the original plan.
// Users will see an AnalysisException for resolution failure of missing attributes
case ae: AnalysisException => f
}
}

/**
Expand Down
16 changes: 16 additions & 0 deletions sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1844,4 +1844,20 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
.filter($"x1".isNotNull || !$"y".isin("a!"))
.count
}

test("Unresolvable attribute in Filter should throw analysis exception") {
val df = Seq((1, "a"), (2, "b"), (3, "c")).toDF("x", "y")
val e1 = intercept[AnalysisException](df.select("y").where("x=1"))
assert(e1.message.contains("cannot resolve '`x`'"))

Seq(1).toDF("c1").createOrReplaceTempView("onerow")
val e2 = intercept[AnalysisException] {
sql(
"""
| select 1
| from (select 1 from onerow t2 LIMIT 1)
| where t2.c1=1""".stripMargin)
}
assert(e2.message.contains("cannot resolve '`t2.c1`'"))
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -655,7 +655,7 @@ class SubquerySuite extends QueryTest with SharedSQLContext {
"""
| select c1 from onerow t1
| where exists (select 1
| from (select 1 from onerow t2 LIMIT 1)
| from (select 1 as c1 from onerow t2 LIMIT 1) t2
| where t1.c1=t2.c1)""".stripMargin),
Row(1) :: Nil)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@ class HiveContextCompatibilitySuite extends SparkFunSuite with BeforeAndAfterEac
import _hc.implicits._
val df1 = (1 to 20).map { i => (i, i) }.toDF("a", "x")
val df2 = (1 to 100).map { i => (i, i % 10, i % 2 == 0) }.toDF("a", "b", "c")
.select($"a", $"b")
.filter($"a" > 10 && $"b" > 6 && $"c")
.select($"a", $"b")
val df3 = df1.join(df2, "a")
val res = df3.collect()
val expected = Seq((18, 18, 8)).toDF("a", "x", "b").collect()
Expand Down