Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -392,13 +392,13 @@ case class FilterEstimation(plan: Filter) extends Logging {
val dataType = attr.dataType
var newNdv = ndv

if (ndv.toDouble == 0 || colStat.min.isEmpty || colStat.max.isEmpty) {
return Some(0.0)
}

// use [min, max] to filter the original hSet
dataType match {
case _: NumericType | BooleanType | DateType | TimestampType =>
if (ndv.toDouble == 0 || colStat.min.isEmpty || colStat.max.isEmpty) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we always have min/max for integral types? cc @wzhfy

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

min/max can be None when the table is empty.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

min/max can be None if the column contains only null values. This is exactly the case for my query.

return Some(0.0)
}

val statsInterval =
ValueInterval(colStat.min, colStat.max, dataType).asInstanceOf[NumericValueInterval]
val validQuerySet = hSet.filter { v =>
Expand All @@ -422,6 +422,10 @@ case class FilterEstimation(plan: Filter) extends Logging {

// We assume the whole set since there is no min/max information for String/Binary type
case StringType | BinaryType =>
if (ndv.toDouble == 0) {
return Some(0.0)
}

newNdv = ndv.min(BigInt(hSet.size))
if (update) {
val newStats = colStat.copy(distinctCount = Some(newNdv), nullCount = Some(0))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,18 @@ class FilterEstimationSuite extends StatsEstimationTestBase {
expectedRowCount = 0)
}

test("evaluateInSet with string") {
  // Input statistics for the string column: 10 distinct values, no nulls, and no
  // min/max recorded (min = None, max = None).
  val inputStat = ColumnStat(
    distinctCount = Some(10),
    min = None,
    max = None,
    nullCount = Some(0),
    avgLen = Some(2),
    maxLen = Some(2))

  // Child plan with 10 rows exposing only the string attribute.
  val childPlan = StatsTestPlan(
    Seq(attrString),
    10,
    AttributeMap(Seq(attrString -> inputStat)))

  // After `attrString IN ("A0")` the expected stats differ from the input only in
  // the distinct count, which drops to 1 (one value in the IN set).
  val expectedStat = inputStat.copy(distinctCount = Some(1))

  validateEstimatedStats(
    Filter(InSet(attrString, Set("A0")), childPlan),
    Seq(attrString -> expectedStat),
    expectedRowCount = 1)
}

test("cint NOT IN (3, 4, 5)") {
validateEstimatedStats(
Filter(Not(InSet(attrInt, Set(3, 4, 5))), childStatsTestPlan(Seq(attrInt), 10L)),
Expand Down