Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
move CatalogStatistics into the same file of CatalogTable
  • Loading branch information
wangzhenhua committed Dec 19, 2016
commit 5dbaadec80bdb8f2cc620f59a9c28f659d3fe5de
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@ package org.apache.spark.sql.catalyst.catalog

import java.util.Date

import scala.collection.mutable

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.{FunctionIdentifier, InternalRow, TableIdentifier}
import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Literal}
import org.apache.spark.sql.catalyst.plans.logical.{CatalogStatistics, LeafNode, LogicalPlan}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Cast, Literal}
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.util.quoteIdentifier
import org.apache.spark.sql.types.{StructField, StructType}

Expand Down Expand Up @@ -241,6 +243,38 @@ case class CatalogTable(
}


/**
* This class of statistics is used in [[CatalogTable]] to interact with metastore.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a few words explaining why we don't use Statistics for CatalogTable?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

*/
case class CatalogStatistics(
sizeInBytes: BigInt,
rowCount: Option[BigInt] = None,
colStats: Map[String, ColumnStat] = Map.empty) {

/**
* Convert [[CatalogStatistics]] to [[Statistics]], and match column stats to attributes based
* on column names.
*/
def convert(attributes: Seq[Attribute]): Statistics = {
Copy link
Contributor

@cloud-fan cloud-fan Dec 22, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a bad name, it doesn't tell anything, without looking at the doc.
How about def toPlanStats(planOutput: ...)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a lot better, thanks!

val matched = mutable.HashMap[Attribute, ColumnStat]()
attributes.foreach { attr =>
if (colStats.contains(attr.name)) {
matched.put(attr, colStats(attr.name))
}
}
Copy link
Contributor

@cloud-fan cloud-fan Dec 22, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

attributes.flatMap(a => colStats.get(a.name).map(a -> _)).toMap

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

Statistics(sizeInBytes = sizeInBytes, rowCount = rowCount,
attributeStats = AttributeMap(matched.toSeq))
}

/** Readable string representation for the CatalogStatistics. */
def simpleString: String = {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you define a simpleString instead of overriding toString?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because we don't print column stats in it, it's not a "complete" string representation. Column stats can be too much and make CatalogTable unreadable.

Seq(s"sizeInBytes=$sizeInBytes",
if (rowCount.isDefined) s"rowCount=${rowCount.get}" else ""
).filter(_.nonEmpty).mkString(", ")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

val rowCountString = if (rowCount.isDefined) s", ${rowCount.get} rows" else ""
s"$sizeInBytes bytes$rowCountString" 

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

}
}


case class CatalogTableType private(name: String)
object CatalogTableType {
val EXTERNAL = new CatalogTableType("EXTERNAL")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,10 @@

package org.apache.spark.sql.catalyst.plans.logical

import scala.collection.mutable
import scala.util.control.NonFatal

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{AnalysisException, Row}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.aggregate._
import org.apache.spark.sql.types._
Expand Down Expand Up @@ -63,37 +61,6 @@ case class Statistics(
}
}

/**
 * This class of Statistics is used in [[CatalogTable]] to interact with metastore.
 *
 * Column statistics are keyed by column name here; [[convert]] re-keys them by attribute
 * to produce a [[Statistics]].
 *
 * @param sizeInBytes size value carried over unchanged into the converted [[Statistics]]
 * @param rowCount optional row count; omitted from [[simpleString]] when it is `None`
 * @param colStats column statistics keyed by column name (empty by default)
 */
case class CatalogStatistics(
    sizeInBytes: BigInt,
    rowCount: Option[BigInt] = None,
    colStats: Map[String, ColumnStat] = Map.empty) {

  /**
   * Convert [[CatalogStatistics]] to [[Statistics]], and match column stats to attributes based
   * on column names. Attributes whose name has no entry in `colStats` are left out of the result.
   *
   * @param attributes the attributes to look up in `colStats` by name
   * @return a [[Statistics]] with the same `sizeInBytes` and `rowCount`, and the matched
   *         column stats keyed by attribute
   */
  def convert(attributes: Seq[Attribute]): Statistics = {
    // A single Option-based lookup per attribute replaces the contains/apply pair and the
    // mutable accumulator; pairs are kept in the order of `attributes`.
    val matched = attributes.flatMap(attr => colStats.get(attr.name).map(attr -> _))
    Statistics(sizeInBytes = sizeInBytes, rowCount = rowCount,
      attributeStats = AttributeMap(matched))
  }

  /**
   * Readable string representation for the CatalogStatistics. Only `sizeInBytes` and, when
   * defined, `rowCount` are rendered; column stats are not part of the string.
   */
  def simpleString: String = {
    // The Option contributes zero or one element, so no empty-string placeholder is needed.
    val rowCountPart = rowCount.map(count => s"rowCount=$count")
    (Seq(s"sizeInBytes=$sizeInBytes") ++ rowCountPart).mkString(", ")
  }
}


/**
* Statistics collected for a column.
Expand Down