-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-24588][SS] streaming join should require HashClusteredPartitioning from children #21587
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -68,50 +68,42 @@ case object AllTuples extends Distribution { | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * Represents data where tuples that share the same values for the `clustering` | ||
| * [[Expression Expressions]] will be co-located. Based on the context, this | ||
| * can mean such tuples are either co-located in the same partition or they will be contiguous | ||
| * within a single partition. | ||
| */ | ||
| case class ClusteredDistribution( | ||
| clustering: Seq[Expression], | ||
| requiredNumPartitions: Option[Int] = None) extends Distribution { | ||
| abstract class ClusteredDistributionBase(exprs: Seq[Expression]) extends Distribution { | ||
| require( | ||
| clustering != Nil, | ||
| "The clustering expressions of a ClusteredDistribution should not be Nil. " + | ||
| exprs.nonEmpty, | ||
| s"The clustering expressions of a ${getClass.getSimpleName} should not be empty. " + | ||
| "An AllTuples should be used to represent a distribution that only has " + | ||
| "a single partition.") | ||
|
|
||
| override def createPartitioning(numPartitions: Int): Partitioning = { | ||
| assert(requiredNumPartitions.isEmpty || requiredNumPartitions.get == numPartitions, | ||
| s"This ClusteredDistribution requires ${requiredNumPartitions.get} partitions, but " + | ||
| s"This ${getClass.getSimpleName} requires ${requiredNumPartitions.get} partitions, but " + | ||
| s"the actual number of partitions is $numPartitions.") | ||
| HashPartitioning(clustering, numPartitions) | ||
| HashPartitioning(exprs, numPartitions) | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Represents data where tuples have been clustered according to the hash of the given | ||
| * `expressions`. The hash function is defined as `HashPartitioning.partitionIdExpression`, so only | ||
| * Represents data where tuples that share the same values for the `clustering` | ||
| * [[Expression Expressions]] will be co-located. Based on the context, this | ||
| * can mean such tuples are either co-located in the same partition or they will be contiguous | ||
| * within a single partition. | ||
| */ | ||
| case class ClusteredDistribution( | ||
| clustering: Seq[Expression], | ||
| requiredNumPartitions: Option[Int] = None) extends ClusteredDistributionBase(clustering) | ||
|
|
||
| /** | ||
| * Represents data where tuples have been clustered according to the hash of the given expressions. | ||
| * The hash function is defined as [[HashPartitioning.partitionIdExpression]], so only | ||
| * [[HashPartitioning]] can satisfy this distribution. | ||
| * | ||
| * This is a strictly stronger guarantee than [[ClusteredDistribution]]. Given a tuple and the | ||
| * number of partitions, this distribution strictly requires which partition the tuple should be in. | ||
| */ | ||
| case class HashClusteredDistribution(expressions: Seq[Expression]) extends Distribution { | ||
| require( | ||
| expressions != Nil, | ||
| "The expressions for hash of a HashPartitionedDistribution should not be Nil. " + | ||
| "An AllTuples should be used to represent a distribution that only has " + | ||
| "a single partition.") | ||
|
|
||
| override def requiredNumPartitions: Option[Int] = None | ||
|
|
||
| override def createPartitioning(numPartitions: Int): Partitioning = { | ||
| HashPartitioning(expressions, numPartitions) | ||
| } | ||
| } | ||
| case class HashClusteredDistribution( | ||
| hashExprs: Seq[Expression], | ||
| requiredNumPartitions: Option[Int] = None) extends ClusteredDistributionBase(hashExprs) | ||
|
|
||
| /** | ||
| * Represents data where tuples have been ordered according to the `ordering` | ||
|
|
@@ -170,8 +162,10 @@ trait Partitioning { | |
| def satisfies(required: Distribution): Boolean = required match { | ||
| case UnspecifiedDistribution => true | ||
| case AllTuples => numPartitions == 1 | ||
| case _ => false | ||
| case _ => required.requiredNumPartitions.forall(_ == numPartitions) && satisfies0(required) | ||
| } | ||
|
|
||
| protected def satisfies0(required: Distribution): Boolean = false | ||
| } | ||
|
|
||
| case class UnknownPartitioning(numPartitions: Int) extends Partitioning | ||
|
|
@@ -186,9 +180,8 @@ case class RoundRobinPartitioning(numPartitions: Int) extends Partitioning | |
| case object SinglePartition extends Partitioning { | ||
| val numPartitions = 1 | ||
|
|
||
| override def satisfies(required: Distribution): Boolean = required match { | ||
| override def satisfies0(required: Distribution): Boolean = required match { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we add docs to explain what is satisfies0 and how it different from satisfies?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added in the base class |
||
| case _: BroadcastDistribution => false | ||
| case ClusteredDistribution(_, Some(requiredNumPartitions)) => requiredNumPartitions == 1 | ||
| case _ => true | ||
| } | ||
| } | ||
|
|
@@ -205,18 +198,15 @@ case class HashPartitioning(expressions: Seq[Expression], numPartitions: Int) | |
| override def nullable: Boolean = false | ||
| override def dataType: DataType = IntegerType | ||
|
|
||
| override def satisfies(required: Distribution): Boolean = { | ||
| super.satisfies(required) || { | ||
| required match { | ||
| case h: HashClusteredDistribution => | ||
| expressions.length == h.expressions.length && expressions.zip(h.expressions).forall { | ||
| case (l, r) => l.semanticEquals(r) | ||
| } | ||
| case ClusteredDistribution(requiredClustering, requiredNumPartitions) => | ||
| expressions.forall(x => requiredClustering.exists(_.semanticEquals(x))) && | ||
| (requiredNumPartitions.isEmpty || requiredNumPartitions.get == numPartitions) | ||
| case _ => false | ||
| } | ||
| override def satisfies0(required: Distribution): Boolean = { | ||
| required match { | ||
| case h: HashClusteredDistribution => | ||
| expressions.length == h.hashExprs.length && expressions.zip(h.hashExprs).forall { | ||
| case (l, r) => l.semanticEquals(r) | ||
| } | ||
| case c: ClusteredDistribution => | ||
| expressions.forall(x => c.clustering.exists(_.semanticEquals(x))) | ||
| case _ => false | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -246,17 +236,14 @@ case class RangePartitioning(ordering: Seq[SortOrder], numPartitions: Int) | |
| override def nullable: Boolean = false | ||
| override def dataType: DataType = IntegerType | ||
|
|
||
| override def satisfies(required: Distribution): Boolean = { | ||
| super.satisfies(required) || { | ||
| required match { | ||
| case OrderedDistribution(requiredOrdering) => | ||
| val minSize = Seq(requiredOrdering.size, ordering.size).min | ||
| requiredOrdering.take(minSize) == ordering.take(minSize) | ||
| case ClusteredDistribution(requiredClustering, requiredNumPartitions) => | ||
| ordering.map(_.child).forall(x => requiredClustering.exists(_.semanticEquals(x))) && | ||
| (requiredNumPartitions.isEmpty || requiredNumPartitions.get == numPartitions) | ||
| case _ => false | ||
| } | ||
| override def satisfies0(required: Distribution): Boolean = { | ||
| required match { | ||
| case OrderedDistribution(requiredOrdering) => | ||
| val minSize = Seq(requiredOrdering.size, ordering.size).min | ||
| requiredOrdering.take(minSize) == ordering.take(minSize) | ||
| case ClusteredDistribution(requiredClustering, _) => | ||
| ordering.map(_.child).forall(x => requiredClustering.exists(_.semanticEquals(x))) | ||
| case _ => false | ||
| } | ||
| } | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I do not see any new tests in the DistributionSuite. I feel like issues likes this should have specified unit tests in DistributionSuite and shouldnt have to rely on StreamingJoinSuite.