
Commit cbe9230

eatoncys authored and HyukjinKwon committed
[SPARK-26312][SQL] Replace RDDConversions.rowToRowRdd with RowEncoder to improve its conversion performance
## What changes were proposed in this pull request?

`RDDConversions` would get disproportionately slower as the number of columns in the query increased, because the `converters` it built were a `scala.collection.immutable.::` (a `List`), whose indexed access is linear in the index. This PR removes `RDDConversions` and uses `RowEncoder` to convert each `Row` to an `InternalRow` instead.

The `PrunedScanSuite` test with 2000 columns and 20k rows takes 409 seconds before this PR and 361 seconds after it.

## How was this patch tested?

Existing test cases in `PrunedScanSuite`.

Closes #23262 from eatoncys/toarray.

Authored-by: 10129659 <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
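The cost described above comes from indexing into a `List` inside the per-row loop. A minimal, self-contained sketch of that access pattern (illustrative only; the object name, sizes, and identity converters are made up and are not part of the patch):

```scala
// Sketch: why converters(i) inside the row loop degrades as columns grow.
// Seq.fill returns a List here, and List.apply(i) walks i cells, so converting
// one row of n columns costs O(n^2) pointer hops. An Array gives O(1) lookups;
// the patch goes further and drops the converter list entirely in favour of RowEncoder.
object ListIndexingSketch {
  def main(args: Array[String]): Unit = {
    val numColumns = 2000                                                    // hypothetical wide schema
    val converters: Seq[Any => Any] = Seq.fill(numColumns)(identity[Any] _)  // backed by a List

    def convertRowWithSeq(row: Array[Any]): Unit = {
      var i = 0
      while (i < numColumns) {
        converters(i)(row(i))                                // linear-time index into the List
        i += 1
      }
    }

    val fastConverters = converters.toArray                  // constant-time indexed access
    def convertRowWithArray(row: Array[Any]): Unit = {
      var i = 0
      while (i < numColumns) {
        fastConverters(i)(row(i))
        i += 1
      }
    }

    val row = Array.fill[Any](numColumns)("x")
    convertRowWithSeq(row)    // quadratic in numColumns
    convertRowWithArray(row)  // linear in numColumns
  }
}
```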
1 parent: 05cf81e

File tree: 2 files changed (+7, -44 lines)

sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala

Lines changed: 2 additions & 42 deletions
@@ -18,54 +18,14 @@
 package org.apache.spark.sql.execution

 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{Encoder, Row, SparkSession}
-import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
+import org.apache.spark.sql.{Encoder, SparkSession}
+import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning}
 import org.apache.spark.sql.catalyst.util.truncatedString
 import org.apache.spark.sql.execution.metric.SQLMetrics
-import org.apache.spark.sql.types.DataType
-
-object RDDConversions {
-  def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[InternalRow] = {
-    data.mapPartitions { iterator =>
-      val numColumns = outputTypes.length
-      val mutableRow = new GenericInternalRow(numColumns)
-      val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter)
-      iterator.map { r =>
-        var i = 0
-        while (i < numColumns) {
-          mutableRow(i) = converters(i)(r.productElement(i))
-          i += 1
-        }
-
-        mutableRow
-      }
-    }
-  }
-
-  /**
-   * Convert the objects inside Row into the types Catalyst expected.
-   */
-  def rowToRowRdd(data: RDD[Row], outputTypes: Seq[DataType]): RDD[InternalRow] = {
-    data.mapPartitions { iterator =>
-      val numColumns = outputTypes.length
-      val mutableRow = new GenericInternalRow(numColumns)
-      val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter)
-      iterator.map { r =>
-        var i = 0
-        while (i < numColumns) {
-          mutableRow(i) = converters(i)(r(i))
-          i += 1
-        }
-
-        mutableRow
-      }
-    }
-  }
-}

 object ExternalRDD {

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,11 @@ import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, Quali
2929
import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala
3030
import org.apache.spark.sql.catalyst.analysis._
3131
import org.apache.spark.sql.catalyst.catalog._
32+
import org.apache.spark.sql.catalyst.encoders.RowEncoder
3233
import org.apache.spark.sql.catalyst.expressions
3334
import org.apache.spark.sql.catalyst.expressions._
3435
import org.apache.spark.sql.catalyst.planning.PhysicalOperation
3536
import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoDir, InsertIntoTable, LogicalPlan, Project}
36-
import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning
3737
import org.apache.spark.sql.catalyst.rules.Rule
3838
import org.apache.spark.sql.execution.{RowDataSourceScanExec, SparkPlan}
3939
import org.apache.spark.sql.execution.command._
@@ -416,7 +416,10 @@ case class DataSourceStrategy(conf: SQLConf) extends Strategy with Logging with
416416
output: Seq[Attribute],
417417
rdd: RDD[Row]): RDD[InternalRow] = {
418418
if (relation.relation.needConversion) {
419-
execution.RDDConversions.rowToRowRdd(rdd, output.map(_.dataType))
419+
val converters = RowEncoder(StructType.fromAttributes(output))
420+
rdd.mapPartitions { iterator =>
421+
iterator.map(converters.toRow)
422+
}
420423
} else {
421424
rdd.asInstanceOf[RDD[InternalRow]]
422425
}
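For readers unfamiliar with `RowEncoder`, a minimal standalone sketch of the conversion the new hunk performs. The two-column schema is hypothetical; in the patch the schema comes from `StructType.fromAttributes(output)`, and `toRow` is the `ExpressionEncoder` method available at the time of this commit:

```scala
// Sketch (assumed setup, not part of the patch): one RowEncoder is built per
// schema and then reused for every row in the partition, instead of looking up
// a per-column converter from a List on each field.
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object RowEncoderSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical schema standing in for StructType.fromAttributes(output).
    val schema = StructType(Seq(
      StructField("id", IntegerType),
      StructField("name", StringType)))

    val encoder = RowEncoder(schema)                          // as in toCatalystRDD after this patch
    val internal: InternalRow = encoder.toRow(Row(1, "a"))    // external Row -> Catalyst InternalRow
    println(internal.numFields)                               // 2
  }
}
```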
