address comments
cloud-fan committed Aug 31, 2017
commit a29031db10b4c0a2dd87d7f033068feb0f64d14a
@@ -29,12 +29,7 @@
@InterfaceStability.Unstable
public interface CatalystFilterPushDownSupport {
@dongjoon-hyun dongjoon-hyun Aug 17, 2017

In V2, can we introduce PlanPushDownSupport, too? Oops, sorry. I found that it's documented as out of scope.

boolean pushDownCatalystFilter(Expression filter);

This interface is very nice. Just wondering: for a data source to implement this method, is it OK for the implementation to access subtypes of Expression in Spark, for example functions like Abs?

Owner Author
Implementations can pattern-match the given expression and access any subtype they like.

/**
* Push down one filter, returns true if this filter can be pushed down to this data source,
* false otherwise. This method might be called many times if more than one filter need to be
* pushed down.
*
* TODO: we can also make it `Expression[] pushDownCatalystFilters(Expression[] filters)` which
* returns unsupported filters.
* Pushes down filters and returns the unsupported filters.
*/
boolean pushDownCatalystFilter(Expression filter);
Expression[] pushDownCatalystFilters(Expression[] filters);
}
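
As a rough sketch of the reply above (the class name and the accepted expression shapes are invented for illustration; this is not part of the commit), an implementation of the new `pushDownCatalystFilters` might pattern-match Catalyst expression subtypes like this:

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.catalyst.expressions.Abs;
import org.apache.spark.sql.catalyst.expressions.Expression;
import org.apache.spark.sql.catalyst.expressions.GreaterThan;

class MyCatalystFilterPushDown implements CatalystFilterPushDownSupport {
  private final List<Expression> pushed = new ArrayList<>();

  @Override
  public Expression[] pushDownCatalystFilters(Expression[] filters) {
    List<Expression> unsupported = new ArrayList<>();
    for (Expression filter : filters) {
      // Accept e.g. `abs(col) > something`; hand everything else back to Spark.
      if (filter instanceof GreaterThan && ((GreaterThan) filter).left() instanceof Abs) {
        pushed.add(filter);
      } else {
        unsupported.add(filter);
      }
    }
    return unsupported.toArray(new Expression[0]);
  }
}
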
@@ -35,6 +35,9 @@ public interface ColumnarReadSupport {
* A safety door for columnar reader. It's possible that the implementation can only support
* columnar reads for some certain columns, users can overwrite this method to fallback to
* normal read path under some conditions.
*
* Note that, if the implementation always returns true here, it can throw an exception in
* the row-based `DataSourceV2Reader.createReadTasks`, as that method will never be called.
*/
default boolean supportsColumnarReads() {
Any chance we could just have a utility method that converts a ReadTask<InternalRow> into a ReadTask<ColumnarBatch>? This would allow us to get rid of this default method.

Do we have a use case for this?

Owner Author
Yea, for this one we have a use case. The current vectorized parquet reader only supports flat schemas, i.e., struct/array/map types are not supported.

This is a bit strange - shouldn't this return value sometimes depend on the schema?

Owner Author
The schema is part of the reader's state, so when a reader mixes in this interface, it should know what the current schema is, after column pruning and so on.

OK, so this is only called after all push-downs are done? We should specify that.

return true;
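
To make the discussion above concrete, a schema-dependent override might look roughly like this. This is only a sketch: it assumes the reader keeps its pruned schema in a hypothetical `prunedSchema` field of type StructType, and the surrounding reader class is omitted.

// Inside a hypothetical reader that mixes in ColumnarReadSupport.
@Override
public boolean supportsColumnarReads() {
  for (org.apache.spark.sql.types.StructField field : prunedSchema.fields()) {
    org.apache.spark.sql.types.DataType type = field.dataType();
    // Fall back to the row-based path for nested types, which the current
    // vectorized parquet reader cannot handle.
    if (type instanceof org.apache.spark.sql.types.StructType
        || type instanceof org.apache.spark.sql.types.ArrayType
        || type instanceof org.apache.spark.sql.types.MapType) {
      return false;
    }
  }
  return true;
}
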
@@ -0,0 +1,27 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.sources.v2.reader;

import java.io.Closeable;
import java.util.Iterator;

/**
* A data reader returned by a read task, responsible for outputting data for an RDD
* partition.
*/
public interface DataReader<T> extends Iterator<T>, Closeable {}
why is this an "Iterator"? Don't do this ...

Use explicit next(), with close().

I think it makes sense to use something like Iterable and Iterator instead of requiring an open method that can be easily forgotten. What's the rationale behind not using Iterator? The remove method that's on the interface?
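
For context on this discussion, a consumer of `DataReader` would look roughly like this (a sketch; the `task` variable is a hypothetical `ReadTask<UnsafeRow>`). Since the interface extends both `Iterator` and `Closeable`, it works with try-with-resources and a plain while loop:

try (DataReader<UnsafeRow> reader = task.getReader()) {
  while (reader.hasNext()) {
    UnsafeRow row = reader.next();
    // process the row ...
  }
} catch (java.io.IOException e) {
  // close() is declared on Closeable, so IOException must be handled somewhere.
  throw new RuntimeException(e);
}
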

@@ -17,12 +17,15 @@

package org.apache.spark.sql.sources.v2.reader;

import java.io.IOException;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.spark.annotation.Experimental;
import org.apache.spark.annotation.InterfaceStability;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
import org.apache.spark.sql.types.StructType;

@@ -47,13 +50,16 @@ public abstract class DataSourceV2Reader {
* output.
*/
// TODO: maybe we should support arbitrary type and work with Dataset, instead of only Row.
public abstract List<ReadTask<Row>> createReadTasks();
protected abstract List<ReadTask<Row>> createReadTasks();

/**
* Inside Spark, the input rows will be converted to `UnsafeRow`s before processing. To avoid
* this conversion, implementations can overwrite this method and output `UnsafeRow`s directly.
* Note that, this is an experimental and unstable interface, as `UnsafeRow` is not public and
* may get changed in future Spark versions.
*
* Note that, if the implementation overrides this method, it should also override
* `createReadTasks` to throw an exception, as that method will never be called.
*/
@Experimental
@InterfaceStability.Unstable
@@ -64,3 +70,48 @@ public List<ReadTask<UnsafeRow>> createUnsafeRowReadTasks() {
.collect(Collectors.toList());
}
}

class RowToUnsafeRowReadTask implements ReadTask<UnsafeRow> {
private final ReadTask<Row> rowReadTask;
private final StructType schema;

RowToUnsafeRowReadTask(ReadTask<Row> rowReadTask, StructType schema) {
this.rowReadTask = rowReadTask;
this.schema = schema;
}

@Override
public String[] preferredLocations() {
return rowReadTask.preferredLocations();
}

@Override
public DataReader<UnsafeRow> getReader() {
return new RowToUnsafeDataReader(rowReadTask.getReader(), RowEncoder.apply(schema));
}
}

class RowToUnsafeDataReader implements DataReader<UnsafeRow> {
private final DataReader<Row> rowReader;
private final ExpressionEncoder<Row> encoder;

RowToUnsafeDataReader(DataReader<Row> rowReader, ExpressionEncoder<Row> encoder) {
this.rowReader = rowReader;
this.encoder = encoder;
}

@Override
public boolean hasNext() {
return rowReader.hasNext();
}

@Override
public UnsafeRow next() {
return (UnsafeRow) encoder.toRow(rowReader.next());
}

@Override
public void close() throws IOException {
rowReader.close();
}
}
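
A minimal sketch of the override pattern described in the note above (a fragment of a hypothetical `DataSourceV2Reader` subclass; other members, such as the task construction, are omitted):

@Override
protected List<ReadTask<Row>> createReadTasks() {
  // Never called once createUnsafeRowReadTasks is overridden, so fail fast.
  throw new IllegalStateException("This reader only produces UnsafeRow read tasks.");
}

@Override
public List<ReadTask<UnsafeRow>> createUnsafeRowReadTasks() {
  // Build tasks that emit UnsafeRow directly, skipping the Row -> UnsafeRow conversion.
  return unsafeRowTasks; // assumed field holding the pre-built tasks
}
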
@@ -25,12 +25,7 @@
*/
public interface FilterPushDownSupport {
/**
* Push down one filter, returns true if this filter can be pushed down to this data source,
* false otherwise. This method might be called many times if more than one filter need to be
* pushed down.
*
* TODO: we can also make it `Expression[] pushDownCatalystFilters(Expression[] filters)` which
* returns unsupported filters.
* Pushes down filters and returns the unsupported filters.
*/
boolean pushDownFilter(Filter filter);
Filter[] pushDownFilters(Filter[] filters);
}
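
For illustration, a data source that can only evaluate equality and not-null predicates might implement the new method like this (the class name is invented; only the public `Filter` subtypes from `org.apache.spark.sql.sources` are used):

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.sql.sources.EqualTo;
import org.apache.spark.sql.sources.Filter;
import org.apache.spark.sql.sources.IsNotNull;

class MyFilterPushDown implements FilterPushDownSupport {
  private final List<Filter> pushed = new ArrayList<>();

  @Override
  public Filter[] pushDownFilters(Filter[] filters) {
    List<Filter> unsupported = new ArrayList<>();
    for (Filter filter : filters) {
      if (filter instanceof EqualTo || filter instanceof IsNotNull) {
        pushed.add(filter);       // will be applied by the data source
      } else {
        unsupported.add(filter);  // handed back for Spark to evaluate
      }
    }
    return unsupported.toArray(new Filter[0]);
  }
}
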
@@ -20,8 +20,8 @@
import java.io.Serializable;

/**
* A read task returned by a data source reader and is responsible for outputting data for an RDD
* partition.
* A read task returned by a data source reader, responsible for creating the data reader.
* The relationship between `ReadTask` and `DataReader` is similar to `Iterable` and `Iterator`.
*/
public interface ReadTask<T> extends Serializable {
Why not use standard interfaces like Iterable and Iterator or at least the same pattern? This interface mixes the two together by having both open and next/get. That makes it easy to forget to call open and prevents the ReadTask from being used more than once (unless it has been serialized to two places). I would expect ReadTask to be like an Iterable so that it can be reused if needed, and for it to return an equivalent of Iterator that actually does the read and tracks state. Requesting the Iterator from a task opens it, so you never forget.

With these kinds of Serializable interfaces, it can be a pain to implement because you end up needing to make all of your data access objects serializable (or construct them all in the open method, which is also quite sad, for reasons @rdblue notes). In datasources V1, we've used a pattern where you include a serializable datastructure that contains enough information to construct your objects properly (so for example, the params map is serializable).

Ideally we could have something similar here; what if ReadTask<T> extends Serializable and has a method which returns a closeable Java 8 spliterator - a datastructure which has a similar interface to your Iterator-like thing, but is easier to implement safely (don't need to do a while (it.next()) { doSomething(it.get()) }, can just do split.forEachRemaining(this::doSomething) or easily convert it into an iterator with very little overhead at a higher level).

Owner Author
I agree that we should make ReadTask like an Iterable, and it can return an Iterator. But I can't find a standard closeable iterator interface in the JDK. Shall we create one here?

can we just return an object with an interface like:

interface Reader<T> extends Spliterator<T>, Closeable {}

(spliterator is a much nicer interface than iterator to implement)

or is the issue here that we can't use Java 8?

I would use Closeable, which requires that multiple calls to close are safe. The iterator should close itself when it is exhausted, hasNext should return false, and NoSuchElementException should be thrown by next as required by Iterator.

/**
@@ -33,25 +33,5 @@ default String[] preferredLocations() {
return new String[0];
}

/**
* This method will be called before running this read task, users can overwrite this method
* and put initialization logic here.
*/
default void open() {}

/**
* Proceed to next record, returns false if there is no more records.
*/
boolean next();

/**
* Return the current record. This method should return same value until `next` is called.
*/
T get();

/**
* This method will be called after finishing this read task, users can overwrite this method
* and put clean up logic here.
*/
default void close() {}
DataReader<T> getReader();
}
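
To illustrate the `Iterable`/`Iterator`-style split described in the updated javadoc, here is a toy task/reader pair (the range data source is made up purely for illustration): the task is the serializable "recipe", and the reader created from it does the actual work and tracks the state.

class RangeReadTask implements ReadTask<Integer> {
  private final int start;
  private final int end;

  RangeReadTask(int start, int end) {
    this.start = start;
    this.end = end;
  }

  @Override
  public DataReader<Integer> getReader() {
    // Requesting the reader "opens" it, so there is no separate open() to forget.
    return new RangeDataReader(start, end);
  }
}

class RangeDataReader implements DataReader<Integer> {
  private int current;
  private final int end;

  RangeDataReader(int start, int end) {
    this.current = start;
    this.end = end;
  }

  @Override
  public boolean hasNext() {
    return current < end;
  }

  @Override
  public Integer next() {
    return current++;
  }

  @Override
  public void close() {
    // Release connections, file handles, etc. here.
  }
}
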

This file was deleted.

@@ -26,5 +26,5 @@ public interface SortPushDown {
* Returns true if the implementation can handle this sorting requirement and save a sort
* operation at Spark side.
Like the discussion on hash support, I would rather see this the other way around, where the data source can report its underlying sort order. Maybe there are some sources that can perform a sort before passing the data to Spark, but it's hard to know when it should. I think the more useful case is when the data is already sorted.

+1

Owner Author
OK, I agree. Shall we distinguish between global sort and per-partition sort?

I'd leave this out of the first iteration and add it later, but some thoughts: specify the preferred sort orders, and the data source can return the sortedness.

Per-partition sort would be great. We usually can't handle a global sort, but there are some per-partition sorts (by partition here I mean SQL-style partitioning) we can handle.

SELECT pkey, RANK() OVER (PARTITION BY pkey ORDER BY clustCol) FROM ks.tab

Like in this case, if pkey is the C* partition key, then there is a natural ordering we can take advantage of when returning values.

@RussellSpitzer RussellSpitzer Sep 5, 2017
Note: @brianmhess wrote that SQL; I don't know SQL like he does, and he is great.

*/
boolean pushDownSort(String[] sortingColumns);
boolean pushDownSort(String[] sortingColumns, boolean asc, boolean nullFirst);
}
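
As a rough sketch of how the new signature could be used (the native ordering here is invented, loosely following the Cassandra example above), a source might only accept a push-down that matches the order its data is already stored in:

import java.util.Arrays;

// Hypothetical source that is naturally sorted by ("pkey", "clustCol") ascending, nulls first,
// and therefore only accepts a sort push-down that matches that native order.
class MySortPushDown implements SortPushDown {
  private static final String[] NATIVE_ORDER = {"pkey", "clustCol"};

  @Override
  public boolean pushDownSort(String[] sortingColumns, boolean asc, boolean nullFirst) {
    return asc && nullFirst && Arrays.equals(sortingColumns, NATIVE_ORDER);
  }
}
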
@@ -17,24 +17,10 @@

package org.apache.spark.sql.sources.v2.reader;

public class SparkHashPartitioning implements Partitioning {
private int numPartitions;
import java.util.OptionalLong;

public SparkHashPartitioning(int numPartitions) {
this.numPartitions = numPartitions;
}

@Override
public boolean compatibleWith(Partitioning other) {
if (other instanceof SparkHashPartitioning) {
return this.numPartitions() == other.numPartitions();
} else {
return other.compatibleWith(this);
}
}

@Override
public int numPartitions() {
return numPartitions;
}
public interface Statistics {
OptionalLong getSize();
OptionalLong getRows();
OptionalLong getDistinctValues(String columnName);
}
@@ -22,6 +22,5 @@
* statistics like sizeInBytes, to Spark.
*/
public interface StatisticsSupport {
// todo: shall we add more statistics? what do we want?
long sizeInBytes();
Statistics getStatistics();
}
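
For illustration, a source with precomputed statistics might back these interfaces as follows (a sketch; the map of per-column distinct counts is an assumed input):

import java.util.Map;
import java.util.OptionalLong;

class MyStatistics implements Statistics {
  private final long sizeInBytes;
  private final long rowCount;
  private final Map<String, Long> distinctCounts;

  MyStatistics(long sizeInBytes, long rowCount, Map<String, Long> distinctCounts) {
    this.sizeInBytes = sizeInBytes;
    this.rowCount = rowCount;
    this.distinctCounts = distinctCounts;
  }

  @Override
  public OptionalLong getSize() {
    return OptionalLong.of(sizeInBytes);
  }

  @Override
  public OptionalLong getRows() {
    return OptionalLong.of(rowCount);
  }

  @Override
  public OptionalLong getDistinctValues(String columnName) {
    Long ndv = distinctCounts.get(columnName);
    // Unknown columns report "no estimate" instead of guessing.
    return ndv == null ? OptionalLong.empty() : OptionalLong.of(ndv);
  }
}

A reader mixing in StatisticsSupport would then simply return such an object from getStatistics().
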
@@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.sources.v2.reader.distribution;

/**
* Represents a distribution where records that share the same values for the `clusteringColumns`
* will be co-located, which means they will be produced by the same `ReadTask`.
*/
public class ClusteredDistribution {
private String[] clusteringColumns;

public ClusteredDistribution(String[] clusteringColumns) {
this.clusteringColumns = clusteringColumns;
}

public String[] getClusteringColumns() {
return clusteringColumns;
}
}
@@ -0,0 +1,25 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.sources.v2.reader.distribution;

/**
* Specifies how data should be distributed when a query is executed in parallel on many machines.
*
* Current implementations: `ClusteredDistribution`.
*/
public interface Distribution {}
@@ -15,24 +15,21 @@
* limitations under the License.
*/

package org.apache.spark.sql.sources.v2.reader;

package org.apache.spark.sql.sources.v2.reader.distribution;

/**
* A mix in interface for `DataSourceV2Reader`. Users can implement this interface to pre-clustering
* the data and avoid shuffle at Spark side.
* A mix-in interface for `DataSourceV2Reader`. Users can implement this interface to report the
* output partitioning, to avoid a shuffle at the Spark side if the output partitioning can satisfy
* the distribution requirement.
*/
public interface ClusteringPushDownSupport {
public interface DistributionSupport {
/**
* Returns a non-null `Partitioning` if the implementation can handle this clustering requirement
* and save a shuffle at Spark side. Clustering means, if two records have same values for the
* given clustering columns, they must be produced by the same read task.
* Returns an array of partitionings this data source can output. Spark will pick one partitioning
* that can avoid a shuffle, and call `pickPartitioning` to notify the data source which
* partitioning was picked. Note that, if none of the partitionings can help avoid a shuffle,
* `NoPartitioning` will be passed to `pickPartitioning`.
*/
Partitioning pushDownClustering(String[] clusteringColumns);
Partitioning[] getPartitionings();

/**
* Cancel this clustering push down. This will be called if Spark finds out that we can't avoid
* the shuffle after we push down the clustering.
*/
void cancel();
void pickPartitioning(Partitioning p);
}
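
A minimal sketch of a reader-side class mixing in this interface (how the `Partitioning` instances themselves are constructed is left out, since `Partitioning` is still evolving in this PR; in a real reader this would sit on the `DataSourceV2Reader` implementation):

class MyDistributionSupport implements DistributionSupport {
  private final Partitioning[] supported;  // assumed to be built elsewhere by the reader
  private Partitioning picked;             // null until Spark calls pickPartitioning

  MyDistributionSupport(Partitioning[] supported) {
    this.supported = supported;
  }

  @Override
  public Partitioning[] getPartitionings() {
    return supported;
  }

  @Override
  public void pickPartitioning(Partitioning p) {
    // May be NoPartitioning if no advertised partitioning avoids a shuffle.
    this.picked = p;
    // The reader would plan its ReadTasks according to `picked` from here on.
  }
}
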