Prototype: Data Source V2 #10
Changes from 2 commits
@@ -0,0 +1,28 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.sources.v2;

/**
 * A mix-in interface for `DataSourceV2Reader` and `DataSourceV2Writer`. Users can implement this
 * interface to specify the bucket/sort columns of the reader/writer, to improve performance.
 */
public interface BucketingSupport {
If this is used to set sort columns, why is it called `BucketingSupport`?
  void setBucketColumns(String[] bucketColumns, int numBuckets);

  void setSortColumns(String[] sortColumns);
}
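To make the mix-in contract concrete, here is a minimal sketch of a reader that implements `BucketingSupport` as defined in this diff. `MyBucketedReader` and its fields are hypothetical; a real implementation would use the recorded columns when planning read tasks.

```java
import org.apache.spark.sql.sources.v2.BucketingSupport;

// Hypothetical reader-side implementation of the BucketingSupport mix-in above.
public class MyBucketedReader implements BucketingSupport {

  private String[] bucketColumns = new String[0];
  private int numBuckets = 0;
  private String[] sortColumns = new String[0];

  @Override
  public void setBucketColumns(String[] bucketColumns, int numBuckets) {
    // Record how the data is bucketed so the scan can expose matching output partitioning.
    this.bucketColumns = bucketColumns;
    this.numBuckets = numBuckets;
  }

  @Override
  public void setSortColumns(String[] sortColumns) {
    // Record the per-bucket sort order, if any.
    this.sortColumns = sortColumns;
  }
}
```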
@@ -0,0 +1,43 @@
/* ... Apache License 2.0 header ... */

package org.apache.spark.sql.sources.v2;

import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap;
import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader;
import org.apache.spark.sql.types.StructType;

/**
 * The main interface and minimal requirement for data source v2 implementations. Users can mix in
 * more interfaces to implement more functions other than just scan.
 */
public interface DataSourceV2 {

  /**
   * The main entrance for read interface.
   *
   * @param schema the full schema of this data source reader. Full schema usually maps to the
I think the schema should be handled a little differently. We should assume that a data source has a schema that applies to all of its data, instead of passing one in. That's the case with most datasets, like Hive tables, relational tables, and Cassandra tables. I'd also argue that it is a best practice, so we don't have expensive operations to infer the schema, like we do for non-Metastore tables. So instead of passing the full schema here and having an interface to infer a schema, the data source should be expected to report its schema while analyzing the query (which may require inference). Then the schema passed to create a reader should be the expected schema, or projection schema after the query is optimized.

Also, I'd like to understand how column IDs should be handled. I'm building a data source that uses column IDs to implement schema evolution that supports add, delete, and rename. The column IDs for a table are only unique within that table (another table can reuse them), so it doesn't necessarily make sense to expose them to Spark. Spark will still need its own unique attribute IDs. But I'd prefer to have Spark request columns by ID rather than by name, so that Spark handles name resolution between a query and the data source. I think what makes sense is for the data source to create a Spark schema with attributes that Spark will use in the projection schema. So if my data source has columns …

Author: I'm +1 on it if we can rewrite the data source part from scratch... For the column id stuff, I don't totally understand it. Attribute id is internal to Spark; Spark only uses column names when interacting with data sources, that's why we only use …

"We should assume that a data source has a schema that applies to all of its data, instead of passing one in. That's the case with most datasets, like Hive tables, relational tables, and Cassandra tables." This is not true though. A Hive table, for example: the schema is specified in the catalog, which means it has to be passed into the underlying file-based data source. The file-based data source itself is actually not dictating the schema.

I think what I didn't understand is that the data source API is between the metastore and the files in a table. So the metastore is queried to get a schema, then the data files are treated as schema-on-read. But why should that be the case? I would rather be able to have Hive tables entirely behind the DataSource API, so they can be handled the same way as a JDBC table or another source with a pre-determined schema for its data. With that missing context, I can see how the interface proposed here looks designed for schema-on-read, rather than for tables that already have a fixed schema: it assumes that the data source will commonly use an externally provided schema, can't necessarily infer one (e.g. JSON), and the projection is separate from the data source schema. I think it's reasonable to consider whether this API should be based on schema-on-read or should assume that the data source has a schema, along the lines of what I was suggesting before. The proposal doc lists 3 cases: an external schema is required, the schema is fixed and a user schema is not allowed, and a user schema is optional because it can be inferred. For sources with fixed schemas, … What about a … For the other two cases, the proposed API works okay. It's a little strange that those that require a user schema won't implement …
   *               physical schema of the underlying storage of this data source reader, e.g.
   *               parquet files, JDBC tables, etc, while this reader may not read data with full
   *               schema, as column pruning or other optimizations may happen.
   * @param options the options for this data source reader, which is case insensitive.
   * @return a reader that implements the actual read logic.
   */
  DataSourceV2Reader createReader(
      StructType schema,
      CaseInsensitiveMap<String> options);
Maybe not a big deal at this stage, but I wonder if this should be just …

Please don't use a Map. It's a huge disaster in v1.

There might also be data sources that expect case-sensitive key names. I think it would be a good idea to pass options as the user specified them and let the data-source implementation handle the options as appropriate for the data source.

This has bitten me in design: not realizing all the options come through lowercased.

Please don't do that (let the source handle case sensitivity). It will be the most confusing thing to the end user.

@rxin, what about using an option map was a disaster in v1? For case sensitivity, what happens elsewhere? I didn't think that options were case insensitive in Spark.

Some sources interpret the options as case sensitive, and some as case insensitive. It's really confusing to the end users. On top of that, Scala Map has really bad binary compatibility across Scala versions.

Thanks, that makes sense. What are you proposing instead of a map? A configuration object that handles resolution internally?
}
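A minimal sketch of a `DataSourceV2` implementation against the interfaces in this diff. `MyDataSource` and the `path` option default are hypothetical; `ReadTask` is referenced from the same `reader` package but is not defined in this diff, so the sketch returns an empty scan. Since `CaseInsensitiveMap` is a Scala `Map`, the Java code uses `contains`/`apply` for lookups.

```java
import java.util.Collections;
import java.util.List;

import org.apache.spark.sql.Row;
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap;
import org.apache.spark.sql.sources.v2.DataSourceV2;
import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader;
import org.apache.spark.sql.sources.v2.reader.ReadTask;
import org.apache.spark.sql.types.StructType;

public class MyDataSource implements DataSourceV2 {

  @Override
  public DataSourceV2Reader createReader(StructType schema, CaseInsensitiveMap<String> options) {
    // Option keys arrive lower-cased through CaseInsensitiveMap, which is the behaviour
    // debated in the review thread above.
    final String path = options.contains("path") ? options.apply("path") : "/tmp/my-source";

    return new DataSourceV2Reader() {
      @Override
      public StructType readSchema() {
        // No pruning mix-ins here, so the reader reports the full schema it was given.
        return schema;
      }

      @Override
      public List<ReadTask<Row>> createReadTasks() {
        // A real source would list the splits under 'path' and emit one task per split.
        return Collections.emptyList();
      }
    };
  }
}
```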
@@ -0,0 +1,42 @@
/* ... Apache License 2.0 header ... */

package org.apache.spark.sql.sources.v2;

import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap;
import org.apache.spark.sql.types.StructType;

/**
 * A mix-in interface for `DataSourceV2`. Users can implement this interface to provide schema
 * inference ability when scanning data.
 */
public interface DataSourceV2SchemaProvider {

  /**
   * Return the inferred schema of this data source given these options.
   */
  StructType inferSchema(CaseInsensitiveMap<String> options);
I think this should have a better name. Data sources with a metadata store won't infer anything; this should come from the table's metadata.

Author: how about …

+1 for …

I'd use a different name other than "get", since get is usually very cheap, whereas here it can potentially be very expensive. Maybe computeSchema?

Good point about get. If we add an interface to signal schema-on-read behavior, we could also move …
  /**
   * Whether or not this data source can accept user specified schema. When Spark scans a data
   * source, users can specify the schema to avoid expensive schema inference. However some data
   * sources may have to infer the schema and reject any user specified schemas; they can override
   * this method to achieve this.
   */
  default boolean acceptsUserDefinedSchema() {
    return true;
Why a boolean method instead of a UserDefinedSchema interface like the other traits? Also, doesn't the …

Author: I explained in the doc, we have 3 requirements for the schema inference feature: …

I left a longer comment above. I didn't get what you were doing with these interfaces at first, since I think of most data sources as having a schema instead of being schema-on-read.
  }
}
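For illustration, a sketch of the schema-provider mix-in for a source that always derives its own schema and rejects user-specified ones. The field names and types are made up, and a real source would also implement `DataSourceV2`.

```java
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap;
import org.apache.spark.sql.sources.v2.DataSourceV2SchemaProvider;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class MyInferringSource implements DataSourceV2SchemaProvider {

  @Override
  public StructType inferSchema(CaseInsensitiveMap<String> options) {
    // A real source would sample the data referenced by the options; a fixed result keeps
    // the sketch self-contained.
    return new StructType()
        .add("id", DataTypes.LongType)
        .add("payload", DataTypes.StringType);
  }

  @Override
  public boolean acceptsUserDefinedSchema() {
    // This source always computes the schema itself, so user-specified schemas are rejected,
    // matching the escape hatch described in the javadoc above.
    return false;
  }
}
```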
@@ -0,0 +1,26 @@
/* ... Apache License 2.0 header ... */

package org.apache.spark.sql.sources.v2;

/**
 * A mix-in interface for `DataSourceV2Reader` and `DataSourceV2Writer`. Users can implement this
 * interface to specify the partition columns of the reader/writer, to improve performance.
 */
public interface PartitioningSupport {

  void setPartitionColumns(String[] partitionColumns);
This is good, but would this be separate from bucketing? I want to be able to dictate that a column, or group of columns, which, when equal between two rows, will be found in the same task. I can't make guarantees about ordering, I can only guarantee clustering (re: Cassandra).
}
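A small sketch of the partitioning mix-in; `MyPartitionedWriter` is hypothetical and simply records the columns, which a writer could later use to lay out one output directory per distinct value.

```java
import org.apache.spark.sql.sources.v2.PartitioningSupport;

public class MyPartitionedWriter implements PartitioningSupport {

  private String[] partitionColumns = new String[0];

  @Override
  public void setPartitionColumns(String[] partitionColumns) {
    // A writer could use these to build .../col=value/ style directories; a reader could use
    // them to prune directories that cannot match a pushed filter.
    this.partitionColumns = partitionColumns;
  }
}
```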
@@ -0,0 +1,38 @@
/* ... Apache License 2.0 header ... */

package org.apache.spark.sql.sources.v2;

import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap;
import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer;

/**
 * A mix-in interface for `DataSourceV2`. Users can implement this interface to provide data
 * writing ability with job-level transaction.
 */
public interface WritableDataSourceV2 extends DataSourceV2 {

  /**
   * The main entrance for write interface.
   *
   * @param mode the save mode, can be append, overwrite, etc.
   * @param options the options for this data source writer.
   * @return a writer that implements the actual write logic.
   */
  DataSourceV2Writer createWriter(SaveMode mode, CaseInsensitiveMap<String> options);
}
@@ -0,0 +1,40 @@
/* ... Apache License 2.0 header ... */

package org.apache.spark.sql.sources.v2.reader;

import org.apache.spark.annotation.Experimental;
import org.apache.spark.annotation.InterfaceStability;
import org.apache.spark.sql.catalyst.expressions.Expression;

/**
 * A mix-in interface for `DataSourceV2Reader`. Users can implement this interface to push down
 * arbitrary expressions as predicates to the data source.
 */
@Experimental
@InterfaceStability.Unstable
public interface CatalystFilterPushDownSupport {
boolean pushDownCatalystFilter(Expression filter);

This interface is very nice. Just wondering, for a data source to implement this method, is it ok for the implementation to access subtypes of Expression in Spark, for example functions like Abs?

Author: implementations can pattern match the given expression and access any subtype you like.
  /**
   * Push down one filter, returns true if this filter can be pushed down to this data source,
   * false otherwise. This method might be called many times if more than one filter needs to be
   * pushed down.
   *
   * TODO: we can also make it `Expression[] pushDownCatalystFilters(Expression[] filters)` which
   * returns unsupported filters.
   */
  boolean pushDownCatalystFilter(Expression filter);
}
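Following the author's note that implementations can pattern match the expression, here is a sketch that accepts only simple `column = literal` predicates. `MyFilteringReader` and its helper are hypothetical; the matched classes (`EqualTo`, `Attribute`, `Literal`) are existing Catalyst expression types.

```java
import org.apache.spark.sql.catalyst.expressions.Attribute;
import org.apache.spark.sql.catalyst.expressions.EqualTo;
import org.apache.spark.sql.catalyst.expressions.Expression;
import org.apache.spark.sql.catalyst.expressions.Literal;
import org.apache.spark.sql.sources.v2.reader.CatalystFilterPushDownSupport;

public class MyFilteringReader implements CatalystFilterPushDownSupport {

  @Override
  public boolean pushDownCatalystFilter(Expression filter) {
    // Accept only `attribute = literal`; anything else stays in Spark as a post-scan filter.
    if (filter instanceof EqualTo) {
      EqualTo eq = (EqualTo) filter;
      if (eq.left() instanceof Attribute && eq.right() instanceof Literal) {
        String column = ((Attribute) eq.left()).name();
        Object value = ((Literal) eq.right()).value();
        rememberPushedFilter(column, value);
        return true;
      }
    }
    return false;
  }

  private void rememberPushedFilter(String column, Object value) {
    // A real reader would keep (column, value) and apply it while producing read tasks.
  }
}
```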
@@ -0,0 +1,32 @@
/* ... Apache License 2.0 header ... */

package org.apache.spark.sql.sources.v2.reader;

/**
 * A mix-in interface for `DataSourceV2Reader`. Users can implement this interface to pre-cluster
 * the data and avoid a shuffle on the Spark side.
 */
public interface ClusteringPushDownSupport {

  /**
   * Returns true if the implementation can handle this clustering requirement and save a shuffle
   * on the Spark side. Clustering means that if two records have the same values for the given
   * clustering columns, they must be produced by the same read task.
   */
  boolean pushDownClustering(String[] clusteringColumns);
}
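A sketch of how a source with a fixed storage layout might answer the clustering question; the storage partition-key columns are made up. The check relies on the fact that if the requested clustering columns include every storage partition-key column, then rows that agree on the requested columns also agree on the partition key, and therefore land in the same read task (assuming each task reads whole storage partitions).

```java
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;

import org.apache.spark.sql.sources.v2.reader.ClusteringPushDownSupport;

public class MyClusteredReader implements ClusteringPushDownSupport {

  // Hypothetical: the underlying storage hashes rows into partitions by these columns,
  // and each read task reads one or more whole storage partitions.
  private final List<String> storagePartitionKeys = Arrays.asList("region", "customer_id");

  @Override
  public boolean pushDownClustering(String[] clusteringColumns) {
    // Equal values on the requested columns imply equal values on the partition key,
    // hence the same storage partition and the same read task.
    return new HashSet<>(Arrays.asList(clusteringColumns)).containsAll(storagePartitionKeys);
  }
}
```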
@@ -0,0 +1,32 @@
/* ... Apache License 2.0 header ... */

package org.apache.spark.sql.sources.v2.reader;

import org.apache.spark.sql.types.StructType;

/**
 * A mix-in interface for `DataSourceV2Reader`. Users can implement this interface to only read
 * required columns/nested fields during scan.
 */
public interface ColumnPruningSupport {

  /**
   * Returns true if the implementation can apple this column pruning optimization, so that we can
Nit: should be "apply" not "apple".
   * reduce the data size to be read at the very beginning.
   */
  boolean pruneColumns(StructType requiredSchema);
Why allow the implementation to reject the required schema? If the data source supports column pruning, then it should throw an exception if the columns passed by this method are invalid (with more context). Otherwise, it should do the pruning. The only case where I think this might be valid is if there are some columns that can't be pruned, in which case it should return the actual schema that will be produced instead of rejecting by returning false. But then, the actual read schema is accessible from …

agreed here!

Author: Actually I don't have a use case for rejecting the required schema, I just wanted to follow the other push down interfaces and return a boolean. I'm fine to return …
}
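Tying into the discussion above about columns that cannot be pruned, here is a sketch where the reader accepts the pruned schema but keeps a column its (hypothetical) format always materializes, and reports the result through the reader's `readSchema()`. The class, field, and `row_id` column are all made up for illustration.

```java
import java.util.Arrays;

import org.apache.spark.sql.sources.v2.reader.ColumnPruningSupport;
import org.apache.spark.sql.types.StructType;

public class MyPruningReader implements ColumnPruningSupport {

  // Hypothetical: the underlying format always materializes this column.
  private static final String MANDATORY_COLUMN = "row_id";

  private final StructType fullSchema;
  private StructType readSchema;  // what the reader's readSchema() would report

  public MyPruningReader(StructType fullSchema) {
    this.fullSchema = fullSchema;
    this.readSchema = fullSchema;
  }

  @Override
  public boolean pruneColumns(StructType requiredSchema) {
    StructType pruned = requiredSchema;
    // The format cannot drop the mandatory column, so it stays in the reported read schema.
    if (!Arrays.asList(requiredSchema.fieldNames()).contains(MANDATORY_COLUMN)) {
      pruned = pruned.add(fullSchema.apply(MANDATORY_COLUMN));
    }
    this.readSchema = pruned;
    return true;
  }
}
```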
@@ -0,0 +1,42 @@
/* ... Apache License 2.0 header ... */

package org.apache.spark.sql.sources.v2.reader;

import java.util.List;

import org.apache.spark.sql.execution.vectorized.ColumnarBatch;

/**
 * A mix-in interface for `DataSourceV2Reader`. Users can implement this interface to provide
 * columnar read ability for better performance.
 */
public interface ColumnarReadSupport {

  /**
   * Similar to `DataSourceV2Reader.createReadTasks`, but returns data in columnar format.
   */
  List<ReadTask<ColumnarBatch>> createColumnarReadTasks();
This means that I'd have to implement the … I suppose this is a mixin because of the below method. It's unclear to me that you actually want to implement both of them as a user (even if you can support columnar in some cases) - I'd prefer to implement one and wrap it.

Author: a data source may not be able to support columnar read for all columns, so I think they should always implement the row-based interface as a fallback.

But if I can support columnar read for all columns, then why make me implement the row based interface?

Author: then you can throw an exception in the row based interface; I'll put it in the javadoc.
  /**
   * A safety door for columnar reader. It's possible that the implementation can only support
   * columnar reads for certain columns; users can override this method to fall back to the
   * normal read path under some conditions.
   */
  default boolean supportsColumnarReads() {
Any chance we could just have a utility method that converts a …? Do we have a use case for this?

Author: yea, this one we have a use case. The current vectorized parquet reader only supports flat schemas, i.e., struct/array/map types are not supported.

This is a bit strange - shouldn't this return value sometimes depend on the schema?

Author: the schema is a state of the reader, so when a reader mixes in this interface, it should know what the current schema is, after column pruning or something.

OK, so this is only called after all push downs are done? We should specify that.
    return true;
  }
}
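A sketch of the "safety door" discussed above, mirroring the vectorized Parquet limitation the author mentions: the columnar path is taken only when the current read schema is flat. The class and its constructor are hypothetical, and `ReadTask` is not defined in this diff, so the columnar scan is left empty.

```java
import java.util.Collections;
import java.util.List;

import org.apache.spark.sql.execution.vectorized.ColumnarBatch;
import org.apache.spark.sql.sources.v2.reader.ColumnarReadSupport;
import org.apache.spark.sql.sources.v2.reader.ReadTask;
import org.apache.spark.sql.types.ArrayType;
import org.apache.spark.sql.types.MapType;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class MyColumnarReader implements ColumnarReadSupport {

  private final StructType readSchema;  // the schema after any pruning has been applied

  public MyColumnarReader(StructType readSchema) {
    this.readSchema = readSchema;
  }

  @Override
  public List<ReadTask<ColumnarBatch>> createColumnarReadTasks() {
    // A real source would build vectorized tasks here.
    return Collections.emptyList();
  }

  @Override
  public boolean supportsColumnarReads() {
    // Only flat schemas (no struct/array/map columns) go down the columnar path.
    for (StructField field : readSchema.fields()) {
      if (field.dataType() instanceof StructType
          || field.dataType() instanceof ArrayType
          || field.dataType() instanceof MapType) {
        return false;
      }
    }
    return true;
  }
}
```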
@@ -0,0 +1,66 @@
/* ... Apache License 2.0 header ... */

package org.apache.spark.sql.sources.v2.reader;

import java.util.List;
import java.util.stream.Collectors;

import org.apache.spark.annotation.Experimental;
import org.apache.spark.annotation.InterfaceStability;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
import org.apache.spark.sql.types.StructType;

/**
 * The main interface and minimal requirement for a data source reader. The implementations should
 * at least implement the full scan logic; users can mix in more interfaces to implement scan
 * optimizations like column pruning, filter push down, etc.
 */
public abstract class DataSourceV2Reader {

  /**
   * The actual schema of this data source reader, which may be different from the physical schema
   * of the underlying storage, as column pruning or other optimizations may happen.
   */
  public abstract StructType readSchema();

  /**
   * The actual read logic should be implemented here. This may not be a full scan as optimizations
   * may have already been applied on this reader. Implementations should return a list of
   * read tasks, each task responsible for outputting data for one RDD partition, which means
   * the number of tasks returned here will be the same as the number of RDD partitions this scan
   * outputs.
   */
  // TODO: maybe we should support arbitrary type and work with Dataset, instead of only Row.
  public abstract List<ReadTask<Row>> createReadTasks();

  /**
   * Inside Spark, the input rows will be converted to `UnsafeRow`s before processing. To avoid
   * this conversion, implementations can override this method and output `UnsafeRow`s directly.
   * Note that this is an experimental and unstable interface, as `UnsafeRow` is not public and
   * may get changed in future Spark versions.
   */
  @Experimental
  @InterfaceStability.Unstable
  public List<ReadTask<UnsafeRow>> createUnsafeRowReadTasks() {
Ideally we'd continue to be able to return InternalRows here. It's sometimes possible (with knowledge of filters, etc.) to provide a more efficient implementation than using UnsafeRow; I should be able to provide my own (more efficient) implementation.

Author: Inside Spark the input rows must be …

What's the use case for … Should we make …

My understanding is that you indicate whether you need the conversion into unsafe rows by having your BaseRelation (in data sources v1) say that needsConversion is true (and then Spark will copy the rows if a shuffle is required). Am I reading this incorrectly? I can see in GenerateUnsafeProjection that we are returning UnsafeRows, but this seems like a different approach which always involves a copy. In what case (with the current APIs) will my returning InternalRows result in an additional copy over my returning UnsafeRows? An additional thing is that it's far easier for me to create an InternalRow than it is for me to create an UnsafeRow in my codebase. Imagine I have a collection of list-like things representing columns - it is trivial for me to create an InternalRow implementation that efficiently accesses them and avoids copies. It is much less trivial to create an unsafe row with them all in; the code ends up being harder to implement (I have to start worrying about memory allocation, buffer sizes, batching).

Author: Spark makes a contract that the data exchanged between operators must be … To avoid copy, you have to return … For data source v1, …
    StructType schema = readSchema();
    return createReadTasks().stream()
        .map(rowGenerator -> new RowToUnsafeRowReadTask(rowGenerator, schema))
        .collect(Collectors.toList());
  }
}
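Finally, a sketch showing how the pieces above are meant to compose on a single reader: the abstract scan contract plus two of the mix-ins, with `readSchema()` reflecting whatever pruning has been applied. Everything named `MyCompositeReader` is hypothetical, and the filter handling is deliberately naive (it claims every filter).

```java
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.spark.sql.Row;
import org.apache.spark.sql.catalyst.expressions.Expression;
import org.apache.spark.sql.sources.v2.reader.CatalystFilterPushDownSupport;
import org.apache.spark.sql.sources.v2.reader.ColumnPruningSupport;
import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader;
import org.apache.spark.sql.sources.v2.reader.ReadTask;
import org.apache.spark.sql.types.StructType;

public class MyCompositeReader extends DataSourceV2Reader
    implements ColumnPruningSupport, CatalystFilterPushDownSupport {

  private StructType schema;  // starts as the full schema, narrowed by pruneColumns
  private final List<Expression> pushedFilters = new ArrayList<>();

  public MyCompositeReader(StructType fullSchema) {
    this.schema = fullSchema;
  }

  @Override
  public StructType readSchema() {
    // Reports the schema the scan will actually produce, after any pruning.
    return schema;
  }

  @Override
  public boolean pruneColumns(StructType requiredSchema) {
    this.schema = requiredSchema;
    return true;
  }

  @Override
  public boolean pushDownCatalystFilter(Expression filter) {
    // Claim every filter; a real source would inspect the expression and be selective.
    pushedFilters.add(filter);
    return true;
  }

  @Override
  public List<ReadTask<Row>> createReadTasks() {
    // One task per input split in a real source; ReadTask is defined outside this diff.
    return Collections.emptyList();
  }
}
```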
Is `v2` going to be there at the final commit? I'm just wondering because the names already have `V2` (e.g. `DataSourceV2Reader`, `DataSourceV2Writer`, and `DataSourceV2`) and the package name `org.apache.spark.sql.sources.v2` is different from the old one.