-
Notifications
You must be signed in to change notification settings - Fork 2
data source v2 prototype #8
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.sql.sources.v2; | ||
|
|
||
/**
 * A mix-in interface for `DataSourceV2Reader` and `DataSourceV2Writer`. Users can implement this
 * interface to specify the bucket/sort columns of the reader/writer, to improve performance.
 */
public interface BucketingSupport {

  /**
   * Specifies the columns the data is bucketed by, and the number of buckets.
   *
   * @param bucketColumns the names of the bucket columns.
   * @param numBuckets the number of buckets the data is divided into.
   */
  void setBucketColumns(String[] bucketColumns, int numBuckets);

  /**
   * Specifies the columns the data is sorted by.
   *
   * NOTE(review): presumably the sort order applies within each bucket, as in Spark's existing
   * bucketing support -- confirm against the caller side.
   *
   * @param sortColumns the names of the sort columns.
   */
  void setSortColumns(String[] sortColumns);
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,43 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.sql.sources.v2; | ||
|
|
||
| import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap; | ||
| import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader; | ||
| import org.apache.spark.sql.types.StructType; | ||
|
|
||
/**
 * The main interface and minimal requirement for data source v2 implementations. Users can mix in
 * more interfaces to implement more functions other than just scan.
 */
public interface DataSourceV2 {

  /**
   * The main entrance for the read interface.
   *
   * @param schema the full schema of this data source reader. Full schema usually maps to the
   *               physical schema of the underlying storage of this data source reader, e.g.
   *               parquet files, JDBC tables, etc, while this reader may not read data with full
   *               schema, as column pruning or other optimizations may happen.
   * @param options the options for this data source reader, which is case insensitive.
   * @return a reader that implements the actual read logic.
   */
  DataSourceV2Reader createReader(
      StructType schema,
      CaseInsensitiveMap<String> options);
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,42 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.sql.sources.v2; | ||
|
|
||
| import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap; | ||
| import org.apache.spark.sql.types.StructType; | ||
|
|
||
/**
 * A mix-in interface for `DataSourceV2`. Users can implement this interface to provide schema
 * inference ability when scanning data.
 */
public interface DataSourceV2SchemaProvider {

  /**
   * Returns the inferred schema of this data source given these options.
   *
   * @param options the (case-insensitive) options for this data source.
   * @return the schema inferred from the underlying data.
   */
  StructType inferSchema(CaseInsensitiveMap<String> options);

  /**
   * Whether or not this data source can accept a user specified schema. When Spark scans a data
   * source, users can specify the schema to avoid expensive schema inference. However some data
   * sources may have to infer the schema and reject any user specified schemas; they can override
   * this method to achieve this.
   *
   * @return true (the default) if a user specified schema is accepted, false if this data source
   *         always infers the schema itself.
   */
  default boolean acceptsUserDefinedSchema() {
    return true;
  }
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,26 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.sql.sources.v2; | ||
|
|
||
/**
 * A mix-in interface for `DataSourceV2Reader` and `DataSourceV2Writer`. Users can implement this
 * interface to specify the partition columns of the reader/writer, to improve performance.
 */
public interface PartitioningSupport {

  /**
   * Specifies the columns the data is partitioned by.
   *
   * @param partitionColumns the names of the partition columns.
   */
  void setPartitionColumns(String[] partitionColumns);
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,38 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.sql.sources.v2; | ||
|
|
||
| import org.apache.spark.sql.SaveMode; | ||
| import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap; | ||
| import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer; | ||
|
|
||
/**
 * A mix-in interface for `DataSourceV2`. Users can implement this interface to provide data writing
 * ability with job-level transaction.
 */
public interface WritableDataSourceV2 extends DataSourceV2 {

  /**
   * The main entrance for the write interface.
   *
   * @param mode the save mode, can be append, overwrite, etc.
   * @param options the options for this data source writer, which is case insensitive.
   * @return a writer that implements the actual write logic.
   */
  DataSourceV2Writer createWriter(SaveMode mode, CaseInsensitiveMap<String> options);
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,40 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.sql.sources.v2.reader; | ||
|
|
||
| import org.apache.spark.annotation.Experimental; | ||
| import org.apache.spark.annotation.InterfaceStability; | ||
| import org.apache.spark.sql.catalyst.expressions.Expression; | ||
|
|
||
/**
 * A mix-in interface for `DataSourceV2Reader`. Users can implement this interface to push down
 * arbitrary expressions as predicates to the data source.
 */
@Experimental
@InterfaceStability.Unstable
public interface CatalystFilterPushDownSupport {

  /**
   * Pushes down one filter. This method might be called many times if more than one filter needs
   * to be pushed down.
   *
   * TODO: we can also make it `Expression[] pushDownCatalystFilters(Expression[] filters)` which
   * returns unsupported filters.
   *
   * @param filter a catalyst `Expression` to be evaluated against the data during scan.
   * @return true if this filter can be pushed down to this data source, false otherwise.
   */
  boolean pushDownCatalystFilter(Expression filter);
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,32 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.sql.sources.v2.reader; | ||
|
|
||
| import org.apache.spark.sql.types.StructType; | ||
|
|
||
/**
 * A mix-in interface for `DataSourceV2Reader`. Users can implement this interface to only read
 * required columns/nested fields during scan.
 */
public interface ColumnPruningSupport {

  /**
   * Applies column pruning with the given required schema, so that the data size to be read is
   * reduced at the very beginning.
   *
   * @param requiredSchema the subset of columns/nested fields actually required by the query.
   * @return true if the implementation can apply this column pruning optimization, false
   *         otherwise.
   */
  boolean pruneColumns(StructType requiredSchema);
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,42 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.sql.sources.v2.reader; | ||
|
|
||
| import java.util.List; | ||
|
|
||
| import org.apache.spark.sql.execution.vectorized.ColumnarBatch; | ||
|
|
||
/**
 * A mix-in interface for `DataSourceV2Reader`. Users can implement this interface to provide
 * columnar read ability for better performance.
 */
public interface ColumnarReadSupport {

  /**
   * Similar to `DataSourceV2Reader.createReadTasks`, but returns data in columnar format.
   *
   * @return a list of read tasks, each producing `ColumnarBatch`es for one RDD partition.
   */
  List<ReadTask<ColumnarBatch>> createColumnarReadTasks();

  /**
   * A safety door for the columnar reader. It's possible that the implementation can only support
   * columnar reads for some certain columns; users can override this method to fall back to the
   * normal (row-based) read path under some conditions.
   *
   * @return true (the default) to use the columnar read path, false to fall back to the
   *         row-based read path.
   */
  default boolean supportsColumnarReads() {
    return true;
  }
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,66 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.sql.sources.v2.reader; | ||
|
|
||
| import java.util.List; | ||
| import java.util.stream.Collectors; | ||
|
|
||
| import org.apache.spark.annotation.Experimental; | ||
| import org.apache.spark.annotation.InterfaceStability; | ||
| import org.apache.spark.sql.Row; | ||
| import org.apache.spark.sql.catalyst.expressions.UnsafeRow; | ||
| import org.apache.spark.sql.types.StructType; | ||
|
|
||
| /** | ||
| * The main interface and minimal requirement for a data source reader. The implementations should | ||
| * at least implement the full scan logic, users can mix in more interfaces to implement scan | ||
| * optimizations like column pruning, filter push down, etc. | ||
| */ | ||
| public abstract class DataSourceV2Reader { | ||
|
||
|
|
||
| /** | ||
| * The actual schema of this data source reader, which may be different from the physical schema | ||
| * of the underlying storage, as column pruning or other optimizations may happen. | ||
| */ | ||
| public abstract StructType readSchema(); | ||
|
|
||
| /** | ||
| * The actual read logic should be implemented here. This may not be a full scan as optimizations | ||
| * may have already been applied on this reader. Implementations should return a list of | ||
| * read tasks, each task is responsible to output data for one RDD partition, which means | ||
| * the number of tasks returned here will be same as the number of RDD partitions this scan | ||
| * output. | ||
| */ | ||
| // TODO: maybe we should support arbitrary type and work with Dataset, instead of only Row. | ||
| public abstract List<ReadTask<Row>> createReadTasks(); | ||
|
|
||
| /** | ||
| * Inside Spark, the input rows will be converted to `UnsafeRow`s before processing. To avoid | ||
| * this conversion, implementations can overwrite this method and output `UnsafeRow`s directly. | ||
| * Note that, this is an experimental and unstable interface, as `UnsafeRow` is not public and | ||
| * may get changed in future Spark versions. | ||
| */ | ||
| @Experimental | ||
| @InterfaceStability.Unstable | ||
| public List<ReadTask<UnsafeRow>> createUnsafeRowReadTasks() { | ||
| StructType schema = readSchema(); | ||
| return createReadTasks().stream() | ||
| .map(rowGenerator -> new RowToUnsafeRowReadTask(rowGenerator, schema)) | ||
| .collect(Collectors.toList()); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: Also, describe the meaning of the returned boolean by adding `@return`?

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the first sentence, "Whether or not this data source can accept user specified schema",
already described the meaning?