Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
987e33b
Pin to pre-release version of arrow 52.2.0
alamb Jul 16, 2024
2c808fb
Update for deprecated method
alamb Jul 16, 2024
8d8732c
Add a config to force using string view in benchmark (#11514)
XiangpengHao Jul 19, 2024
8e0ca1a
Add String view helper functions (#11517)
XiangpengHao Jul 19, 2024
db65772
Add ArrowBytesViewMap and ArrowBytesViewSet (#11515)
XiangpengHao Jul 19, 2024
efcf5c6
Enable `GroupValueBytesView` for aggregation with StringView types (#…
XiangpengHao Jul 20, 2024
34d42bc
Initial support for regex_replace on `StringViewArray` (#11556)
XiangpengHao Jul 22, 2024
bb780b3
Add support for Utf8View for date/temporal codepaths (#11518)
a10y Jul 22, 2024
2b58fd5
GC `StringViewArray` in `CoalesceBatchesStream` (#11587)
XiangpengHao Jul 25, 2024
2b2b8ab
Merge remote-tracking branch 'apache/main' into string-view2
alamb Jul 26, 2024
ea11a9d
Merge remote-tracking branch 'apache/main' into string-view2
alamb Jul 26, 2024
f13bb82
[Bug] fix bug in return type inference of `utf8_to_int_type` (#11662)
XiangpengHao Jul 26, 2024
fb79638
Merge remote-tracking branch 'apache/main' into string-view2
alamb Jul 26, 2024
281fbed
Fix clippy
alamb Jul 26, 2024
5690712
Increase ByteViewMap block size to 2MB (#11674)
XiangpengHao Jul 27, 2024
322c3d2
Change `--string-view` to only apply to parquet formats (#11663)
XiangpengHao Jul 27, 2024
ab8005d
Implement native support StringView for character length (#11676)
XiangpengHao Jul 27, 2024
561aee8
Merge remote-tracking branch 'apache/main' into string-view2
alamb Jul 29, 2024
2e9c8a0
Remove unneeded patches
alamb Jul 29, 2024
f1f22fa
cargo fmt
alamb Jul 29, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add a config to force using string view in benchmark (#11514)
* add a knob to force string view in benchmark

* fix sql logic test

* update doc

* fix ci

* fix ci only test

* Update benchmarks/src/util/options.rs

Co-authored-by: Andrew Lamb <[email protected]>

* Update datafusion/common/src/config.rs

Co-authored-by: Andrew Lamb <[email protected]>

* update tests

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
XiangpengHao and alamb authored Jul 19, 2024
commit 8d8732ce198ea43d2ce0f2240a3be610f754ac8d
1 change: 0 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,6 @@ large_futures = "warn"
[workspace.lints.rust]
unused_imports = "deny"


## Temporary arrow-rs patch until 52.2.0 is released

[patch.crates-io]
Expand Down
4 changes: 3 additions & 1 deletion benchmarks/src/clickbench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,9 @@ impl RunOpt {
None => queries.min_query_id()..=queries.max_query_id(),
};

let config = self.common.config();
let mut config = self.common.config();
config.options_mut().execution.schema_force_string_view = self.common.string_view;

let ctx = SessionContext::new_with_config(config);
self.register_hits(&ctx).await?;

Expand Down
3 changes: 3 additions & 0 deletions benchmarks/src/tpch/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ impl RunOpt {
.config()
.with_collect_statistics(!self.disable_statistics);
config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
config.options_mut().execution.schema_force_string_view = self.common.string_view;
let ctx = SessionContext::new_with_config(config);

// register tables
Expand Down Expand Up @@ -339,6 +340,7 @@ mod tests {
partitions: Some(2),
batch_size: 8192,
debug: false,
string_view: false,
};
let opt = RunOpt {
query: Some(query),
Expand Down Expand Up @@ -372,6 +374,7 @@ mod tests {
partitions: Some(2),
batch_size: 8192,
debug: false,
string_view: false,
};
let opt = RunOpt {
query: Some(query),
Expand Down
5 changes: 5 additions & 0 deletions benchmarks/src/util/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ pub struct CommonOpt {
/// Activate debug mode to see more details
#[structopt(short, long)]
pub debug: bool,

/// If true, will use StringViewArray/BinaryViewArray instead of StringArray/BinaryArray
/// when reading Parquet files
#[structopt(long)]
pub string_view: bool,
}

impl CommonOpt {
Expand Down
4 changes: 4 additions & 0 deletions datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,10 @@ config_namespace! {

/// Should DataFusion keep the columns used for partition_by in the output RecordBatches
pub keep_partition_by_columns: bool, default = false

/// If true, listing tables will read columns of `Utf8/Utf8Large` with `Utf8View`,
/// and `Binary/BinaryLarge` with `BinaryView`.
pub schema_force_string_view: bool, default = false
}
}

Expand Down
27 changes: 26 additions & 1 deletion datafusion/core/src/datasource/listing/table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,32 @@ impl ListingOptions {
.try_collect()
.await?;

self.format.infer_schema(state, &store, &files).await
let mut schema = self.format.infer_schema(state, &store, &files).await?;

if state.config_options().execution.schema_force_string_view {
let transformed_fields: Vec<Arc<Field>> = schema
.fields
.iter()
.map(|field| match field.data_type() {
DataType::Utf8 | DataType::LargeUtf8 => Arc::new(Field::new(
field.name(),
DataType::Utf8View,
field.is_nullable(),
)),
DataType::Binary | DataType::LargeBinary => Arc::new(Field::new(
field.name(),
DataType::BinaryView,
field.is_nullable(),
)),
_ => field.clone(),
})
.collect();
schema = Arc::new(Schema::new_with_metadata(
transformed_fields,
schema.metadata.clone(),
));
}
Ok(schema)
}

/// Infers the partition columns stored in `LOCATION` and compares
Expand Down
2 changes: 2 additions & 0 deletions datafusion/sqllogictest/test_files/information_schema.slt
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ datafusion.execution.parquet.statistics_enabled NULL
datafusion.execution.parquet.write_batch_size 1024
datafusion.execution.parquet.writer_version 1.0
datafusion.execution.planning_concurrency 13
datafusion.execution.schema_force_string_view false
datafusion.execution.soft_max_rows_per_output_file 50000000
datafusion.execution.sort_in_place_threshold_bytes 1048576
datafusion.execution.sort_spill_reservation_bytes 10485760
Expand Down Expand Up @@ -289,6 +290,7 @@ datafusion.execution.parquet.statistics_enabled NULL Sets if statistics are enab
datafusion.execution.parquet.write_batch_size 1024 Sets write_batch_size in bytes
datafusion.execution.parquet.writer_version 1.0 Sets parquet writer version valid values are "1.0" and "2.0"
datafusion.execution.planning_concurrency 13 Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system
datafusion.execution.schema_force_string_view false If true, listing tables will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`.
datafusion.execution.soft_max_rows_per_output_file 50000000 Target number of rows in output files when writing multiple. This is a soft max, so it can be exceeded slightly. There also will be one file smaller than the limit if the total number of rows written is not roughly divisible by the soft max
datafusion.execution.sort_in_place_threshold_bytes 1048576 When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged.
datafusion.execution.sort_spill_reservation_bytes 10485760 Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured).
Expand Down
1 change: 1 addition & 0 deletions docs/source/user-guide/configs.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus
| datafusion.execution.enable_recursive_ctes | true | Should DataFusion support recursive CTEs |
| datafusion.execution.split_file_groups_by_statistics | false | Attempt to eliminate sorts by packing & sorting files with non-overlapping statistics into the same file groups. Currently experimental |
| datafusion.execution.keep_partition_by_columns | false | Should DataFusion keep the columns used for partition_by in the output RecordBatches |
| datafusion.execution.schema_force_string_view | false | If true, listing tables will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`. |
| datafusion.optimizer.enable_distinct_aggregation_soft_limit | true | When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read. |
| datafusion.optimizer.enable_round_robin_repartition | true | When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores |
| datafusion.optimizer.enable_topk_aggregation | true | When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible |
Expand Down