Skip to content

Commit 12f55d8

Browse files
authored
Add Dictionary String (UTF8) type to String sqllogictests (#12621)
* mapping DictionaryString to text
* disable and move out the fail case for dictionary string
* fix the schema for dictionary string
* rollback the unnecessary change
* cargo fmt
1 parent a98ffdd commit 12f55d8

File tree

15 files changed

+172
-88
lines changed

15 files changed

+172
-88
lines changed

datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
use crate::engines::output::DFColumnType;
19+
use arrow::array::Array;
1820
use arrow::datatypes::Fields;
1921
use arrow::util::display::ArrayFormatter;
2022
use arrow::{array, array::ArrayRef, datatypes::DataType, record_batch::RecordBatch};
@@ -23,8 +25,6 @@ use datafusion_common::DataFusionError;
2325
use std::path::PathBuf;
2426
use std::sync::OnceLock;
2527

26-
use crate::engines::output::DFColumnType;
27-
2828
use super::super::conversion::*;
2929
use super::error::{DFSqlLogicTestError, Result};
3030

@@ -275,6 +275,17 @@ pub(crate) fn convert_schema_to_types(columns: &Fields) -> Vec<DFColumnType> {
275275
| DataType::Time32(_)
276276
| DataType::Time64(_) => DFColumnType::DateTime,
277277
DataType::Timestamp(_, _) => DFColumnType::Timestamp,
278+
DataType::Dictionary(key_type, value_type) => {
279+
if key_type.is_integer() {
280+
// mapping dictionary string types to Text
281+
match value_type.as_ref() {
282+
DataType::Utf8 | DataType::LargeUtf8 => DFColumnType::Text,
283+
_ => DFColumnType::Another,
284+
}
285+
} else {
286+
DFColumnType::Another
287+
}
288+
}
278289
_ => DFColumnType::Another,
279290
})
280291
.collect()

datafusion/sqllogictest/test_files/aggregate.slt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4546,7 +4546,7 @@ set datafusion.sql_parser.dialect = 'Generic';
45464546
statement ok
45474547
create table dict_test as values (1, arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (2, arrow_cast('bar', 'Dictionary(Int32, Utf8)'));
45484548

4549-
query I?
4549+
query IT
45504550
select * from dict_test;
45514551
----
45524552
1 foo

datafusion/sqllogictest/test_files/coalesce.slt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -220,13 +220,13 @@ select
220220
statement ok
221221
create table test1 as values (arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (null);
222222

223-
query ?
223+
query T
224224
select coalesce(column1, 'none_set') from test1;
225225
----
226226
foo
227227
none_set
228228

229-
query ?
229+
query T
230230
select coalesce(null, column1, 'none_set') from test1;
231231
----
232232
foo
@@ -246,7 +246,7 @@ drop table test1
246246
statement ok
247247
create table t(c varchar) as values ('a'), (null);
248248

249-
query ?T
249+
query TT
250250
select
251251
coalesce(c, arrow_cast('b', 'Dictionary(Int32, Utf8)')),
252252
arrow_typeof(coalesce(c, arrow_cast('b', 'Dictionary(Int32, Utf8)')))
@@ -264,7 +264,7 @@ create table t as values
264264
(arrow_cast('foo', 'Dictionary(Int32, Utf8)')),
265265
(null);
266266

267-
query ?T
267+
query TT
268268
select
269269
coalesce(column1, arrow_cast('bar', 'Dictionary(Int64, LargeUtf8)')),
270270
arrow_typeof(coalesce(column1, arrow_cast('bar', 'Dictionary(Int64, LargeUtf8)')))
@@ -273,7 +273,7 @@ from t;
273273
foo Dictionary(Int64, LargeUtf8)
274274
bar Dictionary(Int64, LargeUtf8)
275275

276-
query ?T
276+
query TT
277277
select
278278
coalesce(column1, arrow_cast('bar', 'Dictionary(Int32, LargeUtf8)')),
279279
arrow_typeof(coalesce(column1, arrow_cast('bar', 'Dictionary(Int32, LargeUtf8)')))
@@ -282,7 +282,7 @@ from t;
282282
foo Dictionary(Int32, LargeUtf8)
283283
bar Dictionary(Int32, LargeUtf8)
284284

285-
query ?T
285+
query TT
286286
select
287287
coalesce(column1, arrow_cast('bar', 'Dictionary(Int64, Utf8)')),
288288
arrow_typeof(coalesce(column1, arrow_cast('bar', 'Dictionary(Int64, Utf8)')))

datafusion/sqllogictest/test_files/copy.slt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ statement ok
3636
CREATE EXTERNAL TABLE validate_partitioned_parquet STORED AS PARQUET
3737
LOCATION 'test_files/scratch/copy/partitioned_table1/' PARTITIONED BY (col2);
3838

39-
query I?
39+
query IT
4040
select * from validate_partitioned_parquet order by col1, col2;
4141
----
4242
1 Foo
@@ -64,7 +64,7 @@ statement ok
6464
CREATE EXTERNAL TABLE validate_partitioned_parquet2 STORED AS PARQUET
6565
LOCATION 'test_files/scratch/copy/partitioned_table2/' PARTITIONED BY (column2, column3);
6666

67-
query I??
67+
query ITT
6868
select * from validate_partitioned_parquet2 order by column1,column2,column3;
6969
----
7070
1 a x
@@ -92,7 +92,7 @@ statement ok
9292
CREATE EXTERNAL TABLE validate_partitioned_parquet3 STORED AS PARQUET
9393
LOCATION 'test_files/scratch/copy/partitioned_table3/' PARTITIONED BY (column1, column3);
9494

95-
query ?T?
95+
query TTT
9696
select column1, column2, column3 from validate_partitioned_parquet3 order by column1,column2,column3;
9797
----
9898
1 a x
@@ -552,7 +552,7 @@ CREATE EXTERNAL TABLE validate_arrow_file_dict
552552
STORED AS arrow
553553
LOCATION 'test_files/scratch/copy/table_dict.arrow';
554554

555-
query T?
555+
query TT
556556
select * from validate_arrow_file_dict;
557557
----
558558
c foo

datafusion/sqllogictest/test_files/dictionary.slt

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ FROM (
6262
('1000', 32, 'foo', 'True', 10.0, 1703035800000000000)
6363
);
6464

65-
query ?RTTRP
65+
query TRTTRP
6666
SELECT * FROM m1;
6767
----
6868
1000 32 foo True 1 2023-12-20T00:00:00
@@ -137,7 +137,7 @@ FROM (
137137
('passive', '1000', 1000, 1701653400000000000)
138138
);
139139

140-
query ??RP
140+
query TTRP
141141
SELECT * FROM m2;
142142
----
143143
active 1000 100 2023-12-04T00:00:00
@@ -208,7 +208,7 @@ true false NULL true true false true NULL
208208

209209
# Reproducer for https://github.com/apache/datafusion/issues/8738
210210
# This query should work correctly
211-
query P?TT rowsort
211+
query PTTT rowsort
212212
SELECT
213213
"data"."timestamp" as "time",
214214
"data"."tag_id",
@@ -264,7 +264,7 @@ ORDER BY
264264

265265

266266
# deterministic sort (so we can avoid rowsort)
267-
query P?TT
267+
query PTTT
268268
SELECT
269269
"data"."timestamp" as "time",
270270
"data"."tag_id",
@@ -348,7 +348,7 @@ create table m3 as
348348
from m3_source;
349349

350350
# there are two values in column2
351-
query T?I rowsort
351+
query TTI rowsort
352352
SELECT *
353353
FROM m3;
354354
----
@@ -397,7 +397,7 @@ create table test as values
397397
;
398398

399399
# query using a string '1' which must be coerced into a dictionary string
400-
query T?
400+
query TT
401401
SELECT * from test where column2 = '1';
402402
----
403403
row1 1
@@ -429,7 +429,7 @@ physical_plan
429429

430430

431431
# Now query using an integer which must be coerced into a dictionary string
432-
query T?
432+
query TT
433433
SELECT * from test where column2 = 1;
434434
----
435435
row1 1

datafusion/sqllogictest/test_files/group_by.slt

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4614,7 +4614,7 @@ CREATE TABLE int8_dict AS VALUES
46144614
(1, arrow_cast('A', 'Dictionary(Int8, Utf8)'));
46154615

46164616
# Group by the dict column
4617-
query ?I rowsort
4617+
query TI rowsort
46184618
SELECT column2, count(column1) FROM int8_dict GROUP BY column2;
46194619
----
46204620
A 4
@@ -4652,7 +4652,7 @@ CREATE TABLE int16_dict AS VALUES
46524652
(1, arrow_cast('A', 'Dictionary(Int16, Utf8)'));
46534653

46544654
# Group by the dict column
4655-
query ?I rowsort
4655+
query TI rowsort
46564656
SELECT column2, count(column1) FROM int16_dict GROUP BY column2;
46574657
----
46584658
A 4
@@ -4690,7 +4690,7 @@ CREATE TABLE int32_dict AS VALUES
46904690
(1, arrow_cast('A', 'Dictionary(Int32, Utf8)'));
46914691

46924692
# Group by the dict column
4693-
query ?I rowsort
4693+
query TI rowsort
46944694
SELECT column2, count(column1) FROM int32_dict GROUP BY column2;
46954695
----
46964696
A 4
@@ -4728,7 +4728,7 @@ CREATE TABLE int64_dict AS VALUES
47284728
(1, arrow_cast('A', 'Dictionary(Int64, Utf8)'));
47294729

47304730
# Group by the dict column
4731-
query ?I rowsort
4731+
query TI rowsort
47324732
SELECT column2, count(column1) FROM int64_dict GROUP BY column2;
47334733
----
47344734
A 4
@@ -4766,7 +4766,7 @@ CREATE TABLE uint8_dict AS VALUES
47664766
(1, arrow_cast('A', 'Dictionary(UInt8, Utf8)'));
47674767

47684768
# Group by the dict column
4769-
query ?I rowsort
4769+
query TI rowsort
47704770
SELECT column2, count(column1) FROM uint8_dict GROUP BY column2;
47714771
----
47724772
A 4
@@ -4804,7 +4804,7 @@ CREATE TABLE uint16_dict AS VALUES
48044804
(1, arrow_cast('A', 'Dictionary(UInt16, Utf8)'));
48054805

48064806
# Group by the dict column
4807-
query ?I rowsort
4807+
query TI rowsort
48084808
SELECT column2, count(column1) FROM uint16_dict GROUP BY column2;
48094809
----
48104810
A 4
@@ -4842,7 +4842,7 @@ CREATE TABLE uint32_dict AS VALUES
48424842
(1, arrow_cast('A', 'Dictionary(UInt32, Utf8)'));
48434843

48444844
# Group by the dict column
4845-
query ?I rowsort
4845+
query TI rowsort
48464846
SELECT column2, count(column1) FROM uint32_dict GROUP BY column2;
48474847
----
48484848
A 4
@@ -4880,7 +4880,7 @@ CREATE TABLE uint64_dict AS VALUES
48804880
(1, arrow_cast('A', 'Dictionary(UInt64, Utf8)'));
48814881

48824882
# Group by the dict column
4883-
query ?I rowsort
4883+
query TI rowsort
48844884
SELECT column2, count(column1) FROM uint64_dict GROUP BY column2;
48854885
----
48864886
A 4

datafusion/sqllogictest/test_files/joins.slt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2672,7 +2672,7 @@ logical_plan
26722672
05)----TableScan: hashjoin_datatype_table_t2 projection=[c1, c2, c3, c4]
26732673

26742674
# hash_join_with_date32
2675-
query DDR?DDR? rowsort
2675+
query DDRTDDRT rowsort
26762676
select * from hashjoin_datatype_table_t1 t1 join hashjoin_datatype_table_t2 t2 on t1.c1 = t2.c1
26772677
----
26782678
1970-01-02 1970-01-02T00:00:00 1.23 abc 1970-01-02 1970-01-02T00:00:00 -123.12 abc
@@ -2691,7 +2691,7 @@ logical_plan
26912691
05)----TableScan: hashjoin_datatype_table_t2 projection=[c1, c2, c3, c4]
26922692

26932693
# hash_join_with_date64
2694-
query DDR?DDR? rowsort
2694+
query DDRTDDRT rowsort
26952695
select * from hashjoin_datatype_table_t1 t1 left join hashjoin_datatype_table_t2 t2 on t1.c2 = t2.c2
26962696
----
26972697
1970-01-02 1970-01-02T00:00:00 1.23 abc 1970-01-02 1970-01-02T00:00:00 -123.12 abc
@@ -2712,7 +2712,7 @@ logical_plan
27122712
05)----TableScan: hashjoin_datatype_table_t1 projection=[c1, c2, c3, c4]
27132713

27142714
# hash_join_with_decimal
2715-
query DDR?DDR? rowsort
2715+
query DDRTDDRT rowsort
27162716
select * from hashjoin_datatype_table_t1 t1 right join hashjoin_datatype_table_t1 t2 on t1.c3 = t2.c3
27172717
----
27182718
1970-01-02 1970-01-02T00:00:00 1.23 abc 1970-01-02 1970-01-02T00:00:00 1.23 abc
@@ -2732,7 +2732,7 @@ logical_plan
27322732
05)----TableScan: hashjoin_datatype_table_t1 projection=[c1, c2, c3, c4]
27332733

27342734
# hash_join_with_dictionary
2735-
query DDR?DDR? rowsort
2735+
query DDRTDDRT rowsort
27362736
select * from hashjoin_datatype_table_t1 t1 join hashjoin_datatype_table_t2 t2 on t1.c4 = t2.c4
27372737
----
27382738
1970-01-02 1970-01-02T00:00:00 1.23 abc 1970-01-02 1970-01-02T00:00:00 -123.12 abc
@@ -2783,7 +2783,7 @@ physical_plan
27832783
11)----------MemoryExec: partitions=1, partition_sizes=[1]
27842784

27852785
# sort_merge_join_on_date32 inner sort merge join on data type (Date32)
2786-
query DDR?DDR? rowsort
2786+
query DDRTDDRT rowsort
27872787
select * from hashjoin_datatype_table_t1 t1 join hashjoin_datatype_table_t2 t2 on t1.c1 = t2.c1
27882788
----
27892789
1970-01-02 1970-01-02T00:00:00 1.23 abc 1970-01-02 1970-01-02T00:00:00 -123.12 abc
@@ -2815,7 +2815,7 @@ physical_plan
28152815
13)------------MemoryExec: partitions=1, partition_sizes=[1]
28162816

28172817
# sort_merge_join_on_decimal right join on data type (Decimal)
2818-
query DDR?DDR? rowsort
2818+
query DDRTDDRT rowsort
28192819
select * from hashjoin_datatype_table_t1 t1 right join hashjoin_datatype_table_t2 t2 on t1.c3 = t2.c3
28202820
----
28212821
1970-01-04 NULL -123.12 jkl 1970-01-02 1970-01-02T00:00:00 -123.12 abc

datafusion/sqllogictest/test_files/regexp.slt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -478,30 +478,30 @@ create or replace table dict_table as
478478
select arrow_cast(column1, 'Dictionary(Int32, Utf8)') as column1
479479
from strings;
480480

481-
query ?
481+
query T
482482
select column1 from dict_table where column1 LIKE '%oo%';
483483
----
484484
FooBar
485485
Foo
486486
Foo
487487
FooBar
488488

489-
query ?
489+
query T
490490
select column1 from dict_table where column1 NOT LIKE '%oo%';
491491
----
492492
Bar
493493
Bar
494494
Baz
495495

496-
query ?
496+
query T
497497
select column1 from dict_table where column1 ILIKE '%oO%';
498498
----
499499
FooBar
500500
Foo
501501
Foo
502502
FooBar
503503

504-
query ?
504+
query T
505505
select column1 from dict_table where column1 NOT ILIKE '%oO%';
506506
----
507507
Bar

0 commit comments

Comments
 (0)