From 44db0af5bcae0c8f9d1c4ff493cd747a747ca58f Mon Sep 17 00:00:00 2001
From: anjakefala
Date: Mon, 22 Aug 2022 15:12:44 -0700
Subject: [PATCH 1/2] ARROW-17464: [C++] Store/Read float16 values in Parquet
 as FixedSizeBinary

Half-float values are currently not supported in Parquet:
https://issues.apache.org/jira/browse/PARQUET-1647
---
 .../parquet/arrow/arrow_reader_writer_test.cc | 25 +++++++++++++++++++
 cpp/src/parquet/arrow/reader_internal.cc      |  4 +++
 cpp/src/parquet/arrow/schema.cc               | 11 ++++++++
 cpp/src/parquet/column_writer.cc              |  1 +
 python/pyarrow/tests/parquet/common.py        |  1 +
 5 files changed, 42 insertions(+)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index d719f0e642e..62632a67923 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -1279,6 +1279,31 @@ TEST_F(TestDurationParquetIO, Roundtrip) {
   this->RoundTripSingleColumn(duration_arr, duration_arr, arrow_properties);
 }
 
+using TestHalfFloatParquetIO = TestParquetIO<::arrow::HalfFloatType>;
+
+TEST_F(TestHalfFloatParquetIO, Roundtrip) {
+  std::vector<bool> is_valid = {true, true, false, true};
+  // TODO How to test with a Binary vector?
+  std::vector<uint16_t> values = {1, 2, 3, 4};
+
+  std::shared_ptr<Array> int_array, half_float_arr;
+  ::arrow::ArrayFromVector<::arrow::UInt16Type, uint16_t>(::arrow::uint16(), is_valid,
+                                                          values, &int_array);
+  ::arrow::ArrayFromVector<::arrow::HalfFloatType, uint16_t>(::arrow::float16(), is_valid,
+                                                             values, &half_float_arr);
+
+  // When the original Arrow schema isn't stored, a HalfFloat comes back as Binary (how it
+  // is stored in Parquet)
+  this->RoundTripSingleColumn(half_float_arr, int_array,
+                              default_arrow_writer_properties());
+
+  // When the original arrow schema is stored, the HalfFloat array type should be
+  // preserved
+  const auto arrow_properties =
+      ::parquet::ArrowWriterProperties::Builder().store_schema()->build();
+  this->RoundTripSingleColumn(half_float_arr, half_float_arr, arrow_properties);
+}
+
 TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compatibility) {
   // This also tests max_definition_level = 1
   std::shared_ptr<Array> arr;
diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc
index 64fcc451808..567a859b998 100644
--- a/cpp/src/parquet/arrow/reader_internal.cc
+++ b/cpp/src/parquet/arrow/reader_internal.cc
@@ -762,6 +762,10 @@ Status TransferColumnData(RecordReader* reader, const std::shared_ptr<Field>& va
       TRANSFER_INT32(TIME32, ::arrow::Time32Type);
       TRANSFER_INT64(TIME64, ::arrow::Time64Type);
       TRANSFER_INT64(DURATION, ::arrow::DurationType);
+    case ::arrow::Type::HALF_FLOAT: {
+      RETURN_NOT_OK(TransferBinary(reader, pool, value_field, &chunked_result));
+      result = chunked_result;
+    } break;
     case ::arrow::Type::DATE64:
       RETURN_NOT_OK(TransferDate64(reader, pool, value_field, &result));
       break;
diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
index 716083f8a58..2e2e7d355ed 100644
--- a/cpp/src/parquet/arrow/schema.cc
+++ b/cpp/src/parquet/arrow/schema.cc
@@ -390,6 +390,11 @@ Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
     case ArrowTypeId::DURATION:
       type = ParquetType::INT64;
      break;
+    case ArrowTypeId::HALF_FLOAT: {
+      type = ParquetType::FIXED_LEN_BYTE_ARRAY;
+      // defining that a HALF_FLOAT is 2 bytes long
+      length = 2;
+    } break;
     case ArrowTypeId::STRUCT: {
       auto struct_type = std::static_pointer_cast<::arrow::StructType>(field->type());
       return StructToNode(struct_type, name, field->nullable(), properties,
@@ -926,6 +931,12 @@ Result<bool> ApplyOriginalStorageMetadata(const Field& origin_field,
     modified = true;
   }
 
+  if (origin_type->id() == ::arrow::Type::HALF_FLOAT &&
+      inferred_type->id() == ::arrow::Type::FIXED_SIZE_BINARY) {
+    inferred->field = inferred->field->WithType(origin_type);
+    modified = true;
+  }
+
   if (origin_type->id() == ::arrow::Type::DICTIONARY &&
       inferred_type->id() != ::arrow::Type::DICTIONARY &&
       IsDictionaryReadSupported(*inferred_type)) {
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index f7898c02d47..7f0c5cb8102 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -2050,6 +2050,7 @@ Status TypedColumnWriterImpl<DType>::WriteArrowDense(
       WRITE_SERIALIZE_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryType, FLBAType)
       WRITE_SERIALIZE_CASE(DECIMAL128, Decimal128Type, FLBAType)
       WRITE_SERIALIZE_CASE(DECIMAL256, Decimal256Type, FLBAType)
+      WRITE_ZERO_COPY_CASE(HALF_FLOAT, HalfFloatType, FLBAType)
     default:
       break;
   }
diff --git a/python/pyarrow/tests/parquet/common.py b/python/pyarrow/tests/parquet/common.py
index cbff41c7b10..ac393e635bf 100644
--- a/python/pyarrow/tests/parquet/common.py
+++ b/python/pyarrow/tests/parquet/common.py
@@ -166,6 +166,7 @@ def alltypes_sample(size=10000, seed=0, categorical=False):
         'int16': np.arange(size, dtype=np.int16),
         'int32': np.arange(size, dtype=np.int32),
         'int64': np.arange(size, dtype=np.int64),
+        'float16': np.arange(size, dtype=np.float16),
         'float32': np.arange(size, dtype=np.float32),
         'float64': np.arange(size, dtype=np.float64),
         'bool': np.random.randn(size) > 0,

From 5f11e8f78f70dd5409d5accfe3bc9ef22d616e3b Mon Sep 17 00:00:00 2001
From: Rok
Date: Tue, 30 Aug 2022 21:01:24 +0200
Subject: [PATCH 2/2] minor change to WriteArrowDense

---
 cpp/src/parquet/arrow/arrow_schema_test.cc | 31 +++++++++++-----------
 cpp/src/parquet/column_writer.cc           |  2 +-
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc
index 703b234044e..01f436b1375 100644
--- a/cpp/src/parquet/arrow/arrow_schema_test.cc
+++ b/cpp/src/parquet/arrow/arrow_schema_test.cc
@@ -909,21 +909,22 @@ TEST_F(TestConvertArrowSchema, ArrowFields) {
   // ASSERT_NO_FATAL_FAILURE();
 }
 
-TEST_F(TestConvertArrowSchema, ArrowNonconvertibleFields) {
-  struct FieldConstructionArguments {
-    std::string name;
-    std::shared_ptr<::arrow::DataType> datatype;
-  };
-
-  std::vector<FieldConstructionArguments> cases = {
-      {"float16", ::arrow::float16()},
-  };
-
-  for (const FieldConstructionArguments& c : cases) {
-    auto field = ::arrow::field(c.name, c.datatype);
-    ASSERT_RAISES(NotImplemented, ConvertSchema({field}));
-  }
-}
+// TODO
+//TEST_F(TestConvertArrowSchema, ArrowNonconvertibleFields) {
+//  struct FieldConstructionArguments {
+//    std::string name;
+//    std::shared_ptr<::arrow::DataType> datatype;
+//  };
+//
+//  std::vector<FieldConstructionArguments> cases = {
+//      {"float16", ::arrow::float16()},
+//  };
+//
+//  for (const FieldConstructionArguments& c : cases) {
+//    auto field = ::arrow::field(c.name, c.datatype);
+//    ASSERT_RAISES(NotImplemented, ConvertSchema({field}));
+//  }
+//}
 
 TEST_F(TestConvertArrowSchema, ParquetFlatPrimitivesAsDictionaries) {
   std::vector<NodePtr> parquet_fields;
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index 7f0c5cb8102..5159d809e6b 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -2050,7 +2050,7 @@ Status TypedColumnWriterImpl<DType>::WriteArrowDense(
       WRITE_SERIALIZE_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryType, FLBAType)
       WRITE_SERIALIZE_CASE(DECIMAL128, Decimal128Type, FLBAType)
       WRITE_SERIALIZE_CASE(DECIMAL256, Decimal256Type, FLBAType)
-      WRITE_ZERO_COPY_CASE(HALF_FLOAT, HalfFloatType, FLBAType)
+      WRITE_SERIALIZE_CASE(HALF_FLOAT, FixedSizeBinaryType, FLBAType)
     default:
       break;
   }
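
Note (illustrative, not part of the patch series): the sketch below shows how a float16
column would be expected to round-trip through Parquet once these changes land, assuming
store_schema() is enabled so the reader can map the FIXED_LEN_BYTE_ARRAY(2) physical type
back to Arrow's float16. The function name RoundTripFloat16 and the "f16" field name are
hypothetical; all other calls are existing Arrow/Parquet C++ APIs.

// Illustrative sketch only; behaviour for float16 depends on this patch series.
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/arrow/writer.h>
#include <parquet/properties.h>

#include <iostream>
#include <memory>
#include <vector>

arrow::Status RoundTripFloat16() {
  // Build a float16 array from raw uint16_t bit patterns (HalfFloatBuilder
  // stores the 16-bit payload directly).
  arrow::HalfFloatBuilder builder;
  std::vector<uint16_t> values = {1, 2, 3, 4};
  ARROW_RETURN_NOT_OK(builder.AppendValues(values));
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> array, builder.Finish());

  auto schema = arrow::schema({arrow::field("f16", arrow::float16())});
  auto table = arrow::Table::Make(schema, {array});

  // Write with store_schema() so the original Arrow type is recorded in the
  // Parquet file metadata.
  ARROW_ASSIGN_OR_RAISE(auto sink, arrow::io::BufferOutputStream::Create());
  auto arrow_props = parquet::ArrowWriterProperties::Builder().store_schema()->build();
  ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(*table, arrow::default_memory_pool(),
                                                 sink, /*chunk_size=*/1024,
                                                 parquet::default_writer_properties(),
                                                 arrow_props));
  ARROW_ASSIGN_OR_RAISE(auto buffer, sink->Finish());

  // Read back: with the stored schema the column should come back as float16;
  // without it the reader would only see the Parquet storage type.
  std::unique_ptr<parquet::arrow::FileReader> reader;
  ARROW_RETURN_NOT_OK(parquet::arrow::OpenFile(
      std::make_shared<arrow::io::BufferReader>(buffer), arrow::default_memory_pool(),
      &reader));
  std::shared_ptr<arrow::Table> out;
  ARROW_RETURN_NOT_OK(reader->ReadTable(&out));
  std::cout << out->schema()->ToString() << std::endl;
  return arrow::Status::OK();
}

int main() { return RoundTripFloat16().ok() ? 0 : 1; }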