25 changes: 25 additions & 0 deletions cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -1279,6 +1279,31 @@ TEST_F(TestDurationParquetIO, Roundtrip) {
this->RoundTripSingleColumn(duration_arr, duration_arr, arrow_properties);
}

using TestHalfFloatParquetIO = TestParquetIO<::arrow::HalfFloatType>;

TEST_F(TestHalfFloatParquetIO, Roundtrip) {
std::vector<bool> is_valid = {true, true, false, true};
// TODO How to test with a Binary vector?
std::vector<uint16_t> values = {1, 2, 3, 4};

std::shared_ptr<Array> int_array, half_float_arr;
::arrow::ArrayFromVector<::arrow::UInt16Type, uint16_t>(::arrow::uint16(), is_valid,
values, &int_array);
::arrow::ArrayFromVector<::arrow::HalfFloatType, uint16_t>(::arrow::float16(), is_valid,
values, &half_float_arr);

// When the original Arrow schema isn't stored, a HalfFloat comes back as Binary (how it
// is stored in Parquet)
this->RoundTripSingleColumn(half_float_arr, int_array,
default_arrow_writer_properties());

// When the original arrow schema is stored, the HalfFloat array type should be
// preserved
const auto arrow_properties =
::parquet::ArrowWriterProperties::Builder().store_schema()->build();
this->RoundTripSingleColumn(half_float_arr, half_float_arr, arrow_properties);
}

TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compatibility) {
// This also tests max_definition_level = 1
std::shared_ptr<Array> arr;
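As a side note on how the behavior exercised by this test surfaces through the public API: the sketch below is mine, not part of the diff. It writes a table containing a float16 column with store_schema() enabled, so the column reads back as HalfFloat rather than FixedSizeBinary. The function name, `table`, and `sink` are placeholders.

// Sketch, not part of the PR: opt in to schema preservation when writing.
#include "arrow/io/api.h"
#include "arrow/table.h"
#include "parquet/arrow/writer.h"

arrow::Status WriteHalfFloatTable(const std::shared_ptr<arrow::Table>& table,
                                  std::shared_ptr<arrow::io::OutputStream> sink) {
  // store_schema() embeds the serialized Arrow schema in the file metadata;
  // without it, readers can only infer fixed_size_binary(2) from the Parquet
  // physical type FIXED_LEN_BYTE_ARRAY(2).
  auto arrow_props =
      parquet::ArrowWriterProperties::Builder().store_schema()->build();
  return parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), sink,
                                    /*chunk_size=*/1024,
                                    parquet::default_writer_properties(),
                                    arrow_props);
}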
31 changes: 16 additions & 15 deletions cpp/src/parquet/arrow/arrow_schema_test.cc
@@ -909,21 +909,22 @@ TEST_F(TestConvertArrowSchema, ArrowFields) {
// ASSERT_NO_FATAL_FAILURE();
}

-TEST_F(TestConvertArrowSchema, ArrowNonconvertibleFields) {
-  struct FieldConstructionArguments {
-    std::string name;
-    std::shared_ptr<::arrow::DataType> datatype;
-  };
-
-  std::vector<FieldConstructionArguments> cases = {
-      {"float16", ::arrow::float16()},
-  };
-
-  for (const FieldConstructionArguments& c : cases) {
-    auto field = ::arrow::field(c.name, c.datatype);
-    ASSERT_RAISES(NotImplemented, ConvertSchema({field}));
-  }
-}
+// TODO
+//TEST_F(TestConvertArrowSchema, ArrowNonconvertibleFields) {
+//  struct FieldConstructionArguments {
+//    std::string name;
+//    std::shared_ptr<::arrow::DataType> datatype;
+//  };
+//
+//  std::vector<FieldConstructionArguments> cases = {
+//      {"float16", ::arrow::float16()},
+//  };
+//
+//  for (const FieldConstructionArguments& c : cases) {
+//    auto field = ::arrow::field(c.name, c.datatype);
+//    ASSERT_RAISES(NotImplemented, ConvertSchema({field}));
+//  }
+//}

TEST_F(TestConvertArrowSchema, ParquetFlatPrimitivesAsDictionaries) {
std::vector<NodePtr> parquet_fields;
4 changes: 4 additions & 0 deletions cpp/src/parquet/arrow/reader_internal.cc
@@ -762,6 +762,10 @@ Status TransferColumnData(RecordReader* reader, const std::shared_ptr<Field>& value_field,
TRANSFER_INT32(TIME32, ::arrow::Time32Type);
TRANSFER_INT64(TIME64, ::arrow::Time64Type);
TRANSFER_INT64(DURATION, ::arrow::DurationType);
case ::arrow::Type::HALF_FLOAT: {
RETURN_NOT_OK(TransferBinary(reader, pool, value_field, &chunked_result));
result = chunked_result;
} break;
case ::arrow::Type::DATE64:
RETURN_NOT_OK(TransferDate64(reader, pool, value_field, &result));
break;
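Why transferring the raw bytes is enough: each FIXED_LEN_BYTE_ARRAY(2) value is already the IEEE 754 binary16 bit pattern that Arrow's HalfFloatType models as a uint16_t, so once the field type is fixed up (see the schema.cc change below) no value conversion is needed. A minimal sketch of that equivalence (mine, not from the diff), assuming the bytes were written in native byte order, which is what reinterpreting the uint16_t buffer on the write side produces:

// Sketch, not part of the PR: the 2-byte FLBA payload is the half-float's
// bit pattern; copying it into a uint16_t recovers the Arrow representation.
#include <cstdint>
#include <cstring>

uint16_t HalfFloatBits(const uint8_t* flba_bytes) {
  uint16_t bits;
  std::memcpy(&bits, flba_bytes, sizeof(bits));  // assumes native-endian bytes
  return bits;
}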
11 changes: 11 additions & 0 deletions cpp/src/parquet/arrow/schema.cc
@@ -390,6 +390,11 @@ Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
case ArrowTypeId::DURATION:
type = ParquetType::INT64;
break;
case ArrowTypeId::HALF_FLOAT: {
type = ParquetType::FIXED_LEN_BYTE_ARRAY;
// a HALF_FLOAT is stored as a 2-byte fixed-length byte array
length = 2;
} break;
case ArrowTypeId::STRUCT: {
auto struct_type = std::static_pointer_cast<::arrow::StructType>(field->type());
return StructToNode(struct_type, name, field->nullable(), properties,
@@ -926,6 +931,12 @@ Result<bool> ApplyOriginalStorageMetadata(const Field& origin_field,
modified = true;
}

if (origin_type->id() == ::arrow::Type::HALF_FLOAT &&
inferred_type->id() == ::arrow::Type::FIXED_SIZE_BINARY) {
inferred->field = inferred->field->WithType(origin_type);
modified = true;
}

if (origin_type->id() == ::arrow::Type::DICTIONARY &&
inferred_type->id() != ::arrow::Type::DICTIONARY &&
IsDictionaryReadSupported(*inferred_type)) {
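Concretely, the FieldToNode change maps an Arrow float16 field onto a plain FIXED_LEN_BYTE_ARRAY(2) primitive node with no converted-type annotation (Parquet offers no half-float logical type here), which is why the reverse mapping above needs the stored Arrow schema to distinguish it from ordinary fixed-size binary. A sketch (mine, not from the diff) of the equivalent node built directly against the schema API; the field name is illustrative:

// Sketch, not part of the PR: the node FieldToNode effectively produces for a
// nullable float16 field.
#include "parquet/schema.h"

parquet::schema::NodePtr MakeHalfFloatNode() {
  return parquet::schema::PrimitiveNode::Make(
      "f16", parquet::Repetition::OPTIONAL,
      parquet::Type::FIXED_LEN_BYTE_ARRAY,
      parquet::ConvertedType::NONE, /*length=*/2);
}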
1 change: 1 addition & 0 deletions cpp/src/parquet/column_writer.cc
@@ -2050,6 +2050,7 @@ Status TypedColumnWriterImpl<FLBAType>::WriteArrowDense(
WRITE_SERIALIZE_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryType, FLBAType)
WRITE_SERIALIZE_CASE(DECIMAL128, Decimal128Type, FLBAType)
WRITE_SERIALIZE_CASE(DECIMAL256, Decimal256Type, FLBAType)
WRITE_SERIALIZE_CASE(HALF_FLOAT, FixedSizeBinaryType, FLBAType)
Review comment (Member):
This would resolve the segfault but doesn't really feel elegant.

Suggested change
WRITE_SERIALIZE_CASE(HALF_FLOAT, FixedSizeBinaryType, FLBAType)
case ::arrow::Type::HALF_FLOAT: {
auto array_data = array.data();
const auto& arr = ::arrow::FixedSizeBinaryArray(
::arrow::fixed_size_binary(2), array.length(), array_data->buffers[1],
array_data->buffers[0], array.null_count(), array.offset());
return WriteArrowSerialize<FLBAType, ::arrow::FixedSizeBinaryType>(
arr, num_levels, def_levels, rep_levels, ctx, this, maybe_parent_nulls);
}

default:
break;
}
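If I read the suggestion correctly, its point is to avoid type punning: rather than letting the WRITE_SERIALIZE_CASE macro treat the incoming HalfFloatArray as a FixedSizeBinaryArray, it builds a genuine FixedSizeBinaryArray view over the same validity (buffers[0]) and data (buffers[1]) buffers. Since every half-float value is exactly two bytes, the view is zero-copy, and the existing FLBA serialization path then operates on an array whose type really is fixed_size_binary(2).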
1 change: 1 addition & 0 deletions python/pyarrow/tests/parquet/common.py
@@ -166,6 +166,7 @@ def alltypes_sample(size=10000, seed=0, categorical=False):
'int16': np.arange(size, dtype=np.int16),
'int32': np.arange(size, dtype=np.int32),
'int64': np.arange(size, dtype=np.int64),
'float16': np.arange(size, dtype=np.float16),
'float32': np.arange(size, dtype=np.float32),
'float64': np.arange(size, dtype=np.float64),
'bool': np.random.randn(size) > 0,
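For completeness, a read-side sketch (mine, not part of the diff) showing how the round trip surfaces to users: with store_schema() the resulting column type is halffloat; without it, the same bytes come back as fixed_size_binary(2). `infile` is assumed to be an open handle on a file written as above.

// Sketch, not part of the PR: reading the file back through parquet::arrow.
#include "arrow/io/api.h"
#include "arrow/table.h"
#include "parquet/arrow/reader.h"

arrow::Status ReadBack(std::shared_ptr<arrow::io::RandomAccessFile> infile,
                       std::shared_ptr<arrow::Table>* out) {
  std::unique_ptr<parquet::arrow::FileReader> reader;
  ARROW_RETURN_NOT_OK(parquet::arrow::OpenFile(std::move(infile),
                                               arrow::default_memory_pool(),
                                               &reader));
  return reader->ReadTable(out);
}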