25 changes: 25 additions & 0 deletions cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -1279,6 +1279,31 @@ TEST_F(TestDurationParquetIO, Roundtrip) {
this->RoundTripSingleColumn(duration_arr, duration_arr, arrow_properties);
}

using TestHalfFloatParquetIO = TestParquetIO<::arrow::HalfFloatType>;

TEST_F(TestHalfFloatParquetIO, Roundtrip) {
std::vector<bool> is_valid = {true, true, false, true};
// TODO How to test with a Binary vector?
std::vector<uint16_t> values = {1, 2, 3, 4};

std::shared_ptr<Array> int_array, half_float_arr;
::arrow::ArrayFromVector<::arrow::UInt16Type, uint16_t>(::arrow::uint16(), is_valid,
values, &int_array);
::arrow::ArrayFromVector<::arrow::HalfFloatType, uint16_t>(::arrow::float16(), is_valid,
values, &half_float_arr);

// When the original Arrow schema isn't stored, a HalfFloat comes back as Binary (how it
// is stored in Parquet)
this->RoundTripSingleColumn(half_float_arr, int_array,
default_arrow_writer_properties());

// When the original arrow schema is stored, the HalfFloat array type should be
// preserved
const auto arrow_properties =
::parquet::ArrowWriterProperties::Builder().store_schema()->build();
this->RoundTripSingleColumn(half_float_arr, half_float_arr, arrow_properties);
}

TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compatibility) {
// This also tests max_definition_level = 1
std::shared_ptr<Array> arr;
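As a side note on how the behavior exercised by this test surfaces through the public API: the sketch below is mine, not part of the diff. It writes a table containing a float16 column with store_schema() enabled, so the column reads back as HalfFloat rather than FixedSizeBinary. The function name, `table`, and `sink` are placeholders.

// Sketch, not part of the PR: opt in to schema preservation when writing.
#include "arrow/io/api.h"
#include "arrow/table.h"
#include "parquet/arrow/writer.h"

arrow::Status WriteHalfFloatTable(const std::shared_ptr<arrow::Table>& table,
                                  std::shared_ptr<arrow::io::OutputStream> sink) {
  // store_schema() embeds the serialized Arrow schema in the file metadata;
  // without it, readers can only infer fixed_size_binary(2) from the Parquet
  // physical type FIXED_LEN_BYTE_ARRAY(2).
  auto arrow_props =
      parquet::ArrowWriterProperties::Builder().store_schema()->build();
  return parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), sink,
                                    /*chunk_size=*/1024,
                                    parquet::default_writer_properties(),
                                    arrow_props);
}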
31 changes: 16 additions & 15 deletions cpp/src/parquet/arrow/arrow_schema_test.cc
@@ -909,21 +909,22 @@ TEST_F(TestConvertArrowSchema, ArrowFields) {
// ASSERT_NO_FATAL_FAILURE();
}

-TEST_F(TestConvertArrowSchema, ArrowNonconvertibleFields) {
-  struct FieldConstructionArguments {
-    std::string name;
-    std::shared_ptr<::arrow::DataType> datatype;
-  };
-
-  std::vector<FieldConstructionArguments> cases = {
-      {"float16", ::arrow::float16()},
-  };
-
-  for (const FieldConstructionArguments& c : cases) {
-    auto field = ::arrow::field(c.name, c.datatype);
-    ASSERT_RAISES(NotImplemented, ConvertSchema({field}));
-  }
-}
+// TODO
+//TEST_F(TestConvertArrowSchema, ArrowNonconvertibleFields) {
+//  struct FieldConstructionArguments {
+//    std::string name;
+//    std::shared_ptr<::arrow::DataType> datatype;
+//  };
+//
+//  std::vector<FieldConstructionArguments> cases = {
+//      {"float16", ::arrow::float16()},
+//  };
+//
+//  for (const FieldConstructionArguments& c : cases) {
+//    auto field = ::arrow::field(c.name, c.datatype);
+//    ASSERT_RAISES(NotImplemented, ConvertSchema({field}));
+//  }
+//}

TEST_F(TestConvertArrowSchema, ParquetFlatPrimitivesAsDictionaries) {
std::vector<NodePtr> parquet_fields;
4 changes: 4 additions & 0 deletions cpp/src/parquet/arrow/reader_internal.cc
@@ -762,6 +762,10 @@ Status TransferColumnData(RecordReader* reader, const std::shared_ptr<Field>& value_field,
TRANSFER_INT32(TIME32, ::arrow::Time32Type);
TRANSFER_INT64(TIME64, ::arrow::Time64Type);
TRANSFER_INT64(DURATION, ::arrow::DurationType);
case ::arrow::Type::HALF_FLOAT: {
RETURN_NOT_OK(TransferBinary(reader, pool, value_field, &chunked_result));
result = chunked_result;
} break;
case ::arrow::Type::DATE64:
RETURN_NOT_OK(TransferDate64(reader, pool, value_field, &result));
break;
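Why transferring the raw bytes is enough: each FIXED_LEN_BYTE_ARRAY(2) value is already the IEEE 754 binary16 bit pattern that Arrow's HalfFloatType models as a uint16_t, so once the field type is fixed up (see the schema.cc change below) no value conversion is needed. A minimal sketch of that equivalence (mine, not from the diff), assuming the bytes were written in native byte order, which is what reinterpreting the uint16_t buffer on the write side produces:

// Sketch, not part of the PR: the 2-byte FLBA payload is the half-float's
// bit pattern; copying it into a uint16_t recovers the Arrow representation.
#include <cstdint>
#include <cstring>

uint16_t HalfFloatBits(const uint8_t* flba_bytes) {
  uint16_t bits;
  std::memcpy(&bits, flba_bytes, sizeof(bits));  // assumes native-endian bytes
  return bits;
}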
11 changes: 11 additions & 0 deletions cpp/src/parquet/arrow/schema.cc
@@ -390,6 +390,11 @@ Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
case ArrowTypeId::DURATION:
type = ParquetType::INT64;
break;
case ArrowTypeId::HALF_FLOAT: {
type = ParquetType::FIXED_LEN_BYTE_ARRAY;
// a HALF_FLOAT is stored as a 2-byte fixed-length byte array
length = 2;
} break;
case ArrowTypeId::STRUCT: {
auto struct_type = std::static_pointer_cast<::arrow::StructType>(field->type());
return StructToNode(struct_type, name, field->nullable(), properties,
@@ -926,6 +931,12 @@ Result<bool> ApplyOriginalStorageMetadata(const Field& origin_field,
modified = true;
}

if (origin_type->id() == ::arrow::Type::HALF_FLOAT &&
inferred_type->id() == ::arrow::Type::FIXED_SIZE_BINARY) {
inferred->field = inferred->field->WithType(origin_type);
modified = true;
}

if (origin_type->id() == ::arrow::Type::DICTIONARY &&
inferred_type->id() != ::arrow::Type::DICTIONARY &&
IsDictionaryReadSupported(*inferred_type)) {
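Concretely, the FieldToNode change maps an Arrow float16 field onto a plain FIXED_LEN_BYTE_ARRAY(2) primitive node with no converted-type annotation (Parquet offers no half-float logical type here), which is why the reverse mapping above needs the stored Arrow schema to distinguish it from ordinary fixed-size binary. A sketch (mine, not from the diff) of the equivalent node built directly against the schema API; the field name is illustrative:

// Sketch, not part of the PR: the node FieldToNode effectively produces for a
// nullable float16 field.
#include "parquet/schema.h"

parquet::schema::NodePtr MakeHalfFloatNode() {
  return parquet::schema::PrimitiveNode::Make(
      "f16", parquet::Repetition::OPTIONAL,
      parquet::Type::FIXED_LEN_BYTE_ARRAY,
      parquet::ConvertedType::NONE, /*length=*/2);
}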
1 change: 1 addition & 0 deletions cpp/src/parquet/column_writer.cc
@@ -2050,6 +2050,7 @@ Status TypedColumnWriterImpl<FLBAType>::WriteArrowDense(
WRITE_SERIALIZE_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryType, FLBAType)
WRITE_SERIALIZE_CASE(DECIMAL128, Decimal128Type, FLBAType)
WRITE_SERIALIZE_CASE(DECIMAL256, Decimal256Type, FLBAType)
WRITE_SERIALIZE_CASE(HALF_FLOAT, FixedSizeBinaryType, FLBAType)
Review comment (Member):
This would resolve the segfault but doesn't really feel elegant.

Suggested change
WRITE_SERIALIZE_CASE(HALF_FLOAT, FixedSizeBinaryType, FLBAType)
case ::arrow::Type::HALF_FLOAT: {
auto array_data = array.data();
const auto& arr = ::arrow::FixedSizeBinaryArray(
::arrow::fixed_size_binary(2), array.length(), array_data->buffers[1],
array_data->buffers[0], array.null_count(), array.offset());
return WriteArrowSerialize<FLBAType, ::arrow::FixedSizeBinaryType>(
arr, num_levels, def_levels, rep_levels, ctx, this, maybe_parent_nulls);
}

default:
break;
}
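If I read the suggestion correctly, its point is to avoid type punning: rather than letting the WRITE_SERIALIZE_CASE macro treat the incoming HalfFloatArray as a FixedSizeBinaryArray, it builds a genuine FixedSizeBinaryArray view over the same validity (buffers[0]) and data (buffers[1]) buffers. Since every half-float value is exactly two bytes, the view is zero-copy, and the existing FLBA serialization path then operates on an array whose type really is fixed_size_binary(2).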
1 change: 1 addition & 0 deletions python/pyarrow/tests/parquet/common.py
@@ -166,6 +166,7 @@ def alltypes_sample(size=10000, seed=0, categorical=False):
'int16': np.arange(size, dtype=np.int16),
'int32': np.arange(size, dtype=np.int32),
'int64': np.arange(size, dtype=np.int64),
'float16': np.arange(size, dtype=np.float16),
'float32': np.arange(size, dtype=np.float32),
'float64': np.arange(size, dtype=np.float64),
'bool': np.random.randn(size) > 0,
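For completeness, a read-side sketch (mine, not part of the diff) showing how the round trip surfaces to users: with store_schema() the resulting column type is halffloat; without it, the same bytes come back as fixed_size_binary(2). `infile` is assumed to be an open handle on a file written as above.

// Sketch, not part of the PR: reading the file back through parquet::arrow.
#include "arrow/io/api.h"
#include "arrow/table.h"
#include "parquet/arrow/reader.h"

arrow::Status ReadBack(std::shared_ptr<arrow::io::RandomAccessFile> infile,
                       std::shared_ptr<arrow::Table>* out) {
  std::unique_ptr<parquet::arrow::FileReader> reader;
  ARROW_RETURN_NOT_OK(parquet::arrow::OpenFile(std::move(infile),
                                               arrow::default_memory_pool(),
                                               &reader));
  return reader->ReadTable(out);
}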