From 430ad81c2b563bc2d57e81bef76da0c4bddc95e8 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 29 Oct 2025 21:20:31 -0400 Subject: [PATCH 01/15] GH-47919: [C++] Update Meson config for C Data Interface changes (#47920) ### Rationale for this change This retains parity with the CMake configuration ### What changes are included in this PR? C Data Interface library is created as a standalone ### Are these changes tested? No (not possible with current CI) ### Are there any user-facing changes? No * GitHub Issue: #47919 Authored-by: Will Ayd Signed-off-by: Sutou Kouhei --- cpp/src/arrow/integration/meson.build | 6 ++++++ cpp/src/arrow/meson.build | 1 - cpp/src/arrow/util/meson.build | 4 ++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/integration/meson.build b/cpp/src/arrow/integration/meson.build index edc9fdf724f..6437c380bb3 100644 --- a/cpp/src/arrow/integration/meson.build +++ b/cpp/src/arrow/integration/meson.build @@ -23,6 +23,12 @@ exc = executable( dependencies: [arrow_test_dep_no_main, rapidjson_dep, gflags_dep], ) +arrow_c_data_integration_lib = library( + 'arrow_c_data_integration', + sources: ['c_data_integration_internal.cc'], + dependencies: [arrow_test_dep_no_main], +) + if needs_tests test('arrow-json-integration-test', exc) endif diff --git a/cpp/src/arrow/meson.build b/cpp/src/arrow/meson.build index fb7a7c2830f..703d5976aeb 100644 --- a/cpp/src/arrow/meson.build +++ b/cpp/src/arrow/meson.build @@ -280,7 +280,6 @@ if needs_integration or needs_tests arrow_components += { 'arrow_integration': { 'sources': [ - 'integration/c_data_integration_internal.cc', 'integration/json_integration.cc', 'integration/json_internal.cc', ], diff --git a/cpp/src/arrow/util/meson.build b/cpp/src/arrow/util/meson.build index cd92f167a95..d13a4bb8a96 100644 --- a/cpp/src/arrow/util/meson.build +++ b/cpp/src/arrow/util/meson.build @@ -253,7 +253,10 @@ util_tests = { 'sources': [ 'bit_block_counter_test.cc', 'bit_util_test.cc', + 
'bitmap_test.cc', + 'bpacking_test.cc', 'rle_encoding_test.cc', + 'test_common.cc', ], }, 'arrow-threading-utility-test': { @@ -283,6 +286,7 @@ util_benchmarks = [ 'bit_block_counter', 'bit_util', 'bitmap_reader', + 'bpacking', 'cache', 'compression', 'decimal', From 2e46c05b8adf69468bb09ea8578c7b783bc50b3a Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 31 Oct 2025 00:26:56 +0900 Subject: [PATCH 02/15] GH-47945: [C++] Add support for Boost 1.89.0 and require Boost 1.69 or later (#47947) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Boost 1.89.0 removed backward compatibility CMake package for Boost.System. Boost.System is header-only since Boost 1.69. So CMake package for Boost.System isn't needed but it's provided for backward compatibility. CentOS 7 uses Boost 1.69. So we can drop support for Boost 1.68 or older. ### What changes are included in this PR? * Require Boost 1.69 or later. * Remove `system` from `COMPONENTS` because Boost.System is header-only. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. 
* GitHub Issue: #47945 Authored-by: Sutou Kouhei Signed-off-by: RaΓΊl Cumplido --- ci/conda_env_cpp.txt | 2 +- cpp/cmake_modules/ThirdpartyToolchain.cmake | 17 ++++++++++------- cpp/src/arrow/filesystem/CMakeLists.txt | 5 ++--- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/ci/conda_env_cpp.txt b/ci/conda_env_cpp.txt index 6e23e920a40..18d58f7bb2d 100644 --- a/ci/conda_env_cpp.txt +++ b/ci/conda_env_cpp.txt @@ -22,7 +22,6 @@ azure-storage-blobs-cpp>=12.10.0 azure-storage-common-cpp>=12.5.0 azure-storage-files-datalake-cpp>=12.9.0 benchmark>=1.6.0,!=1.8.4 -boost-cpp>=1.68.0 brotli bzip2 c-ares @@ -32,6 +31,7 @@ glog gmock>=1.10.0 google-cloud-cpp>=1.34.0 gtest>=1.10.0 +libboost-devel libgrpc libprotobuf libutf8proc diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 232ee64d9f8..1724c0d3a3d 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -1191,10 +1191,9 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION # GH-34094 Older versions of Boost use the deprecated std::unary_function in # boost/container_hash/hash.hpp and support for that was removed in clang 16 set(ARROW_BOOST_REQUIRED_VERSION "1.81") -elseif(ARROW_BUILD_TESTS) - set(ARROW_BOOST_REQUIRED_VERSION "1.64") else() - set(ARROW_BOOST_REQUIRED_VERSION "1.58") + # CentOS 7 uses Boost 1.69. + set(ARROW_BOOST_REQUIRED_VERSION "1.69") endif() set(Boost_USE_MULTITHREADED ON) @@ -1202,7 +1201,14 @@ if(MSVC AND ARROW_USE_STATIC_CRT) set(Boost_USE_STATIC_RUNTIME ON) endif() # CMake 3.25.0 has 1.80 and older versions. +# +# We can remove this once we require CMake 3.30.0 or later because we +# enable CMP0167 "The FindBoost module is removed." +# https://cmake.org/cmake/help/latest/policy/CMP0167.html with CMake +# 3.30.0 or later. 
set(Boost_ADDITIONAL_VERSIONS + "1.89.0" + "1.89" "1.88.0" "1.88" "1.87.0" @@ -1273,7 +1279,7 @@ if(ARROW_USE_BOOST) set(Boost_USE_STATIC_LIBS ON) endif() if(ARROW_BOOST_REQUIRE_LIBRARY) - set(ARROW_BOOST_COMPONENTS filesystem system) + set(ARROW_BOOST_COMPONENTS filesystem) if(ARROW_FLIGHT_SQL_ODBC) list(APPEND ARROW_BOOST_COMPONENTS locale) endif() @@ -1327,9 +1333,6 @@ if(ARROW_USE_BOOST) if(TARGET Boost::filesystem) target_link_libraries(arrow::Boost::process INTERFACE Boost::filesystem) endif() - if(TARGET Boost::system) - target_link_libraries(arrow::Boost::process INTERFACE Boost::system) - endif() if(TARGET Boost::headers) target_link_libraries(arrow::Boost::process INTERFACE Boost::headers) endif() diff --git a/cpp/src/arrow/filesystem/CMakeLists.txt b/cpp/src/arrow/filesystem/CMakeLists.txt index 5250ed2a887..9c5e655f6cc 100644 --- a/cpp/src/arrow/filesystem/CMakeLists.txt +++ b/cpp/src/arrow/filesystem/CMakeLists.txt @@ -132,12 +132,11 @@ if(ARROW_S3) DEFINITIONS ARROW_S3_LIBPATH="$" EXTRA_LINK_LIBS - Boost::filesystem - Boost::system) + Boost::filesystem) target_compile_definitions(arrow-filesystem-test PUBLIC ARROW_S3_LIBPATH="$") target_sources(arrow-filesystem-test PUBLIC s3fs_module_test.cc s3_test_util.cc) - target_link_libraries(arrow-filesystem-test PUBLIC Boost::filesystem Boost::system) + target_link_libraries(arrow-filesystem-test PUBLIC Boost::filesystem) endif() endif() From 055c2f4e91c63593aacab38250ac9da899cabb31 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Fri, 31 Oct 2025 21:08:36 +1300 Subject: [PATCH 03/15] GH-47973: [C++][Parquet] Fix invalid Parquet files written when dictionary encoded pages are large (#47998) ### Rationale for this change Prevents silently writing invalid data when using dictionary encoding and the number of bits in the estimated max buffer size is greater than the max int32 value. 
Also fixes an overflow resulting in a "Negative buffer resize" error if the buffer size in bytes is greater than max int32, and instead throw a more helpful exception. ### What changes are included in this PR? * Fix overflow when computing the bit position in `BitWriter::PutValue`. This overflow would cause the method to return without writing data, and the return value is only checked in debug builds. * Change buffer size calculations to use int64 and check for overflow before casting to int ### Are these changes tested? Yes, I've added unit tests for both issues. These require enabling `ARROW_LARGE_MEMORY_TESTS` as they allocate a lot of memory. ### Are there any user-facing changes? **This PR contains a "Critical Fix".** This fixes a bug where invalid Parquet files can be silently written when the buffer size for dictionary indices is large. * GitHub Issue: #47973 Authored-by: Adam Reeve Signed-off-by: Antoine Pitrou --- .../arrow/util/bit_stream_utils_internal.h | 6 +- cpp/src/arrow/util/rle_encoding_internal.h | 28 +++-- cpp/src/arrow/util/rle_encoding_test.cc | 5 +- cpp/src/parquet/column_writer.cc | 13 ++- cpp/src/parquet/column_writer_test.cc | 107 ++++++++++++++++++ cpp/src/parquet/encoder.cc | 32 ++++-- 6 files changed, 158 insertions(+), 33 deletions(-) diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h index 1057a0bf381..d8c7317fe8a 100644 --- a/cpp/src/arrow/util/bit_stream_utils_internal.h +++ b/cpp/src/arrow/util/bit_stream_utils_internal.h @@ -58,7 +58,7 @@ class BitWriter { int buffer_len() const { return max_bytes_; } /// Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit - /// packed. Returns false if there was not enough space. num_bits must be <= 32. + /// packed. Returns false if there was not enough space. num_bits must be <= 64. bool PutValue(uint64_t v, int num_bits); /// Writes v to the next aligned byte using num_bytes. 
If T is larger than @@ -197,7 +197,9 @@ inline bool BitWriter::PutValue(uint64_t v, int num_bits) { ARROW_DCHECK_EQ(v >> num_bits, 0) << "v = " << v << ", num_bits = " << num_bits; } - if (ARROW_PREDICT_FALSE(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8)) + if (ARROW_PREDICT_FALSE(static_cast(byte_offset_) * 8 + bit_offset_ + + num_bits > + static_cast(max_bytes_) * 8)) return false; buffered_values_ |= v << bit_offset_; diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index 6b2782da315..50193d8903d 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -513,7 +513,7 @@ class RleBitPackedEncoder { : bit_width_(bit_width), bit_writer_(buffer, buffer_len) { ARROW_DCHECK_GE(bit_width_, 0); ARROW_DCHECK_LE(bit_width_, 64); - max_run_byte_size_ = MinBufferSize(bit_width); + max_run_byte_size_ = static_cast(MinBufferSize(bit_width)); ARROW_DCHECK_GE(buffer_len, max_run_byte_size_) << "Input buffer not big enough."; Clear(); } @@ -521,32 +521,30 @@ class RleBitPackedEncoder { /// Returns the minimum buffer size needed to use the encoder for 'bit_width' /// This is the maximum length of a single run for 'bit_width'. /// It is not valid to pass a buffer less than this length. - static int MinBufferSize(int bit_width) { + static int64_t MinBufferSize(int bit_width) { // 1 indicator byte and MAX_VALUES_PER_LITERAL_RUN 'bit_width' values. - int max_literal_run_size = 1 + static_cast(::arrow::bit_util::BytesForBits( - MAX_VALUES_PER_LITERAL_RUN * bit_width)); + int64_t max_literal_run_size = + 1 + ::arrow::bit_util::BytesForBits(MAX_VALUES_PER_LITERAL_RUN * bit_width); // Up to kMaxVlqByteLength indicator and a single 'bit_width' value. 
- int max_repeated_run_size = - bit_util::kMaxLEB128ByteLenFor + - static_cast(::arrow::bit_util::BytesForBits(bit_width)); + int64_t max_repeated_run_size = bit_util::kMaxLEB128ByteLenFor + + ::arrow::bit_util::BytesForBits(bit_width); return std::max(max_literal_run_size, max_repeated_run_size); } /// Returns the maximum byte size it could take to encode 'num_values'. - static int MaxBufferSize(int bit_width, int num_values) { + static int64_t MaxBufferSize(int bit_width, int64_t num_values) { // For a bit_width > 1, the worst case is the repetition of "literal run of length 8 // and then a repeated run of length 8". // 8 values per smallest run, 8 bits per byte - int bytes_per_run = bit_width; - int num_runs = static_cast(::arrow::bit_util::CeilDiv(num_values, 8)); - int literal_max_size = num_runs + num_runs * bytes_per_run; + int64_t bytes_per_run = bit_width; + int64_t num_runs = ::arrow::bit_util::CeilDiv(num_values, 8); + int64_t literal_max_size = num_runs + num_runs * bytes_per_run; // In the very worst case scenario, the data is a concatenation of repeated // runs of 8 values. 
Repeated run has a 1 byte varint followed by the // bit-packed repeated value - int min_repeated_run_size = - 1 + static_cast(::arrow::bit_util::BytesForBits(bit_width)); - int repeated_max_size = num_runs * min_repeated_run_size; + int64_t min_repeated_run_size = 1 + ::arrow::bit_util::BytesForBits(bit_width); + int64_t repeated_max_size = num_runs * min_repeated_run_size; return std::max(literal_max_size, repeated_max_size); } @@ -1432,7 +1430,7 @@ inline int RleBitPackedEncoder::Flush() { } inline void RleBitPackedEncoder::CheckBufferFull() { - int bytes_written = bit_writer_.bytes_written(); + int64_t bytes_written = bit_writer_.bytes_written(); if (bytes_written + max_run_byte_size_ > bit_writer_.buffer_len()) { buffer_full_ = true; } diff --git a/cpp/src/arrow/util/rle_encoding_test.cc b/cpp/src/arrow/util/rle_encoding_test.cc index f3a14af4412..453fa78ea48 100644 --- a/cpp/src/arrow/util/rle_encoding_test.cc +++ b/cpp/src/arrow/util/rle_encoding_test.cc @@ -813,7 +813,7 @@ TEST(BitRle, RepeatedPattern) { TEST(BitRle, Overflow) { for (int bit_width = 1; bit_width < 32; bit_width += 3) { - int len = RleBitPackedEncoder::MinBufferSize(bit_width); + int len = static_cast(RleBitPackedEncoder::MinBufferSize(bit_width)); std::vector buffer(len); int num_added = 0; bool parity = true; @@ -861,7 +861,8 @@ void CheckRoundTrip(const Array& data, int bit_width, bool spaced, int32_t parts const int data_size = static_cast(data.length()); const int data_values_count = static_cast(data.length() - spaced * data.null_count()); - const int buffer_size = RleBitPackedEncoder::MaxBufferSize(bit_width, data_size); + const int buffer_size = + static_cast(RleBitPackedEncoder::MaxBufferSize(bit_width, data_size)); ASSERT_GE(parts, 1); ASSERT_LE(parts, data_size); diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 22c36531cdb..94b67dfa807 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -185,7 +185,7 @@ void 
LevelEncoder::Init(Encoding::type encoding, int16_t max_level, int LevelEncoder::MaxBufferSize(Encoding::type encoding, int16_t max_level, int num_buffered_values) { int bit_width = bit_util::Log2(max_level + 1); - int num_bytes = 0; + int64_t num_bytes = 0; switch (encoding) { case Encoding::RLE: { // TODO: Due to the way we currently check if the buffer is full enough, @@ -195,14 +195,19 @@ int LevelEncoder::MaxBufferSize(Encoding::type encoding, int16_t max_level, break; } case Encoding::BIT_PACKED: { - num_bytes = - static_cast(bit_util::BytesForBits(num_buffered_values * bit_width)); + num_bytes = bit_util::BytesForBits(num_buffered_values * bit_width); break; } default: throw ParquetException("Unknown encoding type for levels."); } - return num_bytes; + if (num_bytes > std::numeric_limits::max()) { + std::stringstream ss; + ss << "Maximum buffer size for LevelEncoder (" << num_bytes + << ") is greater than the maximum int32 value"; + throw ParquetException(ss.str()); + } + return static_cast(num_bytes); } int LevelEncoder::Encode(int batch_size, const int16_t* levels) { diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 48cac04f071..dedf25abcab 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -1034,6 +1034,113 @@ TEST_F(TestValuesWriterInt32Type, PagesSplitWithListAlignedWrites) { ASSERT_EQ(values_out_, values_); } +// Test writing a dictionary encoded page where the number of +// bits is greater than max int32. 
+// For https://github.com/apache/arrow/issues/47973 +TEST(TestColumnWriter, LARGE_MEMORY_TEST(WriteLargeDictEncodedPage)) { + auto sink = CreateOutputStream(); + auto schema = std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, + { + PrimitiveNode::Make("item", Repetition::REQUIRED, Type::INT32), + })); + auto properties = + WriterProperties::Builder().data_pagesize(1024 * 1024 * 1024)->build(); + auto file_writer = ParquetFileWriter::Open(sink, schema, properties); + auto rg_writer = file_writer->AppendRowGroup(); + + constexpr int64_t num_batches = 150; + constexpr int64_t batch_size = 1'000'000; + constexpr int64_t unique_count = 200'000; + static_assert(batch_size % unique_count == 0); + + std::vector values(batch_size, 0); + for (int64_t i = 0; i < batch_size; i++) { + values[i] = static_cast(i % unique_count); + } + + auto col_writer = dynamic_cast(rg_writer->NextColumn()); + for (int64_t i = 0; i < num_batches; i++) { + col_writer->WriteBatch(batch_size, nullptr, nullptr, values.data()); + } + file_writer->Close(); + + ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); + + auto file_reader = ParquetFileReader::Open( + std::make_shared<::arrow::io::BufferReader>(buffer), default_reader_properties()); + auto metadata = file_reader->metadata(); + ASSERT_EQ(1, metadata->num_row_groups()); + auto row_group_reader = file_reader->RowGroup(0); + + // Verify page size property was applied and only 1 data page was written + auto page_reader = row_group_reader->GetColumnPageReader(0); + int64_t page_count = 0; + while (true) { + auto page = page_reader->NextPage(); + if (page == nullptr) { + break; + } + if (page_count == 0) { + ASSERT_EQ(page->type(), PageType::DICTIONARY_PAGE); + } else { + ASSERT_EQ(page->type(), PageType::DATA_PAGE); + } + page_count++; + } + ASSERT_EQ(page_count, 2); + + auto col_reader = std::static_pointer_cast(row_group_reader->Column(0)); + + constexpr int64_t buffer_size = 1024 * 1024; + 
values.resize(buffer_size); + + // Verify values were round-tripped correctly + int64_t levels_read = 0; + while (levels_read < num_batches * batch_size) { + int64_t batch_values; + int64_t batch_levels = col_reader->ReadBatch(buffer_size, nullptr, nullptr, + values.data(), &batch_values); + for (int64_t i = 0; i < batch_levels; i++) { + ASSERT_EQ(values[i], (levels_read + i) % unique_count); + } + levels_read += batch_levels; + } +} + +TEST(TestColumnWriter, LARGE_MEMORY_TEST(ThrowsOnDictIndicesTooLarge)) { + auto sink = CreateOutputStream(); + auto schema = std::static_pointer_cast( + GroupNode::Make("schema", Repetition::REQUIRED, + { + PrimitiveNode::Make("item", Repetition::REQUIRED, Type::INT32), + })); + auto properties = + WriterProperties::Builder().data_pagesize(4 * 1024LL * 1024 * 1024)->build(); + auto file_writer = ParquetFileWriter::Open(sink, schema, properties); + auto rg_writer = file_writer->AppendRowGroup(); + + constexpr int64_t num_batches = 1'000; + constexpr int64_t batch_size = 1'000'000; + constexpr int64_t unique_count = 200'000; + static_assert(batch_size % unique_count == 0); + + std::vector values(batch_size, 0); + for (int64_t i = 0; i < batch_size; i++) { + values[i] = static_cast(i % unique_count); + } + + auto col_writer = dynamic_cast(rg_writer->NextColumn()); + for (int64_t i = 0; i < num_batches; i++) { + col_writer->WriteBatch(batch_size, nullptr, nullptr, values.data()); + } + + EXPECT_THROW_THAT( + [&]() { file_writer->Close(); }, ParquetException, + ::testing::Property(&ParquetException::what, + ::testing::HasSubstr("exceeds maximum int value"))); +} + TEST(TestPageWriter, ThrowsOnPagesTooLarge) { NodePtr item = schema::Int32("item"); // optional item NodePtr list(GroupNode::Make("b", Repetition::REPEATED, {item}, ConvertedType::LIST)); diff --git a/cpp/src/parquet/encoder.cc b/cpp/src/parquet/encoder.cc index f9367555d97..04f079ce70c 100644 --- a/cpp/src/parquet/encoder.cc +++ b/cpp/src/parquet/encoder.cc @@ -438,7 +438,7 @@ 
struct DictEncoderTraits { // Initially 1024 elements static constexpr int32_t kInitialHashTableSize = 1 << 10; -int RlePreserveBufferSize(int num_values, int bit_width) { +int64_t RlePreserveBufferSize(int64_t num_values, int bit_width) { // Note: because of the way RleEncoder::CheckBufferFull() // is called, we have to reserve an extra "RleEncoder::MinBufferSize" // bytes. These extra bytes won't be used but not reserving them @@ -496,7 +496,8 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { /// indices. Used to size the buffer passed to WriteIndices(). int64_t EstimatedDataEncodedSize() override { return kDataPageBitWidthBytes + - RlePreserveBufferSize(static_cast(buffered_indices_.size()), bit_width()); + RlePreserveBufferSize(static_cast(buffered_indices_.size()), + bit_width()); } /// The minimum bit width required to encode the currently buffered indices. @@ -582,10 +583,15 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder { } std::shared_ptr FlushValues() override { - std::shared_ptr buffer = - AllocateBuffer(this->pool_, EstimatedDataEncodedSize()); - int result_size = WriteIndices(buffer->mutable_data(), - static_cast(EstimatedDataEncodedSize())); + const int64_t buffer_size = EstimatedDataEncodedSize(); + if (buffer_size > std::numeric_limits::max()) { + std::stringstream ss; + ss << "Buffer size for DictEncoder (" << buffer_size + << ") exceeds maximum int value"; + throw ParquetException(ss.str()); + } + std::shared_ptr buffer = AllocateBuffer(this->pool_, buffer_size); + int result_size = WriteIndices(buffer->mutable_data(), static_cast(buffer_size)); PARQUET_THROW_NOT_OK(buffer->Resize(result_size, false)); return buffer; } @@ -1690,8 +1696,8 @@ class RleBooleanEncoder final : public EncoderImpl, virtual public BooleanEncode template void PutImpl(const SequenceType& src, int num_values); - int MaxRleBufferSize() const noexcept { - return 
RlePreserveBufferSize(static_cast(buffered_append_values_.size()), + int64_t MaxRleBufferSize() const noexcept { + return RlePreserveBufferSize(static_cast(buffered_append_values_.size()), kBitWidth); } @@ -1719,11 +1725,17 @@ void RleBooleanEncoder::PutImpl(const SequenceType& src, int num_values) { } std::shared_ptr RleBooleanEncoder::FlushValues() { - int rle_buffer_size_max = MaxRleBufferSize(); + int64_t rle_buffer_size_max = MaxRleBufferSize(); + if (rle_buffer_size_max > std::numeric_limits::max()) { + std::stringstream ss; + ss << "Buffer size for RleBooleanEncoder (" << rle_buffer_size_max + << ") exceeds maximum int value"; + throw ParquetException(ss.str()); + } std::shared_ptr buffer = AllocateBuffer(this->pool_, rle_buffer_size_max + kRleLengthInBytes); ::arrow::util::RleBitPackedEncoder encoder(buffer->mutable_data() + kRleLengthInBytes, - rle_buffer_size_max, + static_cast(rle_buffer_size_max), /*bit_width*/ kBitWidth); for (bool value : buffered_append_values_) { From 65f77875223f3fa5948c471ff462df2c9bb23702 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Fri, 31 Oct 2025 09:11:47 +0100 Subject: [PATCH 04/15] GH-47955: [C++][Parquet] Support reading INT-encoded Decimal stats as Arrow scalar (#48001) ### Rationale for this change The `StatisticsAsScalars` function, which allows converting Parquet statistics (min/max values) for a given logical type into Arrow scalars, did not support DECIMAL columns with physical type INT32 or INT64. ### Are these changes tested? Yes, by expanded unit test. ### Are there any user-facing changes? No, just a bug fix. 
* GitHub Issue: #47955 Authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .../parquet/arrow/arrow_reader_writer_test.cc | 59 +++++---- cpp/src/parquet/arrow/reader_internal.cc | 117 +++++++++++------- 2 files changed, 105 insertions(+), 71 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index e081b428e24..cd69b2f9469 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -400,16 +400,23 @@ using ParquetDataType = PhysicalType::parquet_enum>; template using ParquetWriter = TypedColumnWriter>; +Result> WriteTableToBuffer( + const std::shared_ptr& table, int64_t row_group_size, + const std::shared_ptr& properties = default_writer_properties(), + const std::shared_ptr& arrow_properties = + default_arrow_writer_properties()) { + auto sink = CreateOutputStream(); + ARROW_RETURN_NOT_OK(WriteTable(*table, ::arrow::default_memory_pool(), sink, + row_group_size, properties, arrow_properties)); + return sink->Finish(); +} + void WriteTableToBuffer(const std::shared_ptr
& table, int64_t row_group_size, const std::shared_ptr& arrow_properties, std::shared_ptr* out) { - auto sink = CreateOutputStream(); - auto write_props = WriterProperties::Builder().write_batch_size(100)->build(); - - ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), sink, - row_group_size, write_props, arrow_properties)); - ASSERT_OK_AND_ASSIGN(*out, sink->Finish()); + ASSERT_OK_AND_ASSIGN( + *out, WriteTableToBuffer(table, row_group_size, write_props, arrow_properties)); } void DoRoundtrip(const std::shared_ptr
& table, int64_t row_group_size, @@ -3101,27 +3108,33 @@ TEST(ArrowReadWrite, DecimalStats) { using ::arrow::Decimal128; using ::arrow::field; - auto type = ::arrow::decimal128(/*precision=*/8, /*scale=*/0); - - const char* json = R"(["255", "128", null, "0", "1", "-127", "-128", "-129", "-255"])"; - auto array = ::arrow::ArrayFromJSON(type, json); - auto table = ::arrow::Table::Make(::arrow::schema({field("root", type)}), {array}); + // Try various precisions to trigger encoding as different physical types: + // - precision 8 should use INT32 + // - precision 18 should use INT64 + // - precision 35 should use FIXED_LEN_BYTE_ARRAY + for (const int precision : {8, 18, 35}) { + auto type = ::arrow::decimal128(precision, /*scale=*/0); - std::shared_ptr buffer; - ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(table, /*row_group_size=*/100, - default_arrow_writer_properties(), &buffer)); + const char* json = + R"(["255", "128", null, "0", "1", "-127", "-128", "-129", "-255"])"; + auto array = ::arrow::ArrayFromJSON(type, json); + auto table = ::arrow::Table::Make(::arrow::schema({field("root", type)}), {array}); - ASSERT_OK_AND_ASSIGN(auto reader, OpenFile(std::make_shared(buffer), - ::arrow::default_memory_pool())); + auto props = WriterProperties::Builder().enable_store_decimal_as_integer()->build(); + ASSERT_OK_AND_ASSIGN(auto buffer, + WriteTableToBuffer(table, /*row_group_size=*/100, props)); + ASSERT_OK_AND_ASSIGN(auto reader, OpenFile(std::make_shared(buffer), + ::arrow::default_memory_pool())); - std::shared_ptr min, max; - ReadSingleColumnFileStatistics(std::move(reader), &min, &max); + std::shared_ptr min, max; + ReadSingleColumnFileStatistics(std::move(reader), &min, &max); - std::shared_ptr expected_min, expected_max; - ASSERT_OK_AND_ASSIGN(expected_min, array->GetScalar(array->length() - 1)); - ASSERT_OK_AND_ASSIGN(expected_max, array->GetScalar(0)); - ::arrow::AssertScalarsEqual(*expected_min, *min, /*verbose=*/true); - 
::arrow::AssertScalarsEqual(*expected_max, *max, /*verbose=*/true); + std::shared_ptr expected_min, expected_max; + ASSERT_OK_AND_ASSIGN(expected_min, array->GetScalar(array->length() - 1)); + ASSERT_OK_AND_ASSIGN(expected_max, array->GetScalar(0)); + ::arrow::AssertScalarsEqual(*expected_min, *min, /*verbose=*/true); + ::arrow::AssertScalarsEqual(*expected_max, *max, /*verbose=*/true); + } } TEST(ArrowReadWrite, NestedNullableField) { diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index b622e93e072..12f36fe39cf 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -86,6 +86,7 @@ using arrow::Table; using arrow::TimestampArray; using ::arrow::bit_util::FromBigEndian; +using ::arrow::bit_util::ToBigEndian; using ::arrow::internal::checked_cast; using ::arrow::internal::checked_pointer_cast; using ::arrow::internal::SafeLeftShift; @@ -108,6 +109,62 @@ namespace { template using ArrayType = typename ::arrow::TypeTraits::ArrayType; +template +Result> DecimalScalarFromBigEndianBytes( + std::string_view data, std::shared_ptr arrow_type) { + ARROW_ASSIGN_OR_RAISE( + DecimalType decimal, + DecimalType::FromBigEndian(reinterpret_cast(data.data()), + static_cast(data.size()))); + return ::arrow::MakeScalar(std::move(arrow_type), decimal); +} + +// Extract Min and Max scalars from big-endian representation of Decimals. 
+Status ExtractDecimalMinMaxFromBytes(std::string_view min_bytes, + std::string_view max_bytes, + const LogicalType& logical_type, + std::shared_ptr<::arrow::Scalar>* min, + std::shared_ptr<::arrow::Scalar>* max) { + const DecimalLogicalType& decimal_type = + checked_cast(logical_type); + + Result> maybe_type = + Decimal128Type::Make(decimal_type.precision(), decimal_type.scale()); + std::shared_ptr arrow_type; + if (maybe_type.ok()) { + arrow_type = maybe_type.ValueOrDie(); + ARROW_ASSIGN_OR_RAISE( + *min, DecimalScalarFromBigEndianBytes(min_bytes, arrow_type)); + ARROW_ASSIGN_OR_RAISE(*max, DecimalScalarFromBigEndianBytes( + max_bytes, std::move(arrow_type))); + return Status::OK(); + } + // Fallback to see if Decimal256 can represent the type. + ARROW_ASSIGN_OR_RAISE( + arrow_type, Decimal256Type::Make(decimal_type.precision(), decimal_type.scale())); + ARROW_ASSIGN_OR_RAISE( + *min, DecimalScalarFromBigEndianBytes(min_bytes, arrow_type)); + ARROW_ASSIGN_OR_RAISE(*max, DecimalScalarFromBigEndianBytes( + max_bytes, std::move(arrow_type))); + + return Status::OK(); +} + +template +Status ExtractDecimalMinMaxFromInteger(Int min_value, Int max_value, + const LogicalType& logical_type, + std::shared_ptr<::arrow::Scalar>* min, + std::shared_ptr<::arrow::Scalar>* max) { + static_assert(std::is_integral_v); + const Int min_be = ToBigEndian(min_value); + const Int max_be = ToBigEndian(max_value); + const auto min_bytes = + std::string_view(reinterpret_cast(&min_be), sizeof(min_be)); + const auto max_bytes = + std::string_view(reinterpret_cast(&max_be), sizeof(max_be)); + return ExtractDecimalMinMaxFromBytes(min_bytes, max_bytes, logical_type, min, max); +} + template Status MakeMinMaxScalar(const StatisticsType& statistics, std::shared_ptr<::arrow::Scalar>* min, @@ -165,17 +222,19 @@ static Status FromInt32Statistics(const Int32Statistics& statistics, switch (logical_type.type()) { case LogicalType::Type::INT: return MakeMinMaxIntegralScalar(statistics, *type, min, max); 
- break; case LogicalType::Type::DATE: case LogicalType::Type::TIME: case LogicalType::Type::NONE: return MakeMinMaxTypedScalar(statistics, type, min, max); - break; + case LogicalType::Type::DECIMAL: + return ExtractDecimalMinMaxFromInteger(statistics.min(), statistics.max(), + logical_type, min, max); default: break; } - return Status::NotImplemented("Cannot extract statistics for type "); + return Status::NotImplemented("Cannot extract statistics for INT32 with logical type ", + logical_type.ToString()); } static Status FromInt64Statistics(const Int64Statistics& statistics, @@ -188,58 +247,19 @@ static Status FromInt64Statistics(const Int64Statistics& statistics, switch (logical_type.type()) { case LogicalType::Type::INT: return MakeMinMaxIntegralScalar(statistics, *type, min, max); - break; case LogicalType::Type::TIME: case LogicalType::Type::TIMESTAMP: case LogicalType::Type::NONE: return MakeMinMaxTypedScalar(statistics, type, min, max); - break; + case LogicalType::Type::DECIMAL: + return ExtractDecimalMinMaxFromInteger(statistics.min(), statistics.max(), + logical_type, min, max); default: break; } - return Status::NotImplemented("Cannot extract statistics for type "); -} - -template -Result> FromBigEndianString( - const std::string& data, std::shared_ptr arrow_type) { - ARROW_ASSIGN_OR_RAISE( - DecimalType decimal, - DecimalType::FromBigEndian(reinterpret_cast(data.data()), - static_cast(data.size()))); - return ::arrow::MakeScalar(std::move(arrow_type), decimal); -} - -// Extracts Min and Max scalar from bytes like types (i.e. types where -// decimal is encoded as little endian. 
-Status ExtractDecimalMinMaxFromBytesType(const Statistics& statistics, - const LogicalType& logical_type, - std::shared_ptr<::arrow::Scalar>* min, - std::shared_ptr<::arrow::Scalar>* max) { - const DecimalLogicalType& decimal_type = - checked_cast(logical_type); - - Result> maybe_type = - Decimal128Type::Make(decimal_type.precision(), decimal_type.scale()); - std::shared_ptr arrow_type; - if (maybe_type.ok()) { - arrow_type = maybe_type.ValueOrDie(); - ARROW_ASSIGN_OR_RAISE( - *min, FromBigEndianString(statistics.EncodeMin(), arrow_type)); - ARROW_ASSIGN_OR_RAISE(*max, FromBigEndianString(statistics.EncodeMax(), - std::move(arrow_type))); - return Status::OK(); - } - // Fallback to see if Decimal256 can represent the type. - ARROW_ASSIGN_OR_RAISE( - arrow_type, Decimal256Type::Make(decimal_type.precision(), decimal_type.scale())); - ARROW_ASSIGN_OR_RAISE( - *min, FromBigEndianString(statistics.EncodeMin(), arrow_type)); - ARROW_ASSIGN_OR_RAISE(*max, FromBigEndianString(statistics.EncodeMax(), - std::move(arrow_type))); - - return Status::OK(); + return Status::NotImplemented("Cannot extract statistics for INT64 with logical type ", + logical_type.ToString()); } Status ByteArrayStatisticsAsScalars(const Statistics& statistics, @@ -247,7 +267,8 @@ Status ByteArrayStatisticsAsScalars(const Statistics& statistics, std::shared_ptr<::arrow::Scalar>* max) { auto logical_type = statistics.descr()->logical_type(); if (logical_type->type() == LogicalType::Type::DECIMAL) { - return ExtractDecimalMinMaxFromBytesType(statistics, *logical_type, min, max); + return ExtractDecimalMinMaxFromBytes(statistics.EncodeMin(), statistics.EncodeMax(), + *logical_type, min, max); } std::shared_ptr<::arrow::DataType> type; if (statistics.descr()->physical_type() == Type::FIXED_LEN_BYTE_ARRAY) { From 2289f3143499a0c6a54c724e0b4515b623eb951c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 31 Oct 2025 06:38:09 -0400 Subject: [PATCH 05/15] GH-47986: [Docs] Update the Rust on 
implementation status page (#47987) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change - closes https://github.com/apache/arrow/issues/47986 The Rust implementation has gained many features recently and it would be good to update the status page It turns out the Rust implementation is quite complete now πŸŽ‰ We are also consolidating our docs to point here as the canonical source of status. See - https://github.com/apache/arrow-rs/pull/8732 ### What changes are included in this PR? Update the status page https://arrow.apache.org/docs/status.html to reflect the Rust implementation status I will comment inline with doc / code references ### Are these changes tested? ### Are there any user-facing changes? Doc only, no behavior changes * GitHub Issue: #47986 --- docs/source/status.rst | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/source/status.rst b/docs/source/status.rst index 6b8098bd991..f9174e72883 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -46,9 +46,9 @@ Data Types +-------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ | Float32/64 | βœ“ | βœ“ | βœ“ | βœ“ | βœ“ | βœ“ | βœ“ | βœ“ | βœ“ | +-------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ -| Decimal32 | βœ“ | | βœ“ | | βœ“ | | | | βœ“ | +| Decimal32 | βœ“ | | βœ“ | | βœ“ | βœ“ | | | βœ“ | +-------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ -| Decimal64 | βœ“ | | βœ“ | | βœ“ | | | | βœ“ | +| Decimal64 | βœ“ | | βœ“ | | βœ“ | βœ“ | | | βœ“ | +-------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ | Decimal128 | βœ“ | βœ“ | βœ“ | βœ“ | βœ“ | βœ“ | βœ“ | | βœ“ | +-------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ @@ -89,9 +89,9 @@ Data Types 
+-------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ | Large List | βœ“ | βœ“ | βœ“ | | \(4) | βœ“ | βœ“ | | βœ“ | +-------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ -| List View | βœ“ | | βœ“ | | βœ“ | | | | βœ“ | +| List View | βœ“ | | βœ“ | | βœ“ | βœ“ | | | βœ“ | +-------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ -| Large List View | βœ“ | | βœ“ | | | | | | βœ“ | +| Large List View | βœ“ | | βœ“ | | | βœ“ | | | βœ“ | +-------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ | Struct | βœ“ | βœ“ | βœ“ | βœ“ | βœ“ | βœ“ | βœ“ | | βœ“ | +-------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ @@ -110,24 +110,24 @@ Data Types +-------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ | Extension | βœ“ | βœ“ | βœ“ | | | βœ“ | βœ“ | | βœ“ | +-------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ -| Run-End Encoded | βœ“ | | βœ“ | | | | | | βœ“ | +| Run-End Encoded | βœ“ | | βœ“ | | | βœ“ | | | βœ“ | +-------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Canonical | C++ | Java | Go | JavaScript | C# | Rust | Julia | Swift | | Extension types | | | | | | | | | +=======================+=======+=======+=======+============+=======+=======+=======+=======+ -| Fixed shape tensor | βœ“ | | | | | | | | +| Fixed shape tensor | βœ“ | | | | | βœ“ | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| Variable shape tensor | | | | | | | | | +| Variable shape tensor | | | | | | βœ“ | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| JSON | βœ“ | | βœ“ | | | | | | +| JSON 
| βœ“ | | βœ“ | | | βœ“ | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| Opaque | βœ“ | βœ“ | βœ“ | | | | | | +| Opaque | βœ“ | βœ“ | βœ“ | | | βœ“ | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| UUID | βœ“ | | βœ“ | | | | | | +| UUID | βœ“ | | βœ“ | | | βœ“ | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| 8-bit Boolean | βœ“ | | βœ“ | | | | | | +| 8-bit Boolean | βœ“ | | βœ“ | | | βœ“ | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | Parquet Variant | | | βœ“ | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ @@ -160,9 +160,9 @@ IPC Format +-----------------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ | Dictionaries | βœ“ | βœ“ | βœ“ | βœ“ | βœ“ | βœ“ | βœ“ | | | +-----------------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ -| Replacement dictionaries | βœ“ | βœ“ | βœ“ | | | | βœ“ | | | +| Replacement dictionaries | βœ“ | βœ“ | βœ“ | | | βœ“ | βœ“ | | | +-----------------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ -| Delta dictionaries | βœ“ (1) | | βœ“ (1) | βœ“ | βœ“ | | βœ“ | | | +| Delta dictionaries | βœ“ (1) | | βœ“ (1) | βœ“ | βœ“ | βœ“ | βœ“ | | | +-----------------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ | Tensors | βœ“ | | | | | | | | | +-----------------------------+-------+-------+-------+----+-------+-------+-------+-------+-----------+ From b1eeba7e530ca50779771ae121b20a108c6752ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Fri, 31 Oct 2025 22:33:31 +0100 Subject: [PATCH 06/15] GH-47974: [Docs] Remove stray documentation from Java and JS (#48006) MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change The documentation was migrated to their respective repositories ### What changes are included in this PR? Remove Java and JS stray documentation. Point Java docs link to https://arrow.apache.org/java/ ### Are these changes tested? No ### Are there any user-facing changes? No * GitHub Issue: #47974 Authored-by: RaΓΊl Cumplido Signed-off-by: Sutou Kouhei --- docs/source/implementations.rst | 2 +- docs/source/java/algorithm.rst | 92 ---- docs/source/java/cdata.rst | 470 ------------------ docs/source/java/dataset.rst | 311 ------------ docs/source/java/flight.rst | 239 ---------- docs/source/java/flight_sql.rst | 32 -- docs/source/java/flight_sql_jdbc_driver.rst | 176 ------- docs/source/java/index.rst | 46 -- docs/source/java/install.rst | 232 --------- docs/source/java/ipc.rst | 202 -------- docs/source/java/jdbc.rst | 278 ----------- docs/source/java/memory.rst | 501 -------------------- docs/source/java/overview.rst | 92 ---- docs/source/java/quickstartguide.rst | 316 ------------ docs/source/java/reference/index.rst | 21 - docs/source/java/substrait.rst | 203 -------- docs/source/java/table.rst | 378 --------------- docs/source/java/vector.rst | 366 -------------- docs/source/java/vector_schema_root.rst | 163 ------- docs/source/js/index.rst | 23 - 20 files changed, 1 insertion(+), 4142 deletions(-) delete mode 100644 docs/source/java/algorithm.rst delete mode 100644 docs/source/java/cdata.rst delete mode 100644 docs/source/java/dataset.rst delete mode 100644 docs/source/java/flight.rst delete mode 100644 docs/source/java/flight_sql.rst delete mode 100644 docs/source/java/flight_sql_jdbc_driver.rst delete mode 100644 docs/source/java/index.rst delete mode 100644 docs/source/java/install.rst delete mode 100644 docs/source/java/ipc.rst delete mode 100644 docs/source/java/jdbc.rst delete mode 100644 docs/source/java/memory.rst delete mode 100644 docs/source/java/overview.rst 
delete mode 100644 docs/source/java/quickstartguide.rst delete mode 100644 docs/source/java/reference/index.rst delete mode 100644 docs/source/java/substrait.rst delete mode 100644 docs/source/java/table.rst delete mode 100644 docs/source/java/vector.rst delete mode 100644 docs/source/java/vector_schema_root.rst delete mode 100644 docs/source/js/index.rst diff --git a/docs/source/implementations.rst b/docs/source/implementations.rst index 7cc2447bdd9..daeea2c5146 100644 --- a/docs/source/implementations.rst +++ b/docs/source/implementations.rst @@ -47,7 +47,7 @@ documentation and source code for these libraries. - `Go Docs `_ :fa:`external-link-alt` - `Go Source `_ * - Java - - :doc:`Java Docs` + - `Java Docs `_ :fa:`external-link-alt` - `Java Source `_ * - JavaScript - `JavaScript Docs `_ :fa:`external-link-alt` diff --git a/docs/source/java/algorithm.rst b/docs/source/java/algorithm.rst deleted file mode 100644 index d4838967d61..00000000000 --- a/docs/source/java/algorithm.rst +++ /dev/null @@ -1,92 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -Java Algorithms -=============== - -Arrow's Java library provides algorithms for some commonly-used -functionalities. 
The algorithms are provided in the ``org.apache.arrow.algorithm`` -package of the ``algorithm`` module. - -Comparing Vector Elements -------------------------- - -Comparing vector elements is the basic for many algorithms. Vector -elements can be compared in one of the two ways: - -1. **Equality comparison**: there are two possible results for this type of comparisons: ``equal`` and ``unequal``. -Currently, this type of comparison is supported through the ``org.apache.arrow.vector.compare.VectorValueEqualizer`` -interface. - -2. **Ordering comparison**: there are three possible results for this type of comparisons: ``less than``, ``equal to`` -and ``greater than``. This comparison is supported by the abstract class ``org.apache.arrow.algorithm.sort.VectorValueComparator``. - -We provide default implementations to compare vector elements. However, users can also define ways -for customized comparisons. - -Vector Element Search ---------------------- - -A search algorithm tries to find a particular value in a vector. When successful, a vector index is -returned; otherwise, a ``-1`` is returned. The following search algorithms are provided: - -1. **Linear search**: this algorithm simply traverses the vector from the beginning, until a match is -found, or the end of the vector is reached. So it takes ``O(n)`` time, where ``n`` is the number of elements -in the vector. This algorithm is implemented in ``org.apache.arrow.algorithm.search.VectorSearcher#linearSearch``. - -2. **Binary search**: this represents a more efficient search algorithm, as it runs in ``O(log(n))`` time. -However, it is only applicable to sorted vectors. To get a sorted vector, -one can use one of our sorting algorithms, which will be discussed in the next section. This algorithm -is implemented in ``org.apache.arrow.algorithm.search.VectorSearcher#binarySearch``. - -3. **Parallel search**: when the vector is large, it takes a long time to traverse the elements to search -for a value. 
To make this process faster, one can split the vector into multiple partitions, and perform the -search for each partition in parallel. This is supported by ``org.apache.arrow.algorithm.search.ParallelSearcher``. - -4. **Range search**: for many scenarios, there can be multiple matching values in the vector. -If the vector is sorted, the matching values reside in a contiguous region in the vector. The -range search algorithm tries to find the upper/lower bound of the region in ``O(log(n))`` time. -An implementation is provided in ``org.apache.arrow.algorithm.search.VectorRangeSearcher``. - -Vector Sorting --------------- - -Given a vector, a sorting algorithm turns it into a sorted one. The sorting criteria must -be specified by some ordering comparison operation. The sorting algorithms can be -classified into the following categories: - -1. **In-place sorter**: an in-place sorter performs the sorting by manipulating the original -vector, without creating any new vector. So it just returns the original vector after the sorting operations. -Currently, we have ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter`` for in-place -sorting in ``O(nlog(n))`` time. As the name suggests, it only supports fixed width vectors. - -2. **Out-of-place sorter**: an out-of-place sorter does not mutate the original vector. Instead, -it copies vector elements to a new vector in sorted order, and returns the new vector. -We have ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter.FixedWidthOutOfPlaceVectorSorter`` -and ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter.VariableWidthOutOfPlaceVectorSorter`` -for fixed width and variable width vectors, respectively. Both algorithms run in ``O(nlog(n))`` time. - -3. **Index sorter**: this sorter does not actually sort the vector. Instead, it returns an integer -vector, which correspond to indices of vector elements in sorted order. With the index vector, one can -easily construct a sorted vector. 
In addition, some other tasks can be easily achieved, like finding the ``k`` th -smallest value in the vector. Index sorting is supported by ``org.apache.arrow.algorithm.sort.IndexSorter``, -which runs in ``O(nlog(n))`` time. It is applicable to vectors of any type. - -Other Algorithms ----------------- - -Other algorithms include vector deduplication, dictionary encoding, etc., in the ``algorithm`` module. diff --git a/docs/source/java/cdata.rst b/docs/source/java/cdata.rst deleted file mode 100644 index 69f7ab0b078..00000000000 --- a/docs/source/java/cdata.rst +++ /dev/null @@ -1,470 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -================ -C Data Interface -================ - -.. contents:: - -Arrow supports exchanging data without copying or serialization within the same process -through the :ref:`c-data-interface`, even between different language runtimes. - -Java to Python --------------- - -See :doc:`../python/integration/python_java` to implement Java to -Python communication using the C Data Interface. - -Java to C++ ------------ - -See :doc:`../developers/cpp/building` to build the Arrow C++ libraries: - -.. 
code-block:: shell - - $ git clone https://github.com/apache/arrow.git - $ cd arrow/cpp - $ mkdir build # from inside the `cpp` subdirectory - $ cd build - $ cmake .. --preset ninja-debug-minimal - $ cmake --build . - $ tree debug/ - debug/ - β”œβ”€β”€ libarrow.800.0.0.dylib - β”œβ”€β”€ libarrow.800.dylib -> libarrow.800.0.0.dylib - └── libarrow.dylib -> libarrow.800.dylib - -Share an Int64 array from C++ to Java -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -**C++ Side** - -Implement a function in CDataCppBridge.h that exports an array via the C Data Interface: - -.. code-block:: cpp - - #include - #include - #include - - void FillInt64Array(const uintptr_t c_schema_ptr, const uintptr_t c_array_ptr) { - arrow::Int64Builder builder; - builder.Append(1); - builder.Append(2); - builder.Append(3); - builder.AppendNull(); - builder.Append(5); - builder.Append(6); - builder.Append(7); - builder.Append(8); - builder.Append(9); - builder.Append(10); - std::shared_ptr array = *builder.Finish(); - - struct ArrowSchema* c_schema = reinterpret_cast(c_schema_ptr); - auto c_schema_status = arrow::ExportType(*array->type(), c_schema); - if (!c_schema_status.ok()) c_schema_status.Abort(); - - struct ArrowArray* c_array = reinterpret_cast(c_array_ptr); - auto c_array_status = arrow::ExportArray(*array, c_array); - if (!c_array_status.ok()) c_array_status.Abort(); - } - -**Java Side** - -For this example, we will use `JavaCPP`_ to call our C++ function from Java, -without writing JNI bindings ourselves. - -.. code-block:: xml - - - - 4.0.0 - - org.example - java-cdata-example - 1.0-SNAPSHOT - - - 8 - 8 - 9.0.0 - - - - org.bytedeco - javacpp - 1.5.7 - - - org.apache.arrow - arrow-c-data - ${arrow.version} - - - org.apache.arrow - arrow-vector - ${arrow.version} - - - org.apache.arrow - arrow-memory-core - ${arrow.version} - - - org.apache.arrow - arrow-memory-netty - ${arrow.version} - - - org.apache.arrow - arrow-format - ${arrow.version} - - - - -.. 
code-block:: java - - import org.bytedeco.javacpp.annotation.Platform; - import org.bytedeco.javacpp.annotation.Properties; - import org.bytedeco.javacpp.tools.InfoMap; - import org.bytedeco.javacpp.tools.InfoMapper; - - @Properties( - target = "CDataJavaToCppExample", - value = @Platform( - include = { - "CDataCppBridge.h" - }, - compiler = {"cpp17"}, - linkpath = {"/arrow/cpp/build/debug/"}, - link = {"arrow"} - ) - ) - public class CDataJavaConfig implements InfoMapper { - - @Override - public void map(InfoMap infoMap) { - } - } - -.. code-block:: shell - - # Compile our Java code - $ javac -cp javacpp-1.5.7.jar CDataJavaConfig.java - - # Generate CDataInterfaceLibrary - $ java -jar javacpp-1.5.7.jar CDataJavaConfig.java - - # Generate libjniCDataInterfaceLibrary.dylib - $ java -jar javacpp-1.5.7.jar CDataJavaToCppExample.java - - # Validate libjniCDataInterfaceLibrary.dylib created - $ otool -L macosx-x86_64/libjniCDataJavaToCppExample.dylib - macosx-x86_64/libjniCDataJavaToCppExample.dylib: - libjniCDataJavaToCppExample.dylib (compatibility version 0.0.0, current version 0.0.0) - @rpath/libarrow.800.dylib (compatibility version 800.0.0, current version 800.0.0) - /usr/lib/libc++.1.dylib (compatibility version 1.0.0, current version 1200.3.0) - /usr/lib/libSystem.B.dylib (compatibility version 1.0.0, current version 1311.0.0) - -**Java Test** - -Let's create a Java class to test our bridge: - -.. 
code-block:: java - - import org.apache.arrow.c.ArrowArray; - import org.apache.arrow.c.ArrowSchema; - import org.apache.arrow.c.Data; - import org.apache.arrow.memory.BufferAllocator; - import org.apache.arrow.memory.RootAllocator; - import org.apache.arrow.vector.BigIntVector; - - public class TestCDataInterface { - public static void main(String[] args) { - try( - BufferAllocator allocator = new RootAllocator(); - ArrowSchema arrowSchema = ArrowSchema.allocateNew(allocator); - ArrowArray arrowArray = ArrowArray.allocateNew(allocator) - ){ - CDataJavaToCppExample.FillInt64Array( - arrowSchema.memoryAddress(), arrowArray.memoryAddress()); - try( - BigIntVector bigIntVector = (BigIntVector) Data.importVector( - allocator, arrowArray, arrowSchema, null) - ){ - System.out.println("C++-allocated array: " + bigIntVector); - } - } - } - } - -.. code-block:: shell - - C++-allocated array: [1, 2, 3, null, 5, 6, 7, 8, 9, 10] - -Share an Int32 array from Java to C++ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -**Java Side** - -For this example, we will build a JAR with all dependencies bundled. - -.. code-block:: xml - - - - 4.0.0 - org.example - cpptojava - 1.0-SNAPSHOT - - 8 - 8 - 9.0.0 - - - - org.apache.arrow - arrow-c-data - ${arrow.version} - - - org.apache.arrow - arrow-memory-netty - ${arrow.version} - - - - - - org.apache.maven.plugins - maven-assembly-plugin - - - package - - single - - - - jar-with-dependencies - - - - - - - - - -.. 
code-block:: java - - import org.apache.arrow.c.ArrowArray; - import org.apache.arrow.c.ArrowSchema; - import org.apache.arrow.c.Data; - import org.apache.arrow.memory.BufferAllocator; - import org.apache.arrow.memory.RootAllocator; - import org.apache.arrow.vector.FieldVector; - import org.apache.arrow.vector.IntVector; - import org.apache.arrow.vector.VectorSchemaRoot; - - import java.util.Arrays; - - public class ToBeCalledByCpp { - final static BufferAllocator allocator = new RootAllocator(); - - /** - * Create a {@link FieldVector} and export it via the C Data Interface - * @param schemaAddress Schema memory address to wrap - * @param arrayAddress Array memory address to wrap - */ - public static void fillVector(long schemaAddress, long arrayAddress){ - try (ArrowArray arrow_array = ArrowArray.wrap(arrayAddress); - ArrowSchema arrow_schema = ArrowSchema.wrap(schemaAddress) ) { - Data.exportVector(allocator, populateFieldVectorToExport(), null, arrow_array, arrow_schema); - } - } - - /** - * Create a {@link VectorSchemaRoot} and export it via the C Data Interface - * @param schemaAddress Schema memory address to wrap - * @param arrayAddress Array memory address to wrap - */ - public static void fillVectorSchemaRoot(long schemaAddress, long arrayAddress){ - try (ArrowArray arrow_array = ArrowArray.wrap(arrayAddress); - ArrowSchema arrow_schema = ArrowSchema.wrap(schemaAddress) ) { - Data.exportVectorSchemaRoot(allocator, populateVectorSchemaRootToExport(), null, arrow_array, arrow_schema); - } - } - - private static FieldVector populateFieldVectorToExport(){ - IntVector intVector = new IntVector("int-to-export", allocator); - intVector.allocateNew(3); - intVector.setSafe(0, 1); - intVector.setSafe(1, 2); - intVector.setSafe(2, 3); - intVector.setValueCount(3); - System.out.println("[Java] FieldVector: \n" + intVector); - return intVector; - } - - private static VectorSchemaRoot populateVectorSchemaRootToExport(){ - IntVector intVector = new 
IntVector("age-to-export", allocator); - intVector.setSafe(0, 10); - intVector.setSafe(1, 20); - intVector.setSafe(2, 30); - VectorSchemaRoot root = new VectorSchemaRoot(Arrays.asList(intVector)); - root.setRowCount(3); - System.out.println("[Java] VectorSchemaRoot: \n" + root.contentToTSVString()); - return root; - } - } - -Build the JAR and copy it to the C++ project. - -.. code-block:: shell - - $ mvn clean install - $ cp target/cpptojava-1.0-SNAPSHOT-jar-with-dependencies.jar /cpptojava.jar - -**C++ Side** - -This application uses JNI to call Java code, but transfers data (zero-copy) via the C Data Interface instead. - -.. code-block:: cpp - - #include - #include - - #include - #include - - JNIEnv *CreateVM(JavaVM **jvm) { - JNIEnv *env; - JavaVMInitArgs vm_args; - JavaVMOption options[2]; - options[0].optionString = "-Djava.class.path=cpptojava.jar"; - options[1].optionString = "-DXcheck:jni:pedantic"; - vm_args.version = JNI_VERSION_10; - vm_args.nOptions = 2; - vm_args.options = options; - int status = JNI_CreateJavaVM(jvm, (void **) &env, &vm_args); - if (status < 0) { - std::cerr << "\n<<<<< Unable to Launch JVM >>>>>\n" << std::endl; - return nullptr; - } - return env; - } - - int main() { - JNIEnv *env; - JavaVM *jvm; - env = CreateVM(&jvm); - if (env == nullptr) return EXIT_FAILURE; - jclass javaClassToBeCalledByCpp = env->FindClass("ToBeCalledByCpp"); - if (javaClassToBeCalledByCpp != nullptr) { - jmethodID fillVector = env->GetStaticMethodID(javaClassToBeCalledByCpp, - "fillVector", - "(JJ)V"); - if (fillVector != nullptr) { - struct ArrowSchema arrowSchema; - struct ArrowArray arrowArray; - std::cout << "\n<<<<< C++ to Java for Arrays >>>>>\n" << std::endl; - env->CallStaticVoidMethod(javaClassToBeCalledByCpp, fillVector, - static_cast(reinterpret_cast(&arrowSchema)), - static_cast(reinterpret_cast(&arrowArray))); - auto resultImportArray = arrow::ImportArray(&arrowArray, &arrowSchema); - std::shared_ptr array = resultImportArray.ValueOrDie(); - 
std::cout << "[C++] Array: " << array->ToString() << std::endl; - } else { - std::cerr << "Could not find fillVector method\n" << std::endl; - return EXIT_FAILURE; - } - jmethodID fillVectorSchemaRoot = env->GetStaticMethodID(javaClassToBeCalledByCpp, - "fillVectorSchemaRoot", - "(JJ)V"); - if (fillVectorSchemaRoot != nullptr) { - struct ArrowSchema arrowSchema; - struct ArrowArray arrowArray; - std::cout << "\n<<<<< C++ to Java for RecordBatch >>>>>\n" << std::endl; - env->CallStaticVoidMethod(javaClassToBeCalledByCpp, fillVectorSchemaRoot, - static_cast(reinterpret_cast(&arrowSchema)), - static_cast(reinterpret_cast(&arrowArray))); - auto resultImportVectorSchemaRoot = arrow::ImportRecordBatch(&arrowArray, &arrowSchema); - std::shared_ptr recordBatch = resultImportVectorSchemaRoot.ValueOrDie(); - std::cout << "[C++] RecordBatch: " << recordBatch->ToString() << std::endl; - } else { - std::cerr << "Could not find fillVectorSchemaRoot method\n" << std::endl; - return EXIT_FAILURE; - } - } else { - std::cout << "Could not find ToBeCalledByCpp class\n" << std::endl; - return EXIT_FAILURE; - } - jvm->DestroyJavaVM(); - return EXIT_SUCCESS; - } - -CMakeLists.txt definition file: - -.. code-block:: cmake - - cmake_minimum_required(VERSION 3.19) - project(cdatacpptojava) - find_package(JNI REQUIRED) - find_package(Arrow REQUIRED) - message(STATUS "Arrow version: ${ARROW_VERSION}") - include_directories(${JNI_INCLUDE_DIRS}) - set(CMAKE_CXX_STANDARD 17) - add_executable(${PROJECT_NAME} main.cpp) - target_link_libraries(cdatacpptojava PRIVATE Arrow::arrow_shared) - target_link_libraries(cdatacpptojava PRIVATE ${JNI_LIBRARIES}) - -**Result** - -.. code-block:: text - - <<<<< C++ to Java for Arrays >>>>> - [Java] FieldVector: - [1, 2, 3] - [C++] Array: [ - 1, - 2, - 3 - ] - - <<<<< C++ to Java for RecordBatch >>>>> - [Java] VectorSchemaRoot: - age-to-export - 10 - 20 - 30 - - [C++] RecordBatch: age-to-export: [ - 10, - 20, - 30 - ] - -.. 
_`JavaCPP`: https://github.com/bytedeco/javacpp diff --git a/docs/source/java/dataset.rst b/docs/source/java/dataset.rst deleted file mode 100644 index ec816052e76..00000000000 --- a/docs/source/java/dataset.rst +++ /dev/null @@ -1,311 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -======= -Dataset -======= - -.. warning:: - - Experimental: The Java module ``dataset`` is currently under early - development. API might be changed in each release of Apache Arrow until it - gets mature. - -Dataset is an universal layer in Apache Arrow for querying data in different -formats or in different partitioning strategies. Usually the data to be queried -is supposed to be located from a traditional file system, however Arrow Dataset -is not designed only for querying files but can be extended to serve all -possible data sources such as from inter-process communication or from other -network locations, etc. - -.. 
contents:: - -Getting Started -=============== - -Currently supported file formats are: - -- Apache Arrow (``.arrow``) -- Apache ORC (``.orc``) -- Apache Parquet (``.parquet``) -- Comma-Separated Values (``.csv``) -- Line-delimited JSON Values (``.json``) - -Below shows a simplest example of using Dataset to query a Parquet file in Java: - -.. code-block:: Java - - // read data from file /opt/example.parquet - String uri = "file:/opt/example.parquet"; - ScanOptions options = new ScanOptions(/*batchSize*/ 32768); - try ( - BufferAllocator allocator = new RootAllocator(); - DatasetFactory datasetFactory = new FileSystemDatasetFactory( - allocator, NativeMemoryPool.getDefault(), - FileFormat.PARQUET, uri); - Dataset dataset = datasetFactory.finish(); - Scanner scanner = dataset.newScan(options); - ArrowReader reader = scanner.scanBatches() - ) { - List batches = new ArrayList<>(); - while (reader.loadNextBatch()) { - try (VectorSchemaRoot root = reader.getVectorSchemaRoot()) { - final VectorUnloader unloader = new VectorUnloader(root); - batches.add(unloader.getRecordBatch()); - } - } - - // do something with read record batches, for example: - analyzeArrowData(batches); - - // finished the analysis of the data, close all resources: - AutoCloseables.close(batches); - } catch (Exception e) { - e.printStackTrace(); - } - -.. note:: - ``ArrowRecordBatch`` is a low-level composite Arrow data exchange format - that doesn't provide API to read typed data from it directly. - It's recommended to use utilities ``VectorLoader`` to load it into a schema - aware container ``VectorSchemaRoot`` by which user could be able to access - decoded data conveniently in Java. - - The ``ScanOptions batchSize`` argument takes effect only if it is set to a value - smaller than the number of rows in the recordbatch. - -.. seealso:: - Load record batches with :doc:`VectorSchemaRoot `. 
- -Schema -====== - -Schema of the data to be queried can be inspected via method -``DatasetFactory#inspect()`` before actually reading it. For example: - -.. code-block:: Java - - // read data from local file /opt/example.parquet - String uri = "file:/opt/example.parquet"; - BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); - DatasetFactory factory = new FileSystemDatasetFactory(allocator, - NativeMemoryPool.getDefault(), FileFormat.PARQUET, uri); - - // inspect schema - Schema schema = factory.inspect(); - -For some of the data format that is compatible with a user-defined schema, user -can use method ``DatasetFactory#inspect(Schema schema)`` to create the dataset: - -.. code-block:: Java - - Schema schema = createUserSchema() - Dataset dataset = factory.finish(schema); - -Otherwise when the non-parameter method ``DatasetFactory#inspect()`` is called, -schema will be inferred automatically from data source. The same as the result -of ``DatasetFactory#inspect()``. - -Also, if projector is specified during scanning (see next section -:ref:`java-dataset-projection`), the actual schema of output data can be got -within method ``Scanner::schema()``: - -.. code-block:: Java - - Scanner scanner = dataset.newScan( - new ScanOptions(32768, Optional.of(new String[] {"id", "name"}))); - Schema projectedSchema = scanner.schema(); - -.. _java-dataset-projection: - -Projection (Subset of Columns) -============================== - -User can specify projections in ScanOptions. For example: - -.. code-block:: Java - - String[] projection = new String[] {"id", "name"}; - ScanOptions options = new ScanOptions(32768, Optional.of(projection)); - -If no projection is needed, leave the optional projection argument absent in -ScanOptions: - -.. code-block:: Java - - ScanOptions options = new ScanOptions(32768, Optional.empty()); - -Or use shortcut constructor: - -.. 
code-block:: Java - - ScanOptions options = new ScanOptions(32768); - -Then all columns will be emitted during scanning. - -Projection (Produce New Columns) and Filters -============================================ - -User can specify projections (new columns) or filters in ScanOptions using Substrait. For example: - -.. code-block:: Java - - ByteBuffer substraitExpressionFilter = getSubstraitExpressionFilter(); - ByteBuffer substraitExpressionProject = getSubstraitExpressionProjection(); - // Use Substrait APIs to create an Expression and serialize to a ByteBuffer - ScanOptions options = new ScanOptions.Builder(/*batchSize*/ 32768) - .columns(Optional.empty()) - .substraitExpressionFilter(substraitExpressionFilter) - .substraitExpressionProjection(getSubstraitExpressionProjection()) - .build(); - -.. seealso:: - - :doc:`Executing Projections and Filters Using Extended Expressions ` - Projections and Filters using Substrait. - -Read Data from HDFS -=================== - -``FileSystemDataset`` supports reading data from non-local file systems. HDFS -support is included in the official Apache Arrow Java package releases and -can be used directly without re-building the source code. - -To access HDFS data using Dataset API, pass a general HDFS URI to -``FileSystemDatasetFactory``: - -.. code-block:: Java - - String uri = "hdfs://{hdfs_host}:{port}/data/example.parquet"; - BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); - DatasetFactory factory = new FileSystemDatasetFactory(allocator, - NativeMemoryPool.getDefault(), FileFormat.PARQUET, uri); - -Native Memory Management -======================== - -To gain better performance and reduce code complexity, Java -``FileSystemDataset`` internally relies on C++ -``arrow::dataset::FileSystemDataset`` via JNI. -As a result, all Arrow data read from ``FileSystemDataset`` is supposed to be -allocated off the JVM heap. To manage this part of memory, a utility class -``NativeMemoryPool`` is provided to users. 
- -As a basic example, by using a listenable ``NativeMemoryPool``, user can pass -a listener hooking on C++ buffer allocation/deallocation: - -.. code-block:: Java - - AtomicLong reserved = new AtomicLong(0L); - ReservationListener listener = new ReservationListener() { - @Override - public void reserve(long size) { - reserved.getAndAdd(size); - } - - @Override - public void unreserve(long size) { - reserved.getAndAdd(-size); - } - }; - NativeMemoryPool pool = NativeMemoryPool.createListenable(listener); - FileSystemDatasetFactory factory = new FileSystemDatasetFactory(allocator, - pool, FileFormat.PARQUET, uri); - - -Also, it's a very common case to reserve the same amount of JVM direct memory -for the data read from datasets. For this use a built-in utility -class ``DirectReservationListener`` is provided: - -.. code-block:: Java - - NativeMemoryPool pool = NativeMemoryPool.createListenable( - DirectReservationListener.instance()); - -This way, once the allocated byte count of Arrow buffers reaches the limit of -JVM direct memory, ``OutOfMemoryError: Direct buffer memory`` will -be thrown during scanning. - -.. note:: - The default instance ``NativeMemoryPool.getDefaultMemoryPool()`` does - nothing on buffer allocation/deallocation. It's OK to use it in - the case of POC or testing, but for production use in complex environment, - it's recommended to manage memory by using a listenable memory pool. - -.. note:: - The ``BufferAllocator`` instance passed to ``FileSystemDatasetFactory``'s - constructor is also aware of the overall memory usage of the produced - dataset instances. Once the Java buffers are created the passed allocator - will become their parent allocator. - -Usage Notes -=========== - -Native Object Resource Management ---------------------------------- - -As another result of relying on JNI, all components related to -``FileSystemDataset`` should be closed manually or use try-with-resources to -release the corresponding native objects after using. 
For example: - -.. code-block:: Java - - String uri = "file:/opt/example.parquet"; - ScanOptions options = new ScanOptions(/*batchSize*/ 32768); - try ( - BufferAllocator allocator = new RootAllocator(); - DatasetFactory factory = new FileSystemDatasetFactory( - allocator, NativeMemoryPool.getDefault(), - FileFormat.PARQUET, uri); - Dataset dataset = factory.finish(); - Scanner scanner = dataset.newScan(options) - ) { - - // do something - - } catch (Exception e) { - e.printStackTrace(); - } - -If user forgets to close them then native object leakage might be caused. - -BatchSize ---------- - -The ``batchSize`` argument of ``ScanOptions`` is a limit on the size of an individual batch. - -For example, let's try to read a Parquet file with gzip compression and 3 row groups: - -.. code-block:: - - # Let configure ScanOptions as: - ScanOptions options = new ScanOptions(/*batchSize*/ 32768); - - $ parquet-tools meta data4_3rg_gzip.parquet - file schema: schema - age: OPTIONAL INT64 R:0 D:1 - name: OPTIONAL BINARY L:STRING R:0 D:1 - row group 1: RC:4 TS:182 OFFSET:4 - row group 2: RC:4 TS:190 OFFSET:420 - row group 3: RC:3 TS:179 OFFSET:838 - -Here, we set the batchSize in ScanOptions to 32768. Because that's greater -than the number of rows in the next batch, which is 4 rows because the first -row group has only 4 rows, then the program gets only 4 rows. The scanner -will not combine smaller batches to reach the limit, but it will split -large batches to stay under the limit. So in the case the row group had more -than 32768 rows, it would get split into blocks of 32768 rows or less. diff --git a/docs/source/java/flight.rst b/docs/source/java/flight.rst deleted file mode 100644 index 6d26583aeef..00000000000 --- a/docs/source/java/flight.rst +++ /dev/null @@ -1,239 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. 
regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -================ -Arrow Flight RPC -================ - -Arrow Flight is an RPC framework for efficient transfer of Arrow data -over the network. - -.. contents:: - -.. seealso:: - - :doc:`Flight protocol documentation <../format/Flight>` - Documentation of the Flight protocol, including how to use - Flight conceptually. - - `Java Cookbook `_ - Recipes for using Arrow Flight in Java. - -Writing a Flight Service -======================== - -Flight servers implement the `FlightProducer`_ interface. For convenience, -they can subclass `NoOpFlightProducer`_ instead, which offers default -implementations of all the RPC methods. - -.. code-block:: Java - - public class TutorialFlightProducer implements FlightProducer { - @Override - // Override methods or use NoOpFlightProducer for only methods needed - } - -Each RPC method always takes a ``CallContext`` for common parameters. To indicate -failure, pass an exception to the "listener" if present, or else raise an -exception. - -.. code-block:: Java - - // Server - @Override - public void listFlights(CallContext context, Criteria criteria, StreamListener listener) { - // ... - listener.onError( - CallStatus.UNAUTHENTICATED.withDescription( - "Custom UNAUTHENTICATED description message.").toRuntimeException()); - // ... 
- } - - // Client - try{ - Iterable flightInfosBefore = flightClient.listFlights(Criteria.ALL); - // ... - } catch (FlightRuntimeException e){ - // Catch UNAUTHENTICATED exception - } - -To start a server, create a `Location`_ to specify where to listen, and then create -a `FlightServer`_ with an instance of a producer. This will start the server, but -won't block the rest of the program. Call ``FlightServer.awaitTermination`` -to block until the server stops. - -.. code-block:: Java - - class TutorialFlightProducer implements FlightProducer { - @Override - // Override methods or use NoOpFlightProducer for only methods needed - } - - Location location = Location.forGrpcInsecure("0.0.0.0", 0); - try( - BufferAllocator allocator = new RootAllocator(); - FlightServer server = FlightServer.builder( - allocator, - location, - new TutorialFlightProducer() - ).build(); - ){ - server.start(); - System.out.println("Server listening on port " + server.getPort()); - server.awaitTermination(); - } catch (Exception e) { - e.printStackTrace(); - } - -.. code-block:: shell - - Server listening on port 58104 - -Using the Flight Client -======================= - -To connect to a Flight service, create a `FlightClient`_ with a location. - -.. code-block:: Java - - Location location = Location.forGrpcInsecure("0.0.0.0", 58104); - - try(BufferAllocator allocator = new RootAllocator(); - FlightClient client = FlightClient.builder(allocator, location).build()){ - // ... Consume operations exposed by Flight server - } catch (Exception e) { - e.printStackTrace(); - } - -Cancellation and Timeouts -========================= - -When making a call, clients can optionally provide ``CallOptions``. This allows -clients to set a timeout on calls. Also, some objects returned by client RPC calls -expose a cancel method which allows terminating a call early. - -.. 
code-block:: Java - - Location location = Location.forGrpcInsecure("0.0.0.0", 58609); - - try(BufferAllocator allocator = new RootAllocator(); - FlightClient tutorialFlightClient = FlightClient.builder(allocator, location).build()){ - - Iterator resultIterator = tutorialFlightClient.doAction( - new Action("test-timeout"), - CallOptions.timeout(2, TimeUnit.SECONDS) - ); - } catch (Exception e) { - e.printStackTrace(); - } - -On the server side, timeouts are transparent. For cancellation, the server needs to manually poll -``setOnCancelHandler`` or ``isCancelled`` to check if the client has cancelled the call, -and if so, break out of any processing the server is currently doing. - -.. code-block:: Java - - // Client - Location location = Location.forGrpcInsecure("0.0.0.0", 58609); - try(BufferAllocator allocator = new RootAllocator(); - FlightClient tutorialFlightClient = FlightClient.builder(allocator, location).build()){ - try(FlightStream flightStream = flightClient.getStream(new Ticket(new byte[]{}))) { - // ... - flightStream.cancel("tutorial-cancel", new Exception("Testing cancellation option!")); - } - } catch (Exception e) { - e.printStackTrace(); - } - // Server - @Override - public void getStream(CallContext context, Ticket ticket, ServerStreamListener listener) { - // ... - listener.setOnCancelHandler(()->{ - // Implement logic to handle cancellation option - }); - } - -Enabling TLS -============ - -TLS can be enabled when setting up a server by providing a -certificate and key pair to ``FlightServer.Builder.useTls``. - -On the client side, use ``Location.forGrpcTls`` to create the Location for the client. - -Enabling Authentication -======================= - -.. warning:: Authentication is insecure without enabling TLS. - -Handshake-based authentication can be enabled by implementing -``ServerAuthHandler``. 
Authentication consists of two parts: on -initial client connection, the server and client authentication -implementations can perform any negotiation needed. The client authentication -handler then provides a token that will be attached to future calls. - -The client sends data to be validated through ``ClientAuthHandler.authenticate``. -The server validates data received through ``ServerAuthHandler.authenticate``. - -Custom Middleware -================= - -Servers and clients support custom middleware (or interceptors) that are called on every -request and can modify the request in a limited fashion. These can be implemented by implementing the -``FlightServerMiddleware`` and ``FlightClientMiddleware`` interfaces. - -Middleware are fairly limited, but they can add headers to a -request/response. On the server, they can inspect incoming headers and -fail the request; hence, they can be used to implement custom -authentication methods. - -Adding Services -=============== - -Servers can add other gRPC services. For example, to add the `Health Check service `_: - -.. code-block:: Java - - final HealthStatusManager statusManager = new HealthStatusManager(); - final Consumer consumer = (builder) -> { - builder.addService(statusManager.getHealthService()); - }; - final Location location = forGrpcInsecure(LOCALHOST, 5555); - try ( - BufferAllocator a = new RootAllocator(Long.MAX_VALUE); - Producer producer = new Producer(a); - FlightServer s = FlightServer.builder(a, location, producer) - .transportHint("grpc.builderConsumer", consumer).build().start(); - ) { - Channel channel = NettyChannelBuilder.forAddress(location.toSocketAddress()).usePlaintext().build(); - HealthCheckResponse response = HealthGrpc - .newBlockingStub(channel) - .check(HealthCheckRequest.getDefaultInstance()); - - System.out.println(response.getStatus()); - } - - -:ref:`Flight best practices ` -==================================================== - - -.. 
_`FlightClient`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/flight/FlightClient.html -.. _`FlightProducer`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/flight/FlightProducer.html -.. _`FlightServer`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/flight/FlightServer.html -.. _`NoOpFlightProducer`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/flight/NoOpFlightProducer.html -.. _`Location`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/flight/Location.html diff --git a/docs/source/java/flight_sql.rst b/docs/source/java/flight_sql.rst deleted file mode 100644 index dbf97238d4c..00000000000 --- a/docs/source/java/flight_sql.rst +++ /dev/null @@ -1,32 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -================ -Arrow Flight SQL -================ - -Arrow Flight SQL is an RPC framework for efficient transfer of Arrow data -over the network. - -.. seealso:: - - :doc:`Flight SQL protocol documentation <../format/FlightSql>` - Documentation of the Flight SQL protocol. - -For usage information, see the `API documentation`_. - -.. 
_API documentation: https://arrow.apache.org/docs/java/reference/org/apache/arrow/flight/sql/package-summary.html diff --git a/docs/source/java/flight_sql_jdbc_driver.rst b/docs/source/java/flight_sql_jdbc_driver.rst deleted file mode 100644 index 290625ba714..00000000000 --- a/docs/source/java/flight_sql_jdbc_driver.rst +++ /dev/null @@ -1,176 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -============================ -Arrow Flight SQL JDBC Driver -============================ - -The Flight SQL JDBC driver is a JDBC driver implementation that uses -the :doc:`Flight SQL protocol <../format/FlightSql>` under the hood. -This driver can be used with any database that implements Flight SQL. - -.. contents:: - -Installation and Requirements -============================= - -The driver is compatible with JDK 11+. Note that the following JVM -parameter is required: - -.. code-block:: shell - - java --add-opens=java.base/java.nio=ALL-UNNAMED ... - -To add a dependency via Maven, use a ``pom.xml`` like the following: - -.. 
code-block:: xml - - - - 4.0.0 - org.example - demo - 1.0-SNAPSHOT - - 18.1.0 - - - - org.apache.arrow - flight-sql-jdbc-driver - ${arrow.version} - - - - -Connecting to a Database -======================== - -The URI format is as follows:: - - jdbc:arrow-flight-sql://HOSTNAME:PORT[/?param1=val1¶m2=val2&...] - -For example, take this URI:: - - jdbc:arrow-flight-sql://localhost:12345/?username=admin&password=pass&useEncryption=1 - -This will connect to a Flight SQL service running on ``localhost`` on -port 12345. It will create a secure, encrypted connection, and -authenticate using the username ``admin`` and the password ``pass``. - -The components of the URI are as follows. - -* The URI scheme must be ``jdbc:arrow-flight-sql://``. -* **HOSTNAME** is the hostname of the Flight SQL service. -* **PORT** is the port of the Flight SQL service. - -Additional options can be passed as query parameters. Parameter names are -case-sensitive. The supported parameters are: - -.. list-table:: - :header-rows: 1 - - * - Parameter - - Default - - Description - - * - disableCertificateVerification - - false - - When TLS is enabled, whether to verify the server certificate - - * - password - - null - - The password for user/password authentication - - * - threadPoolSize - - 1 - - The size of an internal thread pool - - * - token - - null - - The token used for token authentication - - * - trustStore - - null - - When TLS is enabled, the path to the certificate store - - * - trustStorePassword - - null - - When TLS is enabled, the password for the certificate store - - * - tlsRootCerts - - null - - Path to PEM-encoded root certificates for TLS - use this as - an alternative to ``trustStore`` - - * - clientCertificate - - null - - Path to PEM-encoded client mTLS certificate when the Flight - SQL server requires client verification. - - * - clientKey - - null - - Path to PEM-encoded client mTLS key when the Flight - SQL server requires client verification. 
- - * - useEncryption - - true - - Whether to use TLS (the default is an encrypted connection) - - * - user - - null - - The username for user/password authentication - - * - useSystemTrustStore - - true - - When TLS is enabled, whether to use the system certificate store - - * - retainCookies - - true - - Whether to use cookies from the initial connection in subsequent - internal connections when retrieving streams from separate endpoints. - - * - retainAuth - - true - - Whether to use bearer tokens obtained from the initial connection - in subsequent internal connections used for retrieving streams - from separate endpoints. - -Note that URI values must be URI-encoded if they contain characters such -as !, @, $, etc. - -Any URI parameters that are not handled by the driver are passed to -the Flight SQL service as gRPC headers. For example, the following URI :: - - jdbc:arrow-flight-sql://localhost:12345/?useEncryption=0&database=mydb - -This will connect without authentication or encryption, to a Flight -SQL service running on ``localhost`` on port 12345. Each request will -also include a ``database=mydb`` gRPC header. - -Connection parameters may also be supplied using the Properties object -when using the JDBC Driver Manager to connect. When supplying using -the Properties object, values should *not* be URI-encoded. - -Parameters specified by the URI supersede parameters supplied by the -Properties object. When calling the `user/password overload of -DriverManager#getConnection() -`_, -the username and password supplied on the URI supersede the username and -password arguments to the function call. diff --git a/docs/source/java/index.rst b/docs/source/java/index.rst deleted file mode 100644 index cf93b0e8978..00000000000 --- a/docs/source/java/index.rst +++ /dev/null @@ -1,46 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. 
distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _java: - -Java Implementation -=================== - -This is the documentation of the Java API of Apache Arrow. For more details -on the Arrow format and other language bindings see the :doc:`parent documentation <../index>`. - -.. toctree:: - :maxdepth: 2 - - quickstartguide - overview - install - memory - vector - vector_schema_root - table - ipc - algorithm - flight - flight_sql - flight_sql_jdbc_driver - dataset - substrait - cdata - jdbc - Reference (javadoc) - Java cookbook diff --git a/docs/source/java/install.rst b/docs/source/java/install.rst deleted file mode 100644 index c238690c6b9..00000000000 --- a/docs/source/java/install.rst +++ /dev/null @@ -1,232 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. 
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -======================= -Installing Java Modules -======================= - -.. contents:: - -System Compatibility -==================== - -Java modules are regularly built and tested on macOS and Linux distributions. - -Java Compatibility -================== - -Java modules are compatible with JDK 11 and above. Currently, JDK versions -11, 17, 21, and latest are tested in CI. - -Note that some JDK internals must be exposed by -adding ``--add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED`` to the ``java`` command: - -.. code-block:: shell - - # Directly on the command line - $ java --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED -jar ... - # Indirectly via environment variables - $ env JDK_JAVA_OPTIONS="--add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED" java -jar ... - -Otherwise, you may see errors like ``module java.base does not "opens -java.nio" to unnamed module`` or ``module java.base does not "opens -java.nio" to org.apache.arrow.memory.core`` - -Note that the command has changed from Arrow 15 and earlier. If you are still using the flags from that version -(``--add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED``) you will see the -``module java.base does not "opens java.nio" to org.apache.arrow.memory.core`` error. - -If you are using flight-core or dependent modules, you will need to mark that flight-core can read unnamed modules. -Modifying the command above for Flight: - -.. code-block:: shell - - # Directly on the command line - $ java --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED -jar ... 
- # Indirectly via environment variables - $ env JDK_JAVA_OPTIONS="--add-reads=org.apache.arrow.flight.core=ALL-UNNAMED --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED" java -jar ... - -Otherwise, you may see errors like ``java.lang.IllegalAccessError: superclass access check failed: class -org.apache.arrow.flight.ArrowMessage$ArrowBufRetainingCompositeByteBuf (in module org.apache.arrow.flight.core) -cannot access class io.netty.buffer.CompositeByteBuf (in unnamed module ...) because module -org.apache.arrow.flight.core does not read unnamed module ...`` - -Finally, if you are using arrow-dataset, you'll also need to report that JDK internals need to be exposed. -Modifying the command above for arrow-memory: - -.. code-block:: shell - - # Directly on the command line - $ java --add-opens=java.base/java.nio=org.apache.arrow.memory.core,ALL-UNNAMED -jar ... - # Indirectly via environment variables - $ env JDK_JAVA_OPTIONS="--add-opens=java.base/java.nio=org.apache.arrow.dataset,org.apache.arrow.memory.core,ALL-UNNAMED" java -jar ... - -Otherwise you may see errors such as ``java.lang.RuntimeException: java.lang.reflect.InaccessibleObjectException: -Unable to make static void java.nio.Bits.reserveMemory(long,long) accessible: module -java.base does not "opens java.nio" to module org.apache.arrow.dataset`` - -If using Maven and Surefire for unit testing, :ref:`this argument must -be added to Surefire as well `. - -Installing from Maven -===================== - -By default, Maven will download from the central repository: https://repo.maven.apache.org/maven2/org/apache/arrow/ - -Configure your pom.xml with the Java modules needed, for example: -arrow-vector, and arrow-memory-netty. - -.. 
code-block:: xml - - - - 4.0.0 - org.example - demo - 1.0-SNAPSHOT - - 9.0.0 - - - - org.apache.arrow - arrow-vector - ${arrow.version} - - - org.apache.arrow - arrow-memory-netty - ${arrow.version} - - - - -A bill of materials (BOM) module has been provided to simplify adding -Arrow modules. This eliminates the need to specify the version for -every module. An alternative to the above would be: - -.. code-block:: xml - - - - 4.0.0 - org.example - demo - 1.0-SNAPSHOT - - 15.0.0 - - - - org.apache.arrow - arrow-vector - - - org.apache.arrow - arrow-memory-netty - - - - - - org.apache.arrow - arrow-bom - ${arrow.version} - pom - import - - - - - -To use the Arrow Flight dependencies, also add the ``os-maven-plugin`` -plugin. This plugin generates useful platform-dependent properties -such as ``os.detected.name`` and ``os.detected.arch`` needed to resolve -transitive dependencies of Flight. - -.. code-block:: xml - - - - 4.0.0 - org.example - demo - 1.0-SNAPSHOT - - 9.0.0 - - - - org.apache.arrow - flight-core - ${arrow.version} - - - - - - kr.motd.maven - os-maven-plugin - 1.7.0 - - - - - -.. _java-install-maven-testing: - -The ``--add-opens`` flag must be added when running unit tests through Maven: - -.. code-block:: xml - - - - - org.apache.maven.plugins - maven-surefire-plugin - 3.0.0-M6 - - --add-opens=java.base/java.nio=ALL-UNNAMED - - - - - -Or they can be added via environment variable, for example when executing your code: - -.. code-block:: - - JDK_JAVA_OPTIONS="--add-opens=java.base/java.nio=ALL-UNNAMED" mvn exec:java -Dexec.mainClass="YourMainCode" - -Installing from Source -====================== - -See :ref:`java-development`. - -IDE Configuration -================= - -Generally, no additional configuration should be needed. However, -ensure your Maven or other build configuration has the ``--add-opens`` -flag as described above, so that the IDE picks it up and runs tests -with that flag as well. 
diff --git a/docs/source/java/ipc.rst b/docs/source/java/ipc.rst deleted file mode 100644 index f5939179177..00000000000 --- a/docs/source/java/ipc.rst +++ /dev/null @@ -1,202 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -=========================== -Reading/Writing IPC formats -=========================== -Arrow defines two types of binary formats for serializing record batches: - -* **Streaming format**: for sending an arbitrary number of record - batches. The format must be processed from start to end, and does not support - random access - -* **File or Random Access format**: for serializing a fixed number of record - batches. It supports random access, and thus is very useful when used with - memory maps - -Writing and Reading Streaming Format -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -First, let's populate a :class:`VectorSchemaRoot` with a small batch of records - -.. code-block:: Java - - BitVector bitVector = new BitVector("boolean", allocator); - VarCharVector varCharVector = new VarCharVector("varchar", allocator); - for (int i = 0; i < 10; i++) { - bitVector.setSafe(i, i % 2 == 0 ? 
0 : 1); - varCharVector.setSafe(i, ("test" + i).getBytes(StandardCharsets.UTF_8)); - } - bitVector.setValueCount(10); - varCharVector.setValueCount(10); - - List fields = Arrays.asList(bitVector.getField(), varCharVector.getField()); - List vectors = Arrays.asList(bitVector, varCharVector); - VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors); - -Now, we can begin writing a stream containing some number of these batches. For this we use :class:`ArrowStreamWriter` -(DictionaryProvider used for any vectors that are dictionary encoded is optional and can be null)) - -.. code-block:: Java - - try ( - ByteArrayOutputStream out = new ByteArrayOutputStream(); - ArrowStreamWriter writer = new ArrowStreamWriter(root, /*DictionaryProvider=*/null, Channels.newChannel(out)); - ) { - // ... do write into the ArrowStreamWriter - } - -Here we used an in-memory stream, but this could have been a socket or some other IO stream. Then we can do - -.. code-block:: Java - - writer.start(); - // write the first batch - writer.writeBatch(); - - // write another four batches. - for (int i = 0; i < 4; i++) { - // populate VectorSchemaRoot data and write the second batch - BitVector childVector1 = (BitVector)root.getVector(0); - VarCharVector childVector2 = (VarCharVector)root.getVector(1); - childVector1.reset(); - childVector2.reset(); - // ... do some populate work here, could be different for each batch - writer.writeBatch(); - } - - writer.end(); - -Note that, since the :class:`VectorSchemaRoot` in the writer is a container that can hold batches, batches flow through -:class:`VectorSchemaRoot` as part of a pipeline, so we need to populate data before ``writeBatch``, so that later batches -could overwrite previous ones. - -Now the :class:`ByteArrayOutputStream` contains the complete stream which contains 5 record batches. -We can read such a stream with :class:`ArrowStreamReader`. 
Note that the :class:`VectorSchemaRoot` within the reader -will be loaded with new values on every call to :class:`loadNextBatch()` - -.. code-block:: Java - - try (ArrowStreamReader reader = new ArrowStreamReader(new ByteArrayInputStream(out.toByteArray()), allocator)) { - // This will be loaded with new values on every call to loadNextBatch - VectorSchemaRoot readRoot = reader.getVectorSchemaRoot(); - Schema schema = readRoot.getSchema(); - for (int i = 0; i < 5; i++) { - reader.loadNextBatch(); - // ... do something with readRoot - } - } - -Here we also give a simple example with dictionary encoded vectors - -.. code-block:: Java - - // create provider - DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider(); - - try ( - final VarCharVector dictVector = new VarCharVector("dict", allocator); - final VarCharVector vector = new VarCharVector("vector", allocator); - ) { - // create dictionary vector - dictVector.allocateNewSafe(); - dictVector.setSafe(0, "aa".getBytes()); - dictVector.setSafe(1, "bb".getBytes()); - dictVector.setSafe(2, "cc".getBytes()); - dictVector.setValueCount(3); - - // create dictionary - Dictionary dictionary = - new Dictionary(dictVector, new DictionaryEncoding(1L, false, /*indexType=*/null)); - provider.put(dictionary); - - // create original data vector - vector.allocateNewSafe(); - vector.setSafe(0, "bb".getBytes()); - vector.setSafe(1, "bb".getBytes()); - vector.setSafe(2, "cc".getBytes()); - vector.setSafe(3, "aa".getBytes()); - vector.setValueCount(4); - - // get the encoded vector - IntVector encodedVector = (IntVector) DictionaryEncoder.encode(vector, dictionary); - - ByteArrayOutputStream out = new ByteArrayOutputStream(); - - // create VectorSchemaRoot - List fields = Arrays.asList(encodedVector.getField()); - List vectors = Arrays.asList(encodedVector); - try (VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors)) { - - // write data - ArrowStreamWriter writer = new 
ArrowStreamWriter(root, provider, Channels.newChannel(out)); - writer.start(); - writer.writeBatch(); - writer.end(); - } - - // read data - try (ArrowStreamReader reader = new ArrowStreamReader(new ByteArrayInputStream(out.toByteArray()), allocator)) { - reader.loadNextBatch(); - VectorSchemaRoot readRoot = reader.getVectorSchemaRoot(); - // get the encoded vector - IntVector intVector = (IntVector) readRoot.getVector(0); - - // get dictionaries and decode the vector - Map dictionaryMap = reader.getDictionaryVectors(); - long dictionaryId = intVector.getField().getDictionary().getId(); - try (VarCharVector varCharVector = - (VarCharVector) DictionaryEncoder.decode(intVector, dictionaryMap.get(dictionaryId))) { - // ... use decoded vector - } - } - } - -Writing and Reading Random Access Files -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The :class:`ArrowFileWriter` has the same API as :class:`ArrowStreamWriter` - -.. code-block:: Java - - try ( - ByteArrayOutputStream out = new ByteArrayOutputStream(); - ArrowFileWriter writer = new ArrowFileWriter(root, /*DictionaryProvider=*/null, Channels.newChannel(out)); - ) { - writer.start(); - // write the first batch - writer.writeBatch(); - // write another four batches. - for (int i = 0; i < 4; i++) { - // ... do populate work - writer.writeBatch(); - } - writer.end(); - } - -The difference between :class:`ArrowFileReader` and :class:`ArrowStreamReader` is that the input source -must have a ``seek`` method for random access. Because we have access to the entire payload, we know the -number of record batches in the file, and can read any at random - -.. 
code-block:: Java - - try (ArrowFileReader reader = new ArrowFileReader( - new ByteArrayReadableSeekableByteChannel(out.toByteArray()), allocator)) { - - // read the 4-th batch - ArrowBlock block = reader.getRecordBlocks().get(3); - reader.loadRecordBatch(block); - VectorSchemaRoot readBatch = reader.getVectorSchemaRoot(); - } diff --git a/docs/source/java/jdbc.rst b/docs/source/java/jdbc.rst deleted file mode 100644 index c0477cb06d8..00000000000 --- a/docs/source/java/jdbc.rst +++ /dev/null @@ -1,278 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -================== -Arrow JDBC Adapter -================== - -The Arrow JDBC Adapter assists with working with JDBC and Arrow -data. Currently, it supports reading JDBC ResultSets into Arrow -VectorSchemaRoots. - -ResultSet to VectorSchemaRoot Conversion -======================================== - -This can be accessed via the JdbcToArrow class. The resulting -ArrowVectorIterator will convert a ResultSet to Arrow data in batches -of rows. - -.. 
code-block:: java - - try (ArrowVectorIterator it = JdbcToArrow.sqlToArrowVectorIterator(resultSet, allocator)) { - while (it.hasNext()) { - VectorSchemaRoot root = it.next(); - // Consume the root… - } - } - -The batch size and type mapping can both be customized: - -.. code-block:: java - - JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator, /*calendar=*/null) - .setReuseVectorSchemaRoot(reuseVectorSchemaRoot) - .setJdbcToArrowTypeConverter((jdbcFieldInfo -> { - switch (jdbcFieldInfo.getJdbcType()) { - case Types.BIGINT: - // Assume actual value range is SMALLINT - return new ArrowType.Int(16, true); - default: - return null; - } - })) - .build(); - try (ArrowVectorIterator iter = JdbcToArrow.sqlToArrowVectorIterator(rs, config)) { - while (iter.hasNext()) { - VectorSchemaRoot root = iter.next(); - // Consume the root… - } - } - -The JDBC type can be explicitly specified, which is useful since JDBC -drivers can give spurious type information. For example, the Postgres -driver has been observed to use Decimal types with scale and precision -0; these cases can be handled by specifying the type explicitly before -reading. Also, some JDBC drivers may return BigDecimal values with -inconsistent scale. A RoundingMode can be set to handle these cases: - -.. 
code-block:: java - - Map mapping = new HashMap<>(); - mapping.put(1, new JdbcFieldInfo(Types.DECIMAL, 20, 7)); - JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator, /*calendar=*/null) - .setBigDecimalRoundingMode(RoundingMode.UNNECESSARY) - .setExplicitTypesByColumnIndex(mapping) - .build(); - try (ArrowVectorIterator iter = JdbcToArrow.sqlToArrowVectorIterator(rs, config)) { - while (iter.hasNext()) { - VectorSchemaRoot root = iter.next(); - // Consume the root… - } - } - -The mapping from JDBC type to Arrow type can be overridden via the -``JdbcToArrowConfig``, but it is not possible to customize the -conversion from JDBC value to Arrow value itself, nor is it possible -to define a conversion for an unsupported type. - -Type Mapping ------------- - -The JDBC to Arrow type mapping can be obtained at runtime from -`JdbcToArrowUtils.getArrowTypeFromJdbcType`_. - -.. _JdbcToArrowUtils.getArrowTypeFromJdbcType: https://arrow.apache.org/docs/java/reference/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.html#getArrowTypeFromJdbcType-org.apache.arrow.adapter.jdbc.JdbcFieldInfo-java.util.Calendar- - -+--------------------+--------------------+-------+ -| JDBC Type | Arrow Type | Notes | -+====================+====================+=======+ -| ARRAY | List | \(1) | -+--------------------+--------------------+-------+ -| BIGINT | Int64 | | -+--------------------+--------------------+-------+ -| BINARY | Binary | | -+--------------------+--------------------+-------+ -| BIT | Bool | | -+--------------------+--------------------+-------+ -| BLOB | Binary | | -+--------------------+--------------------+-------+ -| BOOLEAN | Bool | | -+--------------------+--------------------+-------+ -| CHAR | Utf8 | | -+--------------------+--------------------+-------+ -| CLOB | Utf8 | | -+--------------------+--------------------+-------+ -| DATE | Date32 | | -+--------------------+--------------------+-------+ -| DECIMAL | Decimal128 | \(2) | 
-+--------------------+--------------------+-------+ -| DOUBLE | Double | | -+--------------------+--------------------+-------+ -| FLOAT | Float32 | | -+--------------------+--------------------+-------+ -| INTEGER | Int32 | | -+--------------------+--------------------+-------+ -| LONGVARBINARY | Binary | | -+--------------------+--------------------+-------+ -| LONGNVARCHAR | Utf8 | | -+--------------------+--------------------+-------+ -| LONGVARCHAR | Utf8 | | -+--------------------+--------------------+-------+ -| NCHAR | Utf8 | | -+--------------------+--------------------+-------+ -| NULL | Null | | -+--------------------+--------------------+-------+ -| NUMERIC | Decimal128 | | -+--------------------+--------------------+-------+ -| NVARCHAR | Utf8 | | -+--------------------+--------------------+-------+ -| REAL | Float32 | | -+--------------------+--------------------+-------+ -| SMALLINT | Int16 | | -+--------------------+--------------------+-------+ -| STRUCT | Struct | \(3) | -+--------------------+--------------------+-------+ -| TIME | Time32[ms] | | -+--------------------+--------------------+-------+ -| TIMESTAMP | Timestamp[ms] | \(4) | -+--------------------+--------------------+-------+ -| TINYINT | Int8 | | -+--------------------+--------------------+-------+ -| VARBINARY | Binary | | -+--------------------+--------------------+-------+ -| VARCHAR | Utf8 | | -+--------------------+--------------------+-------+ - -* \(1) The list value type must be explicitly configured and cannot be - inferred. Use `setArraySubTypeByColumnIndexMap`_ or - `setArraySubTypeByColumnNameMap`_. -* \(2) By default, the scale of decimal values must match the scale in - the type exactly; precision is allowed to be any value greater or - equal to the type precision. If there is a mismatch, by default, an - exception will be thrown. This can be configured by setting a - different RoundingMode with setBigDecimalRoundingMode. 
-* \(3) Not fully supported: while the type conversion is defined, the - value conversion is not. See ARROW-17006_. -* \(4) If a Calendar is provided, then the timestamp will have the - timezone of the calendar, else it will be a timestamp without - timezone. - -.. _setArraySubTypeByColumnIndexMap: https://arrow.apache.org/docs/java/reference/org/apache/arrow/adapter/jdbc/JdbcToArrowConfigBuilder.html#setArraySubTypeByColumnIndexMap-java.util.Map- -.. _setArraySubTypeByColumnNameMap: https://arrow.apache.org/docs/java/reference/org/apache/arrow/adapter/jdbc/JdbcToArrowConfigBuilder.html#setArraySubTypeByColumnNameMap-java.util.Map- -.. _ARROW-17006: https://issues.apache.org/jira/browse/ARROW-17006 - -VectorSchemaRoot to PreparedStatement Parameter Conversion -========================================================== - -The adapter can bind rows of Arrow data from a VectorSchemaRoot to -parameters of a JDBC PreparedStatement. This can be accessed via the -JdbcParameterBinder class. Each call to next() will bind parameters -from the next row of data, and then the application can execute the -statement, call addBatch(), etc. as desired. Null values will lead to -a setNull call with an appropriate JDBC type code (listed below). - -.. code-block:: java - - final JdbcParameterBinder binder = - JdbcParameterBinder.builder(statement, root).bindAll().build(); - while (binder.next()) { - statement.executeUpdate(); - } - // Use a VectorLoader to update the root - binder.reset(); - while (binder.next()) { - statement.executeUpdate(); - } - -The mapping of vectors to parameters, the JDBC type code used by the -converters, and the type conversions themselves can all be customized: - -.. 
code-block:: java - - final JdbcParameterBinder binder = - JdbcParameterBinder.builder(statement, root) - .bind(/*parameterIndex*/2, /*columnIndex*/0) - .bind(/*parameterIndex*/1, customColumnBinderInstance) - .build(); - -Type Mapping ------------- - -The Arrow to JDBC type mapping can be obtained at runtime via -a method on ColumnBinder. - -+----------------------------+----------------------------+-------+ -| Arrow Type | JDBC Type | Notes | -+============================+============================+=======+ -| Binary | VARBINARY (setBytes) | | -+----------------------------+----------------------------+-------+ -| Bool | BOOLEAN (setBoolean) | | -+----------------------------+----------------------------+-------+ -| Date32 | DATE (setDate) | | -+----------------------------+----------------------------+-------+ -| Date64 | DATE (setDate) | | -+----------------------------+----------------------------+-------+ -| Decimal128 | DECIMAL (setBigDecimal) | | -+----------------------------+----------------------------+-------+ -| Decimal256 | DECIMAL (setBigDecimal) | | -+----------------------------+----------------------------+-------+ -| FixedSizeBinary | BINARY (setBytes) | | -+----------------------------+----------------------------+-------+ -| Float32 | REAL (setFloat) | | -+----------------------------+----------------------------+-------+ -| Int8 | TINYINT (setByte) | | -+----------------------------+----------------------------+-------+ -| Int16 | SMALLINT (setShort) | | -+----------------------------+----------------------------+-------+ -| Int32 | INTEGER (setInt) | | -+----------------------------+----------------------------+-------+ -| Int64 | BIGINT (setLong) | | -+----------------------------+----------------------------+-------+ -| LargeBinary | LONGVARBINARY (setBytes) | | -+----------------------------+----------------------------+-------+ -| LargeUtf8 | LONGVARCHAR (setString) | \(1) | 
-+----------------------------+----------------------------+-------+ -| Time[s] | TIME (setTime) | | -+----------------------------+----------------------------+-------+ -| Time[ms] | TIME (setTime) | | -+----------------------------+----------------------------+-------+ -| Time[us] | TIME (setTime) | | -+----------------------------+----------------------------+-------+ -| Time[ns] | TIME (setTime) | | -+----------------------------+----------------------------+-------+ -| Timestamp[s] | TIMESTAMP (setTimestamp) | \(2) | -+----------------------------+----------------------------+-------+ -| Timestamp[ms] | TIMESTAMP (setTimestamp) | \(2) | -+----------------------------+----------------------------+-------+ -| Timestamp[us] | TIMESTAMP (setTimestamp) | \(2) | -+----------------------------+----------------------------+-------+ -| Timestamp[ns] | TIMESTAMP (setTimestamp) | \(2) | -+----------------------------+----------------------------+-------+ -| Utf8 | VARCHAR (setString) | | -+----------------------------+----------------------------+-------+ - -* \(1) Strings longer than Integer.MAX_VALUE bytes (the maximum length - of a Java ``byte[]``) will cause a runtime exception. -* \(2) If the timestamp has a timezone, the JDBC type defaults to - TIMESTAMP_WITH_TIMEZONE. If the timestamp has no timezone, - technically there is not a correct conversion from Arrow value to - JDBC value, because a JDBC Timestamp is in UTC, and we have no - timezone information. In this case, the default binder will call - `setTimestamp(int, Timestamp) - `_, - which will lead to the driver using the "default timezone" (that of - the Java VM). diff --git a/docs/source/java/memory.rst b/docs/source/java/memory.rst deleted file mode 100644 index 28ff01fb944..00000000000 --- a/docs/source/java/memory.rst +++ /dev/null @@ -1,501 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. 
distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -================= -Memory Management -================= - -The memory modules contain all the functionality that Arrow uses to allocate and deallocate memory. This document is divided in two parts: -The first part, *Memory Basics*, provides a high-level introduction. The following section, *Arrow Memory In-Depth*, fills in the details. - -.. contents:: - -Memory Basics -============= -This section will introduce you to the major concepts in Java’s memory management: - -* `ArrowBuf`_ -* `BufferAllocator`_ -* Reference counting - -It also provides some guidelines for working with memory in Arrow, and describes how to debug memory issues when they arise. - -Getting Started ---------------- - -Arrow's memory management is built around the needs of the columnar format and using off-heap memory. -Arrow Java has its own independent implementation. It does not wrap the C++ implementation, although the framework is flexible enough -to be used with memory allocated in C++ that is used by Java code. - -Arrow provides multiple modules: the core interfaces, and implementations of the interfaces. -Users need the core interfaces, and exactly one of the implementations. - -* ``memory-core``: Provides the interfaces used by the Arrow libraries and applications. 
-* ``memory-netty``: An implementation of the memory interfaces based on the `Netty`_ library. -* ``memory-unsafe``: An implementation of the memory interfaces based on the `sun.misc.Unsafe`_ library. - - -ArrowBuf --------- - -ArrowBuf represents a single, contiguous region of `direct memory`_. It consists of an address and a length, -and provides low-level interfaces for working with the contents, similar to ByteBuffer. - -Unlike (Direct)ByteBuffer, it has reference counting built in, as discussed later. - -Why Arrow Uses Direct Memory -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -* The JVM can optimize I/O operations when using direct memory/direct buffers; it will attempt to avoid copying buffer contents to/from an intermediate buffer. This can speed up IPC in Arrow. -* Since Arrow always uses direct memory, JNI modules can directly wrap native memory addresses instead of copying data. We use this in modules like the C Data Interface. -* Conversely, on the C++ side of the JNI boundary, we can directly access the memory in ArrowBuf without copying data. - -BufferAllocator ---------------- - -The `BufferAllocator`_ is primarily an arena or nursery used for accounting of buffers (ArrowBuf instances). -As the name suggests, it can allocate new buffers associated with itself, but it can also -handle the accounting for buffers allocated elsewhere. For example, it handles the Java-side accounting for -memory allocated in C++ and shared with Java using the C-Data Interface. In the code below it performs an allocation: - -.. code-block:: Java - - import org.apache.arrow.memory.ArrowBuf; - import org.apache.arrow.memory.BufferAllocator; - import org.apache.arrow.memory.RootAllocator; - - try(BufferAllocator bufferAllocator = new RootAllocator(8 * 1024)){ - ArrowBuf arrowBuf = bufferAllocator.buffer(4 * 1024); - System.out.println(arrowBuf); - arrowBuf.close(); - } - -.. 
code-block:: shell - - ArrowBuf[2], address:140363641651200, length:4096 - -The concrete implementation of the BufferAllocator interface is `RootAllocator`_. Applications should generally create -one RootAllocator at the start of the program, and use it through the BufferAllocator interface. Allocators implement -AutoCloseable and must be closed after the application is done with them; this will check that all outstanding memory -has been freed (see the next section). - -Arrow provides a tree-based model for memory allocation. The RootAllocator is created first, then more allocators -are created as children of an existing allocator via `newChildAllocator`_. When creating a RootAllocator or a child -allocator, a memory limit is provided, and when allocating memory, the limit is checked. Furthermore, when allocating -memory from a child allocator, those allocations are also reflected in all parent allocators. Hence, the RootAllocator -effectively sets the program-wide memory limit, and serves as the master bookkeeper for all memory allocations. - -Child allocators are not strictly required, but can help better organize code. For instance, a lower memory limit can -be set for a particular section of code. The child allocator can be closed when that section completes, -at which point it checks that that section didn't leak any memory. -Child allocators can also be named, which makes it easier to tell where an ArrowBuf came from during debugging. - -Reference counting ------------------- - -Because direct memory is expensive to allocate and deallocate, allocators may share direct buffers. To manage shared buffers -deterministically, we use manual reference counting instead of the garbage collector. -This simply means that each buffer has a counter keeping track of the number of references to -the buffer, and the user is responsible for properly incrementing/decrementing the counter as the buffer is used. 
- -In Arrow, each ArrowBuf has an associated `ReferenceManager`_ that tracks the reference count. You can retrieve -it with ArrowBuf.getReferenceManager(). The reference count is updated using `ReferenceManager.release`_ to decrement the count, -and `ReferenceManager.retain`_ to increment it. - -Of course, this is tedious and error-prone, so instead of directly working with buffers, we typically use -higher-level APIs like ValueVector. Such classes generally implement Closeable/AutoCloseable and will automatically -decrement the reference count when closed. - -Allocators implement AutoCloseable as well. In this case, closing the allocator will check that all buffers -obtained from the allocator are closed. If not, ``close()`` method will raise an exception; this helps track -memory leaks from unclosed buffers. - -Reference counting needs to be handled carefully. To ensure that an -independent section of code has fully cleaned up all allocated buffers, use a new child allocator. - -Development Guidelines ----------------------- - -Applications should generally: - -* Use the BufferAllocator interface in APIs instead of RootAllocator. -* Create one RootAllocator at the start of the program and explicitly pass it when needed. -* ``close()`` allocators after use (whether they are child allocators or the RootAllocator), either manually or preferably via a try-with-resources statement. - - -Debugging Memory Leaks/Allocation ---------------------------------- - -In ``DEBUG`` mode, the allocator and supporting classes will record additional -debug tracking information to better track down memory leaks and issues. To -enable DEBUG mode pass the following system property to the VM when starting -``-Darrow.memory.debug.allocator=true``. - -When DEBUG is enabled, a log will be kept of allocations. Configure SLF4J to see these logs (e.g. via Logback/Apache Log4j). -Consider the following example to see how it helps us with the tracking of allocators: - -.. 
code-block:: Java - - import org.apache.arrow.memory.ArrowBuf; - import org.apache.arrow.memory.BufferAllocator; - import org.apache.arrow.memory.RootAllocator; - - try (BufferAllocator bufferAllocator = new RootAllocator(8 * 1024)) { - ArrowBuf arrowBuf = bufferAllocator.buffer(4 * 1024); - System.out.println(arrowBuf); - } - -Without the debug mode enabled, when we close the allocator, we get this: - -.. code-block:: shell - - 11:56:48.944 [main] INFO o.apache.arrow.memory.BaseAllocator - Debug mode disabled. - ArrowBuf[2], address:140508391276544, length:4096 - 16:28:08.847 [main] ERROR o.apache.arrow.memory.BaseAllocator - Memory was leaked by query. Memory leaked: (4096) - Allocator(ROOT) 0/4096/4096/8192 (res/actual/peak/limit) - -Enabling the debug mode, we get more details: - -.. code-block:: shell - - 11:56:48.944 [main] INFO o.apache.arrow.memory.BaseAllocator - Debug mode enabled. - ArrowBuf[2], address:140437894463488, length:4096 - Exception in thread "main" java.lang.IllegalStateException: Allocator[ROOT] closed with outstanding buffers allocated (1). - Allocator(ROOT) 0/4096/4096/8192 (res/actual/peak/limit) - child allocators: 0 - ledgers: 1 - ledger[1] allocator: ROOT), isOwning: , size: , references: 1, life: 261438177096661..0, allocatorManager: [, life: ] holds 1 buffers. - ArrowBuf[2], address:140437894463488, length:4096 - reservations: 0 - -Additionally, in debug mode, `ArrowBuf.print()`_ can be used to obtain a debug string. -This will include information about allocation operations on the buffer with stack traces, such as when/where the buffer was allocated. - -.. 
code-block:: java - - import org.apache.arrow.memory.ArrowBuf; - import org.apache.arrow.memory.BufferAllocator; - import org.apache.arrow.memory.RootAllocator; - - try (final BufferAllocator allocator = new RootAllocator()) { - try (final ArrowBuf buf = allocator.buffer(1024)) { - final StringBuilder sb = new StringBuilder(); - buf.print(sb, /*indent*/ 0); - System.out.println(sb.toString()); - } - } - -.. code-block:: text - - ArrowBuf[2], address:140433199984656, length:1024 - event log for: ArrowBuf[2] - 675959093395667 create() - at org.apache.arrow.memory.util.HistoricalLog$Event.(HistoricalLog.java:175) - at org.apache.arrow.memory.util.HistoricalLog.recordEvent(HistoricalLog.java:83) - at org.apache.arrow.memory.ArrowBuf.(ArrowBuf.java:96) - at org.apache.arrow.memory.BufferLedger.newArrowBuf(BufferLedger.java:271) - at org.apache.arrow.memory.BaseAllocator.bufferWithoutReservation(BaseAllocator.java:300) - at org.apache.arrow.memory.BaseAllocator.buffer(BaseAllocator.java:276) - at org.apache.arrow.memory.RootAllocator.buffer(RootAllocator.java:29) - at org.apache.arrow.memory.BaseAllocator.buffer(BaseAllocator.java:240) - at org.apache.arrow.memory.RootAllocator.buffer(RootAllocator.java:29) - at REPL.$JShell$14.do_it$($JShell$14.java:10) - at jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(NativeMethodAccessorImpl.java:-2) - at jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) - at jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) - at java.lang.reflect.Method.invoke(Method.java:566) - at jdk.jshell.execution.DirectExecutionControl.invoke(DirectExecutionControl.java:209) - at jdk.jshell.execution.RemoteExecutionControl.invoke(RemoteExecutionControl.java:116) - at jdk.jshell.execution.DirectExecutionControl.invoke(DirectExecutionControl.java:119) - at jdk.jshell.execution.ExecutionControlForwarder.processCommand(ExecutionControlForwarder.java:144) - at 
jdk.jshell.execution.ExecutionControlForwarder.commandLoop(ExecutionControlForwarder.java:262) - at jdk.jshell.execution.Util.forwardExecutionControl(Util.java:76) - at jdk.jshell.execution.Util.forwardExecutionControlAndIO(Util.java:137) - at jdk.jshell.execution.RemoteExecutionControl.main(RemoteExecutionControl.java:70) - -The BufferAllocator also provides a ``BufferAllocator.toVerboseString()`` which can be used in -``DEBUG`` mode to get extensive stacktrace information and events associated with various Allocator behaviors. - -Finally, enabling the ``TRACE`` logging level will automatically provide this stack trace when the allocator is closed: - -.. code-block:: java - - // Assumes use of Logback; adjust for Log4j, etc. as appropriate - import ch.qos.logback.classic.Level; - import ch.qos.logback.classic.Logger; - import org.apache.arrow.memory.ArrowBuf; - import org.apache.arrow.memory.BufferAllocator; - import org.apache.arrow.memory.RootAllocator; - import org.slf4j.LoggerFactory; - - // Set log level to TRACE to get tracebacks - ((Logger) LoggerFactory.getLogger("org.apache.arrow")).setLevel(Level.TRACE); - try (final BufferAllocator allocator = new RootAllocator()) { - // Leak buffer - allocator.buffer(1024); - } - -.. code-block:: text - - | Exception java.lang.IllegalStateException: Allocator[ROOT] closed with outstanding buffers allocated (1). - Allocator(ROOT) 0/1024/1024/9223372036854775807 (res/actual/peak/limit) - child allocators: 0 - ledgers: 1 - ledger[1] allocator: ROOT), isOwning: , size: , references: 1, life: 712040870231544..0, allocatorManager: [, life: ] holds 1 buffers. 
- ArrowBuf[2], address:139926571810832, length:1024 - event log for: ArrowBuf[2] - 712040888650134 create() - at org.apache.arrow.memory.util.StackTrace.(StackTrace.java:34) - at org.apache.arrow.memory.util.HistoricalLog$Event.(HistoricalLog.java:175) - at org.apache.arrow.memory.util.HistoricalLog.recordEvent(HistoricalLog.java:83) - at org.apache.arrow.memory.ArrowBuf.(ArrowBuf.java:96) - at org.apache.arrow.memory.BufferLedger.newArrowBuf(BufferLedger.java:271) - at org.apache.arrow.memory.BaseAllocator.bufferWithoutReservation(BaseAllocator.java:300) - at org.apache.arrow.memory.BaseAllocator.buffer(BaseAllocator.java:276) - at org.apache.arrow.memory.RootAllocator.buffer(RootAllocator.java:29) - at org.apache.arrow.memory.BaseAllocator.buffer(BaseAllocator.java:240) - at org.apache.arrow.memory.RootAllocator.buffer(RootAllocator.java:29) - at REPL.$JShell$18.do_it$($JShell$18.java:13) - at jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(NativeMethodAccessorImpl.java:-2) - at jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) - at jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) - at java.lang.reflect.Method.invoke(Method.java:566) - at jdk.jshell.execution.DirectExecutionControl.invoke(DirectExecutionControl.java:209) - at jdk.jshell.execution.RemoteExecutionControl.invoke(RemoteExecutionControl.java:116) - at jdk.jshell.execution.DirectExecutionControl.invoke(DirectExecutionControl.java:119) - at jdk.jshell.execution.ExecutionControlForwarder.processCommand(ExecutionControlForwarder.java:144) - at jdk.jshell.execution.ExecutionControlForwarder.commandLoop(ExecutionControlForwarder.java:262) - at jdk.jshell.execution.Util.forwardExecutionControl(Util.java:76) - at jdk.jshell.execution.Util.forwardExecutionControlAndIO(Util.java:137) - - reservations: 0 - - | at BaseAllocator.close (BaseAllocator.java:405) - | at RootAllocator.close (RootAllocator.java:29) - | at (#8:1) - 
-Sometimes, explicitly passing allocators around is difficult. For example, it -can be hard to pass around extra state, like an allocator, through layers of -existing application or framework code. A global or singleton allocator instance -can be useful here, though it should not be your first choice. - -How this works: - -1. Set up a global allocator in a singleton class. -2. Provide methods to create child allocators from the global allocator. -3. Give child allocators proper names to make it easier to figure out where - allocations occurred in case of errors. -4. Ensure that resources are properly closed. -5. Check that the global allocator is empty at some suitable point, such as - right before program shutdown. -6. If it is not empty, review the above allocation bugs. - -.. code-block:: java - - //1 - private static final BufferAllocator allocator = new RootAllocator(); - private static final AtomicInteger childNumber = new AtomicInteger(0); - ... - //2 - public static BufferAllocator getChildAllocator() { - return allocator.newChildAllocator(nextChildName(), 0, Long.MAX_VALUE); - } - ... - //3 - private static String nextChildName() { - return "Allocator-Child-" + childNumber.incrementAndGet(); - } - ... - //4: Business code - try (BufferAllocator allocator = GlobalAllocator.getChildAllocator()) { - ... - } - ... - //5 - public static void checkGlobalCleanUpResources() { - ... - if (!allocator.getChildAllocators().isEmpty()) { - throw new IllegalStateException(...); - } else if (allocator.getAllocatedMemory() != 0) { - throw new IllegalStateException(...); - } - } - -.. _`ArrowBuf`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/memory/ArrowBuf.html -.. _`ArrowBuf.print()`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/memory/ArrowBuf.html#print-java.lang.StringBuilder-int-org.apache.arrow.memory.BaseAllocator.Verbosity- -.. 
_`BufferAllocator`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/memory/BufferAllocator.html -.. _`BufferLedger`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/memory/BufferLedger.html -.. _`RootAllocator`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/memory/RootAllocator.html -.. _`newChildAllocator`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/memory/RootAllocator.html#newChildAllocator-java.lang.String-org.apache.arrow.memory.AllocationListener-long-long- -.. _`Netty`: https://netty.io/wiki/ -.. _`sun.misc.unsafe`: https://web.archive.org/web/20210929024401/http://www.docjar.com/html/api/sun/misc/Unsafe.java.html -.. _`Direct Memory`: https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/nio/ByteBuffer.html -.. _`ReferenceManager`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/memory/ReferenceManager.html -.. _`ReferenceManager.release`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/memory/ReferenceManager.html#release-- -.. _`ReferenceManager.retain`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/memory/ReferenceManager.html#retain-- - -Arrow Memory In-Depth -===================== - -Design Principles ------------------ -Arrow’s memory model is based on the following basic concepts: - -- Memory can be allocated up to some limit. That limit could be a real - limit (OS/JVM) or a locally imposed limit. -- Allocation operates in two phases: accounting then actual allocation. - Allocation could fail at either point. -- Allocation failure should be recoverable. In all cases, the Allocator - infrastructure should expose memory allocation failures (OS or - internal limit-based) as ``OutOfMemoryException``\ s. -- Any allocator can reserve memory when created. This memory shall be - held such that this allocator will always be able to allocate that - amount of memory. 
-- A particular application component should work to use a local - allocator to understand local memory usage and better debug memory - leaks. -- The same physical memory can be shared by multiple allocators and the - allocator must provide an accounting paradigm for this purpose. - -Reserving Memory ----------------- - -Arrow provides two different ways to reserve memory: - -- BufferAllocator accounting reservations: When a new allocator (other - than the ``RootAllocator``) is initialized, it can set aside memory - that it will keep locally for its lifetime. This is memory that will - never be released back to its parent allocator until the allocator is - closed. -- ``AllocationReservation`` via BufferAllocator.newReservation(): - Allows a short-term preallocation strategy so that a particular - subsystem can ensure future memory is available to support a - particular request. - -Reference Counting Details --------------------------- - -Typically, the ReferenceManager implementation used is an instance of `BufferLedger`_. -A BufferLedger is a ReferenceManager that also maintains the relationship between an ``AllocationManager``, -a ``BufferAllocator`` and one or more individual ``ArrowBuf``\ s - -All ArrowBufs (direct or sliced) related to a single BufferLedger/BufferAllocator combination -share the same reference count and either all will be valid or all will be invalid. -For simplicity of accounting, we treat that memory as being used by one -of the BufferAllocators associated with the memory. When that allocator -releases its claim on that memory, the memory ownership is then moved to -another BufferLedger belonging to the same AllocationManager. 
- -Allocation Details ------------------- - -There are several Allocator types in Arrow Java: - -- ``BufferAllocator`` - The public interface application users should be leveraging -- ``BaseAllocator`` - The base implementation of memory allocation, contains the meat of the Arrow allocator implementation -- ``RootAllocator`` - The root allocator. Typically only one created for a JVM. It serves as the parent/ancestor for child allocators -- ``ChildAllocator`` - A child allocator that derives from the root allocator - -Many BufferAllocators can reference the same piece of physical memory at the same -time. It is the AllocationManager’s responsibility to ensure that in this situation, -all memory is accurately accounted for from the Root’s perspective -and also to ensure that the memory is correctly released once all -BufferAllocators have stopped using that memory. - -For simplicity of accounting, we treat that memory as being used by one -of the BufferAllocators associated with the memory. When that allocator -releases its claim on that memory, the memory ownership is then moved to -another BufferLedger belonging to the same AllocationManager. Note that -because a ArrowBuf.release() is what actually causes memory ownership -transfer to occur, we always proceed with ownership transfer (even if -that violates an allocator limit). It is the responsibility of the -application owning a particular allocator to frequently confirm whether -the allocator is over its memory limit (BufferAllocator.isOverLimit()) -and if so, attempt to aggressively release memory to ameliorate the -situation. - - -Object Hierarchy ----------------- - -There are two main ways that someone can look at the object hierarchy -for Arrow’s memory management scheme. The first is a memory based -perspective as below: - -Memory Perspective -~~~~~~~~~~~~~~~~~~ - -.. 
code-block:: none - - + AllocationManager - | - |-- UnsignedDirectLittleEndian (One per AllocationManager) - | - |-+ BufferLedger 1 ==> Allocator A (owning) - | ` - ArrowBuf 1 - |-+ BufferLedger 2 ==> Allocator B (non-owning) - | ` - ArrowBuf 2 - |-+ BufferLedger 3 ==> Allocator C (non-owning) - | - ArrowBuf 3 - | - ArrowBuf 4 - ` - ArrowBuf 5 - -In this picture, a piece of memory is owned by an allocator manager. An -allocator manager is responsible for that piece of memory no matter -which allocator(s) it is working with. An allocator manager will have -relationships with a piece of raw memory (via its reference to -UnsignedDirectLittleEndian) as well as references to each -BufferAllocator it has a relationship to. - -Allocator Perspective -~~~~~~~~~~~~~~~~~~~~~ - -.. code-block:: none - - + RootAllocator - |-+ ChildAllocator 1 - | | - ChildAllocator 1.1 - | ` ... - | - |-+ ChildAllocator 2 - |-+ ChildAllocator 3 - | | - | |-+ BufferLedger 1 ==> AllocationManager 1 (owning) ==> UDLE - | | `- ArrowBuf 1 - | `-+ BufferLedger 2 ==> AllocationManager 2 (non-owning)==> UDLE - | `- ArrowBuf 2 - | - |-+ BufferLedger 3 ==> AllocationManager 1 (non-owning)==> UDLE - | ` - ArrowBuf 3 - |-+ BufferLedger 4 ==> AllocationManager 2 (owning) ==> UDLE - | - ArrowBuf 4 - | - ArrowBuf 5 - ` - ArrowBuf 6 - -In this picture, a RootAllocator owns three ChildAllocators. The first -ChildAllocator (ChildAllocator 1) owns a subsequent ChildAllocator. -ChildAllocator has two BufferLedgers/AllocationManager references. -Coincidentally, each of these AllocationManager’s is also associated -with the RootAllocator. In this case, one of the these -AllocationManagers is owned by ChildAllocator 3 (AllocationManager 1) -while the other AllocationManager (AllocationManager 2) is -owned/accounted for by the RootAllocator. Note that in this scenario, -ArrowBuf 1 is sharing the underlying memory as ArrowBuf 3. However the -subset of that memory (e.g.Β through slicing) might be different. 
Also -note that ArrowBuf 2 and ArrowBuf 4, 5 and 6 are also sharing the same -underlying memory. Also note that ArrowBuf 4, 5 and 6 all share the same -reference count and fate. diff --git a/docs/source/java/overview.rst b/docs/source/java/overview.rst deleted file mode 100644 index 7780ee32ec9..00000000000 --- a/docs/source/java/overview.rst +++ /dev/null @@ -1,92 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -=================== -High-Level Overview -=================== - -.. contents:: - -The Apache Arrow Java modules implement various specifications including the -columnar format and IPC. Most modules are native Java implementations, -but some modules are JNI bindings to the C++ library. - -.. list-table:: Arrow Java Modules - :widths: 25 50 25 - :header-rows: 1 - - * - Module - - Description - - Implementation - * - arrow-format - - Generated Java files from the IPC Flatbuffer definitions. - - Native - * - arrow-memory-core - - Core off-heap memory management libraries for Arrow ValueVectors. - - Native - * - arrow-memory-unsafe - - Memory management implementation based on sun.misc.Unsafe. - - Native - * - arrow-memory-netty - - Memory management implementation based on Netty. 
- - Native - * - arrow-vector - - An off-heap reference implementation for Arrow columnar data format. - - Native - * - arrow-tools - - Java applications for working with Arrow ValueVectors. - - Native - * - arrow-jdbc - - (Experimental) A library for converting JDBC data to Arrow data. - - Native - * - flight-core - - An RPC mechanism for transferring ValueVectors. - - Native - * - flight-sql - - Contains utility classes to expose Flight SQL semantics for clients and servers over Arrow Flight. - - Native - * - flight-integration-tests - - Integration tests for Flight RPC. - - Native - * - arrow-performance - - JMH benchmarks for the Arrow libraries. - - Native - * - arrow-algorithm - - (Experimental) A collection of algorithms for working with ValueVectors. - - Native - * - arrow-avro - - (Experimental) A library for converting Avro data to Arrow data. - - Native - * - arrow-compression - - (Experimental) A library for working with compression/decompression of Arrow data. - - Native - * - arrow-c-data - - Java implementation of `C Data Interface`_ - - JNI - * - arrow-orc - - (Experimental) A JNI wrapper for the C++ ORC reader implementation. - - JNI - * - arrow-gandiva - - Java wrappers around the native Gandiva SQL expression compiler. - - JNI - * - arrow-dataset - - Java bindings to the Arrow Datasets library. - - JNI - -Arrow Java modules support working with data (1) in-memory, (2) at rest, and (3) on-the-wire. - -.. _`C Data Interface`: https://arrow.apache.org/docs/format/CDataInterface.html diff --git a/docs/source/java/quickstartguide.rst b/docs/source/java/quickstartguide.rst deleted file mode 100644 index 1f3ec861d3f..00000000000 --- a/docs/source/java/quickstartguide.rst +++ /dev/null @@ -1,316 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. 
to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -================= -Quick Start Guide -================= - -.. contents:: - -Arrow Java provides several building blocks. Data types describe the types of values; -ValueVectors are sequences of typed values; fields describe the types of columns in -tabular data; schemas describe a sequence of columns in tabular data, and -VectorSchemaRoot represents tabular data. Arrow also provides readers and -writers for loading data from and persisting data to storage. - -Create a ValueVector -******************** - -**ValueVectors** represent a sequence of values of the same type. -They are also known as "arrays" in the columnar format. - -Example: create a vector of 32-bit integers representing ``[1, null, 2]``: - -.. code-block:: Java - - import org.apache.arrow.memory.BufferAllocator; - import org.apache.arrow.memory.RootAllocator; - import org.apache.arrow.vector.IntVector; - - try( - BufferAllocator allocator = new RootAllocator(); - IntVector intVector = new IntVector("fixed-size-primitive-layout", allocator); - ){ - intVector.allocateNew(3); - intVector.set(0,1); - intVector.setNull(1); - intVector.set(2,2); - intVector.setValueCount(3); - System.out.println("Vector created in memory: " + intVector); - } - -.. code-block:: shell - - Vector created in memory: [1, null, 2] - - -Example: create a vector of UTF-8 encoded strings representing ``["one", "two", "three"]``: - -.. 
code-block:: Java - - import org.apache.arrow.memory.BufferAllocator; - import org.apache.arrow.memory.RootAllocator; - import org.apache.arrow.vector.VarCharVector; - - try( - BufferAllocator allocator = new RootAllocator(); - VarCharVector varCharVector = new VarCharVector("variable-size-primitive-layout", allocator); - ){ - varCharVector.allocateNew(3); - varCharVector.set(0, "one".getBytes()); - varCharVector.set(1, "two".getBytes()); - varCharVector.set(2, "three".getBytes()); - varCharVector.setValueCount(3); - System.out.println("Vector created in memory: " + varCharVector); - } - -.. code-block:: shell - - Vector created in memory: [one, two, three] - -Create a Field -************** - -**Fields** are used to denote the particular columns of tabular data. -They consist of a name, a data type, a flag indicating whether the column can have null values, -and optional key-value metadata. - -Example: create a field named "document" of string type: - -.. code-block:: Java - - import org.apache.arrow.vector.types.pojo.ArrowType; - import org.apache.arrow.vector.types.pojo.Field; - import org.apache.arrow.vector.types.pojo.FieldType; - import java.util.HashMap; - import java.util.Map; - - Map metadata = new HashMap<>(); - metadata.put("A", "Id card"); - metadata.put("B", "Passport"); - metadata.put("C", "Visa"); - Field document = new Field("document", - new FieldType(true, new ArrowType.Utf8(), /*dictionary*/ null, metadata), - /*children*/ null); - System.out.println("Field created: " + document + ", Metadata: " + document.getMetadata()); - -.. code-block:: shell - - Field created: document: Utf8, Metadata: {A=Id card, B=Passport, C=Visa} - -Create a Schema -*************** - -**Schemas** hold a sequence of fields together with some optional metadata. - -Example: Create a schema describing datasets with two columns: -an int32 column "A" and a UTF8-encoded string column "B" - -.. 
code-block:: Java - - import org.apache.arrow.vector.types.pojo.ArrowType; - import org.apache.arrow.vector.types.pojo.Field; - import org.apache.arrow.vector.types.pojo.FieldType; - import org.apache.arrow.vector.types.pojo.Schema; - import java.util.HashMap; - import java.util.Map; - import static java.util.Arrays.asList; - - Map metadata = new HashMap<>(); - metadata.put("K1", "V1"); - metadata.put("K2", "V2"); - Field a = new Field("A", FieldType.nullable(new ArrowType.Int(32, true)), /*children*/ null); - Field b = new Field("B", FieldType.nullable(new ArrowType.Utf8()), /*children*/ null); - Schema schema = new Schema(asList(a, b), metadata); - System.out.println("Schema created: " + schema); - -.. code-block:: shell - - Schema created: Schema(metadata: {K1=V1, K2=V2}) - -Create a VectorSchemaRoot -************************* - -A **VectorSchemaRoot** combines ValueVectors with a Schema to represent tabular data. - -Example: Create a dataset of names (strings) and ages (32-bit signed integers). - -.. 
code-block:: Java - - import org.apache.arrow.memory.BufferAllocator; - import org.apache.arrow.memory.RootAllocator; - import org.apache.arrow.vector.IntVector; - import org.apache.arrow.vector.VarCharVector; - import org.apache.arrow.vector.VectorSchemaRoot; - import org.apache.arrow.vector.types.pojo.ArrowType; - import org.apache.arrow.vector.types.pojo.Field; - import org.apache.arrow.vector.types.pojo.FieldType; - import org.apache.arrow.vector.types.pojo.Schema; - import java.nio.charset.StandardCharsets; - import java.util.HashMap; - import java.util.Map; - import static java.util.Arrays.asList; - - Field age = new Field("age", - FieldType.nullable(new ArrowType.Int(32, true)), - /*children*/null - ); - Field name = new Field("name", - FieldType.nullable(new ArrowType.Utf8()), - /*children*/null - ); - Schema schema = new Schema(asList(age, name), /*metadata*/ null); - try( - BufferAllocator allocator = new RootAllocator(); - VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator); - IntVector ageVector = (IntVector) root.getVector("age"); - VarCharVector nameVector = (VarCharVector) root.getVector("name"); - ){ - ageVector.allocateNew(3); - ageVector.set(0, 10); - ageVector.set(1, 20); - ageVector.set(2, 30); - nameVector.allocateNew(3); - nameVector.set(0, "Dave".getBytes(StandardCharsets.UTF_8)); - nameVector.set(1, "Peter".getBytes(StandardCharsets.UTF_8)); - nameVector.set(2, "Mary".getBytes(StandardCharsets.UTF_8)); - root.setRowCount(3); - System.out.println("VectorSchemaRoot created: \n" + root.contentToTSVString()); - } - -.. code-block:: shell - - VectorSchemaRoot created: - age name - 10 Dave - 20 Peter - 30 Mary - - -Interprocess Communication (IPC) -******************************** - -Arrow data can be written to and read from disk, and both of these can be done in -a streaming and/or random-access fashion depending on application requirements. 
- -**Write data to an arrow file** - -Example: Write the dataset from the previous example to an Arrow IPC file (random-access). - -.. code-block:: Java - - import org.apache.arrow.memory.BufferAllocator; - import org.apache.arrow.memory.RootAllocator; - import org.apache.arrow.vector.IntVector; - import org.apache.arrow.vector.VarCharVector; - import org.apache.arrow.vector.VectorSchemaRoot; - import org.apache.arrow.vector.ipc.ArrowFileWriter; - import org.apache.arrow.vector.types.pojo.ArrowType; - import org.apache.arrow.vector.types.pojo.Field; - import org.apache.arrow.vector.types.pojo.FieldType; - import org.apache.arrow.vector.types.pojo.Schema; - import java.io.File; - import java.io.FileOutputStream; - import java.io.IOException; - import java.nio.charset.StandardCharsets; - import java.util.HashMap; - import java.util.Map; - import static java.util.Arrays.asList; - - Field age = new Field("age", - FieldType.nullable(new ArrowType.Int(32, true)), - /*children*/ null); - Field name = new Field("name", - FieldType.nullable(new ArrowType.Utf8()), - /*children*/ null); - Schema schema = new Schema(asList(age, name)); - try( - BufferAllocator allocator = new RootAllocator(); - VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator); - IntVector ageVector = (IntVector) root.getVector("age"); - VarCharVector nameVector = (VarCharVector) root.getVector("name"); - ){ - ageVector.allocateNew(3); - ageVector.set(0, 10); - ageVector.set(1, 20); - ageVector.set(2, 30); - nameVector.allocateNew(3); - nameVector.set(0, "Dave".getBytes(StandardCharsets.UTF_8)); - nameVector.set(1, "Peter".getBytes(StandardCharsets.UTF_8)); - nameVector.set(2, "Mary".getBytes(StandardCharsets.UTF_8)); - root.setRowCount(3); - File file = new File("random_access_file.arrow"); - try ( - FileOutputStream fileOutputStream = new FileOutputStream(file); - ArrowFileWriter writer = new ArrowFileWriter(root, /*provider*/ null, fileOutputStream.getChannel()); - ) { - writer.start(); - 
writer.writeBatch(); - writer.end(); - System.out.println("Record batches written: " + writer.getRecordBlocks().size() - + ". Number of rows written: " + root.getRowCount()); - } catch (IOException e) { - e.printStackTrace(); - } - } - -.. code-block:: shell - - Record batches written: 1. Number of rows written: 3 - -**Read data from an arrow file** - -Example: Read the dataset from the previous example from an Arrow IPC file (random-access). - -.. code-block:: Java - - import org.apache.arrow.memory.RootAllocator; - import org.apache.arrow.vector.ipc.ArrowFileReader; - import org.apache.arrow.vector.ipc.message.ArrowBlock; - import org.apache.arrow.vector.VectorSchemaRoot; - import java.io.File; - import java.io.FileInputStream; - import java.io.FileOutputStream; - import java.io.IOException; - - try( - BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); - FileInputStream fileInputStream = new FileInputStream(new File("random_access_file.arrow")); - ArrowFileReader reader = new ArrowFileReader(fileInputStream.getChannel(), allocator); - ){ - System.out.println("Record batches in file: " + reader.getRecordBlocks().size()); - for (ArrowBlock arrowBlock : reader.getRecordBlocks()) { - reader.loadRecordBatch(arrowBlock); - VectorSchemaRoot root = reader.getVectorSchemaRoot(); - System.out.println("VectorSchemaRoot read: \n" + root.contentToTSVString()); - } - } catch (IOException e) { - e.printStackTrace(); - } - -.. code-block:: shell - - Record batches in file: 1 - VectorSchemaRoot read: - age name - 10 Dave - 20 Peter - 30 Mary - -More examples available at `Arrow Java Cookbook`_. - -.. _`Arrow Java Cookbook`: https://arrow.apache.org/cookbook/java diff --git a/docs/source/java/reference/index.rst b/docs/source/java/reference/index.rst deleted file mode 100644 index 523ac0c7f74..00000000000 --- a/docs/source/java/reference/index.rst +++ /dev/null @@ -1,21 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. 
or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -Java Reference (javadoc) -======================== - -Stub page for the Java reference docs; actual source is located in the java/ directory. diff --git a/docs/source/java/substrait.rst b/docs/source/java/substrait.rst deleted file mode 100644 index 50485526130..00000000000 --- a/docs/source/java/substrait.rst +++ /dev/null @@ -1,203 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. 
- -========= -Substrait -========= - -The ``arrow-dataset`` module can execute Substrait_ plans via the :doc:`Acero <../cpp/acero>` -query engine. - -.. contents:: - -Executing Queries Using Substrait Plans -======================================= - -Plans can reference data in files via URIs, or "named tables" that must be provided along with the plan. - -Here is an example of a Java program that queries a Parquet file using Java Substrait -(this example use `Substrait Java`_ project to compile a SQL query to a Substrait plan): - -.. code-block:: Java - - import com.google.common.collect.ImmutableList; - import io.substrait.isthmus.SqlToSubstrait; - import io.substrait.proto.Plan; - import org.apache.arrow.dataset.file.FileFormat; - import org.apache.arrow.dataset.file.FileSystemDatasetFactory; - import org.apache.arrow.dataset.jni.NativeMemoryPool; - import org.apache.arrow.dataset.scanner.ScanOptions; - import org.apache.arrow.dataset.scanner.Scanner; - import org.apache.arrow.dataset.source.Dataset; - import org.apache.arrow.dataset.source.DatasetFactory; - import org.apache.arrow.dataset.substrait.AceroSubstraitConsumer; - import org.apache.arrow.memory.BufferAllocator; - import org.apache.arrow.memory.RootAllocator; - import org.apache.arrow.vector.ipc.ArrowReader; - import org.apache.calcite.sql.parser.SqlParseException; - - import java.nio.ByteBuffer; - import java.util.HashMap; - import java.util.Map; - - public class ClientSubstrait { - public static void main(String[] args) { - String uri = "file:///data/tpch_parquet/nation.parquet"; - ScanOptions options = new ScanOptions(/*batchSize*/ 32768); - try ( - BufferAllocator allocator = new RootAllocator(); - DatasetFactory datasetFactory = new FileSystemDatasetFactory(allocator, NativeMemoryPool.getDefault(), - FileFormat.PARQUET, uri); - Dataset dataset = datasetFactory.finish(); - Scanner scanner = dataset.newScan(options); - ArrowReader reader = scanner.scanBatches() - ) { - // map table to reader - Map 
mapTableToArrowReader = new HashMap<>(); - mapTableToArrowReader.put("NATION", reader); - // get binary plan - Plan plan = getPlan(); - ByteBuffer substraitPlan = ByteBuffer.allocateDirect(plan.toByteArray().length); - substraitPlan.put(plan.toByteArray()); - // run query - try (ArrowReader arrowReader = new AceroSubstraitConsumer(allocator).runQuery( - substraitPlan, - mapTableToArrowReader - )) { - while (arrowReader.loadNextBatch()) { - System.out.println(arrowReader.getVectorSchemaRoot().contentToTSVString()); - } - } - } catch (Exception e) { - e.printStackTrace(); - } - } - - static Plan getPlan() throws SqlParseException { - String sql = "SELECT * from nation"; - String nation = "CREATE TABLE NATION (N_NATIONKEY BIGINT NOT NULL, N_NAME CHAR(25), " + - "N_REGIONKEY BIGINT NOT NULL, N_COMMENT VARCHAR(152))"; - SqlToSubstrait sqlToSubstrait = new SqlToSubstrait(); - Plan plan = sqlToSubstrait.execute(sql, ImmutableList.of(nation)); - return plan; - } - } - -.. code-block:: text - - // Results example: - FieldPath(0) FieldPath(1) FieldPath(2) FieldPath(3) - 0 ALGERIA 0 haggle. carefully final deposits detect slyly agai - 1 ARGENTINA 1 al foxes promise slyly according to the regular accounts. bold requests alon - -Executing Projections and Filters Using Extended Expressions -============================================================ - -Dataset also supports projections and filters with Substrait's `Extended Expression`_. -This requires the substrait-java library. - -This Java program: - -- Loads a Parquet file containing the "nation" table from the TPC-H benchmark. -- Applies a filter: - - ``N_NATIONKEY > 18`` -- Projects two new columns: - - ``N_REGIONKEY + 10`` - - ``N_NAME || ' - ' || N_COMMENT`` - - - -.. 
code-block:: Java - - import com.google.common.collect.ImmutableList; - import io.substrait.isthmus.SqlExpressionToSubstrait; - import io.substrait.proto.ExtendedExpression; - import org.apache.arrow.dataset.file.FileFormat; - import org.apache.arrow.dataset.file.FileSystemDatasetFactory; - import org.apache.arrow.dataset.jni.NativeMemoryPool; - import org.apache.arrow.dataset.scanner.ScanOptions; - import org.apache.arrow.dataset.scanner.Scanner; - import org.apache.arrow.dataset.source.Dataset; - import org.apache.arrow.dataset.source.DatasetFactory; - import org.apache.arrow.memory.BufferAllocator; - import org.apache.arrow.memory.RootAllocator; - import org.apache.arrow.vector.ipc.ArrowReader; - import org.apache.calcite.sql.parser.SqlParseException; - - import java.nio.ByteBuffer; - import java.util.Base64; - import java.util.Optional; - - public class ClientSubstraitExtendedExpressionsCookbook { - - public static void main(String[] args) throws SqlParseException { - projectAndFilterDataset(); - } - - private static void projectAndFilterDataset() throws SqlParseException { - String uri = "file:///Users/data/tpch_parquet/nation.parquet"; - ScanOptions options = - new ScanOptions.Builder(/*batchSize*/ 32768) - .columns(Optional.empty()) - .substraitFilter(getByteBuffer(new String[]{"N_NATIONKEY > 18"})) - .substraitProjection(getByteBuffer(new String[]{"N_REGIONKEY + 10", - "N_NAME || CAST(' - ' as VARCHAR) || N_COMMENT"})) - .build(); - try (BufferAllocator allocator = new RootAllocator(); - DatasetFactory datasetFactory = - new FileSystemDatasetFactory( - allocator, NativeMemoryPool.getDefault(), FileFormat.PARQUET, uri); - Dataset dataset = datasetFactory.finish(); - Scanner scanner = dataset.newScan(options); - ArrowReader reader = scanner.scanBatches()) { - while (reader.loadNextBatch()) { - System.out.println(reader.getVectorSchemaRoot().contentToTSVString()); - } - } catch (Exception e) { - throw new RuntimeException(e); - } - } - - private static 
ByteBuffer getByteBuffer(String[] sqlExpression) throws SqlParseException { - String schema = - "CREATE TABLE NATION (N_NATIONKEY INT NOT NULL, N_NAME VARCHAR, " - + "N_REGIONKEY INT NOT NULL, N_COMMENT VARCHAR)"; - SqlExpressionToSubstrait expressionToSubstrait = new SqlExpressionToSubstrait(); - ExtendedExpression expression = - expressionToSubstrait.convert(sqlExpression, ImmutableList.of(schema)); - byte[] expressionToByte = - Base64.getDecoder().decode(Base64.getEncoder().encodeToString(expression.toByteArray())); - ByteBuffer byteBuffer = ByteBuffer.allocateDirect(expressionToByte.length); - byteBuffer.put(expressionToByte); - return byteBuffer; - } - } - -.. code-block:: text - - column-1 column-2 - 13 ROMANIA - ular asymptotes are about the furious multipliers. express dependencies nag above the ironically ironic account - 14 SAUDI ARABIA - ts. silent requests haggle. closely express packages sleep across the blithely - 12 VIETNAM - hely enticingly express accounts. even, final - 13 RUSSIA - requests against the platelets use never according to the quickly regular pint - 13 UNITED KINGDOM - eans boost carefully special requests. accounts are. carefull - 11 UNITED STATES - y final packages. slow foxes cajole quickly. quickly silent platelets breach ironic accounts. unusual pinto be - -.. _`Substrait`: https://substrait.io/ -.. _`Substrait Java`: https://github.com/substrait-io/substrait-java -.. _`Acero`: https://arrow.apache.org/docs/cpp/acero.html -.. _`Extended Expression`: https://github.com/substrait-io/substrait/blob/main/site/docs/expressions/extended_expression.md diff --git a/docs/source/java/table.rst b/docs/source/java/table.rst deleted file mode 100644 index 5aa95e153ce..00000000000 --- a/docs/source/java/table.rst +++ /dev/null @@ -1,378 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. 
distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -===== -Table -===== - -**NOTE**: The Table API is experimental and subject to change. See the list of limitations below. - -`Table`_ is an immutable tabular data structure based on `FieldVector`_. Like `VectorSchemaRoot`_, ``Table`` is a columnar data structure backed by Arrow arrays, or more specifically, by ``FieldVector`` objects. It differs from ``VectorSchemaRoot`` mainly in that it is fully immutable and lacks support for batch operations. Anyone processing batches of tabular data in a pipeline should continue to use ``VectorSchemaRoot``. Finally, the ``Table`` API is mainly row-oriented, so in some ways it's more like the JDBC API than the ``VectorSchemaRoot`` API, but you can still use ``FieldReaders`` to work with data in a columnar fashion. - -Mutation in Table and VectorSchemaRoot -====================================== - -``VectorSchemaRoot`` provides a thin wrapper on the vectors that hold its data. Individual vectors can be retrieved from a vector schema root. These vectors have *setters* for modifying their elements, making ``VectorSchemaRoot`` immutable only by convention. The protocol for mutating a vector is documented in the `ValueVector`_ interface: - -- values need to be written in order (e.g. 
index 0, 1, 2, 5) -- null vectors start with all values as null before writing anything -- for variable width types, the offset vector should be all zeros before writing -- you must call setValueCount before a vector can be read -- you should never write to a vector once it has been read. - -The rules aren't enforced by the API so the programmer is responsible for ensuring that they are followed. Failure to do so could lead to runtime exceptions. - -``Table``, on the other hand, is immutable. The underlying vectors are not exposed. When a table is created from existing vectors, their memory is transferred to new vectors, so subsequent changes to the original vectors can't impact the new table's values. - -Features and limitations -====================================== - -A basic set of table functionality is currently available: - -- Create a table from vectors or ``VectorSchemaRoot`` -- Iterate tables by row, or set the current row index directly -- Access vector values as primitives, objects, and/or nullable `ValueHolder`_ instances (depending on type) -- Get a ``FieldReader`` for any vector -- Add and remove vectors, creating new tables -- Encode and decode a table's vectors using dictionary encoding -- Export table data for use by native code -- Print representative data to TSV strings -- Get a table's schema -- Slice tables -- Convert table to ``VectorSchemaRoot`` - -Limitations in the 11.0.0 release: - -- No support ``ChunkedArray`` or any form of row-group. Support for chunked arrays or row groups will be considered for a future release. -- No support for the C-Stream API. Support for the streaming API is contingent on chunked array support -- No support for creating tables directly from Java POJOs. All data held by a table must be imported via a ``VectorSchemaRoot``, or from collections or arrays of vectors. 
- -The Table API -============= - -Like ``VectorSchemaRoot``, a table contains a `Schema`_ and an ordered collection of ``FieldVector`` objects, but it is designed to be accessed via a row-oriented interface. - -Creating a Table from a VectorSchemaRoot -**************************************** - -Tables are created from a ``VectorSchemaRoot`` as shown below. The memory buffers holding the data are transferred from the vector schema root to new vectors in the new table, clearing the source vectors in the process. This ensures that the data in your new table is never changed. Since the buffers are transferred rather than copied, this is a very low overhead operation. - -.. code-block:: Java - - Table t = new Table(someVectorSchemaRoot); - -If you now update the vectors held by the ``VectorSchemaRoot`` (using some version of ``ValueVector#setSafe()``), it would reflect those changes, but the values in table *t* are unchanged. - -Creating a Table from FieldVectors -********************************** - -Tables can be created from ``FieldVectors`` as shown below, using 'var-arg' array arguments: - -.. code-block:: Java - - IntVector myVector = createMyIntVector(); - VectorSchemaRoot vsr1 = new VectorSchemaRoot(myVector); - -or by passing a collection: - -.. code-block:: Java - - IntVector myVector = createMyIntVector(); - List fvList = List.of(myVector); - VectorSchemaRoot vsr1 = new VectorSchemaRoot(fvList); - -It is rarely a good idea to share vectors between multiple vector schema roots, and it would not be a good idea to share them between vector schema roots and tables. Creating a ``VectorSchemaRoot`` from a list of vectors does not cause the reference counts for the vectors to be incremented. Unless you manage the counts manually, the code below would lead to more references than reference counts, and that could lead to trouble. There is an implicit assumption that the vectors were created for use by *one* ``VectorSchemaRoot`` that this code violates. 
- -*Don't do this:* - -.. code-block:: Java - - IntVector myVector = createMyIntVector(); // Reference count for myVector = 1 - VectorSchemaRoot vsr1 = new VectorSchemaRoot(myVector); // Still one reference - VectorSchemaRoot vsr2 = new VectorSchemaRoot(myVector); - // Ref count is still one, but there are two VSRs with a reference to myVector - vsr2.clear(); // Reference count for myVector is 0. - -What is happening is that the reference counter works at a lower level than the ``VectorSchemaRoot`` interface. A reference counter counts references to `ArrowBuf`_ instances that control memory buffers. It doesn't count references to the vectors that hold those ArrowBufs. In the example above, each ``ArrowBuf`` is held by one vector, so there is only one reference. This distinction is blurred when you call the ``VectorSchemaRoot``'s clear() method, which frees the memory held by each of the vectors it references even though another instance references the same vectors. - -When you create tables from vectors, it's assumed that there are no external references to those vectors. To be certain, the buffers underlying these vectors are transferred to new vectors in the new table, and the original vectors are cleared. - -*Don't do this either, but note the difference from above:* - -.. code-block:: Java - - IntVector myVector = createMyIntVector(); // Reference count for myVector = 1 - Table t1 = new Table(myVector); - // myVector is cleared; Table t1 has a new hidden vector with the data from myVector - Table t2 = new Table(myVector); - // t2 has no rows because myVector was just cleared - // t1 continues to have the data from the original vector - t2.clear(); - // no change because t2 is already empty and t1 is independent - -With tables, memory is explicitly transferred on instantiation so the buffers held by a table are held by *only* that table. 
- -Creating Tables with dictionary-encoded vectors -*********************************************** - -Another point of difference is that ``VectorSchemaRoot`` is uninformed about any dictionary-encoding of its vectors, while tables hold an optional `DictionaryProvider`_ instance. If any vectors in the source data are encoded, a DictionaryProvider must be set to un-encode the values. - -.. code-block:: Java - - VectorSchemaRoot vsr = myVsr(); - DictionaryProvider provider = myProvider(); - Table t = new Table(vsr, provider); - -In ``Table``, dictionaries are used like they are with vectors. To decode a vector, the user provides the name of the vector to decode and the dictionary id: - -.. code-block:: Java - - Table t = new Table(vsr, provider); - ValueVector decodedName = t.decode("name", 1L); - -To encode a vector from a table, a similar approach is used: - -.. code-block:: Java - - Table t = new Table(vsr, provider); - ValueVector encodedName = t.encode("name", 1L); - -Freeing memory explicitly -************************* - -Tables use off-heap memory that must be freed when it is no longer needed. ``Table`` implements ``AutoCloseable`` so the best way to create one is in a try-with-resources block: - -.. code-block:: Java - - try (VectorSchemaRoot vsr = myMethodForGettingVsrs(); - Table t = new Table(vsr)) { - // do useful things. - } - -If you don't use a try-with-resources block, you must close the table manually: - -.. code-block:: Java - - try { - VectorSchemaRoot vsr = myMethodForGettingVsrs(); - Table t = new Table(vsr); - // do useful things. - } finally { - vsr.close(); - t.close(); - } - -Manual closing should be performed in a finally block. - -Getting the schema -****************** - -You get the table's schema just as you would with a vector schema root: - -.. 
code-block:: Java - - Schema s = table.getSchema(); - -Adding and removing vectors -*************************** - -``Table`` provides facilities for adding and removing vectors modeled on the same functionality in ``VectorSchemaRoot``. These operations return new instances rather than modifying the original instance in-place. - -.. code-block:: Java - - try (Table t = new Table(vectorList)) { - IntVector v3 = new IntVector("3", intFieldType, allocator); - Table t2 = t.addVector(2, v3); - Table t3 = t2.removeVector(1); - // don't forget to close t2 and t3 - } - -Slicing tables -************** - -``Table`` supports *slice()* operations, where a slice of a source table is a second Table that refers to a single, contiguous range of rows in the source. - -.. code-block:: Java - - try (Table t = new Table(vectorList)) { - Table t2 = t.slice(100, 200); // creates a slice referencing the values in range (100, 200] - ... - } - -This raises the question: If you create a slice with *all* the values in the source table (as shown below), how would that differ from a new Table constructed with the same vectors as the source? - -.. code-block:: Java - - try (Table t = new Table(vectorList)) { - Table t2 = t.slice(0, t.getRowCount()); // creates a slice referencing all the values in t - // ... - } - -The difference is that when you *construct* a new table, the buffers are transferred from the source vectors to new vectors in the destination. With a slice, both tables share the same underlying vectors. That's OK, though, since both tables are immutable. - -Using FieldReaders -****************** - -You can get a `FieldReader`_ for any vector in the Table passing either the `Field`_, vector index, or vector name as an argument. The signatures are the same as in ``VectorSchemaRoot``. - -.. code-block:: Java - - FieldReader nameReader = table.getReader("user_name"); - -Row operations -************** - -Row-based access is supported by the `Row`_ object. 
``Row`` provides *get()* methods by both vector name and vector position, but no *set()* operations. - -It is important to recognize that rows are NOT reified as objects, but rather operate like a cursor where the data from numerous logical rows in the table can be viewed (one at a time) using the same ``Row`` instance. See "Moving from row-to-row" below for information about navigating through the table. - -Getting a row -************* - -Calling ``immutableRow()`` on any table instance returns a new ``Row`` instance. - -.. code-block:: Java - - Row r = table.immutableRow(); - -Moving from row-to-row -********************** - -Since rows are iterable, you can traverse a table using a standard while loop: - -.. code-block:: Java - - Row r = table.immutableRow(); - while (r.hasNext()) { - r.next(); - // do something useful here - } - -``Table`` implements ``Iterable`` so you can access rows directly from a table in an enhanced *for* loop: - -.. code-block:: Java - - for (Row row: table) { - int age = row.getInt("age"); - boolean nameIsNull = row.isNull("name"); - ... - } - -Finally, while rows are usually iterated in the order of the underlying data vectors, but they are also positionable using the ``Row#setPosition()`` method, so you can skip to a specific row. Row numbers are 0-based. - -.. code-block:: Java - - Row r = table.immutableRow(); - int age101 = r.setPosition(101); // change position directly to 101 - -Any changes to position are applied to all the columns in the table. - -Note that you must call ``next()``, or ``setPosition()`` before accessing values via a row. Failure to do so results in a runtime exception. - -Read operations using rows -************************** - -Methods are available for getting values by vector name and vector index, where index is the 0-based position of the vector in the table. For example, assuming 'age' is the 13th vector in 'table', the following two gets are equivalent: - -.. 
code-block:: Java - - Row r = table.immutableRow(); - r.next(); // position the row at the first value - int age1 = r.get("age"); // gets the value of vector named 'age' in the table at row 0 - int age2 = r.get(12); // gets the value of the 13th vector in the table at row 0 - -You can also get value using a nullable ``ValueHolder``. For example: - -.. code-block:: Java - - NullableIntHolder holder = new NullableIntHolder(); - int b = row.getInt("age", holder); - -This can be used to retrieve values without creating a new Object for each. - -In addition to getting values, you can check if a value is null using ``isNull()``. This is important if the vector contains any nulls, as asking for a value from a vector can cause NullPointerExceptions in some cases. - -.. code-block:: Java - - boolean name0isNull = row.isNull("name"); - -You can also get the current row number: - -.. code-block:: Java - - int row = row.getRowNumber(); - -Reading values as Objects -************************* - -For any given vector type, the basic *get()* method returns a primitive value wherever possible. For example, *getTimeStampMicro()* returns a long value that encodes the timestamp. To get the LocalDateTime object representing that timestamp in Java, another method with 'Obj' appended to the name is provided. For example: - -.. code-block:: Java - - long ts = row.getTimeStampMicro(); - LocalDateTime tsObject = row.getTimeStampMicroObj(); - -The exception to this naming scheme is for complex vector types (List, Map, Schema, Union, DenseUnion, and ExtensionType). These always return objects rather than primitives so no "Obj" extension is required. It is expected that some users may subclass ``Row`` to add getters that are more specific to their needs. - -Reading VarChars and LargeVarChars -********************************** - -Strings in arrow are represented as byte arrays encoded with the UTF-8 charset. You can get either a String result or the actual byte array. - -.. 
code-block:: Java - - byte[] b = row.getVarChar("first_name"); - String s = row.getVarCharObj("first_name"); // uses the default encoding (UTF-8) - -Converting a Table to a VectorSchemaRoot -**************************************** - -Tables can be converted to vector schema roots using the *toVectorSchemaRoot()* method. Buffers are transferred to the vector schema root and the source table is cleared. - -.. code-block:: Java - - VectorSchemaRoot root = myTable.toVectorSchemaRoot(); - -Working with the C-Data interface -********************************* - -The ability to work with native code is required for many Arrow features. This section describes how tables can be be exported for use with native code - -Exporting works by converting the data to a ``VectorSchemaRoot`` instance and using the existing facilities to transfer the data. You could do it yourself, but that isn't ideal because conversion to a vector schema root breaks the immutability guarantees. Using the ``exportTable()`` methods in the `Data`_ class avoids this concern. - -.. code-block:: Java - - Data.exportTable(bufferAllocator, table, dictionaryProvider, outArrowArray); - -If the table contains dictionary-encoded vectors and was constructed with a ``DictionaryProvider``, the provider argument to ``exportTable()`` can be omitted and the table's provider attribute will be used: - -.. code-block:: Java - - Data.exportTable(bufferAllocator, table, outArrowArray); - -.. _`ArrowBuf`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/memory/ArrowBuf.html -.. _`Data`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/c/Data.html -.. _`DictionaryProvider`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/dictionary/DictionaryProvider.html -.. _`Field`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/types/pojo/Field.html -.. 
_`FieldReader`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/complex/reader/FieldReader.html -.. _`FieldVector`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/FieldVector.html -.. _`Row`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/table/Row.html -.. _`Schema`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/types/pojo/Schema.html -.. _`Table`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/table/Table.html -.. _`ValueHolder`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/holders/ValueHolder.html -.. _`ValueVector`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/ValueVector.html -.. _`VectorSchemaRoot`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/VectorSchemaRoot.html diff --git a/docs/source/java/vector.rst b/docs/source/java/vector.rst deleted file mode 100644 index 1c3e123cf50..00000000000 --- a/docs/source/java/vector.rst +++ /dev/null @@ -1,366 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. 
- -=========== -ValueVector -=========== - -:class:`ValueVector` interface (which called Array in C++ implementation and -the :doc:`the specification <../format/Columnar>`) is an abstraction that is used to store a -sequence of values having the same type in an individual column. Internally, those values are -represented by one or several buffers, the number and meaning of which depend on the vector’s data type. - -There are concrete subclasses of :class:`ValueVector` for each primitive data type -and nested type described in the specification. There are a few differences in naming -with the type names described in the specification: -Table with non-intuitive names (BigInt = 64 bit integer, etc). - -It is important that vector is allocated before attempting to read or write, -:class:`ValueVector` "should" strive to guarantee this order of operation: -create > allocate > mutate > set value count > access > clear (or allocate to start the process over). -We will go through a concrete example to demonstrate each operation in the next section. - -Vector Life Cycle -================= - -As discussed above, each vector goes through several steps in its life cycle, -and each step is triggered by a vector operation. In particular, we have the following vector operations: - -1. **Vector creation**: we create a new vector object by, for example, the vector constructor. -The following code creates a new ``IntVector`` by the constructor: - -.. code-block:: Java - - RootAllocator allocator = new RootAllocator(Long.MAX_VALUE); - ... - IntVector vector = new IntVector("int vector", allocator); - -By now, a vector object is created. However, no underlying memory has been allocated, so we need the -following step. - -2. **Vector allocation**: in this step, we allocate memory for the vector. 
For most vectors, we -have two options: 1) if we know the maximum vector capacity, we can specify it by calling the -``allocateNew(int)`` method; 2) otherwise, we should call the ``allocateNew()`` method, and a default -capacity will be allocated for it. For our running example, we assume that the vector capacity never -exceeds 10: - -.. code-block:: Java - - vector.allocateNew(10); - -3. **Vector mutation**: now we can populate the vector with values we desire. For all vectors, we can populate -vector values through vector writers (An example will be given in the next section). For primitive types, -we can also mutate the vector by the set methods. There are two classes of set methods: 1) if we can -be sure the vector has enough capacity, we can call the ``set(index, value)`` method. 2) if we are not sure -about the vector capacity, we should call the ``setSafe(index, value)`` method, which will automatically -take care of vector reallocation, if the capacity is not sufficient. For our running example, we know the -vector has enough capacity, so we can call - -.. code-block:: Java - - vector.set(/*index*/5, /*value*/25); - -4. **Set value count**: for this step, we set the value count of the vector by calling the -``setValueCount(int)`` method: - -.. code-block:: Java - - vector.setValueCount(10); - -After this step, the vector enters an immutable state. In other words, we should no longer mutate it. -(Unless we reuse the vector by allocating it again. This will be discussed shortly.) - -5. **Vector access**: it is time to access vector values. Similarly, we have two options to access values: -1) get methods and 2) vector reader. Vector reader works for all types of vectors, while get methods are -only available for primitive vectors. A concrete example for vector reader will be given in the next section. -Below is an example of vector access by get method: - -.. code-block:: Java - - int value = vector.get(5); // value == 25 - -6. 
**Vector clear**: when we are done with the vector, we should clear it to release its memory. This is done by -calling the ``close()`` method: - -.. code-block:: Java - - vector.close(); - -Some points to note about the steps above: - -* The steps are not necessarily performed in a linear sequence. Instead, they can be in a loop. For example, - when a vector enters the access step, we can also go back to the vector mutation step, and then set value - count, access vector, and so on. - -* We should try to make sure the above steps are carried out in order. Otherwise, the vector - may be in an undefined state, and some unexpected behavior may occur. However, this restriction - is not strict. That means it is possible that we violates the order above, but still get - correct results. - -* When mutating vector values through set methods, we should prefer ``set(index, value)`` methods to - ``setSafe(index, value)`` methods whenever possible, to avoid unnecessary performance overhead of handling - vector capacity. - -* All vectors implement the ``AutoCloseable`` interface. So they must be closed explicitly when they are - no longer used, to avoid resource leak. To make sure of this, it is recommended to place vector related operations - into a try-with-resources block. - -* For fixed width vectors (e.g. IntVector), we can set values at different indices in arbitrary orders. - For variable width vectors (e.g. VarCharVector), however, we must set values in non-decreasing order of the - indices. Otherwise, the values after the set position will become invalid. For example, suppose we use the - following statements to populate a variable width vector: - -.. code-block:: Java - - VarCharVector vector = new VarCharVector("vector", allocator); - vector.allocateNew(); - vector.setSafe(0, "zero"); - vector.setSafe(1, "one"); - ... - vector.setSafe(9, "nine"); - -Then we set the value at position 5 again: - -.. 
code-block:: Java - - vector.setSafe(5, "5"); - -After that, the values at positions 6, 7, 8, and 9 of the vector will become invalid. - -Building ValueVector -==================== - -Note that the current implementation doesn't enforce the rule that Arrow objects are immutable. -:class:`ValueVector` instances could be created directly by using new keyword, there are -set/setSafe APIs and concrete subclasses of FieldWriter for populating values. - -For example, the code below shows how to build a :class:`BigIntVector`, in this case, we build a -vector of the range 0 to 7 where the element that should hold the fourth value is nulled - -.. code-block:: Java - - try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); - BigIntVector vector = new BigIntVector("vector", allocator)) { - vector.allocateNew(8); - vector.set(0, 1); - vector.set(1, 2); - vector.set(2, 3); - vector.setNull(3); - vector.set(4, 5); - vector.set(5, 6); - vector.set(6, 7); - vector.set(7, 8); - vector.setValueCount(8); // this will finalizes the vector by convention. - ... - } - -The :class:`BigIntVector` holds two ArrowBufs. The first buffer holds the null bitmap, which consists -here of a single byte with the bits 1|1|1|1|0|1|1|1 (the bit is 1 if the value is non-null). -The second buffer contains all the above values. As the fourth entry is null, the value at that position -in the buffer is undefined. Note compared with set API, setSafe API would check value capacity before setting -values and reallocate buffers if necessary. - -Here is how to build a vector using writer - -.. code-block:: Java - - try (BigIntVector vector = new BigIntVector("vector", allocator); - BigIntWriter writer = new BigIntWriterImpl(vector)) { - writer.setPosition(0); - writer.writeBigInt(1); - writer.setPosition(1); - writer.writeBigInt(2); - writer.setPosition(2); - writer.writeBigInt(3); - // writer.setPosition(3) is not called which means the fourth value is null. 
- writer.setPosition(4); - writer.writeBigInt(5); - writer.setPosition(5); - writer.writeBigInt(6); - writer.setPosition(6); - writer.writeBigInt(7); - writer.setPosition(7); - writer.writeBigInt(8); - } - -There are get API and concrete subclasses of :class:`FieldReader` for accessing vector values, what needs -to be declared is that writer/reader is not as efficient as direct access - -.. code-block:: Java - - // access via get API - for (int i = 0; i < vector.getValueCount(); i++) { - if (!vector.isNull(i)) { - System.out.println(vector.get(i)); - } - } - - // access via reader - BigIntReader reader = vector.getReader(); - for (int i = 0; i < vector.getValueCount(); i++) { - reader.setPosition(i); - if (reader.isSet()) { - System.out.println(reader.readLong()); - } - } - -Building ListVector -=================== - -A :class:`ListVector` is a vector that holds a list of values for each index. Working with one you need to handle the same steps as mentioned above (create > allocate > mutate > set value count > access > clear), but the details of how you accomplish this are slightly different since you need to both create the vector and set the list of values for each index. - -For example, the code below shows how to build a :class:`ListVector` of int's using the writer :class:`UnionListWriter`. We build a vector from 0 to 9 and each index contains a list with values [[0, 0, 0, 0, 0], [0, 1, 2, 3, 4], [0, 2, 4, 6, 8], …, [0, 9, 18, 27, 36]]. List values can be added in any order so writing a list such as [3, 1, 2] would be just as valid. - -.. 
code-block:: Java - - try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); - ListVector listVector = ListVector.empty("vector", allocator)) { - UnionListWriter writer = listVector.getWriter(); - for (int i = 0; i < 10; i++) { - writer.startList(); - writer.setPosition(i); - for (int j = 0; j < 5; j++) { - writer.writeInt(j * i); - } - writer.setValueCount(5); - writer.endList(); - } - listVector.setValueCount(10); - } - -:class:`ListVector` values can be accessed either through the get API or through the reader class :class:`UnionListReader`. To read all the values, first enumerate through the indexes, and then enumerate through the inner list values. - -.. code-block:: Java - - // access via get API - for (int i = 0; i < listVector.getValueCount(); i++) { - if (!listVector.isNull(i)) { - ArrayList elements = (ArrayList) listVector.getObject(i); - for (Integer element : elements) { - System.out.println(element); - } - } - } - - // access via reader - UnionListReader reader = listVector.getReader(); - for (int i = 0; i < listVector.getValueCount(); i++) { - reader.setPosition(i); - while (reader.next()) { - IntReader intReader = reader.reader(); - if (intReader.isSet()) { - System.out.println(intReader.readInteger()); - } - } - } - -Dictionary Encoding -=================== - -Dictionary encoding is a form of compression where values of one type are replaced by values of a smaller type: an array of ints replacing an array of strings is a common example. The mapping between the original values and the replacements is held in a 'dictionary'. Since the dictionary needs only one copy of each of the longer values, the combination of the dictionary and the array of smaller values may use less memory. The more repetitive the original data, the greater the savings. - -A ``FieldVector`` can be dictionary encoded for performance or improved memory efficiency. Nearly any type of vector might be encoded if there are many values, but few unique values. 
- -There are a few steps involved in the encoding process: - -1. Create a regular, un-encoded vector and populate it -2. Create a dictionary vector of the same type as the un-encoded vector. This vector must have the same values, but each unique value in the un-encoded vector need appear here only once. -3. Create a ``Dictionary``. It will contain the dictionary vector, plus a ``DictionaryEncoding`` object that holds the encoding's metadata and settings values. -4. Create a ``DictionaryEncoder``. -5. Call the encode() method on the ``DictionaryEncoder`` to produce an encoded version of the original vector. -6. (Optional) Call the decode() method on the encoded vector to re-create the original values. - -The encoded values will be integers. Depending on how many unique values you have, you can use ``TinyIntVector``, ``SmallIntVector``, ``IntVector``, or ``BigIntVector`` to hold them. You specify the type when you create your ``DictionaryEncoding`` instance. You might wonder where those integers come from: the dictionary vector is a regular vector, so the value's index position in that vector is used as its encoded value. - -Another critical attribute in ``DictionaryEncoding`` is the id. It's important to understand how the id is used, so we cover that later in this section. - -This result will be a new vector (for example, an ``IntVector``) that can act in place of the original vector (for example, a ``VarCharVector``). When you write the data in arrow format, it is both the new ``IntVector`` plus the dictionary that is written: you will need the dictionary later to retrieve the original values. - -.. code-block:: Java - - // 1. create a vector for the un-encoded data and populate it - VarCharVector unencoded = new VarCharVector("unencoded", allocator); - // now put some data in it before continuing - - // 2. create a vector to hold the dictionary and populate it - VarCharVector dictionaryVector = new VarCharVector("dictionary", allocator); - - // 3. 
create a dictionary object - Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null)); - - // 4. create a dictionary encoder - DictionaryEncoder encoder = new DictionaryEncoder.encode(dictionary, allocator); - - // 5. encode the data - IntVector encoded = (IntVector) encoder.encode(unencoded); - - // 6. re-create an un-encoded version from the encoded vector - VarCharVector decoded = (VarCharVector) encoder.decode(encoded); - -One thing we haven't discussed is how to create the dictionary vector from the original un-encoded values. That is left to the library user since a custom method will likely be more efficient than a general utility. Since the dictionary vector is just a normal vector, you can populate its values with the standard APIs. - -Finally, you can package a number of dictionaries together, which is useful if you're working with a ``VectorSchemaRoot`` with several dictionary-encoded vectors. This is done using an object called a ``DictionaryProvider``. as shown in the example below. Note that we don't put the dictionary vectors in the same ``VectorSchemaRoot`` as the data vectors, as they will generally have fewer values. - - -.. code-block:: Java - - DictionaryProvider.MapDictionaryProvider provider = - new DictionaryProvider.MapDictionaryProvider(); - - provider.put(dictionary); - -The ``DictionaryProvider`` is simply a map of identifiers to ``Dictionary`` objects, where each identifier is a long value. In the above code you will see it as the first argument to the ``DictionaryEncoding`` constructor. - -This is where the ``DictionaryEncoding``'s 'id' attribute comes in. This value is used to connect dictionaries to instances of ``VectorSchemaRoot``, using a ``DictionaryProvider``. Here's how that works: - -* The ``VectorSchemaRoot`` has a ``Schema`` object containing a list of ``Field`` objects. 
-* The field has an attribute called 'dictionary', but it holds a ``DictionaryEncoding`` rather than a ``Dictionary`` -* As mentioned, the ``DictionaryProvider`` holds dictionaries indexed by a long value. This value is the id from your ``DictionaryEncoding``. -* To retrieve the dictionary for a vector in a ``VectorSchemaRoot``, you get the field associated with the vector, get its dictionary attribute, and use that object's id to look up the correct dictionary in the provider. - -.. code-block:: Java - - // create the encoded vector, the Dictionary and DictionaryProvider as discussed above - - // Create a VectorSchemaRoot with one encoded vector - VectorSchemaRoot vsr = new VectorSchemaRoot(List.of(encoded)); - - // now we want to decode our vector, so we retrieve its dictionary from the provider - Field f = vsr.getField(encoded.getName()); - DictionaryEncoding encoding = f.getDictionary(); - Dictionary dictionary = provider.lookup(encoding.getId()); - -As you can see, a ``DictionaryProvider`` is handy for managing the dictionaries associated with a ``VectorSchemaRoot``. More importantly, it helps package the dictionaries for a ``VectorSchemaRoot`` when it's written. The classes ``ArrowFileWriter`` and ``ArrowStreamWriter`` both accept an optional ``DictionaryProvider`` argument for that purpose. You can find example code for writing dictionaries in the documentation for (:doc:`ipc`). ``ArrowReader`` and its subclasses also implement the ``DictionaryProvider`` interface, so you can retrieve the actual dictionaries when reading a file. - -Slicing -======= - -Similar with C++ implementation, it is possible to make zero-copy slices of vectors to obtain a vector -referring to some logical sub-sequence of the data through :class:`TransferPair` - -.. 
code-block:: Java - - IntVector vector = new IntVector("intVector", allocator); - for (int i = 0; i < 10; i++) { - vector.setSafe(i, i); - } - vector.setValueCount(10); - - TransferPair tp = vector.getTransferPair(allocator); - tp.splitAndTransfer(0, 5); - IntVector sliced = (IntVector) tp.getTo(); - // In this case, the vector values are [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] and the sliceVector values are [0, 1, 2, 3, 4]. diff --git a/docs/source/java/vector_schema_root.rst b/docs/source/java/vector_schema_root.rst deleted file mode 100644 index 3615fe9c726..00000000000 --- a/docs/source/java/vector_schema_root.rst +++ /dev/null @@ -1,163 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -============ -Tabular Data -============ - -While arrays (aka: :doc:`ValueVector <./vector>`) represent a one-dimensional sequence of -homogeneous values, data often comes in the form of two-dimensional sets of -heterogeneous data (such as database tables, CSV files...). Arrow provides -several abstractions to handle such data conveniently and efficiently. - -Fields -====== - -Fields are used to denote the particular columns of tabular data. -A field, i.e. 
an instance of `Field`_, holds together a field name, a data -type, and some optional key-value metadata. - -.. code-block:: Java - - // Create a column "document" of string type with metadata - import org.apache.arrow.vector.types.pojo.ArrowType; - import org.apache.arrow.vector.types.pojo.Field; - import org.apache.arrow.vector.types.pojo.FieldType; - - Map metadata = new HashMap<>(); - metadata.put("A", "Id card"); - metadata.put("B", "Passport"); - metadata.put("C", "Visa"); - Field document = new Field("document", new FieldType(true, new ArrowType.Utf8(), /*dictionary*/ null, metadata), /*children*/ null); - -Schemas -======= - -A `Schema`_ describes the overall structure consisting of any number of columns. It holds a sequence of fields together -with some optional schema-wide metadata (in addition to per-field metadata). - -.. code-block:: Java - - // Create a schema describing datasets with two columns: - // a int32 column "A" and a utf8-encoded string column "B" - import org.apache.arrow.vector.types.pojo.ArrowType; - import org.apache.arrow.vector.types.pojo.Field; - import org.apache.arrow.vector.types.pojo.FieldType; - import org.apache.arrow.vector.types.pojo.Schema; - import static java.util.Arrays.asList; - - Map metadata = new HashMap<>(); - metadata.put("K1", "V1"); - metadata.put("K2", "V2"); - Field a = new Field("A", FieldType.nullable(new ArrowType.Int(32, true)), null); - Field b = new Field("B", FieldType.nullable(new ArrowType.Utf8()), null); - Schema schema = new Schema(asList(a, b), metadata); - -VectorSchemaRoot -================ - -A `VectorSchemaRoot`_ is a container for batches of data. Batches flow through -VectorSchemaRoot as part of a pipeline. - -.. note:: - - VectorSchemaRoot is somewhat analogous to tables or record batches in the - other Arrow implementations in that they all are 2D datasets, but their - usage is different. 
- -The recommended usage is to create a single VectorSchemaRoot based on a known -schema and populate data over and over into that root in a stream of batches, -rather than creating a new instance each time (see `Flight`_ or -``ArrowFileWriter`` as examples). Thus at any one point, a VectorSchemaRoot may -have data or may have no data (say it was transferred downstream or not yet -populated). - -Here is an example of creating a VectorSchemaRoot: - -.. code-block:: Java - - BitVector bitVector = new BitVector("boolean", allocator); - VarCharVector varCharVector = new VarCharVector("varchar", allocator); - bitVector.allocateNew(); - varCharVector.allocateNew(); - for (int i = 0; i < 10; i++) { - bitVector.setSafe(i, i % 2 == 0 ? 0 : 1); - varCharVector.setSafe(i, ("test" + i).getBytes(StandardCharsets.UTF_8)); - } - bitVector.setValueCount(10); - varCharVector.setValueCount(10); - - List fields = Arrays.asList(bitVector.getField(), varCharVector.getField()); - List vectors = Arrays.asList(bitVector, varCharVector); - VectorSchemaRoot vectorSchemaRoot = new VectorSchemaRoot(fields, vectors); - -Data can be loaded into/unloaded from a VectorSchemaRoot via `VectorLoader`_ -and `VectorUnloader`_. They handle converting between VectorSchemaRoot and -`ArrowRecordBatch`_ (a representation of a RecordBatch :ref:`IPC ` -message). For example: - -.. code-block:: Java - - // create a VectorSchemaRoot root1 and convert its data into recordBatch - VectorSchemaRoot root1 = new VectorSchemaRoot(fields, vectors); - VectorUnloader unloader = new VectorUnloader(root1); - ArrowRecordBatch recordBatch = unloader.getRecordBatch(); - - // create a VectorSchemaRoot root2 and load the recordBatch - VectorSchemaRoot root2 = VectorSchemaRoot.create(root1.getSchema(), allocator); - VectorLoader loader = new VectorLoader(root2); - loader.load(recordBatch); - -A new VectorSchemaRoot can be sliced from an existing root without copying -data: - -.. 
code-block:: Java - - // 0 indicates start index (inclusive) and 5 indicated length (exclusive). - VectorSchemaRoot newRoot = vectorSchemaRoot.slice(0, 5); - -Table -===== - -A `Table`_ is an immutable tabular data structure, very similar to VectorSchemaRoot, in that it is also built on ValueVectors and schemas. Unlike VectorSchemaRoot, Table is not designed for batch processing. Here is a version of the example above, showing how to create a Table, rather than a VectorSchemaRoot: - -.. code-block:: Java - - BitVector bitVector = new BitVector("boolean", allocator); - VarCharVector varCharVector = new VarCharVector("varchar", allocator); - bitVector.allocateNew(); - varCharVector.allocateNew(); - for (int i = 0; i < 10; i++) { - bitVector.setSafe(i, i % 2 == 0 ? 0 : 1); - varCharVector.setSafe(i, ("test" + i).getBytes(StandardCharsets.UTF_8)); - } - bitVector.setValueCount(10); - varCharVector.setValueCount(10); - - List vectors = Arrays.asList(bitVector, varCharVector); - Table table = new Table(vectors); - -See the :doc:`table` documentation for more information. - -.. _`ArrowRecordBatch`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/ipc/message/ArrowRecordBatch.html -.. _`Field`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/types/pojo/Field.html -.. _`Flight`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/flight/package-summary.html -.. _`Schema`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/types/pojo/Schema.html -.. _`Table`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/table/Table.html -.. _`VectorLoader`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/VectorLoader.html -.. _`VectorSchemaRoot`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/VectorSchemaRoot.html -.. 
_`VectorUnloader`: https://arrow.apache.org/docs/java/reference/org/apache/arrow/vector/VectorUnloader.html diff --git a/docs/source/js/index.rst b/docs/source/js/index.rst deleted file mode 100644 index 2ab205a08b8..00000000000 --- a/docs/source/js/index.rst +++ /dev/null @@ -1,23 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _js: - -JavaScript docs -=============== - -Stub page for the JavaScript docs; actual source is located in js/ sub-directory. From 8dd357d1381aaf40259602a8e9ac4d99ab70dcdf Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 31 Oct 2025 17:36:36 -0400 Subject: [PATCH 07/15] GH-47983: [CI][R] R nightly upload workflow failing for a few weeks (#47984) ### Rationale for this change PR #47727 refactored the R nightly upload workflow to handle r-pkg and r-lib files separately, but did not update the variable name used by `file.copy()`, causing the workflow to fail with an "object 'current_path' not found" error. ### What changes are included in this PR? Add `current_path <- c(current_pkg_path, current_lib_path)` to combine the two path vectors before the `file.copy()` call on line 145. ### Are these changes tested? No ### Are there any user-facing changes? 
No * GitHub Issue: #47983 Authored-by: Nic Crane Signed-off-by: Nic Crane --- .github/workflows/r_nightly.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/r_nightly.yml b/.github/workflows/r_nightly.yml index 4e12fce9545..4f4a111fd91 100644 --- a/.github/workflows/r_nightly.yml +++ b/.github/workflows/r_nightly.yml @@ -128,9 +128,10 @@ jobs: pattern = "r-lib", recursive = TRUE ) + current_path <- c(current_pkg_path, current_lib_path) files <- c( sub("r-pkg", repo_root, current_pkg_path), - sub("r-lib", paste0(repo_root, "__r-lib"), current_lib_path), + sub("r-lib", paste0(repo_root, "__r-lib"), current_lib_path) ) # decode contrib.url from artifact name: From d165954db5ff5de4ce29c7edebe97ce1e06c52b6 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Fri, 31 Oct 2025 18:13:42 -0400 Subject: [PATCH 08/15] GH-48010: [C++] Update bundled RE2 from 2022-06-01 to 2023-03-01 (#48011) ### Rationale for this change Arrow currently bundles RE2 version 2022-06-01, which fails to build on musl libc systems (Alpine Linux) due to missing `#include ` in RE2's `util/pcre.h`. This affects R package installations from source on Alpine (#46769), C++ builds on Alpine (#43350, #41619), and causes CRAN extended musl checks to fail. ### What changes are included in this PR? Updated `cpp/thirdparty/versions.txt` ### Are these changes tested? Yep - CI checks ### Are there any user-facing changes? No. * GitHub Issue: #48010 Authored-by: Nic Crane Signed-off-by: Nic Crane --- cpp/thirdparty/versions.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 7ba1f4f876b..fd596f9a24d 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -99,8 +99,11 @@ ARROW_PROTOBUF_BUILD_SHA256_CHECKSUM=2f723218f6cb709ae4cdc4fb5ed56a5951fc5d466f0 # warnings. 
ARROW_RAPIDJSON_BUILD_VERSION=232389d4f1012dddec4ef84861face2d2ba85709 ARROW_RAPIDJSON_BUILD_SHA256_CHECKSUM=b9290a9a6d444c8e049bd589ab804e0ccf2b05dc5984a19ed5ae75d090064806 -ARROW_RE2_BUILD_VERSION=2022-06-01 -ARROW_RE2_BUILD_SHA256_CHECKSUM=f89c61410a072e5cbcf8c27e3a778da7d6fd2f2b5b1445cd4f4508bee946ab0f +# RE2 2023-03-01 is pinned to avoid Abseil dependency. Versions after 2023-06-01 +# require Abseil, which would add significant build time and complexity, particularly +# for CRAN builds. This version includes musl libc support (GH-48010). +ARROW_RE2_BUILD_VERSION=2023-03-01 +ARROW_RE2_BUILD_SHA256_CHECKSUM=7a9a4824958586980926a300b4717202485c4b4115ac031822e29aa4ef207e48 ARROW_SNAPPY_BUILD_VERSION=1.2.2 ARROW_SNAPPY_BUILD_SHA256_CHECKSUM=90f74bc1fbf78a6c56b3c4a082a05103b3a56bb17bca1a27e052ea11723292dc ARROW_SUBSTRAIT_BUILD_VERSION=v0.44.0 From 44f82a402422237adc32551a26ea20ab45366282 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sat, 1 Nov 2025 23:38:22 -0700 Subject: [PATCH 09/15] GH-47923: [CI] Use macos-15-intel instead of macos-13 for macOS x86 runner (#47690) ### Rationale for this change `macos-13` GitHub runners are being retired: https://github.blog/changelog/2025-09-19-github-actions-macos-13-runner-image-is-closing-down/ This PR replaces `macos-13` with `macos-15-intel` based on recommendations [here](https://docs.github.com/en/actions/reference/runners/github-hosted-runners#standard-github-hosted-runners-for-public-repositories) ### What changes are included in this PR? Change all references of `macos-13` to `macos-15-intel` ### Are these changes tested? Yes, CI ### Are there any user-facing changes? No. 
* GitHub Issue: #47923 Lead-authored-by: Kevin Liu Co-authored-by: Kevin Liu Co-authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .github/workflows/cpp.yml | 4 ++-- .github/workflows/matlab.yml | 2 +- .github/workflows/python.yml | 4 ++-- .github/workflows/verify_rc.yml | 2 +- dev/tasks/matlab/github.yml | 2 +- dev/tasks/python-wheels/github.osx.yml | 2 +- dev/tasks/r/github.packages.yml | 6 +++--- dev/tasks/tasks.yml | 6 +++--- 8 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index c002b6e5cf2..bb9571042c3 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -186,7 +186,7 @@ jobs: matrix: include: - architecture: AMD64 - macos-version: "13" + macos-version: "15-intel" - architecture: ARM64 macos-version: "14" env: @@ -257,7 +257,7 @@ jobs: restore-keys: cpp-ccache-macos-${{ matrix.macos-version }}- - name: Build run: | - if [ "${{ matrix.macos-version }}" = "13" ]; then + if [ "${{ matrix.macos-version }}" = "15-intel" ]; then # This is a workaround. # # Homebrew uses /usr/local as prefix. So packages diff --git a/.github/workflows/matlab.yml b/.github/workflows/matlab.yml index 11a0da2a348..fbdac4a8b06 100644 --- a/.github/workflows/matlab.yml +++ b/.github/workflows/matlab.yml @@ -94,7 +94,7 @@ jobs: matrix: include: - architecture: AMD64 - macos-version: "13" + macos-version: "15-intel" - architecture: ARM64 macos-version: "14" steps: diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 361f6be6be8..5aa7a43c568 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -148,7 +148,7 @@ jobs: matrix: include: - architecture: AMD64 - macos-version: "13" + macos-version: "15-intel" - architecture: ARM64 macos-version: "14" env: @@ -219,7 +219,7 @@ jobs: - name: Build shell: bash run: | - if [ "${{ matrix.macos-version }}" = "13" ]; then + if [ "${{ matrix.macos-version }}" = "15-intel" ]; then # This is a workaround. 
# # Homebrew uses /usr/local as prefix. So packages diff --git a/.github/workflows/verify_rc.yml b/.github/workflows/verify_rc.yml index b0eaa1924c5..eb4287882f4 100644 --- a/.github/workflows/verify_rc.yml +++ b/.github/workflows/verify_rc.yml @@ -195,7 +195,7 @@ jobs: fail-fast: false matrix: runs-on: - - macos-13 + - macos-15-intel - macos-14 env: RC: ${{ needs.target.outputs.rc }} diff --git a/dev/tasks/matlab/github.yml b/dev/tasks/matlab/github.yml index cbbdb7a1334..6fdb313cfb2 100644 --- a/dev/tasks/matlab/github.yml +++ b/dev/tasks/matlab/github.yml @@ -64,7 +64,7 @@ jobs: strategy: matrix: platform: - - { architecture: "AMD64", macos-version: "13", architecture-suffix: "x64"} + - { architecture: "AMD64", macos-version: "15-intel", architecture-suffix: "x64"} - { architecture: "ARM64", macos-version: "14", architecture-suffix: "arm64"} steps: {{ macros.github_checkout_arrow()|indent }} diff --git a/dev/tasks/python-wheels/github.osx.yml b/dev/tasks/python-wheels/github.osx.yml index ef8e90f4129..fb57f131ad1 100644 --- a/dev/tasks/python-wheels/github.osx.yml +++ b/dev/tasks/python-wheels/github.osx.yml @@ -51,7 +51,7 @@ jobs: - name: Install System Dependencies run: | - brew install bash bison coreutils ninja + brew install bash bison coreutils mono ninja echo "$(brew --prefix bison)/bin" >> $GITHUB_PATH - name: Homebrew packages diff --git a/dev/tasks/r/github.packages.yml b/dev/tasks/r/github.packages.yml index 44366945979..bbe306ab7c8 100644 --- a/dev/tasks/r/github.packages.yml +++ b/dev/tasks/r/github.packages.yml @@ -64,7 +64,7 @@ jobs: fail-fast: false matrix: platform: - - { runs_on: macos-13, arch: "x86_64" } + - { runs_on: macos-15-intel, arch: "x86_64" } - { runs_on: macos-14, arch: "arm64" } openssl: ['3.0', '1.1'] env: @@ -216,7 +216,7 @@ jobs: matrix: platform: - { runs_on: 'windows-latest', name: "Windows"} - - { runs_on: macos-13, name: "macOS x86_64"} + - { runs_on: macos-15-intel, name: "macOS x86_64"} - { runs_on: macos-14, name: "macOS 
arm64" } r_version: [oldrel, release] steps: @@ -396,7 +396,7 @@ jobs: matrix: platform: - {runs_on: "ubuntu-latest", name: "Linux"} - - {runs_on: "macos-13" , name: "macOS"} + - {runs_on: "macos-15-intel" , name: "macOS"} steps: - name: Install R uses: r-lib/actions/setup-r@v2 diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 749042779ed..6cf11d66b9f 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -219,7 +219,7 @@ tasks: python_version: "{{ python_version }}" python_abi_tag: "{{ abi_tag }}" macos_deployment_target: "12.0" - runs_on: "macos-13" + runs_on: "macos-15-intel" vcpkg_arch: "amd64" artifacts: - pyarrow-{no_rc_version}-{{ python_tag }}-{{ abi_tag }}-macosx_12_0_x86_64.whl @@ -346,7 +346,7 @@ tasks: params: target: {{ target }} use_conda: True - github_runner: "macos-13" + github_runner: "macos-15-intel" {% endfor %} {% for target in ["cpp", @@ -358,7 +358,7 @@ tasks: template: verify-rc/github.macos.yml params: target: {{ target }} - github_runner: "macos-13" + github_runner: "macos-15-intel" {% endfor %} {% for target in ["cpp", From ff089cc01d8bee1a495be8a0a9c31a28b72fcd8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Mon, 3 Nov 2025 08:39:30 +0100 Subject: [PATCH 10/15] GH-47975: [Docs][Python] Remove experimental warning on PyCapsule documentation (#47976) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Follow up from discussion on the mailing list thread: https://lists.apache.org/thread/ncfmmd1429qjsr07j5f5ds177w4wb2s6 The documentation reads: > Warning: > The Arrow PyCapsule Interface should be considered experimental There haven't been major updates to the PyCapsule Interface during the last 2 years, since it was created and has seen widespread usage as tracked on [this comment](https://github.com/apache/arrow/issues/39195#issuecomment-2245718008). The only big change was adding C Device. 
### What changes are included in this PR? Remove experimental note from the documentation. I've validated there are no notes on the code / API about experimental around the PyCapsule Interface. ### Are these changes tested? No ### Are there any user-facing changes? Yes, PyCapsule won't be considered experimental anymore. * GitHub Issue: #47975 Authored-by: RaΓΊl Cumplido Signed-off-by: RaΓΊl Cumplido --- docs/source/format/CDataInterface/PyCapsuleInterface.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/source/format/CDataInterface/PyCapsuleInterface.rst b/docs/source/format/CDataInterface/PyCapsuleInterface.rst index f4f6b54849e..06315a4b398 100644 --- a/docs/source/format/CDataInterface/PyCapsuleInterface.rst +++ b/docs/source/format/CDataInterface/PyCapsuleInterface.rst @@ -22,8 +22,6 @@ The Arrow PyCapsule Interface ============================= -.. warning:: The Arrow PyCapsule Interface should be considered experimental - Rationale ========= From 534ef71eca582006668f6f4ac83b47dd695d2020 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Mon, 3 Nov 2025 12:23:20 +0100 Subject: [PATCH 11/15] GH-48025: [C++][GLib] Replace instances where build path is being added to built artifacts (#48026) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change There are several places where we are adding the build path on our built artifacts. We should avoid that in order to improve reproducibility of our builds. ### What changes are included in this PR? General build system hygiene improvement redacting absolute paths from the compiler flags to improve build reproducibility and caching efficiency. Update for mkenums from full `@ filename@` to `@ basename@` as seen on their official documentation: > The base name of the input file currently being processed (e.g. foo.h). 
Typically you want to use `@ basename@` in place of `@ filename@` in your templates, to improve the reproducibility of the build. (Since: 2.22) ### Are these changes tested? Validated locally that reprotest doesn't fail on `enums` anymore. CI and tests have run. ### Are there any user-facing changes? No * GitHub Issue: #48025 Lead-authored-by: RaΓΊl Cumplido Co-authored-by: Sutou Kouhei Signed-off-by: RaΓΊl Cumplido --- c_glib/arrow-dataset-glib/enums.c.template | 2 +- c_glib/arrow-dataset-glib/enums.h.template | 2 +- c_glib/arrow-glib/enums.c.template | 2 +- c_glib/arrow-glib/enums.h.template | 2 +- c_glib/gandiva-glib/enums.c.template | 2 +- c_glib/gandiva-glib/enums.h.template | 2 +- cpp/src/arrow/CMakeLists.txt | 4 ++++ cpp/src/arrow/util/config.h.cmake | 1 - cpp/src/arrow/util/config_internal.h.cmake | 2 ++ 9 files changed, 12 insertions(+), 7 deletions(-) diff --git a/c_glib/arrow-dataset-glib/enums.c.template b/c_glib/arrow-dataset-glib/enums.c.template index 8921ab06252..d1f774a7673 100644 --- a/c_glib/arrow-dataset-glib/enums.c.template +++ b/c_glib/arrow-dataset-glib/enums.c.template @@ -23,7 +23,7 @@ /*** BEGIN file-production ***/ -/* enumerations from "@filename@" */ +/* enumerations from "@basename@" */ /*** END file-production ***/ /*** BEGIN value-header ***/ diff --git a/c_glib/arrow-dataset-glib/enums.h.template b/c_glib/arrow-dataset-glib/enums.h.template index 8b89a8b031b..f658f47495c 100644 --- a/c_glib/arrow-dataset-glib/enums.h.template +++ b/c_glib/arrow-dataset-glib/enums.h.template @@ -29,7 +29,7 @@ G_BEGIN_DECLS /*** BEGIN file-production ***/ -/* enumerations from "@filename@" */ +/* enumerations from "@basename@" */ /*** END file-production ***/ /*** BEGIN value-header ***/ diff --git a/c_glib/arrow-glib/enums.c.template b/c_glib/arrow-glib/enums.c.template index 6806ed194ef..9a5a9ba09c0 100644 --- a/c_glib/arrow-glib/enums.c.template +++ b/c_glib/arrow-glib/enums.c.template @@ -23,7 +23,7 @@ /*** BEGIN file-production ***/ -/* 
enumerations from "@filename@" */ +/* enumerations from "@basename@" */ /*** END file-production ***/ /*** BEGIN value-header ***/ diff --git a/c_glib/arrow-glib/enums.h.template b/c_glib/arrow-glib/enums.h.template index e49b717fb30..ee1fb5f7a07 100644 --- a/c_glib/arrow-glib/enums.h.template +++ b/c_glib/arrow-glib/enums.h.template @@ -29,7 +29,7 @@ G_BEGIN_DECLS /*** BEGIN file-production ***/ -/* enumerations from "@filename@" */ +/* enumerations from "@basename@" */ /*** END file-production ***/ /*** BEGIN value-header ***/ diff --git a/c_glib/gandiva-glib/enums.c.template b/c_glib/gandiva-glib/enums.c.template index 7ea2ea7b5f5..5f8b80a7702 100644 --- a/c_glib/gandiva-glib/enums.c.template +++ b/c_glib/gandiva-glib/enums.c.template @@ -23,7 +23,7 @@ /*** BEGIN file-production ***/ -/* enumerations from "@filename@" */ +/* enumerations from "@basename@" */ /*** END file-production ***/ /*** BEGIN value-header ***/ diff --git a/c_glib/gandiva-glib/enums.h.template b/c_glib/gandiva-glib/enums.h.template index d362e14c1b2..d07ed468010 100644 --- a/c_glib/gandiva-glib/enums.h.template +++ b/c_glib/gandiva-glib/enums.h.template @@ -29,7 +29,7 @@ G_BEGIN_DECLS /*** BEGIN file-production ***/ -/* enumerations from "@filename@" */ +/* enumerations from "@basename@" */ /*** END file-production ***/ /*** BEGIN value-header ***/ diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 25e5749335a..2e5c67e07b6 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -342,6 +342,10 @@ macro(append_runtime_avx512_src SRCS SRC) endmacro() # Write out compile-time configuration constants +string(REPLACE "${CMAKE_SOURCE_DIR}" "" REDACTED_CXX_FLAGS + ${CMAKE_CXX_FLAGS}) +string(REPLACE "${CMAKE_BINARY_DIR}" "" REDACTED_CXX_FLAGS + ${REDACTED_CXX_FLAGS}) configure_file("util/config.h.cmake" "util/config.h" ESCAPE_QUOTES) configure_file("util/config_internal.h.cmake" "util/config_internal.h" ESCAPE_QUOTES) install(FILES 
"${CMAKE_CURRENT_BINARY_DIR}/util/config.h" diff --git a/cpp/src/arrow/util/config.h.cmake b/cpp/src/arrow/util/config.h.cmake index ddff1379b1d..cf98757c4a8 100644 --- a/cpp/src/arrow/util/config.h.cmake +++ b/cpp/src/arrow/util/config.h.cmake @@ -27,7 +27,6 @@ #define ARROW_CXX_COMPILER_ID "@CMAKE_CXX_COMPILER_ID@" #define ARROW_CXX_COMPILER_VERSION "@CMAKE_CXX_COMPILER_VERSION@" -#define ARROW_CXX_COMPILER_FLAGS "@CMAKE_CXX_FLAGS@" #define ARROW_BUILD_TYPE "@UPPERCASE_BUILD_TYPE@" diff --git a/cpp/src/arrow/util/config_internal.h.cmake b/cpp/src/arrow/util/config_internal.h.cmake index e90f7ee12da..5d96e6fc68a 100644 --- a/cpp/src/arrow/util/config_internal.h.cmake +++ b/cpp/src/arrow/util/config_internal.h.cmake @@ -18,5 +18,7 @@ // These variables are not exposed as they can make compilation caching // and increment builds less efficient. +#define ARROW_CXX_COMPILER_FLAGS "@REDACTED_CXX_FLAGS@" + #define ARROW_GIT_ID "@ARROW_GIT_ID@" #define ARROW_GIT_DESCRIPTION "@ARROW_GIT_DESCRIPTION@" From d9c188170638610cc49929e9490d52c9276ab9f9 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 3 Nov 2025 22:09:42 -0500 Subject: [PATCH 12/15] GH-47961: [C++] Fix Meson's Boost process version detection (#48017) ### Rationale for this change Meson is missing some defines that offer compatibility for different boost versions ### What changes are included in this PR? The Meson configuration is updated to include required boost defines ### Are these changes tested? Yes ### Are there any user-facing changes? 
No * GitHub Issue: #47961 Authored-by: Will Ayd Signed-off-by: Sutou Kouhei --- cpp/src/arrow/meson.build | 45 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/meson.build b/cpp/src/arrow/meson.build index 703d5976aeb..8887da9174c 100644 --- a/cpp/src/arrow/meson.build +++ b/cpp/src/arrow/meson.build @@ -623,22 +623,55 @@ if needs_testing modules: ['filesystem'], required: false, ) - if not filesystem_dep.found() + base_process_dep = dependency('boost', modules: ['process'], required: false) + + if not (filesystem_dep.found() and base_process_dep.found()) cmake = import('cmake') boost_opt = cmake.subproject_options() boost_opt.add_cmake_defines( - {'BOOST_INCLUDE_LIBRARIES': 'filesystem;system'}, + {'BOOST_INCLUDE_LIBRARIES': 'filesystem;process'}, ) + if get_option('default_library') != 'static' + boost_opt.add_cmake_defines({'BUILD_SHARED_LIBS': 'ON'}) + endif boost_proj = cmake.subproject('boost', options: boost_opt) filesystem_dep = boost_proj.dependency('boost_filesystem') + base_process_dep = boost_proj.dependency('boost_process') + endif + + boost_process_have_v2 = false + process_compile_args = [] + if base_process_dep.version() >= '1.86' + process_compile_args += [ + '-DBOOST_PROCESS_HAVE_V1', + '-DBOOST_PROCESS_HAVE_V2', + ] + boost_process_have_v2 = true + elif base_process_dep.version() >= '1.80' + process_compile_args += ['-DBOOST_PROCESS_HAVE_V2'] + boost_process_have_v2 = true endif + if (boost_process_have_v2 and host_machine.system() != 'windows') + # We can't use v2 API on Windows because v2 API doesn't support + # process group[1] and GCS testbench uses multiple processes[2]. 
+ # + # [1] https://github.com/boostorg/process/issues/259 + # [2] https://github.com/googleapis/storage-testbench/issues/669 + process_compile_args += ['-DBOOST_PROCESS_USE_V2'] + endif + process_dep = declare_dependency( + dependencies: [base_process_dep], + compile_args: process_compile_args, + ) + gtest_dep = dependency('gtest') gtest_main_dep = dependency('gtest_main') gtest_dep = dependency('gtest') gmock_dep = dependency('gmock') else filesystem_dep = disabler() + process_dep = disabler() gtest_dep = disabler() gtest_main_dep = disabler() gtest_dep = disabler() @@ -649,7 +682,13 @@ if needs_testing arrow_testing_lib = static_library( 'arrow_testing', sources: arrow_testing_srcs, - dependencies: [arrow_dep, filesystem_dep, gmock_dep, gtest_dep], + dependencies: [ + arrow_dep, + process_dep, + filesystem_dep, + gmock_dep, + gtest_dep, + ], ) arrow_testing_dep = declare_dependency(link_with: [arrow_testing_lib]) From 5eaf553bfc7aa639fd67bd622b6b808e71fbba39 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 4 Nov 2025 20:00:22 +0900 Subject: [PATCH 13/15] GH-48044: [Packaging][RPM][Parquet] Don't install `parquet-glib.pc` by `parquet-devel` (#48045) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change If `parquet-devel` installs `parquet-glib.pc`, `parquet-devel` depends on `parquet-glib-devel` automatically. It's not an expected behavior. `parquet-devel` should not depend on `parquet-glib-devel`. ### What changes are included in this PR? Don't install `parquet-glib.pc` by `parquet-devel`. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. 
* GitHub Issue: #48044 Authored-by: Sutou Kouhei Signed-off-by: RaΓΊl Cumplido --- dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in index 32bd076e821..28feb1c6bbd 100644 --- a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in +++ b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in @@ -596,7 +596,7 @@ Libraries and header files for Apache Parquet C++. %{_libdir}/cmake/Parquet/ %{_libdir}/libparquet.a %{_libdir}/libparquet.so -%{_libdir}/pkgconfig/parquet*.pc +%{_libdir}/pkgconfig/parquet.pc %package -n %{name}%{so_version}-glib-libs Summary: Runtime libraries for Apache Arrow GLib From 550d37130bb9fd282be7fe389ca747d01ad12a97 Mon Sep 17 00:00:00 2001 From: "Alina (Xi) Li" Date: Thu, 16 Oct 2025 14:32:57 -0700 Subject: [PATCH 14/15] Extract SQLColAttribute implementation Co-Authored-By: justing-bq Co-Authored-By: alinalibq --- cpp/src/arrow/flight/sql/odbc/odbc_api.cc | 86 +- .../flight_sql_result_set_metadata.cc | 31 +- .../flight_sql_result_set_metadata.h | 3 +- .../sql/odbc/odbc_impl/odbc_descriptor.cc | 7 +- .../odbc/odbc_impl/spi/result_set_metadata.h | 6 +- .../flight/sql/odbc/tests/CMakeLists.txt | 1 + .../flight/sql/odbc/tests/columns_test.cc | 1235 +++++++++++++++++ 7 files changed, 1350 insertions(+), 19 deletions(-) create mode 100644 cpp/src/arrow/flight/sql/odbc/tests/columns_test.cc diff --git a/cpp/src/arrow/flight/sql/odbc/odbc_api.cc b/cpp/src/arrow/flight/sql/odbc/odbc_api.cc index 01780f0efe2..71da444088c 100644 --- a/cpp/src/arrow/flight/sql/odbc/odbc_api.cc +++ b/cpp/src/arrow/flight/sql/odbc/odbc_api.cc @@ -951,8 +951,90 @@ SQLRETURN SQLColAttribute(SQLHSTMT stmt, SQLUSMALLINT record_number, << ", output_length: " << static_cast(output_length) << ", numeric_attribute_ptr: " << static_cast(numeric_attribute_ptr); - // GH-47721 TODO: Implement 
SQLColAttribute, pre-requisite requires SQLColumns - return SQL_INVALID_HANDLE; + + using ODBC::ODBCDescriptor; + using ODBC::ODBCStatement; + return ODBCStatement::ExecuteWithDiagnostics(stmt, SQL_ERROR, [=]() { + ODBCStatement* statement = reinterpret_cast(stmt); + ODBCDescriptor* ird = statement->GetIRD(); + SQLINTEGER output_length_int; + switch (field_identifier) { + // Numeric attributes + // internal is SQLLEN, no conversion is needed + case SQL_DESC_DISPLAY_SIZE: + case SQL_DESC_OCTET_LENGTH: { + ird->GetField(record_number, field_identifier, numeric_attribute_ptr, + buffer_length, &output_length_int); + break; + } + // internal is SQLULEN, conversion is needed. + case SQL_COLUMN_LENGTH: // ODBC 2.0 + case SQL_DESC_LENGTH: { + SQLULEN temp; + ird->GetField(record_number, field_identifier, &temp, buffer_length, + &output_length_int); + if (numeric_attribute_ptr) { + *numeric_attribute_ptr = static_cast(temp); + } + break; + } + // internal is SQLINTEGER, conversion is needed. + case SQL_DESC_AUTO_UNIQUE_VALUE: + case SQL_DESC_CASE_SENSITIVE: + case SQL_DESC_NUM_PREC_RADIX: { + SQLINTEGER temp; + ird->GetField(record_number, field_identifier, &temp, buffer_length, + &output_length_int); + if (numeric_attribute_ptr) { + *numeric_attribute_ptr = static_cast(temp); + } + break; + } + // internal is SQLSMALLINT, conversion is needed. 
+ case SQL_DESC_CONCISE_TYPE: + case SQL_DESC_COUNT: + case SQL_DESC_FIXED_PREC_SCALE: + case SQL_DESC_TYPE: + case SQL_DESC_NULLABLE: + case SQL_COLUMN_PRECISION: // ODBC 2.0 + case SQL_DESC_PRECISION: + case SQL_COLUMN_SCALE: // ODBC 2.0 + case SQL_DESC_SCALE: + case SQL_DESC_SEARCHABLE: + case SQL_DESC_UNNAMED: + case SQL_DESC_UNSIGNED: + case SQL_DESC_UPDATABLE: { + SQLSMALLINT temp; + ird->GetField(record_number, field_identifier, &temp, buffer_length, + &output_length_int); + if (numeric_attribute_ptr) { + *numeric_attribute_ptr = static_cast(temp); + } + break; + } + // Character attributes + case SQL_DESC_BASE_COLUMN_NAME: + case SQL_DESC_BASE_TABLE_NAME: + case SQL_DESC_CATALOG_NAME: + case SQL_DESC_LABEL: + case SQL_DESC_LITERAL_PREFIX: + case SQL_DESC_LITERAL_SUFFIX: + case SQL_DESC_LOCAL_TYPE_NAME: + case SQL_DESC_NAME: + case SQL_DESC_SCHEMA_NAME: + case SQL_DESC_TABLE_NAME: + case SQL_DESC_TYPE_NAME: + ird->GetField(record_number, field_identifier, character_attribute_ptr, + buffer_length, &output_length_int); + break; + default: + throw DriverException("Invalid descriptor field", "HY091"); + } + if (output_length) { + *output_length = static_cast(output_length_int); + } + return SQL_SUCCESS; + }); } SQLRETURN SQLGetTypeInfo(SQLHSTMT stmt, SQLSMALLINT data_type) { diff --git a/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_result_set_metadata.cc b/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_result_set_metadata.cc index 8ac3c7ed752..d760caffa7a 100644 --- a/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_result_set_metadata.cc +++ b/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_result_set_metadata.cc @@ -20,6 +20,7 @@ #include "arrow/flight/sql/column_metadata.h" #include "arrow/flight/sql/odbc/odbc_impl/platform.h" #include "arrow/flight/sql/odbc/odbc_impl/util.h" +#include "arrow/util/key_value_metadata.h" #include #include "arrow/flight/sql/odbc/odbc_impl/exceptions.h" @@ -40,12 +41,8 @@ constexpr int32_t DefaultDecimalPrecision = 
38; constexpr int32_t DefaultLengthForVariableLengthColumns = 1024; namespace { -std::shared_ptr empty_metadata_map(new KeyValueMetadata); - inline ColumnMetadata GetMetadata(const std::shared_ptr& field) { - const auto& metadata_map = field->metadata(); - - ColumnMetadata metadata(metadata_map ? metadata_map : empty_metadata_map); + ColumnMetadata metadata(field->metadata()); return metadata; } @@ -207,10 +204,13 @@ size_t FlightSqlResultSetMetadata::GetOctetLength(int column_position) { .value_or(DefaultLengthForVariableLengthColumns); } -std::string FlightSqlResultSetMetadata::GetTypeName(int column_position) { +std::string FlightSqlResultSetMetadata::GetTypeName(int column_position, int data_type) { ColumnMetadata metadata = GetMetadata(schema_->field(column_position - 1)); - return metadata.GetTypeName().ValueOrElse([] { return ""; }); + return metadata.GetTypeName().ValueOrElse([data_type] { + // If we get an empty type name, figure out the type name from the data_type. + return util::GetTypeNameFromSqlDataType(data_type); + }); } Updatability FlightSqlResultSetMetadata::GetUpdatable(int column_position) { @@ -241,18 +241,29 @@ bool FlightSqlResultSetMetadata::IsUnsigned(int column_position) { const std::shared_ptr& field = schema_->field(column_position - 1); switch (field->type()->id()) { + case Type::INT8: + case Type::INT16: + case Type::INT32: + case Type::INT64: + case Type::DOUBLE: + case Type::FLOAT: + case Type::HALF_FLOAT: + case Type::DECIMAL32: + case Type::DECIMAL64: + case Type::DECIMAL128: + case Type::DECIMAL256: + return false; case Type::UINT8: case Type::UINT16: case Type::UINT32: case Type::UINT64: - return true; default: - return false; + return true; } } bool FlightSqlResultSetMetadata::IsFixedPrecScale(int column_position) { - // TODO: Flight SQL column metadata does not have this, should we add to the spec? 
+ // Precision for Arrow data types are modifiable by the user return false; } diff --git a/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_result_set_metadata.h b/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_result_set_metadata.h index 0d141a4bb9c..11b1678c24d 100644 --- a/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_result_set_metadata.h +++ b/cpp/src/arrow/flight/sql/odbc/odbc_impl/flight_sql_result_set_metadata.h @@ -77,7 +77,7 @@ class FlightSqlResultSetMetadata : public ResultSetMetadata { size_t GetOctetLength(int column_position) override; - std::string GetTypeName(int column_position) override; + std::string GetTypeName(int column_position, int data_type) override; Updatability GetUpdatable(int column_position) override; @@ -87,6 +87,7 @@ class FlightSqlResultSetMetadata : public ResultSetMetadata { Searchability IsSearchable(int column_position) override; + /// \brief Returns true if the column is unsigned (not numeric) bool IsUnsigned(int column_position) override; bool IsFixedPrecScale(int column_position) override; diff --git a/cpp/src/arrow/flight/sql/odbc/odbc_impl/odbc_descriptor.cc b/cpp/src/arrow/flight/sql/odbc/odbc_impl/odbc_descriptor.cc index d2b7f8865ca..9ec68973bb8 100644 --- a/cpp/src/arrow/flight/sql/odbc/odbc_impl/odbc_descriptor.cc +++ b/cpp/src/arrow/flight/sql/odbc/odbc_impl/odbc_descriptor.cc @@ -479,6 +479,8 @@ void ODBCDescriptor::PopulateFromResultSetMetadata(ResultSetMetadata* rsmd) { for (size_t i = 0; i < records_.size(); ++i) { size_t one_based_index = i + 1; + int16_t concise_type = rsmd->GetConciseType(one_based_index); + records_[i].base_column_name = rsmd->GetBaseColumnName(one_based_index); records_[i].base_table_name = rsmd->GetBaseTableName(one_based_index); records_[i].catalog_name = rsmd->GetCatalogName(one_based_index); @@ -489,9 +491,8 @@ void ODBCDescriptor::PopulateFromResultSetMetadata(ResultSetMetadata* rsmd) { records_[i].name = rsmd->GetName(one_based_index); records_[i].schema_name = 
rsmd->GetSchemaName(one_based_index); records_[i].table_name = rsmd->GetTableName(one_based_index); - records_[i].type_name = rsmd->GetTypeName(one_based_index); - records_[i].concise_type = GetSqlTypeForODBCVersion( - rsmd->GetConciseType(one_based_index), is_2x_connection_); + records_[i].type_name = rsmd->GetTypeName(one_based_index, concise_type); + records_[i].concise_type = GetSqlTypeForODBCVersion(concise_type, is_2x_connection_); records_[i].data_ptr = nullptr; records_[i].indicator_ptr = nullptr; records_[i].display_size = rsmd->GetColumnDisplaySize(one_based_index); diff --git a/cpp/src/arrow/flight/sql/odbc/odbc_impl/spi/result_set_metadata.h b/cpp/src/arrow/flight/sql/odbc/odbc_impl/spi/result_set_metadata.h index 38f81fc9c3e..a33784cc79b 100644 --- a/cpp/src/arrow/flight/sql/odbc/odbc_impl/spi/result_set_metadata.h +++ b/cpp/src/arrow/flight/sql/odbc/odbc_impl/spi/result_set_metadata.h @@ -17,9 +17,8 @@ #pragma once -#include "arrow/flight/sql/odbc/odbc_impl/types.h" - #include +#include "arrow/flight/sql/odbc/odbc_impl/types.h" namespace arrow::flight::sql::odbc { @@ -143,8 +142,9 @@ class ResultSetMetadata { /// \brief It returns the data type as a string. /// \param column_position [in] the position of the column, starting from 1. + /// \param data_type [in] the data type of the column. /// \return the data type string. - virtual std::string GetTypeName(int column_position) = 0; + virtual std::string GetTypeName(int column_position, int data_type) = 0; /// \brief It returns a numeric values indicate the updatability of the /// column. 
diff --git a/cpp/src/arrow/flight/sql/odbc/tests/CMakeLists.txt b/cpp/src/arrow/flight/sql/odbc/tests/CMakeLists.txt index 4bc240637e7..7e3e1eb9034 100644 --- a/cpp/src/arrow/flight/sql/odbc/tests/CMakeLists.txt +++ b/cpp/src/arrow/flight/sql/odbc/tests/CMakeLists.txt @@ -34,6 +34,7 @@ add_arrow_test(flight_sql_odbc_test SOURCES odbc_test_suite.cc odbc_test_suite.h + columns_test.cc connection_test.cc # Enable Protobuf cleanup after test execution # GH-46889: move protobuf_test_util to a more common location diff --git a/cpp/src/arrow/flight/sql/odbc/tests/columns_test.cc b/cpp/src/arrow/flight/sql/odbc/tests/columns_test.cc new file mode 100644 index 00000000000..3905d0474f8 --- /dev/null +++ b/cpp/src/arrow/flight/sql/odbc/tests/columns_test.cc @@ -0,0 +1,1235 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "arrow/flight/sql/odbc/tests/odbc_test_suite.h" + +#include "arrow/flight/sql/odbc/odbc_impl/platform.h" + +#include +#include +#include + +#include + +namespace arrow::flight::sql::odbc { + +template +class ColumnsTest : public T {}; + +class ColumnsMockTest : public FlightSQLODBCMockTestBase {}; +class ColumnsRemoteTest : public FlightSQLODBCRemoteTestBase {}; +using TestTypes = ::testing::Types; +TYPED_TEST_SUITE(ColumnsTest, TestTypes); + +template +class ColumnsOdbcV2Test : public T {}; + +class ColumnsOdbcV2MockTest : public FlightSQLOdbcV2MockTestBase {}; +class ColumnsOdbcV2RemoteTest : public FlightSQLOdbcV2RemoteTestBase {}; +using TestTypesOdbcV2 = ::testing::Types; +TYPED_TEST_SUITE(ColumnsOdbcV2Test, TestTypesOdbcV2); + +namespace { +// Helper functions + +void CheckSQLColAttribute(SQLHSTMT stmt, SQLUSMALLINT idx, + const std::wstring& expected_column_name, + SQLLEN expected_data_type, SQLLEN expected_concise_type, + SQLLEN expected_display_size, SQLLEN expected_prec_scale, + SQLLEN expected_length, + const std::wstring& expected_literal_prefix, + const std::wstring& expected_literal_suffix, + SQLLEN expected_column_size, SQLLEN expected_column_scale, + SQLLEN expected_column_nullability, + SQLLEN expected_num_prec_radix, SQLLEN expected_octet_length, + SQLLEN expected_searchable, SQLLEN expected_unsigned_column) { + std::vector name(kOdbcBufferSize); + SQLSMALLINT name_len = 0; + std::vector base_column_name(kOdbcBufferSize); + SQLSMALLINT column_name_len = 0; + std::vector label(kOdbcBufferSize); + SQLSMALLINT label_len = 0; + std::vector prefix(kOdbcBufferSize); + SQLSMALLINT prefix_len = 0; + std::vector suffix(kOdbcBufferSize); + SQLSMALLINT suffix_len = 0; + SQLLEN data_type = 0; + SQLLEN concise_type = 0; + SQLLEN display_size = 0; + SQLLEN prec_scale = 0; + SQLLEN length = 0; + SQLLEN size = 0; + SQLLEN scale = 0; + SQLLEN nullability = 0; + SQLLEN num_prec_radix = 0; + SQLLEN octet_length = 0; + SQLLEN searchable = 0; + SQLLEN 
unsigned_col = 0; + + EXPECT_EQ(SQL_SUCCESS, SQLColAttribute(stmt, idx, SQL_DESC_NAME, &name[0], + (SQLSMALLINT)name.size(), &name_len, 0)); + + EXPECT_EQ(SQL_SUCCESS, + SQLColAttribute(stmt, idx, SQL_DESC_BASE_COLUMN_NAME, &base_column_name[0], + (SQLSMALLINT)base_column_name.size(), &column_name_len, 0)); + + EXPECT_EQ(SQL_SUCCESS, SQLColAttribute(stmt, idx, SQL_DESC_LABEL, &label[0], + (SQLSMALLINT)label.size(), &label_len, 0)); + + EXPECT_EQ(SQL_SUCCESS, SQLColAttribute(stmt, idx, SQL_DESC_TYPE, 0, 0, 0, &data_type)); + + EXPECT_EQ(SQL_SUCCESS, + SQLColAttribute(stmt, idx, SQL_DESC_CONCISE_TYPE, 0, 0, 0, &concise_type)); + + EXPECT_EQ(SQL_SUCCESS, + SQLColAttribute(stmt, idx, SQL_DESC_DISPLAY_SIZE, 0, 0, 0, &display_size)); + + EXPECT_EQ(SQL_SUCCESS, + SQLColAttribute(stmt, idx, SQL_DESC_FIXED_PREC_SCALE, 0, 0, 0, &prec_scale)); + + EXPECT_EQ(SQL_SUCCESS, SQLColAttribute(stmt, idx, SQL_DESC_LENGTH, 0, 0, 0, &length)); + + EXPECT_EQ(SQL_SUCCESS, SQLColAttribute(stmt, idx, SQL_DESC_LITERAL_PREFIX, &prefix[0], + (SQLSMALLINT)prefix.size(), &prefix_len, 0)); + + EXPECT_EQ(SQL_SUCCESS, SQLColAttribute(stmt, idx, SQL_DESC_LITERAL_SUFFIX, &suffix[0], + (SQLSMALLINT)suffix.size(), &suffix_len, 0)); + + EXPECT_EQ(SQL_SUCCESS, SQLColAttribute(stmt, idx, SQL_DESC_PRECISION, 0, 0, 0, &size)); + + EXPECT_EQ(SQL_SUCCESS, SQLColAttribute(stmt, idx, SQL_DESC_SCALE, 0, 0, 0, &scale)); + + EXPECT_EQ(SQL_SUCCESS, + SQLColAttribute(stmt, idx, SQL_DESC_NULLABLE, 0, 0, 0, &nullability)); + + EXPECT_EQ(SQL_SUCCESS, SQLColAttribute(stmt, idx, SQL_DESC_NUM_PREC_RADIX, 0, 0, 0, + &num_prec_radix)); + + EXPECT_EQ(SQL_SUCCESS, + SQLColAttribute(stmt, idx, SQL_DESC_OCTET_LENGTH, 0, 0, 0, &octet_length)); + + EXPECT_EQ(SQL_SUCCESS, + SQLColAttribute(stmt, idx, SQL_DESC_SEARCHABLE, 0, 0, 0, &searchable)); + + EXPECT_EQ(SQL_SUCCESS, + SQLColAttribute(stmt, idx, SQL_DESC_UNSIGNED, 0, 0, 0, &unsigned_col)); + + std::wstring name_str = ConvertToWString(name, name_len); + std::wstring 
base_column_name_str = ConvertToWString(base_column_name, column_name_len); + std::wstring label_str = ConvertToWString(label, label_len); + std::wstring prefixStr = ConvertToWString(prefix, prefix_len); + + // Assume column name, base column name, and label are equivalent in the result set + EXPECT_EQ(expected_column_name, name_str); + EXPECT_EQ(expected_column_name, base_column_name_str); + EXPECT_EQ(expected_column_name, label_str); + EXPECT_EQ(expected_data_type, data_type); + EXPECT_EQ(expected_concise_type, concise_type); + EXPECT_EQ(expected_display_size, display_size); + EXPECT_EQ(expected_prec_scale, prec_scale); + EXPECT_EQ(expected_length, length); + EXPECT_EQ(expected_literal_prefix, prefixStr); + EXPECT_EQ(expected_column_size, size); + EXPECT_EQ(expected_column_scale, scale); + EXPECT_EQ(expected_column_nullability, nullability); + EXPECT_EQ(expected_num_prec_radix, num_prec_radix); + EXPECT_EQ(expected_octet_length, octet_length); + EXPECT_EQ(expected_searchable, searchable); + EXPECT_EQ(expected_unsigned_column, unsigned_col); +} + +void CheckSQLColAttributes(SQLHSTMT stmt, SQLUSMALLINT idx, + const std::wstring& expected_column_name, + SQLLEN expected_data_type, SQLLEN expected_display_size, + SQLLEN expected_prec_scale, SQLLEN expected_length, + SQLLEN expected_column_size, SQLLEN expected_column_scale, + SQLLEN expected_column_nullability, SQLLEN expected_searchable, + SQLLEN expected_unsigned_column) { + std::vector name(kOdbcBufferSize); + SQLSMALLINT name_len = 0; + std::vector label(kOdbcBufferSize); + SQLSMALLINT label_len = 0; + SQLLEN data_type = 0; + SQLLEN display_size = 0; + SQLLEN prec_scale = 0; + SQLLEN length = 0; + SQLLEN size = 0; + SQLLEN scale = 0; + SQLLEN nullability = 0; + SQLLEN searchable = 0; + SQLLEN unsigned_col = 0; + + EXPECT_EQ(SQL_SUCCESS, SQLColAttributes(stmt, idx, SQL_COLUMN_NAME, &name[0], + (SQLSMALLINT)name.size(), &name_len, 0)); + + EXPECT_EQ(SQL_SUCCESS, SQLColAttributes(stmt, idx, SQL_COLUMN_LABEL, 
&label[0], + (SQLSMALLINT)label.size(), &label_len, 0)); + + EXPECT_EQ(SQL_SUCCESS, + SQLColAttributes(stmt, idx, SQL_COLUMN_TYPE, 0, 0, 0, &data_type)); + + EXPECT_EQ(SQL_SUCCESS, + SQLColAttributes(stmt, idx, SQL_COLUMN_DISPLAY_SIZE, 0, 0, 0, &display_size)); + + EXPECT_EQ(SQL_SUCCESS, + SQLColAttribute(stmt, idx, SQL_COLUMN_MONEY, 0, 0, 0, &prec_scale)); + + EXPECT_EQ(SQL_SUCCESS, + SQLColAttributes(stmt, idx, SQL_COLUMN_LENGTH, 0, 0, 0, &length)); + + EXPECT_EQ(SQL_SUCCESS, + SQLColAttributes(stmt, idx, SQL_COLUMN_PRECISION, 0, 0, 0, &size)); + + EXPECT_EQ(SQL_SUCCESS, SQLColAttributes(stmt, idx, SQL_COLUMN_SCALE, 0, 0, 0, &scale)); + + EXPECT_EQ(SQL_SUCCESS, + SQLColAttributes(stmt, idx, SQL_COLUMN_NULLABLE, 0, 0, 0, &nullability)); + + EXPECT_EQ(SQL_SUCCESS, + SQLColAttributes(stmt, idx, SQL_COLUMN_SEARCHABLE, 0, 0, 0, &searchable)); + + EXPECT_EQ(SQL_SUCCESS, + SQLColAttributes(stmt, idx, SQL_COLUMN_UNSIGNED, 0, 0, 0, &unsigned_col)); + + std::wstring name_str = ConvertToWString(name, name_len); + std::wstring label_str = ConvertToWString(label, label_len); + + EXPECT_EQ(expected_column_name, name_str); + EXPECT_EQ(expected_column_name, label_str); + EXPECT_EQ(expected_data_type, data_type); + EXPECT_EQ(expected_display_size, display_size); + EXPECT_EQ(expected_length, length); + EXPECT_EQ(expected_column_size, size); + EXPECT_EQ(expected_column_scale, scale); + EXPECT_EQ(expected_column_nullability, nullability); + EXPECT_EQ(expected_searchable, searchable); + EXPECT_EQ(expected_unsigned_column, unsigned_col); +} + +void CheckSQLColAttributeString(SQLHSTMT stmt, const std::wstring& wsql, SQLUSMALLINT idx, + SQLUSMALLINT field_identifier, + const std::wstring& expected_attr_string) { + if (!wsql.empty()) { + // Execute query + std::vector sql0(wsql.begin(), wsql.end()); + ASSERT_EQ(SQL_SUCCESS, + SQLExecDirect(stmt, &sql0[0], static_cast(sql0.size()))); + + ASSERT_EQ(SQL_SUCCESS, SQLFetch(stmt)); + } + + // check SQLColAttribute string attribute + 
std::vector str_val(kOdbcBufferSize); + SQLSMALLINT str_len = 0; + + ASSERT_EQ(SQL_SUCCESS, SQLColAttribute(stmt, idx, field_identifier, &str_val[0], + (SQLSMALLINT)str_val.size(), &str_len, 0)); + + std::wstring attr_str = ConvertToWString(str_val, str_len); + ASSERT_EQ(expected_attr_string, attr_str); +} + +void CheckSQLColAttributeNumeric(SQLHSTMT stmt, const std::wstring& wsql, + SQLUSMALLINT idx, SQLUSMALLINT field_identifier, + SQLLEN expected_attr_numeric) { + // Execute query and check SQLColAttribute numeric attribute + std::vector sql0(wsql.begin(), wsql.end()); + ASSERT_EQ(SQL_SUCCESS, + SQLExecDirect(stmt, &sql0[0], static_cast(sql0.size()))); + + ASSERT_EQ(SQL_SUCCESS, SQLFetch(stmt)); + + SQLLEN num_val = 0; + ASSERT_EQ(SQL_SUCCESS, SQLColAttribute(stmt, idx, field_identifier, 0, 0, 0, &num_val)); + ASSERT_EQ(expected_attr_numeric, num_val); +} + +void CheckSQLColAttributesString(SQLHSTMT stmt, const std::wstring& wsql, + SQLUSMALLINT idx, SQLUSMALLINT field_identifier, + const std::wstring& expected_attr_string) { + if (!wsql.empty()) { + // Execute query + std::vector sql0(wsql.begin(), wsql.end()); + ASSERT_EQ(SQL_SUCCESS, + SQLExecDirect(stmt, &sql0[0], static_cast(sql0.size()))); + + ASSERT_EQ(SQL_SUCCESS, SQLFetch(stmt)); + } + + // check ODBC 2.0 API SQLColAttributes string attribute + std::vector str_val(kOdbcBufferSize); + SQLSMALLINT str_len = 0; + + ASSERT_EQ(SQL_SUCCESS, SQLColAttributes(stmt, idx, field_identifier, &str_val[0], + (SQLSMALLINT)str_val.size(), &str_len, 0)); + + std::wstring attr_str = ConvertToWString(str_val, str_len); + ASSERT_EQ(expected_attr_string, attr_str); +} + +void CheckSQLColAttributesNumeric(SQLHSTMT stmt, const std::wstring& wsql, + SQLUSMALLINT idx, SQLUSMALLINT field_identifier, + SQLLEN expected_attr_numeric) { + // Execute query and check ODBC 2.0 API SQLColAttributes numeric attribute + std::vector sql0(wsql.begin(), wsql.end()); + ASSERT_EQ(SQL_SUCCESS, + SQLExecDirect(stmt, &sql0[0], 
static_cast(sql0.size()))); + + ASSERT_EQ(SQL_SUCCESS, SQLFetch(stmt)); + + SQLLEN num_val = 0; + ASSERT_EQ(SQL_SUCCESS, + SQLColAttributes(stmt, idx, field_identifier, 0, 0, 0, &num_val)); + ASSERT_EQ(expected_attr_numeric, num_val); +} +} // namespace + +TYPED_TEST(ColumnsTest, SQLColAttributeTestInputData) { + std::wstring wsql = L"SELECT 1 as col1;"; + std::vector sql0(wsql.begin(), wsql.end()); + + ASSERT_EQ(SQL_SUCCESS, + SQLExecDirect(this->stmt, &sql0[0], static_cast(sql0.size()))); + + ASSERT_EQ(SQL_SUCCESS, SQLFetch(this->stmt)); + + SQLUSMALLINT idx = 1; + std::vector character_attr(kOdbcBufferSize); + SQLSMALLINT character_attr_len = 0; + SQLLEN numeric_attr = 0; + + // All character values populated + EXPECT_EQ(SQL_SUCCESS, + SQLColAttribute(this->stmt, idx, SQL_DESC_NAME, &character_attr[0], + (SQLSMALLINT)character_attr.size(), &character_attr_len, 0)); + + // All numeric values populated + EXPECT_EQ(SQL_SUCCESS, + SQLColAttribute(this->stmt, idx, SQL_DESC_COUNT, 0, 0, 0, &numeric_attr)); + + // Pass null values, driver should not throw error + EXPECT_EQ(SQL_SUCCESS, + SQLColAttribute(this->stmt, idx, SQL_COLUMN_TABLE_NAME, 0, 0, 0, 0)); + + EXPECT_EQ(SQL_SUCCESS, SQLColAttribute(this->stmt, idx, SQL_DESC_COUNT, 0, 0, 0, 0)); +} + +TYPED_TEST(ColumnsTest, SQLColAttributeGetCharacterLen) { + std::wstring wsql = L"SELECT 1 as col1;"; + std::vector sql0(wsql.begin(), wsql.end()); + + ASSERT_EQ(SQL_SUCCESS, + SQLExecDirect(this->stmt, &sql0[0], static_cast(sql0.size()))); + + ASSERT_EQ(SQL_SUCCESS, SQLFetch(this->stmt)); + + SQLSMALLINT character_attr_len = 0; + + // Check length of character attribute + ASSERT_EQ(SQL_SUCCESS, SQLColAttribute(this->stmt, 1, SQL_DESC_BASE_COLUMN_NAME, 0, 0, + &character_attr_len, 0)); + EXPECT_EQ(4 * ODBC::GetSqlWCharSize(), character_attr_len); +} + +TYPED_TEST(ColumnsTest, SQLColAttributeInvalidFieldId) { + std::wstring wsql = L"SELECT 1 as col1;"; + std::vector sql0(wsql.begin(), wsql.end()); + + ASSERT_EQ(SQL_SUCCESS, 
+ SQLExecDirect(this->stmt, &sql0[0], static_cast(sql0.size()))); + + ASSERT_EQ(SQL_SUCCESS, SQLFetch(this->stmt)); + + SQLUSMALLINT invalid_field_id = -100; + SQLUSMALLINT idx = 1; + std::vector character_attr(kOdbcBufferSize); + SQLSMALLINT character_attr_len = 0; + SQLLEN numeric_attr = 0; + + ASSERT_EQ(SQL_ERROR, + SQLColAttribute(this->stmt, idx, invalid_field_id, &character_attr[0], + (SQLSMALLINT)character_attr.size(), &character_attr_len, 0)); + // Verify invalid descriptor field identifier error state is returned + VerifyOdbcErrorState(SQL_HANDLE_STMT, this->stmt, kErrorStateHY091); +} + +TYPED_TEST(ColumnsTest, SQLColAttributeInvalidColId) { + std::wstring wsql = L"SELECT 1 as col1;"; + std::vector sql0(wsql.begin(), wsql.end()); + + ASSERT_EQ(SQL_SUCCESS, + SQLExecDirect(this->stmt, &sql0[0], static_cast(sql0.size()))); + + ASSERT_EQ(SQL_SUCCESS, SQLFetch(this->stmt)); + + SQLUSMALLINT invalid_col_id = 2; + std::vector character_attr(kOdbcBufferSize); + SQLSMALLINT character_attr_len = 0; + + ASSERT_EQ(SQL_ERROR, + SQLColAttribute(this->stmt, invalid_col_id, SQL_DESC_BASE_COLUMN_NAME, + &character_attr[0], (SQLSMALLINT)character_attr.size(), + &character_attr_len, 0)); + // Verify invalid descriptor index error state is returned + VerifyOdbcErrorState(SQL_HANDLE_STMT, this->stmt, kErrorState07009); +} + +TEST_F(ColumnsMockTest, TestSQLColAttributeAllTypes) { + this->CreateTableAllDataType(); + + std::wstring wsql = L"SELECT * from AllTypesTable;"; + std::vector sql0(wsql.begin(), wsql.end()); + + ASSERT_EQ(SQL_SUCCESS, + SQLExecDirect(this->stmt, &sql0[0], static_cast(sql0.size()))); + + ASSERT_EQ(SQL_SUCCESS, SQLFetch(this->stmt)); + + CheckSQLColAttribute(this->stmt, 1, + std::wstring(L"bigint_col"), // expected_column_name + SQL_BIGINT, // expected_data_type + SQL_BIGINT, // expected_concise_type + 20, // expected_display_size + SQL_FALSE, // expected_prec_scale + 8, // expected_length + std::wstring(L""), // expected_literal_prefix + 
std::wstring(L""), // expected_literal_suffix + 8, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + 10, // expected_num_prec_radix + 8, // expected_octet_length + SQL_PRED_NONE, // expected_searchable + SQL_FALSE); // expected_unsigned_column + + CheckSQLColAttribute(this->stmt, 2, + std::wstring(L"char_col"), // expected_column_name + SQL_WVARCHAR, // expected_data_type + SQL_WVARCHAR, // expected_concise_type + 0, // expected_display_size + SQL_FALSE, // expected_prec_scale + 0, // expected_length + std::wstring(L""), // expected_literal_prefix + std::wstring(L""), // expected_literal_suffix + 0, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + 0, // expected_num_prec_radix + 0, // expected_octet_length + SQL_PRED_NONE, // expected_searchable + SQL_TRUE); // expected_unsigned_column + + CheckSQLColAttribute(this->stmt, 3, + std::wstring(L"varbinary_col"), // expected_column_name + SQL_BINARY, // expected_data_type + SQL_BINARY, // expected_concise_type + 0, // expected_display_size + SQL_FALSE, // expected_prec_scale + 0, // expected_length + std::wstring(L""), // expected_literal_prefix + std::wstring(L""), // expected_literal_suffix + 0, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + 0, // expected_num_prec_radix + 0, // expected_octet_length + SQL_PRED_NONE, // expected_searchable + SQL_TRUE); // expected_unsigned_column + + CheckSQLColAttribute(this->stmt, 4, + std::wstring(L"double_col"), // expected_column_name + SQL_DOUBLE, // expected_data_type + SQL_DOUBLE, // expected_concise_type + 24, // expected_display_size + SQL_FALSE, // expected_prec_scale + 8, // expected_length + std::wstring(L""), // expected_literal_prefix + std::wstring(L""), // expected_literal_suffix + 8, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + 2, // 
expected_num_prec_radix + 8, // expected_octet_length + SQL_PRED_NONE, // expected_searchable + SQL_FALSE); // expected_unsigned_column +} + +TEST_F(ColumnsOdbcV2MockTest, TestSQLColAttributesAllTypesODBCVer2) { + // Tests ODBC 2.0 API SQLColAttributes + this->CreateTableAllDataType(); + + std::wstring wsql = L"SELECT * from AllTypesTable;"; + std::vector sql0(wsql.begin(), wsql.end()); + + ASSERT_EQ(SQL_SUCCESS, + SQLExecDirect(this->stmt, &sql0[0], static_cast(sql0.size()))); + + ASSERT_EQ(SQL_SUCCESS, SQLFetch(this->stmt)); + CheckSQLColAttributes(this->stmt, 1, + std::wstring(L"bigint_col"), // expected_column_name + SQL_BIGINT, // expected_data_type + 20, // expected_display_size + SQL_FALSE, // expected_prec_scale + 8, // expected_length + 8, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + SQL_PRED_NONE, // expected_searchable + SQL_FALSE); // expected_unsigned_column + + CheckSQLColAttributes(this->stmt, 2, + std::wstring(L"char_col"), // expected_column_name + SQL_WVARCHAR, // expected_data_type + 0, // expected_display_size + SQL_FALSE, // expected_prec_scale + 0, // expected_length + 0, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + SQL_PRED_NONE, // expected_searchable + SQL_TRUE); // expected_unsigned_column + + CheckSQLColAttributes(this->stmt, 3, + std::wstring(L"varbinary_col"), // expected_column_name + SQL_BINARY, // expected_data_type + 0, // expected_display_size + SQL_FALSE, // expected_prec_scale + 0, // expected_length + 0, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + SQL_PRED_NONE, // expected_searchable + SQL_TRUE); // expected_unsigned_column + + CheckSQLColAttributes(this->stmt, 4, + std::wstring(L"double_col"), // expected_column_name + SQL_DOUBLE, // expected_data_type + 24, // expected_display_size + SQL_FALSE, // expected_prec_scale + 8, // expected_length + 8, // 
expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + SQL_PRED_NONE, // expected_searchable + SQL_FALSE); // expected_unsigned_column +} + +TEST_F(ColumnsRemoteTest, TestSQLColAttributeAllTypes) { + // Test assumes there is a table $scratch.ODBCTest in remote server + + std::wstring wsql = L"SELECT * from $scratch.ODBCTest;"; + std::vector sql0(wsql.begin(), wsql.end()); + + ASSERT_EQ(SQL_SUCCESS, + SQLExecDirect(this->stmt, &sql0[0], static_cast(sql0.size()))); + + ASSERT_EQ(SQL_SUCCESS, SQLFetch(this->stmt)); + + CheckSQLColAttribute(this->stmt, 1, + std::wstring(L"sinteger_max"), // expected_column_name + SQL_INTEGER, // expected_data_type + SQL_INTEGER, // expected_concise_type + 11, // expected_display_size + SQL_FALSE, // expected_prec_scale + 4, // expected_length + std::wstring(L""), // expected_literal_prefix + std::wstring(L""), // expected_literal_suffix + 4, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + 10, // expected_num_prec_radix + 4, // expected_octet_length + SQL_SEARCHABLE, // expected_searchable + SQL_FALSE); // expected_unsigned_column + + CheckSQLColAttribute(this->stmt, 2, + std::wstring(L"sbigint_max"), // expected_column_name + SQL_BIGINT, // expected_data_type + SQL_BIGINT, // expected_concise_type + 20, // expected_display_size + SQL_FALSE, // expected_prec_scale + 8, // expected_length + std::wstring(L""), // expected_literal_prefix + std::wstring(L""), // expected_literal_suffix + 8, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + 10, // expected_num_prec_radix + 8, // expected_octet_length + SQL_SEARCHABLE, // expected_searchable + SQL_FALSE); // expected_unsigned_column + + CheckSQLColAttribute(this->stmt, 3, + std::wstring(L"decimal_positive"), // expected_column_name + SQL_DECIMAL, // expected_data_type + SQL_DECIMAL, // expected_concise_type + 40, // expected_display_size + 
SQL_FALSE, // expected_prec_scale + 19, // expected_length + std::wstring(L""), // expected_literal_prefix + std::wstring(L""), // expected_literal_suffix + 19, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + 10, // expected_num_prec_radix + 40, // expected_octet_length + SQL_SEARCHABLE, // expected_searchable + SQL_FALSE); // expected_unsigned_column + + CheckSQLColAttribute(this->stmt, 4, + std::wstring(L"float_max"), // expected_column_name + SQL_FLOAT, // expected_data_type + SQL_FLOAT, // expected_concise_type + 24, // expected_display_size + SQL_FALSE, // expected_prec_scale + 8, // expected_length + std::wstring(L""), // expected_literal_prefix + std::wstring(L""), // expected_literal_suffix + 8, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + 2, // expected_num_prec_radix + 8, // expected_octet_length + SQL_SEARCHABLE, // expected_searchable + SQL_FALSE); // expected_unsigned_column + + CheckSQLColAttribute(this->stmt, 5, + std::wstring(L"double_max"), // expected_column_name + SQL_DOUBLE, // expected_data_type + SQL_DOUBLE, // expected_concise_type + 24, // expected_display_size + SQL_FALSE, // expected_prec_scale + 8, // expected_length + std::wstring(L""), // expected_literal_prefix + std::wstring(L""), // expected_literal_suffix + 8, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + 2, // expected_num_prec_radix + 8, // expected_octet_length + SQL_SEARCHABLE, // expected_searchable + SQL_FALSE); // expected_unsigned_column + + CheckSQLColAttribute(this->stmt, 6, + std::wstring(L"bit_true"), // expected_column_name + SQL_BIT, // expected_data_type + SQL_BIT, // expected_concise_type + 1, // expected_display_size + SQL_FALSE, // expected_prec_scale + 1, // expected_length + std::wstring(L""), // expected_literal_prefix + std::wstring(L""), // expected_literal_suffix + 1, // expected_column_size 
+ 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + 0, // expected_num_prec_radix + 1, // expected_octet_length + SQL_SEARCHABLE, // expected_searchable + SQL_TRUE); // expected_unsigned_column + + CheckSQLColAttribute(this->stmt, 7, + std::wstring(L"date_max"), // expected_column_name + SQL_DATETIME, // expected_data_type + SQL_TYPE_DATE, // expected_concise_type + 10, // expected_display_size + SQL_FALSE, // expected_prec_scale + 10, // expected_length + std::wstring(L""), // expected_literal_prefix + std::wstring(L""), // expected_literal_suffix + 10, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + 0, // expected_num_prec_radix + 6, // expected_octet_length + SQL_SEARCHABLE, // expected_searchable + SQL_TRUE); // expected_unsigned_column + + CheckSQLColAttribute(this->stmt, 8, + std::wstring(L"time_max"), // expected_column_name + SQL_DATETIME, // expected_data_type + SQL_TYPE_TIME, // expected_concise_type + 12, // expected_display_size + SQL_FALSE, // expected_prec_scale + 12, // expected_length + std::wstring(L""), // expected_literal_prefix + std::wstring(L""), // expected_literal_suffix + 12, // expected_column_size + 3, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + 0, // expected_num_prec_radix + 6, // expected_octet_length + SQL_SEARCHABLE, // expected_searchable + SQL_TRUE); // expected_unsigned_column + + CheckSQLColAttribute(this->stmt, 9, + std::wstring(L"timestamp_max"), // expected_column_name + SQL_DATETIME, // expected_data_type + SQL_TYPE_TIMESTAMP, // expected_concise_type + 23, // expected_display_size + SQL_FALSE, // expected_prec_scale + 23, // expected_length + std::wstring(L""), // expected_literal_prefix + std::wstring(L""), // expected_literal_suffix + 23, // expected_column_size + 3, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + 0, // expected_num_prec_radix + 16, // expected_octet_length + 
SQL_SEARCHABLE, // expected_searchable + SQL_TRUE); // expected_unsigned_column +} + +TEST_F(ColumnsOdbcV2RemoteTest, TestSQLColAttributeAllTypesODBCVer2) { + // Test assumes there is a table $scratch.ODBCTest in remote server + std::wstring wsql = L"SELECT * from $scratch.ODBCTest;"; + std::vector sql0(wsql.begin(), wsql.end()); + + ASSERT_EQ(SQL_SUCCESS, + SQLExecDirect(this->stmt, &sql0[0], static_cast(sql0.size()))); + + ASSERT_EQ(SQL_SUCCESS, SQLFetch(this->stmt)); + + CheckSQLColAttribute(this->stmt, 1, + std::wstring(L"sinteger_max"), // expected_column_name + SQL_INTEGER, // expected_data_type + SQL_INTEGER, // expected_concise_type + 11, // expected_display_size + SQL_FALSE, // expected_prec_scale + 4, // expected_length + std::wstring(L""), // expected_literal_prefix + std::wstring(L""), // expected_literal_suffix + 4, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + 10, // expected_num_prec_radix + 4, // expected_octet_length + SQL_SEARCHABLE, // expected_searchable + SQL_FALSE); // expected_unsigned_column + + CheckSQLColAttribute(this->stmt, 2, + std::wstring(L"sbigint_max"), // expected_column_name + SQL_BIGINT, // expected_data_type + SQL_BIGINT, // expected_concise_type + 20, // expected_display_size + SQL_FALSE, // expected_prec_scale + 8, // expected_length + std::wstring(L""), // expected_literal_prefix + std::wstring(L""), // expected_literal_suffix + 8, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + 10, // expected_num_prec_radix + 8, // expected_octet_length + SQL_SEARCHABLE, // expected_searchable + SQL_FALSE); // expected_unsigned_column + + CheckSQLColAttribute(this->stmt, 3, + std::wstring(L"decimal_positive"), // expected_column_name + SQL_DECIMAL, // expected_data_type + SQL_DECIMAL, // expected_concise_type + 40, // expected_display_size + SQL_FALSE, // expected_prec_scale + 19, // expected_length + std::wstring(L""), // 
expected_literal_prefix + std::wstring(L""), // expected_literal_suffix + 19, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + 10, // expected_num_prec_radix + 40, // expected_octet_length + SQL_SEARCHABLE, // expected_searchable + SQL_FALSE); // expected_unsigned_column + + CheckSQLColAttribute(this->stmt, 4, + std::wstring(L"float_max"), // expected_column_name + SQL_FLOAT, // expected_data_type + SQL_FLOAT, // expected_concise_type + 24, // expected_display_size + SQL_FALSE, // expected_prec_scale + 8, // expected_length + std::wstring(L""), // expected_literal_prefix + std::wstring(L""), // expected_literal_suffix + 8, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + 2, // expected_num_prec_radix + 8, // expected_octet_length + SQL_SEARCHABLE, // expected_searchable + SQL_FALSE); // expected_unsigned_column + + CheckSQLColAttribute(this->stmt, 5, + std::wstring(L"double_max"), // expected_column_name + SQL_DOUBLE, // expected_data_type + SQL_DOUBLE, // expected_concise_type + 24, // expected_display_size + SQL_FALSE, // expected_prec_scale + 8, // expected_length + std::wstring(L""), // expected_literal_prefix + std::wstring(L""), // expected_literal_suffix + 8, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + 2, // expected_num_prec_radix + 8, // expected_octet_length + SQL_SEARCHABLE, // expected_searchable + SQL_FALSE); // expected_unsigned_column + + CheckSQLColAttribute(this->stmt, 6, + std::wstring(L"bit_true"), // expected_column_name + SQL_BIT, // expected_data_type + SQL_BIT, // expected_concise_type + 1, // expected_display_size + SQL_FALSE, // expected_prec_scale + 1, // expected_length + std::wstring(L""), // expected_literal_prefix + std::wstring(L""), // expected_literal_suffix + 1, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + 0, 
// expected_num_prec_radix + 1, // expected_octet_length + SQL_SEARCHABLE, // expected_searchable + SQL_TRUE); // expected_unsigned_column + + CheckSQLColAttribute(this->stmt, 7, + std::wstring(L"date_max"), // expected_column_name + SQL_DATETIME, // expected_data_type + SQL_DATE, // expected_concise_type + 10, // expected_display_size + SQL_FALSE, // expected_prec_scale + 10, // expected_length + std::wstring(L""), // expected_literal_prefix + std::wstring(L""), // expected_literal_suffix + 10, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + 0, // expected_num_prec_radix + 6, // expected_octet_length + SQL_SEARCHABLE, // expected_searchable + SQL_TRUE); // expected_unsigned_column + + CheckSQLColAttribute(this->stmt, 8, + std::wstring(L"time_max"), // expected_column_name + SQL_DATETIME, // expected_data_type + SQL_TIME, // expected_concise_type + 12, // expected_display_size + SQL_FALSE, // expected_prec_scale + 12, // expected_length + std::wstring(L""), // expected_literal_prefix + std::wstring(L""), // expected_literal_suffix + 12, // expected_column_size + 3, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + 0, // expected_num_prec_radix + 6, // expected_octet_length + SQL_SEARCHABLE, // expected_searchable + SQL_TRUE); // expected_unsigned_column + + CheckSQLColAttribute(this->stmt, 9, + std::wstring(L"timestamp_max"), // expected_column_name + SQL_DATETIME, // expected_data_type + SQL_TIMESTAMP, // expected_concise_type + 23, // expected_display_size + SQL_FALSE, // expected_prec_scale + 23, // expected_length + std::wstring(L""), // expected_literal_prefix + std::wstring(L""), // expected_literal_suffix + 23, // expected_column_size + 3, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + 0, // expected_num_prec_radix + 16, // expected_octet_length + SQL_SEARCHABLE, // expected_searchable + SQL_TRUE); // expected_unsigned_column +} + 
+TEST_F(ColumnsOdbcV2RemoteTest, TestSQLColAttributesAllTypesODBCVer2) { + // Tests ODBC 2.0 API SQLColAttributes + // Test assumes there is a table $scratch.ODBCTest in remote server + std::wstring wsql = L"SELECT * from $scratch.ODBCTest;"; + std::vector sql0(wsql.begin(), wsql.end()); + + ASSERT_EQ(SQL_SUCCESS, + SQLExecDirect(this->stmt, &sql0[0], static_cast(sql0.size()))); + + ASSERT_EQ(SQL_SUCCESS, SQLFetch(this->stmt)); + + CheckSQLColAttributes(this->stmt, 1, + std::wstring(L"sinteger_max"), // expected_column_name + SQL_INTEGER, // expected_data_type + 11, // expected_display_size + SQL_FALSE, // expected_prec_scale + 4, // expected_length + 4, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + SQL_SEARCHABLE, // expected_searchable + SQL_FALSE); // expected_unsigned_column + + CheckSQLColAttributes(this->stmt, 2, + std::wstring(L"sbigint_max"), // expected_column_name + SQL_BIGINT, // expected_data_type + 20, // expected_display_size + SQL_FALSE, // expected_prec_scale + 8, // expected_length + 8, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + SQL_SEARCHABLE, // expected_searchable + SQL_FALSE); // expected_unsigned_column + + CheckSQLColAttributes(this->stmt, 3, + std::wstring(L"decimal_positive"), // expected_column_name + SQL_DECIMAL, // expected_data_type + 40, // expected_display_size + SQL_FALSE, // expected_prec_scale + 19, // expected_length + 19, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + SQL_SEARCHABLE, // expected_searchable + SQL_FALSE); // expected_unsigned_column + + CheckSQLColAttributes(this->stmt, 4, + std::wstring(L"float_max"), // expected_column_name + SQL_FLOAT, // expected_data_type + 24, // expected_display_size + SQL_FALSE, // expected_prec_scale + 8, // expected_length + 8, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // 
expected_column_nullability + SQL_SEARCHABLE, // expected_searchable + SQL_FALSE); // expected_unsigned_column + + CheckSQLColAttributes(this->stmt, 5, + std::wstring(L"double_max"), // expected_column_name + SQL_DOUBLE, // expected_data_type + 24, // expected_display_size + SQL_FALSE, // expected_prec_scale + 8, // expected_length + 8, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + SQL_SEARCHABLE, // expected_searchable + SQL_FALSE); // expected_unsigned_column + + CheckSQLColAttributes(this->stmt, 6, + std::wstring(L"bit_true"), // expected_column_name + SQL_BIT, // expected_data_type + 1, // expected_display_size + SQL_FALSE, // expected_prec_scale + 1, // expected_length + 1, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + SQL_SEARCHABLE, // expected_searchable + SQL_TRUE); // expected_unsigned_column + + CheckSQLColAttributes(this->stmt, 7, + std::wstring(L"date_max"), // expected_column_name + SQL_DATE, // expected_data_type + 10, // expected_display_size + SQL_FALSE, // expected_prec_scale + 10, // expected_length + 10, // expected_column_size + 0, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + SQL_SEARCHABLE, // expected_searchable + SQL_TRUE); // expected_unsigned_column + + CheckSQLColAttributes(this->stmt, 8, + std::wstring(L"time_max"), // expected_column_name + SQL_TIME, // expected_data_type + 12, // expected_display_size + SQL_FALSE, // expected_prec_scale + 12, // expected_length + 12, // expected_column_size + 3, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + SQL_SEARCHABLE, // expected_searchable + SQL_TRUE); // expected_unsigned_column + + CheckSQLColAttributes(this->stmt, 9, + std::wstring(L"timestamp_max"), // expected_column_name + SQL_TIMESTAMP, // expected_data_type + 23, // expected_display_size + SQL_FALSE, // expected_prec_scale + 23, // expected_length + 23, // 
expected_column_size + 3, // expected_column_scale + SQL_NULLABLE, // expected_column_nullability + SQL_SEARCHABLE, // expected_searchable + SQL_TRUE); // expected_unsigned_column +} + +TYPED_TEST(ColumnsTest, TestSQLColAttributeCaseSensitive) { + // Arrow limitation: returns SQL_FALSE for case sensitive column + + std::wstring wsql = this->GetQueryAllDataTypes(); + // Int column + CheckSQLColAttributeNumeric(this->stmt, wsql, 1, SQL_DESC_CASE_SENSITIVE, SQL_FALSE); + SQLFreeStmt(this->stmt, SQL_CLOSE); + // Varchar column + CheckSQLColAttributeNumeric(this->stmt, wsql, 28, SQL_DESC_CASE_SENSITIVE, SQL_FALSE); +} + +TYPED_TEST(ColumnsOdbcV2Test, TestSQLColAttributesCaseSensitive) { + // Arrow limitation: returns SQL_FALSE for case sensitive column + // Tests ODBC 2.0 API SQLColAttributes + + std::wstring wsql = this->GetQueryAllDataTypes(); + // Int column + CheckSQLColAttributesNumeric(this->stmt, wsql, 1, SQL_COLUMN_CASE_SENSITIVE, SQL_FALSE); + SQLFreeStmt(this->stmt, SQL_CLOSE); + // Varchar column + CheckSQLColAttributesNumeric(this->stmt, wsql, 28, SQL_COLUMN_CASE_SENSITIVE, + SQL_FALSE); +} + +TEST_F(ColumnsMockTest, TestSQLColAttributeUniqueValue) { + // Mock server limitation: returns false for auto-increment column + this->CreateTableAllDataType(); + + std::wstring wsql = L"SELECT * from AllTypesTable;"; + CheckSQLColAttributeNumeric(this->stmt, wsql, 1, SQL_DESC_AUTO_UNIQUE_VALUE, SQL_FALSE); +} + +TEST_F(ColumnsOdbcV2MockTest, TestSQLColAttributesAutoIncrement) { + // Tests ODBC 2.0 API SQLColAttributes + // Mock server limitation: returns false for auto-increment column + this->CreateTableAllDataType(); + + std::wstring wsql = L"SELECT * from AllTypesTable;"; + CheckSQLColAttributeNumeric(this->stmt, wsql, 1, SQL_COLUMN_AUTO_INCREMENT, SQL_FALSE); +} + +TEST_F(ColumnsMockTest, TestSQLColAttributeBasetable_name) { + this->CreateTableAllDataType(); + + std::wstring wsql = L"SELECT * from AllTypesTable;"; + CheckSQLColAttributeString(this->stmt, wsql, 1, 
SQL_DESC_BASE_TABLE_NAME, + std::wstring(L"AllTypesTable")); +} + +TEST_F(ColumnsOdbcV2MockTest, TestSQLColAttributestable_name) { + // Tests ODBC 2.0 API SQLColAttributes + this->CreateTableAllDataType(); + + std::wstring wsql = L"SELECT * from AllTypesTable;"; + CheckSQLColAttributesString(this->stmt, wsql, 1, SQL_COLUMN_TABLE_NAME, + std::wstring(L"AllTypesTable")); +} + +TEST_F(ColumnsMockTest, TestSQLColAttributecatalog_name) { + // Mock server limitattion: mock doesn't return catalog for result metadata, + // and the defautl catalog should be 'main' + this->CreateTableAllDataType(); + + std::wstring wsql = L"SELECT * from AllTypesTable;"; + CheckSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_CATALOG_NAME, + std::wstring(L"")); +} + +TEST_F(ColumnsRemoteTest, TestSQLColAttributecatalog_name) { + // Remote server does not have catalogs + + std::wstring wsql = L"SELECT * from $scratch.ODBCTest;"; + CheckSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_CATALOG_NAME, + std::wstring(L"")); +} + +TEST_F(ColumnsOdbcV2MockTest, TestSQLColAttributesQualifierName) { + // Mock server limitattion: mock doesn't return catalog for result metadata, + // and the defautl catalog should be 'main' + // Tests ODBC 2.0 API SQLColAttributes + this->CreateTableAllDataType(); + + std::wstring wsql = L"SELECT * from AllTypesTable;"; + CheckSQLColAttributeString(this->stmt, wsql, 1, SQL_COLUMN_QUALIFIER_NAME, + std::wstring(L"")); +} + +TEST_F(ColumnsOdbcV2RemoteTest, TestSQLColAttributesQualifierName) { + // Remote server does not have catalogs + // Tests ODBC 2.0 API SQLColAttributes + std::wstring wsql = L"SELECT * from $scratch.ODBCTest;"; + CheckSQLColAttributeString(this->stmt, wsql, 1, SQL_COLUMN_QUALIFIER_NAME, + std::wstring(L"")); +} + +TYPED_TEST(ColumnsTest, TestSQLColAttributeCount) { + std::wstring wsql = this->GetQueryAllDataTypes(); + // Pass 0 as column number, driver should ignore it + CheckSQLColAttributeNumeric(this->stmt, wsql, 0, SQL_DESC_COUNT, 32); +} + 
+TEST_F(ColumnsMockTest, TestSQLColAttributeLocalTypeName) { + std::wstring wsql = this->GetQueryAllDataTypes(); + // Mock server doesn't have local type name + CheckSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_LOCAL_TYPE_NAME, + std::wstring(L"")); +} + +TEST_F(ColumnsRemoteTest, TestSQLColAttributeLocalTypeName) { + std::wstring wsql = this->GetQueryAllDataTypes(); + CheckSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_LOCAL_TYPE_NAME, + std::wstring(L"INTEGER")); +} + +TEST_F(ColumnsMockTest, TestSQLColAttributeschema_name) { + this->CreateTableAllDataType(); + + std::wstring wsql = L"SELECT * from AllTypesTable;"; + // Mock server doesn't have schemas + CheckSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_SCHEMA_NAME, + std::wstring(L"")); +} + +TEST_F(ColumnsRemoteTest, TestSQLColAttributeschema_name) { + // Test assumes there is a table $scratch.ODBCTest in remote server + + std::wstring wsql = L"SELECT * from $scratch.ODBCTest;"; + // Remote server limitation: doesn't return schema name, expected schema name is + // $scratch + CheckSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_SCHEMA_NAME, + std::wstring(L"")); +} + +TEST_F(ColumnsOdbcV2MockTest, TestSQLColAttributesOwnerName) { + // Tests ODBC 2.0 API SQLColAttributes + this->CreateTableAllDataType(); + + std::wstring wsql = L"SELECT * from AllTypesTable;"; + // Mock server doesn't have schemas + CheckSQLColAttributesString(this->stmt, wsql, 1, SQL_COLUMN_OWNER_NAME, + std::wstring(L"")); +} + +TEST_F(ColumnsOdbcV2RemoteTest, TestSQLColAttributesOwnerName) { + // Test assumes there is a table $scratch.ODBCTest in remote server + // Tests ODBC 2.0 API SQLColAttributes + std::wstring wsql = L"SELECT * from $scratch.ODBCTest;"; + // Remote server limitation: doesn't return schema name, expected schema name is + // $scratch + CheckSQLColAttributesString(this->stmt, wsql, 1, SQL_COLUMN_OWNER_NAME, + std::wstring(L"")); +} + +TEST_F(ColumnsMockTest, TestSQLColAttributetable_name) { + 
this->CreateTableAllDataType(); + + std::wstring wsql = L"SELECT * from AllTypesTable;"; + CheckSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_TABLE_NAME, + std::wstring(L"AllTypesTable")); +} + +TEST_F(ColumnsMockTest, TestSQLColAttributeTypeName) { + this->CreateTableAllDataType(); + + std::wstring wsql = L"SELECT * from AllTypesTable;"; + CheckSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_TYPE_NAME, + std::wstring(L"BIGINT")); + CheckSQLColAttributeString(this->stmt, L"", 2, SQL_DESC_TYPE_NAME, + std::wstring(L"WVARCHAR")); + CheckSQLColAttributeString(this->stmt, L"", 3, SQL_DESC_TYPE_NAME, + std::wstring(L"BINARY")); + CheckSQLColAttributeString(this->stmt, L"", 4, SQL_DESC_TYPE_NAME, + std::wstring(L"DOUBLE")); +} + +TEST_F(ColumnsRemoteTest, TestSQLColAttributeTypeName) { + std::wstring wsql = L"SELECT * from $scratch.ODBCTest;"; + CheckSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_TYPE_NAME, + std::wstring(L"INTEGER")); + CheckSQLColAttributeString(this->stmt, L"", 2, SQL_DESC_TYPE_NAME, + std::wstring(L"BIGINT")); + CheckSQLColAttributeString(this->stmt, L"", 3, SQL_DESC_TYPE_NAME, + std::wstring(L"DECIMAL")); + CheckSQLColAttributeString(this->stmt, L"", 4, SQL_DESC_TYPE_NAME, + std::wstring(L"FLOAT")); + CheckSQLColAttributeString(this->stmt, L"", 5, SQL_DESC_TYPE_NAME, + std::wstring(L"DOUBLE")); + CheckSQLColAttributeString(this->stmt, L"", 6, SQL_DESC_TYPE_NAME, + std::wstring(L"BOOLEAN")); + CheckSQLColAttributeString(this->stmt, L"", 7, SQL_DESC_TYPE_NAME, + std::wstring(L"DATE")); + CheckSQLColAttributeString(this->stmt, L"", 8, SQL_DESC_TYPE_NAME, + std::wstring(L"TIME")); + CheckSQLColAttributeString(this->stmt, L"", 9, SQL_DESC_TYPE_NAME, + std::wstring(L"TIMESTAMP")); +} + +TEST_F(ColumnsOdbcV2MockTest, TestSQLColAttributesTypeName) { + // Tests ODBC 2.0 API SQLColAttributes + this->CreateTableAllDataType(); + + std::wstring wsql = L"SELECT * from AllTypesTable;"; + // Mock server doesn't return data source-dependent data type 
name + CheckSQLColAttributesString(this->stmt, wsql, 1, SQL_COLUMN_TYPE_NAME, + std::wstring(L"BIGINT")); + CheckSQLColAttributesString(this->stmt, L"", 2, SQL_COLUMN_TYPE_NAME, + std::wstring(L"WVARCHAR")); + CheckSQLColAttributesString(this->stmt, L"", 3, SQL_COLUMN_TYPE_NAME, + std::wstring(L"BINARY")); + CheckSQLColAttributesString(this->stmt, L"", 4, SQL_COLUMN_TYPE_NAME, + std::wstring(L"DOUBLE")); +} + +TEST_F(ColumnsOdbcV2RemoteTest, TestSQLColAttributesTypeName) { + // Tests ODBC 2.0 API SQLColAttributes + std::wstring wsql = L"SELECT * from $scratch.ODBCTest;"; + CheckSQLColAttributesString(this->stmt, wsql, 1, SQL_COLUMN_TYPE_NAME, + std::wstring(L"INTEGER")); + CheckSQLColAttributesString(this->stmt, L"", 2, SQL_COLUMN_TYPE_NAME, + std::wstring(L"BIGINT")); + CheckSQLColAttributesString(this->stmt, L"", 3, SQL_COLUMN_TYPE_NAME, + std::wstring(L"DECIMAL")); + CheckSQLColAttributesString(this->stmt, L"", 4, SQL_COLUMN_TYPE_NAME, + std::wstring(L"FLOAT")); + CheckSQLColAttributesString(this->stmt, L"", 5, SQL_COLUMN_TYPE_NAME, + std::wstring(L"DOUBLE")); + CheckSQLColAttributesString(this->stmt, L"", 6, SQL_COLUMN_TYPE_NAME, + std::wstring(L"BOOLEAN")); + CheckSQLColAttributesString(this->stmt, L"", 7, SQL_COLUMN_TYPE_NAME, + std::wstring(L"DATE")); + CheckSQLColAttributesString(this->stmt, L"", 8, SQL_COLUMN_TYPE_NAME, + std::wstring(L"TIME")); + CheckSQLColAttributesString(this->stmt, L"", 9, SQL_COLUMN_TYPE_NAME, + std::wstring(L"TIMESTAMP")); +} + +TYPED_TEST(ColumnsTest, TestSQLColAttributeUnnamed) { + std::wstring wsql = this->GetQueryAllDataTypes(); + CheckSQLColAttributeNumeric(this->stmt, wsql, 1, SQL_DESC_UNNAMED, SQL_NAMED); +} + +TYPED_TEST(ColumnsTest, TestSQLColAttributeUpdatable) { + std::wstring wsql = this->GetQueryAllDataTypes(); + // Mock server and remote server do not return updatable information + CheckSQLColAttributeNumeric(this->stmt, wsql, 1, SQL_DESC_UPDATABLE, + SQL_ATTR_READWRITE_UNKNOWN); +} + +TYPED_TEST(ColumnsOdbcV2Test, 
TestSQLColAttributesUpdatable) { + // Tests ODBC 2.0 API SQLColAttributes + std::wstring wsql = this->GetQueryAllDataTypes(); + // Mock server and remote server do not return updatable information + CheckSQLColAttributesNumeric(this->stmt, wsql, 1, SQL_COLUMN_UPDATABLE, + SQL_ATTR_READWRITE_UNKNOWN); +} + +} // namespace arrow::flight::sql::odbc From f3e267e3224d937e042a638666d7014b3e928a25 Mon Sep 17 00:00:00 2001 From: "Alina (Xi) Li" Date: Mon, 10 Nov 2025 10:51:50 -0800 Subject: [PATCH 15/15] Develop utility getters for ODBC APIs --- .../flight/sql/odbc/tests/columns_test.cc | 283 ++++++++++-------- 1 file changed, 153 insertions(+), 130 deletions(-) diff --git a/cpp/src/arrow/flight/sql/odbc/tests/columns_test.cc b/cpp/src/arrow/flight/sql/odbc/tests/columns_test.cc index 3905d0474f8..9e42edf69a0 100644 --- a/cpp/src/arrow/flight/sql/odbc/tests/columns_test.cc +++ b/cpp/src/arrow/flight/sql/odbc/tests/columns_test.cc @@ -219,9 +219,8 @@ void CheckSQLColAttributes(SQLHSTMT stmt, SQLUSMALLINT idx, EXPECT_EQ(expected_unsigned_column, unsigned_col); } -void CheckSQLColAttributeString(SQLHSTMT stmt, const std::wstring& wsql, SQLUSMALLINT idx, - SQLUSMALLINT field_identifier, - const std::wstring& expected_attr_string) { +void GetSQLColAttributeString(SQLHSTMT stmt, const std::wstring& wsql, SQLUSMALLINT idx, + SQLUSMALLINT field_identifier, std::wstring& value) { if (!wsql.empty()) { // Execute query std::vector sql0(wsql.begin(), wsql.end()); @@ -238,28 +237,11 @@ void CheckSQLColAttributeString(SQLHSTMT stmt, const std::wstring& wsql, SQLUSMA ASSERT_EQ(SQL_SUCCESS, SQLColAttribute(stmt, idx, field_identifier, &str_val[0], (SQLSMALLINT)str_val.size(), &str_len, 0)); - std::wstring attr_str = ConvertToWString(str_val, str_len); - ASSERT_EQ(expected_attr_string, attr_str); + value = ConvertToWString(str_val, str_len); } -void CheckSQLColAttributeNumeric(SQLHSTMT stmt, const std::wstring& wsql, - SQLUSMALLINT idx, SQLUSMALLINT field_identifier, - SQLLEN 
expected_attr_numeric) { - // Execute query and check SQLColAttribute numeric attribute - std::vector sql0(wsql.begin(), wsql.end()); - ASSERT_EQ(SQL_SUCCESS, - SQLExecDirect(stmt, &sql0[0], static_cast(sql0.size()))); - - ASSERT_EQ(SQL_SUCCESS, SQLFetch(stmt)); - - SQLLEN num_val = 0; - ASSERT_EQ(SQL_SUCCESS, SQLColAttribute(stmt, idx, field_identifier, 0, 0, 0, &num_val)); - ASSERT_EQ(expected_attr_numeric, num_val); -} - -void CheckSQLColAttributesString(SQLHSTMT stmt, const std::wstring& wsql, - SQLUSMALLINT idx, SQLUSMALLINT field_identifier, - const std::wstring& expected_attr_string) { +void GetSQLColAttributesString(SQLHSTMT stmt, const std::wstring& wsql, SQLUSMALLINT idx, + SQLUSMALLINT field_identifier, std::wstring& value) { if (!wsql.empty()) { // Execute query std::vector sql0(wsql.begin(), wsql.end()); @@ -269,21 +251,19 @@ void CheckSQLColAttributesString(SQLHSTMT stmt, const std::wstring& wsql, ASSERT_EQ(SQL_SUCCESS, SQLFetch(stmt)); } - // check ODBC 2.0 API SQLColAttributes string attribute + // check SQLColAttribute string attribute std::vector str_val(kOdbcBufferSize); SQLSMALLINT str_len = 0; ASSERT_EQ(SQL_SUCCESS, SQLColAttributes(stmt, idx, field_identifier, &str_val[0], (SQLSMALLINT)str_val.size(), &str_len, 0)); - std::wstring attr_str = ConvertToWString(str_val, str_len); - ASSERT_EQ(expected_attr_string, attr_str); + value = ConvertToWString(str_val, str_len); } -void CheckSQLColAttributesNumeric(SQLHSTMT stmt, const std::wstring& wsql, - SQLUSMALLINT idx, SQLUSMALLINT field_identifier, - SQLLEN expected_attr_numeric) { - // Execute query and check ODBC 2.0 API SQLColAttributes numeric attribute +void GetSQLColAttributeNumeric(SQLHSTMT stmt, const std::wstring& wsql, SQLUSMALLINT idx, + SQLUSMALLINT field_identifier, SQLLEN* value) { + // Execute query and check SQLColAttribute numeric attribute std::vector sql0(wsql.begin(), wsql.end()); ASSERT_EQ(SQL_SUCCESS, SQLExecDirect(stmt, &sql0[0], static_cast(sql0.size()))); @@ -291,9 +271,20 
@@ void CheckSQLColAttributesNumeric(SQLHSTMT stmt, const std::wstring& wsql, ASSERT_EQ(SQL_SUCCESS, SQLFetch(stmt)); SQLLEN num_val = 0; + ASSERT_EQ(SQL_SUCCESS, SQLColAttribute(stmt, idx, field_identifier, 0, 0, 0, value)); +} + +void GetSQLColAttributesNumeric(SQLHSTMT stmt, const std::wstring& wsql, SQLUSMALLINT idx, + SQLUSMALLINT field_identifier, SQLLEN* value) { + // Execute query and check SQLColAttribute numeric attribute + std::vector sql0(wsql.begin(), wsql.end()); ASSERT_EQ(SQL_SUCCESS, - SQLColAttributes(stmt, idx, field_identifier, 0, 0, 0, &num_val)); - ASSERT_EQ(expected_attr_numeric, num_val); + SQLExecDirect(stmt, &sql0[0], static_cast(sql0.size()))); + + ASSERT_EQ(SQL_SUCCESS, SQLFetch(stmt)); + + SQLLEN num_val = 0; + ASSERT_EQ(SQL_SUCCESS, SQLColAttributes(stmt, idx, field_identifier, 0, 0, 0, value)); } } // namespace @@ -981,10 +972,13 @@ TYPED_TEST(ColumnsTest, TestSQLColAttributeCaseSensitive) { std::wstring wsql = this->GetQueryAllDataTypes(); // Int column - CheckSQLColAttributeNumeric(this->stmt, wsql, 1, SQL_DESC_CASE_SENSITIVE, SQL_FALSE); + SQLLEN value; + GetSQLColAttributeNumeric(this->stmt, wsql, 1, SQL_DESC_CASE_SENSITIVE, &value); + ASSERT_EQ(SQL_FALSE, value); SQLFreeStmt(this->stmt, SQL_CLOSE); // Varchar column - CheckSQLColAttributeNumeric(this->stmt, wsql, 28, SQL_DESC_CASE_SENSITIVE, SQL_FALSE); + GetSQLColAttributeNumeric(this->stmt, wsql, 28, SQL_DESC_CASE_SENSITIVE, &value); + ASSERT_EQ(SQL_FALSE, value); } TYPED_TEST(ColumnsOdbcV2Test, TestSQLColAttributesCaseSensitive) { @@ -993,11 +987,13 @@ TYPED_TEST(ColumnsOdbcV2Test, TestSQLColAttributesCaseSensitive) { std::wstring wsql = this->GetQueryAllDataTypes(); // Int column - CheckSQLColAttributesNumeric(this->stmt, wsql, 1, SQL_COLUMN_CASE_SENSITIVE, SQL_FALSE); + SQLLEN value; + GetSQLColAttributesNumeric(this->stmt, wsql, 1, SQL_COLUMN_CASE_SENSITIVE, &value); + ASSERT_EQ(SQL_FALSE, value); SQLFreeStmt(this->stmt, SQL_CLOSE); // Varchar column - 
CheckSQLColAttributesNumeric(this->stmt, wsql, 28, SQL_COLUMN_CASE_SENSITIVE, - SQL_FALSE); + GetSQLColAttributesNumeric(this->stmt, wsql, 28, SQL_COLUMN_CASE_SENSITIVE, &value); + ASSERT_EQ(SQL_FALSE, value); } TEST_F(ColumnsMockTest, TestSQLColAttributeUniqueValue) { @@ -1005,7 +1001,9 @@ TEST_F(ColumnsMockTest, TestSQLColAttributeUniqueValue) { this->CreateTableAllDataType(); std::wstring wsql = L"SELECT * from AllTypesTable;"; - CheckSQLColAttributeNumeric(this->stmt, wsql, 1, SQL_DESC_AUTO_UNIQUE_VALUE, SQL_FALSE); + SQLLEN value; + GetSQLColAttributeNumeric(this->stmt, wsql, 1, SQL_DESC_AUTO_UNIQUE_VALUE, &value); + ASSERT_EQ(SQL_FALSE, value); } TEST_F(ColumnsOdbcV2MockTest, TestSQLColAttributesAutoIncrement) { @@ -1014,42 +1012,48 @@ TEST_F(ColumnsOdbcV2MockTest, TestSQLColAttributesAutoIncrement) { this->CreateTableAllDataType(); std::wstring wsql = L"SELECT * from AllTypesTable;"; - CheckSQLColAttributeNumeric(this->stmt, wsql, 1, SQL_COLUMN_AUTO_INCREMENT, SQL_FALSE); + SQLLEN value; + GetSQLColAttributeNumeric(this->stmt, wsql, 1, SQL_COLUMN_AUTO_INCREMENT, &value); + ASSERT_EQ(SQL_FALSE, value); } -TEST_F(ColumnsMockTest, TestSQLColAttributeBasetable_name) { +TEST_F(ColumnsMockTest, TestSQLColAttributeBaseTableName) { this->CreateTableAllDataType(); std::wstring wsql = L"SELECT * from AllTypesTable;"; - CheckSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_BASE_TABLE_NAME, - std::wstring(L"AllTypesTable")); + std::wstring value; + GetSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_BASE_TABLE_NAME, value); + ASSERT_EQ(std::wstring(L"AllTypesTable"), value); } -TEST_F(ColumnsOdbcV2MockTest, TestSQLColAttributestable_name) { +TEST_F(ColumnsOdbcV2MockTest, TestSQLColAttributesTableName) { // Tests ODBC 2.0 API SQLColAttributes this->CreateTableAllDataType(); std::wstring wsql = L"SELECT * from AllTypesTable;"; - CheckSQLColAttributesString(this->stmt, wsql, 1, SQL_COLUMN_TABLE_NAME, - std::wstring(L"AllTypesTable")); + std::wstring value; + 
GetSQLColAttributesString(this->stmt, wsql, 1, SQL_COLUMN_TABLE_NAME, value); + ASSERT_EQ(std::wstring(L"AllTypesTable"), value); } -TEST_F(ColumnsMockTest, TestSQLColAttributecatalog_name) { +TEST_F(ColumnsMockTest, TestSQLColAttributeCatalogName) { // Mock server limitattion: mock doesn't return catalog for result metadata, // and the defautl catalog should be 'main' this->CreateTableAllDataType(); std::wstring wsql = L"SELECT * from AllTypesTable;"; - CheckSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_CATALOG_NAME, - std::wstring(L"")); + std::wstring value; + GetSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_CATALOG_NAME, value); + ASSERT_EQ(std::wstring(L""), value); } -TEST_F(ColumnsRemoteTest, TestSQLColAttributecatalog_name) { +TEST_F(ColumnsRemoteTest, TestSQLColAttributeCatalogName) { // Remote server does not have catalogs std::wstring wsql = L"SELECT * from $scratch.ODBCTest;"; - CheckSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_CATALOG_NAME, - std::wstring(L"")); + std::wstring value; + GetSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_CATALOG_NAME, value); + ASSERT_EQ(std::wstring(L""), value); } TEST_F(ColumnsOdbcV2MockTest, TestSQLColAttributesQualifierName) { @@ -1059,54 +1063,62 @@ TEST_F(ColumnsOdbcV2MockTest, TestSQLColAttributesQualifierName) { this->CreateTableAllDataType(); std::wstring wsql = L"SELECT * from AllTypesTable;"; - CheckSQLColAttributeString(this->stmt, wsql, 1, SQL_COLUMN_QUALIFIER_NAME, - std::wstring(L"")); + std::wstring value; + GetSQLColAttributeString(this->stmt, wsql, 1, SQL_COLUMN_QUALIFIER_NAME, value); + ASSERT_EQ(std::wstring(L""), value); } TEST_F(ColumnsOdbcV2RemoteTest, TestSQLColAttributesQualifierName) { // Remote server does not have catalogs // Tests ODBC 2.0 API SQLColAttributes std::wstring wsql = L"SELECT * from $scratch.ODBCTest;"; - CheckSQLColAttributeString(this->stmt, wsql, 1, SQL_COLUMN_QUALIFIER_NAME, - std::wstring(L"")); + std::wstring value; + 
GetSQLColAttributeString(this->stmt, wsql, 1, SQL_COLUMN_QUALIFIER_NAME, value); + ASSERT_EQ(std::wstring(L""), value); } TYPED_TEST(ColumnsTest, TestSQLColAttributeCount) { std::wstring wsql = this->GetQueryAllDataTypes(); // Pass 0 as column number, driver should ignore it - CheckSQLColAttributeNumeric(this->stmt, wsql, 0, SQL_DESC_COUNT, 32); + SQLLEN value; + GetSQLColAttributeNumeric(this->stmt, wsql, 0, SQL_DESC_COUNT, &value); + ASSERT_EQ(32, value); } TEST_F(ColumnsMockTest, TestSQLColAttributeLocalTypeName) { std::wstring wsql = this->GetQueryAllDataTypes(); // Mock server doesn't have local type name - CheckSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_LOCAL_TYPE_NAME, - std::wstring(L"")); + std::wstring value; + GetSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_LOCAL_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L""), value); } TEST_F(ColumnsRemoteTest, TestSQLColAttributeLocalTypeName) { std::wstring wsql = this->GetQueryAllDataTypes(); - CheckSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_LOCAL_TYPE_NAME, - std::wstring(L"INTEGER")); + std::wstring value; + GetSQLColAttributesString(this->stmt, wsql, 1, SQL_DESC_LOCAL_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"INTEGER"), value); } -TEST_F(ColumnsMockTest, TestSQLColAttributeschema_name) { +TEST_F(ColumnsMockTest, TestSQLColAttributeSchemaName) { this->CreateTableAllDataType(); std::wstring wsql = L"SELECT * from AllTypesTable;"; // Mock server doesn't have schemas - CheckSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_SCHEMA_NAME, - std::wstring(L"")); + std::wstring value; + GetSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_SCHEMA_NAME, value); + ASSERT_EQ(std::wstring(L""), value); } -TEST_F(ColumnsRemoteTest, TestSQLColAttributeschema_name) { +TEST_F(ColumnsRemoteTest, TestSQLColAttributeSchemaName) { // Test assumes there is a table $scratch.ODBCTest in remote server std::wstring wsql = L"SELECT * from $scratch.ODBCTest;"; // Remote server limitation: doesn't return 
schema name, expected schema name is // $scratch - CheckSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_SCHEMA_NAME, - std::wstring(L"")); + std::wstring value; + GetSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_SCHEMA_NAME, value); + ASSERT_EQ(std::wstring(L""), value); } TEST_F(ColumnsOdbcV2MockTest, TestSQLColAttributesOwnerName) { @@ -1115,8 +1127,9 @@ TEST_F(ColumnsOdbcV2MockTest, TestSQLColAttributesOwnerName) { std::wstring wsql = L"SELECT * from AllTypesTable;"; // Mock server doesn't have schemas - CheckSQLColAttributesString(this->stmt, wsql, 1, SQL_COLUMN_OWNER_NAME, - std::wstring(L"")); + std::wstring value; + GetSQLColAttributesString(this->stmt, wsql, 1, SQL_COLUMN_OWNER_NAME, value); + ASSERT_EQ(std::wstring(L""), value); } TEST_F(ColumnsOdbcV2RemoteTest, TestSQLColAttributesOwnerName) { @@ -1125,52 +1138,56 @@ TEST_F(ColumnsOdbcV2RemoteTest, TestSQLColAttributesOwnerName) { std::wstring wsql = L"SELECT * from $scratch.ODBCTest;"; // Remote server limitation: doesn't return schema name, expected schema name is // $scratch - CheckSQLColAttributesString(this->stmt, wsql, 1, SQL_COLUMN_OWNER_NAME, - std::wstring(L"")); + std::wstring value; + GetSQLColAttributesString(this->stmt, wsql, 1, SQL_COLUMN_OWNER_NAME, value); + ASSERT_EQ(std::wstring(L""), value); } -TEST_F(ColumnsMockTest, TestSQLColAttributetable_name) { +TEST_F(ColumnsMockTest, TestSQLColAttributeTableName) { this->CreateTableAllDataType(); std::wstring wsql = L"SELECT * from AllTypesTable;"; - CheckSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_TABLE_NAME, - std::wstring(L"AllTypesTable")); + std::wstring value; + GetSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_TABLE_NAME, value); + ASSERT_EQ(std::wstring(L"AllTypesTable"), value); } TEST_F(ColumnsMockTest, TestSQLColAttributeTypeName) { this->CreateTableAllDataType(); std::wstring wsql = L"SELECT * from AllTypesTable;"; - CheckSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_TYPE_NAME, - std::wstring(L"BIGINT")); 
- CheckSQLColAttributeString(this->stmt, L"", 2, SQL_DESC_TYPE_NAME, - std::wstring(L"WVARCHAR")); - CheckSQLColAttributeString(this->stmt, L"", 3, SQL_DESC_TYPE_NAME, - std::wstring(L"BINARY")); - CheckSQLColAttributeString(this->stmt, L"", 4, SQL_DESC_TYPE_NAME, - std::wstring(L"DOUBLE")); + std::wstring value; + GetSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"BIGINT"), value); + GetSQLColAttributeString(this->stmt, L"", 2, SQL_DESC_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"WVARCHAR"), value); + GetSQLColAttributeString(this->stmt, L"", 3, SQL_DESC_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"BINARY"), value); + GetSQLColAttributeString(this->stmt, L"", 4, SQL_DESC_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"DOUBLE"), value); } TEST_F(ColumnsRemoteTest, TestSQLColAttributeTypeName) { std::wstring wsql = L"SELECT * from $scratch.ODBCTest;"; - CheckSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_TYPE_NAME, - std::wstring(L"INTEGER")); - CheckSQLColAttributeString(this->stmt, L"", 2, SQL_DESC_TYPE_NAME, - std::wstring(L"BIGINT")); - CheckSQLColAttributeString(this->stmt, L"", 3, SQL_DESC_TYPE_NAME, - std::wstring(L"DECIMAL")); - CheckSQLColAttributeString(this->stmt, L"", 4, SQL_DESC_TYPE_NAME, - std::wstring(L"FLOAT")); - CheckSQLColAttributeString(this->stmt, L"", 5, SQL_DESC_TYPE_NAME, - std::wstring(L"DOUBLE")); - CheckSQLColAttributeString(this->stmt, L"", 6, SQL_DESC_TYPE_NAME, - std::wstring(L"BOOLEAN")); - CheckSQLColAttributeString(this->stmt, L"", 7, SQL_DESC_TYPE_NAME, - std::wstring(L"DATE")); - CheckSQLColAttributeString(this->stmt, L"", 8, SQL_DESC_TYPE_NAME, - std::wstring(L"TIME")); - CheckSQLColAttributeString(this->stmt, L"", 9, SQL_DESC_TYPE_NAME, - std::wstring(L"TIMESTAMP")); + std::wstring value; + GetSQLColAttributeString(this->stmt, wsql, 1, SQL_DESC_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"INTEGER"), value); + GetSQLColAttributeString(this->stmt, L"", 2, SQL_DESC_TYPE_NAME, 
value); + ASSERT_EQ(std::wstring(L"BIGINT"), value); + GetSQLColAttributeString(this->stmt, L"", 3, SQL_DESC_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"DECIMAL"), value); + GetSQLColAttributeString(this->stmt, L"", 4, SQL_DESC_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"FLOAT"), value); + GetSQLColAttributeString(this->stmt, L"", 5, SQL_DESC_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"DOUBLE"), value); + GetSQLColAttributeString(this->stmt, L"", 6, SQL_DESC_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"BOOLEAN"), value); + GetSQLColAttributeString(this->stmt, L"", 7, SQL_DESC_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"DATE"), value); + GetSQLColAttributeString(this->stmt, L"", 8, SQL_DESC_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"TIME"), value); + GetSQLColAttributeString(this->stmt, L"", 9, SQL_DESC_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"TIMESTAMP"), value); } TEST_F(ColumnsOdbcV2MockTest, TestSQLColAttributesTypeName) { @@ -1179,57 +1196,63 @@ TEST_F(ColumnsOdbcV2MockTest, TestSQLColAttributesTypeName) { std::wstring wsql = L"SELECT * from AllTypesTable;"; // Mock server doesn't return data source-dependent data type name - CheckSQLColAttributesString(this->stmt, wsql, 1, SQL_COLUMN_TYPE_NAME, - std::wstring(L"BIGINT")); - CheckSQLColAttributesString(this->stmt, L"", 2, SQL_COLUMN_TYPE_NAME, - std::wstring(L"WVARCHAR")); - CheckSQLColAttributesString(this->stmt, L"", 3, SQL_COLUMN_TYPE_NAME, - std::wstring(L"BINARY")); - CheckSQLColAttributesString(this->stmt, L"", 4, SQL_COLUMN_TYPE_NAME, - std::wstring(L"DOUBLE")); + std::wstring value; + GetSQLColAttributesString(this->stmt, wsql, 1, SQL_COLUMN_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"BIGINT"), value); + GetSQLColAttributesString(this->stmt, L"", 2, SQL_COLUMN_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"WVARCHAR"), value); + GetSQLColAttributesString(this->stmt, L"", 3, SQL_COLUMN_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"BINARY"), value); + GetSQLColAttributesString(this->stmt, 
L"", 4, SQL_COLUMN_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"DOUBLE"), value); } TEST_F(ColumnsOdbcV2RemoteTest, TestSQLColAttributesTypeName) { // Tests ODBC 2.0 API SQLColAttributes std::wstring wsql = L"SELECT * from $scratch.ODBCTest;"; - CheckSQLColAttributesString(this->stmt, wsql, 1, SQL_COLUMN_TYPE_NAME, - std::wstring(L"INTEGER")); - CheckSQLColAttributesString(this->stmt, L"", 2, SQL_COLUMN_TYPE_NAME, - std::wstring(L"BIGINT")); - CheckSQLColAttributesString(this->stmt, L"", 3, SQL_COLUMN_TYPE_NAME, - std::wstring(L"DECIMAL")); - CheckSQLColAttributesString(this->stmt, L"", 4, SQL_COLUMN_TYPE_NAME, - std::wstring(L"FLOAT")); - CheckSQLColAttributesString(this->stmt, L"", 5, SQL_COLUMN_TYPE_NAME, - std::wstring(L"DOUBLE")); - CheckSQLColAttributesString(this->stmt, L"", 6, SQL_COLUMN_TYPE_NAME, - std::wstring(L"BOOLEAN")); - CheckSQLColAttributesString(this->stmt, L"", 7, SQL_COLUMN_TYPE_NAME, - std::wstring(L"DATE")); - CheckSQLColAttributesString(this->stmt, L"", 8, SQL_COLUMN_TYPE_NAME, - std::wstring(L"TIME")); - CheckSQLColAttributesString(this->stmt, L"", 9, SQL_COLUMN_TYPE_NAME, - std::wstring(L"TIMESTAMP")); + std::wstring value; + GetSQLColAttributesString(this->stmt, wsql, 1, SQL_COLUMN_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"INTEGER"), value); + GetSQLColAttributesString(this->stmt, L"", 2, SQL_COLUMN_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"BIGINT"), value); + GetSQLColAttributesString(this->stmt, L"", 3, SQL_COLUMN_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"DECIMAL"), value); + GetSQLColAttributesString(this->stmt, L"", 4, SQL_COLUMN_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"FLOAT"), value); + GetSQLColAttributesString(this->stmt, L"", 5, SQL_COLUMN_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"DOUBLE"), value); + GetSQLColAttributesString(this->stmt, L"", 6, SQL_COLUMN_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"BOOLEAN"), value); + GetSQLColAttributesString(this->stmt, L"", 7, SQL_COLUMN_TYPE_NAME, value); + 
ASSERT_EQ(std::wstring(L"DATE"), value); + GetSQLColAttributesString(this->stmt, L"", 8, SQL_COLUMN_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"TIME"), value); + GetSQLColAttributesString(this->stmt, L"", 9, SQL_COLUMN_TYPE_NAME, value); + ASSERT_EQ(std::wstring(L"TIMESTAMP"), value); } TYPED_TEST(ColumnsTest, TestSQLColAttributeUnnamed) { std::wstring wsql = this->GetQueryAllDataTypes(); - CheckSQLColAttributeNumeric(this->stmt, wsql, 1, SQL_DESC_UNNAMED, SQL_NAMED); + SQLLEN value; + GetSQLColAttributeNumeric(this->stmt, wsql, 1, SQL_DESC_UNNAMED, &value); + ASSERT_EQ(SQL_NAMED, value); } TYPED_TEST(ColumnsTest, TestSQLColAttributeUpdatable) { std::wstring wsql = this->GetQueryAllDataTypes(); // Mock server and remote server do not return updatable information - CheckSQLColAttributeNumeric(this->stmt, wsql, 1, SQL_DESC_UPDATABLE, - SQL_ATTR_READWRITE_UNKNOWN); + SQLLEN value; + GetSQLColAttributeNumeric(this->stmt, wsql, 1, SQL_DESC_UPDATABLE, &value); + ASSERT_EQ(SQL_ATTR_READWRITE_UNKNOWN, value); } TYPED_TEST(ColumnsOdbcV2Test, TestSQLColAttributesUpdatable) { // Tests ODBC 2.0 API SQLColAttributes std::wstring wsql = this->GetQueryAllDataTypes(); // Mock server and remote server do not return updatable information - CheckSQLColAttributesNumeric(this->stmt, wsql, 1, SQL_COLUMN_UPDATABLE, - SQL_ATTR_READWRITE_UNKNOWN); + SQLLEN value; + GetSQLColAttributesNumeric(this->stmt, wsql, 1, SQL_COLUMN_UPDATABLE, &value); + ASSERT_EQ(SQL_ATTR_READWRITE_UNKNOWN, value); } } // namespace arrow::flight::sql::odbc