diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc
index edf23e970d0..d5bb1a8c1e9 100644
--- a/cpp/src/arrow/chunked_array.cc
+++ b/cpp/src/arrow/chunked_array.cc
@@ -62,6 +62,14 @@ ChunkedArray::ChunkedArray(ArrayVector chunks, std::shared_ptr<DataType> type)
   }
 }
 
+int64_t ChunkedArray::ComputeLogicalNullCount() const {
+  int64_t count = 0;
+  for (const auto& chunk : chunks_) {
+    count += chunk->ComputeLogicalNullCount();
+  }
+  return count;
+}
+
 Result<std::shared_ptr<ChunkedArray>> ChunkedArray::Make(ArrayVector chunks,
                                                          std::shared_ptr<DataType> type) {
   if (type == nullptr) {
diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h
index 2b581d0bb6a..82e173ca31c 100644
--- a/cpp/src/arrow/chunked_array.h
+++ b/cpp/src/arrow/chunked_array.h
@@ -108,6 +108,16 @@ class ARROW_EXPORT ChunkedArray {
   /// \return the total number of nulls among all chunks
   int64_t null_count() const { return null_count_; }
 
+  /// \brief Computes the logical null count across all chunks
+  ///
+  /// This returns the sum of Array::ComputeLogicalNullCount() over the chunks.
+  /// Unlike null_count(), it accounts for types that carry logical nulls
+  /// without a validity bitmap, such as union and run-end encoded arrays; for
+  /// those types the count is recomputed on every call.
+  ///
+  /// \see Array::ComputeLogicalNullCount
+  int64_t ComputeLogicalNullCount() const;
+
   /// \return the total number of chunks in the chunked array
   int num_chunks() const { return static_cast<int>(chunks_.size()); }
 
diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc
index 90b90a731b6..1862e308b38 100644
--- a/cpp/src/arrow/chunked_array_test.cc
+++ b/cpp/src/arrow/chunked_array_test.cc
@@ -23,6 +23,7 @@
 #include <memory>
 #include <vector>
 
+#include "arrow/array/builder_run_end.h"
 #include "arrow/chunk_resolver.h"
 #include "arrow/scalar.h"
 #include "arrow/status.h"
@@ -76,6 +77,37 @@ TEST_F(TestChunkedArray, Make) {
   ASSERT_RAISES(TypeError, ChunkedArray::Make({chunk0}, int16()));
 }
 
+TEST_F(TestChunkedArray, ComputeLogicalNullCount) {
+  // For types with a validity bitmap, the logical null count matches
+  // null_count() (the sum over chunks).
+  auto chunk0 = ArrayFromJSON(int32(), "[1, null, 3]");
+  auto chunk1 = ArrayFromJSON(int32(), "[null, 5]");
+  ChunkedArray with_bitmap({chunk0, chunk1});
+  ASSERT_EQ(with_bitmap.null_count(), 2);
+  ASSERT_EQ(with_bitmap.ComputeLogicalNullCount(), 2);
+
+  // An empty chunked array has no logical nulls.
+  ASSERT_OK_AND_ASSIGN(auto empty, ChunkedArray::MakeEmpty(int32()));
+  ASSERT_EQ(empty->ComputeLogicalNullCount(), 0);
+
+  // Run-end encoded arrays carry logical nulls without a top-level validity
+  // bitmap, so null_count() is 0 while the logical null count is not.
+  auto pool = default_memory_pool();
+  auto ree_type = run_end_encoded(int32(), int32());
+  RunEndEncodedBuilder ree_builder(pool, std::make_shared<Int32Builder>(pool),
+                                   std::make_shared<Int32Builder>(pool), ree_type);
+  ASSERT_OK(ree_builder.AppendScalar(*MakeScalar(2), 2));
+  ASSERT_OK(ree_builder.AppendNulls(3));
+  ASSERT_OK_AND_ASSIGN(auto ree_chunk0, ree_builder.Finish());
+  ASSERT_OK(ree_builder.AppendNulls(4));
+  ASSERT_OK(ree_builder.AppendScalar(*MakeScalar(8), 5));
+  ASSERT_OK_AND_ASSIGN(auto ree_chunk1, ree_builder.Finish());
+
+  ChunkedArray ree_ca({ree_chunk0, ree_chunk1}, ree_type);
+  ASSERT_EQ(ree_ca.null_count(), 0);
+  ASSERT_EQ(ree_ca.ComputeLogicalNullCount(), 7);
+}
+
 TEST_F(TestChunkedArray, MakeEmpty) {
   ASSERT_OK_AND_ASSIGN(std::shared_ptr<ChunkedArray> empty,
                        ChunkedArray::MakeEmpty(int64()));