diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index 04391be0ac78..70245906f848 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -403,6 +403,16 @@ TEST(StringViewArray, Validate) { }), Ok()); + // Variadic buffer slots, when present, must contain real buffers. + EXPECT_THAT(MakeBinaryViewArray({nullptr}, + { + util::ToInlineBinaryView("hello"), + util::ToInlineBinaryView("world"), + }), + Raises(StatusCode::Invalid, + ::testing::HasSubstr("null variadic buffer at buffer index #2 " + "(variadic buffer index #0)"))); + // non-inline views are expected to reference only buffers managed by the array EXPECT_THAT( MakeBinaryViewArray( diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index c1d96375bf09..16bc9187af46 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -504,6 +504,12 @@ struct ValidateArrayImpl { : *layout.variadic_spec; if (buffer == nullptr) { + if (layout.variadic_spec && i >= static_cast(layout.buffers.size())) { + return Status::Invalid("Array of type ", type.ToString(), + " has a null variadic buffer at buffer index #", i, + " (variadic buffer index #", i - layout.buffers.size(), + ")"); + } continue; } int64_t min_buffer_size = 0; diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 82e74098d248..e0398b9c2aec 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -608,7 +608,8 @@ struct ArrayExporter { export_.variadic_buffer_sizes_.resize(variadic_buffers.size()); size_t i = 0; for (const auto& buf : variadic_buffers) { - export_.variadic_buffer_sizes_[i++] = buf->size(); + // The C Data Interface allows null variadic buffer pointers with size 0. + export_.variadic_buffer_sizes_[i++] = buf == nullptr ? 0 : buf->size(); } export_.buffers_.back() = export_.variadic_buffer_sizes_.data(); } diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index cb204806f90c..1e946c8114be 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -930,6 +930,30 @@ TEST_F(TestArrayExport, PrimitiveSliced) { TestPrimitive(factory); } +TEST_F(TestArrayExport, NullVariadicBuffers) { + // GH-49740: _export_to_c segmentation fault for binary_view array. + for (const auto& type : {binary_view(), utf8_view()}) { + auto arr = + MakeArray(ArrayData::Make(type, /*length=*/2, + {nullptr, + Buffer::FromVector(std::vector{ + util::ToInlineBinaryView("hello"), + util::ToInlineBinaryView("world"), + }), + nullptr})); + + struct ArrowArray c_export; + ASSERT_OK(ExportArray(*arr, &c_export)); + ArrayExportGuard guard(&c_export); + + ASSERT_EQ(c_export.n_buffers, 4); + ASSERT_EQ(c_export.buffers[2], nullptr); + ASSERT_NE(c_export.buffers[3], nullptr); + const auto* variadic_buffer_sizes = static_cast(c_export.buffers[3]); + ASSERT_EQ(variadic_buffer_sizes[0], 0); + } +} + constexpr std::string_view binary_view_buffer_content0 = "12345foo bar baz quux", binary_view_buffer_content1 = "BinaryViewMultipleBuffers"; @@ -2996,6 +3020,43 @@ TEST_F(TestArrayImport, String) { CheckImport(ArrayFromJSON(large_binary(), "[]")); } +TEST_F(TestArrayImport, NullVariadicBuffers) { + // The C Data Interface allows null variadic buffer pointers with size 0. + // Import normalizes them to non-null zero-size buffers in Arrow C++. + std::vector views = { + util::ToInlineBinaryView("hello"), + util::ToInlineBinaryView("world"), + }; + constexpr int64_t null_variadic_buffer_sizes[] = {0}; + const void* null_variadic_buffer[] = { + nullptr, + views.data(), + nullptr, + null_variadic_buffer_sizes, + }; + + for (const auto& type : {binary_view(), utf8_view()}) { + FillStringViewLike(/*length=*/2, /*null_count=*/0, /*offset=*/0, null_variadic_buffer, + /*data_buffer_count=*/1); + + ArrayReleaseCallback cb(&c_struct_); + ASSERT_OK_AND_ASSIGN(auto array, ImportArray(&c_struct_, type)); + ASSERT_TRUE(ArrowArrayIsReleased(&c_struct_)); + Reset(); + + ASSERT_OK(array->ValidateFull()); + AssertArraysEqual(*ArrayFromJSON(type, R"(["hello", "world"])"), *array, + /*verbose=*/true); + + ASSERT_EQ(array->data()->buffers.size(), 3); + ASSERT_NE(array->data()->buffers[2], nullptr); + ASSERT_EQ(array->data()->buffers[2]->size(), 0); + cb.AssertNotCalled(); + array.reset(); + cb.AssertCalled(); + } +} + TEST_F(TestArrayImport, StringWithOffset) { FillStringLike(3, 0, 1, string_buffers_no_nulls1); CheckImport(ArrayFromJSON(utf8(), R"(["", "bar", "quux"])")); diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index 4d0aa943ed90..52b79c8d452d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -438,8 +438,7 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou auto* out_views = output->GetMutableValues(1); - // If all entries are inline, we can drop the extra data buffer for - // large strings in output->buffers[2]. + // If all entries are inline, there are no variadic data buffers to expose. bool all_entries_are_inline = true; VisitSetBitRunsVoid( validity, output->offset, output->length, @@ -463,8 +462,9 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou } }); if (all_entries_are_inline) { - output->buffers[2] = nullptr; + output->buffers.resize(2); } + return Status::OK(); } @@ -508,11 +508,15 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou const int32_t fixed_size_width = input.type->byte_width(); const int64_t total_length = input.offset + input.length; + // Values of width <= BinaryViewType::kInlineSize are stored inline in the + // views, so the output only needs a variadic data buffer for larger widths. + const bool values_are_inline = fixed_size_width <= BinaryViewType::kInlineSize; + const size_t num_buffers = values_are_inline ? 2 : 3; ArrayData* output = out->array_data().get(); DCHECK_EQ(output->length, input.length); output->offset = input.offset; - output->buffers.resize(3); + output->buffers.resize(num_buffers); output->SetNullCount(input.null_count); // Share the validity bitmap buffer output->buffers[0] = input.GetBuffer(0); @@ -539,7 +543,7 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou } // Inline string and non-inline string loops - if (fixed_size_width <= BinaryViewType::kInlineSize) { + if (values_are_inline) { int32_t data_offset = static_cast(input.offset) * fixed_size_width; for (int64_t i = 0; i < input.length; i++) { auto& out_view = out_views[i]; diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 4ff58040e05e..51e6ca534cd5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -3307,9 +3307,12 @@ TEST(Cast, BinaryToString) { // N.B. null buffer is not always the same if input sliced AssertBufferSame(*invalid_utf8, *strings, 0); - // ARROW-16757: we no longer zero copy, but the contents are equal - ASSERT_NE(invalid_utf8->data()->buffers[1].get(), strings->data()->buffers[2].get()); - if (!is_binary_view_like(*string_type)) { + if (is_binary_view_like(*string_type)) { + ASSERT_EQ(strings->data()->buffers.size(), 2); + } else { + // ARROW-16757: we no longer zero copy, but the contents are equal + ASSERT_NE(invalid_utf8->data()->buffers[1].get(), + strings->data()->buffers[2].get()); ASSERT_TRUE(invalid_utf8->data()->buffers[1]->Equals(*strings->data()->buffers[2])); } } @@ -3349,9 +3352,12 @@ TEST(Cast, BinaryOrStringToBinary) { // N.B. null buffer is not always the same if input sliced AssertBufferSame(*invalid_utf8, *strings, 0); - // ARROW-16757: we no longer zero copy, but the contents are equal - ASSERT_NE(invalid_utf8->data()->buffers[1].get(), strings->data()->buffers[2].get()); - if (!is_binary_view_like(*to_type)) { + if (is_binary_view_like(*to_type)) { + ASSERT_EQ(strings->data()->buffers.size(), 2); + } else { + // ARROW-16757: we no longer zero copy, but the contents are equal + ASSERT_NE(invalid_utf8->data()->buffers[1].get(), + strings->data()->buffers[2].get()); ASSERT_TRUE(invalid_utf8->data()->buffers[1]->Equals(*strings->data()->buffers[2])); } @@ -3361,6 +3367,75 @@ TEST(Cast, BinaryOrStringToBinary) { } } +TEST(Cast, BinaryOrStringToView) { + // GH-49740: when all values were inline, the cast left a null variadic + // buffer slot behind and exporting to the C data interface crashed. + const std::vector> cases = { + // Empty inputs are handled before the cast kernel runs. + {"[]", 2}, + {R"(["a", null, "e"])", 2}, + {"[null, null]", 2}, + {R"(["aaaaaaaaaaaaa", null, "eeeeeeeeeeeee"])", 3}, + {R"(["a", null, "aaaaaaaaaaaaa"])", 3}, + }; + + for (auto from_type : {binary(), large_binary(), utf8(), large_utf8()}) { + for (auto to_type : {binary_view(), utf8_view()}) { + for (const auto& [json, expected_num_buffers] : cases) { + auto input = ArrayFromJSON(from_type, json); + auto expected = ArrayFromJSON(to_type, json); + + ASSERT_OK_AND_ASSIGN(auto casted, Cast(*input, to_type)); + ValidateOutput(*casted); + AssertArraysEqual(*expected, *casted); + + ASSERT_EQ(casted->data()->buffers.size(), expected_num_buffers) + << "from: " << from_type->ToString() << ", to: " << to_type->ToString() + << ", values: " << json; + if (expected_num_buffers == 3) { + ASSERT_NE(casted->data()->buffers[2], nullptr) + << "from: " << from_type->ToString() << ", to: " << to_type->ToString() + << ", values: " << json; + } + } + } + } +} + +TEST(Cast, FixedSizeBinaryToView) { + // Fixed-size binary uses a separate cast kernel with the same GH-49740 + // issue. Cover non-empty widths on both sides of the BinaryView inline limit. + const std::vector, std::string, int64_t>> cases = { + // Empty inputs are handled before the cast kernel runs. + {fixed_size_binary(1), "[]", 2}, + {fixed_size_binary(13), "[]", 2}, + {fixed_size_binary(1), R"(["a", null, "e"])", 2}, + {fixed_size_binary(1), "[null, null]", 2}, + {fixed_size_binary(12), R"(["aaaaaaaaaaaa", null])", 2}, + {fixed_size_binary(13), R"(["aaaaaaaaaaaaa", null, "eeeeeeeeeeeee"])", 3}, + }; + + for (const auto& [from_type, json, expected_num_buffers] : cases) { + for (auto to_type : {binary_view(), utf8_view()}) { + auto input = ArrayFromJSON(from_type, json); + auto expected = ArrayFromJSON(to_type, json); + + ASSERT_OK_AND_ASSIGN(auto casted, Cast(*input, to_type)); + ValidateOutput(*casted); + AssertArraysEqual(*expected, *casted); + + ASSERT_EQ(casted->data()->buffers.size(), expected_num_buffers) + << "from: " << from_type->ToString() << ", to: " << to_type->ToString() + << ", values: " << json; + if (expected_num_buffers == 3) { + ASSERT_NE(casted->data()->buffers[2], nullptr) + << "from: " << from_type->ToString() << ", to: " << to_type->ToString() + << ", values: " << json; + } + } + } +} + TEST(Cast, StringToString) { for (auto from_type : {utf8(), utf8_view(), large_utf8()}) { for (auto to_type : {utf8(), utf8_view(), large_utf8()}) { diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 3dbab0dfc174..85e9f221c3d3 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -144,6 +144,7 @@ def generate_column(self, size, name=None): TEST_INT_MAX = 2 ** 31 - 1 TEST_INT_MIN = ~TEST_INT_MAX +BINARY_VIEW_INLINE_SIZE = 12 class IntegerField(PrimitiveField): @@ -625,14 +626,19 @@ def column_class(self): def _get_type(self): return OrderedDict([('name', 'utf8')]) + def _random_string_size(self): + return 7 + + def _random_value(self): + return tobytes(random_utf8(self._random_string_size())) + def generate_column(self, size, name=None): - K = 7 is_valid = self._make_is_valid(size) values = [] for i in range(size): if is_valid[i]: - values.append(tobytes(random_utf8(K))) + values.append(self._random_value()) else: values.append(b"") @@ -671,6 +677,13 @@ def _get_type(self): return OrderedDict([('name', 'binaryview')]) +class InlineBinaryViewField(BinaryViewField): + # Generate only inline values, leaving no variadic data buffers. + + def _random_sizes(self, size): + return np.arange(size, dtype=np.int32) % (BINARY_VIEW_INLINE_SIZE + 1) + + class StringViewField(StringField): @property @@ -681,6 +694,19 @@ def _get_type(self): return OrderedDict([('name', 'utf8view')]) +class InlineStringViewField(StringViewField): + + def _random_string_size(self): + # The test alphabet contains up to 3-byte UTF-8 code points, so four + # characters fit in the 12-byte inline representation. + return 4 + + def _random_value(self): + value = super()._random_value() + assert len(value) <= BINARY_VIEW_INLINE_SIZE + return value + + class Schema(object): def __init__(self, fields, metadata=None): @@ -771,14 +797,13 @@ def _get_buffers(self): # a small default data buffer size is used so we can exercise # arrays with multiple data buffers with small data sets DEFAULT_BUFFER_SIZE = 32 - INLINE_SIZE = 12 for i, v in enumerate(self.values): if not self.is_valid[i]: v = b'' assert isinstance(v, bytes) - if len(v) <= INLINE_SIZE: + if len(v) <= BINARY_VIEW_INLINE_SIZE: # Append an inline view, skip data buffer management. views.append(OrderedDict([ ('SIZE', len(v)), @@ -1784,6 +1809,8 @@ def generate_binary_view_case(): fields = [ BinaryViewField('bv'), StringViewField('sv'), + InlineBinaryViewField('bv_inline'), + InlineStringViewField('sv_inline'), ] batch_sizes = [0, 7, 256] return _generate_file("binary_view", fields, batch_sizes) diff --git a/python/pyarrow/tests/test_cffi.py b/python/pyarrow/tests/test_cffi.py index 481c387d5337..0d9fd7245972 100644 --- a/python/pyarrow/tests/test_cffi.py +++ b/python/pyarrow/tests/test_cffi.py @@ -234,6 +234,39 @@ def test_export_import_array(): ) +@needs_cffi +def test_export_cast_binary_view_all_inline(): + # GH-49740: _export_to_c segmentation fault for binary_view array. + c_schema = ffi.new("struct ArrowSchema*") + ptr_schema = int(ffi.cast("uintptr_t", c_schema)) + c_array = ffi.new("struct ArrowArray*") + ptr_array = int(ffi.cast("uintptr_t", c_array)) + + gc.collect() # Make sure no Arrow data dangles in a ref cycle + old_allocated = pa.total_allocated_bytes() + + # The cast used to leave a null variadic buffer slot behind when all + # values were inline. + arr = pa.array([b"a", None, b"e"], type=pa.binary()).cast(pa.binary_view()) + arr.validate(full=True) + py_value = arr.to_pylist() + arr._export_to_c(ptr_array, ptr_schema) + + assert c_array.length == 3 + # validity, views and the appended variadic buffer sizes + assert c_array.n_buffers == 3 + + del arr + arr_new = pa.Array._import_from_c(ptr_array, ptr_schema) + assert arr_new.to_pylist() == py_value + assert arr_new.type == pa.binary_view() + del arr_new + assert pa.total_allocated_bytes() == old_allocated + # Now released + with assert_schema_released: + pa.Array._import_from_c(ptr_array, ptr_schema) + + @needs_cffi def test_export_import_device_array(): check_export_import_array(