diff --git a/include/paimon/data/blob.h b/include/paimon/data/blob.h index 9396bbb95..aa0cb54cf 100644 --- a/include/paimon/data/blob.h +++ b/include/paimon/data/blob.h @@ -97,7 +97,8 @@ class PAIMON_EXPORT Blob { /// @param metadata A map of key-value metadata to be attached to the field. /// @return A result containing a unique pointer to the generated `ArrowSchema` or an error. static Result> ArrowField( - const std::string& field_name, std::unordered_map metadata = {}); + const std::string& field_name, bool nullable = false, + std::unordered_map metadata = {}); private: class Impl; diff --git a/include/paimon/defs.h b/include/paimon/defs.h index 400e59e84..70f2aa0b0 100644 --- a/include/paimon/defs.h +++ b/include/paimon/defs.h @@ -365,7 +365,7 @@ struct PAIMON_EXPORT Options { /// "partition.legacy-name" - The legacy partition name is using `ToString` for all types. If /// false, using casting to string for all types. Default value is "true". static const char PARTITION_GENERATE_LEGACY_NAME[]; - /// "blob-as-descriptor" - Read and write blob field using blob descriptor rather than blob + /// "blob-as-descriptor" - Read blob field using blob descriptor rather than blob /// bytes. Default value is "false". static const char BLOB_AS_DESCRIPTOR[]; /// "blob-field" - Specifies column names that should be stored as blob type. 
This is used diff --git a/src/paimon/CMakeLists.txt b/src/paimon/CMakeLists.txt index a517184a4..77ff23131 100644 --- a/src/paimon/CMakeLists.txt +++ b/src/paimon/CMakeLists.txt @@ -159,6 +159,7 @@ set(PAIMON_CORE_SRCS core/bucket/hive_bucket_function.cpp core/bucket/mod_bucket_function.cpp core/bucket/bucket_id_calculator.cpp + core/casting/binary_to_blob_cast_executor.cpp core/casting/binary_to_string_cast_executor.cpp core/casting/boolean_to_decimal_cast_executor.cpp core/casting/boolean_to_numeric_cast_executor.cpp @@ -222,6 +223,7 @@ set(PAIMON_CORE_SRCS core/io/key_value_meta_projection_consumer.cpp core/io/key_value_projection_consumer.cpp core/io/key_value_projection_reader.cpp + core/io/external_storage_blob_writer.cpp core/io/multiple_blob_file_writer.cpp core/io/rolling_blob_file_writer.cpp core/manifest/file_kind.cpp @@ -601,6 +603,7 @@ if(PAIMON_BUILD_TESTS) core/io/file_index_evaluator_test.cpp core/io/single_file_writer_test.cpp core/io/rolling_blob_file_writer_test.cpp + core/io/external_storage_blob_writer_test.cpp core/global_index/indexed_split_test.cpp core/manifest/file_source_test.cpp core/manifest/file_kind_test.cpp diff --git a/src/paimon/common/data/blob.cpp b/src/paimon/common/data/blob.cpp index 14b1ae099..e17455f1e 100644 --- a/src/paimon/common/data/blob.cpp +++ b/src/paimon/common/data/blob.cpp @@ -105,8 +105,9 @@ Result> Blob::ToData(const std::shared_ptr& } Result> Blob::ArrowField( - const std::string& field_name, std::unordered_map metadata) { - auto blob_field = BlobUtils::ToArrowField(field_name, /*nullable=*/false, metadata); + const std::string& field_name, bool nullable, + std::unordered_map metadata) { + auto blob_field = BlobUtils::ToArrowField(field_name, nullable, metadata); auto field = std::make_unique<::ArrowSchema>(); PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportField(*blob_field, field.get())); return field; diff --git a/src/paimon/common/data/blob_descriptor.h b/src/paimon/common/data/blob_descriptor.h index 
526a1050a..090577787 100644 --- a/src/paimon/common/data/blob_descriptor.h +++ b/src/paimon/common/data/blob_descriptor.h @@ -38,7 +38,7 @@ namespace paimon { /// | 13 + N | offset | long | 8 | /// | 21 + N | length | long | 8 | -class BlobDescriptor { +class PAIMON_EXPORT BlobDescriptor { public: static Result> Create(const std::string& uri, int64_t offset, int64_t length); diff --git a/src/paimon/common/data/blob_test.cpp b/src/paimon/common/data/blob_test.cpp index a105ea1f6..9a6f709e0 100644 --- a/src/paimon/common/data/blob_test.cpp +++ b/src/paimon/common/data/blob_test.cpp @@ -144,38 +144,34 @@ TEST_F(BlobTest, TestNewInputStreamWithDynamicLength) { } TEST_F(BlobTest, TestArrowField) { - { - // basic: field name, non-nullable by default - ASSERT_OK_AND_ASSIGN(auto schema, Blob::ArrowField("my_blob")); + for (bool nullable : {false, true}) { + ASSERT_OK_AND_ASSIGN(auto schema, Blob::ArrowField("my_blob", nullable)); ASSERT_NE(schema, nullptr); - // import back to arrow::Field to verify auto field_result = arrow::ImportField(schema.get()); ASSERT_TRUE(field_result.ok()); auto field = field_result.ValueUnsafe(); ASSERT_EQ(field->name(), "my_blob"); ASSERT_EQ(field->type()->id(), arrow::Type::LARGE_BINARY); - ASSERT_FALSE(field->nullable()); + ASSERT_EQ(field->nullable(), nullable); ASSERT_TRUE(field->HasMetadata()); auto extension_type = field->metadata()->Get("paimon.extension.type"); ASSERT_TRUE(extension_type.ok()); ASSERT_EQ(extension_type.ValueUnsafe(), "paimon.type.blob"); } { - // with custom metadata std::unordered_map custom_metadata = { {"custom_key", "custom_value"}}; - ASSERT_OK_AND_ASSIGN(auto schema, Blob::ArrowField("meta_blob", custom_metadata)); + ASSERT_OK_AND_ASSIGN(auto schema, + Blob::ArrowField("meta_blob", /*nullable=*/false, custom_metadata)); auto field = arrow::ImportField(schema.get()).ValueUnsafe(); ASSERT_EQ(field->name(), "meta_blob"); ASSERT_FALSE(field->nullable()); ASSERT_TRUE(field->HasMetadata()); - // blob extension metadata 
should be present auto extension_type = field->metadata()->Get("paimon.extension.type"); ASSERT_TRUE(extension_type.ok()); ASSERT_EQ(extension_type.ValueUnsafe(), "paimon.type.blob"); - // custom metadata should also be present auto custom_val = field->metadata()->Get("custom_key"); ASSERT_TRUE(custom_val.ok()); ASSERT_EQ(custom_val.ValueUnsafe(), "custom_value"); diff --git a/src/paimon/common/data/blob_utils.cpp b/src/paimon/common/data/blob_utils.cpp index 84835071d..75ff4302a 100644 --- a/src/paimon/common/data/blob_utils.cpp +++ b/src/paimon/common/data/blob_utils.cpp @@ -17,65 +17,77 @@ #include "paimon/common/data/blob_utils.h" #include -#include +#include #include #include "arrow/api.h" #include "arrow/array/array_nested.h" #include "arrow/type.h" +#include "fmt/format.h" #include "paimon/common/data/blob_defs.h" +#include "paimon/common/data/blob_descriptor.h" +#include "paimon/common/types/data_field.h" #include "paimon/common/utils/arrow/status_utils.h" #include "paimon/common/utils/string_utils.h" - namespace arrow { class Array; } namespace paimon { - BlobUtils::SeparatedSchemas BlobUtils::SeparateBlobSchema( - const std::shared_ptr& schema) { - std::vector> remaining_fields; + const std::shared_ptr& schema, const std::set& inline_fields) { + std::vector> main_fields; std::vector> blob_fields; - for (auto i = 0; i < schema->num_fields(); i++) { + for (int32_t i = 0; i < schema->num_fields(); i++) { auto field = schema->field(i); - if (IsBlobField(field)) { + if (IsBlobField(field) && inline_fields.count(field->name()) == 0) { + // Non-inline BLOB -> goes to blob file blob_fields.emplace_back(field); } else { - remaining_fields.emplace_back(field); + // Non-blob fields OR inline BLOB fields -> stay in main + main_fields.emplace_back(field); } } SeparatedSchemas result; - result.main_schema = arrow::schema(remaining_fields); + result.main_schema = arrow::schema(main_fields); result.blob_schema = arrow::schema(blob_fields); return result; } Result 
BlobUtils::SeparateBlobArray( - const std::shared_ptr& struct_array) { + const std::shared_ptr& struct_array, + const std::set& inline_fields) { std::shared_ptr old_type = std::static_pointer_cast(struct_array->type()); const auto& old_fields = old_type->fields(); const auto& old_arrays = struct_array->fields(); - std::vector> remaining_fields; - std::vector> remaining_arrays; - std::vector> blob_fields; - std::vector> blob_arrays; + arrow::ArrayVector main_arrays; + arrow::ArrayVector blob_arrays; + arrow::FieldVector main_fields; + arrow::FieldVector blob_fields; for (size_t i = 0; i < old_fields.size(); i++) { - if (IsBlobField(old_fields[i])) { + if (IsBlobField(old_fields[i]) && inline_fields.count(old_fields[i]->name()) == 0) { blob_fields.push_back(old_fields[i]); blob_arrays.push_back(old_arrays[i]); } else { - remaining_fields.push_back(old_fields[i]); - remaining_arrays.push_back(old_arrays[i]); + main_fields.push_back(old_fields[i]); + main_arrays.push_back(old_arrays[i]); } } + if (blob_fields.empty()) { + return Status::Invalid( + "SeparateBlobArray expects at least one non-inline blob field, but got none."); + } + if (main_fields.empty()) { + return Status::Invalid("SeparateBlobArray expects at least one main field, but got none."); + } + SeparatedStructArrays result; PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(result.main_array, - arrow::StructArray::Make(remaining_arrays, remaining_fields)); + arrow::StructArray::Make(main_arrays, main_fields)); PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(result.blob_array, arrow::StructArray::Make(blob_arrays, blob_fields)); return result; @@ -114,4 +126,66 @@ std::shared_ptr BlobUtils::ToArrowField( return arrow::field(field_name, arrow::large_binary(), nullable, std::make_shared(metadata)); } + +Status BlobUtils::ValidateInlineBlobDescriptors( + const std::shared_ptr& struct_array, + const std::set& inline_descriptor_fields) { + if (inline_descriptor_fields.empty()) { + return Status::OK(); + } + if (!struct_array) { + return 
Status::Invalid("array in ValidateInlineBlobDescriptors must be a struct_array"); + } + for (const auto& field_name : inline_descriptor_fields) { + auto field_array = struct_array->GetFieldByName(field_name); + if (!field_array) { + continue; + } + const auto* binary_array = + arrow::internal::checked_cast(field_array.get()); + if (!binary_array) { + return Status::Invalid( + fmt::format("cannot cast array for field {} to LargeBinaryArray", field_name)); + } + for (int64_t row = 0; row < binary_array->length(); ++row) { + if (binary_array->IsNull(row)) { + continue; + } + auto value = binary_array->GetView(row); + PAIMON_ASSIGN_OR_RAISE(bool is_descriptor, + BlobDescriptor::IsBlobDescriptor(value.data(), value.size())); + if (!is_descriptor) { + return Status::Invalid(fmt::format( + "BLOB inline field {} configured by blob-descriptor-field or blob-view-field " + "require values to be a BlobDescriptor or BlobViewStruct.", + field_name)); + } + } + } + return Status::OK(); +} + +std::vector BlobUtils::ConvertBlobInlineDataFields( + const std::vector& data_fields, const std::vector& blob_inline_fields) { + if (blob_inline_fields.empty()) { + return data_fields; + } + + std::set blob_inline_field_set(blob_inline_fields.begin(), + blob_inline_fields.end()); + std::vector converted_fields; + converted_fields.reserve(data_fields.size()); + for (const auto& data_field : data_fields) { + if (blob_inline_field_set.find(data_field.Name()) == blob_inline_field_set.end()) { + converted_fields.push_back(data_field); + continue; + } + + auto binary_field = arrow::field(data_field.Name(), arrow::binary(), data_field.Nullable(), + data_field.ArrowField()->metadata()); + converted_fields.emplace_back(data_field.Id(), binary_field, data_field.Description()); + } + return converted_fields; +} + } // namespace paimon diff --git a/src/paimon/common/data/blob_utils.h b/src/paimon/common/data/blob_utils.h index 2b5539f7a..211f15f84 100644 --- a/src/paimon/common/data/blob_utils.h +++ 
b/src/paimon/common/data/blob_utils.h @@ -17,8 +17,10 @@ #pragma once #include +#include #include #include +#include #include "paimon/result.h" #include "paimon/visibility.h" @@ -30,6 +32,10 @@ class Schema; class StructArray; } // namespace arrow +namespace paimon { +class DataField; +} // namespace paimon + namespace paimon { /// Utils for blob type. class PAIMON_EXPORT BlobUtils { @@ -38,23 +44,29 @@ class PAIMON_EXPORT BlobUtils { ~BlobUtils() = delete; struct SeparatedSchemas { - /// Non-blob fields + /// Non-blob fields (includes inline blob fields when inline_fields is provided) std::shared_ptr main_schema; - /// Blob fields only + /// Blob fields that go to separate .blob files std::shared_ptr blob_schema; }; struct SeparatedStructArrays { - /// Non-blob fields + /// Non-blob fields (includes inline blob fields when inline_fields is provided) std::shared_ptr main_array; - /// Blob fields only + /// Blob fields that go to separate .blob files std::shared_ptr blob_array; }; - static SeparatedSchemas SeparateBlobSchema(const std::shared_ptr& schema); + /// Separates schema with inline field awareness. + /// BLOB fields in inline_fields stay in main_schema; others go to blob_schema. + static SeparatedSchemas SeparateBlobSchema(const std::shared_ptr& schema, + const std::set& inline_fields); + /// Separates array with inline field awareness. + /// BLOB fields in inline_fields stay in main_array; others go to blob_array. 
static Result SeparateBlobArray( - const std::shared_ptr& struct_array); + const std::shared_ptr& struct_array, + const std::set& inline_fields); static bool IsBlobField(const std::shared_ptr& field); static bool IsBlobMetadata(const std::shared_ptr& metadata); @@ -63,6 +75,18 @@ class PAIMON_EXPORT BlobUtils { static std::shared_ptr ToArrowField( const std::string& field_name, bool nullable = false, std::unordered_map metadata = {}); + + static Status ValidateInlineBlobDescriptors( + const std::shared_ptr& struct_array, + const std::set& inline_descriptor_fields); + + /// Converts inline blob DataFields from large_binary to binary type. + /// Inline blob fields use large_binary in the table schema (because they are BLOB type), + /// but are stored as binary in data files. This conversion aligns the field type with + /// the actual on-disk storage format for correct reading. + static std::vector ConvertBlobInlineDataFields( + const std::vector& data_fields, + const std::vector& blob_inline_fields); }; } // namespace paimon diff --git a/src/paimon/common/data/blob_utils_test.cpp b/src/paimon/common/data/blob_utils_test.cpp index f8835379d..f2a02f10d 100644 --- a/src/paimon/common/data/blob_utils_test.cpp +++ b/src/paimon/common/data/blob_utils_test.cpp @@ -20,7 +20,10 @@ #include "arrow/c/bridge.h" #include "gtest/gtest.h" #include "paimon/common/data/blob_defs.h" +#include "paimon/common/data/blob_descriptor.h" +#include "paimon/common/types/data_field.h" #include "paimon/data/blob.h" +#include "paimon/memory/memory_pool.h" #include "paimon/testing/utils/testharness.h" namespace paimon::test { @@ -74,7 +77,8 @@ TEST_F(BlobUtilsTest, SeparateBlobSchema) { std::shared_ptr original_schema = arrow::schema({int_field, string_field, blob_field_1}); - BlobUtils::SeparatedSchemas schemas = BlobUtils::SeparateBlobSchema(original_schema); + BlobUtils::SeparatedSchemas schemas = + BlobUtils::SeparateBlobSchema(original_schema, /*inline_fields=*/{}); std::shared_ptr 
expected_main_schema = arrow::schema({int_field, string_field}); @@ -85,17 +89,46 @@ TEST_F(BlobUtilsTest, SeparateBlobSchema) { } { std::shared_ptr no_blob_schema = arrow::schema({int_field, string_field}); - BlobUtils::SeparatedSchemas no_blob_schemas = BlobUtils::SeparateBlobSchema(no_blob_schema); + BlobUtils::SeparatedSchemas no_blob_schemas = + BlobUtils::SeparateBlobSchema(no_blob_schema, /*inline_fields=*/{}); ASSERT_TRUE(no_blob_schemas.main_schema->Equals(*no_blob_schema)); ASSERT_EQ(no_blob_schemas.blob_schema->num_fields(), 0); } { std::shared_ptr only_blob_schema = arrow::schema({blob_field_1}); BlobUtils::SeparatedSchemas only_blob_schemas = - BlobUtils::SeparateBlobSchema(only_blob_schema); + BlobUtils::SeparateBlobSchema(only_blob_schema, /*inline_fields=*/{}); ASSERT_TRUE(only_blob_schemas.blob_schema->Equals(*only_blob_schema)); ASSERT_EQ(only_blob_schemas.main_schema->num_fields(), 0); } + { + // Inline blob field stays in main_schema instead of going to blob_schema + auto blob_field_2 = BlobUtils::ToArrowField("f4_blob_2", false); + std::shared_ptr schema = + arrow::schema({int_field, blob_field_1, blob_field_2, string_field}); + + BlobUtils::SeparatedSchemas schemas = + BlobUtils::SeparateBlobSchema(schema, /*inline_fields=*/{"f3_blob_1"}); + + // f3_blob_1 is inline -> stays in main; f4_blob_2 goes to blob + std::shared_ptr expected_main = + arrow::schema({int_field, blob_field_1, string_field}); + ASSERT_TRUE(schemas.main_schema->Equals(*expected_main)); + + std::shared_ptr expected_blob = arrow::schema({blob_field_2}); + ASSERT_TRUE(schemas.blob_schema->Equals(*expected_blob)); + } + { + // All blob fields are inline -> blob_schema is empty + std::shared_ptr schema = + arrow::schema({int_field, blob_field_1, string_field}); + + BlobUtils::SeparatedSchemas schemas = + BlobUtils::SeparateBlobSchema(schema, /*inline_fields=*/{"f3_blob_1"}); + + ASSERT_TRUE(schemas.main_schema->Equals(*schema)); + ASSERT_EQ(schemas.blob_schema->num_fields(), 0); 
+ } } TEST_F(BlobUtilsTest, SeparateBlobArray) { @@ -125,7 +158,8 @@ TEST_F(BlobUtilsTest, SeparateBlobArray) { std::shared_ptr struct_array = std::static_pointer_cast(raw_struct_array); - ASSERT_OK_AND_ASSIGN(auto separated, BlobUtils::SeparateBlobArray(struct_array)); + ASSERT_OK_AND_ASSIGN(auto separated, + BlobUtils::SeparateBlobArray(struct_array, /*inline_fields=*/{})); std::shared_ptr expected_main_type = arrow::struct_({int_field, string_field}); ASSERT_TRUE(separated.main_array->type()->Equals(*expected_main_type)); @@ -137,6 +171,198 @@ TEST_F(BlobUtilsTest, SeparateBlobArray) { ASSERT_TRUE(separated.blob_array->type()->Equals(*expected_blob_type)); ASSERT_EQ(separated.blob_array->num_fields(), 1); ASSERT_TRUE(separated.blob_array->field(0)->Equals(*blob_array_data)); + + // All blob fields are inline -> should return error (no blob field to separate) + ASSERT_NOK_WITH_MSG( + BlobUtils::SeparateBlobArray(struct_array, /*inline_fields=*/{"f2_blob"}), + "SeparateBlobArray expects at least one non-inline blob field, but got none."); + + // All fields are blob with no inline -> no main field -> should return error + auto all_blob_struct = arrow::StructArray::Make({blob_array_data}, {blob_field}).ValueOrDie(); + auto all_blob_sa = std::dynamic_pointer_cast(all_blob_struct); + ASSERT_NOK_WITH_MSG(BlobUtils::SeparateBlobArray(all_blob_sa, /*inline_fields=*/{}), + "SeparateBlobArray expects at least one main field, but got none."); +} + +TEST_F(BlobUtilsTest, SeparateBlobArrayWithPartialInline) { + auto int_field = arrow::field("f1_int", arrow::int32()); + std::shared_ptr blob_field_1 = BlobUtils::ToArrowField("f2_blob_1", false); + std::shared_ptr blob_field_2 = BlobUtils::ToArrowField("f3_blob_2", true); + auto schema = arrow::schema({int_field, blob_field_1, blob_field_2}); + + arrow::Int32Builder int_builder; + ASSERT_TRUE(int_builder.AppendValues({1, 2}).ok()); + auto int_array = int_builder.Finish().ValueOrDie(); + + arrow::LargeBinaryBuilder 
blob_builder_1; + ASSERT_TRUE(blob_builder_1.Append("a", 1).ok()); + ASSERT_TRUE(blob_builder_1.Append("b", 1).ok()); + auto blob_array_1 = blob_builder_1.Finish().ValueOrDie(); + + arrow::LargeBinaryBuilder blob_builder_2; + ASSERT_TRUE(blob_builder_2.Append("x", 1).ok()); + ASSERT_TRUE(blob_builder_2.AppendNull().ok()); + auto blob_array_2 = blob_builder_2.Finish().ValueOrDie(); + + auto raw_struct_array = + arrow::StructArray::Make({int_array, blob_array_1, blob_array_2}, schema->fields()) + .ValueOrDie(); + auto struct_array = std::static_pointer_cast(raw_struct_array); + + // f2_blob_1 is inline, f3_blob_2 goes to blob + ASSERT_OK_AND_ASSIGN(auto separated, BlobUtils::SeparateBlobArray( + struct_array, /*inline_fields=*/{"f2_blob_1"})); + + std::shared_ptr expected_main_type = arrow::struct_({int_field, blob_field_1}); + ASSERT_TRUE(separated.main_array->type()->Equals(*expected_main_type)); + ASSERT_EQ(separated.main_array->num_fields(), 2); + ASSERT_TRUE(separated.main_array->field(0)->Equals(*int_array)); + ASSERT_TRUE(separated.main_array->field(1)->Equals(*blob_array_1)); + + std::shared_ptr expected_blob_type = arrow::struct_({blob_field_2}); + ASSERT_TRUE(separated.blob_array->type()->Equals(*expected_blob_type)); + ASSERT_EQ(separated.blob_array->num_fields(), 1); + ASSERT_TRUE(separated.blob_array->field(0)->Equals(*blob_array_2)); +} + +TEST_F(BlobUtilsTest, ValidateInlineBlobDescriptorsEmptyFields) { + // Empty inline_descriptor_fields -> always OK + arrow::LargeBinaryBuilder builder; + ASSERT_TRUE(builder.Append("random_data").ok()); + auto array = builder.Finish().ValueOrDie(); + auto struct_array = + arrow::StructArray::Make({array}, {BlobUtils::ToArrowField("b0")}).ValueOrDie(); + auto sa = std::dynamic_pointer_cast(struct_array); + ASSERT_OK(BlobUtils::ValidateInlineBlobDescriptors(sa, {})); +} + +TEST_F(BlobUtilsTest, ValidateInlineBlobDescriptorsFieldNotPresent) { + // Field not in struct_array -> skip, OK + arrow::Int32Builder int_builder; + 
ASSERT_TRUE(int_builder.Append(42).ok()); + auto int_array = int_builder.Finish().ValueOrDie(); + auto struct_array = + arrow::StructArray::Make({int_array}, {arrow::field("f0", arrow::int32())}).ValueOrDie(); + auto sa = std::dynamic_pointer_cast(struct_array); + // "b0" does not exist in the struct -> should pass + ASSERT_OK(BlobUtils::ValidateInlineBlobDescriptors(sa, {"b0"})); +} + +TEST_F(BlobUtilsTest, ValidateInlineBlobDescriptorsWithValidDescriptor) { + // Valid BlobDescriptor bytes -> OK + auto pool = GetDefaultPool(); + ASSERT_OK_AND_ASSIGN(auto descriptor, BlobDescriptor::Create("file:///tmp/test.bin", 0, 100)); + auto serialized = descriptor->Serialize(pool); + + arrow::LargeBinaryBuilder builder; + ASSERT_TRUE(builder.Append(serialized->data(), serialized->size()).ok()); + auto blob_array = builder.Finish().ValueOrDie(); + auto struct_array = + arrow::StructArray::Make({blob_array}, {BlobUtils::ToArrowField("b0")}).ValueOrDie(); + auto sa = std::dynamic_pointer_cast(struct_array); + ASSERT_OK(BlobUtils::ValidateInlineBlobDescriptors(sa, {"b0"})); +} + +TEST_F(BlobUtilsTest, ValidateInlineBlobDescriptorsWithNullValue) { + // Null values in blob column -> skip, OK + arrow::LargeBinaryBuilder builder; + ASSERT_TRUE(builder.AppendNull().ok()); + auto blob_array = builder.Finish().ValueOrDie(); + auto struct_array = + arrow::StructArray::Make({blob_array}, {BlobUtils::ToArrowField("b0")}).ValueOrDie(); + auto sa = std::dynamic_pointer_cast(struct_array); + ASSERT_OK(BlobUtils::ValidateInlineBlobDescriptors(sa, {"b0"})); +} + +TEST_F(BlobUtilsTest, ValidateInlineBlobDescriptorsWithRawBytes) { + // Raw bytes (not a descriptor) -> error + arrow::LargeBinaryBuilder builder; + ASSERT_TRUE(builder.Append("not_a_descriptor_just_raw_data").ok()); + auto blob_array = builder.Finish().ValueOrDie(); + auto struct_array = + arrow::StructArray::Make({blob_array}, {BlobUtils::ToArrowField("b0")}).ValueOrDie(); + auto sa = std::dynamic_pointer_cast(struct_array); + 
ASSERT_NOK_WITH_MSG( + BlobUtils::ValidateInlineBlobDescriptors(sa, {"b0"}), + "BLOB inline field b0 configured by blob-descriptor-field or blob-view-field " + "require values to be a BlobDescriptor or BlobViewStruct."); +} + +TEST_F(BlobUtilsTest, ValidateInlineBlobDescriptorsMixedValidAndInvalid) { + // First row is valid descriptor, second row is raw bytes -> error on row 1 + auto pool = GetDefaultPool(); + ASSERT_OK_AND_ASSIGN(auto descriptor, BlobDescriptor::Create("file:///tmp/test.bin", 0, 100)); + auto serialized = descriptor->Serialize(pool); + + arrow::LargeBinaryBuilder builder; + ASSERT_TRUE(builder.Append(serialized->data(), serialized->size()).ok()); + ASSERT_TRUE(builder.Append("raw_bytes_not_descriptor").ok()); + auto blob_array = builder.Finish().ValueOrDie(); + auto struct_array = + arrow::StructArray::Make({blob_array}, {BlobUtils::ToArrowField("b0")}).ValueOrDie(); + auto sa = std::dynamic_pointer_cast(struct_array); + ASSERT_NOK_WITH_MSG( + BlobUtils::ValidateInlineBlobDescriptors(sa, {"b0"}), + "BLOB inline field b0 configured by blob-descriptor-field or blob-view-field " + "require values to be a BlobDescriptor or BlobViewStruct."); +} + +TEST_F(BlobUtilsTest, ValidateInlineBlobDescriptorsMultipleFields) { + // Two inline fields: b0 is valid, b1 has raw bytes -> error on b1 + auto pool = GetDefaultPool(); + ASSERT_OK_AND_ASSIGN(auto descriptor, BlobDescriptor::Create("file:///tmp/test.bin", 0, 100)); + auto serialized = descriptor->Serialize(pool); + + arrow::LargeBinaryBuilder b0_builder; + ASSERT_TRUE(b0_builder.Append(serialized->data(), serialized->size()).ok()); + auto b0_array = b0_builder.Finish().ValueOrDie(); + + arrow::LargeBinaryBuilder b1_builder; + ASSERT_TRUE(b1_builder.Append("invalid_raw_data").ok()); + auto b1_array = b1_builder.Finish().ValueOrDie(); + + auto struct_array = + arrow::StructArray::Make({b0_array, b1_array}, + {BlobUtils::ToArrowField("b0"), BlobUtils::ToArrowField("b1")}) + .ValueOrDie(); + auto sa = 
std::dynamic_pointer_cast(struct_array); + ASSERT_NOK_WITH_MSG( + BlobUtils::ValidateInlineBlobDescriptors(sa, {"b0", "b1"}), + "BLOB inline field b1 configured by blob-descriptor-field or blob-view-field " + "require values to be a BlobDescriptor or BlobViewStruct."); +} + +TEST_F(BlobUtilsTest, TestConvertBlobInlineDataFields) { + // Schema with a blob field (large_binary with blob metadata) and normal fields. + auto blob_field = BlobUtils::ToArrowField("blob_col", /*nullable=*/true); + std::vector data_fields = {DataField(0, arrow::field("int_col", arrow::int32())), + DataField(1, blob_field), + DataField(2, arrow::field("str_col", arrow::utf8()))}; + + // Without inline fields — blob_col stays as large_binary + { + auto result = BlobUtils::ConvertBlobInlineDataFields(data_fields, {}); + ASSERT_EQ(result.size(), 3); + ASSERT_EQ(result[1].ArrowField()->type()->id(), arrow::Type::LARGE_BINARY); + } + + // With inline fields — blob_col should be converted from large_binary to binary + { + auto result = BlobUtils::ConvertBlobInlineDataFields(data_fields, {"blob_col"}); + ASSERT_EQ(result.size(), 3); + ASSERT_EQ(result[1].ArrowField()->type()->id(), arrow::Type::BINARY); + ASSERT_EQ(result[1].Name(), "blob_col"); + ASSERT_EQ(result[1].Nullable(), true); + // Other fields unchanged + ASSERT_EQ(result[0].ArrowField()->type()->id(), arrow::Type::INT32); + ASSERT_EQ(result[2].ArrowField()->type()->id(), arrow::Type::STRING); + } + + // Non-matching inline field name — no conversion should happen + { + auto result = BlobUtils::ConvertBlobInlineDataFields(data_fields, {"non_existent_field"}); + ASSERT_EQ(result[1].ArrowField()->type()->id(), arrow::Type::LARGE_BINARY); + } } } // namespace paimon::test diff --git a/src/paimon/core/append/append_only_writer.cpp b/src/paimon/core/append/append_only_writer.cpp index a8ab13ea2..b23975d90 100644 --- a/src/paimon/core/append/append_only_writer.cpp +++ b/src/paimon/core/append/append_only_writer.cpp @@ -33,11 +33,13 @@ #include 
"paimon/core/io/data_file_path_factory.h" #include "paimon/core/io/data_file_writer.h" #include "paimon/core/io/data_increment.h" +#include "paimon/core/io/external_storage_blob_writer.h" #include "paimon/core/io/multiple_blob_file_writer.h" #include "paimon/core/io/rolling_blob_file_writer.h" #include "paimon/core/io/rolling_file_writer.h" #include "paimon/core/io/single_file_writer.h" #include "paimon/core/manifest/file_source.h" +#include "paimon/core/operation/blob_file_context.h" #include "paimon/core/utils/commit_increment.h" #include "paimon/format/file_format.h" #include "paimon/format/file_format_factory.h" @@ -82,6 +84,36 @@ Status AppendOnlyWriter::Write(std::unique_ptr&& batch) { if (writer_ == nullptr) { PAIMON_ASSIGN_OR_RAISE(writer_, CreateRollingRowWriter()); } + + // Transform batch for external storage descriptor fields before writing. + if (external_storage_writer_) { + auto data_type = arrow::struct_(write_schema_->fields()); + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr arrow_array, + arrow::ImportArray(batch->GetData(), data_type)); + auto struct_array = std::dynamic_pointer_cast(arrow_array); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr transformed, + external_storage_writer_->TransformBatch(struct_array)); + auto transformed_struct = std::dynamic_pointer_cast(transformed); + // TODO(lc.lsz): validate blob view + PAIMON_RETURN_NOT_OK(BlobUtils::ValidateInlineBlobDescriptors(transformed_struct, + inline_descriptor_fields_)); + ::ArrowArray c_transformed; + PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportArray(*transformed, &c_transformed)); + return writer_->Write(&c_transformed); + } + + if (!inline_descriptor_fields_.empty()) { + auto data_type = arrow::struct_(write_schema_->fields()); + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr arrow_array, + arrow::ImportArray(batch->GetData(), data_type)); + auto struct_array = std::dynamic_pointer_cast(arrow_array); + // TODO(lc.lsz): validate blob view + PAIMON_RETURN_NOT_OK( + 
BlobUtils::ValidateInlineBlobDescriptors(struct_array, inline_descriptor_fields_)); + ::ArrowArray c_array; + PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportArray(*struct_array, &c_array)); + return writer_->Write(&c_array); + } return writer_->Write(batch->GetData()); } @@ -153,14 +185,45 @@ Status AppendOnlyWriter::Flush(bool wait_for_latest_compaction, bool forced_full return Status::OK(); } -AppendOnlyWriter::RollingFileWriterResult AppendOnlyWriter::CreateRollingRowWriter() const { - auto schemas = BlobUtils::SeparateBlobSchema(write_schema_); - if (schemas.blob_schema && schemas.blob_schema->num_fields() > 0) { - return CreateRollingBlobWriter(schemas); +AppendOnlyWriter::RollingFileWriterResult AppendOnlyWriter::CreateRollingRowWriter() { + auto blob_context = BlobFileContext::Create(write_schema_, options_); + std::optional> main_write_cols = write_cols_; + + // Save inline descriptor fields for validation in Write() + if (blob_context) { + inline_descriptor_fields_ = blob_context->GetDescriptorFields(); + } + + // Initialize ExternalStorageBlobWriter if needed + if (blob_context && blob_context->RequireExternalStorageWriter()) { + assert(blob_context->GetExternalStoragePath()); + external_storage_writer_ = std::make_unique( + write_schema_, blob_context->GetExternalStorageFields(), + blob_context->GetExternalStoragePath().value(), schema_id_, seq_num_counter_, + path_factory_, options_, memory_pool_); + if (!main_write_cols) { + // To align with java, when require external storage writer, main writer will set write + // cols in DataFileMeta + main_write_cols = write_schema_->field_names(); + } + } + + if (blob_context && blob_context->RequireBlobFileWriter()) { + // Use context-aware schema separation: inline BLOB fields stay in main + auto schemas = + BlobUtils::SeparateBlobSchema(write_schema_, blob_context->GetInlineFields()); + return CreateRollingBlobWriter(schemas, blob_context->GetInlineFields()); + } else if (!blob_context) { + // No BLOB fields at 
all -> plain rolling writer + return std::make_unique>>( + options_.GetTargetFileSize(/*has_primary_key=*/false), + GetDataFileWriterCreator(write_schema_, main_write_cols)); } else { + // All BLOB fields are inline, no .blob files needed -> plain rolling writer + // The main data file contains all fields including inline descriptors/views. return std::make_unique>>( options_.GetTargetFileSize(/*has_primary_key=*/false), - GetDataFileWriterCreator(write_schema_, write_cols_)); + GetDataFileWriterCreator(write_schema_, main_write_cols)); } } @@ -212,7 +275,7 @@ AppendOnlyWriter::SingleFileWriterCreator AppendOnlyWriter::GetBlobFileWriterCre } AppendOnlyWriter::RollingFileWriterResult AppendOnlyWriter::CreateRollingBlobWriter( - const BlobUtils::SeparatedSchemas& schemas) const { + const BlobUtils::SeparatedSchemas& schemas, const std::set& inline_fields) const { // Multiple blob fields are supported. Each blob field gets its own rolling file writer // via MultipleBlobFileWriter. auto blob_schema = schemas.blob_schema; @@ -249,7 +312,7 @@ AppendOnlyWriter::RollingFileWriterResult AppendOnlyWriter::CreateRollingBlobWri return std::make_unique( options_.GetTargetFileSize(/*has_primary_key=*/false), GetDataFileWriterCreator(schemas.main_schema, schemas.main_schema->field_names()), - blob_schema, blob_writer_creator, arrow::struct_(write_schema_->fields())); + blob_schema, blob_writer_creator, arrow::struct_(write_schema_->fields()), inline_fields); } Status AppendOnlyWriter::Sync() { @@ -275,10 +338,14 @@ Status AppendOnlyWriter::Close() { writer_.reset(); } + if (external_storage_writer_) { + PAIMON_RETURN_NOT_OK(external_storage_writer_->Close()); + external_storage_writer_.reset(); + } + if (compact_deletion_file_ != nullptr) { compact_deletion_file_->Clean(); } return Status::OK(); } - } // namespace paimon diff --git a/src/paimon/core/append/append_only_writer.h b/src/paimon/core/append/append_only_writer.h index e403e2508..d1b4339d8 100644 --- 
a/src/paimon/core/append/append_only_writer.h +++ b/src/paimon/core/append/append_only_writer.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -42,6 +43,7 @@ class Schema; namespace paimon { class CommitIncrement; +class ExternalStorageBlobWriter; class RecordBatch; template class RollingFileWriter; @@ -94,9 +96,10 @@ class AppendOnlyWriter : public BatchWriter { using RollingFileWriterResult = Result>>>; - RollingFileWriterResult CreateRollingRowWriter() const; + RollingFileWriterResult CreateRollingRowWriter(); RollingFileWriterResult CreateRollingBlobWriter( - const BlobUtils::SeparatedSchemas& schemas) const; + const BlobUtils::SeparatedSchemas& schemas, + const std::set& inline_fields) const; Result DrainIncrement(); Status Flush(bool wait_for_latest_compaction, bool forced_full_compaction); @@ -130,6 +133,8 @@ class AppendOnlyWriter : public BatchWriter { std::shared_ptr compact_deletion_file_; std::unique_ptr>> writer_; + std::unique_ptr external_storage_writer_; + std::set inline_descriptor_fields_; }; } // namespace paimon diff --git a/src/paimon/core/casting/binary_to_blob_cast_executor.cpp b/src/paimon/core/casting/binary_to_blob_cast_executor.cpp new file mode 100644 index 000000000..3716eebef --- /dev/null +++ b/src/paimon/core/casting/binary_to_blob_cast_executor.cpp @@ -0,0 +1,81 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "paimon/core/casting/binary_to_blob_cast_executor.h" + +#include +#include + +#include "arrow/array/array_binary.h" +#include "arrow/buffer.h" +#include "arrow/type.h" +#include "fmt/format.h" +#include "paimon/common/utils/arrow/status_utils.h" +#include "paimon/status.h" + +namespace arrow { +class Array; +} // namespace arrow + +namespace paimon { +Result BinaryToBlobCastExecutor::Cast( + const Literal& literal, const std::shared_ptr& target_type) const { + return Status::Invalid( + fmt::format("BinaryToBlobCastExecutor does not support literal cast from {} to {}", + static_cast(literal.GetType()), target_type->ToString())); +} + +Result> BinaryToBlobCastExecutor::Cast( + const std::shared_ptr& array, const std::shared_ptr& target_type, + arrow::MemoryPool* pool) const { + if (array->type_id() != arrow::Type::BINARY) { + return Status::Invalid( + fmt::format("BinaryToBlobCastExecutor only supports binary input, got {}", + array->type()->ToString())); + } + if (target_type->id() != arrow::Type::LARGE_BINARY) { + return Status::Invalid( + fmt::format("BinaryToBlobCastExecutor only supports large_binary target, got {}", + target_type->ToString())); + } + + auto binary_array = std::static_pointer_cast(array); + if (binary_array->offset() != 0) { + return Status::Invalid("BinaryToBlobCastExecutor only supports arrays with zero offset"); + } + + const int64_t length = binary_array->length(); + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW( + std::shared_ptr large_offsets_buffer, + arrow::AllocateBuffer((length + 1) * static_cast(sizeof(int64_t)), pool)); + auto* large_offsets = reinterpret_cast(large_offsets_buffer->mutable_data()); + for (int64_t row_index = 0; row_index <= length; row_index++) { + large_offsets[row_index] = binary_array->value_offset(row_index); + } + + std::shared_ptr null_bitmap = binary_array->null_bitmap(); + if (binary_array->null_count() == 0) { + null_bitmap.reset(); + } + + auto value_data = binary_array->value_data(); + auto array_data 
= + arrow::ArrayData::Make(target_type, length, {null_bitmap, large_offsets_buffer, value_data}, + binary_array->null_count()); + return arrow::MakeArray(array_data); +} + +} // namespace paimon diff --git a/src/paimon/core/casting/binary_to_blob_cast_executor.h b/src/paimon/core/casting/binary_to_blob_cast_executor.h new file mode 100644 index 000000000..e62983d68 --- /dev/null +++ b/src/paimon/core/casting/binary_to_blob_cast_executor.h @@ -0,0 +1,42 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include "arrow/array/array_base.h" +#include "paimon/core/casting/cast_executor.h" +#include "paimon/predicate/literal.h" +#include "paimon/result.h" + +namespace arrow { +class DataType; +class MemoryPool; +} // namespace arrow + +namespace paimon { +class BinaryToBlobCastExecutor : public CastExecutor { + public: + Result Cast(const Literal& literal, + const std::shared_ptr& target_type) const override; + + Result> Cast(const std::shared_ptr& array, + const std::shared_ptr& target_type, + arrow::MemoryPool* pool) const override; +}; + +} // namespace paimon diff --git a/src/paimon/core/casting/cast_executor_factory.cpp b/src/paimon/core/casting/cast_executor_factory.cpp index a09d53fa7..8fa721974 100644 --- a/src/paimon/core/casting/cast_executor_factory.cpp +++ b/src/paimon/core/casting/cast_executor_factory.cpp @@ -18,6 +18,7 @@ #include +#include "paimon/core/casting/binary_to_blob_cast_executor.h" #include "paimon/core/casting/binary_to_string_cast_executor.h" #include "paimon/core/casting/boolean_to_decimal_cast_executor.h" #include "paimon/core/casting/boolean_to_numeric_cast_executor.h" @@ -146,6 +147,8 @@ CastExecutorFactory::CastExecutorFactory() { REGISTER_CAST_EXECUTOR(STRING, BINARY, BinaryToStringCastExecutor); + REGISTER_CAST_EXECUTOR(BLOB, BINARY, BinaryToBlobCastExecutor); + REGISTER_CAST_EXECUTOR(STRING, DATE, DateToStringCastExecutor); REGISTER_CAST_EXECUTOR(TIMESTAMP, DATE, DateToTimestampCastExecutor); diff --git a/src/paimon/core/casting/cast_executor_factory_test.cpp b/src/paimon/core/casting/cast_executor_factory_test.cpp index a12bba603..6d67603c0 100644 --- a/src/paimon/core/casting/cast_executor_factory_test.cpp +++ b/src/paimon/core/casting/cast_executor_factory_test.cpp @@ -17,6 +17,7 @@ #include "paimon/core/casting/cast_executor_factory.h" #include "gtest/gtest.h" +#include "paimon/core/casting/binary_to_blob_cast_executor.h" #include "paimon/core/casting/binary_to_string_cast_executor.h" #include 
"paimon/core/casting/boolean_to_decimal_cast_executor.h" #include "paimon/core/casting/boolean_to_numeric_cast_executor.h" @@ -120,6 +121,13 @@ TEST(CastExecutorFactoryTest, TestRegister) { ASSERT_TRUE(cast_executor); ASSERT_TRUE(std::dynamic_pointer_cast(cast_executor)); } + { + auto* factory = CastExecutorFactory::GetCastExecutorFactory(); + ASSERT_FALSE(factory->executor_map_.empty()); + auto cast_executor = factory->GetCastExecutor(FieldType::BINARY, FieldType::BLOB); + ASSERT_TRUE(cast_executor); + ASSERT_TRUE(std::dynamic_pointer_cast(cast_executor)); + } { auto* factory = CastExecutorFactory::GetCastExecutorFactory(); ASSERT_FALSE(factory->executor_map_.empty()); diff --git a/src/paimon/core/casting/cast_executor_test.cpp b/src/paimon/core/casting/cast_executor_test.cpp index 8c97d58cb..499b7cc17 100644 --- a/src/paimon/core/casting/cast_executor_test.cpp +++ b/src/paimon/core/casting/cast_executor_test.cpp @@ -34,6 +34,7 @@ #include "paimon/common/utils/date_time_utils.h" #include "paimon/common/utils/decimal_utils.h" #include "paimon/common/utils/field_type_utils.h" +#include "paimon/core/casting/binary_to_blob_cast_executor.h" #include "paimon/core/casting/binary_to_string_cast_executor.h" #include "paimon/core/casting/boolean_to_decimal_cast_executor.h" #include "paimon/core/casting/boolean_to_numeric_cast_executor.h" @@ -1299,6 +1300,43 @@ TEST_F(CastExecutorTest, TestBinaryToStringCastExecutorCastArray) { } } +TEST_F(CastExecutorTest, TestBinaryToBlobCastExecutorCastLiteral) { + auto cast_executor = std::make_shared(); + std::string src_data = "blob-descriptor-bytes"; + ASSERT_NOK_WITH_MSG( + cast_executor->Cast(Literal(FieldType::BINARY, src_data.data(), src_data.size()), + arrow::large_binary()), + "BinaryToBlobCastExecutor does not support literal cast"); +} + +TEST_F(CastExecutorTest, TestBinaryToBlobCastExecutorCastArray) { + auto cast_executor = std::make_shared(); + auto src_array = arrow::ipc::internal::json::ArrayFromJSON( + arrow::binary(), 
R"(["foo", "bar", "", null, "blob"])") + .ValueOrDie(); + auto expected_array = arrow::ipc::internal::json::ArrayFromJSON( + arrow::large_binary(), R"(["foo", "bar", "", null, "blob"])") + .ValueOrDie(); + + ASSERT_OK_AND_ASSIGN( + std::shared_ptr target_array, + cast_executor->Cast(src_array, arrow::large_binary(), arrow::default_memory_pool())); + ASSERT_TRUE(target_array->Equals(expected_array)); + ASSERT_EQ(target_array->data()->buffers[2], src_array->data()->buffers[2]); +} + +TEST_F(CastExecutorTest, TestBinaryToBlobCastExecutorCastArrayWithOffset) { + auto cast_executor = std::make_shared(); + auto src_array = + arrow::ipc::internal::json::ArrayFromJSON(arrow::binary(), R"(["skip", "foo", "bar"])") + .ValueOrDie() + ->Slice(1, 2); + + ASSERT_NOK_WITH_MSG( + cast_executor->Cast(src_array, arrow::large_binary(), arrow::default_memory_pool()), + "BinaryToBlobCastExecutor only supports arrays with zero offset"); +} + TEST_F(CastExecutorTest, TestDateToStringCastExecutorCastLiteral) { auto cast_executor = std::make_shared(); // date values ranging from 0000-01-01 to 9999-12-31 diff --git a/src/paimon/core/io/data_file_path_factory.h b/src/paimon/core/io/data_file_path_factory.h index 110a7035e..34a315a98 100644 --- a/src/paimon/core/io/data_file_path_factory.h +++ b/src/paimon/core/io/data_file_path_factory.h @@ -62,6 +62,12 @@ class DataFilePathFactory : public PathFactory { return NewPathFromName(NewFileName(data_file_prefix_, ".blob")); } + /// Creates a new blob file path under the given external storage path for descriptor fields. 
+ std::string NewExternalStorageBlobPath(const std::string& external_storage_path) const { + std::string file_name = NewFileName(data_file_prefix_, ".blob"); + return PathUtil::JoinPath(external_storage_path, file_name); + } + std::string NewPathFromName(const std::string& file_name) const { if (external_path_provider_ != nullptr) { return external_path_provider_->GetNextExternalDataPath(file_name); diff --git a/src/paimon/core/io/data_file_path_factory_test.cpp b/src/paimon/core/io/data_file_path_factory_test.cpp index 9bac530e4..6009db102 100644 --- a/src/paimon/core/io/data_file_path_factory_test.cpp +++ b/src/paimon/core/io/data_file_path_factory_test.cpp @@ -22,6 +22,7 @@ #include "gtest/gtest.h" #include "paimon/common/data/binary_row.h" #include "paimon/common/fs/external_path_provider.h" +#include "paimon/common/utils/string_utils.h" #include "paimon/core/io/data_file_meta.h" #include "paimon/core/manifest/file_source.h" #include "paimon/core/stats/simple_stats.h" @@ -56,6 +57,20 @@ TEST_F(DataFilePathFactoryTest, TestNewPath) { ASSERT_EQ(factory_.NewPathFromName("index-file"), "/tmp/index-file"); } +TEST_F(DataFilePathFactoryTest, TestNewExternalStorageBlobPath) { + std::string blob_path1 = factory_.NewExternalStorageBlobPath("/tmp/external_blob"); + std::string blob_path2 = factory_.NewExternalStorageBlobPath("/tmp/external_blob"); + + // Paths are unique (counter increments) + ASSERT_NE(blob_path1, blob_path2); + // Both start with the external storage path joined with the data file prefix + ASSERT_TRUE(StringUtils::StartsWith(blob_path1, "/tmp/external_blob/data-")); + ASSERT_TRUE(StringUtils::StartsWith(blob_path2, "/tmp/external_blob/data-")); + // Both end with .blob extension + ASSERT_TRUE(StringUtils::EndsWith(blob_path1, ".blob")); + ASSERT_TRUE(StringUtils::EndsWith(blob_path2, ".blob")); +} + TEST_F(DataFilePathFactoryTest, TestNewPathWithDataFilePrefixAndExternalPath) { DataFilePathFactory factory; ASSERT_OK_AND_ASSIGN( diff --git 
a/src/paimon/core/io/external_storage_blob_writer.cpp b/src/paimon/core/io/external_storage_blob_writer.cpp new file mode 100644 index 000000000..4ce51ae2c --- /dev/null +++ b/src/paimon/core/io/external_storage_blob_writer.cpp @@ -0,0 +1,228 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/io/external_storage_blob_writer.h" + +#include +#include + +#include "arrow/array/array_nested.h" +#include "arrow/array/builder_binary.h" +#include "arrow/c/bridge.h" +#include "arrow/type.h" +#include "paimon/common/data/blob_descriptor.h" +#include "paimon/common/data/blob_utils.h" +#include "paimon/common/utils/arrow/status_utils.h" +#include "paimon/common/utils/scope_guard.h" +#include "paimon/core/io/data_file_path_factory.h" +#include "paimon/core/io/data_file_writer.h" +#include "paimon/format/blob/blob_writer_builder.h" +#include "paimon/format/file_format.h" +#include "paimon/format/file_format_factory.h" +#include "paimon/fs/file_system.h" +#include "paimon/memory/memory_pool.h" + +namespace paimon { + +ExternalStorageBlobWriter::ExternalStorageBlobWriter( + const std::shared_ptr& write_schema, + const std::set& external_storage_fields, const std::string& external_storage_path, + int64_t schema_id, const std::shared_ptr& seq_num_counter, + const std::shared_ptr& path_factory, const CoreOptions& options, + const std::shared_ptr& memory_pool) + : write_schema_(write_schema), + 
external_storage_fields_(external_storage_fields), + external_storage_path_(external_storage_path), + schema_id_(schema_id), + seq_num_counter_(seq_num_counter), + path_factory_(path_factory), + memory_pool_(memory_pool), + options_(options) {} + +Result> +ExternalStorageBlobWriter::CreateFieldRollingWriter(FieldWriter* field_writer) { + auto field = write_schema_->GetFieldByName(field_writer->field_name); + if (!field) { + return Status::Invalid("External storage field '{}' not found in write schema", + field_writer->field_name); + } + + auto single_field_schema = arrow::schema({field}); + ::ArrowSchema arrow_schema; + ScopeGuard guard([&arrow_schema]() { ArrowSchemaRelease(&arrow_schema); }); + PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportSchema(*single_field_schema, &arrow_schema)); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr format, + FileFormatFactory::Get("blob", options_.ToMap())); + PAIMON_ASSIGN_OR_RAISE( + std::shared_ptr writer_builder, + format->CreateWriterBuilder(&arrow_schema, options_.GetWriteBatchSize())); + writer_builder->WithMemoryPool(memory_pool_); + + // Inject WriteConsumer to capture BlobDescriptors during writes + auto blob_writer_builder = std::dynamic_pointer_cast(writer_builder); + if (!blob_writer_builder) { + return Status::Invalid( + "writer_builder cannot be casted to BlobWriterBuilder in ExternalStorageBlobWriter"); + } + blob_writer_builder->WithWriteConsumer( + [field_writer](std::unique_ptr descriptor) -> bool { + field_writer->captured_descriptors.push_back(std::move(descriptor)); + return true; // Always flush for single row. 
+ }); + + PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportSchema(*single_field_schema, &arrow_schema)); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr stats_extractor, + format->CreateStatsExtractor(&arrow_schema)); + + std::vector write_cols = {field_writer->field_name}; + auto single_blob_file_writer_creator = [this, writer_builder, stats_extractor, write_cols]() + -> Result>>> { + auto writer = std::make_unique( + /*compression=*/"none", std::function(), schema_id_, + seq_num_counter_, FileSource::Append(), stats_extractor, + path_factory_->IsExternalPath(), write_cols, memory_pool_); + PAIMON_RETURN_NOT_OK(writer->Init( + options_.GetFileSystem(), + path_factory_->NewExternalStorageBlobPath(external_storage_path_), writer_builder)); + return writer; + }; + + return std::make_unique(options_.GetBlobTargetFileSize(), + single_blob_file_writer_creator); +} + +Status ExternalStorageBlobWriter::InitializeFieldWritersIfNeeded() { + if (initialized_) { + return Status::OK(); + } + for (int32_t i = 0; i < write_schema_->num_fields(); ++i) { + const auto& field = write_schema_->field(i); + if (external_storage_fields_.count(field->name()) > 0) { + FieldWriter fw; + fw.field_name = field->name(); + fw.field_index = i; + field_writers_.push_back(std::move(fw)); + } + } + // Create rolling writers after push_back so FieldWriter addresses are stable + // for the consumer lambda capture. 
+ for (auto& fw : field_writers_) { + PAIMON_ASSIGN_OR_RAISE(fw.rolling_writer, CreateFieldRollingWriter(&fw)); + } + initialized_ = true; + return Status::OK(); +} + +Result> ExternalStorageBlobWriter::TransformField( + const std::shared_ptr& column, FieldWriter* field_writer) { + int64_t num_rows = column->length(); + + // Clear captured descriptors before processing this batch + field_writer->captured_descriptors.clear(); + + // Write each row via RollingFileWriter; the consumer captures the descriptor + for (int64_t row = 0; row < num_rows; ++row) { + std::shared_ptr slice = column->Slice(row, 1); + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW( + std::shared_ptr single_row_struct, + arrow::StructArray::Make({slice}, {field_writer->field_name})); + + ::ArrowArray c_array; + PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportArray(*single_row_struct, &c_array)); + PAIMON_RETURN_NOT_OK(field_writer->rolling_writer->Write(&c_array)); + } + + // Validate captured descriptor count + if (static_cast(field_writer->captured_descriptors.size()) != num_rows) { + return Status::Invalid( + "Captured descriptor count {} does not match row count {} for field '{}'", + field_writer->captured_descriptors.size(), num_rows, field_writer->field_name); + } + + // Build descriptor column from captured descriptors + arrow::LargeBinaryBuilder descriptor_builder; + PAIMON_RETURN_NOT_OK_FROM_ARROW(descriptor_builder.Reserve(num_rows)); + for (int64_t row = 0; row < num_rows; ++row) { + const auto& descriptor = field_writer->captured_descriptors[row]; + if (!descriptor) { + PAIMON_RETURN_NOT_OK_FROM_ARROW(descriptor_builder.AppendNull()); + } else { + auto serialized = descriptor->Serialize(memory_pool_); + PAIMON_RETURN_NOT_OK_FROM_ARROW( + descriptor_builder.Append(serialized->data(), serialized->size())); + } + } + + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr descriptor_array, + descriptor_builder.Finish()); + return descriptor_array; +} + +Result> ExternalStorageBlobWriter::TransformBatch( + 
const std::shared_ptr& batch) { + if (external_storage_fields_.empty()) { + return batch; + } + + PAIMON_RETURN_NOT_OK(InitializeFieldWritersIfNeeded()); + + if (field_writers_.empty()) { + return batch; + } + + // Collect all arrays and field names from the original batch + std::vector> result_arrays; + std::vector result_names; + result_arrays.reserve(batch->num_fields()); + result_names.reserve(batch->num_fields()); + + for (int32_t col = 0; col < batch->num_fields(); ++col) { + result_names.push_back(batch->type()->field(col)->name()); + result_arrays.push_back(batch->field(col)); + } + + // Transform each external storage field and replace in result + for (FieldWriter& fw : field_writers_) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr descriptor_array, + TransformField(batch->field(fw.field_index), &fw)); + result_arrays[fw.field_index] = descriptor_array; + } + + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr result, + arrow::StructArray::Make(result_arrays, result_names)); + return result; +} + +Status ExternalStorageBlobWriter::Close() { + for (FieldWriter& fw : field_writers_) { + if (fw.rolling_writer) { + PAIMON_RETURN_NOT_OK(fw.rolling_writer->Close()); + } + } + return Status::OK(); +} + +void ExternalStorageBlobWriter::Abort() { + for (FieldWriter& fw : field_writers_) { + if (fw.rolling_writer) { + fw.rolling_writer->Abort(); + fw.rolling_writer.reset(); + } + } + field_writers_.clear(); +} + +} // namespace paimon diff --git a/src/paimon/core/io/external_storage_blob_writer.h b/src/paimon/core/io/external_storage_blob_writer.h new file mode 100644 index 000000000..600dd6f96 --- /dev/null +++ b/src/paimon/core/io/external_storage_blob_writer.h @@ -0,0 +1,111 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paimon/common/data/blob_descriptor.h" +#include "paimon/core/core_options.h" +#include "paimon/core/io/data_file_meta.h" +#include "paimon/core/io/rolling_file_writer.h" +#include "paimon/core/io/single_file_writer.h" +#include "paimon/logging.h" +#include "paimon/result.h" +#include "paimon/status.h" +namespace arrow { +class Schema; +class StructArray; +} // namespace arrow + +namespace paimon { + +class FileSystem; +class LongCounter; +class MemoryPool; +class DataFilePathFactory; + +/// Batch-oriented writer for descriptor BLOB fields that writes raw data to external storage. +/// +/// For each configured external_storage field, this writer: +/// 1. Uses RollingFileWriter (same infra as MultipleBlobFileWriter) with BlobFormatWriter +/// 2. Injects a WriteConsumer into BlobFormatWriter to capture each row's BlobDescriptor +/// 3. After writing a batch, constructs a descriptor column from captured descriptors +/// +/// After TransformBatch(), the returned StructArray has descriptor columns replaced with +/// serialized BlobDescriptor bytes (large_binary), ready to be written into the main data file. 
+class ExternalStorageBlobWriter { + public: + using BlobRollingWriter = RollingFileWriter<::ArrowArray*, std::shared_ptr>; + + ExternalStorageBlobWriter(const std::shared_ptr& write_schema, + const std::set& external_storage_fields, + const std::string& external_storage_path, int64_t schema_id, + const std::shared_ptr& seq_num_counter, + const std::shared_ptr& path_factory, + const CoreOptions& options, + const std::shared_ptr& memory_pool); + + /// Transforms a batch by writing external storage fields to .blob files and replacing + /// the BLOB values with serialized BlobDescriptor bytes. + Result> TransformBatch( + const std::shared_ptr& batch); + + /// Closes all internal blob writers and flushes pending data. + Status Close(); + + /// Aborts all internal blob writers. + void Abort(); + + private: + /// Per-field writer state for one external storage blob field. + struct FieldWriter { + std::string field_name; + int32_t field_index; + std::unique_ptr rolling_writer; + /// Descriptors captured by the WriteConsumer callback during writes. + std::vector> captured_descriptors; + }; + + /// Lazily initializes per-field writers on first call to TransformBatch. + Status InitializeFieldWritersIfNeeded(); + + /// Writes all rows of a single external blob field via RollingFileWriter and returns + /// a descriptor column (LargeBinary) built from captured BlobDescriptors. + Result> TransformField( + const std::shared_ptr& column, FieldWriter* field_writer); + + /// Creates a RollingFileWriter for one external storage blob field with consumer injected. 
+ Result> CreateFieldRollingWriter(FieldWriter* field_writer); + + std::shared_ptr write_schema_; + std::set external_storage_fields_; + std::string external_storage_path_; + int64_t schema_id_; + std::shared_ptr seq_num_counter_; + std::shared_ptr path_factory_; + std::shared_ptr memory_pool_; + CoreOptions options_; + + std::vector field_writers_; + bool initialized_ = false; +}; + +} // namespace paimon diff --git a/src/paimon/core/io/external_storage_blob_writer_test.cpp b/src/paimon/core/io/external_storage_blob_writer_test.cpp new file mode 100644 index 000000000..e715f03d4 --- /dev/null +++ b/src/paimon/core/io/external_storage_blob_writer_test.cpp @@ -0,0 +1,150 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "paimon/core/io/external_storage_blob_writer.h" + +#include +#include + +#include "arrow/api.h" +#include "arrow/ipc/json_simple.h" +#include "gtest/gtest.h" +#include "paimon/common/data/blob_descriptor.h" +#include "paimon/common/data/blob_utils.h" +#include "paimon/common/utils/long_counter.h" +#include "paimon/core/core_options.h" +#include "paimon/core/io/data_file_path_factory.h" +#include "paimon/memory/memory_pool.h" +#include "paimon/testing/utils/testharness.h" + +namespace paimon::test { + +class ExternalStorageBlobWriterTest : public ::testing::Test { + protected: + void SetUp() override { + dir_ = UniqueTestDirectory::Create(); + ASSERT_TRUE(dir_); + + pool_ = GetDefaultPool(); + seq_num_counter_ = std::make_shared(0); + + // Create CoreOptions with blob format + ASSERT_OK_AND_ASSIGN(options_, CoreOptions::FromMap({})); + file_system_ = options_.GetFileSystem(); + + // Create external storage directory + external_storage_path_ = dir_->Str() + "/external_blob"; + ASSERT_OK(file_system_->Mkdirs(external_storage_path_)); + + // Initialize DataFilePathFactory + path_factory_ = std::make_shared(); + ASSERT_OK(path_factory_->Init(dir_->Str(), "blob", "data-", nullptr)); + + // Schema: int_col (int32) + blob_col (blob) + auto int_field = arrow::field("int_col", arrow::int32()); + auto blob_field = BlobUtils::ToArrowField("blob_col", false); + write_schema_ = arrow::schema({int_field, blob_field}); + } + + std::unique_ptr dir_; + std::shared_ptr pool_; + std::shared_ptr seq_num_counter_; + CoreOptions options_; + std::shared_ptr file_system_; + std::shared_ptr path_factory_; + std::shared_ptr write_schema_; + std::string external_storage_path_; +}; + +TEST_F(ExternalStorageBlobWriterTest, TestEmptyExternalFields) { + // No external storage fields -> TransformBatch returns original batch + ExternalStorageBlobWriter writer(write_schema_, /*external_storage_fields=*/{}, + external_storage_path_, /*schema_id=*/0, seq_num_counter_, + path_factory_, 
options_, pool_); + + auto input = std::static_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(write_schema_->fields()), + R"([[42, "hello"]])") + .ValueOrDie()); + + ASSERT_OK_AND_ASSIGN(auto result, writer.TransformBatch(input)); + ASSERT_TRUE(result->Equals(*input)); + + ASSERT_OK(writer.Close()); +} + +TEST_F(ExternalStorageBlobWriterTest, TestTransformBatchReplacesBlob) { + std::set external_fields = {"blob_col"}; + ExternalStorageBlobWriter writer(write_schema_, external_fields, external_storage_path_, + /*schema_id=*/0, seq_num_counter_, path_factory_, options_, + pool_); + + auto struct_type = arrow::struct_(write_schema_->fields()); + auto input = std::static_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(struct_type, R"([[10, "data1"], [20, "data2"]])") + .ValueOrDie()); + + auto original_int_col = input->field(0); + + ASSERT_OK_AND_ASSIGN(auto result, writer.TransformBatch(input)); + + // int_col should be unchanged + ASSERT_EQ(result->num_fields(), 2); + ASSERT_TRUE(result->field(0)->Equals(*original_int_col)); + + // blob_col should be replaced with serialized BlobDescriptors + auto descriptor_col = std::static_pointer_cast(result->field(1)); + ASSERT_EQ(descriptor_col->length(), 2); + + for (int64_t i = 0; i < 2; ++i) { + ASSERT_FALSE(descriptor_col->IsNull(i)); + auto view = descriptor_col->GetView(i); + ASSERT_OK_AND_ASSIGN(auto descriptor, + BlobDescriptor::Deserialize(view.data(), view.size())); + ASSERT_EQ(descriptor->Length(), 5); + ASSERT_TRUE(descriptor->Uri().find(external_storage_path_) != std::string::npos); + } + + ASSERT_OK(writer.Close()); +} + +TEST_F(ExternalStorageBlobWriterTest, TestAbort) { + std::set external_fields = {"blob_col"}; + ExternalStorageBlobWriter writer(write_schema_, external_fields, external_storage_path_, + /*schema_id=*/0, seq_num_counter_, path_factory_, options_, + pool_); + + auto input = std::static_pointer_cast( + 
arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(write_schema_->fields()), + R"([[1, "abort_test"]])") + .ValueOrDie()); + + ASSERT_OK(writer.TransformBatch(input)); + + // Verify blob files exist before abort + std::vector> files_before; + ASSERT_OK(file_system_->ListDir(external_storage_path_, &files_before)); + ASSERT_FALSE(files_before.empty()); + + // Abort should clean up written blob files + writer.Abort(); + + std::vector> files_after; + ASSERT_OK(file_system_->ListDir(external_storage_path_, &files_after)); + ASSERT_TRUE(files_after.empty()); +} + +} // namespace paimon::test diff --git a/src/paimon/core/io/field_mapping_reader_test.cpp b/src/paimon/core/io/field_mapping_reader_test.cpp index 53f0c0d0a..491f67246 100644 --- a/src/paimon/core/io/field_mapping_reader_test.cpp +++ b/src/paimon/core/io/field_mapping_reader_test.cpp @@ -32,6 +32,7 @@ #include "arrow/ipc/json_simple.h" #include "arrow/util/checked_cast.h" #include "gtest/gtest.h" +#include "paimon/common/data/blob_utils.h" #include "paimon/common/types/data_field.h" #include "paimon/core/utils/field_mapping.h" #include "paimon/defs.h" @@ -181,6 +182,9 @@ class FieldMappingReaderTest : public ::testing::Test { auto expected_chunk_array = std::make_shared(arrow::ArrayVector({expect_array})); + ASSERT_TRUE(result_array->type()->Equals(expected_chunk_array->type())) + << result_array->type()->ToString() << expected_chunk_array->type()->ToString(); + ASSERT_TRUE(result_array->Equals(expected_chunk_array)) << result_array->ToString() << expected_chunk_array->ToString(); } @@ -706,6 +710,33 @@ TEST_F(FieldMappingReaderTest, TestSchemaEvolutionWithDictType) { partition, expected_array); } +TEST_F(FieldMappingReaderTest, TestReadInlineBlobAsBinaryDataFile) { + // data_fields uses binary type because inline blob fields are stored as binary in data files + std::vector data_fields = { + DataField(0, arrow::field("descriptor", arrow::binary(), /*nullable=*/true)), + }; + auto data_schema = 
DataField::ConvertDataFieldsToArrowSchema(data_fields); + std::string json_str = R"([ + ["descriptor-1"], + [null], + ["descriptor-2"] + ])"; + auto data_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(data_schema->fields()), json_str) + .ValueOrDie()); + + std::vector read_fields = { + DataField(0, BlobUtils::ToArrowField("descriptor", /*nullable=*/true)), + }; + auto read_schema = DataField::ConvertDataFieldsToArrowSchema(read_fields); + auto expected = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(read_schema->fields()), json_str) + .ValueOrDie()); + + CheckResult(data_schema, data_array, read_schema, /*predicate=*/nullptr, + /*partition_keys=*/{}, BinaryRow::EmptyRow(), expected); +} + TEST_F(FieldMappingReaderTest, TestReadWithSchemaEvolutionRenameCombinedCast) { // Test all 4 combinations of rename × cast: // f0: no rename, no cast (utf8 → utf8, name unchanged) diff --git a/src/paimon/core/io/rolling_blob_file_writer.cpp b/src/paimon/core/io/rolling_blob_file_writer.cpp index 9019e7392..41b069b5d 100644 --- a/src/paimon/core/io/rolling_blob_file_writer.cpp +++ b/src/paimon/core/io/rolling_blob_file_writer.cpp @@ -44,12 +44,13 @@ RollingBlobFileWriter::RollingBlobFileWriter( std::function>()> create_file_writer, const std::shared_ptr& blob_schema, MultipleBlobFileWriter::BlobWriterCreator blob_writer_creator, - const std::shared_ptr& data_type) + const std::shared_ptr& data_type, const std::set& inline_fields) : RollingFileWriter<::ArrowArray*, std::shared_ptr>(target_file_size, create_file_writer), blob_schema_(blob_schema), blob_writer_creator_(std::move(blob_writer_creator)), data_type_(data_type), + inline_fields_(inline_fields), logger_(Logger::GetLogger("RollingBlobFileWriter")) {} Status RollingBlobFileWriter::Write(::ArrowArray* record) { @@ -67,7 +68,7 @@ Status RollingBlobFileWriter::Write(::ArrowArray* record) { auto struct_array = 
std::dynamic_pointer_cast(arrow_array); PAIMON_ASSIGN_OR_RAISE(BlobUtils::SeparatedStructArrays separated_arrays, - BlobUtils::SeparateBlobArray(struct_array)); + BlobUtils::SeparateBlobArray(struct_array, inline_fields_)); // Write main (non-blob) data ::ArrowArray c_main_array; PAIMON_RETURN_NOT_OK_FROM_ARROW( diff --git a/src/paimon/core/io/rolling_blob_file_writer.h b/src/paimon/core/io/rolling_blob_file_writer.h index 169fbbd70..b55eacd9f 100644 --- a/src/paimon/core/io/rolling_blob_file_writer.h +++ b/src/paimon/core/io/rolling_blob_file_writer.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "arrow/array/array_nested.h" @@ -62,7 +63,8 @@ class RollingBlobFileWriter std::function>()> create_file_writer, const std::shared_ptr& blob_schema, MultipleBlobFileWriter::BlobWriterCreator blob_writer_creator, - const std::shared_ptr& data_type); + const std::shared_ptr& data_type, + const std::set& inline_fields); ~RollingBlobFileWriter() override = default; Status Write(::ArrowArray* record) override; @@ -85,6 +87,7 @@ class RollingBlobFileWriter MultipleBlobFileWriter::BlobWriterCreator blob_writer_creator_; std::unique_ptr blob_writer_; std::shared_ptr data_type_; + std::set inline_fields_; std::unique_ptr logger_; }; diff --git a/src/paimon/core/operation/abstract_split_read.cpp b/src/paimon/core/operation/abstract_split_read.cpp index f5f37631e..5098e1729 100644 --- a/src/paimon/core/operation/abstract_split_read.cpp +++ b/src/paimon/core/operation/abstract_split_read.cpp @@ -21,6 +21,7 @@ #include #include "arrow/type.h" +#include "paimon/common/data/blob_utils.h" #include "paimon/common/reader/delegating_prefetch_reader.h" #include "paimon/common/reader/predicate_batch_reader.h" #include "paimon/common/reader/prefetch_file_batch_reader_impl.h" @@ -182,6 +183,10 @@ Result> AbstractSplitRead::CreateFieldMappingRe // load schema to get data schema PAIMON_ASSIGN_OR_RAISE(data_schema, schema_manager_->ReadSchema(file_meta->schema_id)); } + 
PAIMON_ASSIGN_OR_RAISE(CoreOptions data_options, + CoreOptions::FromMap(data_schema->Options(), options_.GetFileSystem())); + auto blob_inline_fields = data_options.GetBlobInlineFields(); + std::unique_ptr field_mapping; if (!data_schema->PrimaryKeys().empty()) { // for pk table, add special fields to file schema when field mapping @@ -195,8 +200,10 @@ Result> AbstractSplitRead::CreateFieldMappingRe PAIMON_ASSIGN_OR_RAISE( std::vector projected_data_fields, ProjectFieldsForRowTrackingAndDataEvolution(data_schema, file_meta->write_cols)); + auto converted_fields = + BlobUtils::ConvertBlobInlineDataFields(projected_data_fields, blob_inline_fields); PAIMON_ASSIGN_OR_RAISE(field_mapping, - field_mapping_builder->CreateFieldMapping(projected_data_fields)); + field_mapping_builder->CreateFieldMapping(converted_fields)); } auto read_schema = DataField::ConvertDataFieldsToArrowSchema( diff --git a/src/paimon/core/operation/append_only_file_store_write.cpp b/src/paimon/core/operation/append_only_file_store_write.cpp index e3ffe4473..0986c6d85 100644 --- a/src/paimon/core/operation/append_only_file_store_write.cpp +++ b/src/paimon/core/operation/append_only_file_store_write.cpp @@ -71,10 +71,6 @@ AppendOnlyFileStoreWrite::AppendOnlyFileStoreWrite( is_streaming_mode, ignore_num_bucket_check, executor, pool), logger_(Logger::GetLogger("AppendOnlyFileStoreWrite")) { write_cols_ = write_schema->field_names(); - auto schemas = BlobUtils::SeparateBlobSchema(schema_); - if (schemas.blob_schema && schemas.blob_schema->num_fields() > 0) { - with_blob_ = true; - } // optimize write_cols to null in following cases: // 1. write_schema contains all columns // 2. 
TODO(xinyu.lxy) write_schema contains all columns and append _ROW_ID & _SEQUENCE_NUMBER @@ -172,9 +168,7 @@ Result> AppendOnlyFileStoreWrite::CreateWriter( file_store_path_factory_->CreateDataFilePathFactory(partition, bucket)); std::shared_ptr compact_manager; - auto schemas = BlobUtils::SeparateBlobSchema(write_schema_); - if (options_.WriteOnly() || options_.DataEvolutionEnabled() || options_.GetBucket() == -1 || - with_blob_) { + if (options_.WriteOnly() || options_.DataEvolutionEnabled() || options_.GetBucket() == -1) { compact_manager = std::make_shared(); } else { auto dv_factory = diff --git a/src/paimon/core/operation/append_only_file_store_write.h b/src/paimon/core/operation/append_only_file_store_write.h index 1d3d7e726..e41002c00 100644 --- a/src/paimon/core/operation/append_only_file_store_write.h +++ b/src/paimon/core/operation/append_only_file_store_write.h @@ -115,7 +115,6 @@ class AppendOnlyFileStoreWrite : public AbstractFileStoreWrite { const std::vector>& files) const; std::optional> write_cols_; - bool with_blob_ = false; std::unique_ptr logger_; }; diff --git a/src/paimon/core/operation/file_store_scan.cpp b/src/paimon/core/operation/file_store_scan.cpp index 925a3afce..14b353489 100644 --- a/src/paimon/core/operation/file_store_scan.cpp +++ b/src/paimon/core/operation/file_store_scan.cpp @@ -27,6 +27,7 @@ #include "arrow/type.h" #include "fmt/format.h" #include "paimon/common/data/binary_array.h" +#include "paimon/common/data/blob_utils.h" #include "paimon/common/executor/future.h" #include "paimon/common/predicate/literal_converter.h" #include "paimon/common/types/data_field.h" @@ -354,8 +355,12 @@ Status FileStoreScan::SplitAndSetFilter(const std::vector& partitio PAIMON_ASSIGN_OR_RAISE(std::unique_ptr mapping_builder, FieldMappingBuilder::Create(arrow_schema, partition_keys, scan_filters->GetPredicate())); + PAIMON_ASSIGN_OR_RAISE(std::vector data_fields, + DataField::ConvertArrowSchemaToDataFields(arrow_schema)); + auto converted_fields = 
BlobUtils::ConvertBlobInlineDataFields( + data_fields, core_options_.GetBlobInlineFields()); PAIMON_ASSIGN_OR_RAISE(std::unique_ptr mapping, - mapping_builder->CreateFieldMapping(arrow_schema)); + mapping_builder->CreateFieldMapping(converted_fields)); if (mapping->partition_info != std::nullopt) { const auto& partition_info = mapping->partition_info.value(); partition_schema_ = diff --git a/src/paimon/core/schema/schema_validation.cpp b/src/paimon/core/schema/schema_validation.cpp index 8a92b7e90..343435c98 100644 --- a/src/paimon/core/schema/schema_validation.cpp +++ b/src/paimon/core/schema/schema_validation.cpp @@ -444,12 +444,8 @@ Status SchemaValidation::ValidateBlobFields(const TableSchema& schema, const Cor const auto& blob_descriptor_names = options.GetBlobDescriptorFields(); const auto& blob_view_names = options.GetBlobViewFields(); const auto& blob_external_storage_names = options.GetBlobExternalStorageFields(); - std::vector configured_blob_like_names = configured_blob_names; - configured_blob_like_names.insert(configured_blob_like_names.end(), - blob_descriptor_names.begin(), blob_descriptor_names.end()); - configured_blob_like_names.insert(configured_blob_like_names.end(), blob_view_names.begin(), - blob_view_names.end()); - if (configured_blob_like_names.empty() && blob_external_storage_names.empty()) { + if (configured_blob_names.empty() && blob_descriptor_names.empty() && blob_view_names.empty() && + blob_external_storage_names.empty()) { return Status::OK(); } diff --git a/src/paimon/core/utils/field_mapping.cpp b/src/paimon/core/utils/field_mapping.cpp index 0809cd944..e24ee7277 100644 --- a/src/paimon/core/utils/field_mapping.cpp +++ b/src/paimon/core/utils/field_mapping.cpp @@ -16,10 +16,8 @@ #include "paimon/core/utils/field_mapping.h" -#include -#include #include -#include +#include #include "arrow/type.h" #include "fmt/format.h" diff --git a/src/paimon/core/utils/field_mapping.h b/src/paimon/core/utils/field_mapping.h index 
05b79dfcd..0c0abc04b 100644 --- a/src/paimon/core/utils/field_mapping.h +++ b/src/paimon/core/utils/field_mapping.h @@ -15,8 +15,6 @@ */ #pragma once -#include -#include #include #include #include diff --git a/src/paimon/format/avro/avro_direct_encoder.cpp b/src/paimon/format/avro/avro_direct_encoder.cpp index d8ace9f7d..f2740009b 100644 --- a/src/paimon/format/avro/avro_direct_encoder.cpp +++ b/src/paimon/format/avro/avro_direct_encoder.cpp @@ -224,7 +224,14 @@ Status AvroDirectEncoder::EncodeArrowToAvro(const ::avro::NodePtr& avro_node, return Status::OK(); } - // Handle regular BYTES + // Handle regular BYTES (binary or large_binary) + if (array.type()->id() == arrow::Type::LARGE_BINARY) { + const auto& large_binary_array = + arrow::internal::checked_cast(array); + std::string_view value = large_binary_array.GetView(row_index); + encoder->encodeBytes(reinterpret_cast(value.data()), value.size()); + return Status::OK(); + } const auto& binary_array = arrow::internal::checked_cast(array); std::string_view value = binary_array.GetView(row_index); diff --git a/src/paimon/format/avro/avro_file_batch_reader_test.cpp b/src/paimon/format/avro/avro_file_batch_reader_test.cpp index 15ae3a908..a8ed3bb6c 100644 --- a/src/paimon/format/avro/avro_file_batch_reader_test.cpp +++ b/src/paimon/format/avro/avro_file_batch_reader_test.cpp @@ -403,6 +403,56 @@ TEST_F(AvroFileBatchReaderTest, TestGetNumberOfRows) { } } +TEST_F(AvroFileBatchReaderTest, TestReadBinaryWrittenFromBinaryAndLargeBinary) { + auto check_binary_read_result = [&](const std::shared_ptr& write_type, + const std::string& file_name) { + std::string data_json = R"([ + ["descriptor-1"], + [""], + [null], + ["descriptor-2"] + ])"; + auto write_field = arrow::field("f0", write_type); + auto write_data_type = arrow::struct_({write_field}); + auto write_array = + arrow::ipc::internal::json::ArrayFromJSON(write_data_type, data_json).ValueOrDie(); + + std::string file_path = PathUtil::JoinPath(dir_->Str(), file_name); + 
WriteData(write_array, file_path, /*compression=*/"null"); + + // Read back with binary schema + auto read_field = arrow::field("f0", arrow::binary()); + auto read_data_type = arrow::struct_({read_field}); + + ASSERT_OK_AND_ASSIGN(auto reader_builder, + file_format_->CreateReaderBuilder(/*batch_size=*/1024)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr in, fs_->Open(file_path)); + ASSERT_OK_AND_ASSIGN(auto batch_reader, reader_builder->Build(in)); + + // Check GetFileSchema: regardless of write type, avro file schema is always binary + ASSERT_OK_AND_ASSIGN(auto c_file_schema, batch_reader->GetFileSchema()); + auto file_schema = arrow::ImportSchema(c_file_schema.get()).ValueOrDie(); + arrow::Schema expected_file_schema({read_field}); + ASSERT_TRUE(file_schema->Equals(expected_file_schema)); + + auto read_schema = arrow::schema({read_field}); + std::unique_ptr c_schema = std::make_unique(); + ASSERT_TRUE(arrow::ExportSchema(*read_schema, c_schema.get()).ok()); + EXPECT_OK(batch_reader->SetReadSchema(c_schema.get(), /*predicate=*/nullptr, + /*selection_bitmap=*/std::nullopt)); + + ASSERT_OK_AND_ASSIGN(auto result_array, ::paimon::test::ReadResultCollector::CollectResult( + batch_reader.get())); + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(read_data_type, data_json).ValueOrDie(); + auto expected_chunked_array = std::make_shared(expected_array); + ASSERT_TRUE(result_array->Equals(expected_chunked_array)); + }; + + check_binary_read_result(arrow::binary(), "binary.avro"); + check_binary_read_result(arrow::large_binary(), "large-binary.avro"); +} + INSTANTIATE_TEST_SUITE_P(TestParam, AvroFileBatchReaderTest, ::testing::Values(false, true)); } // namespace paimon::avro::test diff --git a/src/paimon/format/avro/avro_schema_converter.cpp b/src/paimon/format/avro/avro_schema_converter.cpp index 7a2726218..54c22d927 100644 --- a/src/paimon/format/avro/avro_schema_converter.cpp +++ b/src/paimon/format/avro/avro_schema_converter.cpp @@ -267,6 +267,7 @@ 
Result<::avro::Schema> AvroSchemaConverter::ArrowTypeToAvroSchema( case arrow::Type::STRING: return nullable ? NullableSchema(::avro::StringSchema()) : ::avro::StringSchema(); case arrow::Type::BINARY: + case arrow::Type::LARGE_BINARY: return nullable ? NullableSchema(::avro::BytesSchema()) : ::avro::BytesSchema(); case arrow::Type::type::DATE32: { ::avro::IntSchema date_schema; diff --git a/src/paimon/format/avro/avro_stats_extractor.cpp b/src/paimon/format/avro/avro_stats_extractor.cpp index 680c23766..c8bde0d38 100644 --- a/src/paimon/format/avro/avro_stats_extractor.cpp +++ b/src/paimon/format/avro/avro_stats_extractor.cpp @@ -92,6 +92,7 @@ Result> AvroStatsExtractor::FetchColumnStatistics( case arrow::Type::type::DOUBLE: return ColumnStats::CreateDoubleColumnStats(std::nullopt, std::nullopt, std::nullopt); case arrow::Type::type::BINARY: + case arrow::Type::type::LARGE_BINARY: return ColumnStats::CreateStringColumnStats(std::nullopt, std::nullopt, std::nullopt); case arrow::Type::type::STRING: return ColumnStats::CreateStringColumnStats(std::nullopt, std::nullopt, std::nullopt); diff --git a/src/paimon/format/blob/blob_file_batch_reader_test.cpp b/src/paimon/format/blob/blob_file_batch_reader_test.cpp index dbc0f392a..bde27d64d 100644 --- a/src/paimon/format/blob/blob_file_batch_reader_test.cpp +++ b/src/paimon/format/blob/blob_file_batch_reader_test.cpp @@ -235,10 +235,9 @@ TEST_P(BlobFileBatchReaderTest, EmptyFile) { file_system->Create(dir->Str() + "/file.blob", /*overwrite=*/true)); std::shared_ptr blob_field = BlobUtils::ToArrowField("blob_col"); auto struct_type = arrow::struct_({blob_field}); - bool blob_as_descriptor = GetParam(); ASSERT_OK_AND_ASSIGN(std::shared_ptr writer, - BlobFormatWriter::Create(blob_as_descriptor, output_stream, struct_type, - file_system, pool_)); + BlobFormatWriter::Create(output_stream, struct_type, + /*write_consumer=*/nullptr, file_system, pool_)); ASSERT_OK(writer->Flush()); ASSERT_OK(writer->Finish()); diff --git 
a/src/paimon/format/blob/blob_format_writer.cpp b/src/paimon/format/blob/blob_format_writer.cpp index f41e21e90..a21299d77 100644 --- a/src/paimon/format/blob/blob_format_writer.cpp +++ b/src/paimon/format/blob/blob_format_writer.cpp @@ -21,6 +21,7 @@ #include "arrow/api.h" #include "arrow/c/bridge.h" #include "paimon/common/data/blob_defs.h" +#include "paimon/common/data/blob_descriptor.h" #include "paimon/common/data/blob_utils.h" #include "paimon/common/memory/memory_segment_utils.h" #include "paimon/common/metrics/metrics_impl.h" @@ -31,23 +32,24 @@ namespace paimon::blob { -BlobFormatWriter::BlobFormatWriter(bool blob_as_descriptor, - const std::shared_ptr& out, +BlobFormatWriter::BlobFormatWriter(const std::shared_ptr& out, const std::string& uri, const std::shared_ptr& data_type, + WriteConsumer write_consumer, const std::shared_ptr& fs, const std::shared_ptr& pool) - : blob_as_descriptor_(blob_as_descriptor), - out_(out), + : out_(out), + uri_(uri), data_type_(data_type), fs_(fs), - pool_(pool) { + pool_(pool), + write_consumer_(std::move(write_consumer)) { metrics_ = std::make_shared(); tmp_buffer_ = Bytes::AllocateBytes(kTmpBufferSize, pool_.get()); } Result> BlobFormatWriter::Create( - bool blob_as_descriptor, const std::shared_ptr& out, - const std::shared_ptr& data_type, const std::shared_ptr& fs, + const std::shared_ptr& out, const std::shared_ptr& data_type, + WriteConsumer write_consumer, const std::shared_ptr& fs, const std::shared_ptr& pool) { if (out == nullptr) { return Status::Invalid("blob format writer create failed. 
out is nullptr"); @@ -66,8 +68,9 @@ Result> BlobFormatWriter::Create( return Status::Invalid( fmt::format("field {} is not BLOB", data_type->field(0)->ToString())); } + PAIMON_ASSIGN_OR_RAISE(std::string uri, out->GetUri()); return std::unique_ptr( - new BlobFormatWriter(blob_as_descriptor, out, data_type, fs, pool)); + new BlobFormatWriter(out, uri, data_type, std::move(write_consumer), fs, pool)); } Status BlobFormatWriter::AddBatch(ArrowArray* batch) { @@ -91,6 +94,9 @@ Status BlobFormatWriter::AddBatch(ArrowArray* batch) { // Child-level null: record kNullBinLength, skip data writing (aligned with Java) if (child_array->IsNull(0)) { bin_lengths_.push_back(BlobDefs::kNullBinLength); + if (write_consumer_) { + write_consumer_(/*descriptor=*/nullptr); + } return Status::OK(); } @@ -103,7 +109,27 @@ Status BlobFormatWriter::AddBatch(ArrowArray* batch) { assert(blob_array.length() == 1); PAIMON_RETURN_NOT_OK(WriteBlob(blob_array.GetView(0))); - PAIMON_RETURN_NOT_OK(Flush()); + if (write_consumer_) { + // Construct BlobDescriptor from the blob just written. + // blob format: magic(4) + content + bin_length(8) + crc32(4) + // bin_length covers all of the above, so content_length = bin_length - 16. + // The stream is now positioned at the end of crc32, i.e., previous_pos + bin_length. + int64_t bin_length = bin_lengths_.back(); + PAIMON_ASSIGN_OR_RAISE(int64_t end_pos, out_->GetPos()); + int64_t blob_start_pos = end_pos - bin_length; + int64_t content_offset = blob_start_pos + BlobDefs::kContentStartOffset; + int64_t content_length = bin_length - BlobDefs::kTotalMetaLength; + + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr descriptor, + BlobDescriptor::Create(uri_, content_offset, content_length)); + bool should_flush = write_consumer_(std::move(descriptor)); + if (should_flush) { + PAIMON_RETURN_NOT_OK(Flush()); + } + } else { + // Java does not flush when writeConsumer is null. 
+ PAIMON_RETURN_NOT_OK(Flush()); + } return Status::OK(); } @@ -138,8 +164,13 @@ Status BlobFormatWriter::WriteBlob(std::string_view blob_data) { PAIMON_RETURN_NOT_OK(WriteWithCrc32(kMagicNumberBytes->data(), kMagicNumberBytes->size())); // write blob content + // Dynamically check whether blob_data is a serialized BlobDescriptor (by magic header) + // rather than relying on blob_as_descriptor_ config. This is consistent with Java behavior: + // at write time, the input bytes are auto-detected as descriptor or raw data. std::unique_ptr in; - if (blob_as_descriptor_) { + PAIMON_ASSIGN_OR_RAISE(bool is_descriptor, + BlobDescriptor::IsBlobDescriptor(blob_data.data(), blob_data.size())); + if (is_descriptor) { PAIMON_ASSIGN_OR_RAISE(std::unique_ptr blob, Blob::FromDescriptor(blob_data.data(), blob_data.size())); PAIMON_ASSIGN_OR_RAISE(in, blob->NewInputStream(fs_)); diff --git a/src/paimon/format/blob/blob_format_writer.h b/src/paimon/format/blob/blob_format_writer.h index c52437e8d..372906ea8 100644 --- a/src/paimon/format/blob/blob_format_writer.h +++ b/src/paimon/format/blob/blob_format_writer.h @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -36,6 +37,7 @@ struct ArrowArray; namespace paimon { class Blob; +class BlobDescriptor; class FileSystem; class Metrics; class OutputStream; @@ -47,9 +49,14 @@ namespace paimon::blob { // https://cwiki.apache.org/confluence/display/PAIMON/PIP-35%3A+Introduce+Blob+to+store+multimodal+data class BlobFormatWriter : public FormatWriter { public: + /// Callback invoked after each blob row is written. + /// Receives the BlobDescriptor of the written blob (nullptr for null blobs). + /// Similar to Java's BlobConsumer. Returns true if the output stream should be flushed. 
+ using WriteConsumer = std::function descriptor)>; + static Result> Create( - bool blob_as_descriptor, const std::shared_ptr& out, - const std::shared_ptr& data_type, const std::shared_ptr& fs, + const std::shared_ptr& out, const std::shared_ptr& data_type, + WriteConsumer write_consumer, const std::shared_ptr& fs, const std::shared_ptr& pool); Status AddBatch(ArrowArray* batch) override; @@ -65,9 +72,9 @@ class BlobFormatWriter : public FormatWriter { } private: - BlobFormatWriter(bool blob_as_descriptor, const std::shared_ptr& out, + BlobFormatWriter(const std::shared_ptr& out, const std::string& uri, const std::shared_ptr& data_type, - const std::shared_ptr& fs, + WriteConsumer write_consumer, const std::shared_ptr& fs, const std::shared_ptr& pool); Status WriteBlob(std::string_view blob_data); @@ -83,15 +90,16 @@ class BlobFormatWriter : public FormatWriter { static constexpr uint32_t kTmpBufferSize = 1024 * 1024; private: - bool blob_as_descriptor_; uint32_t crc32_ = 0; std::vector bin_lengths_; std::shared_ptr out_; + std::string uri_; PAIMON_UNIQUE_PTR tmp_buffer_; std::shared_ptr data_type_; std::shared_ptr fs_; std::shared_ptr pool_; std::shared_ptr metrics_; + WriteConsumer write_consumer_; }; } // namespace paimon::blob diff --git a/src/paimon/format/blob/blob_format_writer_test.cpp b/src/paimon/format/blob/blob_format_writer_test.cpp index 3f3779108..506f93cfe 100644 --- a/src/paimon/format/blob/blob_format_writer_test.cpp +++ b/src/paimon/format/blob/blob_format_writer_test.cpp @@ -18,9 +18,11 @@ #include #include +#include #include "arrow/c/bridge.h" #include "gtest/gtest.h" +#include "paimon/common/data/blob_descriptor.h" #include "paimon/common/utils/arrow/status_utils.h" #include "paimon/common/utils/stream_utils.h" #include "paimon/data/blob.h" @@ -90,8 +92,8 @@ INSTANTIATE_TEST_SUITE_P(BlobAsDescriptor, BlobFormatWriterTest, ::testing::Valu TEST_P(BlobFormatWriterTest, TestSimple) { // write ASSERT_OK_AND_ASSIGN(std::shared_ptr writer, - 
BlobFormatWriter::Create(blob_as_descriptor_, output_stream_, struct_type_, - file_system_, pool_)); + BlobFormatWriter::Create(output_stream_, struct_type_, + /*write_consumer=*/nullptr, file_system_, pool_)); std::vector> expected_blobs; std::string file1 = paimon::test::GetDataDir() + "/avro/data/avro_with_null"; @@ -149,41 +151,82 @@ TEST_P(BlobFormatWriterTest, TestSimple) { } } +TEST_P(BlobFormatWriterTest, TestWriteConsumerReceivesDescriptors) { + std::vector> captured_descriptors; + BlobFormatWriter::WriteConsumer consumer = + [&captured_descriptors](std::unique_ptr descriptor) -> bool { + captured_descriptors.push_back(std::move(descriptor)); + return true; // request flush + }; + + ASSERT_OK_AND_ASSIGN( + std::shared_ptr writer, + BlobFormatWriter::Create(output_stream_, struct_type_, consumer, file_system_, pool_)); + + // Write a normal blob row + std::string file = paimon::test::GetDataDir() + "/xxhash.data"; + ASSERT_OK_AND_ASSIGN(std::shared_ptr blob, + Blob::FromPath(file, /*offset=*/0, /*length=*/91)); + ASSERT_OK_AND_ASSIGN(auto array, PrepareBlobArray(blob)); + ASSERT_OK(AddBatchOnce(writer, array)); + + ASSERT_EQ(captured_descriptors.size(), 1); + ASSERT_TRUE(captured_descriptors[0]); + ASSERT_EQ(captured_descriptors[0]->Uri(), dir_->Str() + "/file.blob"); + ASSERT_EQ(captured_descriptors[0]->Offset(), 4); // after magic(4) + ASSERT_EQ(captured_descriptors[0]->Length(), 91); + + // Write a null blob row — consumer should receive nullptr descriptor + arrow::StructBuilder struct_builder(struct_type_, arrow::default_memory_pool(), + {std::make_shared()}); + auto blob_builder = static_cast(struct_builder.field_builder(0)); + ASSERT_TRUE(struct_builder.Append().ok()); + ASSERT_TRUE(blob_builder->AppendNull().ok()); + std::shared_ptr null_array; + ASSERT_TRUE(struct_builder.Finish(&null_array).ok()); + ASSERT_OK(AddBatchOnce(writer, null_array)); + + ASSERT_EQ(captured_descriptors.size(), 2); + ASSERT_FALSE(captured_descriptors[1]); + + 
ASSERT_OK(writer->Finish()); +} + TEST_P(BlobFormatWriterTest, TestCreateWithInvalidParameters) { // Test with nullptr output stream - ASSERT_NOK_WITH_MSG( - BlobFormatWriter::Create(blob_as_descriptor_, nullptr, struct_type_, file_system_, pool_), - "blob format writer create failed. out is nullptr"); + ASSERT_NOK_WITH_MSG(BlobFormatWriter::Create(nullptr, struct_type_, /*write_consumer=*/nullptr, + file_system_, pool_), + "blob format writer create failed. out is nullptr"); // Test with nullptr data type - ASSERT_NOK_WITH_MSG( - BlobFormatWriter::Create(blob_as_descriptor_, output_stream_, nullptr, file_system_, pool_), - "blob format writer create failed. data_type is nullptr"); + ASSERT_NOK_WITH_MSG(BlobFormatWriter::Create(output_stream_, nullptr, + /*write_consumer=*/nullptr, file_system_, pool_), + "blob format writer create failed. data_type is nullptr"); // Test with nullptr memory pool - ASSERT_NOK_WITH_MSG(BlobFormatWriter::Create(blob_as_descriptor_, output_stream_, struct_type_, - file_system_, nullptr), + ASSERT_NOK_WITH_MSG(BlobFormatWriter::Create(output_stream_, struct_type_, + /*write_consumer=*/nullptr, file_system_, nullptr), "blob format writer create failed. 
pool is nullptr"); // Test with invalid field count (more than 1 field) auto multi_field_type = arrow::struct_( {arrow::field("blob_col1", arrow::binary()), arrow::field("blob_col2", arrow::binary())}); - ASSERT_NOK_WITH_MSG(BlobFormatWriter::Create(blob_as_descriptor_, output_stream_, - multi_field_type, file_system_, pool_), + ASSERT_NOK_WITH_MSG(BlobFormatWriter::Create(output_stream_, multi_field_type, + /*write_consumer=*/nullptr, file_system_, pool_), "blob data type field number 2 is not 1"); // Test with non-blob field (missing blob metadata) auto non_blob_field = arrow::field("regular_col", arrow::binary()); auto non_blob_type = arrow::struct_({non_blob_field}); - ASSERT_NOK_WITH_MSG(BlobFormatWriter::Create(blob_as_descriptor_, output_stream_, non_blob_type, - file_system_, pool_), + ASSERT_NOK_WITH_MSG(BlobFormatWriter::Create(output_stream_, non_blob_type, + /*write_consumer=*/nullptr, file_system_, pool_), "field regular_col: binary is not BLOB"); } TEST_P(BlobFormatWriterTest, TestInvalidCase) { ASSERT_OK_AND_ASSIGN(std::shared_ptr writer, - BlobFormatWriter::Create(blob_as_descriptor_, output_stream_, struct_type_, - file_system_, pool_)); + BlobFormatWriter::Create(output_stream_, struct_type_, + /*write_consumer=*/nullptr, file_system_, pool_)); // Test nullptr batch ASSERT_NOK_WITH_MSG(writer->AddBatch(nullptr), @@ -201,8 +244,8 @@ TEST_P(BlobFormatWriterTest, TestInvalidCase) { TEST_P(BlobFormatWriterTest, TestAddBatchWithInvalidBatchLength) { ASSERT_OK_AND_ASSIGN(std::shared_ptr writer, - BlobFormatWriter::Create(blob_as_descriptor_, output_stream_, struct_type_, - file_system_, pool_)); + BlobFormatWriter::Create(output_stream_, struct_type_, + /*write_consumer=*/nullptr, file_system_, pool_)); // Test batch with wrong length (not 1) arrow::StructBuilder struct_builder(struct_type_, arrow::default_memory_pool(), @@ -229,8 +272,8 @@ TEST_P(BlobFormatWriterTest, TestAddBatchWithInvalidBatchLength) { TEST_P(BlobFormatWriterTest, 
TestReachTargetSize) { ASSERT_OK_AND_ASSIGN(std::shared_ptr writer, - BlobFormatWriter::Create(blob_as_descriptor_, output_stream_, struct_type_, - file_system_, pool_)); + BlobFormatWriter::Create(output_stream_, struct_type_, + /*write_consumer=*/nullptr, file_system_, pool_)); // Initially should not reach target size ASSERT_OK_AND_ASSIGN(bool reached, writer->ReachTargetSize(true, 1000)); @@ -254,8 +297,8 @@ TEST_P(BlobFormatWriterTest, TestReachTargetSize) { TEST_P(BlobFormatWriterTest, TestGetWriterMetrics) { ASSERT_OK_AND_ASSIGN(std::shared_ptr writer, - BlobFormatWriter::Create(blob_as_descriptor_, output_stream_, struct_type_, - file_system_, pool_)); + BlobFormatWriter::Create(output_stream_, struct_type_, + /*write_consumer=*/nullptr, file_system_, pool_)); auto metrics = writer->GetWriterMetrics(); ASSERT_TRUE(metrics); @@ -264,8 +307,8 @@ TEST_P(BlobFormatWriterTest, TestGetWriterMetrics) { TEST_P(BlobFormatWriterTest, TestEmptyWriter) { // Test creating a writer and finishing without adding any data ASSERT_OK_AND_ASSIGN(std::shared_ptr writer, - BlobFormatWriter::Create(blob_as_descriptor_, output_stream_, struct_type_, - file_system_, pool_)); + BlobFormatWriter::Create(output_stream_, struct_type_, + /*write_consumer=*/nullptr, file_system_, pool_)); ASSERT_OK(writer->Flush()); ASSERT_OK(writer->Finish()); @@ -285,8 +328,8 @@ TEST_P(BlobFormatWriterTest, TestEmptyWriter) { TEST_P(BlobFormatWriterTest, TestLargeBlob) { ASSERT_OK_AND_ASSIGN(std::shared_ptr writer, - BlobFormatWriter::Create(blob_as_descriptor_, output_stream_, struct_type_, - file_system_, pool_)); + BlobFormatWriter::Create(output_stream_, struct_type_, + /*write_consumer=*/nullptr, file_system_, pool_)); // Create a temporary large file for testing std::string large_file_path = dir_->Str() + "/large_test_file.bin"; @@ -340,8 +383,8 @@ TEST_P(BlobFormatWriterTest, TestLargeBlob) { TEST_P(BlobFormatWriterTest, TestAddBatchWithNullValues) { ASSERT_OK_AND_ASSIGN(std::shared_ptr writer, 
- BlobFormatWriter::Create(blob_as_descriptor_, output_stream_, struct_type_, - file_system_, pool_)); + BlobFormatWriter::Create(output_stream_, struct_type_, + /*write_consumer=*/nullptr, file_system_, pool_)); // Write one row with child-level null blob arrow::StructBuilder struct_builder(struct_type_, arrow::default_memory_pool(), @@ -388,8 +431,8 @@ TEST_P(BlobFormatWriterTest, TestAddBatchWithNullValues) { auto null_c_array = std::make_unique(); ASSERT_TRUE(arrow::ExportArray(*null_struct_array, null_c_array.get()).ok()); ASSERT_OK_AND_ASSIGN(std::shared_ptr writer2, - BlobFormatWriter::Create(blob_as_descriptor_, output_stream_, struct_type_, - file_system_, pool_)); + BlobFormatWriter::Create(output_stream_, struct_type_, + /*write_consumer=*/nullptr, file_system_, pool_)); ASSERT_NOK_WITH_MSG(writer2->AddBatch(null_c_array.get()), "BlobFormatWriter does not support struct-level null."); ArrowArrayRelease(null_c_array.get()); @@ -397,8 +440,8 @@ TEST_P(BlobFormatWriterTest, TestAddBatchWithNullValues) { TEST_P(BlobFormatWriterTest, TestAddBatchWithZeroLengthBlob) { ASSERT_OK_AND_ASSIGN(std::shared_ptr writer, - BlobFormatWriter::Create(blob_as_descriptor_, output_stream_, struct_type_, - file_system_, pool_)); + BlobFormatWriter::Create(output_stream_, struct_type_, + /*write_consumer=*/nullptr, file_system_, pool_)); // Create a zero-length file std::string zero_file_path = dir_->Str() + "/zero_length_file.bin"; diff --git a/src/paimon/format/blob/blob_writer_builder.h b/src/paimon/format/blob/blob_writer_builder.h index 0ddde46d1..497d1889b 100644 --- a/src/paimon/format/blob/blob_writer_builder.h +++ b/src/paimon/format/blob/blob_writer_builder.h @@ -24,8 +24,6 @@ #include #include "arrow/api.h" -#include "paimon/common/utils/options_utils.h" -#include "paimon/defs.h" #include "paimon/format/blob/blob_format_writer.h" #include "paimon/format/format_writer.h" #include "paimon/format/writer_builder.h" @@ -61,16 +59,19 @@ class BlobWriterBuilder : public 
SpecificFSWriterBuilder { return this; } + /// Sets a write consumer that will be called after each blob row is written. + BlobWriterBuilder* WithWriteConsumer(BlobFormatWriter::WriteConsumer consumer) { + write_consumer_ = std::move(consumer); + return this; + } + Result> Build(const std::shared_ptr& out, const std::string& compression) override { assert(out); if (fs_ == nullptr) { return Status::Invalid("File system is nullptr. Please call WithFileSystem() first."); } - PAIMON_ASSIGN_OR_RAISE( - bool blob_as_descriptor, - OptionsUtils::GetValueFromMap(options_, Options::BLOB_AS_DESCRIPTOR, false)); - return BlobFormatWriter::Create(blob_as_descriptor, out, data_type_, fs_, pool_); + return BlobFormatWriter::Create(out, data_type_, write_consumer_, fs_, pool_); } private: @@ -78,6 +79,7 @@ class BlobWriterBuilder : public SpecificFSWriterBuilder { std::shared_ptr data_type_; std::map options_; std::shared_ptr fs_; + BlobFormatWriter::WriteConsumer write_consumer_; }; } // namespace paimon::blob diff --git a/src/paimon/format/blob/blob_writer_builder_test.cpp b/src/paimon/format/blob/blob_writer_builder_test.cpp index 61ef15ca3..349b8ea8b 100644 --- a/src/paimon/format/blob/blob_writer_builder_test.cpp +++ b/src/paimon/format/blob/blob_writer_builder_test.cpp @@ -16,9 +16,16 @@ #include "paimon/format/blob/blob_writer_builder.h" +#include + #include "arrow/api.h" +#include "arrow/c/bridge.h" #include "gtest/gtest.h" +#include "paimon/common/data/blob_descriptor.h" #include "paimon/common/data/blob_utils.h" +#include "paimon/common/utils/arrow/status_utils.h" +#include "paimon/defs.h" +#include "paimon/format/format_writer.h" #include "paimon/fs/file_system.h" #include "paimon/fs/local/local_file_system.h" #include "paimon/testing/utils/testharness.h" @@ -52,4 +59,34 @@ TEST_F(BlobWriterBuilderTest, TestSimple) { ASSERT_OK(builder.Build(output_stream_, "none")); } +TEST_F(BlobWriterBuilderTest, TestWithWriteConsumer) { + std::vector> captured; + BlobWriterBuilder 
builder(struct_type_, {{Options::BLOB_AS_DESCRIPTOR, "false"}}); + builder.WithFileSystem(file_system_); + builder.WithWriteConsumer([&captured](std::unique_ptr descriptor) -> bool { + captured.push_back(std::move(descriptor)); + return true; + }); + + ASSERT_OK_AND_ASSIGN(auto writer, builder.Build(output_stream_, "none")); + + // Build a single-row struct array with raw blob data + arrow::StructBuilder struct_builder(struct_type_, arrow::default_memory_pool(), + {std::make_shared()}); + auto blob_builder = static_cast(struct_builder.field_builder(0)); + ASSERT_TRUE(struct_builder.Append().ok()); + ASSERT_TRUE(blob_builder->Append("hello", 5).ok()); + std::shared_ptr array; + ASSERT_TRUE(struct_builder.Finish(&array).ok()); + + auto c_array = std::make_unique(); + ASSERT_TRUE(arrow::ExportArray(*array, c_array.get()).ok()); + ASSERT_OK(writer->AddBatch(c_array.get())); + + ASSERT_EQ(captured.size(), 1); + ASSERT_TRUE(captured[0]); + ASSERT_EQ(captured[0]->Length(), 5); + ASSERT_OK(writer->Finish()); +} + } // namespace paimon::blob::test diff --git a/src/paimon/format/orc/orc_adapter.cpp b/src/paimon/format/orc/orc_adapter.cpp index b1dd16c2d..d4819e128 100644 --- a/src/paimon/format/orc/orc_adapter.cpp +++ b/src/paimon/format/orc/orc_adapter.cpp @@ -1316,6 +1316,9 @@ arrow::Status WriteBatch(const arrow::Array& array, ::orc::ColumnVectorBatch* co case arrow::Type::type::BINARY: return WriteGenericBatch( array, column_vector_batch); + case arrow::Type::type::LARGE_BINARY: + return WriteGenericBatch( + array, column_vector_batch); case arrow::Type::type::STRING: return WriteGenericBatch( array, column_vector_batch); @@ -1379,6 +1382,7 @@ arrow::Result> GetOrcType(const arrow::DataType& ty case arrow::Type::type::STRING: return ::orc::createPrimitiveType(::orc::TypeKind::STRING); case arrow::Type::type::BINARY: + case arrow::Type::type::LARGE_BINARY: return ::orc::createPrimitiveType(::orc::TypeKind::BINARY); case arrow::Type::type::DATE32: return 
::orc::createPrimitiveType(::orc::TypeKind::DATE); diff --git a/src/paimon/format/orc/orc_adapter_test.cpp b/src/paimon/format/orc/orc_adapter_test.cpp index 51577e1a6..6813d32c2 100644 --- a/src/paimon/format/orc/orc_adapter_test.cpp +++ b/src/paimon/format/orc/orc_adapter_test.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -181,12 +182,13 @@ TEST_F(OrcAdapterTest, TestGetOrcType) { auto col21_field = arrow::field("col21", arrow::timestamp(arrow::TimeUnit::MILLI, timezone)); auto col22_field = arrow::field("col22", arrow::timestamp(arrow::TimeUnit::MICRO, timezone)); auto col23_field = arrow::field("col23", arrow::timestamp(arrow::TimeUnit::NANO, timezone)); + auto col24_field = arrow::field("col24", arrow::large_binary()); auto arrow_schema = std::make_shared(arrow::FieldVector( {col1_field, col2_field, col3_field, col4_field, col5_field, col6_field, col7_field, col8_field, col9_field, col10_field, col11_field, col12_field, col13_field, col14_field, col15_field, col16_field, col17_field, col18_field, - col19_field, col20_field, col21_field, col22_field, col23_field})); + col19_field, col20_field, col21_field, col22_field, col23_field, col24_field})); ASSERT_OK_AND_ASSIGN(std::unique_ptr<::orc::Type> orc_type, OrcAdapter::GetOrcType(*arrow_schema)); ASSERT_TRUE(orc_type); @@ -196,7 +198,7 @@ TEST_F(OrcAdapterTest, TestGetOrcType) { "array,col14:map,col15:timestamp,col16:struct,col17:timestamp,col18:timestamp,col19:timestamp,col20:timestamp " "with local time zone,col21:timestamp with local time zone,col22:timestamp with local time " - "zone,col23:timestamp with local time zone>", + "zone,col23:timestamp with local time zone,col24:binary>", orc_type->toString()); } @@ -206,11 +208,6 @@ TEST_F(OrcAdapterTest, TestGetOrcTypeWithInvalidArrowType) { auto arrow_schema = arrow::schema(arrow::FieldVector({col1_field})); ASSERT_NOK(OrcAdapter::GetOrcType(*arrow_schema)); } - { - auto col1_field = arrow::field("col1", arrow::large_binary()); - auto 
arrow_schema = arrow::schema(arrow::FieldVector({col1_field})); - ASSERT_NOK(OrcAdapter::GetOrcType(*arrow_schema)); - } { auto col1_field = arrow::field("col1", arrow::uint32()); auto arrow_schema = arrow::schema(arrow::FieldVector({col1_field})); @@ -567,6 +564,39 @@ TEST_P(OrcAdapterTest, TestAppendBatchWithBinaryForAllNull) { ASSERT_TRUE(converted_array->Equals(src_array)) << converted_array->ToString(); } +TEST_P(OrcAdapterTest, TestWriteBatchWithLargeBinary) { + arrow::FieldVector fields = {arrow::field("f0", arrow::large_binary())}; + auto src_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([ + ["descriptor-1"], + [""], + [null], + ["descriptor-2"] + ])") + .ValueOrDie()); + + auto [orc_reader_holder, read_batch] = GenerateOrcReadBatch(src_array); + auto* struct_batch = dynamic_cast<::orc::StructVectorBatch*>(read_batch.get()); + ASSERT_TRUE(struct_batch); + ASSERT_EQ(1, struct_batch->fields.size()); + + auto* large_binary_batch = dynamic_cast<::orc::StringVectorBatch*>(struct_batch->fields[0]); + ASSERT_TRUE(large_binary_batch); + ASSERT_EQ(4, large_binary_batch->numElements); + + std::vector expected_values = {"descriptor-1", "", "descriptor-2"}; + ASSERT_TRUE(large_binary_batch->notNull[0]); + ASSERT_EQ(expected_values[0], + std::string(large_binary_batch->data[0], large_binary_batch->length[0])); + ASSERT_TRUE(large_binary_batch->notNull[1]); + ASSERT_EQ(expected_values[1], + std::string(large_binary_batch->data[1], large_binary_batch->length[1])); + ASSERT_FALSE(large_binary_batch->notNull[2]); + ASSERT_TRUE(large_binary_batch->notNull[3]); + ASSERT_EQ(expected_values[2], + std::string(large_binary_batch->data[3], large_binary_batch->length[3])); +} + TEST_P(OrcAdapterTest, TestDecimalAndTimestamp) { auto timezone = DateTimeUtils::GetLocalTimezoneName(); arrow::FieldVector fields = { diff --git a/src/paimon/format/orc/orc_file_batch_reader_test.cpp 
b/src/paimon/format/orc/orc_file_batch_reader_test.cpp index defbf4b55..7e3158c5b 100644 --- a/src/paimon/format/orc/orc_file_batch_reader_test.cpp +++ b/src/paimon/format/orc/orc_file_batch_reader_test.cpp @@ -194,6 +194,50 @@ INSTANTIATE_TEST_SUITE_P(TestParam, OrcFileBatchReaderTest, ::testing::Values(TestParam{128 * 1024, false}, TestParam{16, false}, TestParam{16, true})); +TEST_F(OrcFileBatchReaderTest, TestReadBinaryWrittenFromBinaryAndLargeBinary) { + auto dir = paimon::test::UniqueTestDirectory::Create(); + ASSERT_TRUE(dir); + auto file_system = dir->GetFileSystem(); + + auto check_binary_read_result = [&](const std::shared_ptr& write_type, + const std::string& file_name) { + std::string data_json = R"([ + ["descriptor-1"], + [""], + [null], + ["descriptor-2"] + ])"; + auto write_field = arrow::field("f0", write_type); + auto write_schema = arrow::schema({write_field}); + auto write_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_({write_field}), data_json) + .ValueOrDie()); + + std::string file_path = dir->Str() + "/" + file_name; + WriteArray(file_system, file_path, write_array, write_schema, /*options=*/{}); + + auto read_field = arrow::field("f0", arrow::binary()); + arrow::Schema read_schema({read_field}); + auto orc_batch_reader = PrepareOrcFileBatchReader(file_path, &read_schema, batch_size_, + DEFAULT_NATURAL_READ_SIZE); + + ASSERT_OK_AND_ASSIGN(auto c_file_schema, orc_batch_reader->GetFileSchema()); + auto file_schema = arrow::ImportSchema(c_file_schema.get()).ValueOrDie(); + ASSERT_TRUE(file_schema->Equals(read_schema)); + + auto expected_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_({read_field}), data_json) + .ValueOrDie()); + auto expected_chunked_array = std::make_shared(expected_array); + ASSERT_OK_AND_ASSIGN(auto result_array, paimon::test::ReadResultCollector::CollectResult( + orc_batch_reader.get())); + 
ASSERT_TRUE(result_array->Equals(expected_chunked_array)); + }; + + check_binary_read_result(arrow::binary(), "binary.orc"); + check_binary_read_result(arrow::large_binary(), "large-binary.orc"); +} + TEST_F(OrcFileBatchReaderTest, TestSetReadSchema) { std::string file_name = paimon::test::GetDataDir() + "/orc/append_09.db/append_09/f1=10/bucket-1/" diff --git a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp index 6a41d8c78..f5cf99b9d 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader_test.cpp @@ -161,6 +161,48 @@ class ParquetFileBatchReaderTest : public ::testing::Test, std::shared_ptr struct_array_; }; +TEST_F(ParquetFileBatchReaderTest, TestReadBinaryWrittenFromBinaryAndLargeBinary) { + auto check_binary_read_result = [&](const std::shared_ptr& write_type, + const std::string& file_name) { + std::string data_json = R"([ + ["descriptor-1"], + [""], + [null], + ["descriptor-2"] + ])"; + auto write_field = arrow::field("f0", write_type); + auto write_schema = arrow::schema({write_field}); + auto write_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_({write_field}), data_json) + .ValueOrDie()); + + std::string file_path = PathUtil::JoinPath(dir_->Str(), file_name); + WriteArray(file_path, write_array, write_schema, /*write_batch_size=*/write_array->length(), + /*enable_dictionary=*/false, /*max_row_group_length=*/write_array->length()); + + auto read_field = arrow::field("f0", arrow::binary()); + auto read_schema = arrow::schema({read_field}); + auto parquet_batch_reader = + PrepareParquetFileBatchReader(file_path, read_schema, /*predicate=*/nullptr, + /*selection_bitmap=*/std::nullopt, batch_size_); + + ASSERT_OK_AND_ASSIGN(auto c_file_schema, parquet_batch_reader->GetFileSchema()); + auto file_schema = arrow::ImportSchema(c_file_schema.get()).ValueOrDie(); + 
ASSERT_TRUE(file_schema->Equals(*read_schema)); + + auto expected_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_({read_field}), data_json) + .ValueOrDie()); + auto expected_chunked_array = std::make_shared(expected_array); + ASSERT_OK_AND_ASSIGN(auto result_array, paimon::test::ReadResultCollector::CollectResult( + parquet_batch_reader.get())); + ASSERT_TRUE(result_array->Equals(expected_chunked_array)); + }; + + check_binary_read_result(arrow::binary(), "binary.parquet"); + check_binary_read_result(arrow::large_binary(), "large-binary.parquet"); +} + TEST_F(ParquetFileBatchReaderTest, TestSimple) { std::string file_name = paimon::test::GetDataDir() + "/parquet/parquet_append_table.db/parquet_append_table/bucket-0/" diff --git a/src/paimon/testing/utils/test_helper.h b/src/paimon/testing/utils/test_helper.h index 0fccb85ae..a837ec2a5 100644 --- a/src/paimon/testing/utils/test_helper.h +++ b/src/paimon/testing/utils/test_helper.h @@ -239,74 +239,6 @@ class TestHelper { return result_blobs; } - // need to reconstruct the blob array, because the array in read result do not have blob meta - Result> ReconstructBlobArray( - const std::shared_ptr& array, const std::shared_ptr& schema) { - ::ArrowArray c_array; - PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportArray(*array, &c_array)); - ::ArrowSchema new_c_schema; - PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportSchema(*schema, &new_c_schema)); - PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(auto new_array, - arrow::ImportArray(&c_array, &new_c_schema)); - return new_array; - } - - Result ReadAndCheckResultForBlobTable( - const std::shared_ptr& all_columns_schema, - const std::vector>& splits, const std::string& main_expected_json, - const std::vector>& expected_blob_descriptors) { - ReadContextBuilder read_context_builder(table_path_); - read_context_builder.SetOptions(options_); - PAIMON_ASSIGN_OR_RAISE(std::unique_ptr read_context, - read_context_builder.Finish()); - 
PAIMON_ASSIGN_OR_RAISE(auto table_read, TableRead::Create(std::move(read_context))); - PAIMON_ASSIGN_OR_RAISE(auto batch_reader, table_read->CreateReader(splits)); - PAIMON_ASSIGN_OR_RAISE(auto read_result, - ReadResultCollector::CollectResult(batch_reader.get())); - - PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(auto concat_array, - arrow::Concatenate(read_result->chunks())); - PAIMON_ASSIGN_OR_RAISE(auto reconstruct_array, - ReconstructBlobArray(concat_array, all_columns_schema)); - PAIMON_ASSIGN_OR_RAISE( - auto separated_array, - BlobUtils::SeparateBlobArray( - std::dynamic_pointer_cast(reconstruct_array))); - - arrow::EqualOptions equal_options = arrow::EqualOptions::Defaults(); - - // check main columns - auto separated_schema = BlobUtils::SeparateBlobSchema(all_columns_schema); - PAIMON_ASSIGN_OR_RAISE_FROM_ARROW( - auto main_expected_array, - arrow::ipc::internal::json::ArrayFromJSON( - arrow::struct_(separated_schema.main_schema->fields()), main_expected_json)); - auto main_expected_chunk_array = std::make_shared(main_expected_array); - bool main_equal = main_expected_chunk_array->Equals( - arrow::ChunkedArray(separated_array.main_array), equal_options.diff_sink(&std::cout)); - if (!main_equal) { - std::cout << "[expected_data_type]" << main_expected_chunk_array->type()->ToString() - << std::endl; - std::cout << "[actual_data_type]" << separated_array.main_array->type()->ToString() - << std::endl; - std::cout << "[expected]:" << main_expected_chunk_array->ToString() << std::endl; - std::cout << "[actual]: " << separated_array.main_array->ToString() << std::endl; - } - - // check blob column - std::vector> expected_blobs; - for (const auto& descriptor : expected_blob_descriptors) { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr blob, - Blob::FromDescriptor(descriptor->data(), descriptor->size())); - expected_blobs.emplace_back(blob); - } - PAIMON_ASSIGN_OR_RAISE(auto result_blobs, ToBlobs(separated_array.blob_array)); - PAIMON_ASSIGN_OR_RAISE(bool blob_equal, 
CheckBlobsEqual(result_blobs, expected_blobs, fs_)); - - table_read.reset(); - return main_equal && blob_equal; - } - Result ReadAndCheckResult(const std::shared_ptr& data_type, const std::vector>& splits, const std::string& expected_result) { diff --git a/test/inte/blob_table_inte_test.cpp b/test/inte/blob_table_inte_test.cpp index afe308b8f..75010ebc0 100644 --- a/test/inte/blob_table_inte_test.cpp +++ b/test/inte/blob_table_inte_test.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -76,11 +77,18 @@ class RecordBatch; } // namespace paimon namespace paimon::test { + +struct ReadResult { + std::unique_ptr batch_reader; + std::shared_ptr chunked_array; +}; + class BlobTableInteTest : public testing::Test, public ::testing::WithParamInterface { public: void SetUp() override { pool_ = GetDefaultPool(); dir_ = UniqueTestDirectory::Create("local"); + blob_dir_ = UniqueTestDirectory::Create("local"); } void TearDown() override { @@ -89,7 +97,13 @@ class BlobTableInteTest : public testing::Test, public ::testing::WithParamInter void CreateTable(const std::vector& partition_keys, const std::map& options) const { - auto schema = arrow::schema(fields_); + CreateTable(fields_, partition_keys, options); + } + + void CreateTable(const arrow::FieldVector& fields, + const std::vector& partition_keys, + const std::map& options) const { + auto schema = arrow::schema(fields); ::ArrowSchema c_schema; ASSERT_TRUE(arrow::ExportSchema(*schema, &c_schema).ok()); @@ -160,11 +174,10 @@ class BlobTableInteTest : public testing::Test, public ::testing::WithParamInter return file_store_commit->Commit(commit_msgs); } - Status ScanAndRead(const std::string& table_path, const std::vector& read_schema, - const std::shared_ptr& expected_array, - const std::shared_ptr& predicate = nullptr, - const std::vector& row_ranges = {}) const { - // scan + /// Scan table and return the plan (without reading data). 
+ Result> ScanTable(const std::string& table_path, + const std::shared_ptr& predicate = nullptr, + const std::vector& row_ranges = {}) const { ScanContextBuilder scan_context_builder(table_path); scan_context_builder.SetPredicate(predicate); if (!row_ranges.empty()) { @@ -174,47 +187,72 @@ class BlobTableInteTest : public testing::Test, public ::testing::WithParamInter PAIMON_ASSIGN_OR_RAISE(auto scan_context, scan_context_builder.Finish()); PAIMON_ASSIGN_OR_RAISE(auto table_scan, TableScan::Create(std::move(scan_context))); PAIMON_ASSIGN_OR_RAISE(auto result_plan, table_scan->CreatePlan()); - if (!expected_array) { - EXPECT_TRUE(result_plan->Splits().empty()); - } + return result_plan; + } - // read - auto splits = result_plan->Splits(); + /// Read from table using a pre-scanned plan, returning the ChunkedArray and batch_reader. + /// The batch_reader must outlive the returned ChunkedArray (array memory depends on reader). + Result ReadTable(const std::string& table_path, + const std::vector& read_schema, + const std::shared_ptr& plan, + const std::shared_ptr& predicate = nullptr, + const std::map& options = {}) const { + auto splits = plan->Splits(); ReadContextBuilder read_context_builder(table_path); read_context_builder.SetReadSchema(read_schema).SetPredicate(predicate); + if (!options.empty()) { + read_context_builder.SetOptions(options); + } PAIMON_ASSIGN_OR_RAISE(std::unique_ptr read_context, read_context_builder.Finish()); PAIMON_ASSIGN_OR_RAISE(auto table_read, TableRead::Create(std::move(read_context))); PAIMON_ASSIGN_OR_RAISE(auto batch_reader, table_read->CreateReader(splits)); PAIMON_ASSIGN_OR_RAISE(auto read_result, ReadResultCollector::CollectResult(batch_reader.get())); + return ReadResult{std::move(batch_reader), std::move(read_result)}; + } - if (!expected_array) { - EXPECT_FALSE(read_result); - return Status::OK(); - } - // add row kind array for expected array + /// Convenience: scan + read in one call. 
+ Result ScanAndReadResult(const std::string& table_path, + const std::vector& read_schema, + const std::shared_ptr& predicate = nullptr, + const std::vector& row_ranges = {}) const { + PAIMON_ASSIGN_OR_RAISE(auto result_plan, ScanTable(table_path, predicate, row_ranges)); + return ReadTable(table_path, read_schema, result_plan, predicate); + } + + /// Prepend a _VALUE_KIND (Insert) column to a StructArray. + static Result> PrependRowKindColumn( + const std::shared_ptr& array) { auto row_kind_scalar = std::make_shared(RowKind::Insert()->ToByteValue()); PAIMON_ASSIGN_OR_RAISE_FROM_ARROW( - auto row_kind_array, - arrow::MakeArrayFromScalar(*row_kind_scalar, expected_array->length())); - arrow::ArrayVector expected_with_row_kind_fields = expected_array->fields(); - std::vector expected_with_row_kind_field_names = - arrow::schema(expected_array->type()->fields())->field_names(); - expected_with_row_kind_fields.insert(expected_with_row_kind_fields.begin(), row_kind_array); - expected_with_row_kind_field_names.insert(expected_with_row_kind_field_names.begin(), - "_VALUE_KIND"); - - // check read result + auto row_kind_array, arrow::MakeArrayFromScalar(*row_kind_scalar, array->length())); + arrow::ArrayVector fields_with_row_kind = array->fields(); + std::vector names_with_row_kind = + arrow::schema(array->type()->fields())->field_names(); + fields_with_row_kind.insert(fields_with_row_kind.begin(), row_kind_array); + names_with_row_kind.insert(names_with_row_kind.begin(), "_VALUE_KIND"); PAIMON_ASSIGN_OR_RAISE_FROM_ARROW( - auto expected_with_row_kind_array, - arrow::StructArray::Make(expected_with_row_kind_fields, - expected_with_row_kind_field_names)); - auto expected_chunk_array = - std::make_shared(expected_with_row_kind_array); - EXPECT_TRUE(expected_chunk_array->Equals(read_result)) - << "result:" << read_result->ToString() << std::endl + auto result, arrow::StructArray::Make(fields_with_row_kind, names_with_row_kind)); + return std::dynamic_pointer_cast(result); + 
} + + Status ScanAndRead(const std::string& table_path, const std::vector& read_schema, + const std::shared_ptr& expected_array, + const std::shared_ptr& predicate = nullptr, + const std::vector& row_ranges = {}) const { + PAIMON_ASSIGN_OR_RAISE(auto scan_read, + ScanAndReadResult(table_path, read_schema, predicate, row_ranges)); + + if (!expected_array) { + EXPECT_FALSE(scan_read.chunked_array); + return Status::OK(); + } + PAIMON_ASSIGN_OR_RAISE(auto expected_with_row_kind, PrependRowKindColumn(expected_array)); + auto expected_chunk_array = std::make_shared(expected_with_row_kind); + EXPECT_TRUE(expected_chunk_array->Equals(scan_read.chunked_array)) + << "result:" << scan_read.chunked_array->ToString() << std::endl << "expected:" << expected_chunk_array->ToString(); return Status::OK(); } @@ -236,9 +274,133 @@ class BlobTableInteTest : public testing::Test, public ::testing::WithParamInter .ValueOrDie()); } + /// Convert a StructArray with raw blob bytes into a StructArray with serialized + /// BlobDescriptor bytes. Each raw blob value is written to a temporary file, and + /// the corresponding cell is replaced with the serialized BlobDescriptor pointing + /// to that file. + /// Common framework for transforming blob fields in a StructArray. + /// Non-blob fields are kept as-is; blob fields are processed row-by-row via `transform_row`. + /// `transform_row` receives (binary_value_view) and returns the transformed bytes via builder. 
+ using BlobRowTransform = + std::function; + + Result> TransformBlobFields( + const std::shared_ptr& input_array, + const std::set& blob_fields, BlobRowTransform transform_row) const { + auto fields = input_array->type()->fields(); + arrow::ArrayVector child_arrays; + + for (const auto& field : fields) { + auto col = input_array->GetFieldByName(field->name()); + if (blob_fields.count(field->name()) == 0) { + child_arrays.push_back(col); + continue; + } + const auto& binary_array = + arrow::internal::checked_cast(*col); + arrow::LargeBinaryBuilder builder; + for (int64_t i = 0; i < binary_array.length(); ++i) { + if (binary_array.IsNull(i)) { + PAIMON_RETURN_NOT_OK_FROM_ARROW(builder.AppendNull()); + continue; + } + PAIMON_RETURN_NOT_OK(transform_row(binary_array.GetView(i), &builder)); + } + std::shared_ptr result_col; + PAIMON_RETURN_NOT_OK_FROM_ARROW(builder.Finish(&result_col)); + child_arrays.push_back(result_col); + } + + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(auto result, + arrow::StructArray::Make(child_arrays, fields)); + return result; + } + + Result> ConvertRawBlobToDescriptor( + const std::shared_ptr& raw_array, + const std::set& blob_fields) { + auto fs = std::make_shared(); + return TransformBlobFields( + raw_array, blob_fields, + [&](const std::string_view& raw_value, arrow::LargeBinaryBuilder* builder) -> Status { + std::string file_path = + blob_dir_->Str() + "/blob_" + std::to_string(blob_file_counter_++) + ".bin"; + PAIMON_ASSIGN_OR_RAISE(auto out, fs->Create(file_path, /*overwrite=*/true)); + PAIMON_ASSIGN_OR_RAISE( + auto written, + out->Write(raw_value.data(), static_cast(raw_value.size()))); + PAIMON_RETURN_NOT_OK(out->Flush()); + PAIMON_RETURN_NOT_OK(out->Close()); + if (static_cast(written) != raw_value.size()) { + return Status::Invalid("Short write: expected {}, wrote {}", raw_value.size(), + written); + } + PAIMON_ASSIGN_OR_RAISE(auto blob, Blob::FromPath(file_path)); + auto descriptor = blob->ToDescriptor(pool_); + 
PAIMON_RETURN_NOT_OK_FROM_ARROW( + builder->Append(descriptor->data(), descriptor->size())); + return Status::OK(); + }); + } + + /// Convert a StructArray with serialized BlobDescriptor bytes back to a StructArray + /// with raw blob bytes. Only blob fields are resolved; other columns (including + /// _VALUE_KIND) are kept as-is. + Result> ConvertDescriptorToRawBlob( + const std::shared_ptr& desc_array, + const std::set& blob_fields) const { + auto fs = std::make_shared(); + return TransformBlobFields( + desc_array, blob_fields, + [&](const std::string_view& descriptor_bytes, + arrow::LargeBinaryBuilder* builder) -> Status { + PAIMON_ASSIGN_OR_RAISE(auto blob, Blob::FromDescriptor(descriptor_bytes.data(), + descriptor_bytes.size())); + PAIMON_ASSIGN_OR_RAISE(auto data, blob->ToData(fs, pool_)); + PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->Append(data->data(), data->size())); + return Status::OK(); + }); + } + + /// Verify DataFileMeta properties from a scan plan. + /// Each vector element corresponds to one expected DataFileMeta (ordered by file index). 
+ static void VerifyDataFileMetas( + const std::shared_ptr& plan, size_t expected_file_count, + const std::vector& expected_row_counts, + const std::vector& expected_min_seqs, + const std::vector& expected_max_seqs, + const std::vector& expected_first_row_ids, + const std::vector>>& expected_write_cols) { + std::vector> all_files; + for (const auto& split : plan->Splits()) { + auto data_split = std::dynamic_pointer_cast(split); + ASSERT_TRUE(data_split); + for (const auto& file : data_split->DataFiles()) { + all_files.push_back(file); + } + } + ASSERT_EQ(all_files.size(), expected_file_count); + ASSERT_EQ(expected_row_counts.size(), expected_file_count); + ASSERT_EQ(expected_min_seqs.size(), expected_file_count); + ASSERT_EQ(expected_max_seqs.size(), expected_file_count); + ASSERT_EQ(expected_first_row_ids.size(), expected_file_count); + ASSERT_EQ(expected_write_cols.size(), expected_file_count); + for (size_t i = 0; i < all_files.size(); ++i) { + const auto& file = all_files[i]; + EXPECT_EQ(file->row_count, expected_row_counts[i]); + EXPECT_EQ(file->min_sequence_number, expected_min_seqs[i]); + EXPECT_EQ(file->max_sequence_number, expected_max_seqs[i]); + ASSERT_TRUE(file->first_row_id.has_value()); + EXPECT_EQ(file->first_row_id.value(), expected_first_row_ids[i]); + EXPECT_EQ(file->write_cols, expected_write_cols[i]); + } + } + private: std::shared_ptr pool_; std::unique_ptr dir_; + std::unique_ptr blob_dir_; + int blob_file_counter_ = 0; arrow::FieldVector fields_ = {arrow::field("f0", arrow::int32()), BlobUtils::ToArrowField("f1"), arrow::field("f2", arrow::utf8())}; }; @@ -261,142 +423,75 @@ INSTANTIATE_TEST_SUITE_P(FileFormat, BlobTableInteTest, ::testing::ValuesIn(GetTestValuesForBlobTableInteTest())); TEST_P(BlobTableInteTest, TestAppendTableWriteWithBlobAsDescriptorTrue) { - auto dir = UniqueTestDirectory::Create(); arrow::FieldVector fields = {arrow::field("f0", arrow::utf8()), arrow::field("f1", arrow::int32()), BlobUtils::ToArrowField("blob", true)}; - 
auto schema = arrow::schema(fields); - auto file_format = GetParam(); std::map options = { - {Options::MANIFEST_FORMAT, "orc"}, {Options::FILE_FORMAT, file_format}, + {Options::MANIFEST_FORMAT, "orc"}, {Options::FILE_FORMAT, GetParam()}, {Options::TARGET_FILE_SIZE, "700"}, {Options::BUCKET, "-1"}, {Options::ROW_TRACKING_ENABLED, "true"}, {Options::DATA_EVOLUTION_ENABLED, "true"}, {Options::BLOB_AS_DESCRIPTOR, "true"}, {Options::FILE_SYSTEM, "local"}}; + CreateTable(fields, /*partition_keys=*/{}, options); + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); - ASSERT_OK_AND_ASSIGN( - auto helper, TestHelper::Create(dir->Str(), schema, /*partition_keys=*/{}, - /*primary_keys=*/{}, options, /*is_streaming_mode=*/true)); - int64_t commit_identifier = 0; - - auto generate_blob_array = [&](const std::vector>& blob_descriptors) - -> std::shared_ptr { - arrow::StructBuilder struct_builder( - arrow::struct_(fields), arrow::default_memory_pool(), - {std::make_shared(), std::make_shared(), - std::make_shared()}); - auto string_builder = static_cast(struct_builder.field_builder(0)); - auto int_builder = static_cast(struct_builder.field_builder(1)); - auto binary_builder = - static_cast(struct_builder.field_builder(2)); - for (size_t i = 0; i < blob_descriptors.size(); ++i) { - EXPECT_TRUE(struct_builder.Append().ok()); - EXPECT_TRUE(string_builder->Append("str_" + std::to_string(i)).ok()); - if (i % 3 == 0) { - // test null - EXPECT_TRUE(int_builder->AppendNull().ok()); - } else { - EXPECT_TRUE(int_builder->Append(i).ok()); - } - EXPECT_TRUE( - binary_builder->Append(blob_descriptors[i]->data(), blob_descriptors[i]->size()) - .ok()); - } - std::shared_ptr array; - EXPECT_TRUE(struct_builder.Finish(&array).ok()); - return array; - }; - - // prepare data - std::vector> expected_blob_descriptors; - std::string file1 = paimon::test::GetDataDir() + "/avro/data/avro_with_null"; - ASSERT_OK_AND_ASSIGN(auto blob1, Blob::FromPath(file1)); - 
expected_blob_descriptors.emplace_back(blob1->ToDescriptor(pool_)); - - std::string file2 = paimon::test::GetDataDir() + "/xxhash.data"; - ASSERT_OK_AND_ASSIGN(auto blob2, Blob::FromPath(file2, /*offset=*/0, /*length=*/91)); - expected_blob_descriptors.emplace_back(blob2->ToDescriptor(pool_)); - ASSERT_OK_AND_ASSIGN(auto blob3, Blob::FromPath(file2, /*offset=*/92, /*length=*/85)); - expected_blob_descriptors.emplace_back(blob3->ToDescriptor(pool_)); - ASSERT_OK_AND_ASSIGN(auto blob4, Blob::FromPath(file2, /*offset=*/300, /*length=*/3000)); - expected_blob_descriptors.emplace_back(blob4->ToDescriptor(pool_)); - - auto array = generate_blob_array(expected_blob_descriptors); - ::ArrowArray arrow_array; - ASSERT_TRUE(arrow::ExportArray(*array, &arrow_array).ok()); - RecordBatchBuilder batch_builder(&arrow_array); - ASSERT_OK_AND_ASSIGN(std::unique_ptr batch, batch_builder.Finish()); + // prepare data: input uses plain raw blob bytes for readability + std::string raw_json = R"([ + ["str_0", null, "hello_blob_0"], + ["str_1", 1, "blob_data_1"], + ["str_2", 2, "blob_data_2"], + ["str_3", null, "blob_data_3"] + ])"; + auto raw_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), raw_json).ValueOrDie()); + ASSERT_OK_AND_ASSIGN(auto desc_array, ConvertRawBlobToDescriptor(raw_array, {"blob"})); + // write descriptor array + auto schema = arrow::schema(fields); ASSERT_OK_AND_ASSIGN(auto commit_msgs, - helper->WriteAndCommit(std::move(batch), commit_identifier++, - /*expected_commit_messages=*/std::nullopt)); - - arrow::FieldVector fields_with_row_kind = fields; - fields_with_row_kind.insert(fields_with_row_kind.begin(), - arrow::field("_VALUE_KIND", arrow::int8())); - auto schema_with_row_kind = arrow::schema(fields_with_row_kind); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits, - helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); - std::string expected_data = R"([ - [0, "str_0", null], - [0, "str_1", 
1], - [0, "str_2", 2], - [0, "str_3", null] - ])"; - ASSERT_OK_AND_ASSIGN(bool success, helper->ReadAndCheckResultForBlobTable( - schema_with_row_kind, data_splits, expected_data, - expected_blob_descriptors)); - ASSERT_TRUE(success); + WriteArray(table_path, {}, schema->field_names(), {desc_array})); + ASSERT_OK(Commit(table_path, commit_msgs)); + + // read result contains descriptors pointing to paimon internal blob files + // resolve descriptors back to raw bytes, then prepend _VALUE_KIND and compare + ASSERT_OK_AND_ASSIGN(auto result, ScanAndReadResult(table_path, schema->field_names())); + ASSERT_TRUE(result.chunked_array); + auto read_concat = arrow::Concatenate(result.chunked_array->chunks()).ValueOrDie(); + auto read_struct = std::dynamic_pointer_cast(read_concat); + ASSERT_OK_AND_ASSIGN(auto resolved, ConvertDescriptorToRawBlob(read_struct, {"blob"})); + ASSERT_OK_AND_ASSIGN(auto expected_with_rk, PrependRowKindColumn(raw_array)); + ASSERT_TRUE(resolved->Equals(expected_with_rk)); } TEST_P(BlobTableInteTest, TestAppendTableWriteWithBlobAsDescriptorFalse) { - auto dir = UniqueTestDirectory::Create(); arrow::FieldVector fields = {arrow::field("f0", arrow::utf8()), arrow::field("f1", arrow::int32()), BlobUtils::ToArrowField("blob", true)}; - auto schema = arrow::schema(fields); - auto file_format = GetParam(); std::map options = { - {Options::MANIFEST_FORMAT, "orc"}, {Options::FILE_FORMAT, file_format}, + {Options::MANIFEST_FORMAT, "orc"}, {Options::FILE_FORMAT, GetParam()}, {Options::TARGET_FILE_SIZE, "700"}, {Options::BUCKET, "-1"}, {Options::ROW_TRACKING_ENABLED, "true"}, {Options::DATA_EVOLUTION_ENABLED, "true"}, {Options::BLOB_AS_DESCRIPTOR, "false"}, {Options::FILE_SYSTEM, "local"}}; + CreateTable(fields, /*partition_keys=*/{}, options); + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); - ASSERT_OK_AND_ASSIGN( - auto helper, TestHelper::Create(dir->Str(), schema, /*partition_keys=*/{}, - /*primary_keys=*/{}, options, 
/*is_streaming_mode=*/true)); - int64_t commit_identifier = 0; - - std::string data = R"([ + std::string data_json = R"([ ["str_0", null, "apple"], ["str_1", 1, "banana"], ["str_2", 2, "cat"], ["str_3", null, "dog"] ])"; - ASSERT_OK_AND_ASSIGN(std::unique_ptr batch, - TestHelper::MakeRecordBatch(arrow::struct_(fields), data, - /*partition_map=*/{}, /*bucket=*/0, {})); + auto write_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), data_json).ValueOrDie()); + auto schema = arrow::schema(fields); ASSERT_OK_AND_ASSIGN(auto commit_msgs, - helper->WriteAndCommit(std::move(batch), commit_identifier++, - /*expected_commit_messages=*/std::nullopt)); - - arrow::FieldVector fields_with_row_kind = fields; - fields_with_row_kind.insert(fields_with_row_kind.begin(), - arrow::field("_VALUE_KIND", arrow::int8())); - auto data_type = arrow::struct_(fields_with_row_kind); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits, - helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); - std::string expected_data = R"([ - [0, "str_0", null, "apple"], - [0, "str_1", 1, "banana"], - [0, "str_2", 2, "cat"], - [0, "str_3", null, "dog"] - ])"; - ASSERT_OK_AND_ASSIGN(bool success, - helper->ReadAndCheckResult(data_type, data_splits, expected_data)); - ASSERT_TRUE(success); + WriteArray(table_path, {}, schema->field_names(), {write_array})); + ASSERT_OK(Commit(table_path, commit_msgs)); + + // BLOB_AS_DESCRIPTOR=false: blob data is stored inline, read result should match input + ASSERT_OK(ScanAndRead(table_path, schema->field_names(), write_array)); } TEST_P(BlobTableInteTest, TestBasic) { @@ -591,7 +686,7 @@ TEST_P(BlobTableInteTest, TestOnlySomeColumns) { ])") .ValueOrDie()); ASSERT_NOK_WITH_MSG(WriteArray(table_path, {}, write_cols1, {src_array1}), - "Can't infer struct array length with 0 child arrays"); + "SeparateBlobArray expects at least one main field, but got none."); } TEST_P(BlobTableInteTest, 
TestMultipleAppendsDifferentFirstRowIds) { @@ -1298,7 +1393,6 @@ TEST_P(BlobTableInteTest, TestWithRowIdsForMultipleBlobFiles) { {Options::BUCKET, "-1"}, {Options::ROW_TRACKING_ENABLED, "true"}, {Options::DATA_EVOLUTION_ENABLED, "true"}, - {Options::BLOB_AS_DESCRIPTOR, "false"}, {Options::FILE_SYSTEM, "local"}}; CreateTable(/*partition_keys=*/{}, options); std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); @@ -1398,107 +1492,60 @@ TEST_P(BlobTableInteTest, TestWithRowIdsForMultipleBlobFiles) { } TEST_P(BlobTableInteTest, TestAppendTableWriteWithMultipleBlobFields) { - auto dir = UniqueTestDirectory::Create(); arrow::FieldVector fields = { arrow::field("f0", arrow::utf8()), arrow::field("f1", arrow::int32()), BlobUtils::ToArrowField("blob1", true), BlobUtils::ToArrowField("blob2", true)}; - auto schema = arrow::schema(fields); - auto file_format = GetParam(); std::map options = { - {Options::MANIFEST_FORMAT, "orc"}, {Options::FILE_FORMAT, file_format}, + {Options::MANIFEST_FORMAT, "orc"}, {Options::FILE_FORMAT, GetParam()}, {Options::TARGET_FILE_SIZE, "700"}, {Options::BUCKET, "-1"}, {Options::ROW_TRACKING_ENABLED, "true"}, {Options::DATA_EVOLUTION_ENABLED, "true"}, - {Options::BLOB_AS_DESCRIPTOR, "false"}, {Options::FILE_SYSTEM, "local"}}; - - ASSERT_OK_AND_ASSIGN( - auto helper, TestHelper::Create(dir->Str(), schema, /*partition_keys=*/{}, - /*primary_keys=*/{}, options, /*is_streaming_mode=*/true)); - int64_t commit_identifier = 0; + {Options::FILE_SYSTEM, "local"}}; + CreateTable(fields, /*partition_keys=*/{}, options); + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); - std::string data = R"([ + std::string data_json = R"([ ["str_0", null, "apple", "red"], ["str_1", 1, "banana", "yellow"], ["str_2", 2, "cat", "black"] ])"; - ASSERT_OK_AND_ASSIGN(std::unique_ptr batch, - TestHelper::MakeRecordBatch(arrow::struct_(fields), data, - /*partition_map=*/{}, /*bucket=*/0, {})); + auto write_array = std::dynamic_pointer_cast( 
+ arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), data_json).ValueOrDie()); + auto schema = arrow::schema(fields); ASSERT_OK_AND_ASSIGN(auto commit_msgs, - helper->WriteAndCommit(std::move(batch), commit_identifier++, - /*expected_commit_messages=*/std::nullopt)); - ASSERT_EQ(commit_msgs.size(), 1); - - ASSERT_OK_AND_ASSIGN(std::optional snapshot, helper->LatestSnapshot()); - ASSERT_TRUE(snapshot); - ASSERT_EQ(1, snapshot.value().Id()); - ASSERT_EQ(3, snapshot.value().NextRowId().value()); - - // Scan and read: verify all fields including multiple blob fields - arrow::FieldVector fields_with_row_kind = fields; - fields_with_row_kind.insert(fields_with_row_kind.begin(), - arrow::field("_VALUE_KIND", arrow::int8())); - auto data_type = arrow::struct_(fields_with_row_kind); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits, - helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); - std::string expected_data = R"([ - [0, "str_0", null, "apple", "red"], - [0, "str_1", 1, "banana", "yellow"], - [0, "str_2", 2, "cat", "black"] - ])"; - ASSERT_OK_AND_ASSIGN(bool success, - helper->ReadAndCheckResult(data_type, data_splits, expected_data)); - ASSERT_TRUE(success); + WriteArray(table_path, {}, schema->field_names(), {write_array})); + ASSERT_OK(Commit(table_path, commit_msgs)); + ASSERT_OK(ScanAndRead(table_path, schema->field_names(), write_array)); } TEST_P(BlobTableInteTest, TestAppendWriteWithNullBlob) { - auto dir = UniqueTestDirectory::Create(); arrow::FieldVector fields = {arrow::field("f0", arrow::int32()), BlobUtils::ToArrowField("blob", true)}; - auto schema = arrow::schema(fields); - auto file_format = GetParam(); std::map options = {{Options::MANIFEST_FORMAT, "orc"}, - {Options::FILE_FORMAT, file_format}, + {Options::FILE_FORMAT, GetParam()}, {Options::BUCKET, "-1"}, {Options::FILE_SYSTEM, "local"}, {Options::ROW_TRACKING_ENABLED, "true"}, - {Options::DATA_EVOLUTION_ENABLED, "true"}, - {Options::BLOB_AS_DESCRIPTOR, "false"}}; - 
- ASSERT_OK_AND_ASSIGN( - auto helper, TestHelper::Create(dir->Str(), schema, /*partition_keys=*/{}, - /*primary_keys=*/{}, options, /*is_streaming_mode=*/true)); + {Options::DATA_EVOLUTION_ENABLED, "true"}}; + CreateTable(fields, /*partition_keys=*/{}, options); + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); // Write: row 0 non-null blob, row 1 null blob, row 2 non-null blob - std::string data = R"([ + std::string data_json = R"([ [1, "hello"], [2, null], [3, "world"] ])"; - ASSERT_OK_AND_ASSIGN(std::unique_ptr batch, - TestHelper::MakeRecordBatch(arrow::struct_(fields), data, - /*partition_map=*/{}, /*bucket=*/0, {})); + auto write_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), data_json).ValueOrDie()); + + auto schema = arrow::schema(fields); ASSERT_OK_AND_ASSIGN(auto commit_msgs, - helper->WriteAndCommit(std::move(batch), /*commit_identifier=*/0, - /*expected_commit_messages=*/std::nullopt)); - - // Read and verify - arrow::FieldVector fields_with_row_kind = fields; - fields_with_row_kind.insert(fields_with_row_kind.begin(), - arrow::field("_VALUE_KIND", arrow::int8())); - auto data_type = arrow::struct_(fields_with_row_kind); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits, - helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); - std::string expected_data = R"([ - [0, 1, "hello"], - [0, 2, null], - [0, 3, "world"] - ])"; - ASSERT_OK_AND_ASSIGN(bool success, - helper->ReadAndCheckResult(data_type, data_splits, expected_data)); - ASSERT_TRUE(success); + WriteArray(table_path, {}, schema->field_names(), {write_array})); + ASSERT_OK(Commit(table_path, commit_msgs)); + ASSERT_OK(ScanAndRead(table_path, schema->field_names(), write_array)); } TEST_P(BlobTableInteTest, TestReadTableWithMultiBlobFields) { @@ -1574,4 +1621,767 @@ TEST_P(BlobTableInteTest, TestReadTableWithMultiBlobFields) { } } +TEST_P(BlobTableInteTest, 
TestBlobDescriptorFieldWithoutExternalStorage) { + if (GetParam() == "lance") { + return; + } + // Two blob fields configured via BLOB_DESCRIPTOR_FIELD, no external storage. + arrow::FieldVector fields = {arrow::field("f0", arrow::int32()), + BlobUtils::ToArrowField("b0", true), + BlobUtils::ToArrowField("b1", true)}; + + std::map options = { + {Options::MANIFEST_FORMAT, "orc"}, {Options::FILE_FORMAT, GetParam()}, + {Options::TARGET_FILE_SIZE, "700"}, {Options::BUCKET, "-1"}, + {Options::ROW_TRACKING_ENABLED, "true"}, {Options::DATA_EVOLUTION_ENABLED, "true"}, + {Options::BLOB_DESCRIPTOR_FIELD, "b0,b1"}, {Options::FILE_SYSTEM, "local"}}; + CreateTable(fields, /*partition_keys=*/{}, options); + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); + + // Input uses plain raw bytes for readability + std::string raw_json = R"([ + [1, "image_data_0", "video_data_0"], + [2, "image_data_1", "video_data_1"], + [3, "image_data_2", "video_data_2"] + ])"; + auto raw_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), raw_json).ValueOrDie()); + ASSERT_OK_AND_ASSIGN(auto desc_array, ConvertRawBlobToDescriptor(raw_array, {"b0", "b1"})); + + // write descriptor array + auto schema = arrow::schema(fields); + ASSERT_OK_AND_ASSIGN(auto commit_msgs, + WriteArray(table_path, {}, schema->field_names(), {desc_array})); + ASSERT_OK(Commit(table_path, commit_msgs)); + + // Scan and verify DataFileMeta: no external storage -> write_cols should be nullopt + ASSERT_OK_AND_ASSIGN(auto plan, ScanTable(table_path)); + VerifyDataFileMetas(plan, /*expected_file_count=*/1, /*expected_row_counts=*/{3}, + /*expected_min_seqs=*/{1}, /*expected_max_seqs=*/{1}, + /*expected_first_row_ids=*/{0}, + /*expected_write_cols=*/{std::nullopt}); + + // Read and resolve descriptors back to raw bytes + std::map read_options = {}; + ASSERT_OK_AND_ASSIGN(auto result, ReadTable(table_path, schema->field_names(), plan, + /*predicate=*/nullptr, 
read_options)); + ASSERT_TRUE(result.chunked_array); + auto read_concat = arrow::Concatenate(result.chunked_array->chunks()).ValueOrDie(); + auto read_struct = std::dynamic_pointer_cast(read_concat); + ASSERT_OK_AND_ASSIGN(auto resolved, ConvertDescriptorToRawBlob(read_struct, {"b0", "b1"})); + ASSERT_OK_AND_ASSIGN(auto expected_with_rk, PrependRowKindColumn(raw_array)); + ASSERT_TRUE(resolved->Equals(expected_with_rk)); + + // Descriptor bytes should be unchanged (inline, not repacked) + ASSERT_TRUE(read_struct->GetFieldByName("b0")->Equals(desc_array->GetFieldByName("b0"))); + ASSERT_TRUE(read_struct->GetFieldByName("b1")->Equals(desc_array->GetFieldByName("b1"))); +} + +TEST_P(BlobTableInteTest, TestBlobDescriptorFieldWithExternalStorage) { + if (GetParam() == "lance") { + return; + } + // Two blob fields configured via BLOB_DESCRIPTOR_FIELD + BLOB_EXTERNAL_STORAGE_FIELD + // with BLOB_EXTERNAL_STORAGE_PATH pointing to blob_dir_. + arrow::FieldVector fields = {arrow::field("f0", arrow::int32()), + BlobUtils::ToArrowField("b0", true), + BlobUtils::ToArrowField("b1", true)}; + + std::map options = { + {Options::MANIFEST_FORMAT, "orc"}, + {Options::FILE_FORMAT, GetParam()}, + {Options::TARGET_FILE_SIZE, "700"}, + {Options::BUCKET, "-1"}, + {Options::ROW_TRACKING_ENABLED, "true"}, + {Options::DATA_EVOLUTION_ENABLED, "true"}, + {Options::BLOB_DESCRIPTOR_FIELD, "b0,b1"}, + {Options::BLOB_EXTERNAL_STORAGE_FIELD, "b0,b1"}, + {Options::BLOB_EXTERNAL_STORAGE_PATH, blob_dir_->Str()}, + {Options::FILE_SYSTEM, "local"}}; + CreateTable(fields, /*partition_keys=*/{}, options); + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); + + // Input uses plain raw bytes for readability + std::string raw_json = R"([ + [1, "image_data_0", "video_data_0"], + [2, "image_data_1", "video_data_1"], + [3, "image_data_2", "video_data_2"] + ])"; + auto raw_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), 
raw_json).ValueOrDie()); + ASSERT_OK_AND_ASSIGN(auto desc_array, ConvertRawBlobToDescriptor(raw_array, {"b0", "b1"})); + + // write descriptor array + auto schema = arrow::schema(fields); + ASSERT_OK_AND_ASSIGN(auto commit_msgs, + WriteArray(table_path, {}, schema->field_names(), {desc_array})); + ASSERT_OK(Commit(table_path, commit_msgs)); + + // Scan and verify DataFileMeta: with external storage -> write_cols should be explicit + ASSERT_OK_AND_ASSIGN(auto plan, ScanTable(table_path)); + VerifyDataFileMetas(plan, /*expected_file_count=*/1, /*expected_row_counts=*/{3}, + /*expected_min_seqs=*/{1}, /*expected_max_seqs=*/{1}, + /*expected_first_row_ids=*/{0}, + /*expected_write_cols=*/{std::vector{"f0", "b0", "b1"}}); + + // Read and resolve descriptors back to raw bytes + std::map read_options = {{Options::BLOB_AS_DESCRIPTOR, "true"}}; + ASSERT_OK_AND_ASSIGN(auto result, ReadTable(table_path, schema->field_names(), plan, + /*predicate=*/nullptr, read_options)); + ASSERT_TRUE(result.chunked_array); + auto read_concat = arrow::Concatenate(result.chunked_array->chunks()).ValueOrDie(); + auto read_struct = std::dynamic_pointer_cast(read_concat); + ASSERT_OK_AND_ASSIGN(auto resolved, ConvertDescriptorToRawBlob(read_struct, {"b0", "b1"})); + ASSERT_OK_AND_ASSIGN(auto expected_with_rk, PrependRowKindColumn(raw_array)); + ASSERT_TRUE(resolved->Equals(expected_with_rk)); + + // Descriptor bytes should differ (repacked by external storage) + ASSERT_FALSE(read_struct->GetFieldByName("b0")->Equals(desc_array->GetFieldByName("b0"))); + ASSERT_FALSE(read_struct->GetFieldByName("b1")->Equals(desc_array->GetFieldByName("b1"))); +} + +TEST_P(BlobTableInteTest, TestBlobDescriptorFieldPartialExternalStorage) { + if (GetParam() == "lance") { + return; + } + // 4 blob fields: b0,b1 have external storage, b2,b3 are descriptor-only (no external storage). 
+ arrow::FieldVector fields = { + arrow::field("f0", arrow::int32()), BlobUtils::ToArrowField("b0", true), + BlobUtils::ToArrowField("b1", true), BlobUtils::ToArrowField("b2", true), + BlobUtils::ToArrowField("b3", true)}; + + std::map options = { + {Options::MANIFEST_FORMAT, "orc"}, + {Options::FILE_FORMAT, GetParam()}, + {Options::TARGET_FILE_SIZE, "700"}, + {Options::BUCKET, "-1"}, + {Options::ROW_TRACKING_ENABLED, "true"}, + {Options::DATA_EVOLUTION_ENABLED, "true"}, + {Options::BLOB_DESCRIPTOR_FIELD, "b0,b1,b2,b3"}, + {Options::BLOB_EXTERNAL_STORAGE_FIELD, "b0,b1"}, + {Options::BLOB_EXTERNAL_STORAGE_PATH, blob_dir_->Str()}, + {Options::FILE_SYSTEM, "local"}}; + CreateTable(fields, /*partition_keys=*/{}, options); + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); + + // Input uses plain raw bytes for readability; some blob fields are null + std::string raw_json = R"([ + [1, "img_0", null, "doc_0", "log_0"], + [2, null, "vid_1", null, "log_1"], + [3, "img_2", "vid_2", "doc_2", null ] + ])"; + auto raw_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), raw_json).ValueOrDie()); + ASSERT_OK_AND_ASSIGN(auto desc_array, + ConvertRawBlobToDescriptor(raw_array, {"b0", "b1", "b2", "b3"})); + + // write descriptor array + auto schema = arrow::schema(fields); + ASSERT_OK_AND_ASSIGN(auto commit_msgs, + WriteArray(table_path, {}, schema->field_names(), {desc_array})); + ASSERT_OK(Commit(table_path, commit_msgs)); + + // Scan and verify DataFileMeta: external storage on b0,b1 -> write_cols should be explicit + ASSERT_OK_AND_ASSIGN(auto plan, ScanTable(table_path)); + VerifyDataFileMetas( + plan, /*expected_file_count=*/1, /*expected_row_counts=*/{3}, + /*expected_min_seqs=*/{1}, /*expected_max_seqs=*/{1}, + /*expected_first_row_ids=*/{0}, + /*expected_write_cols=*/{std::vector{"f0", "b0", "b1", "b2", "b3"}}); + + // Read and resolve all descriptors back to raw bytes + std::map read_options = 
{{Options::BLOB_AS_DESCRIPTOR, "true"}}; + ASSERT_OK_AND_ASSIGN(auto result, ReadTable(table_path, schema->field_names(), plan, + /*predicate=*/nullptr, read_options)); + ASSERT_TRUE(result.chunked_array); + auto read_concat = arrow::Concatenate(result.chunked_array->chunks()).ValueOrDie(); + auto read_struct = std::dynamic_pointer_cast(read_concat); + ASSERT_OK_AND_ASSIGN(auto resolved, + ConvertDescriptorToRawBlob(read_struct, {"b0", "b1", "b2", "b3"})); + ASSERT_OK_AND_ASSIGN(auto expected_with_rk, PrependRowKindColumn(raw_array)); + ASSERT_TRUE(resolved->Equals(expected_with_rk)); + + // b0,b1 repacked by external storage, should differ + ASSERT_FALSE(read_struct->GetFieldByName("b0")->Equals(desc_array->GetFieldByName("b0"))); + ASSERT_FALSE(read_struct->GetFieldByName("b1")->Equals(desc_array->GetFieldByName("b1"))); + // b2,b3 inline descriptor, should match + ASSERT_TRUE(read_struct->GetFieldByName("b2")->Equals(desc_array->GetFieldByName("b2"))); + ASSERT_TRUE(read_struct->GetFieldByName("b3")->Equals(desc_array->GetFieldByName("b3"))); +} + +TEST_P(BlobTableInteTest, TestBlobDescriptorFieldPartialInline) { + if (GetParam() == "lance") { + return; + } + // 4 blob fields: b0,b1 are descriptor (inline), b2,b3 are regular blob (written to .blob + // files). No external storage. 
+ arrow::FieldVector fields = { + arrow::field("f0", arrow::int32()), BlobUtils::ToArrowField("b0", true), + BlobUtils::ToArrowField("b1", true), BlobUtils::ToArrowField("b2", true), + BlobUtils::ToArrowField("b3", true)}; + + std::map options = { + {Options::MANIFEST_FORMAT, "orc"}, {Options::FILE_FORMAT, GetParam()}, + {Options::TARGET_FILE_SIZE, "700"}, {Options::BUCKET, "-1"}, + {Options::ROW_TRACKING_ENABLED, "true"}, {Options::DATA_EVOLUTION_ENABLED, "true"}, + {Options::BLOB_DESCRIPTOR_FIELD, "b0,b1"}, {Options::FILE_SYSTEM, "local"}}; + CreateTable(fields, /*partition_keys=*/{}, options); + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); + + // Input uses plain raw bytes: + // b0: all non-null, b1: has nulls, b2: all non-null, b3: has nulls + std::string raw_json = R"([ + [1, "img_0", null, "raw_2_0", "raw_3_0"], + [2, "img_1", "vid_1", "raw_2_1", null ], + [3, "img_2", null, "raw_2_2", "raw_3_2" ] + ])"; + auto raw_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), raw_json).ValueOrDie()); + ASSERT_OK_AND_ASSIGN(auto desc_array, + ConvertRawBlobToDescriptor(raw_array, {"b0", "b1", "b2", "b3"})); + + // write: b0,b1 as descriptor bytes; b2,b3 as raw bytes (paimon writes them to .blob files) + auto schema = arrow::schema(fields); + ASSERT_OK_AND_ASSIGN(auto commit_msgs, + WriteArray(table_path, {}, schema->field_names(), {desc_array})); + ASSERT_OK(Commit(table_path, commit_msgs)); + + // Scan and verify DataFileMeta: b2,b3 go to .blob files, "f0", "b0", "b1" go to main files. 
+ ASSERT_OK_AND_ASSIGN(auto plan, ScanTable(table_path)); + VerifyDataFileMetas(plan, /*expected_file_count=*/3, /*expected_row_counts=*/{3, 3, 3}, + /*expected_min_seqs=*/{1, 1, 1}, /*expected_max_seqs=*/{1, 1, 1}, + /*expected_first_row_ids=*/{0, 0, 0}, + /*expected_write_cols=*/ + {std::vector{"f0", "b0", "b1"}, std::vector{"b2"}, + std::vector{"b3"}}); + + std::map read_options = {{Options::BLOB_AS_DESCRIPTOR, "true"}}; + ASSERT_OK_AND_ASSIGN(auto result, ReadTable(table_path, schema->field_names(), plan, + /*predicate=*/nullptr, read_options)); + ASSERT_TRUE(result.chunked_array); + auto read_concat = arrow::Concatenate(result.chunked_array->chunks()).ValueOrDie(); + auto read_struct = std::dynamic_pointer_cast(read_concat); + + // b0,b1 inline descriptor (not repacked), should match input + ASSERT_TRUE(read_struct->GetFieldByName("b0")->Equals(desc_array->GetFieldByName("b0"))); + ASSERT_TRUE(read_struct->GetFieldByName("b1")->Equals(desc_array->GetFieldByName("b1"))); + + // Resolve b0,b1 descriptors back to raw bytes, then compare full struct + ASSERT_OK_AND_ASSIGN(auto resolved, + ConvertDescriptorToRawBlob(read_struct, {"b0", "b1", "b2", "b3"})); + ASSERT_OK_AND_ASSIGN(auto expected_with_rk, PrependRowKindColumn(raw_array)); + ASSERT_TRUE(resolved->Equals(expected_with_rk)); +} + +TEST_P(BlobTableInteTest, TestBlobDescriptorFieldPartialExternalStorageRepack) { + if (GetParam() == "lance") { + return; + } + // 4 blob fields: b0,b1 are descriptor + external-storage-field WITH external-storage-path. + // b2,b3 are regular blob (written to .blob files). + // All blob descriptors get repacked by external storage or .blob writer. 
+ arrow::FieldVector fields = { + arrow::field("f0", arrow::int32()), BlobUtils::ToArrowField("b0", true), + BlobUtils::ToArrowField("b1", true), BlobUtils::ToArrowField("b2", true), + BlobUtils::ToArrowField("b3", true)}; + + std::map options = { + {Options::MANIFEST_FORMAT, "orc"}, + {Options::FILE_FORMAT, GetParam()}, + {Options::TARGET_FILE_SIZE, "700"}, + {Options::BUCKET, "-1"}, + {Options::ROW_TRACKING_ENABLED, "true"}, + {Options::DATA_EVOLUTION_ENABLED, "true"}, + {Options::BLOB_DESCRIPTOR_FIELD, "b0,b1"}, + {Options::BLOB_EXTERNAL_STORAGE_FIELD, "b0,b1"}, + {Options::BLOB_EXTERNAL_STORAGE_PATH, blob_dir_->Str()}, + {Options::FILE_SYSTEM, "local"}}; + CreateTable(fields, /*partition_keys=*/{}, options); + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); + + // b0: all non-null, b1: has nulls, b2: all non-null, b3: has nulls + std::string raw_json = R"([ + [1, "img_0", null, "raw_2_0", "raw_3_0"], + [2, "img_1", "vid_1", "raw_2_1", null ], + [3, "img_2", null, "raw_2_2", "raw_3_2" ] + ])"; + auto raw_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), raw_json).ValueOrDie()); + ASSERT_OK_AND_ASSIGN(auto desc_array, + ConvertRawBlobToDescriptor(raw_array, {"b0", "b1", "b2", "b3"})); + + auto schema = arrow::schema(fields); + ASSERT_OK_AND_ASSIGN(auto commit_msgs, + WriteArray(table_path, {}, schema->field_names(), {desc_array})); + ASSERT_OK(Commit(table_path, commit_msgs)); + + // b0,b1 repacked to external storage; b2,b3 go to .blob files. + // Main file contains f0,b0,b1; .blob files for b2 and b3. 
+ ASSERT_OK_AND_ASSIGN(auto plan, ScanTable(table_path)); + VerifyDataFileMetas(plan, /*expected_file_count=*/3, /*expected_row_counts=*/{3, 3, 3}, + /*expected_min_seqs=*/{1, 1, 1}, /*expected_max_seqs=*/{1, 1, 1}, + /*expected_first_row_ids=*/{0, 0, 0}, + /*expected_write_cols=*/ + {std::vector{"f0", "b0", "b1"}, std::vector{"b2"}, + std::vector{"b3"}}); + + std::map read_options = {{Options::BLOB_AS_DESCRIPTOR, "true"}}; + ASSERT_OK_AND_ASSIGN(auto result, ReadTable(table_path, schema->field_names(), plan, + /*predicate=*/nullptr, read_options)); + ASSERT_TRUE(result.chunked_array); + auto read_concat = arrow::Concatenate(result.chunked_array->chunks()).ValueOrDie(); + auto read_struct = std::dynamic_pointer_cast(read_concat); + + // Resolve descriptors back to raw bytes and compare + ASSERT_OK_AND_ASSIGN(auto resolved, + ConvertDescriptorToRawBlob(read_struct, {"b0", "b1", "b2", "b3"})); + ASSERT_OK_AND_ASSIGN(auto expected_with_rk, PrependRowKindColumn(raw_array)); + ASSERT_TRUE(resolved->Equals(expected_with_rk)); + + // All blob columns should differ from input desc_array (all repacked) + ASSERT_FALSE(read_struct->GetFieldByName("b0")->Equals(desc_array->GetFieldByName("b0"))); + ASSERT_FALSE(read_struct->GetFieldByName("b1")->Equals(desc_array->GetFieldByName("b1"))); + ASSERT_FALSE(read_struct->GetFieldByName("b2")->Equals(desc_array->GetFieldByName("b2"))); + ASSERT_FALSE(read_struct->GetFieldByName("b3")->Equals(desc_array->GetFieldByName("b3"))); +} + +TEST_P(BlobTableInteTest, TestBlobDescriptorFieldPartialExternalStorageSingleField) { + if (GetParam() == "lance") { + return; + } + // 4 blob fields: b0,b1 are descriptor; only b1 has external storage. + // b2,b3 are regular blob (written to .blob files). 
+ arrow::FieldVector fields = { + arrow::field("f0", arrow::int32()), BlobUtils::ToArrowField("b0", true), + BlobUtils::ToArrowField("b1", true), BlobUtils::ToArrowField("b2", true), + BlobUtils::ToArrowField("b3", true)}; + + std::map options = { + {Options::MANIFEST_FORMAT, "orc"}, + {Options::FILE_FORMAT, GetParam()}, + {Options::TARGET_FILE_SIZE, "700"}, + {Options::BUCKET, "-1"}, + {Options::ROW_TRACKING_ENABLED, "true"}, + {Options::DATA_EVOLUTION_ENABLED, "true"}, + {Options::BLOB_DESCRIPTOR_FIELD, "b0,b1"}, + {Options::BLOB_EXTERNAL_STORAGE_FIELD, "b1"}, + {Options::BLOB_EXTERNAL_STORAGE_PATH, blob_dir_->Str()}, + {Options::FILE_SYSTEM, "local"}}; + CreateTable(fields, /*partition_keys=*/{}, options); + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); + + // b0: all non-null, b1: has nulls, b2: all non-null, b3: has nulls + std::string raw_json = R"([ + [1, "img_0", null, "raw_2_0", "raw_3_0"], + [2, "img_1", "vid_1", "raw_2_1", null ], + [3, "img_2", null, "raw_2_2", "raw_3_2" ] + ])"; + auto raw_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), raw_json).ValueOrDie()); + ASSERT_OK_AND_ASSIGN(auto desc_array, + ConvertRawBlobToDescriptor(raw_array, {"b0", "b1", "b2", "b3"})); + + auto schema = arrow::schema(fields); + ASSERT_OK_AND_ASSIGN(auto commit_msgs, + WriteArray(table_path, {}, schema->field_names(), {desc_array})); + ASSERT_OK(Commit(table_path, commit_msgs)); + + // b1 repacked to external storage; b2,b3 go to .blob files; b0 stays inline in main file. + // Main file contains f0,b0,b1; .blob files for b2 and b3. 
+ ASSERT_OK_AND_ASSIGN(auto plan, ScanTable(table_path)); + VerifyDataFileMetas(plan, /*expected_file_count=*/3, /*expected_row_counts=*/{3, 3, 3}, + /*expected_min_seqs=*/{1, 1, 1}, /*expected_max_seqs=*/{1, 1, 1}, + /*expected_first_row_ids=*/{0, 0, 0}, + /*expected_write_cols=*/ + {std::vector{"f0", "b0", "b1"}, std::vector{"b2"}, + std::vector{"b3"}}); + + std::map read_options = {{Options::BLOB_AS_DESCRIPTOR, "true"}}; + ASSERT_OK_AND_ASSIGN(auto result, ReadTable(table_path, schema->field_names(), plan, + /*predicate=*/nullptr, read_options)); + ASSERT_TRUE(result.chunked_array); + auto read_concat = arrow::Concatenate(result.chunked_array->chunks()).ValueOrDie(); + auto read_struct = std::dynamic_pointer_cast(read_concat); + + // Resolve all descriptors back to raw bytes and compare + ASSERT_OK_AND_ASSIGN(auto resolved, + ConvertDescriptorToRawBlob(read_struct, {"b0", "b1", "b2", "b3"})); + ASSERT_OK_AND_ASSIGN(auto expected_with_rk, PrependRowKindColumn(raw_array)); + ASSERT_TRUE(resolved->Equals(expected_with_rk)); + + // b0 is inline descriptor (not repacked), should match input + ASSERT_TRUE(read_struct->GetFieldByName("b0")->Equals(desc_array->GetFieldByName("b0"))); + // b1 is repacked by external storage, should differ + ASSERT_FALSE(read_struct->GetFieldByName("b1")->Equals(desc_array->GetFieldByName("b1"))); + // b2,b3 are repacked by .blob writer, should differ + ASSERT_FALSE(read_struct->GetFieldByName("b2")->Equals(desc_array->GetFieldByName("b2"))); + ASSERT_FALSE(read_struct->GetFieldByName("b3")->Equals(desc_array->GetFieldByName("b3"))); +} + +TEST_P(BlobTableInteTest, TestBlobDescriptorFieldPartialExternalStorageNoAsDescriptor) { + if (GetParam() == "lance") { + return; + } + // Same as TestBlobDescriptorFieldPartialExternalStorageSingleField but without + // BLOB_AS_DESCRIPTOR in table options. Only b0 is explicitly converted to descriptor before + // write. 
b1 is written as raw bytes but still configured as descriptor field, so paimon should + // auto-convert it to descriptor internally (write auto-detects descriptor via magic header). + // After read with BLOB_AS_DESCRIPTOR=false, b0 and b1 are still returned as descriptors. + arrow::FieldVector fields = { + arrow::field("f0", arrow::int32()), BlobUtils::ToArrowField("b0", true), + BlobUtils::ToArrowField("b1", true), BlobUtils::ToArrowField("b2", true), + BlobUtils::ToArrowField("b3", true)}; + + std::map options = { + {Options::MANIFEST_FORMAT, "orc"}, + {Options::FILE_FORMAT, GetParam()}, + {Options::TARGET_FILE_SIZE, "700"}, + {Options::BUCKET, "-1"}, + {Options::ROW_TRACKING_ENABLED, "true"}, + {Options::DATA_EVOLUTION_ENABLED, "true"}, + {Options::BLOB_DESCRIPTOR_FIELD, "b0,b1"}, + {Options::BLOB_EXTERNAL_STORAGE_FIELD, "b1"}, + {Options::BLOB_EXTERNAL_STORAGE_PATH, blob_dir_->Str()}, + {Options::FILE_SYSTEM, "local"}}; + CreateTable(fields, /*partition_keys=*/{}, options); + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); + + // b0: all non-null, b1: has nulls, b2: all non-null, b3: has nulls + std::string raw_json = R"([ + [1, "img_0", null, "raw_2_0", "raw_3_0"], + [2, "img_1", "vid_1", "raw_2_1", null ], + [3, "img_2", null, "raw_2_2", "raw_3_2" ] + ])"; + auto raw_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), raw_json).ValueOrDie()); + // Only convert b0 to descriptor; b1,b2,b3 remain as raw bytes + ASSERT_OK_AND_ASSIGN(auto desc_array, ConvertRawBlobToDescriptor(raw_array, {"b0"})); + + auto schema = arrow::schema(fields); + ASSERT_OK_AND_ASSIGN(auto commit_msgs, + WriteArray(table_path, {}, schema->field_names(), {desc_array})); + ASSERT_OK(Commit(table_path, commit_msgs)); + + // b1 repacked to external storage; b2,b3 go to .blob files; b0 stays inline in main file. 
+ ASSERT_OK_AND_ASSIGN(auto plan, ScanTable(table_path)); + VerifyDataFileMetas(plan, /*expected_file_count=*/3, /*expected_row_counts=*/{3, 3, 3}, + /*expected_min_seqs=*/{1, 1, 1}, /*expected_max_seqs=*/{1, 1, 1}, + /*expected_first_row_ids=*/{0, 0, 0}, + /*expected_write_cols=*/ + {std::vector{"f0", "b0", "b1"}, std::vector{"b2"}, + std::vector{"b3"}}); + + std::map read_options = {{Options::BLOB_AS_DESCRIPTOR, "false"}}; + ASSERT_OK_AND_ASSIGN(auto result, ReadTable(table_path, schema->field_names(), plan, + /*predicate=*/nullptr, read_options)); + ASSERT_TRUE(result.chunked_array); + auto read_concat = arrow::Concatenate(result.chunked_array->chunks()).ValueOrDie(); + auto read_struct = std::dynamic_pointer_cast(read_concat); + + // After read, b0 and b1 are both descriptor-stored; resolve all back to raw bytes + ASSERT_OK_AND_ASSIGN(auto resolved, ConvertDescriptorToRawBlob(read_struct, {"b0", "b1"})); + ASSERT_OK_AND_ASSIGN(auto expected_with_rk, PrependRowKindColumn(raw_array)); + ASSERT_TRUE(resolved->Equals(expected_with_rk)); + + // b0 is inline descriptor (not repacked), should match input desc_array + ASSERT_TRUE(read_struct->GetFieldByName("b0")->Equals(desc_array->GetFieldByName("b0"))); +} + +TEST_P(BlobTableInteTest, TestBlobDescriptorMultiCommitAndShuffledReadSchema) { + if (GetParam() == "lance") { + return; + } + // Similar to TestBlobDescriptorFieldPartialExternalStorageNoAsDescriptor but: + // 1. Multiple write+commit rounds + // 2. 
Read schema is shuffled: b3, b2, b1, b0, f0 + arrow::FieldVector fields = { + arrow::field("f0", arrow::int32()), BlobUtils::ToArrowField("b0", true), + BlobUtils::ToArrowField("b1", true), BlobUtils::ToArrowField("b2", true), + BlobUtils::ToArrowField("b3", true)}; + + std::map options = { + {Options::MANIFEST_FORMAT, "orc"}, + {Options::FILE_FORMAT, GetParam()}, + {Options::TARGET_FILE_SIZE, "700"}, + {Options::BUCKET, "-1"}, + {Options::ROW_TRACKING_ENABLED, "true"}, + {Options::DATA_EVOLUTION_ENABLED, "true"}, + {Options::BLOB_DESCRIPTOR_FIELD, "b0,b1"}, + {Options::BLOB_EXTERNAL_STORAGE_FIELD, "b1"}, + {Options::BLOB_EXTERNAL_STORAGE_PATH, blob_dir_->Str()}, + {Options::FILE_SYSTEM, "local"}}; + CreateTable(fields, /*partition_keys=*/{}, options); + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); + auto schema = arrow::schema(fields); + + // --- First write+commit --- + std::string raw_json_1 = R"([ + [1, "img_0", null, "raw_2_0", "raw_3_0"], + [2, "img_1", "vid_1", "raw_2_1", null ] + ])"; + auto raw_array_1 = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), raw_json_1).ValueOrDie()); + ASSERT_OK_AND_ASSIGN(auto desc_array_1, ConvertRawBlobToDescriptor(raw_array_1, {"b0"})); + ASSERT_OK_AND_ASSIGN(auto commit_msgs_1, + WriteArray(table_path, {}, schema->field_names(), {desc_array_1})); + ASSERT_OK(Commit(table_path, commit_msgs_1)); + + // --- Second write+commit --- + std::string raw_json_2 = R"([ + [3, "img_2", "vid_2", "raw_2_2", "raw_3_2"], + [4, null, "vid_3", "raw_2_3", "raw_3_3"] + ])"; + auto raw_array_2 = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), raw_json_2).ValueOrDie()); + ASSERT_OK_AND_ASSIGN(auto desc_array_2, ConvertRawBlobToDescriptor(raw_array_2, {"b0"})); + ASSERT_OK_AND_ASSIGN(auto commit_msgs_2, + WriteArray(table_path, {}, schema->field_names(), {desc_array_2})); + ASSERT_OK(Commit(table_path, commit_msgs_2)); + + // 
--- Third write+commit --- + std::string raw_json_3 = R"([ + [5, "img_4", null, "raw_2_4", null ], + [6, "img_5", "vid_5", null, "raw_3_5"] + ])"; + auto raw_array_3 = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), raw_json_3).ValueOrDie()); + ASSERT_OK_AND_ASSIGN(auto desc_array_3, ConvertRawBlobToDescriptor(raw_array_3, {"b0"})); + ASSERT_OK_AND_ASSIGN(auto commit_msgs_3, + WriteArray(table_path, {}, schema->field_names(), {desc_array_3})); + ASSERT_OK(Commit(table_path, commit_msgs_3)); + + // test read + { + // --- Read with shuffled schema: b3, b2, b1, b0, f0 --- + std::vector shuffled_read_schema = {"b3", "b2", "b1", "b0", "f0"}; + ASSERT_OK_AND_ASSIGN(auto plan, ScanTable(table_path)); + + std::map read_options = {{Options::BLOB_AS_DESCRIPTOR, "false"}}; + ASSERT_OK_AND_ASSIGN(auto result, ReadTable(table_path, shuffled_read_schema, plan, + /*predicate=*/nullptr, read_options)); + ASSERT_TRUE(result.chunked_array); + auto read_concat = arrow::Concatenate(result.chunked_array->chunks()).ValueOrDie(); + auto read_struct = std::dynamic_pointer_cast(read_concat); + + // Build expected array in shuffled order from all 3 batches + arrow::FieldVector shuffled_fields = { + BlobUtils::ToArrowField("b3", true), BlobUtils::ToArrowField("b2", true), + BlobUtils::ToArrowField("b1", true), BlobUtils::ToArrowField("b0", true), + arrow::field("f0", arrow::int32())}; + std::string expected_json = R"([ + ["raw_3_0", "raw_2_0", null, "img_0", 1], + [null, "raw_2_1", "vid_1", "img_1", 2], + ["raw_3_2", "raw_2_2", "vid_2", "img_2", 3], + ["raw_3_3", "raw_2_3", "vid_3", null, 4], + [null, "raw_2_4", null, "img_4", 5], + ["raw_3_5", null, "vid_5", "img_5", 6] + ])"; + auto expected_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(shuffled_fields), + expected_json) + .ValueOrDie()); + + // Resolve descriptors (b0, b1 are descriptor fields) back to raw bytes + ASSERT_OK_AND_ASSIGN(auto 
resolved, ConvertDescriptorToRawBlob(read_struct, {"b0", "b1"})); + ASSERT_OK_AND_ASSIGN(auto expected_with_rk, PrependRowKindColumn(expected_array)); + ASSERT_TRUE(resolved->Equals(expected_with_rk)); + } + { + // test scan and read with GlobalIndexResult + std::vector shuffled_read_schema = {"b3", "b2", "b1", "b0", "f0"}; + ASSERT_OK_AND_ASSIGN(auto plan, ScanTable(table_path, /*predicate=*/nullptr, + /*row_ranges=*/{Range(1, 3), Range(5, 5)})); + std::map read_options = {{Options::BLOB_AS_DESCRIPTOR, "false"}}; + ASSERT_OK_AND_ASSIGN(auto result, ReadTable(table_path, shuffled_read_schema, plan, + /*predicate=*/nullptr, read_options)); + ASSERT_TRUE(result.chunked_array); + auto read_concat = arrow::Concatenate(result.chunked_array->chunks()).ValueOrDie(); + auto read_struct = std::dynamic_pointer_cast(read_concat); + + // Build expected array in shuffled order, restricted to the rows selected by row_ranges + arrow::FieldVector shuffled_fields = { + BlobUtils::ToArrowField("b3", true), BlobUtils::ToArrowField("b2", true), + BlobUtils::ToArrowField("b1", true), BlobUtils::ToArrowField("b0", true), + arrow::field("f0", arrow::int32())}; + std::string expected_json = R"([ + [null, "raw_2_1", "vid_1", "img_1", 2], + ["raw_3_2", "raw_2_2", "vid_2", "img_2", 3], + ["raw_3_3", "raw_2_3", "vid_3", null, 4], + ["raw_3_5", null, "vid_5", "img_5", 6] + ])"; + auto expected_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(shuffled_fields), + expected_json) + .ValueOrDie()); + + // Resolve descriptors (b0, b1 are descriptor fields) back to raw bytes + ASSERT_OK_AND_ASSIGN(auto resolved, ConvertDescriptorToRawBlob(read_struct, {"b0", "b1"})); + ASSERT_OK_AND_ASSIGN(auto expected_with_rk, PrependRowKindColumn(expected_array)); + ASSERT_TRUE(resolved->Equals(expected_with_rk)); + } +} + +TEST_P(BlobTableInteTest, TestDataEvolutionWithBlobDescriptorField) { + if (GetParam() == "lance") { + return; + } + // Test DataEvolution (split-column write) combined with 
blob descriptor fields. + // Schema: f0(int32), b0(blob descriptor inline), b1(blob descriptor+external), b2(blob), + // b3(blob) + // Commit 1: file A writes (f0, b2, b3) + // Commit 2: file B writes (f0, b0, b1) with SetFirstRowId(0) + // -> merges with commit 1 + // Commit 3: file A writes (f0, b0, b1, b3) + // Commit 4: file B writes (b0, b1, b3) with SetFirstRowId(3) + // -> merges with commit 3 + arrow::FieldVector fields = { + arrow::field("f0", arrow::int32()), BlobUtils::ToArrowField("b0", true), + BlobUtils::ToArrowField("b1", true), BlobUtils::ToArrowField("b2", true), + BlobUtils::ToArrowField("b3", true)}; + + std::map options = { + {Options::MANIFEST_FORMAT, "orc"}, + {Options::FILE_FORMAT, GetParam()}, + {Options::TARGET_FILE_SIZE, "700"}, + {Options::BUCKET, "-1"}, + {Options::ROW_TRACKING_ENABLED, "true"}, + {Options::DATA_EVOLUTION_ENABLED, "true"}, + {Options::BLOB_DESCRIPTOR_FIELD, "b0,b1"}, + {Options::BLOB_EXTERNAL_STORAGE_FIELD, "b1"}, + {Options::BLOB_EXTERNAL_STORAGE_PATH, blob_dir_->Str()}, + {Options::FILE_SYSTEM, "local"}}; + CreateTable(fields, /*partition_keys=*/{}, options); + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); + + // --- Commit 1: file A (f0, b2, b3), Commit 2: file B (f0, b0, b1) SetFirstRowId(0) --- + std::string file_a1_json = R"([ + [1, "raw_2_0", "raw_3_0"], + [2, "raw_2_1", null ], + [3, null, "raw_3_2"] + ])"; + arrow::FieldVector file_a1_fields = {fields[0], fields[3], fields[4]}; + auto file_a1_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(file_a1_fields), file_a1_json) + .ValueOrDie()); + + std::string file_b1_json = R"([ + [1, "img_0", "vid_0"], + [2, "img_1", null ], + [3, "img_2", "vid_2"] + ])"; + arrow::FieldVector file_b1_fields = {fields[0], fields[1], fields[2]}; + auto file_b1_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(file_b1_fields), file_b1_json) + .ValueOrDie()); + 
ASSERT_OK_AND_ASSIGN(auto file_b1_desc, ConvertRawBlobToDescriptor(file_b1_array, {"b0"})); + + ASSERT_OK_AND_ASSIGN(auto commit_msgs_a1, + WriteArray(table_path, {}, {"f0", "b2", "b3"}, {file_a1_array})); + ASSERT_OK(Commit(table_path, commit_msgs_a1)); + + ASSERT_OK_AND_ASSIGN(auto commit_msgs_b1, + WriteArray(table_path, {}, {"f0", "b0", "b1"}, {file_b1_desc})); + SetFirstRowId(0, commit_msgs_b1); + ASSERT_OK(Commit(table_path, commit_msgs_b1)); + + // --- Commit 3: file A (f0, b0, b1, b3), Commit 4: file B (b0, b1, b3) SetFirstRowId(3) --- + // Duplicate cols b0, b1, b3: file B (commit 4, newer) takes precedence. + std::string file_a2_json = R"([ + [4, "img_3_old", "vid_3_old", "raw_3_3_old"], + [5, null, "vid_4_old", "raw_3_4_old"], + [6, "img_5_old", null, null ] + ])"; + arrow::FieldVector file_a2_fields = {fields[0], fields[1], fields[2], fields[4]}; + auto file_a2_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(file_a2_fields), file_a2_json) + .ValueOrDie()); + ASSERT_OK_AND_ASSIGN(auto file_a2_desc, ConvertRawBlobToDescriptor(file_a2_array, {"b0"})); + + std::string file_b2_json = R"([ + ["img_3", "vid_3", "raw_3_3"], + [null, "vid_4", "raw_3_4"], + ["img_5", null, null ] + ])"; + arrow::FieldVector file_b2_fields = {fields[1], fields[2], fields[4]}; + auto file_b2_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(file_b2_fields), file_b2_json) + .ValueOrDie()); + ASSERT_OK_AND_ASSIGN(auto file_b2_desc, ConvertRawBlobToDescriptor(file_b2_array, {"b0"})); + + ASSERT_OK_AND_ASSIGN(auto commit_msgs_a2, + WriteArray(table_path, {}, {"f0", "b0", "b1", "b3"}, {file_a2_desc})); + ASSERT_OK(Commit(table_path, commit_msgs_a2)); + + ASSERT_OK_AND_ASSIGN(auto commit_msgs_b2, + WriteArray(table_path, {}, {"b0", "b1", "b3"}, {file_b2_desc})); + SetFirstRowId(3, commit_msgs_b2); + ASSERT_OK(Commit(table_path, commit_msgs_b2)); + + // --- Read all data with full schema --- + 
std::vector read_schema = {"f0", "b0", "b1", "b2", "b3"}; + ASSERT_OK_AND_ASSIGN(auto plan, ScanTable(table_path)); + + std::map read_options = {{Options::BLOB_AS_DESCRIPTOR, "false"}}; + ASSERT_OK_AND_ASSIGN(auto result, ReadTable(table_path, read_schema, plan, + /*predicate=*/nullptr, read_options)); + ASSERT_TRUE(result.chunked_array); + auto read_concat = arrow::Concatenate(result.chunked_array->chunks()).ValueOrDie(); + auto read_struct = std::dynamic_pointer_cast(read_concat); + ASSERT_EQ(read_struct->length(), 6); + + // Expected: round1 all columns present; round2 b2=null, b0/b1/b3 from file B (newer) + std::string expected_json = R"([ + [1, "img_0", "vid_0", "raw_2_0", "raw_3_0"], + [2, "img_1", null, "raw_2_1", null ], + [3, "img_2", "vid_2", null, "raw_3_2" ], + [4, "img_3", "vid_3", null, "raw_3_3" ], + [5, null, "vid_4", null, "raw_3_4" ], + [6, "img_5", null, null, null ] + ])"; + auto expected_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), expected_json) + .ValueOrDie()); + + // Resolve descriptors back to raw bytes + ASSERT_OK_AND_ASSIGN(auto resolved, ConvertDescriptorToRawBlob(read_struct, {"b0", "b1"})); + ASSERT_OK_AND_ASSIGN(auto expected_with_rk, PrependRowKindColumn(expected_array)); + ASSERT_TRUE(resolved->type()->Equals(expected_with_rk->type())); + ASSERT_TRUE(resolved->Equals(expected_with_rk)); +} + +TEST_P(BlobTableInteTest, TestBlobDescriptorFieldWriteRawBytesDirectly) { + if (GetParam() == "lance") { + return; + } + // Similar to TestBlobDescriptorFieldWithoutExternalStorage but writes raw bytes directly + // without converting to descriptor first. The writer should auto-detect that the data + // is NOT a descriptor (no magic header) and reject the write with an error. 
+ arrow::FieldVector fields = {arrow::field("f0", arrow::int32()), + BlobUtils::ToArrowField("b0", true), + BlobUtils::ToArrowField("b1", true)}; + + std::map options = { + {Options::MANIFEST_FORMAT, "orc"}, {Options::FILE_FORMAT, GetParam()}, + {Options::TARGET_FILE_SIZE, "700"}, {Options::BUCKET, "-1"}, + {Options::ROW_TRACKING_ENABLED, "true"}, {Options::DATA_EVOLUTION_ENABLED, "true"}, + {Options::BLOB_DESCRIPTOR_FIELD, "b0,b1"}, {Options::FILE_SYSTEM, "local"}}; + CreateTable(fields, /*partition_keys=*/{}, options); + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); + + // Write raw bytes directly (no ConvertRawBlobToDescriptor) + std::string raw_json = R"([ + [1, "image_data_0", "video_data_0"], + [2, "image_data_1", "video_data_1"], + [3, "image_data_2", "video_data_2"] + ])"; + auto raw_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), raw_json).ValueOrDie()); + + auto schema = arrow::schema(fields); + ASSERT_NOK_WITH_MSG(WriteArray(table_path, {}, schema->field_names(), {raw_array}), + "BLOB inline field b0 configured by blob-descriptor-field or " + "blob-view-field require values " + "to be a BlobDescriptor or BlobViewStruct."); +} + } // namespace paimon::test