From 897132987a49a532fe16b323e2491e4876a46880 Mon Sep 17 00:00:00 2001 From: Thomas Tanon Date: Thu, 12 Mar 2026 15:34:40 +0100 Subject: [PATCH] Add datafusion/arrow_canonical_extension_types feature Wraps `parquet/arrow_canonical_extension_types` and `arrow-schema/canonical_extension_types` This feature is enabled by default (`arrow-schema/canonical_extension_types` was enabled by default) It might be a good flag behind which to hide support for canonical extension types in various DataFusion operators --- .github/workflows/rust.yml | 2 + datafusion/core/Cargo.toml | 6 +- .../arrow_canonical_extension_types.rs | 65 +++++++++++++++++++ datafusion/core/tests/parquet/mod.rs | 2 + 4 files changed, 73 insertions(+), 2 deletions(-) create mode 100644 datafusion/core/tests/parquet/arrow_canonical_extension_types.rs diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index af37b470a498b..e88a41b9c9125 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -226,6 +226,8 @@ jobs: run: cargo check --profile ci --no-default-features -p datafusion --features=string_expressions - name: Check datafusion (unicode_expressions) run: cargo check --profile ci --no-default-features -p datafusion --features=unicode_expressions + - name: Check datafusion (arrow_canonical_extension_types) + run: cargo check --profile ci --no-default-features -p datafusion --features=arrow_canonical_extension_types - name: Check parquet encryption (parquet_encryption) run: cargo check --profile ci --no-default-features -p datafusion --features=parquet_encryption diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index be507e0691a19..ad109eb0cd0a5 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -67,6 +67,7 @@ default = [ "parquet", "recursive_protection", "sql", + "arrow_canonical_extension_types", ] encoding_expressions = ["datafusion-functions/encoding_expressions"] # Used for testing ONLY: causes all values to hash to the same 
value (test for collisions) @@ -108,11 +109,12 @@ unicode_expressions = [ "datafusion-sql?/unicode_expressions", "datafusion-functions/unicode_expressions", ] +arrow_canonical_extension_types = ["arrow-schema/canonical_extension_types", "parquet?/arrow_canonical_extension_types"] extended_tests = [] [dependencies] arrow = { workspace = true } -arrow-schema = { workspace = true, features = ["canonical_extension_types"] } +arrow-schema = { workspace = true } async-trait = { workspace = true } bzip2 = { workspace = true, optional = true } chrono = { workspace = true } @@ -149,7 +151,7 @@ liblzma = { workspace = true, optional = true } log = { workspace = true } object_store = { workspace = true } parking_lot = { workspace = true } -parquet = { workspace = true, optional = true, default-features = true } +parquet = { workspace = true, optional = true, default-features = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } sqlparser = { workspace = true, optional = true } tempfile = { workspace = true } diff --git a/datafusion/core/tests/parquet/arrow_canonical_extension_types.rs b/datafusion/core/tests/parquet/arrow_canonical_extension_types.rs new file mode 100644 index 0000000000000..8a2b46b91feea --- /dev/null +++ b/datafusion/core/tests/parquet/arrow_canonical_extension_types.rs @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use super::*; +use arrow_schema::extension::Uuid; +use parquet::arrow::ArrowWriter; +use parquet::basic::LogicalType; +use parquet::file::metadata::ParquetMetaDataReader; +use std::fs::File; +use tempfile::TempDir; + +#[tokio::test] +async fn test_uuid_roundtrip() { + let tmp_dir = TempDir::new().unwrap(); + + // Create mock schema and data + let schema = Arc::new(Schema::new(vec![ + Field::new("uuid", DataType::FixedSizeBinary(16), false) + .with_extension_type(Uuid), + ])); + let uuids = Arc::new(FixedSizeBinaryArray::from(vec![b"abcdefghijklmnop"])); + let record_batch = RecordBatch::try_new(schema.clone(), vec![uuids]).unwrap(); + + // We write a Parquet file + let table_path = tmp_dir.path().join("test.parquet"); + let file = File::create(&table_path).unwrap(); + let mut writer = ArrowWriter::try_new(file, schema.clone(), None).unwrap(); + writer.write(&record_batch).unwrap(); + writer.close().unwrap(); + + // We check we indeed use the UUID type in Parquet + let parquet_metadata = ParquetMetaDataReader::new() + .parse_and_finish(&File::open(&table_path).unwrap()) + .unwrap(); + assert_eq!( + parquet_metadata.file_metadata().schema().get_fields()[0] + .get_basic_info() + .logical_type_ref(), + Some(&LogicalType::Uuid) + ); + + // We read the Parquet file and make sure the UUID extension type has been kept + let data_frame = SessionContext::new() + .read_parquet( + vec![table_path.to_str().unwrap()], + ParquetReadOptions::default().skip_metadata(false), + ) + .await + .unwrap(); + assert_eq!(*data_frame.schema().inner(), schema) +} diff 
--git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index 0535ddd9247d4..ad753e41cd41a 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -44,6 +44,8 @@ use parquet::file::properties::{EnabledStatistics, WriterProperties}; use std::sync::Arc; use tempfile::NamedTempFile; +#[cfg(feature = "arrow_canonical_extension_types")] +mod arrow_canonical_extension_types; mod custom_reader; #[cfg(feature = "parquet_encryption")] mod encryption;