From b05d5e9981c9b94483469965dcafbbda78a8204c Mon Sep 17 00:00:00 2001 From: eric_song Date: Sun, 15 Mar 2026 01:37:36 +1100 Subject: [PATCH 1/7] fix: avoid casts for equivalent nested types --- .../expr/src/type_coercion/functions.rs | 46 +++++++++++++++++-- .../optimizer/src/analyzer/type_coercion.rs | 36 +++++++++++++++ 2 files changed, 78 insertions(+), 4 deletions(-) diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index d5cb98a46ef43..42a1087271ab0 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -130,7 +130,7 @@ pub fn fields_with_udf( let valid_types = get_valid_types_with_udf(type_signature, ¤t_types, func)?; if valid_types .iter() - .any(|data_type| data_type == ¤t_types) + .any(|data_type| data_types_match(data_type, ¤t_types)) { return Ok(current_fields.to_vec()); } @@ -236,7 +236,7 @@ pub fn data_types( get_valid_types(function_name.as_ref(), type_signature, current_types)?; if valid_types .iter() - .any(|data_type| data_type == current_types) + .any(|data_type| data_types_match(data_type, current_types)) { return Ok(current_types.to_vec()); } @@ -307,6 +307,14 @@ fn try_coerce_types( ) } +fn data_types_match(valid_types: &[DataType], current_types: &[DataType]) -> bool { + valid_types.len() == current_types.len() + && valid_types + .iter() + .zip(current_types) + .all(|(valid_type, current_type)| valid_type.equals_datatype(current_type)) +} + fn get_valid_types_with_udf( signature: &TypeSignature, current_types: &[DataType], @@ -757,7 +765,7 @@ fn maybe_data_types( for (i, valid_type) in valid_types.iter().enumerate() { let current_type = ¤t_types[i]; - if current_type == valid_type { + if current_type.equals_datatype(valid_type) { new_type.push(current_type.clone()) } else { // attempt to coerce. @@ -789,7 +797,7 @@ fn maybe_data_types_without_coercion( for (i, valid_type) in valid_types.iter().enumerate() { let current_type = ¤t_types[i]; - if current_type == valid_type { + if current_type.equals_datatype(valid_type) { new_type.push(current_type.clone()) } else if can_cast_types(current_type, valid_type) { // validate the valid type is castable from the current type @@ -1223,6 +1231,36 @@ mod tests { Ok(()) } + #[test] + fn test_fields_with_udf_preserves_equivalent_nested_types() -> Result<()> { + let struct_fields = vec![ + Field::new("id", DataType::Utf8, true), + Field::new("prim", DataType::Boolean, true), + ]; + let current_type = DataType::List(Arc::new(Field::new( + "item", + DataType::Struct(struct_fields.clone().into()), + true, + ))); + let signature_type = DataType::List(Arc::new(Field::new( + "element", + DataType::Struct(struct_fields.into()), + true, + ))); + + assert!(current_type.equals_datatype(&signature_type)); + + let current_fields = vec![Arc::new(Field::new("field", current_type, true))]; + let coerced_fields = fields_with_udf( + ¤t_fields, + &MockUdf(Signature::exact(vec![signature_type], Volatility::Stable)), + )?; + + assert_eq!(coerced_fields, current_fields); + + Ok(()) + } + #[test] fn test_nested_wildcard_fixed_size_lists() -> Result<()> { let type_into = DataType::FixedSizeList( diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index efc9984acb9b0..5a3903dbd674d 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -1770,6 +1770,42 @@ mod test { ) } + #[test] + fn scalar_function_preserves_equivalent_nested_types() -> Result<()> { + let struct_fields = vec![ + Field::new("id", Utf8, true), + Field::new("prim", DataType::Boolean, true), + ]; + let current_type = DataType::List(Arc::new(Field::new( + "item", + DataType::Struct(struct_fields.clone().into()), + true, + ))); + let signature_type = DataType::List(Arc::new(Field::new( + "element", + DataType::Struct(struct_fields.into()), + true, + ))); + let empty = empty_with_type(current_type); + let fun = ScalarUDF::new_from_impl(TestScalarUDF { + signature: Signature::exact(vec![signature_type], Volatility::Stable), + }); + let scalar_function_expr = + Expr::ScalarFunction(ScalarFunction::new_udf(Arc::new(fun), vec![col("a")])); + let plan = LogicalPlan::Projection(Projection::try_new( + vec![scalar_function_expr], + empty, + )?); + + assert_analyzed_plan_eq!( + plan, + @r" + Projection: TestScalarUDF(a) + EmptyRelation: rows=0 + " + ) + } + #[test] fn agg_udaf() -> Result<()> { let empty = empty(); From 4af3b3ee5c21fbf06372144135d9f72c66621fb3 Mon Sep 17 00:00:00 2001 From: eric_song Date: Sun, 15 Mar 2026 02:15:46 +1100 Subject: [PATCH 2/7] fix: narrow nested type matching --- .../expr/src/type_coercion/functions.rs | 138 +++++++++++++++++- 1 file changed, 135 insertions(+), 3 deletions(-) diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index 42a1087271ab0..5567408b46668 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -312,7 +312,41 @@ fn data_types_match(valid_types: &[DataType], current_types: &[DataType]) -> boo && valid_types .iter() .zip(current_types) - .all(|(valid_type, current_type)| valid_type.equals_datatype(current_type)) + .all(|(valid_type, current_type)| { + data_type_matches_ignoring_list_field_name(valid_type, current_type) + }) +} + +fn data_type_matches_ignoring_list_field_name( + valid_type: &DataType, + current_type: &DataType, +) -> bool { + if valid_type == current_type { + return true; + } + + match (valid_type, current_type) { + (DataType::List(valid_field), DataType::List(current_field)) + | (DataType::LargeList(valid_field), DataType::LargeList(current_field)) => { + valid_field.is_nullable() == current_field.is_nullable() + && data_type_matches_ignoring_list_field_name( + valid_field.data_type(), + current_field.data_type(), + ) + } + ( + DataType::FixedSizeList(valid_field, valid_size), + DataType::FixedSizeList(current_field, current_size), + ) => { + valid_size == current_size + && valid_field.is_nullable() == current_field.is_nullable() + && data_type_matches_ignoring_list_field_name( + valid_field.data_type(), + current_field.data_type(), + ) + } + _ => false, + } } fn get_valid_types_with_udf( @@ -765,7 +799,11 @@ fn maybe_data_types( for (i, valid_type) in valid_types.iter().enumerate() { let current_type = ¤t_types[i]; - if current_type.equals_datatype(valid_type) { + // Keep exact equality here. Some kernels such as `make_array` + // require nested field names/order to match exactly at runtime. + // Structural-equivalence short-circuiting is handled earlier by + // `data_types_match`. + if current_type == valid_type { new_type.push(current_type.clone()) } else { // attempt to coerce. @@ -797,7 +835,11 @@ fn maybe_data_types_without_coercion( for (i, valid_type) in valid_types.iter().enumerate() { let current_type = ¤t_types[i]; - if current_type.equals_datatype(valid_type) { + // Keep exact equality here. Some kernels such as `make_array` + // require nested field names/order to match exactly at runtime. + // Structural-equivalence short-circuiting is handled earlier by + // `data_types_match`. + if current_type == valid_type { new_type.push(current_type.clone()) } else if can_cast_types(current_type, valid_type) { // validate the valid type is castable from the current type @@ -1052,6 +1094,96 @@ mod tests { } } + #[test] + fn test_maybe_data_types_uses_exact_nested_types() { + let struct_fields = vec![ + Field::new("id", DataType::Utf8, true), + Field::new("prim", DataType::Boolean, true), + ]; + let current_type = DataType::List(Arc::new(Field::new( + "item", + DataType::Struct(struct_fields.clone().into()), + true, + ))); + let valid_type = DataType::List(Arc::new(Field::new( + "element", + DataType::Struct(struct_fields.into()), + true, + ))); + + assert!(current_type.equals_datatype(&valid_type)); + assert_ne!(current_type, valid_type); + assert_eq!( + maybe_data_types(&[valid_type.clone()], &[current_type]), + Some(vec![valid_type]) + ); + } + + #[test] + fn test_maybe_data_types_without_coercion_uses_exact_nested_types() { + let valid_type = DataType::Struct( + vec![ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Int64, true), + ] + .into(), + ); + let current_type = DataType::Struct( + vec![ + Field::new("b", DataType::Int64, true), + Field::new("a", DataType::Int64, true), + ] + .into(), + ); + + assert!(current_type.equals_datatype(&valid_type)); + assert_ne!(current_type, valid_type); + assert_eq!( + maybe_data_types_without_coercion(&[valid_type.clone()], &[current_type]), + Some(vec![valid_type]) + ); + } + + #[test] + fn test_data_types_match_ignores_list_field_name() { + let struct_fields = vec![ + Field::new("id", DataType::Utf8, true), + Field::new("prim", DataType::Boolean, true), + ]; + let current_type = DataType::List(Arc::new(Field::new( + "item", + DataType::Struct(struct_fields.clone().into()), + true, + ))); + let valid_type = DataType::List(Arc::new(Field::new( + "element", + DataType::Struct(struct_fields.into()), + true, + ))); + + assert!(data_types_match(&[valid_type], &[current_type])); + } + + #[test] + fn test_data_types_match_respects_struct_field_order() { + let valid_type = DataType::Struct( + vec![ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Int64, true), + ] + .into(), + ); + let current_type = DataType::Struct( + vec![ + Field::new("b", DataType::Int64, true), + Field::new("a", DataType::Int64, true), + ] + .into(), + ); + + assert!(!data_types_match(&[valid_type], &[current_type])); + } + #[test] fn test_get_valid_types_numeric() -> Result<()> { let get_valid_types_flatten = From 9bec12cd8919d98b185217b4db08076cfec4fe8f Mon Sep 17 00:00:00 2001 From: eric_song Date: Sun, 15 Mar 2026 02:57:06 +1100 Subject: [PATCH 3/7] style: simplify nested type matching --- .../expr/src/type_coercion/functions.rs | 63 ++++++++----------- 1 file changed, 26 insertions(+), 37 deletions(-) diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index 5567408b46668..f6a74e5b5166a 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -308,45 +308,31 @@ fn try_coerce_types( } fn data_types_match(valid_types: &[DataType], current_types: &[DataType]) -> bool { - valid_types.len() == current_types.len() - && valid_types - .iter() - .zip(current_types) - .all(|(valid_type, current_type)| { - data_type_matches_ignoring_list_field_name(valid_type, current_type) - }) -} - -fn data_type_matches_ignoring_list_field_name( - valid_type: &DataType, - current_type: &DataType, -) -> bool { - if valid_type == current_type { - return true; + fn field_matches(valid: &FieldRef, current: &FieldRef) -> bool { + valid.is_nullable() == current.is_nullable() + && data_type_matches(valid.data_type(), current.data_type()) } - match (valid_type, current_type) { - (DataType::List(valid_field), DataType::List(current_field)) - | (DataType::LargeList(valid_field), DataType::LargeList(current_field)) => { - valid_field.is_nullable() == current_field.is_nullable() - && data_type_matches_ignoring_list_field_name( - valid_field.data_type(), - current_field.data_type(), - ) - } - ( - DataType::FixedSizeList(valid_field, valid_size), - DataType::FixedSizeList(current_field, current_size), - ) => { - valid_size == current_size - && valid_field.is_nullable() == current_field.is_nullable() - && data_type_matches_ignoring_list_field_name( - valid_field.data_type(), - current_field.data_type(), - ) + fn data_type_matches(valid: &DataType, current: &DataType) -> bool { + match (valid, current) { + (valid, current) if valid == current => true, + (DataType::List(valid), DataType::List(current)) + | (DataType::LargeList(valid), DataType::LargeList(current)) => { + field_matches(valid, current) + } + ( + DataType::FixedSizeList(valid, valid_size), + DataType::FixedSizeList(current, current_size), + ) => valid_size == current_size && field_matches(valid, current), + _ => false, } - _ => false, } + + valid_types.len() == current_types.len() + && valid_types + .iter() + .zip(current_types) + .all(|(valid_type, current_type)| data_type_matches(valid_type, current_type)) } fn get_valid_types_with_udf( @@ -1114,7 +1100,7 @@ mod tests { assert!(current_type.equals_datatype(&valid_type)); assert_ne!(current_type, valid_type); assert_eq!( - maybe_data_types(&[valid_type.clone()], &[current_type]), + maybe_data_types(std::slice::from_ref(&valid_type), &[current_type]), Some(vec![valid_type]) ); } @@ -1139,7 +1125,10 @@ mod tests { assert!(current_type.equals_datatype(&valid_type)); assert_ne!(current_type, valid_type); assert_eq!( - maybe_data_types_without_coercion(&[valid_type.clone()], &[current_type]), + maybe_data_types_without_coercion( + std::slice::from_ref(&valid_type), + &[current_type], + ), Some(vec![valid_type]) ); } From 1c5a3394f9d471aa1ce8faf89025819b2654d9ad Mon Sep 17 00:00:00 2001 From: eric_song Date: Mon, 16 Mar 2026 12:10:18 +1100 Subject: [PATCH 4/7] fix: support list view nested type matching --- .../expr/src/type_coercion/functions.rs | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index f6a74e5b5166a..dc2d0febce2b0 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -317,7 +317,9 @@ fn data_types_match(valid_types: &[DataType], current_types: &[DataType]) -> boo match (valid, current) { (valid, current) if valid == current => true, (DataType::List(valid), DataType::List(current)) - | (DataType::LargeList(valid), DataType::LargeList(current)) => { + | (DataType::LargeList(valid), DataType::LargeList(current)) + | (DataType::ListView(valid), DataType::ListView(current)) + | (DataType::LargeListView(valid), DataType::LargeListView(current)) => { field_matches(valid, current) } ( @@ -1153,6 +1155,26 @@ mod tests { assert!(data_types_match(&[valid_type], &[current_type])); } + #[test] + fn test_data_types_match_ignores_list_view_field_name() { + let struct_fields = vec![ + Field::new("id", DataType::Utf8, true), + Field::new("prim", DataType::Boolean, true), + ]; + let current_type = DataType::ListView(Arc::new(Field::new( + "item", + DataType::Struct(struct_fields.clone().into()), + true, + ))); + let valid_type = DataType::ListView(Arc::new(Field::new( + "element", + DataType::Struct(struct_fields.into()), + true, + ))); + + assert!(data_types_match(&[valid_type], &[current_type])); + } + #[test] fn test_data_types_match_respects_struct_field_order() { let valid_type = DataType::Struct( From 538e515dd7a5f3a0f3e229bddc4883f44159c84b Mon Sep 17 00:00:00 2001 From: eric_song Date: Tue, 17 Mar 2026 13:54:24 +1100 Subject: [PATCH 5/7] fix: relax nested type matching --- .../expr/src/type_coercion/functions.rs | 95 +++++++++++++++- .../optimizer/src/analyzer/type_coercion.rs | 104 ++++++++++++++++++ .../test_files/spark/array/array.slt | 69 ++++++++++++ 3 files changed, 264 insertions(+), 4 deletions(-) diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index dc2d0febce2b0..c094f5bf29d99 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -308,24 +308,39 @@ fn try_coerce_types( } fn data_types_match(valid_types: &[DataType], current_types: &[DataType]) -> bool { - fn field_matches(valid: &FieldRef, current: &FieldRef) -> bool { - valid.is_nullable() == current.is_nullable() + fn field_matches(valid: &FieldRef, current: &FieldRef, compare_name: bool) -> bool { + (!compare_name || valid.name() == current.name()) + && valid.is_nullable() == current.is_nullable() && data_type_matches(valid.data_type(), current.data_type()) } fn data_type_matches(valid: &DataType, current: &DataType) -> bool { match (valid, current) { (valid, current) if valid == current => true, + // Ignore wrapper field names for nested types whose physical layout + // only depends on the child type, not the synthetic field name + // chosen by a particular Arrow producer (for example parquet lists). (DataType::List(valid), DataType::List(current)) | (DataType::LargeList(valid), DataType::LargeList(current)) | (DataType::ListView(valid), DataType::ListView(current)) | (DataType::LargeListView(valid), DataType::LargeListView(current)) => { - field_matches(valid, current) + field_matches(valid, current, false) } ( DataType::FixedSizeList(valid, valid_size), DataType::FixedSizeList(current, current_size), - ) => valid_size == current_size && field_matches(valid, current), + ) => valid_size == current_size && field_matches(valid, current, false), + ( + DataType::Map(valid, valid_sorted), + DataType::Map(current, current_sorted), + ) => valid_sorted == current_sorted && field_matches(valid, current, false), + (DataType::Struct(valid), DataType::Struct(current)) => { + valid.len() == current.len() + && valid + .iter() + .zip(current.iter()) + .all(|(valid, current)| field_matches(valid, current, true)) + } _ => false, } } @@ -1175,6 +1190,78 @@ mod tests { assert!(data_types_match(&[valid_type], &[current_type])); } + #[test] + fn test_data_types_match_recurses_through_struct_fields() { + let current_type = DataType::Struct( + vec![Field::new( + "a", + DataType::List(Arc::new(Field::new("item", DataType::Int64, true))), + true, + )] + .into(), + ); + let valid_type = DataType::Struct( + vec![Field::new( + "a", + DataType::List(Arc::new(Field::new("element", DataType::Int64, true))), + true, + )] + .into(), + ); + + assert!(data_types_match(&[valid_type], &[current_type])); + } + + #[test] + fn test_data_types_match_ignores_map_field_name() { + let current_type = DataType::Map( + Arc::new(Field::new( + "entries", + DataType::Struct( + vec![ + Field::new("key", DataType::Utf8, false), + Field::new( + "value", + DataType::List(Arc::new(Field::new( + "item", + DataType::Int64, + true, + ))), + true, + ), + ] + .into(), + ), + false, + )), + false, + ); + let valid_type = DataType::Map( + Arc::new(Field::new( + "pairs", + DataType::Struct( + vec![ + Field::new("key", DataType::Utf8, false), + Field::new( + "value", + DataType::List(Arc::new(Field::new( + "element", + DataType::Int64, + true, + ))), + true, + ), + ] + .into(), + ), + false, + )), + false, + ); + + assert!(data_types_match(&[valid_type], &[current_type])); + } + #[test] fn test_data_types_match_respects_struct_field_order() { let valid_type = DataType::Struct( diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 5a3903dbd674d..a2ae62f82adc8 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -1806,6 +1806,110 @@ mod test { ) } + #[test] + fn scalar_function_preserves_equivalent_nested_types_in_structs() -> Result<()> { + let current_type = DataType::Struct( + vec![Field::new( + "a", + DataType::List(Arc::new(Field::new("item", DataType::Int64, true))), + true, + )] + .into(), + ); + let signature_type = DataType::Struct( + vec![Field::new( + "a", + DataType::List(Arc::new(Field::new("element", DataType::Int64, true))), + true, + )] + .into(), + ); + let empty = empty_with_type(current_type); + let fun = ScalarUDF::new_from_impl(TestScalarUDF { + signature: Signature::exact(vec![signature_type], Volatility::Stable), + }); + let scalar_function_expr = + Expr::ScalarFunction(ScalarFunction::new_udf(Arc::new(fun), vec![col("a")])); + let plan = LogicalPlan::Projection(Projection::try_new( + vec![scalar_function_expr], + empty, + )?); + + assert_analyzed_plan_eq!( + plan, + @r" + Projection: TestScalarUDF(a) + EmptyRelation: rows=0 + " + ) + } + + #[test] + fn scalar_function_preserves_equivalent_nested_types_in_maps() -> Result<()> { + let current_type = DataType::Map( + Arc::new(Field::new( + "entries", + DataType::Struct( + vec![ + Field::new("key", Utf8, false), + Field::new( + "value", + DataType::List(Arc::new(Field::new( + "item", + DataType::Int64, + true, + ))), + true, + ), + ] + .into(), + ), + false, + )), + false, + ); + let signature_type = DataType::Map( + Arc::new(Field::new( + "pairs", + DataType::Struct( + vec![ + Field::new("key", Utf8, false), + Field::new( + "value", + DataType::List(Arc::new(Field::new( + "element", + DataType::Int64, + true, + ))), + true, + ), + ] + .into(), + ), + false, + )), + false, + ); + let empty = empty_with_type(current_type); + let fun = ScalarUDF::new_from_impl(TestScalarUDF { + signature: Signature::exact(vec![signature_type], Volatility::Stable), + }); + let scalar_function_expr = + Expr::ScalarFunction(ScalarFunction::new_udf(Arc::new(fun), vec![col("a")])); + let plan = LogicalPlan::Projection(Projection::try_new( + vec![scalar_function_expr], + empty, + )?); + + assert_analyzed_plan_eq!( + plan, + @r" + Projection: TestScalarUDF(a) + EmptyRelation: rows=0 + " + ) + } + #[test] fn agg_udaf() -> Result<()> { let empty = empty(); diff --git a/datafusion/sqllogictest/test_files/spark/array/array.slt b/datafusion/sqllogictest/test_files/spark/array/array.slt index 79dca1c10a7d0..8783f761e711e 100644 --- a/datafusion/sqllogictest/test_files/spark/array/array.slt +++ b/datafusion/sqllogictest/test_files/spark/array/array.slt @@ -85,3 +85,72 @@ query ? SELECT array(arrow_cast(array(1,2), 'LargeList(Int64)'), array(3)); ---- [[1, 2], [3]] + +statement ok +set datafusion.explain.logical_plan_only = true; + +query TT +EXPLAIN VERBOSE SELECT array_element(arrow_cast([1, 2], 'LargeList(Int64)'), 1); +---- +initial_logical_plan +01)Projection: array_element(arrow_cast(make_array(Int64(1), Int64(2)), Utf8("LargeList(Int64)")), Int64(1)) +02)--EmptyRelation: rows=1 +logical_plan after resolve_grouping_function SAME TEXT AS ABOVE +logical_plan after type_coercion SAME TEXT AS ABOVE +analyzed_logical_plan SAME TEXT AS ABOVE +logical_plan after rewrite_set_comparison SAME TEXT AS ABOVE +logical_plan after optimize_unions SAME TEXT AS ABOVE +logical_plan after simplify_expressions +01)Projection: Int64(1) AS arrow_cast(make_array(Int64(1),Int64(2)),Utf8("LargeList(Int64)"))[Int64(1)] +02)--EmptyRelation: rows=1 +logical_plan after replace_distinct_aggregate SAME TEXT AS ABOVE +logical_plan after eliminate_join SAME TEXT AS ABOVE +logical_plan after decorrelate_predicate_subquery SAME TEXT AS ABOVE +logical_plan after scalar_subquery_to_join SAME TEXT AS ABOVE +logical_plan after decorrelate_lateral_join SAME TEXT AS ABOVE +logical_plan after extract_equijoin_predicate SAME TEXT AS ABOVE +logical_plan after eliminate_duplicated_expr SAME TEXT AS ABOVE +logical_plan after eliminate_filter SAME TEXT AS ABOVE +logical_plan after eliminate_cross_join SAME TEXT AS ABOVE +logical_plan after eliminate_limit SAME TEXT AS ABOVE +logical_plan after propagate_empty_relation SAME TEXT AS ABOVE +logical_plan after filter_null_join_keys SAME TEXT AS ABOVE +logical_plan after eliminate_outer_join SAME TEXT AS ABOVE +logical_plan after push_down_limit SAME TEXT AS ABOVE +logical_plan after push_down_filter SAME TEXT AS ABOVE +logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE +logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE +logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE +logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE +logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE +logical_plan after optimize_projections SAME TEXT AS ABOVE +logical_plan after rewrite_set_comparison SAME TEXT AS ABOVE +logical_plan after optimize_unions SAME TEXT AS ABOVE +logical_plan after simplify_expressions SAME TEXT AS ABOVE +logical_plan after replace_distinct_aggregate SAME TEXT AS ABOVE +logical_plan after eliminate_join SAME TEXT AS ABOVE +logical_plan after decorrelate_predicate_subquery SAME TEXT AS ABOVE +logical_plan after scalar_subquery_to_join SAME TEXT AS ABOVE +logical_plan after decorrelate_lateral_join SAME TEXT AS ABOVE +logical_plan after extract_equijoin_predicate SAME TEXT AS ABOVE +logical_plan after eliminate_duplicated_expr SAME TEXT AS ABOVE +logical_plan after eliminate_filter SAME TEXT AS ABOVE +logical_plan after eliminate_cross_join SAME TEXT AS ABOVE +logical_plan after eliminate_limit SAME TEXT AS ABOVE +logical_plan after propagate_empty_relation SAME TEXT AS ABOVE +logical_plan after filter_null_join_keys SAME TEXT AS ABOVE +logical_plan after eliminate_outer_join SAME TEXT AS ABOVE +logical_plan after push_down_limit SAME TEXT AS ABOVE +logical_plan after push_down_filter SAME TEXT AS ABOVE +logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE +logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE +logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE +logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE +logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE +logical_plan after optimize_projections SAME TEXT AS ABOVE +logical_plan +01)Projection: Int64(1) AS arrow_cast(make_array(Int64(1),Int64(2)),Utf8("LargeList(Int64)"))[Int64(1)] +02)--EmptyRelation: rows=1 + +statement ok +set datafusion.explain.logical_plan_only = false; From 215a74ea577628501451f2355d71813088bd3699 Mon Sep 17 00:00:00 2001 From: eric_song Date: Wed, 18 Mar 2026 13:34:40 +1100 Subject: [PATCH 6/7] test: simplify nested type coverage --- .../expr/src/type_coercion/functions.rs | 6 +- .../optimizer/src/analyzer/type_coercion.rs | 140 ------------------ .../test_files/spark/array/array.slt | 58 +------- 3 files changed, 4 insertions(+), 200 deletions(-) diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index c094f5bf29d99..99d4090a59a74 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -314,12 +314,12 @@ fn data_types_match(valid_types: &[DataType], current_types: &[DataType]) -> boo && data_type_matches(valid.data_type(), current.data_type()) } + // Allow nested types that differ only by wrapper field names chosen by an + // Arrow producer (for example parquet lists/maps), but keep struct field + // names and ordering exact because some kernels depend on them at runtime. fn data_type_matches(valid: &DataType, current: &DataType) -> bool { match (valid, current) { (valid, current) if valid == current => true, - // Ignore wrapper field names for nested types whose physical layout - // only depends on the child type, not the synthetic field name - // chosen by a particular Arrow producer (for example parquet lists). (DataType::List(valid), DataType::List(current)) | (DataType::LargeList(valid), DataType::LargeList(current)) | (DataType::ListView(valid), DataType::ListView(current)) diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index a2ae62f82adc8..efc9984acb9b0 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -1770,146 +1770,6 @@ mod test { ) } - #[test] - fn scalar_function_preserves_equivalent_nested_types() -> Result<()> { - let struct_fields = vec![ - Field::new("id", Utf8, true), - Field::new("prim", DataType::Boolean, true), - ]; - let current_type = DataType::List(Arc::new(Field::new( - "item", - DataType::Struct(struct_fields.clone().into()), - true, - ))); - let signature_type = DataType::List(Arc::new(Field::new( - "element", - DataType::Struct(struct_fields.into()), - true, - ))); - let empty = empty_with_type(current_type); - let fun = ScalarUDF::new_from_impl(TestScalarUDF { - signature: Signature::exact(vec![signature_type], Volatility::Stable), - }); - let scalar_function_expr = - Expr::ScalarFunction(ScalarFunction::new_udf(Arc::new(fun), vec![col("a")])); - let plan = LogicalPlan::Projection(Projection::try_new( - vec![scalar_function_expr], - empty, - )?); - - assert_analyzed_plan_eq!( - plan, - @r" - Projection: TestScalarUDF(a) - EmptyRelation: rows=0 - " - ) - } - - #[test] - fn scalar_function_preserves_equivalent_nested_types_in_structs() -> Result<()> { - let current_type = DataType::Struct( - vec![Field::new( - "a", - DataType::List(Arc::new(Field::new("item", DataType::Int64, true))), - true, - )] - .into(), - ); - let signature_type = DataType::Struct( - vec![Field::new( - "a", - DataType::List(Arc::new(Field::new("element", DataType::Int64, true))), - true, - )] - .into(), - ); - let empty = empty_with_type(current_type); - let fun = ScalarUDF::new_from_impl(TestScalarUDF { - signature: Signature::exact(vec![signature_type], Volatility::Stable), - }); - let scalar_function_expr = - Expr::ScalarFunction(ScalarFunction::new_udf(Arc::new(fun), vec![col("a")])); - let plan = LogicalPlan::Projection(Projection::try_new( - vec![scalar_function_expr], - empty, - )?); - - assert_analyzed_plan_eq!( - plan, - @r" - Projection: TestScalarUDF(a) - EmptyRelation: rows=0 - " - ) - } - - #[test] - fn scalar_function_preserves_equivalent_nested_types_in_maps() -> Result<()> { - let current_type = DataType::Map( - Arc::new(Field::new( - "entries", - DataType::Struct( - vec![ - Field::new("key", Utf8, false), - Field::new( - "value", - DataType::List(Arc::new(Field::new( - "item", - DataType::Int64, - true, - ))), - true, - ), - ] - .into(), - ), - false, - )), - false, - ); - let signature_type = DataType::Map( - Arc::new(Field::new( - "pairs", - DataType::Struct( - vec![ - Field::new("key", Utf8, false), - Field::new( - "value", - DataType::List(Arc::new(Field::new( - "element", - DataType::Int64, - true, - ))), - true, - ), - ] - .into(), - ), - false, - )), - false, - ); - let empty = empty_with_type(current_type); - let fun = ScalarUDF::new_from_impl(TestScalarUDF { - signature: Signature::exact(vec![signature_type], Volatility::Stable), - }); - let scalar_function_expr = - Expr::ScalarFunction(ScalarFunction::new_udf(Arc::new(fun), vec![col("a")])); - let plan = LogicalPlan::Projection(Projection::try_new( - vec![scalar_function_expr], - empty, - )?); - - assert_analyzed_plan_eq!( - plan, - @r" - Projection: TestScalarUDF(a) - EmptyRelation: rows=0 - " - ) - } - #[test] fn agg_udaf() -> Result<()> { let empty = empty(); diff --git a/datafusion/sqllogictest/test_files/spark/array/array.slt b/datafusion/sqllogictest/test_files/spark/array/array.slt index 8783f761e711e..1096bc9746348 100644 --- a/datafusion/sqllogictest/test_files/spark/array/array.slt +++ b/datafusion/sqllogictest/test_files/spark/array/array.slt @@ -90,64 +90,8 @@ statement ok set datafusion.explain.logical_plan_only = true; query TT -EXPLAIN VERBOSE SELECT array_element(arrow_cast([1, 2], 'LargeList(Int64)'), 1); +EXPLAIN SELECT array_element(arrow_cast([1, 2], 'LargeList(Int64)'), 1); ---- -initial_logical_plan -01)Projection: array_element(arrow_cast(make_array(Int64(1), Int64(2)), Utf8("LargeList(Int64)")), Int64(1)) -02)--EmptyRelation: rows=1 -logical_plan after resolve_grouping_function SAME TEXT AS ABOVE -logical_plan after type_coercion SAME TEXT AS ABOVE -analyzed_logical_plan SAME TEXT AS ABOVE -logical_plan after rewrite_set_comparison SAME TEXT AS ABOVE -logical_plan after optimize_unions SAME TEXT AS ABOVE -logical_plan after simplify_expressions -01)Projection: Int64(1) AS arrow_cast(make_array(Int64(1),Int64(2)),Utf8("LargeList(Int64)"))[Int64(1)] -02)--EmptyRelation: rows=1 -logical_plan after replace_distinct_aggregate SAME TEXT AS ABOVE -logical_plan after eliminate_join SAME TEXT AS ABOVE -logical_plan after decorrelate_predicate_subquery SAME TEXT AS ABOVE -logical_plan after scalar_subquery_to_join SAME TEXT AS ABOVE -logical_plan after decorrelate_lateral_join SAME TEXT AS ABOVE -logical_plan after extract_equijoin_predicate SAME TEXT AS ABOVE -logical_plan after eliminate_duplicated_expr SAME TEXT AS ABOVE -logical_plan after eliminate_filter SAME TEXT AS ABOVE -logical_plan after eliminate_cross_join SAME TEXT AS ABOVE -logical_plan after eliminate_limit SAME TEXT AS ABOVE -logical_plan after propagate_empty_relation SAME TEXT AS ABOVE -logical_plan after filter_null_join_keys SAME TEXT AS ABOVE -logical_plan after eliminate_outer_join SAME TEXT AS ABOVE -logical_plan after push_down_limit SAME TEXT AS ABOVE -logical_plan after push_down_filter SAME TEXT AS ABOVE -logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE -logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE -logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE -logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE -logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE -logical_plan after optimize_projections SAME TEXT AS ABOVE -logical_plan after rewrite_set_comparison SAME TEXT AS ABOVE -logical_plan after optimize_unions SAME TEXT AS ABOVE -logical_plan after simplify_expressions SAME TEXT AS ABOVE -logical_plan after replace_distinct_aggregate SAME TEXT AS ABOVE -logical_plan after eliminate_join SAME TEXT AS ABOVE -logical_plan after decorrelate_predicate_subquery SAME TEXT AS ABOVE -logical_plan after scalar_subquery_to_join SAME TEXT AS ABOVE -logical_plan after decorrelate_lateral_join SAME TEXT AS ABOVE -logical_plan after extract_equijoin_predicate SAME TEXT AS ABOVE -logical_plan after eliminate_duplicated_expr SAME TEXT AS ABOVE -logical_plan after eliminate_filter SAME TEXT AS ABOVE -logical_plan after eliminate_cross_join SAME TEXT AS ABOVE -logical_plan after eliminate_limit SAME TEXT AS ABOVE -logical_plan after propagate_empty_relation SAME TEXT AS ABOVE -logical_plan after filter_null_join_keys SAME TEXT AS ABOVE -logical_plan after eliminate_outer_join SAME TEXT AS ABOVE -logical_plan after push_down_limit SAME TEXT AS ABOVE -logical_plan after push_down_filter SAME TEXT AS ABOVE -logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE -logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE -logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE -logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE -logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE -logical_plan after optimize_projections SAME TEXT AS ABOVE logical_plan 01)Projection: Int64(1) AS arrow_cast(make_array(Int64(1),Int64(2)),Utf8("LargeList(Int64)"))[Int64(1)] 02)--EmptyRelation: rows=1 From 00d249e9b1d64e24c08f5e338a910f62a32d56ee Mon Sep 17 00:00:00 2001 From: eric_song Date: Thu, 19 Mar 2026 02:41:30 +1100 Subject: [PATCH 7/7] test: expand nested type sqllogictests --- .../test_files/spark/array/array.slt | 37 ++++++++++++++++--- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/datafusion/sqllogictest/test_files/spark/array/array.slt b/datafusion/sqllogictest/test_files/spark/array/array.slt index 1096bc9746348..3c07b2326adf7 100644 --- a/datafusion/sqllogictest/test_files/spark/array/array.slt +++ b/datafusion/sqllogictest/test_files/spark/array/array.slt @@ -86,15 +86,42 @@ SELECT array(arrow_cast(array(1,2), 'LargeList(Int64)'), array(3)); ---- [[1, 2], [3]] -statement ok -set datafusion.explain.logical_plan_only = true; - query TT EXPLAIN SELECT array_element(arrow_cast([1, 2], 'LargeList(Int64)'), 1); ---- logical_plan 01)Projection: Int64(1) AS arrow_cast(make_array(Int64(1),Int64(2)),Utf8("LargeList(Int64)"))[Int64(1)] 02)--EmptyRelation: rows=1 +physical_plan +01)ProjectionExec: expr=[1 as arrow_cast(make_array(Int64(1),Int64(2)),Utf8("LargeList(Int64)"))[Int64(1)]] +02)--PlaceholderRowExec + +query TT +EXPLAIN SELECT get_field(array_element(MAP {'ECID': [{id: 1, prim: true}, {id: 2, prim: false}]}['ECID'], 1), 'id'); +---- +logical_plan +01)Projection: Int64(1) AS map(make_array(Utf8("ECID")),make_array(make_array(named_struct(Utf8("id"),Int64(1),Utf8("prim"),Boolean(true)),named_struct(Utf8("id"),Int64(2),Utf8("prim"),Boolean(false)))))[ECID][Int64(1)][id] +02)--EmptyRelation: rows=1 +physical_plan +01)ProjectionExec: expr=[1 as map(make_array(Utf8("ECID")),make_array(make_array(named_struct(Utf8("id"),Int64(1),Utf8("prim"),Boolean(true)),named_struct(Utf8("id"),Int64(2),Utf8("prim"),Boolean(false)))))[ECID][Int64(1)][id]] +02)--PlaceholderRowExec -statement ok -set datafusion.explain.logical_plan_only = false; +query TT +EXPLAIN SELECT array_element(get_field({'a': [1, 2]}, 'a'), 1); +---- +logical_plan +01)Projection: Int64(1) AS named_struct(Utf8("a"),make_array(Int64(1),Int64(2)))[a][Int64(1)] +02)--EmptyRelation: rows=1 +physical_plan +01)ProjectionExec: expr=[1 as named_struct(Utf8("a"),make_array(Int64(1),Int64(2)))[a][Int64(1)]] +02)--PlaceholderRowExec + +query TT +EXPLAIN SELECT array_element(arrow_cast(make_array(1, 2), 'FixedSizeList(2, Int64)'), 1); +---- +logical_plan +01)Projection: Int64(1) AS arrow_cast(make_array(Int64(1),Int64(2)),Utf8("FixedSizeList(2, Int64)"))[Int64(1)] +02)--EmptyRelation: rows=1 +physical_plan +01)ProjectionExec: expr=[1 as arrow_cast(make_array(Int64(1),Int64(2)),Utf8("FixedSizeList(2, Int64)"))[Int64(1)]] +02)--PlaceholderRowExec