From d5cf748b1a4ede490fe0468da949a43ef909bd37 Mon Sep 17 00:00:00 2001 From: Marko Grujic Date: Mon, 9 Mar 2026 12:45:37 +0100 Subject: [PATCH 1/2] fix: inherit field names from left projection in set expressions when missing Otherwise they might end up having the same column names, which is prohibited. This is only the case for literals, other expression types should have default unique names. --- datafusion/sql/src/set_expr.rs | 101 ++++++++++++++++++- datafusion/sql/tests/sql_integration.rs | 54 ++++++++++ datafusion/sqllogictest/test_files/union.slt | 24 +++++ 3 files changed, 176 insertions(+), 3 deletions(-) diff --git a/datafusion/sql/src/set_expr.rs b/datafusion/sql/src/set_expr.rs index d4e771cb48585..6e9b2ea5d5340 100644 --- a/datafusion/sql/src/set_expr.rs +++ b/datafusion/sql/src/set_expr.rs @@ -17,10 +17,12 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use datafusion_common::{ - DataFusionError, Diagnostic, Result, Span, not_impl_err, plan_err, + DFSchemaRef, DataFusionError, Diagnostic, Result, Span, not_impl_err, plan_err, }; use datafusion_expr::{LogicalPlan, LogicalPlanBuilder}; -use sqlparser::ast::{SetExpr, SetOperator, SetQuantifier, Spanned}; +use sqlparser::ast::{ + Expr as SQLExpr, Ident, SelectItem, SetExpr, SetOperator, SetQuantifier, Spanned, +}; impl SqlToRel<'_, S> { #[cfg_attr(feature = "recursive_protection", recursive::recursive)] @@ -42,7 +44,28 @@ impl SqlToRel<'_, S> { let left_span = Span::try_from_sqlparser_span(left.span()); let right_span = Span::try_from_sqlparser_span(right.span()); let left_plan = self.set_expr_to_plan(*left, planner_context); - let right_plan = self.set_expr_to_plan(*right, planner_context); + + // For non-*ByName operations, add missing aliases to right side using left schema's + // column names. This allows queries like + // `SELECT 1 c1, 0 c2, 0 c3 UNION ALL SELECT 2, 0, 0` + // where the right side has duplicate literal values. + // We only do this if the left side succeeded. + let right = if let Ok(plan) = &left_plan + && plan.schema().fields().len() > 1 + && matches!( + set_quantifier, + SetQuantifier::All + | SetQuantifier::Distinct + | SetQuantifier::None + ) { + alias_set_expr(*right, plan.schema()) + } else { + *right + }; + + let right_plan = self.set_expr_to_plan(right, planner_context); + + // Handle errors from both sides, collecting them if both failed let (left_plan, right_plan) = match (left_plan, right_plan) { (Ok(left_plan), Ok(right_plan)) => (left_plan, right_plan), (Err(left_err), Err(right_err)) => { @@ -160,3 +183,75 @@ impl SqlToRel<'_, S> { } } } + +// Adds aliases to SELECT items in a SetExpr using the provided schema. +// This ensures that unnamed expressions on the right side of a UNION/INTERSECT/EXCEPT +// get aliased with the column names from the left side, allowing queries like +// `SELECT 1 AS a, 0 AS b, 0 AS c UNION ALL SELECT 2, 0, 0` to work correctly. +fn alias_set_expr(set_expr: SetExpr, schema: &DFSchemaRef) -> SetExpr { + match set_expr { + SetExpr::Select(mut select) => { + alias_select_items(&mut select.projection, schema); + SetExpr::Select(select) + } + SetExpr::SetOperation { + op, + left, + right, + set_quantifier, + } => { + // For nested set operations, only alias the leftmost branch + // since that's what determines the output column names + SetExpr::SetOperation { + op, + left: Box::new(alias_set_expr(*left, schema)), + right, + set_quantifier, + } + } + SetExpr::Query(mut query) => { + // Handle parenthesized queries like (SELECT ... UNION ALL SELECT ...) + query.body = Box::new(alias_set_expr(*query.body, schema)); + SetExpr::Query(query) + } + // For other cases (Values, etc.), return as-is + other => other, + } +} + +// Adds aliases to literal value expressions where missing, based on the input schema, as these are +// the ones that can cause duplicate name issues (e.g. `SELECT 0, 0` has two columns named `Int64(0)`). +// Other expressions typically have unique names. +fn alias_select_items(items: &mut [SelectItem], schema: &DFSchemaRef) { + let mut col_idx = 0; + for item in items.iter_mut() { + match item { + SelectItem::UnnamedExpr(expr) if is_literal_value(expr) => { + if let Some(field) = schema.fields().get(col_idx) { + *item = SelectItem::ExprWithAlias { + expr: expr.clone(), + alias: Ident::new(field.name()), + }; + } + col_idx += 1; + } + SelectItem::UnnamedExpr(_) | SelectItem::ExprWithAlias { .. } => { + col_idx += 1; + } + SelectItem::Wildcard(_) | SelectItem::QualifiedWildcard(_, _) => { + // Wildcards expand to multiple columns - skip position tracking + } + } + } +} + +/// Returns true if the expression is a literal value that could cause duplicate names. +fn is_literal_value(expr: &SQLExpr) -> bool { + matches!( + expr, + SQLExpr::Value(_) + | SQLExpr::UnaryOp { .. } + | SQLExpr::TypedString { .. } + | SQLExpr::Interval(_) + ) +} diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 29c17be69ce5f..e864fae9a9144 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -2655,6 +2655,60 @@ fn union_all_by_name_same_column_names() { ); } +#[test] +fn union_all_with_duplicate_literals() { + let sql = "SELECT 0 a, 0 b UNION ALL SELECT 1, 1"; + let plan = logical_plan(sql).unwrap(); + assert_snapshot!( + plan, + @r" + Union + Projection: Int64(0) AS a, Int64(0) AS b + EmptyRelation: rows=1 + Projection: Int64(1) AS a, Int64(1) AS b + EmptyRelation: rows=1 + " + ); +} + +#[test] +fn intersect_with_duplicate_literals() { + let sql = "SELECT 1 as a, 0 as b, 0 as c INTERSECT SELECT 1, 0, 0"; + let plan = logical_plan(sql).unwrap(); + assert_snapshot!( + plan, + @r" + LeftSemi Join: left.a = right.a, left.b = right.b, left.c = right.c + Distinct: + SubqueryAlias: left + Projection: Int64(1) AS a, Int64(0) AS b, Int64(0) AS c + EmptyRelation: rows=1 + SubqueryAlias: right + Projection: Int64(1) AS a, Int64(0) AS b, Int64(0) AS c + EmptyRelation: rows=1 + " + ); +} + +#[test] +fn except_with_duplicate_literals() { + let sql = "SELECT 1 as a, 0 as b, 0 as c EXCEPT SELECT 2, 0, 0"; + let plan = logical_plan(sql).unwrap(); + assert_snapshot!( + plan, + @r" + LeftAnti Join: left.a = right.a, left.b = right.b, left.c = right.c + Distinct: + SubqueryAlias: left + Projection: Int64(1) AS a, Int64(0) AS b, Int64(0) AS c + EmptyRelation: rows=1 + SubqueryAlias: right + Projection: Int64(2) AS a, Int64(0) AS b, Int64(0) AS c + EmptyRelation: rows=1 + " + ); +} + #[test] fn empty_over() { let sql = "SELECT order_id, MAX(order_id) OVER () from orders"; diff --git a/datafusion/sqllogictest/test_files/union.slt b/datafusion/sqllogictest/test_files/union.slt index d858d0ae3ea4e..d0ad5c8bb3c58 100644 --- a/datafusion/sqllogictest/test_files/union.slt +++ b/datafusion/sqllogictest/test_files/union.slt @@ -256,6 +256,30 @@ Bob_new John John_new +# Test UNION ALL with unaliased duplicate literal values on the right side. +# The second projection will inherit field names from the first one, and so +# pass the unique projection expression name check. +query TII rowsort +SELECT name, 1 as table, 1 as row FROM t1 WHERE id = 1 +UNION ALL +SELECT name, 2, 2 FROM t2 WHERE id = 2 +---- +Alex 1 1 +Bob 2 2 + +# Test nested UNION, EXCEPT, INTERSECT with duplicate unaliased literals. +# Only the first SELECT has column aliases, which should propagate to all projections. +query III rowsort +SELECT 1 as a, 0 as b, 0 as c +UNION ALL +((SELECT 2, 0, 0 UNION ALL SELECT 3, 0, 0) EXCEPT SELECT 3, 0, 0) +UNION ALL +(SELECT 4, 0, 0 INTERSECT SELECT 4, 0, 0) +---- +1 0 0 +2 0 0 +4 0 0 + # Plan is unnested query TT EXPLAIN SELECT name FROM t1 UNION ALL (SELECT name from t2 UNION ALL SELECT name || '_new' from t2) From 4d06d452bfa0ba0724d57284e7adf12c5d254368 Mon Sep 17 00:00:00 2001 From: Marko Grujic Date: Thu, 12 Mar 2026 09:25:01 +0100 Subject: [PATCH 2/2] ref: alter set expressions directly instead of creating new ones --- datafusion/sql/src/set_expr.rs | 50 ++++++++++------------------------ 1 file changed, 15 insertions(+), 35 deletions(-) diff --git a/datafusion/sql/src/set_expr.rs b/datafusion/sql/src/set_expr.rs index 6e9b2ea5d5340..bb9875182b264 100644 --- a/datafusion/sql/src/set_expr.rs +++ b/datafusion/sql/src/set_expr.rs @@ -38,7 +38,7 @@ impl SqlToRel<'_, S> { SetExpr::SetOperation { op, left, - right, + mut right, set_quantifier, } => { let left_span = Span::try_from_sqlparser_span(left.span()); @@ -47,23 +47,22 @@ impl SqlToRel<'_, S> { // For non-*ByName operations, add missing aliases to right side using left schema's // column names. This allows queries like - // `SELECT 1 c1, 0 c2, 0 c3 UNION ALL SELECT 2, 0, 0` + // `SELECT 1 a, 1 b UNION ALL SELECT 2, 2` // where the right side has duplicate literal values. // We only do this if the left side succeeded. - let right = if let Ok(plan) = &left_plan + if let Ok(plan) = &left_plan && plan.schema().fields().len() > 1 && matches!( set_quantifier, SetQuantifier::All | SetQuantifier::Distinct | SetQuantifier::None - ) { - alias_set_expr(*right, plan.schema()) - } else { - *right - }; + ) + { + alias_set_expr(&mut right, plan.schema()) + } - let right_plan = self.set_expr_to_plan(right, planner_context); + let right_plan = self.set_expr_to_plan(*right, planner_context); // Handle errors from both sides, collecting them if both failed let (left_plan, right_plan) = match (left_plan, right_plan) { @@ -188,34 +187,15 @@ impl SqlToRel<'_, S> { // This ensures that unnamed expressions on the right side of a UNION/INTERSECT/EXCEPT // get aliased with the column names from the left side, allowing queries like // `SELECT 1 AS a, 0 AS b, 0 AS c UNION ALL SELECT 2, 0, 0` to work correctly. -fn alias_set_expr(set_expr: SetExpr, schema: &DFSchemaRef) -> SetExpr { +fn alias_set_expr(set_expr: &mut SetExpr, schema: &DFSchemaRef) { match set_expr { - SetExpr::Select(mut select) => { - alias_select_items(&mut select.projection, schema); - SetExpr::Select(select) - } - SetExpr::SetOperation { - op, - left, - right, - set_quantifier, - } => { - // For nested set operations, only alias the leftmost branch - // since that's what determines the output column names - SetExpr::SetOperation { - op, - left: Box::new(alias_set_expr(*left, schema)), - right, - set_quantifier, - } - } - SetExpr::Query(mut query) => { - // Handle parenthesized queries like (SELECT ... UNION ALL SELECT ...) - query.body = Box::new(alias_set_expr(*query.body, schema)); - SetExpr::Query(query) - } + SetExpr::Select(select) => alias_select_items(&mut select.projection, schema), + // For nested set operations, only alias the leftmost branch + SetExpr::SetOperation { left, .. } => alias_set_expr(left, schema), + // Handle parenthesized queries like (SELECT ... UNION ALL SELECT ...) + SetExpr::Query(query) => alias_set_expr(&mut query.body, schema), // For other cases (Values, etc.), return as-is - other => other, + _other => (), } }