diff --git a/datafusion/sql/src/set_expr.rs b/datafusion/sql/src/set_expr.rs index d4e771cb48585..bb9875182b264 100644 --- a/datafusion/sql/src/set_expr.rs +++ b/datafusion/sql/src/set_expr.rs @@ -17,10 +17,12 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use datafusion_common::{ - DataFusionError, Diagnostic, Result, Span, not_impl_err, plan_err, + DFSchemaRef, DataFusionError, Diagnostic, Result, Span, not_impl_err, plan_err, }; use datafusion_expr::{LogicalPlan, LogicalPlanBuilder}; -use sqlparser::ast::{SetExpr, SetOperator, SetQuantifier, Spanned}; +use sqlparser::ast::{ + Expr as SQLExpr, Ident, SelectItem, SetExpr, SetOperator, SetQuantifier, Spanned, +}; impl SqlToRel<'_, S> { #[cfg_attr(feature = "recursive_protection", recursive::recursive)] @@ -36,13 +38,33 @@ impl SqlToRel<'_, S> { SetExpr::SetOperation { op, left, - right, + mut right, set_quantifier, } => { let left_span = Span::try_from_sqlparser_span(left.span()); let right_span = Span::try_from_sqlparser_span(right.span()); let left_plan = self.set_expr_to_plan(*left, planner_context); + + // For non-*ByName operations, add missing aliases to right side using left schema's + // column names. This allows queries like + // `SELECT 1 a, 1 b UNION ALL SELECT 2, 2` + // where the right side has duplicate literal values. + // We only do this if the left side succeeded. + if let Ok(plan) = &left_plan + && plan.schema().fields().len() > 1 + && matches!( + set_quantifier, + SetQuantifier::All + | SetQuantifier::Distinct + | SetQuantifier::None + ) + { + alias_set_expr(&mut right, plan.schema()) + } + let right_plan = self.set_expr_to_plan(*right, planner_context); + + // Handle errors from both sides, collecting them if both failed let (left_plan, right_plan) = match (left_plan, right_plan) { (Ok(left_plan), Ok(right_plan)) => (left_plan, right_plan), (Err(left_err), Err(right_err)) => { @@ -160,3 +182,56 @@ impl SqlToRel<'_, S> { } } } + +// Adds aliases to SELECT items in a SetExpr using the provided schema. +// This ensures that unnamed expressions on the right side of a UNION/INTERSECT/EXCEPT +// get aliased with the column names from the left side, allowing queries like +// `SELECT 1 AS a, 0 AS b, 0 AS c UNION ALL SELECT 2, 0, 0` to work correctly. +fn alias_set_expr(set_expr: &mut SetExpr, schema: &DFSchemaRef) { + match set_expr { + SetExpr::Select(select) => alias_select_items(&mut select.projection, schema), + // For nested set operations, only alias the leftmost branch + SetExpr::SetOperation { left, .. } => alias_set_expr(left, schema), + // Handle parenthesized queries like (SELECT ... UNION ALL SELECT ...) + SetExpr::Query(query) => alias_set_expr(&mut query.body, schema), + // For other cases (Values, etc.), return as-is + _other => (), + } +} + +// Adds aliases to literal value expressions where missing, based on the input schema, as these are +// the ones that can cause duplicate name issues (e.g. `SELECT 0, 0` has two columns named `Int64(0)`). +// Other expressions typically have unique names. +fn alias_select_items(items: &mut [SelectItem], schema: &DFSchemaRef) { + let mut col_idx = 0; + for item in items.iter_mut() { + match item { + SelectItem::UnnamedExpr(expr) if is_literal_value(expr) => { + if let Some(field) = schema.fields().get(col_idx) { + *item = SelectItem::ExprWithAlias { + expr: expr.clone(), + alias: Ident::new(field.name()), + }; + } + col_idx += 1; + } + SelectItem::UnnamedExpr(_) | SelectItem::ExprWithAlias { .. } => { + col_idx += 1; + } + SelectItem::Wildcard(_) | SelectItem::QualifiedWildcard(_, _) => { + // Wildcards expand to multiple columns - skip position tracking + } + } + } +} + +/// Returns true if the expression is a literal value that could cause duplicate names. +fn is_literal_value(expr: &SQLExpr) -> bool { + matches!( + expr, + SQLExpr::Value(_) + | SQLExpr::UnaryOp { .. } + | SQLExpr::TypedString { .. } + | SQLExpr::Interval(_) + ) +} diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 29c17be69ce5f..e864fae9a9144 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -2655,6 +2655,60 @@ fn union_all_by_name_same_column_names() { ); } +#[test] +fn union_all_with_duplicate_literals() { + let sql = "SELECT 0 a, 0 b UNION ALL SELECT 1, 1"; + let plan = logical_plan(sql).unwrap(); + assert_snapshot!( + plan, + @r" + Union + Projection: Int64(0) AS a, Int64(0) AS b + EmptyRelation: rows=1 + Projection: Int64(1) AS a, Int64(1) AS b + EmptyRelation: rows=1 + " + ); +} + +#[test] +fn intersect_with_duplicate_literals() { + let sql = "SELECT 1 as a, 0 as b, 0 as c INTERSECT SELECT 1, 0, 0"; + let plan = logical_plan(sql).unwrap(); + assert_snapshot!( + plan, + @r" + LeftSemi Join: left.a = right.a, left.b = right.b, left.c = right.c + Distinct: + SubqueryAlias: left + Projection: Int64(1) AS a, Int64(0) AS b, Int64(0) AS c + EmptyRelation: rows=1 + SubqueryAlias: right + Projection: Int64(1) AS a, Int64(0) AS b, Int64(0) AS c + EmptyRelation: rows=1 + " + ); +} + +#[test] +fn except_with_duplicate_literals() { + let sql = "SELECT 1 as a, 0 as b, 0 as c EXCEPT SELECT 2, 0, 0"; + let plan = logical_plan(sql).unwrap(); + assert_snapshot!( + plan, + @r" + LeftAnti Join: left.a = right.a, left.b = right.b, left.c = right.c + Distinct: + SubqueryAlias: left + Projection: Int64(1) AS a, Int64(0) AS b, Int64(0) AS c + EmptyRelation: rows=1 + SubqueryAlias: right + Projection: Int64(2) AS a, Int64(0) AS b, Int64(0) AS c + EmptyRelation: rows=1 + " + ); +} + #[test] fn empty_over() { let sql = "SELECT order_id, MAX(order_id) OVER () from orders"; diff --git a/datafusion/sqllogictest/test_files/union.slt b/datafusion/sqllogictest/test_files/union.slt index d858d0ae3ea4e..d0ad5c8bb3c58 100644 --- a/datafusion/sqllogictest/test_files/union.slt +++ b/datafusion/sqllogictest/test_files/union.slt @@ -256,6 +256,30 @@ Bob_new John John_new +# Test UNION ALL with unaliased duplicate literal values on the right side. +# The second projection will inherit field names from the first one, and so +# pass the unique projection expression name check. +query TII rowsort +SELECT name, 1 as table, 1 as row FROM t1 WHERE id = 1 +UNION ALL +SELECT name, 2, 2 FROM t2 WHERE id = 2 +---- +Alex 1 1 +Bob 2 2 + +# Test nested UNION, EXCEPT, INTERSECT with duplicate unaliased literals. +# Only the first SELECT has column aliases, which should propagate to all projections. +query III rowsort +SELECT 1 as a, 0 as b, 0 as c +UNION ALL +((SELECT 2, 0, 0 UNION ALL SELECT 3, 0, 0) EXCEPT SELECT 3, 0, 0) +UNION ALL +(SELECT 4, 0, 0 INTERSECT SELECT 4, 0, 0) +---- +1 0 0 +2 0 0 +4 0 0 + # Plan is unnested query TT EXPLAIN SELECT name FROM t1 UNION ALL (SELECT name from t2 UNION ALL SELECT name || '_new' from t2)