diff --git a/datafusion/spark/src/function/datetime/date_diff.rs b/datafusion/spark/src/function/datetime/date_diff.rs new file mode 100644 index 0000000000000..094c35eec56b5 --- /dev/null +++ b/datafusion/spark/src/function/datetime/date_diff.rs @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::datatypes::{DataType, Field, FieldRef}; +use datafusion_common::types::{NativeType, logical_date, logical_string}; +use datafusion_common::utils::take_function_args; +use datafusion_common::{Result, internal_err}; +use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext}; +use datafusion_expr::{ + Coercion, ColumnarValue, Expr, ExprSchemable, Operator, ReturnFieldArgs, + ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignatureClass, Volatility, + binary_expr, +}; + +/// +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkDateDiff { + signature: Signature, + aliases: Vec, +} + +impl Default for SparkDateDiff { + fn default() -> Self { + Self::new() + } +} + +impl SparkDateDiff { + pub fn new() -> Self { + Self { + signature: Signature::coercible( + vec![ + Coercion::new_implicit( + TypeSignatureClass::Native(logical_date()), + vec![ + TypeSignatureClass::Native(logical_string()), + TypeSignatureClass::Timestamp, + ], + NativeType::Date, + ), + Coercion::new_implicit( + TypeSignatureClass::Native(logical_date()), + vec![ + TypeSignatureClass::Native(logical_string()), + TypeSignatureClass::Timestamp, + ], + NativeType::Date, + ), + ], + Volatility::Immutable, + ), + aliases: vec!["datediff".to_string()], + } + } +} + +impl ScalarUDFImpl for SparkDateDiff { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "date_diff" + } + + fn aliases(&self) -> &[String] { + &self.aliases + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + internal_err!("return_field_from_args should be used instead") + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + let nullable = args.arg_fields.iter().any(|f| f.is_nullable()); + Ok(Arc::new(Field::new(self.name(), DataType::Int32, nullable))) + } + + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { + internal_err!( + "Apache Spark `date_diff` should have been simplified to standard subtraction" + ) + } + + fn simplify( + &self, + args: Vec, + info: &SimplifyContext, + ) -> Result { + let [end, start] = take_function_args(self.name(), args)?; + let end = end.cast_to(&DataType::Date32, info.schema())?; + let start = start.cast_to(&DataType::Date32, info.schema())?; + Ok(ExprSimplifyResult::Simplified( + binary_expr(end, Operator::Minus, start) + .cast_to(&DataType::Int32, info.schema())?, + )) + } +} diff --git a/datafusion/spark/src/function/datetime/mod.rs b/datafusion/spark/src/function/datetime/mod.rs index 92d7eab32c11d..7c270c14408b8 100644 --- a/datafusion/spark/src/function/datetime/mod.rs +++ b/datafusion/spark/src/function/datetime/mod.rs @@ -16,6 +16,7 @@ // under the License. pub mod date_add; +pub mod date_diff; pub mod date_part; pub mod date_sub; pub mod date_trunc; @@ -32,6 +33,7 @@ use datafusion_functions::make_udf_function; use std::sync::Arc; make_udf_function!(date_add::SparkDateAdd, date_add); +make_udf_function!(date_diff::SparkDateDiff, date_diff); make_udf_function!(date_part::SparkDatePart, date_part); make_udf_function!(date_sub::SparkDateSub, date_sub); make_udf_function!(date_trunc::SparkDateTrunc, date_trunc); @@ -91,6 +93,11 @@ pub mod expr_fn { "Returns the first date which is later than start_date and named as indicated. The function returns NULL if at least one of the input parameters is NULL.", arg1 arg2 )); + export_functions!(( + date_diff, + "Returns the number of days from start `start` to end `end`.", + end start + )); export_functions!(( date_trunc, "Truncates a timestamp `ts` to the unit specified by the format `fmt`.", @@ -110,13 +117,13 @@ pub mod expr_fn { date_part, "Extracts a part of the date or time from a date, time, or timestamp expression.", arg1 arg2 - )); } pub fn functions() -> Vec> { vec![ date_add(), + date_diff(), date_part(), date_sub(), date_trunc(), diff --git a/datafusion/sqllogictest/test_files/spark/datetime/date_diff.slt b/datafusion/sqllogictest/test_files/spark/datetime/date_diff.slt index c5871ab41e183..b0952d6a43510 100644 --- a/datafusion/sqllogictest/test_files/spark/datetime/date_diff.slt +++ b/datafusion/sqllogictest/test_files/spark/datetime/date_diff.slt @@ -15,18 +15,138 @@ # specific language governing permissions and limitations # under the License. -# This file was originally created by a porting script from: -# https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function -# This file is part of the implementation of the datafusion-spark function library. -# For more information, please see: -# https://github.com/apache/datafusion/issues/15914 - -## Original Query: SELECT date_diff('2009-07-30', '2009-07-31'); -## PySpark 3.5.5 Result: {'date_diff(2009-07-30, 2009-07-31)': -1, 'typeof(date_diff(2009-07-30, 2009-07-31))': 'int', 'typeof(2009-07-30)': 'string', 'typeof(2009-07-31)': 'string'} -#query -#SELECT date_diff('2009-07-30'::string, '2009-07-31'::string); - -## Original Query: SELECT date_diff('2009-07-31', '2009-07-30'); -## PySpark 3.5.5 Result: {'date_diff(2009-07-31, 2009-07-30)': 1, 'typeof(date_diff(2009-07-31, 2009-07-30))': 'int', 'typeof(2009-07-31)': 'string', 'typeof(2009-07-30)': 'string'} -#query -#SELECT date_diff('2009-07-31'::string, '2009-07-30'::string); +# date input +query I +SELECT date_diff('2009-07-30'::date, '2009-07-31'::date); +---- +-1 + +query I +SELECT date_diff('2009-07-31'::date, '2009-07-30'::date); +---- +1 + +query I +SELECT date_diff('2009-07-31'::string, '2009-07-30'::date); +---- +1 + +query I +SELECT date_diff('2009-07-31'::timestamp, '2009-07-30'::date); +---- +1 + +# Date64 input +query I +SELECT date_diff(arrow_cast('2009-07-31', 'Date64'), arrow_cast('2009-07-30', 'Date64')); +---- +1 + +query I +SELECT date_diff(arrow_cast('2009-07-30', 'Date64'), arrow_cast('2009-07-31', 'Date64')); +---- +-1 + +# Mixed Date32 and Date64 input +query I +SELECT date_diff('2009-07-31'::date, arrow_cast('2009-07-30', 'Date64')); +---- +1 + +query I +SELECT date_diff(arrow_cast('2009-07-31', 'Date64'), '2009-07-30'::date); +---- +1 + + +# Same date returns 0 +query I +SELECT date_diff('2009-07-30'::date, '2009-07-30'::date); +---- +0 + +# Large difference +query I +SELECT date_diff('2020-01-01'::date, '1970-01-01'::date); +---- +18262 + +# timestamp input +query I +SELECT date_diff('2009-07-30 12:34:56'::timestamp, '2009-07-31 23:45:01'::timestamp); +---- +-1 + +query I +SELECT date_diff('2009-07-31 23:45:01'::timestamp, '2009-07-30 12:34:56'::timestamp); +---- +1 + +query I +SELECT date_diff('2009-07-31 23:45:01'::string, '2009-07-30 12:34:56'::timestamp); +---- +1 + +# string input +query I +SELECT date_diff('2009-07-30', '2009-07-31'); +---- +-1 + +query I +SELECT date_diff('2009-07-31', '2009-07-30'); +---- +1 + +# NULL handling +query I +SELECT date_diff(NULL::date, '2009-07-30'::date); +---- +NULL + +query I +SELECT date_diff('2009-07-31'::date, NULL::date); +---- +NULL + +query I +SELECT date_diff(NULL::date, NULL::date); +---- +NULL + +query I +SELECT date_diff(column1, column2) +FROM VALUES +('2009-07-30'::date, '2009-07-31'::date), +('2009-07-31'::date, '2009-07-30'::date), +(NULL::date, '2009-07-30'::date), +('2009-07-31'::date, NULL::date), +(NULL::date, NULL::date); +---- +-1 +1 +NULL +NULL +NULL + + +# Alias datediff +query I +SELECT datediff('2009-07-30'::date, '2009-07-31'::date); +---- +-1 + +query I +SELECT datediff(column1, column2) +FROM VALUES +('2009-07-30'::date, '2009-07-31'::date), +('2009-07-31'::date, '2009-07-30'::date), +(NULL::date, '2009-07-30'::date), +('2009-07-31'::date, NULL::date), +(NULL::date, NULL::date); +---- +-1 +1 +NULL +NULL +NULL diff --git a/datafusion/sqllogictest/test_files/spark/datetime/datediff.slt b/datafusion/sqllogictest/test_files/spark/datetime/datediff.slt deleted file mode 100644 index 223e2c313ae86..0000000000000 --- a/datafusion/sqllogictest/test_files/spark/datetime/datediff.slt +++ /dev/null @@ -1,32 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# This file was originally created by a porting script from: -# https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function -# This file is part of the implementation of the datafusion-spark function library. -# For more information, please see: -# https://github.com/apache/datafusion/issues/15914 - -## Original Query: SELECT datediff('2009-07-30', '2009-07-31'); -## PySpark 3.5.5 Result: {'datediff(2009-07-30, 2009-07-31)': -1, 'typeof(datediff(2009-07-30, 2009-07-31))': 'int', 'typeof(2009-07-30)': 'string', 'typeof(2009-07-31)': 'string'} -#query -#SELECT datediff('2009-07-30'::string, '2009-07-31'::string); - -## Original Query: SELECT datediff('2009-07-31', '2009-07-30'); -## PySpark 3.5.5 Result: {'datediff(2009-07-31, 2009-07-30)': 1, 'typeof(datediff(2009-07-31, 2009-07-30))': 'int', 'typeof(2009-07-31)': 'string', 'typeof(2009-07-30)': 'string'} -#query -#SELECT datediff('2009-07-31'::string, '2009-07-30'::string);