Skip to content
131 changes: 131 additions & 0 deletions datafusion/spark/src/function/datetime/date_trunc.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use std::any::Any;
use std::sync::Arc;

use arrow::datatypes::{DataType, Field, FieldRef, TimeUnit};
use datafusion_common::types::{NativeType, logical_string};
use datafusion_common::utils::take_function_args;
use datafusion_common::{Result, ScalarValue, internal_err};
use datafusion_expr::expr::ScalarFunction;
use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
use datafusion_expr::{
Coercion, ColumnarValue, Expr, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl,
Signature, TypeSignatureClass, Volatility,
};

/// Spark-compatible `date_trunc` scalar function.
///
/// Besides the units understood by DataFusion's `date_trunc`, Spark accepts a
/// few format aliases (`yy`/`yyyy`, `mm`/`mon`, `dd`). This UDF has no
/// execution logic of its own: its `simplify` implementation normalizes the
/// format and rewrites the call to DataFusion's `date_trunc`.
///
/// <https://spark.apache.org/docs/latest/api/sql/index.html#date_trunc>
#[derive(Debug, PartialEq, Eq, Hash)]
pub struct SparkDateTrunc {
    /// Accepted argument types, in order: format string, then timestamp.
    signature: Signature,
}

impl Default for SparkDateTrunc {
fn default() -> Self {
Self::new()
}
}

impl SparkDateTrunc {
    /// Creates the UDF with a coercible two-argument signature.
    ///
    /// * First argument (`fmt`): must be a string.
    /// * Second argument (`ts`): a timestamp. String inputs are also accepted
    ///   and implicitly cast, with microsecond precision and no time zone as
    ///   the default target type.
    pub fn new() -> Self {
        let fmt_arg = Coercion::new_exact(TypeSignatureClass::Native(logical_string()));
        let ts_arg = Coercion::new_implicit(
            TypeSignatureClass::Timestamp,
            vec![TypeSignatureClass::Native(logical_string())],
            NativeType::Timestamp(TimeUnit::Microsecond, None),
        );
        let signature = Signature::coercible(vec![fmt_arg, ts_arg], Volatility::Immutable);

        Self { signature }
    }
}

impl ScalarUDFImpl for SparkDateTrunc {
fn as_any(&self) -> &dyn Any {
self
}

fn name(&self) -> &str {
"date_trunc"
}

fn signature(&self) -> &Signature {
&self.signature
}

fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
internal_err!("return_field_from_args should be used instead")
}

fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
let nullable = args.arg_fields.iter().any(|f| f.is_nullable());

Ok(Arc::new(Field::new(
self.name(),
args.arg_fields[1].data_type().clone(),
nullable,
)))
}

fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
internal_err!(
"spark date_trunc should have been simplified to standard date_trunc"
)
}

fn simplify(
&self,
args: Vec<Expr>,
_info: &SimplifyContext,
) -> Result<ExprSimplifyResult> {
let [fmt_expr, ts_expr] = take_function_args(self.name(), args)?;

let fmt = match fmt_expr.as_literal() {
Some(ScalarValue::Utf8(Some(v)))
| Some(ScalarValue::Utf8View(Some(v)))
| Some(ScalarValue::LargeUtf8(Some(v))) => v.to_lowercase(),
_ => {
return internal_err!(
Comment thread
cht42 marked this conversation as resolved.
Outdated
"First argument of `DATE_TRUNC` must be non-null scalar Utf8"
);
}
};

// Map Spark-specific fmt aliases to datafusion ones
let fmt = match fmt.as_str() {
"yy" | "yyyy" => "year",
"mm" | "mon" => "month",
"dd" => "day",
Comment thread
andygrove marked this conversation as resolved.
other => other,
};

let fmt_expr = Expr::Literal(ScalarValue::new_utf8(fmt), None);

Ok(ExprSimplifyResult::Simplified(Expr::ScalarFunction(
ScalarFunction::new_udf(
datafusion_functions::datetime::date_trunc(),
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just concerned about if matching return field nullability here is something we should watch for?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep DF's date_trunc returm field will be nullable.. if #19511 goes through it should fix this issue

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see #19511 landing anytime soon so we might need to fix this in the DF date_trunc to ensure consistency here

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

vec![fmt_expr, ts_expr],
),
)))
}
}
24 changes: 24 additions & 0 deletions datafusion/spark/src/function/datetime/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,14 @@

pub mod date_add;
pub mod date_sub;
pub mod date_trunc;
pub mod extract;
pub mod last_day;
pub mod make_dt_interval;
pub mod make_interval;
pub mod next_day;
pub mod time_trunc;
pub mod trunc;

use datafusion_expr::ScalarUDF;
use datafusion_functions::make_udf_function;
Expand All @@ -36,6 +39,9 @@ make_udf_function!(last_day::SparkLastDay, last_day);
make_udf_function!(make_dt_interval::SparkMakeDtInterval, make_dt_interval);
make_udf_function!(make_interval::SparkMakeInterval, make_interval);
make_udf_function!(next_day::SparkNextDay, next_day);
make_udf_function!(date_trunc::SparkDateTrunc, date_trunc);
make_udf_function!(time_trunc::SparkTimeTrunc, time_trunc);
make_udf_function!(trunc::SparkTrunc, trunc);

pub mod expr_fn {
use datafusion_functions::export_functions;
Expand Down Expand Up @@ -83,6 +89,21 @@ pub mod expr_fn {
"Returns the first date which is later than start_date and named as indicated. The function returns NULL if at least one of the input parameters is NULL.",
arg1 arg2
));
export_functions!((
date_trunc,
"Truncates a timestamp `ts` to the unit specified by the format `fmt`.",
fmt ts
));
export_functions!((
time_trunc,
"Truncates a time `t` to the unit specified by the format `fmt`.",
fmt t
));
export_functions!((
trunc,
"Truncates a date `dt` to the unit specified by the format `fmt`.",
dt fmt
));
}

pub fn functions() -> Vec<Arc<ScalarUDF>> {
Expand All @@ -96,5 +117,8 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
make_dt_interval(),
make_interval(),
next_day(),
date_trunc(),
time_trunc(),
trunc(),
]
}
122 changes: 122 additions & 0 deletions datafusion/spark/src/function/datetime/time_trunc.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use std::any::Any;
use std::sync::Arc;

use arrow::datatypes::{DataType, Field, FieldRef};
use datafusion_common::types::logical_string;
use datafusion_common::{Result, ScalarValue, internal_err};
use datafusion_expr::expr::ScalarFunction;
use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
use datafusion_expr::{
Coercion, ColumnarValue, Expr, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl,
Signature, TypeSignatureClass, Volatility,
};

/// Spark-compatible `time_trunc` scalar function, which only handles time inputs.
///
/// Supported units are `hour`, `minute`, `second`, `millisecond` and
/// `microsecond`. This UDF has no execution logic of its own: its `simplify`
/// implementation validates the unit and rewrites the call to DataFusion's
/// `date_trunc`.
///
/// <https://spark.apache.org/docs/latest/api/sql/index.html#time_trunc>
#[derive(Debug, PartialEq, Eq, Hash)]
pub struct SparkTimeTrunc {
    /// Accepted argument types, in order: format string, then time.
    signature: Signature,
}

impl Default for SparkTimeTrunc {
fn default() -> Self {
Self::new()
}
}

impl SparkTimeTrunc {
    /// Creates the UDF with a coercible two-argument signature: a string
    /// format followed by a time value.
    pub fn new() -> Self {
        let fmt_arg = Coercion::new_exact(TypeSignatureClass::Native(logical_string()));
        let time_arg = Coercion::new_exact(TypeSignatureClass::Time);
        let signature = Signature::coercible(vec![fmt_arg, time_arg], Volatility::Immutable);

        Self { signature }
    }
}

impl ScalarUDFImpl for SparkTimeTrunc {
fn as_any(&self) -> &dyn Any {
self
}

fn name(&self) -> &str {
"time_trunc"
}

fn signature(&self) -> &Signature {
&self.signature
}

fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
internal_err!("return_field_from_args should be used instead")
}

fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> {
let nullable = args.arg_fields.iter().any(|f| f.is_nullable());

Ok(Arc::new(Field::new(
self.name(),
args.arg_fields[1].data_type().clone(),
nullable,
)))
}

fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> {
internal_err!(
"spark time_trunc should have been simplified to standard date_trunc"
)
}

fn simplify(
&self,
args: Vec<Expr>,
_info: &SimplifyContext,
) -> Result<ExprSimplifyResult> {
let fmt_expr = &args[0];
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
let fmt_expr = &args[0];
let [fmt_expr, time_expr] = take_function_args(self.name(), args)?;


let fmt = match fmt_expr.as_literal() {
Some(ScalarValue::Utf8(Some(v)))
| Some(ScalarValue::Utf8View(Some(v)))
| Some(ScalarValue::LargeUtf8(Some(v))) => v.to_lowercase(),
_ => {
return internal_err!(
"First argument of `TIME_TRUNC` must be non-null scalar Utf8"
);
}
};

if !matches!(
fmt.as_str(),
"hour" | "minute" | "second" | "millisecond" | "microsecond"
) {
return internal_err!(
Comment thread
cht42 marked this conversation as resolved.
Outdated
"The format argument of `TIME_TRUNC` must be one of: hour, minute, second, millisecond, microsecond"
);
}

Ok(ExprSimplifyResult::Simplified(Expr::ScalarFunction(
ScalarFunction::new_udf(datafusion_functions::datetime::date_trunc(), args),
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fmt is normalized (lowercased) above and validated but here you pass the original args (non-normalized).
Maybe it will be better to pass the fmt:

let fmt_expr = Expr::Literal(ScalarValue::new_utf8(fmt.as_str()), None);
...
ScalarFunction::new_udf(
    datafusion_functions::datetime::date_trunc(),
    vec![fmt_expr, time_expr],
),

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shouldn't matter, DF will handle the original argument as well and lowercase it

)))
}
}
Loading
Loading