Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions datafusion/functions-nested/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ datafusion-physical-expr-common = { workspace = true }
hashbrown = { workspace = true }
itertools = { workspace = true, features = ["use_std"] }
itoa = { workspace = true }
memchr = { workspace = true }
log = { workspace = true }

[dev-dependencies]
Expand Down Expand Up @@ -113,3 +114,7 @@ name = "array_position"
[[bench]]
harness = false
name = "array_sort"

[[bench]]
harness = false
name = "string_to_array"
245 changes: 245 additions & 0 deletions datafusion/functions-nested/benches/string_to_array.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow::array::{ArrayRef, StringArray};
use arrow::datatypes::{DataType, Field};
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
use datafusion_common::ScalarValue;
use datafusion_common::config::ConfigOptions;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
use datafusion_functions_nested::string::StringToArray;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use std::hint::black_box;
use std::sync::Arc;

const NUM_ROWS: usize = 1000;
const SEED: u64 = 42;

fn criterion_benchmark(c: &mut Criterion) {
// Single-char delimiter
let comma = ColumnarValue::Scalar(ScalarValue::Utf8(Some(",".to_string())));
bench_string_to_array(
c,
"string_to_array_single_char_delim",
create_csv_strings,
&comma,
None,
);

// Multi-char delimiter
let double_colon =
ColumnarValue::Scalar(ScalarValue::Utf8(Some("::".to_string())));
bench_string_to_array(
c,
"string_to_array_multi_char_delim",
create_multi_delim_strings,
&double_colon,
None,
);

// With null_str argument
let null_str = ColumnarValue::Scalar(ScalarValue::Utf8(Some("NULL".to_string())));
bench_string_to_array(
c,
"string_to_array_with_null_str",
create_csv_strings_with_nulls,
&comma,
Some(&null_str),
);

// NULL delimiter
let null_delim = ColumnarValue::Scalar(ScalarValue::Utf8(None));
bench_string_to_array(
c,
"string_to_array_null_delim",
create_short_strings,
&null_delim,
None,
);

// Columnar delimiter (fall-back path)
bench_string_to_array_columnar_delim(c);
}

fn bench_string_to_array_columnar_delim(c: &mut Criterion) {
let mut group = c.benchmark_group("string_to_array_columnar_delim");

for &num_elements in &[5, 20, 100] {
let string_array = create_csv_strings(num_elements);
let delimiter_array: ArrayRef =
Arc::new(StringArray::from(vec![Some(","); NUM_ROWS]));

let args = vec![
ColumnarValue::Array(string_array.clone()),
ColumnarValue::Array(delimiter_array),
];
let arg_fields = vec![
Field::new("str", DataType::Utf8, true).into(),
Field::new("delimiter", DataType::Utf8, false).into(),
];

let return_field = Field::new(
"result",
DataType::List(Arc::new(Field::new_list_field(DataType::Utf8, true))),
true,
);

group.bench_with_input(
BenchmarkId::from_parameter(num_elements),
&num_elements,
|b, _| {
let udf = StringToArray::new();
b.iter(|| {
black_box(
udf.invoke_with_args(ScalarFunctionArgs {
args: args.clone(),
arg_fields: arg_fields.clone(),
number_rows: NUM_ROWS,
return_field: return_field.clone().into(),
config_options: Arc::new(ConfigOptions::default()),
})
.unwrap(),
)
})
},
);
}

group.finish();
}

fn bench_string_to_array(
c: &mut Criterion,
group_name: &str,
make_strings: fn(usize) -> ArrayRef,
delimiter: &ColumnarValue,
null_str: Option<&ColumnarValue>,
) {
let mut group = c.benchmark_group(group_name);

for &num_elements in &[5, 20, 100] {
let string_array = make_strings(num_elements);

let mut args = vec![
ColumnarValue::Array(string_array.clone()),
delimiter.clone(),
];
let mut arg_fields = vec![
Field::new("str", DataType::Utf8, true).into(),
Field::new("delimiter", DataType::Utf8, true).into(),
];
if let Some(ns) = null_str {
args.push(ns.clone());
arg_fields.push(Field::new("null_str", DataType::Utf8, true).into());
}

let return_field = Field::new(
"result",
DataType::List(Arc::new(Field::new_list_field(DataType::Utf8, true))),
true,
);

group.bench_with_input(
BenchmarkId::from_parameter(num_elements),
&num_elements,
|b, _| {
let udf = StringToArray::new();
b.iter(|| {
black_box(
udf.invoke_with_args(ScalarFunctionArgs {
args: args.clone(),
arg_fields: arg_fields.clone(),
number_rows: NUM_ROWS,
return_field: return_field.clone().into(),
config_options: Arc::new(ConfigOptions::default()),
})
.unwrap(),
)
})
},
);
}

group.finish();
}

/// Creates strings like "val1,val2,val3,...,valN" with `num_elements` elements.
fn create_csv_strings(num_elements: usize) -> ArrayRef {
let mut rng = StdRng::seed_from_u64(SEED);
let strings: StringArray = (0..NUM_ROWS)
.map(|_| {
let parts: Vec<String> = (0..num_elements)
.map(|_| format!("val{}", rng.random_range(0..1000)))
.collect();
Some(parts.join(","))
})
.collect();
Arc::new(strings)
}

/// Creates strings like "val1::val2::val3::...::valN".
fn create_multi_delim_strings(num_elements: usize) -> ArrayRef {
let mut rng = StdRng::seed_from_u64(SEED);
let strings: StringArray = (0..NUM_ROWS)
.map(|_| {
let parts: Vec<String> = (0..num_elements)
.map(|_| format!("val{}", rng.random_range(0..1000)))
.collect();
Some(parts.join("::"))
})
.collect();
Arc::new(strings)
}

/// Creates CSV strings where ~10% of elements are the literal "NULL".
fn create_csv_strings_with_nulls(num_elements: usize) -> ArrayRef {
let mut rng = StdRng::seed_from_u64(SEED);
let strings: StringArray = (0..NUM_ROWS)
.map(|_| {
let parts: Vec<String> = (0..num_elements)
.map(|_| {
if rng.random::<f64>() < 0.1 {
"NULL".to_string()
} else {
format!("val{}", rng.random_range(0..1000))
}
})
.collect();
Some(parts.join(","))
})
.collect();
Arc::new(strings)
}

/// Creates short strings (length = `num_chars`) for the NULL-delimiter
/// (split-into-characters) benchmark.
fn create_short_strings(num_chars: usize) -> ArrayRef {
let mut rng = StdRng::seed_from_u64(SEED);
let strings: StringArray = (0..NUM_ROWS)
.map(|_| {
let s: String = (0..num_chars)
.map(|_| rng.random_range(b'a'..=b'z') as char)
.collect();
Some(s)
})
.collect();
Arc::new(strings)
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
Loading
Loading