diff --git a/datafusion/functions-nested/src/repeat.rs b/datafusion/functions-nested/src/repeat.rs index 825530923fa79..ab6fef5ccab36 100644 --- a/datafusion/functions-nested/src/repeat.rs +++ b/datafusion/functions-nested/src/repeat.rs @@ -31,7 +31,7 @@ use arrow::datatypes::{ }; use datafusion_common::cast::{as_int64_array, as_large_list_array, as_list_array}; use datafusion_common::types::{NativeType, logical_int64}; -use datafusion_common::{DataFusionError, Result}; +use datafusion_common::{Result, exec_datafusion_err}; use datafusion_expr::{ ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, @@ -179,7 +179,18 @@ fn general_repeat( array: &ArrayRef, count_array: &Int64Array, ) -> Result { - let (offsets, total_repeated_values) = build_repeat_offsets::(count_array)?; + let total_repeated_values = + (0..count_array.len()).try_fold(0usize, |total, idx| { + total + .checked_add(get_count_with_validity(count_array, idx)) + .ok_or_else(|| { + exec_datafusion_err!( + "array_repeat: total repeated values overflowed usize" + ) + }) + })?; + ensure_repeated_values_fit::(total_repeated_values)?; + let (offsets, _) = build_repeat_offsets::(count_array)?; let mut take_indices = Vec::with_capacity(total_repeated_values); @@ -230,13 +241,13 @@ fn general_list_repeat( - list_offsets[i].to_usize().unwrap(); inner_total = checked_repeat_len_add(inner_total, checked_repeat_len_mul(len, count)?)?; - ensure_array_repeat_output_len::(inner_total)?; + ensure_repeated_values_fit::(inner_total)?; } } // Build inner structures - ensure_vec_capacity::(checked_repeat_len_add(outer_total, 1)?)?; - let mut inner_offsets = Vec::with_capacity(outer_total + 1); + let inner_offsets_capacity = checked_offset_slots_capacity::(outer_total)?; + let mut inner_offsets = Vec::with_capacity(inner_offsets_capacity); let mut take_indices = Vec::with_capacity(inner_total); let mut inner_nulls = BooleanBufferBuilder::new(outer_total); let mut inner_running = 0usize; @@ -251,12 +262,8 @@ fn general_list_repeat( for _ in 0..count { inner_running = checked_repeat_len_add(inner_running, row_len)?; - ensure_array_repeat_output_len::(inner_running)?; - let offset = O::from_usize(inner_running).ok_or_else(|| { - DataFusionError::Execution(format!( - "array_repeat: offset {inner_running} exceeds the maximum value for offset type" - )) - })?; + ensure_repeated_values_fit::(inner_running)?; + let offset = checked_repeat_offset::(inner_running)?; inner_offsets.push(offset); inner_nulls.append(list_is_valid); if list_is_valid { @@ -293,19 +300,16 @@ fn general_list_repeat( fn build_repeat_offsets( count_array: &Int64Array, ) -> Result<(Vec, usize)> { - let mut offsets = Vec::with_capacity(count_array.len() + 1); + let offsets_capacity = checked_offset_slots_capacity::(count_array.len())?; + let mut offsets = Vec::with_capacity(offsets_capacity); offsets.push(O::zero()); let mut running_offset = 0usize; for idx in 0..count_array.len() { let count = get_count_with_validity(count_array, idx); running_offset = checked_repeat_len_add(running_offset, count)?; - ensure_array_repeat_output_len::(running_offset)?; - let offset = O::from_usize(running_offset).ok_or_else(|| { - DataFusionError::Execution(format!( - "array_repeat: offset {running_offset} exceeds the maximum value for offset type" - )) - })?; + ensure_repeated_values_fit::(running_offset)?; + let offset = checked_repeat_offset::(running_offset)?; offsets.push(offset); } @@ -313,47 +317,43 @@ fn build_repeat_offsets( } fn checked_repeat_len_add(lhs: usize, rhs: usize) -> Result { - lhs.checked_add(rhs).ok_or_else(|| { - DataFusionError::Execution(ARRAY_REPEAT_LENGTH_EXCEEDED.to_string()) - }) + lhs.checked_add(rhs) + .ok_or_else(|| exec_datafusion_err!("{}", ARRAY_REPEAT_LENGTH_EXCEEDED)) } fn checked_repeat_len_mul(lhs: usize, rhs: usize) -> Result { - lhs.checked_mul(rhs).ok_or_else(|| { - DataFusionError::Execution(ARRAY_REPEAT_LENGTH_EXCEEDED.to_string()) - }) + lhs.checked_mul(rhs) + .ok_or_else(|| exec_datafusion_err!("{}", ARRAY_REPEAT_LENGTH_EXCEEDED)) } -fn ensure_array_repeat_output_len(len: usize) -> Result<()> { - if len > max_array_repeat_output_len::() { - return Err(DataFusionError::Execution( - ARRAY_REPEAT_LENGTH_EXCEEDED.to_string(), - )); - } +fn ensure_repeated_values_fit(len: usize) -> Result<()> { + ensure_vec_capacity::(len)?; + checked_repeat_offset::(len)?; Ok(()) } fn ensure_vec_capacity(len: usize) -> Result<()> { if len > max_vec_elements::() { - return Err(DataFusionError::Execution( - ARRAY_REPEAT_LENGTH_EXCEEDED.to_string(), - )); + return Err(exec_datafusion_err!("{}", ARRAY_REPEAT_LENGTH_EXCEEDED)); } Ok(()) } -fn max_array_repeat_output_len() -> usize { - max_offset_elements::().min(max_vec_elements::()) +fn checked_offset_slots_capacity(len: usize) -> Result { + let capacity = checked_repeat_len_add(len, 1)?; + ensure_vec_capacity::(capacity)?; + + Ok(capacity) } -fn max_offset_elements() -> usize { - if size_of::() == size_of::() { - i32::MAX as usize - } else { - i64::MAX as usize - } +fn checked_repeat_offset(offset: usize) -> Result { + O::from_usize(offset).ok_or_else(|| { + exec_datafusion_err!( + "array_repeat: offset {offset} exceeds the maximum value for offset type" + ) + }) } fn max_vec_elements() -> usize { @@ -387,9 +387,25 @@ mod tests { let count: ArrayRef = Arc::new(Int64Array::from(vec![i64::MAX])); let err = array_repeat_inner(&[element, count]).unwrap_err(); - assert_eq!( - err.to_string(), - "Execution error: array_repeat: requested length exceeds maximum array size" + assert!( + err.to_string().starts_with( + "Execution error: array_repeat: requested length exceeds maximum array size" + ), + "unexpected error: {err}" + ); + } + + #[test] + fn scalar_count_exceeding_list_offset_limit_returns_error() { + let element: ArrayRef = Arc::new(Int64Array::from(vec![1])); + let count: ArrayRef = Arc::new(Int64Array::from(vec![i32::MAX as i64 + 1])); + + let err = array_repeat_inner(&[element, count]).unwrap_err(); + assert!( + err.to_string().starts_with( + "Execution error: array_repeat: offset 2147483648 exceeds the maximum value for offset type" + ), + "unexpected error: {err}" ); } } diff --git a/datafusion/sqllogictest/test_files/array/array_repeat.slt b/datafusion/sqllogictest/test_files/array/array_repeat.slt index 9f17c449c88c2..5073c7d4c5822 100644 --- a/datafusion/sqllogictest/test_files/array/array_repeat.slt +++ b/datafusion/sqllogictest/test_files/array/array_repeat.slt @@ -79,6 +79,16 @@ Select ---- [] [] [] [] +# array_repeat returns an execution error on scalar output-size overflow +query error DataFusion error: Execution error: array_repeat: total repeated values overflowed usize +SELECT array_repeat(1, c) +FROM ( + VALUES + (9223372036854775807), + (9223372036854775807), + (9223372036854775807) +) AS t(c); + # array_repeat with columns #1 statement ok