Skip to content

Commit 850b40c

Browse files
committed
reuse char buffers
1 parent 4ba442a commit 850b40c

File tree

2 files changed

+33
-8
lines changed

2 files changed

+33
-8
lines changed

datafusion/common/src/utils/mod.rs

Lines changed: 22 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -794,27 +794,42 @@ pub mod datafusion_strsim {
794794
pub fn levenshtein_with_threshold(a: &str, b: &str, threshold: i32) -> i32 {
795795
let mut p = Vec::new();
796796
let mut d = Vec::new();
797-
levenshtein_with_threshold_and_buffer(a, b, threshold, &mut p, &mut d)
797+
let mut a_buf = Vec::new();
798+
let mut b_buf = Vec::new();
799+
levenshtein_with_threshold_and_buffer(
800+
a,
801+
b,
802+
threshold,
803+
&mut p,
804+
&mut d,
805+
&mut a_buf,
806+
&mut b_buf,
807+
)
798808
}
799809

800810
/// Calculates the Levenshtein distance with a threshold using reusable buffers.
801811
/// See [`levenshtein_with_threshold`] for details.
802812
///
803-
/// The `p` and `d` buffers will be resized as needed and reused across calls.
813+
/// The `p`, `d`, `a_buf`, and `b_buf` buffers will be resized as needed
814+
/// and reused across calls.
804815
pub fn levenshtein_with_threshold_and_buffer(
805816
a: &str,
806817
b: &str,
807818
threshold: i32,
808819
p: &mut Vec<i32>,
809820
d: &mut Vec<i32>,
821+
a_buf: &mut Vec<char>,
822+
b_buf: &mut Vec<char>,
810823
) -> i32 {
811-
let a_chars: Vec<char> = a.chars().collect();
812-
let b_chars: Vec<char> = b.chars().collect();
824+
a_buf.clear();
825+
a_buf.extend(a.chars());
826+
b_buf.clear();
827+
b_buf.extend(b.chars());
813828

814-
let (s, t, n, m) = if a_chars.len() <= b_chars.len() {
815-
(&a_chars, &b_chars, a_chars.len(), b_chars.len())
829+
let (s, t, n, m) = if a_buf.len() <= b_buf.len() {
830+
(a_buf.as_slice(), b_buf.as_slice(), a_buf.len(), b_buf.len())
816831
} else {
817-
(&b_chars, &a_chars, b_chars.len(), a_chars.len())
832+
(b_buf.as_slice(), a_buf.as_slice(), b_buf.len(), a_buf.len())
818833
};
819834
// n <= m is guaranteed
820835

datafusion/functions/src/string/levenshtein.rs

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -187,6 +187,8 @@ fn levenshtein<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
187187
let mut cache = Vec::new();
188188
let mut p_buf = Vec::new();
189189
let mut d_buf = Vec::new();
190+
let mut a_buf = Vec::new();
191+
let mut b_buf = Vec::new();
190192

191193
match coercion_data_type {
192194
DataType::Utf8View => {
@@ -208,6 +210,8 @@ fn levenshtein<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
208210
&mut cache,
209211
&mut p_buf,
210212
&mut d_buf,
213+
&mut a_buf,
214+
&mut b_buf,
211215
)
212216
})
213217
.collect();
@@ -232,6 +236,8 @@ fn levenshtein<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
232236
&mut cache,
233237
&mut p_buf,
234238
&mut d_buf,
239+
&mut a_buf,
240+
&mut b_buf,
235241
)
236242
})
237243
.collect();
@@ -256,6 +262,8 @@ fn levenshtein<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
256262
&mut cache,
257263
&mut p_buf,
258264
&mut d_buf,
265+
&mut a_buf,
266+
&mut b_buf,
259267
)
260268
.map(|d| d as i64)
261269
})
@@ -280,11 +288,13 @@ fn compute_distance(
280288
cache: &mut Vec<usize>,
281289
p_buf: &mut Vec<i32>,
282290
d_buf: &mut Vec<i32>,
291+
a_buf: &mut Vec<char>,
292+
b_buf: &mut Vec<char>,
283293
) -> Option<i32> {
284294
match (s1, s2) {
285295
(Some(s1), Some(s2)) => match threshold {
286296
Some(t) => Some(datafusion_strsim::levenshtein_with_threshold_and_buffer(
287-
s1, s2, t, p_buf, d_buf,
297+
s1, s2, t, p_buf, d_buf, a_buf, b_buf,
288298
)),
289299
None => {
290300
Some(datafusion_strsim::levenshtein_with_buffer(s1, s2, cache) as i32)

0 commit comments

Comments
 (0)