From c9e1e7961e7e3f9d9ae00e61048440f46a360a32 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 6 Mar 2026 11:45:18 -0500 Subject: [PATCH 1/5] gpu: transpose patches 4x faster Eliminate all of the wasteful allocation, replace it with a 3-pass algorithm that inserts indices/values directly into their final positions. Signed-off-by: Andrew Duffy --- vortex-cuda/src/kernel/patches/types.rs | 93 ++++++++++++------------- 1 file changed, 44 insertions(+), 49 deletions(-) diff --git a/vortex-cuda/src/kernel/patches/types.rs b/vortex-cuda/src/kernel/patches/types.rs index 2e6d429bb23..80c38c34cd8 100644 --- a/vortex-cuda/src/kernel/patches/types.rs +++ b/vortex-cuda/src/kernel/patches/types.rs @@ -7,7 +7,6 @@ use vortex::buffer::Buffer; use vortex::buffer::BufferMut; -use vortex::buffer::buffer_mut; use vortex_array::Canonical; use vortex_array::buffer::BufferHandle; use vortex_array::dtype::IntegerPType; @@ -40,19 +39,6 @@ const fn patch_lanes() -> usize { if size_of::() < 8 { 32 } else { 16 } } -#[derive(Clone)] -struct Chunk { - lanes: Vec>, -} - -impl Default for Chunk { - fn default() -> Self { - Self { - lanes: vec![Lane::::default(); patch_lanes::()], - } - } -} - /// A set of patches of values `V` existing in host buffers. #[allow(dead_code)] pub struct HostPatches { @@ -122,23 +108,6 @@ impl HostPatches { } } -#[derive(Debug, Default, Clone)] -struct Lane { - indices: Vec, - values: Vec, -} - -impl Lane { - fn push(&mut self, index: u16, value: V) { - self.indices.push(index); - self.values.push(value); - } - - fn len(&self) -> usize { - self.indices.len() - } -} - /// Transpose a set of patches from the default sorted layout into the data parallel layout. #[allow(clippy::cognitive_complexity)] pub async fn transpose_patches( @@ -180,8 +149,8 @@ pub async fn transpose_patches( #[allow(clippy::cast_possible_truncation)] fn transpose( - indices: &[I], - values: &[V], + indices_in: &[I], + values_in: &[V], offset: usize, array_len: usize, ) -> HostPatches { @@ -193,30 +162,56 @@ fn transpose( ); let n_lanes = patch_lanes::(); - let mut chunks: Vec> = vec![Chunk::default(); n_chunks]; - // For each chunk, for each lane, push new values - for (index, &value) in std::iter::zip(indices, values) { + // We know upfront how many indices and values we'll have. + let mut indices_buffer = BufferMut::with_capacity(indices_in.len()); + let mut values_buffer = BufferMut::with_capacity(values_in.len()); + + // number of patches in each chunk. + let mut lane_offsets: BufferMut = BufferMut::zeroed(n_chunks * n_lanes + 1); + + // Scan the index/values once to get chunk/lane counts + for index in indices_in { let index = index.as_() - offset; + let chunk = index / 1024; + let lane = index % n_lanes; + + lane_offsets[chunk * n_lanes + lane + 1] += 1; + } + // Prefix-sum sizes -> offsets + for index in 1..lane_offsets.len() { + lane_offsets[index] += lane_offsets[index - 1]; + } + + // Loop over patches, writing them to final positions + let indices_out = indices_buffer.spare_capacity_mut(); + let values_out = values_buffer.spare_capacity_mut(); + for (index, &value) in std::iter::zip(indices_in, values_in) { + let index = index.as_() - offset; let chunk = index / 1024; let lane = index % n_lanes; - chunks[chunk].lanes[lane].push((index % 1024) as u16, value); + let position = &mut lane_offsets[chunk * n_lanes + lane]; + indices_out[*position as usize].write((index % 1024) as u16); + values_out[*position as usize].write(value); + *position += 1; } - // Reshuffle the different containers into a single contiguous buffer each for indices/values - let mut lane_offset = 0; - let mut lane_offsets = buffer_mut![0u32]; - let mut indices_buffer = BufferMut::empty(); - let mut values_buffer = BufferMut::empty(); - for chunk in chunks { - for lane in chunk.lanes { - indices_buffer.extend_from_slice(&lane.indices); - values_buffer.extend_from_slice(&lane.values); - lane_offset += lane.len() as u32; - lane_offsets.push(lane_offset); - } + // SAFETY: we know there are exactly indices_in.len() indices/values, and we just + // set them to the appropriate values in the loop above. + unsafe { + indices_buffer.set_len(indices_in.len()); + values_buffer.set_len(values_in.len()); + } + + // Now, pass over all the indices and values again and subtract out the position increments. + for index in indices_in { + let index = index.as_() - offset; + let chunk = index / 1024; + let lane = index % n_lanes; + + lane_offsets[chunk * n_lanes + lane] -= 1; } HostPatches { From 84be73d1c485e963dd182017044f3a90c12ebca8 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 9 Mar 2026 10:18:56 -0400 Subject: [PATCH 2/5] add 4096 test cases Signed-off-by: Andrew Duffy --- vortex-cuda/src/kernel/patches/types.rs | 73 +++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/vortex-cuda/src/kernel/patches/types.rs b/vortex-cuda/src/kernel/patches/types.rs index 80c38c34cd8..eb40cd190e0 100644 --- a/vortex-cuda/src/kernel/patches/types.rs +++ b/vortex-cuda/src/kernel/patches/types.rs @@ -82,6 +82,20 @@ impl HostPatches { } } + /// Apply the patches on top of the other buffer. + #[cfg(test)] + fn apply(&self, output: &mut BufferMut) { + for chunk in 0..self.n_chunks { + for lane in 0..self.n_lanes { + let patches = self.patches(chunk, lane); + for (&index, &value) in std::iter::zip(patches.indices, patches.values) { + let full_index = chunk * 1024 + (index as usize); + output[full_index] = value; + } + } + } + } + /// Export the patches for use on the device associated with the provided execution context. pub async fn export_to_device( mut self, @@ -227,6 +241,15 @@ fn transpose( mod tests { use vortex::buffer::BufferMut; use vortex::buffer::buffer; + use vortex::buffer::buffer_mut; + use vortex_array::ExecutionCtx; + use vortex_array::IntoArray; + use vortex_array::LEGACY_SESSION; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::assert_arrays_eq; + use vortex_array::dtype::NativePType; + use vortex_array::patches::Patches; + use vortex_error::VortexResult; use crate::kernel::patches::types::transpose; @@ -280,4 +303,54 @@ mod tests { assert_eq!(transposed.patches(3, 4).values, &[80]); assert_eq!(transposed.patches(3, 4).indices, &[4]); } + + #[test] + #[allow(clippy::cast_possible_truncation)] + fn test_transpose_complex() -> VortexResult<()> { + test_case(1024, 0, &[0], &[0f32])?; + test_case(512, 512, &[512, 513, 514], &[10i8, 20, 30])?; + test_case(10_000, 100, &[500, 1_000, 1_001, 1_002], &[1i16, 2, 3, 4])?; + + // Try every size from 0..4096 and a range of indices. + for len in 1..4096 { + let offset = len / 2; + + let indices: Vec = (offset..len).map(|x| x as u32).collect(); + + test_case(len, offset, &indices, &indices)?; + } + + Ok(()) + } + + fn test_case( + len: usize, + offset: usize, + patch_indices: &[u32], + patch_values: &[V], + ) -> VortexResult<()> { + let mut data = buffer_mut![V::default(); len]; + let array = PrimitiveArray::from_iter(data.iter().copied()); + + let patches = Patches::new( + len, + offset, + PrimitiveArray::from_iter(patch_indices.iter().copied()).into_array(), + PrimitiveArray::from_iter(patch_values.iter().copied()).into_array(), + None, + )?; + + // Verify that the outputs match between Patches and transpose_patches(). + let mut ctx = ExecutionCtx::new(LEGACY_SESSION.clone()); + let patched = array.patch(&patches, &mut ctx)?.into_array(); + + let transposed = transpose(patch_indices, patch_values, offset, len); + transposed.apply(&mut data); + + let patched_transposed = data.freeze().into_array(); + + assert_arrays_eq!(patched, patched_transposed); + + Ok(()) + } } From 29a93a507e620739da87d616929f8a9f8e7afc11 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 9 Mar 2026 10:26:55 -0400 Subject: [PATCH 3/5] step by 20 to avoid wasting time Signed-off-by: Andrew Duffy --- vortex-cuda/src/kernel/patches/types.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vortex-cuda/src/kernel/patches/types.rs b/vortex-cuda/src/kernel/patches/types.rs index eb40cd190e0..cd03f94526c 100644 --- a/vortex-cuda/src/kernel/patches/types.rs +++ b/vortex-cuda/src/kernel/patches/types.rs @@ -312,7 +312,7 @@ mod tests { test_case(10_000, 100, &[500, 1_000, 1_001, 1_002], &[1i16, 2, 3, 4])?; // Try every size from 0..4096 and a range of indices. - for len in 1..4096 { + for len in (1..4096).step_by(20) { let offset = len / 2; let indices: Vec = (offset..len).map(|x| x as u32).collect(); From 0dfd42f26a9bf5d38495dd07b14bb9505dd2091e Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 9 Mar 2026 10:27:48 -0400 Subject: [PATCH 4/5] add step-by amount Signed-off-by: Andrew Duffy --- vortex-cuda/src/kernel/patches/types.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vortex-cuda/src/kernel/patches/types.rs b/vortex-cuda/src/kernel/patches/types.rs index cd03f94526c..c57d7a37351 100644 --- a/vortex-cuda/src/kernel/patches/types.rs +++ b/vortex-cuda/src/kernel/patches/types.rs @@ -311,8 +311,7 @@ mod tests { test_case(512, 512, &[512, 513, 514], &[10i8, 20, 30])?; test_case(10_000, 100, &[500, 1_000, 1_001, 1_002], &[1i16, 2, 3, 4])?; - // Try every size from 0..4096 and a range of indices. - for len in (1..4096).step_by(20) { + for len in (1..4096).step_by(5) { let offset = len / 2; let indices: Vec = (offset..len).map(|x| x as u32).collect(); From 32ee6e3ecb902f907ca77d330dd7adb91e3aa892 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 9 Mar 2026 10:28:45 -0400 Subject: [PATCH 5/5] add step by amount Signed-off-by: Andrew Duffy --- vortex-cuda/src/kernel/patches/types.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vortex-cuda/src/kernel/patches/types.rs b/vortex-cuda/src/kernel/patches/types.rs index c57d7a37351..b33f5e09c93 100644 --- a/vortex-cuda/src/kernel/patches/types.rs +++ b/vortex-cuda/src/kernel/patches/types.rs @@ -311,7 +311,7 @@ mod tests { test_case(512, 512, &[512, 513, 514], &[10i8, 20, 30])?; test_case(10_000, 100, &[500, 1_000, 1_001, 1_002], &[1i16, 2, 3, 4])?; - for len in (1..4096).step_by(5) { + for len in (1..4096).step_by(10) { let offset = len / 2; let indices: Vec = (offset..len).map(|x| x as u32).collect();