Skip to content

Commit 7aa556a

Browse files
authored
chore[gpu]: split plan into unmaterialized / materialized (#7175)
This allows for checking whether the shared memory usage blows past the max GPU device shared memory before launching the dyn dispatch kernel, as well as having a separate materialize step we can fine-tune that moves buffers from the host to the GPU.

Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent 9855100 commit 7aa556a

4 files changed

Lines changed: 452 additions & 442 deletions

File tree

vortex-cuda/benches/dynamic_dispatch_cuda.rs

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,9 @@ use vortex::session::VortexSession;
3838
use vortex_cuda::CudaDeviceBuffer;
3939
use vortex_cuda::CudaExecutionCtx;
4040
use vortex_cuda::CudaSession;
41-
use vortex_cuda::dynamic_dispatch;
42-
use vortex_cuda::dynamic_dispatch::DynamicDispatchPlan;
41+
use vortex_cuda::dynamic_dispatch::CudaDispatchPlan;
42+
use vortex_cuda::dynamic_dispatch::MaterializedPlan;
43+
use vortex_cuda::dynamic_dispatch::UnmaterializedPlan;
4344
use vortex_cuda_macros::cuda_available;
4445
use vortex_cuda_macros::cuda_not_available;
4546

@@ -51,14 +52,14 @@ const BENCH_ARGS: &[(usize, &str)] = &[
5152

5253
/// Launch the dynamic_dispatch kernel and return GPU-timed duration.
5354
///
54-
/// This deliberately does not use `DynamicDispatchPlan::execute` because the
55+
/// This deliberately does not use `CudaDispatchPlan::execute` because the
5556
/// benchmark pre-allocates the output buffer and device plan once, then reuses
5657
/// them across iterations.
5758
fn run_timed(
5859
cuda_ctx: &mut CudaExecutionCtx,
5960
array_len: usize,
6061
output_buf: &CudaDeviceBuffer,
61-
device_plan: &Arc<cudarc::driver::CudaSlice<DynamicDispatchPlan>>,
62+
device_plan: &Arc<cudarc::driver::CudaSlice<CudaDispatchPlan>>,
6263
shared_mem_bytes: u32,
6364
) -> VortexResult<Duration> {
6465
let cuda_function = cuda_ctx.load_function("dynamic_dispatch", &[PType::U32])?;
@@ -111,40 +112,43 @@ fn run_timed(
111112

112113
/// Benchmark runner: builds a dynamic plan and launches the kernel.
113114
struct BenchRunner {
114-
_plan: DynamicDispatchPlan,
115+
_plan: CudaDispatchPlan,
115116
smem_bytes: u32,
116117
len: usize,
117118
// Keep alive
118-
device_plan: Arc<cudarc::driver::CudaSlice<DynamicDispatchPlan>>,
119+
device_plan: Arc<cudarc::driver::CudaSlice<CudaDispatchPlan>>,
119120
output_buf: CudaDeviceBuffer,
120121
_plan_buffers: Vec<vortex::array::buffer::BufferHandle>,
121122
}
122123

123124
impl BenchRunner {
124125
fn new(array: &vortex::array::ArrayRef, len: usize, cuda_ctx: &CudaExecutionCtx) -> Self {
125-
let (plan, plan_buffers) =
126-
dynamic_dispatch::build_plan(array, cuda_ctx).vortex_expect("build_plan");
127-
let smem_bytes = plan.shared_mem_bytes::<u32>();
126+
let MaterializedPlan {
127+
dispatch_plan,
128+
device_buffers,
129+
shared_mem_bytes,
130+
} = UnmaterializedPlan::new(array)
131+
.and_then(|p| p.materialize(cuda_ctx))
132+
.vortex_expect("build_dyn_dispatch_plan");
128133

129134
let device_plan = Arc::new(
130135
cuda_ctx
131136
.stream()
132-
.clone_htod(std::slice::from_ref(&plan))
137+
.clone_htod(std::slice::from_ref(&dispatch_plan))
133138
.expect("htod plan"),
134139
);
135140

136-
let output_slice = cuda_ctx
137-
.device_alloc::<u32>(len.next_multiple_of(1024))
138-
.expect("alloc output");
139-
let output_buf = CudaDeviceBuffer::new(output_slice);
140-
141141
Self {
142-
_plan: plan,
143-
smem_bytes,
142+
_plan: dispatch_plan,
143+
smem_bytes: shared_mem_bytes,
144144
len,
145145
device_plan,
146-
output_buf,
147-
_plan_buffers: plan_buffers,
146+
output_buf: CudaDeviceBuffer::new(
147+
cuda_ctx
148+
.device_alloc::<u32>(len.next_multiple_of(1024))
149+
.expect("alloc output"),
150+
),
151+
_plan_buffers: device_buffers,
148152
}
149153
}
150154

0 commit comments

Comments (0)