@@ -38,8 +38,9 @@ use vortex::session::VortexSession;
3838use vortex_cuda:: CudaDeviceBuffer ;
3939use vortex_cuda:: CudaExecutionCtx ;
4040use vortex_cuda:: CudaSession ;
41- use vortex_cuda:: dynamic_dispatch;
42- use vortex_cuda:: dynamic_dispatch:: DynamicDispatchPlan ;
41+ use vortex_cuda:: dynamic_dispatch:: CudaDispatchPlan ;
42+ use vortex_cuda:: dynamic_dispatch:: MaterializedPlan ;
43+ use vortex_cuda:: dynamic_dispatch:: UnmaterializedPlan ;
4344use vortex_cuda_macros:: cuda_available;
4445use vortex_cuda_macros:: cuda_not_available;
4546
@@ -51,14 +52,14 @@ const BENCH_ARGS: &[(usize, &str)] = &[
5152
5253/// Launch the dynamic_dispatch kernel and return GPU-timed duration.
5354///
54- /// This deliberately does not use `DynamicDispatchPlan ::execute` because the
55+ /// This deliberately does not use `CudaDispatchPlan ::execute` because the
5556/// benchmark pre-allocates the output buffer and device plan once, then reuses
5657/// them across iterations.
5758fn run_timed (
5859 cuda_ctx : & mut CudaExecutionCtx ,
5960 array_len : usize ,
6061 output_buf : & CudaDeviceBuffer ,
61- device_plan : & Arc < cudarc:: driver:: CudaSlice < DynamicDispatchPlan > > ,
62+ device_plan : & Arc < cudarc:: driver:: CudaSlice < CudaDispatchPlan > > ,
6263 shared_mem_bytes : u32 ,
6364) -> VortexResult < Duration > {
6465 let cuda_function = cuda_ctx. load_function ( "dynamic_dispatch" , & [ PType :: U32 ] ) ?;
@@ -111,40 +112,43 @@ fn run_timed(
111112
112113/// Benchmark runner: builds a dynamic plan and launches the kernel.
113114struct BenchRunner {
114- _plan : DynamicDispatchPlan ,
115+ _plan : CudaDispatchPlan ,
115116 smem_bytes : u32 ,
116117 len : usize ,
117118 // Keep alive
118- device_plan : Arc < cudarc:: driver:: CudaSlice < DynamicDispatchPlan > > ,
119+ device_plan : Arc < cudarc:: driver:: CudaSlice < CudaDispatchPlan > > ,
119120 output_buf : CudaDeviceBuffer ,
120121 _plan_buffers : Vec < vortex:: array:: buffer:: BufferHandle > ,
121122}

impl BenchRunner {
    /// Build a runner for `array`: materialize the dispatch plan, upload it to
    /// the device once, and pre-allocate the output buffer so that timed
    /// iterations do no allocation or host-to-device plan copies.
    ///
    /// Panics (via `vortex_expect`/`expect`) on plan-construction or CUDA
    /// allocation failure — acceptable in a benchmark harness.
    fn new(array: &vortex::array::ArrayRef, len: usize, cuda_ctx: &CudaExecutionCtx) -> Self {
        // Build the host-side plan, then materialize it onto the device.
        let MaterializedPlan {
            dispatch_plan,
            device_buffers,
            shared_mem_bytes,
        } = UnmaterializedPlan::new(array)
            .and_then(|p| p.materialize(cuda_ctx))
            .vortex_expect("build_dyn_dispatch_plan");

        // Upload the plan struct itself once; every timed launch reuses it.
        let device_plan = Arc::new(
            cuda_ctx
                .stream()
                .clone_htod(std::slice::from_ref(&dispatch_plan))
                .expect("htod plan"),
        );

        Self {
            _plan: dispatch_plan,
            smem_bytes: shared_mem_bytes,
            len,
            device_plan,
            // Round the allocation up to a multiple of 1024 elements so every
            // launched block has a full output slot — TODO confirm this matches
            // the kernel's block size.
            output_buf: CudaDeviceBuffer::new(
                cuda_ctx
                    .device_alloc::<u32>(len.next_multiple_of(1024))
                    .expect("alloc output"),
            ),
            _plan_buffers: device_buffers,
        }
    }
150154
0 commit comments