-
Notifications
You must be signed in to change notification settings - Fork 229
Expand file tree
/
Copy pathlib.rs
More file actions
37 lines (31 loc) · 1.01 KB
/
lib.rs
File metadata and controls
37 lines (31 loc) · 1.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
use cuda_std::{kernel, shared, thread};
/// Computes per-block partial sums of the dot product of `x` and `y`.
///
/// Each thread accumulates a private sum over a grid-stride range of indices.
/// The threads of a block then combine their sums with a tree reduction in
/// dynamic shared memory, and thread 0 of every block stores the block total in
/// `out[block_idx_x]`. The caller still has to add up the `grid_dim_x` partial
/// results to obtain the final dot product.
///
/// The reduction is correct for any block size, not only powers of two.
///
/// # Safety
///
/// * Only the x components of the thread, block and grid indices are read, so
///   the kernel must be launched with a one-dimensional grid of one-dimensional
///   blocks.
/// * The launch must provide dynamic shared memory for at least `block_dim_x`
///   `f32` values.
/// * `out` must be valid for writes of at least `grid_dim_x` `f32` values.
/// * `y` must be at least as long as `x`; every index below `x.len()` is also
///   used to index `y`.
#[kernel]
#[allow(improper_ctypes_definitions, clippy::missing_safety_doc)]
pub unsafe fn sdot(x: &[f32], y: &[f32], out: *mut f32) {
    let shared_sum = shared::dynamic_shared_mem::<f32>();

    let block_dim = thread::block_dim_x() as usize;
    let block_idx = thread::block_idx_x() as usize;
    let tid = thread::thread_idx_x() as usize;
    let num_threads = (thread::grid_dim_x() as usize) * block_dim;
    let start_ind = block_dim * block_idx;

    // Grid-stride accumulation: this thread handles indices
    // start_ind + tid, start_ind + tid + num_threads, ...
    let mut sum = 0f32;
    for i in ((start_ind + tid)..x.len()).step_by(num_threads) {
        sum += x[i] * y[i];
    }

    // SAFETY: the caller guarantees shared memory for `block_dim` floats and
    // `tid < block_dim`; each thread writes only its own slot.
    unsafe {
        *shared_sum.add(tid) = sum;
    }

    // Tree reduction over the first `active` slots. Rounding the half up keeps
    // the middle element of an odd-sized range in place instead of dropping it,
    // which is what makes non-power-of-two block sizes work. For power-of-two
    // sizes the pairing is identical to plain halving.
    let mut active = block_dim;
    while active > 1 {
        let half = (active + 1) >> 1;
        // Every thread of the block reaches this barrier on every iteration,
        // because the loop bound depends only on the block size.
        thread::sync_threads();
        if tid + half < active {
            // SAFETY: `tid + half < active <= block_dim`, so both slots are in
            // range. Slots written in this step (`< active - half`) and slots
            // read from other threads (`>= half`) do not overlap.
            unsafe {
                *shared_sum.add(tid) += *shared_sum.add(tid + half);
            }
        }
        active = half;
    }

    if tid == 0 {
        // SAFETY: the caller guarantees `out` has room for one float per block.
        // Slot 0 was last written by this same thread, so no extra barrier is
        // needed before reading it.
        unsafe {
            *out.add(block_idx) = *shared_sum.add(0);
        }
    }
}