Skip to content

Commit 0a77ca0

Browse files
committed
rdcycle_sync_diff & simx changes
1 parent 109bea8 commit 0a77ca0

4 files changed

Lines changed: 101 additions & 12 deletions

File tree

kernel/include/vx_intrinsics.h

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,94 @@ static inline uint64_t vx_rdcycle_sync() {
284284
return cycles;
285285
}
286286

287+
typedef struct {
288+
uint32_t hi;
289+
uint32_t lo;
290+
} __rdcycle_time;
291+
292+
static inline __attribute__((always_inline)) __rdcycle_time vx_rdcycle_sync_begin(void) {
293+
__rdcycle_time t;
294+
#if __riscv_xlen == 32
295+
__asm__ volatile (
296+
".insn r %2, 7, 0, x0, x0, x0\n\t"
297+
"csrr %0, %3\n\t"
298+
"csrr %1, %4\n\t"
299+
: "=r" (t.hi), "=r" (t.lo)
300+
: "i" (RISCV_CUSTOM0), "i" (VX_CSR_MCYCLE_H), "i" (VX_CSR_MCYCLE)
301+
: "memory"
302+
);
303+
#elif __riscv_xlen == 64
304+
uint64_t cycles;
305+
__asm__ volatile (
306+
".insn r %1, 7, 0, x0, x0, x0\n\t"
307+
"csrr %0, %2\n\t"
308+
: "=r" (cycles)
309+
: "i" (RISCV_CUSTOM0), "i" (VX_CSR_MCYCLE)
310+
: "memory"
311+
);
312+
t.hi = (uint32_t)(cycles >> 32);
313+
t.lo = (uint32_t)cycles;
314+
#else
315+
#error "Unsupported RISC-V XLEN"
316+
#endif
317+
return t;
318+
}
319+
320+
static inline __attribute__((always_inline)) __rdcycle_time vx_rdcycle_sync_end(void) {
321+
__rdcycle_time t;
322+
#if __riscv_xlen == 32
323+
__asm__ volatile (
324+
".insn r %2, 7, 0, x0, x0, x0\n\t"
325+
"csrr %0, %3\n\t"
326+
"csrr %1, %4\n\t"
327+
: "=r" (t.lo), "=r" (t.hi)
328+
: "i" (RISCV_CUSTOM0), "i" (VX_CSR_MCYCLE), "i" (VX_CSR_MCYCLE_H)
329+
: "memory"
330+
);
331+
#elif __riscv_xlen == 64
332+
uint64_t cycles;
333+
__asm__ volatile (
334+
".insn r %1, 7, 0, x0, x0, x0\n\t"
335+
"csrr %0, %2\n\t"
336+
: "=r" (cycles)
337+
: "i" (RISCV_CUSTOM0), "i" (VX_CSR_MCYCLE)
338+
: "memory"
339+
);
340+
t.hi = (uint32_t)(cycles >> 32);
341+
t.lo = (uint32_t)cycles;
342+
#else
343+
#error "Unsupported RISC-V XLEN"
344+
#endif
345+
return t;
346+
}
347+
348+
static inline __attribute__((always_inline)) uint64_t vx_rdcycle_sync_diff(__rdcycle_time start, __rdcycle_time end) {
349+
#if __riscv_xlen == 32
350+
uint32_t diff_hi = end.hi;
351+
uint32_t diff_lo = end.lo;
352+
uint32_t tmp = start.hi;
353+
uint32_t start_lo = start.lo;
354+
355+
__asm__ volatile (
356+
"sub %0, %0, %2\n\t"
357+
"sltu %2, %1, %3\n\t"
358+
"sub %1, %1, %3\n\t"
359+
"sub %0, %0, %2\n\t"
360+
: "+r" (diff_hi), "+r" (diff_lo), "+r" (tmp)
361+
: "r" (start_lo)
362+
: "memory"
363+
);
364+
365+
return ((uint64_t)diff_hi << 32) | diff_lo;
366+
#elif __riscv_xlen == 64
367+
uint64_t s = ((uint64_t)start.hi << 32) | start.lo;
368+
uint64_t e = ((uint64_t)end.hi << 32) | end.lo;
369+
return e - s;
370+
#else
371+
#error "Unsupported RISC-V XLEN"
372+
#endif
373+
}
374+
287375
// Memory fence
288376
inline void vx_fence() {
289377
__asm__ volatile ("fence iorw, iorw");

sim/simx/core.cpp

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -476,11 +476,6 @@ void Core::commit() {
476476

477477
// update scoreboard
478478
if (trace->eop) {
479-
bool release_wsync = false;
480-
if (auto wctl_type = std::get_if<WctlType>(&trace->op_type)) {
481-
release_wsync = (*wctl_type == WctlType::WSYNC);
482-
} //if the instruction is a wsync, set release_wsync to true. warp will continue to run after commits
483-
484479
if (trace->wb) {
485480
operands_.at(iw)->writeback(trace);
486481
scoreboard_.release(trace);
@@ -511,10 +506,6 @@ void Core::commit() {
511506
#endif
512507
// instruction completed
513508
pending_instrs_.remove(trace);
514-
515-
if (release_wsync) {
516-
this->resume(trace->wid); //allows the scheduler to select this warp to fetch new instructions
517-
}
518509
}
519510

520511
// delete the trace
@@ -536,11 +527,10 @@ bool Core::running() const {
536527
}
537528

538529
bool Core::warp_sync_ready(uint32_t wid, uint64_t uuid) const {
530+
(void)uuid;
539531
for (auto trace : pending_instrs_) {
540532
if (trace->wid != wid)
541533
continue;
542-
if (trace->uuid < uuid)
543-
return false;
544534
}
545535
return true;
546536
}

sim/simx/func_unit.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -459,7 +459,7 @@ void SfuUnit::tick() {
459459
if (trace->eop && !core_->warp_sync_ready(trace->wid, trace->uuid))
460460
continue; //skips the rest of the loop. Does not pop the input.
461461
output.send(trace, 2+delay);
462-
release_warp = false;
462+
release_warp = true;
463463
break;
464464
case WctlType::BAR: {
465465
output.send(trace, 2+delay);

tests/regression/sgemm_tcu/kernel.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "common.h"
22
#include <vx_spawn.h>
33
#include <vx_tensor.h>
4+
#include <vx_intrinsics.h>
45

56
namespace vt = vortex::tensor;
67
using ctx = vt::wmma_context<NUM_THREADS, vt::ITYPE, vt::OTYPE>;
@@ -9,6 +10,7 @@ void kernel_body(kernel_arg_t *__UNIFORM__ arg) {
910
auto pA = reinterpret_cast<ctx::input_t *>(arg->A_addr);
1011
auto pB = reinterpret_cast<ctx::input_t *>(arg->B_addr);
1112
auto pC = reinterpret_cast<ctx::output_t *>(arg->C_addr);
13+
auto pCycles = reinterpret_cast<uint64_t *>(arg->cycles_addr);
1214

1315
uint32_t M = arg->M;
1416
uint32_t N = arg->N;
@@ -25,6 +27,7 @@ void kernel_body(kernel_arg_t *__UNIFORM__ arg) {
2527
// Initialize accumulator tile to zero
2628
ctx::fill_fragment(fragC, 0);
2729

30+
uint64_t cycles = 0;
2831
for (int i = 0; i < K; i += ctx::tileK) {
2932
auto pTileA = pA + tile_row * K + i;
3033

@@ -42,7 +45,15 @@ void kernel_body(kernel_arg_t *__UNIFORM__ arg) {
4245
}
4346

4447
// Matrix multiply-accumulate: c += a * b
48+
__rdcycle_time t0 = vx_rdcycle_sync_begin();
4549
ctx::mma_sync(fragC, fragA, fragB, fragC);
50+
__rdcycle_time t1 = vx_rdcycle_sync_end();
51+
cycles += vx_rdcycle_sync_diff(t0, t1);
52+
}
53+
54+
if (0 == vx_thread_id()) {
55+
uint32_t block_id = blockIdx.y * arg->grid_dim[0] + blockIdx.x;
56+
pCycles[block_id] = cycles;
4657
}
4758

4859
// Store the computed C tile

0 commit comments

Comments
 (0)