// File: kernels/kernels/wu_arch_cases/case05_tensor_barrier/kernel.cpp
// (C++ source; the file listing reported 46 lines, 1.2 KiB)

#include "common_wu_min.h"
// Barrier identifier used by this case. tensor_worker ORs it with
// (VX_BARRIER_DOMAIN_TENSOR << VX_BARRIER_DOMAIN_SHIFT) to build the first
// operand of its custom funct3=4 instruction.
#define CASE05_BARRIER_ID 1u
// tensor_worker - routine passed to vx_spawn_tensor() by wu_main().
//
// Written as a naked function consisting of a single asm statement, so the
// compiler emits no prologue or epilogue. The sequence is:
//   1. read the CSR selected by VX_CSR_WARP_ID into x5,
//   2. issue a custom-0 instruction (funct3=4) with x1 = barrier id | domain
//      bits and x2 = NUM_TENSOR_WARPS,
//   3. store (WU_CASE_TENSOR_BASE | x5) into the 32-bit slot g_seen[x5],
//   4. issue a custom-0 instruction (funct3=0) with all-zero operands,
//   5. spin forever; the function never returns.
//
// NOTE(review): x1 and x2 are ra and sp in the standard RISC-V ABI and are
// overwritten here. That is only tolerable because this routine never
// executes a return and uses no stack.
// NOTE(review): GCC documents that only basic asm is supported inside naked
// functions; this statement is extended asm (immediate operands only).
// Confirm the toolchain in use accepts it reliably.
extern "C" void __attribute__((naked, noinline, used)) tensor_worker() {
asm volatile(
// x5 = value of the CSR named by VX_CSR_WARP_ID (presumably this warp's id).
"csrr x5, %[csr_wid]\n\t"
// x1 = CASE05_BARRIER_ID combined with the tensor-domain field.
"li x1, (%[bar_id] | (%[domain_tensor] << %[domain_shift]))\n\t"
// x2 = NUM_TENSOR_WARPS.
"li x2, %[num_tensor]\n\t"
// Custom-0 R-type, funct3=4, rs1=x1, rs2=x2, rd=x0.
// Presumably the barrier instruction (id in rs1, participant count in rs2)
// - TODO confirm against the ISA definition.
".insn r %[custom0], 4, 0, x0, x1, x2\n\t"
// x6 = x5 * 4: byte offset of a 4-byte entry indexed by x5.
"slli x6, x5, 2\n\t"
// x7 = &g_seen + x6.
"la x7, g_seen\n\t"
"add x7, x7, x6\n\t"
// x6 = WU_CASE_TENSOR_BASE | x5, the marker value for this entry.
"li x6, %[tensor_base]\n\t"
"or x6, x6, x5\n\t"
// Store the 32-bit marker into the g_seen slot.
"sw x6, 0(x7)\n\t"
// Custom-0 R-type, funct3=0, every register operand x0.
// Presumably sets an empty thread mask to stop this warp - TODO confirm.
".insn r %[custom0], 0, 0, x0, x0, x0\n\t"
// Infinite self-branch in case execution continues past the previous
// instruction.
"1: j 1b\n\t"
:
// All inputs are compile-time immediates ("i" constraint); no C variables
// are read or written by the asm.
: [csr_wid] "i"(VX_CSR_WARP_ID),
[custom0] "i"(RISCV_CUSTOM0),
[bar_id] "i"(CASE05_BARRIER_ID),
[domain_tensor] "i"(VX_BARRIER_DOMAIN_TENSOR),
[domain_shift] "i"(VX_BARRIER_DOMAIN_SHIFT),
[num_tensor] "i"(NUM_TENSOR_WARPS),
[tensor_base] "i"(WU_CASE_TENSOR_BASE)
// "memory" clobber: the asm writes g_seen behind the compiler's back.
: "memory");
}
// wu_main - entry point for case 05.
//
// Callers for which wu_is_leader() is false return 0 immediately. The leader
// resets the case state, launches tensor_worker through vx_spawn_tensor()
// using the mask from vx_tensor_warp_mask(), and then calls
// wu_wait_seen_range(NUM_SCALAR_WARPS, NUM_WARPS, WU_CASE_TENSOR_BASE).
// (Presumably that helper waits for the g_seen slots in that index range to
// hold their expected markers - confirm in common_wu_min.h.)
//
// Returns 0 for non-leaders and when the wait reports 0 (after
// wu_case_pass()); returns 1 after wu_case_fail(0x05u) when the wait
// reports a nonzero status.
extern "C" int wu_main() {
  if (wu_is_leader()) {
    wu_case_reset();
    vx_spawn_tensor(vx_tensor_warp_mask(), tensor_worker);

    // A zero status from the wait helper is the success path.
    if (wu_wait_seen_range(NUM_SCALAR_WARPS, NUM_WARPS, WU_CASE_TENSOR_BASE) == 0) {
      wu_case_pass();
      return 0;
    }

    wu_case_fail(0x05u);
    return 1;
  }

  // Not the leader: nothing to do.
  return 0;
}