diff --git a/kernels/wu_arch/Makefile b/kernels/wu_arch/Makefile new file mode 100644 index 00000000..76d35b84 --- /dev/null +++ b/kernels/wu_arch/Makefile @@ -0,0 +1,26 @@ +PROJECT = wu_arch + +VX_SRCS = kernel.cpp + +OPTS ?= -n1 + +WU_VARIANT_DUMPS = \ + kernel.radiance.barriers.dump + +all: kernel.radiance.dump $(WU_VARIANT_DUMPS) + +include ../common.mk + +kernel.radiance.barriers.dump: kernel.radiance.barriers.elf + $(VX_DP) -D $< > $@ + +kernel.radiance.barriers.elf: $(VX_SRCS) $(VX_INCLUDES) $(BINFILES) + $(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -DRADIANCE -DWU_RUN_DOMAIN_BARRIERS -o $@ + $(OBJCOPY) --set-section-flags .operand.a=$(OBJCOPY_FLAGS) $@ + $(OBJCOPY) --set-section-flags .operand.b=$(OBJCOPY_FLAGS) $@ + $(OBJCOPY) --set-section-flags .operand.c=$(OBJCOPY_FLAGS) $@ + $(OBJCOPY) --set-section-flags .args=$(OBJCOPY_FLAGS) $@ + $(OBJCOPY) --update-section .operand.a=input.a.bin $@ || true + $(OBJCOPY) --update-section .operand.b=input.b.bin $@ || true + $(OBJCOPY) --update-section .operand.c=input.c.bin $@ || true + $(OBJCOPY) --update-section .args=args.bin $@ || true diff --git a/kernels/wu_arch/args.bin b/kernels/wu_arch/args.bin new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/kernels/wu_arch/args.bin @@ -0,0 +1 @@ +0 diff --git a/kernels/wu_arch/input.a.bin b/kernels/wu_arch/input.a.bin new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/kernels/wu_arch/input.a.bin @@ -0,0 +1 @@ +0 diff --git a/kernels/wu_arch/input.b.bin b/kernels/wu_arch/input.b.bin new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/kernels/wu_arch/input.b.bin @@ -0,0 +1 @@ +0 diff --git a/kernels/wu_arch/input.c.bin b/kernels/wu_arch/input.c.bin new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/kernels/wu_arch/input.c.bin @@ -0,0 +1 @@ +0 diff --git a/kernels/wu_arch/kernel.cpp b/kernels/wu_arch/kernel.cpp new file mode 100644 index 00000000..eaaf6a6d --- /dev/null +++ b/kernels/wu_arch/kernel.cpp @@ -0,0 +1,173 @@ +#include +#include + +#define DEV_SMEM_START_ADDR 0xff000000u +#define MAX_WARPS 8 +#define MINIMAL_INIT_WORDS 4 +#define WU_SCALAR_SPIN 32 +#define WU_TENSOR_SPIN 32 +#define WU_WAIT_SPIN 8192 +#define WU_STATUS_DONE 0x600du +#define WU_STATUS_SCALAR_BASE 0x5100u +#define WU_STATUS_TENSOR_BASE 0x7100u +#define WU_BARRIER_SCALAR 0u +#define WU_BARRIER_MASKED 1u +#define WU_BARRIER_TENSOR 2u + +extern "C" { +volatile uint32_t g_status[MAX_WARPS] __attribute__((aligned(32))); +volatile uint32_t g_scalar_seen[MAX_WARPS] __attribute__((aligned(32))); +volatile uint32_t g_tensor_seen[MAX_WARPS] __attribute__((aligned(32))); +volatile uint32_t g_spin_sink[MAX_WARPS] __attribute__((aligned(32))); +extern volatile uint64_t tohost; +} + +extern "C" void vx_perf_dump() {} + +static inline void wu_report_tohost(uint32_t exit_code) { + asm volatile("fence rw, rw" ::: "memory"); + tohost = (static_cast(exit_code) << 1) | 1u; + asm volatile("fence rw, rw" ::: "memory"); +} + +extern "C" int wu_main(); + +extern "C" void __attribute__((naked, section(".init"), used)) _start() { + asm volatile( + ".option push\n\t" + ".option norelax\n\t" + "la gp, __global_pointer\n\t" + ".option pop\n\t" + "csrr t0, %[csr_core]\n\t" + "bnez t0, 2f\n\t" + "li sp, %[stack_base]\n\t" + "call wu_main\n\t" + "mv gp, a0\n\t" + "2:\n\t" + ".insn r %[custom0], 0, 0, x0, x0, x0\n\t" + "1: j 1b\n\t" + : + : [csr_core] "i"(VX_CSR_CORE_ID), + [stack_base] "i"(STACK_BASE_ADDR), + [custom0] "i"(RISCV_CUSTOM0) + : "memory"); +} + +extern "C" void scalar_worker() { + const uint32_t wid = static_cast(vx_warp_id()); + const uint32_t tid = static_cast(vx_thread_id()); + volatile uint32_t mix = wid + 1u; + +#ifdef WU_RUN_DOMAIN_BARRIERS + vx_barrier_scalar(WU_BARRIER_SCALAR, NUM_SCALAR_WARPS); + vx_barrier_mask(WU_BARRIER_MASKED, vx_scalar_warp_mask()); +#endif + + for (uint32_t i = 0; i < WU_SCALAR_SPIN; ++i) + mix = (mix << 1) ^ (i + wid); + + if (tid == 0 && wid < MAX_WARPS) { + g_spin_sink[wid] = mix; + g_scalar_seen[wid] = WU_STATUS_SCALAR_BASE | wid; + } + + vx_tmc_zero(); + while (1) {} +} + +extern "C" void __attribute__((naked, noinline, used)) tensor_worker() { + asm volatile( + "csrr x5, %[csr_wid]\n\t" +#ifdef WU_RUN_DOMAIN_BARRIERS + "li x1, (%[bar_tensor] | (%[domain_tensor] << %[domain_shift]))\n\t" + "li x2, %[num_tensor]\n\t" + ".insn r %[custom0], 4, 0, x0, x1, x2\n\t" +#endif + ".insn r %[custom3], 3, 0, x0, x0, x0\n\t" + "li x7, %[tensor_spin]\n\t" + "1:\n\t" + "addi x7, x7, -1\n\t" + "bnez x7, 1b\n\t" + "slli x5, x5, 2\n\t" + "la x6, g_tensor_seen\n\t" + "add x6, x6, x5\n\t" + "li x7, %[tensor_base]\n\t" + "srli x5, x5, 2\n\t" + "or x7, x7, x5\n\t" + "sw x7, 0(x6)\n\t" + ".insn r %[custom0], 0, 0, x0, x0, x0\n\t" + "2: j 2b\n\t" + : + : [csr_wid] "i"(VX_CSR_WARP_ID), + [custom0] "i"(RISCV_CUSTOM0), + [custom3] "i"(RISCV_CUSTOM3), + [bar_tensor] "i"(WU_BARRIER_TENSOR), + [domain_tensor] "i"(VX_BARRIER_DOMAIN_TENSOR), + [domain_shift] "i"(VX_BARRIER_DOMAIN_SHIFT), + [num_tensor] "i"(NUM_TENSOR_WARPS), + [tensor_spin] "i"(WU_TENSOR_SPIN), + [tensor_base] "i"(WU_STATUS_TENSOR_BASE) + : "memory"); +} + +static void init_state() { + g_status[0] = 0; + for (uint32_t i = 0; i < MAX_WARPS; ++i) { + g_scalar_seen[i] = 0; + g_tensor_seen[i] = 0; + } + + volatile uint32_t *smem = + reinterpret_cast(DEV_SMEM_START_ADDR); + for (uint32_t i = 0; i < MINIMAL_INIT_WORDS; ++i) + smem[i] = 0x100u + i; +} + +static int wait_for_wu_completion() { + for (uint32_t spin = 0; spin < WU_WAIT_SPIN; ++spin) { + uint32_t done = 1; + for (uint32_t wid = 0; wid < NUM_SCALAR_WARPS; ++wid) + done &= (g_scalar_seen[wid] == (WU_STATUS_SCALAR_BASE | wid)); + for (uint32_t wid = NUM_SCALAR_WARPS; wid < NUM_WARPS; ++wid) + done &= (g_tensor_seen[wid] == (WU_STATUS_TENSOR_BASE | wid)); + if (done) + return 0; + } + return 1; +} + +extern "C" int wu_main() { + if (vx_core_id() != 0 || vx_warp_id() != 0) + return 0; + if (vx_thread_id() != 0) + return 0; + + init_state(); + + const uint32_t other_scalar_warps = vx_scalar_warp_mask() & ~1u; + if (other_scalar_warps != 0) + vx_spawn_scalar(other_scalar_warps, scalar_worker); + + vx_spawn_tensor(vx_tensor_warp_mask(), tensor_worker); + +#ifdef WU_RUN_DOMAIN_BARRIERS + vx_barrier_scalar(WU_BARRIER_SCALAR, NUM_SCALAR_WARPS); + vx_barrier_mask(WU_BARRIER_MASKED, vx_scalar_warp_mask()); +#endif + + volatile uint32_t mix = 1; + for (uint32_t i = 0; i < WU_SCALAR_SPIN; ++i) + mix = (mix << 1) ^ (i + 3u); + g_spin_sink[0] = mix; + g_scalar_seen[0] = WU_STATUS_SCALAR_BASE; + + if (wait_for_wu_completion() != 0) { + g_status[0] = 0xe001u; + wu_report_tohost(1); + return 1; + } + + g_status[0] = WU_STATUS_DONE; + wu_report_tohost(0); + return 0; +} diff --git a/kernels/wu_arch_hgemm/Makefile b/kernels/wu_arch_hgemm/Makefile new file mode 100644 index 00000000..3d93c9c6 --- /dev/null +++ b/kernels/wu_arch_hgemm/Makefile @@ -0,0 +1,9 @@ +PROJECT = wu_arch_hgemm + +VX_SRCS = kernel.cpp +OPTS ?= -n1 + +include ../common.mk + +args.bin input.a.bin input.b.bin input.c.bin: ../wu_arch_cases/zero.bin + cp $< $@ diff --git a/kernels/wu_arch_hgemm/README.md b/kernels/wu_arch_hgemm/README.md new file mode 100644 index 00000000..8046f77d --- /dev/null +++ b/kernels/wu_arch_hgemm/README.md @@ -0,0 +1,8 @@ +# wu_arch_hgemm + +Tensor-warp HGEMM smoke test for the Wu split scalar/tensor warp configuration. + +Scalar warp 0 initializes the shared-memory B operand, spawns only the tensor +warp mask, waits for tensor warps `NUM_SCALAR_WARPS..NUM_WARPS-1`, and reports +completion through `tohost`. Tensor warps execute the Blackwell custom HGEMM +instruction sequence and then stop themselves. diff --git a/kernels/wu_arch_hgemm/args.bin b/kernels/wu_arch_hgemm/args.bin new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/kernels/wu_arch_hgemm/args.bin @@ -0,0 +1 @@ +0 diff --git a/kernels/wu_arch_hgemm/input.a.bin b/kernels/wu_arch_hgemm/input.a.bin new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/kernels/wu_arch_hgemm/input.a.bin @@ -0,0 +1 @@ +0 diff --git a/kernels/wu_arch_hgemm/input.b.bin b/kernels/wu_arch_hgemm/input.b.bin new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/kernels/wu_arch_hgemm/input.b.bin @@ -0,0 +1 @@ +0 diff --git a/kernels/wu_arch_hgemm/input.c.bin b/kernels/wu_arch_hgemm/input.c.bin new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/kernels/wu_arch_hgemm/input.c.bin @@ -0,0 +1 @@ +0 diff --git a/kernels/wu_arch_hgemm/kernel.cpp b/kernels/wu_arch_hgemm/kernel.cpp new file mode 100644 index 00000000..c5925531 --- /dev/null +++ b/kernels/wu_arch_hgemm/kernel.cpp @@ -0,0 +1,87 @@ +#include "../wu_arch_cases/common_wu_min.h" + +#define DEV_SMEM_START_ADDR 0xff000000u +#define WU_CASE_TENSOR_HGEMM_BASE 0x7500u + +#define BW_REP2(x) x, x +#define BW_REP4(x) BW_REP2(x), BW_REP2(x) +#define BW_REP8(x) BW_REP4(x), BW_REP4(x) + +extern "C" { +volatile uint32_t g_hgemm_a_row[8] __attribute__((aligned(32))) = { + BW_REP8(0x3c003c00u)}; +volatile uint32_t g_hgemm_b_row[8] __attribute__((aligned(32))) = { + BW_REP8(0x40004000u)}; +volatile uint32_t g_hgemm_c_row[8] __attribute__((aligned(32))) = { + BW_REP8(0x3f800000u)}; +} + +#undef BW_REP2 +#undef BW_REP4 +#undef BW_REP8 + +extern "C" void __attribute__((naked, noinline, used)) tensor_hgemm_worker() { + asm volatile( + "csrr x5, %[csr_wid]\n\t" + "slli x1, x5, 11\n\t" + "addi x2, x1, 1024\n\t" + "la x6, g_hgemm_a_row\n\t" + "la x3, g_hgemm_c_row\n\t" + "li x7, 0\n\t" + "1:\n\t" + "add x4, x1, x7\n\t" + ".insn r %[custom3], 2, 0, x0, x4, x6\n\t" + "add x4, x2, x7\n\t" + ".insn r %[custom3], 2, 0, x0, x4, x3\n\t" + "addi x7, x7, 32\n\t" + "li x4, 1024\n\t" + "blt x7, x4, 1b\n\t" + ".insn r %[custom3], 3, 0, x0, x0, x0\n\t" + "li x4, %[smem_base]\n\t" + ".insn r %[custom3], 0, 0, x2, x1, x4\n\t" + ".insn r %[custom3], 1, 0, x0, x0, x0\n\t" + "csrr x5, %[csr_wid]\n\t" + "slli x6, x5, 2\n\t" + "la x7, g_seen\n\t" + "add x7, x7, x6\n\t" + "li x6, %[hgemm_base]\n\t" + "or x6, x6, x5\n\t" + "sw x6, 0(x7)\n\t" + ".insn r %[custom0], 0, 0, x0, x0, x0\n\t" + "2: j 2b\n\t" + : + : [csr_wid] "i"(VX_CSR_WARP_ID), + [custom0] "i"(RISCV_CUSTOM0), + [custom3] "i"(RISCV_CUSTOM3), + [smem_base] "i"(DEV_SMEM_START_ADDR), + [hgemm_base] "i"(WU_CASE_TENSOR_HGEMM_BASE) + : "memory"); +} + +extern "C" int wu_main() { + if (!wu_is_leader()) { + return 0; + } + + wu_case_reset(); + + volatile uint32_t *smem_b = + reinterpret_cast(DEV_SMEM_START_ADDR); + for (uint32_t frag = 0; frag < 32u; ++frag) { + const uint32_t row = frag * 8u; + for (uint32_t i = 0; i < 8u; ++i) { + smem_b[row + i] = g_hgemm_b_row[i]; + } + } + + vx_spawn_tensor(vx_tensor_warp_mask(), tensor_hgemm_worker); + + if (wu_wait_seen_range(NUM_SCALAR_WARPS, NUM_WARPS, + WU_CASE_TENSOR_HGEMM_BASE) != 0) { + wu_case_fail(0x09u); + return 1; + } + + wu_case_pass(); + return 0; +}