From 8f7dba5920c5948c0d73c205bc1fb9bf74e50559 Mon Sep 17 00:00:00 2001 From: abnerhexu Date: Wed, 6 May 2026 14:50:28 +0800 Subject: [PATCH] Add Blackwell instructions test kernel and update linker script - Add kernels/blackwell_insts/ with test kernel and input data - Update linker script with extended memory layout - Remove obsolete sgemm_tcore_blackwell Makefile - Update VX_types.h and common.h --- .gitignore | 5 + kernels/blackwell_insts/Makefile | 7 + kernels/blackwell_insts/args.bin | 1 + kernels/blackwell_insts/input.a.bin | 1 + kernels/blackwell_insts/input.b.bin | 1 + kernels/blackwell_insts/input.c.bin | 1 + kernels/blackwell_insts/kernel.cpp | 192 +++++++++++++++++++++++++ kernels/sgemm_tcore/common.h | 2 +- kernels/sgemm_tcore_blackwell/Makefile | 8 -- lib/include/VX_types.h | 2 +- lib/linker/vx_link64.ld | 25 ++++ 11 files changed, 235 insertions(+), 10 deletions(-) create mode 100644 .gitignore create mode 100644 kernels/blackwell_insts/Makefile create mode 100644 kernels/blackwell_insts/args.bin create mode 100644 kernels/blackwell_insts/input.a.bin create mode 100644 kernels/blackwell_insts/input.b.bin create mode 100644 kernels/blackwell_insts/input.c.bin create mode 100644 kernels/blackwell_insts/kernel.cpp delete mode 100644 kernels/sgemm_tcore_blackwell/Makefile diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..15751023 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +**/*.o +.codex +**/*.elf +**/*.dump +**/*.a \ No newline at end of file diff --git a/kernels/blackwell_insts/Makefile b/kernels/blackwell_insts/Makefile new file mode 100644 index 00000000..18b199d6 --- /dev/null +++ b/kernels/blackwell_insts/Makefile @@ -0,0 +1,7 @@ +PROJECT = blackwell_insts + +VX_SRCS = kernel.cpp + +OPTS ?= -n1 + +include ../common.mk diff --git a/kernels/blackwell_insts/args.bin b/kernels/blackwell_insts/args.bin new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/kernels/blackwell_insts/args.bin @@ -0,0 +1 @@ +0 diff --git a/kernels/blackwell_insts/input.a.bin b/kernels/blackwell_insts/input.a.bin new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/kernels/blackwell_insts/input.a.bin @@ -0,0 +1 @@ +0 diff --git a/kernels/blackwell_insts/input.b.bin b/kernels/blackwell_insts/input.b.bin new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/kernels/blackwell_insts/input.b.bin @@ -0,0 +1 @@ +0 diff --git a/kernels/blackwell_insts/input.c.bin b/kernels/blackwell_insts/input.c.bin new file mode 100644 index 00000000..573541ac --- /dev/null +++ b/kernels/blackwell_insts/input.c.bin @@ -0,0 +1 @@ +0 diff --git a/kernels/blackwell_insts/kernel.cpp b/kernels/blackwell_insts/kernel.cpp new file mode 100644 index 00000000..427c2999 --- /dev/null +++ b/kernels/blackwell_insts/kernel.cpp @@ -0,0 +1,192 @@ +#include +#include +#include + +#define DEV_SMEM_START_ADDR 0xff000000u + +#define BW_REP2(x) x, x +#define BW_REP4(x) BW_REP2(x), BW_REP2(x) +#define BW_REP8(x) BW_REP4(x), BW_REP4(x) +#define BW_REP16(x) BW_REP8(x), BW_REP8(x) +#define BW_REP32(x) BW_REP16(x), BW_REP16(x) +#define BW_REP64(x) BW_REP32(x), BW_REP32(x) +#define BW_REP128(x) BW_REP64(x), BW_REP64(x) +#define BW_REP256(x) BW_REP128(x), BW_REP128(x) + +static volatile uint32_t g_a[256] __attribute__((aligned(32))) = { + BW_REP256(0x3c003c00u)}; // two fp16 1.0 values +static volatile uint32_t g_b[256] __attribute__((aligned(32))) = { + BW_REP256(0x40004000u)}; // two fp16 2.0 values +static volatile uint32_t g_c[256] __attribute__((aligned(32))) = { + BW_REP256(0x3f800000u)}; // one fp32 1.0 value +static volatile uint32_t g_dst[256] __attribute__((aligned(32))); +static volatile uint32_t g_debug[16] __attribute__((aligned(32))); +static volatile uint32_t g_status __attribute__((aligned(4))); + +#undef BW_REP2 +#undef BW_REP4 +#undef BW_REP8 +#undef BW_REP16 +#undef BW_REP32 +#undef BW_REP64 +#undef BW_REP128 +#undef BW_REP256 + +struct kernel_arg_t { + volatile uint32_t *a; + volatile uint32_t *b; + volatile uint32_t *c; + volatile uint32_t *dst; + volatile uint32_t *debug; + volatile uint32_t *status; +}; + +static inline void tcgen05_cp(uint32_t addr_tmem, uint32_t addr_gmem) { + asm volatile(".insn r %0, 2, 0, x0, %1, %2" + : + : "i"(RISCV_CUSTOM3), "r"(addr_tmem), "r"(addr_gmem) + : "memory"); +} + +static inline void tcgen05_cb(uint32_t addr_tmem, uint32_t addr_gmem) { + asm volatile(".insn r %0, 6, 0, x0, %1, %2" + : + : "i"(RISCV_CUSTOM3), "r"(addr_tmem), "r"(addr_gmem) + : "memory"); +} + +static inline void tcgen05_cp_wait() { + asm volatile(".insn r %0, 3, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM3) + : "memory"); +} + +static inline void bwgmma(uint32_t addr_tmem_c, + uint32_t addr_tmem_a, + uint32_t addr_smem_b) { + asm volatile(".insn r %0, 0, 0, %1, %2, %3" + : + : "i"(RISCV_CUSTOM3), "r"(addr_tmem_c), "r"(addr_tmem_a), + "r"(addr_smem_b) + : "memory"); +} + +static inline void bwgmma_wait() { + asm volatile(".insn r %0, 1, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM3) + : "memory"); +} + +static inline float tcgen05_ld_f32(uint32_t addr_tmem) { + float value; + asm volatile(".insn r %1, 4, 0, %0, %2, x0" + : "=f"(value) + : "i"(RISCV_CUSTOM3), "r"(addr_tmem) + : "memory"); + return value; +} + +static inline void tcgen05_st_f32(uint32_t addr_tmem, float value) { + asm volatile(".insn r %0, 5, 0, %1, %2, x0" + : + : "i"(RISCV_CUSTOM3), "f"(value), "r"(addr_tmem) + : "memory"); +} + +static inline uint32_t f32_bits(float value) { + union { + float f; + uint32_t u; + } bits = {value}; + return bits.u; +} + +extern "C" void vx_perf_dump() {} + +void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) + __attribute__((convergent)); + +void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) { + if (task_id != 0) + return; + + volatile uint32_t *a = arg->a; + volatile uint32_t *b = arg->b; + volatile uint32_t *c = arg->c; + volatile uint32_t *dst = arg->dst; + volatile uint32_t *debug = arg->debug; + volatile uint32_t *status = arg->status; + + const uint32_t tmem_a = 0x000; + const uint32_t tmem_c = 0x400; + const uint32_t tmem_st_scratch = 0x800; + volatile uint32_t *smem_b_ptr = + reinterpret_cast(DEV_SMEM_START_ADDR); + const uint32_t smem_b = reinterpret_cast(smem_b_ptr); + const uint32_t expected = 0x42820000u; // 65.0f + const uint32_t expected_st = 0x3f800000u; // 1.0f + + if (status != nullptr) + status[0] = 0x100u; + + for (int i = 0; i < 256; ++i) + smem_b_ptr[i] = b[i]; + + for (int frag = 0; frag < 32; ++frag) { + const uint32_t offset = static_cast(frag * 32); + tcgen05_cp(tmem_a + offset, + reinterpret_cast(&a[frag * 8])); + tcgen05_cp(tmem_c + offset, + reinterpret_cast(&c[frag * 8])); + } + tcgen05_cp_wait(); + + const float st_value = 1.0f; + tcgen05_st_f32(tmem_st_scratch, st_value); + const uint32_t st_bits = f32_bits(tcgen05_ld_f32(tmem_st_scratch)); + debug[0] = st_bits; + if (st_bits != expected_st) { + if (status != nullptr) + status[0] = 0xe002u; + return; + } + + bwgmma(tmem_c, tmem_a, smem_b); + bwgmma_wait(); + + const float ld_value = tcgen05_ld_f32(tmem_c); + const uint32_t ld_bits = f32_bits(ld_value); + debug[1] = ld_bits; + tcgen05_st_f32(tmem_st_scratch + 32, ld_value); + + for (int frag = 0; frag < 32; ++frag) { + tcgen05_cb(tmem_c + static_cast(frag * 32), + reinterpret_cast(&dst[frag * 8])); + } + tcgen05_cp_wait(); + + if (ld_bits != expected) { + if (status != nullptr) + status[0] = 0xe001u; + return; + } + + for (int i = 0; i < 256; ++i) { + if (dst[i] != expected) { + if (status != nullptr) + status[0] = 0xe100u | static_cast(i & 0xff); + debug[2] = static_cast(i); + debug[3] = dst[i]; + return; + } + } + + if (status != nullptr) + status[0] = 0x600du; +} + +int main() { + kernel_arg_t arg = {g_a, g_b, g_c, g_dst, g_debug, &g_status}; + + vx_spawn_tasks_contiguous(1, reinterpret_cast(kernel_body), + &arg); + return (g_status == 0x600du) ? 0 : 1; +} diff --git a/kernels/sgemm_tcore/common.h b/kernels/sgemm_tcore/common.h index 5c84f3b7..a69afbf4 100644 --- a/kernels/sgemm_tcore/common.h +++ b/kernels/sgemm_tcore/common.h @@ -6,7 +6,7 @@ #define KERNEL_ARG_DEV_MEM_ADDR 0x9fff0000 #define DEV_SMEM_START_ADDR 0xff000000 -typedef struct { +typedef struct __attribute__((packed)) { uint32_t dim_m; uint32_t dim_n; uint32_t dim_k; diff --git a/kernels/sgemm_tcore_blackwell/Makefile b/kernels/sgemm_tcore_blackwell/Makefile deleted file mode 100644 index 8f7ad01c..00000000 --- a/kernels/sgemm_tcore_blackwell/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -PROJECT = sgemm_tcore_blackwell - -VX_SRCS = ../sgemm_tcore/kernel.cpp -VX_INCLUDES = ../sgemm_tcore/sgemm_impl.hpp ../sgemm_tcore/common.h - -OPTS ?= -n16 - -include ../common.mk diff --git a/lib/include/VX_types.h b/lib/include/VX_types.h index 7ad1ca68..4e2cdf12 100644 --- a/lib/include/VX_types.h +++ b/lib/include/VX_types.h @@ -184,10 +184,10 @@ #define VX_CSR_CORE_ID 0xCC2 #define VX_CSR_WARP_MASK 0xCC3 #define VX_CSR_THREAD_MASK 0xCC4 // warning! this value is also used in LLVM +#define VX_CSR_GCID 0xCC5 // legacy global core id alias used by Radiance bootrom #define VX_CSR_NUM_THREADS 0xFC0 #define VX_CSR_NUM_WARPS 0xFC1 #define VX_CSR_NUM_CORES 0xFC2 #endif // VX_TYPES_VH - diff --git a/lib/linker/vx_link64.ld b/lib/linker/vx_link64.ld index 072281f7..68056db1 100644 --- a/lib/linker/vx_link64.ld +++ b/lib/linker/vx_link64.ld @@ -7,6 +7,15 @@ OUTPUT_FORMAT("elf64-littleriscv", "elf64-littleriscv", "elf64-littleriscv") OUTPUT_ARCH(riscv) ENTRY(_start) + +MEMORY { + DRAM0 (rwx): ORIGIN = 0x80000000, LENGTH = 512M + DRAMARG (rwx): ORIGIN = 0x9fff0000, LENGTH = 8K + DRAM1 (rwx): ORIGIN = 0xa0000000, LENGTH = 16M + DRAM2 (rwx): ORIGIN = 0xa1000000, LENGTH = 16M + DRAM3 (rwx): ORIGIN = 0xa2000000, LENGTH = 16M +} + SECTIONS { . = STARTUP_ADDR; @@ -249,4 +258,20 @@ SECTIONS .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } /DISCARD/ : { *(.note.GNU-stack) *(.gnu_debuglink) *(.gnu.lto_*) } + .args : { + *(.args) + . += 8K; + } > DRAMARG + .operand.a : { + *(.operand.a) + . += 32K; + } > DRAM1 + .operand.b : { + *(.operand.b) + . += 32K; + } > DRAM2 + .operand.c : { + *(.operand.c) + . += 32K; + } > DRAM3 }