Add Blackwell instructions test kernel and update linker script

- Add kernels/blackwell_insts/ with test kernel and input data
- Update linker script with extended memory layout
- Remove obsolete sgemm_tcore_blackwell Makefile
- Update VX_types.h and common.h
This commit is contained in:
2026-05-06 14:50:28 +08:00
parent bcc566b621
commit 8f7dba5920
11 changed files with 235 additions and 10 deletions

5
.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
**/*.o
.codex
**/*.elf
**/*.dump
**/*.a

View File

@@ -0,0 +1,7 @@
PROJECT = blackwell_insts
VX_SRCS = kernel.cpp
OPTS ?= -n1
include ../common.mk

View File

@@ -0,0 +1 @@
0

View File

@@ -0,0 +1 @@
0

View File

@@ -0,0 +1 @@
0

View File

@@ -0,0 +1 @@
0

View File

@@ -0,0 +1,192 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#define DEV_SMEM_START_ADDR 0xff000000u
#define BW_REP2(x) x, x
#define BW_REP4(x) BW_REP2(x), BW_REP2(x)
#define BW_REP8(x) BW_REP4(x), BW_REP4(x)
#define BW_REP16(x) BW_REP8(x), BW_REP8(x)
#define BW_REP32(x) BW_REP16(x), BW_REP16(x)
#define BW_REP64(x) BW_REP32(x), BW_REP32(x)
#define BW_REP128(x) BW_REP64(x), BW_REP64(x)
#define BW_REP256(x) BW_REP128(x), BW_REP128(x)
static volatile uint32_t g_a[256] __attribute__((aligned(32))) = {
BW_REP256(0x3c003c00u)}; // two fp16 1.0 values
static volatile uint32_t g_b[256] __attribute__((aligned(32))) = {
BW_REP256(0x40004000u)}; // two fp16 2.0 values
static volatile uint32_t g_c[256] __attribute__((aligned(32))) = {
BW_REP256(0x3f800000u)}; // one fp32 1.0 value
static volatile uint32_t g_dst[256] __attribute__((aligned(32)));
static volatile uint32_t g_debug[16] __attribute__((aligned(32)));
static volatile uint32_t g_status __attribute__((aligned(4)));
#undef BW_REP2
#undef BW_REP4
#undef BW_REP8
#undef BW_REP16
#undef BW_REP32
#undef BW_REP64
#undef BW_REP128
#undef BW_REP256
struct kernel_arg_t {
volatile uint32_t *a;
volatile uint32_t *b;
volatile uint32_t *c;
volatile uint32_t *dst;
volatile uint32_t *debug;
volatile uint32_t *status;
};
static inline void tcgen05_cp(uint32_t addr_tmem, uint32_t addr_gmem) {
asm volatile(".insn r %0, 2, 0, x0, %1, %2"
:
: "i"(RISCV_CUSTOM3), "r"(addr_tmem), "r"(addr_gmem)
: "memory");
}
static inline void tcgen05_cb(uint32_t addr_tmem, uint32_t addr_gmem) {
asm volatile(".insn r %0, 6, 0, x0, %1, %2"
:
: "i"(RISCV_CUSTOM3), "r"(addr_tmem), "r"(addr_gmem)
: "memory");
}
static inline void tcgen05_cp_wait() {
asm volatile(".insn r %0, 3, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM3)
: "memory");
}
static inline void bwgmma(uint32_t addr_tmem_c,
uint32_t addr_tmem_a,
uint32_t addr_smem_b) {
asm volatile(".insn r %0, 0, 0, %1, %2, %3"
:
: "i"(RISCV_CUSTOM3), "r"(addr_tmem_c), "r"(addr_tmem_a),
"r"(addr_smem_b)
: "memory");
}
static inline void bwgmma_wait() {
asm volatile(".insn r %0, 1, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM3)
: "memory");
}
static inline float tcgen05_ld_f32(uint32_t addr_tmem) {
float value;
asm volatile(".insn r %1, 4, 0, %0, %2, x0"
: "=f"(value)
: "i"(RISCV_CUSTOM3), "r"(addr_tmem)
: "memory");
return value;
}
static inline void tcgen05_st_f32(uint32_t addr_tmem, float value) {
asm volatile(".insn r %0, 5, 0, %1, %2, x0"
:
: "i"(RISCV_CUSTOM3), "f"(value), "r"(addr_tmem)
: "memory");
}
static inline uint32_t f32_bits(float value) {
union {
float f;
uint32_t u;
} bits = {value};
return bits.u;
}
extern "C" void vx_perf_dump() {}
void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg)
__attribute__((convergent));
void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
if (task_id != 0)
return;
volatile uint32_t *a = arg->a;
volatile uint32_t *b = arg->b;
volatile uint32_t *c = arg->c;
volatile uint32_t *dst = arg->dst;
volatile uint32_t *debug = arg->debug;
volatile uint32_t *status = arg->status;
const uint32_t tmem_a = 0x000;
const uint32_t tmem_c = 0x400;
const uint32_t tmem_st_scratch = 0x800;
volatile uint32_t *smem_b_ptr =
reinterpret_cast<volatile uint32_t *>(DEV_SMEM_START_ADDR);
const uint32_t smem_b = reinterpret_cast<uint32_t>(smem_b_ptr);
const uint32_t expected = 0x42820000u; // 65.0f
const uint32_t expected_st = 0x3f800000u; // 1.0f
if (status != nullptr)
status[0] = 0x100u;
for (int i = 0; i < 256; ++i)
smem_b_ptr[i] = b[i];
for (int frag = 0; frag < 32; ++frag) {
const uint32_t offset = static_cast<uint32_t>(frag * 32);
tcgen05_cp(tmem_a + offset,
reinterpret_cast<uint32_t>(&a[frag * 8]));
tcgen05_cp(tmem_c + offset,
reinterpret_cast<uint32_t>(&c[frag * 8]));
}
tcgen05_cp_wait();
const float st_value = 1.0f;
tcgen05_st_f32(tmem_st_scratch, st_value);
const uint32_t st_bits = f32_bits(tcgen05_ld_f32(tmem_st_scratch));
debug[0] = st_bits;
if (st_bits != expected_st) {
if (status != nullptr)
status[0] = 0xe002u;
return;
}
bwgmma(tmem_c, tmem_a, smem_b);
bwgmma_wait();
const float ld_value = tcgen05_ld_f32(tmem_c);
const uint32_t ld_bits = f32_bits(ld_value);
debug[1] = ld_bits;
tcgen05_st_f32(tmem_st_scratch + 32, ld_value);
for (int frag = 0; frag < 32; ++frag) {
tcgen05_cb(tmem_c + static_cast<uint32_t>(frag * 32),
reinterpret_cast<uint32_t>(&dst[frag * 8]));
}
tcgen05_cp_wait();
if (ld_bits != expected) {
if (status != nullptr)
status[0] = 0xe001u;
return;
}
for (int i = 0; i < 256; ++i) {
if (dst[i] != expected) {
if (status != nullptr)
status[0] = 0xe100u | static_cast<uint32_t>(i & 0xff);
debug[2] = static_cast<uint32_t>(i);
debug[3] = dst[i];
return;
}
}
if (status != nullptr)
status[0] = 0x600du;
}
int main() {
kernel_arg_t arg = {g_a, g_b, g_c, g_dst, g_debug, &g_status};
vx_spawn_tasks_contiguous(1, reinterpret_cast<vx_spawn_tasks_cb>(kernel_body),
&arg);
return (g_status == 0x600du) ? 0 : 1;
}

View File

@@ -6,7 +6,7 @@
#define KERNEL_ARG_DEV_MEM_ADDR 0x9fff0000 #define KERNEL_ARG_DEV_MEM_ADDR 0x9fff0000
#define DEV_SMEM_START_ADDR 0xff000000 #define DEV_SMEM_START_ADDR 0xff000000
typedef struct { typedef struct __attribute__((packed)) {
uint32_t dim_m; uint32_t dim_m;
uint32_t dim_n; uint32_t dim_n;
uint32_t dim_k; uint32_t dim_k;

View File

@@ -1,8 +0,0 @@
PROJECT = sgemm_tcore_blackwell
VX_SRCS = ../sgemm_tcore/kernel.cpp
VX_INCLUDES = ../sgemm_tcore/sgemm_impl.hpp ../sgemm_tcore/common.h
OPTS ?= -n16
include ../common.mk

View File

@@ -184,10 +184,10 @@
#define VX_CSR_CORE_ID 0xCC2 #define VX_CSR_CORE_ID 0xCC2
#define VX_CSR_WARP_MASK 0xCC3 #define VX_CSR_WARP_MASK 0xCC3
#define VX_CSR_THREAD_MASK 0xCC4 // warning! this value is also used in LLVM #define VX_CSR_THREAD_MASK 0xCC4 // warning! this value is also used in LLVM
#define VX_CSR_GCID 0xCC5 // legacy global core id alias used by Radiance bootrom
#define VX_CSR_NUM_THREADS 0xFC0 #define VX_CSR_NUM_THREADS 0xFC0
#define VX_CSR_NUM_WARPS 0xFC1 #define VX_CSR_NUM_WARPS 0xFC1
#define VX_CSR_NUM_CORES 0xFC2 #define VX_CSR_NUM_CORES 0xFC2
#endif // VX_TYPES_VH #endif // VX_TYPES_VH

View File

@@ -7,6 +7,15 @@ OUTPUT_FORMAT("elf64-littleriscv", "elf64-littleriscv",
"elf64-littleriscv") "elf64-littleriscv")
OUTPUT_ARCH(riscv) OUTPUT_ARCH(riscv)
ENTRY(_start) ENTRY(_start)
MEMORY {
DRAM0 (rwx): ORIGIN = 0x80000000, LENGTH = 512M
DRAMARG (rwx): ORIGIN = 0x9fff0000, LENGTH = 8K
DRAM1 (rwx): ORIGIN = 0xa0000000, LENGTH = 16M
DRAM2 (rwx): ORIGIN = 0xa1000000, LENGTH = 16M
DRAM3 (rwx): ORIGIN = 0xa2000000, LENGTH = 16M
}
SECTIONS SECTIONS
{ {
. = STARTUP_ADDR; . = STARTUP_ADDR;
@@ -249,4 +258,20 @@ SECTIONS
.gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
/DISCARD/ : { *(.note.GNU-stack) *(.gnu_debuglink) *(.gnu.lto_*) } /DISCARD/ : { *(.note.GNU-stack) *(.gnu_debuglink) *(.gnu.lto_*) }
.args : {
*(.args)
. += 8K;
} > DRAMARG
.operand.a : {
*(.operand.a)
. += 32K;
} > DRAM1
.operand.b : {
*(.operand.b)
. += 32K;
} > DRAM2
.operand.c : {
*(.operand.c)
. += 32K;
} > DRAM3
} }