Add Blackwell instructions test kernel and update linker script
- Add kernels/blackwell_insts/ with test kernel and input data - Update linker script with extended memory layout - Remove obsolete sgemm_tcore_blackwell Makefile - Update VX_types.h and common.h
This commit is contained in:
5
.gitignore
vendored
Normal file
5
.gitignore
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
**/*.o
|
||||
.codex
|
||||
**/*.elf
|
||||
**/*.dump
|
||||
**/*.a
|
||||
7
kernels/blackwell_insts/Makefile
Normal file
7
kernels/blackwell_insts/Makefile
Normal file
@@ -0,0 +1,7 @@
|
||||
PROJECT = blackwell_insts
|
||||
|
||||
VX_SRCS = kernel.cpp
|
||||
|
||||
OPTS ?= -n1
|
||||
|
||||
include ../common.mk
|
||||
1
kernels/blackwell_insts/args.bin
Normal file
1
kernels/blackwell_insts/args.bin
Normal file
@@ -0,0 +1 @@
|
||||
0
|
||||
1
kernels/blackwell_insts/input.a.bin
Normal file
1
kernels/blackwell_insts/input.a.bin
Normal file
@@ -0,0 +1 @@
|
||||
0
|
||||
1
kernels/blackwell_insts/input.b.bin
Normal file
1
kernels/blackwell_insts/input.b.bin
Normal file
@@ -0,0 +1 @@
|
||||
0
|
||||
1
kernels/blackwell_insts/input.c.bin
Normal file
1
kernels/blackwell_insts/input.c.bin
Normal file
@@ -0,0 +1 @@
|
||||
0
|
||||
192
kernels/blackwell_insts/kernel.cpp
Normal file
192
kernels/blackwell_insts/kernel.cpp
Normal file
@@ -0,0 +1,192 @@
|
||||
#include <stdint.h>
|
||||
#include <vx_intrinsics.h>
|
||||
#include <vx_spawn.h>
|
||||
|
||||
#define DEV_SMEM_START_ADDR 0xff000000u
|
||||
|
||||
#define BW_REP2(x) x, x
|
||||
#define BW_REP4(x) BW_REP2(x), BW_REP2(x)
|
||||
#define BW_REP8(x) BW_REP4(x), BW_REP4(x)
|
||||
#define BW_REP16(x) BW_REP8(x), BW_REP8(x)
|
||||
#define BW_REP32(x) BW_REP16(x), BW_REP16(x)
|
||||
#define BW_REP64(x) BW_REP32(x), BW_REP32(x)
|
||||
#define BW_REP128(x) BW_REP64(x), BW_REP64(x)
|
||||
#define BW_REP256(x) BW_REP128(x), BW_REP128(x)
|
||||
|
||||
static volatile uint32_t g_a[256] __attribute__((aligned(32))) = {
|
||||
BW_REP256(0x3c003c00u)}; // two fp16 1.0 values
|
||||
static volatile uint32_t g_b[256] __attribute__((aligned(32))) = {
|
||||
BW_REP256(0x40004000u)}; // two fp16 2.0 values
|
||||
static volatile uint32_t g_c[256] __attribute__((aligned(32))) = {
|
||||
BW_REP256(0x3f800000u)}; // one fp32 1.0 value
|
||||
static volatile uint32_t g_dst[256] __attribute__((aligned(32)));
|
||||
static volatile uint32_t g_debug[16] __attribute__((aligned(32)));
|
||||
static volatile uint32_t g_status __attribute__((aligned(4)));
|
||||
|
||||
#undef BW_REP2
|
||||
#undef BW_REP4
|
||||
#undef BW_REP8
|
||||
#undef BW_REP16
|
||||
#undef BW_REP32
|
||||
#undef BW_REP64
|
||||
#undef BW_REP128
|
||||
#undef BW_REP256
|
||||
|
||||
struct kernel_arg_t {
|
||||
volatile uint32_t *a;
|
||||
volatile uint32_t *b;
|
||||
volatile uint32_t *c;
|
||||
volatile uint32_t *dst;
|
||||
volatile uint32_t *debug;
|
||||
volatile uint32_t *status;
|
||||
};
|
||||
|
||||
static inline void tcgen05_cp(uint32_t addr_tmem, uint32_t addr_gmem) {
|
||||
asm volatile(".insn r %0, 2, 0, x0, %1, %2"
|
||||
:
|
||||
: "i"(RISCV_CUSTOM3), "r"(addr_tmem), "r"(addr_gmem)
|
||||
: "memory");
|
||||
}
|
||||
|
||||
static inline void tcgen05_cb(uint32_t addr_tmem, uint32_t addr_gmem) {
|
||||
asm volatile(".insn r %0, 6, 0, x0, %1, %2"
|
||||
:
|
||||
: "i"(RISCV_CUSTOM3), "r"(addr_tmem), "r"(addr_gmem)
|
||||
: "memory");
|
||||
}
|
||||
|
||||
static inline void tcgen05_cp_wait() {
|
||||
asm volatile(".insn r %0, 3, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM3)
|
||||
: "memory");
|
||||
}
|
||||
|
||||
static inline void bwgmma(uint32_t addr_tmem_c,
|
||||
uint32_t addr_tmem_a,
|
||||
uint32_t addr_smem_b) {
|
||||
asm volatile(".insn r %0, 0, 0, %1, %2, %3"
|
||||
:
|
||||
: "i"(RISCV_CUSTOM3), "r"(addr_tmem_c), "r"(addr_tmem_a),
|
||||
"r"(addr_smem_b)
|
||||
: "memory");
|
||||
}
|
||||
|
||||
static inline void bwgmma_wait() {
|
||||
asm volatile(".insn r %0, 1, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM3)
|
||||
: "memory");
|
||||
}
|
||||
|
||||
static inline float tcgen05_ld_f32(uint32_t addr_tmem) {
|
||||
float value;
|
||||
asm volatile(".insn r %1, 4, 0, %0, %2, x0"
|
||||
: "=f"(value)
|
||||
: "i"(RISCV_CUSTOM3), "r"(addr_tmem)
|
||||
: "memory");
|
||||
return value;
|
||||
}
|
||||
|
||||
static inline void tcgen05_st_f32(uint32_t addr_tmem, float value) {
|
||||
asm volatile(".insn r %0, 5, 0, %1, %2, x0"
|
||||
:
|
||||
: "i"(RISCV_CUSTOM3), "f"(value), "r"(addr_tmem)
|
||||
: "memory");
|
||||
}
|
||||
|
||||
static inline uint32_t f32_bits(float value) {
|
||||
union {
|
||||
float f;
|
||||
uint32_t u;
|
||||
} bits = {value};
|
||||
return bits.u;
|
||||
}
|
||||
|
||||
extern "C" void vx_perf_dump() {}
|
||||
|
||||
void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg)
|
||||
__attribute__((convergent));
|
||||
|
||||
void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
|
||||
if (task_id != 0)
|
||||
return;
|
||||
|
||||
volatile uint32_t *a = arg->a;
|
||||
volatile uint32_t *b = arg->b;
|
||||
volatile uint32_t *c = arg->c;
|
||||
volatile uint32_t *dst = arg->dst;
|
||||
volatile uint32_t *debug = arg->debug;
|
||||
volatile uint32_t *status = arg->status;
|
||||
|
||||
const uint32_t tmem_a = 0x000;
|
||||
const uint32_t tmem_c = 0x400;
|
||||
const uint32_t tmem_st_scratch = 0x800;
|
||||
volatile uint32_t *smem_b_ptr =
|
||||
reinterpret_cast<volatile uint32_t *>(DEV_SMEM_START_ADDR);
|
||||
const uint32_t smem_b = reinterpret_cast<uint32_t>(smem_b_ptr);
|
||||
const uint32_t expected = 0x42820000u; // 65.0f
|
||||
const uint32_t expected_st = 0x3f800000u; // 1.0f
|
||||
|
||||
if (status != nullptr)
|
||||
status[0] = 0x100u;
|
||||
|
||||
for (int i = 0; i < 256; ++i)
|
||||
smem_b_ptr[i] = b[i];
|
||||
|
||||
for (int frag = 0; frag < 32; ++frag) {
|
||||
const uint32_t offset = static_cast<uint32_t>(frag * 32);
|
||||
tcgen05_cp(tmem_a + offset,
|
||||
reinterpret_cast<uint32_t>(&a[frag * 8]));
|
||||
tcgen05_cp(tmem_c + offset,
|
||||
reinterpret_cast<uint32_t>(&c[frag * 8]));
|
||||
}
|
||||
tcgen05_cp_wait();
|
||||
|
||||
const float st_value = 1.0f;
|
||||
tcgen05_st_f32(tmem_st_scratch, st_value);
|
||||
const uint32_t st_bits = f32_bits(tcgen05_ld_f32(tmem_st_scratch));
|
||||
debug[0] = st_bits;
|
||||
if (st_bits != expected_st) {
|
||||
if (status != nullptr)
|
||||
status[0] = 0xe002u;
|
||||
return;
|
||||
}
|
||||
|
||||
bwgmma(tmem_c, tmem_a, smem_b);
|
||||
bwgmma_wait();
|
||||
|
||||
const float ld_value = tcgen05_ld_f32(tmem_c);
|
||||
const uint32_t ld_bits = f32_bits(ld_value);
|
||||
debug[1] = ld_bits;
|
||||
tcgen05_st_f32(tmem_st_scratch + 32, ld_value);
|
||||
|
||||
for (int frag = 0; frag < 32; ++frag) {
|
||||
tcgen05_cb(tmem_c + static_cast<uint32_t>(frag * 32),
|
||||
reinterpret_cast<uint32_t>(&dst[frag * 8]));
|
||||
}
|
||||
tcgen05_cp_wait();
|
||||
|
||||
if (ld_bits != expected) {
|
||||
if (status != nullptr)
|
||||
status[0] = 0xe001u;
|
||||
return;
|
||||
}
|
||||
|
||||
for (int i = 0; i < 256; ++i) {
|
||||
if (dst[i] != expected) {
|
||||
if (status != nullptr)
|
||||
status[0] = 0xe100u | static_cast<uint32_t>(i & 0xff);
|
||||
debug[2] = static_cast<uint32_t>(i);
|
||||
debug[3] = dst[i];
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (status != nullptr)
|
||||
status[0] = 0x600du;
|
||||
}
|
||||
|
||||
int main() {
|
||||
kernel_arg_t arg = {g_a, g_b, g_c, g_dst, g_debug, &g_status};
|
||||
|
||||
vx_spawn_tasks_contiguous(1, reinterpret_cast<vx_spawn_tasks_cb>(kernel_body),
|
||||
&arg);
|
||||
return (g_status == 0x600du) ? 0 : 1;
|
||||
}
|
||||
@@ -6,7 +6,7 @@
|
||||
#define KERNEL_ARG_DEV_MEM_ADDR 0x9fff0000
|
||||
#define DEV_SMEM_START_ADDR 0xff000000
|
||||
|
||||
typedef struct {
|
||||
typedef struct __attribute__((packed)) {
|
||||
uint32_t dim_m;
|
||||
uint32_t dim_n;
|
||||
uint32_t dim_k;
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
PROJECT = sgemm_tcore_blackwell
|
||||
|
||||
VX_SRCS = ../sgemm_tcore/kernel.cpp
|
||||
VX_INCLUDES = ../sgemm_tcore/sgemm_impl.hpp ../sgemm_tcore/common.h
|
||||
|
||||
OPTS ?= -n16
|
||||
|
||||
include ../common.mk
|
||||
@@ -184,10 +184,10 @@
|
||||
#define VX_CSR_CORE_ID 0xCC2
|
||||
#define VX_CSR_WARP_MASK 0xCC3
|
||||
#define VX_CSR_THREAD_MASK 0xCC4 // warning! this value is also used in LLVM
|
||||
#define VX_CSR_GCID 0xCC5 // legacy global core id alias used by Radiance bootrom
|
||||
|
||||
#define VX_CSR_NUM_THREADS 0xFC0
|
||||
#define VX_CSR_NUM_WARPS 0xFC1
|
||||
#define VX_CSR_NUM_CORES 0xFC2
|
||||
|
||||
#endif // VX_TYPES_VH
|
||||
|
||||
|
||||
@@ -7,6 +7,15 @@ OUTPUT_FORMAT("elf64-littleriscv", "elf64-littleriscv",
|
||||
"elf64-littleriscv")
|
||||
OUTPUT_ARCH(riscv)
|
||||
ENTRY(_start)
|
||||
|
||||
MEMORY {
|
||||
DRAM0 (rwx): ORIGIN = 0x80000000, LENGTH = 512M
|
||||
DRAMARG (rwx): ORIGIN = 0x9fff0000, LENGTH = 8K
|
||||
DRAM1 (rwx): ORIGIN = 0xa0000000, LENGTH = 16M
|
||||
DRAM2 (rwx): ORIGIN = 0xa1000000, LENGTH = 16M
|
||||
DRAM3 (rwx): ORIGIN = 0xa2000000, LENGTH = 16M
|
||||
}
|
||||
|
||||
SECTIONS
|
||||
{
|
||||
. = STARTUP_ADDR;
|
||||
@@ -249,4 +258,20 @@ SECTIONS
|
||||
.gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
|
||||
/DISCARD/ : { *(.note.GNU-stack) *(.gnu_debuglink) *(.gnu.lto_*) }
|
||||
|
||||
.args : {
|
||||
*(.args)
|
||||
. += 8K;
|
||||
} > DRAMARG
|
||||
.operand.a : {
|
||||
*(.operand.a)
|
||||
. += 32K;
|
||||
} > DRAM1
|
||||
.operand.b : {
|
||||
*(.operand.b)
|
||||
. += 32K;
|
||||
} > DRAM2
|
||||
.operand.c : {
|
||||
*(.operand.c)
|
||||
. += 32K;
|
||||
} > DRAM3
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user