13 Commits

Author SHA1 Message Date
Virgo-AE Eval
0884ba6fcb Merge branch 'ae' into ae-hopper 2025-02-07 14:52:27 -08:00
Richard Yan
fd2fe71ca1 Merge branch 'ae' into ae-hopper 2025-01-31 03:53:00 -08:00
Richard Yan
8d71815809 Merge branch 'ae' into ae-hopper 2025-01-30 23:40:48 -08:00
Richard Yan
63f476eb83 Merge branch 'ae' into ae-hopper 2025-01-30 15:34:58 -08:00
Hansung Kim
0711f5f7a3 Merge branch 'ae' into ae-hopper 2025-01-30 13:24:50 -08:00
Hansung Kim
97227577b5 Merge branch 'ae' into ae-hopper 2025-01-30 01:48:09 -08:00
Richard Yan
3cd6aacc17 Merge branch 'ae' into ae-hopper 2025-01-30 01:35:10 -08:00
Hansung Kim
e4f8f3481c Merge branch 'ae' into ae-hopper 2025-01-30 01:05:31 -08:00
Hansung Kim
c7f713c71e Merge branch 'ae' into ae-hopper 2025-01-30 00:49:23 -08:00
Hansung Kim
b06e345706 Merge branch 'ae' into ae-hopper 2025-01-30 00:35:10 -08:00
Hansung Kim
8a635b5fcb Set TENSOR_HOPPER to 1, add missing markers 2025-01-30 00:34:13 -08:00
Richard Yan
f23b2a3fcc Merge branch 'ae' into ae-hopper 2025-01-29 23:31:21 -08:00
Richard Yan
ac34a8f5f5 hopper changes 2025-01-29 22:22:34 -08:00
46 changed files with 9 additions and 1019 deletions

5
.gitignore vendored
View File

@@ -1,5 +0,0 @@
**/*.o
.codex
**/*.elf
**/*.dump
**/*.a

View File

@@ -1,7 +0,0 @@
PROJECT = blackwell_insts
VX_SRCS = kernel.cpp
OPTS ?= -n1
include ../common.mk

View File

@@ -1 +0,0 @@
0

View File

@@ -1 +0,0 @@
0

View File

@@ -1 +0,0 @@
0

View File

@@ -1 +0,0 @@
0

View File

@@ -1,192 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#define DEV_SMEM_START_ADDR 0xff000000u
#define BW_REP2(x) x, x
#define BW_REP4(x) BW_REP2(x), BW_REP2(x)
#define BW_REP8(x) BW_REP4(x), BW_REP4(x)
#define BW_REP16(x) BW_REP8(x), BW_REP8(x)
#define BW_REP32(x) BW_REP16(x), BW_REP16(x)
#define BW_REP64(x) BW_REP32(x), BW_REP32(x)
#define BW_REP128(x) BW_REP64(x), BW_REP64(x)
#define BW_REP256(x) BW_REP128(x), BW_REP128(x)
static volatile uint32_t g_a[256] __attribute__((aligned(32))) = {
BW_REP256(0x3c003c00u)}; // two fp16 1.0 values
static volatile uint32_t g_b[256] __attribute__((aligned(32))) = {
BW_REP256(0x40004000u)}; // two fp16 2.0 values
static volatile uint32_t g_c[256] __attribute__((aligned(32))) = {
BW_REP256(0x3f800000u)}; // one fp32 1.0 value
static volatile uint32_t g_dst[256] __attribute__((aligned(32)));
static volatile uint32_t g_debug[16] __attribute__((aligned(32)));
static volatile uint32_t g_status __attribute__((aligned(4)));
#undef BW_REP2
#undef BW_REP4
#undef BW_REP8
#undef BW_REP16
#undef BW_REP32
#undef BW_REP64
#undef BW_REP128
#undef BW_REP256
struct kernel_arg_t {
volatile uint32_t *a;
volatile uint32_t *b;
volatile uint32_t *c;
volatile uint32_t *dst;
volatile uint32_t *debug;
volatile uint32_t *status;
};
static inline void tcgen05_cp(uint32_t addr_tmem, uint32_t addr_gmem) {
asm volatile(".insn r %0, 2, 0, x0, %1, %2"
:
: "i"(RISCV_CUSTOM3), "r"(addr_tmem), "r"(addr_gmem)
: "memory");
}
static inline void tcgen05_cb(uint32_t addr_tmem, uint32_t addr_gmem) {
asm volatile(".insn r %0, 6, 0, x0, %1, %2"
:
: "i"(RISCV_CUSTOM3), "r"(addr_tmem), "r"(addr_gmem)
: "memory");
}
static inline void tcgen05_cp_wait() {
asm volatile(".insn r %0, 3, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM3)
: "memory");
}
static inline void bwgmma(uint32_t addr_tmem_c,
uint32_t addr_tmem_a,
uint32_t addr_smem_b) {
asm volatile(".insn r %0, 0, 0, %1, %2, %3"
:
: "i"(RISCV_CUSTOM3), "r"(addr_tmem_c), "r"(addr_tmem_a),
"r"(addr_smem_b)
: "memory");
}
static inline void bwgmma_wait() {
asm volatile(".insn r %0, 1, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM3)
: "memory");
}
static inline float tcgen05_ld_f32(uint32_t addr_tmem) {
float value;
asm volatile(".insn r %1, 4, 0, %0, %2, x0"
: "=f"(value)
: "i"(RISCV_CUSTOM3), "r"(addr_tmem)
: "memory");
return value;
}
static inline void tcgen05_st_f32(uint32_t addr_tmem, float value) {
asm volatile(".insn r %0, 5, 0, %1, %2, x0"
:
: "i"(RISCV_CUSTOM3), "f"(value), "r"(addr_tmem)
: "memory");
}
static inline uint32_t f32_bits(float value) {
union {
float f;
uint32_t u;
} bits = {value};
return bits.u;
}
extern "C" void vx_perf_dump() {}
void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg)
__attribute__((convergent));
void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
if (task_id != 0)
return;
volatile uint32_t *a = arg->a;
volatile uint32_t *b = arg->b;
volatile uint32_t *c = arg->c;
volatile uint32_t *dst = arg->dst;
volatile uint32_t *debug = arg->debug;
volatile uint32_t *status = arg->status;
const uint32_t tmem_a = 0x000;
const uint32_t tmem_c = 0x400;
const uint32_t tmem_st_scratch = 0x800;
volatile uint32_t *smem_b_ptr =
reinterpret_cast<volatile uint32_t *>(DEV_SMEM_START_ADDR);
const uint32_t smem_b = reinterpret_cast<uint32_t>(smem_b_ptr);
const uint32_t expected = 0x42820000u; // 65.0f
const uint32_t expected_st = 0x3f800000u; // 1.0f
if (status != nullptr)
status[0] = 0x100u;
for (int i = 0; i < 256; ++i)
smem_b_ptr[i] = b[i];
for (int frag = 0; frag < 32; ++frag) {
const uint32_t offset = static_cast<uint32_t>(frag * 32);
tcgen05_cp(tmem_a + offset,
reinterpret_cast<uint32_t>(&a[frag * 8]));
tcgen05_cp(tmem_c + offset,
reinterpret_cast<uint32_t>(&c[frag * 8]));
}
tcgen05_cp_wait();
const float st_value = 1.0f;
tcgen05_st_f32(tmem_st_scratch, st_value);
const uint32_t st_bits = f32_bits(tcgen05_ld_f32(tmem_st_scratch));
debug[0] = st_bits;
if (st_bits != expected_st) {
if (status != nullptr)
status[0] = 0xe002u;
return;
}
bwgmma(tmem_c, tmem_a, smem_b);
bwgmma_wait();
const float ld_value = tcgen05_ld_f32(tmem_c);
const uint32_t ld_bits = f32_bits(ld_value);
debug[1] = ld_bits;
tcgen05_st_f32(tmem_st_scratch + 32, ld_value);
for (int frag = 0; frag < 32; ++frag) {
tcgen05_cb(tmem_c + static_cast<uint32_t>(frag * 32),
reinterpret_cast<uint32_t>(&dst[frag * 8]));
}
tcgen05_cp_wait();
if (ld_bits != expected) {
if (status != nullptr)
status[0] = 0xe001u;
return;
}
for (int i = 0; i < 256; ++i) {
if (dst[i] != expected) {
if (status != nullptr)
status[0] = 0xe100u | static_cast<uint32_t>(i & 0xff);
debug[2] = static_cast<uint32_t>(i);
debug[3] = dst[i];
return;
}
}
if (status != nullptr)
status[0] = 0x600du;
}
int main() {
kernel_arg_t arg = {g_a, g_b, g_c, g_dst, g_debug, &g_status};
vx_spawn_tasks_contiguous(1, reinterpret_cast<vx_spawn_tasks_cb>(kernel_body),
&arg);
return (g_status == 0x600du) ? 0 : 1;
}

View File

@@ -6,7 +6,7 @@
#define KERNEL_ARG_DEV_MEM_ADDR 0x9fff0000 #define KERNEL_ARG_DEV_MEM_ADDR 0x9fff0000
#define DEV_SMEM_START_ADDR 0xff000000 #define DEV_SMEM_START_ADDR 0xff000000
typedef struct __attribute__((packed)) { typedef struct {
uint32_t dim_m; uint32_t dim_m;
uint32_t dim_n; uint32_t dim_n;
uint32_t dim_k; uint32_t dim_k;

View File

@@ -95,6 +95,8 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
constexpr uint32_t quartile = (128 << 10) >> 2; // 128KB / 4 constexpr uint32_t quartile = (128 << 10) >> 2; // 128KB / 4
static_assert((quartile * 4) == SMEM_SIZE, "wrong quartile constant"); static_assert((quartile * 4) == SMEM_SIZE, "wrong quartile constant");
MARK_BEG();
constexpr uint32_t smem_a_offset = 0; constexpr uint32_t smem_a_offset = 0;
constexpr uint32_t smem_a_dbuf_offset = 1 * quartile; constexpr uint32_t smem_a_dbuf_offset = 1 * quartile;
constexpr uint32_t smem_b_offset = constexpr uint32_t smem_b_offset =
@@ -119,6 +121,8 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
threadblocks_per_cluster, threadblock_id_in_cluster, threadblocks_per_cluster, threadblock_id_in_cluster,
sharedmem_per_threadblock); sharedmem_per_threadblock);
MARK_END();
float *gmem_tmp_d0 = reinterpret_cast<float *>(0xd0000000UL); float *gmem_tmp_d0 = reinterpret_cast<float *>(0xd0000000UL);
float *gmem_tmp_d1 = reinterpret_cast<float *>(0xd1000000UL); float *gmem_tmp_d1 = reinterpret_cast<float *>(0xd1000000UL);
float *gmem_tmp_d2 = reinterpret_cast<float *>(0xd2000000UL); float *gmem_tmp_d2 = reinterpret_cast<float *>(0xd2000000UL);

View File

@@ -19,7 +19,7 @@ using float_type = float16_t;
// Generate kernel for the Hopper-style SMEM-decoupled tensor core. This uses // Generate kernel for the Hopper-style SMEM-decoupled tensor core. This uses
// asynchronous HGMMA and HGMMA_WAIT instructions. // asynchronous HGMMA and HGMMA_WAIT instructions.
#define TENSOR_HOPPER 0 #define TENSOR_HOPPER 1
// Constraints on parameters: // Constraints on parameters:
// * Memory: // * Memory:
@@ -267,34 +267,6 @@ inline void vx_wgmma_wait() {
asm volatile (".insn r %0, 1, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM3)); asm volatile (".insn r %0, 1, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM3));
} }
inline void vx_tcgen05_cp(const uint32_t addr_tmem, const uint32_t addr_smem) {
asm volatile(".insn r %0, 2, 0, x0, %1, %2" ::"i"(RISCV_CUSTOM3), "r"(addr_tmem),
"r"(addr_smem));
}
inline void vx_tcgen05_cp_wait() {
asm volatile (".insn r %0, 3, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM3));
}
inline void vx_bwgmma(const uint32_t addr_tmem_a, const uint32_t addr_smem_b) {
asm volatile(".insn r %0, 0, 0, x0, %1, %2" ::"i"(RISCV_CUSTOM3), "r"(addr_tmem_a),
"r"(addr_smem_b));
}
inline void vx_bwgmma_wait() {
asm volatile (".insn r %0, 1, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM3));
}
inline void vx_tcgen05_ld(const uint32_t addr_tmem, const uint32_t rd_hint) {
asm volatile(".insn r %0, 4, 0, %1, %2, x0" ::"i"(RISCV_CUSTOM3), "r"(rd_hint),
"r"(addr_tmem));
}
inline void vx_tcgen05_st(const uint32_t addr_tmem, const uint32_t rd_hint) {
asm volatile(".insn r %0, 5, 0, %1, %2, x0" ::"i"(RISCV_CUSTOM3), "r"(rd_hint),
"r"(addr_tmem));
}
// Remap logical row/col coordinate of a matrix element to a memory index that // Remap logical row/col coordinate of a matrix element to a memory index that
// follows the 2-level block-row-major layout that Gemmini DMA uses // follows the 2-level block-row-major layout that Gemmini DMA uses
template <bool use_dma, uint32_t dim_col> template <bool use_dma, uint32_t dim_col>

View File

@@ -1,40 +0,0 @@
CASES := \
case00_boot_scalar \
case01_scalar_spawn \
case02_tensor_spawn_stop \
case03_dual_fetch_issue \
case04_scalar_barrier \
case05_tensor_barrier \
case06_masked_barrier \
case07_tensor_csr_tmc \
case08_tensor_lsu_optional
SMOKE_CASES := \
case00_boot_scalar \
case01_scalar_spawn \
case02_tensor_spawn_stop \
case03_dual_fetch_issue
BARRIER_CASES := \
case04_scalar_barrier \
case05_tensor_barrier \
case06_masked_barrier
.PHONY: all smoke barriers full clean clean-all $(CASES)
all: full
smoke: $(SMOKE_CASES)
barriers: $(BARRIER_CASES)
full: $(CASES)
$(CASES):
$(MAKE) -C $@
clean:
set -e; for dir in $(CASES); do $(MAKE) -C $$dir clean; done
clean-all:
set -e; for dir in $(CASES); do $(MAKE) -C $$dir clean-all; done

View File

@@ -1,48 +0,0 @@
# Wu Architecture Staged Cases
This directory contains small bare-metal kernels for incremental Wu architecture testing. The original `kernels/wu_arch` kernel is useful as an integrated test, but it combines scalar spawning, tensor spawning, barriers, tensor control, and memory behavior in one large workload. These cases isolate those surfaces so failures can be reproduced faster under Verilator.
## Case List
- `case00_boot_scalar`: minimal scalar boot, status writes, and pass marker.
- `case01_scalar_spawn`: scalar warp spawning without tensor warps or barriers.
- `case02_tensor_spawn_stop`: tensor warp spawn, marker store, and stop.
- `case03_dual_fetch_issue`: scalar and tensor warps active together to exercise split scheduling and issue.
- `case04_scalar_barrier`: scalar-domain barrier release.
- `case05_tensor_barrier`: tensor-domain barrier through tensor control.
- `case06_masked_barrier`: explicit mixed `BAR_MASK` with scalar warp 0 and tensor warps.
- `case07_tensor_csr_tmc`: tensor CSR/TMC path without barrier behavior.
- `case08_tensor_lsu_optional`: tensor LSU store/load marker path; keep last because memory interaction is broader and slower.
Each case has its own `README.md` describing the test objective, RTL surface, and expected pass marker.
## Build
Use the suite Makefile from this directory:
```sh
make smoke -j4 LLVM_VORTEX=/home/hexu/dse/wu/virgo-artifact-full/toolchain/llvm-vortex2 RISCV_TOOLCHAIN_PATH=/home/hexu/dse/wu/virgo-artifact-full/chipyard/.conda-env/riscv-tools RISCV_PREFIX=riscv64-unknown-elf
make barriers -j4 LLVM_VORTEX=/home/hexu/dse/wu/virgo-artifact-full/toolchain/llvm-vortex2 RISCV_TOOLCHAIN_PATH=/home/hexu/dse/wu/virgo-artifact-full/chipyard/.conda-env/riscv-tools RISCV_PREFIX=riscv64-unknown-elf
make full -j4 LLVM_VORTEX=/home/hexu/dse/wu/virgo-artifact-full/toolchain/llvm-vortex2 RISCV_TOOLCHAIN_PATH=/home/hexu/dse/wu/virgo-artifact-full/chipyard/.conda-env/riscv-tools RISCV_PREFIX=riscv64-unknown-elf
```
`smoke` builds the boot/spawn/dual-issue cases. `barriers` builds the barrier-focused cases. `full` builds all cases.
## Verilator Run Notes
For RTL simulation, use the same simulator setup as the main Virgo artifact, but run these ELFs one at a time:
- `VM_PARALLEL_BUILDS=1`
- `LOADMEM=1`, so `SimDRAM::memory_init()` preloads the ELF instead of relying on slow runtime SimTSI writes.
- `CCACHE_DIR=/tmp/ccache` when ccache is enabled in the sandbox.
- Use `/home/hexu/dse/firtool-1.62.0` for firtool and `/usr/local/bin/verilator` for Verilator.
- Keep system `gcc/g++` on `PATH`; do not use the `gcc/g++` injected by `chipyard/env.sh`.
- For generated Verilator C++ compilation, prefer `-O0 -fno-inline` to reduce compile time.
## Cleanup
```sh
make clean-all
```
This removes kernel ELF/dump outputs and the generated placeholder input blobs in each case directory.

View File

@@ -1,17 +0,0 @@
PROJECT ?= wu_arch_case
VX_SRCS = kernel.cpp
VX_CFLAGS += -I..
VORTEX_KN_PATH ?= $(realpath ../../../lib)
GEMMINI_SW_PATH ?= $(realpath ../../../lib/gemmini)
OPTS ?= -n1
include ../../common.mk
args.bin input.a.bin input.b.bin input.c.bin: ../zero.bin
cp $< $@
clean-all: clean-wu-case-inputs
.PHONY: clean-wu-case-inputs
clean-wu-case-inputs:
rm -f args.bin input.a.bin input.b.bin input.c.bin

View File

@@ -1,3 +0,0 @@
PROJECT = wu_arch_case00_boot_scalar
include ../case.mk

View File

@@ -1,15 +0,0 @@
# Case 00: Boot Scalar
## Test Objective
Verify the minimal Wu bare-metal entry path: core 0, warp 0, thread 0 reaches `wu_main`, can write the shared status arrays, and can terminate with `WU_CASE_PASS`.
## RTL Surface Covered
- Minimal `_start` path from `common_wu_min.h`
- Scalar warp 0 fetch/decode/issue
- Scalar ALU/store path for status writes
## Expected Result
`g_status[0] == WU_CASE_PASS` and `g_seen[0] == WU_CASE_SCALAR_BASE`.

View File

@@ -1,13 +0,0 @@
#include "common_wu_min.h"
extern "C" int wu_main() {
if (!wu_is_leader()) {
return 0;
}
wu_case_reset();
g_seen[0] = WU_CASE_SCALAR_BASE;
g_aux[0] = static_cast<uint32_t>(vx_num_warps());
wu_case_pass();
return 0;
}

View File

@@ -1,3 +0,0 @@
PROJECT = wu_arch_case01_scalar_spawn
include ../case.mk

View File

@@ -1,16 +0,0 @@
# Case 01: Scalar Spawn
## Test Objective
Verify scalar-domain warp spawning without tensor warps, barriers, or shared tensor resources.
## RTL Surface Covered
- Scalar scheduler output
- Scalar fetch/decode/issue path
- Scalar `WSPAWN` mask path
- Scalar store path for per-warp completion markers
## Expected Result
Every scalar warp writes `WU_CASE_SCALAR_BASE | wid` to `g_seen[wid]`, and warp 0 writes `WU_CASE_PASS` to `g_status[0]`.

View File

@@ -1,29 +0,0 @@
#include "common_wu_min.h"
extern "C" void scalar_worker() {
wu_short_delay(wu_wid());
wu_mark_seen(WU_CASE_SCALAR_BASE);
wu_stop_warp();
}
extern "C" int wu_main() {
if (!wu_is_leader()) {
return 0;
}
wu_case_reset();
const uint32_t spawn_mask = wu_scalar_mask_without_warp0();
if (spawn_mask != 0) {
vx_spawn_scalar(spawn_mask, scalar_worker);
}
wu_mark_seen(WU_CASE_SCALAR_BASE);
if (wu_wait_seen_range(0, NUM_SCALAR_WARPS, WU_CASE_SCALAR_BASE) != 0) {
wu_case_fail(0x01u);
return 1;
}
wu_case_pass();
return 0;
}

View File

@@ -1,3 +0,0 @@
PROJECT = wu_arch_case02_tensor_spawn_stop
include ../case.mk

View File

@@ -1,16 +0,0 @@
# Case 02: Tensor Spawn Stop
## Test Objective
Verify tensor warps can be spawned, scheduled, issued, write a completion marker, and stop without barriers or tensor LSU stress.
## RTL Surface Covered
- Tensor scheduler output
- Tensor fetch/decode/issue path
- Tensor ALU/store for a minimal marker
- Tensor-domain stop via `TMC zero`
## Expected Result
Every tensor warp writes `WU_CASE_TENSOR_BASE | wid` to `g_seen[wid]`, and warp 0 writes `WU_CASE_PASS` to `g_status[0]`.

View File

@@ -1,36 +0,0 @@
#include "common_wu_min.h"
extern "C" void __attribute__((naked, noinline, used)) tensor_worker() {
asm volatile(
"csrr x5, %[csr_wid]\n\t"
"slli x6, x5, 2\n\t"
"la x7, g_seen\n\t"
"add x7, x7, x6\n\t"
"li x6, %[tensor_base]\n\t"
"or x6, x6, x5\n\t"
"sw x6, 0(x7)\n\t"
".insn r %[custom0], 0, 0, x0, x0, x0\n\t"
"1: j 1b\n\t"
:
: [csr_wid] "i"(VX_CSR_WARP_ID),
[custom0] "i"(RISCV_CUSTOM0),
[tensor_base] "i"(WU_CASE_TENSOR_BASE)
: "memory");
}
extern "C" int wu_main() {
if (!wu_is_leader()) {
return 0;
}
wu_case_reset();
vx_spawn_tensor(vx_tensor_warp_mask(), tensor_worker);
if (wu_wait_seen_range(NUM_SCALAR_WARPS, NUM_WARPS, WU_CASE_TENSOR_BASE) != 0) {
wu_case_fail(0x02u);
return 1;
}
wu_case_pass();
return 0;
}

View File

@@ -1,3 +0,0 @@
PROJECT = wu_arch_case03_dual_fetch_issue
include ../case.mk

View File

@@ -1,16 +0,0 @@
# Case 03: Dual Fetch Issue
## Test Objective
Verify scalar and tensor domains can be active together and both make forward progress through the shared fetch path and split issue paths.
## RTL Surface Covered
- Shared fetch arbitration between scalar and tensor schedule streams
- Scalar decode/issue domain
- Tensor decode/issue domain
- Completion markers from both warp classes
## Expected Result
All scalar warps write `WU_CASE_SCALAR_BASE | wid`, all tensor warps write `WU_CASE_TENSOR_BASE | wid`, and warp 0 writes `WU_CASE_PASS`.

View File

@@ -1,59 +0,0 @@
#include "common_wu_min.h"
extern "C" void scalar_worker() {
wu_short_delay(wu_wid());
wu_mark_seen(WU_CASE_SCALAR_BASE);
wu_stop_warp();
}
extern "C" void __attribute__((naked, noinline, used)) tensor_worker() {
asm volatile(
"csrr x5, %[csr_wid]\n\t"
"li x6, %[spin]\n\t"
"1:\n\t"
"addi x6, x6, -1\n\t"
"bnez x6, 1b\n\t"
"slli x6, x5, 2\n\t"
"la x7, g_seen\n\t"
"add x7, x7, x6\n\t"
"li x6, %[tensor_base]\n\t"
"or x6, x6, x5\n\t"
"sw x6, 0(x7)\n\t"
".insn r %[custom0], 0, 0, x0, x0, x0\n\t"
"2: j 2b\n\t"
:
: [csr_wid] "i"(VX_CSR_WARP_ID),
[custom0] "i"(RISCV_CUSTOM0),
[spin] "i"(WU_CASE_SHORT_SPIN),
[tensor_base] "i"(WU_CASE_TENSOR_BASE)
: "memory");
}
extern "C" int wu_main() {
if (!wu_is_leader()) {
return 0;
}
wu_case_reset();
const uint32_t scalar_mask = wu_scalar_mask_without_warp0();
if (scalar_mask != 0) {
vx_spawn_scalar(scalar_mask, scalar_worker);
}
vx_spawn_tensor(vx_tensor_warp_mask(), tensor_worker);
wu_short_delay(0);
wu_mark_seen(WU_CASE_SCALAR_BASE);
if (wu_wait_seen_range(0, NUM_SCALAR_WARPS, WU_CASE_SCALAR_BASE) != 0) {
wu_case_fail(0x31u);
return 1;
}
if (wu_wait_seen_range(NUM_SCALAR_WARPS, NUM_WARPS, WU_CASE_TENSOR_BASE) != 0) {
wu_case_fail(0x32u);
return 1;
}
wu_case_pass();
return 0;
}

View File

@@ -1,3 +0,0 @@
PROJECT = wu_arch_case04_scalar_barrier
include ../case.mk

View File

@@ -1,15 +0,0 @@
# Case 04: Scalar Barrier
## Test Objective
Verify scalar-domain `BAR` synchronizes only scalar warps and releases them correctly.
## RTL Surface Covered
- Scalar WCTL barrier path
- Scheduler scalar barrier mask handling
- Scalar wakeup after barrier release
## Expected Result
All scalar warps pass `vx_barrier_scalar`, write `WU_CASE_SCALAR_BASE | wid`, and warp 0 writes `WU_CASE_PASS`.

View File

@@ -1,33 +0,0 @@
#include "common_wu_min.h"
#define CASE04_BARRIER_ID 0u
extern "C" void scalar_worker() {
vx_barrier_scalar(CASE04_BARRIER_ID, NUM_SCALAR_WARPS);
wu_mark_seen(WU_CASE_SCALAR_BASE);
wu_stop_warp();
}
extern "C" int wu_main() {
if (!wu_is_leader()) {
return 0;
}
wu_case_reset();
const uint32_t scalar_mask = wu_scalar_mask_without_warp0();
if (scalar_mask != 0) {
vx_spawn_scalar(scalar_mask, scalar_worker);
}
vx_barrier_scalar(CASE04_BARRIER_ID, NUM_SCALAR_WARPS);
wu_mark_seen(WU_CASE_SCALAR_BASE);
if (wu_wait_seen_range(0, NUM_SCALAR_WARPS, WU_CASE_SCALAR_BASE) != 0) {
wu_case_fail(0x04u);
return 1;
}
wu_case_pass();
return 0;
}

View File

@@ -1,3 +0,0 @@
PROJECT = wu_arch_case05_tensor_barrier
include ../case.mk

View File

@@ -1,15 +0,0 @@
# Case 05: Tensor Barrier
## Test Objective
Verify tensor-domain `BAR` is handled by tensor control and releases tensor warps without relying on scalar SFU dispatch.
## RTL Surface Covered
- Tensor control barrier decode
- Tensor warp-control merge into scheduler
- Scheduler tensor-domain barrier mask handling
## Expected Result
All tensor warps pass the tensor-domain barrier, write `WU_CASE_TENSOR_BASE | wid`, and warp 0 writes `WU_CASE_PASS`.

View File

@@ -1,45 +0,0 @@
#include "common_wu_min.h"
#define CASE05_BARRIER_ID 1u
extern "C" void __attribute__((naked, noinline, used)) tensor_worker() {
asm volatile(
"csrr x5, %[csr_wid]\n\t"
"li x1, (%[bar_id] | (%[domain_tensor] << %[domain_shift]))\n\t"
"li x2, %[num_tensor]\n\t"
".insn r %[custom0], 4, 0, x0, x1, x2\n\t"
"slli x6, x5, 2\n\t"
"la x7, g_seen\n\t"
"add x7, x7, x6\n\t"
"li x6, %[tensor_base]\n\t"
"or x6, x6, x5\n\t"
"sw x6, 0(x7)\n\t"
".insn r %[custom0], 0, 0, x0, x0, x0\n\t"
"1: j 1b\n\t"
:
: [csr_wid] "i"(VX_CSR_WARP_ID),
[custom0] "i"(RISCV_CUSTOM0),
[bar_id] "i"(CASE05_BARRIER_ID),
[domain_tensor] "i"(VX_BARRIER_DOMAIN_TENSOR),
[domain_shift] "i"(VX_BARRIER_DOMAIN_SHIFT),
[num_tensor] "i"(NUM_TENSOR_WARPS),
[tensor_base] "i"(WU_CASE_TENSOR_BASE)
: "memory");
}
extern "C" int wu_main() {
if (!wu_is_leader()) {
return 0;
}
wu_case_reset();
vx_spawn_tensor(vx_tensor_warp_mask(), tensor_worker);
if (wu_wait_seen_range(NUM_SCALAR_WARPS, NUM_WARPS, WU_CASE_TENSOR_BASE) != 0) {
wu_case_fail(0x05u);
return 1;
}
wu_case_pass();
return 0;
}

View File

@@ -1,3 +0,0 @@
PROJECT = wu_arch_case06_masked_barrier
include ../case.mk

View File

@@ -1,16 +0,0 @@
# Case 06: Masked Barrier
## Test Objective
Verify `BAR_MASK` can synchronize an explicit mixed mask containing scalar warp 0 and all tensor warps.
## RTL Surface Covered
- Scalar-side masked barrier issue
- Tensor-side masked barrier issue through tensor control
- Scheduler explicit barrier mask release
- Scalar/tensor warp-control merge when both domains participate in one barrier
## Expected Result
Scalar warp 0 and all tensor warps pass the same `BAR_MASK`; tensor warps write `WU_CASE_TENSOR_BASE | wid`, and warp 0 writes `WU_CASE_PASS`.

View File

@@ -1,47 +0,0 @@
#include "common_wu_min.h"
#define CASE06_BARRIER_ID 2u
#define CASE06_BARRIER_MASK (vx_tensor_warp_mask() | 1u)
extern "C" void __attribute__((naked, noinline, used)) tensor_worker() {
asm volatile(
"csrr x5, %[csr_wid]\n\t"
"li x1, %[bar_id]\n\t"
"li x2, %[mask]\n\t"
".insn r %[custom0], 7, 0, x0, x1, x2\n\t"
"slli x6, x5, 2\n\t"
"la x7, g_seen\n\t"
"add x7, x7, x6\n\t"
"li x6, %[tensor_base]\n\t"
"or x6, x6, x5\n\t"
"sw x6, 0(x7)\n\t"
".insn r %[custom0], 0, 0, x0, x0, x0\n\t"
"1: j 1b\n\t"
:
: [csr_wid] "i"(VX_CSR_WARP_ID),
[custom0] "i"(RISCV_CUSTOM0),
[bar_id] "i"(CASE06_BARRIER_ID),
[mask] "i"(((1u << NUM_TENSOR_WARPS) - 1u) << NUM_SCALAR_WARPS | 1u),
[tensor_base] "i"(WU_CASE_TENSOR_BASE)
: "memory");
}
extern "C" int wu_main() {
if (!wu_is_leader()) {
return 0;
}
wu_case_reset();
vx_spawn_tensor(vx_tensor_warp_mask(), tensor_worker);
vx_barrier_mask(CASE06_BARRIER_ID, CASE06_BARRIER_MASK);
wu_mark_seen(WU_CASE_SCALAR_BASE);
if (wu_wait_seen_mask(vx_tensor_warp_mask(), WU_CASE_TENSOR_BASE) != 0) {
wu_case_fail(0x06u);
return 1;
}
wu_case_pass();
return 0;
}

View File

@@ -1,3 +0,0 @@
PROJECT = wu_arch_case07_tensor_csr_tmc
include ../case.mk

View File

@@ -1,15 +0,0 @@
# Case 07: Tensor CSR TMC
## Test Objective
Verify tensor control handles legal tensor-domain CSR reads and `TMC` operations without involving barrier behavior.
## RTL Surface Covered
- Tensor CSRRS path for `VX_CSR_WARP_ID`
- Tensor TMC path setting a single active lane
- Tensor control completion and tensor-domain stop
## Expected Result
Every tensor warp writes `WU_CASE_TENSOR_CSR_BASE | wid` to `g_seen[wid]`, and warp 0 writes `WU_CASE_PASS`.

View File

@@ -1,38 +0,0 @@
#include "common_wu_min.h"
extern "C" void __attribute__((naked, noinline, used)) tensor_worker() {
asm volatile(
"csrr x5, %[csr_wid]\n\t"
"li x6, 1\n\t"
".insn r %[custom0], 0, 0, x0, x6, x0\n\t"
"slli x6, x5, 2\n\t"
"la x7, g_seen\n\t"
"add x7, x7, x6\n\t"
"li x6, %[tensor_csr_base]\n\t"
"or x6, x6, x5\n\t"
"sw x6, 0(x7)\n\t"
".insn r %[custom0], 0, 0, x0, x0, x0\n\t"
"1: j 1b\n\t"
:
: [csr_wid] "i"(VX_CSR_WARP_ID),
[custom0] "i"(RISCV_CUSTOM0),
[tensor_csr_base] "i"(WU_CASE_TENSOR_CSR_BASE)
: "memory");
}
extern "C" int wu_main() {
if (!wu_is_leader()) {
return 0;
}
wu_case_reset();
vx_spawn_tensor(vx_tensor_warp_mask(), tensor_worker);
if (wu_wait_seen_range(NUM_SCALAR_WARPS, NUM_WARPS, WU_CASE_TENSOR_CSR_BASE) != 0) {
wu_case_fail(0x07u);
return 1;
}
wu_case_pass();
return 0;
}

View File

@@ -1,3 +0,0 @@
PROJECT = wu_arch_case08_tensor_lsu_optional
include ../case.mk

View File

@@ -1,15 +0,0 @@
# Case 08: Tensor LSU Optional
## Test Objective
Verify tensor-domain LSU can store and reload a small per-warp value. This is intentionally last because memory hierarchy interaction is slower and has a larger debug surface.
## RTL Surface Covered
- Tensor LSU dispatch
- Tensor LSU response/writeback path
- Shared memory hierarchy merge after tensor-domain issue
## Expected Result
Every tensor warp stores and reloads `WU_CASE_TENSOR_LSU_BASE | wid`, writes that value to `g_seen[wid]`, and warp 0 writes `WU_CASE_PASS`.

View File

@@ -1,40 +0,0 @@
#include "common_wu_min.h"
extern "C" void __attribute__((naked, noinline, used)) tensor_worker() {
asm volatile(
"csrr x5, %[csr_wid]\n\t"
"slli x6, x5, 2\n\t"
"la x7, g_case_mem\n\t"
"add x7, x7, x6\n\t"
"li x8, %[tensor_lsu_base]\n\t"
"or x8, x8, x5\n\t"
"sw x8, 0(x7)\n\t"
"lw x8, 0(x7)\n\t"
"la x7, g_seen\n\t"
"add x7, x7, x6\n\t"
"sw x8, 0(x7)\n\t"
".insn r %[custom0], 0, 0, x0, x0, x0\n\t"
"1: j 1b\n\t"
:
: [csr_wid] "i"(VX_CSR_WARP_ID),
[custom0] "i"(RISCV_CUSTOM0),
[tensor_lsu_base] "i"(WU_CASE_TENSOR_LSU_BASE)
: "memory");
}
extern "C" int wu_main() {
if (!wu_is_leader()) {
return 0;
}
wu_case_reset();
vx_spawn_tensor(vx_tensor_warp_mask(), tensor_worker);
if (wu_wait_seen_range(NUM_SCALAR_WARPS, NUM_WARPS, WU_CASE_TENSOR_LSU_BASE) != 0) {
wu_case_fail(0x08u);
return 1;
}
wu_case_pass();
return 0;
}

View File

@@ -1,141 +0,0 @@
#ifndef WU_ARCH_CASES_COMMON_WU_MIN_H
#define WU_ARCH_CASES_COMMON_WU_MIN_H
#include <stdint.h>
#include <vx_intrinsics.h>
#define WU_CASE_MAX_WARPS 8u
#define WU_CASE_WAIT_SPIN 1024u
#define WU_CASE_SHORT_SPIN 8u
#define WU_CASE_PASS 0x600du
#define WU_CASE_FAIL_BASE 0xe000u
#define WU_CASE_SCALAR_BASE 0x5100u
#define WU_CASE_TENSOR_BASE 0x7100u
#define WU_CASE_TENSOR_CSR_BASE 0x7300u
#define WU_CASE_TENSOR_LSU_BASE 0x7400u
extern "C" {
volatile uint32_t g_status[WU_CASE_MAX_WARPS] __attribute__((aligned(32)));
volatile uint32_t g_seen[WU_CASE_MAX_WARPS] __attribute__((aligned(32)));
volatile uint32_t g_aux[WU_CASE_MAX_WARPS] __attribute__((aligned(32)));
volatile uint32_t g_case_mem[WU_CASE_MAX_WARPS] __attribute__((aligned(32)));
extern volatile uint64_t tohost;
}
extern "C" void vx_perf_dump() {}
extern "C" int wu_main();
extern "C" void __attribute__((naked, section(".init"), used)) _start() {
asm volatile(
".option push\n\t"
".option norelax\n\t"
"la gp, __global_pointer\n\t"
".option pop\n\t"
"csrr t0, %[csr_core]\n\t"
"bnez t0, 2f\n\t"
"li sp, %[stack_base]\n\t"
"call wu_main\n\t"
"mv gp, a0\n\t"
"2:\n\t"
".insn r %[custom0], 0, 0, x0, x0, x0\n\t"
"1: j 1b\n\t"
:
: [csr_core] "i"(VX_CSR_CORE_ID),
[stack_base] "i"(STACK_BASE_ADDR),
[custom0] "i"(RISCV_CUSTOM0)
: "memory");
}
static inline uint32_t wu_wid() {
return static_cast<uint32_t>(vx_warp_id());
}
static inline uint32_t wu_tid() {
return static_cast<uint32_t>(vx_thread_id());
}
static inline uint32_t wu_scalar_mask_without_warp0() {
return vx_scalar_warp_mask() & ~1u;
}
static inline void wu_case_reset() {
for (uint32_t i = 0; i < WU_CASE_MAX_WARPS; ++i) {
g_status[i] = 0;
g_seen[i] = 0;
g_aux[i] = 0;
g_case_mem[i] = 0;
}
}
static inline void wu_short_delay(uint32_t seed) {
volatile uint32_t value = seed + 1u;
for (uint32_t i = 0; i < WU_CASE_SHORT_SPIN; ++i) {
value = (value << 1) ^ (seed + i);
}
if (wu_wid() < WU_CASE_MAX_WARPS && wu_tid() == 0) {
g_aux[wu_wid()] = value;
}
}
static inline int wu_wait_seen_range(uint32_t first_wid, uint32_t end_wid, uint32_t base) {
for (uint32_t spin = 0; spin < WU_CASE_WAIT_SPIN; ++spin) {
uint32_t done = 1;
for (uint32_t wid = first_wid; wid < end_wid; ++wid) {
done &= (g_seen[wid] == (base | wid));
}
if (done) {
return 0;
}
}
return 1;
}
static inline int wu_wait_seen_mask(uint32_t warp_mask, uint32_t base) {
for (uint32_t spin = 0; spin < WU_CASE_WAIT_SPIN; ++spin) {
uint32_t done = 1;
for (uint32_t wid = 0; wid < WU_CASE_MAX_WARPS; ++wid) {
if (warp_mask & (1u << wid)) {
done &= (g_seen[wid] == (base | wid));
}
}
if (done) {
return 0;
}
}
return 1;
}
static inline void wu_mark_seen(uint32_t base) {
const uint32_t wid = wu_wid();
if (wu_tid() == 0 && wid < WU_CASE_MAX_WARPS) {
g_seen[wid] = base | wid;
}
}
static inline void wu_stop_warp() {
vx_tmc_zero();
while (1) {}
}
static inline int wu_is_leader() {
return vx_core_id() == 0 && vx_warp_id() == 0 && vx_thread_id() == 0;
}
static inline void wu_report_tohost(uint32_t exit_code) {
asm volatile("fence rw, rw" ::: "memory");
tohost = (static_cast<uint64_t>(exit_code) << 1) | 1u;
asm volatile("fence rw, rw" ::: "memory");
}
static inline void wu_case_pass() {
g_status[0] = WU_CASE_PASS;
wu_report_tohost(0);
}
static inline void wu_case_fail(uint32_t code) {
g_status[0] = WU_CASE_FAIL_BASE | code;
wu_report_tohost(code ? code : 1u);
}
#endif

View File

@@ -1 +0,0 @@
0

View File

@@ -84,7 +84,7 @@
#endif #endif
#ifndef NUM_CORES #ifndef NUM_CORES
#define NUM_CORES 8 #define NUM_CORES 4
#endif #endif
#ifndef NUM_WARPS #ifndef NUM_WARPS

View File

@@ -184,10 +184,10 @@
#define VX_CSR_CORE_ID 0xCC2 #define VX_CSR_CORE_ID 0xCC2
#define VX_CSR_WARP_MASK 0xCC3 #define VX_CSR_WARP_MASK 0xCC3
#define VX_CSR_THREAD_MASK 0xCC4 // warning! this value is also used in LLVM #define VX_CSR_THREAD_MASK 0xCC4 // warning! this value is also used in LLVM
#define VX_CSR_GCID 0xCC5 // legacy global core id alias used by Radiance bootrom
#define VX_CSR_NUM_THREADS 0xFC0 #define VX_CSR_NUM_THREADS 0xFC0
#define VX_CSR_NUM_WARPS 0xFC1 #define VX_CSR_NUM_WARPS 0xFC1
#define VX_CSR_NUM_CORES 0xFC2 #define VX_CSR_NUM_CORES 0xFC2
#endif // VX_TYPES_VH #endif // VX_TYPES_VH

View File

@@ -18,7 +18,7 @@
#include <stdio.h> #include <stdio.h>
#ifndef CORES_PER_CLUSTER #ifndef CORES_PER_CLUSTER
#define CORES_PER_CLUSTER 8 #define CORES_PER_CLUSTER 4
#endif #endif
#ifdef __cplusplus #ifdef __cplusplus

View File

@@ -7,15 +7,6 @@ OUTPUT_FORMAT("elf64-littleriscv", "elf64-littleriscv",
"elf64-littleriscv") "elf64-littleriscv")
OUTPUT_ARCH(riscv) OUTPUT_ARCH(riscv)
ENTRY(_start) ENTRY(_start)
MEMORY {
DRAM0 (rwx): ORIGIN = 0x80000000, LENGTH = 512M
DRAMARG (rwx): ORIGIN = 0x9fff0000, LENGTH = 8K
DRAM1 (rwx): ORIGIN = 0xa0000000, LENGTH = 16M
DRAM2 (rwx): ORIGIN = 0xa1000000, LENGTH = 16M
DRAM3 (rwx): ORIGIN = 0xa2000000, LENGTH = 16M
}
SECTIONS SECTIONS
{ {
. = STARTUP_ADDR; . = STARTUP_ADDR;
@@ -258,20 +249,4 @@ SECTIONS
.gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
/DISCARD/ : { *(.note.GNU-stack) *(.gnu_debuglink) *(.gnu.lto_*) } /DISCARD/ : { *(.note.GNU-stack) *(.gnu_debuglink) *(.gnu.lto_*) }
.args : {
*(.args)
. += 8K;
} > DRAMARG
.operand.a : {
*(.operand.a)
. += 32K;
} > DRAM1
.operand.b : {
*(.operand.b)
. += 32K;
} > DRAM2
.operand.c : {
*(.operand.c)
. += 32K;
} > DRAM3
} }