15 Commits

Author SHA1 Message Date
e7229dae27 Checkpoint wu arch cases before scalar spawn wrapper 2026-05-24 10:51:58 +08:00
8f7dba5920 Add Blackwell instructions test kernel and update linker script
- Add kernels/blackwell_insts/ with test kernel and input data
- Update linker script with extended memory layout
- Remove obsolete sgemm_tcore_blackwell Makefile
- Update VX_types.h and common.h
2026-05-06 14:50:28 +08:00
bcc566b621 Add Blackwell SGEMM kernel scaffolding 2026-04-25 10:15:31 +08:00
Virgo-AE Eval
71f713b9fc Disable git pull for archive
Only use local refs in the archive for reproducibility.
2025-02-07 14:51:25 -08:00
Richard Yan
9847072eff fix hexadecile 2025-01-31 02:02:18 -08:00
Richard Yan
f8c51669c1 fix toolchain env sh 2025-01-30 21:17:12 -08:00
Richard Yan
17a9d31be5 fix dma invocation 2025-01-30 15:33:58 -08:00
Hansung Kim
238b942133 Add missing library remake 2025-01-30 13:24:23 -08:00
Hansung Kim
2c1ac4e938 Do git pull to make sure up-to-date 2025-01-30 01:47:35 -08:00
Richard Yan
9cdee597b6 Merge branch 'ae' of https://github.com/richardyrh/virgo-kernels into ae 2025-01-30 01:34:29 -08:00
Hansung Kim
6bdc6af607 Fix branch name and dims for flash script 2025-01-30 01:15:57 -08:00
Hansung Kim
b73147cd06 Add compile and operand generate script for flash 2025-01-30 01:04:20 -08:00
Hansung Kim
471f89e371 Add arg binary for flash 2025-01-30 01:02:12 -08:00
Hansung Kim
7e1fc54c97 Fix typo in path 2025-01-30 00:41:42 -08:00
Hansung Kim
50c8f1c410 Add operand generate script for tcore 2025-01-29 23:33:09 -08:00
54 changed files with 1376 additions and 19 deletions

5
.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
**/*.o
.codex
**/*.elf
**/*.dump
**/*.a

View File

@@ -0,0 +1,7 @@
PROJECT = blackwell_insts
VX_SRCS = kernel.cpp
OPTS ?= -n1
include ../common.mk

View File

@@ -0,0 +1 @@
0

View File

@@ -0,0 +1 @@
0

View File

@@ -0,0 +1 @@
0

View File

@@ -0,0 +1 @@
0

View File

@@ -0,0 +1,192 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#define DEV_SMEM_START_ADDR 0xff000000u
#define BW_REP2(x) x, x
#define BW_REP4(x) BW_REP2(x), BW_REP2(x)
#define BW_REP8(x) BW_REP4(x), BW_REP4(x)
#define BW_REP16(x) BW_REP8(x), BW_REP8(x)
#define BW_REP32(x) BW_REP16(x), BW_REP16(x)
#define BW_REP64(x) BW_REP32(x), BW_REP32(x)
#define BW_REP128(x) BW_REP64(x), BW_REP64(x)
#define BW_REP256(x) BW_REP128(x), BW_REP128(x)
static volatile uint32_t g_a[256] __attribute__((aligned(32))) = {
BW_REP256(0x3c003c00u)}; // two fp16 1.0 values
static volatile uint32_t g_b[256] __attribute__((aligned(32))) = {
BW_REP256(0x40004000u)}; // two fp16 2.0 values
static volatile uint32_t g_c[256] __attribute__((aligned(32))) = {
BW_REP256(0x3f800000u)}; // one fp32 1.0 value
static volatile uint32_t g_dst[256] __attribute__((aligned(32)));
static volatile uint32_t g_debug[16] __attribute__((aligned(32)));
static volatile uint32_t g_status __attribute__((aligned(4)));
#undef BW_REP2
#undef BW_REP4
#undef BW_REP8
#undef BW_REP16
#undef BW_REP32
#undef BW_REP64
#undef BW_REP128
#undef BW_REP256
struct kernel_arg_t {
volatile uint32_t *a;
volatile uint32_t *b;
volatile uint32_t *c;
volatile uint32_t *dst;
volatile uint32_t *debug;
volatile uint32_t *status;
};
static inline void tcgen05_cp(uint32_t addr_tmem, uint32_t addr_gmem) {
asm volatile(".insn r %0, 2, 0, x0, %1, %2"
:
: "i"(RISCV_CUSTOM3), "r"(addr_tmem), "r"(addr_gmem)
: "memory");
}
static inline void tcgen05_cb(uint32_t addr_tmem, uint32_t addr_gmem) {
asm volatile(".insn r %0, 6, 0, x0, %1, %2"
:
: "i"(RISCV_CUSTOM3), "r"(addr_tmem), "r"(addr_gmem)
: "memory");
}
static inline void tcgen05_cp_wait() {
asm volatile(".insn r %0, 3, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM3)
: "memory");
}
static inline void bwgmma(uint32_t addr_tmem_c,
uint32_t addr_tmem_a,
uint32_t addr_smem_b) {
asm volatile(".insn r %0, 0, 0, %1, %2, %3"
:
: "i"(RISCV_CUSTOM3), "r"(addr_tmem_c), "r"(addr_tmem_a),
"r"(addr_smem_b)
: "memory");
}
static inline void bwgmma_wait() {
asm volatile(".insn r %0, 1, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM3)
: "memory");
}
static inline float tcgen05_ld_f32(uint32_t addr_tmem) {
float value;
asm volatile(".insn r %1, 4, 0, %0, %2, x0"
: "=f"(value)
: "i"(RISCV_CUSTOM3), "r"(addr_tmem)
: "memory");
return value;
}
static inline void tcgen05_st_f32(uint32_t addr_tmem, float value) {
asm volatile(".insn r %0, 5, 0, %1, %2, x0"
:
: "i"(RISCV_CUSTOM3), "f"(value), "r"(addr_tmem)
: "memory");
}
static inline uint32_t f32_bits(float value) {
union {
float f;
uint32_t u;
} bits = {value};
return bits.u;
}
extern "C" void vx_perf_dump() {}
void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg)
__attribute__((convergent));
void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
if (task_id != 0)
return;
volatile uint32_t *a = arg->a;
volatile uint32_t *b = arg->b;
volatile uint32_t *c = arg->c;
volatile uint32_t *dst = arg->dst;
volatile uint32_t *debug = arg->debug;
volatile uint32_t *status = arg->status;
const uint32_t tmem_a = 0x000;
const uint32_t tmem_c = 0x400;
const uint32_t tmem_st_scratch = 0x800;
volatile uint32_t *smem_b_ptr =
reinterpret_cast<volatile uint32_t *>(DEV_SMEM_START_ADDR);
const uint32_t smem_b = reinterpret_cast<uint32_t>(smem_b_ptr);
const uint32_t expected = 0x42820000u; // 65.0f
const uint32_t expected_st = 0x3f800000u; // 1.0f
if (status != nullptr)
status[0] = 0x100u;
for (int i = 0; i < 256; ++i)
smem_b_ptr[i] = b[i];
for (int frag = 0; frag < 32; ++frag) {
const uint32_t offset = static_cast<uint32_t>(frag * 32);
tcgen05_cp(tmem_a + offset,
reinterpret_cast<uint32_t>(&a[frag * 8]));
tcgen05_cp(tmem_c + offset,
reinterpret_cast<uint32_t>(&c[frag * 8]));
}
tcgen05_cp_wait();
const float st_value = 1.0f;
tcgen05_st_f32(tmem_st_scratch, st_value);
const uint32_t st_bits = f32_bits(tcgen05_ld_f32(tmem_st_scratch));
debug[0] = st_bits;
if (st_bits != expected_st) {
if (status != nullptr)
status[0] = 0xe002u;
return;
}
bwgmma(tmem_c, tmem_a, smem_b);
bwgmma_wait();
const float ld_value = tcgen05_ld_f32(tmem_c);
const uint32_t ld_bits = f32_bits(ld_value);
debug[1] = ld_bits;
tcgen05_st_f32(tmem_st_scratch + 32, ld_value);
for (int frag = 0; frag < 32; ++frag) {
tcgen05_cb(tmem_c + static_cast<uint32_t>(frag * 32),
reinterpret_cast<uint32_t>(&dst[frag * 8]));
}
tcgen05_cp_wait();
if (ld_bits != expected) {
if (status != nullptr)
status[0] = 0xe001u;
return;
}
for (int i = 0; i < 256; ++i) {
if (dst[i] != expected) {
if (status != nullptr)
status[0] = 0xe100u | static_cast<uint32_t>(i & 0xff);
debug[2] = static_cast<uint32_t>(i);
debug[3] = dst[i];
return;
}
}
if (status != nullptr)
status[0] = 0x600du;
}
int main() {
kernel_arg_t arg = {g_a, g_b, g_c, g_dst, g_debug, &g_status};
vx_spawn_tasks_contiguous(1, reinterpret_cast<vx_spawn_tasks_cb>(kernel_body),
&arg);
return (g_status == 0x600du) ? 0 : 1;
}

View File

@@ -0,0 +1 @@
args.seq1024.headdim64.bin

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,45 @@
#!/bin/bash
archs=("ampere" "virgo")
if [ -z "$TOOLDIR" ]; then
echo "error: \$TOOLDIR not set. Did you run source ci/toolchain_env.sh?"
exit 1
fi
check_exists() {
if ! [ -f "$1" ]; then
echo "error: looked for file $1 that does not exist."
exit 1
fi
}
# generate operands
echo "generating flash_attn operands for seqlen 1024, headdim 64"
python3 flash_attn.py 1024 64 64
mv -v input.a.col.bin input.a.rand.fp32.seqlen1024headdim64.col.bin
mv -v input.a.row.bin input.a.rand.fp32.seqlen1024headdim64.row.bin
mv -v input.b.bin input.b.rand.fp32.seqlen1024headdim64.row.bin
mv -v input.c.bin input.c.rand.fp32.seqlen1024headdim64.row.bin
ln -sf input.a.rand.fp32.seqlen1024headdim64.row.bin input.a.bin
ln -sf input.b.rand.fp32.seqlen1024headdim64.row.bin input.b.bin
ln -sf input.c.rand.fp32.seqlen1024headdim64.row.bin input.c.bin
for arch in "${archs[@]}"; do
git checkout ae-flash-$arch
# git pull
# re-compile libvortexrt.a
pushd ../../lib
make
popd
echo "compiling flash_attn kernel for $arch with seqlen 1024, headdim 64"
# touch source file to force re-building, as the Makefile does not track
# binary changes
touch kernel.cpp
touch kernel.gemmini.cpp
make CONFIG=flash.$arch.seqlen1024.headdim64
done

View File

@@ -0,0 +1,159 @@
import sys
import numpy as np
def parse_mnk():
if len(sys.argv) != 4:
print(f"usage: {sys.argv[0]} dimM dimN dimK", file=sys.stderr)
sys.exit(1)
m = int(sys.argv[1])
n = int(sys.argv[2])
k = int(sys.argv[3])
return (m, n, k)
# Reorder array in a way that groups two adjacent elements along the column to
# be now adjacent along the row. This way, when the resulting fp16 array is
# read in column-major order with 32-bit granularity, the fp16 elements will be
# read in the same order as regular fp32 elements in column-major.
#
# For example:
# [[1 2]
# [3 4]
# [5 6]
# [7 8]]
# becomes
# [[1 3 2 4]
# [5 7 6 8]]
def pack_fp16_by_column(array):
rows = array.shape[0]
cols = array.shape[1]
T = array.transpose([1, 0])
T_packed = T.reshape([cols, -1, 2])
result = T_packed.transpose([1, 0, 2])
return result
# Do the same as pack_fp16_by_column, but for every two elements along the row.
def pack_fp16_by_row(array):
rows = array.shape[0]
cols = array.shape[1]
result = array.reshape([rows, -1, 2])
return result
if __name__ == "__main__":
seqlen, _, headdim = parse_mnk()
rand = True
if not rand:
A_array = np.arange(seqlen * headdim).reshape([seqlen, headdim])
B_array = np.arange(headdim * seqlen).reshape([headdim, seqlen])
C_array = np.arange(seqlen * seqlen).reshape([seqlen, headdim])
else:
np.random.seed(0)
A_array = np.random.rand(seqlen, headdim) - 0.5
B_array = np.random.rand(headdim, seqlen) - 0.5
C_array = np.random.rand(seqlen, headdim) - 0.5
# C_array = np.zeros([M, N])
fp16 = False
if fp16:
A_packed = pack_fp16_by_row(A_array)
AT_packed = A_packed.transpose([1, 0, 2])
AT_array = AT_packed.reshape([-1, seqlen * 2])
AT_array.astype('float16').tofile("input.a.col.bin")
# print('AT:')
# print(AT_array)
B_packed = pack_fp16_by_column(B_array)
B_array = B_packed.reshape([-1, headdim * 2])
B_array.astype('float16').tofile("input.b.row.bin")
# print('B:')
# print(B_array)
else:
A_array.astype('float32').tofile("input.a.row.bin")
AT_array = A_array.transpose([1, 0])
AT_array.astype('float32').tofile("input.a.col.bin")
B_array.astype('float32').tofile("input.b.bin")
C_array.astype('float32').tofile("input.c.bin")
# print('AT:')
# print(AT_array)
# print('B:')
# print(B_array)
assert((seqlen % 64) == 0)
Br = 64
Bc = Br
rowmax = np.zeros([Br])
rowsum = np.zeros([Br])
O = np.zeros([Br, headdim])
def exp2(x):
return (x**2) / 2.0 + x + 1.0
full_S = A_array @ B_array
full_S_T = full_S.transpose([1, 0])
full_S.astype('float32').tofile("full_S.bin")
col_to_save = 0
for col in range(0, seqlen, Bc):
print(f"tile iteration {col}~{col + Bc} ======================================")
# FIXME: only work with the first 64 rows of Q for now
Q_tile = A_array[0:64, :]
K_tile = B_array[:, col:col+Bc]
S = Q_tile @ K_tile
if col == col_to_save:
print('S_expected:')
print(S)
S.astype('float32').tofile("S_expected.bin")
# generate rowmax result in online softmax
rowmax_this = np.max(S, axis=1)
rowmax_prev = rowmax.copy()
rowmax = np.maximum(rowmax, rowmax_this)
if col == col_to_save:
rowmax.astype('float32').tofile("rowmax.bin")
# subtrace rowmax from each row by broadcasting
# (placeholder for exp)
x = S - rowmax[:, np.newaxis]
P = exp2(x)
# for i in range(3, 4):
# P += (x**i) / np.math.factorial(i)
# P = np.exp(exp)
# print('P error:')
# print(P / np.exp(x))
if col == col_to_save:
print('P_expected:')
print(P)
P.astype('float32').tofile("P_expected.bin")
P.transpose([1, 0]).astype('float32').tofile("P_expected.col.bin")
rowsum_this = np.sum(P, axis=1)
x = rowmax_prev - rowmax_this
rowsum = exp2(x) * rowsum + rowsum_this
if col == col_to_save:
rowsum.astype('float32').tofile("rowsum.bin")
x = rowmax_prev - rowmax
O = O / (exp2(x)[:, np.newaxis])
if col == col_to_save:
print('O_before_PV:')
print(O)
O.astype('float32').tofile("O_before_PV.bin")
V = C_array[col:col+Bc, :]
if col == col_to_save:
V.astype('float32').tofile("V_expected.bin")
# O = P.transpose([1, 0]) @ V
O = O + P @ V
if col == col_to_save:
print('O_after_PV:')
print(O)
O.astype('float32').tofile("O_after_PV.bin")

View File

@@ -1,5 +1,14 @@
#!/bin/sh
# hopper and virgo has the same SIMT configurations
git checkout ae-hopper
# git pull
# re-compile libvortexrt.a
pushd ../../lib
make
popd
if [ ! -f input.a.rand01.fp16.m256n256k256.row.bin ]; then
echo "input binaries not found, generating operands"
python3 generate_operands.py

View File

@@ -6,7 +6,7 @@
#define KERNEL_ARG_DEV_MEM_ADDR 0x9fff0000
#define DEV_SMEM_START_ADDR 0xff000000
typedef struct {
typedef struct __attribute__((packed)) {
uint32_t dim_m;
uint32_t dim_n;
uint32_t dim_k;

View File

@@ -41,12 +41,22 @@ check_exists() {
fi
}
# generate operands
for dim in "${dims[@]}"; do
echo "generating operands for dim $dim"
python3 generate_operands.py $dim $dim $dim
mv -v input.a.col.bin input.a.rand01.fp16.m${dim}n${dim}k${dim}.col.swizzle_fp16.bin
mv -v input.a.row.bin input.a.rand01.fp16.m${dim}n${dim}k${dim}.row.swizzle_fp16.bin
mv -v input.b.row.bin input.b.rand01.fp16.m${dim}n${dim}k${dim}.row.bin
mv -v input.b.row.swizzled.bin input.b.rand01.fp16.m${dim}n${dim}k${dim}.row.swizzle_fp16.bin
done
for arch in "${archs[@]}"; do
git checkout ae-$arch
# git pull
# re-compile libvortexrt.a
# FIXME after restructure
pushd ../../libs
pushd ../../lib
make
popd

View File

@@ -0,0 +1,116 @@
import sys
import numpy as np
def parse_mnk():
if len(sys.argv) != 4:
print(f"usage: {sys.argv[0]} dimM dimN dimK", file=sys.stderr)
sys.exit(1)
m = int(sys.argv[1])
n = int(sys.argv[2])
k = int(sys.argv[3])
return (m, n, k)
# Reorder array in a way that groups two adjacent elements along the column to
# be now adjacent along the row. This way, when the resulting fp16 array is
# read in column-major order with 32-bit granularity, the fp16 elements will be
# read in the same order as regular fp32 elements in column-major.
#
# For example:
# [[1 2]
# [3 4]
# [5 6]
# [7 8]]
# becomes
# [[1 3 2 4]
# [5 7 6 8]]
def pack_fp16_by_column(array):
rows = array.shape[0]
cols = array.shape[1]
T = array.transpose([1, 0])
T_packed = T.reshape([cols, -1, 2])
result = T_packed.transpose([1, 0, 2])
return result
# Do the same as pack_fp16_by_column, but for every two elements along the row.
def pack_fp16_by_row(array):
rows = array.shape[0]
cols = array.shape[1]
result = array.reshape([rows, -1, 2])
return result
if __name__ == "__main__":
M, N, K = parse_mnk()
rand = True
if not rand:
A_array = np.arange(M * K).reshape([M, K])
B_array = np.arange(K * N).reshape([K, N])
# C_array = np.arange(M * N).reshape([M, N])
C_array = np.zeros([M, N])
else:
np.random.seed(0)
A_array = np.random.rand(M, K)
B_array = np.random.rand(K, N)
C_array = np.random.rand(N, K)
# C_array = np.zeros([M, N])
with open('a_matrix.h', 'w') as f:
for i in range(A_array.shape[0]):
for j in range(A_array.shape[1]):
f.write(f'{A_array[i,j]:f}f, ')
f.write('\n')
with open('b_matrix.h', 'w') as f:
for i in range(B_array.shape[0]):
for j in range(B_array.shape[1]):
f.write(f'{B_array[i,j]:f}f, ')
f.write('\n')
with open('c_matrix.h', 'w') as f:
for i in range(C_array.shape[0]):
for j in range(C_array.shape[1]):
f.write(f'{C_array[i,j]:f}f, ')
f.write('\n')
np.savez("abc", A_array=A_array, B_array=B_array, C_array=C_array)
fp16 = True
if fp16:
A_packed = pack_fp16_by_row(A_array)
A_swizzled = A_packed.reshape([-1, M * 2])
A_swizzled.astype('float16').tofile("input.a.row.bin")
AT_packed = A_packed.transpose([1, 0, 2])
AT_swizzled = AT_packed.reshape([-1, M * 2])
AT_swizzled.astype('float16').tofile("input.a.col.bin")
print('A:')
print(A_swizzled)
print('AT:')
print(AT_swizzled)
B_array.astype('float16').tofile("input.b.row.bin")
# B_packed_row = pack_fp16_by_row(B_array)
# B_packed_row = B_packed_row.reshape([-1, N * 2])
# B_packed_row.astype('float16').tofile("input.b.row.bin")
B_packed = pack_fp16_by_column(B_array)
B_swizzled = B_packed.reshape([-1, N * 2])
B_swizzled.astype('float16').tofile("input.b.row.swizzled.bin")
print('B:')
print(B_swizzled)
else:
A_array.astype('float32').tofile("input.a.row.bin")
AT_array = A_array.transpose([1, 0])
AT_array.astype('float32').tofile("input.a.col.bin")
B_array.astype('float32').tofile("input.b.bin")
C_array.astype('float32').tofile("input.c.bin")
print('AT:')
print(AT_array)
print('B:')
print(B_array)
D_expected = A_array @ B_array
D_expected.astype('float32').tofile("d_expected.bin")
print('D_expected:')
print(D_expected)

View File

@@ -267,6 +267,34 @@ inline void vx_wgmma_wait() {
asm volatile (".insn r %0, 1, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM3));
}
inline void vx_tcgen05_cp(const uint32_t addr_tmem, const uint32_t addr_smem) {
asm volatile(".insn r %0, 2, 0, x0, %1, %2" ::"i"(RISCV_CUSTOM3), "r"(addr_tmem),
"r"(addr_smem));
}
inline void vx_tcgen05_cp_wait() {
asm volatile (".insn r %0, 3, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM3));
}
inline void vx_bwgmma(const uint32_t addr_tmem_a, const uint32_t addr_smem_b) {
asm volatile(".insn r %0, 0, 0, x0, %1, %2" ::"i"(RISCV_CUSTOM3), "r"(addr_tmem_a),
"r"(addr_smem_b));
}
inline void vx_bwgmma_wait() {
asm volatile (".insn r %0, 1, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM3));
}
inline void vx_tcgen05_ld(const uint32_t addr_tmem, const uint32_t rd_hint) {
asm volatile(".insn r %0, 4, 0, %1, %2, x0" ::"i"(RISCV_CUSTOM3), "r"(rd_hint),
"r"(addr_tmem));
}
inline void vx_tcgen05_st(const uint32_t addr_tmem, const uint32_t rd_hint) {
asm volatile(".insn r %0, 5, 0, %1, %2, x0" ::"i"(RISCV_CUSTOM3), "r"(rd_hint),
"r"(addr_tmem));
}
// Remap logical row/col coordinate of a matrix element to a memory index that
// follows the 2-level block-row-major layout that Gemmini DMA uses
template <bool use_dma, uint32_t dim_col>
@@ -1190,10 +1218,10 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
(uint64_t)(B + /*block_k:*/ 0 * BK * dim_n + block_n * BN),
k_LOOP_WS_CONFIG_ADDRS_AB)
// GEMMINI_CISC(8) does k_LOOP_WS_CONFIG_STRIDES_AB
GEMMINI_CISC_CMD_R((dim_n << 20) | (dim_k << 8) | 8);
GEMMINI_CISC_CMD_R((dim_n << 20) | (dim_k << 8) | GEMMINI_CISC_SET_AB_STRIDE);
gemmini_fence();
GEMMINI_CISC_CMD_I(10);
GEMMINI_CISC_CMD_R((11 << 16) | (0 << 8) | GEMMINI_CISC_LOAD_TO_HEXADECILES);
gemmini_fence();
#if 0
@@ -1257,7 +1285,7 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
k_LOOP_WS_CONFIG_ADDRS_AB)
// GEMMINI_CISC(8) does k_LOOP_WS_CONFIG_STRIDES_AB
GEMMINI_CISC_CMD_R((dim_n << 20) | (dim_k << 8) | 8);
gemmini_fence();
// gemmini_fence();
// block_k is even: opcode 11 (write to local_a_buf)
// block_k is odd: opcode 10 (write to local_a)
@@ -1266,8 +1294,9 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
// the last iteration of the k-loop is prefetching for the first
// iteration of the n-loop. The ping-poing indexing has to match for
// the two loop end to connect.
const uint32_t opcode = 11 - (block_k & 1);
GEMMINI_CISC_CMD_I(opcode);
const uint32_t a_hexadecile = 4 - ((block_k & 1) * 4);
const uint32_t b_hexadecile = a_hexadecile + 11;
GEMMINI_CISC_CMD_R((b_hexadecile << 16) | (a_hexadecile << 8) | GEMMINI_CISC_LOAD_TO_HEXADECILES);
// // TODO: branch is probably slow
// if (block_k & 1) {
// GEMMINI_CISC_CMD_I(12);

View File

@@ -0,0 +1,40 @@
CASES := \
case00_boot_scalar \
case01_scalar_spawn \
case02_tensor_spawn_stop \
case03_dual_fetch_issue \
case04_scalar_barrier \
case05_tensor_barrier \
case06_masked_barrier \
case07_tensor_csr_tmc \
case08_tensor_lsu_optional
SMOKE_CASES := \
case00_boot_scalar \
case01_scalar_spawn \
case02_tensor_spawn_stop \
case03_dual_fetch_issue
BARRIER_CASES := \
case04_scalar_barrier \
case05_tensor_barrier \
case06_masked_barrier
.PHONY: all smoke barriers full clean clean-all $(CASES)
all: full
smoke: $(SMOKE_CASES)
barriers: $(BARRIER_CASES)
full: $(CASES)
$(CASES):
$(MAKE) -C $@
clean:
set -e; for dir in $(CASES); do $(MAKE) -C $$dir clean; done
clean-all:
set -e; for dir in $(CASES); do $(MAKE) -C $$dir clean-all; done

View File

@@ -0,0 +1,48 @@
# Wu Architecture Staged Cases
This directory contains small bare-metal kernels for incremental Wu architecture testing. The original `kernels/wu_arch` kernel is useful as an integrated test, but it combines scalar spawning, tensor spawning, barriers, tensor control, and memory behavior in one large workload. These cases isolate those surfaces so failures can be reproduced faster under Verilator.
## Case List
- `case00_boot_scalar`: minimal scalar boot, status writes, and pass marker.
- `case01_scalar_spawn`: scalar warp spawning without tensor warps or barriers.
- `case02_tensor_spawn_stop`: tensor warp spawn, marker store, and stop.
- `case03_dual_fetch_issue`: scalar and tensor warps active together to exercise split scheduling and issue.
- `case04_scalar_barrier`: scalar-domain barrier release.
- `case05_tensor_barrier`: tensor-domain barrier through tensor control.
- `case06_masked_barrier`: explicit mixed `BAR_MASK` with scalar warp 0 and tensor warps.
- `case07_tensor_csr_tmc`: tensor CSR/TMC path without barrier behavior.
- `case08_tensor_lsu_optional`: tensor LSU store/load marker path; keep last because memory interaction is broader and slower.
Each case has its own `README.md` describing the test objective, RTL surface, and expected pass marker.
## Build
Use the suite Makefile from this directory:
```sh
make smoke -j4 LLVM_VORTEX=/home/hexu/dse/wu/virgo-artifact-full/toolchain/llvm-vortex2 RISCV_TOOLCHAIN_PATH=/home/hexu/dse/wu/virgo-artifact-full/chipyard/.conda-env/riscv-tools RISCV_PREFIX=riscv64-unknown-elf
make barriers -j4 LLVM_VORTEX=/home/hexu/dse/wu/virgo-artifact-full/toolchain/llvm-vortex2 RISCV_TOOLCHAIN_PATH=/home/hexu/dse/wu/virgo-artifact-full/chipyard/.conda-env/riscv-tools RISCV_PREFIX=riscv64-unknown-elf
make full -j4 LLVM_VORTEX=/home/hexu/dse/wu/virgo-artifact-full/toolchain/llvm-vortex2 RISCV_TOOLCHAIN_PATH=/home/hexu/dse/wu/virgo-artifact-full/chipyard/.conda-env/riscv-tools RISCV_PREFIX=riscv64-unknown-elf
```
`smoke` builds the boot/spawn/dual-issue cases. `barriers` builds the barrier-focused cases. `full` builds all cases.
## Verilator Run Notes
For RTL simulation, use the same simulator setup as the main Virgo artifact, but run these ELFs one at a time:
- `VM_PARALLEL_BUILDS=1`
- `LOADMEM=1`, so `SimDRAM::memory_init()` preloads the ELF instead of relying on slow runtime SimTSI writes.
- `CCACHE_DIR=/tmp/ccache` when ccache is enabled in the sandbox.
- Use `/home/hexu/dse/firtool-1.62.0` for firtool and `/usr/local/bin/verilator` for Verilator.
- Keep system `gcc/g++` on `PATH`; do not use the `gcc/g++` injected by `chipyard/env.sh`.
- For generated Verilator C++ compilation, prefer `-O0 -fno-inline` to reduce compile time.
## Cleanup
```sh
make clean-all
```
This removes kernel ELF/dump outputs and the generated placeholder input blobs in each case directory.

View File

@@ -0,0 +1,17 @@
PROJECT ?= wu_arch_case
VX_SRCS = kernel.cpp
VX_CFLAGS += -I..
VORTEX_KN_PATH ?= $(realpath ../../../lib)
GEMMINI_SW_PATH ?= $(realpath ../../../lib/gemmini)
OPTS ?= -n1
include ../../common.mk
args.bin input.a.bin input.b.bin input.c.bin: ../zero.bin
cp $< $@
clean-all: clean-wu-case-inputs
.PHONY: clean-wu-case-inputs
clean-wu-case-inputs:
rm -f args.bin input.a.bin input.b.bin input.c.bin

View File

@@ -0,0 +1,3 @@
PROJECT = wu_arch_case00_boot_scalar
include ../case.mk

View File

@@ -0,0 +1,15 @@
# Case 00: Boot Scalar
## Test Objective
Verify the minimal Wu bare-metal entry path: core 0, warp 0, thread 0 reaches `wu_main`, can write the shared status arrays, and can terminate with `WU_CASE_PASS`.
## RTL Surface Covered
- Minimal `_start` path from `common_wu_min.h`
- Scalar warp 0 fetch/decode/issue
- Scalar ALU/store path for status writes
## Expected Result
`g_status[0] == WU_CASE_PASS` and `g_seen[0] == WU_CASE_SCALAR_BASE`.

View File

@@ -0,0 +1,13 @@
#include "common_wu_min.h"
extern "C" int wu_main() {
if (!wu_is_leader()) {
return 0;
}
wu_case_reset();
g_seen[0] = WU_CASE_SCALAR_BASE;
g_aux[0] = static_cast<uint32_t>(vx_num_warps());
wu_case_pass();
return 0;
}

View File

@@ -0,0 +1,3 @@
PROJECT = wu_arch_case01_scalar_spawn
include ../case.mk

View File

@@ -0,0 +1,16 @@
# Case 01: Scalar Spawn
## Test Objective
Verify scalar-domain warp spawning without tensor warps, barriers, or shared tensor resources.
## RTL Surface Covered
- Scalar scheduler output
- Scalar fetch/decode/issue path
- Scalar `WSPAWN` mask path
- Scalar store path for per-warp completion markers
## Expected Result
Every scalar warp writes `WU_CASE_SCALAR_BASE | wid` to `g_seen[wid]`, and warp 0 writes `WU_CASE_PASS` to `g_status[0]`.

View File

@@ -0,0 +1,29 @@
#include "common_wu_min.h"
extern "C" void scalar_worker() {
wu_short_delay(wu_wid());
wu_mark_seen(WU_CASE_SCALAR_BASE);
wu_stop_warp();
}
extern "C" int wu_main() {
if (!wu_is_leader()) {
return 0;
}
wu_case_reset();
const uint32_t spawn_mask = wu_scalar_mask_without_warp0();
if (spawn_mask != 0) {
vx_spawn_scalar(spawn_mask, scalar_worker);
}
wu_mark_seen(WU_CASE_SCALAR_BASE);
if (wu_wait_seen_range(0, NUM_SCALAR_WARPS, WU_CASE_SCALAR_BASE) != 0) {
wu_case_fail(0x01u);
return 1;
}
wu_case_pass();
return 0;
}

View File

@@ -0,0 +1,3 @@
PROJECT = wu_arch_case02_tensor_spawn_stop
include ../case.mk

View File

@@ -0,0 +1,16 @@
# Case 02: Tensor Spawn Stop
## Test Objective
Verify tensor warps can be spawned, scheduled, issued, write a completion marker, and stop without barriers or tensor LSU stress.
## RTL Surface Covered
- Tensor scheduler output
- Tensor fetch/decode/issue path
- Tensor ALU/store for a minimal marker
- Tensor-domain stop via `TMC zero`
## Expected Result
Every tensor warp writes `WU_CASE_TENSOR_BASE | wid` to `g_seen[wid]`, and warp 0 writes `WU_CASE_PASS` to `g_status[0]`.

View File

@@ -0,0 +1,36 @@
#include "common_wu_min.h"
extern "C" void __attribute__((naked, noinline, used)) tensor_worker() {
asm volatile(
"csrr x5, %[csr_wid]\n\t"
"slli x6, x5, 2\n\t"
"la x7, g_seen\n\t"
"add x7, x7, x6\n\t"
"li x6, %[tensor_base]\n\t"
"or x6, x6, x5\n\t"
"sw x6, 0(x7)\n\t"
".insn r %[custom0], 0, 0, x0, x0, x0\n\t"
"1: j 1b\n\t"
:
: [csr_wid] "i"(VX_CSR_WARP_ID),
[custom0] "i"(RISCV_CUSTOM0),
[tensor_base] "i"(WU_CASE_TENSOR_BASE)
: "memory");
}
extern "C" int wu_main() {
if (!wu_is_leader()) {
return 0;
}
wu_case_reset();
vx_spawn_tensor(vx_tensor_warp_mask(), tensor_worker);
if (wu_wait_seen_range(NUM_SCALAR_WARPS, NUM_WARPS, WU_CASE_TENSOR_BASE) != 0) {
wu_case_fail(0x02u);
return 1;
}
wu_case_pass();
return 0;
}

View File

@@ -0,0 +1,3 @@
PROJECT = wu_arch_case03_dual_fetch_issue
include ../case.mk

View File

@@ -0,0 +1,16 @@
# Case 03: Dual Fetch Issue
## Test Objective
Verify scalar and tensor domains can be active together and both make forward progress through the shared fetch path and split issue paths.
## RTL Surface Covered
- Shared fetch arbitration between scalar and tensor schedule streams
- Scalar decode/issue domain
- Tensor decode/issue domain
- Completion markers from both warp classes
## Expected Result
All scalar warps write `WU_CASE_SCALAR_BASE | wid`, all tensor warps write `WU_CASE_TENSOR_BASE | wid`, and warp 0 writes `WU_CASE_PASS`.

View File

@@ -0,0 +1,59 @@
#include "common_wu_min.h"
extern "C" void scalar_worker() {
wu_short_delay(wu_wid());
wu_mark_seen(WU_CASE_SCALAR_BASE);
wu_stop_warp();
}
extern "C" void __attribute__((naked, noinline, used)) tensor_worker() {
asm volatile(
"csrr x5, %[csr_wid]\n\t"
"li x6, %[spin]\n\t"
"1:\n\t"
"addi x6, x6, -1\n\t"
"bnez x6, 1b\n\t"
"slli x6, x5, 2\n\t"
"la x7, g_seen\n\t"
"add x7, x7, x6\n\t"
"li x6, %[tensor_base]\n\t"
"or x6, x6, x5\n\t"
"sw x6, 0(x7)\n\t"
".insn r %[custom0], 0, 0, x0, x0, x0\n\t"
"2: j 2b\n\t"
:
: [csr_wid] "i"(VX_CSR_WARP_ID),
[custom0] "i"(RISCV_CUSTOM0),
[spin] "i"(WU_CASE_SHORT_SPIN),
[tensor_base] "i"(WU_CASE_TENSOR_BASE)
: "memory");
}
extern "C" int wu_main() {
if (!wu_is_leader()) {
return 0;
}
wu_case_reset();
const uint32_t scalar_mask = wu_scalar_mask_without_warp0();
if (scalar_mask != 0) {
vx_spawn_scalar(scalar_mask, scalar_worker);
}
vx_spawn_tensor(vx_tensor_warp_mask(), tensor_worker);
wu_short_delay(0);
wu_mark_seen(WU_CASE_SCALAR_BASE);
if (wu_wait_seen_range(0, NUM_SCALAR_WARPS, WU_CASE_SCALAR_BASE) != 0) {
wu_case_fail(0x31u);
return 1;
}
if (wu_wait_seen_range(NUM_SCALAR_WARPS, NUM_WARPS, WU_CASE_TENSOR_BASE) != 0) {
wu_case_fail(0x32u);
return 1;
}
wu_case_pass();
return 0;
}

View File

@@ -0,0 +1,3 @@
PROJECT = wu_arch_case04_scalar_barrier
include ../case.mk

View File

@@ -0,0 +1,15 @@
# Case 04: Scalar Barrier
## Test Objective
Verify scalar-domain `BAR` synchronizes only scalar warps and releases them correctly.
## RTL Surface Covered
- Scalar WCTL barrier path
- Scheduler scalar barrier mask handling
- Scalar wakeup after barrier release
## Expected Result
All scalar warps pass `vx_barrier_scalar`, write `WU_CASE_SCALAR_BASE | wid`, and warp 0 writes `WU_CASE_PASS`.

View File

@@ -0,0 +1,33 @@
#include "common_wu_min.h"
#define CASE04_BARRIER_ID 0u
extern "C" void scalar_worker() {
vx_barrier_scalar(CASE04_BARRIER_ID, NUM_SCALAR_WARPS);
wu_mark_seen(WU_CASE_SCALAR_BASE);
wu_stop_warp();
}
extern "C" int wu_main() {
if (!wu_is_leader()) {
return 0;
}
wu_case_reset();
const uint32_t scalar_mask = wu_scalar_mask_without_warp0();
if (scalar_mask != 0) {
vx_spawn_scalar(scalar_mask, scalar_worker);
}
vx_barrier_scalar(CASE04_BARRIER_ID, NUM_SCALAR_WARPS);
wu_mark_seen(WU_CASE_SCALAR_BASE);
if (wu_wait_seen_range(0, NUM_SCALAR_WARPS, WU_CASE_SCALAR_BASE) != 0) {
wu_case_fail(0x04u);
return 1;
}
wu_case_pass();
return 0;
}

View File

@@ -0,0 +1,3 @@
PROJECT = wu_arch_case05_tensor_barrier
include ../case.mk

View File

@@ -0,0 +1,15 @@
# Case 05: Tensor Barrier
## Test Objective
Verify tensor-domain `BAR` is handled by tensor control and releases tensor warps without relying on scalar SFU dispatch.
## RTL Surface Covered
- Tensor control barrier decode
- Tensor warp-control merge into scheduler
- Scheduler tensor-domain barrier mask handling
## Expected Result
All tensor warps pass the tensor-domain barrier, write `WU_CASE_TENSOR_BASE | wid`, and warp 0 writes `WU_CASE_PASS`.

View File

@@ -0,0 +1,45 @@
#include "common_wu_min.h"
#define CASE05_BARRIER_ID 1u
extern "C" void __attribute__((naked, noinline, used)) tensor_worker() {
asm volatile(
"csrr x5, %[csr_wid]\n\t"
"li x1, (%[bar_id] | (%[domain_tensor] << %[domain_shift]))\n\t"
"li x2, %[num_tensor]\n\t"
".insn r %[custom0], 4, 0, x0, x1, x2\n\t"
"slli x6, x5, 2\n\t"
"la x7, g_seen\n\t"
"add x7, x7, x6\n\t"
"li x6, %[tensor_base]\n\t"
"or x6, x6, x5\n\t"
"sw x6, 0(x7)\n\t"
".insn r %[custom0], 0, 0, x0, x0, x0\n\t"
"1: j 1b\n\t"
:
: [csr_wid] "i"(VX_CSR_WARP_ID),
[custom0] "i"(RISCV_CUSTOM0),
[bar_id] "i"(CASE05_BARRIER_ID),
[domain_tensor] "i"(VX_BARRIER_DOMAIN_TENSOR),
[domain_shift] "i"(VX_BARRIER_DOMAIN_SHIFT),
[num_tensor] "i"(NUM_TENSOR_WARPS),
[tensor_base] "i"(WU_CASE_TENSOR_BASE)
: "memory");
}
extern "C" int wu_main() {
if (!wu_is_leader()) {
return 0;
}
wu_case_reset();
vx_spawn_tensor(vx_tensor_warp_mask(), tensor_worker);
if (wu_wait_seen_range(NUM_SCALAR_WARPS, NUM_WARPS, WU_CASE_TENSOR_BASE) != 0) {
wu_case_fail(0x05u);
return 1;
}
wu_case_pass();
return 0;
}

View File

@@ -0,0 +1,3 @@
PROJECT = wu_arch_case06_masked_barrier
include ../case.mk

View File

@@ -0,0 +1,16 @@
# Case 06: Masked Barrier
## Test Objective
Verify `BAR_MASK` can synchronize an explicit mixed mask containing scalar warp 0 and all tensor warps.
## RTL Surface Covered
- Scalar-side masked barrier issue
- Tensor-side masked barrier issue through tensor control
- Scheduler explicit barrier mask release
- Scalar/tensor warp-control merge when both domains participate in one barrier
## Expected Result
Scalar warp 0 and all tensor warps pass the same `BAR_MASK`; tensor warps write `WU_CASE_TENSOR_BASE | wid`, and warp 0 writes `WU_CASE_PASS`.

View File

@@ -0,0 +1,47 @@
#include "common_wu_min.h"
#define CASE06_BARRIER_ID 2u
#define CASE06_BARRIER_MASK (vx_tensor_warp_mask() | 1u)
extern "C" void __attribute__((naked, noinline, used)) tensor_worker() {
asm volatile(
"csrr x5, %[csr_wid]\n\t"
"li x1, %[bar_id]\n\t"
"li x2, %[mask]\n\t"
".insn r %[custom0], 7, 0, x0, x1, x2\n\t"
"slli x6, x5, 2\n\t"
"la x7, g_seen\n\t"
"add x7, x7, x6\n\t"
"li x6, %[tensor_base]\n\t"
"or x6, x6, x5\n\t"
"sw x6, 0(x7)\n\t"
".insn r %[custom0], 0, 0, x0, x0, x0\n\t"
"1: j 1b\n\t"
:
: [csr_wid] "i"(VX_CSR_WARP_ID),
[custom0] "i"(RISCV_CUSTOM0),
[bar_id] "i"(CASE06_BARRIER_ID),
[mask] "i"(((1u << NUM_TENSOR_WARPS) - 1u) << NUM_SCALAR_WARPS | 1u),
[tensor_base] "i"(WU_CASE_TENSOR_BASE)
: "memory");
}
extern "C" int wu_main() {
if (!wu_is_leader()) {
return 0;
}
wu_case_reset();
vx_spawn_tensor(vx_tensor_warp_mask(), tensor_worker);
vx_barrier_mask(CASE06_BARRIER_ID, CASE06_BARRIER_MASK);
wu_mark_seen(WU_CASE_SCALAR_BASE);
if (wu_wait_seen_mask(vx_tensor_warp_mask(), WU_CASE_TENSOR_BASE) != 0) {
wu_case_fail(0x06u);
return 1;
}
wu_case_pass();
return 0;
}

View File

@@ -0,0 +1,3 @@
PROJECT = wu_arch_case07_tensor_csr_tmc
include ../case.mk

View File

@@ -0,0 +1,15 @@
# Case 07: Tensor CSR TMC
## Test Objective
Verify tensor control handles legal tensor-domain CSR reads and `TMC` operations without involving barrier behavior.
## RTL Surface Covered
- Tensor CSRRS path for `VX_CSR_WARP_ID`
- Tensor TMC path setting a single active lane
- Tensor control completion and tensor-domain stop
## Expected Result
Every tensor warp writes `WU_CASE_TENSOR_CSR_BASE | wid` to `g_seen[wid]`, and warp 0 writes `WU_CASE_PASS`.

View File

@@ -0,0 +1,38 @@
#include "common_wu_min.h"
extern "C" void __attribute__((naked, noinline, used)) tensor_worker() {
asm volatile(
"csrr x5, %[csr_wid]\n\t"
"li x6, 1\n\t"
".insn r %[custom0], 0, 0, x0, x6, x0\n\t"
"slli x6, x5, 2\n\t"
"la x7, g_seen\n\t"
"add x7, x7, x6\n\t"
"li x6, %[tensor_csr_base]\n\t"
"or x6, x6, x5\n\t"
"sw x6, 0(x7)\n\t"
".insn r %[custom0], 0, 0, x0, x0, x0\n\t"
"1: j 1b\n\t"
:
: [csr_wid] "i"(VX_CSR_WARP_ID),
[custom0] "i"(RISCV_CUSTOM0),
[tensor_csr_base] "i"(WU_CASE_TENSOR_CSR_BASE)
: "memory");
}
extern "C" int wu_main() {
if (!wu_is_leader()) {
return 0;
}
wu_case_reset();
vx_spawn_tensor(vx_tensor_warp_mask(), tensor_worker);
if (wu_wait_seen_range(NUM_SCALAR_WARPS, NUM_WARPS, WU_CASE_TENSOR_CSR_BASE) != 0) {
wu_case_fail(0x07u);
return 1;
}
wu_case_pass();
return 0;
}

View File

@@ -0,0 +1,3 @@
PROJECT = wu_arch_case08_tensor_lsu_optional
include ../case.mk

View File

@@ -0,0 +1,15 @@
# Case 08: Tensor LSU Optional
## Test Objective
Verify tensor-domain LSU can store and reload a small per-warp value. This is intentionally last because memory hierarchy interaction is slower and has a larger debug surface.
## RTL Surface Covered
- Tensor LSU dispatch
- Tensor LSU response/writeback path
- Shared memory hierarchy merge after tensor-domain issue
## Expected Result
Every tensor warp stores and reloads `WU_CASE_TENSOR_LSU_BASE | wid`, writes that value to `g_seen[wid]`, and warp 0 writes `WU_CASE_PASS`.

View File

@@ -0,0 +1,40 @@
#include "common_wu_min.h"
extern "C" void __attribute__((naked, noinline, used)) tensor_worker() {
asm volatile(
"csrr x5, %[csr_wid]\n\t"
"slli x6, x5, 2\n\t"
"la x7, g_case_mem\n\t"
"add x7, x7, x6\n\t"
"li x8, %[tensor_lsu_base]\n\t"
"or x8, x8, x5\n\t"
"sw x8, 0(x7)\n\t"
"lw x8, 0(x7)\n\t"
"la x7, g_seen\n\t"
"add x7, x7, x6\n\t"
"sw x8, 0(x7)\n\t"
".insn r %[custom0], 0, 0, x0, x0, x0\n\t"
"1: j 1b\n\t"
:
: [csr_wid] "i"(VX_CSR_WARP_ID),
[custom0] "i"(RISCV_CUSTOM0),
[tensor_lsu_base] "i"(WU_CASE_TENSOR_LSU_BASE)
: "memory");
}
extern "C" int wu_main() {
if (!wu_is_leader()) {
return 0;
}
wu_case_reset();
vx_spawn_tensor(vx_tensor_warp_mask(), tensor_worker);
if (wu_wait_seen_range(NUM_SCALAR_WARPS, NUM_WARPS, WU_CASE_TENSOR_LSU_BASE) != 0) {
wu_case_fail(0x08u);
return 1;
}
wu_case_pass();
return 0;
}

View File

@@ -0,0 +1,141 @@
#ifndef WU_ARCH_CASES_COMMON_WU_MIN_H
#define WU_ARCH_CASES_COMMON_WU_MIN_H
#include <stdint.h>
#include <vx_intrinsics.h>
#define WU_CASE_MAX_WARPS 8u
#define WU_CASE_WAIT_SPIN 1024u
#define WU_CASE_SHORT_SPIN 8u
#define WU_CASE_PASS 0x600du
#define WU_CASE_FAIL_BASE 0xe000u
#define WU_CASE_SCALAR_BASE 0x5100u
#define WU_CASE_TENSOR_BASE 0x7100u
#define WU_CASE_TENSOR_CSR_BASE 0x7300u
#define WU_CASE_TENSOR_LSU_BASE 0x7400u
extern "C" {
volatile uint32_t g_status[WU_CASE_MAX_WARPS] __attribute__((aligned(32)));
volatile uint32_t g_seen[WU_CASE_MAX_WARPS] __attribute__((aligned(32)));
volatile uint32_t g_aux[WU_CASE_MAX_WARPS] __attribute__((aligned(32)));
volatile uint32_t g_case_mem[WU_CASE_MAX_WARPS] __attribute__((aligned(32)));
extern volatile uint64_t tohost;
}
extern "C" void vx_perf_dump() {}
extern "C" int wu_main();
extern "C" void __attribute__((naked, section(".init"), used)) _start() {
asm volatile(
".option push\n\t"
".option norelax\n\t"
"la gp, __global_pointer\n\t"
".option pop\n\t"
"csrr t0, %[csr_core]\n\t"
"bnez t0, 2f\n\t"
"li sp, %[stack_base]\n\t"
"call wu_main\n\t"
"mv gp, a0\n\t"
"2:\n\t"
".insn r %[custom0], 0, 0, x0, x0, x0\n\t"
"1: j 1b\n\t"
:
: [csr_core] "i"(VX_CSR_CORE_ID),
[stack_base] "i"(STACK_BASE_ADDR),
[custom0] "i"(RISCV_CUSTOM0)
: "memory");
}
static inline uint32_t wu_wid() {
return static_cast<uint32_t>(vx_warp_id());
}
static inline uint32_t wu_tid() {
return static_cast<uint32_t>(vx_thread_id());
}
static inline uint32_t wu_scalar_mask_without_warp0() {
return vx_scalar_warp_mask() & ~1u;
}
static inline void wu_case_reset() {
for (uint32_t i = 0; i < WU_CASE_MAX_WARPS; ++i) {
g_status[i] = 0;
g_seen[i] = 0;
g_aux[i] = 0;
g_case_mem[i] = 0;
}
}
static inline void wu_short_delay(uint32_t seed) {
volatile uint32_t value = seed + 1u;
for (uint32_t i = 0; i < WU_CASE_SHORT_SPIN; ++i) {
value = (value << 1) ^ (seed + i);
}
if (wu_wid() < WU_CASE_MAX_WARPS && wu_tid() == 0) {
g_aux[wu_wid()] = value;
}
}
static inline int wu_wait_seen_range(uint32_t first_wid, uint32_t end_wid, uint32_t base) {
for (uint32_t spin = 0; spin < WU_CASE_WAIT_SPIN; ++spin) {
uint32_t done = 1;
for (uint32_t wid = first_wid; wid < end_wid; ++wid) {
done &= (g_seen[wid] == (base | wid));
}
if (done) {
return 0;
}
}
return 1;
}
static inline int wu_wait_seen_mask(uint32_t warp_mask, uint32_t base) {
for (uint32_t spin = 0; spin < WU_CASE_WAIT_SPIN; ++spin) {
uint32_t done = 1;
for (uint32_t wid = 0; wid < WU_CASE_MAX_WARPS; ++wid) {
if (warp_mask & (1u << wid)) {
done &= (g_seen[wid] == (base | wid));
}
}
if (done) {
return 0;
}
}
return 1;
}
static inline void wu_mark_seen(uint32_t base) {
const uint32_t wid = wu_wid();
if (wu_tid() == 0 && wid < WU_CASE_MAX_WARPS) {
g_seen[wid] = base | wid;
}
}
static inline void wu_stop_warp() {
vx_tmc_zero();
while (1) {}
}
static inline int wu_is_leader() {
return vx_core_id() == 0 && vx_warp_id() == 0 && vx_thread_id() == 0;
}
static inline void wu_report_tohost(uint32_t exit_code) {
asm volatile("fence rw, rw" ::: "memory");
tohost = (static_cast<uint64_t>(exit_code) << 1) | 1u;
asm volatile("fence rw, rw" ::: "memory");
}
static inline void wu_case_pass() {
g_status[0] = WU_CASE_PASS;
wu_report_tohost(0);
}
static inline void wu_case_fail(uint32_t code) {
g_status[0] = WU_CASE_FAIL_BASE | code;
wu_report_tohost(code ? code : 1u);
}
#endif

View File

@@ -0,0 +1 @@
0

View File

@@ -184,10 +184,10 @@
#define VX_CSR_CORE_ID 0xCC2
#define VX_CSR_WARP_MASK 0xCC3
#define VX_CSR_THREAD_MASK 0xCC4 // warning! this value is also used in LLVM
#define VX_CSR_GCID 0xCC5 // legacy global core id alias used by Radiance bootrom
#define VX_CSR_NUM_THREADS 0xFC0
#define VX_CSR_NUM_WARPS 0xFC1
#define VX_CSR_NUM_CORES 0xFC2
#endif // VX_TYPES_VH

View File

@@ -7,6 +7,15 @@ OUTPUT_FORMAT("elf64-littleriscv", "elf64-littleriscv",
"elf64-littleriscv")
OUTPUT_ARCH(riscv)
ENTRY(_start)
MEMORY {
DRAM0 (rwx): ORIGIN = 0x80000000, LENGTH = 512M
DRAMARG (rwx): ORIGIN = 0x9fff0000, LENGTH = 8K
DRAM1 (rwx): ORIGIN = 0xa0000000, LENGTH = 16M
DRAM2 (rwx): ORIGIN = 0xa1000000, LENGTH = 16M
DRAM3 (rwx): ORIGIN = 0xa2000000, LENGTH = 16M
}
SECTIONS
{
. = STARTUP_ADDR;
@@ -249,4 +258,20 @@ SECTIONS
.gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
/DISCARD/ : { *(.note.GNU-stack) *(.gnu_debuglink) *(.gnu.lto_*) }
.args : {
*(.args)
. += 8K;
} > DRAMARG
.operand.a : {
*(.operand.a)
. += 32K;
} > DRAM1
.operand.b : {
*(.operand.b)
. += 32K;
} > DRAM2
.operand.c : {
*(.operand.c)
. += 32K;
} > DRAM3
}

View File

@@ -1,21 +1,23 @@
#!/bin/sh
#!/bin/bash
# Copyright 2023 blaise
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
TOOLDIR=${TOOLDIR:=$HOME/build/vortex-toolchain-prebuilt}
export TOOLDIR
ENV_SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
AE_TOOLCHAIN_DIR="$(realpath ${ENV_SCRIPT_DIR}/../../toolchain)"
export TOOLDIR=${AE_TOOLCHAIN_DIR}/vortex-toolchain-prebuilt
export VERILATOR_ROOT=$TOOLDIR/verilator
export PATH=$VERILATOR_ROOT/bin:$PATH
@@ -27,7 +29,7 @@ export YOSYS_PATH=$TOOLDIR/yosys
export PATH=$YOSYS_PATH/bin:$PATH
# LLVM_POCL seems to be only used in tests/opencl
export LLVM_POCL=/home/virgo-ae/build/llvm-vortex2
export LLVM_VORTEX=/home/virgo-ae/build/llvm-vortex2
export POCL_CC_PATH=/home/virgo-ae/build/pocl-vortex2/compiler
export POCL_RT_PATH=/home/virgo-ae/build/pocl-vortex2/runtime
export LLVM_POCL=${AE_TOOLCHAIN_DIR}/llvm-vortex2
export LLVM_VORTEX=${AE_TOOLCHAIN_DIR}/llvm-vortex2
export POCL_CC_PATH=${AE_TOOLCHAIN_DIR}/pocl-vortex2/compiler
export POCL_RT_PATH=${AE_TOOLCHAIN_DIR}/pocl-vortex2/runtime