Checkpoint wu arch cases before scalar spawn wrapper

Add Blackwell instructions test kernel and update linker script
- Add kernels/blackwell_insts/ with test kernel and input data - Update linker script with extended memory layout - Remove obsolete sgemm_tcore_blackwell Makefile - Update VX_types.h and common.h
2026-05-24 10:51:58 +08:00 · 2026-05-06 14:50:28 +08:00 · 2026-04-25 10:15:31 +08:00 · 2025-02-07 14:51:25 -08:00 · 2025-01-31 02:02:18 -08:00 · 2025-01-30 21:17:12 -08:00
54 changed files with 1376 additions and 19 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,5 @@
+**/*.o
+.codex
+**/*.elf
+**/*.dump
+**/*.a
--- a/kernels/blackwell_insts/Makefile
+++ b/kernels/blackwell_insts/Makefile
@@ -0,0 +1,7 @@
+PROJECT = blackwell_insts
+
+VX_SRCS = kernel.cpp
+
+OPTS ?= -n1
+
+include ../common.mk
--- a/kernels/blackwell_insts/args.bin
+++ b/kernels/blackwell_insts/args.bin
@@ -0,0 +1 @@
+0
--- a/kernels/blackwell_insts/input.a.bin
+++ b/kernels/blackwell_insts/input.a.bin
@@ -0,0 +1 @@
+0
--- a/kernels/blackwell_insts/input.b.bin
+++ b/kernels/blackwell_insts/input.b.bin
@@ -0,0 +1 @@
+0
--- a/kernels/blackwell_insts/input.c.bin
+++ b/kernels/blackwell_insts/input.c.bin
@@ -0,0 +1 @@
+0
--- a/kernels/blackwell_insts/kernel.cpp
+++ b/kernels/blackwell_insts/kernel.cpp
@@ -0,0 +1,192 @@
+#include <stdint.h>
+#include <vx_intrinsics.h>
+#include <vx_spawn.h>
+
+#define DEV_SMEM_START_ADDR 0xff000000u
+
+#define BW_REP2(x) x, x
+#define BW_REP4(x) BW_REP2(x), BW_REP2(x)
+#define BW_REP8(x) BW_REP4(x), BW_REP4(x)
+#define BW_REP16(x) BW_REP8(x), BW_REP8(x)
+#define BW_REP32(x) BW_REP16(x), BW_REP16(x)
+#define BW_REP64(x) BW_REP32(x), BW_REP32(x)
+#define BW_REP128(x) BW_REP64(x), BW_REP64(x)
+#define BW_REP256(x) BW_REP128(x), BW_REP128(x)
+
+static volatile uint32_t g_a[256] __attribute__((aligned(32))) = {
+    BW_REP256(0x3c003c00u)}; // two fp16 1.0 values
+static volatile uint32_t g_b[256] __attribute__((aligned(32))) = {
+    BW_REP256(0x40004000u)}; // two fp16 2.0 values
+static volatile uint32_t g_c[256] __attribute__((aligned(32))) = {
+    BW_REP256(0x3f800000u)}; // one fp32 1.0 value
+static volatile uint32_t g_dst[256] __attribute__((aligned(32)));
+static volatile uint32_t g_debug[16] __attribute__((aligned(32)));
+static volatile uint32_t g_status __attribute__((aligned(4)));
+
+#undef BW_REP2
+#undef BW_REP4
+#undef BW_REP8
+#undef BW_REP16
+#undef BW_REP32
+#undef BW_REP64
+#undef BW_REP128
+#undef BW_REP256
+
+struct kernel_arg_t {
+  volatile uint32_t *a;
+  volatile uint32_t *b;
+  volatile uint32_t *c;
+  volatile uint32_t *dst;
+  volatile uint32_t *debug;
+  volatile uint32_t *status;
+};
+
+static inline void tcgen05_cp(uint32_t addr_tmem, uint32_t addr_gmem) {
+  asm volatile(".insn r %0, 2, 0, x0, %1, %2"
+               :
+               : "i"(RISCV_CUSTOM3), "r"(addr_tmem), "r"(addr_gmem)
+               : "memory");
+}
+
+static inline void tcgen05_cb(uint32_t addr_tmem, uint32_t addr_gmem) {
+  asm volatile(".insn r %0, 6, 0, x0, %1, %2"
+               :
+               : "i"(RISCV_CUSTOM3), "r"(addr_tmem), "r"(addr_gmem)
+               : "memory");
+}
+
+static inline void tcgen05_cp_wait() {
+  asm volatile(".insn r %0, 3, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM3)
+               : "memory");
+}
+
+static inline void bwgmma(uint32_t addr_tmem_c,
+                          uint32_t addr_tmem_a,
+                          uint32_t addr_smem_b) {
+  asm volatile(".insn r %0, 0, 0, %1, %2, %3"
+               :
+               : "i"(RISCV_CUSTOM3), "r"(addr_tmem_c), "r"(addr_tmem_a),
+                 "r"(addr_smem_b)
+               : "memory");
+}
+
+static inline void bwgmma_wait() {
+  asm volatile(".insn r %0, 1, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM3)
+               : "memory");
+}
+
+static inline float tcgen05_ld_f32(uint32_t addr_tmem) {
+  float value;
+  asm volatile(".insn r %1, 4, 0, %0, %2, x0"
+               : "=f"(value)
+               : "i"(RISCV_CUSTOM3), "r"(addr_tmem)
+               : "memory");
+  return value;
+}
+
+static inline void tcgen05_st_f32(uint32_t addr_tmem, float value) {
+  asm volatile(".insn r %0, 5, 0, %1, %2, x0"
+               :
+               : "i"(RISCV_CUSTOM3), "f"(value), "r"(addr_tmem)
+               : "memory");
+}
+
+static inline uint32_t f32_bits(float value) {
+  union {
+    float f;
+    uint32_t u;
+  } bits = {value};
+  return bits.u;
+}
+
+extern "C" void vx_perf_dump() {}
+
+void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg)
+    __attribute__((convergent));
+
+void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
+  if (task_id != 0)
+    return;
+
+  volatile uint32_t *a = arg->a;
+  volatile uint32_t *b = arg->b;
+  volatile uint32_t *c = arg->c;
+  volatile uint32_t *dst = arg->dst;
+  volatile uint32_t *debug = arg->debug;
+  volatile uint32_t *status = arg->status;
+
+  const uint32_t tmem_a = 0x000;
+  const uint32_t tmem_c = 0x400;
+  const uint32_t tmem_st_scratch = 0x800;
+  volatile uint32_t *smem_b_ptr =
+      reinterpret_cast<volatile uint32_t *>(DEV_SMEM_START_ADDR);
+  const uint32_t smem_b = reinterpret_cast<uint32_t>(smem_b_ptr);
+  const uint32_t expected = 0x42820000u; // 65.0f
+  const uint32_t expected_st = 0x3f800000u; // 1.0f
+
+  if (status != nullptr)
+    status[0] = 0x100u;
+
+  for (int i = 0; i < 256; ++i)
+    smem_b_ptr[i] = b[i];
+
+  for (int frag = 0; frag < 32; ++frag) {
+    const uint32_t offset = static_cast<uint32_t>(frag * 32);
+    tcgen05_cp(tmem_a + offset,
+               reinterpret_cast<uint32_t>(&a[frag * 8]));
+    tcgen05_cp(tmem_c + offset,
+               reinterpret_cast<uint32_t>(&c[frag * 8]));
+  }
+  tcgen05_cp_wait();
+
+  const float st_value = 1.0f;
+  tcgen05_st_f32(tmem_st_scratch, st_value);
+  const uint32_t st_bits = f32_bits(tcgen05_ld_f32(tmem_st_scratch));
+  debug[0] = st_bits;
+  if (st_bits != expected_st) {
+    if (status != nullptr)
+      status[0] = 0xe002u;
+    return;
+  }
+
+  bwgmma(tmem_c, tmem_a, smem_b);
+  bwgmma_wait();
+
+  const float ld_value = tcgen05_ld_f32(tmem_c);
+  const uint32_t ld_bits = f32_bits(ld_value);
+  debug[1] = ld_bits;
+  tcgen05_st_f32(tmem_st_scratch + 32, ld_value);
+
+  for (int frag = 0; frag < 32; ++frag) {
+    tcgen05_cb(tmem_c + static_cast<uint32_t>(frag * 32),
+               reinterpret_cast<uint32_t>(&dst[frag * 8]));
+  }
+  tcgen05_cp_wait();
+
+  if (ld_bits != expected) {
+    if (status != nullptr)
+      status[0] = 0xe001u;
+    return;
+  }
+
+  for (int i = 0; i < 256; ++i) {
+    if (dst[i] != expected) {
+      if (status != nullptr)
+        status[0] = 0xe100u | static_cast<uint32_t>(i & 0xff);
+      debug[2] = static_cast<uint32_t>(i);
+      debug[3] = dst[i];
+      return;
+    }
+  }
+
+  if (status != nullptr)
+    status[0] = 0x600du;
+}
+
+int main() {
+  kernel_arg_t arg = {g_a, g_b, g_c, g_dst, g_debug, &g_status};
+
+  vx_spawn_tasks_contiguous(1, reinterpret_cast<vx_spawn_tasks_cb>(kernel_body),
+                            &arg);
+  return (g_status == 0x600du) ? 0 : 1;
+}
--- a/kernels/flash_attention/args.bin
+++ b/kernels/flash_attention/args.bin
@@ -0,0 +1 @@
+args.seq1024.headdim64.bin
--- a/kernels/flash_attention/args.seq1024.headdim64.bin
+++ b/kernels/flash_attention/args.seq1024.headdim64.bin
--- a/kernels/flash_attention/args.seq128.headdim64.bin
+++ b/kernels/flash_attention/args.seq128.headdim64.bin
--- a/kernels/flash_attention/args.seq192.headdim64.bin
+++ b/kernels/flash_attention/args.seq192.headdim64.bin
--- a/kernels/flash_attention/args.seq64.headdim64.bin
+++ b/kernels/flash_attention/args.seq64.headdim64.bin
--- a/kernels/flash_attention/compile_flash.sh
+++ b/kernels/flash_attention/compile_flash.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+archs=("ampere" "virgo")
+
+if [ -z "$TOOLDIR" ]; then
+    echo "error: \$TOOLDIR not set.  Did you run source ci/toolchain_env.sh?"
+    exit 1
+fi
+
+check_exists() {
+    if ! [ -f "$1" ]; then
+        echo "error: looked for file $1 that does not exist."
+        exit 1
+    fi
+}
+
+# generate operands
+echo "generating flash_attn operands for seqlen 1024, headdim 64"
+python3 flash_attn.py 1024 64 64
+mv -v input.a.col.bin input.a.rand.fp32.seqlen1024headdim64.col.bin
+mv -v input.a.row.bin input.a.rand.fp32.seqlen1024headdim64.row.bin
+mv -v input.b.bin input.b.rand.fp32.seqlen1024headdim64.row.bin
+mv -v input.c.bin input.c.rand.fp32.seqlen1024headdim64.row.bin
+ln -sf input.a.rand.fp32.seqlen1024headdim64.row.bin input.a.bin
+ln -sf input.b.rand.fp32.seqlen1024headdim64.row.bin input.b.bin
+ln -sf input.c.rand.fp32.seqlen1024headdim64.row.bin input.c.bin
+
+for arch in "${archs[@]}"; do
+    git checkout ae-flash-$arch
+    # git pull
+
+    # re-compile libvortexrt.a
+    pushd ../../lib
+    make
+    popd
+
+    echo "compiling flash_attn kernel for $arch with seqlen 1024, headdim 64"
+
+    # touch source file to force re-building, as the Makefile does not track
+    # binary changes
+    touch kernel.cpp
+    touch kernel.gemmini.cpp
+
+    make CONFIG=flash.$arch.seqlen1024.headdim64
+done
--- a/kernels/flash_attention/flash_attn.py
+++ b/kernels/flash_attention/flash_attn.py
@@ -0,0 +1,159 @@
+import sys
+import numpy as np
+
+def parse_mnk():
+    if len(sys.argv) != 4:
+        print(f"usage: {sys.argv[0]} dimM dimN dimK", file=sys.stderr)
+        sys.exit(1)
+    m = int(sys.argv[1])
+    n = int(sys.argv[2])
+    k = int(sys.argv[3])
+    return (m, n, k)
+
+
+# Reorder array in a way that groups two adjacent elements along the column to
+# be now adjacent along the row.  This way, when the resulting fp16 array is
+# read in column-major order with 32-bit granularity, the fp16 elements will be
+# read in the same order as regular fp32 elements in column-major.
+#
+# For example:
+# [[1 2]
+#  [3 4]
+#  [5 6]
+#  [7 8]]
+# becomes
+# [[1 3 2 4]
+#  [5 7 6 8]]
+def pack_fp16_by_column(array):
+    rows = array.shape[0]
+    cols = array.shape[1]
+
+    T = array.transpose([1, 0])
+    T_packed = T.reshape([cols, -1, 2])
+    result = T_packed.transpose([1, 0, 2])
+    return result
+
+
+# Do the same as pack_fp16_by_column, but for every two elements along the row.
+def pack_fp16_by_row(array):
+    rows = array.shape[0]
+    cols = array.shape[1]
+
+    result = array.reshape([rows, -1, 2])
+    return result
+
+
+if __name__ == "__main__":
+    seqlen, _, headdim = parse_mnk()
+
+    rand = True
+    if not rand:
+        A_array = np.arange(seqlen * headdim).reshape([seqlen, headdim])
+        B_array = np.arange(headdim * seqlen).reshape([headdim, seqlen])
+        C_array = np.arange(seqlen * seqlen).reshape([seqlen, headdim])
+    else:
+        np.random.seed(0)
+        A_array = np.random.rand(seqlen, headdim) - 0.5
+        B_array = np.random.rand(headdim, seqlen) - 0.5
+        C_array = np.random.rand(seqlen, headdim) - 0.5
+        # C_array = np.zeros([M, N])
+
+    fp16 = False
+    if fp16:
+        A_packed = pack_fp16_by_row(A_array)
+        AT_packed = A_packed.transpose([1, 0, 2])
+        AT_array = AT_packed.reshape([-1, seqlen * 2])
+        AT_array.astype('float16').tofile("input.a.col.bin")
+        # print('AT:')
+        # print(AT_array)
+        B_packed = pack_fp16_by_column(B_array)
+        B_array = B_packed.reshape([-1, headdim * 2])
+        B_array.astype('float16').tofile("input.b.row.bin")
+        # print('B:')
+        # print(B_array)
+    else:
+        A_array.astype('float32').tofile("input.a.row.bin")
+        AT_array = A_array.transpose([1, 0])
+        AT_array.astype('float32').tofile("input.a.col.bin")
+        B_array.astype('float32').tofile("input.b.bin")
+        C_array.astype('float32').tofile("input.c.bin")
+        # print('AT:')
+        # print(AT_array)
+        # print('B:')
+        # print(B_array)
+
+    assert((seqlen % 64) == 0)
+
+    Br = 64
+    Bc = Br
+
+    rowmax = np.zeros([Br])
+    rowsum = np.zeros([Br])
+    O = np.zeros([Br, headdim])
+
+    def exp2(x):
+        return (x**2) / 2.0 + x + 1.0
+
+    full_S = A_array @ B_array
+    full_S_T = full_S.transpose([1, 0])
+    full_S.astype('float32').tofile("full_S.bin")
+
+    col_to_save = 0
+
+    for col in range(0, seqlen, Bc):
+        print(f"tile iteration {col}~{col + Bc} ======================================")
+
+        # FIXME: only work with the first 64 rows of Q for now
+        Q_tile = A_array[0:64, :]
+        K_tile = B_array[:, col:col+Bc]
+
+        S = Q_tile @ K_tile
+        if col == col_to_save:
+            print('S_expected:')
+            print(S)
+            S.astype('float32').tofile("S_expected.bin")
+
+        # generate rowmax result in online softmax
+        rowmax_this = np.max(S, axis=1)
+        rowmax_prev = rowmax.copy()
+        rowmax = np.maximum(rowmax, rowmax_this)
+        if col == col_to_save:
+            rowmax.astype('float32').tofile("rowmax.bin")
+
+        # subtrace rowmax from each row by broadcasting
+        # (placeholder for exp)
+        x = S - rowmax[:, np.newaxis]
+        P = exp2(x)
+        # for i in range(3, 4):
+        #     P += (x**i) / np.math.factorial(i)
+        # P = np.exp(exp)
+        # print('P error:')
+        # print(P / np.exp(x))
+        if col == col_to_save:
+            print('P_expected:')
+            print(P)
+            P.astype('float32').tofile("P_expected.bin")
+            P.transpose([1, 0]).astype('float32').tofile("P_expected.col.bin")
+
+        rowsum_this = np.sum(P, axis=1)
+        x = rowmax_prev - rowmax_this
+        rowsum = exp2(x) * rowsum + rowsum_this
+        if col == col_to_save:
+            rowsum.astype('float32').tofile("rowsum.bin")
+
+        x = rowmax_prev - rowmax
+        O = O / (exp2(x)[:, np.newaxis])
+        if col == col_to_save:
+            print('O_before_PV:')
+            print(O)
+            O.astype('float32').tofile("O_before_PV.bin")
+
+        V = C_array[col:col+Bc, :]
+        if col == col_to_save:
+            V.astype('float32').tofile("V_expected.bin")
+        # O = P.transpose([1, 0]) @ V
+        O = O + P @ V
+        if col == col_to_save:
+            print('O_after_PV:')
+            print(O)
+            O.astype('float32').tofile("O_after_PV.bin")
--- a/kernels/sgemm_gemmini_dma/compile_virgo.sh
+++ b/kernels/sgemm_gemmini_dma/compile_virgo.sh
@@ -1,5 +1,14 @@
 #!/bin/sh

+# hopper and virgo has the same SIMT configurations
+git checkout ae-hopper
+# git pull
+
+# re-compile libvortexrt.a
+pushd ../../lib
+make
+popd
+
 if [ ! -f input.a.rand01.fp16.m256n256k256.row.bin ]; then
    echo "input binaries not found, generating operands"
    python3 generate_operands.py
--- a/kernels/sgemm_tcore/common.h
+++ b/kernels/sgemm_tcore/common.h
@@ -6,7 +6,7 @@
 #define KERNEL_ARG_DEV_MEM_ADDR 0x9fff0000
 #define DEV_SMEM_START_ADDR 0xff000000

-typedef struct {
+typedef struct __attribute__((packed)) {
  uint32_t dim_m;
  uint32_t dim_n;
  uint32_t dim_k;
--- a/kernels/sgemm_tcore/compile_tcore.sh
+++ b/kernels/sgemm_tcore/compile_tcore.sh
@@ -41,12 +41,22 @@ check_exists() {
    fi
 }

+# generate operands
+for dim in "${dims[@]}"; do
+    echo "generating operands for dim $dim"
+    python3 generate_operands.py $dim $dim $dim
+    mv -v input.a.col.bin input.a.rand01.fp16.m${dim}n${dim}k${dim}.col.swizzle_fp16.bin
+    mv -v input.a.row.bin input.a.rand01.fp16.m${dim}n${dim}k${dim}.row.swizzle_fp16.bin
+    mv -v input.b.row.bin input.b.rand01.fp16.m${dim}n${dim}k${dim}.row.bin
+    mv -v input.b.row.swizzled.bin input.b.rand01.fp16.m${dim}n${dim}k${dim}.row.swizzle_fp16.bin
+done
+
 for arch in "${archs[@]}"; do
    git checkout ae-$arch
+    # git pull

    # re-compile libvortexrt.a
-    # FIXME after restructure
-    pushd ../../libs
+    pushd ../../lib
    make
    popd

--- a/kernels/sgemm_tcore/generate_operands.py
+++ b/kernels/sgemm_tcore/generate_operands.py
@@ -0,0 +1,116 @@
+import sys
+import numpy as np
+
+def parse_mnk():
+    if len(sys.argv) != 4:
+        print(f"usage: {sys.argv[0]} dimM dimN dimK", file=sys.stderr)
+        sys.exit(1)
+    m = int(sys.argv[1])
+    n = int(sys.argv[2])
+    k = int(sys.argv[3])
+    return (m, n, k)
+
+
+# Reorder array in a way that groups two adjacent elements along the column to
+# be now adjacent along the row.  This way, when the resulting fp16 array is
+# read in column-major order with 32-bit granularity, the fp16 elements will be
+# read in the same order as regular fp32 elements in column-major.
+#
+# For example:
+# [[1 2]
+#  [3 4]
+#  [5 6]
+#  [7 8]]
+# becomes
+# [[1 3 2 4]
+#  [5 7 6 8]]
+def pack_fp16_by_column(array):
+    rows = array.shape[0]
+    cols = array.shape[1]
+
+    T = array.transpose([1, 0])
+    T_packed = T.reshape([cols, -1, 2])
+    result = T_packed.transpose([1, 0, 2])
+    return result
+
+
+# Do the same as pack_fp16_by_column, but for every two elements along the row.
+def pack_fp16_by_row(array):
+    rows = array.shape[0]
+    cols = array.shape[1]
+
+    result = array.reshape([rows, -1, 2])
+    return result
+
+
+if __name__ == "__main__":
+    M, N, K = parse_mnk()
+
+    rand = True
+    if not rand:
+        A_array = np.arange(M * K).reshape([M, K])
+        B_array = np.arange(K * N).reshape([K, N])
+        # C_array = np.arange(M * N).reshape([M, N])
+        C_array = np.zeros([M, N])
+    else:
+        np.random.seed(0)
+        A_array = np.random.rand(M, K)
+        B_array = np.random.rand(K, N)
+        C_array = np.random.rand(N, K)
+        # C_array = np.zeros([M, N])
+
+    with open('a_matrix.h', 'w') as f:
+        for i in range(A_array.shape[0]):
+            for j in range(A_array.shape[1]):
+                f.write(f'{A_array[i,j]:f}f, ')
+            f.write('\n')
+    with open('b_matrix.h', 'w') as f:
+        for i in range(B_array.shape[0]):
+            for j in range(B_array.shape[1]):
+                f.write(f'{B_array[i,j]:f}f, ')
+            f.write('\n')
+    with open('c_matrix.h', 'w') as f:
+        for i in range(C_array.shape[0]):
+            for j in range(C_array.shape[1]):
+                f.write(f'{C_array[i,j]:f}f, ')
+            f.write('\n')
+
+    np.savez("abc", A_array=A_array, B_array=B_array, C_array=C_array)
+
+    fp16 = True
+    if fp16:
+        A_packed = pack_fp16_by_row(A_array)
+        A_swizzled = A_packed.reshape([-1, M * 2])
+        A_swizzled.astype('float16').tofile("input.a.row.bin")
+        AT_packed = A_packed.transpose([1, 0, 2])
+        AT_swizzled = AT_packed.reshape([-1, M * 2])
+        AT_swizzled.astype('float16').tofile("input.a.col.bin")
+        print('A:')
+        print(A_swizzled)
+        print('AT:')
+        print(AT_swizzled)
+        B_array.astype('float16').tofile("input.b.row.bin")
+        # B_packed_row = pack_fp16_by_row(B_array)
+        # B_packed_row = B_packed_row.reshape([-1, N * 2])
+        # B_packed_row.astype('float16').tofile("input.b.row.bin")
+        B_packed = pack_fp16_by_column(B_array)
+        B_swizzled = B_packed.reshape([-1, N * 2])
+        B_swizzled.astype('float16').tofile("input.b.row.swizzled.bin")
+        print('B:')
+        print(B_swizzled)
+    else:
+        A_array.astype('float32').tofile("input.a.row.bin")
+        AT_array = A_array.transpose([1, 0])
+        AT_array.astype('float32').tofile("input.a.col.bin")
+        B_array.astype('float32').tofile("input.b.bin")
+        C_array.astype('float32').tofile("input.c.bin")
+        print('AT:')
+        print(AT_array)
+        print('B:')
+        print(B_array)
+
+    D_expected = A_array @ B_array
+    D_expected.astype('float32').tofile("d_expected.bin")
+    print('D_expected:')
+    print(D_expected)
+
--- a/kernels/sgemm_tcore/sgemm_impl.hpp
+++ b/kernels/sgemm_tcore/sgemm_impl.hpp
@@ -267,6 +267,34 @@ inline void vx_wgmma_wait() {
  asm volatile (".insn r %0, 1, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM3));
 }

+inline void vx_tcgen05_cp(const uint32_t addr_tmem, const uint32_t addr_smem) {
+  asm volatile(".insn r %0, 2, 0, x0, %1, %2" ::"i"(RISCV_CUSTOM3), "r"(addr_tmem),
+               "r"(addr_smem));
+}
+
+inline void vx_tcgen05_cp_wait() {
+  asm volatile (".insn r %0, 3, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM3));
+}
+
+inline void vx_bwgmma(const uint32_t addr_tmem_a, const uint32_t addr_smem_b) {
+  asm volatile(".insn r %0, 0, 0, x0, %1, %2" ::"i"(RISCV_CUSTOM3), "r"(addr_tmem_a),
+               "r"(addr_smem_b));
+}
+
+inline void vx_bwgmma_wait() {
+  asm volatile (".insn r %0, 1, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM3));
+}
+
+inline void vx_tcgen05_ld(const uint32_t addr_tmem, const uint32_t rd_hint) {
+  asm volatile(".insn r %0, 4, 0, %1, %2, x0" ::"i"(RISCV_CUSTOM3), "r"(rd_hint),
+               "r"(addr_tmem));
+}
+
+inline void vx_tcgen05_st(const uint32_t addr_tmem, const uint32_t rd_hint) {
+  asm volatile(".insn r %0, 5, 0, %1, %2, x0" ::"i"(RISCV_CUSTOM3), "r"(rd_hint),
+               "r"(addr_tmem));
+}
+
 // Remap logical row/col coordinate of a matrix element to a memory index that
 // follows the 2-level block-row-major layout that Gemmini DMA uses
 template <bool use_dma, uint32_t dim_col>
@@ -1190,10 +1218,10 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
                (uint64_t)(B + /*block_k:*/ 0 * BK * dim_n + block_n * BN),
                k_LOOP_WS_CONFIG_ADDRS_AB)
            // GEMMINI_CISC(8) does k_LOOP_WS_CONFIG_STRIDES_AB
-            GEMMINI_CISC_CMD_R((dim_n << 20) | (dim_k << 8) | 8);
+            GEMMINI_CISC_CMD_R((dim_n << 20) | (dim_k << 8) | GEMMINI_CISC_SET_AB_STRIDE);
            gemmini_fence();

-            GEMMINI_CISC_CMD_I(10);
+            GEMMINI_CISC_CMD_R((11 << 16) | (0 << 8) | GEMMINI_CISC_LOAD_TO_HEXADECILES);
            gemmini_fence();

 #if 0
@@ -1257,7 +1285,7 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
              k_LOOP_WS_CONFIG_ADDRS_AB)
          // GEMMINI_CISC(8) does k_LOOP_WS_CONFIG_STRIDES_AB
          GEMMINI_CISC_CMD_R((dim_n << 20) | (dim_k << 8) | 8);
-          gemmini_fence();
+          // gemmini_fence();

          // block_k is even: opcode 11 (write to local_a_buf)
          // block_k is odd:  opcode 10 (write to local_a)
@@ -1266,8 +1294,9 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
          // the last iteration of the k-loop is prefetching for the first
          // iteration of the n-loop.  The ping-poing indexing has to match for
          // the two loop end to connect.
-          const uint32_t opcode = 11 - (block_k & 1);
-          GEMMINI_CISC_CMD_I(opcode);
+          const uint32_t a_hexadecile = 4 - ((block_k & 1) * 4);
+          const uint32_t b_hexadecile = a_hexadecile + 11;
+          GEMMINI_CISC_CMD_R((b_hexadecile << 16) | (a_hexadecile << 8) | GEMMINI_CISC_LOAD_TO_HEXADECILES);
          // // TODO: branch is probably slow
          // if (block_k & 1) {
          //   GEMMINI_CISC_CMD_I(12);
--- a/kernels/wu_arch_cases/Makefile
+++ b/kernels/wu_arch_cases/Makefile
@@ -0,0 +1,40 @@
+CASES := \
+	case00_boot_scalar \
+	case01_scalar_spawn \
+	case02_tensor_spawn_stop \
+	case03_dual_fetch_issue \
+	case04_scalar_barrier \
+	case05_tensor_barrier \
+	case06_masked_barrier \
+	case07_tensor_csr_tmc \
+	case08_tensor_lsu_optional
+
+SMOKE_CASES := \
+	case00_boot_scalar \
+	case01_scalar_spawn \
+	case02_tensor_spawn_stop \
+	case03_dual_fetch_issue
+
+BARRIER_CASES := \
+	case04_scalar_barrier \
+	case05_tensor_barrier \
+	case06_masked_barrier
+
+.PHONY: all smoke barriers full clean clean-all $(CASES)
+
+all: full
+
+smoke: $(SMOKE_CASES)
+
+barriers: $(BARRIER_CASES)
+
+full: $(CASES)
+
+$(CASES):
+	$(MAKE) -C $@
+
+clean:
+	set -e; for dir in $(CASES); do $(MAKE) -C $$dir clean; done
+
+clean-all:
+	set -e; for dir in $(CASES); do $(MAKE) -C $$dir clean-all; done
--- a/kernels/wu_arch_cases/README.md
+++ b/kernels/wu_arch_cases/README.md
@@ -0,0 +1,48 @@
+# Wu Architecture Staged Cases
+
+This directory contains small bare-metal kernels for incremental Wu architecture testing. The original `kernels/wu_arch` kernel is useful as an integrated test, but it combines scalar spawning, tensor spawning, barriers, tensor control, and memory behavior in one large workload. These cases isolate those surfaces so failures can be reproduced faster under Verilator.
+
+## Case List
+
+- `case00_boot_scalar`: minimal scalar boot, status writes, and pass marker.
+- `case01_scalar_spawn`: scalar warp spawning without tensor warps or barriers.
+- `case02_tensor_spawn_stop`: tensor warp spawn, marker store, and stop.
+- `case03_dual_fetch_issue`: scalar and tensor warps active together to exercise split scheduling and issue.
+- `case04_scalar_barrier`: scalar-domain barrier release.
+- `case05_tensor_barrier`: tensor-domain barrier through tensor control.
+- `case06_masked_barrier`: explicit mixed `BAR_MASK` with scalar warp 0 and tensor warps.
+- `case07_tensor_csr_tmc`: tensor CSR/TMC path without barrier behavior.
+- `case08_tensor_lsu_optional`: tensor LSU store/load marker path; keep last because memory interaction is broader and slower.
+
+Each case has its own `README.md` describing the test objective, RTL surface, and expected pass marker.
+
+## Build
+
+Use the suite Makefile from this directory:
+
+```sh
+make smoke -j4 LLVM_VORTEX=/home/hexu/dse/wu/virgo-artifact-full/toolchain/llvm-vortex2 RISCV_TOOLCHAIN_PATH=/home/hexu/dse/wu/virgo-artifact-full/chipyard/.conda-env/riscv-tools RISCV_PREFIX=riscv64-unknown-elf
+make barriers -j4 LLVM_VORTEX=/home/hexu/dse/wu/virgo-artifact-full/toolchain/llvm-vortex2 RISCV_TOOLCHAIN_PATH=/home/hexu/dse/wu/virgo-artifact-full/chipyard/.conda-env/riscv-tools RISCV_PREFIX=riscv64-unknown-elf
+make full -j4 LLVM_VORTEX=/home/hexu/dse/wu/virgo-artifact-full/toolchain/llvm-vortex2 RISCV_TOOLCHAIN_PATH=/home/hexu/dse/wu/virgo-artifact-full/chipyard/.conda-env/riscv-tools RISCV_PREFIX=riscv64-unknown-elf
+```
+
+`smoke` builds the boot/spawn/dual-issue cases. `barriers` builds the barrier-focused cases. `full` builds all cases.
+
+## Verilator Run Notes
+
+For RTL simulation, use the same simulator setup as the main Virgo artifact, but run these ELFs one at a time:
+
+- `VM_PARALLEL_BUILDS=1`
+- `LOADMEM=1`, so `SimDRAM::memory_init()` preloads the ELF instead of relying on slow runtime SimTSI writes.
+- `CCACHE_DIR=/tmp/ccache` when ccache is enabled in the sandbox.
+- Use `/home/hexu/dse/firtool-1.62.0` for firtool and `/usr/local/bin/verilator` for Verilator.
+- Keep system `gcc/g++` on `PATH`; do not use the `gcc/g++` injected by `chipyard/env.sh`.
+- For generated Verilator C++ compilation, prefer `-O0 -fno-inline` to reduce compile time.
+
+## Cleanup
+
+```sh
+make clean-all
+```
+
+This removes kernel ELF/dump outputs and the generated placeholder input blobs in each case directory.
--- a/kernels/wu_arch_cases/case.mk
+++ b/kernels/wu_arch_cases/case.mk
@@ -0,0 +1,17 @@
+PROJECT ?= wu_arch_case
+VX_SRCS = kernel.cpp
+VX_CFLAGS += -I..
+VORTEX_KN_PATH ?= $(realpath ../../../lib)
+GEMMINI_SW_PATH ?= $(realpath ../../../lib/gemmini)
+OPTS ?= -n1
+
+include ../../common.mk
+
+args.bin input.a.bin input.b.bin input.c.bin: ../zero.bin
+	cp $< $@
+
+clean-all: clean-wu-case-inputs
+
+.PHONY: clean-wu-case-inputs
+clean-wu-case-inputs:
+	rm -f args.bin input.a.bin input.b.bin input.c.bin
--- a/kernels/wu_arch_cases/case00_boot_scalar/Makefile
+++ b/kernels/wu_arch_cases/case00_boot_scalar/Makefile
@@ -0,0 +1,3 @@
+PROJECT = wu_arch_case00_boot_scalar
+
+include ../case.mk
--- a/kernels/wu_arch_cases/case00_boot_scalar/README.md
+++ b/kernels/wu_arch_cases/case00_boot_scalar/README.md
@@ -0,0 +1,15 @@
+# Case 00: Boot Scalar
+
+## Test Objective
+
+Verify the minimal Wu bare-metal entry path: core 0, warp 0, thread 0 reaches `wu_main`, can write the shared status arrays, and can terminate with `WU_CASE_PASS`.
+
+## RTL Surface Covered
+
+- Minimal `_start` path from `common_wu_min.h`
+- Scalar warp 0 fetch/decode/issue
+- Scalar ALU/store path for status writes
+
+## Expected Result
+
+`g_status[0] == WU_CASE_PASS` and `g_seen[0] == WU_CASE_SCALAR_BASE`.
--- a/kernels/wu_arch_cases/case00_boot_scalar/kernel.cpp
+++ b/kernels/wu_arch_cases/case00_boot_scalar/kernel.cpp
@@ -0,0 +1,13 @@
+#include "common_wu_min.h"
+
+extern "C" int wu_main() {
+  if (!wu_is_leader()) {
+    return 0;
+  }
+
+  wu_case_reset();
+  g_seen[0] = WU_CASE_SCALAR_BASE;
+  g_aux[0] = static_cast<uint32_t>(vx_num_warps());
+  wu_case_pass();
+  return 0;
+}
--- a/kernels/wu_arch_cases/case01_scalar_spawn/Makefile
+++ b/kernels/wu_arch_cases/case01_scalar_spawn/Makefile
@@ -0,0 +1,3 @@
+PROJECT = wu_arch_case01_scalar_spawn
+
+include ../case.mk
--- a/kernels/wu_arch_cases/case01_scalar_spawn/README.md
+++ b/kernels/wu_arch_cases/case01_scalar_spawn/README.md
@@ -0,0 +1,16 @@
+# Case 01: Scalar Spawn
+
+## Test Objective
+
+Verify scalar-domain warp spawning without tensor warps, barriers, or shared tensor resources.
+
+## RTL Surface Covered
+
+- Scalar scheduler output
+- Scalar fetch/decode/issue path
+- Scalar `WSPAWN` mask path
+- Scalar store path for per-warp completion markers
+
+## Expected Result
+
+Every scalar warp writes `WU_CASE_SCALAR_BASE | wid` to `g_seen[wid]`, and warp 0 writes `WU_CASE_PASS` to `g_status[0]`.
--- a/kernels/wu_arch_cases/case01_scalar_spawn/kernel.cpp
+++ b/kernels/wu_arch_cases/case01_scalar_spawn/kernel.cpp
@@ -0,0 +1,29 @@
+#include "common_wu_min.h"
+
+extern "C" void scalar_worker() {
+  wu_short_delay(wu_wid());
+  wu_mark_seen(WU_CASE_SCALAR_BASE);
+  wu_stop_warp();
+}
+
+extern "C" int wu_main() {
+  if (!wu_is_leader()) {
+    return 0;
+  }
+
+  wu_case_reset();
+
+  const uint32_t spawn_mask = wu_scalar_mask_without_warp0();
+  if (spawn_mask != 0) {
+    vx_spawn_scalar(spawn_mask, scalar_worker);
+  }
+
+  wu_mark_seen(WU_CASE_SCALAR_BASE);
+  if (wu_wait_seen_range(0, NUM_SCALAR_WARPS, WU_CASE_SCALAR_BASE) != 0) {
+    wu_case_fail(0x01u);
+    return 1;
+  }
+
+  wu_case_pass();
+  return 0;
+}
--- a/kernels/wu_arch_cases/case02_tensor_spawn_stop/Makefile
+++ b/kernels/wu_arch_cases/case02_tensor_spawn_stop/Makefile
@@ -0,0 +1,3 @@
+PROJECT = wu_arch_case02_tensor_spawn_stop
+
+include ../case.mk
--- a/kernels/wu_arch_cases/case02_tensor_spawn_stop/README.md
+++ b/kernels/wu_arch_cases/case02_tensor_spawn_stop/README.md
@@ -0,0 +1,16 @@
+# Case 02: Tensor Spawn Stop
+
+## Test Objective
+
+Verify tensor warps can be spawned, scheduled, issued, write a completion marker, and stop without barriers or tensor LSU stress.
+
+## RTL Surface Covered
+
+- Tensor scheduler output
+- Tensor fetch/decode/issue path
+- Tensor ALU/store for a minimal marker
+- Tensor-domain stop via `TMC zero`
+
+## Expected Result
+
+Every tensor warp writes `WU_CASE_TENSOR_BASE | wid` to `g_seen[wid]`, and warp 0 writes `WU_CASE_PASS` to `g_status[0]`.
--- a/kernels/wu_arch_cases/case02_tensor_spawn_stop/kernel.cpp
+++ b/kernels/wu_arch_cases/case02_tensor_spawn_stop/kernel.cpp
@@ -0,0 +1,36 @@
+#include "common_wu_min.h"
+
+extern "C" void __attribute__((naked, noinline, used)) tensor_worker() {
+  asm volatile(
+      "csrr x5, %[csr_wid]\n\t"
+      "slli x6, x5, 2\n\t"
+      "la x7, g_seen\n\t"
+      "add x7, x7, x6\n\t"
+      "li x6, %[tensor_base]\n\t"
+      "or x6, x6, x5\n\t"
+      "sw x6, 0(x7)\n\t"
+      ".insn r %[custom0], 0, 0, x0, x0, x0\n\t"
+      "1: j 1b\n\t"
+      :
+      : [csr_wid] "i"(VX_CSR_WARP_ID),
+        [custom0] "i"(RISCV_CUSTOM0),
+        [tensor_base] "i"(WU_CASE_TENSOR_BASE)
+      : "memory");
+}
+
+extern "C" int wu_main() {
+  if (!wu_is_leader()) {
+    return 0;
+  }
+
+  wu_case_reset();
+  vx_spawn_tensor(vx_tensor_warp_mask(), tensor_worker);
+
+  if (wu_wait_seen_range(NUM_SCALAR_WARPS, NUM_WARPS, WU_CASE_TENSOR_BASE) != 0) {
+    wu_case_fail(0x02u);
+    return 1;
+  }
+
+  wu_case_pass();
+  return 0;
+}
--- a/kernels/wu_arch_cases/case03_dual_fetch_issue/Makefile
+++ b/kernels/wu_arch_cases/case03_dual_fetch_issue/Makefile
@@ -0,0 +1,3 @@
+PROJECT = wu_arch_case03_dual_fetch_issue
+
+include ../case.mk
--- a/kernels/wu_arch_cases/case03_dual_fetch_issue/README.md
+++ b/kernels/wu_arch_cases/case03_dual_fetch_issue/README.md
@@ -0,0 +1,16 @@
+# Case 03: Dual Fetch Issue
+
+## Test Objective
+
+Verify scalar and tensor domains can be active together and both make forward progress through the shared fetch path and split issue paths.
+
+## RTL Surface Covered
+
+- Shared fetch arbitration between scalar and tensor schedule streams
+- Scalar decode/issue domain
+- Tensor decode/issue domain
+- Completion markers from both warp classes
+
+## Expected Result
+
+All scalar warps write `WU_CASE_SCALAR_BASE | wid`, all tensor warps write `WU_CASE_TENSOR_BASE | wid`, and warp 0 writes `WU_CASE_PASS`.
--- a/kernels/wu_arch_cases/case03_dual_fetch_issue/kernel.cpp
+++ b/kernels/wu_arch_cases/case03_dual_fetch_issue/kernel.cpp
@@ -0,0 +1,59 @@
+#include "common_wu_min.h"
+
+extern "C" void scalar_worker() {
+  wu_short_delay(wu_wid());
+  wu_mark_seen(WU_CASE_SCALAR_BASE);
+  wu_stop_warp();
+}
+
+extern "C" void __attribute__((naked, noinline, used)) tensor_worker() {
+  asm volatile(
+      "csrr x5, %[csr_wid]\n\t"
+      "li x6, %[spin]\n\t"
+      "1:\n\t"
+      "addi x6, x6, -1\n\t"
+      "bnez x6, 1b\n\t"
+      "slli x6, x5, 2\n\t"
+      "la x7, g_seen\n\t"
+      "add x7, x7, x6\n\t"
+      "li x6, %[tensor_base]\n\t"
+      "or x6, x6, x5\n\t"
+      "sw x6, 0(x7)\n\t"
+      ".insn r %[custom0], 0, 0, x0, x0, x0\n\t"
+      "2: j 2b\n\t"
+      :
+      : [csr_wid] "i"(VX_CSR_WARP_ID),
+        [custom0] "i"(RISCV_CUSTOM0),
+        [spin] "i"(WU_CASE_SHORT_SPIN),
+        [tensor_base] "i"(WU_CASE_TENSOR_BASE)
+      : "memory");
+}
+
+extern "C" int wu_main() {
+  if (!wu_is_leader()) {
+    return 0;
+  }
+
+  wu_case_reset();
+
+  const uint32_t scalar_mask = wu_scalar_mask_without_warp0();
+  if (scalar_mask != 0) {
+    vx_spawn_scalar(scalar_mask, scalar_worker);
+  }
+  vx_spawn_tensor(vx_tensor_warp_mask(), tensor_worker);
+
+  wu_short_delay(0);
+  wu_mark_seen(WU_CASE_SCALAR_BASE);
+
+  if (wu_wait_seen_range(0, NUM_SCALAR_WARPS, WU_CASE_SCALAR_BASE) != 0) {
+    wu_case_fail(0x31u);
+    return 1;
+  }
+  if (wu_wait_seen_range(NUM_SCALAR_WARPS, NUM_WARPS, WU_CASE_TENSOR_BASE) != 0) {
+    wu_case_fail(0x32u);
+    return 1;
+  }
+
+  wu_case_pass();
+  return 0;
+}
--- a/kernels/wu_arch_cases/case04_scalar_barrier/Makefile
+++ b/kernels/wu_arch_cases/case04_scalar_barrier/Makefile
@@ -0,0 +1,3 @@
+PROJECT = wu_arch_case04_scalar_barrier
+
+include ../case.mk
--- a/kernels/wu_arch_cases/case04_scalar_barrier/README.md
+++ b/kernels/wu_arch_cases/case04_scalar_barrier/README.md
@@ -0,0 +1,15 @@
+# Case 04: Scalar Barrier
+
+## Test Objective
+
+Verify scalar-domain `BAR` synchronizes only scalar warps and releases them correctly.
+
+## RTL Surface Covered
+
+- Scalar WCTL barrier path
+- Scheduler scalar barrier mask handling
+- Scalar wakeup after barrier release
+
+## Expected Result
+
+All scalar warps pass `vx_barrier_scalar`, write `WU_CASE_SCALAR_BASE | wid`, and warp 0 writes `WU_CASE_PASS`.
--- a/kernels/wu_arch_cases/case04_scalar_barrier/kernel.cpp
+++ b/kernels/wu_arch_cases/case04_scalar_barrier/kernel.cpp
@@ -0,0 +1,33 @@
+#include "common_wu_min.h"
+
+#define CASE04_BARRIER_ID 0u
+
+extern "C" void scalar_worker() {
+  vx_barrier_scalar(CASE04_BARRIER_ID, NUM_SCALAR_WARPS);
+  wu_mark_seen(WU_CASE_SCALAR_BASE);
+  wu_stop_warp();
+}
+
+extern "C" int wu_main() {
+  if (!wu_is_leader()) {
+    return 0;
+  }
+
+  wu_case_reset();
+
+  const uint32_t scalar_mask = wu_scalar_mask_without_warp0();
+  if (scalar_mask != 0) {
+    vx_spawn_scalar(scalar_mask, scalar_worker);
+  }
+
+  vx_barrier_scalar(CASE04_BARRIER_ID, NUM_SCALAR_WARPS);
+  wu_mark_seen(WU_CASE_SCALAR_BASE);
+
+  if (wu_wait_seen_range(0, NUM_SCALAR_WARPS, WU_CASE_SCALAR_BASE) != 0) {
+    wu_case_fail(0x04u);
+    return 1;
+  }
+
+  wu_case_pass();
+  return 0;
+}
--- a/kernels/wu_arch_cases/case05_tensor_barrier/Makefile
+++ b/kernels/wu_arch_cases/case05_tensor_barrier/Makefile
@@ -0,0 +1,3 @@
+PROJECT = wu_arch_case05_tensor_barrier
+
+include ../case.mk
--- a/kernels/wu_arch_cases/case05_tensor_barrier/README.md
+++ b/kernels/wu_arch_cases/case05_tensor_barrier/README.md
@@ -0,0 +1,15 @@
+# Case 05: Tensor Barrier
+
+## Test Objective
+
+Verify tensor-domain `BAR` is handled by tensor control and releases tensor warps without relying on scalar SFU dispatch.
+
+## RTL Surface Covered
+
+- Tensor control barrier decode
+- Tensor warp-control merge into scheduler
+- Scheduler tensor-domain barrier mask handling
+
+## Expected Result
+
+All tensor warps pass the tensor-domain barrier, write `WU_CASE_TENSOR_BASE | wid`, and warp 0 writes `WU_CASE_PASS`.
--- a/kernels/wu_arch_cases/case05_tensor_barrier/kernel.cpp
+++ b/kernels/wu_arch_cases/case05_tensor_barrier/kernel.cpp
@@ -0,0 +1,45 @@
+#include "common_wu_min.h"
+
+#define CASE05_BARRIER_ID 1u
+
+extern "C" void __attribute__((naked, noinline, used)) tensor_worker() {
+  asm volatile(
+      "csrr x5, %[csr_wid]\n\t"
+      "li x1, (%[bar_id] | (%[domain_tensor] << %[domain_shift]))\n\t"
+      "li x2, %[num_tensor]\n\t"
+      ".insn r %[custom0], 4, 0, x0, x1, x2\n\t"
+      "slli x6, x5, 2\n\t"
+      "la x7, g_seen\n\t"
+      "add x7, x7, x6\n\t"
+      "li x6, %[tensor_base]\n\t"
+      "or x6, x6, x5\n\t"
+      "sw x6, 0(x7)\n\t"
+      ".insn r %[custom0], 0, 0, x0, x0, x0\n\t"
+      "1: j 1b\n\t"
+      :
+      : [csr_wid] "i"(VX_CSR_WARP_ID),
+        [custom0] "i"(RISCV_CUSTOM0),
+        [bar_id] "i"(CASE05_BARRIER_ID),
+        [domain_tensor] "i"(VX_BARRIER_DOMAIN_TENSOR),
+        [domain_shift] "i"(VX_BARRIER_DOMAIN_SHIFT),
+        [num_tensor] "i"(NUM_TENSOR_WARPS),
+        [tensor_base] "i"(WU_CASE_TENSOR_BASE)
+      : "memory");
+}
+
+extern "C" int wu_main() {
+  if (!wu_is_leader()) {
+    return 0;
+  }
+
+  wu_case_reset();
+  vx_spawn_tensor(vx_tensor_warp_mask(), tensor_worker);
+
+  if (wu_wait_seen_range(NUM_SCALAR_WARPS, NUM_WARPS, WU_CASE_TENSOR_BASE) != 0) {
+    wu_case_fail(0x05u);
+    return 1;
+  }
+
+  wu_case_pass();
+  return 0;
+}
--- a/kernels/wu_arch_cases/case06_masked_barrier/Makefile
+++ b/kernels/wu_arch_cases/case06_masked_barrier/Makefile
@@ -0,0 +1,3 @@
+PROJECT = wu_arch_case06_masked_barrier
+
+include ../case.mk
--- a/kernels/wu_arch_cases/case06_masked_barrier/README.md
+++ b/kernels/wu_arch_cases/case06_masked_barrier/README.md
@@ -0,0 +1,16 @@
+# Case 06: Masked Barrier
+
+## Test Objective
+
+Verify `BAR_MASK` can synchronize an explicit mixed mask containing scalar warp 0 and all tensor warps.
+
+## RTL Surface Covered
+
+- Scalar-side masked barrier issue
+- Tensor-side masked barrier issue through tensor control
+- Scheduler explicit barrier mask release
+- Scalar/tensor warp-control merge when both domains participate in one barrier
+
+## Expected Result
+
+Scalar warp 0 and all tensor warps pass the same `BAR_MASK`; tensor warps write `WU_CASE_TENSOR_BASE | wid`, and warp 0 writes `WU_CASE_PASS`.
--- a/kernels/wu_arch_cases/case06_masked_barrier/kernel.cpp
+++ b/kernels/wu_arch_cases/case06_masked_barrier/kernel.cpp
@@ -0,0 +1,47 @@
+#include "common_wu_min.h"
+
+#define CASE06_BARRIER_ID 2u
+#define CASE06_BARRIER_MASK (vx_tensor_warp_mask() | 1u)
+
+extern "C" void __attribute__((naked, noinline, used)) tensor_worker() {
+  asm volatile(
+      "csrr x5, %[csr_wid]\n\t"
+      "li x1, %[bar_id]\n\t"
+      "li x2, %[mask]\n\t"
+      ".insn r %[custom0], 7, 0, x0, x1, x2\n\t"
+      "slli x6, x5, 2\n\t"
+      "la x7, g_seen\n\t"
+      "add x7, x7, x6\n\t"
+      "li x6, %[tensor_base]\n\t"
+      "or x6, x6, x5\n\t"
+      "sw x6, 0(x7)\n\t"
+      ".insn r %[custom0], 0, 0, x0, x0, x0\n\t"
+      "1: j 1b\n\t"
+      :
+      : [csr_wid] "i"(VX_CSR_WARP_ID),
+        [custom0] "i"(RISCV_CUSTOM0),
+        [bar_id] "i"(CASE06_BARRIER_ID),
+        [mask] "i"(((1u << NUM_TENSOR_WARPS) - 1u) << NUM_SCALAR_WARPS | 1u),
+        [tensor_base] "i"(WU_CASE_TENSOR_BASE)
+      : "memory");
+}
+
+extern "C" int wu_main() {
+  if (!wu_is_leader()) {
+    return 0;
+  }
+
+  wu_case_reset();
+  vx_spawn_tensor(vx_tensor_warp_mask(), tensor_worker);
+
+  vx_barrier_mask(CASE06_BARRIER_ID, CASE06_BARRIER_MASK);
+  wu_mark_seen(WU_CASE_SCALAR_BASE);
+
+  if (wu_wait_seen_mask(vx_tensor_warp_mask(), WU_CASE_TENSOR_BASE) != 0) {
+    wu_case_fail(0x06u);
+    return 1;
+  }
+
+  wu_case_pass();
+  return 0;
+}
--- a/kernels/wu_arch_cases/case07_tensor_csr_tmc/Makefile
+++ b/kernels/wu_arch_cases/case07_tensor_csr_tmc/Makefile
@@ -0,0 +1,3 @@
+PROJECT = wu_arch_case07_tensor_csr_tmc
+
+include ../case.mk
--- a/kernels/wu_arch_cases/case07_tensor_csr_tmc/README.md
+++ b/kernels/wu_arch_cases/case07_tensor_csr_tmc/README.md
@@ -0,0 +1,15 @@
+# Case 07: Tensor CSR TMC
+
+## Test Objective
+
+Verify tensor control handles legal tensor-domain CSR reads and `TMC` operations without involving barrier behavior.
+
+## RTL Surface Covered
+
+- Tensor CSRRS path for `VX_CSR_WARP_ID`
+- Tensor TMC path setting a single active lane
+- Tensor control completion and tensor-domain stop
+
+## Expected Result
+
+Every tensor warp writes `WU_CASE_TENSOR_CSR_BASE | wid` to `g_seen[wid]`, and warp 0 writes `WU_CASE_PASS`.
--- a/kernels/wu_arch_cases/case07_tensor_csr_tmc/kernel.cpp
+++ b/kernels/wu_arch_cases/case07_tensor_csr_tmc/kernel.cpp
@@ -0,0 +1,38 @@
+#include "common_wu_min.h"
+
+extern "C" void __attribute__((naked, noinline, used)) tensor_worker() {
+  asm volatile(
+      "csrr x5, %[csr_wid]\n\t"
+      "li x6, 1\n\t"
+      ".insn r %[custom0], 0, 0, x0, x6, x0\n\t"
+      "slli x6, x5, 2\n\t"
+      "la x7, g_seen\n\t"
+      "add x7, x7, x6\n\t"
+      "li x6, %[tensor_csr_base]\n\t"
+      "or x6, x6, x5\n\t"
+      "sw x6, 0(x7)\n\t"
+      ".insn r %[custom0], 0, 0, x0, x0, x0\n\t"
+      "1: j 1b\n\t"
+      :
+      : [csr_wid] "i"(VX_CSR_WARP_ID),
+        [custom0] "i"(RISCV_CUSTOM0),
+        [tensor_csr_base] "i"(WU_CASE_TENSOR_CSR_BASE)
+      : "memory");
+}
+
+extern "C" int wu_main() {
+  if (!wu_is_leader()) {
+    return 0;
+  }
+
+  wu_case_reset();
+  vx_spawn_tensor(vx_tensor_warp_mask(), tensor_worker);
+
+  if (wu_wait_seen_range(NUM_SCALAR_WARPS, NUM_WARPS, WU_CASE_TENSOR_CSR_BASE) != 0) {
+    wu_case_fail(0x07u);
+    return 1;
+  }
+
+  wu_case_pass();
+  return 0;
+}
--- a/kernels/wu_arch_cases/case08_tensor_lsu_optional/Makefile
+++ b/kernels/wu_arch_cases/case08_tensor_lsu_optional/Makefile
@@ -0,0 +1,3 @@
+PROJECT = wu_arch_case08_tensor_lsu_optional
+
+include ../case.mk
--- a/kernels/wu_arch_cases/case08_tensor_lsu_optional/README.md
+++ b/kernels/wu_arch_cases/case08_tensor_lsu_optional/README.md
@@ -0,0 +1,15 @@
+# Case 08: Tensor LSU Optional
+
+## Test Objective
+
+Verify tensor-domain LSU can store and reload a small per-warp value. This is intentionally last because memory hierarchy interaction is slower and has a larger debug surface.
+
+## RTL Surface Covered
+
+- Tensor LSU dispatch
+- Tensor LSU response/writeback path
+- Shared memory hierarchy merge after tensor-domain issue
+
+## Expected Result
+
+Every tensor warp stores and reloads `WU_CASE_TENSOR_LSU_BASE | wid`, writes that value to `g_seen[wid]`, and warp 0 writes `WU_CASE_PASS`.
--- a/kernels/wu_arch_cases/case08_tensor_lsu_optional/kernel.cpp
+++ b/kernels/wu_arch_cases/case08_tensor_lsu_optional/kernel.cpp
@@ -0,0 +1,40 @@
+#include "common_wu_min.h"
+
+extern "C" void __attribute__((naked, noinline, used)) tensor_worker() {
+  asm volatile(
+      "csrr x5, %[csr_wid]\n\t"
+      "slli x6, x5, 2\n\t"
+      "la x7, g_case_mem\n\t"
+      "add x7, x7, x6\n\t"
+      "li x8, %[tensor_lsu_base]\n\t"
+      "or x8, x8, x5\n\t"
+      "sw x8, 0(x7)\n\t"
+      "lw x8, 0(x7)\n\t"
+      "la x7, g_seen\n\t"
+      "add x7, x7, x6\n\t"
+      "sw x8, 0(x7)\n\t"
+      ".insn r %[custom0], 0, 0, x0, x0, x0\n\t"
+      "1: j 1b\n\t"
+      :
+      : [csr_wid] "i"(VX_CSR_WARP_ID),
+        [custom0] "i"(RISCV_CUSTOM0),
+        [tensor_lsu_base] "i"(WU_CASE_TENSOR_LSU_BASE)
+      : "memory");
+}
+
+extern "C" int wu_main() {
+  if (!wu_is_leader()) {
+    return 0;
+  }
+
+  wu_case_reset();
+  vx_spawn_tensor(vx_tensor_warp_mask(), tensor_worker);
+
+  if (wu_wait_seen_range(NUM_SCALAR_WARPS, NUM_WARPS, WU_CASE_TENSOR_LSU_BASE) != 0) {
+    wu_case_fail(0x08u);
+    return 1;
+  }
+
+  wu_case_pass();
+  return 0;
+}
--- a/kernels/wu_arch_cases/common_wu_min.h
+++ b/kernels/wu_arch_cases/common_wu_min.h
@@ -0,0 +1,141 @@
+#ifndef WU_ARCH_CASES_COMMON_WU_MIN_H
+#define WU_ARCH_CASES_COMMON_WU_MIN_H
+
+#include <stdint.h>
+#include <vx_intrinsics.h>
+
+#define WU_CASE_MAX_WARPS 8u
+#define WU_CASE_WAIT_SPIN 1024u
+#define WU_CASE_SHORT_SPIN 8u
+
+#define WU_CASE_PASS 0x600du
+#define WU_CASE_FAIL_BASE 0xe000u
+#define WU_CASE_SCALAR_BASE 0x5100u
+#define WU_CASE_TENSOR_BASE 0x7100u
+#define WU_CASE_TENSOR_CSR_BASE 0x7300u
+#define WU_CASE_TENSOR_LSU_BASE 0x7400u
+
+extern "C" {
+volatile uint32_t g_status[WU_CASE_MAX_WARPS] __attribute__((aligned(32)));
+volatile uint32_t g_seen[WU_CASE_MAX_WARPS] __attribute__((aligned(32)));
+volatile uint32_t g_aux[WU_CASE_MAX_WARPS] __attribute__((aligned(32)));
+volatile uint32_t g_case_mem[WU_CASE_MAX_WARPS] __attribute__((aligned(32)));
+extern volatile uint64_t tohost;
+}
+
+extern "C" void vx_perf_dump() {}
+extern "C" int wu_main();
+
+extern "C" void __attribute__((naked, section(".init"), used)) _start() {
+  asm volatile(
+      ".option push\n\t"
+      ".option norelax\n\t"
+      "la gp, __global_pointer\n\t"
+      ".option pop\n\t"
+      "csrr t0, %[csr_core]\n\t"
+      "bnez t0, 2f\n\t"
+      "li sp, %[stack_base]\n\t"
+      "call wu_main\n\t"
+      "mv gp, a0\n\t"
+      "2:\n\t"
+      ".insn r %[custom0], 0, 0, x0, x0, x0\n\t"
+      "1: j 1b\n\t"
+      :
+      : [csr_core] "i"(VX_CSR_CORE_ID),
+        [stack_base] "i"(STACK_BASE_ADDR),
+        [custom0] "i"(RISCV_CUSTOM0)
+      : "memory");
+}
+
+static inline uint32_t wu_wid() {
+  return static_cast<uint32_t>(vx_warp_id());
+}
+
+static inline uint32_t wu_tid() {
+  return static_cast<uint32_t>(vx_thread_id());
+}
+
+static inline uint32_t wu_scalar_mask_without_warp0() {
+  return vx_scalar_warp_mask() & ~1u;
+}
+
+static inline void wu_case_reset() {
+  for (uint32_t i = 0; i < WU_CASE_MAX_WARPS; ++i) {
+    g_status[i] = 0;
+    g_seen[i] = 0;
+    g_aux[i] = 0;
+    g_case_mem[i] = 0;
+  }
+}
+
+static inline void wu_short_delay(uint32_t seed) {
+  volatile uint32_t value = seed + 1u;
+  for (uint32_t i = 0; i < WU_CASE_SHORT_SPIN; ++i) {
+    value = (value << 1) ^ (seed + i);
+  }
+  if (wu_wid() < WU_CASE_MAX_WARPS && wu_tid() == 0) {
+    g_aux[wu_wid()] = value;
+  }
+}
+
+static inline int wu_wait_seen_range(uint32_t first_wid, uint32_t end_wid, uint32_t base) {
+  for (uint32_t spin = 0; spin < WU_CASE_WAIT_SPIN; ++spin) {
+    uint32_t done = 1;
+    for (uint32_t wid = first_wid; wid < end_wid; ++wid) {
+      done &= (g_seen[wid] == (base | wid));
+    }
+    if (done) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
+static inline int wu_wait_seen_mask(uint32_t warp_mask, uint32_t base) {
+  for (uint32_t spin = 0; spin < WU_CASE_WAIT_SPIN; ++spin) {
+    uint32_t done = 1;
+    for (uint32_t wid = 0; wid < WU_CASE_MAX_WARPS; ++wid) {
+      if (warp_mask & (1u << wid)) {
+        done &= (g_seen[wid] == (base | wid));
+      }
+    }
+    if (done) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
+static inline void wu_mark_seen(uint32_t base) {
+  const uint32_t wid = wu_wid();
+  if (wu_tid() == 0 && wid < WU_CASE_MAX_WARPS) {
+    g_seen[wid] = base | wid;
+  }
+}
+
+static inline void wu_stop_warp() {
+  vx_tmc_zero();
+  while (1) {}
+}
+
+static inline int wu_is_leader() {
+  return vx_core_id() == 0 && vx_warp_id() == 0 && vx_thread_id() == 0;
+}
+
+static inline void wu_report_tohost(uint32_t exit_code) {
+  asm volatile("fence rw, rw" ::: "memory");
+  tohost = (static_cast<uint64_t>(exit_code) << 1) | 1u;
+  asm volatile("fence rw, rw" ::: "memory");
+}
+
+static inline void wu_case_pass() {
+  g_status[0] = WU_CASE_PASS;
+  wu_report_tohost(0);
+}
+
+static inline void wu_case_fail(uint32_t code) {
+  g_status[0] = WU_CASE_FAIL_BASE | code;
+  wu_report_tohost(code ? code : 1u);
+}
+
+#endif
--- a/kernels/wu_arch_cases/zero.bin
+++ b/kernels/wu_arch_cases/zero.bin
@@ -0,0 +1 @@
+0
--- a/lib/include/VX_types.h
+++ b/lib/include/VX_types.h
@@ -184,10 +184,10 @@
 #define VX_CSR_CORE_ID                  0xCC2
 #define VX_CSR_WARP_MASK                0xCC3
 #define VX_CSR_THREAD_MASK              0xCC4     // warning! this value is also used in LLVM
+#define VX_CSR_GCID                     0xCC5     // legacy global core id alias used by Radiance bootrom

 #define VX_CSR_NUM_THREADS              0xFC0
 #define VX_CSR_NUM_WARPS                0xFC1
 #define VX_CSR_NUM_CORES                0xFC2

 #endif // VX_TYPES_VH
-
--- a/lib/linker/vx_link64.ld
+++ b/lib/linker/vx_link64.ld
@@ -7,6 +7,15 @@ OUTPUT_FORMAT("elf64-littleriscv", "elf64-littleriscv",
 	      "elf64-littleriscv")
 OUTPUT_ARCH(riscv)
 ENTRY(_start)
+
+MEMORY {
+  DRAM0    (rwx): ORIGIN = 0x80000000, LENGTH = 512M
+  DRAMARG  (rwx): ORIGIN = 0x9fff0000, LENGTH = 8K
+  DRAM1    (rwx): ORIGIN = 0xa0000000, LENGTH = 16M
+  DRAM2    (rwx): ORIGIN = 0xa1000000, LENGTH = 16M
+  DRAM3    (rwx): ORIGIN = 0xa2000000, LENGTH = 16M
+}
+
 SECTIONS
 {
  . = STARTUP_ADDR;
@@ -249,4 +258,20 @@ SECTIONS
  .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
  /DISCARD/ : { *(.note.GNU-stack) *(.gnu_debuglink) *(.gnu.lto_*) }

+  .args : {
+    *(.args)
+    . += 8K;
+  } > DRAMARG
+  .operand.a : {
+    *(.operand.a)
+    . += 32K;
+  } > DRAM1
+  .operand.b : {
+    *(.operand.b)
+    . += 32K;
+  } > DRAM2
+  .operand.c : {
+    *(.operand.c)
+    . += 32K;
+  } > DRAM3
 }
--- a/scripts/toolchain_env.sh
+++ b/scripts/toolchain_env.sh
@@ -1,21 +1,23 @@
-#!/bin/sh
+#!/bin/bash

 # Copyright 2023 blaise
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

-TOOLDIR=${TOOLDIR:=$HOME/build/vortex-toolchain-prebuilt}
-export TOOLDIR
+ENV_SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+AE_TOOLCHAIN_DIR="$(realpath ${ENV_SCRIPT_DIR}/../../toolchain)"
+
+export TOOLDIR=${AE_TOOLCHAIN_DIR}/vortex-toolchain-prebuilt

 export VERILATOR_ROOT=$TOOLDIR/verilator
 export PATH=$VERILATOR_ROOT/bin:$PATH
@@ -27,7 +29,7 @@ export YOSYS_PATH=$TOOLDIR/yosys
 export PATH=$YOSYS_PATH/bin:$PATH

 # LLVM_POCL seems to be only used in tests/opencl
-export LLVM_POCL=/home/virgo-ae/build/llvm-vortex2
-export LLVM_VORTEX=/home/virgo-ae/build/llvm-vortex2
-export POCL_CC_PATH=/home/virgo-ae/build/pocl-vortex2/compiler
-export POCL_RT_PATH=/home/virgo-ae/build/pocl-vortex2/runtime
+export LLVM_POCL=${AE_TOOLCHAIN_DIR}/llvm-vortex2
+export LLVM_VORTEX=${AE_TOOLCHAIN_DIR}/llvm-vortex2
+export POCL_CC_PATH=${AE_TOOLCHAIN_DIR}/pocl-vortex2/compiler
+export POCL_RT_PATH=${AE_TOOLCHAIN_DIR}/pocl-vortex2/runtime
Author	SHA1	Message	Date
abnerhexu	e7229dae27	Checkpoint wu arch cases before scalar spawn wrapper	2026-05-24 10:51:58 +08:00
abnerhexu	8f7dba5920	Add Blackwell instructions test kernel and update linker script - Add kernels/blackwell_insts/ with test kernel and input data - Update linker script with extended memory layout - Remove obsolete sgemm_tcore_blackwell Makefile - Update VX_types.h and common.h	2026-05-06 14:50:28 +08:00
abnerhexu	bcc566b621	Add Blackwell SGEMM kernel scaffolding	2026-04-25 10:15:31 +08:00
Virgo-AE Eval	71f713b9fc	Disable git pull for archive Only use local refs in the archive for reproducibility.	2025-02-07 14:51:25 -08:00
Richard Yan	9847072eff	fix hexadecile	2025-01-31 02:02:18 -08:00
Richard Yan	f8c51669c1	fix toolchain env sh	2025-01-30 21:17:12 -08:00
Richard Yan	17a9d31be5	fix dma invocation	2025-01-30 15:33:58 -08:00
Hansung Kim	238b942133	Add missing library remake	2025-01-30 13:24:23 -08:00
Hansung Kim	2c1ac4e938	Do git pull to make sure up-to-date	2025-01-30 01:47:35 -08:00
Richard Yan	9cdee597b6	Merge branch 'ae' of https://github.com/richardyrh/virgo-kernels into ae	2025-01-30 01:34:29 -08:00
Hansung Kim	6bdc6af607	Fix branch name and dims for flash script	2025-01-30 01:15:57 -08:00
Hansung Kim	b73147cd06	Add compile and operand generate script for flash	2025-01-30 01:04:20 -08:00
Hansung Kim	471f89e371	Add arg binary for flash	2025-01-30 01:02:12 -08:00
Hansung Kim	7e1fc54c97	Fix typo in path	2025-01-30 00:41:42 -08:00
Hansung Kim	50c8f1c410	Add operand generate script for tcore	2025-01-29 23:33:09 -08:00