diff --git a/.gitmodules b/.gitmodules
index af1d1a47..6bc2bb4c 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,3 +7,6 @@
 [submodule "third_party/ramulator"]
 	path = third_party/ramulator
 	url = https://github.com/CMU-SAFARI/ramulator.git
+[submodule "third_party/gemmini-rocc-tests"]
+	path = third_party/gemmini-rocc-tests
+	url = https://github.com/ucb-bar/gemmini-rocc-tests
diff --git a/ci/toolchain_env.sh b/ci/toolchain_env.sh
index 440a899e..3d4e2d41 100644
--- a/ci/toolchain_env.sh
+++ b/ci/toolchain_env.sh
@@ -24,3 +24,7 @@ export PATH=$SV2V_PATH/bin:$PATH
 
 export YOSYS_PATH=$TOOLDIR/yosys
 export PATH=$YOSYS_PATH/bin:$PATH
+
+export LLVM_VORTEX=$TOOLDIR/llvm-vortex
+export POCL_CC_PATH=$TOOLDIR/pocl/compiler
+export POCL_RT_PATH=$TOOLDIR/pocl/runtime
\ No newline at end of file
diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv
index 41f54b95..face7883 100644
--- a/hw/rtl/core/VX_core.sv
+++ b/hw/rtl/core/VX_core.sv
@@ -45,7 +45,7 @@ module VX_core import VX_gpu_pkg::*; #(
     output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
 
     // Status
-    output wire             busy
+    output wire             busy    //stays 1 when busy, 0 when done (termination) detect the negative edge
 );
     VX_schedule_if      schedule_if();
     VX_fetch_if         fetch_if();
@@ -272,7 +272,7 @@ module VX_core import VX_gpu_pkg::*; #(
 
 `endif
 
-`ifdef PERF_ENABLE
+`ifdef PERF_ENABLE  // expose these perf counter to console using $display, %time; flag: --perf=0?
 
     wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
     wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;
@@ -345,7 +345,57 @@ module VX_core import VX_gpu_pkg::*; #(
     assign pipeline_perf_if.stores = perf_stores;
     assign pipeline_perf_if.load_latency = perf_dcache_lat;
     assign pipeline_perf_if.ifetch_latency = perf_icache_lat;
-    assign pipeline_perf_if.load_latency = perf_dcache_lat;
+    // NOTE: a 'real x = sig;' declaration only initializes x once at time 0 (IEEE 1800 10.5), so each ratio operand is driven by an assign.
+    real instrs;             assign instrs = commit_csr_if.instret;
+    real cycles;             assign cycles = sched_csr_if.cycles;
+    real icache_lat;         assign icache_lat = perf_icache_lat;
+    real ifetches;           assign ifetches = perf_ifetches;
+    real dcache_lat;         assign dcache_lat = perf_dcache_lat;
+    real loads;              assign loads = perf_loads;
+    real scheduler_idles;    assign scheduler_idles = pipeline_perf_if.sched_idles;
+    real scheduler_stalls;   assign scheduler_stalls = pipeline_perf_if.sched_stalls;
+    real ibuf_stalls;        assign ibuf_stalls = pipeline_perf_if.ibf_stalls;
+    real scrb_alu_per_core;  assign scrb_alu_per_core = pipeline_perf_if.units_uses[`EX_ALU];
+    real scrb_fpu_per_core;  assign scrb_fpu_per_core = pipeline_perf_if.units_uses[`EX_FPU];
+    real scrb_lsu_per_core;  assign scrb_lsu_per_core = pipeline_perf_if.units_uses[`EX_LSU];
+    real scrb_sfu_per_core;  assign scrb_sfu_per_core = pipeline_perf_if.units_uses[`EX_SFU];
+    real scrb_tot;           assign scrb_tot = scrb_alu_per_core+scrb_fpu_per_core+scrb_lsu_per_core+scrb_sfu_per_core;
+    real scrb_wctl_per_core; assign scrb_wctl_per_core = pipeline_perf_if.sfu_uses[`SFU_WCTL];
+    real scrb_csrs_per_core; assign scrb_csrs_per_core = pipeline_perf_if.sfu_uses[`SFU_CSRS];
+    real sfu_tot;            assign sfu_tot = scrb_wctl_per_core+scrb_csrs_per_core;
+    
+    always @(negedge busy) begin
+        if (!reset) begin
+        $display("====================CORE : %d===================",CORE_ID);
+        $display("time : %t", $time);
+        $display("perf_dcache_rd_req_per_cycle: %d", perf_dcache_rd_req_per_cycle);
+        $display("perf_dcache_wr_req_per_cycle: %d", perf_dcache_wr_req_per_cycle);
+        $display("perf_dcache_rsp_per_cycle: %d", perf_dcache_rsp_per_cycle);
+        $display("perf_icache_pending_read_cycle: %d", perf_icache_pending_read_cycle);
+        $display("perf_dcache_pending_read_cycle: %d", perf_dcache_pending_read_cycle);
+        $display("perf_icache_pending_reads: %d", perf_icache_pending_reads);
+        $display("perf_dcache_pending_reads: %d", perf_dcache_pending_reads);
+        $display("perf_icache_req_fire: %b", perf_icache_req_fire);
+        $display("perf_icache_rsp_fire: %b", perf_icache_rsp_fire);
+        $display("perf_dcache_rd_req_fire: %b", perf_dcache_rd_req_fire);
+        $display("perf_dcache_rd_req_fire_r: %b", perf_dcache_rd_req_fire_r);
+        $display("perf_dcache_wr_req_fire: %b", perf_dcache_wr_req_fire);
+        $display("perf_dcache_wr_req_fire_r: %b", perf_dcache_wr_req_fire_r);
+        $display("perf_dcache_rsp_fire: %b", perf_dcache_rsp_fire);
+
+        $display("Instructions: %d, Cycles: %d, IPC: %f", commit_csr_if.instret, sched_csr_if.cycles, instrs/cycles);
+        $display("scheduler idle: %d (%f)", pipeline_perf_if.sched_idles, scheduler_idles/cycles);
+        $display("scheduler stalls: %d (%f)", pipeline_perf_if.sched_stalls, scheduler_stalls/cycles);
+        $display("ibuffer stalls: %d (%f)",pipeline_perf_if.ibf_stalls, ibuf_stalls/cycles);
+        $display("issue stalls: %d(alu=%f, fpu=%f, lsu=%f, sfu=%f)",pipeline_perf_if.scb_stalls, scrb_alu_per_core/scrb_tot, scrb_fpu_per_core/scrb_tot, scrb_lsu_per_core/scrb_tot, scrb_sfu_per_core/scrb_tot);
+        $display("sfu stalls: %d (scrs=%f, wctl=%f)",pipeline_perf_if.units_uses[`EX_SFU], scrb_csrs_per_core/sfu_tot, scrb_wctl_per_core/sfu_tot);
+        $display("ifetches: %d", perf_ifetches);
+        $display("ifetch latency: %f Cycles", icache_lat/ifetches);
+        $display("loads: %d", perf_loads);
+        $display("load latency: %f Cycles", dcache_lat/loads);
+        $display("stores: %d", perf_stores);
+        end
+    end
 
 `endif
 
diff --git a/kernel/Makefile b/kernel/Makefile
index 07b8c97b..575707f8 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -51,10 +51,10 @@ $(PROJECT).dump: $(PROJECT).a
 %.S.o: src/%.S
 	$(CC) $(CFLAGS) -c $< -o $@
 
-%.cpp.o: src/%.cpp
+%.cpp.o: src/%.cpp include/vx_spawn.h
 	$(CXX) $(CFLAGS) -c $< -o $@
 
-%.c.o: src/%.c
+%.c.o: src/%.c include/vx_spawn.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
 $(PROJECT).a: $(OBJS)
diff --git a/kernel/include/gemmini_mmio.h b/kernel/include/gemmini_mmio.h
new file mode 100644
index 00000000..072fa8fc
--- /dev/null
+++ b/kernel/include/gemmini_mmio.h
@@ -0,0 +1,164 @@
+#ifndef GEMMINI_MMIO_H
+#define GEMMINI_MMIO_H
+#ifndef GEMMINI_PARAMS_H
+#error INCLUDE GEMMINI.H FIRST
+#endif
+
+#define SMEM_BASE 0xff000000
+#define SMEM_SIZE 0x4000
+#define SMEM_MASK (SMEM_SIZE - 1)
+#define SMEM_ADDR_END 0xff008000
+
+#define SPAD_BASE 0x0
+#define SPAD_ROW_SIZE (DIM * sizeof(elem_t))
+#define SPAD_NUM_ROWS (SMEM_SIZE / SPAD_ROW_SIZE)
+#define SPAD_MASK (SPAD_NUM_ROWS - 1)
+
+#define PRINT_BUF ((char *) (SMEM_ADDR_END))
+#define GEMMINI_RS1_ADDR 0xff007010
+#define GEMMINI_RS2_ADDR 0xff007018
+#define GEMMINI_INST_ADDR 0xff007000
+#define GEMMINI_BUSY_ADDR 0xff007020
+
+#define SMEM_TO_SPAD(smem_addr) (SPAD_BASE + ((smem_addr) & SMEM_MASK) / SPAD_ROW_SIZE)
+#define SPAD_TO_SMEM(spad_addr) (SMEM_BASE + ((spad_addr) & SPAD_MASK) * SPAD_ROW_SIZE)
+
+// Convert a row-major element index (i, j) of a matrix with J columns into its
+// element offset in the tiled layout, where each DIM x DIM tile is contiguous:
+//   tiles_before = (i / DIM) * (J / DIM) + (j / DIM)
+//   offset       = tiles_before * DIM * DIM + (i % DIM) * DIM + (j % DIM)
+// (J) / DIM is parenthesized so the macro follows this formula for every J.
+#define SMEM_MAT_OFFSET(i, j, J) \
+    (((i) / DIM * ((J) / DIM) + (j) / DIM) * DIM * DIM + ((i) % DIM) * DIM + ((j) % DIM))
+
+// #define fence() { for (int i = 0; i < 10; i++) *((volatile uint32_t *) (0xFFFF0000)) = 0xdeadbeef; }
+#undef gemmini_fence
+#define gemmini_fence() { while (*((volatile uint32_t *) GEMMINI_BUSY_ADDR)) asm volatile ("nop"); }
+
+#undef ROCC_INSTRUCTION_RS1_RS2
+#define ROCC_INSTRUCTION_RS1_RS2(x, rs1, rs2, funct) { \
+    /* Memory-mapped form of the macro: x is unused; rs1/rs2 are stored as */ \
+    *((volatile uint64_t *) GEMMINI_RS1_ADDR) = (rs1); \
+    *((volatile uint64_t *) GEMMINI_RS2_ADDR) = (rs2); \
+    /* 64-bit values first, then the 32-bit word is stored to GEMMINI_INST_ADDR. */ \
+    /* funct is widened to uint32_t so funct >= 64 cannot overflow a signed int. */ \
+    /* gemmini_fence(); */ \
+    *((volatile uint32_t*) GEMMINI_INST_ADDR) = (0x7B) | (0 << 7) | (3 << 12) | (1 << 15) | (2 << 20) | ((uint32_t) (funct) << 25); \
+    /* sprintf((char *) PRINT_BUF, "%llx %llx %d\n", rs1, rs2, funct); */ \
+}
+
+#define sp_tiled_matmul_full_spad_ws(A_sp_addr_start, B_sp_addr_start, D_sp_addr_start, C_dst_sp_addr_start,\
+  I, J, K, pad_I, pad_J, pad_K, a_transpose, b_transpose, full_C, low_D, acc, act, skips) \
+  gemmini_loop_ws_spad(I, J, K, pad_I, pad_J, pad_K, A_sp_addr_start, (B_sp_addr_start) + (K) * (J) * DIM, NULL, \
+  C_dst_sp_addr_start, a_transpose, b_transpose, full_C, low_D, acc, act, 0, 0, false, skips)
+
+/* inline static void sp_tiled_matmul_full_spad_ws(const uint32_t A_sp_addr_start, const uint32_t B_sp_addr_start,
+                                                const uint32_t D_sp_addr_start, const uint32_t C_dst_sp_addr_start,
+                                                size_t I, size_t J, size_t K, size_t pad_I, size_t pad_J, size_t pad_K,
+                                                bool a_transpose, bool b_transpose,
+                                                bool full_C, bool low_D, bool acc,
+                                                int act, int skip_mvout) {
+
+  gemmini_loop_ws_spad(I, J, K, pad_I, pad_J, pad_K,
+                       A_sp_addr_start, B_sp_addr_start + K * J * DIM, NULL, C_dst_sp_addr_start,
+                       a_transpose, b_transpose,
+                       full_C, low_D, acc,
+                       act, 0, 0, false, skip_mvout); */
+  /*
+  return;
+
+
+  // const uint32_t A_sp_addr_start = 0;
+  // const uint32_t B_sp_addr_start = BANK_NUM * BANK_ROWS - K * J * DIM;
+  // const uint32_t D_sp_addr_start = 1 << (ADDR_LEN-1);
+  const uint32_t C_sp_addr_start = 2 << (ADDR_LEN-2) | (full_C << (ADDR_LEN-3));
+  // const int D_blocks = low_D ? (J <= MAX_BLOCK_LEN ? J : MAX_BLOCK_LEN) :
+  //   (J <= MAX_BLOCK_LEN_ACC ? J : MAX_BLOCK_LEN_ACC);
+  const int C_blocks = 1; //full_C ? 1 : (J <= MAX_BLOCK_LEN ? J : MAX_BLOCK_LEN);
+  // const size_t sizeof_D = low_D ? sizeof(elem_t) : sizeof(acc_t);
+  const size_t sizeof_C = full_C ? sizeof(acc_t) : sizeof(elem_t);
+  gemmini_fence();
+
+  if (a_transpose || b_transpose || (I < 4)) {
+    for (size_t k = 0; k < K; k++) {
+      for (size_t j = 0; j < J; j++) {
+        for (size_t i = 0; i < I; i++) {
+          const uint32_t A_sp_addr = a_transpose ? (A_sp_addr_start + (k*I + i)*DIM) :
+            (A_sp_addr_start + (i*K + k)*DIM);
+          const uint32_t B_sp_addr = b_transpose ? (B_sp_addr_start + (j*K + k)*DIM) :
+            (B_sp_addr_start + (k*J + j)*DIM);
+          const uint32_t C_sp_addr = C_sp_addr_start + (i*J + j)*DIM;
+          // Compute
+          uint32_t pre_sp_addr = i == 0 ? B_sp_addr : GARBAGE_ADDR;
+          uint32_t out_sp_addr = C_sp_addr | ((k == 0 ? 0 : 1) << (ADDR_LEN-2));
+          gemmini_extended_preload(pre_sp_addr, out_sp_addr, DIM, DIM, DIM, DIM);
+          if (i == 0) { // First iteration
+            gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
+          } else { // All other iterations
+            gemmini_extended_compute_accumulated(A_sp_addr, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
+          }
+          if (k == K - 1) {
+            // Move-out C (if not normalizing)
+            // if (((act != LAYERNORM) && (act != SOFTMAX)) && (j == J-1 || j % C_blocks == C_blocks-1)) {
+              const size_t rounded_j = j; // (j / C_blocks) * C_blocks;
+              const uint32_t rounded_C_sp_addr = C_sp_addr; // C_sp_addr_start + (i*J + rounded_j)*DIM;
+
+              const uint32_t C_dst_sp_addr = ((uint32_t) C_dst_sp_addr_start) + (i * J + rounded_j) * DIM; // * DIM * sizeof_C;
+
+              // const size_t blocks = rounded_j + C_blocks <= J ? C_blocks : J-rounded_j;
+              constexpr size_t cols = DIM; // blocks * DIM - (rounded_j + blocks >= J ? pad_J : 0);
+              constexpr size_t rows = DIM; // DIM - (i == I - 1 ? pad_I : 0);
+
+              gemmini_extended_mvout_spad(C_dst_sp_addr, 1, rounded_C_sp_addr, cols, rows);
+            // }
+          }
+        }
+      }
+    }
+  } else {
+    for (size_t k = 0; k < K; k++) {
+      for (size_t j = 0; j < J; j++) {
+        uint32_t A_sp_addr = A_sp_addr_start + k * DIM; // (i*K + k)*DIM;
+        const uint32_t B_sp_addr = B_sp_addr_start + (k*J + j)*DIM;
+        uint32_t C_sp_addr = C_sp_addr_start + j * DIM; // (i*J + j)*DIM;
+        for (size_t i = 0; i < I; i += 4) {
+          // Compute
+          // constexpr uint32_t pre_sp_addr = i == 0 ? B_sp_addr : GARBAGE_ADDR;
+          const uint32_t out_sp_addr = C_sp_addr | ((k == 0 ? 0 : 1) << (ADDR_LEN-2));
+          if (i == 0) { // First iteration
+            gemmini_extended_preload(B_sp_addr, out_sp_addr, DIM, DIM, DIM, DIM);
+            gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
+            gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + J * DIM, DIM, DIM, DIM, DIM);
+            gemmini_extended_compute_accumulated(A_sp_addr + K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
+            gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + 2 * J * DIM, DIM, DIM, DIM, DIM);
+            gemmini_extended_compute_accumulated(A_sp_addr + 2 * K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
+            gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + 3 * J * DIM, DIM, DIM, DIM, DIM);
+            gemmini_extended_compute_accumulated(A_sp_addr + 3 * K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
+          } else { // All other iterations
+            gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr, DIM, DIM, DIM, DIM);
+            gemmini_extended_compute_accumulated(A_sp_addr, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
+            gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + J * DIM, DIM, DIM, DIM, DIM);
+            gemmini_extended_compute_accumulated(A_sp_addr + K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
+            gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + 2 * J * DIM, DIM, DIM, DIM, DIM);
+            gemmini_extended_compute_accumulated(A_sp_addr + 2 * K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
+            gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + 3 * J * DIM, DIM, DIM, DIM, DIM);
+            gemmini_extended_compute_accumulated(A_sp_addr + 3 * K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
+          }
+          if (k == K - 1) {
+            for (int x = 0; x < 3; x++) gemmini_fence();
+            gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + (i * J + j) * DIM, 1, C_sp_addr, DIM, DIM);
+            gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + ((i + 1) * J + j) * DIM, 1, C_sp_addr + J * DIM, DIM, DIM);
+            gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + ((i + 2) * J + j) * DIM, 1, C_sp_addr + 2 * J * DIM, DIM, DIM);
+            gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + ((i + 3) * J + j) * DIM, 1, C_sp_addr + 3 * J * DIM, DIM, DIM);
+          }
+          A_sp_addr += 4 * K * DIM;
+          C_sp_addr += 4 * J * DIM;
+        }
+      }
+    }
+  }
+  gemmini_fence();
+}*/
+
+
+#endif
diff --git a/kernel/include/vx_spawn.h b/kernel/include/vx_spawn.h
index d8797945..84dad2bc 100644
--- a/kernel/include/vx_spawn.h
+++ b/kernel/include/vx_spawn.h
@@ -17,6 +17,10 @@
 #include <stdint.h>
 #include <stdio.h>
 
+#ifndef CORES_PER_CLUSTER
+#define CORES_PER_CLUSTER 2
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -48,6 +52,7 @@ void vx_wspawn_wait();
 void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg);
 
 void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback, void * arg);
+void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void * arg);
 void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void * arg);
 
 void vx_serial(vx_serial_cb callback, void * arg);
diff --git a/kernel/linker/vx_link32.ld b/kernel/linker/vx_link32.ld
index d8a50026..ea5c4e56 100644
--- a/kernel/linker/vx_link32.ld
+++ b/kernel/linker/vx_link32.ld
@@ -7,6 +7,13 @@ OUTPUT_FORMAT("elf32-littleriscv", "elf32-littleriscv",
 	      "elf32-littleriscv")
 OUTPUT_ARCH(riscv)
 ENTRY(_start)
+
+MEMORY {
+  DRAM0    (rwx): ORIGIN = 0x80000000, LENGTH = 512M
+  DRAM1    (rwx): ORIGIN = 0xa0000000, LENGTH = 32K
+  DRAM2    (rwx): ORIGIN = 0xa1000000, LENGTH = 32K
+}
+
 SECTIONS
 {
   . = STARTUP_ADDR;
@@ -85,6 +92,7 @@ SECTIONS
   /* Adjust the address for the data segment.  We want to adjust up to
      the same address within the page on the next page up.  */
   . = DATA_SEGMENT_ALIGN (CONSTANT (MAXPAGESIZE), CONSTANT (COMMONPAGESIZE));
+
   /* Exception handling  */
   .eh_frame       : ONLY_IF_RW { KEEP (*(.eh_frame)) *(.eh_frame.*) }
   .gnu_extab      : ONLY_IF_RW { *(.gnu_extab) }
@@ -166,6 +174,7 @@ SECTIONS
     *(.data .data.* .gnu.linkonce.d.*)
     SORT(CONSTRUCTORS)
   }
+    
   .data1          : { *(.data1) }
   .got            : { *(.got.plt) *(.igot.plt) *(.got) *(.igot) }
   /* We want the small data sections together, so single-instruction offsets
@@ -200,6 +209,7 @@ SECTIONS
   }
   . = ALIGN(32 / 8);
   . = SEGMENT_START("ldata-segment", .);
+    
   . = ALIGN(32 / 8);
   __BSS_END__ = .;
     __global_pointer = MIN(__SDATA_BEGIN__ + 0x800,
@@ -249,4 +259,12 @@ SECTIONS
   .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
   /DISCARD/ : { *(.note.GNU-stack) *(.gnu_debuglink) *(.gnu.lto_*) }
 
+  .operand.a : {
+    *(.operand.a)
+    . = 32K; /* pad to exactly 32K (all of DRAM1); '+=' overflowed once inputs existed */
+  }> DRAM1
+  .operand.b : {
+    *(.operand.b)
+    . = 32K; /* pad to exactly 32K (all of DRAM2); '+=' overflowed once inputs existed */
+  }> DRAM2
 }
diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c
index b1ef7230..1d838c1f 100644
--- a/kernel/src/vx_spawn.c
+++ b/kernel/src/vx_spawn.c
@@ -74,15 +74,6 @@ static void __attribute__ ((noinline)) spawn_tasks_all_stub() {
   }
 }
 
-static void __attribute__ ((noinline)) spawn_tasks_rem_stub() {
-  int cid = vx_core_id();
-  int tid = vx_thread_id();
-  
-  wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid];
-  int task_id = p_wspawn_args->offset + tid;
-  (p_wspawn_args->callback)(task_id, p_wspawn_args->arg);
-}
-
 static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() {
   int NT  = vx_num_threads();
   int NW  = vx_num_warps();
@@ -103,6 +94,60 @@ static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() {
   }
 }
 
+static void __attribute__ ((noinline)) spawn_tasks_cluster_all_stub() {
+  // Calls the spawn callback once per wave with this thread's task id.
+  const int num_threads = vx_num_threads();
+  const int num_warps   = vx_num_warps();
+  const int core        = vx_core_id();
+  const int warp        = vx_warp_id();
+  const int thread      = vx_thread_id();
+
+  wspawn_tasks_args_t* args = (wspawn_tasks_args_t*)g_wspawn_args[core];
+  vx_spawn_tasks_cb task_fn = args->callback;
+  void* task_arg = args->arg;
+
+  // round-robin warp_id allocation across cores in cluster
+  const int cluster_warp = CORES_PER_CLUSTER * warp + (core % CORES_PER_CLUSTER);
+  const int first_task = args->offset + (num_threads * cluster_warp + thread);
+  // task-id distance between two consecutive waves
+  const int wave_stride = num_threads * num_warps * CORES_PER_CLUSTER;
+  // warps with an id below RWs run one wave more than the others
+  const int num_waves = args->NWs + (warp < args->RWs);
+
+  // sequential iterations
+  for (int wave = 0; wave < num_waves; ++wave) {
+    task_fn(first_task + (wave * wave_stride), task_arg);
+  }
+}
+
+static void __attribute__ ((noinline)) spawn_tasks_rem_stub() {
+  int cid = vx_core_id();
+  int tid = vx_thread_id();
+  
+  wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid];
+  int task_id = p_wspawn_args->offset + tid;
+  (p_wspawn_args->callback)(task_id, p_wspawn_args->arg);
+}
+
+static void __attribute__ ((noinline)) spawn_tasks_cluster_rem_stub() {
+  // Calls the spawn callback exactly once with this thread's task id.
+  const int num_threads = vx_num_threads();
+  const int core        = vx_core_id();
+  const int thread      = vx_thread_id();
+  const int warp        = vx_warp_id();
+
+  wspawn_tasks_args_t* args = (wspawn_tasks_args_t*)g_wspawn_args[core];
+
+  // round-robin warp_id allocation across cores in cluster
+  const int cluster_warp = CORES_PER_CLUSTER * warp + (core % CORES_PER_CLUSTER);
+
+  // FIXME: This assumes that all cores but the last one are working with full
+  // warps, and only the last core has a partially-filled warp.
+  const int task_id = args->offset + (num_threads * cluster_warp + thread);
+
+  (args->callback)(task_id, args->arg);
+}
+
 static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_cb() {
   // activate all threads
   vx_tmc(-1);
@@ -111,11 +156,21 @@ static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_cb() {
   spawn_tasks_contiguous_all_stub();
 
   // disable warp
-  // deadlock here on warps 1, 2, 3
   vx_tmc_zero();
 }
 
-static void __attribute__ ((noinline)) spawn_tasks_all_cb() {  
+static void __attribute__ ((noinline)) spawn_tasks_cluster_all_cb() {
+  // warp entry point passed to vx_wspawn() by vx_spawn_tasks_cluster(): activate all threads
+  vx_tmc(-1);
+
+  // run this warp's share of the cluster tasks
+  spawn_tasks_cluster_all_stub();
+
+  // disable warp
+  vx_tmc_zero();
+}
+
+static void __attribute__ ((noinline)) spawn_tasks_all_cb() {
   // activate all threads
   vx_tmc(-1);
 
@@ -126,6 +181,98 @@ static void __attribute__ ((noinline)) spawn_tasks_all_cb() {
   vx_tmc_zero();
 }
 
+// Spawns num_tasks calls of callback(task_id, arg), giving each cluster of
+// CORES_PER_CLUSTER cores an equal slice of the task ids.  Runs in every core
+// with only 1 warp and 1 thread enabled; each core derives its own share.
+void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg) {
+  // device specs
+  const int NC = vx_num_cores();
+  const int NW = vx_num_warps();
+  const int NT = vx_num_threads();
+  // NOTE: assumes divisible
+  const int num_cluster = NC / CORES_PER_CLUSTER;
+
+  // current core id
+  int core_id = vx_core_id();
+  if (core_id >= NUM_CORES_MAX)
+    return;
+  const int cluster_id = core_id / CORES_PER_CLUSTER;
+  const int core_id_in_cluster = core_id % CORES_PER_CLUSTER;
+
+  // Distribute threads equally across as many cores as possible, even if they
+  // don't fill up NW*NT in a single core.  This makes sure the warps get evenly
+  // distributed in a single cluster
+  //
+  // TODO: Try to contain in a single cluster if possible?
+  const int num_active_cores = (num_tasks + (NT - 1)) / NT;
+  if (core_id >= num_active_cores)
+    return; // terminate extra cores
+
+  // FIXME: assumes num_tasks is divisible by num_cluster
+  const int num_tasks_this_cluster = num_tasks / num_cluster;
+  const int num_full_warps = num_tasks_this_cluster / NT;
+  const int rem_threads_in_last_warp = num_tasks_this_cluster % NT;
+  // const int num_warps = (num_tasks_this_cluster + (NT - 1)) / NT;
+
+  int num_warps_this_core = num_full_warps / CORES_PER_CLUSTER;
+  const int num_warps_in_last_row = num_full_warps % CORES_PER_CLUSTER;
+  if (core_id_in_cluster < num_warps_in_last_row) {
+    num_warps_this_core++;
+  }
+  // if 0, last warp is full-threads enabled
+  int rem_threads_in_last_warp_this_core = 0;
+  if (rem_threads_in_last_warp != 0) {
+    if (core_id_in_cluster == num_warps_in_last_row - 1) {
+      rem_threads_in_last_warp_this_core = rem_threads_in_last_warp;
+    }
+  }
+
+  // sequential iterations
+  const int num_full_waves = num_warps_this_core / NW;
+  const int rem_full_warps_in_last_wave = num_warps_this_core % NW;
+
+  const int offset = cluster_id * num_tasks_this_cluster;
+  wspawn_tasks_args_t wspawn_args = {callback, arg, offset, num_full_waves,
+                                     rem_full_warps_in_last_wave};
+  g_wspawn_args[core_id] = &wspawn_args;
+
+  if (num_warps_this_core > 0) {
+    // execute callback on other warps
+    const int nw = MIN(num_warps_this_core, NW);
+    vx_wspawn(nw, spawn_tasks_cluster_all_cb);
+
+    // activate all threads
+    vx_tmc(-1);
+
+    // call stub routine
+    spawn_tasks_cluster_all_stub();
+
+    // back to single-threaded
+    vx_tmc_one();
+
+    // wait for spawn warps to terminate
+    vx_wspawn_wait();
+  }
+
+  // TODO: Instead of launching an additional wave just to work on remaining
+  // threads, handle this in the last wave amongst other full warps.
+  if (rem_threads_in_last_warp != 0 && core_id_in_cluster == 0) {
+    // adjust offset
+    // FIXME: use rem_threads_in_last_warp_this_core
+    wspawn_args.offset += (num_tasks_this_cluster - rem_threads_in_last_warp);
+
+    // activate remaining threads
+    const int tmask = (1 << rem_threads_in_last_warp) - 1;
+    vx_tmc(tmask);
+
+    // call stub routine
+    spawn_tasks_cluster_rem_stub();
+
+    // back to single-threaded
+    vx_tmc_one();
+  }
+}
+
 void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void * arg) {
 	// device specs
   int NC = vx_num_cores();
@@ -179,7 +326,6 @@ void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void
     vx_tmc_one();
     
     // wait for spawn warps to terminate
-    // deadlock here on warp 0!
     vx_wspawn_wait();
 	}  
 
diff --git a/kernel/src/vx_start.S b/kernel/src/vx_start.S
index b5065c95..49e520b6 100644
--- a/kernel/src/vx_start.S
+++ b/kernel/src/vx_start.S
@@ -102,6 +102,8 @@ init_regs:
 #endif
   csrr  t0, VX_CSR_MHARTID
   sll   t1, t0, STACK_LOG2_SIZE
+  sll   t2, t0, 4             # t2 = hart_id * 16
+  add   t1, t1, t2            # skew each hart's stack offset by a further 16 bytes per hart
   sub   sp, sp, t1
 
   # set thread pointer register
diff --git a/tests/.gitignore b/tests/.gitignore
index a9884992..30ca0fa4 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -1 +1,7 @@
 **/*.log
+.depend
+*.bin
+*.dump
+*.elf
+*.o
+*.ll
diff --git a/tests/kernel/gemmini_mmio/Makefile b/tests/kernel/gemmini_mmio/Makefile
new file mode 100644
index 00000000..390b7f81
--- /dev/null
+++ b/tests/kernel/gemmini_mmio/Makefile
@@ -0,0 +1,54 @@
+XLEN ?= 32
+
+ifeq ($(XLEN),64)
+RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
+CFLAGS += -march=rv64imafd -mabi=lp64d
+else
+RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
+CFLAGS += -march=rv32imaf -mabi=ilp32f
+endif
+
+RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
+
+VORTEX_KN_PATH ?= $(realpath ../../../kernel)
+# Gemmini software tree; defaults to the third_party/gemmini-rocc-tests submodule (override on the command line).
+GEMMINI_SW_PATH ?= $(realpath ../../../third_party/gemmini-rocc-tests)
+
+CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
+AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
+DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
+CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
+
+SIM_DIR = ../../../sim
+
+CFLAGS += -O3 -funroll-loops -v -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
+CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw -I$(GEMMINI_SW_PATH)
+
+LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a
+
+PROJECT = gemmini_mmio
+
+SRCS = main.cpp
+
+all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump
+
+$(PROJECT).dump: $(PROJECT).elf
+	$(DP) -D $(PROJECT).elf > $(PROJECT).dump
+
+$(PROJECT).bin: $(PROJECT).elf
+	$(CP) -O binary $(PROJECT).elf $(PROJECT).bin
+
+$(PROJECT).elf: $(SRCS)
+	$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf
+
+run-rtlsim: $(PROJECT).bin
+	$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin
+
+run-simx: $(PROJECT).bin
+	$(SIM_DIR)/simx/simx $(PROJECT).bin
+
+.depend: $(SRCS)
+	$(CC) $(CFLAGS) -MM $^ > .depend;
+
+clean:
+	rm -rf *.elf *.bin *.dump .depend 
diff --git a/tests/kernel/gemmini_mmio/gemmini_mmio.h b/tests/kernel/gemmini_mmio/gemmini_mmio.h
new file mode 100644
index 00000000..e2876927
--- /dev/null
+++ b/tests/kernel/gemmini_mmio/gemmini_mmio.h
@@ -0,0 +1,162 @@
+#ifndef GEMMINI_MMIO_H
+#define GEMMINI_MMIO_H
+#ifndef GEMMINI_PARAMS_H
+    #error INCLUDE GEMMINI.H FIRST
+#endif
+
+#define SMEM_BASE 0xff000000
+#define SMEM_SIZE 0x4000
+#define SMEM_MASK (SMEM_SIZE - 1)
+#define SMEM_ADDR_END 0xff008000
+
+#define SPAD_BASE 0x0
+#define SPAD_ROW_SIZE (DIM * sizeof(elem_t))
+#define SPAD_NUM_ROWS (SMEM_SIZE / SPAD_ROW_SIZE)
+#define SPAD_MASK (SPAD_NUM_ROWS - 1)
+
+#define PRINT_BUF ((char *) (SMEM_ADDR_END))
+#define GEMMINI_RS1_ADDR 0xff007010
+#define GEMMINI_RS2_ADDR 0xff007018
+#define GEMMINI_INST_ADDR 0xff007000
+#define GEMMINI_BUSY_ADDR 0xff007020
+
+#define SMEM_TO_SPAD(smem_addr) (SPAD_BASE + ((smem_addr) & SMEM_MASK) / SPAD_ROW_SIZE)
+#define SPAD_TO_SMEM(spad_addr) (SMEM_BASE + ((spad_addr) & SPAD_MASK) * SPAD_ROW_SIZE)
+
+// Convert a row-major element index (i, j) of a matrix with J columns into its
+// element offset in the tiled layout, where each DIM x DIM tile is contiguous:
+//   tiles_before = (i / DIM) * (J / DIM) + (j / DIM)
+//   offset       = tiles_before * DIM * DIM + (i % DIM) * DIM + (j % DIM)
+// (J) / DIM is parenthesized so the macro follows this formula for every J.
+#define SMEM_MAT_OFFSET(i, j, J) \
+    (((i) / DIM * ((J) / DIM) + (j) / DIM) * DIM * DIM + ((i) % DIM) * DIM + ((j) % DIM))
+
+// #define fence() { for (int i = 0; i < 10; i++) *((volatile uint32_t *) (0xFFFF0000)) = 0xdeadbeef; }
+#undef gemmini_fence
+#define gemmini_fence() { while (*((volatile uint32_t *) GEMMINI_BUSY_ADDR)) asm volatile ("nop"); }
+
+#undef ROCC_INSTRUCTION_RS1_RS2
+#define ROCC_INSTRUCTION_RS1_RS2(x, rs1, rs2, funct) { \
+    /* printf("function %d\n", funct); */ \
+    uint32_t instruction = (0x7B) | (0 << 7) | (3 << 12) | (1 << 15) | (2 << 20) | ((uint32_t) (funct) << 25); \
+    *((volatile uint64_t *) GEMMINI_RS1_ADDR) = (volatile uint64_t) (rs1); \
+    *((volatile uint64_t *) GEMMINI_RS2_ADDR) = (volatile uint64_t) (rs2); \
+    /* *((volatile uint32_t*) GEMMINI_RS2_ADDR) = (uint32_t) ((uint64_t) (rs2) & 0xFFFFFFFFULL); */ \
+    /* *((volatile uint32_t*) (GEMMINI_RS2_ADDR + 4)) = (uint32_t) ((uint64_t) (rs2) >> 32); */ \
+    /* gemmini_fence(); */ \
+    *((volatile uint32_t*) GEMMINI_INST_ADDR) = instruction; \
+    /* sprintf((char *) PRINT_BUF, "%llx %llx %d\n", rs1, rs2, funct); */ \
+}
+
+static void sp_tiled_matmul_full_spad_ws(const uint32_t A_sp_addr_start, const uint32_t B_sp_addr_start,
+        const uint32_t D_sp_addr_start, const uint32_t C_dst_sp_addr_start,
+        size_t I, size_t J, size_t K, size_t pad_I, size_t pad_J, size_t pad_K,
+        bool a_transpose, bool b_transpose,
+        bool full_C, bool low_D,
+        bool no_bias, bool repeating_bias,
+        int act) {
+
+  gemmini_loop_ws_spad(I, J, K, pad_I, pad_J, pad_K,
+    A_sp_addr_start, B_sp_addr_start + K * J * DIM, NULL, C_dst_sp_addr_start,
+    a_transpose, b_transpose,
+    full_C, low_D, false,
+    act, 0, 0, false);
+  /*
+  return;
+
+
+  // const uint32_t A_sp_addr_start = 0;
+  // const uint32_t B_sp_addr_start = BANK_NUM * BANK_ROWS - K * J * DIM;
+  // const uint32_t D_sp_addr_start = 1 << (ADDR_LEN-1);
+  const uint32_t C_sp_addr_start = 2 << (ADDR_LEN-2) | (full_C << (ADDR_LEN-3));
+  // const int D_blocks = low_D ? (J <= MAX_BLOCK_LEN ? J : MAX_BLOCK_LEN) :
+  //   (J <= MAX_BLOCK_LEN_ACC ? J : MAX_BLOCK_LEN_ACC);
+  const int C_blocks = 1; //full_C ? 1 : (J <= MAX_BLOCK_LEN ? J : MAX_BLOCK_LEN);
+  // const size_t sizeof_D = low_D ? sizeof(elem_t) : sizeof(acc_t);
+  const size_t sizeof_C = full_C ? sizeof(acc_t) : sizeof(elem_t);
+  gemmini_fence();
+
+  if (a_transpose || b_transpose || (I < 4)) {
+    for (size_t k = 0; k < K; k++) {
+      for (size_t j = 0; j < J; j++) {
+        for (size_t i = 0; i < I; i++) {
+          const uint32_t A_sp_addr = a_transpose ? (A_sp_addr_start + (k*I + i)*DIM) :
+            (A_sp_addr_start + (i*K + k)*DIM);
+          const uint32_t B_sp_addr = b_transpose ? (B_sp_addr_start + (j*K + k)*DIM) :
+            (B_sp_addr_start + (k*J + j)*DIM);
+          const uint32_t C_sp_addr = C_sp_addr_start + (i*J + j)*DIM;
+          // Compute
+          uint32_t pre_sp_addr = i == 0 ? B_sp_addr : GARBAGE_ADDR;
+          uint32_t out_sp_addr = C_sp_addr | ((k == 0 ? 0 : 1) << (ADDR_LEN-2));
+          gemmini_extended_preload(pre_sp_addr, out_sp_addr, DIM, DIM, DIM, DIM);
+          if (i == 0) { // First iteration
+            gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
+          } else { // All other iterations
+            gemmini_extended_compute_accumulated(A_sp_addr, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
+          }
+          if (k == K - 1) {
+            // Move-out C (if not normalizing)
+            // if (((act != LAYERNORM) && (act != SOFTMAX)) && (j == J-1 || j % C_blocks == C_blocks-1)) {
+              const size_t rounded_j = j; // (j / C_blocks) * C_blocks;
+              const uint32_t rounded_C_sp_addr = C_sp_addr; // C_sp_addr_start + (i*J + rounded_j)*DIM;
+  
+              const uint32_t C_dst_sp_addr = ((uint32_t) C_dst_sp_addr_start) + (i * J + rounded_j) * DIM; // * DIM * sizeof_C;
+  
+              // const size_t blocks = rounded_j + C_blocks <= J ? C_blocks : J-rounded_j;
+              constexpr size_t cols = DIM; // blocks * DIM - (rounded_j + blocks >= J ? pad_J : 0);
+              constexpr size_t rows = DIM; // DIM - (i == I - 1 ? pad_I : 0);
+ 
+              gemmini_extended_mvout_spad(C_dst_sp_addr, 1, rounded_C_sp_addr, cols, rows);
+            // }
+          }
+        }
+      }
+    }
+  } else {
+    for (size_t k = 0; k < K; k++) {
+      for (size_t j = 0; j < J; j++) {
+        uint32_t A_sp_addr = A_sp_addr_start + k * DIM; // (i*K + k)*DIM;
+        const uint32_t B_sp_addr = B_sp_addr_start + (k*J + j)*DIM;
+        uint32_t C_sp_addr = C_sp_addr_start + j * DIM; // (i*J + j)*DIM;
+        for (size_t i = 0; i < I; i += 4) {
+          // Compute
+          // constexpr uint32_t pre_sp_addr = i == 0 ? B_sp_addr : GARBAGE_ADDR;
+          const uint32_t out_sp_addr = C_sp_addr | ((k == 0 ? 0 : 1) << (ADDR_LEN-2));
+          if (i == 0) { // First iteration
+            gemmini_extended_preload(B_sp_addr, out_sp_addr, DIM, DIM, DIM, DIM);
+            gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
+            gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + J * DIM, DIM, DIM, DIM, DIM);
+            gemmini_extended_compute_accumulated(A_sp_addr + K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
+            gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + 2 * J * DIM, DIM, DIM, DIM, DIM);
+            gemmini_extended_compute_accumulated(A_sp_addr + 2 * K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
+            gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + 3 * J * DIM, DIM, DIM, DIM, DIM);
+            gemmini_extended_compute_accumulated(A_sp_addr + 3 * K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
+          } else { // All other iterations
+            gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr, DIM, DIM, DIM, DIM);
+            gemmini_extended_compute_accumulated(A_sp_addr, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
+            gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + J * DIM, DIM, DIM, DIM, DIM);
+            gemmini_extended_compute_accumulated(A_sp_addr + K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
+            gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + 2 * J * DIM, DIM, DIM, DIM, DIM);
+            gemmini_extended_compute_accumulated(A_sp_addr + 2 * K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
+            gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + 3 * J * DIM, DIM, DIM, DIM, DIM);
+            gemmini_extended_compute_accumulated(A_sp_addr + 3 * K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
+          }
+          if (k == K - 1) {
+            for (int x = 0; x < 3; x++) gemmini_fence();
+            gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + (i * J + j) * DIM, 1, C_sp_addr, DIM, DIM);
+            gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + ((i + 1) * J + j) * DIM, 1, C_sp_addr + J * DIM, DIM, DIM);
+            gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + ((i + 2) * J + j) * DIM, 1, C_sp_addr + 2 * J * DIM, DIM, DIM);
+            gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + ((i + 3) * J + j) * DIM, 1, C_sp_addr + 3 * J * DIM, DIM, DIM);
+          }
+          A_sp_addr += 4 * K * DIM;
+          C_sp_addr += 4 * J * DIM;
+        }
+      }
+    }
+  }
+  gemmini_fence();
+  */
+}
+
+
+#endif
diff --git a/tests/kernel/gemmini_mmio/main.cpp b/tests/kernel/gemmini_mmio/main.cpp
new file mode 100644
index 00000000..d5be2558
--- /dev/null
+++ b/tests/kernel/gemmini_mmio/main.cpp
@@ -0,0 +1,144 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <vx_intrinsics.h>
+#include <vx_print.h>
+#include <vx_spawn.h>
+#include "include/gemmini.h"
+#include "gemmini_mmio.h"
+
+#define rd_cycles(x) asm volatile ("csrr %0, mcycle" : "=r" (x)) // stores the current value of the mcycle CSR into x
+
+int main() {
+  // Gemmini MMIO self-test: fill A and B in shared memory, run one weight-stationary tiled matmul, check C.
+  int cid;
+  asm volatile ("csrr %0, 0xcc2" : "=r" (cid)); // NOTE(review): CSR 0xcc2 is presumably the core id (see vx_core_id below) - confirm
+  if (cid > 0) vx_tmc(0); // NOTE(review): presumably parks every core except core 0 - confirm vx_tmc(0) semantics
+
+  vx_tmc(0xff); // NOTE(review): looks like an 8-thread mask - confirm against the configured thread count
+
+  // load up A and B and C
+  const uint32_t spad_A = 0x00000000;
+  const uint32_t spad_B = 0x00000080; // 16B word addressed
+  const uint32_t acc_C = 0x80000000; // accmem + accumulate (not referenced again in this function)
+  const uint32_t spad_C = 0x00000100;
+
+  volatile float *smem_A = (float *) SPAD_TO_SMEM(spad_A); // 0xff000000; // byte addressed
+  float *smem_B = (float *) SPAD_TO_SMEM(spad_B); // 0xff000200;
+  float *smem_C = (float *) SPAD_TO_SMEM(spad_C); // 0xff000400;
+
+  int I = 32 / DIM; // matrix sizes are expressed in DIM x DIM tiles: 32 rows/cols per operand
+  int J = 32 / DIM;
+  int K = 32 / DIM;
+
+  char *print_buf = (char *) PRINT_BUF; // NOTE(review): every sprintf below targets this fixed address - presumably a memory-mapped console; confirm
+
+  // int cid = vx_core_id();
+  int nc = vx_num_cores();
+  int nt = vx_num_threads();
+  int tid = vx_thread_id();
+
+  vx_tmc_one(); // NOTE(review): presumably leaves a single thread active for the config calls and banner - confirm
+  gemmini_config_ld(0);
+  gemmini_extended_config_ex(WEIGHT_STATIONARY, 0, 0, 1, 0, 0);
+  gemmini_config_st(0);
+  /* sprintf(print_buf, "A spad: 0x%x-0x%x, smem: 0x%x-%x\n", spad_A, spad_A + I * K * DIM, (uint32_t) smem_A, (uint32_t) smem_A + sizeof(float) * I * K * DIM * DIM);
+  sprintf(print_buf, "B spad: 0x%x-0x%x, smem: 0x%x-%x\n", spad_B, spad_B + K * J * DIM, (uint32_t) smem_B, (uint32_t) smem_B + sizeof(float) * K * J * DIM * DIM);
+  sprintf(print_buf, "C spad: 0x%x-0x%x, smem: 0x%x-%x\n", spad_C, spad_C + I * J * DIM, (uint32_t) smem_C, (uint32_t) smem_C + sizeof(float) * I * J * DIM * DIM); */
+
+  sprintf(print_buf, "DIM %d\n", DIM);
+  sprintf(print_buf, "num cores %d\n", nc);
+  sprintf(print_buf, "num threads %d\n", nt);
+  sprintf(print_buf, "thread ids ");
+  vx_tmc(-1); // NOTE(review): presumably re-enables all threads so each one prints its tid and joins the fill loops - confirm
+  sprintf(print_buf, "%d", tid);
+
+  uint32_t start_cycles, end_cycles;
+  // Time the shared-memory fill of A and B: mcycle is sampled before and after both loops.
+  rd_cycles(start_cycles);
+  // fill A: element (r, c) gets (I*K*DIM*DIM - (r*K*DIM + c)) % 64, stored tile by tile
+  for (int t = 0; t < DIM * DIM / nt; t++) { // in-tile element n = tid + t*nt: the work is strided across the nt threads
+    int n = tid + t * nt;
+    int x = n / DIM;
+    int y = n % DIM;
+    for (int k = 0; k < K; k++) {
+      for (int i = 0; i < I; i++) {
+        int tile_byte_offset = (i * K + k) * DIM * DIM; // despite the name, used as a float-element index into smem_A
+        smem_A[tile_byte_offset + n] = (float) ((I * K * DIM * DIM - ((i * DIM + x) * DIM * K + (k * DIM + y))) % 64);
+        // smem_A[tile_byte_offset + x * DIM + y] = (float) ((I * K * DIM * DIM - ((i * DIM + x) * DIM * K + (k * DIM + y))) % 64);
+      }
+    }
+  }
+
+  // fill B: element (r, c) gets (r*J*DIM + c) % 64, stored tile by tile
+  for (int t = 0; t < DIM * DIM / nt; t++) {
+    int n = tid + t * nt;
+    int x = n / DIM;
+    int y = n % DIM;
+    for (int k = 0; k < K; k++) {
+      for (int j = 0; j < J; j++) {
+        int tile_byte_offset = (k * J + j) * DIM * DIM; // despite the name, used as a float-element index into smem_B
+        smem_B[tile_byte_offset + n] = (float) (((k * DIM + x) * DIM * J + (j * DIM + y)) % 64);
+      }
+      // smem_B[tile_byte_offset + x * DIM + y] = (float) (((k * DIM + x) * DIM * J + (j * DIM + y)) % 64);
+    }
+  }
+  rd_cycles(end_cycles);
+
+  // for (int i = 0; i < I * J * DIM * DIM; i++) smem_C[i] = 1.f;
+  vx_tmc_one();
+  sprintf(print_buf, "\ndata loading took %d cycles for %d floats\n", end_cycles - start_cycles, DIM * DIM * (I * K + J * K));
+
+  gemmini_fence();
+
+  // sprintf(print_buf, "\nA in\n");
+  // for (int i = 0; i < I * DIM; i++) {
+  //   for (int j = 0; j < K * DIM; j++) {
+  //     sprintf(print_buf, "%d ", (int) (smem_A[SMEM_MAT_OFFSET(i, j, K * DIM)]));
+  //   }
+  //   sprintf(print_buf, "\n");
+  // }
+  // sprintf(print_buf, "\nB in\n");
+  // for (int i = 0; i < K * DIM; i++) {
+  //   for (int j = 0; j < J * DIM; j++) {
+  //     sprintf(print_buf, "%d ", (int) (smem_B[SMEM_MAT_OFFSET(i, j, J * DIM)]));
+  //   }
+  //   sprintf(print_buf, "\n");
+  //   if (i == 2) i = K * DIM - 3;
+  // }
+  // Time the matmul: fence_cycles is sampled after issue, end_cycles after gemmini_fence() returns.
+  uint32_t fence_cycles;
+  rd_cycles(start_cycles);
+  sp_tiled_matmul_full_spad_ws(spad_A, spad_B, /*spad_D=*/0, spad_C,
+      /*I=*/I, /*J=*/J, /*K=*/K, /*pad_I=*/0, /*pad_J=*/0, /*pad_K=*/0,
+      /*a_transpose=*/0, /*b_transpose=*/0, /*full_C=*/0, /*low_D=*/0,
+      /*no_bias=*/1, /*repeating_bias=*/0, /*act=*/NO_ACTIVATION);
+
+  rd_cycles(fence_cycles);
+  gemmini_fence();
+  rd_cycles(end_cycles);
+  sprintf(print_buf, "gemmini cycles taken: %d, fence cycles: %d\n", end_cycles - start_cycles, end_cycles - fence_cycles);
+
+  // check results
+  for (int i = 0; i < I * DIM; i++) {
+    for (int j = 0; j < J * DIM; j++) {
+      int sum = 0; // integer reference for C[i][j], rebuilt from the same formulas used to fill A and B
+      for (int k = 0; k < K * DIM; k++) sum += ((I * K * DIM * DIM - i * K * DIM - k) % 64) * ((k * J * DIM + j) % 64);
+      if ((int) (smem_C[SMEM_MAT_OFFSET(i, j, J * DIM)] * 10) != (int) (sum * 10)) { // both sides scaled by 10 and truncated to int
+        sprintf(print_buf, "TEST FAILED (actual/reference)\n");
+        for (int ii = 0; ii < I * DIM; ii++) { // on the first mismatch, dump the whole matrix as actual/reference pairs
+          for (int jj = 0; jj < J * DIM; jj++) {
+            sum = 0;
+            for (int k = 0; k < K * DIM; k++) sum += ((I * K * DIM * DIM - ii * K * DIM - k) % 64) * ((k * J * DIM + jj) % 64);
+            sprintf(print_buf, "%d/%d ", (int) (smem_C[SMEM_MAT_OFFSET(ii, jj, J * DIM)]), (int) sum);
+          }
+          sprintf(print_buf, "\n");
+        }
+        return 1; // NOTE: unlike the success path below, this exit does not call vx_tmc(0)
+      }
+    }
+  }
+  sprintf(print_buf, "TEST PASSED\n");
+
+  vx_tmc(0);
+  return 0;
+}
diff --git a/tests/opencl/convolution/main.cc b/tests/opencl/convolution/main.cc
index d7487c2f..dded468f 100644
--- a/tests/opencl/convolution/main.cc
+++ b/tests/opencl/convolution/main.cc
@@ -56,6 +56,27 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
   return 0;
 }
 
+// Writes `size` bytes starting at `data` to `filename` (binary mode, truncating any old file).
+// Returns 0 on success and -1 on invalid arguments or any open/write/close failure.
+static int write_operand_file(const char* filename, void* data, size_t size) {
+  if (nullptr == filename || nullptr == data || 0 == size)
+    return -1;
+  FILE* fp = fopen(filename, "wb");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to write operand data.\n");
+    return -1;
+  }
+  // Close before inspecting the result so the handle is released on the failure path too;
+  // fclose flushes buffered bytes, so a failing close also means the data did not land.
+  const size_t wsize = fwrite(data, size, 1, fp);
+  const int close_rc = fclose(fp);
+  if (wsize != 1 || close_rc != 0) {
+    fprintf(stderr, "Failed to write operand data.\n");
+    return -1;
+  }
+  return 0;
+}
+
 static bool compare_equal(float a, float b) {
   union fi_t { float f; int32_t i; };
   fi_t fa, fb;
@@ -216,6 +237,12 @@ int main (int argc, char **argv) {
     h_w[i] = static_cast<float>(rand()) / RAND_MAX;
   }
 
+  // NOTE(hansung): Dump operand buffer to a file
+  if (write_operand_file("convolution.input.input.bin", h_i.data(), i_nbytes) != 0)
+    return EXIT_FAILURE;
+  if (write_operand_file("convolution.input.weights.bin", h_w.data(), w_nbytes) != 0)
+    return EXIT_FAILURE;
+
   // Creating command queue
   commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));  
 
diff --git a/tests/opencl/flops/.depend b/tests/opencl/flops/.depend
new file mode 100644
index 00000000..6f7bdaac
--- /dev/null
+++ b/tests/opencl/flops/.depend
@@ -0,0 +1,8 @@
+main.o: main.cc \
+ /scratch/hansung/build/pocl-vortex2/runtime/include/CL/opencl.h \
+ /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl.h \
+ /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_version.h \
+ /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_platform.h \
+ /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_gl.h \
+ /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_ext.h \
+ /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_ext_pocl.h
diff --git a/tests/opencl/flops/.gitignore b/tests/opencl/flops/.gitignore
new file mode 100644
index 00000000..3ca9b5b2
--- /dev/null
+++ b/tests/opencl/flops/.gitignore
@@ -0,0 +1,6 @@
+flops
+*.o
+*.bin*
+*.pocl
+*.dump
+*.vcd
diff --git a/tests/opencl/flops/Makefile b/tests/opencl/flops/Makefile
new file mode 100644
index 00000000..a3301c6f
--- /dev/null
+++ b/tests/opencl/flops/Makefile
@@ -0,0 +1,7 @@
+PROJECT = flops
+
+SRCS = main.cc
+
+OPTS ?= -n64
+
+include ../common.mk
diff --git a/tests/opencl/flops/README b/tests/opencl/flops/README
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/opencl/flops/kernel.cl b/tests/opencl/flops/kernel.cl
new file mode 100644
index 00000000..181e1171
--- /dev/null
+++ b/tests/opencl/flops/kernel.cl
@@ -0,0 +1,13 @@
+__kernel void flops (__global volatile const float *src,  // only src[0] is read
+						 __global volatile float *dst,            // one result per work-item, indexed by global id
+						 __local volatile float *smem)            // declared but never accessed in this kernel
+{
+  int gid = get_global_id(0);
+  float f = 0.0f;
+  float incr = src[0];  // loaded once; the loop below adds only this register value
+  __attribute__((opencl_unroll_hint(16)))
+  for (int i = 0; i < 5000; i++) {  // 5000 dependent float additions per work-item
+	  f += incr;
+  }
+  dst[gid] = f;  // every work-item stores the same sum: src[0] accumulated 5000 times
+}
diff --git a/tests/opencl/flops/main.cc b/tests/opencl/flops/main.cc
new file mode 100644
index 00000000..ebab1825
--- /dev/null
+++ b/tests/opencl/flops/main.cc
@@ -0,0 +1,237 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <math.h>
+#include <CL/opencl.h>
+#include <unistd.h> 
+#include <string.h>
+#include <chrono>
+
+#define KERNEL_NAME "flops"
+
+#define CL_CHECK(_expr)  /* evaluate _expr as a cl_int status; on failure print it, cleanup(), exit(-1) */ \
+   do {                                                                \
+     cl_int _err = _expr;                                              \
+     if (_err == CL_SUCCESS)                                           \
+       break;                                                          \
+     printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);   \
+	 cleanup();			                                                     \
+     exit(-1);                                                         \
+   } while (0)
+
+#define CL_CHECK2(_expr)  /* for calls that report status through a trailing &_err argument */ \
+   ({  /* GNU statement expression: the whole macro evaluates to _ret */ \
+     cl_int _err = CL_INVALID_VALUE;                                   \
+     decltype(_expr) _ret = _expr;                                     \
+     if (_err != CL_SUCCESS) {                                         \
+       printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
+	   cleanup();			                                                   \
+       exit(-1);                                                       \
+     }                                                                 \
+     _ret;                                                             \
+   })
+
+// Loads the whole of `filename` into a malloc'ed buffer returned through *data / *size (caller frees).
+// Returns 0 on success; -1 on bad arguments, an empty or unreadable file, or allocation failure.
+static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
+  if (nullptr == filename || nullptr == data || nullptr == size)
+    return -1;
+  FILE* fp = fopen(filename, "rb");  // binary mode: the kernel image is not text
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to load kernel.");
+    return -1;
+  }
+  const long fsize = (fseek(fp, 0, SEEK_END) == 0) ? ftell(fp) : -1L;  // ftell itself yields -1 on error
+  rewind(fp);
+  uint8_t* buf = (fsize > 0) ? (uint8_t*)malloc(fsize) : nullptr;
+  const size_t nread = buf ? fread(buf, 1, fsize, fp) : 0;
+  fclose(fp);
+  if (nullptr == buf || nread != (size_t)fsize) { free(buf); return -1; }  // a short read is a failure, not a smaller kernel
+  *data = buf;  // outputs are written only on success
+  *size = nread;
+  return 0;
+}
+
+// Writes `size` bytes starting at `data` to `filename` (binary mode, truncating any old file).
+// Returns 0 on success and -1 on invalid arguments or any open/write/close failure.
+static int write_operand_file(const char* filename, void* data, size_t size) {
+  if (nullptr == filename || nullptr == data || 0 == size)
+    return -1;
+  FILE* fp = fopen(filename, "wb");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to write operand data.\n");
+    return -1;
+  }
+  // Close before inspecting the result so the handle is released on the failure path too;
+  // fclose flushes buffered bytes, so a failing close also means the data did not land.
+  const size_t wsize = fwrite(data, size, 1, fp);
+  const int close_rc = fclose(fp);
+  if (wsize != 1 || close_rc != 0) {
+    fprintf(stderr, "Failed to write operand data.\n");
+    return -1;
+  }
+  return 0;
+}
+
+static bool almost_equal(float a, float b, int ulp = 4) {  // true when the raw bit patterns differ by at most `ulp`
+  int ia, ib;
+  memcpy(&ia, &a, sizeof ia);  // memcpy, not a union: reading the inactive union member is UB in C++
+  memcpy(&ib, &b, sizeof ib);
+  const long long diff = (long long)ia - ib;  // widened so operands of opposite sign cannot overflow
+  return (diff < 0 ? -diff : diff) <= ulp;
+}
+
+cl_device_id device_id = NULL;        // filled by clGetDeviceIDs in main
+cl_context context = NULL;            // created in main from device_id
+cl_command_queue commandQueue = NULL; // created in main after the host buffers are initialised
+cl_program program = NULL;            // built from the kernel binary read into kernel_bin
+cl_kernel kernel = NULL;              // the KERNEL_NAME entry point of program
+cl_mem src_memobj = NULL;             // device input buffer (CL_MEM_READ_ONLY)
+cl_mem dst_memobj = NULL;             // device output buffer (CL_MEM_WRITE_ONLY)
+float *h_src = NULL;                  // host copy of the input, malloc'ed in main
+float *h_dst = NULL;                  // host copy of the output, malloc'ed in main
+uint8_t *kernel_bin = NULL;           // kernel image loaded by read_kernel_file; all of these are released by cleanup()
+
+static void cleanup() {  // release OpenCL handles that were created, then the host allocations
+  if (commandQueue != NULL) { clReleaseCommandQueue(commandQueue); }
+  if (kernel != NULL) { clReleaseKernel(kernel); }
+  if (program != NULL) { clReleaseProgram(program); }
+  if (src_memobj != NULL) { clReleaseMemObject(src_memobj); }
+  if (dst_memobj != NULL) { clReleaseMemObject(dst_memobj); }
+  if (context != NULL) { clReleaseContext(context); }
+  if (device_id != NULL) { clReleaseDevice(device_id); }
+  // free() is a no-op on NULL, so the host buffers need no guards.
+  free(kernel_bin);
+  free(h_src);
+  free(h_dst);
+}
+
+int size = 64;  // number of float elements (and work-items); default 64, overridden by -n in parse_args
+
+static void show_usage() {  // one-line option summary on stdout
+  fputs("Usage: [-n size] [-h: help]\n", stdout);
+}
+
+static void parse_args(int argc, char **argv) {  // fills the global `size`; exits on -h/-? or an unusable -n
+  int c;
+  while ((c = getopt(argc, argv, "n:h?")) != -1) {
+    switch (c) {
+    case 'n':
+      size = atoi(optarg);
+      if (size > 0) break;  // atoi yields 0 for non-numeric text; a non-positive size cannot size the buffers
+      show_usage();
+      exit(-1);
+    case 'h':
+    case '?':
+      show_usage();
+      exit(0);
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+  printf("Workload size=%d\n", size);
+}
+
+int main (int argc, char **argv) {
+  // Host driver for the flops kernel: parse -n, set up OpenCL, run once, verify every dst[i].
+  parse_args(argc, argv);
+
+  cl_platform_id platform_id;
+  size_t kernel_size;
+  cl_int binary_status;
+
+  // read kernel binary from file
+  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+    return -1;
+
+  // Getting platform and device information
+  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
+  CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
+
+  printf("Create context\n");
+  context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL,  &_err));
+
+  printf("Allocate device buffers\n");
+  size_t nbytes = size * sizeof(float);
+  src_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
+  dst_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
+
+  printf("Create program from kernel source\n");
+  cl_int _err;  // CL_CHECK2 declares its own _err; this one only serves the direct call below
+  program = clCreateProgramWithBinary(
+    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err);
+  if (program == NULL) {
+    cleanup();
+    return -1;
+  }
+
+  // Build program
+  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
+
+  // Create kernel
+  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
+
+  // size of the __local argument (the flops kernel declares smem but never touches it)
+  size_t local_size = size;
+
+  // Set kernel arguments
+  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&src_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&dst_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 2, local_size*sizeof(float), NULL));
+
+  // Allocate memories for input arrays and output arrays.
+  h_src = (float*)malloc(nbytes);
+  h_dst = (float*)malloc(nbytes);
+  // Initialize values for array members.
+  for (int i = 0; i < size; ++i) {
+    h_src[i] = sinf(i)*sinf(i);
+    h_dst[i] = 0xdeadbeef;
+    //printf("*** [%d]: h_src=%f, h_dst=%f\n", i, h_src[i], h_dst[i]);
+  }
+  // NOTE(hansung): Dump operand buffer to a file
+  if (write_operand_file("flops.input.src.bin", h_src, nbytes) != 0) {
+    cleanup();  // release everything created so far, as the CL_CHECK error paths do
+    return EXIT_FAILURE;
+  }
+
+  // Creating command queue
+  commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));
+
+  printf("Upload source buffers\n");
+  CL_CHECK(clEnqueueWriteBuffer(commandQueue, src_memobj, CL_TRUE, 0, nbytes, h_src, 0, NULL, NULL));
+
+  printf("Execute the kernel\n");
+  size_t global_work_size[1] = {size};
+  size_t local_work_size[1] = {1};
+  auto time_start = std::chrono::high_resolution_clock::now();
+  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+  CL_CHECK(clFinish(commandQueue));
+  auto time_end = std::chrono::high_resolution_clock::now();
+  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
+  printf("Elapsed time: %lg ms\n", elapsed);
+
+  printf("Download destination buffer\n");
+  CL_CHECK(clEnqueueReadBuffer(commandQueue, dst_memobj, CL_TRUE, 0, nbytes, h_dst, 0, NULL, NULL));
+
+  printf("Verify result\n");
+  // kernel.cl makes every work-item add src[0] to a float 5000 times, so one reference fits all i.
+  float ref = 0.0f;
+  for (int k = 0; k < 5000; ++k) ref += h_src[0];  // same order and precision as the kernel loop
+  int errors = 0;
+  for (int i = 0; i < size; ++i) {
+    if (!almost_equal(h_dst[i], ref)) {
+      if (errors < 100)
+        printf("*** error: [%d] expected=%f, actual=%f, src=%f\n", i, ref, h_dst[i], h_src[i]);
+      ++errors;
+    }
+  }
+  if (0 == errors) {
+    printf("PASSED!\n");
+  } else {
+    printf("FAILED! - %d errors\n", errors);
+  }
+  // Clean up
+  cleanup();
+  return errors;
+}
diff --git a/tests/opencl/sharedmem/.depend b/tests/opencl/sharedmem/.depend
new file mode 100644
index 00000000..6f7bdaac
--- /dev/null
+++ b/tests/opencl/sharedmem/.depend
@@ -0,0 +1,8 @@
+main.o: main.cc \
+ /scratch/hansung/build/pocl-vortex2/runtime/include/CL/opencl.h \
+ /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl.h \
+ /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_version.h \
+ /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_platform.h \
+ /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_gl.h \
+ /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_ext.h \
+ /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_ext_pocl.h
diff --git a/tests/opencl/sharedmem/.gitignore b/tests/opencl/sharedmem/.gitignore
new file mode 100644
index 00000000..ae170236
--- /dev/null
+++ b/tests/opencl/sharedmem/.gitignore
@@ -0,0 +1,5 @@
+sharedmem
+*.bin*
+*.pocl
+*.dump
+*.o
diff --git a/tests/opencl/sharedmem/Makefile b/tests/opencl/sharedmem/Makefile
new file mode 100644
index 00000000..bc0e3197
--- /dev/null
+++ b/tests/opencl/sharedmem/Makefile
@@ -0,0 +1,7 @@
+PROJECT = sharedmem
+
+SRCS = main.cc
+
+OPTS ?= -n64
+
+include ../common.mk
diff --git a/tests/opencl/sharedmem/README b/tests/opencl/sharedmem/README
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/opencl/sharedmem/kernel.cl b/tests/opencl/sharedmem/kernel.cl
new file mode 100644
index 00000000..3540343a
--- /dev/null
+++ b/tests/opencl/sharedmem/kernel.cl
@@ -0,0 +1,13 @@
+__kernel void sharedmem (__global volatile const float *src,  // input, read once per work-item at src[gid]
+						 __global volatile float *dst,                // output, written once per work-item at dst[gid]
+						 __local volatile float *smem)                // local buffer, indexed by the global id
+{
+  int gid = get_global_id(0);
+  smem[gid] = src[gid];  // stage this work-item's element in local memory
+  float read;
+  __attribute__((opencl_unroll_hint))
+  for (int i = 0; i < 5000; i++) {  // 5000 loads of the same slot; smem is volatile-qualified
+	  read = smem[gid];
+  }
+  dst[gid] = read;  // the value last read back, i.e. src[gid]
+}
diff --git a/tests/opencl/sharedmem/main.cc b/tests/opencl/sharedmem/main.cc
new file mode 100644
index 00000000..e53b2db4
--- /dev/null
+++ b/tests/opencl/sharedmem/main.cc
@@ -0,0 +1,237 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <math.h>
+#include <CL/opencl.h>
+#include <unistd.h> 
+#include <string.h>
+#include <chrono>
+
+#define KERNEL_NAME "sharedmem"
+
+#define CL_CHECK(_expr)  /* evaluate _expr as a cl_int status; on failure print it, cleanup(), exit(-1) */ \
+   do {                                                                \
+     cl_int _err = _expr;                                              \
+     if (_err == CL_SUCCESS)                                           \
+       break;                                                          \
+     printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);   \
+	 cleanup();			                                                     \
+     exit(-1);                                                         \
+   } while (0)
+
+#define CL_CHECK2(_expr)  /* for calls that report status through a trailing &_err argument */ \
+   ({  /* GNU statement expression: the whole macro evaluates to _ret */ \
+     cl_int _err = CL_INVALID_VALUE;                                   \
+     decltype(_expr) _ret = _expr;                                     \
+     if (_err != CL_SUCCESS) {                                         \
+       printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
+	   cleanup();			                                                   \
+       exit(-1);                                                       \
+     }                                                                 \
+     _ret;                                                             \
+   })
+
+// Loads the whole of `filename` into a malloc'ed buffer returned through *data / *size (caller frees).
+// Returns 0 on success; -1 on bad arguments, an empty or unreadable file, or allocation failure.
+static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
+  if (nullptr == filename || nullptr == data || nullptr == size)
+    return -1;
+  FILE* fp = fopen(filename, "rb");  // binary mode: the kernel image is not text
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to load kernel.");
+    return -1;
+  }
+  const long fsize = (fseek(fp, 0, SEEK_END) == 0) ? ftell(fp) : -1L;  // ftell itself yields -1 on error
+  rewind(fp);
+  uint8_t* buf = (fsize > 0) ? (uint8_t*)malloc(fsize) : nullptr;
+  const size_t nread = buf ? fread(buf, 1, fsize, fp) : 0;
+  fclose(fp);
+  if (nullptr == buf || nread != (size_t)fsize) { free(buf); return -1; }  // a short read is a failure, not a smaller kernel
+  *data = buf;  // outputs are written only on success
+  *size = nread;
+  return 0;
+}
+
+// Writes `size` bytes starting at `data` to `filename` (binary mode, truncating any old file).
+// Returns 0 on success and -1 on invalid arguments or any open/write/close failure.
+static int write_operand_file(const char* filename, void* data, size_t size) {
+  if (nullptr == filename || nullptr == data || 0 == size)
+    return -1;
+  FILE* fp = fopen(filename, "wb");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to write operand data.\n");
+    return -1;
+  }
+  // Close before inspecting the result so the handle is released on the failure path too;
+  // fclose flushes buffered bytes, so a failing close also means the data did not land.
+  const size_t wsize = fwrite(data, size, 1, fp);
+  const int close_rc = fclose(fp);
+  if (wsize != 1 || close_rc != 0) {
+    fprintf(stderr, "Failed to write operand data.\n");
+    return -1;
+  }
+  return 0;
+}
+
+static bool almost_equal(float a, float b, int ulp = 4) {  // true when the raw bit patterns differ by at most `ulp`
+  int ia, ib;
+  memcpy(&ia, &a, sizeof ia);  // memcpy, not a union: reading the inactive union member is UB in C++
+  memcpy(&ib, &b, sizeof ib);
+  const long long diff = (long long)ia - ib;  // widened so operands of opposite sign cannot overflow
+  return (diff < 0 ? -diff : diff) <= ulp;
+}
+
+cl_device_id device_id = NULL;        // filled by clGetDeviceIDs in main
+cl_context context = NULL;            // created in main from device_id
+cl_command_queue commandQueue = NULL; // created in main after the host buffers are initialised
+cl_program program = NULL;            // built from the kernel binary read into kernel_bin
+cl_kernel kernel = NULL;              // the KERNEL_NAME entry point of program
+cl_mem src_memobj = NULL;             // device input buffer (CL_MEM_READ_ONLY)
+cl_mem dst_memobj = NULL;             // device output buffer (CL_MEM_WRITE_ONLY)
+float *h_src = NULL;                  // host copy of the input, malloc'ed in main
+float *h_dst = NULL;                  // host copy of the output, malloc'ed in main
+uint8_t *kernel_bin = NULL;           // kernel image loaded by read_kernel_file; all of these are released by cleanup()
+
+static void cleanup() {  // release OpenCL handles that were created, then the host allocations
+  if (commandQueue != NULL) { clReleaseCommandQueue(commandQueue); }
+  if (kernel != NULL) { clReleaseKernel(kernel); }
+  if (program != NULL) { clReleaseProgram(program); }
+  if (src_memobj != NULL) { clReleaseMemObject(src_memobj); }
+  if (dst_memobj != NULL) { clReleaseMemObject(dst_memobj); }
+  if (context != NULL) { clReleaseContext(context); }
+  if (device_id != NULL) { clReleaseDevice(device_id); }
+  // free() is a no-op on NULL, so the host buffers need no guards.
+  free(kernel_bin);
+  free(h_src);
+  free(h_dst);
+}
+
+int size = 64;  // number of float elements (and work-items); default 64, overridden by -n in parse_args
+
+static void show_usage() {  // one-line option summary on stdout
+  fputs("Usage: [-n size] [-h: help]\n", stdout);
+}
+
+static void parse_args(int argc, char **argv) {  // fills the global `size`; exits on -h/-? or an unusable -n
+  int c;
+  while ((c = getopt(argc, argv, "n:h?")) != -1) {
+    switch (c) {
+    case 'n':
+      size = atoi(optarg);
+      if (size > 0) break;  // atoi yields 0 for non-numeric text; a non-positive size cannot size the buffers
+      show_usage();
+      exit(-1);
+    case 'h':
+    case '?':
+      show_usage();
+      exit(0);
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+  printf("Workload size=%d\n", size);
+}
+
+int main (int argc, char **argv) {
+  // Host driver for the sharedmem kernel: parse -n, set up OpenCL, run once, expect dst[i] == src[i].
+  parse_args(argc, argv);
+
+  cl_platform_id platform_id;
+  size_t kernel_size;
+  cl_int binary_status;
+
+  // read kernel binary from file
+  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+    return -1;
+
+  // Getting platform and device information
+  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
+  CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
+
+  printf("Create context\n");
+  context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL,  &_err));
+
+  printf("Allocate device buffers\n");
+  size_t nbytes = size * sizeof(float);
+  src_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
+  dst_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
+
+  printf("Create program from kernel source\n");
+  cl_int _err;  // CL_CHECK2 declares its own _err; this one only serves the direct call below
+  program = clCreateProgramWithBinary(
+    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err);
+  if (program == NULL) {
+    cleanup();
+    return -1;
+  }
+
+  // Build program
+  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
+
+  // Create kernel
+  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
+
+  // store entire array to sharedmem: the kernel indexes smem by global id, so it needs `size` floats
+  size_t local_size = size;
+
+  // Set kernel arguments
+  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&src_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&dst_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 2, local_size*sizeof(float), NULL));
+
+  // Allocate memories for input arrays and output arrays.
+  h_src = (float*)malloc(nbytes);
+  h_dst = (float*)malloc(nbytes);
+  // Initialize values for array members.
+  for (int i = 0; i < size; ++i) {
+    h_src[i] = sinf(i)*sinf(i);
+    h_dst[i] = 0xdeadbeef;
+    //printf("*** [%d]: h_src=%f, h_dst=%f\n", i, h_src[i], h_dst[i]);
+  }
+  // NOTE(hansung): Dump operand buffer to a file
+  if (write_operand_file("sharedmem.input.src.bin", h_src, nbytes) != 0) {
+    cleanup();  // release everything created so far, as the CL_CHECK error paths do
+    return EXIT_FAILURE;
+  }
+
+  // Creating command queue
+  commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));
+
+  printf("Upload source buffers\n");
+  CL_CHECK(clEnqueueWriteBuffer(commandQueue, src_memobj, CL_TRUE, 0, nbytes, h_src, 0, NULL, NULL));
+
+  printf("Execute the kernel\n");
+  size_t global_work_size[1] = {size};
+  size_t local_work_size[1] = {1};
+  auto time_start = std::chrono::high_resolution_clock::now();
+  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+  CL_CHECK(clFinish(commandQueue));
+  auto time_end = std::chrono::high_resolution_clock::now();
+  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
+  printf("Elapsed time: %lg ms\n", elapsed);
+
+  printf("Download destination buffer\n");
+  CL_CHECK(clEnqueueReadBuffer(commandQueue, dst_memobj, CL_TRUE, 0, nbytes, h_dst, 0, NULL, NULL));
+
+  printf("Verify result\n");
+  int errors = 0;
+  for (int i = 0; i < size; ++i) {
+    float ref = h_src[i];  // the kernel round-trips src[gid] through local memory unchanged
+    if (!almost_equal(h_dst[i], ref)) {
+      if (errors < 100)
+        printf("*** error: [%d] expected=%f, actual=%f, src=%f\n", i, ref, h_dst[i], h_src[i]);
+      ++errors;
+    }
+  }
+  if (0 == errors) {
+    printf("PASSED!\n");
+  } else {
+    printf("FAILED! - %d errors\n", errors);
+  }
+
+  // Clean up
+  cleanup();
+
+  return errors;
+}
diff --git a/tests/opencl/smemcoherence/.depend b/tests/opencl/smemcoherence/.depend
new file mode 100644
index 00000000..6f7bdaac
--- /dev/null
+++ b/tests/opencl/smemcoherence/.depend
@@ -0,0 +1,8 @@
+main.o: main.cc \
+ /scratch/hansung/build/pocl-vortex2/runtime/include/CL/opencl.h \
+ /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl.h \
+ /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_version.h \
+ /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_platform.h \
+ /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_gl.h \
+ /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_ext.h \
+ /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_ext_pocl.h
diff --git a/tests/opencl/smemcoherence/.gitignore b/tests/opencl/smemcoherence/.gitignore
new file mode 100644
index 00000000..95d1c091
--- /dev/null
+++ b/tests/opencl/smemcoherence/.gitignore
@@ -0,0 +1,5 @@
+smemcoherence
+*.bin*
+*.pocl
+*.dump
+*.o
diff --git a/tests/opencl/smemcoherence/Makefile b/tests/opencl/smemcoherence/Makefile
new file mode 100644
index 00000000..0ee5beae
--- /dev/null
+++ b/tests/opencl/smemcoherence/Makefile
@@ -0,0 +1,7 @@
+PROJECT = smemcoherence
+
+SRCS = main.cc
+
+OPTS ?= -n64
+
+include ../common.mk
diff --git a/tests/opencl/smemcoherence/README b/tests/opencl/smemcoherence/README
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/opencl/smemcoherence/kernel.cl b/tests/opencl/smemcoherence/kernel.cl
new file mode 100644
index 00000000..ace1bbcd
--- /dev/null
+++ b/tests/opencl/smemcoherence/kernel.cl
@@ -0,0 +1,33 @@
+__kernel void smemcoherence (__global volatile const int *src,  // not read by this kernel
+        __global volatile int *dst,   // gets smem[0..n) plus the retry count in dst[n] (written by work-item 0 only)
+        __local volatile int *smem,   // data area; the marker area starts 0x1000 bytes after it
+		int n)                         // number of ints copied from smem to dst
+{
+    __local volatile int *markers = (__local int *)((__local unsigned char *)smem + 0x1000);
+    int gid = get_global_id(0);
+    // NOTE(review): smem/markers are indexed by the GLOBAL id, so this presumes every work-item sees the same __local memory -- confirm for the target.
+    // Publish protocol: clear the marker, store the data, set the marker (assumes total store ordering on smem).
+    markers[gid] = 0;
+    smem[gid] = gid;
+    markers[gid] = 1;
+
+    // Work-item 0 polls until every work-item's marker reads 1, counting the failed passes in `retry`.
+    if (gid == 0) {
+        int gridsize = get_global_size(0);
+		int retry = 0;
+		for (;; retry++) {
+			for (int i = 0; i < gridsize; i++) {
+				if (markers[i] != 1) {
+					goto try_again;  // some marker is not set yet: abandon this pass
+				}
+			}
+			break;  // a full pass saw every marker set
+		try_again:;  // falls into the for-increment, so retry is bumped once per failed pass
+		}
+		// Copy what work-item 0 observes in smem to global memory so the host can compare it with the expected ids.
+		for (int i = 0; i < n; i++) {
+			dst[i] = smem[i];
+		}
+		dst[n] = retry;  // the host allocates n + 1 ints for this extra slot
+    }
+}
diff --git a/tests/opencl/smemcoherence/main.cc b/tests/opencl/smemcoherence/main.cc
new file mode 100644
index 00000000..5bb1bd4f
--- /dev/null
+++ b/tests/opencl/smemcoherence/main.cc
@@ -0,0 +1,238 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <math.h>
+#include <CL/opencl.h>
+#include <unistd.h> 
+#include <string.h>
+#include <chrono>
+
+#define KERNEL_NAME "smemcoherence"
+
+#define CL_CHECK(_expr)                                                \
+   do {                                                                \
+     cl_int _err = _expr;                                              \
+     if (_err == CL_SUCCESS)                                           \
+       break;                                                          \
+     printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);   \
+	 cleanup();			                                                     \
+     exit(-1);                                                         \
+   } while (0)
+
+#define CL_CHECK2(_expr)                                               \
+   ({ /* statement expression: the macro evaluates to _ret */          \
+     cl_int _err = CL_INVALID_VALUE; /* call sites pass &_err inside _expr */ \
+     decltype(_expr) _ret = _expr;                                     \
+     if (_err != CL_SUCCESS) { /* report, release everything, abort */ \
+       printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
+	   cleanup();			                                                   \
+       exit(-1);                                                       \
+     }                                                                 \
+     _ret;                                                             \
+   })
+
+static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
+  // Reads the whole file into a malloc'ed buffer (*data, owned by the caller) and stores its length in *size.
+  if (nullptr == filename || nullptr == data || nullptr == size)
+    return -1;
+
+  FILE* fp = fopen(filename, "rb");  // binary mode: the kernel image must be read byte-for-byte
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to load kernel.");
+    return -1;
+  }
+  fseek(fp , 0 , SEEK_END);
+  long fsize = ftell(fp);
+  rewind(fp);
+  // ftell() yields -1 on failure: never hand that (or an empty file) to malloc()/fread().
+  *data = (fsize > 0) ? (uint8_t*)malloc(fsize) : nullptr;
+  *size = (*data != nullptr) ? fread(*data, 1, fsize, fp) : 0;
+  fclose(fp);
+  if (nullptr == *data || *size != (size_t)fsize) { fprintf(stderr, "Failed to read kernel file.\n"); return -1; }
+  return 0;  // a short read is rejected above, so *size always matches the file length here
+}
+
+static int write_operand_file(const char* filename, void* data, size_t size) {
+  // Dumps `size` bytes starting at `data` into `filename`; returns 0 on success and -1 on any error.
+  if (nullptr == filename || nullptr == data || 0 == size)
+    return -1;
+
+  FILE* fp = fopen(filename, "wb");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to write operand data.\n");
+    return -1;
+  }
+
+  // The buffer is written as one record of `size` bytes, so a short write shows up as wsize != 1.
+  size_t wsize = fwrite(data, size, 1, fp);
+  // Close on every path so a failed write does not leak the stream; fclose() also reports flush errors.
+  if (fclose(fp) != 0 || wsize != 1) {
+    fprintf(stderr, "Failed to write operand data.\n");
+    return -1;
+  }
+  return 0;
+}
+
+static bool almost_equal(float a, float b, int ulp = 4) {
+  // True when the bit patterns of a and b, read back as ints, are at most `ulp` apart.
+  union float_bits { float f; int i; };
+  float_bits lhs, rhs;
+  lhs.f = a, rhs.f = b;
+  return std::abs(lhs.i - rhs.i) <= ulp;
+}
+
+cl_device_id device_id = NULL;
+cl_context context = NULL;
+cl_command_queue commandQueue = NULL;
+cl_program program = NULL;
+cl_kernel kernel = NULL;
+cl_mem src_memobj = NULL;
+cl_mem dst_memobj = NULL;
+int *h_src = NULL;
+int *h_dst = NULL;
+uint8_t *kernel_bin = NULL;
+
+static void cleanup() {
+  // OpenCL handles go first (only those that were actually created), then the host-side buffers.
+  if (commandQueue != NULL) clReleaseCommandQueue(commandQueue);
+  if (kernel != NULL) clReleaseKernel(kernel);
+  if (program != NULL) clReleaseProgram(program);
+  if (src_memobj != NULL) clReleaseMemObject(src_memobj);
+  if (dst_memobj != NULL) clReleaseMemObject(dst_memobj);
+  if (context != NULL) clReleaseContext(context);
+  if (device_id != NULL) clReleaseDevice(device_id);
+  free(kernel_bin);  // free(NULL) is a no-op, so no guards are needed here
+  free(h_src);
+  free(h_dst);
+}
+
+int size = 64;
+
+static void show_usage() {  // prints the one-line option summary to stdout
+  printf("Usage: [-n size] [-h: help]\n");  // the same options parse_args() hands to getopt ("n:h?")
+}
+
+static void parse_args(int argc, char **argv) {
+  // Parses "-n <size>" into the global `size`; -h/-? print the usage and exit(0), other options exit(-1).
+  for (int c; (c = getopt(argc, argv, "n:h?")) != -1;) {
+    switch (c) {
+    case 'n': size = atoi(optarg); break;
+    case 'h':
+    case '?': show_usage(); exit(0);
+    default:  show_usage(); exit(-1);
+    }
+  }
+
+  // kernel.cl stores smem[gid] in the first 0x1000 bytes of the local buffer and markers[gid] in the
+  // next 0x1000 bytes (main() reserves 0x2000 bytes), so only 1..1024 work-items fit. A non-positive
+  // value (including atoi()'s 0 for garbage input) would also break the (size + 1) buffer sizing.
+  if (size < 1 || size > 1024) {
+    printf("Error: size must be within [1, 1024] (got %d)\n", size);
+    exit(-1);
+  }
+  printf("Workload size=%d\n", size);
+}
+
+// Host side of the smemcoherence test: runs the kernel on `size` work-items and checks that dst[i] == i.
+int main (int argc, char **argv) {
+  // parse command arguments
+  parse_args(argc, argv);
+  cl_platform_id platform_id;
+  size_t kernel_size;
+  cl_int binary_status;
+
+  // read kernel binary from file
+  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+    return -1;
+
+  // Getting platform and device information
+  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
+  CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
+
+  printf("Create context\n");
+  context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL,  &_err));
+
+  printf("Allocate device buffers\n");
+  // one extra int at the end receives the kernel's retry counter (dst[n])
+  size_t nbytes = (size + 1) * sizeof(int);
+  src_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
+  dst_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
+
+  printf("Create program from kernel source\n");
+  cl_int _err;
+  program = clCreateProgramWithBinary(context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err);
+  if (program == NULL) {
+    printf("OpenCL Error: 'clCreateProgramWithBinary' returned %d!\n", (int)_err);
+    cleanup();
+    return -1;
+  }
+
+  // Build program
+  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
+
+  // Create kernel
+  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
+
+  size_t local_nbytes = 0x2000;
+
+  // Set kernel arguments
+  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&src_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&dst_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 2, local_nbytes, NULL));
+  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size));
+
+  // Allocate memories for input arrays and output arrays.
+  // calloc: the loop below fills only `size` elements, yet all size + 1 are dumped and uploaded.
+  h_src = (int*)calloc(size + 1, sizeof(int));
+  h_dst = (int*)calloc(size + 1, sizeof(int));
+  if (h_src == NULL || h_dst == NULL) { cleanup(); return EXIT_FAILURE; }
+  // Initialize values for array members.
+  for (int i = 0; i < size; ++i) {
+    h_src[i] = i;
+    h_dst[i] = 0xdeadbeef;
+  }
+
+  // NOTE(hansung): Dump operand buffer to a file
+  if (write_operand_file("smemcoherence.input.src.bin", h_src, nbytes) != 0) {
+    cleanup();  // release the OpenCL objects and host buffers created so far
+    return EXIT_FAILURE;
+  }
+  // Creating command queue
+  commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));
+
+  printf("Upload source buffers\n");
+  CL_CHECK(clEnqueueWriteBuffer(commandQueue, src_memobj, CL_TRUE, 0, nbytes, h_src, 0, NULL, NULL));
+
+  printf("Execute the kernel\n");
+  size_t global_work_size[1] = {static_cast<size_t>(size)};  // explicit cast: int -> size_t narrows in a brace-init
+  size_t local_work_size[1] = {1};
+  auto time_start = std::chrono::high_resolution_clock::now();
+  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+  CL_CHECK(clFinish(commandQueue));
+  auto time_end = std::chrono::high_resolution_clock::now();
+  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
+  printf("Elapsed time: %lg ms\n", elapsed);
+
+  printf("Download destination buffer\n");
+  CL_CHECK(clEnqueueReadBuffer(commandQueue, dst_memobj, CL_TRUE, 0, nbytes, h_dst, 0, NULL, NULL));
+
+  printf("Verify result\n");
+  int errors = 0;
+  for (int i = 0; i < size; ++i) {
+    int ref = i;
+    if (h_dst[i] != ref) {
+      printf("*** error: [%d] expected=%d, actual=%d\n", i, ref, h_dst[i]);
+      ++errors;
+    }
+  }
+  printf("smem check re-trial count: %d\n", h_dst[size]);
+  if (0 == errors) {
+    printf("PASSED!\n");
+  } else {
+    printf("FAILED! - %d errors\n", errors);
+  }
+
+  // Clean up
+  cleanup();
+  return errors;
+}
diff --git a/tests/opencl/vecadd/main.cc b/tests/opencl/vecadd/main.cc
index e443f7c5..97efce39 100644
--- a/tests/opencl/vecadd/main.cc
+++ b/tests/opencl/vecadd/main.cc
@@ -52,6 +52,27 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
   return 0;
 }
 
+static int write_operand_file(const char* filename, void* data, size_t size) {
+  // Dumps `size` bytes starting at `data` into `filename`; returns 0 on success and -1 on any error.
+  if (nullptr == filename || nullptr == data || 0 == size)
+    return -1;
+
+  FILE* fp = fopen(filename, "wb");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to write operand data.\n");
+    return -1;
+  }
+
+  // The buffer is written as one record of `size` bytes, so a short write shows up as wsize != 1.
+  size_t wsize = fwrite(data, size, 1, fp);
+  // Close on every path so a failed write does not leak the stream; fclose() also reports flush errors.
+  if (fclose(fp) != 0 || wsize != 1) {
+    fprintf(stderr, "Failed to write operand data.\n");
+    return -1;
+  }
+  return 0;
+}
+
 static bool almost_equal(float a, float b, int ulp = 4) {
   union fi_t { int i; float f; };
   fi_t fa, fb;
@@ -171,6 +192,12 @@ int main (int argc, char **argv) {
     h_b[i] = cosf(i)*cosf(i);
   }
 
+  // NOTE(hansung): Dump operand buffer to a file
+  if (write_operand_file("vecadd.input.a.size64.bin", h_a, nbytes) != 0)
+    return EXIT_FAILURE;
+  if (write_operand_file("vecadd.input.b.size64.bin", h_b, nbytes) != 0)
+    return EXIT_FAILURE;
+
   // Creating command queue
   commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));  
 
@@ -180,8 +207,9 @@ int main (int argc, char **argv) {
 
   printf("Execute the kernel\n");
   size_t global_work_size[1] = {size};
+  size_t local_work_size[1] = {1};
   auto time_start = std::chrono::high_resolution_clock::now();
-  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL));
+  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL));
   CL_CHECK(clFinish(commandQueue));
   auto time_end = std::chrono::high_resolution_clock::now();
   double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
diff --git a/tests/regression/common.mk b/tests/regression/common.mk
index 6a858edc..24a871eb 100644
--- a/tests/regression/common.mk
+++ b/tests/regression/common.mk
@@ -22,6 +22,7 @@ RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
 
 VORTEX_RT_PATH ?= $(realpath ../../../runtime)
 VORTEX_KN_PATH ?= $(realpath ../../../kernel)
+GEMMINI_SW_PATH ?= $(realpath ../../../third_party/gemmini-rocc-tests)
 
 FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae
 
@@ -49,7 +50,7 @@ VX_CP  = $(LLVM_VORTEX)/bin/llvm-objcopy
 
 VX_CFLAGS += -v -O3 -std=c++17
 VX_CFLAGS += -mcmodel=medany -fno-rtti -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
-VX_CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
+VX_CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw -I$(GEMMINI_SW_PATH)
 VX_CFLAGS += -DNDEBUG -DLLVM_VORTEX
 
 VX_LDFLAGS += -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(VORTEX_KN_PATH)/libvortexrt.a
@@ -78,17 +79,42 @@ endif
 endif
 endif
 
-all: $(PROJECT) kernel.bin kernel.dump
+# CONFIG is supplied from the command line to differentiate ELF files with custom suffixes
+CONFIGEXT = $(if $(CONFIG),.$(CONFIG),)
+
+all: $(PROJECT) kernel.bin kernel.dump kernel.radiance.dump kernel.radiance$(CONFIGEXT).dump
 
 kernel.dump: kernel.elf
 	$(VX_DP) -D kernel.elf > kernel.dump
 
-kernel.bin: kernel.elf
+kernel.radiance.dump: kernel.radiance.elf
+	$(VX_DP) -D kernel.radiance.elf > kernel.radiance.dump
+
+ifneq ($(CONFIG),)
+kernel.radiance$(CONFIGEXT).dump: kernel.radiance$(CONFIGEXT).elf
+	$(VX_DP) -D kernel.radiance$(CONFIGEXT).elf > kernel.radiance$(CONFIGEXT).dump
+endif
+
+kernel.bin: kernel.elf kernel.radiance.elf
 	$(VX_CP) -O binary kernel.elf kernel.bin
 
 kernel.elf: $(VX_SRCS)
 	$(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
 
+OBJCOPY ?= "riscv32-unknown-elf-objcopy"
+OBJCOPY_FLAGS ?= "LOAD,ALLOC,DATA,CONTENTS"
+kernel.radiance.elf: kernel.elf
+	$(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -DRADIANCE -o kernel.radiance.elf
+	$(OBJCOPY) --set-section-flags .operand.a=$(OBJCOPY_FLAGS) kernel.radiance.elf
+	$(OBJCOPY) --set-section-flags .operand.b=$(OBJCOPY_FLAGS) kernel.radiance.elf
+	$(OBJCOPY) --update-section .operand.a=input.a.bin kernel.radiance.elf
+	$(OBJCOPY) --update-section .operand.b=input.b.bin kernel.radiance.elf
+
+ifneq ($(CONFIG),)
+kernel.radiance$(CONFIGEXT).elf: kernel.radiance.elf
+	cp $< $@
+endif
+
 $(PROJECT): $(SRCS)
 	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
 
@@ -115,7 +141,7 @@ clean:
 	rm -rf $(PROJECT) *.o .depend
 
 clean-all: clean
-	rm -rf *.elf *.bin *.dump
+	rm -rf kernel.elf kernel.dump
 
 ifneq ($(MAKECMDGOALS),clean)
     -include .depend
diff --git a/tests/regression/flops/.gitignore b/tests/regression/flops/.gitignore
new file mode 100644
index 00000000..c791df5d
--- /dev/null
+++ b/tests/regression/flops/.gitignore
@@ -0,0 +1,5 @@
+*.bin
+*.dump
+*.elf
+flops
+.depend
diff --git a/tests/regression/flops/Makefile b/tests/regression/flops/Makefile
new file mode 100644
index 00000000..b5d37285
--- /dev/null
+++ b/tests/regression/flops/Makefile
@@ -0,0 +1,9 @@
+PROJECT = flops
+
+SRCS = main.cpp common.h
+
+VX_SRCS = kernel.cpp
+
+OPTS ?= -n16
+
+include ../common.mk
diff --git a/tests/regression/flops/common.h b/tests/regression/flops/common.h
new file mode 100644
index 00000000..a609a0b4
--- /dev/null
+++ b/tests/regression/flops/common.h
@@ -0,0 +1,15 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+#include <cstdint>
+
+#define KERNEL_ARG_DEV_MEM_ADDR 0x7fff0000
+#define DEV_SMEM_START_ADDR 0xff000000
+
+typedef struct {
+  uint32_t size;      // task count: the kernel's main() passes it to the spawn call as grid_size
+  uint32_t addr_src;  // device address of the input; kernel_body casts it to const float*
+  uint32_t addr_dst;  // device address of the output; kernel_body casts it to float*
+} kernel_arg_t;
+
+#endif
diff --git a/tests/regression/flops/flops b/tests/regression/flops/flops
new file mode 100755
index 00000000..dfd6a6c8
Binary files /dev/null and b/tests/regression/flops/flops differ
diff --git a/tests/regression/flops/kernel.cpp b/tests/regression/flops/kernel.cpp
new file mode 100644
index 00000000..773e4b95
--- /dev/null
+++ b/tests/regression/flops/kernel.cpp
@@ -0,0 +1,41 @@
+#include <stdint.h>
+#include <vx_intrinsics.h>
+#include <vx_spawn.h>
+#include "common.h"
+
+void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
+  const float *A = (const float *)arg->addr_src;
+  float *C = (float *)arg->addr_dst;
+  // Per-task body: 5000 iterations of five chained float additions, then the combined sum goes to C[task_id].
+  int incr = A[task_id];  // loaded (float -> int) but never used afterwards
+  float sum = 0.0f;
+  float sum1 = 0.0f;
+  float sum2 = 0.0f;
+  float sum3 = 0.0f;
+  float sum4 = 0.0f;
+  float sum5 = 0.0f;
+#pragma unroll 8
+  for (int i = 0; i < 5000; i++) {
+    sum1 = sum2 + 5.0f;  // each statement reads the accumulator assigned by the next statement
+    sum2 = sum3 + 5.0f;
+    sum3 = sum4 + 5.0f;
+    sum4 = sum5 + 5.0f;
+    sum5 = sum1 + 5.0f;  // reads the sum1 written earlier in this same iteration
+  }
+  // NOTE(review): this value does not depend on task_id, while the host reference in main.cpp is i * 1000.0f -- confirm intent.
+  sum = sum1 + sum2 + sum3 + sum4 + sum5;
+  C[task_id] = static_cast<float>(sum);  // sum is already a float, so the cast is a no-op
+}
+
+int main() {
+  kernel_arg_t *arg = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR;  // the host uploads the argument block to this fixed address
+  const uint32_t grid_size = arg->size;  // task count handed to the spawn call below
+#ifdef RADIANCE  // set via -DRADIANCE when common.mk builds kernel.radiance.elf
+  vx_spawn_tasks_cluster(grid_size, (vx_spawn_tasks_cb)kernel_body, arg);
+#else
+  // NOTE: This kernel assumes contiguous thread scheduling for efficient shared
+  // memory allocation, and therefore does not work with original vx_spawn_tasks
+  vx_spawn_tasks_contiguous(grid_size, (vx_spawn_tasks_cb)kernel_body, arg);
+#endif
+  return 0;
+}
diff --git a/tests/regression/flops/main.cpp b/tests/regression/flops/main.cpp
new file mode 100644
index 00000000..72aa56ba
--- /dev/null
+++ b/tests/regression/flops/main.cpp
@@ -0,0 +1,252 @@
+#include <iostream>
+#include <fstream>
+#include <unistd.h>
+#include <string.h>
+#include <vortex.h>
+#include <vector>
+#include "common.h"
+
+#define RT_CHECK(_expr)                                         \
+   do {                                                         \
+     int _ret = _expr;                                          \
+     if (0 == _ret)                                             \
+       break;                                                   \
+     printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);   \
+	 cleanup();			                                              \
+     exit(-1);                                                  \
+   } while (false)
+
+///////////////////////////////////////////////////////////////////////////////
+
+const char* kernel_file = "kernel.bin";
+uint32_t count = 0;
+
+std::vector<float> src_data;
+std::vector<float> ref_data;
+
+vx_device_h device = nullptr;
+std::vector<uint8_t> staging_buf;
+kernel_arg_t kernel_arg = {};
+
+static void show_usage() {  // prints the banner and the option summary to stdout
+   std::cout << "Vortex Test." << std::endl;
+   std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;  // the options parse_args() gives getopt ("n:k:h?")
+}
+
+static void parse_args(int argc, char **argv) {
+  // Command-line handling:
+  //   -n <words>  stored in the global `count`
+  //   -k <file>   stored in the global `kernel_file`
+  //   -h / -?     print the usage text and exit(0)
+  // Any other value returned by getopt() prints the usage text and exits with -1.
+  for (;;) {
+    const int opt = getopt(argc, argv, "n:k:h?");
+    if (opt == -1)
+      break;
+    if (opt == 'n') {
+      count = atoi(optarg);
+    } else if (opt == 'k') {
+      kernel_file = optarg;
+    } else {
+      const bool help = (opt == 'h' || opt == '?');
+      show_usage();
+      exit(help ? 0 : -1);
+    }
+  }
+}
+
+void cleanup() {  // closes the device connection if one was opened
+  if (device) {
+    // No vx_mem_free() calls are active: main() uses fixed device addresses instead of vx_mem_alloc().
+    // (disabled) vx_mem_free(device, kernel_arg.addr_a); vx_mem_free(device, kernel_arg.addr_b);
+    // (disabled) vx_mem_free(device, kernel_arg.addr_c);  -- this test's kernel_arg_t has addr_src/addr_dst instead
+    vx_dev_close(device);
+  }
+}
+
+void generate_source_data(size_t size) {
+  // Fills the global src_data with `size` floats; element i holds its own index (0.0f, 1.0f, ...).
+  src_data.resize(size);
+  uint32_t index = 0;
+  for (float& value : src_data)
+    value = static_cast<float>(index++);
+}
+
+void generate_reference_data(size_t size) {
+  // Fills the global ref_data with `size` expected values; element i is i * 1000.0f.
+  ref_data.resize(size);
+  uint32_t index = 0;
+  for (float& expected : ref_data)
+    expected = static_cast<float>(index++) * 1000.0f;
+}
+
+int run_test(const kernel_arg_t& kernel_arg, uint32_t buf_size, uint32_t size) {
+  // Runs the uploaded kernel, dumps output.bin/reference.bin and compares `size` floats; returns 0 on match, 1 otherwise.
+  // start device
+  std::cout << "start device" << std::endl;
+  RT_CHECK(vx_start(device));
+
+  // wait for completion
+  std::cout << "wait for completion" << std::endl;
+  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
+
+  // download destination buffer
+  std::cout << "download destination buffer" << std::endl;
+  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_dst, buf_size));
+
+  std::cout << "downloading result C matrix from device, device mem address="
+            << std::hex << kernel_arg.addr_dst << ", size=" << std::dec
+            << buf_size << " bytes\n";
+  std::ofstream file("output.bin", std::ios::binary | std::ios::out);
+  if (!file) {
+    std::cerr << "error: failed to open output.bin for writing\n";
+    cleanup();  // close the device before bailing out, as RT_CHECK does
+    exit(EXIT_FAILURE);
+  }
+  file.write(reinterpret_cast<char *>(staging_buf.data()), buf_size);
+  file.close();
+
+  std::ofstream ref_file("reference.bin", std::ios::binary | std::ios::out);
+  if (!ref_file) {
+    std::cerr << "error: failed to open reference.bin for writing\n";
+    cleanup();
+    exit(EXIT_FAILURE);
+  }
+  ref_file.write(reinterpret_cast<char *>(ref_data.data()), buf_size);
+  ref_file.close();
+
+  // verify result
+  std::cout << "verify result" << std::endl;
+  {
+    int errors = 0;
+    auto buf_ptr = (float*)staging_buf.data();
+    for (uint32_t i = 0; i < size; ++i) {
+      float ref = ref_data.at(i);
+      float cur = buf_ptr[i];
+      if (!(std::abs(cur - ref) <= 1e-6 * std::abs(ref))) {  // negated <= so NaN fails; no division by ref == 0
+        std::cout << "error at result #" << std::dec << i
+                  << std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
+        ++errors;
+      }
+    }
+    if (errors != 0) {
+      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
+      std::cout << "FAILED!" << std::endl;
+      return 1;
+    }
+  }
+  return 0;
+}
+
+int main(int argc, char *argv[]) {  // host driver: uploads kernel, arguments and input, runs the test, dumps the buffers
+  // parse command arguments
+  parse_args(argc, argv);
+
+  if (count == 0) {  // count (-n) is only normalized here; the buffers below use the fixed `size` of 64
+    count = 1;
+  }
+
+  std::srand(50);  // seeds rand(), which nothing in this file calls
+
+  // open device connection
+  std::cout << "open device connection" << std::endl;
+  RT_CHECK(vx_dev_open(&device));
+
+  size_t size = 64;
+
+  generate_source_data(size);
+  generate_reference_data(size);
+
+  uint32_t src_buf_size = src_data.size() * sizeof(src_data[0]);
+  uint32_t dst_buf_size = ref_data.size() * sizeof(ref_data[0]);
+
+  std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
+
+  // upload program
+  std::cout << "upload program" << std::endl;
+  RT_CHECK(vx_upload_kernel_file(device, kernel_file));
+
+  // allocate device memory
+  std::cout << "allocate device memory" << std::endl;
+  // RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_src));
+  // RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_dst));
+  kernel_arg.addr_src = 0x20000UL;     // fixed device address used in place of the vx_mem_alloc call above
+  kernel_arg.addr_dst = 0xc0000000UL;  // fixed device address used in place of the vx_mem_alloc call above
+  kernel_arg.size = size;
+
+  std::cout << "dev_addr_src=0x" << std::hex << kernel_arg.addr_src << std::endl;
+  std::cout << "dev_addr_dst=0x" << std::hex << kernel_arg.addr_dst << std::endl;
+
+  // allocate staging buffer
+  {
+    std::cout << "allocate staging buffer" << std::endl;
+    uint32_t staging_buf_size = std::max<uint32_t>(  // src_buf_size appears twice; the result is max(src, dst, sizeof(kernel_arg_t))
+        src_buf_size,
+        std::max<uint32_t>(
+            src_buf_size,
+            std::max<uint32_t>(dst_buf_size, sizeof(kernel_arg_t))));
+    staging_buf.resize(staging_buf_size);
+  }
+
+  // upload kernel argument
+  {
+    std::cout << "upload kernel argument" << std::endl;
+    auto buf_ptr = staging_buf.data();
+    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
+    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
+
+    std::cout << "uploading argument buffer to device, device mem address="
+              << std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
+              << sizeof(kernel_arg_t) << " bytes\n";
+    std::ofstream file("args.bin", std::ios::binary | std::ios::out);  // host-side copy of the uploaded argument block
+    if (!file) {
+        std::cerr << "error: failed to open args.bin for writing\n";
+        exit(EXIT_FAILURE);
+    }
+    file.write(reinterpret_cast<char *>(staging_buf.data()),
+               sizeof(kernel_arg_t));
+    file.close();
+  }
+
+  // upload source buffer
+  {
+    {
+        auto buf_ptr = staging_buf.data();
+        memcpy(buf_ptr, src_data.data(), src_data.size() * sizeof(float));
+        RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_src, staging_buf.data(),
+                                src_buf_size));
+
+        std::cout << "uploading source data to device, device mem address="
+                  << std::hex << kernel_arg.addr_src << ", size=" << std::dec
+                  << src_buf_size << " bytes\n";
+        std::ofstream file("input.a.bin", std::ios::binary | std::ios::out);  // common.mk copies input.a.bin into the .operand.a section of kernel.radiance.elf
+        if (!file) {
+        std::cerr << "error: failed to open input.a.bin for writing\n";
+        exit(EXIT_FAILURE);
+        }
+        file.write(reinterpret_cast<char *>(buf_ptr), src_buf_size);
+        file.close();
+    }
+  }
+
+  // clear destination buffer
+  {
+    std::cout << "clear destination buffer" << std::endl;
+    auto buf_ptr = (int32_t*)staging_buf.data();
+    for (uint32_t i = 0; i < ref_data.size(); ++i) {
+      buf_ptr[i] = 0xdeadbeef;  // recognizable fill pattern written to the destination before the run
+    }
+    RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_dst, staging_buf.data(), dst_buf_size));
+  }
+
+  // run tests
+  std::cout << "run tests" << std::endl;
+  RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.size));  // a non-zero result makes RT_CHECK clean up and exit(-1)
+  std::cout << "PASSED!" << std::endl;
+
+  // cleanup
+  std::cout << "cleanup" << std::endl;
+  cleanup();
+
+  return 0;
+}
diff --git a/tests/regression/sgemm_gemmini/.gitignore b/tests/regression/sgemm_gemmini/.gitignore
new file mode 100644
index 00000000..7c35ba59
--- /dev/null
+++ b/tests/regression/sgemm_gemmini/.gitignore
@@ -0,0 +1,5 @@
+*.bin
+*.dump
+*.elf
+sgemm_gemmini
+.depend
diff --git a/tests/regression/sgemm_gemmini/Makefile b/tests/regression/sgemm_gemmini/Makefile
new file mode 100644
index 00000000..a36f6d21
--- /dev/null
+++ b/tests/regression/sgemm_gemmini/Makefile
@@ -0,0 +1,9 @@
+PROJECT = sgemm_gemmini
+
+SRCS = main.cpp common.h
+
+VX_SRCS = kernel.cpp
+
+OPTS ?= -n16
+
+include ../common.mk
diff --git a/tests/regression/sgemm_gemmini/common.h b/tests/regression/sgemm_gemmini/common.h
new file mode 100644
index 00000000..74941562
--- /dev/null
+++ b/tests/regression/sgemm_gemmini/common.h
@@ -0,0 +1,18 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+#include <cstdint>
+
+#define KERNEL_ARG_DEV_MEM_ADDR 0x7fff0000
+#define DEV_SMEM_START_ADDR 0xff000000
+
+typedef struct {
+  uint32_t dim_m;   // presumably the GEMM dimensions M, N and K -- TODO confirm against the host code
+  uint32_t dim_n;
+  uint32_t dim_k;
+  uint64_t addr_a;  // device address of A; kernel.cpp casts it to const float*
+  uint64_t addr_b;  // device address of B; kernel.cpp casts it to const float*
+  uint64_t addr_c;  // device address of C; kernel.cpp casts it to float*
+} kernel_arg_t;
+
+#endif
diff --git a/tests/regression/sgemm_gemmini/kernel.cpp b/tests/regression/sgemm_gemmini/kernel.cpp
new file mode 100644
index 00000000..f1893f6d
--- /dev/null
+++ b/tests/regression/sgemm_gemmini/kernel.cpp
@@ -0,0 +1,504 @@
+#include <stdint.h>
+#include <vx_intrinsics.h>
+#include <vx_print.h>
+#include <vx_spawn.h>
+#include "common.h"
+#include "include/gemmini.h"
+#include "gemmini_mmio.h"
+
+#define TILE_M 32
+#define TILE_N 32
+#define TILE_K 32
+#define TILE_MN 1024  // TILE_M * TILE_N, spelled as a literal
+#define TILE_MK 1024  // TILE_M * TILE_K, spelled as a literal
+#define TILE_NK 1024  // TILE_N * TILE_K, spelled as a literal
+
+#define NUM_CLUSTERS 1
+#define NUM_THREADS_IN_CLUSTER 128  // 1024-element tiles / 128 threads = 8 elements per thread
+
+#define SMEM_ADDR_0K  ((float * const) 0xff000000)  // same value as DEV_SMEM_START_ADDR in common.h
+#define SMEM_ADDR_4K  ((float * const) 0xff001000)  // base + 0x1000 bytes
+#define SMEM_ADDR_8K  ((float * const) 0xff002000)  // base + 0x2000 bytes
+#define SMEM_ADDR_12K ((float * const) 0xff003000)  // base + 0x3000 bytes
+#define SPAD_ADDR_0K 0x0    // NOTE(review): presumably the Gemmini scratchpad address matching SMEM_ADDR_0K — confirm
+#define SPAD_ADDR_4K 0x80   // passed to sp_tiled_matmul_full_spad_ws where the CPU side uses SMEM_ADDR_4K
+#define SPAD_ADDR_8K 0x100  // passed to sp_tiled_matmul_full_spad_ws where the CPU side uses SMEM_ADDR_8K
+#define SPAD_ADDR_12K 0x180 // passed to sp_tiled_matmul_full_spad_ws where the CPU side uses SMEM_ADDR_12K
+
+// #define DEBUG_PRINT
+// #define EXT_ACCUMULATE
+#define HARDCODE  // selects the hand-unrolled load/store paths in the kernel
+#define DBUF      // A/B staging buffers alternate on the parity of tile_k
+// #define DETAILED_PERF
+
+#define rd_cycles_force(x) asm volatile ("csrr %0, mcycle" : "=r" (x))  // read the mcycle CSR into x
+#ifdef DETAILED_PERF
+  #define rd_cycles(x) rd_cycles_force(x)
+#else
+  #define rd_cycles(x)  // expands to nothing unless DETAILED_PERF is defined
+#endif
+#define HW_TID() ({uint32_t gtid; asm volatile ("csrr %0, mhartid" : "=r" (gtid)); gtid;})  // value of the mhartid CSR
+#define PRINTF(...) sprintf(PRINT_BUF, __VA_ARGS__)  // formats into PRINT_BUF (not defined in this file)
+// #define PRINTF(...) vx_printf(__VA_ARGS__)
+
+inline void threadblock_barrier(unsigned int barrier_id, unsigned int count) {  // fence, then barrier
+  vx_fence();  // issued before the barrier; presumably orders this thread's earlier memory accesses — TODO confirm
+  vx_barrier(barrier_id, count);  // every caller in this file passes NUM_WARPS as count
+}
+
+void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg,  // per-thread body of the tiled SGEMM
+                                 const uint32_t threadblock_id,
+                                 const uint32_t tid_in_threadblock) {
+  __asm__("matmul_start:");
+  const float * const A = (const float * const) arg->addr_a;
+  const float * const B = (const float * const) arg->addr_b;
+  float * const C = (float * const) arg->addr_c;
+
+  if (HW_TID() == 0) {  // only the thread whose mhartid is 0 configures Gemmini
+    gemmini_config_ld(0);
+    gemmini_extended_config_ex(WEIGHT_STATIONARY, 0, 0, 1, 0, 0);
+    gemmini_config_st(0);
+    PRINTF("start\n");  // with the active PRINTF definition this only writes into PRINT_BUF
+  }
+
+  vx_fence();
+
+  uint32_t marker0, marker1, marker2, marker3, marker4;
+  uint32_t marker5, marker6, marker7, marker8, marker9;
+  rd_cycles_force(marker0);  // start timestamp; subtracted from marker9 for "total cycles"
+
+  const uint32_t dim_m = arg->dim_m;
+  const uint32_t dim_n = arg->dim_n;
+  const uint32_t dim_k = arg->dim_k;
+  const uint32_t num_tiles_m = dim_m / TILE_M;  // truncating division: a partial trailing tile is not processed
+  const uint32_t num_tiles_n = dim_n / TILE_N;
+  const uint32_t num_tiles_k = dim_k / TILE_K;
+  constexpr uint32_t num_threads_in_cluster = NUM_THREADS_IN_CLUSTER;
+  constexpr uint32_t a_elems_per_thread = TILE_MK / num_threads_in_cluster;
+  constexpr uint32_t b_elems_per_thread = TILE_NK / num_threads_in_cluster;
+  constexpr uint32_t c_elems_per_thread = TILE_MN / num_threads_in_cluster;
+  const uint32_t hw_tid = tid_in_threadblock % num_threads_in_cluster;
+
+  // the dram coordinates are (i1 + i0, j1 + j0). i0 and j0 are both spatially mapped only.
+  const uint32_t j0 = HW_TID() % DIM;
+  const uint32_t i0 = (HW_TID() / DIM) % DIM;
+
+  // j1 is both spatially and temporally mapped. j1 increases every iteration.
+  const uint32_t j1_idx = (HW_TID() / DIM / DIM) * DIM; // A: % TILE_K, B: % TILE_N, C: % TILE_N
+  // every iteration, j1 increases by j1_stride
+  constexpr uint32_t j1_stride = (num_threads_in_cluster / DIM / DIM) * DIM; // mod TILE_W after stride
+
+  // i1 is only temporally mapped. i1 increments every one or more iterations
+  constexpr uint32_t i1_stride = DIM; // step per increment (increment doesn't happen every iteration)
+  constexpr uint32_t i1_iters = (DIM * DIM * (TILE_K / DIM)) / num_threads_in_cluster; // num of iters before striding
+
+  const uint32_t num_tile_rows_per_tb = num_tiles_m / NUM_CLUSTERS;
+  for (uint32_t tile_i = num_tile_rows_per_tb * threadblock_id;
+                tile_i < num_tile_rows_per_tb * (threadblock_id + 1);
+                tile_i += 1) {
+    __asm__("i_loop:");
+    for (int tile_j = 0; tile_j < num_tiles_n; tile_j += 1) {
+      __asm__("j_loop:");
+      float * const smem_c_tile_start = SMEM_ADDR_4K;
+      #ifndef EXT_ACCUMULATE
+      float * const smem_acc_tile_start = SMEM_ADDR_0K + HW_TID();
+      #else
+      float * const smem_acc_tile_start = SMEM_ADDR_8K + hw_tid;
+      #endif
+
+      __asm__("k_loop:");
+      for (int tile_k = 0; tile_k < num_tiles_k; tile_k += 1) {
+        // TODO: double buffer
+        rd_cycles(marker1);
+
+        #ifdef HARDCODE
+          #if (TILE_MK / NUM_THREADS / NUM_WARPS / CORES_PER_CLUSTER) != 8
+            #error CANNOT UNROLL
+          #endif
+
+        constexpr uint32_t every_iter = j1_stride;
+        const uint32_t every_2iters_a = i1_stride * dim_k;
+        const uint32_t runtime_const_a = i0 * dim_k + j1_idx + j0;
+        const uint32_t every_2iters_b = i1_stride * dim_n;
+        const uint32_t runtime_const_b = i0 * dim_n + j1_idx + j0;
+
+        const float * const dram_a_tile_start = A + tile_i * TILE_M * dim_k + tile_k * TILE_K + runtime_const_a;
+        const float * const dram_b_tile_start = B + tile_k * TILE_K * dim_n + tile_j * TILE_N + runtime_const_b;
+        #ifdef DBUF
+        float * const smem_a_tile_start = ((tile_k & 1) ? SMEM_ADDR_4K : SMEM_ADDR_0K) + HW_TID();  // odd tile_k: 4K/12K buffers, even: 0K/8K
+        float * const smem_b_tile_start = ((tile_k & 1) ? SMEM_ADDR_12K : SMEM_ADDR_8K) + HW_TID();
+        #else
+        float * const smem_a_tile_start = SMEM_ADDR_0K + HW_TID();
+        float * const smem_b_tile_start = SMEM_ADDR_12K + HW_TID();
+        #endif
+
+        {
+          __asm__("load_ab:");
+          float v0 = dram_a_tile_start[every_iter * 0 + every_2iters_a * 0];
+          float v1 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 0];
+          float v2 = dram_a_tile_start[every_iter * 0 + every_2iters_a * 1];
+          float v3 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 1];
+          smem_a_tile_start[0 * num_threads_in_cluster] = v0;
+          smem_a_tile_start[1 * num_threads_in_cluster] = v1;
+          smem_a_tile_start[2 * num_threads_in_cluster] = v2;
+          smem_a_tile_start[3 * num_threads_in_cluster] = v3;
+
+          __asm__("load_ab1:");
+          v0 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 0];
+          v1 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 0];
+          v2 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 1];
+          v3 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 1];
+          smem_b_tile_start[0 * num_threads_in_cluster] = v0;
+          smem_b_tile_start[1 * num_threads_in_cluster] = v1;
+          smem_b_tile_start[2 * num_threads_in_cluster] = v2;
+          smem_b_tile_start[3 * num_threads_in_cluster] = v3;
+
+          __asm__("load_ab2:");
+          v0 = dram_a_tile_start[every_iter * 0 + every_2iters_a * 2];
+          v1 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 2];
+          v2 = dram_a_tile_start[every_iter * 0 + every_2iters_a * 3];
+          v3 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 3];
+          smem_a_tile_start[4 * num_threads_in_cluster] = v0;
+          smem_a_tile_start[5 * num_threads_in_cluster] = v1;
+          smem_a_tile_start[6 * num_threads_in_cluster] = v2;
+          smem_a_tile_start[7 * num_threads_in_cluster] = v3;
+
+          __asm__("load_ab3:");
+          v0 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 2];
+          v1 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 2];
+          v2 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 3];
+          v3 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 3];
+          smem_b_tile_start[4 * num_threads_in_cluster] = v0;
+          smem_b_tile_start[5 * num_threads_in_cluster] = v1;
+          smem_b_tile_start[6 * num_threads_in_cluster] = v2;
+          smem_b_tile_start[7 * num_threads_in_cluster] = v3;
+
+          __asm__("end_loadab:");
+        }
+        #else
+        /* smem_a_tile_start[0 * num_threads_in_cluster + hw_tid] = \
+          dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 0];
+        smem_a_tile_start[1 * num_threads_in_cluster + hw_tid] = \
+          dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 0];
+        smem_a_tile_start[2 * num_threads_in_cluster + hw_tid] = \
+          dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 1];
+        smem_a_tile_start[3 * num_threads_in_cluster + hw_tid] = \
+          dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 1];
+        smem_a_tile_start[4 * num_threads_in_cluster + hw_tid] = \
+          dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 2];
+        smem_a_tile_start[5 * num_threads_in_cluster + hw_tid] = \
+          dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 2];
+        smem_a_tile_start[6 * num_threads_in_cluster + hw_tid] = \
+          dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 3];
+        smem_a_tile_start[7 * num_threads_in_cluster + hw_tid] = \
+          dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 3];
+
+        smem_b_tile_start[0 * num_threads_in_cluster + hw_tid] = \
+          dram_b_tile_start[runtime_const + every_iter * 0 + every_2iters * 0];
+        smem_b_tile_start[1 * num_threads_in_cluster + hw_tid] = \
+          dram_b_tile_start[runtime_const + every_iter * 1 + every_2iters * 0];
+        smem_b_tile_start[2 * num_threads_in_cluster + hw_tid] = \
+          dram_b_tile_start[runtime_const + every_iter * 0 + every_2iters * 1];
+        smem_b_tile_start[3 * num_threads_in_cluster + hw_tid] = \
+          dram_b_tile_start[runtime_const + every_iter * 1 + every_2iters * 1];
+        smem_b_tile_start[4 * num_threads_in_cluster + hw_tid] = \
+          dram_b_tile_start[runtime_const + every_iter * 0 + every_2iters * 2];
+        smem_b_tile_start[5 * num_threads_in_cluster + hw_tid] = \
+          dram_b_tile_start[runtime_const + every_iter * 1 + every_2iters * 2];
+        smem_b_tile_start[6 * num_threads_in_cluster + hw_tid] = \
+          dram_b_tile_start[runtime_const + every_iter * 0 + every_2iters * 3];
+        smem_b_tile_start[7 * num_threads_in_cluster + hw_tid] = \
+          dram_b_tile_start[runtime_const + every_iter * 1 + every_2iters * 3]; */
+
+        const float * const dram_a_tile_start = A + tile_i * TILE_M * dim_k + tile_k * TILE_K;
+        const float * const dram_b_tile_start = B + tile_k * TILE_K * dim_n + tile_j * TILE_N;
+        float * const smem_a_tile_start = SMEM_ADDR_0K;
+        float * const smem_b_tile_start = SMEM_ADDR_12K;
+
+        #pragma GCC unroll 8 // TODO: macro computed
+        for (uint32_t thread_i = 0, j1 = 0, i1 = 0;
+          thread_i < a_elems_per_thread;
+          thread_i += 1,
+          j1 = (j1 + j1_stride) % TILE_K,
+          i1 = (thread_i % i1_iters == 0) ? i1 + i1_stride : i1) {
+          smem_a_tile_start[thread_i * num_threads_in_cluster + hw_tid] = \
+            dram_a_tile_start[(0 + i0) * dim_k + j1 + j1_idx + j0];
+        }
+        // for (int thread_i = 0; thread_i < a_elems_per_thread; thread_i++) {
+        //   uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i;
+        //   smem_a_tile_start[SMEM_MAT_OFFSET(elem_offset / TILE_K, elem_offset % TILE_K, TILE_K)] = \
+        //     dram_a_tile_start[elem_offset / TILE_K * dim_k + elem_offset % TILE_K];
+        // }
+        #pragma GCC unroll 8
+        for (int thread_i = 0; thread_i < b_elems_per_thread; thread_i++) {
+          uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i;  // NOTE(review): thread_load_offset, thread_load_stride and SMEM_MAT_OFFSET are not defined in this file — confirm a header provides them before building without HARDCODE
+          smem_b_tile_start[SMEM_MAT_OFFSET(elem_offset / TILE_N, elem_offset % TILE_N, TILE_N)] = \
+            dram_b_tile_start[elem_offset / TILE_N * dim_n + elem_offset % TILE_N];
+        }
+        #endif
+
+        #ifdef DEBUG_PRINT
+        if (hw_tid == 0) {
+          PRINTF("\nA %d %d\n", tile_i, tile_k);
+          for (int i = 0; i < TILE_M; i += 8) {
+            for (int j = 0; j < TILE_K; j += 8) {
+              uint32_t mat_offset = SMEM_MAT_OFFSET(i, j, TILE_K);
+              PRINTF("%x %x ",
+                (int) (smem_a_tile_start[mat_offset]),
+                (int) (smem_a_tile_start[mat_offset + 4])
+              );
+            }
+            PRINTF("\n");
+          }
+          PRINTF("\nB %d %d\n", tile_k, tile_j);
+          for (int i = 0; i < TILE_K; i += 8) {
+            for (int j = 0; j < TILE_N; j += 8) {
+              uint32_t mat_offset = SMEM_MAT_OFFSET(i, j, TILE_N);
+              PRINTF("%x %x ",
+                (int) (smem_b_tile_start[mat_offset]),
+                (int) (smem_b_tile_start[mat_offset + 4])
+              );
+            }
+            PRINTF("\n");
+          }
+        }
+        #endif
+
+
+        rd_cycles(marker2);
+        // cluster wide barrier to wait for A and B loads to complete
+        threadblock_barrier(/*barrier_id=*/0, /*count=*/NUM_WARPS);
+        rd_cycles(marker3);
+        __asm__("gemmini:");
+        if (HW_TID() == 0) {  // a single thread issues the Gemmini commands for this k-tile
+          #ifdef DBUF
+            gemmini_fence();
+          #endif
+          sp_tiled_matmul_full_spad_ws(
+            #ifdef DBUF
+              (tile_k & 1) ? SPAD_ADDR_4K : SPAD_ADDR_0K, (tile_k & 1) ? SPAD_ADDR_12K : SPAD_ADDR_8K,
+            #else
+              SPAD_ADDR_0K, SPAD_ADDR_12K,
+            #endif
+            /*spad_D=*/0, /*spad_C=*/SPAD_ADDR_4K,
+            /*I=*/TILE_M / DIM, /*J=*/TILE_N / DIM, /*K=*/TILE_K / DIM, /*pad_I=*/0, /*pad_J=*/0, /*pad_K=*/0,
+            /*a_transpose=*/0, /*b_transpose=*/0, /*full_C=*/0, /*low_D=*/0,
+            #ifdef EXT_ACCUMULATE
+            /*acc=*/0, /*act=*/NO_ACTIVATION, /*skips=*/0x38U);
+            #else
+            /*acc=*/tile_k != 0, /*act=*/NO_ACTIVATION, /*skips=*/0xB8U);  // acc is set on every k-tile after the first
+            #endif
+          #ifndef DBUF
+          gemmini_fence();
+          #endif
+        }
+        __asm__("end_gemmini:");
+        rd_cycles(marker4);
+        threadblock_barrier(/*barrier_id=*/0, /*count=*/NUM_WARPS);  // rejoin after thread 0 has issued the matmul
+        rd_cycles(marker5);
+
+        // accumulate C matrix
+        #ifdef EXT_ACCUMULATE
+        __asm__("accumulate:");
+        if (tile_k == 0) {
+          #pragma GCC ivdep
+          #pragma GCC unroll 8
+          for (int thread_i = 0; thread_i < c_elems_per_thread; thread_i++) {
+            constexpr uint32_t s = num_threads_in_cluster;
+            smem_acc_tile_start[thread_i * s] = smem_c_tile_start[hw_tid + s * thread_i];
+          }
+        } else {
+          #if (TILE_NK / NUM_THREADS / NUM_WARPS / CORES_PER_CLUSTER) != 8
+          #error CANNOT UNROLL
+          #endif
+          for (int thread_i = 0; thread_i < c_elems_per_thread; thread_i += 8) {
+            constexpr uint32_t s = num_threads_in_cluster;
+            smem_acc_tile_start[s * 0] += smem_c_tile_start[hw_tid + s * 0];
+            smem_acc_tile_start[s * 1] += smem_c_tile_start[hw_tid + s * 1];
+            smem_acc_tile_start[s * 2] += smem_c_tile_start[hw_tid + s * 2];
+            smem_acc_tile_start[s * 3] += smem_c_tile_start[hw_tid + s * 3];
+            smem_acc_tile_start[s * 4] += smem_c_tile_start[hw_tid + s * 4];
+            smem_acc_tile_start[s * 5] += smem_c_tile_start[hw_tid + s * 5];
+            smem_acc_tile_start[s * 6] += smem_c_tile_start[hw_tid + s * 6];
+            smem_acc_tile_start[s * 7] += smem_c_tile_start[hw_tid + s * 7];
+          }
+        }
+        __asm__("end_accumulate:");
+        #endif
+
+        #ifdef DEBUG_PRINT
+        if (hw_tid == 0) {
+          PRINTF("\nC %d %d %d\n", tile_i, tile_j, tile_k);
+          for (int i = 0; i < TILE_M; i += 8) {
+            for (int j = 0; j < TILE_N; j += 8) {
+              uint32_t mat_offset = SMEM_MAT_OFFSET(i, j, TILE_N);
+              PRINTF("%d %d ",
+                (int) (smem_c_tile_start[mat_offset]),
+                (int) (smem_c_tile_start[mat_offset + 4])
+              );
+            }
+            PRINTF("\n");
+          }
+        }
+        #endif
+        rd_cycles(marker6);
+
+        /* if (HW_TID() == 0) {
+          PRINTF("\ntile start:           %d\n", marker1);
+          PRINTF("single tile cycles:   %d\n", marker6 - marker1);
+          PRINTF("A/B tile load cycles: %d\n", marker2 - marker1);
+          PRINTF("first barrier:        %d\n", marker3 - marker2);
+          PRINTF("gemmini cycles:       %d\n", marker4 - marker3);
+          PRINTF("second barrier:       %d\n", marker5 - marker4);
+        } */
+
+      }
+
+      #ifndef EXT_ACCUMULATE
+      threadblock_barrier(/*barrier_id=*/0, /*count=*/NUM_WARPS);
+      rd_cycles(marker6);
+      __asm__("mvout_spad_ser:");
+      // mvout to scratchpad for activation
+      if (HW_TID() == 0) {
+        __asm__("mvout_spad:");
+        #ifdef DBUF
+        gemmini_fence();
+        #endif
+        ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, 0, (4ULL << 32) | (4ULL << 16) | 4ULL, k_LOOP_WS_CONFIG_BOUNDS)
+        ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, 0, 0x278U, k_LOOP_WS)
+        /* #pragma gcc unroll 16
+        for (int i = 0; i < TILE_MN / DIM; i += DIM) {
+          gemmini_mvout_spad(i, 0x80000000ULL + i); // FIXME: C is not necessarily at 0
+        } */
+        __asm__("mvout_spad_fence:");
+        gemmini_fence();
+      }
+      __asm__("mvout_spad_bar:");
+      threadblock_barrier(/*barrier_id=*/0, /*count=*/NUM_WARPS);
+      __asm__("end_mvout_spad:");
+      #endif
+      rd_cycles(marker7);
+
+      // move out to dram
+      __asm__("mvout_dram:");
+      #ifdef HARDCODE
+      #if (TILE_MN / NUM_THREADS / NUM_WARPS / CORES_PER_CLUSTER) != 8
+        #error CANNOT UNROLL
+      #endif
+      constexpr uint32_t every_iter = j1_stride;
+      const uint32_t every_2iters = i1_stride * dim_n;
+      const uint32_t runtime_const = i0 * dim_n + j1_idx + j0;
+      float * const dram_c_tile_start = C + tile_i * TILE_M * dim_n + tile_j * TILE_N + runtime_const;
+
+      float v0 = smem_acc_tile_start[0 * num_threads_in_cluster];
+      float v1 = smem_acc_tile_start[1 * num_threads_in_cluster];
+      float v2 = smem_acc_tile_start[2 * num_threads_in_cluster];
+      float v3 = smem_acc_tile_start[3 * num_threads_in_cluster];
+      dram_c_tile_start[every_iter * 0 + every_2iters * 0] = v0;
+      dram_c_tile_start[every_iter * 1 + every_2iters * 0] = v1;
+      dram_c_tile_start[every_iter * 0 + every_2iters * 1] = v2;
+      dram_c_tile_start[every_iter * 1 + every_2iters * 1] = v3;
+
+      v0 = smem_acc_tile_start[4 * num_threads_in_cluster];
+      v1 = smem_acc_tile_start[5 * num_threads_in_cluster];
+      v2 = smem_acc_tile_start[6 * num_threads_in_cluster];
+      v3 = smem_acc_tile_start[7 * num_threads_in_cluster];
+      dram_c_tile_start[every_iter * 0 + every_2iters * 2] = v0;
+      dram_c_tile_start[every_iter * 1 + every_2iters * 2] = v1;
+      dram_c_tile_start[every_iter * 0 + every_2iters * 3] = v2;
+      dram_c_tile_start[every_iter * 1 + every_2iters * 3] = v3;
+
+      #else
+      /*dram_c_tile_start[runtime_const + every_iter * 0 + every_2iters * 0] = \
+        smem_acc_tile_start[0 * num_threads_in_cluster];
+      dram_c_tile_start[runtime_const + every_iter * 1 + every_2iters * 0] = \
+        smem_acc_tile_start[1 * num_threads_in_cluster];
+      dram_c_tile_start[runtime_const + every_iter * 0 + every_2iters * 1] = \
+        smem_acc_tile_start[2 * num_threads_in_cluster];
+      dram_c_tile_start[runtime_const + every_iter * 1 + every_2iters * 1] = \
+        smem_acc_tile_start[3 * num_threads_in_cluster];
+      dram_c_tile_start[runtime_const + every_iter * 0 + every_2iters * 2] = \
+        smem_acc_tile_start[4 * num_threads_in_cluster];
+      dram_c_tile_start[runtime_const + every_iter * 1 + every_2iters * 2] = \
+        smem_acc_tile_start[5 * num_threads_in_cluster];
+      dram_c_tile_start[runtime_const + every_iter * 0 + every_2iters * 3] = \
+        smem_acc_tile_start[6 * num_threads_in_cluster];
+      dram_c_tile_start[runtime_const + every_iter * 1 + every_2iters * 3] = \
+        smem_acc_tile_start[7 * num_threads_in_cluster];*/
+
+      #pragma GCC unroll 8
+      for (int thread_i = 0; thread_i < c_elems_per_thread; thread_i++) {
+        uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i;
+        dram_c_tile_start[elem_offset / TILE_N * dim_n + elem_offset % TILE_N] = \
+          *(SMEM_ADDR_8K + SMEM_MAT_OFFSET(elem_offset / TILE_N, elem_offset % TILE_N, TILE_N));
+      }
+      #endif
+      __asm__("end_mvout_dram:");
+
+      rd_cycles(marker8);
+    }
+  }
+  // last thread block complete
+  if (threadblock_id == NUM_CLUSTERS - 1) {
+    threadblock_barrier(/*barrier_id=*/0, /*count=*/NUM_WARPS);
+    rd_cycles_force(marker9);
+    if (HW_TID() == 0) {
+      PRINTF("\ncomplete\n");
+      PRINTF("total cycles:         %d\n", marker9 - marker0);
+    }
+    #ifdef DETAILED_PERF
+      vx_tmc(0x81);
+      for (int x = 0; x < num_threads_in_cluster; x += num_threads_in_cluster - 1) {
+        if (HW_TID() == x) {
+          PRINTF("\ntile start:           %d\n", marker1);
+          PRINTF("single tile cycles:   %d\n", marker6 - marker1);
+          PRINTF("A/B tile load cycles: %d\n", marker2 - marker1);
+          PRINTF("first barrier:        %d\n", marker3 - marker2);
+          PRINTF("gemmini cycles:       %d\n", marker4 - marker3);
+          PRINTF("second barrier:       %d\n", marker5 - marker4);
+          #ifdef EXT_ACCUMULATE
+          PRINTF("accumulation cycles:  %d\n", marker6 - marker5);
+          #else
+          PRINTF("smem mvout cycles:    %d %d-%d\n", marker7 - marker6, marker7, marker6);
+          #endif
+          PRINTF("dram mvout cycles:    %d\n", marker8 - marker7);
+        }
+        threadblock_barrier(/*barrier_id=*/1, /*count=*/NUM_WARPS);
+      }
+    #endif
+    if (HW_TID() == 0) {
+      for (int i = 0; i < dim_m; i += 8) {  // samples C at (i, j) and (i, j + 4) for i, j in steps of 8
+        for (int j = 0; j < dim_n; j += 8) {
+          PRINTF("%d %d ", (int) (C[i * dim_n + j]), (int) (C[i * dim_n + j + 4]));
+        }
+        PRINTF("\n");
+      }
+    }
+  }
+  vx_tmc(0);  // NOTE(review): presumably sets an empty thread mask to retire the calling warp — confirm
+}
+
+void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
+  // Split the flat task id into (threadblock, index within the threadblock)
+  // and hand both to the tiled matmul.
+  // @perf: every thread recomputes these values, which are mostly identical
+  // across the threadblock.
+  thread_block_matmul_gemmini(arg,
+                              /*threadblock_id=*/task_id / NUM_THREADS_IN_CLUSTER,
+                              /*tid_in_threadblock=*/task_id % NUM_THREADS_IN_CLUSTER);
+}
+
+int main() {
+  // The host copies a kernel_arg_t to this fixed device address before launch.
+  auto *const arg = reinterpret_cast<kernel_arg_t *>(KERNEL_ARG_DEV_MEM_ADDR);
+  // One task per thread: threads x warps x cores per cluster, times the cluster count.
+  const uint32_t grid_size = vx_num_threads() * vx_num_warps() * CORES_PER_CLUSTER * NUM_CLUSTERS;
+#ifdef RADIANCE
+  vx_spawn_tasks_cluster(grid_size, (vx_spawn_tasks_cb)kernel_body, arg);
+#else
+  // NOTE: This kernel assumes contiguous thread scheduling for efficient shared
+  // memory allocation, and therefore does not work with original vx_spawn_tasks
+  vx_spawn_tasks_contiguous(grid_size, (vx_spawn_tasks_cb)kernel_body, arg);
+#endif
+  return 0;
+}
\ No newline at end of file
diff --git a/tests/regression/sgemm_gemmini/main.cpp b/tests/regression/sgemm_gemmini/main.cpp
new file mode 100644
index 00000000..54531062
--- /dev/null
+++ b/tests/regression/sgemm_gemmini/main.cpp
@@ -0,0 +1,274 @@
+#include <iostream>
+#include <fstream>
+#include <unistd.h>
+#include <string.h>
+#include <vortex.h>
+#include <vector>
+#include "common.h"
+
+#define RT_CHECK(_expr) /* evaluate _expr once; non-zero is fatal */ \
+   do {                                                         \
+     int _ret = _expr;                                          \
+     if (0 == _ret)                                             \
+       break;                                                   \
+     printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);   \
+     cleanup(); /* release device resources before exiting */   \
+     exit(-1);                                                  \
+   } while (false)
+
+///////////////////////////////////////////////////////////////////////////////
+
+const char* kernel_file = "kernel.bin";  // kernel image path; overridden by -k
+uint32_t count = 0;                      // value of -n; main() bumps 0 to 1, nothing else in this file reads it
+
+std::vector<float> src_a_data;  // host copy of A (dim_m x dim_k, row-major)
+std::vector<float> src_b_data;  // host copy of B (dim_k x dim_n, row-major)
+std::vector<float> ref_data;    // host-computed reference C (dim_m x dim_n, row-major)
+
+vx_device_h device = nullptr;      // set by vx_dev_open() in main(); checked by cleanup()
+std::vector<uint8_t> staging_buf;  // scratch buffer reused for every host<->device copy
+kernel_arg_t kernel_arg = {};      // argument block uploaded to KERNEL_ARG_DEV_MEM_ADDR
+
+static void show_usage() {  // print the banner and the option summary to stdout
+   std::cout << "Vortex Test." << std::endl
+             << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
+}
+
+static void parse_args(int argc, char **argv) {
+  // Consume the getopt() option stream: -n <count>, -k <kernel file>, -h / -? for help.
+  for (;;) {
+    const int opt = getopt(argc, argv, "n:k:h?");
+    if (opt == -1) {
+      break;  // no more options
+    }
+    if (opt == 'n') {
+      count = atoi(optarg);
+    } else if (opt == 'k') {
+      kernel_file = optarg;
+    } else if (opt == 'h' || opt == '?') {
+      // both print the usage text and exit with status 0
+      show_usage();
+      exit(0);
+    } else {
+      show_usage();
+      exit(-1);
+    }
+  }
+}
+
+void cleanup() {
+  // Free the three device buffers and close the device; no-op if it was never opened.
+  if (!device) return;
+  vx_mem_free(device, kernel_arg.addr_a);
+  vx_mem_free(device, kernel_arg.addr_b);
+  vx_mem_free(device, kernel_arg.addr_c);
+  vx_dev_close(device);
+}
+
+void generate_source_matrix(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
+  const auto fill_and_log = [](std::vector<float> &mat, uint32_t elems, const char *tag) {
+    mat.resize(elems);
+    uint32_t idx = 0;
+    for (float &value : mat) {  // element i holds float(i); each one is echoed to stdout
+      value = static_cast<float>(idx);
+      std::cout << tag << idx << ": value=" << value << std::endl;
+      ++idx;
+    }
+  };
+  fill_and_log(src_a_data, dim_m * dim_k, "A: ");  // A is dim_m x dim_k
+  fill_and_log(src_b_data, dim_k * dim_n, "B: ");  // B is dim_k x dim_n
+}
+
+void generate_reference_matmul(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
+  // Host reference, row-major: C[row][col] = sum over inner of A[row][inner] * B[inner][col].
+  ref_data.resize(dim_m * dim_n);
+  for (uint32_t row = 0; row != dim_m; ++row) {
+    for (uint32_t col = 0; col != dim_n; ++col) {
+      float acc = 0.0f;
+      for (uint32_t inner = 0; inner != dim_k; ++inner) {
+        acc += src_a_data[row * dim_k + inner] * src_b_data[inner * dim_n + col];
+      }
+      ref_data.at(row * dim_n + col) = acc;
+    }
+  }
+}
+
+int run_test(const kernel_arg_t& kernel_arg,    // supplies addr_c, the device address read back
+             uint32_t buf_size,                 // bytes copied from addr_c into staging_buf
+             uint32_t dim_m, uint32_t dim_n) {  // C is dim_m x dim_n; returns 0 on pass, 1 on mismatch
+  // start device
+  std::cout << "start device" << std::endl;
+  RT_CHECK(vx_start(device));
+
+  // wait for completion
+  std::cout << "wait for completion" << std::endl;
+  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
+
+  // download destination buffer
+  std::cout << "download destination buffer" << std::endl;
+  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_c, buf_size));
+
+  // verify result against ref_data with a relative tolerance of 1e-6
+  std::cout << "verify result" << std::endl;
+  {
+    int errors = 0;
+    auto buf_ptr = (float*)staging_buf.data();
+    for (uint32_t i = 0; i < dim_m * dim_n; ++i) {
+      float ref = ref_data.at(i);
+      float cur = buf_ptr[i];
+      // Negated "<=" (not ">") so a NaN result is counted as an error; no division, so ref == 0 is safe.
+      if (!(std::abs(cur - ref) <= 1e-6 * std::abs(ref))) {
+        std::cout << "error at result #" << std::dec << i
+                  << ": actual=" << cur << ", expected=" << ref << std::endl;
+        ++errors;
+      }
+    }
+    if (errors != 0) {
+      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
+      std::cout << "FAILED!" << std::endl;
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+int main(int argc, char *argv[]) {
+  // Host driver: parse options, open the device, upload inputs, then run and verify.
+  parse_args(argc, argv);
+
+  if (count == 0) {
+    count = 1;
+  }
+
+  std::srand(50);
+
+  // open device connection
+  std::cout << "open device connection" << std::endl;
+  RT_CHECK(vx_dev_open(&device));
+
+  // FIXME: hardcoded
+  uint32_t dim_m = 64;
+  uint32_t dim_n = 64;
+  uint32_t dim_k = 64;
+
+  generate_source_matrix(dim_m, dim_n, dim_k);
+  generate_reference_matmul(dim_m, dim_n, dim_k);
+
+  uint32_t src_a_buf_size = src_a_data.size() * sizeof(src_a_data[0]);
+  uint32_t src_b_buf_size = src_b_data.size() * sizeof(src_b_data[0]);
+  uint32_t dst_buf_size = ref_data.size() * sizeof(ref_data[0]);
+
+  std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
+
+  // upload program
+  std::cout << "upload program" << std::endl;
+  RT_CHECK(vx_upload_kernel_file(device, kernel_file));
+
+  // allocate device memory
+  std::cout << "allocate device memory" << std::endl;
+  RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a));
+  RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b));
+  RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c));
+
+  kernel_arg.dim_m = dim_m;
+  kernel_arg.dim_n = dim_n;
+  kernel_arg.dim_k = dim_k;
+
+  std::cout << "dev_addr_a=0x" << std::hex << kernel_arg.addr_a << std::endl;
+  std::cout << "dev_addr_b=0x" << std::hex << kernel_arg.addr_b << std::endl;
+  std::cout << "dev_addr_c=0x" << std::hex << kernel_arg.addr_c << std::endl;
+
+  // allocate staging buffer (sized for the largest single transfer)
+  {
+    std::cout << "allocate staging buffer" << std::endl;
+    uint32_t staging_buf_size = std::max<uint32_t>(
+        src_a_buf_size,
+        std::max<uint32_t>(
+            src_b_buf_size,
+            std::max<uint32_t>(dst_buf_size, sizeof(kernel_arg_t))));
+    staging_buf.resize(staging_buf_size);
+  }
+
+  // upload kernel argument (also dumped to args.bin)
+  {
+    std::cout << "upload kernel argument" << std::endl;
+    auto buf_ptr = staging_buf.data();
+    kernel_arg.addr_a = (uint64_t) 0x20000;  // NOTE(review): overrides the address vx_mem_alloc returned above; cleanup() then frees this value — confirm intended
+    kernel_arg.addr_b = (uint64_t) 0x28000;
+    kernel_arg.addr_c = (uint64_t) 0xc0000000ULL;
+    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
+
+    std::cout << "uploading argument buffer to device, device mem address="
+              << std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
+              << sizeof(kernel_arg_t) << " bytes\n";
+    std::ofstream file("args.bin", std::ios::binary | std::ios::out);
+    if (!file) {
+        std::cerr << "error: failed to open args.bin for writing\n";
+        exit(EXIT_FAILURE);
+    }
+    file.write(reinterpret_cast<char *>(staging_buf.data()),
+               sizeof(kernel_arg_t));
+    file.close();
+
+    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
+  }
+
+  // upload source buffers (each is also dumped to input.<a|b>.bin)
+  {
+    {
+        auto buf_ptr = staging_buf.data();
+        memcpy(buf_ptr, src_a_data.data(), src_a_data.size() * sizeof(float));
+        RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(),
+                                src_a_buf_size));
+
+        std::cout << "uploading source A matrix to device, device mem address="
+                  << std::hex << kernel_arg.addr_a << ", size=" << std::dec
+                  << src_a_buf_size << " bytes\n";
+        std::ofstream file("input.a.bin", std::ios::binary | std::ios::out);
+        if (!file) {
+            std::cerr << "error: failed to open input.a.bin for writing\n";
+            exit(EXIT_FAILURE);
+        }
+        file.write(reinterpret_cast<char *>(buf_ptr), src_a_buf_size);
+        file.close();
+    }
+    {
+        auto buf_ptr = staging_buf.data();
+        memcpy(buf_ptr, src_b_data.data(), src_b_data.size() * sizeof(float));
+        RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_b, staging_buf.data(),
+                                src_b_buf_size));
+
+        std::cout << "uploading source B matrix to device, device mem address="
+                  << std::hex << kernel_arg.addr_b << ", size=" << std::dec
+                  << src_b_buf_size << " bytes\n";
+        std::ofstream file("input.b.bin", std::ios::binary | std::ios::out);
+        if (!file) {
+            std::cerr << "error: failed to open input.b.bin for writing\n";
+            exit(EXIT_FAILURE);
+        }
+        file.write(reinterpret_cast<char *>(buf_ptr), src_b_buf_size);
+        file.close();
+    }
+  }
+
+  // clear destination buffer (fill C with a recognizable bit pattern before the run)
+  {
+    std::cout << "clear destination buffer" << std::endl;
+    auto buf_ptr = (int32_t*)staging_buf.data();
+    for (uint32_t i = 0; i < ref_data.size(); ++i) {
+      buf_ptr[i] = 0xdeadbeef;
+    }
+    RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_c, staging_buf.data(), dst_buf_size));
+  }
+
+  // run tests
+  std::cout << "run tests" << std::endl;
+  RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.dim_m, kernel_arg.dim_n));
+  std::cout << "PASSED!" << std::endl;
+
+  // cleanup
+  std::cout << "cleanup" << std::endl;
+  cleanup();
+
+  return 0;
+}
diff --git a/tests/regression/sgemm_gemmini/sgemm_gemmini b/tests/regression/sgemm_gemmini/sgemm_gemmini
new file mode 100755
index 00000000..67ade61b
Binary files /dev/null and b/tests/regression/sgemm_gemmini/sgemm_gemmini differ
diff --git a/tests/regression/sgemm_wg/.gitignore b/tests/regression/sgemm_wg/.gitignore
new file mode 100644
index 00000000..7c35ba59
--- /dev/null
+++ b/tests/regression/sgemm_wg/.gitignore
@@ -0,0 +1,5 @@
+*.bin
+*.dump
+*.elf
+sgemm_wg
+.depend
diff --git a/tests/regression/sgemm_wg/Makefile b/tests/regression/sgemm_wg/Makefile
new file mode 100644
index 00000000..289369d2
--- /dev/null
+++ b/tests/regression/sgemm_wg/Makefile
@@ -0,0 +1,9 @@
+PROJECT = sgemm_wg
+
+SRCS = main.cpp common.h
+
+VX_SRCS = kernel.cpp
+
+OPTS ?= -n16
+
+include ../common.mk
diff --git a/tests/regression/sgemm_wg/common.h b/tests/regression/sgemm_wg/common.h
new file mode 100644
index 00000000..74941562
--- /dev/null
+++ b/tests/regression/sgemm_wg/common.h
@@ -0,0 +1,18 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+#include <cstdint>
+
+#define KERNEL_ARG_DEV_MEM_ADDR 0x7fff0000  // device address where the host writes kernel_arg_t
+#define DEV_SMEM_START_ADDR 0xff000000      // base address kernel.cpp uses for its per-threadblock tile buffers
+
+typedef struct {
+  uint32_t dim_m;   // rows of A and C
+  uint32_t dim_n;   // columns of B and C
+  uint32_t dim_k;   // columns of A, rows of B
+  uint64_t addr_a;  // device address of A (float, row-major, dim_m x dim_k)
+  uint64_t addr_b;  // device address of B (float, row-major, dim_k x dim_n)
+  uint64_t addr_c;  // device address of C (float, row-major, dim_m x dim_n)
+} kernel_arg_t;
+
+#endif
diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp
new file mode 100644
index 00000000..86b7309d
--- /dev/null
+++ b/tests/regression/sgemm_wg/kernel.cpp
@@ -0,0 +1,192 @@
+#include <stdint.h>
+#include <vx_intrinsics.h>
+#include <vx_print.h>
+#include <vx_spawn.h>
+#include "common.h"
+
+// Constraints on parameters:
+// * Memory:
+//   (BM + BN) * BK * sizeof(float) <= sharedmem size.
+//   BM * BK == BN * BK >= threadblock size >= NT * CORES_PER_CLUSTER
+//     When larger, the kernel runs a sequential loop to read into sharedmem;
+//     but smaller case is not handled.
+// * Compute:
+//   ( M* N) / (TM*TN) == grid size >= NC*NW*NT
+//   (BM*BN) / (TM*TN) == threadblock size < NT * NW * CORES_PER_CLUSTER
+//   (BM*BN) / (TM*TN) == threadblock size >= NT * CORES_PER_CLUSTER
+// * Combining BM * BK >= (BM*BN) / (TM*TN) == threadblock yields
+//   BM <= BK*TM*TN
+#define BM 32
+#define BN BM
+#define BK 8
+#define TM 4
+#define TN 4
+
+void threadblock_barrier(unsigned int tid_in_threadblock, unsigned int barrier_id, unsigned int count) {
+    vx_fence();                     // fence first, then barrier; tid_in_threadblock is not used here
+    vx_barrier(barrier_id, count);  // barrier_id and count are forwarded unchanged
+}
+
+void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg,  // computes one BM x BN block of C = A * B
+                              const uint32_t tid_in_threadblock,
+                              const uint32_t threadblock_dim_x,  // not referenced in the body
+                              const uint32_t threadblock_dim_y,  // forwarded to the barrier as its count
+                              const uint32_t threadblock_id_x,
+                              const uint32_t threadblock_id_y,
+                              const uint32_t threadblock_id_in_cluster,  // used as the barrier id
+                              float *sharedmem_per_threadblock) {  // A tile (BM*BK floats) followed by the B tile
+  const float *A = (const float *)arg->addr_a;
+  const float *B = (const float *)arg->addr_b;
+  float *C = (float *)arg->addr_c;
+
+  // NOTE(review): original note said "assumes NT == NW == matrix_dim"; nothing below checks it - confirm
+  const uint32_t dim_m = arg->dim_m;
+  const uint32_t dim_n = arg->dim_n;
+  const uint32_t dim_k = arg->dim_k;
+
+  // FIXME: Output block size is assumed to be square, i.e. BM == BN
+  // const uint32_t BM = threadblock_dim_y;
+  // const uint32_t BN = threadblock_dim_y;
+  // const uint32_t BK = threadblock_dim_x;
+  // constexpr uint32_t BM = 8;
+  // constexpr uint32_t BN = 8;
+  // constexpr uint32_t BK = 2;
+
+  const uint32_t local_a_row = tid_in_threadblock / BK;
+  const uint32_t local_a_col = tid_in_threadblock % BK;
+  const uint32_t local_b_row = tid_in_threadblock / BN;
+  const uint32_t local_b_col = tid_in_threadblock % BN;
+  const uint32_t global_a_row = BM * threadblock_id_y + local_a_row;
+  const uint32_t global_b_col = BN * threadblock_id_x + local_b_col;
+
+  const uint32_t local_c_row = tid_in_threadblock / (BN / TN);
+  const uint32_t local_c_col = tid_in_threadblock % (BN / TN);
+
+  // each thread accumulates a TM x TN tile of output elements (reg_c has TM * TN entries)
+  float reg_c[TM * TN] = { 0.0f };
+  float reg_a[TM] = { 0.0f };
+  float reg_b[TN] = { 0.0f };
+
+  volatile float *local_a = sharedmem_per_threadblock;
+  // const size_t local_a_elems = threadblock_dim_x * threadblock_dim_y;
+  const size_t local_a_elems = (BM * BK);
+  volatile float *local_b = sharedmem_per_threadblock + local_a_elems;
+  // rows of the A (resp. B) tile filled per pass when (BM*BN)/(TM*TN) threads each copy one element
+  constexpr uint32_t stride_a = (BM * BN) / BK / (TM * TN);
+  constexpr uint32_t stride_b = (BM * BN) / BN / (TM * TN);
+
+  for (uint32_t k = 0; k < dim_k; k += BK) {
+    // Data move from GMEM to SMEM
+    //
+    // Make sure global offset values for A and B are contiguous between
+    // neighboring threads to ensure GMEM coalescing.
+#pragma GCC unroll 2
+    for (uint32_t load_offset = 0; load_offset < BM; load_offset += stride_a) {
+      const uint32_t global_a_offset =
+          dim_k * (global_a_row + load_offset) + (k + local_a_col);
+      local_a[BK * (local_a_row + load_offset) + local_a_col] =
+          A[global_a_offset];
+    }
+#pragma GCC unroll 2
+    for (uint32_t load_offset = 0; load_offset < BK; load_offset += stride_b) {
+      const uint32_t global_b_offset =
+          dim_n * (k + local_b_row + load_offset) + global_b_col;
+      local_b[BN * (local_b_row + load_offset) + local_b_col] =
+          B[global_b_offset];
+    }
+
+    threadblock_barrier(tid_in_threadblock, threadblock_id_in_cluster,
+                        threadblock_dim_y);
+
+    // Compute single tile*tile matmul
+#pragma GCC unroll 4
+    for (uint32_t local_k = 0; local_k < BK; local_k++) {
+      // First, pump data from SMEM->RF
+#pragma GCC unroll TM
+      for (uint32_t res_idx_m = 0; res_idx_m < TM; res_idx_m++) {
+        reg_a[res_idx_m] =
+            local_a[BK * (TM * local_c_row + res_idx_m) + local_k];
+      }
+#pragma GCC unroll TN
+      for (uint32_t res_idx_n = 0; res_idx_n < TN; res_idx_n++) {
+        reg_b[res_idx_n] =
+            local_b[BN * local_k + (TN * local_c_col + res_idx_n)];
+      }
+
+      // Next, compute multiple result elements (TM*TN) by reusing data in RF
+#pragma GCC unroll TM
+      for (uint32_t res_idx_m = 0; res_idx_m < TM; res_idx_m++) {
+#pragma GCC unroll TN
+        for (uint32_t res_idx_n = 0; res_idx_n < TN; res_idx_n++) {
+          // reg_c[m][n] += reg_a[m] * reg_b[n]; local_b_row is not referenced here
+          reg_c[TN * res_idx_m + res_idx_n] +=
+              reg_a[res_idx_m] * reg_b[res_idx_n];
+          // reg_c[TN * res_idx_m + res_idx_n] +=
+          //     local_a[BK * (TM * local_c_row + res_idx_m) + local_k] *
+          //     local_b[BN * local_k + (TN * local_c_col + res_idx_n)];
+        }
+      }
+    }
+    // presumably keeps the next iteration's tile writes from racing with the reads above - confirm
+    threadblock_barrier(tid_in_threadblock, threadblock_id_in_cluster,
+                        threadblock_dim_y);
+  }
+
+  // Store result data from RF to GMEM
+#pragma GCC unroll TM
+  for (uint32_t res_idx_m = 0; res_idx_m < TM; res_idx_m++) {
+#pragma GCC unroll TN
+    for (uint32_t res_idx_n = 0; res_idx_n < TN; res_idx_n++) {
+      C[dim_n * (BM * threadblock_id_y + TM * local_c_row + res_idx_m) +
+        (BN * threadblock_id_x + TN * local_c_col + res_idx_n)] =
+          reg_c[TN * res_idx_m + res_idx_n];
+    }
+  }
+}
+
+void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
+  // @perf: every thread recomputes these values even though most of them are
+  // identical across the threadblock
+
+  const uint32_t threads_per_threadblock = (BM * BN) / (TM * TN);  // one thread per TM x TN output tile
+#ifdef RADIANCE
+  const uint32_t threadblocks_per_core = vx_num_threads() * vx_num_warps() /
+                                         threads_per_threadblock *
+                                         CORES_PER_CLUSTER;
+#else
+  const uint32_t threadblocks_per_core =
+      vx_num_threads() * vx_num_warps() / threads_per_threadblock;
+#endif
+  const uint32_t threadblock_dim_x = vx_num_threads();
+  const uint32_t threadblock_dim_y = vx_num_warps() / threadblocks_per_core;  // NOTE(review): divisor is 0 if NT*NW < threads_per_threadblock - confirm configs
+  const int threadblock_id = task_id / threads_per_threadblock;
+  const int threadblock_id_in_cluster = threadblock_id % threadblocks_per_core;
+  const int tid_in_threadblock = task_id % threads_per_threadblock;
+
+  const uint32_t dim_m = arg->dim_m;  // read but not used below
+  const uint32_t dim_n = arg->dim_n;
+  const uint32_t dim_n_in_blocks = dim_n / BN;
+  const int threadblock_id_x = threadblock_id % dim_n_in_blocks;  // block column in C
+  const int threadblock_id_y = threadblock_id / dim_n_in_blocks;  // block row in C
+
+  // "static" shared memory allocation: each threadblock gets 2 * BM * BK floats
+  // starting at DEV_SMEM_START_ADDR.  This would determine threadblock occupancy of a single cluster
+  float *sharedmem_per_threadblock =
+      (float *)DEV_SMEM_START_ADDR + (2 * BM * BK) * threadblock_id_in_cluster;
+  thread_block_gemm(arg, tid_in_threadblock, threadblock_dim_x,
+                    threadblock_dim_y, threadblock_id_x, threadblock_id_y,
+                    threadblock_id_in_cluster, sharedmem_per_threadblock);
+}
+
+int main() {
+  kernel_arg_t *kargs = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR;
+  const uint32_t num_tasks = (kargs->dim_m * kargs->dim_n) / (TM * TN);  // one task per TM x TN tile of C
+#ifdef RADIANCE
+  vx_spawn_tasks_cluster(num_tasks, (vx_spawn_tasks_cb)kernel_body, kargs);
+#else
+  // The shared-memory layout chosen in kernel_body relies on tasks being
+  // scheduled contiguously, so the original vx_spawn_tasks cannot be used here.
+  vx_spawn_tasks_contiguous(num_tasks, (vx_spawn_tasks_cb)kernel_body, kargs);
+#endif
+  return 0;
+}
diff --git a/tests/regression/sgemm_wg/main.cpp b/tests/regression/sgemm_wg/main.cpp
new file mode 100644
index 00000000..62625c44
--- /dev/null
+++ b/tests/regression/sgemm_wg/main.cpp
@@ -0,0 +1,292 @@
+#include <iostream>
+#include <fstream>
+#include <unistd.h>
+#include <string.h>
+#include <vortex.h>
+#include <vector>
+#include "common.h"
+
+#define RT_CHECK(_expr) /* run _expr; on nonzero: report, cleanup(), exit(-1) */ \
+   do {                                                         \
+     int _ret = _expr;                                          \
+     if (0 == _ret)                                             \
+       break;                                                   \
+     printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);   \
+	 cleanup();			                                              \
+     exit(-1);                                                  \
+   } while (false)
+
+///////////////////////////////////////////////////////////////////////////////
+
+const char* kernel_file = "kernel.bin";  // kernel image path; replaced by the -k option
+uint32_t count = 0;                      // value of the -n option
+
+std::vector<float> src_a_data;  // host copy of A, filled by generate_source_matrix
+std::vector<float> src_b_data;  // host copy of B, filled by generate_source_matrix
+std::vector<float> ref_data;    // host reference for C, filled by generate_reference_matmul
+
+vx_device_h device = nullptr;      // opened in main, closed in cleanup
+std::vector<uint8_t> staging_buf;  // host buffer used for every copy to/from the device
+kernel_arg_t kernel_arg = {};      // argument block uploaded to KERNEL_ARG_DEV_MEM_ADDR
+
+static void show_usage() {
+   std::cout << "Vortex Test." << std::endl  // banner line, then the option summary
+             << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
+}
+
+static void parse_args(int argc, char **argv) {
+  // Recognized options:
+  //   -n <value>  stored in `count` (converted with atoi)
+  //   -k <file>   kernel image path, stored in `kernel_file`
+  //   -h, '?'     print usage and exit with status 0
+  for (int opt = getopt(argc, argv, "n:k:h?"); opt != -1;
+       opt = getopt(argc, argv, "n:k:h?")) {
+    if (opt == 'n') {
+      count = atoi(optarg);
+    } else if (opt == 'k') {
+      kernel_file = optarg;
+    } else if (opt == 'h' || opt == '?') {
+      show_usage();
+      exit(0);
+    } else {
+      // any other value returned by getopt is treated as an error
+      show_usage();
+      exit(-1);
+    }
+  }
+}
+
+void cleanup() {
+  // Nothing to release when the device was never opened.
+  if (!device) {
+    return;
+  }
+  // The vx_mem_free calls for addr_a/addr_b/addr_c are disabled; only the handle is closed.
+  vx_dev_close(device);
+}
+
+void generate_source_matrix(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
+  src_a_data.resize(dim_m * dim_k);
+  src_b_data.resize(dim_k * dim_n);
+  // Fill A and B with their own linear index; '\n' (not std::endl) avoids one flush per element.
+  for (uint32_t i = 0; i < src_a_data.size(); ++i) {
+    src_a_data[i] = static_cast<float>(i);
+    std::cout << "A: " << i << ": value=" << src_a_data[i] << '\n';
+  }
+  for (uint32_t i = 0; i < src_b_data.size(); ++i) {
+    src_b_data[i] = static_cast<float>(i);
+    std::cout << "B: " << i << ": value=" << src_b_data[i] << '\n';
+  }
+}
+
+void generate_reference_matmul(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
+  ref_data.resize(dim_m * dim_n);
+  // Host reference, row-major: ref_data[row][col] = sum over inner of A[row][inner] * B[inner][col].
+  for (uint32_t row = 0; row < dim_m; ++row) {
+    for (uint32_t col = 0; col < dim_n; ++col) {
+      float acc = 0.0f;
+      for (uint32_t inner = 0; inner < dim_k; ++inner) {
+        acc += src_a_data[dim_k * row + inner] * src_b_data[dim_n * inner + col];
+      }
+      ref_data.at(dim_n * row + col) = acc;
+    }
+  }
+}
+
+// Starts the device, downloads C, dumps it and the host reference to *.bin files, and
+// compares them. Returns 0 when every element matches and 1 otherwise.
+int run_test(const kernel_arg_t& kernel_arg, uint32_t buf_size, uint32_t dim_m, uint32_t dim_n) {
+  // start device
+  std::cout << "start device" << std::endl;
+  RT_CHECK(vx_start(device));
+
+  // wait for completion
+  std::cout << "wait for completion" << std::endl;
+  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
+
+  // download destination buffer
+  std::cout << "download destination buffer" << std::endl;
+  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_c, buf_size));
+
+  std::cout << "downloading result C matrix from device, device mem address="
+            << std::hex << kernel_arg.addr_c << ", size=" << std::dec
+            << buf_size << " bytes\n";
+  std::ofstream file("output.c.bin", std::ios::binary | std::ios::out);
+  if (!file) {
+    std::cerr << "error: failed to open output.c.bin for writing\n";
+    exit(EXIT_FAILURE);
+  }
+  file.write(reinterpret_cast<char *>(staging_buf.data()), buf_size);
+  file.close();
+
+  std::ofstream ref_file("reference.c.bin", std::ios::binary | std::ios::out);
+  if (!ref_file) {
+    std::cerr << "error: failed to open reference.c.bin for writing\n";
+    exit(EXIT_FAILURE);
+  }
+  ref_file.write(reinterpret_cast<char *>(ref_data.data()), buf_size);
+  ref_file.close();
+
+  // verify result: relative tolerance 1e-6; the negated <= makes a NaN result count as an error
+  std::cout << "verify result" << std::endl;
+  {
+    int errors = 0;
+    auto buf_ptr = (float*)staging_buf.data();
+    for (uint32_t i = 0; i < dim_m * dim_n; ++i) {
+      float ref = ref_data.at(i);
+      float cur = buf_ptr[i];
+      if (!(std::abs(cur - ref) <= 1e-6 * std::abs(ref))) {
+        std::cout << "error at result #" << std::dec << i
+                  << std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
+        ++errors;
+      }
+    }
+    if (errors != 0) {
+      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
+      std::cout << "FAILED!" << std::endl;
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+// Host driver: uploads the kernel image, its arguments and the A/B matrices, runs the
+// device and verifies C. Each uploaded buffer is also dumped to a *.bin file.
+int main(int argc, char *argv[]) {
+  parse_args(argc, argv);  // parse command arguments
+  if (count == 0) {
+    count = 1;
+  }
+
+  std::srand(50);
+
+  // open device connection
+  std::cout << "open device connection" << std::endl;
+  RT_CHECK(vx_dev_open(&device));
+
+  // FIXME: hardcoded
+  uint32_t dim_m = 128;
+  uint32_t dim_n = 128;
+  uint32_t dim_k = 128;
+
+  generate_source_matrix(dim_m, dim_n, dim_k);
+  generate_reference_matmul(dim_m, dim_n, dim_k);
+
+  uint32_t src_a_buf_size = src_a_data.size() * sizeof(src_a_data[0]);
+  uint32_t src_b_buf_size = src_b_data.size() * sizeof(src_b_data[0]);
+  uint32_t dst_buf_size = ref_data.size() * sizeof(ref_data[0]);
+
+  std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
+
+  // upload program
+  std::cout << "upload program" << std::endl;
+  RT_CHECK(vx_upload_kernel_file(device, kernel_file));
+
+  // assign fixed device addresses (the vx_mem_alloc calls are disabled); B never starts inside A
+  std::cout << "allocate device memory" << std::endl;
+  // RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a));
+  // RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b));
+  // RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c));
+  kernel_arg.addr_a = 0x20000UL;
+  kernel_arg.addr_b = std::max<uint64_t>(0x28000UL, kernel_arg.addr_a + src_a_buf_size);  // 128x128 A is 0x10000 bytes
+  kernel_arg.addr_c = 0xc0000000UL;
+
+  kernel_arg.dim_m = dim_m;
+  kernel_arg.dim_n = dim_n;
+  kernel_arg.dim_k = dim_k;
+
+  std::cout << "dev_addr_a=0x" << std::hex << kernel_arg.addr_a << std::endl;
+  std::cout << "dev_addr_b=0x" << std::hex << kernel_arg.addr_b << std::endl;
+  std::cout << "dev_addr_c=0x" << std::hex << kernel_arg.addr_c << std::endl;
+
+  // allocate staging buffer (large enough for the biggest single transfer)
+  {
+    std::cout << "allocate staging buffer" << std::endl;
+    uint32_t staging_buf_size = std::max<uint32_t>(
+        src_a_buf_size,
+        std::max<uint32_t>(
+            src_b_buf_size,
+            std::max<uint32_t>(dst_buf_size, sizeof(kernel_arg_t))));
+    staging_buf.resize(staging_buf_size);
+  }
+
+  // upload kernel argument
+  {
+    std::cout << "upload kernel argument" << std::endl;
+    auto buf_ptr = staging_buf.data();
+    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
+    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
+
+    std::cout << "uploading argument buffer to device, device mem address="
+              << std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
+              << sizeof(kernel_arg_t) << " bytes\n";
+    std::ofstream file("args.bin", std::ios::binary | std::ios::out);
+    if (!file) {
+        std::cerr << "error: failed to open args.bin for writing\n";
+        exit(EXIT_FAILURE);
+    }
+    file.write(reinterpret_cast<char *>(staging_buf.data()),
+               sizeof(kernel_arg_t));
+    file.close();
+  }
+
+  // upload source buffer
+  {
+    {
+        auto buf_ptr = staging_buf.data();
+        memcpy(buf_ptr, src_a_data.data(), src_a_data.size() * sizeof(float));
+        RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(),
+                                src_a_buf_size));
+
+        std::cout << "uploading source A matrix to device, device mem address="
+                  << std::hex << kernel_arg.addr_a << ", size=" << std::dec
+                  << src_a_buf_size << " bytes\n";
+        std::ofstream file("input.a.bin", std::ios::binary | std::ios::out);
+        if (!file) {
+            std::cerr << "error: failed to open input.a.bin for writing\n";
+            exit(EXIT_FAILURE);
+        }
+        file.write(reinterpret_cast<char *>(buf_ptr), src_a_buf_size);
+        file.close();
+    }
+    {
+        auto buf_ptr = staging_buf.data();
+        memcpy(buf_ptr, src_b_data.data(), src_b_data.size() * sizeof(float));
+        RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_b, staging_buf.data(),
+                                src_b_buf_size));
+
+        std::cout << "uploading source B matrix to device, device mem address="
+                  << std::hex << kernel_arg.addr_b << ", size=" << std::dec
+                  << src_b_buf_size << " bytes\n";
+        std::ofstream file("input.b.bin", std::ios::binary | std::ios::out);
+        if (!file) {
+            std::cerr << "error: failed to open input.b.bin for writing\n";
+            exit(EXIT_FAILURE);
+        }
+        file.write(reinterpret_cast<char *>(buf_ptr), src_b_buf_size);
+        file.close();
+    }
+  }
+
+  // clear destination buffer (fill C with a recognizable pattern before the run)
+  {
+    std::cout << "clear destination buffer" << std::endl;
+    auto buf_ptr = (int32_t*)staging_buf.data();
+    for (uint32_t i = 0; i < ref_data.size(); ++i) {
+      buf_ptr[i] = 0xdeadbeef;
+    }
+    RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_c, staging_buf.data(), dst_buf_size));
+  }
+
+  // run tests
+  std::cout << "run tests" << std::endl;
+  RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.dim_m, kernel_arg.dim_n));
+  std::cout << "PASSED!" << std::endl;
+
+  // cleanup
+  std::cout << "cleanup" << std::endl;
+  cleanup();
+
+  return 0;
+}
diff --git a/tests/regression/vecaddx/common.h b/tests/regression/vecaddx/common.h
index 2b8f164a..a7b26936 100644
--- a/tests/regression/vecaddx/common.h
+++ b/tests/regression/vecaddx/common.h
@@ -1,7 +1,7 @@
 #ifndef _COMMON_H_
 #define _COMMON_H_
 
-#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
+#define KERNEL_ARG_DEV_MEM_ADDR 0x7fff0000
 
 #ifndef TYPE
 #define TYPE float
diff --git a/tests/regression/vecaddx/kernel.cpp b/tests/regression/vecaddx/kernel.cpp
index 6ed42164..6e782586 100644
--- a/tests/regression/vecaddx/kernel.cpp
+++ b/tests/regression/vecaddx/kernel.cpp
@@ -13,6 +13,10 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
 
 int main() {
 	kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
+#ifdef RADIANCE
+	vx_spawn_tasks_cluster(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
+#else
 	vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
+#endif
 	return 0;
 }
diff --git a/tests/regression/vecaddx/main.cpp b/tests/regression/vecaddx/main.cpp
index 117f3470..e25ad5b4 100644
--- a/tests/regression/vecaddx/main.cpp
+++ b/tests/regression/vecaddx/main.cpp
@@ -1,4 +1,5 @@
 #include <iostream>
+#include <fstream>
 #include <unistd.h>
 #include <string.h>
 #include <vector>
@@ -106,9 +107,9 @@ static void parse_args(int argc, char **argv) {
 
 void cleanup() {
   if (device) {    
-    vx_mem_free(device, kernel_arg.src0_addr);
-    vx_mem_free(device, kernel_arg.src1_addr);
-    vx_mem_free(device, kernel_arg.dst_addr);
+    // vx_mem_free(device, kernel_arg.src0_addr);
+    // vx_mem_free(device, kernel_arg.src1_addr);
+    // vx_mem_free(device, kernel_arg.dst_addr);
     vx_dev_close(device);
   }
 }
@@ -181,9 +182,12 @@ int main(int argc, char *argv[]) {
 
   // allocate device memory
   std::cout << "allocate device memory" << std::endl;
-  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr));
-  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr));
-  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
+  // RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr));
+  // RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr));
+  // RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
+  kernel_arg.src0_addr = 0x20000UL;
+  kernel_arg.src1_addr = 0x28000UL;
+  kernel_arg.dst_addr = 0xc0000000UL;
 
   kernel_arg.num_points = num_points;
 
@@ -201,10 +205,19 @@ int main(int argc, char *argv[]) {
   memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
   RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
 
+  std::ofstream file("args.bin", std::ios::binary | std::ios::out);
+  if (!file) {
+    std::cerr << "error: failed to open args.bin for writing\n";
+    exit(EXIT_FAILURE);
+  }
+  file.write(reinterpret_cast<char *>(staging_buf.data()), sizeof(kernel_arg_t));
+  file.close();
+
   // generate source data
   source_data.resize(2 * num_points);
   for (uint32_t i = 0; i < source_data.size(); ++i) {
-    source_data[i] = Comparator<TYPE>::generate();
+    // source_data[i] = Comparator<TYPE>::generate();
+    source_data[i] = static_cast<float>(i);
   }
 
   // upload source buffer0
@@ -215,6 +228,14 @@ int main(int argc, char *argv[]) {
       buf_ptr[i] = source_data[2 * i + 0];
     }
     RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));
+
+    std::ofstream file("input.a.bin", std::ios::binary | std::ios::out);
+    if (!file) {
+      std::cerr << "error: failed to open input.a.bin for writing\n";
+      exit(EXIT_FAILURE);
+    }
+    file.write(reinterpret_cast<char *>(buf_ptr), buf_size);
+    file.close();
   }
 
   // upload source buffer1
@@ -225,6 +246,14 @@ int main(int argc, char *argv[]) {
       buf_ptr[i] = source_data[2 * i + 1];
     }   
     RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));
+
+    std::ofstream file("input.b.bin", std::ios::binary | std::ios::out);
+    if (!file) {
+      std::cerr << "error: failed to open input.b.bin for writing\n";
+      exit(EXIT_FAILURE);
+    }
+    file.write(reinterpret_cast<char *>(buf_ptr), buf_size);
+    file.close();
   }
 
   // clear destination buffer
@@ -243,4 +272,4 @@ int main(int argc, char *argv[]) {
   std::cout << "PASSED!" << std::endl;
 
   return 0;
-}
\ No newline at end of file
+}
diff --git a/third_party/gemmini-rocc-tests b/third_party/gemmini-rocc-tests
new file mode 160000
index 00000000..6148fc0d
--- /dev/null
+++ b/third_party/gemmini-rocc-tests
@@ -0,0 +1 @@
+Subproject commit 6148fc0d2c7a91ec87e72bdd3c3808c6f985a77e