diff --git a/tests/regression/sgemm_tcore/kernel.cpp b/tests/regression/sgemm_tcore/kernel.cpp
index 4f115aa6..1daed02d 100644
--- a/tests/regression/sgemm_tcore/kernel.cpp
+++ b/tests/regression/sgemm_tcore/kernel.cpp
@@ -7,6 +7,42 @@
 #include "include/gemmini.h"
 #include "gemmini_mmio.h"
 
+constexpr bool DEBUG = true; // enables the smem_A/smem_B tile copies at the end of kernel_body
+
+// Threadblock-cooperative copy of a row-major tile_dim_row x tile_dim_col
+// float tile from src to dest: warps take rows, lanes stride by NUM_THREADS.
+template <uint32_t tile_dim_row, uint32_t tile_dim_col>
+inline void thread_block_copy_tile(const float *src, float *dest,
+                                   const uint32_t tid_in_threadblock,
+                                   const uint32_t threads_per_threadblock,
+                                   const uint32_t threadblock_id_in_cluster) {
+  static_assert(tile_dim_col % NUM_THREADS == 0, "row tail would be dropped");
+  asm volatile("threadblock_copy_tile_start_%=:" ::);
+  const uint32_t tid_in_warp = tid_in_threadblock % NUM_THREADS;
+  const uint32_t warp_id = tid_in_threadblock / NUM_THREADS;
+  const uint32_t warps_in_threadblock = threads_per_threadblock / NUM_THREADS;
+  const uint32_t warps_per_threadblock_per_core =
+      warps_in_threadblock / CORES_PER_CLUSTER;
+#pragma GCC unroll 1
+  for (uint32_t row_offset = 0; row_offset < tile_dim_row;
+       row_offset += warps_in_threadblock) {
+    const uint32_t row = row_offset + warp_id;
+    // Guard the last pass: the warp count may not evenly divide the rows.
+    if (row < tile_dim_row) {
+      uint32_t thread_offset = tile_dim_col * row + tid_in_warp;
+#pragma GCC unroll
+      for (uint32_t i = 0; i < tile_dim_col / NUM_THREADS; i++) {
+        dest[thread_offset] = src[thread_offset];
+        thread_offset += NUM_THREADS;
+      }
+    }
+    // Outside the guard so warps that skip the copy still call the barrier.
+    threadblock_barrier(threadblock_id_in_cluster,
+                        warps_per_threadblock_per_core);
+  }
+  asm volatile("threadblock_copy_tile_finish_%=:" ::);
+}
+
 void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
   // @perf: All threads are running these compute whose result is mostly same
   // across the threadblock
@@ -57,6 +93,24 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
       (float *)arg->addr_c, arg->dim_m, arg->dim_n, arg->dim_k,
       tid_in_threadblock, threadblocks_per_cluster, threadblock_id_in_cluster,
       sharedmem_per_threadblock);
+
+  float *gmem_tmp_d0 = reinterpret_cast<float *>(0xd0000000UL);
+  float *gmem_tmp_d1 = reinterpret_cast<float *>(0xd1000000UL);
+  // NOTE(review): assumes the B tile sits 2 * BM * BK floats after A - confirm.
+  const float *smem_A = reinterpret_cast<float *>(sharedmem_per_threadblock);
+  const float *smem_B = smem_A + 2 * BM * BK;
+  // Debug only: after a barrier, copy the A and B tiles to gmem_tmp_d0/d1.
+  if constexpr (DEBUG) {
+    threadblock_barrier(threadblock_id_in_cluster,
+                        warps_per_threadblock_per_core);
+    // BM x BK floats from smem_A first, then BK x BN floats from smem_B.
+    thread_block_copy_tile<BM, BK>(smem_A, gmem_tmp_d0, tid_in_threadblock,
+                                   threads_per_threadblock,
+                                   threadblock_id_in_cluster);
+    thread_block_copy_tile<BK, BN>(smem_B, gmem_tmp_d1, tid_in_threadblock,
+                                   threads_per_threadblock,
+                                   threadblock_id_in_cluster);
+  }
 }
 
 int main() {
diff --git a/tests/regression/sgemm_tcore/sgemm_impl.hpp b/tests/regression/sgemm_tcore/sgemm_impl.hpp
index 785a538c..faa2c382 100644
--- a/tests/regression/sgemm_tcore/sgemm_impl.hpp
+++ b/tests/regression/sgemm_tcore/sgemm_impl.hpp
@@ -72,7 +72,7 @@ using float_type = float16_t;
 #define TRANSPOSE_AT_PRODUCE 0
 #define TRANSPOSE_AT_CONSUME 0
 
-#define GEMMINI_DMA 0
+#define GEMMINI_DMA 1
 #if SMEM_SIZE == 0x4000
 #define SMEM_ADDR_Q0 ((float * const) 0xff000000)
 #define SMEM_ADDR_Q1 ((float * const) 0xff001000)
@@ -83,7 +83,7 @@ using float_type = float16_t;
 #define SPAD_ADDR_Q2 0x100
 #define SPAD_ADDR_Q3 0x180
 #define BOUND_INST 0x400040004ULL
-#elif SMEM_SIZE == 0x10000
+#elif SMEM_SIZE >= 0x10000
 #define SMEM_ADDR_Q0 ((float * const) 0xff000000)
 #define SMEM_ADDR_Q1 ((float * const) 0xff004000)
 #define SMEM_ADDR_Q2 ((float * const) 0xff008000)