diff --git a/kernels/sgemm_tcore/sgemm_impl.hpp b/kernels/sgemm_tcore/sgemm_impl.hpp
index e0b2b8d3..4fd425e9 100644
--- a/kernels/sgemm_tcore/sgemm_impl.hpp
+++ b/kernels/sgemm_tcore/sgemm_impl.hpp
@@ -1257,7 +1257,7 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
               k_LOOP_WS_CONFIG_ADDRS_AB)
           // GEMMINI_CISC(8) does k_LOOP_WS_CONFIG_STRIDES_AB
           GEMMINI_CISC_CMD_R((dim_n << 20) | (dim_k << 8) | 8);
-          gemmini_fence();
+          // gemmini_fence();
 
           // block_k is even: opcode 11 (write to local_a_buf)
           // block_k is odd:  opcode 10 (write to local_a)
@@ -1266,8 +1266,8 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
           // the last iteration of the k-loop is prefetching for the first
           // iteration of the n-loop.  The ping-pong indexing has to match for
           // the two loop ends to connect.
-          const uint32_t a_hexadecile = (block_k & 1) * 4;
-          const uint32_t b_hexadecile = (block_k & 1) * 4 + 11;
+          const uint32_t a_hexadecile = 4 - ((block_k & 1) * 4);
+          const uint32_t b_hexadecile = a_hexadecile + 11;
           GEMMINI_CISC_CMD_R((b_hexadecile << 16) | (a_hexadecile << 8) | GEMMINI_CISC_LOAD_TO_HEXADECILES);
           // // TODO: branch is probably slow
           // if (block_k & 1) {