From 17a9d31be58c6f2d9792bb81c83f8605de207204 Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Thu, 30 Jan 2025 15:33:58 -0800 Subject: [PATCH] fix dma invocation --- kernels/sgemm_tcore/sgemm_impl.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernels/sgemm_tcore/sgemm_impl.hpp b/kernels/sgemm_tcore/sgemm_impl.hpp index 10829db2..376a88c9 100644 --- a/kernels/sgemm_tcore/sgemm_impl.hpp +++ b/kernels/sgemm_tcore/sgemm_impl.hpp @@ -1190,10 +1190,10 @@ inline void thread_block_gemm(const T *A, const T *B, float *C, (uint64_t)(B + /*block_k:*/ 0 * BK * dim_n + block_n * BN), k_LOOP_WS_CONFIG_ADDRS_AB) // GEMMINI_CISC(8) does k_LOOP_WS_CONFIG_STRIDES_AB - GEMMINI_CISC_CMD_R((dim_n << 20) | (dim_k << 8) | 8); + GEMMINI_CISC_CMD_R((dim_n << 20) | (dim_k << 8) | GEMMINI_CISC_SET_AB_STRIDE); gemmini_fence(); - GEMMINI_CISC_CMD_I(10); + GEMMINI_CISC_CMD_R((11 << 16) | (0 << 8) | GEMMINI_CISC_LOAD_TO_HEXADECILES); gemmini_fence(); #if 0 @@ -1266,8 +1266,9 @@ inline void thread_block_gemm(const T *A, const T *B, float *C, // the last iteration of the k-loop is prefetching for the first // iteration of the n-loop. The ping-poing indexing has to match for // the two loop end to connect. - const uint32_t opcode = 11 - (block_k & 1); - GEMMINI_CISC_CMD_I(opcode); + const uint32_t a_hexadecile = (block_k & 1) * 4; + const uint32_t b_hexadecile = (block_k & 1) * 4 + 11; + GEMMINI_CISC_CMD_R((b_hexadecile << 16) | (a_hexadecile << 8) | GEMMINI_CISC_LOAD_TO_HEXADECILES); // // TODO: branch is probably slow // if (block_k & 1) { // GEMMINI_CISC_CMD_I(12);