Merge branch 'ae' into ae-hopper

This commit is contained in:
Richard Yan
2025-01-30 15:34:58 -08:00

View File

@@ -1190,10 +1190,10 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
(uint64_t)(B + /*block_k:*/ 0 * BK * dim_n + block_n * BN),
k_LOOP_WS_CONFIG_ADDRS_AB)
// GEMMINI_CISC(8) does k_LOOP_WS_CONFIG_STRIDES_AB
GEMMINI_CISC_CMD_R((dim_n << 20) | (dim_k << 8) | 8);
GEMMINI_CISC_CMD_R((dim_n << 20) | (dim_k << 8) | GEMMINI_CISC_SET_AB_STRIDE);
gemmini_fence();
GEMMINI_CISC_CMD_I(10);
GEMMINI_CISC_CMD_R((11 << 16) | (0 << 8) | GEMMINI_CISC_LOAD_TO_HEXADECILES);
gemmini_fence();
#if 0
@@ -1266,8 +1266,9 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
// the last iteration of the k-loop is prefetching for the first
// iteration of the n-loop. The ping-pong indexing has to match so that
// the two loop ends connect.
const uint32_t opcode = 11 - (block_k & 1);
GEMMINI_CISC_CMD_I(opcode);
const uint32_t a_hexadecile = (block_k & 1) * 4;
const uint32_t b_hexadecile = (block_k & 1) * 4 + 11;
GEMMINI_CISC_CMD_R((b_hexadecile << 16) | (a_hexadecile << 8) | GEMMINI_CISC_LOAD_TO_HEXADECILES);
// // TODO: branch is probably slow
// if (block_k & 1) {
// GEMMINI_CISC_CMD_I(12);