From 220ee0aa5ecebfcdf28f5b0cd912a80d369b1e22 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 31 May 2024 17:35:01 -0700 Subject: [PATCH] sgemm_tcore: Unroll around WMITER/WNITER This is within a very tight loop so it's worth unrolling at the risk of stack spills somewhere else. --- tests/regression/sgemm_tcore/kernel.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/regression/sgemm_tcore/kernel.cpp b/tests/regression/sgemm_tcore/kernel.cpp index 74cfd38a..5e048fc5 100644 --- a/tests/regression/sgemm_tcore/kernel.cpp +++ b/tests/regression/sgemm_tcore/kernel.cpp @@ -341,20 +341,19 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, threadblock_dim_y); #if USE_TENSOR_CORE -// #pragma GCC unroll 1 + // @perf: this loop spills to stack a lot because of all the flws in + // vx_wmma_load +#pragma GCC unroll 1 for (int i = 0; i < BK_LOOP; i++) { - // @perf: this loop spills to stack a lot because of all the flws in vx_wmma_load #pragma GCC unroll 1 for (uint32_t local_k = 0; local_k < BK; local_k += TCK) { // perform wmma // vx_wmma_load(local_a, local_b, warp_x, warp_y, tid_in_warp); - // FIXME: If multiple warps try to issue to Tensor Core at the same time, - // does one stall the other? // FIXME: this is wrong!! need separate accumulation register for // WM/WN_ITERS -#pragma GCC unroll 1 +#pragma GCC unroll 2 for (int wm_iter = 0; wm_iter < WMITER; wm_iter++) { -#pragma GCC unroll 1 +#pragma GCC unroll 2 for (int wn_iter = 0; wn_iter < WNITER; wn_iter++) { #if TC_SINGLE_WARP if (warp_in_threadblock == 0) {