diff --git a/tests/regression/flash_attention/kernel.gemmini.cpp b/tests/regression/flash_attention/kernel.gemmini.cpp index e3335861..b1ec5b29 100644 --- a/tests/regression/flash_attention/kernel.gemmini.cpp +++ b/tests/regression/flash_attention/kernel.gemmini.cpp @@ -402,13 +402,18 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) { asm volatile ("dbuf_sel_end_%=:" :: ); { + // fence completion of the GEMMs in the previous loop iterations. Note + // this is done at the start of the loop to maximize window of + // overlapping. + // + // NOTE: this ideally needs to be put inside tid_in_warpgroup == 0 + // branch, but that triggers a TL source ID re-used assertion we haven't + // looked at yet. + gemmini_fence(); + // do all of GEMM kickoffs before the SIMT compute // if (tid_in_warpgroup == 0) { - // fence completion of the GEMMs in the previous loop iterations. Note - // this is done at the start of the loop to maximize window of - // overlapping. - gemmini_fence(); if (tile_k >= 2) // delay GEMM II by 2 iters for pipelining { @@ -689,10 +694,13 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) { threadblock_barrier(warpgroup_id_in_cluster, warps_per_warpgroup_per_core); - - // instead of fencing here, we fence at the start of the loop to maximize - // overlapping + // @perf: instead of fencing here, fence at the start of the loop to + // maximize overlapping // gemmini_fence(); + + // // reconverge after mmio + // threadblock_barrier(warpgroup_id_in_cluster, + // warps_per_warpgroup_per_core); } }