From 98eb7cb594924f1bad056015e633462fbf113ae2 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Tue, 22 Oct 2024 21:10:55 -0700
Subject: [PATCH] tensor: Block both HGMMA/HGMMA_WAIT at scoreboard

If we let back-to-back HGMMAs pass at the scoreboard, we can't accurately
keep track of the busy state of the tensor core and block WAITs
accordingly.

TODO: Distinguish "ready-to-fire" from "ready-to-use-writeback".
---
 hw/rtl/core/VX_scoreboard.sv         | 6 ++++--
 hw/rtl/core/VX_tensor_hopper_core.sv | 6 +++---
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv
index 2a39c058..67c077ef 100644
--- a/hw/rtl/core/VX_scoreboard.sv
+++ b/hw/rtl/core/VX_scoreboard.sv
@@ -209,13 +209,15 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
         assign perf_issue_fires_per_cycle[i] = ibuffer_if[i].valid && ibuffer_if[i].ready;
     `endif
 
-        // NOTE(hansung): why is inuse_rd checked? to prevent WAW?
         wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
     `ifdef EXT_T_HOPPER
         wire hgmma_wait = ibuffer_if[i].valid &&
             (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) &&
             (ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA_WAIT);
-        wire hgmma_ready = ~(hgmma_wait && inuse_tensor[ibuffer_if[i].data.wis]);
+        // stall while inuse_tensor[wis] is set so that both HGMMA and
+        // HGMMA_WAIT block; passing HGMMA through would lose track of the
+        // tensor core busy state.  NOTE(review): not gated on instruction type.
+        wire hgmma_ready = !inuse_tensor[ibuffer_if[i].data.wis];
         wire operands_ready = (~(| operands_busy)) && hgmma_ready;
     `else
         wire operands_ready = ~(| operands_busy);
diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv
index 32fa6f5b..b03b0b0e 100644
--- a/hw/rtl/core/VX_tensor_hopper_core.sv
+++ b/hw/rtl/core/VX_tensor_hopper_core.sv
@@ -80,8 +80,8 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
         );
     end
 
-    // this shouldn't really happen unless there's a big contention over
-    // the commit stage
+    // NOTE: hitting this is not a functional error; it tells us that the
+    // backend is not keeping up with the HGMMA calls issued by the kernel
     `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!"))
 
     wire initiate_ready;
@@ -222,7 +222,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
             commit_if.data.PC     = execute_if_data_PC[0];
             commit_if.data.wb     = execute_if_data_wb[0];
             commit_if.data.rd     = execute_if_data_rd[0];
-            commit_if.data.data   = '0; // FIXME ?
+            commit_if.data.data   = '0; // can be arbitrary as rd is zero
             commit_if.data.tensor = 1'b0;
             commit_if.data.pid    = 1'b0;
             commit_if.data.sop    = 1'b1;