From 98eb7cb594924f1bad056015e633462fbf113ae2 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 22 Oct 2024 21:10:55 -0700 Subject: [PATCH] tensor: Block both HGMMA/HGMMA_WAIT at scoreboard If we let back-to-back HGMMAs pass at the scoreboard, we can't accurately keep track of the busy state of the tensor core and block WAITs accordingly. TODO: Distinguish "ready-to-fire" from "ready-to-use-writeback". --- hw/rtl/core/VX_scoreboard.sv | 6 ++++-- hw/rtl/core/VX_tensor_hopper_core.sv | 6 +++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv index 2a39c058..67c077ef 100644 --- a/hw/rtl/core/VX_scoreboard.sv +++ b/hw/rtl/core/VX_scoreboard.sv @@ -209,13 +209,15 @@ module VX_scoreboard import VX_gpu_pkg::*; #( assign perf_issue_fires_per_cycle[i] = ibuffer_if[i].valid && ibuffer_if[i].ready; `endif - // NOTE(hansung): why is inuse_rd checked? to prevent WAW? wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3}; `ifdef EXT_T_HOPPER wire hgmma_wait = ibuffer_if[i].valid && (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) && (ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA_WAIT); - wire hgmma_ready = ~(hgmma_wait && inuse_tensor[ibuffer_if[i].data.wis]); + // Block both HGMMA and HGMMA_WAIT until inuse_tensor for this warp goes low. If we let + // HGMMA through, we can't accurately keep track of the busy state of the tensor core + // and block WAITs accordingly. NOTE(review): the gate below is not qualified by ex_type/op_type, so it appears to stall every instruction of this warp while inuse_tensor is set -- confirm this is intended.
+ wire hgmma_ready = !inuse_tensor[ibuffer_if[i].data.wis]; wire operands_ready = (~(| operands_busy)) && hgmma_ready; `else wire operands_ready = ~(| operands_busy); diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv index 32fa6f5b..b03b0b0e 100644 --- a/hw/rtl/core/VX_tensor_hopper_core.sv +++ b/hw/rtl/core/VX_tensor_hopper_core.sv @@ -80,8 +80,8 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( ); end - // this shouldn't really happen unless there's a big contention over - // the commit stage + // NOTE: this is not an error, but it tells us when the backend can't keep up with + // HGMMA calls from the kernel `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!")) wire initiate_ready; @@ -222,7 +222,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( commit_if.data.PC = execute_if_data_PC[0]; commit_if.data.wb = execute_if_data_wb[0]; commit_if.data.rd = execute_if_data_rd[0]; - commit_if.data.data = '0; // FIXME ? + commit_if.data.data = '0; // can be arbitrary as rd is zero commit_if.data.tensor = 1'b0; commit_if.data.pid = 1'b0; commit_if.data.sop = 1'b1;