From 408a9b5d2adcc64b76beb84c912da57d92b0b63d Mon Sep 17 00:00:00 2001
From: Hansung Kim
Date: Fri, 11 Oct 2024 17:18:01 -0700
Subject: [PATCH] tensor: Write stall logic for hgmma_wait

HGMMA_WAIT instruction stalls at issue when inuse_tensor is set, which is
done by the previous HGMMA insn.  Currently inuse_tensor is never set
back to zero.
---
Review notes (ignored by git-am; not part of the commit message):
 - Decode: under EXT_T_HOPPER, INST_EXT4 selects HGMMA_WAIT when func3[0]
   is set and HGMMA otherwise; the HMMA decoding is kept in the `else arm.
 - Scoreboard: issuing an HGMMA sets inuse_tensor[wis]; a later HGMMA_WAIT
   with the same wis is held back through operands_ready while that bit is
   set.  No hunk below clears the bit, so the release path is still to be
   written (as the commit message says).
 - NOTE(review): hgmma_start is declared outside `ifdef EXT_T_HOPPER but is
   only referenced inside it in the hunks shown -- confirm this does not
   trip unused-signal lint when EXT_T_HOPPER is undefined.
 - NOTE(review): leading whitespace of the hunks was reconstructed; if
   context does not match the tree, apply with --ignore-whitespace.

 hw/rtl/VX_define.vh          |  5 ++++-
 hw/rtl/core/VX_decode.sv     | 11 +++++++++++
 hw/rtl/core/VX_scoreboard.sv | 16 ++++++++++++++++
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh
index bb96a149..61ce41be 100644
--- a/hw/rtl/VX_define.vh
+++ b/hw/rtl/VX_define.vh
@@ -254,7 +254,10 @@
 `define INST_SFU_IS_WCTL(op) (op <= 5)
 `define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8)
 
-`define INST_TENSOR_HMMA 4'b0000
+`define INST_TENSOR_HMMA       4'b0000
+// Hopper WGMMA-style asynchronous op
+`define INST_TENSOR_HGMMA      4'b0001
+`define INST_TENSOR_HGMMA_WAIT 4'b0010
 
 ///////////////////////////////////////////////////////////////////////////////
 
diff --git a/hw/rtl/core/VX_decode.sv b/hw/rtl/core/VX_decode.sv
index cf21d72f..df3500f2 100644
--- a/hw/rtl/core/VX_decode.sv
+++ b/hw/rtl/core/VX_decode.sv
@@ -543,6 +543,16 @@ module VX_decode #(
             end
         `ifdef EXT_T_ENABLE
             `INST_EXT4: begin
+            `ifdef EXT_T_HOPPER
+                ex_type = `EX_TENSOR;
+                // tensor core macroop is encoded as r-type
+                if (func3[0]) begin
+                    op_type = `INST_TENSOR_HGMMA_WAIT;
+                end else begin
+                    op_type = `INST_TENSOR_HGMMA;
+                end
+                // rd/rs1/rs2/rs3 unused
+            `else
                 ex_type = `EX_TENSOR;
                 op_type = `INST_TENSOR_HMMA;
                 // tensor core macroop is encoded as r-type
@@ -551,6 +561,7 @@ module VX_decode #(
                 `USED_IREG (rs1);
                 `USED_IREG (rs2);
                 `USED_IREG (rs3);
+            `endif
             end
         `endif
             default:;
diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv
index 42a876f5..59886098 100644
--- a/hw/rtl/core/VX_scoreboard.sv
+++ b/hw/rtl/core/VX_scoreboard.sv
@@ -146,6 +146,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
         // have an explicit destination register, use a separate status bit.
         reg [`UP(ISSUE_RATIO)-1:0] inuse_tensor;
 
+        wire hgmma_start = (ibuffer_if[i].data.ex_type == `EX_TENSOR) &&
+                           (ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA);
+
         wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
 
         wire inuse_rd = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd];
@@ -208,7 +211,15 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
         // NOTE(hansung): why is inuse_rd checked? to prevent WAW?
         wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
 
+    `ifdef EXT_T_HOPPER
+        wire hgmma_wait = ibuffer_if[i].valid &&
+                          (ibuffer_if[i].data.ex_type == `EX_TENSOR) &&
+                          (ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA_WAIT);
+        wire hgmma_ready = ~(hgmma_wait && inuse_tensor[ibuffer_if[i].data.wis]);
+        wire operands_ready = (~(| operands_busy)) && hgmma_ready;
+    `else
         wire operands_ready = ~(| operands_busy);
+    `endif
 
         wire stg_valid_in, stg_ready_in;
         assign stg_valid_in = ibuffer_if[i].valid && operands_ready;
@@ -238,6 +249,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
                 if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin
                     inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1;
                 end
+            `ifdef EXT_T_HOPPER
+                if (ibuffer_if[i].valid && ibuffer_if[i].ready && hgmma_start) begin
+                    inuse_tensor[ibuffer_if[i].data.wis] <= 1'b1;
+                end
+            `endif
             end
         `ifdef PERF_ENABLE
             if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin