tensor: Write stall logic for hgmma_wait
HGMMA_WAIT instruction stalls at issue when inuse_tensor is set, which is done by the previous HGMMA insn. Currently inuse_tensor is never set back to zero.
This commit is contained in:
@@ -254,7 +254,10 @@
|
||||
`define INST_SFU_IS_WCTL(op) (op <= 5)
|
||||
`define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8)
|
||||
|
||||
`define INST_TENSOR_HMMA 4'b0000
|
||||
`define INST_TENSOR_HMMA 4'b0000
|
||||
// Hopper WGMMA-style asynchronous op
|
||||
`define INST_TENSOR_HGMMA 4'b0001
|
||||
`define INST_TENSOR_HGMMA_WAIT 4'b0010
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
@@ -543,6 +543,16 @@ module VX_decode #(
|
||||
end
|
||||
`ifdef EXT_T_ENABLE
|
||||
`INST_EXT4: begin
|
||||
`ifdef EXT_T_HOPPER
|
||||
ex_type = `EX_TENSOR;
|
||||
// tensor core macroop is encoded as r-type
|
||||
if (func3[0]) begin
|
||||
op_type = `INST_TENSOR_HGMMA_WAIT;
|
||||
end else begin
|
||||
op_type = `INST_TENSOR_HGMMA;
|
||||
end
|
||||
// rd/rs1/rs2/rs3 unused
|
||||
`else
|
||||
ex_type = `EX_TENSOR;
|
||||
op_type = `INST_TENSOR_HMMA;
|
||||
// tensor core macroop is encoded as r-type
|
||||
@@ -551,6 +561,7 @@ module VX_decode #(
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
`USED_IREG (rs3);
|
||||
`endif
|
||||
end
|
||||
`endif
|
||||
default:;
|
||||
|
||||
@@ -146,6 +146,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
// have an explicit destination register, use a separate status bit.
|
||||
reg [`UP(ISSUE_RATIO)-1:0] inuse_tensor;
|
||||
|
||||
wire hgmma_start = (ibuffer_if[i].data.ex_type == `EX_TENSOR) &&
|
||||
(ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA);
|
||||
|
||||
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
|
||||
|
||||
wire inuse_rd = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd];
|
||||
@@ -208,7 +211,15 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
|
||||
// NOTE(hansung): why is inuse_rd checked? to prevent WAW?
|
||||
wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
|
||||
`ifdef EXT_T_HOPPER
|
||||
wire hgmma_wait = ibuffer_if[i].valid &&
|
||||
(ibuffer_if[i].data.ex_type == `EX_TENSOR) &&
|
||||
(ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA_WAIT);
|
||||
wire hgmma_ready = ~(hgmma_wait && inuse_tensor[ibuffer_if[i].data.wis]);
|
||||
wire operands_ready = (~(| operands_busy)) && hgmma_ready;
|
||||
`else
|
||||
wire operands_ready = ~(| operands_busy);
|
||||
`endif
|
||||
|
||||
wire stg_valid_in, stg_ready_in;
|
||||
assign stg_valid_in = ibuffer_if[i].valid && operands_ready;
|
||||
@@ -238,6 +249,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin
|
||||
inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1;
|
||||
end
|
||||
`ifdef EXT_T_HOPPER
|
||||
if (ibuffer_if[i].valid && ibuffer_if[i].ready && hgmma_start) begin
|
||||
inuse_tensor[ibuffer_if[i].data.wis] <= 1'b1;
|
||||
end
|
||||
`endif
|
||||
end
|
||||
`ifdef PERF_ENABLE
|
||||
if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin
|
||||
|
||||
Reference in New Issue
Block a user