tensor: Separate async commit from tensor commit

With this we can prioritize commit of the async hgmma instructions over
the "ghost" commits from the TC.
This commit is contained in:
Hansung Kim
2024-10-11 21:32:20 -07:00
parent 717fe7ff29
commit 4dcbc31a88
2 changed files with 83 additions and 33 deletions

View File

@@ -177,17 +177,22 @@ module VX_commit import VX_gpu_pkg::*; #(
// probably want to change this at some point
// (i.e. pass a "don't count this towards pending instructions" signal down the pipeline)
wire [`ISSUE_WIDTH-1:0] final_hmma;
// if this is a "ghost" commit generated from the tensor core, don't count
// toward committed
wire [`ISSUE_WIDTH-1:0] tensor_ghost;
`ifdef EXT_T_ENABLE
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
// if PC is 0, this is not the final step of a wmma and it shouldn't be committed
assign final_hmma[i] = (commit_if[i].data.PC != 32'b0);
// 'x' on .tensor is meant to be handled with ===, but the compare below uses ==.
// FIXME: fix the uninitialized .tensor field at its source
assign tensor_ghost[i] = (commit_if[i].data.tensor == 1'b1);
end
`else
assign final_hmma = '1;
assign tensor_ghost = '0;
`endif
wire [`ISSUE_WIDTH-1:0] committed = (commit_fire & commit_eop) & final_hmma;
wire [`ISSUE_WIDTH-1:0] committed = (commit_fire & commit_eop) & final_hmma & (~tensor_ghost);
VX_pipe_register #(
.DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)),

View File

@@ -22,12 +22,13 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
- wb
- rd
*/
wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] execute_if_data_uuid;
wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0] execute_if_data_wid;
wire [`NUM_WARPS-1:0][NUM_LANES-1:0] execute_if_data_tmask;
wire [`NUM_WARPS-1:0][`XLEN-1:0] execute_if_data_PC;
wire [`NUM_WARPS-1:0] execute_if_data_wb;
wire [`NUM_WARPS-1:0][`NR_BITS-1:0] execute_if_data_rd;
wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] execute_if_data_uuid;
wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0] execute_if_data_wid;
wire [`NUM_WARPS-1:0][NUM_LANES-1:0] execute_if_data_tmask;
wire [`NUM_WARPS-1:0][`INST_ALU_BITS-1:0] execute_if_data_op_type;
wire [`NUM_WARPS-1:0][`XLEN-1:0] execute_if_data_PC;
wire [`NUM_WARPS-1:0] execute_if_data_wb;
wire [`NUM_WARPS-1:0][`NR_BITS-1:0] execute_if_data_rd;
wire [`NUM_WARPS-1:0] metadata_queue_fulls;
wire [`NUM_WARPS-1:0] metadata_queue_emptys;
@@ -38,7 +39,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
`RUNTIME_ASSERT((!execute_if.valid || execute_if.data.wid == `NW_WIDTH'(0)),
("runtime error: WGMMA execute not supported for warps other than 0!"))
wire metadata_deq;
logic metadata_deq;
for (genvar i = 0; i < `NUM_WARPS; i++) begin
// Metadata queue for commit_if. This simply copies execute_if's
@@ -54,7 +55,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
// FIXME: commit only warp 0
wire deq = metadata_deq && (`NW_WIDTH'(i) == `NW_WIDTH'(0));
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS;
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `INST_ALU_BITS + `XLEN + 1 + `NR_BITS;
VX_fifo_queue #(
.DATAW(DATAW),
.DEPTH(METADATA_QUEUE_DEPTH)
@@ -64,10 +65,10 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
.push(enq),
.pop(deq),
.data_in({execute_if.data.uuid, execute_if.data.wid,
execute_if.data.tmask, execute_if.data.PC,
execute_if.data.tmask, execute_if.data.op_type, execute_if.data.PC,
execute_if.data.wb, execute_if.data.rd}),
.data_out({execute_if_data_uuid[i], execute_if_data_wid[i],
execute_if_data_tmask[i], execute_if_data_PC[i],
execute_if_data_tmask[i], execute_if_data_op_type[i], execute_if_data_PC[i],
execute_if_data_wb[i], execute_if_data_rd[i]}),
.empty(metadata_queue_emptys[i]),
`UNUSED_PIN(alm_empty),
@@ -81,47 +82,91 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
// the commit stage
`RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!"))
wire initiate_ready; // FIXME: unused
wire initiate_ready;
wire writeback_valid;
wire writeback_last;
logic writeback_ready;
wire metadata_valid = ~metadata_queue_emptys[0/*FIXME*/];
// dequeue metadata at the last writeback
assign metadata_deq = metadata_valid && writeback_valid && writeback_last;
wire not_wait = metadata_valid && (execute_if_data_op_type[0] != `INST_TENSOR_HGMMA_WAIT);
// skip HGMMA_WAIT for kickoff
wire initiate_valid = metadata_valid && not_wait;
// we're recycling execute_if.op_type as operands_if.op_type which might
// have a different width; let's be safe
`STATIC_ASSERT((`INST_ALU_BITS == `INST_OP_BITS),
("static assertion failed: `INST_ALU_BITS != `INST_OP_BITS"))
VX_tensor_hopper_core #(
) tensor_hopper_core (
.clk(clk),
.reset(reset),
.initiate_valid(metadata_valid),
.initiate_valid(initiate_valid),
.initiate_wid(`NW_WIDTH'(0)/*FIXME*/),
.initiate_ready(initiate_ready),
.writeback_valid(writeback_valid),
`UNUSED_PIN(writeback_wid),
.writeback_last(writeback_last),
.writeback_ready(commit_if.ready)
.writeback_ready(writeback_ready)
);
wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0;
assign commit_if.valid = writeback_valid;
assign commit_if.data.uuid = execute_if_data_uuid[0];
assign commit_if.data.wid = execute_if_data_wid[0];
assign commit_if.data.tmask = execute_if_data_tmask[0];
assign commit_if.data.PC = execute_if_data_PC[0];
assign commit_if.data.wb = writeback_last;
// custom rd
assign commit_if.data.rd = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(4'd3/*FIXME*/));
assign commit_if.data.data = wb_data;
assign commit_if.data.tensor = writeback_last;
assign commit_if.data.pid = 1'b0;
assign commit_if.data.sop = 1'b1;
// eop is deliberately set so that we don't underflow the pending_instr
// buffer in VX_schedule. An instruction is considered committed only
// when the eop bit is set to one (see VX_commit).
assign commit_if.data.eop = writeback_last;
// Commit-port arbitration for the tensor core block.
//
// Two sources share commit_if:
//   1. the metadata queue, which carries the real commit of each
//      (asynchronous) HGMMA instruction, and
//   2. the tensor core writeback, which produces "ghost" commits
//      (commit_if.data.tensor == 1).
// The metadata queue has priority whenever it is non-empty.
always @(*) begin
    metadata_deq = 1'b0;

    if (metadata_valid) begin
        // Metadata queue has something to commit: give it priority, since
        // all HGMMA instructions are asynchronous and should not block.

        // block tensor core writeback while the commit port is taken
        writeback_ready = 1'b0;

        // Only offer the commit in a cycle in which the queue entry can
        // also be popped (i.e. initiate_ready is high). The pop condition
        // below is (commit_if.ready && initiate_ready); if valid were
        // raised regardless of initiate_ready, the commit stage could
        // accept the same entry on every cycle in which the tensor core is
        // not ready, committing one instruction multiple times.
        commit_if.valid       = initiate_ready;
        commit_if.data.uuid   = execute_if_data_uuid[0];
        commit_if.data.wid    = execute_if_data_wid[0];
        commit_if.data.tmask  = execute_if_data_tmask[0];
        commit_if.data.PC     = execute_if_data_PC[0];
        commit_if.data.wb     = execute_if_data_wb[0];
        commit_if.data.rd     = execute_if_data_rd[0];
        commit_if.data.data   = wb_data; // FIXME ?
        // real instruction commit, not a ghost commit
        commit_if.data.tensor = 1'b0;
        commit_if.data.pid    = 1'b0;
        commit_if.data.sop    = 1'b1;
        commit_if.data.eop    = 1'b1;

        // Block the meta queue until the tensor core is ready. This
        // effectively stalls further issue of async HGMMA when the tensor
        // core is busy with too many outstanding requests (depth of the
        // meta queue). The pop must also respect commit backpressure, so
        // it happens exactly when the commit handshake above completes.
        // NOTE(review): initiate_valid is driven outside this block and is
        // not gated by commit_if.ready; if the core accepts a kickoff on
        // (initiate_valid && initiate_ready) it may see the same entry
        // again while commit is back-pressured -- confirm against
        // VX_tensor_hopper_core.
        metadata_deq = commit_if.ready && initiate_ready;
    end else begin
        // Nothing to commit from the meta queue: allow tensor core
        // writeback, provided there is no commit backpressure.
        writeback_ready = commit_if.ready;

        commit_if.valid       = writeback_valid;
        commit_if.data.uuid   = '0;
        commit_if.data.wid    = '0; // FIXME
        commit_if.data.tmask  = {NUM_LANES{1'b1}};
        commit_if.data.PC     = '0;
        commit_if.data.wb     = writeback_last;
        // custom rd
        commit_if.data.rd     = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(4'd3/*FIXME*/));
        commit_if.data.data   = wb_data;
        // Mark as "ghost" commit. This prevents the commit from being
        // counted against the pending_instr buffer.
        commit_if.data.tensor = 1'b1;
        commit_if.data.pid    = 1'b0;
        commit_if.data.sop    = 1'b1;
        // eop is deliberately set so that we don't underflow the
        // pending_instr buffer in VX_schedule. An instruction is considered
        // committed only when the eop bit is set to one (see VX_commit).
        // Only the last ghost commit has eop set, which will trigger the
        // scoreboard to clear out the busy bit.
        commit_if.data.eop    = writeback_last;
    end
end
endmodule