tensor: Separate async commit from tensor commit
With this we can prioritize the commit of the asynchronous HGMMA instructions over the "ghost" commits from the tensor core (TC).
This commit is contained in:
@@ -177,17 +177,22 @@ module VX_commit import VX_gpu_pkg::*; #(
|
||||
// probably want to change this at some point
|
||||
// (i.e. pass a "don't count this towards pending instructions" signal down the pipeline)
|
||||
wire [`ISSUE_WIDTH-1:0] final_hmma;
|
||||
// if this is a "ghost" commit generated from the tensor core, don't count
|
||||
// toward committed
|
||||
wire [`ISSUE_WIDTH-1:0] tensor_ghost;
|
||||
`ifdef EXT_T_ENABLE
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
// if PC is 0, this is not the final step of a wmma, so it shouldn't be committed
|
||||
assign final_hmma[i] = (commit_if[i].data.PC != 32'b0);
|
||||
// handle 'x' with ===. FIXME: fix uninitialization
|
||||
assign tensor_ghost[i] = (commit_if[i].data.tensor == 1'b1);
|
||||
end
|
||||
`else
|
||||
assign final_hmma = '1;
|
||||
assign tensor_ghost = '0;
|
||||
`endif
|
||||
|
||||
|
||||
wire [`ISSUE_WIDTH-1:0] committed = (commit_fire & commit_eop) & final_hmma;
|
||||
wire [`ISSUE_WIDTH-1:0] committed = (commit_fire & commit_eop) & final_hmma & (~tensor_ghost);
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
|
||||
|
||||
@@ -22,12 +22,13 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
|
||||
- wb
|
||||
- rd
|
||||
*/
|
||||
wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] execute_if_data_uuid;
|
||||
wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0] execute_if_data_wid;
|
||||
wire [`NUM_WARPS-1:0][NUM_LANES-1:0] execute_if_data_tmask;
|
||||
wire [`NUM_WARPS-1:0][`XLEN-1:0] execute_if_data_PC;
|
||||
wire [`NUM_WARPS-1:0] execute_if_data_wb;
|
||||
wire [`NUM_WARPS-1:0][`NR_BITS-1:0] execute_if_data_rd;
|
||||
wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] execute_if_data_uuid;
|
||||
wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0] execute_if_data_wid;
|
||||
wire [`NUM_WARPS-1:0][NUM_LANES-1:0] execute_if_data_tmask;
|
||||
wire [`NUM_WARPS-1:0][`INST_ALU_BITS-1:0] execute_if_data_op_type;
|
||||
wire [`NUM_WARPS-1:0][`XLEN-1:0] execute_if_data_PC;
|
||||
wire [`NUM_WARPS-1:0] execute_if_data_wb;
|
||||
wire [`NUM_WARPS-1:0][`NR_BITS-1:0] execute_if_data_rd;
|
||||
|
||||
wire [`NUM_WARPS-1:0] metadata_queue_fulls;
|
||||
wire [`NUM_WARPS-1:0] metadata_queue_emptys;
|
||||
@@ -38,7 +39,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
|
||||
`RUNTIME_ASSERT((!execute_if.valid || execute_if.data.wid == `NW_WIDTH'(0)),
|
||||
("runtime error: WGMMA execute not supported for warps other than 0!"))
|
||||
|
||||
wire metadata_deq;
|
||||
logic metadata_deq;
|
||||
|
||||
for (genvar i = 0; i < `NUM_WARPS; i++) begin
|
||||
// Metadata queue for commit_if. This simply copies execute_if's
|
||||
@@ -54,7 +55,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
|
||||
// FIXME: commit only warp 0
|
||||
wire deq = metadata_deq && (`NW_WIDTH'(i) == `NW_WIDTH'(0));
|
||||
|
||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS;
|
||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `INST_ALU_BITS + `XLEN + 1 + `NR_BITS;
|
||||
VX_fifo_queue #(
|
||||
.DATAW(DATAW),
|
||||
.DEPTH(METADATA_QUEUE_DEPTH)
|
||||
@@ -64,10 +65,10 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
|
||||
.push(enq),
|
||||
.pop(deq),
|
||||
.data_in({execute_if.data.uuid, execute_if.data.wid,
|
||||
execute_if.data.tmask, execute_if.data.PC,
|
||||
execute_if.data.tmask, execute_if.data.op_type, execute_if.data.PC,
|
||||
execute_if.data.wb, execute_if.data.rd}),
|
||||
.data_out({execute_if_data_uuid[i], execute_if_data_wid[i],
|
||||
execute_if_data_tmask[i], execute_if_data_PC[i],
|
||||
execute_if_data_tmask[i], execute_if_data_op_type[i], execute_if_data_PC[i],
|
||||
execute_if_data_wb[i], execute_if_data_rd[i]}),
|
||||
.empty(metadata_queue_emptys[i]),
|
||||
`UNUSED_PIN(alm_empty),
|
||||
@@ -81,47 +82,91 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
|
||||
// the commit stage
|
||||
`RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!"))
|
||||
|
||||
wire initiate_ready; // FIXME: unused
|
||||
wire initiate_ready;
|
||||
wire writeback_valid;
|
||||
wire writeback_last;
|
||||
logic writeback_ready;
|
||||
|
||||
wire metadata_valid = ~metadata_queue_emptys[0/*FIXME*/];
|
||||
// dequeue metadata at the last writeback
|
||||
assign metadata_deq = metadata_valid && writeback_valid && writeback_last;
|
||||
wire not_wait = metadata_valid && (execute_if_data_op_type[0] != `INST_TENSOR_HGMMA_WAIT);
|
||||
// skip HGMMA_WAIT for kickoff
|
||||
wire initiate_valid = metadata_valid && not_wait;
|
||||
|
||||
// we're recycling execute_if.op_type as operands_if.op_type which might
|
||||
// have a different width; let's be safe
|
||||
`STATIC_ASSERT((`INST_ALU_BITS == `INST_OP_BITS),
|
||||
("static assertion failed: `INST_ALU_BITS != `INST_OP_BITS"))
|
||||
|
||||
VX_tensor_hopper_core #(
|
||||
) tensor_hopper_core (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
|
||||
.initiate_valid(metadata_valid),
|
||||
.initiate_valid(initiate_valid),
|
||||
.initiate_wid(`NW_WIDTH'(0)/*FIXME*/),
|
||||
.initiate_ready(initiate_ready),
|
||||
|
||||
.writeback_valid(writeback_valid),
|
||||
`UNUSED_PIN(writeback_wid),
|
||||
.writeback_last(writeback_last),
|
||||
.writeback_ready(commit_if.ready)
|
||||
.writeback_ready(writeback_ready)
|
||||
);
|
||||
|
||||
wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0;
|
||||
|
||||
assign commit_if.valid = writeback_valid;
|
||||
assign commit_if.data.uuid = execute_if_data_uuid[0];
|
||||
assign commit_if.data.wid = execute_if_data_wid[0];
|
||||
assign commit_if.data.tmask = execute_if_data_tmask[0];
|
||||
assign commit_if.data.PC = execute_if_data_PC[0];
|
||||
assign commit_if.data.wb = writeback_last;
|
||||
// custom rd
|
||||
assign commit_if.data.rd = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(4'd3/*FIXME*/));
|
||||
assign commit_if.data.data = wb_data;
|
||||
assign commit_if.data.tensor = writeback_last;
|
||||
assign commit_if.data.pid = 1'b0;
|
||||
assign commit_if.data.sop = 1'b1;
|
||||
// eop is deliberately set so that we don't underflow the pending_instr
|
||||
// buffer in VX_schedule. An instruction is considered committed only
|
||||
// when the eop bit is set to one (see VX_commit).
|
||||
assign commit_if.data.eop = writeback_last;
|
||||
always @(*) begin
|
||||
metadata_deq = 1'b0;
|
||||
|
||||
// if there's something in the meta queue, give it priority for commit,
|
||||
// since all HGMMA instructions are asynchronous and should not
|
||||
// block
|
||||
if (metadata_valid) begin
|
||||
// block tensor core writeback
|
||||
writeback_ready = 1'b0;
|
||||
|
||||
commit_if.valid = metadata_valid;
|
||||
commit_if.data.uuid = execute_if_data_uuid[0];
|
||||
commit_if.data.wid = execute_if_data_wid[0];
|
||||
commit_if.data.tmask = execute_if_data_tmask[0];
|
||||
commit_if.data.PC = execute_if_data_PC[0];
|
||||
commit_if.data.wb = execute_if_data_wb[0];
|
||||
commit_if.data.rd = execute_if_data_rd[0];
|
||||
commit_if.data.data = wb_data; // FIXME ?
|
||||
commit_if.data.tensor = 1'b0;
|
||||
commit_if.data.pid = 1'b0;
|
||||
commit_if.data.sop = 1'b1;
|
||||
commit_if.data.eop = 1'b1;
|
||||
|
||||
// block meta queue until tensor core is ready. This will
|
||||
// effectively stall further issue of async HGMMA when tensor core
|
||||
// is busy with too many outstanding requests (depth of meta queue).
|
||||
// be careful to not miss the commit backpressure.
|
||||
metadata_deq = metadata_valid && commit_if.ready && initiate_ready;
|
||||
end else begin
|
||||
// allow tensor core writeback, provided there's no commit
|
||||
// backpressure
|
||||
writeback_ready = commit_if.ready;
|
||||
|
||||
commit_if.valid = writeback_valid;
|
||||
commit_if.data.uuid = '0;
|
||||
commit_if.data.wid = '0; // FIXME
|
||||
commit_if.data.tmask = {NUM_LANES{1'b1}};
|
||||
commit_if.data.PC = '0;
|
||||
commit_if.data.wb = writeback_last;
|
||||
commit_if.data.rd = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(4'd3/*FIXME*/));
|
||||
commit_if.data.data = wb_data;
|
||||
// mark as "ghost" commit. This will prevent this commit from
|
||||
// decrementing the pending_instr buffer
|
||||
commit_if.data.tensor = 1'b1;
|
||||
// eop is deliberately set so that we don't underflow the pending_instr
|
||||
// buffer in VX_schedule. An instruction is considered committed only
|
||||
// when the eop bit is set to one (see VX_commit).
|
||||
// only the last ghost commit has eop set, which will trigger
|
||||
// scoreboard to clear out the busy bit.
|
||||
commit_if.data.eop = writeback_last;
|
||||
end
|
||||
end
|
||||
|
||||
endmodule
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user