tensor: Simply metadata queue

Enqueue all different-warp reqs into the queue. There is a slight chance
that an HGMMA_WAIT might be blocked from commit when there are multiple
different-warp HGMMAs blocking the dequeue end, but it should be
uncommon.
This commit is contained in:
Hansung Kim
2024-10-22 22:01:18 -07:00
parent 69cbbdd89b
commit 78df981366

View File

@@ -24,65 +24,58 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
- wb
- rd
*/
wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] execute_if_data_uuid;
wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0] execute_if_data_wid;
wire [`NUM_WARPS-1:0][NUM_LANES-1:0] execute_if_data_tmask;
wire [`NUM_WARPS-1:0][`INST_ALU_BITS-1:0] execute_if_data_op_type;
wire [`NUM_WARPS-1:0][`XLEN-1:0] execute_if_data_PC;
wire [`NUM_WARPS-1:0] execute_if_data_wb;
wire [`NUM_WARPS-1:0][`NR_BITS-1:0] execute_if_data_rd;
wire [`UUID_WIDTH-1:0] execute_if_data_uuid;
wire [`NW_WIDTH-1:0] execute_if_data_wid;
wire [NUM_LANES-1:0] execute_if_data_tmask;
wire [`INST_ALU_BITS-1:0] execute_if_data_op_type;
wire [`XLEN-1:0] execute_if_data_PC;
wire execute_if_data_wb;
wire [`NR_BITS-1:0] execute_if_data_rd;
wire [`NUM_WARPS-1:0] metadata_queue_fulls;
wire [`NUM_WARPS-1:0] metadata_queue_emptys;
wire metadata_queue_full;
wire metadata_queue_empty;
// OR not AND; we don't want any warp to be full
wire metadata_queue_full = |(metadata_queue_fulls);
assign execute_if.ready = !metadata_queue_full;
`RUNTIME_ASSERT((!execute_if.valid || execute_if.data.wid == `NW_WIDTH'(0)),
("runtime error: WGMMA execute not supported for warps other than 0!"))
logic metadata_deq;
for (genvar i = 0; i < `NUM_WARPS; i++) begin
// Metadata queue for commit_if. This simply copies execute_if's
// metadata and pops them in conjunction with commit fire.
//
// This has to be separated per-warp, as otherwise requests from
// multiple warps can be enqueued interleaved, which makes it hard to
// ensure two consecutive dequeues are associated with the same warp for
// commit. (FIXME: this is not strictly necessary though.)
// Metadata queue for commit_if. This simply copies execute_if's
// metadata and pops them in conjunction with commit fire.
//
// Note both HGMMA and HGMMA_WAIT will be enqueued here, interleaved
// between different warps. There is a slight chance that an HGMMA_WAIT
// might be blocked from commit when there are multiple different-warp
// HGMMAs blocking the dequeue end, so keep an eye on those cases.
wire operand_enq_fire = execute_if.valid && execute_if.ready;
wire enq = operand_enq_fire && (execute_if.data.wid == `NW_WIDTH'(i));
// FIXME: commit only warp 0
wire deq = metadata_deq && (`NW_WIDTH'(i) == `NW_WIDTH'(0));
wire operand_enq_fire = execute_if.valid && execute_if.ready;
wire enq = operand_enq_fire;
wire deq = metadata_deq;
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `INST_ALU_BITS + `XLEN + 1 + `NR_BITS;
VX_fifo_queue #(
.DATAW(DATAW),
.DEPTH(METADATA_QUEUE_DEPTH)
) pending_uops (
.clk(clk),
.reset(reset),
.push(enq),
.pop(deq),
.data_in({execute_if.data.uuid, execute_if.data.wid,
execute_if.data.tmask, execute_if.data.op_type, execute_if.data.PC,
execute_if.data.wb, execute_if.data.rd}),
.data_out({execute_if_data_uuid[i], execute_if_data_wid[i],
execute_if_data_tmask[i], execute_if_data_op_type[i], execute_if_data_PC[i],
execute_if_data_wb[i], execute_if_data_rd[i]}),
.empty(metadata_queue_emptys[i]),
`UNUSED_PIN(alm_empty),
.full(metadata_queue_fulls[i]),
`UNUSED_PIN(alm_full),
`UNUSED_PIN(size)
);
end
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `INST_ALU_BITS + `XLEN + 1 + `NR_BITS;
VX_fifo_queue #(
.DATAW(DATAW),
.DEPTH(METADATA_QUEUE_DEPTH)
) pending_uops (
.clk(clk),
.reset(reset),
.push(enq),
.pop(deq),
.data_in({execute_if.data.uuid, execute_if.data.wid,
execute_if.data.tmask, execute_if.data.op_type, execute_if.data.PC,
execute_if.data.wb, execute_if.data.rd}),
.data_out({execute_if_data_uuid, execute_if_data_wid,
execute_if_data_tmask, execute_if_data_op_type, execute_if_data_PC,
execute_if_data_wb, execute_if_data_rd}),
.empty(metadata_queue_empty),
`UNUSED_PIN(alm_empty),
.full(metadata_queue_full),
`UNUSED_PIN(alm_full),
`UNUSED_PIN(size)
);
// NOTE: this is not an error but tells us if backend doesn't keep up with
// HGMMA calls from the kernel
`RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!"))
// `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!"))
wire initiate_ready;
wire writeback_valid;
@@ -92,12 +85,12 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
logic writeback_ready;
wire [`NUM_THREADS-1:0][`XLEN-1:0] writeback_data;
wire metadata_valid = ~metadata_queue_emptys[0/*FIXME*/];
wire metadata_valid = !metadata_queue_empty;
wire hmma_wait = metadata_valid &&
(execute_if_data_op_type[0/*FIXME*/] == `INST_TENSOR_HGMMA_WAIT);
(execute_if_data_op_type == `INST_TENSOR_HGMMA_WAIT);
// skip HGMMA_WAIT for kickoff
wire initiate_valid = metadata_valid && !hmma_wait;
wire [`NW_WIDTH-1:0] initiate_wid = execute_if_data_wid[0/*FIXME*/];
wire [`NW_WIDTH-1:0] initiate_wid = execute_if_data_wid;
// we're recycling execute_if.op_type as operands_if.op_type which might
// have a different width; let's be safe
@@ -216,12 +209,12 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
commit_if.data.eop = writeback_last;
end else begin
commit_if.valid = metadata_valid;
commit_if.data.uuid = execute_if_data_uuid[0];
commit_if.data.wid = execute_if_data_wid[0];
commit_if.data.tmask = execute_if_data_tmask[0];
commit_if.data.PC = execute_if_data_PC[0];
commit_if.data.wb = execute_if_data_wb[0];
commit_if.data.rd = execute_if_data_rd[0];
commit_if.data.uuid = execute_if_data_uuid;
commit_if.data.wid = execute_if_data_wid;
commit_if.data.tmask = execute_if_data_tmask;
commit_if.data.PC = execute_if_data_PC;
commit_if.data.wb = execute_if_data_wb;
commit_if.data.rd = execute_if_data_rd;
commit_if.data.data = '0; // can be arbitrary as rd is zero
commit_if.data.tensor = 1'b0;
commit_if.data.pid = 1'b0;