tensor: Separate async commit from tensor commit

With this, we can prioritize commits of the async HGMMA instructions over the "ghost" commits from the tensor core (TC).
2024-10-11 21:32:20 -07:00
parent 717fe7ff29
commit 4dcbc31a88
2 changed files with 83 additions and 33 deletions
--- a/hw/rtl/core/VX_commit.sv
+++ b/hw/rtl/core/VX_commit.sv
@@ -177,17 +177,22 @@ module VX_commit import VX_gpu_pkg::*; #(
    // probably want to change this at some point
    // (i.e. pass a "don't count this towards pending instructions" signal down the pipeline)
    wire [`ISSUE_WIDTH-1:0] final_hmma;
+    // if this is a "ghost" commit generated from the tensor core, don't count
+    // toward committed
+    wire [`ISSUE_WIDTH-1:0] tensor_ghost;
 `ifdef EXT_T_ENABLE
    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        // if PC is 0, this means it is not final step of a wmma, shouldn't be committed
        assign final_hmma[i] = (commit_if[i].data.PC != 32'b0); 
+        // handle 'x' with === (NOTE: the comparison below uses ==).  FIXME fix uninitialization
+        assign tensor_ghost[i] = (commit_if[i].data.tensor == 1'b1);
    end
 `else
    assign final_hmma = '1;
+    assign tensor_ghost = '0;
 `endif

-
-    wire [`ISSUE_WIDTH-1:0] committed = (commit_fire & commit_eop) & final_hmma;
+    wire [`ISSUE_WIDTH-1:0] committed = (commit_fire & commit_eop) & final_hmma & (~tensor_ghost);

    VX_pipe_register #(
        .DATAW  (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
--- a/hw/rtl/core/VX_tensor_hopper_core.sv
+++ b/hw/rtl/core/VX_tensor_hopper_core.sv
@@ -22,12 +22,13 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
        - wb
        - rd
    */
-    wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] execute_if_data_uuid;
-    wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0]   execute_if_data_wid;
-    wire [`NUM_WARPS-1:0][NUM_LANES-1:0]   execute_if_data_tmask;
-    wire [`NUM_WARPS-1:0][`XLEN-1:0]       execute_if_data_PC;
-    wire [`NUM_WARPS-1:0]                  execute_if_data_wb;
-    wire [`NUM_WARPS-1:0][`NR_BITS-1:0]    execute_if_data_rd;
+    wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0]    execute_if_data_uuid;
+    wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0]      execute_if_data_wid;
+    wire [`NUM_WARPS-1:0][NUM_LANES-1:0]      execute_if_data_tmask;
+    wire [`NUM_WARPS-1:0][`INST_ALU_BITS-1:0] execute_if_data_op_type;
+    wire [`NUM_WARPS-1:0][`XLEN-1:0]          execute_if_data_PC;
+    wire [`NUM_WARPS-1:0]                     execute_if_data_wb;
+    wire [`NUM_WARPS-1:0][`NR_BITS-1:0]       execute_if_data_rd;

    wire [`NUM_WARPS-1:0] metadata_queue_fulls;
    wire [`NUM_WARPS-1:0] metadata_queue_emptys;
@@ -38,7 +39,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
    `RUNTIME_ASSERT((!execute_if.valid || execute_if.data.wid == `NW_WIDTH'(0)),
        ("runtime error: WGMMA execute not supported for warps other than 0!"))

-    wire metadata_deq;
+    logic metadata_deq;

    for (genvar i = 0; i < `NUM_WARPS; i++) begin
        // Metadata queue for commit_if.  This simply copies execute_if's
@@ -54,7 +55,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
        // FIXME: commit only warp 0
        wire deq = metadata_deq && (`NW_WIDTH'(i) == `NW_WIDTH'(0));

-        localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS;
+        localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `INST_ALU_BITS + `XLEN + 1 + `NR_BITS;
        VX_fifo_queue #(
            .DATAW(DATAW),
            .DEPTH(METADATA_QUEUE_DEPTH)
@@ -64,10 +65,10 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
            .push(enq),
            .pop(deq),
            .data_in({execute_if.data.uuid,  execute_if.data.wid,
-                      execute_if.data.tmask, execute_if.data.PC,
+                      execute_if.data.tmask, execute_if.data.op_type, execute_if.data.PC,
                      execute_if.data.wb,    execute_if.data.rd}),
            .data_out({execute_if_data_uuid[i],  execute_if_data_wid[i],
-                       execute_if_data_tmask[i], execute_if_data_PC[i],
+                       execute_if_data_tmask[i], execute_if_data_op_type[i], execute_if_data_PC[i],
                       execute_if_data_wb[i],    execute_if_data_rd[i]}),
            .empty(metadata_queue_emptys[i]),
            `UNUSED_PIN(alm_empty),
@@ -81,47 +82,91 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
    // the commit stage
    `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!"))

-    wire initiate_ready; // FIXME: unused
+    wire initiate_ready;
    wire writeback_valid;
    wire writeback_last;
+    logic writeback_ready;

    wire metadata_valid = ~metadata_queue_emptys[0/*FIXME*/];
-    // dequeue metadata at the last writeback
-    assign metadata_deq = metadata_valid && writeback_valid && writeback_last;
+    wire not_wait = metadata_valid && (execute_if_data_op_type[0] != `INST_TENSOR_HGMMA_WAIT);
+    // skip HGMMA_WAIT for kickoff
+    wire initiate_valid = metadata_valid && not_wait;
+
+    // we're recycling execute_if.op_type as operands_if.op_type which might
+    // have a different width; let's be safe
+    `STATIC_ASSERT((`INST_ALU_BITS == `INST_OP_BITS),
+        ("static assertion failed: `INST_ALU_BITS != `INST_OP_BITS"))

    VX_tensor_hopper_core #(
    ) tensor_hopper_core (
        .clk(clk),
        .reset(reset),

-        .initiate_valid(metadata_valid),
+        .initiate_valid(initiate_valid),
        .initiate_wid(`NW_WIDTH'(0)/*FIXME*/),
        .initiate_ready(initiate_ready),

        .writeback_valid(writeback_valid),
        `UNUSED_PIN(writeback_wid),
        .writeback_last(writeback_last),
-        .writeback_ready(commit_if.ready)
+        .writeback_ready(writeback_ready)
    );

    wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0;

-    assign commit_if.valid = writeback_valid;
-    assign commit_if.data.uuid   = execute_if_data_uuid[0];
-    assign commit_if.data.wid    = execute_if_data_wid[0];
-    assign commit_if.data.tmask  = execute_if_data_tmask[0];
-    assign commit_if.data.PC     = execute_if_data_PC[0];
-    assign commit_if.data.wb     = writeback_last;
-    // custom rd
-    assign commit_if.data.rd     = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(4'd3/*FIXME*/));
-    assign commit_if.data.data   = wb_data;
-    assign commit_if.data.tensor = writeback_last;
-    assign commit_if.data.pid    = 1'b0;
-    assign commit_if.data.sop    = 1'b1;
-    // eop is deliberately set so that we don't underflow the pending_instr
-    // buffer in VX_schedule.  An instruction is considered committed only
-    // when the eop bit is set to one (see VX_commit).
-    assign commit_if.data.eop    = writeback_last;
+    always @(*) begin
+        metadata_deq = 1'b0;
+
+        // if there's something in the meta queue, give it priority for commit,
+        // since all HGMMA instructions are asynchronous and should not
+        // block
+        if (metadata_valid) begin
+            // block tensor core writeback
+            writeback_ready = 1'b0;
+
+            commit_if.valid       = metadata_valid;
+            commit_if.data.uuid   = execute_if_data_uuid[0];
+            commit_if.data.wid    = execute_if_data_wid[0];
+            commit_if.data.tmask  = execute_if_data_tmask[0];
+            commit_if.data.PC     = execute_if_data_PC[0];
+            commit_if.data.wb     = execute_if_data_wb[0];
+            commit_if.data.rd     = execute_if_data_rd[0];
+            commit_if.data.data   = wb_data; // FIXME ?
+            commit_if.data.tensor = 1'b0;
+            commit_if.data.pid    = 1'b0;
+            commit_if.data.sop    = 1'b1;
+            commit_if.data.eop    = 1'b1;
+
+            // block meta queue until tensor core is ready.  This will
+            // effectively stall further issue of async HGMMA when tensor core
+            // is busy with too many outstanding requests (depth of meta queue).
+            // be careful to not miss the commit backpressure.
+            metadata_deq = metadata_valid && commit_if.ready && initiate_ready;
+        end else begin
+            // allow tensor core writeback, provided there's no commit
+            // backpressure
+            writeback_ready = commit_if.ready;
+
+            commit_if.valid       = writeback_valid;
+            commit_if.data.uuid   = '0;
+            commit_if.data.wid    = '0; // FIXME
+            commit_if.data.tmask  = {NUM_LANES{1'b1}};
+            commit_if.data.PC     = '0;
+            commit_if.data.wb     = writeback_last;
+            commit_if.data.rd     = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(4'd3/*FIXME*/));
+            commit_if.data.data   = wb_data;
+            // mark as "ghost" commit.  This will prevent this commit from
+            // decrementing the pending_instr buffer
+            commit_if.data.tensor = 1'b1;
+            // eop is deliberately set so that we don't underflow the pending_instr
+            // buffer in VX_schedule.  An instruction is considered committed only
+            // when the eop bit is set to one (see VX_commit).
+            // only the last ghost commit has eop set, which will trigger
+            // scoreboard to clear out the busy bit.
+            commit_if.data.eop    = writeback_last;
+        end
+    end
+
 endmodule