From 4cac1adf7d2b93e6b02f1bc7c1e7914af5d00944 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 7 Oct 2024 17:10:59 -0700 Subject: [PATCH 01/14] Add dummy code for decoupled Hopper tensor core Define EXT_T_HOPPER that, when EXT_T_ENABLE is defined, distinguishes whether to instantiate core-coupled Volta-style or decoupled Hopper-style Tensor Core. --- hw/rtl/VX_config.vh | 7 +- hw/rtl/core/VX_decode.sv | 34 ++++++--- hw/rtl/core/VX_tensor_core.sv | 18 +++-- hw/rtl/core/VX_tensor_hopper_core.sv | 102 +++++++++++++++++++++++++++ 4 files changed, 146 insertions(+), 15 deletions(-) create mode 100644 hw/rtl/core/VX_tensor_hopper_core.sv diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index a9ff2742..f309a84a 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -40,8 +40,13 @@ `define EXT_F_ENABLE `endif +// core-coupled tensor core `ifndef EXT_T_DISABLE `define EXT_T_ENABLE +// decoupled Hopper-style tensor core +`ifndef EXT_T_HOPPER +`define EXT_T_HOPPER +`endif `endif `ifndef XLEN_32 @@ -83,7 +88,7 @@ `endif `ifndef NUM_CORES -`define NUM_CORES 8 +`define NUM_CORES 4 `endif `ifndef NUM_WARPS diff --git a/hw/rtl/core/VX_decode.sv b/hw/rtl/core/VX_decode.sv index cf21d72f..62fdde76 100644 --- a/hw/rtl/core/VX_decode.sv +++ b/hw/rtl/core/VX_decode.sv @@ -542,16 +542,30 @@ module VX_decode #( endcase end `ifdef EXT_T_ENABLE - `INST_EXT4: begin - ex_type = `EX_TENSOR; - op_type = `INST_TENSOR_HMMA; - // tensor core macroop is encoded as r-type - use_rd = 1; - `USED_IREG (rd); - `USED_IREG (rs1); - `USED_IREG (rs2); - `USED_IREG (rs3); - end + `ifdef EXT_T_HOPPER + // TODO + `INST_EXT4: begin + ex_type = `EX_TENSOR; + op_type = `INST_TENSOR_HMMA; + // tensor core macroop is encoded as r-type + use_rd = 1; + `USED_IREG (rd); + `USED_IREG (rs1); + `USED_IREG (rs2); + `USED_IREG (rs3); + end + `else + `INST_EXT4: begin + ex_type = `EX_TENSOR; + op_type = `INST_TENSOR_HMMA; + // tensor core macroop is encoded as r-type + use_rd = 1; + `USED_IREG (rd); + 
`USED_IREG (rs1); + `USED_IREG (rs2); + `USED_IREG (rs3); + end + `endif `endif default:; endcase diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 730d7855..802af43d 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -12,7 +12,7 @@ module VX_tensor_core import VX_gpu_pkg::*; #( ); localparam BLOCK_SIZE = 1; localparam NUM_LANES = `NUM_THREADS; - // FIXME: @perf: PARTIAL_BW==1 increases power instantiating + // @perf: PARTIAL_BW==1 increases power instantiating // stream_buffers for ISSUE_WIDTH times localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS); @@ -51,16 +51,27 @@ module VX_tensor_core import VX_gpu_pkg::*; #( ); for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin - VX_tensor_core_block #( +`ifdef EXT_T_HOPPER + VX_tensor_hopper_core_block #( .ISW(1), // FIXME: not block_idx .FP16(FP16) + ) tensor_hopper_core ( + .clk(clk), + .reset(reset), + .execute_if(execute_if[block_idx]), + .commit_if(commit_block_if[block_idx]) + ); +`else + VX_tensor_core_block #( + .ISW(1), // FIXME: use block_idx + .FP16(FP16) ) tensor_core ( .clk(clk), .reset(reset), - .execute_if(execute_if[block_idx]), .commit_if(commit_block_if[block_idx]) ); +`endif end endmodule @@ -275,7 +286,6 @@ module VX_tensor_core_block import VX_gpu_pkg::*; #( localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1; wire [COMMIT_DATAW-1:0] commit_if_data = { execute_if_data_deq[wb_wid], /* uuid ~ rd */ - // execute_if_data_deq, /* uuid ~ rd */ subcommit == 1'b0 ? 
wb_data_0 : wb_data_1, /* data */ 1'b0, /* pid */ 1'b1, /* sop */ diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv new file mode 100644 index 00000000..c79a7994 --- /dev/null +++ b/hw/rtl/core/VX_tensor_hopper_core.sv @@ -0,0 +1,102 @@ +`ifdef EXT_T_ENABLE +`include "VX_fpu_define.vh" + +module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( + parameter ISW, + parameter FP16 +) ( + input clk, + input reset, + + VX_execute_if.slave execute_if, + VX_commit_if.master commit_if +); + localparam METADATA_QUEUE_DEPTH = 2; // FIXME: arbitrary + + /* commit_if.data_t parts that we need to keep around: + - uuid + - wid + - tmask + - PC + - wb + - rd + */ + + localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS; + + wire operand_enq_fire = execute_if.valid && execute_if.ready; + wire commit_if_fire = commit_if.valid && commit_if.ready; + wire [DATAW-1:0] execute_if_data_enq = { + execute_if.data.uuid, + execute_if.data.wid, + execute_if.data.tmask, + execute_if.data.PC, + execute_if.data.wb, + execute_if.data.rd + // pid/sop/eop set later + }; + + wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq; + + wire [`NUM_WARPS-1:0] metadata_queue_fulls; + wire [`NUM_WARPS-1:0] metadata_queue_emptys; + // OR not AND, we don't want any warp full + wire metadata_queue_full = |(metadata_queue_fulls); + assign execute_if.ready = !metadata_queue_full; + + `RUNTIME_ASSERT((!execute_if.valid || execute_if.data.wid == `NW_WIDTH'(0)), + ("runtime error: WGMMA execute not supported for warps other than 0!")) + + for (genvar i = 0; i < `NUM_WARPS; i++) begin + // Metadata queue for commit_if. This simply copies execute_if's + // metadata and pops them in conjunction with commit fire. + // + // This has to be separated per-warp, as otherwise requests from + // multiple warps can be enqueued interleaved, which makes it hard to + // ensure two consecutive dequeues are associated with the same warp for + // commit. 
(FIXME: this is not strictly necessary though.) + + wire enq = operand_enq_fire && (execute_if.data.wid == `NW_WIDTH'(i)); + // FIXME: commit only warp 0 + wire deq = commit_if_fire && (`NW_WIDTH'(i) == `NW_WIDTH'(0)); + + VX_fifo_queue #( + .DATAW(DATAW), + .DEPTH(METADATA_QUEUE_DEPTH) + ) pending_uops ( + .clk(clk), + .reset(reset), + .push(enq), + .pop(deq), + .data_in(execute_if_data_enq), + .data_out(execute_if_data_deq[i]), + .empty(metadata_queue_emptys[i]), + `UNUSED_PIN(alm_empty), + .full(metadata_queue_fulls[i]), + `UNUSED_PIN(alm_full), + `UNUSED_PIN(size) + ); + end + + // this shouldn't really happen unless there's a big contention over + // the commit stage + `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!")) + + // FIXME: only checks warp 0 for commit! + assign commit_if.valid = ~metadata_queue_emptys[0/*FIXME*/]; + + wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0; + + localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1; + wire [COMMIT_DATAW-1:0] commit_if_data = { + execute_if_data_deq[0/*FIXME*/], /* uuid ~ rd */ + wb_data, /* data */ + 1'b0, /* pid */ + 1'b1, /* sop */ + 1'b1 /* eop */ + }; + + assign commit_if.data = commit_if_data; +endmodule + +`endif From e8ca4677df05960a908ca1951cedc6e079975507 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 7 Oct 2024 20:21:35 -0700 Subject: [PATCH 02/14] Remove old code for pending_instr underflow fix --- hw/rtl/core/VX_commit.sv | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/hw/rtl/core/VX_commit.sv b/hw/rtl/core/VX_commit.sv index f417d64f..a584cace 100644 --- a/hw/rtl/core/VX_commit.sv +++ b/hw/rtl/core/VX_commit.sv @@ -173,30 +173,15 @@ module VX_commit import VX_gpu_pkg::*; #( // Committed instructions - // temporary hack to not underflow the pending instructions buffer - // relies on 1 cycle delay of arbiter and continuous issuing of tensor 
instructions, - // so probably want to change this at some point + // prevent underflow of the VX_pending_instr buffer + // probably want to change this at some point // (i.e. pass a "don't count this towards pending instructions" signal down the pipeline) - // logic [`ISSUE_WIDTH-1:0][4:0] hmma_ctr, hmma_ctr_n; wire [`ISSUE_WIDTH-1:0] final_hmma; `ifdef EXT_T_ENABLE for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin - // assign hmma_ctr_n[i] = (tensor_commit_if[i].valid && tensor_commit_if[i].ready) ? hmma_ctr[i] + 5'b1 : hmma_ctr[i]; - // assign final_hmma[i] = (commit_sel[i] != `EX_BITS'(2) || hmma_ctr == '0); - // i suppose this is now a feature and not a bug // if PC is 0, this means it is not final step of a wmma, shouldn't be committed assign final_hmma[i] = (commit_if[i].data.PC != 32'b0); end - /* - always @(posedge clk) begin - if (reset) begin - hmma_ctr <= '0; - end - else begin - hmma_ctr <= hmma_ctr_n; - end - end - */ `else assign final_hmma = '1; `endif From 7ab14445f0ad9534a4c625374c3f3fda266642f7 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 7 Oct 2024 21:29:44 -0700 Subject: [PATCH 03/14] tensor: Test many-commit per execute with an FSM Trick is to set commit_if.data.eop to 0, since the commit module only signals instruction completion to VX_schedule if the eop bit is 1. Otherwise it underflows the pending_instr buffer. The same eop trick works for VX_scoreboard, which works around the invalid rd writeback error. 
--- hw/rtl/core/VX_tensor_hopper_core.sv | 55 +++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 6 deletions(-) diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv index c79a7994..a58f4027 100644 --- a/hw/rtl/core/VX_tensor_hopper_core.sv +++ b/hw/rtl/core/VX_tensor_hopper_core.sv @@ -11,7 +11,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( VX_execute_if.slave execute_if, VX_commit_if.master commit_if ); - localparam METADATA_QUEUE_DEPTH = 2; // FIXME: arbitrary + localparam METADATA_QUEUE_DEPTH = 16; // FIXME: arbitrary /* commit_if.data_t parts that we need to keep around: - uuid @@ -37,6 +37,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( }; wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq; + logic [DATAW-1:0] execute_if_data_new_rd; wire [`NUM_WARPS-1:0] metadata_queue_fulls; wire [`NUM_WARPS-1:0] metadata_queue_emptys; @@ -47,6 +48,8 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( `RUNTIME_ASSERT((!execute_if.valid || execute_if.data.wid == `NW_WIDTH'(0)), ("runtime error: WGMMA execute not supported for warps other than 0!")) + logic metadata_deq; + for (genvar i = 0; i < `NUM_WARPS; i++) begin // Metadata queue for commit_if. This simply copies execute_if's // metadata and pops them in conjunction with commit fire. @@ -58,7 +61,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( wire enq = operand_enq_fire && (execute_if.data.wid == `NW_WIDTH'(i)); // FIXME: commit only warp 0 - wire deq = commit_if_fire && (`NW_WIDTH'(i) == `NW_WIDTH'(0)); + wire deq = metadata_deq && commit_if.ready && (`NW_WIDTH'(i) == `NW_WIDTH'(0)); VX_fifo_queue #( .DATAW(DATAW), @@ -82,18 +85,58 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( // the commit stage `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!")) - // FIXME: only checks warp 0 for commit! 
- assign commit_if.valid = ~metadata_queue_emptys[0/*FIXME*/]; + // dummy FSM that generates commits + logic [1:0] state, state_n; + localparam STATE_IDLE = 4'd0; + + always @(*) begin + state_n = state; + metadata_deq = 1'b0; + + // when incremented to 1, count up until wrap-around to 0 + if (state != STATE_IDLE) begin + state_n = state + 1'd1; + end else begin + // kick-off from idle when execute valid + // FIXME: only checks warp 0 for commit! + if (~metadata_queue_emptys[0/*FIXME*/]) begin + state_n = 4'd1; + end + end + + // dequeue metadata when wrapping around + if ((state != STATE_IDLE) && (state_n == STATE_IDLE)) begin + metadata_deq = 1'b1; + end + + // change rd of the commit data according to state + execute_if_data_new_rd = + {execute_if_data_deq[0/*FIXME*/][DATAW-1:`NR_BITS], + (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(state))}; + end + + always @(posedge clk) begin + if (reset) begin + state <= '0; + end else begin + state <= state_n; + end + end + + // assign commit_if.valid = metadata_deq; + assign commit_if.valid = (state != STATE_IDLE); wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0; localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1; wire [COMMIT_DATAW-1:0] commit_if_data = { - execute_if_data_deq[0/*FIXME*/], /* uuid ~ rd */ + // write-back to the correct rd only when eop + ((state == 2'b11) ? execute_if_data_deq[0/*FIXME*/] : execute_if_data_new_rd), /* uuid ~ rd */ wb_data, /* data */ 1'b0, /* pid */ 1'b1, /* sop */ - 1'b1 /* eop */ + (state == 2'b11) /* eop */ + // 1'b1 /* eop */ }; assign commit_if.data = commit_if_data; From 58c9761829ffedc12d40aad51c326888a8d2f2c2 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 9 Oct 2024 21:53:04 -0700 Subject: [PATCH 04/14] Revert decode change for hopper Share the same insn as non-hopper TC. 
--- hw/rtl/core/VX_decode.sv | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/hw/rtl/core/VX_decode.sv b/hw/rtl/core/VX_decode.sv index 62fdde76..cf21d72f 100644 --- a/hw/rtl/core/VX_decode.sv +++ b/hw/rtl/core/VX_decode.sv @@ -542,30 +542,16 @@ module VX_decode #( endcase end `ifdef EXT_T_ENABLE - `ifdef EXT_T_HOPPER - // TODO - `INST_EXT4: begin - ex_type = `EX_TENSOR; - op_type = `INST_TENSOR_HMMA; - // tensor core macroop is encoded as r-type - use_rd = 1; - `USED_IREG (rd); - `USED_IREG (rs1); - `USED_IREG (rs2); - `USED_IREG (rs3); - end - `else - `INST_EXT4: begin - ex_type = `EX_TENSOR; - op_type = `INST_TENSOR_HMMA; - // tensor core macroop is encoded as r-type - use_rd = 1; - `USED_IREG (rd); - `USED_IREG (rs1); - `USED_IREG (rs2); - `USED_IREG (rs3); - end - `endif + `INST_EXT4: begin + ex_type = `EX_TENSOR; + op_type = `INST_TENSOR_HMMA; + // tensor core macroop is encoded as r-type + use_rd = 1; + `USED_IREG (rd); + `USED_IREG (rs1); + `USED_IREG (rs2); + `USED_IREG (rs3); + end `endif default:; endcase From d9ad4809ec44bc6b225a0b0c8636c283a5253f76 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 11 Oct 2024 15:41:16 -0700 Subject: [PATCH 05/14] Add 'tensor' bit to commit_if and writeback_if For use in the asynchronous tensor instruction. When 1'b1, sets/unsets the inuse_tensor status bit in the scoreboard to signal kickoff/completion of the asynchronous tensor op. 
--- hw/rtl/core/VX_alu_unit.sv | 2 +- hw/rtl/core/VX_commit.sv | 3 ++- hw/rtl/core/VX_csr_unit.sv | 6 +++--- hw/rtl/core/VX_gather_unit.sv | 3 ++- hw/rtl/core/VX_int_unit.sv | 6 +++--- hw/rtl/core/VX_lsu_unit.sv | 14 +++++++------- hw/rtl/core/VX_muldiv_unit.sv | 8 ++++---- hw/rtl/core/VX_reduce_unit.sv | 4 ++-- hw/rtl/core/VX_scoreboard.sv | 4 ++++ hw/rtl/core/VX_sfu_unit.sv | 2 +- hw/rtl/core/VX_tensor_core.sv | 3 ++- hw/rtl/core/VX_tensor_hopper_core.sv | 3 ++- hw/rtl/core/VX_wctl_unit.sv | 6 +++--- hw/rtl/interfaces/VX_commit_if.sv | 1 + hw/rtl/interfaces/VX_writeback_if.sv | 1 + 15 files changed, 38 insertions(+), 28 deletions(-) diff --git a/hw/rtl/core/VX_alu_unit.sv b/hw/rtl/core/VX_alu_unit.sv index 071cc08d..1c089509 100644 --- a/hw/rtl/core/VX_alu_unit.sv +++ b/hw/rtl/core/VX_alu_unit.sv @@ -32,7 +32,7 @@ module VX_alu_unit #( localparam NUM_LANES = `NUM_ALU_LANES; localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES); localparam PID_WIDTH = `UP(PID_BITS); - localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1; + localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + 1 + PID_WIDTH + 1 + 1; localparam RSP_ARB_SIZE = 2 + `EXT_M_ENABLED; localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS); diff --git a/hw/rtl/core/VX_commit.sv b/hw/rtl/core/VX_commit.sv index a584cace..9b930818 100644 --- a/hw/rtl/core/VX_commit.sv +++ b/hw/rtl/core/VX_commit.sv @@ -41,7 +41,7 @@ module VX_commit import VX_gpu_pkg::*; #( output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value ); `UNUSED_PARAM (CORE_ID) - localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + `NUM_THREADS * `XLEN + 1 + 1 + 1; + localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + `NUM_THREADS * `XLEN + 1 + 1 + 1 + 1; localparam COMMIT_SIZEW = `CLOG2(`NUM_THREADS + 1); localparam COMMIT_ALL_SIZEW = 
COMMIT_SIZEW + `ISSUE_WIDTH - 1; @@ -210,6 +210,7 @@ module VX_commit import VX_gpu_pkg::*; #( assign writeback_if[i].data.tmask= commit_if[i].data.tmask; assign writeback_if[i].data.rd = commit_if[i].data.rd; assign writeback_if[i].data.data = commit_if[i].data.data; + assign writeback_if[i].data.tensor = commit_if[i].data.tensor; assign writeback_if[i].data.sop = commit_if[i].data.sop; assign writeback_if[i].data.eop = commit_if[i].data.eop; assign commit_if[i].ready = 1'b1; // writeback has no backpressure diff --git a/hw/rtl/core/VX_csr_unit.sv b/hw/rtl/core/VX_csr_unit.sv index 9fa373b6..bf229789 100644 --- a/hw/rtl/core/VX_csr_unit.sv +++ b/hw/rtl/core/VX_csr_unit.sv @@ -43,7 +43,7 @@ module VX_csr_unit import VX_gpu_pkg::*; #( `UNUSED_PARAM (CORE_ID) localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES); localparam PID_WIDTH = `UP(PID_BITS); - localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * 32 + PID_WIDTH + 1 + 1; + localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * 32 + 1 + PID_WIDTH + 1 + 1; `UNUSED_VAR (execute_if.data.rs3_data) @@ -174,8 +174,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #( .reset (reset), .valid_in (csr_req_valid), .ready_in (csr_req_ready), - .data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, csr_read_data, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}), - .data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, csr_commit_data, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop}), + .data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, csr_read_data, 1'b0/*tensor*/, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}), + .data_out ({commit_if.data.uuid, commit_if.data.wid, 
commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, csr_commit_data, commit_if.data.tensor, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop}), .valid_out (commit_if.valid), .ready_out (commit_if.ready) ); diff --git a/hw/rtl/core/VX_gather_unit.sv b/hw/rtl/core/VX_gather_unit.sv index 21ae4485..fc8270d4 100644 --- a/hw/rtl/core/VX_gather_unit.sv +++ b/hw/rtl/core/VX_gather_unit.sv @@ -31,7 +31,7 @@ module VX_gather_unit import VX_gpu_pkg::*; #( localparam BLOCK_SIZE_W = `LOG2UP(BLOCK_SIZE); localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES); localparam PID_WIDTH = `UP(PID_BITS); - localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + 1 + `NR_BITS + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1; + localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + 1 + `NR_BITS + NUM_LANES * `XLEN + 1 + PID_WIDTH + 1 + 1; localparam DATA_WIS_OFF = DATAW - (`UUID_WIDTH + `NW_WIDTH); wire [BLOCK_SIZE-1:0] commit_in_valid; @@ -119,6 +119,7 @@ module VX_gather_unit import VX_gpu_pkg::*; #( commit_tmp_if.data.wb, commit_tmp_if.data.rd, commit_data_r, + commit_tmp_if.data.tensor, 1'b0, // PID commit_tmp_if.data.sop, commit_tmp_if.data.eop diff --git a/hw/rtl/core/VX_int_unit.sv b/hw/rtl/core/VX_int_unit.sv index a5e4f394..b8cb78dd 100644 --- a/hw/rtl/core/VX_int_unit.sv +++ b/hw/rtl/core/VX_int_unit.sv @@ -136,14 +136,14 @@ module VX_int_unit #( end VX_elastic_buffer #( - .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `NR_BITS + 1 + PID_WIDTH + 1 + 1 + (NUM_LANES * `XLEN) + `XLEN + `XLEN + 1 + `INST_BR_BITS + LANE_WIDTH) + .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `NR_BITS + 1 + PID_WIDTH + 1 + 1 + 1 + (NUM_LANES * `XLEN) + `XLEN + `XLEN + 1 + `INST_BR_BITS + LANE_WIDTH) ) rsp_buf ( .clk (clk), .reset (reset), .valid_in (execute_if.valid), .ready_in (execute_if.ready), - .data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, 
execute_if.data.eop, alu_result, execute_if.data.PC, execute_if.data.imm, is_br_op, br_op, tid}), - .data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, alu_result_r, PC_r, imm_r, is_br_op_r, br_op_r, tid_r}), + .data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, 1'b0/*tensor*/, alu_result, execute_if.data.PC, execute_if.data.imm, is_br_op, br_op, tid}), + .data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, commit_if.data.tensor, alu_result_r, PC_r, imm_r, is_br_op_r, br_op_r, tid_r}), .valid_out (commit_if.valid), .ready_out (commit_if.ready) ); diff --git a/hw/rtl/core/VX_lsu_unit.sv b/hw/rtl/core/VX_lsu_unit.sv index 20fac1d1..e8748e39 100644 --- a/hw/rtl/core/VX_lsu_unit.sv +++ b/hw/rtl/core/VX_lsu_unit.sv @@ -36,7 +36,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #( localparam NUM_LANES = `NUM_LSU_LANES; localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES); localparam PID_WIDTH = `UP(PID_BITS); - localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1; + localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + 1 + PID_WIDTH + 1 + 1; localparam LSUQ_SIZEW = `LOG2UP(`LSUQ_SIZE); localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE); localparam MEM_ADDRW = `XLEN - MEM_ASHIFT; @@ -527,15 +527,15 @@ module VX_lsu_unit import VX_gpu_pkg::*; #( // load commit VX_elastic_buffer #( - .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1), + .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + (NUM_LANES * `XLEN) + 1 + PID_WIDTH + 1 
+ 1), .SIZE (2) ) ld_rsp_buf ( .clk (clk), .reset (reset), .valid_in (mem_rsp_valid), .ready_in (mem_rsp_ready), - .data_in ({rsp_uuid, rsp_wid, rsp_tmask, rsp_pc, rsp_rd, rsp_data, rsp_pid, mem_rsp_sop_pkt, mem_rsp_eop_pkt}), - .data_out ({commit_ld_if.data.uuid, commit_ld_if.data.wid, commit_ld_if.data.tmask, commit_ld_if.data.PC, commit_ld_if.data.rd, commit_ld_if.data.data, commit_ld_if.data.pid, commit_ld_if.data.sop, commit_ld_if.data.eop}), + .data_in ({rsp_uuid, rsp_wid, rsp_tmask, rsp_pc, rsp_rd, rsp_data, 1'b0/*tensor*/, rsp_pid, mem_rsp_sop_pkt, mem_rsp_eop_pkt}), + .data_out ({commit_ld_if.data.uuid, commit_ld_if.data.wid, commit_ld_if.data.tmask, commit_ld_if.data.PC, commit_ld_if.data.rd, commit_ld_if.data.data, commit_ld_if.data.tensor, commit_ld_if.data.pid, commit_ld_if.data.sop, commit_ld_if.data.eop}), .valid_out (commit_ld_if.valid), .ready_out (commit_ld_if.ready) ); @@ -545,15 +545,15 @@ module VX_lsu_unit import VX_gpu_pkg::*; #( // store commit VX_elastic_buffer #( - .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + PID_WIDTH + 1 + 1), + .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + 1 + PID_WIDTH + 1 + 1), .SIZE (2) ) st_rsp_buf ( .clk (clk), .reset (reset), .valid_in (mem_req_fire && mem_req_rw), .ready_in (st_rsp_ready), - .data_in ({execute_if[0].data.uuid, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.pid, execute_if[0].data.sop, execute_if[0].data.eop}), - .data_out ({commit_st_if.data.uuid, commit_st_if.data.wid, commit_st_if.data.tmask, commit_st_if.data.PC, commit_st_if.data.pid, commit_st_if.data.sop, commit_st_if.data.eop}), + .data_in ({execute_if[0].data.uuid, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, 1'b0/*tensor*/, execute_if[0].data.pid, execute_if[0].data.sop, execute_if[0].data.eop}), + .data_out ({commit_st_if.data.uuid, commit_st_if.data.wid, commit_st_if.data.tmask, commit_st_if.data.PC, commit_st_if.data.tensor, 
commit_st_if.data.pid, commit_st_if.data.sop, commit_st_if.data.eop}), .valid_out (commit_st_if.valid), .ready_out (commit_st_if.ready) ); diff --git a/hw/rtl/core/VX_muldiv_unit.sv b/hw/rtl/core/VX_muldiv_unit.sv index 6daa3c3d..80168c73 100644 --- a/hw/rtl/core/VX_muldiv_unit.sv +++ b/hw/rtl/core/VX_muldiv_unit.sv @@ -323,16 +323,16 @@ module VX_muldiv_unit #( VX_stream_arb #( .NUM_INPUTS (2), - .DATAW (TAGW + (NUM_LANES * `XLEN)), + .DATAW (1/*tensor field only in commit*/ + TAGW + (NUM_LANES * `XLEN)), .OUT_REG (1) ) rsp_buf ( .clk (clk), .reset (reset), .valid_in ({div_valid_out, mul_valid_out}), .ready_in ({div_ready_out, mul_ready_out}), - .data_in ({{div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, div_pid_out, div_sop_out, div_eop_out, div_result_out}, - {mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_pid_out, mul_sop_out, mul_eop_out, mul_result_out}}), - .data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, commit_if.data.data}), + .data_in ({{div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, 1'b0/*tensor*/, div_pid_out, div_sop_out, div_eop_out, div_result_out}, + {mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, 1'b0/*tensor*/, mul_pid_out, mul_sop_out, mul_eop_out, mul_result_out}}), + .data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, commit_if.data.tensor, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, commit_if.data.data}), .valid_out (commit_if.valid), .ready_out (commit_if.ready), `UNUSED_PIN (sel_out) diff --git a/hw/rtl/core/VX_reduce_unit.sv b/hw/rtl/core/VX_reduce_unit.sv index 8522f8d1..b63e57ae 100644 --- a/hw/rtl/core/VX_reduce_unit.sv +++ b/hw/rtl/core/VX_reduce_unit.sv @@ -269,7 +269,7 @@ 
module VX_reduce_unit #( ); VX_elastic_buffer #( - .DATAW(`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + 1 + `NR_BITS + (`XLEN * NUM_LANES) + PID_WIDTH + 1 + 1) + .DATAW(`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + 1 + `NR_BITS + (`XLEN * NUM_LANES) + 1 + PID_WIDTH + 1 + 1) ) output_buffer ( .clk(clk), .reset(reset), @@ -277,7 +277,7 @@ module VX_reduce_unit #( .ready_in(commit_if_ready), - .data_in({execute_if.data.uuid, execute_if.data.wid, stored_tmask, execute_if.data.PC, execute_if.data.wb, execute_if.data.rd, broadcasted_accumulator, stored_pid, stored_sop, stored_eop}), + .data_in({execute_if.data.uuid, execute_if.data.wid, stored_tmask, execute_if.data.PC, execute_if.data.wb, execute_if.data.rd, broadcasted_accumulator, 1'b0/*tensor*/, stored_pid, stored_sop, stored_eop}), - .data_out({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.wb, commit_if.data.rd, commit_if.data.data, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop}), + .data_out({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.wb, commit_if.data.rd, commit_if.data.data, commit_if.data.tensor, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop}), .ready_out(commit_if.ready), .valid_out(commit_if.valid) ); diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv index c63a5dcb..42a876f5 100644 --- a/hw/rtl/core/VX_scoreboard.sv +++ b/hw/rtl/core/VX_scoreboard.sv @@ -142,6 +142,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #( for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs; + // busy bit for the asynchronous Tensor unit. Since the ISA does not + // have an explicit destination register, use a separate status bit.
+ reg [`UP(ISSUE_RATIO)-1:0] inuse_tensor; wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop; @@ -227,6 +230,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #( always @(posedge clk) begin if (reset) begin inuse_regs <= '0; + inuse_tensor <= '0; end else begin if (writeback_fire) begin inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0; diff --git a/hw/rtl/core/VX_sfu_unit.sv b/hw/rtl/core/VX_sfu_unit.sv index ed2023b7..48f1cb8f 100644 --- a/hw/rtl/core/VX_sfu_unit.sv +++ b/hw/rtl/core/VX_sfu_unit.sv @@ -49,7 +49,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #( localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES); localparam PID_WIDTH = `UP(PID_BITS); - localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `XLEN + PID_WIDTH + 1 + 1; + localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `XLEN + 1 + PID_WIDTH + 1 + 1; localparam RSP_ARB_SIZE = 1 + 1; localparam RSP_ARB_IDX_WCTL = 0; localparam RSP_ARB_IDX_CSRS = 1; diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 802af43d..1f7a95db 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -283,10 +283,11 @@ module VX_tensor_core_block import VX_gpu_pkg::*; #( assign commit_if_ready_override = commit_if.ready && (counter == 2'b0); `endif - localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1; + localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1; wire [COMMIT_DATAW-1:0] commit_if_data = { execute_if_data_deq[wb_wid], /* uuid ~ rd */ subcommit == 1'b0 ? 
wb_data_0 : wb_data_1, /* data */ + 1'b0, /* tensor */ 1'b0, /* pid */ 1'b1, /* sop */ 1'b1 /* eop */ diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv index a58f4027..2ecbea70 100644 --- a/hw/rtl/core/VX_tensor_hopper_core.sv +++ b/hw/rtl/core/VX_tensor_hopper_core.sv @@ -128,11 +128,12 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0; - localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1; + localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1; wire [COMMIT_DATAW-1:0] commit_if_data = { // write-back to the correct rd only when eop ((state == 2'b11) ? execute_if_data_deq[0/*FIXME*/] : execute_if_data_new_rd), /* uuid ~ rd */ wb_data, /* data */ + 1'b0, /* tensor */ 1'b0, /* pid */ 1'b1, /* sop */ (state == 2'b11) /* eop */ diff --git a/hw/rtl/core/VX_wctl_unit.sv b/hw/rtl/core/VX_wctl_unit.sv index 5b1ad834..36144018 100644 --- a/hw/rtl/core/VX_wctl_unit.sv +++ b/hw/rtl/core/VX_wctl_unit.sv @@ -32,7 +32,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #( localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES); localparam PID_WIDTH = `UP(PID_BITS); localparam WCTL_WIDTH = $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t); - localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + WCTL_WIDTH + PID_WIDTH + 1 + 1; + localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + WCTL_WIDTH + 1 + PID_WIDTH + 1 + 1; `UNUSED_VAR (execute_if.data.rs3_data) @@ -141,8 +141,8 @@ module VX_wctl_unit import VX_gpu_pkg::*; #( .reset (reset), .valid_in (execute_if.valid), .ready_in (execute_if.ready), - .data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, 
execute_if.data.sop, execute_if.data.eop, {tmc, wspawn, split, sjoin, barrier}}), - .data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, {tmc_r, wspawn_r, split_r, sjoin_r, barrier_r}}), + .data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, 1'b0/*tensor*/, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, {tmc, wspawn, split, sjoin, barrier}}), + .data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, commit_if.data.tensor, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, {tmc_r, wspawn_r, split_r, sjoin_r, barrier_r}}), .valid_out (commit_if.valid), .ready_out (commit_if.ready) ); diff --git a/hw/rtl/interfaces/VX_commit_if.sv b/hw/rtl/interfaces/VX_commit_if.sv index e5bfa13a..2eaf5d0e 100644 --- a/hw/rtl/interfaces/VX_commit_if.sv +++ b/hw/rtl/interfaces/VX_commit_if.sv @@ -26,6 +26,7 @@ interface VX_commit_if #( logic wb; logic [`NR_BITS-1:0] rd; logic [NUM_LANES-1:0][`XLEN-1:0] data; + logic tensor; logic [PID_WIDTH-1:0] pid; logic sop; logic eop; diff --git a/hw/rtl/interfaces/VX_writeback_if.sv b/hw/rtl/interfaces/VX_writeback_if.sv index ce6241ef..72abdb45 100644 --- a/hw/rtl/interfaces/VX_writeback_if.sv +++ b/hw/rtl/interfaces/VX_writeback_if.sv @@ -22,6 +22,7 @@ interface VX_writeback_if import VX_gpu_pkg::*; (); logic [`XLEN-1:0] PC; logic [`NR_BITS-1:0] rd; logic [`NUM_THREADS-1:0][`XLEN-1:0] data; + logic tensor; logic sop; logic eop; } data_t; From 100d69ef210ea648f958146ac20afd53b2ba4c48 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 11 Oct 2024 15:47:58 -0700 Subject: [PATCH 06/14] Doc update on accumulator regs --- hw/rtl/core/VX_uop_sequencer.sv | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/hw/rtl/core/VX_uop_sequencer.sv b/hw/rtl/core/VX_uop_sequencer.sv index 8a53d3f1..ab0298cb 100644 --- a/hw/rtl/core/VX_uop_sequencer.sv +++ b/hw/rtl/core/VX_uop_sequencer.sv @@ -225,8 +225,9 @@ end if (uop_sequencer_if.valid && use_uop && uop_sequencer_if.data.rd == `NR_BITS'(1)) begin - // a little sketchy? but shouldn't create any loop - ibuffer_if.data.rd = ibuffer_if.data.rd + `NR_BITS'(8); // FIXME: 8 is hardcoded + // if rd is '1', use a separate set of 8 fp registers as the + // destination accumulator data. + ibuffer_if.data.rd = ibuffer_if.data.rd + `NR_BITS'(8); // note 8 is hardcoded ibuffer_if.data.rs3 = ibuffer_if.data.rs3 + `NR_BITS'(8); end end From 72f9dedce3f3cee924856897b5dcaa21db6e6d39 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 11 Oct 2024 15:59:31 -0700 Subject: [PATCH 07/14] tensor: Disable micro-ops for hopper Have an uarch FSM handle the stepping mechanism entirely. --- hw/rtl/core/VX_uop_sequencer.sv | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hw/rtl/core/VX_uop_sequencer.sv b/hw/rtl/core/VX_uop_sequencer.sv index ab0298cb..798466bf 100644 --- a/hw/rtl/core/VX_uop_sequencer.sv +++ b/hw/rtl/core/VX_uop_sequencer.sv @@ -183,7 +183,13 @@ end // merging the 2 always blocks leads to spurious UNOPTFLAT verilator lint, // but conceptually they should be linked always @(*) begin +`ifdef EXT_T_HOPPER + // for Hopper, disable micro-op blitzing. Set/step is managed + // microarchitecturally in an FSM inside the tensor core. 
+ use_uop = 1'b0; +`else use_uop = uop_sequencer_if.valid && uop_sequencer_if.data.ex_type == `EX_BITS'(`EX_TENSOR); +`endif if (uop_start) begin // 1st cycle of microcoded operation, use op_type to determine entry point into microcode table From 408a9b5d2adcc64b76beb84c912da57d92b0b63d Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 11 Oct 2024 17:18:01 -0700 Subject: [PATCH 08/14] tensor: Write stall logic for hgmma_wait HGMMA_WAIT instruction stalls at issue when inuse_tensor is set, which is done by the previous HGMMA insn. Currently inuse_tensor is never set back to zero. --- hw/rtl/VX_define.vh | 5 ++++- hw/rtl/core/VX_decode.sv | 11 +++++++++++ hw/rtl/core/VX_scoreboard.sv | 16 ++++++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index bb96a149..61ce41be 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -254,7 +254,10 @@ `define INST_SFU_IS_WCTL(op) (op <= 5) `define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8) -`define INST_TENSOR_HMMA 4'b0000 +`define INST_TENSOR_HMMA 4'b0000 +// Hopper WGMMA-style asynchronous op +`define INST_TENSOR_HGMMA 4'b0001 +`define INST_TENSOR_HGMMA_WAIT 4'b0010 /////////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/core/VX_decode.sv b/hw/rtl/core/VX_decode.sv index cf21d72f..df3500f2 100644 --- a/hw/rtl/core/VX_decode.sv +++ b/hw/rtl/core/VX_decode.sv @@ -543,6 +543,16 @@ module VX_decode #( end `ifdef EXT_T_ENABLE `INST_EXT4: begin + `ifdef EXT_T_HOPPER + ex_type = `EX_TENSOR; + // tensor core macroop is encoded as r-type + if (func3[0]) begin + op_type = `INST_TENSOR_HGMMA_WAIT; + end else begin + op_type = `INST_TENSOR_HGMMA; + end + // rd/rs1/rs2/rs3 unused + `else ex_type = `EX_TENSOR; op_type = `INST_TENSOR_HMMA; // tensor core macroop is encoded as r-type @@ -551,6 +561,7 @@ module VX_decode #( `USED_IREG (rs1); `USED_IREG (rs2); `USED_IREG (rs3); + `endif end `endif default:; diff --git 
a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv index 42a876f5..59886098 100644 --- a/hw/rtl/core/VX_scoreboard.sv +++ b/hw/rtl/core/VX_scoreboard.sv @@ -146,6 +146,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #( // have an explicit destination register, use a separate status bit. reg [`UP(ISSUE_RATIO)-1:0] inuse_tensor; + wire hgmma_start = (ibuffer_if[i].data.ex_type == `EX_TENSOR) && + (ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA); + wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop; wire inuse_rd = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]; @@ -208,7 +211,15 @@ module VX_scoreboard import VX_gpu_pkg::*; #( // NOTE(hansung): why is inuse_rd checked? to prevent WAW? wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3}; + `ifdef EXT_T_HOPPER + wire hgmma_wait = ibuffer_if[i].valid && + (ibuffer_if[i].data.ex_type == `EX_TENSOR) && + (ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA_WAIT); + wire hgmma_ready = ~(hgmma_wait && inuse_tensor[ibuffer_if[i].data.wis]); + wire operands_ready = (~(| operands_busy)) && hgmma_ready; + `else wire operands_ready = ~(| operands_busy); + `endif wire stg_valid_in, stg_ready_in; assign stg_valid_in = ibuffer_if[i].valid && operands_ready; @@ -238,6 +249,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #( if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1; end + `ifdef EXT_T_HOPPER + if (ibuffer_if[i].valid && ibuffer_if[i].ready && hgmma_start) begin + inuse_tensor[ibuffer_if[i].data.wis] <= 1'b1; + end + `endif end `ifdef PERF_ENABLE if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin From 42b9d23f832d2192231e66dd55a012193ecdc860 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 11 Oct 2024 17:27:51 -0700 Subject: [PATCH 09/14] tensor: Write release logic for hgmma Upon completion of an op, tensor_core_hopper sends a "ghost" commit 
signal down the pipeline with the `wb` and `tensor` bit set in commit_if. The scoreboard receives this signal via writeback_if and resets the inuse_tensor status bit back to zero, which unblocks the HGMMA_WAIT instruction. --- hw/rtl/core/VX_scoreboard.sv | 7 +++- hw/rtl/core/VX_tensor_hopper_core.sv | 56 +++++++++++++--------------- 2 files changed, 30 insertions(+), 33 deletions(-) diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv index 59886098..2a39c058 100644 --- a/hw/rtl/core/VX_scoreboard.sv +++ b/hw/rtl/core/VX_scoreboard.sv @@ -146,7 +146,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #( // have an explicit destination register, use a separate status bit. reg [`UP(ISSUE_RATIO)-1:0] inuse_tensor; - wire hgmma_start = (ibuffer_if[i].data.ex_type == `EX_TENSOR) && + wire hgmma_start = (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) && (ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA); wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop; @@ -213,7 +213,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #( wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3}; `ifdef EXT_T_HOPPER wire hgmma_wait = ibuffer_if[i].valid && - (ibuffer_if[i].data.ex_type == `EX_TENSOR) && + (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) && (ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA_WAIT); wire hgmma_ready = ~(hgmma_wait && inuse_tensor[ibuffer_if[i].data.wis]); wire operands_ready = (~(| operands_busy)) && hgmma_ready; @@ -250,6 +250,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #( inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1; end `ifdef EXT_T_HOPPER + if (writeback_fire && writeback_if[i].data.tensor) begin + inuse_tensor[ibuffer_if[i].data.wis] <= 1'b0; + end if (ibuffer_if[i].valid && ibuffer_if[i].ready && hgmma_start) begin inuse_tensor[ibuffer_if[i].data.wis] <= 1'b1; end diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv index 
2ecbea70..dc763d48 100644 --- a/hw/rtl/core/VX_tensor_hopper_core.sv +++ b/hw/rtl/core/VX_tensor_hopper_core.sv @@ -11,6 +11,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( VX_execute_if.slave execute_if, VX_commit_if.master commit_if ); + localparam NUM_LANES = `NUM_THREADS; localparam METADATA_QUEUE_DEPTH = 16; // FIXME: arbitrary /* commit_if.data_t parts that we need to keep around: @@ -21,22 +22,17 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( - wb - rd */ - localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS; wire operand_enq_fire = execute_if.valid && execute_if.ready; wire commit_if_fire = commit_if.valid && commit_if.ready; - wire [DATAW-1:0] execute_if_data_enq = { - execute_if.data.uuid, - execute_if.data.wid, - execute_if.data.tmask, - execute_if.data.PC, - execute_if.data.wb, - execute_if.data.rd - // pid/sop/eop set later - }; - wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq; + wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] execute_if_data_uuid; + wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0] execute_if_data_wid; + wire [`NUM_WARPS-1:0][NUM_LANES-1:0] execute_if_data_tmask; + wire [`NUM_WARPS-1:0][`XLEN-1:0] execute_if_data_PC; + wire [`NUM_WARPS-1:0] execute_if_data_wb; + wire [`NUM_WARPS-1:0][`NR_BITS-1:0] execute_if_data_rd; logic [DATAW-1:0] execute_if_data_new_rd; wire [`NUM_WARPS-1:0] metadata_queue_fulls; @@ -71,8 +67,12 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( .reset(reset), .push(enq), .pop(deq), - .data_in(execute_if_data_enq), - .data_out(execute_if_data_deq[i]), + .data_in({execute_if.data.uuid, execute_if.data.wid, + execute_if.data.tmask, execute_if.data.PC, + execute_if.data.wb, execute_if.data.rd}), + .data_out({execute_if_data_uuid[i], execute_if_data_wid[i], + execute_if_data_tmask[i], execute_if_data_PC[i], + execute_if_data_wb[i], execute_if_data_rd[i]}), .empty(metadata_queue_emptys[i]), `UNUSED_PIN(alm_empty), .full(metadata_queue_fulls[i]), @@ -108,11 
+108,6 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( if ((state != STATE_IDLE) && (state_n == STATE_IDLE)) begin metadata_deq = 1'b1; end - - // change rd of the commit data according to state - execute_if_data_new_rd = - {execute_if_data_deq[0/*FIXME*/][DATAW-1:`NR_BITS], - (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(state))}; end always @(posedge clk) begin @@ -128,19 +123,18 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0; - localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1; - wire [COMMIT_DATAW-1:0] commit_if_data = { - // write-back to the correct rd only when eop - ((state == 2'b11) ? execute_if_data_deq[0/*FIXME*/] : execute_if_data_new_rd), /* uuid ~ rd */ - wb_data, /* data */ - 1'b0, /* tensor */ - 1'b0, /* pid */ - 1'b1, /* sop */ - (state == 2'b11) /* eop */ - // 1'b1 /* eop */ - }; - - assign commit_if.data = commit_if_data; + assign commit_if.data.uuid = execute_if_data_uuid[0]; + assign commit_if.data.wid = execute_if_data_wid[0]; + assign commit_if.data.tmask = execute_if_data_tmask[0]; + assign commit_if.data.PC = execute_if_data_PC[0]; + assign commit_if.data.wb = (state == 2'b11); + // custom rd + assign commit_if.data.rd = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(state)); + assign commit_if.data.data = wb_data; + assign commit_if.data.tensor = (state == 2'b11); + assign commit_if.data.pid = 1'b0; + assign commit_if.data.sop = 1'b1; + assign commit_if.data.eop = (state == 2'b11); endmodule `endif From f7f23e0c05686039cd8a4f0836f82ca653952e25 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 11 Oct 2024 18:00:36 -0700 Subject: [PATCH 10/14] tensor: Doc update --- hw/rtl/core/VX_decode.sv | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hw/rtl/core/VX_decode.sv b/hw/rtl/core/VX_decode.sv index df3500f2..8d52d450 100644 --- a/hw/rtl/core/VX_decode.sv +++ b/hw/rtl/core/VX_decode.sv @@ 
-551,11 +551,14 @@ module VX_decode #( end else begin op_type = `INST_TENSOR_HGMMA; end - // rd/rs1/rs2/rs3 unused + // rd/rs1/rs2/rs3 unused to prevent hazard stalls at the + // scoreboard `else ex_type = `EX_TENSOR; op_type = `INST_TENSOR_HMMA; // tensor core macroop is encoded as r-type + // hazard stall logic in the scoreboard will handle + // read-after-write dependency on rd -> rs3 use_rd = 1; `USED_IREG (rd); `USED_IREG (rs1); From 2934b1bd94a670ba0c9588256b55373f555fa5b5 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 11 Oct 2024 20:09:09 -0700 Subject: [PATCH 11/14] tensor: Split execution module from pipeline logic --- hw/rtl/core/VX_tensor_core.sv | 2 +- hw/rtl/core/VX_tensor_hopper_core.sv | 113 ++++++++++++++++++--------- 2 files changed, 77 insertions(+), 38 deletions(-) diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 1f7a95db..cad70b97 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -55,7 +55,7 @@ module VX_tensor_core import VX_gpu_pkg::*; #( VX_tensor_hopper_core_block #( .ISW(1), // FIXME: not block_idx .FP16(FP16) - ) tensor_hopper_core ( + ) tensor_hopper_core_block ( .clk(clk), .reset(reset), .execute_if(execute_if[block_idx]), diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv index dc763d48..8abe463e 100644 --- a/hw/rtl/core/VX_tensor_hopper_core.sv +++ b/hw/rtl/core/VX_tensor_hopper_core.sv @@ -12,7 +12,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( VX_commit_if.master commit_if ); localparam NUM_LANES = `NUM_THREADS; - localparam METADATA_QUEUE_DEPTH = 16; // FIXME: arbitrary + localparam METADATA_QUEUE_DEPTH = 2; // FIXME: arbitrary /* commit_if.data_t parts that we need to keep around: - uuid @@ -22,29 +22,23 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( - wb - rd */ - localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS; - - wire operand_enq_fire = execute_if.valid 
&& execute_if.ready; - wire commit_if_fire = commit_if.valid && commit_if.ready; - wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] execute_if_data_uuid; wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0] execute_if_data_wid; wire [`NUM_WARPS-1:0][NUM_LANES-1:0] execute_if_data_tmask; wire [`NUM_WARPS-1:0][`XLEN-1:0] execute_if_data_PC; wire [`NUM_WARPS-1:0] execute_if_data_wb; wire [`NUM_WARPS-1:0][`NR_BITS-1:0] execute_if_data_rd; - logic [DATAW-1:0] execute_if_data_new_rd; wire [`NUM_WARPS-1:0] metadata_queue_fulls; wire [`NUM_WARPS-1:0] metadata_queue_emptys; - // OR not AND, we don't want any warp full + // OR not AND; we don't want any warp to be full wire metadata_queue_full = |(metadata_queue_fulls); assign execute_if.ready = !metadata_queue_full; `RUNTIME_ASSERT((!execute_if.valid || execute_if.data.wid == `NW_WIDTH'(0)), ("runtime error: WGMMA execute not supported for warps other than 0!")) - logic metadata_deq; + wire metadata_deq; for (genvar i = 0; i < `NUM_WARPS; i++) begin // Metadata queue for commit_if. This simply copies execute_if's @@ -55,10 +49,12 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( // ensure two consecutive dequeues are associated with the same warp for // commit. (FIXME: this is not strictly necessary though.) 
+ wire operand_enq_fire = execute_if.valid && execute_if.ready; wire enq = operand_enq_fire && (execute_if.data.wid == `NW_WIDTH'(i)); // FIXME: commit only warp 0 - wire deq = metadata_deq && commit_if.ready && (`NW_WIDTH'(i) == `NW_WIDTH'(0)); + wire deq = metadata_deq && (`NW_WIDTH'(i) == `NW_WIDTH'(0)); + localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS; VX_fifo_queue #( .DATAW(DATAW), .DEPTH(METADATA_QUEUE_DEPTH) @@ -85,28 +81,84 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( // the commit stage `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!")) + wire initiate_ready; // FIXME: unused + wire writeback_valid; + wire writeback_last; + + wire metadata_valid = ~metadata_queue_emptys[0/*FIXME*/]; + // dequeue metadata at the last writeback + assign metadata_deq = metadata_valid && writeback_valid && writeback_last; + + VX_tensor_hopper_core #( + ) tensor_hopper_core ( + .clk(clk), + .reset(reset), + + .initiate_valid(metadata_valid), + .initiate_wid(`NW_WIDTH'(0)/*FIXME*/), + .initiate_ready(initiate_ready), + + .writeback_valid(writeback_valid), + `UNUSED_PIN(writeback_wid), + .writeback_last(writeback_last), + .writeback_ready(commit_if.ready) + ); + + wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0; + + assign commit_if.valid = writeback_valid; + assign commit_if.data.uuid = execute_if_data_uuid[0]; + assign commit_if.data.wid = execute_if_data_wid[0]; + assign commit_if.data.tmask = execute_if_data_tmask[0]; + assign commit_if.data.PC = execute_if_data_PC[0]; + assign commit_if.data.wb = writeback_last; + // custom rd + assign commit_if.data.rd = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(4'd3/*FIXME*/)); + assign commit_if.data.data = wb_data; + assign commit_if.data.tensor = writeback_last; + assign commit_if.data.pid = 1'b0; + assign commit_if.data.sop = 1'b1; + // eop is deliberately set so that we don't underflow the pending_instr + // buffer in VX_schedule. 
An instruction is considered committed only + // when the eop bit is set to one (see VX_commit). + assign commit_if.data.eop = writeback_last; +endmodule + + +// TODO: replace this with a Chisel module +module VX_tensor_hopper_core #( +) ( + input clk, + input reset, + + input initiate_valid, + input [`NW_WIDTH-1:0] initiate_wid, + output initiate_ready, + + output writeback_valid, + output [`NW_WIDTH-1:0] writeback_wid, + // indicates if this is the last writeback for the given wid, in which + // case the original HGMMA instruction should be signalled retired + output writeback_last, + input writeback_ready +); // dummy FSM that generates commits - logic [1:0] state, state_n; localparam STATE_IDLE = 4'd0; + logic [1:0] state, state_n; + + assign initiate_ready = (state == STATE_IDLE); always @(*) begin state_n = state; - metadata_deq = 1'b0; // when incremented to 1, count up until wrap-around to 0 if (state != STATE_IDLE) begin state_n = state + 1'd1; - end else begin - // kick-off from idle when execute valid - // FIXME: only checks warp 0 for commit! 
- if (~metadata_queue_emptys[0/*FIXME*/]) begin - state_n = 4'd1; - end end - // dequeue metadata when wrapping around - if ((state != STATE_IDLE) && (state_n == STATE_IDLE)) begin - metadata_deq = 1'b1; + // kick-off + if (initiate_valid && initiate_ready) begin + state_n = 4'd1; end end @@ -118,23 +170,10 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( end end - // assign commit_if.valid = metadata_deq; - assign commit_if.valid = (state != STATE_IDLE); + assign writeback_valid = (state != STATE_IDLE); + assign writeback_wid = '0; // TODO + assign writeback_last = (state == 4'd15); - wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0; - - assign commit_if.data.uuid = execute_if_data_uuid[0]; - assign commit_if.data.wid = execute_if_data_wid[0]; - assign commit_if.data.tmask = execute_if_data_tmask[0]; - assign commit_if.data.PC = execute_if_data_PC[0]; - assign commit_if.data.wb = (state == 2'b11); - // custom rd - assign commit_if.data.rd = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(state)); - assign commit_if.data.data = wb_data; - assign commit_if.data.tensor = (state == 2'b11); - assign commit_if.data.pid = 1'b0; - assign commit_if.data.sop = 1'b1; - assign commit_if.data.eop = (state == 2'b11); endmodule `endif From 717fe7ff2959ba5185a69466b666a6439eb43e13 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 11 Oct 2024 20:24:31 -0700 Subject: [PATCH 12/14] tensor: Fix FSM when commit not ready --- hw/rtl/core/VX_tensor_hopper_core.sv | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv index 8abe463e..21fad57c 100644 --- a/hw/rtl/core/VX_tensor_hopper_core.sv +++ b/hw/rtl/core/VX_tensor_hopper_core.sv @@ -144,17 +144,28 @@ module VX_tensor_hopper_core #( ); // dummy FSM that generates commits localparam STATE_IDLE = 4'd0; - logic [1:0] state, state_n; + localparam STATE_FINISH = 4'd15; + logic [3:0] state, state_n; assign initiate_ready = 
(state == STATE_IDLE); always @(*) begin state_n = state; - // when incremented to 1, count up until wrap-around to 0 - if (state != STATE_IDLE) begin - state_n = state + 1'd1; - end + case (state) + STATE_IDLE: begin + state_n = state; + end + STATE_FINISH: begin + // hold until writeback_ready + if (writeback_ready) begin + state_n = STATE_IDLE; + end + end + default: begin + state_n = state + 4'd1; + end + endcase // kick-off if (initiate_valid && initiate_ready) begin @@ -172,7 +183,7 @@ module VX_tensor_hopper_core #( assign writeback_valid = (state != STATE_IDLE); assign writeback_wid = '0; // TODO - assign writeback_last = (state == 4'd15); + assign writeback_last = (state == STATE_FINISH); endmodule From 4dcbc31a88915fff35ccefd00c6e753fa5ef135a Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 11 Oct 2024 21:32:20 -0700 Subject: [PATCH 13/14] tensor: Separate async commit from tensor commit With this we can prioritize commit of the async hgmma instructions over the "ghost" commits from the TC. --- hw/rtl/core/VX_commit.sv | 9 ++- hw/rtl/core/VX_tensor_hopper_core.sv | 107 +++++++++++++++++++-------- 2 files changed, 83 insertions(+), 33 deletions(-) diff --git a/hw/rtl/core/VX_commit.sv b/hw/rtl/core/VX_commit.sv index 9b930818..faca0a2a 100644 --- a/hw/rtl/core/VX_commit.sv +++ b/hw/rtl/core/VX_commit.sv @@ -177,17 +177,22 @@ module VX_commit import VX_gpu_pkg::*; #( // probably want to change this at some point // (i.e. pass a "don't count this towards pending instructions" signal down the pipeline) wire [`ISSUE_WIDTH-1:0] final_hmma; + // if this is a "ghost" commit generated from the tensor core, don't count + // toward committed + wire [`ISSUE_WIDTH-1:0] tensor_ghost; `ifdef EXT_T_ENABLE for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin // if PC is 0, this means it is not final step of a wmma, shouldn't be committed assign final_hmma[i] = (commit_if[i].data.PC != 32'b0); + // handle 'x' with ===. 
FIXME fix uninitialization + assign tensor_ghost[i] = (commit_if[i].data.tensor == 1'b1); end `else assign final_hmma = '1; + assign tensor_ghost = '0; `endif - - wire [`ISSUE_WIDTH-1:0] committed = (commit_fire & commit_eop) & final_hmma; + wire [`ISSUE_WIDTH-1:0] committed = (commit_fire & commit_eop) & final_hmma & (~tensor_ghost); VX_pipe_register #( .DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)), diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv index 21fad57c..2b2136b6 100644 --- a/hw/rtl/core/VX_tensor_hopper_core.sv +++ b/hw/rtl/core/VX_tensor_hopper_core.sv @@ -22,12 +22,13 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( - wb - rd */ - wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] execute_if_data_uuid; - wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0] execute_if_data_wid; - wire [`NUM_WARPS-1:0][NUM_LANES-1:0] execute_if_data_tmask; - wire [`NUM_WARPS-1:0][`XLEN-1:0] execute_if_data_PC; - wire [`NUM_WARPS-1:0] execute_if_data_wb; - wire [`NUM_WARPS-1:0][`NR_BITS-1:0] execute_if_data_rd; + wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] execute_if_data_uuid; + wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0] execute_if_data_wid; + wire [`NUM_WARPS-1:0][NUM_LANES-1:0] execute_if_data_tmask; + wire [`NUM_WARPS-1:0][`INST_ALU_BITS-1:0] execute_if_data_op_type; + wire [`NUM_WARPS-1:0][`XLEN-1:0] execute_if_data_PC; + wire [`NUM_WARPS-1:0] execute_if_data_wb; + wire [`NUM_WARPS-1:0][`NR_BITS-1:0] execute_if_data_rd; wire [`NUM_WARPS-1:0] metadata_queue_fulls; wire [`NUM_WARPS-1:0] metadata_queue_emptys; @@ -38,7 +39,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( `RUNTIME_ASSERT((!execute_if.valid || execute_if.data.wid == `NW_WIDTH'(0)), ("runtime error: WGMMA execute not supported for warps other than 0!")) - wire metadata_deq; + logic metadata_deq; for (genvar i = 0; i < `NUM_WARPS; i++) begin // Metadata queue for commit_if. This simply copies execute_if's
This simply copies execute_if's @@ -54,7 +55,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( // FIXME: commit only warp 0 wire deq = metadata_deq && (`NW_WIDTH'(i) == `NW_WIDTH'(0)); - localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS; + localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `INST_ALU_BITS + `XLEN + 1 + `NR_BITS; VX_fifo_queue #( .DATAW(DATAW), .DEPTH(METADATA_QUEUE_DEPTH) @@ -64,10 +65,10 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( .push(enq), .pop(deq), .data_in({execute_if.data.uuid, execute_if.data.wid, - execute_if.data.tmask, execute_if.data.PC, + execute_if.data.tmask, execute_if.data.op_type, execute_if.data.PC, execute_if.data.wb, execute_if.data.rd}), .data_out({execute_if_data_uuid[i], execute_if_data_wid[i], - execute_if_data_tmask[i], execute_if_data_PC[i], + execute_if_data_tmask[i], execute_if_data_op_type[i], execute_if_data_PC[i], execute_if_data_wb[i], execute_if_data_rd[i]}), .empty(metadata_queue_emptys[i]), `UNUSED_PIN(alm_empty), @@ -81,47 +82,91 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( // the commit stage `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!")) - wire initiate_ready; // FIXME: unused + wire initiate_ready; wire writeback_valid; wire writeback_last; + logic writeback_ready; wire metadata_valid = ~metadata_queue_emptys[0/*FIXME*/]; - // dequeue metadata at the last writeback - assign metadata_deq = metadata_valid && writeback_valid && writeback_last; + wire not_wait = metadata_valid && (execute_if_data_op_type[0] != `INST_TENSOR_HGMMA_WAIT); + // skip HGMMA_WAIT for kickoff + wire initiate_valid = metadata_valid && not_wait; + + // we're recycling execute_if.op_type as operands_if.op_type which might + // have a different width; let's be safe + `STATIC_ASSERT((`INST_ALU_BITS == `INST_OP_BITS), + ("static assertion failed: `INST_ALU_BITS != `INST_OP_BITS")) VX_tensor_hopper_core #( ) 
tensor_hopper_core ( .clk(clk), .reset(reset), - .initiate_valid(metadata_valid), + .initiate_valid(initiate_valid), .initiate_wid(`NW_WIDTH'(0)/*FIXME*/), .initiate_ready(initiate_ready), .writeback_valid(writeback_valid), `UNUSED_PIN(writeback_wid), .writeback_last(writeback_last), - .writeback_ready(commit_if.ready) + .writeback_ready(writeback_ready) ); wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0; - assign commit_if.valid = writeback_valid; - assign commit_if.data.uuid = execute_if_data_uuid[0]; - assign commit_if.data.wid = execute_if_data_wid[0]; - assign commit_if.data.tmask = execute_if_data_tmask[0]; - assign commit_if.data.PC = execute_if_data_PC[0]; - assign commit_if.data.wb = writeback_last; - // custom rd - assign commit_if.data.rd = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(4'd3/*FIXME*/)); - assign commit_if.data.data = wb_data; - assign commit_if.data.tensor = writeback_last; - assign commit_if.data.pid = 1'b0; - assign commit_if.data.sop = 1'b1; - // eop is deliberately set so that we don't underflow the pending_instr - // buffer in VX_schedule. An instruction is considered committed only - // when the eop bit is set to one (see VX_commit). - assign commit_if.data.eop = writeback_last; + always @(*) begin + metadata_deq = 1'b0; + + // if there's something in the meta queue, give it priority for commit, + // since all HGMMA instructions are asynchronous and should not + // block + if (metadata_valid) begin + // block tensor core writeback + writeback_ready = 1'b0; + + commit_if.valid = metadata_valid; + commit_if.data.uuid = execute_if_data_uuid[0]; + commit_if.data.wid = execute_if_data_wid[0]; + commit_if.data.tmask = execute_if_data_tmask[0]; + commit_if.data.PC = execute_if_data_PC[0]; + commit_if.data.wb = execute_if_data_wb[0]; + commit_if.data.rd = execute_if_data_rd[0]; + commit_if.data.data = wb_data; // FIXME ?
+ commit_if.data.tensor = 1'b0; + commit_if.data.pid = 1'b0; + commit_if.data.sop = 1'b1; + commit_if.data.eop = 1'b1; + + // block meta queue until tensor core is ready. This will + // effectively stall further issue of async HGMMA when tensor core + // is busy with too many outstanding requests (depth of meta queue). + // be careful to not miss the commit backpressure. + metadata_deq = metadata_valid && commit_if.ready && initiate_ready; + end else begin + // allow tensor core writeback, provided there's no commit + // backpressure + writeback_ready = commit_if.ready; + + commit_if.valid = writeback_valid; + commit_if.data.uuid = '0; + commit_if.data.wid = '0; // FIXME + commit_if.data.tmask = {NUM_LANES{1'b1}}; + commit_if.data.PC = '0; + commit_if.data.wb = writeback_last; + commit_if.data.rd = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(4'd3/*FIXME*/)); + commit_if.data.data = wb_data; + // mark as "ghost" commit. This will prevent this commit from + // decrementing from pending_instr buffer + commit_if.data.tensor = 1'b1; + // eop is deliberately set so that we don't underflow the pending_instr + // buffer in VX_schedule. An instruction is considered committed only + // when the eop bit is set to one (see VX_commit). + // only the last ghost commit has eop set, which will trigger + // scoreboard to clear out the busy bit. 
+ commit_if.data.eop = writeback_last; + end + end + endmodule From 0f06afc3ef7350e82c008f5f25395abf89879213 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 21 Oct 2024 22:37:20 -0700 Subject: [PATCH 14/14] Update doc --- hw/rtl/core/VX_commit.sv | 2 +- hw/rtl/core/VX_tensor_hopper_core.sv | 3 --- hw/rtl/fpu/VX_tensor_dpu.sv | 2 +- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/hw/rtl/core/VX_commit.sv b/hw/rtl/core/VX_commit.sv index faca0a2a..cf4d92b4 100644 --- a/hw/rtl/core/VX_commit.sv +++ b/hw/rtl/core/VX_commit.sv @@ -177,7 +177,7 @@ module VX_commit import VX_gpu_pkg::*; #( // probably want to change this at some point // (i.e. pass a "don't count this towards pending instructions" signal down the pipeline) wire [`ISSUE_WIDTH-1:0] final_hmma; - // if this is a "ghost" commit generated from the tensor core, don't count + // if this is a "ghost" commit generated at the tensor core, don't count // toward committed wire [`ISSUE_WIDTH-1:0] tensor_ghost; `ifdef EXT_T_ENABLE diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv index 2b2136b6..b6302cc3 100644 --- a/hw/rtl/core/VX_tensor_hopper_core.sv +++ b/hw/rtl/core/VX_tensor_hopper_core.sv @@ -158,9 +158,6 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( // mark as "ghost" commit. This will prevent this commit from // decrementing from pending_instr buffer commit_if.data.tensor = 1'b1; - // eop is deliberately set so that we don't underflow the pending_instr - // buffer in VX_schedule. An instruction is considered committed only - // when the eop bit is set to one (see VX_commit). // only the last ghost commit has eop set, which will trigger // scoreboard to clear out the busy bit. 
commit_if.data.eop = writeback_last; diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv index 1cd2df84..0b5d846e 100644 --- a/hw/rtl/fpu/VX_tensor_dpu.sv +++ b/hw/rtl/fpu/VX_tensor_dpu.sv @@ -290,7 +290,7 @@ module VX_tensor_threadgroup #( // d_col_sel: 1, 3, 1, 3 // // substep 0: - // [ 0 x 2 x ] + // [ 0 x 2 x ] (0~3 is 'i', the dpu id) // [ 1 x 3 x ] // substep 1: // [ x 0 x 2 ]