diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh
index 8529105d..e8bb56fc 100644
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@@ -40,6 +40,10 @@
 `define EXT_F_ENABLE
 `endif
 
+`ifndef EXT_T_DISABLE // tensor extension is on unless EXT_T_DISABLE is defined
+`define EXT_T_ENABLE
+`endif
+
 `ifndef XLEN_32
 `ifndef XLEN_64
 `define XLEN_32
@@ -618,6 +622,12 @@
     `define EXT_F_ENABLED 0
 `endif
 
+`ifdef EXT_T_ENABLE // numeric 0/1 mirror of EXT_T_ENABLE
+    `define EXT_T_ENABLED 1
+`else
+    `define EXT_T_ENABLED 0
+`endif
+
 `ifdef EXT_M_ENABLE
     `define EXT_M_ENABLED 1
 `else
diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh
index 9ddeeeea..bb96a149 100644
--- a/hw/rtl/VX_define.vh
+++ b/hw/rtl/VX_define.vh
@@ -58,8 +58,9 @@
 `define EX_LSU 1
 `define EX_SFU 2
 `define EX_FPU (`EX_SFU + `EXT_F_ENABLED)
+`define EX_TENSOR (`EX_FPU + `EXT_T_ENABLED)
 
-`define NUM_EX_UNITS (3 + `EXT_F_ENABLED)
+`define NUM_EX_UNITS (3 + `EXT_F_ENABLED + `EXT_T_ENABLED)
 `define EX_BITS `CLOG2(`NUM_EX_UNITS)
 `define EX_WIDTH `UP(`EX_BITS)
 
@@ -253,6 +254,8 @@
 `define INST_SFU_IS_WCTL(op) (op <= 5)
 `define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8)
 
+`define INST_TENSOR_HMMA 4'b0000
+
 ///////////////////////////////////////////////////////////////////////////////
 
 // non-cacheable tag bits
diff --git a/hw/rtl/core/VX_commit.sv b/hw/rtl/core/VX_commit.sv
index 09667d11..227104df 100644
--- a/hw/rtl/core/VX_commit.sv
+++ b/hw/rtl/core/VX_commit.sv
@@ -27,6 +27,10 @@ module VX_commit import VX_gpu_pkg::*; #(
 `endif
     VX_commit_if.slave sfu_commit_if [`ISSUE_WIDTH],
 
+`ifdef EXT_T_ENABLE // tensor core commit port, one per issue slot
+    VX_commit_if.slave tensor_commit_if [`ISSUE_WIDTH],
+`endif
+
     // outputs
     VX_writeback_if.master writeback_if [`ISSUE_WIDTH],
     VX_commit_csr_if.master commit_csr_if,
@@ -65,6 +69,9 @@ module VX_commit import VX_gpu_pkg::*; #(
                 sfu_commit_if[i].valid,
             `ifdef EXT_F_ENABLE
                 fpu_commit_if[i].valid,
+            `endif
+            `ifdef EXT_T_ENABLE // tensor entry sits between FPU and ALU in all three lists
+                tensor_commit_if[i].valid,
             `endif
                 alu_commit_if[i].valid,
                 lsu_commit_if[i].valid
@@ -73,6 +80,9 @@ module VX_commit import VX_gpu_pkg::*; #(
                 sfu_commit_if[i].ready,
             `ifdef EXT_F_ENABLE
                 fpu_commit_if[i].ready,
+            `endif
+            `ifdef EXT_T_ENABLE // same position as in the valid list
+                tensor_commit_if[i].ready,
             `endif
                 alu_commit_if[i].ready,
                 lsu_commit_if[i].ready
@@ -81,6 +91,9 @@ module VX_commit import VX_gpu_pkg::*; #(
                 sfu_commit_if[i].data,
             `ifdef EXT_F_ENABLE
                 fpu_commit_if[i].data,
+            `endif
+            `ifdef EXT_T_ENABLE // same position as in the valid list
+                tensor_commit_if[i].data,
             `endif
                 alu_commit_if[i].data,
                 lsu_commit_if[i].data
@@ -157,7 +170,18 @@ module VX_commit import VX_gpu_pkg::*; #(
 
     // Committed instructions
 
-    wire [`ISSUE_WIDTH-1:0] committed = commit_fire & commit_eop;
+    // temporary hack to not underflow the pending instructions buffer
+    wire [`ISSUE_WIDTH-1:0] final_hmma; // bit is 0 only when a tensor commit fires with rd != 32 + 23
+`ifdef EXT_T_ENABLE
+    for (genvar isw = 0; isw < `ISSUE_WIDTH; ++isw) begin
+        assign final_hmma[isw] = !(tensor_commit_if[isw].valid && tensor_commit_if[isw].ready) || (tensor_commit_if[isw].data.rd == `NR_BITS'(32 + 23));
+    end
+`else
+    assign final_hmma = '1; // no tensor unit: never mask a commit
+`endif
+
+
+    wire [`ISSUE_WIDTH-1:0] committed = final_hmma & (commit_fire & commit_eop);
 
     VX_pipe_register #(
         .DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv
index dde085a8..41f54b95 100644
--- a/hw/rtl/core/VX_core.sv
+++ b/hw/rtl/core/VX_core.sv
@@ -65,6 +65,10 @@ module VX_core import VX_gpu_pkg::*; #(
 `ifdef EXT_F_ENABLE
     VX_dispatch_if fpu_dispatch_if[`ISSUE_WIDTH]();
     VX_commit_if fpu_commit_if[`ISSUE_WIDTH]();
+`endif
+`ifdef EXT_T_ENABLE // dispatch/commit links for the tensor core, one per issue slot
+    VX_dispatch_if tensor_dispatch_if[`ISSUE_WIDTH]();
+    VX_commit_if tensor_commit_if[`ISSUE_WIDTH]();
 `endif
     VX_dispatch_if sfu_dispatch_if[`ISSUE_WIDTH]();
     VX_commit_if sfu_commit_if[`ISSUE_WIDTH]();
@@ -172,6 +176,9 @@ module VX_core import VX_gpu_pkg::*; #(
         .lsu_dispatch_if(lsu_dispatch_if),
     `ifdef EXT_F_ENABLE
         .fpu_dispatch_if(fpu_dispatch_if),
+    `endif
+    `ifdef EXT_T_ENABLE
+        .tensor_dispatch_if(tensor_dispatch_if),
     `endif
         .sfu_dispatch_if(sfu_dispatch_if)
     );
@@ -197,6 +204,10 @@ module VX_core import VX_gpu_pkg::*; #(
         .fpu_dispatch_if(fpu_dispatch_if),
         .fpu_commit_if (fpu_commit_if),
     `endif
+    `ifdef EXT_T_ENABLE
+        .tensor_commit_if (tensor_commit_if),
+        .tensor_dispatch_if (tensor_dispatch_if),
+    `endif
 
         .commit_csr_if (commit_csr_if),
         .sched_csr_if (sched_csr_if),
@@ -227,6 +238,9 @@ module VX_core import VX_gpu_pkg::*; #(
         .fpu_commit_if (fpu_commit_if),
     `endif
         .sfu_commit_if (sfu_commit_if),
+    `ifdef EXT_T_ENABLE
+        .tensor_commit_if (tensor_commit_if),
+    `endif
 
         .writeback_if (writeback_if),
 
diff --git a/hw/rtl/core/VX_decode.sv b/hw/rtl/core/VX_decode.sv
index 42cd7ffc..1d38c0b2 100644
--- a/hw/rtl/core/VX_decode.sv
+++ b/hw/rtl/core/VX_decode.sv
@@ -533,6 +533,12 @@ module VX_decode #(
                     default:;
                 endcase
             end
+        `ifdef EXT_T_ENABLE
+            `INST_EXT4: begin // this opcode is routed to the tensor unit as an HMMA
+                op_type = `INST_TENSOR_HMMA;
+                ex_type = `EX_TENSOR;
+            end
+        `endif
             default:;
         endcase
     end
diff --git a/hw/rtl/core/VX_dispatch.sv b/hw/rtl/core/VX_dispatch.sv
index 61d857c5..b8288529 100644
--- a/hw/rtl/core/VX_dispatch.sv
+++ b/hw/rtl/core/VX_dispatch.sv
@@ -31,6 +31,9 @@ module VX_dispatch import VX_gpu_pkg::*; #(
     VX_dispatch_if.master lsu_dispatch_if [`ISSUE_WIDTH],
 `ifdef EXT_F_ENABLE
     VX_dispatch_if.master fpu_dispatch_if [`ISSUE_WIDTH],
+`endif
+`ifdef EXT_T_ENABLE
+    VX_dispatch_if.master tensor_dispatch_if [`ISSUE_WIDTH],
 `endif
     VX_dispatch_if.master sfu_dispatch_if [`ISSUE_WIDTH]
 );
@@ -139,6 +142,35 @@ module VX_dispatch import VX_gpu_pkg::*; #(
     end
 `endif
 
+    // Tensor Core dispatch
+
+`ifdef EXT_T_ENABLE
+
+    VX_operands_if tensor_operands_if[`ISSUE_WIDTH]();
+
+    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
+        assign tensor_operands_if[i].data = operands_if[i].data;
+        assign tensor_operands_if[i].valid = (operands_if[i].data.ex_type == `EX_TENSOR) && operands_if[i].valid; // only tensor-typed operands enter this buffer
+
+        `RESET_RELAY (tensor_reset, reset);
+
+        VX_elastic_buffer #(
+            .DATAW (DATAW),
+            .OUT_REG (2),
+            .SIZE (2)
+        ) tensor_buffer (
+            .clk (clk),
+            .reset (tensor_reset),
+            .valid_in (tensor_operands_if[i].valid),
+            .data_in (`TO_DISPATCH_DATA(tensor_operands_if[i].data, last_active_tid[i])),
+            .ready_in (tensor_operands_if[i].ready),
+            .valid_out (tensor_dispatch_if[i].valid),
+            .data_out (tensor_dispatch_if[i].data),
+            .ready_out (tensor_dispatch_if[i].ready)
+        );
+    end
+`endif
+
     // SFU dispatch
 
     VX_operands_if sfu_operands_if[`ISSUE_WIDTH]();
@@ -171,6 +203,9 @@ module VX_dispatch import VX_gpu_pkg::*; #(
                                   || (lsu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_LSU))
                                 `ifdef EXT_F_ENABLE
                                   || (fpu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_FPU))
+                                `endif
+                                `ifdef EXT_T_ENABLE
+                                  || ((operands_if[i].data.ex_type == `EX_TENSOR) && tensor_operands_if[i].ready)
                                 `endif
                                   || (sfu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU));
     end
diff --git a/hw/rtl/core/VX_execute.sv b/hw/rtl/core/VX_execute.sv
index f1ea2675..cdf17a31 100644
--- a/hw/rtl/core/VX_execute.sv
+++ b/hw/rtl/core/VX_execute.sv
@@ -41,7 +41,7 @@ module VX_execute import VX_gpu_pkg::*; #(
     VX_dispatch_if.slave fpu_dispatch_if [`ISSUE_WIDTH],
     VX_commit_if.master fpu_commit_if [`ISSUE_WIDTH],
 `endif
-    
+
     VX_dispatch_if.slave alu_dispatch_if [`ISSUE_WIDTH],
     VX_commit_if.master alu_commit_if [`ISSUE_WIDTH],
     VX_branch_ctl_if.master branch_ctl_if [`NUM_ALU_BLOCKS],
@@ -53,6 +53,11 @@ module VX_execute import VX_gpu_pkg::*; #(
     VX_commit_if.master sfu_commit_if [`ISSUE_WIDTH],
     VX_warp_ctl_if.master warp_ctl_if,
 
+`ifdef EXT_T_ENABLE // tensor core ports, forwarded unchanged to VX_tensor_core below
+    VX_dispatch_if.slave tensor_dispatch_if [`ISSUE_WIDTH],
+    VX_commit_if.master tensor_commit_if [`ISSUE_WIDTH],
+`endif
+
     // simulation helper signals
     output wire sim_ebreak
 );
@@ -127,6 +132,18 @@ module VX_execute import VX_gpu_pkg::*; #(
         .commit_if (sfu_commit_if)
     );
 
+`ifdef EXT_T_ENABLE
+    VX_tensor_core #(
+
+    ) tensor_core (
+        .clk (clk),
+        .reset (reset),
+
+        .dispatch_if (tensor_dispatch_if),
+        .commit_if (tensor_commit_if)
+    );
+`endif
+
     // simulation helper signal to get RISC-V tests Pass/Fail status
     assign sim_ebreak = alu_dispatch_if[0].valid && alu_dispatch_if[0].ready
                      && alu_dispatch_if[0].data.wis == 0
diff --git a/hw/rtl/core/VX_ibuffer.sv b/hw/rtl/core/VX_ibuffer.sv
index b465c195..c81d48c4 100644
--- a/hw/rtl/core/VX_ibuffer.sv
+++ b/hw/rtl/core/VX_ibuffer.sv
@@ -36,6 +36,8 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
 
     assign decode_if.ready = ibuf_ready_in[decode_isw];
 
+    VX_ibuffer_if uop_sequencer_if [`ISSUE_WIDTH](); // elastic buffer output, consumed by the uop sequencer
+
     for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
         VX_elastic_buffer #(
             .DATAW (DATAW),
@@ -62,13 +64,24 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
                 decode_if.data.rs1,
                 decode_if.data.rs2,
                 decode_if.data.rs3}),
-            .data_out(ibuffer_if[i].data),
-            .valid_out (ibuffer_if[i].valid),
-            .ready_out(ibuffer_if[i].ready)
-        );
+
+            .data_out (uop_sequencer_if[i].data),
+            .valid_out (uop_sequencer_if[i].valid),
+            .ready_out (uop_sequencer_if[i].ready)
+        );
+
     `ifndef L1_ENABLE
-        assign decode_if.ibuf_pop[i] = ibuffer_if[i].valid && ibuffer_if[i].ready;
+        assign decode_if.ibuf_pop[i] = uop_sequencer_if[i].valid && uop_sequencer_if[i].ready;
     `endif
+
+        VX_uop_sequencer uop_sequencer (
+            .clk(clk),
+            .reset(reset),
+
+            .uop_sequencer_if(uop_sequencer_if[i]),
+            .ibuffer_if(ibuffer_if[i])
+        );
+
     end
 
 endmodule
diff --git a/hw/rtl/core/VX_issue.sv b/hw/rtl/core/VX_issue.sv
index 1ba4ca28..614451c2 100644
--- a/hw/rtl/core/VX_issue.sv
+++ b/hw/rtl/core/VX_issue.sv
@@ -33,6 +33,9 @@ module VX_issue #(
     VX_dispatch_if.master lsu_dispatch_if [`ISSUE_WIDTH],
 `ifdef EXT_F_ENABLE
     VX_dispatch_if.master fpu_dispatch_if [`ISSUE_WIDTH],
+`endif
+`ifdef EXT_T_ENABLE
+    VX_dispatch_if.master tensor_dispatch_if [`ISSUE_WIDTH],
 `endif
     VX_dispatch_if.master sfu_dispatch_if [`ISSUE_WIDTH]
 );
@@ -92,6 +95,9 @@ module VX_issue #(
         .lsu_dispatch_if(lsu_dispatch_if),
     `ifdef EXT_F_ENABLE
         .fpu_dispatch_if(fpu_dispatch_if),
+    `endif
+    `ifdef EXT_T_ENABLE
+        .tensor_dispatch_if(tensor_dispatch_if),
     `endif
         .sfu_dispatch_if(sfu_dispatch_if)
     );
diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
new file mode 100644
index 00000000..c31f3f9f
--- /dev/null
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -0,0 +1,15 @@
+`include "VX_fpu_define.vh"
+
+module VX_tensor_core #(
+
+) (
+    input clk,
+    input reset,
+
+    VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
+    VX_commit_if.master commit_if [`ISSUE_WIDTH]
+);
+    `STATIC_ASSERT(`NUM_THREADS == 32, ("tensor core requires # of threads in a warp to be 32"));
+    `UNUSED_VAR(clk);
+    `UNUSED_VAR(reset);
+endmodule
diff --git a/hw/rtl/core/VX_uop_sequencer.sv b/hw/rtl/core/VX_uop_sequencer.sv
new file mode 100644
index 00000000..f18e473e
--- /dev/null
+++ b/hw/rtl/core/VX_uop_sequencer.sv
@@ -0,0 +1,194 @@
+`include "VX_define.vh"
+
+// Encodes floating-point register f<x> as a register index: MSB set, x in the
+// low `NRI_BITS bits (see the rd=f16 / rs1=f0 annotations in the uop table).
+`define FREG(x) {1'b1, `NRI_BITS'(x)}
+
+// Micro-op sequencer between the instruction buffer output and ibuffer_if.
+// An instruction whose ex_type is `EX_TENSOR is replaced by the micro-ops read
+// from the table below; every other instruction passes through unchanged.
+// Without EXT_T_ENABLE the module is a plain pass-through.
+module VX_uop_sequencer import VX_gpu_pkg::*; (
+    input clk,
+    input reset,
+
+    VX_ibuffer_if.slave uop_sequencer_if,
+    VX_ibuffer_if.master ibuffer_if
+);
+
+`ifdef EXT_T_ENABLE
+    localparam UOP_TABLE_SIZE = 64;
+    localparam UPC_BITS = `CLOG2(UOP_TABLE_SIZE);
+
+    localparam NEXT = 2'b00;
+    localparam FINISH = 2'b01;
+
+    localparam UBR_BITS = 2;
+
+    // uop metadata (sequencing, next state), execution metadata (EX_TYPE, OP_TYPE, OP_MOD), wb, use pc, use imm, pc, imm, rd, rs1, rs2, rs3
+    localparam UOP_TABLE_WIDTH = UBR_BITS + UPC_BITS + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + (`NR_BITS * 4);
+    localparam IBUFFER_IF_DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4);
+
+    // declared ahead of first use: the table reads upc, the nets further down read use_uop / use_uop_1d
+    logic [UOP_TABLE_WIDTH-1:0] uop;
+    logic [UPC_BITS-1:0] upc, upc_r, upc_n;
+    logic use_uop, use_uop_1d;
+
+    // reserve space at start of table for more uop sequences
+    localparam HMMA_SET0_STEP0_0 = UPC_BITS'(0);
+    localparam HMMA_SET0_STEP0_1 = UPC_BITS'(8);
+    /*
+    localparam HMMA_SET0_STEP1_0 = UPC_BITS'(9);
+    localparam HMMA_SET0_STEP1_1 = UPC_BITS'(10);
+    localparam HMMA_SET0_STEP2_0 = UPC_BITS'(11);
+    localparam HMMA_SET0_STEP2_1 = UPC_BITS'(12);
+    localparam HMMA_SET0_STEP3_0 = UPC_BITS'(13);
+    localparam HMMA_SET0_STEP3_1 = UPC_BITS'(14);
+
+    localparam HMMA_SET1_STEP0_0 = UPC_BITS'(15);
+    localparam HMMA_SET1_STEP0_1 = UPC_BITS'(16);
+    localparam HMMA_SET1_STEP1_0 = UPC_BITS'(17);
+    localparam HMMA_SET1_STEP1_1 = UPC_BITS'(18);
+    localparam HMMA_SET1_STEP2_0 = UPC_BITS'(19);
+    localparam HMMA_SET1_STEP2_1 = UPC_BITS'(20);
+    localparam HMMA_SET1_STEP3_0 = UPC_BITS'(21);
+    localparam HMMA_SET1_STEP3_1 = UPC_BITS'(22);
+
+    localparam HMMA_SET2_STEP0_0 = UPC_BITS'(23);
+    localparam HMMA_SET2_STEP0_1 = UPC_BITS'(24);
+    localparam HMMA_SET2_STEP1_0 = UPC_BITS'(25);
+    localparam HMMA_SET2_STEP1_1 = UPC_BITS'(26);
+    localparam HMMA_SET2_STEP2_0 = UPC_BITS'(27);
+    localparam HMMA_SET2_STEP2_1 = UPC_BITS'(28);
+    localparam HMMA_SET2_STEP3_0 = UPC_BITS'(29);
+    localparam HMMA_SET2_STEP3_1 = UPC_BITS'(30);
+
+    localparam HMMA_SET3_STEP0_0 = UPC_BITS'(31);
+    localparam HMMA_SET3_STEP0_1 = UPC_BITS'(32);
+    localparam HMMA_SET3_STEP1_0 = UPC_BITS'(33);
+    localparam HMMA_SET3_STEP1_1 = UPC_BITS'(34);
+    localparam HMMA_SET3_STEP2_0 = UPC_BITS'(35);
+    localparam HMMA_SET3_STEP2_1 = UPC_BITS'(36);
+    localparam HMMA_SET3_STEP3_0 = UPC_BITS'(37);
+    localparam HMMA_SET3_STEP3_1 = UPC_BITS'(38);
+    */
+    // register layout: f0-f7 used for A, f8-f15 used for B, f16-f23 used for C
+
+    // microcode table: upc selects the entry; PC/imm fields are `XLEN wide to match UOP_TABLE_WIDTH
+    always @(*) begin
+        case (upc)
+            HMMA_SET0_STEP0_0: begin
+                uop = {
+                    NEXT,
+                    HMMA_SET0_STEP0_1,
+                    `EX_BITS'(`EX_TENSOR),
+                    `INST_OP_BITS'(0), // denotes that the first half is being computed
+                    `INST_MOD_BITS'(0), // field is unused for HMMA
+                    1'b1, // write back
+                    1'b0, // don't use PC
+                    1'b0, // don't use immediate
+                    `XLEN'(0), // PC is unused - TODO: don't send a bogus PC down the pipeline as it is very confusing in trace
+                    `XLEN'(0), // immediate is unused
+                    `FREG(16), // rd=f16
+                    `FREG(0), // rs1=f0,
+                    `FREG(8), // rs2=f8
+                    `FREG(16) // rs3=f16
+                };
+            end
+            HMMA_SET0_STEP0_1: begin
+                uop = {
+                    FINISH,
+                    HMMA_SET0_STEP0_0,
+                    `EX_BITS'(`EX_TENSOR),
+                    `INST_OP_BITS'(1), // denotes that the second half is being computed
+                    `INST_MOD_BITS'(0), // field is unused for HMMA
+                    1'b1, // write back
+                    1'b0, // don't use PC
+                    1'b0, // don't use immediate
+                    `XLEN'(0), // PC is unused - TODO: don't send a bogus PC down the pipeline as it is very confusing in trace
+                    `XLEN'(0), // immediate is unused
+                    `FREG(17), // rd=f17
+                    `FREG(1), // rs1=f1,
+                    `FREG(9), // rs2=f9
+                    `FREG(17) // rs3=f17
+                };
+            end
+            default: begin
+                uop = '0;
+            end
+        endcase
+    end
+
+    // These are nets on purpose: a net declaration assignment is a continuous assignment,
+    // whereas an initializer on a `logic` variable is only evaluated once at time zero.
+    wire [UBR_BITS-1:0] ubr = uop[UOP_TABLE_WIDTH-1:UOP_TABLE_WIDTH-UBR_BITS];
+    wire [UPC_BITS-1:0] next_upc = uop[UOP_TABLE_WIDTH-UBR_BITS-1:UOP_TABLE_WIDTH-UBR_BITS-UPC_BITS];
+
+    wire uop_fire = use_uop && ibuffer_if.valid && ibuffer_if.ready;
+    wire uop_start = ~use_uop_1d && use_uop;
+    wire uop_finish = use_uop && uop_sequencer_if.valid && uop_sequencer_if.ready;
+
+    // merging the 2 always blocks leads to spurious UNOPTFLAT verilator lint, but conceptually they should be linked
+    always @(*) begin
+        use_uop = uop_sequencer_if.valid && uop_sequencer_if.data.ex_type == `EX_TENSOR;
+
+        if (uop_start) begin
+            // 1st cycle of microcoded operation, use op_type to determine entry point into microcode table
+            upc_n = UPC_BITS'(uop_sequencer_if.data.op_type);
+        end
+        else begin
+            upc_n = upc;
+        end
+
+        if (uop_fire) begin
+            upc_n = next_upc;
+        end
+    end
+
+    always @(*) begin
+        if (uop_start) begin
+            // 1st cycle of microcoded operation, use op_type to determine entry point into microcode table
+            upc = UPC_BITS'(uop_sequencer_if.data.op_type);
+        end
+        else begin
+            upc = upc_r;
+        end
+    end
+
+    // copy UUID, wis, tmask from microcoded instruction
+    wire [IBUFFER_IF_DATAW-1:0] ibuffer_output = {
+        uop_sequencer_if.data.uuid,
+        uop_sequencer_if.data.wis,
+        uop_sequencer_if.data.tmask,
+        uop[UOP_TABLE_WIDTH-UBR_BITS-UPC_BITS-1:0]
+    };
+
+    assign ibuffer_if.valid = use_uop ? 1'b1 : uop_sequencer_if.valid;
+    assign uop_sequencer_if.ready = use_uop ? (uop_fire && ubr == FINISH) : ibuffer_if.ready;
+    assign ibuffer_if.data = use_uop ? ibuffer_output : uop_sequencer_if.data;
+
+    always @(posedge clk) begin
+        if (reset) begin
+            upc_r <= '0;
+            use_uop_1d <= '0;
+        end
+        else begin
+            upc_r <= upc_n;
+            if (uop_finish) begin
+                use_uop_1d <= 1'b0; // allow microcoded instructions to start immediately after each other
+            end
+            else begin
+                use_uop_1d <= use_uop;
+            end
+        end
+    end
+`else
+    `UNUSED_VAR(clk);
+    `UNUSED_VAR(reset);
+    assign ibuffer_if.valid = uop_sequencer_if.valid;
+    assign uop_sequencer_if.ready = ibuffer_if.ready;
+    assign ibuffer_if.data = uop_sequencer_if.data;
+`endif
+
+
+endmodule
diff --git a/hw/rtl/fpu/VX_tensor_core.sv b/hw/rtl/fpu/VX_tensor_core.sv
deleted file mode 100644
index e69de29b..00000000