Route tensor-core register-file reads through a VX_tc_rf_if interface.

The request is generated in VX_tensor_hopper_core_block (a placeholder counter
FSM for now) and threaded VX_tensor_core -> VX_execute -> VX_core -> VX_issue ->
VX_operands_dup, replacing the tc_rf_valid/tc_rf_addr/tc_rf_data ports of
VX_operands_dup and the counter FSM that used to live in VX_issue.

Review changes relative to the submitted patch: the stub's rs assignment is
size-cast to `NR_BITS, and the interface signals that nothing reads yet
(rsp_data in the stub, req_data.wis in VX_operands_dup) are marked unused.
Hunk line counts were updated accordingly; index hashes were left as submitted.

diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv
index c4db29d4..de95144f 100644
--- a/hw/rtl/core/VX_core.sv
+++ b/hw/rtl/core/VX_core.sv
@@ -63,6 +63,7 @@ module VX_core import VX_gpu_pkg::*; #(
     VX_decode_if decode_if();
     VX_sched_csr_if sched_csr_if();
     VX_decode_sched_if decode_sched_if();
+    VX_tc_rf_if tensor_regfile_if();
     VX_commit_sched_if commit_sched_if();
     VX_commit_csr_if commit_csr_if();
     VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS]();
@@ -190,6 +191,9 @@ module VX_core import VX_gpu_pkg::*; #(
     `endif
     `ifdef EXT_T_ENABLE
         .tensor_dispatch_if(tensor_dispatch_if),
+    `ifdef EXT_T_HOPPER
+        .tensor_regfile_if (tensor_regfile_if),
+    `endif
     `endif
         .sfu_dispatch_if(sfu_dispatch_if)
     );
@@ -221,8 +225,9 @@ module VX_core import VX_gpu_pkg::*; #(
         .tensor_dispatch_if (tensor_dispatch_if),
         .tensor_commit_if (tensor_commit_if),
     `ifdef EXT_T_HOPPER
-        .tensor_smem_A_if (tensor_smem_A_if),
-        .tensor_smem_B_if (tensor_smem_B_if),
+        .tensor_regfile_if (tensor_regfile_if),
+        .tensor_smem_A_if  (tensor_smem_A_if),
+        .tensor_smem_B_if  (tensor_smem_B_if),
     `endif
     `endif
 
diff --git a/hw/rtl/core/VX_execute.sv b/hw/rtl/core/VX_execute.sv
index c24fea7c..7cbb4ed2 100644
--- a/hw/rtl/core/VX_execute.sv
+++ b/hw/rtl/core/VX_execute.sv
@@ -59,6 +59,7 @@ module VX_execute import VX_gpu_pkg::*; #(
     VX_dispatch_if.slave tensor_dispatch_if [`ISSUE_WIDTH],
     VX_commit_if.master tensor_commit_if [`ISSUE_WIDTH],
 `ifdef EXT_T_HOPPER
+    VX_tc_rf_if.master tensor_regfile_if,
     VX_tc_bus_if.master tensor_smem_A_if,
     VX_tc_bus_if.master tensor_smem_B_if,
 `endif
@@ -156,6 +157,7 @@ module VX_execute import VX_gpu_pkg::*; #(
 
         .dispatch_if(tensor_dispatch_if),
     `ifdef EXT_T_HOPPER
+        .regfile_if(tensor_regfile_if),
         .smem_A_if(tensor_smem_A_if),
         .smem_B_if(tensor_smem_B_if),
     `endif
diff --git a/hw/rtl/core/VX_issue.sv b/hw/rtl/core/VX_issue.sv
index 10a46121..8a641c55 100644
--- a/hw/rtl/core/VX_issue.sv
+++ b/hw/rtl/core/VX_issue.sv
@@ -14,7 +14,7 @@
 `include "VX_define.vh"
 `include "VX_trace.vh"
 
-module VX_issue #(
+module VX_issue import VX_gpu_pkg::*; #(
     parameter CORE_ID = 0
 ) (
     `SCOPE_IO_DECL
@@ -36,6 +36,9 @@ module VX_issue #(
 `endif
 `ifdef EXT_T_ENABLE
     VX_dispatch_if.master tensor_dispatch_if [`ISSUE_WIDTH],
+`ifdef EXT_T_HOPPER
+    VX_tc_rf_if.slave tensor_regfile_if,
+`endif
 `endif
     VX_dispatch_if.master sfu_dispatch_if [`ISSUE_WIDTH]
 );
@@ -75,22 +78,6 @@ module VX_issue #(
         .scoreboard_if (scoreboard_if)
     );
 
-
-    // /*
-    // fake fsm driving tc output
-    reg [11:0] counter;
-    wire tc_rf_valid;
-    wire [4:0] tc_rf_addr;
-    always @(posedge clk) begin
-        if (reset) begin
-            counter <= 12'd1;
-        end else begin
-            counter <= counter + 12'd1;
-        end
-    end
-    assign tc_rf_valid = (counter[6:0] == 7'd0);
-    assign tc_rf_addr = counter[11:7];
-    // */
 `ifdef GPR_DUPLICATED
     VX_operands_dup #(
 `else
@@ -104,11 +91,7 @@ module VX_issue #(
         .writeback_if (writeback_if),
         .scoreboard_if (scoreboard_if),
         .operands_if (operands_if),
-`ifdef GPR_DUPLICATED
-        .tc_rf_valid ('{`ISSUE_WIDTH{tc_rf_valid}}),
-        .tc_rf_addr ('{`ISSUE_WIDTH{tc_rf_addr}}),
-        .tc_rf_data ()
-`endif
+        .tensor_regfile_if (tensor_regfile_if)
     );
 
     VX_dispatch #(
diff --git a/hw/rtl/core/VX_operands_dup.sv b/hw/rtl/core/VX_operands_dup.sv
index e588655d..cfa6e8e4 100644
--- a/hw/rtl/core/VX_operands_dup.sv
+++ b/hw/rtl/core/VX_operands_dup.sv
@@ -24,11 +24,8 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
 
     VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
     VX_ibuffer_if.slave scoreboard_if [`ISSUE_WIDTH],
-    VX_operands_if.master operands_if [`ISSUE_WIDTH],
-
-    input wire tc_rf_valid [`ISSUE_WIDTH],
-    input wire [`LOG2UP(`NUM_REGS * ISSUE_RATIO)-1:0] tc_rf_addr [`ISSUE_WIDTH],
-    output wire [`NUM_THREADS-1:0][`XLEN-1:0] tc_rf_data [`ISSUE_WIDTH]
+    VX_tc_rf_if.slave tensor_regfile_if,
+    VX_operands_if.master operands_if [`ISSUE_WIDTH]
 );
     `UNUSED_PARAM (CORE_ID)
     localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS;
@@ -47,6 +44,20 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
     logic [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] empty2;
     logic [`ISSUE_WIDTH-1:0][2:0] size1;
 
+    wire tc_rf_valid [`ISSUE_WIDTH];
+    wire [`LOG2UP(`NUM_REGS * ISSUE_RATIO)-1:0] tc_rf_addr [`ISSUE_WIDTH];
+    // FIXME: don't need full ISSUE_WIDTH; only one warp is read at a time
+    // because NUM_BLOCKS == 1
+    wire [`NUM_THREADS-1:0][`XLEN-1:0] tc_rf_data [`ISSUE_WIDTH];
+
+    `STATIC_ASSERT((ISSUE_RATIO == 1),
+        ("static assertion failed: tensor core only supports ISSUE_RATIO == 1"))
+    assign tc_rf_valid = '{`ISSUE_WIDTH{tensor_regfile_if.req_valid}};
+    assign tc_rf_addr = '{`ISSUE_WIDTH{tensor_regfile_if.req_data.rs}};
+    assign tensor_regfile_if.rsp_data.data = tc_rf_data[0];
+    // req_data.wis is not read in this module; mark it so lint stays clean
+    `UNUSED_VAR (tensor_regfile_if.req_data.wis)
+
     for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
 
         always @(posedge clk) begin
@@ -104,7 +115,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
             .size (size1[i])
         );
         assign operands_if[i].valid = ~empty1[i];
-        assign scoreboard_if[i].ready = (size1[i] < 2'd2) && ~tc_rf_valid[i];
+        assign scoreboard_if[i].ready = (size1[i] < 3'd2) && ~tc_rf_valid[i];
 
         // assert (full1[i] == full2[i]);
         // assert (empty1[i] == empty2[i]);
@@ -207,10 +218,10 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
     end
 
 `ifdef GPR_RESET
-    reg wr_enabled = 0;
+    reg wr_enabled = 1'b0;
     always @(posedge clk) begin
         if (reset) begin
-            wr_enabled <= 1;
+            wr_enabled <= 1'b1;
         end
     end
 `endif
diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index 6d3a8a4a..916340c4 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -9,6 +9,7 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
 
     VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
 `ifdef EXT_T_HOPPER
+    VX_tc_rf_if.master regfile_if,
     VX_tc_bus_if.master smem_A_if,
     VX_tc_bus_if.master smem_B_if,
 `endif
@@ -63,6 +64,7 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
             .clk (clk),
             .reset (reset),
             .execute_if (execute_if[block_idx]),
+            .regfile_if (regfile_if),
             .smem_A_if (smem_A_if),
             .smem_B_if (smem_B_if),
             .commit_if (commit_block_if[block_idx])
diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv
index 40a66640..c6f9d4dd 100644
--- a/hw/rtl/core/VX_tensor_hopper_core.sv
+++ b/hw/rtl/core/VX_tensor_hopper_core.sv
@@ -9,6 +9,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
     input reset,
 
     VX_execute_if.slave execute_if,
+    VX_tc_rf_if.master regfile_if,
     VX_tc_bus_if.master smem_A_if,
     VX_tc_bus_if.master smem_B_if,
     VX_commit_if.master commit_if
@@ -104,6 +105,25 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
     `STATIC_ASSERT((`XLEN == 32),
         ("static assertion failed: tensor_hopper_core only supports XLEN == 32"))
 
+    // /*
+    // fake fsm driving tc rf port
+    reg [11:0] counter;
+    always @(posedge clk) begin
+        if (reset) begin
+            counter <= 12'd1;
+        end else begin
+            counter <= counter + 12'd1;
+        end
+    end
+    assign regfile_if.req_valid = (counter[3:0] != 4'd0);
+    assign regfile_if.req_data.wis = '0;
+    // counter[11:7] is only 5 bits wide; size-cast it to the rs field so the
+    // assignment stays width-clean when `NR_BITS is larger than 5
+    assign regfile_if.req_data.rs = `NR_BITS'(counter[11:7]);
+    // the stub does not consume the read data yet
+    `UNUSED_VAR (regfile_if.rsp_data)
+    // */
+
     TensorCoreDecoupled tensor_hopper_core (
         .clock(clk),
         .reset(reset),
diff --git a/hw/rtl/mem/VX_tc_rf_if.sv b/hw/rtl/mem/VX_tc_rf_if.sv
new file mode 100644
index 00000000..b12cd0dc
--- /dev/null
+++ b/hw/rtl/mem/VX_tc_rf_if.sv
@@ -0,0 +1,46 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_define.vh"
+
+// Tensor-core register-file port: req_valid/req_data travel one way and
+// rsp_data the other. It declares no ready and no response-valid signal.
+interface VX_tc_rf_if import VX_gpu_pkg::*; ();
+
+    // Request payload.
+    typedef struct packed {
+        logic [ISSUE_WIS_W-1:0] wis; // presumably warp index in the issue slice -- TODO confirm
+        logic [`NR_BITS-1:0]    rs;  // `NR_BITS-wide index; VX_operands_dup uses it as tc_rf_addr
+    } req_data_t;
+
+    // Response payload: one `XLEN-bit word for each of `NUM_THREADS threads.
+    typedef struct packed {
+        logic [`NUM_THREADS-1:0][`XLEN-1:0] data;
+    } rsp_data_t;
+
+    logic      req_valid;
+    req_data_t req_data;
+    rsp_data_t rsp_data;
+
+    // master drives the request signals and receives rsp_data
+    modport master (
+        output req_valid, req_data,
+        input  rsp_data
+    );
+
+    // slave receives the request signals and drives rsp_data
+    modport slave (
+        input  req_valid, req_data,
+        output rsp_data
+    );
+endinterface