tensor: Bore tensor regfile IO to execute units

This commit is contained in:
Hansung Kim
2024-10-24 20:32:18 -07:00
parent c88fd89f1f
commit 1bc4afe2bb
7 changed files with 95 additions and 32 deletions

View File

@@ -63,6 +63,7 @@ module VX_core import VX_gpu_pkg::*; #(
VX_decode_if decode_if();
VX_sched_csr_if sched_csr_if();
VX_decode_sched_if decode_sched_if();
VX_tc_rf_if tensor_regfile_if();
VX_commit_sched_if commit_sched_if();
VX_commit_csr_if commit_csr_if();
VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS]();
@@ -190,6 +191,9 @@ module VX_core import VX_gpu_pkg::*; #(
`endif
`ifdef EXT_T_ENABLE
.tensor_dispatch_if(tensor_dispatch_if),
`ifdef EXT_T_HOPPER
.tensor_regfile_if (tensor_regfile_if),
`endif
`endif
.sfu_dispatch_if(sfu_dispatch_if)
);
@@ -221,8 +225,9 @@ module VX_core import VX_gpu_pkg::*; #(
.tensor_dispatch_if (tensor_dispatch_if),
.tensor_commit_if (tensor_commit_if),
`ifdef EXT_T_HOPPER
.tensor_smem_A_if (tensor_smem_A_if),
.tensor_smem_B_if (tensor_smem_B_if),
.tensor_regfile_if (tensor_regfile_if),
.tensor_smem_A_if (tensor_smem_A_if),
.tensor_smem_B_if (tensor_smem_B_if),
`endif
`endif

View File

@@ -59,6 +59,7 @@ module VX_execute import VX_gpu_pkg::*; #(
VX_dispatch_if.slave tensor_dispatch_if [`ISSUE_WIDTH],
VX_commit_if.master tensor_commit_if [`ISSUE_WIDTH],
`ifdef EXT_T_HOPPER
VX_tc_rf_if.master tensor_regfile_if,
VX_tc_bus_if.master tensor_smem_A_if,
VX_tc_bus_if.master tensor_smem_B_if,
`endif
@@ -156,6 +157,7 @@ module VX_execute import VX_gpu_pkg::*; #(
.dispatch_if(tensor_dispatch_if),
`ifdef EXT_T_HOPPER
.regfile_if(tensor_regfile_if),
.smem_A_if(tensor_smem_A_if),
.smem_B_if(tensor_smem_B_if),
`endif

View File

@@ -14,7 +14,7 @@
`include "VX_define.vh"
`include "VX_trace.vh"
module VX_issue #(
module VX_issue import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
) (
`SCOPE_IO_DECL
@@ -36,6 +36,9 @@ module VX_issue #(
`endif
`ifdef EXT_T_ENABLE
VX_dispatch_if.master tensor_dispatch_if [`ISSUE_WIDTH],
`ifdef EXT_T_HOPPER
VX_tc_rf_if.slave tensor_regfile_if,
`endif
`endif
VX_dispatch_if.master sfu_dispatch_if [`ISSUE_WIDTH]
);
@@ -75,22 +78,6 @@ module VX_issue #(
.scoreboard_if (scoreboard_if)
);
// /*
// fake fsm driving tc output
reg [11:0] counter;
wire tc_rf_valid;
wire [4:0] tc_rf_addr;
always @(posedge clk) begin
if (reset) begin
counter <= 12'd1;
end else begin
counter <= counter + 12'd1;
end
end
assign tc_rf_valid = (counter[6:0] == 7'd0);
assign tc_rf_addr = counter[11:7];
// */
`ifdef GPR_DUPLICATED
VX_operands_dup #(
`else
@@ -104,11 +91,7 @@ module VX_issue #(
.writeback_if (writeback_if),
.scoreboard_if (scoreboard_if),
.operands_if (operands_if),
`ifdef GPR_DUPLICATED
.tc_rf_valid ('{`ISSUE_WIDTH{tc_rf_valid}}),
.tc_rf_addr ('{`ISSUE_WIDTH{tc_rf_addr}}),
.tc_rf_data ()
`endif
.tensor_regfile_if (tensor_regfile_if)
);
VX_dispatch #(

View File

@@ -24,11 +24,8 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_ibuffer_if.slave scoreboard_if [`ISSUE_WIDTH],
VX_operands_if.master operands_if [`ISSUE_WIDTH],
input wire tc_rf_valid [`ISSUE_WIDTH],
input wire [`LOG2UP(`NUM_REGS * ISSUE_RATIO)-1:0] tc_rf_addr [`ISSUE_WIDTH],
output wire [`NUM_THREADS-1:0][`XLEN-1:0] tc_rf_data [`ISSUE_WIDTH]
VX_tc_rf_if.slave tensor_regfile_if,
VX_operands_if.master operands_if [`ISSUE_WIDTH]
);
`UNUSED_PARAM (CORE_ID)
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS;
@@ -47,6 +44,18 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
logic [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] empty2;
logic [`ISSUE_WIDTH-1:0][2:0] size1;
wire tc_rf_valid [`ISSUE_WIDTH];
wire [`LOG2UP(`NUM_REGS * ISSUE_RATIO)-1:0] tc_rf_addr [`ISSUE_WIDTH];
// FIXME: don't need full ISSUE_WIDTH; only one warp is read at a time
// because NUM_BLOCKS == 1
wire [`NUM_THREADS-1:0][`XLEN-1:0] tc_rf_data [`ISSUE_WIDTH];
`STATIC_ASSERT((ISSUE_RATIO == 1),
("static assertion failed: tensor core only supports ISSUE_RATIO == 1"))
assign tc_rf_valid = '{`ISSUE_WIDTH{tensor_regfile_if.req_valid}};
assign tc_rf_addr = '{`ISSUE_WIDTH{tensor_regfile_if.req_data.rs}};
assign tensor_regfile_if.rsp_data.data = tc_rf_data[0];
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
always @(posedge clk) begin
@@ -104,7 +113,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
.size (size1[i])
);
assign operands_if[i].valid = ~empty1[i];
assign scoreboard_if[i].ready = (size1[i] < 2'd2) && ~tc_rf_valid[i];
assign scoreboard_if[i].ready = (size1[i] < 3'd2) && ~tc_rf_valid[i];
// assert (full1[i] == full2[i]);
// assert (empty1[i] == empty2[i]);
@@ -207,10 +216,10 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
end
`ifdef GPR_RESET
reg wr_enabled = 0;
reg wr_enabled = 1'b0;
always @(posedge clk) begin
if (reset) begin
wr_enabled <= 1;
wr_enabled <= 1'b1;
end
end
`endif

View File

@@ -9,6 +9,7 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
`ifdef EXT_T_HOPPER
VX_tc_rf_if.master regfile_if,
VX_tc_bus_if.master smem_A_if,
VX_tc_bus_if.master smem_B_if,
`endif
@@ -63,6 +64,7 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
.clk (clk),
.reset (reset),
.execute_if (execute_if[block_idx]),
.regfile_if (regfile_if),
.smem_A_if (smem_A_if),
.smem_B_if (smem_B_if),
.commit_if (commit_block_if[block_idx])

View File

@@ -9,6 +9,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
input reset,
VX_execute_if.slave execute_if,
VX_tc_rf_if.master regfile_if,
VX_tc_bus_if.master smem_A_if,
VX_tc_bus_if.master smem_B_if,
VX_commit_if.master commit_if
@@ -104,6 +105,21 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
`STATIC_ASSERT((`XLEN == 32),
("static assertion failed: tensor_hopper_core only supports XLEN == 32"))
// /*
// fake fsm driving tc rf port
reg [11:0] counter;
always @(posedge clk) begin
if (reset) begin
counter <= 12'd1;
end else begin
counter <= counter + 12'd1;
end
end
assign regfile_if.req_valid = (counter[3:0] != 4'd0);
assign regfile_if.req_data.wis = '0;
assign regfile_if.req_data.rs = counter[11:7];
// */
TensorCoreDecoupled tensor_hopper_core (
.clock(clk),
.reset(reset),

46
hw/rtl/mem/VX_tc_rf_if.sv Normal file
View File

@@ -0,0 +1,46 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Register-file read port bundle for the tensor core.
//
// The master side issues a request (req_valid + req_data) and consumes
// rsp_data; the slave side sees the mirrored directions. The bundle carries
// no ready/back-pressure signal and no response-valid signal, so response
// timing relative to the request is defined by the slave implementation.
interface VX_tc_rf_if import VX_gpu_pkg::*; ();

    // Request payload identifying the register to read.
    typedef struct packed {
        logic [ISSUE_WIS_W-1:0] wis; // presumably the warp index within the issue slot -- TODO confirm
        logic [`NR_BITS-1:0]    rs;  // register number to read
    } req_data_t;

    // Response payload: one XLEN-wide word per thread.
    typedef struct packed {
        logic [`NUM_THREADS-1:0][`XLEN-1:0] data;
    } rsp_data_t;

    logic      req_valid; // high when req_data carries a request
    req_data_t req_data;
    rsp_data_t rsp_data;

    // Requester view: drives the request, samples the response.
    modport master (
        output req_valid, req_data,
        input  rsp_data
    );

    // Responder view: samples the request, drives the response.
    modport slave (
        input  req_valid, req_data,
        output rsp_data
    );

endinterface : VX_tc_rf_if