Add Blackwell tensor RTL scaffolding
This commit is contained in:
@@ -79,8 +79,11 @@ module Vortex import VX_gpu_pkg::*; #(
|
|||||||
// tc --------------------------------------------------
|
// tc --------------------------------------------------
|
||||||
input [1:0] tc_a_ready,
|
input [1:0] tc_a_ready,
|
||||||
output [1:0] tc_a_valid,
|
output [1:0] tc_a_valid,
|
||||||
|
output [1:0] tc_a_bits_write,
|
||||||
output [63:0] tc_a_bits_address,
|
output [63:0] tc_a_bits_address,
|
||||||
output [2 * TC_TAG_WIDTH - 1:0] tc_a_bits_tag,
|
output [2 * TC_TAG_WIDTH - 1:0] tc_a_bits_tag,
|
||||||
|
output [2 * 32 - 1:0] tc_a_bits_mask,
|
||||||
|
output [2 * TC_DATA_WIDTH - 1:0] tc_a_bits_data,
|
||||||
output [1:0] tc_d_ready,
|
output [1:0] tc_d_ready,
|
||||||
input [1:0] tc_d_valid,
|
input [1:0] tc_d_valid,
|
||||||
input [2 * TC_DATA_WIDTH - 1:0] tc_d_bits_data,
|
input [2 * TC_DATA_WIDTH - 1:0] tc_d_bits_data,
|
||||||
@@ -305,8 +308,11 @@ module Vortex import VX_gpu_pkg::*; #(
|
|||||||
VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p0_bus_if();
|
VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p0_bus_if();
|
||||||
VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p1_bus_if();
|
VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p1_bus_if();
|
||||||
assign tc_a_valid = {tc_p1_bus_if.req_valid, tc_p0_bus_if.req_valid};
|
assign tc_a_valid = {tc_p1_bus_if.req_valid, tc_p0_bus_if.req_valid};
|
||||||
|
assign tc_a_bits_write = {tc_p1_bus_if.req_data.rw, tc_p0_bus_if.req_data.rw};
|
||||||
assign tc_a_bits_address = {tc_p1_bus_if.req_data.addr, tc_p0_bus_if.req_data.addr};
|
assign tc_a_bits_address = {tc_p1_bus_if.req_data.addr, tc_p0_bus_if.req_data.addr};
|
||||||
assign tc_a_bits_tag = {tc_p1_bus_if.req_data.tag, tc_p0_bus_if.req_data.tag};
|
assign tc_a_bits_tag = {tc_p1_bus_if.req_data.tag, tc_p0_bus_if.req_data.tag};
|
||||||
|
assign tc_a_bits_mask = {tc_p1_bus_if.req_data.byteen, tc_p0_bus_if.req_data.byteen};
|
||||||
|
assign tc_a_bits_data = {tc_p1_bus_if.req_data.data, tc_p0_bus_if.req_data.data};
|
||||||
assign tc_p0_bus_if.req_ready = tc_a_ready[0];
|
assign tc_p0_bus_if.req_ready = tc_a_ready[0];
|
||||||
assign tc_p0_bus_if.rsp_valid = tc_d_valid[0];
|
assign tc_p0_bus_if.rsp_valid = tc_d_valid[0];
|
||||||
assign tc_p0_bus_if.rsp_data.data = tc_d_bits_data[0 * TC_DATA_WIDTH +: TC_DATA_WIDTH];
|
assign tc_p0_bus_if.rsp_data.data = tc_d_bits_data[0 * TC_DATA_WIDTH +: TC_DATA_WIDTH];
|
||||||
@@ -575,4 +581,3 @@ endmodule : Vortex
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -258,6 +258,18 @@
|
|||||||
// Hopper WGMMA-style asynchronous op
|
// Hopper WGMMA-style asynchronous op
|
||||||
`define INST_TENSOR_HGMMA 4'b0001
|
`define INST_TENSOR_HGMMA 4'b0001
|
||||||
`define INST_TENSOR_HGMMA_WAIT 4'b0010
|
`define INST_TENSOR_HGMMA_WAIT 4'b0010
|
||||||
|
`define INST_TENSOR_TCGEN05_CP 4'b0011
|
||||||
|
`define INST_TENSOR_TCGEN05_CP_WAIT 4'b0100
|
||||||
|
`define INST_TENSOR_BWGMMA 4'b0101
|
||||||
|
`define INST_TENSOR_BWGMMA_WAIT 4'b0110
|
||||||
|
`define INST_TENSOR_TCGEN05_LD 4'b0111
|
||||||
|
`define INST_TENSOR_TCGEN05_ST 4'b1000
|
||||||
|
|
||||||
|
`ifdef EXT_T_HOPPER
|
||||||
|
`define EXT_T_ASYNC
|
||||||
|
`elsif EXT_T_BLACKWELL
|
||||||
|
`define EXT_T_ASYNC
|
||||||
|
`endif
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
|||||||
@@ -63,7 +63,7 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||||||
VX_decode_if decode_if();
|
VX_decode_if decode_if();
|
||||||
VX_sched_csr_if sched_csr_if();
|
VX_sched_csr_if sched_csr_if();
|
||||||
VX_decode_sched_if decode_sched_if();
|
VX_decode_sched_if decode_sched_if();
|
||||||
`ifdef EXT_T_HOPPER
|
`ifdef EXT_T_ASYNC
|
||||||
VX_tc_rf_if tensor_regfile_if();
|
VX_tc_rf_if tensor_regfile_if();
|
||||||
`endif
|
`endif
|
||||||
VX_commit_sched_if commit_sched_if();
|
VX_commit_sched_if commit_sched_if();
|
||||||
@@ -193,7 +193,7 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||||||
`endif
|
`endif
|
||||||
`ifdef EXT_T_ENABLE
|
`ifdef EXT_T_ENABLE
|
||||||
.tensor_dispatch_if(tensor_dispatch_if),
|
.tensor_dispatch_if(tensor_dispatch_if),
|
||||||
`ifdef EXT_T_HOPPER
|
`ifdef EXT_T_ASYNC
|
||||||
.tensor_regfile_if (tensor_regfile_if),
|
.tensor_regfile_if (tensor_regfile_if),
|
||||||
`endif
|
`endif
|
||||||
`endif
|
`endif
|
||||||
@@ -226,7 +226,7 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||||||
`ifdef EXT_T_ENABLE
|
`ifdef EXT_T_ENABLE
|
||||||
.tensor_dispatch_if (tensor_dispatch_if),
|
.tensor_dispatch_if (tensor_dispatch_if),
|
||||||
.tensor_commit_if (tensor_commit_if),
|
.tensor_commit_if (tensor_commit_if),
|
||||||
`ifdef EXT_T_HOPPER
|
`ifdef EXT_T_ASYNC
|
||||||
.tensor_regfile_if (tensor_regfile_if),
|
.tensor_regfile_if (tensor_regfile_if),
|
||||||
.tensor_smem_A_if (tensor_smem_A_if),
|
.tensor_smem_A_if (tensor_smem_A_if),
|
||||||
.tensor_smem_B_if (tensor_smem_B_if),
|
.tensor_smem_B_if (tensor_smem_B_if),
|
||||||
|
|||||||
@@ -555,6 +555,20 @@ module VX_decode #(
|
|||||||
// B matrix tiles
|
// B matrix tiles
|
||||||
`USED_IREG (rs1);
|
`USED_IREG (rs1);
|
||||||
`USED_IREG (rs2);
|
`USED_IREG (rs2);
|
||||||
|
`elsif EXT_T_BLACKWELL
|
||||||
|
ex_type = `EX_TENSOR;
|
||||||
|
case (func3)
|
||||||
|
3'b000: op_type = `INST_TENSOR_BWGMMA;
|
||||||
|
3'b001: op_type = `INST_TENSOR_BWGMMA_WAIT;
|
||||||
|
3'b010: op_type = `INST_TENSOR_TCGEN05_CP;
|
||||||
|
3'b011: op_type = `INST_TENSOR_TCGEN05_CP_WAIT;
|
||||||
|
3'b100: op_type = `INST_TENSOR_TCGEN05_LD;
|
||||||
|
3'b101: op_type = `INST_TENSOR_TCGEN05_ST;
|
||||||
|
default: ;
|
||||||
|
endcase
|
||||||
|
`USED_IREG (rd);
|
||||||
|
`USED_IREG (rs1);
|
||||||
|
`USED_IREG (rs2);
|
||||||
`else
|
`else
|
||||||
ex_type = `EX_TENSOR;
|
ex_type = `EX_TENSOR;
|
||||||
op_type = `INST_TENSOR_HMMA;
|
op_type = `INST_TENSOR_HMMA;
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ module VX_execute import VX_gpu_pkg::*; #(
|
|||||||
`ifdef EXT_T_ENABLE
|
`ifdef EXT_T_ENABLE
|
||||||
VX_dispatch_if.slave tensor_dispatch_if [`ISSUE_WIDTH],
|
VX_dispatch_if.slave tensor_dispatch_if [`ISSUE_WIDTH],
|
||||||
VX_commit_if.master tensor_commit_if [`ISSUE_WIDTH],
|
VX_commit_if.master tensor_commit_if [`ISSUE_WIDTH],
|
||||||
`ifdef EXT_T_HOPPER
|
`ifdef EXT_T_ASYNC
|
||||||
VX_tc_rf_if.master tensor_regfile_if,
|
VX_tc_rf_if.master tensor_regfile_if,
|
||||||
VX_tc_bus_if.master tensor_smem_A_if,
|
VX_tc_bus_if.master tensor_smem_A_if,
|
||||||
VX_tc_bus_if.master tensor_smem_B_if,
|
VX_tc_bus_if.master tensor_smem_B_if,
|
||||||
@@ -156,7 +156,7 @@ module VX_execute import VX_gpu_pkg::*; #(
|
|||||||
.reset(reset),
|
.reset(reset),
|
||||||
|
|
||||||
.dispatch_if(tensor_dispatch_if),
|
.dispatch_if(tensor_dispatch_if),
|
||||||
`ifdef EXT_T_HOPPER
|
`ifdef EXT_T_ASYNC
|
||||||
.regfile_if(tensor_regfile_if),
|
.regfile_if(tensor_regfile_if),
|
||||||
.smem_A_if(tensor_smem_A_if),
|
.smem_A_if(tensor_smem_A_if),
|
||||||
.smem_B_if(tensor_smem_B_if),
|
.smem_B_if(tensor_smem_B_if),
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ module VX_issue import VX_gpu_pkg::*; #(
|
|||||||
`endif
|
`endif
|
||||||
`ifdef EXT_T_ENABLE
|
`ifdef EXT_T_ENABLE
|
||||||
VX_dispatch_if.master tensor_dispatch_if [`ISSUE_WIDTH],
|
VX_dispatch_if.master tensor_dispatch_if [`ISSUE_WIDTH],
|
||||||
`ifdef EXT_T_HOPPER
|
`ifdef EXT_T_ASYNC
|
||||||
VX_tc_rf_if.slave tensor_regfile_if,
|
VX_tc_rf_if.slave tensor_regfile_if,
|
||||||
`endif
|
`endif
|
||||||
`endif
|
`endif
|
||||||
@@ -90,7 +90,7 @@ module VX_issue import VX_gpu_pkg::*; #(
|
|||||||
.reset (operands_reset),
|
.reset (operands_reset),
|
||||||
.writeback_if (writeback_if),
|
.writeback_if (writeback_if),
|
||||||
.scoreboard_if (scoreboard_if),
|
.scoreboard_if (scoreboard_if),
|
||||||
`ifdef EXT_T_HOPPER
|
`ifdef EXT_T_ASYNC
|
||||||
.operands_if (operands_if),
|
.operands_if (operands_if),
|
||||||
.tensor_regfile_if (tensor_regfile_if)
|
.tensor_regfile_if (tensor_regfile_if)
|
||||||
`else
|
`else
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
|||||||
|
|
||||||
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
|
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
|
||||||
VX_ibuffer_if.slave scoreboard_if [`ISSUE_WIDTH],
|
VX_ibuffer_if.slave scoreboard_if [`ISSUE_WIDTH],
|
||||||
`ifdef EXT_T_HOPPER
|
`ifdef EXT_T_ASYNC
|
||||||
VX_tc_rf_if.slave tensor_regfile_if,
|
VX_tc_rf_if.slave tensor_regfile_if,
|
||||||
`endif
|
`endif
|
||||||
VX_operands_if.master operands_if [`ISSUE_WIDTH]
|
VX_operands_if.master operands_if [`ISSUE_WIDTH]
|
||||||
@@ -52,7 +52,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
|||||||
// because NUM_BLOCKS == 1
|
// because NUM_BLOCKS == 1
|
||||||
wire [`NUM_THREADS-1:0][`XLEN-1:0] tc_rf_data [`ISSUE_WIDTH];
|
wire [`NUM_THREADS-1:0][`XLEN-1:0] tc_rf_data [`ISSUE_WIDTH];
|
||||||
|
|
||||||
`ifdef EXT_T_HOPPER
|
`ifdef EXT_T_ASYNC
|
||||||
`STATIC_ASSERT((ISSUE_RATIO == 1),
|
`STATIC_ASSERT((ISSUE_RATIO == 1),
|
||||||
("static assertion failed: tensor core only supports ISSUE_RATIO == 1"))
|
("static assertion failed: tensor core only supports ISSUE_RATIO == 1"))
|
||||||
assign tc_rf_valid = '{`ISSUE_WIDTH{tensor_regfile_if.req_valid}};
|
assign tc_rf_valid = '{`ISSUE_WIDTH{tensor_regfile_if.req_valid}};
|
||||||
@@ -117,7 +117,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
|||||||
.size (size1[i])
|
.size (size1[i])
|
||||||
);
|
);
|
||||||
assign operands_if[i].valid = ~empty1[i];
|
assign operands_if[i].valid = ~empty1[i];
|
||||||
`ifdef EXT_T_HOPPER
|
`ifdef EXT_T_ASYNC
|
||||||
assign scoreboard_if[i].ready = (size1[i] < 3'd2) && ~tc_rf_valid[i];
|
assign scoreboard_if[i].ready = (size1[i] < 3'd2) && ~tc_rf_valid[i];
|
||||||
`else
|
`else
|
||||||
assign scoreboard_if[i].ready = (size1[i] < 3'd2);
|
assign scoreboard_if[i].ready = (size1[i] < 3'd2);
|
||||||
@@ -161,7 +161,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
|||||||
`UNUSED_PIN (alm_full),
|
`UNUSED_PIN (alm_full),
|
||||||
`UNUSED_PIN (size)
|
`UNUSED_PIN (size)
|
||||||
);
|
);
|
||||||
`ifdef EXT_T_HOPPER
|
`ifdef EXT_T_ASYNC
|
||||||
assign tc_rf_data[i][j] = rs3_data[j];
|
assign tc_rf_data[i][j] = rs3_data[j];
|
||||||
`endif
|
`endif
|
||||||
end
|
end
|
||||||
@@ -189,7 +189,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
|||||||
assign gpr_wr_addr = {writeback_if[i].data.wis, writeback_if[i].data.rd};
|
assign gpr_wr_addr = {writeback_if[i].data.wis, writeback_if[i].data.rd};
|
||||||
assign gpr_rd_addr_rs1 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs1};
|
assign gpr_rd_addr_rs1 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs1};
|
||||||
assign gpr_rd_addr_rs2 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs2};
|
assign gpr_rd_addr_rs2 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs2};
|
||||||
`ifdef EXT_T_HOPPER
|
`ifdef EXT_T_ASYNC
|
||||||
assign gpr_rd_addr_rs3 = tc_rf_valid[i] ? tc_rf_addr[i] : {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3};
|
assign gpr_rd_addr_rs3 = tc_rf_valid[i] ? tc_rf_addr[i] : {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3};
|
||||||
`else
|
`else
|
||||||
assign gpr_rd_addr_rs3 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3};
|
assign gpr_rd_addr_rs3 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3};
|
||||||
@@ -212,7 +212,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
|||||||
assign gpr_wr_addr = writeback_if[i].data.rd;
|
assign gpr_wr_addr = writeback_if[i].data.rd;
|
||||||
assign gpr_rd_addr_rs1 = scoreboard_if[i].data.rs1;
|
assign gpr_rd_addr_rs1 = scoreboard_if[i].data.rs1;
|
||||||
assign gpr_rd_addr_rs2 = scoreboard_if[i].data.rs2;
|
assign gpr_rd_addr_rs2 = scoreboard_if[i].data.rs2;
|
||||||
`ifdef EXT_T_HOPPER
|
`ifdef EXT_T_ASYNC
|
||||||
assign gpr_rd_addr_rs3 = tc_rf_valid[i] ? tc_rf_addr[i] : scoreboard_if[i].data.rs3;
|
assign gpr_rd_addr_rs3 = tc_rf_valid[i] ? tc_rf_addr[i] : scoreboard_if[i].data.rs3;
|
||||||
`else
|
`else
|
||||||
assign gpr_rd_addr_rs3 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3};
|
assign gpr_rd_addr_rs3 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3};
|
||||||
@@ -308,7 +308,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
|||||||
.NO_RWCHECK (1)
|
.NO_RWCHECK (1)
|
||||||
) gpr_ram_rs3 (
|
) gpr_ram_rs3 (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
`ifdef EXT_T_HOPPER
|
`ifdef EXT_T_ASYNC
|
||||||
.read ((scoreboard_if[i].valid && scoreboard_if[i].ready) || tc_rf_valid[i]),
|
.read ((scoreboard_if[i].valid && scoreboard_if[i].ready) || tc_rf_valid[i]),
|
||||||
`else
|
`else
|
||||||
.read (scoreboard_if[i].valid && scoreboard_if[i].ready),
|
.read (scoreboard_if[i].valid && scoreboard_if[i].ready),
|
||||||
|
|||||||
@@ -150,7 +150,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
localparam INFLT_MAX = {INFLT_WIDTH{1'b1}};
|
localparam INFLT_MAX = {INFLT_WIDTH{1'b1}};
|
||||||
|
|
||||||
wire hgmma_start = (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) &&
|
wire hgmma_start = (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) &&
|
||||||
(ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA);
|
((ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA) ||
|
||||||
|
(ibuffer_if[i].data.op_type == `INST_TENSOR_BWGMMA));
|
||||||
|
|
||||||
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
|
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
|
||||||
|
|
||||||
@@ -213,10 +214,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
`endif
|
`endif
|
||||||
|
|
||||||
wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
|
wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
|
||||||
`ifdef EXT_T_HOPPER
|
`ifdef EXT_T_ASYNC
|
||||||
wire hgmma_wait = ibuffer_if[i].valid &&
|
wire hgmma_wait = ibuffer_if[i].valid &&
|
||||||
(ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) &&
|
(ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) &&
|
||||||
(ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA_WAIT);
|
((ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA_WAIT) ||
|
||||||
|
(ibuffer_if[i].data.op_type == `INST_TENSOR_BWGMMA_WAIT));
|
||||||
// HGMMA is unblocked as long as there is available counter left for
|
// HGMMA is unblocked as long as there is available counter left for
|
||||||
// the inflight operations. This is to ensure back-to-back fire of
|
// the inflight operations. This is to ensure back-to-back fire of
|
||||||
// the dot product units with minimal downtime.
|
// the dot product units with minimal downtime.
|
||||||
@@ -280,7 +282,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin
|
if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin
|
||||||
inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1;
|
inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1;
|
||||||
end
|
end
|
||||||
`ifdef EXT_T_HOPPER
|
`ifdef EXT_T_ASYNC
|
||||||
if (tensor_writeback_fire) begin
|
if (tensor_writeback_fire) begin
|
||||||
// ensure no race condition
|
// ensure no race condition
|
||||||
if (!tensor_issue_fire) begin
|
if (!tensor_issue_fire) begin
|
||||||
|
|||||||
192
hw/rtl/core/VX_tensor_blackwell_core.sv
Normal file
192
hw/rtl/core/VX_tensor_blackwell_core.sv
Normal file
@@ -0,0 +1,192 @@
|
|||||||
|
`ifdef EXT_T_ENABLE
|
||||||
|
`include "VX_fpu_define.vh"
|
||||||
|
|
||||||
|
module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
|
||||||
|
parameter ISW,
|
||||||
|
parameter FP16
|
||||||
|
) (
|
||||||
|
input clk,
|
||||||
|
input reset,
|
||||||
|
|
||||||
|
VX_execute_if.slave execute_if,
|
||||||
|
VX_tc_rf_if.master regfile_if,
|
||||||
|
VX_tc_bus_if.master tmem_if,
|
||||||
|
VX_tc_bus_if.master smem_B_if,
|
||||||
|
VX_commit_if.master commit_if
|
||||||
|
);
|
||||||
|
localparam NUM_LANES = `NUM_THREADS;
|
||||||
|
localparam METADATA_QUEUE_DEPTH = 2;
|
||||||
|
|
||||||
|
wire [`UUID_WIDTH-1:0] execute_if_data_uuid;
|
||||||
|
wire [`NW_WIDTH-1:0] execute_if_data_wid;
|
||||||
|
wire [NUM_LANES-1:0] execute_if_data_tmask;
|
||||||
|
wire [`INST_ALU_BITS-1:0] execute_if_data_op_type;
|
||||||
|
wire [`XLEN-1:0] execute_if_data_PC;
|
||||||
|
wire execute_if_data_wb;
|
||||||
|
wire [`NR_BITS-1:0] execute_if_data_rd;
|
||||||
|
wire [NUM_LANES-1:0][`XLEN-1:0] execute_if_data_rs1;
|
||||||
|
wire [NUM_LANES-1:0][`XLEN-1:0] execute_if_data_rs2;
|
||||||
|
|
||||||
|
wire metadata_queue_full;
|
||||||
|
wire metadata_queue_empty;
|
||||||
|
assign execute_if.ready = !metadata_queue_full;
|
||||||
|
|
||||||
|
logic metadata_deq;
|
||||||
|
|
||||||
|
wire operand_enq_fire = execute_if.valid && execute_if.ready;
|
||||||
|
wire enq = operand_enq_fire;
|
||||||
|
wire deq = metadata_deq;
|
||||||
|
|
||||||
|
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `INST_ALU_BITS + `XLEN + 1 +
|
||||||
|
`NR_BITS + (NUM_LANES * `XLEN) + (NUM_LANES * `XLEN);
|
||||||
|
VX_fifo_queue #(
|
||||||
|
.DATAW(DATAW),
|
||||||
|
.DEPTH(METADATA_QUEUE_DEPTH)
|
||||||
|
) pending_uops (
|
||||||
|
.clk(clk),
|
||||||
|
.reset(reset),
|
||||||
|
.push(enq),
|
||||||
|
.pop(deq),
|
||||||
|
.data_in({execute_if.data.uuid, execute_if.data.wid,
|
||||||
|
execute_if.data.tmask, execute_if.data.op_type, execute_if.data.PC,
|
||||||
|
execute_if.data.wb, execute_if.data.rd,
|
||||||
|
execute_if.data.rs1_data, execute_if.data.rs2_data}),
|
||||||
|
.data_out({execute_if_data_uuid, execute_if_data_wid,
|
||||||
|
execute_if_data_tmask, execute_if_data_op_type, execute_if_data_PC,
|
||||||
|
execute_if_data_wb, execute_if_data_rd,
|
||||||
|
execute_if_data_rs1, execute_if_data_rs2}),
|
||||||
|
.empty(metadata_queue_empty),
|
||||||
|
`UNUSED_PIN(alm_empty),
|
||||||
|
.full(metadata_queue_full),
|
||||||
|
`UNUSED_PIN(alm_full),
|
||||||
|
`UNUSED_PIN(size)
|
||||||
|
);
|
||||||
|
|
||||||
|
wire initiate_ready;
|
||||||
|
wire writeback_valid;
|
||||||
|
wire writeback_last;
|
||||||
|
wire [`NW_WIDTH-1:0] writeback_wid;
|
||||||
|
wire [4:0] writeback_rd;
|
||||||
|
logic writeback_ready;
|
||||||
|
wire [`NUM_THREADS-1:0][`XLEN-1:0] writeback_data;
|
||||||
|
|
||||||
|
wire metadata_valid = !metadata_queue_empty;
|
||||||
|
wire bwgmma = metadata_valid &&
|
||||||
|
(execute_if_data_op_type == `INST_TENSOR_BWGMMA);
|
||||||
|
wire bwgmma_wait = metadata_valid &&
|
||||||
|
(execute_if_data_op_type == `INST_TENSOR_BWGMMA_WAIT);
|
||||||
|
wire bwgmma_initiate_valid = metadata_valid && commit_if.ready && bwgmma;
|
||||||
|
wire [`NW_WIDTH-1:0] initiate_wid = execute_if_data_wid;
|
||||||
|
wire [`XLEN-1:0] initiate_addr_a = execute_if_data_rs1[0];
|
||||||
|
wire [`XLEN-1:0] initiate_addr_b = execute_if_data_rs2[0];
|
||||||
|
|
||||||
|
TensorCoreDecoupled tensor_blackwell_core (
|
||||||
|
.clock(clk),
|
||||||
|
.reset(reset),
|
||||||
|
|
||||||
|
.io_initiate_ready(initiate_ready),
|
||||||
|
.io_initiate_valid(bwgmma_initiate_valid),
|
||||||
|
.io_initiate_bits_wid(initiate_wid),
|
||||||
|
.io_initiate_bits_addressA(initiate_addr_a),
|
||||||
|
.io_initiate_bits_addressB(initiate_addr_b),
|
||||||
|
|
||||||
|
.io_writeback_ready(writeback_ready),
|
||||||
|
.io_writeback_valid(writeback_valid),
|
||||||
|
.io_writeback_bits_last(writeback_last),
|
||||||
|
.io_writeback_bits_wid(writeback_wid),
|
||||||
|
.io_writeback_bits_rd(writeback_rd),
|
||||||
|
.io_writeback_bits_data_0(writeback_data[0]),
|
||||||
|
.io_writeback_bits_data_1(writeback_data[1]),
|
||||||
|
.io_writeback_bits_data_2(writeback_data[2]),
|
||||||
|
.io_writeback_bits_data_3(writeback_data[3]),
|
||||||
|
.io_writeback_bits_data_4(writeback_data[4]),
|
||||||
|
.io_writeback_bits_data_5(writeback_data[5]),
|
||||||
|
.io_writeback_bits_data_6(writeback_data[6]),
|
||||||
|
.io_writeback_bits_data_7(writeback_data[7]),
|
||||||
|
|
||||||
|
.io_respA_ready(tmem_if.rsp_ready),
|
||||||
|
.io_respA_valid(tmem_if.rsp_valid),
|
||||||
|
.io_respA_bits_source(tmem_if.rsp_data.tag),
|
||||||
|
.io_respA_bits_data(tmem_if.rsp_data.data),
|
||||||
|
.io_respB_ready(smem_B_if.rsp_ready),
|
||||||
|
.io_respB_valid(smem_B_if.rsp_valid),
|
||||||
|
.io_respB_bits_source(smem_B_if.rsp_data.tag),
|
||||||
|
.io_respB_bits_data(smem_B_if.rsp_data.data),
|
||||||
|
.io_respC(regfile_if.rsp_data.data),
|
||||||
|
|
||||||
|
.io_reqA_ready(tmem_if.req_ready),
|
||||||
|
.io_reqA_valid(tmem_if.req_valid),
|
||||||
|
.io_reqA_bits_source(tmem_if.req_data.tag),
|
||||||
|
.io_reqA_bits_address(tmem_if.req_data.addr),
|
||||||
|
.io_reqB_ready(smem_B_if.req_ready),
|
||||||
|
.io_reqB_valid(smem_B_if.req_valid),
|
||||||
|
.io_reqB_bits_source(smem_B_if.req_data.tag),
|
||||||
|
.io_reqB_bits_address(smem_B_if.req_data.addr),
|
||||||
|
.io_reqC_valid(regfile_if.req_valid),
|
||||||
|
.io_reqC_bits(regfile_if.req_data.rs[4:0])
|
||||||
|
);
|
||||||
|
|
||||||
|
assign regfile_if.req_data.rs[5] = 1'b1;
|
||||||
|
assign regfile_if.req_data.wis = '0;
|
||||||
|
assign tmem_if.req_data.rw = 1'b0;
|
||||||
|
assign tmem_if.req_data.byteen = '1;
|
||||||
|
assign tmem_if.req_data.data = '0;
|
||||||
|
assign smem_B_if.req_data.rw = 1'b0;
|
||||||
|
assign smem_B_if.req_data.byteen = '1;
|
||||||
|
assign smem_B_if.req_data.data = '0;
|
||||||
|
|
||||||
|
logic commit_select_tensor;
|
||||||
|
|
||||||
|
always @(*) begin
|
||||||
|
metadata_deq = 1'b0;
|
||||||
|
commit_select_tensor = 1'b0;
|
||||||
|
writeback_ready = commit_if.ready;
|
||||||
|
|
||||||
|
if (metadata_valid) begin
|
||||||
|
if (bwgmma_wait) begin
|
||||||
|
writeback_ready = 1'b0;
|
||||||
|
commit_select_tensor = 1'b0;
|
||||||
|
metadata_deq = metadata_valid && commit_if.ready;
|
||||||
|
end else if (bwgmma) begin
|
||||||
|
commit_select_tensor = !initiate_ready;
|
||||||
|
metadata_deq = metadata_valid && commit_if.ready && initiate_ready;
|
||||||
|
end else begin
|
||||||
|
// TCGEN05 ops are recognized by decode so software can build
|
||||||
|
// against the Blackwell surface. Phase 1 compute integration
|
||||||
|
// only routes BWGMMA through the decoupled tensor datapath.
|
||||||
|
commit_select_tensor = 1'b0;
|
||||||
|
metadata_deq = metadata_valid && commit_if.ready;
|
||||||
|
end
|
||||||
|
end else begin
|
||||||
|
commit_select_tensor = 1'b1;
|
||||||
|
end
|
||||||
|
|
||||||
|
if (commit_select_tensor) begin
|
||||||
|
commit_if.valid = writeback_valid;
|
||||||
|
commit_if.data.uuid = '0;
|
||||||
|
commit_if.data.wid = writeback_wid;
|
||||||
|
commit_if.data.tmask = {NUM_LANES{1'b1}};
|
||||||
|
commit_if.data.PC = '0;
|
||||||
|
commit_if.data.wb = 1'b1;
|
||||||
|
commit_if.data.rd = (`NR_BITS'(`NUM_IREGS) + {1'b0, writeback_rd});
|
||||||
|
commit_if.data.data = writeback_data;
|
||||||
|
commit_if.data.tensor = 1'b1;
|
||||||
|
commit_if.data.eop = writeback_last;
|
||||||
|
end else begin
|
||||||
|
commit_if.valid = metadata_valid;
|
||||||
|
commit_if.data.uuid = execute_if_data_uuid;
|
||||||
|
commit_if.data.wid = execute_if_data_wid;
|
||||||
|
commit_if.data.tmask = execute_if_data_tmask;
|
||||||
|
commit_if.data.PC = execute_if_data_PC;
|
||||||
|
commit_if.data.wb = execute_if_data_wb;
|
||||||
|
commit_if.data.rd = execute_if_data_rd;
|
||||||
|
commit_if.data.data = '0;
|
||||||
|
commit_if.data.tensor = 1'b0;
|
||||||
|
commit_if.data.pid = 1'b0;
|
||||||
|
commit_if.data.sop = 1'b1;
|
||||||
|
commit_if.data.eop = 1'b1;
|
||||||
|
end
|
||||||
|
end
|
||||||
|
endmodule
|
||||||
|
|
||||||
|
`endif
|
||||||
@@ -8,7 +8,7 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
|
|||||||
input reset,
|
input reset,
|
||||||
|
|
||||||
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
|
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
|
||||||
`ifdef EXT_T_HOPPER
|
`ifdef EXT_T_ASYNC
|
||||||
VX_tc_rf_if.master regfile_if,
|
VX_tc_rf_if.master regfile_if,
|
||||||
VX_tc_bus_if.master smem_A_if,
|
VX_tc_bus_if.master smem_A_if,
|
||||||
VX_tc_bus_if.master smem_B_if,
|
VX_tc_bus_if.master smem_B_if,
|
||||||
@@ -69,6 +69,19 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
|
|||||||
.smem_B_if (smem_B_if),
|
.smem_B_if (smem_B_if),
|
||||||
.commit_if (commit_block_if[block_idx])
|
.commit_if (commit_block_if[block_idx])
|
||||||
);
|
);
|
||||||
|
`elsif EXT_T_BLACKWELL
|
||||||
|
VX_tensor_blackwell_core_block #(
|
||||||
|
.ISW(1),
|
||||||
|
.FP16(FP16)
|
||||||
|
) tensor_blackwell_core_block (
|
||||||
|
.clk (clk),
|
||||||
|
.reset (reset),
|
||||||
|
.execute_if (execute_if[block_idx]),
|
||||||
|
.regfile_if (regfile_if),
|
||||||
|
.tmem_if (smem_A_if),
|
||||||
|
.smem_B_if (smem_B_if),
|
||||||
|
.commit_if (commit_block_if[block_idx])
|
||||||
|
);
|
||||||
// ) tensor_hopper_core_block (
|
// ) tensor_hopper_core_block (
|
||||||
// .clk (clk),
|
// .clk (clk),
|
||||||
// .reset (reset),
|
// .reset (reset),
|
||||||
|
|||||||
@@ -177,6 +177,12 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
|
|||||||
// add offset of 32 for fp regs
|
// add offset of 32 for fp regs
|
||||||
assign regfile_if.req_data.rs[5] = 1'b1;
|
assign regfile_if.req_data.rs[5] = 1'b1;
|
||||||
assign regfile_if.req_data.wis = '0;
|
assign regfile_if.req_data.wis = '0;
|
||||||
|
assign smem_A_if.req_data.rw = 1'b0;
|
||||||
|
assign smem_A_if.req_data.byteen = '1;
|
||||||
|
assign smem_A_if.req_data.data = '0;
|
||||||
|
assign smem_B_if.req_data.rw = 1'b0;
|
||||||
|
assign smem_B_if.req_data.byteen = '1;
|
||||||
|
assign smem_B_if.req_data.data = '0;
|
||||||
`STATIC_ASSERT((`ISSUE_WIDTH == `NUM_WARPS),
|
`STATIC_ASSERT((`ISSUE_WIDTH == `NUM_WARPS),
|
||||||
("static assertion failed: tensor_hopper_core assumes ISSUE_WIDTH == NUM_WARPS"))
|
("static assertion failed: tensor_hopper_core assumes ISSUE_WIDTH == NUM_WARPS"))
|
||||||
|
|
||||||
|
|||||||
@@ -183,7 +183,7 @@ end
|
|||||||
// merging the 2 always blocks leads to spurious UNOPTFLAT verilator lint,
|
// merging the 2 always blocks leads to spurious UNOPTFLAT verilator lint,
|
||||||
// but conceptually they should be linked
|
// but conceptually they should be linked
|
||||||
always @(*) begin
|
always @(*) begin
|
||||||
`ifdef EXT_T_HOPPER
|
`ifdef EXT_T_ASYNC
|
||||||
// for Hopper, disable micro-op blitzing. Set/step is managed
|
// for Hopper, disable micro-op blitzing. Set/step is managed
|
||||||
// microarchitecturally in an FSM inside the tensor core.
|
// microarchitecturally in an FSM inside the tensor core.
|
||||||
use_uop = 1'b0;
|
use_uop = 1'b0;
|
||||||
|
|||||||
@@ -20,7 +20,10 @@ interface VX_tc_bus_if #(
|
|||||||
)();
|
)();
|
||||||
|
|
||||||
typedef struct packed {
|
typedef struct packed {
|
||||||
|
logic rw;
|
||||||
|
logic [DATA_SIZE-1:0] byteen;
|
||||||
logic [ADDR_WIDTH-1:0] addr;
|
logic [ADDR_WIDTH-1:0] addr;
|
||||||
|
logic [DATA_SIZE*8-1:0] data;
|
||||||
logic [TAG_WIDTH-1:0] tag;
|
logic [TAG_WIDTH-1:0] tag;
|
||||||
} req_data_t;
|
} req_data_t;
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user