Add Blackwell tensor RTL scaffolding

This commit is contained in:
2026-04-25 10:15:31 +08:00
parent f1d0fac518
commit cb912d3b8b
13 changed files with 281 additions and 34 deletions

View File

@@ -79,8 +79,11 @@ module Vortex import VX_gpu_pkg::*; #(
// tc --------------------------------------------------
input [1:0] tc_a_ready,
output [1:0] tc_a_valid,
output [1:0] tc_a_bits_write,
output [63:0] tc_a_bits_address,
output [2 * TC_TAG_WIDTH - 1:0] tc_a_bits_tag,
output [2 * 32 - 1:0] tc_a_bits_mask,
output [2 * TC_DATA_WIDTH - 1:0] tc_a_bits_data,
output [1:0] tc_d_ready,
input [1:0] tc_d_valid,
input [2 * TC_DATA_WIDTH - 1:0] tc_d_bits_data,
@@ -305,8 +308,11 @@ module Vortex import VX_gpu_pkg::*; #(
VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p0_bus_if();
VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p1_bus_if();
assign tc_a_valid = {tc_p1_bus_if.req_valid, tc_p0_bus_if.req_valid};
assign tc_a_bits_write = {tc_p1_bus_if.req_data.rw, tc_p0_bus_if.req_data.rw};
assign tc_a_bits_address = {tc_p1_bus_if.req_data.addr, tc_p0_bus_if.req_data.addr};
assign tc_a_bits_tag = {tc_p1_bus_if.req_data.tag, tc_p0_bus_if.req_data.tag};
assign tc_a_bits_mask = {tc_p1_bus_if.req_data.byteen, tc_p0_bus_if.req_data.byteen};
assign tc_a_bits_data = {tc_p1_bus_if.req_data.data, tc_p0_bus_if.req_data.data};
assign tc_p0_bus_if.req_ready = tc_a_ready[0];
assign tc_p0_bus_if.rsp_valid = tc_d_valid[0];
assign tc_p0_bus_if.rsp_data.data = tc_d_bits_data[0 * TC_DATA_WIDTH +: TC_DATA_WIDTH];
@@ -575,4 +581,3 @@ endmodule : Vortex

View File

@@ -258,6 +258,18 @@
// Hopper WGMMA-style asynchronous op
`define INST_TENSOR_HGMMA 4'b0001
`define INST_TENSOR_HGMMA_WAIT 4'b0010
`define INST_TENSOR_TCGEN05_CP 4'b0011
`define INST_TENSOR_TCGEN05_CP_WAIT 4'b0100
`define INST_TENSOR_BWGMMA 4'b0101
`define INST_TENSOR_BWGMMA_WAIT 4'b0110
`define INST_TENSOR_TCGEN05_LD 4'b0111
`define INST_TENSOR_TCGEN05_ST 4'b1000
`ifdef EXT_T_HOPPER
`define EXT_T_ASYNC
`elsif EXT_T_BLACKWELL
`define EXT_T_ASYNC
`endif
///////////////////////////////////////////////////////////////////////////////

View File

@@ -63,7 +63,7 @@ module VX_core import VX_gpu_pkg::*; #(
VX_decode_if decode_if();
VX_sched_csr_if sched_csr_if();
VX_decode_sched_if decode_sched_if();
`ifdef EXT_T_HOPPER
`ifdef EXT_T_ASYNC
VX_tc_rf_if tensor_regfile_if();
`endif
VX_commit_sched_if commit_sched_if();
@@ -193,7 +193,7 @@ module VX_core import VX_gpu_pkg::*; #(
`endif
`ifdef EXT_T_ENABLE
.tensor_dispatch_if(tensor_dispatch_if),
`ifdef EXT_T_HOPPER
`ifdef EXT_T_ASYNC
.tensor_regfile_if (tensor_regfile_if),
`endif
`endif
@@ -226,7 +226,7 @@ module VX_core import VX_gpu_pkg::*; #(
`ifdef EXT_T_ENABLE
.tensor_dispatch_if (tensor_dispatch_if),
.tensor_commit_if (tensor_commit_if),
`ifdef EXT_T_HOPPER
`ifdef EXT_T_ASYNC
.tensor_regfile_if (tensor_regfile_if),
.tensor_smem_A_if (tensor_smem_A_if),
.tensor_smem_B_if (tensor_smem_B_if),

View File

@@ -555,6 +555,20 @@ module VX_decode #(
// B matrix tiles
`USED_IREG (rs1);
`USED_IREG (rs2);
`elsif EXT_T_BLACKWELL
ex_type = `EX_TENSOR;
case (func3)
3'b000: op_type = `INST_TENSOR_BWGMMA;
3'b001: op_type = `INST_TENSOR_BWGMMA_WAIT;
3'b010: op_type = `INST_TENSOR_TCGEN05_CP;
3'b011: op_type = `INST_TENSOR_TCGEN05_CP_WAIT;
3'b100: op_type = `INST_TENSOR_TCGEN05_LD;
3'b101: op_type = `INST_TENSOR_TCGEN05_ST;
default: ;
endcase
`USED_IREG (rd);
`USED_IREG (rs1);
`USED_IREG (rs2);
`else
ex_type = `EX_TENSOR;
op_type = `INST_TENSOR_HMMA;

View File

@@ -58,7 +58,7 @@ module VX_execute import VX_gpu_pkg::*; #(
`ifdef EXT_T_ENABLE
VX_dispatch_if.slave tensor_dispatch_if [`ISSUE_WIDTH],
VX_commit_if.master tensor_commit_if [`ISSUE_WIDTH],
`ifdef EXT_T_HOPPER
`ifdef EXT_T_ASYNC
VX_tc_rf_if.master tensor_regfile_if,
VX_tc_bus_if.master tensor_smem_A_if,
VX_tc_bus_if.master tensor_smem_B_if,
@@ -156,7 +156,7 @@ module VX_execute import VX_gpu_pkg::*; #(
.reset(reset),
.dispatch_if(tensor_dispatch_if),
`ifdef EXT_T_HOPPER
`ifdef EXT_T_ASYNC
.regfile_if(tensor_regfile_if),
.smem_A_if(tensor_smem_A_if),
.smem_B_if(tensor_smem_B_if),

View File

@@ -36,7 +36,7 @@ module VX_issue import VX_gpu_pkg::*; #(
`endif
`ifdef EXT_T_ENABLE
VX_dispatch_if.master tensor_dispatch_if [`ISSUE_WIDTH],
`ifdef EXT_T_HOPPER
`ifdef EXT_T_ASYNC
VX_tc_rf_if.slave tensor_regfile_if,
`endif
`endif
@@ -90,7 +90,7 @@ module VX_issue import VX_gpu_pkg::*; #(
.reset (operands_reset),
.writeback_if (writeback_if),
.scoreboard_if (scoreboard_if),
`ifdef EXT_T_HOPPER
`ifdef EXT_T_ASYNC
.operands_if (operands_if),
.tensor_regfile_if (tensor_regfile_if)
`else

View File

@@ -24,7 +24,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_ibuffer_if.slave scoreboard_if [`ISSUE_WIDTH],
`ifdef EXT_T_HOPPER
`ifdef EXT_T_ASYNC
VX_tc_rf_if.slave tensor_regfile_if,
`endif
VX_operands_if.master operands_if [`ISSUE_WIDTH]
@@ -52,7 +52,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
// because NUM_BLOCKS == 1
wire [`NUM_THREADS-1:0][`XLEN-1:0] tc_rf_data [`ISSUE_WIDTH];
`ifdef EXT_T_HOPPER
`ifdef EXT_T_ASYNC
`STATIC_ASSERT((ISSUE_RATIO == 1),
("static assertion failed: tensor core only supports ISSUE_RATIO == 1"))
assign tc_rf_valid = '{`ISSUE_WIDTH{tensor_regfile_if.req_valid}};
@@ -117,7 +117,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
.size (size1[i])
);
assign operands_if[i].valid = ~empty1[i];
`ifdef EXT_T_HOPPER
`ifdef EXT_T_ASYNC
assign scoreboard_if[i].ready = (size1[i] < 3'd2) && ~tc_rf_valid[i];
`else
assign scoreboard_if[i].ready = (size1[i] < 3'd2);
@@ -161,7 +161,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
`UNUSED_PIN (alm_full),
`UNUSED_PIN (size)
);
`ifdef EXT_T_HOPPER
`ifdef EXT_T_ASYNC
assign tc_rf_data[i][j] = rs3_data[j];
`endif
end
@@ -189,7 +189,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
assign gpr_wr_addr = {writeback_if[i].data.wis, writeback_if[i].data.rd};
assign gpr_rd_addr_rs1 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs1};
assign gpr_rd_addr_rs2 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs2};
`ifdef EXT_T_HOPPER
`ifdef EXT_T_ASYNC
assign gpr_rd_addr_rs3 = tc_rf_valid[i] ? tc_rf_addr[i] : {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3};
`else
assign gpr_rd_addr_rs3 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3};
@@ -212,7 +212,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
assign gpr_wr_addr = writeback_if[i].data.rd;
assign gpr_rd_addr_rs1 = scoreboard_if[i].data.rs1;
assign gpr_rd_addr_rs2 = scoreboard_if[i].data.rs2;
`ifdef EXT_T_HOPPER
`ifdef EXT_T_ASYNC
assign gpr_rd_addr_rs3 = tc_rf_valid[i] ? tc_rf_addr[i] : scoreboard_if[i].data.rs3;
`else
assign gpr_rd_addr_rs3 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3};
@@ -308,7 +308,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
.NO_RWCHECK (1)
) gpr_ram_rs3 (
.clk (clk),
`ifdef EXT_T_HOPPER
`ifdef EXT_T_ASYNC
.read ((scoreboard_if[i].valid && scoreboard_if[i].ready) || tc_rf_valid[i]),
`else
.read (scoreboard_if[i].valid && scoreboard_if[i].ready),

View File

@@ -150,7 +150,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
localparam INFLT_MAX = {INFLT_WIDTH{1'b1}};
wire hgmma_start = (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) &&
(ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA);
((ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA) ||
(ibuffer_if[i].data.op_type == `INST_TENSOR_BWGMMA));
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
@@ -213,10 +214,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`endif
wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
`ifdef EXT_T_HOPPER
`ifdef EXT_T_ASYNC
wire hgmma_wait = ibuffer_if[i].valid &&
(ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) &&
(ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA_WAIT);
((ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA_WAIT) ||
(ibuffer_if[i].data.op_type == `INST_TENSOR_BWGMMA_WAIT));
// HGMMA is unblocked as long as there is available counter left for
// the inflight operations. This is to ensure back-to-back fire of
// the dot product units with minimal downtime.
@@ -280,7 +282,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin
inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1;
end
`ifdef EXT_T_HOPPER
`ifdef EXT_T_ASYNC
if (tensor_writeback_fire) begin
// ensure no race condition
if (!tensor_issue_fire) begin

View File

@@ -0,0 +1,192 @@
`ifdef EXT_T_ENABLE
`include "VX_fpu_define.vh"
module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
parameter ISW,
parameter FP16
) (
input clk,
input reset,
VX_execute_if.slave execute_if,
VX_tc_rf_if.master regfile_if,
VX_tc_bus_if.master tmem_if,
VX_tc_bus_if.master smem_B_if,
VX_commit_if.master commit_if
);
localparam NUM_LANES = `NUM_THREADS;
localparam METADATA_QUEUE_DEPTH = 2;
wire [`UUID_WIDTH-1:0] execute_if_data_uuid;
wire [`NW_WIDTH-1:0] execute_if_data_wid;
wire [NUM_LANES-1:0] execute_if_data_tmask;
wire [`INST_ALU_BITS-1:0] execute_if_data_op_type;
wire [`XLEN-1:0] execute_if_data_PC;
wire execute_if_data_wb;
wire [`NR_BITS-1:0] execute_if_data_rd;
wire [NUM_LANES-1:0][`XLEN-1:0] execute_if_data_rs1;
wire [NUM_LANES-1:0][`XLEN-1:0] execute_if_data_rs2;
wire metadata_queue_full;
wire metadata_queue_empty;
assign execute_if.ready = !metadata_queue_full;
logic metadata_deq;
wire operand_enq_fire = execute_if.valid && execute_if.ready;
wire enq = operand_enq_fire;
wire deq = metadata_deq;
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `INST_ALU_BITS + `XLEN + 1 +
`NR_BITS + (NUM_LANES * `XLEN) + (NUM_LANES * `XLEN);
VX_fifo_queue #(
.DATAW(DATAW),
.DEPTH(METADATA_QUEUE_DEPTH)
) pending_uops (
.clk(clk),
.reset(reset),
.push(enq),
.pop(deq),
.data_in({execute_if.data.uuid, execute_if.data.wid,
execute_if.data.tmask, execute_if.data.op_type, execute_if.data.PC,
execute_if.data.wb, execute_if.data.rd,
execute_if.data.rs1_data, execute_if.data.rs2_data}),
.data_out({execute_if_data_uuid, execute_if_data_wid,
execute_if_data_tmask, execute_if_data_op_type, execute_if_data_PC,
execute_if_data_wb, execute_if_data_rd,
execute_if_data_rs1, execute_if_data_rs2}),
.empty(metadata_queue_empty),
`UNUSED_PIN(alm_empty),
.full(metadata_queue_full),
`UNUSED_PIN(alm_full),
`UNUSED_PIN(size)
);
wire initiate_ready;
wire writeback_valid;
wire writeback_last;
wire [`NW_WIDTH-1:0] writeback_wid;
wire [4:0] writeback_rd;
logic writeback_ready;
wire [`NUM_THREADS-1:0][`XLEN-1:0] writeback_data;
wire metadata_valid = !metadata_queue_empty;
wire bwgmma = metadata_valid &&
(execute_if_data_op_type == `INST_TENSOR_BWGMMA);
wire bwgmma_wait = metadata_valid &&
(execute_if_data_op_type == `INST_TENSOR_BWGMMA_WAIT);
wire bwgmma_initiate_valid = metadata_valid && commit_if.ready && bwgmma;
wire [`NW_WIDTH-1:0] initiate_wid = execute_if_data_wid;
wire [`XLEN-1:0] initiate_addr_a = execute_if_data_rs1[0];
wire [`XLEN-1:0] initiate_addr_b = execute_if_data_rs2[0];
TensorCoreDecoupled tensor_blackwell_core (
.clock(clk),
.reset(reset),
.io_initiate_ready(initiate_ready),
.io_initiate_valid(bwgmma_initiate_valid),
.io_initiate_bits_wid(initiate_wid),
.io_initiate_bits_addressA(initiate_addr_a),
.io_initiate_bits_addressB(initiate_addr_b),
.io_writeback_ready(writeback_ready),
.io_writeback_valid(writeback_valid),
.io_writeback_bits_last(writeback_last),
.io_writeback_bits_wid(writeback_wid),
.io_writeback_bits_rd(writeback_rd),
.io_writeback_bits_data_0(writeback_data[0]),
.io_writeback_bits_data_1(writeback_data[1]),
.io_writeback_bits_data_2(writeback_data[2]),
.io_writeback_bits_data_3(writeback_data[3]),
.io_writeback_bits_data_4(writeback_data[4]),
.io_writeback_bits_data_5(writeback_data[5]),
.io_writeback_bits_data_6(writeback_data[6]),
.io_writeback_bits_data_7(writeback_data[7]),
.io_respA_ready(tmem_if.rsp_ready),
.io_respA_valid(tmem_if.rsp_valid),
.io_respA_bits_source(tmem_if.rsp_data.tag),
.io_respA_bits_data(tmem_if.rsp_data.data),
.io_respB_ready(smem_B_if.rsp_ready),
.io_respB_valid(smem_B_if.rsp_valid),
.io_respB_bits_source(smem_B_if.rsp_data.tag),
.io_respB_bits_data(smem_B_if.rsp_data.data),
.io_respC(regfile_if.rsp_data.data),
.io_reqA_ready(tmem_if.req_ready),
.io_reqA_valid(tmem_if.req_valid),
.io_reqA_bits_source(tmem_if.req_data.tag),
.io_reqA_bits_address(tmem_if.req_data.addr),
.io_reqB_ready(smem_B_if.req_ready),
.io_reqB_valid(smem_B_if.req_valid),
.io_reqB_bits_source(smem_B_if.req_data.tag),
.io_reqB_bits_address(smem_B_if.req_data.addr),
.io_reqC_valid(regfile_if.req_valid),
.io_reqC_bits(regfile_if.req_data.rs[4:0])
);
assign regfile_if.req_data.rs[5] = 1'b1;
assign regfile_if.req_data.wis = '0;
assign tmem_if.req_data.rw = 1'b0;
assign tmem_if.req_data.byteen = '1;
assign tmem_if.req_data.data = '0;
assign smem_B_if.req_data.rw = 1'b0;
assign smem_B_if.req_data.byteen = '1;
assign smem_B_if.req_data.data = '0;
logic commit_select_tensor;
always @(*) begin
metadata_deq = 1'b0;
commit_select_tensor = 1'b0;
writeback_ready = commit_if.ready;
if (metadata_valid) begin
if (bwgmma_wait) begin
writeback_ready = 1'b0;
commit_select_tensor = 1'b0;
metadata_deq = metadata_valid && commit_if.ready;
end else if (bwgmma) begin
commit_select_tensor = !initiate_ready;
metadata_deq = metadata_valid && commit_if.ready && initiate_ready;
end else begin
// TCGEN05 ops are recognized by decode so software can build
// against the Blackwell surface. Phase 1 compute integration
// only routes BWGMMA through the decoupled tensor datapath.
commit_select_tensor = 1'b0;
metadata_deq = metadata_valid && commit_if.ready;
end
end else begin
commit_select_tensor = 1'b1;
end
if (commit_select_tensor) begin
commit_if.valid = writeback_valid;
commit_if.data.uuid = '0;
commit_if.data.wid = writeback_wid;
commit_if.data.tmask = {NUM_LANES{1'b1}};
commit_if.data.PC = '0;
commit_if.data.wb = 1'b1;
commit_if.data.rd = (`NR_BITS'(`NUM_IREGS) + {1'b0, writeback_rd});
commit_if.data.data = writeback_data;
commit_if.data.tensor = 1'b1;
commit_if.data.eop = writeback_last;
end else begin
commit_if.valid = metadata_valid;
commit_if.data.uuid = execute_if_data_uuid;
commit_if.data.wid = execute_if_data_wid;
commit_if.data.tmask = execute_if_data_tmask;
commit_if.data.PC = execute_if_data_PC;
commit_if.data.wb = execute_if_data_wb;
commit_if.data.rd = execute_if_data_rd;
commit_if.data.data = '0;
commit_if.data.tensor = 1'b0;
commit_if.data.pid = 1'b0;
commit_if.data.sop = 1'b1;
commit_if.data.eop = 1'b1;
end
end
endmodule
`endif

View File

@@ -8,7 +8,7 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
input reset,
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
`ifdef EXT_T_HOPPER
`ifdef EXT_T_ASYNC
VX_tc_rf_if.master regfile_if,
VX_tc_bus_if.master smem_A_if,
VX_tc_bus_if.master smem_B_if,
@@ -69,6 +69,19 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
.smem_B_if (smem_B_if),
.commit_if (commit_block_if[block_idx])
);
`elsif EXT_T_BLACKWELL
VX_tensor_blackwell_core_block #(
.ISW(1),
.FP16(FP16)
) tensor_blackwell_core_block (
.clk (clk),
.reset (reset),
.execute_if (execute_if[block_idx]),
.regfile_if (regfile_if),
.tmem_if (smem_A_if),
.smem_B_if (smem_B_if),
.commit_if (commit_block_if[block_idx])
);
// ) tensor_hopper_core_block (
// .clk (clk),
// .reset (reset),

View File

@@ -177,6 +177,12 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
// add offset of 32 for fp regs
assign regfile_if.req_data.rs[5] = 1'b1;
assign regfile_if.req_data.wis = '0;
assign smem_A_if.req_data.rw = 1'b0;
assign smem_A_if.req_data.byteen = '1;
assign smem_A_if.req_data.data = '0;
assign smem_B_if.req_data.rw = 1'b0;
assign smem_B_if.req_data.byteen = '1;
assign smem_B_if.req_data.data = '0;
`STATIC_ASSERT((`ISSUE_WIDTH == `NUM_WARPS),
("static assertion failed: tensor_hopper_core assumes ISSUE_WIDTH == NUM_WARPS"))

View File

@@ -183,7 +183,7 @@ end
// merging the 2 always blocks leads to spurious UNOPTFLAT verilator lint,
// but conceptually they should be linked
always @(*) begin
`ifdef EXT_T_HOPPER
`ifdef EXT_T_ASYNC
// for Hopper, disable micro-op blitzing. Set/step is managed
// microarchitecturally in an FSM inside the tensor core.
use_uop = 1'b0;

View File

@@ -20,7 +20,10 @@ interface VX_tc_bus_if #(
)();
typedef struct packed {
logic rw;
logic [DATA_SIZE-1:0] byteen;
logic [ADDR_WIDTH-1:0] addr;
logic [DATA_SIZE*8-1:0] data;
logic [TAG_WIDTH-1:0] tag;
} req_data_t;