From cb912d3b8b689683f0a283039aa4c1633cddd2f3 Mon Sep 17 00:00:00 2001 From: abnerhexu Date: Sat, 25 Apr 2026 10:15:31 +0800 Subject: [PATCH] Add Blackwell tensor RTL scaffolding --- hw/rtl/VX_core_wrapper.sv | 7 +- hw/rtl/VX_define.vh | 24 ++- hw/rtl/core/VX_core.sv | 20 +-- hw/rtl/core/VX_decode.sv | 14 ++ hw/rtl/core/VX_execute.sv | 4 +- hw/rtl/core/VX_issue.sv | 4 +- hw/rtl/core/VX_operands_dup.sv | 14 +- hw/rtl/core/VX_scoreboard.sv | 10 +- hw/rtl/core/VX_tensor_blackwell_core.sv | 192 ++++++++++++++++++++++++ hw/rtl/core/VX_tensor_core.sv | 15 +- hw/rtl/core/VX_tensor_hopper_core.sv | 6 + hw/rtl/core/VX_uop_sequencer.sv | 2 +- hw/rtl/mem/VX_tc_bus_if.sv | 3 + 13 files changed, 281 insertions(+), 34 deletions(-) create mode 100644 hw/rtl/core/VX_tensor_blackwell_core.sv diff --git a/hw/rtl/VX_core_wrapper.sv b/hw/rtl/VX_core_wrapper.sv index 8efff248..77baecb4 100644 --- a/hw/rtl/VX_core_wrapper.sv +++ b/hw/rtl/VX_core_wrapper.sv @@ -79,8 +79,11 @@ module Vortex import VX_gpu_pkg::*; #( // tc -------------------------------------------------- input [1:0] tc_a_ready, output [1:0] tc_a_valid, + output [1:0] tc_a_bits_write, output [63:0] tc_a_bits_address, output [2 * TC_TAG_WIDTH - 1:0] tc_a_bits_tag, + output [2 * 32 - 1:0] tc_a_bits_mask, + output [2 * TC_DATA_WIDTH - 1:0] tc_a_bits_data, output [1:0] tc_d_ready, input [1:0] tc_d_valid, input [2 * TC_DATA_WIDTH - 1:0] tc_d_bits_data, @@ -305,8 +308,11 @@ module Vortex import VX_gpu_pkg::*; #( VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p0_bus_if(); VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p1_bus_if(); assign tc_a_valid = {tc_p1_bus_if.req_valid, tc_p0_bus_if.req_valid}; + assign tc_a_bits_write = {tc_p1_bus_if.req_data.rw, tc_p0_bus_if.req_data.rw}; assign tc_a_bits_address = {tc_p1_bus_if.req_data.addr, tc_p0_bus_if.req_data.addr}; assign tc_a_bits_tag = {tc_p1_bus_if.req_data.tag, tc_p0_bus_if.req_data.tag}; + assign tc_a_bits_mask = {tc_p1_bus_if.req_data.byteen, tc_p0_bus_if.req_data.byteen}; + assign tc_a_bits_data = {tc_p1_bus_if.req_data.data, tc_p0_bus_if.req_data.data}; assign tc_p0_bus_if.req_ready = tc_a_ready[0]; assign tc_p0_bus_if.rsp_valid = tc_d_valid[0]; assign tc_p0_bus_if.rsp_data.data = tc_d_bits_data[0 * TC_DATA_WIDTH +: TC_DATA_WIDTH]; @@ -575,4 +581,3 @@ endmodule : Vortex - diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index 61ce41be..2fe430c4 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -254,12 +254,24 @@ `define INST_SFU_IS_WCTL(op) (op <= 5) `define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8) -`define INST_TENSOR_HMMA 4'b0000 -// Hopper WGMMA-style asynchronous op -`define INST_TENSOR_HGMMA 4'b0001 -`define INST_TENSOR_HGMMA_WAIT 4'b0010 - -/////////////////////////////////////////////////////////////////////////////// +`define INST_TENSOR_HMMA 4'b0000 +// Hopper WGMMA-style asynchronous op +`define INST_TENSOR_HGMMA 4'b0001 +`define INST_TENSOR_HGMMA_WAIT 4'b0010 +`define INST_TENSOR_TCGEN05_CP 4'b0011 +`define INST_TENSOR_TCGEN05_CP_WAIT 4'b0100 +`define INST_TENSOR_BWGMMA 4'b0101 +`define INST_TENSOR_BWGMMA_WAIT 4'b0110 +`define INST_TENSOR_TCGEN05_LD 4'b0111 +`define INST_TENSOR_TCGEN05_ST 4'b1000 + +`ifdef EXT_T_HOPPER +`define EXT_T_ASYNC +`elsif EXT_T_BLACKWELL +`define EXT_T_ASYNC +`endif + +/////////////////////////////////////////////////////////////////////////////// // non-cacheable tag bits `define NC_TAG_BITS 1 diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index 61ee799d..392955d5 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -63,9 +63,9 @@ module VX_core import VX_gpu_pkg::*; #( VX_decode_if decode_if(); VX_sched_csr_if sched_csr_if(); VX_decode_sched_if decode_sched_if(); -`ifdef EXT_T_HOPPER - VX_tc_rf_if tensor_regfile_if(); -`endif +`ifdef EXT_T_ASYNC + VX_tc_rf_if tensor_regfile_if(); +`endif VX_commit_sched_if commit_sched_if(); VX_commit_csr_if commit_csr_if(); VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS](); @@ -193,9 +193,9 @@ module VX_core import VX_gpu_pkg::*; #( `endif `ifdef EXT_T_ENABLE .tensor_dispatch_if(tensor_dispatch_if), - `ifdef EXT_T_HOPPER - .tensor_regfile_if (tensor_regfile_if), - `endif + `ifdef EXT_T_ASYNC + .tensor_regfile_if (tensor_regfile_if), + `endif `endif .sfu_dispatch_if(sfu_dispatch_if) ); @@ -226,10 +226,10 @@ module VX_core import VX_gpu_pkg::*; #( `ifdef EXT_T_ENABLE .tensor_dispatch_if (tensor_dispatch_if), .tensor_commit_if (tensor_commit_if), - `ifdef EXT_T_HOPPER - .tensor_regfile_if (tensor_regfile_if), - .tensor_smem_A_if (tensor_smem_A_if), - .tensor_smem_B_if (tensor_smem_B_if), + `ifdef EXT_T_ASYNC + .tensor_regfile_if (tensor_regfile_if), + .tensor_smem_A_if (tensor_smem_A_if), + .tensor_smem_B_if (tensor_smem_B_if), `endif `endif diff --git a/hw/rtl/core/VX_decode.sv b/hw/rtl/core/VX_decode.sv index 36063cd9..4b16d1fa 100644 --- a/hw/rtl/core/VX_decode.sv +++ b/hw/rtl/core/VX_decode.sv @@ -555,6 +555,20 @@ module VX_decode #( // B matrix tiles `USED_IREG (rs1); `USED_IREG (rs2); + `elsif EXT_T_BLACKWELL + ex_type = `EX_TENSOR; + case (func3) + 3'b000: op_type = `INST_TENSOR_BWGMMA; + 3'b001: op_type = `INST_TENSOR_BWGMMA_WAIT; + 3'b010: op_type = `INST_TENSOR_TCGEN05_CP; + 3'b011: op_type = `INST_TENSOR_TCGEN05_CP_WAIT; + 3'b100: op_type = `INST_TENSOR_TCGEN05_LD; + 3'b101: op_type = `INST_TENSOR_TCGEN05_ST; + default: ; + endcase + `USED_IREG (rd); + `USED_IREG (rs1); + `USED_IREG (rs2); `else ex_type = `EX_TENSOR; op_type = `INST_TENSOR_HMMA; diff --git a/hw/rtl/core/VX_execute.sv b/hw/rtl/core/VX_execute.sv index 7cbb4ed2..051bf85a 100644 --- a/hw/rtl/core/VX_execute.sv +++ b/hw/rtl/core/VX_execute.sv @@ -58,7 +58,7 @@ module VX_execute import VX_gpu_pkg::*; #( `ifdef EXT_T_ENABLE VX_dispatch_if.slave tensor_dispatch_if [`ISSUE_WIDTH], VX_commit_if.master tensor_commit_if [`ISSUE_WIDTH], -`ifdef EXT_T_HOPPER +`ifdef EXT_T_ASYNC VX_tc_rf_if.master tensor_regfile_if, VX_tc_bus_if.master tensor_smem_A_if, VX_tc_bus_if.master tensor_smem_B_if, @@ -156,7 +156,7 @@ module VX_execute import VX_gpu_pkg::*; #( .reset(reset), .dispatch_if(tensor_dispatch_if), -`ifdef EXT_T_HOPPER +`ifdef EXT_T_ASYNC .regfile_if(tensor_regfile_if), .smem_A_if(tensor_smem_A_if), .smem_B_if(tensor_smem_B_if), diff --git a/hw/rtl/core/VX_issue.sv b/hw/rtl/core/VX_issue.sv index db88330a..339c6b72 100644 --- a/hw/rtl/core/VX_issue.sv +++ b/hw/rtl/core/VX_issue.sv @@ -36,7 +36,7 @@ module VX_issue import VX_gpu_pkg::*; #( `endif `ifdef EXT_T_ENABLE VX_dispatch_if.master tensor_dispatch_if [`ISSUE_WIDTH], -`ifdef EXT_T_HOPPER +`ifdef EXT_T_ASYNC VX_tc_rf_if.slave tensor_regfile_if, `endif `endif @@ -90,7 +90,7 @@ module VX_issue import VX_gpu_pkg::*; #( .reset (operands_reset), .writeback_if (writeback_if), .scoreboard_if (scoreboard_if), -`ifdef EXT_T_HOPPER +`ifdef EXT_T_ASYNC .operands_if (operands_if), .tensor_regfile_if (tensor_regfile_if) `else diff --git a/hw/rtl/core/VX_operands_dup.sv b/hw/rtl/core/VX_operands_dup.sv index bd94bcfe..a4f8ec9d 100644 --- a/hw/rtl/core/VX_operands_dup.sv +++ b/hw/rtl/core/VX_operands_dup.sv @@ -24,7 +24,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #( VX_writeback_if.slave writeback_if [`ISSUE_WIDTH], VX_ibuffer_if.slave scoreboard_if [`ISSUE_WIDTH], -`ifdef EXT_T_HOPPER +`ifdef EXT_T_ASYNC VX_tc_rf_if.slave tensor_regfile_if, `endif VX_operands_if.master operands_if [`ISSUE_WIDTH] @@ -52,7 +52,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #( // because NUM_BLOCKS == 1 wire [`NUM_THREADS-1:0][`XLEN-1:0] tc_rf_data [`ISSUE_WIDTH]; -`ifdef EXT_T_HOPPER +`ifdef EXT_T_ASYNC `STATIC_ASSERT((ISSUE_RATIO == 1), ("static assertion failed: tensor core only supports ISSUE_RATIO == 1")) assign tc_rf_valid = '{`ISSUE_WIDTH{tensor_regfile_if.req_valid}}; @@ -117,7 +117,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #( .size (size1[i]) ); assign operands_if[i].valid = ~empty1[i]; -`ifdef EXT_T_HOPPER +`ifdef EXT_T_ASYNC assign scoreboard_if[i].ready = (size1[i] < 3'd2) && ~tc_rf_valid[i]; `else assign scoreboard_if[i].ready = (size1[i] < 3'd2); @@ -161,7 +161,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #( `UNUSED_PIN (alm_full), `UNUSED_PIN (size) ); -`ifdef EXT_T_HOPPER +`ifdef EXT_T_ASYNC assign tc_rf_data[i][j] = rs3_data[j]; `endif end @@ -189,7 +189,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #( assign gpr_wr_addr = {writeback_if[i].data.wis, writeback_if[i].data.rd}; assign gpr_rd_addr_rs1 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs1}; assign gpr_rd_addr_rs2 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs2}; -`ifdef EXT_T_HOPPER +`ifdef EXT_T_ASYNC assign gpr_rd_addr_rs3 = tc_rf_valid[i] ? tc_rf_addr[i] : {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3}; `else assign gpr_rd_addr_rs3 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3}; @@ -212,7 +212,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #( assign gpr_wr_addr = writeback_if[i].data.rd; assign gpr_rd_addr_rs1 = scoreboard_if[i].data.rs1; assign gpr_rd_addr_rs2 = scoreboard_if[i].data.rs2; -`ifdef EXT_T_HOPPER +`ifdef EXT_T_ASYNC assign gpr_rd_addr_rs3 = tc_rf_valid[i] ? tc_rf_addr[i] : scoreboard_if[i].data.rs3; `else assign gpr_rd_addr_rs3 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3}; @@ -308,7 +308,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #( .NO_RWCHECK (1) ) gpr_ram_rs3 ( .clk (clk), -`ifdef EXT_T_HOPPER +`ifdef EXT_T_ASYNC .read ((scoreboard_if[i].valid && scoreboard_if[i].ready) || tc_rf_valid[i]), `else .read (scoreboard_if[i].valid && scoreboard_if[i].ready), diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv index f4b5f55b..78362e88 100644 --- a/hw/rtl/core/VX_scoreboard.sv +++ b/hw/rtl/core/VX_scoreboard.sv @@ -150,7 +150,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #( localparam INFLT_MAX = {INFLT_WIDTH{1'b1}}; wire hgmma_start = (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) && - (ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA); + ((ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA) || + (ibuffer_if[i].data.op_type == `INST_TENSOR_BWGMMA)); wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop; @@ -213,10 +214,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #( `endif wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3}; - `ifdef EXT_T_HOPPER + `ifdef EXT_T_ASYNC wire hgmma_wait = ibuffer_if[i].valid && (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) && - (ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA_WAIT); + ((ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA_WAIT) || + (ibuffer_if[i].data.op_type == `INST_TENSOR_BWGMMA_WAIT)); // HGMMA is unblocked as long as there is available counter left for // the inflight operations. This is to ensure back-to-back fire of // the dot product units with minimal downtime. @@ -280,7 +282,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #( if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1; end - `ifdef EXT_T_HOPPER + `ifdef EXT_T_ASYNC if (tensor_writeback_fire) begin // ensure no race condition if (!tensor_issue_fire) begin diff --git a/hw/rtl/core/VX_tensor_blackwell_core.sv b/hw/rtl/core/VX_tensor_blackwell_core.sv new file mode 100644 index 00000000..b5a17d3d --- /dev/null +++ b/hw/rtl/core/VX_tensor_blackwell_core.sv @@ -0,0 +1,192 @@ +`ifdef EXT_T_ENABLE +`include "VX_fpu_define.vh" + +module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #( + parameter ISW, + parameter FP16 +) ( + input clk, + input reset, + + VX_execute_if.slave execute_if, + VX_tc_rf_if.master regfile_if, + VX_tc_bus_if.master tmem_if, + VX_tc_bus_if.master smem_B_if, + VX_commit_if.master commit_if +); + localparam NUM_LANES = `NUM_THREADS; + localparam METADATA_QUEUE_DEPTH = 2; + + wire [`UUID_WIDTH-1:0] execute_if_data_uuid; + wire [`NW_WIDTH-1:0] execute_if_data_wid; + wire [NUM_LANES-1:0] execute_if_data_tmask; + wire [`INST_ALU_BITS-1:0] execute_if_data_op_type; + wire [`XLEN-1:0] execute_if_data_PC; + wire execute_if_data_wb; + wire [`NR_BITS-1:0] execute_if_data_rd; + wire [NUM_LANES-1:0][`XLEN-1:0] execute_if_data_rs1; + wire [NUM_LANES-1:0][`XLEN-1:0] execute_if_data_rs2; + + wire metadata_queue_full; + wire metadata_queue_empty; + assign execute_if.ready = !metadata_queue_full; + + logic metadata_deq; + + wire operand_enq_fire = execute_if.valid && execute_if.ready; + wire enq = operand_enq_fire; + wire deq = metadata_deq; + + localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `INST_ALU_BITS + `XLEN + 1 + + `NR_BITS + (NUM_LANES * `XLEN) + (NUM_LANES * `XLEN); + VX_fifo_queue #( + .DATAW(DATAW), + .DEPTH(METADATA_QUEUE_DEPTH) + ) pending_uops ( + .clk(clk), + .reset(reset), + .push(enq), + .pop(deq), + .data_in({execute_if.data.uuid, execute_if.data.wid, + execute_if.data.tmask, execute_if.data.op_type, execute_if.data.PC, + execute_if.data.wb, execute_if.data.rd, + execute_if.data.rs1_data, execute_if.data.rs2_data}), + .data_out({execute_if_data_uuid, execute_if_data_wid, + execute_if_data_tmask, execute_if_data_op_type, execute_if_data_PC, + execute_if_data_wb, execute_if_data_rd, + execute_if_data_rs1, execute_if_data_rs2}), + .empty(metadata_queue_empty), + `UNUSED_PIN(alm_empty), + .full(metadata_queue_full), + `UNUSED_PIN(alm_full), + `UNUSED_PIN(size) + ); + + wire initiate_ready; + wire writeback_valid; + wire writeback_last; + wire [`NW_WIDTH-1:0] writeback_wid; + wire [4:0] writeback_rd; + logic writeback_ready; + wire [`NUM_THREADS-1:0][`XLEN-1:0] writeback_data; + + wire metadata_valid = !metadata_queue_empty; + wire bwgmma = metadata_valid && + (execute_if_data_op_type == `INST_TENSOR_BWGMMA); + wire bwgmma_wait = metadata_valid && + (execute_if_data_op_type == `INST_TENSOR_BWGMMA_WAIT); + wire bwgmma_initiate_valid = metadata_valid && commit_if.ready && bwgmma; + wire [`NW_WIDTH-1:0] initiate_wid = execute_if_data_wid; + wire [`XLEN-1:0] initiate_addr_a = execute_if_data_rs1[0]; + wire [`XLEN-1:0] initiate_addr_b = execute_if_data_rs2[0]; + + TensorCoreDecoupled tensor_blackwell_core ( + .clock(clk), + .reset(reset), + + .io_initiate_ready(initiate_ready), + .io_initiate_valid(bwgmma_initiate_valid), + .io_initiate_bits_wid(initiate_wid), + .io_initiate_bits_addressA(initiate_addr_a), + .io_initiate_bits_addressB(initiate_addr_b), + + .io_writeback_ready(writeback_ready), + .io_writeback_valid(writeback_valid), + .io_writeback_bits_last(writeback_last), + .io_writeback_bits_wid(writeback_wid), + .io_writeback_bits_rd(writeback_rd), + .io_writeback_bits_data_0(writeback_data[0]), + .io_writeback_bits_data_1(writeback_data[1]), + .io_writeback_bits_data_2(writeback_data[2]), + .io_writeback_bits_data_3(writeback_data[3]), + .io_writeback_bits_data_4(writeback_data[4]), + .io_writeback_bits_data_5(writeback_data[5]), + .io_writeback_bits_data_6(writeback_data[6]), + .io_writeback_bits_data_7(writeback_data[7]), + + .io_respA_ready(tmem_if.rsp_ready), + .io_respA_valid(tmem_if.rsp_valid), + .io_respA_bits_source(tmem_if.rsp_data.tag), + .io_respA_bits_data(tmem_if.rsp_data.data), + .io_respB_ready(smem_B_if.rsp_ready), + .io_respB_valid(smem_B_if.rsp_valid), + .io_respB_bits_source(smem_B_if.rsp_data.tag), + .io_respB_bits_data(smem_B_if.rsp_data.data), + .io_respC(regfile_if.rsp_data.data), + + .io_reqA_ready(tmem_if.req_ready), + .io_reqA_valid(tmem_if.req_valid), + .io_reqA_bits_source(tmem_if.req_data.tag), + .io_reqA_bits_address(tmem_if.req_data.addr), + .io_reqB_ready(smem_B_if.req_ready), + .io_reqB_valid(smem_B_if.req_valid), + .io_reqB_bits_source(smem_B_if.req_data.tag), + .io_reqB_bits_address(smem_B_if.req_data.addr), + .io_reqC_valid(regfile_if.req_valid), + .io_reqC_bits(regfile_if.req_data.rs[4:0]) + ); + + assign regfile_if.req_data.rs[5] = 1'b1; + assign regfile_if.req_data.wis = '0; + assign tmem_if.req_data.rw = 1'b0; + assign tmem_if.req_data.byteen = '1; + assign tmem_if.req_data.data = '0; + assign smem_B_if.req_data.rw = 1'b0; + assign smem_B_if.req_data.byteen = '1; + assign smem_B_if.req_data.data = '0; + + logic commit_select_tensor; + + always @(*) begin + metadata_deq = 1'b0; + commit_select_tensor = 1'b0; + writeback_ready = commit_if.ready; + + if (metadata_valid) begin + if (bwgmma_wait) begin + writeback_ready = 1'b0; + commit_select_tensor = 1'b0; + metadata_deq = metadata_valid && commit_if.ready; + end else if (bwgmma) begin + commit_select_tensor = !initiate_ready; + metadata_deq = metadata_valid && commit_if.ready && initiate_ready; + end else begin + // TCGEN05 ops are recognized by decode so software can build + // against the Blackwell surface. Phase 1 compute integration + // only routes BWGMMA through the decoupled tensor datapath. + commit_select_tensor = 1'b0; + metadata_deq = metadata_valid && commit_if.ready; + end + end else begin + commit_select_tensor = 1'b1; + end + + if (commit_select_tensor) begin + commit_if.valid = writeback_valid; + commit_if.data.uuid = '0; + commit_if.data.wid = writeback_wid; + commit_if.data.tmask = {NUM_LANES{1'b1}}; + commit_if.data.PC = '0; + commit_if.data.wb = 1'b1; + commit_if.data.rd = (`NR_BITS'(`NUM_IREGS) + {1'b0, writeback_rd}); + commit_if.data.data = writeback_data; + commit_if.data.tensor = 1'b1; + commit_if.data.eop = writeback_last; + end else begin + commit_if.valid = metadata_valid; + commit_if.data.uuid = execute_if_data_uuid; + commit_if.data.wid = execute_if_data_wid; + commit_if.data.tmask = execute_if_data_tmask; + commit_if.data.PC = execute_if_data_PC; + commit_if.data.wb = execute_if_data_wb; + commit_if.data.rd = execute_if_data_rd; + commit_if.data.data = '0; + commit_if.data.tensor = 1'b0; + commit_if.data.pid = 1'b0; + commit_if.data.sop = 1'b1; + commit_if.data.eop = 1'b1; + end + end +endmodule + +`endif diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 916340c4..090c6e80 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -8,7 +8,7 @@ module VX_tensor_core import VX_gpu_pkg::*; #( input reset, VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH], -`ifdef EXT_T_HOPPER +`ifdef EXT_T_ASYNC VX_tc_rf_if.master regfile_if, VX_tc_bus_if.master smem_A_if, VX_tc_bus_if.master smem_B_if, @@ -69,6 +69,19 @@ module VX_tensor_core import VX_gpu_pkg::*; #( .smem_B_if (smem_B_if), .commit_if (commit_block_if[block_idx]) ); +`elsif EXT_T_BLACKWELL + VX_tensor_blackwell_core_block #( + .ISW(1), + .FP16(FP16) + ) tensor_blackwell_core_block ( + .clk (clk), + .reset (reset), + .execute_if (execute_if[block_idx]), + .regfile_if (regfile_if), + .tmem_if (smem_A_if), + .smem_B_if (smem_B_if), + .commit_if (commit_block_if[block_idx]) + ); // ) tensor_hopper_core_block ( // .clk (clk), // .reset (reset), diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv index ab3b2330..a88dccd9 100644 --- a/hw/rtl/core/VX_tensor_hopper_core.sv +++ b/hw/rtl/core/VX_tensor_hopper_core.sv @@ -177,6 +177,12 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #( // add offset of 32 for fp regs assign regfile_if.req_data.rs[5] = 1'b1; assign regfile_if.req_data.wis = '0; + assign smem_A_if.req_data.rw = 1'b0; + assign smem_A_if.req_data.byteen = '1; + assign smem_A_if.req_data.data = '0; + assign smem_B_if.req_data.rw = 1'b0; + assign smem_B_if.req_data.byteen = '1; + assign smem_B_if.req_data.data = '0; `STATIC_ASSERT((`ISSUE_WIDTH == `NUM_WARPS), ("static assertion failed: tensor_hopper_core assumes ISSUE_WIDTH == NUM_WARPS")) diff --git a/hw/rtl/core/VX_uop_sequencer.sv b/hw/rtl/core/VX_uop_sequencer.sv index 798466bf..2ca6eb3e 100644 --- a/hw/rtl/core/VX_uop_sequencer.sv +++ b/hw/rtl/core/VX_uop_sequencer.sv @@ -183,7 +183,7 @@ end // merging the 2 always blocks leads to spurious UNOPTFLAT verilator lint, // but conceptually they should be linked always @(*) begin -`ifdef EXT_T_HOPPER +`ifdef EXT_T_ASYNC // for Hopper, disable micro-op blitzing. Set/step is managed // microarchitecturally in an FSM inside the tensor core. use_uop = 1'b0; diff --git a/hw/rtl/mem/VX_tc_bus_if.sv b/hw/rtl/mem/VX_tc_bus_if.sv index deb4f42e..6a06922f 100644 --- a/hw/rtl/mem/VX_tc_bus_if.sv +++ b/hw/rtl/mem/VX_tc_bus_if.sv @@ -20,7 +20,10 @@ interface VX_tc_bus_if #( )(); typedef struct packed { + logic rw; + logic [DATA_SIZE-1:0] byteen; logic [ADDR_WIDTH-1:0] addr; + logic [DATA_SIZE*8-1:0] data; logic [TAG_WIDTH-1:0] tag; } req_data_t;