Update Vortex core for Blackwell tensor instructions

- Add Blackwell tensor core support in VX_tensor_blackwell_core.sv
- Update decode, execute, and dispatch logic for new instructions
- Extend VX_define.vh and VX_types.vh with Blackwell ISA definitions
This commit is contained in:
2026-05-06 14:50:54 +08:00
parent cb912d3b8b
commit 323ed7d7e9
17 changed files with 492 additions and 114 deletions

View File

@@ -5,7 +5,7 @@
module Vortex import VX_gpu_pkg::*; #(
parameter CORE_ID = 0,
parameter TENSOR_FP16 = 0,
parameter BOOTROM_HANG100 = 32'h10100,
parameter logic [63:0] STARTUP_ADDR = 64'h0000_0000_0001_0100,
parameter NUM_THREADS = 0,
parameter TC_DATA_WIDTH = 256,
parameter TC_TAG_WIDTH = 4
@@ -77,17 +77,26 @@ module Vortex import VX_gpu_pkg::*; #(
output [(DCACHE_NUM_REQS * 32) - 1:0] smem_a_bits_data,
// tc --------------------------------------------------
input [1:0] tc_a_ready,
output [1:0] tc_a_valid,
output [1:0] tc_a_bits_write,
output [63:0] tc_a_bits_address,
output [2 * TC_TAG_WIDTH - 1:0] tc_a_bits_tag,
output [2 * 32 - 1:0] tc_a_bits_mask,
output [2 * TC_DATA_WIDTH - 1:0] tc_a_bits_data,
output [1:0] tc_d_ready,
input [1:0] tc_d_valid,
input [2 * TC_DATA_WIDTH - 1:0] tc_d_bits_data,
input [2 * TC_TAG_WIDTH - 1:0] tc_d_bits_tag,
input [2:0] tc_a_ready,
output [2:0] tc_a_valid,
output [2:0] tc_a_bits_write,
output [95:0] tc_a_bits_address,
output [3 * TC_TAG_WIDTH - 1:0] tc_a_bits_tag,
output [3 * 32 - 1:0] tc_a_bits_mask,
output [3 * TC_DATA_WIDTH - 1:0] tc_a_bits_data,
output [2:0] tc_d_ready,
input [2:0] tc_d_valid,
input [3 * TC_DATA_WIDTH - 1:0] tc_d_bits_data,
input [3 * TC_TAG_WIDTH - 1:0] tc_d_bits_tag,
// tmem_C direct SRAM port
output tc_tmem_C_wen,
output tc_tmem_C_ren,
output [8:0] tc_tmem_C_waddr,
output [8:0] tc_tmem_C_raddr,
output [`NUM_THREADS*`XLEN-1:0] tc_tmem_C_wdata,
output [`NUM_THREADS*`XLEN/8-1:0] tc_tmem_C_mask,
input [`NUM_THREADS*`XLEN-1:0] tc_tmem_C_rdata,
// gbar ------------------------------------------------
@@ -306,22 +315,23 @@ module Vortex import VX_gpu_pkg::*; #(
// tc ---------------------------------------------------------------------
VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p0_bus_if();
VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p1_bus_if();
assign tc_a_valid = {tc_p1_bus_if.req_valid, tc_p0_bus_if.req_valid};
assign tc_a_bits_write = {tc_p1_bus_if.req_data.rw, tc_p0_bus_if.req_data.rw};
assign tc_a_bits_address = {tc_p1_bus_if.req_data.addr, tc_p0_bus_if.req_data.addr};
assign tc_a_bits_tag = {tc_p1_bus_if.req_data.tag, tc_p0_bus_if.req_data.tag};
assign tc_a_bits_mask = {tc_p1_bus_if.req_data.byteen, tc_p0_bus_if.req_data.byteen};
assign tc_a_bits_data = {tc_p1_bus_if.req_data.data, tc_p0_bus_if.req_data.data};
VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p2_bus_if();
// tc_p1 (tmem_C) is now a direct SRAM port exposed as top-level ports tc_tmem_C_*
assign tc_a_valid = {tc_p2_bus_if.req_valid, 1'b0, tc_p0_bus_if.req_valid};
assign tc_a_bits_write = {tc_p2_bus_if.req_data.rw, 1'b0, tc_p0_bus_if.req_data.rw};
assign tc_a_bits_address = {tc_p2_bus_if.req_data.addr, 32'b0, tc_p0_bus_if.req_data.addr};
assign tc_a_bits_tag = {tc_p2_bus_if.req_data.tag, {TC_TAG_WIDTH{1'b0}}, tc_p0_bus_if.req_data.tag};
assign tc_a_bits_mask = {tc_p2_bus_if.req_data.byteen, {(TC_DATA_WIDTH/8){1'b0}},tc_p0_bus_if.req_data.byteen};
assign tc_a_bits_data = {tc_p2_bus_if.req_data.data, {TC_DATA_WIDTH{1'b0}}, tc_p0_bus_if.req_data.data};
assign tc_p0_bus_if.req_ready = tc_a_ready[0];
assign tc_p0_bus_if.rsp_valid = tc_d_valid[0];
assign tc_p0_bus_if.rsp_data.data = tc_d_bits_data[0 * TC_DATA_WIDTH +: TC_DATA_WIDTH];
assign tc_p0_bus_if.rsp_data.tag = tc_d_bits_tag[0 * TC_TAG_WIDTH +: TC_TAG_WIDTH];
assign tc_p1_bus_if.req_ready = tc_a_ready[1];
assign tc_p1_bus_if.rsp_valid = tc_d_valid[1];
assign tc_p1_bus_if.rsp_data.data = tc_d_bits_data[1 * TC_DATA_WIDTH +: TC_DATA_WIDTH];
assign tc_p1_bus_if.rsp_data.tag = tc_d_bits_tag[1 * TC_TAG_WIDTH +: TC_TAG_WIDTH];
assign tc_d_ready = {tc_p1_bus_if.rsp_ready, tc_p0_bus_if.rsp_ready};
assign tc_p2_bus_if.req_ready = tc_a_ready[2];
assign tc_p2_bus_if.rsp_valid = tc_d_valid[2];
assign tc_p2_bus_if.rsp_data.data = tc_d_bits_data[2 * TC_DATA_WIDTH +: TC_DATA_WIDTH];
assign tc_p2_bus_if.rsp_data.tag = tc_d_bits_tag[2 * TC_TAG_WIDTH +: TC_TAG_WIDTH];
assign tc_d_ready = {tc_p2_bus_if.rsp_ready, 1'b0, tc_p0_bus_if.rsp_ready};
// gbar -------------------------------------------------------------------
`ifdef GBAR_ENABLE
@@ -395,15 +405,14 @@ module Vortex import VX_gpu_pkg::*; #(
dcr_write_valid = 1'b1;
dcr_write_addr = `VX_DCR_BASE_STARTUP_ADDR0;
dcr_write_data = BOOTROM_HANG100;
dcr_write_data = STARTUP_ADDR[31:0];
end
`VX_DCR_BASE_STARTUP_ADDR1: begin
dcr_state_n = `VX_DCR_BASE_MPM_CLASS;
dcr_write_valid = 1'b1;
dcr_write_addr = `VX_DCR_BASE_STARTUP_ADDR1;
// FIXME: not sure what this does
dcr_write_data = `VX_DCR_DATA_WIDTH'h0;
dcr_write_data = STARTUP_ADDR[63:32];
end
`VX_DCR_BASE_MPM_CLASS: begin
dcr_state_n = `VX_DCR_BASE_STATE_END;
@@ -455,7 +464,25 @@ module Vortex import VX_gpu_pkg::*; #(
`endif
.tensor_smem_A_if (tc_p0_bus_if),
.tensor_smem_B_if (tc_p1_bus_if),
`ifdef EXT_T_BLACKWELL
.tensor_tmem_C_wen(tc_tmem_C_wen),
.tensor_tmem_C_ren(tc_tmem_C_ren),
.tensor_tmem_C_waddr(tc_tmem_C_waddr),
.tensor_tmem_C_raddr(tc_tmem_C_raddr),
.tensor_tmem_C_wdata(tc_tmem_C_wdata),
.tensor_tmem_C_mask(tc_tmem_C_mask),
.tensor_tmem_C_rdata(tc_tmem_C_rdata),
.tensor_smem_B_if (tc_p2_bus_if),
`else
.tensor_tmem_C_wen(tc_tmem_C_wen),
.tensor_tmem_C_ren(tc_tmem_C_ren),
.tensor_tmem_C_waddr(tc_tmem_C_waddr),
.tensor_tmem_C_raddr(tc_tmem_C_raddr),
.tensor_tmem_C_wdata(tc_tmem_C_wdata),
.tensor_tmem_C_mask(tc_tmem_C_mask),
.tensor_tmem_C_rdata(tc_tmem_C_rdata),
.tensor_smem_B_if (tc_p2_bus_if),
`endif
.sim_ebreak (sim_ebreak),
.sim_wb_value (sim_wb_value),
@@ -577,7 +604,3 @@ module Vortex import VX_gpu_pkg::*; #(
`endif
endmodule : Vortex

View File

@@ -264,6 +264,7 @@
`define INST_TENSOR_BWGMMA_WAIT 4'b0110
`define INST_TENSOR_TCGEN05_LD 4'b0111
`define INST_TENSOR_TCGEN05_ST 4'b1000
`define INST_TENSOR_TCGEN05_CB 4'b1001
`ifdef EXT_T_HOPPER
`define EXT_T_ASYNC

View File

@@ -179,6 +179,7 @@
`define VX_CSR_CORE_ID 12'hCC2
`define VX_CSR_WARP_MASK 12'hCC3
`define VX_CSR_THREAD_MASK 12'hCC4 // warning! this value is also used in LLVM
`define VX_CSR_GCID 12'hCC5 // legacy global core id alias used by Radiance bootrom
`define VX_CSR_NUM_THREADS 12'hFC0
`define VX_CSR_NUM_WARPS 12'hFC1

View File

@@ -208,7 +208,7 @@ module VX_commit import VX_gpu_pkg::*; #(
// Writeback
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb;
assign writeback_if[i].valid = commit_if[i].valid && (commit_if[i].data.wb || commit_if[i].data.tensor);
assign writeback_if[i].data.uuid = commit_if[i].data.uuid;
assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid);
assign writeback_if[i].data.PC = commit_if[i].data.PC;
@@ -224,7 +224,7 @@ module VX_commit import VX_gpu_pkg::*; #(
// simulation helper signal to get RISC-V tests Pass/Fail status
reg [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value_r;
always @(posedge clk) begin
if (writeback_if[0].valid) begin
if (writeback_if[0].valid && !writeback_if[0].data.tensor) begin
sim_wb_value_r[writeback_if[0].data.rd] <= writeback_if[0].data.data[0];
end
end

View File

@@ -40,6 +40,14 @@ module VX_core import VX_gpu_pkg::*; #(
VX_mem_bus_if.master icache_bus_if,
VX_tc_bus_if.master tensor_smem_A_if,
// tensor_tmem_C is now a direct SRAM port
output logic tensor_tmem_C_wen,
output logic tensor_tmem_C_ren,
output logic [8:0] tensor_tmem_C_waddr,
output logic [8:0] tensor_tmem_C_raddr,
output logic [`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_wdata,
output logic [`NUM_THREADS*`XLEN/8-1:0] tensor_tmem_C_mask,
input logic [`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_rdata,
VX_tc_bus_if.master tensor_smem_B_if,
`ifdef GBAR_ENABLE
@@ -229,6 +237,13 @@ module VX_core import VX_gpu_pkg::*; #(
`ifdef EXT_T_ASYNC
.tensor_regfile_if (tensor_regfile_if),
.tensor_smem_A_if (tensor_smem_A_if),
.tensor_tmem_C_wen (tensor_tmem_C_wen),
.tensor_tmem_C_ren (tensor_tmem_C_ren),
.tensor_tmem_C_waddr(tensor_tmem_C_waddr),
.tensor_tmem_C_raddr(tensor_tmem_C_raddr),
.tensor_tmem_C_wdata(tensor_tmem_C_wdata),
.tensor_tmem_C_mask(tensor_tmem_C_mask),
.tensor_tmem_C_rdata(tensor_tmem_C_rdata),
.tensor_smem_B_if (tensor_smem_B_if),
`endif
`endif

View File

@@ -163,6 +163,7 @@ import VX_fpu_pkg::*;
`endif
`VX_CSR_WARP_ID : read_data_ro_r = 32'(read_wid);
`VX_CSR_CORE_ID : read_data_ro_r = 32'(CORE_ID);
`VX_CSR_GCID : read_data_ro_r = 32'(CORE_ID);
`VX_CSR_THREAD_MASK: read_data_ro_r = 32'(thread_masks[read_wid]);
`VX_CSR_WARP_MASK : read_data_ro_r = 32'(active_warps);
`VX_CSR_NUM_THREADS: read_data_ro_r = 32'(`NUM_THREADS);

View File

@@ -564,11 +564,45 @@ module VX_decode #(
3'b011: op_type = `INST_TENSOR_TCGEN05_CP_WAIT;
3'b100: op_type = `INST_TENSOR_TCGEN05_LD;
3'b101: op_type = `INST_TENSOR_TCGEN05_ST;
3'b110: op_type = `INST_TENSOR_TCGEN05_CB;
default: ;
endcase
`USED_IREG (rd);
case (func3)
3'b000: begin // BWGMMA: rs1=tmem_a, rs2=smem_b, rd field=tmem_c source
`USED_IREG (rs1);
`USED_IREG (rs2);
`ifdef EXT_F_ENABLE
rs3_r = {1'b0, rd};
`else
rs3_r = rd;
`endif
use_rs3 = 1;
end
3'b010, 3'b110: begin // TCGEN05_CP/CB: rs1=tmem, rs2=global memory address
`USED_IREG (rs1);
`USED_IREG (rs2);
end
3'b100: begin // TCGEN05_LD: rs1=tmem, rd=FP destination
`USED_IREG (rs1);
`ifdef EXT_F_ENABLE
`USED_FREG (rd);
`else
`USED_IREG (rd);
`endif
end
3'b101: begin // TCGEN05_ST: rs1=tmem, rd field=FP source
`USED_IREG (rs1);
`ifdef EXT_F_ENABLE
rd_r = {1'b1, rd};
rs3_r = {1'b1, rd};
`else
rd_r = rd;
rs3_r = rd;
`endif
use_rs3 = 1;
end
default: ;
endcase
`else
ex_type = `EX_TENSOR;
op_type = `INST_TENSOR_HMMA;

View File

@@ -312,6 +312,8 @@ module VX_dispatch import VX_gpu_pkg::*; #(
if (operands_if[i].valid && operands_if[i].ready) begin
`TRACE(1, ("%d: core%0d-issue: wid=%0d, PC=0x%0h, ex=", $time, CORE_ID, wis_to_wid(operands_if[i].data.wis, i), operands_if[i].data.PC));
trace_ex_type(1, operands_if[i].data.ex_type);
`TRACE(1, (", op="));
trace_ex_op(1, operands_if[i].data.ex_type, operands_if[i].data.op_type, operands_if[i].data.op_mod, operands_if[i].data.rd, '0, operands_if[i].data.use_imm, operands_if[i].data.imm);
`TRACE(1, (", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if[i].data.op_mod, operands_if[i].data.tmask, operands_if[i].data.wb, operands_if[i].data.rd));
`TRACE_ARRAY1D(1, operands_if[i].data.rs1_data, `NUM_THREADS);
`TRACE(1, (", rs2_data="));

View File

@@ -61,6 +61,13 @@ module VX_execute import VX_gpu_pkg::*; #(
`ifdef EXT_T_ASYNC
VX_tc_rf_if.master tensor_regfile_if,
VX_tc_bus_if.master tensor_smem_A_if,
output logic tensor_tmem_C_wen,
output logic tensor_tmem_C_ren,
output logic [8:0] tensor_tmem_C_waddr,
output logic [8:0] tensor_tmem_C_raddr,
output logic [`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_wdata,
output logic [`NUM_THREADS*`XLEN/8-1:0] tensor_tmem_C_mask,
input logic [`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_rdata,
VX_tc_bus_if.master tensor_smem_B_if,
`endif
`endif
@@ -159,6 +166,13 @@ module VX_execute import VX_gpu_pkg::*; #(
`ifdef EXT_T_ASYNC
.regfile_if(tensor_regfile_if),
.smem_A_if(tensor_smem_A_if),
.tmem_C_wen(tensor_tmem_C_wen),
.tmem_C_ren(tensor_tmem_C_ren),
.tmem_C_waddr(tensor_tmem_C_waddr),
.tmem_C_raddr(tensor_tmem_C_raddr),
.tmem_C_wdata(tensor_tmem_C_wdata),
.tmem_C_mask(tensor_tmem_C_mask),
.tmem_C_rdata(tensor_tmem_C_rdata),
.smem_B_if(tensor_smem_B_if),
`endif
.commit_if(tensor_commit_if)

View File

@@ -12,6 +12,7 @@
// limitations under the License.
`include "VX_define.vh"
`include "VX_trace.vh"
module VX_ibuffer import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
@@ -36,6 +37,20 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
assign decode_if.ready = ibuf_ready_in[decode_isw];
`ifdef SIMULATION
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
wire decode_probe_pc = (decode_if.data.PC >= 32'h80000240) && (decode_if.data.PC <= 32'h80000260);
always @(posedge clk) begin
if (!reset && (CORE_ID == 0) && ($time > `TRACE_STARTTIME) && decode_if.valid && decode_probe_pc) begin
`TRACE(2, ("%d: core%0d-ibuffer-probe: wid=%0d, PC=0x%0h, valid=%b, ready=%b, isw=%0d, ibuf_ready=%b, tmask=%b, ex=0x%0h, op=0x%0h (#%0d)\n",
$time, CORE_ID, decode_if.data.wid, decode_if.data.PC, decode_if.valid, decode_if.ready,
decode_isw, ibuf_ready_in, decode_if.data.tmask, decode_if.data.ex_type, decode_if.data.op_type,
decode_if.data.uuid));
end
end
`endif
`endif
VX_ibuffer_if uop_sequencer_if [`ISSUE_WIDTH]();
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin

View File

@@ -121,6 +121,36 @@ module VX_issue import VX_gpu_pkg::*; #(
.sfu_dispatch_if(sfu_dispatch_if)
);
`ifdef SIMULATION
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
wire ibuf_probe_pc = ibuffer_if[i].valid
&& (ibuffer_if[i].data.PC >= 32'h80000240)
&& (ibuffer_if[i].data.PC <= 32'h80000260);
wire scb_probe_pc = scoreboard_if[i].valid
&& (scoreboard_if[i].data.PC >= 32'h80000240)
&& (scoreboard_if[i].data.PC <= 32'h80000260);
wire ops_probe_pc = operands_if[i].valid
&& (operands_if[i].data.PC >= 32'h80000240)
&& (operands_if[i].data.PC <= 32'h80000260);
always @(posedge clk) begin
if (!reset && (CORE_ID == 0) && ($time > `TRACE_STARTTIME) && (ibuf_probe_pc || scb_probe_pc || ops_probe_pc)) begin
`TRACE(2, ("%d: core%0d-issue-probe: isw=%0d, ibuf=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, scb=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, ops=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, alu=%b/%b lsu=%b/%b sfu=%b/%b (#ibuf=%0d #scb=%0d #ops=%0d)\n",
$time, CORE_ID, i,
ibuffer_if[i].valid, ibuffer_if[i].ready, ibuffer_if[i].data.PC, ibuffer_if[i].data.ex_type, ibuffer_if[i].data.op_type,
scoreboard_if[i].valid, scoreboard_if[i].ready, scoreboard_if[i].data.PC, scoreboard_if[i].data.ex_type, scoreboard_if[i].data.op_type,
operands_if[i].valid, operands_if[i].ready, operands_if[i].data.PC, operands_if[i].data.ex_type, operands_if[i].data.op_type,
alu_dispatch_if[i].valid, alu_dispatch_if[i].ready,
lsu_dispatch_if[i].valid, lsu_dispatch_if[i].ready,
sfu_dispatch_if[i].valid, sfu_dispatch_if[i].ready,
ibuffer_if[i].data.uuid, scoreboard_if[i].data.uuid, operands_if[i].data.uuid));
end
end
end
`endif
`endif
`ifdef DBG_SCOPE_ISSUE
if (CORE_ID == 0) begin
`ifdef SCOPE

View File

@@ -156,7 +156,7 @@ module VX_operands import VX_gpu_pkg::*; #(
end
endcase
if (CACHE_ENABLE != 0 && writeback_if[i].valid) begin
if (CACHE_ENABLE != 0 && writeback_if[i].valid && !writeback_if[i].data.tensor) begin
if ((cache_reg[writeback_if[i].data.wis] == writeback_if[i].data.rd)
|| (cache_eop[writeback_if[i].data.wis] && writeback_if[i].data.sop)) begin
for (integer j = 0; j < `NUM_THREADS; ++j) begin
@@ -288,9 +288,9 @@ module VX_operands import VX_gpu_pkg::*; #(
.read (1'b1),
`UNUSED_PIN (wren),
`ifdef GPR_RESET
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
.write (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
`else
.write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
.write (writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
`endif
.waddr (gpr_wr_addr),
.wdata (writeback_if[i].data.data[j]),

View File

@@ -12,6 +12,7 @@
// limitations under the License.
`include "VX_define.vh"
`include "VX_trace.vh"
`ifdef GPR_DUPLICATED
@@ -123,6 +124,46 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
assign scoreboard_if[i].ready = (size1[i] < 3'd2);
`endif
`ifdef SIMULATION
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
always @(posedge clk) begin
if (!reset && (CORE_ID == 0) && ($time > `TRACE_STARTTIME)) begin
if ((scoreboard_if[i].valid &&
(scoreboard_if[i].data.PC >= `XLEN'h80000240) &&
(scoreboard_if[i].data.PC <= `XLEN'h80000260)) ||
(operands_if[i].valid &&
(operands_if[i].data.PC >= `XLEN'h80000240) &&
(operands_if[i].data.PC <= `XLEN'h80000260))
`ifdef EXT_T_ASYNC
|| tc_rf_valid[i]
`endif
) begin
`ifdef EXT_T_ASYNC
`TRACE(2, ("%d: core%0d-operands-probe: isw=%0d, scb=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, ops=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, size1=%0d, empty1=%b, full1=%b, tc_rf_valid=%b, tc_rf_rs=%0d, tc_rf_wis=%0d (#scb=%0d #ops=%0d)\n",
$time, CORE_ID, i,
scoreboard_if[i].valid, scoreboard_if[i].ready, scoreboard_if[i].data.PC,
scoreboard_if[i].data.ex_type, scoreboard_if[i].data.op_type,
operands_if[i].valid, operands_if[i].ready, operands_if[i].data.PC,
operands_if[i].data.ex_type, operands_if[i].data.op_type,
size1[i], empty1[i], full1[i],
tc_rf_valid[i], tc_rf_addr[i], tensor_regfile_if.req_data.wis,
scoreboard_if[i].data.uuid, operands_if[i].data.uuid));
`else
`TRACE(2, ("%d: core%0d-operands-probe: isw=%0d, scb=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, ops=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, size1=%0d, empty1=%b, full1=%b (#scb=%0d #ops=%0d)\n",
$time, CORE_ID, i,
scoreboard_if[i].valid, scoreboard_if[i].ready, scoreboard_if[i].data.PC,
scoreboard_if[i].data.ex_type, scoreboard_if[i].data.op_type,
operands_if[i].valid, operands_if[i].ready, operands_if[i].data.PC,
operands_if[i].data.ex_type, operands_if[i].data.op_type,
size1[i], empty1[i], full1[i],
scoreboard_if[i].data.uuid, operands_if[i].data.uuid));
`endif
end
end
end
`endif
`endif
// assert (full1[i] == full2[i]);
// assert (empty1[i] == empty2[i]);
@@ -263,9 +304,9 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
.read (scoreboard_if[i].valid && scoreboard_if[i].ready), // tc read valid check incl. in ready
`UNUSED_PIN (wren),
`ifdef GPR_RESET
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
.write (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
`else
.write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
.write (writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
`endif
.waddr (gpr_wr_addr),
.wdata (writeback_if[i].data.data[j]),
@@ -287,9 +328,9 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
.read (scoreboard_if[i].valid && scoreboard_if[i].ready), // tc read valid check incl. in ready
`UNUSED_PIN (wren),
`ifdef GPR_RESET
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
.write (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
`else
.write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
.write (writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
`endif
.waddr (gpr_wr_addr),
.wdata (writeback_if[i].data.data[j]),
@@ -315,9 +356,9 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
`endif
`UNUSED_PIN (wren),
`ifdef GPR_RESET
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
.write (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
`else
.write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
.write (writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
`endif
.waddr (gpr_wr_addr),
.wdata (writeback_if[i].data.data[j]),
@@ -326,9 +367,9 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
);
`ifdef PERF_ENABLE
assign perf_write_rs1_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
assign perf_write_rs2_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
assign perf_write_rs3_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
assign perf_write_rs1_per_thread[j] = (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]);
assign perf_write_rs2_per_thread[j] = (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]);
assign perf_write_rs3_per_thread[j] = (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]);
`endif
end

View File

@@ -322,6 +322,40 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end else if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin
timeout_ctr <= '0;
end
`ifdef EXT_T_ASYNC
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
if (CORE_ID == 0 && ibuffer_if[i].valid &&
(ibuffer_if[i].data.PC >= `XLEN'h80000240) &&
(ibuffer_if[i].data.PC <= `XLEN'h80000260)) begin
`TRACE(2, ("%d: core%0d-scoreboard-probe: wid=%0d, PC=0x%0h, ready=%b, stg_ready=%b, operands_ready=%b, busy={rd,rs1,rs2,rs3}=%b, rd=%0d rs1=%0d rs2=%0d rs3=%0d, wb=%b, ex=0x%0h, op=0x%0h, inuse_rd=%b, inuse_rs1=%b, inuse_rs2=%b, inuse_rs3=%b, hgmma_start=%b, hgmma_wait=%b, hgmma_ready=%b, inflight=%0d, wb_fire=%b, wb_tensor=%b, wb_wid=%0d, wb_rd=%0d, wb_pc=0x%0h (#%0d)\n",
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC,
ibuffer_if[i].ready, stg_ready_in, operands_ready, operands_busy,
ibuffer_if[i].data.rd, ibuffer_if[i].data.rs1, ibuffer_if[i].data.rs2, ibuffer_if[i].data.rs3,
ibuffer_if[i].data.wb, ibuffer_if[i].data.ex_type, ibuffer_if[i].data.op_type,
inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3, hgmma_start, hgmma_wait, hgmma_ready,
inflight_tensor[ibuffer_if[i].data.wis], writeback_fire, writeback_if[i].data.tensor,
wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.rd, writeback_if[i].data.PC,
ibuffer_if[i].data.uuid));
end
if (ibuffer_if[i].valid && (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR))) begin
`TRACE(2, ("%d: core%0d-scoreboard-tensor: wid=%0d, PC=0x%0h, op=0x%0h, ready=%b, stg_ready=%b, operands_ready=%b, inuse=%b, inflight=%0d, hgmma_start=%b, hgmma_wait=%b, hgmma_fire_ready=%b, hgmma_wait_ready=%b (#%0d)\n",
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC,
ibuffer_if[i].data.op_type, ibuffer_if[i].ready, stg_ready_in, operands_ready,
operands_busy, inflight_tensor[ibuffer_if[i].data.wis], hgmma_start, hgmma_wait,
hgmma_ready_for_fire, hgmma_ready_for_wait, ibuffer_if[i].data.uuid));
end
if (tensor_issue_fire) begin
`TRACE(2, ("%d: core%0d-scoreboard-tensor-issue-count: wid=%0d, op=0x%0h, inflight_before=%0d\n",
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.op_type,
inflight_tensor[ibuffer_if[i].data.wis]));
end
if (tensor_writeback_fire) begin
`TRACE(2, ("%d: core%0d-scoreboard-tensor-wb-count: wid=%0d, inflight_before=%0d\n",
$time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i),
inflight_tensor[writeback_if[i].data.wis]));
end
`endif
`endif
end
end

View File

@@ -1,5 +1,6 @@
`ifdef EXT_T_ENABLE
`include "VX_fpu_define.vh"
`include "VX_trace.vh"
module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
parameter ISW,
@@ -11,6 +12,14 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
VX_execute_if.slave execute_if,
VX_tc_rf_if.master regfile_if,
VX_tc_bus_if.master tmem_if,
// tmem_C is now a direct SRAM port (no TileLink)
output logic tmem_C_wen,
output logic tmem_C_ren,
output logic [8:0] tmem_C_waddr,
output logic [8:0] tmem_C_raddr,
output logic [`NUM_THREADS*`XLEN-1:0] tmem_C_wdata,
output logic [`NUM_THREADS*`XLEN/8-1:0] tmem_C_mask,
input logic [`NUM_THREADS*`XLEN-1:0] tmem_C_rdata,
VX_tc_bus_if.master smem_B_if,
VX_commit_if.master commit_if
);
@@ -26,6 +35,7 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
wire [`NR_BITS-1:0] execute_if_data_rd;
wire [NUM_LANES-1:0][`XLEN-1:0] execute_if_data_rs1;
wire [NUM_LANES-1:0][`XLEN-1:0] execute_if_data_rs2;
wire [NUM_LANES-1:0][`XLEN-1:0] execute_if_data_rs3;
wire metadata_queue_full;
wire metadata_queue_empty;
@@ -38,7 +48,7 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
wire deq = metadata_deq;
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `INST_ALU_BITS + `XLEN + 1 +
`NR_BITS + (NUM_LANES * `XLEN) + (NUM_LANES * `XLEN);
`NR_BITS + (NUM_LANES * `XLEN) + (NUM_LANES * `XLEN) + (NUM_LANES * `XLEN);
VX_fifo_queue #(
.DATAW(DATAW),
.DEPTH(METADATA_QUEUE_DEPTH)
@@ -50,11 +60,11 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
.data_in({execute_if.data.uuid, execute_if.data.wid,
execute_if.data.tmask, execute_if.data.op_type, execute_if.data.PC,
execute_if.data.wb, execute_if.data.rd,
execute_if.data.rs1_data, execute_if.data.rs2_data}),
execute_if.data.rs1_data, execute_if.data.rs2_data, execute_if.data.rs3_data}),
.data_out({execute_if_data_uuid, execute_if_data_wid,
execute_if_data_tmask, execute_if_data_op_type, execute_if_data_PC,
execute_if_data_wb, execute_if_data_rd,
execute_if_data_rs1, execute_if_data_rs2}),
execute_if_data_rs1, execute_if_data_rs2, execute_if_data_rs3}),
.empty(metadata_queue_empty),
`UNUSED_PIN(alm_empty),
.full(metadata_queue_full),
@@ -75,20 +85,50 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
(execute_if_data_op_type == `INST_TENSOR_BWGMMA);
wire bwgmma_wait = metadata_valid &&
(execute_if_data_op_type == `INST_TENSOR_BWGMMA_WAIT);
wire bwgmma_initiate_valid = metadata_valid && commit_if.ready && bwgmma;
wire tcgen05_cp = metadata_valid &&
(execute_if_data_op_type == `INST_TENSOR_TCGEN05_CP);
wire tcgen05_cp_wait = metadata_valid &&
(execute_if_data_op_type == `INST_TENSOR_TCGEN05_CP_WAIT);
wire tcgen05_ld = metadata_valid &&
(execute_if_data_op_type == `INST_TENSOR_TCGEN05_LD);
wire tcgen05_st = metadata_valid &&
(execute_if_data_op_type == `INST_TENSOR_TCGEN05_ST);
wire tcgen05_cb = metadata_valid &&
(execute_if_data_op_type == `INST_TENSOR_TCGEN05_CB);
wire tensor_launch_op = bwgmma || tcgen05_cp || tcgen05_ld || tcgen05_st || tcgen05_cb;
wire tensor_wait_op = bwgmma_wait || tcgen05_cp_wait;
wire [2:0] initiate_op =
bwgmma ? 3'd0 :
bwgmma_wait ? 3'd1 :
tcgen05_cp ? 3'd2 :
tcgen05_cp_wait ? 3'd3 :
tcgen05_ld ? 3'd4 :
tcgen05_st ? 3'd5 :
tcgen05_cb ? 3'd6 :
3'd0;
wire initiate_valid = metadata_valid && tensor_launch_op && !sync_launch_pending && commit_if.ready;
wire [`NW_WIDTH-1:0] initiate_wid = execute_if_data_wid;
wire [4:0] initiate_rd = execute_if_data_rd[4:0];
wire [`XLEN-1:0] initiate_addr_a = execute_if_data_rs1[0];
wire [`XLEN-1:0] initiate_addr_b = execute_if_data_rs2[0];
wire [`XLEN-1:0] initiate_addr_c = execute_if_data_rs3[0];
TensorCoreDecoupled tensor_blackwell_core (
logic sync_launch_pending;
logic sync_launch_is_ld;
logic sync_launch_is_bwgmma;
TensorCoreBlackwell tensor_blackwell_core (
.clock(clk),
.reset(reset),
.io_initiate_ready(initiate_ready),
.io_initiate_valid(bwgmma_initiate_valid),
.io_initiate_valid(initiate_valid),
.io_initiate_bits_op(initiate_op),
.io_initiate_bits_wid(initiate_wid),
.io_initiate_bits_rd(initiate_rd),
.io_initiate_bits_addressA(initiate_addr_a),
.io_initiate_bits_addressB(initiate_addr_b),
.io_initiate_bits_addressC(initiate_addr_c),
.io_writeback_ready(writeback_ready),
.io_writeback_valid(writeback_valid),
@@ -116,44 +156,73 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
.io_reqA_ready(tmem_if.req_ready),
.io_reqA_valid(tmem_if.req_valid),
.io_reqA_bits_rw(tmem_if.req_data.rw),
.io_reqA_bits_byteen(tmem_if.req_data.byteen),
.io_reqA_bits_source(tmem_if.req_data.tag),
.io_reqA_bits_address(tmem_if.req_data.addr),
.io_reqA_bits_data(tmem_if.req_data.data),
.io_tmemC_wen(tmem_C_wen),
.io_tmemC_ren(tmem_C_ren),
.io_tmemC_waddr(tmem_C_waddr),
.io_tmemC_raddr(tmem_C_raddr),
.io_tmemC_wdata(tmem_C_wdata),
.io_tmemC_mask(tmem_C_mask),
.io_tmemC_rdata(tmem_C_rdata),
.io_reqB_ready(smem_B_if.req_ready),
.io_reqB_valid(smem_B_if.req_valid),
.io_reqB_bits_rw(smem_B_if.req_data.rw),
.io_reqB_bits_byteen(smem_B_if.req_data.byteen),
.io_reqB_bits_source(smem_B_if.req_data.tag),
.io_reqB_bits_address(smem_B_if.req_data.addr),
.io_reqB_bits_data(smem_B_if.req_data.data),
.io_reqC_valid(regfile_if.req_valid),
.io_reqC_bits(regfile_if.req_data.rs[4:0])
);
assign regfile_if.req_data.rs[5] = 1'b1;
assign regfile_if.req_data.wis = '0;
assign tmem_if.req_data.rw = 1'b0;
assign tmem_if.req_data.byteen = '1;
assign tmem_if.req_data.data = '0;
assign smem_B_if.req_data.rw = 1'b0;
assign smem_B_if.req_data.byteen = '1;
assign smem_B_if.req_data.data = '0;
logic commit_select_tensor;
logic commit_sync_done;
logic sync_done;
wire sync_writeback_done = sync_launch_pending && (sync_launch_is_ld || sync_launch_is_bwgmma) && writeback_valid;
wire sync_no_wb_done = sync_launch_pending && !(sync_launch_is_ld || sync_launch_is_bwgmma) && initiate_ready;
wire tensor_idle = initiate_ready && !writeback_valid && !sync_launch_pending;
always @(*) begin
metadata_deq = 1'b0;
commit_select_tensor = 1'b0;
writeback_ready = commit_if.ready;
commit_sync_done = 1'b0;
sync_done = 1'b0;
writeback_ready = commit_if.ready && sync_launch_pending && (sync_launch_is_ld || sync_launch_is_bwgmma);
if (metadata_valid) begin
if (bwgmma_wait) begin
writeback_ready = 1'b0;
if (sync_launch_pending) begin
if (sync_writeback_done) begin
commit_select_tensor = 1'b1;
metadata_deq = commit_if.ready;
commit_sync_done = commit_if.ready;
end else if (sync_no_wb_done) begin
commit_select_tensor = 1'b0;
metadata_deq = metadata_valid && commit_if.ready;
commit_sync_done = metadata_valid && commit_if.ready;
end
end else if (metadata_valid) begin
if (tensor_wait_op) begin
writeback_ready = 1'b0;
commit_select_tensor = 1'b0;
metadata_deq = metadata_valid && commit_if.ready && tensor_idle;
end else if (bwgmma) begin
commit_select_tensor = !initiate_ready;
metadata_deq = metadata_valid && commit_if.ready && initiate_ready;
writeback_ready = 1'b0;
commit_select_tensor = 1'b0;
sync_done = initiate_valid && initiate_ready;
metadata_deq = 1'b0;
end else if (tcgen05_cp || tcgen05_ld || tcgen05_st || tcgen05_cb) begin
writeback_ready = 1'b0;
commit_select_tensor = 1'b0;
sync_done = initiate_valid && initiate_ready;
// suppress commit until sync_launch_pending is set next cycle
metadata_deq = 1'b0;
end else begin
// TCGEN05 ops are recognized by decode so software can build
// against the Blackwell surface. Phase 1 compute integration
// only routes BWGMMA through the decoupled tensor datapath.
commit_select_tensor = 1'b0;
metadata_deq = metadata_valid && commit_if.ready;
end
@@ -162,18 +231,22 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
end
if (commit_select_tensor) begin
commit_if.valid = writeback_valid;
commit_if.data.uuid = '0;
commit_if.data.wid = writeback_wid;
commit_if.data.tmask = {NUM_LANES{1'b1}};
commit_if.data.PC = '0;
commit_if.data.wb = 1'b1;
commit_if.data.rd = (`NR_BITS'(`NUM_IREGS) + {1'b0, writeback_rd});
commit_if.valid = sync_writeback_done;
commit_if.data.uuid = sync_launch_is_bwgmma ? '0 : execute_if_data_uuid;
commit_if.data.wid = sync_launch_is_bwgmma ? writeback_wid : execute_if_data_wid;
commit_if.data.tmask = sync_launch_is_bwgmma ? {NUM_LANES{1'b1}} : execute_if_data_tmask;
commit_if.data.PC = sync_launch_is_bwgmma ? '0 : execute_if_data_PC;
commit_if.data.wb = sync_launch_is_ld;
commit_if.data.rd = sync_launch_is_ld ? execute_if_data_rd : '0;
commit_if.data.data = writeback_data;
commit_if.data.tensor = 1'b1;
commit_if.data.eop = writeback_last;
commit_if.data.tensor = sync_launch_is_bwgmma;
commit_if.data.pid = 1'b0;
commit_if.data.sop = 1'b1;
commit_if.data.eop = sync_launch_is_ld ? 1'b1 : writeback_last;
end else begin
commit_if.valid = metadata_valid;
commit_if.valid = (sync_no_wb_done && metadata_valid) ||
(metadata_valid && !sync_launch_pending && !tensor_launch_op &&
(!tensor_wait_op || tensor_idle));
commit_if.data.uuid = execute_if_data_uuid;
commit_if.data.wid = execute_if_data_wid;
commit_if.data.tmask = execute_if_data_tmask;
@@ -187,6 +260,57 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
commit_if.data.eop = 1'b1;
end
end
always @(posedge clk) begin
if (reset) begin
sync_launch_pending <= 1'b0;
sync_launch_is_ld <= 1'b0;
sync_launch_is_bwgmma <= 1'b0;
end else begin
if (sync_done) begin
sync_launch_pending <= 1'b1;
sync_launch_is_ld <= tcgen05_ld;
sync_launch_is_bwgmma <= bwgmma;
end else if (commit_sync_done) begin
sync_launch_pending <= 1'b0;
sync_launch_is_ld <= 1'b0;
sync_launch_is_bwgmma <= 1'b0;
end
end
end
`ifdef SIMULATION
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
always @(posedge clk) begin
if (!reset && ($time > `TRACE_STARTTIME)) begin
if (operand_enq_fire) begin
`TRACE(2, ("%d: tensor-blackwell-enq: wid=%0d, PC=0x%0h, op=0x%0h, uuid=%0d, queue_full=%b\n",
$time, execute_if.data.wid, execute_if.data.PC, execute_if.data.op_type,
execute_if.data.uuid, metadata_queue_full));
end
if (metadata_valid) begin
`TRACE(2, ("%d: tensor-blackwell-meta: wid=%0d, PC=0x%0h, op=0x%0h, deq=%b, commit_valid=%b, commit_ready=%b, initiate_valid=%b, initiate_ready=%b, writeback_valid=%b, writeback_ready=%b, sync_pending=%b, sync_is_ld=%b, sync_done=%b, commit_sync_done=%b, queue_full=%b\n",
$time, execute_if_data_wid, execute_if_data_PC, execute_if_data_op_type,
metadata_deq, commit_if.valid, commit_if.ready, initiate_valid, initiate_ready,
writeback_valid, writeback_ready, sync_launch_pending, sync_launch_is_ld,
sync_done, commit_sync_done, metadata_queue_full));
end
if (writeback_valid || initiate_valid) begin
`TRACE(2, ("%d: tensor-blackwell-handshake: initiate=%b/%b, op=%0d, addrA=0x%0h, addrB=0x%0h, writeback=%b/%b, wb_wid=%0d, wb_rd=%0d, wb_last=%b\n",
$time, initiate_valid, initiate_ready, initiate_op, initiate_addr_a, initiate_addr_b,
writeback_valid, writeback_ready, writeback_wid, writeback_rd, writeback_last));
end
if (tmem_if.req_valid || tmem_C_wen || tmem_C_ren || smem_B_if.req_valid || tmem_if.rsp_valid || smem_B_if.rsp_valid) begin
`TRACE(2, ("%d: tensor-blackwell-mem: tmemA_req=%b/%b tmemA_rw=%b tmemA_addr=0x%0h tmemA_rsp=%b/%b, tmemC_wen=%b tmemC_ren=%b tmemC_waddr=0x%0h tmemC_raddr=0x%0h, smemB_req=%b/%b smemB_rw=%b smemB_addr=0x%0h smemB_rsp=%b/%b, initiate_ready=%b\n",
$time, tmem_if.req_valid, tmem_if.req_ready, tmem_if.req_data.rw, tmem_if.req_data.addr, tmem_if.rsp_valid, tmem_if.rsp_ready,
tmem_C_wen, tmem_C_ren, tmem_C_waddr, tmem_C_raddr,
smem_B_if.req_valid, smem_B_if.req_ready, smem_B_if.req_data.rw, smem_B_if.req_data.addr, smem_B_if.rsp_valid, smem_B_if.rsp_ready,
initiate_ready));
end
end
end
`endif
`endif
endmodule
`endif

View File

@@ -11,6 +11,13 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
`ifdef EXT_T_ASYNC
VX_tc_rf_if.master regfile_if,
VX_tc_bus_if.master smem_A_if,
output logic tmem_C_wen,
output logic tmem_C_ren,
output logic [8:0] tmem_C_waddr,
output logic [8:0] tmem_C_raddr,
output logic [`NUM_THREADS*`XLEN-1:0] tmem_C_wdata,
output logic [`NUM_THREADS*`XLEN/8-1:0] tmem_C_mask,
input logic [`NUM_THREADS*`XLEN-1:0] tmem_C_rdata,
VX_tc_bus_if.master smem_B_if,
`endif
VX_commit_if.master commit_if [`ISSUE_WIDTH]
@@ -57,6 +64,12 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
`ifdef EXT_T_HOPPER
assign tmem_C_wen = 1'b0;
assign tmem_C_ren = 1'b0;
assign tmem_C_waddr = '0;
assign tmem_C_raddr = '0;
assign tmem_C_wdata = '0;
assign tmem_C_mask = '0;
VX_tensor_hopper_core_block #(
.ISW(1), // FIXME: not block_idx
.FP16(FP16)
@@ -79,6 +92,13 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
.execute_if (execute_if[block_idx]),
.regfile_if (regfile_if),
.tmem_if (smem_A_if),
.tmem_C_wen (tmem_C_wen),
.tmem_C_ren (tmem_C_ren),
.tmem_C_waddr(tmem_C_waddr),
.tmem_C_raddr(tmem_C_raddr),
.tmem_C_wdata(tmem_C_wdata),
.tmem_C_mask(tmem_C_mask),
.tmem_C_rdata(tmem_C_rdata),
.smem_B_if (smem_B_if),
.commit_if (commit_block_if[block_idx])
);

View File

@@ -22,6 +22,9 @@ task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type);
`EX_LSU: `TRACE(level, ("LSU"));
`EX_FPU: `TRACE(level, ("FPU"));
`EX_SFU: `TRACE(level, ("SFU"));
`ifdef EXT_T_ENABLE
`EX_TENSOR: `TRACE(level, ("TENSOR"));
`endif
default: `TRACE(level, ("?"));
endcase
endtask
@@ -36,22 +39,26 @@ task trace_ex_op(input int level,
`UNUSED_ARG(input [`XLEN-1:0] imm)
);
logic fdst_d;
logic fcvt_l;
logic rd_float;
`ifdef FLEN_64
logic fdst_d = imm[0];
fdst_d = imm[0];
`else
logic fdst_d = 0;
fdst_d = 0;
`endif
`ifdef XLEN_64
logic fcvt_l = imm[1];
fcvt_l = imm[1];
`else
logic fcvt_l = 0;
fcvt_l = 0;
`endif
`ifdef EXT_F_ENABLE
logic rd_float = 1'(rd >> 5) || 1'(rs2 >> 5);
rd_float = 1'(rd >> 5) || 1'(rs2 >> 5);
`else
logic rd_float = 0;
rd_float = 0;
`endif
case (ex_type)
@@ -359,6 +366,22 @@ task trace_ex_op(input int level,
default: `TRACE(level, ("?"));
endcase
end
`ifdef EXT_T_ENABLE
`EX_TENSOR: begin
case (`INST_ALU_BITS'(op_type))
`INST_TENSOR_HMMA: `TRACE(level, ("HMMA"));
`INST_TENSOR_HGMMA: `TRACE(level, ("HGMMA"));
`INST_TENSOR_HGMMA_WAIT: `TRACE(level, ("HGMMA_WAIT"));
`INST_TENSOR_TCGEN05_CP: `TRACE(level, ("TCGEN05_CP"));
`INST_TENSOR_TCGEN05_CP_WAIT: `TRACE(level, ("TCGEN05_CP_WAIT"));
`INST_TENSOR_BWGMMA: `TRACE(level, ("BWGMMA"));
`INST_TENSOR_BWGMMA_WAIT: `TRACE(level, ("BWGMMA_WAIT"));
`INST_TENSOR_TCGEN05_LD: `TRACE(level, ("TCGEN05_LD"));
`INST_TENSOR_TCGEN05_ST: `TRACE(level, ("TCGEN05_ST"));
default: `TRACE(level, ("?"));
endcase
end
`endif
default: `TRACE(level, ("?"));
endcase
endtask