diff --git a/hw/rtl/VX_core_wrapper.sv b/hw/rtl/VX_core_wrapper.sv index 77baecb4..b6bd737d 100644 --- a/hw/rtl/VX_core_wrapper.sv +++ b/hw/rtl/VX_core_wrapper.sv @@ -5,7 +5,7 @@ module Vortex import VX_gpu_pkg::*; #( parameter CORE_ID = 0, parameter TENSOR_FP16 = 0, - parameter BOOTROM_HANG100 = 32'h10100, + parameter logic [63:0] STARTUP_ADDR = 64'h0000_0000_0001_0100, parameter NUM_THREADS = 0, parameter TC_DATA_WIDTH = 256, parameter TC_TAG_WIDTH = 4 @@ -77,17 +77,26 @@ module Vortex import VX_gpu_pkg::*; #( output [(DCACHE_NUM_REQS * 32) - 1:0] smem_a_bits_data, // tc -------------------------------------------------- - input [1:0] tc_a_ready, - output [1:0] tc_a_valid, - output [1:0] tc_a_bits_write, - output [63:0] tc_a_bits_address, - output [2 * TC_TAG_WIDTH - 1:0] tc_a_bits_tag, - output [2 * 32 - 1:0] tc_a_bits_mask, - output [2 * TC_DATA_WIDTH - 1:0] tc_a_bits_data, - output [1:0] tc_d_ready, - input [1:0] tc_d_valid, - input [2 * TC_DATA_WIDTH - 1:0] tc_d_bits_data, - input [2 * TC_TAG_WIDTH - 1:0] tc_d_bits_tag, + input [2:0] tc_a_ready, + output [2:0] tc_a_valid, + output [2:0] tc_a_bits_write, + output [95:0] tc_a_bits_address, + output [3 * TC_TAG_WIDTH - 1:0] tc_a_bits_tag, + output [3 * 32 - 1:0] tc_a_bits_mask, + output [3 * TC_DATA_WIDTH - 1:0] tc_a_bits_data, + output [2:0] tc_d_ready, + input [2:0] tc_d_valid, + input [3 * TC_DATA_WIDTH - 1:0] tc_d_bits_data, + input [3 * TC_TAG_WIDTH - 1:0] tc_d_bits_tag, + + // tmem_C direct SRAM port + output tc_tmem_C_wen, + output tc_tmem_C_ren, + output [8:0] tc_tmem_C_waddr, + output [8:0] tc_tmem_C_raddr, + output [`NUM_THREADS*`XLEN-1:0] tc_tmem_C_wdata, + output [`NUM_THREADS*`XLEN/8-1:0] tc_tmem_C_mask, + input [`NUM_THREADS*`XLEN-1:0] tc_tmem_C_rdata, // gbar ------------------------------------------------ @@ -306,22 +315,23 @@ module Vortex import VX_gpu_pkg::*; #( // tc --------------------------------------------------------------------- VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p0_bus_if(); - VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p1_bus_if(); - assign tc_a_valid = {tc_p1_bus_if.req_valid, tc_p0_bus_if.req_valid}; - assign tc_a_bits_write = {tc_p1_bus_if.req_data.rw, tc_p0_bus_if.req_data.rw}; - assign tc_a_bits_address = {tc_p1_bus_if.req_data.addr, tc_p0_bus_if.req_data.addr}; - assign tc_a_bits_tag = {tc_p1_bus_if.req_data.tag, tc_p0_bus_if.req_data.tag}; - assign tc_a_bits_mask = {tc_p1_bus_if.req_data.byteen, tc_p0_bus_if.req_data.byteen}; - assign tc_a_bits_data = {tc_p1_bus_if.req_data.data, tc_p0_bus_if.req_data.data}; + VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p2_bus_if(); + // tc_p1 (tmem_C) is now a direct SRAM port exposed as top-level ports tc_tmem_C_* + assign tc_a_valid = {tc_p2_bus_if.req_valid, 1'b0, tc_p0_bus_if.req_valid}; + assign tc_a_bits_write = {tc_p2_bus_if.req_data.rw, 1'b0, tc_p0_bus_if.req_data.rw}; + assign tc_a_bits_address = {tc_p2_bus_if.req_data.addr, 32'b0, tc_p0_bus_if.req_data.addr}; + assign tc_a_bits_tag = {tc_p2_bus_if.req_data.tag, {TC_TAG_WIDTH{1'b0}}, tc_p0_bus_if.req_data.tag}; + assign tc_a_bits_mask = {tc_p2_bus_if.req_data.byteen, {(TC_DATA_WIDTH/8){1'b0}},tc_p0_bus_if.req_data.byteen}; + assign tc_a_bits_data = {tc_p2_bus_if.req_data.data, {TC_DATA_WIDTH{1'b0}}, tc_p0_bus_if.req_data.data}; assign tc_p0_bus_if.req_ready = tc_a_ready[0]; assign tc_p0_bus_if.rsp_valid = tc_d_valid[0]; assign tc_p0_bus_if.rsp_data.data = tc_d_bits_data[0 * TC_DATA_WIDTH +: TC_DATA_WIDTH]; assign tc_p0_bus_if.rsp_data.tag = tc_d_bits_tag[0 * TC_TAG_WIDTH +: TC_TAG_WIDTH]; - assign tc_p1_bus_if.req_ready = tc_a_ready[1]; - assign tc_p1_bus_if.rsp_valid = tc_d_valid[1]; - assign tc_p1_bus_if.rsp_data.data = tc_d_bits_data[1 * TC_DATA_WIDTH +: TC_DATA_WIDTH]; - assign tc_p1_bus_if.rsp_data.tag = tc_d_bits_tag[1 * TC_TAG_WIDTH +: TC_TAG_WIDTH]; - assign tc_d_ready = {tc_p1_bus_if.rsp_ready, tc_p0_bus_if.rsp_ready}; + assign tc_p2_bus_if.req_ready = tc_a_ready[2]; + assign tc_p2_bus_if.rsp_valid = tc_d_valid[2]; + assign tc_p2_bus_if.rsp_data.data = tc_d_bits_data[2 * TC_DATA_WIDTH +: TC_DATA_WIDTH]; + assign tc_p2_bus_if.rsp_data.tag = tc_d_bits_tag[2 * TC_TAG_WIDTH +: TC_TAG_WIDTH]; + assign tc_d_ready = {tc_p2_bus_if.rsp_ready, 1'b0, tc_p0_bus_if.rsp_ready}; // gbar ------------------------------------------------------------------- `ifdef GBAR_ENABLE @@ -395,15 +405,14 @@ module Vortex import VX_gpu_pkg::*; #( dcr_write_valid = 1'b1; dcr_write_addr = `VX_DCR_BASE_STARTUP_ADDR0; - dcr_write_data = BOOTROM_HANG100; + dcr_write_data = STARTUP_ADDR[31:0]; end `VX_DCR_BASE_STARTUP_ADDR1: begin dcr_state_n = `VX_DCR_BASE_MPM_CLASS; dcr_write_valid = 1'b1; dcr_write_addr = `VX_DCR_BASE_STARTUP_ADDR1; - // FIXME: not sure what this does - dcr_write_data = `VX_DCR_DATA_WIDTH'h0; + dcr_write_data = STARTUP_ADDR[63:32]; end `VX_DCR_BASE_MPM_CLASS: begin dcr_state_n = `VX_DCR_BASE_STATE_END; @@ -455,7 +464,25 @@ module Vortex import VX_gpu_pkg::*; #( `endif .tensor_smem_A_if (tc_p0_bus_if), - .tensor_smem_B_if (tc_p1_bus_if), + `ifdef EXT_T_BLACKWELL + .tensor_tmem_C_wen(tc_tmem_C_wen), + .tensor_tmem_C_ren(tc_tmem_C_ren), + .tensor_tmem_C_waddr(tc_tmem_C_waddr), + .tensor_tmem_C_raddr(tc_tmem_C_raddr), + .tensor_tmem_C_wdata(tc_tmem_C_wdata), + .tensor_tmem_C_mask(tc_tmem_C_mask), + .tensor_tmem_C_rdata(tc_tmem_C_rdata), + .tensor_smem_B_if (tc_p2_bus_if), + `else + .tensor_tmem_C_wen(tc_tmem_C_wen), + .tensor_tmem_C_ren(tc_tmem_C_ren), + .tensor_tmem_C_waddr(tc_tmem_C_waddr), + .tensor_tmem_C_raddr(tc_tmem_C_raddr), + .tensor_tmem_C_wdata(tc_tmem_C_wdata), + .tensor_tmem_C_mask(tc_tmem_C_mask), + .tensor_tmem_C_rdata(tc_tmem_C_rdata), + .tensor_smem_B_if (tc_p2_bus_if), + `endif .sim_ebreak (sim_ebreak), .sim_wb_value (sim_wb_value), @@ -577,7 +604,3 @@ module Vortex import VX_gpu_pkg::*; #( `endif endmodule : Vortex - - - - diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index 2fe430c4..ada8fead 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -254,24 +254,25 @@ `define INST_SFU_IS_WCTL(op) (op <= 5) `define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8) -`define INST_TENSOR_HMMA 4'b0000 -// Hopper WGMMA-style asynchronous op -`define INST_TENSOR_HGMMA 4'b0001 -`define INST_TENSOR_HGMMA_WAIT 4'b0010 -`define INST_TENSOR_TCGEN05_CP 4'b0011 -`define INST_TENSOR_TCGEN05_CP_WAIT 4'b0100 -`define INST_TENSOR_BWGMMA 4'b0101 -`define INST_TENSOR_BWGMMA_WAIT 4'b0110 -`define INST_TENSOR_TCGEN05_LD 4'b0111 -`define INST_TENSOR_TCGEN05_ST 4'b1000 - -`ifdef EXT_T_HOPPER -`define EXT_T_ASYNC -`elsif EXT_T_BLACKWELL -`define EXT_T_ASYNC -`endif - -/////////////////////////////////////////////////////////////////////////////// +`define INST_TENSOR_HMMA 4'b0000 +// Hopper WGMMA-style asynchronous op +`define INST_TENSOR_HGMMA 4'b0001 +`define INST_TENSOR_HGMMA_WAIT 4'b0010 +`define INST_TENSOR_TCGEN05_CP 4'b0011 +`define INST_TENSOR_TCGEN05_CP_WAIT 4'b0100 +`define INST_TENSOR_BWGMMA 4'b0101 +`define INST_TENSOR_BWGMMA_WAIT 4'b0110 +`define INST_TENSOR_TCGEN05_LD 4'b0111 +`define INST_TENSOR_TCGEN05_ST 4'b1000 +`define INST_TENSOR_TCGEN05_CB 4'b1001 + +`ifdef EXT_T_HOPPER +`define EXT_T_ASYNC +`elsif EXT_T_BLACKWELL +`define EXT_T_ASYNC +`endif + +/////////////////////////////////////////////////////////////////////////////// // non-cacheable tag bits `define NC_TAG_BITS 1 diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh index 0278dda6..11401cf7 100644 --- a/hw/rtl/VX_types.vh +++ b/hw/rtl/VX_types.vh @@ -179,6 +179,7 @@ `define VX_CSR_CORE_ID 12'hCC2 `define VX_CSR_WARP_MASK 12'hCC3 `define VX_CSR_THREAD_MASK 12'hCC4 // warning! this value is also used in LLVM +`define VX_CSR_GCID 12'hCC5 // legacy global core id alias used by Radiance bootrom `define VX_CSR_NUM_THREADS 12'hFC0 `define VX_CSR_NUM_WARPS 12'hFC1 diff --git a/hw/rtl/core/VX_commit.sv b/hw/rtl/core/VX_commit.sv index cf4d92b4..b2676be7 100644 --- a/hw/rtl/core/VX_commit.sv +++ b/hw/rtl/core/VX_commit.sv @@ -208,7 +208,7 @@ module VX_commit import VX_gpu_pkg::*; #( // Writeback for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin - assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb; + assign writeback_if[i].valid = commit_if[i].valid && (commit_if[i].data.wb || commit_if[i].data.tensor); assign writeback_if[i].data.uuid = commit_if[i].data.uuid; assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid); assign writeback_if[i].data.PC = commit_if[i].data.PC; @@ -224,7 +224,7 @@ module VX_commit import VX_gpu_pkg::*; #( // simulation helper signal to get RISC-V tests Pass/Fail status reg [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value_r; always @(posedge clk) begin - if (writeback_if[0].valid) begin + if (writeback_if[0].valid && !writeback_if[0].data.tensor) begin sim_wb_value_r[writeback_if[0].data.rd] <= writeback_if[0].data.data[0]; end end diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index 392955d5..2f8af2b4 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -40,6 +40,14 @@ module VX_core import VX_gpu_pkg::*; #( VX_mem_bus_if.master icache_bus_if, VX_tc_bus_if.master tensor_smem_A_if, + // tensor_tmem_C is now a direct SRAM port + output logic tensor_tmem_C_wen, + output logic tensor_tmem_C_ren, + output logic [8:0] tensor_tmem_C_waddr, + output logic [8:0] tensor_tmem_C_raddr, + output logic [`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_wdata, + output logic [`NUM_THREADS*`XLEN/8-1:0] tensor_tmem_C_mask, + input logic [`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_rdata, VX_tc_bus_if.master tensor_smem_B_if, `ifdef GBAR_ENABLE @@ -63,9 +71,9 @@ module VX_core import VX_gpu_pkg::*; #( VX_decode_if decode_if(); VX_sched_csr_if sched_csr_if(); VX_decode_sched_if decode_sched_if(); -`ifdef EXT_T_ASYNC - VX_tc_rf_if tensor_regfile_if(); -`endif +`ifdef EXT_T_ASYNC + VX_tc_rf_if tensor_regfile_if(); +`endif VX_commit_sched_if commit_sched_if(); VX_commit_csr_if commit_csr_if(); VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS](); @@ -193,9 +201,9 @@ module VX_core import VX_gpu_pkg::*; #( `endif `ifdef EXT_T_ENABLE .tensor_dispatch_if(tensor_dispatch_if), - `ifdef EXT_T_ASYNC - .tensor_regfile_if (tensor_regfile_if), - `endif + `ifdef EXT_T_ASYNC + .tensor_regfile_if (tensor_regfile_if), + `endif `endif .sfu_dispatch_if(sfu_dispatch_if) ); @@ -226,10 +234,17 @@ module VX_core import VX_gpu_pkg::*; #( `ifdef EXT_T_ENABLE .tensor_dispatch_if (tensor_dispatch_if), .tensor_commit_if (tensor_commit_if), - `ifdef EXT_T_ASYNC - .tensor_regfile_if (tensor_regfile_if), - .tensor_smem_A_if (tensor_smem_A_if), - .tensor_smem_B_if (tensor_smem_B_if), + `ifdef EXT_T_ASYNC + .tensor_regfile_if (tensor_regfile_if), + .tensor_smem_A_if (tensor_smem_A_if), + .tensor_tmem_C_wen (tensor_tmem_C_wen), + .tensor_tmem_C_ren (tensor_tmem_C_ren), + .tensor_tmem_C_waddr(tensor_tmem_C_waddr), + .tensor_tmem_C_raddr(tensor_tmem_C_raddr), + .tensor_tmem_C_wdata(tensor_tmem_C_wdata), + .tensor_tmem_C_mask(tensor_tmem_C_mask), + .tensor_tmem_C_rdata(tensor_tmem_C_rdata), + .tensor_smem_B_if (tensor_smem_B_if), `endif `endif diff --git a/hw/rtl/core/VX_csr_data.sv b/hw/rtl/core/VX_csr_data.sv index 7d8cc488..44ee0e70 100644 --- a/hw/rtl/core/VX_csr_data.sv +++ b/hw/rtl/core/VX_csr_data.sv @@ -163,6 +163,7 @@ import VX_fpu_pkg::*; `endif `VX_CSR_WARP_ID : read_data_ro_r = 32'(read_wid); `VX_CSR_CORE_ID : read_data_ro_r = 32'(CORE_ID); + `VX_CSR_GCID : read_data_ro_r = 32'(CORE_ID); `VX_CSR_THREAD_MASK: read_data_ro_r = 32'(thread_masks[read_wid]); `VX_CSR_WARP_MASK : read_data_ro_r = 32'(active_warps); `VX_CSR_NUM_THREADS: read_data_ro_r = 32'(`NUM_THREADS); diff --git a/hw/rtl/core/VX_decode.sv b/hw/rtl/core/VX_decode.sv index 4b16d1fa..65ca68f8 100644 --- a/hw/rtl/core/VX_decode.sv +++ b/hw/rtl/core/VX_decode.sv @@ -564,11 +564,45 @@ module VX_decode #( 3'b011: op_type = `INST_TENSOR_TCGEN05_CP_WAIT; 3'b100: op_type = `INST_TENSOR_TCGEN05_LD; 3'b101: op_type = `INST_TENSOR_TCGEN05_ST; + 3'b110: op_type = `INST_TENSOR_TCGEN05_CB; default: ; endcase - `USED_IREG (rd); - `USED_IREG (rs1); - `USED_IREG (rs2); + case (func3) + 3'b000: begin // BWGMMA: rs1=tmem_a, rs2=smem_b, rd field=tmem_c source + `USED_IREG (rs1); + `USED_IREG (rs2); + `ifdef EXT_F_ENABLE + rs3_r = {1'b0, rd}; + `else + rs3_r = rd; + `endif + use_rs3 = 1; + end + 3'b010, 3'b110: begin // TCGEN05_CP/CB: rs1=tmem, rs2=global memory address + `USED_IREG (rs1); + `USED_IREG (rs2); + end + 3'b100: begin // TCGEN05_LD: rs1=tmem, rd=FP destination + `USED_IREG (rs1); + `ifdef EXT_F_ENABLE + `USED_FREG (rd); + `else + `USED_IREG (rd); + `endif + end + 3'b101: begin // TCGEN05_ST: rs1=tmem, rd field=FP source + `USED_IREG (rs1); + `ifdef EXT_F_ENABLE + rd_r = {1'b1, rd}; + rs3_r = {1'b1, rd}; + `else + rd_r = rd; + rs3_r = rd; + `endif + use_rs3 = 1; + end + default: ; + endcase `else ex_type = `EX_TENSOR; op_type = `INST_TENSOR_HMMA; diff --git a/hw/rtl/core/VX_dispatch.sv b/hw/rtl/core/VX_dispatch.sv index 29f07a18..74872d91 100644 --- a/hw/rtl/core/VX_dispatch.sv +++ b/hw/rtl/core/VX_dispatch.sv @@ -312,6 +312,8 @@ module VX_dispatch import VX_gpu_pkg::*; #( if (operands_if[i].valid && operands_if[i].ready) begin `TRACE(1, ("%d: core%0d-issue: wid=%0d, PC=0x%0h, ex=", $time, CORE_ID, wis_to_wid(operands_if[i].data.wis, i), operands_if[i].data.PC)); trace_ex_type(1, operands_if[i].data.ex_type); + `TRACE(1, (", op=")); + trace_ex_op(1, operands_if[i].data.ex_type, operands_if[i].data.op_type, operands_if[i].data.op_mod, operands_if[i].data.rd, '0, operands_if[i].data.use_imm, operands_if[i].data.imm); `TRACE(1, (", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if[i].data.op_mod, operands_if[i].data.tmask, operands_if[i].data.wb, operands_if[i].data.rd)); `TRACE_ARRAY1D(1, operands_if[i].data.rs1_data, `NUM_THREADS); `TRACE(1, (", rs2_data=")); diff --git a/hw/rtl/core/VX_execute.sv b/hw/rtl/core/VX_execute.sv index 051bf85a..8515f098 100644 --- a/hw/rtl/core/VX_execute.sv +++ b/hw/rtl/core/VX_execute.sv @@ -61,6 +61,13 @@ module VX_execute import VX_gpu_pkg::*; #( `ifdef EXT_T_ASYNC VX_tc_rf_if.master tensor_regfile_if, VX_tc_bus_if.master tensor_smem_A_if, + output logic tensor_tmem_C_wen, + output logic tensor_tmem_C_ren, + output logic [8:0] tensor_tmem_C_waddr, + output logic [8:0] tensor_tmem_C_raddr, + output logic [`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_wdata, + output logic [`NUM_THREADS*`XLEN/8-1:0] tensor_tmem_C_mask, + input logic [`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_rdata, VX_tc_bus_if.master tensor_smem_B_if, `endif `endif @@ -159,6 +166,13 @@ module VX_execute import VX_gpu_pkg::*; #( `ifdef EXT_T_ASYNC .regfile_if(tensor_regfile_if), .smem_A_if(tensor_smem_A_if), + .tmem_C_wen(tensor_tmem_C_wen), + .tmem_C_ren(tensor_tmem_C_ren), + .tmem_C_waddr(tensor_tmem_C_waddr), + .tmem_C_raddr(tensor_tmem_C_raddr), + .tmem_C_wdata(tensor_tmem_C_wdata), + .tmem_C_mask(tensor_tmem_C_mask), + .tmem_C_rdata(tensor_tmem_C_rdata), .smem_B_if(tensor_smem_B_if), `endif .commit_if(tensor_commit_if) diff --git a/hw/rtl/core/VX_ibuffer.sv b/hw/rtl/core/VX_ibuffer.sv index 7ed5c64a..12113e92 100644 --- a/hw/rtl/core/VX_ibuffer.sv +++ b/hw/rtl/core/VX_ibuffer.sv @@ -12,6 +12,7 @@ // limitations under the License. `include "VX_define.vh" +`include "VX_trace.vh" module VX_ibuffer import VX_gpu_pkg::*; #( parameter CORE_ID = 0 @@ -36,6 +37,20 @@ module VX_ibuffer import VX_gpu_pkg::*; #( assign decode_if.ready = ibuf_ready_in[decode_isw]; +`ifdef SIMULATION +`ifdef DBG_TRACE_CORE_PIPELINE_VCS + wire decode_probe_pc = (decode_if.data.PC >= 32'h80000240) && (decode_if.data.PC <= 32'h80000260); + always @(posedge clk) begin + if (!reset && (CORE_ID == 0) && ($time > `TRACE_STARTTIME) && decode_if.valid && decode_probe_pc) begin + `TRACE(2, ("%d: core%0d-ibuffer-probe: wid=%0d, PC=0x%0h, valid=%b, ready=%b, isw=%0d, ibuf_ready=%b, tmask=%b, ex=0x%0h, op=0x%0h (#%0d)\n", + $time, CORE_ID, decode_if.data.wid, decode_if.data.PC, decode_if.valid, decode_if.ready, + decode_isw, ibuf_ready_in, decode_if.data.tmask, decode_if.data.ex_type, decode_if.data.op_type, + decode_if.data.uuid)); + end + end +`endif +`endif + VX_ibuffer_if uop_sequencer_if [`ISSUE_WIDTH](); for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin diff --git a/hw/rtl/core/VX_issue.sv b/hw/rtl/core/VX_issue.sv index 339c6b72..49a4f917 100644 --- a/hw/rtl/core/VX_issue.sv +++ b/hw/rtl/core/VX_issue.sv @@ -121,6 +121,36 @@ module VX_issue import VX_gpu_pkg::*; #( .sfu_dispatch_if(sfu_dispatch_if) ); +`ifdef SIMULATION +`ifdef DBG_TRACE_CORE_PIPELINE_VCS + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin + wire ibuf_probe_pc = ibuffer_if[i].valid + && (ibuffer_if[i].data.PC >= 32'h80000240) + && (ibuffer_if[i].data.PC <= 32'h80000260); + wire scb_probe_pc = scoreboard_if[i].valid + && (scoreboard_if[i].data.PC >= 32'h80000240) + && (scoreboard_if[i].data.PC <= 32'h80000260); + wire ops_probe_pc = operands_if[i].valid + && (operands_if[i].data.PC >= 32'h80000240) + && (operands_if[i].data.PC <= 32'h80000260); + + always @(posedge clk) begin + if (!reset && (CORE_ID == 0) && ($time > `TRACE_STARTTIME) && (ibuf_probe_pc || scb_probe_pc || ops_probe_pc)) begin + `TRACE(2, ("%d: core%0d-issue-probe: isw=%0d, ibuf=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, scb=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, ops=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, alu=%b/%b lsu=%b/%b sfu=%b/%b (#ibuf=%0d #scb=%0d #ops=%0d)\n", + $time, CORE_ID, i, + ibuffer_if[i].valid, ibuffer_if[i].ready, ibuffer_if[i].data.PC, ibuffer_if[i].data.ex_type, ibuffer_if[i].data.op_type, + scoreboard_if[i].valid, scoreboard_if[i].ready, scoreboard_if[i].data.PC, scoreboard_if[i].data.ex_type, scoreboard_if[i].data.op_type, + operands_if[i].valid, operands_if[i].ready, operands_if[i].data.PC, operands_if[i].data.ex_type, operands_if[i].data.op_type, + alu_dispatch_if[i].valid, alu_dispatch_if[i].ready, + lsu_dispatch_if[i].valid, lsu_dispatch_if[i].ready, + sfu_dispatch_if[i].valid, sfu_dispatch_if[i].ready, + ibuffer_if[i].data.uuid, scoreboard_if[i].data.uuid, operands_if[i].data.uuid)); + end + end + end +`endif +`endif + `ifdef DBG_SCOPE_ISSUE if (CORE_ID == 0) begin `ifdef SCOPE diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv index fba3f861..237b711e 100644 --- a/hw/rtl/core/VX_operands.sv +++ b/hw/rtl/core/VX_operands.sv @@ -156,7 +156,7 @@ module VX_operands import VX_gpu_pkg::*; #( end endcase - if (CACHE_ENABLE != 0 && writeback_if[i].valid) begin + if (CACHE_ENABLE != 0 && writeback_if[i].valid && !writeback_if[i].data.tensor) begin if ((cache_reg[writeback_if[i].data.wis] == writeback_if[i].data.rd) || (cache_eop[writeback_if[i].data.wis] && writeback_if[i].data.sop)) begin for (integer j = 0; j < `NUM_THREADS; ++j) begin @@ -288,9 +288,9 @@ module VX_operands import VX_gpu_pkg::*; #( .read (1'b1), `UNUSED_PIN (wren), `ifdef GPR_RESET - .write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]), + .write (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]), `else - .write (writeback_if[i].valid && writeback_if[i].data.tmask[j]), + .write (writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]), `endif .waddr (gpr_wr_addr), .wdata (writeback_if[i].data.data[j]), diff --git a/hw/rtl/core/VX_operands_dup.sv b/hw/rtl/core/VX_operands_dup.sv index a4f8ec9d..ac1c79b7 100644 --- a/hw/rtl/core/VX_operands_dup.sv +++ b/hw/rtl/core/VX_operands_dup.sv @@ -12,6 +12,7 @@ // limitations under the License. `include "VX_define.vh" +`include "VX_trace.vh" `ifdef GPR_DUPLICATED @@ -123,6 +124,46 @@ module VX_operands_dup import VX_gpu_pkg::*; #( assign scoreboard_if[i].ready = (size1[i] < 3'd2); `endif +`ifdef SIMULATION +`ifdef DBG_TRACE_CORE_PIPELINE_VCS + always @(posedge clk) begin + if (!reset && (CORE_ID == 0) && ($time > `TRACE_STARTTIME)) begin + if ((scoreboard_if[i].valid && + (scoreboard_if[i].data.PC >= `XLEN'h80000240) && + (scoreboard_if[i].data.PC <= `XLEN'h80000260)) || + (operands_if[i].valid && + (operands_if[i].data.PC >= `XLEN'h80000240) && + (operands_if[i].data.PC <= `XLEN'h80000260)) + `ifdef EXT_T_ASYNC + || tc_rf_valid[i] + `endif + ) begin +`ifdef EXT_T_ASYNC + `TRACE(2, ("%d: core%0d-operands-probe: isw=%0d, scb=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, ops=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, size1=%0d, empty1=%b, full1=%b, tc_rf_valid=%b, tc_rf_rs=%0d, tc_rf_wis=%0d (#scb=%0d #ops=%0d)\n", + $time, CORE_ID, i, + scoreboard_if[i].valid, scoreboard_if[i].ready, scoreboard_if[i].data.PC, + scoreboard_if[i].data.ex_type, scoreboard_if[i].data.op_type, + operands_if[i].valid, operands_if[i].ready, operands_if[i].data.PC, + operands_if[i].data.ex_type, operands_if[i].data.op_type, + size1[i], empty1[i], full1[i], + tc_rf_valid[i], tc_rf_addr[i], tensor_regfile_if.req_data.wis, + scoreboard_if[i].data.uuid, operands_if[i].data.uuid)); +`else + `TRACE(2, ("%d: core%0d-operands-probe: isw=%0d, scb=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, ops=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, size1=%0d, empty1=%b, full1=%b (#scb=%0d #ops=%0d)\n", + $time, CORE_ID, i, + scoreboard_if[i].valid, scoreboard_if[i].ready, scoreboard_if[i].data.PC, + scoreboard_if[i].data.ex_type, scoreboard_if[i].data.op_type, + operands_if[i].valid, operands_if[i].ready, operands_if[i].data.PC, + operands_if[i].data.ex_type, operands_if[i].data.op_type, + size1[i], empty1[i], full1[i], + scoreboard_if[i].data.uuid, operands_if[i].data.uuid)); +`endif + end + end + end +`endif +`endif + // assert (full1[i] == full2[i]); // assert (empty1[i] == empty2[i]); @@ -263,9 +304,9 @@ module VX_operands_dup import VX_gpu_pkg::*; #( .read (scoreboard_if[i].valid && scoreboard_if[i].ready), // tc read valid check incl. in ready `UNUSED_PIN (wren), `ifdef GPR_RESET - .write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]), + .write (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]), `else - .write (writeback_if[i].valid && writeback_if[i].data.tmask[j]), + .write (writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]), `endif .waddr (gpr_wr_addr), .wdata (writeback_if[i].data.data[j]), @@ -287,9 +328,9 @@ module VX_operands_dup import VX_gpu_pkg::*; #( .read (scoreboard_if[i].valid && scoreboard_if[i].ready), // tc read valid check incl. in ready `UNUSED_PIN (wren), `ifdef GPR_RESET - .write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]), + .write (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]), `else - .write (writeback_if[i].valid && writeback_if[i].data.tmask[j]), + .write (writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]), `endif .waddr (gpr_wr_addr), .wdata (writeback_if[i].data.data[j]), @@ -315,9 +356,9 @@ module VX_operands_dup import VX_gpu_pkg::*; #( `endif `UNUSED_PIN (wren), `ifdef GPR_RESET - .write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]), + .write (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]), `else - .write (writeback_if[i].valid && writeback_if[i].data.tmask[j]), + .write (writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]), `endif .waddr (gpr_wr_addr), .wdata (writeback_if[i].data.data[j]), @@ -326,9 +367,9 @@ module VX_operands_dup import VX_gpu_pkg::*; #( ); `ifdef PERF_ENABLE - assign perf_write_rs1_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]); - assign perf_write_rs2_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]); - assign perf_write_rs3_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]); + assign perf_write_rs1_per_thread[j] = (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]); + assign perf_write_rs2_per_thread[j] = (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]); + assign perf_write_rs3_per_thread[j] = (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]); `endif end diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv index 78362e88..bd680769 100644 --- a/hw/rtl/core/VX_scoreboard.sv +++ b/hw/rtl/core/VX_scoreboard.sv @@ -322,6 +322,40 @@ module VX_scoreboard import VX_gpu_pkg::*; #( end else if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin timeout_ctr <= '0; end + `ifdef EXT_T_ASYNC + `ifdef DBG_TRACE_CORE_PIPELINE_VCS + if (CORE_ID == 0 && ibuffer_if[i].valid && + (ibuffer_if[i].data.PC >= `XLEN'h80000240) && + (ibuffer_if[i].data.PC <= `XLEN'h80000260)) begin + `TRACE(2, ("%d: core%0d-scoreboard-probe: wid=%0d, PC=0x%0h, ready=%b, stg_ready=%b, operands_ready=%b, busy={rd,rs1,rs2,rs3}=%b, rd=%0d rs1=%0d rs2=%0d rs3=%0d, wb=%b, ex=0x%0h, op=0x%0h, inuse_rd=%b, inuse_rs1=%b, inuse_rs2=%b, inuse_rs3=%b, hgmma_start=%b, hgmma_wait=%b, hgmma_ready=%b, inflight=%0d, wb_fire=%b, wb_tensor=%b, wb_wid=%0d, wb_rd=%0d, wb_pc=0x%0h (#%0d)\n", + $time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, + ibuffer_if[i].ready, stg_ready_in, operands_ready, operands_busy, + ibuffer_if[i].data.rd, ibuffer_if[i].data.rs1, ibuffer_if[i].data.rs2, ibuffer_if[i].data.rs3, + ibuffer_if[i].data.wb, ibuffer_if[i].data.ex_type, ibuffer_if[i].data.op_type, + inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3, hgmma_start, hgmma_wait, hgmma_ready, + inflight_tensor[ibuffer_if[i].data.wis], writeback_fire, writeback_if[i].data.tensor, + wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.rd, writeback_if[i].data.PC, + ibuffer_if[i].data.uuid)); + end + if (ibuffer_if[i].valid && (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR))) begin + `TRACE(2, ("%d: core%0d-scoreboard-tensor: wid=%0d, PC=0x%0h, op=0x%0h, ready=%b, stg_ready=%b, operands_ready=%b, inuse=%b, inflight=%0d, hgmma_start=%b, hgmma_wait=%b, hgmma_fire_ready=%b, hgmma_wait_ready=%b (#%0d)\n", + $time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, + ibuffer_if[i].data.op_type, ibuffer_if[i].ready, stg_ready_in, operands_ready, + operands_busy, inflight_tensor[ibuffer_if[i].data.wis], hgmma_start, hgmma_wait, + hgmma_ready_for_fire, hgmma_ready_for_wait, ibuffer_if[i].data.uuid)); + end + if (tensor_issue_fire) begin + `TRACE(2, ("%d: core%0d-scoreboard-tensor-issue-count: wid=%0d, op=0x%0h, inflight_before=%0d\n", + $time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.op_type, + inflight_tensor[ibuffer_if[i].data.wis])); + end + if (tensor_writeback_fire) begin + `TRACE(2, ("%d: core%0d-scoreboard-tensor-wb-count: wid=%0d, inflight_before=%0d\n", + $time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i), + inflight_tensor[writeback_if[i].data.wis])); + end + `endif + `endif end end diff --git a/hw/rtl/core/VX_tensor_blackwell_core.sv b/hw/rtl/core/VX_tensor_blackwell_core.sv index b5a17d3d..30ee6c4a 100644 --- a/hw/rtl/core/VX_tensor_blackwell_core.sv +++ b/hw/rtl/core/VX_tensor_blackwell_core.sv @@ -1,5 +1,6 @@ `ifdef EXT_T_ENABLE `include "VX_fpu_define.vh" +`include "VX_trace.vh" module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #( parameter ISW, @@ -11,6 +12,14 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #( VX_execute_if.slave execute_if, VX_tc_rf_if.master regfile_if, VX_tc_bus_if.master tmem_if, + // tmem_C is now a direct SRAM port (no TileLink) + output logic tmem_C_wen, + output logic tmem_C_ren, + output logic [8:0] tmem_C_waddr, + output logic [8:0] tmem_C_raddr, + output logic [`NUM_THREADS*`XLEN-1:0] tmem_C_wdata, + output logic [`NUM_THREADS*`XLEN/8-1:0] tmem_C_mask, + input logic [`NUM_THREADS*`XLEN-1:0] tmem_C_rdata, VX_tc_bus_if.master smem_B_if, VX_commit_if.master commit_if ); @@ -26,6 +35,7 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #( wire [`NR_BITS-1:0] execute_if_data_rd; wire [NUM_LANES-1:0][`XLEN-1:0] execute_if_data_rs1; wire [NUM_LANES-1:0][`XLEN-1:0] execute_if_data_rs2; + wire [NUM_LANES-1:0][`XLEN-1:0] execute_if_data_rs3; wire metadata_queue_full; wire metadata_queue_empty; @@ -38,7 +48,7 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #( wire deq = metadata_deq; localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `INST_ALU_BITS + `XLEN + 1 + - `NR_BITS + (NUM_LANES * `XLEN) + (NUM_LANES * `XLEN); + `NR_BITS + (NUM_LANES * `XLEN) + (NUM_LANES * `XLEN) + (NUM_LANES * `XLEN); VX_fifo_queue #( .DATAW(DATAW), .DEPTH(METADATA_QUEUE_DEPTH) @@ -50,11 +60,11 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #( .data_in({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.op_type, execute_if.data.PC, execute_if.data.wb, execute_if.data.rd, - execute_if.data.rs1_data, execute_if.data.rs2_data}), + execute_if.data.rs1_data, execute_if.data.rs2_data, execute_if.data.rs3_data}), .data_out({execute_if_data_uuid, execute_if_data_wid, execute_if_data_tmask, execute_if_data_op_type, execute_if_data_PC, execute_if_data_wb, execute_if_data_rd, - execute_if_data_rs1, execute_if_data_rs2}), + execute_if_data_rs1, execute_if_data_rs2, execute_if_data_rs3}), .empty(metadata_queue_empty), `UNUSED_PIN(alm_empty), .full(metadata_queue_full), @@ -75,20 +85,50 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #( (execute_if_data_op_type == `INST_TENSOR_BWGMMA); wire bwgmma_wait = metadata_valid && (execute_if_data_op_type == `INST_TENSOR_BWGMMA_WAIT); - wire bwgmma_initiate_valid = metadata_valid && commit_if.ready && bwgmma; + wire tcgen05_cp = metadata_valid && + (execute_if_data_op_type == `INST_TENSOR_TCGEN05_CP); + wire tcgen05_cp_wait = metadata_valid && + (execute_if_data_op_type == `INST_TENSOR_TCGEN05_CP_WAIT); + wire tcgen05_ld = metadata_valid && + (execute_if_data_op_type == `INST_TENSOR_TCGEN05_LD); + wire tcgen05_st = metadata_valid && + (execute_if_data_op_type == `INST_TENSOR_TCGEN05_ST); + wire tcgen05_cb = metadata_valid && + (execute_if_data_op_type == `INST_TENSOR_TCGEN05_CB); + wire tensor_launch_op = bwgmma || tcgen05_cp || tcgen05_ld || tcgen05_st || tcgen05_cb; + wire tensor_wait_op = bwgmma_wait || tcgen05_cp_wait; + wire [2:0] initiate_op = + bwgmma ? 3'd0 : + bwgmma_wait ? 3'd1 : + tcgen05_cp ? 3'd2 : + tcgen05_cp_wait ? 3'd3 : + tcgen05_ld ? 3'd4 : + tcgen05_st ? 3'd5 : + tcgen05_cb ? 3'd6 : + 3'd0; + wire initiate_valid = metadata_valid && tensor_launch_op && !sync_launch_pending && commit_if.ready; wire [`NW_WIDTH-1:0] initiate_wid = execute_if_data_wid; + wire [4:0] initiate_rd = execute_if_data_rd[4:0]; wire [`XLEN-1:0] initiate_addr_a = execute_if_data_rs1[0]; wire [`XLEN-1:0] initiate_addr_b = execute_if_data_rs2[0]; + wire [`XLEN-1:0] initiate_addr_c = execute_if_data_rs3[0]; - TensorCoreDecoupled tensor_blackwell_core ( + logic sync_launch_pending; + logic sync_launch_is_ld; + logic sync_launch_is_bwgmma; + + TensorCoreBlackwell tensor_blackwell_core ( .clock(clk), .reset(reset), .io_initiate_ready(initiate_ready), - .io_initiate_valid(bwgmma_initiate_valid), + .io_initiate_valid(initiate_valid), + .io_initiate_bits_op(initiate_op), .io_initiate_bits_wid(initiate_wid), + .io_initiate_bits_rd(initiate_rd), .io_initiate_bits_addressA(initiate_addr_a), .io_initiate_bits_addressB(initiate_addr_b), + .io_initiate_bits_addressC(initiate_addr_c), .io_writeback_ready(writeback_ready), .io_writeback_valid(writeback_valid), @@ -116,44 +156,73 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #( .io_reqA_ready(tmem_if.req_ready), .io_reqA_valid(tmem_if.req_valid), + .io_reqA_bits_rw(tmem_if.req_data.rw), + .io_reqA_bits_byteen(tmem_if.req_data.byteen), .io_reqA_bits_source(tmem_if.req_data.tag), .io_reqA_bits_address(tmem_if.req_data.addr), + .io_reqA_bits_data(tmem_if.req_data.data), + .io_tmemC_wen(tmem_C_wen), + .io_tmemC_ren(tmem_C_ren), + .io_tmemC_waddr(tmem_C_waddr), + .io_tmemC_raddr(tmem_C_raddr), + .io_tmemC_wdata(tmem_C_wdata), + .io_tmemC_mask(tmem_C_mask), + .io_tmemC_rdata(tmem_C_rdata), .io_reqB_ready(smem_B_if.req_ready), .io_reqB_valid(smem_B_if.req_valid), + .io_reqB_bits_rw(smem_B_if.req_data.rw), + .io_reqB_bits_byteen(smem_B_if.req_data.byteen), .io_reqB_bits_source(smem_B_if.req_data.tag), .io_reqB_bits_address(smem_B_if.req_data.addr), + .io_reqB_bits_data(smem_B_if.req_data.data), .io_reqC_valid(regfile_if.req_valid), .io_reqC_bits(regfile_if.req_data.rs[4:0]) ); assign regfile_if.req_data.rs[5] = 1'b1; assign regfile_if.req_data.wis = '0; - assign tmem_if.req_data.rw = 1'b0; - assign tmem_if.req_data.byteen = '1; - assign tmem_if.req_data.data = '0; - assign smem_B_if.req_data.rw = 1'b0; - assign smem_B_if.req_data.byteen = '1; - assign smem_B_if.req_data.data = '0; logic commit_select_tensor; + logic commit_sync_done; + logic sync_done; + wire sync_writeback_done = sync_launch_pending && (sync_launch_is_ld || sync_launch_is_bwgmma) && writeback_valid; + wire sync_no_wb_done = sync_launch_pending && !(sync_launch_is_ld || sync_launch_is_bwgmma) && initiate_ready; + wire tensor_idle = initiate_ready && !writeback_valid && !sync_launch_pending; always @(*) begin metadata_deq = 1'b0; commit_select_tensor = 1'b0; - writeback_ready = commit_if.ready; + commit_sync_done = 1'b0; + sync_done = 1'b0; + writeback_ready = commit_if.ready && sync_launch_pending && (sync_launch_is_ld || sync_launch_is_bwgmma); - if (metadata_valid) begin - if (bwgmma_wait) begin - writeback_ready = 1'b0; + if (sync_launch_pending) begin + if (sync_writeback_done) begin + commit_select_tensor = 1'b1; + metadata_deq = commit_if.ready; + commit_sync_done = commit_if.ready; + end else if (sync_no_wb_done) begin commit_select_tensor = 1'b0; metadata_deq = metadata_valid && commit_if.ready; + commit_sync_done = metadata_valid && commit_if.ready; + end + end else if (metadata_valid) begin + if (tensor_wait_op) begin + writeback_ready = 1'b0; + commit_select_tensor = 1'b0; + metadata_deq = metadata_valid && commit_if.ready && tensor_idle; end else if (bwgmma) begin - commit_select_tensor = !initiate_ready; - metadata_deq = metadata_valid && commit_if.ready && initiate_ready; + writeback_ready = 1'b0; + commit_select_tensor = 1'b0; + sync_done = initiate_valid && initiate_ready; + metadata_deq = 1'b0; + end else if (tcgen05_cp || tcgen05_ld || tcgen05_st || tcgen05_cb) begin + writeback_ready = 1'b0; + commit_select_tensor = 1'b0; + sync_done = initiate_valid && initiate_ready; + // suppress commit until sync_launch_pending is set next cycle + metadata_deq = 1'b0; end else begin - // TCGEN05 ops are recognized by decode so software can build - // against the Blackwell surface. Phase 1 compute integration - // only routes BWGMMA through the decoupled tensor datapath. commit_select_tensor = 1'b0; metadata_deq = metadata_valid && commit_if.ready; end @@ -162,18 +231,22 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #( end if (commit_select_tensor) begin - commit_if.valid = writeback_valid; - commit_if.data.uuid = '0; - commit_if.data.wid = writeback_wid; - commit_if.data.tmask = {NUM_LANES{1'b1}}; - commit_if.data.PC = '0; - commit_if.data.wb = 1'b1; - commit_if.data.rd = (`NR_BITS'(`NUM_IREGS) + {1'b0, writeback_rd}); + commit_if.valid = sync_writeback_done; + commit_if.data.uuid = sync_launch_is_bwgmma ? '0 : execute_if_data_uuid; + commit_if.data.wid = sync_launch_is_bwgmma ? writeback_wid : execute_if_data_wid; + commit_if.data.tmask = sync_launch_is_bwgmma ? {NUM_LANES{1'b1}} : execute_if_data_tmask; + commit_if.data.PC = sync_launch_is_bwgmma ? '0 : execute_if_data_PC; + commit_if.data.wb = sync_launch_is_ld; + commit_if.data.rd = sync_launch_is_ld ? execute_if_data_rd : '0; commit_if.data.data = writeback_data; - commit_if.data.tensor = 1'b1; - commit_if.data.eop = writeback_last; + commit_if.data.tensor = sync_launch_is_bwgmma; + commit_if.data.pid = 1'b0; + commit_if.data.sop = 1'b1; + commit_if.data.eop = sync_launch_is_ld ? 1'b1 : writeback_last; end else begin - commit_if.valid = metadata_valid; + commit_if.valid = (sync_no_wb_done && metadata_valid) || + (metadata_valid && !sync_launch_pending && !tensor_launch_op && + (!tensor_wait_op || tensor_idle)); commit_if.data.uuid = execute_if_data_uuid; commit_if.data.wid = execute_if_data_wid; commit_if.data.tmask = execute_if_data_tmask; @@ -187,6 +260,57 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #( commit_if.data.eop = 1'b1; end end + + always @(posedge clk) begin + if (reset) begin + sync_launch_pending <= 1'b0; + sync_launch_is_ld <= 1'b0; + sync_launch_is_bwgmma <= 1'b0; + end else begin + if (sync_done) begin + sync_launch_pending <= 1'b1; + sync_launch_is_ld <= tcgen05_ld; + sync_launch_is_bwgmma <= bwgmma; + end else if (commit_sync_done) begin + sync_launch_pending <= 1'b0; + sync_launch_is_ld <= 1'b0; + sync_launch_is_bwgmma <= 1'b0; + end + end + end + +`ifdef SIMULATION +`ifdef DBG_TRACE_CORE_PIPELINE_VCS + always @(posedge clk) begin + if (!reset && ($time > `TRACE_STARTTIME)) begin + if (operand_enq_fire) begin + `TRACE(2, ("%d: tensor-blackwell-enq: wid=%0d, PC=0x%0h, op=0x%0h, uuid=%0d, queue_full=%b\n", + $time, execute_if.data.wid, execute_if.data.PC, execute_if.data.op_type, + execute_if.data.uuid, metadata_queue_full)); + end + if (metadata_valid) begin + `TRACE(2, ("%d: tensor-blackwell-meta: wid=%0d, PC=0x%0h, op=0x%0h, deq=%b, commit_valid=%b, commit_ready=%b, initiate_valid=%b, initiate_ready=%b, writeback_valid=%b, writeback_ready=%b, sync_pending=%b, sync_is_ld=%b, sync_done=%b, commit_sync_done=%b, queue_full=%b\n", + $time, execute_if_data_wid, execute_if_data_PC, execute_if_data_op_type, + metadata_deq, commit_if.valid, commit_if.ready, initiate_valid, initiate_ready, + writeback_valid, writeback_ready, sync_launch_pending, sync_launch_is_ld, + sync_done, commit_sync_done, metadata_queue_full)); + end + if (writeback_valid || initiate_valid) begin + `TRACE(2, ("%d: tensor-blackwell-handshake: initiate=%b/%b, op=%0d, addrA=0x%0h, addrB=0x%0h, writeback=%b/%b, wb_wid=%0d, wb_rd=%0d, wb_last=%b\n", + $time, initiate_valid, initiate_ready, initiate_op, initiate_addr_a, initiate_addr_b, + writeback_valid, writeback_ready, writeback_wid, writeback_rd, writeback_last)); + end + if (tmem_if.req_valid || tmem_C_wen || tmem_C_ren || smem_B_if.req_valid || tmem_if.rsp_valid || smem_B_if.rsp_valid) begin + `TRACE(2, ("%d: tensor-blackwell-mem: tmemA_req=%b/%b tmemA_rw=%b tmemA_addr=0x%0h tmemA_rsp=%b/%b, tmemC_wen=%b tmemC_ren=%b tmemC_waddr=0x%0h tmemC_raddr=0x%0h, smemB_req=%b/%b smemB_rw=%b smemB_addr=0x%0h smemB_rsp=%b/%b, initiate_ready=%b\n", + $time, tmem_if.req_valid, tmem_if.req_ready, tmem_if.req_data.rw, tmem_if.req_data.addr, tmem_if.rsp_valid, tmem_if.rsp_ready, + tmem_C_wen, tmem_C_ren, tmem_C_waddr, tmem_C_raddr, + smem_B_if.req_valid, smem_B_if.req_ready, smem_B_if.req_data.rw, smem_B_if.req_data.addr, smem_B_if.rsp_valid, smem_B_if.rsp_ready, + initiate_ready)); + end + end + end +`endif +`endif endmodule `endif diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 090c6e80..3b95a4f1 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -11,6 +11,13 @@ module VX_tensor_core import VX_gpu_pkg::*; #( `ifdef EXT_T_ASYNC VX_tc_rf_if.master regfile_if, VX_tc_bus_if.master smem_A_if, + output logic tmem_C_wen, + output logic tmem_C_ren, + output logic [8:0] tmem_C_waddr, + output logic [8:0] tmem_C_raddr, + output logic [`NUM_THREADS*`XLEN-1:0] tmem_C_wdata, + output logic [`NUM_THREADS*`XLEN/8-1:0] tmem_C_mask, + input logic [`NUM_THREADS*`XLEN-1:0] tmem_C_rdata, VX_tc_bus_if.master smem_B_if, `endif VX_commit_if.master commit_if [`ISSUE_WIDTH] @@ -57,6 +64,12 @@ module VX_tensor_core import VX_gpu_pkg::*; #( for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin `ifdef EXT_T_HOPPER + assign tmem_C_wen = 1'b0; + assign tmem_C_ren = 1'b0; + assign tmem_C_waddr = '0; + assign tmem_C_raddr = '0; + assign tmem_C_wdata = '0; + assign tmem_C_mask = '0; VX_tensor_hopper_core_block #( .ISW(1), // FIXME: not block_idx .FP16(FP16) @@ -79,6 +92,13 @@ module VX_tensor_core import VX_gpu_pkg::*; #( .execute_if (execute_if[block_idx]), .regfile_if (regfile_if), .tmem_if (smem_A_if), + .tmem_C_wen (tmem_C_wen), + .tmem_C_ren (tmem_C_ren), + .tmem_C_waddr(tmem_C_waddr), + .tmem_C_raddr(tmem_C_raddr), + .tmem_C_wdata(tmem_C_wdata), + .tmem_C_mask(tmem_C_mask), + .tmem_C_rdata(tmem_C_rdata), .smem_B_if (smem_B_if), .commit_if (commit_block_if[block_idx]) ); diff --git a/hw/rtl/core/VX_trace.vh b/hw/rtl/core/VX_trace.vh index ff2b3bb9..de0775ef 100644 --- a/hw/rtl/core/VX_trace.vh +++ b/hw/rtl/core/VX_trace.vh @@ -22,6 +22,9 @@ task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type); `EX_LSU: `TRACE(level, ("LSU")); `EX_FPU: `TRACE(level, ("FPU")); `EX_SFU: `TRACE(level, ("SFU")); + `ifdef EXT_T_ENABLE + `EX_TENSOR: `TRACE(level, ("TENSOR")); + `endif default: `TRACE(level, ("?")); endcase endtask @@ -36,22 +39,26 @@ task trace_ex_op(input int level, `UNUSED_ARG(input [`XLEN-1:0] imm) ); + logic fdst_d; + logic fcvt_l; + logic rd_float; + `ifdef FLEN_64 - logic fdst_d = imm[0]; + fdst_d = imm[0]; `else - logic fdst_d = 0; + fdst_d = 0; `endif `ifdef XLEN_64 - logic fcvt_l = imm[1]; + fcvt_l = imm[1]; `else - logic fcvt_l = 0; + fcvt_l = 0; `endif `ifdef EXT_F_ENABLE - logic rd_float = 1'(rd >> 5) || 1'(rs2 >> 5); + rd_float = 1'(rd >> 5) || 1'(rs2 >> 5); `else - logic rd_float = 0; + rd_float = 0; `endif case (ex_type) @@ -359,6 +366,22 @@ task trace_ex_op(input int level, default: `TRACE(level, ("?")); endcase end +`ifdef EXT_T_ENABLE + `EX_TENSOR: begin + case (`INST_ALU_BITS'(op_type)) + `INST_TENSOR_HMMA: `TRACE(level, ("HMMA")); + `INST_TENSOR_HGMMA: `TRACE(level, ("HGMMA")); + `INST_TENSOR_HGMMA_WAIT: `TRACE(level, ("HGMMA_WAIT")); + `INST_TENSOR_TCGEN05_CP: `TRACE(level, ("TCGEN05_CP")); + `INST_TENSOR_TCGEN05_CP_WAIT: `TRACE(level, ("TCGEN05_CP_WAIT")); + `INST_TENSOR_BWGMMA: `TRACE(level, ("BWGMMA")); + `INST_TENSOR_BWGMMA_WAIT: `TRACE(level, ("BWGMMA_WAIT")); + `INST_TENSOR_TCGEN05_LD: `TRACE(level, ("TCGEN05_LD")); + `INST_TENSOR_TCGEN05_ST: `TRACE(level, ("TCGEN05_ST")); + default: `TRACE(level, ("?")); + endcase + end +`endif default: `TRACE(level, ("?")); endcase endtask