Update Vortex core for Blackwell tensor instructions
- Add Blackwell tensor core support in VX_tensor_blackwell_core.sv - Update decode, execute, and dispatch logic for new instructions - Extend VX_define.vh and VX_types.vh with Blackwell ISA definitions
This commit is contained in:
@@ -5,7 +5,7 @@
|
||||
module Vortex import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0,
|
||||
parameter TENSOR_FP16 = 0,
|
||||
parameter BOOTROM_HANG100 = 32'h10100,
|
||||
parameter logic [63:0] STARTUP_ADDR = 64'h0000_0000_0001_0100,
|
||||
parameter NUM_THREADS = 0,
|
||||
parameter TC_DATA_WIDTH = 256,
|
||||
parameter TC_TAG_WIDTH = 4
|
||||
@@ -77,17 +77,26 @@ module Vortex import VX_gpu_pkg::*; #(
|
||||
output [(DCACHE_NUM_REQS * 32) - 1:0] smem_a_bits_data,
|
||||
|
||||
// tc --------------------------------------------------
|
||||
input [1:0] tc_a_ready,
|
||||
output [1:0] tc_a_valid,
|
||||
output [1:0] tc_a_bits_write,
|
||||
output [63:0] tc_a_bits_address,
|
||||
output [2 * TC_TAG_WIDTH - 1:0] tc_a_bits_tag,
|
||||
output [2 * 32 - 1:0] tc_a_bits_mask,
|
||||
output [2 * TC_DATA_WIDTH - 1:0] tc_a_bits_data,
|
||||
output [1:0] tc_d_ready,
|
||||
input [1:0] tc_d_valid,
|
||||
input [2 * TC_DATA_WIDTH - 1:0] tc_d_bits_data,
|
||||
input [2 * TC_TAG_WIDTH - 1:0] tc_d_bits_tag,
|
||||
input [2:0] tc_a_ready,
|
||||
output [2:0] tc_a_valid,
|
||||
output [2:0] tc_a_bits_write,
|
||||
output [95:0] tc_a_bits_address,
|
||||
output [3 * TC_TAG_WIDTH - 1:0] tc_a_bits_tag,
|
||||
output [3 * 32 - 1:0] tc_a_bits_mask,
|
||||
output [3 * TC_DATA_WIDTH - 1:0] tc_a_bits_data,
|
||||
output [2:0] tc_d_ready,
|
||||
input [2:0] tc_d_valid,
|
||||
input [3 * TC_DATA_WIDTH - 1:0] tc_d_bits_data,
|
||||
input [3 * TC_TAG_WIDTH - 1:0] tc_d_bits_tag,
|
||||
|
||||
// tmem_C direct SRAM port
|
||||
output tc_tmem_C_wen,
|
||||
output tc_tmem_C_ren,
|
||||
output [8:0] tc_tmem_C_waddr,
|
||||
output [8:0] tc_tmem_C_raddr,
|
||||
output [`NUM_THREADS*`XLEN-1:0] tc_tmem_C_wdata,
|
||||
output [`NUM_THREADS*`XLEN/8-1:0] tc_tmem_C_mask,
|
||||
input [`NUM_THREADS*`XLEN-1:0] tc_tmem_C_rdata,
|
||||
|
||||
// gbar ------------------------------------------------
|
||||
|
||||
@@ -306,22 +315,23 @@ module Vortex import VX_gpu_pkg::*; #(
|
||||
|
||||
// tc ---------------------------------------------------------------------
|
||||
VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p0_bus_if();
|
||||
VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p1_bus_if();
|
||||
assign tc_a_valid = {tc_p1_bus_if.req_valid, tc_p0_bus_if.req_valid};
|
||||
assign tc_a_bits_write = {tc_p1_bus_if.req_data.rw, tc_p0_bus_if.req_data.rw};
|
||||
assign tc_a_bits_address = {tc_p1_bus_if.req_data.addr, tc_p0_bus_if.req_data.addr};
|
||||
assign tc_a_bits_tag = {tc_p1_bus_if.req_data.tag, tc_p0_bus_if.req_data.tag};
|
||||
assign tc_a_bits_mask = {tc_p1_bus_if.req_data.byteen, tc_p0_bus_if.req_data.byteen};
|
||||
assign tc_a_bits_data = {tc_p1_bus_if.req_data.data, tc_p0_bus_if.req_data.data};
|
||||
VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p2_bus_if();
|
||||
// tc_p1 (tmem_C) is now a direct SRAM port exposed as top-level ports tc_tmem_C_*
|
||||
assign tc_a_valid = {tc_p2_bus_if.req_valid, 1'b0, tc_p0_bus_if.req_valid};
|
||||
assign tc_a_bits_write = {tc_p2_bus_if.req_data.rw, 1'b0, tc_p0_bus_if.req_data.rw};
|
||||
assign tc_a_bits_address = {tc_p2_bus_if.req_data.addr, 32'b0, tc_p0_bus_if.req_data.addr};
|
||||
assign tc_a_bits_tag = {tc_p2_bus_if.req_data.tag, {TC_TAG_WIDTH{1'b0}}, tc_p0_bus_if.req_data.tag};
|
||||
assign tc_a_bits_mask = {tc_p2_bus_if.req_data.byteen, {(TC_DATA_WIDTH/8){1'b0}},tc_p0_bus_if.req_data.byteen};
|
||||
assign tc_a_bits_data = {tc_p2_bus_if.req_data.data, {TC_DATA_WIDTH{1'b0}}, tc_p0_bus_if.req_data.data};
|
||||
assign tc_p0_bus_if.req_ready = tc_a_ready[0];
|
||||
assign tc_p0_bus_if.rsp_valid = tc_d_valid[0];
|
||||
assign tc_p0_bus_if.rsp_data.data = tc_d_bits_data[0 * TC_DATA_WIDTH +: TC_DATA_WIDTH];
|
||||
assign tc_p0_bus_if.rsp_data.tag = tc_d_bits_tag[0 * TC_TAG_WIDTH +: TC_TAG_WIDTH];
|
||||
assign tc_p1_bus_if.req_ready = tc_a_ready[1];
|
||||
assign tc_p1_bus_if.rsp_valid = tc_d_valid[1];
|
||||
assign tc_p1_bus_if.rsp_data.data = tc_d_bits_data[1 * TC_DATA_WIDTH +: TC_DATA_WIDTH];
|
||||
assign tc_p1_bus_if.rsp_data.tag = tc_d_bits_tag[1 * TC_TAG_WIDTH +: TC_TAG_WIDTH];
|
||||
assign tc_d_ready = {tc_p1_bus_if.rsp_ready, tc_p0_bus_if.rsp_ready};
|
||||
assign tc_p2_bus_if.req_ready = tc_a_ready[2];
|
||||
assign tc_p2_bus_if.rsp_valid = tc_d_valid[2];
|
||||
assign tc_p2_bus_if.rsp_data.data = tc_d_bits_data[2 * TC_DATA_WIDTH +: TC_DATA_WIDTH];
|
||||
assign tc_p2_bus_if.rsp_data.tag = tc_d_bits_tag[2 * TC_TAG_WIDTH +: TC_TAG_WIDTH];
|
||||
assign tc_d_ready = {tc_p2_bus_if.rsp_ready, 1'b0, tc_p0_bus_if.rsp_ready};
|
||||
|
||||
// gbar -------------------------------------------------------------------
|
||||
`ifdef GBAR_ENABLE
|
||||
@@ -395,15 +405,14 @@ module Vortex import VX_gpu_pkg::*; #(
|
||||
|
||||
dcr_write_valid = 1'b1;
|
||||
dcr_write_addr = `VX_DCR_BASE_STARTUP_ADDR0;
|
||||
dcr_write_data = BOOTROM_HANG100;
|
||||
dcr_write_data = STARTUP_ADDR[31:0];
|
||||
end
|
||||
`VX_DCR_BASE_STARTUP_ADDR1: begin
|
||||
dcr_state_n = `VX_DCR_BASE_MPM_CLASS;
|
||||
|
||||
dcr_write_valid = 1'b1;
|
||||
dcr_write_addr = `VX_DCR_BASE_STARTUP_ADDR1;
|
||||
// FIXME: not sure what this does
|
||||
dcr_write_data = `VX_DCR_DATA_WIDTH'h0;
|
||||
dcr_write_data = STARTUP_ADDR[63:32];
|
||||
end
|
||||
`VX_DCR_BASE_MPM_CLASS: begin
|
||||
dcr_state_n = `VX_DCR_BASE_STATE_END;
|
||||
@@ -455,7 +464,25 @@ module Vortex import VX_gpu_pkg::*; #(
|
||||
`endif
|
||||
|
||||
.tensor_smem_A_if (tc_p0_bus_if),
|
||||
.tensor_smem_B_if (tc_p1_bus_if),
|
||||
`ifdef EXT_T_BLACKWELL
|
||||
.tensor_tmem_C_wen(tc_tmem_C_wen),
|
||||
.tensor_tmem_C_ren(tc_tmem_C_ren),
|
||||
.tensor_tmem_C_waddr(tc_tmem_C_waddr),
|
||||
.tensor_tmem_C_raddr(tc_tmem_C_raddr),
|
||||
.tensor_tmem_C_wdata(tc_tmem_C_wdata),
|
||||
.tensor_tmem_C_mask(tc_tmem_C_mask),
|
||||
.tensor_tmem_C_rdata(tc_tmem_C_rdata),
|
||||
.tensor_smem_B_if (tc_p2_bus_if),
|
||||
`else
|
||||
.tensor_tmem_C_wen(tc_tmem_C_wen),
|
||||
.tensor_tmem_C_ren(tc_tmem_C_ren),
|
||||
.tensor_tmem_C_waddr(tc_tmem_C_waddr),
|
||||
.tensor_tmem_C_raddr(tc_tmem_C_raddr),
|
||||
.tensor_tmem_C_wdata(tc_tmem_C_wdata),
|
||||
.tensor_tmem_C_mask(tc_tmem_C_mask),
|
||||
.tensor_tmem_C_rdata(tc_tmem_C_rdata),
|
||||
.tensor_smem_B_if (tc_p2_bus_if),
|
||||
`endif
|
||||
|
||||
.sim_ebreak (sim_ebreak),
|
||||
.sim_wb_value (sim_wb_value),
|
||||
@@ -577,7 +604,3 @@ module Vortex import VX_gpu_pkg::*; #(
|
||||
`endif
|
||||
|
||||
endmodule : Vortex
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -254,24 +254,25 @@
|
||||
`define INST_SFU_IS_WCTL(op) (op <= 5)
|
||||
`define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8)
|
||||
|
||||
`define INST_TENSOR_HMMA 4'b0000
|
||||
// Hopper WGMMA-style asynchronous op
|
||||
`define INST_TENSOR_HGMMA 4'b0001
|
||||
`define INST_TENSOR_HGMMA_WAIT 4'b0010
|
||||
`define INST_TENSOR_TCGEN05_CP 4'b0011
|
||||
`define INST_TENSOR_TCGEN05_CP_WAIT 4'b0100
|
||||
`define INST_TENSOR_BWGMMA 4'b0101
|
||||
`define INST_TENSOR_BWGMMA_WAIT 4'b0110
|
||||
`define INST_TENSOR_TCGEN05_LD 4'b0111
|
||||
`define INST_TENSOR_TCGEN05_ST 4'b1000
|
||||
|
||||
`ifdef EXT_T_HOPPER
|
||||
`define EXT_T_ASYNC
|
||||
`elsif EXT_T_BLACKWELL
|
||||
`define EXT_T_ASYNC
|
||||
`endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
`define INST_TENSOR_HMMA 4'b0000
|
||||
// Hopper WGMMA-style asynchronous op
|
||||
`define INST_TENSOR_HGMMA 4'b0001
|
||||
`define INST_TENSOR_HGMMA_WAIT 4'b0010
|
||||
`define INST_TENSOR_TCGEN05_CP 4'b0011
|
||||
`define INST_TENSOR_TCGEN05_CP_WAIT 4'b0100
|
||||
`define INST_TENSOR_BWGMMA 4'b0101
|
||||
`define INST_TENSOR_BWGMMA_WAIT 4'b0110
|
||||
`define INST_TENSOR_TCGEN05_LD 4'b0111
|
||||
`define INST_TENSOR_TCGEN05_ST 4'b1000
|
||||
`define INST_TENSOR_TCGEN05_CB 4'b1001
|
||||
|
||||
`ifdef EXT_T_HOPPER
|
||||
`define EXT_T_ASYNC
|
||||
`elsif EXT_T_BLACKWELL
|
||||
`define EXT_T_ASYNC
|
||||
`endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// non-cacheable tag bits
|
||||
`define NC_TAG_BITS 1
|
||||
|
||||
@@ -179,6 +179,7 @@
|
||||
`define VX_CSR_CORE_ID 12'hCC2
|
||||
`define VX_CSR_WARP_MASK 12'hCC3
|
||||
`define VX_CSR_THREAD_MASK 12'hCC4 // warning! this value is also used in LLVM
|
||||
`define VX_CSR_GCID 12'hCC5 // legacy global core id alias used by Radiance bootrom
|
||||
|
||||
`define VX_CSR_NUM_THREADS 12'hFC0
|
||||
`define VX_CSR_NUM_WARPS 12'hFC1
|
||||
|
||||
@@ -208,7 +208,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
||||
// Writeback
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb;
|
||||
assign writeback_if[i].valid = commit_if[i].valid && (commit_if[i].data.wb || commit_if[i].data.tensor);
|
||||
assign writeback_if[i].data.uuid = commit_if[i].data.uuid;
|
||||
assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid);
|
||||
assign writeback_if[i].data.PC = commit_if[i].data.PC;
|
||||
@@ -224,7 +224,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
||||
// simulation helper signal to get RISC-V tests Pass/Fail status
|
||||
reg [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value_r;
|
||||
always @(posedge clk) begin
|
||||
if (writeback_if[0].valid) begin
|
||||
if (writeback_if[0].valid && !writeback_if[0].data.tensor) begin
|
||||
sim_wb_value_r[writeback_if[0].data.rd] <= writeback_if[0].data.data[0];
|
||||
end
|
||||
end
|
||||
|
||||
@@ -40,6 +40,14 @@ module VX_core import VX_gpu_pkg::*; #(
|
||||
VX_mem_bus_if.master icache_bus_if,
|
||||
|
||||
VX_tc_bus_if.master tensor_smem_A_if,
|
||||
// tensor_tmem_C is now a direct SRAM port
|
||||
output logic tensor_tmem_C_wen,
|
||||
output logic tensor_tmem_C_ren,
|
||||
output logic [8:0] tensor_tmem_C_waddr,
|
||||
output logic [8:0] tensor_tmem_C_raddr,
|
||||
output logic [`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_wdata,
|
||||
output logic [`NUM_THREADS*`XLEN/8-1:0] tensor_tmem_C_mask,
|
||||
input logic [`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_rdata,
|
||||
VX_tc_bus_if.master tensor_smem_B_if,
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
@@ -63,9 +71,9 @@ module VX_core import VX_gpu_pkg::*; #(
|
||||
VX_decode_if decode_if();
|
||||
VX_sched_csr_if sched_csr_if();
|
||||
VX_decode_sched_if decode_sched_if();
|
||||
`ifdef EXT_T_ASYNC
|
||||
VX_tc_rf_if tensor_regfile_if();
|
||||
`endif
|
||||
`ifdef EXT_T_ASYNC
|
||||
VX_tc_rf_if tensor_regfile_if();
|
||||
`endif
|
||||
VX_commit_sched_if commit_sched_if();
|
||||
VX_commit_csr_if commit_csr_if();
|
||||
VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS]();
|
||||
@@ -193,9 +201,9 @@ module VX_core import VX_gpu_pkg::*; #(
|
||||
`endif
|
||||
`ifdef EXT_T_ENABLE
|
||||
.tensor_dispatch_if(tensor_dispatch_if),
|
||||
`ifdef EXT_T_ASYNC
|
||||
.tensor_regfile_if (tensor_regfile_if),
|
||||
`endif
|
||||
`ifdef EXT_T_ASYNC
|
||||
.tensor_regfile_if (tensor_regfile_if),
|
||||
`endif
|
||||
`endif
|
||||
.sfu_dispatch_if(sfu_dispatch_if)
|
||||
);
|
||||
@@ -226,10 +234,17 @@ module VX_core import VX_gpu_pkg::*; #(
|
||||
`ifdef EXT_T_ENABLE
|
||||
.tensor_dispatch_if (tensor_dispatch_if),
|
||||
.tensor_commit_if (tensor_commit_if),
|
||||
`ifdef EXT_T_ASYNC
|
||||
.tensor_regfile_if (tensor_regfile_if),
|
||||
.tensor_smem_A_if (tensor_smem_A_if),
|
||||
.tensor_smem_B_if (tensor_smem_B_if),
|
||||
`ifdef EXT_T_ASYNC
|
||||
.tensor_regfile_if (tensor_regfile_if),
|
||||
.tensor_smem_A_if (tensor_smem_A_if),
|
||||
.tensor_tmem_C_wen (tensor_tmem_C_wen),
|
||||
.tensor_tmem_C_ren (tensor_tmem_C_ren),
|
||||
.tensor_tmem_C_waddr(tensor_tmem_C_waddr),
|
||||
.tensor_tmem_C_raddr(tensor_tmem_C_raddr),
|
||||
.tensor_tmem_C_wdata(tensor_tmem_C_wdata),
|
||||
.tensor_tmem_C_mask(tensor_tmem_C_mask),
|
||||
.tensor_tmem_C_rdata(tensor_tmem_C_rdata),
|
||||
.tensor_smem_B_if (tensor_smem_B_if),
|
||||
`endif
|
||||
`endif
|
||||
|
||||
|
||||
@@ -163,6 +163,7 @@ import VX_fpu_pkg::*;
|
||||
`endif
|
||||
`VX_CSR_WARP_ID : read_data_ro_r = 32'(read_wid);
|
||||
`VX_CSR_CORE_ID : read_data_ro_r = 32'(CORE_ID);
|
||||
`VX_CSR_GCID : read_data_ro_r = 32'(CORE_ID);
|
||||
`VX_CSR_THREAD_MASK: read_data_ro_r = 32'(thread_masks[read_wid]);
|
||||
`VX_CSR_WARP_MASK : read_data_ro_r = 32'(active_warps);
|
||||
`VX_CSR_NUM_THREADS: read_data_ro_r = 32'(`NUM_THREADS);
|
||||
|
||||
@@ -564,11 +564,45 @@ module VX_decode #(
|
||||
3'b011: op_type = `INST_TENSOR_TCGEN05_CP_WAIT;
|
||||
3'b100: op_type = `INST_TENSOR_TCGEN05_LD;
|
||||
3'b101: op_type = `INST_TENSOR_TCGEN05_ST;
|
||||
3'b110: op_type = `INST_TENSOR_TCGEN05_CB;
|
||||
default: ;
|
||||
endcase
|
||||
`USED_IREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
case (func3)
|
||||
3'b000: begin // BWGMMA: rs1=tmem_a, rs2=smem_b, rd field=tmem_c source
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
`ifdef EXT_F_ENABLE
|
||||
rs3_r = {1'b0, rd};
|
||||
`else
|
||||
rs3_r = rd;
|
||||
`endif
|
||||
use_rs3 = 1;
|
||||
end
|
||||
3'b010, 3'b110: begin // TCGEN05_CP/CB: rs1=tmem, rs2=global memory address
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
end
|
||||
3'b100: begin // TCGEN05_LD: rs1=tmem, rd=FP destination
|
||||
`USED_IREG (rs1);
|
||||
`ifdef EXT_F_ENABLE
|
||||
`USED_FREG (rd);
|
||||
`else
|
||||
`USED_IREG (rd);
|
||||
`endif
|
||||
end
|
||||
3'b101: begin // TCGEN05_ST: rs1=tmem, rd field=FP source
|
||||
`USED_IREG (rs1);
|
||||
`ifdef EXT_F_ENABLE
|
||||
rd_r = {1'b1, rd};
|
||||
rs3_r = {1'b1, rd};
|
||||
`else
|
||||
rd_r = rd;
|
||||
rs3_r = rd;
|
||||
`endif
|
||||
use_rs3 = 1;
|
||||
end
|
||||
default: ;
|
||||
endcase
|
||||
`else
|
||||
ex_type = `EX_TENSOR;
|
||||
op_type = `INST_TENSOR_HMMA;
|
||||
|
||||
@@ -312,6 +312,8 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
||||
if (operands_if[i].valid && operands_if[i].ready) begin
|
||||
`TRACE(1, ("%d: core%0d-issue: wid=%0d, PC=0x%0h, ex=", $time, CORE_ID, wis_to_wid(operands_if[i].data.wis, i), operands_if[i].data.PC));
|
||||
trace_ex_type(1, operands_if[i].data.ex_type);
|
||||
`TRACE(1, (", op="));
|
||||
trace_ex_op(1, operands_if[i].data.ex_type, operands_if[i].data.op_type, operands_if[i].data.op_mod, operands_if[i].data.rd, '0, operands_if[i].data.use_imm, operands_if[i].data.imm);
|
||||
`TRACE(1, (", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if[i].data.op_mod, operands_if[i].data.tmask, operands_if[i].data.wb, operands_if[i].data.rd));
|
||||
`TRACE_ARRAY1D(1, operands_if[i].data.rs1_data, `NUM_THREADS);
|
||||
`TRACE(1, (", rs2_data="));
|
||||
|
||||
@@ -61,6 +61,13 @@ module VX_execute import VX_gpu_pkg::*; #(
|
||||
`ifdef EXT_T_ASYNC
|
||||
VX_tc_rf_if.master tensor_regfile_if,
|
||||
VX_tc_bus_if.master tensor_smem_A_if,
|
||||
output logic tensor_tmem_C_wen,
|
||||
output logic tensor_tmem_C_ren,
|
||||
output logic [8:0] tensor_tmem_C_waddr,
|
||||
output logic [8:0] tensor_tmem_C_raddr,
|
||||
output logic [`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_wdata,
|
||||
output logic [`NUM_THREADS*`XLEN/8-1:0] tensor_tmem_C_mask,
|
||||
input logic [`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_rdata,
|
||||
VX_tc_bus_if.master tensor_smem_B_if,
|
||||
`endif
|
||||
`endif
|
||||
@@ -159,6 +166,13 @@ module VX_execute import VX_gpu_pkg::*; #(
|
||||
`ifdef EXT_T_ASYNC
|
||||
.regfile_if(tensor_regfile_if),
|
||||
.smem_A_if(tensor_smem_A_if),
|
||||
.tmem_C_wen(tensor_tmem_C_wen),
|
||||
.tmem_C_ren(tensor_tmem_C_ren),
|
||||
.tmem_C_waddr(tensor_tmem_C_waddr),
|
||||
.tmem_C_raddr(tensor_tmem_C_raddr),
|
||||
.tmem_C_wdata(tensor_tmem_C_wdata),
|
||||
.tmem_C_mask(tensor_tmem_C_mask),
|
||||
.tmem_C_rdata(tensor_tmem_C_rdata),
|
||||
.smem_B_if(tensor_smem_B_if),
|
||||
`endif
|
||||
.commit_if(tensor_commit_if)
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
`include "VX_trace.vh"
|
||||
|
||||
module VX_ibuffer import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0
|
||||
@@ -36,6 +37,20 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
|
||||
|
||||
assign decode_if.ready = ibuf_ready_in[decode_isw];
|
||||
|
||||
`ifdef SIMULATION
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
|
||||
wire decode_probe_pc = (decode_if.data.PC >= 32'h80000240) && (decode_if.data.PC <= 32'h80000260);
|
||||
always @(posedge clk) begin
|
||||
if (!reset && (CORE_ID == 0) && ($time > `TRACE_STARTTIME) && decode_if.valid && decode_probe_pc) begin
|
||||
`TRACE(2, ("%d: core%0d-ibuffer-probe: wid=%0d, PC=0x%0h, valid=%b, ready=%b, isw=%0d, ibuf_ready=%b, tmask=%b, ex=0x%0h, op=0x%0h (#%0d)\n",
|
||||
$time, CORE_ID, decode_if.data.wid, decode_if.data.PC, decode_if.valid, decode_if.ready,
|
||||
decode_isw, ibuf_ready_in, decode_if.data.tmask, decode_if.data.ex_type, decode_if.data.op_type,
|
||||
decode_if.data.uuid));
|
||||
end
|
||||
end
|
||||
`endif
|
||||
`endif
|
||||
|
||||
VX_ibuffer_if uop_sequencer_if [`ISSUE_WIDTH]();
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
|
||||
@@ -121,6 +121,36 @@ module VX_issue import VX_gpu_pkg::*; #(
|
||||
.sfu_dispatch_if(sfu_dispatch_if)
|
||||
);
|
||||
|
||||
`ifdef SIMULATION
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
wire ibuf_probe_pc = ibuffer_if[i].valid
|
||||
&& (ibuffer_if[i].data.PC >= 32'h80000240)
|
||||
&& (ibuffer_if[i].data.PC <= 32'h80000260);
|
||||
wire scb_probe_pc = scoreboard_if[i].valid
|
||||
&& (scoreboard_if[i].data.PC >= 32'h80000240)
|
||||
&& (scoreboard_if[i].data.PC <= 32'h80000260);
|
||||
wire ops_probe_pc = operands_if[i].valid
|
||||
&& (operands_if[i].data.PC >= 32'h80000240)
|
||||
&& (operands_if[i].data.PC <= 32'h80000260);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (!reset && (CORE_ID == 0) && ($time > `TRACE_STARTTIME) && (ibuf_probe_pc || scb_probe_pc || ops_probe_pc)) begin
|
||||
`TRACE(2, ("%d: core%0d-issue-probe: isw=%0d, ibuf=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, scb=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, ops=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, alu=%b/%b lsu=%b/%b sfu=%b/%b (#ibuf=%0d #scb=%0d #ops=%0d)\n",
|
||||
$time, CORE_ID, i,
|
||||
ibuffer_if[i].valid, ibuffer_if[i].ready, ibuffer_if[i].data.PC, ibuffer_if[i].data.ex_type, ibuffer_if[i].data.op_type,
|
||||
scoreboard_if[i].valid, scoreboard_if[i].ready, scoreboard_if[i].data.PC, scoreboard_if[i].data.ex_type, scoreboard_if[i].data.op_type,
|
||||
operands_if[i].valid, operands_if[i].ready, operands_if[i].data.PC, operands_if[i].data.ex_type, operands_if[i].data.op_type,
|
||||
alu_dispatch_if[i].valid, alu_dispatch_if[i].ready,
|
||||
lsu_dispatch_if[i].valid, lsu_dispatch_if[i].ready,
|
||||
sfu_dispatch_if[i].valid, sfu_dispatch_if[i].ready,
|
||||
ibuffer_if[i].data.uuid, scoreboard_if[i].data.uuid, operands_if[i].data.uuid));
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifdef DBG_SCOPE_ISSUE
|
||||
if (CORE_ID == 0) begin
|
||||
`ifdef SCOPE
|
||||
|
||||
@@ -156,7 +156,7 @@ module VX_operands import VX_gpu_pkg::*; #(
|
||||
end
|
||||
endcase
|
||||
|
||||
if (CACHE_ENABLE != 0 && writeback_if[i].valid) begin
|
||||
if (CACHE_ENABLE != 0 && writeback_if[i].valid && !writeback_if[i].data.tensor) begin
|
||||
if ((cache_reg[writeback_if[i].data.wis] == writeback_if[i].data.rd)
|
||||
|| (cache_eop[writeback_if[i].data.wis] && writeback_if[i].data.sop)) begin
|
||||
for (integer j = 0; j < `NUM_THREADS; ++j) begin
|
||||
@@ -288,9 +288,9 @@ module VX_operands import VX_gpu_pkg::*; #(
|
||||
.read (1'b1),
|
||||
`UNUSED_PIN (wren),
|
||||
`ifdef GPR_RESET
|
||||
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||
.write (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
|
||||
`else
|
||||
.write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||
.write (writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
|
||||
`endif
|
||||
.waddr (gpr_wr_addr),
|
||||
.wdata (writeback_if[i].data.data[j]),
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
`include "VX_trace.vh"
|
||||
|
||||
`ifdef GPR_DUPLICATED
|
||||
|
||||
@@ -123,6 +124,46 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
||||
assign scoreboard_if[i].ready = (size1[i] < 3'd2);
|
||||
`endif
|
||||
|
||||
`ifdef SIMULATION
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
|
||||
always @(posedge clk) begin
|
||||
if (!reset && (CORE_ID == 0) && ($time > `TRACE_STARTTIME)) begin
|
||||
if ((scoreboard_if[i].valid &&
|
||||
(scoreboard_if[i].data.PC >= `XLEN'h80000240) &&
|
||||
(scoreboard_if[i].data.PC <= `XLEN'h80000260)) ||
|
||||
(operands_if[i].valid &&
|
||||
(operands_if[i].data.PC >= `XLEN'h80000240) &&
|
||||
(operands_if[i].data.PC <= `XLEN'h80000260))
|
||||
`ifdef EXT_T_ASYNC
|
||||
|| tc_rf_valid[i]
|
||||
`endif
|
||||
) begin
|
||||
`ifdef EXT_T_ASYNC
|
||||
`TRACE(2, ("%d: core%0d-operands-probe: isw=%0d, scb=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, ops=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, size1=%0d, empty1=%b, full1=%b, tc_rf_valid=%b, tc_rf_rs=%0d, tc_rf_wis=%0d (#scb=%0d #ops=%0d)\n",
|
||||
$time, CORE_ID, i,
|
||||
scoreboard_if[i].valid, scoreboard_if[i].ready, scoreboard_if[i].data.PC,
|
||||
scoreboard_if[i].data.ex_type, scoreboard_if[i].data.op_type,
|
||||
operands_if[i].valid, operands_if[i].ready, operands_if[i].data.PC,
|
||||
operands_if[i].data.ex_type, operands_if[i].data.op_type,
|
||||
size1[i], empty1[i], full1[i],
|
||||
tc_rf_valid[i], tc_rf_addr[i], tensor_regfile_if.req_data.wis,
|
||||
scoreboard_if[i].data.uuid, operands_if[i].data.uuid));
|
||||
`else
|
||||
`TRACE(2, ("%d: core%0d-operands-probe: isw=%0d, scb=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, ops=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, size1=%0d, empty1=%b, full1=%b (#scb=%0d #ops=%0d)\n",
|
||||
$time, CORE_ID, i,
|
||||
scoreboard_if[i].valid, scoreboard_if[i].ready, scoreboard_if[i].data.PC,
|
||||
scoreboard_if[i].data.ex_type, scoreboard_if[i].data.op_type,
|
||||
operands_if[i].valid, operands_if[i].ready, operands_if[i].data.PC,
|
||||
operands_if[i].data.ex_type, operands_if[i].data.op_type,
|
||||
size1[i], empty1[i], full1[i],
|
||||
scoreboard_if[i].data.uuid, operands_if[i].data.uuid));
|
||||
`endif
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
`endif
|
||||
|
||||
// assert (full1[i] == full2[i]);
|
||||
// assert (empty1[i] == empty2[i]);
|
||||
|
||||
@@ -263,9 +304,9 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
||||
.read (scoreboard_if[i].valid && scoreboard_if[i].ready), // tc read valid check incl. in ready
|
||||
`UNUSED_PIN (wren),
|
||||
`ifdef GPR_RESET
|
||||
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||
.write (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
|
||||
`else
|
||||
.write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||
.write (writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
|
||||
`endif
|
||||
.waddr (gpr_wr_addr),
|
||||
.wdata (writeback_if[i].data.data[j]),
|
||||
@@ -287,9 +328,9 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
||||
.read (scoreboard_if[i].valid && scoreboard_if[i].ready), // tc read valid check incl. in ready
|
||||
`UNUSED_PIN (wren),
|
||||
`ifdef GPR_RESET
|
||||
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||
.write (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
|
||||
`else
|
||||
.write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||
.write (writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
|
||||
`endif
|
||||
.waddr (gpr_wr_addr),
|
||||
.wdata (writeback_if[i].data.data[j]),
|
||||
@@ -315,9 +356,9 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
||||
`endif
|
||||
`UNUSED_PIN (wren),
|
||||
`ifdef GPR_RESET
|
||||
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||
.write (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
|
||||
`else
|
||||
.write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||
.write (writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
|
||||
`endif
|
||||
.waddr (gpr_wr_addr),
|
||||
.wdata (writeback_if[i].data.data[j]),
|
||||
@@ -326,9 +367,9 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
||||
);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
assign perf_write_rs1_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
|
||||
assign perf_write_rs2_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
|
||||
assign perf_write_rs3_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
|
||||
assign perf_write_rs1_per_thread[j] = (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]);
|
||||
assign perf_write_rs2_per_thread[j] = (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]);
|
||||
assign perf_write_rs3_per_thread[j] = (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]);
|
||||
`endif
|
||||
end
|
||||
|
||||
|
||||
@@ -322,6 +322,40 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
end else if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin
|
||||
timeout_ctr <= '0;
|
||||
end
|
||||
`ifdef EXT_T_ASYNC
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
|
||||
if (CORE_ID == 0 && ibuffer_if[i].valid &&
|
||||
(ibuffer_if[i].data.PC >= `XLEN'h80000240) &&
|
||||
(ibuffer_if[i].data.PC <= `XLEN'h80000260)) begin
|
||||
`TRACE(2, ("%d: core%0d-scoreboard-probe: wid=%0d, PC=0x%0h, ready=%b, stg_ready=%b, operands_ready=%b, busy={rd,rs1,rs2,rs3}=%b, rd=%0d rs1=%0d rs2=%0d rs3=%0d, wb=%b, ex=0x%0h, op=0x%0h, inuse_rd=%b, inuse_rs1=%b, inuse_rs2=%b, inuse_rs3=%b, hgmma_start=%b, hgmma_wait=%b, hgmma_ready=%b, inflight=%0d, wb_fire=%b, wb_tensor=%b, wb_wid=%0d, wb_rd=%0d, wb_pc=0x%0h (#%0d)\n",
|
||||
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC,
|
||||
ibuffer_if[i].ready, stg_ready_in, operands_ready, operands_busy,
|
||||
ibuffer_if[i].data.rd, ibuffer_if[i].data.rs1, ibuffer_if[i].data.rs2, ibuffer_if[i].data.rs3,
|
||||
ibuffer_if[i].data.wb, ibuffer_if[i].data.ex_type, ibuffer_if[i].data.op_type,
|
||||
inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3, hgmma_start, hgmma_wait, hgmma_ready,
|
||||
inflight_tensor[ibuffer_if[i].data.wis], writeback_fire, writeback_if[i].data.tensor,
|
||||
wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.rd, writeback_if[i].data.PC,
|
||||
ibuffer_if[i].data.uuid));
|
||||
end
|
||||
if (ibuffer_if[i].valid && (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR))) begin
|
||||
`TRACE(2, ("%d: core%0d-scoreboard-tensor: wid=%0d, PC=0x%0h, op=0x%0h, ready=%b, stg_ready=%b, operands_ready=%b, inuse=%b, inflight=%0d, hgmma_start=%b, hgmma_wait=%b, hgmma_fire_ready=%b, hgmma_wait_ready=%b (#%0d)\n",
|
||||
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC,
|
||||
ibuffer_if[i].data.op_type, ibuffer_if[i].ready, stg_ready_in, operands_ready,
|
||||
operands_busy, inflight_tensor[ibuffer_if[i].data.wis], hgmma_start, hgmma_wait,
|
||||
hgmma_ready_for_fire, hgmma_ready_for_wait, ibuffer_if[i].data.uuid));
|
||||
end
|
||||
if (tensor_issue_fire) begin
|
||||
`TRACE(2, ("%d: core%0d-scoreboard-tensor-issue-count: wid=%0d, op=0x%0h, inflight_before=%0d\n",
|
||||
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.op_type,
|
||||
inflight_tensor[ibuffer_if[i].data.wis]));
|
||||
end
|
||||
if (tensor_writeback_fire) begin
|
||||
`TRACE(2, ("%d: core%0d-scoreboard-tensor-wb-count: wid=%0d, inflight_before=%0d\n",
|
||||
$time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i),
|
||||
inflight_tensor[writeback_if[i].data.wis]));
|
||||
end
|
||||
`endif
|
||||
`endif
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
`ifdef EXT_T_ENABLE
|
||||
`include "VX_fpu_define.vh"
|
||||
`include "VX_trace.vh"
|
||||
|
||||
module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
|
||||
parameter ISW,
|
||||
@@ -11,6 +12,14 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
|
||||
VX_execute_if.slave execute_if,
|
||||
VX_tc_rf_if.master regfile_if,
|
||||
VX_tc_bus_if.master tmem_if,
|
||||
// tmem_C is now a direct SRAM port (no TileLink)
|
||||
output logic tmem_C_wen,
|
||||
output logic tmem_C_ren,
|
||||
output logic [8:0] tmem_C_waddr,
|
||||
output logic [8:0] tmem_C_raddr,
|
||||
output logic [`NUM_THREADS*`XLEN-1:0] tmem_C_wdata,
|
||||
output logic [`NUM_THREADS*`XLEN/8-1:0] tmem_C_mask,
|
||||
input logic [`NUM_THREADS*`XLEN-1:0] tmem_C_rdata,
|
||||
VX_tc_bus_if.master smem_B_if,
|
||||
VX_commit_if.master commit_if
|
||||
);
|
||||
@@ -26,6 +35,7 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
|
||||
wire [`NR_BITS-1:0] execute_if_data_rd;
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] execute_if_data_rs1;
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] execute_if_data_rs2;
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] execute_if_data_rs3;
|
||||
|
||||
wire metadata_queue_full;
|
||||
wire metadata_queue_empty;
|
||||
@@ -38,7 +48,7 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
|
||||
wire deq = metadata_deq;
|
||||
|
||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `INST_ALU_BITS + `XLEN + 1 +
|
||||
`NR_BITS + (NUM_LANES * `XLEN) + (NUM_LANES * `XLEN);
|
||||
`NR_BITS + (NUM_LANES * `XLEN) + (NUM_LANES * `XLEN) + (NUM_LANES * `XLEN);
|
||||
VX_fifo_queue #(
|
||||
.DATAW(DATAW),
|
||||
.DEPTH(METADATA_QUEUE_DEPTH)
|
||||
@@ -50,11 +60,11 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
|
||||
.data_in({execute_if.data.uuid, execute_if.data.wid,
|
||||
execute_if.data.tmask, execute_if.data.op_type, execute_if.data.PC,
|
||||
execute_if.data.wb, execute_if.data.rd,
|
||||
execute_if.data.rs1_data, execute_if.data.rs2_data}),
|
||||
execute_if.data.rs1_data, execute_if.data.rs2_data, execute_if.data.rs3_data}),
|
||||
.data_out({execute_if_data_uuid, execute_if_data_wid,
|
||||
execute_if_data_tmask, execute_if_data_op_type, execute_if_data_PC,
|
||||
execute_if_data_wb, execute_if_data_rd,
|
||||
execute_if_data_rs1, execute_if_data_rs2}),
|
||||
execute_if_data_rs1, execute_if_data_rs2, execute_if_data_rs3}),
|
||||
.empty(metadata_queue_empty),
|
||||
`UNUSED_PIN(alm_empty),
|
||||
.full(metadata_queue_full),
|
||||
@@ -75,20 +85,50 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
|
||||
(execute_if_data_op_type == `INST_TENSOR_BWGMMA);
|
||||
wire bwgmma_wait = metadata_valid &&
|
||||
(execute_if_data_op_type == `INST_TENSOR_BWGMMA_WAIT);
|
||||
wire bwgmma_initiate_valid = metadata_valid && commit_if.ready && bwgmma;
|
||||
wire tcgen05_cp = metadata_valid &&
|
||||
(execute_if_data_op_type == `INST_TENSOR_TCGEN05_CP);
|
||||
wire tcgen05_cp_wait = metadata_valid &&
|
||||
(execute_if_data_op_type == `INST_TENSOR_TCGEN05_CP_WAIT);
|
||||
wire tcgen05_ld = metadata_valid &&
|
||||
(execute_if_data_op_type == `INST_TENSOR_TCGEN05_LD);
|
||||
wire tcgen05_st = metadata_valid &&
|
||||
(execute_if_data_op_type == `INST_TENSOR_TCGEN05_ST);
|
||||
wire tcgen05_cb = metadata_valid &&
|
||||
(execute_if_data_op_type == `INST_TENSOR_TCGEN05_CB);
|
||||
wire tensor_launch_op = bwgmma || tcgen05_cp || tcgen05_ld || tcgen05_st || tcgen05_cb;
|
||||
wire tensor_wait_op = bwgmma_wait || tcgen05_cp_wait;
|
||||
wire [2:0] initiate_op =
|
||||
bwgmma ? 3'd0 :
|
||||
bwgmma_wait ? 3'd1 :
|
||||
tcgen05_cp ? 3'd2 :
|
||||
tcgen05_cp_wait ? 3'd3 :
|
||||
tcgen05_ld ? 3'd4 :
|
||||
tcgen05_st ? 3'd5 :
|
||||
tcgen05_cb ? 3'd6 :
|
||||
3'd0;
|
||||
wire initiate_valid = metadata_valid && tensor_launch_op && !sync_launch_pending && commit_if.ready;
|
||||
wire [`NW_WIDTH-1:0] initiate_wid = execute_if_data_wid;
|
||||
wire [4:0] initiate_rd = execute_if_data_rd[4:0];
|
||||
wire [`XLEN-1:0] initiate_addr_a = execute_if_data_rs1[0];
|
||||
wire [`XLEN-1:0] initiate_addr_b = execute_if_data_rs2[0];
|
||||
wire [`XLEN-1:0] initiate_addr_c = execute_if_data_rs3[0];
|
||||
|
||||
TensorCoreDecoupled tensor_blackwell_core (
|
||||
logic sync_launch_pending;
|
||||
logic sync_launch_is_ld;
|
||||
logic sync_launch_is_bwgmma;
|
||||
|
||||
TensorCoreBlackwell tensor_blackwell_core (
|
||||
.clock(clk),
|
||||
.reset(reset),
|
||||
|
||||
.io_initiate_ready(initiate_ready),
|
||||
.io_initiate_valid(bwgmma_initiate_valid),
|
||||
.io_initiate_valid(initiate_valid),
|
||||
.io_initiate_bits_op(initiate_op),
|
||||
.io_initiate_bits_wid(initiate_wid),
|
||||
.io_initiate_bits_rd(initiate_rd),
|
||||
.io_initiate_bits_addressA(initiate_addr_a),
|
||||
.io_initiate_bits_addressB(initiate_addr_b),
|
||||
.io_initiate_bits_addressC(initiate_addr_c),
|
||||
|
||||
.io_writeback_ready(writeback_ready),
|
||||
.io_writeback_valid(writeback_valid),
|
||||
@@ -116,44 +156,73 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
|
||||
|
||||
.io_reqA_ready(tmem_if.req_ready),
|
||||
.io_reqA_valid(tmem_if.req_valid),
|
||||
.io_reqA_bits_rw(tmem_if.req_data.rw),
|
||||
.io_reqA_bits_byteen(tmem_if.req_data.byteen),
|
||||
.io_reqA_bits_source(tmem_if.req_data.tag),
|
||||
.io_reqA_bits_address(tmem_if.req_data.addr),
|
||||
.io_reqA_bits_data(tmem_if.req_data.data),
|
||||
.io_tmemC_wen(tmem_C_wen),
|
||||
.io_tmemC_ren(tmem_C_ren),
|
||||
.io_tmemC_waddr(tmem_C_waddr),
|
||||
.io_tmemC_raddr(tmem_C_raddr),
|
||||
.io_tmemC_wdata(tmem_C_wdata),
|
||||
.io_tmemC_mask(tmem_C_mask),
|
||||
.io_tmemC_rdata(tmem_C_rdata),
|
||||
.io_reqB_ready(smem_B_if.req_ready),
|
||||
.io_reqB_valid(smem_B_if.req_valid),
|
||||
.io_reqB_bits_rw(smem_B_if.req_data.rw),
|
||||
.io_reqB_bits_byteen(smem_B_if.req_data.byteen),
|
||||
.io_reqB_bits_source(smem_B_if.req_data.tag),
|
||||
.io_reqB_bits_address(smem_B_if.req_data.addr),
|
||||
.io_reqB_bits_data(smem_B_if.req_data.data),
|
||||
.io_reqC_valid(regfile_if.req_valid),
|
||||
.io_reqC_bits(regfile_if.req_data.rs[4:0])
|
||||
);
|
||||
|
||||
assign regfile_if.req_data.rs[5] = 1'b1;
|
||||
assign regfile_if.req_data.wis = '0;
|
||||
assign tmem_if.req_data.rw = 1'b0;
|
||||
assign tmem_if.req_data.byteen = '1;
|
||||
assign tmem_if.req_data.data = '0;
|
||||
assign smem_B_if.req_data.rw = 1'b0;
|
||||
assign smem_B_if.req_data.byteen = '1;
|
||||
assign smem_B_if.req_data.data = '0;
|
||||
|
||||
logic commit_select_tensor;
|
||||
logic commit_sync_done;
|
||||
logic sync_done;
|
||||
wire sync_writeback_done = sync_launch_pending && (sync_launch_is_ld || sync_launch_is_bwgmma) && writeback_valid;
|
||||
wire sync_no_wb_done = sync_launch_pending && !(sync_launch_is_ld || sync_launch_is_bwgmma) && initiate_ready;
|
||||
wire tensor_idle = initiate_ready && !writeback_valid && !sync_launch_pending;
|
||||
|
||||
always @(*) begin
|
||||
metadata_deq = 1'b0;
|
||||
commit_select_tensor = 1'b0;
|
||||
writeback_ready = commit_if.ready;
|
||||
commit_sync_done = 1'b0;
|
||||
sync_done = 1'b0;
|
||||
writeback_ready = commit_if.ready && sync_launch_pending && (sync_launch_is_ld || sync_launch_is_bwgmma);
|
||||
|
||||
if (metadata_valid) begin
|
||||
if (bwgmma_wait) begin
|
||||
writeback_ready = 1'b0;
|
||||
if (sync_launch_pending) begin
|
||||
if (sync_writeback_done) begin
|
||||
commit_select_tensor = 1'b1;
|
||||
metadata_deq = commit_if.ready;
|
||||
commit_sync_done = commit_if.ready;
|
||||
end else if (sync_no_wb_done) begin
|
||||
commit_select_tensor = 1'b0;
|
||||
metadata_deq = metadata_valid && commit_if.ready;
|
||||
commit_sync_done = metadata_valid && commit_if.ready;
|
||||
end
|
||||
end else if (metadata_valid) begin
|
||||
if (tensor_wait_op) begin
|
||||
writeback_ready = 1'b0;
|
||||
commit_select_tensor = 1'b0;
|
||||
metadata_deq = metadata_valid && commit_if.ready && tensor_idle;
|
||||
end else if (bwgmma) begin
|
||||
commit_select_tensor = !initiate_ready;
|
||||
metadata_deq = metadata_valid && commit_if.ready && initiate_ready;
|
||||
writeback_ready = 1'b0;
|
||||
commit_select_tensor = 1'b0;
|
||||
sync_done = initiate_valid && initiate_ready;
|
||||
metadata_deq = 1'b0;
|
||||
end else if (tcgen05_cp || tcgen05_ld || tcgen05_st || tcgen05_cb) begin
|
||||
writeback_ready = 1'b0;
|
||||
commit_select_tensor = 1'b0;
|
||||
sync_done = initiate_valid && initiate_ready;
|
||||
// suppress commit until sync_launch_pending is set next cycle
|
||||
metadata_deq = 1'b0;
|
||||
end else begin
|
||||
// TCGEN05 ops are recognized by decode so software can build
|
||||
// against the Blackwell surface. Phase 1 compute integration
|
||||
// only routes BWGMMA through the decoupled tensor datapath.
|
||||
commit_select_tensor = 1'b0;
|
||||
metadata_deq = metadata_valid && commit_if.ready;
|
||||
end
|
||||
@@ -162,18 +231,22 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
|
||||
end
|
||||
|
||||
if (commit_select_tensor) begin
|
||||
commit_if.valid = writeback_valid;
|
||||
commit_if.data.uuid = '0;
|
||||
commit_if.data.wid = writeback_wid;
|
||||
commit_if.data.tmask = {NUM_LANES{1'b1}};
|
||||
commit_if.data.PC = '0;
|
||||
commit_if.data.wb = 1'b1;
|
||||
commit_if.data.rd = (`NR_BITS'(`NUM_IREGS) + {1'b0, writeback_rd});
|
||||
commit_if.valid = sync_writeback_done;
|
||||
commit_if.data.uuid = sync_launch_is_bwgmma ? '0 : execute_if_data_uuid;
|
||||
commit_if.data.wid = sync_launch_is_bwgmma ? writeback_wid : execute_if_data_wid;
|
||||
commit_if.data.tmask = sync_launch_is_bwgmma ? {NUM_LANES{1'b1}} : execute_if_data_tmask;
|
||||
commit_if.data.PC = sync_launch_is_bwgmma ? '0 : execute_if_data_PC;
|
||||
commit_if.data.wb = sync_launch_is_ld;
|
||||
commit_if.data.rd = sync_launch_is_ld ? execute_if_data_rd : '0;
|
||||
commit_if.data.data = writeback_data;
|
||||
commit_if.data.tensor = 1'b1;
|
||||
commit_if.data.eop = writeback_last;
|
||||
commit_if.data.tensor = sync_launch_is_bwgmma;
|
||||
commit_if.data.pid = 1'b0;
|
||||
commit_if.data.sop = 1'b1;
|
||||
commit_if.data.eop = sync_launch_is_ld ? 1'b1 : writeback_last;
|
||||
end else begin
|
||||
commit_if.valid = metadata_valid;
|
||||
commit_if.valid = (sync_no_wb_done && metadata_valid) ||
|
||||
(metadata_valid && !sync_launch_pending && !tensor_launch_op &&
|
||||
(!tensor_wait_op || tensor_idle));
|
||||
commit_if.data.uuid = execute_if_data_uuid;
|
||||
commit_if.data.wid = execute_if_data_wid;
|
||||
commit_if.data.tmask = execute_if_data_tmask;
|
||||
@@ -187,6 +260,57 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
|
||||
commit_if.data.eop = 1'b1;
|
||||
end
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
sync_launch_pending <= 1'b0;
|
||||
sync_launch_is_ld <= 1'b0;
|
||||
sync_launch_is_bwgmma <= 1'b0;
|
||||
end else begin
|
||||
if (sync_done) begin
|
||||
sync_launch_pending <= 1'b1;
|
||||
sync_launch_is_ld <= tcgen05_ld;
|
||||
sync_launch_is_bwgmma <= bwgmma;
|
||||
end else if (commit_sync_done) begin
|
||||
sync_launch_pending <= 1'b0;
|
||||
sync_launch_is_ld <= 1'b0;
|
||||
sync_launch_is_bwgmma <= 1'b0;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
`ifdef SIMULATION
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
|
||||
always @(posedge clk) begin
|
||||
if (!reset && ($time > `TRACE_STARTTIME)) begin
|
||||
if (operand_enq_fire) begin
|
||||
`TRACE(2, ("%d: tensor-blackwell-enq: wid=%0d, PC=0x%0h, op=0x%0h, uuid=%0d, queue_full=%b\n",
|
||||
$time, execute_if.data.wid, execute_if.data.PC, execute_if.data.op_type,
|
||||
execute_if.data.uuid, metadata_queue_full));
|
||||
end
|
||||
if (metadata_valid) begin
|
||||
`TRACE(2, ("%d: tensor-blackwell-meta: wid=%0d, PC=0x%0h, op=0x%0h, deq=%b, commit_valid=%b, commit_ready=%b, initiate_valid=%b, initiate_ready=%b, writeback_valid=%b, writeback_ready=%b, sync_pending=%b, sync_is_ld=%b, sync_done=%b, commit_sync_done=%b, queue_full=%b\n",
|
||||
$time, execute_if_data_wid, execute_if_data_PC, execute_if_data_op_type,
|
||||
metadata_deq, commit_if.valid, commit_if.ready, initiate_valid, initiate_ready,
|
||||
writeback_valid, writeback_ready, sync_launch_pending, sync_launch_is_ld,
|
||||
sync_done, commit_sync_done, metadata_queue_full));
|
||||
end
|
||||
if (writeback_valid || initiate_valid) begin
|
||||
`TRACE(2, ("%d: tensor-blackwell-handshake: initiate=%b/%b, op=%0d, addrA=0x%0h, addrB=0x%0h, writeback=%b/%b, wb_wid=%0d, wb_rd=%0d, wb_last=%b\n",
|
||||
$time, initiate_valid, initiate_ready, initiate_op, initiate_addr_a, initiate_addr_b,
|
||||
writeback_valid, writeback_ready, writeback_wid, writeback_rd, writeback_last));
|
||||
end
|
||||
if (tmem_if.req_valid || tmem_C_wen || tmem_C_ren || smem_B_if.req_valid || tmem_if.rsp_valid || smem_B_if.rsp_valid) begin
|
||||
`TRACE(2, ("%d: tensor-blackwell-mem: tmemA_req=%b/%b tmemA_rw=%b tmemA_addr=0x%0h tmemA_rsp=%b/%b, tmemC_wen=%b tmemC_ren=%b tmemC_waddr=0x%0h tmemC_raddr=0x%0h, smemB_req=%b/%b smemB_rw=%b smemB_addr=0x%0h smemB_rsp=%b/%b, initiate_ready=%b\n",
|
||||
$time, tmem_if.req_valid, tmem_if.req_ready, tmem_if.req_data.rw, tmem_if.req_data.addr, tmem_if.rsp_valid, tmem_if.rsp_ready,
|
||||
tmem_C_wen, tmem_C_ren, tmem_C_waddr, tmem_C_raddr,
|
||||
smem_B_if.req_valid, smem_B_if.req_ready, smem_B_if.req_data.rw, smem_B_if.req_data.addr, smem_B_if.rsp_valid, smem_B_if.rsp_ready,
|
||||
initiate_ready));
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
`endif
|
||||
endmodule
|
||||
|
||||
`endif
|
||||
|
||||
@@ -11,6 +11,13 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
|
||||
`ifdef EXT_T_ASYNC
|
||||
VX_tc_rf_if.master regfile_if,
|
||||
VX_tc_bus_if.master smem_A_if,
|
||||
output logic tmem_C_wen,
|
||||
output logic tmem_C_ren,
|
||||
output logic [8:0] tmem_C_waddr,
|
||||
output logic [8:0] tmem_C_raddr,
|
||||
output logic [`NUM_THREADS*`XLEN-1:0] tmem_C_wdata,
|
||||
output logic [`NUM_THREADS*`XLEN/8-1:0] tmem_C_mask,
|
||||
input logic [`NUM_THREADS*`XLEN-1:0] tmem_C_rdata,
|
||||
VX_tc_bus_if.master smem_B_if,
|
||||
`endif
|
||||
VX_commit_if.master commit_if [`ISSUE_WIDTH]
|
||||
@@ -57,6 +64,12 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
|
||||
|
||||
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
|
||||
`ifdef EXT_T_HOPPER
|
||||
assign tmem_C_wen = 1'b0;
|
||||
assign tmem_C_ren = 1'b0;
|
||||
assign tmem_C_waddr = '0;
|
||||
assign tmem_C_raddr = '0;
|
||||
assign tmem_C_wdata = '0;
|
||||
assign tmem_C_mask = '0;
|
||||
VX_tensor_hopper_core_block #(
|
||||
.ISW(1), // FIXME: not block_idx
|
||||
.FP16(FP16)
|
||||
@@ -79,6 +92,13 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
|
||||
.execute_if (execute_if[block_idx]),
|
||||
.regfile_if (regfile_if),
|
||||
.tmem_if (smem_A_if),
|
||||
.tmem_C_wen (tmem_C_wen),
|
||||
.tmem_C_ren (tmem_C_ren),
|
||||
.tmem_C_waddr(tmem_C_waddr),
|
||||
.tmem_C_raddr(tmem_C_raddr),
|
||||
.tmem_C_wdata(tmem_C_wdata),
|
||||
.tmem_C_mask(tmem_C_mask),
|
||||
.tmem_C_rdata(tmem_C_rdata),
|
||||
.smem_B_if (smem_B_if),
|
||||
.commit_if (commit_block_if[block_idx])
|
||||
);
|
||||
|
||||
@@ -22,6 +22,9 @@ task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type);
|
||||
`EX_LSU: `TRACE(level, ("LSU"));
|
||||
`EX_FPU: `TRACE(level, ("FPU"));
|
||||
`EX_SFU: `TRACE(level, ("SFU"));
|
||||
`ifdef EXT_T_ENABLE
|
||||
`EX_TENSOR: `TRACE(level, ("TENSOR"));
|
||||
`endif
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
endtask
|
||||
@@ -36,22 +39,26 @@ task trace_ex_op(input int level,
|
||||
`UNUSED_ARG(input [`XLEN-1:0] imm)
|
||||
);
|
||||
|
||||
logic fdst_d;
|
||||
logic fcvt_l;
|
||||
logic rd_float;
|
||||
|
||||
`ifdef FLEN_64
|
||||
logic fdst_d = imm[0];
|
||||
fdst_d = imm[0];
|
||||
`else
|
||||
logic fdst_d = 0;
|
||||
fdst_d = 0;
|
||||
`endif
|
||||
|
||||
`ifdef XLEN_64
|
||||
logic fcvt_l = imm[1];
|
||||
fcvt_l = imm[1];
|
||||
`else
|
||||
logic fcvt_l = 0;
|
||||
fcvt_l = 0;
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
logic rd_float = 1'(rd >> 5) || 1'(rs2 >> 5);
|
||||
rd_float = 1'(rd >> 5) || 1'(rs2 >> 5);
|
||||
`else
|
||||
logic rd_float = 0;
|
||||
rd_float = 0;
|
||||
`endif
|
||||
|
||||
case (ex_type)
|
||||
@@ -359,6 +366,22 @@ task trace_ex_op(input int level,
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
end
|
||||
`ifdef EXT_T_ENABLE
|
||||
`EX_TENSOR: begin
|
||||
case (`INST_ALU_BITS'(op_type))
|
||||
`INST_TENSOR_HMMA: `TRACE(level, ("HMMA"));
|
||||
`INST_TENSOR_HGMMA: `TRACE(level, ("HGMMA"));
|
||||
`INST_TENSOR_HGMMA_WAIT: `TRACE(level, ("HGMMA_WAIT"));
|
||||
`INST_TENSOR_TCGEN05_CP: `TRACE(level, ("TCGEN05_CP"));
|
||||
`INST_TENSOR_TCGEN05_CP_WAIT: `TRACE(level, ("TCGEN05_CP_WAIT"));
|
||||
`INST_TENSOR_BWGMMA: `TRACE(level, ("BWGMMA"));
|
||||
`INST_TENSOR_BWGMMA_WAIT: `TRACE(level, ("BWGMMA_WAIT"));
|
||||
`INST_TENSOR_TCGEN05_LD: `TRACE(level, ("TCGEN05_LD"));
|
||||
`INST_TENSOR_TCGEN05_ST: `TRACE(level, ("TCGEN05_ST"));
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
end
|
||||
`endif
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
endtask
|
||||
|
||||
Reference in New Issue
Block a user