diff --git a/hw/VX_config.h b/hw/VX_config.h index c5d0caec..0f1e9461 100644 --- a/hw/VX_config.h +++ b/hw/VX_config.h @@ -84,15 +84,15 @@ #endif #ifndef NUM_CORES -#define NUM_CORES 8 +#define NUM_CORES 1 #endif #ifndef NUM_WARPS -#define NUM_WARPS 8 +#define NUM_WARPS 4 #endif #ifndef NUM_THREADS -#define NUM_THREADS 8 +#define NUM_THREADS 4 #endif #ifndef NUM_BARRIERS @@ -682,4 +682,3 @@ #define IMPLEMENTATION_ID 0 #endif // VX_CONFIG_VH - diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 71afa182..9e43fc34 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -88,15 +88,48 @@ `endif `ifndef NUM_CORES -`define NUM_CORES 8 +`define NUM_CORES 1 `endif `ifndef NUM_WARPS -`define NUM_WARPS 8 +`define NUM_WARPS 4 +`endif + +`ifndef NUM_TENSOR_WARPS +`define NUM_TENSOR_WARPS 2 +`endif + +`define NUM_SCALAR_WARPS (`NUM_WARPS - `NUM_TENSOR_WARPS) + +`define WU_CONFIG_STATIC_ASSERTS \ + generate \ + if (!(`NUM_WARPS > 0)) begin : g_wu_num_warps_gt_zero \ + invalid_NUM_WARPS_must_be_greater_than_zero __wu_config_error(); \ + end \ + if (!(`NUM_TENSOR_WARPS > 0)) begin : g_wu_num_tensor_warps_gt_zero \ + invalid_NUM_TENSOR_WARPS_must_be_greater_than_zero __wu_config_error(); \ + end \ + if (!(`NUM_TENSOR_WARPS < `NUM_WARPS)) begin : g_wu_num_tensor_warps_lt_num_warps \ + invalid_NUM_TENSOR_WARPS_must_be_smaller_than_NUM_WARPS __wu_config_error(); \ + end \ + if (!(`NUM_SCALAR_WARPS > 0)) begin : g_wu_num_scalar_warps_gt_zero \ + invalid_NUM_SCALAR_WARPS_must_be_greater_than_zero __wu_config_error(); \ + end \ + endgenerate + +`define IS_SCALAR_WARP(wid) ((wid) < `NUM_SCALAR_WARPS) +`define IS_TENSOR_WARP(wid) ((wid) >= `NUM_SCALAR_WARPS) + +`ifndef TENSOR_NUM_GPRS +`define TENSOR_NUM_GPRS 8 +`endif + +`ifndef TENSOR_NUM_FPRS +`define TENSOR_NUM_FPRS 8 `endif `ifndef NUM_THREADS -`define NUM_THREADS 8 +`define NUM_THREADS 4 `endif `ifndef NUM_BARRIERS diff --git a/hw/rtl/VX_core_wrapper.sv b/hw/rtl/VX_core_wrapper.sv index b6bd737d..a8ab65b3 100644 --- a/hw/rtl/VX_core_wrapper.sv +++ b/hw/rtl/VX_core_wrapper.sv @@ -7,6 +7,7 @@ module Vortex import VX_gpu_pkg::*; #( parameter TENSOR_FP16 = 0, parameter logic [63:0] STARTUP_ADDR = 64'h0000_0000_0001_0100, parameter NUM_THREADS = 0, + parameter NUM_TENSOR_CORES = 1, parameter TC_DATA_WIDTH = 256, parameter TC_TAG_WIDTH = 4 ) ( @@ -77,26 +78,32 @@ module Vortex import VX_gpu_pkg::*; #( output [(DCACHE_NUM_REQS * 32) - 1:0] smem_a_bits_data, // tc -------------------------------------------------- - input [2:0] tc_a_ready, - output [2:0] tc_a_valid, - output [2:0] tc_a_bits_write, - output [95:0] tc_a_bits_address, - output [3 * TC_TAG_WIDTH - 1:0] tc_a_bits_tag, - output [3 * 32 - 1:0] tc_a_bits_mask, - output [3 * TC_DATA_WIDTH - 1:0] tc_a_bits_data, - output [2:0] tc_d_ready, - input [2:0] tc_d_valid, - input [3 * TC_DATA_WIDTH - 1:0] tc_d_bits_data, - input [3 * TC_TAG_WIDTH - 1:0] tc_d_bits_tag, + input [NUM_TENSOR_CORES * 3 - 1:0] tc_a_ready, + output [NUM_TENSOR_CORES * 3 - 1:0] tc_a_valid, + output [NUM_TENSOR_CORES * 3 - 1:0] tc_a_bits_write, + output [NUM_TENSOR_CORES * 3 * 32 - 1:0] tc_a_bits_address, + output [NUM_TENSOR_CORES * 3 * TC_TAG_WIDTH - 1:0] tc_a_bits_tag, + output [NUM_TENSOR_CORES * 3 * 32 - 1:0] tc_a_bits_mask, + output [NUM_TENSOR_CORES * 3 * TC_DATA_WIDTH - 1:0] tc_a_bits_data, + output [NUM_TENSOR_CORES * 3 - 1:0] tc_d_ready, + input [NUM_TENSOR_CORES * 3 - 1:0] tc_d_valid, + input [NUM_TENSOR_CORES * 3 * TC_DATA_WIDTH - 1:0] tc_d_bits_data, + input [NUM_TENSOR_CORES * 3 * TC_TAG_WIDTH - 1:0] tc_d_bits_tag, - // tmem_C direct SRAM port - output tc_tmem_C_wen, - output tc_tmem_C_ren, - output [8:0] tc_tmem_C_waddr, - output [8:0] tc_tmem_C_raddr, - output [`NUM_THREADS*`XLEN-1:0] tc_tmem_C_wdata, - output [`NUM_THREADS*`XLEN/8-1:0] tc_tmem_C_mask, - input [`NUM_THREADS*`XLEN-1:0] tc_tmem_C_rdata, + // shared tmem direct SRAM ports + output [NUM_TENSOR_CORES-1:0] tc_tmem_A_ren, + input [NUM_TENSOR_CORES-1:0] tc_tmem_A_rready, + output [NUM_TENSOR_CORES*9-1:0] tc_tmem_A_raddr, + input [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tc_tmem_A_rdata, + output [NUM_TENSOR_CORES-1:0] tc_tmem_C_ren, + input [NUM_TENSOR_CORES-1:0] tc_tmem_C_rready, + output [NUM_TENSOR_CORES*9-1:0] tc_tmem_C_raddr, + input [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tc_tmem_C_rdata, + output [NUM_TENSOR_CORES-1:0] tc_tmem_C_wen, + input [NUM_TENSOR_CORES-1:0] tc_tmem_C_wready, + output [NUM_TENSOR_CORES*9-1:0] tc_tmem_C_waddr, + output [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tc_tmem_C_wdata, + output [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN/8-1:0] tc_tmem_C_mask, // gbar ------------------------------------------------ @@ -314,24 +321,52 @@ module Vortex import VX_gpu_pkg::*; #( endgenerate // tc --------------------------------------------------------------------- - VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p0_bus_if(); - VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p2_bus_if(); - // tc_p1 (tmem_C) is now a direct SRAM port exposed as top-level ports tc_tmem_C_* - assign tc_a_valid = {tc_p2_bus_if.req_valid, 1'b0, tc_p0_bus_if.req_valid}; - assign tc_a_bits_write = {tc_p2_bus_if.req_data.rw, 1'b0, tc_p0_bus_if.req_data.rw}; - assign tc_a_bits_address = {tc_p2_bus_if.req_data.addr, 32'b0, tc_p0_bus_if.req_data.addr}; - assign tc_a_bits_tag = {tc_p2_bus_if.req_data.tag, {TC_TAG_WIDTH{1'b0}}, tc_p0_bus_if.req_data.tag}; - assign tc_a_bits_mask = {tc_p2_bus_if.req_data.byteen, {(TC_DATA_WIDTH/8){1'b0}},tc_p0_bus_if.req_data.byteen}; - assign tc_a_bits_data = {tc_p2_bus_if.req_data.data, {TC_DATA_WIDTH{1'b0}}, tc_p0_bus_if.req_data.data}; - assign tc_p0_bus_if.req_ready = tc_a_ready[0]; - assign tc_p0_bus_if.rsp_valid = tc_d_valid[0]; - assign tc_p0_bus_if.rsp_data.data = tc_d_bits_data[0 * TC_DATA_WIDTH +: TC_DATA_WIDTH]; - assign tc_p0_bus_if.rsp_data.tag = tc_d_bits_tag[0 * TC_TAG_WIDTH +: TC_TAG_WIDTH]; - assign tc_p2_bus_if.req_ready = tc_a_ready[2]; - assign tc_p2_bus_if.rsp_valid = tc_d_valid[2]; - assign tc_p2_bus_if.rsp_data.data = tc_d_bits_data[2 * TC_DATA_WIDTH +: TC_DATA_WIDTH]; - assign tc_p2_bus_if.rsp_data.tag = tc_d_bits_tag[2 * TC_TAG_WIDTH +: TC_TAG_WIDTH]; - assign tc_d_ready = {tc_p2_bus_if.rsp_ready, 1'b0, tc_p0_bus_if.rsp_ready}; + VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p0_bus_if[NUM_TENSOR_CORES](); + VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p2_bus_if[NUM_TENSOR_CORES](); + + for (genvar tc = 0; tc < NUM_TENSOR_CORES; ++tc) begin : g_tc_ports + localparam P0 = tc * 3; + localparam P1 = tc * 3 + 1; + localparam P2 = tc * 3 + 2; + + assign tc_a_valid[P0] = tc_p0_bus_if[tc].req_valid; + assign tc_a_valid[P1] = 1'b0; + assign tc_a_valid[P2] = tc_p2_bus_if[tc].req_valid; + + assign tc_a_bits_write[P0] = tc_p0_bus_if[tc].req_data.rw; + assign tc_a_bits_write[P1] = 1'b0; + assign tc_a_bits_write[P2] = tc_p2_bus_if[tc].req_data.rw; + + assign tc_a_bits_address[P0 * 32 +: 32] = tc_p0_bus_if[tc].req_data.addr; + assign tc_a_bits_address[P1 * 32 +: 32] = 32'b0; + assign tc_a_bits_address[P2 * 32 +: 32] = tc_p2_bus_if[tc].req_data.addr; + + assign tc_a_bits_tag[P0 * TC_TAG_WIDTH +: TC_TAG_WIDTH] = tc_p0_bus_if[tc].req_data.tag; + assign tc_a_bits_tag[P1 * TC_TAG_WIDTH +: TC_TAG_WIDTH] = '0; + assign tc_a_bits_tag[P2 * TC_TAG_WIDTH +: TC_TAG_WIDTH] = tc_p2_bus_if[tc].req_data.tag; + + assign tc_a_bits_mask[P0 * 32 +: 32] = tc_p0_bus_if[tc].req_data.byteen; + assign tc_a_bits_mask[P1 * 32 +: 32] = '0; + assign tc_a_bits_mask[P2 * 32 +: 32] = tc_p2_bus_if[tc].req_data.byteen; + + assign tc_a_bits_data[P0 * TC_DATA_WIDTH +: TC_DATA_WIDTH] = tc_p0_bus_if[tc].req_data.data; + assign tc_a_bits_data[P1 * TC_DATA_WIDTH +: TC_DATA_WIDTH] = '0; + assign tc_a_bits_data[P2 * TC_DATA_WIDTH +: TC_DATA_WIDTH] = tc_p2_bus_if[tc].req_data.data; + + assign tc_p0_bus_if[tc].req_ready = tc_a_ready[P0]; + assign tc_p0_bus_if[tc].rsp_valid = tc_d_valid[P0]; + assign tc_p0_bus_if[tc].rsp_data.data = tc_d_bits_data[P0 * TC_DATA_WIDTH +: TC_DATA_WIDTH]; + assign tc_p0_bus_if[tc].rsp_data.tag = tc_d_bits_tag[P0 * TC_TAG_WIDTH +: TC_TAG_WIDTH]; + + assign tc_p2_bus_if[tc].req_ready = tc_a_ready[P2]; + assign tc_p2_bus_if[tc].rsp_valid = tc_d_valid[P2]; + assign tc_p2_bus_if[tc].rsp_data.data = tc_d_bits_data[P2 * TC_DATA_WIDTH +: TC_DATA_WIDTH]; + assign tc_p2_bus_if[tc].rsp_data.tag = tc_d_bits_tag[P2 * TC_TAG_WIDTH +: TC_TAG_WIDTH]; + + assign tc_d_ready[P0] = tc_p0_bus_if[tc].rsp_ready; + assign tc_d_ready[P1] = 1'b0; + assign tc_d_ready[P2] = tc_p2_bus_if[tc].rsp_ready; + end // gbar ------------------------------------------------------------------- `ifdef GBAR_ENABLE @@ -439,7 +474,8 @@ module Vortex import VX_gpu_pkg::*; #( // TODO: SCOPE_IO_BIND should be socket id VX_core #( .CORE_ID (CORE_ID), - .TENSOR_FP16 (TENSOR_FP16) + .TENSOR_FP16 (TENSOR_FP16), + .NUM_TENSOR_CORES (NUM_TENSOR_CORES) ) core ( `SCOPE_IO_BIND (0) @@ -465,22 +501,34 @@ module Vortex import VX_gpu_pkg::*; #( .tensor_smem_A_if (tc_p0_bus_if), `ifdef EXT_T_BLACKWELL - .tensor_tmem_C_wen(tc_tmem_C_wen), + .tensor_tmem_A_ren(tc_tmem_A_ren), + .tensor_tmem_A_rready(tc_tmem_A_rready), + .tensor_tmem_A_raddr(tc_tmem_A_raddr), + .tensor_tmem_A_rdata(tc_tmem_A_rdata), .tensor_tmem_C_ren(tc_tmem_C_ren), - .tensor_tmem_C_waddr(tc_tmem_C_waddr), + .tensor_tmem_C_rready(tc_tmem_C_rready), .tensor_tmem_C_raddr(tc_tmem_C_raddr), + .tensor_tmem_C_rdata(tc_tmem_C_rdata), + .tensor_tmem_C_wen(tc_tmem_C_wen), + .tensor_tmem_C_wready(tc_tmem_C_wready), + .tensor_tmem_C_waddr(tc_tmem_C_waddr), .tensor_tmem_C_wdata(tc_tmem_C_wdata), .tensor_tmem_C_mask(tc_tmem_C_mask), - .tensor_tmem_C_rdata(tc_tmem_C_rdata), .tensor_smem_B_if (tc_p2_bus_if), `else - .tensor_tmem_C_wen(tc_tmem_C_wen), + .tensor_tmem_A_ren(tc_tmem_A_ren), + .tensor_tmem_A_rready(tc_tmem_A_rready), + .tensor_tmem_A_raddr(tc_tmem_A_raddr), + .tensor_tmem_A_rdata(tc_tmem_A_rdata), .tensor_tmem_C_ren(tc_tmem_C_ren), - .tensor_tmem_C_waddr(tc_tmem_C_waddr), + .tensor_tmem_C_rready(tc_tmem_C_rready), .tensor_tmem_C_raddr(tc_tmem_C_raddr), + .tensor_tmem_C_rdata(tc_tmem_C_rdata), + .tensor_tmem_C_wen(tc_tmem_C_wen), + .tensor_tmem_C_wready(tc_tmem_C_wready), + .tensor_tmem_C_waddr(tc_tmem_C_waddr), .tensor_tmem_C_wdata(tc_tmem_C_wdata), .tensor_tmem_C_mask(tc_tmem_C_mask), - .tensor_tmem_C_rdata(tc_tmem_C_rdata), .tensor_smem_B_if (tc_p2_bus_if), `endif @@ -583,7 +631,7 @@ module Vortex import VX_gpu_pkg::*; #( $display("simulation has probably ended. exiting"); $finish(); end - if (finished) begin + if (busy_prev && !busy) begin $display("---------------- core%2d has no more active warps ----------------", CORE_ID); // TODO: lane assumed to be 4 // `ifndef SYNTHESIS diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index ada8fead..084f0399 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -243,16 +243,17 @@ `define INST_SFU_WSPAWN 4'h1 `define INST_SFU_SPLIT 4'h2 `define INST_SFU_JOIN 4'h3 -`define INST_SFU_BAR 4'h4 -`define INST_SFU_PRED 4'h5 -`define INST_SFU_CSRRW 4'h6 -`define INST_SFU_CSRRS 4'h7 -`define INST_SFU_CSRRC 4'h8 -`define INST_SFU_CMOV 4'h9 -`define INST_SFU_BITS 4 -`define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1) -`define INST_SFU_IS_WCTL(op) (op <= 5) -`define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8) +`define INST_SFU_BAR 4'h4 +`define INST_SFU_PRED 4'h5 +`define INST_SFU_CSRRW 4'h6 +`define INST_SFU_CSRRS 4'h7 +`define INST_SFU_CSRRC 4'h8 +`define INST_SFU_CMOV 4'h9 +`define INST_SFU_BAR_MASK 4'ha +`define INST_SFU_BITS 4 +`define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1) +`define INST_SFU_IS_WCTL(op) ((op <= 5) || (op == `INST_SFU_BAR_MASK)) +`define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8) `define INST_TENSOR_HMMA 4'b0000 // Hopper WGMMA-style asynchronous op diff --git a/hw/rtl/VX_gpu_pkg.sv b/hw/rtl/VX_gpu_pkg.sv index 49dc9564..092d8d63 100644 --- a/hw/rtl/VX_gpu_pkg.sv +++ b/hw/rtl/VX_gpu_pkg.sv @@ -46,6 +46,8 @@ package VX_gpu_pkg; logic valid; logic [`NB_WIDTH-1:0] id; logic is_global; + logic [1:0] domain; + logic [`NUM_WARPS-1:0] mask; `ifdef GBAR_ENABLE logic [`MAX(`NW_WIDTH, `NC_WIDTH)-1:0] size_m1; `else @@ -53,6 +55,13 @@ package VX_gpu_pkg; `endif } barrier_t; + localparam logic [1:0] BARRIER_ALL = 2'd0; + localparam logic [1:0] BARRIER_SCALAR = 2'd1; + localparam logic [1:0] BARRIER_TENSOR = 2'd2; + localparam logic [1:0] BARRIER_MASK = 2'd3; + localparam logic WU_DOMAIN_SCALAR = 1'b0; + localparam logic WU_DOMAIN_TENSOR = 1'b1; + typedef struct packed { logic [`XLEN-1:0] startup_addr; logic [7:0] mpm_class; @@ -90,7 +99,7 @@ package VX_gpu_pkg; localparam ICACHE_TAG_ID_BITS = `NW_WIDTH; // Core request tag bits - localparam ICACHE_TAG_WIDTH = (`UUID_WIDTH + ICACHE_TAG_ID_BITS); + localparam ICACHE_TAG_WIDTH = (1 + `UUID_WIDTH + ICACHE_TAG_ID_BITS); // Memory request data bits localparam ICACHE_MEM_DATA_WIDTH = (ICACHE_LINE_SIZE * 8); diff --git a/hw/rtl/core/VX_alu_unit.sv b/hw/rtl/core/VX_alu_unit.sv index 1c089509..27742d82 100644 --- a/hw/rtl/core/VX_alu_unit.sv +++ b/hw/rtl/core/VX_alu_unit.sv @@ -188,6 +188,46 @@ module VX_alu_unit #( .ready_out (commit_block_if[block_idx].ready), `UNUSED_PIN (sel_out) ); + + `ifdef DBG_TRACE_CORE_PIPELINE_VCS + always @(posedge clk) begin + if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)) begin + if (execute_if[block_idx].valid + && ((execute_if[block_idx].data.PC == 32'h80000010) || (execute_if[block_idx].data.PC == 32'h80000014))) begin + `TRACE(1, ("%d: core%0d-alu-execute-block: block=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, op=0x%0h, mod=%0d, is_mul=%b, is_red=%b, wb=%0d, rd=%0d (#%0d)\n", + $time, CORE_ID, block_idx, execute_if[block_idx].valid, execute_if[block_idx].ready, + execute_if[block_idx].data.wid, execute_if[block_idx].data.PC, + execute_if[block_idx].data.op_type, execute_if[block_idx].data.op_mod, + is_muldiv_op, is_reduce_op, execute_if[block_idx].data.wb, + execute_if[block_idx].data.rd, execute_if[block_idx].data.uuid)); + end + if (int_execute_if.valid + && ((int_execute_if.data.PC == 32'h80000010) || (int_execute_if.data.PC == 32'h80000014))) begin + `TRACE(1, ("%d: core%0d-alu-int-execute: block=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d (#%0d)\n", + $time, CORE_ID, block_idx, int_execute_if.valid, int_execute_if.ready, + int_execute_if.data.wid, int_execute_if.data.PC, + int_execute_if.data.wb, int_execute_if.data.rd, int_execute_if.data.uuid)); + end + if (int_commit_if.valid + && ((int_commit_if.data.PC == 32'h80000010) || (int_commit_if.data.PC == 32'h80000014))) begin + `TRACE(1, ("%d: core%0d-alu-int-commit: block=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n", + $time, CORE_ID, block_idx, int_commit_if.valid, int_commit_if.ready, + int_commit_if.data.wid, int_commit_if.data.PC, + int_commit_if.data.wb, int_commit_if.data.rd, + int_commit_if.data.sop, int_commit_if.data.eop, int_commit_if.data.uuid)); + end + if (commit_block_if[block_idx].valid + && ((commit_block_if[block_idx].data.PC == 32'h80000010) || (commit_block_if[block_idx].data.PC == 32'h80000014))) begin + `TRACE(1, ("%d: core%0d-alu-commit-block: block=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n", + $time, CORE_ID, block_idx, commit_block_if[block_idx].valid, commit_block_if[block_idx].ready, + commit_block_if[block_idx].data.wid, commit_block_if[block_idx].data.PC, + commit_block_if[block_idx].data.wb, commit_block_if[block_idx].data.rd, + commit_block_if[block_idx].data.sop, commit_block_if[block_idx].data.eop, + commit_block_if[block_idx].data.uuid)); + end + end + end + `endif end `RESET_RELAY (commit_reset, reset); diff --git a/hw/rtl/core/VX_commit.sv b/hw/rtl/core/VX_commit.sv index b2676be7..568310e8 100644 --- a/hw/rtl/core/VX_commit.sv +++ b/hw/rtl/core/VX_commit.sv @@ -234,6 +234,26 @@ module VX_commit import VX_gpu_pkg::*; #( for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin always @(posedge clk) begin if (!reset && ($time > `TRACE_STARTTIME)) begin + if ((CORE_ID == 0) + && commit_if[i].valid + && ((commit_if[i].data.PC == 32'h80000010) || (commit_if[i].data.PC == 32'h80000014))) begin + `TRACE(1, ("%d: core%0d-commit-arb-out: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, tensor=%b, sop=%b, eop=%b, fire=%b (#%0d)\n", + $time, CORE_ID, i, commit_if[i].valid, commit_if[i].ready, + commit_if[i].data.wid, commit_if[i].data.PC, + commit_if[i].data.wb, commit_if[i].data.rd, + commit_if[i].data.tensor, commit_if[i].data.sop, + commit_if[i].data.eop, commit_fire[i], commit_if[i].data.uuid)); + end + if ((CORE_ID == 0) + && writeback_if[i].valid + && ((writeback_if[i].data.PC == 32'h80000010) || (writeback_if[i].data.PC == 32'h80000014))) begin + `TRACE(1, ("%d: core%0d-writeback-out: isw=%0d, valid=%b, wid=%0d, wis=%0d, PC=0x%0h, rd=%0d, tensor=%b, sop=%b, eop=%b (#%0d)\n", + $time, CORE_ID, i, writeback_if[i].valid, + wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.wis, + writeback_if[i].data.PC, writeback_if[i].data.rd, + writeback_if[i].data.tensor, writeback_if[i].data.sop, + writeback_if[i].data.eop, writeback_if[i].data.uuid)); + end if (alu_commit_if[i].valid && alu_commit_if[i].ready) begin `TRACE(1, ("%d: core%0d-commit: wid=%0d, PC=0x%0h, ex=ALU, tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", $time, CORE_ID, alu_commit_if[i].data.wid, alu_commit_if[i].data.PC, alu_commit_if[i].data.tmask, alu_commit_if[i].data.wb, alu_commit_if[i].data.rd, alu_commit_if[i].data.sop, alu_commit_if[i].data.eop)); `TRACE_ARRAY1D(1, alu_commit_if[i].data.data, `NUM_THREADS); @@ -259,6 +279,16 @@ module VX_commit import VX_gpu_pkg::*; #( end end end + always @(posedge clk) begin + if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)) begin + for (integer i = 0; i < `ISSUE_WIDTH; ++i) begin + if (commit_sched_if.committed[i] && (commit_sched_if.committed_wid[i] == `NW_WIDTH'(0))) begin + `TRACE(1, ("%d: core%0d-commit-sched-out: isw=%0d, committed=%b, wid=%0d\n", + $time, CORE_ID, i, commit_sched_if.committed[i], commit_sched_if.committed_wid[i])); + end + end + end + end `endif endmodule diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index 2f8af2b4..a315102c 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -17,10 +17,11 @@ `include "VX_fpu_define.vh" `endif -module VX_core import VX_gpu_pkg::*; #( - parameter CORE_ID = 0, - parameter TENSOR_FP16 = 0 -) ( +module VX_core import VX_gpu_pkg::*; #( + parameter CORE_ID = 0, + parameter TENSOR_FP16 = 0, + parameter NUM_TENSOR_CORES = `NUM_TENSOR_WARPS +) ( `SCOPE_IO_DECL // Clock @@ -39,16 +40,21 @@ module VX_core import VX_gpu_pkg::*; #( VX_mem_bus_if.master icache_bus_if, - VX_tc_bus_if.master tensor_smem_A_if, - // tensor_tmem_C is now a direct SRAM port - output logic tensor_tmem_C_wen, - output logic tensor_tmem_C_ren, - output logic [8:0] tensor_tmem_C_waddr, - output logic [8:0] tensor_tmem_C_raddr, - output logic [`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_wdata, - output logic [`NUM_THREADS*`XLEN/8-1:0] tensor_tmem_C_mask, - input logic [`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_rdata, - VX_tc_bus_if.master tensor_smem_B_if, + VX_tc_bus_if.master tensor_smem_A_if[NUM_TENSOR_CORES], + output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_A_ren, + input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_A_rready, + output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_A_raddr, + input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_A_rdata, + output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_ren, + input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_rready, + output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_C_raddr, + input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_rdata, + output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_wen, + input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_wready, + output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_C_waddr, + output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_wdata, + output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN/8-1:0] tensor_tmem_C_mask, + VX_tc_bus_if.master tensor_smem_B_if[NUM_TENSOR_CORES], `ifdef GBAR_ENABLE VX_gbar_bus_if.master gbar_bus_if, @@ -65,18 +71,26 @@ module VX_core import VX_gpu_pkg::*; #( input wire [31:0] acc_read_in, output wire [31:0] acc_write_out, output wire acc_write_en -); - VX_schedule_if schedule_if(); - VX_fetch_if fetch_if(); - VX_decode_if decode_if(); - VX_sched_csr_if sched_csr_if(); - VX_decode_sched_if decode_sched_if(); -`ifdef EXT_T_ASYNC - VX_tc_rf_if tensor_regfile_if(); -`endif +); + `WU_CONFIG_STATIC_ASSERTS + + VX_schedule_if scalar_schedule_if(); + VX_schedule_if tensor_schedule_if(); + VX_fetch_if scalar_fetch_if(); + VX_fetch_if tensor_fetch_if(); + VX_decode_if scalar_decode_if(); + VX_decode_if tensor_decode_if(); + VX_sched_csr_if sched_csr_if(); + VX_decode_sched_if decode_sched_if(); + VX_decode_sched_if scalar_decode_sched_if(); + VX_decode_sched_if tensor_decode_sched_if(); +`ifdef EXT_T_ASYNC + VX_tc_rf_if tensor_regfile_if[NUM_TENSOR_CORES](); + VX_tc_rf_if scalar_dummy_tensor_regfile_if[NUM_TENSOR_CORES](); +`endif VX_commit_sched_if commit_sched_if(); VX_commit_csr_if commit_csr_if(); - VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS](); + VX_branch_ctl_if branch_ctl_if[2 * `NUM_ALU_BLOCKS](); VX_warp_ctl_if warp_ctl_if(); VX_dispatch_if alu_dispatch_if[`ISSUE_WIDTH](); @@ -88,10 +102,28 @@ module VX_core import VX_gpu_pkg::*; #( VX_dispatch_if fpu_dispatch_if[`ISSUE_WIDTH](); VX_commit_if fpu_commit_if[`ISSUE_WIDTH](); `endif -`ifdef EXT_T_ENABLE - VX_dispatch_if tensor_dispatch_if[`ISSUE_WIDTH](); - VX_commit_if tensor_commit_if[`ISSUE_WIDTH](); -`endif +`ifdef EXT_T_ENABLE + VX_dispatch_if tensor_alu_dispatch_if[`ISSUE_WIDTH](); + VX_dispatch_if tensor_lsu_dispatch_if[`ISSUE_WIDTH](); + VX_dispatch_if tensor_ctrl_dispatch_if[`ISSUE_WIDTH](); + VX_dispatch_if tensor_dispatch_if[`ISSUE_WIDTH](); + VX_dispatch_if scalar_dummy_tensor_alu_dispatch_if[`ISSUE_WIDTH](); + VX_dispatch_if scalar_dummy_tensor_lsu_dispatch_if[`ISSUE_WIDTH](); + VX_dispatch_if scalar_dummy_tensor_ctrl_dispatch_if[`ISSUE_WIDTH](); + VX_dispatch_if scalar_dummy_tensor_dispatch_if[`ISSUE_WIDTH](); + VX_dispatch_if tensor_dummy_alu_dispatch_if[`ISSUE_WIDTH](); + VX_dispatch_if tensor_dummy_lsu_dispatch_if[`ISSUE_WIDTH](); +`ifdef EXT_F_ENABLE + VX_dispatch_if tensor_dummy_fpu_dispatch_if[`ISSUE_WIDTH](); +`endif + VX_dispatch_if tensor_dummy_sfu_dispatch_if[`ISSUE_WIDTH](); + VX_commit_if tensor_commit_if[`ISSUE_WIDTH](); + wire tensor_csr_unlock_valid; + wire [`NW_WIDTH-1:0] tensor_csr_unlock_wid; + wire tensor_tmc_valid; + wire [`NW_WIDTH-1:0] tensor_tmc_wid; + wire [`NUM_THREADS-1:0] tensor_tmc_tmask; +`endif VX_dispatch_if sfu_dispatch_if[`ISSUE_WIDTH](); VX_commit_if sfu_commit_if[`ISSUE_WIDTH](); @@ -102,16 +134,62 @@ module VX_core import VX_gpu_pkg::*; #( .TAG_WIDTH (DCACHE_TAG_WIDTH) ) dcache_bus_tmp_if[DCACHE_NUM_REQS](); -`ifdef PERF_ENABLE - VX_mem_perf_if mem_perf_tmp_if(); - VX_pipeline_perf_if pipeline_perf_if(); - - assign mem_perf_tmp_if.icache = mem_perf_if.icache; +`ifdef PERF_ENABLE + VX_mem_perf_if mem_perf_tmp_if(); + VX_pipeline_perf_if pipeline_perf_if(); + VX_pipeline_perf_if scalar_decode_perf_if(); + VX_pipeline_perf_if tensor_decode_perf_if(); + VX_pipeline_perf_if scalar_issue_perf_if(); + VX_pipeline_perf_if tensor_issue_perf_if(); + + assign mem_perf_tmp_if.icache = mem_perf_if.icache; assign mem_perf_tmp_if.dcache = mem_perf_if.dcache; - assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache; - assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache; - assign mem_perf_tmp_if.mem = mem_perf_if.mem; -`endif + assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache; + assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache; + assign mem_perf_tmp_if.mem = mem_perf_if.mem; + + assign pipeline_perf_if.illegal_tensor_reg_access = scalar_decode_perf_if.illegal_tensor_reg_access + tensor_decode_perf_if.illegal_tensor_reg_access; + assign pipeline_perf_if.illegal_tensor_scalar_op = scalar_decode_perf_if.illegal_tensor_scalar_op + tensor_decode_perf_if.illegal_tensor_scalar_op; + assign pipeline_perf_if.illegal_scalar_tensor_op = scalar_decode_perf_if.illegal_scalar_tensor_op + tensor_decode_perf_if.illegal_scalar_tensor_op; + assign pipeline_perf_if.ibf_stalls = scalar_issue_perf_if.ibf_stalls + tensor_issue_perf_if.ibf_stalls; + assign pipeline_perf_if.scb_stalls = scalar_issue_perf_if.scb_stalls + tensor_issue_perf_if.scb_stalls; + assign pipeline_perf_if.scb_any_unit_uses = scalar_issue_perf_if.scb_any_unit_uses + tensor_issue_perf_if.scb_any_unit_uses; + assign pipeline_perf_if.scb_fires = scalar_issue_perf_if.scb_fires + tensor_issue_perf_if.scb_fires; + assign pipeline_perf_if.scb_any_fire_cycles = scalar_issue_perf_if.scb_any_fire_cycles + tensor_issue_perf_if.scb_any_fire_cycles; + assign pipeline_perf_if.dispatch_any_fire_cycles = scalar_issue_perf_if.dispatch_any_fire_cycles + tensor_issue_perf_if.dispatch_any_fire_cycles; + for (genvar perf_i = 0; perf_i < `NUM_EX_UNITS; ++perf_i) begin : g_issue_perf_merge + assign pipeline_perf_if.units_uses[perf_i] = scalar_issue_perf_if.units_uses[perf_i] + tensor_issue_perf_if.units_uses[perf_i]; + assign pipeline_perf_if.dispatch_stalls[perf_i] = scalar_issue_perf_if.dispatch_stalls[perf_i] + tensor_issue_perf_if.dispatch_stalls[perf_i]; + assign pipeline_perf_if.dispatch_valids[perf_i] = scalar_issue_perf_if.dispatch_valids[perf_i] + tensor_issue_perf_if.dispatch_valids[perf_i]; + assign pipeline_perf_if.dispatch_fires[perf_i] = scalar_issue_perf_if.dispatch_fires[perf_i] + tensor_issue_perf_if.dispatch_fires[perf_i]; + end + for (genvar perf_sfu_i = 0; perf_sfu_i < `NUM_SFU_UNITS; ++perf_sfu_i) begin : g_issue_sfu_perf_merge + assign pipeline_perf_if.sfu_uses[perf_sfu_i] = scalar_issue_perf_if.sfu_uses[perf_sfu_i] + tensor_issue_perf_if.sfu_uses[perf_sfu_i]; + end +`endif + + assign decode_sched_if.valid = scalar_decode_sched_if.valid || tensor_decode_sched_if.valid; + assign decode_sched_if.is_wstall = tensor_decode_sched_if.valid ? tensor_decode_sched_if.is_wstall : scalar_decode_sched_if.is_wstall; + assign decode_sched_if.wid = tensor_decode_sched_if.valid ? tensor_decode_sched_if.wid : scalar_decode_sched_if.wid; + `RUNTIME_ASSERT( + !(scalar_decode_sched_if.valid && tensor_decode_sched_if.valid), + ("%t: *** core%0d-two-decode-sched-events-same-cycle", $time, CORE_ID) + ) + +`ifdef EXT_T_ENABLE + for (genvar dummy_i = 0; dummy_i < `ISSUE_WIDTH; ++dummy_i) begin : g_dummy_dispatch_ready + assign scalar_dummy_tensor_alu_dispatch_if[dummy_i].ready = 1'b1; + assign scalar_dummy_tensor_lsu_dispatch_if[dummy_i].ready = 1'b1; + assign scalar_dummy_tensor_ctrl_dispatch_if[dummy_i].ready = 1'b1; + assign scalar_dummy_tensor_dispatch_if[dummy_i].ready = 1'b1; + assign tensor_dummy_alu_dispatch_if[dummy_i].ready = 1'b1; + assign tensor_dummy_lsu_dispatch_if[dummy_i].ready = 1'b1; + `ifdef EXT_F_ENABLE + assign tensor_dummy_fpu_dispatch_if[dummy_i].ready = 1'b1; + `endif + assign tensor_dummy_sfu_dispatch_if[dummy_i].ready = 1'b1; + end +`endif `RESET_RELAY (dcr_data_reset, reset); `RESET_RELAY (schedule_reset, reset); @@ -132,9 +210,10 @@ module VX_core import VX_gpu_pkg::*; #( `SCOPE_IO_SWITCH (3) - VX_schedule #( - .CORE_ID (CORE_ID) - ) schedule ( + VX_schedule #( + .CORE_ID (CORE_ID), + .NUM_BRANCHES (2 * `NUM_ALU_BLOCKS) + ) schedule ( .clk (clk), .reset (schedule_reset), @@ -144,12 +223,20 @@ module VX_core import VX_gpu_pkg::*; #( .base_dcrs (base_dcrs), - .warp_ctl_if (warp_ctl_if), - .branch_ctl_if (branch_ctl_if), - .decode_sched_if(decode_sched_if), + .warp_ctl_if (warp_ctl_if), + .branch_ctl_if (branch_ctl_if), + `ifdef EXT_T_ENABLE + .tensor_csr_unlock_valid(tensor_csr_unlock_valid), + .tensor_csr_unlock_wid(tensor_csr_unlock_wid), + .tensor_tmc_valid(tensor_tmc_valid), + .tensor_tmc_wid(tensor_tmc_wid), + .tensor_tmc_tmask(tensor_tmc_tmask), + `endif + .decode_sched_if(decode_sched_if), .commit_sched_if(commit_sched_if), - .schedule_if (schedule_if), + .scalar_schedule_if (scalar_schedule_if), + .tensor_schedule_if (tensor_schedule_if), `ifdef GBAR_ENABLE .gbar_bus_if (gbar_bus_if), `endif @@ -164,54 +251,114 @@ module VX_core import VX_gpu_pkg::*; #( `SCOPE_IO_BIND (0) .clk (clk), .reset (fetch_reset), - .icache_bus_if (icache_bus_if), - .schedule_if (schedule_if), - .fetch_if (fetch_if) - ); - - VX_decode #( - .CORE_ID (CORE_ID) - ) decode ( - .clk (clk), - .reset (decode_reset), - .fetch_if (fetch_if), - .decode_if (decode_if), - .decode_sched_if(decode_sched_if) - ); - - VX_issue #( - .CORE_ID (CORE_ID) - ) issue ( - `SCOPE_IO_BIND (1) - - .clk (clk), - .reset (issue_reset), - - `ifdef PERF_ENABLE - .perf_issue_if (pipeline_perf_if.issue), - `endif - - .decode_if (decode_if), - .writeback_if (writeback_if), - - .alu_dispatch_if(alu_dispatch_if), - .lsu_dispatch_if(lsu_dispatch_if), - `ifdef EXT_F_ENABLE + .icache_bus_if (icache_bus_if), + .scalar_schedule_if (scalar_schedule_if), + .tensor_schedule_if (tensor_schedule_if), + .scalar_fetch_if(scalar_fetch_if), + .tensor_fetch_if(tensor_fetch_if) + ); + + VX_decode #( + .CORE_ID (CORE_ID) + ) scalar_decode ( + .clk (clk), + .reset (decode_reset), + .fetch_if (scalar_fetch_if), + .decode_if (scalar_decode_if), + `ifdef PERF_ENABLE + .perf_decode_if (scalar_decode_perf_if.decode), + `endif + .decode_sched_if(scalar_decode_sched_if) + ); + + VX_decode #( + .CORE_ID (CORE_ID) + ) tensor_decode ( + .clk (clk), + .reset (decode_reset), + .fetch_if (tensor_fetch_if), + .decode_if (tensor_decode_if), + `ifdef PERF_ENABLE + .perf_decode_if (tensor_decode_perf_if.decode), + `endif + .decode_sched_if(tensor_decode_sched_if) + ); + + VX_issue #( + .CORE_ID (CORE_ID), + .NUM_TENSOR_CORES (NUM_TENSOR_CORES), + .DOMAIN (WU_DOMAIN_SCALAR) + ) scalar_issue ( + `SCOPE_IO_BIND (1) + + .clk (clk), + .reset (issue_reset), + + `ifdef PERF_ENABLE + .perf_issue_if (scalar_issue_perf_if.issue), + `endif + + .decode_if (scalar_decode_if), + .writeback_if (writeback_if), + + .alu_dispatch_if(alu_dispatch_if), + .lsu_dispatch_if(lsu_dispatch_if), + `ifdef EXT_F_ENABLE .fpu_dispatch_if(fpu_dispatch_if), `endif - `ifdef EXT_T_ENABLE - .tensor_dispatch_if(tensor_dispatch_if), - `ifdef EXT_T_ASYNC - .tensor_regfile_if (tensor_regfile_if), - `endif - `endif - .sfu_dispatch_if(sfu_dispatch_if) - ); + `ifdef EXT_T_ENABLE + .tensor_alu_dispatch_if(scalar_dummy_tensor_alu_dispatch_if), + .tensor_lsu_dispatch_if(scalar_dummy_tensor_lsu_dispatch_if), + .tensor_ctrl_dispatch_if(scalar_dummy_tensor_ctrl_dispatch_if), + .tensor_dispatch_if(scalar_dummy_tensor_dispatch_if), + `ifdef EXT_T_ASYNC + .tensor_regfile_if (scalar_dummy_tensor_regfile_if), + `endif + `endif + .sfu_dispatch_if(sfu_dispatch_if) + ); + +`ifdef EXT_T_ENABLE + VX_issue #( + .CORE_ID (CORE_ID), + .NUM_TENSOR_CORES (NUM_TENSOR_CORES), + .DOMAIN (WU_DOMAIN_TENSOR) + ) tensor_issue ( + `SCOPE_IO_BIND (1) + + .clk (clk), + .reset (issue_reset), + + `ifdef PERF_ENABLE + .perf_issue_if (tensor_issue_perf_if.issue), + `endif + + .decode_if (tensor_decode_if), + .writeback_if (writeback_if), + + .alu_dispatch_if(tensor_dummy_alu_dispatch_if), + .lsu_dispatch_if(tensor_dummy_lsu_dispatch_if), + `ifdef EXT_F_ENABLE + .fpu_dispatch_if(tensor_dummy_fpu_dispatch_if), + `endif + .tensor_alu_dispatch_if(tensor_alu_dispatch_if), + .tensor_lsu_dispatch_if(tensor_lsu_dispatch_if), + .tensor_ctrl_dispatch_if(tensor_ctrl_dispatch_if), + .tensor_dispatch_if(tensor_dispatch_if), + `ifdef EXT_T_ASYNC + .tensor_regfile_if (tensor_regfile_if), + `endif + .sfu_dispatch_if(tensor_dummy_sfu_dispatch_if) + ); +`else + assign tensor_decode_if.ready = 1'b1; +`endif VX_execute #( .CORE_ID (CORE_ID), - .TENSOR_FP16 (TENSOR_FP16) - ) execute ( + .TENSOR_FP16 (TENSOR_FP16), + .NUM_TENSOR_CORES (NUM_TENSOR_CORES) + ) execute ( `SCOPE_IO_BIND (2) .clk (clk), @@ -220,10 +367,15 @@ module VX_core import VX_gpu_pkg::*; #( .base_dcrs (base_dcrs), .downstream_mem_busy(downstream_mem_busy), - `ifdef PERF_ENABLE - .mem_perf_if (mem_perf_tmp_if), - .pipeline_perf_if(pipeline_perf_if), - `endif + `ifdef PERF_ENABLE + .mem_perf_if (mem_perf_tmp_if), + .pipeline_perf_if(pipeline_perf_if), + .perf_scalar_lsu_reqs(pipeline_perf_if.scalar_lsu_reqs), + .perf_tensor_lsu_reqs(pipeline_perf_if.tensor_lsu_reqs), + .perf_scalar_lsu_stalls(pipeline_perf_if.scalar_lsu_stalls), + .perf_tensor_lsu_stalls(pipeline_perf_if.tensor_lsu_stalls), + .perf_mem_merge_stalls(pipeline_perf_if.mem_merge_stalls), + `endif .dcache_bus_if (dcache_bus_tmp_if), @@ -231,19 +383,33 @@ module VX_core import VX_gpu_pkg::*; #( .fpu_dispatch_if(fpu_dispatch_if), .fpu_commit_if (fpu_commit_if), `endif - `ifdef EXT_T_ENABLE - .tensor_dispatch_if (tensor_dispatch_if), - .tensor_commit_if (tensor_commit_if), + `ifdef EXT_T_ENABLE + .tensor_alu_dispatch_if (tensor_alu_dispatch_if), + .tensor_lsu_dispatch_if (tensor_lsu_dispatch_if), + .tensor_ctrl_dispatch_if (tensor_ctrl_dispatch_if), + .tensor_dispatch_if (tensor_dispatch_if), + .tensor_commit_if (tensor_commit_if), + .tensor_csr_unlock_valid(tensor_csr_unlock_valid), + .tensor_csr_unlock_wid(tensor_csr_unlock_wid), + .tensor_tmc_valid(tensor_tmc_valid), + .tensor_tmc_wid(tensor_tmc_wid), + .tensor_tmc_tmask(tensor_tmc_tmask), `ifdef EXT_T_ASYNC .tensor_regfile_if (tensor_regfile_if), .tensor_smem_A_if (tensor_smem_A_if), - .tensor_tmem_C_wen (tensor_tmem_C_wen), - .tensor_tmem_C_ren (tensor_tmem_C_ren), - .tensor_tmem_C_waddr(tensor_tmem_C_waddr), - .tensor_tmem_C_raddr(tensor_tmem_C_raddr), - .tensor_tmem_C_wdata(tensor_tmem_C_wdata), - .tensor_tmem_C_mask(tensor_tmem_C_mask), - .tensor_tmem_C_rdata(tensor_tmem_C_rdata), + .tensor_tmem_A_ren (tensor_tmem_A_ren), + .tensor_tmem_A_rready(tensor_tmem_A_rready), + .tensor_tmem_A_raddr(tensor_tmem_A_raddr), + .tensor_tmem_A_rdata(tensor_tmem_A_rdata), + .tensor_tmem_C_ren (tensor_tmem_C_ren), + .tensor_tmem_C_rready(tensor_tmem_C_rready), + .tensor_tmem_C_raddr(tensor_tmem_C_raddr), + .tensor_tmem_C_rdata(tensor_tmem_C_rdata), + .tensor_tmem_C_wen (tensor_tmem_C_wen), + .tensor_tmem_C_wready(tensor_tmem_C_wready), + .tensor_tmem_C_waddr(tensor_tmem_C_waddr), + .tensor_tmem_C_wdata(tensor_tmem_C_wdata), + .tensor_tmem_C_mask(tensor_tmem_C_mask), .tensor_smem_B_if (tensor_smem_B_if), `endif `endif @@ -500,10 +666,17 @@ module VX_core import VX_gpu_pkg::*; #( pipeline_perf_if.sched_barrier_idles, `NUM_WARPS); // sched_stalls can happen when the later issue stage stalls, // causing the ibuffer to clog. - $display("scheduler stalls: %d cycles (%.2f%%)", pipeline_perf_if.sched_stalls, - $itor(pipeline_perf_if.sched_stalls) / $itor(cycles) * 100.0); - $display("decode stalls (ibuffer not ready): %d cycles (%.2f%%)",pipeline_perf_if.ibf_stalls, - $itor(pipeline_perf_if.ibf_stalls) / $itor(cycles) * 100.0); + $display("scheduler stalls: %d cycles (%.2f%%)", pipeline_perf_if.sched_stalls, + $itor(pipeline_perf_if.sched_stalls) / $itor(cycles) * 100.0); + $display("wu scalar scheduler ready cycles: %d", pipeline_perf_if.scalar_sched_ready_cycles); + $display("wu tensor scheduler ready cycles: %d", pipeline_perf_if.tensor_sched_ready_cycles); + $display("wu scalar scheduler issued cycles: %d", pipeline_perf_if.scalar_sched_issued_cycles); + $display("wu tensor scheduler issued cycles: %d", pipeline_perf_if.tensor_sched_issued_cycles); + $display("wu illegal tensor reg accesses: %d", pipeline_perf_if.illegal_tensor_reg_access); + $display("wu illegal tensor scalar ops: %d", pipeline_perf_if.illegal_tensor_scalar_op); + $display("wu illegal scalar tensor ops: %d", pipeline_perf_if.illegal_scalar_tensor_op); + $display("decode stalls (ibuffer not ready): %d cycles (%.2f%%)",pipeline_perf_if.ibf_stalls, + $itor(pipeline_perf_if.ibf_stalls) / $itor(cycles) * 100.0); // see VX_scoreboard.sv // scb_stalls: valid & ~ready (ready = stg_ready_in && operands_ready) // units_uses: valid & ~operands_ready @@ -560,12 +733,17 @@ module VX_core import VX_gpu_pkg::*; #( $display("ifetches: %d", perf_ifetches); $display("ifetch latency: %f cycles", $itor(icache_lat) / $itor(ifetches)); - $display("dcache loads: %d", perf_loads); - $display("dcache load latency: %f cycles", - $itor(dcache_lat) / $itor(loads)); - $display("dcache stores: %d", perf_stores); - end - end + $display("dcache loads: %d", perf_loads); + $display("dcache load latency: %f cycles", + $itor(dcache_lat) / $itor(loads)); + $display("dcache stores: %d", perf_stores); + $display("wu scalar lsu accepted requests: %d", pipeline_perf_if.scalar_lsu_reqs); + $display("wu tensor lsu accepted requests: %d", pipeline_perf_if.tensor_lsu_reqs); + $display("wu scalar lsu merge stalls: %d", pipeline_perf_if.scalar_lsu_stalls); + $display("wu tensor lsu merge stalls: %d", pipeline_perf_if.tensor_lsu_stalls); + $display("wu memory merge contention cycles: %d", pipeline_perf_if.mem_merge_stalls); + end + end `endif diff --git a/hw/rtl/core/VX_csr_unit.sv b/hw/rtl/core/VX_csr_unit.sv index bf229789..64988eb6 100644 --- a/hw/rtl/core/VX_csr_unit.sv +++ b/hw/rtl/core/VX_csr_unit.sv @@ -163,6 +163,22 @@ module VX_csr_unit import VX_gpu_pkg::*; #( assign sched_csr_if.unlock_warp = csr_req_valid && csr_req_ready && execute_if.data.eop; assign sched_csr_if.unlock_wid = execute_if.data.wid; +`ifdef DBG_TRACE_CORE_PIPELINE_VCS + always @(posedge clk) begin + if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0) + && execute_if.valid + && (execute_if.data.PC == 32'h80000028)) begin + `TRACE(1, ("%d: core%0d-csr-probe: valid=%b ready=%b req_valid=%b req_ready=%b no_pending=%b wid=%0d PC=0x%0h addr=0x%0h wb=%0d rd=%0d sop=%b eop=%b (#%0d)\n", + $time, CORE_ID, execute_if.valid, execute_if.ready, + csr_req_valid, csr_req_ready, no_pending_instr, + execute_if.data.wid, execute_if.data.PC, csr_addr, + execute_if.data.wb, execute_if.data.rd, + execute_if.data.sop, execute_if.data.eop, + execute_if.data.uuid)); + end + end +`endif + // send response wire [NUM_LANES-1:0][31:0] csr_commit_data; diff --git a/hw/rtl/core/VX_decode.sv b/hw/rtl/core/VX_decode.sv index 65ca68f8..44d06da0 100644 --- a/hw/rtl/core/VX_decode.sv +++ b/hw/rtl/core/VX_decode.sv @@ -39,14 +39,15 @@ module VX_decode #( // outputs VX_decode_if.master decode_if, +`ifdef PERF_ENABLE + VX_pipeline_perf_if.decode perf_decode_if, +`endif VX_decode_sched_if.master decode_sched_if ); localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + (`NR_BITS * 4) + `XLEN + 1 + 1; `UNUSED_PARAM (CORE_ID) - `UNUSED_VAR (clk) - `UNUSED_VAR (reset) reg [`EX_BITS-1:0] ex_type; reg [`INST_OP_BITS-1:0] op_type; @@ -488,6 +489,17 @@ module VX_decode #( `USED_IREG (rs1); `USED_IREG (rs2); end + 3'h6: begin // WSPAWN_MASK + op_type = `INST_OP_BITS'(`INST_SFU_WSPAWN); + op_mod[0] = 1; + `USED_IREG (rs1); + `USED_IREG (rs2); + end + 3'h7: begin // BAR_MASK + op_type = `INST_OP_BITS'(`INST_SFU_BAR_MASK); + `USED_IREG (rs1); + `USED_IREG (rs2); + end default:; endcase end @@ -620,8 +632,93 @@ module VX_decode #( endcase end + wire fetch_is_tensor_warp = fetch_if.data.wid >= `NW_WIDTH'(`NUM_SCALAR_WARPS); + wire fetch_is_scalar_warp = fetch_if.data.wid < `NW_WIDTH'(`NUM_SCALAR_WARPS); + wire fetch_fire = fetch_if.valid && fetch_if.ready; + wire decoded_tensor_inst = (ex_type == `EX_BITS'(`EX_TENSOR)); + wire t_reg_hi_rd = use_rd && (rd_r[`NRI_BITS-1:3] != '0); + wire t_reg_hi_rs1 = use_rs1 && (rs1_r[`NRI_BITS-1:3] != '0); + wire t_reg_hi_rs2 = use_rs2 && (rs2_r[`NRI_BITS-1:3] != '0); + wire t_reg_hi_rs3 = use_rs3 && (rs3_r[`NRI_BITS-1:3] != '0); + wire tensor_reg_illegal = fetch_is_tensor_warp && (t_reg_hi_rd || t_reg_hi_rs1 || t_reg_hi_rs2 || t_reg_hi_rs3); + wire scalar_tensor_illegal = fetch_is_scalar_warp && decoded_tensor_inst; + wire tensor_fpu_illegal = fetch_is_tensor_warp && (ex_type == `EX_BITS'(`EX_FPU)); + wire tensor_read_csr_allowed = (op_type == `INST_OP_BITS'(`INST_SFU_CSRRS)) + && (rs1_r == `NR_BITS'(0)) + && ((u_12 == `VX_CSR_THREAD_ID) + || (u_12 == `VX_CSR_WARP_ID) + || (u_12 == `VX_CSR_CORE_ID) + || (u_12 == `VX_CSR_MHARTID) + || (u_12 == `VX_CSR_NUM_THREADS) + || (u_12 == `VX_CSR_NUM_WARPS) + || (u_12 == `VX_CSR_NUM_CORES)); + wire tensor_sfu_barrier_allowed = (op_type == `INST_OP_BITS'(`INST_SFU_BAR)) + || (op_type == `INST_OP_BITS'(`INST_SFU_BAR_MASK)); + wire tensor_sfu_allowed = (op_type == `INST_OP_BITS'(`INST_SFU_TMC)) + || tensor_read_csr_allowed + || tensor_sfu_barrier_allowed; + wire tensor_sfu_illegal = fetch_is_tensor_warp + && (ex_type == `EX_BITS'(`EX_SFU)) + && !tensor_sfu_allowed; + wire tensor_complex_alu_illegal = fetch_is_tensor_warp + && (ex_type == `EX_BITS'(`EX_ALU)) + && (`INST_ALU_IS_M(op_mod) || `INST_ALU_IS_RED(op_mod)); + wire tensor_scalar_illegal = tensor_fpu_illegal || tensor_sfu_illegal || tensor_complex_alu_illegal; + wire decode_illegal = tensor_reg_illegal || scalar_tensor_illegal || tensor_scalar_illegal; + + wire [`EX_BITS-1:0] emit_ex_type = decode_illegal ? `EX_BITS'(`EX_ALU) : ex_type; + wire [`INST_OP_BITS-1:0] emit_op_type = decode_illegal ? `INST_OP_BITS'(`INST_BR_EBREAK) : op_type; + wire [`INST_MOD_BITS-1:0] emit_op_mod = decode_illegal ? `INST_MOD_BITS'(1) : op_mod; + wire emit_use_PC = decode_illegal ? 1'b1 : use_PC; + wire emit_use_imm = decode_illegal ? 1'b1 : use_imm; + wire [`XLEN-1:0] emit_imm = decode_illegal ? `XLEN'(0) : imm; + wire [`NR_BITS-1:0] emit_rd = decode_illegal ? `NR_BITS'(0) : rd_r; + wire [`NR_BITS-1:0] emit_rs1 = decode_illegal ? `NR_BITS'(0) : rs1_r; + wire [`NR_BITS-1:0] emit_rs2 = decode_illegal ? `NR_BITS'(0) : rs2_r; + wire [`NR_BITS-1:0] emit_rs3 = decode_illegal ? `NR_BITS'(0) : rs3_r; + + `RUNTIME_ASSERT( + !fetch_if.valid || !tensor_reg_illegal, + ("%t: *** core%0d-decode-illegal-tensor-reg: wid=%0d PC=0x%0h instr=0x%0h ex=%0d op=%0d rd=%0d rs1=%0d rs2=%0d rs3=%0d", + $time, CORE_ID, fetch_if.data.wid, fetch_if.data.PC, fetch_if.data.instr, ex_type, op_type, rd, rs1, rs2, rs3) + ) + + `RUNTIME_ASSERT( + !fetch_if.valid || !scalar_tensor_illegal, + ("%t: *** core%0d-decode-illegal-scalar-tensor-op: wid=%0d PC=0x%0h instr=0x%0h ex=%0d op=%0d", + $time, CORE_ID, fetch_if.data.wid, fetch_if.data.PC, fetch_if.data.instr, ex_type, op_type) + ) + + `RUNTIME_ASSERT( + !fetch_if.valid || !tensor_scalar_illegal, + ("%t: *** core%0d-decode-illegal-tensor-scalar-op: wid=%0d PC=0x%0h instr=0x%0h ex=%0d op=%0d mod=%0d", + $time, CORE_ID, fetch_if.data.wid, fetch_if.data.PC, fetch_if.data.instr, ex_type, op_type, op_mod) + ) + +`ifdef PERF_ENABLE + reg [`PERF_CTR_BITS-1:0] perf_illegal_tensor_reg_access; + reg [`PERF_CTR_BITS-1:0] perf_illegal_tensor_scalar_op; + reg [`PERF_CTR_BITS-1:0] perf_illegal_scalar_tensor_op; + + always @(posedge clk) begin + if (reset) begin + perf_illegal_tensor_reg_access <= '0; + perf_illegal_tensor_scalar_op <= '0; + perf_illegal_scalar_tensor_op <= '0; + end else if (fetch_fire) begin + perf_illegal_tensor_reg_access <= perf_illegal_tensor_reg_access + `PERF_CTR_BITS'(tensor_reg_illegal); + perf_illegal_tensor_scalar_op <= perf_illegal_tensor_scalar_op + `PERF_CTR_BITS'(tensor_scalar_illegal); + perf_illegal_scalar_tensor_op <= perf_illegal_scalar_tensor_op + `PERF_CTR_BITS'(scalar_tensor_illegal); + end + end + + assign perf_decode_if.illegal_tensor_reg_access = perf_illegal_tensor_reg_access; + assign perf_decode_if.illegal_tensor_scalar_op = perf_illegal_tensor_scalar_op; + assign perf_decode_if.illegal_scalar_tensor_op = perf_illegal_scalar_tensor_op; +`endif + // disable write to integer register r0 - wire wb = use_rd && (rd_r != 0); + wire wb = !decode_illegal && use_rd && (rd_r != 0); VX_elastic_buffer #( .DATAW (DATAW), @@ -631,7 +728,7 @@ module VX_decode #( .reset (reset), .valid_in (fetch_if.valid), .ready_in (fetch_if.ready), - .data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_mod, use_PC, imm, use_imm, wb, rd_r, rs1_r, rs2_r, rs3_r}), + .data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, emit_ex_type, emit_op_type, emit_op_mod, emit_use_PC, emit_imm, emit_use_imm, wb, emit_rd, emit_rs1, emit_rs2, emit_rs3}), .data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_mod, decode_if.data.use_PC, decode_if.data.imm, decode_if.data.use_imm, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3}), .valid_out (decode_if.valid), .ready_out (decode_if.ready) @@ -639,11 +736,9 @@ module VX_decode #( /////////////////////////////////////////////////////////////////////////// - wire fetch_fire = fetch_if.valid && fetch_if.ready; - assign decode_sched_if.valid = fetch_fire; assign decode_sched_if.wid = fetch_if.data.wid; - assign decode_sched_if.is_wstall = is_wstall; + assign decode_sched_if.is_wstall = is_wstall || decode_illegal; `ifndef L1_ENABLE assign fetch_if.ibuf_pop = decode_if.ibuf_pop; `endif diff --git a/hw/rtl/core/VX_dispatch.sv b/hw/rtl/core/VX_dispatch.sv index 74872d91..6cceafe3 100644 --- a/hw/rtl/core/VX_dispatch.sv +++ b/hw/rtl/core/VX_dispatch.sv @@ -15,7 +15,8 @@ `include "VX_trace.vh" module VX_dispatch import VX_gpu_pkg::*; #( - parameter CORE_ID = 0 + parameter CORE_ID = 0, + parameter DOMAIN = WU_DOMAIN_SCALAR ) ( input wire clk, input wire reset, @@ -36,11 +37,15 @@ module VX_dispatch import VX_gpu_pkg::*; #( VX_dispatch_if.master fpu_dispatch_if [`ISSUE_WIDTH], `endif `ifdef EXT_T_ENABLE + VX_dispatch_if.master tensor_alu_dispatch_if [`ISSUE_WIDTH], + VX_dispatch_if.master tensor_lsu_dispatch_if [`ISSUE_WIDTH], + VX_dispatch_if.master tensor_ctrl_dispatch_if [`ISSUE_WIDTH], VX_dispatch_if.master tensor_dispatch_if [`ISSUE_WIDTH], `endif VX_dispatch_if.master sfu_dispatch_if [`ISSUE_WIDTH] ); `UNUSED_PARAM (CORE_ID) + `UNUSED_PARAM (DOMAIN) localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + (3 * `NUM_THREADS * `XLEN) + `NT_WIDTH; @@ -68,8 +73,29 @@ module VX_dispatch import VX_gpu_pkg::*; #( VX_operands_if alu_operands_if[`ISSUE_WIDTH](); + wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] operands_wid; + wire [`ISSUE_WIDTH-1:0] operands_is_tensor; + wire [`ISSUE_WIDTH-1:0] tensor_alu_allowed; + wire [`ISSUE_WIDTH-1:0] tensor_ctrl_allowed; + wire [`ISSUE_WIDTH-1:0] tensor_wctl_allowed; + wire [`ISSUE_WIDTH-1:0] tensor_sfu_allowed; + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin - assign alu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_ALU); + assign operands_wid[i] = wis_to_wid(operands_if[i].data.wis, ISSUE_ISW_W'(i)); + assign operands_is_tensor[i] = operands_wid[i] >= `NW_WIDTH'(`NUM_SCALAR_WARPS); + assign tensor_alu_allowed[i] = !`INST_ALU_IS_M(operands_if[i].data.op_mod) + && !`INST_ALU_IS_RED(operands_if[i].data.op_mod); + assign tensor_ctrl_allowed[i] = (operands_if[i].data.op_type == `INST_SFU_TMC) + || (operands_if[i].data.op_type == `INST_SFU_CSRRS) + || (operands_if[i].data.op_type == `INST_SFU_BAR) + || (operands_if[i].data.op_type == `INST_SFU_BAR_MASK); + assign tensor_wctl_allowed[i] = (operands_if[i].data.op_type == `INST_SFU_BAR) + || (operands_if[i].data.op_type == `INST_SFU_BAR_MASK); + assign tensor_sfu_allowed[i] = tensor_ctrl_allowed[i] || tensor_wctl_allowed[i]; + end + + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin + assign alu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_ALU) && !operands_is_tensor[i]; assign alu_operands_if[i].data = operands_if[i].data; `RESET_RELAY (alu_reset, reset); @@ -90,12 +116,43 @@ module VX_dispatch import VX_gpu_pkg::*; #( ); end +`ifdef EXT_T_ENABLE + // Tensor INT/control dispatch + + VX_operands_if tensor_alu_operands_if[`ISSUE_WIDTH](); + + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin + assign tensor_alu_operands_if[i].valid = operands_if[i].valid + && (operands_if[i].data.ex_type == `EX_ALU) + && operands_is_tensor[i] + && tensor_alu_allowed[i]; + assign tensor_alu_operands_if[i].data = operands_if[i].data; + + `RESET_RELAY (tensor_alu_reset, reset); + + VX_elastic_buffer #( + .DATAW (DATAW), + .SIZE (2), + .OUT_REG (2) + ) tensor_alu_buffer ( + .clk (clk), + .reset (tensor_alu_reset), + .valid_in (tensor_alu_operands_if[i].valid), + .ready_in (tensor_alu_operands_if[i].ready), + .data_in (`TO_DISPATCH_DATA(tensor_alu_operands_if[i].data, last_active_tid[i])), + .data_out (tensor_alu_dispatch_if[i].data), + .valid_out (tensor_alu_dispatch_if[i].valid), + .ready_out (tensor_alu_dispatch_if[i].ready) + ); + end +`endif + // LSU dispatch VX_operands_if lsu_operands_if[`ISSUE_WIDTH](); for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin - assign lsu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_LSU); + assign lsu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_LSU) && !operands_is_tensor[i]; assign lsu_operands_if[i].data = operands_if[i].data; `RESET_RELAY (lsu_reset, reset); @@ -116,6 +173,34 @@ module VX_dispatch import VX_gpu_pkg::*; #( ); end +`ifdef EXT_T_ENABLE + // Tensor LSU dispatch + + VX_operands_if tensor_lsu_operands_if[`ISSUE_WIDTH](); + + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin + assign tensor_lsu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_LSU) && operands_is_tensor[i]; + assign tensor_lsu_operands_if[i].data = operands_if[i].data; + + `RESET_RELAY (tensor_lsu_reset, reset); + + VX_elastic_buffer #( + .DATAW (DATAW), + .SIZE (2), + .OUT_REG (2) + ) tensor_lsu_buffer ( + .clk (clk), + .reset (tensor_lsu_reset), + .valid_in (tensor_lsu_operands_if[i].valid), + .ready_in (tensor_lsu_operands_if[i].ready), + .data_in (`TO_DISPATCH_DATA(tensor_lsu_operands_if[i].data, last_active_tid[i])), + .data_out (tensor_lsu_dispatch_if[i].data), + .valid_out (tensor_lsu_dispatch_if[i].valid), + .ready_out (tensor_lsu_dispatch_if[i].ready) + ); + end +`endif + // FPU dispatch `ifdef EXT_F_ENABLE @@ -123,7 +208,7 @@ module VX_dispatch import VX_gpu_pkg::*; #( VX_operands_if fpu_operands_if[`ISSUE_WIDTH](); for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin - assign fpu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_FPU); + assign fpu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_FPU) && !operands_is_tensor[i]; assign fpu_operands_if[i].data = operands_if[i].data; `RESET_RELAY (fpu_reset, reset); @@ -152,7 +237,9 @@ module VX_dispatch import VX_gpu_pkg::*; #( VX_operands_if tensor_operands_if[`ISSUE_WIDTH](); for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin - assign tensor_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_TENSOR); + assign tensor_operands_if[i].valid = operands_if[i].valid + && (operands_if[i].data.ex_type == `EX_TENSOR) + && operands_is_tensor[i]; assign tensor_operands_if[i].data = operands_if[i].data; `RESET_RELAY (tensor_reset, reset); @@ -174,12 +261,45 @@ module VX_dispatch import VX_gpu_pkg::*; #( end `endif +`ifdef EXT_T_ENABLE + // Tensor control dispatch + + VX_operands_if tensor_ctrl_operands_if[`ISSUE_WIDTH](); + + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin + assign tensor_ctrl_operands_if[i].valid = operands_if[i].valid + && (operands_if[i].data.ex_type == `EX_SFU) + && operands_is_tensor[i] + && tensor_ctrl_allowed[i]; + assign tensor_ctrl_operands_if[i].data = operands_if[i].data; + + `RESET_RELAY (tensor_ctrl_reset, reset); + + VX_elastic_buffer #( + .DATAW (DATAW), + .SIZE (2), + .OUT_REG (2) + ) tensor_ctrl_buffer ( + .clk (clk), + .reset (tensor_ctrl_reset), + .valid_in (tensor_ctrl_operands_if[i].valid), + .ready_in (tensor_ctrl_operands_if[i].ready), + .data_in (`TO_DISPATCH_DATA(tensor_ctrl_operands_if[i].data, last_active_tid[i])), + .data_out (tensor_ctrl_dispatch_if[i].data), + .valid_out (tensor_ctrl_dispatch_if[i].valid), + .ready_out (tensor_ctrl_dispatch_if[i].ready) + ); + end +`endif + // SFU dispatch VX_operands_if sfu_operands_if[`ISSUE_WIDTH](); for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin - assign sfu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_SFU); + assign sfu_operands_if[i].valid = operands_if[i].valid + && (operands_if[i].data.ex_type == `EX_SFU) + && !operands_is_tensor[i]; assign sfu_operands_if[i].data = operands_if[i].data; `RESET_RELAY (sfu_reset, reset); @@ -202,17 +322,46 @@ module VX_dispatch import VX_gpu_pkg::*; #( // can take next request? for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin - assign operands_if[i].ready = (alu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_ALU)) - || (lsu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_LSU)) - `ifdef EXT_F_ENABLE - || (fpu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_FPU)) + assign operands_if[i].ready = (alu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_ALU) && !operands_is_tensor[i]) + || (lsu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_LSU) && !operands_is_tensor[i]) + `ifdef EXT_F_ENABLE + || (fpu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_FPU) && !operands_is_tensor[i]) + `endif + `ifdef EXT_T_ENABLE + || (tensor_alu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_ALU) && operands_is_tensor[i] && tensor_alu_allowed[i]) + || (tensor_lsu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_LSU) && operands_is_tensor[i]) + || (tensor_ctrl_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU) && operands_is_tensor[i] && tensor_ctrl_allowed[i]) + || (tensor_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_TENSOR) && operands_is_tensor[i]) `endif - `ifdef EXT_T_ENABLE - || (tensor_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_TENSOR)) - `endif - || (sfu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU)); + || (sfu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU) && !operands_is_tensor[i]); end +`ifdef SIMULATION + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin + `RUNTIME_ASSERT( + !(operands_if[i].valid && (operands_if[i].data.ex_type == `EX_TENSOR)) || operands_is_tensor[i], + ("%t: *** core%0d-dispatch-illegal-scalar-tensor-op: wid=%0d PC=0x%0h op=0x%0h (#%0d)", + $time, CORE_ID, operands_wid[i], operands_if[i].data.PC, operands_if[i].data.op_type, operands_if[i].data.uuid) + ) + `RUNTIME_ASSERT( + !(operands_if[i].valid && operands_is_tensor[i] && (operands_if[i].data.ex_type == `EX_FPU)), + ("%t: *** core%0d-dispatch-illegal-tensor-fpu-op: wid=%0d PC=0x%0h op=0x%0h (#%0d)", + $time, CORE_ID, operands_wid[i], operands_if[i].data.PC, operands_if[i].data.op_type, operands_if[i].data.uuid) + ) + `RUNTIME_ASSERT( + !(operands_if[i].valid && operands_is_tensor[i] && (operands_if[i].data.ex_type == `EX_SFU) && !tensor_sfu_allowed[i]), + ("%t: *** core%0d-dispatch-illegal-tensor-sfu-op: wid=%0d PC=0x%0h op=0x%0h (#%0d)", + $time, CORE_ID, operands_wid[i], operands_if[i].data.PC, operands_if[i].data.op_type, operands_if[i].data.uuid) + ) + `RUNTIME_ASSERT( + !(operands_if[i].valid && operands_is_tensor[i] && (operands_if[i].data.ex_type == `EX_ALU) + && (`INST_ALU_IS_M(operands_if[i].data.op_mod) || `INST_ALU_IS_RED(operands_if[i].data.op_mod))), + ("%t: *** core%0d-dispatch-illegal-tensor-complex-alu-op: wid=%0d PC=0x%0h op=0x%0h mod=0x%0h (#%0d)", + $time, CORE_ID, operands_wid[i], operands_if[i].data.PC, operands_if[i].data.op_type, operands_if[i].data.op_mod, operands_if[i].data.uuid) + ) + end +`endif + `ifdef PERF_ENABLE wire [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_stalls_per_cycle_r; wire [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_valids_per_cycle_r; @@ -309,6 +458,16 @@ module VX_dispatch import VX_gpu_pkg::*; #( for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin always @(posedge clk) begin if (!reset && ($time > `TRACE_STARTTIME)) begin + if ((CORE_ID == 0) + && alu_dispatch_if[i].valid + && ((alu_dispatch_if[i].data.PC == 32'h80000010) || (alu_dispatch_if[i].data.PC == 32'h80000014))) begin + `TRACE(1, ("%d: core%0d-alu-dispatch-buffer: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, op=0x%0h, mod=%0d, wb=%0d, rd=%0d (#%0d)\n", + $time, CORE_ID, i, alu_dispatch_if[i].valid, alu_dispatch_if[i].ready, + wis_to_wid(alu_dispatch_if[i].data.wis, i), alu_dispatch_if[i].data.PC, + alu_dispatch_if[i].data.op_type, alu_dispatch_if[i].data.op_mod, + alu_dispatch_if[i].data.wb, alu_dispatch_if[i].data.rd, + alu_dispatch_if[i].data.uuid)); + end if (operands_if[i].valid && operands_if[i].ready) begin `TRACE(1, ("%d: core%0d-issue: wid=%0d, PC=0x%0h, ex=", $time, CORE_ID, wis_to_wid(operands_if[i].data.wis, i), operands_if[i].data.PC)); trace_ex_type(1, operands_if[i].data.ex_type); diff --git a/hw/rtl/core/VX_dispatch_unit_sane.sv b/hw/rtl/core/VX_dispatch_unit_sane.sv index 3e31ced2..e2ec115a 100644 --- a/hw/rtl/core/VX_dispatch_unit_sane.sv +++ b/hw/rtl/core/VX_dispatch_unit_sane.sv @@ -17,6 +17,7 @@ module VX_dispatch_unit_sane import VX_gpu_pkg::*; #( parameter BLOCK_SIZE = 1, parameter NUM_LANES = 1, parameter OUT_REG = 0, + parameter ISW_BASE = 0, parameter MAX_FANOUT = `MAX_FANOUT ) ( input wire clk, @@ -234,7 +235,8 @@ module VX_dispatch_unit_sane import VX_gpu_pkg::*; #( `RESET_RELAY(buf_out_reset, reset); - wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw); + wire [ISSUE_ISW_W-1:0] routed_isw = isw + ISSUE_ISW_W'(ISW_BASE); + wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], routed_isw); VX_elastic_buffer #( .DATAW (OUT_DATAW), diff --git a/hw/rtl/core/VX_execute.sv b/hw/rtl/core/VX_execute.sv index 8515f098..b169933b 100644 --- a/hw/rtl/core/VX_execute.sv +++ b/hw/rtl/core/VX_execute.sv @@ -15,7 +15,8 @@ module VX_execute import VX_gpu_pkg::*; #( parameter CORE_ID = 0, - parameter TENSOR_FP16 = 0 + parameter TENSOR_FP16 = 0, + parameter NUM_TENSOR_CORES = `NUM_TENSOR_WARPS ) ( `SCOPE_IO_DECL @@ -37,6 +38,11 @@ module VX_execute import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE VX_mem_perf_if.slave mem_perf_if, VX_pipeline_perf_if.slave pipeline_perf_if, + output wire [`PERF_CTR_BITS-1:0] perf_scalar_lsu_reqs, + output wire [`PERF_CTR_BITS-1:0] perf_tensor_lsu_reqs, + output wire [`PERF_CTR_BITS-1:0] perf_scalar_lsu_stalls, + output wire [`PERF_CTR_BITS-1:0] perf_tensor_lsu_stalls, + output wire [`PERF_CTR_BITS-1:0] perf_mem_merge_stalls, `endif `ifdef EXT_F_ENABLE @@ -46,7 +52,7 @@ module VX_execute import VX_gpu_pkg::*; #( VX_dispatch_if.slave alu_dispatch_if [`ISSUE_WIDTH], VX_commit_if.master alu_commit_if [`ISSUE_WIDTH], - VX_branch_ctl_if.master branch_ctl_if [`NUM_ALU_BLOCKS], + VX_branch_ctl_if.master branch_ctl_if [2 * `NUM_ALU_BLOCKS], VX_dispatch_if.slave lsu_dispatch_if [`ISSUE_WIDTH], VX_commit_if.master lsu_commit_if [`ISSUE_WIDTH], @@ -56,19 +62,33 @@ module VX_execute import VX_gpu_pkg::*; #( VX_warp_ctl_if.master warp_ctl_if, `ifdef EXT_T_ENABLE + VX_dispatch_if.slave tensor_alu_dispatch_if [`ISSUE_WIDTH], + VX_dispatch_if.slave tensor_lsu_dispatch_if [`ISSUE_WIDTH], + VX_dispatch_if.slave tensor_ctrl_dispatch_if [`ISSUE_WIDTH], VX_dispatch_if.slave tensor_dispatch_if [`ISSUE_WIDTH], VX_commit_if.master tensor_commit_if [`ISSUE_WIDTH], + output wire tensor_csr_unlock_valid, + output wire [`NW_WIDTH-1:0] tensor_csr_unlock_wid, + output wire tensor_tmc_valid, + output wire [`NW_WIDTH-1:0] tensor_tmc_wid, + output wire [`NUM_THREADS-1:0] tensor_tmc_tmask, `ifdef EXT_T_ASYNC - VX_tc_rf_if.master tensor_regfile_if, - VX_tc_bus_if.master tensor_smem_A_if, - output logic tensor_tmem_C_wen, - output logic tensor_tmem_C_ren, - output logic [8:0] tensor_tmem_C_waddr, - output logic [8:0] tensor_tmem_C_raddr, - output logic [`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_wdata, - output logic [`NUM_THREADS*`XLEN/8-1:0] tensor_tmem_C_mask, - input logic [`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_rdata, - VX_tc_bus_if.master tensor_smem_B_if, + VX_tc_rf_if.master tensor_regfile_if[NUM_TENSOR_CORES], + VX_tc_bus_if.master tensor_smem_A_if[NUM_TENSOR_CORES], + output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_A_ren, + input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_A_rready, + output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_A_raddr, + input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_A_rdata, + output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_ren, + input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_rready, + output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_C_raddr, + input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_rdata, + output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_wen, + input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_wready, + output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_C_waddr, + output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_wdata, + output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN/8-1:0] tensor_tmem_C_mask, + VX_tc_bus_if.master tensor_smem_B_if[NUM_TENSOR_CORES], `endif `endif @@ -83,23 +103,196 @@ module VX_execute import VX_gpu_pkg::*; #( `ifdef EXT_F_ENABLE VX_fpu_to_csr_if fpu_to_csr_if[`NUM_FPU_BLOCKS](); `endif +`ifdef EXT_T_ENABLE + VX_warp_ctl_if scalar_warp_ctl_if(); + VX_warp_ctl_if tensor_warp_ctl_if(); + + localparam WARP_CTL_DATAW = `NW_WIDTH + $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t); + + wire [WARP_CTL_DATAW-1:0] scalar_warp_ctl_data_in; + wire [WARP_CTL_DATAW-1:0] tensor_warp_ctl_data_in; + reg [WARP_CTL_DATAW-1:0] scalar_warp_ctl_data_r; + reg [WARP_CTL_DATAW-1:0] tensor_warp_ctl_data_r; + reg scalar_warp_ctl_valid_r; + reg tensor_warp_ctl_valid_r; + reg warp_ctl_rr; + + assign scalar_warp_ctl_data_in = {scalar_warp_ctl_if.wid, scalar_warp_ctl_if.tmc, scalar_warp_ctl_if.wspawn, scalar_warp_ctl_if.split, scalar_warp_ctl_if.sjoin, scalar_warp_ctl_if.barrier}; + assign tensor_warp_ctl_data_in = {tensor_warp_ctl_if.wid, tensor_warp_ctl_if.tmc, tensor_warp_ctl_if.wspawn, tensor_warp_ctl_if.split, tensor_warp_ctl_if.sjoin, tensor_warp_ctl_if.barrier}; + + wire scalar_warp_ctl_candidate_valid = scalar_warp_ctl_valid_r || scalar_warp_ctl_if.valid; + wire tensor_warp_ctl_candidate_valid = tensor_warp_ctl_valid_r || tensor_warp_ctl_if.valid; + wire select_tensor_warp_ctl = tensor_warp_ctl_candidate_valid && (!scalar_warp_ctl_candidate_valid || warp_ctl_rr); + wire [WARP_CTL_DATAW-1:0] scalar_warp_ctl_data_out = scalar_warp_ctl_valid_r ? scalar_warp_ctl_data_r : scalar_warp_ctl_data_in; + wire [WARP_CTL_DATAW-1:0] tensor_warp_ctl_data_out = tensor_warp_ctl_valid_r ? tensor_warp_ctl_data_r : tensor_warp_ctl_data_in; + wire [WARP_CTL_DATAW-1:0] selected_warp_ctl_data = select_tensor_warp_ctl ? tensor_warp_ctl_data_out : scalar_warp_ctl_data_out; + + wire consume_scalar_warp_ctl_pending = !select_tensor_warp_ctl && scalar_warp_ctl_valid_r; + wire consume_scalar_warp_ctl_input = !select_tensor_warp_ctl && !scalar_warp_ctl_valid_r && scalar_warp_ctl_if.valid; + wire consume_tensor_warp_ctl_pending = select_tensor_warp_ctl && tensor_warp_ctl_valid_r; + wire consume_tensor_warp_ctl_input = select_tensor_warp_ctl && !tensor_warp_ctl_valid_r && tensor_warp_ctl_if.valid; + + assign warp_ctl_if.valid = scalar_warp_ctl_candidate_valid || tensor_warp_ctl_candidate_valid; + assign {warp_ctl_if.wid, warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.sjoin, warp_ctl_if.barrier} = selected_warp_ctl_data; + + always @(posedge clk) begin + if (reset) begin + scalar_warp_ctl_valid_r <= 1'b0; + tensor_warp_ctl_valid_r <= 1'b0; + warp_ctl_rr <= 1'b0; + end else begin + if (scalar_warp_ctl_candidate_valid && tensor_warp_ctl_candidate_valid) begin + warp_ctl_rr <= !select_tensor_warp_ctl; + end + + if (scalar_warp_ctl_valid_r) begin + if (consume_scalar_warp_ctl_pending) begin + scalar_warp_ctl_valid_r <= scalar_warp_ctl_if.valid; + scalar_warp_ctl_data_r <= scalar_warp_ctl_data_in; + end + end else if (scalar_warp_ctl_if.valid && !consume_scalar_warp_ctl_input) begin + scalar_warp_ctl_valid_r <= 1'b1; + scalar_warp_ctl_data_r <= scalar_warp_ctl_data_in; + end + + if (tensor_warp_ctl_valid_r) begin + if (consume_tensor_warp_ctl_pending) begin + tensor_warp_ctl_valid_r <= tensor_warp_ctl_if.valid; + tensor_warp_ctl_data_r <= tensor_warp_ctl_data_in; + end + end else if (tensor_warp_ctl_if.valid && !consume_tensor_warp_ctl_input) begin + tensor_warp_ctl_valid_r <= 1'b1; + tensor_warp_ctl_data_r <= tensor_warp_ctl_data_in; + end + end + end + + `RUNTIME_ASSERT( + !(scalar_warp_ctl_valid_r && scalar_warp_ctl_if.valid && !consume_scalar_warp_ctl_pending), + ("%t: *** core%0d-scalar-warp-ctl-merge-overflow", $time, CORE_ID) + ) + `RUNTIME_ASSERT( + !(tensor_warp_ctl_valid_r && tensor_warp_ctl_if.valid && !consume_tensor_warp_ctl_pending), + ("%t: *** core%0d-tensor-warp-ctl-merge-overflow", $time, CORE_ID) + ) +`endif `RESET_RELAY (alu_reset, reset); `RESET_RELAY (lsu_reset, reset); `RESET_RELAY (sfu_reset, reset); + VX_commit_if alu_scalar_commit_if[`ISSUE_WIDTH](); + VX_alu_unit #( .CORE_ID (CORE_ID) ) alu_unit ( .clk (clk), .reset (alu_reset), .dispatch_if (alu_dispatch_if), - .branch_ctl_if (branch_ctl_if), - .commit_if (alu_commit_if) + .branch_ctl_if (branch_ctl_if[0 +: `NUM_ALU_BLOCKS]), + .commit_if (alu_scalar_commit_if) ); +`ifdef EXT_T_ENABLE + VX_commit_if alu_tensor_commit_if[`ISSUE_WIDTH](); + + `RESET_RELAY (tensor_alu_reset, reset); + + VX_alu_unit #( + .CORE_ID (CORE_ID) + ) tensor_alu_unit ( + .clk (clk), + .reset (tensor_alu_reset), + .dispatch_if (tensor_alu_dispatch_if), + .branch_ctl_if (branch_ctl_if[`NUM_ALU_BLOCKS +: `NUM_ALU_BLOCKS]), + .commit_if (alu_tensor_commit_if) + ); + + localparam ALU_COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1; + + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_alu_domain_commit + VX_stream_arb #( + .NUM_INPUTS (2), + .DATAW (ALU_COMMIT_DATAW), + .ARBITER ("R"), + .OUT_REG (1) + ) alu_commit_arb ( + .clk (clk), + .reset (reset), + .valid_in ({alu_tensor_commit_if[i].valid, alu_scalar_commit_if[i].valid}), + .ready_in ({alu_tensor_commit_if[i].ready, alu_scalar_commit_if[i].ready}), + .data_in ({alu_tensor_commit_if[i].data, alu_scalar_commit_if[i].data}), + .data_out (alu_commit_if[i].data), + .valid_out (alu_commit_if[i].valid), + .ready_out (alu_commit_if[i].ready), + `UNUSED_PIN (sel_out) + ); + `ifdef DBG_TRACE_CORE_PIPELINE_VCS + always @(posedge clk) begin + if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)) begin + if (alu_scalar_commit_if[i].valid + && ((alu_scalar_commit_if[i].data.PC == 32'h80000010) || (alu_scalar_commit_if[i].data.PC == 32'h80000014))) begin + `TRACE(1, ("%d: core%0d-execute-alu-scalar-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n", + $time, CORE_ID, i, alu_scalar_commit_if[i].valid, alu_scalar_commit_if[i].ready, + alu_scalar_commit_if[i].data.wid, alu_scalar_commit_if[i].data.PC, + alu_scalar_commit_if[i].data.wb, alu_scalar_commit_if[i].data.rd, + alu_scalar_commit_if[i].data.sop, alu_scalar_commit_if[i].data.eop, + alu_scalar_commit_if[i].data.uuid)); + end + if (alu_tensor_commit_if[i].valid + && ((alu_tensor_commit_if[i].data.PC == 32'h80000010) || (alu_tensor_commit_if[i].data.PC == 32'h80000014))) begin + `TRACE(1, ("%d: core%0d-execute-alu-tensor-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n", + $time, CORE_ID, i, alu_tensor_commit_if[i].valid, alu_tensor_commit_if[i].ready, + alu_tensor_commit_if[i].data.wid, alu_tensor_commit_if[i].data.PC, + alu_tensor_commit_if[i].data.wb, alu_tensor_commit_if[i].data.rd, + alu_tensor_commit_if[i].data.sop, alu_tensor_commit_if[i].data.eop, + alu_tensor_commit_if[i].data.uuid)); + end + if (alu_commit_if[i].valid + && ((alu_commit_if[i].data.PC == 32'h80000010) || (alu_commit_if[i].data.PC == 32'h80000014))) begin + `TRACE(1, ("%d: core%0d-execute-alu-domain-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n", + $time, CORE_ID, i, alu_commit_if[i].valid, alu_commit_if[i].ready, + alu_commit_if[i].data.wid, alu_commit_if[i].data.PC, + alu_commit_if[i].data.wb, alu_commit_if[i].data.rd, + alu_commit_if[i].data.sop, alu_commit_if[i].data.eop, + alu_commit_if[i].data.uuid)); + end + end + end + `endif + end +`else + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_alu_commit_passthru + assign alu_commit_if[i].valid = alu_scalar_commit_if[i].valid; + assign alu_commit_if[i].data = alu_scalar_commit_if[i].data; + assign alu_scalar_commit_if[i].ready = alu_commit_if[i].ready; + `ifdef DBG_TRACE_CORE_PIPELINE_VCS + always @(posedge clk) begin + if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)) begin + if (alu_commit_if[i].valid + && ((alu_commit_if[i].data.PC == 32'h80000010) || (alu_commit_if[i].data.PC == 32'h80000014))) begin + `TRACE(1, ("%d: core%0d-execute-alu-domain-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n", + $time, CORE_ID, i, alu_commit_if[i].valid, alu_commit_if[i].ready, + alu_commit_if[i].data.wid, alu_commit_if[i].data.PC, + alu_commit_if[i].data.wb, alu_commit_if[i].data.rd, + alu_commit_if[i].data.sop, alu_commit_if[i].data.eop, + alu_commit_if[i].data.uuid)); + end + end + end + `endif + end +`endif + `SCOPE_IO_SWITCH (1) + VX_commit_if lsu_scalar_commit_if[`ISSUE_WIDTH](); + + VX_mem_bus_if #( + .DATA_SIZE (DCACHE_WORD_SIZE), + .TAG_WIDTH (DCACHE_TAG_WIDTH) + ) scalar_lsu_bus_if[DCACHE_NUM_REQS](); + VX_lsu_unit #( .CORE_ID (CORE_ID) ) lsu_unit ( @@ -107,11 +300,184 @@ module VX_execute import VX_gpu_pkg::*; #( .clk (clk), .reset (lsu_reset), .downstream_mem_busy (downstream_mem_busy), - .cache_bus_if (dcache_bus_if), + .cache_bus_if (scalar_lsu_bus_if), .dispatch_if (lsu_dispatch_if), - .commit_if (lsu_commit_if) + .commit_if (lsu_scalar_commit_if) ); +`ifdef EXT_T_ENABLE + VX_commit_if lsu_tensor_commit_if[`ISSUE_WIDTH](); + + VX_mem_bus_if #( + .DATA_SIZE (DCACHE_WORD_SIZE), + .TAG_WIDTH (DCACHE_TAG_WIDTH) + ) tensor_lsu_bus_if[DCACHE_NUM_REQS](); + + `RESET_RELAY (tensor_lsu_reset, reset); + + VX_lsu_unit #( + .CORE_ID (CORE_ID) + ) tensor_lsu_unit ( + `SCOPE_IO_BIND (0) + .clk (clk), + .reset (tensor_lsu_reset), + .downstream_mem_busy (downstream_mem_busy), + .cache_bus_if (tensor_lsu_bus_if), + .dispatch_if (tensor_lsu_dispatch_if), + .commit_if (lsu_tensor_commit_if) + ); + + localparam LSU_COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1; + + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_lsu_domain_commit + VX_stream_arb #( + .NUM_INPUTS (2), + .DATAW (LSU_COMMIT_DATAW), + .ARBITER ("R"), + .OUT_REG (1) + ) lsu_commit_arb ( + .clk (clk), + .reset (reset), + .valid_in ({lsu_tensor_commit_if[i].valid, lsu_scalar_commit_if[i].valid}), + .ready_in ({lsu_tensor_commit_if[i].ready, lsu_scalar_commit_if[i].ready}), + .data_in ({lsu_tensor_commit_if[i].data, lsu_scalar_commit_if[i].data}), + .data_out (lsu_commit_if[i].data), + .valid_out (lsu_commit_if[i].valid), + .ready_out (lsu_commit_if[i].ready), + `UNUSED_PIN (sel_out) + ); + end + + wire scalar_lsu_req_any; + wire tensor_lsu_req_any; + wire [DCACHE_NUM_REQS-1:0] scalar_lsu_req_valids; + wire [DCACHE_NUM_REQS-1:0] tensor_lsu_req_valids; + wire [DCACHE_NUM_REQS-1:0] lsu_req_fires; + wire [DCACHE_NUM_REQS-1:0] lsu_rd_req_fires; + wire [DCACHE_NUM_REQS-1:0] lsu_rsp_fires; + reg lsu_domain_rr; + reg lsu_active_domain; + reg [15:0] lsu_pending_reads; + logic lsu_select_tensor; + logic [`CLOG2(DCACHE_NUM_REQS+1)-1:0] lsu_rd_req_fire_count; + logic [`CLOG2(DCACHE_NUM_REQS+1)-1:0] lsu_rsp_fire_count; + + for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin : g_lsu_domain_mem + assign scalar_lsu_req_valids[i] = scalar_lsu_bus_if[i].req_valid; + assign tensor_lsu_req_valids[i] = tensor_lsu_bus_if[i].req_valid; + + assign dcache_bus_if[i].req_valid = lsu_select_tensor ? tensor_lsu_bus_if[i].req_valid : scalar_lsu_bus_if[i].req_valid; + assign dcache_bus_if[i].req_data = lsu_select_tensor ? tensor_lsu_bus_if[i].req_data : scalar_lsu_bus_if[i].req_data; + assign scalar_lsu_bus_if[i].req_ready = !lsu_select_tensor && dcache_bus_if[i].req_ready; + assign tensor_lsu_bus_if[i].req_ready = lsu_select_tensor && dcache_bus_if[i].req_ready; + + assign scalar_lsu_bus_if[i].rsp_valid = !lsu_active_domain && dcache_bus_if[i].rsp_valid; + assign scalar_lsu_bus_if[i].rsp_data = dcache_bus_if[i].rsp_data; + assign tensor_lsu_bus_if[i].rsp_valid = lsu_active_domain && dcache_bus_if[i].rsp_valid; + assign tensor_lsu_bus_if[i].rsp_data = dcache_bus_if[i].rsp_data; + assign dcache_bus_if[i].rsp_ready = lsu_active_domain ? tensor_lsu_bus_if[i].rsp_ready : scalar_lsu_bus_if[i].rsp_ready; + + assign lsu_req_fires[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready; + assign lsu_rd_req_fires[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && !dcache_bus_if[i].req_data.rw; + assign lsu_rsp_fires[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready; + end + + assign scalar_lsu_req_any = |scalar_lsu_req_valids; + assign tensor_lsu_req_any = |tensor_lsu_req_valids; + + always @(*) begin + if (lsu_pending_reads != 0) begin + lsu_select_tensor = lsu_active_domain; + end else if (scalar_lsu_req_any && tensor_lsu_req_any) begin + lsu_select_tensor = lsu_domain_rr; + end else begin + lsu_select_tensor = tensor_lsu_req_any; + end + + lsu_rd_req_fire_count = '0; + lsu_rsp_fire_count = '0; + for (integer i = 0; i < DCACHE_NUM_REQS; ++i) begin + lsu_rd_req_fire_count = lsu_rd_req_fire_count + `CLOG2(DCACHE_NUM_REQS+1)'(lsu_rd_req_fires[i]); + lsu_rsp_fire_count = lsu_rsp_fire_count + `CLOG2(DCACHE_NUM_REQS+1)'(lsu_rsp_fires[i]); + end + end + + always @(posedge clk) begin + if (reset) begin + lsu_domain_rr <= 1'b0; + lsu_active_domain <= 1'b0; + lsu_pending_reads <= '0; + end else begin + if (lsu_pending_reads == 0 && (|lsu_req_fires)) begin + lsu_domain_rr <= ~lsu_select_tensor; + if (lsu_rd_req_fire_count != 0) begin + lsu_active_domain <= lsu_select_tensor; + end + end + lsu_pending_reads <= lsu_pending_reads + 16'(lsu_rd_req_fire_count) - 16'(lsu_rsp_fire_count); + end + end + + `RUNTIME_ASSERT( + !(lsu_pending_reads == 0 && (|lsu_rsp_fires)), + ("%t: *** core%0d-lsu-domain-arb-unmatched-response", $time, CORE_ID) + ) + +`ifdef PERF_ENABLE + reg [`PERF_CTR_BITS-1:0] perf_scalar_lsu_reqs_r; + reg [`PERF_CTR_BITS-1:0] perf_tensor_lsu_reqs_r; + reg [`PERF_CTR_BITS-1:0] perf_scalar_lsu_stalls_r; + reg [`PERF_CTR_BITS-1:0] perf_tensor_lsu_stalls_r; + reg [`PERF_CTR_BITS-1:0] perf_mem_merge_stalls_r; + + wire scalar_lsu_req_fire_any = (|lsu_req_fires) && !lsu_select_tensor; + wire tensor_lsu_req_fire_any = (|lsu_req_fires) && lsu_select_tensor; + wire scalar_lsu_merge_stall = scalar_lsu_req_any && lsu_select_tensor; + wire tensor_lsu_merge_stall = tensor_lsu_req_any && !lsu_select_tensor; + wire mem_merge_stall = scalar_lsu_req_any && tensor_lsu_req_any; + + always @(posedge clk) begin + if (reset) begin + perf_scalar_lsu_reqs_r <= '0; + perf_tensor_lsu_reqs_r <= '0; + perf_scalar_lsu_stalls_r <= '0; + perf_tensor_lsu_stalls_r <= '0; + perf_mem_merge_stalls_r <= '0; + end else begin + perf_scalar_lsu_reqs_r <= perf_scalar_lsu_reqs_r + `PERF_CTR_BITS'(scalar_lsu_req_fire_any); + perf_tensor_lsu_reqs_r <= perf_tensor_lsu_reqs_r + `PERF_CTR_BITS'(tensor_lsu_req_fire_any); + perf_scalar_lsu_stalls_r <= perf_scalar_lsu_stalls_r + `PERF_CTR_BITS'(scalar_lsu_merge_stall); + perf_tensor_lsu_stalls_r <= perf_tensor_lsu_stalls_r + `PERF_CTR_BITS'(tensor_lsu_merge_stall); + perf_mem_merge_stalls_r <= perf_mem_merge_stalls_r + `PERF_CTR_BITS'(mem_merge_stall); + end + end + + assign perf_scalar_lsu_reqs = perf_scalar_lsu_reqs_r; + assign perf_tensor_lsu_reqs = perf_tensor_lsu_reqs_r; + assign perf_scalar_lsu_stalls = perf_scalar_lsu_stalls_r; + assign perf_tensor_lsu_stalls = perf_tensor_lsu_stalls_r; + assign perf_mem_merge_stalls = perf_mem_merge_stalls_r; +`endif +`else + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_lsu_commit_passthru + assign lsu_commit_if[i].valid = lsu_scalar_commit_if[i].valid; + assign lsu_commit_if[i].data = lsu_scalar_commit_if[i].data; + assign lsu_scalar_commit_if[i].ready = lsu_commit_if[i].ready; + end + + for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin : g_lsu_mem_passthru + `ASSIGN_VX_MEM_BUS_IF(dcache_bus_if[i], scalar_lsu_bus_if[i]); + end + +`ifdef PERF_ENABLE + assign perf_scalar_lsu_reqs = '0; + assign perf_tensor_lsu_reqs = '0; + assign perf_scalar_lsu_stalls = '0; + assign perf_tensor_lsu_stalls = '0; + assign perf_mem_merge_stalls = '0; +`endif +`endif + `ifdef EXT_F_ENABLE `RESET_RELAY (fpu_reset, reset); @@ -147,7 +513,11 @@ module VX_execute import VX_gpu_pkg::*; #( .commit_csr_if (commit_csr_if), .sched_csr_if (sched_csr_if), + `ifdef EXT_T_ENABLE + .warp_ctl_if (scalar_warp_ctl_if), + `else .warp_ctl_if (warp_ctl_if), + `endif .commit_if (sfu_commit_if), .acc_read_in (acc_read_in), @@ -156,8 +526,27 @@ module VX_execute import VX_gpu_pkg::*; #( ); `ifdef EXT_T_ENABLE + VX_commit_if tensor_core_commit_if[`ISSUE_WIDTH](); + VX_commit_if tensor_ctrl_commit_if[`ISSUE_WIDTH](); + + VX_tensor_ctrl_unit #( + .CORE_ID (CORE_ID) + ) tensor_ctrl_unit ( + .clk (clk), + .reset (reset), + .dispatch_if (tensor_ctrl_dispatch_if), + .commit_if (tensor_ctrl_commit_if), + .warp_ctl_if (tensor_warp_ctl_if), + .csr_unlock_valid (tensor_csr_unlock_valid), + .csr_unlock_wid (tensor_csr_unlock_wid), + .tmc_valid (tensor_tmc_valid), + .tmc_wid (tensor_tmc_wid), + .tmc_tmask (tensor_tmc_tmask) + ); + VX_tensor_core #( - .FP16 (TENSOR_FP16) + .FP16 (TENSOR_FP16), + .NUM_TENSOR_CORES (NUM_TENSOR_CORES) ) tensor_core ( .clk(clk), .reset(reset), @@ -166,17 +555,44 @@ module VX_execute import VX_gpu_pkg::*; #( `ifdef EXT_T_ASYNC .regfile_if(tensor_regfile_if), .smem_A_if(tensor_smem_A_if), - .tmem_C_wen(tensor_tmem_C_wen), + .tmem_A_ren(tensor_tmem_A_ren), + .tmem_A_rready(tensor_tmem_A_rready), + .tmem_A_raddr(tensor_tmem_A_raddr), + .tmem_A_rdata(tensor_tmem_A_rdata), .tmem_C_ren(tensor_tmem_C_ren), - .tmem_C_waddr(tensor_tmem_C_waddr), + .tmem_C_rready(tensor_tmem_C_rready), .tmem_C_raddr(tensor_tmem_C_raddr), + .tmem_C_rdata(tensor_tmem_C_rdata), + .tmem_C_wen(tensor_tmem_C_wen), + .tmem_C_wready(tensor_tmem_C_wready), + .tmem_C_waddr(tensor_tmem_C_waddr), .tmem_C_wdata(tensor_tmem_C_wdata), .tmem_C_mask(tensor_tmem_C_mask), - .tmem_C_rdata(tensor_tmem_C_rdata), .smem_B_if(tensor_smem_B_if), `endif - .commit_if(tensor_commit_if) + .commit_if(tensor_core_commit_if) ); + + localparam TENSOR_COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1; + + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_tensor_commit_arb + VX_stream_arb #( + .NUM_INPUTS (2), + .DATAW (TENSOR_COMMIT_DATAW), + .ARBITER ("R"), + .OUT_REG (1) + ) tensor_commit_arb ( + .clk (clk), + .reset (reset), + .valid_in ({tensor_ctrl_commit_if[i].valid, tensor_core_commit_if[i].valid}), + .ready_in ({tensor_ctrl_commit_if[i].ready, tensor_core_commit_if[i].ready}), + .data_in ({tensor_ctrl_commit_if[i].data, tensor_core_commit_if[i].data}), + .data_out (tensor_commit_if[i].data), + .valid_out (tensor_commit_if[i].valid), + .ready_out (tensor_commit_if[i].ready), + `UNUSED_PIN (sel_out) + ); + end `endif // simulation helper signal to get RISC-V tests Pass/Fail status diff --git a/hw/rtl/core/VX_fetch.sv b/hw/rtl/core/VX_fetch.sv index db5a1d73..09b4f3e0 100644 --- a/hw/rtl/core/VX_fetch.sv +++ b/hw/rtl/core/VX_fetch.sv @@ -25,27 +25,40 @@ module VX_fetch import VX_gpu_pkg::*; #( VX_mem_bus_if.master icache_bus_if, // inputs - VX_schedule_if.slave schedule_if, + VX_schedule_if.slave scalar_schedule_if, + VX_schedule_if.slave tensor_schedule_if, // outputs - VX_fetch_if.master fetch_if + VX_fetch_if.master scalar_fetch_if, + VX_fetch_if.master tensor_fetch_if ); `UNUSED_PARAM (CORE_ID) - `UNUSED_VAR (reset) wire icache_req_valid; wire [ICACHE_ADDR_WIDTH-1:0] icache_req_addr; wire [ICACHE_TAG_WIDTH-1:0] icache_req_tag; wire icache_req_ready; + wire rsp_domain; wire [`UUID_WIDTH-1:0] rsp_uuid; - wire [`NW_WIDTH-1:0] req_tag, rsp_tag; + wire [`NW_WIDTH-1:0] req_tag, rsp_tag; + reg fetch_domain_rr; wire icache_req_fire = icache_req_valid && icache_req_ready; - assign req_tag = schedule_if.data.wid; + wire scalar_req_valid = scalar_schedule_if.valid; + wire tensor_req_valid = tensor_schedule_if.valid; + wire select_tensor_req = tensor_req_valid && (!scalar_req_valid || fetch_domain_rr); + wire selected_domain = select_tensor_req ? WU_DOMAIN_TENSOR : WU_DOMAIN_SCALAR; + wire selected_valid = scalar_req_valid || tensor_req_valid; + wire [`NW_WIDTH-1:0] selected_wid = select_tensor_req ? tensor_schedule_if.data.wid : scalar_schedule_if.data.wid; + wire [`XLEN-1:0] selected_pc = select_tensor_req ? tensor_schedule_if.data.PC : scalar_schedule_if.data.PC; + wire [`NUM_THREADS-1:0] selected_tmask = select_tensor_req ? tensor_schedule_if.data.tmask : scalar_schedule_if.data.tmask; + wire [`UUID_WIDTH-1:0] selected_uuid = select_tensor_req ? tensor_schedule_if.data.uuid : scalar_schedule_if.data.uuid; + + assign req_tag = selected_wid; - assign {rsp_uuid, rsp_tag} = icache_bus_if.rsp_data.tag; + assign {rsp_domain, rsp_uuid, rsp_tag} = icache_bus_if.rsp_data.tag; wire [`XLEN-1:0] rsp_PC; wire [`NUM_THREADS-1:0] rsp_tmask; @@ -60,7 +73,7 @@ module VX_fetch import VX_gpu_pkg::*; #( .write (icache_req_fire), `UNUSED_PIN (wren), .waddr (req_tag), - .wdata ({schedule_if.data.PC, schedule_if.data.tmask}), + .wdata ({selected_pc, selected_tmask}), .raddr (rsp_tag), .rdata ({rsp_PC, rsp_tmask}) ); @@ -69,7 +82,8 @@ module VX_fetch import VX_gpu_pkg::*; #( // Ensure that the ibuffer doesn't fill up. // This resolves potential deadlock if ibuffer fills and the LSU stalls the execute stage due to pending dcache request. // This issue is particularly prevalent when the icache and dcache is disabled and both requests share the same bus. - wire [ISSUE_ISW-1:0] schedule_isw = wid_to_isw(schedule_if.data.wid); + wire [ISSUE_ISW-1:0] schedule_isw = wid_to_isw(selected_wid); + wire [`ISSUE_WIDTH-1:0] domain_ibuf_pop = scalar_fetch_if.ibuf_pop | tensor_fetch_if.ibuf_pop; wire [`ISSUE_WIDTH-1:0] pending_ibuf_full; for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin @@ -79,7 +93,7 @@ module VX_fetch import VX_gpu_pkg::*; #( .clk (clk), .reset (reset), .incr (icache_req_fire && schedule_isw == i), - .decr (fetch_if.ibuf_pop[i]), + .decr (domain_ibuf_pop[i]), .full (pending_ibuf_full[i]), `UNUSED_PIN (size), `UNUSED_PIN (empty) @@ -90,15 +104,24 @@ module VX_fetch import VX_gpu_pkg::*; #( wire ibuf_ready = 1'b1; `endif - `RUNTIME_ASSERT((!schedule_if.valid || schedule_if.data.PC != 0), - ("%t: *** invalid PC=0x%0h, wid=%0d, tmask=%b (#%0d)", $time, schedule_if.data.PC, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.uuid)) + `RUNTIME_ASSERT((!selected_valid || selected_pc != 0), + ("%t: *** invalid PC=0x%0h, wid=%0d, tmask=%b (#%0d)", $time, selected_pc, selected_wid, selected_tmask, selected_uuid)) // Icache Request - assign icache_req_valid = schedule_if.valid && ibuf_ready; - assign icache_req_addr = schedule_if.data.PC[`MEM_ADDR_WIDTH-1:2]; - assign icache_req_tag = {schedule_if.data.uuid, req_tag}; - assign schedule_if.ready = icache_req_ready && ibuf_ready; + assign icache_req_valid = selected_valid && ibuf_ready; + assign icache_req_addr = selected_pc[`MEM_ADDR_WIDTH-1:2]; + assign icache_req_tag = {selected_domain, selected_uuid, req_tag}; + assign scalar_schedule_if.ready = icache_req_ready && ibuf_ready && selected_valid && !select_tensor_req; + assign tensor_schedule_if.ready = icache_req_ready && ibuf_ready && selected_valid && select_tensor_req; + + always @(posedge clk) begin + if (reset) begin + fetch_domain_rr <= 1'b0; + end else if (icache_req_fire && scalar_req_valid && tensor_req_valid) begin + fetch_domain_rr <= ~fetch_domain_rr; + end + end VX_elastic_buffer #( .DATAW (ICACHE_ADDR_WIDTH + ICACHE_TAG_WIDTH), @@ -121,18 +144,26 @@ module VX_fetch import VX_gpu_pkg::*; #( // Icache Response - assign fetch_if.valid = icache_bus_if.rsp_valid; - assign fetch_if.data.tmask = rsp_tmask; - assign fetch_if.data.wid = rsp_tag; - assign fetch_if.data.PC = rsp_PC; - assign fetch_if.data.instr = icache_bus_if.rsp_data.data; - assign fetch_if.data.uuid = rsp_uuid; - assign icache_bus_if.rsp_ready = fetch_if.ready; + assign scalar_fetch_if.valid = icache_bus_if.rsp_valid && (rsp_domain == WU_DOMAIN_SCALAR); + assign scalar_fetch_if.data.tmask = rsp_tmask; + assign scalar_fetch_if.data.wid = rsp_tag; + assign scalar_fetch_if.data.PC = rsp_PC; + assign scalar_fetch_if.data.instr = icache_bus_if.rsp_data.data; + assign scalar_fetch_if.data.uuid = rsp_uuid; + + assign tensor_fetch_if.valid = icache_bus_if.rsp_valid && (rsp_domain == WU_DOMAIN_TENSOR); + assign tensor_fetch_if.data.tmask = rsp_tmask; + assign tensor_fetch_if.data.wid = rsp_tag; + assign tensor_fetch_if.data.PC = rsp_PC; + assign tensor_fetch_if.data.instr = icache_bus_if.rsp_data.data; + assign tensor_fetch_if.data.uuid = rsp_uuid; + + assign icache_bus_if.rsp_ready = (rsp_domain == WU_DOMAIN_TENSOR) ? tensor_fetch_if.ready : scalar_fetch_if.ready; `ifdef DBG_SCOPE_FETCH if (CORE_ID == 0) begin `ifdef SCOPE - wire schedule_fire = schedule_if.valid && schedule_if.ready; + wire schedule_fire = icache_req_fire; wire icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready; VX_scope_tap #( .SCOPE_ID (1), @@ -150,7 +181,7 @@ module VX_fetch import VX_gpu_pkg::*; #( icache_rsp_fire }), .probes({ - schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC, + selected_uuid, selected_wid, selected_tmask, selected_pc, icache_bus_if.req_data.tag, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr, icache_bus_if.rsp_data.data, icache_bus_if.rsp_data.tag }), @@ -161,7 +192,7 @@ module VX_fetch import VX_gpu_pkg::*; #( `ifdef CHIPSCOPE ila_fetch ila_fetch_inst ( .clk (clk), - .probe0 ({reset, schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC, schedule_if.ready, schedule_if.valid}), + .probe0 ({reset, selected_uuid, selected_wid, selected_tmask, selected_pc, icache_req_ready, selected_valid}), .probe1 ({icache_bus_if.req_data.tag, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr, icache_bus_if.req_ready, icache_bus_if.req_valid}), .probe2 ({icache_bus_if.rsp_data.data, icache_bus_if.rsp_data.tag, icache_bus_if.rsp_ready, icache_bus_if.rsp_valid}) ); @@ -172,14 +203,18 @@ module VX_fetch import VX_gpu_pkg::*; #( `endif `ifdef DBG_TRACE_CORE_ICACHE - wire schedule_fire = schedule_if.valid && schedule_if.ready; - wire fetch_fire = fetch_if.valid && fetch_if.ready; + wire schedule_fire = icache_req_fire; + wire scalar_fetch_fire = scalar_fetch_if.valid && scalar_fetch_if.ready; + wire tensor_fetch_fire = tensor_fetch_if.valid && tensor_fetch_if.ready; always @(posedge clk) begin if (schedule_fire) begin - `TRACE(1, ("%d: I$%0d req: wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, CORE_ID, schedule_if.data.wid, schedule_if.data.PC, schedule_if.data.tmask, schedule_if.data.uuid)); + `TRACE(1, ("%d: I$%0d req: domain=%0d wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, CORE_ID, selected_domain, selected_wid, selected_pc, selected_tmask, selected_uuid)); end - if (fetch_fire) begin - `TRACE(1, ("%d: I$%0d rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, CORE_ID, fetch_if.data.wid, fetch_if.data.PC, fetch_if.data.tmask, fetch_if.data.instr, fetch_if.data.uuid)); + if (scalar_fetch_fire) begin + `TRACE(1, ("%d: I$%0d scalar rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, CORE_ID, scalar_fetch_if.data.wid, scalar_fetch_if.data.PC, scalar_fetch_if.data.tmask, scalar_fetch_if.data.instr, scalar_fetch_if.data.uuid)); + end + if (tensor_fetch_fire) begin + `TRACE(1, ("%d: I$%0d tensor rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, CORE_ID, tensor_fetch_if.data.wid, tensor_fetch_if.data.PC, tensor_fetch_if.data.tmask, tensor_fetch_if.data.instr, tensor_fetch_if.data.uuid)); end end `endif diff --git a/hw/rtl/core/VX_gather_unit.sv b/hw/rtl/core/VX_gather_unit.sv index fc8270d4..aa414ecf 100644 --- a/hw/rtl/core/VX_gather_unit.sv +++ b/hw/rtl/core/VX_gather_unit.sv @@ -127,4 +127,37 @@ module VX_gather_unit import VX_gpu_pkg::*; #( assign commit_tmp_if.ready = commit_out_if[i].ready; end +`ifdef DBG_TRACE_CORE_PIPELINE_VCS + for (genvar i = 0; i < BLOCK_SIZE; ++i) begin + always @(posedge clk) begin + if (!reset && ($time > `TRACE_STARTTIME)) begin + if (commit_in_if[i].valid + && ((commit_in_if[i].data.PC == 32'h80000010) || (commit_in_if[i].data.PC == 32'h80000014))) begin + `TRACE(1, ("%d: gather-commit-in: block=%0d, isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n", + $time, i, commit_in_isw[i], commit_in_if[i].valid, commit_in_if[i].ready, + commit_in_if[i].data.wid, commit_in_if[i].data.PC, + commit_in_if[i].data.wb, commit_in_if[i].data.rd, + commit_in_if[i].data.sop, commit_in_if[i].data.eop, + commit_in_if[i].data.uuid)); + end + end + end + end + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin + always @(posedge clk) begin + if (!reset && ($time > `TRACE_STARTTIME)) begin + if (commit_out_if[i].valid + && ((commit_out_if[i].data.PC == 32'h80000010) || (commit_out_if[i].data.PC == 32'h80000014))) begin + `TRACE(1, ("%d: gather-commit-out: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n", + $time, i, commit_out_if[i].valid, commit_out_if[i].ready, + commit_out_if[i].data.wid, commit_out_if[i].data.PC, + commit_out_if[i].data.wb, commit_out_if[i].data.rd, + commit_out_if[i].data.sop, commit_out_if[i].data.eop, + commit_out_if[i].data.uuid)); + end + end + end + end +`endif + endmodule diff --git a/hw/rtl/core/VX_ibuffer.sv b/hw/rtl/core/VX_ibuffer.sv index 12113e92..101fbdcf 100644 --- a/hw/rtl/core/VX_ibuffer.sv +++ b/hw/rtl/core/VX_ibuffer.sv @@ -15,7 +15,8 @@ `include "VX_trace.vh" module VX_ibuffer import VX_gpu_pkg::*; #( - parameter CORE_ID = 0 + parameter CORE_ID = 0, + parameter DOMAIN = WU_DOMAIN_SCALAR ) ( input wire clk, input wire reset, @@ -34,8 +35,9 @@ module VX_ibuffer import VX_gpu_pkg::*; #( wire [ISW_WIDTH-1:0] decode_isw = wid_to_isw(decode_if.data.wid); wire [ISSUE_WIS_W-1:0] decode_wis = wid_to_wis(decode_if.data.wid); + wire decode_lane_in_domain = (DOMAIN == WU_DOMAIN_TENSOR) ? `IS_TENSOR_WARP(decode_if.data.wid) : `IS_SCALAR_WARP(decode_if.data.wid); - assign decode_if.ready = ibuf_ready_in[decode_isw]; + assign decode_if.ready = decode_lane_in_domain && ibuf_ready_in[decode_isw]; `ifdef SIMULATION `ifdef DBG_TRACE_CORE_PIPELINE_VCS @@ -54,6 +56,7 @@ module VX_ibuffer import VX_gpu_pkg::*; #( VX_ibuffer_if uop_sequencer_if [`ISSUE_WIDTH](); for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin + localparam LANE_IN_DOMAIN = (DOMAIN == WU_DOMAIN_TENSOR) ? `IS_TENSOR_WARP(i) : `IS_SCALAR_WARP(i); VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`IBUF_SIZE), @@ -61,7 +64,7 @@ module VX_ibuffer import VX_gpu_pkg::*; #( ) instr_buf ( .clk (clk), .reset (reset), - .valid_in (decode_if.valid && decode_isw == i), + .valid_in (decode_if.valid && decode_isw == i && decode_lane_in_domain && LANE_IN_DOMAIN), .ready_in (ibuf_ready_in[i]), .data_in ({ decode_if.data.uuid, diff --git a/hw/rtl/core/VX_int_unit.sv b/hw/rtl/core/VX_int_unit.sv index b8cb78dd..695c5585 100644 --- a/hw/rtl/core/VX_int_unit.sv +++ b/hw/rtl/core/VX_int_unit.sv @@ -173,6 +173,38 @@ module VX_int_unit #( .data_out ({branch_ctl_if.valid, branch_ctl_if.wid, branch_ctl_if.taken, branch_ctl_if.dest}) ); +`ifdef DBG_TRACE_CORE_PIPELINE_VCS + always @(posedge clk) begin + if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)) begin + if (execute_if.valid + && ((execute_if.data.PC == 32'h80000010) || (execute_if.data.PC == 32'h80000014))) begin + `TRACE(1, ("%d: core%0d-int-rsp-in: block=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, op=0x%0h, mod=%0d, is_br=%b, br_op=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n", + $time, CORE_ID, BLOCK_IDX, execute_if.valid, execute_if.ready, + execute_if.data.wid, execute_if.data.PC, execute_if.data.op_type, + execute_if.data.op_mod, is_br_op, br_op, execute_if.data.wb, + execute_if.data.rd, execute_if.data.sop, execute_if.data.eop, + execute_if.data.uuid)); + end + if (commit_if.valid + && ((PC_r == `XLEN'h80000010) || (PC_r == `XLEN'h80000014))) begin + `TRACE(1, ("%d: core%0d-int-rsp-out: block=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, is_br=%b, br_op=0x%0h, br_enable=%b, br_taken=%b, br_dest=0x%0h, wb=%0d, rd=%0d, tensor=%b, sop=%b, eop=%b (#%0d)\n", + $time, CORE_ID, BLOCK_IDX, commit_if.valid, commit_if.ready, + commit_if.data.wid, PC_r, is_br_op_r, br_op_r, br_enable, + br_taken, br_dest, commit_if.data.wb, commit_if.data.rd, + commit_if.data.tensor, commit_if.data.sop, commit_if.data.eop, + commit_if.data.uuid)); + end + if (branch_ctl_if.valid + && (branch_ctl_if.wid == `NW_WIDTH'(0))) begin + `TRACE(1, ("%d: core%0d-int-branch-out: block=%0d, wid=%0d, taken=%b, dest=0x%0h, commit_PC=0x%0h, commit_valid=%b, commit_ready=%b (#%0d)\n", + $time, CORE_ID, BLOCK_IDX, branch_ctl_if.wid, + branch_ctl_if.taken, branch_ctl_if.dest, commit_if.data.PC, + commit_if.valid, commit_if.ready, commit_if.data.uuid)); + end + end + end +`endif + for (genvar i = 0; i < NUM_LANES; ++i) begin assign commit_if.data.data[i] = (is_br_op_r && is_br_static) ? (PC_r + 4) : alu_result_r[i]; end diff --git a/hw/rtl/core/VX_issue.sv b/hw/rtl/core/VX_issue.sv index 49a4f917..45f1b69f 100644 --- a/hw/rtl/core/VX_issue.sv +++ b/hw/rtl/core/VX_issue.sv @@ -15,7 +15,9 @@ `include "VX_trace.vh" module VX_issue import VX_gpu_pkg::*; #( - parameter CORE_ID = 0 + parameter CORE_ID = 0, + parameter NUM_TENSOR_CORES = `NUM_TENSOR_WARPS, + parameter DOMAIN = WU_DOMAIN_SCALAR ) ( `SCOPE_IO_DECL @@ -35,9 +37,12 @@ module VX_issue import VX_gpu_pkg::*; #( VX_dispatch_if.master fpu_dispatch_if [`ISSUE_WIDTH], `endif `ifdef EXT_T_ENABLE + VX_dispatch_if.master tensor_alu_dispatch_if [`ISSUE_WIDTH], + VX_dispatch_if.master tensor_lsu_dispatch_if [`ISSUE_WIDTH], + VX_dispatch_if.master tensor_ctrl_dispatch_if [`ISSUE_WIDTH], VX_dispatch_if.master tensor_dispatch_if [`ISSUE_WIDTH], `ifdef EXT_T_ASYNC - VX_tc_rf_if.slave tensor_regfile_if, + VX_tc_rf_if.slave tensor_regfile_if[NUM_TENSOR_CORES], `endif `endif VX_dispatch_if.master sfu_dispatch_if [`ISSUE_WIDTH] @@ -52,7 +57,8 @@ module VX_issue import VX_gpu_pkg::*; #( `RESET_RELAY (dispatch_reset, reset); VX_ibuffer #( - .CORE_ID (CORE_ID) + .CORE_ID (CORE_ID), + .DOMAIN (DOMAIN) ) ibuffer ( .clk (clk), .reset (ibuf_reset), @@ -61,7 +67,8 @@ module VX_issue import VX_gpu_pkg::*; #( ); VX_scoreboard #( - .CORE_ID (CORE_ID) + .CORE_ID (CORE_ID), + .DOMAIN (DOMAIN) ) scoreboard ( .clk (clk), .reset (scoreboard_reset), @@ -84,7 +91,9 @@ module VX_issue import VX_gpu_pkg::*; #( VX_operands #( `endif .CORE_ID (CORE_ID), - .CACHE_ENABLE (0) + .CACHE_ENABLE (0), + .NUM_TENSOR_CORES (NUM_TENSOR_CORES), + .DOMAIN (DOMAIN) ) operands ( .clk (clk), .reset (operands_reset), @@ -99,7 +108,8 @@ module VX_issue import VX_gpu_pkg::*; #( ); VX_dispatch #( - .CORE_ID (CORE_ID) + .CORE_ID (CORE_ID), + .DOMAIN (DOMAIN) ) dispatch ( .clk (clk), .reset (dispatch_reset), @@ -116,11 +126,22 @@ module VX_issue import VX_gpu_pkg::*; #( .fpu_dispatch_if(fpu_dispatch_if), `endif `ifdef EXT_T_ENABLE + .tensor_alu_dispatch_if(tensor_alu_dispatch_if), + .tensor_lsu_dispatch_if(tensor_lsu_dispatch_if), + .tensor_ctrl_dispatch_if(tensor_ctrl_dispatch_if), .tensor_dispatch_if(tensor_dispatch_if), `endif .sfu_dispatch_if(sfu_dispatch_if) ); + wire decode_fire = decode_if.valid && decode_if.ready; + `RUNTIME_ASSERT( + !decode_fire || + ((DOMAIN == WU_DOMAIN_TENSOR) ? `IS_TENSOR_WARP(decode_if.data.wid) : `IS_SCALAR_WARP(decode_if.data.wid)), + ("%t: *** core%0d-issue-domain-crossing domain=%0d wid=%0d PC=0x%0h", + $time, CORE_ID, DOMAIN, decode_if.data.wid, decode_if.data.PC) + ) + `ifdef SIMULATION `ifdef DBG_TRACE_CORE_PIPELINE_VCS for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv index 237b711e..64efeb27 100644 --- a/hw/rtl/core/VX_operands.sv +++ b/hw/rtl/core/VX_operands.sv @@ -17,7 +17,9 @@ module VX_operands import VX_gpu_pkg::*; #( parameter CORE_ID = 0, - parameter CACHE_ENABLE = 0 + parameter CACHE_ENABLE = 0, + parameter NUM_TENSOR_CORES = `NUM_TENSOR_WARPS, + parameter DOMAIN = WU_DOMAIN_SCALAR ) ( input wire clk, input wire reset, @@ -27,6 +29,8 @@ module VX_operands import VX_gpu_pkg::*; #( VX_operands_if.master operands_if [`ISSUE_WIDTH] ); `UNUSED_PARAM (CORE_ID) + `UNUSED_PARAM (NUM_TENSOR_CORES) + `UNUSED_PARAM (DOMAIN) localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS; localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO); diff --git a/hw/rtl/core/VX_operands_dup.sv b/hw/rtl/core/VX_operands_dup.sv index ac1c79b7..25609598 100644 --- a/hw/rtl/core/VX_operands_dup.sv +++ b/hw/rtl/core/VX_operands_dup.sv @@ -18,7 +18,9 @@ module VX_operands_dup import VX_gpu_pkg::*; #( parameter CORE_ID = 0, - parameter CACHE_ENABLE = 0 + parameter CACHE_ENABLE = 0, + parameter NUM_TENSOR_CORES = `NUM_TENSOR_WARPS, + parameter DOMAIN = WU_DOMAIN_SCALAR ) ( input wire clk, input wire reset, @@ -26,13 +28,43 @@ module VX_operands_dup import VX_gpu_pkg::*; #( VX_writeback_if.slave writeback_if [`ISSUE_WIDTH], VX_ibuffer_if.slave scoreboard_if [`ISSUE_WIDTH], `ifdef EXT_T_ASYNC - VX_tc_rf_if.slave tensor_regfile_if, + VX_tc_rf_if.slave tensor_regfile_if[NUM_TENSOR_CORES], `endif VX_operands_if.master operands_if [`ISSUE_WIDTH] ); `UNUSED_PARAM (CORE_ID) localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS; localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO); +`ifdef EXT_F_ENABLE + localparam TENSOR_NUM_REGS = `TENSOR_NUM_GPRS + `TENSOR_NUM_FPRS; +`else + localparam TENSOR_NUM_REGS = `TENSOR_NUM_GPRS; +`endif + + function automatic [RAM_ADDRW-1:0] wu_rf_addr ( + input logic [ISSUE_WIS_W-1:0] wis, + input logic [`NR_BITS-1:0] reg_id, + input logic is_tensor + ); + if (is_tensor) begin + `ifdef EXT_F_ENABLE + if (reg_id[`NR_BITS-1]) begin + wu_rf_addr = (RAM_ADDRW'(wis) * RAM_ADDRW'(TENSOR_NUM_REGS)) + + RAM_ADDRW'(`TENSOR_NUM_GPRS) + + RAM_ADDRW'(reg_id[`NRI_BITS-1:0]); + end else begin + wu_rf_addr = (RAM_ADDRW'(wis) * RAM_ADDRW'(TENSOR_NUM_REGS)) + + RAM_ADDRW'(reg_id[`NRI_BITS-1:0]); + end + `else + wu_rf_addr = (RAM_ADDRW'(wis) * RAM_ADDRW'(TENSOR_NUM_REGS)) + + RAM_ADDRW'(reg_id); + `endif + end else begin + wu_rf_addr = (RAM_ADDRW'(wis) * RAM_ADDRW'(`NUM_REGS)) + + RAM_ADDRW'(reg_id); + end + endfunction `ifdef PERF_ENABLE logic [`ISSUE_WIDTH-1:0][`PERF_CTR_BITS-1:0] perf_rf_read_per_warp; @@ -49,6 +81,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #( wire tc_rf_valid [`ISSUE_WIDTH]; wire [`LOG2UP(`NUM_REGS * ISSUE_RATIO)-1:0] tc_rf_addr [`ISSUE_WIDTH]; + wire [ISSUE_WIS_W-1:0] tc_rf_wis [`ISSUE_WIDTH]; // FIXME: don't need full ISSUE_WIDTH; only one warp is read at a time // because NUM_BLOCKS == 1 wire [`NUM_THREADS-1:0][`XLEN-1:0] tc_rf_data [`ISSUE_WIDTH]; @@ -56,12 +89,44 @@ module VX_operands_dup import VX_gpu_pkg::*; #( `ifdef EXT_T_ASYNC `STATIC_ASSERT((ISSUE_RATIO == 1), ("static assertion failed: tensor core only supports ISSUE_RATIO == 1")) - assign tc_rf_valid = '{`ISSUE_WIDTH{tensor_regfile_if.req_valid}}; - assign tc_rf_addr = '{`ISSUE_WIDTH{tensor_regfile_if.req_data.rs}}; - assign tensor_regfile_if.rsp_data.data = tc_rf_data[0]; + `STATIC_ASSERT((NUM_TENSOR_CORES == `NUM_TENSOR_WARPS), + ("Wu tensor RF binding requires NUM_TENSOR_CORES == NUM_TENSOR_WARPS")) + for (genvar tc = 0; tc < NUM_TENSOR_CORES; ++tc) begin : g_tc_rf_rsp + localparam TENSOR_ISW = `NUM_SCALAR_WARPS + tc; + assign tensor_regfile_if[tc].rsp_data.data = tc_rf_data[TENSOR_ISW]; + end + + for (genvar rf_i = 0; rf_i < `ISSUE_WIDTH; ++rf_i) begin : g_tc_rf_req + if ((DOMAIN == WU_DOMAIN_TENSOR) && `IS_TENSOR_WARP(rf_i)) begin + localparam TC_IDX = rf_i - `NUM_SCALAR_WARPS; + assign tc_rf_valid[rf_i] = (TC_IDX < NUM_TENSOR_CORES) ? tensor_regfile_if[TC_IDX].req_valid : 1'b0; + assign tc_rf_addr[rf_i] = (TC_IDX < NUM_TENSOR_CORES) ? tensor_regfile_if[TC_IDX].req_data.rs : '0; + assign tc_rf_wis[rf_i] = (TC_IDX < NUM_TENSOR_CORES) ? tensor_regfile_if[TC_IDX].req_data.wis : '0; + end else begin + assign tc_rf_valid[rf_i] = 1'b0; + assign tc_rf_addr[rf_i] = '0; + assign tc_rf_wis[rf_i] = '0; + end + end `endif for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin + localparam LANE_IN_DOMAIN = (DOMAIN == WU_DOMAIN_TENSOR) ? `IS_TENSOR_WARP(i) : `IS_SCALAR_WARP(i); + // Wu physical RF partition: tensor lanes instantiate a smaller RF, + // while scalar lanes retain the full scalar register file. + localparam GPR_RAM_SIZE = (!LANE_IN_DOMAIN ? 1 : (`IS_TENSOR_WARP(i) ? (TENSOR_NUM_REGS * ISSUE_RATIO) : (`NUM_REGS * ISSUE_RATIO))); + localparam GPR_RAM_ADDRW = `LOG2UP(GPR_RAM_SIZE); + + function automatic [GPR_RAM_ADDRW-1:0] lane_rf_addr ( + input logic [ISSUE_WIS_W-1:0] wis, + input logic [`NR_BITS-1:0] reg_id + ); + logic [RAM_ADDRW-1:0] full_addr; + begin + full_addr = wu_rf_addr(wis, reg_id, `IS_TENSOR_WARP(i)); + lane_rf_addr = full_addr[GPR_RAM_ADDRW-1:0]; + end + endfunction always @(posedge clk) begin if (reset) begin @@ -82,7 +147,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #( scoreboard_if[i].data.imm, scoreboard_if[i].data.rd }; - scoreboard_if_stored_valid[i] <= scoreboard_if[i].valid && scoreboard_if[i].ready; + scoreboard_if_stored_valid[i] <= LANE_IN_DOMAIN && scoreboard_if[i].valid && scoreboard_if[i].ready; end end @@ -117,11 +182,11 @@ module VX_operands_dup import VX_gpu_pkg::*; #( `UNUSED_PIN (alm_full), .size (size1[i]) ); - assign operands_if[i].valid = ~empty1[i]; + assign operands_if[i].valid = LANE_IN_DOMAIN && ~empty1[i]; `ifdef EXT_T_ASYNC - assign scoreboard_if[i].ready = (size1[i] < 3'd2) && ~tc_rf_valid[i]; + assign scoreboard_if[i].ready = LANE_IN_DOMAIN && (size1[i] < 3'd2) && ~tc_rf_valid[i]; `else - assign scoreboard_if[i].ready = (size1[i] < 3'd2); + assign scoreboard_if[i].ready = LANE_IN_DOMAIN && (size1[i] < 3'd2); `endif `ifdef SIMULATION @@ -146,7 +211,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #( operands_if[i].valid, operands_if[i].ready, operands_if[i].data.PC, operands_if[i].data.ex_type, operands_if[i].data.op_type, size1[i], empty1[i], full1[i], - tc_rf_valid[i], tc_rf_addr[i], tensor_regfile_if.req_data.wis, + tc_rf_valid[i], tc_rf_addr[i], tc_rf_wis[i], scoreboard_if[i].data.uuid, operands_if[i].data.uuid)); `else `TRACE(2, ("%d: core%0d-operands-probe: isw=%0d, scb=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, ops=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, size1=%0d, empty1=%b, full1=%b (#scb=%0d #ops=%0d)\n", @@ -171,9 +236,9 @@ module VX_operands_dup import VX_gpu_pkg::*; #( wire [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data; wire [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data; - reg [RAM_ADDRW-1:0] gpr_rd_addr_rs1_stored; - reg [RAM_ADDRW-1:0] gpr_rd_addr_rs2_stored; - reg [RAM_ADDRW-1:0] gpr_rd_addr_rs3_stored; + reg [GPR_RAM_ADDRW-1:0] gpr_rd_addr_rs1_stored; + reg [GPR_RAM_ADDRW-1:0] gpr_rd_addr_rs2_stored; + reg [GPR_RAM_ADDRW-1:0] gpr_rd_addr_rs3_stored; for (genvar j = 0; j < `NUM_THREADS; ++j) begin VX_fifo_queue #( @@ -209,10 +274,10 @@ module VX_operands_dup import VX_gpu_pkg::*; #( // GPR banks - wire [RAM_ADDRW-1:0] gpr_rd_addr_rs1; - wire [RAM_ADDRW-1:0] gpr_rd_addr_rs2; - wire [RAM_ADDRW-1:0] gpr_rd_addr_rs3; - wire [RAM_ADDRW-1:0] gpr_wr_addr; + wire [GPR_RAM_ADDRW-1:0] gpr_rd_addr_rs1; + wire [GPR_RAM_ADDRW-1:0] gpr_rd_addr_rs2; + wire [GPR_RAM_ADDRW-1:0] gpr_rd_addr_rs3; + wire [GPR_RAM_ADDRW-1:0] gpr_wr_addr; always @(posedge clk) begin if (reset) begin @@ -227,13 +292,13 @@ module VX_operands_dup import VX_gpu_pkg::*; #( end if (ISSUE_WIS != 0) begin - assign gpr_wr_addr = {writeback_if[i].data.wis, writeback_if[i].data.rd}; - assign gpr_rd_addr_rs1 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs1}; - assign gpr_rd_addr_rs2 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs2}; + assign gpr_wr_addr = lane_rf_addr(writeback_if[i].data.wis, writeback_if[i].data.rd); + assign gpr_rd_addr_rs1 = lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs1); + assign gpr_rd_addr_rs2 = lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs2); `ifdef EXT_T_ASYNC - assign gpr_rd_addr_rs3 = tc_rf_valid[i] ? tc_rf_addr[i] : {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3}; + assign gpr_rd_addr_rs3 = tc_rf_valid[i] ? lane_rf_addr(tc_rf_wis[i], tc_rf_addr[i][`NR_BITS-1:0]) : lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3); `else - assign gpr_rd_addr_rs3 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3}; + assign gpr_rd_addr_rs3 = lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3); `endif // always @(posedge clk) begin // if (reset) begin @@ -250,13 +315,13 @@ module VX_operands_dup import VX_gpu_pkg::*; #( // end // end end else begin - assign gpr_wr_addr = writeback_if[i].data.rd; - assign gpr_rd_addr_rs1 = scoreboard_if[i].data.rs1; - assign gpr_rd_addr_rs2 = scoreboard_if[i].data.rs2; + assign gpr_wr_addr = lane_rf_addr(writeback_if[i].data.wis, writeback_if[i].data.rd); + assign gpr_rd_addr_rs1 = lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs1); + assign gpr_rd_addr_rs2 = lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs2); `ifdef EXT_T_ASYNC - assign gpr_rd_addr_rs3 = tc_rf_valid[i] ? tc_rf_addr[i] : scoreboard_if[i].data.rs3; + assign gpr_rd_addr_rs3 = tc_rf_valid[i] ? lane_rf_addr(tc_rf_wis[i], tc_rf_addr[i][`NR_BITS-1:0]) : lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3); `else - assign gpr_rd_addr_rs3 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3}; + assign gpr_rd_addr_rs3 = lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3); `endif // always @(posedge clk) begin // if (reset) begin @@ -292,7 +357,8 @@ module VX_operands_dup import VX_gpu_pkg::*; #( for (genvar j = 0; j < `NUM_THREADS; ++j) begin VX_dp_ram #( .DATAW (`XLEN), - .SIZE (`NUM_REGS * ISSUE_RATIO), + .SIZE (GPR_RAM_SIZE), + .ADDRW (GPR_RAM_ADDRW), .OUT_REG (1), `ifdef GPR_RESET .INIT_ENABLE (1), @@ -301,12 +367,12 @@ module VX_operands_dup import VX_gpu_pkg::*; #( .NO_RWCHECK (1) ) gpr_ram_rs1 ( .clk (clk), - .read (scoreboard_if[i].valid && scoreboard_if[i].ready), // tc read valid check incl. in ready + .read (LANE_IN_DOMAIN && scoreboard_if[i].valid && scoreboard_if[i].ready), // tc read valid check incl. in ready `UNUSED_PIN (wren), `ifdef GPR_RESET - .write (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]), + .write (LANE_IN_DOMAIN && wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]), `else - .write (writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]), + .write (LANE_IN_DOMAIN && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]), `endif .waddr (gpr_wr_addr), .wdata (writeback_if[i].data.data[j]), @@ -316,7 +382,8 @@ module VX_operands_dup import VX_gpu_pkg::*; #( VX_dp_ram #( .DATAW (`XLEN), - .SIZE (`NUM_REGS * ISSUE_RATIO), + .SIZE (GPR_RAM_SIZE), + .ADDRW (GPR_RAM_ADDRW), .OUT_REG (1), `ifdef GPR_RESET .INIT_ENABLE (1), @@ -325,12 +392,12 @@ module VX_operands_dup import VX_gpu_pkg::*; #( .NO_RWCHECK (1) ) gpr_ram_rs2( .clk (clk), - .read (scoreboard_if[i].valid && scoreboard_if[i].ready), // tc read valid check incl. in ready + .read (LANE_IN_DOMAIN && scoreboard_if[i].valid && scoreboard_if[i].ready), // tc read valid check incl. in ready `UNUSED_PIN (wren), `ifdef GPR_RESET - .write (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]), + .write (LANE_IN_DOMAIN && wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]), `else - .write (writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]), + .write (LANE_IN_DOMAIN && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]), `endif .waddr (gpr_wr_addr), .wdata (writeback_if[i].data.data[j]), @@ -340,7 +407,8 @@ module VX_operands_dup import VX_gpu_pkg::*; #( VX_dp_ram #( .DATAW (`XLEN), - .SIZE (`NUM_REGS * ISSUE_RATIO), + .SIZE (GPR_RAM_SIZE), + .ADDRW (GPR_RAM_ADDRW), .OUT_REG (1), `ifdef GPR_RESET .INIT_ENABLE (1), @@ -350,15 +418,15 @@ module VX_operands_dup import VX_gpu_pkg::*; #( ) gpr_ram_rs3 ( .clk (clk), `ifdef EXT_T_ASYNC - .read ((scoreboard_if[i].valid && scoreboard_if[i].ready) || tc_rf_valid[i]), + .read (LANE_IN_DOMAIN && ((scoreboard_if[i].valid && scoreboard_if[i].ready) || tc_rf_valid[i])), `else - .read (scoreboard_if[i].valid && scoreboard_if[i].ready), + .read (LANE_IN_DOMAIN && scoreboard_if[i].valid && scoreboard_if[i].ready), `endif `UNUSED_PIN (wren), `ifdef GPR_RESET - .write (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]), + .write (LANE_IN_DOMAIN && wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]), `else - .write (writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]), + .write (LANE_IN_DOMAIN && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]), `endif .waddr (gpr_wr_addr), .wdata (writeback_if[i].data.data[j]), @@ -367,9 +435,9 @@ module VX_operands_dup import VX_gpu_pkg::*; #( ); `ifdef PERF_ENABLE - assign perf_write_rs1_per_thread[j] = (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]); - assign perf_write_rs2_per_thread[j] = (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]); - assign perf_write_rs3_per_thread[j] = (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]); + assign perf_write_rs1_per_thread[j] = `PERF_CTR_BITS'(wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]); + assign perf_write_rs2_per_thread[j] = `PERF_CTR_BITS'(wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]); + assign perf_write_rs3_per_thread[j] = `PERF_CTR_BITS'(wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]); `endif end diff --git a/hw/rtl/core/VX_schedule.sv b/hw/rtl/core/VX_schedule.sv index 1b3cd5ee..adcd6091 100644 --- a/hw/rtl/core/VX_schedule.sv +++ b/hw/rtl/core/VX_schedule.sv @@ -14,7 +14,8 @@ `include "VX_define.vh" module VX_schedule import VX_gpu_pkg::*; #( - parameter CORE_ID = 0 + parameter CORE_ID = 0, + parameter NUM_BRANCHES = `NUM_ALU_BLOCKS ) ( input wire clk, input wire reset, @@ -28,12 +29,20 @@ module VX_schedule import VX_gpu_pkg::*; #( // inputsdecode_if VX_warp_ctl_if.slave warp_ctl_if, - VX_branch_ctl_if.slave branch_ctl_if [`NUM_ALU_BLOCKS], + VX_branch_ctl_if.slave branch_ctl_if [NUM_BRANCHES], VX_decode_sched_if.slave decode_sched_if, VX_commit_sched_if.slave commit_sched_if, +`ifdef EXT_T_ENABLE + input wire tensor_csr_unlock_valid, + input wire [`NW_WIDTH-1:0] tensor_csr_unlock_wid, + input wire tensor_tmc_valid, + input wire [`NW_WIDTH-1:0] tensor_tmc_wid, + input wire [`NUM_THREADS-1:0] tensor_tmc_tmask, +`endif // outputs - VX_schedule_if.master schedule_if, + VX_schedule_if.master scalar_schedule_if, + VX_schedule_if.master tensor_schedule_if, `ifdef GBAR_ENABLE VX_gbar_bus_if.master gbar_bus_if, `endif @@ -50,11 +59,10 @@ module VX_schedule import VX_gpu_pkg::*; #( reg [`NUM_WARPS-1:0][`NUM_THREADS-1:0] thread_masks, thread_masks_n; reg [`NUM_WARPS-1:0][`XLEN-1:0] warp_pcs, warp_pcs_n; - wire [`NW_WIDTH-1:0] schedule_wid; - wire [`NUM_THREADS-1:0] schedule_tmask; - wire [`XLEN-1:0] schedule_pc; - wire schedule_valid; - wire schedule_ready; + wire scalar_schedule_fire = scalar_schedule_if.valid && scalar_schedule_if.ready; + wire tensor_schedule_fire = tensor_schedule_if.valid && tensor_schedule_if.ready; + wire schedule_fire_any = scalar_schedule_fire || tensor_schedule_fire; + wire [`NW_WIDTH-1:0] schedule_fire_wid = tensor_schedule_fire ? tensor_schedule_if.data.wid : scalar_schedule_if.data.wid; // split/join wire join_valid; @@ -68,15 +76,14 @@ module VX_schedule import VX_gpu_pkg::*; #( reg [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] issued_instrs; - wire schedule_fire = schedule_valid && schedule_ready; - wire schedule_if_fire = schedule_if.valid && schedule_if.ready; + wire schedule_if_fire = schedule_fire_any; // branch - wire [`NUM_ALU_BLOCKS-1:0] branch_valid; - wire [`NUM_ALU_BLOCKS-1:0][`NW_WIDTH-1:0] branch_wid; - wire [`NUM_ALU_BLOCKS-1:0] branch_taken; - wire [`NUM_ALU_BLOCKS-1:0][`XLEN-1:0] branch_dest; - for (genvar i = 0; i < `NUM_ALU_BLOCKS; ++i) begin + wire [NUM_BRANCHES-1:0] branch_valid; + wire [NUM_BRANCHES-1:0][`NW_WIDTH-1:0] branch_wid; + wire [NUM_BRANCHES-1:0] branch_taken; + wire [NUM_BRANCHES-1:0][`XLEN-1:0] branch_dest; + for (genvar i = 0; i < NUM_BRANCHES; ++i) begin assign branch_valid[i] = branch_ctl_if[i].valid; assign branch_wid[i] = branch_ctl_if[i].wid; assign branch_taken[i] = branch_ctl_if[i].taken; @@ -87,7 +94,13 @@ module VX_schedule import VX_gpu_pkg::*; #( reg [`NUM_BARRIERS-1:0][`NUM_WARPS-1:0] barrier_masks, barrier_masks_n; reg [`NUM_WARPS-1:0] barrier_stalls, barrier_stalls_n; wire [`CLOG2(`NUM_WARPS+1)-1:0] active_barrier_count; - wire [`NUM_WARPS-1:0] curr_barrier_mask; + wire [`NUM_WARPS-1:0] curr_barrier_mask; + wire [`NUM_WARPS-1:0] curr_barrier_mask_with_self; + wire [`NUM_WARPS-1:0] scalar_warp_mask; + wire [`NUM_WARPS-1:0] tensor_warp_mask; + wire [`NUM_WARPS-1:0] barrier_domain_mask; + wire [`NUM_WARPS-1:0] barrier_arrived_mask; + wire [`CLOG2(`NUM_WARPS+1)-1:0] barrier_arrived_count; `ifdef GBAR_ENABLE reg [`NUM_WARPS-1:0] curr_barrier_mask_n; reg gbar_req_valid; @@ -95,8 +108,21 @@ module VX_schedule import VX_gpu_pkg::*; #( reg [`NC_WIDTH-1:0] gbar_req_size_m1; `endif + for (genvar i = 0; i < `NUM_WARPS; ++i) begin + assign scalar_warp_mask[i] = `IS_SCALAR_WARP(i); + assign tensor_warp_mask[i] = `IS_TENSOR_WARP(i); + end + assign curr_barrier_mask = barrier_masks[warp_ctl_if.barrier.id]; + assign curr_barrier_mask_with_self = curr_barrier_mask | (`NUM_WARPS'(1) << warp_ctl_if.wid); + assign barrier_domain_mask = + (warp_ctl_if.barrier.domain == BARRIER_SCALAR) ? (active_warps & scalar_warp_mask) : + (warp_ctl_if.barrier.domain == BARRIER_TENSOR) ? (active_warps & tensor_warp_mask) : + (warp_ctl_if.barrier.domain == BARRIER_MASK) ? (active_warps & warp_ctl_if.barrier.mask) : + active_warps; + assign barrier_arrived_mask = curr_barrier_mask_with_self & barrier_domain_mask; `POP_COUNT(active_barrier_count, curr_barrier_mask); + `POP_COUNT(barrier_arrived_count, barrier_arrived_mask); `UNUSED_VAR (active_barrier_count) always @(*) begin @@ -152,9 +178,11 @@ module VX_schedule import VX_gpu_pkg::*; #( `endif if (warp_ctl_if.valid && warp_ctl_if.barrier.valid) begin if (~warp_ctl_if.barrier.is_global - && (active_barrier_count[`NW_WIDTH-1:0] == warp_ctl_if.barrier.size_m1[`NW_WIDTH-1:0])) begin + && ((warp_ctl_if.barrier.domain == BARRIER_MASK) + ? ((barrier_arrived_mask & warp_ctl_if.barrier.mask) == warp_ctl_if.barrier.mask) + : (barrier_arrived_count[`NW_WIDTH-1:0] == (warp_ctl_if.barrier.size_m1[`NW_WIDTH-1:0] + `NW_WIDTH'(1))))) begin barrier_masks_n[warp_ctl_if.barrier.id] = '0; - barrier_stalls_n &= ~barrier_masks[warp_ctl_if.barrier.id]; + barrier_stalls_n &= ~barrier_arrived_mask; end else begin barrier_masks_n[warp_ctl_if.barrier.id][warp_ctl_if.wid] = 1; barrier_stalls_n[warp_ctl_if.wid] = 1; @@ -186,7 +214,7 @@ module VX_schedule import VX_gpu_pkg::*; #( `endif // Branch handling - for (integer i = 0; i < `NUM_ALU_BLOCKS; ++i) begin + for (integer i = 0; i < NUM_BRANCHES; ++i) begin if (branch_valid[i]) begin if (branch_taken[i]) begin warp_pcs_n[branch_wid[i]] = branch_dest[i]; @@ -205,14 +233,31 @@ module VX_schedule import VX_gpu_pkg::*; #( stalled_warps_n[sched_csr_if.unlock_wid] = 0; end + `ifdef EXT_T_ENABLE + // Tensor control handles a minimal CSR-read/TMC subset without + // reusing the scalar SFU. + if (tensor_csr_unlock_valid) begin + stalled_warps_n[tensor_csr_unlock_wid] = 0; + end + + if (tensor_tmc_valid) begin + active_warps_n[tensor_tmc_wid] = (tensor_tmc_tmask != 0); + thread_masks_n[tensor_tmc_wid] = tensor_tmc_tmask; + stalled_warps_n[tensor_tmc_wid] = 0; + end + `endif + // stall the warp until decode stage - if (schedule_fire) begin - stalled_warps_n[schedule_wid] = 1; + if (schedule_fire_any) begin + stalled_warps_n[schedule_fire_wid] = 1; end // advance PC - if (schedule_if_fire) begin - warp_pcs_n[schedule_if.data.wid] = schedule_if.data.PC + 4; + if (scalar_schedule_fire) begin + warp_pcs_n[scalar_schedule_if.data.wid] = scalar_schedule_if.data.PC + 4; + end + if (tensor_schedule_fire) begin + warp_pcs_n[tensor_schedule_if.data.wid] = tensor_schedule_if.data.PC + 4; end end @@ -251,9 +296,9 @@ module VX_schedule import VX_gpu_pkg::*; #( `ifdef GBAR_CLUSTER_ENABLE // engage cluster barrier as soon as the barrier count is // fulfilled, instead of requiring all warps to be synchronized - && (active_barrier_count[`NW_WIDTH-1:0] == warp_ctl_if.barrier.size_m1[`NW_WIDTH-1:0])) begin + && (barrier_arrived_count[`NW_WIDTH-1:0] == (warp_ctl_if.barrier.size_m1[`NW_WIDTH-1:0] + `NW_WIDTH'(1)))) begin `else - && (curr_barrier_mask_n == active_warps)) begin + && (barrier_arrived_mask == barrier_domain_mask)) begin `endif gbar_req_valid <= 1; gbar_req_id <= warp_ctl_if.barrier.id; @@ -264,8 +309,11 @@ module VX_schedule import VX_gpu_pkg::*; #( end `endif - if (schedule_if_fire) begin - issued_instrs[schedule_if.data.wid] <= issued_instrs[schedule_if.data.wid] + `UUID_WIDTH'(1); + if (scalar_schedule_fire) begin + issued_instrs[scalar_schedule_if.data.wid] <= issued_instrs[scalar_schedule_if.data.wid] + `UUID_WIDTH'(1); + end + if (tensor_schedule_fire) begin + issued_instrs[tensor_schedule_if.data.wid] <= issued_instrs[tensor_schedule_if.data.wid] + `UUID_WIDTH'(1); end if (busy) begin @@ -309,15 +357,33 @@ module VX_schedule import VX_gpu_pkg::*; #( // schedule the next ready warp wire [`NUM_WARPS-1:0] ready_warps = active_warps & ~(stalled_warps | barrier_stalls); + wire [`NUM_WARPS-1:0] scalar_ready_warps = ready_warps & scalar_warp_mask; + wire [`NUM_WARPS-1:0] tensor_ready_warps = ready_warps & tensor_warp_mask; + wire [`NW_WIDTH-1:0] scalar_schedule_wid; + wire [`NW_WIDTH-1:0] tensor_schedule_wid; + wire scalar_schedule_valid; + wire tensor_schedule_valid; + wire scalar_schedule_ready; + wire tensor_schedule_ready; VX_lzc_rr #( .N (`NUM_WARPS) - ) wid_select ( + ) scalar_wid_select ( .clk (clk), .reset (reset), - .data_in (ready_warps), - .data_out (schedule_wid), - .valid_out (schedule_valid) + .data_in (scalar_ready_warps), + .data_out (scalar_schedule_wid), + .valid_out (scalar_schedule_valid) + ); + + VX_lzc_rr #( + .N (`NUM_WARPS) + ) tensor_wid_select ( + .clk (clk), + .reset (reset), + .data_in (tensor_ready_warps), + .data_out (tensor_schedule_wid), + .valid_out (tensor_schedule_valid) ); wire [`NUM_WARPS-1:0][(`NUM_THREADS + `XLEN)-1:0] schedule_data; @@ -325,47 +391,78 @@ module VX_schedule import VX_gpu_pkg::*; #( assign schedule_data[i] = {thread_masks[i], warp_pcs[i]}; end - assign {schedule_tmask, schedule_pc} = { - schedule_data[schedule_wid][(`NUM_THREADS + `XLEN)-1:(`NUM_THREADS + `XLEN)-4], - schedule_data[schedule_wid][(`NUM_THREADS + `XLEN)-5:0] - }; - `ifndef NDEBUG localparam GNW_WIDTH = `LOG2UP(`NUM_CLUSTERS * `NUM_CORES * `NUM_WARPS); - reg [`UUID_WIDTH-1:0] instr_uuid; - wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(schedule_wid); -`ifdef SV_DPI - always @(posedge clk) begin - if (reset) begin - instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(1, 0, 0)); - end else if (schedule_fire) begin - instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(0, 32'(g_wid), 64'(schedule_pc))); - end - end + function automatic [`UUID_WIDTH-1:0] schedule_uuid ( + input logic [`NW_WIDTH-1:0] wid, + input logic [`XLEN-1:0] pc + ); + logic [GNW_WIDTH-1:0] g_wid; + begin + g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(wid); + schedule_uuid = `UUID_WIDTH'({g_wid, 16'(pc)}); + end + endfunction `else - wire [GNW_WIDTH+16-1:0] w_uuid = {g_wid, 16'(schedule_pc)}; - always @(*) begin - instr_uuid = `UUID_WIDTH'(w_uuid); - end -`endif -`else - wire [`UUID_WIDTH-1:0] instr_uuid = '0; + function automatic [`UUID_WIDTH-1:0] schedule_uuid ( + input logic [`NW_WIDTH-1:0] wid, + input logic [`XLEN-1:0] pc + ); + begin + `UNUSED_VAR (wid) + `UNUSED_VAR (pc) + schedule_uuid = '0; + end + endfunction `endif VX_elastic_buffer #( - .DATAW (`NUM_THREADS + `XLEN + `NW_WIDTH) - ) out_buf ( + .DATAW (`NUM_THREADS + `XLEN + `NW_WIDTH), + .SIZE (0) + ) scalar_out_buf ( .clk (clk), .reset (reset), - .valid_in (schedule_valid), - .ready_in (schedule_ready), - .data_in ({schedule_tmask, schedule_pc, schedule_wid}), - .data_out ({schedule_if.data.tmask, schedule_if.data.PC, schedule_if.data.wid}), - .valid_out (schedule_if.valid), - .ready_out (schedule_if.ready) + .valid_in (!reset && scalar_schedule_valid), + .ready_in (scalar_schedule_ready), + .data_in ({schedule_data[scalar_schedule_wid], scalar_schedule_wid}), + .data_out ({scalar_schedule_if.data.tmask, scalar_schedule_if.data.PC, scalar_schedule_if.data.wid}), + .valid_out (scalar_schedule_if.valid), + .ready_out (scalar_schedule_if.ready) ); - assign schedule_if.data.uuid = instr_uuid; + VX_elastic_buffer #( + .DATAW (`NUM_THREADS + `XLEN + `NW_WIDTH), + .SIZE (0) + ) tensor_out_buf ( + .clk (clk), + .reset (reset), + .valid_in (!reset && tensor_schedule_valid), + .ready_in (tensor_schedule_ready), + .data_in ({schedule_data[tensor_schedule_wid], tensor_schedule_wid}), + .data_out ({tensor_schedule_if.data.tmask, tensor_schedule_if.data.PC, tensor_schedule_if.data.wid}), + .valid_out (tensor_schedule_if.valid), + .ready_out (tensor_schedule_if.ready) + ); + + assign scalar_schedule_if.data.uuid = schedule_uuid(scalar_schedule_if.data.wid, scalar_schedule_if.data.PC); + assign tensor_schedule_if.data.uuid = schedule_uuid(tensor_schedule_if.data.wid, tensor_schedule_if.data.PC); + + `RUNTIME_ASSERT( + !(scalar_schedule_fire && tensor_schedule_fire), + ("%t: *** core%0d-schedule-two-domain-fire-with-single-fetch", $time, CORE_ID) + ) + + `RUNTIME_ASSERT( + !scalar_schedule_if.valid || `IS_SCALAR_WARP(scalar_schedule_if.data.wid), + ("%t: *** core%0d-scalar-scheduler-issued-tensor-warp wid=%0d", + $time, CORE_ID, scalar_schedule_if.data.wid) + ) + + `RUNTIME_ASSERT( + !tensor_schedule_if.valid || `IS_TENSOR_WARP(tensor_schedule_if.data.wid), + ("%t: *** core%0d-tensor-scheduler-issued-scalar-warp wid=%0d", + $time, CORE_ID, tensor_schedule_if.data.wid) + ) `RESET_RELAY (pending_instr_reset, reset); @@ -377,8 +474,8 @@ module VX_schedule import VX_gpu_pkg::*; #( ) pending_instr( .clk (clk), .reset (pending_instr_reset), - .incr (schedule_if_fire), - .incr_wid (schedule_if.data.wid), + .incr (decode_sched_if.valid), + .incr_wid (decode_sched_if.wid), .decr (commit_sched_if.committed), .decr_wid (commit_sched_if.committed_wid), .alm_empty_wid (sched_csr_if.alm_empty_wid), @@ -413,13 +510,30 @@ module VX_schedule import VX_gpu_pkg::*; #( end `RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps)); + `RUNTIME_ASSERT( + !(warp_ctl_if.valid && warp_ctl_if.barrier.valid) || barrier_domain_mask != '0, + ("%t: *** core%0d-invalid-barrier-empty-domain: wid=%0d id=%0d domain=%0d active=%b mask=%b", + $time, CORE_ID, warp_ctl_if.wid, warp_ctl_if.barrier.id, warp_ctl_if.barrier.domain, active_warps, warp_ctl_if.barrier.mask) + ) + + `RUNTIME_ASSERT( + !(warp_ctl_if.valid && warp_ctl_if.barrier.valid) || barrier_domain_mask[warp_ctl_if.wid], + ("%t: *** core%0d-invalid-barrier-wid-domain: wid=%0d id=%0d domain=%0d active=%b mask=%b", + $time, CORE_ID, warp_ctl_if.wid, warp_ctl_if.barrier.id, warp_ctl_if.barrier.domain, active_warps, warp_ctl_if.barrier.mask) + ) + `ifdef PERF_ENABLE reg [`PERF_CTR_BITS-1:0] perf_sched_idles; reg [`PERF_CTR_BITS-1:0] perf_sched_stalls; reg [`PERF_CTR_BITS-1:0] perf_sched_barrier_idles; + reg [`PERF_CTR_BITS-1:0] perf_scalar_sched_ready_cycles; + reg [`PERF_CTR_BITS-1:0] perf_tensor_sched_ready_cycles; + reg [`PERF_CTR_BITS-1:0] perf_scalar_sched_issued_cycles; + reg [`PERF_CTR_BITS-1:0] perf_tensor_sched_issued_cycles; - wire schedule_idle = ~schedule_valid; - wire schedule_stall = schedule_if.valid && ~schedule_if.ready; + wire schedule_idle = ~(scalar_schedule_if.valid || tensor_schedule_if.valid); + wire schedule_stall = (scalar_schedule_if.valid && ~scalar_schedule_if.ready) + || (tensor_schedule_if.valid && ~tensor_schedule_if.ready); wire [`CLOG2(`NUM_WARPS+1)-1:0] schedule_barrier_idle; `POP_COUNT(schedule_barrier_idle, barrier_stalls); @@ -427,17 +541,29 @@ module VX_schedule import VX_gpu_pkg::*; #( if (reset) begin perf_sched_idles <= '0; perf_sched_barrier_idles <= '0; - perf_sched_stalls <= '0; + perf_sched_stalls <= '0; + perf_scalar_sched_ready_cycles <= '0; + perf_tensor_sched_ready_cycles <= '0; + perf_scalar_sched_issued_cycles <= '0; + perf_tensor_sched_issued_cycles <= '0; end else begin perf_sched_idles <= perf_sched_idles + `PERF_CTR_BITS'(schedule_idle); perf_sched_barrier_idles <= perf_sched_barrier_idles + `PERF_CTR_BITS'(schedule_barrier_idle); perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(schedule_stall); + perf_scalar_sched_ready_cycles <= perf_scalar_sched_ready_cycles + `PERF_CTR_BITS'(scalar_schedule_valid); + perf_tensor_sched_ready_cycles <= perf_tensor_sched_ready_cycles + `PERF_CTR_BITS'(tensor_schedule_valid); + perf_scalar_sched_issued_cycles <= perf_scalar_sched_issued_cycles + `PERF_CTR_BITS'(scalar_schedule_fire); + perf_tensor_sched_issued_cycles <= perf_tensor_sched_issued_cycles + `PERF_CTR_BITS'(tensor_schedule_fire); end end assign perf_schedule_if.sched_idles = perf_sched_idles; assign perf_schedule_if.sched_barrier_idles = perf_sched_barrier_idles; - assign perf_schedule_if.sched_stalls = perf_sched_stalls; + assign perf_schedule_if.sched_stalls = perf_sched_stalls; + assign perf_schedule_if.scalar_sched_ready_cycles = perf_scalar_sched_ready_cycles; + assign perf_schedule_if.tensor_sched_ready_cycles = perf_tensor_sched_ready_cycles; + assign perf_schedule_if.scalar_sched_issued_cycles = perf_scalar_sched_issued_cycles; + assign perf_schedule_if.tensor_sched_issued_cycles = perf_tensor_sched_issued_cycles; `endif endmodule diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv index bd680769..0ae86141 100644 --- a/hw/rtl/core/VX_scoreboard.sv +++ b/hw/rtl/core/VX_scoreboard.sv @@ -14,7 +14,8 @@ `include "VX_define.vh" module VX_scoreboard import VX_gpu_pkg::*; #( - parameter CORE_ID = 0 + parameter CORE_ID = 0, + parameter DOMAIN = WU_DOMAIN_SCALAR ) ( input wire clk, input wire reset, @@ -34,6 +35,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #( ); `UNUSED_PARAM (CORE_ID) localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1; +`ifdef EXT_F_ENABLE + localparam TENSOR_NUM_REGS = `TENSOR_NUM_GPRS + `TENSOR_NUM_FPRS; +`else + localparam TENSOR_NUM_REGS = `TENSOR_NUM_GPRS; +`endif `ifdef PERF_ENABLE reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_units_per_cycle; @@ -141,7 +147,27 @@ module VX_scoreboard import VX_gpu_pkg::*; #( `endif for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin - reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs; + localparam IS_TENSOR_LANE = `IS_TENSOR_WARP(i); + localparam LANE_IN_DOMAIN = (DOMAIN == WU_DOMAIN_TENSOR) ? `IS_TENSOR_WARP(i) : `IS_SCALAR_WARP(i); + localparam SCOREBOARD_NUM_REGS = (IS_TENSOR_LANE ? TENSOR_NUM_REGS : `NUM_REGS); + localparam SCOREBOARD_REG_BITS = `CLOG2(SCOREBOARD_NUM_REGS); + + function automatic [SCOREBOARD_REG_BITS-1:0] scb_reg_idx ( + input logic [`NR_BITS-1:0] reg_id + ); + `ifdef EXT_F_ENABLE + if (IS_TENSOR_LANE && reg_id[`NR_BITS-1]) begin + scb_reg_idx = SCOREBOARD_REG_BITS'(`TENSOR_NUM_GPRS) + SCOREBOARD_REG_BITS'(reg_id[`NRI_BITS-1:0]); + end else + `endif + if (IS_TENSOR_LANE) begin + scb_reg_idx = SCOREBOARD_REG_BITS'(reg_id[`NRI_BITS-1:0]); + end else begin + scb_reg_idx = SCOREBOARD_REG_BITS'(reg_id); + end + endfunction + + reg [`UP(ISSUE_RATIO)-1:0][SCOREBOARD_NUM_REGS-1:0] inuse_regs; // Number of inflight operations in execution in the asynchronous // Tensor unit. Since the ISA does not specify an explicit destination // register, use a separate status bit. @@ -153,16 +179,24 @@ module VX_scoreboard import VX_gpu_pkg::*; #( ((ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA) || (ibuffer_if[i].data.op_type == `INST_TENSOR_BWGMMA)); - wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop; + wire [`NW_WIDTH-1:0] writeback_wid = wis_to_wid(writeback_if[i].data.wis, ISSUE_ISW_W'(i)); + wire writeback_lane_in_domain = (DOMAIN == WU_DOMAIN_TENSOR) ? `IS_TENSOR_WARP(writeback_wid) : `IS_SCALAR_WARP(writeback_wid); + wire writeback_fire = writeback_lane_in_domain && writeback_if[i].valid && writeback_if[i].data.eop; - wire inuse_rd = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]; - wire inuse_rs1 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]; - wire inuse_rs2 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]; - wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]; + wire [SCOREBOARD_REG_BITS-1:0] ibuf_rd_idx = scb_reg_idx(ibuffer_if[i].data.rd); + wire [SCOREBOARD_REG_BITS-1:0] ibuf_rs1_idx = scb_reg_idx(ibuffer_if[i].data.rs1); + wire [SCOREBOARD_REG_BITS-1:0] ibuf_rs2_idx = scb_reg_idx(ibuffer_if[i].data.rs2); + wire [SCOREBOARD_REG_BITS-1:0] ibuf_rs3_idx = scb_reg_idx(ibuffer_if[i].data.rs3); + wire [SCOREBOARD_REG_BITS-1:0] writeback_rd_idx = scb_reg_idx(writeback_if[i].data.rd); + + wire inuse_rd = inuse_regs[ibuffer_if[i].data.wis][ibuf_rd_idx]; + wire inuse_rs1 = inuse_regs[ibuffer_if[i].data.wis][ibuf_rs1_idx]; + wire inuse_rs2 = inuse_regs[ibuffer_if[i].data.wis][ibuf_rs2_idx]; + wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuf_rs3_idx]; `ifdef PERF_ENABLE - reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units; - reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu; + reg [`UP(ISSUE_RATIO)-1:0][SCOREBOARD_NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units; + reg [`UP(ISSUE_RATIO)-1:0][SCOREBOARD_NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu; reg [`SFU_WIDTH-1:0] sfu_type; always @(*) begin @@ -181,30 +215,30 @@ module VX_scoreboard import VX_gpu_pkg::*; #( if (ibuffer_if[i].valid) begin if (inuse_rd) begin perf_issue_any_unit_per_cycle[i] = '1; - perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1; - if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] == `EX_SFU) begin - perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1; + perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuf_rd_idx]] = 1; + if (inuse_units[ibuffer_if[i].data.wis][ibuf_rd_idx] == `EX_SFU) begin + perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuf_rd_idx]] = 1; end end if (inuse_rs1) begin perf_issue_any_unit_per_cycle[i] = '1; - perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1; - if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1] == `EX_SFU) begin - perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1; + perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuf_rs1_idx]] = 1; + if (inuse_units[ibuffer_if[i].data.wis][ibuf_rs1_idx] == `EX_SFU) begin + perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuf_rs1_idx]] = 1; end end if (inuse_rs2) begin perf_issue_any_unit_per_cycle[i] = '1; - perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1; - if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2] == `EX_SFU) begin - perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1; + perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuf_rs2_idx]] = 1; + if (inuse_units[ibuffer_if[i].data.wis][ibuf_rs2_idx] == `EX_SFU) begin + perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuf_rs2_idx]] = 1; end end if (inuse_rs3) begin perf_issue_any_unit_per_cycle[i] = '1; - perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1; - if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3] == `EX_SFU) begin - perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1; + perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuf_rs3_idx]] = 1; + if (inuse_units[ibuffer_if[i].data.wis][ibuf_rs3_idx] == `EX_SFU) begin + perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuf_rs3_idx]] = 1; end end end @@ -234,8 +268,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #( `endif wire stg_valid_in, stg_ready_in; - assign stg_valid_in = ibuffer_if[i].valid && operands_ready; - assign ibuffer_if[i].ready = stg_ready_in && operands_ready; + assign stg_valid_in = LANE_IN_DOMAIN && ibuffer_if[i].valid && operands_ready; + assign ibuffer_if[i].ready = LANE_IN_DOMAIN && stg_ready_in && operands_ready; VX_stream_buffer #( .DATAW (DATAW) @@ -277,10 +311,10 @@ module VX_scoreboard import VX_gpu_pkg::*; #( inflight_tensor <= '0; end else begin if (writeback_fire) begin - inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0; + inuse_regs[writeback_if[i].data.wis][writeback_rd_idx] <= 0; end if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin - inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1; + inuse_regs[ibuffer_if[i].data.wis][ibuf_rd_idx] <= 1; end `ifdef EXT_T_ASYNC if (tensor_writeback_fire) begin @@ -297,9 +331,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #( end `ifdef PERF_ENABLE if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin - inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= ibuffer_if[i].data.ex_type; + inuse_units[ibuffer_if[i].data.wis][ibuf_rd_idx] <= ibuffer_if[i].data.ex_type; if (ibuffer_if[i].data.ex_type == `EX_SFU) begin - inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= sfu_type; + inuse_sfu[ibuffer_if[i].data.wis][ibuf_rd_idx] <= sfu_type; end end `endif @@ -324,6 +358,16 @@ module VX_scoreboard import VX_gpu_pkg::*; #( end `ifdef EXT_T_ASYNC `ifdef DBG_TRACE_CORE_PIPELINE_VCS + if (CORE_ID == 0 && writeback_fire && + ((writeback_if[i].data.PC == 32'h80000010) || + (writeback_if[i].data.PC == 32'h80000014))) begin + `TRACE(1, ("%d: core%0d-scoreboard-writeback-recover: isw=%0d, wid=%0d, wis=%0d, PC=0x%0h, rd=%0d, tensor=%b, inuse_before=%b, eop=%b (#%0d)\n", + $time, CORE_ID, i, wis_to_wid(writeback_if[i].data.wis, i), + writeback_if[i].data.wis, writeback_if[i].data.PC, + writeback_if[i].data.rd, writeback_if[i].data.tensor, + inuse_regs[writeback_if[i].data.wis][writeback_rd_idx], + writeback_if[i].data.eop, writeback_if[i].data.uuid)); + end if (CORE_ID == 0 && ibuffer_if[i].valid && (ibuffer_if[i].data.PC >= `XLEN'h80000240) && (ibuffer_if[i].data.PC <= `XLEN'h80000260)) begin @@ -366,7 +410,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #( `RUNTIME_ASSERT((~writeback_fire || writeback_if[i].data.tensor /* dont check rd for tensor ghost writes */ || - inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0), + inuse_regs[writeback_if[i].data.wis][writeback_rd_idx] != 0), ("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)", $time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.PC, writeback_if[i].data.tmask, writeback_if[i].data.rd, writeback_if[i].data.uuid)); `endif diff --git a/hw/rtl/core/VX_sfu_unit.sv b/hw/rtl/core/VX_sfu_unit.sv index 48f1cb8f..e0c9b1d7 100644 --- a/hw/rtl/core/VX_sfu_unit.sv +++ b/hw/rtl/core/VX_sfu_unit.sv @@ -60,7 +60,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #( `RESET_RELAY (dispatch_reset, reset); - VX_dispatch_unit #( + VX_dispatch_unit_sane #( .BLOCK_SIZE (BLOCK_SIZE), .NUM_LANES (NUM_LANES), .OUT_REG (1) @@ -162,6 +162,22 @@ module VX_sfu_unit import VX_gpu_pkg::*; #( end assign execute_if[0].ready = sfu_req_ready; +`ifdef DBG_TRACE_CORE_PIPELINE_VCS + always @(posedge clk) begin + if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0) + && execute_if[0].valid + && (execute_if[0].data.PC == 32'h80000028)) begin + `TRACE(1, ("%d: core%0d-sfu-probe: valid=%b ready=%b op=0x%0h csr_valid=%b csr_ready=%b wctl_valid=%b wctl_ready=%b wid=%0d PC=0x%0h (#%0d)\n", + $time, CORE_ID, execute_if[0].valid, execute_if[0].ready, + execute_if[0].data.op_type, + csr_execute_if.valid, csr_execute_if.ready, + wctl_execute_if.valid, wctl_execute_if.ready, + execute_if[0].data.wid, execute_if[0].data.PC, + execute_if[0].data.uuid)); + end + end +`endif + // response arbitration `RESET_RELAY (commit_reset, reset); diff --git a/hw/rtl/core/VX_tensor_blackwell_core.sv b/hw/rtl/core/VX_tensor_blackwell_core.sv index 30ee6c4a..acfd2e43 100644 --- a/hw/rtl/core/VX_tensor_blackwell_core.sv +++ b/hw/rtl/core/VX_tensor_blackwell_core.sv @@ -12,14 +12,19 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #( VX_execute_if.slave execute_if, VX_tc_rf_if.master regfile_if, VX_tc_bus_if.master tmem_if, - // tmem_C is now a direct SRAM port (no TileLink) - output logic tmem_C_wen, + output logic tmem_A_ren, + input logic tmem_A_rready, + output logic [8:0] tmem_A_raddr, + input logic [`NUM_THREADS*`XLEN-1:0] tmem_A_rdata, output logic tmem_C_ren, - output logic [8:0] tmem_C_waddr, + input logic tmem_C_rready, output logic [8:0] tmem_C_raddr, + input logic [`NUM_THREADS*`XLEN-1:0] tmem_C_rdata, + output logic tmem_C_wen, + input logic tmem_C_wready, + output logic [8:0] tmem_C_waddr, output logic [`NUM_THREADS*`XLEN-1:0] tmem_C_wdata, output logic [`NUM_THREADS*`XLEN/8-1:0] tmem_C_mask, - input logic [`NUM_THREADS*`XLEN-1:0] tmem_C_rdata, VX_tc_bus_if.master smem_B_if, VX_commit_if.master commit_if ); @@ -106,7 +111,7 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #( tcgen05_st ? 3'd5 : tcgen05_cb ? 3'd6 : 3'd0; - wire initiate_valid = metadata_valid && tensor_launch_op && !sync_launch_pending && commit_if.ready; + wire initiate_valid = metadata_valid && tensor_launch_op && !sync_launch_pending; wire [`NW_WIDTH-1:0] initiate_wid = execute_if_data_wid; wire [4:0] initiate_rd = execute_if_data_rd[4:0]; wire [`XLEN-1:0] initiate_addr_a = execute_if_data_rs1[0]; @@ -161,13 +166,19 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #( .io_reqA_bits_source(tmem_if.req_data.tag), .io_reqA_bits_address(tmem_if.req_data.addr), .io_reqA_bits_data(tmem_if.req_data.data), - .io_tmemC_wen(tmem_C_wen), - .io_tmemC_ren(tmem_C_ren), - .io_tmemC_waddr(tmem_C_waddr), - .io_tmemC_raddr(tmem_C_raddr), - .io_tmemC_wdata(tmem_C_wdata), - .io_tmemC_mask(tmem_C_mask), - .io_tmemC_rdata(tmem_C_rdata), + .io_tmemC_aRen(tmem_A_ren), + .io_tmemC_aRready(tmem_A_rready), + .io_tmemC_aRaddr(tmem_A_raddr), + .io_tmemC_aRdata(tmem_A_rdata), + .io_tmemC_cRen(tmem_C_ren), + .io_tmemC_cRready(tmem_C_rready), + .io_tmemC_cRaddr(tmem_C_raddr), + .io_tmemC_cRdata(tmem_C_rdata), + .io_tmemC_cWen(tmem_C_wen), + .io_tmemC_cWready(tmem_C_wready), + .io_tmemC_cWaddr(tmem_C_waddr), + .io_tmemC_cWdata(tmem_C_wdata), + .io_tmemC_cMask(tmem_C_mask), .io_reqB_ready(smem_B_if.req_ready), .io_reqB_valid(smem_B_if.req_valid), .io_reqB_bits_rw(smem_B_if.req_data.rw), @@ -300,10 +311,11 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #( $time, initiate_valid, initiate_ready, initiate_op, initiate_addr_a, initiate_addr_b, writeback_valid, writeback_ready, writeback_wid, writeback_rd, writeback_last)); end - if (tmem_if.req_valid || tmem_C_wen || tmem_C_ren || smem_B_if.req_valid || tmem_if.rsp_valid || smem_B_if.rsp_valid) begin - `TRACE(2, ("%d: tensor-blackwell-mem: tmemA_req=%b/%b tmemA_rw=%b tmemA_addr=0x%0h tmemA_rsp=%b/%b, tmemC_wen=%b tmemC_ren=%b tmemC_waddr=0x%0h tmemC_raddr=0x%0h, smemB_req=%b/%b smemB_rw=%b smemB_addr=0x%0h smemB_rsp=%b/%b, initiate_ready=%b\n", + if (tmem_if.req_valid || tmem_C_wen || tmem_C_ren || tmem_A_ren || smem_B_if.req_valid || tmem_if.rsp_valid || smem_B_if.rsp_valid) begin + `TRACE(2, ("%d: tensor-blackwell-mem: tmemA_req=%b/%b tmemA_rw=%b tmemA_addr=0x%0h tmemA_rsp=%b/%b, tmemA_ren=%b/%b tmemA_raddr=0x%0h, tmemC_wen=%b/%b tmemC_ren=%b/%b tmemC_waddr=0x%0h tmemC_raddr=0x%0h, smemB_req=%b/%b smemB_rw=%b smemB_addr=0x%0h smemB_rsp=%b/%b, initiate_ready=%b\n", $time, tmem_if.req_valid, tmem_if.req_ready, tmem_if.req_data.rw, tmem_if.req_data.addr, tmem_if.rsp_valid, tmem_if.rsp_ready, - tmem_C_wen, tmem_C_ren, tmem_C_waddr, tmem_C_raddr, + tmem_A_ren, tmem_A_rready, tmem_A_raddr, + tmem_C_wen, tmem_C_wready, tmem_C_ren, tmem_C_rready, tmem_C_waddr, tmem_C_raddr, smem_B_if.req_valid, smem_B_if.req_ready, smem_B_if.req_data.rw, smem_B_if.req_data.addr, smem_B_if.rsp_valid, smem_B_if.rsp_ready, initiate_ready)); end diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 3b95a4f1..51badbb1 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -2,27 +2,42 @@ `include "VX_fpu_define.vh" module VX_tensor_core import VX_gpu_pkg::*; #( - parameter FP16 + parameter FP16, + parameter NUM_TENSOR_CORES = `NUM_TENSOR_WARPS ) ( input clk, input reset, VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH], `ifdef EXT_T_ASYNC - VX_tc_rf_if.master regfile_if, - VX_tc_bus_if.master smem_A_if, - output logic tmem_C_wen, - output logic tmem_C_ren, - output logic [8:0] tmem_C_waddr, - output logic [8:0] tmem_C_raddr, - output logic [`NUM_THREADS*`XLEN-1:0] tmem_C_wdata, - output logic [`NUM_THREADS*`XLEN/8-1:0] tmem_C_mask, - input logic [`NUM_THREADS*`XLEN-1:0] tmem_C_rdata, - VX_tc_bus_if.master smem_B_if, + VX_tc_rf_if.master regfile_if[NUM_TENSOR_CORES], + VX_tc_bus_if.master smem_A_if[NUM_TENSOR_CORES], + output logic [NUM_TENSOR_CORES-1:0] tmem_A_ren, + input logic [NUM_TENSOR_CORES-1:0] tmem_A_rready, + output logic [NUM_TENSOR_CORES*9-1:0] tmem_A_raddr, + input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tmem_A_rdata, + output logic [NUM_TENSOR_CORES-1:0] tmem_C_ren, + input logic [NUM_TENSOR_CORES-1:0] tmem_C_rready, + output logic [NUM_TENSOR_CORES*9-1:0] tmem_C_raddr, + input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tmem_C_rdata, + output logic [NUM_TENSOR_CORES-1:0] tmem_C_wen, + input logic [NUM_TENSOR_CORES-1:0] tmem_C_wready, + output logic [NUM_TENSOR_CORES*9-1:0] tmem_C_waddr, + output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tmem_C_wdata, + output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN/8-1:0] tmem_C_mask, + VX_tc_bus_if.master smem_B_if[NUM_TENSOR_CORES], `endif VX_commit_if.master commit_if [`ISSUE_WIDTH] ); + `STATIC_ASSERT((`ISSUE_WIDTH == `NUM_WARPS), + ("Wu tensor-core lane binding currently requires ISSUE_WIDTH == NUM_WARPS")) +`ifdef EXT_T_BLACKWELL + localparam BLOCK_SIZE = NUM_TENSOR_CORES; + `STATIC_ASSERT((NUM_TENSOR_CORES == `NUM_TENSOR_WARPS), + ("Blackwell Wu tensor-core binding requires NUM_TENSOR_CORES == NUM_TENSOR_WARPS")) +`else localparam BLOCK_SIZE = 1; +`endif localparam NUM_LANES = `NUM_THREADS; // @perf: PARTIAL_BW==1 increases power instantiating // stream_buffers for ISSUE_WIDTH times @@ -32,19 +47,48 @@ module VX_tensor_core import VX_gpu_pkg::*; #( .NUM_LANES (NUM_LANES) ) execute_if[BLOCK_SIZE](); + VX_dispatch_if compact_dispatch_if[`ISSUE_WIDTH](); + + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_tensor_dispatch_lane + if (i < NUM_TENSOR_CORES) begin + localparam SRC_ISW = `NUM_SCALAR_WARPS + i; + assign compact_dispatch_if[i].valid = dispatch_if[SRC_ISW].valid; + assign compact_dispatch_if[i].data = dispatch_if[SRC_ISW].data; + assign dispatch_if[SRC_ISW].ready = compact_dispatch_if[i].ready; + end else begin + assign compact_dispatch_if[i].valid = 1'b0; + assign compact_dispatch_if[i].data = '0; + end + + if (i < `NUM_SCALAR_WARPS) begin + assign dispatch_if[i].ready = 1'b0; + end + end + `RESET_RELAY (dispatch_reset, reset); VX_dispatch_unit_sane #( .BLOCK_SIZE (BLOCK_SIZE), .NUM_LANES (NUM_LANES), - .OUT_REG (PARTIAL_BW ? 1 : 0) + .OUT_REG (PARTIAL_BW ? 1 : 0), + .ISW_BASE (`NUM_SCALAR_WARPS) ) dispatch_unit ( .clk (clk), .reset (dispatch_reset), - .dispatch_if(dispatch_if), + .dispatch_if(compact_dispatch_if), .execute_if (execute_if) ); +`ifdef SIMULATION + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_tensor_domain_assert + `RUNTIME_ASSERT( + !dispatch_if[i].valid || `IS_TENSOR_WARP(i), + ("%t: *** tensor-dispatch-cross-domain: issue_lane=%0d ex_op=0x%0h", + $time, i, dispatch_if[i].data.op_type) + ) + end +`endif + VX_commit_if #( .NUM_LANES (NUM_LANES) ) commit_block_if[BLOCK_SIZE](); @@ -64,12 +108,14 @@ module VX_tensor_core import VX_gpu_pkg::*; #( for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin `ifdef EXT_T_HOPPER - assign tmem_C_wen = 1'b0; - assign tmem_C_ren = 1'b0; - assign tmem_C_waddr = '0; - assign tmem_C_raddr = '0; - assign tmem_C_wdata = '0; - assign tmem_C_mask = '0; + assign tmem_A_ren[block_idx] = 1'b0; + assign tmem_A_raddr[block_idx * 9 +: 9] = '0; + assign tmem_C_wen[block_idx] = 1'b0; + assign tmem_C_ren[block_idx] = 1'b0; + assign tmem_C_waddr[block_idx * 9 +: 9] = '0; + assign tmem_C_raddr[block_idx * 9 +: 9] = '0; + assign tmem_C_wdata[block_idx * `NUM_THREADS*`XLEN +: `NUM_THREADS*`XLEN] = '0; + assign tmem_C_mask[block_idx * `NUM_THREADS*`XLEN/8 +: `NUM_THREADS*`XLEN/8] = '0; VX_tensor_hopper_core_block #( .ISW(1), // FIXME: not block_idx .FP16(FP16) @@ -77,9 +123,9 @@ module VX_tensor_core import VX_gpu_pkg::*; #( .clk (clk), .reset (reset), .execute_if (execute_if[block_idx]), - .regfile_if (regfile_if), - .smem_A_if (smem_A_if), - .smem_B_if (smem_B_if), + .regfile_if (regfile_if[block_idx]), + .smem_A_if (smem_A_if[block_idx]), + .smem_B_if (smem_B_if[block_idx]), .commit_if (commit_block_if[block_idx]) ); `elsif EXT_T_BLACKWELL @@ -90,16 +136,22 @@ module VX_tensor_core import VX_gpu_pkg::*; #( .clk (clk), .reset (reset), .execute_if (execute_if[block_idx]), - .regfile_if (regfile_if), - .tmem_if (smem_A_if), - .tmem_C_wen (tmem_C_wen), - .tmem_C_ren (tmem_C_ren), - .tmem_C_waddr(tmem_C_waddr), - .tmem_C_raddr(tmem_C_raddr), - .tmem_C_wdata(tmem_C_wdata), - .tmem_C_mask(tmem_C_mask), - .tmem_C_rdata(tmem_C_rdata), - .smem_B_if (smem_B_if), + .regfile_if (regfile_if[block_idx]), + .tmem_if (smem_A_if[block_idx]), + .tmem_A_ren (tmem_A_ren[block_idx]), + .tmem_A_rready(tmem_A_rready[block_idx]), + .tmem_A_raddr(tmem_A_raddr[block_idx * 9 +: 9]), + .tmem_A_rdata(tmem_A_rdata[block_idx * `NUM_THREADS*`XLEN +: `NUM_THREADS*`XLEN]), + .tmem_C_ren (tmem_C_ren[block_idx]), + .tmem_C_rready(tmem_C_rready[block_idx]), + .tmem_C_raddr(tmem_C_raddr[block_idx * 9 +: 9]), + .tmem_C_rdata(tmem_C_rdata[block_idx * `NUM_THREADS*`XLEN +: `NUM_THREADS*`XLEN]), + .tmem_C_wen (tmem_C_wen[block_idx]), + .tmem_C_wready(tmem_C_wready[block_idx]), + .tmem_C_waddr(tmem_C_waddr[block_idx * 9 +: 9]), + .tmem_C_wdata(tmem_C_wdata[block_idx * `NUM_THREADS*`XLEN +: `NUM_THREADS*`XLEN]), + .tmem_C_mask(tmem_C_mask[block_idx * `NUM_THREADS*`XLEN/8 +: `NUM_THREADS*`XLEN/8]), + .smem_B_if (smem_B_if[block_idx]), .commit_if (commit_block_if[block_idx]) ); // ) tensor_hopper_core_block ( diff --git a/hw/rtl/core/VX_tensor_ctrl_unit.sv b/hw/rtl/core/VX_tensor_ctrl_unit.sv new file mode 100644 index 00000000..50f47095 --- /dev/null +++ b/hw/rtl/core/VX_tensor_ctrl_unit.sv @@ -0,0 +1,179 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +`include "VX_define.vh" + +module VX_tensor_ctrl_unit import VX_gpu_pkg::*; #( + parameter CORE_ID = 0 +) ( + input wire clk, + input wire reset, + + VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH], + VX_commit_if.master commit_if [`ISSUE_WIDTH], + VX_warp_ctl_if.master warp_ctl_if, + + output wire csr_unlock_valid, + output wire [`NW_WIDTH-1:0] csr_unlock_wid, + + output wire tmc_valid, + output wire [`NW_WIDTH-1:0] tmc_wid, + output wire [`NUM_THREADS-1:0] tmc_tmask +); + localparam BLOCK_SIZE = 1; + localparam NUM_LANES = `NUM_THREADS; + localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES); + localparam PID_WIDTH = `UP(PID_BITS); + localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + + 1 + (NUM_LANES * `XLEN) + 1 + PID_WIDTH + 1 + 1 + + 1 + 1 + `NUM_THREADS + $bits(barrier_t); + + VX_execute_if #( + .NUM_LANES (NUM_LANES) + ) execute_if[BLOCK_SIZE](); + + `RESET_RELAY (dispatch_reset, reset); + + VX_dispatch_unit_sane #( + .BLOCK_SIZE (BLOCK_SIZE), + .NUM_LANES (NUM_LANES), + .OUT_REG (1) + ) dispatch_unit ( + .clk (clk), + .reset (dispatch_reset), + .dispatch_if(dispatch_if), + .execute_if (execute_if) + ); + + VX_commit_if #( + .NUM_LANES (NUM_LANES) + ) ctrl_commit_if[BLOCK_SIZE](); + + wire is_tmc = (execute_if[0].data.op_type == `INST_SFU_TMC); + wire is_csr = (execute_if[0].data.op_type == `INST_SFU_CSRRS); + wire is_bar = (execute_if[0].data.op_type == `INST_SFU_BAR); + wire is_bar_mask = (execute_if[0].data.op_type == `INST_SFU_BAR_MASK); + wire [`VX_CSR_ADDR_BITS-1:0] csr_addr = execute_if[0].data.imm[`VX_CSR_ADDR_BITS-1:0]; + + wire [NUM_LANES-1:0][`XLEN-1:0] csr_read_data; + for (genvar i = 0; i < NUM_LANES; ++i) begin + wire [31:0] wtid = 32'(execute_if[0].data.pid * NUM_LANES + i); + wire [31:0] gtid = (32'(CORE_ID) << (`NW_BITS + `NT_BITS)) + + (32'(execute_if[0].data.wid) << `NT_BITS) + + wtid; + assign csr_read_data[i] = + (csr_addr == `VX_CSR_THREAD_ID) ? `XLEN'(wtid) : + (csr_addr == `VX_CSR_WARP_ID) ? `XLEN'(execute_if[0].data.wid) : + (csr_addr == `VX_CSR_CORE_ID) ? `XLEN'(CORE_ID) : + (csr_addr == `VX_CSR_MHARTID) ? `XLEN'(gtid) : + (csr_addr == `VX_CSR_NUM_THREADS) ? `XLEN'(`NUM_THREADS) : + (csr_addr == `VX_CSR_NUM_WARPS) ? `XLEN'(`NUM_WARPS) : + (csr_addr == `VX_CSR_NUM_CORES) ? `XLEN'(`NUM_CORES * `NUM_CLUSTERS) : + `XLEN'(0); + end + + wire [`NUM_THREADS-1:0] req_tmc_tmask = execute_if[0].data.rs1_data[0][`NUM_THREADS-1:0]; + wire [`XLEN-1:0] req_rs1_data = execute_if[0].data.rs1_data[0]; + wire [`XLEN-1:0] req_rs2_data = execute_if[0].data.rs2_data[0]; + + barrier_t barrier, barrier_r; + assign barrier.valid = is_bar || is_bar_mask; + assign barrier.id = req_rs1_data[`NB_WIDTH-1:0]; +`ifdef GBAR_ENABLE +`ifdef GBAR_CLUSTER_ENABLE + assign barrier.is_global = 1'b1; +`else + assign barrier.is_global = req_rs1_data[31]; +`endif +`else + assign barrier.is_global = 1'b0; +`endif + assign barrier.domain = is_bar_mask ? BARRIER_MASK : req_rs1_data[29:28]; + assign barrier.mask = req_rs2_data[`NUM_WARPS-1:0]; + assign barrier.size_m1 = req_rs2_data[$bits(barrier.size_m1)-1:0] - $bits(barrier.size_m1)'(1); + +`ifdef SIMULATION + wire tensor_ctrl_fire = execute_if[0].valid && execute_if[0].ready && execute_if[0].data.eop; + `RUNTIME_ASSERT( + !tensor_ctrl_fire || is_tmc || is_csr || is_bar || is_bar_mask, + ("%t: *** core%0d-tensor-ctrl-illegal-op: wid=%0d PC=0x%0h op=0x%0h", + $time, CORE_ID, execute_if[0].data.wid, execute_if[0].data.PC, execute_if[0].data.op_type) + ) + `RUNTIME_ASSERT( + !tensor_ctrl_fire || !is_csr + || (csr_addr == `VX_CSR_THREAD_ID) + || (csr_addr == `VX_CSR_WARP_ID) + || (csr_addr == `VX_CSR_CORE_ID) + || (csr_addr == `VX_CSR_MHARTID) + || (csr_addr == `VX_CSR_NUM_THREADS) + || (csr_addr == `VX_CSR_NUM_WARPS) + || (csr_addr == `VX_CSR_NUM_CORES), + ("%t: *** core%0d-tensor-ctrl-illegal-csr: wid=%0d PC=0x%0h csr=0x%0h", + $time, CORE_ID, execute_if[0].data.wid, execute_if[0].data.PC, csr_addr) + ) + `RUNTIME_ASSERT( + !tensor_ctrl_fire || !is_bar_mask || barrier.mask != '0, + ("%t: *** core%0d-tensor-ctrl-invalid-barrier-mask: wid=%0d id=%0d raw_mask=0x%0h", + $time, CORE_ID, execute_if[0].data.wid, barrier.id, barrier.mask) + ) +`endif + + wire rsp_valid_in = execute_if[0].valid; + wire rsp_ready_in; + assign execute_if[0].ready = rsp_ready_in; + + wire [`NUM_THREADS-1:0] tmc_tmask_r; + wire is_tmc_r; + wire is_csr_r; + + VX_elastic_buffer #( + .DATAW (DATAW), + .SIZE (2) + ) rsp_buf ( + .clk (clk), + .reset (reset), + .valid_in (rsp_valid_in), + .ready_in (rsp_ready_in), + .data_in ({execute_if[0].data.uuid, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.rd, execute_if[0].data.wb, csr_read_data, 1'b0, execute_if[0].data.pid, execute_if[0].data.sop, execute_if[0].data.eop, is_tmc, is_csr, req_tmc_tmask, barrier}), + .data_out ({ctrl_commit_if[0].data.uuid, ctrl_commit_if[0].data.wid, ctrl_commit_if[0].data.tmask, ctrl_commit_if[0].data.PC, ctrl_commit_if[0].data.rd, ctrl_commit_if[0].data.wb, ctrl_commit_if[0].data.data, ctrl_commit_if[0].data.tensor, ctrl_commit_if[0].data.pid, ctrl_commit_if[0].data.sop, ctrl_commit_if[0].data.eop, is_tmc_r, is_csr_r, tmc_tmask_r, barrier_r}), + .valid_out (ctrl_commit_if[0].valid), + .ready_out (ctrl_commit_if[0].ready) + ); + + assign csr_unlock_valid = ctrl_commit_if[0].valid && ctrl_commit_if[0].ready && ctrl_commit_if[0].data.eop && is_csr_r; + assign csr_unlock_wid = ctrl_commit_if[0].data.wid; + + assign tmc_valid = ctrl_commit_if[0].valid && ctrl_commit_if[0].ready && ctrl_commit_if[0].data.eop && is_tmc_r; + assign tmc_wid = ctrl_commit_if[0].data.wid; + assign tmc_tmask = tmc_tmask_r; + + assign warp_ctl_if.valid = ctrl_commit_if[0].valid && ctrl_commit_if[0].ready && ctrl_commit_if[0].data.eop && barrier_r.valid; + assign warp_ctl_if.wid = ctrl_commit_if[0].data.wid; + assign warp_ctl_if.tmc = '0; + assign warp_ctl_if.wspawn = '0; + assign warp_ctl_if.split = '0; + assign warp_ctl_if.sjoin = '0; + assign warp_ctl_if.barrier = barrier_r; + + VX_gather_unit #( + .BLOCK_SIZE (BLOCK_SIZE), + .NUM_LANES (NUM_LANES), + .OUT_REG (1) + ) gather_unit ( + .clk (clk), + .reset (reset), + .commit_in_if (ctrl_commit_if), + .commit_out_if (commit_if) + ); + +endmodule diff --git a/hw/rtl/core/VX_trace.vh b/hw/rtl/core/VX_trace.vh index de0775ef..2f7ceeee 100644 --- a/hw/rtl/core/VX_trace.vh +++ b/hw/rtl/core/VX_trace.vh @@ -360,6 +360,7 @@ task trace_ex_op(input int level, `INST_SFU_JOIN: `TRACE(level, ("JOIN")); `INST_SFU_BAR: `TRACE(level, ("BAR")); `INST_SFU_PRED: `TRACE(level, ("PRED")); + `INST_SFU_BAR_MASK:`TRACE(level, ("BAR_MASK")); `INST_SFU_CSRRW: begin if (use_imm) `TRACE(level, ("CSRRWI")); else `TRACE(level, ("CSRRW")); end `INST_SFU_CSRRS: begin if (use_imm) `TRACE(level, ("CSRRSI")); else `TRACE(level, ("CSRRS")); end `INST_SFU_CSRRC: begin if (use_imm) `TRACE(level, ("CSRRCI")); else `TRACE(level, ("CSRRC")); end diff --git a/hw/rtl/core/VX_wctl_unit.sv b/hw/rtl/core/VX_wctl_unit.sv index 36144018..b0f64f72 100644 --- a/hw/rtl/core/VX_wctl_unit.sv +++ b/hw/rtl/core/VX_wctl_unit.sv @@ -32,7 +32,8 @@ module VX_wctl_unit import VX_gpu_pkg::*; #( localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES); localparam PID_WIDTH = `UP(PID_BITS); localparam WCTL_WIDTH = $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t); - localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + WCTL_WIDTH + 1 + PID_WIDTH + 1 + 1; + localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + + 1 + WCTL_WIDTH + 1 + PID_WIDTH + 1 + 1; `UNUSED_VAR (execute_if.data.rs3_data) @@ -43,11 +44,13 @@ module VX_wctl_unit import VX_gpu_pkg::*; #( barrier_t barrier, barrier_r; wire is_wspawn = (execute_if.data.op_type == `INST_SFU_WSPAWN); + wire is_wspawn_mask = is_wspawn && execute_if.data.op_mod[0]; wire is_tmc = (execute_if.data.op_type == `INST_SFU_TMC); wire is_pred = (execute_if.data.op_type == `INST_SFU_PRED); wire is_split = (execute_if.data.op_type == `INST_SFU_SPLIT); wire is_join = (execute_if.data.op_type == `INST_SFU_JOIN); wire is_bar = (execute_if.data.op_type == `INST_SFU_BAR); + wire is_bar_mask = (execute_if.data.op_type == `INST_SFU_BAR_MASK); wire [`UP(LANE_BITS)-1:0] tid; if (LANE_BITS != 0) begin @@ -106,7 +109,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #( assign sjoin.is_dvg = rs1_data[0]; // barrier - assign barrier.valid = is_bar; + assign barrier.valid = is_bar || is_bar_mask; assign barrier.id = rs1_data[`NB_WIDTH-1:0]; `ifdef GBAR_ENABLE `ifdef GBAR_CLUSTER_ENABLE @@ -119,18 +122,35 @@ module VX_wctl_unit import VX_gpu_pkg::*; #( `else assign barrier.is_global = 1'b0; `endif + assign barrier.domain = is_bar_mask ? BARRIER_MASK : rs1_data[29:28]; + assign barrier.mask = rs2_data[`NUM_WARPS-1:0]; assign barrier.size_m1 = rs2_data[$bits(barrier.size_m1)-1:0] - $bits(barrier.size_m1)'(1); // wspawn - wire [`NUM_WARPS-1:0] wspawn_wmask; + wire [`NUM_WARPS-1:0] legacy_wspawn_wmask; for (genvar i = 0; i < `NUM_WARPS; ++i) begin - assign wspawn_wmask[i] = (i < rs1_data[`NW_BITS:0]) && (i != execute_if.data.wid); + assign legacy_wspawn_wmask[i] = (i < rs1_data[`NW_BITS:0]) && (i != execute_if.data.wid); end + wire [`NUM_WARPS-1:0] masked_wspawn_wmask = rs1_data[`NUM_WARPS-1:0] & ~(`NUM_WARPS'(1) << execute_if.data.wid); assign wspawn.valid = is_wspawn; - assign wspawn.wmask = wspawn_wmask; + assign wspawn.wmask = is_wspawn_mask ? masked_wspawn_wmask : legacy_wspawn_wmask; assign wspawn.pc = rs2_data; +`ifdef SIMULATION + wire wctl_fire = execute_if.valid && execute_if.ready && execute_if.data.eop; + `RUNTIME_ASSERT( + !wctl_fire || !is_wspawn_mask || masked_wspawn_wmask != '0, + ("%t: *** core%0d-wctl-invalid-wspawn-mask: wid=%0d raw_mask=0x%0h pc=0x%0h", + $time, CORE_ID, execute_if.data.wid, rs1_data[`NUM_WARPS-1:0], rs2_data) + ) + `RUNTIME_ASSERT( + !wctl_fire || !is_bar_mask || barrier.mask != '0, + ("%t: *** core%0d-wctl-invalid-barrier-mask: wid=%0d id=%0d raw_mask=0x%0h", + $time, CORE_ID, execute_if.data.wid, barrier.id, barrier.mask) + ) +`endif + // response VX_elastic_buffer #( diff --git a/hw/rtl/interfaces/VX_pipeline_perf_if.sv b/hw/rtl/interfaces/VX_pipeline_perf_if.sv index fc57cad9..f4e43624 100644 --- a/hw/rtl/interfaces/VX_pipeline_perf_if.sv +++ b/hw/rtl/interfaces/VX_pipeline_perf_if.sv @@ -17,6 +17,13 @@ interface VX_pipeline_perf_if (); wire [`PERF_CTR_BITS-1:0] sched_idles; wire [`PERF_CTR_BITS-1:0] sched_stalls; wire [`PERF_CTR_BITS-1:0] sched_barrier_idles; + wire [`PERF_CTR_BITS-1:0] scalar_sched_ready_cycles; + wire [`PERF_CTR_BITS-1:0] tensor_sched_ready_cycles; + wire [`PERF_CTR_BITS-1:0] scalar_sched_issued_cycles; + wire [`PERF_CTR_BITS-1:0] tensor_sched_issued_cycles; + wire [`PERF_CTR_BITS-1:0] illegal_tensor_reg_access; + wire [`PERF_CTR_BITS-1:0] illegal_tensor_scalar_op; + wire [`PERF_CTR_BITS-1:0] illegal_scalar_tensor_op; wire [`PERF_CTR_BITS-1:0] ibf_stalls; wire [`PERF_CTR_BITS-1:0] scb_stalls; wire [`PERF_CTR_BITS-1:0] scb_any_unit_uses; @@ -34,11 +41,26 @@ interface VX_pipeline_perf_if (); wire [`PERF_CTR_BITS-1:0] stores; wire [`PERF_CTR_BITS-1:0] ifetch_latency; wire [`PERF_CTR_BITS-1:0] load_latency; + wire [`PERF_CTR_BITS-1:0] scalar_lsu_reqs; + wire [`PERF_CTR_BITS-1:0] tensor_lsu_reqs; + wire [`PERF_CTR_BITS-1:0] scalar_lsu_stalls; + wire [`PERF_CTR_BITS-1:0] tensor_lsu_stalls; + wire [`PERF_CTR_BITS-1:0] mem_merge_stalls; modport schedule ( output sched_idles, output sched_barrier_idles, - output sched_stalls + output sched_stalls, + output scalar_sched_ready_cycles, + output tensor_sched_ready_cycles, + output scalar_sched_issued_cycles, + output tensor_sched_issued_cycles + ); + + modport decode ( + output illegal_tensor_reg_access, + output illegal_tensor_scalar_op, + output illegal_scalar_tensor_op ); modport issue ( @@ -55,10 +77,25 @@ interface VX_pipeline_perf_if (); output dispatch_any_fire_cycles ); + modport execute ( + output scalar_lsu_reqs, + output tensor_lsu_reqs, + output scalar_lsu_stalls, + output tensor_lsu_stalls, + output mem_merge_stalls + ); + modport slave ( input sched_idles, input sched_barrier_idles, input sched_stalls, + input scalar_sched_ready_cycles, + input tensor_sched_ready_cycles, + input scalar_sched_issued_cycles, + input tensor_sched_issued_cycles, + input illegal_tensor_reg_access, + input illegal_tensor_scalar_op, + input illegal_scalar_tensor_op, input ibf_stalls, input scb_stalls, input scb_any_unit_uses, @@ -74,7 +111,12 @@ interface VX_pipeline_perf_if (); input loads, input stores, input ifetch_latency, - input load_latency + input load_latency, + input scalar_lsu_reqs, + input tensor_lsu_reqs, + input scalar_lsu_stalls, + input tensor_lsu_stalls, + input mem_merge_stalls ); endinterface diff --git a/kernel/include/VX_config.h b/kernel/include/VX_config.h new file mode 100644 index 00000000..b2c28f52 --- /dev/null +++ b/kernel/include/VX_config.h @@ -0,0 +1,701 @@ +// auto-generated by gen_config.py. DO NOT EDIT +// Generated at 2024-05-07 13:55:58.398687 + +// Translated from ./rtl/VX_config.vh: + +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef VX_CONFIG_VH +#define VX_CONFIG_VH + +#ifndef MIN +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) +#endif + +#ifndef MAX +#define MAX(x, y) (((x) > (y)) ? (x) : (y)) +#endif + +#ifndef CLAMP +#define CLAMP(x, lo, hi) (((x) > (hi)) ? (hi) : (((x) < (lo)) ? (lo) : (x))) +#endif + +#ifndef UP +#define UP(x) (((x) != 0) ? (x) : 1) +#endif + +/////////////////////////////////////////////////////////////////////////////// + +#ifndef EXT_M_DISABLE +#define EXT_M_ENABLE +#endif + +#ifndef EXT_F_DISABLE +#define EXT_F_ENABLE +#endif + +#ifndef XLEN_32 +#ifndef XLEN_64 +#define XLEN_32 +#endif +#endif + +#ifdef XLEN_64 +#define XLEN 64 +#endif + +#ifdef XLEN_32 +#define XLEN 32 +#endif + +#ifdef EXT_D_ENABLE +#define FLEN_64 +#else +#define FLEN_32 +#endif + +#ifdef FLEN_64 +#define FLEN 64 +#endif + +#ifdef FLEN_32 +#define FLEN 32 +#endif + +#ifdef XLEN_64 +#ifdef FLEN_32 + #define FPU_RV64F +#endif +#endif + +#ifndef NUM_CLUSTERS +#define NUM_CLUSTERS 1 +#endif + +#ifndef NUM_CORES +#define NUM_CORES 1 +#endif + +#ifndef NUM_WARPS +#define NUM_WARPS 4 +#endif + +#ifndef NUM_TENSOR_WARPS +#define NUM_TENSOR_WARPS 2 +#endif + +#define NUM_SCALAR_WARPS (NUM_WARPS - NUM_TENSOR_WARPS) + +#define IS_SCALAR_WARP(wid) ((wid) < NUM_SCALAR_WARPS) +#define IS_TENSOR_WARP(wid) ((wid) >= NUM_SCALAR_WARPS) + +#ifndef TENSOR_NUM_GPRS +#define TENSOR_NUM_GPRS 8 +#endif + +#ifndef TENSOR_NUM_FPRS +#define TENSOR_NUM_FPRS 8 +#endif + +#ifndef NUM_THREADS +#define NUM_THREADS 4 +#endif + +#ifndef NUM_BARRIERS +#define NUM_BARRIERS 8 +#endif + +#ifndef SOCKET_SIZE +#define SOCKET_SIZE MIN(4, NUM_CORES) +#endif +#define NUM_SOCKETS UP(NUM_CORES / SOCKET_SIZE) + +#ifdef L2_ENABLE + #define L2_ENABLED 1 +#else + #define L2_ENABLED 0 +#endif + +#ifdef L3_ENABLE + #define L3_ENABLED 1 +#else + #define L3_ENABLED 0 +#endif + +#ifdef L1_DISABLE + #define ICACHE_DISABLE + #define DCACHE_DISABLE +#endif + +#ifndef MEM_BLOCK_SIZE +#define MEM_BLOCK_SIZE 64 +#endif + +#ifndef MEM_ADDR_WIDTH +#ifdef XLEN_64 +#define MEM_ADDR_WIDTH 48 +#else +#define MEM_ADDR_WIDTH 32 +#endif +#endif + +#ifndef L1_LINE_SIZE +#ifdef L1_DISABLE +#define L1_LINE_SIZE ((L2_ENABLED || L3_ENABLED) ? 4 : MEM_BLOCK_SIZE) +#else +#define L1_LINE_SIZE ((L2_ENABLED || L3_ENABLED) ? 16 : MEM_BLOCK_SIZE) +#endif +#endif + +#ifdef L2_ENABLE +#define L2_LINE_SIZE MEM_BLOCK_SIZE +#else +#define L2_LINE_SIZE L1_LINE_SIZE +#endif + +#ifdef L3_ENABLE +#define L3_LINE_SIZE MEM_BLOCK_SIZE +#else +#define L3_LINE_SIZE L2_LINE_SIZE +#endif + +#ifdef XLEN_64 + +#ifndef STARTUP_ADDR +#define STARTUP_ADDR 0x180000000 +#endif + +#ifndef STACK_BASE_ADDR +#define STACK_BASE_ADDR 0x1FF000000 +#endif + +#else + +#ifndef STARTUP_ADDR +#define STARTUP_ADDR 0x80000000 +#endif + +#ifndef STACK_BASE_ADDR +#define STACK_BASE_ADDR 0xFF000000 +#endif + +#endif + +#ifndef SMEM_BASE_ADDR +#define SMEM_BASE_ADDR STACK_BASE_ADDR +#endif + +#ifndef SMEM_LOG_SIZE +#define SMEM_LOG_SIZE 19 +#endif + +#ifndef IO_BASE_ADDR +#define IO_BASE_ADDR (SMEM_BASE_ADDR + (1 << SMEM_LOG_SIZE)) +#endif + +#ifndef IO_COUT_ADDR +#define IO_COUT_ADDR IO_BASE_ADDR +#endif +#define IO_COUT_SIZE MEM_BLOCK_SIZE + +#ifndef IO_CSR_ADDR +#define IO_CSR_ADDR (IO_COUT_ADDR + IO_COUT_SIZE) +#endif +#define IO_CSR_SIZE (4 * 64 * NUM_CORES * NUM_CLUSTERS) + +#ifndef STACK_LOG2_SIZE +#define STACK_LOG2_SIZE 13 +#endif +#define STACK_SIZE (1 << STACK_LOG2_SIZE) + +#define RESET_DELAY 8 + +#ifndef STALL_TIMEOUT +#define STALL_TIMEOUT (100000 * (1 ** (L2_ENABLED + L3_ENABLED))) +#endif + +#ifndef SV_DPI +#define DPI_DISABLE +#endif + +#ifndef FPU_FPNEW +#ifndef FPU_DSP +#ifndef FPU_DPI +#ifndef SYNTHESIS +#ifndef DPI_DISABLE +#define FPU_DPI +#else +#define FPU_DSP +#endif +#else +#define FPU_DSP +#endif +#endif +#endif +#endif + +#ifndef SYNTHESIS +#ifndef DPI_DISABLE +#define IMUL_DPI +#define IDIV_DPI +#endif +#endif + +#ifndef DEBUG_LEVEL +#define DEBUG_LEVEL 3 +#endif + +// Pipeline Configuration ///////////////////////////////////////////////////// + +// Issue width +#ifndef ISSUE_WIDTH +#define ISSUE_WIDTH NUM_WARPS +#endif + +// Number of ALU units +#ifndef NUM_ALU_LANES +#define NUM_ALU_LANES NUM_THREADS +#endif +#ifndef NUM_ALU_BLOCKS +#define NUM_ALU_BLOCKS 4 +#endif + +// Number of FPU units +#ifndef NUM_FPU_LANES +#define NUM_FPU_LANES NUM_THREADS +#endif +#ifndef NUM_FPU_BLOCKS +#define NUM_FPU_BLOCKS 2 +#endif + +// Number of LSU units +#ifndef NUM_LSU_LANES +#define NUM_LSU_LANES NUM_THREADS +#endif + +// Number of SFU units +#ifndef NUM_SFU_LANES +#define NUM_SFU_LANES MIN(NUM_THREADS, 4) +#endif + +// Size of Instruction Buffer +#ifndef IBUF_SIZE +#define IBUF_SIZE (4 * ISSUE_WIDTH) +#endif + +// Size of LSU Request Queue +#ifndef LSUQ_SIZE +#define LSUQ_SIZE (4 * NUM_WARPS * (NUM_THREADS / NUM_LSU_LANES)) +#endif + +// LSU Duplicate Address Check +#ifndef LSU_DUP_DISABLE +#define LSU_DUP_ENABLE +#endif +#ifdef LSU_DUP_ENABLE +#define LSU_DUP_ENABLED 1 +#else +#define LSU_DUP_ENABLED 0 +#endif + +#ifdef GBAR_ENABLE +#define GBAR_ENABLED 1 +#else +#define GBAR_ENABLED 0 +#endif + +#ifndef LATENCY_IMUL +#ifdef VIVADO +#define LATENCY_IMUL 4 +#endif +#ifdef QUARTUS +#define LATENCY_IMUL 3 +#endif +#ifndef LATENCY_IMUL +#define LATENCY_IMUL 4 +#endif +#endif + +// Floating-Point Units /////////////////////////////////////////////////////// + +// Size of FPU Request Queue +#ifndef FPUQ_SIZE +#define FPUQ_SIZE (2 * (NUM_THREADS / NUM_FPU_LANES)) +#endif + +// FNCP Latency +#ifndef LATENCY_FNCP +#define LATENCY_FNCP 2 +#endif + +// FMA Latency +#ifndef LATENCY_FMA +#ifdef FPU_DPI +#define LATENCY_FMA 4 +#endif +#ifdef FPU_FPNEW +#define LATENCY_FMA 4 +#endif +#ifdef FPU_DSP +#ifdef QUARTUS +#define LATENCY_FMA 4 +#endif +#ifdef VIVADO +#define LATENCY_FMA 16 +#endif +#ifndef LATENCY_FMA +#define LATENCY_FMA 4 +#endif +#endif +#endif + +// FDIV Latency +#ifndef LATENCY_FDIV +#ifdef FPU_DPI +#define LATENCY_FDIV 15 +#endif +#ifdef FPU_FPNEW +#define LATENCY_FDIV 16 +#endif +#ifdef FPU_DSP +#ifdef QUARTUS +#define LATENCY_FDIV 15 +#endif +#ifdef VIVADO +#define LATENCY_FDIV 28 +#endif +#ifndef LATENCY_FDIV +#define LATENCY_FDIV 16 +#endif +#endif +#endif + +// FSQRT Latency +#ifndef LATENCY_FSQRT +#ifdef FPU_DPI +#define LATENCY_FSQRT 10 +#endif +#ifdef FPU_FPNEW +#define LATENCY_FSQRT 16 +#endif +#ifdef FPU_DSP +#ifdef QUARTUS +#define LATENCY_FSQRT 10 +#endif +#ifdef VIVADO +#define LATENCY_FSQRT 28 +#endif +#ifndef LATENCY_FSQRT +#define LATENCY_FSQRT 16 +#endif +#endif +#endif + +// FCVT Latency +#ifndef LATENCY_FCVT +#define LATENCY_FCVT 5 +#endif + +// Icache Configurable Knobs ////////////////////////////////////////////////// + +// Cache Enable +#ifndef ICACHE_DISABLE +#define ICACHE_ENABLE +#endif +#ifdef ICACHE_ENABLE + #define ICACHE_ENABLED 1 +#else + #define ICACHE_ENABLED 0 + #define NUM_ICACHES 0 +#endif + +// Number of Cache Units +#ifndef NUM_ICACHES +#define NUM_ICACHES UP(SOCKET_SIZE / 4) +#endif + +// Cache Size +#ifndef ICACHE_SIZE +#define ICACHE_SIZE 16384 +#endif + +// Core Response Queue Size +#ifndef ICACHE_CRSQ_SIZE +#define ICACHE_CRSQ_SIZE 2 +#endif + +// Miss Handling Register Size +#ifndef ICACHE_MSHR_SIZE +#define ICACHE_MSHR_SIZE 16 +#endif + +// Memory Request Queue Size +#ifndef ICACHE_MREQ_SIZE +#define ICACHE_MREQ_SIZE 4 +#endif + +// Memory Response Queue Size +#ifndef ICACHE_MRSQ_SIZE +#define ICACHE_MRSQ_SIZE 0 +#endif + +// Number of Associative Ways +#ifndef ICACHE_NUM_WAYS +#define ICACHE_NUM_WAYS 1 +#endif + +// Dcache Configurable Knobs ////////////////////////////////////////////////// + +// Cache Enable +#ifndef DCACHE_DISABLE +#define DCACHE_ENABLE +#endif +#ifdef DCACHE_ENABLE + #define DCACHE_ENABLED 1 +#else + #define DCACHE_ENABLED 0 + #define NUM_DCACHES 0 + #define DCACHE_NUM_BANKS 1 +#endif + +// Number of Cache Units +#ifndef NUM_DCACHES +#define NUM_DCACHES UP(SOCKET_SIZE / 4) +#endif + +// Cache Size +#ifndef DCACHE_SIZE +#define DCACHE_SIZE 16384 +#endif + +// Number of Banks +#ifndef DCACHE_NUM_BANKS +#define DCACHE_NUM_BANKS NUM_LSU_LANES +#endif + +// Core Response Queue Size +#ifndef DCACHE_CRSQ_SIZE +#define DCACHE_CRSQ_SIZE 2 +#endif + +// Miss Handling Register Size +#ifndef DCACHE_MSHR_SIZE +#define DCACHE_MSHR_SIZE 8 +#endif + +// Memory Request Queue Size +#ifndef DCACHE_MREQ_SIZE +#define DCACHE_MREQ_SIZE 4 +#endif + +// Memory Response Queue Size +#ifndef DCACHE_MRSQ_SIZE +#define DCACHE_MRSQ_SIZE 0 +#endif + +// Number of Associative Ways +#ifndef DCACHE_NUM_WAYS +#define DCACHE_NUM_WAYS 1 +#endif + +// SM Configurable Knobs ////////////////////////////////////////////////////// + +#ifndef SM_DISABLE +#define SM_ENABLE +#endif + +#ifdef SM_ENABLE + #define SM_ENABLED 1 +#else + #define SM_ENABLED 0 + #define SMEM_NUM_BANKS 1 +#endif + +// Number of Banks +#ifndef SMEM_NUM_BANKS +#define SMEM_NUM_BANKS (NUM_LSU_LANES) +#endif + +// L2cache Configurable Knobs ///////////////////////////////////////////////// + +// Cache Size +#ifndef L2_CACHE_SIZE +#ifdef ALTERA_S10 +#define L2_CACHE_SIZE 2097152 +#else +#define L2_CACHE_SIZE 1048576 +#endif +#endif + +// Number of Banks +#ifndef L2_NUM_BANKS +#define L2_NUM_BANKS MIN(4, NUM_SOCKETS) +#endif + +// Core Response Queue Size +#ifndef L2_CRSQ_SIZE +#define L2_CRSQ_SIZE 2 +#endif + +// Miss Handling Register Size +#ifndef L2_MSHR_SIZE +#define L2_MSHR_SIZE 16 +#endif + +// Memory Request Queue Size +#ifndef L2_MREQ_SIZE +#define L2_MREQ_SIZE 4 +#endif + +// Memory Response Queue Size +#ifndef L2_MRSQ_SIZE +#define L2_MRSQ_SIZE 0 +#endif + +// Number of Associative Ways +#ifndef L2_NUM_WAYS +#define L2_NUM_WAYS 2 +#endif + +// L3cache Configurable Knobs ///////////////////////////////////////////////// + +// Cache Size +#ifndef L3_CACHE_SIZE +#ifdef ALTERA_S10 +#define L3_CACHE_SIZE 2097152 +#else +#define L3_CACHE_SIZE 1048576 +#endif +#endif + +// Number of Banks +#ifndef L3_NUM_BANKS +#define L3_NUM_BANKS MIN(4, NUM_CLUSTERS) +#endif + +// Core Response Queue Size +#ifndef L3_CRSQ_SIZE +#define L3_CRSQ_SIZE 2 +#endif + +// Miss Handling Register Size +#ifndef L3_MSHR_SIZE +#define L3_MSHR_SIZE 16 +#endif + +// Memory Request Queue Size +#ifndef L3_MREQ_SIZE +#define L3_MREQ_SIZE 4 +#endif + +// Memory Response Queue Size +#ifndef L3_MRSQ_SIZE +#define L3_MRSQ_SIZE 0 +#endif + +// Number of Associative Ways +#ifndef L3_NUM_WAYS +#define L3_NUM_WAYS 4 +#endif + +// ISA Extensions ///////////////////////////////////////////////////////////// + +#ifdef EXT_A_ENABLE + #define EXT_A_ENABLED 1 +#else + #define EXT_A_ENABLED 0 +#endif + +#ifdef EXT_C_ENABLE + #define EXT_C_ENABLED 1 +#else + #define EXT_C_ENABLED 0 +#endif + +#ifdef EXT_D_ENABLE + #define EXT_D_ENABLED 1 +#else + #define EXT_D_ENABLED 0 +#endif + +#ifdef EXT_F_ENABLE + #define EXT_F_ENABLED 1 +#else + #define EXT_F_ENABLED 0 +#endif + +#ifdef EXT_M_ENABLE + #define EXT_M_ENABLED 1 +#else + #define EXT_M_ENABLED 0 +#endif + +#define ISA_STD_A 0 +#define ISA_STD_C 2 +#define ISA_STD_D 3 +#define ISA_STD_E 4 +#define ISA_STD_F 5 +#define ISA_STD_H 7 +#define ISA_STD_I 8 +#define ISA_STD_N 13 +#define ISA_STD_Q 16 +#define ISA_STD_S 18 +#define ISA_STD_U 20 + +#define ISA_EXT_ICACHE 0 +#define ISA_EXT_DCACHE 1 +#define ISA_EXT_L2CACHE 2 +#define ISA_EXT_L3CACHE 3 +#define ISA_EXT_SMEM 4 + +#define MISA_EXT (ICACHE_ENABLED << ISA_EXT_ICACHE) \ + | (DCACHE_ENABLED << ISA_EXT_DCACHE) \ + | (L2_ENABLED << ISA_EXT_L2CACHE) \ + | (L3_ENABLED << ISA_EXT_L3CACHE) \ + | (SM_ENABLED << ISA_EXT_SMEM) + +#define MISA_STD (EXT_A_ENABLED << 0) /* A - Atomic Instructions extension */ \ + | (0 << 1) /* B - Tentatively reserved for Bit operations extension */ \ + | (EXT_C_ENABLED << 2) /* C - Compressed extension */ \ + | (EXT_D_ENABLED << 3) /* D - Double precsision floating-point extension */ \ + | (0 << 4) /* E - RV32E base ISA */ \ + | (EXT_F_ENABLED << 5) /* F - Single precsision floating-point extension */ \ + | (0 << 6) /* G - Additional standard extensions present */ \ + | (0 << 7) /* H - Hypervisor mode implemented */ \ + | (1 << 8) /* I - RV32I/64I/128I base ISA */ \ + | (0 << 9) /* J - Reserved */ \ + | (0 << 10) /* K - Reserved */ \ + | (0 << 11) /* L - Tentatively reserved for Bit operations extension */ \ + | (EXT_M_ENABLED << 12) /* M - Integer Multiply/Divide extension */ \ + | (0 << 13) /* N - User level interrupts supported */ \ + | (0 << 14) /* O - Reserved */ \ + | (0 << 15) /* P - Tentatively reserved for Packed-SIMD extension */ \ + | (0 << 16) /* Q - Quad-precision floating-point extension */ \ + | (0 << 17) /* R - Reserved */ \ + | (0 << 18) /* S - Supervisor mode implemented */ \ + | (0 << 19) /* T - Tentatively reserved for Transactional Memory extension */ \ + | (1 << 20) /* U - User mode implemented */ \ + | (0 << 21) /* V - Tentatively reserved for Vector extension */ \ + | (0 << 22) /* W - Reserved */ \ + | (1 << 23) /* X - Non-standard extensions present */ \ + | (0 << 24) /* Y - Reserved */ \ + | (0 << 25) /* Z - Reserved */ + +// Device identification ////////////////////////////////////////////////////// + +#define VENDOR_ID 0 +#define ARCHITECTURE_ID 0 +#define IMPLEMENTATION_ID 0 + +#endif // VX_CONFIG_VH diff --git a/kernel/include/VX_types.h b/kernel/include/VX_types.h new file mode 100644 index 00000000..4e2cdf12 --- /dev/null +++ b/kernel/include/VX_types.h @@ -0,0 +1,193 @@ +// auto-generated by gen_config.py. DO NOT EDIT +// Generated at 2024-06-15 00:25:12.935689 + +// Translated from ./rtl/VX_types.vh: + +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef VX_TYPES_VH +#define VX_TYPES_VH + +// Device configuration registers + +#define VX_CSR_ADDR_BITS 12 +#define VX_DCR_ADDR_BITS 12 + +#define VX_DCR_BASE_STATE_BEGIN 0x001 +#define VX_DCR_BASE_STARTUP_ADDR0 0x001 +#define VX_DCR_BASE_STARTUP_ADDR1 0x002 +#define VX_DCR_BASE_MPM_CLASS 0x003 +#define VX_DCR_BASE_STATE_END 0x004 + +#define VX_DCR_BASE_STATE(addr) ((addr) - VX_DCR_BASE_STATE_BEGIN) +#define VX_DCR_BASE_STATE_COUNT (VX_DCR_BASE_STATE_END-VX_DCR_BASE_STATE_BEGIN) + +// Machine Performance-monitoring counters classes + +#define VX_DCR_MPM_CLASS_NONE 0 +#define VX_DCR_MPM_CLASS_CORE 1 +#define VX_DCR_MPM_CLASS_MEM 2 + +// User Floating-Point CSRs + +#define VX_CSR_FFLAGS 0x001 +#define VX_CSR_FRM 0x002 +#define VX_CSR_FCSR 0x003 + +#define VX_CSR_SATP 0x180 + +#define VX_CSR_PMPCFG0 0x3A0 +#define VX_CSR_PMPADDR0 0x3B0 + +#define VX_CSR_MSTATUS 0x300 +#define VX_CSR_MISA 0x301 +#define VX_CSR_MEDELEG 0x302 +#define VX_CSR_MIDELEG 0x303 +#define VX_CSR_MIE 0x304 +#define VX_CSR_MTVEC 0x305 + +#define VX_CSR_MEPC 0x341 + +#define VX_CSR_MNSTATUS 0x744 + +#define VX_CSR_MPM_BASE 0xB00 +#define VX_CSR_MPM_BASE_H 0xB80 +#define VX_CSR_MPM_USER 0xB03 +#define VX_CSR_MPM_USER_H 0xB83 + +// Machine Performance-monitoring core counters +// PERF: Standard +#define VX_CSR_MCYCLE 0xB00 +#define VX_CSR_MCYCLE_H 0xB80 +#define VX_CSR_MPM_RESERVED 0xB01 +#define VX_CSR_MPM_RESERVED_H 0xB81 +#define VX_CSR_MINSTRET 0xB02 +#define VX_CSR_MINSTRET_H 0xB82 +// PERF: pipeline +#define VX_CSR_MPM_SCHED_ID 0xB03 +#define VX_CSR_MPM_SCHED_ID_H 0xB83 +#define VX_CSR_MPM_SCHED_ST 0xB04 +#define VX_CSR_MPM_SCHED_ST_H 0xB84 +#define VX_CSR_MPM_IBUF_ST 0xB05 +#define VX_CSR_MPM_IBUF_ST_H 0xB85 +#define VX_CSR_MPM_SCRB_ST 0xB06 +#define VX_CSR_MPM_SCRB_ST_H 0xB86 +#define VX_CSR_MPM_SCRB_ALU 0xB07 +#define VX_CSR_MPM_SCRB_ALU_H 0xB87 +#define VX_CSR_MPM_SCRB_FPU 0xB08 +#define VX_CSR_MPM_SCRB_FPU_H 0xB88 +#define VX_CSR_MPM_SCRB_LSU 0xB09 +#define VX_CSR_MPM_SCRB_LSU_H 0xB89 +#define VX_CSR_MPM_SCRB_SFU 0xB0A +#define VX_CSR_MPM_SCRB_SFU_H 0xB8A +// PERF: memory +#define VX_CSR_MPM_IFETCHES 0xB0B +#define VX_CSR_MPM_IFETCHES_H 0xB8B +#define VX_CSR_MPM_LOADS 0xB0C +#define VX_CSR_MPM_LOADS_H 0xB8C +#define VX_CSR_MPM_STORES 0xB0D +#define VX_CSR_MPM_STORES_H 0xB8D +#define VX_CSR_MPM_IFETCH_LT 0xB0E +#define VX_CSR_MPM_IFETCH_LT_H 0xB8E +#define VX_CSR_MPM_LOAD_LT 0xB0F +#define VX_CSR_MPM_LOAD_LT_H 0xB8F +// SFU: scoreboard +#define VX_CSR_MPM_SCRB_WCTL 0xB10 +#define VX_CSR_MPM_SCRB_WCTL_H 0xB90 +#define VX_CSR_MPM_SCRB_CSRS 0xB11 +#define VX_CSR_MPM_SCRB_CSRS_H 0xB91 + +// Machine Performance-monitoring memory counters +// PERF: icache +#define VX_CSR_MPM_ICACHE_READS 0xB03 // total reads +#define VX_CSR_MPM_ICACHE_READS_H 0xB83 +#define VX_CSR_MPM_ICACHE_MISS_R 0xB04 // read misses +#define VX_CSR_MPM_ICACHE_MISS_R_H 0xB84 +#define VX_CSR_MPM_ICACHE_MSHR_ST 0xB05 // MSHR stalls +#define VX_CSR_MPM_ICACHE_MSHR_ST_H 0xB85 +// PERF: dcache +#define VX_CSR_MPM_DCACHE_READS 0xB06 // total reads +#define VX_CSR_MPM_DCACHE_READS_H 0xB86 +#define VX_CSR_MPM_DCACHE_WRITES 0xB07 // total writes +#define VX_CSR_MPM_DCACHE_WRITES_H 0xB87 +#define VX_CSR_MPM_DCACHE_MISS_R 0xB08 // read misses +#define VX_CSR_MPM_DCACHE_MISS_R_H 0xB88 +#define VX_CSR_MPM_DCACHE_MISS_W 0xB09 // write misses +#define VX_CSR_MPM_DCACHE_MISS_W_H 0xB89 +#define VX_CSR_MPM_DCACHE_BANK_ST 0xB0A // bank conflicts +#define VX_CSR_MPM_DCACHE_BANK_ST_H 0xB8A +#define VX_CSR_MPM_DCACHE_MSHR_ST 0xB0B // MSHR stalls +#define VX_CSR_MPM_DCACHE_MSHR_ST_H 0xB8B +// PERF: l2cache +#define VX_CSR_MPM_L2CACHE_READS 0xB0C // total reads +#define VX_CSR_MPM_L2CACHE_READS_H 0xB8C +#define VX_CSR_MPM_L2CACHE_WRITES 0xB0D // total writes +#define VX_CSR_MPM_L2CACHE_WRITES_H 0xB8D +#define VX_CSR_MPM_L2CACHE_MISS_R 0xB0E // read misses +#define VX_CSR_MPM_L2CACHE_MISS_R_H 0xB8E +#define VX_CSR_MPM_L2CACHE_MISS_W 0xB0F // write misses +#define VX_CSR_MPM_L2CACHE_MISS_W_H 0xB8F +#define VX_CSR_MPM_L2CACHE_BANK_ST 0xB10 // bank conflicts +#define VX_CSR_MPM_L2CACHE_BANK_ST_H 0xB90 +#define VX_CSR_MPM_L2CACHE_MSHR_ST 0xB11 // MSHR stalls +#define VX_CSR_MPM_L2CACHE_MSHR_ST_H 0xB91 +// PERF: l3cache +#define VX_CSR_MPM_L3CACHE_READS 0xB12 // total reads +#define VX_CSR_MPM_L3CACHE_READS_H 0xB92 +#define VX_CSR_MPM_L3CACHE_WRITES 0xB13 // total writes +#define VX_CSR_MPM_L3CACHE_WRITES_H 0xB93 +#define VX_CSR_MPM_L3CACHE_MISS_R 0xB14 // read misses +#define VX_CSR_MPM_L3CACHE_MISS_R_H 0xB94 +#define VX_CSR_MPM_L3CACHE_MISS_W 0xB15 // write misses +#define VX_CSR_MPM_L3CACHE_MISS_W_H 0xB95 +#define VX_CSR_MPM_L3CACHE_BANK_ST 0xB16 // bank conflicts +#define VX_CSR_MPM_L3CACHE_BANK_ST_H 0xB96 +#define VX_CSR_MPM_L3CACHE_MSHR_ST 0xB17 // MSHR stalls +#define VX_CSR_MPM_L3CACHE_MSHR_ST_H 0xB97 +// PERF: memory +#define VX_CSR_MPM_MEM_READS 0xB18 // total reads +#define VX_CSR_MPM_MEM_READS_H 0xB98 +#define VX_CSR_MPM_MEM_WRITES 0xB19 // total writes +#define VX_CSR_MPM_MEM_WRITES_H 0xB99 +#define VX_CSR_MPM_MEM_LT 0xB1A // memory latency +#define VX_CSR_MPM_MEM_LT_H 0xB9A +// PERF: smem +#define VX_CSR_MPM_SMEM_READS 0xB1B // memory reads +#define VX_CSR_MPM_SMEM_READS_H 0xB9B +#define VX_CSR_MPM_SMEM_WRITES 0xB1C // memory writes +#define VX_CSR_MPM_SMEM_WRITES_H 0xB9C +#define VX_CSR_MPM_SMEM_BANK_ST 0xB1D // bank conflicts +#define VX_CSR_MPM_SMEM_BANK_ST_H 0xB9D + +// Machine Information Registers + +#define VX_CSR_MVENDORID 0xF11 +#define VX_CSR_MARCHID 0xF12 +#define VX_CSR_MIMPID 0xF13 +#define VX_CSR_MHARTID 0xF14 + +// GPGU CSRs + +#define VX_CSR_THREAD_ID 0xCC0 +#define VX_CSR_WARP_ID 0xCC1 +#define VX_CSR_CORE_ID 0xCC2 +#define VX_CSR_WARP_MASK 0xCC3 +#define VX_CSR_THREAD_MASK 0xCC4 // warning! this value is also used in LLVM +#define VX_CSR_GCID 0xCC5 // legacy global core id alias used by Radiance bootrom + +#define VX_CSR_NUM_THREADS 0xFC0 +#define VX_CSR_NUM_WARPS 0xFC1 +#define VX_CSR_NUM_CORES 0xFC2 + +#endif // VX_TYPES_VH diff --git a/kernel/include/vx_intrinsics.h b/kernel/include/vx_intrinsics.h index f6cfbf58..26bbe65f 100644 --- a/kernel/include/vx_intrinsics.h +++ b/kernel/include/vx_intrinsics.h @@ -136,6 +136,19 @@ inline void vx_wspawn(unsigned num_warps, vx_wspawn_pfn func_ptr) { asm volatile (".insn r %0, 1, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(num_warps), "r"(func_ptr)); } +// Spawn an explicit warp mask. The current warp bit is ignored by hardware. +inline void vx_wspawn_mask(unsigned warp_mask, vx_wspawn_pfn func_ptr) { + asm volatile (".insn r %0, 6, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(warp_mask), "r"(func_ptr)); +} + +inline void vx_spawn_scalar(unsigned warp_mask, vx_wspawn_pfn func_ptr) { + vx_wspawn_mask(warp_mask & ((1u << NUM_SCALAR_WARPS) - 1u), func_ptr); +} + +inline void vx_spawn_tensor(unsigned warp_mask, vx_wspawn_pfn func_ptr) { + vx_wspawn_mask(warp_mask & (((1u << NUM_TENSOR_WARPS) - 1u) << NUM_SCALAR_WARPS), func_ptr); +} + // Split on a predicate inline unsigned vx_split(unsigned predicate) { unsigned ret; @@ -149,8 +162,36 @@ inline void vx_join(unsigned stack_ptr) { } // Warp Barrier +__attribute__((convergent)) inline void vx_barrier(unsigned barried_id, unsigned num_warps) { - asm volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barried_id), "r"(num_warps)); + unsigned scalar_warps = (num_warps > NUM_SCALAR_WARPS) ? NUM_SCALAR_WARPS : num_warps; + asm volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barried_id), "r"(scalar_warps)); +} + +#define VX_BARRIER_DOMAIN_SHIFT 28 +#define VX_BARRIER_DOMAIN_ALL 0u +#define VX_BARRIER_DOMAIN_SCALAR 1u +#define VX_BARRIER_DOMAIN_TENSOR 2u + +__attribute__((convergent)) +inline void vx_barrier_domain(unsigned barrier_id, unsigned num_warps, unsigned domain) { + unsigned encoded_id = barrier_id | (domain << VX_BARRIER_DOMAIN_SHIFT); + asm volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(encoded_id), "r"(num_warps)); +} + +__attribute__((convergent)) +inline void vx_barrier_scalar(unsigned barrier_id, unsigned num_warps) { + vx_barrier_domain(barrier_id, num_warps, VX_BARRIER_DOMAIN_SCALAR); +} + +__attribute__((convergent)) +inline void vx_barrier_tensor(unsigned barrier_id, unsigned num_warps) { + vx_barrier_domain(barrier_id, num_warps, VX_BARRIER_DOMAIN_TENSOR); +} + +__attribute__((convergent)) +inline void vx_barrier_mask(unsigned barrier_id, unsigned warp_mask) { + asm volatile (".insn r %0, 7, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barrier_id), "r"(warp_mask)); } // Return current thread identifier @@ -202,6 +243,22 @@ inline int vx_num_warps() { return ret; } +inline int vx_num_scalar_warps() { + return NUM_SCALAR_WARPS; +} + +inline int vx_num_tensor_warps() { + return NUM_TENSOR_WARPS; +} + +inline unsigned vx_scalar_warp_mask() { + return (1u << NUM_SCALAR_WARPS) - 1u; +} + +inline unsigned vx_tensor_warp_mask() { + return ((1u << NUM_TENSOR_WARPS) - 1u) << NUM_SCALAR_WARPS; +} + // Return the number of cores per cluster inline int vx_num_cores() { int ret; diff --git a/kernel/include/vx_spawn.h b/kernel/include/vx_spawn.h index d8797945..83052f30 100644 --- a/kernel/include/vx_spawn.h +++ b/kernel/include/vx_spawn.h @@ -17,6 +17,10 @@ #include #include +#ifndef CORES_PER_CLUSTER +#define CORES_PER_CLUSTER 8 +#endif + #ifdef __cplusplus extern "C" { #endif @@ -48,6 +52,7 @@ void vx_wspawn_wait(); void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg); void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback, void * arg); +void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void * arg); void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void * arg); void vx_serial(vx_serial_cb callback, void * arg); diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c index b1ef7230..1971ae55 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -74,18 +74,9 @@ static void __attribute__ ((noinline)) spawn_tasks_all_stub() { } } -static void __attribute__ ((noinline)) spawn_tasks_rem_stub() { - int cid = vx_core_id(); - int tid = vx_thread_id(); - - wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; - int task_id = p_wspawn_args->offset + tid; - (p_wspawn_args->callback)(task_id, p_wspawn_args->arg); -} - static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() { int NT = vx_num_threads(); - int NW = vx_num_warps(); + int NW = NUM_SCALAR_WARPS; int cid = vx_core_id(); int wid = vx_warp_id(); int tid = vx_thread_id(); @@ -103,6 +94,60 @@ static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() { } } +static void __attribute__ ((noinline)) spawn_tasks_cluster_all_stub() { + int NT = vx_num_threads(); + int NW = NUM_SCALAR_WARPS; + int cid = vx_core_id(); + int wid = vx_warp_id(); + int tid = vx_thread_id(); + + const int core_id_in_cluster = cid % CORES_PER_CLUSTER; + // round-robin warp_id allocation across cores in cluster + const int wid_in_cluster = CORES_PER_CLUSTER * wid + core_id_in_cluster; + + wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; + + int waves = p_wspawn_args->NWs + (wid < p_wspawn_args->RWs); + int offset = p_wspawn_args->offset + (NT * wid_in_cluster + tid); + + vx_spawn_tasks_cb callback = p_wspawn_args->callback; + void* arg = p_wspawn_args->arg; + + // sequential iterations + for (int wave_id = 0; wave_id < waves; ++wave_id) { + int task_id = offset + (wave_id * NT * NW * CORES_PER_CLUSTER); + callback(task_id, arg); + } +} + +static void __attribute__ ((noinline)) spawn_tasks_rem_stub() { + int cid = vx_core_id(); + int tid = vx_thread_id(); + + wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; + int task_id = p_wspawn_args->offset + tid; + (p_wspawn_args->callback)(task_id, p_wspawn_args->arg); +} + +static void __attribute__ ((noinline)) spawn_tasks_cluster_rem_stub() { + int NT = vx_num_threads(); + int cid = vx_core_id(); + int tid = vx_thread_id(); + int wid = vx_warp_id(); + + const int core_id_in_cluster = cid % CORES_PER_CLUSTER; + // round-robin warp_id allocation across cores in cluster + const int wid_in_cluster = CORES_PER_CLUSTER * wid + core_id_in_cluster; + + wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; + // FIXME: This assumes that all cores but the last one are working with full + // warps, and only the last core has a partially-filled warp. + int offset = p_wspawn_args->offset + (NT * wid_in_cluster + tid); + + int task_id = offset; + (p_wspawn_args->callback)(task_id, p_wspawn_args->arg); +} + static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_cb() { // activate all threads vx_tmc(-1); @@ -111,11 +156,21 @@ static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_cb() { spawn_tasks_contiguous_all_stub(); // disable warp - // deadlock here on warps 1, 2, 3 vx_tmc_zero(); } -static void __attribute__ ((noinline)) spawn_tasks_all_cb() { +static void __attribute__ ((noinline)) spawn_tasks_cluster_all_cb() { + // activate all threads + vx_tmc(-1); + + // call stub routine + spawn_tasks_cluster_all_stub(); + + // disable warp + vx_tmc_zero(); +} + +static void __attribute__ ((noinline)) spawn_tasks_all_cb() { // activate all threads vx_tmc(-1); @@ -126,10 +181,115 @@ static void __attribute__ ((noinline)) spawn_tasks_all_cb() { vx_tmc_zero(); } +// This function runs in every core, but with only 1 warp and 1 thread enabled. +// The logic in this function figures out how many warps/threads this particular +// core has to enable to fulfill an entire grid of computation. +void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg) { + // device specs + const int NC = vx_num_cores(); + const int NW = NUM_SCALAR_WARPS; + const int NT = vx_num_threads(); + // NOTE: assumes divisible + const int num_cluster = NC / CORES_PER_CLUSTER; + + // current core id + int core_id = vx_core_id(); + if (core_id >= NUM_CORES_MAX) + return; + const int cluster_id = core_id / CORES_PER_CLUSTER; + const int core_id_in_cluster = core_id % CORES_PER_CLUSTER; + + // try to fill up full clusters first + const int num_threads_in_cluster = CORES_PER_CLUSTER * NW * NT; + const int num_used_clusters = + (num_tasks + (num_threads_in_cluster - 1)) / num_threads_in_cluster; + if (cluster_id >= num_used_clusters) { + return; // terminate extra clusters + } + // fill up the last cluster with remaining tasks + const int num_full_clusters = num_tasks / num_threads_in_cluster; + int num_tasks_this_cluster = num_threads_in_cluster; + if (cluster_id >= num_full_clusters) { + num_tasks_this_cluster = num_tasks % num_threads_in_cluster; + } + + // Distribute threads equally across as many cores as possible, even if they + // don't fill up NW*NT in a single core. This makes sure the warps get evenly + // distributed in a single cluster + // + // TODO: Try to contain in a single cluster if possible? + const int num_active_cores = (num_tasks + (NT - 1)) / NT; + if (core_id >= num_active_cores) + return; // terminate extra cores + + const int num_full_warps_this_cluster = num_tasks_this_cluster / NT; + const int rem_threads_in_last_warp = num_tasks_this_cluster % NT; + // const int num_warps = (num_tasks_this_cluster + (NT - 1)) / NT; + + int num_warps_this_core = num_full_warps_this_cluster / CORES_PER_CLUSTER; + const int num_warps_in_last_row = num_full_warps_this_cluster % CORES_PER_CLUSTER; + if (core_id_in_cluster < num_warps_in_last_row) { + num_warps_this_core++; + } + // if 0, last warp is full-threads enabled + int rem_threads_in_last_warp_this_core = 0; + if (rem_threads_in_last_warp != 0) { + if (core_id_in_cluster == num_warps_in_last_row - 1) { + rem_threads_in_last_warp_this_core = rem_threads_in_last_warp; + } + } + + // sequential iterations + const int num_full_waves = num_warps_this_core / NW; + const int rem_full_warps_in_last_wave = num_warps_this_core % NW; + + const int offset = cluster_id * num_tasks_this_cluster; + wspawn_tasks_args_t wspawn_args = {callback, arg, offset, num_full_waves, + rem_full_warps_in_last_wave}; + g_wspawn_args[core_id] = &wspawn_args; + + if (num_warps_this_core > 0) { + // execute callback on other warps + const int nw = MIN(num_warps_this_core, NW); + vx_wspawn(nw, spawn_tasks_cluster_all_cb); + + // activate all threads + vx_tmc(-1); + + // call stub routine + spawn_tasks_cluster_all_stub(); + + // back to single-threaded + vx_tmc_one(); + + // wait for spawn warps to terminate + vx_wspawn_wait(); + } + + // TODO: this is incomplete + // TODO: Instead of launching an additional wave just to work on remaining + // threads, handle this in the last wave amongst other full warps. + if (rem_threads_in_last_warp != 0 && core_id_in_cluster == 0) { + // adjust offset + // FIXME: use rem_threads_in_last_warp_this_core + wspawn_args.offset += (num_tasks_this_cluster - rem_threads_in_last_warp); + + // activate remaining threads + const int tmask = (1 << rem_threads_in_last_warp) - 1; + vx_tmc(tmask); + + // call stub routine + spawn_tasks_cluster_rem_stub(); + + // back to single-threaded + vx_tmc_one(); + } +} + void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void * arg) { // device specs int NC = vx_num_cores(); - int NW = vx_num_warps(); + int NW = NUM_SCALAR_WARPS; int NT = vx_num_threads(); // current core id @@ -179,7 +339,6 @@ void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void vx_tmc_one(); // wait for spawn warps to terminate - // deadlock here on warp 0! vx_wspawn_wait(); } @@ -202,7 +361,7 @@ void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) { // device specs int NC = vx_num_cores(); - int NW = vx_num_warps(); + int NW = NUM_SCALAR_WARPS; int NT = vx_num_threads(); // current core id @@ -356,7 +515,7 @@ void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg) { // device specs int NC = vx_num_cores(); - int NW = vx_num_warps(); + int NW = NUM_SCALAR_WARPS; int NT = vx_num_threads(); // current core id diff --git a/kernel/src/vx_start.S b/kernel/src/vx_start.S index b5065c95..41c42f60 100644 --- a/kernel/src/vx_start.S +++ b/kernel/src/vx_start.S @@ -22,9 +22,9 @@ _start: # initialize per-thread registers - csrr t0, VX_CSR_NUM_WARPS # get num warps + li t0, ((1 << NUM_SCALAR_WARPS) - 1) # scalar warp mask la t1, init_regs_all - .insn r RISCV_CUSTOM0, 1, 0, x0, t0, t1 # wspawn t0, t1 + .insn r RISCV_CUSTOM0, 6, 0, x0, t0, t1 # wspawn_mask t0, t1 li t0, -1 .insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0 # tmc t0 jal init_regs @@ -35,9 +35,9 @@ _start: jal vx_wspawn_wait # initialize TLS for all warps - csrr t0, VX_CSR_NUM_WARPS # get num warps + li t0, ((1 << NUM_SCALAR_WARPS) - 1) # scalar warp mask la t1, init_tls_all - .insn r RISCV_CUSTOM0, 1, 0, x0, t0, t1 # wspawn t0, t1 + .insn r RISCV_CUSTOM0, 6, 0, x0, t0, t1 # wspawn_mask t0, t1 li t0, -1 .insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0 # tmc t0 call __init_tls @@ -102,6 +102,8 @@ init_regs: #endif csrr t0, VX_CSR_MHARTID sll t1, t0, STACK_LOG2_SIZE + sll t2, t0, 4 + add t1, t1, t2 sub sp, sp, t1 # set thread pointer register