Implement WU architecture support

This commit is contained in:
2026-05-25 19:25:05 +08:00
parent 323ed7d7e9
commit 0ad87bde81
35 changed files with 3303 additions and 472 deletions

View File

@@ -84,15 +84,15 @@
#endif
#ifndef NUM_CORES
#define NUM_CORES 8
#define NUM_CORES 1
#endif
#ifndef NUM_WARPS
#define NUM_WARPS 8
#define NUM_WARPS 4
#endif
#ifndef NUM_THREADS
#define NUM_THREADS 8
#define NUM_THREADS 4
#endif
#ifndef NUM_BARRIERS
@@ -682,4 +682,3 @@
#define IMPLEMENTATION_ID 0
#endif // VX_CONFIG_VH

View File

@@ -88,15 +88,48 @@
`endif
`ifndef NUM_CORES
`define NUM_CORES 8
`define NUM_CORES 1
`endif
`ifndef NUM_WARPS
`define NUM_WARPS 8
`define NUM_WARPS 4
`endif
`ifndef NUM_TENSOR_WARPS
`define NUM_TENSOR_WARPS 2
`endif
`define NUM_SCALAR_WARPS (`NUM_WARPS - `NUM_TENSOR_WARPS)
`define WU_CONFIG_STATIC_ASSERTS \
generate \
if (!(`NUM_WARPS > 0)) begin : g_wu_num_warps_gt_zero \
invalid_NUM_WARPS_must_be_greater_than_zero __wu_config_error(); \
end \
if (!(`NUM_TENSOR_WARPS > 0)) begin : g_wu_num_tensor_warps_gt_zero \
invalid_NUM_TENSOR_WARPS_must_be_greater_than_zero __wu_config_error(); \
end \
if (!(`NUM_TENSOR_WARPS < `NUM_WARPS)) begin : g_wu_num_tensor_warps_lt_num_warps \
invalid_NUM_TENSOR_WARPS_must_be_smaller_than_NUM_WARPS __wu_config_error(); \
end \
if (!(`NUM_SCALAR_WARPS > 0)) begin : g_wu_num_scalar_warps_gt_zero \
invalid_NUM_SCALAR_WARPS_must_be_greater_than_zero __wu_config_error(); \
end \
endgenerate
`define IS_SCALAR_WARP(wid) ((wid) < `NUM_SCALAR_WARPS)
`define IS_TENSOR_WARP(wid) ((wid) >= `NUM_SCALAR_WARPS)
`ifndef TENSOR_NUM_GPRS
`define TENSOR_NUM_GPRS 8
`endif
`ifndef TENSOR_NUM_FPRS
`define TENSOR_NUM_FPRS 8
`endif
`ifndef NUM_THREADS
`define NUM_THREADS 8
`define NUM_THREADS 4
`endif
`ifndef NUM_BARRIERS

View File

@@ -7,6 +7,7 @@ module Vortex import VX_gpu_pkg::*; #(
parameter TENSOR_FP16 = 0,
parameter logic [63:0] STARTUP_ADDR = 64'h0000_0000_0001_0100,
parameter NUM_THREADS = 0,
parameter NUM_TENSOR_CORES = 1,
parameter TC_DATA_WIDTH = 256,
parameter TC_TAG_WIDTH = 4
) (
@@ -77,26 +78,32 @@ module Vortex import VX_gpu_pkg::*; #(
output [(DCACHE_NUM_REQS * 32) - 1:0] smem_a_bits_data,
// tc --------------------------------------------------
input [2:0] tc_a_ready,
output [2:0] tc_a_valid,
output [2:0] tc_a_bits_write,
output [95:0] tc_a_bits_address,
output [3 * TC_TAG_WIDTH - 1:0] tc_a_bits_tag,
output [3 * 32 - 1:0] tc_a_bits_mask,
output [3 * TC_DATA_WIDTH - 1:0] tc_a_bits_data,
output [2:0] tc_d_ready,
input [2:0] tc_d_valid,
input [3 * TC_DATA_WIDTH - 1:0] tc_d_bits_data,
input [3 * TC_TAG_WIDTH - 1:0] tc_d_bits_tag,
input [NUM_TENSOR_CORES * 3 - 1:0] tc_a_ready,
output [NUM_TENSOR_CORES * 3 - 1:0] tc_a_valid,
output [NUM_TENSOR_CORES * 3 - 1:0] tc_a_bits_write,
output [NUM_TENSOR_CORES * 3 * 32 - 1:0] tc_a_bits_address,
output [NUM_TENSOR_CORES * 3 * TC_TAG_WIDTH - 1:0] tc_a_bits_tag,
output [NUM_TENSOR_CORES * 3 * 32 - 1:0] tc_a_bits_mask,
output [NUM_TENSOR_CORES * 3 * TC_DATA_WIDTH - 1:0] tc_a_bits_data,
output [NUM_TENSOR_CORES * 3 - 1:0] tc_d_ready,
input [NUM_TENSOR_CORES * 3 - 1:0] tc_d_valid,
input [NUM_TENSOR_CORES * 3 * TC_DATA_WIDTH - 1:0] tc_d_bits_data,
input [NUM_TENSOR_CORES * 3 * TC_TAG_WIDTH - 1:0] tc_d_bits_tag,
// tmem_C direct SRAM port
output tc_tmem_C_wen,
output tc_tmem_C_ren,
output [8:0] tc_tmem_C_waddr,
output [8:0] tc_tmem_C_raddr,
output [`NUM_THREADS*`XLEN-1:0] tc_tmem_C_wdata,
output [`NUM_THREADS*`XLEN/8-1:0] tc_tmem_C_mask,
input [`NUM_THREADS*`XLEN-1:0] tc_tmem_C_rdata,
// shared tmem direct SRAM ports
output [NUM_TENSOR_CORES-1:0] tc_tmem_A_ren,
input [NUM_TENSOR_CORES-1:0] tc_tmem_A_rready,
output [NUM_TENSOR_CORES*9-1:0] tc_tmem_A_raddr,
input [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tc_tmem_A_rdata,
output [NUM_TENSOR_CORES-1:0] tc_tmem_C_ren,
input [NUM_TENSOR_CORES-1:0] tc_tmem_C_rready,
output [NUM_TENSOR_CORES*9-1:0] tc_tmem_C_raddr,
input [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tc_tmem_C_rdata,
output [NUM_TENSOR_CORES-1:0] tc_tmem_C_wen,
input [NUM_TENSOR_CORES-1:0] tc_tmem_C_wready,
output [NUM_TENSOR_CORES*9-1:0] tc_tmem_C_waddr,
output [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tc_tmem_C_wdata,
output [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN/8-1:0] tc_tmem_C_mask,
// gbar ------------------------------------------------
@@ -314,24 +321,52 @@ module Vortex import VX_gpu_pkg::*; #(
endgenerate
// tc ---------------------------------------------------------------------
VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p0_bus_if();
VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p2_bus_if();
// tc_p1 (tmem_C) is now a direct SRAM port exposed as top-level ports tc_tmem_C_*
assign tc_a_valid = {tc_p2_bus_if.req_valid, 1'b0, tc_p0_bus_if.req_valid};
assign tc_a_bits_write = {tc_p2_bus_if.req_data.rw, 1'b0, tc_p0_bus_if.req_data.rw};
assign tc_a_bits_address = {tc_p2_bus_if.req_data.addr, 32'b0, tc_p0_bus_if.req_data.addr};
assign tc_a_bits_tag = {tc_p2_bus_if.req_data.tag, {TC_TAG_WIDTH{1'b0}}, tc_p0_bus_if.req_data.tag};
assign tc_a_bits_mask = {tc_p2_bus_if.req_data.byteen, {(TC_DATA_WIDTH/8){1'b0}},tc_p0_bus_if.req_data.byteen};
assign tc_a_bits_data = {tc_p2_bus_if.req_data.data, {TC_DATA_WIDTH{1'b0}}, tc_p0_bus_if.req_data.data};
assign tc_p0_bus_if.req_ready = tc_a_ready[0];
assign tc_p0_bus_if.rsp_valid = tc_d_valid[0];
assign tc_p0_bus_if.rsp_data.data = tc_d_bits_data[0 * TC_DATA_WIDTH +: TC_DATA_WIDTH];
assign tc_p0_bus_if.rsp_data.tag = tc_d_bits_tag[0 * TC_TAG_WIDTH +: TC_TAG_WIDTH];
assign tc_p2_bus_if.req_ready = tc_a_ready[2];
assign tc_p2_bus_if.rsp_valid = tc_d_valid[2];
assign tc_p2_bus_if.rsp_data.data = tc_d_bits_data[2 * TC_DATA_WIDTH +: TC_DATA_WIDTH];
assign tc_p2_bus_if.rsp_data.tag = tc_d_bits_tag[2 * TC_TAG_WIDTH +: TC_TAG_WIDTH];
assign tc_d_ready = {tc_p2_bus_if.rsp_ready, 1'b0, tc_p0_bus_if.rsp_ready};
VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p0_bus_if[NUM_TENSOR_CORES]();
VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p2_bus_if[NUM_TENSOR_CORES]();
for (genvar tc = 0; tc < NUM_TENSOR_CORES; ++tc) begin : g_tc_ports
localparam P0 = tc * 3;
localparam P1 = tc * 3 + 1;
localparam P2 = tc * 3 + 2;
assign tc_a_valid[P0] = tc_p0_bus_if[tc].req_valid;
assign tc_a_valid[P1] = 1'b0;
assign tc_a_valid[P2] = tc_p2_bus_if[tc].req_valid;
assign tc_a_bits_write[P0] = tc_p0_bus_if[tc].req_data.rw;
assign tc_a_bits_write[P1] = 1'b0;
assign tc_a_bits_write[P2] = tc_p2_bus_if[tc].req_data.rw;
assign tc_a_bits_address[P0 * 32 +: 32] = tc_p0_bus_if[tc].req_data.addr;
assign tc_a_bits_address[P1 * 32 +: 32] = 32'b0;
assign tc_a_bits_address[P2 * 32 +: 32] = tc_p2_bus_if[tc].req_data.addr;
assign tc_a_bits_tag[P0 * TC_TAG_WIDTH +: TC_TAG_WIDTH] = tc_p0_bus_if[tc].req_data.tag;
assign tc_a_bits_tag[P1 * TC_TAG_WIDTH +: TC_TAG_WIDTH] = '0;
assign tc_a_bits_tag[P2 * TC_TAG_WIDTH +: TC_TAG_WIDTH] = tc_p2_bus_if[tc].req_data.tag;
assign tc_a_bits_mask[P0 * 32 +: 32] = tc_p0_bus_if[tc].req_data.byteen;
assign tc_a_bits_mask[P1 * 32 +: 32] = '0;
assign tc_a_bits_mask[P2 * 32 +: 32] = tc_p2_bus_if[tc].req_data.byteen;
assign tc_a_bits_data[P0 * TC_DATA_WIDTH +: TC_DATA_WIDTH] = tc_p0_bus_if[tc].req_data.data;
assign tc_a_bits_data[P1 * TC_DATA_WIDTH +: TC_DATA_WIDTH] = '0;
assign tc_a_bits_data[P2 * TC_DATA_WIDTH +: TC_DATA_WIDTH] = tc_p2_bus_if[tc].req_data.data;
assign tc_p0_bus_if[tc].req_ready = tc_a_ready[P0];
assign tc_p0_bus_if[tc].rsp_valid = tc_d_valid[P0];
assign tc_p0_bus_if[tc].rsp_data.data = tc_d_bits_data[P0 * TC_DATA_WIDTH +: TC_DATA_WIDTH];
assign tc_p0_bus_if[tc].rsp_data.tag = tc_d_bits_tag[P0 * TC_TAG_WIDTH +: TC_TAG_WIDTH];
assign tc_p2_bus_if[tc].req_ready = tc_a_ready[P2];
assign tc_p2_bus_if[tc].rsp_valid = tc_d_valid[P2];
assign tc_p2_bus_if[tc].rsp_data.data = tc_d_bits_data[P2 * TC_DATA_WIDTH +: TC_DATA_WIDTH];
assign tc_p2_bus_if[tc].rsp_data.tag = tc_d_bits_tag[P2 * TC_TAG_WIDTH +: TC_TAG_WIDTH];
assign tc_d_ready[P0] = tc_p0_bus_if[tc].rsp_ready;
assign tc_d_ready[P1] = 1'b0;
assign tc_d_ready[P2] = tc_p2_bus_if[tc].rsp_ready;
end
// gbar -------------------------------------------------------------------
`ifdef GBAR_ENABLE
@@ -439,7 +474,8 @@ module Vortex import VX_gpu_pkg::*; #(
// TODO: SCOPE_IO_BIND should be socket id
VX_core #(
.CORE_ID (CORE_ID),
.TENSOR_FP16 (TENSOR_FP16)
.TENSOR_FP16 (TENSOR_FP16),
.NUM_TENSOR_CORES (NUM_TENSOR_CORES)
) core (
`SCOPE_IO_BIND (0)
@@ -465,22 +501,34 @@ module Vortex import VX_gpu_pkg::*; #(
.tensor_smem_A_if (tc_p0_bus_if),
`ifdef EXT_T_BLACKWELL
.tensor_tmem_C_wen(tc_tmem_C_wen),
.tensor_tmem_A_ren(tc_tmem_A_ren),
.tensor_tmem_A_rready(tc_tmem_A_rready),
.tensor_tmem_A_raddr(tc_tmem_A_raddr),
.tensor_tmem_A_rdata(tc_tmem_A_rdata),
.tensor_tmem_C_ren(tc_tmem_C_ren),
.tensor_tmem_C_waddr(tc_tmem_C_waddr),
.tensor_tmem_C_rready(tc_tmem_C_rready),
.tensor_tmem_C_raddr(tc_tmem_C_raddr),
.tensor_tmem_C_rdata(tc_tmem_C_rdata),
.tensor_tmem_C_wen(tc_tmem_C_wen),
.tensor_tmem_C_wready(tc_tmem_C_wready),
.tensor_tmem_C_waddr(tc_tmem_C_waddr),
.tensor_tmem_C_wdata(tc_tmem_C_wdata),
.tensor_tmem_C_mask(tc_tmem_C_mask),
.tensor_tmem_C_rdata(tc_tmem_C_rdata),
.tensor_smem_B_if (tc_p2_bus_if),
`else
.tensor_tmem_C_wen(tc_tmem_C_wen),
.tensor_tmem_A_ren(tc_tmem_A_ren),
.tensor_tmem_A_rready(tc_tmem_A_rready),
.tensor_tmem_A_raddr(tc_tmem_A_raddr),
.tensor_tmem_A_rdata(tc_tmem_A_rdata),
.tensor_tmem_C_ren(tc_tmem_C_ren),
.tensor_tmem_C_waddr(tc_tmem_C_waddr),
.tensor_tmem_C_rready(tc_tmem_C_rready),
.tensor_tmem_C_raddr(tc_tmem_C_raddr),
.tensor_tmem_C_rdata(tc_tmem_C_rdata),
.tensor_tmem_C_wen(tc_tmem_C_wen),
.tensor_tmem_C_wready(tc_tmem_C_wready),
.tensor_tmem_C_waddr(tc_tmem_C_waddr),
.tensor_tmem_C_wdata(tc_tmem_C_wdata),
.tensor_tmem_C_mask(tc_tmem_C_mask),
.tensor_tmem_C_rdata(tc_tmem_C_rdata),
.tensor_smem_B_if (tc_p2_bus_if),
`endif
@@ -583,7 +631,7 @@ module Vortex import VX_gpu_pkg::*; #(
$display("simulation has probably ended. exiting");
$finish();
end
if (finished) begin
if (busy_prev && !busy) begin
$display("---------------- core%2d has no more active warps ----------------", CORE_ID);
// TODO: lane assumed to be 4
// `ifndef SYNTHESIS

View File

@@ -249,9 +249,10 @@
`define INST_SFU_CSRRS 4'h7
`define INST_SFU_CSRRC 4'h8
`define INST_SFU_CMOV 4'h9
`define INST_SFU_BAR_MASK 4'ha
`define INST_SFU_BITS 4
`define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1)
`define INST_SFU_IS_WCTL(op) (op <= 5)
`define INST_SFU_IS_WCTL(op) ((op <= 5) || (op == `INST_SFU_BAR_MASK))
`define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8)
`define INST_TENSOR_HMMA 4'b0000

View File

@@ -46,6 +46,8 @@ package VX_gpu_pkg;
logic valid;
logic [`NB_WIDTH-1:0] id;
logic is_global;
logic [1:0] domain;
logic [`NUM_WARPS-1:0] mask;
`ifdef GBAR_ENABLE
logic [`MAX(`NW_WIDTH, `NC_WIDTH)-1:0] size_m1;
`else
@@ -53,6 +55,13 @@ package VX_gpu_pkg;
`endif
} barrier_t;
localparam logic [1:0] BARRIER_ALL = 2'd0;
localparam logic [1:0] BARRIER_SCALAR = 2'd1;
localparam logic [1:0] BARRIER_TENSOR = 2'd2;
localparam logic [1:0] BARRIER_MASK = 2'd3;
localparam logic WU_DOMAIN_SCALAR = 1'b0;
localparam logic WU_DOMAIN_TENSOR = 1'b1;
typedef struct packed {
logic [`XLEN-1:0] startup_addr;
logic [7:0] mpm_class;
@@ -90,7 +99,7 @@ package VX_gpu_pkg;
localparam ICACHE_TAG_ID_BITS = `NW_WIDTH;
// Core request tag bits
localparam ICACHE_TAG_WIDTH = (`UUID_WIDTH + ICACHE_TAG_ID_BITS);
localparam ICACHE_TAG_WIDTH = (1 + `UUID_WIDTH + ICACHE_TAG_ID_BITS);
// Memory request data bits
localparam ICACHE_MEM_DATA_WIDTH = (ICACHE_LINE_SIZE * 8);

View File

@@ -188,6 +188,46 @@ module VX_alu_unit #(
.ready_out (commit_block_if[block_idx].ready),
`UNUSED_PIN (sel_out)
);
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
always @(posedge clk) begin
if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)) begin
if (execute_if[block_idx].valid
&& ((execute_if[block_idx].data.PC == 32'h80000010) || (execute_if[block_idx].data.PC == 32'h80000014))) begin
`TRACE(1, ("%d: core%0d-alu-execute-block: block=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, op=0x%0h, mod=%0d, is_mul=%b, is_red=%b, wb=%0d, rd=%0d (#%0d)\n",
$time, CORE_ID, block_idx, execute_if[block_idx].valid, execute_if[block_idx].ready,
execute_if[block_idx].data.wid, execute_if[block_idx].data.PC,
execute_if[block_idx].data.op_type, execute_if[block_idx].data.op_mod,
is_muldiv_op, is_reduce_op, execute_if[block_idx].data.wb,
execute_if[block_idx].data.rd, execute_if[block_idx].data.uuid));
end
if (int_execute_if.valid
&& ((int_execute_if.data.PC == 32'h80000010) || (int_execute_if.data.PC == 32'h80000014))) begin
`TRACE(1, ("%d: core%0d-alu-int-execute: block=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d (#%0d)\n",
$time, CORE_ID, block_idx, int_execute_if.valid, int_execute_if.ready,
int_execute_if.data.wid, int_execute_if.data.PC,
int_execute_if.data.wb, int_execute_if.data.rd, int_execute_if.data.uuid));
end
if (int_commit_if.valid
&& ((int_commit_if.data.PC == 32'h80000010) || (int_commit_if.data.PC == 32'h80000014))) begin
`TRACE(1, ("%d: core%0d-alu-int-commit: block=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
$time, CORE_ID, block_idx, int_commit_if.valid, int_commit_if.ready,
int_commit_if.data.wid, int_commit_if.data.PC,
int_commit_if.data.wb, int_commit_if.data.rd,
int_commit_if.data.sop, int_commit_if.data.eop, int_commit_if.data.uuid));
end
if (commit_block_if[block_idx].valid
&& ((commit_block_if[block_idx].data.PC == 32'h80000010) || (commit_block_if[block_idx].data.PC == 32'h80000014))) begin
`TRACE(1, ("%d: core%0d-alu-commit-block: block=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
$time, CORE_ID, block_idx, commit_block_if[block_idx].valid, commit_block_if[block_idx].ready,
commit_block_if[block_idx].data.wid, commit_block_if[block_idx].data.PC,
commit_block_if[block_idx].data.wb, commit_block_if[block_idx].data.rd,
commit_block_if[block_idx].data.sop, commit_block_if[block_idx].data.eop,
commit_block_if[block_idx].data.uuid));
end
end
end
`endif
end
`RESET_RELAY (commit_reset, reset);

View File

@@ -234,6 +234,26 @@ module VX_commit import VX_gpu_pkg::*; #(
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
always @(posedge clk) begin
if (!reset && ($time > `TRACE_STARTTIME)) begin
if ((CORE_ID == 0)
&& commit_if[i].valid
&& ((commit_if[i].data.PC == 32'h80000010) || (commit_if[i].data.PC == 32'h80000014))) begin
`TRACE(1, ("%d: core%0d-commit-arb-out: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, tensor=%b, sop=%b, eop=%b, fire=%b (#%0d)\n",
$time, CORE_ID, i, commit_if[i].valid, commit_if[i].ready,
commit_if[i].data.wid, commit_if[i].data.PC,
commit_if[i].data.wb, commit_if[i].data.rd,
commit_if[i].data.tensor, commit_if[i].data.sop,
commit_if[i].data.eop, commit_fire[i], commit_if[i].data.uuid));
end
if ((CORE_ID == 0)
&& writeback_if[i].valid
&& ((writeback_if[i].data.PC == 32'h80000010) || (writeback_if[i].data.PC == 32'h80000014))) begin
`TRACE(1, ("%d: core%0d-writeback-out: isw=%0d, valid=%b, wid=%0d, wis=%0d, PC=0x%0h, rd=%0d, tensor=%b, sop=%b, eop=%b (#%0d)\n",
$time, CORE_ID, i, writeback_if[i].valid,
wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.wis,
writeback_if[i].data.PC, writeback_if[i].data.rd,
writeback_if[i].data.tensor, writeback_if[i].data.sop,
writeback_if[i].data.eop, writeback_if[i].data.uuid));
end
if (alu_commit_if[i].valid && alu_commit_if[i].ready) begin
`TRACE(1, ("%d: core%0d-commit: wid=%0d, PC=0x%0h, ex=ALU, tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", $time, CORE_ID, alu_commit_if[i].data.wid, alu_commit_if[i].data.PC, alu_commit_if[i].data.tmask, alu_commit_if[i].data.wb, alu_commit_if[i].data.rd, alu_commit_if[i].data.sop, alu_commit_if[i].data.eop));
`TRACE_ARRAY1D(1, alu_commit_if[i].data.data, `NUM_THREADS);
@@ -259,6 +279,16 @@ module VX_commit import VX_gpu_pkg::*; #(
end
end
end
always @(posedge clk) begin
if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)) begin
for (integer i = 0; i < `ISSUE_WIDTH; ++i) begin
if (commit_sched_if.committed[i] && (commit_sched_if.committed_wid[i] == `NW_WIDTH'(0))) begin
`TRACE(1, ("%d: core%0d-commit-sched-out: isw=%0d, committed=%b, wid=%0d\n",
$time, CORE_ID, i, commit_sched_if.committed[i], commit_sched_if.committed_wid[i]));
end
end
end
end
`endif
endmodule

View File

@@ -19,7 +19,8 @@
module VX_core import VX_gpu_pkg::*; #(
parameter CORE_ID = 0,
parameter TENSOR_FP16 = 0
parameter TENSOR_FP16 = 0,
parameter NUM_TENSOR_CORES = `NUM_TENSOR_WARPS
) (
`SCOPE_IO_DECL
@@ -39,16 +40,21 @@ module VX_core import VX_gpu_pkg::*; #(
VX_mem_bus_if.master icache_bus_if,
VX_tc_bus_if.master tensor_smem_A_if,
// tensor_tmem_C is now a direct SRAM port
output logic tensor_tmem_C_wen,
output logic tensor_tmem_C_ren,
output logic [8:0] tensor_tmem_C_waddr,
output logic [8:0] tensor_tmem_C_raddr,
output logic [`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_wdata,
output logic [`NUM_THREADS*`XLEN/8-1:0] tensor_tmem_C_mask,
input logic [`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_rdata,
VX_tc_bus_if.master tensor_smem_B_if,
VX_tc_bus_if.master tensor_smem_A_if[NUM_TENSOR_CORES],
output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_A_ren,
input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_A_rready,
output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_A_raddr,
input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_A_rdata,
output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_ren,
input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_rready,
output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_C_raddr,
input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_rdata,
output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_wen,
input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_wready,
output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_C_waddr,
output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_wdata,
output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN/8-1:0] tensor_tmem_C_mask,
VX_tc_bus_if.master tensor_smem_B_if[NUM_TENSOR_CORES],
`ifdef GBAR_ENABLE
VX_gbar_bus_if.master gbar_bus_if,
@@ -66,17 +72,25 @@ module VX_core import VX_gpu_pkg::*; #(
output wire [31:0] acc_write_out,
output wire acc_write_en
);
VX_schedule_if schedule_if();
VX_fetch_if fetch_if();
VX_decode_if decode_if();
`WU_CONFIG_STATIC_ASSERTS
VX_schedule_if scalar_schedule_if();
VX_schedule_if tensor_schedule_if();
VX_fetch_if scalar_fetch_if();
VX_fetch_if tensor_fetch_if();
VX_decode_if scalar_decode_if();
VX_decode_if tensor_decode_if();
VX_sched_csr_if sched_csr_if();
VX_decode_sched_if decode_sched_if();
VX_decode_sched_if scalar_decode_sched_if();
VX_decode_sched_if tensor_decode_sched_if();
`ifdef EXT_T_ASYNC
VX_tc_rf_if tensor_regfile_if();
VX_tc_rf_if tensor_regfile_if[NUM_TENSOR_CORES]();
VX_tc_rf_if scalar_dummy_tensor_regfile_if[NUM_TENSOR_CORES]();
`endif
VX_commit_sched_if commit_sched_if();
VX_commit_csr_if commit_csr_if();
VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS]();
VX_branch_ctl_if branch_ctl_if[2 * `NUM_ALU_BLOCKS]();
VX_warp_ctl_if warp_ctl_if();
VX_dispatch_if alu_dispatch_if[`ISSUE_WIDTH]();
@@ -89,8 +103,26 @@ module VX_core import VX_gpu_pkg::*; #(
VX_commit_if fpu_commit_if[`ISSUE_WIDTH]();
`endif
`ifdef EXT_T_ENABLE
VX_dispatch_if tensor_alu_dispatch_if[`ISSUE_WIDTH]();
VX_dispatch_if tensor_lsu_dispatch_if[`ISSUE_WIDTH]();
VX_dispatch_if tensor_ctrl_dispatch_if[`ISSUE_WIDTH]();
VX_dispatch_if tensor_dispatch_if[`ISSUE_WIDTH]();
VX_dispatch_if scalar_dummy_tensor_alu_dispatch_if[`ISSUE_WIDTH]();
VX_dispatch_if scalar_dummy_tensor_lsu_dispatch_if[`ISSUE_WIDTH]();
VX_dispatch_if scalar_dummy_tensor_ctrl_dispatch_if[`ISSUE_WIDTH]();
VX_dispatch_if scalar_dummy_tensor_dispatch_if[`ISSUE_WIDTH]();
VX_dispatch_if tensor_dummy_alu_dispatch_if[`ISSUE_WIDTH]();
VX_dispatch_if tensor_dummy_lsu_dispatch_if[`ISSUE_WIDTH]();
`ifdef EXT_F_ENABLE
VX_dispatch_if tensor_dummy_fpu_dispatch_if[`ISSUE_WIDTH]();
`endif
VX_dispatch_if tensor_dummy_sfu_dispatch_if[`ISSUE_WIDTH]();
VX_commit_if tensor_commit_if[`ISSUE_WIDTH]();
wire tensor_csr_unlock_valid;
wire [`NW_WIDTH-1:0] tensor_csr_unlock_wid;
wire tensor_tmc_valid;
wire [`NW_WIDTH-1:0] tensor_tmc_wid;
wire [`NUM_THREADS-1:0] tensor_tmc_tmask;
`endif
VX_dispatch_if sfu_dispatch_if[`ISSUE_WIDTH]();
VX_commit_if sfu_commit_if[`ISSUE_WIDTH]();
@@ -105,12 +137,58 @@ module VX_core import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
VX_pipeline_perf_if pipeline_perf_if();
VX_pipeline_perf_if scalar_decode_perf_if();
VX_pipeline_perf_if tensor_decode_perf_if();
VX_pipeline_perf_if scalar_issue_perf_if();
VX_pipeline_perf_if tensor_issue_perf_if();
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
assign pipeline_perf_if.illegal_tensor_reg_access = scalar_decode_perf_if.illegal_tensor_reg_access + tensor_decode_perf_if.illegal_tensor_reg_access;
assign pipeline_perf_if.illegal_tensor_scalar_op = scalar_decode_perf_if.illegal_tensor_scalar_op + tensor_decode_perf_if.illegal_tensor_scalar_op;
assign pipeline_perf_if.illegal_scalar_tensor_op = scalar_decode_perf_if.illegal_scalar_tensor_op + tensor_decode_perf_if.illegal_scalar_tensor_op;
assign pipeline_perf_if.ibf_stalls = scalar_issue_perf_if.ibf_stalls + tensor_issue_perf_if.ibf_stalls;
assign pipeline_perf_if.scb_stalls = scalar_issue_perf_if.scb_stalls + tensor_issue_perf_if.scb_stalls;
assign pipeline_perf_if.scb_any_unit_uses = scalar_issue_perf_if.scb_any_unit_uses + tensor_issue_perf_if.scb_any_unit_uses;
assign pipeline_perf_if.scb_fires = scalar_issue_perf_if.scb_fires + tensor_issue_perf_if.scb_fires;
assign pipeline_perf_if.scb_any_fire_cycles = scalar_issue_perf_if.scb_any_fire_cycles + tensor_issue_perf_if.scb_any_fire_cycles;
assign pipeline_perf_if.dispatch_any_fire_cycles = scalar_issue_perf_if.dispatch_any_fire_cycles + tensor_issue_perf_if.dispatch_any_fire_cycles;
for (genvar perf_i = 0; perf_i < `NUM_EX_UNITS; ++perf_i) begin : g_issue_perf_merge
assign pipeline_perf_if.units_uses[perf_i] = scalar_issue_perf_if.units_uses[perf_i] + tensor_issue_perf_if.units_uses[perf_i];
assign pipeline_perf_if.dispatch_stalls[perf_i] = scalar_issue_perf_if.dispatch_stalls[perf_i] + tensor_issue_perf_if.dispatch_stalls[perf_i];
assign pipeline_perf_if.dispatch_valids[perf_i] = scalar_issue_perf_if.dispatch_valids[perf_i] + tensor_issue_perf_if.dispatch_valids[perf_i];
assign pipeline_perf_if.dispatch_fires[perf_i] = scalar_issue_perf_if.dispatch_fires[perf_i] + tensor_issue_perf_if.dispatch_fires[perf_i];
end
for (genvar perf_sfu_i = 0; perf_sfu_i < `NUM_SFU_UNITS; ++perf_sfu_i) begin : g_issue_sfu_perf_merge
assign pipeline_perf_if.sfu_uses[perf_sfu_i] = scalar_issue_perf_if.sfu_uses[perf_sfu_i] + tensor_issue_perf_if.sfu_uses[perf_sfu_i];
end
`endif
assign decode_sched_if.valid = scalar_decode_sched_if.valid || tensor_decode_sched_if.valid;
assign decode_sched_if.is_wstall = tensor_decode_sched_if.valid ? tensor_decode_sched_if.is_wstall : scalar_decode_sched_if.is_wstall;
assign decode_sched_if.wid = tensor_decode_sched_if.valid ? tensor_decode_sched_if.wid : scalar_decode_sched_if.wid;
`RUNTIME_ASSERT(
!(scalar_decode_sched_if.valid && tensor_decode_sched_if.valid),
("%t: *** core%0d-two-decode-sched-events-same-cycle", $time, CORE_ID)
)
`ifdef EXT_T_ENABLE
for (genvar dummy_i = 0; dummy_i < `ISSUE_WIDTH; ++dummy_i) begin : g_dummy_dispatch_ready
assign scalar_dummy_tensor_alu_dispatch_if[dummy_i].ready = 1'b1;
assign scalar_dummy_tensor_lsu_dispatch_if[dummy_i].ready = 1'b1;
assign scalar_dummy_tensor_ctrl_dispatch_if[dummy_i].ready = 1'b1;
assign scalar_dummy_tensor_dispatch_if[dummy_i].ready = 1'b1;
assign tensor_dummy_alu_dispatch_if[dummy_i].ready = 1'b1;
assign tensor_dummy_lsu_dispatch_if[dummy_i].ready = 1'b1;
`ifdef EXT_F_ENABLE
assign tensor_dummy_fpu_dispatch_if[dummy_i].ready = 1'b1;
`endif
assign tensor_dummy_sfu_dispatch_if[dummy_i].ready = 1'b1;
end
`endif
`RESET_RELAY (dcr_data_reset, reset);
@@ -133,7 +211,8 @@ module VX_core import VX_gpu_pkg::*; #(
`SCOPE_IO_SWITCH (3)
VX_schedule #(
.CORE_ID (CORE_ID)
.CORE_ID (CORE_ID),
.NUM_BRANCHES (2 * `NUM_ALU_BLOCKS)
) schedule (
.clk (clk),
.reset (schedule_reset),
@@ -146,10 +225,18 @@ module VX_core import VX_gpu_pkg::*; #(
.warp_ctl_if (warp_ctl_if),
.branch_ctl_if (branch_ctl_if),
`ifdef EXT_T_ENABLE
.tensor_csr_unlock_valid(tensor_csr_unlock_valid),
.tensor_csr_unlock_wid(tensor_csr_unlock_wid),
.tensor_tmc_valid(tensor_tmc_valid),
.tensor_tmc_wid(tensor_tmc_wid),
.tensor_tmc_tmask(tensor_tmc_tmask),
`endif
.decode_sched_if(decode_sched_if),
.commit_sched_if(commit_sched_if),
.schedule_if (schedule_if),
.scalar_schedule_if (scalar_schedule_if),
.tensor_schedule_if (tensor_schedule_if),
`ifdef GBAR_ENABLE
.gbar_bus_if (gbar_bus_if),
`endif
@@ -165,33 +252,53 @@ module VX_core import VX_gpu_pkg::*; #(
.clk (clk),
.reset (fetch_reset),
.icache_bus_if (icache_bus_if),
.schedule_if (schedule_if),
.fetch_if (fetch_if)
.scalar_schedule_if (scalar_schedule_if),
.tensor_schedule_if (tensor_schedule_if),
.scalar_fetch_if(scalar_fetch_if),
.tensor_fetch_if(tensor_fetch_if)
);
VX_decode #(
.CORE_ID (CORE_ID)
) decode (
) scalar_decode (
.clk (clk),
.reset (decode_reset),
.fetch_if (fetch_if),
.decode_if (decode_if),
.decode_sched_if(decode_sched_if)
.fetch_if (scalar_fetch_if),
.decode_if (scalar_decode_if),
`ifdef PERF_ENABLE
.perf_decode_if (scalar_decode_perf_if.decode),
`endif
.decode_sched_if(scalar_decode_sched_if)
);
VX_decode #(
.CORE_ID (CORE_ID)
) tensor_decode (
.clk (clk),
.reset (decode_reset),
.fetch_if (tensor_fetch_if),
.decode_if (tensor_decode_if),
`ifdef PERF_ENABLE
.perf_decode_if (tensor_decode_perf_if.decode),
`endif
.decode_sched_if(tensor_decode_sched_if)
);
VX_issue #(
.CORE_ID (CORE_ID)
) issue (
.CORE_ID (CORE_ID),
.NUM_TENSOR_CORES (NUM_TENSOR_CORES),
.DOMAIN (WU_DOMAIN_SCALAR)
) scalar_issue (
`SCOPE_IO_BIND (1)
.clk (clk),
.reset (issue_reset),
`ifdef PERF_ENABLE
.perf_issue_if (pipeline_perf_if.issue),
.perf_issue_if (scalar_issue_perf_if.issue),
`endif
.decode_if (decode_if),
.decode_if (scalar_decode_if),
.writeback_if (writeback_if),
.alu_dispatch_if(alu_dispatch_if),
@@ -200,17 +307,57 @@ module VX_core import VX_gpu_pkg::*; #(
.fpu_dispatch_if(fpu_dispatch_if),
`endif
`ifdef EXT_T_ENABLE
.tensor_dispatch_if(tensor_dispatch_if),
.tensor_alu_dispatch_if(scalar_dummy_tensor_alu_dispatch_if),
.tensor_lsu_dispatch_if(scalar_dummy_tensor_lsu_dispatch_if),
.tensor_ctrl_dispatch_if(scalar_dummy_tensor_ctrl_dispatch_if),
.tensor_dispatch_if(scalar_dummy_tensor_dispatch_if),
`ifdef EXT_T_ASYNC
.tensor_regfile_if (tensor_regfile_if),
.tensor_regfile_if (scalar_dummy_tensor_regfile_if),
`endif
`endif
.sfu_dispatch_if(sfu_dispatch_if)
);
`ifdef EXT_T_ENABLE
VX_issue #(
.CORE_ID (CORE_ID),
.NUM_TENSOR_CORES (NUM_TENSOR_CORES),
.DOMAIN (WU_DOMAIN_TENSOR)
) tensor_issue (
`SCOPE_IO_BIND (1)
.clk (clk),
.reset (issue_reset),
`ifdef PERF_ENABLE
.perf_issue_if (tensor_issue_perf_if.issue),
`endif
.decode_if (tensor_decode_if),
.writeback_if (writeback_if),
.alu_dispatch_if(tensor_dummy_alu_dispatch_if),
.lsu_dispatch_if(tensor_dummy_lsu_dispatch_if),
`ifdef EXT_F_ENABLE
.fpu_dispatch_if(tensor_dummy_fpu_dispatch_if),
`endif
.tensor_alu_dispatch_if(tensor_alu_dispatch_if),
.tensor_lsu_dispatch_if(tensor_lsu_dispatch_if),
.tensor_ctrl_dispatch_if(tensor_ctrl_dispatch_if),
.tensor_dispatch_if(tensor_dispatch_if),
`ifdef EXT_T_ASYNC
.tensor_regfile_if (tensor_regfile_if),
`endif
.sfu_dispatch_if(tensor_dummy_sfu_dispatch_if)
);
`else
assign tensor_decode_if.ready = 1'b1;
`endif
VX_execute #(
.CORE_ID (CORE_ID),
.TENSOR_FP16 (TENSOR_FP16)
.TENSOR_FP16 (TENSOR_FP16),
.NUM_TENSOR_CORES (NUM_TENSOR_CORES)
) execute (
`SCOPE_IO_BIND (2)
@@ -223,6 +370,11 @@ module VX_core import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_tmp_if),
.pipeline_perf_if(pipeline_perf_if),
.perf_scalar_lsu_reqs(pipeline_perf_if.scalar_lsu_reqs),
.perf_tensor_lsu_reqs(pipeline_perf_if.tensor_lsu_reqs),
.perf_scalar_lsu_stalls(pipeline_perf_if.scalar_lsu_stalls),
.perf_tensor_lsu_stalls(pipeline_perf_if.tensor_lsu_stalls),
.perf_mem_merge_stalls(pipeline_perf_if.mem_merge_stalls),
`endif
.dcache_bus_if (dcache_bus_tmp_if),
@@ -232,18 +384,32 @@ module VX_core import VX_gpu_pkg::*; #(
.fpu_commit_if (fpu_commit_if),
`endif
`ifdef EXT_T_ENABLE
.tensor_alu_dispatch_if (tensor_alu_dispatch_if),
.tensor_lsu_dispatch_if (tensor_lsu_dispatch_if),
.tensor_ctrl_dispatch_if (tensor_ctrl_dispatch_if),
.tensor_dispatch_if (tensor_dispatch_if),
.tensor_commit_if (tensor_commit_if),
.tensor_csr_unlock_valid(tensor_csr_unlock_valid),
.tensor_csr_unlock_wid(tensor_csr_unlock_wid),
.tensor_tmc_valid(tensor_tmc_valid),
.tensor_tmc_wid(tensor_tmc_wid),
.tensor_tmc_tmask(tensor_tmc_tmask),
`ifdef EXT_T_ASYNC
.tensor_regfile_if (tensor_regfile_if),
.tensor_smem_A_if (tensor_smem_A_if),
.tensor_tmem_C_wen (tensor_tmem_C_wen),
.tensor_tmem_A_ren (tensor_tmem_A_ren),
.tensor_tmem_A_rready(tensor_tmem_A_rready),
.tensor_tmem_A_raddr(tensor_tmem_A_raddr),
.tensor_tmem_A_rdata(tensor_tmem_A_rdata),
.tensor_tmem_C_ren (tensor_tmem_C_ren),
.tensor_tmem_C_waddr(tensor_tmem_C_waddr),
.tensor_tmem_C_rready(tensor_tmem_C_rready),
.tensor_tmem_C_raddr(tensor_tmem_C_raddr),
.tensor_tmem_C_rdata(tensor_tmem_C_rdata),
.tensor_tmem_C_wen (tensor_tmem_C_wen),
.tensor_tmem_C_wready(tensor_tmem_C_wready),
.tensor_tmem_C_waddr(tensor_tmem_C_waddr),
.tensor_tmem_C_wdata(tensor_tmem_C_wdata),
.tensor_tmem_C_mask(tensor_tmem_C_mask),
.tensor_tmem_C_rdata(tensor_tmem_C_rdata),
.tensor_smem_B_if (tensor_smem_B_if),
`endif
`endif
@@ -502,6 +668,13 @@ module VX_core import VX_gpu_pkg::*; #(
// causing the ibuffer to clog.
$display("scheduler stalls: %d cycles (%.2f%%)", pipeline_perf_if.sched_stalls,
$itor(pipeline_perf_if.sched_stalls) / $itor(cycles) * 100.0);
$display("wu scalar scheduler ready cycles: %d", pipeline_perf_if.scalar_sched_ready_cycles);
$display("wu tensor scheduler ready cycles: %d", pipeline_perf_if.tensor_sched_ready_cycles);
$display("wu scalar scheduler issued cycles: %d", pipeline_perf_if.scalar_sched_issued_cycles);
$display("wu tensor scheduler issued cycles: %d", pipeline_perf_if.tensor_sched_issued_cycles);
$display("wu illegal tensor reg accesses: %d", pipeline_perf_if.illegal_tensor_reg_access);
$display("wu illegal tensor scalar ops: %d", pipeline_perf_if.illegal_tensor_scalar_op);
$display("wu illegal scalar tensor ops: %d", pipeline_perf_if.illegal_scalar_tensor_op);
$display("decode stalls (ibuffer not ready): %d cycles (%.2f%%)",pipeline_perf_if.ibf_stalls,
$itor(pipeline_perf_if.ibf_stalls) / $itor(cycles) * 100.0);
// see VX_scoreboard.sv
@@ -564,6 +737,11 @@ module VX_core import VX_gpu_pkg::*; #(
$display("dcache load latency: %f cycles",
$itor(dcache_lat) / $itor(loads));
$display("dcache stores: %d", perf_stores);
$display("wu scalar lsu accepted requests: %d", pipeline_perf_if.scalar_lsu_reqs);
$display("wu tensor lsu accepted requests: %d", pipeline_perf_if.tensor_lsu_reqs);
$display("wu scalar lsu merge stalls: %d", pipeline_perf_if.scalar_lsu_stalls);
$display("wu tensor lsu merge stalls: %d", pipeline_perf_if.tensor_lsu_stalls);
$display("wu memory merge contention cycles: %d", pipeline_perf_if.mem_merge_stalls);
end
end

View File

@@ -163,6 +163,22 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
assign sched_csr_if.unlock_warp = csr_req_valid && csr_req_ready && execute_if.data.eop;
assign sched_csr_if.unlock_wid = execute_if.data.wid;
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
always @(posedge clk) begin
if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)
&& execute_if.valid
&& (execute_if.data.PC == 32'h80000028)) begin
`TRACE(1, ("%d: core%0d-csr-probe: valid=%b ready=%b req_valid=%b req_ready=%b no_pending=%b wid=%0d PC=0x%0h addr=0x%0h wb=%0d rd=%0d sop=%b eop=%b (#%0d)\n",
$time, CORE_ID, execute_if.valid, execute_if.ready,
csr_req_valid, csr_req_ready, no_pending_instr,
execute_if.data.wid, execute_if.data.PC, csr_addr,
execute_if.data.wb, execute_if.data.rd,
execute_if.data.sop, execute_if.data.eop,
execute_if.data.uuid));
end
end
`endif
// send response
wire [NUM_LANES-1:0][31:0] csr_commit_data;

View File

@@ -39,14 +39,15 @@ module VX_decode #(
// outputs
VX_decode_if.master decode_if,
`ifdef PERF_ENABLE
VX_pipeline_perf_if.decode perf_decode_if,
`endif
VX_decode_sched_if.master decode_sched_if
);
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + (`NR_BITS * 4) + `XLEN + 1 + 1;
`UNUSED_PARAM (CORE_ID)
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
reg [`EX_BITS-1:0] ex_type;
reg [`INST_OP_BITS-1:0] op_type;
@@ -488,6 +489,17 @@ module VX_decode #(
`USED_IREG (rs1);
`USED_IREG (rs2);
end
3'h6: begin // WSPAWN_MASK
op_type = `INST_OP_BITS'(`INST_SFU_WSPAWN);
op_mod[0] = 1;
`USED_IREG (rs1);
`USED_IREG (rs2);
end
3'h7: begin // BAR_MASK
op_type = `INST_OP_BITS'(`INST_SFU_BAR_MASK);
`USED_IREG (rs1);
`USED_IREG (rs2);
end
default:;
endcase
end
@@ -620,8 +632,93 @@ module VX_decode #(
endcase
end
wire fetch_is_tensor_warp = fetch_if.data.wid >= `NW_WIDTH'(`NUM_SCALAR_WARPS);
wire fetch_is_scalar_warp = fetch_if.data.wid < `NW_WIDTH'(`NUM_SCALAR_WARPS);
wire fetch_fire = fetch_if.valid && fetch_if.ready;
wire decoded_tensor_inst = (ex_type == `EX_BITS'(`EX_TENSOR));
wire t_reg_hi_rd = use_rd && (rd_r[`NRI_BITS-1:3] != '0);
wire t_reg_hi_rs1 = use_rs1 && (rs1_r[`NRI_BITS-1:3] != '0);
wire t_reg_hi_rs2 = use_rs2 && (rs2_r[`NRI_BITS-1:3] != '0);
wire t_reg_hi_rs3 = use_rs3 && (rs3_r[`NRI_BITS-1:3] != '0);
wire tensor_reg_illegal = fetch_is_tensor_warp && (t_reg_hi_rd || t_reg_hi_rs1 || t_reg_hi_rs2 || t_reg_hi_rs3);
wire scalar_tensor_illegal = fetch_is_scalar_warp && decoded_tensor_inst;
wire tensor_fpu_illegal = fetch_is_tensor_warp && (ex_type == `EX_BITS'(`EX_FPU));
wire tensor_read_csr_allowed = (op_type == `INST_OP_BITS'(`INST_SFU_CSRRS))
&& (rs1_r == `NR_BITS'(0))
&& ((u_12 == `VX_CSR_THREAD_ID)
|| (u_12 == `VX_CSR_WARP_ID)
|| (u_12 == `VX_CSR_CORE_ID)
|| (u_12 == `VX_CSR_MHARTID)
|| (u_12 == `VX_CSR_NUM_THREADS)
|| (u_12 == `VX_CSR_NUM_WARPS)
|| (u_12 == `VX_CSR_NUM_CORES));
wire tensor_sfu_barrier_allowed = (op_type == `INST_OP_BITS'(`INST_SFU_BAR))
|| (op_type == `INST_OP_BITS'(`INST_SFU_BAR_MASK));
wire tensor_sfu_allowed = (op_type == `INST_OP_BITS'(`INST_SFU_TMC))
|| tensor_read_csr_allowed
|| tensor_sfu_barrier_allowed;
wire tensor_sfu_illegal = fetch_is_tensor_warp
&& (ex_type == `EX_BITS'(`EX_SFU))
&& !tensor_sfu_allowed;
wire tensor_complex_alu_illegal = fetch_is_tensor_warp
&& (ex_type == `EX_BITS'(`EX_ALU))
&& (`INST_ALU_IS_M(op_mod) || `INST_ALU_IS_RED(op_mod));
wire tensor_scalar_illegal = tensor_fpu_illegal || tensor_sfu_illegal || tensor_complex_alu_illegal;
wire decode_illegal = tensor_reg_illegal || scalar_tensor_illegal || tensor_scalar_illegal;
wire [`EX_BITS-1:0] emit_ex_type = decode_illegal ? `EX_BITS'(`EX_ALU) : ex_type;
wire [`INST_OP_BITS-1:0] emit_op_type = decode_illegal ? `INST_OP_BITS'(`INST_BR_EBREAK) : op_type;
wire [`INST_MOD_BITS-1:0] emit_op_mod = decode_illegal ? `INST_MOD_BITS'(1) : op_mod;
wire emit_use_PC = decode_illegal ? 1'b1 : use_PC;
wire emit_use_imm = decode_illegal ? 1'b1 : use_imm;
wire [`XLEN-1:0] emit_imm = decode_illegal ? `XLEN'(0) : imm;
wire [`NR_BITS-1:0] emit_rd = decode_illegal ? `NR_BITS'(0) : rd_r;
wire [`NR_BITS-1:0] emit_rs1 = decode_illegal ? `NR_BITS'(0) : rs1_r;
wire [`NR_BITS-1:0] emit_rs2 = decode_illegal ? `NR_BITS'(0) : rs2_r;
wire [`NR_BITS-1:0] emit_rs3 = decode_illegal ? `NR_BITS'(0) : rs3_r;
`RUNTIME_ASSERT(
!fetch_if.valid || !tensor_reg_illegal,
("%t: *** core%0d-decode-illegal-tensor-reg: wid=%0d PC=0x%0h instr=0x%0h ex=%0d op=%0d rd=%0d rs1=%0d rs2=%0d rs3=%0d",
$time, CORE_ID, fetch_if.data.wid, fetch_if.data.PC, fetch_if.data.instr, ex_type, op_type, rd, rs1, rs2, rs3)
)
`RUNTIME_ASSERT(
!fetch_if.valid || !scalar_tensor_illegal,
("%t: *** core%0d-decode-illegal-scalar-tensor-op: wid=%0d PC=0x%0h instr=0x%0h ex=%0d op=%0d",
$time, CORE_ID, fetch_if.data.wid, fetch_if.data.PC, fetch_if.data.instr, ex_type, op_type)
)
`RUNTIME_ASSERT(
!fetch_if.valid || !tensor_scalar_illegal,
("%t: *** core%0d-decode-illegal-tensor-scalar-op: wid=%0d PC=0x%0h instr=0x%0h ex=%0d op=%0d mod=%0d",
$time, CORE_ID, fetch_if.data.wid, fetch_if.data.PC, fetch_if.data.instr, ex_type, op_type, op_mod)
)
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_illegal_tensor_reg_access;
reg [`PERF_CTR_BITS-1:0] perf_illegal_tensor_scalar_op;
reg [`PERF_CTR_BITS-1:0] perf_illegal_scalar_tensor_op;
always @(posedge clk) begin
if (reset) begin
perf_illegal_tensor_reg_access <= '0;
perf_illegal_tensor_scalar_op <= '0;
perf_illegal_scalar_tensor_op <= '0;
end else if (fetch_fire) begin
perf_illegal_tensor_reg_access <= perf_illegal_tensor_reg_access + `PERF_CTR_BITS'(tensor_reg_illegal);
perf_illegal_tensor_scalar_op <= perf_illegal_tensor_scalar_op + `PERF_CTR_BITS'(tensor_scalar_illegal);
perf_illegal_scalar_tensor_op <= perf_illegal_scalar_tensor_op + `PERF_CTR_BITS'(scalar_tensor_illegal);
end
end
assign perf_decode_if.illegal_tensor_reg_access = perf_illegal_tensor_reg_access;
assign perf_decode_if.illegal_tensor_scalar_op = perf_illegal_tensor_scalar_op;
assign perf_decode_if.illegal_scalar_tensor_op = perf_illegal_scalar_tensor_op;
`endif
// disable write to integer register r0
wire wb = use_rd && (rd_r != 0);
wire wb = !decode_illegal && use_rd && (rd_r != 0);
VX_elastic_buffer #(
.DATAW (DATAW),
@@ -631,7 +728,7 @@ module VX_decode #(
.reset (reset),
.valid_in (fetch_if.valid),
.ready_in (fetch_if.ready),
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_mod, use_PC, imm, use_imm, wb, rd_r, rs1_r, rs2_r, rs3_r}),
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, emit_ex_type, emit_op_type, emit_op_mod, emit_use_PC, emit_imm, emit_use_imm, wb, emit_rd, emit_rs1, emit_rs2, emit_rs3}),
.data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_mod, decode_if.data.use_PC, decode_if.data.imm, decode_if.data.use_imm, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3}),
.valid_out (decode_if.valid),
.ready_out (decode_if.ready)
@@ -639,11 +736,9 @@ module VX_decode #(
///////////////////////////////////////////////////////////////////////////
wire fetch_fire = fetch_if.valid && fetch_if.ready;
assign decode_sched_if.valid = fetch_fire;
assign decode_sched_if.wid = fetch_if.data.wid;
assign decode_sched_if.is_wstall = is_wstall;
assign decode_sched_if.is_wstall = is_wstall || decode_illegal;
`ifndef L1_ENABLE
assign fetch_if.ibuf_pop = decode_if.ibuf_pop;
`endif

View File

@@ -15,7 +15,8 @@
`include "VX_trace.vh"
module VX_dispatch import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter CORE_ID = 0,
parameter DOMAIN = WU_DOMAIN_SCALAR
) (
input wire clk,
input wire reset,
@@ -36,11 +37,15 @@ module VX_dispatch import VX_gpu_pkg::*; #(
VX_dispatch_if.master fpu_dispatch_if [`ISSUE_WIDTH],
`endif
`ifdef EXT_T_ENABLE
VX_dispatch_if.master tensor_alu_dispatch_if [`ISSUE_WIDTH],
VX_dispatch_if.master tensor_lsu_dispatch_if [`ISSUE_WIDTH],
VX_dispatch_if.master tensor_ctrl_dispatch_if [`ISSUE_WIDTH],
VX_dispatch_if.master tensor_dispatch_if [`ISSUE_WIDTH],
`endif
VX_dispatch_if.master sfu_dispatch_if [`ISSUE_WIDTH]
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_PARAM (DOMAIN)
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + (3 * `NUM_THREADS * `XLEN) + `NT_WIDTH;
@@ -68,8 +73,29 @@ module VX_dispatch import VX_gpu_pkg::*; #(
VX_operands_if alu_operands_if[`ISSUE_WIDTH]();
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] operands_wid;
wire [`ISSUE_WIDTH-1:0] operands_is_tensor;
wire [`ISSUE_WIDTH-1:0] tensor_alu_allowed;
wire [`ISSUE_WIDTH-1:0] tensor_ctrl_allowed;
wire [`ISSUE_WIDTH-1:0] tensor_wctl_allowed;
wire [`ISSUE_WIDTH-1:0] tensor_sfu_allowed;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign alu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_ALU);
assign operands_wid[i] = wis_to_wid(operands_if[i].data.wis, ISSUE_ISW_W'(i));
assign operands_is_tensor[i] = operands_wid[i] >= `NW_WIDTH'(`NUM_SCALAR_WARPS);
assign tensor_alu_allowed[i] = !`INST_ALU_IS_M(operands_if[i].data.op_mod)
&& !`INST_ALU_IS_RED(operands_if[i].data.op_mod);
assign tensor_ctrl_allowed[i] = (operands_if[i].data.op_type == `INST_SFU_TMC)
|| (operands_if[i].data.op_type == `INST_SFU_CSRRS)
|| (operands_if[i].data.op_type == `INST_SFU_BAR)
|| (operands_if[i].data.op_type == `INST_SFU_BAR_MASK);
assign tensor_wctl_allowed[i] = (operands_if[i].data.op_type == `INST_SFU_BAR)
|| (operands_if[i].data.op_type == `INST_SFU_BAR_MASK);
assign tensor_sfu_allowed[i] = tensor_ctrl_allowed[i] || tensor_wctl_allowed[i];
end
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign alu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_ALU) && !operands_is_tensor[i];
assign alu_operands_if[i].data = operands_if[i].data;
`RESET_RELAY (alu_reset, reset);
@@ -90,12 +116,43 @@ module VX_dispatch import VX_gpu_pkg::*; #(
);
end
`ifdef EXT_T_ENABLE
// Tensor INT/control dispatch
VX_operands_if tensor_alu_operands_if[`ISSUE_WIDTH]();
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign tensor_alu_operands_if[i].valid = operands_if[i].valid
&& (operands_if[i].data.ex_type == `EX_ALU)
&& operands_is_tensor[i]
&& tensor_alu_allowed[i];
assign tensor_alu_operands_if[i].data = operands_if[i].data;
`RESET_RELAY (tensor_alu_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (2)
) tensor_alu_buffer (
.clk (clk),
.reset (tensor_alu_reset),
.valid_in (tensor_alu_operands_if[i].valid),
.ready_in (tensor_alu_operands_if[i].ready),
.data_in (`TO_DISPATCH_DATA(tensor_alu_operands_if[i].data, last_active_tid[i])),
.data_out (tensor_alu_dispatch_if[i].data),
.valid_out (tensor_alu_dispatch_if[i].valid),
.ready_out (tensor_alu_dispatch_if[i].ready)
);
end
`endif
// LSU dispatch
VX_operands_if lsu_operands_if[`ISSUE_WIDTH]();
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign lsu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_LSU);
assign lsu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_LSU) && !operands_is_tensor[i];
assign lsu_operands_if[i].data = operands_if[i].data;
`RESET_RELAY (lsu_reset, reset);
@@ -116,6 +173,34 @@ module VX_dispatch import VX_gpu_pkg::*; #(
);
end
`ifdef EXT_T_ENABLE
// Tensor LSU dispatch
VX_operands_if tensor_lsu_operands_if[`ISSUE_WIDTH]();
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign tensor_lsu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_LSU) && operands_is_tensor[i];
assign tensor_lsu_operands_if[i].data = operands_if[i].data;
`RESET_RELAY (tensor_lsu_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (2)
) tensor_lsu_buffer (
.clk (clk),
.reset (tensor_lsu_reset),
.valid_in (tensor_lsu_operands_if[i].valid),
.ready_in (tensor_lsu_operands_if[i].ready),
.data_in (`TO_DISPATCH_DATA(tensor_lsu_operands_if[i].data, last_active_tid[i])),
.data_out (tensor_lsu_dispatch_if[i].data),
.valid_out (tensor_lsu_dispatch_if[i].valid),
.ready_out (tensor_lsu_dispatch_if[i].ready)
);
end
`endif
// FPU dispatch
`ifdef EXT_F_ENABLE
@@ -123,7 +208,7 @@ module VX_dispatch import VX_gpu_pkg::*; #(
VX_operands_if fpu_operands_if[`ISSUE_WIDTH]();
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign fpu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_FPU);
assign fpu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_FPU) && !operands_is_tensor[i];
assign fpu_operands_if[i].data = operands_if[i].data;
`RESET_RELAY (fpu_reset, reset);
@@ -152,7 +237,9 @@ module VX_dispatch import VX_gpu_pkg::*; #(
VX_operands_if tensor_operands_if[`ISSUE_WIDTH]();
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign tensor_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_TENSOR);
assign tensor_operands_if[i].valid = operands_if[i].valid
&& (operands_if[i].data.ex_type == `EX_TENSOR)
&& operands_is_tensor[i];
assign tensor_operands_if[i].data = operands_if[i].data;
`RESET_RELAY (tensor_reset, reset);
@@ -174,12 +261,45 @@ module VX_dispatch import VX_gpu_pkg::*; #(
end
`endif
`ifdef EXT_T_ENABLE
// Tensor control dispatch
VX_operands_if tensor_ctrl_operands_if[`ISSUE_WIDTH]();
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign tensor_ctrl_operands_if[i].valid = operands_if[i].valid
&& (operands_if[i].data.ex_type == `EX_SFU)
&& operands_is_tensor[i]
&& tensor_ctrl_allowed[i];
assign tensor_ctrl_operands_if[i].data = operands_if[i].data;
`RESET_RELAY (tensor_ctrl_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (2)
) tensor_ctrl_buffer (
.clk (clk),
.reset (tensor_ctrl_reset),
.valid_in (tensor_ctrl_operands_if[i].valid),
.ready_in (tensor_ctrl_operands_if[i].ready),
.data_in (`TO_DISPATCH_DATA(tensor_ctrl_operands_if[i].data, last_active_tid[i])),
.data_out (tensor_ctrl_dispatch_if[i].data),
.valid_out (tensor_ctrl_dispatch_if[i].valid),
.ready_out (tensor_ctrl_dispatch_if[i].ready)
);
end
`endif
// SFU dispatch
VX_operands_if sfu_operands_if[`ISSUE_WIDTH]();
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign sfu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_SFU);
assign sfu_operands_if[i].valid = operands_if[i].valid
&& (operands_if[i].data.ex_type == `EX_SFU)
&& !operands_is_tensor[i];
assign sfu_operands_if[i].data = operands_if[i].data;
`RESET_RELAY (sfu_reset, reset);
@@ -202,17 +322,46 @@ module VX_dispatch import VX_gpu_pkg::*; #(
// can take next request?
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign operands_if[i].ready = (alu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_ALU))
|| (lsu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_LSU))
assign operands_if[i].ready = (alu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_ALU) && !operands_is_tensor[i])
|| (lsu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_LSU) && !operands_is_tensor[i])
`ifdef EXT_F_ENABLE
|| (fpu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_FPU))
|| (fpu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_FPU) && !operands_is_tensor[i])
`endif
`ifdef EXT_T_ENABLE
|| (tensor_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_TENSOR))
|| (tensor_alu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_ALU) && operands_is_tensor[i] && tensor_alu_allowed[i])
|| (tensor_lsu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_LSU) && operands_is_tensor[i])
|| (tensor_ctrl_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU) && operands_is_tensor[i] && tensor_ctrl_allowed[i])
|| (tensor_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_TENSOR) && operands_is_tensor[i])
`endif
|| (sfu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU));
|| (sfu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU) && !operands_is_tensor[i]);
end
`ifdef SIMULATION
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
`RUNTIME_ASSERT(
!(operands_if[i].valid && (operands_if[i].data.ex_type == `EX_TENSOR)) || operands_is_tensor[i],
("%t: *** core%0d-dispatch-illegal-scalar-tensor-op: wid=%0d PC=0x%0h op=0x%0h (#%0d)",
$time, CORE_ID, operands_wid[i], operands_if[i].data.PC, operands_if[i].data.op_type, operands_if[i].data.uuid)
)
`RUNTIME_ASSERT(
!(operands_if[i].valid && operands_is_tensor[i] && (operands_if[i].data.ex_type == `EX_FPU)),
("%t: *** core%0d-dispatch-illegal-tensor-fpu-op: wid=%0d PC=0x%0h op=0x%0h (#%0d)",
$time, CORE_ID, operands_wid[i], operands_if[i].data.PC, operands_if[i].data.op_type, operands_if[i].data.uuid)
)
`RUNTIME_ASSERT(
!(operands_if[i].valid && operands_is_tensor[i] && (operands_if[i].data.ex_type == `EX_SFU) && !tensor_sfu_allowed[i]),
("%t: *** core%0d-dispatch-illegal-tensor-sfu-op: wid=%0d PC=0x%0h op=0x%0h (#%0d)",
$time, CORE_ID, operands_wid[i], operands_if[i].data.PC, operands_if[i].data.op_type, operands_if[i].data.uuid)
)
`RUNTIME_ASSERT(
!(operands_if[i].valid && operands_is_tensor[i] && (operands_if[i].data.ex_type == `EX_ALU)
&& (`INST_ALU_IS_M(operands_if[i].data.op_mod) || `INST_ALU_IS_RED(operands_if[i].data.op_mod))),
("%t: *** core%0d-dispatch-illegal-tensor-complex-alu-op: wid=%0d PC=0x%0h op=0x%0h mod=0x%0h (#%0d)",
$time, CORE_ID, operands_wid[i], operands_if[i].data.PC, operands_if[i].data.op_type, operands_if[i].data.op_mod, operands_if[i].data.uuid)
)
end
`endif
`ifdef PERF_ENABLE
wire [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_stalls_per_cycle_r;
wire [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_valids_per_cycle_r;
@@ -309,6 +458,16 @@ module VX_dispatch import VX_gpu_pkg::*; #(
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
always @(posedge clk) begin
if (!reset && ($time > `TRACE_STARTTIME)) begin
if ((CORE_ID == 0)
&& alu_dispatch_if[i].valid
&& ((alu_dispatch_if[i].data.PC == 32'h80000010) || (alu_dispatch_if[i].data.PC == 32'h80000014))) begin
`TRACE(1, ("%d: core%0d-alu-dispatch-buffer: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, op=0x%0h, mod=%0d, wb=%0d, rd=%0d (#%0d)\n",
$time, CORE_ID, i, alu_dispatch_if[i].valid, alu_dispatch_if[i].ready,
wis_to_wid(alu_dispatch_if[i].data.wis, i), alu_dispatch_if[i].data.PC,
alu_dispatch_if[i].data.op_type, alu_dispatch_if[i].data.op_mod,
alu_dispatch_if[i].data.wb, alu_dispatch_if[i].data.rd,
alu_dispatch_if[i].data.uuid));
end
if (operands_if[i].valid && operands_if[i].ready) begin
`TRACE(1, ("%d: core%0d-issue: wid=%0d, PC=0x%0h, ex=", $time, CORE_ID, wis_to_wid(operands_if[i].data.wis, i), operands_if[i].data.PC));
trace_ex_type(1, operands_if[i].data.ex_type);

View File

@@ -17,6 +17,7 @@ module VX_dispatch_unit_sane import VX_gpu_pkg::*; #(
parameter BLOCK_SIZE = 1,
parameter NUM_LANES = 1,
parameter OUT_REG = 0,
parameter ISW_BASE = 0,
parameter MAX_FANOUT = `MAX_FANOUT
) (
input wire clk,
@@ -234,7 +235,8 @@ module VX_dispatch_unit_sane import VX_gpu_pkg::*; #(
`RESET_RELAY(buf_out_reset, reset);
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw);
wire [ISSUE_ISW_W-1:0] routed_isw = isw + ISSUE_ISW_W'(ISW_BASE);
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], routed_isw);
VX_elastic_buffer #(
.DATAW (OUT_DATAW),

View File

@@ -15,7 +15,8 @@
module VX_execute import VX_gpu_pkg::*; #(
parameter CORE_ID = 0,
parameter TENSOR_FP16 = 0
parameter TENSOR_FP16 = 0,
parameter NUM_TENSOR_CORES = `NUM_TENSOR_WARPS
) (
`SCOPE_IO_DECL
@@ -37,6 +38,11 @@ module VX_execute import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
output wire [`PERF_CTR_BITS-1:0] perf_scalar_lsu_reqs,
output wire [`PERF_CTR_BITS-1:0] perf_tensor_lsu_reqs,
output wire [`PERF_CTR_BITS-1:0] perf_scalar_lsu_stalls,
output wire [`PERF_CTR_BITS-1:0] perf_tensor_lsu_stalls,
output wire [`PERF_CTR_BITS-1:0] perf_mem_merge_stalls,
`endif
`ifdef EXT_F_ENABLE
@@ -46,7 +52,7 @@ module VX_execute import VX_gpu_pkg::*; #(
VX_dispatch_if.slave alu_dispatch_if [`ISSUE_WIDTH],
VX_commit_if.master alu_commit_if [`ISSUE_WIDTH],
VX_branch_ctl_if.master branch_ctl_if [`NUM_ALU_BLOCKS],
VX_branch_ctl_if.master branch_ctl_if [2 * `NUM_ALU_BLOCKS],
VX_dispatch_if.slave lsu_dispatch_if [`ISSUE_WIDTH],
VX_commit_if.master lsu_commit_if [`ISSUE_WIDTH],
@@ -56,19 +62,33 @@ module VX_execute import VX_gpu_pkg::*; #(
VX_warp_ctl_if.master warp_ctl_if,
`ifdef EXT_T_ENABLE
VX_dispatch_if.slave tensor_alu_dispatch_if [`ISSUE_WIDTH],
VX_dispatch_if.slave tensor_lsu_dispatch_if [`ISSUE_WIDTH],
VX_dispatch_if.slave tensor_ctrl_dispatch_if [`ISSUE_WIDTH],
VX_dispatch_if.slave tensor_dispatch_if [`ISSUE_WIDTH],
VX_commit_if.master tensor_commit_if [`ISSUE_WIDTH],
output wire tensor_csr_unlock_valid,
output wire [`NW_WIDTH-1:0] tensor_csr_unlock_wid,
output wire tensor_tmc_valid,
output wire [`NW_WIDTH-1:0] tensor_tmc_wid,
output wire [`NUM_THREADS-1:0] tensor_tmc_tmask,
`ifdef EXT_T_ASYNC
VX_tc_rf_if.master tensor_regfile_if,
VX_tc_bus_if.master tensor_smem_A_if,
output logic tensor_tmem_C_wen,
output logic tensor_tmem_C_ren,
output logic [8:0] tensor_tmem_C_waddr,
output logic [8:0] tensor_tmem_C_raddr,
output logic [`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_wdata,
output logic [`NUM_THREADS*`XLEN/8-1:0] tensor_tmem_C_mask,
input logic [`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_rdata,
VX_tc_bus_if.master tensor_smem_B_if,
VX_tc_rf_if.master tensor_regfile_if[NUM_TENSOR_CORES],
VX_tc_bus_if.master tensor_smem_A_if[NUM_TENSOR_CORES],
output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_A_ren,
input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_A_rready,
output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_A_raddr,
input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_A_rdata,
output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_ren,
input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_rready,
output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_C_raddr,
input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_rdata,
output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_wen,
input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_wready,
output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_C_waddr,
output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_wdata,
output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN/8-1:0] tensor_tmem_C_mask,
VX_tc_bus_if.master tensor_smem_B_if[NUM_TENSOR_CORES],
`endif
`endif
@@ -83,23 +103,196 @@ module VX_execute import VX_gpu_pkg::*; #(
`ifdef EXT_F_ENABLE
VX_fpu_to_csr_if fpu_to_csr_if[`NUM_FPU_BLOCKS]();
`endif
`ifdef EXT_T_ENABLE
VX_warp_ctl_if scalar_warp_ctl_if();
VX_warp_ctl_if tensor_warp_ctl_if();
localparam WARP_CTL_DATAW = `NW_WIDTH + $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t);
wire [WARP_CTL_DATAW-1:0] scalar_warp_ctl_data_in;
wire [WARP_CTL_DATAW-1:0] tensor_warp_ctl_data_in;
reg [WARP_CTL_DATAW-1:0] scalar_warp_ctl_data_r;
reg [WARP_CTL_DATAW-1:0] tensor_warp_ctl_data_r;
reg scalar_warp_ctl_valid_r;
reg tensor_warp_ctl_valid_r;
reg warp_ctl_rr;
assign scalar_warp_ctl_data_in = {scalar_warp_ctl_if.wid, scalar_warp_ctl_if.tmc, scalar_warp_ctl_if.wspawn, scalar_warp_ctl_if.split, scalar_warp_ctl_if.sjoin, scalar_warp_ctl_if.barrier};
assign tensor_warp_ctl_data_in = {tensor_warp_ctl_if.wid, tensor_warp_ctl_if.tmc, tensor_warp_ctl_if.wspawn, tensor_warp_ctl_if.split, tensor_warp_ctl_if.sjoin, tensor_warp_ctl_if.barrier};
wire scalar_warp_ctl_candidate_valid = scalar_warp_ctl_valid_r || scalar_warp_ctl_if.valid;
wire tensor_warp_ctl_candidate_valid = tensor_warp_ctl_valid_r || tensor_warp_ctl_if.valid;
wire select_tensor_warp_ctl = tensor_warp_ctl_candidate_valid && (!scalar_warp_ctl_candidate_valid || warp_ctl_rr);
wire [WARP_CTL_DATAW-1:0] scalar_warp_ctl_data_out = scalar_warp_ctl_valid_r ? scalar_warp_ctl_data_r : scalar_warp_ctl_data_in;
wire [WARP_CTL_DATAW-1:0] tensor_warp_ctl_data_out = tensor_warp_ctl_valid_r ? tensor_warp_ctl_data_r : tensor_warp_ctl_data_in;
wire [WARP_CTL_DATAW-1:0] selected_warp_ctl_data = select_tensor_warp_ctl ? tensor_warp_ctl_data_out : scalar_warp_ctl_data_out;
wire consume_scalar_warp_ctl_pending = !select_tensor_warp_ctl && scalar_warp_ctl_valid_r;
wire consume_scalar_warp_ctl_input = !select_tensor_warp_ctl && !scalar_warp_ctl_valid_r && scalar_warp_ctl_if.valid;
wire consume_tensor_warp_ctl_pending = select_tensor_warp_ctl && tensor_warp_ctl_valid_r;
wire consume_tensor_warp_ctl_input = select_tensor_warp_ctl && !tensor_warp_ctl_valid_r && tensor_warp_ctl_if.valid;
assign warp_ctl_if.valid = scalar_warp_ctl_candidate_valid || tensor_warp_ctl_candidate_valid;
assign {warp_ctl_if.wid, warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.sjoin, warp_ctl_if.barrier} = selected_warp_ctl_data;
always @(posedge clk) begin
if (reset) begin
scalar_warp_ctl_valid_r <= 1'b0;
tensor_warp_ctl_valid_r <= 1'b0;
warp_ctl_rr <= 1'b0;
end else begin
if (scalar_warp_ctl_candidate_valid && tensor_warp_ctl_candidate_valid) begin
warp_ctl_rr <= !select_tensor_warp_ctl;
end
if (scalar_warp_ctl_valid_r) begin
if (consume_scalar_warp_ctl_pending) begin
scalar_warp_ctl_valid_r <= scalar_warp_ctl_if.valid;
scalar_warp_ctl_data_r <= scalar_warp_ctl_data_in;
end
end else if (scalar_warp_ctl_if.valid && !consume_scalar_warp_ctl_input) begin
scalar_warp_ctl_valid_r <= 1'b1;
scalar_warp_ctl_data_r <= scalar_warp_ctl_data_in;
end
if (tensor_warp_ctl_valid_r) begin
if (consume_tensor_warp_ctl_pending) begin
tensor_warp_ctl_valid_r <= tensor_warp_ctl_if.valid;
tensor_warp_ctl_data_r <= tensor_warp_ctl_data_in;
end
end else if (tensor_warp_ctl_if.valid && !consume_tensor_warp_ctl_input) begin
tensor_warp_ctl_valid_r <= 1'b1;
tensor_warp_ctl_data_r <= tensor_warp_ctl_data_in;
end
end
end
`RUNTIME_ASSERT(
!(scalar_warp_ctl_valid_r && scalar_warp_ctl_if.valid && !consume_scalar_warp_ctl_pending),
("%t: *** core%0d-scalar-warp-ctl-merge-overflow", $time, CORE_ID)
)
`RUNTIME_ASSERT(
!(tensor_warp_ctl_valid_r && tensor_warp_ctl_if.valid && !consume_tensor_warp_ctl_pending),
("%t: *** core%0d-tensor-warp-ctl-merge-overflow", $time, CORE_ID)
)
`endif
`RESET_RELAY (alu_reset, reset);
`RESET_RELAY (lsu_reset, reset);
`RESET_RELAY (sfu_reset, reset);
VX_commit_if alu_scalar_commit_if[`ISSUE_WIDTH]();
VX_alu_unit #(
.CORE_ID (CORE_ID)
) alu_unit (
.clk (clk),
.reset (alu_reset),
.dispatch_if (alu_dispatch_if),
.branch_ctl_if (branch_ctl_if),
.commit_if (alu_commit_if)
.branch_ctl_if (branch_ctl_if[0 +: `NUM_ALU_BLOCKS]),
.commit_if (alu_scalar_commit_if)
);
`ifdef EXT_T_ENABLE
VX_commit_if alu_tensor_commit_if[`ISSUE_WIDTH]();
`RESET_RELAY (tensor_alu_reset, reset);
VX_alu_unit #(
.CORE_ID (CORE_ID)
) tensor_alu_unit (
.clk (clk),
.reset (tensor_alu_reset),
.dispatch_if (tensor_alu_dispatch_if),
.branch_ctl_if (branch_ctl_if[`NUM_ALU_BLOCKS +: `NUM_ALU_BLOCKS]),
.commit_if (alu_tensor_commit_if)
);
localparam ALU_COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_alu_domain_commit
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (ALU_COMMIT_DATAW),
.ARBITER ("R"),
.OUT_REG (1)
) alu_commit_arb (
.clk (clk),
.reset (reset),
.valid_in ({alu_tensor_commit_if[i].valid, alu_scalar_commit_if[i].valid}),
.ready_in ({alu_tensor_commit_if[i].ready, alu_scalar_commit_if[i].ready}),
.data_in ({alu_tensor_commit_if[i].data, alu_scalar_commit_if[i].data}),
.data_out (alu_commit_if[i].data),
.valid_out (alu_commit_if[i].valid),
.ready_out (alu_commit_if[i].ready),
`UNUSED_PIN (sel_out)
);
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
always @(posedge clk) begin
if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)) begin
if (alu_scalar_commit_if[i].valid
&& ((alu_scalar_commit_if[i].data.PC == 32'h80000010) || (alu_scalar_commit_if[i].data.PC == 32'h80000014))) begin
`TRACE(1, ("%d: core%0d-execute-alu-scalar-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
$time, CORE_ID, i, alu_scalar_commit_if[i].valid, alu_scalar_commit_if[i].ready,
alu_scalar_commit_if[i].data.wid, alu_scalar_commit_if[i].data.PC,
alu_scalar_commit_if[i].data.wb, alu_scalar_commit_if[i].data.rd,
alu_scalar_commit_if[i].data.sop, alu_scalar_commit_if[i].data.eop,
alu_scalar_commit_if[i].data.uuid));
end
if (alu_tensor_commit_if[i].valid
&& ((alu_tensor_commit_if[i].data.PC == 32'h80000010) || (alu_tensor_commit_if[i].data.PC == 32'h80000014))) begin
`TRACE(1, ("%d: core%0d-execute-alu-tensor-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
$time, CORE_ID, i, alu_tensor_commit_if[i].valid, alu_tensor_commit_if[i].ready,
alu_tensor_commit_if[i].data.wid, alu_tensor_commit_if[i].data.PC,
alu_tensor_commit_if[i].data.wb, alu_tensor_commit_if[i].data.rd,
alu_tensor_commit_if[i].data.sop, alu_tensor_commit_if[i].data.eop,
alu_tensor_commit_if[i].data.uuid));
end
if (alu_commit_if[i].valid
&& ((alu_commit_if[i].data.PC == 32'h80000010) || (alu_commit_if[i].data.PC == 32'h80000014))) begin
`TRACE(1, ("%d: core%0d-execute-alu-domain-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
$time, CORE_ID, i, alu_commit_if[i].valid, alu_commit_if[i].ready,
alu_commit_if[i].data.wid, alu_commit_if[i].data.PC,
alu_commit_if[i].data.wb, alu_commit_if[i].data.rd,
alu_commit_if[i].data.sop, alu_commit_if[i].data.eop,
alu_commit_if[i].data.uuid));
end
end
end
`endif
end
`else
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_alu_commit_passthru
assign alu_commit_if[i].valid = alu_scalar_commit_if[i].valid;
assign alu_commit_if[i].data = alu_scalar_commit_if[i].data;
assign alu_scalar_commit_if[i].ready = alu_commit_if[i].ready;
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
always @(posedge clk) begin
if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)) begin
if (alu_commit_if[i].valid
&& ((alu_commit_if[i].data.PC == 32'h80000010) || (alu_commit_if[i].data.PC == 32'h80000014))) begin
`TRACE(1, ("%d: core%0d-execute-alu-domain-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
$time, CORE_ID, i, alu_commit_if[i].valid, alu_commit_if[i].ready,
alu_commit_if[i].data.wid, alu_commit_if[i].data.PC,
alu_commit_if[i].data.wb, alu_commit_if[i].data.rd,
alu_commit_if[i].data.sop, alu_commit_if[i].data.eop,
alu_commit_if[i].data.uuid));
end
end
end
`endif
end
`endif
`SCOPE_IO_SWITCH (1)
VX_commit_if lsu_scalar_commit_if[`ISSUE_WIDTH]();
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) scalar_lsu_bus_if[DCACHE_NUM_REQS]();
VX_lsu_unit #(
.CORE_ID (CORE_ID)
) lsu_unit (
@@ -107,11 +300,184 @@ module VX_execute import VX_gpu_pkg::*; #(
.clk (clk),
.reset (lsu_reset),
.downstream_mem_busy (downstream_mem_busy),
.cache_bus_if (dcache_bus_if),
.cache_bus_if (scalar_lsu_bus_if),
.dispatch_if (lsu_dispatch_if),
.commit_if (lsu_commit_if)
.commit_if (lsu_scalar_commit_if)
);
`ifdef EXT_T_ENABLE
VX_commit_if lsu_tensor_commit_if[`ISSUE_WIDTH]();
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) tensor_lsu_bus_if[DCACHE_NUM_REQS]();
`RESET_RELAY (tensor_lsu_reset, reset);
VX_lsu_unit #(
.CORE_ID (CORE_ID)
) tensor_lsu_unit (
`SCOPE_IO_BIND (0)
.clk (clk),
.reset (tensor_lsu_reset),
.downstream_mem_busy (downstream_mem_busy),
.cache_bus_if (tensor_lsu_bus_if),
.dispatch_if (tensor_lsu_dispatch_if),
.commit_if (lsu_tensor_commit_if)
);
localparam LSU_COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_lsu_domain_commit
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (LSU_COMMIT_DATAW),
.ARBITER ("R"),
.OUT_REG (1)
) lsu_commit_arb (
.clk (clk),
.reset (reset),
.valid_in ({lsu_tensor_commit_if[i].valid, lsu_scalar_commit_if[i].valid}),
.ready_in ({lsu_tensor_commit_if[i].ready, lsu_scalar_commit_if[i].ready}),
.data_in ({lsu_tensor_commit_if[i].data, lsu_scalar_commit_if[i].data}),
.data_out (lsu_commit_if[i].data),
.valid_out (lsu_commit_if[i].valid),
.ready_out (lsu_commit_if[i].ready),
`UNUSED_PIN (sel_out)
);
end
wire scalar_lsu_req_any;
wire tensor_lsu_req_any;
wire [DCACHE_NUM_REQS-1:0] scalar_lsu_req_valids;
wire [DCACHE_NUM_REQS-1:0] tensor_lsu_req_valids;
wire [DCACHE_NUM_REQS-1:0] lsu_req_fires;
wire [DCACHE_NUM_REQS-1:0] lsu_rd_req_fires;
wire [DCACHE_NUM_REQS-1:0] lsu_rsp_fires;
reg lsu_domain_rr;
reg lsu_active_domain;
reg [15:0] lsu_pending_reads;
logic lsu_select_tensor;
logic [`CLOG2(DCACHE_NUM_REQS+1)-1:0] lsu_rd_req_fire_count;
logic [`CLOG2(DCACHE_NUM_REQS+1)-1:0] lsu_rsp_fire_count;
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin : g_lsu_domain_mem
assign scalar_lsu_req_valids[i] = scalar_lsu_bus_if[i].req_valid;
assign tensor_lsu_req_valids[i] = tensor_lsu_bus_if[i].req_valid;
assign dcache_bus_if[i].req_valid = lsu_select_tensor ? tensor_lsu_bus_if[i].req_valid : scalar_lsu_bus_if[i].req_valid;
assign dcache_bus_if[i].req_data = lsu_select_tensor ? tensor_lsu_bus_if[i].req_data : scalar_lsu_bus_if[i].req_data;
assign scalar_lsu_bus_if[i].req_ready = !lsu_select_tensor && dcache_bus_if[i].req_ready;
assign tensor_lsu_bus_if[i].req_ready = lsu_select_tensor && dcache_bus_if[i].req_ready;
assign scalar_lsu_bus_if[i].rsp_valid = !lsu_active_domain && dcache_bus_if[i].rsp_valid;
assign scalar_lsu_bus_if[i].rsp_data = dcache_bus_if[i].rsp_data;
assign tensor_lsu_bus_if[i].rsp_valid = lsu_active_domain && dcache_bus_if[i].rsp_valid;
assign tensor_lsu_bus_if[i].rsp_data = dcache_bus_if[i].rsp_data;
assign dcache_bus_if[i].rsp_ready = lsu_active_domain ? tensor_lsu_bus_if[i].rsp_ready : scalar_lsu_bus_if[i].rsp_ready;
assign lsu_req_fires[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready;
assign lsu_rd_req_fires[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && !dcache_bus_if[i].req_data.rw;
assign lsu_rsp_fires[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
end
assign scalar_lsu_req_any = |scalar_lsu_req_valids;
assign tensor_lsu_req_any = |tensor_lsu_req_valids;
always @(*) begin
if (lsu_pending_reads != 0) begin
lsu_select_tensor = lsu_active_domain;
end else if (scalar_lsu_req_any && tensor_lsu_req_any) begin
lsu_select_tensor = lsu_domain_rr;
end else begin
lsu_select_tensor = tensor_lsu_req_any;
end
lsu_rd_req_fire_count = '0;
lsu_rsp_fire_count = '0;
for (integer i = 0; i < DCACHE_NUM_REQS; ++i) begin
lsu_rd_req_fire_count = lsu_rd_req_fire_count + `CLOG2(DCACHE_NUM_REQS+1)'(lsu_rd_req_fires[i]);
lsu_rsp_fire_count = lsu_rsp_fire_count + `CLOG2(DCACHE_NUM_REQS+1)'(lsu_rsp_fires[i]);
end
end
always @(posedge clk) begin
if (reset) begin
lsu_domain_rr <= 1'b0;
lsu_active_domain <= 1'b0;
lsu_pending_reads <= '0;
end else begin
if (lsu_pending_reads == 0 && (|lsu_req_fires)) begin
lsu_domain_rr <= ~lsu_select_tensor;
if (lsu_rd_req_fire_count != 0) begin
lsu_active_domain <= lsu_select_tensor;
end
end
lsu_pending_reads <= lsu_pending_reads + 16'(lsu_rd_req_fire_count) - 16'(lsu_rsp_fire_count);
end
end
`RUNTIME_ASSERT(
!(lsu_pending_reads == 0 && (|lsu_rsp_fires)),
("%t: *** core%0d-lsu-domain-arb-unmatched-response", $time, CORE_ID)
)
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_scalar_lsu_reqs_r;
reg [`PERF_CTR_BITS-1:0] perf_tensor_lsu_reqs_r;
reg [`PERF_CTR_BITS-1:0] perf_scalar_lsu_stalls_r;
reg [`PERF_CTR_BITS-1:0] perf_tensor_lsu_stalls_r;
reg [`PERF_CTR_BITS-1:0] perf_mem_merge_stalls_r;
wire scalar_lsu_req_fire_any = (|lsu_req_fires) && !lsu_select_tensor;
wire tensor_lsu_req_fire_any = (|lsu_req_fires) && lsu_select_tensor;
wire scalar_lsu_merge_stall = scalar_lsu_req_any && lsu_select_tensor;
wire tensor_lsu_merge_stall = tensor_lsu_req_any && !lsu_select_tensor;
wire mem_merge_stall = scalar_lsu_req_any && tensor_lsu_req_any;
always @(posedge clk) begin
if (reset) begin
perf_scalar_lsu_reqs_r <= '0;
perf_tensor_lsu_reqs_r <= '0;
perf_scalar_lsu_stalls_r <= '0;
perf_tensor_lsu_stalls_r <= '0;
perf_mem_merge_stalls_r <= '0;
end else begin
perf_scalar_lsu_reqs_r <= perf_scalar_lsu_reqs_r + `PERF_CTR_BITS'(scalar_lsu_req_fire_any);
perf_tensor_lsu_reqs_r <= perf_tensor_lsu_reqs_r + `PERF_CTR_BITS'(tensor_lsu_req_fire_any);
perf_scalar_lsu_stalls_r <= perf_scalar_lsu_stalls_r + `PERF_CTR_BITS'(scalar_lsu_merge_stall);
perf_tensor_lsu_stalls_r <= perf_tensor_lsu_stalls_r + `PERF_CTR_BITS'(tensor_lsu_merge_stall);
perf_mem_merge_stalls_r <= perf_mem_merge_stalls_r + `PERF_CTR_BITS'(mem_merge_stall);
end
end
assign perf_scalar_lsu_reqs = perf_scalar_lsu_reqs_r;
assign perf_tensor_lsu_reqs = perf_tensor_lsu_reqs_r;
assign perf_scalar_lsu_stalls = perf_scalar_lsu_stalls_r;
assign perf_tensor_lsu_stalls = perf_tensor_lsu_stalls_r;
assign perf_mem_merge_stalls = perf_mem_merge_stalls_r;
`endif
`else
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_lsu_commit_passthru
assign lsu_commit_if[i].valid = lsu_scalar_commit_if[i].valid;
assign lsu_commit_if[i].data = lsu_scalar_commit_if[i].data;
assign lsu_scalar_commit_if[i].ready = lsu_commit_if[i].ready;
end
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin : g_lsu_mem_passthru
`ASSIGN_VX_MEM_BUS_IF(dcache_bus_if[i], scalar_lsu_bus_if[i]);
end
`ifdef PERF_ENABLE
assign perf_scalar_lsu_reqs = '0;
assign perf_tensor_lsu_reqs = '0;
assign perf_scalar_lsu_stalls = '0;
assign perf_tensor_lsu_stalls = '0;
assign perf_mem_merge_stalls = '0;
`endif
`endif
`ifdef EXT_F_ENABLE
`RESET_RELAY (fpu_reset, reset);
@@ -147,7 +513,11 @@ module VX_execute import VX_gpu_pkg::*; #(
.commit_csr_if (commit_csr_if),
.sched_csr_if (sched_csr_if),
`ifdef EXT_T_ENABLE
.warp_ctl_if (scalar_warp_ctl_if),
`else
.warp_ctl_if (warp_ctl_if),
`endif
.commit_if (sfu_commit_if),
.acc_read_in (acc_read_in),
@@ -156,8 +526,27 @@ module VX_execute import VX_gpu_pkg::*; #(
);
`ifdef EXT_T_ENABLE
VX_commit_if tensor_core_commit_if[`ISSUE_WIDTH]();
VX_commit_if tensor_ctrl_commit_if[`ISSUE_WIDTH]();
VX_tensor_ctrl_unit #(
.CORE_ID (CORE_ID)
) tensor_ctrl_unit (
.clk (clk),
.reset (reset),
.dispatch_if (tensor_ctrl_dispatch_if),
.commit_if (tensor_ctrl_commit_if),
.warp_ctl_if (tensor_warp_ctl_if),
.csr_unlock_valid (tensor_csr_unlock_valid),
.csr_unlock_wid (tensor_csr_unlock_wid),
.tmc_valid (tensor_tmc_valid),
.tmc_wid (tensor_tmc_wid),
.tmc_tmask (tensor_tmc_tmask)
);
VX_tensor_core #(
.FP16 (TENSOR_FP16)
.FP16 (TENSOR_FP16),
.NUM_TENSOR_CORES (NUM_TENSOR_CORES)
) tensor_core (
.clk(clk),
.reset(reset),
@@ -166,17 +555,44 @@ module VX_execute import VX_gpu_pkg::*; #(
`ifdef EXT_T_ASYNC
.regfile_if(tensor_regfile_if),
.smem_A_if(tensor_smem_A_if),
.tmem_C_wen(tensor_tmem_C_wen),
.tmem_A_ren(tensor_tmem_A_ren),
.tmem_A_rready(tensor_tmem_A_rready),
.tmem_A_raddr(tensor_tmem_A_raddr),
.tmem_A_rdata(tensor_tmem_A_rdata),
.tmem_C_ren(tensor_tmem_C_ren),
.tmem_C_waddr(tensor_tmem_C_waddr),
.tmem_C_rready(tensor_tmem_C_rready),
.tmem_C_raddr(tensor_tmem_C_raddr),
.tmem_C_rdata(tensor_tmem_C_rdata),
.tmem_C_wen(tensor_tmem_C_wen),
.tmem_C_wready(tensor_tmem_C_wready),
.tmem_C_waddr(tensor_tmem_C_waddr),
.tmem_C_wdata(tensor_tmem_C_wdata),
.tmem_C_mask(tensor_tmem_C_mask),
.tmem_C_rdata(tensor_tmem_C_rdata),
.smem_B_if(tensor_smem_B_if),
`endif
.commit_if(tensor_commit_if)
.commit_if(tensor_core_commit_if)
);
localparam TENSOR_COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_tensor_commit_arb
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (TENSOR_COMMIT_DATAW),
.ARBITER ("R"),
.OUT_REG (1)
) tensor_commit_arb (
.clk (clk),
.reset (reset),
.valid_in ({tensor_ctrl_commit_if[i].valid, tensor_core_commit_if[i].valid}),
.ready_in ({tensor_ctrl_commit_if[i].ready, tensor_core_commit_if[i].ready}),
.data_in ({tensor_ctrl_commit_if[i].data, tensor_core_commit_if[i].data}),
.data_out (tensor_commit_if[i].data),
.valid_out (tensor_commit_if[i].valid),
.ready_out (tensor_commit_if[i].ready),
`UNUSED_PIN (sel_out)
);
end
`endif
// simulation helper signal to get RISC-V tests Pass/Fail status

View File

@@ -25,27 +25,40 @@ module VX_fetch import VX_gpu_pkg::*; #(
VX_mem_bus_if.master icache_bus_if,
// inputs
VX_schedule_if.slave schedule_if,
VX_schedule_if.slave scalar_schedule_if,
VX_schedule_if.slave tensor_schedule_if,
// outputs
VX_fetch_if.master fetch_if
VX_fetch_if.master scalar_fetch_if,
VX_fetch_if.master tensor_fetch_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_VAR (reset)
wire icache_req_valid;
wire [ICACHE_ADDR_WIDTH-1:0] icache_req_addr;
wire [ICACHE_TAG_WIDTH-1:0] icache_req_tag;
wire icache_req_ready;
wire rsp_domain;
wire [`UUID_WIDTH-1:0] rsp_uuid;
wire [`NW_WIDTH-1:0] req_tag, rsp_tag;
reg fetch_domain_rr;
wire icache_req_fire = icache_req_valid && icache_req_ready;
assign req_tag = schedule_if.data.wid;
wire scalar_req_valid = scalar_schedule_if.valid;
wire tensor_req_valid = tensor_schedule_if.valid;
wire select_tensor_req = tensor_req_valid && (!scalar_req_valid || fetch_domain_rr);
wire selected_domain = select_tensor_req ? WU_DOMAIN_TENSOR : WU_DOMAIN_SCALAR;
wire selected_valid = scalar_req_valid || tensor_req_valid;
wire [`NW_WIDTH-1:0] selected_wid = select_tensor_req ? tensor_schedule_if.data.wid : scalar_schedule_if.data.wid;
wire [`XLEN-1:0] selected_pc = select_tensor_req ? tensor_schedule_if.data.PC : scalar_schedule_if.data.PC;
wire [`NUM_THREADS-1:0] selected_tmask = select_tensor_req ? tensor_schedule_if.data.tmask : scalar_schedule_if.data.tmask;
wire [`UUID_WIDTH-1:0] selected_uuid = select_tensor_req ? tensor_schedule_if.data.uuid : scalar_schedule_if.data.uuid;
assign {rsp_uuid, rsp_tag} = icache_bus_if.rsp_data.tag;
assign req_tag = selected_wid;
assign {rsp_domain, rsp_uuid, rsp_tag} = icache_bus_if.rsp_data.tag;
wire [`XLEN-1:0] rsp_PC;
wire [`NUM_THREADS-1:0] rsp_tmask;
@@ -60,7 +73,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
.write (icache_req_fire),
`UNUSED_PIN (wren),
.waddr (req_tag),
.wdata ({schedule_if.data.PC, schedule_if.data.tmask}),
.wdata ({selected_pc, selected_tmask}),
.raddr (rsp_tag),
.rdata ({rsp_PC, rsp_tmask})
);
@@ -69,7 +82,8 @@ module VX_fetch import VX_gpu_pkg::*; #(
// Ensure that the ibuffer doesn't fill up.
// This resolves potential deadlock if ibuffer fills and the LSU stalls the execute stage due to pending dcache request.
// This issue is particularly prevalent when the icache and dcache is disabled and both requests share the same bus.
wire [ISSUE_ISW-1:0] schedule_isw = wid_to_isw(schedule_if.data.wid);
wire [ISSUE_ISW-1:0] schedule_isw = wid_to_isw(selected_wid);
wire [`ISSUE_WIDTH-1:0] domain_ibuf_pop = scalar_fetch_if.ibuf_pop | tensor_fetch_if.ibuf_pop;
wire [`ISSUE_WIDTH-1:0] pending_ibuf_full;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
@@ -79,7 +93,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
.clk (clk),
.reset (reset),
.incr (icache_req_fire && schedule_isw == i),
.decr (fetch_if.ibuf_pop[i]),
.decr (domain_ibuf_pop[i]),
.full (pending_ibuf_full[i]),
`UNUSED_PIN (size),
`UNUSED_PIN (empty)
@@ -90,15 +104,24 @@ module VX_fetch import VX_gpu_pkg::*; #(
wire ibuf_ready = 1'b1;
`endif
`RUNTIME_ASSERT((!schedule_if.valid || schedule_if.data.PC != 0),
("%t: *** invalid PC=0x%0h, wid=%0d, tmask=%b (#%0d)", $time, schedule_if.data.PC, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.uuid))
`RUNTIME_ASSERT((!selected_valid || selected_pc != 0),
("%t: *** invalid PC=0x%0h, wid=%0d, tmask=%b (#%0d)", $time, selected_pc, selected_wid, selected_tmask, selected_uuid))
// Icache Request
assign icache_req_valid = schedule_if.valid && ibuf_ready;
assign icache_req_addr = schedule_if.data.PC[`MEM_ADDR_WIDTH-1:2];
assign icache_req_tag = {schedule_if.data.uuid, req_tag};
assign schedule_if.ready = icache_req_ready && ibuf_ready;
assign icache_req_valid = selected_valid && ibuf_ready;
assign icache_req_addr = selected_pc[`MEM_ADDR_WIDTH-1:2];
assign icache_req_tag = {selected_domain, selected_uuid, req_tag};
assign scalar_schedule_if.ready = icache_req_ready && ibuf_ready && selected_valid && !select_tensor_req;
assign tensor_schedule_if.ready = icache_req_ready && ibuf_ready && selected_valid && select_tensor_req;
always @(posedge clk) begin
if (reset) begin
fetch_domain_rr <= 1'b0;
end else if (icache_req_fire && scalar_req_valid && tensor_req_valid) begin
fetch_domain_rr <= ~fetch_domain_rr;
end
end
VX_elastic_buffer #(
.DATAW (ICACHE_ADDR_WIDTH + ICACHE_TAG_WIDTH),
@@ -121,18 +144,26 @@ module VX_fetch import VX_gpu_pkg::*; #(
// Icache Response
assign fetch_if.valid = icache_bus_if.rsp_valid;
assign fetch_if.data.tmask = rsp_tmask;
assign fetch_if.data.wid = rsp_tag;
assign fetch_if.data.PC = rsp_PC;
assign fetch_if.data.instr = icache_bus_if.rsp_data.data;
assign fetch_if.data.uuid = rsp_uuid;
assign icache_bus_if.rsp_ready = fetch_if.ready;
assign scalar_fetch_if.valid = icache_bus_if.rsp_valid && (rsp_domain == WU_DOMAIN_SCALAR);
assign scalar_fetch_if.data.tmask = rsp_tmask;
assign scalar_fetch_if.data.wid = rsp_tag;
assign scalar_fetch_if.data.PC = rsp_PC;
assign scalar_fetch_if.data.instr = icache_bus_if.rsp_data.data;
assign scalar_fetch_if.data.uuid = rsp_uuid;
assign tensor_fetch_if.valid = icache_bus_if.rsp_valid && (rsp_domain == WU_DOMAIN_TENSOR);
assign tensor_fetch_if.data.tmask = rsp_tmask;
assign tensor_fetch_if.data.wid = rsp_tag;
assign tensor_fetch_if.data.PC = rsp_PC;
assign tensor_fetch_if.data.instr = icache_bus_if.rsp_data.data;
assign tensor_fetch_if.data.uuid = rsp_uuid;
assign icache_bus_if.rsp_ready = (rsp_domain == WU_DOMAIN_TENSOR) ? tensor_fetch_if.ready : scalar_fetch_if.ready;
`ifdef DBG_SCOPE_FETCH
if (CORE_ID == 0) begin
`ifdef SCOPE
wire schedule_fire = schedule_if.valid && schedule_if.ready;
wire schedule_fire = icache_req_fire;
wire icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
VX_scope_tap #(
.SCOPE_ID (1),
@@ -150,7 +181,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
icache_rsp_fire
}),
.probes({
schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC,
selected_uuid, selected_wid, selected_tmask, selected_pc,
icache_bus_if.req_data.tag, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr,
icache_bus_if.rsp_data.data, icache_bus_if.rsp_data.tag
}),
@@ -161,7 +192,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
`ifdef CHIPSCOPE
ila_fetch ila_fetch_inst (
.clk (clk),
.probe0 ({reset, schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC, schedule_if.ready, schedule_if.valid}),
.probe0 ({reset, selected_uuid, selected_wid, selected_tmask, selected_pc, icache_req_ready, selected_valid}),
.probe1 ({icache_bus_if.req_data.tag, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr, icache_bus_if.req_ready, icache_bus_if.req_valid}),
.probe2 ({icache_bus_if.rsp_data.data, icache_bus_if.rsp_data.tag, icache_bus_if.rsp_ready, icache_bus_if.rsp_valid})
);
@@ -172,14 +203,18 @@ module VX_fetch import VX_gpu_pkg::*; #(
`endif
`ifdef DBG_TRACE_CORE_ICACHE
wire schedule_fire = schedule_if.valid && schedule_if.ready;
wire fetch_fire = fetch_if.valid && fetch_if.ready;
wire schedule_fire = icache_req_fire;
wire scalar_fetch_fire = scalar_fetch_if.valid && scalar_fetch_if.ready;
wire tensor_fetch_fire = tensor_fetch_if.valid && tensor_fetch_if.ready;
always @(posedge clk) begin
if (schedule_fire) begin
`TRACE(1, ("%d: I$%0d req: wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, CORE_ID, schedule_if.data.wid, schedule_if.data.PC, schedule_if.data.tmask, schedule_if.data.uuid));
`TRACE(1, ("%d: I$%0d req: domain=%0d wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, CORE_ID, selected_domain, selected_wid, selected_pc, selected_tmask, selected_uuid));
end
if (fetch_fire) begin
`TRACE(1, ("%d: I$%0d rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, CORE_ID, fetch_if.data.wid, fetch_if.data.PC, fetch_if.data.tmask, fetch_if.data.instr, fetch_if.data.uuid));
if (scalar_fetch_fire) begin
`TRACE(1, ("%d: I$%0d scalar rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, CORE_ID, scalar_fetch_if.data.wid, scalar_fetch_if.data.PC, scalar_fetch_if.data.tmask, scalar_fetch_if.data.instr, scalar_fetch_if.data.uuid));
end
if (tensor_fetch_fire) begin
`TRACE(1, ("%d: I$%0d tensor rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, CORE_ID, tensor_fetch_if.data.wid, tensor_fetch_if.data.PC, tensor_fetch_if.data.tmask, tensor_fetch_if.data.instr, tensor_fetch_if.data.uuid));
end
end
`endif

View File

@@ -127,4 +127,37 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
assign commit_tmp_if.ready = commit_out_if[i].ready;
end
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
always @(posedge clk) begin
if (!reset && ($time > `TRACE_STARTTIME)) begin
if (commit_in_if[i].valid
&& ((commit_in_if[i].data.PC == 32'h80000010) || (commit_in_if[i].data.PC == 32'h80000014))) begin
`TRACE(1, ("%d: gather-commit-in: block=%0d, isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
$time, i, commit_in_isw[i], commit_in_if[i].valid, commit_in_if[i].ready,
commit_in_if[i].data.wid, commit_in_if[i].data.PC,
commit_in_if[i].data.wb, commit_in_if[i].data.rd,
commit_in_if[i].data.sop, commit_in_if[i].data.eop,
commit_in_if[i].data.uuid));
end
end
end
end
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
always @(posedge clk) begin
if (!reset && ($time > `TRACE_STARTTIME)) begin
if (commit_out_if[i].valid
&& ((commit_out_if[i].data.PC == 32'h80000010) || (commit_out_if[i].data.PC == 32'h80000014))) begin
`TRACE(1, ("%d: gather-commit-out: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
$time, i, commit_out_if[i].valid, commit_out_if[i].ready,
commit_out_if[i].data.wid, commit_out_if[i].data.PC,
commit_out_if[i].data.wb, commit_out_if[i].data.rd,
commit_out_if[i].data.sop, commit_out_if[i].data.eop,
commit_out_if[i].data.uuid));
end
end
end
end
`endif
endmodule

View File

@@ -15,7 +15,8 @@
`include "VX_trace.vh"
module VX_ibuffer import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter CORE_ID = 0,
parameter DOMAIN = WU_DOMAIN_SCALAR
) (
input wire clk,
input wire reset,
@@ -34,8 +35,9 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
wire [ISW_WIDTH-1:0] decode_isw = wid_to_isw(decode_if.data.wid);
wire [ISSUE_WIS_W-1:0] decode_wis = wid_to_wis(decode_if.data.wid);
wire decode_lane_in_domain = (DOMAIN == WU_DOMAIN_TENSOR) ? `IS_TENSOR_WARP(decode_if.data.wid) : `IS_SCALAR_WARP(decode_if.data.wid);
assign decode_if.ready = ibuf_ready_in[decode_isw];
assign decode_if.ready = decode_lane_in_domain && ibuf_ready_in[decode_isw];
`ifdef SIMULATION
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
@@ -54,6 +56,7 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
VX_ibuffer_if uop_sequencer_if [`ISSUE_WIDTH]();
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
localparam LANE_IN_DOMAIN = (DOMAIN == WU_DOMAIN_TENSOR) ? `IS_TENSOR_WARP(i) : `IS_SCALAR_WARP(i);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`IBUF_SIZE),
@@ -61,7 +64,7 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
) instr_buf (
.clk (clk),
.reset (reset),
.valid_in (decode_if.valid && decode_isw == i),
.valid_in (decode_if.valid && decode_isw == i && decode_lane_in_domain && LANE_IN_DOMAIN),
.ready_in (ibuf_ready_in[i]),
.data_in ({
decode_if.data.uuid,

View File

@@ -173,6 +173,38 @@ module VX_int_unit #(
.data_out ({branch_ctl_if.valid, branch_ctl_if.wid, branch_ctl_if.taken, branch_ctl_if.dest})
);
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
always @(posedge clk) begin
if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)) begin
if (execute_if.valid
&& ((execute_if.data.PC == 32'h80000010) || (execute_if.data.PC == 32'h80000014))) begin
`TRACE(1, ("%d: core%0d-int-rsp-in: block=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, op=0x%0h, mod=%0d, is_br=%b, br_op=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
$time, CORE_ID, BLOCK_IDX, execute_if.valid, execute_if.ready,
execute_if.data.wid, execute_if.data.PC, execute_if.data.op_type,
execute_if.data.op_mod, is_br_op, br_op, execute_if.data.wb,
execute_if.data.rd, execute_if.data.sop, execute_if.data.eop,
execute_if.data.uuid));
end
if (commit_if.valid
&& ((PC_r == `XLEN'h80000010) || (PC_r == `XLEN'h80000014))) begin
`TRACE(1, ("%d: core%0d-int-rsp-out: block=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, is_br=%b, br_op=0x%0h, br_enable=%b, br_taken=%b, br_dest=0x%0h, wb=%0d, rd=%0d, tensor=%b, sop=%b, eop=%b (#%0d)\n",
$time, CORE_ID, BLOCK_IDX, commit_if.valid, commit_if.ready,
commit_if.data.wid, PC_r, is_br_op_r, br_op_r, br_enable,
br_taken, br_dest, commit_if.data.wb, commit_if.data.rd,
commit_if.data.tensor, commit_if.data.sop, commit_if.data.eop,
commit_if.data.uuid));
end
if (branch_ctl_if.valid
&& (branch_ctl_if.wid == `NW_WIDTH'(0))) begin
`TRACE(1, ("%d: core%0d-int-branch-out: block=%0d, wid=%0d, taken=%b, dest=0x%0h, commit_PC=0x%0h, commit_valid=%b, commit_ready=%b (#%0d)\n",
$time, CORE_ID, BLOCK_IDX, branch_ctl_if.wid,
branch_ctl_if.taken, branch_ctl_if.dest, commit_if.data.PC,
commit_if.valid, commit_if.ready, commit_if.data.uuid));
end
end
end
`endif
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign commit_if.data.data[i] = (is_br_op_r && is_br_static) ? (PC_r + 4) : alu_result_r[i];
end

View File

@@ -15,7 +15,9 @@
`include "VX_trace.vh"
module VX_issue import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter CORE_ID = 0,
parameter NUM_TENSOR_CORES = `NUM_TENSOR_WARPS,
parameter DOMAIN = WU_DOMAIN_SCALAR
) (
`SCOPE_IO_DECL
@@ -35,9 +37,12 @@ module VX_issue import VX_gpu_pkg::*; #(
VX_dispatch_if.master fpu_dispatch_if [`ISSUE_WIDTH],
`endif
`ifdef EXT_T_ENABLE
VX_dispatch_if.master tensor_alu_dispatch_if [`ISSUE_WIDTH],
VX_dispatch_if.master tensor_lsu_dispatch_if [`ISSUE_WIDTH],
VX_dispatch_if.master tensor_ctrl_dispatch_if [`ISSUE_WIDTH],
VX_dispatch_if.master tensor_dispatch_if [`ISSUE_WIDTH],
`ifdef EXT_T_ASYNC
VX_tc_rf_if.slave tensor_regfile_if,
VX_tc_rf_if.slave tensor_regfile_if[NUM_TENSOR_CORES],
`endif
`endif
VX_dispatch_if.master sfu_dispatch_if [`ISSUE_WIDTH]
@@ -52,7 +57,8 @@ module VX_issue import VX_gpu_pkg::*; #(
`RESET_RELAY (dispatch_reset, reset);
VX_ibuffer #(
.CORE_ID (CORE_ID)
.CORE_ID (CORE_ID),
.DOMAIN (DOMAIN)
) ibuffer (
.clk (clk),
.reset (ibuf_reset),
@@ -61,7 +67,8 @@ module VX_issue import VX_gpu_pkg::*; #(
);
VX_scoreboard #(
.CORE_ID (CORE_ID)
.CORE_ID (CORE_ID),
.DOMAIN (DOMAIN)
) scoreboard (
.clk (clk),
.reset (scoreboard_reset),
@@ -84,7 +91,9 @@ module VX_issue import VX_gpu_pkg::*; #(
VX_operands #(
`endif
.CORE_ID (CORE_ID),
.CACHE_ENABLE (0)
.CACHE_ENABLE (0),
.NUM_TENSOR_CORES (NUM_TENSOR_CORES),
.DOMAIN (DOMAIN)
) operands (
.clk (clk),
.reset (operands_reset),
@@ -99,7 +108,8 @@ module VX_issue import VX_gpu_pkg::*; #(
);
VX_dispatch #(
.CORE_ID (CORE_ID)
.CORE_ID (CORE_ID),
.DOMAIN (DOMAIN)
) dispatch (
.clk (clk),
.reset (dispatch_reset),
@@ -116,11 +126,22 @@ module VX_issue import VX_gpu_pkg::*; #(
.fpu_dispatch_if(fpu_dispatch_if),
`endif
`ifdef EXT_T_ENABLE
.tensor_alu_dispatch_if(tensor_alu_dispatch_if),
.tensor_lsu_dispatch_if(tensor_lsu_dispatch_if),
.tensor_ctrl_dispatch_if(tensor_ctrl_dispatch_if),
.tensor_dispatch_if(tensor_dispatch_if),
`endif
.sfu_dispatch_if(sfu_dispatch_if)
);
wire decode_fire = decode_if.valid && decode_if.ready;
`RUNTIME_ASSERT(
!decode_fire ||
((DOMAIN == WU_DOMAIN_TENSOR) ? `IS_TENSOR_WARP(decode_if.data.wid) : `IS_SCALAR_WARP(decode_if.data.wid)),
("%t: *** core%0d-issue-domain-crossing domain=%0d wid=%0d PC=0x%0h",
$time, CORE_ID, DOMAIN, decode_if.data.wid, decode_if.data.PC)
)
`ifdef SIMULATION
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin

View File

@@ -17,7 +17,9 @@
module VX_operands import VX_gpu_pkg::*; #(
parameter CORE_ID = 0,
parameter CACHE_ENABLE = 0
parameter CACHE_ENABLE = 0,
parameter NUM_TENSOR_CORES = `NUM_TENSOR_WARPS,
parameter DOMAIN = WU_DOMAIN_SCALAR
) (
input wire clk,
input wire reset,
@@ -27,6 +29,8 @@ module VX_operands import VX_gpu_pkg::*; #(
VX_operands_if.master operands_if [`ISSUE_WIDTH]
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_PARAM (NUM_TENSOR_CORES)
`UNUSED_PARAM (DOMAIN)
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS;
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO);

View File

@@ -18,7 +18,9 @@
module VX_operands_dup import VX_gpu_pkg::*; #(
parameter CORE_ID = 0,
parameter CACHE_ENABLE = 0
parameter CACHE_ENABLE = 0,
parameter NUM_TENSOR_CORES = `NUM_TENSOR_WARPS,
parameter DOMAIN = WU_DOMAIN_SCALAR
) (
input wire clk,
input wire reset,
@@ -26,13 +28,43 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_ibuffer_if.slave scoreboard_if [`ISSUE_WIDTH],
`ifdef EXT_T_ASYNC
VX_tc_rf_if.slave tensor_regfile_if,
VX_tc_rf_if.slave tensor_regfile_if[NUM_TENSOR_CORES],
`endif
VX_operands_if.master operands_if [`ISSUE_WIDTH]
);
`UNUSED_PARAM (CORE_ID)
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS;
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO);
`ifdef EXT_F_ENABLE
localparam TENSOR_NUM_REGS = `TENSOR_NUM_GPRS + `TENSOR_NUM_FPRS;
`else
localparam TENSOR_NUM_REGS = `TENSOR_NUM_GPRS;
`endif
function automatic [RAM_ADDRW-1:0] wu_rf_addr (
input logic [ISSUE_WIS_W-1:0] wis,
input logic [`NR_BITS-1:0] reg_id,
input logic is_tensor
);
if (is_tensor) begin
`ifdef EXT_F_ENABLE
if (reg_id[`NR_BITS-1]) begin
wu_rf_addr = (RAM_ADDRW'(wis) * RAM_ADDRW'(TENSOR_NUM_REGS))
+ RAM_ADDRW'(`TENSOR_NUM_GPRS)
+ RAM_ADDRW'(reg_id[`NRI_BITS-1:0]);
end else begin
wu_rf_addr = (RAM_ADDRW'(wis) * RAM_ADDRW'(TENSOR_NUM_REGS))
+ RAM_ADDRW'(reg_id[`NRI_BITS-1:0]);
end
`else
wu_rf_addr = (RAM_ADDRW'(wis) * RAM_ADDRW'(TENSOR_NUM_REGS))
+ RAM_ADDRW'(reg_id);
`endif
end else begin
wu_rf_addr = (RAM_ADDRW'(wis) * RAM_ADDRW'(`NUM_REGS))
+ RAM_ADDRW'(reg_id);
end
endfunction
`ifdef PERF_ENABLE
logic [`ISSUE_WIDTH-1:0][`PERF_CTR_BITS-1:0] perf_rf_read_per_warp;
@@ -49,6 +81,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
wire tc_rf_valid [`ISSUE_WIDTH];
wire [`LOG2UP(`NUM_REGS * ISSUE_RATIO)-1:0] tc_rf_addr [`ISSUE_WIDTH];
wire [ISSUE_WIS_W-1:0] tc_rf_wis [`ISSUE_WIDTH];
// FIXME: don't need full ISSUE_WIDTH; only one warp is read at a time
// because NUM_BLOCKS == 1
wire [`NUM_THREADS-1:0][`XLEN-1:0] tc_rf_data [`ISSUE_WIDTH];
@@ -56,12 +89,44 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
`ifdef EXT_T_ASYNC
`STATIC_ASSERT((ISSUE_RATIO == 1),
("static assertion failed: tensor core only supports ISSUE_RATIO == 1"))
assign tc_rf_valid = '{`ISSUE_WIDTH{tensor_regfile_if.req_valid}};
assign tc_rf_addr = '{`ISSUE_WIDTH{tensor_regfile_if.req_data.rs}};
assign tensor_regfile_if.rsp_data.data = tc_rf_data[0];
`STATIC_ASSERT((NUM_TENSOR_CORES == `NUM_TENSOR_WARPS),
("Wu tensor RF binding requires NUM_TENSOR_CORES == NUM_TENSOR_WARPS"))
for (genvar tc = 0; tc < NUM_TENSOR_CORES; ++tc) begin : g_tc_rf_rsp
localparam TENSOR_ISW = `NUM_SCALAR_WARPS + tc;
assign tensor_regfile_if[tc].rsp_data.data = tc_rf_data[TENSOR_ISW];
end
for (genvar rf_i = 0; rf_i < `ISSUE_WIDTH; ++rf_i) begin : g_tc_rf_req
if ((DOMAIN == WU_DOMAIN_TENSOR) && `IS_TENSOR_WARP(rf_i)) begin
localparam TC_IDX = rf_i - `NUM_SCALAR_WARPS;
assign tc_rf_valid[rf_i] = (TC_IDX < NUM_TENSOR_CORES) ? tensor_regfile_if[TC_IDX].req_valid : 1'b0;
assign tc_rf_addr[rf_i] = (TC_IDX < NUM_TENSOR_CORES) ? tensor_regfile_if[TC_IDX].req_data.rs : '0;
assign tc_rf_wis[rf_i] = (TC_IDX < NUM_TENSOR_CORES) ? tensor_regfile_if[TC_IDX].req_data.wis : '0;
end else begin
assign tc_rf_valid[rf_i] = 1'b0;
assign tc_rf_addr[rf_i] = '0;
assign tc_rf_wis[rf_i] = '0;
end
end
`endif
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
localparam LANE_IN_DOMAIN = (DOMAIN == WU_DOMAIN_TENSOR) ? `IS_TENSOR_WARP(i) : `IS_SCALAR_WARP(i);
// Wu physical RF partition: tensor lanes instantiate a smaller RF,
// while scalar lanes retain the full scalar register file.
localparam GPR_RAM_SIZE = (!LANE_IN_DOMAIN ? 1 : (`IS_TENSOR_WARP(i) ? (TENSOR_NUM_REGS * ISSUE_RATIO) : (`NUM_REGS * ISSUE_RATIO)));
localparam GPR_RAM_ADDRW = `LOG2UP(GPR_RAM_SIZE);
function automatic [GPR_RAM_ADDRW-1:0] lane_rf_addr (
input logic [ISSUE_WIS_W-1:0] wis,
input logic [`NR_BITS-1:0] reg_id
);
logic [RAM_ADDRW-1:0] full_addr;
begin
full_addr = wu_rf_addr(wis, reg_id, `IS_TENSOR_WARP(i));
lane_rf_addr = full_addr[GPR_RAM_ADDRW-1:0];
end
endfunction
always @(posedge clk) begin
if (reset) begin
@@ -82,7 +147,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
scoreboard_if[i].data.imm,
scoreboard_if[i].data.rd
};
scoreboard_if_stored_valid[i] <= scoreboard_if[i].valid && scoreboard_if[i].ready;
scoreboard_if_stored_valid[i] <= LANE_IN_DOMAIN && scoreboard_if[i].valid && scoreboard_if[i].ready;
end
end
@@ -117,11 +182,11 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
`UNUSED_PIN (alm_full),
.size (size1[i])
);
assign operands_if[i].valid = ~empty1[i];
assign operands_if[i].valid = LANE_IN_DOMAIN && ~empty1[i];
`ifdef EXT_T_ASYNC
assign scoreboard_if[i].ready = (size1[i] < 3'd2) && ~tc_rf_valid[i];
assign scoreboard_if[i].ready = LANE_IN_DOMAIN && (size1[i] < 3'd2) && ~tc_rf_valid[i];
`else
assign scoreboard_if[i].ready = (size1[i] < 3'd2);
assign scoreboard_if[i].ready = LANE_IN_DOMAIN && (size1[i] < 3'd2);
`endif
`ifdef SIMULATION
@@ -146,7 +211,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
operands_if[i].valid, operands_if[i].ready, operands_if[i].data.PC,
operands_if[i].data.ex_type, operands_if[i].data.op_type,
size1[i], empty1[i], full1[i],
tc_rf_valid[i], tc_rf_addr[i], tensor_regfile_if.req_data.wis,
tc_rf_valid[i], tc_rf_addr[i], tc_rf_wis[i],
scoreboard_if[i].data.uuid, operands_if[i].data.uuid));
`else
`TRACE(2, ("%d: core%0d-operands-probe: isw=%0d, scb=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, ops=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, size1=%0d, empty1=%b, full1=%b (#scb=%0d #ops=%0d)\n",
@@ -171,9 +236,9 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
wire [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data;
wire [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data;
reg [RAM_ADDRW-1:0] gpr_rd_addr_rs1_stored;
reg [RAM_ADDRW-1:0] gpr_rd_addr_rs2_stored;
reg [RAM_ADDRW-1:0] gpr_rd_addr_rs3_stored;
reg [GPR_RAM_ADDRW-1:0] gpr_rd_addr_rs1_stored;
reg [GPR_RAM_ADDRW-1:0] gpr_rd_addr_rs2_stored;
reg [GPR_RAM_ADDRW-1:0] gpr_rd_addr_rs3_stored;
for (genvar j = 0; j < `NUM_THREADS; ++j) begin
VX_fifo_queue #(
@@ -209,10 +274,10 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
// GPR banks
wire [RAM_ADDRW-1:0] gpr_rd_addr_rs1;
wire [RAM_ADDRW-1:0] gpr_rd_addr_rs2;
wire [RAM_ADDRW-1:0] gpr_rd_addr_rs3;
wire [RAM_ADDRW-1:0] gpr_wr_addr;
wire [GPR_RAM_ADDRW-1:0] gpr_rd_addr_rs1;
wire [GPR_RAM_ADDRW-1:0] gpr_rd_addr_rs2;
wire [GPR_RAM_ADDRW-1:0] gpr_rd_addr_rs3;
wire [GPR_RAM_ADDRW-1:0] gpr_wr_addr;
always @(posedge clk) begin
if (reset) begin
@@ -227,13 +292,13 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
end
if (ISSUE_WIS != 0) begin
assign gpr_wr_addr = {writeback_if[i].data.wis, writeback_if[i].data.rd};
assign gpr_rd_addr_rs1 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs1};
assign gpr_rd_addr_rs2 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs2};
assign gpr_wr_addr = lane_rf_addr(writeback_if[i].data.wis, writeback_if[i].data.rd);
assign gpr_rd_addr_rs1 = lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs1);
assign gpr_rd_addr_rs2 = lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs2);
`ifdef EXT_T_ASYNC
assign gpr_rd_addr_rs3 = tc_rf_valid[i] ? tc_rf_addr[i] : {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3};
assign gpr_rd_addr_rs3 = tc_rf_valid[i] ? lane_rf_addr(tc_rf_wis[i], tc_rf_addr[i][`NR_BITS-1:0]) : lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3);
`else
assign gpr_rd_addr_rs3 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3};
assign gpr_rd_addr_rs3 = lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3);
`endif
// always @(posedge clk) begin
// if (reset) begin
@@ -250,13 +315,13 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
// end
// end
end else begin
assign gpr_wr_addr = writeback_if[i].data.rd;
assign gpr_rd_addr_rs1 = scoreboard_if[i].data.rs1;
assign gpr_rd_addr_rs2 = scoreboard_if[i].data.rs2;
assign gpr_wr_addr = lane_rf_addr(writeback_if[i].data.wis, writeback_if[i].data.rd);
assign gpr_rd_addr_rs1 = lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs1);
assign gpr_rd_addr_rs2 = lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs2);
`ifdef EXT_T_ASYNC
assign gpr_rd_addr_rs3 = tc_rf_valid[i] ? tc_rf_addr[i] : scoreboard_if[i].data.rs3;
assign gpr_rd_addr_rs3 = tc_rf_valid[i] ? lane_rf_addr(tc_rf_wis[i], tc_rf_addr[i][`NR_BITS-1:0]) : lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3);
`else
assign gpr_rd_addr_rs3 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3};
assign gpr_rd_addr_rs3 = lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3);
`endif
// always @(posedge clk) begin
// if (reset) begin
@@ -292,7 +357,8 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
for (genvar j = 0; j < `NUM_THREADS; ++j) begin
VX_dp_ram #(
.DATAW (`XLEN),
.SIZE (`NUM_REGS * ISSUE_RATIO),
.SIZE (GPR_RAM_SIZE),
.ADDRW (GPR_RAM_ADDRW),
.OUT_REG (1),
`ifdef GPR_RESET
.INIT_ENABLE (1),
@@ -301,12 +367,12 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
.NO_RWCHECK (1)
) gpr_ram_rs1 (
.clk (clk),
.read (scoreboard_if[i].valid && scoreboard_if[i].ready), // tc read valid check incl. in ready
.read (LANE_IN_DOMAIN && scoreboard_if[i].valid && scoreboard_if[i].ready), // tc read valid check incl. in ready
`UNUSED_PIN (wren),
`ifdef GPR_RESET
.write (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
.write (LANE_IN_DOMAIN && wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
`else
.write (writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
.write (LANE_IN_DOMAIN && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
`endif
.waddr (gpr_wr_addr),
.wdata (writeback_if[i].data.data[j]),
@@ -316,7 +382,8 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
VX_dp_ram #(
.DATAW (`XLEN),
.SIZE (`NUM_REGS * ISSUE_RATIO),
.SIZE (GPR_RAM_SIZE),
.ADDRW (GPR_RAM_ADDRW),
.OUT_REG (1),
`ifdef GPR_RESET
.INIT_ENABLE (1),
@@ -325,12 +392,12 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
.NO_RWCHECK (1)
) gpr_ram_rs2(
.clk (clk),
.read (scoreboard_if[i].valid && scoreboard_if[i].ready), // tc read valid check incl. in ready
.read (LANE_IN_DOMAIN && scoreboard_if[i].valid && scoreboard_if[i].ready), // tc read valid check incl. in ready
`UNUSED_PIN (wren),
`ifdef GPR_RESET
.write (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
.write (LANE_IN_DOMAIN && wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
`else
.write (writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
.write (LANE_IN_DOMAIN && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
`endif
.waddr (gpr_wr_addr),
.wdata (writeback_if[i].data.data[j]),
@@ -340,7 +407,8 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
VX_dp_ram #(
.DATAW (`XLEN),
.SIZE (`NUM_REGS * ISSUE_RATIO),
.SIZE (GPR_RAM_SIZE),
.ADDRW (GPR_RAM_ADDRW),
.OUT_REG (1),
`ifdef GPR_RESET
.INIT_ENABLE (1),
@@ -350,15 +418,15 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
) gpr_ram_rs3 (
.clk (clk),
`ifdef EXT_T_ASYNC
.read ((scoreboard_if[i].valid && scoreboard_if[i].ready) || tc_rf_valid[i]),
.read (LANE_IN_DOMAIN && ((scoreboard_if[i].valid && scoreboard_if[i].ready) || tc_rf_valid[i])),
`else
.read (scoreboard_if[i].valid && scoreboard_if[i].ready),
.read (LANE_IN_DOMAIN && scoreboard_if[i].valid && scoreboard_if[i].ready),
`endif
`UNUSED_PIN (wren),
`ifdef GPR_RESET
.write (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
.write (LANE_IN_DOMAIN && wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
`else
.write (writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
.write (LANE_IN_DOMAIN && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
`endif
.waddr (gpr_wr_addr),
.wdata (writeback_if[i].data.data[j]),
@@ -367,9 +435,9 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
);
`ifdef PERF_ENABLE
assign perf_write_rs1_per_thread[j] = (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]);
assign perf_write_rs2_per_thread[j] = (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]);
assign perf_write_rs3_per_thread[j] = (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]);
assign perf_write_rs1_per_thread[j] = `PERF_CTR_BITS'(wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]);
assign perf_write_rs2_per_thread[j] = `PERF_CTR_BITS'(wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]);
assign perf_write_rs3_per_thread[j] = `PERF_CTR_BITS'(wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]);
`endif
end

View File

@@ -14,7 +14,8 @@
`include "VX_define.vh"
module VX_schedule import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter CORE_ID = 0,
parameter NUM_BRANCHES = `NUM_ALU_BLOCKS
) (
input wire clk,
input wire reset,
@@ -28,12 +29,20 @@ module VX_schedule import VX_gpu_pkg::*; #(
// inputsdecode_if
VX_warp_ctl_if.slave warp_ctl_if,
VX_branch_ctl_if.slave branch_ctl_if [`NUM_ALU_BLOCKS],
VX_branch_ctl_if.slave branch_ctl_if [NUM_BRANCHES],
VX_decode_sched_if.slave decode_sched_if,
VX_commit_sched_if.slave commit_sched_if,
`ifdef EXT_T_ENABLE
input wire tensor_csr_unlock_valid,
input wire [`NW_WIDTH-1:0] tensor_csr_unlock_wid,
input wire tensor_tmc_valid,
input wire [`NW_WIDTH-1:0] tensor_tmc_wid,
input wire [`NUM_THREADS-1:0] tensor_tmc_tmask,
`endif
// outputs
VX_schedule_if.master schedule_if,
VX_schedule_if.master scalar_schedule_if,
VX_schedule_if.master tensor_schedule_if,
`ifdef GBAR_ENABLE
VX_gbar_bus_if.master gbar_bus_if,
`endif
@@ -50,11 +59,10 @@ module VX_schedule import VX_gpu_pkg::*; #(
reg [`NUM_WARPS-1:0][`NUM_THREADS-1:0] thread_masks, thread_masks_n;
reg [`NUM_WARPS-1:0][`XLEN-1:0] warp_pcs, warp_pcs_n;
wire [`NW_WIDTH-1:0] schedule_wid;
wire [`NUM_THREADS-1:0] schedule_tmask;
wire [`XLEN-1:0] schedule_pc;
wire schedule_valid;
wire schedule_ready;
wire scalar_schedule_fire = scalar_schedule_if.valid && scalar_schedule_if.ready;
wire tensor_schedule_fire = tensor_schedule_if.valid && tensor_schedule_if.ready;
wire schedule_fire_any = scalar_schedule_fire || tensor_schedule_fire;
wire [`NW_WIDTH-1:0] schedule_fire_wid = tensor_schedule_fire ? tensor_schedule_if.data.wid : scalar_schedule_if.data.wid;
// split/join
wire join_valid;
@@ -68,15 +76,14 @@ module VX_schedule import VX_gpu_pkg::*; #(
reg [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] issued_instrs;
wire schedule_fire = schedule_valid && schedule_ready;
wire schedule_if_fire = schedule_if.valid && schedule_if.ready;
wire schedule_if_fire = schedule_fire_any;
// branch
wire [`NUM_ALU_BLOCKS-1:0] branch_valid;
wire [`NUM_ALU_BLOCKS-1:0][`NW_WIDTH-1:0] branch_wid;
wire [`NUM_ALU_BLOCKS-1:0] branch_taken;
wire [`NUM_ALU_BLOCKS-1:0][`XLEN-1:0] branch_dest;
for (genvar i = 0; i < `NUM_ALU_BLOCKS; ++i) begin
wire [NUM_BRANCHES-1:0] branch_valid;
wire [NUM_BRANCHES-1:0][`NW_WIDTH-1:0] branch_wid;
wire [NUM_BRANCHES-1:0] branch_taken;
wire [NUM_BRANCHES-1:0][`XLEN-1:0] branch_dest;
for (genvar i = 0; i < NUM_BRANCHES; ++i) begin
assign branch_valid[i] = branch_ctl_if[i].valid;
assign branch_wid[i] = branch_ctl_if[i].wid;
assign branch_taken[i] = branch_ctl_if[i].taken;
@@ -88,6 +95,12 @@ module VX_schedule import VX_gpu_pkg::*; #(
reg [`NUM_WARPS-1:0] barrier_stalls, barrier_stalls_n;
wire [`CLOG2(`NUM_WARPS+1)-1:0] active_barrier_count;
wire [`NUM_WARPS-1:0] curr_barrier_mask;
wire [`NUM_WARPS-1:0] curr_barrier_mask_with_self;
wire [`NUM_WARPS-1:0] scalar_warp_mask;
wire [`NUM_WARPS-1:0] tensor_warp_mask;
wire [`NUM_WARPS-1:0] barrier_domain_mask;
wire [`NUM_WARPS-1:0] barrier_arrived_mask;
wire [`CLOG2(`NUM_WARPS+1)-1:0] barrier_arrived_count;
`ifdef GBAR_ENABLE
reg [`NUM_WARPS-1:0] curr_barrier_mask_n;
reg gbar_req_valid;
@@ -95,8 +108,21 @@ module VX_schedule import VX_gpu_pkg::*; #(
reg [`NC_WIDTH-1:0] gbar_req_size_m1;
`endif
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
assign scalar_warp_mask[i] = `IS_SCALAR_WARP(i);
assign tensor_warp_mask[i] = `IS_TENSOR_WARP(i);
end
assign curr_barrier_mask = barrier_masks[warp_ctl_if.barrier.id];
assign curr_barrier_mask_with_self = curr_barrier_mask | (`NUM_WARPS'(1) << warp_ctl_if.wid);
assign barrier_domain_mask =
(warp_ctl_if.barrier.domain == BARRIER_SCALAR) ? (active_warps & scalar_warp_mask) :
(warp_ctl_if.barrier.domain == BARRIER_TENSOR) ? (active_warps & tensor_warp_mask) :
(warp_ctl_if.barrier.domain == BARRIER_MASK) ? (active_warps & warp_ctl_if.barrier.mask) :
active_warps;
assign barrier_arrived_mask = curr_barrier_mask_with_self & barrier_domain_mask;
`POP_COUNT(active_barrier_count, curr_barrier_mask);
`POP_COUNT(barrier_arrived_count, barrier_arrived_mask);
`UNUSED_VAR (active_barrier_count)
always @(*) begin
@@ -152,9 +178,11 @@ module VX_schedule import VX_gpu_pkg::*; #(
`endif
if (warp_ctl_if.valid && warp_ctl_if.barrier.valid) begin
if (~warp_ctl_if.barrier.is_global
&& (active_barrier_count[`NW_WIDTH-1:0] == warp_ctl_if.barrier.size_m1[`NW_WIDTH-1:0])) begin
&& ((warp_ctl_if.barrier.domain == BARRIER_MASK)
? ((barrier_arrived_mask & warp_ctl_if.barrier.mask) == warp_ctl_if.barrier.mask)
: (barrier_arrived_count[`NW_WIDTH-1:0] == (warp_ctl_if.barrier.size_m1[`NW_WIDTH-1:0] + `NW_WIDTH'(1))))) begin
barrier_masks_n[warp_ctl_if.barrier.id] = '0;
barrier_stalls_n &= ~barrier_masks[warp_ctl_if.barrier.id];
barrier_stalls_n &= ~barrier_arrived_mask;
end else begin
barrier_masks_n[warp_ctl_if.barrier.id][warp_ctl_if.wid] = 1;
barrier_stalls_n[warp_ctl_if.wid] = 1;
@@ -186,7 +214,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
`endif
// Branch handling
for (integer i = 0; i < `NUM_ALU_BLOCKS; ++i) begin
for (integer i = 0; i < NUM_BRANCHES; ++i) begin
if (branch_valid[i]) begin
if (branch_taken[i]) begin
warp_pcs_n[branch_wid[i]] = branch_dest[i];
@@ -205,14 +233,31 @@ module VX_schedule import VX_gpu_pkg::*; #(
stalled_warps_n[sched_csr_if.unlock_wid] = 0;
end
`ifdef EXT_T_ENABLE
// Tensor control handles a minimal CSR-read/TMC subset without
// reusing the scalar SFU.
if (tensor_csr_unlock_valid) begin
stalled_warps_n[tensor_csr_unlock_wid] = 0;
end
if (tensor_tmc_valid) begin
active_warps_n[tensor_tmc_wid] = (tensor_tmc_tmask != 0);
thread_masks_n[tensor_tmc_wid] = tensor_tmc_tmask;
stalled_warps_n[tensor_tmc_wid] = 0;
end
`endif
// stall the warp until decode stage
if (schedule_fire) begin
stalled_warps_n[schedule_wid] = 1;
if (schedule_fire_any) begin
stalled_warps_n[schedule_fire_wid] = 1;
end
// advance PC
if (schedule_if_fire) begin
warp_pcs_n[schedule_if.data.wid] = schedule_if.data.PC + 4;
if (scalar_schedule_fire) begin
warp_pcs_n[scalar_schedule_if.data.wid] = scalar_schedule_if.data.PC + 4;
end
if (tensor_schedule_fire) begin
warp_pcs_n[tensor_schedule_if.data.wid] = tensor_schedule_if.data.PC + 4;
end
end
@@ -251,9 +296,9 @@ module VX_schedule import VX_gpu_pkg::*; #(
`ifdef GBAR_CLUSTER_ENABLE
// engage cluster barrier as soon as the barrier count is
// fulfilled, instead of requiring all warps to be synchronized
&& (active_barrier_count[`NW_WIDTH-1:0] == warp_ctl_if.barrier.size_m1[`NW_WIDTH-1:0])) begin
&& (barrier_arrived_count[`NW_WIDTH-1:0] == (warp_ctl_if.barrier.size_m1[`NW_WIDTH-1:0] + `NW_WIDTH'(1)))) begin
`else
&& (curr_barrier_mask_n == active_warps)) begin
&& (barrier_arrived_mask == barrier_domain_mask)) begin
`endif
gbar_req_valid <= 1;
gbar_req_id <= warp_ctl_if.barrier.id;
@@ -264,8 +309,11 @@ module VX_schedule import VX_gpu_pkg::*; #(
end
`endif
if (schedule_if_fire) begin
issued_instrs[schedule_if.data.wid] <= issued_instrs[schedule_if.data.wid] + `UUID_WIDTH'(1);
if (scalar_schedule_fire) begin
issued_instrs[scalar_schedule_if.data.wid] <= issued_instrs[scalar_schedule_if.data.wid] + `UUID_WIDTH'(1);
end
if (tensor_schedule_fire) begin
issued_instrs[tensor_schedule_if.data.wid] <= issued_instrs[tensor_schedule_if.data.wid] + `UUID_WIDTH'(1);
end
if (busy) begin
@@ -309,15 +357,33 @@ module VX_schedule import VX_gpu_pkg::*; #(
// schedule the next ready warp
wire [`NUM_WARPS-1:0] ready_warps = active_warps & ~(stalled_warps | barrier_stalls);
wire [`NUM_WARPS-1:0] scalar_ready_warps = ready_warps & scalar_warp_mask;
wire [`NUM_WARPS-1:0] tensor_ready_warps = ready_warps & tensor_warp_mask;
wire [`NW_WIDTH-1:0] scalar_schedule_wid;
wire [`NW_WIDTH-1:0] tensor_schedule_wid;
wire scalar_schedule_valid;
wire tensor_schedule_valid;
wire scalar_schedule_ready;
wire tensor_schedule_ready;
VX_lzc_rr #(
.N (`NUM_WARPS)
) wid_select (
) scalar_wid_select (
.clk (clk),
.reset (reset),
.data_in (ready_warps),
.data_out (schedule_wid),
.valid_out (schedule_valid)
.data_in (scalar_ready_warps),
.data_out (scalar_schedule_wid),
.valid_out (scalar_schedule_valid)
);
VX_lzc_rr #(
.N (`NUM_WARPS)
) tensor_wid_select (
.clk (clk),
.reset (reset),
.data_in (tensor_ready_warps),
.data_out (tensor_schedule_wid),
.valid_out (tensor_schedule_valid)
);
wire [`NUM_WARPS-1:0][(`NUM_THREADS + `XLEN)-1:0] schedule_data;
@@ -325,47 +391,78 @@ module VX_schedule import VX_gpu_pkg::*; #(
assign schedule_data[i] = {thread_masks[i], warp_pcs[i]};
end
assign {schedule_tmask, schedule_pc} = {
schedule_data[schedule_wid][(`NUM_THREADS + `XLEN)-1:(`NUM_THREADS + `XLEN)-4],
schedule_data[schedule_wid][(`NUM_THREADS + `XLEN)-5:0]
};
`ifndef NDEBUG
localparam GNW_WIDTH = `LOG2UP(`NUM_CLUSTERS * `NUM_CORES * `NUM_WARPS);
reg [`UUID_WIDTH-1:0] instr_uuid;
wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(schedule_wid);
`ifdef SV_DPI
always @(posedge clk) begin
if (reset) begin
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(1, 0, 0));
end else if (schedule_fire) begin
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(0, 32'(g_wid), 64'(schedule_pc)));
end
function automatic [`UUID_WIDTH-1:0] schedule_uuid (
input logic [`NW_WIDTH-1:0] wid,
input logic [`XLEN-1:0] pc
);
logic [GNW_WIDTH-1:0] g_wid;
begin
g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(wid);
schedule_uuid = `UUID_WIDTH'({g_wid, 16'(pc)});
end
endfunction
`else
wire [GNW_WIDTH+16-1:0] w_uuid = {g_wid, 16'(schedule_pc)};
always @(*) begin
instr_uuid = `UUID_WIDTH'(w_uuid);
function automatic [`UUID_WIDTH-1:0] schedule_uuid (
input logic [`NW_WIDTH-1:0] wid,
input logic [`XLEN-1:0] pc
);
begin
`UNUSED_VAR (wid)
`UNUSED_VAR (pc)
schedule_uuid = '0;
end
`endif
`else
wire [`UUID_WIDTH-1:0] instr_uuid = '0;
endfunction
`endif
VX_elastic_buffer #(
.DATAW (`NUM_THREADS + `XLEN + `NW_WIDTH)
) out_buf (
.DATAW (`NUM_THREADS + `XLEN + `NW_WIDTH),
.SIZE (0)
) scalar_out_buf (
.clk (clk),
.reset (reset),
.valid_in (schedule_valid),
.ready_in (schedule_ready),
.data_in ({schedule_tmask, schedule_pc, schedule_wid}),
.data_out ({schedule_if.data.tmask, schedule_if.data.PC, schedule_if.data.wid}),
.valid_out (schedule_if.valid),
.ready_out (schedule_if.ready)
.valid_in (!reset && scalar_schedule_valid),
.ready_in (scalar_schedule_ready),
.data_in ({schedule_data[scalar_schedule_wid], scalar_schedule_wid}),
.data_out ({scalar_schedule_if.data.tmask, scalar_schedule_if.data.PC, scalar_schedule_if.data.wid}),
.valid_out (scalar_schedule_if.valid),
.ready_out (scalar_schedule_if.ready)
);
assign schedule_if.data.uuid = instr_uuid;
VX_elastic_buffer #(
.DATAW (`NUM_THREADS + `XLEN + `NW_WIDTH),
.SIZE (0)
) tensor_out_buf (
.clk (clk),
.reset (reset),
.valid_in (!reset && tensor_schedule_valid),
.ready_in (tensor_schedule_ready),
.data_in ({schedule_data[tensor_schedule_wid], tensor_schedule_wid}),
.data_out ({tensor_schedule_if.data.tmask, tensor_schedule_if.data.PC, tensor_schedule_if.data.wid}),
.valid_out (tensor_schedule_if.valid),
.ready_out (tensor_schedule_if.ready)
);
assign scalar_schedule_if.data.uuid = schedule_uuid(scalar_schedule_if.data.wid, scalar_schedule_if.data.PC);
assign tensor_schedule_if.data.uuid = schedule_uuid(tensor_schedule_if.data.wid, tensor_schedule_if.data.PC);
`RUNTIME_ASSERT(
!(scalar_schedule_fire && tensor_schedule_fire),
("%t: *** core%0d-schedule-two-domain-fire-with-single-fetch", $time, CORE_ID)
)
`RUNTIME_ASSERT(
!scalar_schedule_if.valid || `IS_SCALAR_WARP(scalar_schedule_if.data.wid),
("%t: *** core%0d-scalar-scheduler-issued-tensor-warp wid=%0d",
$time, CORE_ID, scalar_schedule_if.data.wid)
)
`RUNTIME_ASSERT(
!tensor_schedule_if.valid || `IS_TENSOR_WARP(tensor_schedule_if.data.wid),
("%t: *** core%0d-tensor-scheduler-issued-scalar-warp wid=%0d",
$time, CORE_ID, tensor_schedule_if.data.wid)
)
`RESET_RELAY (pending_instr_reset, reset);
@@ -377,8 +474,8 @@ module VX_schedule import VX_gpu_pkg::*; #(
) pending_instr(
.clk (clk),
.reset (pending_instr_reset),
.incr (schedule_if_fire),
.incr_wid (schedule_if.data.wid),
.incr (decode_sched_if.valid),
.incr_wid (decode_sched_if.wid),
.decr (commit_sched_if.committed),
.decr_wid (commit_sched_if.committed_wid),
.alm_empty_wid (sched_csr_if.alm_empty_wid),
@@ -413,13 +510,30 @@ module VX_schedule import VX_gpu_pkg::*; #(
end
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps));
`RUNTIME_ASSERT(
!(warp_ctl_if.valid && warp_ctl_if.barrier.valid) || barrier_domain_mask != '0,
("%t: *** core%0d-invalid-barrier-empty-domain: wid=%0d id=%0d domain=%0d active=%b mask=%b",
$time, CORE_ID, warp_ctl_if.wid, warp_ctl_if.barrier.id, warp_ctl_if.barrier.domain, active_warps, warp_ctl_if.barrier.mask)
)
`RUNTIME_ASSERT(
!(warp_ctl_if.valid && warp_ctl_if.barrier.valid) || barrier_domain_mask[warp_ctl_if.wid],
("%t: *** core%0d-invalid-barrier-wid-domain: wid=%0d id=%0d domain=%0d active=%b mask=%b",
$time, CORE_ID, warp_ctl_if.wid, warp_ctl_if.barrier.id, warp_ctl_if.barrier.domain, active_warps, warp_ctl_if.barrier.mask)
)
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_sched_idles;
reg [`PERF_CTR_BITS-1:0] perf_sched_stalls;
reg [`PERF_CTR_BITS-1:0] perf_sched_barrier_idles;
reg [`PERF_CTR_BITS-1:0] perf_scalar_sched_ready_cycles;
reg [`PERF_CTR_BITS-1:0] perf_tensor_sched_ready_cycles;
reg [`PERF_CTR_BITS-1:0] perf_scalar_sched_issued_cycles;
reg [`PERF_CTR_BITS-1:0] perf_tensor_sched_issued_cycles;
wire schedule_idle = ~schedule_valid;
wire schedule_stall = schedule_if.valid && ~schedule_if.ready;
wire schedule_idle = ~(scalar_schedule_if.valid || tensor_schedule_if.valid);
wire schedule_stall = (scalar_schedule_if.valid && ~scalar_schedule_if.ready)
|| (tensor_schedule_if.valid && ~tensor_schedule_if.ready);
wire [`CLOG2(`NUM_WARPS+1)-1:0] schedule_barrier_idle;
`POP_COUNT(schedule_barrier_idle, barrier_stalls);
@@ -428,16 +542,28 @@ module VX_schedule import VX_gpu_pkg::*; #(
perf_sched_idles <= '0;
perf_sched_barrier_idles <= '0;
perf_sched_stalls <= '0;
perf_scalar_sched_ready_cycles <= '0;
perf_tensor_sched_ready_cycles <= '0;
perf_scalar_sched_issued_cycles <= '0;
perf_tensor_sched_issued_cycles <= '0;
end else begin
perf_sched_idles <= perf_sched_idles + `PERF_CTR_BITS'(schedule_idle);
perf_sched_barrier_idles <= perf_sched_barrier_idles + `PERF_CTR_BITS'(schedule_barrier_idle);
perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(schedule_stall);
perf_scalar_sched_ready_cycles <= perf_scalar_sched_ready_cycles + `PERF_CTR_BITS'(scalar_schedule_valid);
perf_tensor_sched_ready_cycles <= perf_tensor_sched_ready_cycles + `PERF_CTR_BITS'(tensor_schedule_valid);
perf_scalar_sched_issued_cycles <= perf_scalar_sched_issued_cycles + `PERF_CTR_BITS'(scalar_schedule_fire);
perf_tensor_sched_issued_cycles <= perf_tensor_sched_issued_cycles + `PERF_CTR_BITS'(tensor_schedule_fire);
end
end
assign perf_schedule_if.sched_idles = perf_sched_idles;
assign perf_schedule_if.sched_barrier_idles = perf_sched_barrier_idles;
assign perf_schedule_if.sched_stalls = perf_sched_stalls;
assign perf_schedule_if.scalar_sched_ready_cycles = perf_scalar_sched_ready_cycles;
assign perf_schedule_if.tensor_sched_ready_cycles = perf_tensor_sched_ready_cycles;
assign perf_schedule_if.scalar_sched_issued_cycles = perf_scalar_sched_issued_cycles;
assign perf_schedule_if.tensor_sched_issued_cycles = perf_tensor_sched_issued_cycles;
`endif
endmodule

View File

@@ -14,7 +14,8 @@
`include "VX_define.vh"
module VX_scoreboard import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter CORE_ID = 0,
parameter DOMAIN = WU_DOMAIN_SCALAR
) (
input wire clk,
input wire reset,
@@ -34,6 +35,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
);
`UNUSED_PARAM (CORE_ID)
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1;
`ifdef EXT_F_ENABLE
localparam TENSOR_NUM_REGS = `TENSOR_NUM_GPRS + `TENSOR_NUM_FPRS;
`else
localparam TENSOR_NUM_REGS = `TENSOR_NUM_GPRS;
`endif
`ifdef PERF_ENABLE
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_units_per_cycle;
@@ -141,7 +147,27 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`endif
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs;
localparam IS_TENSOR_LANE = `IS_TENSOR_WARP(i);
localparam LANE_IN_DOMAIN = (DOMAIN == WU_DOMAIN_TENSOR) ? `IS_TENSOR_WARP(i) : `IS_SCALAR_WARP(i);
localparam SCOREBOARD_NUM_REGS = (IS_TENSOR_LANE ? TENSOR_NUM_REGS : `NUM_REGS);
localparam SCOREBOARD_REG_BITS = `CLOG2(SCOREBOARD_NUM_REGS);
function automatic [SCOREBOARD_REG_BITS-1:0] scb_reg_idx (
input logic [`NR_BITS-1:0] reg_id
);
`ifdef EXT_F_ENABLE
if (IS_TENSOR_LANE && reg_id[`NR_BITS-1]) begin
scb_reg_idx = SCOREBOARD_REG_BITS'(`TENSOR_NUM_GPRS) + SCOREBOARD_REG_BITS'(reg_id[`NRI_BITS-1:0]);
end else
`endif
if (IS_TENSOR_LANE) begin
scb_reg_idx = SCOREBOARD_REG_BITS'(reg_id[`NRI_BITS-1:0]);
end else begin
scb_reg_idx = SCOREBOARD_REG_BITS'(reg_id);
end
endfunction
reg [`UP(ISSUE_RATIO)-1:0][SCOREBOARD_NUM_REGS-1:0] inuse_regs;
// Number of inflight operations in execution in the asynchronous
// Tensor unit. Since the ISA does not specify an explicit destination
// register, use a separate status bit.
@@ -153,16 +179,24 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
((ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA) ||
(ibuffer_if[i].data.op_type == `INST_TENSOR_BWGMMA));
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
wire [`NW_WIDTH-1:0] writeback_wid = wis_to_wid(writeback_if[i].data.wis, ISSUE_ISW_W'(i));
wire writeback_lane_in_domain = (DOMAIN == WU_DOMAIN_TENSOR) ? `IS_TENSOR_WARP(writeback_wid) : `IS_SCALAR_WARP(writeback_wid);
wire writeback_fire = writeback_lane_in_domain && writeback_if[i].valid && writeback_if[i].data.eop;
wire inuse_rd = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd];
wire inuse_rs1 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1];
wire inuse_rs2 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2];
wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3];
wire [SCOREBOARD_REG_BITS-1:0] ibuf_rd_idx = scb_reg_idx(ibuffer_if[i].data.rd);
wire [SCOREBOARD_REG_BITS-1:0] ibuf_rs1_idx = scb_reg_idx(ibuffer_if[i].data.rs1);
wire [SCOREBOARD_REG_BITS-1:0] ibuf_rs2_idx = scb_reg_idx(ibuffer_if[i].data.rs2);
wire [SCOREBOARD_REG_BITS-1:0] ibuf_rs3_idx = scb_reg_idx(ibuffer_if[i].data.rs3);
wire [SCOREBOARD_REG_BITS-1:0] writeback_rd_idx = scb_reg_idx(writeback_if[i].data.rd);
wire inuse_rd = inuse_regs[ibuffer_if[i].data.wis][ibuf_rd_idx];
wire inuse_rs1 = inuse_regs[ibuffer_if[i].data.wis][ibuf_rs1_idx];
wire inuse_rs2 = inuse_regs[ibuffer_if[i].data.wis][ibuf_rs2_idx];
wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuf_rs3_idx];
`ifdef PERF_ENABLE
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
reg [`UP(ISSUE_RATIO)-1:0][SCOREBOARD_NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
reg [`UP(ISSUE_RATIO)-1:0][SCOREBOARD_NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
reg [`SFU_WIDTH-1:0] sfu_type;
always @(*) begin
@@ -181,30 +215,30 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
if (ibuffer_if[i].valid) begin
if (inuse_rd) begin
perf_issue_any_unit_per_cycle[i] = '1;
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuf_rd_idx]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuf_rd_idx] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuf_rd_idx]] = 1;
end
end
if (inuse_rs1) begin
perf_issue_any_unit_per_cycle[i] = '1;
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuf_rs1_idx]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuf_rs1_idx] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuf_rs1_idx]] = 1;
end
end
if (inuse_rs2) begin
perf_issue_any_unit_per_cycle[i] = '1;
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuf_rs2_idx]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuf_rs2_idx] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuf_rs2_idx]] = 1;
end
end
if (inuse_rs3) begin
perf_issue_any_unit_per_cycle[i] = '1;
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuf_rs3_idx]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuf_rs3_idx] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuf_rs3_idx]] = 1;
end
end
end
@@ -234,8 +268,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`endif
wire stg_valid_in, stg_ready_in;
assign stg_valid_in = ibuffer_if[i].valid && operands_ready;
assign ibuffer_if[i].ready = stg_ready_in && operands_ready;
assign stg_valid_in = LANE_IN_DOMAIN && ibuffer_if[i].valid && operands_ready;
assign ibuffer_if[i].ready = LANE_IN_DOMAIN && stg_ready_in && operands_ready;
VX_stream_buffer #(
.DATAW (DATAW)
@@ -277,10 +311,10 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
inflight_tensor <= '0;
end else begin
if (writeback_fire) begin
inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0;
inuse_regs[writeback_if[i].data.wis][writeback_rd_idx] <= 0;
end
if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin
inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1;
inuse_regs[ibuffer_if[i].data.wis][ibuf_rd_idx] <= 1;
end
`ifdef EXT_T_ASYNC
if (tensor_writeback_fire) begin
@@ -297,9 +331,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
`ifdef PERF_ENABLE
if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin
inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= ibuffer_if[i].data.ex_type;
inuse_units[ibuffer_if[i].data.wis][ibuf_rd_idx] <= ibuffer_if[i].data.ex_type;
if (ibuffer_if[i].data.ex_type == `EX_SFU) begin
inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= sfu_type;
inuse_sfu[ibuffer_if[i].data.wis][ibuf_rd_idx] <= sfu_type;
end
end
`endif
@@ -324,6 +358,16 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
`ifdef EXT_T_ASYNC
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
if (CORE_ID == 0 && writeback_fire &&
((writeback_if[i].data.PC == 32'h80000010) ||
(writeback_if[i].data.PC == 32'h80000014))) begin
`TRACE(1, ("%d: core%0d-scoreboard-writeback-recover: isw=%0d, wid=%0d, wis=%0d, PC=0x%0h, rd=%0d, tensor=%b, inuse_before=%b, eop=%b (#%0d)\n",
$time, CORE_ID, i, wis_to_wid(writeback_if[i].data.wis, i),
writeback_if[i].data.wis, writeback_if[i].data.PC,
writeback_if[i].data.rd, writeback_if[i].data.tensor,
inuse_regs[writeback_if[i].data.wis][writeback_rd_idx],
writeback_if[i].data.eop, writeback_if[i].data.uuid));
end
if (CORE_ID == 0 && ibuffer_if[i].valid &&
(ibuffer_if[i].data.PC >= `XLEN'h80000240) &&
(ibuffer_if[i].data.PC <= `XLEN'h80000260)) begin
@@ -366,7 +410,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`RUNTIME_ASSERT((~writeback_fire ||
writeback_if[i].data.tensor /* dont check rd for tensor ghost writes */ ||
inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0),
inuse_regs[writeback_if[i].data.wis][writeback_rd_idx] != 0),
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
$time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.PC, writeback_if[i].data.tmask, writeback_if[i].data.rd, writeback_if[i].data.uuid));
`endif

View File

@@ -60,7 +60,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
`RESET_RELAY (dispatch_reset, reset);
VX_dispatch_unit #(
VX_dispatch_unit_sane #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_REG (1)
@@ -162,6 +162,22 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
end
assign execute_if[0].ready = sfu_req_ready;
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
always @(posedge clk) begin
if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)
&& execute_if[0].valid
&& (execute_if[0].data.PC == 32'h80000028)) begin
`TRACE(1, ("%d: core%0d-sfu-probe: valid=%b ready=%b op=0x%0h csr_valid=%b csr_ready=%b wctl_valid=%b wctl_ready=%b wid=%0d PC=0x%0h (#%0d)\n",
$time, CORE_ID, execute_if[0].valid, execute_if[0].ready,
execute_if[0].data.op_type,
csr_execute_if.valid, csr_execute_if.ready,
wctl_execute_if.valid, wctl_execute_if.ready,
execute_if[0].data.wid, execute_if[0].data.PC,
execute_if[0].data.uuid));
end
end
`endif
// response arbitration
`RESET_RELAY (commit_reset, reset);

View File

@@ -12,14 +12,19 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
VX_execute_if.slave execute_if,
VX_tc_rf_if.master regfile_if,
VX_tc_bus_if.master tmem_if,
// tmem_C is now a direct SRAM port (no TileLink)
output logic tmem_C_wen,
output logic tmem_A_ren,
input logic tmem_A_rready,
output logic [8:0] tmem_A_raddr,
input logic [`NUM_THREADS*`XLEN-1:0] tmem_A_rdata,
output logic tmem_C_ren,
output logic [8:0] tmem_C_waddr,
input logic tmem_C_rready,
output logic [8:0] tmem_C_raddr,
input logic [`NUM_THREADS*`XLEN-1:0] tmem_C_rdata,
output logic tmem_C_wen,
input logic tmem_C_wready,
output logic [8:0] tmem_C_waddr,
output logic [`NUM_THREADS*`XLEN-1:0] tmem_C_wdata,
output logic [`NUM_THREADS*`XLEN/8-1:0] tmem_C_mask,
input logic [`NUM_THREADS*`XLEN-1:0] tmem_C_rdata,
VX_tc_bus_if.master smem_B_if,
VX_commit_if.master commit_if
);
@@ -106,7 +111,7 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
tcgen05_st ? 3'd5 :
tcgen05_cb ? 3'd6 :
3'd0;
wire initiate_valid = metadata_valid && tensor_launch_op && !sync_launch_pending && commit_if.ready;
wire initiate_valid = metadata_valid && tensor_launch_op && !sync_launch_pending;
wire [`NW_WIDTH-1:0] initiate_wid = execute_if_data_wid;
wire [4:0] initiate_rd = execute_if_data_rd[4:0];
wire [`XLEN-1:0] initiate_addr_a = execute_if_data_rs1[0];
@@ -161,13 +166,19 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
.io_reqA_bits_source(tmem_if.req_data.tag),
.io_reqA_bits_address(tmem_if.req_data.addr),
.io_reqA_bits_data(tmem_if.req_data.data),
.io_tmemC_wen(tmem_C_wen),
.io_tmemC_ren(tmem_C_ren),
.io_tmemC_waddr(tmem_C_waddr),
.io_tmemC_raddr(tmem_C_raddr),
.io_tmemC_wdata(tmem_C_wdata),
.io_tmemC_mask(tmem_C_mask),
.io_tmemC_rdata(tmem_C_rdata),
.io_tmemC_aRen(tmem_A_ren),
.io_tmemC_aRready(tmem_A_rready),
.io_tmemC_aRaddr(tmem_A_raddr),
.io_tmemC_aRdata(tmem_A_rdata),
.io_tmemC_cRen(tmem_C_ren),
.io_tmemC_cRready(tmem_C_rready),
.io_tmemC_cRaddr(tmem_C_raddr),
.io_tmemC_cRdata(tmem_C_rdata),
.io_tmemC_cWen(tmem_C_wen),
.io_tmemC_cWready(tmem_C_wready),
.io_tmemC_cWaddr(tmem_C_waddr),
.io_tmemC_cWdata(tmem_C_wdata),
.io_tmemC_cMask(tmem_C_mask),
.io_reqB_ready(smem_B_if.req_ready),
.io_reqB_valid(smem_B_if.req_valid),
.io_reqB_bits_rw(smem_B_if.req_data.rw),
@@ -300,10 +311,11 @@ module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
$time, initiate_valid, initiate_ready, initiate_op, initiate_addr_a, initiate_addr_b,
writeback_valid, writeback_ready, writeback_wid, writeback_rd, writeback_last));
end
if (tmem_if.req_valid || tmem_C_wen || tmem_C_ren || smem_B_if.req_valid || tmem_if.rsp_valid || smem_B_if.rsp_valid) begin
`TRACE(2, ("%d: tensor-blackwell-mem: tmemA_req=%b/%b tmemA_rw=%b tmemA_addr=0x%0h tmemA_rsp=%b/%b, tmemC_wen=%b tmemC_ren=%b tmemC_waddr=0x%0h tmemC_raddr=0x%0h, smemB_req=%b/%b smemB_rw=%b smemB_addr=0x%0h smemB_rsp=%b/%b, initiate_ready=%b\n",
if (tmem_if.req_valid || tmem_C_wen || tmem_C_ren || tmem_A_ren || smem_B_if.req_valid || tmem_if.rsp_valid || smem_B_if.rsp_valid) begin
`TRACE(2, ("%d: tensor-blackwell-mem: tmemA_req=%b/%b tmemA_rw=%b tmemA_addr=0x%0h tmemA_rsp=%b/%b, tmemA_ren=%b/%b tmemA_raddr=0x%0h, tmemC_wen=%b/%b tmemC_ren=%b/%b tmemC_waddr=0x%0h tmemC_raddr=0x%0h, smemB_req=%b/%b smemB_rw=%b smemB_addr=0x%0h smemB_rsp=%b/%b, initiate_ready=%b\n",
$time, tmem_if.req_valid, tmem_if.req_ready, tmem_if.req_data.rw, tmem_if.req_data.addr, tmem_if.rsp_valid, tmem_if.rsp_ready,
tmem_C_wen, tmem_C_ren, tmem_C_waddr, tmem_C_raddr,
tmem_A_ren, tmem_A_rready, tmem_A_raddr,
tmem_C_wen, tmem_C_wready, tmem_C_ren, tmem_C_rready, tmem_C_waddr, tmem_C_raddr,
smem_B_if.req_valid, smem_B_if.req_ready, smem_B_if.req_data.rw, smem_B_if.req_data.addr, smem_B_if.rsp_valid, smem_B_if.rsp_ready,
initiate_ready));
end

View File

@@ -2,27 +2,42 @@
`include "VX_fpu_define.vh"
module VX_tensor_core import VX_gpu_pkg::*; #(
parameter FP16
parameter FP16,
parameter NUM_TENSOR_CORES = `NUM_TENSOR_WARPS
) (
input clk,
input reset,
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
`ifdef EXT_T_ASYNC
VX_tc_rf_if.master regfile_if,
VX_tc_bus_if.master smem_A_if,
output logic tmem_C_wen,
output logic tmem_C_ren,
output logic [8:0] tmem_C_waddr,
output logic [8:0] tmem_C_raddr,
output logic [`NUM_THREADS*`XLEN-1:0] tmem_C_wdata,
output logic [`NUM_THREADS*`XLEN/8-1:0] tmem_C_mask,
input logic [`NUM_THREADS*`XLEN-1:0] tmem_C_rdata,
VX_tc_bus_if.master smem_B_if,
VX_tc_rf_if.master regfile_if[NUM_TENSOR_CORES],
VX_tc_bus_if.master smem_A_if[NUM_TENSOR_CORES],
output logic [NUM_TENSOR_CORES-1:0] tmem_A_ren,
input logic [NUM_TENSOR_CORES-1:0] tmem_A_rready,
output logic [NUM_TENSOR_CORES*9-1:0] tmem_A_raddr,
input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tmem_A_rdata,
output logic [NUM_TENSOR_CORES-1:0] tmem_C_ren,
input logic [NUM_TENSOR_CORES-1:0] tmem_C_rready,
output logic [NUM_TENSOR_CORES*9-1:0] tmem_C_raddr,
input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tmem_C_rdata,
output logic [NUM_TENSOR_CORES-1:0] tmem_C_wen,
input logic [NUM_TENSOR_CORES-1:0] tmem_C_wready,
output logic [NUM_TENSOR_CORES*9-1:0] tmem_C_waddr,
output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tmem_C_wdata,
output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN/8-1:0] tmem_C_mask,
VX_tc_bus_if.master smem_B_if[NUM_TENSOR_CORES],
`endif
VX_commit_if.master commit_if [`ISSUE_WIDTH]
);
`STATIC_ASSERT((`ISSUE_WIDTH == `NUM_WARPS),
("Wu tensor-core lane binding currently requires ISSUE_WIDTH == NUM_WARPS"))
`ifdef EXT_T_BLACKWELL
localparam BLOCK_SIZE = NUM_TENSOR_CORES;
`STATIC_ASSERT((NUM_TENSOR_CORES == `NUM_TENSOR_WARPS),
("Blackwell Wu tensor-core binding requires NUM_TENSOR_CORES == NUM_TENSOR_WARPS"))
`else
localparam BLOCK_SIZE = 1;
`endif
localparam NUM_LANES = `NUM_THREADS;
// @perf: PARTIAL_BW==1 increases power instantiating
// stream_buffers for ISSUE_WIDTH times
@@ -32,19 +47,48 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
.NUM_LANES (NUM_LANES)
) execute_if[BLOCK_SIZE]();
VX_dispatch_if compact_dispatch_if[`ISSUE_WIDTH]();
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_tensor_dispatch_lane
if (i < NUM_TENSOR_CORES) begin
localparam SRC_ISW = `NUM_SCALAR_WARPS + i;
assign compact_dispatch_if[i].valid = dispatch_if[SRC_ISW].valid;
assign compact_dispatch_if[i].data = dispatch_if[SRC_ISW].data;
assign dispatch_if[SRC_ISW].ready = compact_dispatch_if[i].ready;
end else begin
assign compact_dispatch_if[i].valid = 1'b0;
assign compact_dispatch_if[i].data = '0;
end
if (i < `NUM_SCALAR_WARPS) begin
assign dispatch_if[i].ready = 1'b0;
end
end
`RESET_RELAY (dispatch_reset, reset);
VX_dispatch_unit_sane #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_REG (PARTIAL_BW ? 1 : 0)
.OUT_REG (PARTIAL_BW ? 1 : 0),
.ISW_BASE (`NUM_SCALAR_WARPS)
) dispatch_unit (
.clk (clk),
.reset (dispatch_reset),
.dispatch_if(dispatch_if),
.dispatch_if(compact_dispatch_if),
.execute_if (execute_if)
);
`ifdef SIMULATION
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_tensor_domain_assert
`RUNTIME_ASSERT(
!dispatch_if[i].valid || `IS_TENSOR_WARP(i),
("%t: *** tensor-dispatch-cross-domain: issue_lane=%0d ex_op=0x%0h",
$time, i, dispatch_if[i].data.op_type)
)
end
`endif
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) commit_block_if[BLOCK_SIZE]();
@@ -64,12 +108,14 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
`ifdef EXT_T_HOPPER
assign tmem_C_wen = 1'b0;
assign tmem_C_ren = 1'b0;
assign tmem_C_waddr = '0;
assign tmem_C_raddr = '0;
assign tmem_C_wdata = '0;
assign tmem_C_mask = '0;
assign tmem_A_ren[block_idx] = 1'b0;
assign tmem_A_raddr[block_idx * 9 +: 9] = '0;
assign tmem_C_wen[block_idx] = 1'b0;
assign tmem_C_ren[block_idx] = 1'b0;
assign tmem_C_waddr[block_idx * 9 +: 9] = '0;
assign tmem_C_raddr[block_idx * 9 +: 9] = '0;
assign tmem_C_wdata[block_idx * `NUM_THREADS*`XLEN +: `NUM_THREADS*`XLEN] = '0;
assign tmem_C_mask[block_idx * `NUM_THREADS*`XLEN/8 +: `NUM_THREADS*`XLEN/8] = '0;
VX_tensor_hopper_core_block #(
.ISW(1), // FIXME: not block_idx
.FP16(FP16)
@@ -77,9 +123,9 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
.clk (clk),
.reset (reset),
.execute_if (execute_if[block_idx]),
.regfile_if (regfile_if),
.smem_A_if (smem_A_if),
.smem_B_if (smem_B_if),
.regfile_if (regfile_if[block_idx]),
.smem_A_if (smem_A_if[block_idx]),
.smem_B_if (smem_B_if[block_idx]),
.commit_if (commit_block_if[block_idx])
);
`elsif EXT_T_BLACKWELL
@@ -90,16 +136,22 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
.clk (clk),
.reset (reset),
.execute_if (execute_if[block_idx]),
.regfile_if (regfile_if),
.tmem_if (smem_A_if),
.tmem_C_wen (tmem_C_wen),
.tmem_C_ren (tmem_C_ren),
.tmem_C_waddr(tmem_C_waddr),
.tmem_C_raddr(tmem_C_raddr),
.tmem_C_wdata(tmem_C_wdata),
.tmem_C_mask(tmem_C_mask),
.tmem_C_rdata(tmem_C_rdata),
.smem_B_if (smem_B_if),
.regfile_if (regfile_if[block_idx]),
.tmem_if (smem_A_if[block_idx]),
.tmem_A_ren (tmem_A_ren[block_idx]),
.tmem_A_rready(tmem_A_rready[block_idx]),
.tmem_A_raddr(tmem_A_raddr[block_idx * 9 +: 9]),
.tmem_A_rdata(tmem_A_rdata[block_idx * `NUM_THREADS*`XLEN +: `NUM_THREADS*`XLEN]),
.tmem_C_ren (tmem_C_ren[block_idx]),
.tmem_C_rready(tmem_C_rready[block_idx]),
.tmem_C_raddr(tmem_C_raddr[block_idx * 9 +: 9]),
.tmem_C_rdata(tmem_C_rdata[block_idx * `NUM_THREADS*`XLEN +: `NUM_THREADS*`XLEN]),
.tmem_C_wen (tmem_C_wen[block_idx]),
.tmem_C_wready(tmem_C_wready[block_idx]),
.tmem_C_waddr(tmem_C_waddr[block_idx * 9 +: 9]),
.tmem_C_wdata(tmem_C_wdata[block_idx * `NUM_THREADS*`XLEN +: `NUM_THREADS*`XLEN]),
.tmem_C_mask(tmem_C_mask[block_idx * `NUM_THREADS*`XLEN/8 +: `NUM_THREADS*`XLEN/8]),
.smem_B_if (smem_B_if[block_idx]),
.commit_if (commit_block_if[block_idx])
);
// ) tensor_hopper_core_block (

View File

@@ -0,0 +1,179 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module VX_tensor_ctrl_unit import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
VX_commit_if.master commit_if [`ISSUE_WIDTH],
VX_warp_ctl_if.master warp_ctl_if,
output wire csr_unlock_valid,
output wire [`NW_WIDTH-1:0] csr_unlock_wid,
output wire tmc_valid,
output wire [`NW_WIDTH-1:0] tmc_wid,
output wire [`NUM_THREADS-1:0] tmc_tmask
);
localparam BLOCK_SIZE = 1;
localparam NUM_LANES = `NUM_THREADS;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS
+ 1 + (NUM_LANES * `XLEN) + 1 + PID_WIDTH + 1 + 1
+ 1 + 1 + `NUM_THREADS + $bits(barrier_t);
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) execute_if[BLOCK_SIZE]();
`RESET_RELAY (dispatch_reset, reset);
VX_dispatch_unit_sane #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_REG (1)
) dispatch_unit (
.clk (clk),
.reset (dispatch_reset),
.dispatch_if(dispatch_if),
.execute_if (execute_if)
);
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) ctrl_commit_if[BLOCK_SIZE]();
wire is_tmc = (execute_if[0].data.op_type == `INST_SFU_TMC);
wire is_csr = (execute_if[0].data.op_type == `INST_SFU_CSRRS);
wire is_bar = (execute_if[0].data.op_type == `INST_SFU_BAR);
wire is_bar_mask = (execute_if[0].data.op_type == `INST_SFU_BAR_MASK);
wire [`VX_CSR_ADDR_BITS-1:0] csr_addr = execute_if[0].data.imm[`VX_CSR_ADDR_BITS-1:0];
wire [NUM_LANES-1:0][`XLEN-1:0] csr_read_data;
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [31:0] wtid = 32'(execute_if[0].data.pid * NUM_LANES + i);
wire [31:0] gtid = (32'(CORE_ID) << (`NW_BITS + `NT_BITS))
+ (32'(execute_if[0].data.wid) << `NT_BITS)
+ wtid;
assign csr_read_data[i] =
(csr_addr == `VX_CSR_THREAD_ID) ? `XLEN'(wtid) :
(csr_addr == `VX_CSR_WARP_ID) ? `XLEN'(execute_if[0].data.wid) :
(csr_addr == `VX_CSR_CORE_ID) ? `XLEN'(CORE_ID) :
(csr_addr == `VX_CSR_MHARTID) ? `XLEN'(gtid) :
(csr_addr == `VX_CSR_NUM_THREADS) ? `XLEN'(`NUM_THREADS) :
(csr_addr == `VX_CSR_NUM_WARPS) ? `XLEN'(`NUM_WARPS) :
(csr_addr == `VX_CSR_NUM_CORES) ? `XLEN'(`NUM_CORES * `NUM_CLUSTERS) :
`XLEN'(0);
end
wire [`NUM_THREADS-1:0] req_tmc_tmask = execute_if[0].data.rs1_data[0][`NUM_THREADS-1:0];
wire [`XLEN-1:0] req_rs1_data = execute_if[0].data.rs1_data[0];
wire [`XLEN-1:0] req_rs2_data = execute_if[0].data.rs2_data[0];
barrier_t barrier, barrier_r;
assign barrier.valid = is_bar || is_bar_mask;
assign barrier.id = req_rs1_data[`NB_WIDTH-1:0];
`ifdef GBAR_ENABLE
`ifdef GBAR_CLUSTER_ENABLE
assign barrier.is_global = 1'b1;
`else
assign barrier.is_global = req_rs1_data[31];
`endif
`else
assign barrier.is_global = 1'b0;
`endif
assign barrier.domain = is_bar_mask ? BARRIER_MASK : req_rs1_data[29:28];
assign barrier.mask = req_rs2_data[`NUM_WARPS-1:0];
assign barrier.size_m1 = req_rs2_data[$bits(barrier.size_m1)-1:0] - $bits(barrier.size_m1)'(1);
`ifdef SIMULATION
wire tensor_ctrl_fire = execute_if[0].valid && execute_if[0].ready && execute_if[0].data.eop;
`RUNTIME_ASSERT(
!tensor_ctrl_fire || is_tmc || is_csr || is_bar || is_bar_mask,
("%t: *** core%0d-tensor-ctrl-illegal-op: wid=%0d PC=0x%0h op=0x%0h",
$time, CORE_ID, execute_if[0].data.wid, execute_if[0].data.PC, execute_if[0].data.op_type)
)
`RUNTIME_ASSERT(
!tensor_ctrl_fire || !is_csr
|| (csr_addr == `VX_CSR_THREAD_ID)
|| (csr_addr == `VX_CSR_WARP_ID)
|| (csr_addr == `VX_CSR_CORE_ID)
|| (csr_addr == `VX_CSR_MHARTID)
|| (csr_addr == `VX_CSR_NUM_THREADS)
|| (csr_addr == `VX_CSR_NUM_WARPS)
|| (csr_addr == `VX_CSR_NUM_CORES),
("%t: *** core%0d-tensor-ctrl-illegal-csr: wid=%0d PC=0x%0h csr=0x%0h",
$time, CORE_ID, execute_if[0].data.wid, execute_if[0].data.PC, csr_addr)
)
`RUNTIME_ASSERT(
!tensor_ctrl_fire || !is_bar_mask || barrier.mask != '0,
("%t: *** core%0d-tensor-ctrl-invalid-barrier-mask: wid=%0d id=%0d raw_mask=0x%0h",
$time, CORE_ID, execute_if[0].data.wid, barrier.id, barrier.mask)
)
`endif
wire rsp_valid_in = execute_if[0].valid;
wire rsp_ready_in;
assign execute_if[0].ready = rsp_ready_in;
wire [`NUM_THREADS-1:0] tmc_tmask_r;
wire is_tmc_r;
wire is_csr_r;
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2)
) rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (rsp_valid_in),
.ready_in (rsp_ready_in),
.data_in ({execute_if[0].data.uuid, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.rd, execute_if[0].data.wb, csr_read_data, 1'b0, execute_if[0].data.pid, execute_if[0].data.sop, execute_if[0].data.eop, is_tmc, is_csr, req_tmc_tmask, barrier}),
.data_out ({ctrl_commit_if[0].data.uuid, ctrl_commit_if[0].data.wid, ctrl_commit_if[0].data.tmask, ctrl_commit_if[0].data.PC, ctrl_commit_if[0].data.rd, ctrl_commit_if[0].data.wb, ctrl_commit_if[0].data.data, ctrl_commit_if[0].data.tensor, ctrl_commit_if[0].data.pid, ctrl_commit_if[0].data.sop, ctrl_commit_if[0].data.eop, is_tmc_r, is_csr_r, tmc_tmask_r, barrier_r}),
.valid_out (ctrl_commit_if[0].valid),
.ready_out (ctrl_commit_if[0].ready)
);
assign csr_unlock_valid = ctrl_commit_if[0].valid && ctrl_commit_if[0].ready && ctrl_commit_if[0].data.eop && is_csr_r;
assign csr_unlock_wid = ctrl_commit_if[0].data.wid;
assign tmc_valid = ctrl_commit_if[0].valid && ctrl_commit_if[0].ready && ctrl_commit_if[0].data.eop && is_tmc_r;
assign tmc_wid = ctrl_commit_if[0].data.wid;
assign tmc_tmask = tmc_tmask_r;
assign warp_ctl_if.valid = ctrl_commit_if[0].valid && ctrl_commit_if[0].ready && ctrl_commit_if[0].data.eop && barrier_r.valid;
assign warp_ctl_if.wid = ctrl_commit_if[0].data.wid;
assign warp_ctl_if.tmc = '0;
assign warp_ctl_if.wspawn = '0;
assign warp_ctl_if.split = '0;
assign warp_ctl_if.sjoin = '0;
assign warp_ctl_if.barrier = barrier_r;
VX_gather_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_REG (1)
) gather_unit (
.clk (clk),
.reset (reset),
.commit_in_if (ctrl_commit_if),
.commit_out_if (commit_if)
);
endmodule

View File

@@ -360,6 +360,7 @@ task trace_ex_op(input int level,
`INST_SFU_JOIN: `TRACE(level, ("JOIN"));
`INST_SFU_BAR: `TRACE(level, ("BAR"));
`INST_SFU_PRED: `TRACE(level, ("PRED"));
`INST_SFU_BAR_MASK:`TRACE(level, ("BAR_MASK"));
`INST_SFU_CSRRW: begin if (use_imm) `TRACE(level, ("CSRRWI")); else `TRACE(level, ("CSRRW")); end
`INST_SFU_CSRRS: begin if (use_imm) `TRACE(level, ("CSRRSI")); else `TRACE(level, ("CSRRS")); end
`INST_SFU_CSRRC: begin if (use_imm) `TRACE(level, ("CSRRCI")); else `TRACE(level, ("CSRRC")); end

View File

@@ -32,7 +32,8 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam WCTL_WIDTH = $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t);
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + WCTL_WIDTH + 1 + PID_WIDTH + 1 + 1;
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS
+ 1 + WCTL_WIDTH + 1 + PID_WIDTH + 1 + 1;
`UNUSED_VAR (execute_if.data.rs3_data)
@@ -43,11 +44,13 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
barrier_t barrier, barrier_r;
wire is_wspawn = (execute_if.data.op_type == `INST_SFU_WSPAWN);
wire is_wspawn_mask = is_wspawn && execute_if.data.op_mod[0];
wire is_tmc = (execute_if.data.op_type == `INST_SFU_TMC);
wire is_pred = (execute_if.data.op_type == `INST_SFU_PRED);
wire is_split = (execute_if.data.op_type == `INST_SFU_SPLIT);
wire is_join = (execute_if.data.op_type == `INST_SFU_JOIN);
wire is_bar = (execute_if.data.op_type == `INST_SFU_BAR);
wire is_bar_mask = (execute_if.data.op_type == `INST_SFU_BAR_MASK);
wire [`UP(LANE_BITS)-1:0] tid;
if (LANE_BITS != 0) begin
@@ -106,7 +109,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
assign sjoin.is_dvg = rs1_data[0];
// barrier
assign barrier.valid = is_bar;
assign barrier.valid = is_bar || is_bar_mask;
assign barrier.id = rs1_data[`NB_WIDTH-1:0];
`ifdef GBAR_ENABLE
`ifdef GBAR_CLUSTER_ENABLE
@@ -119,18 +122,35 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
`else
assign barrier.is_global = 1'b0;
`endif
assign barrier.domain = is_bar_mask ? BARRIER_MASK : rs1_data[29:28];
assign barrier.mask = rs2_data[`NUM_WARPS-1:0];
assign barrier.size_m1 = rs2_data[$bits(barrier.size_m1)-1:0] - $bits(barrier.size_m1)'(1);
// wspawn
wire [`NUM_WARPS-1:0] wspawn_wmask;
wire [`NUM_WARPS-1:0] legacy_wspawn_wmask;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
assign wspawn_wmask[i] = (i < rs1_data[`NW_BITS:0]) && (i != execute_if.data.wid);
assign legacy_wspawn_wmask[i] = (i < rs1_data[`NW_BITS:0]) && (i != execute_if.data.wid);
end
wire [`NUM_WARPS-1:0] masked_wspawn_wmask = rs1_data[`NUM_WARPS-1:0] & ~(`NUM_WARPS'(1) << execute_if.data.wid);
assign wspawn.valid = is_wspawn;
assign wspawn.wmask = wspawn_wmask;
assign wspawn.wmask = is_wspawn_mask ? masked_wspawn_wmask : legacy_wspawn_wmask;
assign wspawn.pc = rs2_data;
`ifdef SIMULATION
wire wctl_fire = execute_if.valid && execute_if.ready && execute_if.data.eop;
`RUNTIME_ASSERT(
!wctl_fire || !is_wspawn_mask || masked_wspawn_wmask != '0,
("%t: *** core%0d-wctl-invalid-wspawn-mask: wid=%0d raw_mask=0x%0h pc=0x%0h",
$time, CORE_ID, execute_if.data.wid, rs1_data[`NUM_WARPS-1:0], rs2_data)
)
`RUNTIME_ASSERT(
!wctl_fire || !is_bar_mask || barrier.mask != '0,
("%t: *** core%0d-wctl-invalid-barrier-mask: wid=%0d id=%0d raw_mask=0x%0h",
$time, CORE_ID, execute_if.data.wid, barrier.id, barrier.mask)
)
`endif
// response
VX_elastic_buffer #(

View File

@@ -17,6 +17,13 @@ interface VX_pipeline_perf_if ();
wire [`PERF_CTR_BITS-1:0] sched_idles;
wire [`PERF_CTR_BITS-1:0] sched_stalls;
wire [`PERF_CTR_BITS-1:0] sched_barrier_idles;
wire [`PERF_CTR_BITS-1:0] scalar_sched_ready_cycles;
wire [`PERF_CTR_BITS-1:0] tensor_sched_ready_cycles;
wire [`PERF_CTR_BITS-1:0] scalar_sched_issued_cycles;
wire [`PERF_CTR_BITS-1:0] tensor_sched_issued_cycles;
wire [`PERF_CTR_BITS-1:0] illegal_tensor_reg_access;
wire [`PERF_CTR_BITS-1:0] illegal_tensor_scalar_op;
wire [`PERF_CTR_BITS-1:0] illegal_scalar_tensor_op;
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
wire [`PERF_CTR_BITS-1:0] scb_stalls;
wire [`PERF_CTR_BITS-1:0] scb_any_unit_uses;
@@ -34,11 +41,26 @@ interface VX_pipeline_perf_if ();
wire [`PERF_CTR_BITS-1:0] stores;
wire [`PERF_CTR_BITS-1:0] ifetch_latency;
wire [`PERF_CTR_BITS-1:0] load_latency;
wire [`PERF_CTR_BITS-1:0] scalar_lsu_reqs;
wire [`PERF_CTR_BITS-1:0] tensor_lsu_reqs;
wire [`PERF_CTR_BITS-1:0] scalar_lsu_stalls;
wire [`PERF_CTR_BITS-1:0] tensor_lsu_stalls;
wire [`PERF_CTR_BITS-1:0] mem_merge_stalls;
modport schedule (
output sched_idles,
output sched_barrier_idles,
output sched_stalls
output sched_stalls,
output scalar_sched_ready_cycles,
output tensor_sched_ready_cycles,
output scalar_sched_issued_cycles,
output tensor_sched_issued_cycles
);
modport decode (
output illegal_tensor_reg_access,
output illegal_tensor_scalar_op,
output illegal_scalar_tensor_op
);
modport issue (
@@ -55,10 +77,25 @@ interface VX_pipeline_perf_if ();
output dispatch_any_fire_cycles
);
modport execute (
output scalar_lsu_reqs,
output tensor_lsu_reqs,
output scalar_lsu_stalls,
output tensor_lsu_stalls,
output mem_merge_stalls
);
modport slave (
input sched_idles,
input sched_barrier_idles,
input sched_stalls,
input scalar_sched_ready_cycles,
input tensor_sched_ready_cycles,
input scalar_sched_issued_cycles,
input tensor_sched_issued_cycles,
input illegal_tensor_reg_access,
input illegal_tensor_scalar_op,
input illegal_scalar_tensor_op,
input ibf_stalls,
input scb_stalls,
input scb_any_unit_uses,
@@ -74,7 +111,12 @@ interface VX_pipeline_perf_if ();
input loads,
input stores,
input ifetch_latency,
input load_latency
input load_latency,
input scalar_lsu_reqs,
input tensor_lsu_reqs,
input scalar_lsu_stalls,
input tensor_lsu_stalls,
input mem_merge_stalls
);
endinterface

701
kernel/include/VX_config.h Normal file
View File

@@ -0,0 +1,701 @@
// auto-generated by gen_config.py. DO NOT EDIT
// Generated at 2024-05-07 13:55:58.398687
// Translated from ./rtl/VX_config.vh:
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef VX_CONFIG_VH
#define VX_CONFIG_VH
#ifndef MIN
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#endif
#ifndef MAX
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
#endif
#ifndef CLAMP
#define CLAMP(x, lo, hi) (((x) > (hi)) ? (hi) : (((x) < (lo)) ? (lo) : (x)))
#endif
#ifndef UP
#define UP(x) (((x) != 0) ? (x) : 1)
#endif
///////////////////////////////////////////////////////////////////////////////
#ifndef EXT_M_DISABLE
#define EXT_M_ENABLE
#endif
#ifndef EXT_F_DISABLE
#define EXT_F_ENABLE
#endif
#ifndef XLEN_32
#ifndef XLEN_64
#define XLEN_32
#endif
#endif
#ifdef XLEN_64
#define XLEN 64
#endif
#ifdef XLEN_32
#define XLEN 32
#endif
#ifdef EXT_D_ENABLE
#define FLEN_64
#else
#define FLEN_32
#endif
#ifdef FLEN_64
#define FLEN 64
#endif
#ifdef FLEN_32
#define FLEN 32
#endif
#ifdef XLEN_64
#ifdef FLEN_32
#define FPU_RV64F
#endif
#endif
#ifndef NUM_CLUSTERS
#define NUM_CLUSTERS 1
#endif
#ifndef NUM_CORES
#define NUM_CORES 1
#endif
#ifndef NUM_WARPS
#define NUM_WARPS 4
#endif
#ifndef NUM_TENSOR_WARPS
#define NUM_TENSOR_WARPS 2
#endif
#define NUM_SCALAR_WARPS (NUM_WARPS - NUM_TENSOR_WARPS)
#define IS_SCALAR_WARP(wid) ((wid) < NUM_SCALAR_WARPS)
#define IS_TENSOR_WARP(wid) ((wid) >= NUM_SCALAR_WARPS)
#ifndef TENSOR_NUM_GPRS
#define TENSOR_NUM_GPRS 8
#endif
#ifndef TENSOR_NUM_FPRS
#define TENSOR_NUM_FPRS 8
#endif
#ifndef NUM_THREADS
#define NUM_THREADS 4
#endif
#ifndef NUM_BARRIERS
#define NUM_BARRIERS 8
#endif
#ifndef SOCKET_SIZE
#define SOCKET_SIZE MIN(4, NUM_CORES)
#endif
#define NUM_SOCKETS UP(NUM_CORES / SOCKET_SIZE)
#ifdef L2_ENABLE
#define L2_ENABLED 1
#else
#define L2_ENABLED 0
#endif
#ifdef L3_ENABLE
#define L3_ENABLED 1
#else
#define L3_ENABLED 0
#endif
#ifdef L1_DISABLE
#define ICACHE_DISABLE
#define DCACHE_DISABLE
#endif
#ifndef MEM_BLOCK_SIZE
#define MEM_BLOCK_SIZE 64
#endif
#ifndef MEM_ADDR_WIDTH
#ifdef XLEN_64
#define MEM_ADDR_WIDTH 48
#else
#define MEM_ADDR_WIDTH 32
#endif
#endif
#ifndef L1_LINE_SIZE
#ifdef L1_DISABLE
#define L1_LINE_SIZE ((L2_ENABLED || L3_ENABLED) ? 4 : MEM_BLOCK_SIZE)
#else
#define L1_LINE_SIZE ((L2_ENABLED || L3_ENABLED) ? 16 : MEM_BLOCK_SIZE)
#endif
#endif
#ifdef L2_ENABLE
#define L2_LINE_SIZE MEM_BLOCK_SIZE
#else
#define L2_LINE_SIZE L1_LINE_SIZE
#endif
#ifdef L3_ENABLE
#define L3_LINE_SIZE MEM_BLOCK_SIZE
#else
#define L3_LINE_SIZE L2_LINE_SIZE
#endif
#ifdef XLEN_64
#ifndef STARTUP_ADDR
#define STARTUP_ADDR 0x180000000
#endif
#ifndef STACK_BASE_ADDR
#define STACK_BASE_ADDR 0x1FF000000
#endif
#else
#ifndef STARTUP_ADDR
#define STARTUP_ADDR 0x80000000
#endif
#ifndef STACK_BASE_ADDR
#define STACK_BASE_ADDR 0xFF000000
#endif
#endif
#ifndef SMEM_BASE_ADDR
#define SMEM_BASE_ADDR STACK_BASE_ADDR
#endif
#ifndef SMEM_LOG_SIZE
#define SMEM_LOG_SIZE 19
#endif
#ifndef IO_BASE_ADDR
#define IO_BASE_ADDR (SMEM_BASE_ADDR + (1 << SMEM_LOG_SIZE))
#endif
#ifndef IO_COUT_ADDR
#define IO_COUT_ADDR IO_BASE_ADDR
#endif
#define IO_COUT_SIZE MEM_BLOCK_SIZE
#ifndef IO_CSR_ADDR
#define IO_CSR_ADDR (IO_COUT_ADDR + IO_COUT_SIZE)
#endif
#define IO_CSR_SIZE (4 * 64 * NUM_CORES * NUM_CLUSTERS)
#ifndef STACK_LOG2_SIZE
#define STACK_LOG2_SIZE 13
#endif
#define STACK_SIZE (1 << STACK_LOG2_SIZE)
#define RESET_DELAY 8
#ifndef STALL_TIMEOUT
#define STALL_TIMEOUT (100000 * (1 ** (L2_ENABLED + L3_ENABLED)))
#endif
#ifndef SV_DPI
#define DPI_DISABLE
#endif
#ifndef FPU_FPNEW
#ifndef FPU_DSP
#ifndef FPU_DPI
#ifndef SYNTHESIS
#ifndef DPI_DISABLE
#define FPU_DPI
#else
#define FPU_DSP
#endif
#else
#define FPU_DSP
#endif
#endif
#endif
#endif
#ifndef SYNTHESIS
#ifndef DPI_DISABLE
#define IMUL_DPI
#define IDIV_DPI
#endif
#endif
#ifndef DEBUG_LEVEL
#define DEBUG_LEVEL 3
#endif
// Pipeline Configuration /////////////////////////////////////////////////////
// Issue width
#ifndef ISSUE_WIDTH
#define ISSUE_WIDTH NUM_WARPS
#endif
// Number of ALU units
#ifndef NUM_ALU_LANES
#define NUM_ALU_LANES NUM_THREADS
#endif
#ifndef NUM_ALU_BLOCKS
#define NUM_ALU_BLOCKS 4
#endif
// Number of FPU units
#ifndef NUM_FPU_LANES
#define NUM_FPU_LANES NUM_THREADS
#endif
#ifndef NUM_FPU_BLOCKS
#define NUM_FPU_BLOCKS 2
#endif
// Number of LSU units
#ifndef NUM_LSU_LANES
#define NUM_LSU_LANES NUM_THREADS
#endif
// Number of SFU units
#ifndef NUM_SFU_LANES
#define NUM_SFU_LANES MIN(NUM_THREADS, 4)
#endif
// Size of Instruction Buffer
#ifndef IBUF_SIZE
#define IBUF_SIZE (4 * ISSUE_WIDTH)
#endif
// Size of LSU Request Queue
#ifndef LSUQ_SIZE
#define LSUQ_SIZE (4 * NUM_WARPS * (NUM_THREADS / NUM_LSU_LANES))
#endif
// LSU Duplicate Address Check
#ifndef LSU_DUP_DISABLE
#define LSU_DUP_ENABLE
#endif
#ifdef LSU_DUP_ENABLE
#define LSU_DUP_ENABLED 1
#else
#define LSU_DUP_ENABLED 0
#endif
#ifdef GBAR_ENABLE
#define GBAR_ENABLED 1
#else
#define GBAR_ENABLED 0
#endif
#ifndef LATENCY_IMUL
#ifdef VIVADO
#define LATENCY_IMUL 4
#endif
#ifdef QUARTUS
#define LATENCY_IMUL 3
#endif
#ifndef LATENCY_IMUL
#define LATENCY_IMUL 4
#endif
#endif
// Floating-Point Units ///////////////////////////////////////////////////////
// Size of FPU Request Queue
#ifndef FPUQ_SIZE
#define FPUQ_SIZE (2 * (NUM_THREADS / NUM_FPU_LANES))
#endif
// FNCP Latency
#ifndef LATENCY_FNCP
#define LATENCY_FNCP 2
#endif
// FMA Latency
#ifndef LATENCY_FMA
#ifdef FPU_DPI
#define LATENCY_FMA 4
#endif
#ifdef FPU_FPNEW
#define LATENCY_FMA 4
#endif
#ifdef FPU_DSP
#ifdef QUARTUS
#define LATENCY_FMA 4
#endif
#ifdef VIVADO
#define LATENCY_FMA 16
#endif
#ifndef LATENCY_FMA
#define LATENCY_FMA 4
#endif
#endif
#endif
// FDIV Latency
#ifndef LATENCY_FDIV
#ifdef FPU_DPI
#define LATENCY_FDIV 15
#endif
#ifdef FPU_FPNEW
#define LATENCY_FDIV 16
#endif
#ifdef FPU_DSP
#ifdef QUARTUS
#define LATENCY_FDIV 15
#endif
#ifdef VIVADO
#define LATENCY_FDIV 28
#endif
#ifndef LATENCY_FDIV
#define LATENCY_FDIV 16
#endif
#endif
#endif
// FSQRT Latency
#ifndef LATENCY_FSQRT
#ifdef FPU_DPI
#define LATENCY_FSQRT 10
#endif
#ifdef FPU_FPNEW
#define LATENCY_FSQRT 16
#endif
#ifdef FPU_DSP
#ifdef QUARTUS
#define LATENCY_FSQRT 10
#endif
#ifdef VIVADO
#define LATENCY_FSQRT 28
#endif
#ifndef LATENCY_FSQRT
#define LATENCY_FSQRT 16
#endif
#endif
#endif
// FCVT Latency
#ifndef LATENCY_FCVT
#define LATENCY_FCVT 5
#endif
// Icache Configurable Knobs //////////////////////////////////////////////////
// Cache Enable
#ifndef ICACHE_DISABLE
#define ICACHE_ENABLE
#endif
#ifdef ICACHE_ENABLE
#define ICACHE_ENABLED 1
#else
#define ICACHE_ENABLED 0
#define NUM_ICACHES 0
#endif
// Number of Cache Units
#ifndef NUM_ICACHES
#define NUM_ICACHES UP(SOCKET_SIZE / 4)
#endif
// Cache Size
#ifndef ICACHE_SIZE
#define ICACHE_SIZE 16384
#endif
// Core Response Queue Size
#ifndef ICACHE_CRSQ_SIZE
#define ICACHE_CRSQ_SIZE 2
#endif
// Miss Handling Register Size
#ifndef ICACHE_MSHR_SIZE
#define ICACHE_MSHR_SIZE 16
#endif
// Memory Request Queue Size
#ifndef ICACHE_MREQ_SIZE
#define ICACHE_MREQ_SIZE 4
#endif
// Memory Response Queue Size
#ifndef ICACHE_MRSQ_SIZE
#define ICACHE_MRSQ_SIZE 0
#endif
// Number of Associative Ways
#ifndef ICACHE_NUM_WAYS
#define ICACHE_NUM_WAYS 1
#endif
// Dcache Configurable Knobs //////////////////////////////////////////////////
// Cache Enable
#ifndef DCACHE_DISABLE
#define DCACHE_ENABLE
#endif
#ifdef DCACHE_ENABLE
#define DCACHE_ENABLED 1
#else
#define DCACHE_ENABLED 0
#define NUM_DCACHES 0
#define DCACHE_NUM_BANKS 1
#endif
// Number of Cache Units
#ifndef NUM_DCACHES
#define NUM_DCACHES UP(SOCKET_SIZE / 4)
#endif
// Cache Size
#ifndef DCACHE_SIZE
#define DCACHE_SIZE 16384
#endif
// Number of Banks
#ifndef DCACHE_NUM_BANKS
#define DCACHE_NUM_BANKS NUM_LSU_LANES
#endif
// Core Response Queue Size
#ifndef DCACHE_CRSQ_SIZE
#define DCACHE_CRSQ_SIZE 2
#endif
// Miss Handling Register Size
#ifndef DCACHE_MSHR_SIZE
#define DCACHE_MSHR_SIZE 8
#endif
// Memory Request Queue Size
#ifndef DCACHE_MREQ_SIZE
#define DCACHE_MREQ_SIZE 4
#endif
// Memory Response Queue Size
#ifndef DCACHE_MRSQ_SIZE
#define DCACHE_MRSQ_SIZE 0
#endif
// Number of Associative Ways
#ifndef DCACHE_NUM_WAYS
#define DCACHE_NUM_WAYS 1
#endif
// SM Configurable Knobs //////////////////////////////////////////////////////
#ifndef SM_DISABLE
#define SM_ENABLE
#endif
#ifdef SM_ENABLE
#define SM_ENABLED 1
#else
#define SM_ENABLED 0
#define SMEM_NUM_BANKS 1
#endif
// Number of Banks
#ifndef SMEM_NUM_BANKS
#define SMEM_NUM_BANKS (NUM_LSU_LANES)
#endif
// L2cache Configurable Knobs /////////////////////////////////////////////////
// Cache Size
#ifndef L2_CACHE_SIZE
#ifdef ALTERA_S10
#define L2_CACHE_SIZE 2097152
#else
#define L2_CACHE_SIZE 1048576
#endif
#endif
// Number of Banks
#ifndef L2_NUM_BANKS
#define L2_NUM_BANKS MIN(4, NUM_SOCKETS)
#endif
// Core Response Queue Size
#ifndef L2_CRSQ_SIZE
#define L2_CRSQ_SIZE 2
#endif
// Miss Handling Register Size
#ifndef L2_MSHR_SIZE
#define L2_MSHR_SIZE 16
#endif
// Memory Request Queue Size
#ifndef L2_MREQ_SIZE
#define L2_MREQ_SIZE 4
#endif
// Memory Response Queue Size
#ifndef L2_MRSQ_SIZE
#define L2_MRSQ_SIZE 0
#endif
// Number of Associative Ways
#ifndef L2_NUM_WAYS
#define L2_NUM_WAYS 2
#endif
// L3cache Configurable Knobs /////////////////////////////////////////////////
// Cache Size
#ifndef L3_CACHE_SIZE
#ifdef ALTERA_S10
#define L3_CACHE_SIZE 2097152
#else
#define L3_CACHE_SIZE 1048576
#endif
#endif
// Number of Banks
#ifndef L3_NUM_BANKS
#define L3_NUM_BANKS MIN(4, NUM_CLUSTERS)
#endif
// Core Response Queue Size
#ifndef L3_CRSQ_SIZE
#define L3_CRSQ_SIZE 2
#endif
// Miss Handling Register Size
#ifndef L3_MSHR_SIZE
#define L3_MSHR_SIZE 16
#endif
// Memory Request Queue Size
#ifndef L3_MREQ_SIZE
#define L3_MREQ_SIZE 4
#endif
// Memory Response Queue Size
#ifndef L3_MRSQ_SIZE
#define L3_MRSQ_SIZE 0
#endif
// Number of Associative Ways
#ifndef L3_NUM_WAYS
#define L3_NUM_WAYS 4
#endif
// ISA Extensions /////////////////////////////////////////////////////////////
#ifdef EXT_A_ENABLE
#define EXT_A_ENABLED 1
#else
#define EXT_A_ENABLED 0
#endif
#ifdef EXT_C_ENABLE
#define EXT_C_ENABLED 1
#else
#define EXT_C_ENABLED 0
#endif
#ifdef EXT_D_ENABLE
#define EXT_D_ENABLED 1
#else
#define EXT_D_ENABLED 0
#endif
#ifdef EXT_F_ENABLE
#define EXT_F_ENABLED 1
#else
#define EXT_F_ENABLED 0
#endif
#ifdef EXT_M_ENABLE
#define EXT_M_ENABLED 1
#else
#define EXT_M_ENABLED 0
#endif
#define ISA_STD_A 0
#define ISA_STD_C 2
#define ISA_STD_D 3
#define ISA_STD_E 4
#define ISA_STD_F 5
#define ISA_STD_H 7
#define ISA_STD_I 8
#define ISA_STD_N 13
#define ISA_STD_Q 16
#define ISA_STD_S 18
#define ISA_STD_U 20
#define ISA_EXT_ICACHE 0
#define ISA_EXT_DCACHE 1
#define ISA_EXT_L2CACHE 2
#define ISA_EXT_L3CACHE 3
#define ISA_EXT_SMEM 4
#define MISA_EXT (ICACHE_ENABLED << ISA_EXT_ICACHE) \
| (DCACHE_ENABLED << ISA_EXT_DCACHE) \
| (L2_ENABLED << ISA_EXT_L2CACHE) \
| (L3_ENABLED << ISA_EXT_L3CACHE) \
| (SM_ENABLED << ISA_EXT_SMEM)
#define MISA_STD (EXT_A_ENABLED << 0) /* A - Atomic Instructions extension */ \
| (0 << 1) /* B - Tentatively reserved for Bit operations extension */ \
| (EXT_C_ENABLED << 2) /* C - Compressed extension */ \
| (EXT_D_ENABLED << 3) /* D - Double precsision floating-point extension */ \
| (0 << 4) /* E - RV32E base ISA */ \
| (EXT_F_ENABLED << 5) /* F - Single precsision floating-point extension */ \
| (0 << 6) /* G - Additional standard extensions present */ \
| (0 << 7) /* H - Hypervisor mode implemented */ \
| (1 << 8) /* I - RV32I/64I/128I base ISA */ \
| (0 << 9) /* J - Reserved */ \
| (0 << 10) /* K - Reserved */ \
| (0 << 11) /* L - Tentatively reserved for Bit operations extension */ \
| (EXT_M_ENABLED << 12) /* M - Integer Multiply/Divide extension */ \
| (0 << 13) /* N - User level interrupts supported */ \
| (0 << 14) /* O - Reserved */ \
| (0 << 15) /* P - Tentatively reserved for Packed-SIMD extension */ \
| (0 << 16) /* Q - Quad-precision floating-point extension */ \
| (0 << 17) /* R - Reserved */ \
| (0 << 18) /* S - Supervisor mode implemented */ \
| (0 << 19) /* T - Tentatively reserved for Transactional Memory extension */ \
| (1 << 20) /* U - User mode implemented */ \
| (0 << 21) /* V - Tentatively reserved for Vector extension */ \
| (0 << 22) /* W - Reserved */ \
| (1 << 23) /* X - Non-standard extensions present */ \
| (0 << 24) /* Y - Reserved */ \
| (0 << 25) /* Z - Reserved */
// Device identification //////////////////////////////////////////////////////
#define VENDOR_ID 0
#define ARCHITECTURE_ID 0
#define IMPLEMENTATION_ID 0
#endif // VX_CONFIG_VH

193
kernel/include/VX_types.h Normal file
View File

@@ -0,0 +1,193 @@
// auto-generated by gen_config.py. DO NOT EDIT
// Generated at 2024-06-15 00:25:12.935689
// Translated from ./rtl/VX_types.vh:
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef VX_TYPES_VH
#define VX_TYPES_VH
// Device configuration registers
#define VX_CSR_ADDR_BITS 12
#define VX_DCR_ADDR_BITS 12
#define VX_DCR_BASE_STATE_BEGIN 0x001
#define VX_DCR_BASE_STARTUP_ADDR0 0x001
#define VX_DCR_BASE_STARTUP_ADDR1 0x002
#define VX_DCR_BASE_MPM_CLASS 0x003
#define VX_DCR_BASE_STATE_END 0x004
#define VX_DCR_BASE_STATE(addr) ((addr) - VX_DCR_BASE_STATE_BEGIN)
#define VX_DCR_BASE_STATE_COUNT (VX_DCR_BASE_STATE_END-VX_DCR_BASE_STATE_BEGIN)
// Machine Performance-monitoring counters classes
#define VX_DCR_MPM_CLASS_NONE 0
#define VX_DCR_MPM_CLASS_CORE 1
#define VX_DCR_MPM_CLASS_MEM 2
// User Floating-Point CSRs
#define VX_CSR_FFLAGS 0x001
#define VX_CSR_FRM 0x002
#define VX_CSR_FCSR 0x003
#define VX_CSR_SATP 0x180
#define VX_CSR_PMPCFG0 0x3A0
#define VX_CSR_PMPADDR0 0x3B0
#define VX_CSR_MSTATUS 0x300
#define VX_CSR_MISA 0x301
#define VX_CSR_MEDELEG 0x302
#define VX_CSR_MIDELEG 0x303
#define VX_CSR_MIE 0x304
#define VX_CSR_MTVEC 0x305
#define VX_CSR_MEPC 0x341
#define VX_CSR_MNSTATUS 0x744
#define VX_CSR_MPM_BASE 0xB00
#define VX_CSR_MPM_BASE_H 0xB80
#define VX_CSR_MPM_USER 0xB03
#define VX_CSR_MPM_USER_H 0xB83
// Machine Performance-monitoring core counters
// PERF: Standard
#define VX_CSR_MCYCLE 0xB00
#define VX_CSR_MCYCLE_H 0xB80
#define VX_CSR_MPM_RESERVED 0xB01
#define VX_CSR_MPM_RESERVED_H 0xB81
#define VX_CSR_MINSTRET 0xB02
#define VX_CSR_MINSTRET_H 0xB82
// PERF: pipeline
#define VX_CSR_MPM_SCHED_ID 0xB03
#define VX_CSR_MPM_SCHED_ID_H 0xB83
#define VX_CSR_MPM_SCHED_ST 0xB04
#define VX_CSR_MPM_SCHED_ST_H 0xB84
#define VX_CSR_MPM_IBUF_ST 0xB05
#define VX_CSR_MPM_IBUF_ST_H 0xB85
#define VX_CSR_MPM_SCRB_ST 0xB06
#define VX_CSR_MPM_SCRB_ST_H 0xB86
#define VX_CSR_MPM_SCRB_ALU 0xB07
#define VX_CSR_MPM_SCRB_ALU_H 0xB87
#define VX_CSR_MPM_SCRB_FPU 0xB08
#define VX_CSR_MPM_SCRB_FPU_H 0xB88
#define VX_CSR_MPM_SCRB_LSU 0xB09
#define VX_CSR_MPM_SCRB_LSU_H 0xB89
#define VX_CSR_MPM_SCRB_SFU 0xB0A
#define VX_CSR_MPM_SCRB_SFU_H 0xB8A
// PERF: memory
#define VX_CSR_MPM_IFETCHES 0xB0B
#define VX_CSR_MPM_IFETCHES_H 0xB8B
#define VX_CSR_MPM_LOADS 0xB0C
#define VX_CSR_MPM_LOADS_H 0xB8C
#define VX_CSR_MPM_STORES 0xB0D
#define VX_CSR_MPM_STORES_H 0xB8D
#define VX_CSR_MPM_IFETCH_LT 0xB0E
#define VX_CSR_MPM_IFETCH_LT_H 0xB8E
#define VX_CSR_MPM_LOAD_LT 0xB0F
#define VX_CSR_MPM_LOAD_LT_H 0xB8F
// SFU: scoreboard
#define VX_CSR_MPM_SCRB_WCTL 0xB10
#define VX_CSR_MPM_SCRB_WCTL_H 0xB90
#define VX_CSR_MPM_SCRB_CSRS 0xB11
#define VX_CSR_MPM_SCRB_CSRS_H 0xB91
// Machine Performance-monitoring memory counters
// PERF: icache
#define VX_CSR_MPM_ICACHE_READS 0xB03 // total reads
#define VX_CSR_MPM_ICACHE_READS_H 0xB83
#define VX_CSR_MPM_ICACHE_MISS_R 0xB04 // read misses
#define VX_CSR_MPM_ICACHE_MISS_R_H 0xB84
#define VX_CSR_MPM_ICACHE_MSHR_ST 0xB05 // MSHR stalls
#define VX_CSR_MPM_ICACHE_MSHR_ST_H 0xB85
// PERF: dcache
#define VX_CSR_MPM_DCACHE_READS 0xB06 // total reads
#define VX_CSR_MPM_DCACHE_READS_H 0xB86
#define VX_CSR_MPM_DCACHE_WRITES 0xB07 // total writes
#define VX_CSR_MPM_DCACHE_WRITES_H 0xB87
#define VX_CSR_MPM_DCACHE_MISS_R 0xB08 // read misses
#define VX_CSR_MPM_DCACHE_MISS_R_H 0xB88
#define VX_CSR_MPM_DCACHE_MISS_W 0xB09 // write misses
#define VX_CSR_MPM_DCACHE_MISS_W_H 0xB89
#define VX_CSR_MPM_DCACHE_BANK_ST 0xB0A // bank conflicts
#define VX_CSR_MPM_DCACHE_BANK_ST_H 0xB8A
#define VX_CSR_MPM_DCACHE_MSHR_ST 0xB0B // MSHR stalls
#define VX_CSR_MPM_DCACHE_MSHR_ST_H 0xB8B
// PERF: l2cache
#define VX_CSR_MPM_L2CACHE_READS 0xB0C // total reads
#define VX_CSR_MPM_L2CACHE_READS_H 0xB8C
#define VX_CSR_MPM_L2CACHE_WRITES 0xB0D // total writes
#define VX_CSR_MPM_L2CACHE_WRITES_H 0xB8D
#define VX_CSR_MPM_L2CACHE_MISS_R 0xB0E // read misses
#define VX_CSR_MPM_L2CACHE_MISS_R_H 0xB8E
#define VX_CSR_MPM_L2CACHE_MISS_W 0xB0F // write misses
#define VX_CSR_MPM_L2CACHE_MISS_W_H 0xB8F
#define VX_CSR_MPM_L2CACHE_BANK_ST 0xB10 // bank conflicts
#define VX_CSR_MPM_L2CACHE_BANK_ST_H 0xB90
#define VX_CSR_MPM_L2CACHE_MSHR_ST 0xB11 // MSHR stalls
#define VX_CSR_MPM_L2CACHE_MSHR_ST_H 0xB91
// PERF: l3cache
#define VX_CSR_MPM_L3CACHE_READS 0xB12 // total reads
#define VX_CSR_MPM_L3CACHE_READS_H 0xB92
#define VX_CSR_MPM_L3CACHE_WRITES 0xB13 // total writes
#define VX_CSR_MPM_L3CACHE_WRITES_H 0xB93
#define VX_CSR_MPM_L3CACHE_MISS_R 0xB14 // read misses
#define VX_CSR_MPM_L3CACHE_MISS_R_H 0xB94
#define VX_CSR_MPM_L3CACHE_MISS_W 0xB15 // write misses
#define VX_CSR_MPM_L3CACHE_MISS_W_H 0xB95
#define VX_CSR_MPM_L3CACHE_BANK_ST 0xB16 // bank conflicts
#define VX_CSR_MPM_L3CACHE_BANK_ST_H 0xB96
#define VX_CSR_MPM_L3CACHE_MSHR_ST 0xB17 // MSHR stalls
#define VX_CSR_MPM_L3CACHE_MSHR_ST_H 0xB97
// PERF: memory
#define VX_CSR_MPM_MEM_READS 0xB18 // total reads
#define VX_CSR_MPM_MEM_READS_H 0xB98
#define VX_CSR_MPM_MEM_WRITES 0xB19 // total writes
#define VX_CSR_MPM_MEM_WRITES_H 0xB99
#define VX_CSR_MPM_MEM_LT 0xB1A // memory latency
#define VX_CSR_MPM_MEM_LT_H 0xB9A
// PERF: smem
#define VX_CSR_MPM_SMEM_READS 0xB1B // memory reads
#define VX_CSR_MPM_SMEM_READS_H 0xB9B
#define VX_CSR_MPM_SMEM_WRITES 0xB1C // memory writes
#define VX_CSR_MPM_SMEM_WRITES_H 0xB9C
#define VX_CSR_MPM_SMEM_BANK_ST 0xB1D // bank conflicts
#define VX_CSR_MPM_SMEM_BANK_ST_H 0xB9D
// Machine Information Registers
#define VX_CSR_MVENDORID 0xF11
#define VX_CSR_MARCHID 0xF12
#define VX_CSR_MIMPID 0xF13
#define VX_CSR_MHARTID 0xF14
// GPGU CSRs
#define VX_CSR_THREAD_ID 0xCC0
#define VX_CSR_WARP_ID 0xCC1
#define VX_CSR_CORE_ID 0xCC2
#define VX_CSR_WARP_MASK 0xCC3
#define VX_CSR_THREAD_MASK 0xCC4 // warning! this value is also used in LLVM
#define VX_CSR_GCID 0xCC5 // legacy global core id alias used by Radiance bootrom
#define VX_CSR_NUM_THREADS 0xFC0
#define VX_CSR_NUM_WARPS 0xFC1
#define VX_CSR_NUM_CORES 0xFC2
#endif // VX_TYPES_VH

View File

@@ -136,6 +136,19 @@ inline void vx_wspawn(unsigned num_warps, vx_wspawn_pfn func_ptr) {
asm volatile (".insn r %0, 1, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(num_warps), "r"(func_ptr));
}
// Spawn an explicit warp mask. The current warp bit is ignored by hardware.
inline void vx_wspawn_mask(unsigned warp_mask, vx_wspawn_pfn func_ptr) {
asm volatile (".insn r %0, 6, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(warp_mask), "r"(func_ptr));
}
inline void vx_spawn_scalar(unsigned warp_mask, vx_wspawn_pfn func_ptr) {
vx_wspawn_mask(warp_mask & ((1u << NUM_SCALAR_WARPS) - 1u), func_ptr);
}
inline void vx_spawn_tensor(unsigned warp_mask, vx_wspawn_pfn func_ptr) {
vx_wspawn_mask(warp_mask & (((1u << NUM_TENSOR_WARPS) - 1u) << NUM_SCALAR_WARPS), func_ptr);
}
// Split on a predicate
inline unsigned vx_split(unsigned predicate) {
unsigned ret;
@@ -149,8 +162,36 @@ inline void vx_join(unsigned stack_ptr) {
}
// Warp Barrier
__attribute__((convergent))
inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
asm volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barried_id), "r"(num_warps));
unsigned scalar_warps = (num_warps > NUM_SCALAR_WARPS) ? NUM_SCALAR_WARPS : num_warps;
asm volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barried_id), "r"(scalar_warps));
}
#define VX_BARRIER_DOMAIN_SHIFT 28
#define VX_BARRIER_DOMAIN_ALL 0u
#define VX_BARRIER_DOMAIN_SCALAR 1u
#define VX_BARRIER_DOMAIN_TENSOR 2u
__attribute__((convergent))
inline void vx_barrier_domain(unsigned barrier_id, unsigned num_warps, unsigned domain) {
unsigned encoded_id = barrier_id | (domain << VX_BARRIER_DOMAIN_SHIFT);
asm volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(encoded_id), "r"(num_warps));
}
__attribute__((convergent))
inline void vx_barrier_scalar(unsigned barrier_id, unsigned num_warps) {
vx_barrier_domain(barrier_id, num_warps, VX_BARRIER_DOMAIN_SCALAR);
}
__attribute__((convergent))
inline void vx_barrier_tensor(unsigned barrier_id, unsigned num_warps) {
vx_barrier_domain(barrier_id, num_warps, VX_BARRIER_DOMAIN_TENSOR);
}
__attribute__((convergent))
inline void vx_barrier_mask(unsigned barrier_id, unsigned warp_mask) {
asm volatile (".insn r %0, 7, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barrier_id), "r"(warp_mask));
}
// Return current thread identifier
@@ -202,6 +243,22 @@ inline int vx_num_warps() {
return ret;
}
inline int vx_num_scalar_warps() {
return NUM_SCALAR_WARPS;
}
inline int vx_num_tensor_warps() {
return NUM_TENSOR_WARPS;
}
inline unsigned vx_scalar_warp_mask() {
return (1u << NUM_SCALAR_WARPS) - 1u;
}
inline unsigned vx_tensor_warp_mask() {
return ((1u << NUM_TENSOR_WARPS) - 1u) << NUM_SCALAR_WARPS;
}
// Return the number of cores per cluster
inline int vx_num_cores() {
int ret;

View File

@@ -17,6 +17,10 @@
#include <stdint.h>
#include <stdio.h>
#ifndef CORES_PER_CLUSTER
#define CORES_PER_CLUSTER 8
#endif
#ifdef __cplusplus
extern "C" {
#endif
@@ -48,6 +52,7 @@ void vx_wspawn_wait();
void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg);
void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback, void * arg);
void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void * arg);
void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void * arg);
void vx_serial(vx_serial_cb callback, void * arg);

View File

@@ -74,18 +74,9 @@ static void __attribute__ ((noinline)) spawn_tasks_all_stub() {
}
}
static void __attribute__ ((noinline)) spawn_tasks_rem_stub() {
int cid = vx_core_id();
int tid = vx_thread_id();
wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid];
int task_id = p_wspawn_args->offset + tid;
(p_wspawn_args->callback)(task_id, p_wspawn_args->arg);
}
static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() {
int NT = vx_num_threads();
int NW = vx_num_warps();
int NW = NUM_SCALAR_WARPS;
int cid = vx_core_id();
int wid = vx_warp_id();
int tid = vx_thread_id();
@@ -103,6 +94,60 @@ static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() {
}
}
static void __attribute__ ((noinline)) spawn_tasks_cluster_all_stub() {
int NT = vx_num_threads();
int NW = NUM_SCALAR_WARPS;
int cid = vx_core_id();
int wid = vx_warp_id();
int tid = vx_thread_id();
const int core_id_in_cluster = cid % CORES_PER_CLUSTER;
// round-robin warp_id allocation across cores in cluster
const int wid_in_cluster = CORES_PER_CLUSTER * wid + core_id_in_cluster;
wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid];
int waves = p_wspawn_args->NWs + (wid < p_wspawn_args->RWs);
int offset = p_wspawn_args->offset + (NT * wid_in_cluster + tid);
vx_spawn_tasks_cb callback = p_wspawn_args->callback;
void* arg = p_wspawn_args->arg;
// sequential iterations
for (int wave_id = 0; wave_id < waves; ++wave_id) {
int task_id = offset + (wave_id * NT * NW * CORES_PER_CLUSTER);
callback(task_id, arg);
}
}
static void __attribute__ ((noinline)) spawn_tasks_rem_stub() {
int cid = vx_core_id();
int tid = vx_thread_id();
wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid];
int task_id = p_wspawn_args->offset + tid;
(p_wspawn_args->callback)(task_id, p_wspawn_args->arg);
}
static void __attribute__ ((noinline)) spawn_tasks_cluster_rem_stub() {
int NT = vx_num_threads();
int cid = vx_core_id();
int tid = vx_thread_id();
int wid = vx_warp_id();
const int core_id_in_cluster = cid % CORES_PER_CLUSTER;
// round-robin warp_id allocation across cores in cluster
const int wid_in_cluster = CORES_PER_CLUSTER * wid + core_id_in_cluster;
wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid];
// FIXME: This assumes that all cores but the last one are working with full
// warps, and only the last core has a partially-filled warp.
int offset = p_wspawn_args->offset + (NT * wid_in_cluster + tid);
int task_id = offset;
(p_wspawn_args->callback)(task_id, p_wspawn_args->arg);
}
static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_cb() {
// activate all threads
vx_tmc(-1);
@@ -111,7 +156,17 @@ static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_cb() {
spawn_tasks_contiguous_all_stub();
// disable warp
// deadlock here on warps 1, 2, 3
vx_tmc_zero();
}
static void __attribute__ ((noinline)) spawn_tasks_cluster_all_cb() {
// activate all threads
vx_tmc(-1);
// call stub routine
spawn_tasks_cluster_all_stub();
// disable warp
vx_tmc_zero();
}
@@ -126,10 +181,115 @@ static void __attribute__ ((noinline)) spawn_tasks_all_cb() {
vx_tmc_zero();
}
// This function runs in every core, but with only 1 warp and 1 thread enabled.
// The logic in this function figures out how many warps/threads this particular
// core has to enable to fulfill an entire grid of computation.
void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg) {
// device specs
const int NC = vx_num_cores();
const int NW = NUM_SCALAR_WARPS;
const int NT = vx_num_threads();
// NOTE: assumes divisible
const int num_cluster = NC / CORES_PER_CLUSTER;
// current core id
int core_id = vx_core_id();
if (core_id >= NUM_CORES_MAX)
return;
const int cluster_id = core_id / CORES_PER_CLUSTER;
const int core_id_in_cluster = core_id % CORES_PER_CLUSTER;
// try to fill up full clusters first
const int num_threads_in_cluster = CORES_PER_CLUSTER * NW * NT;
const int num_used_clusters =
(num_tasks + (num_threads_in_cluster - 1)) / num_threads_in_cluster;
if (cluster_id >= num_used_clusters) {
return; // terminate extra clusters
}
// fill up the last cluster with remaining tasks
const int num_full_clusters = num_tasks / num_threads_in_cluster;
int num_tasks_this_cluster = num_threads_in_cluster;
if (cluster_id >= num_full_clusters) {
num_tasks_this_cluster = num_tasks % num_threads_in_cluster;
}
// Distribute threads equally across as many cores as possible, even if they
// don't fill up NW*NT in a single core. This makes sure the warps get evenly
// distributed in a single cluster
//
// TODO: Try to contain in a single cluster if possible?
const int num_active_cores = (num_tasks + (NT - 1)) / NT;
if (core_id >= num_active_cores)
return; // terminate extra cores
const int num_full_warps_this_cluster = num_tasks_this_cluster / NT;
const int rem_threads_in_last_warp = num_tasks_this_cluster % NT;
// const int num_warps = (num_tasks_this_cluster + (NT - 1)) / NT;
int num_warps_this_core = num_full_warps_this_cluster / CORES_PER_CLUSTER;
const int num_warps_in_last_row = num_full_warps_this_cluster % CORES_PER_CLUSTER;
if (core_id_in_cluster < num_warps_in_last_row) {
num_warps_this_core++;
}
// if 0, last warp is full-threads enabled
int rem_threads_in_last_warp_this_core = 0;
if (rem_threads_in_last_warp != 0) {
if (core_id_in_cluster == num_warps_in_last_row - 1) {
rem_threads_in_last_warp_this_core = rem_threads_in_last_warp;
}
}
// sequential iterations
const int num_full_waves = num_warps_this_core / NW;
const int rem_full_warps_in_last_wave = num_warps_this_core % NW;
const int offset = cluster_id * num_tasks_this_cluster;
wspawn_tasks_args_t wspawn_args = {callback, arg, offset, num_full_waves,
rem_full_warps_in_last_wave};
g_wspawn_args[core_id] = &wspawn_args;
if (num_warps_this_core > 0) {
// execute callback on other warps
const int nw = MIN(num_warps_this_core, NW);
vx_wspawn(nw, spawn_tasks_cluster_all_cb);
// activate all threads
vx_tmc(-1);
// call stub routine
spawn_tasks_cluster_all_stub();
// back to single-threaded
vx_tmc_one();
// wait for spawn warps to terminate
vx_wspawn_wait();
}
// TODO: this is incomplete
// TODO: Instead of launching an additional wave just to work on remaining
// threads, handle this in the last wave amongst other full warps.
if (rem_threads_in_last_warp != 0 && core_id_in_cluster == 0) {
// adjust offset
// FIXME: use rem_threads_in_last_warp_this_core
wspawn_args.offset += (num_tasks_this_cluster - rem_threads_in_last_warp);
// activate remaining threads
const int tmask = (1 << rem_threads_in_last_warp) - 1;
vx_tmc(tmask);
// call stub routine
spawn_tasks_cluster_rem_stub();
// back to single-threaded
vx_tmc_one();
}
}
void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void * arg) {
// device specs
int NC = vx_num_cores();
int NW = vx_num_warps();
int NW = NUM_SCALAR_WARPS;
int NT = vx_num_threads();
// current core id
@@ -179,7 +339,6 @@ void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void
vx_tmc_one();
// wait for spawn warps to terminate
// deadlock here on warp 0!
vx_wspawn_wait();
}
@@ -202,7 +361,7 @@ void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void
void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) {
// device specs
int NC = vx_num_cores();
int NW = vx_num_warps();
int NW = NUM_SCALAR_WARPS;
int NT = vx_num_threads();
// current core id
@@ -356,7 +515,7 @@ void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg) {
// device specs
int NC = vx_num_cores();
int NW = vx_num_warps();
int NW = NUM_SCALAR_WARPS;
int NT = vx_num_threads();
// current core id

View File

@@ -22,9 +22,9 @@
_start:
# initialize per-thread registers
csrr t0, VX_CSR_NUM_WARPS # get num warps
li t0, ((1 << NUM_SCALAR_WARPS) - 1) # scalar warp mask
la t1, init_regs_all
.insn r RISCV_CUSTOM0, 1, 0, x0, t0, t1 # wspawn t0, t1
.insn r RISCV_CUSTOM0, 6, 0, x0, t0, t1 # wspawn_mask t0, t1
li t0, -1
.insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0 # tmc t0
jal init_regs
@@ -35,9 +35,9 @@ _start:
jal vx_wspawn_wait
# initialize TLS for all warps
csrr t0, VX_CSR_NUM_WARPS # get num warps
li t0, ((1 << NUM_SCALAR_WARPS) - 1) # scalar warp mask
la t1, init_tls_all
.insn r RISCV_CUSTOM0, 1, 0, x0, t0, t1 # wspawn t0, t1
.insn r RISCV_CUSTOM0, 6, 0, x0, t0, t1 # wspawn_mask t0, t1
li t0, -1
.insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0 # tmc t0
call __init_tls
@@ -102,6 +102,8 @@ init_regs:
#endif
csrr t0, VX_CSR_MHARTID
sll t1, t0, STACK_LOG2_SIZE
sll t2, t0, 4
add t1, t1, t2
sub sp, sp, t1
# set thread pointer register