Compare commits
10 Commits
ef902614ff
...
0ad87bde81
| Author | SHA1 | Date | |
|---|---|---|---|
| 0ad87bde81 | |||
| 323ed7d7e9 | |||
| cb912d3b8b | |||
|
|
f1d0fac518 | ||
|
|
c8529c4339 | ||
|
|
cf000afc8f | ||
|
|
8dc2a25e32 | ||
|
|
d794e055b6 | ||
|
|
ed61418ebf | ||
|
|
2e3ea060a5 |
@@ -84,15 +84,15 @@
|
||||
#endif
|
||||
|
||||
#ifndef NUM_CORES
|
||||
#define NUM_CORES 8
|
||||
#define NUM_CORES 1
|
||||
#endif
|
||||
|
||||
#ifndef NUM_WARPS
|
||||
#define NUM_WARPS 8
|
||||
#define NUM_WARPS 4
|
||||
#endif
|
||||
|
||||
#ifndef NUM_THREADS
|
||||
#define NUM_THREADS 8
|
||||
#define NUM_THREADS 4
|
||||
#endif
|
||||
|
||||
#ifndef NUM_BARRIERS
|
||||
@@ -682,4 +682,3 @@
|
||||
#define IMPLEMENTATION_ID 0
|
||||
|
||||
#endif // VX_CONFIG_VH
|
||||
|
||||
|
||||
@@ -44,9 +44,9 @@
|
||||
`ifndef EXT_T_DISABLE
|
||||
`define EXT_T_ENABLE
|
||||
// decoupled Hopper-style tensor core
|
||||
`ifndef EXT_T_HOPPER
|
||||
`define EXT_T_HOPPER
|
||||
`endif
|
||||
// `ifndef EXT_T_HOPPER
|
||||
// `define EXT_T_HOPPER
|
||||
// `endif
|
||||
`endif
|
||||
|
||||
`ifndef XLEN_32
|
||||
@@ -88,15 +88,48 @@
|
||||
`endif
|
||||
|
||||
`ifndef NUM_CORES
|
||||
`define NUM_CORES 4
|
||||
`define NUM_CORES 1
|
||||
`endif
|
||||
|
||||
`ifndef NUM_WARPS
|
||||
`define NUM_WARPS 8
|
||||
`define NUM_WARPS 4
|
||||
`endif
|
||||
|
||||
`ifndef NUM_TENSOR_WARPS
|
||||
`define NUM_TENSOR_WARPS 2
|
||||
`endif
|
||||
|
||||
`define NUM_SCALAR_WARPS (`NUM_WARPS - `NUM_TENSOR_WARPS)
|
||||
|
||||
`define WU_CONFIG_STATIC_ASSERTS \
|
||||
generate \
|
||||
if (!(`NUM_WARPS > 0)) begin : g_wu_num_warps_gt_zero \
|
||||
invalid_NUM_WARPS_must_be_greater_than_zero __wu_config_error(); \
|
||||
end \
|
||||
if (!(`NUM_TENSOR_WARPS > 0)) begin : g_wu_num_tensor_warps_gt_zero \
|
||||
invalid_NUM_TENSOR_WARPS_must_be_greater_than_zero __wu_config_error(); \
|
||||
end \
|
||||
if (!(`NUM_TENSOR_WARPS < `NUM_WARPS)) begin : g_wu_num_tensor_warps_lt_num_warps \
|
||||
invalid_NUM_TENSOR_WARPS_must_be_smaller_than_NUM_WARPS __wu_config_error(); \
|
||||
end \
|
||||
if (!(`NUM_SCALAR_WARPS > 0)) begin : g_wu_num_scalar_warps_gt_zero \
|
||||
invalid_NUM_SCALAR_WARPS_must_be_greater_than_zero __wu_config_error(); \
|
||||
end \
|
||||
endgenerate
|
||||
|
||||
`define IS_SCALAR_WARP(wid) ((wid) < `NUM_SCALAR_WARPS)
|
||||
`define IS_TENSOR_WARP(wid) ((wid) >= `NUM_SCALAR_WARPS)
|
||||
|
||||
`ifndef TENSOR_NUM_GPRS
|
||||
`define TENSOR_NUM_GPRS 8
|
||||
`endif
|
||||
|
||||
`ifndef TENSOR_NUM_FPRS
|
||||
`define TENSOR_NUM_FPRS 8
|
||||
`endif
|
||||
|
||||
`ifndef NUM_THREADS
|
||||
`define NUM_THREADS 8
|
||||
`define NUM_THREADS 4
|
||||
`endif
|
||||
|
||||
`ifndef NUM_BARRIERS
|
||||
|
||||
@@ -5,8 +5,9 @@
|
||||
module Vortex import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0,
|
||||
parameter TENSOR_FP16 = 0,
|
||||
parameter BOOTROM_HANG100 = 32'h10100,
|
||||
parameter logic [63:0] STARTUP_ADDR = 64'h0000_0000_0001_0100,
|
||||
parameter NUM_THREADS = 0,
|
||||
parameter NUM_TENSOR_CORES = 1,
|
||||
parameter TC_DATA_WIDTH = 256,
|
||||
parameter TC_TAG_WIDTH = 4
|
||||
) (
|
||||
@@ -77,14 +78,32 @@ module Vortex import VX_gpu_pkg::*; #(
|
||||
output [(DCACHE_NUM_REQS * 32) - 1:0] smem_a_bits_data,
|
||||
|
||||
// tc --------------------------------------------------
|
||||
input [1:0] tc_a_ready,
|
||||
output [1:0] tc_a_valid,
|
||||
output [63:0] tc_a_bits_address,
|
||||
output [2 * TC_TAG_WIDTH - 1:0] tc_a_bits_tag,
|
||||
output [1:0] tc_d_ready,
|
||||
input [1:0] tc_d_valid,
|
||||
input [2 * TC_DATA_WIDTH - 1:0] tc_d_bits_data,
|
||||
input [2 * TC_TAG_WIDTH - 1:0] tc_d_bits_tag,
|
||||
input [NUM_TENSOR_CORES * 3 - 1:0] tc_a_ready,
|
||||
output [NUM_TENSOR_CORES * 3 - 1:0] tc_a_valid,
|
||||
output [NUM_TENSOR_CORES * 3 - 1:0] tc_a_bits_write,
|
||||
output [NUM_TENSOR_CORES * 3 * 32 - 1:0] tc_a_bits_address,
|
||||
output [NUM_TENSOR_CORES * 3 * TC_TAG_WIDTH - 1:0] tc_a_bits_tag,
|
||||
output [NUM_TENSOR_CORES * 3 * 32 - 1:0] tc_a_bits_mask,
|
||||
output [NUM_TENSOR_CORES * 3 * TC_DATA_WIDTH - 1:0] tc_a_bits_data,
|
||||
output [NUM_TENSOR_CORES * 3 - 1:0] tc_d_ready,
|
||||
input [NUM_TENSOR_CORES * 3 - 1:0] tc_d_valid,
|
||||
input [NUM_TENSOR_CORES * 3 * TC_DATA_WIDTH - 1:0] tc_d_bits_data,
|
||||
input [NUM_TENSOR_CORES * 3 * TC_TAG_WIDTH - 1:0] tc_d_bits_tag,
|
||||
|
||||
// shared tmem direct SRAM ports
|
||||
output [NUM_TENSOR_CORES-1:0] tc_tmem_A_ren,
|
||||
input [NUM_TENSOR_CORES-1:0] tc_tmem_A_rready,
|
||||
output [NUM_TENSOR_CORES*9-1:0] tc_tmem_A_raddr,
|
||||
input [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tc_tmem_A_rdata,
|
||||
output [NUM_TENSOR_CORES-1:0] tc_tmem_C_ren,
|
||||
input [NUM_TENSOR_CORES-1:0] tc_tmem_C_rready,
|
||||
output [NUM_TENSOR_CORES*9-1:0] tc_tmem_C_raddr,
|
||||
input [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tc_tmem_C_rdata,
|
||||
output [NUM_TENSOR_CORES-1:0] tc_tmem_C_wen,
|
||||
input [NUM_TENSOR_CORES-1:0] tc_tmem_C_wready,
|
||||
output [NUM_TENSOR_CORES*9-1:0] tc_tmem_C_waddr,
|
||||
output [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tc_tmem_C_wdata,
|
||||
output [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN/8-1:0] tc_tmem_C_mask,
|
||||
|
||||
// gbar ------------------------------------------------
|
||||
|
||||
@@ -302,20 +321,52 @@ module Vortex import VX_gpu_pkg::*; #(
|
||||
endgenerate
|
||||
|
||||
// tc ---------------------------------------------------------------------
|
||||
VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p0_bus_if();
|
||||
VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p1_bus_if();
|
||||
assign tc_a_valid = {tc_p1_bus_if.req_valid, tc_p0_bus_if.req_valid};
|
||||
assign tc_a_bits_address = {tc_p1_bus_if.req_data.addr, tc_p0_bus_if.req_data.addr};
|
||||
assign tc_a_bits_tag = {tc_p1_bus_if.req_data.tag, tc_p0_bus_if.req_data.tag};
|
||||
assign tc_p0_bus_if.req_ready = tc_a_ready[0];
|
||||
assign tc_p0_bus_if.rsp_valid = tc_d_valid[0];
|
||||
assign tc_p0_bus_if.rsp_data.data = tc_d_bits_data[0 * TC_DATA_WIDTH +: TC_DATA_WIDTH];
|
||||
assign tc_p0_bus_if.rsp_data.tag = tc_d_bits_tag[0 * TC_TAG_WIDTH +: TC_TAG_WIDTH];
|
||||
assign tc_p1_bus_if.req_ready = tc_a_ready[1];
|
||||
assign tc_p1_bus_if.rsp_valid = tc_d_valid[1];
|
||||
assign tc_p1_bus_if.rsp_data.data = tc_d_bits_data[1 * TC_DATA_WIDTH +: TC_DATA_WIDTH];
|
||||
assign tc_p1_bus_if.rsp_data.tag = tc_d_bits_tag[1 * TC_TAG_WIDTH +: TC_TAG_WIDTH];
|
||||
assign tc_d_ready = {tc_p1_bus_if.rsp_ready, tc_p0_bus_if.rsp_ready};
|
||||
VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p0_bus_if[NUM_TENSOR_CORES]();
|
||||
VX_tc_bus_if #(.TAG_WIDTH(TC_TAG_WIDTH)) tc_p2_bus_if[NUM_TENSOR_CORES]();
|
||||
|
||||
for (genvar tc = 0; tc < NUM_TENSOR_CORES; ++tc) begin : g_tc_ports
|
||||
localparam P0 = tc * 3;
|
||||
localparam P1 = tc * 3 + 1;
|
||||
localparam P2 = tc * 3 + 2;
|
||||
|
||||
assign tc_a_valid[P0] = tc_p0_bus_if[tc].req_valid;
|
||||
assign tc_a_valid[P1] = 1'b0;
|
||||
assign tc_a_valid[P2] = tc_p2_bus_if[tc].req_valid;
|
||||
|
||||
assign tc_a_bits_write[P0] = tc_p0_bus_if[tc].req_data.rw;
|
||||
assign tc_a_bits_write[P1] = 1'b0;
|
||||
assign tc_a_bits_write[P2] = tc_p2_bus_if[tc].req_data.rw;
|
||||
|
||||
assign tc_a_bits_address[P0 * 32 +: 32] = tc_p0_bus_if[tc].req_data.addr;
|
||||
assign tc_a_bits_address[P1 * 32 +: 32] = 32'b0;
|
||||
assign tc_a_bits_address[P2 * 32 +: 32] = tc_p2_bus_if[tc].req_data.addr;
|
||||
|
||||
assign tc_a_bits_tag[P0 * TC_TAG_WIDTH +: TC_TAG_WIDTH] = tc_p0_bus_if[tc].req_data.tag;
|
||||
assign tc_a_bits_tag[P1 * TC_TAG_WIDTH +: TC_TAG_WIDTH] = '0;
|
||||
assign tc_a_bits_tag[P2 * TC_TAG_WIDTH +: TC_TAG_WIDTH] = tc_p2_bus_if[tc].req_data.tag;
|
||||
|
||||
assign tc_a_bits_mask[P0 * 32 +: 32] = tc_p0_bus_if[tc].req_data.byteen;
|
||||
assign tc_a_bits_mask[P1 * 32 +: 32] = '0;
|
||||
assign tc_a_bits_mask[P2 * 32 +: 32] = tc_p2_bus_if[tc].req_data.byteen;
|
||||
|
||||
assign tc_a_bits_data[P0 * TC_DATA_WIDTH +: TC_DATA_WIDTH] = tc_p0_bus_if[tc].req_data.data;
|
||||
assign tc_a_bits_data[P1 * TC_DATA_WIDTH +: TC_DATA_WIDTH] = '0;
|
||||
assign tc_a_bits_data[P2 * TC_DATA_WIDTH +: TC_DATA_WIDTH] = tc_p2_bus_if[tc].req_data.data;
|
||||
|
||||
assign tc_p0_bus_if[tc].req_ready = tc_a_ready[P0];
|
||||
assign tc_p0_bus_if[tc].rsp_valid = tc_d_valid[P0];
|
||||
assign tc_p0_bus_if[tc].rsp_data.data = tc_d_bits_data[P0 * TC_DATA_WIDTH +: TC_DATA_WIDTH];
|
||||
assign tc_p0_bus_if[tc].rsp_data.tag = tc_d_bits_tag[P0 * TC_TAG_WIDTH +: TC_TAG_WIDTH];
|
||||
|
||||
assign tc_p2_bus_if[tc].req_ready = tc_a_ready[P2];
|
||||
assign tc_p2_bus_if[tc].rsp_valid = tc_d_valid[P2];
|
||||
assign tc_p2_bus_if[tc].rsp_data.data = tc_d_bits_data[P2 * TC_DATA_WIDTH +: TC_DATA_WIDTH];
|
||||
assign tc_p2_bus_if[tc].rsp_data.tag = tc_d_bits_tag[P2 * TC_TAG_WIDTH +: TC_TAG_WIDTH];
|
||||
|
||||
assign tc_d_ready[P0] = tc_p0_bus_if[tc].rsp_ready;
|
||||
assign tc_d_ready[P1] = 1'b0;
|
||||
assign tc_d_ready[P2] = tc_p2_bus_if[tc].rsp_ready;
|
||||
end
|
||||
|
||||
// gbar -------------------------------------------------------------------
|
||||
`ifdef GBAR_ENABLE
|
||||
@@ -389,15 +440,14 @@ module Vortex import VX_gpu_pkg::*; #(
|
||||
|
||||
dcr_write_valid = 1'b1;
|
||||
dcr_write_addr = `VX_DCR_BASE_STARTUP_ADDR0;
|
||||
dcr_write_data = BOOTROM_HANG100;
|
||||
dcr_write_data = STARTUP_ADDR[31:0];
|
||||
end
|
||||
`VX_DCR_BASE_STARTUP_ADDR1: begin
|
||||
dcr_state_n = `VX_DCR_BASE_MPM_CLASS;
|
||||
|
||||
dcr_write_valid = 1'b1;
|
||||
dcr_write_addr = `VX_DCR_BASE_STARTUP_ADDR1;
|
||||
// FIXME: not sure what this does
|
||||
dcr_write_data = `VX_DCR_DATA_WIDTH'h0;
|
||||
dcr_write_data = STARTUP_ADDR[63:32];
|
||||
end
|
||||
`VX_DCR_BASE_MPM_CLASS: begin
|
||||
dcr_state_n = `VX_DCR_BASE_STATE_END;
|
||||
@@ -424,7 +474,8 @@ module Vortex import VX_gpu_pkg::*; #(
|
||||
// TODO: SCOPE_IO_BIND should be socket id
|
||||
VX_core #(
|
||||
.CORE_ID (CORE_ID),
|
||||
.TENSOR_FP16 (TENSOR_FP16)
|
||||
.TENSOR_FP16 (TENSOR_FP16),
|
||||
.NUM_TENSOR_CORES (NUM_TENSOR_CORES)
|
||||
) core (
|
||||
`SCOPE_IO_BIND (0)
|
||||
|
||||
@@ -449,7 +500,37 @@ module Vortex import VX_gpu_pkg::*; #(
|
||||
`endif
|
||||
|
||||
.tensor_smem_A_if (tc_p0_bus_if),
|
||||
.tensor_smem_B_if (tc_p1_bus_if),
|
||||
`ifdef EXT_T_BLACKWELL
|
||||
.tensor_tmem_A_ren(tc_tmem_A_ren),
|
||||
.tensor_tmem_A_rready(tc_tmem_A_rready),
|
||||
.tensor_tmem_A_raddr(tc_tmem_A_raddr),
|
||||
.tensor_tmem_A_rdata(tc_tmem_A_rdata),
|
||||
.tensor_tmem_C_ren(tc_tmem_C_ren),
|
||||
.tensor_tmem_C_rready(tc_tmem_C_rready),
|
||||
.tensor_tmem_C_raddr(tc_tmem_C_raddr),
|
||||
.tensor_tmem_C_rdata(tc_tmem_C_rdata),
|
||||
.tensor_tmem_C_wen(tc_tmem_C_wen),
|
||||
.tensor_tmem_C_wready(tc_tmem_C_wready),
|
||||
.tensor_tmem_C_waddr(tc_tmem_C_waddr),
|
||||
.tensor_tmem_C_wdata(tc_tmem_C_wdata),
|
||||
.tensor_tmem_C_mask(tc_tmem_C_mask),
|
||||
.tensor_smem_B_if (tc_p2_bus_if),
|
||||
`else
|
||||
.tensor_tmem_A_ren(tc_tmem_A_ren),
|
||||
.tensor_tmem_A_rready(tc_tmem_A_rready),
|
||||
.tensor_tmem_A_raddr(tc_tmem_A_raddr),
|
||||
.tensor_tmem_A_rdata(tc_tmem_A_rdata),
|
||||
.tensor_tmem_C_ren(tc_tmem_C_ren),
|
||||
.tensor_tmem_C_rready(tc_tmem_C_rready),
|
||||
.tensor_tmem_C_raddr(tc_tmem_C_raddr),
|
||||
.tensor_tmem_C_rdata(tc_tmem_C_rdata),
|
||||
.tensor_tmem_C_wen(tc_tmem_C_wen),
|
||||
.tensor_tmem_C_wready(tc_tmem_C_wready),
|
||||
.tensor_tmem_C_waddr(tc_tmem_C_waddr),
|
||||
.tensor_tmem_C_wdata(tc_tmem_C_wdata),
|
||||
.tensor_tmem_C_mask(tc_tmem_C_mask),
|
||||
.tensor_smem_B_if (tc_p2_bus_if),
|
||||
`endif
|
||||
|
||||
.sim_ebreak (sim_ebreak),
|
||||
.sim_wb_value (sim_wb_value),
|
||||
@@ -550,7 +631,7 @@ module Vortex import VX_gpu_pkg::*; #(
|
||||
$display("simulation has probably ended. exiting");
|
||||
$finish();
|
||||
end
|
||||
if (finished) begin
|
||||
if (busy_prev && !busy) begin
|
||||
$display("---------------- core%2d has no more active warps ----------------", CORE_ID);
|
||||
// TODO: lane assumed to be 4
|
||||
// `ifndef SYNTHESIS
|
||||
@@ -571,8 +652,3 @@ module Vortex import VX_gpu_pkg::*; #(
|
||||
`endif
|
||||
|
||||
endmodule : Vortex
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -243,21 +243,35 @@
|
||||
`define INST_SFU_WSPAWN 4'h1
|
||||
`define INST_SFU_SPLIT 4'h2
|
||||
`define INST_SFU_JOIN 4'h3
|
||||
`define INST_SFU_BAR 4'h4
|
||||
`define INST_SFU_PRED 4'h5
|
||||
`define INST_SFU_CSRRW 4'h6
|
||||
`define INST_SFU_CSRRS 4'h7
|
||||
`define INST_SFU_CSRRC 4'h8
|
||||
`define INST_SFU_CMOV 4'h9
|
||||
`define INST_SFU_BITS 4
|
||||
`define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1)
|
||||
`define INST_SFU_IS_WCTL(op) (op <= 5)
|
||||
`define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8)
|
||||
`define INST_SFU_BAR 4'h4
|
||||
`define INST_SFU_PRED 4'h5
|
||||
`define INST_SFU_CSRRW 4'h6
|
||||
`define INST_SFU_CSRRS 4'h7
|
||||
`define INST_SFU_CSRRC 4'h8
|
||||
`define INST_SFU_CMOV 4'h9
|
||||
`define INST_SFU_BAR_MASK 4'ha
|
||||
`define INST_SFU_BITS 4
|
||||
`define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1)
|
||||
`define INST_SFU_IS_WCTL(op) ((op <= 5) || (op == `INST_SFU_BAR_MASK))
|
||||
`define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8)
|
||||
|
||||
`define INST_TENSOR_HMMA 4'b0000
|
||||
// Hopper WGMMA-style asynchronous op
|
||||
`define INST_TENSOR_HGMMA 4'b0001
|
||||
`define INST_TENSOR_HGMMA_WAIT 4'b0010
|
||||
`define INST_TENSOR_TCGEN05_CP 4'b0011
|
||||
`define INST_TENSOR_TCGEN05_CP_WAIT 4'b0100
|
||||
`define INST_TENSOR_BWGMMA 4'b0101
|
||||
`define INST_TENSOR_BWGMMA_WAIT 4'b0110
|
||||
`define INST_TENSOR_TCGEN05_LD 4'b0111
|
||||
`define INST_TENSOR_TCGEN05_ST 4'b1000
|
||||
`define INST_TENSOR_TCGEN05_CB 4'b1001
|
||||
|
||||
`ifdef EXT_T_HOPPER
|
||||
`define EXT_T_ASYNC
|
||||
`elsif EXT_T_BLACKWELL
|
||||
`define EXT_T_ASYNC
|
||||
`endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
@@ -46,6 +46,8 @@ package VX_gpu_pkg;
|
||||
logic valid;
|
||||
logic [`NB_WIDTH-1:0] id;
|
||||
logic is_global;
|
||||
logic [1:0] domain;
|
||||
logic [`NUM_WARPS-1:0] mask;
|
||||
`ifdef GBAR_ENABLE
|
||||
logic [`MAX(`NW_WIDTH, `NC_WIDTH)-1:0] size_m1;
|
||||
`else
|
||||
@@ -53,6 +55,13 @@ package VX_gpu_pkg;
|
||||
`endif
|
||||
} barrier_t;
|
||||
|
||||
localparam logic [1:0] BARRIER_ALL = 2'd0;
|
||||
localparam logic [1:0] BARRIER_SCALAR = 2'd1;
|
||||
localparam logic [1:0] BARRIER_TENSOR = 2'd2;
|
||||
localparam logic [1:0] BARRIER_MASK = 2'd3;
|
||||
localparam logic WU_DOMAIN_SCALAR = 1'b0;
|
||||
localparam logic WU_DOMAIN_TENSOR = 1'b1;
|
||||
|
||||
typedef struct packed {
|
||||
logic [`XLEN-1:0] startup_addr;
|
||||
logic [7:0] mpm_class;
|
||||
@@ -90,7 +99,7 @@ package VX_gpu_pkg;
|
||||
localparam ICACHE_TAG_ID_BITS = `NW_WIDTH;
|
||||
|
||||
// Core request tag bits
|
||||
localparam ICACHE_TAG_WIDTH = (`UUID_WIDTH + ICACHE_TAG_ID_BITS);
|
||||
localparam ICACHE_TAG_WIDTH = (1 + `UUID_WIDTH + ICACHE_TAG_ID_BITS);
|
||||
|
||||
// Memory request data bits
|
||||
localparam ICACHE_MEM_DATA_WIDTH = (ICACHE_LINE_SIZE * 8);
|
||||
|
||||
@@ -179,6 +179,7 @@
|
||||
`define VX_CSR_CORE_ID 12'hCC2
|
||||
`define VX_CSR_WARP_MASK 12'hCC3
|
||||
`define VX_CSR_THREAD_MASK 12'hCC4 // warning! this value is also used in LLVM
|
||||
`define VX_CSR_GCID 12'hCC5 // legacy global core id alias used by Radiance bootrom
|
||||
|
||||
`define VX_CSR_NUM_THREADS 12'hFC0
|
||||
`define VX_CSR_NUM_WARPS 12'hFC1
|
||||
|
||||
@@ -188,6 +188,46 @@ module VX_alu_unit #(
|
||||
.ready_out (commit_block_if[block_idx].ready),
|
||||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
|
||||
always @(posedge clk) begin
|
||||
if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)) begin
|
||||
if (execute_if[block_idx].valid
|
||||
&& ((execute_if[block_idx].data.PC == 32'h80000010) || (execute_if[block_idx].data.PC == 32'h80000014))) begin
|
||||
`TRACE(1, ("%d: core%0d-alu-execute-block: block=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, op=0x%0h, mod=%0d, is_mul=%b, is_red=%b, wb=%0d, rd=%0d (#%0d)\n",
|
||||
$time, CORE_ID, block_idx, execute_if[block_idx].valid, execute_if[block_idx].ready,
|
||||
execute_if[block_idx].data.wid, execute_if[block_idx].data.PC,
|
||||
execute_if[block_idx].data.op_type, execute_if[block_idx].data.op_mod,
|
||||
is_muldiv_op, is_reduce_op, execute_if[block_idx].data.wb,
|
||||
execute_if[block_idx].data.rd, execute_if[block_idx].data.uuid));
|
||||
end
|
||||
if (int_execute_if.valid
|
||||
&& ((int_execute_if.data.PC == 32'h80000010) || (int_execute_if.data.PC == 32'h80000014))) begin
|
||||
`TRACE(1, ("%d: core%0d-alu-int-execute: block=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d (#%0d)\n",
|
||||
$time, CORE_ID, block_idx, int_execute_if.valid, int_execute_if.ready,
|
||||
int_execute_if.data.wid, int_execute_if.data.PC,
|
||||
int_execute_if.data.wb, int_execute_if.data.rd, int_execute_if.data.uuid));
|
||||
end
|
||||
if (int_commit_if.valid
|
||||
&& ((int_commit_if.data.PC == 32'h80000010) || (int_commit_if.data.PC == 32'h80000014))) begin
|
||||
`TRACE(1, ("%d: core%0d-alu-int-commit: block=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
|
||||
$time, CORE_ID, block_idx, int_commit_if.valid, int_commit_if.ready,
|
||||
int_commit_if.data.wid, int_commit_if.data.PC,
|
||||
int_commit_if.data.wb, int_commit_if.data.rd,
|
||||
int_commit_if.data.sop, int_commit_if.data.eop, int_commit_if.data.uuid));
|
||||
end
|
||||
if (commit_block_if[block_idx].valid
|
||||
&& ((commit_block_if[block_idx].data.PC == 32'h80000010) || (commit_block_if[block_idx].data.PC == 32'h80000014))) begin
|
||||
`TRACE(1, ("%d: core%0d-alu-commit-block: block=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
|
||||
$time, CORE_ID, block_idx, commit_block_if[block_idx].valid, commit_block_if[block_idx].ready,
|
||||
commit_block_if[block_idx].data.wid, commit_block_if[block_idx].data.PC,
|
||||
commit_block_if[block_idx].data.wb, commit_block_if[block_idx].data.rd,
|
||||
commit_block_if[block_idx].data.sop, commit_block_if[block_idx].data.eop,
|
||||
commit_block_if[block_idx].data.uuid));
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
end
|
||||
|
||||
`RESET_RELAY (commit_reset, reset);
|
||||
|
||||
@@ -208,7 +208,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
||||
// Writeback
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb;
|
||||
assign writeback_if[i].valid = commit_if[i].valid && (commit_if[i].data.wb || commit_if[i].data.tensor);
|
||||
assign writeback_if[i].data.uuid = commit_if[i].data.uuid;
|
||||
assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid);
|
||||
assign writeback_if[i].data.PC = commit_if[i].data.PC;
|
||||
@@ -224,7 +224,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
||||
// simulation helper signal to get RISC-V tests Pass/Fail status
|
||||
reg [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value_r;
|
||||
always @(posedge clk) begin
|
||||
if (writeback_if[0].valid) begin
|
||||
if (writeback_if[0].valid && !writeback_if[0].data.tensor) begin
|
||||
sim_wb_value_r[writeback_if[0].data.rd] <= writeback_if[0].data.data[0];
|
||||
end
|
||||
end
|
||||
@@ -234,6 +234,26 @@ module VX_commit import VX_gpu_pkg::*; #(
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
always @(posedge clk) begin
|
||||
if (!reset && ($time > `TRACE_STARTTIME)) begin
|
||||
if ((CORE_ID == 0)
|
||||
&& commit_if[i].valid
|
||||
&& ((commit_if[i].data.PC == 32'h80000010) || (commit_if[i].data.PC == 32'h80000014))) begin
|
||||
`TRACE(1, ("%d: core%0d-commit-arb-out: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, tensor=%b, sop=%b, eop=%b, fire=%b (#%0d)\n",
|
||||
$time, CORE_ID, i, commit_if[i].valid, commit_if[i].ready,
|
||||
commit_if[i].data.wid, commit_if[i].data.PC,
|
||||
commit_if[i].data.wb, commit_if[i].data.rd,
|
||||
commit_if[i].data.tensor, commit_if[i].data.sop,
|
||||
commit_if[i].data.eop, commit_fire[i], commit_if[i].data.uuid));
|
||||
end
|
||||
if ((CORE_ID == 0)
|
||||
&& writeback_if[i].valid
|
||||
&& ((writeback_if[i].data.PC == 32'h80000010) || (writeback_if[i].data.PC == 32'h80000014))) begin
|
||||
`TRACE(1, ("%d: core%0d-writeback-out: isw=%0d, valid=%b, wid=%0d, wis=%0d, PC=0x%0h, rd=%0d, tensor=%b, sop=%b, eop=%b (#%0d)\n",
|
||||
$time, CORE_ID, i, writeback_if[i].valid,
|
||||
wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.wis,
|
||||
writeback_if[i].data.PC, writeback_if[i].data.rd,
|
||||
writeback_if[i].data.tensor, writeback_if[i].data.sop,
|
||||
writeback_if[i].data.eop, writeback_if[i].data.uuid));
|
||||
end
|
||||
if (alu_commit_if[i].valid && alu_commit_if[i].ready) begin
|
||||
`TRACE(1, ("%d: core%0d-commit: wid=%0d, PC=0x%0h, ex=ALU, tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", $time, CORE_ID, alu_commit_if[i].data.wid, alu_commit_if[i].data.PC, alu_commit_if[i].data.tmask, alu_commit_if[i].data.wb, alu_commit_if[i].data.rd, alu_commit_if[i].data.sop, alu_commit_if[i].data.eop));
|
||||
`TRACE_ARRAY1D(1, alu_commit_if[i].data.data, `NUM_THREADS);
|
||||
@@ -259,6 +279,16 @@ module VX_commit import VX_gpu_pkg::*; #(
|
||||
end
|
||||
end
|
||||
end
|
||||
always @(posedge clk) begin
|
||||
if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)) begin
|
||||
for (integer i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
if (commit_sched_if.committed[i] && (commit_sched_if.committed_wid[i] == `NW_WIDTH'(0))) begin
|
||||
`TRACE(1, ("%d: core%0d-commit-sched-out: isw=%0d, committed=%b, wid=%0d\n",
|
||||
$time, CORE_ID, i, commit_sched_if.committed[i], commit_sched_if.committed_wid[i]));
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
||||
@@ -17,10 +17,11 @@
|
||||
`include "VX_fpu_define.vh"
|
||||
`endif
|
||||
|
||||
module VX_core import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0,
|
||||
parameter TENSOR_FP16 = 0
|
||||
) (
|
||||
module VX_core import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0,
|
||||
parameter TENSOR_FP16 = 0,
|
||||
parameter NUM_TENSOR_CORES = `NUM_TENSOR_WARPS
|
||||
) (
|
||||
`SCOPE_IO_DECL
|
||||
|
||||
// Clock
|
||||
@@ -39,8 +40,21 @@ module VX_core import VX_gpu_pkg::*; #(
|
||||
|
||||
VX_mem_bus_if.master icache_bus_if,
|
||||
|
||||
VX_tc_bus_if.master tensor_smem_A_if,
|
||||
VX_tc_bus_if.master tensor_smem_B_if,
|
||||
VX_tc_bus_if.master tensor_smem_A_if[NUM_TENSOR_CORES],
|
||||
output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_A_ren,
|
||||
input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_A_rready,
|
||||
output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_A_raddr,
|
||||
input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_A_rdata,
|
||||
output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_ren,
|
||||
input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_rready,
|
||||
output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_C_raddr,
|
||||
input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_rdata,
|
||||
output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_wen,
|
||||
input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_wready,
|
||||
output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_C_waddr,
|
||||
output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_wdata,
|
||||
output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN/8-1:0] tensor_tmem_C_mask,
|
||||
VX_tc_bus_if.master tensor_smem_B_if[NUM_TENSOR_CORES],
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
VX_gbar_bus_if.master gbar_bus_if,
|
||||
@@ -57,18 +71,26 @@ module VX_core import VX_gpu_pkg::*; #(
|
||||
input wire [31:0] acc_read_in,
|
||||
output wire [31:0] acc_write_out,
|
||||
output wire acc_write_en
|
||||
);
|
||||
VX_schedule_if schedule_if();
|
||||
VX_fetch_if fetch_if();
|
||||
VX_decode_if decode_if();
|
||||
VX_sched_csr_if sched_csr_if();
|
||||
VX_decode_sched_if decode_sched_if();
|
||||
`ifdef EXT_T_HOPPER
|
||||
VX_tc_rf_if tensor_regfile_if();
|
||||
`endif
|
||||
);
|
||||
`WU_CONFIG_STATIC_ASSERTS
|
||||
|
||||
VX_schedule_if scalar_schedule_if();
|
||||
VX_schedule_if tensor_schedule_if();
|
||||
VX_fetch_if scalar_fetch_if();
|
||||
VX_fetch_if tensor_fetch_if();
|
||||
VX_decode_if scalar_decode_if();
|
||||
VX_decode_if tensor_decode_if();
|
||||
VX_sched_csr_if sched_csr_if();
|
||||
VX_decode_sched_if decode_sched_if();
|
||||
VX_decode_sched_if scalar_decode_sched_if();
|
||||
VX_decode_sched_if tensor_decode_sched_if();
|
||||
`ifdef EXT_T_ASYNC
|
||||
VX_tc_rf_if tensor_regfile_if[NUM_TENSOR_CORES]();
|
||||
VX_tc_rf_if scalar_dummy_tensor_regfile_if[NUM_TENSOR_CORES]();
|
||||
`endif
|
||||
VX_commit_sched_if commit_sched_if();
|
||||
VX_commit_csr_if commit_csr_if();
|
||||
VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS]();
|
||||
VX_branch_ctl_if branch_ctl_if[2 * `NUM_ALU_BLOCKS]();
|
||||
VX_warp_ctl_if warp_ctl_if();
|
||||
|
||||
VX_dispatch_if alu_dispatch_if[`ISSUE_WIDTH]();
|
||||
@@ -80,10 +102,28 @@ module VX_core import VX_gpu_pkg::*; #(
|
||||
VX_dispatch_if fpu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_commit_if fpu_commit_if[`ISSUE_WIDTH]();
|
||||
`endif
|
||||
`ifdef EXT_T_ENABLE
|
||||
VX_dispatch_if tensor_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_commit_if tensor_commit_if[`ISSUE_WIDTH]();
|
||||
`endif
|
||||
`ifdef EXT_T_ENABLE
|
||||
VX_dispatch_if tensor_alu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_dispatch_if tensor_lsu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_dispatch_if tensor_ctrl_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_dispatch_if tensor_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_dispatch_if scalar_dummy_tensor_alu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_dispatch_if scalar_dummy_tensor_lsu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_dispatch_if scalar_dummy_tensor_ctrl_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_dispatch_if scalar_dummy_tensor_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_dispatch_if tensor_dummy_alu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_dispatch_if tensor_dummy_lsu_dispatch_if[`ISSUE_WIDTH]();
|
||||
`ifdef EXT_F_ENABLE
|
||||
VX_dispatch_if tensor_dummy_fpu_dispatch_if[`ISSUE_WIDTH]();
|
||||
`endif
|
||||
VX_dispatch_if tensor_dummy_sfu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_commit_if tensor_commit_if[`ISSUE_WIDTH]();
|
||||
wire tensor_csr_unlock_valid;
|
||||
wire [`NW_WIDTH-1:0] tensor_csr_unlock_wid;
|
||||
wire tensor_tmc_valid;
|
||||
wire [`NW_WIDTH-1:0] tensor_tmc_wid;
|
||||
wire [`NUM_THREADS-1:0] tensor_tmc_tmask;
|
||||
`endif
|
||||
VX_dispatch_if sfu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_commit_if sfu_commit_if[`ISSUE_WIDTH]();
|
||||
|
||||
@@ -94,16 +134,62 @@ module VX_core import VX_gpu_pkg::*; #(
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) dcache_bus_tmp_if[DCACHE_NUM_REQS]();
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
VX_pipeline_perf_if pipeline_perf_if();
|
||||
|
||||
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
VX_pipeline_perf_if pipeline_perf_if();
|
||||
VX_pipeline_perf_if scalar_decode_perf_if();
|
||||
VX_pipeline_perf_if tensor_decode_perf_if();
|
||||
VX_pipeline_perf_if scalar_issue_perf_if();
|
||||
VX_pipeline_perf_if tensor_issue_perf_if();
|
||||
|
||||
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
|
||||
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
|
||||
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||
`endif
|
||||
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||
|
||||
assign pipeline_perf_if.illegal_tensor_reg_access = scalar_decode_perf_if.illegal_tensor_reg_access + tensor_decode_perf_if.illegal_tensor_reg_access;
|
||||
assign pipeline_perf_if.illegal_tensor_scalar_op = scalar_decode_perf_if.illegal_tensor_scalar_op + tensor_decode_perf_if.illegal_tensor_scalar_op;
|
||||
assign pipeline_perf_if.illegal_scalar_tensor_op = scalar_decode_perf_if.illegal_scalar_tensor_op + tensor_decode_perf_if.illegal_scalar_tensor_op;
|
||||
assign pipeline_perf_if.ibf_stalls = scalar_issue_perf_if.ibf_stalls + tensor_issue_perf_if.ibf_stalls;
|
||||
assign pipeline_perf_if.scb_stalls = scalar_issue_perf_if.scb_stalls + tensor_issue_perf_if.scb_stalls;
|
||||
assign pipeline_perf_if.scb_any_unit_uses = scalar_issue_perf_if.scb_any_unit_uses + tensor_issue_perf_if.scb_any_unit_uses;
|
||||
assign pipeline_perf_if.scb_fires = scalar_issue_perf_if.scb_fires + tensor_issue_perf_if.scb_fires;
|
||||
assign pipeline_perf_if.scb_any_fire_cycles = scalar_issue_perf_if.scb_any_fire_cycles + tensor_issue_perf_if.scb_any_fire_cycles;
|
||||
assign pipeline_perf_if.dispatch_any_fire_cycles = scalar_issue_perf_if.dispatch_any_fire_cycles + tensor_issue_perf_if.dispatch_any_fire_cycles;
|
||||
for (genvar perf_i = 0; perf_i < `NUM_EX_UNITS; ++perf_i) begin : g_issue_perf_merge
|
||||
assign pipeline_perf_if.units_uses[perf_i] = scalar_issue_perf_if.units_uses[perf_i] + tensor_issue_perf_if.units_uses[perf_i];
|
||||
assign pipeline_perf_if.dispatch_stalls[perf_i] = scalar_issue_perf_if.dispatch_stalls[perf_i] + tensor_issue_perf_if.dispatch_stalls[perf_i];
|
||||
assign pipeline_perf_if.dispatch_valids[perf_i] = scalar_issue_perf_if.dispatch_valids[perf_i] + tensor_issue_perf_if.dispatch_valids[perf_i];
|
||||
assign pipeline_perf_if.dispatch_fires[perf_i] = scalar_issue_perf_if.dispatch_fires[perf_i] + tensor_issue_perf_if.dispatch_fires[perf_i];
|
||||
end
|
||||
for (genvar perf_sfu_i = 0; perf_sfu_i < `NUM_SFU_UNITS; ++perf_sfu_i) begin : g_issue_sfu_perf_merge
|
||||
assign pipeline_perf_if.sfu_uses[perf_sfu_i] = scalar_issue_perf_if.sfu_uses[perf_sfu_i] + tensor_issue_perf_if.sfu_uses[perf_sfu_i];
|
||||
end
|
||||
`endif
|
||||
|
||||
assign decode_sched_if.valid = scalar_decode_sched_if.valid || tensor_decode_sched_if.valid;
|
||||
assign decode_sched_if.is_wstall = tensor_decode_sched_if.valid ? tensor_decode_sched_if.is_wstall : scalar_decode_sched_if.is_wstall;
|
||||
assign decode_sched_if.wid = tensor_decode_sched_if.valid ? tensor_decode_sched_if.wid : scalar_decode_sched_if.wid;
|
||||
`RUNTIME_ASSERT(
|
||||
!(scalar_decode_sched_if.valid && tensor_decode_sched_if.valid),
|
||||
("%t: *** core%0d-two-decode-sched-events-same-cycle", $time, CORE_ID)
|
||||
)
|
||||
|
||||
`ifdef EXT_T_ENABLE
|
||||
for (genvar dummy_i = 0; dummy_i < `ISSUE_WIDTH; ++dummy_i) begin : g_dummy_dispatch_ready
|
||||
assign scalar_dummy_tensor_alu_dispatch_if[dummy_i].ready = 1'b1;
|
||||
assign scalar_dummy_tensor_lsu_dispatch_if[dummy_i].ready = 1'b1;
|
||||
assign scalar_dummy_tensor_ctrl_dispatch_if[dummy_i].ready = 1'b1;
|
||||
assign scalar_dummy_tensor_dispatch_if[dummy_i].ready = 1'b1;
|
||||
assign tensor_dummy_alu_dispatch_if[dummy_i].ready = 1'b1;
|
||||
assign tensor_dummy_lsu_dispatch_if[dummy_i].ready = 1'b1;
|
||||
`ifdef EXT_F_ENABLE
|
||||
assign tensor_dummy_fpu_dispatch_if[dummy_i].ready = 1'b1;
|
||||
`endif
|
||||
assign tensor_dummy_sfu_dispatch_if[dummy_i].ready = 1'b1;
|
||||
end
|
||||
`endif
|
||||
|
||||
`RESET_RELAY (dcr_data_reset, reset);
|
||||
`RESET_RELAY (schedule_reset, reset);
|
||||
@@ -124,9 +210,10 @@ module VX_core import VX_gpu_pkg::*; #(
|
||||
|
||||
`SCOPE_IO_SWITCH (3)
|
||||
|
||||
VX_schedule #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) schedule (
|
||||
VX_schedule #(
|
||||
.CORE_ID (CORE_ID),
|
||||
.NUM_BRANCHES (2 * `NUM_ALU_BLOCKS)
|
||||
) schedule (
|
||||
.clk (clk),
|
||||
.reset (schedule_reset),
|
||||
|
||||
@@ -136,12 +223,20 @@ module VX_core import VX_gpu_pkg::*; #(
|
||||
|
||||
.base_dcrs (base_dcrs),
|
||||
|
||||
.warp_ctl_if (warp_ctl_if),
|
||||
.branch_ctl_if (branch_ctl_if),
|
||||
.decode_sched_if(decode_sched_if),
|
||||
.warp_ctl_if (warp_ctl_if),
|
||||
.branch_ctl_if (branch_ctl_if),
|
||||
`ifdef EXT_T_ENABLE
|
||||
.tensor_csr_unlock_valid(tensor_csr_unlock_valid),
|
||||
.tensor_csr_unlock_wid(tensor_csr_unlock_wid),
|
||||
.tensor_tmc_valid(tensor_tmc_valid),
|
||||
.tensor_tmc_wid(tensor_tmc_wid),
|
||||
.tensor_tmc_tmask(tensor_tmc_tmask),
|
||||
`endif
|
||||
.decode_sched_if(decode_sched_if),
|
||||
.commit_sched_if(commit_sched_if),
|
||||
|
||||
.schedule_if (schedule_if),
|
||||
.scalar_schedule_if (scalar_schedule_if),
|
||||
.tensor_schedule_if (tensor_schedule_if),
|
||||
`ifdef GBAR_ENABLE
|
||||
.gbar_bus_if (gbar_bus_if),
|
||||
`endif
|
||||
@@ -156,54 +251,114 @@ module VX_core import VX_gpu_pkg::*; #(
|
||||
`SCOPE_IO_BIND (0)
|
||||
.clk (clk),
|
||||
.reset (fetch_reset),
|
||||
.icache_bus_if (icache_bus_if),
|
||||
.schedule_if (schedule_if),
|
||||
.fetch_if (fetch_if)
|
||||
);
|
||||
|
||||
VX_decode #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) decode (
|
||||
.clk (clk),
|
||||
.reset (decode_reset),
|
||||
.fetch_if (fetch_if),
|
||||
.decode_if (decode_if),
|
||||
.decode_sched_if(decode_sched_if)
|
||||
);
|
||||
|
||||
VX_issue #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) issue (
|
||||
`SCOPE_IO_BIND (1)
|
||||
|
||||
.clk (clk),
|
||||
.reset (issue_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_issue_if (pipeline_perf_if.issue),
|
||||
`endif
|
||||
|
||||
.decode_if (decode_if),
|
||||
.writeback_if (writeback_if),
|
||||
|
||||
.alu_dispatch_if(alu_dispatch_if),
|
||||
.lsu_dispatch_if(lsu_dispatch_if),
|
||||
`ifdef EXT_F_ENABLE
|
||||
.icache_bus_if (icache_bus_if),
|
||||
.scalar_schedule_if (scalar_schedule_if),
|
||||
.tensor_schedule_if (tensor_schedule_if),
|
||||
.scalar_fetch_if(scalar_fetch_if),
|
||||
.tensor_fetch_if(tensor_fetch_if)
|
||||
);
|
||||
|
||||
VX_decode #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) scalar_decode (
|
||||
.clk (clk),
|
||||
.reset (decode_reset),
|
||||
.fetch_if (scalar_fetch_if),
|
||||
.decode_if (scalar_decode_if),
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_decode_if (scalar_decode_perf_if.decode),
|
||||
`endif
|
||||
.decode_sched_if(scalar_decode_sched_if)
|
||||
);
|
||||
|
||||
VX_decode #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) tensor_decode (
|
||||
.clk (clk),
|
||||
.reset (decode_reset),
|
||||
.fetch_if (tensor_fetch_if),
|
||||
.decode_if (tensor_decode_if),
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_decode_if (tensor_decode_perf_if.decode),
|
||||
`endif
|
||||
.decode_sched_if(tensor_decode_sched_if)
|
||||
);
|
||||
|
||||
VX_issue #(
|
||||
.CORE_ID (CORE_ID),
|
||||
.NUM_TENSOR_CORES (NUM_TENSOR_CORES),
|
||||
.DOMAIN (WU_DOMAIN_SCALAR)
|
||||
) scalar_issue (
|
||||
`SCOPE_IO_BIND (1)
|
||||
|
||||
.clk (clk),
|
||||
.reset (issue_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_issue_if (scalar_issue_perf_if.issue),
|
||||
`endif
|
||||
|
||||
.decode_if (scalar_decode_if),
|
||||
.writeback_if (writeback_if),
|
||||
|
||||
.alu_dispatch_if(alu_dispatch_if),
|
||||
.lsu_dispatch_if(lsu_dispatch_if),
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_dispatch_if(fpu_dispatch_if),
|
||||
`endif
|
||||
`ifdef EXT_T_ENABLE
|
||||
.tensor_dispatch_if(tensor_dispatch_if),
|
||||
`ifdef EXT_T_HOPPER
|
||||
.tensor_regfile_if (tensor_regfile_if),
|
||||
`endif
|
||||
`endif
|
||||
.sfu_dispatch_if(sfu_dispatch_if)
|
||||
);
|
||||
`ifdef EXT_T_ENABLE
|
||||
.tensor_alu_dispatch_if(scalar_dummy_tensor_alu_dispatch_if),
|
||||
.tensor_lsu_dispatch_if(scalar_dummy_tensor_lsu_dispatch_if),
|
||||
.tensor_ctrl_dispatch_if(scalar_dummy_tensor_ctrl_dispatch_if),
|
||||
.tensor_dispatch_if(scalar_dummy_tensor_dispatch_if),
|
||||
`ifdef EXT_T_ASYNC
|
||||
.tensor_regfile_if (scalar_dummy_tensor_regfile_if),
|
||||
`endif
|
||||
`endif
|
||||
.sfu_dispatch_if(sfu_dispatch_if)
|
||||
);
|
||||
|
||||
`ifdef EXT_T_ENABLE
|
||||
VX_issue #(
|
||||
.CORE_ID (CORE_ID),
|
||||
.NUM_TENSOR_CORES (NUM_TENSOR_CORES),
|
||||
.DOMAIN (WU_DOMAIN_TENSOR)
|
||||
) tensor_issue (
|
||||
`SCOPE_IO_BIND (1)
|
||||
|
||||
.clk (clk),
|
||||
.reset (issue_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_issue_if (tensor_issue_perf_if.issue),
|
||||
`endif
|
||||
|
||||
.decode_if (tensor_decode_if),
|
||||
.writeback_if (writeback_if),
|
||||
|
||||
.alu_dispatch_if(tensor_dummy_alu_dispatch_if),
|
||||
.lsu_dispatch_if(tensor_dummy_lsu_dispatch_if),
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_dispatch_if(tensor_dummy_fpu_dispatch_if),
|
||||
`endif
|
||||
.tensor_alu_dispatch_if(tensor_alu_dispatch_if),
|
||||
.tensor_lsu_dispatch_if(tensor_lsu_dispatch_if),
|
||||
.tensor_ctrl_dispatch_if(tensor_ctrl_dispatch_if),
|
||||
.tensor_dispatch_if(tensor_dispatch_if),
|
||||
`ifdef EXT_T_ASYNC
|
||||
.tensor_regfile_if (tensor_regfile_if),
|
||||
`endif
|
||||
.sfu_dispatch_if(tensor_dummy_sfu_dispatch_if)
|
||||
);
|
||||
`else
|
||||
assign tensor_decode_if.ready = 1'b1;
|
||||
`endif
|
||||
|
||||
VX_execute #(
|
||||
.CORE_ID (CORE_ID),
|
||||
.TENSOR_FP16 (TENSOR_FP16)
|
||||
) execute (
|
||||
.TENSOR_FP16 (TENSOR_FP16),
|
||||
.NUM_TENSOR_CORES (NUM_TENSOR_CORES)
|
||||
) execute (
|
||||
`SCOPE_IO_BIND (2)
|
||||
|
||||
.clk (clk),
|
||||
@@ -212,10 +367,15 @@ module VX_core import VX_gpu_pkg::*; #(
|
||||
.base_dcrs (base_dcrs),
|
||||
.downstream_mem_busy(downstream_mem_busy),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_tmp_if),
|
||||
.pipeline_perf_if(pipeline_perf_if),
|
||||
`endif
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_tmp_if),
|
||||
.pipeline_perf_if(pipeline_perf_if),
|
||||
.perf_scalar_lsu_reqs(pipeline_perf_if.scalar_lsu_reqs),
|
||||
.perf_tensor_lsu_reqs(pipeline_perf_if.tensor_lsu_reqs),
|
||||
.perf_scalar_lsu_stalls(pipeline_perf_if.scalar_lsu_stalls),
|
||||
.perf_tensor_lsu_stalls(pipeline_perf_if.tensor_lsu_stalls),
|
||||
.perf_mem_merge_stalls(pipeline_perf_if.mem_merge_stalls),
|
||||
`endif
|
||||
|
||||
.dcache_bus_if (dcache_bus_tmp_if),
|
||||
|
||||
@@ -223,12 +383,33 @@ module VX_core import VX_gpu_pkg::*; #(
|
||||
.fpu_dispatch_if(fpu_dispatch_if),
|
||||
.fpu_commit_if (fpu_commit_if),
|
||||
`endif
|
||||
`ifdef EXT_T_ENABLE
|
||||
.tensor_dispatch_if (tensor_dispatch_if),
|
||||
.tensor_commit_if (tensor_commit_if),
|
||||
`ifdef EXT_T_HOPPER
|
||||
`ifdef EXT_T_ENABLE
|
||||
.tensor_alu_dispatch_if (tensor_alu_dispatch_if),
|
||||
.tensor_lsu_dispatch_if (tensor_lsu_dispatch_if),
|
||||
.tensor_ctrl_dispatch_if (tensor_ctrl_dispatch_if),
|
||||
.tensor_dispatch_if (tensor_dispatch_if),
|
||||
.tensor_commit_if (tensor_commit_if),
|
||||
.tensor_csr_unlock_valid(tensor_csr_unlock_valid),
|
||||
.tensor_csr_unlock_wid(tensor_csr_unlock_wid),
|
||||
.tensor_tmc_valid(tensor_tmc_valid),
|
||||
.tensor_tmc_wid(tensor_tmc_wid),
|
||||
.tensor_tmc_tmask(tensor_tmc_tmask),
|
||||
`ifdef EXT_T_ASYNC
|
||||
.tensor_regfile_if (tensor_regfile_if),
|
||||
.tensor_smem_A_if (tensor_smem_A_if),
|
||||
.tensor_tmem_A_ren (tensor_tmem_A_ren),
|
||||
.tensor_tmem_A_rready(tensor_tmem_A_rready),
|
||||
.tensor_tmem_A_raddr(tensor_tmem_A_raddr),
|
||||
.tensor_tmem_A_rdata(tensor_tmem_A_rdata),
|
||||
.tensor_tmem_C_ren (tensor_tmem_C_ren),
|
||||
.tensor_tmem_C_rready(tensor_tmem_C_rready),
|
||||
.tensor_tmem_C_raddr(tensor_tmem_C_raddr),
|
||||
.tensor_tmem_C_rdata(tensor_tmem_C_rdata),
|
||||
.tensor_tmem_C_wen (tensor_tmem_C_wen),
|
||||
.tensor_tmem_C_wready(tensor_tmem_C_wready),
|
||||
.tensor_tmem_C_waddr(tensor_tmem_C_waddr),
|
||||
.tensor_tmem_C_wdata(tensor_tmem_C_wdata),
|
||||
.tensor_tmem_C_mask(tensor_tmem_C_mask),
|
||||
.tensor_smem_B_if (tensor_smem_B_if),
|
||||
`endif
|
||||
`endif
|
||||
@@ -485,10 +666,17 @@ module VX_core import VX_gpu_pkg::*; #(
|
||||
pipeline_perf_if.sched_barrier_idles, `NUM_WARPS);
|
||||
// sched_stalls can happen when the later issue stage stalls,
|
||||
// causing the ibuffer to clog.
|
||||
$display("scheduler stalls: %d cycles (%.2f%%)", pipeline_perf_if.sched_stalls,
|
||||
$itor(pipeline_perf_if.sched_stalls) / $itor(cycles) * 100.0);
|
||||
$display("decode stalls (ibuffer not ready): %d cycles (%.2f%%)",pipeline_perf_if.ibf_stalls,
|
||||
$itor(pipeline_perf_if.ibf_stalls) / $itor(cycles) * 100.0);
|
||||
$display("scheduler stalls: %d cycles (%.2f%%)", pipeline_perf_if.sched_stalls,
|
||||
$itor(pipeline_perf_if.sched_stalls) / $itor(cycles) * 100.0);
|
||||
$display("wu scalar scheduler ready cycles: %d", pipeline_perf_if.scalar_sched_ready_cycles);
|
||||
$display("wu tensor scheduler ready cycles: %d", pipeline_perf_if.tensor_sched_ready_cycles);
|
||||
$display("wu scalar scheduler issued cycles: %d", pipeline_perf_if.scalar_sched_issued_cycles);
|
||||
$display("wu tensor scheduler issued cycles: %d", pipeline_perf_if.tensor_sched_issued_cycles);
|
||||
$display("wu illegal tensor reg accesses: %d", pipeline_perf_if.illegal_tensor_reg_access);
|
||||
$display("wu illegal tensor scalar ops: %d", pipeline_perf_if.illegal_tensor_scalar_op);
|
||||
$display("wu illegal scalar tensor ops: %d", pipeline_perf_if.illegal_scalar_tensor_op);
|
||||
$display("decode stalls (ibuffer not ready): %d cycles (%.2f%%)",pipeline_perf_if.ibf_stalls,
|
||||
$itor(pipeline_perf_if.ibf_stalls) / $itor(cycles) * 100.0);
|
||||
// see VX_scoreboard.sv
|
||||
// scb_stalls: valid & ~ready (ready = stg_ready_in && operands_ready)
|
||||
// units_uses: valid & ~operands_ready
|
||||
@@ -545,12 +733,17 @@ module VX_core import VX_gpu_pkg::*; #(
|
||||
$display("ifetches: %d", perf_ifetches);
|
||||
$display("ifetch latency: %f cycles",
|
||||
$itor(icache_lat) / $itor(ifetches));
|
||||
$display("dcache loads: %d", perf_loads);
|
||||
$display("dcache load latency: %f cycles",
|
||||
$itor(dcache_lat) / $itor(loads));
|
||||
$display("dcache stores: %d", perf_stores);
|
||||
end
|
||||
end
|
||||
$display("dcache loads: %d", perf_loads);
|
||||
$display("dcache load latency: %f cycles",
|
||||
$itor(dcache_lat) / $itor(loads));
|
||||
$display("dcache stores: %d", perf_stores);
|
||||
$display("wu scalar lsu accepted requests: %d", pipeline_perf_if.scalar_lsu_reqs);
|
||||
$display("wu tensor lsu accepted requests: %d", pipeline_perf_if.tensor_lsu_reqs);
|
||||
$display("wu scalar lsu merge stalls: %d", pipeline_perf_if.scalar_lsu_stalls);
|
||||
$display("wu tensor lsu merge stalls: %d", pipeline_perf_if.tensor_lsu_stalls);
|
||||
$display("wu memory merge contention cycles: %d", pipeline_perf_if.mem_merge_stalls);
|
||||
end
|
||||
end
|
||||
|
||||
`endif
|
||||
|
||||
|
||||
@@ -163,6 +163,7 @@ import VX_fpu_pkg::*;
|
||||
`endif
|
||||
`VX_CSR_WARP_ID : read_data_ro_r = 32'(read_wid);
|
||||
`VX_CSR_CORE_ID : read_data_ro_r = 32'(CORE_ID);
|
||||
`VX_CSR_GCID : read_data_ro_r = 32'(CORE_ID);
|
||||
`VX_CSR_THREAD_MASK: read_data_ro_r = 32'(thread_masks[read_wid]);
|
||||
`VX_CSR_WARP_MASK : read_data_ro_r = 32'(active_warps);
|
||||
`VX_CSR_NUM_THREADS: read_data_ro_r = 32'(`NUM_THREADS);
|
||||
|
||||
@@ -163,6 +163,22 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
||||
assign sched_csr_if.unlock_warp = csr_req_valid && csr_req_ready && execute_if.data.eop;
|
||||
assign sched_csr_if.unlock_wid = execute_if.data.wid;
|
||||
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
|
||||
always @(posedge clk) begin
|
||||
if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)
|
||||
&& execute_if.valid
|
||||
&& (execute_if.data.PC == 32'h80000028)) begin
|
||||
`TRACE(1, ("%d: core%0d-csr-probe: valid=%b ready=%b req_valid=%b req_ready=%b no_pending=%b wid=%0d PC=0x%0h addr=0x%0h wb=%0d rd=%0d sop=%b eop=%b (#%0d)\n",
|
||||
$time, CORE_ID, execute_if.valid, execute_if.ready,
|
||||
csr_req_valid, csr_req_ready, no_pending_instr,
|
||||
execute_if.data.wid, execute_if.data.PC, csr_addr,
|
||||
execute_if.data.wb, execute_if.data.rd,
|
||||
execute_if.data.sop, execute_if.data.eop,
|
||||
execute_if.data.uuid));
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
// send response
|
||||
wire [NUM_LANES-1:0][31:0] csr_commit_data;
|
||||
|
||||
|
||||
@@ -39,14 +39,15 @@ module VX_decode #(
|
||||
|
||||
// outputs
|
||||
VX_decode_if.master decode_if,
|
||||
`ifdef PERF_ENABLE
|
||||
VX_pipeline_perf_if.decode perf_decode_if,
|
||||
`endif
|
||||
VX_decode_sched_if.master decode_sched_if
|
||||
);
|
||||
|
||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + (`NR_BITS * 4) + `XLEN + 1 + 1;
|
||||
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
`UNUSED_VAR (clk)
|
||||
`UNUSED_VAR (reset)
|
||||
|
||||
reg [`EX_BITS-1:0] ex_type;
|
||||
reg [`INST_OP_BITS-1:0] op_type;
|
||||
@@ -488,6 +489,17 @@ module VX_decode #(
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
end
|
||||
3'h6: begin // WSPAWN_MASK
|
||||
op_type = `INST_OP_BITS'(`INST_SFU_WSPAWN);
|
||||
op_mod[0] = 1;
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
end
|
||||
3'h7: begin // BAR_MASK
|
||||
op_type = `INST_OP_BITS'(`INST_SFU_BAR_MASK);
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
end
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
@@ -555,6 +567,54 @@ module VX_decode #(
|
||||
// B matrix tiles
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
`elsif EXT_T_BLACKWELL
|
||||
ex_type = `EX_TENSOR;
|
||||
case (func3)
|
||||
3'b000: op_type = `INST_TENSOR_BWGMMA;
|
||||
3'b001: op_type = `INST_TENSOR_BWGMMA_WAIT;
|
||||
3'b010: op_type = `INST_TENSOR_TCGEN05_CP;
|
||||
3'b011: op_type = `INST_TENSOR_TCGEN05_CP_WAIT;
|
||||
3'b100: op_type = `INST_TENSOR_TCGEN05_LD;
|
||||
3'b101: op_type = `INST_TENSOR_TCGEN05_ST;
|
||||
3'b110: op_type = `INST_TENSOR_TCGEN05_CB;
|
||||
default: ;
|
||||
endcase
|
||||
case (func3)
|
||||
3'b000: begin // BWGMMA: rs1=tmem_a, rs2=smem_b, rd field=tmem_c source
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
`ifdef EXT_F_ENABLE
|
||||
rs3_r = {1'b0, rd};
|
||||
`else
|
||||
rs3_r = rd;
|
||||
`endif
|
||||
use_rs3 = 1;
|
||||
end
|
||||
3'b010, 3'b110: begin // TCGEN05_CP/CB: rs1=tmem, rs2=global memory address
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
end
|
||||
3'b100: begin // TCGEN05_LD: rs1=tmem, rd=FP destination
|
||||
`USED_IREG (rs1);
|
||||
`ifdef EXT_F_ENABLE
|
||||
`USED_FREG (rd);
|
||||
`else
|
||||
`USED_IREG (rd);
|
||||
`endif
|
||||
end
|
||||
3'b101: begin // TCGEN05_ST: rs1=tmem, rd field=FP source
|
||||
`USED_IREG (rs1);
|
||||
`ifdef EXT_F_ENABLE
|
||||
rd_r = {1'b1, rd};
|
||||
rs3_r = {1'b1, rd};
|
||||
`else
|
||||
rd_r = rd;
|
||||
rs3_r = rd;
|
||||
`endif
|
||||
use_rs3 = 1;
|
||||
end
|
||||
default: ;
|
||||
endcase
|
||||
`else
|
||||
ex_type = `EX_TENSOR;
|
||||
op_type = `INST_TENSOR_HMMA;
|
||||
@@ -572,8 +632,93 @@ module VX_decode #(
|
||||
endcase
|
||||
end
|
||||
|
||||
wire fetch_is_tensor_warp = fetch_if.data.wid >= `NW_WIDTH'(`NUM_SCALAR_WARPS);
|
||||
wire fetch_is_scalar_warp = fetch_if.data.wid < `NW_WIDTH'(`NUM_SCALAR_WARPS);
|
||||
wire fetch_fire = fetch_if.valid && fetch_if.ready;
|
||||
wire decoded_tensor_inst = (ex_type == `EX_BITS'(`EX_TENSOR));
|
||||
wire t_reg_hi_rd = use_rd && (rd_r[`NRI_BITS-1:3] != '0);
|
||||
wire t_reg_hi_rs1 = use_rs1 && (rs1_r[`NRI_BITS-1:3] != '0);
|
||||
wire t_reg_hi_rs2 = use_rs2 && (rs2_r[`NRI_BITS-1:3] != '0);
|
||||
wire t_reg_hi_rs3 = use_rs3 && (rs3_r[`NRI_BITS-1:3] != '0);
|
||||
wire tensor_reg_illegal = fetch_is_tensor_warp && (t_reg_hi_rd || t_reg_hi_rs1 || t_reg_hi_rs2 || t_reg_hi_rs3);
|
||||
wire scalar_tensor_illegal = fetch_is_scalar_warp && decoded_tensor_inst;
|
||||
wire tensor_fpu_illegal = fetch_is_tensor_warp && (ex_type == `EX_BITS'(`EX_FPU));
|
||||
wire tensor_read_csr_allowed = (op_type == `INST_OP_BITS'(`INST_SFU_CSRRS))
|
||||
&& (rs1_r == `NR_BITS'(0))
|
||||
&& ((u_12 == `VX_CSR_THREAD_ID)
|
||||
|| (u_12 == `VX_CSR_WARP_ID)
|
||||
|| (u_12 == `VX_CSR_CORE_ID)
|
||||
|| (u_12 == `VX_CSR_MHARTID)
|
||||
|| (u_12 == `VX_CSR_NUM_THREADS)
|
||||
|| (u_12 == `VX_CSR_NUM_WARPS)
|
||||
|| (u_12 == `VX_CSR_NUM_CORES));
|
||||
wire tensor_sfu_barrier_allowed = (op_type == `INST_OP_BITS'(`INST_SFU_BAR))
|
||||
|| (op_type == `INST_OP_BITS'(`INST_SFU_BAR_MASK));
|
||||
wire tensor_sfu_allowed = (op_type == `INST_OP_BITS'(`INST_SFU_TMC))
|
||||
|| tensor_read_csr_allowed
|
||||
|| tensor_sfu_barrier_allowed;
|
||||
wire tensor_sfu_illegal = fetch_is_tensor_warp
|
||||
&& (ex_type == `EX_BITS'(`EX_SFU))
|
||||
&& !tensor_sfu_allowed;
|
||||
wire tensor_complex_alu_illegal = fetch_is_tensor_warp
|
||||
&& (ex_type == `EX_BITS'(`EX_ALU))
|
||||
&& (`INST_ALU_IS_M(op_mod) || `INST_ALU_IS_RED(op_mod));
|
||||
wire tensor_scalar_illegal = tensor_fpu_illegal || tensor_sfu_illegal || tensor_complex_alu_illegal;
|
||||
wire decode_illegal = tensor_reg_illegal || scalar_tensor_illegal || tensor_scalar_illegal;
|
||||
|
||||
wire [`EX_BITS-1:0] emit_ex_type = decode_illegal ? `EX_BITS'(`EX_ALU) : ex_type;
|
||||
wire [`INST_OP_BITS-1:0] emit_op_type = decode_illegal ? `INST_OP_BITS'(`INST_BR_EBREAK) : op_type;
|
||||
wire [`INST_MOD_BITS-1:0] emit_op_mod = decode_illegal ? `INST_MOD_BITS'(1) : op_mod;
|
||||
wire emit_use_PC = decode_illegal ? 1'b1 : use_PC;
|
||||
wire emit_use_imm = decode_illegal ? 1'b1 : use_imm;
|
||||
wire [`XLEN-1:0] emit_imm = decode_illegal ? `XLEN'(0) : imm;
|
||||
wire [`NR_BITS-1:0] emit_rd = decode_illegal ? `NR_BITS'(0) : rd_r;
|
||||
wire [`NR_BITS-1:0] emit_rs1 = decode_illegal ? `NR_BITS'(0) : rs1_r;
|
||||
wire [`NR_BITS-1:0] emit_rs2 = decode_illegal ? `NR_BITS'(0) : rs2_r;
|
||||
wire [`NR_BITS-1:0] emit_rs3 = decode_illegal ? `NR_BITS'(0) : rs3_r;
|
||||
|
||||
`RUNTIME_ASSERT(
|
||||
!fetch_if.valid || !tensor_reg_illegal,
|
||||
("%t: *** core%0d-decode-illegal-tensor-reg: wid=%0d PC=0x%0h instr=0x%0h ex=%0d op=%0d rd=%0d rs1=%0d rs2=%0d rs3=%0d",
|
||||
$time, CORE_ID, fetch_if.data.wid, fetch_if.data.PC, fetch_if.data.instr, ex_type, op_type, rd, rs1, rs2, rs3)
|
||||
)
|
||||
|
||||
`RUNTIME_ASSERT(
|
||||
!fetch_if.valid || !scalar_tensor_illegal,
|
||||
("%t: *** core%0d-decode-illegal-scalar-tensor-op: wid=%0d PC=0x%0h instr=0x%0h ex=%0d op=%0d",
|
||||
$time, CORE_ID, fetch_if.data.wid, fetch_if.data.PC, fetch_if.data.instr, ex_type, op_type)
|
||||
)
|
||||
|
||||
`RUNTIME_ASSERT(
|
||||
!fetch_if.valid || !tensor_scalar_illegal,
|
||||
("%t: *** core%0d-decode-illegal-tensor-scalar-op: wid=%0d PC=0x%0h instr=0x%0h ex=%0d op=%0d mod=%0d",
|
||||
$time, CORE_ID, fetch_if.data.wid, fetch_if.data.PC, fetch_if.data.instr, ex_type, op_type, op_mod)
|
||||
)
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`PERF_CTR_BITS-1:0] perf_illegal_tensor_reg_access;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_illegal_tensor_scalar_op;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_illegal_scalar_tensor_op;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_illegal_tensor_reg_access <= '0;
|
||||
perf_illegal_tensor_scalar_op <= '0;
|
||||
perf_illegal_scalar_tensor_op <= '0;
|
||||
end else if (fetch_fire) begin
|
||||
perf_illegal_tensor_reg_access <= perf_illegal_tensor_reg_access + `PERF_CTR_BITS'(tensor_reg_illegal);
|
||||
perf_illegal_tensor_scalar_op <= perf_illegal_tensor_scalar_op + `PERF_CTR_BITS'(tensor_scalar_illegal);
|
||||
perf_illegal_scalar_tensor_op <= perf_illegal_scalar_tensor_op + `PERF_CTR_BITS'(scalar_tensor_illegal);
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_decode_if.illegal_tensor_reg_access = perf_illegal_tensor_reg_access;
|
||||
assign perf_decode_if.illegal_tensor_scalar_op = perf_illegal_tensor_scalar_op;
|
||||
assign perf_decode_if.illegal_scalar_tensor_op = perf_illegal_scalar_tensor_op;
|
||||
`endif
|
||||
|
||||
// disable write to integer register r0
|
||||
wire wb = use_rd && (rd_r != 0);
|
||||
wire wb = !decode_illegal && use_rd && (rd_r != 0);
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
@@ -583,7 +728,7 @@ module VX_decode #(
|
||||
.reset (reset),
|
||||
.valid_in (fetch_if.valid),
|
||||
.ready_in (fetch_if.ready),
|
||||
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_mod, use_PC, imm, use_imm, wb, rd_r, rs1_r, rs2_r, rs3_r}),
|
||||
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, emit_ex_type, emit_op_type, emit_op_mod, emit_use_PC, emit_imm, emit_use_imm, wb, emit_rd, emit_rs1, emit_rs2, emit_rs3}),
|
||||
.data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_mod, decode_if.data.use_PC, decode_if.data.imm, decode_if.data.use_imm, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3}),
|
||||
.valid_out (decode_if.valid),
|
||||
.ready_out (decode_if.ready)
|
||||
@@ -591,11 +736,9 @@ module VX_decode #(
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire fetch_fire = fetch_if.valid && fetch_if.ready;
|
||||
|
||||
assign decode_sched_if.valid = fetch_fire;
|
||||
assign decode_sched_if.wid = fetch_if.data.wid;
|
||||
assign decode_sched_if.is_wstall = is_wstall;
|
||||
assign decode_sched_if.is_wstall = is_wstall || decode_illegal;
|
||||
`ifndef L1_ENABLE
|
||||
assign fetch_if.ibuf_pop = decode_if.ibuf_pop;
|
||||
`endif
|
||||
|
||||
@@ -15,7 +15,8 @@
|
||||
`include "VX_trace.vh"
|
||||
|
||||
module VX_dispatch import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0
|
||||
parameter CORE_ID = 0,
|
||||
parameter DOMAIN = WU_DOMAIN_SCALAR
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
@@ -36,11 +37,15 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
||||
VX_dispatch_if.master fpu_dispatch_if [`ISSUE_WIDTH],
|
||||
`endif
|
||||
`ifdef EXT_T_ENABLE
|
||||
VX_dispatch_if.master tensor_alu_dispatch_if [`ISSUE_WIDTH],
|
||||
VX_dispatch_if.master tensor_lsu_dispatch_if [`ISSUE_WIDTH],
|
||||
VX_dispatch_if.master tensor_ctrl_dispatch_if [`ISSUE_WIDTH],
|
||||
VX_dispatch_if.master tensor_dispatch_if [`ISSUE_WIDTH],
|
||||
`endif
|
||||
VX_dispatch_if.master sfu_dispatch_if [`ISSUE_WIDTH]
|
||||
);
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
`UNUSED_PARAM (DOMAIN)
|
||||
|
||||
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + (3 * `NUM_THREADS * `XLEN) + `NT_WIDTH;
|
||||
|
||||
@@ -68,8 +73,29 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
||||
|
||||
VX_operands_if alu_operands_if[`ISSUE_WIDTH]();
|
||||
|
||||
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] operands_wid;
|
||||
wire [`ISSUE_WIDTH-1:0] operands_is_tensor;
|
||||
wire [`ISSUE_WIDTH-1:0] tensor_alu_allowed;
|
||||
wire [`ISSUE_WIDTH-1:0] tensor_ctrl_allowed;
|
||||
wire [`ISSUE_WIDTH-1:0] tensor_wctl_allowed;
|
||||
wire [`ISSUE_WIDTH-1:0] tensor_sfu_allowed;
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
assign alu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_ALU);
|
||||
assign operands_wid[i] = wis_to_wid(operands_if[i].data.wis, ISSUE_ISW_W'(i));
|
||||
assign operands_is_tensor[i] = operands_wid[i] >= `NW_WIDTH'(`NUM_SCALAR_WARPS);
|
||||
assign tensor_alu_allowed[i] = !`INST_ALU_IS_M(operands_if[i].data.op_mod)
|
||||
&& !`INST_ALU_IS_RED(operands_if[i].data.op_mod);
|
||||
assign tensor_ctrl_allowed[i] = (operands_if[i].data.op_type == `INST_SFU_TMC)
|
||||
|| (operands_if[i].data.op_type == `INST_SFU_CSRRS)
|
||||
|| (operands_if[i].data.op_type == `INST_SFU_BAR)
|
||||
|| (operands_if[i].data.op_type == `INST_SFU_BAR_MASK);
|
||||
assign tensor_wctl_allowed[i] = (operands_if[i].data.op_type == `INST_SFU_BAR)
|
||||
|| (operands_if[i].data.op_type == `INST_SFU_BAR_MASK);
|
||||
assign tensor_sfu_allowed[i] = tensor_ctrl_allowed[i] || tensor_wctl_allowed[i];
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
assign alu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_ALU) && !operands_is_tensor[i];
|
||||
assign alu_operands_if[i].data = operands_if[i].data;
|
||||
|
||||
`RESET_RELAY (alu_reset, reset);
|
||||
@@ -90,12 +116,43 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
||||
);
|
||||
end
|
||||
|
||||
`ifdef EXT_T_ENABLE
|
||||
// Tensor INT/control dispatch
|
||||
|
||||
VX_operands_if tensor_alu_operands_if[`ISSUE_WIDTH]();
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
assign tensor_alu_operands_if[i].valid = operands_if[i].valid
|
||||
&& (operands_if[i].data.ex_type == `EX_ALU)
|
||||
&& operands_is_tensor[i]
|
||||
&& tensor_alu_allowed[i];
|
||||
assign tensor_alu_operands_if[i].data = operands_if[i].data;
|
||||
|
||||
`RESET_RELAY (tensor_alu_reset, reset);
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (2),
|
||||
.OUT_REG (2)
|
||||
) tensor_alu_buffer (
|
||||
.clk (clk),
|
||||
.reset (tensor_alu_reset),
|
||||
.valid_in (tensor_alu_operands_if[i].valid),
|
||||
.ready_in (tensor_alu_operands_if[i].ready),
|
||||
.data_in (`TO_DISPATCH_DATA(tensor_alu_operands_if[i].data, last_active_tid[i])),
|
||||
.data_out (tensor_alu_dispatch_if[i].data),
|
||||
.valid_out (tensor_alu_dispatch_if[i].valid),
|
||||
.ready_out (tensor_alu_dispatch_if[i].ready)
|
||||
);
|
||||
end
|
||||
`endif
|
||||
|
||||
// LSU dispatch
|
||||
|
||||
VX_operands_if lsu_operands_if[`ISSUE_WIDTH]();
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
assign lsu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_LSU);
|
||||
assign lsu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_LSU) && !operands_is_tensor[i];
|
||||
assign lsu_operands_if[i].data = operands_if[i].data;
|
||||
|
||||
`RESET_RELAY (lsu_reset, reset);
|
||||
@@ -116,6 +173,34 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
||||
);
|
||||
end
|
||||
|
||||
`ifdef EXT_T_ENABLE
|
||||
// Tensor LSU dispatch
|
||||
|
||||
VX_operands_if tensor_lsu_operands_if[`ISSUE_WIDTH]();
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
assign tensor_lsu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_LSU) && operands_is_tensor[i];
|
||||
assign tensor_lsu_operands_if[i].data = operands_if[i].data;
|
||||
|
||||
`RESET_RELAY (tensor_lsu_reset, reset);
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (2),
|
||||
.OUT_REG (2)
|
||||
) tensor_lsu_buffer (
|
||||
.clk (clk),
|
||||
.reset (tensor_lsu_reset),
|
||||
.valid_in (tensor_lsu_operands_if[i].valid),
|
||||
.ready_in (tensor_lsu_operands_if[i].ready),
|
||||
.data_in (`TO_DISPATCH_DATA(tensor_lsu_operands_if[i].data, last_active_tid[i])),
|
||||
.data_out (tensor_lsu_dispatch_if[i].data),
|
||||
.valid_out (tensor_lsu_dispatch_if[i].valid),
|
||||
.ready_out (tensor_lsu_dispatch_if[i].ready)
|
||||
);
|
||||
end
|
||||
`endif
|
||||
|
||||
// FPU dispatch
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
@@ -123,7 +208,7 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
||||
VX_operands_if fpu_operands_if[`ISSUE_WIDTH]();
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
assign fpu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_FPU);
|
||||
assign fpu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_FPU) && !operands_is_tensor[i];
|
||||
assign fpu_operands_if[i].data = operands_if[i].data;
|
||||
|
||||
`RESET_RELAY (fpu_reset, reset);
|
||||
@@ -152,7 +237,9 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
||||
VX_operands_if tensor_operands_if[`ISSUE_WIDTH]();
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
assign tensor_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_TENSOR);
|
||||
assign tensor_operands_if[i].valid = operands_if[i].valid
|
||||
&& (operands_if[i].data.ex_type == `EX_TENSOR)
|
||||
&& operands_is_tensor[i];
|
||||
assign tensor_operands_if[i].data = operands_if[i].data;
|
||||
|
||||
`RESET_RELAY (tensor_reset, reset);
|
||||
@@ -174,12 +261,45 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
||||
end
|
||||
`endif
|
||||
|
||||
`ifdef EXT_T_ENABLE
|
||||
// Tensor control dispatch
|
||||
|
||||
VX_operands_if tensor_ctrl_operands_if[`ISSUE_WIDTH]();
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
assign tensor_ctrl_operands_if[i].valid = operands_if[i].valid
|
||||
&& (operands_if[i].data.ex_type == `EX_SFU)
|
||||
&& operands_is_tensor[i]
|
||||
&& tensor_ctrl_allowed[i];
|
||||
assign tensor_ctrl_operands_if[i].data = operands_if[i].data;
|
||||
|
||||
`RESET_RELAY (tensor_ctrl_reset, reset);
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (2),
|
||||
.OUT_REG (2)
|
||||
) tensor_ctrl_buffer (
|
||||
.clk (clk),
|
||||
.reset (tensor_ctrl_reset),
|
||||
.valid_in (tensor_ctrl_operands_if[i].valid),
|
||||
.ready_in (tensor_ctrl_operands_if[i].ready),
|
||||
.data_in (`TO_DISPATCH_DATA(tensor_ctrl_operands_if[i].data, last_active_tid[i])),
|
||||
.data_out (tensor_ctrl_dispatch_if[i].data),
|
||||
.valid_out (tensor_ctrl_dispatch_if[i].valid),
|
||||
.ready_out (tensor_ctrl_dispatch_if[i].ready)
|
||||
);
|
||||
end
|
||||
`endif
|
||||
|
||||
// SFU dispatch
|
||||
|
||||
VX_operands_if sfu_operands_if[`ISSUE_WIDTH]();
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
assign sfu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_SFU);
|
||||
assign sfu_operands_if[i].valid = operands_if[i].valid
|
||||
&& (operands_if[i].data.ex_type == `EX_SFU)
|
||||
&& !operands_is_tensor[i];
|
||||
assign sfu_operands_if[i].data = operands_if[i].data;
|
||||
|
||||
`RESET_RELAY (sfu_reset, reset);
|
||||
@@ -202,17 +322,46 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
||||
|
||||
// can take next request?
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
assign operands_if[i].ready = (alu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_ALU))
|
||||
|| (lsu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_LSU))
|
||||
`ifdef EXT_F_ENABLE
|
||||
|| (fpu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_FPU))
|
||||
assign operands_if[i].ready = (alu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_ALU) && !operands_is_tensor[i])
|
||||
|| (lsu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_LSU) && !operands_is_tensor[i])
|
||||
`ifdef EXT_F_ENABLE
|
||||
|| (fpu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_FPU) && !operands_is_tensor[i])
|
||||
`endif
|
||||
`ifdef EXT_T_ENABLE
|
||||
|| (tensor_alu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_ALU) && operands_is_tensor[i] && tensor_alu_allowed[i])
|
||||
|| (tensor_lsu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_LSU) && operands_is_tensor[i])
|
||||
|| (tensor_ctrl_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU) && operands_is_tensor[i] && tensor_ctrl_allowed[i])
|
||||
|| (tensor_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_TENSOR) && operands_is_tensor[i])
|
||||
`endif
|
||||
`ifdef EXT_T_ENABLE
|
||||
|| (tensor_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_TENSOR))
|
||||
`endif
|
||||
|| (sfu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU));
|
||||
|| (sfu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU) && !operands_is_tensor[i]);
|
||||
end
|
||||
|
||||
`ifdef SIMULATION
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
`RUNTIME_ASSERT(
|
||||
!(operands_if[i].valid && (operands_if[i].data.ex_type == `EX_TENSOR)) || operands_is_tensor[i],
|
||||
("%t: *** core%0d-dispatch-illegal-scalar-tensor-op: wid=%0d PC=0x%0h op=0x%0h (#%0d)",
|
||||
$time, CORE_ID, operands_wid[i], operands_if[i].data.PC, operands_if[i].data.op_type, operands_if[i].data.uuid)
|
||||
)
|
||||
`RUNTIME_ASSERT(
|
||||
!(operands_if[i].valid && operands_is_tensor[i] && (operands_if[i].data.ex_type == `EX_FPU)),
|
||||
("%t: *** core%0d-dispatch-illegal-tensor-fpu-op: wid=%0d PC=0x%0h op=0x%0h (#%0d)",
|
||||
$time, CORE_ID, operands_wid[i], operands_if[i].data.PC, operands_if[i].data.op_type, operands_if[i].data.uuid)
|
||||
)
|
||||
`RUNTIME_ASSERT(
|
||||
!(operands_if[i].valid && operands_is_tensor[i] && (operands_if[i].data.ex_type == `EX_SFU) && !tensor_sfu_allowed[i]),
|
||||
("%t: *** core%0d-dispatch-illegal-tensor-sfu-op: wid=%0d PC=0x%0h op=0x%0h (#%0d)",
|
||||
$time, CORE_ID, operands_wid[i], operands_if[i].data.PC, operands_if[i].data.op_type, operands_if[i].data.uuid)
|
||||
)
|
||||
`RUNTIME_ASSERT(
|
||||
!(operands_if[i].valid && operands_is_tensor[i] && (operands_if[i].data.ex_type == `EX_ALU)
|
||||
&& (`INST_ALU_IS_M(operands_if[i].data.op_mod) || `INST_ALU_IS_RED(operands_if[i].data.op_mod))),
|
||||
("%t: *** core%0d-dispatch-illegal-tensor-complex-alu-op: wid=%0d PC=0x%0h op=0x%0h mod=0x%0h (#%0d)",
|
||||
$time, CORE_ID, operands_wid[i], operands_if[i].data.PC, operands_if[i].data.op_type, operands_if[i].data.op_mod, operands_if[i].data.uuid)
|
||||
)
|
||||
end
|
||||
`endif
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_stalls_per_cycle_r;
|
||||
wire [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_valids_per_cycle_r;
|
||||
@@ -309,9 +458,21 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
||||
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
|
||||
always @(posedge clk) begin
|
||||
if (!reset && ($time > `TRACE_STARTTIME)) begin
|
||||
if ((CORE_ID == 0)
|
||||
&& alu_dispatch_if[i].valid
|
||||
&& ((alu_dispatch_if[i].data.PC == 32'h80000010) || (alu_dispatch_if[i].data.PC == 32'h80000014))) begin
|
||||
`TRACE(1, ("%d: core%0d-alu-dispatch-buffer: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, op=0x%0h, mod=%0d, wb=%0d, rd=%0d (#%0d)\n",
|
||||
$time, CORE_ID, i, alu_dispatch_if[i].valid, alu_dispatch_if[i].ready,
|
||||
wis_to_wid(alu_dispatch_if[i].data.wis, i), alu_dispatch_if[i].data.PC,
|
||||
alu_dispatch_if[i].data.op_type, alu_dispatch_if[i].data.op_mod,
|
||||
alu_dispatch_if[i].data.wb, alu_dispatch_if[i].data.rd,
|
||||
alu_dispatch_if[i].data.uuid));
|
||||
end
|
||||
if (operands_if[i].valid && operands_if[i].ready) begin
|
||||
`TRACE(1, ("%d: core%0d-issue: wid=%0d, PC=0x%0h, ex=", $time, CORE_ID, wis_to_wid(operands_if[i].data.wis, i), operands_if[i].data.PC));
|
||||
trace_ex_type(1, operands_if[i].data.ex_type);
|
||||
`TRACE(1, (", op="));
|
||||
trace_ex_op(1, operands_if[i].data.ex_type, operands_if[i].data.op_type, operands_if[i].data.op_mod, operands_if[i].data.rd, '0, operands_if[i].data.use_imm, operands_if[i].data.imm);
|
||||
`TRACE(1, (", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if[i].data.op_mod, operands_if[i].data.tmask, operands_if[i].data.wb, operands_if[i].data.rd));
|
||||
`TRACE_ARRAY1D(1, operands_if[i].data.rs1_data, `NUM_THREADS);
|
||||
`TRACE(1, (", rs2_data="));
|
||||
|
||||
@@ -17,6 +17,7 @@ module VX_dispatch_unit_sane import VX_gpu_pkg::*; #(
|
||||
parameter BLOCK_SIZE = 1,
|
||||
parameter NUM_LANES = 1,
|
||||
parameter OUT_REG = 0,
|
||||
parameter ISW_BASE = 0,
|
||||
parameter MAX_FANOUT = `MAX_FANOUT
|
||||
) (
|
||||
input wire clk,
|
||||
@@ -234,7 +235,8 @@ module VX_dispatch_unit_sane import VX_gpu_pkg::*; #(
|
||||
|
||||
`RESET_RELAY(buf_out_reset, reset);
|
||||
|
||||
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw);
|
||||
wire [ISSUE_ISW_W-1:0] routed_isw = isw + ISSUE_ISW_W'(ISW_BASE);
|
||||
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], routed_isw);
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (OUT_DATAW),
|
||||
|
||||
@@ -15,7 +15,8 @@
|
||||
|
||||
module VX_execute import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0,
|
||||
parameter TENSOR_FP16 = 0
|
||||
parameter TENSOR_FP16 = 0,
|
||||
parameter NUM_TENSOR_CORES = `NUM_TENSOR_WARPS
|
||||
) (
|
||||
`SCOPE_IO_DECL
|
||||
|
||||
@@ -37,6 +38,11 @@ module VX_execute import VX_gpu_pkg::*; #(
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||
output wire [`PERF_CTR_BITS-1:0] perf_scalar_lsu_reqs,
|
||||
output wire [`PERF_CTR_BITS-1:0] perf_tensor_lsu_reqs,
|
||||
output wire [`PERF_CTR_BITS-1:0] perf_scalar_lsu_stalls,
|
||||
output wire [`PERF_CTR_BITS-1:0] perf_tensor_lsu_stalls,
|
||||
output wire [`PERF_CTR_BITS-1:0] perf_mem_merge_stalls,
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
@@ -46,7 +52,7 @@ module VX_execute import VX_gpu_pkg::*; #(
|
||||
|
||||
VX_dispatch_if.slave alu_dispatch_if [`ISSUE_WIDTH],
|
||||
VX_commit_if.master alu_commit_if [`ISSUE_WIDTH],
|
||||
VX_branch_ctl_if.master branch_ctl_if [`NUM_ALU_BLOCKS],
|
||||
VX_branch_ctl_if.master branch_ctl_if [2 * `NUM_ALU_BLOCKS],
|
||||
|
||||
VX_dispatch_if.slave lsu_dispatch_if [`ISSUE_WIDTH],
|
||||
VX_commit_if.master lsu_commit_if [`ISSUE_WIDTH],
|
||||
@@ -56,12 +62,33 @@ module VX_execute import VX_gpu_pkg::*; #(
|
||||
VX_warp_ctl_if.master warp_ctl_if,
|
||||
|
||||
`ifdef EXT_T_ENABLE
|
||||
VX_dispatch_if.slave tensor_alu_dispatch_if [`ISSUE_WIDTH],
|
||||
VX_dispatch_if.slave tensor_lsu_dispatch_if [`ISSUE_WIDTH],
|
||||
VX_dispatch_if.slave tensor_ctrl_dispatch_if [`ISSUE_WIDTH],
|
||||
VX_dispatch_if.slave tensor_dispatch_if [`ISSUE_WIDTH],
|
||||
VX_commit_if.master tensor_commit_if [`ISSUE_WIDTH],
|
||||
`ifdef EXT_T_HOPPER
|
||||
VX_tc_rf_if.master tensor_regfile_if,
|
||||
VX_tc_bus_if.master tensor_smem_A_if,
|
||||
VX_tc_bus_if.master tensor_smem_B_if,
|
||||
output wire tensor_csr_unlock_valid,
|
||||
output wire [`NW_WIDTH-1:0] tensor_csr_unlock_wid,
|
||||
output wire tensor_tmc_valid,
|
||||
output wire [`NW_WIDTH-1:0] tensor_tmc_wid,
|
||||
output wire [`NUM_THREADS-1:0] tensor_tmc_tmask,
|
||||
`ifdef EXT_T_ASYNC
|
||||
VX_tc_rf_if.master tensor_regfile_if[NUM_TENSOR_CORES],
|
||||
VX_tc_bus_if.master tensor_smem_A_if[NUM_TENSOR_CORES],
|
||||
output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_A_ren,
|
||||
input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_A_rready,
|
||||
output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_A_raddr,
|
||||
input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_A_rdata,
|
||||
output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_ren,
|
||||
input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_rready,
|
||||
output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_C_raddr,
|
||||
input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_rdata,
|
||||
output logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_wen,
|
||||
input logic [NUM_TENSOR_CORES-1:0] tensor_tmem_C_wready,
|
||||
output logic [NUM_TENSOR_CORES*9-1:0] tensor_tmem_C_waddr,
|
||||
output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tensor_tmem_C_wdata,
|
||||
output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN/8-1:0] tensor_tmem_C_mask,
|
||||
VX_tc_bus_if.master tensor_smem_B_if[NUM_TENSOR_CORES],
|
||||
`endif
|
||||
`endif
|
||||
|
||||
@@ -76,23 +103,196 @@ module VX_execute import VX_gpu_pkg::*; #(
|
||||
`ifdef EXT_F_ENABLE
|
||||
VX_fpu_to_csr_if fpu_to_csr_if[`NUM_FPU_BLOCKS]();
|
||||
`endif
|
||||
`ifdef EXT_T_ENABLE
|
||||
VX_warp_ctl_if scalar_warp_ctl_if();
|
||||
VX_warp_ctl_if tensor_warp_ctl_if();
|
||||
|
||||
localparam WARP_CTL_DATAW = `NW_WIDTH + $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t);
|
||||
|
||||
wire [WARP_CTL_DATAW-1:0] scalar_warp_ctl_data_in;
|
||||
wire [WARP_CTL_DATAW-1:0] tensor_warp_ctl_data_in;
|
||||
reg [WARP_CTL_DATAW-1:0] scalar_warp_ctl_data_r;
|
||||
reg [WARP_CTL_DATAW-1:0] tensor_warp_ctl_data_r;
|
||||
reg scalar_warp_ctl_valid_r;
|
||||
reg tensor_warp_ctl_valid_r;
|
||||
reg warp_ctl_rr;
|
||||
|
||||
assign scalar_warp_ctl_data_in = {scalar_warp_ctl_if.wid, scalar_warp_ctl_if.tmc, scalar_warp_ctl_if.wspawn, scalar_warp_ctl_if.split, scalar_warp_ctl_if.sjoin, scalar_warp_ctl_if.barrier};
|
||||
assign tensor_warp_ctl_data_in = {tensor_warp_ctl_if.wid, tensor_warp_ctl_if.tmc, tensor_warp_ctl_if.wspawn, tensor_warp_ctl_if.split, tensor_warp_ctl_if.sjoin, tensor_warp_ctl_if.barrier};
|
||||
|
||||
wire scalar_warp_ctl_candidate_valid = scalar_warp_ctl_valid_r || scalar_warp_ctl_if.valid;
|
||||
wire tensor_warp_ctl_candidate_valid = tensor_warp_ctl_valid_r || tensor_warp_ctl_if.valid;
|
||||
wire select_tensor_warp_ctl = tensor_warp_ctl_candidate_valid && (!scalar_warp_ctl_candidate_valid || warp_ctl_rr);
|
||||
wire [WARP_CTL_DATAW-1:0] scalar_warp_ctl_data_out = scalar_warp_ctl_valid_r ? scalar_warp_ctl_data_r : scalar_warp_ctl_data_in;
|
||||
wire [WARP_CTL_DATAW-1:0] tensor_warp_ctl_data_out = tensor_warp_ctl_valid_r ? tensor_warp_ctl_data_r : tensor_warp_ctl_data_in;
|
||||
wire [WARP_CTL_DATAW-1:0] selected_warp_ctl_data = select_tensor_warp_ctl ? tensor_warp_ctl_data_out : scalar_warp_ctl_data_out;
|
||||
|
||||
wire consume_scalar_warp_ctl_pending = !select_tensor_warp_ctl && scalar_warp_ctl_valid_r;
|
||||
wire consume_scalar_warp_ctl_input = !select_tensor_warp_ctl && !scalar_warp_ctl_valid_r && scalar_warp_ctl_if.valid;
|
||||
wire consume_tensor_warp_ctl_pending = select_tensor_warp_ctl && tensor_warp_ctl_valid_r;
|
||||
wire consume_tensor_warp_ctl_input = select_tensor_warp_ctl && !tensor_warp_ctl_valid_r && tensor_warp_ctl_if.valid;
|
||||
|
||||
assign warp_ctl_if.valid = scalar_warp_ctl_candidate_valid || tensor_warp_ctl_candidate_valid;
|
||||
assign {warp_ctl_if.wid, warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.sjoin, warp_ctl_if.barrier} = selected_warp_ctl_data;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
scalar_warp_ctl_valid_r <= 1'b0;
|
||||
tensor_warp_ctl_valid_r <= 1'b0;
|
||||
warp_ctl_rr <= 1'b0;
|
||||
end else begin
|
||||
if (scalar_warp_ctl_candidate_valid && tensor_warp_ctl_candidate_valid) begin
|
||||
warp_ctl_rr <= !select_tensor_warp_ctl;
|
||||
end
|
||||
|
||||
if (scalar_warp_ctl_valid_r) begin
|
||||
if (consume_scalar_warp_ctl_pending) begin
|
||||
scalar_warp_ctl_valid_r <= scalar_warp_ctl_if.valid;
|
||||
scalar_warp_ctl_data_r <= scalar_warp_ctl_data_in;
|
||||
end
|
||||
end else if (scalar_warp_ctl_if.valid && !consume_scalar_warp_ctl_input) begin
|
||||
scalar_warp_ctl_valid_r <= 1'b1;
|
||||
scalar_warp_ctl_data_r <= scalar_warp_ctl_data_in;
|
||||
end
|
||||
|
||||
if (tensor_warp_ctl_valid_r) begin
|
||||
if (consume_tensor_warp_ctl_pending) begin
|
||||
tensor_warp_ctl_valid_r <= tensor_warp_ctl_if.valid;
|
||||
tensor_warp_ctl_data_r <= tensor_warp_ctl_data_in;
|
||||
end
|
||||
end else if (tensor_warp_ctl_if.valid && !consume_tensor_warp_ctl_input) begin
|
||||
tensor_warp_ctl_valid_r <= 1'b1;
|
||||
tensor_warp_ctl_data_r <= tensor_warp_ctl_data_in;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
`RUNTIME_ASSERT(
|
||||
!(scalar_warp_ctl_valid_r && scalar_warp_ctl_if.valid && !consume_scalar_warp_ctl_pending),
|
||||
("%t: *** core%0d-scalar-warp-ctl-merge-overflow", $time, CORE_ID)
|
||||
)
|
||||
`RUNTIME_ASSERT(
|
||||
!(tensor_warp_ctl_valid_r && tensor_warp_ctl_if.valid && !consume_tensor_warp_ctl_pending),
|
||||
("%t: *** core%0d-tensor-warp-ctl-merge-overflow", $time, CORE_ID)
|
||||
)
|
||||
`endif
|
||||
|
||||
`RESET_RELAY (alu_reset, reset);
|
||||
`RESET_RELAY (lsu_reset, reset);
|
||||
`RESET_RELAY (sfu_reset, reset);
|
||||
|
||||
VX_commit_if alu_scalar_commit_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_alu_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) alu_unit (
|
||||
.clk (clk),
|
||||
.reset (alu_reset),
|
||||
.dispatch_if (alu_dispatch_if),
|
||||
.branch_ctl_if (branch_ctl_if),
|
||||
.commit_if (alu_commit_if)
|
||||
.branch_ctl_if (branch_ctl_if[0 +: `NUM_ALU_BLOCKS]),
|
||||
.commit_if (alu_scalar_commit_if)
|
||||
);
|
||||
|
||||
`ifdef EXT_T_ENABLE
|
||||
VX_commit_if alu_tensor_commit_if[`ISSUE_WIDTH]();
|
||||
|
||||
`RESET_RELAY (tensor_alu_reset, reset);
|
||||
|
||||
VX_alu_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) tensor_alu_unit (
|
||||
.clk (clk),
|
||||
.reset (tensor_alu_reset),
|
||||
.dispatch_if (tensor_alu_dispatch_if),
|
||||
.branch_ctl_if (branch_ctl_if[`NUM_ALU_BLOCKS +: `NUM_ALU_BLOCKS]),
|
||||
.commit_if (alu_tensor_commit_if)
|
||||
);
|
||||
|
||||
localparam ALU_COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_alu_domain_commit
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.DATAW (ALU_COMMIT_DATAW),
|
||||
.ARBITER ("R"),
|
||||
.OUT_REG (1)
|
||||
) alu_commit_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in ({alu_tensor_commit_if[i].valid, alu_scalar_commit_if[i].valid}),
|
||||
.ready_in ({alu_tensor_commit_if[i].ready, alu_scalar_commit_if[i].ready}),
|
||||
.data_in ({alu_tensor_commit_if[i].data, alu_scalar_commit_if[i].data}),
|
||||
.data_out (alu_commit_if[i].data),
|
||||
.valid_out (alu_commit_if[i].valid),
|
||||
.ready_out (alu_commit_if[i].ready),
|
||||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
|
||||
always @(posedge clk) begin
|
||||
if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)) begin
|
||||
if (alu_scalar_commit_if[i].valid
|
||||
&& ((alu_scalar_commit_if[i].data.PC == 32'h80000010) || (alu_scalar_commit_if[i].data.PC == 32'h80000014))) begin
|
||||
`TRACE(1, ("%d: core%0d-execute-alu-scalar-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
|
||||
$time, CORE_ID, i, alu_scalar_commit_if[i].valid, alu_scalar_commit_if[i].ready,
|
||||
alu_scalar_commit_if[i].data.wid, alu_scalar_commit_if[i].data.PC,
|
||||
alu_scalar_commit_if[i].data.wb, alu_scalar_commit_if[i].data.rd,
|
||||
alu_scalar_commit_if[i].data.sop, alu_scalar_commit_if[i].data.eop,
|
||||
alu_scalar_commit_if[i].data.uuid));
|
||||
end
|
||||
if (alu_tensor_commit_if[i].valid
|
||||
&& ((alu_tensor_commit_if[i].data.PC == 32'h80000010) || (alu_tensor_commit_if[i].data.PC == 32'h80000014))) begin
|
||||
`TRACE(1, ("%d: core%0d-execute-alu-tensor-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
|
||||
$time, CORE_ID, i, alu_tensor_commit_if[i].valid, alu_tensor_commit_if[i].ready,
|
||||
alu_tensor_commit_if[i].data.wid, alu_tensor_commit_if[i].data.PC,
|
||||
alu_tensor_commit_if[i].data.wb, alu_tensor_commit_if[i].data.rd,
|
||||
alu_tensor_commit_if[i].data.sop, alu_tensor_commit_if[i].data.eop,
|
||||
alu_tensor_commit_if[i].data.uuid));
|
||||
end
|
||||
if (alu_commit_if[i].valid
|
||||
&& ((alu_commit_if[i].data.PC == 32'h80000010) || (alu_commit_if[i].data.PC == 32'h80000014))) begin
|
||||
`TRACE(1, ("%d: core%0d-execute-alu-domain-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
|
||||
$time, CORE_ID, i, alu_commit_if[i].valid, alu_commit_if[i].ready,
|
||||
alu_commit_if[i].data.wid, alu_commit_if[i].data.PC,
|
||||
alu_commit_if[i].data.wb, alu_commit_if[i].data.rd,
|
||||
alu_commit_if[i].data.sop, alu_commit_if[i].data.eop,
|
||||
alu_commit_if[i].data.uuid));
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
end
|
||||
`else
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_alu_commit_passthru
|
||||
assign alu_commit_if[i].valid = alu_scalar_commit_if[i].valid;
|
||||
assign alu_commit_if[i].data = alu_scalar_commit_if[i].data;
|
||||
assign alu_scalar_commit_if[i].ready = alu_commit_if[i].ready;
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
|
||||
always @(posedge clk) begin
|
||||
if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)) begin
|
||||
if (alu_commit_if[i].valid
|
||||
&& ((alu_commit_if[i].data.PC == 32'h80000010) || (alu_commit_if[i].data.PC == 32'h80000014))) begin
|
||||
`TRACE(1, ("%d: core%0d-execute-alu-domain-commit: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
|
||||
$time, CORE_ID, i, alu_commit_if[i].valid, alu_commit_if[i].ready,
|
||||
alu_commit_if[i].data.wid, alu_commit_if[i].data.PC,
|
||||
alu_commit_if[i].data.wb, alu_commit_if[i].data.rd,
|
||||
alu_commit_if[i].data.sop, alu_commit_if[i].data.eop,
|
||||
alu_commit_if[i].data.uuid));
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
end
|
||||
`endif
|
||||
|
||||
`SCOPE_IO_SWITCH (1)
|
||||
|
||||
VX_commit_if lsu_scalar_commit_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) scalar_lsu_bus_if[DCACHE_NUM_REQS]();
|
||||
|
||||
VX_lsu_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) lsu_unit (
|
||||
@@ -100,11 +300,184 @@ module VX_execute import VX_gpu_pkg::*; #(
|
||||
.clk (clk),
|
||||
.reset (lsu_reset),
|
||||
.downstream_mem_busy (downstream_mem_busy),
|
||||
.cache_bus_if (dcache_bus_if),
|
||||
.cache_bus_if (scalar_lsu_bus_if),
|
||||
.dispatch_if (lsu_dispatch_if),
|
||||
.commit_if (lsu_commit_if)
|
||||
.commit_if (lsu_scalar_commit_if)
|
||||
);
|
||||
|
||||
`ifdef EXT_T_ENABLE
|
||||
VX_commit_if lsu_tensor_commit_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) tensor_lsu_bus_if[DCACHE_NUM_REQS]();
|
||||
|
||||
`RESET_RELAY (tensor_lsu_reset, reset);
|
||||
|
||||
VX_lsu_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) tensor_lsu_unit (
|
||||
`SCOPE_IO_BIND (0)
|
||||
.clk (clk),
|
||||
.reset (tensor_lsu_reset),
|
||||
.downstream_mem_busy (downstream_mem_busy),
|
||||
.cache_bus_if (tensor_lsu_bus_if),
|
||||
.dispatch_if (tensor_lsu_dispatch_if),
|
||||
.commit_if (lsu_tensor_commit_if)
|
||||
);
|
||||
|
||||
localparam LSU_COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_lsu_domain_commit
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.DATAW (LSU_COMMIT_DATAW),
|
||||
.ARBITER ("R"),
|
||||
.OUT_REG (1)
|
||||
) lsu_commit_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in ({lsu_tensor_commit_if[i].valid, lsu_scalar_commit_if[i].valid}),
|
||||
.ready_in ({lsu_tensor_commit_if[i].ready, lsu_scalar_commit_if[i].ready}),
|
||||
.data_in ({lsu_tensor_commit_if[i].data, lsu_scalar_commit_if[i].data}),
|
||||
.data_out (lsu_commit_if[i].data),
|
||||
.valid_out (lsu_commit_if[i].valid),
|
||||
.ready_out (lsu_commit_if[i].ready),
|
||||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
end
|
||||
|
||||
wire scalar_lsu_req_any;
|
||||
wire tensor_lsu_req_any;
|
||||
wire [DCACHE_NUM_REQS-1:0] scalar_lsu_req_valids;
|
||||
wire [DCACHE_NUM_REQS-1:0] tensor_lsu_req_valids;
|
||||
wire [DCACHE_NUM_REQS-1:0] lsu_req_fires;
|
||||
wire [DCACHE_NUM_REQS-1:0] lsu_rd_req_fires;
|
||||
wire [DCACHE_NUM_REQS-1:0] lsu_rsp_fires;
|
||||
reg lsu_domain_rr;
|
||||
reg lsu_active_domain;
|
||||
reg [15:0] lsu_pending_reads;
|
||||
logic lsu_select_tensor;
|
||||
logic [`CLOG2(DCACHE_NUM_REQS+1)-1:0] lsu_rd_req_fire_count;
|
||||
logic [`CLOG2(DCACHE_NUM_REQS+1)-1:0] lsu_rsp_fire_count;
|
||||
|
||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin : g_lsu_domain_mem
|
||||
assign scalar_lsu_req_valids[i] = scalar_lsu_bus_if[i].req_valid;
|
||||
assign tensor_lsu_req_valids[i] = tensor_lsu_bus_if[i].req_valid;
|
||||
|
||||
assign dcache_bus_if[i].req_valid = lsu_select_tensor ? tensor_lsu_bus_if[i].req_valid : scalar_lsu_bus_if[i].req_valid;
|
||||
assign dcache_bus_if[i].req_data = lsu_select_tensor ? tensor_lsu_bus_if[i].req_data : scalar_lsu_bus_if[i].req_data;
|
||||
assign scalar_lsu_bus_if[i].req_ready = !lsu_select_tensor && dcache_bus_if[i].req_ready;
|
||||
assign tensor_lsu_bus_if[i].req_ready = lsu_select_tensor && dcache_bus_if[i].req_ready;
|
||||
|
||||
assign scalar_lsu_bus_if[i].rsp_valid = !lsu_active_domain && dcache_bus_if[i].rsp_valid;
|
||||
assign scalar_lsu_bus_if[i].rsp_data = dcache_bus_if[i].rsp_data;
|
||||
assign tensor_lsu_bus_if[i].rsp_valid = lsu_active_domain && dcache_bus_if[i].rsp_valid;
|
||||
assign tensor_lsu_bus_if[i].rsp_data = dcache_bus_if[i].rsp_data;
|
||||
assign dcache_bus_if[i].rsp_ready = lsu_active_domain ? tensor_lsu_bus_if[i].rsp_ready : scalar_lsu_bus_if[i].rsp_ready;
|
||||
|
||||
assign lsu_req_fires[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready;
|
||||
assign lsu_rd_req_fires[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && !dcache_bus_if[i].req_data.rw;
|
||||
assign lsu_rsp_fires[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
assign scalar_lsu_req_any = |scalar_lsu_req_valids;
|
||||
assign tensor_lsu_req_any = |tensor_lsu_req_valids;
|
||||
|
||||
always @(*) begin
|
||||
if (lsu_pending_reads != 0) begin
|
||||
lsu_select_tensor = lsu_active_domain;
|
||||
end else if (scalar_lsu_req_any && tensor_lsu_req_any) begin
|
||||
lsu_select_tensor = lsu_domain_rr;
|
||||
end else begin
|
||||
lsu_select_tensor = tensor_lsu_req_any;
|
||||
end
|
||||
|
||||
lsu_rd_req_fire_count = '0;
|
||||
lsu_rsp_fire_count = '0;
|
||||
for (integer i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||
lsu_rd_req_fire_count = lsu_rd_req_fire_count + `CLOG2(DCACHE_NUM_REQS+1)'(lsu_rd_req_fires[i]);
|
||||
lsu_rsp_fire_count = lsu_rsp_fire_count + `CLOG2(DCACHE_NUM_REQS+1)'(lsu_rsp_fires[i]);
|
||||
end
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
lsu_domain_rr <= 1'b0;
|
||||
lsu_active_domain <= 1'b0;
|
||||
lsu_pending_reads <= '0;
|
||||
end else begin
|
||||
if (lsu_pending_reads == 0 && (|lsu_req_fires)) begin
|
||||
lsu_domain_rr <= ~lsu_select_tensor;
|
||||
if (lsu_rd_req_fire_count != 0) begin
|
||||
lsu_active_domain <= lsu_select_tensor;
|
||||
end
|
||||
end
|
||||
lsu_pending_reads <= lsu_pending_reads + 16'(lsu_rd_req_fire_count) - 16'(lsu_rsp_fire_count);
|
||||
end
|
||||
end
|
||||
|
||||
`RUNTIME_ASSERT(
|
||||
!(lsu_pending_reads == 0 && (|lsu_rsp_fires)),
|
||||
("%t: *** core%0d-lsu-domain-arb-unmatched-response", $time, CORE_ID)
|
||||
)
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`PERF_CTR_BITS-1:0] perf_scalar_lsu_reqs_r;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_tensor_lsu_reqs_r;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_scalar_lsu_stalls_r;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_tensor_lsu_stalls_r;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_merge_stalls_r;
|
||||
|
||||
wire scalar_lsu_req_fire_any = (|lsu_req_fires) && !lsu_select_tensor;
|
||||
wire tensor_lsu_req_fire_any = (|lsu_req_fires) && lsu_select_tensor;
|
||||
wire scalar_lsu_merge_stall = scalar_lsu_req_any && lsu_select_tensor;
|
||||
wire tensor_lsu_merge_stall = tensor_lsu_req_any && !lsu_select_tensor;
|
||||
wire mem_merge_stall = scalar_lsu_req_any && tensor_lsu_req_any;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_scalar_lsu_reqs_r <= '0;
|
||||
perf_tensor_lsu_reqs_r <= '0;
|
||||
perf_scalar_lsu_stalls_r <= '0;
|
||||
perf_tensor_lsu_stalls_r <= '0;
|
||||
perf_mem_merge_stalls_r <= '0;
|
||||
end else begin
|
||||
perf_scalar_lsu_reqs_r <= perf_scalar_lsu_reqs_r + `PERF_CTR_BITS'(scalar_lsu_req_fire_any);
|
||||
perf_tensor_lsu_reqs_r <= perf_tensor_lsu_reqs_r + `PERF_CTR_BITS'(tensor_lsu_req_fire_any);
|
||||
perf_scalar_lsu_stalls_r <= perf_scalar_lsu_stalls_r + `PERF_CTR_BITS'(scalar_lsu_merge_stall);
|
||||
perf_tensor_lsu_stalls_r <= perf_tensor_lsu_stalls_r + `PERF_CTR_BITS'(tensor_lsu_merge_stall);
|
||||
perf_mem_merge_stalls_r <= perf_mem_merge_stalls_r + `PERF_CTR_BITS'(mem_merge_stall);
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_scalar_lsu_reqs = perf_scalar_lsu_reqs_r;
|
||||
assign perf_tensor_lsu_reqs = perf_tensor_lsu_reqs_r;
|
||||
assign perf_scalar_lsu_stalls = perf_scalar_lsu_stalls_r;
|
||||
assign perf_tensor_lsu_stalls = perf_tensor_lsu_stalls_r;
|
||||
assign perf_mem_merge_stalls = perf_mem_merge_stalls_r;
|
||||
`endif
|
||||
`else
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_lsu_commit_passthru
|
||||
assign lsu_commit_if[i].valid = lsu_scalar_commit_if[i].valid;
|
||||
assign lsu_commit_if[i].data = lsu_scalar_commit_if[i].data;
|
||||
assign lsu_scalar_commit_if[i].ready = lsu_commit_if[i].ready;
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin : g_lsu_mem_passthru
|
||||
`ASSIGN_VX_MEM_BUS_IF(dcache_bus_if[i], scalar_lsu_bus_if[i]);
|
||||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
assign perf_scalar_lsu_reqs = '0;
|
||||
assign perf_tensor_lsu_reqs = '0;
|
||||
assign perf_scalar_lsu_stalls = '0;
|
||||
assign perf_tensor_lsu_stalls = '0;
|
||||
assign perf_mem_merge_stalls = '0;
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
`RESET_RELAY (fpu_reset, reset);
|
||||
|
||||
@@ -140,7 +513,11 @@ module VX_execute import VX_gpu_pkg::*; #(
|
||||
|
||||
.commit_csr_if (commit_csr_if),
|
||||
.sched_csr_if (sched_csr_if),
|
||||
`ifdef EXT_T_ENABLE
|
||||
.warp_ctl_if (scalar_warp_ctl_if),
|
||||
`else
|
||||
.warp_ctl_if (warp_ctl_if),
|
||||
`endif
|
||||
.commit_if (sfu_commit_if),
|
||||
|
||||
.acc_read_in (acc_read_in),
|
||||
@@ -149,20 +526,73 @@ module VX_execute import VX_gpu_pkg::*; #(
|
||||
);
|
||||
|
||||
`ifdef EXT_T_ENABLE
|
||||
VX_commit_if tensor_core_commit_if[`ISSUE_WIDTH]();
|
||||
VX_commit_if tensor_ctrl_commit_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_tensor_ctrl_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) tensor_ctrl_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.dispatch_if (tensor_ctrl_dispatch_if),
|
||||
.commit_if (tensor_ctrl_commit_if),
|
||||
.warp_ctl_if (tensor_warp_ctl_if),
|
||||
.csr_unlock_valid (tensor_csr_unlock_valid),
|
||||
.csr_unlock_wid (tensor_csr_unlock_wid),
|
||||
.tmc_valid (tensor_tmc_valid),
|
||||
.tmc_wid (tensor_tmc_wid),
|
||||
.tmc_tmask (tensor_tmc_tmask)
|
||||
);
|
||||
|
||||
VX_tensor_core #(
|
||||
.FP16 (TENSOR_FP16)
|
||||
.FP16 (TENSOR_FP16),
|
||||
.NUM_TENSOR_CORES (NUM_TENSOR_CORES)
|
||||
) tensor_core (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
|
||||
.dispatch_if(tensor_dispatch_if),
|
||||
`ifdef EXT_T_HOPPER
|
||||
`ifdef EXT_T_ASYNC
|
||||
.regfile_if(tensor_regfile_if),
|
||||
.smem_A_if(tensor_smem_A_if),
|
||||
.tmem_A_ren(tensor_tmem_A_ren),
|
||||
.tmem_A_rready(tensor_tmem_A_rready),
|
||||
.tmem_A_raddr(tensor_tmem_A_raddr),
|
||||
.tmem_A_rdata(tensor_tmem_A_rdata),
|
||||
.tmem_C_ren(tensor_tmem_C_ren),
|
||||
.tmem_C_rready(tensor_tmem_C_rready),
|
||||
.tmem_C_raddr(tensor_tmem_C_raddr),
|
||||
.tmem_C_rdata(tensor_tmem_C_rdata),
|
||||
.tmem_C_wen(tensor_tmem_C_wen),
|
||||
.tmem_C_wready(tensor_tmem_C_wready),
|
||||
.tmem_C_waddr(tensor_tmem_C_waddr),
|
||||
.tmem_C_wdata(tensor_tmem_C_wdata),
|
||||
.tmem_C_mask(tensor_tmem_C_mask),
|
||||
.smem_B_if(tensor_smem_B_if),
|
||||
`endif
|
||||
.commit_if(tensor_commit_if)
|
||||
.commit_if(tensor_core_commit_if)
|
||||
);
|
||||
|
||||
localparam TENSOR_COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_tensor_commit_arb
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.DATAW (TENSOR_COMMIT_DATAW),
|
||||
.ARBITER ("R"),
|
||||
.OUT_REG (1)
|
||||
) tensor_commit_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in ({tensor_ctrl_commit_if[i].valid, tensor_core_commit_if[i].valid}),
|
||||
.ready_in ({tensor_ctrl_commit_if[i].ready, tensor_core_commit_if[i].ready}),
|
||||
.data_in ({tensor_ctrl_commit_if[i].data, tensor_core_commit_if[i].data}),
|
||||
.data_out (tensor_commit_if[i].data),
|
||||
.valid_out (tensor_commit_if[i].valid),
|
||||
.ready_out (tensor_commit_if[i].ready),
|
||||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
end
|
||||
`endif
|
||||
|
||||
// simulation helper signal to get RISC-V tests Pass/Fail status
|
||||
|
||||
@@ -25,27 +25,40 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
||||
VX_mem_bus_if.master icache_bus_if,
|
||||
|
||||
// inputs
|
||||
VX_schedule_if.slave schedule_if,
|
||||
VX_schedule_if.slave scalar_schedule_if,
|
||||
VX_schedule_if.slave tensor_schedule_if,
|
||||
|
||||
// outputs
|
||||
VX_fetch_if.master fetch_if
|
||||
VX_fetch_if.master scalar_fetch_if,
|
||||
VX_fetch_if.master tensor_fetch_if
|
||||
);
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
`UNUSED_VAR (reset)
|
||||
|
||||
wire icache_req_valid;
|
||||
wire [ICACHE_ADDR_WIDTH-1:0] icache_req_addr;
|
||||
wire [ICACHE_TAG_WIDTH-1:0] icache_req_tag;
|
||||
wire icache_req_ready;
|
||||
|
||||
wire rsp_domain;
|
||||
wire [`UUID_WIDTH-1:0] rsp_uuid;
|
||||
wire [`NW_WIDTH-1:0] req_tag, rsp_tag;
|
||||
wire [`NW_WIDTH-1:0] req_tag, rsp_tag;
|
||||
reg fetch_domain_rr;
|
||||
|
||||
wire icache_req_fire = icache_req_valid && icache_req_ready;
|
||||
|
||||
assign req_tag = schedule_if.data.wid;
|
||||
wire scalar_req_valid = scalar_schedule_if.valid;
|
||||
wire tensor_req_valid = tensor_schedule_if.valid;
|
||||
wire select_tensor_req = tensor_req_valid && (!scalar_req_valid || fetch_domain_rr);
|
||||
wire selected_domain = select_tensor_req ? WU_DOMAIN_TENSOR : WU_DOMAIN_SCALAR;
|
||||
wire selected_valid = scalar_req_valid || tensor_req_valid;
|
||||
wire [`NW_WIDTH-1:0] selected_wid = select_tensor_req ? tensor_schedule_if.data.wid : scalar_schedule_if.data.wid;
|
||||
wire [`XLEN-1:0] selected_pc = select_tensor_req ? tensor_schedule_if.data.PC : scalar_schedule_if.data.PC;
|
||||
wire [`NUM_THREADS-1:0] selected_tmask = select_tensor_req ? tensor_schedule_if.data.tmask : scalar_schedule_if.data.tmask;
|
||||
wire [`UUID_WIDTH-1:0] selected_uuid = select_tensor_req ? tensor_schedule_if.data.uuid : scalar_schedule_if.data.uuid;
|
||||
|
||||
assign req_tag = selected_wid;
|
||||
|
||||
assign {rsp_uuid, rsp_tag} = icache_bus_if.rsp_data.tag;
|
||||
assign {rsp_domain, rsp_uuid, rsp_tag} = icache_bus_if.rsp_data.tag;
|
||||
|
||||
wire [`XLEN-1:0] rsp_PC;
|
||||
wire [`NUM_THREADS-1:0] rsp_tmask;
|
||||
@@ -60,7 +73,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
||||
.write (icache_req_fire),
|
||||
`UNUSED_PIN (wren),
|
||||
.waddr (req_tag),
|
||||
.wdata ({schedule_if.data.PC, schedule_if.data.tmask}),
|
||||
.wdata ({selected_pc, selected_tmask}),
|
||||
.raddr (rsp_tag),
|
||||
.rdata ({rsp_PC, rsp_tmask})
|
||||
);
|
||||
@@ -69,7 +82,8 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
||||
// Ensure that the ibuffer doesn't fill up.
|
||||
// This resolves potential deadlock if ibuffer fills and the LSU stalls the execute stage due to pending dcache request.
|
||||
// This issue is particularly prevalent when the icache and dcache is disabled and both requests share the same bus.
|
||||
wire [ISSUE_ISW-1:0] schedule_isw = wid_to_isw(schedule_if.data.wid);
|
||||
wire [ISSUE_ISW-1:0] schedule_isw = wid_to_isw(selected_wid);
|
||||
wire [`ISSUE_WIDTH-1:0] domain_ibuf_pop = scalar_fetch_if.ibuf_pop | tensor_fetch_if.ibuf_pop;
|
||||
|
||||
wire [`ISSUE_WIDTH-1:0] pending_ibuf_full;
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
@@ -79,7 +93,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.incr (icache_req_fire && schedule_isw == i),
|
||||
.decr (fetch_if.ibuf_pop[i]),
|
||||
.decr (domain_ibuf_pop[i]),
|
||||
.full (pending_ibuf_full[i]),
|
||||
`UNUSED_PIN (size),
|
||||
`UNUSED_PIN (empty)
|
||||
@@ -90,15 +104,24 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
||||
wire ibuf_ready = 1'b1;
|
||||
`endif
|
||||
|
||||
`RUNTIME_ASSERT((!schedule_if.valid || schedule_if.data.PC != 0),
|
||||
("%t: *** invalid PC=0x%0h, wid=%0d, tmask=%b (#%0d)", $time, schedule_if.data.PC, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.uuid))
|
||||
`RUNTIME_ASSERT((!selected_valid || selected_pc != 0),
|
||||
("%t: *** invalid PC=0x%0h, wid=%0d, tmask=%b (#%0d)", $time, selected_pc, selected_wid, selected_tmask, selected_uuid))
|
||||
|
||||
// Icache Request
|
||||
|
||||
assign icache_req_valid = schedule_if.valid && ibuf_ready;
|
||||
assign icache_req_addr = schedule_if.data.PC[`MEM_ADDR_WIDTH-1:2];
|
||||
assign icache_req_tag = {schedule_if.data.uuid, req_tag};
|
||||
assign schedule_if.ready = icache_req_ready && ibuf_ready;
|
||||
assign icache_req_valid = selected_valid && ibuf_ready;
|
||||
assign icache_req_addr = selected_pc[`MEM_ADDR_WIDTH-1:2];
|
||||
assign icache_req_tag = {selected_domain, selected_uuid, req_tag};
|
||||
assign scalar_schedule_if.ready = icache_req_ready && ibuf_ready && selected_valid && !select_tensor_req;
|
||||
assign tensor_schedule_if.ready = icache_req_ready && ibuf_ready && selected_valid && select_tensor_req;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
fetch_domain_rr <= 1'b0;
|
||||
end else if (icache_req_fire && scalar_req_valid && tensor_req_valid) begin
|
||||
fetch_domain_rr <= ~fetch_domain_rr;
|
||||
end
|
||||
end
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (ICACHE_ADDR_WIDTH + ICACHE_TAG_WIDTH),
|
||||
@@ -121,18 +144,26 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
||||
|
||||
// Icache Response
|
||||
|
||||
assign fetch_if.valid = icache_bus_if.rsp_valid;
|
||||
assign fetch_if.data.tmask = rsp_tmask;
|
||||
assign fetch_if.data.wid = rsp_tag;
|
||||
assign fetch_if.data.PC = rsp_PC;
|
||||
assign fetch_if.data.instr = icache_bus_if.rsp_data.data;
|
||||
assign fetch_if.data.uuid = rsp_uuid;
|
||||
assign icache_bus_if.rsp_ready = fetch_if.ready;
|
||||
assign scalar_fetch_if.valid = icache_bus_if.rsp_valid && (rsp_domain == WU_DOMAIN_SCALAR);
|
||||
assign scalar_fetch_if.data.tmask = rsp_tmask;
|
||||
assign scalar_fetch_if.data.wid = rsp_tag;
|
||||
assign scalar_fetch_if.data.PC = rsp_PC;
|
||||
assign scalar_fetch_if.data.instr = icache_bus_if.rsp_data.data;
|
||||
assign scalar_fetch_if.data.uuid = rsp_uuid;
|
||||
|
||||
assign tensor_fetch_if.valid = icache_bus_if.rsp_valid && (rsp_domain == WU_DOMAIN_TENSOR);
|
||||
assign tensor_fetch_if.data.tmask = rsp_tmask;
|
||||
assign tensor_fetch_if.data.wid = rsp_tag;
|
||||
assign tensor_fetch_if.data.PC = rsp_PC;
|
||||
assign tensor_fetch_if.data.instr = icache_bus_if.rsp_data.data;
|
||||
assign tensor_fetch_if.data.uuid = rsp_uuid;
|
||||
|
||||
assign icache_bus_if.rsp_ready = (rsp_domain == WU_DOMAIN_TENSOR) ? tensor_fetch_if.ready : scalar_fetch_if.ready;
|
||||
|
||||
`ifdef DBG_SCOPE_FETCH
|
||||
if (CORE_ID == 0) begin
|
||||
`ifdef SCOPE
|
||||
wire schedule_fire = schedule_if.valid && schedule_if.ready;
|
||||
wire schedule_fire = icache_req_fire;
|
||||
wire icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
|
||||
VX_scope_tap #(
|
||||
.SCOPE_ID (1),
|
||||
@@ -150,7 +181,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
||||
icache_rsp_fire
|
||||
}),
|
||||
.probes({
|
||||
schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC,
|
||||
selected_uuid, selected_wid, selected_tmask, selected_pc,
|
||||
icache_bus_if.req_data.tag, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr,
|
||||
icache_bus_if.rsp_data.data, icache_bus_if.rsp_data.tag
|
||||
}),
|
||||
@@ -161,7 +192,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
||||
`ifdef CHIPSCOPE
|
||||
ila_fetch ila_fetch_inst (
|
||||
.clk (clk),
|
||||
.probe0 ({reset, schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC, schedule_if.ready, schedule_if.valid}),
|
||||
.probe0 ({reset, selected_uuid, selected_wid, selected_tmask, selected_pc, icache_req_ready, selected_valid}),
|
||||
.probe1 ({icache_bus_if.req_data.tag, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr, icache_bus_if.req_ready, icache_bus_if.req_valid}),
|
||||
.probe2 ({icache_bus_if.rsp_data.data, icache_bus_if.rsp_data.tag, icache_bus_if.rsp_ready, icache_bus_if.rsp_valid})
|
||||
);
|
||||
@@ -172,14 +203,18 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
||||
`endif
|
||||
|
||||
`ifdef DBG_TRACE_CORE_ICACHE
|
||||
wire schedule_fire = schedule_if.valid && schedule_if.ready;
|
||||
wire fetch_fire = fetch_if.valid && fetch_if.ready;
|
||||
wire schedule_fire = icache_req_fire;
|
||||
wire scalar_fetch_fire = scalar_fetch_if.valid && scalar_fetch_if.ready;
|
||||
wire tensor_fetch_fire = tensor_fetch_if.valid && tensor_fetch_if.ready;
|
||||
always @(posedge clk) begin
|
||||
if (schedule_fire) begin
|
||||
`TRACE(1, ("%d: I$%0d req: wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, CORE_ID, schedule_if.data.wid, schedule_if.data.PC, schedule_if.data.tmask, schedule_if.data.uuid));
|
||||
`TRACE(1, ("%d: I$%0d req: domain=%0d wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, CORE_ID, selected_domain, selected_wid, selected_pc, selected_tmask, selected_uuid));
|
||||
end
|
||||
if (fetch_fire) begin
|
||||
`TRACE(1, ("%d: I$%0d rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, CORE_ID, fetch_if.data.wid, fetch_if.data.PC, fetch_if.data.tmask, fetch_if.data.instr, fetch_if.data.uuid));
|
||||
if (scalar_fetch_fire) begin
|
||||
`TRACE(1, ("%d: I$%0d scalar rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, CORE_ID, scalar_fetch_if.data.wid, scalar_fetch_if.data.PC, scalar_fetch_if.data.tmask, scalar_fetch_if.data.instr, scalar_fetch_if.data.uuid));
|
||||
end
|
||||
if (tensor_fetch_fire) begin
|
||||
`TRACE(1, ("%d: I$%0d tensor rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, CORE_ID, tensor_fetch_if.data.wid, tensor_fetch_if.data.PC, tensor_fetch_if.data.tmask, tensor_fetch_if.data.instr, tensor_fetch_if.data.uuid));
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
@@ -127,4 +127,37 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
|
||||
assign commit_tmp_if.ready = commit_out_if[i].ready;
|
||||
end
|
||||
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
|
||||
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
|
||||
always @(posedge clk) begin
|
||||
if (!reset && ($time > `TRACE_STARTTIME)) begin
|
||||
if (commit_in_if[i].valid
|
||||
&& ((commit_in_if[i].data.PC == 32'h80000010) || (commit_in_if[i].data.PC == 32'h80000014))) begin
|
||||
`TRACE(1, ("%d: gather-commit-in: block=%0d, isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
|
||||
$time, i, commit_in_isw[i], commit_in_if[i].valid, commit_in_if[i].ready,
|
||||
commit_in_if[i].data.wid, commit_in_if[i].data.PC,
|
||||
commit_in_if[i].data.wb, commit_in_if[i].data.rd,
|
||||
commit_in_if[i].data.sop, commit_in_if[i].data.eop,
|
||||
commit_in_if[i].data.uuid));
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
always @(posedge clk) begin
|
||||
if (!reset && ($time > `TRACE_STARTTIME)) begin
|
||||
if (commit_out_if[i].valid
|
||||
&& ((commit_out_if[i].data.PC == 32'h80000010) || (commit_out_if[i].data.PC == 32'h80000014))) begin
|
||||
`TRACE(1, ("%d: gather-commit-out: isw=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
|
||||
$time, i, commit_out_if[i].valid, commit_out_if[i].ready,
|
||||
commit_out_if[i].data.wid, commit_out_if[i].data.PC,
|
||||
commit_out_if[i].data.wb, commit_out_if[i].data.rd,
|
||||
commit_out_if[i].data.sop, commit_out_if[i].data.eop,
|
||||
commit_out_if[i].data.uuid));
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
||||
@@ -12,9 +12,11 @@
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
`include "VX_trace.vh"
|
||||
|
||||
module VX_ibuffer import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0
|
||||
parameter CORE_ID = 0,
|
||||
parameter DOMAIN = WU_DOMAIN_SCALAR
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
@@ -33,12 +35,28 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
|
||||
|
||||
wire [ISW_WIDTH-1:0] decode_isw = wid_to_isw(decode_if.data.wid);
|
||||
wire [ISSUE_WIS_W-1:0] decode_wis = wid_to_wis(decode_if.data.wid);
|
||||
wire decode_lane_in_domain = (DOMAIN == WU_DOMAIN_TENSOR) ? `IS_TENSOR_WARP(decode_if.data.wid) : `IS_SCALAR_WARP(decode_if.data.wid);
|
||||
|
||||
assign decode_if.ready = ibuf_ready_in[decode_isw];
|
||||
assign decode_if.ready = decode_lane_in_domain && ibuf_ready_in[decode_isw];
|
||||
|
||||
`ifdef SIMULATION
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
|
||||
wire decode_probe_pc = (decode_if.data.PC >= 32'h80000240) && (decode_if.data.PC <= 32'h80000260);
|
||||
always @(posedge clk) begin
|
||||
if (!reset && (CORE_ID == 0) && ($time > `TRACE_STARTTIME) && decode_if.valid && decode_probe_pc) begin
|
||||
`TRACE(2, ("%d: core%0d-ibuffer-probe: wid=%0d, PC=0x%0h, valid=%b, ready=%b, isw=%0d, ibuf_ready=%b, tmask=%b, ex=0x%0h, op=0x%0h (#%0d)\n",
|
||||
$time, CORE_ID, decode_if.data.wid, decode_if.data.PC, decode_if.valid, decode_if.ready,
|
||||
decode_isw, ibuf_ready_in, decode_if.data.tmask, decode_if.data.ex_type, decode_if.data.op_type,
|
||||
decode_if.data.uuid));
|
||||
end
|
||||
end
|
||||
`endif
|
||||
`endif
|
||||
|
||||
VX_ibuffer_if uop_sequencer_if [`ISSUE_WIDTH]();
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
localparam LANE_IN_DOMAIN = (DOMAIN == WU_DOMAIN_TENSOR) ? `IS_TENSOR_WARP(i) : `IS_SCALAR_WARP(i);
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (`IBUF_SIZE),
|
||||
@@ -46,7 +64,7 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
|
||||
) instr_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (decode_if.valid && decode_isw == i),
|
||||
.valid_in (decode_if.valid && decode_isw == i && decode_lane_in_domain && LANE_IN_DOMAIN),
|
||||
.ready_in (ibuf_ready_in[i]),
|
||||
.data_in ({
|
||||
decode_if.data.uuid,
|
||||
|
||||
@@ -173,6 +173,38 @@ module VX_int_unit #(
|
||||
.data_out ({branch_ctl_if.valid, branch_ctl_if.wid, branch_ctl_if.taken, branch_ctl_if.dest})
|
||||
);
|
||||
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
|
||||
always @(posedge clk) begin
|
||||
if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)) begin
|
||||
if (execute_if.valid
|
||||
&& ((execute_if.data.PC == 32'h80000010) || (execute_if.data.PC == 32'h80000014))) begin
|
||||
`TRACE(1, ("%d: core%0d-int-rsp-in: block=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, op=0x%0h, mod=%0d, is_br=%b, br_op=0x%0h, wb=%0d, rd=%0d, sop=%b, eop=%b (#%0d)\n",
|
||||
$time, CORE_ID, BLOCK_IDX, execute_if.valid, execute_if.ready,
|
||||
execute_if.data.wid, execute_if.data.PC, execute_if.data.op_type,
|
||||
execute_if.data.op_mod, is_br_op, br_op, execute_if.data.wb,
|
||||
execute_if.data.rd, execute_if.data.sop, execute_if.data.eop,
|
||||
execute_if.data.uuid));
|
||||
end
|
||||
if (commit_if.valid
|
||||
&& ((PC_r == `XLEN'h80000010) || (PC_r == `XLEN'h80000014))) begin
|
||||
`TRACE(1, ("%d: core%0d-int-rsp-out: block=%0d, valid=%b, ready=%b, wid=%0d, PC=0x%0h, is_br=%b, br_op=0x%0h, br_enable=%b, br_taken=%b, br_dest=0x%0h, wb=%0d, rd=%0d, tensor=%b, sop=%b, eop=%b (#%0d)\n",
|
||||
$time, CORE_ID, BLOCK_IDX, commit_if.valid, commit_if.ready,
|
||||
commit_if.data.wid, PC_r, is_br_op_r, br_op_r, br_enable,
|
||||
br_taken, br_dest, commit_if.data.wb, commit_if.data.rd,
|
||||
commit_if.data.tensor, commit_if.data.sop, commit_if.data.eop,
|
||||
commit_if.data.uuid));
|
||||
end
|
||||
if (branch_ctl_if.valid
|
||||
&& (branch_ctl_if.wid == `NW_WIDTH'(0))) begin
|
||||
`TRACE(1, ("%d: core%0d-int-branch-out: block=%0d, wid=%0d, taken=%b, dest=0x%0h, commit_PC=0x%0h, commit_valid=%b, commit_ready=%b (#%0d)\n",
|
||||
$time, CORE_ID, BLOCK_IDX, branch_ctl_if.wid,
|
||||
branch_ctl_if.taken, branch_ctl_if.dest, commit_if.data.PC,
|
||||
commit_if.valid, commit_if.ready, commit_if.data.uuid));
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
assign commit_if.data.data[i] = (is_br_op_r && is_br_static) ? (PC_r + 4) : alu_result_r[i];
|
||||
end
|
||||
|
||||
@@ -15,7 +15,9 @@
|
||||
`include "VX_trace.vh"
|
||||
|
||||
module VX_issue import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0
|
||||
parameter CORE_ID = 0,
|
||||
parameter NUM_TENSOR_CORES = `NUM_TENSOR_WARPS,
|
||||
parameter DOMAIN = WU_DOMAIN_SCALAR
|
||||
) (
|
||||
`SCOPE_IO_DECL
|
||||
|
||||
@@ -35,9 +37,12 @@ module VX_issue import VX_gpu_pkg::*; #(
|
||||
VX_dispatch_if.master fpu_dispatch_if [`ISSUE_WIDTH],
|
||||
`endif
|
||||
`ifdef EXT_T_ENABLE
|
||||
VX_dispatch_if.master tensor_alu_dispatch_if [`ISSUE_WIDTH],
|
||||
VX_dispatch_if.master tensor_lsu_dispatch_if [`ISSUE_WIDTH],
|
||||
VX_dispatch_if.master tensor_ctrl_dispatch_if [`ISSUE_WIDTH],
|
||||
VX_dispatch_if.master tensor_dispatch_if [`ISSUE_WIDTH],
|
||||
`ifdef EXT_T_HOPPER
|
||||
VX_tc_rf_if.slave tensor_regfile_if,
|
||||
`ifdef EXT_T_ASYNC
|
||||
VX_tc_rf_if.slave tensor_regfile_if[NUM_TENSOR_CORES],
|
||||
`endif
|
||||
`endif
|
||||
VX_dispatch_if.master sfu_dispatch_if [`ISSUE_WIDTH]
|
||||
@@ -52,7 +57,8 @@ module VX_issue import VX_gpu_pkg::*; #(
|
||||
`RESET_RELAY (dispatch_reset, reset);
|
||||
|
||||
VX_ibuffer #(
|
||||
.CORE_ID (CORE_ID)
|
||||
.CORE_ID (CORE_ID),
|
||||
.DOMAIN (DOMAIN)
|
||||
) ibuffer (
|
||||
.clk (clk),
|
||||
.reset (ibuf_reset),
|
||||
@@ -61,7 +67,8 @@ module VX_issue import VX_gpu_pkg::*; #(
|
||||
);
|
||||
|
||||
VX_scoreboard #(
|
||||
.CORE_ID (CORE_ID)
|
||||
.CORE_ID (CORE_ID),
|
||||
.DOMAIN (DOMAIN)
|
||||
) scoreboard (
|
||||
.clk (clk),
|
||||
.reset (scoreboard_reset),
|
||||
@@ -84,13 +91,15 @@ module VX_issue import VX_gpu_pkg::*; #(
|
||||
VX_operands #(
|
||||
`endif
|
||||
.CORE_ID (CORE_ID),
|
||||
.CACHE_ENABLE (0)
|
||||
.CACHE_ENABLE (0),
|
||||
.NUM_TENSOR_CORES (NUM_TENSOR_CORES),
|
||||
.DOMAIN (DOMAIN)
|
||||
) operands (
|
||||
.clk (clk),
|
||||
.reset (operands_reset),
|
||||
.writeback_if (writeback_if),
|
||||
.scoreboard_if (scoreboard_if),
|
||||
`ifdef EXT_T_HOPPER
|
||||
`ifdef EXT_T_ASYNC
|
||||
.operands_if (operands_if),
|
||||
.tensor_regfile_if (tensor_regfile_if)
|
||||
`else
|
||||
@@ -99,7 +108,8 @@ module VX_issue import VX_gpu_pkg::*; #(
|
||||
);
|
||||
|
||||
VX_dispatch #(
|
||||
.CORE_ID (CORE_ID)
|
||||
.CORE_ID (CORE_ID),
|
||||
.DOMAIN (DOMAIN)
|
||||
) dispatch (
|
||||
.clk (clk),
|
||||
.reset (dispatch_reset),
|
||||
@@ -116,11 +126,52 @@ module VX_issue import VX_gpu_pkg::*; #(
|
||||
.fpu_dispatch_if(fpu_dispatch_if),
|
||||
`endif
|
||||
`ifdef EXT_T_ENABLE
|
||||
.tensor_alu_dispatch_if(tensor_alu_dispatch_if),
|
||||
.tensor_lsu_dispatch_if(tensor_lsu_dispatch_if),
|
||||
.tensor_ctrl_dispatch_if(tensor_ctrl_dispatch_if),
|
||||
.tensor_dispatch_if(tensor_dispatch_if),
|
||||
`endif
|
||||
.sfu_dispatch_if(sfu_dispatch_if)
|
||||
);
|
||||
|
||||
wire decode_fire = decode_if.valid && decode_if.ready;
|
||||
`RUNTIME_ASSERT(
|
||||
!decode_fire ||
|
||||
((DOMAIN == WU_DOMAIN_TENSOR) ? `IS_TENSOR_WARP(decode_if.data.wid) : `IS_SCALAR_WARP(decode_if.data.wid)),
|
||||
("%t: *** core%0d-issue-domain-crossing domain=%0d wid=%0d PC=0x%0h",
|
||||
$time, CORE_ID, DOMAIN, decode_if.data.wid, decode_if.data.PC)
|
||||
)
|
||||
|
||||
`ifdef SIMULATION
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
wire ibuf_probe_pc = ibuffer_if[i].valid
|
||||
&& (ibuffer_if[i].data.PC >= 32'h80000240)
|
||||
&& (ibuffer_if[i].data.PC <= 32'h80000260);
|
||||
wire scb_probe_pc = scoreboard_if[i].valid
|
||||
&& (scoreboard_if[i].data.PC >= 32'h80000240)
|
||||
&& (scoreboard_if[i].data.PC <= 32'h80000260);
|
||||
wire ops_probe_pc = operands_if[i].valid
|
||||
&& (operands_if[i].data.PC >= 32'h80000240)
|
||||
&& (operands_if[i].data.PC <= 32'h80000260);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (!reset && (CORE_ID == 0) && ($time > `TRACE_STARTTIME) && (ibuf_probe_pc || scb_probe_pc || ops_probe_pc)) begin
|
||||
`TRACE(2, ("%d: core%0d-issue-probe: isw=%0d, ibuf=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, scb=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, ops=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, alu=%b/%b lsu=%b/%b sfu=%b/%b (#ibuf=%0d #scb=%0d #ops=%0d)\n",
|
||||
$time, CORE_ID, i,
|
||||
ibuffer_if[i].valid, ibuffer_if[i].ready, ibuffer_if[i].data.PC, ibuffer_if[i].data.ex_type, ibuffer_if[i].data.op_type,
|
||||
scoreboard_if[i].valid, scoreboard_if[i].ready, scoreboard_if[i].data.PC, scoreboard_if[i].data.ex_type, scoreboard_if[i].data.op_type,
|
||||
operands_if[i].valid, operands_if[i].ready, operands_if[i].data.PC, operands_if[i].data.ex_type, operands_if[i].data.op_type,
|
||||
alu_dispatch_if[i].valid, alu_dispatch_if[i].ready,
|
||||
lsu_dispatch_if[i].valid, lsu_dispatch_if[i].ready,
|
||||
sfu_dispatch_if[i].valid, sfu_dispatch_if[i].ready,
|
||||
ibuffer_if[i].data.uuid, scoreboard_if[i].data.uuid, operands_if[i].data.uuid));
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifdef DBG_SCOPE_ISSUE
|
||||
if (CORE_ID == 0) begin
|
||||
`ifdef SCOPE
|
||||
|
||||
@@ -17,7 +17,9 @@
|
||||
|
||||
module VX_operands import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0,
|
||||
parameter CACHE_ENABLE = 0
|
||||
parameter CACHE_ENABLE = 0,
|
||||
parameter NUM_TENSOR_CORES = `NUM_TENSOR_WARPS,
|
||||
parameter DOMAIN = WU_DOMAIN_SCALAR
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
@@ -27,6 +29,8 @@ module VX_operands import VX_gpu_pkg::*; #(
|
||||
VX_operands_if.master operands_if [`ISSUE_WIDTH]
|
||||
);
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
`UNUSED_PARAM (NUM_TENSOR_CORES)
|
||||
`UNUSED_PARAM (DOMAIN)
|
||||
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS;
|
||||
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO);
|
||||
|
||||
@@ -156,7 +160,7 @@ module VX_operands import VX_gpu_pkg::*; #(
|
||||
end
|
||||
endcase
|
||||
|
||||
if (CACHE_ENABLE != 0 && writeback_if[i].valid) begin
|
||||
if (CACHE_ENABLE != 0 && writeback_if[i].valid && !writeback_if[i].data.tensor) begin
|
||||
if ((cache_reg[writeback_if[i].data.wis] == writeback_if[i].data.rd)
|
||||
|| (cache_eop[writeback_if[i].data.wis] && writeback_if[i].data.sop)) begin
|
||||
for (integer j = 0; j < `NUM_THREADS; ++j) begin
|
||||
@@ -288,9 +292,9 @@ module VX_operands import VX_gpu_pkg::*; #(
|
||||
.read (1'b1),
|
||||
`UNUSED_PIN (wren),
|
||||
`ifdef GPR_RESET
|
||||
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||
.write (wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
|
||||
`else
|
||||
.write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||
.write (writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
|
||||
`endif
|
||||
.waddr (gpr_wr_addr),
|
||||
.wdata (writeback_if[i].data.data[j]),
|
||||
|
||||
@@ -12,26 +12,59 @@
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
`include "VX_trace.vh"
|
||||
|
||||
`ifdef GPR_DUPLICATED
|
||||
|
||||
module VX_operands_dup import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0,
|
||||
parameter CACHE_ENABLE = 0
|
||||
parameter CACHE_ENABLE = 0,
|
||||
parameter NUM_TENSOR_CORES = `NUM_TENSOR_WARPS,
|
||||
parameter DOMAIN = WU_DOMAIN_SCALAR
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
|
||||
VX_ibuffer_if.slave scoreboard_if [`ISSUE_WIDTH],
|
||||
`ifdef EXT_T_HOPPER
|
||||
VX_tc_rf_if.slave tensor_regfile_if,
|
||||
`ifdef EXT_T_ASYNC
|
||||
VX_tc_rf_if.slave tensor_regfile_if[NUM_TENSOR_CORES],
|
||||
`endif
|
||||
VX_operands_if.master operands_if [`ISSUE_WIDTH]
|
||||
);
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS;
|
||||
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO);
|
||||
`ifdef EXT_F_ENABLE
|
||||
localparam TENSOR_NUM_REGS = `TENSOR_NUM_GPRS + `TENSOR_NUM_FPRS;
|
||||
`else
|
||||
localparam TENSOR_NUM_REGS = `TENSOR_NUM_GPRS;
|
||||
`endif
|
||||
|
||||
function automatic [RAM_ADDRW-1:0] wu_rf_addr (
|
||||
input logic [ISSUE_WIS_W-1:0] wis,
|
||||
input logic [`NR_BITS-1:0] reg_id,
|
||||
input logic is_tensor
|
||||
);
|
||||
if (is_tensor) begin
|
||||
`ifdef EXT_F_ENABLE
|
||||
if (reg_id[`NR_BITS-1]) begin
|
||||
wu_rf_addr = (RAM_ADDRW'(wis) * RAM_ADDRW'(TENSOR_NUM_REGS))
|
||||
+ RAM_ADDRW'(`TENSOR_NUM_GPRS)
|
||||
+ RAM_ADDRW'(reg_id[`NRI_BITS-1:0]);
|
||||
end else begin
|
||||
wu_rf_addr = (RAM_ADDRW'(wis) * RAM_ADDRW'(TENSOR_NUM_REGS))
|
||||
+ RAM_ADDRW'(reg_id[`NRI_BITS-1:0]);
|
||||
end
|
||||
`else
|
||||
wu_rf_addr = (RAM_ADDRW'(wis) * RAM_ADDRW'(TENSOR_NUM_REGS))
|
||||
+ RAM_ADDRW'(reg_id);
|
||||
`endif
|
||||
end else begin
|
||||
wu_rf_addr = (RAM_ADDRW'(wis) * RAM_ADDRW'(`NUM_REGS))
|
||||
+ RAM_ADDRW'(reg_id);
|
||||
end
|
||||
endfunction
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
logic [`ISSUE_WIDTH-1:0][`PERF_CTR_BITS-1:0] perf_rf_read_per_warp;
|
||||
@@ -48,19 +81,52 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
||||
|
||||
wire tc_rf_valid [`ISSUE_WIDTH];
|
||||
wire [`LOG2UP(`NUM_REGS * ISSUE_RATIO)-1:0] tc_rf_addr [`ISSUE_WIDTH];
|
||||
wire [ISSUE_WIS_W-1:0] tc_rf_wis [`ISSUE_WIDTH];
|
||||
// FIXME: don't need full ISSUE_WIDTH; only one warp is read at a time
|
||||
// because NUM_BLOCKS == 1
|
||||
wire [`NUM_THREADS-1:0][`XLEN-1:0] tc_rf_data [`ISSUE_WIDTH];
|
||||
|
||||
`ifdef EXT_T_HOPPER
|
||||
`ifdef EXT_T_ASYNC
|
||||
`STATIC_ASSERT((ISSUE_RATIO == 1),
|
||||
("static assertion failed: tensor core only supports ISSUE_RATIO == 1"))
|
||||
assign tc_rf_valid = '{`ISSUE_WIDTH{tensor_regfile_if.req_valid}};
|
||||
assign tc_rf_addr = '{`ISSUE_WIDTH{tensor_regfile_if.req_data.rs}};
|
||||
assign tensor_regfile_if.rsp_data.data = tc_rf_data[0];
|
||||
`STATIC_ASSERT((NUM_TENSOR_CORES == `NUM_TENSOR_WARPS),
|
||||
("Wu tensor RF binding requires NUM_TENSOR_CORES == NUM_TENSOR_WARPS"))
|
||||
for (genvar tc = 0; tc < NUM_TENSOR_CORES; ++tc) begin : g_tc_rf_rsp
|
||||
localparam TENSOR_ISW = `NUM_SCALAR_WARPS + tc;
|
||||
assign tensor_regfile_if[tc].rsp_data.data = tc_rf_data[TENSOR_ISW];
|
||||
end
|
||||
|
||||
for (genvar rf_i = 0; rf_i < `ISSUE_WIDTH; ++rf_i) begin : g_tc_rf_req
|
||||
if ((DOMAIN == WU_DOMAIN_TENSOR) && `IS_TENSOR_WARP(rf_i)) begin
|
||||
localparam TC_IDX = rf_i - `NUM_SCALAR_WARPS;
|
||||
assign tc_rf_valid[rf_i] = (TC_IDX < NUM_TENSOR_CORES) ? tensor_regfile_if[TC_IDX].req_valid : 1'b0;
|
||||
assign tc_rf_addr[rf_i] = (TC_IDX < NUM_TENSOR_CORES) ? tensor_regfile_if[TC_IDX].req_data.rs : '0;
|
||||
assign tc_rf_wis[rf_i] = (TC_IDX < NUM_TENSOR_CORES) ? tensor_regfile_if[TC_IDX].req_data.wis : '0;
|
||||
end else begin
|
||||
assign tc_rf_valid[rf_i] = 1'b0;
|
||||
assign tc_rf_addr[rf_i] = '0;
|
||||
assign tc_rf_wis[rf_i] = '0;
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
localparam LANE_IN_DOMAIN = (DOMAIN == WU_DOMAIN_TENSOR) ? `IS_TENSOR_WARP(i) : `IS_SCALAR_WARP(i);
|
||||
// Wu physical RF partition: tensor lanes instantiate a smaller RF,
|
||||
// while scalar lanes retain the full scalar register file.
|
||||
localparam GPR_RAM_SIZE = (!LANE_IN_DOMAIN ? 1 : (`IS_TENSOR_WARP(i) ? (TENSOR_NUM_REGS * ISSUE_RATIO) : (`NUM_REGS * ISSUE_RATIO)));
|
||||
localparam GPR_RAM_ADDRW = `LOG2UP(GPR_RAM_SIZE);
|
||||
|
||||
function automatic [GPR_RAM_ADDRW-1:0] lane_rf_addr (
|
||||
input logic [ISSUE_WIS_W-1:0] wis,
|
||||
input logic [`NR_BITS-1:0] reg_id
|
||||
);
|
||||
logic [RAM_ADDRW-1:0] full_addr;
|
||||
begin
|
||||
full_addr = wu_rf_addr(wis, reg_id, `IS_TENSOR_WARP(i));
|
||||
lane_rf_addr = full_addr[GPR_RAM_ADDRW-1:0];
|
||||
end
|
||||
endfunction
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
@@ -81,7 +147,7 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
||||
scoreboard_if[i].data.imm,
|
||||
scoreboard_if[i].data.rd
|
||||
};
|
||||
scoreboard_if_stored_valid[i] <= scoreboard_if[i].valid && scoreboard_if[i].ready;
|
||||
scoreboard_if_stored_valid[i] <= LANE_IN_DOMAIN && scoreboard_if[i].valid && scoreboard_if[i].ready;
|
||||
end
|
||||
end
|
||||
|
||||
@@ -116,11 +182,51 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
||||
`UNUSED_PIN (alm_full),
|
||||
.size (size1[i])
|
||||
);
|
||||
assign operands_if[i].valid = ~empty1[i];
|
||||
`ifdef EXT_T_HOPPER
|
||||
assign scoreboard_if[i].ready = (size1[i] < 3'd2) && ~tc_rf_valid[i];
|
||||
assign operands_if[i].valid = LANE_IN_DOMAIN && ~empty1[i];
|
||||
`ifdef EXT_T_ASYNC
|
||||
assign scoreboard_if[i].ready = LANE_IN_DOMAIN && (size1[i] < 3'd2) && ~tc_rf_valid[i];
|
||||
`else
|
||||
assign scoreboard_if[i].ready = (size1[i] < 3'd2);
|
||||
assign scoreboard_if[i].ready = LANE_IN_DOMAIN && (size1[i] < 3'd2);
|
||||
`endif
|
||||
|
||||
`ifdef SIMULATION
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
|
||||
always @(posedge clk) begin
|
||||
if (!reset && (CORE_ID == 0) && ($time > `TRACE_STARTTIME)) begin
|
||||
if ((scoreboard_if[i].valid &&
|
||||
(scoreboard_if[i].data.PC >= `XLEN'h80000240) &&
|
||||
(scoreboard_if[i].data.PC <= `XLEN'h80000260)) ||
|
||||
(operands_if[i].valid &&
|
||||
(operands_if[i].data.PC >= `XLEN'h80000240) &&
|
||||
(operands_if[i].data.PC <= `XLEN'h80000260))
|
||||
`ifdef EXT_T_ASYNC
|
||||
|| tc_rf_valid[i]
|
||||
`endif
|
||||
) begin
|
||||
`ifdef EXT_T_ASYNC
|
||||
`TRACE(2, ("%d: core%0d-operands-probe: isw=%0d, scb=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, ops=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, size1=%0d, empty1=%b, full1=%b, tc_rf_valid=%b, tc_rf_rs=%0d, tc_rf_wis=%0d (#scb=%0d #ops=%0d)\n",
|
||||
$time, CORE_ID, i,
|
||||
scoreboard_if[i].valid, scoreboard_if[i].ready, scoreboard_if[i].data.PC,
|
||||
scoreboard_if[i].data.ex_type, scoreboard_if[i].data.op_type,
|
||||
operands_if[i].valid, operands_if[i].ready, operands_if[i].data.PC,
|
||||
operands_if[i].data.ex_type, operands_if[i].data.op_type,
|
||||
size1[i], empty1[i], full1[i],
|
||||
tc_rf_valid[i], tc_rf_addr[i], tc_rf_wis[i],
|
||||
scoreboard_if[i].data.uuid, operands_if[i].data.uuid));
|
||||
`else
|
||||
`TRACE(2, ("%d: core%0d-operands-probe: isw=%0d, scb=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, ops=%b/%b PC=0x%0h ex=0x%0h op=0x%0h, size1=%0d, empty1=%b, full1=%b (#scb=%0d #ops=%0d)\n",
|
||||
$time, CORE_ID, i,
|
||||
scoreboard_if[i].valid, scoreboard_if[i].ready, scoreboard_if[i].data.PC,
|
||||
scoreboard_if[i].data.ex_type, scoreboard_if[i].data.op_type,
|
||||
operands_if[i].valid, operands_if[i].ready, operands_if[i].data.PC,
|
||||
operands_if[i].data.ex_type, operands_if[i].data.op_type,
|
||||
size1[i], empty1[i], full1[i],
|
||||
scoreboard_if[i].data.uuid, operands_if[i].data.uuid));
|
||||
`endif
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
`endif
|
||||
|
||||
// assert (full1[i] == full2[i]);
|
||||
@@ -130,9 +236,9 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
||||
wire [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data;
|
||||
wire [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data;
|
||||
|
||||
reg [RAM_ADDRW-1:0] gpr_rd_addr_rs1_stored;
|
||||
reg [RAM_ADDRW-1:0] gpr_rd_addr_rs2_stored;
|
||||
reg [RAM_ADDRW-1:0] gpr_rd_addr_rs3_stored;
|
||||
reg [GPR_RAM_ADDRW-1:0] gpr_rd_addr_rs1_stored;
|
||||
reg [GPR_RAM_ADDRW-1:0] gpr_rd_addr_rs2_stored;
|
||||
reg [GPR_RAM_ADDRW-1:0] gpr_rd_addr_rs3_stored;
|
||||
|
||||
for (genvar j = 0; j < `NUM_THREADS; ++j) begin
|
||||
VX_fifo_queue #(
|
||||
@@ -161,17 +267,17 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
||||
`UNUSED_PIN (alm_full),
|
||||
`UNUSED_PIN (size)
|
||||
);
|
||||
`ifdef EXT_T_HOPPER
|
||||
`ifdef EXT_T_ASYNC
|
||||
assign tc_rf_data[i][j] = rs3_data[j];
|
||||
`endif
|
||||
end
|
||||
|
||||
// GPR banks
|
||||
|
||||
wire [RAM_ADDRW-1:0] gpr_rd_addr_rs1;
|
||||
wire [RAM_ADDRW-1:0] gpr_rd_addr_rs2;
|
||||
wire [RAM_ADDRW-1:0] gpr_rd_addr_rs3;
|
||||
wire [RAM_ADDRW-1:0] gpr_wr_addr;
|
||||
wire [GPR_RAM_ADDRW-1:0] gpr_rd_addr_rs1;
|
||||
wire [GPR_RAM_ADDRW-1:0] gpr_rd_addr_rs2;
|
||||
wire [GPR_RAM_ADDRW-1:0] gpr_rd_addr_rs3;
|
||||
wire [GPR_RAM_ADDRW-1:0] gpr_wr_addr;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
@@ -186,13 +292,13 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
||||
end
|
||||
|
||||
if (ISSUE_WIS != 0) begin
|
||||
assign gpr_wr_addr = {writeback_if[i].data.wis, writeback_if[i].data.rd};
|
||||
assign gpr_rd_addr_rs1 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs1};
|
||||
assign gpr_rd_addr_rs2 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs2};
|
||||
`ifdef EXT_T_HOPPER
|
||||
assign gpr_rd_addr_rs3 = tc_rf_valid[i] ? tc_rf_addr[i] : {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3};
|
||||
assign gpr_wr_addr = lane_rf_addr(writeback_if[i].data.wis, writeback_if[i].data.rd);
|
||||
assign gpr_rd_addr_rs1 = lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs1);
|
||||
assign gpr_rd_addr_rs2 = lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs2);
|
||||
`ifdef EXT_T_ASYNC
|
||||
assign gpr_rd_addr_rs3 = tc_rf_valid[i] ? lane_rf_addr(tc_rf_wis[i], tc_rf_addr[i][`NR_BITS-1:0]) : lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3);
|
||||
`else
|
||||
assign gpr_rd_addr_rs3 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3};
|
||||
assign gpr_rd_addr_rs3 = lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3);
|
||||
`endif
|
||||
// always @(posedge clk) begin
|
||||
// if (reset) begin
|
||||
@@ -209,13 +315,13 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
||||
// end
|
||||
// end
|
||||
end else begin
|
||||
assign gpr_wr_addr = writeback_if[i].data.rd;
|
||||
assign gpr_rd_addr_rs1 = scoreboard_if[i].data.rs1;
|
||||
assign gpr_rd_addr_rs2 = scoreboard_if[i].data.rs2;
|
||||
`ifdef EXT_T_HOPPER
|
||||
assign gpr_rd_addr_rs3 = tc_rf_valid[i] ? tc_rf_addr[i] : scoreboard_if[i].data.rs3;
|
||||
assign gpr_wr_addr = lane_rf_addr(writeback_if[i].data.wis, writeback_if[i].data.rd);
|
||||
assign gpr_rd_addr_rs1 = lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs1);
|
||||
assign gpr_rd_addr_rs2 = lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs2);
|
||||
`ifdef EXT_T_ASYNC
|
||||
assign gpr_rd_addr_rs3 = tc_rf_valid[i] ? lane_rf_addr(tc_rf_wis[i], tc_rf_addr[i][`NR_BITS-1:0]) : lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3);
|
||||
`else
|
||||
assign gpr_rd_addr_rs3 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3};
|
||||
assign gpr_rd_addr_rs3 = lane_rf_addr(scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3);
|
||||
`endif
|
||||
// always @(posedge clk) begin
|
||||
// if (reset) begin
|
||||
@@ -251,7 +357,8 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
||||
for (genvar j = 0; j < `NUM_THREADS; ++j) begin
|
||||
VX_dp_ram #(
|
||||
.DATAW (`XLEN),
|
||||
.SIZE (`NUM_REGS * ISSUE_RATIO),
|
||||
.SIZE (GPR_RAM_SIZE),
|
||||
.ADDRW (GPR_RAM_ADDRW),
|
||||
.OUT_REG (1),
|
||||
`ifdef GPR_RESET
|
||||
.INIT_ENABLE (1),
|
||||
@@ -260,16 +367,12 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
||||
.NO_RWCHECK (1)
|
||||
) gpr_ram_rs1 (
|
||||
.clk (clk),
|
||||
`ifdef EXT_T_HOPPER
|
||||
.read (~tc_rf_valid[i]),
|
||||
`else
|
||||
.read (1'b1),
|
||||
`endif
|
||||
.read (LANE_IN_DOMAIN && scoreboard_if[i].valid && scoreboard_if[i].ready), // tc read valid check incl. in ready
|
||||
`UNUSED_PIN (wren),
|
||||
`ifdef GPR_RESET
|
||||
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||
.write (LANE_IN_DOMAIN && wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
|
||||
`else
|
||||
.write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||
.write (LANE_IN_DOMAIN && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
|
||||
`endif
|
||||
.waddr (gpr_wr_addr),
|
||||
.wdata (writeback_if[i].data.data[j]),
|
||||
@@ -279,7 +382,8 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
||||
|
||||
VX_dp_ram #(
|
||||
.DATAW (`XLEN),
|
||||
.SIZE (`NUM_REGS * ISSUE_RATIO),
|
||||
.SIZE (GPR_RAM_SIZE),
|
||||
.ADDRW (GPR_RAM_ADDRW),
|
||||
.OUT_REG (1),
|
||||
`ifdef GPR_RESET
|
||||
.INIT_ENABLE (1),
|
||||
@@ -288,16 +392,12 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
||||
.NO_RWCHECK (1)
|
||||
) gpr_ram_rs2(
|
||||
.clk (clk),
|
||||
`ifdef EXT_T_HOPPER
|
||||
.read (~tc_rf_valid[i]),
|
||||
`else
|
||||
.read (1'b1),
|
||||
`endif
|
||||
.read (LANE_IN_DOMAIN && scoreboard_if[i].valid && scoreboard_if[i].ready), // tc read valid check incl. in ready
|
||||
`UNUSED_PIN (wren),
|
||||
`ifdef GPR_RESET
|
||||
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||
.write (LANE_IN_DOMAIN && wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
|
||||
`else
|
||||
.write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||
.write (LANE_IN_DOMAIN && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
|
||||
`endif
|
||||
.waddr (gpr_wr_addr),
|
||||
.wdata (writeback_if[i].data.data[j]),
|
||||
@@ -307,7 +407,8 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
||||
|
||||
VX_dp_ram #(
|
||||
.DATAW (`XLEN),
|
||||
.SIZE (`NUM_REGS * ISSUE_RATIO),
|
||||
.SIZE (GPR_RAM_SIZE),
|
||||
.ADDRW (GPR_RAM_ADDRW),
|
||||
.OUT_REG (1),
|
||||
`ifdef GPR_RESET
|
||||
.INIT_ENABLE (1),
|
||||
@@ -316,12 +417,16 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
||||
.NO_RWCHECK (1)
|
||||
) gpr_ram_rs3 (
|
||||
.clk (clk),
|
||||
.read (1'b1),
|
||||
`ifdef EXT_T_ASYNC
|
||||
.read (LANE_IN_DOMAIN && ((scoreboard_if[i].valid && scoreboard_if[i].ready) || tc_rf_valid[i])),
|
||||
`else
|
||||
.read (LANE_IN_DOMAIN && scoreboard_if[i].valid && scoreboard_if[i].ready),
|
||||
`endif
|
||||
`UNUSED_PIN (wren),
|
||||
`ifdef GPR_RESET
|
||||
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||
.write (LANE_IN_DOMAIN && wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
|
||||
`else
|
||||
.write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||
.write (LANE_IN_DOMAIN && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]),
|
||||
`endif
|
||||
.waddr (gpr_wr_addr),
|
||||
.wdata (writeback_if[i].data.data[j]),
|
||||
@@ -330,9 +435,9 @@ module VX_operands_dup import VX_gpu_pkg::*; #(
|
||||
);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
assign perf_write_rs1_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
|
||||
assign perf_write_rs2_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
|
||||
assign perf_write_rs3_per_thread[j] = (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]);
|
||||
assign perf_write_rs1_per_thread[j] = `PERF_CTR_BITS'(wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]);
|
||||
assign perf_write_rs2_per_thread[j] = `PERF_CTR_BITS'(wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]);
|
||||
assign perf_write_rs3_per_thread[j] = `PERF_CTR_BITS'(wr_enabled && writeback_if[i].valid && !writeback_if[i].data.tensor && writeback_if[i].data.tmask[j]);
|
||||
`endif
|
||||
end
|
||||
|
||||
|
||||
@@ -14,7 +14,8 @@
|
||||
`include "VX_define.vh"
|
||||
|
||||
module VX_schedule import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0
|
||||
parameter CORE_ID = 0,
|
||||
parameter NUM_BRANCHES = `NUM_ALU_BLOCKS
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
@@ -28,12 +29,20 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
|
||||
// inputsdecode_if
|
||||
VX_warp_ctl_if.slave warp_ctl_if,
|
||||
VX_branch_ctl_if.slave branch_ctl_if [`NUM_ALU_BLOCKS],
|
||||
VX_branch_ctl_if.slave branch_ctl_if [NUM_BRANCHES],
|
||||
VX_decode_sched_if.slave decode_sched_if,
|
||||
VX_commit_sched_if.slave commit_sched_if,
|
||||
`ifdef EXT_T_ENABLE
|
||||
input wire tensor_csr_unlock_valid,
|
||||
input wire [`NW_WIDTH-1:0] tensor_csr_unlock_wid,
|
||||
input wire tensor_tmc_valid,
|
||||
input wire [`NW_WIDTH-1:0] tensor_tmc_wid,
|
||||
input wire [`NUM_THREADS-1:0] tensor_tmc_tmask,
|
||||
`endif
|
||||
|
||||
// outputs
|
||||
VX_schedule_if.master schedule_if,
|
||||
VX_schedule_if.master scalar_schedule_if,
|
||||
VX_schedule_if.master tensor_schedule_if,
|
||||
`ifdef GBAR_ENABLE
|
||||
VX_gbar_bus_if.master gbar_bus_if,
|
||||
`endif
|
||||
@@ -50,11 +59,10 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
reg [`NUM_WARPS-1:0][`NUM_THREADS-1:0] thread_masks, thread_masks_n;
|
||||
reg [`NUM_WARPS-1:0][`XLEN-1:0] warp_pcs, warp_pcs_n;
|
||||
|
||||
wire [`NW_WIDTH-1:0] schedule_wid;
|
||||
wire [`NUM_THREADS-1:0] schedule_tmask;
|
||||
wire [`XLEN-1:0] schedule_pc;
|
||||
wire schedule_valid;
|
||||
wire schedule_ready;
|
||||
wire scalar_schedule_fire = scalar_schedule_if.valid && scalar_schedule_if.ready;
|
||||
wire tensor_schedule_fire = tensor_schedule_if.valid && tensor_schedule_if.ready;
|
||||
wire schedule_fire_any = scalar_schedule_fire || tensor_schedule_fire;
|
||||
wire [`NW_WIDTH-1:0] schedule_fire_wid = tensor_schedule_fire ? tensor_schedule_if.data.wid : scalar_schedule_if.data.wid;
|
||||
|
||||
// split/join
|
||||
wire join_valid;
|
||||
@@ -68,15 +76,14 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
|
||||
reg [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] issued_instrs;
|
||||
|
||||
wire schedule_fire = schedule_valid && schedule_ready;
|
||||
wire schedule_if_fire = schedule_if.valid && schedule_if.ready;
|
||||
wire schedule_if_fire = schedule_fire_any;
|
||||
|
||||
// branch
|
||||
wire [`NUM_ALU_BLOCKS-1:0] branch_valid;
|
||||
wire [`NUM_ALU_BLOCKS-1:0][`NW_WIDTH-1:0] branch_wid;
|
||||
wire [`NUM_ALU_BLOCKS-1:0] branch_taken;
|
||||
wire [`NUM_ALU_BLOCKS-1:0][`XLEN-1:0] branch_dest;
|
||||
for (genvar i = 0; i < `NUM_ALU_BLOCKS; ++i) begin
|
||||
wire [NUM_BRANCHES-1:0] branch_valid;
|
||||
wire [NUM_BRANCHES-1:0][`NW_WIDTH-1:0] branch_wid;
|
||||
wire [NUM_BRANCHES-1:0] branch_taken;
|
||||
wire [NUM_BRANCHES-1:0][`XLEN-1:0] branch_dest;
|
||||
for (genvar i = 0; i < NUM_BRANCHES; ++i) begin
|
||||
assign branch_valid[i] = branch_ctl_if[i].valid;
|
||||
assign branch_wid[i] = branch_ctl_if[i].wid;
|
||||
assign branch_taken[i] = branch_ctl_if[i].taken;
|
||||
@@ -87,7 +94,13 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
reg [`NUM_BARRIERS-1:0][`NUM_WARPS-1:0] barrier_masks, barrier_masks_n;
|
||||
reg [`NUM_WARPS-1:0] barrier_stalls, barrier_stalls_n;
|
||||
wire [`CLOG2(`NUM_WARPS+1)-1:0] active_barrier_count;
|
||||
wire [`NUM_WARPS-1:0] curr_barrier_mask;
|
||||
wire [`NUM_WARPS-1:0] curr_barrier_mask;
|
||||
wire [`NUM_WARPS-1:0] curr_barrier_mask_with_self;
|
||||
wire [`NUM_WARPS-1:0] scalar_warp_mask;
|
||||
wire [`NUM_WARPS-1:0] tensor_warp_mask;
|
||||
wire [`NUM_WARPS-1:0] barrier_domain_mask;
|
||||
wire [`NUM_WARPS-1:0] barrier_arrived_mask;
|
||||
wire [`CLOG2(`NUM_WARPS+1)-1:0] barrier_arrived_count;
|
||||
`ifdef GBAR_ENABLE
|
||||
reg [`NUM_WARPS-1:0] curr_barrier_mask_n;
|
||||
reg gbar_req_valid;
|
||||
@@ -95,8 +108,21 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
reg [`NC_WIDTH-1:0] gbar_req_size_m1;
|
||||
`endif
|
||||
|
||||
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
|
||||
assign scalar_warp_mask[i] = `IS_SCALAR_WARP(i);
|
||||
assign tensor_warp_mask[i] = `IS_TENSOR_WARP(i);
|
||||
end
|
||||
|
||||
assign curr_barrier_mask = barrier_masks[warp_ctl_if.barrier.id];
|
||||
assign curr_barrier_mask_with_self = curr_barrier_mask | (`NUM_WARPS'(1) << warp_ctl_if.wid);
|
||||
assign barrier_domain_mask =
|
||||
(warp_ctl_if.barrier.domain == BARRIER_SCALAR) ? (active_warps & scalar_warp_mask) :
|
||||
(warp_ctl_if.barrier.domain == BARRIER_TENSOR) ? (active_warps & tensor_warp_mask) :
|
||||
(warp_ctl_if.barrier.domain == BARRIER_MASK) ? (active_warps & warp_ctl_if.barrier.mask) :
|
||||
active_warps;
|
||||
assign barrier_arrived_mask = curr_barrier_mask_with_self & barrier_domain_mask;
|
||||
`POP_COUNT(active_barrier_count, curr_barrier_mask);
|
||||
`POP_COUNT(barrier_arrived_count, barrier_arrived_mask);
|
||||
`UNUSED_VAR (active_barrier_count)
|
||||
|
||||
always @(*) begin
|
||||
@@ -152,9 +178,11 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
`endif
|
||||
if (warp_ctl_if.valid && warp_ctl_if.barrier.valid) begin
|
||||
if (~warp_ctl_if.barrier.is_global
|
||||
&& (active_barrier_count[`NW_WIDTH-1:0] == warp_ctl_if.barrier.size_m1[`NW_WIDTH-1:0])) begin
|
||||
&& ((warp_ctl_if.barrier.domain == BARRIER_MASK)
|
||||
? ((barrier_arrived_mask & warp_ctl_if.barrier.mask) == warp_ctl_if.barrier.mask)
|
||||
: (barrier_arrived_count[`NW_WIDTH-1:0] == (warp_ctl_if.barrier.size_m1[`NW_WIDTH-1:0] + `NW_WIDTH'(1))))) begin
|
||||
barrier_masks_n[warp_ctl_if.barrier.id] = '0;
|
||||
barrier_stalls_n &= ~barrier_masks[warp_ctl_if.barrier.id];
|
||||
barrier_stalls_n &= ~barrier_arrived_mask;
|
||||
end else begin
|
||||
barrier_masks_n[warp_ctl_if.barrier.id][warp_ctl_if.wid] = 1;
|
||||
barrier_stalls_n[warp_ctl_if.wid] = 1;
|
||||
@@ -186,7 +214,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
`endif
|
||||
|
||||
// Branch handling
|
||||
for (integer i = 0; i < `NUM_ALU_BLOCKS; ++i) begin
|
||||
for (integer i = 0; i < NUM_BRANCHES; ++i) begin
|
||||
if (branch_valid[i]) begin
|
||||
if (branch_taken[i]) begin
|
||||
warp_pcs_n[branch_wid[i]] = branch_dest[i];
|
||||
@@ -205,14 +233,31 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
stalled_warps_n[sched_csr_if.unlock_wid] = 0;
|
||||
end
|
||||
|
||||
`ifdef EXT_T_ENABLE
|
||||
// Tensor control handles a minimal CSR-read/TMC subset without
|
||||
// reusing the scalar SFU.
|
||||
if (tensor_csr_unlock_valid) begin
|
||||
stalled_warps_n[tensor_csr_unlock_wid] = 0;
|
||||
end
|
||||
|
||||
if (tensor_tmc_valid) begin
|
||||
active_warps_n[tensor_tmc_wid] = (tensor_tmc_tmask != 0);
|
||||
thread_masks_n[tensor_tmc_wid] = tensor_tmc_tmask;
|
||||
stalled_warps_n[tensor_tmc_wid] = 0;
|
||||
end
|
||||
`endif
|
||||
|
||||
// stall the warp until decode stage
|
||||
if (schedule_fire) begin
|
||||
stalled_warps_n[schedule_wid] = 1;
|
||||
if (schedule_fire_any) begin
|
||||
stalled_warps_n[schedule_fire_wid] = 1;
|
||||
end
|
||||
|
||||
// advance PC
|
||||
if (schedule_if_fire) begin
|
||||
warp_pcs_n[schedule_if.data.wid] = schedule_if.data.PC + 4;
|
||||
if (scalar_schedule_fire) begin
|
||||
warp_pcs_n[scalar_schedule_if.data.wid] = scalar_schedule_if.data.PC + 4;
|
||||
end
|
||||
if (tensor_schedule_fire) begin
|
||||
warp_pcs_n[tensor_schedule_if.data.wid] = tensor_schedule_if.data.PC + 4;
|
||||
end
|
||||
end
|
||||
|
||||
@@ -251,9 +296,9 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
`ifdef GBAR_CLUSTER_ENABLE
|
||||
// engage cluster barrier as soon as the barrier count is
|
||||
// fulfilled, instead of requiring all warps to be synchronized
|
||||
&& (active_barrier_count[`NW_WIDTH-1:0] == warp_ctl_if.barrier.size_m1[`NW_WIDTH-1:0])) begin
|
||||
&& (barrier_arrived_count[`NW_WIDTH-1:0] == (warp_ctl_if.barrier.size_m1[`NW_WIDTH-1:0] + `NW_WIDTH'(1)))) begin
|
||||
`else
|
||||
&& (curr_barrier_mask_n == active_warps)) begin
|
||||
&& (barrier_arrived_mask == barrier_domain_mask)) begin
|
||||
`endif
|
||||
gbar_req_valid <= 1;
|
||||
gbar_req_id <= warp_ctl_if.barrier.id;
|
||||
@@ -264,8 +309,11 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
end
|
||||
`endif
|
||||
|
||||
if (schedule_if_fire) begin
|
||||
issued_instrs[schedule_if.data.wid] <= issued_instrs[schedule_if.data.wid] + `UUID_WIDTH'(1);
|
||||
if (scalar_schedule_fire) begin
|
||||
issued_instrs[scalar_schedule_if.data.wid] <= issued_instrs[scalar_schedule_if.data.wid] + `UUID_WIDTH'(1);
|
||||
end
|
||||
if (tensor_schedule_fire) begin
|
||||
issued_instrs[tensor_schedule_if.data.wid] <= issued_instrs[tensor_schedule_if.data.wid] + `UUID_WIDTH'(1);
|
||||
end
|
||||
|
||||
if (busy) begin
|
||||
@@ -309,15 +357,33 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
// schedule the next ready warp
|
||||
|
||||
wire [`NUM_WARPS-1:0] ready_warps = active_warps & ~(stalled_warps | barrier_stalls);
|
||||
wire [`NUM_WARPS-1:0] scalar_ready_warps = ready_warps & scalar_warp_mask;
|
||||
wire [`NUM_WARPS-1:0] tensor_ready_warps = ready_warps & tensor_warp_mask;
|
||||
wire [`NW_WIDTH-1:0] scalar_schedule_wid;
|
||||
wire [`NW_WIDTH-1:0] tensor_schedule_wid;
|
||||
wire scalar_schedule_valid;
|
||||
wire tensor_schedule_valid;
|
||||
wire scalar_schedule_ready;
|
||||
wire tensor_schedule_ready;
|
||||
|
||||
VX_lzc_rr #(
|
||||
.N (`NUM_WARPS)
|
||||
) wid_select (
|
||||
) scalar_wid_select (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.data_in (ready_warps),
|
||||
.data_out (schedule_wid),
|
||||
.valid_out (schedule_valid)
|
||||
.data_in (scalar_ready_warps),
|
||||
.data_out (scalar_schedule_wid),
|
||||
.valid_out (scalar_schedule_valid)
|
||||
);
|
||||
|
||||
VX_lzc_rr #(
|
||||
.N (`NUM_WARPS)
|
||||
) tensor_wid_select (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.data_in (tensor_ready_warps),
|
||||
.data_out (tensor_schedule_wid),
|
||||
.valid_out (tensor_schedule_valid)
|
||||
);
|
||||
|
||||
wire [`NUM_WARPS-1:0][(`NUM_THREADS + `XLEN)-1:0] schedule_data;
|
||||
@@ -325,47 +391,78 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
assign schedule_data[i] = {thread_masks[i], warp_pcs[i]};
|
||||
end
|
||||
|
||||
assign {schedule_tmask, schedule_pc} = {
|
||||
schedule_data[schedule_wid][(`NUM_THREADS + `XLEN)-1:(`NUM_THREADS + `XLEN)-4],
|
||||
schedule_data[schedule_wid][(`NUM_THREADS + `XLEN)-5:0]
|
||||
};
|
||||
|
||||
`ifndef NDEBUG
|
||||
localparam GNW_WIDTH = `LOG2UP(`NUM_CLUSTERS * `NUM_CORES * `NUM_WARPS);
|
||||
reg [`UUID_WIDTH-1:0] instr_uuid;
|
||||
wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(schedule_wid);
|
||||
`ifdef SV_DPI
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(1, 0, 0));
|
||||
end else if (schedule_fire) begin
|
||||
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(0, 32'(g_wid), 64'(schedule_pc)));
|
||||
end
|
||||
end
|
||||
function automatic [`UUID_WIDTH-1:0] schedule_uuid (
|
||||
input logic [`NW_WIDTH-1:0] wid,
|
||||
input logic [`XLEN-1:0] pc
|
||||
);
|
||||
logic [GNW_WIDTH-1:0] g_wid;
|
||||
begin
|
||||
g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(wid);
|
||||
schedule_uuid = `UUID_WIDTH'({g_wid, 16'(pc)});
|
||||
end
|
||||
endfunction
|
||||
`else
|
||||
wire [GNW_WIDTH+16-1:0] w_uuid = {g_wid, 16'(schedule_pc)};
|
||||
always @(*) begin
|
||||
instr_uuid = `UUID_WIDTH'(w_uuid);
|
||||
end
|
||||
`endif
|
||||
`else
|
||||
wire [`UUID_WIDTH-1:0] instr_uuid = '0;
|
||||
function automatic [`UUID_WIDTH-1:0] schedule_uuid (
|
||||
input logic [`NW_WIDTH-1:0] wid,
|
||||
input logic [`XLEN-1:0] pc
|
||||
);
|
||||
begin
|
||||
`UNUSED_VAR (wid)
|
||||
`UNUSED_VAR (pc)
|
||||
schedule_uuid = '0;
|
||||
end
|
||||
endfunction
|
||||
`endif
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (`NUM_THREADS + `XLEN + `NW_WIDTH)
|
||||
) out_buf (
|
||||
.DATAW (`NUM_THREADS + `XLEN + `NW_WIDTH),
|
||||
.SIZE (0)
|
||||
) scalar_out_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (schedule_valid),
|
||||
.ready_in (schedule_ready),
|
||||
.data_in ({schedule_tmask, schedule_pc, schedule_wid}),
|
||||
.data_out ({schedule_if.data.tmask, schedule_if.data.PC, schedule_if.data.wid}),
|
||||
.valid_out (schedule_if.valid),
|
||||
.ready_out (schedule_if.ready)
|
||||
.valid_in (!reset && scalar_schedule_valid),
|
||||
.ready_in (scalar_schedule_ready),
|
||||
.data_in ({schedule_data[scalar_schedule_wid], scalar_schedule_wid}),
|
||||
.data_out ({scalar_schedule_if.data.tmask, scalar_schedule_if.data.PC, scalar_schedule_if.data.wid}),
|
||||
.valid_out (scalar_schedule_if.valid),
|
||||
.ready_out (scalar_schedule_if.ready)
|
||||
);
|
||||
|
||||
assign schedule_if.data.uuid = instr_uuid;
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (`NUM_THREADS + `XLEN + `NW_WIDTH),
|
||||
.SIZE (0)
|
||||
) tensor_out_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (!reset && tensor_schedule_valid),
|
||||
.ready_in (tensor_schedule_ready),
|
||||
.data_in ({schedule_data[tensor_schedule_wid], tensor_schedule_wid}),
|
||||
.data_out ({tensor_schedule_if.data.tmask, tensor_schedule_if.data.PC, tensor_schedule_if.data.wid}),
|
||||
.valid_out (tensor_schedule_if.valid),
|
||||
.ready_out (tensor_schedule_if.ready)
|
||||
);
|
||||
|
||||
assign scalar_schedule_if.data.uuid = schedule_uuid(scalar_schedule_if.data.wid, scalar_schedule_if.data.PC);
|
||||
assign tensor_schedule_if.data.uuid = schedule_uuid(tensor_schedule_if.data.wid, tensor_schedule_if.data.PC);
|
||||
|
||||
`RUNTIME_ASSERT(
|
||||
!(scalar_schedule_fire && tensor_schedule_fire),
|
||||
("%t: *** core%0d-schedule-two-domain-fire-with-single-fetch", $time, CORE_ID)
|
||||
)
|
||||
|
||||
`RUNTIME_ASSERT(
|
||||
!scalar_schedule_if.valid || `IS_SCALAR_WARP(scalar_schedule_if.data.wid),
|
||||
("%t: *** core%0d-scalar-scheduler-issued-tensor-warp wid=%0d",
|
||||
$time, CORE_ID, scalar_schedule_if.data.wid)
|
||||
)
|
||||
|
||||
`RUNTIME_ASSERT(
|
||||
!tensor_schedule_if.valid || `IS_TENSOR_WARP(tensor_schedule_if.data.wid),
|
||||
("%t: *** core%0d-tensor-scheduler-issued-scalar-warp wid=%0d",
|
||||
$time, CORE_ID, tensor_schedule_if.data.wid)
|
||||
)
|
||||
|
||||
`RESET_RELAY (pending_instr_reset, reset);
|
||||
|
||||
@@ -377,8 +474,8 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
) pending_instr(
|
||||
.clk (clk),
|
||||
.reset (pending_instr_reset),
|
||||
.incr (schedule_if_fire),
|
||||
.incr_wid (schedule_if.data.wid),
|
||||
.incr (decode_sched_if.valid),
|
||||
.incr_wid (decode_sched_if.wid),
|
||||
.decr (commit_sched_if.committed),
|
||||
.decr_wid (commit_sched_if.committed_wid),
|
||||
.alm_empty_wid (sched_csr_if.alm_empty_wid),
|
||||
@@ -413,13 +510,30 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
end
|
||||
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps));
|
||||
|
||||
`RUNTIME_ASSERT(
|
||||
!(warp_ctl_if.valid && warp_ctl_if.barrier.valid) || barrier_domain_mask != '0,
|
||||
("%t: *** core%0d-invalid-barrier-empty-domain: wid=%0d id=%0d domain=%0d active=%b mask=%b",
|
||||
$time, CORE_ID, warp_ctl_if.wid, warp_ctl_if.barrier.id, warp_ctl_if.barrier.domain, active_warps, warp_ctl_if.barrier.mask)
|
||||
)
|
||||
|
||||
`RUNTIME_ASSERT(
|
||||
!(warp_ctl_if.valid && warp_ctl_if.barrier.valid) || barrier_domain_mask[warp_ctl_if.wid],
|
||||
("%t: *** core%0d-invalid-barrier-wid-domain: wid=%0d id=%0d domain=%0d active=%b mask=%b",
|
||||
$time, CORE_ID, warp_ctl_if.wid, warp_ctl_if.barrier.id, warp_ctl_if.barrier.domain, active_warps, warp_ctl_if.barrier.mask)
|
||||
)
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`PERF_CTR_BITS-1:0] perf_sched_idles;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_sched_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_sched_barrier_idles;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_scalar_sched_ready_cycles;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_tensor_sched_ready_cycles;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_scalar_sched_issued_cycles;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_tensor_sched_issued_cycles;
|
||||
|
||||
wire schedule_idle = ~schedule_valid;
|
||||
wire schedule_stall = schedule_if.valid && ~schedule_if.ready;
|
||||
wire schedule_idle = ~(scalar_schedule_if.valid || tensor_schedule_if.valid);
|
||||
wire schedule_stall = (scalar_schedule_if.valid && ~scalar_schedule_if.ready)
|
||||
|| (tensor_schedule_if.valid && ~tensor_schedule_if.ready);
|
||||
wire [`CLOG2(`NUM_WARPS+1)-1:0] schedule_barrier_idle;
|
||||
`POP_COUNT(schedule_barrier_idle, barrier_stalls);
|
||||
|
||||
@@ -427,17 +541,29 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
if (reset) begin
|
||||
perf_sched_idles <= '0;
|
||||
perf_sched_barrier_idles <= '0;
|
||||
perf_sched_stalls <= '0;
|
||||
perf_sched_stalls <= '0;
|
||||
perf_scalar_sched_ready_cycles <= '0;
|
||||
perf_tensor_sched_ready_cycles <= '0;
|
||||
perf_scalar_sched_issued_cycles <= '0;
|
||||
perf_tensor_sched_issued_cycles <= '0;
|
||||
end else begin
|
||||
perf_sched_idles <= perf_sched_idles + `PERF_CTR_BITS'(schedule_idle);
|
||||
perf_sched_barrier_idles <= perf_sched_barrier_idles + `PERF_CTR_BITS'(schedule_barrier_idle);
|
||||
perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(schedule_stall);
|
||||
perf_scalar_sched_ready_cycles <= perf_scalar_sched_ready_cycles + `PERF_CTR_BITS'(scalar_schedule_valid);
|
||||
perf_tensor_sched_ready_cycles <= perf_tensor_sched_ready_cycles + `PERF_CTR_BITS'(tensor_schedule_valid);
|
||||
perf_scalar_sched_issued_cycles <= perf_scalar_sched_issued_cycles + `PERF_CTR_BITS'(scalar_schedule_fire);
|
||||
perf_tensor_sched_issued_cycles <= perf_tensor_sched_issued_cycles + `PERF_CTR_BITS'(tensor_schedule_fire);
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_schedule_if.sched_idles = perf_sched_idles;
|
||||
assign perf_schedule_if.sched_barrier_idles = perf_sched_barrier_idles;
|
||||
assign perf_schedule_if.sched_stalls = perf_sched_stalls;
|
||||
assign perf_schedule_if.sched_stalls = perf_sched_stalls;
|
||||
assign perf_schedule_if.scalar_sched_ready_cycles = perf_scalar_sched_ready_cycles;
|
||||
assign perf_schedule_if.tensor_sched_ready_cycles = perf_tensor_sched_ready_cycles;
|
||||
assign perf_schedule_if.scalar_sched_issued_cycles = perf_scalar_sched_issued_cycles;
|
||||
assign perf_schedule_if.tensor_sched_issued_cycles = perf_tensor_sched_issued_cycles;
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
||||
@@ -14,7 +14,8 @@
|
||||
`include "VX_define.vh"
|
||||
|
||||
module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0
|
||||
parameter CORE_ID = 0,
|
||||
parameter DOMAIN = WU_DOMAIN_SCALAR
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
@@ -34,6 +35,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
);
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1;
|
||||
`ifdef EXT_F_ENABLE
|
||||
localparam TENSOR_NUM_REGS = `TENSOR_NUM_GPRS + `TENSOR_NUM_FPRS;
|
||||
`else
|
||||
localparam TENSOR_NUM_REGS = `TENSOR_NUM_GPRS;
|
||||
`endif
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_units_per_cycle;
|
||||
@@ -141,7 +147,27 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
`endif
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs;
|
||||
localparam IS_TENSOR_LANE = `IS_TENSOR_WARP(i);
|
||||
localparam LANE_IN_DOMAIN = (DOMAIN == WU_DOMAIN_TENSOR) ? `IS_TENSOR_WARP(i) : `IS_SCALAR_WARP(i);
|
||||
localparam SCOREBOARD_NUM_REGS = (IS_TENSOR_LANE ? TENSOR_NUM_REGS : `NUM_REGS);
|
||||
localparam SCOREBOARD_REG_BITS = `CLOG2(SCOREBOARD_NUM_REGS);
|
||||
|
||||
function automatic [SCOREBOARD_REG_BITS-1:0] scb_reg_idx (
|
||||
input logic [`NR_BITS-1:0] reg_id
|
||||
);
|
||||
`ifdef EXT_F_ENABLE
|
||||
if (IS_TENSOR_LANE && reg_id[`NR_BITS-1]) begin
|
||||
scb_reg_idx = SCOREBOARD_REG_BITS'(`TENSOR_NUM_GPRS) + SCOREBOARD_REG_BITS'(reg_id[`NRI_BITS-1:0]);
|
||||
end else
|
||||
`endif
|
||||
if (IS_TENSOR_LANE) begin
|
||||
scb_reg_idx = SCOREBOARD_REG_BITS'(reg_id[`NRI_BITS-1:0]);
|
||||
end else begin
|
||||
scb_reg_idx = SCOREBOARD_REG_BITS'(reg_id);
|
||||
end
|
||||
endfunction
|
||||
|
||||
reg [`UP(ISSUE_RATIO)-1:0][SCOREBOARD_NUM_REGS-1:0] inuse_regs;
|
||||
// Number of inflight operations in execution in the asynchronous
|
||||
// Tensor unit. Since the ISA does not specify an explicit destination
|
||||
// register, use a separate status bit.
|
||||
@@ -150,18 +176,27 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
localparam INFLT_MAX = {INFLT_WIDTH{1'b1}};
|
||||
|
||||
wire hgmma_start = (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) &&
|
||||
(ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA);
|
||||
((ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA) ||
|
||||
(ibuffer_if[i].data.op_type == `INST_TENSOR_BWGMMA));
|
||||
|
||||
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
|
||||
wire [`NW_WIDTH-1:0] writeback_wid = wis_to_wid(writeback_if[i].data.wis, ISSUE_ISW_W'(i));
|
||||
wire writeback_lane_in_domain = (DOMAIN == WU_DOMAIN_TENSOR) ? `IS_TENSOR_WARP(writeback_wid) : `IS_SCALAR_WARP(writeback_wid);
|
||||
wire writeback_fire = writeback_lane_in_domain && writeback_if[i].valid && writeback_if[i].data.eop;
|
||||
|
||||
wire inuse_rd = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd];
|
||||
wire inuse_rs1 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1];
|
||||
wire inuse_rs2 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2];
|
||||
wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3];
|
||||
wire [SCOREBOARD_REG_BITS-1:0] ibuf_rd_idx = scb_reg_idx(ibuffer_if[i].data.rd);
|
||||
wire [SCOREBOARD_REG_BITS-1:0] ibuf_rs1_idx = scb_reg_idx(ibuffer_if[i].data.rs1);
|
||||
wire [SCOREBOARD_REG_BITS-1:0] ibuf_rs2_idx = scb_reg_idx(ibuffer_if[i].data.rs2);
|
||||
wire [SCOREBOARD_REG_BITS-1:0] ibuf_rs3_idx = scb_reg_idx(ibuffer_if[i].data.rs3);
|
||||
wire [SCOREBOARD_REG_BITS-1:0] writeback_rd_idx = scb_reg_idx(writeback_if[i].data.rd);
|
||||
|
||||
wire inuse_rd = inuse_regs[ibuffer_if[i].data.wis][ibuf_rd_idx];
|
||||
wire inuse_rs1 = inuse_regs[ibuffer_if[i].data.wis][ibuf_rs1_idx];
|
||||
wire inuse_rs2 = inuse_regs[ibuffer_if[i].data.wis][ibuf_rs2_idx];
|
||||
wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuf_rs3_idx];
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
|
||||
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
|
||||
reg [`UP(ISSUE_RATIO)-1:0][SCOREBOARD_NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
|
||||
reg [`UP(ISSUE_RATIO)-1:0][SCOREBOARD_NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
|
||||
|
||||
reg [`SFU_WIDTH-1:0] sfu_type;
|
||||
always @(*) begin
|
||||
@@ -180,30 +215,30 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
if (ibuffer_if[i].valid) begin
|
||||
if (inuse_rd) begin
|
||||
perf_issue_any_unit_per_cycle[i] = '1;
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuf_rd_idx]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuf_rd_idx] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuf_rd_idx]] = 1;
|
||||
end
|
||||
end
|
||||
if (inuse_rs1) begin
|
||||
perf_issue_any_unit_per_cycle[i] = '1;
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuf_rs1_idx]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuf_rs1_idx] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuf_rs1_idx]] = 1;
|
||||
end
|
||||
end
|
||||
if (inuse_rs2) begin
|
||||
perf_issue_any_unit_per_cycle[i] = '1;
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuf_rs2_idx]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuf_rs2_idx] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuf_rs2_idx]] = 1;
|
||||
end
|
||||
end
|
||||
if (inuse_rs3) begin
|
||||
perf_issue_any_unit_per_cycle[i] = '1;
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuf_rs3_idx]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuf_rs3_idx] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuf_rs3_idx]] = 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
@@ -213,10 +248,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
`endif
|
||||
|
||||
wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
|
||||
`ifdef EXT_T_HOPPER
|
||||
`ifdef EXT_T_ASYNC
|
||||
wire hgmma_wait = ibuffer_if[i].valid &&
|
||||
(ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) &&
|
||||
(ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA_WAIT);
|
||||
((ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA_WAIT) ||
|
||||
(ibuffer_if[i].data.op_type == `INST_TENSOR_BWGMMA_WAIT));
|
||||
// HGMMA is unblocked as long as there is available counter left for
|
||||
// the inflight operations. This is to ensure back-to-back fire of
|
||||
// the dot product units with minimal downtime.
|
||||
@@ -232,8 +268,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
`endif
|
||||
|
||||
wire stg_valid_in, stg_ready_in;
|
||||
assign stg_valid_in = ibuffer_if[i].valid && operands_ready;
|
||||
assign ibuffer_if[i].ready = stg_ready_in && operands_ready;
|
||||
assign stg_valid_in = LANE_IN_DOMAIN && ibuffer_if[i].valid && operands_ready;
|
||||
assign ibuffer_if[i].ready = LANE_IN_DOMAIN && stg_ready_in && operands_ready;
|
||||
|
||||
VX_stream_buffer #(
|
||||
.DATAW (DATAW)
|
||||
@@ -275,12 +311,12 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
inflight_tensor <= '0;
|
||||
end else begin
|
||||
if (writeback_fire) begin
|
||||
inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0;
|
||||
inuse_regs[writeback_if[i].data.wis][writeback_rd_idx] <= 0;
|
||||
end
|
||||
if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin
|
||||
inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1;
|
||||
inuse_regs[ibuffer_if[i].data.wis][ibuf_rd_idx] <= 1;
|
||||
end
|
||||
`ifdef EXT_T_HOPPER
|
||||
`ifdef EXT_T_ASYNC
|
||||
if (tensor_writeback_fire) begin
|
||||
// ensure no race condition
|
||||
if (!tensor_issue_fire) begin
|
||||
@@ -295,9 +331,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
end
|
||||
`ifdef PERF_ENABLE
|
||||
if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin
|
||||
inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= ibuffer_if[i].data.ex_type;
|
||||
inuse_units[ibuffer_if[i].data.wis][ibuf_rd_idx] <= ibuffer_if[i].data.ex_type;
|
||||
if (ibuffer_if[i].data.ex_type == `EX_SFU) begin
|
||||
inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= sfu_type;
|
||||
inuse_sfu[ibuffer_if[i].data.wis][ibuf_rd_idx] <= sfu_type;
|
||||
end
|
||||
end
|
||||
`endif
|
||||
@@ -320,6 +356,50 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
end else if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin
|
||||
timeout_ctr <= '0;
|
||||
end
|
||||
`ifdef EXT_T_ASYNC
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
|
||||
if (CORE_ID == 0 && writeback_fire &&
|
||||
((writeback_if[i].data.PC == 32'h80000010) ||
|
||||
(writeback_if[i].data.PC == 32'h80000014))) begin
|
||||
`TRACE(1, ("%d: core%0d-scoreboard-writeback-recover: isw=%0d, wid=%0d, wis=%0d, PC=0x%0h, rd=%0d, tensor=%b, inuse_before=%b, eop=%b (#%0d)\n",
|
||||
$time, CORE_ID, i, wis_to_wid(writeback_if[i].data.wis, i),
|
||||
writeback_if[i].data.wis, writeback_if[i].data.PC,
|
||||
writeback_if[i].data.rd, writeback_if[i].data.tensor,
|
||||
inuse_regs[writeback_if[i].data.wis][writeback_rd_idx],
|
||||
writeback_if[i].data.eop, writeback_if[i].data.uuid));
|
||||
end
|
||||
if (CORE_ID == 0 && ibuffer_if[i].valid &&
|
||||
(ibuffer_if[i].data.PC >= `XLEN'h80000240) &&
|
||||
(ibuffer_if[i].data.PC <= `XLEN'h80000260)) begin
|
||||
`TRACE(2, ("%d: core%0d-scoreboard-probe: wid=%0d, PC=0x%0h, ready=%b, stg_ready=%b, operands_ready=%b, busy={rd,rs1,rs2,rs3}=%b, rd=%0d rs1=%0d rs2=%0d rs3=%0d, wb=%b, ex=0x%0h, op=0x%0h, inuse_rd=%b, inuse_rs1=%b, inuse_rs2=%b, inuse_rs3=%b, hgmma_start=%b, hgmma_wait=%b, hgmma_ready=%b, inflight=%0d, wb_fire=%b, wb_tensor=%b, wb_wid=%0d, wb_rd=%0d, wb_pc=0x%0h (#%0d)\n",
|
||||
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC,
|
||||
ibuffer_if[i].ready, stg_ready_in, operands_ready, operands_busy,
|
||||
ibuffer_if[i].data.rd, ibuffer_if[i].data.rs1, ibuffer_if[i].data.rs2, ibuffer_if[i].data.rs3,
|
||||
ibuffer_if[i].data.wb, ibuffer_if[i].data.ex_type, ibuffer_if[i].data.op_type,
|
||||
inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3, hgmma_start, hgmma_wait, hgmma_ready,
|
||||
inflight_tensor[ibuffer_if[i].data.wis], writeback_fire, writeback_if[i].data.tensor,
|
||||
wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.rd, writeback_if[i].data.PC,
|
||||
ibuffer_if[i].data.uuid));
|
||||
end
|
||||
if (ibuffer_if[i].valid && (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR))) begin
|
||||
`TRACE(2, ("%d: core%0d-scoreboard-tensor: wid=%0d, PC=0x%0h, op=0x%0h, ready=%b, stg_ready=%b, operands_ready=%b, inuse=%b, inflight=%0d, hgmma_start=%b, hgmma_wait=%b, hgmma_fire_ready=%b, hgmma_wait_ready=%b (#%0d)\n",
|
||||
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC,
|
||||
ibuffer_if[i].data.op_type, ibuffer_if[i].ready, stg_ready_in, operands_ready,
|
||||
operands_busy, inflight_tensor[ibuffer_if[i].data.wis], hgmma_start, hgmma_wait,
|
||||
hgmma_ready_for_fire, hgmma_ready_for_wait, ibuffer_if[i].data.uuid));
|
||||
end
|
||||
if (tensor_issue_fire) begin
|
||||
`TRACE(2, ("%d: core%0d-scoreboard-tensor-issue-count: wid=%0d, op=0x%0h, inflight_before=%0d\n",
|
||||
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.op_type,
|
||||
inflight_tensor[ibuffer_if[i].data.wis]));
|
||||
end
|
||||
if (tensor_writeback_fire) begin
|
||||
`TRACE(2, ("%d: core%0d-scoreboard-tensor-wb-count: wid=%0d, inflight_before=%0d\n",
|
||||
$time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i),
|
||||
inflight_tensor[writeback_if[i].data.wis]));
|
||||
end
|
||||
`endif
|
||||
`endif
|
||||
end
|
||||
end
|
||||
|
||||
@@ -330,7 +410,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
|
||||
`RUNTIME_ASSERT((~writeback_fire ||
|
||||
writeback_if[i].data.tensor /* dont check rd for tensor ghost writes */ ||
|
||||
inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0),
|
||||
inuse_regs[writeback_if[i].data.wis][writeback_rd_idx] != 0),
|
||||
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
|
||||
$time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.PC, writeback_if[i].data.tmask, writeback_if[i].data.rd, writeback_if[i].data.uuid));
|
||||
`endif
|
||||
|
||||
@@ -60,7 +60,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
||||
|
||||
`RESET_RELAY (dispatch_reset, reset);
|
||||
|
||||
VX_dispatch_unit #(
|
||||
VX_dispatch_unit_sane #(
|
||||
.BLOCK_SIZE (BLOCK_SIZE),
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.OUT_REG (1)
|
||||
@@ -162,6 +162,22 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
||||
end
|
||||
assign execute_if[0].ready = sfu_req_ready;
|
||||
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
|
||||
always @(posedge clk) begin
|
||||
if (!reset && ($time > `TRACE_STARTTIME) && (CORE_ID == 0)
|
||||
&& execute_if[0].valid
|
||||
&& (execute_if[0].data.PC == 32'h80000028)) begin
|
||||
`TRACE(1, ("%d: core%0d-sfu-probe: valid=%b ready=%b op=0x%0h csr_valid=%b csr_ready=%b wctl_valid=%b wctl_ready=%b wid=%0d PC=0x%0h (#%0d)\n",
|
||||
$time, CORE_ID, execute_if[0].valid, execute_if[0].ready,
|
||||
execute_if[0].data.op_type,
|
||||
csr_execute_if.valid, csr_execute_if.ready,
|
||||
wctl_execute_if.valid, wctl_execute_if.ready,
|
||||
execute_if[0].data.wid, execute_if[0].data.PC,
|
||||
execute_if[0].data.uuid));
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
// response arbitration
|
||||
|
||||
`RESET_RELAY (commit_reset, reset);
|
||||
|
||||
328
hw/rtl/core/VX_tensor_blackwell_core.sv
Normal file
328
hw/rtl/core/VX_tensor_blackwell_core.sv
Normal file
@@ -0,0 +1,328 @@
|
||||
`ifdef EXT_T_ENABLE
|
||||
`include "VX_fpu_define.vh"
|
||||
`include "VX_trace.vh"
|
||||
|
||||
module VX_tensor_blackwell_core_block import VX_gpu_pkg::*; #(
|
||||
parameter ISW,
|
||||
parameter FP16
|
||||
) (
|
||||
input clk,
|
||||
input reset,
|
||||
|
||||
VX_execute_if.slave execute_if,
|
||||
VX_tc_rf_if.master regfile_if,
|
||||
VX_tc_bus_if.master tmem_if,
|
||||
output logic tmem_A_ren,
|
||||
input logic tmem_A_rready,
|
||||
output logic [8:0] tmem_A_raddr,
|
||||
input logic [`NUM_THREADS*`XLEN-1:0] tmem_A_rdata,
|
||||
output logic tmem_C_ren,
|
||||
input logic tmem_C_rready,
|
||||
output logic [8:0] tmem_C_raddr,
|
||||
input logic [`NUM_THREADS*`XLEN-1:0] tmem_C_rdata,
|
||||
output logic tmem_C_wen,
|
||||
input logic tmem_C_wready,
|
||||
output logic [8:0] tmem_C_waddr,
|
||||
output logic [`NUM_THREADS*`XLEN-1:0] tmem_C_wdata,
|
||||
output logic [`NUM_THREADS*`XLEN/8-1:0] tmem_C_mask,
|
||||
VX_tc_bus_if.master smem_B_if,
|
||||
VX_commit_if.master commit_if
|
||||
);
|
||||
localparam NUM_LANES = `NUM_THREADS;
|
||||
localparam METADATA_QUEUE_DEPTH = 2;
|
||||
|
||||
wire [`UUID_WIDTH-1:0] execute_if_data_uuid;
|
||||
wire [`NW_WIDTH-1:0] execute_if_data_wid;
|
||||
wire [NUM_LANES-1:0] execute_if_data_tmask;
|
||||
wire [`INST_ALU_BITS-1:0] execute_if_data_op_type;
|
||||
wire [`XLEN-1:0] execute_if_data_PC;
|
||||
wire execute_if_data_wb;
|
||||
wire [`NR_BITS-1:0] execute_if_data_rd;
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] execute_if_data_rs1;
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] execute_if_data_rs2;
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] execute_if_data_rs3;
|
||||
|
||||
wire metadata_queue_full;
|
||||
wire metadata_queue_empty;
|
||||
assign execute_if.ready = !metadata_queue_full;
|
||||
|
||||
logic metadata_deq;
|
||||
|
||||
wire operand_enq_fire = execute_if.valid && execute_if.ready;
|
||||
wire enq = operand_enq_fire;
|
||||
wire deq = metadata_deq;
|
||||
|
||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `INST_ALU_BITS + `XLEN + 1 +
|
||||
`NR_BITS + (NUM_LANES * `XLEN) + (NUM_LANES * `XLEN) + (NUM_LANES * `XLEN);
|
||||
VX_fifo_queue #(
|
||||
.DATAW(DATAW),
|
||||
.DEPTH(METADATA_QUEUE_DEPTH)
|
||||
) pending_uops (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
.push(enq),
|
||||
.pop(deq),
|
||||
.data_in({execute_if.data.uuid, execute_if.data.wid,
|
||||
execute_if.data.tmask, execute_if.data.op_type, execute_if.data.PC,
|
||||
execute_if.data.wb, execute_if.data.rd,
|
||||
execute_if.data.rs1_data, execute_if.data.rs2_data, execute_if.data.rs3_data}),
|
||||
.data_out({execute_if_data_uuid, execute_if_data_wid,
|
||||
execute_if_data_tmask, execute_if_data_op_type, execute_if_data_PC,
|
||||
execute_if_data_wb, execute_if_data_rd,
|
||||
execute_if_data_rs1, execute_if_data_rs2, execute_if_data_rs3}),
|
||||
.empty(metadata_queue_empty),
|
||||
`UNUSED_PIN(alm_empty),
|
||||
.full(metadata_queue_full),
|
||||
`UNUSED_PIN(alm_full),
|
||||
`UNUSED_PIN(size)
|
||||
);
|
||||
|
||||
wire initiate_ready;
|
||||
wire writeback_valid;
|
||||
wire writeback_last;
|
||||
wire [`NW_WIDTH-1:0] writeback_wid;
|
||||
wire [4:0] writeback_rd;
|
||||
logic writeback_ready;
|
||||
wire [`NUM_THREADS-1:0][`XLEN-1:0] writeback_data;
|
||||
|
||||
wire metadata_valid = !metadata_queue_empty;
|
||||
wire bwgmma = metadata_valid &&
|
||||
(execute_if_data_op_type == `INST_TENSOR_BWGMMA);
|
||||
wire bwgmma_wait = metadata_valid &&
|
||||
(execute_if_data_op_type == `INST_TENSOR_BWGMMA_WAIT);
|
||||
wire tcgen05_cp = metadata_valid &&
|
||||
(execute_if_data_op_type == `INST_TENSOR_TCGEN05_CP);
|
||||
wire tcgen05_cp_wait = metadata_valid &&
|
||||
(execute_if_data_op_type == `INST_TENSOR_TCGEN05_CP_WAIT);
|
||||
wire tcgen05_ld = metadata_valid &&
|
||||
(execute_if_data_op_type == `INST_TENSOR_TCGEN05_LD);
|
||||
wire tcgen05_st = metadata_valid &&
|
||||
(execute_if_data_op_type == `INST_TENSOR_TCGEN05_ST);
|
||||
wire tcgen05_cb = metadata_valid &&
|
||||
(execute_if_data_op_type == `INST_TENSOR_TCGEN05_CB);
|
||||
wire tensor_launch_op = bwgmma || tcgen05_cp || tcgen05_ld || tcgen05_st || tcgen05_cb;
|
||||
wire tensor_wait_op = bwgmma_wait || tcgen05_cp_wait;
|
||||
wire [2:0] initiate_op =
|
||||
bwgmma ? 3'd0 :
|
||||
bwgmma_wait ? 3'd1 :
|
||||
tcgen05_cp ? 3'd2 :
|
||||
tcgen05_cp_wait ? 3'd3 :
|
||||
tcgen05_ld ? 3'd4 :
|
||||
tcgen05_st ? 3'd5 :
|
||||
tcgen05_cb ? 3'd6 :
|
||||
3'd0;
|
||||
wire initiate_valid = metadata_valid && tensor_launch_op && !sync_launch_pending;
|
||||
wire [`NW_WIDTH-1:0] initiate_wid = execute_if_data_wid;
|
||||
wire [4:0] initiate_rd = execute_if_data_rd[4:0];
|
||||
wire [`XLEN-1:0] initiate_addr_a = execute_if_data_rs1[0];
|
||||
wire [`XLEN-1:0] initiate_addr_b = execute_if_data_rs2[0];
|
||||
wire [`XLEN-1:0] initiate_addr_c = execute_if_data_rs3[0];
|
||||
|
||||
logic sync_launch_pending;
|
||||
logic sync_launch_is_ld;
|
||||
logic sync_launch_is_bwgmma;
|
||||
|
||||
TensorCoreBlackwell tensor_blackwell_core (
|
||||
.clock(clk),
|
||||
.reset(reset),
|
||||
|
||||
.io_initiate_ready(initiate_ready),
|
||||
.io_initiate_valid(initiate_valid),
|
||||
.io_initiate_bits_op(initiate_op),
|
||||
.io_initiate_bits_wid(initiate_wid),
|
||||
.io_initiate_bits_rd(initiate_rd),
|
||||
.io_initiate_bits_addressA(initiate_addr_a),
|
||||
.io_initiate_bits_addressB(initiate_addr_b),
|
||||
.io_initiate_bits_addressC(initiate_addr_c),
|
||||
|
||||
.io_writeback_ready(writeback_ready),
|
||||
.io_writeback_valid(writeback_valid),
|
||||
.io_writeback_bits_last(writeback_last),
|
||||
.io_writeback_bits_wid(writeback_wid),
|
||||
.io_writeback_bits_rd(writeback_rd),
|
||||
.io_writeback_bits_data_0(writeback_data[0]),
|
||||
.io_writeback_bits_data_1(writeback_data[1]),
|
||||
.io_writeback_bits_data_2(writeback_data[2]),
|
||||
.io_writeback_bits_data_3(writeback_data[3]),
|
||||
.io_writeback_bits_data_4(writeback_data[4]),
|
||||
.io_writeback_bits_data_5(writeback_data[5]),
|
||||
.io_writeback_bits_data_6(writeback_data[6]),
|
||||
.io_writeback_bits_data_7(writeback_data[7]),
|
||||
|
||||
.io_respA_ready(tmem_if.rsp_ready),
|
||||
.io_respA_valid(tmem_if.rsp_valid),
|
||||
.io_respA_bits_source(tmem_if.rsp_data.tag),
|
||||
.io_respA_bits_data(tmem_if.rsp_data.data),
|
||||
.io_respB_ready(smem_B_if.rsp_ready),
|
||||
.io_respB_valid(smem_B_if.rsp_valid),
|
||||
.io_respB_bits_source(smem_B_if.rsp_data.tag),
|
||||
.io_respB_bits_data(smem_B_if.rsp_data.data),
|
||||
.io_respC(regfile_if.rsp_data.data),
|
||||
|
||||
.io_reqA_ready(tmem_if.req_ready),
|
||||
.io_reqA_valid(tmem_if.req_valid),
|
||||
.io_reqA_bits_rw(tmem_if.req_data.rw),
|
||||
.io_reqA_bits_byteen(tmem_if.req_data.byteen),
|
||||
.io_reqA_bits_source(tmem_if.req_data.tag),
|
||||
.io_reqA_bits_address(tmem_if.req_data.addr),
|
||||
.io_reqA_bits_data(tmem_if.req_data.data),
|
||||
.io_tmemC_aRen(tmem_A_ren),
|
||||
.io_tmemC_aRready(tmem_A_rready),
|
||||
.io_tmemC_aRaddr(tmem_A_raddr),
|
||||
.io_tmemC_aRdata(tmem_A_rdata),
|
||||
.io_tmemC_cRen(tmem_C_ren),
|
||||
.io_tmemC_cRready(tmem_C_rready),
|
||||
.io_tmemC_cRaddr(tmem_C_raddr),
|
||||
.io_tmemC_cRdata(tmem_C_rdata),
|
||||
.io_tmemC_cWen(tmem_C_wen),
|
||||
.io_tmemC_cWready(tmem_C_wready),
|
||||
.io_tmemC_cWaddr(tmem_C_waddr),
|
||||
.io_tmemC_cWdata(tmem_C_wdata),
|
||||
.io_tmemC_cMask(tmem_C_mask),
|
||||
.io_reqB_ready(smem_B_if.req_ready),
|
||||
.io_reqB_valid(smem_B_if.req_valid),
|
||||
.io_reqB_bits_rw(smem_B_if.req_data.rw),
|
||||
.io_reqB_bits_byteen(smem_B_if.req_data.byteen),
|
||||
.io_reqB_bits_source(smem_B_if.req_data.tag),
|
||||
.io_reqB_bits_address(smem_B_if.req_data.addr),
|
||||
.io_reqB_bits_data(smem_B_if.req_data.data),
|
||||
.io_reqC_valid(regfile_if.req_valid),
|
||||
.io_reqC_bits(regfile_if.req_data.rs[4:0])
|
||||
);
|
||||
|
||||
assign regfile_if.req_data.rs[5] = 1'b1;
|
||||
assign regfile_if.req_data.wis = '0;
|
||||
|
||||
logic commit_select_tensor;
|
||||
logic commit_sync_done;
|
||||
logic sync_done;
|
||||
wire sync_writeback_done = sync_launch_pending && (sync_launch_is_ld || sync_launch_is_bwgmma) && writeback_valid;
|
||||
wire sync_no_wb_done = sync_launch_pending && !(sync_launch_is_ld || sync_launch_is_bwgmma) && initiate_ready;
|
||||
wire tensor_idle = initiate_ready && !writeback_valid && !sync_launch_pending;
|
||||
|
||||
always @(*) begin
|
||||
metadata_deq = 1'b0;
|
||||
commit_select_tensor = 1'b0;
|
||||
commit_sync_done = 1'b0;
|
||||
sync_done = 1'b0;
|
||||
writeback_ready = commit_if.ready && sync_launch_pending && (sync_launch_is_ld || sync_launch_is_bwgmma);
|
||||
|
||||
if (sync_launch_pending) begin
|
||||
if (sync_writeback_done) begin
|
||||
commit_select_tensor = 1'b1;
|
||||
metadata_deq = commit_if.ready;
|
||||
commit_sync_done = commit_if.ready;
|
||||
end else if (sync_no_wb_done) begin
|
||||
commit_select_tensor = 1'b0;
|
||||
metadata_deq = metadata_valid && commit_if.ready;
|
||||
commit_sync_done = metadata_valid && commit_if.ready;
|
||||
end
|
||||
end else if (metadata_valid) begin
|
||||
if (tensor_wait_op) begin
|
||||
writeback_ready = 1'b0;
|
||||
commit_select_tensor = 1'b0;
|
||||
metadata_deq = metadata_valid && commit_if.ready && tensor_idle;
|
||||
end else if (bwgmma) begin
|
||||
writeback_ready = 1'b0;
|
||||
commit_select_tensor = 1'b0;
|
||||
sync_done = initiate_valid && initiate_ready;
|
||||
metadata_deq = 1'b0;
|
||||
end else if (tcgen05_cp || tcgen05_ld || tcgen05_st || tcgen05_cb) begin
|
||||
writeback_ready = 1'b0;
|
||||
commit_select_tensor = 1'b0;
|
||||
sync_done = initiate_valid && initiate_ready;
|
||||
// suppress commit until sync_launch_pending is set next cycle
|
||||
metadata_deq = 1'b0;
|
||||
end else begin
|
||||
commit_select_tensor = 1'b0;
|
||||
metadata_deq = metadata_valid && commit_if.ready;
|
||||
end
|
||||
end else begin
|
||||
commit_select_tensor = 1'b1;
|
||||
end
|
||||
|
||||
if (commit_select_tensor) begin
|
||||
commit_if.valid = sync_writeback_done;
|
||||
commit_if.data.uuid = sync_launch_is_bwgmma ? '0 : execute_if_data_uuid;
|
||||
commit_if.data.wid = sync_launch_is_bwgmma ? writeback_wid : execute_if_data_wid;
|
||||
commit_if.data.tmask = sync_launch_is_bwgmma ? {NUM_LANES{1'b1}} : execute_if_data_tmask;
|
||||
commit_if.data.PC = sync_launch_is_bwgmma ? '0 : execute_if_data_PC;
|
||||
commit_if.data.wb = sync_launch_is_ld;
|
||||
commit_if.data.rd = sync_launch_is_ld ? execute_if_data_rd : '0;
|
||||
commit_if.data.data = writeback_data;
|
||||
commit_if.data.tensor = sync_launch_is_bwgmma;
|
||||
commit_if.data.pid = 1'b0;
|
||||
commit_if.data.sop = 1'b1;
|
||||
commit_if.data.eop = sync_launch_is_ld ? 1'b1 : writeback_last;
|
||||
end else begin
|
||||
commit_if.valid = (sync_no_wb_done && metadata_valid) ||
|
||||
(metadata_valid && !sync_launch_pending && !tensor_launch_op &&
|
||||
(!tensor_wait_op || tensor_idle));
|
||||
commit_if.data.uuid = execute_if_data_uuid;
|
||||
commit_if.data.wid = execute_if_data_wid;
|
||||
commit_if.data.tmask = execute_if_data_tmask;
|
||||
commit_if.data.PC = execute_if_data_PC;
|
||||
commit_if.data.wb = execute_if_data_wb;
|
||||
commit_if.data.rd = execute_if_data_rd;
|
||||
commit_if.data.data = '0;
|
||||
commit_if.data.tensor = 1'b0;
|
||||
commit_if.data.pid = 1'b0;
|
||||
commit_if.data.sop = 1'b1;
|
||||
commit_if.data.eop = 1'b1;
|
||||
end
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
sync_launch_pending <= 1'b0;
|
||||
sync_launch_is_ld <= 1'b0;
|
||||
sync_launch_is_bwgmma <= 1'b0;
|
||||
end else begin
|
||||
if (sync_done) begin
|
||||
sync_launch_pending <= 1'b1;
|
||||
sync_launch_is_ld <= tcgen05_ld;
|
||||
sync_launch_is_bwgmma <= bwgmma;
|
||||
end else if (commit_sync_done) begin
|
||||
sync_launch_pending <= 1'b0;
|
||||
sync_launch_is_ld <= 1'b0;
|
||||
sync_launch_is_bwgmma <= 1'b0;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
`ifdef SIMULATION
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
|
||||
always @(posedge clk) begin
|
||||
if (!reset && ($time > `TRACE_STARTTIME)) begin
|
||||
if (operand_enq_fire) begin
|
||||
`TRACE(2, ("%d: tensor-blackwell-enq: wid=%0d, PC=0x%0h, op=0x%0h, uuid=%0d, queue_full=%b\n",
|
||||
$time, execute_if.data.wid, execute_if.data.PC, execute_if.data.op_type,
|
||||
execute_if.data.uuid, metadata_queue_full));
|
||||
end
|
||||
if (metadata_valid) begin
|
||||
`TRACE(2, ("%d: tensor-blackwell-meta: wid=%0d, PC=0x%0h, op=0x%0h, deq=%b, commit_valid=%b, commit_ready=%b, initiate_valid=%b, initiate_ready=%b, writeback_valid=%b, writeback_ready=%b, sync_pending=%b, sync_is_ld=%b, sync_done=%b, commit_sync_done=%b, queue_full=%b\n",
|
||||
$time, execute_if_data_wid, execute_if_data_PC, execute_if_data_op_type,
|
||||
metadata_deq, commit_if.valid, commit_if.ready, initiate_valid, initiate_ready,
|
||||
writeback_valid, writeback_ready, sync_launch_pending, sync_launch_is_ld,
|
||||
sync_done, commit_sync_done, metadata_queue_full));
|
||||
end
|
||||
if (writeback_valid || initiate_valid) begin
|
||||
`TRACE(2, ("%d: tensor-blackwell-handshake: initiate=%b/%b, op=%0d, addrA=0x%0h, addrB=0x%0h, writeback=%b/%b, wb_wid=%0d, wb_rd=%0d, wb_last=%b\n",
|
||||
$time, initiate_valid, initiate_ready, initiate_op, initiate_addr_a, initiate_addr_b,
|
||||
writeback_valid, writeback_ready, writeback_wid, writeback_rd, writeback_last));
|
||||
end
|
||||
if (tmem_if.req_valid || tmem_C_wen || tmem_C_ren || tmem_A_ren || smem_B_if.req_valid || tmem_if.rsp_valid || smem_B_if.rsp_valid) begin
|
||||
`TRACE(2, ("%d: tensor-blackwell-mem: tmemA_req=%b/%b tmemA_rw=%b tmemA_addr=0x%0h tmemA_rsp=%b/%b, tmemA_ren=%b/%b tmemA_raddr=0x%0h, tmemC_wen=%b/%b tmemC_ren=%b/%b tmemC_waddr=0x%0h tmemC_raddr=0x%0h, smemB_req=%b/%b smemB_rw=%b smemB_addr=0x%0h smemB_rsp=%b/%b, initiate_ready=%b\n",
|
||||
$time, tmem_if.req_valid, tmem_if.req_ready, tmem_if.req_data.rw, tmem_if.req_data.addr, tmem_if.rsp_valid, tmem_if.rsp_ready,
|
||||
tmem_A_ren, tmem_A_rready, tmem_A_raddr,
|
||||
tmem_C_wen, tmem_C_wready, tmem_C_ren, tmem_C_rready, tmem_C_waddr, tmem_C_raddr,
|
||||
smem_B_if.req_valid, smem_B_if.req_ready, smem_B_if.req_data.rw, smem_B_if.req_data.addr, smem_B_if.rsp_valid, smem_B_if.rsp_ready,
|
||||
initiate_ready));
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
`endif
|
||||
endmodule
|
||||
|
||||
`endif
|
||||
@@ -2,20 +2,42 @@
|
||||
`include "VX_fpu_define.vh"
|
||||
|
||||
module VX_tensor_core import VX_gpu_pkg::*; #(
|
||||
parameter FP16
|
||||
parameter FP16,
|
||||
parameter NUM_TENSOR_CORES = `NUM_TENSOR_WARPS
|
||||
) (
|
||||
input clk,
|
||||
input reset,
|
||||
|
||||
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
|
||||
`ifdef EXT_T_HOPPER
|
||||
VX_tc_rf_if.master regfile_if,
|
||||
VX_tc_bus_if.master smem_A_if,
|
||||
VX_tc_bus_if.master smem_B_if,
|
||||
`ifdef EXT_T_ASYNC
|
||||
VX_tc_rf_if.master regfile_if[NUM_TENSOR_CORES],
|
||||
VX_tc_bus_if.master smem_A_if[NUM_TENSOR_CORES],
|
||||
output logic [NUM_TENSOR_CORES-1:0] tmem_A_ren,
|
||||
input logic [NUM_TENSOR_CORES-1:0] tmem_A_rready,
|
||||
output logic [NUM_TENSOR_CORES*9-1:0] tmem_A_raddr,
|
||||
input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tmem_A_rdata,
|
||||
output logic [NUM_TENSOR_CORES-1:0] tmem_C_ren,
|
||||
input logic [NUM_TENSOR_CORES-1:0] tmem_C_rready,
|
||||
output logic [NUM_TENSOR_CORES*9-1:0] tmem_C_raddr,
|
||||
input logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tmem_C_rdata,
|
||||
output logic [NUM_TENSOR_CORES-1:0] tmem_C_wen,
|
||||
input logic [NUM_TENSOR_CORES-1:0] tmem_C_wready,
|
||||
output logic [NUM_TENSOR_CORES*9-1:0] tmem_C_waddr,
|
||||
output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN-1:0] tmem_C_wdata,
|
||||
output logic [NUM_TENSOR_CORES*`NUM_THREADS*`XLEN/8-1:0] tmem_C_mask,
|
||||
VX_tc_bus_if.master smem_B_if[NUM_TENSOR_CORES],
|
||||
`endif
|
||||
VX_commit_if.master commit_if [`ISSUE_WIDTH]
|
||||
);
|
||||
`STATIC_ASSERT((`ISSUE_WIDTH == `NUM_WARPS),
|
||||
("Wu tensor-core lane binding currently requires ISSUE_WIDTH == NUM_WARPS"))
|
||||
`ifdef EXT_T_BLACKWELL
|
||||
localparam BLOCK_SIZE = NUM_TENSOR_CORES;
|
||||
`STATIC_ASSERT((NUM_TENSOR_CORES == `NUM_TENSOR_WARPS),
|
||||
("Blackwell Wu tensor-core binding requires NUM_TENSOR_CORES == NUM_TENSOR_WARPS"))
|
||||
`else
|
||||
localparam BLOCK_SIZE = 1;
|
||||
`endif
|
||||
localparam NUM_LANES = `NUM_THREADS;
|
||||
// @perf: PARTIAL_BW==1 increases power instantiating
|
||||
// stream_buffers for ISSUE_WIDTH times
|
||||
@@ -25,19 +47,48 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) execute_if[BLOCK_SIZE]();
|
||||
|
||||
VX_dispatch_if compact_dispatch_if[`ISSUE_WIDTH]();
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_tensor_dispatch_lane
|
||||
if (i < NUM_TENSOR_CORES) begin
|
||||
localparam SRC_ISW = `NUM_SCALAR_WARPS + i;
|
||||
assign compact_dispatch_if[i].valid = dispatch_if[SRC_ISW].valid;
|
||||
assign compact_dispatch_if[i].data = dispatch_if[SRC_ISW].data;
|
||||
assign dispatch_if[SRC_ISW].ready = compact_dispatch_if[i].ready;
|
||||
end else begin
|
||||
assign compact_dispatch_if[i].valid = 1'b0;
|
||||
assign compact_dispatch_if[i].data = '0;
|
||||
end
|
||||
|
||||
if (i < `NUM_SCALAR_WARPS) begin
|
||||
assign dispatch_if[i].ready = 1'b0;
|
||||
end
|
||||
end
|
||||
|
||||
`RESET_RELAY (dispatch_reset, reset);
|
||||
|
||||
VX_dispatch_unit_sane #(
|
||||
.BLOCK_SIZE (BLOCK_SIZE),
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.OUT_REG (PARTIAL_BW ? 1 : 0)
|
||||
.OUT_REG (PARTIAL_BW ? 1 : 0),
|
||||
.ISW_BASE (`NUM_SCALAR_WARPS)
|
||||
) dispatch_unit (
|
||||
.clk (clk),
|
||||
.reset (dispatch_reset),
|
||||
.dispatch_if(dispatch_if),
|
||||
.dispatch_if(compact_dispatch_if),
|
||||
.execute_if (execute_if)
|
||||
);
|
||||
|
||||
`ifdef SIMULATION
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_tensor_domain_assert
|
||||
`RUNTIME_ASSERT(
|
||||
!dispatch_if[i].valid || `IS_TENSOR_WARP(i),
|
||||
("%t: *** tensor-dispatch-cross-domain: issue_lane=%0d ex_op=0x%0h",
|
||||
$time, i, dispatch_if[i].data.op_type)
|
||||
)
|
||||
end
|
||||
`endif
|
||||
|
||||
VX_commit_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) commit_block_if[BLOCK_SIZE]();
|
||||
@@ -57,6 +108,14 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
|
||||
|
||||
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
|
||||
`ifdef EXT_T_HOPPER
|
||||
assign tmem_A_ren[block_idx] = 1'b0;
|
||||
assign tmem_A_raddr[block_idx * 9 +: 9] = '0;
|
||||
assign tmem_C_wen[block_idx] = 1'b0;
|
||||
assign tmem_C_ren[block_idx] = 1'b0;
|
||||
assign tmem_C_waddr[block_idx * 9 +: 9] = '0;
|
||||
assign tmem_C_raddr[block_idx * 9 +: 9] = '0;
|
||||
assign tmem_C_wdata[block_idx * `NUM_THREADS*`XLEN +: `NUM_THREADS*`XLEN] = '0;
|
||||
assign tmem_C_mask[block_idx * `NUM_THREADS*`XLEN/8 +: `NUM_THREADS*`XLEN/8] = '0;
|
||||
VX_tensor_hopper_core_block #(
|
||||
.ISW(1), // FIXME: not block_idx
|
||||
.FP16(FP16)
|
||||
@@ -64,9 +123,35 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.execute_if (execute_if[block_idx]),
|
||||
.regfile_if (regfile_if),
|
||||
.smem_A_if (smem_A_if),
|
||||
.smem_B_if (smem_B_if),
|
||||
.regfile_if (regfile_if[block_idx]),
|
||||
.smem_A_if (smem_A_if[block_idx]),
|
||||
.smem_B_if (smem_B_if[block_idx]),
|
||||
.commit_if (commit_block_if[block_idx])
|
||||
);
|
||||
`elsif EXT_T_BLACKWELL
|
||||
VX_tensor_blackwell_core_block #(
|
||||
.ISW(1),
|
||||
.FP16(FP16)
|
||||
) tensor_blackwell_core_block (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.execute_if (execute_if[block_idx]),
|
||||
.regfile_if (regfile_if[block_idx]),
|
||||
.tmem_if (smem_A_if[block_idx]),
|
||||
.tmem_A_ren (tmem_A_ren[block_idx]),
|
||||
.tmem_A_rready(tmem_A_rready[block_idx]),
|
||||
.tmem_A_raddr(tmem_A_raddr[block_idx * 9 +: 9]),
|
||||
.tmem_A_rdata(tmem_A_rdata[block_idx * `NUM_THREADS*`XLEN +: `NUM_THREADS*`XLEN]),
|
||||
.tmem_C_ren (tmem_C_ren[block_idx]),
|
||||
.tmem_C_rready(tmem_C_rready[block_idx]),
|
||||
.tmem_C_raddr(tmem_C_raddr[block_idx * 9 +: 9]),
|
||||
.tmem_C_rdata(tmem_C_rdata[block_idx * `NUM_THREADS*`XLEN +: `NUM_THREADS*`XLEN]),
|
||||
.tmem_C_wen (tmem_C_wen[block_idx]),
|
||||
.tmem_C_wready(tmem_C_wready[block_idx]),
|
||||
.tmem_C_waddr(tmem_C_waddr[block_idx * 9 +: 9]),
|
||||
.tmem_C_wdata(tmem_C_wdata[block_idx * `NUM_THREADS*`XLEN +: `NUM_THREADS*`XLEN]),
|
||||
.tmem_C_mask(tmem_C_mask[block_idx * `NUM_THREADS*`XLEN/8 +: `NUM_THREADS*`XLEN/8]),
|
||||
.smem_B_if (smem_B_if[block_idx]),
|
||||
.commit_if (commit_block_if[block_idx])
|
||||
);
|
||||
// ) tensor_hopper_core_block (
|
||||
|
||||
179
hw/rtl/core/VX_tensor_ctrl_unit.sv
Normal file
179
hw/rtl/core/VX_tensor_ctrl_unit.sv
Normal file
@@ -0,0 +1,179 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
module VX_tensor_ctrl_unit import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
|
||||
VX_commit_if.master commit_if [`ISSUE_WIDTH],
|
||||
VX_warp_ctl_if.master warp_ctl_if,
|
||||
|
||||
output wire csr_unlock_valid,
|
||||
output wire [`NW_WIDTH-1:0] csr_unlock_wid,
|
||||
|
||||
output wire tmc_valid,
|
||||
output wire [`NW_WIDTH-1:0] tmc_wid,
|
||||
output wire [`NUM_THREADS-1:0] tmc_tmask
|
||||
);
|
||||
localparam BLOCK_SIZE = 1;
|
||||
localparam NUM_LANES = `NUM_THREADS;
|
||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||
localparam PID_WIDTH = `UP(PID_BITS);
|
||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS
|
||||
+ 1 + (NUM_LANES * `XLEN) + 1 + PID_WIDTH + 1 + 1
|
||||
+ 1 + 1 + `NUM_THREADS + $bits(barrier_t);
|
||||
|
||||
VX_execute_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) execute_if[BLOCK_SIZE]();
|
||||
|
||||
`RESET_RELAY (dispatch_reset, reset);
|
||||
|
||||
VX_dispatch_unit_sane #(
|
||||
.BLOCK_SIZE (BLOCK_SIZE),
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.OUT_REG (1)
|
||||
) dispatch_unit (
|
||||
.clk (clk),
|
||||
.reset (dispatch_reset),
|
||||
.dispatch_if(dispatch_if),
|
||||
.execute_if (execute_if)
|
||||
);
|
||||
|
||||
VX_commit_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) ctrl_commit_if[BLOCK_SIZE]();
|
||||
|
||||
wire is_tmc = (execute_if[0].data.op_type == `INST_SFU_TMC);
|
||||
wire is_csr = (execute_if[0].data.op_type == `INST_SFU_CSRRS);
|
||||
wire is_bar = (execute_if[0].data.op_type == `INST_SFU_BAR);
|
||||
wire is_bar_mask = (execute_if[0].data.op_type == `INST_SFU_BAR_MASK);
|
||||
wire [`VX_CSR_ADDR_BITS-1:0] csr_addr = execute_if[0].data.imm[`VX_CSR_ADDR_BITS-1:0];
|
||||
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] csr_read_data;
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
wire [31:0] wtid = 32'(execute_if[0].data.pid * NUM_LANES + i);
|
||||
wire [31:0] gtid = (32'(CORE_ID) << (`NW_BITS + `NT_BITS))
|
||||
+ (32'(execute_if[0].data.wid) << `NT_BITS)
|
||||
+ wtid;
|
||||
assign csr_read_data[i] =
|
||||
(csr_addr == `VX_CSR_THREAD_ID) ? `XLEN'(wtid) :
|
||||
(csr_addr == `VX_CSR_WARP_ID) ? `XLEN'(execute_if[0].data.wid) :
|
||||
(csr_addr == `VX_CSR_CORE_ID) ? `XLEN'(CORE_ID) :
|
||||
(csr_addr == `VX_CSR_MHARTID) ? `XLEN'(gtid) :
|
||||
(csr_addr == `VX_CSR_NUM_THREADS) ? `XLEN'(`NUM_THREADS) :
|
||||
(csr_addr == `VX_CSR_NUM_WARPS) ? `XLEN'(`NUM_WARPS) :
|
||||
(csr_addr == `VX_CSR_NUM_CORES) ? `XLEN'(`NUM_CORES * `NUM_CLUSTERS) :
|
||||
`XLEN'(0);
|
||||
end
|
||||
|
||||
wire [`NUM_THREADS-1:0] req_tmc_tmask = execute_if[0].data.rs1_data[0][`NUM_THREADS-1:0];
|
||||
wire [`XLEN-1:0] req_rs1_data = execute_if[0].data.rs1_data[0];
|
||||
wire [`XLEN-1:0] req_rs2_data = execute_if[0].data.rs2_data[0];
|
||||
|
||||
barrier_t barrier, barrier_r;
|
||||
assign barrier.valid = is_bar || is_bar_mask;
|
||||
assign barrier.id = req_rs1_data[`NB_WIDTH-1:0];
|
||||
`ifdef GBAR_ENABLE
|
||||
`ifdef GBAR_CLUSTER_ENABLE
|
||||
assign barrier.is_global = 1'b1;
|
||||
`else
|
||||
assign barrier.is_global = req_rs1_data[31];
|
||||
`endif
|
||||
`else
|
||||
assign barrier.is_global = 1'b0;
|
||||
`endif
|
||||
assign barrier.domain = is_bar_mask ? BARRIER_MASK : req_rs1_data[29:28];
|
||||
assign barrier.mask = req_rs2_data[`NUM_WARPS-1:0];
|
||||
assign barrier.size_m1 = req_rs2_data[$bits(barrier.size_m1)-1:0] - $bits(barrier.size_m1)'(1);
|
||||
|
||||
`ifdef SIMULATION
|
||||
wire tensor_ctrl_fire = execute_if[0].valid && execute_if[0].ready && execute_if[0].data.eop;
|
||||
`RUNTIME_ASSERT(
|
||||
!tensor_ctrl_fire || is_tmc || is_csr || is_bar || is_bar_mask,
|
||||
("%t: *** core%0d-tensor-ctrl-illegal-op: wid=%0d PC=0x%0h op=0x%0h",
|
||||
$time, CORE_ID, execute_if[0].data.wid, execute_if[0].data.PC, execute_if[0].data.op_type)
|
||||
)
|
||||
`RUNTIME_ASSERT(
|
||||
!tensor_ctrl_fire || !is_csr
|
||||
|| (csr_addr == `VX_CSR_THREAD_ID)
|
||||
|| (csr_addr == `VX_CSR_WARP_ID)
|
||||
|| (csr_addr == `VX_CSR_CORE_ID)
|
||||
|| (csr_addr == `VX_CSR_MHARTID)
|
||||
|| (csr_addr == `VX_CSR_NUM_THREADS)
|
||||
|| (csr_addr == `VX_CSR_NUM_WARPS)
|
||||
|| (csr_addr == `VX_CSR_NUM_CORES),
|
||||
("%t: *** core%0d-tensor-ctrl-illegal-csr: wid=%0d PC=0x%0h csr=0x%0h",
|
||||
$time, CORE_ID, execute_if[0].data.wid, execute_if[0].data.PC, csr_addr)
|
||||
)
|
||||
`RUNTIME_ASSERT(
|
||||
!tensor_ctrl_fire || !is_bar_mask || barrier.mask != '0,
|
||||
("%t: *** core%0d-tensor-ctrl-invalid-barrier-mask: wid=%0d id=%0d raw_mask=0x%0h",
|
||||
$time, CORE_ID, execute_if[0].data.wid, barrier.id, barrier.mask)
|
||||
)
|
||||
`endif
|
||||
|
||||
wire rsp_valid_in = execute_if[0].valid;
|
||||
wire rsp_ready_in;
|
||||
assign execute_if[0].ready = rsp_ready_in;
|
||||
|
||||
wire [`NUM_THREADS-1:0] tmc_tmask_r;
|
||||
wire is_tmc_r;
|
||||
wire is_csr_r;
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (2)
|
||||
) rsp_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (rsp_valid_in),
|
||||
.ready_in (rsp_ready_in),
|
||||
.data_in ({execute_if[0].data.uuid, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.rd, execute_if[0].data.wb, csr_read_data, 1'b0, execute_if[0].data.pid, execute_if[0].data.sop, execute_if[0].data.eop, is_tmc, is_csr, req_tmc_tmask, barrier}),
|
||||
.data_out ({ctrl_commit_if[0].data.uuid, ctrl_commit_if[0].data.wid, ctrl_commit_if[0].data.tmask, ctrl_commit_if[0].data.PC, ctrl_commit_if[0].data.rd, ctrl_commit_if[0].data.wb, ctrl_commit_if[0].data.data, ctrl_commit_if[0].data.tensor, ctrl_commit_if[0].data.pid, ctrl_commit_if[0].data.sop, ctrl_commit_if[0].data.eop, is_tmc_r, is_csr_r, tmc_tmask_r, barrier_r}),
|
||||
.valid_out (ctrl_commit_if[0].valid),
|
||||
.ready_out (ctrl_commit_if[0].ready)
|
||||
);
|
||||
|
||||
assign csr_unlock_valid = ctrl_commit_if[0].valid && ctrl_commit_if[0].ready && ctrl_commit_if[0].data.eop && is_csr_r;
|
||||
assign csr_unlock_wid = ctrl_commit_if[0].data.wid;
|
||||
|
||||
assign tmc_valid = ctrl_commit_if[0].valid && ctrl_commit_if[0].ready && ctrl_commit_if[0].data.eop && is_tmc_r;
|
||||
assign tmc_wid = ctrl_commit_if[0].data.wid;
|
||||
assign tmc_tmask = tmc_tmask_r;
|
||||
|
||||
assign warp_ctl_if.valid = ctrl_commit_if[0].valid && ctrl_commit_if[0].ready && ctrl_commit_if[0].data.eop && barrier_r.valid;
|
||||
assign warp_ctl_if.wid = ctrl_commit_if[0].data.wid;
|
||||
assign warp_ctl_if.tmc = '0;
|
||||
assign warp_ctl_if.wspawn = '0;
|
||||
assign warp_ctl_if.split = '0;
|
||||
assign warp_ctl_if.sjoin = '0;
|
||||
assign warp_ctl_if.barrier = barrier_r;
|
||||
|
||||
VX_gather_unit #(
|
||||
.BLOCK_SIZE (BLOCK_SIZE),
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.OUT_REG (1)
|
||||
) gather_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.commit_in_if (ctrl_commit_if),
|
||||
.commit_out_if (commit_if)
|
||||
);
|
||||
|
||||
endmodule
|
||||
@@ -177,6 +177,12 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
|
||||
// add offset of 32 for fp regs
|
||||
assign regfile_if.req_data.rs[5] = 1'b1;
|
||||
assign regfile_if.req_data.wis = '0;
|
||||
assign smem_A_if.req_data.rw = 1'b0;
|
||||
assign smem_A_if.req_data.byteen = '1;
|
||||
assign smem_A_if.req_data.data = '0;
|
||||
assign smem_B_if.req_data.rw = 1'b0;
|
||||
assign smem_B_if.req_data.byteen = '1;
|
||||
assign smem_B_if.req_data.data = '0;
|
||||
`STATIC_ASSERT((`ISSUE_WIDTH == `NUM_WARPS),
|
||||
("static assertion failed: tensor_hopper_core assumes ISSUE_WIDTH == NUM_WARPS"))
|
||||
|
||||
|
||||
@@ -22,6 +22,9 @@ task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type);
|
||||
`EX_LSU: `TRACE(level, ("LSU"));
|
||||
`EX_FPU: `TRACE(level, ("FPU"));
|
||||
`EX_SFU: `TRACE(level, ("SFU"));
|
||||
`ifdef EXT_T_ENABLE
|
||||
`EX_TENSOR: `TRACE(level, ("TENSOR"));
|
||||
`endif
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
endtask
|
||||
@@ -36,22 +39,26 @@ task trace_ex_op(input int level,
|
||||
`UNUSED_ARG(input [`XLEN-1:0] imm)
|
||||
);
|
||||
|
||||
logic fdst_d;
|
||||
logic fcvt_l;
|
||||
logic rd_float;
|
||||
|
||||
`ifdef FLEN_64
|
||||
logic fdst_d = imm[0];
|
||||
fdst_d = imm[0];
|
||||
`else
|
||||
logic fdst_d = 0;
|
||||
fdst_d = 0;
|
||||
`endif
|
||||
|
||||
`ifdef XLEN_64
|
||||
logic fcvt_l = imm[1];
|
||||
fcvt_l = imm[1];
|
||||
`else
|
||||
logic fcvt_l = 0;
|
||||
fcvt_l = 0;
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
logic rd_float = 1'(rd >> 5) || 1'(rs2 >> 5);
|
||||
rd_float = 1'(rd >> 5) || 1'(rs2 >> 5);
|
||||
`else
|
||||
logic rd_float = 0;
|
||||
rd_float = 0;
|
||||
`endif
|
||||
|
||||
case (ex_type)
|
||||
@@ -353,12 +360,29 @@ task trace_ex_op(input int level,
|
||||
`INST_SFU_JOIN: `TRACE(level, ("JOIN"));
|
||||
`INST_SFU_BAR: `TRACE(level, ("BAR"));
|
||||
`INST_SFU_PRED: `TRACE(level, ("PRED"));
|
||||
`INST_SFU_BAR_MASK:`TRACE(level, ("BAR_MASK"));
|
||||
`INST_SFU_CSRRW: begin if (use_imm) `TRACE(level, ("CSRRWI")); else `TRACE(level, ("CSRRW")); end
|
||||
`INST_SFU_CSRRS: begin if (use_imm) `TRACE(level, ("CSRRSI")); else `TRACE(level, ("CSRRS")); end
|
||||
`INST_SFU_CSRRC: begin if (use_imm) `TRACE(level, ("CSRRCI")); else `TRACE(level, ("CSRRC")); end
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
end
|
||||
`ifdef EXT_T_ENABLE
|
||||
`EX_TENSOR: begin
|
||||
case (`INST_ALU_BITS'(op_type))
|
||||
`INST_TENSOR_HMMA: `TRACE(level, ("HMMA"));
|
||||
`INST_TENSOR_HGMMA: `TRACE(level, ("HGMMA"));
|
||||
`INST_TENSOR_HGMMA_WAIT: `TRACE(level, ("HGMMA_WAIT"));
|
||||
`INST_TENSOR_TCGEN05_CP: `TRACE(level, ("TCGEN05_CP"));
|
||||
`INST_TENSOR_TCGEN05_CP_WAIT: `TRACE(level, ("TCGEN05_CP_WAIT"));
|
||||
`INST_TENSOR_BWGMMA: `TRACE(level, ("BWGMMA"));
|
||||
`INST_TENSOR_BWGMMA_WAIT: `TRACE(level, ("BWGMMA_WAIT"));
|
||||
`INST_TENSOR_TCGEN05_LD: `TRACE(level, ("TCGEN05_LD"));
|
||||
`INST_TENSOR_TCGEN05_ST: `TRACE(level, ("TCGEN05_ST"));
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
end
|
||||
`endif
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
endtask
|
||||
|
||||
@@ -183,7 +183,7 @@ end
|
||||
// merging the 2 always blocks leads to spurious UNOPTFLAT verilator lint,
|
||||
// but conceptually they should be linked
|
||||
always @(*) begin
|
||||
`ifdef EXT_T_HOPPER
|
||||
`ifdef EXT_T_ASYNC
|
||||
// for Hopper, disable micro-op blitzing. Set/step is managed
|
||||
// microarchitecturally in an FSM inside the tensor core.
|
||||
use_uop = 1'b0;
|
||||
|
||||
@@ -32,7 +32,8 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
|
||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||
localparam PID_WIDTH = `UP(PID_BITS);
|
||||
localparam WCTL_WIDTH = $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t);
|
||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + WCTL_WIDTH + 1 + PID_WIDTH + 1 + 1;
|
||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS
|
||||
+ 1 + WCTL_WIDTH + 1 + PID_WIDTH + 1 + 1;
|
||||
|
||||
`UNUSED_VAR (execute_if.data.rs3_data)
|
||||
|
||||
@@ -43,11 +44,13 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
|
||||
barrier_t barrier, barrier_r;
|
||||
|
||||
wire is_wspawn = (execute_if.data.op_type == `INST_SFU_WSPAWN);
|
||||
wire is_wspawn_mask = is_wspawn && execute_if.data.op_mod[0];
|
||||
wire is_tmc = (execute_if.data.op_type == `INST_SFU_TMC);
|
||||
wire is_pred = (execute_if.data.op_type == `INST_SFU_PRED);
|
||||
wire is_split = (execute_if.data.op_type == `INST_SFU_SPLIT);
|
||||
wire is_join = (execute_if.data.op_type == `INST_SFU_JOIN);
|
||||
wire is_bar = (execute_if.data.op_type == `INST_SFU_BAR);
|
||||
wire is_bar_mask = (execute_if.data.op_type == `INST_SFU_BAR_MASK);
|
||||
|
||||
wire [`UP(LANE_BITS)-1:0] tid;
|
||||
if (LANE_BITS != 0) begin
|
||||
@@ -106,7 +109,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
|
||||
assign sjoin.is_dvg = rs1_data[0];
|
||||
|
||||
// barrier
|
||||
assign barrier.valid = is_bar;
|
||||
assign barrier.valid = is_bar || is_bar_mask;
|
||||
assign barrier.id = rs1_data[`NB_WIDTH-1:0];
|
||||
`ifdef GBAR_ENABLE
|
||||
`ifdef GBAR_CLUSTER_ENABLE
|
||||
@@ -119,18 +122,35 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
|
||||
`else
|
||||
assign barrier.is_global = 1'b0;
|
||||
`endif
|
||||
assign barrier.domain = is_bar_mask ? BARRIER_MASK : rs1_data[29:28];
|
||||
assign barrier.mask = rs2_data[`NUM_WARPS-1:0];
|
||||
assign barrier.size_m1 = rs2_data[$bits(barrier.size_m1)-1:0] - $bits(barrier.size_m1)'(1);
|
||||
|
||||
// wspawn
|
||||
|
||||
wire [`NUM_WARPS-1:0] wspawn_wmask;
|
||||
wire [`NUM_WARPS-1:0] legacy_wspawn_wmask;
|
||||
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
|
||||
assign wspawn_wmask[i] = (i < rs1_data[`NW_BITS:0]) && (i != execute_if.data.wid);
|
||||
assign legacy_wspawn_wmask[i] = (i < rs1_data[`NW_BITS:0]) && (i != execute_if.data.wid);
|
||||
end
|
||||
wire [`NUM_WARPS-1:0] masked_wspawn_wmask = rs1_data[`NUM_WARPS-1:0] & ~(`NUM_WARPS'(1) << execute_if.data.wid);
|
||||
assign wspawn.valid = is_wspawn;
|
||||
assign wspawn.wmask = wspawn_wmask;
|
||||
assign wspawn.wmask = is_wspawn_mask ? masked_wspawn_wmask : legacy_wspawn_wmask;
|
||||
assign wspawn.pc = rs2_data;
|
||||
|
||||
`ifdef SIMULATION
|
||||
wire wctl_fire = execute_if.valid && execute_if.ready && execute_if.data.eop;
|
||||
`RUNTIME_ASSERT(
|
||||
!wctl_fire || !is_wspawn_mask || masked_wspawn_wmask != '0,
|
||||
("%t: *** core%0d-wctl-invalid-wspawn-mask: wid=%0d raw_mask=0x%0h pc=0x%0h",
|
||||
$time, CORE_ID, execute_if.data.wid, rs1_data[`NUM_WARPS-1:0], rs2_data)
|
||||
)
|
||||
`RUNTIME_ASSERT(
|
||||
!wctl_fire || !is_bar_mask || barrier.mask != '0,
|
||||
("%t: *** core%0d-wctl-invalid-barrier-mask: wid=%0d id=%0d raw_mask=0x%0h",
|
||||
$time, CORE_ID, execute_if.data.wid, barrier.id, barrier.mask)
|
||||
)
|
||||
`endif
|
||||
|
||||
// response
|
||||
|
||||
VX_elastic_buffer #(
|
||||
|
||||
@@ -325,12 +325,8 @@ module VX_tensor_threadgroup #(
|
||||
.io_in_valid (fedp_fire_in),
|
||||
.io_in_bits_a_0 (A_frag[d_row][0]),
|
||||
.io_in_bits_a_1 (A_frag[d_row][1]),
|
||||
.io_in_bits_a_2 (32'h0),
|
||||
.io_in_bits_a_3 (32'h0),
|
||||
.io_in_bits_b_0 (B_frag[0][d_col_sel]),
|
||||
.io_in_bits_b_1 (B_frag[1][d_col_sel]),
|
||||
.io_in_bits_b_2 (32'h0),
|
||||
.io_in_bits_b_3 (32'h0),
|
||||
.io_in_bits_c (C_frag[d_row][d_col_sel]),
|
||||
.io_stall (stall),
|
||||
.io_out_valid (fedp_valids[i]),
|
||||
|
||||
@@ -17,6 +17,13 @@ interface VX_pipeline_perf_if ();
|
||||
wire [`PERF_CTR_BITS-1:0] sched_idles;
|
||||
wire [`PERF_CTR_BITS-1:0] sched_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] sched_barrier_idles;
|
||||
wire [`PERF_CTR_BITS-1:0] scalar_sched_ready_cycles;
|
||||
wire [`PERF_CTR_BITS-1:0] tensor_sched_ready_cycles;
|
||||
wire [`PERF_CTR_BITS-1:0] scalar_sched_issued_cycles;
|
||||
wire [`PERF_CTR_BITS-1:0] tensor_sched_issued_cycles;
|
||||
wire [`PERF_CTR_BITS-1:0] illegal_tensor_reg_access;
|
||||
wire [`PERF_CTR_BITS-1:0] illegal_tensor_scalar_op;
|
||||
wire [`PERF_CTR_BITS-1:0] illegal_scalar_tensor_op;
|
||||
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] scb_any_unit_uses;
|
||||
@@ -34,11 +41,26 @@ interface VX_pipeline_perf_if ();
|
||||
wire [`PERF_CTR_BITS-1:0] stores;
|
||||
wire [`PERF_CTR_BITS-1:0] ifetch_latency;
|
||||
wire [`PERF_CTR_BITS-1:0] load_latency;
|
||||
wire [`PERF_CTR_BITS-1:0] scalar_lsu_reqs;
|
||||
wire [`PERF_CTR_BITS-1:0] tensor_lsu_reqs;
|
||||
wire [`PERF_CTR_BITS-1:0] scalar_lsu_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] tensor_lsu_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] mem_merge_stalls;
|
||||
|
||||
modport schedule (
|
||||
output sched_idles,
|
||||
output sched_barrier_idles,
|
||||
output sched_stalls
|
||||
output sched_stalls,
|
||||
output scalar_sched_ready_cycles,
|
||||
output tensor_sched_ready_cycles,
|
||||
output scalar_sched_issued_cycles,
|
||||
output tensor_sched_issued_cycles
|
||||
);
|
||||
|
||||
modport decode (
|
||||
output illegal_tensor_reg_access,
|
||||
output illegal_tensor_scalar_op,
|
||||
output illegal_scalar_tensor_op
|
||||
);
|
||||
|
||||
modport issue (
|
||||
@@ -55,10 +77,25 @@ interface VX_pipeline_perf_if ();
|
||||
output dispatch_any_fire_cycles
|
||||
);
|
||||
|
||||
modport execute (
|
||||
output scalar_lsu_reqs,
|
||||
output tensor_lsu_reqs,
|
||||
output scalar_lsu_stalls,
|
||||
output tensor_lsu_stalls,
|
||||
output mem_merge_stalls
|
||||
);
|
||||
|
||||
modport slave (
|
||||
input sched_idles,
|
||||
input sched_barrier_idles,
|
||||
input sched_stalls,
|
||||
input scalar_sched_ready_cycles,
|
||||
input tensor_sched_ready_cycles,
|
||||
input scalar_sched_issued_cycles,
|
||||
input tensor_sched_issued_cycles,
|
||||
input illegal_tensor_reg_access,
|
||||
input illegal_tensor_scalar_op,
|
||||
input illegal_scalar_tensor_op,
|
||||
input ibf_stalls,
|
||||
input scb_stalls,
|
||||
input scb_any_unit_uses,
|
||||
@@ -74,7 +111,12 @@ interface VX_pipeline_perf_if ();
|
||||
input loads,
|
||||
input stores,
|
||||
input ifetch_latency,
|
||||
input load_latency
|
||||
input load_latency,
|
||||
input scalar_lsu_reqs,
|
||||
input tensor_lsu_reqs,
|
||||
input scalar_lsu_stalls,
|
||||
input tensor_lsu_stalls,
|
||||
input mem_merge_stalls
|
||||
);
|
||||
|
||||
endinterface
|
||||
|
||||
@@ -20,7 +20,10 @@ interface VX_tc_bus_if #(
|
||||
)();
|
||||
|
||||
typedef struct packed {
|
||||
logic rw;
|
||||
logic [DATA_SIZE-1:0] byteen;
|
||||
logic [ADDR_WIDTH-1:0] addr;
|
||||
logic [DATA_SIZE*8-1:0] data;
|
||||
logic [TAG_WIDTH-1:0] tag;
|
||||
} req_data_t;
|
||||
|
||||
|
||||
701
kernel/include/VX_config.h
Normal file
701
kernel/include/VX_config.h
Normal file
@@ -0,0 +1,701 @@
|
||||
// auto-generated by gen_config.py. DO NOT EDIT
|
||||
// Generated at 2024-05-07 13:55:58.398687
|
||||
|
||||
// Translated from ./rtl/VX_config.vh:
|
||||
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef VX_CONFIG_VH
|
||||
#define VX_CONFIG_VH
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
|
||||
#endif
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
|
||||
#endif
|
||||
|
||||
#ifndef CLAMP
|
||||
#define CLAMP(x, lo, hi) (((x) > (hi)) ? (hi) : (((x) < (lo)) ? (lo) : (x)))
|
||||
#endif
|
||||
|
||||
#ifndef UP
|
||||
#define UP(x) (((x) != 0) ? (x) : 1)
|
||||
#endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef EXT_M_DISABLE
|
||||
#define EXT_M_ENABLE
|
||||
#endif
|
||||
|
||||
#ifndef EXT_F_DISABLE
|
||||
#define EXT_F_ENABLE
|
||||
#endif
|
||||
|
||||
#ifndef XLEN_32
|
||||
#ifndef XLEN_64
|
||||
#define XLEN_32
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef XLEN_64
|
||||
#define XLEN 64
|
||||
#endif
|
||||
|
||||
#ifdef XLEN_32
|
||||
#define XLEN 32
|
||||
#endif
|
||||
|
||||
#ifdef EXT_D_ENABLE
|
||||
#define FLEN_64
|
||||
#else
|
||||
#define FLEN_32
|
||||
#endif
|
||||
|
||||
#ifdef FLEN_64
|
||||
#define FLEN 64
|
||||
#endif
|
||||
|
||||
#ifdef FLEN_32
|
||||
#define FLEN 32
|
||||
#endif
|
||||
|
||||
#ifdef XLEN_64
|
||||
#ifdef FLEN_32
|
||||
#define FPU_RV64F
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef NUM_CLUSTERS
|
||||
#define NUM_CLUSTERS 1
|
||||
#endif
|
||||
|
||||
#ifndef NUM_CORES
|
||||
#define NUM_CORES 1
|
||||
#endif
|
||||
|
||||
#ifndef NUM_WARPS
|
||||
#define NUM_WARPS 4
|
||||
#endif
|
||||
|
||||
#ifndef NUM_TENSOR_WARPS
|
||||
#define NUM_TENSOR_WARPS 2
|
||||
#endif
|
||||
|
||||
#define NUM_SCALAR_WARPS (NUM_WARPS - NUM_TENSOR_WARPS)
|
||||
|
||||
#define IS_SCALAR_WARP(wid) ((wid) < NUM_SCALAR_WARPS)
|
||||
#define IS_TENSOR_WARP(wid) ((wid) >= NUM_SCALAR_WARPS)
|
||||
|
||||
#ifndef TENSOR_NUM_GPRS
|
||||
#define TENSOR_NUM_GPRS 8
|
||||
#endif
|
||||
|
||||
#ifndef TENSOR_NUM_FPRS
|
||||
#define TENSOR_NUM_FPRS 8
|
||||
#endif
|
||||
|
||||
#ifndef NUM_THREADS
|
||||
#define NUM_THREADS 4
|
||||
#endif
|
||||
|
||||
#ifndef NUM_BARRIERS
|
||||
#define NUM_BARRIERS 8
|
||||
#endif
|
||||
|
||||
#ifndef SOCKET_SIZE
|
||||
#define SOCKET_SIZE MIN(4, NUM_CORES)
|
||||
#endif
|
||||
#define NUM_SOCKETS UP(NUM_CORES / SOCKET_SIZE)
|
||||
|
||||
#ifdef L2_ENABLE
|
||||
#define L2_ENABLED 1
|
||||
#else
|
||||
#define L2_ENABLED 0
|
||||
#endif
|
||||
|
||||
#ifdef L3_ENABLE
|
||||
#define L3_ENABLED 1
|
||||
#else
|
||||
#define L3_ENABLED 0
|
||||
#endif
|
||||
|
||||
#ifdef L1_DISABLE
|
||||
#define ICACHE_DISABLE
|
||||
#define DCACHE_DISABLE
|
||||
#endif
|
||||
|
||||
#ifndef MEM_BLOCK_SIZE
|
||||
#define MEM_BLOCK_SIZE 64
|
||||
#endif
|
||||
|
||||
#ifndef MEM_ADDR_WIDTH
|
||||
#ifdef XLEN_64
|
||||
#define MEM_ADDR_WIDTH 48
|
||||
#else
|
||||
#define MEM_ADDR_WIDTH 32
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef L1_LINE_SIZE
|
||||
#ifdef L1_DISABLE
|
||||
#define L1_LINE_SIZE ((L2_ENABLED || L3_ENABLED) ? 4 : MEM_BLOCK_SIZE)
|
||||
#else
|
||||
#define L1_LINE_SIZE ((L2_ENABLED || L3_ENABLED) ? 16 : MEM_BLOCK_SIZE)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef L2_ENABLE
|
||||
#define L2_LINE_SIZE MEM_BLOCK_SIZE
|
||||
#else
|
||||
#define L2_LINE_SIZE L1_LINE_SIZE
|
||||
#endif
|
||||
|
||||
#ifdef L3_ENABLE
|
||||
#define L3_LINE_SIZE MEM_BLOCK_SIZE
|
||||
#else
|
||||
#define L3_LINE_SIZE L2_LINE_SIZE
|
||||
#endif
|
||||
|
||||
#ifdef XLEN_64
|
||||
|
||||
#ifndef STARTUP_ADDR
|
||||
#define STARTUP_ADDR 0x180000000
|
||||
#endif
|
||||
|
||||
#ifndef STACK_BASE_ADDR
|
||||
#define STACK_BASE_ADDR 0x1FF000000
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#ifndef STARTUP_ADDR
|
||||
#define STARTUP_ADDR 0x80000000
|
||||
#endif
|
||||
|
||||
#ifndef STACK_BASE_ADDR
|
||||
#define STACK_BASE_ADDR 0xFF000000
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef SMEM_BASE_ADDR
|
||||
#define SMEM_BASE_ADDR STACK_BASE_ADDR
|
||||
#endif
|
||||
|
||||
#ifndef SMEM_LOG_SIZE
|
||||
#define SMEM_LOG_SIZE 19
|
||||
#endif
|
||||
|
||||
#ifndef IO_BASE_ADDR
|
||||
#define IO_BASE_ADDR (SMEM_BASE_ADDR + (1 << SMEM_LOG_SIZE))
|
||||
#endif
|
||||
|
||||
#ifndef IO_COUT_ADDR
|
||||
#define IO_COUT_ADDR IO_BASE_ADDR
|
||||
#endif
|
||||
#define IO_COUT_SIZE MEM_BLOCK_SIZE
|
||||
|
||||
#ifndef IO_CSR_ADDR
|
||||
#define IO_CSR_ADDR (IO_COUT_ADDR + IO_COUT_SIZE)
|
||||
#endif
|
||||
#define IO_CSR_SIZE (4 * 64 * NUM_CORES * NUM_CLUSTERS)
|
||||
|
||||
#ifndef STACK_LOG2_SIZE
|
||||
#define STACK_LOG2_SIZE 13
|
||||
#endif
|
||||
#define STACK_SIZE (1 << STACK_LOG2_SIZE)
|
||||
|
||||
#define RESET_DELAY 8
|
||||
|
||||
#ifndef STALL_TIMEOUT
|
||||
#define STALL_TIMEOUT (100000 * (1 ** (L2_ENABLED + L3_ENABLED)))
|
||||
#endif
|
||||
|
||||
#ifndef SV_DPI
|
||||
#define DPI_DISABLE
|
||||
#endif
|
||||
|
||||
#ifndef FPU_FPNEW
|
||||
#ifndef FPU_DSP
|
||||
#ifndef FPU_DPI
|
||||
#ifndef SYNTHESIS
|
||||
#ifndef DPI_DISABLE
|
||||
#define FPU_DPI
|
||||
#else
|
||||
#define FPU_DSP
|
||||
#endif
|
||||
#else
|
||||
#define FPU_DSP
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef SYNTHESIS
|
||||
#ifndef DPI_DISABLE
|
||||
#define IMUL_DPI
|
||||
#define IDIV_DPI
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef DEBUG_LEVEL
|
||||
#define DEBUG_LEVEL 3
|
||||
#endif
|
||||
|
||||
// Pipeline Configuration /////////////////////////////////////////////////////
|
||||
|
||||
// Issue width
|
||||
#ifndef ISSUE_WIDTH
|
||||
#define ISSUE_WIDTH NUM_WARPS
|
||||
#endif
|
||||
|
||||
// Number of ALU units
|
||||
#ifndef NUM_ALU_LANES
|
||||
#define NUM_ALU_LANES NUM_THREADS
|
||||
#endif
|
||||
#ifndef NUM_ALU_BLOCKS
|
||||
#define NUM_ALU_BLOCKS 4
|
||||
#endif
|
||||
|
||||
// Number of FPU units
|
||||
#ifndef NUM_FPU_LANES
|
||||
#define NUM_FPU_LANES NUM_THREADS
|
||||
#endif
|
||||
#ifndef NUM_FPU_BLOCKS
|
||||
#define NUM_FPU_BLOCKS 2
|
||||
#endif
|
||||
|
||||
// Number of LSU units
|
||||
#ifndef NUM_LSU_LANES
|
||||
#define NUM_LSU_LANES NUM_THREADS
|
||||
#endif
|
||||
|
||||
// Number of SFU units
|
||||
#ifndef NUM_SFU_LANES
|
||||
#define NUM_SFU_LANES MIN(NUM_THREADS, 4)
|
||||
#endif
|
||||
|
||||
// Size of Instruction Buffer
|
||||
#ifndef IBUF_SIZE
|
||||
#define IBUF_SIZE (4 * ISSUE_WIDTH)
|
||||
#endif
|
||||
|
||||
// Size of LSU Request Queue
|
||||
#ifndef LSUQ_SIZE
|
||||
#define LSUQ_SIZE (4 * NUM_WARPS * (NUM_THREADS / NUM_LSU_LANES))
|
||||
#endif
|
||||
|
||||
// LSU Duplicate Address Check
|
||||
#ifndef LSU_DUP_DISABLE
|
||||
#define LSU_DUP_ENABLE
|
||||
#endif
|
||||
#ifdef LSU_DUP_ENABLE
|
||||
#define LSU_DUP_ENABLED 1
|
||||
#else
|
||||
#define LSU_DUP_ENABLED 0
|
||||
#endif
|
||||
|
||||
#ifdef GBAR_ENABLE
|
||||
#define GBAR_ENABLED 1
|
||||
#else
|
||||
#define GBAR_ENABLED 0
|
||||
#endif
|
||||
|
||||
#ifndef LATENCY_IMUL
|
||||
#ifdef VIVADO
|
||||
#define LATENCY_IMUL 4
|
||||
#endif
|
||||
#ifdef QUARTUS
|
||||
#define LATENCY_IMUL 3
|
||||
#endif
|
||||
#ifndef LATENCY_IMUL
|
||||
#define LATENCY_IMUL 4
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Floating-Point Units ///////////////////////////////////////////////////////
|
||||
|
||||
// Size of FPU Request Queue
|
||||
#ifndef FPUQ_SIZE
|
||||
#define FPUQ_SIZE (2 * (NUM_THREADS / NUM_FPU_LANES))
|
||||
#endif
|
||||
|
||||
// FNCP Latency
|
||||
#ifndef LATENCY_FNCP
|
||||
#define LATENCY_FNCP 2
|
||||
#endif
|
||||
|
||||
// FMA Latency
|
||||
#ifndef LATENCY_FMA
|
||||
#ifdef FPU_DPI
|
||||
#define LATENCY_FMA 4
|
||||
#endif
|
||||
#ifdef FPU_FPNEW
|
||||
#define LATENCY_FMA 4
|
||||
#endif
|
||||
#ifdef FPU_DSP
|
||||
#ifdef QUARTUS
|
||||
#define LATENCY_FMA 4
|
||||
#endif
|
||||
#ifdef VIVADO
|
||||
#define LATENCY_FMA 16
|
||||
#endif
|
||||
#ifndef LATENCY_FMA
|
||||
#define LATENCY_FMA 4
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// FDIV Latency
|
||||
#ifndef LATENCY_FDIV
|
||||
#ifdef FPU_DPI
|
||||
#define LATENCY_FDIV 15
|
||||
#endif
|
||||
#ifdef FPU_FPNEW
|
||||
#define LATENCY_FDIV 16
|
||||
#endif
|
||||
#ifdef FPU_DSP
|
||||
#ifdef QUARTUS
|
||||
#define LATENCY_FDIV 15
|
||||
#endif
|
||||
#ifdef VIVADO
|
||||
#define LATENCY_FDIV 28
|
||||
#endif
|
||||
#ifndef LATENCY_FDIV
|
||||
#define LATENCY_FDIV 16
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// FSQRT Latency
|
||||
#ifndef LATENCY_FSQRT
|
||||
#ifdef FPU_DPI
|
||||
#define LATENCY_FSQRT 10
|
||||
#endif
|
||||
#ifdef FPU_FPNEW
|
||||
#define LATENCY_FSQRT 16
|
||||
#endif
|
||||
#ifdef FPU_DSP
|
||||
#ifdef QUARTUS
|
||||
#define LATENCY_FSQRT 10
|
||||
#endif
|
||||
#ifdef VIVADO
|
||||
#define LATENCY_FSQRT 28
|
||||
#endif
|
||||
#ifndef LATENCY_FSQRT
|
||||
#define LATENCY_FSQRT 16
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// FCVT Latency
|
||||
#ifndef LATENCY_FCVT
|
||||
#define LATENCY_FCVT 5
|
||||
#endif
|
||||
|
||||
// Icache Configurable Knobs //////////////////////////////////////////////////
|
||||
|
||||
// Cache Enable
|
||||
#ifndef ICACHE_DISABLE
|
||||
#define ICACHE_ENABLE
|
||||
#endif
|
||||
#ifdef ICACHE_ENABLE
|
||||
#define ICACHE_ENABLED 1
|
||||
#else
|
||||
#define ICACHE_ENABLED 0
|
||||
#define NUM_ICACHES 0
|
||||
#endif
|
||||
|
||||
// Number of Cache Units
|
||||
#ifndef NUM_ICACHES
|
||||
#define NUM_ICACHES UP(SOCKET_SIZE / 4)
|
||||
#endif
|
||||
|
||||
// Cache Size
|
||||
#ifndef ICACHE_SIZE
|
||||
#define ICACHE_SIZE 16384
|
||||
#endif
|
||||
|
||||
// Core Response Queue Size
|
||||
#ifndef ICACHE_CRSQ_SIZE
|
||||
#define ICACHE_CRSQ_SIZE 2
|
||||
#endif
|
||||
|
||||
// Miss Handling Register Size
|
||||
#ifndef ICACHE_MSHR_SIZE
|
||||
#define ICACHE_MSHR_SIZE 16
|
||||
#endif
|
||||
|
||||
// Memory Request Queue Size
|
||||
#ifndef ICACHE_MREQ_SIZE
|
||||
#define ICACHE_MREQ_SIZE 4
|
||||
#endif
|
||||
|
||||
// Memory Response Queue Size
|
||||
#ifndef ICACHE_MRSQ_SIZE
|
||||
#define ICACHE_MRSQ_SIZE 0
|
||||
#endif
|
||||
|
||||
// Number of Associative Ways
|
||||
#ifndef ICACHE_NUM_WAYS
|
||||
#define ICACHE_NUM_WAYS 1
|
||||
#endif
|
||||
|
||||
// Dcache Configurable Knobs //////////////////////////////////////////////////
|
||||
|
||||
// Cache Enable
|
||||
#ifndef DCACHE_DISABLE
|
||||
#define DCACHE_ENABLE
|
||||
#endif
|
||||
#ifdef DCACHE_ENABLE
|
||||
#define DCACHE_ENABLED 1
|
||||
#else
|
||||
#define DCACHE_ENABLED 0
|
||||
#define NUM_DCACHES 0
|
||||
#define DCACHE_NUM_BANKS 1
|
||||
#endif
|
||||
|
||||
// Number of Cache Units
|
||||
#ifndef NUM_DCACHES
|
||||
#define NUM_DCACHES UP(SOCKET_SIZE / 4)
|
||||
#endif
|
||||
|
||||
// Cache Size
|
||||
#ifndef DCACHE_SIZE
|
||||
#define DCACHE_SIZE 16384
|
||||
#endif
|
||||
|
||||
// Number of Banks
|
||||
#ifndef DCACHE_NUM_BANKS
|
||||
#define DCACHE_NUM_BANKS NUM_LSU_LANES
|
||||
#endif
|
||||
|
||||
// Core Response Queue Size
|
||||
#ifndef DCACHE_CRSQ_SIZE
|
||||
#define DCACHE_CRSQ_SIZE 2
|
||||
#endif
|
||||
|
||||
// Miss Handling Register Size
|
||||
#ifndef DCACHE_MSHR_SIZE
|
||||
#define DCACHE_MSHR_SIZE 8
|
||||
#endif
|
||||
|
||||
// Memory Request Queue Size
|
||||
#ifndef DCACHE_MREQ_SIZE
|
||||
#define DCACHE_MREQ_SIZE 4
|
||||
#endif
|
||||
|
||||
// Memory Response Queue Size
|
||||
#ifndef DCACHE_MRSQ_SIZE
|
||||
#define DCACHE_MRSQ_SIZE 0
|
||||
#endif
|
||||
|
||||
// Number of Associative Ways
|
||||
#ifndef DCACHE_NUM_WAYS
|
||||
#define DCACHE_NUM_WAYS 1
|
||||
#endif
|
||||
|
||||
// SM Configurable Knobs //////////////////////////////////////////////////////
|
||||
|
||||
#ifndef SM_DISABLE
|
||||
#define SM_ENABLE
|
||||
#endif
|
||||
|
||||
#ifdef SM_ENABLE
|
||||
#define SM_ENABLED 1
|
||||
#else
|
||||
#define SM_ENABLED 0
|
||||
#define SMEM_NUM_BANKS 1
|
||||
#endif
|
||||
|
||||
// Number of Banks
|
||||
#ifndef SMEM_NUM_BANKS
|
||||
#define SMEM_NUM_BANKS (NUM_LSU_LANES)
|
||||
#endif
|
||||
|
||||
// L2cache Configurable Knobs /////////////////////////////////////////////////
|
||||
|
||||
// Cache Size
|
||||
#ifndef L2_CACHE_SIZE
|
||||
#ifdef ALTERA_S10
|
||||
#define L2_CACHE_SIZE 2097152
|
||||
#else
|
||||
#define L2_CACHE_SIZE 1048576
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Number of Banks
|
||||
#ifndef L2_NUM_BANKS
|
||||
#define L2_NUM_BANKS MIN(4, NUM_SOCKETS)
|
||||
#endif
|
||||
|
||||
// Core Response Queue Size
|
||||
#ifndef L2_CRSQ_SIZE
|
||||
#define L2_CRSQ_SIZE 2
|
||||
#endif
|
||||
|
||||
// Miss Handling Register Size
|
||||
#ifndef L2_MSHR_SIZE
|
||||
#define L2_MSHR_SIZE 16
|
||||
#endif
|
||||
|
||||
// Memory Request Queue Size
|
||||
#ifndef L2_MREQ_SIZE
|
||||
#define L2_MREQ_SIZE 4
|
||||
#endif
|
||||
|
||||
// Memory Response Queue Size
|
||||
#ifndef L2_MRSQ_SIZE
|
||||
#define L2_MRSQ_SIZE 0
|
||||
#endif
|
||||
|
||||
// Number of Associative Ways
|
||||
#ifndef L2_NUM_WAYS
|
||||
#define L2_NUM_WAYS 2
|
||||
#endif
|
||||
|
||||
// L3cache Configurable Knobs /////////////////////////////////////////////////
|
||||
|
||||
// Cache Size
|
||||
#ifndef L3_CACHE_SIZE
|
||||
#ifdef ALTERA_S10
|
||||
#define L3_CACHE_SIZE 2097152
|
||||
#else
|
||||
#define L3_CACHE_SIZE 1048576
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Number of Banks
|
||||
#ifndef L3_NUM_BANKS
|
||||
#define L3_NUM_BANKS MIN(4, NUM_CLUSTERS)
|
||||
#endif
|
||||
|
||||
// Core Response Queue Size
|
||||
#ifndef L3_CRSQ_SIZE
|
||||
#define L3_CRSQ_SIZE 2
|
||||
#endif
|
||||
|
||||
// Miss Handling Register Size
|
||||
#ifndef L3_MSHR_SIZE
|
||||
#define L3_MSHR_SIZE 16
|
||||
#endif
|
||||
|
||||
// Memory Request Queue Size
|
||||
#ifndef L3_MREQ_SIZE
|
||||
#define L3_MREQ_SIZE 4
|
||||
#endif
|
||||
|
||||
// Memory Response Queue Size
|
||||
#ifndef L3_MRSQ_SIZE
|
||||
#define L3_MRSQ_SIZE 0
|
||||
#endif
|
||||
|
||||
// Number of Associative Ways
|
||||
#ifndef L3_NUM_WAYS
|
||||
#define L3_NUM_WAYS 4
|
||||
#endif
|
||||
|
||||
// ISA Extensions /////////////////////////////////////////////////////////////
|
||||
|
||||
#ifdef EXT_A_ENABLE
|
||||
#define EXT_A_ENABLED 1
|
||||
#else
|
||||
#define EXT_A_ENABLED 0
|
||||
#endif
|
||||
|
||||
#ifdef EXT_C_ENABLE
|
||||
#define EXT_C_ENABLED 1
|
||||
#else
|
||||
#define EXT_C_ENABLED 0
|
||||
#endif
|
||||
|
||||
#ifdef EXT_D_ENABLE
|
||||
#define EXT_D_ENABLED 1
|
||||
#else
|
||||
#define EXT_D_ENABLED 0
|
||||
#endif
|
||||
|
||||
#ifdef EXT_F_ENABLE
|
||||
#define EXT_F_ENABLED 1
|
||||
#else
|
||||
#define EXT_F_ENABLED 0
|
||||
#endif
|
||||
|
||||
#ifdef EXT_M_ENABLE
|
||||
#define EXT_M_ENABLED 1
|
||||
#else
|
||||
#define EXT_M_ENABLED 0
|
||||
#endif
|
||||
|
||||
#define ISA_STD_A 0
|
||||
#define ISA_STD_C 2
|
||||
#define ISA_STD_D 3
|
||||
#define ISA_STD_E 4
|
||||
#define ISA_STD_F 5
|
||||
#define ISA_STD_H 7
|
||||
#define ISA_STD_I 8
|
||||
#define ISA_STD_N 13
|
||||
#define ISA_STD_Q 16
|
||||
#define ISA_STD_S 18
|
||||
#define ISA_STD_U 20
|
||||
|
||||
#define ISA_EXT_ICACHE 0
|
||||
#define ISA_EXT_DCACHE 1
|
||||
#define ISA_EXT_L2CACHE 2
|
||||
#define ISA_EXT_L3CACHE 3
|
||||
#define ISA_EXT_SMEM 4
|
||||
|
||||
#define MISA_EXT (ICACHE_ENABLED << ISA_EXT_ICACHE) \
|
||||
| (DCACHE_ENABLED << ISA_EXT_DCACHE) \
|
||||
| (L2_ENABLED << ISA_EXT_L2CACHE) \
|
||||
| (L3_ENABLED << ISA_EXT_L3CACHE) \
|
||||
| (SM_ENABLED << ISA_EXT_SMEM)
|
||||
|
||||
#define MISA_STD (EXT_A_ENABLED << 0) /* A - Atomic Instructions extension */ \
|
||||
| (0 << 1) /* B - Tentatively reserved for Bit operations extension */ \
|
||||
| (EXT_C_ENABLED << 2) /* C - Compressed extension */ \
|
||||
| (EXT_D_ENABLED << 3) /* D - Double precsision floating-point extension */ \
|
||||
| (0 << 4) /* E - RV32E base ISA */ \
|
||||
| (EXT_F_ENABLED << 5) /* F - Single precsision floating-point extension */ \
|
||||
| (0 << 6) /* G - Additional standard extensions present */ \
|
||||
| (0 << 7) /* H - Hypervisor mode implemented */ \
|
||||
| (1 << 8) /* I - RV32I/64I/128I base ISA */ \
|
||||
| (0 << 9) /* J - Reserved */ \
|
||||
| (0 << 10) /* K - Reserved */ \
|
||||
| (0 << 11) /* L - Tentatively reserved for Bit operations extension */ \
|
||||
| (EXT_M_ENABLED << 12) /* M - Integer Multiply/Divide extension */ \
|
||||
| (0 << 13) /* N - User level interrupts supported */ \
|
||||
| (0 << 14) /* O - Reserved */ \
|
||||
| (0 << 15) /* P - Tentatively reserved for Packed-SIMD extension */ \
|
||||
| (0 << 16) /* Q - Quad-precision floating-point extension */ \
|
||||
| (0 << 17) /* R - Reserved */ \
|
||||
| (0 << 18) /* S - Supervisor mode implemented */ \
|
||||
| (0 << 19) /* T - Tentatively reserved for Transactional Memory extension */ \
|
||||
| (1 << 20) /* U - User mode implemented */ \
|
||||
| (0 << 21) /* V - Tentatively reserved for Vector extension */ \
|
||||
| (0 << 22) /* W - Reserved */ \
|
||||
| (1 << 23) /* X - Non-standard extensions present */ \
|
||||
| (0 << 24) /* Y - Reserved */ \
|
||||
| (0 << 25) /* Z - Reserved */
|
||||
|
||||
// Device identification //////////////////////////////////////////////////////
|
||||
|
||||
#define VENDOR_ID 0
|
||||
#define ARCHITECTURE_ID 0
|
||||
#define IMPLEMENTATION_ID 0
|
||||
|
||||
#endif // VX_CONFIG_VH
|
||||
193
kernel/include/VX_types.h
Normal file
193
kernel/include/VX_types.h
Normal file
@@ -0,0 +1,193 @@
|
||||
// auto-generated by gen_config.py. DO NOT EDIT
|
||||
// Generated at 2024-06-15 00:25:12.935689
|
||||
|
||||
// Translated from ./rtl/VX_types.vh:
|
||||
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef VX_TYPES_VH
|
||||
#define VX_TYPES_VH
|
||||
|
||||
// Device configuration registers
|
||||
|
||||
#define VX_CSR_ADDR_BITS 12
|
||||
#define VX_DCR_ADDR_BITS 12
|
||||
|
||||
#define VX_DCR_BASE_STATE_BEGIN 0x001
|
||||
#define VX_DCR_BASE_STARTUP_ADDR0 0x001
|
||||
#define VX_DCR_BASE_STARTUP_ADDR1 0x002
|
||||
#define VX_DCR_BASE_MPM_CLASS 0x003
|
||||
#define VX_DCR_BASE_STATE_END 0x004
|
||||
|
||||
#define VX_DCR_BASE_STATE(addr) ((addr) - VX_DCR_BASE_STATE_BEGIN)
|
||||
#define VX_DCR_BASE_STATE_COUNT (VX_DCR_BASE_STATE_END-VX_DCR_BASE_STATE_BEGIN)
|
||||
|
||||
// Machine Performance-monitoring counters classes
|
||||
|
||||
#define VX_DCR_MPM_CLASS_NONE 0
|
||||
#define VX_DCR_MPM_CLASS_CORE 1
|
||||
#define VX_DCR_MPM_CLASS_MEM 2
|
||||
|
||||
// User Floating-Point CSRs
|
||||
|
||||
#define VX_CSR_FFLAGS 0x001
|
||||
#define VX_CSR_FRM 0x002
|
||||
#define VX_CSR_FCSR 0x003
|
||||
|
||||
#define VX_CSR_SATP 0x180
|
||||
|
||||
#define VX_CSR_PMPCFG0 0x3A0
|
||||
#define VX_CSR_PMPADDR0 0x3B0
|
||||
|
||||
#define VX_CSR_MSTATUS 0x300
|
||||
#define VX_CSR_MISA 0x301
|
||||
#define VX_CSR_MEDELEG 0x302
|
||||
#define VX_CSR_MIDELEG 0x303
|
||||
#define VX_CSR_MIE 0x304
|
||||
#define VX_CSR_MTVEC 0x305
|
||||
|
||||
#define VX_CSR_MEPC 0x341
|
||||
|
||||
#define VX_CSR_MNSTATUS 0x744
|
||||
|
||||
#define VX_CSR_MPM_BASE 0xB00
|
||||
#define VX_CSR_MPM_BASE_H 0xB80
|
||||
#define VX_CSR_MPM_USER 0xB03
|
||||
#define VX_CSR_MPM_USER_H 0xB83
|
||||
|
||||
// Machine Performance-monitoring core counters
|
||||
// PERF: Standard
|
||||
#define VX_CSR_MCYCLE 0xB00
|
||||
#define VX_CSR_MCYCLE_H 0xB80
|
||||
#define VX_CSR_MPM_RESERVED 0xB01
|
||||
#define VX_CSR_MPM_RESERVED_H 0xB81
|
||||
#define VX_CSR_MINSTRET 0xB02
|
||||
#define VX_CSR_MINSTRET_H 0xB82
|
||||
// PERF: pipeline
|
||||
#define VX_CSR_MPM_SCHED_ID 0xB03
|
||||
#define VX_CSR_MPM_SCHED_ID_H 0xB83
|
||||
#define VX_CSR_MPM_SCHED_ST 0xB04
|
||||
#define VX_CSR_MPM_SCHED_ST_H 0xB84
|
||||
#define VX_CSR_MPM_IBUF_ST 0xB05
|
||||
#define VX_CSR_MPM_IBUF_ST_H 0xB85
|
||||
#define VX_CSR_MPM_SCRB_ST 0xB06
|
||||
#define VX_CSR_MPM_SCRB_ST_H 0xB86
|
||||
#define VX_CSR_MPM_SCRB_ALU 0xB07
|
||||
#define VX_CSR_MPM_SCRB_ALU_H 0xB87
|
||||
#define VX_CSR_MPM_SCRB_FPU 0xB08
|
||||
#define VX_CSR_MPM_SCRB_FPU_H 0xB88
|
||||
#define VX_CSR_MPM_SCRB_LSU 0xB09
|
||||
#define VX_CSR_MPM_SCRB_LSU_H 0xB89
|
||||
#define VX_CSR_MPM_SCRB_SFU 0xB0A
|
||||
#define VX_CSR_MPM_SCRB_SFU_H 0xB8A
|
||||
// PERF: memory
|
||||
#define VX_CSR_MPM_IFETCHES 0xB0B
|
||||
#define VX_CSR_MPM_IFETCHES_H 0xB8B
|
||||
#define VX_CSR_MPM_LOADS 0xB0C
|
||||
#define VX_CSR_MPM_LOADS_H 0xB8C
|
||||
#define VX_CSR_MPM_STORES 0xB0D
|
||||
#define VX_CSR_MPM_STORES_H 0xB8D
|
||||
#define VX_CSR_MPM_IFETCH_LT 0xB0E
|
||||
#define VX_CSR_MPM_IFETCH_LT_H 0xB8E
|
||||
#define VX_CSR_MPM_LOAD_LT 0xB0F
|
||||
#define VX_CSR_MPM_LOAD_LT_H 0xB8F
|
||||
// SFU: scoreboard
|
||||
#define VX_CSR_MPM_SCRB_WCTL 0xB10
|
||||
#define VX_CSR_MPM_SCRB_WCTL_H 0xB90
|
||||
#define VX_CSR_MPM_SCRB_CSRS 0xB11
|
||||
#define VX_CSR_MPM_SCRB_CSRS_H 0xB91
|
||||
|
||||
// Machine Performance-monitoring memory counters
|
||||
// PERF: icache
|
||||
#define VX_CSR_MPM_ICACHE_READS 0xB03 // total reads
|
||||
#define VX_CSR_MPM_ICACHE_READS_H 0xB83
|
||||
#define VX_CSR_MPM_ICACHE_MISS_R 0xB04 // read misses
|
||||
#define VX_CSR_MPM_ICACHE_MISS_R_H 0xB84
|
||||
#define VX_CSR_MPM_ICACHE_MSHR_ST 0xB05 // MSHR stalls
|
||||
#define VX_CSR_MPM_ICACHE_MSHR_ST_H 0xB85
|
||||
// PERF: dcache
|
||||
#define VX_CSR_MPM_DCACHE_READS 0xB06 // total reads
|
||||
#define VX_CSR_MPM_DCACHE_READS_H 0xB86
|
||||
#define VX_CSR_MPM_DCACHE_WRITES 0xB07 // total writes
|
||||
#define VX_CSR_MPM_DCACHE_WRITES_H 0xB87
|
||||
#define VX_CSR_MPM_DCACHE_MISS_R 0xB08 // read misses
|
||||
#define VX_CSR_MPM_DCACHE_MISS_R_H 0xB88
|
||||
#define VX_CSR_MPM_DCACHE_MISS_W 0xB09 // write misses
|
||||
#define VX_CSR_MPM_DCACHE_MISS_W_H 0xB89
|
||||
#define VX_CSR_MPM_DCACHE_BANK_ST 0xB0A // bank conflicts
|
||||
#define VX_CSR_MPM_DCACHE_BANK_ST_H 0xB8A
|
||||
#define VX_CSR_MPM_DCACHE_MSHR_ST 0xB0B // MSHR stalls
|
||||
#define VX_CSR_MPM_DCACHE_MSHR_ST_H 0xB8B
|
||||
// PERF: l2cache
|
||||
#define VX_CSR_MPM_L2CACHE_READS 0xB0C // total reads
|
||||
#define VX_CSR_MPM_L2CACHE_READS_H 0xB8C
|
||||
#define VX_CSR_MPM_L2CACHE_WRITES 0xB0D // total writes
|
||||
#define VX_CSR_MPM_L2CACHE_WRITES_H 0xB8D
|
||||
#define VX_CSR_MPM_L2CACHE_MISS_R 0xB0E // read misses
|
||||
#define VX_CSR_MPM_L2CACHE_MISS_R_H 0xB8E
|
||||
#define VX_CSR_MPM_L2CACHE_MISS_W 0xB0F // write misses
|
||||
#define VX_CSR_MPM_L2CACHE_MISS_W_H 0xB8F
|
||||
#define VX_CSR_MPM_L2CACHE_BANK_ST 0xB10 // bank conflicts
|
||||
#define VX_CSR_MPM_L2CACHE_BANK_ST_H 0xB90
|
||||
#define VX_CSR_MPM_L2CACHE_MSHR_ST 0xB11 // MSHR stalls
|
||||
#define VX_CSR_MPM_L2CACHE_MSHR_ST_H 0xB91
|
||||
// PERF: l3cache
|
||||
#define VX_CSR_MPM_L3CACHE_READS 0xB12 // total reads
|
||||
#define VX_CSR_MPM_L3CACHE_READS_H 0xB92
|
||||
#define VX_CSR_MPM_L3CACHE_WRITES 0xB13 // total writes
|
||||
#define VX_CSR_MPM_L3CACHE_WRITES_H 0xB93
|
||||
#define VX_CSR_MPM_L3CACHE_MISS_R 0xB14 // read misses
|
||||
#define VX_CSR_MPM_L3CACHE_MISS_R_H 0xB94
|
||||
#define VX_CSR_MPM_L3CACHE_MISS_W 0xB15 // write misses
|
||||
#define VX_CSR_MPM_L3CACHE_MISS_W_H 0xB95
|
||||
#define VX_CSR_MPM_L3CACHE_BANK_ST 0xB16 // bank conflicts
|
||||
#define VX_CSR_MPM_L3CACHE_BANK_ST_H 0xB96
|
||||
#define VX_CSR_MPM_L3CACHE_MSHR_ST 0xB17 // MSHR stalls
|
||||
#define VX_CSR_MPM_L3CACHE_MSHR_ST_H 0xB97
|
||||
// PERF: memory
|
||||
#define VX_CSR_MPM_MEM_READS 0xB18 // total reads
|
||||
#define VX_CSR_MPM_MEM_READS_H 0xB98
|
||||
#define VX_CSR_MPM_MEM_WRITES 0xB19 // total writes
|
||||
#define VX_CSR_MPM_MEM_WRITES_H 0xB99
|
||||
#define VX_CSR_MPM_MEM_LT 0xB1A // memory latency
|
||||
#define VX_CSR_MPM_MEM_LT_H 0xB9A
|
||||
// PERF: smem
|
||||
#define VX_CSR_MPM_SMEM_READS 0xB1B // memory reads
|
||||
#define VX_CSR_MPM_SMEM_READS_H 0xB9B
|
||||
#define VX_CSR_MPM_SMEM_WRITES 0xB1C // memory writes
|
||||
#define VX_CSR_MPM_SMEM_WRITES_H 0xB9C
|
||||
#define VX_CSR_MPM_SMEM_BANK_ST 0xB1D // bank conflicts
|
||||
#define VX_CSR_MPM_SMEM_BANK_ST_H 0xB9D
|
||||
|
||||
// Machine Information Registers
|
||||
|
||||
#define VX_CSR_MVENDORID 0xF11
|
||||
#define VX_CSR_MARCHID 0xF12
|
||||
#define VX_CSR_MIMPID 0xF13
|
||||
#define VX_CSR_MHARTID 0xF14
|
||||
|
||||
// GPGU CSRs
|
||||
|
||||
#define VX_CSR_THREAD_ID 0xCC0
|
||||
#define VX_CSR_WARP_ID 0xCC1
|
||||
#define VX_CSR_CORE_ID 0xCC2
|
||||
#define VX_CSR_WARP_MASK 0xCC3
|
||||
#define VX_CSR_THREAD_MASK 0xCC4 // warning! this value is also used in LLVM
|
||||
#define VX_CSR_GCID 0xCC5 // legacy global core id alias used by Radiance bootrom
|
||||
|
||||
#define VX_CSR_NUM_THREADS 0xFC0
|
||||
#define VX_CSR_NUM_WARPS 0xFC1
|
||||
#define VX_CSR_NUM_CORES 0xFC2
|
||||
|
||||
#endif // VX_TYPES_VH
|
||||
@@ -136,6 +136,19 @@ inline void vx_wspawn(unsigned num_warps, vx_wspawn_pfn func_ptr) {
|
||||
asm volatile (".insn r %0, 1, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(num_warps), "r"(func_ptr));
|
||||
}
|
||||
|
||||
// Spawn an explicit warp mask. The current warp bit is ignored by hardware.
|
||||
inline void vx_wspawn_mask(unsigned warp_mask, vx_wspawn_pfn func_ptr) {
|
||||
asm volatile (".insn r %0, 6, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(warp_mask), "r"(func_ptr));
|
||||
}
|
||||
|
||||
inline void vx_spawn_scalar(unsigned warp_mask, vx_wspawn_pfn func_ptr) {
|
||||
vx_wspawn_mask(warp_mask & ((1u << NUM_SCALAR_WARPS) - 1u), func_ptr);
|
||||
}
|
||||
|
||||
inline void vx_spawn_tensor(unsigned warp_mask, vx_wspawn_pfn func_ptr) {
|
||||
vx_wspawn_mask(warp_mask & (((1u << NUM_TENSOR_WARPS) - 1u) << NUM_SCALAR_WARPS), func_ptr);
|
||||
}
|
||||
|
||||
// Split on a predicate
|
||||
inline unsigned vx_split(unsigned predicate) {
|
||||
unsigned ret;
|
||||
@@ -149,8 +162,36 @@ inline void vx_join(unsigned stack_ptr) {
|
||||
}
|
||||
|
||||
// Warp Barrier
|
||||
__attribute__((convergent))
|
||||
inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
|
||||
asm volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barried_id), "r"(num_warps));
|
||||
unsigned scalar_warps = (num_warps > NUM_SCALAR_WARPS) ? NUM_SCALAR_WARPS : num_warps;
|
||||
asm volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barried_id), "r"(scalar_warps));
|
||||
}
|
||||
|
||||
#define VX_BARRIER_DOMAIN_SHIFT 28
|
||||
#define VX_BARRIER_DOMAIN_ALL 0u
|
||||
#define VX_BARRIER_DOMAIN_SCALAR 1u
|
||||
#define VX_BARRIER_DOMAIN_TENSOR 2u
|
||||
|
||||
__attribute__((convergent))
|
||||
inline void vx_barrier_domain(unsigned barrier_id, unsigned num_warps, unsigned domain) {
|
||||
unsigned encoded_id = barrier_id | (domain << VX_BARRIER_DOMAIN_SHIFT);
|
||||
asm volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(encoded_id), "r"(num_warps));
|
||||
}
|
||||
|
||||
__attribute__((convergent))
|
||||
inline void vx_barrier_scalar(unsigned barrier_id, unsigned num_warps) {
|
||||
vx_barrier_domain(barrier_id, num_warps, VX_BARRIER_DOMAIN_SCALAR);
|
||||
}
|
||||
|
||||
__attribute__((convergent))
|
||||
inline void vx_barrier_tensor(unsigned barrier_id, unsigned num_warps) {
|
||||
vx_barrier_domain(barrier_id, num_warps, VX_BARRIER_DOMAIN_TENSOR);
|
||||
}
|
||||
|
||||
__attribute__((convergent))
|
||||
inline void vx_barrier_mask(unsigned barrier_id, unsigned warp_mask) {
|
||||
asm volatile (".insn r %0, 7, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barrier_id), "r"(warp_mask));
|
||||
}
|
||||
|
||||
// Return current thread identifier
|
||||
@@ -202,6 +243,22 @@ inline int vx_num_warps() {
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline int vx_num_scalar_warps() {
|
||||
return NUM_SCALAR_WARPS;
|
||||
}
|
||||
|
||||
inline int vx_num_tensor_warps() {
|
||||
return NUM_TENSOR_WARPS;
|
||||
}
|
||||
|
||||
inline unsigned vx_scalar_warp_mask() {
|
||||
return (1u << NUM_SCALAR_WARPS) - 1u;
|
||||
}
|
||||
|
||||
inline unsigned vx_tensor_warp_mask() {
|
||||
return ((1u << NUM_TENSOR_WARPS) - 1u) << NUM_SCALAR_WARPS;
|
||||
}
|
||||
|
||||
// Return the number of cores per cluster
|
||||
inline int vx_num_cores() {
|
||||
int ret;
|
||||
|
||||
@@ -17,6 +17,10 @@
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#ifndef CORES_PER_CLUSTER
|
||||
#define CORES_PER_CLUSTER 8
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
@@ -48,6 +52,7 @@ void vx_wspawn_wait();
|
||||
void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg);
|
||||
|
||||
void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback, void * arg);
|
||||
void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void * arg);
|
||||
void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void * arg);
|
||||
|
||||
void vx_serial(vx_serial_cb callback, void * arg);
|
||||
|
||||
@@ -74,18 +74,9 @@ static void __attribute__ ((noinline)) spawn_tasks_all_stub() {
|
||||
}
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) spawn_tasks_rem_stub() {
|
||||
int cid = vx_core_id();
|
||||
int tid = vx_thread_id();
|
||||
|
||||
wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid];
|
||||
int task_id = p_wspawn_args->offset + tid;
|
||||
(p_wspawn_args->callback)(task_id, p_wspawn_args->arg);
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() {
|
||||
int NT = vx_num_threads();
|
||||
int NW = vx_num_warps();
|
||||
int NW = NUM_SCALAR_WARPS;
|
||||
int cid = vx_core_id();
|
||||
int wid = vx_warp_id();
|
||||
int tid = vx_thread_id();
|
||||
@@ -103,6 +94,60 @@ static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() {
|
||||
}
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) spawn_tasks_cluster_all_stub() {
|
||||
int NT = vx_num_threads();
|
||||
int NW = NUM_SCALAR_WARPS;
|
||||
int cid = vx_core_id();
|
||||
int wid = vx_warp_id();
|
||||
int tid = vx_thread_id();
|
||||
|
||||
const int core_id_in_cluster = cid % CORES_PER_CLUSTER;
|
||||
// round-robin warp_id allocation across cores in cluster
|
||||
const int wid_in_cluster = CORES_PER_CLUSTER * wid + core_id_in_cluster;
|
||||
|
||||
wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid];
|
||||
|
||||
int waves = p_wspawn_args->NWs + (wid < p_wspawn_args->RWs);
|
||||
int offset = p_wspawn_args->offset + (NT * wid_in_cluster + tid);
|
||||
|
||||
vx_spawn_tasks_cb callback = p_wspawn_args->callback;
|
||||
void* arg = p_wspawn_args->arg;
|
||||
|
||||
// sequential iterations
|
||||
for (int wave_id = 0; wave_id < waves; ++wave_id) {
|
||||
int task_id = offset + (wave_id * NT * NW * CORES_PER_CLUSTER);
|
||||
callback(task_id, arg);
|
||||
}
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) spawn_tasks_rem_stub() {
|
||||
int cid = vx_core_id();
|
||||
int tid = vx_thread_id();
|
||||
|
||||
wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid];
|
||||
int task_id = p_wspawn_args->offset + tid;
|
||||
(p_wspawn_args->callback)(task_id, p_wspawn_args->arg);
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) spawn_tasks_cluster_rem_stub() {
|
||||
int NT = vx_num_threads();
|
||||
int cid = vx_core_id();
|
||||
int tid = vx_thread_id();
|
||||
int wid = vx_warp_id();
|
||||
|
||||
const int core_id_in_cluster = cid % CORES_PER_CLUSTER;
|
||||
// round-robin warp_id allocation across cores in cluster
|
||||
const int wid_in_cluster = CORES_PER_CLUSTER * wid + core_id_in_cluster;
|
||||
|
||||
wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid];
|
||||
// FIXME: This assumes that all cores but the last one are working with full
|
||||
// warps, and only the last core has a partially-filled warp.
|
||||
int offset = p_wspawn_args->offset + (NT * wid_in_cluster + tid);
|
||||
|
||||
int task_id = offset;
|
||||
(p_wspawn_args->callback)(task_id, p_wspawn_args->arg);
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_cb() {
|
||||
// activate all threads
|
||||
vx_tmc(-1);
|
||||
@@ -111,11 +156,21 @@ static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_cb() {
|
||||
spawn_tasks_contiguous_all_stub();
|
||||
|
||||
// disable warp
|
||||
// deadlock here on warps 1, 2, 3
|
||||
vx_tmc_zero();
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) spawn_tasks_all_cb() {
|
||||
static void __attribute__ ((noinline)) spawn_tasks_cluster_all_cb() {
|
||||
// activate all threads
|
||||
vx_tmc(-1);
|
||||
|
||||
// call stub routine
|
||||
spawn_tasks_cluster_all_stub();
|
||||
|
||||
// disable warp
|
||||
vx_tmc_zero();
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) spawn_tasks_all_cb() {
|
||||
// activate all threads
|
||||
vx_tmc(-1);
|
||||
|
||||
@@ -126,10 +181,115 @@ static void __attribute__ ((noinline)) spawn_tasks_all_cb() {
|
||||
vx_tmc_zero();
|
||||
}
|
||||
|
||||
// This function runs in every core, but with only 1 warp and 1 thread enabled.
|
||||
// The logic in this function figures out how many warps/threads this particular
|
||||
// core has to enable to fulfill an entire grid of computation.
|
||||
void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg) {
|
||||
// device specs
|
||||
const int NC = vx_num_cores();
|
||||
const int NW = NUM_SCALAR_WARPS;
|
||||
const int NT = vx_num_threads();
|
||||
// NOTE: assumes divisible
|
||||
const int num_cluster = NC / CORES_PER_CLUSTER;
|
||||
|
||||
// current core id
|
||||
int core_id = vx_core_id();
|
||||
if (core_id >= NUM_CORES_MAX)
|
||||
return;
|
||||
const int cluster_id = core_id / CORES_PER_CLUSTER;
|
||||
const int core_id_in_cluster = core_id % CORES_PER_CLUSTER;
|
||||
|
||||
// try to fill up full clusters first
|
||||
const int num_threads_in_cluster = CORES_PER_CLUSTER * NW * NT;
|
||||
const int num_used_clusters =
|
||||
(num_tasks + (num_threads_in_cluster - 1)) / num_threads_in_cluster;
|
||||
if (cluster_id >= num_used_clusters) {
|
||||
return; // terminate extra clusters
|
||||
}
|
||||
// fill up the last cluster with remaining tasks
|
||||
const int num_full_clusters = num_tasks / num_threads_in_cluster;
|
||||
int num_tasks_this_cluster = num_threads_in_cluster;
|
||||
if (cluster_id >= num_full_clusters) {
|
||||
num_tasks_this_cluster = num_tasks % num_threads_in_cluster;
|
||||
}
|
||||
|
||||
// Distribute threads equally across as many cores as possible, even if they
|
||||
// don't fill up NW*NT in a single core. This makes sure the warps get evenly
|
||||
// distributed in a single cluster
|
||||
//
|
||||
// TODO: Try to contain in a single cluster if possible?
|
||||
const int num_active_cores = (num_tasks + (NT - 1)) / NT;
|
||||
if (core_id >= num_active_cores)
|
||||
return; // terminate extra cores
|
||||
|
||||
const int num_full_warps_this_cluster = num_tasks_this_cluster / NT;
|
||||
const int rem_threads_in_last_warp = num_tasks_this_cluster % NT;
|
||||
// const int num_warps = (num_tasks_this_cluster + (NT - 1)) / NT;
|
||||
|
||||
int num_warps_this_core = num_full_warps_this_cluster / CORES_PER_CLUSTER;
|
||||
const int num_warps_in_last_row = num_full_warps_this_cluster % CORES_PER_CLUSTER;
|
||||
if (core_id_in_cluster < num_warps_in_last_row) {
|
||||
num_warps_this_core++;
|
||||
}
|
||||
// if 0, last warp is full-threads enabled
|
||||
int rem_threads_in_last_warp_this_core = 0;
|
||||
if (rem_threads_in_last_warp != 0) {
|
||||
if (core_id_in_cluster == num_warps_in_last_row - 1) {
|
||||
rem_threads_in_last_warp_this_core = rem_threads_in_last_warp;
|
||||
}
|
||||
}
|
||||
|
||||
// sequential iterations
|
||||
const int num_full_waves = num_warps_this_core / NW;
|
||||
const int rem_full_warps_in_last_wave = num_warps_this_core % NW;
|
||||
|
||||
const int offset = cluster_id * num_tasks_this_cluster;
|
||||
wspawn_tasks_args_t wspawn_args = {callback, arg, offset, num_full_waves,
|
||||
rem_full_warps_in_last_wave};
|
||||
g_wspawn_args[core_id] = &wspawn_args;
|
||||
|
||||
if (num_warps_this_core > 0) {
|
||||
// execute callback on other warps
|
||||
const int nw = MIN(num_warps_this_core, NW);
|
||||
vx_wspawn(nw, spawn_tasks_cluster_all_cb);
|
||||
|
||||
// activate all threads
|
||||
vx_tmc(-1);
|
||||
|
||||
// call stub routine
|
||||
spawn_tasks_cluster_all_stub();
|
||||
|
||||
// back to single-threaded
|
||||
vx_tmc_one();
|
||||
|
||||
// wait for spawn warps to terminate
|
||||
vx_wspawn_wait();
|
||||
}
|
||||
|
||||
// TODO: this is incomplete
|
||||
// TODO: Instead of launching an additional wave just to work on remaining
|
||||
// threads, handle this in the last wave amongst other full warps.
|
||||
if (rem_threads_in_last_warp != 0 && core_id_in_cluster == 0) {
|
||||
// adjust offset
|
||||
// FIXME: use rem_threads_in_last_warp_this_core
|
||||
wspawn_args.offset += (num_tasks_this_cluster - rem_threads_in_last_warp);
|
||||
|
||||
// activate remaining threads
|
||||
const int tmask = (1 << rem_threads_in_last_warp) - 1;
|
||||
vx_tmc(tmask);
|
||||
|
||||
// call stub routine
|
||||
spawn_tasks_cluster_rem_stub();
|
||||
|
||||
// back to single-threaded
|
||||
vx_tmc_one();
|
||||
}
|
||||
}
|
||||
|
||||
void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void * arg) {
|
||||
// device specs
|
||||
int NC = vx_num_cores();
|
||||
int NW = vx_num_warps();
|
||||
int NW = NUM_SCALAR_WARPS;
|
||||
int NT = vx_num_threads();
|
||||
|
||||
// current core id
|
||||
@@ -179,7 +339,6 @@ void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void
|
||||
vx_tmc_one();
|
||||
|
||||
// wait for spawn warps to terminate
|
||||
// deadlock here on warp 0!
|
||||
vx_wspawn_wait();
|
||||
}
|
||||
|
||||
@@ -202,7 +361,7 @@ void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void
|
||||
void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) {
|
||||
// device specs
|
||||
int NC = vx_num_cores();
|
||||
int NW = vx_num_warps();
|
||||
int NW = NUM_SCALAR_WARPS;
|
||||
int NT = vx_num_threads();
|
||||
|
||||
// current core id
|
||||
@@ -356,7 +515,7 @@ void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg) {
|
||||
|
||||
// device specs
|
||||
int NC = vx_num_cores();
|
||||
int NW = vx_num_warps();
|
||||
int NW = NUM_SCALAR_WARPS;
|
||||
int NT = vx_num_threads();
|
||||
|
||||
// current core id
|
||||
|
||||
@@ -22,9 +22,9 @@
|
||||
_start:
|
||||
|
||||
# initialize per-thread registers
|
||||
csrr t0, VX_CSR_NUM_WARPS # get num warps
|
||||
li t0, ((1 << NUM_SCALAR_WARPS) - 1) # scalar warp mask
|
||||
la t1, init_regs_all
|
||||
.insn r RISCV_CUSTOM0, 1, 0, x0, t0, t1 # wspawn t0, t1
|
||||
.insn r RISCV_CUSTOM0, 6, 0, x0, t0, t1 # wspawn_mask t0, t1
|
||||
li t0, -1
|
||||
.insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0 # tmc t0
|
||||
jal init_regs
|
||||
@@ -35,9 +35,9 @@ _start:
|
||||
jal vx_wspawn_wait
|
||||
|
||||
# initialize TLS for all warps
|
||||
csrr t0, VX_CSR_NUM_WARPS # get num warps
|
||||
li t0, ((1 << NUM_SCALAR_WARPS) - 1) # scalar warp mask
|
||||
la t1, init_tls_all
|
||||
.insn r RISCV_CUSTOM0, 1, 0, x0, t0, t1 # wspawn t0, t1
|
||||
.insn r RISCV_CUSTOM0, 6, 0, x0, t0, t1 # wspawn_mask t0, t1
|
||||
li t0, -1
|
||||
.insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0 # tmc t0
|
||||
call __init_tls
|
||||
@@ -102,6 +102,8 @@ init_regs:
|
||||
#endif
|
||||
csrr t0, VX_CSR_MHARTID
|
||||
sll t1, t0, STACK_LOG2_SIZE
|
||||
sll t2, t0, 4
|
||||
add t1, t1, t2
|
||||
sub sp, sp, t1
|
||||
|
||||
# set thread pointer register
|
||||
|
||||
Reference in New Issue
Block a user