Merge remote-tracking branch 'upstream/master' into vortex2

This commit is contained in:
Hansung Kim
2024-02-01 23:35:58 -08:00
203 changed files with 4383 additions and 21981 deletions

View File

@@ -43,7 +43,16 @@ module VX_cluster import VX_gpu_pkg::*; #(
`ifdef SCOPE
localparam scope_socket = 0;
`SCOPE_IO_SWITCH (scope_socket + `NUM_SOCKETS);
`endif
`endif
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
assign mem_perf_tmp_if.icache = 'x;
assign mem_perf_tmp_if.dcache = 'x;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.smem = 'x;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`endif
`ifdef GBAR_ENABLE
@@ -69,18 +78,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.reset (gbar_reset),
.gbar_bus_if (gbar_bus_if)
);
`endif
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
cache_perf_t perf_l2cache;
assign mem_perf_tmp_if.icache = 'x;
assign mem_perf_tmp_if.dcache = 'x;
assign mem_perf_tmp_if.l2cache = perf_l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.smem = 'x;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`endif
VX_mem_bus_if #(
@@ -102,7 +100,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.MSHR_SIZE (`L2_MSHR_SIZE),
.MRSQ_SIZE (`L2_MRSQ_SIZE),
.MREQ_SIZE (`L2_MREQ_SIZE),
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH),
.TAG_WIDTH (L2_TAG_WIDTH),
.WRITE_ENABLE (1),
.UUID_WIDTH (`UUID_WIDTH),
.CORE_OUT_REG (2),
@@ -113,7 +111,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.clk (clk),
.reset (l2_reset),
`ifdef PERF_ENABLE
.cache_perf (perf_l2cache),
.cache_perf (mem_perf_tmp_if.l2cache),
`endif
.core_bus_if (per_socket_mem_bus_if),
.mem_bus_if (mem_bus_if)
@@ -146,6 +144,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + i)
) socket (
`SCOPE_IO_BIND (scope_socket+i)
.clk (clk),
.reset (socket_reset),
@@ -156,7 +155,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.dcr_bus_if (socket_dcr_bus_if),
.mem_bus_if (per_socket_mem_bus_if[i]),
`ifdef GBAR_ENABLE
.gbar_bus_if (per_socket_gbar_bus_if[i]),
`endif
@@ -167,6 +166,6 @@ module VX_cluster import VX_gpu_pkg::*; #(
);
end
`BUFFER_BUSY (busy, (| per_socket_busy), (`NUM_SOCKETS > 1));
`BUFFER_EX(busy, (| per_socket_busy), 1'b1, (`NUM_SOCKETS > 1));
endmodule

View File

@@ -191,13 +191,21 @@
`define STALL_TIMEOUT (100000 * (1 ** (`L2_ENABLED + `L3_ENABLED)))
`endif
`ifndef SV_DPI
`define DPI_DISABLE
`endif
`ifndef FPU_FPNEW
`ifndef FPU_DSP
`ifndef FPU_DPI
`ifdef SYNTHESIS
`define FPU_DSP
`else
`ifndef SYNTHESIS
`ifndef DPI_DISABLE
`define FPU_DPI
`else
`define FPU_DSP
`endif
`else
`define FPU_DSP
`endif
`endif
`endif
@@ -223,18 +231,18 @@
// Number of ALU units
`ifndef NUM_ALU_LANES
`define NUM_ALU_LANES `UP(`NUM_THREADS / 2)
`define NUM_ALU_LANES `NUM_THREADS
`endif
`ifndef NUM_ALU_BLOCKS
`define NUM_ALU_BLOCKS `UP(`ISSUE_WIDTH / 1)
`define NUM_ALU_BLOCKS `ISSUE_WIDTH
`endif
// Number of FPU units
`ifndef NUM_FPU_LANES
`define NUM_FPU_LANES `UP(`NUM_THREADS / 2)
`define NUM_FPU_LANES `NUM_THREADS
`endif
`ifndef NUM_FPU_BLOCKS
`define NUM_FPU_BLOCKS `UP(`ISSUE_WIDTH / 1)
`define NUM_FPU_BLOCKS `ISSUE_WIDTH
`endif
// Number of LSU units
@@ -258,7 +266,10 @@
`endif
// LSU Duplicate Address Check
`ifdef LSU_DUP
`ifndef LSU_DUP_DISABLE
`define LSU_DUP_ENABLE
`endif
`ifdef LSU_DUP_ENABLE
`define LSU_DUP_ENABLED 1
`else
`define LSU_DUP_ENABLED 0
@@ -285,8 +296,8 @@
// Floating-Point Units ///////////////////////////////////////////////////////
// Size of FPU Request Queue
`ifndef FPU_REQ_QUEUE_SIZE
`define FPU_REQ_QUEUE_SIZE (2 * (`NUM_THREADS / `NUM_FPU_LANES))
`ifndef FPUQ_SIZE
`define FPUQ_SIZE (2 * (`NUM_THREADS / `NUM_FPU_LANES))
`endif
// FNCP Latency
@@ -377,7 +388,7 @@
// Number of Cache Units
`ifndef NUM_ICACHES
`define NUM_ICACHES `UP(`NUM_CORES / 4)
`define NUM_ICACHES `UP(`SOCKET_SIZE / 4)
`endif
// Cache Size
@@ -407,7 +418,7 @@
// Number of Associative Ways
`ifndef ICACHE_NUM_WAYS
`define ICACHE_NUM_WAYS 2
`define ICACHE_NUM_WAYS 1
`endif
// Dcache Configurable Knobs //////////////////////////////////////////////////
@@ -426,7 +437,7 @@
// Number of Cache Units
`ifndef NUM_DCACHES
`define NUM_DCACHES `UP(`NUM_CORES / 4)
`define NUM_DCACHES `UP(`SOCKET_SIZE / 4)
`endif
// Cache Size
@@ -436,7 +447,7 @@
// Number of Banks
`ifndef DCACHE_NUM_BANKS
`define DCACHE_NUM_BANKS (`NUM_LSU_LANES)
`define DCACHE_NUM_BANKS `MIN(`NUM_LSU_LANES, 4)
`endif
// Core Response Queue Size
@@ -461,7 +472,7 @@
// Number of Associative Ways
`ifndef DCACHE_NUM_WAYS
`define DCACHE_NUM_WAYS 2
`define DCACHE_NUM_WAYS 1
`endif
// SM Configurable Knobs //////////////////////////////////////////////////////
@@ -520,7 +531,7 @@
// Number of Associative Ways
`ifndef L2_NUM_WAYS
`define L2_NUM_WAYS 4
`define L2_NUM_WAYS 2
`endif
// L3cache Configurable Knobs /////////////////////////////////////////////////

View File

@@ -1,418 +1,437 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`ifndef VX_DEFINE_VH
`define VX_DEFINE_VH
`include "VX_platform.vh"
`include "VX_config.vh"
`include "VX_types.vh"
///////////////////////////////////////////////////////////////////////////////
`define NW_BITS `CLOG2(`NUM_WARPS)
`define NC_WIDTH `UP(`NC_BITS)
`define NT_BITS `CLOG2(`NUM_THREADS)
`define NW_WIDTH `UP(`NW_BITS)
`define NC_BITS `CLOG2(`NUM_CORES)
`define NT_WIDTH `UP(`NT_BITS)
`define NB_BITS `CLOG2(`NUM_BARRIERS)
`define NB_WIDTH `UP(`NB_BITS)
`define NUM_IREGS 32
`define NRI_BITS `CLOG2(`NUM_IREGS)
`ifdef EXT_F_ENABLE
`define NUM_REGS (2 * `NUM_IREGS)
`else
`define NUM_REGS `NUM_IREGS
`endif
`define NR_BITS `CLOG2(`NUM_REGS)
`define PERF_CTR_BITS 44
`ifndef NDEBUG
`define UUID_WIDTH 44
`else
`define UUID_WIDTH 1
`endif
///////////////////////////////////////////////////////////////////////////////
`define EX_ALU 0
`define EX_LSU 1
`define EX_SFU 2
`define EX_FPU 3
`define NUM_EX_UNITS (3 + `EXT_F_ENABLED)
`define EX_BITS `CLOG2(`NUM_EX_UNITS)
///////////////////////////////////////////////////////////////////////////////
`define INST_LUI 7'b0110111
`define INST_AUIPC 7'b0010111
`define INST_JAL 7'b1101111
`define INST_JALR 7'b1100111
`define INST_B 7'b1100011 // branch instructions
`define INST_L 7'b0000011 // load instructions
`define INST_S 7'b0100011 // store instructions
`define INST_I 7'b0010011 // immediate instructions
`define INST_R 7'b0110011 // register instructions
`define INST_FENCE 7'b0001111 // Fence instructions
`define INST_SYS 7'b1110011 // system instructions
// RV64I instruction specific opcodes (for any W instruction)
`define INST_I_W 7'b0011011 // W type immediate instructions
`define INST_R_W 7'b0111011 // W type register instructions
`define INST_FL 7'b0000111 // float load instruction
`define INST_FS 7'b0100111 // float store instruction
`define INST_FMADD 7'b1000011
`define INST_FMSUB 7'b1000111
`define INST_FNMSUB 7'b1001011
`define INST_FNMADD 7'b1001111
`define INST_FCI 7'b1010011 // float common instructions
// Custom extension opcodes
`define INST_EXT1 7'b0001011 // 0x0B
`define INST_EXT2 7'b0101011 // 0x2B
`define INST_EXT3 7'b1011011 // 0x5B
`define INST_EXT4 7'b1111011 // 0x7B
///////////////////////////////////////////////////////////////////////////////
`define INST_FRM_RNE 3'b000 // round to nearest even
`define INST_FRM_RTZ 3'b001 // round to zero
`define INST_FRM_RDN 3'b010 // round to -inf
`define INST_FRM_RUP 3'b011 // round to +inf
`define INST_FRM_RMM 3'b100 // round to nearest max magnitude
`define INST_FRM_DYN 3'b111 // dynamic mode
`define INST_FRM_BITS 3
///////////////////////////////////////////////////////////////////////////////
`define INST_OP_BITS 4
`define INST_MOD_BITS 3
`define INST_FMT_BITS 2
///////////////////////////////////////////////////////////////////////////////
`define INST_ALU_ADD 4'b0000
`define INST_ALU_LUI 4'b0010
`define INST_ALU_AUIPC 4'b0011
`define INST_ALU_SLTU 4'b0100
`define INST_ALU_SLT 4'b0101
`define INST_ALU_SUB 4'b0111
`define INST_ALU_SRL 4'b1000
`define INST_ALU_SRA 4'b1001
`define INST_ALU_AND 4'b1100
`define INST_ALU_OR 4'b1101
`define INST_ALU_XOR 4'b1110
`define INST_ALU_SLL 4'b1111
`define INST_ALU_OTHER 4'b0111
`define INST_ALU_BITS 4
`define INST_ALU_CLASS(op) op[3:2]
`define INST_ALU_SIGNED(op) op[0]
`define INST_ALU_IS_SUB(op) op[1]
`define INST_ALU_IS_BR(mod) mod[0]
`define INST_ALU_IS_M(mod) mod[1]
`define INST_ALU_IS_W(mod) mod[2]
`define INST_BR_EQ 4'b0000
`define INST_BR_NE 4'b0010
`define INST_BR_LTU 4'b0100
`define INST_BR_GEU 4'b0110
`define INST_BR_LT 4'b0101
`define INST_BR_GE 4'b0111
`define INST_BR_JAL 4'b1000
`define INST_BR_JALR 4'b1001
`define INST_BR_ECALL 4'b1010
`define INST_BR_EBREAK 4'b1011
`define INST_BR_URET 4'b1100
`define INST_BR_SRET 4'b1101
`define INST_BR_MRET 4'b1110
`define INST_BR_OTHER 4'b1111
`define INST_BR_BITS 4
`define INST_BR_CLASS(op) {1'b0, ~op[3]}
`define INST_BR_IS_NEG(op) op[1]
`define INST_BR_IS_LESS(op) op[2]
`define INST_BR_IS_STATIC(op) op[3]
`define INST_M_MUL 3'b000
`define INST_M_MULHU 3'b001
`define INST_M_MULH 3'b010
`define INST_M_MULHSU 3'b011
`define INST_M_DIV 3'b100
`define INST_M_DIVU 3'b101
`define INST_M_REM 3'b110
`define INST_M_REMU 3'b111
`define INST_M_BITS 3
`define INST_M_SIGNED(op) (~op[0])
`define INST_M_IS_MULX(op) (~op[2])
`define INST_M_IS_MULH(op) (op[1:0] != 0)
`define INST_M_SIGNED_A(op) (op[1:0] != 1)
`define INST_M_IS_REM(op) op[1]
`define INST_FMT_B 3'b000
`define INST_FMT_H 3'b001
`define INST_FMT_W 3'b010
`define INST_FMT_D 3'b011
`define INST_FMT_BU 3'b100
`define INST_FMT_HU 3'b101
`define INST_FMT_WU 3'b110
`define INST_LSU_LB 4'b0000
`define INST_LSU_LH 4'b0001
`define INST_LSU_LW 4'b0010
`define INST_LSU_LD 4'b0011 // new for RV64I LD
`define INST_LSU_LBU 4'b0100
`define INST_LSU_LHU 4'b0101
`define INST_LSU_LWU 4'b0110 // new for RV64I LWU
`define INST_LSU_SB 4'b1000
`define INST_LSU_SH 4'b1001
`define INST_LSU_SW 4'b1010
`define INST_LSU_SD 4'b1011 // new for RV64I SD
`define INST_LSU_FENCE 4'b1111
`define INST_LSU_BITS 4
`define INST_LSU_FMT(op) op[2:0]
`define INST_LSU_WSIZE(op) op[1:0]
`define INST_LSU_IS_FENCE(op) (op[3:2] == 3)
`define INST_FENCE_BITS 1
`define INST_FENCE_D 1'h0
`define INST_FENCE_I 1'h1
`define INST_FPU_ADD 4'b0000
`define INST_FPU_SUB 4'b0001
`define INST_FPU_MUL 4'b0010
`define INST_FPU_DIV 4'b0011
`define INST_FPU_SQRT 4'b0100
`define INST_FPU_CMP 4'b0101 // mod: LE=0, LT=1, EQ=2
`define INST_FPU_F2F 4'b0110
`define INST_FPU_MISC 4'b0111 // mod: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
`define INST_FPU_F2I 4'b1000
`define INST_FPU_F2U 4'b1001
`define INST_FPU_I2F 4'b1010
`define INST_FPU_U2F 4'b1011
`define INST_FPU_MADD 4'b1100
`define INST_FPU_MSUB 4'b1101
`define INST_FPU_NMSUB 4'b1110
`define INST_FPU_NMADD 4'b1111
`define INST_FPU_BITS 4
`define INST_FPU_IS_W(mod) (mod[4])
`define INST_FPU_IS_CLASS(op, mod) (op == `INST_FPU_MISC && mod == 3)
`define INST_FPU_IS_MVXW(op, mod) (op == `INST_FPU_MISC && mod == 4)
`define INST_SFU_TMC 4'h0
`define INST_SFU_WSPAWN 4'h1
`define INST_SFU_SPLIT 4'h2
`define INST_SFU_JOIN 4'h3
`define INST_SFU_BAR 4'h4
`define INST_SFU_PRED 4'h5
`define INST_SFU_CSRRW 4'h6
`define INST_SFU_CSRRS 4'h7
`define INST_SFU_CSRRC 4'h8
`define INST_SFU_CMOV 4'h9
`define INST_SFU_BITS 4
`define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1)
`define INST_SFU_IS_WCTL(op) (op <= 5)
`define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8)
///////////////////////////////////////////////////////////////////////////////
// non-cacheable tag bits
`define NC_TAG_BITS 1
// cache address type bits
`ifdef SM_ENABLE
`define CACHE_ADDR_TYPE_BITS (`NC_TAG_BITS + 1)
`else
`define CACHE_ADDR_TYPE_BITS `NC_TAG_BITS
`endif
`define ARB_SEL_BITS(I, O) ((I > O) ? `CLOG2((I + O - 1) / O) : 0)
///////////////////////////////////////////////////////////////////////////////
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks) \
(`CLOG2(mshr_size) + `CLOG2(num_banks) + `NC_TAG_BITS)
`define CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
(`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width)
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
(`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) + `NC_TAG_BITS)
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width) \
`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width))
///////////////////////////////////////////////////////////////////////////////
`define CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches) \
(tag_width + `ARB_SEL_BITS(num_inputs, `UP(num_caches)))
`define CACHE_CLUSTER_MEM_ARB_TAG(tag_width, num_caches) \
(tag_width + `ARB_SEL_BITS(`UP(num_caches), 1))
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches)
`define CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG((`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
`define CACHE_CLUSTER_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG((`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)) + `NC_TAG_BITS), num_caches)
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches))), num_caches)
///////////////////////////////////////////////////////////////////////////////
`ifdef L2_ENABLE
`define L2_LINE_SIZE `MEM_BLOCK_SIZE
`else
`define L2_LINE_SIZE `L1_LINE_SIZE
`endif
`ifdef L3_ENABLE
`define L3_LINE_SIZE `MEM_BLOCK_SIZE
`else
`define L3_LINE_SIZE `L2_LINE_SIZE
`endif
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE))
`define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8)
`define VX_MEM_TAG_WIDTH L3_MEM_TAG_WIDTH
`define VX_DCR_ADDR_WIDTH `VX_DCR_ADDR_BITS
`define VX_DCR_DATA_WIDTH 32
`define TO_FULL_ADDR(x) {x, (`MEM_ADDR_WIDTH-$bits(x))'(0)}
///////////////////////////////////////////////////////////////////////////////
`define BUFFER_BUSY(dst, src, enable) \
logic __busy; \
if (enable) begin \
always @(posedge clk) begin \
if (reset) begin \
__busy <= 1'b0; \
end else begin \
__busy <= src; \
end \
end \
end else begin \
assign __busy = src; \
end \
assign dst = __busy
`define POP_COUNT_EX(out, in, model) \
VX_popcount #( \
.N ($bits(in)), \
.MODEL (model) \
) __``out ( \
.data_in (in), \
.data_out (out) \
)
`define POP_COUNT(out, in) `POP_COUNT_EX(out, in, 1)
`define ASSIGN_VX_MEM_BUS_IF(dst, src) \
assign dst.req_valid = src.req_valid; \
assign dst.req_data = src.req_data; \
assign src.req_ready = dst.req_ready; \
assign src.rsp_valid = dst.rsp_valid; \
assign src.rsp_data = dst.rsp_data; \
assign dst.rsp_ready = src.rsp_ready
`define ASSIGN_VX_MEM_BUS_IF_X(dst, src, TD, TS) \
assign dst.req_valid = src.req_valid; \
assign dst.req_data.rw = src.req_data.rw; \
assign dst.req_data.byteen = src.req_data.byteen; \
assign dst.req_data.addr = src.req_data.addr; \
assign dst.req_data.data = src.req_data.data; \
if (TD != TS) \
assign dst.req_data.tag = {src.req_data.tag, {(TD-TS){1'b0}}}; \
else \
assign dst.req_data.tag = src.req_data.tag; \
assign src.req_ready = dst.req_ready; \
assign src.rsp_valid = dst.rsp_valid; \
assign src.rsp_data.data = dst.rsp_data.data; \
assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \
assign dst.rsp_ready = src.rsp_ready
`define BUFFER_DCR_BUS_IF(dst, src, enable) \
logic [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __``dst; \
if (enable) begin \
always @(posedge clk) begin \
__``dst <= {src.write_valid, src.write_addr, src.write_data}; \
end \
end else begin \
assign __``dst = {src.write_valid, src.write_addr, src.write_data}; \
end \
VX_dcr_bus_if dst(); \
assign {dst.write_valid, dst.write_addr, dst.write_data} = __``dst
`define PERF_REDUCE(dst, src, field, width, count) \
wire [count-1:0][width-1:0] __reduce_add_i_``src``field; \
wire [width-1:0] __reduce_add_o_``dst``field; \
reg [width-1:0] __reduce_add_r_``dst``field; \
for (genvar __i = 0; __i < count; ++__i) begin \
assign __reduce_add_i_``src``field[__i] = ``src[__i].``field; \
end \
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_``dst``field ( \
__reduce_add_i_``src``field, \
__reduce_add_o_``dst``field \
); \
always @(posedge clk) begin \
if (reset) begin \
__reduce_add_r_``dst``field <= '0; \
end else begin \
__reduce_add_r_``dst``field <= __reduce_add_o_``dst``field; \
end \
end \
assign ``dst.``field = __reduce_add_r_``dst``field
`define PERF_CACHE_REDUCE(dst, src, count) \
`PERF_REDUCE (dst, src, reads, `PERF_CTR_BITS, count); \
`PERF_REDUCE (dst, src, writes, `PERF_CTR_BITS, count); \
`PERF_REDUCE (dst, src, read_misses, `PERF_CTR_BITS, count); \
`PERF_REDUCE (dst, src, write_misses, `PERF_CTR_BITS, count); \
`PERF_REDUCE (dst, src, bank_stalls, `PERF_CTR_BITS, count); \
`PERF_REDUCE (dst, src, mshr_stalls, `PERF_CTR_BITS, count); \
`PERF_REDUCE (dst, src, mem_stalls, `PERF_CTR_BITS, count); \
`PERF_REDUCE (dst, src, crsp_stalls, `PERF_CTR_BITS, count)
`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \
if (block_size != 1) begin \
if (block_size != `NUM_WARPS) begin \
assign dst = {src[`NW_WIDTH-1:`CLOG2(block_size)], `CLOG2(block_size)'(block_idx)}; \
end else begin \
assign dst = `NW_WIDTH'(block_idx); \
end \
end else begin \
assign dst = src; \
end
`define TO_DISPATCH_DATA(data, tid) \
{data.uuid, data.wis, data.tmask, data.op_type, data.op_mod, data.wb, data.use_PC, data.use_imm, data.PC, data.imm, data.rd, tid, data.rs1_data, data.rs2_data, data.rs3_data}
///////////////////////////////////////////////////////////////////////////////
`endif // VX_DEFINE_VH
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`ifndef VX_DEFINE_VH
`define VX_DEFINE_VH
`include "VX_platform.vh"
`include "VX_config.vh"
`include "VX_types.vh"
///////////////////////////////////////////////////////////////////////////////
`define NW_BITS `CLOG2(`NUM_WARPS)
`define NC_WIDTH `UP(`NC_BITS)
`define NT_BITS `CLOG2(`NUM_THREADS)
`define NW_WIDTH `UP(`NW_BITS)
`define NC_BITS `CLOG2(`NUM_CORES)
`define NT_WIDTH `UP(`NT_BITS)
`define NB_BITS `CLOG2(`NUM_BARRIERS)
`define NB_WIDTH `UP(`NB_BITS)
`define NUM_IREGS 32
`define NRI_BITS `CLOG2(`NUM_IREGS)
`ifdef EXT_F_ENABLE
`define NUM_REGS (2 * `NUM_IREGS)
`else
`define NUM_REGS `NUM_IREGS
`endif
`define NR_BITS `CLOG2(`NUM_REGS)
`define PERF_CTR_BITS 44
`ifndef NDEBUG
`define UUID_WIDTH 44
`else
`define UUID_WIDTH 1
`endif
///////////////////////////////////////////////////////////////////////////////
`define EX_ALU 0
`define EX_LSU 1
`define EX_SFU 2
`define EX_FPU (`EX_SFU + `EXT_F_ENABLED)
`define NUM_EX_UNITS (3 + `EXT_F_ENABLED)
`define EX_BITS `CLOG2(`NUM_EX_UNITS)
`define EX_WIDTH `UP(`EX_BITS)
`define SFU_CSRS 0
`define SFU_WCTL 1
`define NUM_SFU_UNITS (2)
`define SFU_BITS `CLOG2(`NUM_SFU_UNITS)
`define SFU_WIDTH `UP(`SFU_BITS)
///////////////////////////////////////////////////////////////////////////////
`define INST_LUI 7'b0110111
`define INST_AUIPC 7'b0010111
`define INST_JAL 7'b1101111
`define INST_JALR 7'b1100111
`define INST_B 7'b1100011 // branch instructions
`define INST_L 7'b0000011 // load instructions
`define INST_S 7'b0100011 // store instructions
`define INST_I 7'b0010011 // immediate instructions
`define INST_R 7'b0110011 // register instructions
`define INST_FENCE 7'b0001111 // Fence instructions
`define INST_SYS 7'b1110011 // system instructions
// RV64I instruction specific opcodes (for any W instruction)
`define INST_I_W 7'b0011011 // W type immediate instructions
`define INST_R_W 7'b0111011 // W type register instructions
`define INST_FL 7'b0000111 // float load instruction
`define INST_FS 7'b0100111 // float store instruction
`define INST_FMADD 7'b1000011
`define INST_FMSUB 7'b1000111
`define INST_FNMSUB 7'b1001011
`define INST_FNMADD 7'b1001111
`define INST_FCI 7'b1010011 // float common instructions
// Custom extension opcodes
`define INST_EXT1 7'b0001011 // 0x0B
`define INST_EXT2 7'b0101011 // 0x2B
`define INST_EXT3 7'b1011011 // 0x5B
`define INST_EXT4 7'b1111011 // 0x7B
///////////////////////////////////////////////////////////////////////////////
`define INST_FRM_RNE 3'b000 // round to nearest even
`define INST_FRM_RTZ 3'b001 // round to zero
`define INST_FRM_RDN 3'b010 // round to -inf
`define INST_FRM_RUP 3'b011 // round to +inf
`define INST_FRM_RMM 3'b100 // round to nearest max magnitude
`define INST_FRM_DYN 3'b111 // dynamic mode
`define INST_FRM_BITS 3
///////////////////////////////////////////////////////////////////////////////
`define INST_OP_BITS 4
`define INST_MOD_BITS 3
`define INST_FMT_BITS 2
///////////////////////////////////////////////////////////////////////////////
`define INST_ALU_ADD 4'b0000
`define INST_ALU_LUI 4'b0010
`define INST_ALU_AUIPC 4'b0011
`define INST_ALU_SLTU 4'b0100
`define INST_ALU_SLT 4'b0101
`define INST_ALU_SUB 4'b0111
`define INST_ALU_SRL 4'b1000
`define INST_ALU_SRA 4'b1001
`define INST_ALU_AND 4'b1100
`define INST_ALU_OR 4'b1101
`define INST_ALU_XOR 4'b1110
`define INST_ALU_SLL 4'b1111
`define INST_ALU_OTHER 4'b0111
`define INST_ALU_BITS 4
`define INST_ALU_CLASS(op) op[3:2]
`define INST_ALU_SIGNED(op) op[0]
`define INST_ALU_IS_SUB(op) op[1]
`define INST_ALU_IS_BR(mod) mod[0]
`define INST_ALU_IS_M(mod) mod[1]
`define INST_ALU_IS_W(mod) mod[2]
`define INST_BR_EQ 4'b0000
`define INST_BR_NE 4'b0010
`define INST_BR_LTU 4'b0100
`define INST_BR_GEU 4'b0110
`define INST_BR_LT 4'b0101
`define INST_BR_GE 4'b0111
`define INST_BR_JAL 4'b1000
`define INST_BR_JALR 4'b1001
`define INST_BR_ECALL 4'b1010
`define INST_BR_EBREAK 4'b1011
`define INST_BR_URET 4'b1100
`define INST_BR_SRET 4'b1101
`define INST_BR_MRET 4'b1110
`define INST_BR_OTHER 4'b1111
`define INST_BR_BITS 4
`define INST_BR_CLASS(op) {1'b0, ~op[3]}
`define INST_BR_IS_NEG(op) op[1]
`define INST_BR_IS_LESS(op) op[2]
`define INST_BR_IS_STATIC(op) op[3]
`define INST_M_MUL 3'b000
`define INST_M_MULHU 3'b001
`define INST_M_MULH 3'b010
`define INST_M_MULHSU 3'b011
`define INST_M_DIV 3'b100
`define INST_M_DIVU 3'b101
`define INST_M_REM 3'b110
`define INST_M_REMU 3'b111
`define INST_M_BITS 3
`define INST_M_SIGNED(op) (~op[0])
`define INST_M_IS_MULX(op) (~op[2])
`define INST_M_IS_MULH(op) (op[1:0] != 0)
`define INST_M_SIGNED_A(op) (op[1:0] != 1)
`define INST_M_IS_REM(op) op[1]
`define INST_FMT_B 3'b000
`define INST_FMT_H 3'b001
`define INST_FMT_W 3'b010
`define INST_FMT_D 3'b011
`define INST_FMT_BU 3'b100
`define INST_FMT_HU 3'b101
`define INST_FMT_WU 3'b110
`define INST_LSU_LB 4'b0000
`define INST_LSU_LH 4'b0001
`define INST_LSU_LW 4'b0010
`define INST_LSU_LD 4'b0011 // new for RV64I LD
`define INST_LSU_LBU 4'b0100
`define INST_LSU_LHU 4'b0101
`define INST_LSU_LWU 4'b0110 // new for RV64I LWU
`define INST_LSU_SB 4'b1000
`define INST_LSU_SH 4'b1001
`define INST_LSU_SW 4'b1010
`define INST_LSU_SD 4'b1011 // new for RV64I SD
`define INST_LSU_FENCE 4'b1111
`define INST_LSU_BITS 4
`define INST_LSU_FMT(op) op[2:0]
`define INST_LSU_WSIZE(op) op[1:0]
`define INST_LSU_IS_FENCE(op) (op[3:2] == 3)
`define INST_FENCE_BITS 1
`define INST_FENCE_D 1'h0
`define INST_FENCE_I 1'h1
`define INST_FPU_ADD 4'b0000
`define INST_FPU_SUB 4'b0001
`define INST_FPU_MUL 4'b0010
`define INST_FPU_DIV 4'b0011
`define INST_FPU_SQRT 4'b0100
`define INST_FPU_CMP 4'b0101 // mod: LE=0, LT=1, EQ=2
`define INST_FPU_F2F 4'b0110
`define INST_FPU_MISC 4'b0111 // mod: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
`define INST_FPU_F2I 4'b1000
`define INST_FPU_F2U 4'b1001
`define INST_FPU_I2F 4'b1010
`define INST_FPU_U2F 4'b1011
`define INST_FPU_MADD 4'b1100
`define INST_FPU_MSUB 4'b1101
`define INST_FPU_NMSUB 4'b1110
`define INST_FPU_NMADD 4'b1111
`define INST_FPU_BITS 4
`define INST_FPU_IS_W(mod) (mod[4])
`define INST_FPU_IS_CLASS(op, mod) (op == `INST_FPU_MISC && mod == 3)
`define INST_FPU_IS_MVXW(op, mod) (op == `INST_FPU_MISC && mod == 4)
`define INST_SFU_TMC 4'h0
`define INST_SFU_WSPAWN 4'h1
`define INST_SFU_SPLIT 4'h2
`define INST_SFU_JOIN 4'h3
`define INST_SFU_BAR 4'h4
`define INST_SFU_PRED 4'h5
`define INST_SFU_CSRRW 4'h6
`define INST_SFU_CSRRS 4'h7
`define INST_SFU_CSRRC 4'h8
`define INST_SFU_CMOV 4'h9
`define INST_SFU_BITS 4
`define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1)
`define INST_SFU_IS_WCTL(op) (op <= 5)
`define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8)
///////////////////////////////////////////////////////////////////////////////
// non-cacheable tag bits
`define NC_TAG_BITS 1
// cache address type bits
`ifdef SM_ENABLE
`define CACHE_ADDR_TYPE_BITS (`NC_TAG_BITS + 1)
`else
`define CACHE_ADDR_TYPE_BITS `NC_TAG_BITS
`endif
`define ARB_SEL_BITS(I, O) ((I > O) ? `CLOG2((I + O - 1) / O) : 0)
///////////////////////////////////////////////////////////////////////////////
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks) \
(`CLOG2(mshr_size) + `CLOG2(num_banks) + `NC_TAG_BITS)
`define CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
(`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width)
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
(`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) + `NC_TAG_BITS)
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width) \
`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width))
///////////////////////////////////////////////////////////////////////////////
`define CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches) \
(tag_width + `ARB_SEL_BITS(num_inputs, `UP(num_caches)))
`define CACHE_CLUSTER_MEM_ARB_TAG(tag_width, num_caches) \
(tag_width + `ARB_SEL_BITS(`UP(num_caches), 1))
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches)
`define CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG((`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
`define CACHE_CLUSTER_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG((`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)) + `NC_TAG_BITS), num_caches)
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches))), num_caches)
///////////////////////////////////////////////////////////////////////////////
`ifdef L2_ENABLE
`define L2_LINE_SIZE `MEM_BLOCK_SIZE
`else
`define L2_LINE_SIZE `L1_LINE_SIZE
`endif
`ifdef L3_ENABLE
`define L3_LINE_SIZE `MEM_BLOCK_SIZE
`else
`define L3_LINE_SIZE `L2_LINE_SIZE
`endif
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE))
`define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8)
`define VX_MEM_TAG_WIDTH L3_MEM_TAG_WIDTH
`define VX_DCR_ADDR_WIDTH `VX_DCR_ADDR_BITS
`define VX_DCR_DATA_WIDTH 32
`define TO_FULL_ADDR(x) {x, (`MEM_ADDR_WIDTH-$bits(x))'(0)}
///////////////////////////////////////////////////////////////////////////////
`define BUFFER_EX(dst, src, ena, latency) \
VX_pipe_register #( \
.DATAW ($bits(dst)), \
.RESETW ($bits(dst)), \
.DEPTH (latency) \
) __``dst ( \
.clk (clk), \
.reset (reset), \
.enable (ena), \
.data_in (src), \
.data_out (dst) \
)
`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, 1)
`define POP_COUNT_EX(out, in, model) \
VX_popcount #( \
.N ($bits(in)), \
.MODEL (model) \
) __``out ( \
.data_in (in), \
.data_out (out) \
)
`define POP_COUNT(out, in) `POP_COUNT_EX(out, in, 1)
`define ASSIGN_VX_MEM_BUS_IF(dst, src) \
assign dst.req_valid = src.req_valid; \
assign dst.req_data = src.req_data; \
assign src.req_ready = dst.req_ready; \
assign src.rsp_valid = dst.rsp_valid; \
assign src.rsp_data = dst.rsp_data; \
assign dst.rsp_ready = src.rsp_ready
`define ASSIGN_VX_MEM_BUS_IF_X(dst, src, TD, TS) \
assign dst.req_valid = src.req_valid; \
assign dst.req_data.rw = src.req_data.rw; \
assign dst.req_data.byteen = src.req_data.byteen; \
assign dst.req_data.addr = src.req_data.addr; \
assign dst.req_data.data = src.req_data.data; \
if (TD != TS) \
assign dst.req_data.tag = {src.req_data.tag, {(TD-TS){1'b0}}}; \
else \
assign dst.req_data.tag = src.req_data.tag; \
assign src.req_ready = dst.req_ready; \
assign src.rsp_valid = dst.rsp_valid; \
assign src.rsp_data.data = dst.rsp_data.data; \
assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \
assign dst.rsp_ready = src.rsp_ready
`define BUFFER_DCR_BUS_IF(dst, src, enable) \
logic [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __``dst; \
if (enable) begin \
always @(posedge clk) begin \
__``dst <= {src.write_valid, src.write_addr, src.write_data}; \
end \
end else begin \
assign __``dst = {src.write_valid, src.write_addr, src.write_data}; \
end \
VX_dcr_bus_if dst(); \
assign {dst.write_valid, dst.write_addr, dst.write_data} = __``dst
`define PERF_COUNTER_ADD(dst, src, field, width, dst_count, src_count, reg_enable) \
for (genvar __d = 0; __d < dst_count; ++__d) begin \
localparam __count = ((src_count > dst_count) ? ((src_count + dst_count - 1) / dst_count) : 1); \
wire [__count-1:0][width-1:0] __reduce_add_i_``src``field; \
wire [width-1:0] __reduce_add_o_``dst``field; \
for (genvar __i = 0; __i < __count; ++__i) begin \
assign __reduce_add_i_``src``field[__i] = ``src[__d * __count + __i].``field; \
end \
VX_reduce #(.DATAW_IN(width), .N(__count), .OP("+")) __reduce_add_``dst``field ( \
__reduce_add_i_``src``field, \
__reduce_add_o_``dst``field \
); \
if (reg_enable) begin \
reg [width-1:0] __reduce_add_r_``dst``field; \
always @(posedge clk) begin \
if (reset) begin \
__reduce_add_r_``dst``field <= '0; \
end else begin \
__reduce_add_r_``dst``field <= __reduce_add_o_``dst``field; \
end \
end \
assign ``dst[__d].``field = __reduce_add_r_``dst``field; \
end else begin \
assign ``dst[__d].``field = __reduce_add_o_``dst``field; \
end \
end
`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \
if (block_size != 1) begin \
if (block_size != `NUM_WARPS) begin \
assign dst = {src[`NW_WIDTH-1:`CLOG2(block_size)], `CLOG2(block_size)'(block_idx)}; \
end else begin \
assign dst = `NW_WIDTH'(block_idx); \
end \
end else begin \
assign dst = src; \
end
`define TO_DISPATCH_DATA(data, tid) { \
data.uuid, \
data.wis, \
data.tmask, \
data.op_type, \
data.op_mod, \
data.wb, \
data.use_PC, \
data.use_imm, \
data.PC, \
data.imm, \
data.rd, \
tid, \
data.rs1_data, \
data.rs2_data, \
data.rs3_data}
///////////////////////////////////////////////////////////////////////////////
`endif // VX_DEFINE_VH

View File

@@ -99,7 +99,7 @@ package VX_gpu_pkg;
`ifdef ICACHE_ENABLE
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
`else
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `NUM_SOCKETS, `NUM_ICACHES);
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
`endif
////////////////////////// Dcache Parameters //////////////////////////////
@@ -142,10 +142,13 @@ package VX_gpu_pkg;
/////////////////////////////// L1 Parameters /////////////////////////////
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
localparam L1_MEM_ARB_TAG_WIDTH = (L1_MEM_TAG_WIDTH + `CLOG2(2));
localparam L1_MEM_ARB_TAG_WIDTH = (L1_MEM_TAG_WIDTH + `CLOG2(2));
/////////////////////////////// L2 Parameters /////////////////////////////
localparam ICACHE_MEM_ARB_IDX = 0;
localparam DCACHE_MEM_ARB_IDX = ICACHE_MEM_ARB_IDX + 1;
// Word size in bytes
localparam L2_WORD_SIZE = `L1_LINE_SIZE;
@@ -190,42 +193,46 @@ package VX_gpu_pkg;
/////////////////////////////// Issue parameters //////////////////////////
localparam ISSUE_IDX_W = `LOG2UP(`ISSUE_WIDTH);
localparam ISSUE_ISW = `CLOG2(`ISSUE_WIDTH);
localparam ISSUE_ISW_W = `UP(ISSUE_ISW);
localparam ISSUE_RATIO = `NUM_WARPS / `ISSUE_WIDTH;
localparam ISSUE_WIS_W = `LOG2UP(ISSUE_RATIO);
localparam ISSUE_ADDRW = `LOG2UP(`NUM_REGS * (ISSUE_RATIO));
localparam ISSUE_WIS = `CLOG2(ISSUE_RATIO);
localparam ISSUE_WIS_W = `UP(ISSUE_WIS);
`IGNORE_UNUSED_BEGIN
function logic [ISSUE_IDX_W-1:0] wid_to_isw(
function logic [`NW_WIDTH-1:0] wis_to_wid(
input logic [ISSUE_WIS_W-1:0] wis,
input logic [ISSUE_ISW_W-1:0] isw
);
if (ISSUE_WIS == 0) begin
wis_to_wid = `NW_WIDTH'(isw);
end else if (ISSUE_ISW == 0) begin
wis_to_wid = `NW_WIDTH'(wis);
end else begin
wis_to_wid = `NW_WIDTH'({wis, isw});
end
endfunction
function logic [ISSUE_ISW_W-1:0] wid_to_isw(
input logic [`NW_WIDTH-1:0] wid
);
if (`ISSUE_WIDTH > 1) begin
wid_to_isw = ISSUE_IDX_W'(wid);
if (ISSUE_ISW != 0) begin
wid_to_isw = wid[ISSUE_ISW_W-1:0];
end else begin
wid_to_isw = 0;
end
endfunction
`IGNORE_UNUSED_END
function logic [`NW_WIDTH-1:0] wis_to_wid(
input logic [ISSUE_WIS_W-1:0] wis,
input logic [ISSUE_IDX_W-1:0] isw
);
wis_to_wid = `NW_WIDTH'({wis, isw} >> (ISSUE_IDX_W-`CLOG2(`ISSUE_WIDTH)));
endfunction
function logic [ISSUE_WIS_W-1:0] wid_to_wis(
input logic [`NW_WIDTH-1:0] wid
);
wid_to_wis = ISSUE_WIS_W'(wid >> `CLOG2(`ISSUE_WIDTH));
endfunction
function logic [ISSUE_ADDRW-1:0] wis_to_addr(
input logic [`NR_BITS-1:0] rid,
input logic [ISSUE_WIS_W-1:0] wis
);
wis_to_addr = ISSUE_ADDRW'({rid, wis} >> (ISSUE_WIS_W-`CLOG2(ISSUE_RATIO)));
if (ISSUE_WIS != 0) begin
wid_to_wis = ISSUE_WIS_W'(wid >> ISSUE_ISW);
end else begin
wid_to_wis = 0;
end
endfunction
`IGNORE_UNUSED_END
endpackage

View File

@@ -14,7 +14,7 @@
`ifndef VX_PLATFORM_VH
`define VX_PLATFORM_VH
`ifndef SYNTHESIS
`ifdef SV_DPI
`include "util_dpi.vh"
`endif

View File

@@ -65,58 +65,11 @@ module VX_socket import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
cache_perf_t perf_icache;
cache_perf_t perf_dcache;
assign mem_perf_tmp_if.icache = perf_icache;
assign mem_perf_tmp_if.dcache = perf_dcache;
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.smem = 'x;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`endif
VX_mem_bus_if #(
.DATA_SIZE (ICACHE_LINE_SIZE),
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
) icache_mem_bus_if();
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_LINE_SIZE),
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
) dcache_mem_bus_if();
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
) cache_mem_bus_if[2]();
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
) mem_bus_tmp_if[1]();
`ASSIGN_VX_MEM_BUS_IF_X (cache_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
`ASSIGN_VX_MEM_BUS_IF_X (cache_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
`RESET_RELAY (mem_arb_reset, reset);
VX_mem_arb #(
.NUM_INPUTS (2),
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
.TAG_SEL_IDX (1), // Skip 0 for NC flag
.ARBITER ("R"),
.OUT_REG_REQ (2),
.OUT_REG_RSP (2)
) mem_arb (
.clk (clk),
.reset (mem_arb_reset),
.bus_in_if (cache_mem_bus_if),
.bus_out_if (mem_bus_tmp_if)
);
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if[0]);
`endif
///////////////////////////////////////////////////////////////////////////
@@ -125,6 +78,11 @@ module VX_socket import VX_gpu_pkg::*; #(
.TAG_WIDTH (ICACHE_TAG_WIDTH)
) per_core_icache_bus_if[`SOCKET_SIZE]();
VX_mem_bus_if #(
.DATA_SIZE (ICACHE_LINE_SIZE),
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
) icache_mem_bus_if();
`RESET_RELAY (icache_reset, reset);
VX_cache_cluster #(
@@ -149,7 +107,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.MEM_OUT_REG (2)
) icache (
`ifdef PERF_ENABLE
.cache_perf (perf_icache),
.cache_perf (mem_perf_tmp_if.icache),
`endif
.clk (clk),
.reset (icache_reset),
@@ -160,9 +118,14 @@ module VX_socket import VX_gpu_pkg::*; #(
///////////////////////////////////////////////////////////////////////////
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
) per_core_dcache_bus_if[`SOCKET_SIZE * DCACHE_NUM_REQS]();
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_LINE_SIZE),
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
) dcache_mem_bus_if();
`RESET_RELAY (dcache_reset, reset);
@@ -189,7 +152,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.MEM_OUT_REG (2)
) dcache (
`ifdef PERF_ENABLE
.cache_perf (perf_dcache),
.cache_perf (mem_perf_tmp_if.dcache),
`endif
.clk (clk),
.reset (dcache_reset),
@@ -197,6 +160,40 @@ module VX_socket import VX_gpu_pkg::*; #(
.mem_bus_if (dcache_mem_bus_if)
);
///////////////////////////////////////////////////////////////////////////
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
) l1_mem_bus_if[2]();
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
) l1_mem_arb_bus_if[1]();
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
`RESET_RELAY (mem_arb_reset, reset);
VX_mem_arb #(
.NUM_INPUTS (2),
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
.TAG_SEL_IDX (1), // Skip 0 for NC flag
.ARBITER ("R"),
.OUT_REG_REQ (2),
.OUT_REG_RSP (2)
) mem_arb (
.clk (clk),
.reset (mem_arb_reset),
.bus_in_if (l1_mem_bus_if),
.bus_out_if (l1_mem_arb_bus_if)
);
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, l1_mem_arb_bus_if[0]);
///////////////////////////////////////////////////////////////////////////
wire [`SOCKET_SIZE-1:0] per_core_sim_ebreak;
@@ -245,6 +242,6 @@ module VX_socket import VX_gpu_pkg::*; #(
);
end
`BUFFER_BUSY (busy, (| per_core_busy), (`SOCKET_SIZE > 1));
`BUFFER_EX(busy, (| per_core_busy), 1'b1, (`SOCKET_SIZE > 1));
endmodule

View File

@@ -58,6 +58,8 @@
`define VX_CSR_MPM_BASE 12'hB00
`define VX_CSR_MPM_BASE_H 12'hB80
`define VX_CSR_MPM_USER 12'hB03
`define VX_CSR_MPM_USER_H 12'hB83
// Machine Performance-monitoring core counters
// PERF: Standard
@@ -68,29 +70,38 @@
`define VX_CSR_MINSTRET 12'hB02
`define VX_CSR_MINSTRET_H 12'hB82
// PERF: pipeline
`define VX_CSR_MPM_IBUF_ST 12'hB03
`define VX_CSR_MPM_IBUF_ST_H 12'hB83
`define VX_CSR_MPM_SCRB_ST 12'hB04
`define VX_CSR_MPM_SCRB_ST_H 12'hB84
`define VX_CSR_MPM_ALU_ST 12'hB05
`define VX_CSR_MPM_ALU_ST_H 12'hB85
`define VX_CSR_MPM_LSU_ST 12'hB06
`define VX_CSR_MPM_LSU_ST_H 12'hB86
`define VX_CSR_MPM_FPU_ST 12'hB07
`define VX_CSR_MPM_FPU_ST_H 12'hB87
`define VX_CSR_MPM_SFU_ST 12'hB08
`define VX_CSR_MPM_SFU_ST_H 12'hB88
`define VX_CSR_MPM_SCHED_ID 12'hB03
`define VX_CSR_MPM_SCHED_ID_H 12'hB83
`define VX_CSR_MPM_SCHED_ST 12'hB04
`define VX_CSR_MPM_SCHED_ST_H 12'hB84
`define VX_CSR_MPM_IBUF_ST 12'hB05
`define VX_CSR_MPM_IBUF_ST_H 12'hB85
`define VX_CSR_MPM_SCRB_ST 12'hB06
`define VX_CSR_MPM_SCRB_ST_H 12'hB86
`define VX_CSR_MPM_SCRB_ALU 12'hB07
`define VX_CSR_MPM_SCRB_ALU_H 12'hB87
`define VX_CSR_MPM_SCRB_FPU 12'hB08
`define VX_CSR_MPM_SCRB_FPU_H 12'hB88
`define VX_CSR_MPM_SCRB_LSU 12'hB09
`define VX_CSR_MPM_SCRB_LSU_H 12'hB89
`define VX_CSR_MPM_SCRB_SFU 12'hB0A
`define VX_CSR_MPM_SCRB_SFU_H 12'hB8A
// PERF: memory
`define VX_CSR_MPM_IFETCHES 12'hB0A
`define VX_CSR_MPM_IFETCHES_H 12'hB8A
`define VX_CSR_MPM_LOADS 12'hB0B
`define VX_CSR_MPM_LOADS_H 12'hB8B
`define VX_CSR_MPM_STORES 12'hB0C
`define VX_CSR_MPM_STORES_H 12'hB8C
`define VX_CSR_MPM_IFETCH_LAT 12'hB0D
`define VX_CSR_MPM_IFETCH_LAT_H 12'hB8D
`define VX_CSR_MPM_LOAD_LAT 12'hB0E
`define VX_CSR_MPM_LOAD_LAT_H 12'hB8E
`define VX_CSR_MPM_IFETCHES 12'hB0B
`define VX_CSR_MPM_IFETCHES_H 12'hB8B
`define VX_CSR_MPM_LOADS 12'hB0C
`define VX_CSR_MPM_LOADS_H 12'hB8C
`define VX_CSR_MPM_STORES 12'hB0D
`define VX_CSR_MPM_STORES_H 12'hB8D
`define VX_CSR_MPM_IFETCH_LT 12'hB0E
`define VX_CSR_MPM_IFETCH_LT_H 12'hB8E
`define VX_CSR_MPM_LOAD_LT 12'hB0F
`define VX_CSR_MPM_LOAD_LT_H 12'hB8F
// SFU: scoreboard
`define VX_CSR_MPM_SCRB_WCTL 12'hB10
`define VX_CSR_MPM_SCRB_WCTL_H 12'hB90
`define VX_CSR_MPM_SCRB_CSRS 12'hB11
`define VX_CSR_MPM_SCRB_CSRS_H 12'hB91
// Machine Performance-monitoring memory counters
// PERF: icache
@@ -98,59 +109,61 @@
`define VX_CSR_MPM_ICACHE_READS_H 12'hB83
`define VX_CSR_MPM_ICACHE_MISS_R 12'hB04 // read misses
`define VX_CSR_MPM_ICACHE_MISS_R_H 12'hB84
`define VX_CSR_MPM_ICACHE_MSHR_ST 12'hB05 // MSHR stalls
`define VX_CSR_MPM_ICACHE_MSHR_ST_H 12'hB85
// PERF: dcache
`define VX_CSR_MPM_DCACHE_READS 12'hB05 // total reads
`define VX_CSR_MPM_DCACHE_READS_H 12'hB85
`define VX_CSR_MPM_DCACHE_WRITES 12'hB06 // total writes
`define VX_CSR_MPM_DCACHE_WRITES_H 12'hB86
`define VX_CSR_MPM_DCACHE_MISS_R 12'hB07 // read misses
`define VX_CSR_MPM_DCACHE_MISS_R_H 12'hB87
`define VX_CSR_MPM_DCACHE_MISS_W 12'hB08 // write misses
`define VX_CSR_MPM_DCACHE_MISS_W_H 12'hB88
`define VX_CSR_MPM_DCACHE_BANK_ST 12'hB09 // bank conflicts
`define VX_CSR_MPM_DCACHE_BANK_ST_H 12'hB89
`define VX_CSR_MPM_DCACHE_MSHR_ST 12'hB0A // MSHR stalls
`define VX_CSR_MPM_DCACHE_MSHR_ST_H 12'hB8A
// PERF: smem
`define VX_CSR_MPM_SMEM_READS 12'hB0B // memory reads
`define VX_CSR_MPM_SMEM_READS_H 12'hB8B
`define VX_CSR_MPM_SMEM_WRITES 12'hB0C // memory writes
`define VX_CSR_MPM_SMEM_WRITES_H 12'hB8C
`define VX_CSR_MPM_SMEM_BANK_ST 12'hB0D // bank conflicts
`define VX_CSR_MPM_SMEM_BANK_ST_H 12'hB8D
`define VX_CSR_MPM_DCACHE_READS 12'hB06 // total reads
`define VX_CSR_MPM_DCACHE_READS_H 12'hB86
`define VX_CSR_MPM_DCACHE_WRITES 12'hB07 // total writes
`define VX_CSR_MPM_DCACHE_WRITES_H 12'hB87
`define VX_CSR_MPM_DCACHE_MISS_R 12'hB08 // read misses
`define VX_CSR_MPM_DCACHE_MISS_R_H 12'hB88
`define VX_CSR_MPM_DCACHE_MISS_W 12'hB09 // write misses
`define VX_CSR_MPM_DCACHE_MISS_W_H 12'hB89
`define VX_CSR_MPM_DCACHE_BANK_ST 12'hB0A // bank conflicts
`define VX_CSR_MPM_DCACHE_BANK_ST_H 12'hB8A
`define VX_CSR_MPM_DCACHE_MSHR_ST 12'hB0B // MSHR stalls
`define VX_CSR_MPM_DCACHE_MSHR_ST_H 12'hB8B
// PERF: l2cache
`define VX_CSR_MPM_L2CACHE_READS 12'hB0E // total reads
`define VX_CSR_MPM_L2CACHE_READS_H 12'hB8E
`define VX_CSR_MPM_L2CACHE_WRITES 12'hB0F // total writes
`define VX_CSR_MPM_L2CACHE_WRITES_H 12'hB8F
`define VX_CSR_MPM_L2CACHE_MISS_R 12'hB10 // read misses
`define VX_CSR_MPM_L2CACHE_MISS_R_H 12'hB90
`define VX_CSR_MPM_L2CACHE_MISS_W 12'hB11 // write misses
`define VX_CSR_MPM_L2CACHE_MISS_W_H 12'hB91
`define VX_CSR_MPM_L2CACHE_BANK_ST 12'hB12 // bank conflicts
`define VX_CSR_MPM_L2CACHE_BANK_ST_H 12'hB92
`define VX_CSR_MPM_L2CACHE_MSHR_ST 12'hB13 // MSHR stalls
`define VX_CSR_MPM_L2CACHE_MSHR_ST_H 12'hB93
`define VX_CSR_MPM_L2CACHE_READS 12'hB0C // total reads
`define VX_CSR_MPM_L2CACHE_READS_H 12'hB8C
`define VX_CSR_MPM_L2CACHE_WRITES 12'hB0D // total writes
`define VX_CSR_MPM_L2CACHE_WRITES_H 12'hB8D
`define VX_CSR_MPM_L2CACHE_MISS_R 12'hB0E // read misses
`define VX_CSR_MPM_L2CACHE_MISS_R_H 12'hB8E
`define VX_CSR_MPM_L2CACHE_MISS_W 12'hB0F // write misses
`define VX_CSR_MPM_L2CACHE_MISS_W_H 12'hB8F
`define VX_CSR_MPM_L2CACHE_BANK_ST 12'hB10 // bank conflicts
`define VX_CSR_MPM_L2CACHE_BANK_ST_H 12'hB90
`define VX_CSR_MPM_L2CACHE_MSHR_ST 12'hB11 // MSHR stalls
`define VX_CSR_MPM_L2CACHE_MSHR_ST_H 12'hB91
// PERF: l3cache
`define VX_CSR_MPM_L3CACHE_READS 12'hB14 // total reads
`define VX_CSR_MPM_L3CACHE_READS_H 12'hB94
`define VX_CSR_MPM_L3CACHE_WRITES 12'hB15 // total writes
`define VX_CSR_MPM_L3CACHE_WRITES_H 12'hB95
`define VX_CSR_MPM_L3CACHE_MISS_R 12'hB16 // read misses
`define VX_CSR_MPM_L3CACHE_MISS_R_H 12'hB96
`define VX_CSR_MPM_L3CACHE_MISS_W 12'hB17 // write misses
`define VX_CSR_MPM_L3CACHE_MISS_W_H 12'hB97
`define VX_CSR_MPM_L3CACHE_BANK_ST 12'hB18 // bank conflicts
`define VX_CSR_MPM_L3CACHE_BANK_ST_H 12'hB98
`define VX_CSR_MPM_L3CACHE_MSHR_ST 12'hB19 // MSHR stalls
`define VX_CSR_MPM_L3CACHE_MSHR_ST_H 12'hB99
`define VX_CSR_MPM_L3CACHE_READS 12'hB12 // total reads
`define VX_CSR_MPM_L3CACHE_READS_H 12'hB92
`define VX_CSR_MPM_L3CACHE_WRITES 12'hB13 // total writes
`define VX_CSR_MPM_L3CACHE_WRITES_H 12'hB93
`define VX_CSR_MPM_L3CACHE_MISS_R 12'hB14 // read misses
`define VX_CSR_MPM_L3CACHE_MISS_R_H 12'hB94
`define VX_CSR_MPM_L3CACHE_MISS_W 12'hB15 // write misses
`define VX_CSR_MPM_L3CACHE_MISS_W_H 12'hB95
`define VX_CSR_MPM_L3CACHE_BANK_ST 12'hB16 // bank conflicts
`define VX_CSR_MPM_L3CACHE_BANK_ST_H 12'hB96
`define VX_CSR_MPM_L3CACHE_MSHR_ST 12'hB17 // MSHR stalls
`define VX_CSR_MPM_L3CACHE_MSHR_ST_H 12'hB97
// PERF: memory
`define VX_CSR_MPM_MEM_READS 12'hB1A // total reads
`define VX_CSR_MPM_MEM_READS_H 12'hB9A
`define VX_CSR_MPM_MEM_WRITES 12'hB1B // total writes
`define VX_CSR_MPM_MEM_WRITES_H 12'hB9B
`define VX_CSR_MPM_MEM_LAT 12'hB1C // memory latency
`define VX_CSR_MPM_MEM_LAT_H 12'hB9C
`define VX_CSR_MPM_MEM_READS 12'hB18 // total reads
`define VX_CSR_MPM_MEM_READS_H 12'hB98
`define VX_CSR_MPM_MEM_WRITES 12'hB19 // total writes
`define VX_CSR_MPM_MEM_WRITES_H 12'hB99
`define VX_CSR_MPM_MEM_LT 12'hB1A // memory latency
`define VX_CSR_MPM_MEM_LT_H 12'hB9A
// PERF: smem
`define VX_CSR_MPM_SMEM_READS 12'hB1B // memory reads
`define VX_CSR_MPM_SMEM_READS_H 12'hB9B
`define VX_CSR_MPM_SMEM_WRITES 12'hB1C // memory writes
`define VX_CSR_MPM_SMEM_WRITES_H 12'hB9C
`define VX_CSR_MPM_SMEM_BANK_ST 12'hB1D // bank conflicts
`define VX_CSR_MPM_SMEM_BANK_ST_H 12'hB9D
// Machine Information Registers

View File

@@ -22,15 +22,15 @@ module Vortex import VX_gpu_pkg::*; (
// Memory request
output wire mem_req_valid,
output wire mem_req_rw,
output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
output wire mem_req_rw,
output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
output wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data,
output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag,
input wire mem_req_ready,
// Memory response
input wire mem_rsp_valid,
input wire mem_rsp_valid,
input wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data,
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready,
@@ -45,17 +45,11 @@ module Vortex import VX_gpu_pkg::*; (
);
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_if();
cache_perf_t perf_l3cache;
mem_perf_t mem_perf;
assign mem_perf_if.icache = 'x;
assign mem_perf_if.dcache = 'x;
VX_mem_perf_if mem_perf_if();
assign mem_perf_if.icache = 'x;
assign mem_perf_if.dcache = 'x;
assign mem_perf_if.l2cache = 'x;
assign mem_perf_if.l3cache = perf_l3cache;
assign mem_perf_if.smem = 'x;
assign mem_perf_if.mem = mem_perf;
`endif
`endif
VX_mem_bus_if #(
.DATA_SIZE (`L2_LINE_SIZE),
@@ -93,7 +87,7 @@ module Vortex import VX_gpu_pkg::*; (
.reset (l3_reset),
`ifdef PERF_ENABLE
.cache_perf (perf_l3cache),
.cache_perf (mem_perf_if.l3cache),
`endif
.core_bus_if (per_cluster_mem_bus_if),
@@ -166,11 +160,12 @@ module Vortex import VX_gpu_pkg::*; (
);
end
`BUFFER_BUSY (busy, (| per_cluster_busy), (`NUM_CLUSTERS > 1));
`BUFFER_EX(busy, (| per_cluster_busy), 1'b1, (`NUM_CLUSTERS > 1));
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
mem_perf_t mem_perf;
always @(posedge clk) begin
if (reset) begin
@@ -181,19 +176,19 @@ module Vortex import VX_gpu_pkg::*; (
end
end
wire mem_rd_req_fire = mem_req_fire && ~mem_bus_if.req_data.rw;
wire mem_wr_req_fire = mem_req_fire && mem_bus_if.req_data.rw;
always @(posedge clk) begin
if (reset) begin
mem_perf <= '0;
end else begin
if (mem_req_fire && ~mem_bus_if.req_data.rw) begin
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(1);
end
if (mem_req_fire && mem_bus_if.req_data.rw) begin
mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(1);
end
end else begin
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(mem_rd_req_fire);
mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(mem_wr_req_fire);
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
end
end
assign mem_perf_if.mem = mem_perf;
`endif

View File

@@ -262,7 +262,7 @@ module VX_afu_wrap #(
.m_axi_awready (m_axi_mem_awready_a),
.m_axi_awaddr (m_axi_mem_awaddr_w),
.m_axi_awid (m_axi_mem_awid_a),
`UNUSED_PIN (m_axi_awlen),
.m_axi_awlen (m_axi_mem_awlen_a),
`UNUSED_PIN (m_axi_awsize),
`UNUSED_PIN (m_axi_awburst),
`UNUSED_PIN (m_axi_awlock),

View File

@@ -530,14 +530,17 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
wire [NUM_REQS-1:0] perf_core_reads_per_req = core_req_valid & core_req_ready & ~core_req_rw;
wire [NUM_REQS-1:0] perf_core_writes_per_req = core_req_valid & core_req_ready & core_req_rw;
wire [NUM_REQS-1:0] perf_core_reads_per_req;
wire [NUM_REQS-1:0] perf_core_writes_per_req;
// per cycle: read misses, write misses, msrq stalls, pipeline stalls
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_crsp_stall_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
`BUFFER(perf_core_reads_per_req, core_req_valid & core_req_ready & ~core_req_rw);
`BUFFER(perf_core_writes_per_req, core_req_valid & core_req_ready & core_req_rw);
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req);
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req);
@@ -560,7 +563,7 @@ module VX_cache import VX_gpu_pkg::*; #(
reg [`PERF_CTR_BITS-1:0] perf_write_misses;
reg [`PERF_CTR_BITS-1:0] perf_mshr_stalls;
reg [`PERF_CTR_BITS-1:0] perf_mem_stalls;
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
always @(posedge clk) begin
if (reset) begin

View File

@@ -83,8 +83,9 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
`STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter"))
`ifdef PERF_ENABLE
cache_perf_t perf_cache_unit[NUM_CACHES];
`PERF_CACHE_REDUCE (cache_perf, perf_cache_unit, NUM_CACHES);
cache_perf_t perf_cache_tmp[1], perf_cache_unit[NUM_CACHES];
`PERF_CACHE_ADD (perf_cache_tmp, perf_cache_unit, 1, NUM_CACHES)
assign cache_perf = perf_cache_tmp[0];
`endif
VX_mem_bus_if #(

View File

@@ -1,190 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
module VX_cache_cluster_top import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter NUM_UNITS = 2,
parameter NUM_INPUTS = 4,
parameter TAG_SEL_IDX = 0,
// Number of Word requests per cycle
parameter NUM_REQS = 4,
// Size of cache in bytes
parameter CACHE_SIZE = 16384,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 16,
// Number of banks
parameter NUM_BANKS = 4,
// Number of associative ways
parameter NUM_WAYS = 4,
// Size of a word in bytes
parameter WORD_SIZE = 4,
// Core Response Queue Size
parameter CRSQ_SIZE = 2,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 16,
// Memory Response Queue Size
parameter MRSQ_SIZE = 0,
// Memory Request Queue Size
parameter MREQ_SIZE = 4,
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Request debug identifier
parameter UUID_WIDTH = 0,
// core request tag size
parameter TAG_WIDTH = UUID_WIDTH + 16,
// enable bypass for non-cacheable addresses
parameter NC_ENABLE = 1,
// Core response output register
parameter CORE_OUT_REG = 2,
// Memory request output register
parameter MEM_OUT_REG = 2,
parameter NUM_CACHES = `UP(NUM_UNITS),
parameter PASSTHRU = (NUM_UNITS == 0),
parameter ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES),
parameter MEM_TAG_WIDTH = PASSTHRU ? (NC_ENABLE ? `CACHE_NC_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
`CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH)) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS)),
parameter MEM_TAG_X_WIDTH = MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1)
) (
input wire clk,
input wire reset,
// PERF
`ifdef PERF_ENABLE
output cache_perf_t cache_perf,
`endif
// Core request
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_valid,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_rw,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag,
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_ready,
// Core response
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_rsp_valid,
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data,
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_rsp_ready,
// Memory request
output wire mem_req_valid,
output wire mem_req_rw,
output wire [LINE_SIZE-1:0] mem_req_byteen,
output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
output wire [MEM_TAG_X_WIDTH-1:0] mem_req_tag,
input wire mem_req_ready,
// Memory response
input wire mem_rsp_valid,
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data,
input wire [MEM_TAG_X_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready
);
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (TAG_WIDTH)
) core_bus_if[NUM_INPUTS * NUM_REQS]();
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_X_WIDTH)
) mem_bus_if();
// Core request
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
for (genvar r = 0; r < NUM_REQS; ++r) begin
assign core_bus_if[i * NUM_REQS + r].req_valid = core_req_valid[i][r];
assign core_bus_if[i * NUM_REQS + r].req_data.rw = core_req_rw[i][r];
assign core_bus_if[i * NUM_REQS + r].req_data.byteen = core_req_byteen[i][r];
assign core_bus_if[i * NUM_REQS + r].req_data.addr = core_req_addr[i][r];
assign core_bus_if[i * NUM_REQS + r].req_data.data = core_req_data[i][r];
assign core_bus_if[i * NUM_REQS + r].req_data.tag = core_req_tag[i][r];
assign core_req_ready[i][r] = core_bus_if[i * NUM_REQS + r].req_ready;
end
end
// Core response
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
for (genvar r = 0; r < NUM_REQS; ++r) begin
assign core_rsp_valid[i][r] = core_bus_if[i * NUM_REQS + r].rsp_valid;
assign core_rsp_data[i][r] = core_bus_if[i * NUM_REQS + r].rsp_data.data;
assign core_rsp_tag[i][r] = core_bus_if[i * NUM_REQS + r].rsp_data.tag;
assign core_bus_if[i * NUM_REQS + r].rsp_ready = core_rsp_ready[i][r];
end
end
// Memory request
assign mem_req_valid = mem_bus_if.req_valid;
assign mem_req_rw = mem_bus_if.req_data.rw;
assign mem_req_byteen = mem_bus_if.req_data.byteen;
assign mem_req_addr = mem_bus_if.req_data.addr;
assign mem_req_data = mem_bus_if.req_data.data;
assign mem_req_tag = mem_bus_if.req_data.tag;
assign mem_bus_if.req_ready = mem_req_ready;
// Memory response
assign mem_bus_if.rsp_valid = mem_rsp_valid;
assign mem_bus_if.rsp_data.data = mem_rsp_data;
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
assign mem_rsp_ready = mem_bus_if.rsp_ready;
VX_cache_cluster #(
.INSTANCE_ID (INSTANCE_ID),
.NUM_UNITS (NUM_UNITS),
.NUM_INPUTS (NUM_INPUTS),
.TAG_SEL_IDX (TAG_SEL_IDX),
.NUM_REQS (NUM_REQS),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.CRSQ_SIZE (CRSQ_SIZE),
.MSHR_SIZE (MSHR_SIZE),
.MRSQ_SIZE (MRSQ_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.NC_ENABLE (NC_ENABLE),
.CORE_OUT_REG (CORE_OUT_REG),
.MEM_OUT_REG (MEM_OUT_REG)
) cache (
`ifdef PERF_ENABLE
.cache_perf (cache_perf),
`endif
.clk (clk),
.reset (reset),
.core_bus_if (core_bus_if),
.mem_bus_if (mem_bus_if)
);
endmodule

View File

@@ -93,13 +93,7 @@ module VX_cache_data #(
assign wren = fill;
end
wire [`CLOG2(NUM_WAYS)-1:0] way_idx;
// generate if (NUM_WAYS == 1) begin
// wire [0:0] way_idx;
// end else begin
// wire [`CLOG2(NUM_WAYS)-1:0] way_idx;
// end
// endgenerate
wire [`LOG2UP(NUM_WAYS)-1:0] way_idx;
VX_onehot_encoder #(
.N (NUM_WAYS)

View File

@@ -63,4 +63,16 @@
`define CS_LINE_TO_FULL_ADDR(x, i) {x, (`XLEN-$bits(x))'(i << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))}
`define CS_MEM_TO_FULL_ADDR(x) {x, (`XLEN-$bits(x))'(0)}
///////////////////////////////////////////////////////////////////////////////
`define PERF_CACHE_ADD(dst, src, dcount, scount) \
`PERF_COUNTER_ADD (dst, src, reads, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, writes, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, read_misses, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, write_misses, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, bank_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, mshr_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, mem_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, crsp_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1))
`endif // VX_CACHE_DEFINE_VH

View File

@@ -13,7 +13,7 @@
`include "VX_cache_define.vh"
module VX_cache_top #(
module VX_cache_top import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
// Number of Word requests per cycle
@@ -22,7 +22,7 @@ module VX_cache_top #(
// Size of cache in bytes
parameter CACHE_SIZE = 16384,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 16,
parameter LINE_SIZE = 64,
// Number of banks
parameter NUM_BANKS = 4,
// Number of associative ways

View File

@@ -45,7 +45,7 @@ module VX_commit import VX_gpu_pkg::*; #(
VX_commit_if commit_if[`ISSUE_WIDTH]();
wire [`ISSUE_WIDTH-1:0] commit_fire;
wire [`ISSUE_WIDTH-1:0] commit_fire;
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] commit_wid;
wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] commit_tmask;
wire [`ISSUE_WIDTH-1:0] commit_eop;
@@ -92,24 +92,24 @@ module VX_commit import VX_gpu_pkg::*; #(
`UNUSED_PIN (sel_out)
);
assign commit_fire[i] = commit_if[i].valid && commit_if[i].ready;
assign commit_tmask[i] = {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask;
assign commit_wid[i] = commit_if[i].data.wid;
assign commit_eop[i] = commit_if[i].data.eop;
assign commit_fire[i] = commit_if[i].valid && commit_if[i].ready;
assign commit_tmask[i]= {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask;
assign commit_wid[i] = commit_if[i].data.wid;
assign commit_eop[i] = commit_if[i].data.eop;
end
// CSRs update
wire [`ISSUE_WIDTH-1:0][COMMIT_SIZEW-1:0] commit_size, commit_size_r;
wire [COMMIT_ALL_SIZEW-1:0] commit_size_all, commit_size_all_r;
wire [COMMIT_ALL_SIZEW-1:0] commit_size_all_r, commit_size_all_rr;
wire commit_fire_any, commit_fire_any_r, commit_fire_any_rr;
assign commit_fire_any = (| commit_fire);
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
wire [COMMIT_SIZEW-1:0] pop_count;
`POP_COUNT(pop_count, commit_tmask[i]);
assign commit_size[i] = pop_count;
wire [COMMIT_SIZEW-1:0] count;
`POP_COUNT(count, commit_tmask[i]);
assign commit_size[i] = count;
end
VX_pipe_register #(
@@ -130,7 +130,7 @@ module VX_commit import VX_gpu_pkg::*; #(
.OP ("+")
) commit_size_reduce (
.data_in (commit_size_r),
.data_out (commit_size_all)
.data_out (commit_size_all_r)
);
VX_pipe_register #(
@@ -140,26 +140,26 @@ module VX_commit import VX_gpu_pkg::*; #(
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({commit_fire_any_r, commit_size_all}),
.data_out ({commit_fire_any_rr, commit_size_all_r})
.data_in ({commit_fire_any_r, commit_size_all_r}),
.data_out ({commit_fire_any_rr, commit_size_all_rr})
);
reg [`PERF_CTR_BITS-1:0] instret;
always @(posedge clk) begin
if (reset) begin
instret <= '0;
end else begin
if (commit_fire_any_rr) begin
instret <= instret + `PERF_CTR_BITS'(commit_size_all_r);
instret <= instret + `PERF_CTR_BITS'(commit_size_all_rr);
end
end
end
assign commit_csr_if.instret = instret;
// Committed instructions
wire [`ISSUE_WIDTH-1:0] committed = commit_fire & commit_eop;
VX_pipe_register #(
.DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
.RESETW (`ISSUE_WIDTH)
@@ -167,23 +167,23 @@ module VX_commit import VX_gpu_pkg::*; #(
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({(commit_fire & commit_eop), commit_wid}),
.data_in ({committed, commit_wid}),
.data_out ({commit_sched_if.committed, commit_sched_if.committed_wid})
);
// Writeback
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb;
assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb;
assign writeback_if[i].data.uuid = commit_if[i].data.uuid;
assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid);
assign writeback_if[i].data.PC = commit_if[i].data.PC;
assign writeback_if[i].data.tmask = commit_if[i].data.tmask;
assign writeback_if[i].data.rd = commit_if[i].data.rd;
assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid);
assign writeback_if[i].data.PC = commit_if[i].data.PC;
assign writeback_if[i].data.tmask= commit_if[i].data.tmask;
assign writeback_if[i].data.rd = commit_if[i].data.rd;
assign writeback_if[i].data.data = commit_if[i].data.data;
assign writeback_if[i].data.sop = commit_if[i].data.sop;
assign writeback_if[i].data.eop = commit_if[i].data.eop;
assign commit_if[i].ready = 1'b1;
assign writeback_if[i].data.sop = commit_if[i].data.sop;
assign writeback_if[i].data.eop = commit_if[i].data.eop;
assign commit_if[i].ready = 1'b1; // writeback has no backpressure
end
// simulation helper signal to get RISC-V tests Pass/Fail status

View File

@@ -1,339 +1,341 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
`ifdef EXT_F_ENABLE
`include "VX_fpu_define.vh"
`endif
module VX_core import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
) (
`SCOPE_IO_DECL
// Clock
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
`endif
VX_dcr_bus_if.slave dcr_bus_if,
VX_mem_bus_if.master smem_bus_if [DCACHE_NUM_REQS],
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS],
VX_mem_bus_if.master icache_bus_if,
`ifdef GBAR_ENABLE
VX_gbar_bus_if.master gbar_bus_if,
`endif
// simulation helper signals
output wire sim_ebreak,
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
// Status
output wire busy
);
VX_schedule_if schedule_if();
VX_fetch_if fetch_if();
VX_decode_if decode_if();
VX_sched_csr_if sched_csr_if();
VX_decode_sched_if decode_sched_if();
VX_commit_sched_if commit_sched_if();
VX_commit_csr_if commit_csr_if();
VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS]();
VX_warp_ctl_if warp_ctl_if();
VX_dispatch_if alu_dispatch_if[`ISSUE_WIDTH]();
VX_commit_if alu_commit_if[`ISSUE_WIDTH]();
VX_dispatch_if lsu_dispatch_if[`ISSUE_WIDTH]();
VX_commit_if lsu_commit_if[`ISSUE_WIDTH]();
`ifdef EXT_F_ENABLE
VX_dispatch_if fpu_dispatch_if[`ISSUE_WIDTH]();
VX_commit_if fpu_commit_if[`ISSUE_WIDTH]();
`endif
VX_dispatch_if sfu_dispatch_if[`ISSUE_WIDTH]();
VX_commit_if sfu_commit_if[`ISSUE_WIDTH]();
VX_writeback_if writeback_if[`ISSUE_WIDTH]();
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_bus_tmp_if[DCACHE_NUM_REQS]();
`ifdef PERF_ENABLE
VX_pipeline_perf_if pipeline_perf_if();
VX_mem_perf_if mem_perf_tmp_if();
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
`ifdef SM_ENABLE
cache_perf_t smem_perf;
assign mem_perf_tmp_if.smem = smem_perf;
`else
assign mem_perf_tmp_if.smem = '0;
`endif
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`endif
`RESET_RELAY (dcr_data_reset, reset);
`RESET_RELAY (schedule_reset, reset);
`RESET_RELAY (fetch_reset, reset);
`RESET_RELAY (decode_reset, reset);
`RESET_RELAY (issue_reset, reset);
`RESET_RELAY (execute_reset, reset);
`RESET_RELAY (commit_reset, reset);
base_dcrs_t base_dcrs;
VX_dcr_data dcr_data (
.clk (clk),
.reset (dcr_data_reset),
.dcr_bus_if (dcr_bus_if),
.base_dcrs (base_dcrs)
);
`SCOPE_IO_SWITCH (3)
VX_schedule #(
.CORE_ID (CORE_ID)
) schedule (
.clk (clk),
.reset (schedule_reset),
.base_dcrs (base_dcrs),
.warp_ctl_if (warp_ctl_if),
.branch_ctl_if (branch_ctl_if),
.decode_sched_if(decode_sched_if),
.commit_sched_if(commit_sched_if),
.schedule_if (schedule_if),
`ifdef GBAR_ENABLE
.gbar_bus_if (gbar_bus_if),
`endif
.sched_csr_if (sched_csr_if),
.busy (busy)
);
VX_fetch #(
.CORE_ID (CORE_ID)
) fetch (
`SCOPE_IO_BIND (0)
.clk (clk),
.reset (fetch_reset),
.icache_bus_if (icache_bus_if),
.schedule_if (schedule_if),
.fetch_if (fetch_if)
);
VX_decode #(
.CORE_ID (CORE_ID)
) decode (
.clk (clk),
.reset (decode_reset),
.fetch_if (fetch_if),
.decode_if (decode_if),
.decode_sched_if(decode_sched_if)
);
VX_issue #(
.CORE_ID (CORE_ID)
) issue (
`SCOPE_IO_BIND (1)
.clk (clk),
.reset (issue_reset),
`ifdef PERF_ENABLE
.perf_issue_if (pipeline_perf_if.issue),
`endif
.decode_if (decode_if),
.writeback_if (writeback_if),
.alu_dispatch_if(alu_dispatch_if),
.lsu_dispatch_if(lsu_dispatch_if),
`ifdef EXT_F_ENABLE
.fpu_dispatch_if(fpu_dispatch_if),
`endif
.sfu_dispatch_if(sfu_dispatch_if)
);
VX_execute #(
.CORE_ID (CORE_ID)
) execute (
`SCOPE_IO_BIND (2)
.clk (clk),
.reset (execute_reset),
.base_dcrs (base_dcrs),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_tmp_if),
.pipeline_perf_if(pipeline_perf_if),
`endif
.dcache_bus_if (dcache_bus_tmp_if),
`ifdef EXT_F_ENABLE
.fpu_dispatch_if(fpu_dispatch_if),
.fpu_commit_if (fpu_commit_if),
`endif
.commit_csr_if (commit_csr_if),
.sched_csr_if (sched_csr_if),
.alu_dispatch_if(alu_dispatch_if),
.lsu_dispatch_if(lsu_dispatch_if),
.sfu_dispatch_if(sfu_dispatch_if),
.warp_ctl_if (warp_ctl_if),
.branch_ctl_if (branch_ctl_if),
.alu_commit_if (alu_commit_if),
.lsu_commit_if (lsu_commit_if),
.sfu_commit_if (sfu_commit_if),
.sim_ebreak (sim_ebreak)
);
VX_commit #(
.CORE_ID (CORE_ID)
) commit (
.clk (clk),
.reset (commit_reset),
.alu_commit_if (alu_commit_if),
.lsu_commit_if (lsu_commit_if),
`ifdef EXT_F_ENABLE
.fpu_commit_if (fpu_commit_if),
`endif
.sfu_commit_if (sfu_commit_if),
.writeback_if (writeback_if),
.commit_csr_if (commit_csr_if),
.commit_sched_if(commit_sched_if),
.sim_wb_value (sim_wb_value)
);
`ifdef SM_ENABLE
VX_smem_unit #(
.CORE_ID (CORE_ID)
) smem_unit (
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.cache_perf (smem_perf),
`endif
.dcache_bus_in_if (dcache_bus_tmp_if),
.dcache_bus_out_if (dcache_bus_if),
.smem_bus_out_if (smem_bus_if)
);
`else
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_bus_tmp_if[i]);
end
`endif
`ifdef PERF_ENABLE
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
wire perf_icache_pending_read_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads;
reg [`PERF_CTR_BITS-1:0] perf_ifetches;
reg [`PERF_CTR_BITS-1:0] perf_loads;
reg [`PERF_CTR_BITS-1:0] perf_stores;
wire perf_icache_req_fire = icache_bus_if.req_valid & icache_bus_if.req_ready;
wire perf_icache_rsp_fire = icache_bus_if.rsp_valid & icache_bus_if.rsp_ready;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_wr_req_fire, perf_dcache_rsp_fire;
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && ~dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready;
assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready;
assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
end
`POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire);
`POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire);
`POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire);
assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire;
assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle - perf_dcache_rsp_per_cycle;
always @(posedge clk) begin
if (reset) begin
perf_icache_pending_reads <= '0;
perf_dcache_pending_reads <= '0;
end else begin
perf_icache_pending_reads <= $signed(perf_icache_pending_reads) + `PERF_CTR_BITS'($signed(perf_icache_pending_read_cycle));
perf_dcache_pending_reads <= $signed(perf_dcache_pending_reads) + `PERF_CTR_BITS'($signed(perf_dcache_pending_read_cycle));
end
end
reg [`PERF_CTR_BITS-1:0] perf_icache_lat;
reg [`PERF_CTR_BITS-1:0] perf_dcache_lat;
always @(posedge clk) begin
if (reset) begin
perf_ifetches <= '0;
perf_loads <= '0;
perf_stores <= '0;
perf_icache_lat <= '0;
perf_dcache_lat <= '0;
end else begin
perf_ifetches <= perf_ifetches + `PERF_CTR_BITS'(perf_icache_req_fire);
perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle);
perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle);
perf_icache_lat <= perf_icache_lat + perf_icache_pending_reads;
perf_dcache_lat <= perf_dcache_lat + perf_dcache_pending_reads;
end
end
assign pipeline_perf_if.ifetches = perf_ifetches;
assign pipeline_perf_if.loads = perf_loads;
assign pipeline_perf_if.stores = perf_stores;
assign pipeline_perf_if.load_latency = perf_dcache_lat;
assign pipeline_perf_if.ifetch_latency = perf_icache_lat;
assign pipeline_perf_if.load_latency = perf_dcache_lat;
`endif
endmodule
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
`ifdef EXT_F_ENABLE
`include "VX_fpu_define.vh"
`endif
module VX_core import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
) (
`SCOPE_IO_DECL
// Clock
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
`endif
VX_dcr_bus_if.slave dcr_bus_if,
VX_mem_bus_if.master smem_bus_if [DCACHE_NUM_REQS],
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS],
VX_mem_bus_if.master icache_bus_if,
`ifdef GBAR_ENABLE
VX_gbar_bus_if.master gbar_bus_if,
`endif
// simulation helper signals
output wire sim_ebreak,
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
// Status
output wire busy
);
VX_schedule_if schedule_if();
VX_fetch_if fetch_if();
VX_decode_if decode_if();
VX_sched_csr_if sched_csr_if();
VX_decode_sched_if decode_sched_if();
VX_commit_sched_if commit_sched_if();
VX_commit_csr_if commit_csr_if();
VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS]();
VX_warp_ctl_if warp_ctl_if();
VX_dispatch_if alu_dispatch_if[`ISSUE_WIDTH]();
VX_commit_if alu_commit_if[`ISSUE_WIDTH]();
VX_dispatch_if lsu_dispatch_if[`ISSUE_WIDTH]();
VX_commit_if lsu_commit_if[`ISSUE_WIDTH]();
`ifdef EXT_F_ENABLE
VX_dispatch_if fpu_dispatch_if[`ISSUE_WIDTH]();
VX_commit_if fpu_commit_if[`ISSUE_WIDTH]();
`endif
VX_dispatch_if sfu_dispatch_if[`ISSUE_WIDTH]();
VX_commit_if sfu_commit_if[`ISSUE_WIDTH]();
VX_writeback_if writeback_if[`ISSUE_WIDTH]();
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_bus_tmp_if[DCACHE_NUM_REQS]();
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
VX_pipeline_perf_if pipeline_perf_if();
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`endif
`RESET_RELAY (dcr_data_reset, reset);
`RESET_RELAY (schedule_reset, reset);
`RESET_RELAY (fetch_reset, reset);
`RESET_RELAY (decode_reset, reset);
`RESET_RELAY (issue_reset, reset);
`RESET_RELAY (execute_reset, reset);
`RESET_RELAY (commit_reset, reset);
base_dcrs_t base_dcrs;
VX_dcr_data dcr_data (
.clk (clk),
.reset (dcr_data_reset),
.dcr_bus_if (dcr_bus_if),
.base_dcrs (base_dcrs)
);
`SCOPE_IO_SWITCH (3)
VX_schedule #(
.CORE_ID (CORE_ID)
) schedule (
.clk (clk),
.reset (schedule_reset),
`ifdef PERF_ENABLE
.perf_schedule_if (pipeline_perf_if.schedule),
`endif
.base_dcrs (base_dcrs),
.warp_ctl_if (warp_ctl_if),
.branch_ctl_if (branch_ctl_if),
.decode_sched_if(decode_sched_if),
.commit_sched_if(commit_sched_if),
.schedule_if (schedule_if),
`ifdef GBAR_ENABLE
.gbar_bus_if (gbar_bus_if),
`endif
.sched_csr_if (sched_csr_if),
.busy (busy)
);
VX_fetch #(
.CORE_ID (CORE_ID)
) fetch (
`SCOPE_IO_BIND (0)
.clk (clk),
.reset (fetch_reset),
.icache_bus_if (icache_bus_if),
.schedule_if (schedule_if),
.fetch_if (fetch_if)
);
VX_decode #(
.CORE_ID (CORE_ID)
) decode (
.clk (clk),
.reset (decode_reset),
.fetch_if (fetch_if),
.decode_if (decode_if),
.decode_sched_if(decode_sched_if)
);
VX_issue #(
.CORE_ID (CORE_ID)
) issue (
`SCOPE_IO_BIND (1)
.clk (clk),
.reset (issue_reset),
`ifdef PERF_ENABLE
.perf_issue_if (pipeline_perf_if.issue),
`endif
.decode_if (decode_if),
.writeback_if (writeback_if),
.alu_dispatch_if(alu_dispatch_if),
.lsu_dispatch_if(lsu_dispatch_if),
`ifdef EXT_F_ENABLE
.fpu_dispatch_if(fpu_dispatch_if),
`endif
.sfu_dispatch_if(sfu_dispatch_if)
);
VX_execute #(
.CORE_ID (CORE_ID)
) execute (
`SCOPE_IO_BIND (2)
.clk (clk),
.reset (execute_reset),
.base_dcrs (base_dcrs),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_tmp_if),
.pipeline_perf_if(pipeline_perf_if),
`endif
.dcache_bus_if (dcache_bus_tmp_if),
`ifdef EXT_F_ENABLE
.fpu_dispatch_if(fpu_dispatch_if),
.fpu_commit_if (fpu_commit_if),
`endif
.commit_csr_if (commit_csr_if),
.sched_csr_if (sched_csr_if),
.alu_dispatch_if(alu_dispatch_if),
.lsu_dispatch_if(lsu_dispatch_if),
.sfu_dispatch_if(sfu_dispatch_if),
.warp_ctl_if (warp_ctl_if),
.branch_ctl_if (branch_ctl_if),
.alu_commit_if (alu_commit_if),
.lsu_commit_if (lsu_commit_if),
.sfu_commit_if (sfu_commit_if),
.sim_ebreak (sim_ebreak)
);
VX_commit #(
.CORE_ID (CORE_ID)
) commit (
.clk (clk),
.reset (commit_reset),
.alu_commit_if (alu_commit_if),
.lsu_commit_if (lsu_commit_if),
`ifdef EXT_F_ENABLE
.fpu_commit_if (fpu_commit_if),
`endif
.sfu_commit_if (sfu_commit_if),
.writeback_if (writeback_if),
.commit_csr_if (commit_csr_if),
.commit_sched_if(commit_sched_if),
.sim_wb_value (sim_wb_value)
);
`ifdef SM_ENABLE
VX_smem_unit #(
.CORE_ID (CORE_ID)
) smem_unit (
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.cache_perf (mem_perf_tmp_if.smem),
`endif
.dcache_bus_in_if (dcache_bus_tmp_if),
.dcache_bus_out_if (dcache_bus_if),
.smem_bus_out_if (smem_bus_if)
);
`else
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_bus_tmp_if[i]);
end
`endif
`ifdef PERF_ENABLE
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
wire [1:0] perf_icache_pending_read_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads;
reg [`PERF_CTR_BITS-1:0] perf_ifetches;
reg [`PERF_CTR_BITS-1:0] perf_loads;
reg [`PERF_CTR_BITS-1:0] perf_stores;
wire perf_icache_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
wire perf_icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire;
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && ~dcache_bus_if[i].req_data.rw;
assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && dcache_bus_if[i].req_data.rw;
assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
end
`BUFFER(perf_dcache_rd_req_fire_r, perf_dcache_rd_req_fire);
`BUFFER(perf_dcache_wr_req_fire_r, perf_dcache_wr_req_fire);
`POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire_r);
`POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire_r);
`POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire);
assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire;
assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle - perf_dcache_rsp_per_cycle;
always @(posedge clk) begin
if (reset) begin
perf_icache_pending_reads <= '0;
perf_dcache_pending_reads <= '0;
end else begin
perf_icache_pending_reads <= $signed(perf_icache_pending_reads) + `PERF_CTR_BITS'($signed(perf_icache_pending_read_cycle));
perf_dcache_pending_reads <= $signed(perf_dcache_pending_reads) + `PERF_CTR_BITS'($signed(perf_dcache_pending_read_cycle));
end
end
reg [`PERF_CTR_BITS-1:0] perf_icache_lat;
reg [`PERF_CTR_BITS-1:0] perf_dcache_lat;
always @(posedge clk) begin
if (reset) begin
perf_ifetches <= '0;
perf_loads <= '0;
perf_stores <= '0;
perf_icache_lat <= '0;
perf_dcache_lat <= '0;
end else begin
perf_ifetches <= perf_ifetches + `PERF_CTR_BITS'(perf_icache_req_fire);
perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle);
perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle);
perf_icache_lat <= perf_icache_lat + perf_icache_pending_reads;
perf_dcache_lat <= perf_dcache_lat + perf_dcache_pending_reads;
end
end
assign pipeline_perf_if.ifetches = perf_ifetches;
assign pipeline_perf_if.loads = perf_loads;
assign pipeline_perf_if.stores = perf_stores;
assign pipeline_perf_if.load_latency = perf_dcache_lat;
assign pipeline_perf_if.ifetch_latency = perf_icache_lat;
assign pipeline_perf_if.load_latency = perf_dcache_lat;
`endif
endmodule

View File

@@ -129,7 +129,13 @@ module VX_core_top import VX_gpu_pkg::*; #(
assign icache_rsp_ready = icache_bus_if.rsp_ready;
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_if();
VX_mem_perf_if mem_perf_if();
assign mem_perf_if.icache = '0;
assign mem_perf_if.dcache = '0;
assign mem_perf_if.l2cache = '0;
assign mem_perf_if.l3cache = '0;
assign mem_perf_if.smem = '0;
assign mem_perf_if.mem = '0;
`endif
`ifdef SCOPE

View File

@@ -35,7 +35,6 @@ import VX_fpu_pkg::*;
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
VX_sfu_perf_if.slave sfu_perf_if,
`endif
VX_commit_csr_if.slave commit_csr_if,
@@ -183,105 +182,115 @@ import VX_fpu_pkg::*;
default: begin
read_addr_valid_r = 0;
if ((read_addr >= `VX_CSR_MPM_BASE && read_addr < (`VX_CSR_MPM_BASE + 32))
|| (read_addr >= `VX_CSR_MPM_BASE_H && read_addr < (`VX_CSR_MPM_BASE_H + 32))) begin
if ((read_addr >= `VX_CSR_MPM_USER && read_addr < (`VX_CSR_MPM_USER + 32))
|| (read_addr >= `VX_CSR_MPM_USER_H && read_addr < (`VX_CSR_MPM_USER_H + 32))) begin
read_addr_valid_r = 1;
`ifdef PERF_ENABLE
case (base_dcrs.mpm_class)
`VX_DCR_MPM_CLASS_CORE: begin
case (read_addr)
// PERF: pipeline
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
`VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ALU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_ALU][31:0];
`VX_CSR_MPM_ALU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_ALU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LSU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_LSU][31:0];
`VX_CSR_MPM_LSU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_LSU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCHED_ID : read_data_ro_r = pipeline_perf_if.sched_idles[31:0];
`VX_CSR_MPM_SCHED_ID_H : read_data_ro_r = 32'(pipeline_perf_if.sched_idles[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0];
`VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
`VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_ALU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_ALU][31:0];
`VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_ALU][`PERF_CTR_BITS-1:32]);
`ifdef EXT_F_ENABLE
`VX_CSR_MPM_FPU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_FPU][31:0];
`VX_CSR_MPM_FPU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_FPU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_FPU][31:0];
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_FPU][`PERF_CTR_BITS-1:32]);
`else
`VX_CSR_MPM_FPU_ST : read_data_ro_r = '0;
`VX_CSR_MPM_FPU_ST_H : read_data_ro_r = '0;
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0;
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0;
`endif
`VX_CSR_MPM_SFU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_SFU][31:0];
`VX_CSR_MPM_SFU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_SFU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_LSU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_LSU][31:0];
`VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_LSU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_SFU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_SFU][31:0];
`VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_SFU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_CSRS : read_data_ro_r = pipeline_perf_if.sfu_uses[`SFU_CSRS][31:0];
`VX_CSR_MPM_SCRB_CSRS_H : read_data_ro_r = 32'(pipeline_perf_if.sfu_uses[`SFU_CSRS][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_WCTL : read_data_ro_r = pipeline_perf_if.sfu_uses[`SFU_WCTL][31:0];
`VX_CSR_MPM_SCRB_WCTL_H : read_data_ro_r = 32'(pipeline_perf_if.sfu_uses[`SFU_WCTL][`PERF_CTR_BITS-1:32]);
// PERF: memory
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LOADS : read_data_ro_r = pipeline_perf_if.loads[31:0];
`VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0];
`VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_IFETCH_LAT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
`VX_CSR_MPM_IFETCH_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LOAD_LAT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
`VX_CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LOADS : read_data_ro_r = pipeline_perf_if.loads[31:0];
`VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0];
`VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_IFETCH_LT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
`VX_CSR_MPM_IFETCH_LT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LOAD_LT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
`VX_CSR_MPM_LOAD_LT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
default:;
endcase
end
`VX_DCR_MPM_CLASS_MEM: begin
case (read_addr)
// PERF: icache
`VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0];
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0];
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0];
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0];
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ICACHE_MSHR_ST : read_data_ro_r = mem_perf_if.icache.mshr_stalls[31:0];
`VX_CSR_MPM_ICACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.icache.mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: dcache
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0];
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0];
`VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0];
`VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0];
`VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0];
`VX_CSR_MPM_DCACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0];
`VX_CSR_MPM_DCACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0];
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0];
`VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0];
`VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0];
`VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0];
`VX_CSR_MPM_DCACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0];
`VX_CSR_MPM_DCACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: smem
`VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem.reads[31:0];
`VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem.writes[31:0];
`VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem.bank_stalls[31:0];
`VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem.reads[31:0];
`VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem.writes[31:0];
`VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem.bank_stalls[31:0];
`VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem.bank_stalls[`PERF_CTR_BITS-1:32]);
// PERF: l2cache
`VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0];
`VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0];
`VX_CSR_MPM_L2CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0];
`VX_CSR_MPM_L2CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0];
`VX_CSR_MPM_L2CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0];
`VX_CSR_MPM_L2CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0];
`VX_CSR_MPM_L2CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0];
`VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0];
`VX_CSR_MPM_L2CACHE_WRITES_H: read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0];
`VX_CSR_MPM_L2CACHE_MISS_R_H: read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0];
`VX_CSR_MPM_L2CACHE_MISS_W_H: read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0];
`VX_CSR_MPM_L2CACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0];
`VX_CSR_MPM_L2CACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: l3cache
`VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0];
`VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0];
`VX_CSR_MPM_L3CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0];
`VX_CSR_MPM_L3CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0];
`VX_CSR_MPM_L3CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0];
`VX_CSR_MPM_L3CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0];
`VX_CSR_MPM_L3CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0];
`VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0];
`VX_CSR_MPM_L3CACHE_WRITES_H: read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0];
`VX_CSR_MPM_L3CACHE_MISS_R_H: read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0];
`VX_CSR_MPM_L3CACHE_MISS_W_H: read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0];
`VX_CSR_MPM_L3CACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0];
`VX_CSR_MPM_L3CACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: memory
`VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0];
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0];
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_MEM_LAT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
`VX_CSR_MPM_MEM_LAT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0];
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0];
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_MEM_LT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
`VX_CSR_MPM_MEM_LT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
default:;
endcase
end
@@ -301,8 +310,6 @@ import VX_fpu_pkg::*;
`RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
`ifdef PERF_ENABLE
wire [`PERF_CTR_BITS-1:0] perf_wctl_stalls = sfu_perf_if.wctl_stalls;
`UNUSED_VAR (perf_wctl_stalls);
`UNUSED_VAR (mem_perf_if.icache);
`UNUSED_VAR (mem_perf_if.smem);
`endif

View File

@@ -25,7 +25,6 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
VX_sfu_perf_if.slave sfu_perf_if,
`endif
`ifdef EXT_F_ENABLE
@@ -81,7 +80,6 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
.pipeline_perf_if(pipeline_perf_if),
.sfu_perf_if (sfu_perf_if),
`endif
.commit_csr_if (commit_csr_if),

View File

@@ -12,6 +12,7 @@
// limitations under the License.
`include "VX_define.vh"
`include "VX_trace.vh"
module VX_dispatch import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
@@ -174,30 +175,38 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|| (sfu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU));
end
`ifdef PERF_ENABLE
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_n, perf_stalls_r;
wire [`ISSUE_WIDTH-1:0] operands_stall;
wire [`ISSUE_WIDTH-1:0][`EX_BITS-1:0] operands_ex_type;
`ifdef PERF_ENABLE
wire [`NUM_EX_UNITS-1:0] perf_unit_stalls_per_cycle, perf_unit_stalls_per_cycle_r;
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_unit_stalls_per_cycle;
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_r;
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
assign operands_stall[i] = operands_if[i].valid && ~operands_if[i].ready;
assign operands_ex_type[i] = operands_if[i].data.ex_type;
end
always @(*) begin
perf_stalls_n = perf_stalls_r;
for (integer i=0; i < `ISSUE_WIDTH; ++i) begin
if (operands_stall[i]) begin
perf_stalls_n[operands_ex_type[i]] += `PERF_CTR_BITS'(1);
always @(*) begin
perf_issue_unit_stalls_per_cycle[i] = '0;
if (operands_if[i].valid && ~operands_if[i].ready) begin
perf_issue_unit_stalls_per_cycle[i][operands_if[i].data.ex_type] = 1;
end
end
end
always @(posedge clk) begin
if (reset) begin
perf_stalls_r <= '0;
end else begin
perf_stalls_r <= perf_stalls_n;
VX_reduce #(
.DATAW_IN (`NUM_EX_UNITS),
.N (`ISSUE_WIDTH),
.OP ("|")
) reduce (
.data_in (perf_issue_unit_stalls_per_cycle),
.data_out (perf_unit_stalls_per_cycle)
);
`BUFFER(perf_unit_stalls_per_cycle_r, perf_unit_stalls_per_cycle);
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
always @(posedge clk) begin
if (reset) begin
perf_stalls_r[i] <= '0;
end else begin
perf_stalls_r[i] <= perf_stalls_r[i] + `PERF_CTR_BITS'(perf_unit_stalls_per_cycle_r[i]);
end
end
end

View File

@@ -70,8 +70,8 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
always @(posedge clk) begin
if (reset) begin
batch_idx <= '0;
end else if (batch_done) begin
batch_idx <= batch_idx + BATCH_COUNT_W'(1);
end else begin
batch_idx <= batch_idx + BATCH_COUNT_W'(batch_done);
end
end
end else begin
@@ -203,20 +203,20 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
assign block_done[block_idx] = ~valid_p || ready_p;
end
wire [ISSUE_IDX_W-1:0] wsi;
wire [ISSUE_ISW_W-1:0] isw;
if (BATCH_COUNT != 1) begin
if (BLOCK_SIZE != 1) begin
assign wsi = {batch_idx, BLOCK_SIZE_W'(block_idx)};
assign isw = {batch_idx, BLOCK_SIZE_W'(block_idx)};
end else begin
assign wsi = batch_idx;
assign isw = batch_idx;
end
end else begin
assign wsi = block_idx;
assign isw = block_idx;
end
`RESET_RELAY(buf_out_reset, reset);
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], wsi);
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw);
VX_elastic_buffer #(
.DATAW (OUT_DATAW),

View File

@@ -30,7 +30,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
localparam NUM_LANES = `NUM_FPU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam TAG_WIDTH = `LOG2UP(`FPU_REQ_QUEUE_SIZE);
localparam TAG_WIDTH = `LOG2UP(`FPUQ_SIZE);
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
VX_execute_if #(
@@ -87,7 +87,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
VX_index_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + PID_WIDTH + 1 + 1),
.SIZE (`FPU_REQ_QUEUE_SIZE)
.SIZE (`FPUQ_SIZE)
) tag_store (
.clk (clk),
.reset (reset),

View File

@@ -37,7 +37,7 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
wire [BLOCK_SIZE-1:0] commit_in_valid;
wire [BLOCK_SIZE-1:0][DATAW-1:0] commit_in_data;
wire [BLOCK_SIZE-1:0] commit_in_ready;
wire [BLOCK_SIZE-1:0][ISSUE_IDX_W-1:0] commit_in_wsi;
wire [BLOCK_SIZE-1:0][ISSUE_ISW_W-1:0] commit_in_isw;
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
assign commit_in_valid[i] = commit_in_if[i].valid;
@@ -45,12 +45,12 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
assign commit_in_if[i].ready = commit_in_ready[i];
if (BLOCK_SIZE != `ISSUE_WIDTH) begin
if (BLOCK_SIZE != 1) begin
assign commit_in_wsi[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_IDX_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)};
assign commit_in_isw[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_ISW_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)};
end else begin
assign commit_in_wsi[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_IDX_W];
assign commit_in_isw[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_ISW_W];
end
end else begin
assign commit_in_wsi[i] = BLOCK_SIZE_W'(i);
assign commit_in_isw[i] = BLOCK_SIZE_W'(i);
end
end
@@ -64,12 +64,12 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
commit_out_data[i] = 'x;
end
for (integer i = 0; i < BLOCK_SIZE; ++i) begin
commit_out_valid[commit_in_wsi[i]] = commit_in_valid[i];
commit_out_data[commit_in_wsi[i]] = commit_in_data[i];
commit_out_valid[commit_in_isw[i]] = commit_in_valid[i];
commit_out_data[commit_in_isw[i]] = commit_in_data[i];
end
end
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
assign commit_in_ready[i] = commit_out_ready[commit_in_wsi[i]];
assign commit_in_ready[i] = commit_out_ready[commit_in_isw[i]];
end
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin

View File

@@ -14,10 +14,10 @@
`include "VX_platform.vh"
module VX_ipdom_stack #(
parameter WIDTH = 1,
parameter DEPTH = 1,
parameter WIDTH = 1,
parameter DEPTH = 1,
parameter OUT_REG = 0,
parameter ADDRW = `LOG2UP(DEPTH)
parameter ADDRW = `LOG2UP(DEPTH)
) (
input wire clk,
input wire reset,

View File

@@ -59,6 +59,11 @@ module VX_issue #(
) scoreboard (
.clk (clk),
.reset (scoreboard_reset),
`ifdef PERF_ENABLE
.perf_scb_stalls(perf_issue_if.scb_stalls),
.perf_units_uses(perf_issue_if.units_uses),
.perf_sfu_uses (perf_issue_if.sfu_uses),
`endif
.writeback_if (writeback_if),
.ibuffer_if (ibuffer_if),
.scoreboard_if (scoreboard_if)
@@ -80,7 +85,7 @@ module VX_issue #(
.clk (clk),
.reset (dispatch_reset),
`ifdef PERF_ENABLE
.perf_stalls (perf_issue_if.dsp_stalls),
`UNUSED_PIN (perf_stalls),
`endif
.operands_if (operands_if),
.alu_dispatch_if(alu_dispatch_if),
@@ -152,29 +157,18 @@ module VX_issue #(
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
reg [`PERF_CTR_BITS-1:0] perf_scb_stalls;
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_stalls_per_cycle;
reg [`ISSUE_WIDTH-1:0] scoreboard_stalls;
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
assign scoreboard_stalls[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
end
`POP_COUNT(scoreboard_stalls_per_cycle, scoreboard_stalls);
wire decode_stall = decode_if.valid && ~decode_if.ready;
always @(posedge clk) begin
if (reset) begin
perf_ibf_stalls <= '0;
perf_scb_stalls <= '0;
end else begin
if (decode_if.valid && ~decode_if.ready) begin
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(1);
end
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(scoreboard_stalls_per_cycle);
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(decode_stall);
end
end
assign perf_issue_if.ibf_stalls = perf_ibf_stalls;
assign perf_issue_if.scb_stalls = perf_scb_stalls;
`endif
endmodule

View File

@@ -96,7 +96,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
// detect duplicate addresses
wire lsu_is_dup;
`ifdef LSU_DUP
`ifdef LSU_DUP_ENABLE
if (NUM_LANES > 1) begin
wire [NUM_LANES-2:0] addr_matches;
for (genvar i = 0; i < (NUM_LANES-1); ++i) begin
@@ -304,7 +304,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
assign mem_req_tag = {
execute_if[0].data.uuid, lsu_addr_type, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.rd, execute_if[0].data.op_type, req_align, execute_if[0].data.pid, pkt_waddr
`ifdef LSU_DUP
`ifdef LSU_DUP_ENABLE
, lsu_is_dup
`endif
};
@@ -448,13 +448,13 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
wire [PID_WIDTH-1:0] rsp_pid;
wire rsp_is_dup;
`ifndef LSU_DUP
`ifndef LSU_DUP_ENABLE
assign rsp_is_dup = 0;
`endif
assign {
rsp_uuid, rsp_addr_type, rsp_wid, rsp_tmask_uq, rsp_pc, rsp_rd, rsp_op_type, rsp_align, rsp_pid, pkt_raddr
`ifdef LSU_DUP
`ifdef LSU_DUP_ENABLE
, rsp_is_dup
`endif
} = mem_rsp_tag;
@@ -554,7 +554,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (RSP_ARB_DATAW),
.OUT_REG (1)
.OUT_REG (3)
) rsp_arb (
.clk (clk),
.reset (commit_reset),

View File

@@ -220,8 +220,13 @@ module VX_muldiv_unit #(
wire [NUM_LANES-1:0][`XLEN-1:0] div_in2;
for (genvar i = 0; i < NUM_LANES; ++i) begin
`ifdef XLEN_64
assign div_in1[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]}: execute_if.data.rs1_data[i];
assign div_in2[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]}: execute_if.data.rs2_data[i];
`else
assign div_in1[i] = execute_if.data.rs1_data[i];
assign div_in2[i] = execute_if.data.rs2_data[i];
`endif
end
`ifdef IDIV_DPI

View File

@@ -26,6 +26,7 @@ module VX_operands import VX_gpu_pkg::*; #(
);
`UNUSED_PARAM (CORE_ID)
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS;
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO);
localparam STATE_IDLE = 2'd0;
localparam STATE_FETCH1 = 2'd1;
@@ -38,14 +39,19 @@ module VX_operands import VX_gpu_pkg::*; #(
reg [`NR_BITS-1:0] gpr_rd_rid, gpr_rd_rid_n;
reg [ISSUE_WIS_W-1:0] gpr_rd_wis, gpr_rd_wis_n;
reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0][`XLEN-1:0] cache_data, cache_data_n;
reg [ISSUE_RATIO-1:0][`NR_BITS-1:0] cache_reg, cache_reg_n;
reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0] cache_tmask, cache_tmask_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data [ISSUE_RATIO-1:0];
reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data_n [ISSUE_RATIO-1:0];
reg [`NR_BITS-1:0] cache_reg [ISSUE_RATIO-1:0];
reg [`NR_BITS-1:0] cache_reg_n [ISSUE_RATIO-1:0];
reg [`NUM_THREADS-1:0] cache_tmask [ISSUE_RATIO-1:0];
reg [`NUM_THREADS-1:0] cache_tmask_n [ISSUE_RATIO-1:0];
reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n;
reg valid_out_r;
reg [DATAW-1:0] data_out_r;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data, rs2_data_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n;
reg [STATE_BITS-1:0] state, state_n;
reg [`NR_BITS-1:0] rs2, rs2_n;
@@ -54,11 +60,11 @@ module VX_operands import VX_gpu_pkg::*; #(
reg rs3_ready, rs3_ready_n;
reg data_ready, data_ready_n;
wire ready_out = operands_if[i].ready;
wire is_rs1_zero = (scoreboard_if[i].data.rs1 == 0);
wire is_rs2_zero = (scoreboard_if[i].data.rs2 == 0);
wire is_rs3_zero = (scoreboard_if[i].data.rs3 == 0);
VX_operands_if staging_if();
wire is_rs3_zero = (scoreboard_if[i].data.rs3 == 0);
always @(*) begin
state_n = state;
@@ -79,7 +85,7 @@ module VX_operands import VX_gpu_pkg::*; #(
case (state)
STATE_IDLE: begin
if (staging_if.valid && staging_if.ready) begin
if (valid_out_r && ready_out) begin
data_ready_n = 0;
end
if (scoreboard_if[i].valid && data_ready_n == 0) begin
@@ -160,44 +166,93 @@ module VX_operands import VX_gpu_pkg::*; #(
end
cache_reg_n[writeback_if[i].data.wis] = writeback_if[i].data.rd;
cache_eop_n[writeback_if[i].data.wis] = writeback_if[i].data.eop;
if (writeback_if[i].data.sop) begin
cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.tmask;
end else begin
cache_tmask_n[writeback_if[i].data.wis] |= writeback_if[i].data.tmask;
end
cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.sop ? writeback_if[i].data.tmask :
(cache_tmask_n[writeback_if[i].data.wis] | writeback_if[i].data.tmask);
end
end
end
always @(posedge clk) begin
if (reset) begin
if (reset) begin
state <= STATE_IDLE;
gpr_rd_rid <= '0;
gpr_rd_wis <= '0;
cache_eop <= {ISSUE_RATIO{1'b1}};
cache_reg <= '0;
data_ready <= 0;
valid_out_r <= 0;
end else begin
state <= state_n;
rs2 <= rs2_n;
rs3 <= rs3_n;
rs2_ready <= rs2_ready_n;
rs3_ready <= rs3_ready_n;
rs1_data <= rs1_data_n;
rs2_data <= rs2_data_n;
rs3_data <= rs3_data_n;
gpr_rd_rid <= gpr_rd_rid_n;
gpr_rd_wis <= gpr_rd_wis_n;
cache_data <= cache_data_n;
cache_reg <= cache_reg_n;
cache_tmask <= cache_tmask_n;
cache_eop <= cache_eop_n;
data_ready <= data_ready_n;
data_ready <= data_ready_n;
if (~valid_out_r) begin
valid_out_r <= scoreboard_if[i].valid && data_ready;
end else if (ready_out) begin
valid_out_r <= 0;
end
end
end
if (~valid_out_r) begin
data_out_r <= {scoreboard_if[i].data.uuid,
scoreboard_if[i].data.wis,
scoreboard_if[i].data.tmask,
scoreboard_if[i].data.PC,
scoreboard_if[i].data.wb,
scoreboard_if[i].data.ex_type,
scoreboard_if[i].data.op_type,
scoreboard_if[i].data.op_mod,
scoreboard_if[i].data.use_PC,
scoreboard_if[i].data.use_imm,
scoreboard_if[i].data.imm,
scoreboard_if[i].data.rd};
end
gpr_rd_rid <= gpr_rd_rid_n;
gpr_rd_wis <= gpr_rd_wis_n;
rs2_ready <= rs2_ready_n;
rs3_ready <= rs3_ready_n;
rs2 <= rs2_n;
rs3 <= rs3_n;
rs1_data <= rs1_data_n;
rs2_data <= rs2_data_n;
rs3_data <= rs3_data_n;
cache_data <= cache_data_n;
cache_reg <= cache_reg_n;
cache_tmask <= cache_tmask_n;
end
assign operands_if[i].valid = valid_out_r;
assign {operands_if[i].data.uuid,
operands_if[i].data.wis,
operands_if[i].data.tmask,
operands_if[i].data.PC,
operands_if[i].data.wb,
operands_if[i].data.ex_type,
operands_if[i].data.op_type,
operands_if[i].data.op_mod,
operands_if[i].data.use_PC,
operands_if[i].data.use_imm,
operands_if[i].data.imm,
operands_if[i].data.rd} = data_out_r;
assign operands_if[i].data.rs1_data = rs1_data;
assign operands_if[i].data.rs2_data = rs2_data;
assign operands_if[i].data.rs3_data = rs3_data;
assign scoreboard_if[i].ready = ~valid_out_r && data_ready;
// GPR banks
reg [RAM_ADDRW-1:0] gpr_rd_addr;
wire [RAM_ADDRW-1:0] gpr_wr_addr;
if (ISSUE_WIS != 0) begin
assign gpr_wr_addr = {writeback_if[i].data.wis, writeback_if[i].data.rd};
always @(posedge clk) begin
gpr_rd_addr <= {gpr_rd_wis_n, gpr_rd_rid_n};
end
end else begin
assign gpr_wr_addr = writeback_if[i].data.rd;
always @(posedge clk) begin
gpr_rd_addr <= gpr_rd_rid_n;
end
end
`ifdef GPR_RESET
reg wr_enabled = 0;
always @(posedge clk) begin
@@ -205,10 +260,8 @@ module VX_operands import VX_gpu_pkg::*; #(
wr_enabled <= 1;
end
end
`else
wire wr_enabled = 1;
`endif
for (genvar j = 0; j < `NUM_THREADS; ++j) begin
VX_dp_ram #(
.DATAW (`XLEN),
@@ -222,81 +275,17 @@ module VX_operands import VX_gpu_pkg::*; #(
.clk (clk),
.read (1'b1),
`UNUSED_PIN (wren),
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
.waddr (wis_to_addr(writeback_if[i].data.rd, writeback_if[i].data.wis)),
`ifdef GPR_RESET
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
`else
.write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
`endif
.waddr (gpr_wr_addr),
.wdata (writeback_if[i].data.data[j]),
.raddr (wis_to_addr(gpr_rd_rid, gpr_rd_wis)),
.raddr (gpr_rd_addr),
.rdata (gpr_rd_data[j])
);
end
// staging buffer
`RESET_RELAY (stg_buf_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW)
) stg_buf (
.clk (clk),
.reset (stg_buf_reset),
.valid_in (scoreboard_if[i].valid),
.ready_in (scoreboard_if[i].ready),
.data_in ({
scoreboard_if[i].data.uuid,
scoreboard_if[i].data.wis,
scoreboard_if[i].data.tmask,
scoreboard_if[i].data.PC,
scoreboard_if[i].data.wb,
scoreboard_if[i].data.ex_type,
scoreboard_if[i].data.op_type,
scoreboard_if[i].data.op_mod,
scoreboard_if[i].data.use_PC,
scoreboard_if[i].data.use_imm,
scoreboard_if[i].data.imm,
scoreboard_if[i].data.rd}),
.data_out ({
staging_if.data.uuid,
staging_if.data.wis,
staging_if.data.tmask,
staging_if.data.PC,
staging_if.data.wb,
staging_if.data.ex_type,
staging_if.data.op_type,
staging_if.data.op_mod,
staging_if.data.use_PC,
staging_if.data.use_imm,
staging_if.data.imm,
staging_if.data.rd}),
.valid_out (staging_if.valid),
.ready_out (staging_if.ready)
);
assign staging_if.data.rs1_data = rs1_data;
assign staging_if.data.rs2_data = rs2_data;
assign staging_if.data.rs3_data = rs3_data;
// output buffer
wire valid_stg, ready_stg;
assign valid_stg = staging_if.valid && data_ready;
assign staging_if.ready = ready_stg && data_ready;
`RESET_RELAY (out_buf_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW + (3 * `NUM_THREADS * `XLEN)),
.SIZE (2),
.OUT_REG (2)
) out_buf (
.clk (clk),
.reset (out_buf_reset),
.valid_in (valid_stg),
.ready_in (ready_stg),
.data_in (staging_if.data),
.data_out (operands_if[i].data),
.valid_out (operands_if[i].valid),
.ready_out (operands_if[i].ready)
);
end
end
endmodule

View File

@@ -19,6 +19,10 @@ module VX_schedule import VX_gpu_pkg::*; #(
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
VX_pipeline_perf_if.schedule perf_schedule_if,
`endif
// configuration
input base_dcrs_t base_dcrs,
@@ -304,13 +308,20 @@ module VX_schedule import VX_gpu_pkg::*; #(
localparam GNW_WIDTH = `LOG2UP(`NUM_CLUSTERS * `NUM_CORES * `NUM_WARPS);
reg [`UUID_WIDTH-1:0] instr_uuid;
wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(schedule_wid);
`ifdef SV_DPI
always @(posedge clk) begin
if (reset) begin
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(1, 0, 0));
end else if (schedule_fire) begin
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(0, 32'(g_wid), 64'(schedule_pc)));
end
end
end
`else
wire [GNW_WIDTH+16-1:0] w_uuid = {g_wid, 16'(schedule_pc)};
always @(*) begin
instr_uuid = `UUID_WIDTH'(w_uuid);
end
`endif
`else
wire [`UUID_WIDTH-1:0] instr_uuid = '0;
`endif
@@ -349,7 +360,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
.empty (no_pending_instr)
);
`BUFFER_BUSY (busy, (active_warps != 0 || ~no_pending_instr), 1);
`BUFFER_EX(busy, (active_warps != 0 || ~no_pending_instr), 1'b1, 1);
// export CSRs
assign sched_csr_if.cycles = cycles;
@@ -376,4 +387,25 @@ module VX_schedule import VX_gpu_pkg::*; #(
end
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps));
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_sched_idles;
reg [`PERF_CTR_BITS-1:0] perf_sched_stalls;
wire schedule_idle = ~schedule_valid;
wire schedule_stall = schedule_if.valid && ~schedule_if.ready;
always @(posedge clk) begin
if (reset) begin
perf_sched_idles <= '0;
perf_sched_stalls <= '0;
end else begin
perf_sched_idles <= perf_sched_idles + `PERF_CTR_BITS'(schedule_idle);
perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(schedule_stall);
end
end
assign perf_schedule_if.sched_idles = perf_sched_idles;
assign perf_schedule_if.sched_stalls = perf_sched_stalls;
`endif
endmodule

View File

@@ -19,6 +19,12 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls,
output reg [`PERF_CTR_BITS-1:0] perf_units_uses [`NUM_EX_UNITS],
output reg [`PERF_CTR_BITS-1:0] perf_sfu_uses [`NUM_SFU_UNITS],
`endif
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_ibuffer_if.slave ibuffer_if [`ISSUE_WIDTH],
VX_ibuffer_if.master scoreboard_if [`ISSUE_WIDTH]
@@ -26,114 +32,201 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`UNUSED_PARAM (CORE_ID)
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1;
`ifdef PERF_ENABLE
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_units_per_cycle;
wire [`NUM_EX_UNITS-1:0] perf_units_per_cycle, perf_units_per_cycle_r;
reg [`ISSUE_WIDTH-1:0][`NUM_SFU_UNITS-1:0] perf_issue_sfu_per_cycle;
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
wire [`ISSUE_WIDTH-1:0] perf_issue_stalls_per_cycle;
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle, perf_stalls_per_cycle_r;
`POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle);
VX_reduce #(
.DATAW_IN (`NUM_EX_UNITS),
.N (`ISSUE_WIDTH),
.OP ("|")
) perf_units_reduce (
.data_in (perf_issue_units_per_cycle),
.data_out (perf_units_per_cycle)
);
VX_reduce #(
.DATAW_IN (`NUM_SFU_UNITS),
.N (`ISSUE_WIDTH),
.OP ("|")
) perf_sfu_reduce (
.data_in (perf_issue_sfu_per_cycle),
.data_out (perf_sfu_per_cycle)
);
`BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle);
`BUFFER(perf_units_per_cycle_r, perf_units_per_cycle);
`BUFFER(perf_sfu_per_cycle_r, perf_sfu_per_cycle);
always @(posedge clk) begin
if (reset) begin
perf_scb_stalls <= '0;
end else begin
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r);
end
end
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
always @(posedge clk) begin
if (reset) begin
perf_units_uses[i] <= '0;
end else begin
perf_units_uses[i] <= perf_units_uses[i] + `PERF_CTR_BITS'(perf_units_per_cycle_r[i]);
end
end
end
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
always @(posedge clk) begin
if (reset) begin
perf_sfu_uses[i] <= '0;
end else begin
perf_sfu_uses[i] <= perf_sfu_uses[i] + `PERF_CTR_BITS'(perf_sfu_per_cycle_r[i]);
end
end
end
`endif
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n;
reg [3:0] ready_masks, ready_masks_n;
VX_ibuffer_if staging_if();
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs;
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
wire inuse_rd = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd];
wire inuse_rs1 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1];
wire inuse_rs2 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2];
wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3];
`ifdef PERF_ENABLE
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
reg [`SFU_WIDTH-1:0] sfu_type;
always @(*) begin
inuse_regs_n = inuse_regs;
ready_masks_n = ready_masks;
if (writeback_fire) begin
inuse_regs_n[writeback_if[i].data.wis][writeback_if[i].data.rd] = 0;
ready_masks_n |= {4{(ISSUE_RATIO == 0) || writeback_if[i].data.wis == staging_if.data.wis}}
& {(writeback_if[i].data.rd == staging_if.data.rd),
(writeback_if[i].data.rd == staging_if.data.rs1),
(writeback_if[i].data.rd == staging_if.data.rs2),
(writeback_if[i].data.rd == staging_if.data.rs3)};
end
if (staging_if.valid && staging_if.ready && staging_if.data.wb) begin
inuse_regs_n[staging_if.data.wis][staging_if.data.rd] = 1;
ready_masks_n = '0;
case (scoreboard_if[i].data.op_type)
`INST_SFU_CSRRW,
`INST_SFU_CSRRS,
`INST_SFU_CSRRC: sfu_type = `SFU_CSRS;
default: sfu_type = `SFU_WCTL;
endcase
end
always @(*) begin
perf_issue_units_per_cycle[i] = '0;
perf_issue_sfu_per_cycle[i] = '0;
if (ibuffer_if[i].valid) begin
if (inuse_rd) begin
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
end
end
if (inuse_rs1) begin
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
end
end
if (inuse_rs2) begin
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
end
end
if (inuse_rs3) begin
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
end
end
end
if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin
ready_masks_n = ~{inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd],
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1],
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2],
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]};
end
end
end
assign perf_issue_stalls_per_cycle[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
`endif
reg [DATAW-1:0] data_out_r;
reg valid_out_r;
wire ready_out;
wire [3:0] ready_masks = ~{inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
wire deps_ready = (& ready_masks);
wire valid_in = ibuffer_if[i].valid && deps_ready;
wire ready_in = ~valid_out_r && deps_ready;
wire [DATAW-1:0] data_in = ibuffer_if[i].data;
assign ready_out = scoreboard_if[i].ready;
always @(posedge clk) begin
if (reset) begin
inuse_regs <= '0;
ready_masks <= '0;
end else begin
inuse_regs <= inuse_regs_n;
ready_masks <= ready_masks_n;
valid_out_r <= 0;
inuse_regs <= '0;
end else begin
if (writeback_fire) begin
inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0;
end
if (~valid_out_r) begin
valid_out_r <= valid_in;
end else if (ready_out) begin
if (scoreboard_if[i].data.wb) begin
inuse_regs[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= 1;
`ifdef PERF_ENABLE
inuse_units[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= scoreboard_if[i].data.ex_type;
if (scoreboard_if[i].data.ex_type == `EX_SFU) begin
inuse_sfu[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= sfu_type;
end
`endif
end
valid_out_r <= 0;
end
end
if (~valid_out_r) begin
data_out_r <= data_in;
end
end
// staging buffer
assign ibuffer_if[i].ready = ready_in;
assign scoreboard_if[i].valid = valid_out_r;
assign scoreboard_if[i].data = data_out_r;
`RESET_RELAY (stg_buf_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW)
) stg_buf (
.clk (clk),
.reset (stg_buf_reset),
.valid_in (ibuffer_if[i].valid),
.ready_in (ibuffer_if[i].ready),
.data_in (ibuffer_if[i].data),
.data_out (staging_if.data),
.valid_out (staging_if.valid),
.ready_out (staging_if.ready)
);
`ifdef SIMULATION
reg [31:0] timeout_ctr;
// output buffer
wire valid_stg, ready_stg;
wire regs_ready = (& ready_masks);
assign valid_stg = staging_if.valid && regs_ready;
assign staging_if.ready = ready_stg && regs_ready;
`RESET_RELAY (out_buf_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (2)
) out_buf (
.clk (clk),
.reset (out_buf_reset),
.valid_in (valid_stg),
.ready_in (ready_stg),
.data_in (staging_if.data),
.data_out (scoreboard_if[i].data),
.valid_out (scoreboard_if[i].valid),
.ready_out (scoreboard_if[i].ready)
);
reg [31:0] timeout_ctr;
always @(posedge clk) begin
if (reset) begin
timeout_ctr <= '0;
end else begin
if (staging_if.valid && ~regs_ready) begin
if (ibuffer_if[i].valid && ~ibuffer_if[i].ready) begin
`ifdef DBG_TRACE_CORE_PIPELINE
`TRACE(3, ("%d: *** core%0d-scoreboard-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
$time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr,
~ready_masks, staging_if.data.uuid));
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
~ready_masks, ibuffer_if[i].data.uuid));
`endif
timeout_ctr <= timeout_ctr + 1;
end else if (staging_if.valid && staging_if.ready) begin
end else if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin
timeout_ctr <= '0;
end
end
end
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
$time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr,
~ready_masks, staging_if.data.uuid));
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
~ready_masks, ibuffer_if[i].data.uuid));
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0,
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
$time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.PC, writeback_if[i].data.tmask, writeback_if[i].data.rd, writeback_if[i].data.uuid));
end
`endif
end
endmodule

View File

@@ -48,7 +48,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `XLEN + PID_WIDTH + 1 + 1;
localparam RSP_ARB_SIZE = 1 + 1;
localparam RSP_ARB_IDX_WCTL = 0;
localparam RSP_ARB_IDX_CSR = 1;
localparam RSP_ARB_IDX_CSRS = 1;
VX_execute_if #(
.NUM_LANES (NUM_LANES)
@@ -71,9 +71,6 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in;
wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in;
`ifdef PERF_ENABLE
VX_sfu_perf_if sfu_perf_if();
`endif
// Warp control block
VX_execute_if #(
@@ -129,7 +126,6 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
.pipeline_perf_if(pipeline_perf_if),
.sfu_perf_if (sfu_perf_if),
`endif
`ifdef EXT_F_ENABLE
@@ -141,21 +137,21 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.commit_if (csr_commit_if)
);
assign rsp_arb_valid_in[RSP_ARB_IDX_CSR] = csr_commit_if.valid;
assign rsp_arb_data_in[RSP_ARB_IDX_CSR] = csr_commit_if.data;
assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSR];
assign rsp_arb_valid_in[RSP_ARB_IDX_CSRS] = csr_commit_if.valid;
assign rsp_arb_data_in[RSP_ARB_IDX_CSRS] = csr_commit_if.data;
assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSRS];
// can accept new request?
reg sfu_req_ready;
always @(*) begin
case (execute_if[0].data.op_type)
`INST_SFU_CSRRW,
`INST_SFU_CSRRS,
`INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready;
`INST_SFU_CSRRW,
`INST_SFU_CSRRS,
`INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready;
default: sfu_req_ready = wctl_execute_if.ready;
endcase
end
end
assign execute_if[0].ready = sfu_req_ready;
// response arbitration
@@ -170,7 +166,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.NUM_INPUTS (RSP_ARB_SIZE),
.DATAW (RSP_ARB_DATAW),
.ARBITER ("R"),
.OUT_REG (1)
.OUT_REG (3)
) rsp_arb (
.clk (clk),
.reset (commit_reset),
@@ -186,7 +182,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
VX_gather_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_REG (3)
.OUT_REG (1)
) gather_unit (
.clk (clk),
.reset (commit_reset),
@@ -194,16 +190,4 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.commit_out_if (commit_if)
);
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_wctl_stalls;
always @(posedge clk) begin
if (reset) begin
perf_wctl_stalls <= '0;
end else begin
perf_wctl_stalls <= perf_wctl_stalls + `PERF_CTR_BITS'(wctl_execute_if.valid && ~wctl_execute_if.ready);
end
end
assign sfu_perf_if.wctl_stalls = perf_wctl_stalls;
`endif
endmodule

View File

@@ -14,9 +14,7 @@
`ifndef VX_TRACE_VH
`define VX_TRACE_VH
`ifndef SYNTHESIS
`include "VX_define.vh"
`ifdef SIMULATION
task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type);
case (ex_type)

View File

@@ -29,7 +29,6 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
);
`UNUSED_PARAM (CORE_ID)
localparam LANE_BITS = `CLOG2(NUM_LANES);
localparam LANE_WIDTH = `UP(LANE_BITS);
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam WCTL_WIDTH = $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t);
@@ -50,7 +49,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
wire is_join = (execute_if.data.op_type == `INST_SFU_JOIN);
wire is_bar = (execute_if.data.op_type == `INST_SFU_BAR);
wire [LANE_WIDTH-1:0] tid;
wire [`UP(LANE_BITS)-1:0] tid;
if (LANE_BITS != 0) begin
assign tid = execute_if.data.tid[0 +: LANE_BITS];
end else begin

View File

@@ -52,30 +52,24 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
localparam MAN_BITS = 23;
localparam EXP_BITS = 8;
localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
localparam logic [EXP_BITS-1:0] QNAN_EXPONENT = 2**EXP_BITS-1;
localparam logic [MAN_BITS-1:0] QNAN_MANTISSA = 2**(MAN_BITS-1);
localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
// Use 32-bit integer
localparam MAX_INT_WIDTH = 32;
localparam INT_WIDTH = 32;
// The internal mantissa includes normal bit or an entire integer
localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, MAX_INT_WIDTH);
localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, INT_WIDTH);
// The lower 2p+3 bits of the internal FMA result will be needed for leading-zero detection
localparam LZC_RESULT_WIDTH = `CLOG2(INT_MAN_WIDTH);
// The internal exponent must be able to represent the smallest denormal input value as signed
// or the number of bits in an integer
localparam INT_EXP_WIDTH = `MAX(`CLOG2(MAX_INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1;
// shift amount for denormalization
localparam SHAMT_BITS = `CLOG2(INT_MAN_WIDTH+1);
localparam INT_EXP_WIDTH = `MAX(`CLOG2(INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1;
localparam FMT_SHIFT_COMPENSATION = INT_MAN_WIDTH - 1 - MAN_BITS;
localparam NUM_FP_STICKY = 2 * INT_MAN_WIDTH - MAN_BITS - 1; // removed mantissa, 1. and R
localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - MAX_INT_WIDTH; // removed int and R
localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - INT_WIDTH; // removed int and R
// Input processing
@@ -86,8 +80,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
.EXP_BITS (EXP_BITS),
.MAN_BITS (MAN_BITS)
) fp_class (
.exp_i (dataa[i][30:23]),
.man_i (dataa[i][22:0]),
.exp_i (dataa[i][INT_WIDTH-2:MAN_BITS]),
.man_i (dataa[i][MAN_BITS-1:0]),
.clss_o (fclass[i])
);
end
@@ -97,27 +91,25 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
wire [NUM_LANES-1:0] input_sign;
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [INT_MAN_WIDTH-1:0] int_mantissa;
wire [INT_MAN_WIDTH-1:0] fmt_mantissa;
wire fmt_sign = dataa[i][31];
wire int_sign = dataa[i][31] && is_signed;
assign int_mantissa = int_sign ? (-dataa[i]) : dataa[i];
assign fmt_mantissa = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]});
wire i2f_sign = dataa[i][INT_WIDTH-1];
wire f2i_sign = dataa[i][INT_WIDTH-1] && is_signed;
wire [INT_MAN_WIDTH-1:0] f2i_mantissa = f2i_sign ? (-dataa[i]) : dataa[i];
wire [INT_MAN_WIDTH-1:0] i2f_mantissa = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]});
assign input_exp[i] = {1'b0, dataa[i][MAN_BITS +: EXP_BITS]} + INT_EXP_WIDTH'({1'b0, fclass[i].is_subnormal});
assign input_mant[i] = is_itof ? int_mantissa : fmt_mantissa;
assign input_sign[i] = is_itof ? int_sign : fmt_sign;
assign input_mant[i] = is_itof ? f2i_mantissa : i2f_mantissa;
assign input_sign[i] = is_itof ? f2i_sign : i2f_sign;
end
// Pipeline stage0
wire valid_in_s0;
wire [NUM_LANES-1:0] lane_mask_s0;
wire [TAGW-1:0] tag_in_s0;
wire is_itof_s0;
wire unsigned_s0;
wire [2:0] rnd_mode_s0;
wire valid_in_s0;
wire [NUM_LANES-1:0] lane_mask_s0;
wire [TAGW-1:0] tag_in_s0;
wire is_itof_s0;
wire is_signed_s0;
wire [2:0] rnd_mode_s0;
fclass_t [NUM_LANES-1:0] fclass_s0;
wire [NUM_LANES-1:0] input_sign_s0;
wire [NUM_LANES-1:0] input_sign_s0;
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent_s0;
wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant_s0;
@@ -130,8 +122,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
.clk (clk),
.reset (reset),
.enable (~stall),
.data_in ({valid_in, lane_mask, tag_in, is_itof, !is_signed, frm, fclass, input_sign, input_exp, input_mant}),
.data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
.data_in ({valid_in, lane_mask, tag_in, is_itof, is_signed, frm, fclass, input_sign, input_exp, input_mant}),
.data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
);
// Normalization
@@ -159,22 +151,22 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
assign input_mant_n_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i];
// Unbias exponent and compensate for shift
wire [INT_EXP_WIDTH-1:0] fp_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
wire [INT_EXP_WIDTH-1:0] int_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
assign input_exp_n_s0[i] = is_itof_s0 ? int_input_exp_s0 : fp_input_exp_s0;
wire [INT_EXP_WIDTH-1:0] i2f_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
wire [INT_EXP_WIDTH-1:0] f2i_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
assign input_exp_n_s0[i] = is_itof_s0 ? f2i_input_exp_s0 : i2f_input_exp_s0;
end
// Pipeline stage1
wire valid_in_s1;
wire [NUM_LANES-1:0] lane_mask_s1;
wire [TAGW-1:0] tag_in_s1;
wire is_itof_s1;
wire unsigned_s1;
wire [2:0] rnd_mode_s1;
wire valid_in_s1;
wire [NUM_LANES-1:0] lane_mask_s1;
wire [TAGW-1:0] tag_in_s1;
wire is_itof_s1;
wire is_signed_s1;
wire [2:0] rnd_mode_s1;
fclass_t [NUM_LANES-1:0] fclass_s1;
wire [NUM_LANES-1:0] input_sign_s1;
wire [NUM_LANES-1:0] mant_is_zero_s1;
wire [NUM_LANES-1:0] input_sign_s1;
wire [NUM_LANES-1:0] mant_is_zero_s1;
wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_s1;
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_s1;
@@ -185,76 +177,49 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
.clk (clk),
.reset (reset),
.enable (~stall),
.data_in ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, mant_is_zero_s0, input_mant_n_s0, input_exp_n_s0}),
.data_out ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
.data_in ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, mant_is_zero_s0, input_mant_n_s0, input_exp_n_s0}),
.data_out ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
);
// Perform adjustments to mantissa and exponent
wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s1;
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s1;
wire [NUM_LANES-1:0] of_before_round_s1;
wire [NUM_LANES-1:0] of_before_round_s1;
for (genvar i = 0; i < NUM_LANES; ++i) begin
reg [2*INT_MAN_WIDTH:0] preshift_mant_s1; // mantissa before final shift
reg [SHAMT_BITS-1:0] denorm_shamt_s1; // shift amount for denormalization
reg [INT_EXP_WIDTH-1:0] final_exp_tmp_s1; // after eventual adjustments
reg of_before_round_tmp_s1;
wire [INT_EXP_WIDTH-1:0] denorm_shamt = INT_EXP_WIDTH'(INT_WIDTH-1) - input_exp_s1[i];
wire overflow = ($signed(denorm_shamt) <= -$signed(INT_EXP_WIDTH'(!is_signed_s1)));
wire underflow = ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1)));
reg [INT_EXP_WIDTH-1:0] denorm_shamt_q;
always @(*) begin
final_exp_tmp_s1 = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS); // take exponent as is, only look at lower bits
preshift_mant_s1 = {input_mant_s1[i], 33'b0};
denorm_shamt_s1 = '0;
of_before_round_tmp_s1 = 1'b0;
if (is_itof_s1) begin
if ($signed(input_exp_s1[i]) >= INT_EXP_WIDTH'($signed(2**EXP_BITS-1-EXP_BIAS))) begin
// Overflow or infinities (for proper rounding)
final_exp_tmp_s1 = (2**EXP_BITS-2); // largest normal value
preshift_mant_s1 = ~0; // largest normal value and RS bits set
of_before_round_tmp_s1 = 1'b1;
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-MAN_BITS-EXP_BIAS))) begin
// Limit the shift to retain sticky bits
final_exp_tmp_s1 = '0; // denormal result
denorm_shamt_s1 = (2 + MAN_BITS); // to sticky
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(1-EXP_BIAS))) begin
// Denormalize underflowing values
final_exp_tmp_s1 = '0; // denormal result
denorm_shamt_s1 = SHAMT_BITS'(1-EXP_BIAS) - SHAMT_BITS'(input_exp_s1[i]); // adjust right shifting
end
if (overflow) begin
denorm_shamt_q = '0;
end else if (underflow) begin
denorm_shamt_q = INT_WIDTH+1;
end else begin
if ($signed(input_exp_s1[i]) >= $signed(INT_EXP_WIDTH'(MAX_INT_WIDTH-1) + INT_EXP_WIDTH'(unsigned_s1))) begin
// overflow: when converting to unsigned the range is larger by one
of_before_round_tmp_s1 = 1'b1;
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1))) begin
// underflow
denorm_shamt_s1 = MAX_INT_WIDTH+1; // all bits go to the sticky
end else begin
// By default right shift mantissa to be an integer
denorm_shamt_s1 = SHAMT_BITS'(MAX_INT_WIDTH-1) - SHAMT_BITS'(input_exp_s1[i]);
end
denorm_shamt_q = denorm_shamt;
end
end
assign destination_mant_s1[i] = preshift_mant_s1 >> denorm_shamt_s1;
assign final_exp_s1[i] = final_exp_tmp_s1;
assign of_before_round_s1[i] = of_before_round_tmp_s1;
assign destination_mant_s1[i] = is_itof_s1 ? {input_mant_s1[i], 33'b0} : ({input_mant_s1[i], 33'b0} >> denorm_shamt_q);
assign final_exp_s1[i] = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS);
assign of_before_round_s1[i] = overflow;
end
// Pipeline stage2
wire valid_in_s2;
wire [NUM_LANES-1:0] lane_mask_s2;
wire [TAGW-1:0] tag_in_s2;
wire is_itof_s2;
wire unsigned_s2;
wire [2:0] rnd_mode_s2;
wire valid_in_s2;
wire [NUM_LANES-1:0] lane_mask_s2;
wire [TAGW-1:0] tag_in_s2;
wire is_itof_s2;
wire is_signed_s2;
wire [2:0] rnd_mode_s2;
fclass_t [NUM_LANES-1:0] fclass_s2;
wire [NUM_LANES-1:0] mant_is_zero_s2;
wire [NUM_LANES-1:0] input_sign_s2;
wire [NUM_LANES-1:0] mant_is_zero_s2;
wire [NUM_LANES-1:0] input_sign_s2;
wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s2;
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s2;
wire [NUM_LANES-1:0] of_before_round_s2;
wire [NUM_LANES-1:0] of_before_round_s2;
VX_pipe_register #(
.DATAW (1 + NUM_LANES + TAGW + 1 + 1 + `INST_FRM_BITS + NUM_LANES * ($bits(fclass_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)),
@@ -263,37 +228,37 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
.clk (clk),
.reset (reset),
.enable (~stall),
.data_in ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
.data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
.data_in ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
.data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, is_signed_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
);
wire [NUM_LANES-1:0] rounded_sign_s2;
wire [NUM_LANES-1:0][31:0] rounded_abs_s2; // absolute value of result after rounding
wire [NUM_LANES-1:0] int_round_has_sticky_s2;
wire [NUM_LANES-1:0] fp_round_has_sticky_s2;
wire [NUM_LANES-1:0] rounded_sign_s2;
wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s2; // absolute value of result after rounding
wire [NUM_LANES-1:0] f2i_round_has_sticky_s2;
wire [NUM_LANES-1:0] i2f_round_has_sticky_s2;
// Rouding and classification
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments
wire [MAX_INT_WIDTH-1:0] final_int_s2; // integer shifted in position
wire [1:0] round_sticky_bits_s2;
wire [31:0] fmt_pre_round_abs_s2;
wire [31:0] pre_round_abs_s2;
wire [1:0] int_round_sticky_bits_s2, fp_round_sticky_bits_s2;
wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments
wire [INT_WIDTH-1:0] final_int_s2; // integer shifted in position
wire [1:0] round_sticky_bits_s2;
wire [INT_WIDTH-1:0] fmt_pre_round_abs_s2;
wire [INT_WIDTH-1:0] pre_round_abs_s2;
wire [1:0] f2i_round_sticky_bits_s2, i2f_round_sticky_bits_s2;
// Extract final mantissa and round bit, discard the normal bit (for FP)
assign {final_mant_s2, fp_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
assign {final_int_s2, int_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH : 2*INT_MAN_WIDTH - (MAX_INT_WIDTH+1) + 1];
assign {final_mant_s2, i2f_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
assign {final_int_s2, f2i_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH : 2*INT_MAN_WIDTH - (INT_WIDTH+1) + 1];
// Collapse sticky bits
assign fp_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
assign int_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
assign fp_round_has_sticky_s2[i] = (| fp_round_sticky_bits_s2);
assign int_round_has_sticky_s2[i] = (| int_round_sticky_bits_s2);
assign i2f_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
assign f2i_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
assign i2f_round_has_sticky_s2[i] = (| i2f_round_sticky_bits_s2);
assign f2i_round_has_sticky_s2[i] = (| f2i_round_sticky_bits_s2);
// select RS bits for destination operation
assign round_sticky_bits_s2 = is_itof_s2 ? fp_round_sticky_bits_s2 : int_round_sticky_bits_s2;
assign round_sticky_bits_s2 = is_itof_s2 ? i2f_round_sticky_bits_s2 : f2i_round_sticky_bits_s2;
// Pack exponent and mantissa into proper rounding form
assign fmt_pre_round_abs_s2 = {1'b0, final_exp_s2[i][EXP_BITS-1:0], final_mant_s2[MAN_BITS-1:0]};
@@ -322,15 +287,15 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
wire [NUM_LANES-1:0] lane_mask_s3;
wire [TAGW-1:0] tag_in_s3;
wire is_itof_s3;
wire unsigned_s3;
wire is_signed_s3;
fclass_t [NUM_LANES-1:0] fclass_s3;
wire [NUM_LANES-1:0] mant_is_zero_s3;
wire [NUM_LANES-1:0] input_sign_s3;
wire [NUM_LANES-1:0] rounded_sign_s3;
wire [NUM_LANES-1:0][31:0] rounded_abs_s3;
wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s3;
wire [NUM_LANES-1:0] of_before_round_s3;
wire [NUM_LANES-1:0] int_round_has_sticky_s3;
wire [NUM_LANES-1:0] fp_round_has_sticky_s3;
wire [NUM_LANES-1:0] f2i_round_has_sticky_s3;
wire [NUM_LANES-1:0] i2f_round_has_sticky_s3;
VX_pipe_register #(
.DATAW (1 + NUM_LANES + TAGW + 1 + 1 + NUM_LANES * ($bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1)),
@@ -339,105 +304,71 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
.clk (clk),
.reset (reset),
.enable (~stall),
.data_in ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, int_round_has_sticky_s2, fp_round_has_sticky_s2}),
.data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, unsigned_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, int_round_has_sticky_s3, fp_round_has_sticky_s3})
.data_in ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, is_signed_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, f2i_round_has_sticky_s2, i2f_round_has_sticky_s2}),
.data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, is_signed_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, f2i_round_has_sticky_s3, i2f_round_has_sticky_s3})
);
wire [NUM_LANES-1:0] of_after_round_s3;
wire [NUM_LANES-1:0] uf_after_round_s3;
wire [NUM_LANES-1:0][31:0] fmt_result_s3;
wire [NUM_LANES-1:0][31:0] rounded_int_res_s3; // after possible inversion
wire [NUM_LANES-1:0][INT_WIDTH-1:0] fmt_result_s3;
wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_int_res_s3; // after possible inversion
wire [NUM_LANES-1:0] rounded_int_res_zero_s3; // after rounding
for (genvar i = 0; i < NUM_LANES; ++i) begin
// Assemble regular result, nan box short ones. Int zeroes need to be detected
assign fmt_result_s3[i] = (is_itof_s3 & mant_is_zero_s3[i]) ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};
// Classification after rounding select by destination format
assign uf_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == 0); // denormal
assign of_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == ~0); // inf exp.
assign fmt_result_s3[i] = mant_is_zero_s3[i] ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};
// Negative integer result needs to be brought into two's complement
assign rounded_int_res_s3[i] = rounded_sign_s3[i] ? (-rounded_abs_s3[i]) : rounded_abs_s3[i];
assign rounded_int_res_zero_s3[i] = (rounded_int_res_s3[i] == 0);
end
// FP Special case handling
// F2I Special case handling
wire [NUM_LANES-1:0][31:0] fp_special_result_s3;
fflags_t [NUM_LANES-1:0] fp_special_status_s3;
wire [NUM_LANES-1:0] fp_result_is_special_s3;
for (genvar i = 0; i < NUM_LANES; ++i) begin
// Detect special case from source format, I2F casts don't produce a special result
assign fp_result_is_special_s3[i] = ~is_itof_s3 & (fclass_s3[i].is_zero | fclass_s3[i].is_nan);
// Signalling input NaNs raise invalid flag, otherwise no flags set
assign fp_special_status_s3[i] = fclass_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0; // invalid operation
// Assemble result according to destination format
assign fp_special_result_s3[i] = fclass_s3[i].is_zero ? (32'(input_sign_s3) << 31) // signed zero
: {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
end
// INT Special case handling
reg [NUM_LANES-1:0][31:0] int_special_result_s3;
fflags_t [NUM_LANES-1:0] int_special_status_s3;
wire [NUM_LANES-1:0] int_result_is_special_s3;
reg [NUM_LANES-1:0][INT_WIDTH-1:0] f2i_special_result_s3;
fflags_t [NUM_LANES-1:0] f2i_special_status_s3;
wire [NUM_LANES-1:0] f2i_result_is_special_s3;
for (genvar i = 0; i < NUM_LANES; ++i) begin
// Assemble result according to destination format
always @(*) begin
if (input_sign_s3[i] && !fclass_s3[i].is_nan) begin
int_special_result_s3[i][30:0] = '0; // alone yields 2**(31)-1
int_special_result_s3[i][31] = ~unsigned_s3; // for unsigned casts yields 2**31
f2i_special_result_s3[i][INT_WIDTH-2:0] = '0; // alone yields 2**(31)-1
f2i_special_result_s3[i][INT_WIDTH-1] = is_signed_s3; // for unsigned casts yields 2**31
end else begin
int_special_result_s3[i][30:0] = 2**(31) - 1; // alone yields 2**(31)-1
int_special_result_s3[i][31] = unsigned_s3; // for unsigned casts yields 2**31
f2i_special_result_s3[i][INT_WIDTH-2:0] = 2**(INT_WIDTH-1) - 1; // alone yields 2**(31)-1
f2i_special_result_s3[i][INT_WIDTH-1] = ~is_signed_s3; // for unsigned casts yields 2**31
end
end
// Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
assign int_result_is_special_s3[i] = fclass_s3[i].is_nan
assign f2i_result_is_special_s3[i] = fclass_s3[i].is_nan
| fclass_s3[i].is_inf
| of_before_round_s3[i]
| (input_sign_s3[i] & unsigned_s3 & ~rounded_int_res_zero_s3[i]);
| (input_sign_s3[i] & ~is_signed_s3 & ~rounded_int_res_zero_s3[i]);
// All integer special cases are invalid
assign int_special_status_s3[i] = {1'b1, 4'h0};
assign f2i_special_status_s3[i] = {1'b1, 4'h0};
end
// Result selection and Output handshake
fflags_t [NUM_LANES-1:0] tmp_fflags_s3;
wire [NUM_LANES-1:0][31:0] tmp_result_s3;
wire [NUM_LANES-1:0][INT_WIDTH-1:0] tmp_result_s3;
for (genvar i = 0; i < NUM_LANES; ++i) begin
fflags_t fp_regular_status_s3, int_regular_status_s3;
fflags_t fp_status_s3, int_status_s3;
wire [31:0] fp_result_s3, int_result_s3;
for (genvar i = 0; i < NUM_LANES; ++i) begin
fflags_t i2f_regular_status_s3, f2i_regular_status_s3;
fflags_t i2f_status_s3, f2i_status_s3;
wire inexact_s3 = is_itof_s3 ? fp_round_has_sticky_s3[i] // overflow is invalid in i2f;
: (fp_round_has_sticky_s3[i] || (~fclass_s3[i].is_inf && (of_before_round_s3[i] || of_after_round_s3[i])));
assign fp_regular_status_s3.NV = is_itof_s3 & (of_before_round_s3[i] | of_after_round_s3[i]); // overflow is invalid for I2F casts
assign fp_regular_status_s3.DZ = 1'b0; // no divisions
assign fp_regular_status_s3.OF = ~is_itof_s3 & (~fclass_s3[i].is_inf & (of_before_round_s3[i] | of_after_round_s3[i])); // inf casts no OF
assign fp_regular_status_s3.UF = uf_after_round_s3[i] & inexact_s3;
assign fp_regular_status_s3.NX = inexact_s3;
assign i2f_regular_status_s3 = {4'h0, i2f_round_has_sticky_s3[i]};
assign f2i_regular_status_s3 = {4'h0, f2i_round_has_sticky_s3[i]};
assign int_regular_status_s3 = int_round_has_sticky_s3[i] ? {4'h0, 1'b1} : 5'h0;
assign i2f_status_s3 = i2f_regular_status_s3;
assign f2i_status_s3 = f2i_result_is_special_s3[i] ? f2i_special_status_s3[i] : f2i_regular_status_s3;
assign fp_result_s3 = fp_result_is_special_s3[i] ? fp_special_result_s3[i] : fmt_result_s3[i];
assign int_result_s3 = int_result_is_special_s3[i] ? int_special_result_s3[i] : rounded_int_res_s3[i];
wire [INT_WIDTH-1:0] i2f_result_s3 = fmt_result_s3[i];
wire [INT_WIDTH-1:0] f2i_result_s3 = f2i_result_is_special_s3[i] ? f2i_special_result_s3[i] : rounded_int_res_s3[i];
assign fp_status_s3 = fp_result_is_special_s3[i] ? fp_special_status_s3[i] : fp_regular_status_s3;
assign int_status_s3 = int_result_is_special_s3[i] ? int_special_status_s3[i] : int_regular_status_s3;
// Select output depending on special case detection
assign tmp_result_s3[i] = is_itof_s3 ? fp_result_s3 : int_result_s3;
assign tmp_fflags_s3[i] = is_itof_s3 ? fp_status_s3 : int_status_s3;
assign tmp_result_s3[i] = is_itof_s3 ? i2f_result_s3 : f2i_result_s3;
assign tmp_fflags_s3[i] = is_itof_s3 ? i2f_status_s3 : f2i_status_s3;
end
assign stall = ~ready_out && valid_out;
@@ -457,7 +388,6 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
);
assign ready_in = ~stall;
assign has_fflags = 1'b1;
endmodule

View File

@@ -16,7 +16,7 @@
`include "VX_define.vh"
`ifndef SYNTHESIS
`ifdef SV_DPI
`include "float_dpi.vh"
`endif

View File

@@ -54,7 +54,6 @@ module VX_fpu_rounding #(
2'b01: round_up = 1'b0; // < ulp/2 away, round down
2'b10: round_up = abs_value_i[0]; // = ulp/2 away, round towards even result
2'b11: round_up = 1'b1; // > ulp/2 away, round up
default: round_up = 1'bx;
endcase
`INST_FRM_RTZ: round_up = 1'b0; // always round down
`INST_FRM_RDN: round_up = (| round_sticky_bits_i) & sign_i; // to 0 if +, away if -

View File

@@ -14,26 +14,38 @@
`include "VX_define.vh"
interface VX_pipeline_perf_if ();
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
wire [`PERF_CTR_BITS-1:0] scb_stalls;
wire [`PERF_CTR_BITS-1:0] dsp_stalls [`NUM_EX_UNITS];
wire [`PERF_CTR_BITS-1:0] sched_idles;
wire [`PERF_CTR_BITS-1:0] sched_stalls;
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
wire [`PERF_CTR_BITS-1:0] scb_stalls;
wire [`PERF_CTR_BITS-1:0] units_uses [`NUM_EX_UNITS];
wire [`PERF_CTR_BITS-1:0] sfu_uses [`NUM_SFU_UNITS];
wire [`PERF_CTR_BITS-1:0] ifetches;
wire [`PERF_CTR_BITS-1:0] loads;
wire [`PERF_CTR_BITS-1:0] stores;
wire [`PERF_CTR_BITS-1:0] ifetch_latency;
wire [`PERF_CTR_BITS-1:0] load_latency;
wire [`PERF_CTR_BITS-1:0] ifetches;
wire [`PERF_CTR_BITS-1:0] loads;
wire [`PERF_CTR_BITS-1:0] stores;
wire [`PERF_CTR_BITS-1:0] ifetch_latency;
wire [`PERF_CTR_BITS-1:0] load_latency;
modport schedule (
output sched_idles,
output sched_stalls
);
modport issue (
output ibf_stalls,
output scb_stalls,
output dsp_stalls
);
output units_uses,
output sfu_uses
);
modport slave (
input sched_idles,
input sched_stalls,
input ibf_stalls,
input scb_stalls,
input dsp_stalls,
input units_uses,
input sfu_uses,
input ifetches,
input loads,
input stores,

View File

@@ -21,8 +21,8 @@ module VX_avs_adapter #(
parameter NUM_BANKS = 1,
parameter TAG_WIDTH = 1,
parameter RD_QUEUE_SIZE = 1,
parameter OUT_REG_REQ = 0,
parameter OUT_REG_RSP = 0
parameter OUT_REG_REQ = 0,
parameter OUT_REG_RSP = 0
) (
input wire clk,
input wire reset,

View File

@@ -20,7 +20,7 @@ module VX_axi_adapter #(
parameter TAG_WIDTH = 8,
parameter NUM_BANKS = 1,
parameter AVS_ADDR_WIDTH = (ADDR_WIDTH - `CLOG2(DATA_WIDTH/8)),
parameter OUT_REG_RSP = 0
parameter OUT_REG_RSP = 0
) (
input wire clk,
input wire reset,

View File

@@ -201,9 +201,7 @@ module VX_fifo_queue #(
rd_ptr_r <= '0;
rd_ptr_n_r <= 1;
end else begin
if (push) begin
wr_ptr_r <= wr_ptr_r + ADDRW'(1);
end
wr_ptr_r <= wr_ptr_r + ADDRW'(push);
if (pop) begin
rd_ptr_r <= rd_ptr_n_r;
if (DEPTH > 2) begin

View File

@@ -21,8 +21,8 @@ module VX_mem_adapter #(
parameter DST_ADDR_WIDTH = 1,
parameter SRC_TAG_WIDTH = 1,
parameter DST_TAG_WIDTH = 1,
parameter OUT_REG_REQ = 0,
parameter OUT_REG_RSP = 0
parameter OUT_REG_REQ = 0,
parameter OUT_REG_RSP = 0
) (
input wire clk,
input wire reset,

View File

@@ -21,7 +21,7 @@ module VX_stream_arb #(
parameter `STRING ARBITER = "P",
parameter LOCK_ENABLE = 1,
parameter MAX_FANOUT = `MAX_FANOUT,
parameter OUT_REG = 0 ,
parameter OUT_REG = 0 ,
parameter NUM_REQS = (NUM_INPUTS + NUM_OUTPUTS - 1) / NUM_OUTPUTS,
parameter LOG_NUM_REQS = `CLOG2(NUM_REQS),
parameter NUM_REQS_W = `UP(LOG_NUM_REQS)

View File

@@ -173,25 +173,27 @@ module VX_stream_xbar #(
end
// compute inputs collision
// we have a collision when there exists a valid transfer with mutiple input candicates
// we caount the unique duplicates each cycle.
// we have a collision when there exists a valid transfer with multiple input candicates
// we count the unique duplicates each cycle.
reg [NUM_INPUTS-1:0] per_cycle_collision, per_cycle_collision_r;
wire [`CLOG2(NUM_INPUTS+1)-1:0] collision_count;
reg [PERF_CTR_BITS-1:0] collisions_r;
reg [NUM_INPUTS-1:0] per_cycle_collision;
always @(*) begin
per_cycle_collision = 0;
for (integer i = 0; i < NUM_INPUTS; ++i) begin
for (integer j = 1; j < (NUM_INPUTS-i); ++j) begin
if (valid_in[i] && valid_in[j+i] && sel_in[i] == sel_in[j+i]) begin
per_cycle_collision[i] |= ready_in[i] | ready_in[j+i];
end
per_cycle_collision[i] |= valid_in[i]
&& valid_in[j+i]
&& (sel_in[i] == sel_in[j+i])
&& (ready_in[i] | ready_in[j+i]);
end
end
end
wire [`CLOG2(NUM_INPUTS+1)-1:0] collision_count;
`POP_COUNT(collision_count, per_cycle_collision);
`BUFFER(per_cycle_collision_r, per_cycle_collision);
`POP_COUNT(collision_count, per_cycle_collision_r);
always @(posedge clk) begin
if (reset) begin

View File

@@ -15,7 +15,7 @@
module VX_gbar_arb #(
parameter NUM_REQS = 1,
parameter OUT_REG = 0,
parameter OUT_REG = 0,
parameter `STRING ARBITER = "R"
) (
input wire clk,

View File

@@ -21,8 +21,8 @@ module VX_mem_arb #(
parameter ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE)),
parameter TAG_WIDTH = 1,
parameter TAG_SEL_IDX = 0,
parameter OUT_REG_REQ = 0,
parameter OUT_REG_RSP = 0,
parameter OUT_REG_REQ = 0,
parameter OUT_REG_RSP = 0,
parameter `STRING ARBITER = "R"
) (
input wire clk,

View File

@@ -233,10 +233,12 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
wire [`CLOG2(NUM_REQS+1)-1:0] perf_writes_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
wire [NUM_REQS-1:0] perf_reads_per_req = req_valid & req_ready & ~req_rw;
wire [NUM_REQS-1:0] perf_writes_per_req = req_valid & req_ready & req_rw;
wire [NUM_REQS-1:0] perf_reads_per_req, perf_writes_per_req;
wire [NUM_REQS-1:0] perf_crsp_stall_per_req = rsp_valid & ~rsp_ready;
`BUFFER(perf_reads_per_req, req_valid & req_ready & ~req_rw);
`BUFFER(perf_writes_per_req, req_valid & req_ready & req_rw);
`POP_COUNT(perf_reads_per_cycle, perf_reads_per_req);
`POP_COUNT(perf_writes_per_cycle, perf_writes_per_req);
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);

View File

@@ -19,8 +19,8 @@ module VX_smem_switch #(
parameter TAG_WIDTH = 1,
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
parameter TAG_SEL_IDX = 0,
parameter OUT_REG_REQ = 0,
parameter OUT_REG_RSP = 0,
parameter OUT_REG_REQ = 0,
parameter OUT_REG_RSP = 0,
parameter `STRING ARBITER = "R"
) (
input wire clk,

View File

@@ -56,17 +56,17 @@ TARGET=asesim make -C runtime/opae
PREFIX=build_base CONFIGS="-DEXT_F_DISABLE -DL1_DISABLE -DSM_DISABLE -DNUM_WARPS=2 -DNUM_THREADS=2" TARGET=asesim make
# ASE test runs
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/regression/basic/basic -n1 -t0
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/regression/basic/basic -n1 -t1
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/regression/basic/basic -n16
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/regression/demo/demo -n16
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/regression/dogfood/dogfood -n16
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/opencl/vecadd/vecadd
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/opencl/sgemm/sgemm -n4
./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/basic/basic -n1 -t0
./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/basic/basic -n1 -t1
./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/basic/basic -n16
./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/demo/demo -n16
./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/dogfood/dogfood -n16
./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/opencl/vecadd/vecadd
./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/opencl/sgemm/sgemm -n4
# modify "vsim_run.tcl" to dump VCD trace
vcd file trace.vcd
vcd add -r /*/Vortex/hw/rtl/*
vcd add -r /*/afu/*
run -all
# compress FPGA output files

View File

@@ -15,27 +15,27 @@
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
BUILD_DIR=$1
BUILD_DIR=$(realpath $1)
PROGRAM=$(basename "$2")
PROGRAM_DIR=`dirname $2`
POCL_RT_PATH=$TOOLDIR/pocl/runtime
VORTEX_RT_PATH=$SCRIPT_DIR/../../../../runtime
# Export ASE_WORKDIR variable
export ASE_WORKDIR=$SCRIPT_DIR/$BUILD_DIR/work
shift 2
export ASE_WORKDIR=$BUILD_DIR/synth/work
# cleanup incomplete runs
rm -f $ASE_WORKDIR/.app_lock.pid
rm -f $ASE_WORKDIR/.ase_ready.pid
rm -f $SCRIPT_DIR/$BUILD_DIR/nohup.out
rm -f $BUILD_DIR/synth/nohup.out
# Start Simulator in background
pushd $SCRIPT_DIR/$BUILD_DIR
echo " [DBG] starting ASE simnulator (stdout saved to '$SCRIPT_DIR/$BUILD_DIR/nohup.out')"
nohup make sim &
# Start Simulator in background (capture processs group pid)
pushd $BUILD_DIR/synth
echo " [DBG] starting ASE simnulator (stdout saved to '$BUILD_DIR/synth/nohup.out')"
setsid make sim &> /dev/null &
SIM_PID=$!
popd
# Wait for simulator readiness
@@ -47,6 +47,11 @@ done
# run application
pushd $PROGRAM_DIR
shift 2
echo " [DBG] running ./$PROGRAM $*"
ASE_LOG=0 LD_LIBRARY_PATH=$POCL_RT_PATH/lib:$VORTEX_RT_PATH/opae:$LD_LIBRARY_PATH ./$PROGRAM $*
popd
# stop the simulator (kill process group)
kill -- -$(ps -o pgid= $SIM_PID | grep -o '[0-9]*')
wait $SIM_PID 2> /dev/null

View File

@@ -1,6 +1,6 @@
PROJECT = VX_cache_cluster_top
PROJECT = VX_cache_top
TOP_LEVEL_ENTITY = $(PROJECT)
SRC_FILE = VX_cache_cluster.sv
SRC_FILE = $(PROJECT).sv
include ../../common.mk

View File

@@ -1,6 +1,6 @@
PROJECT = VX_core_top
TOP_LEVEL_ENTITY = $(PROJECT)
SRC_FILE = VX_core.sv
SRC_FILE = $(PROJECT).sv
include ../../common.mk

View File

@@ -1,10 +1,11 @@
XLEN ?= 32
TOOLDIR ?= /opt
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
CFLAGS += -march=rv64imafd -mabi=lp64d
else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
CFLAGS += -march=rv32imaf -mabi=ilp32f
endif

View File

@@ -33,7 +33,7 @@ VL_FLAGS = --exe
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += -DSIMULATION
VL_FLAGS += -DSIMULATION -DSV_DPI
VL_FLAGS += $(CONFIGS)
VL_FLAGS += $(PARAMS)
VL_FLAGS += $(RTL_INCLUDE)

View File

@@ -27,7 +27,7 @@ VL_FLAGS = --exe
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += -DSIMULATION
VL_FLAGS += -DSIMULATION -DSV_DPI
VL_FLAGS += $(CONFIGS)
VL_FLAGS += $(PARAMS)
VL_FLAGS += $(RTL_INCLUDE)

View File

@@ -27,7 +27,7 @@ VL_FLAGS = --exe
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += -DSIMULATION
VL_FLAGS += -DSIMULATION -DSV_DPI
VL_FLAGS += $(CONFIGS)
VL_FLAGS += $(PARAMS)
VL_FLAGS += $(RTL_INCLUDE)

View File

@@ -25,7 +25,7 @@ VL_FLAGS = --exe
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += -DSIMULATION
VL_FLAGS += -DSIMULATION -DSV_DPI
VL_FLAGS += $(CONFIGS)
VL_FLAGS += $(PARAMS)
VL_FLAGS += $(RTL_INCLUDE)
@@ -56,7 +56,6 @@ PROJECT = top_modules
all: build
build: $(SRCS)
verilator --build $(VL_FLAGS) --cc VX_cache_cluster_top --top-module VX_cache_cluster_top $^ -CFLAGS '$(CXXFLAGS)'
verilator --build $(VL_FLAGS) --cc VX_cache_top --top-module VX_cache_top $^ -CFLAGS '$(CXXFLAGS)'
verilator --build $(VL_FLAGS) --cc VX_core_top --top-module VX_core_top $^ -CFLAGS '$(CXXFLAGS)'