Merge remote-tracking branch 'upstream/master' into vortex2
This commit is contained in:
@@ -43,7 +43,16 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
||||
`ifdef SCOPE
|
||||
localparam scope_socket = 0;
|
||||
`SCOPE_IO_SWITCH (scope_socket + `NUM_SOCKETS);
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
assign mem_perf_tmp_if.icache = 'x;
|
||||
assign mem_perf_tmp_if.dcache = 'x;
|
||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||
assign mem_perf_tmp_if.smem = 'x;
|
||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||
`endif
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
|
||||
@@ -69,18 +78,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
||||
.reset (gbar_reset),
|
||||
.gbar_bus_if (gbar_bus_if)
|
||||
);
|
||||
`endif
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
cache_perf_t perf_l2cache;
|
||||
|
||||
assign mem_perf_tmp_if.icache = 'x;
|
||||
assign mem_perf_tmp_if.dcache = 'x;
|
||||
assign mem_perf_tmp_if.l2cache = perf_l2cache;
|
||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||
assign mem_perf_tmp_if.smem = 'x;
|
||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||
`endif
|
||||
|
||||
VX_mem_bus_if #(
|
||||
@@ -102,7 +100,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
||||
.MSHR_SIZE (`L2_MSHR_SIZE),
|
||||
.MRSQ_SIZE (`L2_MRSQ_SIZE),
|
||||
.MREQ_SIZE (`L2_MREQ_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH),
|
||||
.TAG_WIDTH (L2_TAG_WIDTH),
|
||||
.WRITE_ENABLE (1),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.CORE_OUT_REG (2),
|
||||
@@ -113,7 +111,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
||||
.clk (clk),
|
||||
.reset (l2_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (perf_l2cache),
|
||||
.cache_perf (mem_perf_tmp_if.l2cache),
|
||||
`endif
|
||||
.core_bus_if (per_socket_mem_bus_if),
|
||||
.mem_bus_if (mem_bus_if)
|
||||
@@ -146,6 +144,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
||||
.SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + i)
|
||||
) socket (
|
||||
`SCOPE_IO_BIND (scope_socket+i)
|
||||
|
||||
.clk (clk),
|
||||
.reset (socket_reset),
|
||||
|
||||
@@ -156,7 +155,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
||||
.dcr_bus_if (socket_dcr_bus_if),
|
||||
|
||||
.mem_bus_if (per_socket_mem_bus_if[i]),
|
||||
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
.gbar_bus_if (per_socket_gbar_bus_if[i]),
|
||||
`endif
|
||||
@@ -167,6 +166,6 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
||||
);
|
||||
end
|
||||
|
||||
`BUFFER_BUSY (busy, (| per_socket_busy), (`NUM_SOCKETS > 1));
|
||||
`BUFFER_EX(busy, (| per_socket_busy), 1'b1, (`NUM_SOCKETS > 1));
|
||||
|
||||
endmodule
|
||||
|
||||
@@ -191,13 +191,21 @@
|
||||
`define STALL_TIMEOUT (100000 * (1 ** (`L2_ENABLED + `L3_ENABLED)))
|
||||
`endif
|
||||
|
||||
`ifndef SV_DPI
|
||||
`define DPI_DISABLE
|
||||
`endif
|
||||
|
||||
`ifndef FPU_FPNEW
|
||||
`ifndef FPU_DSP
|
||||
`ifndef FPU_DPI
|
||||
`ifdef SYNTHESIS
|
||||
`define FPU_DSP
|
||||
`else
|
||||
`ifndef SYNTHESIS
|
||||
`ifndef DPI_DISABLE
|
||||
`define FPU_DPI
|
||||
`else
|
||||
`define FPU_DSP
|
||||
`endif
|
||||
`else
|
||||
`define FPU_DSP
|
||||
`endif
|
||||
`endif
|
||||
`endif
|
||||
@@ -223,18 +231,18 @@
|
||||
|
||||
// Number of ALU units
|
||||
`ifndef NUM_ALU_LANES
|
||||
`define NUM_ALU_LANES `UP(`NUM_THREADS / 2)
|
||||
`define NUM_ALU_LANES `NUM_THREADS
|
||||
`endif
|
||||
`ifndef NUM_ALU_BLOCKS
|
||||
`define NUM_ALU_BLOCKS `UP(`ISSUE_WIDTH / 1)
|
||||
`define NUM_ALU_BLOCKS `ISSUE_WIDTH
|
||||
`endif
|
||||
|
||||
// Number of FPU units
|
||||
`ifndef NUM_FPU_LANES
|
||||
`define NUM_FPU_LANES `UP(`NUM_THREADS / 2)
|
||||
`define NUM_FPU_LANES `NUM_THREADS
|
||||
`endif
|
||||
`ifndef NUM_FPU_BLOCKS
|
||||
`define NUM_FPU_BLOCKS `UP(`ISSUE_WIDTH / 1)
|
||||
`define NUM_FPU_BLOCKS `ISSUE_WIDTH
|
||||
`endif
|
||||
|
||||
// Number of LSU units
|
||||
@@ -258,7 +266,10 @@
|
||||
`endif
|
||||
|
||||
// LSU Duplicate Address Check
|
||||
`ifdef LSU_DUP
|
||||
`ifndef LSU_DUP_DISABLE
|
||||
`define LSU_DUP_ENABLE
|
||||
`endif
|
||||
`ifdef LSU_DUP_ENABLE
|
||||
`define LSU_DUP_ENABLED 1
|
||||
`else
|
||||
`define LSU_DUP_ENABLED 0
|
||||
@@ -285,8 +296,8 @@
|
||||
// Floating-Point Units ///////////////////////////////////////////////////////
|
||||
|
||||
// Size of FPU Request Queue
|
||||
`ifndef FPU_REQ_QUEUE_SIZE
|
||||
`define FPU_REQ_QUEUE_SIZE (2 * (`NUM_THREADS / `NUM_FPU_LANES))
|
||||
`ifndef FPUQ_SIZE
|
||||
`define FPUQ_SIZE (2 * (`NUM_THREADS / `NUM_FPU_LANES))
|
||||
`endif
|
||||
|
||||
// FNCP Latency
|
||||
@@ -377,7 +388,7 @@
|
||||
|
||||
// Number of Cache Units
|
||||
`ifndef NUM_ICACHES
|
||||
`define NUM_ICACHES `UP(`NUM_CORES / 4)
|
||||
`define NUM_ICACHES `UP(`SOCKET_SIZE / 4)
|
||||
`endif
|
||||
|
||||
// Cache Size
|
||||
@@ -407,7 +418,7 @@
|
||||
|
||||
// Number of Associative Ways
|
||||
`ifndef ICACHE_NUM_WAYS
|
||||
`define ICACHE_NUM_WAYS 2
|
||||
`define ICACHE_NUM_WAYS 1
|
||||
`endif
|
||||
|
||||
// Dcache Configurable Knobs //////////////////////////////////////////////////
|
||||
@@ -426,7 +437,7 @@
|
||||
|
||||
// Number of Cache Units
|
||||
`ifndef NUM_DCACHES
|
||||
`define NUM_DCACHES `UP(`NUM_CORES / 4)
|
||||
`define NUM_DCACHES `UP(`SOCKET_SIZE / 4)
|
||||
`endif
|
||||
|
||||
// Cache Size
|
||||
@@ -436,7 +447,7 @@
|
||||
|
||||
// Number of Banks
|
||||
`ifndef DCACHE_NUM_BANKS
|
||||
`define DCACHE_NUM_BANKS (`NUM_LSU_LANES)
|
||||
`define DCACHE_NUM_BANKS `MIN(`NUM_LSU_LANES, 4)
|
||||
`endif
|
||||
|
||||
// Core Response Queue Size
|
||||
@@ -461,7 +472,7 @@
|
||||
|
||||
// Number of Associative Ways
|
||||
`ifndef DCACHE_NUM_WAYS
|
||||
`define DCACHE_NUM_WAYS 2
|
||||
`define DCACHE_NUM_WAYS 1
|
||||
`endif
|
||||
|
||||
// SM Configurable Knobs //////////////////////////////////////////////////////
|
||||
@@ -520,7 +531,7 @@
|
||||
|
||||
// Number of Associative Ways
|
||||
`ifndef L2_NUM_WAYS
|
||||
`define L2_NUM_WAYS 4
|
||||
`define L2_NUM_WAYS 2
|
||||
`endif
|
||||
|
||||
// L3cache Configurable Knobs /////////////////////////////////////////////////
|
||||
|
||||
@@ -1,418 +1,437 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`ifndef VX_DEFINE_VH
|
||||
`define VX_DEFINE_VH
|
||||
|
||||
`include "VX_platform.vh"
|
||||
`include "VX_config.vh"
|
||||
`include "VX_types.vh"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define NW_BITS `CLOG2(`NUM_WARPS)
|
||||
`define NC_WIDTH `UP(`NC_BITS)
|
||||
|
||||
`define NT_BITS `CLOG2(`NUM_THREADS)
|
||||
`define NW_WIDTH `UP(`NW_BITS)
|
||||
|
||||
`define NC_BITS `CLOG2(`NUM_CORES)
|
||||
`define NT_WIDTH `UP(`NT_BITS)
|
||||
|
||||
`define NB_BITS `CLOG2(`NUM_BARRIERS)
|
||||
`define NB_WIDTH `UP(`NB_BITS)
|
||||
|
||||
`define NUM_IREGS 32
|
||||
|
||||
`define NRI_BITS `CLOG2(`NUM_IREGS)
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
`define NUM_REGS (2 * `NUM_IREGS)
|
||||
`else
|
||||
`define NUM_REGS `NUM_IREGS
|
||||
`endif
|
||||
|
||||
`define NR_BITS `CLOG2(`NUM_REGS)
|
||||
|
||||
`define PERF_CTR_BITS 44
|
||||
|
||||
`ifndef NDEBUG
|
||||
`define UUID_WIDTH 44
|
||||
`else
|
||||
`define UUID_WIDTH 1
|
||||
`endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define EX_ALU 0
|
||||
`define EX_LSU 1
|
||||
`define EX_SFU 2
|
||||
`define EX_FPU 3
|
||||
|
||||
`define NUM_EX_UNITS (3 + `EXT_F_ENABLED)
|
||||
`define EX_BITS `CLOG2(`NUM_EX_UNITS)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_LUI 7'b0110111
|
||||
`define INST_AUIPC 7'b0010111
|
||||
`define INST_JAL 7'b1101111
|
||||
`define INST_JALR 7'b1100111
|
||||
`define INST_B 7'b1100011 // branch instructions
|
||||
`define INST_L 7'b0000011 // load instructions
|
||||
`define INST_S 7'b0100011 // store instructions
|
||||
`define INST_I 7'b0010011 // immediate instructions
|
||||
`define INST_R 7'b0110011 // register instructions
|
||||
`define INST_FENCE 7'b0001111 // Fence instructions
|
||||
`define INST_SYS 7'b1110011 // system instructions
|
||||
|
||||
// RV64I instruction specific opcodes (for any W instruction)
|
||||
`define INST_I_W 7'b0011011 // W type immediate instructions
|
||||
`define INST_R_W 7'b0111011 // W type register instructions
|
||||
|
||||
`define INST_FL 7'b0000111 // float load instruction
|
||||
`define INST_FS 7'b0100111 // float store instruction
|
||||
`define INST_FMADD 7'b1000011
|
||||
`define INST_FMSUB 7'b1000111
|
||||
`define INST_FNMSUB 7'b1001011
|
||||
`define INST_FNMADD 7'b1001111
|
||||
`define INST_FCI 7'b1010011 // float common instructions
|
||||
|
||||
// Custom extension opcodes
|
||||
`define INST_EXT1 7'b0001011 // 0x0B
|
||||
`define INST_EXT2 7'b0101011 // 0x2B
|
||||
`define INST_EXT3 7'b1011011 // 0x5B
|
||||
`define INST_EXT4 7'b1111011 // 0x7B
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_FRM_RNE 3'b000 // round to nearest even
|
||||
`define INST_FRM_RTZ 3'b001 // round to zero
|
||||
`define INST_FRM_RDN 3'b010 // round to -inf
|
||||
`define INST_FRM_RUP 3'b011 // round to +inf
|
||||
`define INST_FRM_RMM 3'b100 // round to nearest max magnitude
|
||||
`define INST_FRM_DYN 3'b111 // dynamic mode
|
||||
`define INST_FRM_BITS 3
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_OP_BITS 4
|
||||
`define INST_MOD_BITS 3
|
||||
`define INST_FMT_BITS 2
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_ALU_ADD 4'b0000
|
||||
`define INST_ALU_LUI 4'b0010
|
||||
`define INST_ALU_AUIPC 4'b0011
|
||||
`define INST_ALU_SLTU 4'b0100
|
||||
`define INST_ALU_SLT 4'b0101
|
||||
`define INST_ALU_SUB 4'b0111
|
||||
`define INST_ALU_SRL 4'b1000
|
||||
`define INST_ALU_SRA 4'b1001
|
||||
`define INST_ALU_AND 4'b1100
|
||||
`define INST_ALU_OR 4'b1101
|
||||
`define INST_ALU_XOR 4'b1110
|
||||
`define INST_ALU_SLL 4'b1111
|
||||
`define INST_ALU_OTHER 4'b0111
|
||||
`define INST_ALU_BITS 4
|
||||
`define INST_ALU_CLASS(op) op[3:2]
|
||||
`define INST_ALU_SIGNED(op) op[0]
|
||||
`define INST_ALU_IS_SUB(op) op[1]
|
||||
`define INST_ALU_IS_BR(mod) mod[0]
|
||||
`define INST_ALU_IS_M(mod) mod[1]
|
||||
`define INST_ALU_IS_W(mod) mod[2]
|
||||
|
||||
`define INST_BR_EQ 4'b0000
|
||||
`define INST_BR_NE 4'b0010
|
||||
`define INST_BR_LTU 4'b0100
|
||||
`define INST_BR_GEU 4'b0110
|
||||
`define INST_BR_LT 4'b0101
|
||||
`define INST_BR_GE 4'b0111
|
||||
`define INST_BR_JAL 4'b1000
|
||||
`define INST_BR_JALR 4'b1001
|
||||
`define INST_BR_ECALL 4'b1010
|
||||
`define INST_BR_EBREAK 4'b1011
|
||||
`define INST_BR_URET 4'b1100
|
||||
`define INST_BR_SRET 4'b1101
|
||||
`define INST_BR_MRET 4'b1110
|
||||
`define INST_BR_OTHER 4'b1111
|
||||
`define INST_BR_BITS 4
|
||||
`define INST_BR_CLASS(op) {1'b0, ~op[3]}
|
||||
`define INST_BR_IS_NEG(op) op[1]
|
||||
`define INST_BR_IS_LESS(op) op[2]
|
||||
`define INST_BR_IS_STATIC(op) op[3]
|
||||
|
||||
`define INST_M_MUL 3'b000
|
||||
`define INST_M_MULHU 3'b001
|
||||
`define INST_M_MULH 3'b010
|
||||
`define INST_M_MULHSU 3'b011
|
||||
`define INST_M_DIV 3'b100
|
||||
`define INST_M_DIVU 3'b101
|
||||
`define INST_M_REM 3'b110
|
||||
`define INST_M_REMU 3'b111
|
||||
`define INST_M_BITS 3
|
||||
`define INST_M_SIGNED(op) (~op[0])
|
||||
`define INST_M_IS_MULX(op) (~op[2])
|
||||
`define INST_M_IS_MULH(op) (op[1:0] != 0)
|
||||
`define INST_M_SIGNED_A(op) (op[1:0] != 1)
|
||||
`define INST_M_IS_REM(op) op[1]
|
||||
|
||||
`define INST_FMT_B 3'b000
|
||||
`define INST_FMT_H 3'b001
|
||||
`define INST_FMT_W 3'b010
|
||||
`define INST_FMT_D 3'b011
|
||||
`define INST_FMT_BU 3'b100
|
||||
`define INST_FMT_HU 3'b101
|
||||
`define INST_FMT_WU 3'b110
|
||||
|
||||
`define INST_LSU_LB 4'b0000
|
||||
`define INST_LSU_LH 4'b0001
|
||||
`define INST_LSU_LW 4'b0010
|
||||
`define INST_LSU_LD 4'b0011 // new for RV64I LD
|
||||
`define INST_LSU_LBU 4'b0100
|
||||
`define INST_LSU_LHU 4'b0101
|
||||
`define INST_LSU_LWU 4'b0110 // new for RV64I LWU
|
||||
`define INST_LSU_SB 4'b1000
|
||||
`define INST_LSU_SH 4'b1001
|
||||
`define INST_LSU_SW 4'b1010
|
||||
`define INST_LSU_SD 4'b1011 // new for RV64I SD
|
||||
`define INST_LSU_FENCE 4'b1111
|
||||
`define INST_LSU_BITS 4
|
||||
`define INST_LSU_FMT(op) op[2:0]
|
||||
`define INST_LSU_WSIZE(op) op[1:0]
|
||||
`define INST_LSU_IS_FENCE(op) (op[3:2] == 3)
|
||||
|
||||
`define INST_FENCE_BITS 1
|
||||
`define INST_FENCE_D 1'h0
|
||||
`define INST_FENCE_I 1'h1
|
||||
|
||||
`define INST_FPU_ADD 4'b0000
|
||||
`define INST_FPU_SUB 4'b0001
|
||||
`define INST_FPU_MUL 4'b0010
|
||||
`define INST_FPU_DIV 4'b0011
|
||||
`define INST_FPU_SQRT 4'b0100
|
||||
`define INST_FPU_CMP 4'b0101 // mod: LE=0, LT=1, EQ=2
|
||||
`define INST_FPU_F2F 4'b0110
|
||||
`define INST_FPU_MISC 4'b0111 // mod: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
|
||||
`define INST_FPU_F2I 4'b1000
|
||||
`define INST_FPU_F2U 4'b1001
|
||||
`define INST_FPU_I2F 4'b1010
|
||||
`define INST_FPU_U2F 4'b1011
|
||||
`define INST_FPU_MADD 4'b1100
|
||||
`define INST_FPU_MSUB 4'b1101
|
||||
`define INST_FPU_NMSUB 4'b1110
|
||||
`define INST_FPU_NMADD 4'b1111
|
||||
`define INST_FPU_BITS 4
|
||||
`define INST_FPU_IS_W(mod) (mod[4])
|
||||
`define INST_FPU_IS_CLASS(op, mod) (op == `INST_FPU_MISC && mod == 3)
|
||||
`define INST_FPU_IS_MVXW(op, mod) (op == `INST_FPU_MISC && mod == 4)
|
||||
|
||||
`define INST_SFU_TMC 4'h0
|
||||
`define INST_SFU_WSPAWN 4'h1
|
||||
`define INST_SFU_SPLIT 4'h2
|
||||
`define INST_SFU_JOIN 4'h3
|
||||
`define INST_SFU_BAR 4'h4
|
||||
`define INST_SFU_PRED 4'h5
|
||||
`define INST_SFU_CSRRW 4'h6
|
||||
`define INST_SFU_CSRRS 4'h7
|
||||
`define INST_SFU_CSRRC 4'h8
|
||||
`define INST_SFU_CMOV 4'h9
|
||||
`define INST_SFU_BITS 4
|
||||
`define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1)
|
||||
`define INST_SFU_IS_WCTL(op) (op <= 5)
|
||||
`define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// non-cacheable tag bits
|
||||
`define NC_TAG_BITS 1
|
||||
|
||||
// cache address type bits
|
||||
`ifdef SM_ENABLE
|
||||
`define CACHE_ADDR_TYPE_BITS (`NC_TAG_BITS + 1)
|
||||
`else
|
||||
`define CACHE_ADDR_TYPE_BITS `NC_TAG_BITS
|
||||
`endif
|
||||
|
||||
`define ARB_SEL_BITS(I, O) ((I > O) ? `CLOG2((I + O - 1) / O) : 0)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks) \
|
||||
(`CLOG2(mshr_size) + `CLOG2(num_banks) + `NC_TAG_BITS)
|
||||
|
||||
`define CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
|
||||
(`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width)
|
||||
|
||||
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
|
||||
(`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) + `NC_TAG_BITS)
|
||||
|
||||
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width) \
|
||||
`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width))
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches) \
|
||||
(tag_width + `ARB_SEL_BITS(num_inputs, `UP(num_caches)))
|
||||
|
||||
`define CACHE_CLUSTER_MEM_ARB_TAG(tag_width, num_caches) \
|
||||
(tag_width + `ARB_SEL_BITS(`UP(num_caches), 1))
|
||||
|
||||
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches)
|
||||
|
||||
`define CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG((`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
|
||||
|
||||
`define CACHE_CLUSTER_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG((`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)) + `NC_TAG_BITS), num_caches)
|
||||
|
||||
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches))), num_caches)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`ifdef L2_ENABLE
|
||||
`define L2_LINE_SIZE `MEM_BLOCK_SIZE
|
||||
`else
|
||||
`define L2_LINE_SIZE `L1_LINE_SIZE
|
||||
`endif
|
||||
|
||||
`ifdef L3_ENABLE
|
||||
`define L3_LINE_SIZE `MEM_BLOCK_SIZE
|
||||
`else
|
||||
`define L3_LINE_SIZE `L2_LINE_SIZE
|
||||
`endif
|
||||
|
||||
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
|
||||
`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE))
|
||||
`define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8)
|
||||
`define VX_MEM_TAG_WIDTH L3_MEM_TAG_WIDTH
|
||||
|
||||
`define VX_DCR_ADDR_WIDTH `VX_DCR_ADDR_BITS
|
||||
`define VX_DCR_DATA_WIDTH 32
|
||||
|
||||
`define TO_FULL_ADDR(x) {x, (`MEM_ADDR_WIDTH-$bits(x))'(0)}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define BUFFER_BUSY(dst, src, enable) \
|
||||
logic __busy; \
|
||||
if (enable) begin \
|
||||
always @(posedge clk) begin \
|
||||
if (reset) begin \
|
||||
__busy <= 1'b0; \
|
||||
end else begin \
|
||||
__busy <= src; \
|
||||
end \
|
||||
end \
|
||||
end else begin \
|
||||
assign __busy = src; \
|
||||
end \
|
||||
assign dst = __busy
|
||||
|
||||
`define POP_COUNT_EX(out, in, model) \
|
||||
VX_popcount #( \
|
||||
.N ($bits(in)), \
|
||||
.MODEL (model) \
|
||||
) __``out ( \
|
||||
.data_in (in), \
|
||||
.data_out (out) \
|
||||
)
|
||||
|
||||
`define POP_COUNT(out, in) `POP_COUNT_EX(out, in, 1)
|
||||
|
||||
`define ASSIGN_VX_MEM_BUS_IF(dst, src) \
|
||||
assign dst.req_valid = src.req_valid; \
|
||||
assign dst.req_data = src.req_data; \
|
||||
assign src.req_ready = dst.req_ready; \
|
||||
assign src.rsp_valid = dst.rsp_valid; \
|
||||
assign src.rsp_data = dst.rsp_data; \
|
||||
assign dst.rsp_ready = src.rsp_ready
|
||||
|
||||
`define ASSIGN_VX_MEM_BUS_IF_X(dst, src, TD, TS) \
|
||||
assign dst.req_valid = src.req_valid; \
|
||||
assign dst.req_data.rw = src.req_data.rw; \
|
||||
assign dst.req_data.byteen = src.req_data.byteen; \
|
||||
assign dst.req_data.addr = src.req_data.addr; \
|
||||
assign dst.req_data.data = src.req_data.data; \
|
||||
if (TD != TS) \
|
||||
assign dst.req_data.tag = {src.req_data.tag, {(TD-TS){1'b0}}}; \
|
||||
else \
|
||||
assign dst.req_data.tag = src.req_data.tag; \
|
||||
assign src.req_ready = dst.req_ready; \
|
||||
assign src.rsp_valid = dst.rsp_valid; \
|
||||
assign src.rsp_data.data = dst.rsp_data.data; \
|
||||
assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \
|
||||
assign dst.rsp_ready = src.rsp_ready
|
||||
|
||||
`define BUFFER_DCR_BUS_IF(dst, src, enable) \
|
||||
logic [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __``dst; \
|
||||
if (enable) begin \
|
||||
always @(posedge clk) begin \
|
||||
__``dst <= {src.write_valid, src.write_addr, src.write_data}; \
|
||||
end \
|
||||
end else begin \
|
||||
assign __``dst = {src.write_valid, src.write_addr, src.write_data}; \
|
||||
end \
|
||||
VX_dcr_bus_if dst(); \
|
||||
assign {dst.write_valid, dst.write_addr, dst.write_data} = __``dst
|
||||
|
||||
`define PERF_REDUCE(dst, src, field, width, count) \
|
||||
wire [count-1:0][width-1:0] __reduce_add_i_``src``field; \
|
||||
wire [width-1:0] __reduce_add_o_``dst``field; \
|
||||
reg [width-1:0] __reduce_add_r_``dst``field; \
|
||||
for (genvar __i = 0; __i < count; ++__i) begin \
|
||||
assign __reduce_add_i_``src``field[__i] = ``src[__i].``field; \
|
||||
end \
|
||||
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_``dst``field ( \
|
||||
__reduce_add_i_``src``field, \
|
||||
__reduce_add_o_``dst``field \
|
||||
); \
|
||||
always @(posedge clk) begin \
|
||||
if (reset) begin \
|
||||
__reduce_add_r_``dst``field <= '0; \
|
||||
end else begin \
|
||||
__reduce_add_r_``dst``field <= __reduce_add_o_``dst``field; \
|
||||
end \
|
||||
end \
|
||||
assign ``dst.``field = __reduce_add_r_``dst``field
|
||||
|
||||
`define PERF_CACHE_REDUCE(dst, src, count) \
|
||||
`PERF_REDUCE (dst, src, reads, `PERF_CTR_BITS, count); \
|
||||
`PERF_REDUCE (dst, src, writes, `PERF_CTR_BITS, count); \
|
||||
`PERF_REDUCE (dst, src, read_misses, `PERF_CTR_BITS, count); \
|
||||
`PERF_REDUCE (dst, src, write_misses, `PERF_CTR_BITS, count); \
|
||||
`PERF_REDUCE (dst, src, bank_stalls, `PERF_CTR_BITS, count); \
|
||||
`PERF_REDUCE (dst, src, mshr_stalls, `PERF_CTR_BITS, count); \
|
||||
`PERF_REDUCE (dst, src, mem_stalls, `PERF_CTR_BITS, count); \
|
||||
`PERF_REDUCE (dst, src, crsp_stalls, `PERF_CTR_BITS, count)
|
||||
|
||||
`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \
|
||||
if (block_size != 1) begin \
|
||||
if (block_size != `NUM_WARPS) begin \
|
||||
assign dst = {src[`NW_WIDTH-1:`CLOG2(block_size)], `CLOG2(block_size)'(block_idx)}; \
|
||||
end else begin \
|
||||
assign dst = `NW_WIDTH'(block_idx); \
|
||||
end \
|
||||
end else begin \
|
||||
assign dst = src; \
|
||||
end
|
||||
|
||||
`define TO_DISPATCH_DATA(data, tid) \
|
||||
{data.uuid, data.wis, data.tmask, data.op_type, data.op_mod, data.wb, data.use_PC, data.use_imm, data.PC, data.imm, data.rd, tid, data.rs1_data, data.rs2_data, data.rs3_data}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`endif // VX_DEFINE_VH
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`ifndef VX_DEFINE_VH
|
||||
`define VX_DEFINE_VH
|
||||
|
||||
`include "VX_platform.vh"
|
||||
`include "VX_config.vh"
|
||||
`include "VX_types.vh"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define NW_BITS `CLOG2(`NUM_WARPS)
|
||||
`define NC_WIDTH `UP(`NC_BITS)
|
||||
|
||||
`define NT_BITS `CLOG2(`NUM_THREADS)
|
||||
`define NW_WIDTH `UP(`NW_BITS)
|
||||
|
||||
`define NC_BITS `CLOG2(`NUM_CORES)
|
||||
`define NT_WIDTH `UP(`NT_BITS)
|
||||
|
||||
`define NB_BITS `CLOG2(`NUM_BARRIERS)
|
||||
`define NB_WIDTH `UP(`NB_BITS)
|
||||
|
||||
`define NUM_IREGS 32
|
||||
|
||||
`define NRI_BITS `CLOG2(`NUM_IREGS)
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
`define NUM_REGS (2 * `NUM_IREGS)
|
||||
`else
|
||||
`define NUM_REGS `NUM_IREGS
|
||||
`endif
|
||||
|
||||
`define NR_BITS `CLOG2(`NUM_REGS)
|
||||
|
||||
`define PERF_CTR_BITS 44
|
||||
|
||||
`ifndef NDEBUG
|
||||
`define UUID_WIDTH 44
|
||||
`else
|
||||
`define UUID_WIDTH 1
|
||||
`endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define EX_ALU 0
|
||||
`define EX_LSU 1
|
||||
`define EX_SFU 2
|
||||
`define EX_FPU (`EX_SFU + `EXT_F_ENABLED)
|
||||
|
||||
`define NUM_EX_UNITS (3 + `EXT_F_ENABLED)
|
||||
`define EX_BITS `CLOG2(`NUM_EX_UNITS)
|
||||
`define EX_WIDTH `UP(`EX_BITS)
|
||||
|
||||
`define SFU_CSRS 0
|
||||
`define SFU_WCTL 1
|
||||
|
||||
`define NUM_SFU_UNITS (2)
|
||||
`define SFU_BITS `CLOG2(`NUM_SFU_UNITS)
|
||||
`define SFU_WIDTH `UP(`SFU_BITS)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_LUI 7'b0110111
|
||||
`define INST_AUIPC 7'b0010111
|
||||
`define INST_JAL 7'b1101111
|
||||
`define INST_JALR 7'b1100111
|
||||
`define INST_B 7'b1100011 // branch instructions
|
||||
`define INST_L 7'b0000011 // load instructions
|
||||
`define INST_S 7'b0100011 // store instructions
|
||||
`define INST_I 7'b0010011 // immediate instructions
|
||||
`define INST_R 7'b0110011 // register instructions
|
||||
`define INST_FENCE 7'b0001111 // Fence instructions
|
||||
`define INST_SYS 7'b1110011 // system instructions
|
||||
|
||||
// RV64I instruction specific opcodes (for any W instruction)
|
||||
`define INST_I_W 7'b0011011 // W type immediate instructions
|
||||
`define INST_R_W 7'b0111011 // W type register instructions
|
||||
|
||||
`define INST_FL 7'b0000111 // float load instruction
|
||||
`define INST_FS 7'b0100111 // float store instruction
|
||||
`define INST_FMADD 7'b1000011
|
||||
`define INST_FMSUB 7'b1000111
|
||||
`define INST_FNMSUB 7'b1001011
|
||||
`define INST_FNMADD 7'b1001111
|
||||
`define INST_FCI 7'b1010011 // float common instructions
|
||||
|
||||
// Custom extension opcodes
|
||||
`define INST_EXT1 7'b0001011 // 0x0B
|
||||
`define INST_EXT2 7'b0101011 // 0x2B
|
||||
`define INST_EXT3 7'b1011011 // 0x5B
|
||||
`define INST_EXT4 7'b1111011 // 0x7B
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_FRM_RNE 3'b000 // round to nearest even
|
||||
`define INST_FRM_RTZ 3'b001 // round to zero
|
||||
`define INST_FRM_RDN 3'b010 // round to -inf
|
||||
`define INST_FRM_RUP 3'b011 // round to +inf
|
||||
`define INST_FRM_RMM 3'b100 // round to nearest max magnitude
|
||||
`define INST_FRM_DYN 3'b111 // dynamic mode
|
||||
`define INST_FRM_BITS 3
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_OP_BITS 4
|
||||
`define INST_MOD_BITS 3
|
||||
`define INST_FMT_BITS 2
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_ALU_ADD 4'b0000
|
||||
`define INST_ALU_LUI 4'b0010
|
||||
`define INST_ALU_AUIPC 4'b0011
|
||||
`define INST_ALU_SLTU 4'b0100
|
||||
`define INST_ALU_SLT 4'b0101
|
||||
`define INST_ALU_SUB 4'b0111
|
||||
`define INST_ALU_SRL 4'b1000
|
||||
`define INST_ALU_SRA 4'b1001
|
||||
`define INST_ALU_AND 4'b1100
|
||||
`define INST_ALU_OR 4'b1101
|
||||
`define INST_ALU_XOR 4'b1110
|
||||
`define INST_ALU_SLL 4'b1111
|
||||
`define INST_ALU_OTHER 4'b0111
|
||||
`define INST_ALU_BITS 4
|
||||
`define INST_ALU_CLASS(op) op[3:2]
|
||||
`define INST_ALU_SIGNED(op) op[0]
|
||||
`define INST_ALU_IS_SUB(op) op[1]
|
||||
`define INST_ALU_IS_BR(mod) mod[0]
|
||||
`define INST_ALU_IS_M(mod) mod[1]
|
||||
`define INST_ALU_IS_W(mod) mod[2]
|
||||
|
||||
`define INST_BR_EQ 4'b0000
|
||||
`define INST_BR_NE 4'b0010
|
||||
`define INST_BR_LTU 4'b0100
|
||||
`define INST_BR_GEU 4'b0110
|
||||
`define INST_BR_LT 4'b0101
|
||||
`define INST_BR_GE 4'b0111
|
||||
`define INST_BR_JAL 4'b1000
|
||||
`define INST_BR_JALR 4'b1001
|
||||
`define INST_BR_ECALL 4'b1010
|
||||
`define INST_BR_EBREAK 4'b1011
|
||||
`define INST_BR_URET 4'b1100
|
||||
`define INST_BR_SRET 4'b1101
|
||||
`define INST_BR_MRET 4'b1110
|
||||
`define INST_BR_OTHER 4'b1111
|
||||
`define INST_BR_BITS 4
|
||||
`define INST_BR_CLASS(op) {1'b0, ~op[3]}
|
||||
`define INST_BR_IS_NEG(op) op[1]
|
||||
`define INST_BR_IS_LESS(op) op[2]
|
||||
`define INST_BR_IS_STATIC(op) op[3]
|
||||
|
||||
`define INST_M_MUL 3'b000
|
||||
`define INST_M_MULHU 3'b001
|
||||
`define INST_M_MULH 3'b010
|
||||
`define INST_M_MULHSU 3'b011
|
||||
`define INST_M_DIV 3'b100
|
||||
`define INST_M_DIVU 3'b101
|
||||
`define INST_M_REM 3'b110
|
||||
`define INST_M_REMU 3'b111
|
||||
`define INST_M_BITS 3
|
||||
`define INST_M_SIGNED(op) (~op[0])
|
||||
`define INST_M_IS_MULX(op) (~op[2])
|
||||
`define INST_M_IS_MULH(op) (op[1:0] != 0)
|
||||
`define INST_M_SIGNED_A(op) (op[1:0] != 1)
|
||||
`define INST_M_IS_REM(op) op[1]
|
||||
|
||||
`define INST_FMT_B 3'b000
|
||||
`define INST_FMT_H 3'b001
|
||||
`define INST_FMT_W 3'b010
|
||||
`define INST_FMT_D 3'b011
|
||||
`define INST_FMT_BU 3'b100
|
||||
`define INST_FMT_HU 3'b101
|
||||
`define INST_FMT_WU 3'b110
|
||||
|
||||
`define INST_LSU_LB 4'b0000
|
||||
`define INST_LSU_LH 4'b0001
|
||||
`define INST_LSU_LW 4'b0010
|
||||
`define INST_LSU_LD 4'b0011 // new for RV64I LD
|
||||
`define INST_LSU_LBU 4'b0100
|
||||
`define INST_LSU_LHU 4'b0101
|
||||
`define INST_LSU_LWU 4'b0110 // new for RV64I LWU
|
||||
`define INST_LSU_SB 4'b1000
|
||||
`define INST_LSU_SH 4'b1001
|
||||
`define INST_LSU_SW 4'b1010
|
||||
`define INST_LSU_SD 4'b1011 // new for RV64I SD
|
||||
`define INST_LSU_FENCE 4'b1111
|
||||
`define INST_LSU_BITS 4
|
||||
`define INST_LSU_FMT(op) op[2:0]
|
||||
`define INST_LSU_WSIZE(op) op[1:0]
|
||||
`define INST_LSU_IS_FENCE(op) (op[3:2] == 3)
|
||||
|
||||
`define INST_FENCE_BITS 1
|
||||
`define INST_FENCE_D 1'h0
|
||||
`define INST_FENCE_I 1'h1
|
||||
|
||||
`define INST_FPU_ADD 4'b0000
|
||||
`define INST_FPU_SUB 4'b0001
|
||||
`define INST_FPU_MUL 4'b0010
|
||||
`define INST_FPU_DIV 4'b0011
|
||||
`define INST_FPU_SQRT 4'b0100
|
||||
`define INST_FPU_CMP 4'b0101 // mod: LE=0, LT=1, EQ=2
|
||||
`define INST_FPU_F2F 4'b0110
|
||||
`define INST_FPU_MISC 4'b0111 // mod: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
|
||||
`define INST_FPU_F2I 4'b1000
|
||||
`define INST_FPU_F2U 4'b1001
|
||||
`define INST_FPU_I2F 4'b1010
|
||||
`define INST_FPU_U2F 4'b1011
|
||||
`define INST_FPU_MADD 4'b1100
|
||||
`define INST_FPU_MSUB 4'b1101
|
||||
`define INST_FPU_NMSUB 4'b1110
|
||||
`define INST_FPU_NMADD 4'b1111
|
||||
`define INST_FPU_BITS 4
|
||||
`define INST_FPU_IS_W(mod) (mod[4])
|
||||
`define INST_FPU_IS_CLASS(op, mod) (op == `INST_FPU_MISC && mod == 3)
|
||||
`define INST_FPU_IS_MVXW(op, mod) (op == `INST_FPU_MISC && mod == 4)
|
||||
|
||||
`define INST_SFU_TMC 4'h0
|
||||
`define INST_SFU_WSPAWN 4'h1
|
||||
`define INST_SFU_SPLIT 4'h2
|
||||
`define INST_SFU_JOIN 4'h3
|
||||
`define INST_SFU_BAR 4'h4
|
||||
`define INST_SFU_PRED 4'h5
|
||||
`define INST_SFU_CSRRW 4'h6
|
||||
`define INST_SFU_CSRRS 4'h7
|
||||
`define INST_SFU_CSRRC 4'h8
|
||||
`define INST_SFU_CMOV 4'h9
|
||||
`define INST_SFU_BITS 4
|
||||
`define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1)
|
||||
`define INST_SFU_IS_WCTL(op) (op <= 5)
|
||||
`define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// non-cacheable tag bits
|
||||
`define NC_TAG_BITS 1
|
||||
|
||||
// cache address type bits
|
||||
`ifdef SM_ENABLE
|
||||
`define CACHE_ADDR_TYPE_BITS (`NC_TAG_BITS + 1)
|
||||
`else
|
||||
`define CACHE_ADDR_TYPE_BITS `NC_TAG_BITS
|
||||
`endif
|
||||
|
||||
`define ARB_SEL_BITS(I, O) ((I > O) ? `CLOG2((I + O - 1) / O) : 0)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks) \
|
||||
(`CLOG2(mshr_size) + `CLOG2(num_banks) + `NC_TAG_BITS)
|
||||
|
||||
`define CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
|
||||
(`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width)
|
||||
|
||||
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
|
||||
(`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) + `NC_TAG_BITS)
|
||||
|
||||
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width) \
|
||||
`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width))
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches) \
|
||||
(tag_width + `ARB_SEL_BITS(num_inputs, `UP(num_caches)))
|
||||
|
||||
`define CACHE_CLUSTER_MEM_ARB_TAG(tag_width, num_caches) \
|
||||
(tag_width + `ARB_SEL_BITS(`UP(num_caches), 1))
|
||||
|
||||
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches)
|
||||
|
||||
`define CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG((`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
|
||||
|
||||
`define CACHE_CLUSTER_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG((`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)) + `NC_TAG_BITS), num_caches)
|
||||
|
||||
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches))), num_caches)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`ifdef L2_ENABLE
|
||||
`define L2_LINE_SIZE `MEM_BLOCK_SIZE
|
||||
`else
|
||||
`define L2_LINE_SIZE `L1_LINE_SIZE
|
||||
`endif
|
||||
|
||||
`ifdef L3_ENABLE
|
||||
`define L3_LINE_SIZE `MEM_BLOCK_SIZE
|
||||
`else
|
||||
`define L3_LINE_SIZE `L2_LINE_SIZE
|
||||
`endif
|
||||
|
||||
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
|
||||
`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE))
|
||||
`define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8)
|
||||
`define VX_MEM_TAG_WIDTH L3_MEM_TAG_WIDTH
|
||||
|
||||
`define VX_DCR_ADDR_WIDTH `VX_DCR_ADDR_BITS
|
||||
`define VX_DCR_DATA_WIDTH 32
|
||||
|
||||
`define TO_FULL_ADDR(x) {x, (`MEM_ADDR_WIDTH-$bits(x))'(0)}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define BUFFER_EX(dst, src, ena, latency) \
|
||||
VX_pipe_register #( \
|
||||
.DATAW ($bits(dst)), \
|
||||
.RESETW ($bits(dst)), \
|
||||
.DEPTH (latency) \
|
||||
) __``dst ( \
|
||||
.clk (clk), \
|
||||
.reset (reset), \
|
||||
.enable (ena), \
|
||||
.data_in (src), \
|
||||
.data_out (dst) \
|
||||
)
|
||||
|
||||
`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, 1)
|
||||
|
||||
`define POP_COUNT_EX(out, in, model) \
|
||||
VX_popcount #( \
|
||||
.N ($bits(in)), \
|
||||
.MODEL (model) \
|
||||
) __``out ( \
|
||||
.data_in (in), \
|
||||
.data_out (out) \
|
||||
)
|
||||
|
||||
`define POP_COUNT(out, in) `POP_COUNT_EX(out, in, 1)
|
||||
|
||||
`define ASSIGN_VX_MEM_BUS_IF(dst, src) \
|
||||
assign dst.req_valid = src.req_valid; \
|
||||
assign dst.req_data = src.req_data; \
|
||||
assign src.req_ready = dst.req_ready; \
|
||||
assign src.rsp_valid = dst.rsp_valid; \
|
||||
assign src.rsp_data = dst.rsp_data; \
|
||||
assign dst.rsp_ready = src.rsp_ready
|
||||
|
||||
`define ASSIGN_VX_MEM_BUS_IF_X(dst, src, TD, TS) \
|
||||
assign dst.req_valid = src.req_valid; \
|
||||
assign dst.req_data.rw = src.req_data.rw; \
|
||||
assign dst.req_data.byteen = src.req_data.byteen; \
|
||||
assign dst.req_data.addr = src.req_data.addr; \
|
||||
assign dst.req_data.data = src.req_data.data; \
|
||||
if (TD != TS) \
|
||||
assign dst.req_data.tag = {src.req_data.tag, {(TD-TS){1'b0}}}; \
|
||||
else \
|
||||
assign dst.req_data.tag = src.req_data.tag; \
|
||||
assign src.req_ready = dst.req_ready; \
|
||||
assign src.rsp_valid = dst.rsp_valid; \
|
||||
assign src.rsp_data.data = dst.rsp_data.data; \
|
||||
assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \
|
||||
assign dst.rsp_ready = src.rsp_ready
|
||||
|
||||
`define BUFFER_DCR_BUS_IF(dst, src, enable) \
|
||||
logic [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __``dst; \
|
||||
if (enable) begin \
|
||||
always @(posedge clk) begin \
|
||||
__``dst <= {src.write_valid, src.write_addr, src.write_data}; \
|
||||
end \
|
||||
end else begin \
|
||||
assign __``dst = {src.write_valid, src.write_addr, src.write_data}; \
|
||||
end \
|
||||
VX_dcr_bus_if dst(); \
|
||||
assign {dst.write_valid, dst.write_addr, dst.write_data} = __``dst
|
||||
|
||||
`define PERF_COUNTER_ADD(dst, src, field, width, dst_count, src_count, reg_enable) \
|
||||
for (genvar __d = 0; __d < dst_count; ++__d) begin \
|
||||
localparam __count = ((src_count > dst_count) ? ((src_count + dst_count - 1) / dst_count) : 1); \
|
||||
wire [__count-1:0][width-1:0] __reduce_add_i_``src``field; \
|
||||
wire [width-1:0] __reduce_add_o_``dst``field; \
|
||||
for (genvar __i = 0; __i < __count; ++__i) begin \
|
||||
assign __reduce_add_i_``src``field[__i] = ``src[__d * __count + __i].``field; \
|
||||
end \
|
||||
VX_reduce #(.DATAW_IN(width), .N(__count), .OP("+")) __reduce_add_``dst``field ( \
|
||||
__reduce_add_i_``src``field, \
|
||||
__reduce_add_o_``dst``field \
|
||||
); \
|
||||
if (reg_enable) begin \
|
||||
reg [width-1:0] __reduce_add_r_``dst``field; \
|
||||
always @(posedge clk) begin \
|
||||
if (reset) begin \
|
||||
__reduce_add_r_``dst``field <= '0; \
|
||||
end else begin \
|
||||
__reduce_add_r_``dst``field <= __reduce_add_o_``dst``field; \
|
||||
end \
|
||||
end \
|
||||
assign ``dst[__d].``field = __reduce_add_r_``dst``field; \
|
||||
end else begin \
|
||||
assign ``dst[__d].``field = __reduce_add_o_``dst``field; \
|
||||
end \
|
||||
end
|
||||
|
||||
`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \
|
||||
if (block_size != 1) begin \
|
||||
if (block_size != `NUM_WARPS) begin \
|
||||
assign dst = {src[`NW_WIDTH-1:`CLOG2(block_size)], `CLOG2(block_size)'(block_idx)}; \
|
||||
end else begin \
|
||||
assign dst = `NW_WIDTH'(block_idx); \
|
||||
end \
|
||||
end else begin \
|
||||
assign dst = src; \
|
||||
end
|
||||
|
||||
`define TO_DISPATCH_DATA(data, tid) { \
|
||||
data.uuid, \
|
||||
data.wis, \
|
||||
data.tmask, \
|
||||
data.op_type, \
|
||||
data.op_mod, \
|
||||
data.wb, \
|
||||
data.use_PC, \
|
||||
data.use_imm, \
|
||||
data.PC, \
|
||||
data.imm, \
|
||||
data.rd, \
|
||||
tid, \
|
||||
data.rs1_data, \
|
||||
data.rs2_data, \
|
||||
data.rs3_data}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`endif // VX_DEFINE_VH
|
||||
|
||||
@@ -99,7 +99,7 @@ package VX_gpu_pkg;
|
||||
`ifdef ICACHE_ENABLE
|
||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
|
||||
`else
|
||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `NUM_SOCKETS, `NUM_ICACHES);
|
||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
|
||||
`endif
|
||||
|
||||
////////////////////////// Dcache Parameters //////////////////////////////
|
||||
@@ -142,10 +142,13 @@ package VX_gpu_pkg;
|
||||
/////////////////////////////// L1 Parameters /////////////////////////////
|
||||
|
||||
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
|
||||
localparam L1_MEM_ARB_TAG_WIDTH = (L1_MEM_TAG_WIDTH + `CLOG2(2));
|
||||
|
||||
localparam L1_MEM_ARB_TAG_WIDTH = (L1_MEM_TAG_WIDTH + `CLOG2(2));
|
||||
|
||||
/////////////////////////////// L2 Parameters /////////////////////////////
|
||||
|
||||
localparam ICACHE_MEM_ARB_IDX = 0;
|
||||
localparam DCACHE_MEM_ARB_IDX = ICACHE_MEM_ARB_IDX + 1;
|
||||
|
||||
// Word size in bytes
|
||||
localparam L2_WORD_SIZE = `L1_LINE_SIZE;
|
||||
|
||||
@@ -190,42 +193,46 @@ package VX_gpu_pkg;
|
||||
|
||||
/////////////////////////////// Issue parameters //////////////////////////
|
||||
|
||||
localparam ISSUE_IDX_W = `LOG2UP(`ISSUE_WIDTH);
|
||||
localparam ISSUE_ISW = `CLOG2(`ISSUE_WIDTH);
|
||||
localparam ISSUE_ISW_W = `UP(ISSUE_ISW);
|
||||
localparam ISSUE_RATIO = `NUM_WARPS / `ISSUE_WIDTH;
|
||||
localparam ISSUE_WIS_W = `LOG2UP(ISSUE_RATIO);
|
||||
localparam ISSUE_ADDRW = `LOG2UP(`NUM_REGS * (ISSUE_RATIO));
|
||||
|
||||
localparam ISSUE_WIS = `CLOG2(ISSUE_RATIO);
|
||||
localparam ISSUE_WIS_W = `UP(ISSUE_WIS);
|
||||
|
||||
`IGNORE_UNUSED_BEGIN
|
||||
function logic [ISSUE_IDX_W-1:0] wid_to_isw(
|
||||
function logic [`NW_WIDTH-1:0] wis_to_wid(
|
||||
input logic [ISSUE_WIS_W-1:0] wis,
|
||||
input logic [ISSUE_ISW_W-1:0] isw
|
||||
);
|
||||
if (ISSUE_WIS == 0) begin
|
||||
wis_to_wid = `NW_WIDTH'(isw);
|
||||
end else if (ISSUE_ISW == 0) begin
|
||||
wis_to_wid = `NW_WIDTH'(wis);
|
||||
end else begin
|
||||
wis_to_wid = `NW_WIDTH'({wis, isw});
|
||||
end
|
||||
endfunction
|
||||
|
||||
function logic [ISSUE_ISW_W-1:0] wid_to_isw(
|
||||
input logic [`NW_WIDTH-1:0] wid
|
||||
);
|
||||
if (`ISSUE_WIDTH > 1) begin
|
||||
wid_to_isw = ISSUE_IDX_W'(wid);
|
||||
if (ISSUE_ISW != 0) begin
|
||||
wid_to_isw = wid[ISSUE_ISW_W-1:0];
|
||||
end else begin
|
||||
wid_to_isw = 0;
|
||||
end
|
||||
endfunction
|
||||
`IGNORE_UNUSED_END
|
||||
|
||||
function logic [`NW_WIDTH-1:0] wis_to_wid(
|
||||
input logic [ISSUE_WIS_W-1:0] wis,
|
||||
input logic [ISSUE_IDX_W-1:0] isw
|
||||
);
|
||||
wis_to_wid = `NW_WIDTH'({wis, isw} >> (ISSUE_IDX_W-`CLOG2(`ISSUE_WIDTH)));
|
||||
endfunction
|
||||
|
||||
function logic [ISSUE_WIS_W-1:0] wid_to_wis(
|
||||
input logic [`NW_WIDTH-1:0] wid
|
||||
);
|
||||
wid_to_wis = ISSUE_WIS_W'(wid >> `CLOG2(`ISSUE_WIDTH));
|
||||
endfunction
|
||||
|
||||
function logic [ISSUE_ADDRW-1:0] wis_to_addr(
|
||||
input logic [`NR_BITS-1:0] rid,
|
||||
input logic [ISSUE_WIS_W-1:0] wis
|
||||
);
|
||||
wis_to_addr = ISSUE_ADDRW'({rid, wis} >> (ISSUE_WIS_W-`CLOG2(ISSUE_RATIO)));
|
||||
if (ISSUE_WIS != 0) begin
|
||||
wid_to_wis = ISSUE_WIS_W'(wid >> ISSUE_ISW);
|
||||
end else begin
|
||||
wid_to_wis = 0;
|
||||
end
|
||||
endfunction
|
||||
`IGNORE_UNUSED_END
|
||||
|
||||
endpackage
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
`ifndef VX_PLATFORM_VH
|
||||
`define VX_PLATFORM_VH
|
||||
|
||||
`ifndef SYNTHESIS
|
||||
`ifdef SV_DPI
|
||||
`include "util_dpi.vh"
|
||||
`endif
|
||||
|
||||
|
||||
@@ -65,58 +65,11 @@ module VX_socket import VX_gpu_pkg::*; #(
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
cache_perf_t perf_icache;
|
||||
cache_perf_t perf_dcache;
|
||||
|
||||
assign mem_perf_tmp_if.icache = perf_icache;
|
||||
assign mem_perf_tmp_if.dcache = perf_dcache;
|
||||
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||
assign mem_perf_tmp_if.smem = 'x;
|
||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||
`endif
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (ICACHE_LINE_SIZE),
|
||||
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
|
||||
) icache_mem_bus_if();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_LINE_SIZE),
|
||||
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
|
||||
) dcache_mem_bus_if();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
|
||||
) cache_mem_bus_if[2]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
|
||||
) mem_bus_tmp_if[1]();
|
||||
|
||||
`ASSIGN_VX_MEM_BUS_IF_X (cache_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
|
||||
`ASSIGN_VX_MEM_BUS_IF_X (cache_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
|
||||
|
||||
`RESET_RELAY (mem_arb_reset, reset);
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
|
||||
.TAG_SEL_IDX (1), // Skip 0 for NC flag
|
||||
.ARBITER ("R"),
|
||||
.OUT_REG_REQ (2),
|
||||
.OUT_REG_RSP (2)
|
||||
) mem_arb (
|
||||
.clk (clk),
|
||||
.reset (mem_arb_reset),
|
||||
.bus_in_if (cache_mem_bus_if),
|
||||
.bus_out_if (mem_bus_tmp_if)
|
||||
);
|
||||
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if[0]);
|
||||
`endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@@ -125,6 +78,11 @@ module VX_socket import VX_gpu_pkg::*; #(
|
||||
.TAG_WIDTH (ICACHE_TAG_WIDTH)
|
||||
) per_core_icache_bus_if[`SOCKET_SIZE]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (ICACHE_LINE_SIZE),
|
||||
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
|
||||
) icache_mem_bus_if();
|
||||
|
||||
`RESET_RELAY (icache_reset, reset);
|
||||
|
||||
VX_cache_cluster #(
|
||||
@@ -149,7 +107,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
||||
.MEM_OUT_REG (2)
|
||||
) icache (
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (perf_icache),
|
||||
.cache_perf (mem_perf_tmp_if.icache),
|
||||
`endif
|
||||
.clk (clk),
|
||||
.reset (icache_reset),
|
||||
@@ -160,9 +118,14 @@ module VX_socket import VX_gpu_pkg::*; #(
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
|
||||
) per_core_dcache_bus_if[`SOCKET_SIZE * DCACHE_NUM_REQS]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_LINE_SIZE),
|
||||
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
|
||||
) dcache_mem_bus_if();
|
||||
|
||||
`RESET_RELAY (dcache_reset, reset);
|
||||
|
||||
@@ -189,7 +152,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
||||
.MEM_OUT_REG (2)
|
||||
) dcache (
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (perf_dcache),
|
||||
.cache_perf (mem_perf_tmp_if.dcache),
|
||||
`endif
|
||||
.clk (clk),
|
||||
.reset (dcache_reset),
|
||||
@@ -197,6 +160,40 @@ module VX_socket import VX_gpu_pkg::*; #(
|
||||
.mem_bus_if (dcache_mem_bus_if)
|
||||
);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
|
||||
) l1_mem_bus_if[2]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
|
||||
) l1_mem_arb_bus_if[1]();
|
||||
|
||||
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
|
||||
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
|
||||
|
||||
`RESET_RELAY (mem_arb_reset, reset);
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
|
||||
.TAG_SEL_IDX (1), // Skip 0 for NC flag
|
||||
.ARBITER ("R"),
|
||||
.OUT_REG_REQ (2),
|
||||
.OUT_REG_RSP (2)
|
||||
) mem_arb (
|
||||
.clk (clk),
|
||||
.reset (mem_arb_reset),
|
||||
.bus_in_if (l1_mem_bus_if),
|
||||
.bus_out_if (l1_mem_arb_bus_if)
|
||||
);
|
||||
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, l1_mem_arb_bus_if[0]);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [`SOCKET_SIZE-1:0] per_core_sim_ebreak;
|
||||
@@ -245,6 +242,6 @@ module VX_socket import VX_gpu_pkg::*; #(
|
||||
);
|
||||
end
|
||||
|
||||
`BUFFER_BUSY (busy, (| per_core_busy), (`SOCKET_SIZE > 1));
|
||||
`BUFFER_EX(busy, (| per_core_busy), 1'b1, (`SOCKET_SIZE > 1));
|
||||
|
||||
endmodule
|
||||
|
||||
@@ -58,6 +58,8 @@
|
||||
|
||||
`define VX_CSR_MPM_BASE 12'hB00
|
||||
`define VX_CSR_MPM_BASE_H 12'hB80
|
||||
`define VX_CSR_MPM_USER 12'hB03
|
||||
`define VX_CSR_MPM_USER_H 12'hB83
|
||||
|
||||
// Machine Performance-monitoring core counters
|
||||
// PERF: Standard
|
||||
@@ -68,29 +70,38 @@
|
||||
`define VX_CSR_MINSTRET 12'hB02
|
||||
`define VX_CSR_MINSTRET_H 12'hB82
|
||||
// PERF: pipeline
|
||||
`define VX_CSR_MPM_IBUF_ST 12'hB03
|
||||
`define VX_CSR_MPM_IBUF_ST_H 12'hB83
|
||||
`define VX_CSR_MPM_SCRB_ST 12'hB04
|
||||
`define VX_CSR_MPM_SCRB_ST_H 12'hB84
|
||||
`define VX_CSR_MPM_ALU_ST 12'hB05
|
||||
`define VX_CSR_MPM_ALU_ST_H 12'hB85
|
||||
`define VX_CSR_MPM_LSU_ST 12'hB06
|
||||
`define VX_CSR_MPM_LSU_ST_H 12'hB86
|
||||
`define VX_CSR_MPM_FPU_ST 12'hB07
|
||||
`define VX_CSR_MPM_FPU_ST_H 12'hB87
|
||||
`define VX_CSR_MPM_SFU_ST 12'hB08
|
||||
`define VX_CSR_MPM_SFU_ST_H 12'hB88
|
||||
`define VX_CSR_MPM_SCHED_ID 12'hB03
|
||||
`define VX_CSR_MPM_SCHED_ID_H 12'hB83
|
||||
`define VX_CSR_MPM_SCHED_ST 12'hB04
|
||||
`define VX_CSR_MPM_SCHED_ST_H 12'hB84
|
||||
`define VX_CSR_MPM_IBUF_ST 12'hB05
|
||||
`define VX_CSR_MPM_IBUF_ST_H 12'hB85
|
||||
`define VX_CSR_MPM_SCRB_ST 12'hB06
|
||||
`define VX_CSR_MPM_SCRB_ST_H 12'hB86
|
||||
`define VX_CSR_MPM_SCRB_ALU 12'hB07
|
||||
`define VX_CSR_MPM_SCRB_ALU_H 12'hB87
|
||||
`define VX_CSR_MPM_SCRB_FPU 12'hB08
|
||||
`define VX_CSR_MPM_SCRB_FPU_H 12'hB88
|
||||
`define VX_CSR_MPM_SCRB_LSU 12'hB09
|
||||
`define VX_CSR_MPM_SCRB_LSU_H 12'hB89
|
||||
`define VX_CSR_MPM_SCRB_SFU 12'hB0A
|
||||
`define VX_CSR_MPM_SCRB_SFU_H 12'hB8A
|
||||
// PERF: memory
|
||||
`define VX_CSR_MPM_IFETCHES 12'hB0A
|
||||
`define VX_CSR_MPM_IFETCHES_H 12'hB8A
|
||||
`define VX_CSR_MPM_LOADS 12'hB0B
|
||||
`define VX_CSR_MPM_LOADS_H 12'hB8B
|
||||
`define VX_CSR_MPM_STORES 12'hB0C
|
||||
`define VX_CSR_MPM_STORES_H 12'hB8C
|
||||
`define VX_CSR_MPM_IFETCH_LAT 12'hB0D
|
||||
`define VX_CSR_MPM_IFETCH_LAT_H 12'hB8D
|
||||
`define VX_CSR_MPM_LOAD_LAT 12'hB0E
|
||||
`define VX_CSR_MPM_LOAD_LAT_H 12'hB8E
|
||||
`define VX_CSR_MPM_IFETCHES 12'hB0B
|
||||
`define VX_CSR_MPM_IFETCHES_H 12'hB8B
|
||||
`define VX_CSR_MPM_LOADS 12'hB0C
|
||||
`define VX_CSR_MPM_LOADS_H 12'hB8C
|
||||
`define VX_CSR_MPM_STORES 12'hB0D
|
||||
`define VX_CSR_MPM_STORES_H 12'hB8D
|
||||
`define VX_CSR_MPM_IFETCH_LT 12'hB0E
|
||||
`define VX_CSR_MPM_IFETCH_LT_H 12'hB8E
|
||||
`define VX_CSR_MPM_LOAD_LT 12'hB0F
|
||||
`define VX_CSR_MPM_LOAD_LT_H 12'hB8F
|
||||
// SFU: scoreboard
|
||||
`define VX_CSR_MPM_SCRB_WCTL 12'hB10
|
||||
`define VX_CSR_MPM_SCRB_WCTL_H 12'hB90
|
||||
`define VX_CSR_MPM_SCRB_CSRS 12'hB11
|
||||
`define VX_CSR_MPM_SCRB_CSRS_H 12'hB91
|
||||
|
||||
// Machine Performance-monitoring memory counters
|
||||
// PERF: icache
|
||||
@@ -98,59 +109,61 @@
|
||||
`define VX_CSR_MPM_ICACHE_READS_H 12'hB83
|
||||
`define VX_CSR_MPM_ICACHE_MISS_R 12'hB04 // read misses
|
||||
`define VX_CSR_MPM_ICACHE_MISS_R_H 12'hB84
|
||||
`define VX_CSR_MPM_ICACHE_MSHR_ST 12'hB05 // MSHR stalls
|
||||
`define VX_CSR_MPM_ICACHE_MSHR_ST_H 12'hB85
|
||||
// PERF: dcache
|
||||
`define VX_CSR_MPM_DCACHE_READS 12'hB05 // total reads
|
||||
`define VX_CSR_MPM_DCACHE_READS_H 12'hB85
|
||||
`define VX_CSR_MPM_DCACHE_WRITES 12'hB06 // total writes
|
||||
`define VX_CSR_MPM_DCACHE_WRITES_H 12'hB86
|
||||
`define VX_CSR_MPM_DCACHE_MISS_R 12'hB07 // read misses
|
||||
`define VX_CSR_MPM_DCACHE_MISS_R_H 12'hB87
|
||||
`define VX_CSR_MPM_DCACHE_MISS_W 12'hB08 // write misses
|
||||
`define VX_CSR_MPM_DCACHE_MISS_W_H 12'hB88
|
||||
`define VX_CSR_MPM_DCACHE_BANK_ST 12'hB09 // bank conflicts
|
||||
`define VX_CSR_MPM_DCACHE_BANK_ST_H 12'hB89
|
||||
`define VX_CSR_MPM_DCACHE_MSHR_ST 12'hB0A // MSHR stalls
|
||||
`define VX_CSR_MPM_DCACHE_MSHR_ST_H 12'hB8A
|
||||
// PERF: smem
|
||||
`define VX_CSR_MPM_SMEM_READS 12'hB0B // memory reads
|
||||
`define VX_CSR_MPM_SMEM_READS_H 12'hB8B
|
||||
`define VX_CSR_MPM_SMEM_WRITES 12'hB0C // memory writes
|
||||
`define VX_CSR_MPM_SMEM_WRITES_H 12'hB8C
|
||||
`define VX_CSR_MPM_SMEM_BANK_ST 12'hB0D // bank conflicts
|
||||
`define VX_CSR_MPM_SMEM_BANK_ST_H 12'hB8D
|
||||
`define VX_CSR_MPM_DCACHE_READS 12'hB06 // total reads
|
||||
`define VX_CSR_MPM_DCACHE_READS_H 12'hB86
|
||||
`define VX_CSR_MPM_DCACHE_WRITES 12'hB07 // total writes
|
||||
`define VX_CSR_MPM_DCACHE_WRITES_H 12'hB87
|
||||
`define VX_CSR_MPM_DCACHE_MISS_R 12'hB08 // read misses
|
||||
`define VX_CSR_MPM_DCACHE_MISS_R_H 12'hB88
|
||||
`define VX_CSR_MPM_DCACHE_MISS_W 12'hB09 // write misses
|
||||
`define VX_CSR_MPM_DCACHE_MISS_W_H 12'hB89
|
||||
`define VX_CSR_MPM_DCACHE_BANK_ST 12'hB0A // bank conflicts
|
||||
`define VX_CSR_MPM_DCACHE_BANK_ST_H 12'hB8A
|
||||
`define VX_CSR_MPM_DCACHE_MSHR_ST 12'hB0B // MSHR stalls
|
||||
`define VX_CSR_MPM_DCACHE_MSHR_ST_H 12'hB8B
|
||||
// PERF: l2cache
|
||||
`define VX_CSR_MPM_L2CACHE_READS 12'hB0E // total reads
|
||||
`define VX_CSR_MPM_L2CACHE_READS_H 12'hB8E
|
||||
`define VX_CSR_MPM_L2CACHE_WRITES 12'hB0F // total writes
|
||||
`define VX_CSR_MPM_L2CACHE_WRITES_H 12'hB8F
|
||||
`define VX_CSR_MPM_L2CACHE_MISS_R 12'hB10 // read misses
|
||||
`define VX_CSR_MPM_L2CACHE_MISS_R_H 12'hB90
|
||||
`define VX_CSR_MPM_L2CACHE_MISS_W 12'hB11 // write misses
|
||||
`define VX_CSR_MPM_L2CACHE_MISS_W_H 12'hB91
|
||||
`define VX_CSR_MPM_L2CACHE_BANK_ST 12'hB12 // bank conflicts
|
||||
`define VX_CSR_MPM_L2CACHE_BANK_ST_H 12'hB92
|
||||
`define VX_CSR_MPM_L2CACHE_MSHR_ST 12'hB13 // MSHR stalls
|
||||
`define VX_CSR_MPM_L2CACHE_MSHR_ST_H 12'hB93
|
||||
`define VX_CSR_MPM_L2CACHE_READS 12'hB0C // total reads
|
||||
`define VX_CSR_MPM_L2CACHE_READS_H 12'hB8C
|
||||
`define VX_CSR_MPM_L2CACHE_WRITES 12'hB0D // total writes
|
||||
`define VX_CSR_MPM_L2CACHE_WRITES_H 12'hB8D
|
||||
`define VX_CSR_MPM_L2CACHE_MISS_R 12'hB0E // read misses
|
||||
`define VX_CSR_MPM_L2CACHE_MISS_R_H 12'hB8E
|
||||
`define VX_CSR_MPM_L2CACHE_MISS_W 12'hB0F // write misses
|
||||
`define VX_CSR_MPM_L2CACHE_MISS_W_H 12'hB8F
|
||||
`define VX_CSR_MPM_L2CACHE_BANK_ST 12'hB10 // bank conflicts
|
||||
`define VX_CSR_MPM_L2CACHE_BANK_ST_H 12'hB90
|
||||
`define VX_CSR_MPM_L2CACHE_MSHR_ST 12'hB11 // MSHR stalls
|
||||
`define VX_CSR_MPM_L2CACHE_MSHR_ST_H 12'hB91
|
||||
// PERF: l3cache
|
||||
`define VX_CSR_MPM_L3CACHE_READS 12'hB14 // total reads
|
||||
`define VX_CSR_MPM_L3CACHE_READS_H 12'hB94
|
||||
`define VX_CSR_MPM_L3CACHE_WRITES 12'hB15 // total writes
|
||||
`define VX_CSR_MPM_L3CACHE_WRITES_H 12'hB95
|
||||
`define VX_CSR_MPM_L3CACHE_MISS_R 12'hB16 // read misses
|
||||
`define VX_CSR_MPM_L3CACHE_MISS_R_H 12'hB96
|
||||
`define VX_CSR_MPM_L3CACHE_MISS_W 12'hB17 // write misses
|
||||
`define VX_CSR_MPM_L3CACHE_MISS_W_H 12'hB97
|
||||
`define VX_CSR_MPM_L3CACHE_BANK_ST 12'hB18 // bank conflicts
|
||||
`define VX_CSR_MPM_L3CACHE_BANK_ST_H 12'hB98
|
||||
`define VX_CSR_MPM_L3CACHE_MSHR_ST 12'hB19 // MSHR stalls
|
||||
`define VX_CSR_MPM_L3CACHE_MSHR_ST_H 12'hB99
|
||||
`define VX_CSR_MPM_L3CACHE_READS 12'hB12 // total reads
|
||||
`define VX_CSR_MPM_L3CACHE_READS_H 12'hB92
|
||||
`define VX_CSR_MPM_L3CACHE_WRITES 12'hB13 // total writes
|
||||
`define VX_CSR_MPM_L3CACHE_WRITES_H 12'hB93
|
||||
`define VX_CSR_MPM_L3CACHE_MISS_R 12'hB14 // read misses
|
||||
`define VX_CSR_MPM_L3CACHE_MISS_R_H 12'hB94
|
||||
`define VX_CSR_MPM_L3CACHE_MISS_W 12'hB15 // write misses
|
||||
`define VX_CSR_MPM_L3CACHE_MISS_W_H 12'hB95
|
||||
`define VX_CSR_MPM_L3CACHE_BANK_ST 12'hB16 // bank conflicts
|
||||
`define VX_CSR_MPM_L3CACHE_BANK_ST_H 12'hB96
|
||||
`define VX_CSR_MPM_L3CACHE_MSHR_ST 12'hB17 // MSHR stalls
|
||||
`define VX_CSR_MPM_L3CACHE_MSHR_ST_H 12'hB97
|
||||
// PERF: memory
|
||||
`define VX_CSR_MPM_MEM_READS 12'hB1A // total reads
|
||||
`define VX_CSR_MPM_MEM_READS_H 12'hB9A
|
||||
`define VX_CSR_MPM_MEM_WRITES 12'hB1B // total writes
|
||||
`define VX_CSR_MPM_MEM_WRITES_H 12'hB9B
|
||||
`define VX_CSR_MPM_MEM_LAT 12'hB1C // memory latency
|
||||
`define VX_CSR_MPM_MEM_LAT_H 12'hB9C
|
||||
`define VX_CSR_MPM_MEM_READS 12'hB18 // total reads
|
||||
`define VX_CSR_MPM_MEM_READS_H 12'hB98
|
||||
`define VX_CSR_MPM_MEM_WRITES 12'hB19 // total writes
|
||||
`define VX_CSR_MPM_MEM_WRITES_H 12'hB99
|
||||
`define VX_CSR_MPM_MEM_LT 12'hB1A // memory latency
|
||||
`define VX_CSR_MPM_MEM_LT_H 12'hB9A
|
||||
// PERF: smem
|
||||
`define VX_CSR_MPM_SMEM_READS 12'hB1B // memory reads
|
||||
`define VX_CSR_MPM_SMEM_READS_H 12'hB9B
|
||||
`define VX_CSR_MPM_SMEM_WRITES 12'hB1C // memory writes
|
||||
`define VX_CSR_MPM_SMEM_WRITES_H 12'hB9C
|
||||
`define VX_CSR_MPM_SMEM_BANK_ST 12'hB1D // bank conflicts
|
||||
`define VX_CSR_MPM_SMEM_BANK_ST_H 12'hB9D
|
||||
|
||||
// Machine Information Registers
|
||||
|
||||
|
||||
@@ -22,15 +22,15 @@ module Vortex import VX_gpu_pkg::*; (
|
||||
|
||||
// Memory request
|
||||
output wire mem_req_valid,
|
||||
output wire mem_req_rw,
|
||||
output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
|
||||
output wire mem_req_rw,
|
||||
output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
|
||||
output wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr,
|
||||
output wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data,
|
||||
output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag,
|
||||
input wire mem_req_ready,
|
||||
|
||||
// Memory response
|
||||
input wire mem_rsp_valid,
|
||||
input wire mem_rsp_valid,
|
||||
input wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data,
|
||||
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
|
||||
output wire mem_rsp_ready,
|
||||
@@ -45,17 +45,11 @@ module Vortex import VX_gpu_pkg::*; (
|
||||
);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_if();
|
||||
cache_perf_t perf_l3cache;
|
||||
mem_perf_t mem_perf;
|
||||
|
||||
assign mem_perf_if.icache = 'x;
|
||||
assign mem_perf_if.dcache = 'x;
|
||||
VX_mem_perf_if mem_perf_if();
|
||||
assign mem_perf_if.icache = 'x;
|
||||
assign mem_perf_if.dcache = 'x;
|
||||
assign mem_perf_if.l2cache = 'x;
|
||||
assign mem_perf_if.l3cache = perf_l3cache;
|
||||
assign mem_perf_if.smem = 'x;
|
||||
assign mem_perf_if.mem = mem_perf;
|
||||
`endif
|
||||
`endif
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L2_LINE_SIZE),
|
||||
@@ -93,7 +87,7 @@ module Vortex import VX_gpu_pkg::*; (
|
||||
.reset (l3_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (perf_l3cache),
|
||||
.cache_perf (mem_perf_if.l3cache),
|
||||
`endif
|
||||
|
||||
.core_bus_if (per_cluster_mem_bus_if),
|
||||
@@ -166,11 +160,12 @@ module Vortex import VX_gpu_pkg::*; (
|
||||
);
|
||||
end
|
||||
|
||||
`BUFFER_BUSY (busy, (| per_cluster_busy), (`NUM_CLUSTERS > 1));
|
||||
`BUFFER_EX(busy, (| per_cluster_busy), 1'b1, (`NUM_CLUSTERS > 1));
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
|
||||
mem_perf_t mem_perf;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
@@ -181,19 +176,19 @@ module Vortex import VX_gpu_pkg::*; (
|
||||
end
|
||||
end
|
||||
|
||||
wire mem_rd_req_fire = mem_req_fire && ~mem_bus_if.req_data.rw;
|
||||
wire mem_wr_req_fire = mem_req_fire && mem_bus_if.req_data.rw;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
mem_perf <= '0;
|
||||
end else begin
|
||||
if (mem_req_fire && ~mem_bus_if.req_data.rw) begin
|
||||
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(1);
|
||||
end
|
||||
if (mem_req_fire && mem_bus_if.req_data.rw) begin
|
||||
mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(1);
|
||||
end
|
||||
end else begin
|
||||
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(mem_rd_req_fire);
|
||||
mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(mem_wr_req_fire);
|
||||
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
|
||||
end
|
||||
end
|
||||
assign mem_perf_if.mem = mem_perf;
|
||||
|
||||
`endif
|
||||
|
||||
|
||||
@@ -262,7 +262,7 @@ module VX_afu_wrap #(
|
||||
.m_axi_awready (m_axi_mem_awready_a),
|
||||
.m_axi_awaddr (m_axi_mem_awaddr_w),
|
||||
.m_axi_awid (m_axi_mem_awid_a),
|
||||
`UNUSED_PIN (m_axi_awlen),
|
||||
.m_axi_awlen (m_axi_mem_awlen_a),
|
||||
`UNUSED_PIN (m_axi_awsize),
|
||||
`UNUSED_PIN (m_axi_awburst),
|
||||
`UNUSED_PIN (m_axi_awlock),
|
||||
|
||||
11
hw/rtl/cache/VX_cache.sv
vendored
11
hw/rtl/cache/VX_cache.sv
vendored
@@ -530,14 +530,17 @@ module VX_cache import VX_gpu_pkg::*; #(
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
|
||||
|
||||
wire [NUM_REQS-1:0] perf_core_reads_per_req = core_req_valid & core_req_ready & ~core_req_rw;
|
||||
wire [NUM_REQS-1:0] perf_core_writes_per_req = core_req_valid & core_req_ready & core_req_rw;
|
||||
wire [NUM_REQS-1:0] perf_core_reads_per_req;
|
||||
wire [NUM_REQS-1:0] perf_core_writes_per_req;
|
||||
|
||||
// per cycle: read misses, write misses, msrq stalls, pipeline stalls
|
||||
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle;
|
||||
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle;
|
||||
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle;
|
||||
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_crsp_stall_per_cycle;
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
|
||||
|
||||
`BUFFER(perf_core_reads_per_req, core_req_valid & core_req_ready & ~core_req_rw);
|
||||
`BUFFER(perf_core_writes_per_req, core_req_valid & core_req_ready & core_req_rw);
|
||||
|
||||
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req);
|
||||
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req);
|
||||
@@ -560,7 +563,7 @@ module VX_cache import VX_gpu_pkg::*; #(
|
||||
reg [`PERF_CTR_BITS-1:0] perf_write_misses;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mshr_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
|
||||
5
hw/rtl/cache/VX_cache_cluster.sv
vendored
5
hw/rtl/cache/VX_cache_cluster.sv
vendored
@@ -83,8 +83,9 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
|
||||
`STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter"))
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
cache_perf_t perf_cache_unit[NUM_CACHES];
|
||||
`PERF_CACHE_REDUCE (cache_perf, perf_cache_unit, NUM_CACHES);
|
||||
cache_perf_t perf_cache_tmp[1], perf_cache_unit[NUM_CACHES];
|
||||
`PERF_CACHE_ADD (perf_cache_tmp, perf_cache_unit, 1, NUM_CACHES)
|
||||
assign cache_perf = perf_cache_tmp[0];
|
||||
`endif
|
||||
|
||||
VX_mem_bus_if #(
|
||||
|
||||
190
hw/rtl/cache/VX_cache_cluster_top.sv
vendored
190
hw/rtl/cache/VX_cache_cluster_top.sv
vendored
@@ -1,190 +0,0 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_cache_define.vh"
|
||||
|
||||
module VX_cache_cluster_top import VX_gpu_pkg::*; #(
|
||||
parameter `STRING INSTANCE_ID = "",
|
||||
|
||||
parameter NUM_UNITS = 2,
|
||||
parameter NUM_INPUTS = 4,
|
||||
parameter TAG_SEL_IDX = 0,
|
||||
|
||||
// Number of Word requests per cycle
|
||||
parameter NUM_REQS = 4,
|
||||
|
||||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 16384,
|
||||
// Size of line inside a bank in bytes
|
||||
parameter LINE_SIZE = 16,
|
||||
// Number of banks
|
||||
parameter NUM_BANKS = 4,
|
||||
// Number of associative ways
|
||||
parameter NUM_WAYS = 4,
|
||||
// Size of a word in bytes
|
||||
parameter WORD_SIZE = 4,
|
||||
|
||||
// Core Response Queue Size
|
||||
parameter CRSQ_SIZE = 2,
|
||||
// Miss Reserv Queue Knob
|
||||
parameter MSHR_SIZE = 16,
|
||||
// Memory Response Queue Size
|
||||
parameter MRSQ_SIZE = 0,
|
||||
// Memory Request Queue Size
|
||||
parameter MREQ_SIZE = 4,
|
||||
|
||||
// Enable cache writeable
|
||||
parameter WRITE_ENABLE = 1,
|
||||
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
||||
// core request tag size
|
||||
parameter TAG_WIDTH = UUID_WIDTH + 16,
|
||||
|
||||
// enable bypass for non-cacheable addresses
|
||||
parameter NC_ENABLE = 1,
|
||||
|
||||
// Core response output register
|
||||
parameter CORE_OUT_REG = 2,
|
||||
|
||||
// Memory request output register
|
||||
parameter MEM_OUT_REG = 2,
|
||||
|
||||
parameter NUM_CACHES = `UP(NUM_UNITS),
|
||||
parameter PASSTHRU = (NUM_UNITS == 0),
|
||||
parameter ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES),
|
||||
parameter MEM_TAG_WIDTH = PASSTHRU ? (NC_ENABLE ? `CACHE_NC_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
|
||||
`CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH)) :
|
||||
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
|
||||
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS)),
|
||||
parameter MEM_TAG_X_WIDTH = MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1)
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// PERF
|
||||
`ifdef PERF_ENABLE
|
||||
output cache_perf_t cache_perf,
|
||||
`endif
|
||||
|
||||
// Core request
|
||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_valid,
|
||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_rw,
|
||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen,
|
||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr,
|
||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data,
|
||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag,
|
||||
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_ready,
|
||||
|
||||
// Core response
|
||||
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_rsp_valid,
|
||||
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data,
|
||||
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag,
|
||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_rsp_ready,
|
||||
|
||||
// Memory request
|
||||
output wire mem_req_valid,
|
||||
output wire mem_req_rw,
|
||||
output wire [LINE_SIZE-1:0] mem_req_byteen,
|
||||
output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr,
|
||||
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
|
||||
output wire [MEM_TAG_X_WIDTH-1:0] mem_req_tag,
|
||||
input wire mem_req_ready,
|
||||
|
||||
// Memory response
|
||||
input wire mem_rsp_valid,
|
||||
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data,
|
||||
input wire [MEM_TAG_X_WIDTH-1:0] mem_rsp_tag,
|
||||
output wire mem_rsp_ready
|
||||
);
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (WORD_SIZE),
|
||||
.TAG_WIDTH (TAG_WIDTH)
|
||||
) core_bus_if[NUM_INPUTS * NUM_REQS]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_X_WIDTH)
|
||||
) mem_bus_if();
|
||||
|
||||
// Core request
|
||||
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
|
||||
for (genvar r = 0; r < NUM_REQS; ++r) begin
|
||||
assign core_bus_if[i * NUM_REQS + r].req_valid = core_req_valid[i][r];
|
||||
assign core_bus_if[i * NUM_REQS + r].req_data.rw = core_req_rw[i][r];
|
||||
assign core_bus_if[i * NUM_REQS + r].req_data.byteen = core_req_byteen[i][r];
|
||||
assign core_bus_if[i * NUM_REQS + r].req_data.addr = core_req_addr[i][r];
|
||||
assign core_bus_if[i * NUM_REQS + r].req_data.data = core_req_data[i][r];
|
||||
assign core_bus_if[i * NUM_REQS + r].req_data.tag = core_req_tag[i][r];
|
||||
assign core_req_ready[i][r] = core_bus_if[i * NUM_REQS + r].req_ready;
|
||||
end
|
||||
end
|
||||
|
||||
// Core response
|
||||
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
|
||||
for (genvar r = 0; r < NUM_REQS; ++r) begin
|
||||
assign core_rsp_valid[i][r] = core_bus_if[i * NUM_REQS + r].rsp_valid;
|
||||
assign core_rsp_data[i][r] = core_bus_if[i * NUM_REQS + r].rsp_data.data;
|
||||
assign core_rsp_tag[i][r] = core_bus_if[i * NUM_REQS + r].rsp_data.tag;
|
||||
assign core_bus_if[i * NUM_REQS + r].rsp_ready = core_rsp_ready[i][r];
|
||||
end
|
||||
end
|
||||
|
||||
// Memory request
|
||||
assign mem_req_valid = mem_bus_if.req_valid;
|
||||
assign mem_req_rw = mem_bus_if.req_data.rw;
|
||||
assign mem_req_byteen = mem_bus_if.req_data.byteen;
|
||||
assign mem_req_addr = mem_bus_if.req_data.addr;
|
||||
assign mem_req_data = mem_bus_if.req_data.data;
|
||||
assign mem_req_tag = mem_bus_if.req_data.tag;
|
||||
assign mem_bus_if.req_ready = mem_req_ready;
|
||||
|
||||
// Memory response
|
||||
assign mem_bus_if.rsp_valid = mem_rsp_valid;
|
||||
assign mem_bus_if.rsp_data.data = mem_rsp_data;
|
||||
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
|
||||
assign mem_rsp_ready = mem_bus_if.rsp_ready;
|
||||
|
||||
VX_cache_cluster #(
|
||||
.INSTANCE_ID (INSTANCE_ID),
|
||||
.NUM_UNITS (NUM_UNITS),
|
||||
.NUM_INPUTS (NUM_INPUTS),
|
||||
.TAG_SEL_IDX (TAG_SEL_IDX),
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.CACHE_SIZE (CACHE_SIZE),
|
||||
.LINE_SIZE (LINE_SIZE),
|
||||
.NUM_BANKS (NUM_BANKS),
|
||||
.NUM_WAYS (NUM_WAYS),
|
||||
.WORD_SIZE (WORD_SIZE),
|
||||
.CRSQ_SIZE (CRSQ_SIZE),
|
||||
.MSHR_SIZE (MSHR_SIZE),
|
||||
.MRSQ_SIZE (MRSQ_SIZE),
|
||||
.MREQ_SIZE (MREQ_SIZE),
|
||||
.WRITE_ENABLE (WRITE_ENABLE),
|
||||
.UUID_WIDTH (UUID_WIDTH),
|
||||
.TAG_WIDTH (TAG_WIDTH),
|
||||
.NC_ENABLE (NC_ENABLE),
|
||||
.CORE_OUT_REG (CORE_OUT_REG),
|
||||
.MEM_OUT_REG (MEM_OUT_REG)
|
||||
) cache (
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (cache_perf),
|
||||
`endif
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.core_bus_if (core_bus_if),
|
||||
.mem_bus_if (mem_bus_if)
|
||||
);
|
||||
|
||||
endmodule
|
||||
8
hw/rtl/cache/VX_cache_data.sv
vendored
8
hw/rtl/cache/VX_cache_data.sv
vendored
@@ -93,13 +93,7 @@ module VX_cache_data #(
|
||||
assign wren = fill;
|
||||
end
|
||||
|
||||
wire [`CLOG2(NUM_WAYS)-1:0] way_idx;
|
||||
// generate if (NUM_WAYS == 1) begin
|
||||
// wire [0:0] way_idx;
|
||||
// end else begin
|
||||
// wire [`CLOG2(NUM_WAYS)-1:0] way_idx;
|
||||
// end
|
||||
// endgenerate
|
||||
wire [`LOG2UP(NUM_WAYS)-1:0] way_idx;
|
||||
|
||||
VX_onehot_encoder #(
|
||||
.N (NUM_WAYS)
|
||||
|
||||
12
hw/rtl/cache/VX_cache_define.vh
vendored
12
hw/rtl/cache/VX_cache_define.vh
vendored
@@ -63,4 +63,16 @@
|
||||
`define CS_LINE_TO_FULL_ADDR(x, i) {x, (`XLEN-$bits(x))'(i << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))}
|
||||
`define CS_MEM_TO_FULL_ADDR(x) {x, (`XLEN-$bits(x))'(0)}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define PERF_CACHE_ADD(dst, src, dcount, scount) \
|
||||
`PERF_COUNTER_ADD (dst, src, reads, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, writes, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, read_misses, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, write_misses, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, bank_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, mshr_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, mem_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, crsp_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1))
|
||||
|
||||
`endif // VX_CACHE_DEFINE_VH
|
||||
|
||||
4
hw/rtl/cache/VX_cache_top.sv
vendored
4
hw/rtl/cache/VX_cache_top.sv
vendored
@@ -13,7 +13,7 @@
|
||||
|
||||
`include "VX_cache_define.vh"
|
||||
|
||||
module VX_cache_top #(
|
||||
module VX_cache_top import VX_gpu_pkg::*; #(
|
||||
parameter `STRING INSTANCE_ID = "",
|
||||
|
||||
// Number of Word requests per cycle
|
||||
@@ -22,7 +22,7 @@ module VX_cache_top #(
|
||||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 16384,
|
||||
// Size of line inside a bank in bytes
|
||||
parameter LINE_SIZE = 16,
|
||||
parameter LINE_SIZE = 64,
|
||||
// Number of banks
|
||||
parameter NUM_BANKS = 4,
|
||||
// Number of associative ways
|
||||
|
||||
@@ -45,7 +45,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
||||
|
||||
VX_commit_if commit_if[`ISSUE_WIDTH]();
|
||||
|
||||
wire [`ISSUE_WIDTH-1:0] commit_fire;
|
||||
wire [`ISSUE_WIDTH-1:0] commit_fire;
|
||||
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] commit_wid;
|
||||
wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] commit_tmask;
|
||||
wire [`ISSUE_WIDTH-1:0] commit_eop;
|
||||
@@ -92,24 +92,24 @@ module VX_commit import VX_gpu_pkg::*; #(
|
||||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
|
||||
assign commit_fire[i] = commit_if[i].valid && commit_if[i].ready;
|
||||
assign commit_tmask[i] = {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask;
|
||||
assign commit_wid[i] = commit_if[i].data.wid;
|
||||
assign commit_eop[i] = commit_if[i].data.eop;
|
||||
assign commit_fire[i] = commit_if[i].valid && commit_if[i].ready;
|
||||
assign commit_tmask[i]= {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask;
|
||||
assign commit_wid[i] = commit_if[i].data.wid;
|
||||
assign commit_eop[i] = commit_if[i].data.eop;
|
||||
end
|
||||
|
||||
// CSRs update
|
||||
|
||||
wire [`ISSUE_WIDTH-1:0][COMMIT_SIZEW-1:0] commit_size, commit_size_r;
|
||||
wire [COMMIT_ALL_SIZEW-1:0] commit_size_all, commit_size_all_r;
|
||||
wire [COMMIT_ALL_SIZEW-1:0] commit_size_all_r, commit_size_all_rr;
|
||||
wire commit_fire_any, commit_fire_any_r, commit_fire_any_rr;
|
||||
|
||||
assign commit_fire_any = (| commit_fire);
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
wire [COMMIT_SIZEW-1:0] pop_count;
|
||||
`POP_COUNT(pop_count, commit_tmask[i]);
|
||||
assign commit_size[i] = pop_count;
|
||||
wire [COMMIT_SIZEW-1:0] count;
|
||||
`POP_COUNT(count, commit_tmask[i]);
|
||||
assign commit_size[i] = count;
|
||||
end
|
||||
|
||||
VX_pipe_register #(
|
||||
@@ -130,7 +130,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
||||
.OP ("+")
|
||||
) commit_size_reduce (
|
||||
.data_in (commit_size_r),
|
||||
.data_out (commit_size_all)
|
||||
.data_out (commit_size_all_r)
|
||||
);
|
||||
|
||||
VX_pipe_register #(
|
||||
@@ -140,26 +140,26 @@ module VX_commit import VX_gpu_pkg::*; #(
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (1'b1),
|
||||
.data_in ({commit_fire_any_r, commit_size_all}),
|
||||
.data_out ({commit_fire_any_rr, commit_size_all_r})
|
||||
.data_in ({commit_fire_any_r, commit_size_all_r}),
|
||||
.data_out ({commit_fire_any_rr, commit_size_all_rr})
|
||||
);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] instret;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
instret <= '0;
|
||||
end else begin
|
||||
if (commit_fire_any_rr) begin
|
||||
instret <= instret + `PERF_CTR_BITS'(commit_size_all_r);
|
||||
instret <= instret + `PERF_CTR_BITS'(commit_size_all_rr);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
assign commit_csr_if.instret = instret;
|
||||
|
||||
// Committed instructions
|
||||
|
||||
wire [`ISSUE_WIDTH-1:0] committed = commit_fire & commit_eop;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
|
||||
.RESETW (`ISSUE_WIDTH)
|
||||
@@ -167,23 +167,23 @@ module VX_commit import VX_gpu_pkg::*; #(
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (1'b1),
|
||||
.data_in ({(commit_fire & commit_eop), commit_wid}),
|
||||
.data_in ({committed, commit_wid}),
|
||||
.data_out ({commit_sched_if.committed, commit_sched_if.committed_wid})
|
||||
);
|
||||
|
||||
// Writeback
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb;
|
||||
assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb;
|
||||
assign writeback_if[i].data.uuid = commit_if[i].data.uuid;
|
||||
assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid);
|
||||
assign writeback_if[i].data.PC = commit_if[i].data.PC;
|
||||
assign writeback_if[i].data.tmask = commit_if[i].data.tmask;
|
||||
assign writeback_if[i].data.rd = commit_if[i].data.rd;
|
||||
assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid);
|
||||
assign writeback_if[i].data.PC = commit_if[i].data.PC;
|
||||
assign writeback_if[i].data.tmask= commit_if[i].data.tmask;
|
||||
assign writeback_if[i].data.rd = commit_if[i].data.rd;
|
||||
assign writeback_if[i].data.data = commit_if[i].data.data;
|
||||
assign writeback_if[i].data.sop = commit_if[i].data.sop;
|
||||
assign writeback_if[i].data.eop = commit_if[i].data.eop;
|
||||
assign commit_if[i].ready = 1'b1;
|
||||
assign writeback_if[i].data.sop = commit_if[i].data.sop;
|
||||
assign writeback_if[i].data.eop = commit_if[i].data.eop;
|
||||
assign commit_if[i].ready = 1'b1; // writeback has no backpressure
|
||||
end
|
||||
|
||||
// simulation helper signal to get RISC-V tests Pass/Fail status
|
||||
|
||||
@@ -1,339 +1,341 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
`include "VX_fpu_define.vh"
|
||||
`endif
|
||||
|
||||
module VX_core import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
`SCOPE_IO_DECL
|
||||
|
||||
// Clock
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
`endif
|
||||
|
||||
VX_dcr_bus_if.slave dcr_bus_if,
|
||||
|
||||
VX_mem_bus_if.master smem_bus_if [DCACHE_NUM_REQS],
|
||||
|
||||
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS],
|
||||
|
||||
VX_mem_bus_if.master icache_bus_if,
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
VX_gbar_bus_if.master gbar_bus_if,
|
||||
`endif
|
||||
|
||||
// simulation helper signals
|
||||
output wire sim_ebreak,
|
||||
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
|
||||
|
||||
// Status
|
||||
output wire busy
|
||||
);
|
||||
VX_schedule_if schedule_if();
|
||||
VX_fetch_if fetch_if();
|
||||
VX_decode_if decode_if();
|
||||
VX_sched_csr_if sched_csr_if();
|
||||
VX_decode_sched_if decode_sched_if();
|
||||
VX_commit_sched_if commit_sched_if();
|
||||
VX_commit_csr_if commit_csr_if();
|
||||
VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS]();
|
||||
VX_warp_ctl_if warp_ctl_if();
|
||||
|
||||
VX_dispatch_if alu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_commit_if alu_commit_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_dispatch_if lsu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_commit_if lsu_commit_if[`ISSUE_WIDTH]();
|
||||
`ifdef EXT_F_ENABLE
|
||||
VX_dispatch_if fpu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_commit_if fpu_commit_if[`ISSUE_WIDTH]();
|
||||
`endif
|
||||
VX_dispatch_if sfu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_commit_if sfu_commit_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_writeback_if writeback_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) dcache_bus_tmp_if[DCACHE_NUM_REQS]();
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_pipeline_perf_if pipeline_perf_if();
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
|
||||
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
|
||||
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
|
||||
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||
`ifdef SM_ENABLE
|
||||
cache_perf_t smem_perf;
|
||||
assign mem_perf_tmp_if.smem = smem_perf;
|
||||
`else
|
||||
assign mem_perf_tmp_if.smem = '0;
|
||||
`endif
|
||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||
`endif
|
||||
|
||||
`RESET_RELAY (dcr_data_reset, reset);
|
||||
`RESET_RELAY (schedule_reset, reset);
|
||||
`RESET_RELAY (fetch_reset, reset);
|
||||
`RESET_RELAY (decode_reset, reset);
|
||||
`RESET_RELAY (issue_reset, reset);
|
||||
`RESET_RELAY (execute_reset, reset);
|
||||
`RESET_RELAY (commit_reset, reset);
|
||||
|
||||
base_dcrs_t base_dcrs;
|
||||
|
||||
VX_dcr_data dcr_data (
|
||||
.clk (clk),
|
||||
.reset (dcr_data_reset),
|
||||
.dcr_bus_if (dcr_bus_if),
|
||||
.base_dcrs (base_dcrs)
|
||||
);
|
||||
|
||||
`SCOPE_IO_SWITCH (3)
|
||||
|
||||
VX_schedule #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) schedule (
|
||||
.clk (clk),
|
||||
.reset (schedule_reset),
|
||||
|
||||
.base_dcrs (base_dcrs),
|
||||
|
||||
.warp_ctl_if (warp_ctl_if),
|
||||
.branch_ctl_if (branch_ctl_if),
|
||||
.decode_sched_if(decode_sched_if),
|
||||
.commit_sched_if(commit_sched_if),
|
||||
|
||||
.schedule_if (schedule_if),
|
||||
`ifdef GBAR_ENABLE
|
||||
.gbar_bus_if (gbar_bus_if),
|
||||
`endif
|
||||
.sched_csr_if (sched_csr_if),
|
||||
|
||||
.busy (busy)
|
||||
);
|
||||
|
||||
VX_fetch #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) fetch (
|
||||
`SCOPE_IO_BIND (0)
|
||||
.clk (clk),
|
||||
.reset (fetch_reset),
|
||||
.icache_bus_if (icache_bus_if),
|
||||
.schedule_if (schedule_if),
|
||||
.fetch_if (fetch_if)
|
||||
);
|
||||
|
||||
VX_decode #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) decode (
|
||||
.clk (clk),
|
||||
.reset (decode_reset),
|
||||
.fetch_if (fetch_if),
|
||||
.decode_if (decode_if),
|
||||
.decode_sched_if(decode_sched_if)
|
||||
);
|
||||
|
||||
VX_issue #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) issue (
|
||||
`SCOPE_IO_BIND (1)
|
||||
|
||||
.clk (clk),
|
||||
.reset (issue_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_issue_if (pipeline_perf_if.issue),
|
||||
`endif
|
||||
|
||||
.decode_if (decode_if),
|
||||
.writeback_if (writeback_if),
|
||||
|
||||
.alu_dispatch_if(alu_dispatch_if),
|
||||
.lsu_dispatch_if(lsu_dispatch_if),
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_dispatch_if(fpu_dispatch_if),
|
||||
`endif
|
||||
.sfu_dispatch_if(sfu_dispatch_if)
|
||||
);
|
||||
|
||||
VX_execute #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) execute (
|
||||
`SCOPE_IO_BIND (2)
|
||||
|
||||
.clk (clk),
|
||||
.reset (execute_reset),
|
||||
|
||||
.base_dcrs (base_dcrs),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_tmp_if),
|
||||
.pipeline_perf_if(pipeline_perf_if),
|
||||
`endif
|
||||
|
||||
.dcache_bus_if (dcache_bus_tmp_if),
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_dispatch_if(fpu_dispatch_if),
|
||||
.fpu_commit_if (fpu_commit_if),
|
||||
`endif
|
||||
|
||||
.commit_csr_if (commit_csr_if),
|
||||
.sched_csr_if (sched_csr_if),
|
||||
|
||||
.alu_dispatch_if(alu_dispatch_if),
|
||||
.lsu_dispatch_if(lsu_dispatch_if),
|
||||
.sfu_dispatch_if(sfu_dispatch_if),
|
||||
|
||||
.warp_ctl_if (warp_ctl_if),
|
||||
.branch_ctl_if (branch_ctl_if),
|
||||
|
||||
.alu_commit_if (alu_commit_if),
|
||||
.lsu_commit_if (lsu_commit_if),
|
||||
.sfu_commit_if (sfu_commit_if),
|
||||
|
||||
.sim_ebreak (sim_ebreak)
|
||||
);
|
||||
|
||||
VX_commit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) commit (
|
||||
.clk (clk),
|
||||
.reset (commit_reset),
|
||||
|
||||
.alu_commit_if (alu_commit_if),
|
||||
.lsu_commit_if (lsu_commit_if),
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_commit_if (fpu_commit_if),
|
||||
`endif
|
||||
.sfu_commit_if (sfu_commit_if),
|
||||
|
||||
.writeback_if (writeback_if),
|
||||
|
||||
.commit_csr_if (commit_csr_if),
|
||||
.commit_sched_if(commit_sched_if),
|
||||
|
||||
.sim_wb_value (sim_wb_value)
|
||||
);
|
||||
|
||||
`ifdef SM_ENABLE
|
||||
|
||||
VX_smem_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) smem_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (smem_perf),
|
||||
`endif
|
||||
.dcache_bus_in_if (dcache_bus_tmp_if),
|
||||
.dcache_bus_out_if (dcache_bus_if),
|
||||
.smem_bus_out_if (smem_bus_if)
|
||||
);
|
||||
|
||||
`else
|
||||
|
||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_bus_tmp_if[i]);
|
||||
end
|
||||
|
||||
`endif
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
|
||||
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
|
||||
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;
|
||||
|
||||
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
|
||||
|
||||
wire perf_icache_pending_read_cycle;
|
||||
wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads;
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_ifetches;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_loads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_stores;
|
||||
|
||||
wire perf_icache_req_fire = icache_bus_if.req_valid & icache_bus_if.req_ready;
|
||||
wire perf_icache_rsp_fire = icache_bus_if.rsp_valid & icache_bus_if.rsp_ready;
|
||||
|
||||
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_wr_req_fire, perf_dcache_rsp_fire;
|
||||
|
||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||
assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && ~dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready;
|
||||
assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready;
|
||||
assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
`POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire);
|
||||
`POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire);
|
||||
`POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire);
|
||||
|
||||
assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire;
|
||||
assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle - perf_dcache_rsp_per_cycle;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_icache_pending_reads <= '0;
|
||||
perf_dcache_pending_reads <= '0;
|
||||
end else begin
|
||||
perf_icache_pending_reads <= $signed(perf_icache_pending_reads) + `PERF_CTR_BITS'($signed(perf_icache_pending_read_cycle));
|
||||
perf_dcache_pending_reads <= $signed(perf_dcache_pending_reads) + `PERF_CTR_BITS'($signed(perf_dcache_pending_read_cycle));
|
||||
end
|
||||
end
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_icache_lat;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_dcache_lat;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_ifetches <= '0;
|
||||
perf_loads <= '0;
|
||||
perf_stores <= '0;
|
||||
perf_icache_lat <= '0;
|
||||
perf_dcache_lat <= '0;
|
||||
end else begin
|
||||
perf_ifetches <= perf_ifetches + `PERF_CTR_BITS'(perf_icache_req_fire);
|
||||
perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle);
|
||||
perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle);
|
||||
perf_icache_lat <= perf_icache_lat + perf_icache_pending_reads;
|
||||
perf_dcache_lat <= perf_dcache_lat + perf_dcache_pending_reads;
|
||||
end
|
||||
end
|
||||
|
||||
assign pipeline_perf_if.ifetches = perf_ifetches;
|
||||
assign pipeline_perf_if.loads = perf_loads;
|
||||
assign pipeline_perf_if.stores = perf_stores;
|
||||
assign pipeline_perf_if.load_latency = perf_dcache_lat;
|
||||
assign pipeline_perf_if.ifetch_latency = perf_icache_lat;
|
||||
assign pipeline_perf_if.load_latency = perf_dcache_lat;
|
||||
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
`include "VX_fpu_define.vh"
|
||||
`endif
|
||||
|
||||
module VX_core import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
`SCOPE_IO_DECL
|
||||
|
||||
// Clock
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
`endif
|
||||
|
||||
VX_dcr_bus_if.slave dcr_bus_if,
|
||||
|
||||
VX_mem_bus_if.master smem_bus_if [DCACHE_NUM_REQS],
|
||||
|
||||
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS],
|
||||
|
||||
VX_mem_bus_if.master icache_bus_if,
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
VX_gbar_bus_if.master gbar_bus_if,
|
||||
`endif
|
||||
|
||||
// simulation helper signals
|
||||
output wire sim_ebreak,
|
||||
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
|
||||
|
||||
// Status
|
||||
output wire busy
|
||||
);
|
||||
VX_schedule_if schedule_if();
|
||||
VX_fetch_if fetch_if();
|
||||
VX_decode_if decode_if();
|
||||
VX_sched_csr_if sched_csr_if();
|
||||
VX_decode_sched_if decode_sched_if();
|
||||
VX_commit_sched_if commit_sched_if();
|
||||
VX_commit_csr_if commit_csr_if();
|
||||
VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS]();
|
||||
VX_warp_ctl_if warp_ctl_if();
|
||||
|
||||
VX_dispatch_if alu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_commit_if alu_commit_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_dispatch_if lsu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_commit_if lsu_commit_if[`ISSUE_WIDTH]();
|
||||
`ifdef EXT_F_ENABLE
|
||||
VX_dispatch_if fpu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_commit_if fpu_commit_if[`ISSUE_WIDTH]();
|
||||
`endif
|
||||
VX_dispatch_if sfu_dispatch_if[`ISSUE_WIDTH]();
|
||||
VX_commit_if sfu_commit_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_writeback_if writeback_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) dcache_bus_tmp_if[DCACHE_NUM_REQS]();
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
VX_pipeline_perf_if pipeline_perf_if();
|
||||
|
||||
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
|
||||
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
|
||||
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||
`endif
|
||||
|
||||
`RESET_RELAY (dcr_data_reset, reset);
|
||||
`RESET_RELAY (schedule_reset, reset);
|
||||
`RESET_RELAY (fetch_reset, reset);
|
||||
`RESET_RELAY (decode_reset, reset);
|
||||
`RESET_RELAY (issue_reset, reset);
|
||||
`RESET_RELAY (execute_reset, reset);
|
||||
`RESET_RELAY (commit_reset, reset);
|
||||
|
||||
base_dcrs_t base_dcrs;
|
||||
|
||||
VX_dcr_data dcr_data (
|
||||
.clk (clk),
|
||||
.reset (dcr_data_reset),
|
||||
.dcr_bus_if (dcr_bus_if),
|
||||
.base_dcrs (base_dcrs)
|
||||
);
|
||||
|
||||
`SCOPE_IO_SWITCH (3)
|
||||
|
||||
VX_schedule #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) schedule (
|
||||
.clk (clk),
|
||||
.reset (schedule_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_schedule_if (pipeline_perf_if.schedule),
|
||||
`endif
|
||||
|
||||
.base_dcrs (base_dcrs),
|
||||
|
||||
.warp_ctl_if (warp_ctl_if),
|
||||
.branch_ctl_if (branch_ctl_if),
|
||||
.decode_sched_if(decode_sched_if),
|
||||
.commit_sched_if(commit_sched_if),
|
||||
|
||||
.schedule_if (schedule_if),
|
||||
`ifdef GBAR_ENABLE
|
||||
.gbar_bus_if (gbar_bus_if),
|
||||
`endif
|
||||
.sched_csr_if (sched_csr_if),
|
||||
|
||||
.busy (busy)
|
||||
);
|
||||
|
||||
VX_fetch #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) fetch (
|
||||
`SCOPE_IO_BIND (0)
|
||||
.clk (clk),
|
||||
.reset (fetch_reset),
|
||||
.icache_bus_if (icache_bus_if),
|
||||
.schedule_if (schedule_if),
|
||||
.fetch_if (fetch_if)
|
||||
);
|
||||
|
||||
VX_decode #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) decode (
|
||||
.clk (clk),
|
||||
.reset (decode_reset),
|
||||
.fetch_if (fetch_if),
|
||||
.decode_if (decode_if),
|
||||
.decode_sched_if(decode_sched_if)
|
||||
);
|
||||
|
||||
VX_issue #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) issue (
|
||||
`SCOPE_IO_BIND (1)
|
||||
|
||||
.clk (clk),
|
||||
.reset (issue_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_issue_if (pipeline_perf_if.issue),
|
||||
`endif
|
||||
|
||||
.decode_if (decode_if),
|
||||
.writeback_if (writeback_if),
|
||||
|
||||
.alu_dispatch_if(alu_dispatch_if),
|
||||
.lsu_dispatch_if(lsu_dispatch_if),
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_dispatch_if(fpu_dispatch_if),
|
||||
`endif
|
||||
.sfu_dispatch_if(sfu_dispatch_if)
|
||||
);
|
||||
|
||||
VX_execute #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) execute (
|
||||
`SCOPE_IO_BIND (2)
|
||||
|
||||
.clk (clk),
|
||||
.reset (execute_reset),
|
||||
|
||||
.base_dcrs (base_dcrs),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_tmp_if),
|
||||
.pipeline_perf_if(pipeline_perf_if),
|
||||
`endif
|
||||
|
||||
.dcache_bus_if (dcache_bus_tmp_if),
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_dispatch_if(fpu_dispatch_if),
|
||||
.fpu_commit_if (fpu_commit_if),
|
||||
`endif
|
||||
|
||||
.commit_csr_if (commit_csr_if),
|
||||
.sched_csr_if (sched_csr_if),
|
||||
|
||||
.alu_dispatch_if(alu_dispatch_if),
|
||||
.lsu_dispatch_if(lsu_dispatch_if),
|
||||
.sfu_dispatch_if(sfu_dispatch_if),
|
||||
|
||||
.warp_ctl_if (warp_ctl_if),
|
||||
.branch_ctl_if (branch_ctl_if),
|
||||
|
||||
.alu_commit_if (alu_commit_if),
|
||||
.lsu_commit_if (lsu_commit_if),
|
||||
.sfu_commit_if (sfu_commit_if),
|
||||
|
||||
.sim_ebreak (sim_ebreak)
|
||||
);
|
||||
|
||||
VX_commit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) commit (
|
||||
.clk (clk),
|
||||
.reset (commit_reset),
|
||||
|
||||
.alu_commit_if (alu_commit_if),
|
||||
.lsu_commit_if (lsu_commit_if),
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_commit_if (fpu_commit_if),
|
||||
`endif
|
||||
.sfu_commit_if (sfu_commit_if),
|
||||
|
||||
.writeback_if (writeback_if),
|
||||
|
||||
.commit_csr_if (commit_csr_if),
|
||||
.commit_sched_if(commit_sched_if),
|
||||
|
||||
.sim_wb_value (sim_wb_value)
|
||||
);
|
||||
|
||||
`ifdef SM_ENABLE
|
||||
|
||||
VX_smem_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) smem_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (mem_perf_tmp_if.smem),
|
||||
`endif
|
||||
.dcache_bus_in_if (dcache_bus_tmp_if),
|
||||
.dcache_bus_out_if (dcache_bus_if),
|
||||
.smem_bus_out_if (smem_bus_if)
|
||||
);
|
||||
|
||||
`else
|
||||
|
||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_bus_tmp_if[i]);
|
||||
end
|
||||
|
||||
`endif
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
|
||||
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
|
||||
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;
|
||||
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
|
||||
|
||||
wire [1:0] perf_icache_pending_read_cycle;
|
||||
wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads;
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_ifetches;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_loads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_stores;
|
||||
|
||||
wire perf_icache_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
|
||||
wire perf_icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
|
||||
|
||||
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r;
|
||||
wire [DCACHE_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r;
|
||||
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire;
|
||||
|
||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||
assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && ~dcache_bus_if[i].req_data.rw;
|
||||
assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && dcache_bus_if[i].req_data.rw;
|
||||
assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
`BUFFER(perf_dcache_rd_req_fire_r, perf_dcache_rd_req_fire);
|
||||
`BUFFER(perf_dcache_wr_req_fire_r, perf_dcache_wr_req_fire);
|
||||
|
||||
`POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire_r);
|
||||
`POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire_r);
|
||||
`POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire);
|
||||
|
||||
assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire;
|
||||
assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle - perf_dcache_rsp_per_cycle;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_icache_pending_reads <= '0;
|
||||
perf_dcache_pending_reads <= '0;
|
||||
end else begin
|
||||
perf_icache_pending_reads <= $signed(perf_icache_pending_reads) + `PERF_CTR_BITS'($signed(perf_icache_pending_read_cycle));
|
||||
perf_dcache_pending_reads <= $signed(perf_dcache_pending_reads) + `PERF_CTR_BITS'($signed(perf_dcache_pending_read_cycle));
|
||||
end
|
||||
end
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_icache_lat;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_dcache_lat;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_ifetches <= '0;
|
||||
perf_loads <= '0;
|
||||
perf_stores <= '0;
|
||||
perf_icache_lat <= '0;
|
||||
perf_dcache_lat <= '0;
|
||||
end else begin
|
||||
perf_ifetches <= perf_ifetches + `PERF_CTR_BITS'(perf_icache_req_fire);
|
||||
perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle);
|
||||
perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle);
|
||||
perf_icache_lat <= perf_icache_lat + perf_icache_pending_reads;
|
||||
perf_dcache_lat <= perf_dcache_lat + perf_dcache_pending_reads;
|
||||
end
|
||||
end
|
||||
|
||||
assign pipeline_perf_if.ifetches = perf_ifetches;
|
||||
assign pipeline_perf_if.loads = perf_loads;
|
||||
assign pipeline_perf_if.stores = perf_stores;
|
||||
assign pipeline_perf_if.load_latency = perf_dcache_lat;
|
||||
assign pipeline_perf_if.ifetch_latency = perf_icache_lat;
|
||||
assign pipeline_perf_if.load_latency = perf_dcache_lat;
|
||||
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
||||
@@ -129,7 +129,13 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
||||
assign icache_rsp_ready = icache_bus_if.rsp_ready;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_if();
|
||||
VX_mem_perf_if mem_perf_if();
|
||||
assign mem_perf_if.icache = '0;
|
||||
assign mem_perf_if.dcache = '0;
|
||||
assign mem_perf_if.l2cache = '0;
|
||||
assign mem_perf_if.l3cache = '0;
|
||||
assign mem_perf_if.smem = '0;
|
||||
assign mem_perf_if.mem = '0;
|
||||
`endif
|
||||
|
||||
`ifdef SCOPE
|
||||
|
||||
@@ -35,7 +35,6 @@ import VX_fpu_pkg::*;
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||
VX_sfu_perf_if.slave sfu_perf_if,
|
||||
`endif
|
||||
|
||||
VX_commit_csr_if.slave commit_csr_if,
|
||||
@@ -183,105 +182,115 @@ import VX_fpu_pkg::*;
|
||||
|
||||
default: begin
|
||||
read_addr_valid_r = 0;
|
||||
if ((read_addr >= `VX_CSR_MPM_BASE && read_addr < (`VX_CSR_MPM_BASE + 32))
|
||||
|| (read_addr >= `VX_CSR_MPM_BASE_H && read_addr < (`VX_CSR_MPM_BASE_H + 32))) begin
|
||||
if ((read_addr >= `VX_CSR_MPM_USER && read_addr < (`VX_CSR_MPM_USER + 32))
|
||||
|| (read_addr >= `VX_CSR_MPM_USER_H && read_addr < (`VX_CSR_MPM_USER_H + 32))) begin
|
||||
read_addr_valid_r = 1;
|
||||
`ifdef PERF_ENABLE
|
||||
case (base_dcrs.mpm_class)
|
||||
`VX_DCR_MPM_CLASS_CORE: begin
|
||||
case (read_addr)
|
||||
// PERF: pipeline
|
||||
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
|
||||
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
|
||||
`VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_ALU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_ALU][31:0];
|
||||
`VX_CSR_MPM_ALU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_ALU][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_LSU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_LSU][31:0];
|
||||
`VX_CSR_MPM_LSU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_LSU][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCHED_ID : read_data_ro_r = pipeline_perf_if.sched_idles[31:0];
|
||||
`VX_CSR_MPM_SCHED_ID_H : read_data_ro_r = 32'(pipeline_perf_if.sched_idles[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0];
|
||||
`VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
|
||||
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
|
||||
`VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_ALU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_ALU][31:0];
|
||||
`VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_ALU][`PERF_CTR_BITS-1:32]);
|
||||
`ifdef EXT_F_ENABLE
|
||||
`VX_CSR_MPM_FPU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_FPU][31:0];
|
||||
`VX_CSR_MPM_FPU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_FPU][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_FPU][31:0];
|
||||
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_FPU][`PERF_CTR_BITS-1:32]);
|
||||
`else
|
||||
`VX_CSR_MPM_FPU_ST : read_data_ro_r = '0;
|
||||
`VX_CSR_MPM_FPU_ST_H : read_data_ro_r = '0;
|
||||
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0;
|
||||
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0;
|
||||
`endif
|
||||
`VX_CSR_MPM_SFU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_SFU][31:0];
|
||||
`VX_CSR_MPM_SFU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_SFU][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_LSU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_LSU][31:0];
|
||||
`VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_LSU][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_SFU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_SFU][31:0];
|
||||
`VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_SFU][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_CSRS : read_data_ro_r = pipeline_perf_if.sfu_uses[`SFU_CSRS][31:0];
|
||||
`VX_CSR_MPM_SCRB_CSRS_H : read_data_ro_r = 32'(pipeline_perf_if.sfu_uses[`SFU_CSRS][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_WCTL : read_data_ro_r = pipeline_perf_if.sfu_uses[`SFU_WCTL][31:0];
|
||||
`VX_CSR_MPM_SCRB_WCTL_H : read_data_ro_r = 32'(pipeline_perf_if.sfu_uses[`SFU_WCTL][`PERF_CTR_BITS-1:32]);
|
||||
// PERF: memory
|
||||
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
|
||||
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_LOADS : read_data_ro_r = pipeline_perf_if.loads[31:0];
|
||||
`VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0];
|
||||
`VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_IFETCH_LAT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
|
||||
`VX_CSR_MPM_IFETCH_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_LOAD_LAT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
|
||||
`VX_CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
|
||||
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_LOADS : read_data_ro_r = pipeline_perf_if.loads[31:0];
|
||||
`VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0];
|
||||
`VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_IFETCH_LT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
|
||||
`VX_CSR_MPM_IFETCH_LT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_LOAD_LT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
|
||||
`VX_CSR_MPM_LOAD_LT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
`VX_DCR_MPM_CLASS_MEM: begin
|
||||
case (read_addr)
|
||||
// PERF: icache
|
||||
`VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0];
|
||||
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0];
|
||||
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0];
|
||||
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0];
|
||||
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_ICACHE_MSHR_ST : read_data_ro_r = mem_perf_if.icache.mshr_stalls[31:0];
|
||||
`VX_CSR_MPM_ICACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.icache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: dcache
|
||||
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0];
|
||||
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0];
|
||||
`VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0];
|
||||
`VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0];
|
||||
`VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_DCACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0];
|
||||
`VX_CSR_MPM_DCACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0];
|
||||
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0];
|
||||
`VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0];
|
||||
`VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0];
|
||||
`VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_DCACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0];
|
||||
`VX_CSR_MPM_DCACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: smem
|
||||
`VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem.reads[31:0];
|
||||
`VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem.writes[31:0];
|
||||
`VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem.reads[31:0];
|
||||
`VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem.writes[31:0];
|
||||
`VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: l2cache
|
||||
`VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_WRITES_H: read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_MISS_R_H: read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_MISS_W_H: read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0];
|
||||
`VX_CSR_MPM_L2CACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: l3cache
|
||||
`VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_WRITES_H: read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_MISS_R_H: read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_MISS_W_H: read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0];
|
||||
`VX_CSR_MPM_L3CACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: memory
|
||||
`VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0];
|
||||
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0];
|
||||
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_MEM_LAT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
|
||||
`VX_CSR_MPM_MEM_LAT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0];
|
||||
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0];
|
||||
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_MEM_LT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
|
||||
`VX_CSR_MPM_MEM_LT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
@@ -301,8 +310,6 @@ import VX_fpu_pkg::*;
|
||||
`RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [`PERF_CTR_BITS-1:0] perf_wctl_stalls = sfu_perf_if.wctl_stalls;
|
||||
`UNUSED_VAR (perf_wctl_stalls);
|
||||
`UNUSED_VAR (mem_perf_if.icache);
|
||||
`UNUSED_VAR (mem_perf_if.smem);
|
||||
`endif
|
||||
|
||||
@@ -25,7 +25,6 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||
VX_sfu_perf_if.slave sfu_perf_if,
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
@@ -81,7 +80,6 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_if),
|
||||
.pipeline_perf_if(pipeline_perf_if),
|
||||
.sfu_perf_if (sfu_perf_if),
|
||||
`endif
|
||||
|
||||
.commit_csr_if (commit_csr_if),
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
`include "VX_trace.vh"
|
||||
|
||||
module VX_dispatch import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0
|
||||
@@ -174,30 +175,38 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
||||
|| (sfu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU));
|
||||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_n, perf_stalls_r;
|
||||
wire [`ISSUE_WIDTH-1:0] operands_stall;
|
||||
wire [`ISSUE_WIDTH-1:0][`EX_BITS-1:0] operands_ex_type;
|
||||
`ifdef PERF_ENABLE
|
||||
wire [`NUM_EX_UNITS-1:0] perf_unit_stalls_per_cycle, perf_unit_stalls_per_cycle_r;
|
||||
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_unit_stalls_per_cycle;
|
||||
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_r;
|
||||
|
||||
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
|
||||
assign operands_stall[i] = operands_if[i].valid && ~operands_if[i].ready;
|
||||
assign operands_ex_type[i] = operands_if[i].data.ex_type;
|
||||
end
|
||||
|
||||
always @(*) begin
|
||||
perf_stalls_n = perf_stalls_r;
|
||||
for (integer i=0; i < `ISSUE_WIDTH; ++i) begin
|
||||
if (operands_stall[i]) begin
|
||||
perf_stalls_n[operands_ex_type[i]] += `PERF_CTR_BITS'(1);
|
||||
always @(*) begin
|
||||
perf_issue_unit_stalls_per_cycle[i] = '0;
|
||||
if (operands_if[i].valid && ~operands_if[i].ready) begin
|
||||
perf_issue_unit_stalls_per_cycle[i][operands_if[i].data.ex_type] = 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_stalls_r <= '0;
|
||||
end else begin
|
||||
perf_stalls_r <= perf_stalls_n;
|
||||
VX_reduce #(
|
||||
.DATAW_IN (`NUM_EX_UNITS),
|
||||
.N (`ISSUE_WIDTH),
|
||||
.OP ("|")
|
||||
) reduce (
|
||||
.data_in (perf_issue_unit_stalls_per_cycle),
|
||||
.data_out (perf_unit_stalls_per_cycle)
|
||||
);
|
||||
|
||||
`BUFFER(perf_unit_stalls_per_cycle_r, perf_unit_stalls_per_cycle);
|
||||
|
||||
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_stalls_r[i] <= '0;
|
||||
end else begin
|
||||
perf_stalls_r[i] <= perf_stalls_r[i] + `PERF_CTR_BITS'(perf_unit_stalls_per_cycle_r[i]);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
@@ -70,8 +70,8 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
batch_idx <= '0;
|
||||
end else if (batch_done) begin
|
||||
batch_idx <= batch_idx + BATCH_COUNT_W'(1);
|
||||
end else begin
|
||||
batch_idx <= batch_idx + BATCH_COUNT_W'(batch_done);
|
||||
end
|
||||
end
|
||||
end else begin
|
||||
@@ -203,20 +203,20 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
||||
assign block_done[block_idx] = ~valid_p || ready_p;
|
||||
end
|
||||
|
||||
wire [ISSUE_IDX_W-1:0] wsi;
|
||||
wire [ISSUE_ISW_W-1:0] isw;
|
||||
if (BATCH_COUNT != 1) begin
|
||||
if (BLOCK_SIZE != 1) begin
|
||||
assign wsi = {batch_idx, BLOCK_SIZE_W'(block_idx)};
|
||||
assign isw = {batch_idx, BLOCK_SIZE_W'(block_idx)};
|
||||
end else begin
|
||||
assign wsi = batch_idx;
|
||||
assign isw = batch_idx;
|
||||
end
|
||||
end else begin
|
||||
assign wsi = block_idx;
|
||||
assign isw = block_idx;
|
||||
end
|
||||
|
||||
`RESET_RELAY(buf_out_reset, reset);
|
||||
|
||||
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], wsi);
|
||||
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw);
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (OUT_DATAW),
|
||||
|
||||
@@ -30,7 +30,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
|
||||
localparam NUM_LANES = `NUM_FPU_LANES;
|
||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||
localparam PID_WIDTH = `UP(PID_BITS);
|
||||
localparam TAG_WIDTH = `LOG2UP(`FPU_REQ_QUEUE_SIZE);
|
||||
localparam TAG_WIDTH = `LOG2UP(`FPUQ_SIZE);
|
||||
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
|
||||
|
||||
VX_execute_if #(
|
||||
@@ -87,7 +87,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
|
||||
|
||||
VX_index_buffer #(
|
||||
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + PID_WIDTH + 1 + 1),
|
||||
.SIZE (`FPU_REQ_QUEUE_SIZE)
|
||||
.SIZE (`FPUQ_SIZE)
|
||||
) tag_store (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
@@ -37,7 +37,7 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
|
||||
wire [BLOCK_SIZE-1:0] commit_in_valid;
|
||||
wire [BLOCK_SIZE-1:0][DATAW-1:0] commit_in_data;
|
||||
wire [BLOCK_SIZE-1:0] commit_in_ready;
|
||||
wire [BLOCK_SIZE-1:0][ISSUE_IDX_W-1:0] commit_in_wsi;
|
||||
wire [BLOCK_SIZE-1:0][ISSUE_ISW_W-1:0] commit_in_isw;
|
||||
|
||||
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
|
||||
assign commit_in_valid[i] = commit_in_if[i].valid;
|
||||
@@ -45,12 +45,12 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
|
||||
assign commit_in_if[i].ready = commit_in_ready[i];
|
||||
if (BLOCK_SIZE != `ISSUE_WIDTH) begin
|
||||
if (BLOCK_SIZE != 1) begin
|
||||
assign commit_in_wsi[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_IDX_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)};
|
||||
assign commit_in_isw[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_ISW_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)};
|
||||
end else begin
|
||||
assign commit_in_wsi[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_IDX_W];
|
||||
assign commit_in_isw[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_ISW_W];
|
||||
end
|
||||
end else begin
|
||||
assign commit_in_wsi[i] = BLOCK_SIZE_W'(i);
|
||||
assign commit_in_isw[i] = BLOCK_SIZE_W'(i);
|
||||
end
|
||||
end
|
||||
|
||||
@@ -64,12 +64,12 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
|
||||
commit_out_data[i] = 'x;
|
||||
end
|
||||
for (integer i = 0; i < BLOCK_SIZE; ++i) begin
|
||||
commit_out_valid[commit_in_wsi[i]] = commit_in_valid[i];
|
||||
commit_out_data[commit_in_wsi[i]] = commit_in_data[i];
|
||||
commit_out_valid[commit_in_isw[i]] = commit_in_valid[i];
|
||||
commit_out_data[commit_in_isw[i]] = commit_in_data[i];
|
||||
end
|
||||
end
|
||||
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
|
||||
assign commit_in_ready[i] = commit_out_ready[commit_in_wsi[i]];
|
||||
assign commit_in_ready[i] = commit_out_ready[commit_in_isw[i]];
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
|
||||
@@ -14,10 +14,10 @@
|
||||
`include "VX_platform.vh"
|
||||
|
||||
module VX_ipdom_stack #(
|
||||
parameter WIDTH = 1,
|
||||
parameter DEPTH = 1,
|
||||
parameter WIDTH = 1,
|
||||
parameter DEPTH = 1,
|
||||
parameter OUT_REG = 0,
|
||||
parameter ADDRW = `LOG2UP(DEPTH)
|
||||
parameter ADDRW = `LOG2UP(DEPTH)
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
@@ -59,6 +59,11 @@ module VX_issue #(
|
||||
) scoreboard (
|
||||
.clk (clk),
|
||||
.reset (scoreboard_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_scb_stalls(perf_issue_if.scb_stalls),
|
||||
.perf_units_uses(perf_issue_if.units_uses),
|
||||
.perf_sfu_uses (perf_issue_if.sfu_uses),
|
||||
`endif
|
||||
.writeback_if (writeback_if),
|
||||
.ibuffer_if (ibuffer_if),
|
||||
.scoreboard_if (scoreboard_if)
|
||||
@@ -80,7 +85,7 @@ module VX_issue #(
|
||||
.clk (clk),
|
||||
.reset (dispatch_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_stalls (perf_issue_if.dsp_stalls),
|
||||
`UNUSED_PIN (perf_stalls),
|
||||
`endif
|
||||
.operands_if (operands_if),
|
||||
.alu_dispatch_if(alu_dispatch_if),
|
||||
@@ -152,29 +157,18 @@ module VX_issue #(
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_scb_stalls;
|
||||
|
||||
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_stalls_per_cycle;
|
||||
reg [`ISSUE_WIDTH-1:0] scoreboard_stalls;
|
||||
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
|
||||
assign scoreboard_stalls[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
|
||||
end
|
||||
`POP_COUNT(scoreboard_stalls_per_cycle, scoreboard_stalls);
|
||||
|
||||
wire decode_stall = decode_if.valid && ~decode_if.ready;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_ibf_stalls <= '0;
|
||||
perf_scb_stalls <= '0;
|
||||
end else begin
|
||||
if (decode_if.valid && ~decode_if.ready) begin
|
||||
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(1);
|
||||
end
|
||||
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(scoreboard_stalls_per_cycle);
|
||||
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(decode_stall);
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_issue_if.ibf_stalls = perf_ibf_stalls;
|
||||
assign perf_issue_if.scb_stalls = perf_scb_stalls;
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
||||
@@ -96,7 +96,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
||||
// detect duplicate addresses
|
||||
|
||||
wire lsu_is_dup;
|
||||
`ifdef LSU_DUP
|
||||
`ifdef LSU_DUP_ENABLE
|
||||
if (NUM_LANES > 1) begin
|
||||
wire [NUM_LANES-2:0] addr_matches;
|
||||
for (genvar i = 0; i < (NUM_LANES-1); ++i) begin
|
||||
@@ -304,7 +304,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
||||
|
||||
assign mem_req_tag = {
|
||||
execute_if[0].data.uuid, lsu_addr_type, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.rd, execute_if[0].data.op_type, req_align, execute_if[0].data.pid, pkt_waddr
|
||||
`ifdef LSU_DUP
|
||||
`ifdef LSU_DUP_ENABLE
|
||||
, lsu_is_dup
|
||||
`endif
|
||||
};
|
||||
@@ -448,13 +448,13 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
||||
wire [PID_WIDTH-1:0] rsp_pid;
|
||||
wire rsp_is_dup;
|
||||
|
||||
`ifndef LSU_DUP
|
||||
`ifndef LSU_DUP_ENABLE
|
||||
assign rsp_is_dup = 0;
|
||||
`endif
|
||||
|
||||
assign {
|
||||
rsp_uuid, rsp_addr_type, rsp_wid, rsp_tmask_uq, rsp_pc, rsp_rd, rsp_op_type, rsp_align, rsp_pid, pkt_raddr
|
||||
`ifdef LSU_DUP
|
||||
`ifdef LSU_DUP_ENABLE
|
||||
, rsp_is_dup
|
||||
`endif
|
||||
} = mem_rsp_tag;
|
||||
@@ -554,7 +554,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.DATAW (RSP_ARB_DATAW),
|
||||
.OUT_REG (1)
|
||||
.OUT_REG (3)
|
||||
) rsp_arb (
|
||||
.clk (clk),
|
||||
.reset (commit_reset),
|
||||
|
||||
@@ -220,8 +220,13 @@ module VX_muldiv_unit #(
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] div_in2;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
`ifdef XLEN_64
|
||||
assign div_in1[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]}: execute_if.data.rs1_data[i];
|
||||
assign div_in2[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]}: execute_if.data.rs2_data[i];
|
||||
`else
|
||||
assign div_in1[i] = execute_if.data.rs1_data[i];
|
||||
assign div_in2[i] = execute_if.data.rs2_data[i];
|
||||
`endif
|
||||
end
|
||||
|
||||
`ifdef IDIV_DPI
|
||||
|
||||
@@ -26,6 +26,7 @@ module VX_operands import VX_gpu_pkg::*; #(
|
||||
);
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS;
|
||||
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO);
|
||||
|
||||
localparam STATE_IDLE = 2'd0;
|
||||
localparam STATE_FETCH1 = 2'd1;
|
||||
@@ -38,14 +39,19 @@ module VX_operands import VX_gpu_pkg::*; #(
|
||||
reg [`NR_BITS-1:0] gpr_rd_rid, gpr_rd_rid_n;
|
||||
reg [ISSUE_WIS_W-1:0] gpr_rd_wis, gpr_rd_wis_n;
|
||||
|
||||
reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0][`XLEN-1:0] cache_data, cache_data_n;
|
||||
reg [ISSUE_RATIO-1:0][`NR_BITS-1:0] cache_reg, cache_reg_n;
|
||||
reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0] cache_tmask, cache_tmask_n;
|
||||
reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data [ISSUE_RATIO-1:0];
|
||||
reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data_n [ISSUE_RATIO-1:0];
|
||||
reg [`NR_BITS-1:0] cache_reg [ISSUE_RATIO-1:0];
|
||||
reg [`NR_BITS-1:0] cache_reg_n [ISSUE_RATIO-1:0];
|
||||
reg [`NUM_THREADS-1:0] cache_tmask [ISSUE_RATIO-1:0];
|
||||
reg [`NUM_THREADS-1:0] cache_tmask_n [ISSUE_RATIO-1:0];
|
||||
reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n;
|
||||
|
||||
reg valid_out_r;
|
||||
reg [DATAW-1:0] data_out_r;
|
||||
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n;
|
||||
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data, rs2_data_n;
|
||||
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n;
|
||||
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n;
|
||||
|
||||
reg [STATE_BITS-1:0] state, state_n;
|
||||
reg [`NR_BITS-1:0] rs2, rs2_n;
|
||||
@@ -54,11 +60,11 @@ module VX_operands import VX_gpu_pkg::*; #(
|
||||
reg rs3_ready, rs3_ready_n;
|
||||
reg data_ready, data_ready_n;
|
||||
|
||||
wire ready_out = operands_if[i].ready;
|
||||
|
||||
wire is_rs1_zero = (scoreboard_if[i].data.rs1 == 0);
|
||||
wire is_rs2_zero = (scoreboard_if[i].data.rs2 == 0);
|
||||
wire is_rs3_zero = (scoreboard_if[i].data.rs3 == 0);
|
||||
|
||||
VX_operands_if staging_if();
|
||||
wire is_rs3_zero = (scoreboard_if[i].data.rs3 == 0);
|
||||
|
||||
always @(*) begin
|
||||
state_n = state;
|
||||
@@ -79,7 +85,7 @@ module VX_operands import VX_gpu_pkg::*; #(
|
||||
|
||||
case (state)
|
||||
STATE_IDLE: begin
|
||||
if (staging_if.valid && staging_if.ready) begin
|
||||
if (valid_out_r && ready_out) begin
|
||||
data_ready_n = 0;
|
||||
end
|
||||
if (scoreboard_if[i].valid && data_ready_n == 0) begin
|
||||
@@ -160,44 +166,93 @@ module VX_operands import VX_gpu_pkg::*; #(
|
||||
end
|
||||
cache_reg_n[writeback_if[i].data.wis] = writeback_if[i].data.rd;
|
||||
cache_eop_n[writeback_if[i].data.wis] = writeback_if[i].data.eop;
|
||||
if (writeback_if[i].data.sop) begin
|
||||
cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.tmask;
|
||||
end else begin
|
||||
cache_tmask_n[writeback_if[i].data.wis] |= writeback_if[i].data.tmask;
|
||||
end
|
||||
cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.sop ? writeback_if[i].data.tmask :
|
||||
(cache_tmask_n[writeback_if[i].data.wis] | writeback_if[i].data.tmask);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
if (reset) begin
|
||||
state <= STATE_IDLE;
|
||||
gpr_rd_rid <= '0;
|
||||
gpr_rd_wis <= '0;
|
||||
cache_eop <= {ISSUE_RATIO{1'b1}};
|
||||
cache_reg <= '0;
|
||||
data_ready <= 0;
|
||||
valid_out_r <= 0;
|
||||
end else begin
|
||||
state <= state_n;
|
||||
rs2 <= rs2_n;
|
||||
rs3 <= rs3_n;
|
||||
rs2_ready <= rs2_ready_n;
|
||||
rs3_ready <= rs3_ready_n;
|
||||
rs1_data <= rs1_data_n;
|
||||
rs2_data <= rs2_data_n;
|
||||
rs3_data <= rs3_data_n;
|
||||
gpr_rd_rid <= gpr_rd_rid_n;
|
||||
gpr_rd_wis <= gpr_rd_wis_n;
|
||||
cache_data <= cache_data_n;
|
||||
cache_reg <= cache_reg_n;
|
||||
cache_tmask <= cache_tmask_n;
|
||||
cache_eop <= cache_eop_n;
|
||||
data_ready <= data_ready_n;
|
||||
data_ready <= data_ready_n;
|
||||
if (~valid_out_r) begin
|
||||
valid_out_r <= scoreboard_if[i].valid && data_ready;
|
||||
end else if (ready_out) begin
|
||||
valid_out_r <= 0;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
if (~valid_out_r) begin
|
||||
data_out_r <= {scoreboard_if[i].data.uuid,
|
||||
scoreboard_if[i].data.wis,
|
||||
scoreboard_if[i].data.tmask,
|
||||
scoreboard_if[i].data.PC,
|
||||
scoreboard_if[i].data.wb,
|
||||
scoreboard_if[i].data.ex_type,
|
||||
scoreboard_if[i].data.op_type,
|
||||
scoreboard_if[i].data.op_mod,
|
||||
scoreboard_if[i].data.use_PC,
|
||||
scoreboard_if[i].data.use_imm,
|
||||
scoreboard_if[i].data.imm,
|
||||
scoreboard_if[i].data.rd};
|
||||
end
|
||||
|
||||
gpr_rd_rid <= gpr_rd_rid_n;
|
||||
gpr_rd_wis <= gpr_rd_wis_n;
|
||||
rs2_ready <= rs2_ready_n;
|
||||
rs3_ready <= rs3_ready_n;
|
||||
rs2 <= rs2_n;
|
||||
rs3 <= rs3_n;
|
||||
rs1_data <= rs1_data_n;
|
||||
rs2_data <= rs2_data_n;
|
||||
rs3_data <= rs3_data_n;
|
||||
cache_data <= cache_data_n;
|
||||
cache_reg <= cache_reg_n;
|
||||
cache_tmask <= cache_tmask_n;
|
||||
end
|
||||
|
||||
assign operands_if[i].valid = valid_out_r;
|
||||
assign {operands_if[i].data.uuid,
|
||||
operands_if[i].data.wis,
|
||||
operands_if[i].data.tmask,
|
||||
operands_if[i].data.PC,
|
||||
operands_if[i].data.wb,
|
||||
operands_if[i].data.ex_type,
|
||||
operands_if[i].data.op_type,
|
||||
operands_if[i].data.op_mod,
|
||||
operands_if[i].data.use_PC,
|
||||
operands_if[i].data.use_imm,
|
||||
operands_if[i].data.imm,
|
||||
operands_if[i].data.rd} = data_out_r;
|
||||
assign operands_if[i].data.rs1_data = rs1_data;
|
||||
assign operands_if[i].data.rs2_data = rs2_data;
|
||||
assign operands_if[i].data.rs3_data = rs3_data;
|
||||
|
||||
assign scoreboard_if[i].ready = ~valid_out_r && data_ready;
|
||||
|
||||
// GPR banks
|
||||
|
||||
reg [RAM_ADDRW-1:0] gpr_rd_addr;
|
||||
wire [RAM_ADDRW-1:0] gpr_wr_addr;
|
||||
if (ISSUE_WIS != 0) begin
|
||||
assign gpr_wr_addr = {writeback_if[i].data.wis, writeback_if[i].data.rd};
|
||||
always @(posedge clk) begin
|
||||
gpr_rd_addr <= {gpr_rd_wis_n, gpr_rd_rid_n};
|
||||
end
|
||||
end else begin
|
||||
assign gpr_wr_addr = writeback_if[i].data.rd;
|
||||
always @(posedge clk) begin
|
||||
gpr_rd_addr <= gpr_rd_rid_n;
|
||||
end
|
||||
end
|
||||
|
||||
`ifdef GPR_RESET
|
||||
reg wr_enabled = 0;
|
||||
always @(posedge clk) begin
|
||||
@@ -205,10 +260,8 @@ module VX_operands import VX_gpu_pkg::*; #(
|
||||
wr_enabled <= 1;
|
||||
end
|
||||
end
|
||||
`else
|
||||
wire wr_enabled = 1;
|
||||
`endif
|
||||
|
||||
|
||||
for (genvar j = 0; j < `NUM_THREADS; ++j) begin
|
||||
VX_dp_ram #(
|
||||
.DATAW (`XLEN),
|
||||
@@ -222,81 +275,17 @@ module VX_operands import VX_gpu_pkg::*; #(
|
||||
.clk (clk),
|
||||
.read (1'b1),
|
||||
`UNUSED_PIN (wren),
|
||||
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||
.waddr (wis_to_addr(writeback_if[i].data.rd, writeback_if[i].data.wis)),
|
||||
`ifdef GPR_RESET
|
||||
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||
`else
|
||||
.write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||
`endif
|
||||
.waddr (gpr_wr_addr),
|
||||
.wdata (writeback_if[i].data.data[j]),
|
||||
.raddr (wis_to_addr(gpr_rd_rid, gpr_rd_wis)),
|
||||
.raddr (gpr_rd_addr),
|
||||
.rdata (gpr_rd_data[j])
|
||||
);
|
||||
end
|
||||
|
||||
// staging buffer
|
||||
|
||||
`RESET_RELAY (stg_buf_reset, reset);
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW)
|
||||
) stg_buf (
|
||||
.clk (clk),
|
||||
.reset (stg_buf_reset),
|
||||
.valid_in (scoreboard_if[i].valid),
|
||||
.ready_in (scoreboard_if[i].ready),
|
||||
.data_in ({
|
||||
scoreboard_if[i].data.uuid,
|
||||
scoreboard_if[i].data.wis,
|
||||
scoreboard_if[i].data.tmask,
|
||||
scoreboard_if[i].data.PC,
|
||||
scoreboard_if[i].data.wb,
|
||||
scoreboard_if[i].data.ex_type,
|
||||
scoreboard_if[i].data.op_type,
|
||||
scoreboard_if[i].data.op_mod,
|
||||
scoreboard_if[i].data.use_PC,
|
||||
scoreboard_if[i].data.use_imm,
|
||||
scoreboard_if[i].data.imm,
|
||||
scoreboard_if[i].data.rd}),
|
||||
.data_out ({
|
||||
staging_if.data.uuid,
|
||||
staging_if.data.wis,
|
||||
staging_if.data.tmask,
|
||||
staging_if.data.PC,
|
||||
staging_if.data.wb,
|
||||
staging_if.data.ex_type,
|
||||
staging_if.data.op_type,
|
||||
staging_if.data.op_mod,
|
||||
staging_if.data.use_PC,
|
||||
staging_if.data.use_imm,
|
||||
staging_if.data.imm,
|
||||
staging_if.data.rd}),
|
||||
.valid_out (staging_if.valid),
|
||||
.ready_out (staging_if.ready)
|
||||
);
|
||||
|
||||
assign staging_if.data.rs1_data = rs1_data;
|
||||
assign staging_if.data.rs2_data = rs2_data;
|
||||
assign staging_if.data.rs3_data = rs3_data;
|
||||
|
||||
// output buffer
|
||||
|
||||
wire valid_stg, ready_stg;
|
||||
assign valid_stg = staging_if.valid && data_ready;
|
||||
assign staging_if.ready = ready_stg && data_ready;
|
||||
|
||||
`RESET_RELAY (out_buf_reset, reset);
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW + (3 * `NUM_THREADS * `XLEN)),
|
||||
.SIZE (2),
|
||||
.OUT_REG (2)
|
||||
) out_buf (
|
||||
.clk (clk),
|
||||
.reset (out_buf_reset),
|
||||
.valid_in (valid_stg),
|
||||
.ready_in (ready_stg),
|
||||
.data_in (staging_if.data),
|
||||
.data_out (operands_if[i].data),
|
||||
.valid_out (operands_if[i].valid),
|
||||
.ready_out (operands_if[i].ready)
|
||||
);
|
||||
end
|
||||
end
|
||||
|
||||
endmodule
|
||||
|
||||
@@ -19,6 +19,10 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_pipeline_perf_if.schedule perf_schedule_if,
|
||||
`endif
|
||||
|
||||
// configuration
|
||||
input base_dcrs_t base_dcrs,
|
||||
|
||||
@@ -304,13 +308,20 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
localparam GNW_WIDTH = `LOG2UP(`NUM_CLUSTERS * `NUM_CORES * `NUM_WARPS);
|
||||
reg [`UUID_WIDTH-1:0] instr_uuid;
|
||||
wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(schedule_wid);
|
||||
`ifdef SV_DPI
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(1, 0, 0));
|
||||
end else if (schedule_fire) begin
|
||||
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(0, 32'(g_wid), 64'(schedule_pc)));
|
||||
end
|
||||
end
|
||||
end
|
||||
`else
|
||||
wire [GNW_WIDTH+16-1:0] w_uuid = {g_wid, 16'(schedule_pc)};
|
||||
always @(*) begin
|
||||
instr_uuid = `UUID_WIDTH'(w_uuid);
|
||||
end
|
||||
`endif
|
||||
`else
|
||||
wire [`UUID_WIDTH-1:0] instr_uuid = '0;
|
||||
`endif
|
||||
@@ -349,7 +360,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
.empty (no_pending_instr)
|
||||
);
|
||||
|
||||
`BUFFER_BUSY (busy, (active_warps != 0 || ~no_pending_instr), 1);
|
||||
`BUFFER_EX(busy, (active_warps != 0 || ~no_pending_instr), 1'b1, 1);
|
||||
|
||||
// export CSRs
|
||||
assign sched_csr_if.cycles = cycles;
|
||||
@@ -376,4 +387,25 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
end
|
||||
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps));
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`PERF_CTR_BITS-1:0] perf_sched_idles;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_sched_stalls;
|
||||
|
||||
wire schedule_idle = ~schedule_valid;
|
||||
wire schedule_stall = schedule_if.valid && ~schedule_if.ready;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_sched_idles <= '0;
|
||||
perf_sched_stalls <= '0;
|
||||
end else begin
|
||||
perf_sched_idles <= perf_sched_idles + `PERF_CTR_BITS'(schedule_idle);
|
||||
perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(schedule_stall);
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_schedule_if.sched_idles = perf_sched_idles;
|
||||
assign perf_schedule_if.sched_stalls = perf_sched_stalls;
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
||||
@@ -19,6 +19,12 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls,
|
||||
output reg [`PERF_CTR_BITS-1:0] perf_units_uses [`NUM_EX_UNITS],
|
||||
output reg [`PERF_CTR_BITS-1:0] perf_sfu_uses [`NUM_SFU_UNITS],
|
||||
`endif
|
||||
|
||||
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
|
||||
VX_ibuffer_if.slave ibuffer_if [`ISSUE_WIDTH],
|
||||
VX_ibuffer_if.master scoreboard_if [`ISSUE_WIDTH]
|
||||
@@ -26,114 +32,201 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_units_per_cycle;
|
||||
wire [`NUM_EX_UNITS-1:0] perf_units_per_cycle, perf_units_per_cycle_r;
|
||||
|
||||
reg [`ISSUE_WIDTH-1:0][`NUM_SFU_UNITS-1:0] perf_issue_sfu_per_cycle;
|
||||
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
|
||||
|
||||
wire [`ISSUE_WIDTH-1:0] perf_issue_stalls_per_cycle;
|
||||
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle, perf_stalls_per_cycle_r;
|
||||
|
||||
`POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle);
|
||||
|
||||
VX_reduce #(
|
||||
.DATAW_IN (`NUM_EX_UNITS),
|
||||
.N (`ISSUE_WIDTH),
|
||||
.OP ("|")
|
||||
) perf_units_reduce (
|
||||
.data_in (perf_issue_units_per_cycle),
|
||||
.data_out (perf_units_per_cycle)
|
||||
);
|
||||
|
||||
VX_reduce #(
|
||||
.DATAW_IN (`NUM_SFU_UNITS),
|
||||
.N (`ISSUE_WIDTH),
|
||||
.OP ("|")
|
||||
) perf_sfu_reduce (
|
||||
.data_in (perf_issue_sfu_per_cycle),
|
||||
.data_out (perf_sfu_per_cycle)
|
||||
);
|
||||
|
||||
`BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle);
|
||||
`BUFFER(perf_units_per_cycle_r, perf_units_per_cycle);
|
||||
`BUFFER(perf_sfu_per_cycle_r, perf_sfu_per_cycle);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_scb_stalls <= '0;
|
||||
end else begin
|
||||
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r);
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_units_uses[i] <= '0;
|
||||
end else begin
|
||||
perf_units_uses[i] <= perf_units_uses[i] + `PERF_CTR_BITS'(perf_units_per_cycle_r[i]);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_sfu_uses[i] <= '0;
|
||||
end else begin
|
||||
perf_sfu_uses[i] <= perf_sfu_uses[i] + `PERF_CTR_BITS'(perf_sfu_per_cycle_r[i]);
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n;
|
||||
reg [3:0] ready_masks, ready_masks_n;
|
||||
VX_ibuffer_if staging_if();
|
||||
|
||||
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs;
|
||||
|
||||
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
|
||||
|
||||
wire inuse_rd = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd];
|
||||
wire inuse_rs1 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1];
|
||||
wire inuse_rs2 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2];
|
||||
wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3];
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
|
||||
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
|
||||
|
||||
reg [`SFU_WIDTH-1:0] sfu_type;
|
||||
always @(*) begin
|
||||
inuse_regs_n = inuse_regs;
|
||||
ready_masks_n = ready_masks;
|
||||
if (writeback_fire) begin
|
||||
inuse_regs_n[writeback_if[i].data.wis][writeback_if[i].data.rd] = 0;
|
||||
ready_masks_n |= {4{(ISSUE_RATIO == 0) || writeback_if[i].data.wis == staging_if.data.wis}}
|
||||
& {(writeback_if[i].data.rd == staging_if.data.rd),
|
||||
(writeback_if[i].data.rd == staging_if.data.rs1),
|
||||
(writeback_if[i].data.rd == staging_if.data.rs2),
|
||||
(writeback_if[i].data.rd == staging_if.data.rs3)};
|
||||
end
|
||||
if (staging_if.valid && staging_if.ready && staging_if.data.wb) begin
|
||||
inuse_regs_n[staging_if.data.wis][staging_if.data.rd] = 1;
|
||||
ready_masks_n = '0;
|
||||
case (scoreboard_if[i].data.op_type)
|
||||
`INST_SFU_CSRRW,
|
||||
`INST_SFU_CSRRS,
|
||||
`INST_SFU_CSRRC: sfu_type = `SFU_CSRS;
|
||||
default: sfu_type = `SFU_WCTL;
|
||||
endcase
|
||||
end
|
||||
|
||||
always @(*) begin
|
||||
perf_issue_units_per_cycle[i] = '0;
|
||||
perf_issue_sfu_per_cycle[i] = '0;
|
||||
if (ibuffer_if[i].valid) begin
|
||||
if (inuse_rd) begin
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
|
||||
end
|
||||
end
|
||||
if (inuse_rs1) begin
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
|
||||
end
|
||||
end
|
||||
if (inuse_rs2) begin
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
|
||||
end
|
||||
end
|
||||
if (inuse_rs3) begin
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin
|
||||
ready_masks_n = ~{inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd],
|
||||
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1],
|
||||
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2],
|
||||
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]};
|
||||
end
|
||||
end
|
||||
end
|
||||
assign perf_issue_stalls_per_cycle[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
|
||||
`endif
|
||||
|
||||
reg [DATAW-1:0] data_out_r;
|
||||
reg valid_out_r;
|
||||
wire ready_out;
|
||||
|
||||
wire [3:0] ready_masks = ~{inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
|
||||
wire deps_ready = (& ready_masks);
|
||||
|
||||
wire valid_in = ibuffer_if[i].valid && deps_ready;
|
||||
wire ready_in = ~valid_out_r && deps_ready;
|
||||
wire [DATAW-1:0] data_in = ibuffer_if[i].data;
|
||||
|
||||
assign ready_out = scoreboard_if[i].ready;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
inuse_regs <= '0;
|
||||
ready_masks <= '0;
|
||||
end else begin
|
||||
inuse_regs <= inuse_regs_n;
|
||||
ready_masks <= ready_masks_n;
|
||||
valid_out_r <= 0;
|
||||
inuse_regs <= '0;
|
||||
end else begin
|
||||
if (writeback_fire) begin
|
||||
inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0;
|
||||
end
|
||||
if (~valid_out_r) begin
|
||||
valid_out_r <= valid_in;
|
||||
end else if (ready_out) begin
|
||||
if (scoreboard_if[i].data.wb) begin
|
||||
inuse_regs[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= 1;
|
||||
`ifdef PERF_ENABLE
|
||||
inuse_units[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= scoreboard_if[i].data.ex_type;
|
||||
if (scoreboard_if[i].data.ex_type == `EX_SFU) begin
|
||||
inuse_sfu[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= sfu_type;
|
||||
end
|
||||
`endif
|
||||
end
|
||||
valid_out_r <= 0;
|
||||
end
|
||||
end
|
||||
if (~valid_out_r) begin
|
||||
data_out_r <= data_in;
|
||||
end
|
||||
end
|
||||
|
||||
// staging buffer
|
||||
assign ibuffer_if[i].ready = ready_in;
|
||||
assign scoreboard_if[i].valid = valid_out_r;
|
||||
assign scoreboard_if[i].data = data_out_r;
|
||||
|
||||
`RESET_RELAY (stg_buf_reset, reset);
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW)
|
||||
) stg_buf (
|
||||
.clk (clk),
|
||||
.reset (stg_buf_reset),
|
||||
.valid_in (ibuffer_if[i].valid),
|
||||
.ready_in (ibuffer_if[i].ready),
|
||||
.data_in (ibuffer_if[i].data),
|
||||
.data_out (staging_if.data),
|
||||
.valid_out (staging_if.valid),
|
||||
.ready_out (staging_if.ready)
|
||||
);
|
||||
`ifdef SIMULATION
|
||||
reg [31:0] timeout_ctr;
|
||||
|
||||
// output buffer
|
||||
|
||||
wire valid_stg, ready_stg;
|
||||
wire regs_ready = (& ready_masks);
|
||||
assign valid_stg = staging_if.valid && regs_ready;
|
||||
assign staging_if.ready = ready_stg && regs_ready;
|
||||
|
||||
`RESET_RELAY (out_buf_reset, reset);
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (2),
|
||||
.OUT_REG (2)
|
||||
) out_buf (
|
||||
.clk (clk),
|
||||
.reset (out_buf_reset),
|
||||
.valid_in (valid_stg),
|
||||
.ready_in (ready_stg),
|
||||
.data_in (staging_if.data),
|
||||
.data_out (scoreboard_if[i].data),
|
||||
.valid_out (scoreboard_if[i].valid),
|
||||
.ready_out (scoreboard_if[i].ready)
|
||||
);
|
||||
|
||||
reg [31:0] timeout_ctr;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
timeout_ctr <= '0;
|
||||
end else begin
|
||||
if (staging_if.valid && ~regs_ready) begin
|
||||
if (ibuffer_if[i].valid && ~ibuffer_if[i].ready) begin
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE
|
||||
`TRACE(3, ("%d: *** core%0d-scoreboard-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
|
||||
$time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr,
|
||||
~ready_masks, staging_if.data.uuid));
|
||||
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
|
||||
~ready_masks, ibuffer_if[i].data.uuid));
|
||||
`endif
|
||||
timeout_ctr <= timeout_ctr + 1;
|
||||
end else if (staging_if.valid && staging_if.ready) begin
|
||||
end else if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin
|
||||
timeout_ctr <= '0;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
|
||||
("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
|
||||
$time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr,
|
||||
~ready_masks, staging_if.data.uuid));
|
||||
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
|
||||
~ready_masks, ibuffer_if[i].data.uuid));
|
||||
|
||||
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0,
|
||||
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
|
||||
$time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.PC, writeback_if[i].data.tmask, writeback_if[i].data.rd, writeback_if[i].data.uuid));
|
||||
end
|
||||
`endif
|
||||
|
||||
end
|
||||
|
||||
endmodule
|
||||
|
||||
@@ -48,7 +48,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
||||
localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `XLEN + PID_WIDTH + 1 + 1;
|
||||
localparam RSP_ARB_SIZE = 1 + 1;
|
||||
localparam RSP_ARB_IDX_WCTL = 0;
|
||||
localparam RSP_ARB_IDX_CSR = 1;
|
||||
localparam RSP_ARB_IDX_CSRS = 1;
|
||||
|
||||
VX_execute_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
@@ -71,9 +71,6 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
||||
wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in;
|
||||
wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_sfu_perf_if sfu_perf_if();
|
||||
`endif
|
||||
|
||||
// Warp control block
|
||||
VX_execute_if #(
|
||||
@@ -129,7 +126,6 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_if),
|
||||
.pipeline_perf_if(pipeline_perf_if),
|
||||
.sfu_perf_if (sfu_perf_if),
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
@@ -141,21 +137,21 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
||||
.commit_if (csr_commit_if)
|
||||
);
|
||||
|
||||
assign rsp_arb_valid_in[RSP_ARB_IDX_CSR] = csr_commit_if.valid;
|
||||
assign rsp_arb_data_in[RSP_ARB_IDX_CSR] = csr_commit_if.data;
|
||||
assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSR];
|
||||
assign rsp_arb_valid_in[RSP_ARB_IDX_CSRS] = csr_commit_if.valid;
|
||||
assign rsp_arb_data_in[RSP_ARB_IDX_CSRS] = csr_commit_if.data;
|
||||
assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSRS];
|
||||
|
||||
// can accept new request?
|
||||
|
||||
reg sfu_req_ready;
|
||||
always @(*) begin
|
||||
case (execute_if[0].data.op_type)
|
||||
`INST_SFU_CSRRW,
|
||||
`INST_SFU_CSRRS,
|
||||
`INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready;
|
||||
`INST_SFU_CSRRW,
|
||||
`INST_SFU_CSRRS,
|
||||
`INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready;
|
||||
default: sfu_req_ready = wctl_execute_if.ready;
|
||||
endcase
|
||||
end
|
||||
end
|
||||
assign execute_if[0].ready = sfu_req_ready;
|
||||
|
||||
// response arbitration
|
||||
@@ -170,7 +166,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
||||
.NUM_INPUTS (RSP_ARB_SIZE),
|
||||
.DATAW (RSP_ARB_DATAW),
|
||||
.ARBITER ("R"),
|
||||
.OUT_REG (1)
|
||||
.OUT_REG (3)
|
||||
) rsp_arb (
|
||||
.clk (clk),
|
||||
.reset (commit_reset),
|
||||
@@ -186,7 +182,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
||||
VX_gather_unit #(
|
||||
.BLOCK_SIZE (BLOCK_SIZE),
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.OUT_REG (3)
|
||||
.OUT_REG (1)
|
||||
) gather_unit (
|
||||
.clk (clk),
|
||||
.reset (commit_reset),
|
||||
@@ -194,16 +190,4 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
||||
.commit_out_if (commit_if)
|
||||
);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`PERF_CTR_BITS-1:0] perf_wctl_stalls;
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_wctl_stalls <= '0;
|
||||
end else begin
|
||||
perf_wctl_stalls <= perf_wctl_stalls + `PERF_CTR_BITS'(wctl_execute_if.valid && ~wctl_execute_if.ready);
|
||||
end
|
||||
end
|
||||
assign sfu_perf_if.wctl_stalls = perf_wctl_stalls;
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
||||
@@ -14,9 +14,7 @@
|
||||
`ifndef VX_TRACE_VH
|
||||
`define VX_TRACE_VH
|
||||
|
||||
`ifndef SYNTHESIS
|
||||
|
||||
`include "VX_define.vh"
|
||||
`ifdef SIMULATION
|
||||
|
||||
task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type);
|
||||
case (ex_type)
|
||||
|
||||
@@ -29,7 +29,6 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
|
||||
);
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
localparam LANE_BITS = `CLOG2(NUM_LANES);
|
||||
localparam LANE_WIDTH = `UP(LANE_BITS);
|
||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||
localparam PID_WIDTH = `UP(PID_BITS);
|
||||
localparam WCTL_WIDTH = $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t);
|
||||
@@ -50,7 +49,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
|
||||
wire is_join = (execute_if.data.op_type == `INST_SFU_JOIN);
|
||||
wire is_bar = (execute_if.data.op_type == `INST_SFU_BAR);
|
||||
|
||||
wire [LANE_WIDTH-1:0] tid;
|
||||
wire [`UP(LANE_BITS)-1:0] tid;
|
||||
if (LANE_BITS != 0) begin
|
||||
assign tid = execute_if.data.tid[0 +: LANE_BITS];
|
||||
end else begin
|
||||
|
||||
@@ -52,30 +52,24 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
||||
|
||||
localparam MAN_BITS = 23;
|
||||
localparam EXP_BITS = 8;
|
||||
localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
|
||||
|
||||
localparam logic [EXP_BITS-1:0] QNAN_EXPONENT = 2**EXP_BITS-1;
|
||||
localparam logic [MAN_BITS-1:0] QNAN_MANTISSA = 2**(MAN_BITS-1);
|
||||
localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
|
||||
|
||||
// Use 32-bit integer
|
||||
localparam MAX_INT_WIDTH = 32;
|
||||
localparam INT_WIDTH = 32;
|
||||
|
||||
// The internal mantissa includes normal bit or an entire integer
|
||||
localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, MAX_INT_WIDTH);
|
||||
localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, INT_WIDTH);
|
||||
|
||||
// The lower 2p+3 bits of the internal FMA result will be needed for leading-zero detection
|
||||
localparam LZC_RESULT_WIDTH = `CLOG2(INT_MAN_WIDTH);
|
||||
|
||||
// The internal exponent must be able to represent the smallest denormal input value as signed
|
||||
// or the number of bits in an integer
|
||||
localparam INT_EXP_WIDTH = `MAX(`CLOG2(MAX_INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1;
|
||||
|
||||
// shift amount for denormalization
|
||||
localparam SHAMT_BITS = `CLOG2(INT_MAN_WIDTH+1);
|
||||
localparam INT_EXP_WIDTH = `MAX(`CLOG2(INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1;
|
||||
|
||||
localparam FMT_SHIFT_COMPENSATION = INT_MAN_WIDTH - 1 - MAN_BITS;
|
||||
localparam NUM_FP_STICKY = 2 * INT_MAN_WIDTH - MAN_BITS - 1; // removed mantissa, 1. and R
|
||||
localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - MAX_INT_WIDTH; // removed int and R
|
||||
localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - INT_WIDTH; // removed int and R
|
||||
|
||||
// Input processing
|
||||
|
||||
@@ -86,8 +80,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
||||
.EXP_BITS (EXP_BITS),
|
||||
.MAN_BITS (MAN_BITS)
|
||||
) fp_class (
|
||||
.exp_i (dataa[i][30:23]),
|
||||
.man_i (dataa[i][22:0]),
|
||||
.exp_i (dataa[i][INT_WIDTH-2:MAN_BITS]),
|
||||
.man_i (dataa[i][MAN_BITS-1:0]),
|
||||
.clss_o (fclass[i])
|
||||
);
|
||||
end
|
||||
@@ -97,27 +91,25 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
||||
wire [NUM_LANES-1:0] input_sign;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
wire [INT_MAN_WIDTH-1:0] int_mantissa;
|
||||
wire [INT_MAN_WIDTH-1:0] fmt_mantissa;
|
||||
wire fmt_sign = dataa[i][31];
|
||||
wire int_sign = dataa[i][31] && is_signed;
|
||||
assign int_mantissa = int_sign ? (-dataa[i]) : dataa[i];
|
||||
assign fmt_mantissa = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]});
|
||||
wire i2f_sign = dataa[i][INT_WIDTH-1];
|
||||
wire f2i_sign = dataa[i][INT_WIDTH-1] && is_signed;
|
||||
wire [INT_MAN_WIDTH-1:0] f2i_mantissa = f2i_sign ? (-dataa[i]) : dataa[i];
|
||||
wire [INT_MAN_WIDTH-1:0] i2f_mantissa = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]});
|
||||
assign input_exp[i] = {1'b0, dataa[i][MAN_BITS +: EXP_BITS]} + INT_EXP_WIDTH'({1'b0, fclass[i].is_subnormal});
|
||||
assign input_mant[i] = is_itof ? int_mantissa : fmt_mantissa;
|
||||
assign input_sign[i] = is_itof ? int_sign : fmt_sign;
|
||||
assign input_mant[i] = is_itof ? f2i_mantissa : i2f_mantissa;
|
||||
assign input_sign[i] = is_itof ? f2i_sign : i2f_sign;
|
||||
end
|
||||
|
||||
// Pipeline stage0
|
||||
|
||||
wire valid_in_s0;
|
||||
wire [NUM_LANES-1:0] lane_mask_s0;
|
||||
wire [TAGW-1:0] tag_in_s0;
|
||||
wire is_itof_s0;
|
||||
wire unsigned_s0;
|
||||
wire [2:0] rnd_mode_s0;
|
||||
wire valid_in_s0;
|
||||
wire [NUM_LANES-1:0] lane_mask_s0;
|
||||
wire [TAGW-1:0] tag_in_s0;
|
||||
wire is_itof_s0;
|
||||
wire is_signed_s0;
|
||||
wire [2:0] rnd_mode_s0;
|
||||
fclass_t [NUM_LANES-1:0] fclass_s0;
|
||||
wire [NUM_LANES-1:0] input_sign_s0;
|
||||
wire [NUM_LANES-1:0] input_sign_s0;
|
||||
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent_s0;
|
||||
wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant_s0;
|
||||
|
||||
@@ -130,8 +122,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall),
|
||||
.data_in ({valid_in, lane_mask, tag_in, is_itof, !is_signed, frm, fclass, input_sign, input_exp, input_mant}),
|
||||
.data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
|
||||
.data_in ({valid_in, lane_mask, tag_in, is_itof, is_signed, frm, fclass, input_sign, input_exp, input_mant}),
|
||||
.data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
|
||||
);
|
||||
|
||||
// Normalization
|
||||
@@ -159,22 +151,22 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
||||
assign input_mant_n_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i];
|
||||
|
||||
// Unbias exponent and compensate for shift
|
||||
wire [INT_EXP_WIDTH-1:0] fp_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
|
||||
wire [INT_EXP_WIDTH-1:0] int_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
|
||||
assign input_exp_n_s0[i] = is_itof_s0 ? int_input_exp_s0 : fp_input_exp_s0;
|
||||
wire [INT_EXP_WIDTH-1:0] i2f_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
|
||||
wire [INT_EXP_WIDTH-1:0] f2i_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
|
||||
assign input_exp_n_s0[i] = is_itof_s0 ? f2i_input_exp_s0 : i2f_input_exp_s0;
|
||||
end
|
||||
|
||||
// Pipeline stage1
|
||||
|
||||
wire valid_in_s1;
|
||||
wire [NUM_LANES-1:0] lane_mask_s1;
|
||||
wire [TAGW-1:0] tag_in_s1;
|
||||
wire is_itof_s1;
|
||||
wire unsigned_s1;
|
||||
wire [2:0] rnd_mode_s1;
|
||||
wire valid_in_s1;
|
||||
wire [NUM_LANES-1:0] lane_mask_s1;
|
||||
wire [TAGW-1:0] tag_in_s1;
|
||||
wire is_itof_s1;
|
||||
wire is_signed_s1;
|
||||
wire [2:0] rnd_mode_s1;
|
||||
fclass_t [NUM_LANES-1:0] fclass_s1;
|
||||
wire [NUM_LANES-1:0] input_sign_s1;
|
||||
wire [NUM_LANES-1:0] mant_is_zero_s1;
|
||||
wire [NUM_LANES-1:0] input_sign_s1;
|
||||
wire [NUM_LANES-1:0] mant_is_zero_s1;
|
||||
wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_s1;
|
||||
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_s1;
|
||||
|
||||
@@ -185,76 +177,49 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall),
|
||||
.data_in ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, mant_is_zero_s0, input_mant_n_s0, input_exp_n_s0}),
|
||||
.data_out ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
|
||||
.data_in ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, mant_is_zero_s0, input_mant_n_s0, input_exp_n_s0}),
|
||||
.data_out ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
|
||||
);
|
||||
|
||||
// Perform adjustments to mantissa and exponent
|
||||
|
||||
wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s1;
|
||||
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s1;
|
||||
wire [NUM_LANES-1:0] of_before_round_s1;
|
||||
wire [NUM_LANES-1:0] of_before_round_s1;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
reg [2*INT_MAN_WIDTH:0] preshift_mant_s1; // mantissa before final shift
|
||||
reg [SHAMT_BITS-1:0] denorm_shamt_s1; // shift amount for denormalization
|
||||
reg [INT_EXP_WIDTH-1:0] final_exp_tmp_s1; // after eventual adjustments
|
||||
reg of_before_round_tmp_s1;
|
||||
|
||||
wire [INT_EXP_WIDTH-1:0] denorm_shamt = INT_EXP_WIDTH'(INT_WIDTH-1) - input_exp_s1[i];
|
||||
wire overflow = ($signed(denorm_shamt) <= -$signed(INT_EXP_WIDTH'(!is_signed_s1)));
|
||||
wire underflow = ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1)));
|
||||
reg [INT_EXP_WIDTH-1:0] denorm_shamt_q;
|
||||
always @(*) begin
|
||||
final_exp_tmp_s1 = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS); // take exponent as is, only look at lower bits
|
||||
preshift_mant_s1 = {input_mant_s1[i], 33'b0};
|
||||
denorm_shamt_s1 = '0;
|
||||
of_before_round_tmp_s1 = 1'b0;
|
||||
|
||||
if (is_itof_s1) begin
|
||||
if ($signed(input_exp_s1[i]) >= INT_EXP_WIDTH'($signed(2**EXP_BITS-1-EXP_BIAS))) begin
|
||||
// Overflow or infinities (for proper rounding)
|
||||
final_exp_tmp_s1 = (2**EXP_BITS-2); // largest normal value
|
||||
preshift_mant_s1 = ~0; // largest normal value and RS bits set
|
||||
of_before_round_tmp_s1 = 1'b1;
|
||||
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-MAN_BITS-EXP_BIAS))) begin
|
||||
// Limit the shift to retain sticky bits
|
||||
final_exp_tmp_s1 = '0; // denormal result
|
||||
denorm_shamt_s1 = (2 + MAN_BITS); // to sticky
|
||||
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(1-EXP_BIAS))) begin
|
||||
// Denormalize underflowing values
|
||||
final_exp_tmp_s1 = '0; // denormal result
|
||||
denorm_shamt_s1 = SHAMT_BITS'(1-EXP_BIAS) - SHAMT_BITS'(input_exp_s1[i]); // adjust right shifting
|
||||
end
|
||||
if (overflow) begin
|
||||
denorm_shamt_q = '0;
|
||||
end else if (underflow) begin
|
||||
denorm_shamt_q = INT_WIDTH+1;
|
||||
end else begin
|
||||
if ($signed(input_exp_s1[i]) >= $signed(INT_EXP_WIDTH'(MAX_INT_WIDTH-1) + INT_EXP_WIDTH'(unsigned_s1))) begin
|
||||
// overflow: when converting to unsigned the range is larger by one
|
||||
of_before_round_tmp_s1 = 1'b1;
|
||||
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1))) begin
|
||||
// underflow
|
||||
denorm_shamt_s1 = MAX_INT_WIDTH+1; // all bits go to the sticky
|
||||
end else begin
|
||||
// By default right shift mantissa to be an integer
|
||||
denorm_shamt_s1 = SHAMT_BITS'(MAX_INT_WIDTH-1) - SHAMT_BITS'(input_exp_s1[i]);
|
||||
end
|
||||
denorm_shamt_q = denorm_shamt;
|
||||
end
|
||||
end
|
||||
|
||||
assign destination_mant_s1[i] = preshift_mant_s1 >> denorm_shamt_s1;
|
||||
assign final_exp_s1[i] = final_exp_tmp_s1;
|
||||
assign of_before_round_s1[i] = of_before_round_tmp_s1;
|
||||
assign destination_mant_s1[i] = is_itof_s1 ? {input_mant_s1[i], 33'b0} : ({input_mant_s1[i], 33'b0} >> denorm_shamt_q);
|
||||
assign final_exp_s1[i] = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS);
|
||||
assign of_before_round_s1[i] = overflow;
|
||||
end
|
||||
|
||||
// Pipeline stage2
|
||||
|
||||
wire valid_in_s2;
|
||||
wire [NUM_LANES-1:0] lane_mask_s2;
|
||||
wire [TAGW-1:0] tag_in_s2;
|
||||
wire is_itof_s2;
|
||||
wire unsigned_s2;
|
||||
wire [2:0] rnd_mode_s2;
|
||||
wire valid_in_s2;
|
||||
wire [NUM_LANES-1:0] lane_mask_s2;
|
||||
wire [TAGW-1:0] tag_in_s2;
|
||||
wire is_itof_s2;
|
||||
wire is_signed_s2;
|
||||
wire [2:0] rnd_mode_s2;
|
||||
fclass_t [NUM_LANES-1:0] fclass_s2;
|
||||
wire [NUM_LANES-1:0] mant_is_zero_s2;
|
||||
wire [NUM_LANES-1:0] input_sign_s2;
|
||||
wire [NUM_LANES-1:0] mant_is_zero_s2;
|
||||
wire [NUM_LANES-1:0] input_sign_s2;
|
||||
wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s2;
|
||||
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s2;
|
||||
wire [NUM_LANES-1:0] of_before_round_s2;
|
||||
wire [NUM_LANES-1:0] of_before_round_s2;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + NUM_LANES + TAGW + 1 + 1 + `INST_FRM_BITS + NUM_LANES * ($bits(fclass_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)),
|
||||
@@ -263,37 +228,37 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall),
|
||||
.data_in ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
|
||||
.data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
|
||||
.data_in ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
|
||||
.data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, is_signed_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
|
||||
);
|
||||
|
||||
wire [NUM_LANES-1:0] rounded_sign_s2;
|
||||
wire [NUM_LANES-1:0][31:0] rounded_abs_s2; // absolute value of result after rounding
|
||||
wire [NUM_LANES-1:0] int_round_has_sticky_s2;
|
||||
wire [NUM_LANES-1:0] fp_round_has_sticky_s2;
|
||||
wire [NUM_LANES-1:0] rounded_sign_s2;
|
||||
wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s2; // absolute value of result after rounding
|
||||
wire [NUM_LANES-1:0] f2i_round_has_sticky_s2;
|
||||
wire [NUM_LANES-1:0] i2f_round_has_sticky_s2;
|
||||
|
||||
// Rouding and classification
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments
|
||||
wire [MAX_INT_WIDTH-1:0] final_int_s2; // integer shifted in position
|
||||
wire [1:0] round_sticky_bits_s2;
|
||||
wire [31:0] fmt_pre_round_abs_s2;
|
||||
wire [31:0] pre_round_abs_s2;
|
||||
wire [1:0] int_round_sticky_bits_s2, fp_round_sticky_bits_s2;
|
||||
wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments
|
||||
wire [INT_WIDTH-1:0] final_int_s2; // integer shifted in position
|
||||
wire [1:0] round_sticky_bits_s2;
|
||||
wire [INT_WIDTH-1:0] fmt_pre_round_abs_s2;
|
||||
wire [INT_WIDTH-1:0] pre_round_abs_s2;
|
||||
wire [1:0] f2i_round_sticky_bits_s2, i2f_round_sticky_bits_s2;
|
||||
|
||||
// Extract final mantissa and round bit, discard the normal bit (for FP)
|
||||
assign {final_mant_s2, fp_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
|
||||
assign {final_int_s2, int_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH : 2*INT_MAN_WIDTH - (MAX_INT_WIDTH+1) + 1];
|
||||
assign {final_mant_s2, i2f_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
|
||||
assign {final_int_s2, f2i_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH : 2*INT_MAN_WIDTH - (INT_WIDTH+1) + 1];
|
||||
|
||||
// Collapse sticky bits
|
||||
assign fp_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
|
||||
assign int_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
|
||||
assign fp_round_has_sticky_s2[i] = (| fp_round_sticky_bits_s2);
|
||||
assign int_round_has_sticky_s2[i] = (| int_round_sticky_bits_s2);
|
||||
assign i2f_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
|
||||
assign f2i_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
|
||||
assign i2f_round_has_sticky_s2[i] = (| i2f_round_sticky_bits_s2);
|
||||
assign f2i_round_has_sticky_s2[i] = (| f2i_round_sticky_bits_s2);
|
||||
|
||||
// select RS bits for destination operation
|
||||
assign round_sticky_bits_s2 = is_itof_s2 ? fp_round_sticky_bits_s2 : int_round_sticky_bits_s2;
|
||||
assign round_sticky_bits_s2 = is_itof_s2 ? i2f_round_sticky_bits_s2 : f2i_round_sticky_bits_s2;
|
||||
|
||||
// Pack exponent and mantissa into proper rounding form
|
||||
assign fmt_pre_round_abs_s2 = {1'b0, final_exp_s2[i][EXP_BITS-1:0], final_mant_s2[MAN_BITS-1:0]};
|
||||
@@ -322,15 +287,15 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
||||
wire [NUM_LANES-1:0] lane_mask_s3;
|
||||
wire [TAGW-1:0] tag_in_s3;
|
||||
wire is_itof_s3;
|
||||
wire unsigned_s3;
|
||||
wire is_signed_s3;
|
||||
fclass_t [NUM_LANES-1:0] fclass_s3;
|
||||
wire [NUM_LANES-1:0] mant_is_zero_s3;
|
||||
wire [NUM_LANES-1:0] input_sign_s3;
|
||||
wire [NUM_LANES-1:0] rounded_sign_s3;
|
||||
wire [NUM_LANES-1:0][31:0] rounded_abs_s3;
|
||||
wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s3;
|
||||
wire [NUM_LANES-1:0] of_before_round_s3;
|
||||
wire [NUM_LANES-1:0] int_round_has_sticky_s3;
|
||||
wire [NUM_LANES-1:0] fp_round_has_sticky_s3;
|
||||
wire [NUM_LANES-1:0] f2i_round_has_sticky_s3;
|
||||
wire [NUM_LANES-1:0] i2f_round_has_sticky_s3;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + NUM_LANES + TAGW + 1 + 1 + NUM_LANES * ($bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1)),
|
||||
@@ -339,105 +304,71 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall),
|
||||
.data_in ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, int_round_has_sticky_s2, fp_round_has_sticky_s2}),
|
||||
.data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, unsigned_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, int_round_has_sticky_s3, fp_round_has_sticky_s3})
|
||||
.data_in ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, is_signed_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, f2i_round_has_sticky_s2, i2f_round_has_sticky_s2}),
|
||||
.data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, is_signed_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, f2i_round_has_sticky_s3, i2f_round_has_sticky_s3})
|
||||
);
|
||||
|
||||
wire [NUM_LANES-1:0] of_after_round_s3;
|
||||
wire [NUM_LANES-1:0] uf_after_round_s3;
|
||||
wire [NUM_LANES-1:0][31:0] fmt_result_s3;
|
||||
wire [NUM_LANES-1:0][31:0] rounded_int_res_s3; // after possible inversion
|
||||
wire [NUM_LANES-1:0][INT_WIDTH-1:0] fmt_result_s3;
|
||||
wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_int_res_s3; // after possible inversion
|
||||
wire [NUM_LANES-1:0] rounded_int_res_zero_s3; // after rounding
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
// Assemble regular result, nan box short ones. Int zeroes need to be detected
|
||||
assign fmt_result_s3[i] = (is_itof_s3 & mant_is_zero_s3[i]) ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};
|
||||
|
||||
// Classification after rounding select by destination format
|
||||
assign uf_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == 0); // denormal
|
||||
assign of_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == ~0); // inf exp.
|
||||
assign fmt_result_s3[i] = mant_is_zero_s3[i] ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};
|
||||
|
||||
// Negative integer result needs to be brought into two's complement
|
||||
assign rounded_int_res_s3[i] = rounded_sign_s3[i] ? (-rounded_abs_s3[i]) : rounded_abs_s3[i];
|
||||
assign rounded_int_res_zero_s3[i] = (rounded_int_res_s3[i] == 0);
|
||||
end
|
||||
|
||||
// FP Special case handling
|
||||
// F2I Special case handling
|
||||
|
||||
wire [NUM_LANES-1:0][31:0] fp_special_result_s3;
|
||||
fflags_t [NUM_LANES-1:0] fp_special_status_s3;
|
||||
wire [NUM_LANES-1:0] fp_result_is_special_s3;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
// Detect special case from source format, I2F casts don't produce a special result
|
||||
assign fp_result_is_special_s3[i] = ~is_itof_s3 & (fclass_s3[i].is_zero | fclass_s3[i].is_nan);
|
||||
|
||||
// Signalling input NaNs raise invalid flag, otherwise no flags set
|
||||
assign fp_special_status_s3[i] = fclass_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0; // invalid operation
|
||||
|
||||
// Assemble result according to destination format
|
||||
assign fp_special_result_s3[i] = fclass_s3[i].is_zero ? (32'(input_sign_s3) << 31) // signed zero
|
||||
: {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
|
||||
end
|
||||
|
||||
// INT Special case handling
|
||||
|
||||
reg [NUM_LANES-1:0][31:0] int_special_result_s3;
|
||||
fflags_t [NUM_LANES-1:0] int_special_status_s3;
|
||||
wire [NUM_LANES-1:0] int_result_is_special_s3;
|
||||
reg [NUM_LANES-1:0][INT_WIDTH-1:0] f2i_special_result_s3;
|
||||
fflags_t [NUM_LANES-1:0] f2i_special_status_s3;
|
||||
wire [NUM_LANES-1:0] f2i_result_is_special_s3;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
// Assemble result according to destination format
|
||||
always @(*) begin
|
||||
if (input_sign_s3[i] && !fclass_s3[i].is_nan) begin
|
||||
int_special_result_s3[i][30:0] = '0; // alone yields 2**(31)-1
|
||||
int_special_result_s3[i][31] = ~unsigned_s3; // for unsigned casts yields 2**31
|
||||
f2i_special_result_s3[i][INT_WIDTH-2:0] = '0; // alone yields 2**(31)-1
|
||||
f2i_special_result_s3[i][INT_WIDTH-1] = is_signed_s3; // for unsigned casts yields 2**31
|
||||
end else begin
|
||||
int_special_result_s3[i][30:0] = 2**(31) - 1; // alone yields 2**(31)-1
|
||||
int_special_result_s3[i][31] = unsigned_s3; // for unsigned casts yields 2**31
|
||||
f2i_special_result_s3[i][INT_WIDTH-2:0] = 2**(INT_WIDTH-1) - 1; // alone yields 2**(31)-1
|
||||
f2i_special_result_s3[i][INT_WIDTH-1] = ~is_signed_s3; // for unsigned casts yields 2**31
|
||||
end
|
||||
end
|
||||
|
||||
// Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
|
||||
assign int_result_is_special_s3[i] = fclass_s3[i].is_nan
|
||||
assign f2i_result_is_special_s3[i] = fclass_s3[i].is_nan
|
||||
| fclass_s3[i].is_inf
|
||||
| of_before_round_s3[i]
|
||||
| (input_sign_s3[i] & unsigned_s3 & ~rounded_int_res_zero_s3[i]);
|
||||
| (input_sign_s3[i] & ~is_signed_s3 & ~rounded_int_res_zero_s3[i]);
|
||||
|
||||
// All integer special cases are invalid
|
||||
assign int_special_status_s3[i] = {1'b1, 4'h0};
|
||||
assign f2i_special_status_s3[i] = {1'b1, 4'h0};
|
||||
end
|
||||
|
||||
// Result selection and Output handshake
|
||||
|
||||
fflags_t [NUM_LANES-1:0] tmp_fflags_s3;
|
||||
wire [NUM_LANES-1:0][31:0] tmp_result_s3;
|
||||
wire [NUM_LANES-1:0][INT_WIDTH-1:0] tmp_result_s3;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
fflags_t fp_regular_status_s3, int_regular_status_s3;
|
||||
fflags_t fp_status_s3, int_status_s3;
|
||||
wire [31:0] fp_result_s3, int_result_s3;
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
fflags_t i2f_regular_status_s3, f2i_regular_status_s3;
|
||||
fflags_t i2f_status_s3, f2i_status_s3;
|
||||
|
||||
wire inexact_s3 = is_itof_s3 ? fp_round_has_sticky_s3[i] // overflow is invalid in i2f;
|
||||
: (fp_round_has_sticky_s3[i] || (~fclass_s3[i].is_inf && (of_before_round_s3[i] || of_after_round_s3[i])));
|
||||
|
||||
assign fp_regular_status_s3.NV = is_itof_s3 & (of_before_round_s3[i] | of_after_round_s3[i]); // overflow is invalid for I2F casts
|
||||
assign fp_regular_status_s3.DZ = 1'b0; // no divisions
|
||||
assign fp_regular_status_s3.OF = ~is_itof_s3 & (~fclass_s3[i].is_inf & (of_before_round_s3[i] | of_after_round_s3[i])); // inf casts no OF
|
||||
assign fp_regular_status_s3.UF = uf_after_round_s3[i] & inexact_s3;
|
||||
assign fp_regular_status_s3.NX = inexact_s3;
|
||||
assign i2f_regular_status_s3 = {4'h0, i2f_round_has_sticky_s3[i]};
|
||||
assign f2i_regular_status_s3 = {4'h0, f2i_round_has_sticky_s3[i]};
|
||||
|
||||
assign int_regular_status_s3 = int_round_has_sticky_s3[i] ? {4'h0, 1'b1} : 5'h0;
|
||||
assign i2f_status_s3 = i2f_regular_status_s3;
|
||||
assign f2i_status_s3 = f2i_result_is_special_s3[i] ? f2i_special_status_s3[i] : f2i_regular_status_s3;
|
||||
|
||||
assign fp_result_s3 = fp_result_is_special_s3[i] ? fp_special_result_s3[i] : fmt_result_s3[i];
|
||||
assign int_result_s3 = int_result_is_special_s3[i] ? int_special_result_s3[i] : rounded_int_res_s3[i];
|
||||
wire [INT_WIDTH-1:0] i2f_result_s3 = fmt_result_s3[i];
|
||||
wire [INT_WIDTH-1:0] f2i_result_s3 = f2i_result_is_special_s3[i] ? f2i_special_result_s3[i] : rounded_int_res_s3[i];
|
||||
|
||||
assign fp_status_s3 = fp_result_is_special_s3[i] ? fp_special_status_s3[i] : fp_regular_status_s3;
|
||||
assign int_status_s3 = int_result_is_special_s3[i] ? int_special_status_s3[i] : int_regular_status_s3;
|
||||
|
||||
// Select output depending on special case detection
|
||||
assign tmp_result_s3[i] = is_itof_s3 ? fp_result_s3 : int_result_s3;
|
||||
assign tmp_fflags_s3[i] = is_itof_s3 ? fp_status_s3 : int_status_s3;
|
||||
assign tmp_result_s3[i] = is_itof_s3 ? i2f_result_s3 : f2i_result_s3;
|
||||
assign tmp_fflags_s3[i] = is_itof_s3 ? i2f_status_s3 : f2i_status_s3;
|
||||
end
|
||||
|
||||
assign stall = ~ready_out && valid_out;
|
||||
@@ -457,7 +388,6 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
||||
);
|
||||
|
||||
assign ready_in = ~stall;
|
||||
|
||||
assign has_fflags = 1'b1;
|
||||
|
||||
endmodule
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
`ifndef SYNTHESIS
|
||||
`ifdef SV_DPI
|
||||
`include "float_dpi.vh"
|
||||
`endif
|
||||
|
||||
|
||||
@@ -54,7 +54,6 @@ module VX_fpu_rounding #(
|
||||
2'b01: round_up = 1'b0; // < ulp/2 away, round down
|
||||
2'b10: round_up = abs_value_i[0]; // = ulp/2 away, round towards even result
|
||||
2'b11: round_up = 1'b1; // > ulp/2 away, round up
|
||||
default: round_up = 1'bx;
|
||||
endcase
|
||||
`INST_FRM_RTZ: round_up = 1'b0; // always round down
|
||||
`INST_FRM_RDN: round_up = (| round_sticky_bits_i) & sign_i; // to 0 if +, away if -
|
||||
|
||||
@@ -14,26 +14,38 @@
|
||||
`include "VX_define.vh"
|
||||
|
||||
interface VX_pipeline_perf_if ();
|
||||
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] dsp_stalls [`NUM_EX_UNITS];
|
||||
wire [`PERF_CTR_BITS-1:0] sched_idles;
|
||||
wire [`PERF_CTR_BITS-1:0] sched_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] units_uses [`NUM_EX_UNITS];
|
||||
wire [`PERF_CTR_BITS-1:0] sfu_uses [`NUM_SFU_UNITS];
|
||||
|
||||
wire [`PERF_CTR_BITS-1:0] ifetches;
|
||||
wire [`PERF_CTR_BITS-1:0] loads;
|
||||
wire [`PERF_CTR_BITS-1:0] stores;
|
||||
wire [`PERF_CTR_BITS-1:0] ifetch_latency;
|
||||
wire [`PERF_CTR_BITS-1:0] load_latency;
|
||||
wire [`PERF_CTR_BITS-1:0] ifetches;
|
||||
wire [`PERF_CTR_BITS-1:0] loads;
|
||||
wire [`PERF_CTR_BITS-1:0] stores;
|
||||
wire [`PERF_CTR_BITS-1:0] ifetch_latency;
|
||||
wire [`PERF_CTR_BITS-1:0] load_latency;
|
||||
|
||||
modport schedule (
|
||||
output sched_idles,
|
||||
output sched_stalls
|
||||
);
|
||||
|
||||
modport issue (
|
||||
output ibf_stalls,
|
||||
output scb_stalls,
|
||||
output dsp_stalls
|
||||
);
|
||||
output units_uses,
|
||||
output sfu_uses
|
||||
);
|
||||
|
||||
modport slave (
|
||||
input sched_idles,
|
||||
input sched_stalls,
|
||||
input ibf_stalls,
|
||||
input scb_stalls,
|
||||
input dsp_stalls,
|
||||
input units_uses,
|
||||
input sfu_uses,
|
||||
input ifetches,
|
||||
input loads,
|
||||
input stores,
|
||||
|
||||
@@ -21,8 +21,8 @@ module VX_avs_adapter #(
|
||||
parameter NUM_BANKS = 1,
|
||||
parameter TAG_WIDTH = 1,
|
||||
parameter RD_QUEUE_SIZE = 1,
|
||||
parameter OUT_REG_REQ = 0,
|
||||
parameter OUT_REG_RSP = 0
|
||||
parameter OUT_REG_REQ = 0,
|
||||
parameter OUT_REG_RSP = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
@@ -20,7 +20,7 @@ module VX_axi_adapter #(
|
||||
parameter TAG_WIDTH = 8,
|
||||
parameter NUM_BANKS = 1,
|
||||
parameter AVS_ADDR_WIDTH = (ADDR_WIDTH - `CLOG2(DATA_WIDTH/8)),
|
||||
parameter OUT_REG_RSP = 0
|
||||
parameter OUT_REG_RSP = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
@@ -201,9 +201,7 @@ module VX_fifo_queue #(
|
||||
rd_ptr_r <= '0;
|
||||
rd_ptr_n_r <= 1;
|
||||
end else begin
|
||||
if (push) begin
|
||||
wr_ptr_r <= wr_ptr_r + ADDRW'(1);
|
||||
end
|
||||
wr_ptr_r <= wr_ptr_r + ADDRW'(push);
|
||||
if (pop) begin
|
||||
rd_ptr_r <= rd_ptr_n_r;
|
||||
if (DEPTH > 2) begin
|
||||
|
||||
@@ -21,8 +21,8 @@ module VX_mem_adapter #(
|
||||
parameter DST_ADDR_WIDTH = 1,
|
||||
parameter SRC_TAG_WIDTH = 1,
|
||||
parameter DST_TAG_WIDTH = 1,
|
||||
parameter OUT_REG_REQ = 0,
|
||||
parameter OUT_REG_RSP = 0
|
||||
parameter OUT_REG_REQ = 0,
|
||||
parameter OUT_REG_RSP = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
@@ -21,7 +21,7 @@ module VX_stream_arb #(
|
||||
parameter `STRING ARBITER = "P",
|
||||
parameter LOCK_ENABLE = 1,
|
||||
parameter MAX_FANOUT = `MAX_FANOUT,
|
||||
parameter OUT_REG = 0 ,
|
||||
parameter OUT_REG = 0 ,
|
||||
parameter NUM_REQS = (NUM_INPUTS + NUM_OUTPUTS - 1) / NUM_OUTPUTS,
|
||||
parameter LOG_NUM_REQS = `CLOG2(NUM_REQS),
|
||||
parameter NUM_REQS_W = `UP(LOG_NUM_REQS)
|
||||
|
||||
@@ -173,25 +173,27 @@ module VX_stream_xbar #(
|
||||
end
|
||||
|
||||
// compute inputs collision
|
||||
// we have a collision when there exists a valid transfer with mutiple input candicates
|
||||
// we caount the unique duplicates each cycle.
|
||||
// we have a collision when there exists a valid transfer with multiple input candicates
|
||||
// we count the unique duplicates each cycle.
|
||||
|
||||
reg [NUM_INPUTS-1:0] per_cycle_collision, per_cycle_collision_r;
|
||||
wire [`CLOG2(NUM_INPUTS+1)-1:0] collision_count;
|
||||
reg [PERF_CTR_BITS-1:0] collisions_r;
|
||||
reg [NUM_INPUTS-1:0] per_cycle_collision;
|
||||
|
||||
always @(*) begin
|
||||
per_cycle_collision = 0;
|
||||
for (integer i = 0; i < NUM_INPUTS; ++i) begin
|
||||
for (integer j = 1; j < (NUM_INPUTS-i); ++j) begin
|
||||
if (valid_in[i] && valid_in[j+i] && sel_in[i] == sel_in[j+i]) begin
|
||||
per_cycle_collision[i] |= ready_in[i] | ready_in[j+i];
|
||||
end
|
||||
per_cycle_collision[i] |= valid_in[i]
|
||||
&& valid_in[j+i]
|
||||
&& (sel_in[i] == sel_in[j+i])
|
||||
&& (ready_in[i] | ready_in[j+i]);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
wire [`CLOG2(NUM_INPUTS+1)-1:0] collision_count;
|
||||
`POP_COUNT(collision_count, per_cycle_collision);
|
||||
|
||||
`BUFFER(per_cycle_collision_r, per_cycle_collision);
|
||||
`POP_COUNT(collision_count, per_cycle_collision_r);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
|
||||
module VX_gbar_arb #(
|
||||
parameter NUM_REQS = 1,
|
||||
parameter OUT_REG = 0,
|
||||
parameter OUT_REG = 0,
|
||||
parameter `STRING ARBITER = "R"
|
||||
) (
|
||||
input wire clk,
|
||||
|
||||
@@ -21,8 +21,8 @@ module VX_mem_arb #(
|
||||
parameter ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE)),
|
||||
parameter TAG_WIDTH = 1,
|
||||
parameter TAG_SEL_IDX = 0,
|
||||
parameter OUT_REG_REQ = 0,
|
||||
parameter OUT_REG_RSP = 0,
|
||||
parameter OUT_REG_REQ = 0,
|
||||
parameter OUT_REG_RSP = 0,
|
||||
parameter `STRING ARBITER = "R"
|
||||
) (
|
||||
input wire clk,
|
||||
|
||||
@@ -233,10 +233,12 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_writes_per_cycle;
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
|
||||
|
||||
wire [NUM_REQS-1:0] perf_reads_per_req = req_valid & req_ready & ~req_rw;
|
||||
wire [NUM_REQS-1:0] perf_writes_per_req = req_valid & req_ready & req_rw;
|
||||
wire [NUM_REQS-1:0] perf_reads_per_req, perf_writes_per_req;
|
||||
wire [NUM_REQS-1:0] perf_crsp_stall_per_req = rsp_valid & ~rsp_ready;
|
||||
|
||||
`BUFFER(perf_reads_per_req, req_valid & req_ready & ~req_rw);
|
||||
`BUFFER(perf_writes_per_req, req_valid & req_ready & req_rw);
|
||||
|
||||
`POP_COUNT(perf_reads_per_cycle, perf_reads_per_req);
|
||||
`POP_COUNT(perf_writes_per_cycle, perf_writes_per_req);
|
||||
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);
|
||||
|
||||
@@ -19,8 +19,8 @@ module VX_smem_switch #(
|
||||
parameter TAG_WIDTH = 1,
|
||||
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
|
||||
parameter TAG_SEL_IDX = 0,
|
||||
parameter OUT_REG_REQ = 0,
|
||||
parameter OUT_REG_RSP = 0,
|
||||
parameter OUT_REG_REQ = 0,
|
||||
parameter OUT_REG_RSP = 0,
|
||||
parameter `STRING ARBITER = "R"
|
||||
) (
|
||||
input wire clk,
|
||||
|
||||
@@ -56,17 +56,17 @@ TARGET=asesim make -C runtime/opae
|
||||
PREFIX=build_base CONFIGS="-DEXT_F_DISABLE -DL1_DISABLE -DSM_DISABLE -DNUM_WARPS=2 -DNUM_THREADS=2" TARGET=asesim make
|
||||
|
||||
# ASE test runs
|
||||
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/regression/basic/basic -n1 -t0
|
||||
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/regression/basic/basic -n1 -t1
|
||||
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/regression/basic/basic -n16
|
||||
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/regression/demo/demo -n16
|
||||
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/regression/dogfood/dogfood -n16
|
||||
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/opencl/vecadd/vecadd
|
||||
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/opencl/sgemm/sgemm -n4
|
||||
./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/basic/basic -n1 -t0
|
||||
./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/basic/basic -n1 -t1
|
||||
./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/basic/basic -n16
|
||||
./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/demo/demo -n16
|
||||
./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/dogfood/dogfood -n16
|
||||
./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/opencl/vecadd/vecadd
|
||||
./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/opencl/sgemm/sgemm -n4
|
||||
|
||||
# modify "vsim_run.tcl" to dump VCD trace
|
||||
vcd file trace.vcd
|
||||
vcd add -r /*/Vortex/hw/rtl/*
|
||||
vcd add -r /*/afu/*
|
||||
run -all
|
||||
|
||||
# compress FPGA output files
|
||||
|
||||
@@ -15,27 +15,27 @@
|
||||
|
||||
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
|
||||
|
||||
BUILD_DIR=$1
|
||||
BUILD_DIR=$(realpath $1)
|
||||
|
||||
PROGRAM=$(basename "$2")
|
||||
PROGRAM_DIR=`dirname $2`
|
||||
|
||||
POCL_RT_PATH=$TOOLDIR/pocl/runtime
|
||||
VORTEX_RT_PATH=$SCRIPT_DIR/../../../../runtime
|
||||
|
||||
# Export ASE_WORKDIR variable
|
||||
export ASE_WORKDIR=$SCRIPT_DIR/$BUILD_DIR/work
|
||||
|
||||
shift 2
|
||||
export ASE_WORKDIR=$BUILD_DIR/synth/work
|
||||
|
||||
# cleanup incomplete runs
|
||||
rm -f $ASE_WORKDIR/.app_lock.pid
|
||||
rm -f $ASE_WORKDIR/.ase_ready.pid
|
||||
rm -f $SCRIPT_DIR/$BUILD_DIR/nohup.out
|
||||
rm -f $BUILD_DIR/synth/nohup.out
|
||||
|
||||
# Start Simulator in background
|
||||
pushd $SCRIPT_DIR/$BUILD_DIR
|
||||
echo " [DBG] starting ASE simnulator (stdout saved to '$SCRIPT_DIR/$BUILD_DIR/nohup.out')"
|
||||
nohup make sim &
|
||||
# Start Simulator in background (capture processs group pid)
|
||||
pushd $BUILD_DIR/synth
|
||||
echo " [DBG] starting ASE simnulator (stdout saved to '$BUILD_DIR/synth/nohup.out')"
|
||||
setsid make sim &> /dev/null &
|
||||
SIM_PID=$!
|
||||
popd
|
||||
|
||||
# Wait for simulator readiness
|
||||
@@ -47,6 +47,11 @@ done
|
||||
|
||||
# run application
|
||||
pushd $PROGRAM_DIR
|
||||
shift 2
|
||||
echo " [DBG] running ./$PROGRAM $*"
|
||||
ASE_LOG=0 LD_LIBRARY_PATH=$POCL_RT_PATH/lib:$VORTEX_RT_PATH/opae:$LD_LIBRARY_PATH ./$PROGRAM $*
|
||||
popd
|
||||
|
||||
# stop the simulator (kill process group)
|
||||
kill -- -$(ps -o pgid= $SIM_PID | grep -o '[0-9]*')
|
||||
wait $SIM_PID 2> /dev/null
|
||||
4
hw/syn/altera/quartus/cache/Makefile
vendored
4
hw/syn/altera/quartus/cache/Makefile
vendored
@@ -1,6 +1,6 @@
|
||||
PROJECT = VX_cache_cluster_top
|
||||
PROJECT = VX_cache_top
|
||||
TOP_LEVEL_ENTITY = $(PROJECT)
|
||||
SRC_FILE = VX_cache_cluster.sv
|
||||
SRC_FILE = $(PROJECT).sv
|
||||
|
||||
include ../../common.mk
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
PROJECT = VX_core_top
|
||||
TOP_LEVEL_ENTITY = $(PROJECT)
|
||||
SRC_FILE = VX_core.sv
|
||||
SRC_FILE = $(PROJECT).sv
|
||||
|
||||
include ../../common.mk
|
||||
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
XLEN ?= 32
|
||||
TOOLDIR ?= /opt
|
||||
|
||||
ifeq ($(XLEN),64)
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
|
||||
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
|
||||
CFLAGS += -march=rv64imafd -mabi=lp64d
|
||||
else
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
|
||||
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
|
||||
CFLAGS += -march=rv32imaf -mabi=ilp32f
|
||||
endif
|
||||
|
||||
|
||||
2
hw/unittest/cache/Makefile
vendored
2
hw/unittest/cache/Makefile
vendored
@@ -33,7 +33,7 @@ VL_FLAGS = --exe
|
||||
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
|
||||
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
|
||||
VL_FLAGS += --x-initial unique --x-assign unique
|
||||
VL_FLAGS += -DSIMULATION
|
||||
VL_FLAGS += -DSIMULATION -DSV_DPI
|
||||
VL_FLAGS += $(CONFIGS)
|
||||
VL_FLAGS += $(PARAMS)
|
||||
VL_FLAGS += $(RTL_INCLUDE)
|
||||
|
||||
@@ -27,7 +27,7 @@ VL_FLAGS = --exe
|
||||
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
|
||||
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
|
||||
VL_FLAGS += --x-initial unique --x-assign unique
|
||||
VL_FLAGS += -DSIMULATION
|
||||
VL_FLAGS += -DSIMULATION -DSV_DPI
|
||||
VL_FLAGS += $(CONFIGS)
|
||||
VL_FLAGS += $(PARAMS)
|
||||
VL_FLAGS += $(RTL_INCLUDE)
|
||||
|
||||
@@ -27,7 +27,7 @@ VL_FLAGS = --exe
|
||||
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
|
||||
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
|
||||
VL_FLAGS += --x-initial unique --x-assign unique
|
||||
VL_FLAGS += -DSIMULATION
|
||||
VL_FLAGS += -DSIMULATION -DSV_DPI
|
||||
VL_FLAGS += $(CONFIGS)
|
||||
VL_FLAGS += $(PARAMS)
|
||||
VL_FLAGS += $(RTL_INCLUDE)
|
||||
|
||||
@@ -25,7 +25,7 @@ VL_FLAGS = --exe
|
||||
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
|
||||
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
|
||||
VL_FLAGS += --x-initial unique --x-assign unique
|
||||
VL_FLAGS += -DSIMULATION
|
||||
VL_FLAGS += -DSIMULATION -DSV_DPI
|
||||
VL_FLAGS += $(CONFIGS)
|
||||
VL_FLAGS += $(PARAMS)
|
||||
VL_FLAGS += $(RTL_INCLUDE)
|
||||
@@ -56,7 +56,6 @@ PROJECT = top_modules
|
||||
all: build
|
||||
|
||||
build: $(SRCS)
|
||||
verilator --build $(VL_FLAGS) --cc VX_cache_cluster_top --top-module VX_cache_cluster_top $^ -CFLAGS '$(CXXFLAGS)'
|
||||
verilator --build $(VL_FLAGS) --cc VX_cache_top --top-module VX_cache_top $^ -CFLAGS '$(CXXFLAGS)'
|
||||
verilator --build $(VL_FLAGS) --cc VX_core_top --top-module VX_core_top $^ -CFLAGS '$(CXXFLAGS)'
|
||||
|
||||
|
||||
Reference in New Issue
Block a user