cache bindings and memory perf refactoring
This commit is contained in:
@@ -23,10 +23,10 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||||||
input wire reset,
|
input wire reset,
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
VX_mem_perf_if.master mem_perf_if,
|
VX_mem_perf_if.slave mem_perf_if,
|
||||||
VX_mem_perf_if.slave perf_memsys_total_if,
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
// DCRs
|
||||||
VX_dcr_bus_if.slave dcr_bus_if,
|
VX_dcr_bus_if.slave dcr_bus_if,
|
||||||
|
|
||||||
// Memory
|
// Memory
|
||||||
@@ -71,32 +71,51 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||||||
);
|
);
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
`ifdef PERF_ENABLE
|
||||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
VX_mem_perf_if mem_perf_tmp_if();
|
||||||
.TAG_WIDTH (DCACHE_ARB_TAG_WIDTH)
|
cache_perf_t perf_l2cache;
|
||||||
) per_socket_dcache_bus_if[`NUM_SOCKETS * DCACHE_NUM_REQS]();
|
|
||||||
|
assign mem_perf_tmp_if.icache = 'x;
|
||||||
|
assign mem_perf_tmp_if.dcache = 'x;
|
||||||
|
assign mem_perf_tmp_if.l2cache = perf_l2cache;
|
||||||
|
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||||
|
assign mem_perf_tmp_if.smem = 'x;
|
||||||
|
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||||
|
`endif
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
VX_mem_bus_if #(
|
||||||
.DATA_SIZE (ICACHE_WORD_SIZE),
|
.DATA_SIZE (`L1_LINE_SIZE),
|
||||||
.TAG_WIDTH (ICACHE_ARB_TAG_WIDTH)
|
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
|
||||||
) per_socket_icache_bus_if[`NUM_SOCKETS]();
|
) per_socket_mem_bus_if[`NUM_SOCKETS]();
|
||||||
|
|
||||||
`RESET_RELAY (mem_unit_reset, reset);
|
`RESET_RELAY (l2_reset, reset);
|
||||||
|
|
||||||
VX_mem_unit #(
|
VX_cache_wrap #(
|
||||||
.CLUSTER_ID (CLUSTER_ID)
|
.INSTANCE_ID ("l2cache"),
|
||||||
) mem_unit (
|
.CACHE_SIZE (`L2_CACHE_SIZE),
|
||||||
|
.LINE_SIZE (`L2_LINE_SIZE),
|
||||||
|
.NUM_BANKS (`L2_NUM_BANKS),
|
||||||
|
.NUM_WAYS (`L2_NUM_WAYS),
|
||||||
|
.WORD_SIZE (L2_WORD_SIZE),
|
||||||
|
.NUM_REQS (L2_NUM_REQS),
|
||||||
|
.CRSQ_SIZE (`L2_CRSQ_SIZE),
|
||||||
|
.MSHR_SIZE (`L2_MSHR_SIZE),
|
||||||
|
.MRSQ_SIZE (`L2_MRSQ_SIZE),
|
||||||
|
.MREQ_SIZE (`L2_MREQ_SIZE),
|
||||||
|
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH),
|
||||||
|
.WRITE_ENABLE (1),
|
||||||
|
.UUID_WIDTH (`UUID_WIDTH),
|
||||||
|
.CORE_OUT_REG (2),
|
||||||
|
.MEM_OUT_REG (2),
|
||||||
|
.NC_ENABLE (1),
|
||||||
|
.PASSTHRU (!`L2_ENABLED)
|
||||||
|
) l2cache (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (mem_unit_reset),
|
.reset (l2_reset),
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.mem_perf_if (mem_perf_if),
|
.cache_perf (perf_l2cache),
|
||||||
`endif
|
`endif
|
||||||
|
.core_bus_if (per_socket_mem_bus_if),
|
||||||
.dcache_bus_if (per_socket_dcache_bus_if),
|
|
||||||
|
|
||||||
.icache_bus_if (per_socket_icache_bus_if),
|
|
||||||
|
|
||||||
.mem_bus_if (mem_bus_if)
|
.mem_bus_if (mem_bus_if)
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -131,14 +150,12 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||||||
.reset (socket_reset),
|
.reset (socket_reset),
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.mem_perf_if (perf_memsys_total_if),
|
.mem_perf_if (mem_perf_tmp_if),
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
.dcr_bus_if (socket_dcr_bus_if),
|
.dcr_bus_if (socket_dcr_bus_if),
|
||||||
|
|
||||||
.dcache_bus_if (per_socket_dcache_bus_if[i * DCACHE_NUM_REQS +: DCACHE_NUM_REQS]),
|
.mem_bus_if (per_socket_mem_bus_if[i]),
|
||||||
|
|
||||||
.icache_bus_if (per_socket_icache_bus_if[i]),
|
|
||||||
|
|
||||||
`ifdef GBAR_ENABLE
|
`ifdef GBAR_ENABLE
|
||||||
.gbar_bus_if (per_socket_gbar_bus_if[i]),
|
.gbar_bus_if (per_socket_gbar_bus_if[i]),
|
||||||
|
|||||||
@@ -32,7 +32,14 @@
|
|||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
// 32 bit XLEN as default.
|
`ifndef EXT_M_DISABLE
|
||||||
|
`define EXT_M_ENABLE
|
||||||
|
`endif
|
||||||
|
|
||||||
|
`ifndef EXT_F_DISABLE
|
||||||
|
`define EXT_F_ENABLE
|
||||||
|
`endif
|
||||||
|
|
||||||
`ifndef XLEN_32
|
`ifndef XLEN_32
|
||||||
`ifndef XLEN_64
|
`ifndef XLEN_64
|
||||||
`define XLEN_32
|
`define XLEN_32
|
||||||
@@ -47,6 +54,26 @@
|
|||||||
`define XLEN 32
|
`define XLEN 32
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
`ifdef EXT_D_ENABLE
|
||||||
|
`define FLEN_64
|
||||||
|
`else
|
||||||
|
`define FLEN_32
|
||||||
|
`endif
|
||||||
|
|
||||||
|
`ifdef FLEN_64
|
||||||
|
`define FLEN 64
|
||||||
|
`endif
|
||||||
|
|
||||||
|
`ifdef FLEN_32
|
||||||
|
`define FLEN 32
|
||||||
|
`endif
|
||||||
|
|
||||||
|
`ifdef XLEN_64
|
||||||
|
`ifdef FLEN_32
|
||||||
|
`define FPU_RV64F
|
||||||
|
`endif
|
||||||
|
`endif
|
||||||
|
|
||||||
`ifndef NUM_CLUSTERS
|
`ifndef NUM_CLUSTERS
|
||||||
`define NUM_CLUSTERS 1
|
`define NUM_CLUSTERS 1
|
||||||
`endif
|
`endif
|
||||||
@@ -70,6 +97,7 @@
|
|||||||
`ifndef SOCKET_SIZE
|
`ifndef SOCKET_SIZE
|
||||||
`define SOCKET_SIZE `MIN(4, `NUM_CORES)
|
`define SOCKET_SIZE `MIN(4, `NUM_CORES)
|
||||||
`endif
|
`endif
|
||||||
|
`define NUM_SOCKETS `UP(`NUM_CORES / `SOCKET_SIZE)
|
||||||
|
|
||||||
`ifdef L2_ENABLE
|
`ifdef L2_ENABLE
|
||||||
`define L2_ENABLED 1
|
`define L2_ENABLED 1
|
||||||
@@ -186,119 +214,6 @@
|
|||||||
`define DEBUG_LEVEL 3
|
`define DEBUG_LEVEL 3
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
// ISA Extensions /////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
`ifndef EXT_M_DISABLE
|
|
||||||
`define EXT_M_ENABLE
|
|
||||||
`endif
|
|
||||||
|
|
||||||
`ifndef EXT_F_DISABLE
|
|
||||||
`define EXT_F_ENABLE
|
|
||||||
`endif
|
|
||||||
|
|
||||||
`ifdef EXT_D_ENABLE
|
|
||||||
`define FLEN_64
|
|
||||||
`else
|
|
||||||
`define FLEN_32
|
|
||||||
`endif
|
|
||||||
|
|
||||||
`ifdef FLEN_64
|
|
||||||
`define FLEN 64
|
|
||||||
`endif
|
|
||||||
|
|
||||||
`ifdef FLEN_32
|
|
||||||
`define FLEN 32
|
|
||||||
`endif
|
|
||||||
|
|
||||||
`ifdef XLEN_64
|
|
||||||
`ifdef FLEN_32
|
|
||||||
`define FPU_RV64F
|
|
||||||
`endif
|
|
||||||
`endif
|
|
||||||
|
|
||||||
`define ISA_STD_A 0
|
|
||||||
`define ISA_STD_C 2
|
|
||||||
`define ISA_STD_D 3
|
|
||||||
`define ISA_STD_E 4
|
|
||||||
`define ISA_STD_F 5
|
|
||||||
`define ISA_STD_H 7
|
|
||||||
`define ISA_STD_I 8
|
|
||||||
`define ISA_STD_N 13
|
|
||||||
`define ISA_STD_Q 16
|
|
||||||
`define ISA_STD_S 18
|
|
||||||
`define ISA_STD_U 20
|
|
||||||
|
|
||||||
`define ISA_EXT_TEX 0
|
|
||||||
`define ISA_EXT_RASTER 1
|
|
||||||
`define ISA_EXT_ROP 2
|
|
||||||
|
|
||||||
`ifdef EXT_A_ENABLE
|
|
||||||
`define EXT_A_ENABLED 1
|
|
||||||
`else
|
|
||||||
`define EXT_A_ENABLED 0
|
|
||||||
`endif
|
|
||||||
|
|
||||||
`ifdef EXT_C_ENABLE
|
|
||||||
`define EXT_C_ENABLED 1
|
|
||||||
`else
|
|
||||||
`define EXT_C_ENABLED 0
|
|
||||||
`endif
|
|
||||||
|
|
||||||
`ifdef EXT_D_ENABLE
|
|
||||||
`define EXT_D_ENABLED 1
|
|
||||||
`else
|
|
||||||
`define EXT_D_ENABLED 0
|
|
||||||
`endif
|
|
||||||
|
|
||||||
`ifdef EXT_F_ENABLE
|
|
||||||
`define EXT_F_ENABLED 1
|
|
||||||
`else
|
|
||||||
`define EXT_F_ENABLED 0
|
|
||||||
`endif
|
|
||||||
|
|
||||||
`ifdef EXT_M_ENABLE
|
|
||||||
`define EXT_M_ENABLED 1
|
|
||||||
`else
|
|
||||||
`define EXT_M_ENABLED 0
|
|
||||||
`endif
|
|
||||||
|
|
||||||
`define ISA_X_ENABLED 0
|
|
||||||
|
|
||||||
`define MISA_EXT 0
|
|
||||||
|
|
||||||
`define MISA_STD (`EXT_A_ENABLED << 0) /* A - Atomic Instructions extension */ \
|
|
||||||
| (0 << 1) /* B - Tentatively reserved for Bit operations extension */ \
|
|
||||||
| (`EXT_C_ENABLED << 2) /* C - Compressed extension */ \
|
|
||||||
| (`EXT_D_ENABLED << 3) /* D - Double precision floating-point extension */ \
|
|
||||||
| (0 << 4) /* E - RV32E base ISA */ \
|
|
||||||
| (`EXT_F_ENABLED << 5) /* F - Single precision floating-point extension */ \
|
|
||||||
| (0 << 6) /* G - Additional standard extensions present */ \
|
|
||||||
| (0 << 7) /* H - Hypervisor mode implemented */ \
|
|
||||||
| (1 << 8) /* I - RV32I/64I/128I base ISA */ \
|
|
||||||
| (0 << 9) /* J - Reserved */ \
|
|
||||||
| (0 << 10) /* K - Reserved */ \
|
|
||||||
| (0 << 11) /* L - Tentatively reserved for Decimal Floating-Point extension */ \
|
|
||||||
| (`EXT_M_ENABLED << 12) /* M - Integer Multiply/Divide extension */ \
|
|
||||||
| (0 << 13) /* N - User level interrupts supported */ \
|
|
||||||
| (0 << 14) /* O - Reserved */ \
|
|
||||||
| (0 << 15) /* P - Tentatively reserved for Packed-SIMD extension */ \
|
|
||||||
| (0 << 16) /* Q - Quad-precision floating-point extension */ \
|
|
||||||
| (0 << 17) /* R - Reserved */ \
|
|
||||||
| (0 << 18) /* S - Supervisor mode implemented */ \
|
|
||||||
| (0 << 19) /* T - Tentatively reserved for Transactional Memory extension */ \
|
|
||||||
| (1 << 20) /* U - User mode implemented */ \
|
|
||||||
| (0 << 21) /* V - Tentatively reserved for Vector extension */ \
|
|
||||||
| (0 << 22) /* W - Reserved */ \
|
|
||||||
| (`ISA_X_ENABLED << 23) /* X - Non-standard extensions present */ \
|
|
||||||
| (0 << 24) /* Y - Reserved */ \
|
|
||||||
| (0 << 25) /* Z - Reserved */
|
|
||||||
|
|
||||||
// Device identification //////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
`define VENDOR_ID 0
|
|
||||||
`define ARCHITECTURE_ID 0
|
|
||||||
`define IMPLEMENTATION_ID 0
|
|
||||||
|
|
||||||
// Pipeline Configuration /////////////////////////////////////////////////////
|
// Pipeline Configuration /////////////////////////////////////////////////////
|
||||||
|
|
||||||
// Issue width
|
// Issue width
|
||||||
@@ -554,6 +469,7 @@
|
|||||||
`ifndef SM_DISABLE
|
`ifndef SM_DISABLE
|
||||||
`define SM_ENABLE
|
`define SM_ENABLE
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
`ifdef SM_ENABLE
|
`ifdef SM_ENABLE
|
||||||
`define SM_ENABLED 1
|
`define SM_ENABLED 1
|
||||||
`else
|
`else
|
||||||
@@ -579,7 +495,7 @@
|
|||||||
|
|
||||||
// Number of Banks
|
// Number of Banks
|
||||||
`ifndef L2_NUM_BANKS
|
`ifndef L2_NUM_BANKS
|
||||||
`define L2_NUM_BANKS 2
|
`define L2_NUM_BANKS `MIN(4, `NUM_SOCKETS)
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
// Core Response Queue Size
|
// Core Response Queue Size
|
||||||
@@ -648,4 +564,93 @@
|
|||||||
`define L3_NUM_WAYS 4
|
`define L3_NUM_WAYS 4
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
// ISA Extensions /////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
`ifdef EXT_A_ENABLE
|
||||||
|
`define EXT_A_ENABLED 1
|
||||||
|
`else
|
||||||
|
`define EXT_A_ENABLED 0
|
||||||
|
`endif
|
||||||
|
|
||||||
|
`ifdef EXT_C_ENABLE
|
||||||
|
`define EXT_C_ENABLED 1
|
||||||
|
`else
|
||||||
|
`define EXT_C_ENABLED 0
|
||||||
|
`endif
|
||||||
|
|
||||||
|
`ifdef EXT_D_ENABLE
|
||||||
|
`define EXT_D_ENABLED 1
|
||||||
|
`else
|
||||||
|
`define EXT_D_ENABLED 0
|
||||||
|
`endif
|
||||||
|
|
||||||
|
`ifdef EXT_F_ENABLE
|
||||||
|
`define EXT_F_ENABLED 1
|
||||||
|
`else
|
||||||
|
`define EXT_F_ENABLED 0
|
||||||
|
`endif
|
||||||
|
|
||||||
|
`ifdef EXT_M_ENABLE
|
||||||
|
`define EXT_M_ENABLED 1
|
||||||
|
`else
|
||||||
|
`define EXT_M_ENABLED 0
|
||||||
|
`endif
|
||||||
|
|
||||||
|
`define ISA_STD_A 0
|
||||||
|
`define ISA_STD_C 2
|
||||||
|
`define ISA_STD_D 3
|
||||||
|
`define ISA_STD_E 4
|
||||||
|
`define ISA_STD_F 5
|
||||||
|
`define ISA_STD_H 7
|
||||||
|
`define ISA_STD_I 8
|
||||||
|
`define ISA_STD_N 13
|
||||||
|
`define ISA_STD_Q 16
|
||||||
|
`define ISA_STD_S 18
|
||||||
|
`define ISA_STD_U 20
|
||||||
|
|
||||||
|
`define ISA_EXT_ICACHE 0
|
||||||
|
`define ISA_EXT_DCACHE 1
|
||||||
|
`define ISA_EXT_L2CACHE 2
|
||||||
|
`define ISA_EXT_L3CACHE 3
|
||||||
|
`define ISA_EXT_SMEM 4
|
||||||
|
|
||||||
|
`define MISA_EXT (`ICACHE_ENABLED << `ISA_EXT_ICACHE) \
|
||||||
|
| (`DCACHE_ENABLED << `ISA_EXT_DCACHE) \
|
||||||
|
| (`L2_ENABLED << `ISA_EXT_L2CACHE) \
|
||||||
|
| (`L3_ENABLED << `ISA_EXT_L3CACHE) \
|
||||||
|
| (`SM_ENABLED << `ISA_EXT_SMEM)
|
||||||
|
|
||||||
|
`define MISA_STD (`EXT_A_ENABLED << 0) /* A - Atomic Instructions extension */ \
|
||||||
|
| (0 << 1) /* B - Tentatively reserved for Bit operations extension */ \
|
||||||
|
| (`EXT_C_ENABLED << 2) /* C - Compressed extension */ \
|
||||||
|
| (`EXT_D_ENABLED << 3) /* D - Double precision floating-point extension */ \
|
||||||
|
| (0 << 4) /* E - RV32E base ISA */ \
|
||||||
|
| (`EXT_F_ENABLED << 5) /* F - Single precision floating-point extension */ \
|
||||||
|
| (0 << 6) /* G - Additional standard extensions present */ \
|
||||||
|
| (0 << 7) /* H - Hypervisor mode implemented */ \
|
||||||
|
| (1 << 8) /* I - RV32I/64I/128I base ISA */ \
|
||||||
|
| (0 << 9) /* J - Reserved */ \
|
||||||
|
| (0 << 10) /* K - Reserved */ \
|
||||||
|
| (0 << 11) /* L - Tentatively reserved for Decimal Floating-Point extension */ \
|
||||||
|
| (`EXT_M_ENABLED << 12) /* M - Integer Multiply/Divide extension */ \
|
||||||
|
| (0 << 13) /* N - User level interrupts supported */ \
|
||||||
|
| (0 << 14) /* O - Reserved */ \
|
||||||
|
| (0 << 15) /* P - Tentatively reserved for Packed-SIMD extension */ \
|
||||||
|
| (0 << 16) /* Q - Quad-precision floating-point extension */ \
|
||||||
|
| (0 << 17) /* R - Reserved */ \
|
||||||
|
| (0 << 18) /* S - Supervisor mode implemented */ \
|
||||||
|
| (0 << 19) /* T - Tentatively reserved for Transactional Memory extension */ \
|
||||||
|
| (1 << 20) /* U - User mode implemented */ \
|
||||||
|
| (0 << 21) /* V - Tentatively reserved for Vector extension */ \
|
||||||
|
| (0 << 22) /* W - Reserved */ \
|
||||||
|
| (1 << 23) /* X - Non-standard extensions present */ \
|
||||||
|
| (0 << 24) /* Y - Reserved */ \
|
||||||
|
| (0 << 25) /* Z - Reserved */
|
||||||
|
|
||||||
|
// Device identification //////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
`define VENDOR_ID 0
|
||||||
|
`define ARCHITECTURE_ID 0
|
||||||
|
`define IMPLEMENTATION_ID 0
|
||||||
|
|
||||||
`endif // VX_CONFIG_VH
|
`endif // VX_CONFIG_VH
|
||||||
|
|||||||
@@ -228,9 +228,6 @@
|
|||||||
`define INST_SFU_CSRRS 4'h7
|
`define INST_SFU_CSRRS 4'h7
|
||||||
`define INST_SFU_CSRRC 4'h8
|
`define INST_SFU_CSRRC 4'h8
|
||||||
`define INST_SFU_TEX 4'h9
|
`define INST_SFU_TEX 4'h9
|
||||||
`define INST_SFU_RASTER 4'hA
|
|
||||||
`define INST_SFU_ROP 4'hB
|
|
||||||
`define INST_SFU_CMOV 4'hC
|
|
||||||
`define INST_SFU_BITS 4
|
`define INST_SFU_BITS 4
|
||||||
`define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1)
|
`define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1)
|
||||||
`define INST_SFU_IS_WCTL(op) (op <= 5)
|
`define INST_SFU_IS_WCTL(op) (op <= 5)
|
||||||
@@ -238,10 +235,6 @@
|
|||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
`define NUM_SOCKETS `UP(`NUM_CORES / `SOCKET_SIZE)
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
// non-cacheable tag bits
|
// non-cacheable tag bits
|
||||||
`define NC_TAG_BITS 1
|
`define NC_TAG_BITS 1
|
||||||
|
|
||||||
@@ -396,7 +389,7 @@
|
|||||||
end \
|
end \
|
||||||
assign ``dst.``field = __reduce_add_r_``dst``field
|
assign ``dst.``field = __reduce_add_r_``dst``field
|
||||||
|
|
||||||
`define PERF_CACHE_ADD(dst, src, count) \
|
`define PERF_CACHE_REDUCE(dst, src, count) \
|
||||||
`PERF_REDUCE (dst, src, reads, `PERF_CTR_BITS, count); \
|
`PERF_REDUCE (dst, src, reads, `PERF_CTR_BITS, count); \
|
||||||
`PERF_REDUCE (dst, src, writes, `PERF_CTR_BITS, count); \
|
`PERF_REDUCE (dst, src, writes, `PERF_CTR_BITS, count); \
|
||||||
`PERF_REDUCE (dst, src, read_misses, `PERF_CTR_BITS, count); \
|
`PERF_REDUCE (dst, src, read_misses, `PERF_CTR_BITS, count); \
|
||||||
|
|||||||
@@ -58,6 +58,23 @@ package VX_gpu_pkg;
|
|||||||
logic [7:0] mpm_class;
|
logic [7:0] mpm_class;
|
||||||
} base_dcrs_t;
|
} base_dcrs_t;
|
||||||
|
|
||||||
|
typedef struct packed {
|
||||||
|
logic [`PERF_CTR_BITS-1:0] reads;
|
||||||
|
logic [`PERF_CTR_BITS-1:0] writes;
|
||||||
|
logic [`PERF_CTR_BITS-1:0] read_misses;
|
||||||
|
logic [`PERF_CTR_BITS-1:0] write_misses;
|
||||||
|
logic [`PERF_CTR_BITS-1:0] bank_stalls;
|
||||||
|
logic [`PERF_CTR_BITS-1:0] mshr_stalls;
|
||||||
|
logic [`PERF_CTR_BITS-1:0] mem_stalls;
|
||||||
|
logic [`PERF_CTR_BITS-1:0] crsp_stalls;
|
||||||
|
} cache_perf_t;
|
||||||
|
|
||||||
|
typedef struct packed {
|
||||||
|
logic [`PERF_CTR_BITS-1:0] reads;
|
||||||
|
logic [`PERF_CTR_BITS-1:0] writes;
|
||||||
|
logic [`PERF_CTR_BITS-1:0] latency;
|
||||||
|
} mem_perf_t;
|
||||||
|
|
||||||
/* verilator lint_off UNUSED */
|
/* verilator lint_off UNUSED */
|
||||||
|
|
||||||
////////////////////////// Icache Parameters //////////////////////////////
|
////////////////////////// Icache Parameters //////////////////////////////
|
||||||
@@ -74,7 +91,6 @@ package VX_gpu_pkg;
|
|||||||
|
|
||||||
// Core request tag bits
|
// Core request tag bits
|
||||||
localparam ICACHE_TAG_WIDTH = (`UUID_WIDTH + ICACHE_TAG_ID_BITS);
|
localparam ICACHE_TAG_WIDTH = (`UUID_WIDTH + ICACHE_TAG_ID_BITS);
|
||||||
localparam ICACHE_ARB_TAG_WIDTH = (ICACHE_TAG_WIDTH + `CLOG2(`SOCKET_SIZE));
|
|
||||||
|
|
||||||
// Memory request data bits
|
// Memory request data bits
|
||||||
localparam ICACHE_MEM_DATA_WIDTH = (ICACHE_LINE_SIZE * 8);
|
localparam ICACHE_MEM_DATA_WIDTH = (ICACHE_LINE_SIZE * 8);
|
||||||
@@ -83,7 +99,7 @@ package VX_gpu_pkg;
|
|||||||
`ifdef ICACHE_ENABLE
|
`ifdef ICACHE_ENABLE
|
||||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
|
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
|
||||||
`else
|
`else
|
||||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_ARB_TAG_WIDTH, `NUM_SOCKETS, `NUM_ICACHES);
|
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `NUM_SOCKETS, `NUM_ICACHES);
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
////////////////////////// Dcache Parameters //////////////////////////////
|
////////////////////////// Dcache Parameters //////////////////////////////
|
||||||
@@ -112,23 +128,21 @@ package VX_gpu_pkg;
|
|||||||
// Core request tag bits
|
// Core request tag bits
|
||||||
localparam DCACHE_TAG_WIDTH = (`UUID_WIDTH + DCACHE_TAG_ID_BITS);
|
localparam DCACHE_TAG_WIDTH = (`UUID_WIDTH + DCACHE_TAG_ID_BITS);
|
||||||
localparam DCACHE_NOSM_TAG_WIDTH = (DCACHE_TAG_WIDTH - `SM_ENABLED);
|
localparam DCACHE_NOSM_TAG_WIDTH = (DCACHE_TAG_WIDTH - `SM_ENABLED);
|
||||||
localparam DCACHE_ARB_TAG_WIDTH = (DCACHE_NOSM_TAG_WIDTH + `CLOG2(`SOCKET_SIZE));
|
|
||||||
|
|
||||||
// Memory request data bits
|
// Memory request data bits
|
||||||
localparam DCACHE_MEM_DATA_WIDTH = (DCACHE_LINE_SIZE * 8);
|
localparam DCACHE_MEM_DATA_WIDTH = (DCACHE_LINE_SIZE * 8);
|
||||||
|
|
||||||
// Memory request tag bits
|
// Memory request tag bits
|
||||||
`ifdef DCACHE_ENABLE
|
`ifdef DCACHE_ENABLE
|
||||||
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_ARB_TAG_WIDTH, `NUM_SOCKETS, `NUM_DCACHES);
|
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_NOSM_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
|
||||||
`else
|
`else
|
||||||
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_ARB_TAG_WIDTH, `NUM_SOCKETS, `NUM_DCACHES);
|
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_NOSM_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
/////////////////////////////// L1 Parameters /////////////////////////////
|
/////////////////////////////// L1 Parameters /////////////////////////////
|
||||||
|
|
||||||
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
|
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
|
||||||
|
localparam L1_MEM_ARB_TAG_WIDTH = (L1_MEM_TAG_WIDTH + `CLOG2(2));
|
||||||
localparam NUM_L1_OUTPUTS = 2;
|
|
||||||
|
|
||||||
/////////////////////////////// L2 Parameters /////////////////////////////
|
/////////////////////////////// L2 Parameters /////////////////////////////
|
||||||
|
|
||||||
@@ -136,10 +150,10 @@ package VX_gpu_pkg;
|
|||||||
localparam L2_WORD_SIZE = `L1_LINE_SIZE;
|
localparam L2_WORD_SIZE = `L1_LINE_SIZE;
|
||||||
|
|
||||||
// Input request size
|
// Input request size
|
||||||
localparam L2_NUM_REQS = NUM_L1_OUTPUTS;
|
localparam L2_NUM_REQS = `NUM_SOCKETS;
|
||||||
|
|
||||||
// Core request tag bits
|
// Core request tag bits
|
||||||
localparam L2_TAG_WIDTH = L1_MEM_TAG_WIDTH;
|
localparam L2_TAG_WIDTH = L1_MEM_ARB_TAG_WIDTH;
|
||||||
|
|
||||||
// Memory request data bits
|
// Memory request data bits
|
||||||
localparam L2_MEM_DATA_WIDTH = (`L2_LINE_SIZE * 8);
|
localparam L2_MEM_DATA_WIDTH = (`L2_LINE_SIZE * 8);
|
||||||
|
|||||||
@@ -26,13 +26,14 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||||||
VX_mem_perf_if.slave mem_perf_if,
|
VX_mem_perf_if.slave mem_perf_if,
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
// DCRs
|
||||||
VX_dcr_bus_if.slave dcr_bus_if,
|
VX_dcr_bus_if.slave dcr_bus_if,
|
||||||
|
|
||||||
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS],
|
// Memory
|
||||||
|
VX_mem_bus_if.master mem_bus_if,
|
||||||
VX_mem_bus_if.master icache_bus_if,
|
|
||||||
|
|
||||||
`ifdef GBAR_ENABLE
|
`ifdef GBAR_ENABLE
|
||||||
|
// Barrier
|
||||||
VX_gbar_bus_if.master gbar_bus_if,
|
VX_gbar_bus_if.master gbar_bus_if,
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
@@ -62,45 +63,60 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
`ifdef PERF_ENABLE
|
||||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
VX_mem_perf_if mem_perf_tmp_if();
|
||||||
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
|
cache_perf_t perf_icache;
|
||||||
) per_core_dcache_bus_if[`SOCKET_SIZE * DCACHE_NUM_REQS]();
|
cache_perf_t perf_dcache;
|
||||||
|
|
||||||
`RESET_RELAY (dcache_arb_reset, reset);
|
assign mem_perf_tmp_if.icache = perf_icache;
|
||||||
|
assign mem_perf_tmp_if.dcache = perf_dcache;
|
||||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
||||||
VX_mem_bus_if #(
|
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
assign mem_perf_tmp_if.smem = 'x;
|
||||||
.TAG_WIDTH (DCACHE_ARB_TAG_WIDTH)
|
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||||
) dcache_bus_tmp_if[1]();
|
`endif
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
VX_mem_bus_if #(
|
||||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
.DATA_SIZE (ICACHE_LINE_SIZE),
|
||||||
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
|
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
|
||||||
) per_core_dcache_bus_tmp_if[`SOCKET_SIZE]();
|
) icache_mem_bus_if();
|
||||||
|
|
||||||
for (genvar j = 0; j < `SOCKET_SIZE; ++j) begin
|
VX_mem_bus_if #(
|
||||||
`ASSIGN_VX_MEM_BUS_IF (per_core_dcache_bus_tmp_if[j], per_core_dcache_bus_if[j * DCACHE_NUM_REQS + i]);
|
.DATA_SIZE (DCACHE_LINE_SIZE),
|
||||||
end
|
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
|
||||||
|
) dcache_mem_bus_if();
|
||||||
|
|
||||||
|
VX_mem_bus_if #(
|
||||||
|
.DATA_SIZE (`L1_LINE_SIZE),
|
||||||
|
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
|
||||||
|
) cache_mem_bus_if[2]();
|
||||||
|
|
||||||
|
VX_mem_bus_if #(
|
||||||
|
.DATA_SIZE (`L1_LINE_SIZE),
|
||||||
|
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
|
||||||
|
) mem_bus_tmp_if[1]();
|
||||||
|
|
||||||
|
`ASSIGN_VX_MEM_BUS_IF_X (cache_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
|
||||||
|
`ASSIGN_VX_MEM_BUS_IF_X (cache_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
|
||||||
|
|
||||||
|
`RESET_RELAY (mem_arb_reset, reset);
|
||||||
|
|
||||||
VX_mem_arb #(
|
VX_mem_arb #(
|
||||||
.NUM_INPUTS (`SOCKET_SIZE),
|
.NUM_INPUTS (2),
|
||||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
.DATA_SIZE (`L1_LINE_SIZE),
|
||||||
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH),
|
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
|
||||||
.TAG_SEL_IDX (`CACHE_ADDR_TYPE_BITS),
|
.TAG_SEL_IDX (1), // Skip 0 for NC flag
|
||||||
.ARBITER ("R"),
|
.ARBITER ("R"),
|
||||||
.OUT_REG_REQ ((`SOCKET_SIZE > 1) ? 2 : 0),
|
.OUT_REG_REQ (2),
|
||||||
.OUT_REG_RSP ((`SOCKET_SIZE > 1) ? 2 : 0)
|
.OUT_REG_RSP (2)
|
||||||
) dcache_arb (
|
) mem_arb (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (dcache_arb_reset),
|
.reset (mem_arb_reset),
|
||||||
.bus_in_if (per_core_dcache_bus_tmp_if),
|
.bus_in_if (cache_mem_bus_if),
|
||||||
.bus_out_if (dcache_bus_tmp_if)
|
.bus_out_if (mem_bus_tmp_if)
|
||||||
);
|
);
|
||||||
|
|
||||||
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_bus_tmp_if[0]);
|
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if[0]);
|
||||||
end
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
@@ -109,30 +125,77 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||||||
.TAG_WIDTH (ICACHE_TAG_WIDTH)
|
.TAG_WIDTH (ICACHE_TAG_WIDTH)
|
||||||
) per_core_icache_bus_if[`SOCKET_SIZE]();
|
) per_core_icache_bus_if[`SOCKET_SIZE]();
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
`RESET_RELAY (icache_reset, reset);
|
||||||
.DATA_SIZE (ICACHE_WORD_SIZE),
|
|
||||||
.TAG_WIDTH (ICACHE_ARB_TAG_WIDTH)
|
|
||||||
) icache_bus_tmp_if[1]();
|
|
||||||
|
|
||||||
`RESET_RELAY (icache_arb_reset, reset);
|
VX_cache_cluster #(
|
||||||
|
.INSTANCE_ID ($sformatf("socket%0d-icache", SOCKET_ID)),
|
||||||
VX_mem_arb #(
|
.NUM_UNITS (`NUM_ICACHES),
|
||||||
.NUM_INPUTS (`SOCKET_SIZE),
|
.NUM_INPUTS (`SOCKET_SIZE),
|
||||||
.NUM_OUTPUTS (1),
|
|
||||||
.DATA_SIZE (ICACHE_WORD_SIZE),
|
|
||||||
.TAG_WIDTH (ICACHE_TAG_WIDTH),
|
|
||||||
.TAG_SEL_IDX (0),
|
.TAG_SEL_IDX (0),
|
||||||
.ARBITER ("R"),
|
.CACHE_SIZE (`ICACHE_SIZE),
|
||||||
.OUT_REG_REQ ((`SOCKET_SIZE > 1) ? 2 : 0),
|
.LINE_SIZE (ICACHE_LINE_SIZE),
|
||||||
.OUT_REG_RSP ((`SOCKET_SIZE > 1) ? 2 : 0)
|
.NUM_BANKS (1),
|
||||||
) icache_arb (
|
.NUM_WAYS (`ICACHE_NUM_WAYS),
|
||||||
|
.WORD_SIZE (ICACHE_WORD_SIZE),
|
||||||
|
.NUM_REQS (1),
|
||||||
|
.CRSQ_SIZE (`ICACHE_CRSQ_SIZE),
|
||||||
|
.MSHR_SIZE (`ICACHE_MSHR_SIZE),
|
||||||
|
.MRSQ_SIZE (`ICACHE_MRSQ_SIZE),
|
||||||
|
.MREQ_SIZE (`ICACHE_MREQ_SIZE),
|
||||||
|
.TAG_WIDTH (ICACHE_TAG_WIDTH),
|
||||||
|
.UUID_WIDTH (`UUID_WIDTH),
|
||||||
|
.WRITE_ENABLE (0),
|
||||||
|
.CORE_OUT_REG (2),
|
||||||
|
.MEM_OUT_REG (2)
|
||||||
|
) icache (
|
||||||
|
`ifdef PERF_ENABLE
|
||||||
|
.cache_perf (perf_icache),
|
||||||
|
`endif
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (icache_arb_reset),
|
.reset (icache_reset),
|
||||||
.bus_in_if (per_core_icache_bus_if),
|
.core_bus_if (per_core_icache_bus_if),
|
||||||
.bus_out_if (icache_bus_tmp_if)
|
.mem_bus_if (icache_mem_bus_if)
|
||||||
);
|
);
|
||||||
|
|
||||||
`ASSIGN_VX_MEM_BUS_IF (icache_bus_if, icache_bus_tmp_if[0]);
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
VX_mem_bus_if #(
|
||||||
|
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||||
|
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
|
||||||
|
) per_core_dcache_bus_if[`SOCKET_SIZE * DCACHE_NUM_REQS]();
|
||||||
|
|
||||||
|
`RESET_RELAY (dcache_reset, reset);
|
||||||
|
|
||||||
|
VX_cache_cluster #(
|
||||||
|
.INSTANCE_ID ($sformatf("socket%0d-dcache", SOCKET_ID)),
|
||||||
|
.NUM_UNITS (`NUM_DCACHES),
|
||||||
|
.NUM_INPUTS (`SOCKET_SIZE),
|
||||||
|
.TAG_SEL_IDX (1),
|
||||||
|
.CACHE_SIZE (`DCACHE_SIZE),
|
||||||
|
.LINE_SIZE (DCACHE_LINE_SIZE),
|
||||||
|
.NUM_BANKS (`DCACHE_NUM_BANKS),
|
||||||
|
.NUM_WAYS (`DCACHE_NUM_WAYS),
|
||||||
|
.WORD_SIZE (DCACHE_WORD_SIZE),
|
||||||
|
.NUM_REQS (DCACHE_NUM_REQS),
|
||||||
|
.CRSQ_SIZE (`DCACHE_CRSQ_SIZE),
|
||||||
|
.MSHR_SIZE (`DCACHE_MSHR_SIZE),
|
||||||
|
.MRSQ_SIZE (`DCACHE_MRSQ_SIZE),
|
||||||
|
.MREQ_SIZE (`DCACHE_MREQ_SIZE),
|
||||||
|
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH),
|
||||||
|
.UUID_WIDTH (`UUID_WIDTH),
|
||||||
|
.WRITE_ENABLE (1),
|
||||||
|
.NC_ENABLE (1),
|
||||||
|
.CORE_OUT_REG (`SM_ENABLED ? 2 : 1),
|
||||||
|
.MEM_OUT_REG (2)
|
||||||
|
) dcache (
|
||||||
|
`ifdef PERF_ENABLE
|
||||||
|
.cache_perf (perf_dcache),
|
||||||
|
`endif
|
||||||
|
.clk (clk),
|
||||||
|
.reset (dcache_reset),
|
||||||
|
.core_bus_if (per_core_dcache_bus_if),
|
||||||
|
.mem_bus_if (dcache_mem_bus_if)
|
||||||
|
);
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
@@ -163,7 +226,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||||||
.reset (core_reset),
|
.reset (core_reset),
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.mem_perf_if (mem_perf_if),
|
.mem_perf_if (mem_perf_tmp_if),
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
.dcr_bus_if (core_dcr_bus_if),
|
.dcr_bus_if (core_dcr_bus_if),
|
||||||
|
|||||||
@@ -33,9 +33,6 @@
|
|||||||
`define VX_DCR_MPM_CLASS_NONE 0
|
`define VX_DCR_MPM_CLASS_NONE 0
|
||||||
`define VX_DCR_MPM_CLASS_CORE 1
|
`define VX_DCR_MPM_CLASS_CORE 1
|
||||||
`define VX_DCR_MPM_CLASS_MEM 2
|
`define VX_DCR_MPM_CLASS_MEM 2
|
||||||
`define VX_DCR_MPM_CLASS_TEX 3
|
|
||||||
`define VX_DCR_MPM_CLASS_RASTER 4
|
|
||||||
`define VX_DCR_MPM_CLASS_ROP 5
|
|
||||||
|
|
||||||
// User Floating-Point CSRs
|
// User Floating-Point CSRs
|
||||||
|
|
||||||
|
|||||||
144
hw/rtl/Vortex.sv
144
hw/rtl/Vortex.sv
@@ -45,16 +45,61 @@ module Vortex import VX_gpu_pkg::*; (
|
|||||||
);
|
);
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
VX_mem_perf_if mem_perf_if[`NUM_CLUSTERS]();
|
VX_mem_perf_if mem_perf_if();
|
||||||
VX_mem_perf_if perf_memsys_total_if();
|
cache_perf_t perf_l3cache;
|
||||||
VX_cache_perf_if perf_l3cache_if();
|
mem_perf_t mem_perf;
|
||||||
|
|
||||||
|
assign mem_perf_if.icache = 'x;
|
||||||
|
assign mem_perf_if.dcache = 'x;
|
||||||
|
assign mem_perf_if.l2cache = 'x;
|
||||||
|
assign mem_perf_if.l3cache = perf_l3cache;
|
||||||
|
assign mem_perf_if.smem = 'x;
|
||||||
|
assign mem_perf_if.mem = mem_perf;
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
VX_mem_bus_if #(
|
||||||
|
.DATA_SIZE (`L2_LINE_SIZE),
|
||||||
|
.TAG_WIDTH (L2_MEM_TAG_WIDTH)
|
||||||
|
) per_cluster_mem_bus_if[`NUM_CLUSTERS]();
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
VX_mem_bus_if #(
|
||||||
.DATA_SIZE (`L3_LINE_SIZE),
|
.DATA_SIZE (`L3_LINE_SIZE),
|
||||||
.TAG_WIDTH (L3_MEM_TAG_WIDTH)
|
.TAG_WIDTH (L3_MEM_TAG_WIDTH)
|
||||||
) mem_bus_if();
|
) mem_bus_if();
|
||||||
|
|
||||||
|
`RESET_RELAY (l3_reset, reset);
|
||||||
|
|
||||||
|
VX_cache_wrap #(
|
||||||
|
.INSTANCE_ID ("l3cache"),
|
||||||
|
.CACHE_SIZE (`L3_CACHE_SIZE),
|
||||||
|
.LINE_SIZE (`L3_LINE_SIZE),
|
||||||
|
.NUM_BANKS (`L3_NUM_BANKS),
|
||||||
|
.NUM_WAYS (`L3_NUM_WAYS),
|
||||||
|
.WORD_SIZE (L3_WORD_SIZE),
|
||||||
|
.NUM_REQS (L3_NUM_REQS),
|
||||||
|
.CRSQ_SIZE (`L3_CRSQ_SIZE),
|
||||||
|
.MSHR_SIZE (`L3_MSHR_SIZE),
|
||||||
|
.MRSQ_SIZE (`L3_MRSQ_SIZE),
|
||||||
|
.MREQ_SIZE (`L3_MREQ_SIZE),
|
||||||
|
.TAG_WIDTH (L2_MEM_TAG_WIDTH),
|
||||||
|
.WRITE_ENABLE (1),
|
||||||
|
.UUID_WIDTH (`UUID_WIDTH),
|
||||||
|
.CORE_OUT_REG (2),
|
||||||
|
.MEM_OUT_REG (2),
|
||||||
|
.NC_ENABLE (1),
|
||||||
|
.PASSTHRU (!`L3_ENABLED)
|
||||||
|
) l3cache (
|
||||||
|
.clk (clk),
|
||||||
|
.reset (l3_reset),
|
||||||
|
|
||||||
|
`ifdef PERF_ENABLE
|
||||||
|
.cache_perf (perf_l3cache),
|
||||||
|
`endif
|
||||||
|
|
||||||
|
.core_bus_if (per_cluster_mem_bus_if),
|
||||||
|
.mem_bus_if (mem_bus_if)
|
||||||
|
);
|
||||||
|
|
||||||
assign mem_req_valid = mem_bus_if.req_valid;
|
assign mem_req_valid = mem_bus_if.req_valid;
|
||||||
assign mem_req_rw = mem_bus_if.req_data.rw;
|
assign mem_req_rw = mem_bus_if.req_data.rw;
|
||||||
assign mem_req_byteen= mem_bus_if.req_data.byteen;
|
assign mem_req_byteen= mem_bus_if.req_data.byteen;
|
||||||
@@ -82,11 +127,6 @@ module Vortex import VX_gpu_pkg::*; (
|
|||||||
`UNUSED_VAR (per_cluster_sim_ebreak)
|
`UNUSED_VAR (per_cluster_sim_ebreak)
|
||||||
`UNUSED_VAR (per_cluster_sim_wb_value)
|
`UNUSED_VAR (per_cluster_sim_wb_value)
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
|
||||||
.DATA_SIZE (`L2_LINE_SIZE),
|
|
||||||
.TAG_WIDTH (L2_MEM_TAG_WIDTH)
|
|
||||||
) per_cluster_mem_bus_if[`NUM_CLUSTERS]();
|
|
||||||
|
|
||||||
VX_dcr_bus_if dcr_bus_if();
|
VX_dcr_bus_if dcr_bus_if();
|
||||||
assign dcr_bus_if.write_valid = dcr_wr_valid;
|
assign dcr_bus_if.write_valid = dcr_wr_valid;
|
||||||
assign dcr_bus_if.write_addr = dcr_wr_addr;
|
assign dcr_bus_if.write_addr = dcr_wr_addr;
|
||||||
@@ -112,8 +152,7 @@ module Vortex import VX_gpu_pkg::*; (
|
|||||||
.reset (cluster_reset),
|
.reset (cluster_reset),
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.mem_perf_if (mem_perf_if[i]),
|
.mem_perf_if (mem_perf_if),
|
||||||
.perf_memsys_total_if (perf_memsys_total_if),
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
.dcr_bus_if (cluster_dcr_bus_if),
|
.dcr_bus_if (cluster_dcr_bus_if),
|
||||||
@@ -129,75 +168,8 @@ module Vortex import VX_gpu_pkg::*; (
|
|||||||
|
|
||||||
`BUFFER_BUSY (busy, (| per_cluster_busy), (`NUM_CLUSTERS > 1));
|
`BUFFER_BUSY (busy, (| per_cluster_busy), (`NUM_CLUSTERS > 1));
|
||||||
|
|
||||||
`RESET_RELAY (l3_reset, reset);
|
|
||||||
|
|
||||||
VX_cache_wrap #(
|
|
||||||
.INSTANCE_ID ("l3cache"),
|
|
||||||
.CACHE_SIZE (`L3_CACHE_SIZE),
|
|
||||||
.LINE_SIZE (`L3_LINE_SIZE),
|
|
||||||
.NUM_BANKS (`L3_NUM_BANKS),
|
|
||||||
.NUM_WAYS (`L3_NUM_WAYS),
|
|
||||||
.WORD_SIZE (L3_WORD_SIZE),
|
|
||||||
.NUM_REQS (L3_NUM_REQS),
|
|
||||||
.CRSQ_SIZE (`L3_CRSQ_SIZE),
|
|
||||||
.MSHR_SIZE (`L3_MSHR_SIZE),
|
|
||||||
.MRSQ_SIZE (`L3_MRSQ_SIZE),
|
|
||||||
.MREQ_SIZE (`L3_MREQ_SIZE),
|
|
||||||
.TAG_WIDTH (L2_MEM_TAG_WIDTH),
|
|
||||||
.WRITE_ENABLE (1),
|
|
||||||
.UUID_WIDTH (`UUID_WIDTH),
|
|
||||||
.CORE_OUT_REG (2),
|
|
||||||
.MEM_OUT_REG (2),
|
|
||||||
.NC_ENABLE (1),
|
|
||||||
.PASSTHRU (!`L3_ENABLED)
|
|
||||||
) l3cache (
|
|
||||||
.clk (clk),
|
|
||||||
.reset (l3_reset),
|
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
|
||||||
.cache_perf_if (perf_l3cache_if),
|
|
||||||
`endif
|
|
||||||
|
|
||||||
.core_bus_if (per_cluster_mem_bus_if),
|
|
||||||
.mem_bus_if (mem_bus_if)
|
|
||||||
);
|
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
|
|
||||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, icache_reads, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
|
||||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, icache_read_misses, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
|
||||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_reads, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
|
||||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_writes, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
|
||||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_read_misses, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
|
||||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_write_misses, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
|
||||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_bank_stalls, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
|
||||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_mshr_stalls, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
|
||||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, smem_reads, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
|
||||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, smem_writes, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
|
||||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, smem_bank_stalls, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
|
||||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_reads, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
|
||||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_writes, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
|
||||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_read_misses, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
|
||||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_write_misses, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
|
||||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_bank_stalls, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
|
||||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_mshr_stalls, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
|
||||||
|
|
||||||
`ifdef L3_ENABLE
|
|
||||||
assign perf_memsys_total_if.l3cache_reads = perf_l3cache_if.reads;
|
|
||||||
assign perf_memsys_total_if.l3cache_writes = perf_l3cache_if.writes;
|
|
||||||
assign perf_memsys_total_if.l3cache_read_misses = perf_l3cache_if.read_misses;
|
|
||||||
assign perf_memsys_total_if.l3cache_write_misses= perf_l3cache_if.write_misses;
|
|
||||||
assign perf_memsys_total_if.l3cache_bank_stalls = perf_l3cache_if.bank_stalls;
|
|
||||||
assign perf_memsys_total_if.l3cache_mshr_stalls = perf_l3cache_if.mshr_stalls;
|
|
||||||
`else
|
|
||||||
assign perf_memsys_total_if.l3cache_reads = '0;
|
|
||||||
assign perf_memsys_total_if.l3cache_writes = '0;
|
|
||||||
assign perf_memsys_total_if.l3cache_read_misses = '0;
|
|
||||||
assign perf_memsys_total_if.l3cache_write_misses= '0;
|
|
||||||
assign perf_memsys_total_if.l3cache_bank_stalls = '0;
|
|
||||||
assign perf_memsys_total_if.l3cache_mshr_stalls = '0;
|
|
||||||
`endif
|
|
||||||
|
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
|
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
|
||||||
|
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
@@ -209,30 +181,20 @@ module Vortex import VX_gpu_pkg::*; (
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_mem_reads;
|
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_mem_writes;
|
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_mem_lat;
|
|
||||||
|
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
perf_mem_reads <= '0;
|
mem_perf <= '0;
|
||||||
perf_mem_writes <= '0;
|
|
||||||
perf_mem_lat <= '0;
|
|
||||||
end else begin
|
end else begin
|
||||||
if (mem_req_fire && ~mem_bus_if.req_data.rw) begin
|
if (mem_req_fire && ~mem_bus_if.req_data.rw) begin
|
||||||
perf_mem_reads <= perf_mem_reads + `PERF_CTR_BITS'(1);
|
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(1);
|
||||||
end
|
end
|
||||||
if (mem_req_fire && mem_bus_if.req_data.rw) begin
|
if (mem_req_fire && mem_bus_if.req_data.rw) begin
|
||||||
perf_mem_writes <= perf_mem_writes + `PERF_CTR_BITS'(1);
|
mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(1);
|
||||||
end
|
end
|
||||||
perf_mem_lat <= perf_mem_lat + perf_mem_pending_reads;
|
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
assign perf_memsys_total_if.mem_reads = perf_mem_reads;
|
|
||||||
assign perf_memsys_total_if.mem_writes = perf_mem_writes;
|
|
||||||
assign perf_memsys_total_if.mem_latency = perf_mem_lat;
|
|
||||||
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
`ifdef DBG_TRACE_CORE_MEM
|
`ifdef DBG_TRACE_CORE_MEM
|
||||||
|
|||||||
27
hw/rtl/cache/VX_cache.sv
vendored
27
hw/rtl/cache/VX_cache.sv
vendored
@@ -13,7 +13,7 @@
|
|||||||
|
|
||||||
`include "VX_cache_define.vh"
|
`include "VX_cache_define.vh"
|
||||||
|
|
||||||
module VX_cache #(
|
module VX_cache import VX_gpu_pkg::*; #(
|
||||||
parameter `STRING INSTANCE_ID = "",
|
parameter `STRING INSTANCE_ID = "",
|
||||||
|
|
||||||
// Number of Word requests per cycle
|
// Number of Word requests per cycle
|
||||||
@@ -56,7 +56,7 @@ module VX_cache #(
|
|||||||
) (
|
) (
|
||||||
// PERF
|
// PERF
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
VX_cache_perf_if.master cache_perf_if,
|
output cache_perf_t cache_perf,
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
input wire clk,
|
input wire clk,
|
||||||
@@ -279,6 +279,10 @@ module VX_cache #(
|
|||||||
core_req_tag[i]};
|
core_req_tag[i]};
|
||||||
end
|
end
|
||||||
|
|
||||||
|
`ifdef PERF_ENABLE
|
||||||
|
wire [`PERF_CTR_BITS-1:0] perf_collisions;
|
||||||
|
`endif
|
||||||
|
|
||||||
`RESET_RELAY (req_xbar_reset, reset);
|
`RESET_RELAY (req_xbar_reset, reset);
|
||||||
|
|
||||||
VX_stream_xbar #(
|
VX_stream_xbar #(
|
||||||
@@ -290,9 +294,9 @@ module VX_cache #(
|
|||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (req_xbar_reset),
|
.reset (req_xbar_reset),
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.collisions (cache_perf_if.bank_stalls),
|
.collisions(perf_collisions),
|
||||||
`else
|
`else
|
||||||
`UNUSED_PIN (collisions),
|
`UNUSED_PIN(collisions),
|
||||||
`endif
|
`endif
|
||||||
.valid_in (core_req_valid),
|
.valid_in (core_req_valid),
|
||||||
.data_in (core_req_data_in),
|
.data_in (core_req_data_in),
|
||||||
@@ -578,13 +582,14 @@ module VX_cache #(
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
assign cache_perf_if.reads = perf_core_reads;
|
assign cache_perf.reads = perf_core_reads;
|
||||||
assign cache_perf_if.writes = perf_core_writes;
|
assign cache_perf.writes = perf_core_writes;
|
||||||
assign cache_perf_if.read_misses = perf_read_misses;
|
assign cache_perf.read_misses = perf_read_misses;
|
||||||
assign cache_perf_if.write_misses = perf_write_misses;
|
assign cache_perf.write_misses = perf_write_misses;
|
||||||
assign cache_perf_if.mshr_stalls = perf_mshr_stalls;
|
assign cache_perf.bank_stalls = perf_collisions;
|
||||||
assign cache_perf_if.mem_stalls = perf_mem_stalls;
|
assign cache_perf.mshr_stalls = perf_mshr_stalls;
|
||||||
assign cache_perf_if.crsp_stalls = perf_crsp_stalls;
|
assign cache_perf.mem_stalls = perf_mem_stalls;
|
||||||
|
assign cache_perf.crsp_stalls = perf_crsp_stalls;
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
endmodule
|
endmodule
|
||||||
|
|||||||
13
hw/rtl/cache/VX_cache_cluster.sv
vendored
13
hw/rtl/cache/VX_cache_cluster.sv
vendored
@@ -13,7 +13,7 @@
|
|||||||
|
|
||||||
`include "VX_cache_define.vh"
|
`include "VX_cache_define.vh"
|
||||||
|
|
||||||
module VX_cache_cluster #(
|
module VX_cache_cluster import VX_gpu_pkg::*; #(
|
||||||
parameter `STRING INSTANCE_ID = "",
|
parameter `STRING INSTANCE_ID = "",
|
||||||
|
|
||||||
parameter NUM_UNITS = 1,
|
parameter NUM_UNITS = 1,
|
||||||
@@ -66,7 +66,7 @@ module VX_cache_cluster #(
|
|||||||
|
|
||||||
// PERF
|
// PERF
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
VX_cache_perf_if.master cache_perf_if,
|
output cache_perf_t cache_perf,
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
VX_mem_bus_if.slave core_bus_if [NUM_INPUTS * NUM_REQS],
|
VX_mem_bus_if.slave core_bus_if [NUM_INPUTS * NUM_REQS],
|
||||||
@@ -83,8 +83,8 @@ module VX_cache_cluster #(
|
|||||||
`STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter"))
|
`STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter"))
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
VX_cache_perf_if perf_cache_unit_if[NUM_CACHES]();
|
cache_perf_t perf_cache_unit[NUM_CACHES];
|
||||||
`PERF_CACHE_ADD (cache_perf_if, perf_cache_unit_if, NUM_CACHES);
|
`PERF_CACHE_REDUCE (cache_perf, perf_cache_unit, NUM_CACHES);
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
VX_mem_bus_if #(
|
||||||
@@ -97,7 +97,6 @@ module VX_cache_cluster #(
|
|||||||
.TAG_WIDTH (ARB_TAG_WIDTH)
|
.TAG_WIDTH (ARB_TAG_WIDTH)
|
||||||
) arb_core_bus_if[NUM_CACHES * NUM_REQS]();
|
) arb_core_bus_if[NUM_CACHES * NUM_REQS]();
|
||||||
|
|
||||||
|
|
||||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||||
VX_mem_bus_if #(
|
VX_mem_bus_if #(
|
||||||
.DATA_SIZE (WORD_SIZE),
|
.DATA_SIZE (WORD_SIZE),
|
||||||
@@ -161,7 +160,7 @@ module VX_cache_cluster #(
|
|||||||
.PASSTHRU (PASSTHRU)
|
.PASSTHRU (PASSTHRU)
|
||||||
) cache_wrap (
|
) cache_wrap (
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.cache_perf_if (perf_cache_unit_if[i]),
|
.cache_perf (perf_cache_unit[i]),
|
||||||
`endif
|
`endif
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (cache_reset),
|
.reset (cache_reset),
|
||||||
@@ -357,7 +356,7 @@ module VX_cache_cluster_top #(
|
|||||||
.MEM_OUT_REG (MEM_OUT_REG)
|
.MEM_OUT_REG (MEM_OUT_REG)
|
||||||
) cache (
|
) cache (
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.cache_perf_if (perf_icache_if),
|
.cache_perf (perf_icache),
|
||||||
`endif
|
`endif
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
|
|||||||
49
hw/rtl/cache/VX_cache_perf_if.sv
vendored
49
hw/rtl/cache/VX_cache_perf_if.sv
vendored
@@ -1,49 +0,0 @@
|
|||||||
// Copyright © 2019-2023
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
`include "VX_define.vh"
|
|
||||||
|
|
||||||
interface VX_cache_perf_if ();
|
|
||||||
|
|
||||||
wire [`PERF_CTR_BITS-1:0] reads;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] writes;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] read_misses;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] write_misses;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] bank_stalls;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] mshr_stalls;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] mem_stalls;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] crsp_stalls;
|
|
||||||
|
|
||||||
modport master (
|
|
||||||
output reads,
|
|
||||||
output writes,
|
|
||||||
output read_misses,
|
|
||||||
output write_misses,
|
|
||||||
output bank_stalls,
|
|
||||||
output mshr_stalls,
|
|
||||||
output mem_stalls,
|
|
||||||
output crsp_stalls
|
|
||||||
);
|
|
||||||
|
|
||||||
modport slave (
|
|
||||||
input reads,
|
|
||||||
input writes,
|
|
||||||
input read_misses,
|
|
||||||
input write_misses,
|
|
||||||
input bank_stalls,
|
|
||||||
input mshr_stalls,
|
|
||||||
input mem_stalls,
|
|
||||||
input crsp_stalls
|
|
||||||
);
|
|
||||||
|
|
||||||
endinterface
|
|
||||||
17
hw/rtl/cache/VX_cache_wrap.sv
vendored
17
hw/rtl/cache/VX_cache_wrap.sv
vendored
@@ -13,7 +13,7 @@
|
|||||||
|
|
||||||
`include "VX_cache_define.vh"
|
`include "VX_cache_define.vh"
|
||||||
|
|
||||||
module VX_cache_wrap #(
|
module VX_cache_wrap import VX_gpu_pkg::*; #(
|
||||||
parameter `STRING INSTANCE_ID = "",
|
parameter `STRING INSTANCE_ID = "",
|
||||||
|
|
||||||
// Number of Word requests per cycle
|
// Number of Word requests per cycle
|
||||||
@@ -67,14 +67,14 @@ module VX_cache_wrap #(
|
|||||||
|
|
||||||
// PERF
|
// PERF
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
VX_cache_perf_if.master cache_perf_if,
|
output cache_perf_t cache_perf,
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
VX_mem_bus_if.slave core_bus_if [NUM_REQS],
|
VX_mem_bus_if.slave core_bus_if [NUM_REQS],
|
||||||
VX_mem_bus_if.master mem_bus_if
|
VX_mem_bus_if.master mem_bus_if
|
||||||
);
|
);
|
||||||
|
|
||||||
`STATIC_ASSERT(NUM_BANKS <= NUM_REQS, ("invalid parameter"))
|
`STATIC_ASSERT(NUM_BANKS <= NUM_REQS, ("invalid parameter: NUM_BANKS=%d, NUM_REQS=%d", NUM_BANKS, NUM_REQS))
|
||||||
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter"))
|
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter"))
|
||||||
|
|
||||||
localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE);
|
localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE);
|
||||||
@@ -353,14 +353,7 @@ module VX_cache_wrap #(
|
|||||||
assign mem_rsp_ready_b = 0;
|
assign mem_rsp_ready_b = 0;
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
assign cache_perf_if.reads = '0;
|
assign cache_perf = '0;
|
||||||
assign cache_perf_if.writes = '0;
|
|
||||||
assign cache_perf_if.read_misses = '0;
|
|
||||||
assign cache_perf_if.write_misses = '0;
|
|
||||||
assign cache_perf_if.bank_stalls = '0;
|
|
||||||
assign cache_perf_if.mshr_stalls = '0;
|
|
||||||
assign cache_perf_if.mem_stalls = '0;
|
|
||||||
assign cache_perf_if.crsp_stalls = '0;
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
end else begin
|
end else begin
|
||||||
@@ -429,7 +422,7 @@ module VX_cache_wrap #(
|
|||||||
.reset (cache_reset),
|
.reset (cache_reset),
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.cache_perf_if (cache_perf_if),
|
.cache_perf (cache_perf),
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
.core_bus_if (core_bus_wrap_if),
|
.core_bus_if (core_bus_wrap_if),
|
||||||
|
|||||||
@@ -77,8 +77,20 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||||||
) dcache_bus_tmp_if[DCACHE_NUM_REQS]();
|
) dcache_bus_tmp_if[DCACHE_NUM_REQS]();
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
VX_mem_perf_if mem_perf_tmp_if();
|
|
||||||
VX_pipeline_perf_if pipeline_perf_if();
|
VX_pipeline_perf_if pipeline_perf_if();
|
||||||
|
VX_mem_perf_if mem_perf_tmp_if();
|
||||||
|
cache_perf_t smem_perf;
|
||||||
|
|
||||||
|
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
|
||||||
|
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
|
||||||
|
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
||||||
|
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||||
|
`ifdef SM_ENABLE
|
||||||
|
assign mem_perf_tmp_if.smem = smem_perf;
|
||||||
|
`else
|
||||||
|
assign mem_perf_tmp_if.smem = '0;
|
||||||
|
`endif
|
||||||
|
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
`RESET_RELAY (dcr_data_reset, reset);
|
`RESET_RELAY (dcr_data_reset, reset);
|
||||||
@@ -226,19 +238,28 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||||||
.sim_wb_value (sim_wb_value)
|
.sim_wb_value (sim_wb_value)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
`ifdef SM_ENABLE
|
||||||
|
|
||||||
VX_smem_unit #(
|
VX_smem_unit #(
|
||||||
.CORE_ID (CORE_ID)
|
.CORE_ID (CORE_ID)
|
||||||
) smem_unit (
|
) smem_unit (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.mem_perf_in_if (mem_perf_if),
|
.cache_perf (smem_perf),
|
||||||
.mem_perf_out_if (mem_perf_tmp_if),
|
|
||||||
`endif
|
`endif
|
||||||
.dcache_bus_in_if (dcache_bus_tmp_if),
|
.dcache_bus_in_if (dcache_bus_tmp_if),
|
||||||
.dcache_bus_out_if (dcache_bus_if)
|
.dcache_bus_out_if (dcache_bus_if)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
`else
|
||||||
|
|
||||||
|
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||||
|
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_bus_tmp_if[i]);
|
||||||
|
end
|
||||||
|
|
||||||
|
`endif
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
|
|
||||||
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
|
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
|
||||||
|
|||||||
@@ -221,63 +221,63 @@ import VX_fpu_pkg::*;
|
|||||||
`VX_DCR_MPM_CLASS_MEM: begin
|
`VX_DCR_MPM_CLASS_MEM: begin
|
||||||
case (read_addr)
|
case (read_addr)
|
||||||
// PERF: icache
|
// PERF: icache
|
||||||
`VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache_reads[31:0];
|
`VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0];
|
||||||
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache_reads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache_read_misses[31:0];
|
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0];
|
||||||
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache_read_misses[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||||
// PERF: dcache
|
// PERF: dcache
|
||||||
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache_reads[31:0];
|
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0];
|
||||||
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache_reads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache_writes[31:0];
|
`VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0];
|
||||||
`VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache_writes[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache_read_misses[31:0];
|
`VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0];
|
||||||
`VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache_read_misses[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache_write_misses[31:0];
|
`VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0];
|
||||||
`VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache_write_misses[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache_bank_stalls[31:0];
|
`VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0];
|
||||||
`VX_CSR_MPM_DCACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache_bank_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_DCACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache_mshr_stalls[31:0];
|
`VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0];
|
||||||
`VX_CSR_MPM_DCACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache_mshr_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_DCACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
// PERF: smem
|
// PERF: smem
|
||||||
`VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem_reads[31:0];
|
`VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem.reads[31:0];
|
||||||
`VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem_reads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem.reads[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem_writes[31:0];
|
`VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem.writes[31:0];
|
||||||
`VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem_writes[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem.writes[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem_bank_stalls[31:0];
|
`VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem.bank_stalls[31:0];
|
||||||
`VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem_bank_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
// PERF: l2cache
|
// PERF: l2cache
|
||||||
`VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache_reads[31:0];
|
`VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0];
|
||||||
`VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache_reads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache_writes[31:0];
|
`VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0];
|
||||||
`VX_CSR_MPM_L2CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l2cache_writes[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L2CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache_read_misses[31:0];
|
`VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0];
|
||||||
`VX_CSR_MPM_L2CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l2cache_read_misses[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L2CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache_write_misses[31:0];
|
`VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0];
|
||||||
`VX_CSR_MPM_L2CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l2cache_write_misses[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L2CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache_bank_stalls[31:0];
|
`VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0];
|
||||||
`VX_CSR_MPM_L2CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache_bank_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L2CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache_mshr_stalls[31:0];
|
`VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0];
|
||||||
`VX_CSR_MPM_L2CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache_mshr_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L2CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
// PERF: l3cache
|
// PERF: l3cache
|
||||||
`VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache_reads[31:0];
|
`VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0];
|
||||||
`VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache_reads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache_writes[31:0];
|
`VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0];
|
||||||
`VX_CSR_MPM_L3CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l3cache_writes[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L3CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache_read_misses[31:0];
|
`VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0];
|
||||||
`VX_CSR_MPM_L3CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l3cache_read_misses[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L3CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache_write_misses[31:0];
|
`VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0];
|
||||||
`VX_CSR_MPM_L3CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l3cache_write_misses[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L3CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache_bank_stalls[31:0];
|
`VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0];
|
||||||
`VX_CSR_MPM_L3CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache_bank_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L3CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache_mshr_stalls[31:0];
|
`VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0];
|
||||||
`VX_CSR_MPM_L3CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache_mshr_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L3CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
// PERF: memory
|
// PERF: memory
|
||||||
`VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem_reads[31:0];
|
`VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0];
|
||||||
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem_reads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem_writes[31:0];
|
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0];
|
||||||
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem_writes[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_MEM_LAT : read_data_ro_r = mem_perf_if.mem_latency[31:0];
|
`VX_CSR_MPM_MEM_LAT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
|
||||||
`VX_CSR_MPM_MEM_LAT_H : read_data_ro_r = 32'(mem_perf_if.mem_latency[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_MEM_LAT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
|
||||||
default:;
|
default:;
|
||||||
endcase
|
endcase
|
||||||
end
|
end
|
||||||
@@ -299,6 +299,8 @@ import VX_fpu_pkg::*;
|
|||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
wire [`PERF_CTR_BITS-1:0] perf_wctl_stalls = sfu_perf_if.wctl_stalls;
|
wire [`PERF_CTR_BITS-1:0] perf_wctl_stalls = sfu_perf_if.wctl_stalls;
|
||||||
`UNUSED_VAR (perf_wctl_stalls);
|
`UNUSED_VAR (perf_wctl_stalls);
|
||||||
|
`UNUSED_VAR (mem_perf_if.icache);
|
||||||
|
`UNUSED_VAR (mem_perf_if.smem);
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
endmodule
|
endmodule
|
||||||
|
|||||||
@@ -20,8 +20,7 @@ module VX_smem_unit import VX_gpu_pkg::*; #(
|
|||||||
input wire reset,
|
input wire reset,
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
VX_mem_perf_if.slave mem_perf_in_if,
|
output cache_perf_t cache_perf,
|
||||||
VX_mem_perf_if.master mem_perf_out_if,
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
VX_mem_bus_if.slave dcache_bus_in_if [DCACHE_NUM_REQS],
|
VX_mem_bus_if.slave dcache_bus_in_if [DCACHE_NUM_REQS],
|
||||||
@@ -29,21 +28,78 @@ module VX_smem_unit import VX_gpu_pkg::*; #(
|
|||||||
);
|
);
|
||||||
`UNUSED_PARAM (CORE_ID)
|
`UNUSED_PARAM (CORE_ID)
|
||||||
|
|
||||||
`ifdef SM_ENABLE
|
|
||||||
localparam SMEM_ADDR_WIDTH = `SMEM_LOG_SIZE - `CLOG2(DCACHE_WORD_SIZE);
|
localparam SMEM_ADDR_WIDTH = `SMEM_LOG_SIZE - `CLOG2(DCACHE_WORD_SIZE);
|
||||||
|
|
||||||
|
wire [DCACHE_NUM_REQS-1:0] smem_req_valid;
|
||||||
|
wire [DCACHE_NUM_REQS-1:0] smem_req_rw;
|
||||||
|
wire [DCACHE_NUM_REQS-1:0][SMEM_ADDR_WIDTH-1:0] smem_req_addr;
|
||||||
|
wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] smem_req_byteen;
|
||||||
|
wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] smem_req_data;
|
||||||
|
wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] smem_req_tag;
|
||||||
|
wire [DCACHE_NUM_REQS-1:0] smem_req_ready;
|
||||||
|
wire [DCACHE_NUM_REQS-1:0] smem_rsp_valid;
|
||||||
|
wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] smem_rsp_data;
|
||||||
|
wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] smem_rsp_tag;
|
||||||
|
wire [DCACHE_NUM_REQS-1:0] smem_rsp_ready;
|
||||||
|
|
||||||
|
`RESET_RELAY (smem_reset, reset);
|
||||||
|
|
||||||
|
VX_shared_mem #(
|
||||||
|
.INSTANCE_ID($sformatf("core%0d-smem", CORE_ID)),
|
||||||
|
.SIZE (1 << `SMEM_LOG_SIZE),
|
||||||
|
.NUM_REQS (DCACHE_NUM_REQS),
|
||||||
|
.NUM_BANKS (`SMEM_NUM_BANKS),
|
||||||
|
.WORD_SIZE (DCACHE_WORD_SIZE),
|
||||||
|
.ADDR_WIDTH (SMEM_ADDR_WIDTH),
|
||||||
|
.UUID_WIDTH (`UUID_WIDTH),
|
||||||
|
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
|
||||||
|
) shared_mem (
|
||||||
|
.clk (clk),
|
||||||
|
.reset (smem_reset),
|
||||||
|
|
||||||
|
`ifdef PERF_ENABLE
|
||||||
|
.cache_perf (cache_perf),
|
||||||
|
`endif
|
||||||
|
|
||||||
|
// Core request
|
||||||
|
.req_valid (smem_req_valid),
|
||||||
|
.req_rw (smem_req_rw),
|
||||||
|
.req_byteen (smem_req_byteen),
|
||||||
|
.req_addr (smem_req_addr),
|
||||||
|
.req_data (smem_req_data),
|
||||||
|
.req_tag (smem_req_tag),
|
||||||
|
.req_ready (smem_req_ready),
|
||||||
|
|
||||||
|
// Core response
|
||||||
|
.rsp_valid (smem_rsp_valid),
|
||||||
|
.rsp_data (smem_rsp_data),
|
||||||
|
.rsp_tag (smem_rsp_tag),
|
||||||
|
.rsp_ready (smem_rsp_ready)
|
||||||
|
);
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
VX_mem_bus_if #(
|
||||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||||
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
|
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
|
||||||
) switch_out_bus_if[2 * DCACHE_NUM_REQS]();
|
) switch_out_bus_if[2 * DCACHE_NUM_REQS]();
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
|
||||||
VX_cache_perf_if perf_smem_if();
|
|
||||||
`endif
|
|
||||||
|
|
||||||
`RESET_RELAY (switch_reset, reset);
|
`RESET_RELAY (switch_reset, reset);
|
||||||
|
|
||||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||||
|
|
||||||
|
assign smem_req_valid[i] = switch_out_bus_if[i * 2 + 1].req_valid;
|
||||||
|
assign smem_req_rw[i] = switch_out_bus_if[i * 2 + 1].req_data.rw;
|
||||||
|
assign smem_req_byteen[i] = switch_out_bus_if[i * 2 + 1].req_data.byteen;
|
||||||
|
assign smem_req_data[i] = switch_out_bus_if[i * 2 + 1].req_data.data;
|
||||||
|
assign smem_req_tag[i] = switch_out_bus_if[i * 2 + 1].req_data.tag;
|
||||||
|
assign switch_out_bus_if[i * 2 + 1].req_ready = smem_req_ready[i];
|
||||||
|
|
||||||
|
assign switch_out_bus_if[i * 2 + 1].rsp_valid = smem_rsp_valid[i];
|
||||||
|
assign switch_out_bus_if[i * 2 + 1].rsp_data.data = smem_rsp_data[i];
|
||||||
|
assign switch_out_bus_if[i * 2 + 1].rsp_data.tag = smem_rsp_tag[i];
|
||||||
|
assign smem_rsp_ready[i] = switch_out_bus_if[i * 2 + 1].rsp_ready;
|
||||||
|
|
||||||
|
assign smem_req_addr[i] = switch_out_bus_if[i * 2 + 1].req_data.addr[SMEM_ADDR_WIDTH-1:0];
|
||||||
|
|
||||||
VX_smem_switch #(
|
VX_smem_switch #(
|
||||||
.NUM_REQS (2),
|
.NUM_REQS (2),
|
||||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||||
@@ -65,121 +121,4 @@ module VX_smem_unit import VX_gpu_pkg::*; #(
|
|||||||
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_out_if[i], switch_out_bus_if[i * 2]);
|
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_out_if[i], switch_out_bus_if[i * 2]);
|
||||||
end
|
end
|
||||||
|
|
||||||
wire [DCACHE_NUM_REQS-1:0] smem_req_valid;
|
|
||||||
wire [DCACHE_NUM_REQS-1:0] smem_req_rw;
|
|
||||||
wire [DCACHE_NUM_REQS-1:0][SMEM_ADDR_WIDTH-1:0] smem_req_addr;
|
|
||||||
wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] smem_req_byteen;
|
|
||||||
wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] smem_req_data;
|
|
||||||
wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] smem_req_tag;
|
|
||||||
wire [DCACHE_NUM_REQS-1:0] smem_req_ready;
|
|
||||||
wire [DCACHE_NUM_REQS-1:0] smem_rsp_valid;
|
|
||||||
wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] smem_rsp_data;
|
|
||||||
wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] smem_rsp_tag;
|
|
||||||
wire [DCACHE_NUM_REQS-1:0] smem_rsp_ready;
|
|
||||||
|
|
||||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
|
||||||
|
|
||||||
assign smem_req_valid[i] = switch_out_bus_if[i * 2 + 1].req_valid;
|
|
||||||
assign smem_req_rw[i] = switch_out_bus_if[i * 2 + 1].req_data.rw;
|
|
||||||
assign smem_req_byteen[i] = switch_out_bus_if[i * 2 + 1].req_data.byteen;
|
|
||||||
assign smem_req_data[i] = switch_out_bus_if[i * 2 + 1].req_data.data;
|
|
||||||
assign smem_req_tag[i] = switch_out_bus_if[i * 2 + 1].req_data.tag;
|
|
||||||
assign switch_out_bus_if[i * 2 + 1].req_ready = smem_req_ready[i];
|
|
||||||
|
|
||||||
assign switch_out_bus_if[i * 2 + 1].rsp_valid = smem_rsp_valid[i];
|
|
||||||
assign switch_out_bus_if[i * 2 + 1].rsp_data.data = smem_rsp_data[i];
|
|
||||||
assign switch_out_bus_if[i * 2 + 1].rsp_data.tag = smem_rsp_tag[i];
|
|
||||||
assign smem_rsp_ready[i] = switch_out_bus_if[i * 2 + 1].rsp_ready;
|
|
||||||
|
|
||||||
assign smem_req_addr[i] = switch_out_bus_if[i * 2 + 1].req_data.addr[SMEM_ADDR_WIDTH-1:0];
|
|
||||||
end
|
|
||||||
|
|
||||||
`RESET_RELAY (smem_reset, reset);
|
|
||||||
|
|
||||||
VX_shared_mem #(
|
|
||||||
.INSTANCE_ID($sformatf("core%0d-smem", CORE_ID)),
|
|
||||||
.SIZE (1 << `SMEM_LOG_SIZE),
|
|
||||||
.NUM_REQS (DCACHE_NUM_REQS),
|
|
||||||
.NUM_BANKS (`SMEM_NUM_BANKS),
|
|
||||||
.WORD_SIZE (DCACHE_WORD_SIZE),
|
|
||||||
.ADDR_WIDTH (SMEM_ADDR_WIDTH),
|
|
||||||
.UUID_WIDTH (`UUID_WIDTH),
|
|
||||||
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
|
|
||||||
) shared_mem (
|
|
||||||
.clk (clk),
|
|
||||||
.reset (smem_reset),
|
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
|
||||||
.cache_perf_if(perf_smem_if),
|
|
||||||
`endif
|
|
||||||
|
|
||||||
// Core request
|
|
||||||
.req_valid (smem_req_valid),
|
|
||||||
.req_rw (smem_req_rw),
|
|
||||||
.req_byteen (smem_req_byteen),
|
|
||||||
.req_addr (smem_req_addr),
|
|
||||||
.req_data (smem_req_data),
|
|
||||||
.req_tag (smem_req_tag),
|
|
||||||
.req_ready (smem_req_ready),
|
|
||||||
|
|
||||||
// Core response
|
|
||||||
.rsp_valid (smem_rsp_valid),
|
|
||||||
.rsp_data (smem_rsp_data),
|
|
||||||
.rsp_tag (smem_rsp_tag),
|
|
||||||
.rsp_ready (smem_rsp_ready)
|
|
||||||
);
|
|
||||||
|
|
||||||
`else
|
|
||||||
|
|
||||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
|
||||||
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_out_if[i], dcache_bus_in_if[i]);
|
|
||||||
end
|
|
||||||
|
|
||||||
`UNUSED_VAR (clk)
|
|
||||||
`UNUSED_VAR (reset)
|
|
||||||
|
|
||||||
`endif
|
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
|
||||||
|
|
||||||
assign mem_perf_out_if.icache_reads = mem_perf_in_if.icache_reads;
|
|
||||||
assign mem_perf_out_if.icache_read_misses = mem_perf_in_if.icache_read_misses;
|
|
||||||
|
|
||||||
assign mem_perf_out_if.dcache_reads = mem_perf_in_if.dcache_reads;
|
|
||||||
assign mem_perf_out_if.dcache_writes = mem_perf_in_if.dcache_writes;
|
|
||||||
assign mem_perf_out_if.dcache_read_misses = mem_perf_in_if.dcache_read_misses;
|
|
||||||
assign mem_perf_out_if.dcache_write_misses = mem_perf_in_if.dcache_write_misses;
|
|
||||||
assign mem_perf_out_if.dcache_bank_stalls = mem_perf_in_if.dcache_bank_stalls;
|
|
||||||
assign mem_perf_out_if.dcache_mshr_stalls = mem_perf_in_if.dcache_mshr_stalls;
|
|
||||||
|
|
||||||
assign mem_perf_out_if.l2cache_reads = mem_perf_in_if.l2cache_reads;
|
|
||||||
assign mem_perf_out_if.l2cache_writes = mem_perf_in_if.l2cache_writes;
|
|
||||||
assign mem_perf_out_if.l2cache_read_misses = mem_perf_in_if.l2cache_read_misses;
|
|
||||||
assign mem_perf_out_if.l2cache_write_misses = mem_perf_in_if.l2cache_write_misses;
|
|
||||||
assign mem_perf_out_if.l2cache_bank_stalls = mem_perf_in_if.l2cache_bank_stalls;
|
|
||||||
assign mem_perf_out_if.l2cache_mshr_stalls = mem_perf_in_if.l2cache_mshr_stalls;
|
|
||||||
|
|
||||||
assign mem_perf_out_if.l3cache_reads = mem_perf_in_if.l3cache_reads;
|
|
||||||
assign mem_perf_out_if.l3cache_writes = mem_perf_in_if.l3cache_writes;
|
|
||||||
assign mem_perf_out_if.l3cache_read_misses = mem_perf_in_if.l3cache_read_misses;
|
|
||||||
assign mem_perf_out_if.l3cache_write_misses = mem_perf_in_if.l3cache_write_misses;
|
|
||||||
assign mem_perf_out_if.l3cache_bank_stalls = mem_perf_in_if.l3cache_bank_stalls;
|
|
||||||
assign mem_perf_out_if.l3cache_mshr_stalls = mem_perf_in_if.l3cache_mshr_stalls;
|
|
||||||
|
|
||||||
assign mem_perf_out_if.mem_reads = mem_perf_in_if.mem_reads;
|
|
||||||
assign mem_perf_out_if.mem_writes = mem_perf_in_if.mem_writes;
|
|
||||||
assign mem_perf_out_if.mem_latency = mem_perf_in_if.mem_latency;
|
|
||||||
|
|
||||||
`ifdef SM_ENABLE
|
|
||||||
assign mem_perf_out_if.smem_reads = perf_smem_if.reads;
|
|
||||||
assign mem_perf_out_if.smem_writes = perf_smem_if.writes;
|
|
||||||
assign mem_perf_out_if.smem_bank_stalls = perf_smem_if.bank_stalls;
|
|
||||||
`else
|
|
||||||
assign mem_perf_out_if.smem_reads = '0;
|
|
||||||
assign mem_perf_out_if.smem_writes = '0;
|
|
||||||
assign mem_perf_out_if.smem_bank_stalls = '0;
|
|
||||||
`endif
|
|
||||||
|
|
||||||
`endif
|
|
||||||
|
|
||||||
endmodule
|
endmodule
|
||||||
|
|||||||
@@ -358,9 +358,6 @@ task trace_ex_op(input int level,
|
|||||||
`INST_SFU_CSRRW: begin if (use_imm) `TRACE(level, ("CSRRWI")); else `TRACE(level, ("CSRRW")); end
|
`INST_SFU_CSRRW: begin if (use_imm) `TRACE(level, ("CSRRWI")); else `TRACE(level, ("CSRRW")); end
|
||||||
`INST_SFU_CSRRS: begin if (use_imm) `TRACE(level, ("CSRRSI")); else `TRACE(level, ("CSRRS")); end
|
`INST_SFU_CSRRS: begin if (use_imm) `TRACE(level, ("CSRRSI")); else `TRACE(level, ("CSRRS")); end
|
||||||
`INST_SFU_CSRRC: begin if (use_imm) `TRACE(level, ("CSRRCI")); else `TRACE(level, ("CSRRC")); end
|
`INST_SFU_CSRRC: begin if (use_imm) `TRACE(level, ("CSRRCI")); else `TRACE(level, ("CSRRC")); end
|
||||||
`INST_SFU_TEX: `TRACE(level, ("TEX"));
|
|
||||||
`INST_SFU_RASTER:`TRACE(level, ("RASTER"));
|
|
||||||
`INST_SFU_ROP: `TRACE(level, ("ROP"));
|
|
||||||
default: `TRACE(level, ("?"));
|
default: `TRACE(level, ("?"));
|
||||||
endcase
|
endcase
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -13,106 +13,31 @@
|
|||||||
|
|
||||||
`include "VX_define.vh"
|
`include "VX_define.vh"
|
||||||
|
|
||||||
interface VX_mem_perf_if ();
|
interface VX_mem_perf_if import VX_gpu_pkg::*; ();
|
||||||
|
|
||||||
wire [`PERF_CTR_BITS-1:0] icache_reads;
|
cache_perf_t icache;
|
||||||
wire [`PERF_CTR_BITS-1:0] icache_read_misses;
|
cache_perf_t dcache;
|
||||||
|
cache_perf_t l2cache;
|
||||||
wire [`PERF_CTR_BITS-1:0] dcache_reads;
|
cache_perf_t l3cache;
|
||||||
wire [`PERF_CTR_BITS-1:0] dcache_writes;
|
cache_perf_t smem;
|
||||||
wire [`PERF_CTR_BITS-1:0] dcache_read_misses;
|
mem_perf_t mem;
|
||||||
wire [`PERF_CTR_BITS-1:0] dcache_write_misses;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] dcache_bank_stalls;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] dcache_mshr_stalls;
|
|
||||||
|
|
||||||
wire [`PERF_CTR_BITS-1:0] smem_reads;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] smem_writes;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] smem_bank_stalls;
|
|
||||||
|
|
||||||
wire [`PERF_CTR_BITS-1:0] l2cache_reads;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] l2cache_writes;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] l2cache_read_misses;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] l2cache_write_misses;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] l2cache_bank_stalls;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] l2cache_mshr_stalls;
|
|
||||||
|
|
||||||
wire [`PERF_CTR_BITS-1:0] l3cache_reads;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] l3cache_writes;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] l3cache_read_misses;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] l3cache_write_misses;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] l3cache_bank_stalls;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] l3cache_mshr_stalls;
|
|
||||||
|
|
||||||
wire [`PERF_CTR_BITS-1:0] mem_reads;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] mem_writes;
|
|
||||||
wire [`PERF_CTR_BITS-1:0] mem_latency;
|
|
||||||
|
|
||||||
modport master (
|
modport master (
|
||||||
output icache_reads,
|
output icache,
|
||||||
output icache_read_misses,
|
output dcache,
|
||||||
|
output l2cache,
|
||||||
output dcache_reads,
|
output l3cache,
|
||||||
output dcache_writes,
|
output smem,
|
||||||
output dcache_read_misses,
|
output mem
|
||||||
output dcache_write_misses,
|
|
||||||
output dcache_bank_stalls,
|
|
||||||
output dcache_mshr_stalls,
|
|
||||||
|
|
||||||
output smem_reads,
|
|
||||||
output smem_writes,
|
|
||||||
output smem_bank_stalls,
|
|
||||||
|
|
||||||
output l2cache_reads,
|
|
||||||
output l2cache_writes,
|
|
||||||
output l2cache_read_misses,
|
|
||||||
output l2cache_write_misses,
|
|
||||||
output l2cache_bank_stalls,
|
|
||||||
output l2cache_mshr_stalls,
|
|
||||||
|
|
||||||
output l3cache_reads,
|
|
||||||
output l3cache_writes,
|
|
||||||
output l3cache_read_misses,
|
|
||||||
output l3cache_write_misses,
|
|
||||||
output l3cache_bank_stalls,
|
|
||||||
output l3cache_mshr_stalls,
|
|
||||||
|
|
||||||
output mem_reads,
|
|
||||||
output mem_writes,
|
|
||||||
output mem_latency
|
|
||||||
);
|
);
|
||||||
|
|
||||||
modport slave (
|
modport slave (
|
||||||
input icache_reads,
|
input icache,
|
||||||
input icache_read_misses,
|
input dcache,
|
||||||
|
input l2cache,
|
||||||
input dcache_reads,
|
input l3cache,
|
||||||
input dcache_writes,
|
input smem,
|
||||||
input dcache_read_misses,
|
input mem
|
||||||
input dcache_write_misses,
|
|
||||||
input dcache_bank_stalls,
|
|
||||||
input dcache_mshr_stalls,
|
|
||||||
|
|
||||||
input smem_reads,
|
|
||||||
input smem_writes,
|
|
||||||
input smem_bank_stalls,
|
|
||||||
|
|
||||||
input l2cache_reads,
|
|
||||||
input l2cache_writes,
|
|
||||||
input l2cache_read_misses,
|
|
||||||
input l2cache_write_misses,
|
|
||||||
input l2cache_bank_stalls,
|
|
||||||
input l2cache_mshr_stalls,
|
|
||||||
|
|
||||||
input l3cache_reads,
|
|
||||||
input l3cache_writes,
|
|
||||||
input l3cache_read_misses,
|
|
||||||
input l3cache_write_misses,
|
|
||||||
input l3cache_bank_stalls,
|
|
||||||
input l3cache_mshr_stalls,
|
|
||||||
|
|
||||||
input mem_reads,
|
|
||||||
input mem_writes,
|
|
||||||
input mem_latency
|
|
||||||
);
|
);
|
||||||
|
|
||||||
endinterface
|
endinterface
|
||||||
|
|||||||
@@ -1,209 +0,0 @@
|
|||||||
// Copyright © 2019-2023
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
`include "VX_define.vh"
|
|
||||||
|
|
||||||
`define SMEM_ADDR_STACK_OPT
|
|
||||||
|
|
||||||
module VX_mem_unit import VX_gpu_pkg::*; #(
|
|
||||||
parameter CLUSTER_ID = 0
|
|
||||||
) (
|
|
||||||
input wire clk,
|
|
||||||
input wire reset,
|
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
|
||||||
VX_mem_perf_if.master mem_perf_if,
|
|
||||||
`endif
|
|
||||||
|
|
||||||
VX_mem_bus_if.slave icache_bus_if [`NUM_SOCKETS],
|
|
||||||
|
|
||||||
VX_mem_bus_if.slave dcache_bus_if [`NUM_SOCKETS * DCACHE_NUM_REQS],
|
|
||||||
|
|
||||||
VX_mem_bus_if.master mem_bus_if
|
|
||||||
);
|
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
|
||||||
VX_cache_perf_if perf_icache_if();
|
|
||||||
VX_cache_perf_if perf_dcache_if();
|
|
||||||
VX_cache_perf_if perf_l2cache_if();
|
|
||||||
`endif
|
|
||||||
|
|
||||||
/////////////////////////////// I-Cache ///////////////////////////////////
|
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
|
||||||
.DATA_SIZE (ICACHE_LINE_SIZE),
|
|
||||||
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
|
|
||||||
) icache_mem_bus_if();
|
|
||||||
|
|
||||||
`RESET_RELAY (icache_reset, reset);
|
|
||||||
|
|
||||||
VX_cache_cluster #(
|
|
||||||
.INSTANCE_ID ($sformatf("cluster%0d-icache", CLUSTER_ID)),
|
|
||||||
.NUM_UNITS (`NUM_ICACHES),
|
|
||||||
.NUM_INPUTS (`NUM_SOCKETS),
|
|
||||||
.TAG_SEL_IDX (0),
|
|
||||||
.CACHE_SIZE (`ICACHE_SIZE),
|
|
||||||
.LINE_SIZE (ICACHE_LINE_SIZE),
|
|
||||||
.NUM_BANKS (1),
|
|
||||||
.NUM_WAYS (`ICACHE_NUM_WAYS),
|
|
||||||
.WORD_SIZE (ICACHE_WORD_SIZE),
|
|
||||||
.NUM_REQS (1),
|
|
||||||
.CRSQ_SIZE (`ICACHE_CRSQ_SIZE),
|
|
||||||
.MSHR_SIZE (`ICACHE_MSHR_SIZE),
|
|
||||||
.MRSQ_SIZE (`ICACHE_MRSQ_SIZE),
|
|
||||||
.MREQ_SIZE (`ICACHE_MREQ_SIZE),
|
|
||||||
.TAG_WIDTH (ICACHE_ARB_TAG_WIDTH),
|
|
||||||
.UUID_WIDTH (`UUID_WIDTH),
|
|
||||||
.WRITE_ENABLE (0),
|
|
||||||
.CORE_OUT_REG (2),
|
|
||||||
.MEM_OUT_REG (2)
|
|
||||||
) icache (
|
|
||||||
`ifdef PERF_ENABLE
|
|
||||||
.cache_perf_if (perf_icache_if),
|
|
||||||
`endif
|
|
||||||
.clk (clk),
|
|
||||||
.reset (icache_reset),
|
|
||||||
.core_bus_if (icache_bus_if),
|
|
||||||
.mem_bus_if (icache_mem_bus_if)
|
|
||||||
);
|
|
||||||
|
|
||||||
/////////////////////////////// D-Cache ///////////////////////////////////
|
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
|
||||||
.DATA_SIZE (DCACHE_LINE_SIZE),
|
|
||||||
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
|
|
||||||
) dcache_mem_bus_if();
|
|
||||||
|
|
||||||
`RESET_RELAY (dcache_reset, reset);
|
|
||||||
|
|
||||||
VX_cache_cluster #(
|
|
||||||
.INSTANCE_ID ($sformatf("cluster%0d-dcache", CLUSTER_ID)),
|
|
||||||
.NUM_UNITS (`NUM_DCACHES),
|
|
||||||
.NUM_INPUTS (`NUM_SOCKETS),
|
|
||||||
.TAG_SEL_IDX (1),
|
|
||||||
.CACHE_SIZE (`DCACHE_SIZE),
|
|
||||||
.LINE_SIZE (DCACHE_LINE_SIZE),
|
|
||||||
.NUM_BANKS (`DCACHE_NUM_BANKS),
|
|
||||||
.NUM_WAYS (`DCACHE_NUM_WAYS),
|
|
||||||
.WORD_SIZE (DCACHE_WORD_SIZE),
|
|
||||||
.NUM_REQS (DCACHE_NUM_REQS),
|
|
||||||
.CRSQ_SIZE (`DCACHE_CRSQ_SIZE),
|
|
||||||
.MSHR_SIZE (`DCACHE_MSHR_SIZE),
|
|
||||||
.MRSQ_SIZE (`DCACHE_MRSQ_SIZE),
|
|
||||||
.MREQ_SIZE (`DCACHE_MREQ_SIZE),
|
|
||||||
.TAG_WIDTH (DCACHE_ARB_TAG_WIDTH),
|
|
||||||
.UUID_WIDTH (`UUID_WIDTH),
|
|
||||||
.WRITE_ENABLE (1),
|
|
||||||
.NC_ENABLE (1),
|
|
||||||
.CORE_OUT_REG (`SM_ENABLED ? 2 : 1),
|
|
||||||
.MEM_OUT_REG (2)
|
|
||||||
) dcache (
|
|
||||||
`ifdef PERF_ENABLE
|
|
||||||
.cache_perf_if (perf_dcache_if),
|
|
||||||
`endif
|
|
||||||
|
|
||||||
.clk (clk),
|
|
||||||
.reset (dcache_reset),
|
|
||||||
.core_bus_if (dcache_bus_if),
|
|
||||||
.mem_bus_if (dcache_mem_bus_if)
|
|
||||||
);
|
|
||||||
|
|
||||||
/////////////////////////////// L2-Cache //////////////////////////////////
|
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
|
||||||
.DATA_SIZE (L2_WORD_SIZE),
|
|
||||||
.TAG_WIDTH (L2_TAG_WIDTH)
|
|
||||||
) l2_mem_bus_if[L2_NUM_REQS]();
|
|
||||||
|
|
||||||
localparam I_MEM_ARB_IDX = 0;
|
|
||||||
localparam D_MEM_ARB_IDX = I_MEM_ARB_IDX + 1;
|
|
||||||
|
|
||||||
`ASSIGN_VX_MEM_BUS_IF_X (l2_mem_bus_if[I_MEM_ARB_IDX], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
|
|
||||||
`ASSIGN_VX_MEM_BUS_IF_X (l2_mem_bus_if[D_MEM_ARB_IDX], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
|
|
||||||
|
|
||||||
`RESET_RELAY (l2_reset, reset);
|
|
||||||
|
|
||||||
VX_cache_wrap #(
|
|
||||||
.INSTANCE_ID ($sformatf("cluster%0d-l2cache", CLUSTER_ID)),
|
|
||||||
.CACHE_SIZE (`L2_CACHE_SIZE),
|
|
||||||
.LINE_SIZE (`L2_LINE_SIZE),
|
|
||||||
.NUM_BANKS (`L2_NUM_BANKS),
|
|
||||||
.NUM_WAYS (`L2_NUM_WAYS),
|
|
||||||
.WORD_SIZE (L2_WORD_SIZE),
|
|
||||||
.NUM_REQS (L2_NUM_REQS),
|
|
||||||
.CRSQ_SIZE (`L2_CRSQ_SIZE),
|
|
||||||
.MSHR_SIZE (`L2_MSHR_SIZE),
|
|
||||||
.MRSQ_SIZE (`L2_MRSQ_SIZE),
|
|
||||||
.MREQ_SIZE (`L2_MREQ_SIZE),
|
|
||||||
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
|
|
||||||
.WRITE_ENABLE (1),
|
|
||||||
.UUID_WIDTH (`UUID_WIDTH),
|
|
||||||
.CORE_OUT_REG (2),
|
|
||||||
.MEM_OUT_REG (2),
|
|
||||||
.NC_ENABLE (1),
|
|
||||||
.PASSTHRU (!`L2_ENABLED)
|
|
||||||
) l2cache (
|
|
||||||
.clk (clk),
|
|
||||||
.reset (l2_reset),
|
|
||||||
`ifdef PERF_ENABLE
|
|
||||||
.cache_perf_if (perf_l2cache_if),
|
|
||||||
`endif
|
|
||||||
.core_bus_if (l2_mem_bus_if),
|
|
||||||
.mem_bus_if (mem_bus_if)
|
|
||||||
);
|
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
|
||||||
|
|
||||||
`UNUSED_VAR (perf_dcache_if.mem_stalls)
|
|
||||||
`UNUSED_VAR (perf_dcache_if.crsp_stalls)
|
|
||||||
|
|
||||||
assign mem_perf_if.icache_reads = perf_icache_if.reads;
|
|
||||||
assign mem_perf_if.icache_read_misses = perf_icache_if.read_misses;
|
|
||||||
|
|
||||||
assign mem_perf_if.dcache_reads = perf_dcache_if.reads;
|
|
||||||
assign mem_perf_if.dcache_writes = perf_dcache_if.writes;
|
|
||||||
assign mem_perf_if.dcache_read_misses = perf_dcache_if.read_misses;
|
|
||||||
assign mem_perf_if.dcache_write_misses= perf_dcache_if.write_misses;
|
|
||||||
assign mem_perf_if.dcache_bank_stalls = perf_dcache_if.bank_stalls;
|
|
||||||
assign mem_perf_if.dcache_mshr_stalls = perf_dcache_if.mshr_stalls;
|
|
||||||
|
|
||||||
`ifdef L2_ENABLE
|
|
||||||
assign mem_perf_if.l2cache_reads = perf_l2cache_if.reads;
|
|
||||||
assign mem_perf_if.l2cache_writes = perf_l2cache_if.writes;
|
|
||||||
assign mem_perf_if.l2cache_read_misses = perf_l2cache_if.read_misses;
|
|
||||||
assign mem_perf_if.l2cache_write_misses= perf_l2cache_if.write_misses;
|
|
||||||
assign mem_perf_if.l2cache_bank_stalls = perf_l2cache_if.bank_stalls;
|
|
||||||
assign mem_perf_if.l2cache_mshr_stalls = perf_l2cache_if.mshr_stalls;
|
|
||||||
`else
|
|
||||||
assign mem_perf_if.l2cache_reads = '0;
|
|
||||||
assign mem_perf_if.l2cache_writes = '0;
|
|
||||||
assign mem_perf_if.l2cache_read_misses = '0;
|
|
||||||
assign mem_perf_if.l2cache_write_misses= '0;
|
|
||||||
assign mem_perf_if.l2cache_bank_stalls = '0;
|
|
||||||
assign mem_perf_if.l2cache_mshr_stalls = '0;
|
|
||||||
`endif
|
|
||||||
|
|
||||||
assign mem_perf_if.l3cache_reads = '0;
|
|
||||||
assign mem_perf_if.l3cache_writes = '0;
|
|
||||||
assign mem_perf_if.l3cache_read_misses = '0;
|
|
||||||
assign mem_perf_if.l3cache_write_misses= '0;
|
|
||||||
assign mem_perf_if.l3cache_bank_stalls = '0;
|
|
||||||
assign mem_perf_if.l3cache_mshr_stalls = '0;
|
|
||||||
|
|
||||||
assign mem_perf_if.mem_reads = '0;
|
|
||||||
assign mem_perf_if.mem_writes = '0;
|
|
||||||
assign mem_perf_if.mem_latency = '0;
|
|
||||||
|
|
||||||
`endif
|
|
||||||
|
|
||||||
endmodule
|
|
||||||
@@ -13,7 +13,7 @@
|
|||||||
|
|
||||||
`include "VX_define.vh"
|
`include "VX_define.vh"
|
||||||
|
|
||||||
module VX_shared_mem #(
|
module VX_shared_mem import VX_gpu_pkg::*; #(
|
||||||
parameter `STRING INSTANCE_ID = "",
|
parameter `STRING INSTANCE_ID = "",
|
||||||
|
|
||||||
// Size of cache in bytes
|
// Size of cache in bytes
|
||||||
@@ -40,7 +40,7 @@ module VX_shared_mem #(
|
|||||||
|
|
||||||
// PERF
|
// PERF
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
VX_cache_perf_if.master cache_perf_if,
|
output cache_perf_t cache_perf,
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
// Core request
|
// Core request
|
||||||
@@ -106,6 +106,10 @@ module VX_shared_mem #(
|
|||||||
wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_data_in;
|
wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_data_in;
|
||||||
wire [NUM_BANKS-1:0][REQ_DATAW-1:0] req_data_out;
|
wire [NUM_BANKS-1:0][REQ_DATAW-1:0] req_data_out;
|
||||||
|
|
||||||
|
`ifdef PERF_ENABLE
|
||||||
|
wire [`PERF_CTR_BITS-1:0] perf_collisions;
|
||||||
|
`endif
|
||||||
|
|
||||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||||
assign req_data_in[i] = {
|
assign req_data_in[i] = {
|
||||||
req_rw[i],
|
req_rw[i],
|
||||||
@@ -125,7 +129,7 @@ module VX_shared_mem #(
|
|||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.collisions (cache_perf_if.bank_stalls),
|
.collisions (perf_collisions),
|
||||||
`else
|
`else
|
||||||
`UNUSED_PIN (collisions),
|
`UNUSED_PIN (collisions),
|
||||||
`endif
|
`endif
|
||||||
@@ -253,13 +257,14 @@ module VX_shared_mem #(
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
assign cache_perf_if.reads = perf_reads;
|
assign cache_perf.reads = perf_reads;
|
||||||
assign cache_perf_if.writes = perf_writes;
|
assign cache_perf.writes = perf_writes;
|
||||||
assign cache_perf_if.read_misses = '0;
|
assign cache_perf.read_misses = '0;
|
||||||
assign cache_perf_if.write_misses = '0;
|
assign cache_perf.write_misses = '0;
|
||||||
assign cache_perf_if.mshr_stalls = '0;
|
assign cache_perf.bank_stalls = perf_collisions;
|
||||||
assign cache_perf_if.mem_stalls = '0;
|
assign cache_perf.mshr_stalls = '0;
|
||||||
assign cache_perf_if.crsp_stalls = perf_crsp_stalls;
|
assign cache_perf.mem_stalls = '0;
|
||||||
|
assign cache_perf.crsp_stalls = perf_crsp_stalls;
|
||||||
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
|||||||
@@ -179,6 +179,25 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
uint64_t cycles = 0;
|
uint64_t cycles = 0;
|
||||||
|
|
||||||
#ifdef PERF_ENABLE
|
#ifdef PERF_ENABLE
|
||||||
|
|
||||||
|
// Returns the complement of part/total as an integer percentage, i.e.
// (1 - part/total) * 100 truncated by the int() conversion. The call sites
// visible in this diff pass a miss count as `part` and the matching access
// count as `total`, so the result is printed as a hit ratio.
// Yields 0 when total is 0 so the division is never performed on a zero
// denominator.
auto calcRatio = [&](uint64_t part, uint64_t total)->int {
|
||||||
|
if (total == 0)
|
||||||
|
return 0;
|
||||||
|
return int((1.0 - (double(part) / double(total))) * 100);
|
||||||
|
};
|
||||||
|
|
||||||
|
// Returns sum/requests converted to int (fractional part dropped), used by
// the callers visible in this diff as an average latency in cycles.
// Yields 0 when requests is 0 so the division is never performed on a zero
// denominator.
// NOTE(review): the identifier is spelled "cacl" rather than "calc"; the
// call sites visible in this diff use the same spelling, so a rename has to
// change the definition and every call together.
auto caclAvgLatency = [&](uint64_t sum, uint64_t requests)->int {
|
||||||
|
if (requests == 0)
|
||||||
|
return 0;
|
||||||
|
return int(double(sum) / double(requests));
|
||||||
|
};
|
||||||
|
|
||||||
|
// Returns count / (count + stalls) as an integer percentage (fractional
// part dropped by the int() conversion). The call sites visible in this
// diff pass reads + writes as `count` and the bank-stall counter as
// `stalls`.
// Yields 0 when count is 0, which also keeps the count == stalls == 0 case
// from dividing by zero.
auto calcUtilization = [&](uint64_t count, uint64_t stalls)->int {
|
||||||
|
if (count == 0)
|
||||||
|
return 0;
|
||||||
|
return int((double(count) / double(count + stalls)) * 100);
|
||||||
|
};
|
||||||
|
|
||||||
auto perf_class = gAutoPerfDump.get_perf_class();
|
auto perf_class = gAutoPerfDump.get_perf_class();
|
||||||
|
|
||||||
// PERF: pipeline stalls
|
// PERF: pipeline stalls
|
||||||
@@ -193,20 +212,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
uint64_t stores = 0;
|
uint64_t stores = 0;
|
||||||
uint64_t ifetch_lat = 0;
|
uint64_t ifetch_lat = 0;
|
||||||
uint64_t load_lat = 0;
|
uint64_t load_lat = 0;
|
||||||
// PERF: Icache
|
|
||||||
uint64_t icache_reads = 0;
|
|
||||||
uint64_t icache_read_misses = 0;
|
|
||||||
// PERF: Dcache
|
|
||||||
uint64_t dcache_reads = 0;
|
|
||||||
uint64_t dcache_writes = 0;
|
|
||||||
uint64_t dcache_read_misses = 0;
|
|
||||||
uint64_t dcache_write_misses = 0;
|
|
||||||
uint64_t dcache_bank_stalls = 0;
|
|
||||||
uint64_t dcache_mshr_stalls = 0;
|
|
||||||
// PERF: shared memory
|
|
||||||
uint64_t smem_reads = 0;
|
|
||||||
uint64_t smem_writes = 0;
|
|
||||||
uint64_t smem_bank_stalls = 0;
|
|
||||||
// PERF: l2cache
|
// PERF: l2cache
|
||||||
uint64_t l2cache_reads = 0;
|
uint64_t l2cache_reads = 0;
|
||||||
uint64_t l2cache_writes = 0;
|
uint64_t l2cache_writes = 0;
|
||||||
@@ -232,6 +237,19 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
if (ret != 0)
|
if (ret != 0)
|
||||||
return ret;
|
return ret;
|
||||||
|
|
||||||
|
#ifdef PERF_ENABLE
|
||||||
|
uint64_t isa_flags;
|
||||||
|
ret = vx_dev_caps(hdevice, VX_CAPS_ISA_FLAGS, &isa_flags);
|
||||||
|
if (ret != 0)
|
||||||
|
return ret;
|
||||||
|
|
||||||
|
bool icache_enable = isa_flags & VX_ISA_EXT_ICACHE;
|
||||||
|
bool dcache_enable = isa_flags & VX_ISA_EXT_DCACHE;
|
||||||
|
bool l2cache_enable = isa_flags & VX_ISA_EXT_L2CACHE;
|
||||||
|
bool l3cache_enable = isa_flags & VX_ISA_EXT_L3CACHE;
|
||||||
|
bool smem_enable = isa_flags & VX_ISA_EXT_SMEM;
|
||||||
|
#endif
|
||||||
|
|
||||||
std::vector<uint8_t> staging_buf(64* sizeof(uint32_t));
|
std::vector<uint8_t> staging_buf(64* sizeof(uint32_t));
|
||||||
|
|
||||||
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
|
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
|
||||||
@@ -240,13 +258,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
if (ret != 0)
|
if (ret != 0)
|
||||||
return ret;
|
return ret;
|
||||||
|
|
||||||
uint64_t instrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MINSTRET);
|
|
||||||
uint64_t cycles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MCYCLE);
|
|
||||||
float IPC = (float)(double(instrs_per_core) / double(cycles_per_core));
|
|
||||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC);
|
|
||||||
instrs += instrs_per_core;
|
|
||||||
cycles = std::max<uint64_t>(cycles_per_core, cycles);
|
|
||||||
|
|
||||||
#ifdef PERF_ENABLE
|
#ifdef PERF_ENABLE
|
||||||
switch (perf_class) {
|
switch (perf_class) {
|
||||||
case VX_DCR_MPM_CLASS_CORE: {
|
case VX_DCR_MPM_CLASS_CORE: {
|
||||||
@@ -291,45 +302,70 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
// ifetch latency
|
// ifetch latency
|
||||||
uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LAT);
|
uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LAT);
|
||||||
if (num_cores > 1) {
|
if (num_cores > 1) {
|
||||||
int mem_avg_lat = (int)(double(ifetch_lat_per_core) / double(ifetches_per_core));
|
int mem_avg_lat = caclAvgLatency(ifetch_lat_per_core, ifetches_per_core);
|
||||||
fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
|
fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
|
||||||
}
|
}
|
||||||
ifetch_lat += ifetch_lat_per_core;
|
ifetch_lat += ifetch_lat_per_core;
|
||||||
// load latency
|
// load latency
|
||||||
uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LAT);
|
uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LAT);
|
||||||
if (num_cores > 1) {
|
if (num_cores > 1) {
|
||||||
int mem_avg_lat = (int)(double(load_lat_per_core) / double(loads_per_core));
|
int mem_avg_lat = caclAvgLatency(load_lat_per_core, loads_per_core);
|
||||||
fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
|
fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
|
||||||
}
|
}
|
||||||
load_lat += load_lat_per_core;
|
load_lat += load_lat_per_core;
|
||||||
} break;
|
} break;
|
||||||
case VX_DCR_MPM_CLASS_MEM: {
|
case VX_DCR_MPM_CLASS_MEM: {
|
||||||
if (0 == core_id) {
|
if (smem_enable) {
|
||||||
// PERF: Icache
|
|
||||||
icache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_READS);
|
|
||||||
icache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_MISS_R);
|
|
||||||
|
|
||||||
// PERF: Dcache
|
|
||||||
dcache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_READS);
|
|
||||||
dcache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_WRITES);
|
|
||||||
dcache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MISS_R);
|
|
||||||
dcache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MISS_W);
|
|
||||||
dcache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_BANK_ST);
|
|
||||||
dcache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MSHR_ST);
|
|
||||||
|
|
||||||
// PERF: smem
|
// PERF: smem
|
||||||
smem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_READS);
|
uint64_t smem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_READS);
|
||||||
smem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_WRITES);
|
uint64_t smem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_WRITES);
|
||||||
smem_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_BANK_ST);
|
uint64_t smem_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_BANK_ST);
|
||||||
|
int smem_bank_utilization = calcUtilization(smem_reads + smem_writes, smem_bank_stalls);
|
||||||
|
fprintf(stream, "PERF: core%d: smem reads=%ld\n", core_id, smem_reads);
|
||||||
|
fprintf(stream, "PERF: core%d: smem writes=%ld\n", core_id, smem_writes);
|
||||||
|
fprintf(stream, "PERF: core%d: smem bank stalls=%ld (utilization=%d%%)\n", core_id, smem_bank_stalls, smem_bank_utilization);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (icache_enable) {
|
||||||
|
// PERF: Icache
|
||||||
|
uint64_t icache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_READS);
|
||||||
|
uint64_t icache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_MISS_R);
|
||||||
|
int icache_read_hit_ratio = calcRatio(icache_read_misses, icache_reads);
|
||||||
|
fprintf(stream, "PERF: core%d: icache reads=%ld\n", core_id, icache_reads);
|
||||||
|
fprintf(stream, "PERF: core%d: icache read misses=%ld (hit ratio=%d%%)\n", core_id, icache_read_misses, icache_read_hit_ratio);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dcache_enable) {
|
||||||
|
// PERF: Dcache
|
||||||
|
uint64_t dcache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_READS);
|
||||||
|
uint64_t dcache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_WRITES);
|
||||||
|
uint64_t dcache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MISS_R);
|
||||||
|
uint64_t dcache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MISS_W);
|
||||||
|
uint64_t dcache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_BANK_ST);
|
||||||
|
uint64_t dcache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MSHR_ST);
|
||||||
|
int dcache_read_hit_ratio = calcRatio(dcache_read_misses, dcache_reads);
|
||||||
|
int dcache_write_hit_ratio = calcRatio(dcache_write_misses, dcache_writes);
|
||||||
|
int dcache_bank_utilization = calcUtilization(dcache_reads + dcache_writes, dcache_bank_stalls);
|
||||||
|
fprintf(stream, "PERF: core%d: dcache reads=%ld\n", core_id, dcache_reads);
|
||||||
|
fprintf(stream, "PERF: core%d: dcache writes=%ld\n", core_id, dcache_writes);
|
||||||
|
fprintf(stream, "PERF: core%d: dcache read misses=%ld (hit ratio=%d%%)\n", core_id, dcache_read_misses, dcache_read_hit_ratio);
|
||||||
|
fprintf(stream, "PERF: core%d: dcache write misses=%ld (hit ratio=%d%%)\n", core_id, dcache_write_misses, dcache_write_hit_ratio);
|
||||||
|
fprintf(stream, "PERF: core%d: dcache bank stalls=%ld (utilization=%d%%)\n", core_id, dcache_bank_stalls, dcache_bank_utilization);
|
||||||
|
fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld\n", core_id, dcache_mshr_stalls);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (l2cache_enable) {
|
||||||
// PERF: L2cache
|
// PERF: L2cache
|
||||||
l2cache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_READS);
|
l2cache_reads += get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_READS);
|
||||||
l2cache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_WRITES);
|
l2cache_writes += get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_WRITES);
|
||||||
l2cache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MISS_R);
|
l2cache_read_misses += get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MISS_R);
|
||||||
l2cache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MISS_W);
|
l2cache_write_misses += get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MISS_W);
|
||||||
l2cache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_BANK_ST);
|
l2cache_bank_stalls += get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_BANK_ST);
|
||||||
l2cache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MSHR_ST);
|
l2cache_mshr_stalls += get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MSHR_ST);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (0 == core_id) {
|
||||||
|
if (l3cache_enable) {
|
||||||
// PERF: L3cache
|
// PERF: L3cache
|
||||||
l3cache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_READS);
|
l3cache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_READS);
|
||||||
l3cache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_WRITES);
|
l3cache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_WRITES);
|
||||||
@@ -337,6 +373,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
l3cache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MISS_W);
|
l3cache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MISS_W);
|
||||||
l3cache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_BANK_ST);
|
l3cache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_BANK_ST);
|
||||||
l3cache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MSHR_ST);
|
l3cache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MSHR_ST);
|
||||||
|
}
|
||||||
|
|
||||||
// PERF: memory
|
// PERF: memory
|
||||||
mem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_READS);
|
mem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_READS);
|
||||||
@@ -348,10 +385,14 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
|
||||||
|
|
||||||
float IPC = (float)(double(instrs) / double(cycles));
|
uint64_t instrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MINSTRET);
|
||||||
fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
|
uint64_t cycles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MCYCLE);
|
||||||
|
float IPC = (float)(double(instrs_per_core) / double(cycles_per_core));
|
||||||
|
if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC);
|
||||||
|
instrs += instrs_per_core;
|
||||||
|
cycles = std::max<uint64_t>(cycles_per_core, cycles);
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef PERF_ENABLE
|
#ifdef PERF_ENABLE
|
||||||
switch (perf_class) {
|
switch (perf_class) {
|
||||||
@@ -369,44 +410,40 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
fprintf(stream, "PERF: stores=%ld\n", stores);
|
fprintf(stream, "PERF: stores=%ld\n", stores);
|
||||||
fprintf(stream, "PERF: ifetch latency=%d cycles\n", ifetch_avg_lat);
|
fprintf(stream, "PERF: ifetch latency=%d cycles\n", ifetch_avg_lat);
|
||||||
fprintf(stream, "PERF: load latency=%d cycles\n", load_avg_lat);
|
fprintf(stream, "PERF: load latency=%d cycles\n", load_avg_lat);
|
||||||
|
|
||||||
} break;
|
} break;
|
||||||
case VX_DCR_MPM_CLASS_MEM: {
|
case VX_DCR_MPM_CLASS_MEM: {
|
||||||
int icache_read_hit_ratio = (int)((1.0 - (double(icache_read_misses) / double(icache_reads))) * 100);
|
if (l2cache_enable) {
|
||||||
int dcache_read_hit_ratio = (int)((1.0 - (double(dcache_read_misses) / double(dcache_reads))) * 100);
|
l2cache_reads /= num_cores;
|
||||||
int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_write_misses) / double(dcache_writes))) * 100);
|
l2cache_writes /= num_cores;
|
||||||
int dcache_bank_utilization = (int)((double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls)) * 100);
|
l2cache_read_misses /= num_cores;
|
||||||
int l2cache_read_hit_ratio = (int)((1.0 - (double(l2cache_read_misses) / double(l2cache_reads))) * 100);
|
l2cache_write_misses /= num_cores;
|
||||||
int l2cache_write_hit_ratio = (int)((1.0 - (double(l2cache_write_misses) / double(l2cache_writes))) * 100);
|
l2cache_bank_stalls /= num_cores;
|
||||||
int l2cache_bank_utilization = (int)((double(l2cache_reads + l2cache_writes) / double(l2cache_reads + l2cache_writes + l2cache_bank_stalls)) * 100);
|
l2cache_mshr_stalls /= num_cores;
|
||||||
int l3cache_read_hit_ratio = (int)((1.0 - (double(l3cache_read_misses) / double(l3cache_reads))) * 100);
|
int l2cache_read_hit_ratio = calcRatio(l2cache_read_misses, l2cache_reads);
|
||||||
int l3cache_write_hit_ratio = (int)((1.0 - (double(l3cache_write_misses) / double(l3cache_writes))) * 100);
|
int l2cache_write_hit_ratio = calcRatio(l2cache_write_misses, l2cache_writes);
|
||||||
int l3cache_bank_utilization = (int)((double(l3cache_reads + l3cache_writes) / double(l3cache_reads + l3cache_writes + l3cache_bank_stalls)) * 100);
|
int l2cache_bank_utilization = calcUtilization(l2cache_reads + l2cache_writes, l2cache_bank_stalls);
|
||||||
int smem_bank_utilization = (int)((double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls)) * 100);
|
|
||||||
int mem_avg_lat = (int)(double(mem_lat) / double(mem_reads));
|
|
||||||
fprintf(stream, "PERF: icache reads=%ld\n", icache_reads);
|
|
||||||
fprintf(stream, "PERF: icache read misses=%ld (hit ratio=%d%%)\n", icache_read_misses, icache_read_hit_ratio);
|
|
||||||
fprintf(stream, "PERF: dcache reads=%ld\n", dcache_reads);
|
|
||||||
fprintf(stream, "PERF: dcache writes=%ld\n", dcache_writes);
|
|
||||||
fprintf(stream, "PERF: dcache read misses=%ld (hit ratio=%d%%)\n", dcache_read_misses, dcache_read_hit_ratio);
|
|
||||||
fprintf(stream, "PERF: dcache write misses=%ld (hit ratio=%d%%)\n", dcache_write_misses, dcache_write_hit_ratio);
|
|
||||||
fprintf(stream, "PERF: dcache bank stalls=%ld (utilization=%d%%)\n", dcache_bank_stalls, dcache_bank_utilization);
|
|
||||||
fprintf(stream, "PERF: dcache mshr stalls=%ld\n", dcache_mshr_stalls);
|
|
||||||
fprintf(stream, "PERF: smem reads=%ld\n", smem_reads);
|
|
||||||
fprintf(stream, "PERF: smem writes=%ld\n", smem_writes);
|
|
||||||
fprintf(stream, "PERF: smem bank stalls=%ld (utilization=%d%%)\n", smem_bank_stalls, smem_bank_utilization);
|
|
||||||
fprintf(stream, "PERF: l2cache reads=%ld\n", l2cache_reads);
|
fprintf(stream, "PERF: l2cache reads=%ld\n", l2cache_reads);
|
||||||
fprintf(stream, "PERF: l2cache writes=%ld\n", l2cache_writes);
|
fprintf(stream, "PERF: l2cache writes=%ld\n", l2cache_writes);
|
||||||
fprintf(stream, "PERF: l2cache read misses=%ld (hit ratio=%d%%)\n", l2cache_read_misses, l2cache_read_hit_ratio);
|
fprintf(stream, "PERF: l2cache read misses=%ld (hit ratio=%d%%)\n", l2cache_read_misses, l2cache_read_hit_ratio);
|
||||||
fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, l2cache_write_hit_ratio);
|
fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, l2cache_write_hit_ratio);
|
||||||
fprintf(stream, "PERF: l2cache bank stalls=%ld (utilization=%d%%)\n", l2cache_bank_stalls, l2cache_bank_utilization);
|
fprintf(stream, "PERF: l2cache bank stalls=%ld (utilization=%d%%)\n", l2cache_bank_stalls, l2cache_bank_utilization);
|
||||||
fprintf(stream, "PERF: l2cache mshr stalls=%ld\n", l2cache_mshr_stalls);
|
fprintf(stream, "PERF: l2cache mshr stalls=%ld\n", l2cache_mshr_stalls);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (l3cache_enable) {
|
||||||
|
int l3cache_read_hit_ratio = calcRatio(l3cache_read_misses, l3cache_reads);
|
||||||
|
int l3cache_write_hit_ratio = calcRatio(l3cache_write_misses, l3cache_writes);
|
||||||
|
int l3cache_bank_utilization = calcUtilization(l3cache_reads + l3cache_writes, l3cache_bank_stalls);
|
||||||
fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads);
|
fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads);
|
||||||
fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes);
|
fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes);
|
||||||
fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, l3cache_read_hit_ratio);
|
fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, l3cache_read_hit_ratio);
|
||||||
fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, l3cache_write_hit_ratio);
|
fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, l3cache_write_hit_ratio);
|
||||||
fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, l3cache_bank_utilization);
|
fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, l3cache_bank_utilization);
|
||||||
fprintf(stream, "PERF: l3cache mshr stalls=%ld\n", l3cache_mshr_stalls);
|
fprintf(stream, "PERF: l3cache mshr stalls=%ld\n", l3cache_mshr_stalls);
|
||||||
|
}
|
||||||
|
|
||||||
|
int mem_avg_lat = caclAvgLatency(mem_lat, mem_reads);
|
||||||
fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes);
|
fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes);
|
||||||
fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat);
|
fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat);
|
||||||
} break;
|
} break;
|
||||||
@@ -415,6 +452,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
float IPC = (float)(double(instrs) / double(cycles));
|
||||||
|
fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
|
||||||
|
|
||||||
fflush(stream);
|
fflush(stream);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|||||||
@@ -47,10 +47,12 @@ typedef void* vx_device_h;
|
|||||||
#define VX_ISA_STD_Q (1ull << 16)
|
#define VX_ISA_STD_Q (1ull << 16)
|
||||||
#define VX_ISA_STD_S (1ull << 18)
|
#define VX_ISA_STD_S (1ull << 18)
|
||||||
#define VX_ISA_STD_U (1ull << 20)
|
#define VX_ISA_STD_U (1ull << 20)
|
||||||
#define VX_ISA_BASE(flags) (1 << (((flags >> 30) & 0x3) + 4))
|
#define VX_ISA_ARCH(flags) (1 << (((flags >> 30) & 0x3) + 4))
|
||||||
#define VX_ISA_EXT_TEX (1ull << 32)
|
#define VX_ISA_EXT_ICACHE (1ull << 32)
|
||||||
#define VX_ISA_EXT_RASTER (1ull << 33)
|
#define VX_ISA_EXT_DCACHE (1ull << 33)
|
||||||
#define VX_ISA_EXT_ROP (1ull << 34)
|
#define VX_ISA_EXT_L2CACHE (1ull << 34)
|
||||||
|
#define VX_ISA_EXT_L3CACHE (1ull << 35)
|
||||||
|
#define VX_ISA_EXT_SMEM (1ull << 36)
|
||||||
|
|
||||||
// device memory types
|
// device memory types
|
||||||
#define VX_MEM_TYPE_GLOBAL 0
|
#define VX_MEM_TYPE_GLOBAL 0
|
||||||
|
|||||||
Reference in New Issue
Block a user