From c9e6518e05f58a1ef1080b5b5de3187c046ed5cf Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Fri, 3 Nov 2023 08:18:18 -0400 Subject: [PATCH] cache bindings and memory perf refactory --- hw/rtl/VX_cluster.sv | 71 +++++---- hw/rtl/VX_config.vh | 235 +++++++++++++++--------------- hw/rtl/VX_define.vh | 9 +- hw/rtl/VX_gpu_pkg.sv | 32 +++-- hw/rtl/VX_socket.sv | 181 +++++++++++++++-------- hw/rtl/VX_types.vh | 3 - hw/rtl/Vortex.sv | 216 ++++++++++++---------------- hw/rtl/cache/VX_cache.sv | 27 ++-- hw/rtl/cache/VX_cache_cluster.sv | 13 +- hw/rtl/cache/VX_cache_perf_if.sv | 49 ------- hw/rtl/cache/VX_cache_wrap.sv | 17 +-- hw/rtl/core/VX_core.sv | 27 +++- hw/rtl/core/VX_csr_data.sv | 106 +++++++------- hw/rtl/core/VX_smem_unit.sv | 187 ++++++++---------------- hw/rtl/core/VX_trace.vh | 3 - hw/rtl/mem/VX_mem_perf_if.sv | 113 +++------------ hw/rtl/mem/VX_mem_unit.sv | 209 --------------------------- hw/rtl/mem/VX_shared_mem.sv | 25 ++-- runtime/common/utils.cpp | 238 ++++++++++++++++++------------- runtime/include/vortex.h | 10 +- 20 files changed, 746 insertions(+), 1025 deletions(-) delete mode 100644 hw/rtl/cache/VX_cache_perf_if.sv delete mode 100644 hw/rtl/mem/VX_mem_unit.sv diff --git a/hw/rtl/VX_cluster.sv b/hw/rtl/VX_cluster.sv index 782bde9a..d537249d 100644 --- a/hw/rtl/VX_cluster.sv +++ b/hw/rtl/VX_cluster.sv @@ -23,10 +23,10 @@ module VX_cluster import VX_gpu_pkg::*; #( input wire reset, `ifdef PERF_ENABLE - VX_mem_perf_if.master mem_perf_if, - VX_mem_perf_if.slave perf_memsys_total_if, + VX_mem_perf_if.slave mem_perf_if, `endif + // DCRs VX_dcr_bus_if.slave dcr_bus_if, // Memory @@ -71,33 +71,52 @@ module VX_cluster import VX_gpu_pkg::*; #( ); `endif - VX_mem_bus_if #( - .DATA_SIZE (DCACHE_WORD_SIZE), - .TAG_WIDTH (DCACHE_ARB_TAG_WIDTH) - ) per_socket_dcache_bus_if[`NUM_SOCKETS * DCACHE_NUM_REQS](); +`ifdef PERF_ENABLE + VX_mem_perf_if mem_perf_tmp_if(); + cache_perf_t perf_l2cache; + assign mem_perf_tmp_if.icache = 'x; + assign mem_perf_tmp_if.dcache = 'x; + assign mem_perf_tmp_if.l2cache = perf_l2cache; + assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache; + assign mem_perf_tmp_if.smem = 'x; + assign mem_perf_tmp_if.mem = mem_perf_if.mem; +`endif + VX_mem_bus_if #( - .DATA_SIZE (ICACHE_WORD_SIZE), - .TAG_WIDTH (ICACHE_ARB_TAG_WIDTH) - ) per_socket_icache_bus_if[`NUM_SOCKETS](); + .DATA_SIZE (`L1_LINE_SIZE), + .TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH) + ) per_socket_mem_bus_if[`NUM_SOCKETS](); - `RESET_RELAY (mem_unit_reset, reset); - - VX_mem_unit #( - .CLUSTER_ID (CLUSTER_ID) - ) mem_unit ( - .clk (clk), - .reset (mem_unit_reset), + `RESET_RELAY (l2_reset, reset); + VX_cache_wrap #( + .INSTANCE_ID ("l2cache"), + .CACHE_SIZE (`L2_CACHE_SIZE), + .LINE_SIZE (`L2_LINE_SIZE), + .NUM_BANKS (`L2_NUM_BANKS), + .NUM_WAYS (`L2_NUM_WAYS), + .WORD_SIZE (L2_WORD_SIZE), + .NUM_REQS (L2_NUM_REQS), + .CRSQ_SIZE (`L2_CRSQ_SIZE), + .MSHR_SIZE (`L2_MSHR_SIZE), + .MRSQ_SIZE (`L2_MRSQ_SIZE), + .MREQ_SIZE (`L2_MREQ_SIZE), + .TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH), + .WRITE_ENABLE (1), + .UUID_WIDTH (`UUID_WIDTH), + .CORE_OUT_REG (2), + .MEM_OUT_REG (2), + .NC_ENABLE (1), + .PASSTHRU (!`L2_ENABLED) + ) l2cache ( + .clk (clk), + .reset (l2_reset), `ifdef PERF_ENABLE - .mem_perf_if (mem_perf_if), + .cache_perf (perf_l2cache), `endif - - .dcache_bus_if (per_socket_dcache_bus_if), - - .icache_bus_if (per_socket_icache_bus_if), - - .mem_bus_if (mem_bus_if) + .core_bus_if (per_socket_mem_bus_if), + .mem_bus_if (mem_bus_if) ); /////////////////////////////////////////////////////////////////////////// @@ -131,14 +150,12 @@ module VX_cluster import VX_gpu_pkg::*; #( .reset (socket_reset), `ifdef PERF_ENABLE - .mem_perf_if (perf_memsys_total_if), + .mem_perf_if (mem_perf_tmp_if), `endif .dcr_bus_if (socket_dcr_bus_if), - .dcache_bus_if (per_socket_dcache_bus_if[i * DCACHE_NUM_REQS +: DCACHE_NUM_REQS]), - - .icache_bus_if (per_socket_icache_bus_if[i]), + .mem_bus_if (per_socket_mem_bus_if[i]), `ifdef GBAR_ENABLE .gbar_bus_if (per_socket_gbar_bus_if[i]), diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index f5bdb2d2..6ecb3cf4 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -32,7 +32,14 @@ /////////////////////////////////////////////////////////////////////////////// -// 32 bit XLEN as default. +`ifndef EXT_M_DISABLE +`define EXT_M_ENABLE +`endif + +`ifndef EXT_F_DISABLE +`define EXT_F_ENABLE +`endif + `ifndef XLEN_32 `ifndef XLEN_64 `define XLEN_32 @@ -47,6 +54,26 @@ `define XLEN 32 `endif +`ifdef EXT_D_ENABLE +`define FLEN_64 +`else +`define FLEN_32 +`endif + +`ifdef FLEN_64 +`define FLEN 64 +`endif + +`ifdef FLEN_32 +`define FLEN 32 +`endif + +`ifdef XLEN_64 +`ifdef FLEN_32 + `define FPU_RV64F +`endif +`endif + `ifndef NUM_CLUSTERS `define NUM_CLUSTERS 1 `endif @@ -70,6 +97,7 @@ `ifndef SOCKET_SIZE `define SOCKET_SIZE `MIN(4, `NUM_CORES) `endif +`define NUM_SOCKETS `UP(`NUM_CORES / `SOCKET_SIZE) `ifdef L2_ENABLE `define L2_ENABLED 1 @@ -186,119 +214,6 @@ `define DEBUG_LEVEL 3 `endif -// ISA Extensions ///////////////////////////////////////////////////////////// - -`ifndef EXT_M_DISABLE -`define EXT_M_ENABLE -`endif - -`ifndef EXT_F_DISABLE -`define EXT_F_ENABLE -`endif - -`ifdef EXT_D_ENABLE -`define FLEN_64 -`else -`define FLEN_32 -`endif - -`ifdef FLEN_64 -`define FLEN 64 -`endif - -`ifdef FLEN_32 -`define FLEN 32 -`endif - -`ifdef XLEN_64 -`ifdef FLEN_32 - `define FPU_RV64F -`endif -`endif - -`define ISA_STD_A 0 -`define ISA_STD_C 2 -`define ISA_STD_D 3 -`define ISA_STD_E 4 -`define ISA_STD_F 5 -`define ISA_STD_H 7 -`define ISA_STD_I 8 -`define ISA_STD_N 13 -`define ISA_STD_Q 16 -`define ISA_STD_S 18 -`define ISA_STD_U 20 - -`define ISA_EXT_TEX 0 -`define ISA_EXT_RASTER 1 -`define ISA_EXT_ROP 2 - -`ifdef EXT_A_ENABLE - `define EXT_A_ENABLED 1 -`else - `define EXT_A_ENABLED 0 -`endif - -`ifdef EXT_C_ENABLE - `define EXT_C_ENABLED 1 -`else - `define EXT_C_ENABLED 0 -`endif - -`ifdef EXT_D_ENABLE - `define EXT_D_ENABLED 1 -`else - `define EXT_D_ENABLED 0 -`endif - -`ifdef EXT_F_ENABLE - `define EXT_F_ENABLED 1 -`else - `define EXT_F_ENABLED 0 -`endif - -`ifdef EXT_M_ENABLE - `define EXT_M_ENABLED 1 -`else - `define EXT_M_ENABLED 0 -`endif - -`define ISA_X_ENABLED 0 - -`define MISA_EXT 0 - -`define MISA_STD (`EXT_A_ENABLED << 0) /* A - Atomic Instructions extension */ \ - | (0 << 1) /* B - Tentatively reserved for Bit operations extension */ \ - | (`EXT_C_ENABLED << 2) /* C - Compressed extension */ \ - | (`EXT_D_ENABLED << 3) /* D - Double precsision floating-point extension */ \ - | (0 << 4) /* E - RV32E base ISA */ \ - | (`EXT_F_ENABLED << 5) /* F - Single precsision floating-point extension */ \ - | (0 << 6) /* G - Additional standard extensions present */ \ - | (0 << 7) /* H - Hypervisor mode implemented */ \ - | (1 << 8) /* I - RV32I/64I/128I base ISA */ \ - | (0 << 9) /* J - Reserved */ \ - | (0 << 10) /* K - Reserved */ \ - | (0 << 11) /* L - Tentatively reserved for Bit operations extension */ \ - | (`EXT_M_ENABLED << 12) /* M - Integer Multiply/Divide extension */ \ - | (0 << 13) /* N - User level interrupts supported */ \ - | (0 << 14) /* O - Reserved */ \ - | (0 << 15) /* P - Tentatively reserved for Packed-SIMD extension */ \ - | (0 << 16) /* Q - Quad-precision floating-point extension */ \ - | (0 << 17) /* R - Reserved */ \ - | (0 << 18) /* S - Supervisor mode implemented */ \ - | (0 << 19) /* T - Tentatively reserved for Transactional Memory extension */ \ - | (1 << 20) /* U - User mode implemented */ \ - | (0 << 21) /* V - Tentatively reserved for Vector extension */ \ - | (0 << 22) /* W - Reserved */ \ - | (`ISA_X_ENABLED << 23) /* X - Non-standard extensions present */ \ - | (0 << 24) /* Y - Reserved */ \ - | (0 << 25) /* Z - Reserved */ - -// Device identification ////////////////////////////////////////////////////// - -`define VENDOR_ID 0 -`define ARCHITECTURE_ID 0 -`define IMPLEMENTATION_ID 0 - // Pipeline Configuration ///////////////////////////////////////////////////// // Issue width @@ -554,6 +469,7 @@ `ifndef SM_DISABLE `define SM_ENABLE `endif + `ifdef SM_ENABLE `define SM_ENABLED 1 `else @@ -579,7 +495,7 @@ // Number of Banks `ifndef L2_NUM_BANKS -`define L2_NUM_BANKS 2 +`define L2_NUM_BANKS `MIN(4, `NUM_SOCKETS) `endif // Core Response Queue Size @@ -648,4 +564,93 @@ `define L3_NUM_WAYS 4 `endif +// ISA Extensions ///////////////////////////////////////////////////////////// + +`ifdef EXT_A_ENABLE + `define EXT_A_ENABLED 1 +`else + `define EXT_A_ENABLED 0 +`endif + +`ifdef EXT_C_ENABLE + `define EXT_C_ENABLED 1 +`else + `define EXT_C_ENABLED 0 +`endif + +`ifdef EXT_D_ENABLE + `define EXT_D_ENABLED 1 +`else + `define EXT_D_ENABLED 0 +`endif + +`ifdef EXT_F_ENABLE + `define EXT_F_ENABLED 1 +`else + `define EXT_F_ENABLED 0 +`endif + +`ifdef EXT_M_ENABLE + `define EXT_M_ENABLED 1 +`else + `define EXT_M_ENABLED 0 +`endif + +`define ISA_STD_A 0 +`define ISA_STD_C 2 +`define ISA_STD_D 3 +`define ISA_STD_E 4 +`define ISA_STD_F 5 +`define ISA_STD_H 7 +`define ISA_STD_I 8 +`define ISA_STD_N 13 +`define ISA_STD_Q 16 +`define ISA_STD_S 18 +`define ISA_STD_U 20 + +`define ISA_EXT_ICACHE 0 +`define ISA_EXT_DCACHE 1 +`define ISA_EXT_L2CACHE 2 +`define ISA_EXT_L3CACHE 3 +`define ISA_EXT_SMEM 4 + +`define MISA_EXT (`ICACHE_ENABLED << `ISA_EXT_ICACHE) \ + | (`DCACHE_ENABLED << `ISA_EXT_DCACHE) \ + | (`L2_ENABLED << `ISA_EXT_L2CACHE) \ + | (`L3_ENABLED << `ISA_EXT_L3CACHE) \ + | (`SM_ENABLED << `ISA_EXT_SMEM) + +`define MISA_STD (`EXT_A_ENABLED << 0) /* A - Atomic Instructions extension */ \ + | (0 << 1) /* B - Tentatively reserved for Bit operations extension */ \ + | (`EXT_C_ENABLED << 2) /* C - Compressed extension */ \ + | (`EXT_D_ENABLED << 3) /* D - Double precsision floating-point extension */ \ + | (0 << 4) /* E - RV32E base ISA */ \ + | (`EXT_F_ENABLED << 5) /* F - Single precsision floating-point extension */ \ + | (0 << 6) /* G - Additional standard extensions present */ \ + | (0 << 7) /* H - Hypervisor mode implemented */ \ + | (1 << 8) /* I - RV32I/64I/128I base ISA */ \ + | (0 << 9) /* J - Reserved */ \ + | (0 << 10) /* K - Reserved */ \ + | (0 << 11) /* L - Tentatively reserved for Bit operations extension */ \ + | (`EXT_M_ENABLED << 12) /* M - Integer Multiply/Divide extension */ \ + | (0 << 13) /* N - User level interrupts supported */ \ + | (0 << 14) /* O - Reserved */ \ + | (0 << 15) /* P - Tentatively reserved for Packed-SIMD extension */ \ + | (0 << 16) /* Q - Quad-precision floating-point extension */ \ + | (0 << 17) /* R - Reserved */ \ + | (0 << 18) /* S - Supervisor mode implemented */ \ + | (0 << 19) /* T - Tentatively reserved for Transactional Memory extension */ \ + | (1 << 20) /* U - User mode implemented */ \ + | (0 << 21) /* V - Tentatively reserved for Vector extension */ \ + | (0 << 22) /* W - Reserved */ \ + | (1 << 23) /* X - Non-standard extensions present */ \ + | (0 << 24) /* Y - Reserved */ \ + | (0 << 25) /* Z - Reserved */ + +// Device identification ////////////////////////////////////////////////////// + +`define VENDOR_ID 0 +`define ARCHITECTURE_ID 0 +`define IMPLEMENTATION_ID 0 + `endif // VX_CONFIG_VH diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index 31714580..f8865a1d 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -228,9 +228,6 @@ `define INST_SFU_CSRRS 4'h7 `define INST_SFU_CSRRC 4'h8 `define INST_SFU_TEX 4'h9 -`define INST_SFU_RASTER 4'hA -`define INST_SFU_ROP 4'hB -`define INST_SFU_CMOV 4'hC `define INST_SFU_BITS 4 `define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1) `define INST_SFU_IS_WCTL(op) (op <= 5) @@ -238,10 +235,6 @@ /////////////////////////////////////////////////////////////////////////////// -`define NUM_SOCKETS `UP(`NUM_CORES / `SOCKET_SIZE) - -/////////////////////////////////////////////////////////////////////////////// - // non-cacheable tag bits `define NC_TAG_BITS 1 @@ -396,7 +389,7 @@ end \ assign ``dst.``field = __reduce_add_r_``dst``field -`define PERF_CACHE_ADD(dst, src, count) \ +`define PERF_CACHE_REDUCE(dst, src, count) \ `PERF_REDUCE (dst, src, reads, `PERF_CTR_BITS, count); \ `PERF_REDUCE (dst, src, writes, `PERF_CTR_BITS, count); \ `PERF_REDUCE (dst, src, read_misses, `PERF_CTR_BITS, count); \ diff --git a/hw/rtl/VX_gpu_pkg.sv b/hw/rtl/VX_gpu_pkg.sv index 7325136f..cdb48db4 100644 --- a/hw/rtl/VX_gpu_pkg.sv +++ b/hw/rtl/VX_gpu_pkg.sv @@ -58,6 +58,23 @@ package VX_gpu_pkg; logic [7:0] mpm_class; } base_dcrs_t; + typedef struct packed { + logic [`PERF_CTR_BITS-1:0] reads; + logic [`PERF_CTR_BITS-1:0] writes; + logic [`PERF_CTR_BITS-1:0] read_misses; + logic [`PERF_CTR_BITS-1:0] write_misses; + logic [`PERF_CTR_BITS-1:0] bank_stalls; + logic [`PERF_CTR_BITS-1:0] mshr_stalls; + logic [`PERF_CTR_BITS-1:0] mem_stalls; + logic [`PERF_CTR_BITS-1:0] crsp_stalls; + } cache_perf_t; + + typedef struct packed { + logic [`PERF_CTR_BITS-1:0] reads; + logic [`PERF_CTR_BITS-1:0] writes; + logic [`PERF_CTR_BITS-1:0] latency; + } mem_perf_t; + /* verilator lint_off UNUSED */ ////////////////////////// Icache Parameters ////////////////////////////// @@ -74,7 +91,6 @@ package VX_gpu_pkg; // Core request tag bits localparam ICACHE_TAG_WIDTH = (`UUID_WIDTH + ICACHE_TAG_ID_BITS); - localparam ICACHE_ARB_TAG_WIDTH = (ICACHE_TAG_WIDTH + `CLOG2(`SOCKET_SIZE)); // Memory request data bits localparam ICACHE_MEM_DATA_WIDTH = (ICACHE_LINE_SIZE * 8); @@ -83,7 +99,7 @@ package VX_gpu_pkg; `ifdef ICACHE_ENABLE localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES); `else - localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_ARB_TAG_WIDTH, `NUM_SOCKETS, `NUM_ICACHES); + localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `NUM_SOCKETS, `NUM_ICACHES); `endif ////////////////////////// Dcache Parameters ////////////////////////////// @@ -112,23 +128,21 @@ package VX_gpu_pkg; // Core request tag bits localparam DCACHE_TAG_WIDTH = (`UUID_WIDTH + DCACHE_TAG_ID_BITS); localparam DCACHE_NOSM_TAG_WIDTH = (DCACHE_TAG_WIDTH - `SM_ENABLED); - localparam DCACHE_ARB_TAG_WIDTH = (DCACHE_NOSM_TAG_WIDTH + `CLOG2(`SOCKET_SIZE)); // Memory request data bits localparam DCACHE_MEM_DATA_WIDTH = (DCACHE_LINE_SIZE * 8); // Memory request tag bits `ifdef DCACHE_ENABLE - localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_ARB_TAG_WIDTH, `NUM_SOCKETS, `NUM_DCACHES); + localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_NOSM_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES); `else - localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_ARB_TAG_WIDTH, `NUM_SOCKETS, `NUM_DCACHES); + localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_NOSM_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES); `endif /////////////////////////////// L1 Parameters ///////////////////////////// localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH); - - localparam NUM_L1_OUTPUTS = 2; + localparam L1_MEM_ARB_TAG_WIDTH = (L1_MEM_TAG_WIDTH + `CLOG2(2)); /////////////////////////////// L2 Parameters ///////////////////////////// @@ -136,10 +150,10 @@ package VX_gpu_pkg; localparam L2_WORD_SIZE = `L1_LINE_SIZE; // Input request size - localparam L2_NUM_REQS = NUM_L1_OUTPUTS; + localparam L2_NUM_REQS = `NUM_SOCKETS; // Core request tag bits - localparam L2_TAG_WIDTH = L1_MEM_TAG_WIDTH; + localparam L2_TAG_WIDTH = L1_MEM_ARB_TAG_WIDTH; // Memory request data bits localparam L2_MEM_DATA_WIDTH = (`L2_LINE_SIZE * 8); diff --git a/hw/rtl/VX_socket.sv b/hw/rtl/VX_socket.sv index 8c8f4b39..1e61fdff 100644 --- a/hw/rtl/VX_socket.sv +++ b/hw/rtl/VX_socket.sv @@ -26,13 +26,14 @@ module VX_socket import VX_gpu_pkg::*; #( VX_mem_perf_if.slave mem_perf_if, `endif + // DCRs VX_dcr_bus_if.slave dcr_bus_if, - VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS], - - VX_mem_bus_if.master icache_bus_if, + // Memory + VX_mem_bus_if.master mem_bus_if, `ifdef GBAR_ENABLE + // Barrier VX_gbar_bus_if.master gbar_bus_if, `endif @@ -62,77 +63,139 @@ module VX_socket import VX_gpu_pkg::*; #( /////////////////////////////////////////////////////////////////////////// +`ifdef PERF_ENABLE + VX_mem_perf_if mem_perf_tmp_if(); + cache_perf_t perf_icache; + cache_perf_t perf_dcache; + + assign mem_perf_tmp_if.icache = perf_icache; + assign mem_perf_tmp_if.dcache = perf_dcache; + assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache; + assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache; + assign mem_perf_tmp_if.smem = 'x; + assign mem_perf_tmp_if.mem = mem_perf_if.mem; +`endif + VX_mem_bus_if #( - .DATA_SIZE (DCACHE_WORD_SIZE), - .TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH) - ) per_core_dcache_bus_if[`SOCKET_SIZE * DCACHE_NUM_REQS](); + .DATA_SIZE (ICACHE_LINE_SIZE), + .TAG_WIDTH (ICACHE_MEM_TAG_WIDTH) + ) icache_mem_bus_if(); - `RESET_RELAY (dcache_arb_reset, reset); + VX_mem_bus_if #( + .DATA_SIZE (DCACHE_LINE_SIZE), + .TAG_WIDTH (DCACHE_MEM_TAG_WIDTH) + ) dcache_mem_bus_if(); - for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin - VX_mem_bus_if #( - .DATA_SIZE (DCACHE_WORD_SIZE), - .TAG_WIDTH (DCACHE_ARB_TAG_WIDTH) - ) dcache_bus_tmp_if[1](); + VX_mem_bus_if #( + .DATA_SIZE (`L1_LINE_SIZE), + .TAG_WIDTH (L1_MEM_TAG_WIDTH) + ) cache_mem_bus_if[2](); - VX_mem_bus_if #( - .DATA_SIZE (DCACHE_WORD_SIZE), - .TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH) - ) per_core_dcache_bus_tmp_if[`SOCKET_SIZE](); + VX_mem_bus_if #( + .DATA_SIZE (`L1_LINE_SIZE), + .TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH) + ) mem_bus_tmp_if[1](); - for (genvar j = 0; j < `SOCKET_SIZE; ++j) begin - `ASSIGN_VX_MEM_BUS_IF (per_core_dcache_bus_tmp_if[j], per_core_dcache_bus_if[j * DCACHE_NUM_REQS + i]); - end + `ASSIGN_VX_MEM_BUS_IF_X (cache_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH); + `ASSIGN_VX_MEM_BUS_IF_X (cache_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH); - VX_mem_arb #( - .NUM_INPUTS (`SOCKET_SIZE), - .DATA_SIZE (DCACHE_WORD_SIZE), - .TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH), - .TAG_SEL_IDX (`CACHE_ADDR_TYPE_BITS), - .ARBITER ("R"), - .OUT_REG_REQ ((`SOCKET_SIZE > 1) ? 2 : 0), - .OUT_REG_RSP ((`SOCKET_SIZE > 1) ? 2 : 0) - ) dcache_arb ( - .clk (clk), - .reset (dcache_arb_reset), - .bus_in_if (per_core_dcache_bus_tmp_if), - .bus_out_if (dcache_bus_tmp_if) - ); - - `ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_bus_tmp_if[0]); - end + `RESET_RELAY (mem_arb_reset, reset); + + VX_mem_arb #( + .NUM_INPUTS (2), + .DATA_SIZE (`L1_LINE_SIZE), + .TAG_WIDTH (L1_MEM_TAG_WIDTH), + .TAG_SEL_IDX (1), // Skip 0 for NC flag + .ARBITER ("R"), + .OUT_REG_REQ (2), + .OUT_REG_RSP (2) + ) mem_arb ( + .clk (clk), + .reset (mem_arb_reset), + .bus_in_if (cache_mem_bus_if), + .bus_out_if (mem_bus_tmp_if) + ); + + `ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if[0]); /////////////////////////////////////////////////////////////////////////// - + VX_mem_bus_if #( .DATA_SIZE (ICACHE_WORD_SIZE), .TAG_WIDTH (ICACHE_TAG_WIDTH) ) per_core_icache_bus_if[`SOCKET_SIZE](); - VX_mem_bus_if #( - .DATA_SIZE (ICACHE_WORD_SIZE), - .TAG_WIDTH (ICACHE_ARB_TAG_WIDTH) - ) icache_bus_tmp_if[1](); + `RESET_RELAY (icache_reset, reset); - `RESET_RELAY (icache_arb_reset, reset); - - VX_mem_arb #( - .NUM_INPUTS (`SOCKET_SIZE), - .NUM_OUTPUTS (1), - .DATA_SIZE (ICACHE_WORD_SIZE), - .TAG_WIDTH (ICACHE_TAG_WIDTH), - .TAG_SEL_IDX (0), - .ARBITER ("R"), - .OUT_REG_REQ ((`SOCKET_SIZE > 1) ? 2 : 0), - .OUT_REG_RSP ((`SOCKET_SIZE > 1) ? 2 : 0) - ) icache_arb ( - .clk (clk), - .reset (icache_arb_reset), - .bus_in_if (per_core_icache_bus_if), - .bus_out_if (icache_bus_tmp_if) + VX_cache_cluster #( + .INSTANCE_ID ($sformatf("socket%0d-icache", SOCKET_ID)), + .NUM_UNITS (`NUM_ICACHES), + .NUM_INPUTS (`SOCKET_SIZE), + .TAG_SEL_IDX (0), + .CACHE_SIZE (`ICACHE_SIZE), + .LINE_SIZE (ICACHE_LINE_SIZE), + .NUM_BANKS (1), + .NUM_WAYS (`ICACHE_NUM_WAYS), + .WORD_SIZE (ICACHE_WORD_SIZE), + .NUM_REQS (1), + .CRSQ_SIZE (`ICACHE_CRSQ_SIZE), + .MSHR_SIZE (`ICACHE_MSHR_SIZE), + .MRSQ_SIZE (`ICACHE_MRSQ_SIZE), + .MREQ_SIZE (`ICACHE_MREQ_SIZE), + .TAG_WIDTH (ICACHE_TAG_WIDTH), + .UUID_WIDTH (`UUID_WIDTH), + .WRITE_ENABLE (0), + .CORE_OUT_REG (2), + .MEM_OUT_REG (2) + ) icache ( + `ifdef PERF_ENABLE + .cache_perf (perf_icache), + `endif + .clk (clk), + .reset (icache_reset), + .core_bus_if (per_core_icache_bus_if), + .mem_bus_if (icache_mem_bus_if) ); - `ASSIGN_VX_MEM_BUS_IF (icache_bus_if, icache_bus_tmp_if[0]); + /////////////////////////////////////////////////////////////////////////// + + VX_mem_bus_if #( + .DATA_SIZE (DCACHE_WORD_SIZE), + .TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH) + ) per_core_dcache_bus_if[`SOCKET_SIZE * DCACHE_NUM_REQS](); + + `RESET_RELAY (dcache_reset, reset); + + VX_cache_cluster #( + .INSTANCE_ID ($sformatf("socket%0d-dcache", SOCKET_ID)), + .NUM_UNITS (`NUM_DCACHES), + .NUM_INPUTS (`SOCKET_SIZE), + .TAG_SEL_IDX (1), + .CACHE_SIZE (`DCACHE_SIZE), + .LINE_SIZE (DCACHE_LINE_SIZE), + .NUM_BANKS (`DCACHE_NUM_BANKS), + .NUM_WAYS (`DCACHE_NUM_WAYS), + .WORD_SIZE (DCACHE_WORD_SIZE), + .NUM_REQS (DCACHE_NUM_REQS), + .CRSQ_SIZE (`DCACHE_CRSQ_SIZE), + .MSHR_SIZE (`DCACHE_MSHR_SIZE), + .MRSQ_SIZE (`DCACHE_MRSQ_SIZE), + .MREQ_SIZE (`DCACHE_MREQ_SIZE), + .TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH), + .UUID_WIDTH (`UUID_WIDTH), + .WRITE_ENABLE (1), + .NC_ENABLE (1), + .CORE_OUT_REG (`SM_ENABLED ? 2 : 1), + .MEM_OUT_REG (2) + ) dcache ( + `ifdef PERF_ENABLE + .cache_perf (perf_dcache), + `endif + .clk (clk), + .reset (dcache_reset), + .core_bus_if (per_core_dcache_bus_if), + .mem_bus_if (dcache_mem_bus_if) + ); /////////////////////////////////////////////////////////////////////////// @@ -163,7 +226,7 @@ module VX_socket import VX_gpu_pkg::*; #( .reset (core_reset), `ifdef PERF_ENABLE - .mem_perf_if (mem_perf_if), + .mem_perf_if (mem_perf_tmp_if), `endif .dcr_bus_if (core_dcr_bus_if), diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh index 9f5aa0d5..388dc258 100644 --- a/hw/rtl/VX_types.vh +++ b/hw/rtl/VX_types.vh @@ -33,9 +33,6 @@ `define VX_DCR_MPM_CLASS_NONE 0 `define VX_DCR_MPM_CLASS_CORE 1 `define VX_DCR_MPM_CLASS_MEM 2 -`define VX_DCR_MPM_CLASS_TEX 3 -`define VX_DCR_MPM_CLASS_RASTER 4 -`define VX_DCR_MPM_CLASS_ROP 5 // User Floating-Point CSRs diff --git a/hw/rtl/Vortex.sv b/hw/rtl/Vortex.sv index 833c6860..5bd628d5 100644 --- a/hw/rtl/Vortex.sv +++ b/hw/rtl/Vortex.sv @@ -45,90 +45,28 @@ module Vortex import VX_gpu_pkg::*; ( ); `ifdef PERF_ENABLE - VX_mem_perf_if mem_perf_if[`NUM_CLUSTERS](); - VX_mem_perf_if perf_memsys_total_if(); - VX_cache_perf_if perf_l3cache_if(); -`endif + VX_mem_perf_if mem_perf_if(); + cache_perf_t perf_l3cache; + mem_perf_t mem_perf; + + assign mem_perf_if.icache = 'x; + assign mem_perf_if.dcache = 'x; + assign mem_perf_if.l2cache = 'x; + assign mem_perf_if.l3cache = perf_l3cache; + assign mem_perf_if.smem = 'x; + assign mem_perf_if.mem = mem_perf; +`endif + + VX_mem_bus_if #( + .DATA_SIZE (`L2_LINE_SIZE), + .TAG_WIDTH (L2_MEM_TAG_WIDTH) + ) per_cluster_mem_bus_if[`NUM_CLUSTERS](); VX_mem_bus_if #( .DATA_SIZE (`L3_LINE_SIZE), .TAG_WIDTH (L3_MEM_TAG_WIDTH) ) mem_bus_if(); - assign mem_req_valid = mem_bus_if.req_valid; - assign mem_req_rw = mem_bus_if.req_data.rw; - assign mem_req_byteen= mem_bus_if.req_data.byteen; - assign mem_req_addr = mem_bus_if.req_data.addr; - assign mem_req_data = mem_bus_if.req_data.data; - assign mem_req_tag = mem_bus_if.req_data.tag; - assign mem_bus_if.req_ready = mem_req_ready; - - assign mem_bus_if.rsp_valid = mem_rsp_valid; - assign mem_bus_if.rsp_data.data = mem_rsp_data; - assign mem_bus_if.rsp_data.tag = mem_rsp_tag; - assign mem_rsp_ready = mem_bus_if.rsp_ready; - - wire mem_req_fire = mem_req_valid && mem_req_ready; - wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready; - `UNUSED_VAR (mem_req_fire) - `UNUSED_VAR (mem_rsp_fire) - - wire sim_ebreak /* verilator public */; - wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value /* verilator public */; - wire [`NUM_CLUSTERS-1:0] per_cluster_sim_ebreak; - wire [`NUM_CLUSTERS-1:0][`NUM_REGS-1:0][`XLEN-1:0] per_cluster_sim_wb_value; - assign sim_ebreak = per_cluster_sim_ebreak[0]; - assign sim_wb_value = per_cluster_sim_wb_value[0]; - `UNUSED_VAR (per_cluster_sim_ebreak) - `UNUSED_VAR (per_cluster_sim_wb_value) - - VX_mem_bus_if #( - .DATA_SIZE (`L2_LINE_SIZE), - .TAG_WIDTH (L2_MEM_TAG_WIDTH) - ) per_cluster_mem_bus_if[`NUM_CLUSTERS](); - - VX_dcr_bus_if dcr_bus_if(); - assign dcr_bus_if.write_valid = dcr_wr_valid; - assign dcr_bus_if.write_addr = dcr_wr_addr; - assign dcr_bus_if.write_data = dcr_wr_data; - - wire [`NUM_CLUSTERS-1:0] per_cluster_busy; - - `SCOPE_IO_SWITCH (`NUM_CLUSTERS) - - // Generate all clusters - for (genvar i = 0; i < `NUM_CLUSTERS; ++i) begin - - `RESET_RELAY (cluster_reset, reset); - - `BUFFER_DCR_BUS_IF (cluster_dcr_bus_if, dcr_bus_if, (`NUM_CLUSTERS > 1)); - - VX_cluster #( - .CLUSTER_ID (i) - ) cluster ( - `SCOPE_IO_BIND (i) - - .clk (clk), - .reset (cluster_reset), - - `ifdef PERF_ENABLE - .mem_perf_if (mem_perf_if[i]), - .perf_memsys_total_if (perf_memsys_total_if), - `endif - - .dcr_bus_if (cluster_dcr_bus_if), - - .mem_bus_if (per_cluster_mem_bus_if[i]), - - .sim_ebreak (per_cluster_sim_ebreak[i]), - .sim_wb_value (per_cluster_sim_wb_value[i]), - - .busy (per_cluster_busy[i]) - ); - end - - `BUFFER_BUSY (busy, (| per_cluster_busy), (`NUM_CLUSTERS > 1)); - `RESET_RELAY (l3_reset, reset); VX_cache_wrap #( @@ -155,49 +93,83 @@ module Vortex import VX_gpu_pkg::*; ( .reset (l3_reset), `ifdef PERF_ENABLE - .cache_perf_if (perf_l3cache_if), + .cache_perf (perf_l3cache), `endif .core_bus_if (per_cluster_mem_bus_if), .mem_bus_if (mem_bus_if) ); + assign mem_req_valid = mem_bus_if.req_valid; + assign mem_req_rw = mem_bus_if.req_data.rw; + assign mem_req_byteen= mem_bus_if.req_data.byteen; + assign mem_req_addr = mem_bus_if.req_data.addr; + assign mem_req_data = mem_bus_if.req_data.data; + assign mem_req_tag = mem_bus_if.req_data.tag; + assign mem_bus_if.req_ready = mem_req_ready; + + assign mem_bus_if.rsp_valid = mem_rsp_valid; + assign mem_bus_if.rsp_data.data = mem_rsp_data; + assign mem_bus_if.rsp_data.tag = mem_rsp_tag; + assign mem_rsp_ready = mem_bus_if.rsp_ready; + + wire mem_req_fire = mem_req_valid && mem_req_ready; + wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready; + `UNUSED_VAR (mem_req_fire) + `UNUSED_VAR (mem_rsp_fire) + + wire sim_ebreak /* verilator public */; + wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value /* verilator public */; + wire [`NUM_CLUSTERS-1:0] per_cluster_sim_ebreak; + wire [`NUM_CLUSTERS-1:0][`NUM_REGS-1:0][`XLEN-1:0] per_cluster_sim_wb_value; + assign sim_ebreak = per_cluster_sim_ebreak[0]; + assign sim_wb_value = per_cluster_sim_wb_value[0]; + `UNUSED_VAR (per_cluster_sim_ebreak) + `UNUSED_VAR (per_cluster_sim_wb_value) + + VX_dcr_bus_if dcr_bus_if(); + assign dcr_bus_if.write_valid = dcr_wr_valid; + assign dcr_bus_if.write_addr = dcr_wr_addr; + assign dcr_bus_if.write_data = dcr_wr_data; + + wire [`NUM_CLUSTERS-1:0] per_cluster_busy; + + `SCOPE_IO_SWITCH (`NUM_CLUSTERS) + + // Generate all clusters + for (genvar i = 0; i < `NUM_CLUSTERS; ++i) begin + + `RESET_RELAY (cluster_reset, reset); + + `BUFFER_DCR_BUS_IF (cluster_dcr_bus_if, dcr_bus_if, (`NUM_CLUSTERS > 1)); + + VX_cluster #( + .CLUSTER_ID (i) + ) cluster ( + `SCOPE_IO_BIND (i) + + .clk (clk), + .reset (cluster_reset), + + `ifdef PERF_ENABLE + .mem_perf_if (mem_perf_if), + `endif + + .dcr_bus_if (cluster_dcr_bus_if), + + .mem_bus_if (per_cluster_mem_bus_if[i]), + + .sim_ebreak (per_cluster_sim_ebreak[i]), + .sim_wb_value (per_cluster_sim_wb_value[i]), + + .busy (per_cluster_busy[i]) + ); + end + + `BUFFER_BUSY (busy, (| per_cluster_busy), (`NUM_CLUSTERS > 1)); + `ifdef PERF_ENABLE - `PERF_REDUCE (perf_memsys_total_if, mem_perf_if, icache_reads, `PERF_CTR_BITS, `NUM_CLUSTERS); - `PERF_REDUCE (perf_memsys_total_if, mem_perf_if, icache_read_misses, `PERF_CTR_BITS, `NUM_CLUSTERS); - `PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_reads, `PERF_CTR_BITS, `NUM_CLUSTERS); - `PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_writes, `PERF_CTR_BITS, `NUM_CLUSTERS); - `PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_read_misses, `PERF_CTR_BITS, `NUM_CLUSTERS); - `PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_write_misses, `PERF_CTR_BITS, `NUM_CLUSTERS); - `PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_bank_stalls, `PERF_CTR_BITS, `NUM_CLUSTERS); - `PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_mshr_stalls, `PERF_CTR_BITS, `NUM_CLUSTERS); - `PERF_REDUCE (perf_memsys_total_if, mem_perf_if, smem_reads, `PERF_CTR_BITS, `NUM_CLUSTERS); - `PERF_REDUCE (perf_memsys_total_if, mem_perf_if, smem_writes, `PERF_CTR_BITS, `NUM_CLUSTERS); - `PERF_REDUCE (perf_memsys_total_if, mem_perf_if, smem_bank_stalls, `PERF_CTR_BITS, `NUM_CLUSTERS); - `PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_reads, `PERF_CTR_BITS, `NUM_CLUSTERS); - `PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_writes, `PERF_CTR_BITS, `NUM_CLUSTERS); - `PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_read_misses, `PERF_CTR_BITS, `NUM_CLUSTERS); - `PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_write_misses, `PERF_CTR_BITS, `NUM_CLUSTERS); - `PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_bank_stalls, `PERF_CTR_BITS, `NUM_CLUSTERS); - `PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_mshr_stalls, `PERF_CTR_BITS, `NUM_CLUSTERS); - -`ifdef L3_ENABLE - assign perf_memsys_total_if.l3cache_reads = perf_l3cache_if.reads; - assign perf_memsys_total_if.l3cache_writes = perf_l3cache_if.writes; - assign perf_memsys_total_if.l3cache_read_misses = perf_l3cache_if.read_misses; - assign perf_memsys_total_if.l3cache_write_misses= perf_l3cache_if.write_misses; - assign perf_memsys_total_if.l3cache_bank_stalls = perf_l3cache_if.bank_stalls; - assign perf_memsys_total_if.l3cache_mshr_stalls = perf_l3cache_if.mshr_stalls; -`else - assign perf_memsys_total_if.l3cache_reads = '0; - assign perf_memsys_total_if.l3cache_writes = '0; - assign perf_memsys_total_if.l3cache_read_misses = '0; - assign perf_memsys_total_if.l3cache_write_misses= '0; - assign perf_memsys_total_if.l3cache_bank_stalls = '0; - assign perf_memsys_total_if.l3cache_mshr_stalls = '0; -`endif - reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads; always @(posedge clk) begin @@ -208,30 +180,20 @@ module Vortex import VX_gpu_pkg::*; ( `PERF_CTR_BITS'($signed(2'(mem_req_fire && ~mem_bus_if.req_data.rw) - 2'(mem_rsp_fire))); end end - - reg [`PERF_CTR_BITS-1:0] perf_mem_reads; - reg [`PERF_CTR_BITS-1:0] perf_mem_writes; - reg [`PERF_CTR_BITS-1:0] perf_mem_lat; always @(posedge clk) begin if (reset) begin - perf_mem_reads <= '0; - perf_mem_writes <= '0; - perf_mem_lat <= '0; + mem_perf <= '0; end else begin if (mem_req_fire && ~mem_bus_if.req_data.rw) begin - perf_mem_reads <= perf_mem_reads + `PERF_CTR_BITS'(1); + mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(1); end if (mem_req_fire && mem_bus_if.req_data.rw) begin - perf_mem_writes <= perf_mem_writes + `PERF_CTR_BITS'(1); + mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(1); end - perf_mem_lat <= perf_mem_lat + perf_mem_pending_reads; + mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads; end end - - assign perf_memsys_total_if.mem_reads = perf_mem_reads; - assign perf_memsys_total_if.mem_writes = perf_mem_writes; - assign perf_memsys_total_if.mem_latency = perf_mem_lat; `endif diff --git a/hw/rtl/cache/VX_cache.sv b/hw/rtl/cache/VX_cache.sv index adf4f7c8..30594f1a 100644 --- a/hw/rtl/cache/VX_cache.sv +++ b/hw/rtl/cache/VX_cache.sv @@ -13,7 +13,7 @@ `include "VX_cache_define.vh" -module VX_cache #( +module VX_cache import VX_gpu_pkg::*; #( parameter `STRING INSTANCE_ID = "", // Number of Word requests per cycle @@ -56,7 +56,7 @@ module VX_cache #( ) ( // PERF `ifdef PERF_ENABLE - VX_cache_perf_if.master cache_perf_if, + output cache_perf_t cache_perf, `endif input wire clk, @@ -279,6 +279,10 @@ module VX_cache #( core_req_tag[i]}; end +`ifdef PERF_ENABLE + wire [`PERF_CTR_BITS-1:0] perf_collisions; +`endif + `RESET_RELAY (req_xbar_reset, reset); VX_stream_xbar #( @@ -290,9 +294,9 @@ module VX_cache #( .clk (clk), .reset (req_xbar_reset), `ifdef PERF_ENABLE - .collisions (cache_perf_if.bank_stalls), + .collisions(perf_collisions), `else - `UNUSED_PIN (collisions), + `UNUSED_PIN(collisions), `endif .valid_in (core_req_valid), .data_in (core_req_data_in), @@ -578,13 +582,14 @@ module VX_cache #( end end - assign cache_perf_if.reads = perf_core_reads; - assign cache_perf_if.writes = perf_core_writes; - assign cache_perf_if.read_misses = perf_read_misses; - assign cache_perf_if.write_misses = perf_write_misses; - assign cache_perf_if.mshr_stalls = perf_mshr_stalls; - assign cache_perf_if.mem_stalls = perf_mem_stalls; - assign cache_perf_if.crsp_stalls = perf_crsp_stalls; + assign cache_perf.reads = perf_core_reads; + assign cache_perf.writes = perf_core_writes; + assign cache_perf.read_misses = perf_read_misses; + assign cache_perf.write_misses = perf_write_misses; + assign cache_perf.bank_stalls = perf_collisions; + assign cache_perf.mshr_stalls = perf_mshr_stalls; + assign cache_perf.mem_stalls = perf_mem_stalls; + assign cache_perf.crsp_stalls = perf_crsp_stalls; `endif endmodule diff --git a/hw/rtl/cache/VX_cache_cluster.sv b/hw/rtl/cache/VX_cache_cluster.sv index b1c846a3..281b2b23 100644 --- a/hw/rtl/cache/VX_cache_cluster.sv +++ b/hw/rtl/cache/VX_cache_cluster.sv @@ -13,7 +13,7 @@ `include "VX_cache_define.vh" -module VX_cache_cluster #( +module VX_cache_cluster import VX_gpu_pkg::*; #( parameter `STRING INSTANCE_ID = "", parameter NUM_UNITS = 1, @@ -66,7 +66,7 @@ module VX_cache_cluster #( // PERF `ifdef PERF_ENABLE - VX_cache_perf_if.master cache_perf_if, + output cache_perf_t cache_perf, `endif VX_mem_bus_if.slave core_bus_if [NUM_INPUTS * NUM_REQS], @@ -83,8 +83,8 @@ module VX_cache_cluster #( `STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter")) `ifdef PERF_ENABLE - VX_cache_perf_if perf_cache_unit_if[NUM_CACHES](); - `PERF_CACHE_ADD (cache_perf_if, perf_cache_unit_if, NUM_CACHES); + cache_perf_t perf_cache_unit[NUM_CACHES]; + `PERF_CACHE_REDUCE (cache_perf, perf_cache_unit, NUM_CACHES); `endif VX_mem_bus_if #( @@ -97,7 +97,6 @@ module VX_cache_cluster #( .TAG_WIDTH (ARB_TAG_WIDTH) ) arb_core_bus_if[NUM_CACHES * NUM_REQS](); - for (genvar i = 0; i < NUM_REQS; ++i) begin VX_mem_bus_if #( .DATA_SIZE (WORD_SIZE), @@ -161,7 +160,7 @@ module VX_cache_cluster #( .PASSTHRU (PASSTHRU) ) cache_wrap ( `ifdef PERF_ENABLE - .cache_perf_if (perf_cache_unit_if[i]), + .cache_perf (perf_cache_unit[i]), `endif .clk (clk), .reset (cache_reset), @@ -357,7 +356,7 @@ module VX_cache_cluster_top #( .MEM_OUT_REG (MEM_OUT_REG) ) cache ( `ifdef PERF_ENABLE - .cache_perf_if (perf_icache_if), + .cache_perf (perf_icache), `endif .clk (clk), .reset (reset), diff --git a/hw/rtl/cache/VX_cache_perf_if.sv b/hw/rtl/cache/VX_cache_perf_if.sv deleted file mode 100644 index 6e68b1b3..00000000 --- a/hw/rtl/cache/VX_cache_perf_if.sv +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright © 2019-2023 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -`include "VX_define.vh" - -interface VX_cache_perf_if (); - - wire [`PERF_CTR_BITS-1:0] reads; - wire [`PERF_CTR_BITS-1:0] writes; - wire [`PERF_CTR_BITS-1:0] read_misses; - wire [`PERF_CTR_BITS-1:0] write_misses; - wire [`PERF_CTR_BITS-1:0] bank_stalls; - wire [`PERF_CTR_BITS-1:0] mshr_stalls; - wire [`PERF_CTR_BITS-1:0] mem_stalls; - wire [`PERF_CTR_BITS-1:0] crsp_stalls; - - modport master ( - output reads, - output writes, - output read_misses, - output write_misses, - output bank_stalls, - output mshr_stalls, - output mem_stalls, - output crsp_stalls - ); - - modport slave ( - input reads, - input writes, - input read_misses, - input write_misses, - input bank_stalls, - input mshr_stalls, - input mem_stalls, - input crsp_stalls - ); - -endinterface diff --git a/hw/rtl/cache/VX_cache_wrap.sv b/hw/rtl/cache/VX_cache_wrap.sv index 22ab57ae..0956e64b 100644 --- a/hw/rtl/cache/VX_cache_wrap.sv +++ b/hw/rtl/cache/VX_cache_wrap.sv @@ -13,7 +13,7 @@ `include "VX_cache_define.vh" -module VX_cache_wrap #( +module VX_cache_wrap import VX_gpu_pkg::*; #( parameter `STRING INSTANCE_ID = "", // Number of Word requests per cycle @@ -67,14 +67,14 @@ module VX_cache_wrap #( // PERF `ifdef PERF_ENABLE - VX_cache_perf_if.master cache_perf_if, + output cache_perf_t cache_perf, `endif VX_mem_bus_if.slave core_bus_if [NUM_REQS], VX_mem_bus_if.master mem_bus_if ); - `STATIC_ASSERT(NUM_BANKS <= NUM_REQS, ("invalid parameter")) + `STATIC_ASSERT(NUM_BANKS <= NUM_REQS, ("invalid parameter: NUM_BANKS=%d, NUM_REQS=%d", NUM_BANKS, NUM_REQS)) `STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter")) localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE); @@ -353,14 +353,7 @@ module VX_cache_wrap #( assign mem_rsp_ready_b = 0; `ifdef PERF_ENABLE - assign cache_perf_if.reads = '0; - assign cache_perf_if.writes = '0; - assign cache_perf_if.read_misses = '0; - assign cache_perf_if.write_misses = '0; - assign cache_perf_if.bank_stalls = '0; - assign cache_perf_if.mshr_stalls = '0; - assign cache_perf_if.mem_stalls = '0; - assign cache_perf_if.crsp_stalls = '0; + assign cache_perf = '0; `endif end else begin @@ -429,7 +422,7 @@ module VX_cache_wrap #( .reset (cache_reset), `ifdef PERF_ENABLE - .cache_perf_if (cache_perf_if), + .cache_perf (cache_perf), `endif .core_bus_if (core_bus_wrap_if), diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index 01795634..aa38bac0 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -77,8 +77,20 @@ module VX_core import VX_gpu_pkg::*; #( ) dcache_bus_tmp_if[DCACHE_NUM_REQS](); `ifdef PERF_ENABLE - VX_mem_perf_if mem_perf_tmp_if(); VX_pipeline_perf_if pipeline_perf_if(); + VX_mem_perf_if mem_perf_tmp_if(); + cache_perf_t smem_perf; + + assign mem_perf_tmp_if.icache = mem_perf_if.icache; + assign mem_perf_tmp_if.dcache = mem_perf_if.dcache; + assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache; + assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache; +`ifdef SM_ENABLE + assign mem_perf_tmp_if.smem = smem_perf; +`else + assign mem_perf_tmp_if.smem = '0; +`endif + assign mem_perf_tmp_if.mem = mem_perf_if.mem; `endif `RESET_RELAY (dcr_data_reset, reset); @@ -226,19 +238,28 @@ module VX_core import VX_gpu_pkg::*; #( .sim_wb_value (sim_wb_value) ); +`ifdef SM_ENABLE + VX_smem_unit #( .CORE_ID (CORE_ID) ) smem_unit ( .clk (clk), .reset (reset), `ifdef PERF_ENABLE - .mem_perf_in_if (mem_perf_if), - .mem_perf_out_if (mem_perf_tmp_if), + .cache_perf (smem_perf), `endif .dcache_bus_in_if (dcache_bus_tmp_if), .dcache_bus_out_if (dcache_bus_if) ); +`else + + for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin + `ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_bus_tmp_if[i]); + end + +`endif + `ifdef PERF_ENABLE wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle; diff --git a/hw/rtl/core/VX_csr_data.sv b/hw/rtl/core/VX_csr_data.sv index 0b492ba4..9ba0ffd0 100644 --- a/hw/rtl/core/VX_csr_data.sv +++ b/hw/rtl/core/VX_csr_data.sv @@ -221,63 +221,63 @@ import VX_fpu_pkg::*; `VX_DCR_MPM_CLASS_MEM: begin case (read_addr) // PERF: icache - `VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache_reads[31:0]; - `VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache_reads[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache_read_misses[31:0]; - `VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache_read_misses[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0]; + `VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0]; + `VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]); // PERF: dcache - `VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache_reads[31:0]; - `VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache_reads[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache_writes[31:0]; - `VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache_writes[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache_read_misses[31:0]; - `VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache_read_misses[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache_write_misses[31:0]; - `VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache_write_misses[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache_bank_stalls[31:0]; - `VX_CSR_MPM_DCACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache_bank_stalls[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache_mshr_stalls[31:0]; - `VX_CSR_MPM_DCACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache_mshr_stalls[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0]; + `VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0]; + `VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0]; + `VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0]; + `VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0]; + `VX_CSR_MPM_DCACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0]; + `VX_CSR_MPM_DCACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]); // PERF: smem - `VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem_reads[31:0]; - `VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem_reads[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem_writes[31:0]; - `VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem_writes[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem_bank_stalls[31:0]; - `VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem_bank_stalls[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem.reads[31:0]; + `VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem.reads[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem.writes[31:0]; + `VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem.writes[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem.bank_stalls[31:0]; + `VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem.bank_stalls[`PERF_CTR_BITS-1:32]); // PERF: l2cache - `VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache_reads[31:0]; - `VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache_reads[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache_writes[31:0]; - `VX_CSR_MPM_L2CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l2cache_writes[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache_read_misses[31:0]; - `VX_CSR_MPM_L2CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l2cache_read_misses[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache_write_misses[31:0]; - `VX_CSR_MPM_L2CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l2cache_write_misses[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache_bank_stalls[31:0]; - `VX_CSR_MPM_L2CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache_bank_stalls[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache_mshr_stalls[31:0]; - `VX_CSR_MPM_L2CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache_mshr_stalls[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0]; + `VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0]; + `VX_CSR_MPM_L2CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0]; + `VX_CSR_MPM_L2CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0]; + `VX_CSR_MPM_L2CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0]; + `VX_CSR_MPM_L2CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0]; + `VX_CSR_MPM_L2CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]); // PERF: l3cache - `VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache_reads[31:0]; - `VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache_reads[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache_writes[31:0]; - `VX_CSR_MPM_L3CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l3cache_writes[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache_read_misses[31:0]; - `VX_CSR_MPM_L3CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l3cache_read_misses[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache_write_misses[31:0]; - `VX_CSR_MPM_L3CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l3cache_write_misses[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache_bank_stalls[31:0]; - `VX_CSR_MPM_L3CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache_bank_stalls[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache_mshr_stalls[31:0]; - `VX_CSR_MPM_L3CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache_mshr_stalls[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0]; + `VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0]; + `VX_CSR_MPM_L3CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0]; + `VX_CSR_MPM_L3CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0]; + `VX_CSR_MPM_L3CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0]; + `VX_CSR_MPM_L3CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0]; + `VX_CSR_MPM_L3CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]); // PERF: memory - `VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem_reads[31:0]; - `VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem_reads[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem_writes[31:0]; - `VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem_writes[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_MEM_LAT : read_data_ro_r = mem_perf_if.mem_latency[31:0]; - `VX_CSR_MPM_MEM_LAT_H : read_data_ro_r = 32'(mem_perf_if.mem_latency[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0]; + `VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0]; + `VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_MEM_LAT : read_data_ro_r = mem_perf_if.mem.latency[31:0]; + `VX_CSR_MPM_MEM_LAT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]); default:; endcase end @@ -299,6 +299,8 @@ import VX_fpu_pkg::*; `ifdef PERF_ENABLE wire [`PERF_CTR_BITS-1:0] perf_wctl_stalls = sfu_perf_if.wctl_stalls; `UNUSED_VAR (perf_wctl_stalls); + `UNUSED_VAR (mem_perf_if.icache); + `UNUSED_VAR (mem_perf_if.smem); `endif endmodule diff --git a/hw/rtl/core/VX_smem_unit.sv b/hw/rtl/core/VX_smem_unit.sv index 82eb126a..7ff7c2d8 100644 --- a/hw/rtl/core/VX_smem_unit.sv +++ b/hw/rtl/core/VX_smem_unit.sv @@ -20,8 +20,7 @@ module VX_smem_unit import VX_gpu_pkg::*; #( input wire reset, `ifdef PERF_ENABLE - VX_mem_perf_if.slave mem_perf_in_if, - VX_mem_perf_if.master mem_perf_out_if, + output cache_perf_t cache_perf, `endif VX_mem_bus_if.slave dcache_bus_in_if [DCACHE_NUM_REQS], @@ -29,21 +28,78 @@ module VX_smem_unit import VX_gpu_pkg::*; #( ); `UNUSED_PARAM (CORE_ID) -`ifdef SM_ENABLE localparam SMEM_ADDR_WIDTH = `SMEM_LOG_SIZE - `CLOG2(DCACHE_WORD_SIZE); + wire [DCACHE_NUM_REQS-1:0] smem_req_valid; + wire [DCACHE_NUM_REQS-1:0] smem_req_rw; + wire [DCACHE_NUM_REQS-1:0][SMEM_ADDR_WIDTH-1:0] smem_req_addr; + wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] smem_req_byteen; + wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] smem_req_data; + wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] smem_req_tag; + wire [DCACHE_NUM_REQS-1:0] smem_req_ready; + wire [DCACHE_NUM_REQS-1:0] smem_rsp_valid; + wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] smem_rsp_data; + wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] smem_rsp_tag; + wire [DCACHE_NUM_REQS-1:0] smem_rsp_ready; + + `RESET_RELAY (smem_reset, reset); + + VX_shared_mem #( + .INSTANCE_ID($sformatf("core%0d-smem", CORE_ID)), + .SIZE (1 << `SMEM_LOG_SIZE), + .NUM_REQS (DCACHE_NUM_REQS), + .NUM_BANKS (`SMEM_NUM_BANKS), + .WORD_SIZE (DCACHE_WORD_SIZE), + .ADDR_WIDTH (SMEM_ADDR_WIDTH), + .UUID_WIDTH (`UUID_WIDTH), + .TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH) + ) shared_mem ( + .clk (clk), + .reset (smem_reset), + + `ifdef PERF_ENABLE + .cache_perf (cache_perf), + `endif + + // Core request + .req_valid (smem_req_valid), + .req_rw (smem_req_rw), + .req_byteen (smem_req_byteen), + .req_addr (smem_req_addr), + .req_data (smem_req_data), + .req_tag (smem_req_tag), + .req_ready (smem_req_ready), + + // Core response + .rsp_valid (smem_rsp_valid), + .rsp_data (smem_rsp_data), + .rsp_tag (smem_rsp_tag), + .rsp_ready (smem_rsp_ready) + ); + VX_mem_bus_if #( .DATA_SIZE (DCACHE_WORD_SIZE), .TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH) ) switch_out_bus_if[2 * DCACHE_NUM_REQS](); -`ifdef PERF_ENABLE - VX_cache_perf_if perf_smem_if(); -`endif - `RESET_RELAY (switch_reset, reset); for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin + + assign smem_req_valid[i] = switch_out_bus_if[i * 2 + 1].req_valid; + assign smem_req_rw[i] = switch_out_bus_if[i * 2 + 1].req_data.rw; + assign smem_req_byteen[i] = switch_out_bus_if[i * 2 + 1].req_data.byteen; + assign smem_req_data[i] = switch_out_bus_if[i * 2 + 1].req_data.data; + assign smem_req_tag[i] = switch_out_bus_if[i * 2 + 1].req_data.tag; + assign switch_out_bus_if[i * 2 + 1].req_ready = smem_req_ready[i]; + + assign switch_out_bus_if[i * 2 + 1].rsp_valid = smem_rsp_valid[i]; + assign switch_out_bus_if[i * 2 + 1].rsp_data.data = smem_rsp_data[i]; + assign switch_out_bus_if[i * 2 + 1].rsp_data.tag = smem_rsp_tag[i]; + assign smem_rsp_ready[i] = switch_out_bus_if[i * 2 + 1].rsp_ready; + + assign smem_req_addr[i] = switch_out_bus_if[i * 2 + 1].req_data.addr[SMEM_ADDR_WIDTH-1:0]; + VX_smem_switch #( .NUM_REQS (2), .DATA_SIZE (DCACHE_WORD_SIZE), @@ -65,121 +121,4 @@ module VX_smem_unit import VX_gpu_pkg::*; #( `ASSIGN_VX_MEM_BUS_IF (dcache_bus_out_if[i], switch_out_bus_if[i * 2]); end - wire [DCACHE_NUM_REQS-1:0] smem_req_valid; - wire [DCACHE_NUM_REQS-1:0] smem_req_rw; - wire [DCACHE_NUM_REQS-1:0][SMEM_ADDR_WIDTH-1:0] smem_req_addr; - wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] smem_req_byteen; - wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] smem_req_data; - wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] smem_req_tag; - wire [DCACHE_NUM_REQS-1:0] smem_req_ready; - wire [DCACHE_NUM_REQS-1:0] smem_rsp_valid; - wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] smem_rsp_data; - wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] smem_rsp_tag; - wire [DCACHE_NUM_REQS-1:0] smem_rsp_ready; - - for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin - - assign smem_req_valid[i] = switch_out_bus_if[i * 2 + 1].req_valid; - assign smem_req_rw[i] = switch_out_bus_if[i * 2 + 1].req_data.rw; - assign smem_req_byteen[i] = switch_out_bus_if[i * 2 + 1].req_data.byteen; - assign smem_req_data[i] = switch_out_bus_if[i * 2 + 1].req_data.data; - assign smem_req_tag[i] = switch_out_bus_if[i * 2 + 1].req_data.tag; - assign switch_out_bus_if[i * 2 + 1].req_ready = smem_req_ready[i]; - - assign switch_out_bus_if[i * 2 + 1].rsp_valid = smem_rsp_valid[i]; - assign switch_out_bus_if[i * 2 + 1].rsp_data.data = smem_rsp_data[i]; - assign switch_out_bus_if[i * 2 + 1].rsp_data.tag = smem_rsp_tag[i]; - assign smem_rsp_ready[i] = switch_out_bus_if[i * 2 + 1].rsp_ready; - - assign smem_req_addr[i] = switch_out_bus_if[i * 2 + 1].req_data.addr[SMEM_ADDR_WIDTH-1:0]; - end - - `RESET_RELAY (smem_reset, reset); - - VX_shared_mem #( - .INSTANCE_ID($sformatf("core%0d-smem", CORE_ID)), - .SIZE (1 << `SMEM_LOG_SIZE), - .NUM_REQS (DCACHE_NUM_REQS), - .NUM_BANKS (`SMEM_NUM_BANKS), - .WORD_SIZE (DCACHE_WORD_SIZE), - .ADDR_WIDTH (SMEM_ADDR_WIDTH), - .UUID_WIDTH (`UUID_WIDTH), - .TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH) - ) shared_mem ( - .clk (clk), - .reset (smem_reset), - - `ifdef PERF_ENABLE - .cache_perf_if(perf_smem_if), - `endif - - // Core request - .req_valid (smem_req_valid), - .req_rw (smem_req_rw), - .req_byteen (smem_req_byteen), - .req_addr (smem_req_addr), - .req_data (smem_req_data), - .req_tag (smem_req_tag), - .req_ready (smem_req_ready), - - // Core response - .rsp_valid (smem_rsp_valid), - .rsp_data (smem_rsp_data), - .rsp_tag (smem_rsp_tag), - .rsp_ready (smem_rsp_ready) - ); - -`else - - for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin - `ASSIGN_VX_MEM_BUS_IF (dcache_bus_out_if[i], dcache_bus_in_if[i]); - end - - `UNUSED_VAR (clk) - `UNUSED_VAR (reset) - -`endif - -`ifdef PERF_ENABLE - - assign mem_perf_out_if.icache_reads = mem_perf_in_if.icache_reads; - assign mem_perf_out_if.icache_read_misses = mem_perf_in_if.icache_read_misses; - - assign mem_perf_out_if.dcache_reads = mem_perf_in_if.dcache_reads; - assign mem_perf_out_if.dcache_writes = mem_perf_in_if.dcache_writes; - assign mem_perf_out_if.dcache_read_misses = mem_perf_in_if.dcache_read_misses; - assign mem_perf_out_if.dcache_write_misses = mem_perf_in_if.dcache_write_misses; - assign mem_perf_out_if.dcache_bank_stalls = mem_perf_in_if.dcache_bank_stalls; - assign mem_perf_out_if.dcache_mshr_stalls = mem_perf_in_if.dcache_mshr_stalls; - - assign mem_perf_out_if.l2cache_reads = mem_perf_in_if.l2cache_reads; - assign mem_perf_out_if.l2cache_writes = mem_perf_in_if.l2cache_writes; - assign mem_perf_out_if.l2cache_read_misses = mem_perf_in_if.l2cache_read_misses; - assign mem_perf_out_if.l2cache_write_misses = mem_perf_in_if.l2cache_write_misses; - assign mem_perf_out_if.l2cache_bank_stalls = mem_perf_in_if.l2cache_bank_stalls; - assign mem_perf_out_if.l2cache_mshr_stalls = mem_perf_in_if.l2cache_mshr_stalls; - - assign mem_perf_out_if.l3cache_reads = mem_perf_in_if.l3cache_reads; - assign mem_perf_out_if.l3cache_writes = mem_perf_in_if.l3cache_writes; - assign mem_perf_out_if.l3cache_read_misses = mem_perf_in_if.l3cache_read_misses; - assign mem_perf_out_if.l3cache_write_misses = mem_perf_in_if.l3cache_write_misses; - assign mem_perf_out_if.l3cache_bank_stalls = mem_perf_in_if.l3cache_bank_stalls; - assign mem_perf_out_if.l3cache_mshr_stalls = mem_perf_in_if.l3cache_mshr_stalls; - - assign mem_perf_out_if.mem_reads = mem_perf_in_if.mem_reads; - assign mem_perf_out_if.mem_writes = mem_perf_in_if.mem_writes; - assign mem_perf_out_if.mem_latency = mem_perf_in_if.mem_latency; - -`ifdef SM_ENABLE - assign mem_perf_out_if.smem_reads = perf_smem_if.reads; - assign mem_perf_out_if.smem_writes = perf_smem_if.writes; - assign mem_perf_out_if.smem_bank_stalls = perf_smem_if.bank_stalls; -`else - assign mem_perf_out_if.smem_reads = '0; - assign mem_perf_out_if.smem_writes = '0; - assign mem_perf_out_if.smem_bank_stalls = '0; -`endif - -`endif - endmodule diff --git a/hw/rtl/core/VX_trace.vh b/hw/rtl/core/VX_trace.vh index 2bf2a9fc..1dea3347 100644 --- a/hw/rtl/core/VX_trace.vh +++ b/hw/rtl/core/VX_trace.vh @@ -358,9 +358,6 @@ task trace_ex_op(input int level, `INST_SFU_CSRRW: begin if (use_imm) `TRACE(level, ("CSRRWI")); else `TRACE(level, ("CSRRW")); end `INST_SFU_CSRRS: begin if (use_imm) `TRACE(level, ("CSRRSI")); else `TRACE(level, ("CSRRS")); end `INST_SFU_CSRRC: begin if (use_imm) `TRACE(level, ("CSRRCI")); else `TRACE(level, ("CSRRC")); end - `INST_SFU_TEX: `TRACE(level, ("TEX")); - `INST_SFU_RASTER:`TRACE(level, ("RASTER")); - `INST_SFU_ROP: `TRACE(level, ("ROP")); default: `TRACE(level, ("?")); endcase end diff --git a/hw/rtl/mem/VX_mem_perf_if.sv b/hw/rtl/mem/VX_mem_perf_if.sv index 277ebf1f..cc41f23b 100644 --- a/hw/rtl/mem/VX_mem_perf_if.sv +++ b/hw/rtl/mem/VX_mem_perf_if.sv @@ -13,106 +13,31 @@ `include "VX_define.vh" -interface VX_mem_perf_if (); +interface VX_mem_perf_if import VX_gpu_pkg::*; (); - wire [`PERF_CTR_BITS-1:0] icache_reads; - wire [`PERF_CTR_BITS-1:0] icache_read_misses; - - wire [`PERF_CTR_BITS-1:0] dcache_reads; - wire [`PERF_CTR_BITS-1:0] dcache_writes; - wire [`PERF_CTR_BITS-1:0] dcache_read_misses; - wire [`PERF_CTR_BITS-1:0] dcache_write_misses; - wire [`PERF_CTR_BITS-1:0] dcache_bank_stalls; - wire [`PERF_CTR_BITS-1:0] dcache_mshr_stalls; - - wire [`PERF_CTR_BITS-1:0] smem_reads; - wire [`PERF_CTR_BITS-1:0] smem_writes; - wire [`PERF_CTR_BITS-1:0] smem_bank_stalls; - - wire [`PERF_CTR_BITS-1:0] l2cache_reads; - wire [`PERF_CTR_BITS-1:0] l2cache_writes; - wire [`PERF_CTR_BITS-1:0] l2cache_read_misses; - wire [`PERF_CTR_BITS-1:0] l2cache_write_misses; - wire [`PERF_CTR_BITS-1:0] l2cache_bank_stalls; - wire [`PERF_CTR_BITS-1:0] l2cache_mshr_stalls; - - wire [`PERF_CTR_BITS-1:0] l3cache_reads; - wire [`PERF_CTR_BITS-1:0] l3cache_writes; - wire [`PERF_CTR_BITS-1:0] l3cache_read_misses; - wire [`PERF_CTR_BITS-1:0] l3cache_write_misses; - wire [`PERF_CTR_BITS-1:0] l3cache_bank_stalls; - wire [`PERF_CTR_BITS-1:0] l3cache_mshr_stalls; - - wire [`PERF_CTR_BITS-1:0] mem_reads; - wire [`PERF_CTR_BITS-1:0] mem_writes; - wire [`PERF_CTR_BITS-1:0] mem_latency; + cache_perf_t icache; + cache_perf_t dcache; + cache_perf_t l2cache; + cache_perf_t l3cache; + cache_perf_t smem; + mem_perf_t mem; modport master ( - output icache_reads, - output icache_read_misses, - - output dcache_reads, - output dcache_writes, - output dcache_read_misses, - output dcache_write_misses, - output dcache_bank_stalls, - output dcache_mshr_stalls, - - output smem_reads, - output smem_writes, - output smem_bank_stalls, - - output l2cache_reads, - output l2cache_writes, - output l2cache_read_misses, - output l2cache_write_misses, - output l2cache_bank_stalls, - output l2cache_mshr_stalls, - - output l3cache_reads, - output l3cache_writes, - output l3cache_read_misses, - output l3cache_write_misses, - output l3cache_bank_stalls, - output l3cache_mshr_stalls, - - output mem_reads, - output mem_writes, - output mem_latency + output icache, + output dcache, + output l2cache, + output l3cache, + output smem, + output mem ); modport slave ( - input icache_reads, - input icache_read_misses, - - input dcache_reads, - input dcache_writes, - input dcache_read_misses, - input dcache_write_misses, - input dcache_bank_stalls, - input dcache_mshr_stalls, - - input smem_reads, - input smem_writes, - input smem_bank_stalls, - - input l2cache_reads, - input l2cache_writes, - input l2cache_read_misses, - input l2cache_write_misses, - input l2cache_bank_stalls, - input l2cache_mshr_stalls, - - input l3cache_reads, - input l3cache_writes, - input l3cache_read_misses, - input l3cache_write_misses, - input l3cache_bank_stalls, - input l3cache_mshr_stalls, - - input mem_reads, - input mem_writes, - input mem_latency + input icache, + input dcache, + input l2cache, + input l3cache, + input smem, + input mem ); endinterface diff --git a/hw/rtl/mem/VX_mem_unit.sv b/hw/rtl/mem/VX_mem_unit.sv deleted file mode 100644 index 0f293a7e..00000000 --- a/hw/rtl/mem/VX_mem_unit.sv +++ /dev/null @@ -1,209 +0,0 @@ -// Copyright © 2019-2023 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -`include "VX_define.vh" - -`define SMEM_ADDR_STACK_OPT - -module VX_mem_unit import VX_gpu_pkg::*; #( - parameter CLUSTER_ID = 0 -) ( - input wire clk, - input wire reset, - -`ifdef PERF_ENABLE - VX_mem_perf_if.master mem_perf_if, -`endif - - VX_mem_bus_if.slave icache_bus_if [`NUM_SOCKETS], - - VX_mem_bus_if.slave dcache_bus_if [`NUM_SOCKETS * DCACHE_NUM_REQS], - - VX_mem_bus_if.master mem_bus_if -); - -`ifdef PERF_ENABLE - VX_cache_perf_if perf_icache_if(); - VX_cache_perf_if perf_dcache_if(); - VX_cache_perf_if perf_l2cache_if(); -`endif - -/////////////////////////////// I-Cache /////////////////////////////////// - - VX_mem_bus_if #( - .DATA_SIZE (ICACHE_LINE_SIZE), - .TAG_WIDTH (ICACHE_MEM_TAG_WIDTH) - ) icache_mem_bus_if(); - - `RESET_RELAY (icache_reset, reset); - - VX_cache_cluster #( - .INSTANCE_ID ($sformatf("cluster%0d-icache", CLUSTER_ID)), - .NUM_UNITS (`NUM_ICACHES), - .NUM_INPUTS (`NUM_SOCKETS), - .TAG_SEL_IDX (0), - .CACHE_SIZE (`ICACHE_SIZE), - .LINE_SIZE (ICACHE_LINE_SIZE), - .NUM_BANKS (1), - .NUM_WAYS (`ICACHE_NUM_WAYS), - .WORD_SIZE (ICACHE_WORD_SIZE), - .NUM_REQS (1), - .CRSQ_SIZE (`ICACHE_CRSQ_SIZE), - .MSHR_SIZE (`ICACHE_MSHR_SIZE), - .MRSQ_SIZE (`ICACHE_MRSQ_SIZE), - .MREQ_SIZE (`ICACHE_MREQ_SIZE), - .TAG_WIDTH (ICACHE_ARB_TAG_WIDTH), - .UUID_WIDTH (`UUID_WIDTH), - .WRITE_ENABLE (0), - .CORE_OUT_REG (2), - .MEM_OUT_REG (2) - ) icache ( - `ifdef PERF_ENABLE - .cache_perf_if (perf_icache_if), - `endif - .clk (clk), - .reset (icache_reset), - .core_bus_if (icache_bus_if), - .mem_bus_if (icache_mem_bus_if) - ); - -/////////////////////////////// D-Cache /////////////////////////////////// - - VX_mem_bus_if #( - .DATA_SIZE (DCACHE_LINE_SIZE), - .TAG_WIDTH (DCACHE_MEM_TAG_WIDTH) - ) dcache_mem_bus_if(); - - `RESET_RELAY (dcache_reset, reset); - - VX_cache_cluster #( - .INSTANCE_ID ($sformatf("cluster%0d-dcache", CLUSTER_ID)), - .NUM_UNITS (`NUM_DCACHES), - .NUM_INPUTS (`NUM_SOCKETS), - .TAG_SEL_IDX (1), - .CACHE_SIZE (`DCACHE_SIZE), - .LINE_SIZE (DCACHE_LINE_SIZE), - .NUM_BANKS (`DCACHE_NUM_BANKS), - .NUM_WAYS (`DCACHE_NUM_WAYS), - .WORD_SIZE (DCACHE_WORD_SIZE), - .NUM_REQS (DCACHE_NUM_REQS), - .CRSQ_SIZE (`DCACHE_CRSQ_SIZE), - .MSHR_SIZE (`DCACHE_MSHR_SIZE), - .MRSQ_SIZE (`DCACHE_MRSQ_SIZE), - .MREQ_SIZE (`DCACHE_MREQ_SIZE), - .TAG_WIDTH (DCACHE_ARB_TAG_WIDTH), - .UUID_WIDTH (`UUID_WIDTH), - .WRITE_ENABLE (1), - .NC_ENABLE (1), - .CORE_OUT_REG (`SM_ENABLED ? 2 : 1), - .MEM_OUT_REG (2) - ) dcache ( - `ifdef PERF_ENABLE - .cache_perf_if (perf_dcache_if), - `endif - - .clk (clk), - .reset (dcache_reset), - .core_bus_if (dcache_bus_if), - .mem_bus_if (dcache_mem_bus_if) - ); - -/////////////////////////////// L2-Cache ////////////////////////////////// - - VX_mem_bus_if #( - .DATA_SIZE (L2_WORD_SIZE), - .TAG_WIDTH (L2_TAG_WIDTH) - ) l2_mem_bus_if[L2_NUM_REQS](); - - localparam I_MEM_ARB_IDX = 0; - localparam D_MEM_ARB_IDX = I_MEM_ARB_IDX + 1; - - `ASSIGN_VX_MEM_BUS_IF_X (l2_mem_bus_if[I_MEM_ARB_IDX], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH); - `ASSIGN_VX_MEM_BUS_IF_X (l2_mem_bus_if[D_MEM_ARB_IDX], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH); - - `RESET_RELAY (l2_reset, reset); - - VX_cache_wrap #( - .INSTANCE_ID ($sformatf("cluster%0d-l2cache", CLUSTER_ID)), - .CACHE_SIZE (`L2_CACHE_SIZE), - .LINE_SIZE (`L2_LINE_SIZE), - .NUM_BANKS (`L2_NUM_BANKS), - .NUM_WAYS (`L2_NUM_WAYS), - .WORD_SIZE (L2_WORD_SIZE), - .NUM_REQS (L2_NUM_REQS), - .CRSQ_SIZE (`L2_CRSQ_SIZE), - .MSHR_SIZE (`L2_MSHR_SIZE), - .MRSQ_SIZE (`L2_MRSQ_SIZE), - .MREQ_SIZE (`L2_MREQ_SIZE), - .TAG_WIDTH (L1_MEM_TAG_WIDTH), - .WRITE_ENABLE (1), - .UUID_WIDTH (`UUID_WIDTH), - .CORE_OUT_REG (2), - .MEM_OUT_REG (2), - .NC_ENABLE (1), - .PASSTHRU (!`L2_ENABLED) - ) l2cache ( - .clk (clk), - .reset (l2_reset), - `ifdef PERF_ENABLE - .cache_perf_if (perf_l2cache_if), - `endif - .core_bus_if (l2_mem_bus_if), - .mem_bus_if (mem_bus_if) - ); - -`ifdef PERF_ENABLE - - `UNUSED_VAR (perf_dcache_if.mem_stalls) - `UNUSED_VAR (perf_dcache_if.crsp_stalls) - - assign mem_perf_if.icache_reads = perf_icache_if.reads; - assign mem_perf_if.icache_read_misses = perf_icache_if.read_misses; - - assign mem_perf_if.dcache_reads = perf_dcache_if.reads; - assign mem_perf_if.dcache_writes = perf_dcache_if.writes; - assign mem_perf_if.dcache_read_misses = perf_dcache_if.read_misses; - assign mem_perf_if.dcache_write_misses= perf_dcache_if.write_misses; - assign mem_perf_if.dcache_bank_stalls = perf_dcache_if.bank_stalls; - assign mem_perf_if.dcache_mshr_stalls = perf_dcache_if.mshr_stalls; - -`ifdef L2_ENABLE - assign mem_perf_if.l2cache_reads = perf_l2cache_if.reads; - assign mem_perf_if.l2cache_writes = perf_l2cache_if.writes; - assign mem_perf_if.l2cache_read_misses = perf_l2cache_if.read_misses; - assign mem_perf_if.l2cache_write_misses= perf_l2cache_if.write_misses; - assign mem_perf_if.l2cache_bank_stalls = perf_l2cache_if.bank_stalls; - assign mem_perf_if.l2cache_mshr_stalls = perf_l2cache_if.mshr_stalls; -`else - assign mem_perf_if.l2cache_reads = '0; - assign mem_perf_if.l2cache_writes = '0; - assign mem_perf_if.l2cache_read_misses = '0; - assign mem_perf_if.l2cache_write_misses= '0; - assign mem_perf_if.l2cache_bank_stalls = '0; - assign mem_perf_if.l2cache_mshr_stalls = '0; -`endif - - assign mem_perf_if.l3cache_reads = '0; - assign mem_perf_if.l3cache_writes = '0; - assign mem_perf_if.l3cache_read_misses = '0; - assign mem_perf_if.l3cache_write_misses= '0; - assign mem_perf_if.l3cache_bank_stalls = '0; - assign mem_perf_if.l3cache_mshr_stalls = '0; - - assign mem_perf_if.mem_reads = '0; - assign mem_perf_if.mem_writes = '0; - assign mem_perf_if.mem_latency = '0; - -`endif - -endmodule diff --git a/hw/rtl/mem/VX_shared_mem.sv b/hw/rtl/mem/VX_shared_mem.sv index ef19ef1c..a44c68a8 100644 --- a/hw/rtl/mem/VX_shared_mem.sv +++ b/hw/rtl/mem/VX_shared_mem.sv @@ -13,7 +13,7 @@ `include "VX_define.vh" -module VX_shared_mem #( +module VX_shared_mem import VX_gpu_pkg::*; #( parameter `STRING INSTANCE_ID = "", // Size of cache in bytes @@ -40,7 +40,7 @@ module VX_shared_mem #( // PERF `ifdef PERF_ENABLE - VX_cache_perf_if.master cache_perf_if, + output cache_perf_t cache_perf, `endif // Core request @@ -106,6 +106,10 @@ module VX_shared_mem #( wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_data_in; wire [NUM_BANKS-1:0][REQ_DATAW-1:0] req_data_out; +`ifdef PERF_ENABLE + wire [`PERF_CTR_BITS-1:0] perf_collisions; +`endif + for (genvar i = 0; i < NUM_REQS; ++i) begin assign req_data_in[i] = { req_rw[i], @@ -125,7 +129,7 @@ module VX_shared_mem #( .clk (clk), .reset (reset), `ifdef PERF_ENABLE - .collisions (cache_perf_if.bank_stalls), + .collisions (perf_collisions), `else `UNUSED_PIN (collisions), `endif @@ -253,13 +257,14 @@ module VX_shared_mem #( end end - assign cache_perf_if.reads = perf_reads; - assign cache_perf_if.writes = perf_writes; - assign cache_perf_if.read_misses = '0; - assign cache_perf_if.write_misses = '0; - assign cache_perf_if.mshr_stalls = '0; - assign cache_perf_if.mem_stalls = '0; - assign cache_perf_if.crsp_stalls = perf_crsp_stalls; + assign cache_perf.reads = perf_reads; + assign cache_perf.writes = perf_writes; + assign cache_perf.read_misses = '0; + assign cache_perf.write_misses = '0; + assign cache_perf.bank_stalls = perf_collisions; + assign cache_perf.mshr_stalls = '0; + assign cache_perf.mem_stalls = '0; + assign cache_perf.crsp_stalls = perf_crsp_stalls; `endif diff --git a/runtime/common/utils.cpp b/runtime/common/utils.cpp index 7a539250..72c2b80c 100644 --- a/runtime/common/utils.cpp +++ b/runtime/common/utils.cpp @@ -178,7 +178,26 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { uint64_t instrs = 0; uint64_t cycles = 0; -#ifdef PERF_ENABLE +#ifdef PERF_ENABLE + + auto calcRatio = [&](uint64_t part, uint64_t total)->int { + if (total == 0) + return 0; + return int((1.0 - (double(part) / double(total))) * 100); + }; + + auto caclAvgLatency = [&](uint64_t sum, uint64_t requests)->int { + if (requests == 0) + return 0; + return int(double(sum) / double(requests)); + }; + + auto calcUtilization = [&](uint64_t count, uint64_t stalls)->int { + if (count == 0) + return 0; + return int((double(count) / double(count + stalls)) * 100); + }; + auto perf_class = gAutoPerfDump.get_perf_class(); // PERF: pipeline stalls @@ -192,21 +211,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { uint64_t loads = 0; uint64_t stores = 0; uint64_t ifetch_lat = 0; - uint64_t load_lat = 0; - // PERF: Icache - uint64_t icache_reads = 0; - uint64_t icache_read_misses = 0; - // PERF: Dcache - uint64_t dcache_reads = 0; - uint64_t dcache_writes = 0; - uint64_t dcache_read_misses = 0; - uint64_t dcache_write_misses = 0; - uint64_t dcache_bank_stalls = 0; - uint64_t dcache_mshr_stalls = 0; - // PERF: shared memory - uint64_t smem_reads = 0; - uint64_t smem_writes = 0; - uint64_t smem_bank_stalls = 0; + uint64_t load_lat = 0; // PERF: l2cache uint64_t l2cache_reads = 0; uint64_t l2cache_writes = 0; @@ -232,6 +237,19 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { if (ret != 0) return ret; +#ifdef PERF_ENABLE + uint64_t isa_flags; + ret = vx_dev_caps(hdevice, VX_CAPS_ISA_FLAGS, &isa_flags); + if (ret != 0) + return ret; + + bool icache_enable = isa_flags & VX_ISA_EXT_ICACHE; + bool dcache_enable = isa_flags & VX_ISA_EXT_DCACHE; + bool l2cache_enable = isa_flags & VX_ISA_EXT_L2CACHE; + bool l3cache_enable = isa_flags & VX_ISA_EXT_L3CACHE; + bool smem_enable = isa_flags & VX_ISA_EXT_SMEM; +#endif + std::vector staging_buf(64* sizeof(uint32_t)); for (unsigned core_id = 0; core_id < num_cores; ++core_id) { @@ -240,13 +258,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { if (ret != 0) return ret; - uint64_t instrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MINSTRET); - uint64_t cycles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MCYCLE); - float IPC = (float)(double(instrs_per_core) / double(cycles_per_core)); - if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC); - instrs += instrs_per_core; - cycles = std::max(cycles_per_core, cycles); - #ifdef PERF_ENABLE switch (perf_class) { case VX_DCR_MPM_CLASS_CORE: { @@ -291,52 +302,78 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { // ifetch latency uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LAT); if (num_cores > 1) { - int mem_avg_lat = (int)(double(ifetch_lat_per_core) / double(ifetches_per_core)); + int mem_avg_lat = caclAvgLatency(ifetch_lat_per_core, ifetches_per_core); fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat); } ifetch_lat += ifetch_lat_per_core; // load latency uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LAT); if (num_cores > 1) { - int mem_avg_lat = (int)(double(load_lat_per_core) / double(loads_per_core)); + int mem_avg_lat = caclAvgLatency(load_lat_per_core, loads_per_core); fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat); } load_lat += load_lat_per_core; } break; case VX_DCR_MPM_CLASS_MEM: { - if (0 == core_id) { - // PERF: Icache - icache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_READS); - icache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_MISS_R); - - // PERF: Dcache - dcache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_READS); - dcache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_WRITES); - dcache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MISS_R); - dcache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MISS_W); - dcache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_BANK_ST); - dcache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MSHR_ST); - + if (smem_enable) { // PERF: smem - smem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_READS); - smem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_WRITES); - smem_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_BANK_ST); + uint64_t smem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_READS); + uint64_t smem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_WRITES); + uint64_t smem_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_BANK_ST); + int smem_bank_utilization = calcUtilization(smem_reads + smem_writes, smem_bank_stalls); + fprintf(stream, "PERF: core%d: smem reads=%ld\n", core_id, smem_reads); + fprintf(stream, "PERF: core%d: smem writes=%ld\n", core_id, smem_writes); + fprintf(stream, "PERF: core%d: smem bank stalls=%ld (utilization=%d%%)\n", core_id, smem_bank_stalls, smem_bank_utilization); + } + + if (icache_enable) { + // PERF: Icache + uint64_t icache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_READS); + uint64_t icache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_MISS_R); + int icache_read_hit_ratio = calcRatio(icache_read_misses, icache_reads); + fprintf(stream, "PERF: core%d: icache reads=%ld\n", core_id, icache_reads); + fprintf(stream, "PERF: core%d: icache read misses=%ld (hit ratio=%d%%)\n", core_id, icache_read_misses, icache_read_hit_ratio); + } + if (dcache_enable) { + // PERF: Dcache + uint64_t dcache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_READS); + uint64_t dcache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_WRITES); + uint64_t dcache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MISS_R); + uint64_t dcache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MISS_W); + uint64_t dcache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_BANK_ST); + uint64_t dcache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MSHR_ST); + int dcache_read_hit_ratio = calcRatio(dcache_read_misses, dcache_reads); + int dcache_write_hit_ratio = calcRatio(dcache_write_misses, dcache_writes); + int dcache_bank_utilization = calcUtilization(dcache_reads + dcache_writes, dcache_bank_stalls); + fprintf(stream, "PERF: core%d: dcache reads=%ld\n", core_id, dcache_reads); + fprintf(stream, "PERF: core%d: dcache writes=%ld\n", core_id, dcache_writes); + fprintf(stream, "PERF: core%d: dcache read misses=%ld (hit ratio=%d%%)\n", core_id, dcache_read_misses, dcache_read_hit_ratio); + fprintf(stream, "PERF: core%d: dcache write misses=%ld (hit ratio=%d%%)\n", core_id, dcache_write_misses, dcache_write_hit_ratio); + fprintf(stream, "PERF: core%d: dcache bank stalls=%ld (utilization=%d%%)\n", core_id, dcache_bank_stalls, dcache_bank_utilization); + fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld\n", core_id, dcache_mshr_stalls); + } + + if (l2cache_enable) { // PERF: L2cache - l2cache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_READS); - l2cache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_WRITES); - l2cache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MISS_R); - l2cache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MISS_W); - l2cache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_BANK_ST); - l2cache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MSHR_ST); - - // PERF: L3cache - l3cache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_READS); - l3cache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_WRITES); - l3cache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MISS_R); - l3cache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MISS_W); - l3cache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_BANK_ST); - l3cache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MSHR_ST); + l2cache_reads += get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_READS); + l2cache_writes += get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_WRITES); + l2cache_read_misses += get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MISS_R); + l2cache_write_misses += get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MISS_W); + l2cache_bank_stalls += get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_BANK_ST); + l2cache_mshr_stalls += get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MSHR_ST); + } + + if (0 == core_id) { + if (l3cache_enable) { + // PERF: L3cache + l3cache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_READS); + l3cache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_WRITES); + l3cache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MISS_R); + l3cache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MISS_W); + l3cache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_BANK_ST); + l3cache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MSHR_ST); + } // PERF: memory mem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_READS); @@ -347,11 +384,15 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { default: break; } - #endif - } - - float IPC = (float)(double(instrs) / double(cycles)); - fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC); + #endif + + uint64_t instrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MINSTRET); + uint64_t cycles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MCYCLE); + float IPC = (float)(double(instrs_per_core) / double(cycles_per_core)); + if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC); + instrs += instrs_per_core; + cycles = std::max(cycles_per_core, cycles); + } #ifdef PERF_ENABLE switch (perf_class) { @@ -368,52 +409,51 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { fprintf(stream, "PERF: loads=%ld\n", loads); fprintf(stream, "PERF: stores=%ld\n", stores); fprintf(stream, "PERF: ifetch latency=%d cycles\n", ifetch_avg_lat); - fprintf(stream, "PERF: load latency=%d cycles\n", load_avg_lat); - + fprintf(stream, "PERF: load latency=%d cycles\n", load_avg_lat); } break; - case VX_DCR_MPM_CLASS_MEM: { - int icache_read_hit_ratio = (int)((1.0 - (double(icache_read_misses) / double(icache_reads))) * 100); - int dcache_read_hit_ratio = (int)((1.0 - (double(dcache_read_misses) / double(dcache_reads))) * 100); - int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_write_misses) / double(dcache_writes))) * 100); - int dcache_bank_utilization = (int)((double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls)) * 100); - int l2cache_read_hit_ratio = (int)((1.0 - (double(l2cache_read_misses) / double(l2cache_reads))) * 100); - int l2cache_write_hit_ratio = (int)((1.0 - (double(l2cache_write_misses) / double(l2cache_writes))) * 100); - int l2cache_bank_utilization = (int)((double(l2cache_reads + l2cache_writes) / double(l2cache_reads + l2cache_writes + l2cache_bank_stalls)) * 100); - int l3cache_read_hit_ratio = (int)((1.0 - (double(l3cache_read_misses) / double(l3cache_reads))) * 100); - int l3cache_write_hit_ratio = (int)((1.0 - (double(l3cache_write_misses) / double(l3cache_writes))) * 100); - int l3cache_bank_utilization = (int)((double(l3cache_reads + l3cache_writes) / double(l3cache_reads + l3cache_writes + l3cache_bank_stalls)) * 100); - int smem_bank_utilization = (int)((double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls)) * 100); - int mem_avg_lat = (int)(double(mem_lat) / double(mem_reads)); - fprintf(stream, "PERF: icache reads=%ld\n", icache_reads); - fprintf(stream, "PERF: icache read misses=%ld (hit ratio=%d%%)\n", icache_read_misses, icache_read_hit_ratio); - fprintf(stream, "PERF: dcache reads=%ld\n", dcache_reads); - fprintf(stream, "PERF: dcache writes=%ld\n", dcache_writes); - fprintf(stream, "PERF: dcache read misses=%ld (hit ratio=%d%%)\n", dcache_read_misses, dcache_read_hit_ratio); - fprintf(stream, "PERF: dcache write misses=%ld (hit ratio=%d%%)\n", dcache_write_misses, dcache_write_hit_ratio); - fprintf(stream, "PERF: dcache bank stalls=%ld (utilization=%d%%)\n", dcache_bank_stalls, dcache_bank_utilization); - fprintf(stream, "PERF: dcache mshr stalls=%ld\n", dcache_mshr_stalls); - fprintf(stream, "PERF: smem reads=%ld\n", smem_reads); - fprintf(stream, "PERF: smem writes=%ld\n", smem_writes); - fprintf(stream, "PERF: smem bank stalls=%ld (utilization=%d%%)\n", smem_bank_stalls, smem_bank_utilization); - fprintf(stream, "PERF: l2cache reads=%ld\n", l2cache_reads); - fprintf(stream, "PERF: l2cache writes=%ld\n", l2cache_writes); - fprintf(stream, "PERF: l2cache read misses=%ld (hit ratio=%d%%)\n", l2cache_read_misses, l2cache_read_hit_ratio); - fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, l2cache_write_hit_ratio); - fprintf(stream, "PERF: l2cache bank stalls=%ld (utilization=%d%%)\n", l2cache_bank_stalls, l2cache_bank_utilization); - fprintf(stream, "PERF: l2cache mshr stalls=%ld\n", l2cache_mshr_stalls); - fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads); - fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes); - fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, l3cache_read_hit_ratio); - fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, l3cache_write_hit_ratio); - fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, l3cache_bank_utilization); - fprintf(stream, "PERF: l3cache mshr stalls=%ld\n", l3cache_mshr_stalls); + case VX_DCR_MPM_CLASS_MEM: { + if (l2cache_enable) { + l2cache_reads /= num_cores; + l2cache_writes /= num_cores; + l2cache_read_misses /= num_cores; + l2cache_write_misses /= num_cores; + l2cache_bank_stalls /= num_cores; + l2cache_mshr_stalls /= num_cores; + int l2cache_read_hit_ratio = calcRatio(l2cache_read_misses, l2cache_reads); + int l2cache_write_hit_ratio = calcRatio(l2cache_write_misses, l2cache_writes); + int l2cache_bank_utilization = calcUtilization(l2cache_reads + l2cache_writes, l2cache_bank_stalls); + + fprintf(stream, "PERF: l2cache reads=%ld\n", l2cache_reads); + fprintf(stream, "PERF: l2cache writes=%ld\n", l2cache_writes); + fprintf(stream, "PERF: l2cache read misses=%ld (hit ratio=%d%%)\n", l2cache_read_misses, l2cache_read_hit_ratio); + fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, l2cache_write_hit_ratio); + fprintf(stream, "PERF: l2cache bank stalls=%ld (utilization=%d%%)\n", l2cache_bank_stalls, l2cache_bank_utilization); + fprintf(stream, "PERF: l2cache mshr stalls=%ld\n", l2cache_mshr_stalls); + } + + if (l3cache_enable) { + int l3cache_read_hit_ratio = calcRatio(l3cache_read_misses, l3cache_reads); + int l3cache_write_hit_ratio = calcRatio(l3cache_write_misses, l3cache_writes); + int l3cache_bank_utilization = calcUtilization(l3cache_reads + l3cache_writes, l3cache_bank_stalls); + fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads); + fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes); + fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, l3cache_read_hit_ratio); + fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, l3cache_write_hit_ratio); + fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, l3cache_bank_utilization); + fprintf(stream, "PERF: l3cache mshr stalls=%ld\n", l3cache_mshr_stalls); + } + + int mem_avg_lat = caclAvgLatency(mem_lat, mem_reads); fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes); fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat); } break; default: break; } -#endif +#endif + + float IPC = (float)(double(instrs) / double(cycles)); + fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC); fflush(stream); diff --git a/runtime/include/vortex.h b/runtime/include/vortex.h index 9a91037c..749649f0 100644 --- a/runtime/include/vortex.h +++ b/runtime/include/vortex.h @@ -47,10 +47,12 @@ typedef void* vx_device_h; #define VX_ISA_STD_Q (1ull << 16) #define VX_ISA_STD_S (1ull << 18) #define VX_ISA_STD_U (1ull << 20) -#define VX_ISA_BASE(flags) (1 << (((flags >> 30) & 0x3) + 4)) -#define VX_ISA_EXT_TEX (1ull << 32) -#define VX_ISA_EXT_RASTER (1ull << 33) -#define VX_ISA_EXT_ROP (1ull << 34) +#define VX_ISA_ARCH(flags) (1 << (((flags >> 30) & 0x3) + 4)) +#define VX_ISA_EXT_ICACHE (1ull << 32) +#define VX_ISA_EXT_DCACHE (1ull << 33) +#define VX_ISA_EXT_L2CACHE (1ull << 34) +#define VX_ISA_EXT_L3CACHE (1ull << 35) +#define VX_ISA_EXT_SMEM (1ull << 36) // device memory types #define VX_MEM_TYPE_GLOBAL 0