From d956e268b96c7a3cade7d5f2d9d37f4828143ce6 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 22 Dec 2020 12:33:45 -0800 Subject: [PATCH] adding new performance counters (banks utilization and DRAM bus utilization) --- driver/common/vx_utils.cpp | 158 +++++++++-------- hw/rtl/VX_config.vh | 103 +++++------ hw/rtl/VX_csr_data.v | 101 +++++------ hw/rtl/VX_issue.v | 71 +++++++- hw/rtl/VX_mem_unit.v | 68 +++++--- hw/rtl/VX_pipeline.v | 74 -------- hw/rtl/cache/VX_bank.v | 24 +-- hw/rtl/cache/VX_cache.v | 197 +++++++++------------- hw/rtl/cache/VX_cache_core_req_bank_sel.v | 25 ++- hw/rtl/cache/VX_tag_store.v | 1 + hw/rtl/interfaces/VX_perf_cache_if.v | 7 +- hw/rtl/interfaces/VX_perf_memsys_if.v | 20 ++- hw/rtl/interfaces/VX_perf_pipeline_if.v | 8 +- hw/simulate/simulator.cpp | 8 +- 14 files changed, 426 insertions(+), 439 deletions(-) diff --git a/driver/common/vx_utils.cpp b/driver/common/vx_utils.cpp index 584ce270..27496a19 100644 --- a/driver/common/vx_utils.cpp +++ b/driver/common/vx_utils.cpp @@ -108,37 +108,39 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { uint64_t instrs = 0; uint64_t cycles = 0; + #ifdef PERF_ENABLE // PERF: pipeline stalls + uint64_t ibuffer_stalls = 0; + uint64_t scoreboard_stalls = 0; uint64_t lsu_stalls = 0; uint64_t fpu_stalls = 0; uint64_t mul_stalls = 0; uint64_t csr_stalls = 0; uint64_t alu_stalls = 0; uint64_t gpu_stalls = 0; - uint64_t ibuffer_stalls = 0; - uint64_t scoreboard_stalls = 0; - uint64_t icache_stalls = 0; // PERF: Icache uint64_t icache_reads = 0; uint64_t icache_read_misses = 0; uint64_t icache_pipe_stalls = 0; - uint64_t icache_dram_stalls = 0; - uint64_t icache_mshr_stalls = 0; uint64_t icache_rsp_stalls = 0; // PERF: Dcache uint64_t dcache_reads = 0; uint64_t dcache_writes = 0; uint64_t dcache_read_misses = 0; uint64_t dcache_write_misses = 0; - uint64_t dcache_pipe_stalls = 0; - uint64_t dcache_dram_stalls = 0; + uint64_t dcache_bank_stalls = 0; uint64_t dcache_mshr_stalls = 0; - uint64_t 
dcache_rsp_stalls = 0; - uint64_t dcache_evictions = 0; + uint64_t dcache_pipe_stalls = 0; + uint64_t dcache_rsp_stalls = 0; + // PERF: SMEM + uint64_t smem_reads = 0; + uint64_t smem_writes = 0; + uint64_t smem_bank_stalls = 0; // PERF: memory - uint64_t dram_req = 0; - uint64_t dram_rsp = 0; + uint64_t dram_reads = 0; + uint64_t dram_writes = 0; + uint64_t dram_stalls = 0; uint64_t dram_lat = 0; #endif @@ -154,11 +156,6 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { #ifdef PERF_ENABLE // PERF: pipeline - // icache_stall - uint64_t icache_stalls_per_core; - ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_ST, CSR_MPM_ICACHE_ST_H, &icache_stalls_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: icache stalls=%ld\n", core_id, icache_stalls_per_core); - icache_stalls += icache_stalls_per_core; // ibuffer_stall uint64_t ibuffer_stalls_per_core; ret |= vx_csr_get_l(device, core_id, CSR_MPM_IBUF_ST, CSR_MPM_IBUF_ST_H, &ibuffer_stalls_per_core); @@ -209,7 +206,8 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { // read misses uint64_t icache_miss_r_per_core; ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_MISS_R, CSR_MPM_ICACHE_MISS_R_H, &icache_miss_r_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: icache read misses=%ld\n", core_id, icache_miss_r_per_core); + int icache_read_hit_ratio = (int)((1.0 - (double(icache_miss_r_per_core) / double(icache_reads_per_core))) * 100); + if (num_cores > 1) fprintf(stream, "PERF: core%d: icache read misses=%ld (hit ratio=%d%%)\n", core_id, icache_miss_r_per_core, icache_read_hit_ratio); icache_read_misses += icache_miss_r_per_core; // pipeline stalls uint64_t icache_pipe_st_per_core; @@ -221,16 +219,6 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_CRSP_ST, CSR_MPM_ICACHE_CRSP_ST_H, &icache_crsp_st_per_core); if (num_cores > 1) fprintf(stream, "PERF: core%d: icache reponse stalls=%ld\n", core_id, 
icache_crsp_st_per_core); icache_rsp_stalls += icache_crsp_st_per_core; - // dram_stalls - uint64_t icache_dram_st_per_core; - ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_DREQ_ST, CSR_MPM_ICACHE_DREQ_ST_H, &icache_dram_st_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: icache dram stalls=%ld\n", core_id, icache_dram_st_per_core); - icache_dram_stalls += icache_dram_st_per_core; - // mshr_stalls - uint64_t icache_mshr_st_per_core; - ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_MSHR_ST, CSR_MPM_ICACHE_MSHR_ST_H, &icache_mshr_st_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: icache mshr stalls=%ld\n", core_id, icache_mshr_st_per_core); - icache_mshr_stalls += icache_mshr_st_per_core; // PERF: Dcache // total reads @@ -246,50 +234,70 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { // read misses uint64_t dcache_miss_r_per_core; ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_MISS_R, CSR_MPM_DCACHE_MISS_R_H, &dcache_miss_r_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache read misses=%ld\n", core_id, dcache_miss_r_per_core); + int dcache_read_hit_ratio = (int)((1.0 - (double(dcache_miss_r_per_core) / double(dcache_reads_per_core))) * 100); + if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache read misses=%ld (hit ratio=%d%%)\n", core_id, dcache_miss_r_per_core, dcache_read_hit_ratio); dcache_read_misses += dcache_miss_r_per_core; // read misses uint64_t dcache_miss_w_per_core; - ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_MISS_W, CSR_MPM_DCACHE_MISS_W_H, &dcache_miss_w_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache wrire misses=%ld\n", core_id, dcache_miss_w_per_core); + ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_MISS_W, CSR_MPM_DCACHE_MISS_W_H, &dcache_miss_w_per_core); + int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_miss_w_per_core) / double(dcache_writes_per_core))) * 100); + if (num_cores > 1) fprintf(stream, "PERF: core%d: 
dcache wrire misses=%ld (hit ratio=%d%%)\n", core_id, dcache_miss_w_per_core, dcache_write_hit_ratio); dcache_write_misses += dcache_miss_w_per_core; - // total_evictions - uint64_t dcache_evictions_per_core; - ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_EVICTS, CSR_MPM_DCACHE_EVICTS_H, &dcache_evictions_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache evictions_per_core=%ld\n", core_id, dcache_evictions_per_core); - dcache_evictions += dcache_evictions_per_core; - // pipeline stalls - uint64_t dcache_pipe_st_per_core; - ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_PIPE_ST, CSR_MPM_DCACHE_PIPE_ST_H, &dcache_pipe_st_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache pipeline stalls=%ld\n", core_id, dcache_pipe_st_per_core); - dcache_pipe_stalls += dcache_pipe_st_per_core; - // response stalls - uint64_t dcache_crsp_st_per_core; - ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_CRSP_ST, CSR_MPM_DCACHE_CRSP_ST_H, &dcache_crsp_st_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache reponse stalls=%ld\n", core_id, dcache_crsp_st_per_core); - dcache_rsp_stalls += dcache_crsp_st_per_core; - // dram_stalls - uint64_t dcache_dram_st_per_core; - ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_DREQ_ST, CSR_MPM_DCACHE_DREQ_ST_H, &dcache_dram_st_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache dram stalls=%ld\n", core_id, dcache_dram_st_per_core); - dcache_dram_stalls += dcache_dram_st_per_core; + // bank_stalls + uint64_t dcache_bank_st_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_BANK_ST, CSR_MPM_DCACHE_BANK_ST_H, &dcache_bank_st_per_core); + int dcache_bank_utilization = (int)((1.0 - (double(dcache_reads_per_core + dcache_writes_per_core) / double(dcache_reads_per_core + dcache_writes_per_core + dcache_bank_st_per_core))) * 100); + if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache bank stalls=%ld (utilization=%d%%)\n", core_id, 
dcache_bank_st_per_core, dcache_bank_utilization); + dcache_bank_stalls += dcache_bank_st_per_core; // mshr_stalls uint64_t dcache_mshr_st_per_core; ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_MSHR_ST, CSR_MPM_DCACHE_MSHR_ST_H, &dcache_mshr_st_per_core); if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld\n", core_id, dcache_mshr_st_per_core); dcache_mshr_stalls += dcache_mshr_st_per_core; + // pipeline stalls + uint64_t dcache_pipe_st_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_PIPE_ST, CSR_MPM_DCACHE_PIPE_ST_H, &dcache_pipe_st_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache pipeline stalls=%ld\n", core_id, dcache_pipe_st_per_core); + dcache_pipe_stalls += dcache_pipe_st_per_core; + // response stalls + uint64_t dcache_crsp_st_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_CRSP_ST, CSR_MPM_DCACHE_CRSP_ST_H, &dcache_crsp_st_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache reponse stalls=%ld\n", core_id, dcache_crsp_st_per_core); + dcache_rsp_stalls += dcache_crsp_st_per_core; - // PERF: dram_latency - uint64_t dram_req_per_core, dram_rsp_per_core, dram_lat_per_core; - ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_REQ, CSR_MPM_DRAM_REQ_H, &dram_req_per_core); - ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_RSP, CSR_MPM_DRAM_RSP_H, &dram_rsp_per_core); + // PERF: SMEM + // total reads + uint64_t smem_reads_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_SMEM_READS, CSR_MPM_SMEM_READS_H, &smem_reads_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: smem reads=%ld\n", core_id, smem_reads_per_core); + smem_reads += smem_reads_per_core; + // total write + uint64_t smem_writes_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_SMEM_WRITES, CSR_MPM_SMEM_WRITES_H, &smem_writes_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: smem writes=%ld\n", core_id, smem_writes_per_core); + smem_writes += smem_writes_per_core; + 
// bank_stalls + uint64_t smem_bank_st_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_SMEM_BANK_ST, CSR_MPM_SMEM_BANK_ST_H, &smem_bank_st_per_core); + int smem_bank_utilization = (int)((1.0 - (double(smem_reads_per_core + smem_writes_per_core) / double(smem_reads_per_core + smem_writes_per_core + smem_bank_st_per_core))) * 100); + if (num_cores > 1) fprintf(stream, "PERF: core%d: smem bank stalls=%ld (utilization=%d%%)\n", core_id, smem_bank_st_per_core, smem_bank_utilization); + smem_bank_stalls += smem_bank_st_per_core; + + // PERF: DRAM + uint64_t dram_reads_per_core, dram_writes_per_core, dram_stalls_per_core, dram_lat_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_READS, CSR_MPM_DRAM_READS_H, &dram_reads_per_core); + ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_WRITES, CSR_MPM_DRAM_WRITES_H, &dram_writes_per_core); + ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_ST, CSR_MPM_DRAM_ST_H, &dram_stalls_per_core); ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_LAT, CSR_MPM_DRAM_LAT_H, &dram_lat_per_core); - int avg_dram_lat_per_core = (int)(double(dram_lat_per_core) / double(dram_rsp_per_core)); - if (num_cores > 1) fprintf(stream, "PERF: core%d: dram requests=%ld (reads=%ld, writes=%ld)\n", core_id, dram_req_per_core, dram_rsp_per_core, dram_req_per_core - dram_rsp_per_core); - if (num_cores > 1) fprintf(stream, "PERF: core%d: average dram latency=%d cycles\n", core_id, avg_dram_lat_per_core); - dram_req += dram_req_per_core; - dram_rsp += dram_rsp_per_core; - dram_lat += dram_lat_per_core; + int avg_dram_lat = (int)(double(dram_lat_per_core) / double(dram_reads_per_core)); + int dram_utilization = (int)((1.0 - (double(dram_reads_per_core + dram_writes_per_core) / double(dram_reads_per_core + dram_writes_per_core + dram_stalls_per_core))) * 100); + if (num_cores > 1) fprintf(stream, "PERF: core%d: dram requests=%ld (reads=%ld, writes=%ld)\n", core_id, (dram_reads_per_core + dram_writes_per_core), dram_reads_per_core, 
dram_writes_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: dram stalls=%d (utilization=%d%%)\n", core_id, dram_stalls_per_core, dram_utilization); + if (num_cores > 1) fprintf(stream, "PERF: core%d: average dram latency=%d cycles\n", core_id, avg_dram_lat); + dram_reads += dram_reads_per_core; + dram_writes += dram_writes_per_core; + dram_stalls += dram_stalls_per_core; + dram_lat += dram_lat_per_core; #endif } @@ -297,7 +305,13 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC); #ifdef PERF_ENABLE - fprintf(stream, "PERF: icache stalls=%ld\n", icache_stalls); + int icache_read_hit_ratio = (int)((1.0 - (double(icache_read_misses) / double(icache_reads))) * 100); + int dcache_read_hit_ratio = (int)((1.0 - (double(dcache_read_misses) / double(dcache_reads))) * 100); + int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_write_misses) / double(dcache_writes))) * 100); + int dcache_bank_utilization = (int)((1.0 - (double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls))) * 100); + int smem_bank_utilization = (int)((1.0 - (double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls))) * 100); + int dram_utilization = (int)((1.0 - (double(dram_reads + dram_writes) / double(dram_reads + dram_writes + dram_stalls))) * 100); + int avg_dram_lat = (int)(double(dram_lat) / double(dram_reads)); fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls); fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls); fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls); @@ -307,22 +321,22 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { fprintf(stream, "PERF: fpu unit stalls=%ld\n", fpu_stalls); fprintf(stream, "PERF: gpu unit stalls=%ld\n", gpu_stalls); fprintf(stream, "PERF: icache reads=%ld\n", icache_reads); - fprintf(stream, "PERF: icache read misses=%ld\n", 
icache_read_misses); + fprintf(stream, "PERF: icache read misses=%ld (hit ratio=%d%%)\n", icache_read_misses, icache_read_hit_ratio); + fprintf(stream, "PERF: icache pipeline stalls=%ld\n", icache_pipe_stalls); fprintf(stream, "PERF: icache reponse stalls=%ld\n", icache_rsp_stalls); - fprintf(stream, "PERF: icache pipeline stalls=%ld\n", icache_pipe_stalls); - fprintf(stream, "PERF: icache dram stalls=%ld\n", icache_dram_stalls); - fprintf(stream, "PERF: icache mshr stalls=%ld\n", icache_mshr_stalls); fprintf(stream, "PERF: dcache reads=%ld\n", dcache_reads); fprintf(stream, "PERF: dcache writes=%ld\n", dcache_writes); - fprintf(stream, "PERF: dcache read misses=%ld\n", dcache_read_misses); - fprintf(stream, "PERF: dcache wrire misses=%ld\n", dcache_write_misses); - fprintf(stream, "PERF: dcache evictions=%ld\n", dcache_evictions); + fprintf(stream, "PERF: dcache read misses=%ld (hit ratio=%d%%)\n", dcache_read_misses, dcache_read_hit_ratio); + fprintf(stream, "PERF: dcache write misses=%ld (hit ratio=%d%%)\n", dcache_write_misses, dcache_write_hit_ratio); + fprintf(stream, "PERF: dcache bank stalls=%ld (utilization=%d%%)\n", dcache_bank_stalls, dcache_bank_utilization); + fprintf(stream, "PERF: dcache mshr stalls=%ld\n", dcache_mshr_stalls); fprintf(stream, "PERF: dcache pipeline stalls=%ld\n", dcache_pipe_stalls); fprintf(stream, "PERF: dcache reponse stalls=%ld\n", dcache_rsp_stalls); - fprintf(stream, "PERF: dcache dram stalls=%ld\n", dcache_dram_stalls); - fprintf(stream, "PERF: dcache mshr stalls=%ld\n", dcache_mshr_stalls); - fprintf(stream, "PERF: dram requests=%ld (reads=%ld, writes=%ld)\n", dram_req, dram_rsp, dram_req - dram_rsp); - int avg_dram_lat = (int)(double(dram_lat) / double(dram_rsp)); + fprintf(stream, "PERF: smem reads=%ld\n", smem_reads); + fprintf(stream, "PERF: smem writes=%ld\n", smem_writes); + fprintf(stream, "PERF: smem bank stalls=%ld (utilization=%d%%)\n", smem_bank_stalls, smem_bank_utilization); + fprintf(stream, "PERF: dram 
requests=%ld (reads=%ld, writes=%ld)\n", (dram_reads + dram_writes), dram_reads, dram_writes); + fprintf(stream, "PERF: dram stalls=%ld (utilization=%d%%)\n", dram_stalls, dram_utilization); fprintf(stream, "PERF: average dram latency=%d cycles\n", avg_dram_lat); #endif diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index d3c7664a..6f12fc06 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -167,63 +167,64 @@ // Machine Performance-monitoring counters // PERF: pipeline -`define CSR_MPM_ICACHE_ST 12'hB03 -`define CSR_MPM_ICACHE_ST_H 12'hB83 -`define CSR_MPM_IBUF_ST 12'hB04 -`define CSR_MPM_IBUF_ST_H 12'hB84 -`define CSR_MPM_SCRB_ST 12'hB05 -`define CSR_MPM_SCRB_ST_H 12'hB85 -`define CSR_MPM_ALU_ST 12'hB06 -`define CSR_MPM_ALU_ST_H 12'hB86 -`define CSR_MPM_LSU_ST 12'hB07 -`define CSR_MPM_LSU_ST_H 12'hB87 -`define CSR_MPM_CSR_ST 12'hB08 -`define CSR_MPM_CSR_ST_H 12'hB88 -`define CSR_MPM_MUL_ST 12'hB09 -`define CSR_MPM_MUL_ST_H 12'hB89 -`define CSR_MPM_FPU_ST 12'hB0A -`define CSR_MPM_FPU_ST_H 12'hB8A -`define CSR_MPM_GPU_ST 12'hB0B -`define CSR_MPM_GPU_ST_H 12'hB8B +`define CSR_MPM_IBUF_ST 12'hB03 +`define CSR_MPM_IBUF_ST_H 12'hB83 +`define CSR_MPM_SCRB_ST 12'hB04 +`define CSR_MPM_SCRB_ST_H 12'hB84 +`define CSR_MPM_ALU_ST 12'hB05 +`define CSR_MPM_ALU_ST_H 12'hB85 +`define CSR_MPM_LSU_ST 12'hB06 +`define CSR_MPM_LSU_ST_H 12'hB86 +`define CSR_MPM_CSR_ST 12'hB07 +`define CSR_MPM_CSR_ST_H 12'hB87 +`define CSR_MPM_MUL_ST 12'hB08 +`define CSR_MPM_MUL_ST_H 12'hB88 +`define CSR_MPM_FPU_ST 12'hB09 +`define CSR_MPM_FPU_ST_H 12'hB89 +`define CSR_MPM_GPU_ST 12'hB0A +`define CSR_MPM_GPU_ST_H 12'hB8A // PERF: icache -`define CSR_MPM_ICACHE_MISS_R 12'hB0C // read misses +`define CSR_MPM_ICACHE_READS 12'hB0B // total reads +`define CSR_MPM_ICACHE_READS_H 12'hB8B +`define CSR_MPM_ICACHE_MISS_R 12'hB0C // total misses `define CSR_MPM_ICACHE_MISS_R_H 12'hB8C -`define CSR_MPM_ICACHE_DREQ_ST 12'hB0D // dram request stalls -`define CSR_MPM_ICACHE_DREQ_ST_H 12'hB8D 
+`define CSR_MPM_ICACHE_PIPE_ST 12'hB0D // pipeline stalls +`define CSR_MPM_ICACHE_PIPE_ST_H 12'hB8D `define CSR_MPM_ICACHE_CRSP_ST 12'hB0E // core response stalls `define CSR_MPM_ICACHE_CRSP_ST_H 12'hB8E -`define CSR_MPM_ICACHE_MSHR_ST 12'hB0F // MSHR stalls -`define CSR_MPM_ICACHE_MSHR_ST_H 12'hB8F -`define CSR_MPM_ICACHE_PIPE_ST 12'hB10 // pipeline stalls -`define CSR_MPM_ICACHE_PIPE_ST_H 12'hB90 -`define CSR_MPM_ICACHE_READS 12'hB11 // total reads -`define CSR_MPM_ICACHE_READS_H 12'hB91 // PERF: dcache -`define CSR_MPM_DCACHE_MISS_R 12'hB12 // read misses -`define CSR_MPM_DCACHE_MISS_R_H 12'hB92 -`define CSR_MPM_DCACHE_MISS_W 12'hB13 // write misses -`define CSR_MPM_DCACHE_MISS_W_H 12'hB93 -`define CSR_MPM_DCACHE_DREQ_ST 12'hB14 // dram request stalls -`define CSR_MPM_DCACHE_DREQ_ST_H 12'hB94 -`define CSR_MPM_DCACHE_CRSP_ST 12'hB15 // core response stalls -`define CSR_MPM_DCACHE_CRSP_ST_H 12'hB95 -`define CSR_MPM_DCACHE_MSHR_ST 12'hB16 // MSHR stalls -`define CSR_MPM_DCACHE_MSHR_ST_H 12'hB96 -`define CSR_MPM_DCACHE_PIPE_ST 12'hB17 // pipeline stalls -`define CSR_MPM_DCACHE_PIPE_ST_H 12'hB97 -`define CSR_MPM_DCACHE_READS 12'hB18 // total reads -`define CSR_MPM_DCACHE_READS_H 12'hB98 -`define CSR_MPM_DCACHE_WRITES 12'hB19 // total writes -`define CSR_MPM_DCACHE_WRITES_H 12'hB99 -`define CSR_MPM_DCACHE_EVICTS 12'hB1A // total evictions -`define CSR_MPM_DCACHE_EVICTS_H 12'hB9A +`define CSR_MPM_DCACHE_READS 12'hB0F // total reads +`define CSR_MPM_DCACHE_READS_H 12'hB8F +`define CSR_MPM_DCACHE_WRITES 12'hB10 // total writes +`define CSR_MPM_DCACHE_WRITES_H 12'hB90 +`define CSR_MPM_DCACHE_MISS_R 12'hB11 // read misses +`define CSR_MPM_DCACHE_MISS_R_H 12'hB91 +`define CSR_MPM_DCACHE_MISS_W 12'hB12 // write misses +`define CSR_MPM_DCACHE_MISS_W_H 12'hB92 +`define CSR_MPM_DCACHE_BANK_ST 12'hB13 // bank conflicts stalls +`define CSR_MPM_DCACHE_BANK_ST_H 12'hB93 +`define CSR_MPM_DCACHE_MSHR_ST 12'hB14 // MSHR stalls +`define CSR_MPM_DCACHE_MSHR_ST_H 12'hB94 +`define 
CSR_MPM_DCACHE_PIPE_ST 12'hB15 // pipeline stalls +`define CSR_MPM_DCACHE_PIPE_ST_H 12'hB95 +`define CSR_MPM_DCACHE_CRSP_ST 12'hB16 // core response stalls +`define CSR_MPM_DCACHE_CRSP_ST_H 12'hB96 +// PERF: smem +`define CSR_MPM_SMEM_READS 12'hB17 // total reads +`define CSR_MPM_SMEM_READS_H 12'hB97 +`define CSR_MPM_SMEM_WRITES 12'hB18 // total writes +`define CSR_MPM_SMEM_WRITES_H 12'hB98 +`define CSR_MPM_SMEM_BANK_ST 12'hB19 // bank conflicts stalls +`define CSR_MPM_SMEM_BANK_ST_H 12'hB99 // PERF: memory -`define CSR_MPM_DRAM_LAT 12'hB1B // dram latency (total) -`define CSR_MPM_DRAM_LAT_H 12'hB9B -`define CSR_MPM_DRAM_REQ 12'hB1C // dram requests -`define CSR_MPM_DRAM_REQ_H 12'hB9C -`define CSR_MPM_DRAM_RSP 12'hB1D // dram responses -`define CSR_MPM_DRAM_RSP_H 12'hB9D +`define CSR_MPM_DRAM_READS 12'hB1A // dram reads +`define CSR_MPM_DRAM_READS_H 12'hB9A +`define CSR_MPM_DRAM_WRITES 12'hB1B // dram writes +`define CSR_MPM_DRAM_WRITES_H 12'hB9B +`define CSR_MPM_DRAM_ST 12'hB1C // dram request stalls +`define CSR_MPM_DRAM_ST_H 12'hB9C +`define CSR_MPM_DRAM_LAT 12'hB1D // dram latency (total) +`define CSR_MPM_DRAM_LAT_H 12'hB9D // Machine Information Registers `define CSR_MVENDORID 12'hF11 diff --git a/hw/rtl/VX_csr_data.v b/hw/rtl/VX_csr_data.v index 01cf93ee..4fe9de4e 100644 --- a/hw/rtl/VX_csr_data.v +++ b/hw/rtl/VX_csr_data.v @@ -121,63 +121,64 @@ module VX_csr_data #( `ifdef PERF_ENABLE // PERF: pipeline - `CSR_MPM_ICACHE_ST : read_data_r = perf_pipeline_if.icache_stalls[31:0]; - `CSR_MPM_ICACHE_ST_H : read_data_r = perf_pipeline_if.icache_stalls[63:32]; - `CSR_MPM_IBUF_ST : read_data_r = perf_pipeline_if.ibuffer_stalls[31:0]; - `CSR_MPM_IBUF_ST_H : read_data_r = perf_pipeline_if.ibuffer_stalls[63:32]; - `CSR_MPM_SCRB_ST : read_data_r = perf_pipeline_if.scoreboard_stalls[31:0]; - `CSR_MPM_SCRB_ST_H : read_data_r = perf_pipeline_if.scoreboard_stalls[63:32]; - `CSR_MPM_ALU_ST : read_data_r = perf_pipeline_if.alu_stalls[31:0]; - `CSR_MPM_ALU_ST_H : read_data_r = 
perf_pipeline_if.alu_stalls[63:32]; - `CSR_MPM_LSU_ST : read_data_r = perf_pipeline_if.lsu_stalls[31:0]; - `CSR_MPM_LSU_ST_H : read_data_r = perf_pipeline_if.lsu_stalls[63:32]; - `CSR_MPM_CSR_ST : read_data_r = perf_pipeline_if.csr_stalls[31:0]; - `CSR_MPM_CSR_ST_H : read_data_r = perf_pipeline_if.csr_stalls[63:32]; - `CSR_MPM_MUL_ST : read_data_r = perf_pipeline_if.mul_stalls[31:0]; - `CSR_MPM_MUL_ST_H : read_data_r = perf_pipeline_if.mul_stalls[63:32]; - `CSR_MPM_FPU_ST : read_data_r = perf_pipeline_if.fpu_stalls[31:0]; - `CSR_MPM_FPU_ST_H : read_data_r = perf_pipeline_if.fpu_stalls[63:32]; - `CSR_MPM_GPU_ST : read_data_r = perf_pipeline_if.gpu_stalls[31:0]; - `CSR_MPM_GPU_ST_H : read_data_r = perf_pipeline_if.gpu_stalls[63:32]; + `CSR_MPM_IBUF_ST : read_data_r = perf_pipeline_if.ibf_stalls[31:0]; + `CSR_MPM_IBUF_ST_H : read_data_r = perf_pipeline_if.ibf_stalls[63:32]; + `CSR_MPM_SCRB_ST : read_data_r = perf_pipeline_if.scb_stalls[31:0]; + `CSR_MPM_SCRB_ST_H : read_data_r = perf_pipeline_if.scb_stalls[63:32]; + `CSR_MPM_ALU_ST : read_data_r = perf_pipeline_if.alu_stalls[31:0]; + `CSR_MPM_ALU_ST_H : read_data_r = perf_pipeline_if.alu_stalls[63:32]; + `CSR_MPM_LSU_ST : read_data_r = perf_pipeline_if.lsu_stalls[31:0]; + `CSR_MPM_LSU_ST_H : read_data_r = perf_pipeline_if.lsu_stalls[63:32]; + `CSR_MPM_CSR_ST : read_data_r = perf_pipeline_if.csr_stalls[31:0]; + `CSR_MPM_CSR_ST_H : read_data_r = perf_pipeline_if.csr_stalls[63:32]; + `CSR_MPM_MUL_ST : read_data_r = perf_pipeline_if.mul_stalls[31:0]; + `CSR_MPM_MUL_ST_H : read_data_r = perf_pipeline_if.mul_stalls[63:32]; + `CSR_MPM_FPU_ST : read_data_r = perf_pipeline_if.fpu_stalls[31:0]; + `CSR_MPM_FPU_ST_H : read_data_r = perf_pipeline_if.fpu_stalls[63:32]; + `CSR_MPM_GPU_ST : read_data_r = perf_pipeline_if.gpu_stalls[31:0]; + `CSR_MPM_GPU_ST_H : read_data_r = perf_pipeline_if.gpu_stalls[63:32]; // PERF: icache - `CSR_MPM_ICACHE_MISS_R : read_data_r = perf_memsys_if.icache_read_misses[31:0]; - `CSR_MPM_ICACHE_MISS_R_H : 
read_data_r = perf_memsys_if.icache_read_misses[63:32]; - `CSR_MPM_ICACHE_DREQ_ST : read_data_r = perf_memsys_if.icache_dreq_stalls[31:0]; - `CSR_MPM_ICACHE_DREQ_ST_H : read_data_r = perf_memsys_if.icache_dreq_stalls[63:32]; - `CSR_MPM_ICACHE_CRSP_ST : read_data_r = perf_memsys_if.icache_crsp_stalls[31:0]; - `CSR_MPM_ICACHE_CRSP_ST_H : read_data_r = perf_memsys_if.icache_crsp_stalls[63:32]; - `CSR_MPM_ICACHE_MSHR_ST : read_data_r = perf_memsys_if.icache_mshr_stalls[31:0]; - `CSR_MPM_ICACHE_MSHR_ST_H : read_data_r = perf_memsys_if.icache_mshr_stalls[63:32]; - `CSR_MPM_ICACHE_PIPE_ST : read_data_r = perf_memsys_if.icache_pipe_stalls[31:0]; - `CSR_MPM_ICACHE_PIPE_ST_H : read_data_r = perf_memsys_if.icache_pipe_stalls[63:32]; `CSR_MPM_ICACHE_READS : read_data_r = perf_memsys_if.icache_reads[31:0]; `CSR_MPM_ICACHE_READS_H : read_data_r = perf_memsys_if.icache_reads[63:32]; - // PERF: dcache - `CSR_MPM_DCACHE_MISS_R : read_data_r = perf_memsys_if.dcache_read_misses[31:0]; - `CSR_MPM_DCACHE_MISS_R_H : read_data_r = perf_memsys_if.dcache_read_misses[63:32]; - `CSR_MPM_DCACHE_MISS_W : read_data_r = perf_memsys_if.dcache_write_misses[31:0]; - `CSR_MPM_DCACHE_MISS_W_H : read_data_r = perf_memsys_if.dcache_write_misses[63:32]; - `CSR_MPM_DCACHE_DREQ_ST : read_data_r = perf_memsys_if.dcache_dreq_stalls[31:0]; - `CSR_MPM_DCACHE_DREQ_ST_H : read_data_r = perf_memsys_if.dcache_dreq_stalls[63:32]; - `CSR_MPM_DCACHE_CRSP_ST : read_data_r = perf_memsys_if.dcache_crsp_stalls[31:0]; - `CSR_MPM_DCACHE_CRSP_ST_H : read_data_r = perf_memsys_if.dcache_crsp_stalls[63:32]; - `CSR_MPM_DCACHE_MSHR_ST : read_data_r = perf_memsys_if.dcache_mshr_stalls[31:0]; - `CSR_MPM_DCACHE_MSHR_ST_H : read_data_r = perf_memsys_if.dcache_mshr_stalls[63:32]; - `CSR_MPM_DCACHE_PIPE_ST : read_data_r = perf_memsys_if.dcache_pipe_stalls[31:0]; - `CSR_MPM_DCACHE_PIPE_ST_H : read_data_r = perf_memsys_if.dcache_pipe_stalls[63:32]; + `CSR_MPM_ICACHE_MISS_R : read_data_r = perf_memsys_if.icache_read_misses[31:0]; + 
`CSR_MPM_ICACHE_MISS_R_H : read_data_r = perf_memsys_if.icache_read_misses[63:32]; + `CSR_MPM_ICACHE_PIPE_ST : read_data_r = perf_memsys_if.icache_pipe_stalls[31:0]; + `CSR_MPM_ICACHE_PIPE_ST_H : read_data_r = perf_memsys_if.icache_pipe_stalls[63:32]; + `CSR_MPM_ICACHE_CRSP_ST : read_data_r = perf_memsys_if.icache_crsp_stalls[31:0]; + `CSR_MPM_ICACHE_CRSP_ST_H : read_data_r = perf_memsys_if.icache_crsp_stalls[63:32]; + // PERF: dcache `CSR_MPM_DCACHE_READS : read_data_r = perf_memsys_if.dcache_reads[31:0]; `CSR_MPM_DCACHE_READS_H : read_data_r = perf_memsys_if.dcache_reads[63:32]; `CSR_MPM_DCACHE_WRITES : read_data_r = perf_memsys_if.dcache_writes[31:0]; `CSR_MPM_DCACHE_WRITES_H : read_data_r = perf_memsys_if.dcache_writes[63:32]; - `CSR_MPM_DCACHE_EVICTS : read_data_r = perf_memsys_if.dcache_evictions[31:0]; - `CSR_MPM_DCACHE_EVICTS_H : read_data_r = perf_memsys_if.dcache_evictions[63:32]; - // PERF: memory - `CSR_MPM_DRAM_LAT : read_data_r = perf_memsys_if.dram_latency[31:0]; - `CSR_MPM_DRAM_LAT_H : read_data_r = perf_memsys_if.dram_latency[63:32]; - `CSR_MPM_DRAM_REQ : read_data_r = perf_memsys_if.dram_requests[31:0]; - `CSR_MPM_DRAM_REQ_H : read_data_r = perf_memsys_if.dram_requests[63:32]; - `CSR_MPM_DRAM_RSP : read_data_r = perf_memsys_if.dram_responses[31:0]; - `CSR_MPM_DRAM_RSP_H : read_data_r = perf_memsys_if.dram_responses[63:32]; + `CSR_MPM_DCACHE_MISS_R : read_data_r = perf_memsys_if.dcache_read_misses[31:0]; + `CSR_MPM_DCACHE_MISS_R_H : read_data_r = perf_memsys_if.dcache_read_misses[63:32]; + `CSR_MPM_DCACHE_MISS_W : read_data_r = perf_memsys_if.dcache_write_misses[31:0]; + `CSR_MPM_DCACHE_MISS_W_H : read_data_r = perf_memsys_if.dcache_write_misses[63:32]; + `CSR_MPM_DCACHE_BANK_ST : read_data_r = perf_memsys_if.dcache_bank_stalls[31:0]; + `CSR_MPM_DCACHE_BANK_ST_H : read_data_r = perf_memsys_if.dcache_bank_stalls[63:32]; + `CSR_MPM_DCACHE_MSHR_ST : read_data_r = perf_memsys_if.dcache_mshr_stalls[31:0]; + `CSR_MPM_DCACHE_MSHR_ST_H : read_data_r = 
perf_memsys_if.dcache_mshr_stalls[63:32]; + `CSR_MPM_DCACHE_PIPE_ST : read_data_r = perf_memsys_if.dcache_pipe_stalls[31:0]; + `CSR_MPM_DCACHE_PIPE_ST_H : read_data_r = perf_memsys_if.dcache_pipe_stalls[63:32]; + `CSR_MPM_DCACHE_CRSP_ST : read_data_r = perf_memsys_if.dcache_crsp_stalls[31:0]; + `CSR_MPM_DCACHE_CRSP_ST_H : read_data_r = perf_memsys_if.dcache_crsp_stalls[63:32]; + // PERF: smem + `CSR_MPM_SMEM_READS : read_data_r = perf_memsys_if.smem_reads[31:0]; + `CSR_MPM_SMEM_READS_H : read_data_r = perf_memsys_if.smem_reads[63:32]; + `CSR_MPM_SMEM_WRITES : read_data_r = perf_memsys_if.smem_writes[31:0]; + `CSR_MPM_SMEM_WRITES_H : read_data_r = perf_memsys_if.smem_writes[63:32]; + `CSR_MPM_SMEM_BANK_ST : read_data_r = perf_memsys_if.smem_bank_stalls[31:0]; + `CSR_MPM_SMEM_BANK_ST_H : read_data_r = perf_memsys_if.smem_bank_stalls[63:32]; + // PERF: DRAM + `CSR_MPM_DRAM_READS : read_data_r = perf_memsys_if.dram_reads[31:0]; + `CSR_MPM_DRAM_READS_H : read_data_r = perf_memsys_if.dram_reads[63:32]; + `CSR_MPM_DRAM_WRITES : read_data_r = perf_memsys_if.dram_writes[31:0]; + `CSR_MPM_DRAM_WRITES_H : read_data_r = perf_memsys_if.dram_writes[63:32]; + `CSR_MPM_DRAM_ST : read_data_r = perf_memsys_if.dram_stalls[31:0]; + `CSR_MPM_DRAM_ST_H : read_data_r = perf_memsys_if.dram_stalls[63:32]; + `CSR_MPM_DRAM_LAT : read_data_r = perf_memsys_if.dram_latency[31:0]; + `CSR_MPM_DRAM_LAT_H : read_data_r = perf_memsys_if.dram_latency[63:32]; `endif `CSR_SATP : read_data_r = 32'(csr_satp); diff --git a/hw/rtl/VX_issue.v b/hw/rtl/VX_issue.v index 80b994ea..a25ed23b 100644 --- a/hw/rtl/VX_issue.v +++ b/hw/rtl/VX_issue.v @@ -123,18 +123,77 @@ module VX_issue #( `SCOPE_ASSIGN (writeback_data, writeback_if.data); `ifdef PERF_ENABLE - reg [63:0] perf_scoreboard_stalls; + reg [63:0] perf_ibf_stalls ; + reg [63:0] perf_scb_stalls ; + reg [63:0] perf_alu_stalls; + reg [63:0] perf_lsu_stalls; + reg [63:0] perf_csr_stalls; + reg [63:0] perf_gpu_stalls; +`ifdef EXT_M_ENABLE + reg [63:0] 
perf_mul_stalls; +`endif +`ifdef EXT_F_ENABLE + reg [63:0] perf_fpu_stalls; +`endif + always @(posedge clk) begin if (reset) begin - perf_scoreboard_stalls <= 0; + perf_ibf_stalls <= 0; + perf_scb_stalls <= 0; + perf_alu_stalls <= 0; + perf_lsu_stalls <= 0; + perf_csr_stalls <= 0; + perf_gpu_stalls <= 0; + `ifdef EXT_M_ENABLE + perf_mul_stalls <= 0; + `endif + `ifdef EXT_F_ENABLE + perf_fpu_stalls <= 0; + `endif end else begin - // scoreboard_stall - if (ibuf_deq_if.valid & scoreboard_delay) begin - perf_scoreboard_stalls <= perf_scoreboard_stalls + 64'd1; + if (decode_if.valid & !decode_if.ready) begin + perf_ibf_stalls <= perf_ibf_stalls + 64'd1; end + if (ibuf_deq_if.valid & scoreboard_delay) begin + perf_scb_stalls <= perf_scb_stalls + 64'd1; + end + if (alu_req_if.valid & !alu_req_if.ready) begin + perf_alu_stalls <= perf_alu_stalls + 64'd1; + end + if (lsu_req_if.valid & !lsu_req_if.ready) begin + perf_lsu_stalls <= perf_lsu_stalls + 64'd1; + end + if (csr_req_if.valid & !csr_req_if.ready) begin + perf_csr_stalls <= perf_csr_stalls + 64'd1; + end + if (gpu_req_if.valid & !gpu_req_if.ready) begin + perf_gpu_stalls <= perf_gpu_stalls + 64'd1; + end + `ifdef EXT_M_ENABLE + if (mul_req_if.valid & !mul_req_if.ready) begin + perf_mul_stalls <= perf_mul_stalls + 64'd1; + end + `endif + `ifdef EXT_F_ENABLE + if (fpu_req_if.valid & !fpu_req_if.ready) begin + perf_fpu_stalls <= perf_fpu_stalls + 64'd1; + end + `endif end end - assign perf_pipeline_if.scoreboard_stalls = perf_scoreboard_stalls; + + assign perf_pipeline_if.ibf_stalls = perf_ibf_stalls; + assign perf_pipeline_if.scb_stalls = perf_scb_stalls; + assign perf_pipeline_if.alu_stalls = perf_alu_stalls; + assign perf_pipeline_if.lsu_stalls = perf_lsu_stalls; + assign perf_pipeline_if.csr_stalls = perf_csr_stalls; + assign perf_pipeline_if.gpu_stalls = perf_gpu_stalls; +`ifdef EXT_M_ENABLE + assign perf_pipeline_if.mul_stalls = perf_mul_stalls; +`endif +`ifdef EXT_F_ENABLE + assign perf_pipeline_if.fpu_stalls = 
perf_fpu_stalls; +`endif `endif `ifdef DBG_PRINT_PIPELINE diff --git a/hw/rtl/VX_mem_unit.v b/hw/rtl/VX_mem_unit.v index d500b6bc..cc8e4864 100644 --- a/hw/rtl/VX_mem_unit.v +++ b/hw/rtl/VX_mem_unit.v @@ -363,60 +363,72 @@ module VX_mem_unit # ( `ifdef PERF_ENABLE - assign perf_memsys_if.icache_reads = perf_icache_if.reads; + assign perf_memsys_if.icache_reads = perf_icache_if.reads; assign perf_memsys_if.icache_read_misses = perf_icache_if.read_misses; - assign perf_memsys_if.icache_mshr_stalls = perf_icache_if.mshr_stalls; - assign perf_memsys_if.icache_crsp_stalls = perf_icache_if.crsp_stalls; - assign perf_memsys_if.icache_dreq_stalls = perf_icache_if.dreq_stalls; assign perf_memsys_if.icache_pipe_stalls = perf_icache_if.pipe_stalls; + assign perf_memsys_if.icache_crsp_stalls = perf_icache_if.crsp_stalls; - assign perf_memsys_if.dcache_reads = perf_dcache_if.reads; - assign perf_memsys_if.dcache_writes = perf_dcache_if.writes; + assign perf_memsys_if.dcache_reads = perf_dcache_if.reads; + assign perf_memsys_if.dcache_writes = perf_dcache_if.writes; assign perf_memsys_if.dcache_read_misses = perf_dcache_if.read_misses; - assign perf_memsys_if.dcache_write_misses = perf_dcache_if.write_misses; - assign perf_memsys_if.dcache_evictions = perf_dcache_if.evictions; - assign perf_memsys_if.dcache_mshr_stalls = perf_dcache_if.mshr_stalls; - assign perf_memsys_if.dcache_crsp_stalls = perf_dcache_if.crsp_stalls; - assign perf_memsys_if.dcache_dreq_stalls = perf_dcache_if.dreq_stalls; + assign perf_memsys_if.dcache_write_misses= perf_dcache_if.write_misses; + assign perf_memsys_if.dcache_bank_stalls = perf_dcache_if.bank_stalls; + assign perf_memsys_if.dcache_mshr_stalls = perf_dcache_if.mshr_stalls; assign perf_memsys_if.dcache_pipe_stalls = perf_dcache_if.pipe_stalls; + assign perf_memsys_if.dcache_crsp_stalls = perf_dcache_if.crsp_stalls; + +if (`SM_ENABLE) begin + assign perf_memsys_if.smem_reads = perf_smem_if.reads; + assign perf_memsys_if.smem_writes = 
perf_smem_if.writes; + assign perf_memsys_if.smem_bank_stalls = perf_smem_if.bank_stalls; +end else begin + assign perf_memsys_if.smem_reads = 0; + assign perf_memsys_if.smem_writes = 0; + assign perf_memsys_if.smem_bank_stalls = 0; +end reg [63:0] perf_dram_lat_per_cycle; always @(posedge clk) begin if (reset) begin perf_dram_lat_per_cycle <= 0; - end else begin - if (dram_req_if.valid & (~dram_req_if.rw) & dram_req_if.ready & dram_rsp_if.valid & dram_rsp_if.ready) begin + end else begin + if (dram_req_if.valid && !dram_req_if.rw && dram_req_if.ready && dram_rsp_if.valid && dram_rsp_if.ready) begin perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle; - end else if (dram_req_if.valid & (~dram_req_if.rw) & dram_req_if.ready) begin + end else if (dram_req_if.valid && !dram_req_if.rw && dram_req_if.ready) begin perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle + 64'd1; - end else if (dram_rsp_if.valid & dram_rsp_if.ready) begin + end else if (dram_rsp_if.valid && dram_rsp_if.ready) begin perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle - 64'd1; end end end - reg [63:0] perf_dram_req, perf_dram_rsp, perf_dram_lat; + reg [63:0] perf_dram_reads, perf_dram_writes, perf_dram_lat, perf_dram_stalls; always @(posedge clk) begin if (reset) begin - perf_dram_req <= 0; - perf_dram_rsp <= 0; - perf_dram_lat <= 0; - end else begin - if (dram_req_if.valid & dram_req_if.ready) begin - perf_dram_req <= perf_dram_req + 64'd1; - end - if (dram_rsp_if.valid & dram_rsp_if.ready) begin - perf_dram_rsp <= perf_dram_rsp + 64'd1; + perf_dram_reads <= 0; + perf_dram_writes <= 0; + perf_dram_lat <= 0; + perf_dram_stalls <= 0; + end else begin + if (dram_req_if.valid && dram_req_if.ready && !dram_req_if.rw) begin + perf_dram_reads <= perf_dram_reads + 64'd1; end + if (dram_req_if.valid && dram_req_if.ready && dram_req_if.rw) begin + perf_dram_writes <= perf_dram_writes + 64'd1; + end + if (dram_req_if.valid && !dram_req_if.ready) begin + perf_dram_stalls <= perf_dram_stalls + 64'd1; + end 
perf_dram_lat <= perf_dram_lat + perf_dram_lat_per_cycle; end end - assign perf_memsys_if.dram_requests = perf_dram_req; - assign perf_memsys_if.dram_responses = perf_dram_rsp; - assign perf_memsys_if.dram_latency = perf_dram_lat; + assign perf_memsys_if.dram_reads = perf_dram_reads; + assign perf_memsys_if.dram_writes = perf_dram_writes; + assign perf_memsys_if.dram_latency = perf_dram_lat; + assign perf_memsys_if.dram_stalls = perf_dram_stalls; `endif endmodule diff --git a/hw/rtl/VX_pipeline.v b/hw/rtl/VX_pipeline.v index 8d61f42e..47ea5876 100644 --- a/hw/rtl/VX_pipeline.v +++ b/hw/rtl/VX_pipeline.v @@ -288,79 +288,5 @@ module VX_pipeline #( .writeback_if (writeback_if), .cmt_to_csr_if (cmt_to_csr_if) ); - -`ifdef PERF_ENABLE - reg [63:0] perf_icache_stalls; - reg [63:0] perf_ibuffer_stalls; - reg [63:0] perf_alu_stalls; - reg [63:0] perf_lsu_stalls; - reg [63:0] perf_csr_stalls; - reg [63:0] perf_gpu_stalls; -`ifdef EXT_M_ENABLE - reg [63:0] perf_mul_stalls; -`endif -`ifdef EXT_F_ENABLE - reg [63:0] perf_fpu_stalls; -`endif - - always @(posedge clk) begin - if (reset) begin - perf_icache_stalls <= 0; - perf_ibuffer_stalls <= 0; - perf_alu_stalls <= 0; - perf_lsu_stalls <= 0; - perf_csr_stalls <= 0; - perf_gpu_stalls <= 0; - `ifdef EXT_M_ENABLE - perf_mul_stalls <= 0; - `endif - `ifdef EXT_F_ENABLE - perf_fpu_stalls <= 0; - `endif - end else begin - if (core_icache_req_if.valid & !core_icache_req_if.ready) begin - perf_icache_stalls <= perf_icache_stalls + 64'd1; - end - if (decode_if.valid & !decode_if.ready) begin - perf_ibuffer_stalls <= perf_ibuffer_stalls + 64'd1; - end - if (alu_req_if.valid & !alu_req_if.ready) begin - perf_alu_stalls <= perf_alu_stalls + 64'd1; - end - if (lsu_req_if.valid & !lsu_req_if.ready) begin - perf_lsu_stalls <= perf_lsu_stalls + 64'd1; - end - if (csr_req_if.valid & !csr_req_if.ready) begin - perf_csr_stalls <= perf_csr_stalls + 64'd1; - end - if (gpu_req_if.valid & !gpu_req_if.ready) begin - perf_gpu_stalls <= perf_gpu_stalls 
+ 64'd1; - end - `ifdef EXT_M_ENABLE - if (mul_req_if.valid & !mul_req_if.ready) begin - perf_mul_stalls <= perf_mul_stalls + 64'd1; - end - `endif - `ifdef EXT_F_ENABLE - if (fpu_req_if.valid & !fpu_req_if.ready) begin - perf_fpu_stalls <= perf_fpu_stalls + 64'd1; - end - `endif - end - end - - assign perf_pipeline_if.icache_stalls = perf_icache_stalls; - assign perf_pipeline_if.ibuffer_stalls = perf_ibuffer_stalls; - assign perf_pipeline_if.alu_stalls = perf_alu_stalls; - assign perf_pipeline_if.lsu_stalls = perf_lsu_stalls; - assign perf_pipeline_if.csr_stalls = perf_csr_stalls; - assign perf_pipeline_if.gpu_stalls = perf_gpu_stalls; -`ifdef EXT_M_ENABLE - assign perf_pipeline_if.mul_stalls = perf_mul_stalls; -`endif -`ifdef EXT_F_ENABLE - assign perf_pipeline_if.fpu_stalls = perf_fpu_stalls; -`endif -`endif endmodule diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index 6a2a4d9c..8602d7f5 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -98,11 +98,10 @@ module VX_bank #( input wire snp_rsp_ready, `ifdef PERF_ENABLE - output wire perf_mshr_stall, - output wire perf_pipe_stall, - output wire perf_evict, - output wire perf_read_miss, - output wire perf_write_miss, + output wire perf_read_misses, + output wire perf_write_misses, + output wire perf_mshr_stalls, + output wire perf_pipe_stalls, `endif // Misses @@ -335,7 +334,7 @@ module VX_bank #( wire dreq_push_stall; wire srsq_push_stall; wire pipeline_stall; - + wire is_mshr_miss_st2 = valid_st2 && is_mshr_st2 && (miss_st2 || force_miss_st2); wire is_mshr_miss_st3 = valid_st3 && is_mshr_st3 && (miss_st3 || force_miss_st3); @@ -938,15 +937,10 @@ end `SCOPE_ASSIGN (addr_st3, `LINE_TO_BYTE_ADDR(addr_st3, BANK_ID)); `ifdef PERF_ENABLE - assign perf_pipe_stall = pipeline_stall; - assign perf_mshr_stall = mshr_going_full; - assign perf_read_miss = !pipeline_stall & miss_st2 & !is_mshr_st2 & !mem_rw_st2; - assign perf_write_miss = !pipeline_stall & miss_st2 & !is_mshr_st2 & mem_rw_st2; - 
if (DRAM_ENABLE) begin - assign perf_evict = dreq_push & do_writeback_st3 & !is_snp_st3; - end else begin - assign perf_evict = 0; - end + assign perf_read_misses = !pipeline_stall && miss_st2 && !is_mshr_st2 && !mem_rw_st2; + assign perf_write_misses = !pipeline_stall && miss_st2 && !is_mshr_st2 && mem_rw_st2; + assign perf_mshr_stalls = mshr_going_full; + assign perf_pipe_stalls = pipeline_stall || mshr_going_full; `endif `ifdef DBG_PRINT_CACHE_BANK diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index c3faec40..a9cead6a 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -134,15 +134,13 @@ module VX_cache #( wire [NUM_BANKS-1:0] per_bank_snp_rsp_ready; wire [NUM_BANKS-1:0] per_bank_miss; - assign miss_vec = per_bank_miss; - + assign miss_vec = per_bank_miss; `ifdef PERF_ENABLE - wire [NUM_BANKS-1:0] perf_mshr_stall_per_bank; - wire [NUM_BANKS-1:0] perf_pipe_stall_per_bank; - wire [NUM_BANKS-1:0] perf_evict_per_bank; wire [NUM_BANKS-1:0] perf_read_miss_per_bank; wire [NUM_BANKS-1:0] perf_write_miss_per_bank; + wire [NUM_BANKS-1:0] perf_mshr_stall_per_bank; + wire [NUM_BANKS-1:0] perf_pipe_stall_per_bank; `endif if (NUM_BANKS == 1) begin @@ -156,13 +154,20 @@ module VX_cache #( .NUM_BANKS (NUM_BANKS), .WORD_SIZE (WORD_SIZE), .NUM_REQS (NUM_REQS) - ) cache_core_req_bank_sel ( + ) cache_core_req_bank_sel ( + .clk (clk), + .reset (reset), + `ifdef PERF_ENABLE + .bank_stalls (perf_cache_if.bank_stalls), + `else + `UNUSED_PIN (bank_stalls), + `endif .core_req_valid (core_req_valid), .core_req_addr (core_req_addr), .core_req_ready (core_req_ready), .per_bank_valid (per_bank_core_req_valid), .per_bank_tid (per_bank_core_req_tid), - .per_bank_ready (per_bank_core_req_ready) + .per_bank_ready (per_bank_core_req_ready) ); assign dram_req_tag = dram_req_addr; @@ -297,7 +302,8 @@ module VX_cache #( `SCOPE_BIND_VX_cache_bank(i) .clk (clk), - .reset (reset), + .reset (reset), + // Core request .core_req_valid (curr_bank_core_req_valid), 
.core_req_tid (curr_bank_core_req_tid), @@ -330,11 +336,10 @@ module VX_cache #( .dram_rsp_ready (curr_bank_dram_rsp_ready), `ifdef PERF_ENABLE - .perf_mshr_stall (perf_mshr_stall_per_bank[i]), - .perf_pipe_stall (perf_pipe_stall_per_bank[i]), - .perf_evict (perf_evict_per_bank[i]), - .perf_read_miss (perf_read_miss_per_bank[i]), - .perf_write_miss (perf_write_miss_per_bank[i]), + .perf_read_misses (perf_read_miss_per_bank[i]), + .perf_write_misses (perf_write_miss_per_bank[i]), + .perf_mshr_stalls (perf_mshr_stall_per_bank[i]), + .perf_pipe_stalls (perf_pipe_stall_per_bank[i]), `endif // Snoop request @@ -434,47 +439,33 @@ module VX_cache #( end `ifdef PERF_ENABLE - // per cycle: core_req_r, core_req_w - reg [($clog2(NUM_REQS+1)-1):0] perf_core_req_r_per_cycle, perf_core_req_w_per_cycle; + // per cycle: core_reads, core_writes + reg [($clog2(NUM_REQS+1)-1):0] perf_core_reads_per_cycle, perf_core_writes_per_cycle; reg [($clog2(NUM_REQS+1)-1):0] perf_crsp_stall_per_cycle; + VX_countones #( + .N(NUM_REQS) + ) perf_countones_core_reads_count ( + .valids (core_req_valid & core_req_ready & ~core_req_rw), + .count (perf_core_reads_per_cycle) + ); + + VX_countones #( + .N(NUM_REQS) + ) perf_countones_core_writes_count ( + .valids (core_req_valid & core_req_ready & core_req_rw), + .count (perf_core_writes_per_cycle) + ); + if (CORE_TAG_ID_BITS != 0) begin - VX_countones #( // core_req_r - .N(NUM_REQS) - ) perf_countones_core_req_r_count ( - .valids (core_req_valid & {NUM_REQS{core_req_ready & ~core_req_rw}}), - .count (perf_core_req_r_per_cycle) - ); - - VX_countones #( // core_req_w - .N(NUM_REQS) - ) perf_countones_core_req_w_count ( - .valids (core_req_valid & {NUM_REQS{core_req_ready & core_req_rw}}), - .count (perf_core_req_w_per_cycle) - ); - - VX_countones #( // core_rsp + VX_countones #( .N(NUM_REQS) ) perf_countones_core_rsp_count ( .valids (core_rsp_valid & {NUM_REQS{!core_rsp_ready}}), .count (perf_crsp_stall_per_cycle) ); end else begin - VX_countones #( // 
core_req_r - .N(NUM_REQS) - ) perf_countones_core_req_r_count ( - .valids (core_req_valid & core_req_ready & ~core_req_rw), - .count (perf_core_req_r_per_cycle) - ); - - VX_countones #( // core_req_w - .N(NUM_REQS) - ) perf_countones_core_req_w_count ( - .valids (core_req_valid & core_req_ready & core_req_rw), - .count (perf_core_req_w_per_cycle) - ); - - VX_countones #( // core_rsp + VX_countones #( .N(NUM_REQS) ) perf_countones_core_rsp_count ( .valids (core_rsp_valid & ~core_rsp_ready), @@ -482,33 +473,11 @@ module VX_cache #( ); end - // per cycle: msrq stalls, pipeline stalls, evictions, read misses, write misses - reg [($clog2(NUM_BANKS+1)-1):0] perf_mshr_stall_per_cycle; - reg [($clog2(NUM_BANKS+1)-1):0] perf_pipe_stall_per_cycle; - reg [($clog2(NUM_BANKS+1)-1):0] perf_evictions_per_cycle; + // per cycle: read misses, write misses, msrq stalls, pipeline stalls reg [($clog2(NUM_BANKS+1)-1):0] perf_read_miss_per_cycle; reg [($clog2(NUM_BANKS+1)-1):0] perf_write_miss_per_cycle; - - VX_countones #( - .N(NUM_BANKS) - ) perf_countones_mshr_stall_count ( - .valids (perf_mshr_stall_per_bank), - .count (perf_mshr_stall_per_cycle) - ); - - VX_countones #( - .N(NUM_BANKS) - ) perf_countones_total_stall_count ( - .valids (perf_pipe_stall_per_bank), - .count (perf_pipe_stall_per_cycle) - ); - - VX_countones #( - .N(NUM_BANKS) - ) perf_countones_EVICTSict_count ( - .valids (perf_evict_per_bank), - .count (perf_evictions_per_cycle) - ); + reg [($clog2(NUM_BANKS+1)-1):0] perf_mshr_stall_per_cycle; + reg [($clog2(NUM_BANKS+1)-1):0] perf_pipe_stall_per_cycle; VX_countones #( .N(NUM_BANKS) @@ -524,59 +493,55 @@ module VX_cache #( .count (perf_write_miss_per_cycle) ); - reg [63:0] perf_core_req_r; - reg [63:0] perf_core_req_w; - reg [63:0] perf_mshr_stall; - reg [63:0] perf_pipe_stall; - reg [63:0] perf_evictions; - reg [63:0] perf_read_miss; - reg [63:0] perf_write_miss; - reg [63:0] perf_crsp_stall; - reg [63:0] perf_dreq_stall; + VX_countones #( + .N(NUM_BANKS) + ) 
perf_countones_mshr_stall_count ( + .valids (perf_mshr_stall_per_bank), + .count (perf_mshr_stall_per_cycle) + ); + + VX_countones #( + .N(NUM_BANKS) + ) perf_countones_total_stall_count ( + .valids (perf_pipe_stall_per_bank), + .count (perf_pipe_stall_per_cycle) + ); + + reg [63:0] perf_core_reads; + reg [63:0] perf_core_writes; + reg [63:0] perf_read_misses; + reg [63:0] perf_write_misses; + reg [63:0] perf_mshr_stalls; + reg [63:0] perf_pipe_stalls; + reg [63:0] perf_crsp_stalls; always @(posedge clk) begin if (reset) begin - perf_core_req_r <= 0; - perf_core_req_w <= 0; - perf_crsp_stall <= 0; - perf_mshr_stall <= 0; - perf_pipe_stall <= 0; - perf_evictions <= 0; - perf_read_miss <= 0; - perf_write_miss <= 0; - perf_dreq_stall <= 0; + perf_core_reads <= 0; + perf_core_writes <= 0; + perf_read_misses <= 0; + perf_write_misses <= 0; + perf_mshr_stalls <= 0; + perf_pipe_stalls <= 0; + perf_crsp_stalls <= 0; end else begin - // core requests - perf_core_req_r <= perf_core_req_r + $bits(perf_core_req_r)'(perf_core_req_r_per_cycle); - perf_core_req_w <= perf_core_req_w + $bits(perf_core_req_w)'(perf_core_req_w_per_cycle); - // core response stalls - perf_crsp_stall <= perf_crsp_stall + $bits(perf_crsp_stall)'(perf_crsp_stall_per_cycle); - // miss reserve queue stalls - perf_mshr_stall <= perf_mshr_stall + $bits(perf_mshr_stall)'(perf_mshr_stall_per_cycle); - // pipeline stalls - perf_pipe_stall <= perf_pipe_stall + $bits(perf_pipe_stall)'(perf_pipe_stall_per_cycle); - // total evictions - perf_evictions <= perf_evictions + $bits(perf_evictions)'(perf_evictions_per_cycle); - // read misses - perf_read_miss <= perf_read_miss + $bits(perf_read_miss)'(perf_read_miss_per_cycle); - // write misses - perf_write_miss <= perf_write_miss + $bits(perf_write_miss)'(perf_write_miss_per_cycle); - // dram request stalls - if (dram_req_valid & !dram_req_ready) begin - perf_dreq_stall <= perf_dreq_stall + 64'd1; - end + perf_core_reads <= perf_core_reads + 
64'(perf_core_reads_per_cycle); + perf_core_writes <= perf_core_writes + 64'(perf_core_writes_per_cycle); + perf_read_misses <= perf_read_misses + 64'(perf_read_miss_per_cycle); + perf_write_misses <= perf_write_misses + 64'(perf_write_miss_per_cycle); + perf_mshr_stalls <= perf_mshr_stalls + 64'(perf_mshr_stall_per_cycle); + perf_pipe_stalls <= perf_pipe_stalls + 64'(perf_pipe_stall_per_cycle); + perf_crsp_stalls <= perf_crsp_stalls + 64'(perf_crsp_stall_per_cycle); end end - assign perf_cache_if.reads = perf_core_req_r; - assign perf_cache_if.writes = perf_core_req_w; - assign perf_cache_if.read_misses = perf_read_miss; - assign perf_cache_if.write_misses = perf_write_miss; - assign perf_cache_if.evictions = perf_evictions; - assign perf_cache_if.mshr_stalls = perf_mshr_stall; - assign perf_cache_if.pipe_stalls = perf_pipe_stall; - assign perf_cache_if.crsp_stalls = perf_crsp_stall; - assign perf_cache_if.dreq_stalls = perf_dreq_stall; + assign perf_cache_if.reads = perf_core_reads; + assign perf_cache_if.writes = perf_core_writes; + assign perf_cache_if.read_misses = perf_read_misses; + assign perf_cache_if.write_misses = perf_write_misses; + assign perf_cache_if.mshr_stalls = perf_mshr_stalls; + assign perf_cache_if.pipe_stalls = perf_pipe_stalls; + assign perf_cache_if.crsp_stalls = perf_crsp_stalls; `endif endmodule diff --git a/hw/rtl/cache/VX_cache_core_req_bank_sel.v b/hw/rtl/cache/VX_cache_core_req_bank_sel.v index f3454d33..960f13cc 100644 --- a/hw/rtl/cache/VX_cache_core_req_bank_sel.v +++ b/hw/rtl/cache/VX_cache_core_req_bank_sel.v @@ -10,17 +10,21 @@ module VX_cache_core_req_bank_sel #( // Number of Word requests per cycle parameter NUM_REQS = 1 ) ( + input wire clk, + input wire reset, input wire [NUM_REQS-1:0] core_req_valid, input wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr, output wire [NUM_REQS-1:0] core_req_ready, output wire [NUM_BANKS-1:0] per_bank_valid, output wire [NUM_BANKS-1:0][`REQS_BITS-1:0] per_bank_tid, - input wire 
[NUM_BANKS-1:0] per_bank_ready + input wire [NUM_BANKS-1:0] per_bank_ready, + output wire [63:0] bank_stalls ); - if (NUM_BANKS > 1) begin + if (NUM_BANKS > 1) begin reg [NUM_BANKS-1:0] per_bank_valid_r; reg [NUM_BANKS-1:0][`REQS_BITS-1:0] per_bank_tid_r; reg [NUM_REQS-1:0] core_req_ready_r; + reg [NUM_REQS-1:0] core_req_sel_r; /* one select flag per core request: written as core_req_sel_r[i] for i < NUM_REQS and masked against core_req_valid, so it must be NUM_REQS wide */ wire [NUM_REQS-1:0][`BANK_BITS-1:0] core_req_bid; for (genvar i = 0; i < NUM_REQS; ++i) begin @@ -40,28 +44,41 @@ module VX_cache_core_req_bank_sel #( always @(*) begin core_req_ready_r = 0; + core_req_sel_r = 0; for (integer j = 0; j < NUM_BANKS; ++j) begin for (integer i = 0; i < NUM_REQS; ++i) begin if (core_req_valid[i] && (core_req_bid[i] == `BANK_BITS'(j))) begin core_req_ready_r[i] = per_bank_ready[j]; + core_req_sel_r[i] = 1; break; end end end end + reg [63:0] bank_stalls_r; + always @(posedge clk) begin + if (reset) begin + bank_stalls_r <= 0; + end else begin + bank_stalls_r <= bank_stalls_r + 64'($countones(core_req_valid & ~core_req_sel_r)); + end + end + assign per_bank_valid = per_bank_valid_r; assign per_bank_tid = per_bank_tid_r; assign core_req_ready = core_req_ready_r; + assign bank_stalls = bank_stalls_r; end else begin - + `UNUSED_VAR (clk) + `UNUSED_VAR (reset) `UNUSED_VAR (core_req_valid) `UNUSED_VAR (core_req_addr) assign per_bank_valid = core_req_valid; assign per_bank_tid = 0; assign core_req_ready[0] = per_bank_ready; - + assign bank_stalls = 0; end endmodule \ No newline at end of file diff --git a/hw/rtl/cache/VX_tag_store.v b/hw/rtl/cache/VX_tag_store.v index 20d49d03..57107f5c 100644 --- a/hw/rtl/cache/VX_tag_store.v +++ b/hw/rtl/cache/VX_tag_store.v @@ -48,6 +48,7 @@ module VX_tag_store #( VX_dp_ram #( .DATAW(`TAG_SELECT_BITS), .SIZE(`BANK_LINE_COUNT), + .FASTRAM(1), .RWCHECK(1) ) tags ( .clk(clk), diff --git a/hw/rtl/interfaces/VX_perf_cache_if.v b/hw/rtl/interfaces/VX_perf_cache_if.v index 9fcde291..8b53a10b 100644 --- a/hw/rtl/interfaces/VX_perf_cache_if.v +++ b/hw/rtl/interfaces/VX_perf_cache_if.v @@
-6,14 +6,13 @@ interface VX_perf_cache_if (); wire [63:0] reads; - wire [63:0] writes; + wire [63:0] writes; wire [63:0] read_misses; wire [63:0] write_misses; - wire [63:0] evictions; + wire [63:0] bank_stalls; wire [63:0] mshr_stalls; + wire [63:0] pipe_stalls; wire [63:0] crsp_stalls; - wire [63:0] dreq_stalls; - wire [63:0] pipe_stalls; endinterface diff --git a/hw/rtl/interfaces/VX_perf_memsys_if.v b/hw/rtl/interfaces/VX_perf_memsys_if.v index 42dc6045..d0264aa0 100644 --- a/hw/rtl/interfaces/VX_perf_memsys_if.v +++ b/hw/rtl/interfaces/VX_perf_memsys_if.v @@ -7,24 +7,26 @@ interface VX_perf_memsys_if (); wire [63:0] icache_reads; wire [63:0] icache_read_misses; - wire [63:0] icache_mshr_stalls; - wire [63:0] icache_crsp_stalls; - wire [63:0] icache_dreq_stalls; wire [63:0] icache_pipe_stalls; + wire [63:0] icache_crsp_stalls; wire [63:0] dcache_reads; - wire [63:0] dcache_writes; + wire [63:0] dcache_writes; wire [63:0] dcache_read_misses; wire [63:0] dcache_write_misses; - wire [63:0] dcache_evictions; + wire [63:0] dcache_bank_stalls; wire [63:0] dcache_mshr_stalls; - wire [63:0] dcache_crsp_stalls; - wire [63:0] dcache_dreq_stalls; wire [63:0] dcache_pipe_stalls; + wire [63:0] dcache_crsp_stalls; + + wire [63:0] smem_reads; + wire [63:0] smem_writes; + wire [63:0] smem_bank_stalls; + wire [63:0] dram_reads; + wire [63:0] dram_writes; + wire [63:0] dram_stalls; wire [63:0] dram_latency; - wire [63:0] dram_requests; - wire [63:0] dram_responses; endinterface diff --git a/hw/rtl/interfaces/VX_perf_pipeline_if.v b/hw/rtl/interfaces/VX_perf_pipeline_if.v index a1adb7fd..a3c28c80 100644 --- a/hw/rtl/interfaces/VX_perf_pipeline_if.v +++ b/hw/rtl/interfaces/VX_perf_pipeline_if.v @@ -4,12 +4,8 @@ `include "VX_define.vh" interface VX_perf_pipeline_if (); - // from pipeline - wire [63:0] icache_stalls; - wire [63:0] ibuffer_stalls; - // from issue - wire [63:0] scoreboard_stalls; - // from execute + wire [63:0] ibf_stalls; + wire [63:0] scb_stalls; wire [63:0] 
lsu_stalls; wire [63:0] csr_stalls; wire [63:0] alu_stalls; diff --git a/hw/simulate/simulator.cpp b/hw/simulate/simulator.cpp index 7106989f..b94d31d9 100644 --- a/hw/simulate/simulator.cpp +++ b/hw/simulate/simulator.cpp @@ -71,8 +71,8 @@ void Simulator::reset() { vortex_->dram_rsp_valid = 0; vortex_->dram_req_ready = 0; - vortex_->io_req_ready = 0; - vortex_->io_rsp_valid = 0; + //vortex_->io_req_ready = 0; + //vortex_->io_rsp_valid = 0; vortex_->snp_req_valid = 0; vortex_->snp_rsp_ready = 0; vortex_->csr_io_req_valid = 0; @@ -201,7 +201,7 @@ void Simulator::eval_dram_bus() { } void Simulator::eval_io_bus() { - for (int i = 0; i < NUM_THREADS; ++i) { + /*for (int i = 0; i < NUM_THREADS; ++i) { if (((vortex_->io_req_valid >> i) & 0x1) && ((VL_WDATA_GETW(vortex_->io_req_addr, i, NUM_THREADS, 30) << 2) == IO_BUS_ADDR_COUT)) { assert(vortex_->io_req_rw); @@ -217,7 +217,7 @@ void Simulator::eval_io_bus() { } } vortex_->io_req_ready = 1; - vortex_->io_rsp_valid = 0; + vortex_->io_rsp_valid = 0;*/ } void Simulator::eval_snp_bus() {