From 4b68235389217ae91ce110b0b1fa84542cb232f7 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 27 Nov 2023 04:50:55 -0800 Subject: [PATCH] fixed simx dispatcher bug --- hw/rtl/VX_types.vh | 158 +++++++++++++++------------ runtime/common/utils.cpp | 226 +++++++++++++++++++++++++-------------- sim/simx/cache_cluster.h | 10 +- sim/simx/cache_sim.cpp | 139 +++++++++++------------- sim/simx/cache_sim.h | 7 +- sim/simx/cluster.cpp | 27 +++-- sim/simx/core.cpp | 165 ++++++++++++++++------------ sim/simx/core.h | 21 +++- sim/simx/dispatcher.h | 42 ++++---- sim/simx/processor.cpp | 21 ++-- sim/simx/scoreboard.h | 111 +++++++++---------- sim/simx/types.h | 164 +++++++++++++++++++++------- 12 files changed, 640 insertions(+), 451 deletions(-) diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh index 388dc258..80a4a7d7 100644 --- a/hw/rtl/VX_types.vh +++ b/hw/rtl/VX_types.vh @@ -58,6 +58,8 @@ `define VX_CSR_MPM_BASE 12'hB00 `define VX_CSR_MPM_BASE_H 12'hB80 +`define VX_CSR_MPM_USER 12'hB03 +`define VX_CSR_MPM_USER_H 12'hB83 // Machine Performance-monitoring core counters // PERF: Standard @@ -68,29 +70,41 @@ `define VX_CSR_MINSTRET 12'hB02 `define VX_CSR_MINSTRET_H 12'hB82 // PERF: pipeline -`define VX_CSR_MPM_IBUF_ST 12'hB03 -`define VX_CSR_MPM_IBUF_ST_H 12'hB83 -`define VX_CSR_MPM_SCRB_ST 12'hB04 -`define VX_CSR_MPM_SCRB_ST_H 12'hB84 -`define VX_CSR_MPM_ALU_ST 12'hB05 -`define VX_CSR_MPM_ALU_ST_H 12'hB85 -`define VX_CSR_MPM_LSU_ST 12'hB06 -`define VX_CSR_MPM_LSU_ST_H 12'hB86 -`define VX_CSR_MPM_FPU_ST 12'hB07 -`define VX_CSR_MPM_FPU_ST_H 12'hB87 -`define VX_CSR_MPM_SFU_ST 12'hB08 -`define VX_CSR_MPM_SFU_ST_H 12'hB88 +`define VX_CSR_MPM_SCHED_ST 12'hB03 +`define VX_CSR_MPM_SCHED_ST_H 12'hB83 +`define VX_CSR_MPM_FETCH_ST 12'hB04 +`define VX_CSR_MPM_FETCH_ST_H 12'hB84 +`define VX_CSR_MPM_IBUF_ST 12'hB05 +`define VX_CSR_MPM_IBUF_ST_H 12'hB85 +`define VX_CSR_MPM_SCRB_ST 12'hB06 +`define VX_CSR_MPM_SCRB_ST_H 12'hB86 +`define VX_CSR_MPM_ALU_ST 12'hB07 +`define VX_CSR_MPM_ALU_ST_H 12'hB87 +`define VX_CSR_MPM_LSU_ST 12'hB08 +`define VX_CSR_MPM_LSU_ST_H 12'hB88 +`define VX_CSR_MPM_FPU_ST 12'hB09 +`define VX_CSR_MPM_FPU_ST_H 12'hB89 +`define VX_CSR_MPM_SFU_ST 12'hB0A +`define VX_CSR_MPM_SFU_ST_H 12'hB8A +`define VX_CSR_MPM_SCRB_ALU 12'hB0B +`define VX_CSR_MPM_SCRB_ALU_H 12'hB8B +`define VX_CSR_MPM_SCRB_FPU 12'hB0C +`define VX_CSR_MPM_SCRB_FPU_H 12'hB8C +`define VX_CSR_MPM_SCRB_LSU 12'hB0D +`define VX_CSR_MPM_SCRB_LSU_H 12'hB8D +`define VX_CSR_MPM_SCRB_SFU 12'hB0E +`define VX_CSR_MPM_SCRB_SFU_H 12'hB8E // PERF: memory -`define VX_CSR_MPM_IFETCHES 12'hB0A -`define VX_CSR_MPM_IFETCHES_H 12'hB8A -`define VX_CSR_MPM_LOADS 12'hB0B -`define VX_CSR_MPM_LOADS_H 12'hB8B -`define VX_CSR_MPM_STORES 12'hB0C -`define VX_CSR_MPM_STORES_H 12'hB8C -`define VX_CSR_MPM_IFETCH_LAT 12'hB0D -`define VX_CSR_MPM_IFETCH_LAT_H 12'hB8D -`define VX_CSR_MPM_LOAD_LAT 12'hB0E -`define VX_CSR_MPM_LOAD_LAT_H 12'hB8E +`define VX_CSR_MPM_IFETCHES 12'hB0F +`define VX_CSR_MPM_IFETCHES_H 12'hB8F +`define VX_CSR_MPM_LOADS 12'hB10 +`define VX_CSR_MPM_LOADS_H 12'hB90 +`define VX_CSR_MPM_STORES 12'hB11 +`define VX_CSR_MPM_STORES_H 12'hB91 +`define VX_CSR_MPM_IFETCH_LAT 12'hB12 +`define VX_CSR_MPM_IFETCH_LAT_H 12'hB92 +`define VX_CSR_MPM_LOAD_LAT 12'hB13 +`define VX_CSR_MPM_LOAD_LAT_H 12'hB93 // Machine Performance-monitoring memory counters // PERF: icache @@ -98,59 +112,61 @@ `define VX_CSR_MPM_ICACHE_READS_H 12'hB83 `define VX_CSR_MPM_ICACHE_MISS_R 12'hB04 // read misses `define VX_CSR_MPM_ICACHE_MISS_R_H 12'hB84 +`define VX_CSR_MPM_ICACHE_MSHR_ST 12'hB05 // MSHR stalls +`define VX_CSR_MPM_ICACHE_MSHR_ST_H 12'hB85 // PERF: dcache -`define VX_CSR_MPM_DCACHE_READS 12'hB05 // total reads -`define VX_CSR_MPM_DCACHE_READS_H 12'hB85 -`define VX_CSR_MPM_DCACHE_WRITES 12'hB06 // total writes -`define VX_CSR_MPM_DCACHE_WRITES_H 12'hB86 -`define VX_CSR_MPM_DCACHE_MISS_R 12'hB07 // read misses -`define VX_CSR_MPM_DCACHE_MISS_R_H 12'hB87 -`define VX_CSR_MPM_DCACHE_MISS_W 12'hB08 // write misses -`define VX_CSR_MPM_DCACHE_MISS_W_H 12'hB88 -`define VX_CSR_MPM_DCACHE_BANK_ST 12'hB09 // bank conflicts -`define VX_CSR_MPM_DCACHE_BANK_ST_H 12'hB89 -`define VX_CSR_MPM_DCACHE_MSHR_ST 12'hB0A // MSHR stalls -`define VX_CSR_MPM_DCACHE_MSHR_ST_H 12'hB8A -// PERF: smem -`define VX_CSR_MPM_SMEM_READS 12'hB0B // memory reads -`define VX_CSR_MPM_SMEM_READS_H 12'hB8B -`define VX_CSR_MPM_SMEM_WRITES 12'hB0C // memory writes -`define VX_CSR_MPM_SMEM_WRITES_H 12'hB8C -`define VX_CSR_MPM_SMEM_BANK_ST 12'hB0D // bank conflicts -`define VX_CSR_MPM_SMEM_BANK_ST_H 12'hB8D +`define VX_CSR_MPM_DCACHE_READS 12'hB06 // total reads +`define VX_CSR_MPM_DCACHE_READS_H 12'hB86 +`define VX_CSR_MPM_DCACHE_WRITES 12'hB07 // total writes +`define VX_CSR_MPM_DCACHE_WRITES_H 12'hB87 +`define VX_CSR_MPM_DCACHE_MISS_R 12'hB08 // read misses +`define VX_CSR_MPM_DCACHE_MISS_R_H 12'hB88 +`define VX_CSR_MPM_DCACHE_MISS_W 12'hB09 // write misses +`define VX_CSR_MPM_DCACHE_MISS_W_H 12'hB89 +`define VX_CSR_MPM_DCACHE_BANK_ST 12'hB0A // bank conflicts +`define VX_CSR_MPM_DCACHE_BANK_ST_H 12'hB8A +`define VX_CSR_MPM_DCACHE_MSHR_ST 12'hB0B // MSHR stalls +`define VX_CSR_MPM_DCACHE_MSHR_ST_H 12'hB8B // PERF: l2cache -`define VX_CSR_MPM_L2CACHE_READS 12'hB0E // total reads -`define VX_CSR_MPM_L2CACHE_READS_H 12'hB8E -`define VX_CSR_MPM_L2CACHE_WRITES 12'hB0F // total writes -`define VX_CSR_MPM_L2CACHE_WRITES_H 12'hB8F -`define VX_CSR_MPM_L2CACHE_MISS_R 12'hB10 // read misses -`define VX_CSR_MPM_L2CACHE_MISS_R_H 12'hB90 -`define VX_CSR_MPM_L2CACHE_MISS_W 12'hB11 // write misses -`define VX_CSR_MPM_L2CACHE_MISS_W_H 12'hB91 -`define VX_CSR_MPM_L2CACHE_BANK_ST 12'hB12 // bank conflicts -`define VX_CSR_MPM_L2CACHE_BANK_ST_H 12'hB92 -`define VX_CSR_MPM_L2CACHE_MSHR_ST 12'hB13 // MSHR stalls -`define VX_CSR_MPM_L2CACHE_MSHR_ST_H 12'hB93 +`define VX_CSR_MPM_L2CACHE_READS 12'hB0C // total reads +`define VX_CSR_MPM_L2CACHE_READS_H 12'hB8C +`define VX_CSR_MPM_L2CACHE_WRITES 12'hB0D // total writes +`define VX_CSR_MPM_L2CACHE_WRITES_H 12'hB8D +`define VX_CSR_MPM_L2CACHE_MISS_R 12'hB0E // read misses +`define VX_CSR_MPM_L2CACHE_MISS_R_H 12'hB8E +`define VX_CSR_MPM_L2CACHE_MISS_W 12'hB0F // write misses +`define VX_CSR_MPM_L2CACHE_MISS_W_H 12'hB8F +`define VX_CSR_MPM_L2CACHE_BANK_ST 12'hB10 // bank conflicts +`define VX_CSR_MPM_L2CACHE_BANK_ST_H 12'hB90 +`define VX_CSR_MPM_L2CACHE_MSHR_ST 12'hB11 // MSHR stalls +`define VX_CSR_MPM_L2CACHE_MSHR_ST_H 12'hB91 // PERF: l3cache -`define VX_CSR_MPM_L3CACHE_READS 12'hB14 // total reads -`define VX_CSR_MPM_L3CACHE_READS_H 12'hB94 -`define VX_CSR_MPM_L3CACHE_WRITES 12'hB15 // total writes -`define VX_CSR_MPM_L3CACHE_WRITES_H 12'hB95 -`define VX_CSR_MPM_L3CACHE_MISS_R 12'hB16 // read misses -`define VX_CSR_MPM_L3CACHE_MISS_R_H 12'hB96 -`define VX_CSR_MPM_L3CACHE_MISS_W 12'hB17 // write misses -`define VX_CSR_MPM_L3CACHE_MISS_W_H 12'hB97 -`define VX_CSR_MPM_L3CACHE_BANK_ST 12'hB18 // bank conflicts -`define VX_CSR_MPM_L3CACHE_BANK_ST_H 12'hB98 -`define VX_CSR_MPM_L3CACHE_MSHR_ST 12'hB19 // MSHR stalls -`define VX_CSR_MPM_L3CACHE_MSHR_ST_H 12'hB99 +`define VX_CSR_MPM_L3CACHE_READS 12'hB12 // total reads +`define VX_CSR_MPM_L3CACHE_READS_H 12'hB92 +`define VX_CSR_MPM_L3CACHE_WRITES 12'hB13 // total writes +`define VX_CSR_MPM_L3CACHE_WRITES_H 12'hB93 +`define VX_CSR_MPM_L3CACHE_MISS_R 12'hB14 // read misses +`define VX_CSR_MPM_L3CACHE_MISS_R_H 12'hB94 +`define VX_CSR_MPM_L3CACHE_MISS_W 12'hB15 // write misses +`define VX_CSR_MPM_L3CACHE_MISS_W_H 12'hB95 +`define VX_CSR_MPM_L3CACHE_BANK_ST 12'hB16 // bank conflicts +`define VX_CSR_MPM_L3CACHE_BANK_ST_H 12'hB96 +`define VX_CSR_MPM_L3CACHE_MSHR_ST 12'hB17 // MSHR stalls +`define VX_CSR_MPM_L3CACHE_MSHR_ST_H 12'hB97 // PERF: memory -`define VX_CSR_MPM_MEM_READS 12'hB1A // total reads -`define VX_CSR_MPM_MEM_READS_H 12'hB9A -`define VX_CSR_MPM_MEM_WRITES 12'hB1B // total writes -`define VX_CSR_MPM_MEM_WRITES_H 12'hB9B -`define VX_CSR_MPM_MEM_LAT 12'hB1C // memory latency -`define VX_CSR_MPM_MEM_LAT_H 12'hB9C +`define VX_CSR_MPM_MEM_READS 12'hB18 // total reads +`define VX_CSR_MPM_MEM_READS_H 12'hB98 +`define VX_CSR_MPM_MEM_WRITES 12'hB19 // total writes +`define VX_CSR_MPM_MEM_WRITES_H 12'hB99 +`define VX_CSR_MPM_MEM_LAT 12'hB1A // memory latency +`define VX_CSR_MPM_MEM_LAT_H 12'hB9A +// PERF: smem +`define VX_CSR_MPM_SMEM_READS 12'hB1B // memory reads +`define VX_CSR_MPM_SMEM_READS_H 12'hB9B +`define VX_CSR_MPM_SMEM_WRITES 12'hB1C // memory writes +`define VX_CSR_MPM_SMEM_WRITES_H 12'hB9C +`define VX_CSR_MPM_SMEM_BANK_ST 12'hB1D // bank conflicts +`define VX_CSR_MPM_SMEM_BANK_ST_H 12'hB9D // Machine Information Registers diff --git a/runtime/common/utils.cpp b/runtime/common/utils.cpp index 72c2b80c..574f64a7 100644 --- a/runtime/common/utils.cpp +++ b/runtime/common/utils.cpp @@ -186,27 +186,31 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { return int((1.0 - (double(part) / double(total))) * 100); }; - auto caclAvgLatency = [&](uint64_t sum, uint64_t requests)->int { - if (requests == 0) + auto caclAverage = [&](uint64_t part, uint64_t total)->double { + if (total == 0) return 0; - return int(double(sum) / double(requests)); + return double(part) / double(total); }; - auto calcUtilization = [&](uint64_t count, uint64_t stalls)->int { - if (count == 0) - return 0; - return int((double(count) / double(count + stalls)) * 100); + auto calcAvgPercent = [&](uint64_t part, uint64_t total)->int { + return int(caclAverage(part, total) * 100); }; auto perf_class = gAutoPerfDump.get_perf_class(); // PERF: pipeline stalls + uint64_t scheduler_stalls = 0; + uint64_t fetch_stalls = 0; uint64_t ibuffer_stalls = 0; - uint64_t scoreboard_stalls = 0; + uint64_t scrb_stalls = 0; uint64_t lsu_stalls = 0; uint64_t fpu_stalls = 0; uint64_t alu_stalls = 0; - uint64_t sfu_stalls = 0; + uint64_t sfu_stalls = 0; + uint64_t scrb_alu = 0; + uint64_t scrb_fpu = 0; + uint64_t scrb_lsu = 0; + uint64_t scrb_sfu = 0; uint64_t ifetches = 0; uint64_t loads = 0; uint64_t stores = 0; @@ -251,76 +255,121 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { #endif std::vector staging_buf(64* sizeof(uint32_t)); - - for (unsigned core_id = 0; core_id < num_cores; ++core_id) { + + for (unsigned core_id = 0; core_id < num_cores; ++core_id) { uint64_t mpm_mem_addr = IO_CSR_ADDR + core_id * staging_buf.size(); ret = vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size()); if (ret != 0) return ret; + uint64_t cycles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MCYCLE); + uint64_t instrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MINSTRET); + #ifdef PERF_ENABLE switch (perf_class) { case VX_DCR_MPM_CLASS_CORE: { // PERF: pipeline - // ibuffer_stall - uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: ibuffer stalls=%ld\n", core_id, ibuffer_stalls_per_core); - ibuffer_stalls += ibuffer_stalls_per_core; - // scoreboard_stall - uint64_t scoreboard_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: scoreboard stalls=%ld\n", core_id, scoreboard_stalls_per_core); - scoreboard_stalls += scoreboard_stalls_per_core; - // alu_stall - uint64_t alu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_ALU_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: alu unit stalls=%ld\n", core_id, alu_stalls_per_core); - alu_stalls += alu_stalls_per_core; - // lsu_stall - uint64_t lsu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LSU_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: lsu unit stalls=%ld\n", core_id, lsu_stalls_per_core); - lsu_stalls += lsu_stalls_per_core; - // fpu_stall - uint64_t fpu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FPU_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: fpu unit stalls=%ld\n", core_id, fpu_stalls_per_core); - fpu_stalls += fpu_stalls_per_core; - // sfu_stall - uint64_t sfu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SFU_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: sfu unit stalls=%ld\n", core_id, sfu_stalls_per_core); - sfu_stalls += sfu_stalls_per_core; + // schedule stalls + { + uint64_t scheduler_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ST); + int scheduler_percent_per_core = calcAvgPercent(scheduler_stalls_per_core, cycles_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: schedule stalls=%ld (%d%%)\n", core_id, scheduler_stalls_per_core, scheduler_percent_per_core); + scheduler_stalls += scheduler_stalls_per_core; + } + // fetch stalls + { + uint64_t fetch_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FETCH_ST); + int fetch_percent_per_core = calcAvgPercent(fetch_stalls_per_core, cycles_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetch stalls=%ld (%d%%)\n", core_id, fetch_stalls_per_core, fetch_percent_per_core); + fetch_stalls += fetch_stalls_per_core; + } + // ibuffer_stalls + { + uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST); + int ibuffer_percent_per_core = calcAvgPercent(ibuffer_stalls_per_core, cycles_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: ibuffer stalls=%ld (%d%%)\n", core_id, ibuffer_stalls_per_core, ibuffer_percent_per_core); + ibuffer_stalls += ibuffer_stalls_per_core; + } + // scrb_stalls + { + uint64_t scrb_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ST); + uint64_t scrb_alu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ALU); + uint64_t scrb_fpu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_FPU); + uint64_t scrb_lsu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_LSU); + uint64_t scrb_sfu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU); + uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_sfu_per_core; + scrb_alu += scrb_alu_per_core; + scrb_fpu += scrb_fpu_per_core; + scrb_lsu += scrb_lsu_per_core; + scrb_sfu += scrb_sfu_per_core; + if (num_cores > 1) fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core, + calcAvgPercent(scrb_alu_per_core, scrb_total), + calcAvgPercent(scrb_fpu_per_core, scrb_total), + calcAvgPercent(scrb_lsu_per_core, scrb_total), + calcAvgPercent(scrb_sfu_per_core, scrb_total)); + scrb_stalls += scrb_stalls_per_core; + } + // alu_stalls + { + uint64_t alu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_ALU_ST); + if (num_cores > 1) fprintf(stream, "PERF: core%d: alu unit stalls=%ld\n", core_id, alu_stalls_per_core); + alu_stalls += alu_stalls_per_core; + } + // lsu_stalls + { + uint64_t lsu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LSU_ST); + if (num_cores > 1) fprintf(stream, "PERF: core%d: lsu unit stalls=%ld\n", core_id, lsu_stalls_per_core); + lsu_stalls += lsu_stalls_per_core; + } + // fpu_stalls + { + uint64_t fpu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FPU_ST); + if (num_cores > 1) fprintf(stream, "PERF: core%d: fpu unit stalls=%ld\n", core_id, fpu_stalls_per_core); + fpu_stalls += fpu_stalls_per_core; + } + // sfu_stalls + { + uint64_t sfu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SFU_ST); + if (num_cores > 1) fprintf(stream, "PERF: core%d: sfu unit stalls=%ld\n", core_id, sfu_stalls_per_core); + sfu_stalls += sfu_stalls_per_core; + } // PERF: memory // ifetches - uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS); - if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core); - ifetches += ifetches_per_core; + { + uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS); + if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core); + ifetches += ifetches_per_core; + + uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LAT); + int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat); + ifetch_lat += ifetch_lat_per_core; + } // loads - uint64_t loads_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS); - if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core); - loads += loads_per_core; + { + uint64_t loads_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS); + if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core); + loads += loads_per_core; + + uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LAT); + int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat); + load_lat += load_lat_per_core; + } // stores - uint64_t stores_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_STORES); - if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core); - stores += stores_per_core; - // ifetch latency - uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LAT); - if (num_cores > 1) { - int mem_avg_lat = caclAvgLatency(ifetch_lat_per_core, ifetches_per_core); - fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat); + { + uint64_t stores_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_STORES); + if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core); + stores += stores_per_core; } - ifetch_lat += ifetch_lat_per_core; - // load latency - uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LAT); - if (num_cores > 1) { - int mem_avg_lat = caclAvgLatency(load_lat_per_core, loads_per_core); - fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat); - } - load_lat += load_lat_per_core; } break; - case VX_DCR_MPM_CLASS_MEM: { + case VX_DCR_MPM_CLASS_MEM: { if (smem_enable) { // PERF: smem uint64_t smem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_READS); uint64_t smem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_WRITES); uint64_t smem_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_BANK_ST); - int smem_bank_utilization = calcUtilization(smem_reads + smem_writes, smem_bank_stalls); + int smem_bank_utilization = calcAvgPercent(smem_reads + smem_writes, smem_reads + smem_writes + smem_bank_stalls); fprintf(stream, "PERF: core%d: smem reads=%ld\n", core_id, smem_reads); fprintf(stream, "PERF: core%d: smem writes=%ld\n", core_id, smem_writes); fprintf(stream, "PERF: core%d: smem bank stalls=%ld (utilization=%d%%)\n", core_id, smem_bank_stalls, smem_bank_utilization); @@ -330,9 +379,12 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { // PERF: Icache uint64_t icache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_READS); uint64_t icache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_MISS_R); - int icache_read_hit_ratio = calcRatio(icache_read_misses, icache_reads); + uint64_t icache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_MSHR_ST); + int icache_read_hit_ratio = calcRatio(icache_read_misses, icache_reads); + int mshr_utilization = calcAvgPercent(icache_read_misses, icache_read_misses + icache_mshr_stalls); fprintf(stream, "PERF: core%d: icache reads=%ld\n", core_id, icache_reads); fprintf(stream, "PERF: core%d: icache read misses=%ld (hit ratio=%d%%)\n", core_id, icache_read_misses, icache_read_hit_ratio); + fprintf(stream, "PERF: core%d: icache mshr stalls=%ld (utilization=%d%%)\n", core_id, icache_mshr_stalls, mshr_utilization); } if (dcache_enable) { @@ -345,13 +397,14 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { uint64_t dcache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MSHR_ST); int dcache_read_hit_ratio = calcRatio(dcache_read_misses, dcache_reads); int dcache_write_hit_ratio = calcRatio(dcache_write_misses, dcache_writes); - int dcache_bank_utilization = calcUtilization(dcache_reads + dcache_writes, dcache_bank_stalls); + int dcache_bank_utilization = calcAvgPercent(dcache_reads + dcache_writes, dcache_reads + dcache_writes + dcache_bank_stalls); + int mshr_utilization = calcAvgPercent(dcache_read_misses + dcache_write_misses, dcache_read_misses + dcache_write_misses + dcache_mshr_stalls); fprintf(stream, "PERF: core%d: dcache reads=%ld\n", core_id, dcache_reads); fprintf(stream, "PERF: core%d: dcache writes=%ld\n", core_id, dcache_writes); fprintf(stream, "PERF: core%d: dcache read misses=%ld (hit ratio=%d%%)\n", core_id, dcache_read_misses, dcache_read_hit_ratio); fprintf(stream, "PERF: core%d: dcache write misses=%ld (hit ratio=%d%%)\n", core_id, dcache_write_misses, dcache_write_hit_ratio); fprintf(stream, "PERF: core%d: dcache bank stalls=%ld (utilization=%d%%)\n", core_id, dcache_bank_stalls, dcache_bank_utilization); - fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld\n", core_id, dcache_mshr_stalls); + fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld (utilization=%d%%)\n", core_id, dcache_mshr_stalls, mshr_utilization); } if (l2cache_enable) { @@ -386,8 +439,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { } #endif - uint64_t instrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MINSTRET); - uint64_t cycles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MCYCLE); float IPC = (float)(double(instrs_per_core) / double(cycles_per_core)); if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC); instrs += instrs_per_core; @@ -397,10 +448,20 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { #ifdef PERF_ENABLE switch (perf_class) { case VX_DCR_MPM_CLASS_CORE: { + int scheduler_percent = calcAvgPercent(scheduler_stalls, cycles); + int fetch_percent = calcAvgPercent(fetch_stalls, cycles); + int ibuffer_percent = calcAvgPercent(ibuffer_stalls, cycles); int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches)); int load_avg_lat = (int)(double(load_lat) / double(loads)); - fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls); - fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls); + uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_sfu; + fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", scheduler_stalls, scheduler_percent); + fprintf(stream, "PERF: fetch stalls=%ld (%d%%)\n", fetch_stalls, fetch_percent); + fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent); + fprintf(stream, "PERF: scoreboard stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls, + calcAvgPercent(scrb_alu, scrb_total), + calcAvgPercent(scrb_fpu, scrb_total), + calcAvgPercent(scrb_lsu, scrb_total), + calcAvgPercent(scrb_sfu, scrb_total)); fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls); fprintf(stream, "PERF: lsu unit stalls=%ld\n", lsu_stalls); fprintf(stream, "PERF: fpu unit stalls=%ld\n", fpu_stalls); @@ -419,31 +480,32 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { l2cache_write_misses /= num_cores; l2cache_bank_stalls /= num_cores; l2cache_mshr_stalls /= num_cores; - int l2cache_read_hit_ratio = calcRatio(l2cache_read_misses, l2cache_reads); - int l2cache_write_hit_ratio = calcRatio(l2cache_write_misses, l2cache_writes); - int l2cache_bank_utilization = calcUtilization(l2cache_reads + l2cache_writes, l2cache_bank_stalls); - + int read_hit_ratio = calcRatio(l2cache_read_misses, l2cache_reads); + int write_hit_ratio = calcRatio(l2cache_write_misses, l2cache_writes); + int bank_utilization = calcAvgPercent(l2cache_reads + l2cache_writes, l2cache_reads + l2cache_writes + l2cache_bank_stalls); + int mshr_utilization = calcAvgPercent(l2cache_read_misses + l2cache_write_misses, l2cache_read_misses + l2cache_write_misses + l2cache_mshr_stalls); fprintf(stream, "PERF: l2cache reads=%ld\n", l2cache_reads); fprintf(stream, "PERF: l2cache writes=%ld\n", l2cache_writes); - fprintf(stream, "PERF: l2cache read misses=%ld (hit ratio=%d%%)\n", l2cache_read_misses, l2cache_read_hit_ratio); - fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, l2cache_write_hit_ratio); - fprintf(stream, "PERF: l2cache bank stalls=%ld (utilization=%d%%)\n", l2cache_bank_stalls, l2cache_bank_utilization); - fprintf(stream, "PERF: l2cache mshr stalls=%ld\n", l2cache_mshr_stalls); + fprintf(stream, "PERF: l2cache read misses=%ld (hit ratio=%d%%)\n", l2cache_read_misses, read_hit_ratio); + fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, write_hit_ratio); + fprintf(stream, "PERF: l2cache bank stalls=%ld (utilization=%d%%)\n", l2cache_bank_stalls, bank_utilization); + fprintf(stream, "PERF: l2cache mshr stalls=%ld (utilization=%d%%)\n", l2cache_mshr_stalls, mshr_utilization); } if (l3cache_enable) { - int l3cache_read_hit_ratio = calcRatio(l3cache_read_misses, l3cache_reads); - int l3cache_write_hit_ratio = calcRatio(l3cache_write_misses, l3cache_writes); - int l3cache_bank_utilization = calcUtilization(l3cache_reads + l3cache_writes, l3cache_bank_stalls); + int read_hit_ratio = calcRatio(l3cache_read_misses, l3cache_reads); + int write_hit_ratio = calcRatio(l3cache_write_misses, l3cache_writes); + int bank_utilization = calcAvgPercent(l3cache_reads + l3cache_writes, l3cache_reads + l3cache_writes + l3cache_bank_stalls); + int mshr_utilization = calcAvgPercent(l3cache_read_misses + l3cache_write_misses, l3cache_read_misses + l3cache_write_misses + l3cache_mshr_stalls); fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads); fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes); - fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, l3cache_read_hit_ratio); - fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, l3cache_write_hit_ratio); - fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, l3cache_bank_utilization); - fprintf(stream, "PERF: l3cache mshr stalls=%ld\n", l3cache_mshr_stalls); + fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, read_hit_ratio); + fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, write_hit_ratio); + fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, bank_utilization); + fprintf(stream, "PERF: l3cache mshr stalls=%ld (utilization=%d%%)\n", l3cache_mshr_stalls, mshr_utilization); } - int mem_avg_lat = caclAvgLatency(mem_lat, mem_reads); + int mem_avg_lat = caclAverage(mem_lat, mem_reads); fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes); fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat); } break; diff --git a/sim/simx/cache_cluster.h b/sim/simx/cache_cluster.h index be3146d3..aef28d1a 100644 --- a/sim/simx/cache_cluster.h +++ b/sim/simx/cache_cluster.h @@ -45,20 +45,20 @@ public: char sname[100]; - std::vector::Ptr> unit_arbs(num_units); + std::vector unit_arbs(num_units); for (uint32_t u = 0; u < num_units; ++u) { snprintf(sname, 100, "%s-unit-arb-%d", name, u); - unit_arbs.at(u) = Switch::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs); + unit_arbs.at(u) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs); for (uint32_t i = 0; i < num_requests; ++i) { this->CoreReqPorts.at(u).at(i).bind(&unit_arbs.at(u)->ReqIn.at(i)); unit_arbs.at(u)->RspIn.at(i).bind(&this->CoreRspPorts.at(u).at(i)); } } - std::vector::Ptr> mem_arbs(config.num_inputs); + std::vector mem_arbs(config.num_inputs); for (uint32_t i = 0; i < config.num_inputs; ++i) { snprintf(sname, 100, "%s-mem-arb-%d", name, i); - mem_arbs.at(i) = Switch::Create(sname, ArbiterType::RoundRobin, num_units, num_caches); + mem_arbs.at(i) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_units, num_caches); for (uint32_t u = 0; u < num_units; ++u) { unit_arbs.at(u)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(u)); mem_arbs.at(i)->RspIn.at(u).bind(&unit_arbs.at(u)->RspOut.at(i)); @@ -66,7 +66,7 @@ public: } snprintf(sname, 100, "%s-cache-arb", name); - auto cache_arb = Switch::Create(sname, ArbiterType::RoundRobin, num_caches, 1); + auto cache_arb = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_caches, 1); for (uint32_t i = 0; i < num_caches; ++i) { snprintf(sname, 100, "%s-cache%d", name, i); diff --git a/sim/simx/cache_sim.cpp b/sim/simx/cache_sim.cpp index 5a6906a9..2c9410f4 100644 --- a/sim/simx/cache_sim.cpp +++ b/sim/simx/cache_sim.cpp @@ -41,19 +41,16 @@ struct params_t { uint32_t tag_select_addr_end; params_t(const CacheSim::Config& config) { - int32_t bank_bits = log2ceil(config.num_banks); - int32_t offset_bits = config.B - config.W; - int32_t log2_bank_size = config.C - bank_bits; - int32_t index_bits = log2_bank_size - (config.B + config.A); - assert(log2_bank_size > 0); + int32_t offset_bits = config.L - config.W; + int32_t index_bits = config.C - (config.L + config.A + config.B); assert(offset_bits >= 0); assert(index_bits >= 0); this->log2_num_inputs = log2ceil(config.num_inputs); - this->words_per_line = 1 << offset_bits; + this->sets_per_bank = 1 << index_bits; this->lines_per_set = 1 << config.A; - this->sets_per_bank = 1 << index_bits; + this->words_per_line = 1 << offset_bits; assert(config.ports_per_bank <= this->words_per_line); @@ -63,7 +60,7 @@ struct params_t { // Bank select this->bank_select_addr_start = (1+this->word_select_addr_end); - this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1); + this->bank_select_addr_end = (this->bank_select_addr_start+config.B-1); // Set select this->set_select_addr_start = (1+this->bank_select_addr_end); @@ -74,23 +71,23 @@ struct params_t { this->tag_select_addr_end = (config.addr_width-1); } - uint32_t addr_bank_id(uint64_t word_addr) const { + uint32_t addr_bank_id(uint64_t addr) const { if (bank_select_addr_end >= bank_select_addr_start) - return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end); + return (uint32_t)bit_getw(addr, bank_select_addr_start, bank_select_addr_end); else return 0; } - uint32_t addr_set_id(uint64_t word_addr) const { + uint32_t addr_set_id(uint64_t addr) const { if (set_select_addr_end >= set_select_addr_start) - return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end); + return (uint32_t)bit_getw(addr, set_select_addr_start, set_select_addr_end); else return 0; } - uint64_t addr_tag(uint64_t word_addr) const { + uint64_t addr_tag(uint64_t addr) const { if (tag_select_addr_end >= tag_select_addr_start) - return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end); + return bit_getw(addr, tag_select_addr_start, tag_select_addr_end); else return 0; } @@ -288,8 +285,8 @@ private: Config config_; params_t params_; std::vector banks_; - Switch::Ptr bank_switch_; - Switch::Ptr bypass_switch_; + MemSwitch::Ptr bank_switch_; + MemSwitch::Ptr bypass_switch_; std::vector> mem_req_ports_; std::vector> mem_rsp_ports_; std::vector pipeline_reqs_; @@ -304,16 +301,16 @@ public: : simobject_(simobject) , config_(config) , params_(config) - , banks_(config.num_banks, {config, params_}) - , mem_req_ports_(config.num_banks, simobject) - , mem_rsp_ports_(config.num_banks, simobject) - , pipeline_reqs_(config.num_banks, config.ports_per_bank) + , banks_((1 << config.B), {config, params_}) + , mem_req_ports_((1 << config.B), simobject) + , mem_rsp_ports_((1 << config.B), simobject) + , pipeline_reqs_((1 << config.B), config.ports_per_bank) { char sname[100]; snprintf(sname, 100, "%s-bypass-arb", simobject->name().c_str()); if (config_.bypass) { - bypass_switch_ = Switch::Create(sname, ArbiterType::RoundRobin, config_.num_inputs); + bypass_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, config_.num_inputs); for (uint32_t i = 0; i < config_.num_inputs; ++i) { simobject->CoreReqPorts.at(i).bind(&bypass_switch_->ReqIn.at(i)); bypass_switch_->RspIn.at(i).bind(&simobject->CoreRspPorts.at(i)); @@ -323,14 +320,14 @@ public: return; } - bypass_switch_ = Switch::Create(sname, ArbiterType::Priority, 2); + bypass_switch_ = MemSwitch::Create(sname, ArbiterType::Priority, 2); bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort); simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0)); - if (config.num_banks > 1) { + if (config.B != 0) { snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str()); - bank_switch_ = Switch::Create(sname, ArbiterType::RoundRobin, config.num_banks); - for (uint32_t i = 0, n = config.num_banks; i < n; ++i) { + bank_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, (1 << config.B)); + for (uint32_t i = 0, n = (1 << config.B); i < n; ++i) { mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i)); bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i)); } @@ -383,20 +380,22 @@ public: pipeline_req.clear(); } - // schedule MSHR replay - for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { + // first: schedule MSHR replay (flush MSHR queue) + for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) { auto& bank = banks_.at(bank_id); auto& pipeline_req = pipeline_reqs_.at(bank_id); bank.mshr.pop(&pipeline_req); } - // schedule memory fill - for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { + // second: schedule memory fill (flush memory queue) + for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) { auto& mem_rsp_port = mem_rsp_ports_.at(bank_id); if (mem_rsp_port.empty()) continue; auto& pipeline_req = pipeline_reqs_.at(bank_id); + + // skip if bank already busy if (pipeline_req.type != bank_req_t::None) continue; @@ -407,7 +406,7 @@ public: mem_rsp_port.pop(); } - // schedule core requests + // last: schedule core requests (flush core queue) for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) { auto& core_req_port = simobject_->CoreReqPorts.at(req_id); if (core_req_port.empty()) @@ -425,18 +424,21 @@ public: } auto bank_id = params_.addr_bank_id(core_req.addr); - auto set_id = params_.addr_set_id(core_req.addr); - auto tag = params_.addr_tag(core_req.addr); - auto port_id = req_id % config_.ports_per_bank; - auto& bank = banks_.at(bank_id); auto& pipeline_req = pipeline_reqs_.at(bank_id); + // skip if bank already busy + if (pipeline_req.type != bank_req_t::None) + continue; + + auto set_id = params_.addr_set_id(core_req.addr); + auto tag = params_.addr_tag(core_req.addr); + auto port_id = req_id % config_.ports_per_bank; + // check MSHR capacity if ((!core_req.write || !config_.write_through) && bank.mshr.full()) { ++perf_stats_.mshr_stalls; - ++perf_stats_.bank_stalls; continue; } @@ -452,7 +454,7 @@ public: } // extend request ports pipeline_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true}; - } else if (pipeline_req.type == bank_req_t::None) { + } else { // schedule new request bank_req_t bank_req(config_.ports_per_bank); bank_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true}; @@ -463,10 +465,6 @@ public: bank_req.type = bank_req_t::Core; bank_req.write = core_req.write; pipeline_req = bank_req; - } else { - // bank in use - ++perf_stats_.bank_stalls; - continue; } if (core_req.write) @@ -516,7 +514,7 @@ private: } void processBankRequests() { - for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { + for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) { auto& bank = banks_.at(bank_id); auto pipeline_req = pipeline_reqs_.at(bank_id); @@ -545,11 +543,10 @@ private: } } } break; - case bank_req_t::Core: { - bool hit = false; - bool found_free_line = false; - uint32_t hit_line_id = 0; - uint32_t repl_line_id = 0; + case bank_req_t::Core: { + int32_t hit_line_id = -1; + int32_t free_line_id = -1; + int32_t repl_line_id = 0; uint32_t max_cnt = 0; auto& set = bank.sets.at(pipeline_req.set_id); @@ -557,38 +554,34 @@ private: // tag lookup for (uint32_t i = 0, n = set.lines.size(); i < n; ++i) { auto& line = set.lines.at(i); + if (max_cnt < line.lru_ctr) { + max_cnt = line.lru_ctr; + repl_line_id = i; + } if (line.valid) { - if (line.tag == pipeline_req.tag) { - line.lru_ctr = 0; + if (line.tag == pipeline_req.tag) { hit_line_id = i; - hit = true; + line.lru_ctr = 0; } else { ++line.lru_ctr; } - if (max_cnt < line.lru_ctr) { - max_cnt = line.lru_ctr; - repl_line_id = i; - } } else { - found_free_line = true; - repl_line_id = i; + free_line_id = i; } } - if (hit) { - // - // Hit handling - // + if (hit_line_id != -1) { + // Hit handling if (pipeline_req.write) { - // handle write hit + // handle write has_hit auto& hit_line = set.lines.at(hit_line_id); if (config_.write_through) { // forward write request to memory MemReq mem_req; - mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_line.tag); + mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag); mem_req.write = true; - mem_req.cid = pipeline_req.cid; - mem_req.uuid = pipeline_req.uuid; + mem_req.cid = pipeline_req.cid; + mem_req.uuid = pipeline_req.uuid; mem_req_ports_.at(bank_id).send(mem_req, 1); DT(3, simobject_->name() << "-dram-" << mem_req); } else { @@ -606,23 +599,21 @@ private: DT(3, simobject_->name() << "-core-" << core_rsp); } } - } else { - // - // Miss handling - // + } else { + // Miss handling if (pipeline_req.write) ++perf_stats_.write_misses; else ++perf_stats_.read_misses; - if (!found_free_line && !config_.write_through) { + if (free_line_id == -1 && !config_.write_through) { // write back dirty line auto& repl_line = set.lines.at(repl_line_id); if (repl_line.dirty) { MemReq mem_req; mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_line.tag); mem_req.write = true; - mem_req.cid = pipeline_req.cid; + mem_req.cid = pipeline_req.cid; mem_req_ports_.at(bank_id).send(mem_req, 1); DT(3, simobject_->name() << "-dram-" << mem_req); ++perf_stats_.evictions; @@ -635,8 +626,8 @@ private: MemReq mem_req; mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag); mem_req.write = true; - mem_req.cid = pipeline_req.cid; - mem_req.uuid = pipeline_req.uuid; + mem_req.cid = pipeline_req.cid; + mem_req.uuid = pipeline_req.uuid; mem_req_ports_.at(bank_id).send(mem_req, 1); DT(3, simobject_->name() << "-dram-" << mem_req); } @@ -655,7 +646,7 @@ private: auto mshr_pending = bank.mshr.lookup(pipeline_req); // allocate MSHR - auto mshr_id = bank.mshr.allocate(pipeline_req, repl_line_id); + auto mshr_id = bank.mshr.allocate(pipeline_req, (free_line_id != -1) ? free_line_id : repl_line_id); // send fill request if (!mshr_pending) { @@ -663,8 +654,8 @@ private: mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag); mem_req.write = false; mem_req.tag = mshr_id; - mem_req.cid = pipeline_req.cid; - mem_req.uuid = pipeline_req.uuid; + mem_req.cid = pipeline_req.cid; + mem_req.uuid = pipeline_req.uuid; mem_req_ports_.at(bank_id).send(mem_req, 1); DT(3, simobject_->name() << "-dram-" << mem_req); ++pending_fill_reqs_; diff --git a/sim/simx/cache_sim.h b/sim/simx/cache_sim.h index 498fb73f..2faea91d 100644 --- a/sim/simx/cache_sim.h +++ b/sim/simx/cache_sim.h @@ -23,16 +23,15 @@ public: struct Config { bool bypass; // cache bypass uint8_t C; // log2 cache size - uint8_t B; // log2 block size + uint8_t L; // log2 line size uint8_t W; // log2 word size uint8_t A; // log2 associativity - uint8_t addr_width; // word address bits - uint8_t num_banks; // number of banks + uint8_t B; // log2 number of banks + uint8_t addr_width; // word address bits uint8_t ports_per_bank; // number of ports per bank uint8_t num_inputs; // number of inputs bool write_through; // is write-through bool write_reponse; // enable write response - uint16_t victim_size; // victim cache size uint16_t mshr_size; // MSHR buffer size uint8_t latency; // pipeline latency }; diff --git a/sim/simx/cluster.cpp b/sim/simx/cluster.cpp index a129ddaa..d7104915 100644 --- a/sim/simx/cluster.cpp +++ b/sim/simx/cluster.cpp @@ -36,16 +36,15 @@ Cluster::Cluster(const SimContext& ctx, l2cache_ = CacheSim::Create(sname, CacheSim::Config{ !L2_ENABLED, log2ceil(L2_CACHE_SIZE), // C - log2ceil(MEM_BLOCK_SIZE), // B + log2ceil(MEM_BLOCK_SIZE), // L log2ceil(L2_NUM_WAYS), // W 0, // A + log2ceil(L2_NUM_BANKS), // B XLEN, // address bits - L2_NUM_BANKS, // number of banks 1, // number of ports 5, // request size true, // write-through false, // write response - 0, // victim size L2_MSHR_SIZE, // mshr 2, // pipeline latency }); @@ -57,16 +56,15 @@ Cluster::Cluster(const SimContext& ctx, icaches_ = CacheCluster::Create(sname, num_cores, NUM_ICACHES, 1, CacheSim::Config{ !ICACHE_ENABLED, log2ceil(ICACHE_SIZE), // C - log2ceil(L1_LINE_SIZE), // B + log2ceil(L1_LINE_SIZE), // L log2ceil(sizeof(uint32_t)), // W log2ceil(ICACHE_NUM_WAYS),// A - XLEN, // address bits - 1, // number of banks + 1, // B + XLEN, // address bits 1, // number of ports 1, // number of inputs true, // write-through false, // write response - 0, // victim size (uint8_t)arch.num_warps(), // mshr 2, // pipeline latency }); @@ -78,16 +76,15 @@ Cluster::Cluster(const SimContext& ctx, dcaches_ = CacheCluster::Create(sname, num_cores, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{ !DCACHE_ENABLED, log2ceil(DCACHE_SIZE), // C - log2ceil(L1_LINE_SIZE), // B + log2ceil(L1_LINE_SIZE), // L log2ceil(sizeof(Word)), // W log2ceil(DCACHE_NUM_WAYS),// A - XLEN, // address bits - DCACHE_NUM_BANKS, // number of banks + log2ceil(DCACHE_NUM_BANKS), // B + XLEN, // address bits 1, // number of ports DCACHE_NUM_BANKS, // number of inputs true, // write-through false, // write response - 0, // victim size DCACHE_MSHR_SIZE, // mshr 4, // pipeline latency }); @@ -129,11 +126,11 @@ Cluster::Cluster(const SimContext& ctx, cores_.at(i)->dcache_req_ports.at(j).bind(&smem_demux->ReqIn); smem_demux->RspIn.bind(&cores_.at(i)->dcache_rsp_ports.at(j)); - smem_demux->ReqDc.bind(&dcaches_->CoreReqPorts.at(i).at(j)); - dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDc); + smem_demux->ReqDC.bind(&dcaches_->CoreReqPorts.at(i).at(j)); + dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDC); - smem_demux->ReqSm.bind(&sharedmems_.at(i)->Inputs.at(j)); - sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSm); + smem_demux->ReqSM.bind(&sharedmems_.at(i)->Inputs.at(j)); + sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSM); } } } diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp index f2931324..b2fe7ea2 100644 --- a/sim/simx/core.cpp +++ b/sim/simx/core.cpp @@ -45,19 +45,21 @@ Core::Core(const SimContext& ctx, , warps_(arch.num_warps()) , barriers_(arch.num_barriers(), 0) , fcsrs_(arch.num_warps(), 0) - , ibuffers_(ISSUE_WIDTH, IBUF_SIZE) + , ibuffers_(arch.num_warps(), IBUF_SIZE) , scoreboard_(arch_) , operands_(ISSUE_WIDTH) - , dispatchers_((uint32_t)ExeType::MAX) - , exe_units_((uint32_t)ExeType::MAX) + , dispatchers_((uint32_t)ExeType::ExeTypeCount) + , exe_units_((uint32_t)ExeType::ExeTypeCount) , sharedmem_(sharedmem) , fetch_latch_("fetch") , decode_latch_("decode") , pending_icache_(arch_.num_warps()) - , committed_traces_(ISSUE_WIDTH, nullptr) , csrs_(arch.num_warps()) , cluster_(cluster) -{ + , commit_arbs_(ISSUE_WIDTH) +{ + char sname[100]; + for (uint32_t i = 0; i < arch_.num_warps(); ++i) { csrs_.at(i).resize(arch.num_threads()); } @@ -82,6 +84,16 @@ Core::Core(const SimContext& ctx, exe_units_.at((int)ExeType::LSU) = SimPlatform::instance().create_object(this); exe_units_.at((int)ExeType::SFU) = SimPlatform::instance().create_object(this); + // bind commit arbiters + for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) { + snprintf(sname, 100, "commit-arb%d", i); + auto arbiter = TraceSwitch::Create(sname, ArbiterType::RoundRobin, (uint32_t)ExeType::ExeTypeCount, 1); + for (uint32_t j = 0; j < (uint32_t)ExeType::ExeTypeCount; ++j) { + exe_units_.at(j)->Outputs.at(i).bind(&arbiter->Inputs.at(j)); + } + commit_arbs_.at(i) = arbiter; + } + this->reset(); } @@ -99,8 +111,12 @@ void Core::reset() { for (auto& exe_unit : exe_units_) { exe_unit->reset(); } + + for (auto& commit_arb : commit_arbs_) { + commit_arb->reset(); + } - for ( auto& barrier : barriers_) { + for (auto& barrier : barriers_) { barrier.reset(); } @@ -112,7 +128,7 @@ void Core::reset() { ibuf.clear(); } - commit_exe_= 0; + ibuffer_idx_ = 0; scoreboard_.clear(); fetch_latch_.clear(); @@ -150,8 +166,10 @@ void Core::schedule() { break; } } - if (scheduled_warp == -1) + if (scheduled_warp == -1) { + ++perf_stats_.sched_stalls; return; + } // suspend warp until decode stalled_warps_.set(scheduled_warp); @@ -192,11 +210,11 @@ void Core::fetch() { mem_req.tag = pending_icache_.allocate(trace); mem_req.cid = trace->cid; mem_req.uuid = trace->uuid; - icache_req_ports.at(0).send(mem_req, 1); + icache_req_ports.at(0).send(mem_req, 2); DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace); - fetch_latch_.pop(); - ++pending_ifetches_; + fetch_latch_.pop(); ++perf_stats_.ifetches; + ++pending_ifetches_; } void Core::decode() { @@ -206,7 +224,7 @@ void Core::decode() { auto trace = decode_latch_.front(); // check ibuffer capacity - auto& ibuffer = ibuffers_.at(trace->wid % ISSUE_WIDTH); + auto& ibuffer = ibuffers_.at(trace->wid); if (ibuffer.full()) { if (!trace->log_once(true)) { DT(3, "*** ibuffer-stall: " << *trace); @@ -239,7 +257,7 @@ void Core::decode() { } void Core::issue() { - // operands to dispatch + // operands to dispatchers for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) { auto& operand = operands_.at(i); if (operand->Output.empty()) @@ -257,7 +275,8 @@ void Core::issue() { // issue ibuffer instructions for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) { - auto& ibuffer = ibuffers_.at(i); + uint32_t ii = (ibuffer_idx_ + i) % ibuffers_.size(); + auto& ibuffer = ibuffers_.at(ii); if (ibuffer.empty()) continue; @@ -265,17 +284,27 @@ void Core::issue() { // check scoreboard if (scoreboard_.in_use(trace)) { + auto uses = scoreboard_.get_uses(trace); if (!trace->log_once(true)) { - DTH(3, "*** scoreboard-stall: dependents={"); - auto uses = scoreboard_.get_uses(trace); + DTH(3, "*** scoreboard-stall: dependents={"); for (uint32_t j = 0, n = uses.size(); j < n; ++j) { auto& use = uses.at(j); __unused (use); if (j) DTN(3, ", "); - DTN(3, use.type << use.reg << "(#" << use.owner << ")"); + DTN(3, use.reg_type << use.reg_id << "(#" << use.uuid << ")"); } DTN(3, "}, " << *trace << std::endl); } + for (uint32_t j = 0, n = uses.size(); j < n; ++j) { + auto& use = uses.at(j); + switch (use.exe_type) { + case ExeType::ALU: ++perf_stats_.scrb_alu; break; + case ExeType::FPU: ++perf_stats_.scrb_fpu; break; + case ExeType::LSU: ++perf_stats_.scrb_lsu; break; + case ExeType::SFU: ++perf_stats_.scrb_sfu; break; + default: assert(false); + } + } ++perf_stats_.scrb_stalls; continue; } else { @@ -294,10 +323,11 @@ void Core::issue() { ibuffer.pop(); } + ibuffer_idx_ += ISSUE_WIDTH; } void Core::execute() { - for (uint32_t i = 0; i < (uint32_t)ExeType::MAX; ++i) { + for (uint32_t i = 0; i < (uint32_t)ExeType::ExeTypeCount; ++i) { auto& dispatch = dispatchers_.at(i); auto& exe_unit = exe_units_.at(i); for (uint32_t j = 0; j < ISSUE_WIDTH; ++j) { @@ -313,10 +343,11 @@ void Core::execute() { void Core::commit() { // process completed instructions for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) { - auto trace = committed_traces_.at(i); - if (!trace) + auto& commit_arb = commit_arbs_.at(i); + if (commit_arb->Outputs.at(0).empty()) continue; - committed_traces_.at(i) = nullptr; + + auto trace = commit_arb->Outputs.at(0).front(); // advance to commit stage DT(3, "pipeline-commit: " << *trace); @@ -334,27 +365,11 @@ void Core::commit() { perf_stats_.instrs += trace->tmask.count(); } + commit_arb->Outputs.at(0).pop(); + // delete the trace delete trace; } - - // select completed instructions - for (uint32_t i = 0; i < (uint32_t)ExeType::MAX; ++i) { - uint32_t ii = (commit_exe_ + i) % (uint32_t)ExeType::MAX; - auto& exe_unit = exe_units_.at(ii); - for (uint32_t j = 0; j < ISSUE_WIDTH; ++j) { - auto committed_trace = committed_traces_.at(j); - if (committed_trace) - continue; - auto& output = exe_unit->Outputs.at(j); - if (output.empty()) - continue; - auto trace = output.front(); - committed_traces_.at(j) = trace; - output.pop(); - } - } - ++commit_exe_; } void Core::wspawn(uint32_t num_warps, Word nextPC) { @@ -533,6 +548,10 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { break; case VX_DCR_MPM_CLASS_CORE: { switch (addr) { + case VX_CSR_MPM_SCHED_ST: return perf_stats_.sched_stalls & 0xffffffff; + case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32; + case VX_CSR_MPM_FETCH_ST: return perf_stats_.fetch_stalls & 0xffffffff; + case VX_CSR_MPM_FETCH_ST_H:return perf_stats_.fetch_stalls >> 32; case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff; case VX_CSR_MPM_IBUF_ST_H: return perf_stats_.ibuf_stalls >> 32; case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff; @@ -545,6 +564,14 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case VX_CSR_MPM_FPU_ST_H: return perf_stats_.fpu_stalls >> 32; case VX_CSR_MPM_SFU_ST: return perf_stats_.sfu_stalls & 0xffffffff; case VX_CSR_MPM_SFU_ST_H: return perf_stats_.sfu_stalls >> 32; + case VX_CSR_MPM_SCRB_ALU: return perf_stats_.scrb_alu & 0xffffffff; + case VX_CSR_MPM_SCRB_ALU_H:return perf_stats_.scrb_alu >> 32; + case VX_CSR_MPM_SCRB_FPU: return perf_stats_.scrb_fpu & 0xffffffff; + case VX_CSR_MPM_SCRB_FPU_H:return perf_stats_.scrb_fpu >> 32; + case VX_CSR_MPM_SCRB_LSU: return perf_stats_.scrb_lsu & 0xffffffff; + case VX_CSR_MPM_SCRB_LSU_H:return perf_stats_.scrb_lsu >> 32; + case VX_CSR_MPM_SCRB_SFU: return perf_stats_.scrb_sfu & 0xffffffff; + case VX_CSR_MPM_SCRB_SFU_H:return perf_stats_.scrb_sfu >> 32; case VX_CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff; case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32; @@ -561,30 +588,25 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case VX_DCR_MPM_CLASS_MEM: { auto proc_perf = cluster_->processor()->perf_stats(); switch (addr) { - case VX_CSR_MPM_ICACHE_READS: return proc_perf.clusters.icache.reads & 0xffffffff; - case VX_CSR_MPM_ICACHE_READS_H: return proc_perf.clusters.icache.reads >> 32; - case VX_CSR_MPM_ICACHE_MISS_R: return proc_perf.clusters.icache.read_misses & 0xffffffff; - case VX_CSR_MPM_ICACHE_MISS_R_H: return proc_perf.clusters.icache.read_misses >> 32; + case VX_CSR_MPM_ICACHE_READS: return proc_perf.clusters.icache.reads & 0xffffffff; + case VX_CSR_MPM_ICACHE_READS_H: return proc_perf.clusters.icache.reads >> 32; + case VX_CSR_MPM_ICACHE_MISS_R: return proc_perf.clusters.icache.read_misses & 0xffffffff; + case VX_CSR_MPM_ICACHE_MISS_R_H: return proc_perf.clusters.icache.read_misses >> 32; + case VX_CSR_MPM_ICACHE_MSHR_ST: return proc_perf.clusters.icache.mshr_stalls & 0xffffffff; + case VX_CSR_MPM_ICACHE_MSHR_ST_H: return proc_perf.clusters.icache.mshr_stalls >> 32; - case VX_CSR_MPM_DCACHE_READS: return proc_perf.clusters.dcache.reads & 0xffffffff; - case VX_CSR_MPM_DCACHE_READS_H: return proc_perf.clusters.dcache.reads >> 32; - case VX_CSR_MPM_DCACHE_WRITES: return proc_perf.clusters.dcache.writes & 0xffffffff; - case VX_CSR_MPM_DCACHE_WRITES_H: return proc_perf.clusters.dcache.writes >> 32; - case VX_CSR_MPM_DCACHE_MISS_R: return proc_perf.clusters.dcache.read_misses & 0xffffffff; - case VX_CSR_MPM_DCACHE_MISS_R_H: return proc_perf.clusters.dcache.read_misses >> 32; - case VX_CSR_MPM_DCACHE_MISS_W: return proc_perf.clusters.dcache.write_misses & 0xffffffff; - case VX_CSR_MPM_DCACHE_MISS_W_H: return proc_perf.clusters.dcache.write_misses >> 32; - case VX_CSR_MPM_DCACHE_BANK_ST: return proc_perf.clusters.dcache.bank_stalls & 0xffffffff; - case VX_CSR_MPM_DCACHE_BANK_ST_H:return proc_perf.clusters.dcache.bank_stalls >> 32; - case VX_CSR_MPM_DCACHE_MSHR_ST: return proc_perf.clusters.dcache.mshr_stalls & 0xffffffff; - case VX_CSR_MPM_DCACHE_MSHR_ST_H:return proc_perf.clusters.dcache.mshr_stalls >> 32; - - case VX_CSR_MPM_SMEM_READS: return proc_perf.clusters.sharedmem.reads & 0xffffffff; - case VX_CSR_MPM_SMEM_READS_H: return proc_perf.clusters.sharedmem.reads >> 32; - case VX_CSR_MPM_SMEM_WRITES: return proc_perf.clusters.sharedmem.writes & 0xffffffff; - case VX_CSR_MPM_SMEM_WRITES_H: return proc_perf.clusters.sharedmem.writes >> 32; - case VX_CSR_MPM_SMEM_BANK_ST: return proc_perf.clusters.sharedmem.bank_stalls & 0xffffffff; - case VX_CSR_MPM_SMEM_BANK_ST_H:return proc_perf.clusters.sharedmem.bank_stalls >> 32; + case VX_CSR_MPM_DCACHE_READS: return proc_perf.clusters.dcache.reads & 0xffffffff; + case VX_CSR_MPM_DCACHE_READS_H: return proc_perf.clusters.dcache.reads >> 32; + case VX_CSR_MPM_DCACHE_WRITES: return proc_perf.clusters.dcache.writes & 0xffffffff; + case VX_CSR_MPM_DCACHE_WRITES_H: return proc_perf.clusters.dcache.writes >> 32; + case VX_CSR_MPM_DCACHE_MISS_R: return proc_perf.clusters.dcache.read_misses & 0xffffffff; + case VX_CSR_MPM_DCACHE_MISS_R_H: return proc_perf.clusters.dcache.read_misses >> 32; + case VX_CSR_MPM_DCACHE_MISS_W: return proc_perf.clusters.dcache.write_misses & 0xffffffff; + case VX_CSR_MPM_DCACHE_MISS_W_H: return proc_perf.clusters.dcache.write_misses >> 32; + case VX_CSR_MPM_DCACHE_BANK_ST: return proc_perf.clusters.dcache.bank_stalls & 0xffffffff; + case VX_CSR_MPM_DCACHE_BANK_ST_H: return proc_perf.clusters.dcache.bank_stalls >> 32; + case VX_CSR_MPM_DCACHE_MSHR_ST: return proc_perf.clusters.dcache.mshr_stalls & 0xffffffff; + case VX_CSR_MPM_DCACHE_MSHR_ST_H: return proc_perf.clusters.dcache.mshr_stalls >> 32; case VX_CSR_MPM_L2CACHE_READS: return proc_perf.clusters.l2cache.reads & 0xffffffff; case VX_CSR_MPM_L2CACHE_READS_H: return proc_perf.clusters.l2cache.reads >> 32; @@ -612,12 +634,19 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case VX_CSR_MPM_L3CACHE_MSHR_ST: return proc_perf.l3cache.mshr_stalls & 0xffffffff; case VX_CSR_MPM_L3CACHE_MSHR_ST_H:return proc_perf.l3cache.mshr_stalls >> 32; - case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff; - case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32; - case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff; - case VX_CSR_MPM_MEM_WRITES_H:return proc_perf.mem_writes >> 32; - case VX_CSR_MPM_MEM_LAT: return proc_perf.mem_latency & 0xffffffff; - case VX_CSR_MPM_MEM_LAT_H: return proc_perf.mem_latency >> 32; + case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff; + case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32; + case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff; + case VX_CSR_MPM_MEM_WRITES_H: return proc_perf.mem_writes >> 32; + case VX_CSR_MPM_MEM_LAT: return proc_perf.mem_latency & 0xffffffff; + case VX_CSR_MPM_MEM_LAT_H: return proc_perf.mem_latency >> 32; + + case VX_CSR_MPM_SMEM_READS: return proc_perf.clusters.sharedmem.reads & 0xffffffff; + case VX_CSR_MPM_SMEM_READS_H: return proc_perf.clusters.sharedmem.reads >> 32; + case VX_CSR_MPM_SMEM_WRITES: return proc_perf.clusters.sharedmem.writes & 0xffffffff; + case VX_CSR_MPM_SMEM_WRITES_H: return proc_perf.clusters.sharedmem.writes >> 32; + case VX_CSR_MPM_SMEM_BANK_ST: return proc_perf.clusters.sharedmem.bank_stalls & 0xffffffff; + case VX_CSR_MPM_SMEM_BANK_ST_H: return proc_perf.clusters.sharedmem.bank_stalls >> 32; } } break; } diff --git a/sim/simx/core.h b/sim/simx/core.h index ed06574d..60290bef 100644 --- a/sim/simx/core.h +++ b/sim/simx/core.h @@ -22,11 +22,11 @@ #include #include #include +#include #include "debug.h" #include "types.h" #include "arch.h" #include "decode.h" -#include "mem.h" #include "warp.h" #include "pipeline.h" #include "cache_sim.h" @@ -42,17 +42,25 @@ namespace vortex { class Cluster; +using TraceSwitch = Mux; + class Core : public SimObject { public: struct PerfStats { uint64_t cycles; uint64_t instrs; + uint64_t sched_stalls; + uint64_t fetch_stalls; uint64_t ibuf_stalls; uint64_t scrb_stalls; uint64_t alu_stalls; uint64_t lsu_stalls; uint64_t fpu_stalls; uint64_t sfu_stalls; + uint64_t scrb_alu; + uint64_t scrb_fpu; + uint64_t scrb_lsu; + uint64_t scrb_sfu; uint64_t ifetches; uint64_t loads; uint64_t stores; @@ -62,12 +70,18 @@ public: PerfStats() : cycles(0) , instrs(0) + , sched_stalls(0) + , fetch_stalls(0) , ibuf_stalls(0) , scrb_stalls(0) , alu_stalls(0) , lsu_stalls(0) , fpu_stalls(0) , sfu_stalls(0) + , scrb_alu(0) + , scrb_fpu(0) + , scrb_lsu(0) + , scrb_sfu(0) , ifetches(0) , loads(0) , stores(0) @@ -173,7 +187,6 @@ private: PipelineLatch decode_latch_; HashTable pending_icache_; - std::vector committed_traces_; WarpMask active_warps_; WarpMask stalled_warps_; uint64_t issued_instrs_; @@ -190,7 +203,9 @@ private: Cluster* cluster_; - uint32_t commit_exe_; + std::vector commit_arbs_; + + uint32_t ibuffer_idx_; friend class Warp; friend class LsuUnit; diff --git a/sim/simx/dispatcher.h b/sim/simx/dispatcher.h index a5c44b64..fe83e9de 100644 --- a/sim/simx/dispatcher.h +++ b/sim/simx/dispatcher.h @@ -66,6 +66,7 @@ public: } auto& output = Outputs.at(i); auto trace = input.front(); + auto new_trace = trace; if (pid_count_ != 1) { auto start_p = start_p_.at(b); if (start_p == -1) { @@ -81,33 +82,30 @@ public: end = j; } start /= num_lanes_; - end /= num_lanes_; - auto new_trace = new pipeline_trace_t(*trace); - new_trace->tmask.reset(); - for (int j = start * num_lanes_, n = j + num_lanes_; j < n; ++j) { - new_trace->tmask[j] = trace->tmask[j]; - } - new_trace->pid = start; - new_trace->sop = (start_p == 0); - if (start == end) { - new_trace->eop = 1; + end /= num_lanes_; + if (start != end) { + new_trace = new pipeline_trace_t(*trace); + new_trace->eop = false; + start_p_.at(b) = start + 1; + } else { start_p_.at(b) = -1; input.pop(); ++block_sent; - delete trace; - } else { - new_trace->eop = 0; - start_p_.at(b) = start + 1; - } - output.send(new_trace, 1); - DT(3, "pipeline-dispatch: " << *new_trace); + } + new_trace->pid = start; + new_trace->sop = (0 == start_p); + ThreadMask tmask; + for (int j = start * num_lanes_, n = j + num_lanes_; j < n; ++j) { + tmask[j] = trace->tmask[j]; + } + new_trace->tmask = tmask; } else { - trace->pid = 0; + new_trace->pid = 0; input.pop(); - output.send(trace, 1); - DT(3, "pipeline-dispatch: " << *trace); ++block_sent; - } + } + DT(3, "pipeline-dispatch: " << *new_trace); + output.send(new_trace, 1); } if (block_sent == block_size_) { batch_idx_ = (batch_idx_ + 1) % batch_count_; @@ -138,4 +136,4 @@ private: std::vector start_p_; }; -} \ No newline at end of file +} diff --git a/sim/simx/processor.cpp b/sim/simx/processor.cpp index da151b62..77021dbd 100644 --- a/sim/simx/processor.cpp +++ b/sim/simx/processor.cpp @@ -32,18 +32,17 @@ ProcessorImpl::ProcessorImpl(const Arch& arch) l3cache_ = CacheSim::Create("l3cache", CacheSim::Config{ !L3_ENABLED, log2ceil(L3_CACHE_SIZE), // C - log2ceil(MEM_BLOCK_SIZE), // B - log2ceil(L3_NUM_WAYS), // W - 0, // A - XLEN, // address bits - L3_NUM_BANKS, // number of banks - 1, // number of ports + log2ceil(MEM_BLOCK_SIZE), // L + log2ceil(L3_NUM_WAYS), // W + 0, // A + log2ceil(L3_NUM_BANKS), // B + XLEN, // address bits + 1, // number of ports uint8_t(arch.num_clusters()), // request size - true, // write-through - false, // write response - 0, // victim size - L3_MSHR_SIZE, // mshr - 2, // pipeline latency + true, // write-through + false, // write response + L3_MSHR_SIZE, // mshr + 2, // pipeline latency } ); diff --git a/sim/simx/scoreboard.h b/sim/simx/scoreboard.h index 4d311846..5c247b73 100644 --- a/sim/simx/scoreboard.h +++ b/sim/simx/scoreboard.h @@ -22,9 +22,10 @@ class Scoreboard { public: struct reg_use_t { - RegType type; - uint32_t reg; - uint64_t owner; + RegType reg_type; + uint32_t reg_id; + ExeType exe_type; + uint64_t uuid; }; Scoreboard(const Arch &arch) @@ -44,89 +45,81 @@ public: owners_.clear(); } - bool in_use(pipeline_trace_t* state) const { - return (state->used_iregs & in_use_iregs_.at(state->wid)) != 0 - || (state->used_fregs & in_use_fregs_.at(state->wid)) != 0 - || (state->used_vregs & in_use_vregs_.at(state->wid)) != 0; + bool in_use(pipeline_trace_t* trace) const { + return (trace->used_iregs & in_use_iregs_.at(trace->wid)) != 0 + || (trace->used_fregs & in_use_fregs_.at(trace->wid)) != 0 + || (trace->used_vregs & in_use_vregs_.at(trace->wid)) != 0; } - std::vector get_uses(pipeline_trace_t* state) const { - std::vector out; - { - uint32_t r = 0; - auto used_iregs = state->used_iregs & in_use_iregs_.at(state->wid); - while (used_iregs.any()) { - if (used_iregs.test(0)) { - uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Integer; - out.push_back({RegType::Integer, r, owners_.at(tag)}); - } - used_iregs >>= 1; - ++r; + std::vector get_uses(pipeline_trace_t* trace) const { + std::vector out; + + auto used_iregs = trace->used_iregs & in_use_iregs_.at(trace->wid); + auto used_fregs = trace->used_fregs & in_use_fregs_.at(trace->wid); + auto used_vregs = trace->used_vregs & in_use_vregs_.at(trace->wid); + + for (uint32_t r = 0; r < MAX_NUM_REGS; ++r) { + if (used_iregs.test(r)) { + uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Integer; + auto owner = owners_.at(tag); + out.push_back({RegType::Integer, r, owner->exe_type, owner->uuid}); } } - { - uint32_t r = 0; - auto used_fregs = state->used_fregs & in_use_fregs_.at(state->wid); - while (used_fregs.any()) { - if (used_fregs.test(0)) { - uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Float; - out.push_back({RegType::Float, r, owners_.at(tag)}); - } - used_fregs >>= 1; - ++r; + + for (uint32_t r = 0; r < MAX_NUM_REGS; ++r) { + if (used_fregs.test(r)) { + uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Float; + auto owner = owners_.at(tag); + out.push_back({RegType::Float, r, owner->exe_type, owner->uuid}); } } - { - uint32_t r = 0; - auto used_vregs = state->used_vregs & in_use_vregs_.at(state->wid); - while (used_vregs.any()) { - if (used_vregs.test(0)) { - uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Vector; - out.push_back({RegType::Vector, r, owners_.at(tag)}); - } - used_vregs >>= 1; - ++r; + + for (uint32_t r = 0; r < MAX_NUM_REGS; ++r) { + if (used_vregs.test(r)) { + uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Vector; + auto owner = owners_.at(tag); + out.push_back({RegType::Vector, r, owner->exe_type, owner->uuid}); } } + return out; } - void reserve(pipeline_trace_t* state) { - assert(state->wb); - switch (state->rdest_type) { + void reserve(pipeline_trace_t* trace) { + assert(trace->wb); + switch (trace->rdest_type) { case RegType::Integer: - in_use_iregs_.at(state->wid).set(state->rdest); + in_use_iregs_.at(trace->wid).set(trace->rdest); break; case RegType::Float: - in_use_fregs_.at(state->wid).set(state->rdest); + in_use_fregs_.at(trace->wid).set(trace->rdest); break; case RegType::Vector: - in_use_vregs_.at(state->wid).set(state->rdest); - break; - default: + in_use_vregs_.at(trace->wid).set(trace->rdest); break; + default: assert(false); } - uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type; + uint32_t tag = (trace->rdest << 16) | (trace->wid << 4) | (int)trace->rdest_type; assert(owners_.count(tag) == 0); - owners_[tag] = state->uuid; + owners_[tag] = trace; + assert((int)trace->exe_type < 5); } - void release(pipeline_trace_t* state) { - assert(state->wb); - switch (state->rdest_type) { + void release(pipeline_trace_t* trace) { + assert(trace->wb); + switch (trace->rdest_type) { case RegType::Integer: - in_use_iregs_.at(state->wid).reset(state->rdest); + in_use_iregs_.at(trace->wid).reset(trace->rdest); break; case RegType::Float: - in_use_fregs_.at(state->wid).reset(state->rdest); + in_use_fregs_.at(trace->wid).reset(trace->rdest); break; case RegType::Vector: - in_use_vregs_.at(state->wid).reset(state->rdest); - break; - default: + in_use_vregs_.at(trace->wid).reset(trace->rdest); break; + default: assert(false); } - uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type; + uint32_t tag = (trace->rdest << 16) | (trace->wid << 4) | (int)trace->rdest_type; owners_.erase(tag); } @@ -135,7 +128,7 @@ private: std::vector in_use_iregs_; std::vector in_use_fregs_; std::vector in_use_vregs_; - std::unordered_map owners_; + std::unordered_map owners_; }; } \ No newline at end of file diff --git a/sim/simx/types.h b/sim/simx/types.h index 88b3ce0e..6bba7f9c 100644 --- a/sim/simx/types.h +++ b/sim/simx/types.h @@ -81,7 +81,7 @@ enum class ExeType { LSU, FPU, SFU, - MAX, + ExeTypeCount }; inline std::ostream &operator<<(std::ostream &os, const ExeType& type) { @@ -90,7 +90,7 @@ inline std::ostream &operator<<(std::ostream &os, const ExeType& type) { case ExeType::LSU: os << "LSU"; break; case ExeType::FPU: os << "FPU"; break; case ExeType::SFU: os << "SFU"; break; - case ExeType::MAX: break; + default: assert(false); } return os; } @@ -138,7 +138,7 @@ inline std::ostream &operator<<(std::ostream &os, const LsuType& type) { enum class AddrType { Global, Shared, - IO, + IO }; inline std::ostream &operator<<(std::ostream &os, const AddrType& type) { @@ -164,7 +164,7 @@ enum class FpuType { FMA, FDIV, FSQRT, - FCVT, + FCVT }; inline std::ostream &operator<<(std::ostream &os, const FpuType& type) { @@ -190,7 +190,7 @@ enum class SfuType { CSRRW, CSRRS, CSRRC, - CMOV + CMOV }; inline std::ostream &operator<<(std::ostream &os, const SfuType& type) { @@ -351,6 +351,92 @@ private: /////////////////////////////////////////////////////////////////////////////// +template +class Mux : public SimObject> { +public: + std::vector> Inputs; + std::vector> Outputs; + + Mux( + const SimContext& ctx, + const char* name, + ArbiterType type, + uint32_t num_inputs, + uint32_t num_outputs = 1, + uint32_t delay = 1 + ) : SimObject>(ctx, name) + , Inputs(num_inputs, this) + , Outputs(num_outputs, this) + , type_(type) + , delay_(delay) + , cursors_(num_outputs, 0) + , num_reqs_(num_inputs / num_outputs) + { + assert(delay != 0); + assert(num_inputs <= 32); + assert(num_outputs <= 32); + assert(num_inputs >= num_outputs); + + // bypass mode + if (num_inputs == num_outputs) { + for (uint32_t i = 0; i < num_inputs; ++i) { + Inputs.at(i).bind(&Outputs.at(i)); + } + } + } + + void reset() { + for (auto& cursor : cursors_) { + cursor = 0; + } + } + + void tick() { + uint32_t I = Inputs.size(); + uint32_t O = Outputs.size(); + uint32_t R = num_reqs_; + + // skip bypass mode + if (I == O) + return; + + // process inputs + for (uint32_t o = 0; o < O; ++o) { + for (uint32_t r = 0; r < R; ++r) { + uint32_t i = (cursors_.at(o) + r) & (R-1); + uint32_t j = o * R + i; + if (j >= I) + continue; + + auto& req_in = Inputs.at(j); + if (!req_in.empty()) { + auto& req = req_in.front(); + DT(4, this->name() << "-" << req); + Outputs.at(o).send(req, delay_); + req_in.pop(); + this->update_cursor(o, i); + break; + } + } + } + } + +private: + + void update_cursor(uint32_t index, uint32_t grant) { + if (type_ == ArbiterType::RoundRobin) { + cursors_.at(index) = grant + 1; + } + } + + ArbiterType type_; + uint32_t delay_; + std::vector cursors_; + uint32_t num_reqs_; +}; + +/////////////////////////////////////////////////////////////////////////////// + template class Switch : public SimObject> { public: @@ -364,13 +450,13 @@ public: const SimContext& ctx, const char* name, ArbiterType type, - uint32_t num_inputs = 1, + uint32_t num_inputs, uint32_t num_outputs = 1, uint32_t delay = 1 ) : SimObject>(ctx, name) - , ReqIn(num_inputs, this) - , RspIn(num_inputs, this) + , ReqIn(num_inputs, this) + , RspIn(num_inputs, this) , ReqOut(num_outputs, this) , RspOut(num_outputs, this) , type_(type) @@ -383,8 +469,8 @@ public: assert(num_outputs <= 32); assert(num_inputs >= num_outputs); + // bypass mode if (num_inputs == num_outputs) { - // bypass mode for (uint32_t i = 0; i < num_inputs; ++i) { ReqIn.at(i).bind(&ReqOut.at(i)); RspOut.at(i).bind(&RspIn.at(i)); @@ -462,14 +548,14 @@ private: class SMemDemux : public SimObject { public: - SimPort ReqIn; - SimPort RspIn; + SimPort ReqIn; + SimPort RspIn; - SimPort ReqSm; - SimPort RspSm; + SimPort ReqSM; + SimPort RspSM; - SimPort ReqDc; - SimPort RspDc; + SimPort ReqDC; + SimPort RspDC; SMemDemux( const SimContext& ctx, @@ -478,45 +564,49 @@ public: ) : SimObject(ctx, name) , ReqIn(this) , RspIn(this) - , ReqSm(this) - , RspSm(this) - , ReqDc(this) - , RspDc(this) + , ReqSM(this) + , RspSM(this) + , ReqDC(this) + , RspDC(this) , delay_(delay) {} void reset() {} - void tick() { + void tick() { + // process incoming reponses + if (!RspSM.empty()) { + auto& rsp = RspSM.front(); + DT(4, this->name() << "-" << rsp); + RspIn.send(rsp, 1); + RspSM.pop(); + } + if (!RspDC.empty()) { + auto& rsp = RspDC.front(); + DT(4, this->name() << "-" << rsp); + RspIn.send(rsp, 1); + RspDC + .pop(); + } // process incomming requests if (!ReqIn.empty()) { auto& req = ReqIn.front(); DT(4, this->name() << "-" << req); if (req.type == AddrType::Shared) { - ReqSm.send(req, delay_); + ReqSM.send(req, delay_); } else { - ReqDc.send(req, delay_); + ReqDC.send(req, delay_); } ReqIn.pop(); } - - // process incoming reponses - if (!RspSm.empty()) { - auto& rsp = RspSm.front(); - DT(4, this->name() << "-" << rsp); - RspIn.send(rsp, 1); - RspSm.pop(); - } - if (!RspDc.empty()) { - auto& rsp = RspDc.front(); - DT(4, this->name() << "-" << rsp); - RspIn.send(rsp, 1); - RspDc.pop(); - } } private: uint32_t delay_; }; -} \ No newline at end of file +/////////////////////////////////////////////////////////////////////////////// + +using MemSwitch = Switch; + +}