diff --git a/Makefile b/Makefile index 410dc008..58c66a48 100644 --- a/Makefile +++ b/Makefile @@ -6,6 +6,11 @@ all: $(MAKE) -C simX $(MAKE) -C benchmarks/opencl +perf-demo: + $(MAKE) -C hw + $(MAKE) -C driver rtlsim + $(MAKE) -C driver/tests/demo/ run-rtlsim + clean: $(MAKE) -C hw clean $(MAKE) -C driver clean diff --git a/driver/include/vortex.h b/driver/include/vortex.h index e115834d..24cd9070 100644 --- a/driver/include/vortex.h +++ b/driver/include/vortex.h @@ -62,7 +62,8 @@ int vx_csr_set(vx_device_h hdevice, int core_id, int addr, unsigned value); // get device constant registers int vx_csr_get(vx_device_h hdevice, int core_id, int addr, unsigned* value); - +// read a 64-bit device register exposed as two 32-bit CSRs (addr = low word, addr_h = high word) +int vx_csr_get_l(vx_device_h hdevice, int core_id, int addr, int addr_h, uint64_t* value); ////////////////////////////// UTILITY FUNCIONS /////////////////////////////// // upload kernel bytes to device diff --git a/driver/rtlsim/vortex.cpp b/driver/rtlsim/vortex.cpp index 2420fd03..b71e410f 100644 --- a/driver/rtlsim/vortex.cpp +++ b/driver/rtlsim/vortex.cpp @@ -243,6 +243,32 @@ extern int vx_dev_close(vx_device_h hdevice) { vx_csr_get(hdevice, 0, CSR_NC, &num_cores); if (num_cores > 1) { uint64_t total_instrs = 0, total_cycles = 0; + // ------------------------- + #ifdef PERF_ENABLE + // PERF: cache + uint64_t total_r = 0; + uint64_t total_w = 0; + uint64_t dram_st = 0; + uint64_t dram_lat = 0; + uint64_t dram_rsp = 0; + uint64_t msrq_st = 0; + uint64_t total_st = 0; + uint64_t r_miss = 0; + uint64_t w_miss = 0; + uint64_t core_rsp_st = 0; + uint64_t total_evict = 0; + // PERF: pipeline stalls + uint64_t lsu_stall = 0; + uint64_t fpu_stall = 0; + uint64_t mul_stall = 0; + uint64_t csr_stall = 0; + uint64_t alu_stall = 0; + uint64_t gpu_stall = 0; + uint64_t ibuffer_stall = 0; + uint64_t scoreboard_stall = 0; + uint64_t icache_stall = 0; + #endif + // ------------------------- for (unsigned core_id = 0; core_id < num_cores; ++core_id) { uint64_t 
instrs, cycles; vx_get_perf(hdevice, core_id, &instrs, &cycles); @@ -250,14 +276,235 @@ extern int vx_dev_close(vx_device_h hdevice) { fprintf(stdout, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs, cycles, IPC); total_instrs += instrs; total_cycles = std::max(total_cycles, cycles); + + #ifdef PERF_ENABLE + // PERF: cache + // total_read + uint64_t total_r_per_core; + vx_csr_get_l(hdevice, core_id, CSR_TOTAL_R, CSR_TOTAL_R_H, &total_r_per_core); + fprintf(stdout, "PERF: \t\ttotal_reads_per_core=%ld\n", total_r_per_core); + total_r += total_r_per_core; + // total_write + uint64_t total_w_per_core; + vx_csr_get_l(hdevice, core_id, CSR_TOTAL_W, CSR_TOTAL_W_H, &total_w_per_core); + fprintf(stdout, "PERF: \t\ttotal_writes_per_core=%ld\n", total_w_per_core); + total_w += total_w_per_core; + // dram_stall + uint64_t dram_st_per_core; + vx_csr_get_l(hdevice, core_id, CSR_DRAM_ST, CSR_DRAM_ST_H, &dram_st_per_core); + fprintf(stdout, "PERF: \t\tdram_stalls_per_core=%ld\n", dram_st_per_core); + dram_st += dram_st_per_core; + // dram_latency + uint64_t dram_lat_per_core, dram_rsp_per_core; + vx_csr_get_l(hdevice, core_id, CSR_DRAM_LAT, CSR_DRAM_LAT_H, &dram_lat_per_core); + vx_csr_get_l(hdevice, core_id, CSR_DRAM_RSP, CSR_DRAM_RSP_H, &dram_rsp_per_core); + fprintf(stdout, "PERF: \t\tdram_latency_per_core=%ld\n", dram_lat_per_core); + fprintf(stdout, "PERF: \t\tdram_response_per_core=%ld\n", dram_rsp_per_core); + dram_lat += dram_lat_per_core; + dram_rsp += dram_rsp_per_core; + float dram_lat_per_rsp_per_core = (float)(double(dram_lat_per_core) / double(dram_rsp_per_core)); + fprintf(stdout, "PERF: \t\tdram_latency_per_response_per_core=%f\n", dram_lat_per_rsp_per_core); + // miss_reserve_queue_stall + uint64_t msrq_st_per_core; + vx_csr_get_l(hdevice, core_id, CSR_MSRQ_ST, CSR_MSRQ_ST_H, &msrq_st_per_core); + fprintf(stdout, "PERF: \t\tmsrq_stalls_per_core=%ld\n", msrq_st_per_core); + msrq_st += msrq_st_per_core; + // total_stall + uint64_t 
total_st_per_core; + vx_csr_get_l(hdevice, core_id, CSR_TOTAL_ST, CSR_TOTAL_ST_H, &total_st_per_core); + fprintf(stdout, "PERF: \t\ttotal_stalls_per_core=%ld\n", total_st_per_core); + total_st += total_st_per_core; + // read_miss + uint64_t r_miss_per_core; + vx_csr_get_l(hdevice, core_id, CSR_R_MISS, CSR_R_MISS_H, &r_miss_per_core); + fprintf(stdout, "PERF: \t\tread_misses_per_core=%ld\n", r_miss_per_core); + r_miss += r_miss_per_core; + // write_miss + uint64_t w_miss_per_core; + vx_csr_get_l(hdevice, core_id, CSR_W_MISS, CSR_W_MISS_H, &w_miss_per_core); + fprintf(stdout, "PERF: \t\twrite_misses_per_core=%ld\n", w_miss_per_core); + w_miss += w_miss_per_core; + // core_rsp_stalls + uint64_t core_rsp_st_per_core; + vx_csr_get_l(hdevice, core_id, CSR_CORE_RSP_ST, CSR_CORE_RSP_ST_H, &core_rsp_st_per_core); + fprintf(stdout, "PERF: \t\tcore_rsp_stalls_per_core=%ld\n", core_rsp_st_per_core); + core_rsp_st += core_rsp_st_per_core; + // total_evictions + uint64_t total_evict_per_core; + vx_csr_get_l(hdevice, core_id, CSR_TOTAL_EV, CSR_TOTAL_EV_H, &total_evict_per_core); + fprintf(stdout, "PERF: \t\ttotal_evictions_per_core=%ld\n", total_evict_per_core); + total_evict += total_evict_per_core; + // PERF: pipeline stall + // lsu_stall + uint64_t lsu_stall_per_core; + vx_csr_get_l(hdevice, core_id, CSR_LSU_ST, CSR_LSU_ST_H, &lsu_stall_per_core); + fprintf(stdout, "PERF: \t\tlsu_stall=%ld\n", lsu_stall_per_core); + lsu_stall += lsu_stall_per_core; + // fpu_stall + uint64_t fpu_stall_per_core; + vx_csr_get_l(hdevice, core_id, CSR_FPU_ST, CSR_FPU_ST_H, &fpu_stall_per_core); + fprintf(stdout, "PERF: \t\tfpu_stall=%ld\n", fpu_stall_per_core); + fpu_stall += fpu_stall_per_core; + // mul_stall + uint64_t mul_stall_per_core; + vx_csr_get_l(hdevice, core_id, CSR_MUL_ST, CSR_MUL_ST_H, &mul_stall_per_core); + fprintf(stdout, "PERF: \t\tmul_stall=%ld\n", mul_stall_per_core); + mul_stall += mul_stall_per_core; + // csr_stall + uint64_t csr_stall_per_core; + vx_csr_get_l(hdevice, core_id, 
CSR_CSR_ST, CSR_CSR_ST_H, &csr_stall_per_core); + fprintf(stdout, "PERF: \t\tcsr_stall=%ld\n", csr_stall_per_core); + csr_stall += csr_stall_per_core; + // alu_stall + uint64_t alu_stall_per_core; + vx_csr_get_l(hdevice, core_id, CSR_ALU_ST, CSR_ALU_ST_H, &alu_stall_per_core); + fprintf(stdout, "PERF: \t\talu_stall=%ld\n", alu_stall_per_core); + alu_stall += alu_stall_per_core; + // gpu_stall + uint64_t gpu_stall_per_core; + vx_csr_get_l(hdevice, core_id, CSR_GPU_ST, CSR_GPU_ST_H, &gpu_stall_per_core); + fprintf(stdout, "PERF: \t\tgpu_stall=%ld\n", gpu_stall_per_core); + gpu_stall += gpu_stall_per_core; + // ibuffer_stall + uint64_t ibuffer_stall_per_core; + vx_csr_get_l(hdevice, core_id, CSR_IBUF_ST, CSR_IBUF_ST_H, &ibuffer_stall_per_core); + fprintf(stdout, "PERF: \t\tibuffer_stall=%ld\n", ibuffer_stall_per_core); + ibuffer_stall += ibuffer_stall_per_core; + // scoreboard_stall + uint64_t scoreboard_stall_per_core; + vx_csr_get_l(hdevice, core_id, CSR_SCRBRD_ST, CSR_SCRBRD_ST_H, &scoreboard_stall_per_core); + fprintf(stdout, "PERF: \t\tscoreboard_stall=%ld\n", scoreboard_stall_per_core); + scoreboard_stall += scoreboard_stall_per_core; + // icache_stall + uint64_t icache_stall_per_core; + vx_csr_get_l(hdevice, core_id, CSR_ICACHE_ST, CSR_ICACHE_ST_H, &icache_stall_per_core); + fprintf(stdout, "PERF: \t\ticache_stall=%ld\n", icache_stall_per_core); + icache_stall += icache_stall_per_core; + #endif + // ------------------------- } float IPC = (float)(double(total_instrs) / double(total_cycles)); fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, total_cycles, IPC); + + #ifdef PERF_ENABLE + // PERF: cache + fprintf(stdout, "PERF: \t\ttotal_reads=%ld\n", total_r); + fprintf(stdout, "PERF: \t\ttotal_writes=%ld\n", total_w); + fprintf(stdout, "PERF: \t\tdram_stalls=%ld\n", dram_st); + fprintf(stdout, "PERF: \t\tdram_latency=%ld\n", dram_lat); + fprintf(stdout, "PERF: \t\tdram_response=%ld\n", dram_rsp); + float dram_lat_per_rsp = 
(float)(double(dram_lat) / double(dram_rsp)); + fprintf(stdout, "PERF: \t\tdram_latency_per_response=%f\n", dram_lat_per_rsp); + fprintf(stdout, "PERF: \t\tmsrq_stalls=%ld\n", msrq_st); + fprintf(stdout, "PERF: \t\ttotal_stalls=%ld\n", total_st); + fprintf(stdout, "PERF: \t\tread_misses=%ld\n", r_miss); + fprintf(stdout, "PERF: \t\twrite_misses=%ld\n", w_miss); + fprintf(stdout, "PERF: \t\tcore_rsp_stalls=%ld\n", core_rsp_st); + fprintf(stdout, "PERF: \t\ttotal_evictions=%ld\n", total_evict); + // PERF: pipeline stall + fprintf(stdout, "PERF: \t\tlsu_stall=%ld\n", lsu_stall); + fprintf(stdout, "PERF: \t\tfpu_stall=%ld\n", fpu_stall); + fprintf(stdout, "PERF: \t\tmul_stall=%ld\n", mul_stall); + fprintf(stdout, "PERF: \t\tcsr_stall=%ld\n", csr_stall); + fprintf(stdout, "PERF: \t\talu_stall=%ld\n", alu_stall); + fprintf(stdout, "PERF: \t\tgpu_stall=%ld\n", gpu_stall); + fprintf(stdout, "PERF: \t\tibuffer_stall=%ld\n", ibuffer_stall); + fprintf(stdout, "PERF: \t\tscoreboard_stall=%ld\n", scoreboard_stall); + fprintf(stdout, "PERF: \t\ticache_stall=%ld\n", icache_stall); + #endif + // ------------------------- } else { uint64_t instrs, cycles; vx_get_perf(hdevice, 0, &instrs, &cycles); float IPC = (float)(double(instrs) / double(cycles)); fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC); + + #ifdef PERF_ENABLE + // PERF: cache + // total_read + uint64_t total_r; + vx_csr_get_l(hdevice, 0, CSR_TOTAL_R, CSR_TOTAL_R_H, &total_r); + fprintf(stdout, "PERF: \t\ttotal_reads=%ld\n", total_r); + // total_write + uint64_t total_w; + vx_csr_get_l(hdevice, 0, CSR_TOTAL_W, CSR_TOTAL_W_H, &total_w); + fprintf(stdout, "PERF: \t\ttotal_writes=%ld\n", total_w); + // dram_stall + uint64_t dram_st; + vx_csr_get_l(hdevice, 0, CSR_DRAM_ST, CSR_DRAM_ST_H, &dram_st); + fprintf(stdout, "PERF: \t\tdram_stalls=%ld\n", dram_st); + // dram_latency + uint64_t dram_lat, dram_rsp; + vx_csr_get_l(hdevice, 0, CSR_DRAM_LAT, CSR_DRAM_LAT_H, &dram_lat); + 
vx_csr_get_l(hdevice, 0, CSR_DRAM_RSP, CSR_DRAM_RSP_H, &dram_rsp); + float dram_lat_per_rsp = (float)(double(dram_lat) / double(dram_rsp)); + fprintf(stdout, "PERF: \t\tdram_latency=%ld\n", dram_lat); + fprintf(stdout, "PERF: \t\tdram_response=%ld\n", dram_rsp); + fprintf(stdout, "PERF: \t\tdram_latency_per_response=%f\n", dram_lat_per_rsp); + // miss_reserve_queue_stall + uint64_t msrq_st; + vx_csr_get_l(hdevice, 0, CSR_MSRQ_ST, CSR_MSRQ_ST_H, &msrq_st); + fprintf(stdout, "PERF: \t\tmsrq_stalls=%ld\n", msrq_st); + // total_stall + uint64_t total_st; + vx_csr_get_l(hdevice, 0, CSR_TOTAL_ST, CSR_TOTAL_ST_H, &total_st); + fprintf(stdout, "PERF: \t\ttotal_stalls=%ld\n", total_st); + // read_miss + uint64_t r_miss; + vx_csr_get_l(hdevice, 0, CSR_R_MISS, CSR_R_MISS_H, &r_miss); + fprintf(stdout, "PERF: \t\tread_misses=%ld\n", r_miss); + // write_miss + uint64_t w_miss; + vx_csr_get_l(hdevice, 0, CSR_W_MISS, CSR_W_MISS_H, &w_miss); + fprintf(stdout, "PERF: \t\twrite_misses=%ld\n", w_miss); + // core_rsp_stalls + uint64_t core_rsp_st; + vx_csr_get_l(hdevice, 0, CSR_CORE_RSP_ST, CSR_CORE_RSP_ST_H, &core_rsp_st); + fprintf(stdout, "PERF: \t\tcore_rsp_stalls=%ld\n", core_rsp_st); + // total_evictions + uint64_t total_evict; + vx_csr_get_l(hdevice, 0, CSR_TOTAL_EV, CSR_TOTAL_EV_H, &total_evict); + fprintf(stdout, "PERF: \t\ttotal_evictions=%ld\n", total_evict); + // PERF: pipeline stalls + // TODO: + // lsu_stall + uint64_t lsu_stall; + vx_csr_get_l(hdevice, 0, CSR_LSU_ST, CSR_LSU_ST_H, &lsu_stall); + fprintf(stdout, "PERF: \t\tlsu_stall=%ld\n", lsu_stall); + // fpu_stall + uint64_t fpu_stall; + vx_csr_get_l(hdevice, 0, CSR_FPU_ST, CSR_FPU_ST_H, &fpu_stall); + fprintf(stdout, "PERF: \t\tfpu_stall=%ld\n", fpu_stall); + // mul_stall + uint64_t mul_stall; + vx_csr_get_l(hdevice, 0, CSR_MUL_ST, CSR_MUL_ST_H, &mul_stall); + fprintf(stdout, "PERF: \t\tmul_stall=%ld\n", mul_stall); + // csr_stall + uint64_t csr_stall; + vx_csr_get_l(hdevice, 0, CSR_CSR_ST, CSR_CSR_ST_H, &csr_stall); + 
fprintf(stdout, "PERF: \t\tcsr_stall=%ld\n", csr_stall); + // alu_stall + uint64_t alu_stall; + vx_csr_get_l(hdevice, 0, CSR_ALU_ST, CSR_ALU_ST_H, &alu_stall); + fprintf(stdout, "PERF: \t\talu_stall=%ld\n", alu_stall); + // gpu_stall + uint64_t gpu_stall; + vx_csr_get_l(hdevice, 0, CSR_GPU_ST, CSR_GPU_ST_H, &gpu_stall); + fprintf(stdout, "PERF: \t\tgpu_stall=%ld\n", gpu_stall); + // ibuffer_stall + uint64_t ibuffer_stall; + vx_csr_get_l(hdevice, 0, CSR_IBUF_ST, CSR_IBUF_ST_H, &ibuffer_stall); + fprintf(stdout, "PERF: \t\tibuffer_stall=%ld\n", ibuffer_stall); + // scoreboard_stall + uint64_t scoreboard_stall; + vx_csr_get_l(hdevice, 0, CSR_SCRBRD_ST, CSR_SCRBRD_ST_H, &scoreboard_stall); + fprintf(stdout, "PERF: \t\tscoreboard_stall=%ld\n", scoreboard_stall); + // icache_stall + uint64_t icache_stall; + vx_csr_get_l(hdevice, 0, CSR_ICACHE_ST, CSR_ICACHE_ST_H, &icache_stall); + fprintf(stdout, "PERF: \t\ticache_stall=%ld\n", icache_stall); + #endif + // ------------------------- } #endif @@ -386,4 +633,28 @@ extern int vx_csr_get(vx_device_h hdevice, int core_id, int addr, unsigned* valu vx_device *device = ((vx_device*)hdevice); return device->get_csr(core_id, addr, value); +} + +// Read a 64-bit counter exposed as two 32-bit CSRs (addr = low word, addr_h = high word). +// Returns 0 on success, or non-zero if an argument is null or a CSR read fails; +// *value is written only on success. +extern int vx_csr_get_l(vx_device_h hdevice, int core_id, int addr, int addr_h, uint64_t* value) { + if (nullptr == hdevice || nullptr == value) + return -1; + + // Read high, low, then high again: if the low word carried into the high + // word between the two reads, re-read the low word so both halves match. + unsigned hi = 0, lo = 0, hi_check = 0; + int ret = vx_csr_get(hdevice, core_id, addr_h, &hi); + if (ret == 0) + ret = vx_csr_get(hdevice, core_id, addr, &lo); + if (ret == 0) + ret = vx_csr_get(hdevice, core_id, addr_h, &hi_check); + if (ret == 0 && hi_check != hi) + ret = vx_csr_get(hdevice, core_id, addr, &lo); + if (ret != 0) + return ret; + + *value = (uint64_t(hi_check) << 32) | lo; + return 0; } \ No newline at end of file diff --git a/hw/rtl/VX_cluster.v b/hw/rtl/VX_cluster.v index 5ff4a768..3ab494bd 100644 --- a/hw/rtl/VX_cluster.v +++ b/hw/rtl/VX_cluster.v @@ -369,6 +369,11 @@ module VX_cluster #( .core_rsp_tag (core_dram_rsp_tag), .core_rsp_ready (core_dram_rsp_ready), + // PERF: total read + `ifdef PERF_ENABLE + `UNUSED_PIN (perf_cache_if), + `endif + // DRAM request .dram_req_valid (dram_req_valid), .dram_req_rw (dram_req_rw), diff --git 
a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index bc7a2558..609977fe 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -124,6 +124,12 @@ `define LATENCY_FCONV 3 `endif +/////////////////////////////////////// +`ifndef PERF_ENABLE +`define PERF_ENABLE +`endif +/////////////////////////////////////// + // CSR Addresses ////////////////////////////////////////////////////////////// `define CSR_FFLAGS 12'h001 @@ -139,6 +145,52 @@ `define CSR_NW 12'h026 `define CSR_NC 12'h027 +// PERF: cache +`define CSR_R_MISS 12'h030 // read misses +`define CSR_R_MISS_H 12'h031 +`define CSR_W_MISS 12'h032 // write misses +`define CSR_W_MISS_H 12'h033 +`define CSR_DRAM_ST 12'h034 // dram stalls +`define CSR_DRAM_ST_H 12'h035 +`define CSR_CORE_RSP_ST 12'h036 // core_rsp stalls +`define CSR_CORE_RSP_ST_H 12'h037 +`define CSR_MSRQ_ST 12'h038 // miss reserve queue stalls +`define CSR_MSRQ_ST_H 12'h039 +`define CSR_TOTAL_ST 12'h03A // total stalls +`define CSR_TOTAL_ST_H 12'h03B +`define CSR_TOTAL_R 12'h03C // total reads +`define CSR_TOTAL_R_H 12'h03D +`define CSR_TOTAL_W 12'h03E // total writes +`define CSR_TOTAL_W_H 12'h03F +`define CSR_TOTAL_EV 12'h040 // total evictions +`define CSR_TOTAL_EV_H 12'h041 +`define CSR_DRAM_LAT 12'h042 // dram latency (total) +`define CSR_DRAM_LAT_H 12'h043 +`define CSR_DRAM_RSP 12'h044 // dram responses +`define CSR_DRAM_RSP_H 12'h045 +// PERF: pipeline stalls +`define CSR_FPU_ST 12'h046 +`define CSR_FPU_ST_H 12'h047 +`define CSR_MUL_ST 12'h048 +`define CSR_MUL_ST_H 12'h049 +`define CSR_CSR_ST 12'h04A +`define CSR_CSR_ST_H 12'h04B +`define CSR_ALU_ST 12'h04C +`define CSR_ALU_ST_H 12'h04D +`define CSR_GPU_ST 12'h04E +`define CSR_GPU_ST_H 12'h04F +`define CSR_LSU_ST 12'h050 +`define CSR_LSU_ST_H 12'h051 +`define CSR_IBUF_ST 12'h052 +`define CSR_IBUF_ST_H 12'h053 +`define CSR_SCRBRD_ST 12'h054 +`define CSR_SCRBRD_ST_H 12'h055 +`define CSR_ICACHE_ST 12'h056 +`define CSR_ICACHE_ST_H 12'h057 + + 
+////////////////////////////////////////////////////////////// + `define CSR_SATP 12'h180 `define CSR_PMPCFG0 12'h3A0 diff --git a/hw/rtl/VX_core.v b/hw/rtl/VX_core.v index 08f05413..8bd005a8 100644 --- a/hw/rtl/VX_core.v +++ b/hw/rtl/VX_core.v @@ -66,6 +66,10 @@ module VX_core #( output wire busy, output wire ebreak ); + `ifdef PERF_ENABLE + VX_perf_cache_if perf_cache_if(); + `endif + VX_cache_dram_req_if #( .DRAM_LINE_WIDTH(`DDRAM_LINE_WIDTH), .DRAM_ADDR_WIDTH(`DDRAM_ADDR_WIDTH), @@ -220,6 +224,11 @@ module VX_core #( .csr_io_rsp_data (csr_io_rsp_data), .csr_io_rsp_ready (csr_io_rsp_ready), + // PERF: total reads + `ifdef PERF_ENABLE + .perf_cache_if (perf_cache_if), + `endif + // Status .busy(busy), .ebreak(ebreak) @@ -238,6 +247,11 @@ module VX_core #( // Core <-> Dcache .core_dcache_req_if (core_dcache_req_if), .core_dcache_rsp_if (core_dcache_rsp_if), + + // PERF: total reads + `ifdef PERF_ENABLE + .perf_cache_if (perf_cache_if), + `endif // Core <-> Icache .core_icache_req_if (core_icache_req_if), diff --git a/hw/rtl/VX_csr_data.v b/hw/rtl/VX_csr_data.v index b150f281..538faa8e 100644 --- a/hw/rtl/VX_csr_data.v +++ b/hw/rtl/VX_csr_data.v @@ -6,6 +6,12 @@ module VX_csr_data #( input wire clk, input wire reset, + // PERF: total reads + `ifdef PERF_ENABLE + VX_perf_cache_if perf_cache_if, + VX_perf_pipeline_stall_if perf_pipeline_stall_if, + `endif + VX_cmt_to_csr_if cmt_to_csr_if, VX_fpu_to_csr_if fpu_to_csr_if, @@ -114,6 +120,51 @@ module VX_csr_data #( `CSR_NW : read_data_r = `NUM_WARPS; `CSR_NC : read_data_r = `NUM_CORES * `NUM_CLUSTERS; + `ifdef PERF_ENABLE + // PERF: cache + `CSR_R_MISS : read_data_r = perf_cache_if.read_miss[31:0]; + `CSR_R_MISS_H : read_data_r = perf_cache_if.read_miss[63:32]; + `CSR_W_MISS : read_data_r = perf_cache_if.write_miss[31:0]; + `CSR_W_MISS_H : read_data_r = perf_cache_if.write_miss[63:32]; + `CSR_DRAM_ST : read_data_r = perf_cache_if.dram_stall[31:0]; + `CSR_DRAM_ST_H : read_data_r = perf_cache_if.dram_stall[63:32]; + 
`CSR_CORE_RSP_ST : read_data_r = perf_cache_if.core_rsp_stall[31:0]; + `CSR_CORE_RSP_ST_H: read_data_r = perf_cache_if.core_rsp_stall[63:32]; + `CSR_MSRQ_ST : read_data_r = perf_cache_if.msrq_stall[31:0]; + `CSR_MSRQ_ST_H : read_data_r = perf_cache_if.msrq_stall[63:32]; + `CSR_TOTAL_ST : read_data_r = perf_cache_if.total_stall[31:0]; + `CSR_TOTAL_ST_H : read_data_r = perf_cache_if.total_stall[63:32]; + `CSR_TOTAL_R : read_data_r = perf_cache_if.total_read[31:0]; + `CSR_TOTAL_R_H : read_data_r = perf_cache_if.total_read[63:32]; + `CSR_TOTAL_W : read_data_r = perf_cache_if.total_write[31:0]; + `CSR_TOTAL_W_H : read_data_r = perf_cache_if.total_write[63:32]; + `CSR_TOTAL_EV : read_data_r = perf_cache_if.total_eviction[31:0]; + `CSR_TOTAL_EV_H : read_data_r = perf_cache_if.total_eviction[63:32]; + `CSR_DRAM_LAT : read_data_r = perf_cache_if.dram_latency[31:0]; + `CSR_DRAM_LAT_H : read_data_r = perf_cache_if.dram_latency[63:32]; + `CSR_DRAM_RSP : read_data_r = perf_cache_if.dram_rsp[31:0]; + `CSR_DRAM_RSP_H : read_data_r = perf_cache_if.dram_rsp[63:32]; + // PERF: pipeline stalls + `CSR_LSU_ST : read_data_r = perf_pipeline_stall_if.lsu_stall[31:0]; + `CSR_LSU_ST_H : read_data_r = perf_pipeline_stall_if.lsu_stall[63:32]; + `CSR_FPU_ST : read_data_r = perf_pipeline_stall_if.fpu_stall[31:0]; + `CSR_FPU_ST_H : read_data_r = perf_pipeline_stall_if.fpu_stall[63:32]; + `CSR_MUL_ST : read_data_r = perf_pipeline_stall_if.mul_stall[31:0]; + `CSR_MUL_ST_H : read_data_r = perf_pipeline_stall_if.mul_stall[63:32]; + `CSR_CSR_ST : read_data_r = perf_pipeline_stall_if.csr_stall[31:0]; + `CSR_CSR_ST_H : read_data_r = perf_pipeline_stall_if.csr_stall[63:32]; + `CSR_ALU_ST : read_data_r = perf_pipeline_stall_if.alu_stall[31:0]; + `CSR_ALU_ST_H : read_data_r = perf_pipeline_stall_if.alu_stall[63:32]; + `CSR_GPU_ST : read_data_r = perf_pipeline_stall_if.gpu_stall[31:0]; + `CSR_GPU_ST_H : read_data_r = perf_pipeline_stall_if.gpu_stall[63:32]; + `CSR_IBUF_ST : read_data_r = 
perf_pipeline_stall_if.ibuffer_stall[31:0]; + `CSR_IBUF_ST_H : read_data_r = perf_pipeline_stall_if.ibuffer_stall[63:32]; + `CSR_SCRBRD_ST : read_data_r = perf_pipeline_stall_if.scoreboard_stall[31:0]; + `CSR_SCRBRD_ST_H : read_data_r = perf_pipeline_stall_if.scoreboard_stall[63:32]; + `CSR_ICACHE_ST : read_data_r = perf_pipeline_stall_if.icache_stall[31:0]; + `CSR_ICACHE_ST_H : read_data_r = perf_pipeline_stall_if.icache_stall[63:32]; + `endif + `CSR_SATP : read_data_r = 32'(csr_satp); `CSR_MSTATUS : read_data_r = 32'(csr_mstatus); diff --git a/hw/rtl/VX_csr_unit.v b/hw/rtl/VX_csr_unit.v index b2fcc54c..e2177f74 100644 --- a/hw/rtl/VX_csr_unit.v +++ b/hw/rtl/VX_csr_unit.v @@ -6,6 +6,12 @@ module VX_csr_unit #( input wire clk, input wire reset, + // PERF: total reads + `ifdef PERF_ENABLE + VX_perf_cache_if perf_cache_if, + VX_perf_pipeline_stall_if perf_pipeline_stall_if, + `endif + VX_cmt_to_csr_if cmt_to_csr_if, VX_fpu_to_csr_if fpu_to_csr_if, @@ -51,6 +57,11 @@ module VX_csr_unit #( ) csr_data ( .clk (clk), .reset (reset), + // PERF: total reads + `ifdef PERF_ENABLE + .perf_cache_if (perf_cache_if), + .perf_pipeline_stall_if (perf_pipeline_stall_if), + `endif .cmt_to_csr_if (cmt_to_csr_if), .fpu_to_csr_if (fpu_to_csr_if), .read_enable (csr_pipe_req_if.valid), diff --git a/hw/rtl/VX_execute.v b/hw/rtl/VX_execute.v index 2d9f98f2..e61c9583 100644 --- a/hw/rtl/VX_execute.v +++ b/hw/rtl/VX_execute.v @@ -18,6 +18,12 @@ module VX_execute #( // perf VX_cmt_to_csr_if cmt_to_csr_if, + + // PERF: total reads + `ifdef PERF_ENABLE + VX_perf_cache_if perf_cache_if, + VX_perf_pipeline_stall_if perf_pipeline_stall_if, + `endif // inputs VX_alu_req_if alu_req_if, @@ -72,7 +78,12 @@ module VX_execute #( .CORE_ID(CORE_ID) ) csr_unit ( .clk (clk), - .reset (reset), + .reset (reset), + // PERF: total reads + `ifdef PERF_ENABLE + .perf_cache_if (perf_cache_if), + .perf_pipeline_stall_if (perf_pipeline_stall_if), + `endif .cmt_to_csr_if (cmt_to_csr_if), .fpu_to_csr_if 
(fpu_to_csr_if), .csr_io_req_if (csr_io_req_if), @@ -150,4 +161,78 @@ module VX_execute #( && (`BR_OP(alu_req_if.op_type) == `BR_EBREAK || `BR_OP(alu_req_if.op_type) == `BR_ECALL); + `ifdef PERF_ENABLE + // PERF: count the cycles in which each execute-unit request is valid but not accepted + reg [63:0] perf_alu_stall; + reg [63:0] perf_lsu_stall; + reg [63:0] perf_csr_stall; + reg [63:0] perf_gpu_stall; + `ifdef EXT_M_ENABLE + reg [63:0] perf_mul_stall; + `endif + `ifdef EXT_F_ENABLE + reg [63:0] perf_fpu_stall; + `endif + + always@(posedge clk) begin + if(reset) begin + perf_alu_stall <= 0; + perf_lsu_stall <= 0; + perf_csr_stall <= 0; + perf_gpu_stall <= 0; + `ifdef EXT_M_ENABLE + perf_mul_stall <= 0; + `endif + `ifdef EXT_F_ENABLE + perf_fpu_stall <= 0; + `endif + end else begin + // alu_stall + if (alu_req_if.valid & !alu_req_if.ready) begin + perf_alu_stall <= perf_alu_stall + 64'd1; + end + // lsu_stall + if (lsu_req_if.valid & !lsu_req_if.ready) begin + perf_lsu_stall <= perf_lsu_stall + 64'd1; + end + // csr_stall + if (csr_req_if.valid & !csr_req_if.ready) begin + perf_csr_stall <= perf_csr_stall + 64'd1; + end + // gpu_stall + if (gpu_req_if.valid & !gpu_req_if.ready) begin + perf_gpu_stall <= perf_gpu_stall + 64'd1; + end + // mul_stall + `ifdef EXT_M_ENABLE + if (mul_req_if.valid & !mul_req_if.ready) begin + perf_mul_stall <= perf_mul_stall + 64'd1; + end + `endif + // fpu_stall + `ifdef EXT_F_ENABLE + if (fpu_req_if.valid & !fpu_req_if.ready) begin + perf_fpu_stall <= perf_fpu_stall + 64'd1; + end + `endif + end + end + assign perf_pipeline_stall_if.alu_stall = perf_alu_stall; + assign perf_pipeline_stall_if.lsu_stall = perf_lsu_stall; + assign perf_pipeline_stall_if.csr_stall = perf_csr_stall; + assign perf_pipeline_stall_if.gpu_stall = perf_gpu_stall; + // mul_stall/fpu_stall are read by the CSR unit whenever PERF_ENABLE is set, so tie them to zero when the extension is disabled + `ifdef EXT_M_ENABLE + assign perf_pipeline_stall_if.mul_stall = perf_mul_stall; + `else + assign perf_pipeline_stall_if.mul_stall = 64'd0; + `endif + `ifdef EXT_F_ENABLE + assign perf_pipeline_stall_if.fpu_stall = perf_fpu_stall; + `else + assign perf_pipeline_stall_if.fpu_stall = 64'd0; + `endif + // ibuffer_stall, scoreboard_stall, icache_stall come from other stages + `endif + endmodule 
diff --git a/hw/rtl/VX_issue.v b/hw/rtl/VX_issue.v index eab416d8..dd912330 100644 --- a/hw/rtl/VX_issue.v +++ b/hw/rtl/VX_issue.v @@ -8,6 +8,10 @@ module VX_issue #( input wire clk, input wire reset, + `ifdef PERF_ENABLE + VX_perf_pipeline_stall_if perf_pipeline_stall_if, + `endif + VX_decode_if decode_if, VX_writeback_if writeback_if, @@ -120,6 +124,21 @@ module VX_issue #( `SCOPE_ASSIGN (writeback_rd, writeback_if.rd); `SCOPE_ASSIGN (writeback_data, writeback_if.data); + `ifdef PERF_ENABLE + reg [63:0] perf_scoreboard_stall; + always @ (posedge clk) begin + if(reset) begin + perf_scoreboard_stall <= 0; + end else begin + // scoreboard_stall + if (ibuf_deq_if.valid & scoreboard_delay) begin + perf_scoreboard_stall <= perf_scoreboard_stall + 64'd1; + end + end + end + assign perf_pipeline_stall_if.scoreboard_stall = perf_scoreboard_stall; + `endif + `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin if (alu_req_if.valid && alu_req_if.ready) begin diff --git a/hw/rtl/VX_mem_unit.v b/hw/rtl/VX_mem_unit.v index 1c693262..28ff9b53 100644 --- a/hw/rtl/VX_mem_unit.v +++ b/hw/rtl/VX_mem_unit.v @@ -12,6 +12,10 @@ module VX_mem_unit # ( VX_cache_core_req_if core_dcache_req_if, VX_cache_core_rsp_if core_dcache_rsp_if, + `ifdef PERF_ENABLE + VX_perf_cache_if perf_cache_if, + `endif + // Core <-> Icache VX_cache_core_req_if core_icache_req_if, VX_cache_core_rsp_if core_icache_rsp_if, @@ -28,6 +32,11 @@ module VX_mem_unit # ( VX_cache_core_req_if io_req_if, VX_cache_core_rsp_if io_rsp_if ); + + `ifdef PERF_ENABLE + VX_perf_cache_if perf_cache_smem_if(), perf_cache_icache_if(), perf_cache_dcache_if(); + `endif + VX_cache_dram_req_if #( .DRAM_LINE_WIDTH (`DDRAM_LINE_WIDTH), .DRAM_ADDR_WIDTH (`DDRAM_ADDR_WIDTH), @@ -124,6 +133,10 @@ module VX_mem_unit # ( .core_rsp_tag (dcache_rsp_if.tag), .core_rsp_ready (dcache_rsp_if.ready), + `ifdef PERF_ENABLE + .perf_cache_if (perf_cache_dcache_if), + `endif + // DRAM request .dram_req_valid (dcache_dram_req_if.valid), .dram_req_rw 
(dcache_dram_req_if.rw), @@ -196,6 +209,11 @@ module VX_mem_unit # ( .core_rsp_tag (core_icache_rsp_if.tag), .core_rsp_ready (core_icache_rsp_if.ready), + // PERF: cache read + `ifdef PERF_ENABLE + .perf_cache_if (perf_cache_icache_if), + `endif + // DRAM Req .dram_req_valid (icache_dram_req_if.valid), .dram_req_rw (icache_dram_req_if.rw), @@ -268,6 +286,11 @@ module VX_mem_unit # ( .core_rsp_tag (smem_rsp_if.tag), .core_rsp_ready (smem_rsp_if.ready), + // PERF: cache read + `ifdef PERF_ENABLE + .perf_cache_if (perf_cache_smem_if), + `endif + // DRAM request `UNUSED_PIN (dram_req_valid), `UNUSED_PIN (dram_req_rw), @@ -340,4 +363,42 @@ module VX_mem_unit # ( .rsp_ready_in (dram_rsp_if.ready) ); + // PERF: cache -- each exported counter is the sum of the smem, icache and dcache counters + // TODO: some caches have DRAM and writes disabled, hence some stats can be removed. + `ifdef PERF_ENABLE + assign perf_cache_if.read_miss = perf_cache_smem_if.read_miss + + perf_cache_icache_if.read_miss + + perf_cache_dcache_if.read_miss; + assign perf_cache_if.write_miss = perf_cache_smem_if.write_miss + + perf_cache_icache_if.write_miss + + perf_cache_dcache_if.write_miss; + assign perf_cache_if.dram_stall = perf_cache_smem_if.dram_stall + + perf_cache_icache_if.dram_stall + + perf_cache_dcache_if.dram_stall; + assign perf_cache_if.core_rsp_stall = perf_cache_smem_if.core_rsp_stall + + perf_cache_icache_if.core_rsp_stall + + perf_cache_dcache_if.core_rsp_stall; + assign perf_cache_if.msrq_stall = perf_cache_smem_if.msrq_stall + + perf_cache_icache_if.msrq_stall + + perf_cache_dcache_if.msrq_stall; + assign perf_cache_if.total_stall = perf_cache_smem_if.total_stall + + perf_cache_icache_if.total_stall + + perf_cache_dcache_if.total_stall; + assign perf_cache_if.total_read = perf_cache_smem_if.total_read + + perf_cache_icache_if.total_read + + perf_cache_dcache_if.total_read; + assign perf_cache_if.total_write = perf_cache_smem_if.total_write + + perf_cache_icache_if.total_write + + perf_cache_dcache_if.total_write; + assign perf_cache_if.total_eviction 
= perf_cache_smem_if.total_eviction + + perf_cache_icache_if.total_eviction + + perf_cache_dcache_if.total_eviction; + assign perf_cache_if.dram_latency = perf_cache_smem_if.dram_latency + + perf_cache_icache_if.dram_latency + + perf_cache_dcache_if.dram_latency; + assign perf_cache_if.dram_rsp = perf_cache_smem_if.dram_rsp + + perf_cache_icache_if.dram_rsp + + perf_cache_dcache_if.dram_rsp; + `endif + endmodule diff --git a/hw/rtl/VX_pipeline.v b/hw/rtl/VX_pipeline.v index e0f0a294..180c6a01 100644 --- a/hw/rtl/VX_pipeline.v +++ b/hw/rtl/VX_pipeline.v @@ -51,6 +51,10 @@ module VX_pipeline #( output wire[31:0] csr_io_rsp_data, input wire csr_io_rsp_ready, + // PERF: total reads + `ifdef PERF_ENABLE + VX_perf_cache_if perf_cache_if, + `endif // Status output wire busy, output wire ebreak @@ -171,6 +175,10 @@ module VX_pipeline #( VX_commit_if fpu_commit_if(); VX_commit_if gpu_commit_if(); + `ifdef PERF_ENABLE + VX_perf_pipeline_stall_if perf_pipeline_stall_if(); + `endif + VX_fetch #( .CORE_ID(CORE_ID) ) fetch ( @@ -206,6 +214,10 @@ module VX_pipeline #( .clk (clk), .reset (reset), + `ifdef PERF_ENABLE + .perf_pipeline_stall_if (perf_pipeline_stall_if), + `endif + .decode_if (decode_if), .writeback_if (writeback_if), @@ -224,7 +236,13 @@ module VX_pipeline #( .clk (clk), .reset (reset), - + + // PERF: total reads + `ifdef PERF_ENABLE + .perf_cache_if (perf_cache_if), + .perf_pipeline_stall_if (perf_pipeline_stall_if), + `endif + .dcache_req_if (core_dcache_req_if), .dcache_rsp_if (core_dcache_rsp_if), @@ -272,4 +290,27 @@ module VX_pipeline #( .cmt_to_csr_if (cmt_to_csr_if) ); + + `ifdef PERF_ENABLE + reg [63:0] perf_icache_stall; + reg [63:0] perf_ibuffer_stall; + always @ (posedge clk) begin + if(reset) begin + perf_icache_stall <= 0; + perf_ibuffer_stall <= 0; + end else begin + // icache_stall + if (core_icache_req_if.valid & !core_icache_req_if.ready) begin + perf_icache_stall <= perf_icache_stall + 64'd1; + end + // ibuffer_stall: decode_if == 
issue->ibuffer->ibuf_enq_if + if(decode_if.valid & !decode_if.ready) begin + perf_ibuffer_stall <= perf_ibuffer_stall + 64'd1; + end + end + end + assign perf_pipeline_stall_if.icache_stall = perf_icache_stall; + assign perf_pipeline_stall_if.ibuffer_stall = perf_ibuffer_stall; + `endif + endmodule diff --git a/hw/rtl/Vortex.v b/hw/rtl/Vortex.v index cf88ed21..807bce34 100644 --- a/hw/rtl/Vortex.v +++ b/hw/rtl/Vortex.v @@ -372,6 +372,11 @@ module Vortex ( .core_rsp_tag (cluster_dram_rsp_tag), .core_rsp_ready (cluster_dram_rsp_ready), + // PERF: total read + `ifdef PERF_ENABLE + `UNUSED_PIN (perf_cache_if), + `endif + // DRAM request .dram_req_valid (dram_req_valid), .dram_req_rw (dram_req_rw), diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index 54b02c02..1f95ed96 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -96,6 +96,15 @@ module VX_bank #( output wire [SNP_TAG_WIDTH-1:0] snp_rsp_tag, input wire snp_rsp_ready, + // PERF: perf_msrq_stall + `ifdef PERF_ENABLE + output wire perf_msrq_stall, + output wire perf_total_stall, + output wire perf_evict, + output wire perf_read_miss, + output wire perf_write_miss, + `endif + // Misses output wire misses ); @@ -948,6 +957,18 @@ end `SCOPE_ASSIGN (addr_st2, `LINE_TO_BYTE_ADDR(addr_st2, BANK_ID)); `SCOPE_ASSIGN (addr_st3, `LINE_TO_BYTE_ADDR(addr_st3, BANK_ID)); +`ifdef PERF_ENABLE + assign perf_total_stall = pipeline_stall; + assign perf_msrq_stall = mshr_push_stall; + assign perf_read_miss = !pipeline_stall & miss_st1 & !is_mshr_st1 & !mem_rw_st1; + assign perf_write_miss = !pipeline_stall & miss_st1 & !is_mshr_st1 & mem_rw_st1; + if (DRAM_ENABLE) begin + assign perf_evict = dwbq_push & do_writeback_st3 & !is_snp_st3; + end else begin + assign perf_evict = 0; + end +`endif + `ifdef DBG_PRINT_CACHE_BANK wire incoming_fill_dfp_st3 = dram_rsp_fire && (addr_st3 == dram_rsp_addr); always @(posedge clk) begin diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index 8b7f06ad..154dc14f 
100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -70,7 +70,12 @@ module VX_cache #( output wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data, output wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag, input wire core_rsp_ready, - + + // PERF +`ifdef PERF_ENABLE + VX_perf_cache_if perf_cache_if, +`endif + // DRAM request output wire dram_req_valid, output wire dram_req_rw, @@ -130,7 +135,16 @@ module VX_cache #( wire [NUM_BANKS-1:0] per_bank_miss; assign miss_vec = per_bank_miss; - + + +`ifdef PERF_ENABLE + wire [NUM_BANKS-1:0] perf_msrq_stall_per_bank; + wire [NUM_BANKS-1:0] perf_total_stall_per_bank; + wire [NUM_BANKS-1:0] perf_evict_per_bank; + wire [NUM_BANKS-1:0] perf_read_miss_per_bank; + wire [NUM_BANKS-1:0] perf_write_miss_per_bank; +`endif + if (NUM_BANKS == 1) begin assign snp_req_ready = per_bank_snp_req_ready; end else begin @@ -311,6 +325,15 @@ module VX_cache #( .dram_rsp_addr (curr_bank_dram_rsp_addr), .dram_rsp_ready (curr_bank_dram_rsp_ready), + // PERF: perf_msrq_stall + `ifdef PERF_ENABLE + .perf_msrq_stall (perf_msrq_stall_per_bank[i]), + .perf_total_stall (perf_total_stall_per_bank[i]), + .perf_evict (perf_evict_per_bank[i]), + .perf_read_miss (perf_read_miss_per_bank[i]), + .perf_write_miss (perf_write_miss_per_bank[i]), + `endif + // Snoop request .snp_req_valid (curr_bank_snp_req_valid), .snp_req_addr (curr_bank_snp_req_addr), @@ -407,4 +430,148 @@ module VX_cache #( `UNUSED_VAR (snp_rsp_ready) end + `ifdef PERF_ENABLE + // per cycle: core_req_r, core_req_w + reg[($clog2(NUM_REQS+1)-1):0] perf_core_req_r_per_cycle, perf_core_req_w_per_cycle; + if (CORE_TAG_ID_BITS != 0) begin // core_req_rw is 1-bit wide + VX_countones #( // core_req_r + .N(NUM_REQS) + ) perf_countones_core_req_r_count ( + .valids (core_req_valid & {NUM_REQS{(~core_req_rw) & core_req_ready}}), + .count (perf_core_req_r_per_cycle) + ); + VX_countones #( // core_req_w + .N(NUM_REQS) + ) perf_countones_core_req_w_count ( + .valids 
(core_req_valid & {NUM_REQS{(core_req_rw) & core_req_ready}}), + .count (perf_core_req_w_per_cycle) + ); + end else begin // core_req_rw is NUM_REQS-bit wide + VX_countones #( // core_req_r + .N(NUM_REQS) + ) perf_countones_core_req_r_count ( + .valids (core_req_valid & (~core_req_rw) & {NUM_REQS{core_req_ready}}), + .count (perf_core_req_r_per_cycle) + ); + VX_countones #( // core_req_w + .N(NUM_REQS) + ) perf_countones_core_req_w_count ( + .valids (core_req_valid & (core_req_rw) & {NUM_REQS{core_req_ready}}), + .count (perf_core_req_w_per_cycle) + ); + end + // per cycle: dram_latency + reg[63:0] perf_dram_lat_per_cycle; + always@(posedge clk) begin + if(reset) begin + perf_dram_lat_per_cycle <= 0; + end else begin + if(dram_req_valid & (~dram_req_rw) & dram_req_ready & dram_rsp_valid & dram_rsp_ready) begin + perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle; + end else if(dram_req_valid & (~dram_req_rw) & dram_req_ready) begin + perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle + 64'd1; + end else if(dram_rsp_valid & dram_rsp_ready) begin + perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle - 64'd1; + end + end + end + // per cycle: msrq stalls, total stalls, total eviction, read miss, write miss + reg [($clog2(NUM_BANKS+1)-1):0] perf_msrq_stall_per_cycle; + reg [($clog2(NUM_BANKS+1)-1):0] perf_total_stall_per_cycle; + reg [($clog2(NUM_BANKS+1)-1):0] perf_total_eviction_per_cycle; + reg [($clog2(NUM_BANKS+1)-1):0] perf_read_miss_per_cycle; + reg [($clog2(NUM_BANKS+1)-1):0] perf_write_miss_per_cycle; + VX_countones #( + .N(NUM_BANKS) + ) perf_countones_msrq_stall_count ( + .valids (perf_msrq_stall_per_bank), + .count (perf_msrq_stall_per_cycle) + ); + VX_countones #( + .N(NUM_BANKS) + ) perf_countones_total_stall_count ( + .valids (perf_total_stall_per_bank), + .count (perf_total_stall_per_cycle) + ); + VX_countones #( + .N(NUM_BANKS) + ) perf_countones_total_evict_count ( + .valids (perf_evict_per_bank), + .count (perf_total_eviction_per_cycle) + ); + 
VX_countones #( + .N(NUM_BANKS) + ) perf_countones_read_miss_count ( + .valids (perf_read_miss_per_bank), + .count (perf_read_miss_per_cycle) + ); + VX_countones #( + .N(NUM_BANKS) + ) perf_countones_write_miss_count ( + .valids (perf_write_miss_per_bank), + .count (perf_write_miss_per_cycle) + ); + reg [63:0] perf_core_req_r, perf_core_req_w; + reg [63:0] perf_dram_lat, perf_dram_rsp; + reg [63:0] perf_msrq_stall; + reg [63:0] perf_total_stall; + reg [63:0] perf_total_eviction; + reg [63:0] perf_read_miss, perf_write_miss; + reg [63:0] perf_core_rsp_stall, perf_dram_stall; + always @ (posedge clk) begin + if (reset) begin + perf_core_req_r <= 0; + perf_core_req_w <= 0; + perf_dram_lat <= 0; + perf_dram_rsp <= 0; + perf_msrq_stall <= 0; + perf_total_stall <= 0; + perf_total_eviction <= 0; + perf_read_miss <= 0; + perf_write_miss <= 0; + perf_core_rsp_stall <= 0; + perf_dram_stall <= 0; + end else begin + // core_req_r, core_req_w + perf_core_req_r <= perf_core_req_r + $bits(perf_core_req_r)'(perf_core_req_r_per_cycle); + perf_core_req_w <= perf_core_req_w + $bits(perf_core_req_w)'(perf_core_req_w_per_cycle); + // dram_latency + perf_dram_lat <= perf_dram_lat + perf_dram_lat_per_cycle; + if (dram_rsp_valid & dram_rsp_ready) begin + perf_dram_rsp <= perf_dram_rsp + 64'd1; + end + // miss reserve queue stalls: bank->msrq_push_stall + perf_msrq_stall <= perf_msrq_stall + $bits(perf_msrq_stall)'(perf_msrq_stall_per_cycle); + // total stalls: from bank->pipeline_stall + perf_total_stall <= perf_total_stall + $bits(perf_total_stall)'(perf_total_stall_per_cycle); + // total eviction: from bank-> dwbq_push & do_writeback_st3 & !is_snp_st3 + perf_total_eviction <= perf_total_eviction + $bits(perf_total_eviction)'(perf_total_eviction_per_cycle); + // read miss: from bank-> !pipeline_stall & miss_st1 & !is_msrq_st1 & !mem_rw_st1 + perf_read_miss <= perf_read_miss + $bits(perf_read_miss)'(perf_read_miss_per_cycle); + // write miss: from bank-> !pipeline_stall & miss_st1 & 
!is_msrq_st1 & mem_rw_st1 + perf_write_miss <= perf_write_miss + $bits(perf_write_miss)'(perf_write_miss_per_cycle); + // core_rsp_stall + if ((| core_rsp_valid) & !core_rsp_ready) begin + perf_core_rsp_stall <= perf_core_rsp_stall + 64'd1; + end + // dram_stall + if (dram_req_valid & !dram_req_ready) begin + perf_dram_stall <= perf_dram_stall + 64'd1; + end + end + end + assign perf_cache_if.total_read = perf_core_req_r; + assign perf_cache_if.total_write = perf_core_req_w; + assign perf_cache_if.dram_latency = perf_dram_lat; + assign perf_cache_if.dram_rsp = perf_dram_rsp; + assign perf_cache_if.msrq_stall = perf_msrq_stall; + assign perf_cache_if.total_stall = perf_total_stall; + assign perf_cache_if.total_eviction = perf_total_eviction; + assign perf_cache_if.read_miss = perf_read_miss; + assign perf_cache_if.write_miss = perf_write_miss; + assign perf_cache_if.core_rsp_stall = perf_core_rsp_stall; + assign perf_cache_if.dram_stall = perf_dram_stall; + + `endif + endmodule diff --git a/hw/rtl/interfaces/VX_perf_cache_if.v b/hw/rtl/interfaces/VX_perf_cache_if.v new file mode 100644 index 00000000..c5ae52cf --- /dev/null +++ b/hw/rtl/interfaces/VX_perf_cache_if.v @@ -0,0 +1,22 @@ +`ifndef VX_PERF_CACHE_IF +`define VX_PERF_CACHE_IF + +`include "VX_define.vh" + +interface VX_perf_cache_if (); + +wire [63:0] read_miss; +wire [63:0] write_miss; +wire [63:0] dram_stall; +wire [63:0] dram_rsp; +wire [63:0] core_rsp_stall; +wire [63:0] msrq_stall; +wire [63:0] total_stall; +wire [63:0] total_read; +wire [63:0] total_write; +wire [63:0] total_eviction; +wire [63:0] dram_latency; + +endinterface + +`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_perf_pipeline_stall_if.v b/hw/rtl/interfaces/VX_perf_pipeline_stall_if.v new file mode 100644 index 00000000..b470d5f0 --- /dev/null +++ b/hw/rtl/interfaces/VX_perf_pipeline_stall_if.v @@ -0,0 +1,25 @@ +`ifndef VX_PERF_PIPELINE_STALL_IF +`define VX_PERF_PIPELINE_STALL_IF + +`include "VX_define.vh" + +interface 
VX_perf_pipeline_stall_if (); + // from pipeline + wire [63:0] icache_stall; + wire [63:0] ibuffer_stall; + // from issue + wire [63:0] scoreboard_stall; + // from execute + wire [63:0] lsu_stall; + wire [63:0] csr_stall; + wire [63:0] alu_stall; + wire [63:0] gpu_stall; + `ifdef EXT_M_ENABLE + wire [63:0] mul_stall; + `endif + `ifdef EXT_F_ENABLE + wire [63:0] fpu_stall; + `endif +endinterface + +`endif \ No newline at end of file