From b4b5d6f0abb47acf4f459a3c6b21f1b8568b77be Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 12 Jan 2021 15:19:38 -0800 Subject: [PATCH] minor updates --- driver/common/vx_utils.cpp | 23 ++++---- hw/rtl/VX_commit.v | 8 +-- hw/rtl/VX_issue.v | 4 +- hw/rtl/VX_mem_unit.v | 12 ++--- hw/rtl/VX_warp_sched.v | 7 +-- hw/rtl/cache/VX_cache.v | 64 ++++------------------- hw/rtl/cache/VX_cache_core_req_bank_sel.v | 22 +++++--- hw/rtl/libs/VX_countones.v | 26 --------- 8 files changed, 45 insertions(+), 121 deletions(-) delete mode 100644 hw/rtl/libs/VX_countones.v diff --git a/driver/common/vx_utils.cpp b/driver/common/vx_utils.cpp index 6debb282..4c65cb85 100644 --- a/driver/common/vx_utils.cpp +++ b/driver/common/vx_utils.cpp @@ -246,7 +246,7 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { // bank_stalls uint64_t dcache_bank_st_per_core; ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_BANK_ST, CSR_MPM_DCACHE_BANK_ST_H, &dcache_bank_st_per_core); - int dcache_bank_utilization = (int)((1.0 - (double(dcache_reads_per_core + dcache_writes_per_core) / double(dcache_reads_per_core + dcache_writes_per_core + dcache_bank_st_per_core))) * 100); + int dcache_bank_utilization = (int)((double(dcache_reads_per_core + dcache_writes_per_core) / double(dcache_reads_per_core + dcache_writes_per_core + dcache_bank_st_per_core)) * 100); if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache bank stalls=%ld (utilization=%d%%)\n", core_id, dcache_bank_st_per_core, dcache_bank_utilization); dcache_bank_stalls += dcache_bank_st_per_core; // mshr_stalls @@ -279,7 +279,7 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { // bank_stalls uint64_t smem_bank_st_per_core; ret |= vx_csr_get_l(device, core_id, CSR_MPM_SMEM_BANK_ST, CSR_MPM_SMEM_BANK_ST_H, &smem_bank_st_per_core); - int smem_bank_utilization = (int)((1.0 - (double(smem_reads_per_core + smem_writes_per_core) / double(smem_reads_per_core + smem_writes_per_core + smem_bank_st_per_core))) * 100); + int smem_bank_utilization = (int)((double(smem_reads_per_core + smem_writes_per_core) / double(smem_reads_per_core + smem_writes_per_core + smem_bank_st_per_core)) * 100); if (num_cores > 1) fprintf(stream, "PERF: core%d: smem bank stalls=%ld (utilization=%d%%)\n", core_id, smem_bank_st_per_core, smem_bank_utilization); smem_bank_stalls += smem_bank_st_per_core; @@ -288,12 +288,12 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_READS, CSR_MPM_DRAM_READS_H, &dram_reads_per_core); ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_WRITES, CSR_MPM_DRAM_WRITES_H, &dram_writes_per_core); ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_ST, CSR_MPM_DRAM_ST_H, &dram_stalls_per_core); - ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_LAT, CSR_MPM_DRAM_LAT_H, &dram_lat_per_core); - int avg_dram_lat = (int)(double(dram_lat_per_core) / double(dram_reads_per_core)); - int dram_utilization = (int)((1.0 - (double(dram_reads_per_core + dram_writes_per_core) / double(dram_reads_per_core + dram_writes_per_core + dram_stalls_per_core))) * 100); + ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_LAT, CSR_MPM_DRAM_LAT_H, &dram_lat_per_core); + int dram_utilization = (int)((double(dram_reads_per_core + dram_writes_per_core) / double(dram_reads_per_core + dram_writes_per_core + dram_stalls_per_core)) * 100); + int dram_avg_lat = (int)(double(dram_lat_per_core) / double(dram_reads_per_core)); if (num_cores > 1) fprintf(stream, "PERF: core%d: dram requests=%ld (reads=%ld, writes=%ld)\n", core_id, (dram_reads_per_core + dram_writes_per_core), dram_reads_per_core, dram_writes_per_core); if (num_cores > 1) fprintf(stream, "PERF: core%d: dram stalls=%ld (utilization=%d%%)\n", core_id, dram_stalls_per_core, dram_utilization); - if (num_cores > 1) fprintf(stream, "PERF: core%d: average dram latency=%d cycles\n", core_id, avg_dram_lat); + if (num_cores > 1) fprintf(stream, "PERF: core%d: dram average latency=%d cycles\n", core_id, dram_avg_lat); dram_reads += dram_reads_per_core; dram_writes += dram_writes_per_core; dram_stalls += dram_stalls_per_core; @@ -308,10 +308,10 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { int icache_read_hit_ratio = (int)((1.0 - (double(icache_read_misses) / double(icache_reads))) * 100); int dcache_read_hit_ratio = (int)((1.0 - (double(dcache_read_misses) / double(dcache_reads))) * 100); int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_write_misses) / double(dcache_writes))) * 100); - int dcache_bank_utilization = (int)((1.0 - (double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls))) * 100); - int smem_bank_utilization = (int)((1.0 - (double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls))) * 100); - int dram_utilization = (int)((1.0 - (double(dram_reads + dram_writes) / double(dram_reads + dram_writes + dram_stalls))) * 100); - int avg_dram_lat = (int)(double(dram_lat) / double(dram_reads)); + int dcache_bank_utilization = (int)((double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls)) * 100); + int smem_bank_utilization = (int)((double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls)) * 100); + int dram_utilization = (int)((double(dram_reads + dram_writes) / double(dram_reads + dram_writes + dram_stalls)) * 100); + int dram_avg_lat = (int)(double(dram_lat) / double(dram_reads)); fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls); fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls); fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls); @@ -337,7 +337,8 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) { fprintf(stream, "PERF: smem bank stalls=%ld (utilization=%d%%)\n", smem_bank_stalls, smem_bank_utilization); fprintf(stream, "PERF: dram requests=%ld (reads=%ld, writes=%ld)\n", (dram_reads + dram_writes), dram_reads, dram_writes); fprintf(stream, "PERF: dram stalls=%ld (utilization=%d%%)\n", dram_stalls, dram_utilization); - fprintf(stream, "PERF: average dram latency=%d cycles\n", avg_dram_lat); + fprintf(stream, "PERF: dram average latency=%d cycles\n", dram_avg_lat); + fprintf(stream, "PERF: dram bandwith=%d cycles\n", dram_avg_lat); #endif return ret; diff --git a/hw/rtl/VX_commit.v b/hw/rtl/VX_commit.v index 3936991a..df5bf39c 100644 --- a/hw/rtl/VX_commit.v +++ b/hw/rtl/VX_commit.v @@ -52,13 +52,7 @@ module VX_commit #( assign commit_tmask3 = gpu_commit_fire ? gpu_commit_if.tmask : 0; wire [CMTW-1:0] commit_size; - - VX_countones #( - .N(3*`NUM_THREADS) - ) commit_ctr1 ( - .valids({commit_tmask3, commit_tmask2, commit_tmask1}), - .count (commit_size) - ); + assign commit_size = $countones({commit_tmask3, commit_tmask2, commit_tmask1}); VX_pipe_register #( .DATAW (1 + CMTW), diff --git a/hw/rtl/VX_issue.v b/hw/rtl/VX_issue.v index 026b130d..8bb8063d 100644 --- a/hw/rtl/VX_issue.v +++ b/hw/rtl/VX_issue.v @@ -152,10 +152,10 @@ module VX_issue #( `endif end else begin if (decode_if.valid & !decode_if.ready) begin - perf_ibf_stalls <= perf_ibf_stalls + 64'd1; + perf_ibf_stalls <= perf_ibf_stalls + 64'd1; end if (ibuf_deq_if.valid & scoreboard_delay) begin - perf_scb_stalls <= perf_scb_stalls + 64'd1; + perf_scb_stalls <= perf_scb_stalls + 64'd1; end if (alu_req_if.valid & !alu_req_if.ready) begin perf_alu_stalls <= perf_alu_stalls + 64'd1; diff --git a/hw/rtl/VX_mem_unit.v b/hw/rtl/VX_mem_unit.v index 438a0aff..52289b8f 100644 --- a/hw/rtl/VX_mem_unit.v +++ b/hw/rtl/VX_mem_unit.v @@ -351,14 +351,10 @@ end always @(posedge clk) begin if (reset) begin perf_dram_lat_per_cycle <= 0; - end else begin - if (dram_req_if.valid && !dram_req_if.rw && dram_req_if.ready && dram_rsp_if.valid && dram_rsp_if.ready) begin - perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle; - end else if (dram_req_if.valid && !dram_req_if.rw && dram_req_if.ready) begin - perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle + 64'd1; - end else if (dram_rsp_if.valid && dram_rsp_if.ready) begin - perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle - 64'd1; - end + end else begin + perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle + + 64'($signed(2'((dram_req_if.valid && !dram_req_if.rw && dram_req_if.ready) && !(dram_rsp_if.valid && dram_rsp_if.ready)) - + 2'((dram_rsp_if.valid && dram_rsp_if.ready) && !(dram_req_if.valid && !dram_req_if.rw && dram_req_if.ready)))); end end diff --git a/hw/rtl/VX_warp_sched.v b/hw/rtl/VX_warp_sched.v index 249f23b8..bc1c9583 100644 --- a/hw/rtl/VX_warp_sched.v +++ b/hw/rtl/VX_warp_sched.v @@ -161,12 +161,7 @@ module VX_warp_sched #( `IGNORE_WARNINGS_BEGIN wire [`NW_BITS:0] active_barrier_count; `IGNORE_WARNINGS_END - VX_countones #( - .N(`NUM_WARPS) - ) barrier_count ( - .valids(barrier_stall_mask[warp_ctl_if.barrier.id]), - .count (active_barrier_count) - ); + assign active_barrier_count = $countones(barrier_stall_mask[warp_ctl_if.barrier.id]); assign reached_barrier_limit = (active_barrier_count[`NW_BITS-1:0] == warp_ctl_if.barrier.size_m1); diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index 42990f42..e75490c4 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -356,69 +356,25 @@ module VX_cache #( reg [($clog2(NUM_REQS+1)-1):0] perf_core_reads_per_cycle, perf_core_writes_per_cycle; reg [($clog2(NUM_REQS+1)-1):0] perf_crsp_stall_per_cycle; - VX_countones #( - .N(NUM_REQS) - ) perf_countones_core_reads_count ( - .valids (core_req_valid & core_req_ready & ~core_req_rw), - .count (perf_core_reads_per_cycle) - ); + assign perf_core_reads_per_cycle = $countones(core_req_valid & core_req_ready & ~core_req_rw); + assign perf_core_writes_per_cycle = $countones(core_req_valid & core_req_ready & core_req_rw); - VX_countones #( - .N(NUM_REQS) - ) perf_countones_core_writes_count ( - .valids (core_req_valid & core_req_ready & core_req_rw), - .count (perf_core_writes_per_cycle) - ); - if (CORE_TAG_ID_BITS != 0) begin - VX_countones #( - .N(NUM_REQS) - ) perf_countones_core_rsp_count ( - .valids (core_rsp_valid & {NUM_REQS{!core_rsp_ready}}), - .count (perf_crsp_stall_per_cycle) - ); + assign perf_crsp_stall_per_cycle = $countones(core_rsp_valid & {NUM_REQS{!core_rsp_ready}}); end else begin - VX_countones #( - .N(NUM_REQS) - ) perf_countones_core_rsp_count ( - .valids (core_rsp_valid & ~core_rsp_ready), - .count (perf_crsp_stall_per_cycle) - ); + assign perf_crsp_stall_per_cycle = $countones(core_rsp_valid & ~core_rsp_ready); end // per cycle: read misses, write misses, msrq stalls, pipeline stalls reg [($clog2(NUM_BANKS+1)-1):0] perf_read_miss_per_cycle; reg [($clog2(NUM_BANKS+1)-1):0] perf_write_miss_per_cycle; reg [($clog2(NUM_BANKS+1)-1):0] perf_mshr_stall_per_cycle; - reg [($clog2(NUM_BANKS+1)-1):0] perf_pipe_stall_per_cycle; - - VX_countones #( - .N(NUM_BANKS) - ) perf_countones_read_miss_count ( - .valids (perf_read_miss_per_bank), - .count (perf_read_miss_per_cycle) - ); - - VX_countones #( - .N(NUM_BANKS) - ) perf_countones_write_miss_count ( - .valids (perf_write_miss_per_bank), - .count (perf_write_miss_per_cycle) - ); - - VX_countones #( - .N(NUM_BANKS) - ) perf_countones_mshr_stall_count ( - .valids (perf_mshr_stall_per_bank), - .count (perf_mshr_stall_per_cycle) - ); - - VX_countones #( - .N(NUM_BANKS) - ) perf_countones_total_stall_count ( - .valids (perf_pipe_stall_per_bank), - .count (perf_pipe_stall_per_cycle) - ); + reg [($clog2(NUM_BANKS+1)-1):0] perf_pipe_stall_per_cycle; + + assign perf_read_miss_per_cycle = $countones(perf_read_miss_per_bank); + assign perf_write_miss_per_cycle = $countones(perf_write_miss_per_bank); + assign perf_mshr_stall_per_cycle = $countones(perf_mshr_stall_per_bank); + assign perf_pipe_stall_per_cycle = $countones(perf_pipe_stall_per_bank); reg [63:0] perf_core_reads; reg [63:0] perf_core_writes; diff --git a/hw/rtl/cache/VX_cache_core_req_bank_sel.v b/hw/rtl/cache/VX_cache_core_req_bank_sel.v index 04f6e86e..29a9b75b 100644 --- a/hw/rtl/cache/VX_cache_core_req_bank_sel.v +++ b/hw/rtl/cache/VX_cache_core_req_bank_sel.v @@ -50,7 +50,7 @@ module VX_cache_core_req_bank_sel #( reg [NUM_BANKS-1:0] per_bank_core_req_stall; reg [NUM_REQS-1:0] core_req_ready_r; - reg [NUM_BANKS-1:0] core_req_sel_r; + reg [NUM_REQS-1:0] core_req_sel_r; wire [NUM_REQS-1:0][`BANK_SELECT_BITS-1:0] core_req_bid; for (genvar i = 0; i < NUM_REQS; ++i) begin @@ -80,26 +80,34 @@ module VX_cache_core_req_bank_sel #( end always @(*) begin - core_req_ready_r = 0; - core_req_sel_r = 0; - + core_req_ready_r = 0; for (integer j = 0; j < NUM_BANKS; ++j) begin for (integer i = 0; i < NUM_REQS; ++i) begin if (core_req_valid[i] && (core_req_bid[i] == `BANK_SELECT_BITS'(j))) begin - core_req_ready_r[i] = ~per_bank_core_req_stall[j]; - core_req_sel_r[i] = 1; + core_req_ready_r[i] = ~per_bank_core_req_stall[j]; break; end end end end + always @(*) begin + core_req_sel_r = 0; + for (integer j = 0; j < NUM_BANKS; ++j) begin + for (integer i = 0; i < NUM_REQS; ++i) begin + if (core_req_valid[i] && (core_req_bid[i] == `BANK_SELECT_BITS'(j))) begin + core_req_sel_r[i] = ~per_bank_core_req_stall[j]; + end + end + end + end + reg [63:0] bank_stalls_r; always @(posedge clk) begin if (reset) begin bank_stalls_r <= 0; end else begin - bank_stalls_r <= bank_stalls_r + 64'($countones(core_req_valid & ~core_req_sel_r)); + bank_stalls_r <= bank_stalls_r + 64'($countones(core_req_sel_r & ~core_req_ready_r)); end end diff --git a/hw/rtl/libs/VX_countones.v b/hw/rtl/libs/VX_countones.v deleted file mode 100644 index e06277d2..00000000 --- a/hw/rtl/libs/VX_countones.v +++ /dev/null @@ -1,26 +0,0 @@ - -`include "VX_platform.vh" - -module VX_countones #( - parameter N = 10, - parameter N_BITS = $clog2(N+1) -) ( - input wire [N-1:0] valids, - output wire [N_BITS-1:0] count -); - /*reg [N_BITS-1:0] count_r; - - always @(*) begin - count_r = 0; - for (integer i = N-1; i >= 0; i = i - 1) begin - if (valids[i]) begin - count_r = count_r + N_BITS'(1); - end - end - end - - assign count = count_r;*/ - - assign count = $countones(valids); - -endmodule \ No newline at end of file