From dda67da84cd76dd8e2c5cfe9d44856debf975fa2 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 21 Mar 2024 18:11:12 -0700 Subject: [PATCH 1/9] Add issue-stall-by-unit-busy perf counters Add per-issue-width counters instead of using reduce "OR" and causing undercounting. --- hw/rtl/core/VX_dispatch.sv | 54 +++++++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/hw/rtl/core/VX_dispatch.sv b/hw/rtl/core/VX_dispatch.sv index 7ae67607..10d6018d 100644 --- a/hw/rtl/core/VX_dispatch.sv +++ b/hw/rtl/core/VX_dispatch.sv @@ -176,36 +176,72 @@ module VX_dispatch import VX_gpu_pkg::*; #( end `ifdef PERF_ENABLE - wire [`NUM_EX_UNITS-1:0] perf_unit_stalls_per_cycle, perf_unit_stalls_per_cycle_r; + wire [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_stalls_per_cycle_r; + wire [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_valids_per_cycle_r; + wire [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_fires_per_cycle_r; + reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_stalls_per_cycle; + reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_valids_per_cycle; + reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_fires_per_cycle; reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_unit_stalls_per_cycle; + reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_unit_valids_per_cycle; + reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_unit_fires_per_cycle; reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_r; + reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_valids_r; + reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_fires_r; for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin always @(*) begin perf_issue_unit_stalls_per_cycle[i] = '0; + perf_issue_unit_valids_per_cycle[i] = '0; + perf_issue_unit_fires_per_cycle[i] = '0; if (operands_if[i].valid && ~operands_if[i].ready) begin perf_issue_unit_stalls_per_cycle[i][operands_if[i].data.ex_type] = 1; end + if (operands_if[i].valid) begin + perf_issue_unit_valids_per_cycle[i][operands_if[i].data.ex_type] = 1; + end + if (operands_if[i].valid && operands_if[i].ready) begin + perf_issue_unit_fires_per_cycle[i][operands_if[i].data.ex_type] = 1; + end end end - VX_reduce #( - .DATAW_IN (`NUM_EX_UNITS), - .N (`ISSUE_WIDTH), - .OP ("|") - ) reduce ( - .data_in (perf_issue_unit_stalls_per_cycle), - .data_out (perf_unit_stalls_per_cycle) - ); + for (genvar i=0; i < `NUM_EX_UNITS; ++i) begin + always @(*) begin + perf_unit_stalls_per_cycle[i] = '0; + perf_unit_valids_per_cycle[i] = '0; + perf_unit_fires_per_cycle[i] = '0; + for (integer isw = 0; isw < `ISSUE_WIDTH; ++isw) begin + perf_unit_stalls_per_cycle[i] = perf_unit_stalls_per_cycle[i] + perf_issue_unit_stalls_per_cycle[isw][i]; + perf_unit_valids_per_cycle[i] = perf_unit_valids_per_cycle[i] + perf_issue_unit_valids_per_cycle[isw][i]; + perf_unit_fires_per_cycle[i] = perf_unit_fires_per_cycle[i] + perf_issue_unit_fires_per_cycle[isw][i]; + end + end + end + + // VX_reduce #( + // .DATAW_IN (`NUM_EX_UNITS), + // .N (`ISSUE_WIDTH), + // .OP ("|") + // ) reduce ( + // .data_in (perf_issue_unit_stalls_per_cycle), + // .data_out (perf_unit_stalls_per_cycle) + // ); `BUFFER(perf_unit_stalls_per_cycle_r, perf_unit_stalls_per_cycle); + `BUFFER(perf_unit_valids_per_cycle_r, perf_unit_valids_per_cycle); + `BUFFER(perf_unit_fires_per_cycle_r, perf_unit_fires_per_cycle); for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin always @(posedge clk) begin if (reset) begin perf_stalls_r[i] <= '0; + perf_valids_r[i] <= '0; + perf_fires_r[i] <= '0; end else begin perf_stalls_r[i] <= perf_stalls_r[i] + `PERF_CTR_BITS'(perf_unit_stalls_per_cycle_r[i]); + perf_valids_r[i] <= perf_valids_r[i] + `PERF_CTR_BITS'(perf_unit_valids_per_cycle_r[i]); + perf_fires_r[i] <= perf_fires_r[i] + `PERF_CTR_BITS'(perf_unit_fires_per_cycle_r[i]); end end end From 573be030c87fcb5652a201f5b7c33fba7d7dfac4 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sat, 23 Mar 2024 00:00:08 -0700 Subject: [PATCH 2/9] Add issue-stall-by-operand-hazard perf counters Do the same reduce by + instead of OR fix for scoreboard counters. --- hw/rtl/core/VX_scoreboard.sv | 64 ++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv index b0422656..1469d329 100644 --- a/hw/rtl/core/VX_scoreboard.sv +++ b/hw/rtl/core/VX_scoreboard.sv @@ -34,46 +34,73 @@ module VX_scoreboard import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_units_per_cycle; - wire [`NUM_EX_UNITS-1:0] perf_units_per_cycle, perf_units_per_cycle_r; + wire [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_units_per_cycle_r; + reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_units_per_cycle; reg [`ISSUE_WIDTH-1:0][`NUM_SFU_UNITS-1:0] perf_issue_sfu_per_cycle; - wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r; + wire [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_sfu_per_cycle_r; + reg [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_sfu_per_cycle; wire [`ISSUE_WIDTH-1:0] perf_issue_stalls_per_cycle; wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle, perf_stalls_per_cycle_r; + wire [`ISSUE_WIDTH-1:0] perf_issue_fires_per_cycle; + wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_fires_per_cycle, perf_fires_per_cycle_r; + `POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle); + `POP_COUNT(perf_fires_per_cycle, perf_issue_fires_per_cycle); + reg [`PERF_CTR_BITS-1:0] perf_scb_fires; + + for (genvar i=0; i < `NUM_EX_UNITS; ++i) begin + always @(*) begin + perf_units_per_cycle[i] = '0; + for (integer isw = 0; isw < `ISSUE_WIDTH; ++isw) begin + perf_units_per_cycle[i] = perf_units_per_cycle[i] + perf_issue_units_per_cycle[isw][i]; + end + end + end + for (genvar i=0; i < `NUM_SFU_UNITS; ++i) begin + always @(*) begin + perf_sfu_per_cycle[i] = '0; + for (integer isw = 0; isw < `ISSUE_WIDTH; ++isw) begin + perf_sfu_per_cycle[i] = perf_sfu_per_cycle[i] + perf_issue_sfu_per_cycle[isw][i]; + end + end + end // NOTE(hansung): Because of OR-reduce, things are counted as once even when // multiple warps were using the same execution unit type at a given cycle. // This might result in an overall undercount - VX_reduce #( - .DATAW_IN (`NUM_EX_UNITS), - .N (`ISSUE_WIDTH), - .OP ("|") - ) perf_units_reduce ( - .data_in (perf_issue_units_per_cycle), - .data_out (perf_units_per_cycle) - ); + // VX_reduce #( + // .DATAW_IN (`NUM_EX_UNITS), + // .N (`ISSUE_WIDTH), + // .OP ("|") + // ) perf_units_reduce ( + // .data_in (perf_issue_units_per_cycle), + // .data_out (perf_units_per_cycle) + // ); - VX_reduce #( - .DATAW_IN (`NUM_SFU_UNITS), - .N (`ISSUE_WIDTH), - .OP ("|") - ) perf_sfu_reduce ( - .data_in (perf_issue_sfu_per_cycle), - .data_out (perf_sfu_per_cycle) - ); + // VX_reduce #( + // .DATAW_IN (`NUM_SFU_UNITS), + // .N (`ISSUE_WIDTH), + // .OP ("|") + // ) perf_sfu_reduce ( + // .data_in (perf_issue_sfu_per_cycle), + // .data_out (perf_sfu_per_cycle) + // ); `BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle); + `BUFFER(perf_fires_per_cycle_r, perf_fires_per_cycle); `BUFFER(perf_units_per_cycle_r, perf_units_per_cycle); `BUFFER(perf_sfu_per_cycle_r, perf_sfu_per_cycle); always @(posedge clk) begin if (reset) begin perf_scb_stalls <= '0; + perf_scb_fires <= '0; end else begin perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r); + perf_scb_fires <= perf_scb_fires + `PERF_CTR_BITS'(perf_fires_per_cycle_r); end end @@ -153,6 +180,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #( end end assign perf_issue_stalls_per_cycle[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready; + assign perf_issue_fires_per_cycle[i] = ibuffer_if[i].valid && ibuffer_if[i].ready; `endif // NOTE(hansung): why is inuse_rd checked? to prevent WAW? From 83e151a189f19afe09c10822755d7104d632860c Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sat, 23 Mar 2024 00:01:15 -0700 Subject: [PATCH 3/9] Add valid / fire / cycles-issued perf counters to dispatch --- hw/rtl/core/VX_dispatch.sv | 19 +++++++++++++++++++ hw/rtl/core/VX_issue.sv | 4 ++++ hw/rtl/interfaces/VX_pipeline_perf_if.sv | 14 +++++++++++++- 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/hw/rtl/core/VX_dispatch.sv b/hw/rtl/core/VX_dispatch.sv index 10d6018d..fa7c99de 100644 --- a/hw/rtl/core/VX_dispatch.sv +++ b/hw/rtl/core/VX_dispatch.sv @@ -22,6 +22,9 @@ module VX_dispatch import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE output wire [`PERF_CTR_BITS-1:0] perf_stalls [`NUM_EX_UNITS], + output wire [`PERF_CTR_BITS-1:0] perf_valids [`NUM_EX_UNITS], + output wire [`PERF_CTR_BITS-1:0] perf_fires [`NUM_EX_UNITS], + output wire [`PERF_CTR_BITS-1:0] perf_any_fire_cycles, `endif // inputs VX_operands_if.slave operands_if [`ISSUE_WIDTH], @@ -188,6 +191,7 @@ module VX_dispatch import VX_gpu_pkg::*; #( reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_r; reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_valids_r; reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_fires_r; + reg [`PERF_CTR_BITS-1:0] perf_any_fire_cycles_r; for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin always @(*) begin @@ -232,23 +236,38 @@ module VX_dispatch import VX_gpu_pkg::*; #( `BUFFER(perf_unit_valids_per_cycle_r, perf_unit_valids_per_cycle); `BUFFER(perf_unit_fires_per_cycle_r, perf_unit_fires_per_cycle); + reg perf_any_fire; + always @(*) begin + perf_any_fire = 1'b0; + for (integer i = 0; i < `NUM_EX_UNITS; ++i) begin + if (perf_unit_fires_per_cycle_r[i] != '0) begin + perf_any_fire = 1'b1; + end + end + end + for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin always @(posedge clk) begin if (reset) begin perf_stalls_r[i] <= '0; perf_valids_r[i] <= '0; perf_fires_r[i] <= '0; + perf_any_fire_cycles_r <= '0; end else begin perf_stalls_r[i] <= perf_stalls_r[i] + `PERF_CTR_BITS'(perf_unit_stalls_per_cycle_r[i]); perf_valids_r[i] <= perf_valids_r[i] + `PERF_CTR_BITS'(perf_unit_valids_per_cycle_r[i]); perf_fires_r[i] <= perf_fires_r[i] + `PERF_CTR_BITS'(perf_unit_fires_per_cycle_r[i]); + perf_any_fire_cycles_r <= perf_any_fire_cycles_r + `PERF_CTR_BITS'(perf_any_fire); end end end for (genvar i=0; i < `NUM_EX_UNITS; ++i) begin assign perf_stalls[i] = perf_stalls_r[i]; + assign perf_valids[i] = perf_valids_r[i]; + assign perf_fires[i] = perf_fires_r[i]; end + assign perf_any_fire_cycles = perf_any_fire_cycles_r; `endif `ifdef DBG_TRACE_CORE_PIPELINE_VCS diff --git a/hw/rtl/core/VX_issue.sv b/hw/rtl/core/VX_issue.sv index 1ba4ca28..f3f1424e 100644 --- a/hw/rtl/core/VX_issue.sv +++ b/hw/rtl/core/VX_issue.sv @@ -86,6 +86,10 @@ module VX_issue #( .reset (dispatch_reset), `ifdef PERF_ENABLE `UNUSED_PIN (perf_stalls), + .perf_stalls (perf_issue_if.dispatch_stalls), + .perf_valids (perf_issue_if.dispatch_valids), + .perf_fires (perf_issue_if.dispatch_fires), + .perf_any_fire_cycles (perf_issue_if.dispatch_any_fire_cycles), `endif .operands_if (operands_if), .alu_dispatch_if(alu_dispatch_if), diff --git a/hw/rtl/interfaces/VX_pipeline_perf_if.sv b/hw/rtl/interfaces/VX_pipeline_perf_if.sv index 661ebcdf..29b2903a 100644 --- a/hw/rtl/interfaces/VX_pipeline_perf_if.sv +++ b/hw/rtl/interfaces/VX_pipeline_perf_if.sv @@ -21,6 +21,10 @@ interface VX_pipeline_perf_if (); wire [`PERF_CTR_BITS-1:0] scb_stalls; wire [`PERF_CTR_BITS-1:0] units_uses [`NUM_EX_UNITS]; wire [`PERF_CTR_BITS-1:0] sfu_uses [`NUM_SFU_UNITS]; + wire [`PERF_CTR_BITS-1:0] dispatch_stalls [`NUM_EX_UNITS]; + wire [`PERF_CTR_BITS-1:0] dispatch_valids [`NUM_EX_UNITS]; + wire [`PERF_CTR_BITS-1:0] dispatch_fires [`NUM_EX_UNITS]; + wire [`PERF_CTR_BITS-1:0] dispatch_any_fire_cycles; wire [`PERF_CTR_BITS-1:0] ifetches; wire [`PERF_CTR_BITS-1:0] loads; @@ -38,7 +42,11 @@ interface VX_pipeline_perf_if (); output ibf_stalls, output scb_stalls, output units_uses, - output sfu_uses + output sfu_uses, + output dispatch_stalls, + output dispatch_valids, + output dispatch_fires, + output dispatch_any_fire_cycles ); modport slave ( @@ -49,6 +57,10 @@ interface VX_pipeline_perf_if (); input scb_stalls, input units_uses, input sfu_uses, + input dispatch_stalls, + input dispatch_valids, + input dispatch_fires, + input dispatch_any_fire_cycles, input ifetches, input loads, input stores, From d99295793ce93aa6d5029c280832ffc6981e0d2d Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sat, 23 Mar 2024 00:02:02 -0700 Subject: [PATCH 4/9] Periodically report perf counter; reformat operand/FU stalls --- hw/rtl/core/VX_core.sv | 203 +++++++++++++++++++++++++++-------------- 1 file changed, 132 insertions(+), 71 deletions(-) diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index dab5836c..623b1a35 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -335,99 +335,160 @@ module VX_core import VX_gpu_pkg::*; #( assign pipeline_perf_if.load_latency = perf_dcache_lat; assign pipeline_perf_if.ifetch_latency = perf_icache_lat; int instrs; - assign instrs = commit_csr_if.instret; + assign instrs = 32'(commit_csr_if.instret); int cycles; - assign cycles = sched_csr_if.cycles; + assign cycles = 32'(sched_csr_if.cycles); int icache_lat; - assign icache_lat = perf_icache_lat; + assign icache_lat = 32'(perf_icache_lat); int ifetches; - assign ifetches = perf_ifetches; + assign ifetches = 32'(perf_ifetches); int dcache_lat; - assign dcache_lat = perf_dcache_lat; + assign dcache_lat = 32'(perf_dcache_lat); int loads; - assign loads = perf_loads; + assign loads = 32'(perf_loads); int scheduler_idles; - assign scheduler_idles = pipeline_perf_if.sched_idles; + assign scheduler_idles = 32'(pipeline_perf_if.sched_idles); int scheduler_stalls; - assign scheduler_stalls = pipeline_perf_if.sched_stalls; + assign scheduler_stalls = 32'(pipeline_perf_if.sched_stalls); int scheduler_barrier_stalls; - assign scheduler_barrier_stalls = pipeline_perf_if.sched_barrier_stalls; + assign scheduler_barrier_stalls = 32'(pipeline_perf_if.sched_barrier_stalls); int ibuf_stalls; - assign ibuf_stalls = pipeline_perf_if.ibf_stalls; + assign ibuf_stalls = 32'(pipeline_perf_if.ibf_stalls); int scrb_alu_per_core; - assign scrb_alu_per_core = pipeline_perf_if.units_uses[`EX_ALU]; + assign scrb_alu_per_core = 32'(pipeline_perf_if.units_uses[`EX_ALU]); int scrb_fpu_per_core; - assign scrb_fpu_per_core = pipeline_perf_if.units_uses[`EX_FPU]; + assign scrb_fpu_per_core = 32'(pipeline_perf_if.units_uses[`EX_FPU]); int scrb_lsu_per_core; - assign scrb_lsu_per_core = pipeline_perf_if.units_uses[`EX_LSU]; + assign scrb_lsu_per_core = 32'(pipeline_perf_if.units_uses[`EX_LSU]); int scrb_sfu_per_core; - assign scrb_sfu_per_core = pipeline_perf_if.units_uses[`EX_SFU]; + assign scrb_sfu_per_core = 32'(pipeline_perf_if.units_uses[`EX_SFU]); int scrb_tot; assign scrb_tot = scrb_alu_per_core+scrb_fpu_per_core+scrb_lsu_per_core+scrb_sfu_per_core; int scrb_wctl_per_core; - assign scrb_wctl_per_core = pipeline_perf_if.sfu_uses[`SFU_WCTL]; + assign scrb_wctl_per_core = 32'(pipeline_perf_if.sfu_uses[`SFU_WCTL]); int scrb_csrs_per_core; - assign scrb_csrs_per_core = pipeline_perf_if.sfu_uses[`SFU_CSRS]; + assign scrb_csrs_per_core = 32'(pipeline_perf_if.sfu_uses[`SFU_CSRS]); int sfu_tot; assign sfu_tot = scrb_wctl_per_core+scrb_csrs_per_core; - always @(negedge busy) begin - if (!reset) begin - $display("====================CORE : %d===================",CORE_ID); - $display("time : %t", $time); - // $display("perf_dcache_rd_req_per_cycle: %d", perf_dcache_rd_req_per_cycle); - // $display("perf_dcache_wr_req_per_cycle: %d", perf_dcache_wr_req_per_cycle); - // $display("perf_dcache_rsp_per_cycle: %d", perf_dcache_rsp_per_cycle); - // $display("perf_icache_pending_read_cycle: %d", perf_icache_pending_read_cycle); - // $display("perf_dcache_pending_read_cycle: %d", perf_dcache_pending_read_cycle); - // $display("perf_icache_pending_reads: %d", perf_icache_pending_reads); - // $display("perf_dcache_pending_reads: %d", perf_dcache_pending_reads); - // $display("perf_icache_req_fire: %b", perf_icache_req_fire); - // $display("perf_icache_rsp_fire: %b", perf_icache_rsp_fire); - // $display("perf_dcache_rd_req_fire: %b", perf_dcache_rd_req_fire); - // $display("perf_dcache_rd_req_fire_r: %b", perf_dcache_rd_req_fire_r); - // $display("perf_dcache_wr_req_fire: %b", perf_dcache_wr_req_fire); - // $display("perf_dcache_wr_req_fire_r: %b", perf_dcache_wr_req_fire_r); - // $display("perf_dcache_rsp_fire: %b", perf_dcache_rsp_fire); + reg busy_prev; + reg [31:0] report_counter; - $display("Instructions: %d, Cycles: %d, IPC: %f", commit_csr_if.instret, sched_csr_if.cycles, - $itor(instrs) / $itor(cycles)); - $display("scheduler idle: %d cycles (%f%%)", pipeline_perf_if.sched_idles, - $itor(scheduler_idles) / $itor(cycles) * 100.0); - $display("scheduler stalls: %d cycles (%f%%)", pipeline_perf_if.sched_stalls, - $itor(scheduler_stalls) / $itor(cycles) * 100.0); - $display("scheduler barrier stalls: %d count across NUM_WARPS=%d (%f%%)", - pipeline_perf_if.sched_barrier_stalls, - `NUM_WARPS, - $itor(scheduler_barrier_stalls) / $itor(cycles) * 100.0); - $display("ibuffer stalls: %d cycles (%f%%)",pipeline_perf_if.ibf_stalls, - $itor(ibuf_stalls) / $itor(cycles) * 100.0); - // see VX_scoreboard.sv - $display("issue stalls: %d (summed across ISSUE_WIDTH=%d)", - pipeline_perf_if.scb_stalls, `ISSUE_WIDTH); - $display("issue stalls: alu %d (%f%%)", - scrb_alu_per_core, - $itor(scrb_alu_per_core) / $itor(scrb_tot) * 100.0); - $display("issue stalls: fpu %d (%f%%)", - scrb_fpu_per_core, - $itor(scrb_fpu_per_core) / $itor(scrb_tot) * 100.0); - $display("issue stalls: lsu %d (%f%%)", - scrb_lsu_per_core, - $itor(scrb_lsu_per_core) / $itor(scrb_tot) * 100.0); - $display("issue stalls: sfu %d (%f%%)", - scrb_sfu_per_core, - $itor(scrb_sfu_per_core) / $itor(scrb_tot) * 100.0); - $display("sfu stalls: %d (scrs=%f, wctl=%f)",pipeline_perf_if.units_uses[`EX_SFU], - $itor(scrb_csrs_per_core) / $itor(sfu_tot) * 100.0, - $itor(scrb_wctl_per_core) / $itor(sfu_tot) * 100.0); - $display("ifetches: %d", perf_ifetches); - $display("ifetch latency: %f Cycles", - $itor(icache_lat) / $itor(ifetches)); - $display("loads: %d", perf_loads); - $display("load latency: %f Cycles", - $itor(dcache_lat) / $itor(loads)); - $display("stores: %d", perf_stores); + always @(posedge clk) begin + if (reset) begin + busy_prev <= 1'b0; + report_counter <= 32'd0; + end else begin + busy_prev <= busy; + if (report_counter == 32'd10000) begin + report_counter <= 32'd0; + end else begin + report_counter <= report_counter + 32'd1; + end + end + end + + wire busy_negedge; + assign busy_negedge = busy_prev && !busy; + + reg [`PERF_CTR_BITS-1:0] dispatch_fires_total; + always @(*) begin + dispatch_fires_total = '0; + for (integer i = 0; i < `NUM_EX_UNITS; i++) begin + dispatch_fires_total = dispatch_fires_total + pipeline_perf_if.dispatch_fires[i]; + end + end + + always @(posedge clk) begin + if (!reset && (busy_negedge || (report_counter == 32'd0))) begin + $display("====================CORE : %d===================",CORE_ID); + $display("time : %t", $time); + // disabled as always zero + // $display("perf_dcache_rd_req_per_cycle: %d", perf_dcache_rd_req_per_cycle); + // $display("perf_dcache_wr_req_per_cycle: %d", perf_dcache_wr_req_per_cycle); + // $display("perf_dcache_rsp_per_cycle: %d", perf_dcache_rsp_per_cycle); + // $display("perf_icache_pending_read_cycle: %d", perf_icache_pending_read_cycle); + // $display("perf_dcache_pending_read_cycle: %d", perf_dcache_pending_read_cycle); + // $display("perf_icache_pending_reads: %d", perf_icache_pending_reads); + // $display("perf_dcache_pending_reads: %d", perf_dcache_pending_reads); + // $display("perf_icache_req_fire: %b", perf_icache_req_fire); + // $display("perf_icache_rsp_fire: %b", perf_icache_rsp_fire); + // $display("perf_dcache_rd_req_fire: %b", perf_dcache_rd_req_fire); + // $display("perf_dcache_rd_req_fire_r: %b", perf_dcache_rd_req_fire_r); + // $display("perf_dcache_wr_req_fire: %b", perf_dcache_wr_req_fire); + // $display("perf_dcache_wr_req_fire_r: %b", perf_dcache_wr_req_fire_r); + // $display("perf_dcache_rsp_fire: %b", perf_dcache_rsp_fire); + + $display("Instructions: %d, Cycles: %d, IPC: %f", commit_csr_if.instret, sched_csr_if.cycles, + $itor(instrs) / $itor(cycles)); + $display("scheduler idle: %d cycles (%.2f%%)", pipeline_perf_if.sched_idles, + $itor(scheduler_idles) / $itor(cycles) * 100.0); + $display("scheduler stalls: %d cycles (%.2f%%)", pipeline_perf_if.sched_stalls, + $itor(scheduler_stalls) / $itor(cycles) * 100.0); + $display("scheduler barrier stalls: %d count across NUM_WARPS=%d (%.2f%%)", + pipeline_perf_if.sched_barrier_stalls, + `NUM_WARPS, + $itor(scheduler_barrier_stalls) / $itor(cycles) * 100.0); + $display("ibuffer stalls: %d cycles (%.2f%%)",pipeline_perf_if.ibf_stalls, + $itor(ibuf_stalls) / $itor(cycles) * 100.0); + // see VX_scoreboard.sv + // scb_stalls: valid & ~ready (ready = stg_ready_in && operands_ready) + // units_uses: valid & ~operands_ready + // this will be a subset of scb_stalls + $display("issue scoreboard: stalls total: %d (summed across ISSUE_WIDTH=%d)", + pipeline_perf_if.scb_stalls, `ISSUE_WIDTH); + $display("issue scoreboard: stalls by operand hazard: alu %d (%.2f%%) (%.2f cycles per issue)", + scrb_alu_per_core, + $itor(scrb_alu_per_core) / $itor(scrb_tot) * 100.0, + $itor(scrb_alu_per_core) / $itor(dispatch_fires_total)); + $display("issue scoreboard: stalls by operand hazard: fpu %d (%.2f%%) (%.2f cycles per issue)", + scrb_fpu_per_core, + $itor(scrb_fpu_per_core) / $itor(scrb_tot) * 100.0, + $itor(scrb_fpu_per_core) / $itor(dispatch_fires_total)); + $display("issue scoreboard: stalls by operand hazard: lsu %d (%.2f%%) (%.2f cycles per issue)", + scrb_lsu_per_core, + $itor(scrb_lsu_per_core) / $itor(scrb_tot) * 100.0, + $itor(scrb_lsu_per_core) / $itor(dispatch_fires_total)); + $display("issue scoreboard: stalls by operand hazard: sfu %d (%.2f%%) (%.2f cycles per issue)", + scrb_sfu_per_core, + $itor(scrb_sfu_per_core) / $itor(scrb_tot) * 100.0, + $itor(scrb_sfu_per_core) / $itor(dispatch_fires_total)); + $display("issue scoreboard: sfu stalls: %d (scrs=%f, wctl=%f)",pipeline_perf_if.units_uses[`EX_SFU], + $itor(scrb_csrs_per_core) / $itor(sfu_tot) * 100.0, + $itor(scrb_wctl_per_core) / $itor(sfu_tot) * 100.0); + $display("issue dispatch: stalls by FU busy: alu %d (%.2f cycles per issue)", + pipeline_perf_if.dispatch_stalls[`EX_ALU], + $itor(pipeline_perf_if.dispatch_stalls[`EX_ALU]) / $itor(dispatch_fires_total)); + $display("issue dispatch: stalls by FU busy: fpu %d (%.2f cycles per issue)", + pipeline_perf_if.dispatch_stalls[`EX_FPU], + $itor(pipeline_perf_if.dispatch_stalls[`EX_FPU]) / $itor(dispatch_fires_total)); + $display("issue dispatch: stalls by FU busy: lsu %d (%.2f cycles per issue)", + pipeline_perf_if.dispatch_stalls[`EX_LSU], + $itor(pipeline_perf_if.dispatch_stalls[`EX_LSU]) / $itor(dispatch_fires_total)); + $display("issue dispatch: stalls by FU busy: sfu %d (%.2f cycles per issue)", + pipeline_perf_if.dispatch_stalls[`EX_SFU], + $itor(pipeline_perf_if.dispatch_stalls[`EX_SFU]) / $itor(dispatch_fires_total)); + $display("issue dispatch: fires: total %d", + dispatch_fires_total); + $display("issue dispatch: fires: alu %d", + pipeline_perf_if.dispatch_fires[`EX_ALU]); + $display("issue dispatch: fires: fpu %d", + pipeline_perf_if.dispatch_fires[`EX_FPU]); + $display("issue dispatch: fires: lsu %d", + pipeline_perf_if.dispatch_fires[`EX_LSU]); + $display("issue dispatch: fires: sfu %d", + pipeline_perf_if.dispatch_fires[`EX_SFU]); + $display("issue dispatch: cycles issued: %d (%.2f%%)", + pipeline_perf_if.dispatch_any_fire_cycles, + $itor(pipeline_perf_if.dispatch_any_fire_cycles) / $itor(cycles) * 100.0); + $display("ifetches: %d", perf_ifetches); + $display("ifetch latency: %f Cycles", + $itor(icache_lat) / $itor(ifetches)); + $display("loads: %d", perf_loads); + $display("load latency: %f Cycles", + $itor(dcache_lat) / $itor(loads)); + $display("stores: %d", perf_stores); end end From 3e6a9a610403f0f3b9f76c23fc05f259a948ee06 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sat, 23 Mar 2024 01:06:40 -0700 Subject: [PATCH 5/9] Expose scoreboard fires to perf interface --- hw/rtl/core/VX_issue.sv | 1 + hw/rtl/core/VX_scoreboard.sv | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/rtl/core/VX_issue.sv b/hw/rtl/core/VX_issue.sv index f3f1424e..ef7cf31b 100644 --- a/hw/rtl/core/VX_issue.sv +++ b/hw/rtl/core/VX_issue.sv @@ -61,6 +61,7 @@ module VX_issue #( .reset (scoreboard_reset), `ifdef PERF_ENABLE .perf_scb_stalls(perf_issue_if.scb_stalls), + .perf_scb_fires (perf_issue_if.scb_fires), .perf_units_uses(perf_issue_if.units_uses), .perf_sfu_uses (perf_issue_if.sfu_uses), `endif diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv index 1469d329..9d66d200 100644 --- a/hw/rtl/core/VX_scoreboard.sv +++ b/hw/rtl/core/VX_scoreboard.sv @@ -21,6 +21,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls, + output reg [`PERF_CTR_BITS-1:0] perf_scb_fires, output reg [`PERF_CTR_BITS-1:0] perf_units_uses [`NUM_EX_UNITS], output reg [`PERF_CTR_BITS-1:0] perf_sfu_uses [`NUM_SFU_UNITS], `endif @@ -49,7 +50,6 @@ module VX_scoreboard import VX_gpu_pkg::*; #( `POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle); `POP_COUNT(perf_fires_per_cycle, perf_issue_fires_per_cycle); - reg [`PERF_CTR_BITS-1:0] perf_scb_fires; for (genvar i=0; i < `NUM_EX_UNITS; ++i) begin always @(*) begin From dd90736382805ea12fd9901bafb84f5f65bcc893 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sat, 23 Mar 2024 01:07:34 -0700 Subject: [PATCH 6/9] Reformat perfcount report --- hw/rtl/core/VX_core.sv | 43 ++++++++++++++++-------------------------- 1 file changed, 16 insertions(+), 27 deletions(-) diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index 623b1a35..4ffd392f 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -346,14 +346,6 @@ module VX_core import VX_gpu_pkg::*; #( assign dcache_lat = 32'(perf_dcache_lat); int loads; assign loads = 32'(perf_loads); - int scheduler_idles; - assign scheduler_idles = 32'(pipeline_perf_if.sched_idles); - int scheduler_stalls; - assign scheduler_stalls = 32'(pipeline_perf_if.sched_stalls); - int scheduler_barrier_stalls; - assign scheduler_barrier_stalls = 32'(pipeline_perf_if.sched_barrier_stalls); - int ibuf_stalls; - assign ibuf_stalls = 32'(pipeline_perf_if.ibf_stalls); int scrb_alu_per_core; assign scrb_alu_per_core = 32'(pipeline_perf_if.units_uses[`EX_ALU]); int scrb_fpu_per_core; @@ -364,7 +356,6 @@ module VX_core import VX_gpu_pkg::*; #( assign scrb_sfu_per_core = 32'(pipeline_perf_if.units_uses[`EX_SFU]); int scrb_tot; assign scrb_tot = scrb_alu_per_core+scrb_fpu_per_core+scrb_lsu_per_core+scrb_sfu_per_core; - int scrb_wctl_per_core; assign scrb_wctl_per_core = 32'(pipeline_perf_if.sfu_uses[`SFU_WCTL]); int scrb_csrs_per_core; @@ -423,50 +414,48 @@ module VX_core import VX_gpu_pkg::*; #( $display("Instructions: %d, Cycles: %d, IPC: %f", commit_csr_if.instret, sched_csr_if.cycles, $itor(instrs) / $itor(cycles)); $display("scheduler idle: %d cycles (%.2f%%)", pipeline_perf_if.sched_idles, - $itor(scheduler_idles) / $itor(cycles) * 100.0); + $itor(pipeline_perf_if.sched_idles) / $itor(cycles) * 100.0); $display("scheduler stalls: %d cycles (%.2f%%)", pipeline_perf_if.sched_stalls, - $itor(scheduler_stalls) / $itor(cycles) * 100.0); + $itor(pipeline_perf_if.sched_stalls) / $itor(cycles) * 100.0); $display("scheduler barrier stalls: %d count across NUM_WARPS=%d (%.2f%%)", pipeline_perf_if.sched_barrier_stalls, `NUM_WARPS, - $itor(scheduler_barrier_stalls) / $itor(cycles) * 100.0); - $display("ibuffer stalls: %d cycles (%.2f%%)",pipeline_perf_if.ibf_stalls, - $itor(ibuf_stalls) / $itor(cycles) * 100.0); + $itor(pipeline_perf_if.sched_barrier_stalls) / $itor(cycles) * 100.0); + $display("decode stalls: %d cycles (%.2f%%)",pipeline_perf_if.ibf_stalls, + $itor(pipeline_perf_if.ibf_stalls) / $itor(cycles) * 100.0); // see VX_scoreboard.sv // scb_stalls: valid & ~ready (ready = stg_ready_in && operands_ready) // units_uses: valid & ~operands_ready // this will be a subset of scb_stalls - $display("issue scoreboard: stalls total: %d (summed across ISSUE_WIDTH=%d)", + $display("issue scoreboard: fires total:\t%d across ISSUE_WIDTH=%d", + pipeline_perf_if.scb_fires, `ISSUE_WIDTH); + $display("issue scoreboard: stalls total:\t%d across ISSUE_WIDTH=%d", pipeline_perf_if.scb_stalls, `ISSUE_WIDTH); - $display("issue scoreboard: stalls by operand hazard: alu %d (%.2f%%) (%.2f cycles per issue)", + $display("issue scoreboard: stalls by operand hazard: alu %d (%2.2f cycles per issue)", scrb_alu_per_core, - $itor(scrb_alu_per_core) / $itor(scrb_tot) * 100.0, $itor(scrb_alu_per_core) / $itor(dispatch_fires_total)); - $display("issue scoreboard: stalls by operand hazard: fpu %d (%.2f%%) (%.2f cycles per issue)", + $display("issue scoreboard: stalls by operand hazard: fpu %d (%2.2f cycles per issue)", scrb_fpu_per_core, - $itor(scrb_fpu_per_core) / $itor(scrb_tot) * 100.0, $itor(scrb_fpu_per_core) / $itor(dispatch_fires_total)); - $display("issue scoreboard: stalls by operand hazard: lsu %d (%.2f%%) (%.2f cycles per issue)", + $display("issue scoreboard: stalls by operand hazard: lsu %d (%2.2f cycles per issue)", scrb_lsu_per_core, - $itor(scrb_lsu_per_core) / $itor(scrb_tot) * 100.0, $itor(scrb_lsu_per_core) / $itor(dispatch_fires_total)); - $display("issue scoreboard: stalls by operand hazard: sfu %d (%.2f%%) (%.2f cycles per issue)", + $display("issue scoreboard: stalls by operand hazard: sfu %d (%2.2f cycles per issue)", scrb_sfu_per_core, - $itor(scrb_sfu_per_core) / $itor(scrb_tot) * 100.0, $itor(scrb_sfu_per_core) / $itor(dispatch_fires_total)); $display("issue scoreboard: sfu stalls: %d (scrs=%f, wctl=%f)",pipeline_perf_if.units_uses[`EX_SFU], $itor(scrb_csrs_per_core) / $itor(sfu_tot) * 100.0, $itor(scrb_wctl_per_core) / $itor(sfu_tot) * 100.0); - $display("issue dispatch: stalls by FU busy: alu %d (%.2f cycles per issue)", + $display("issue dispatch: stalls by FU busy: alu %d (%2.2f cycles per issue)", pipeline_perf_if.dispatch_stalls[`EX_ALU], $itor(pipeline_perf_if.dispatch_stalls[`EX_ALU]) / $itor(dispatch_fires_total)); - $display("issue dispatch: stalls by FU busy: fpu %d (%.2f cycles per issue)", + $display("issue dispatch: stalls by FU busy: fpu %d (%2.2f cycles per issue)", pipeline_perf_if.dispatch_stalls[`EX_FPU], $itor(pipeline_perf_if.dispatch_stalls[`EX_FPU]) / $itor(dispatch_fires_total)); - $display("issue dispatch: stalls by FU busy: lsu %d (%.2f cycles per issue)", + $display("issue dispatch: stalls by FU busy: lsu %d (%2.2f cycles per issue)", pipeline_perf_if.dispatch_stalls[`EX_LSU], $itor(pipeline_perf_if.dispatch_stalls[`EX_LSU]) / $itor(dispatch_fires_total)); - $display("issue dispatch: stalls by FU busy: sfu %d (%.2f cycles per issue)", + $display("issue dispatch: stalls by FU busy: sfu %d (%2.2f cycles per issue)", pipeline_perf_if.dispatch_stalls[`EX_SFU], $itor(pipeline_perf_if.dispatch_stalls[`EX_SFU]) / $itor(dispatch_fires_total)); $display("issue dispatch: fires: total %d", From 50263a5f7d7135d7434fc93583fa69a1e8214590 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 28 Mar 2024 22:45:12 -0700 Subject: [PATCH 7/9] Rename sched_barrier_stalls -> perf_sched_barrier_idles Sched stall by barrier is really idle because it causes !scheduler_if.valid, which is counted as part of sched_idle. --- hw/rtl/core/VX_core.sv | 18 +++++++++--------- hw/rtl/core/VX_schedule.sv | 12 ++++++------ hw/rtl/interfaces/VX_pipeline_perf_if.sv | 9 ++++++--- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index 4ffd392f..37db6123 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -415,13 +415,13 @@ module VX_core import VX_gpu_pkg::*; #( $itor(instrs) / $itor(cycles)); $display("scheduler idle: %d cycles (%.2f%%)", pipeline_perf_if.sched_idles, $itor(pipeline_perf_if.sched_idles) / $itor(cycles) * 100.0); + $display("scheduler barrier idle: %d count across NUM_WARPS=%d", + pipeline_perf_if.sched_barrier_idles, `NUM_WARPS); + // sched_stalls can happen when the later issue stage stalls, + // causing the ibuffer to clog. $display("scheduler stalls: %d cycles (%.2f%%)", pipeline_perf_if.sched_stalls, $itor(pipeline_perf_if.sched_stalls) / $itor(cycles) * 100.0); - $display("scheduler barrier stalls: %d count across NUM_WARPS=%d (%.2f%%)", - pipeline_perf_if.sched_barrier_stalls, - `NUM_WARPS, - $itor(pipeline_perf_if.sched_barrier_stalls) / $itor(cycles) * 100.0); - $display("decode stalls: %d cycles (%.2f%%)",pipeline_perf_if.ibf_stalls, + $display("decode stalls (ibuffer not ready): %d cycles (%.2f%%)",pipeline_perf_if.ibf_stalls, $itor(pipeline_perf_if.ibf_stalls) / $itor(cycles) * 100.0); // see VX_scoreboard.sv // scb_stalls: valid & ~ready (ready = stg_ready_in && operands_ready) @@ -472,12 +472,12 @@ module VX_core import VX_gpu_pkg::*; #( pipeline_perf_if.dispatch_any_fire_cycles, $itor(pipeline_perf_if.dispatch_any_fire_cycles) / $itor(cycles) * 100.0); $display("ifetches: %d", perf_ifetches); - $display("ifetch latency: %f Cycles", + $display("ifetch latency: %f cycles", $itor(icache_lat) / $itor(ifetches)); - $display("loads: %d", perf_loads); - $display("load latency: %f Cycles", + $display("dcache loads: %d", perf_loads); + $display("dcache load latency: %f cycles", $itor(dcache_lat) / $itor(loads)); - $display("stores: %d", perf_stores); + $display("dcache stores: %d", perf_stores); end end diff --git a/hw/rtl/core/VX_schedule.sv b/hw/rtl/core/VX_schedule.sv index c890a2f6..8c165261 100644 --- a/hw/rtl/core/VX_schedule.sv +++ b/hw/rtl/core/VX_schedule.sv @@ -413,28 +413,28 @@ module VX_schedule import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE reg [`PERF_CTR_BITS-1:0] perf_sched_idles; reg [`PERF_CTR_BITS-1:0] perf_sched_stalls; - reg [`PERF_CTR_BITS-1:0] perf_sched_barrier_stalls; + reg [`PERF_CTR_BITS-1:0] perf_sched_barrier_idles; wire schedule_idle = ~schedule_valid; wire schedule_stall = schedule_if.valid && ~schedule_if.ready; - wire [`CLOG2(`NUM_WARPS+1)-1:0] schedule_barrier_stall; - `POP_COUNT(schedule_barrier_stall, barrier_stalls); + wire [`CLOG2(`NUM_WARPS+1)-1:0] schedule_barrier_idle; + `POP_COUNT(schedule_barrier_idle, barrier_stalls); always @(posedge clk) begin if (reset) begin perf_sched_idles <= '0; + perf_sched_barrier_idles <= '0; perf_sched_stalls <= '0; - perf_sched_barrier_stalls <= '0; end else begin perf_sched_idles <= perf_sched_idles + `PERF_CTR_BITS'(schedule_idle); + perf_sched_barrier_idles <= perf_sched_barrier_idles + `PERF_CTR_BITS'(schedule_barrier_idle); perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(schedule_stall); - perf_sched_barrier_stalls <= perf_sched_barrier_stalls + `PERF_CTR_BITS'(schedule_barrier_stall); end end assign perf_schedule_if.sched_idles = perf_sched_idles; + assign perf_schedule_if.sched_barrier_idles = perf_sched_barrier_idles; assign perf_schedule_if.sched_stalls = perf_sched_stalls; - assign perf_schedule_if.sched_barrier_stalls = perf_sched_barrier_stalls; `endif endmodule diff --git a/hw/rtl/interfaces/VX_pipeline_perf_if.sv b/hw/rtl/interfaces/VX_pipeline_perf_if.sv index 29b2903a..afdca4fa 100644 --- a/hw/rtl/interfaces/VX_pipeline_perf_if.sv +++ b/hw/rtl/interfaces/VX_pipeline_perf_if.sv @@ -16,9 +16,10 @@ interface VX_pipeline_perf_if (); wire [`PERF_CTR_BITS-1:0] sched_idles; wire [`PERF_CTR_BITS-1:0] sched_stalls; - wire [`PERF_CTR_BITS-1:0] sched_barrier_stalls; + wire [`PERF_CTR_BITS-1:0] sched_barrier_idles; wire [`PERF_CTR_BITS-1:0] ibf_stalls; wire [`PERF_CTR_BITS-1:0] scb_stalls; + wire [`PERF_CTR_BITS-1:0] scb_fires; wire [`PERF_CTR_BITS-1:0] units_uses [`NUM_EX_UNITS]; wire [`PERF_CTR_BITS-1:0] sfu_uses [`NUM_SFU_UNITS]; wire [`PERF_CTR_BITS-1:0] dispatch_stalls [`NUM_EX_UNITS]; @@ -34,13 +35,14 @@ interface VX_pipeline_perf_if (); modport schedule ( output sched_idles, - output sched_barrier_stalls, + output sched_barrier_idles, output sched_stalls ); modport issue ( output ibf_stalls, output scb_stalls, + output scb_fires, output units_uses, output sfu_uses, output dispatch_stalls, @@ -51,10 +53,11 @@ interface VX_pipeline_perf_if (); modport slave ( input sched_idles, - input sched_barrier_stalls, + input sched_barrier_idles, input sched_stalls, input ibf_stalls, input scb_stalls, + input scb_fires, input units_uses, input sfu_uses, input dispatch_stalls, From 62c7d1f4cf8e97fd3f0ce2e75bb1af7c821c2c19 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 29 Mar 2024 12:23:15 -0700 Subject: [PATCH 8/9] Report any fire cycles from scoreboard as well --- hw/rtl/core/VX_core.sv | 5 ++++- hw/rtl/core/VX_dispatch.sv | 8 +++---- hw/rtl/core/VX_issue.sv | 1 + hw/rtl/core/VX_scoreboard.sv | 27 ++++++++++++++++++++++-- hw/rtl/interfaces/VX_pipeline_perf_if.sv | 3 +++ 5 files changed, 37 insertions(+), 7 deletions(-) diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index 37db6123..45426053 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -429,6 +429,9 @@ module VX_core import VX_gpu_pkg::*; #( // this will be a subset of scb_stalls $display("issue scoreboard: fires total:\t%d across ISSUE_WIDTH=%d", pipeline_perf_if.scb_fires, `ISSUE_WIDTH); + $display("issue scoreboard: cycles fired:\t%d (%.2f%%)", + pipeline_perf_if.scb_any_fire_cycles, + $itor(pipeline_perf_if.scb_any_fire_cycles) / $itor(cycles) * 100.0); $display("issue scoreboard: stalls total:\t%d across ISSUE_WIDTH=%d", pipeline_perf_if.scb_stalls, `ISSUE_WIDTH); $display("issue scoreboard: stalls by operand hazard: alu %d (%2.2f cycles per issue)", @@ -468,7 +471,7 @@ module VX_core import VX_gpu_pkg::*; #( pipeline_perf_if.dispatch_fires[`EX_LSU]); $display("issue dispatch: fires: sfu %d", pipeline_perf_if.dispatch_fires[`EX_SFU]); - $display("issue dispatch: cycles issued: %d (%.2f%%)", + $display("issue dispatch: cycles fired: %d (%.2f%%)", pipeline_perf_if.dispatch_any_fire_cycles, $itor(pipeline_perf_if.dispatch_any_fire_cycles) / $itor(cycles) * 100.0); $display("ifetches: %d", perf_ifetches); diff --git a/hw/rtl/core/VX_dispatch.sv b/hw/rtl/core/VX_dispatch.sv index fa7c99de..0700d077 100644 --- a/hw/rtl/core/VX_dispatch.sv +++ b/hw/rtl/core/VX_dispatch.sv @@ -236,12 +236,12 @@ module VX_dispatch import VX_gpu_pkg::*; #( `BUFFER(perf_unit_valids_per_cycle_r, perf_unit_valids_per_cycle); `BUFFER(perf_unit_fires_per_cycle_r, perf_unit_fires_per_cycle); - reg perf_any_fire; + reg perf_any_fire_per_cycle; always @(*) begin - perf_any_fire = 1'b0; + perf_any_fire_per_cycle = 1'b0; for (integer i = 0; i < `NUM_EX_UNITS; ++i) begin if (perf_unit_fires_per_cycle_r[i] != '0) begin - perf_any_fire = 1'b1; + perf_any_fire_per_cycle = 1'b1; end end end @@ -257,7 +257,7 @@ module VX_dispatch import VX_gpu_pkg::*; #( perf_stalls_r[i] <= perf_stalls_r[i] + `PERF_CTR_BITS'(perf_unit_stalls_per_cycle_r[i]); perf_valids_r[i] <= perf_valids_r[i] + `PERF_CTR_BITS'(perf_unit_valids_per_cycle_r[i]); perf_fires_r[i] <= perf_fires_r[i] + `PERF_CTR_BITS'(perf_unit_fires_per_cycle_r[i]); - perf_any_fire_cycles_r <= perf_any_fire_cycles_r + `PERF_CTR_BITS'(perf_any_fire); + perf_any_fire_cycles_r <= perf_any_fire_cycles_r + `PERF_CTR_BITS'(perf_any_fire_per_cycle); end end end diff --git a/hw/rtl/core/VX_issue.sv b/hw/rtl/core/VX_issue.sv index ef7cf31b..4e79ce70 100644 --- a/hw/rtl/core/VX_issue.sv +++ b/hw/rtl/core/VX_issue.sv @@ -62,6 +62,7 @@ module VX_issue #( `ifdef PERF_ENABLE .perf_scb_stalls(perf_issue_if.scb_stalls), .perf_scb_fires (perf_issue_if.scb_fires), + .perf_scb_any_fire_cycles (perf_issue_if.scb_any_fire_cycles), .perf_units_uses(perf_issue_if.units_uses), .perf_sfu_uses (perf_issue_if.sfu_uses), `endif diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv index 9d66d200..fe038fb5 100644 --- a/hw/rtl/core/VX_scoreboard.sv +++ b/hw/rtl/core/VX_scoreboard.sv @@ -22,6 +22,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls, output reg [`PERF_CTR_BITS-1:0] perf_scb_fires, + output reg [`PERF_CTR_BITS-1:0] perf_scb_any_fire_cycles, output reg [`PERF_CTR_BITS-1:0] perf_units_uses [`NUM_EX_UNITS], output reg [`PERF_CTR_BITS-1:0] perf_sfu_uses [`NUM_SFU_UNITS], `endif @@ -47,9 +48,13 @@ module VX_scoreboard import VX_gpu_pkg::*; #( wire [`ISSUE_WIDTH-1:0] perf_issue_fires_per_cycle; wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_fires_per_cycle, perf_fires_per_cycle_r; + wire perf_any_fire_per_cycle, perf_any_fire_per_cycle_r; + + reg [`PERF_CTR_BITS-1:0] perf_scb_empty; `POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle); - `POP_COUNT(perf_fires_per_cycle, perf_issue_fires_per_cycle); + `POP_COUNT(perf_fires_per_cycle, perf_issue_fires_per_cycle); + assign perf_any_fire_per_cycle = |perf_issue_fires_per_cycle; for (genvar i=0; i < `NUM_EX_UNITS; ++i) begin always @(*) begin @@ -91,16 +96,19 @@ module VX_scoreboard import VX_gpu_pkg::*; #( `BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle); `BUFFER(perf_fires_per_cycle_r, perf_fires_per_cycle); + `BUFFER(perf_any_fire_per_cycle_r, perf_any_fire_per_cycle); `BUFFER(perf_units_per_cycle_r, perf_units_per_cycle); `BUFFER(perf_sfu_per_cycle_r, perf_sfu_per_cycle); always @(posedge clk) begin if (reset) begin perf_scb_stalls <= '0; - perf_scb_fires <= '0; + perf_scb_fires <= '0; + perf_scb_any_fire_cycles <= '0; end else begin perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r); perf_scb_fires <= perf_scb_fires + `PERF_CTR_BITS'(perf_fires_per_cycle_r); + perf_scb_any_fire_cycles <= perf_scb_any_fire_cycles + `PERF_CTR_BITS'(perf_any_fire_per_cycle_r); end end @@ -257,4 +265,19 @@ module VX_scoreboard import VX_gpu_pkg::*; #( end +`ifdef PERF_ENABLE + wire [`ISSUE_WIDTH-1:0] ibuffer_valids; + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin + assign ibuffer_valids[i] = ibuffer_if[i].valid; + end + + always @(posedge clk) begin + if (reset) begin + perf_scb_empty <= '0; + end else begin + perf_scb_empty <= perf_scb_empty + `PERF_CTR_BITS'(~|ibuffer_valids); + end + end +`endif + endmodule diff --git a/hw/rtl/interfaces/VX_pipeline_perf_if.sv b/hw/rtl/interfaces/VX_pipeline_perf_if.sv index afdca4fa..874778b8 100644 --- a/hw/rtl/interfaces/VX_pipeline_perf_if.sv +++ b/hw/rtl/interfaces/VX_pipeline_perf_if.sv @@ -20,6 +20,7 @@ interface VX_pipeline_perf_if (); wire [`PERF_CTR_BITS-1:0] ibf_stalls; wire [`PERF_CTR_BITS-1:0] scb_stalls; wire [`PERF_CTR_BITS-1:0] scb_fires; + wire [`PERF_CTR_BITS-1:0] scb_any_fire_cycles; wire [`PERF_CTR_BITS-1:0] units_uses [`NUM_EX_UNITS]; wire [`PERF_CTR_BITS-1:0] sfu_uses [`NUM_SFU_UNITS]; wire [`PERF_CTR_BITS-1:0] dispatch_stalls [`NUM_EX_UNITS]; @@ -43,6 +44,7 @@ interface VX_pipeline_perf_if (); output ibf_stalls, output scb_stalls, output scb_fires, + output scb_any_fire_cycles, output units_uses, output sfu_uses, output dispatch_stalls, @@ -58,6 +60,7 @@ interface VX_pipeline_perf_if (); input ibf_stalls, input scb_stalls, input scb_fires, + input scb_any_fire_cycles, input units_uses, input sfu_uses, input dispatch_stalls, From 6c632200d544133c3b0cb1adfb907eafe75961ae Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 3 Apr 2024 15:29:51 -0700 Subject: [PATCH 9/9] Divide by per-breakdown cycle for avg stall cycles --- hw/rtl/core/VX_core.sv | 16 ++++++++-------- hw/rtl/core/VX_operands.sv | 3 +++ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index 45426053..0c9a7ac1 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -436,31 +436,31 @@ module VX_core import VX_gpu_pkg::*; #( pipeline_perf_if.scb_stalls, `ISSUE_WIDTH); $display("issue scoreboard: stalls by operand hazard: alu %d (%2.2f cycles per issue)", scrb_alu_per_core, - $itor(scrb_alu_per_core) / $itor(dispatch_fires_total)); + $itor(scrb_alu_per_core) / $itor(pipeline_perf_if.dispatch_fires[`EX_ALU])); $display("issue scoreboard: stalls by operand hazard: fpu %d (%2.2f cycles per issue)", scrb_fpu_per_core, - $itor(scrb_fpu_per_core) / $itor(dispatch_fires_total)); + $itor(scrb_fpu_per_core) / $itor(pipeline_perf_if.dispatch_fires[`EX_FPU])); $display("issue scoreboard: stalls by operand hazard: lsu %d (%2.2f cycles per issue)", scrb_lsu_per_core, - $itor(scrb_lsu_per_core) / $itor(dispatch_fires_total)); + $itor(scrb_lsu_per_core) / $itor(pipeline_perf_if.dispatch_fires[`EX_LSU])); $display("issue scoreboard: stalls by operand hazard: sfu %d (%2.2f cycles per issue)", scrb_sfu_per_core, - $itor(scrb_sfu_per_core) / $itor(dispatch_fires_total)); + $itor(scrb_sfu_per_core) / $itor(pipeline_perf_if.dispatch_fires[`EX_SFU])); $display("issue scoreboard: sfu stalls: %d (scrs=%f, wctl=%f)",pipeline_perf_if.units_uses[`EX_SFU], $itor(scrb_csrs_per_core) / $itor(sfu_tot) * 100.0, $itor(scrb_wctl_per_core) / $itor(sfu_tot) * 100.0); $display("issue dispatch: stalls by FU busy: alu %d (%2.2f cycles per issue)", pipeline_perf_if.dispatch_stalls[`EX_ALU], - $itor(pipeline_perf_if.dispatch_stalls[`EX_ALU]) / $itor(dispatch_fires_total)); + $itor(pipeline_perf_if.dispatch_stalls[`EX_ALU]) / $itor(pipeline_perf_if.dispatch_fires[`EX_ALU])); $display("issue dispatch: stalls by FU busy: fpu %d (%2.2f cycles per issue)", pipeline_perf_if.dispatch_stalls[`EX_FPU], - $itor(pipeline_perf_if.dispatch_stalls[`EX_FPU]) / $itor(dispatch_fires_total)); + $itor(pipeline_perf_if.dispatch_stalls[`EX_FPU]) / $itor(pipeline_perf_if.dispatch_fires[`EX_FPU])); $display("issue dispatch: stalls by FU busy: lsu %d (%2.2f cycles per issue)", pipeline_perf_if.dispatch_stalls[`EX_LSU], - $itor(pipeline_perf_if.dispatch_stalls[`EX_LSU]) / $itor(dispatch_fires_total)); + $itor(pipeline_perf_if.dispatch_stalls[`EX_LSU]) / $itor(pipeline_perf_if.dispatch_fires[`EX_LSU])); $display("issue dispatch: stalls by FU busy: sfu %d (%2.2f cycles per issue)", pipeline_perf_if.dispatch_stalls[`EX_SFU], - $itor(pipeline_perf_if.dispatch_stalls[`EX_SFU]) / $itor(dispatch_fires_total)); + $itor(pipeline_perf_if.dispatch_stalls[`EX_SFU]) / $itor(pipeline_perf_if.dispatch_fires[`EX_SFU])); $display("issue dispatch: fires: total %d", dispatch_fires_total); $display("issue dispatch: fires: alu %d", diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv index 3747502f..28f1edf1 100644 --- a/hw/rtl/core/VX_operands.sv +++ b/hw/rtl/core/VX_operands.sv @@ -197,6 +197,9 @@ module VX_operands import VX_gpu_pkg::*; #( assign stg_valid_in = scoreboard_if[i].valid && data_ready; assign scoreboard_if[i].ready = stg_ready_in && data_ready; + // NOTE(hansung): toggle_buffer is 1-reg pipe without flow, halving + // throughput. Wouldn't this cap overall IPC? Or OK as long as + // ISSUE_WIDTH > 1? VX_toggle_buffer #( .DATAW (DATAW) ) staging_buffer (