diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index 4ffd392f..37db6123 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -415,13 +415,13 @@ module VX_core import VX_gpu_pkg::*; #( $itor(instrs) / $itor(cycles)); $display("scheduler idle: %d cycles (%.2f%%)", pipeline_perf_if.sched_idles, $itor(pipeline_perf_if.sched_idles) / $itor(cycles) * 100.0); + $display("scheduler barrier idle: %d count across NUM_WARPS=%d", + pipeline_perf_if.sched_barrier_idles, `NUM_WARPS); + // sched_stalls can happen when the later issue stage stalls, + // causing the ibuffer to clog. $display("scheduler stalls: %d cycles (%.2f%%)", pipeline_perf_if.sched_stalls, $itor(pipeline_perf_if.sched_stalls) / $itor(cycles) * 100.0); - $display("scheduler barrier stalls: %d count across NUM_WARPS=%d (%.2f%%)", - pipeline_perf_if.sched_barrier_stalls, - `NUM_WARPS, - $itor(pipeline_perf_if.sched_barrier_stalls) / $itor(cycles) * 100.0); - $display("decode stalls: %d cycles (%.2f%%)",pipeline_perf_if.ibf_stalls, + $display("decode stalls (ibuffer not ready): %d cycles (%.2f%%)",pipeline_perf_if.ibf_stalls, $itor(pipeline_perf_if.ibf_stalls) / $itor(cycles) * 100.0); // see VX_scoreboard.sv // scb_stalls: valid & ~ready (ready = stg_ready_in && operands_ready) @@ -472,12 +472,12 @@ module VX_core import VX_gpu_pkg::*; #( pipeline_perf_if.dispatch_any_fire_cycles, $itor(pipeline_perf_if.dispatch_any_fire_cycles) / $itor(cycles) * 100.0); $display("ifetches: %d", perf_ifetches); - $display("ifetch latency: %f Cycles", + $display("ifetch latency: %f cycles", $itor(icache_lat) / $itor(ifetches)); - $display("loads: %d", perf_loads); - $display("load latency: %f Cycles", + $display("dcache loads: %d", perf_loads); + $display("dcache load latency: %f cycles", $itor(dcache_lat) / $itor(loads)); - $display("stores: %d", perf_stores); + $display("dcache stores: %d", perf_stores); end end diff --git a/hw/rtl/core/VX_schedule.sv b/hw/rtl/core/VX_schedule.sv index c890a2f6..8c165261 100644 --- a/hw/rtl/core/VX_schedule.sv +++ b/hw/rtl/core/VX_schedule.sv @@ -413,28 +413,28 @@ module VX_schedule import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE reg [`PERF_CTR_BITS-1:0] perf_sched_idles; reg [`PERF_CTR_BITS-1:0] perf_sched_stalls; - reg [`PERF_CTR_BITS-1:0] perf_sched_barrier_stalls; + reg [`PERF_CTR_BITS-1:0] perf_sched_barrier_idles; wire schedule_idle = ~schedule_valid; wire schedule_stall = schedule_if.valid && ~schedule_if.ready; - wire [`CLOG2(`NUM_WARPS+1)-1:0] schedule_barrier_stall; - `POP_COUNT(schedule_barrier_stall, barrier_stalls); + wire [`CLOG2(`NUM_WARPS+1)-1:0] schedule_barrier_idle; + `POP_COUNT(schedule_barrier_idle, barrier_stalls); always @(posedge clk) begin if (reset) begin perf_sched_idles <= '0; + perf_sched_barrier_idles <= '0; perf_sched_stalls <= '0; - perf_sched_barrier_stalls <= '0; end else begin perf_sched_idles <= perf_sched_idles + `PERF_CTR_BITS'(schedule_idle); + perf_sched_barrier_idles <= perf_sched_barrier_idles + `PERF_CTR_BITS'(schedule_barrier_idle); perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(schedule_stall); - perf_sched_barrier_stalls <= perf_sched_barrier_stalls + `PERF_CTR_BITS'(schedule_barrier_stall); end end assign perf_schedule_if.sched_idles = perf_sched_idles; + assign perf_schedule_if.sched_barrier_idles = perf_sched_barrier_idles; assign perf_schedule_if.sched_stalls = perf_sched_stalls; - assign perf_schedule_if.sched_barrier_stalls = perf_sched_barrier_stalls; `endif endmodule diff --git a/hw/rtl/interfaces/VX_pipeline_perf_if.sv b/hw/rtl/interfaces/VX_pipeline_perf_if.sv index 29b2903a..afdca4fa 100644 --- a/hw/rtl/interfaces/VX_pipeline_perf_if.sv +++ b/hw/rtl/interfaces/VX_pipeline_perf_if.sv @@ -16,9 +16,10 @@ interface VX_pipeline_perf_if (); wire [`PERF_CTR_BITS-1:0] sched_idles; wire [`PERF_CTR_BITS-1:0] sched_stalls; - wire [`PERF_CTR_BITS-1:0] sched_barrier_stalls; + wire [`PERF_CTR_BITS-1:0] sched_barrier_idles; wire [`PERF_CTR_BITS-1:0] ibf_stalls; wire [`PERF_CTR_BITS-1:0] scb_stalls; + wire [`PERF_CTR_BITS-1:0] scb_fires; wire [`PERF_CTR_BITS-1:0] units_uses [`NUM_EX_UNITS]; wire [`PERF_CTR_BITS-1:0] sfu_uses [`NUM_SFU_UNITS]; wire [`PERF_CTR_BITS-1:0] dispatch_stalls [`NUM_EX_UNITS]; @@ -34,13 +35,14 @@ interface VX_pipeline_perf_if (); modport schedule ( output sched_idles, - output sched_barrier_stalls, + output sched_barrier_idles, output sched_stalls ); modport issue ( output ibf_stalls, output scb_stalls, + output scb_fires, output units_uses, output sfu_uses, output dispatch_stalls, @@ -51,10 +53,11 @@ interface VX_pipeline_perf_if (); modport slave ( input sched_idles, - input sched_barrier_stalls, + input sched_barrier_idles, input sched_stalls, input ibf_stalls, input scb_stalls, + input scb_fires, input units_uses, input sfu_uses, input dispatch_stalls,