This commit is contained in:
Richard Yan
2024-04-09 19:55:11 -07:00
7 changed files with 299 additions and 113 deletions

View File

@@ -335,99 +335,152 @@ module VX_core import VX_gpu_pkg::*; #(
assign pipeline_perf_if.load_latency = perf_dcache_lat;
assign pipeline_perf_if.ifetch_latency = perf_icache_lat;
int instrs;
assign instrs = commit_csr_if.instret;
assign instrs = 32'(commit_csr_if.instret);
int cycles;
assign cycles = sched_csr_if.cycles;
assign cycles = 32'(sched_csr_if.cycles);
int icache_lat;
assign icache_lat = perf_icache_lat;
assign icache_lat = 32'(perf_icache_lat);
int ifetches;
assign ifetches = perf_ifetches;
assign ifetches = 32'(perf_ifetches);
int dcache_lat;
assign dcache_lat = perf_dcache_lat;
assign dcache_lat = 32'(perf_dcache_lat);
int loads;
assign loads = perf_loads;
int scheduler_idles;
assign scheduler_idles = pipeline_perf_if.sched_idles;
int scheduler_stalls;
assign scheduler_stalls = pipeline_perf_if.sched_stalls;
int scheduler_barrier_stalls;
assign scheduler_barrier_stalls = pipeline_perf_if.sched_barrier_stalls;
int ibuf_stalls;
assign ibuf_stalls = pipeline_perf_if.ibf_stalls;
assign loads = 32'(perf_loads);
int scrb_alu_per_core;
assign scrb_alu_per_core = pipeline_perf_if.units_uses[`EX_ALU];
assign scrb_alu_per_core = 32'(pipeline_perf_if.units_uses[`EX_ALU]);
int scrb_fpu_per_core;
assign scrb_fpu_per_core = pipeline_perf_if.units_uses[`EX_FPU];
assign scrb_fpu_per_core = 32'(pipeline_perf_if.units_uses[`EX_FPU]);
int scrb_lsu_per_core;
assign scrb_lsu_per_core = pipeline_perf_if.units_uses[`EX_LSU];
assign scrb_lsu_per_core = 32'(pipeline_perf_if.units_uses[`EX_LSU]);
int scrb_sfu_per_core;
assign scrb_sfu_per_core = pipeline_perf_if.units_uses[`EX_SFU];
assign scrb_sfu_per_core = 32'(pipeline_perf_if.units_uses[`EX_SFU]);
int scrb_tot;
assign scrb_tot = scrb_alu_per_core+scrb_fpu_per_core+scrb_lsu_per_core+scrb_sfu_per_core;
int scrb_wctl_per_core;
assign scrb_wctl_per_core = pipeline_perf_if.sfu_uses[`SFU_WCTL];
assign scrb_wctl_per_core = 32'(pipeline_perf_if.sfu_uses[`SFU_WCTL]);
int scrb_csrs_per_core;
assign scrb_csrs_per_core = pipeline_perf_if.sfu_uses[`SFU_CSRS];
assign scrb_csrs_per_core = 32'(pipeline_perf_if.sfu_uses[`SFU_CSRS]);
int sfu_tot;
assign sfu_tot = scrb_wctl_per_core+scrb_csrs_per_core;
always @(negedge busy) begin
if (!reset) begin
$display("====================CORE : %d===================",CORE_ID);
$display("time : %t", $time);
// $display("perf_dcache_rd_req_per_cycle: %d", perf_dcache_rd_req_per_cycle);
// $display("perf_dcache_wr_req_per_cycle: %d", perf_dcache_wr_req_per_cycle);
// $display("perf_dcache_rsp_per_cycle: %d", perf_dcache_rsp_per_cycle);
// $display("perf_icache_pending_read_cycle: %d", perf_icache_pending_read_cycle);
// $display("perf_dcache_pending_read_cycle: %d", perf_dcache_pending_read_cycle);
// $display("perf_icache_pending_reads: %d", perf_icache_pending_reads);
// $display("perf_dcache_pending_reads: %d", perf_dcache_pending_reads);
// $display("perf_icache_req_fire: %b", perf_icache_req_fire);
// $display("perf_icache_rsp_fire: %b", perf_icache_rsp_fire);
// $display("perf_dcache_rd_req_fire: %b", perf_dcache_rd_req_fire);
// $display("perf_dcache_rd_req_fire_r: %b", perf_dcache_rd_req_fire_r);
// $display("perf_dcache_wr_req_fire: %b", perf_dcache_wr_req_fire);
// $display("perf_dcache_wr_req_fire_r: %b", perf_dcache_wr_req_fire_r);
// $display("perf_dcache_rsp_fire: %b", perf_dcache_rsp_fire);
reg busy_prev;
reg [31:0] report_counter;
$display("Instructions: %d, Cycles: %d, IPC: %f", commit_csr_if.instret, sched_csr_if.cycles,
$itor(instrs) / $itor(cycles));
$display("scheduler idle: %d cycles (%f%%)", pipeline_perf_if.sched_idles,
$itor(scheduler_idles) / $itor(cycles) * 100.0);
$display("scheduler stalls: %d cycles (%f%%)", pipeline_perf_if.sched_stalls,
$itor(scheduler_stalls) / $itor(cycles) * 100.0);
$display("scheduler barrier stalls: %d count across NUM_WARPS=%d (%f%%)",
pipeline_perf_if.sched_barrier_stalls,
`NUM_WARPS,
$itor(scheduler_barrier_stalls) / $itor(cycles) * 100.0);
$display("ibuffer stalls: %d cycles (%f%%)",pipeline_perf_if.ibf_stalls,
$itor(ibuf_stalls) / $itor(cycles) * 100.0);
// see VX_scoreboard.sv
$display("issue stalls: %d (summed across ISSUE_WIDTH=%d)",
pipeline_perf_if.scb_stalls, `ISSUE_WIDTH);
$display("issue stalls: alu %d (%f%%)",
scrb_alu_per_core,
$itor(scrb_alu_per_core) / $itor(scrb_tot) * 100.0);
$display("issue stalls: fpu %d (%f%%)",
scrb_fpu_per_core,
$itor(scrb_fpu_per_core) / $itor(scrb_tot) * 100.0);
$display("issue stalls: lsu %d (%f%%)",
scrb_lsu_per_core,
$itor(scrb_lsu_per_core) / $itor(scrb_tot) * 100.0);
$display("issue stalls: sfu %d (%f%%)",
scrb_sfu_per_core,
$itor(scrb_sfu_per_core) / $itor(scrb_tot) * 100.0);
$display("sfu stalls: %d (scrs=%f, wctl=%f)",pipeline_perf_if.units_uses[`EX_SFU],
$itor(scrb_csrs_per_core) / $itor(sfu_tot) * 100.0,
$itor(scrb_wctl_per_core) / $itor(sfu_tot) * 100.0);
$display("ifetches: %d", perf_ifetches);
$display("ifetch latency: %f Cycles",
$itor(icache_lat) / $itor(ifetches));
$display("loads: %d", perf_loads);
$display("load latency: %f Cycles",
$itor(dcache_lat) / $itor(loads));
$display("stores: %d", perf_stores);
// Tracks the previous-cycle value of `busy` and runs a report counter that
// counts 0..10000 inclusive and then wraps back to zero.
always @(posedge clk) begin
    if (reset) begin
        busy_prev      <= 1'b0;
        report_counter <= 32'd0;
    end else begin
        busy_prev      <= busy;
        report_counter <= (report_counter == 32'd10000) ? 32'd0
                                                        : (report_counter + 32'd1);
    end
end
// Asserted when `busy` was high at the previous clock edge and is low now.
wire busy_negedge = busy_prev && !busy;
// Combinational total of pipeline_perf_if.dispatch_fires over all
// `NUM_EX_UNITS entries.
reg [`PERF_CTR_BITS-1:0] dispatch_fires_total;
always @(*) begin
    dispatch_fires_total = '0;
    for (integer ex_idx = 0; ex_idx < `NUM_EX_UNITS; ex_idx++) begin
        dispatch_fires_total += pipeline_perf_if.dispatch_fires[ex_idx];
    end
end
// Simulation-only performance report.
// Prints the counters exposed through pipeline_perf_if (plus the local
// instrs/cycles/latency helpers) whenever the core is out of reset and either
// `busy` has just fallen (busy_negedge) or report_counter is zero.
// NOTE(review): ratios are computed with plain real division; a zero
// denominator (e.g. no dispatch fires for a unit yet) is not guarded here.
always @(posedge clk) begin
    if (!reset && (busy_negedge || (report_counter == 32'd0))) begin
        $display("====================CORE : %d===================",CORE_ID);
        $display("time : %t", $time);
        // disabled as always zero
        // $display("perf_dcache_rd_req_per_cycle: %d", perf_dcache_rd_req_per_cycle);
        // $display("perf_dcache_wr_req_per_cycle: %d", perf_dcache_wr_req_per_cycle);
        // $display("perf_dcache_rsp_per_cycle: %d", perf_dcache_rsp_per_cycle);
        // $display("perf_icache_pending_read_cycle: %d", perf_icache_pending_read_cycle);
        // $display("perf_dcache_pending_read_cycle: %d", perf_dcache_pending_read_cycle);
        // $display("perf_icache_pending_reads: %d", perf_icache_pending_reads);
        // $display("perf_dcache_pending_reads: %d", perf_dcache_pending_reads);
        // $display("perf_icache_req_fire: %b", perf_icache_req_fire);
        // $display("perf_icache_rsp_fire: %b", perf_icache_rsp_fire);
        // $display("perf_dcache_rd_req_fire: %b", perf_dcache_rd_req_fire);
        // $display("perf_dcache_rd_req_fire_r: %b", perf_dcache_rd_req_fire_r);
        // $display("perf_dcache_wr_req_fire: %b", perf_dcache_wr_req_fire);
        // $display("perf_dcache_wr_req_fire_r: %b", perf_dcache_wr_req_fire_r);
        // $display("perf_dcache_rsp_fire: %b", perf_dcache_rsp_fire);

        // ---- overall throughput ----
        $display("Instructions: %d, Cycles: %d, IPC: %f", commit_csr_if.instret, sched_csr_if.cycles,
            $itor(instrs) / $itor(cycles));

        // ---- schedule / decode stage ----
        $display("scheduler idle: %d cycles (%.2f%%)", pipeline_perf_if.sched_idles,
            $itor(pipeline_perf_if.sched_idles) / $itor(cycles) * 100.0);
        $display("scheduler barrier idle: %d count across NUM_WARPS=%d",
            pipeline_perf_if.sched_barrier_idles, `NUM_WARPS);
        // sched_stalls can happen when the later issue stage stalls,
        // causing the ibuffer to clog.
        $display("scheduler stalls: %d cycles (%.2f%%)", pipeline_perf_if.sched_stalls,
            $itor(pipeline_perf_if.sched_stalls) / $itor(cycles) * 100.0);
        $display("decode stalls (ibuffer not ready): %d cycles (%.2f%%)",pipeline_perf_if.ibf_stalls,
            $itor(pipeline_perf_if.ibf_stalls) / $itor(cycles) * 100.0);

        // ---- issue stage: scoreboard ----
        // see VX_scoreboard.sv
        // scb_stalls: valid & ~ready (ready = stg_ready_in && operands_ready)
        // units_uses: valid & ~operands_ready
        //             this will be a subset of scb_stalls
        $display("issue scoreboard: fires total:\t%d across ISSUE_WIDTH=%d",
            pipeline_perf_if.scb_fires, `ISSUE_WIDTH);
        $display("issue scoreboard: cycles fired:\t%d (%.2f%%)",
            pipeline_perf_if.scb_any_fire_cycles,
            $itor(pipeline_perf_if.scb_any_fire_cycles) / $itor(cycles) * 100.0);
        $display("issue scoreboard: stalls total:\t%d across ISSUE_WIDTH=%d",
            pipeline_perf_if.scb_stalls, `ISSUE_WIDTH);
        $display("issue scoreboard: stalls by operand hazard: alu %d (%2.2f cycles per issue)",
            scrb_alu_per_core,
            $itor(scrb_alu_per_core) / $itor(pipeline_perf_if.dispatch_fires[`EX_ALU]));
        $display("issue scoreboard: stalls by operand hazard: fpu %d (%2.2f cycles per issue)",
            scrb_fpu_per_core,
            $itor(scrb_fpu_per_core) / $itor(pipeline_perf_if.dispatch_fires[`EX_FPU]));
        $display("issue scoreboard: stalls by operand hazard: lsu %d (%2.2f cycles per issue)",
            scrb_lsu_per_core,
            $itor(scrb_lsu_per_core) / $itor(pipeline_perf_if.dispatch_fires[`EX_LSU]));
        $display("issue scoreboard: stalls by operand hazard: sfu %d (%2.2f cycles per issue)",
            scrb_sfu_per_core,
            $itor(scrb_sfu_per_core) / $itor(pipeline_perf_if.dispatch_fires[`EX_SFU]));
        // Breakdown of the SFU figure into its CSRS and WCTL shares (percent of
        // sfu_tot). The label is "csrs" to match the `SFU_CSRS counter it prints.
        $display("issue scoreboard: sfu stalls: %d (csrs=%f, wctl=%f)",pipeline_perf_if.units_uses[`EX_SFU],
            $itor(scrb_csrs_per_core) / $itor(sfu_tot) * 100.0,
            $itor(scrb_wctl_per_core) / $itor(sfu_tot) * 100.0);

        // ---- issue stage: dispatch ----
        $display("issue dispatch: stalls by FU busy: alu %d (%2.2f cycles per issue)",
            pipeline_perf_if.dispatch_stalls[`EX_ALU],
            $itor(pipeline_perf_if.dispatch_stalls[`EX_ALU]) / $itor(pipeline_perf_if.dispatch_fires[`EX_ALU]));
        $display("issue dispatch: stalls by FU busy: fpu %d (%2.2f cycles per issue)",
            pipeline_perf_if.dispatch_stalls[`EX_FPU],
            $itor(pipeline_perf_if.dispatch_stalls[`EX_FPU]) / $itor(pipeline_perf_if.dispatch_fires[`EX_FPU]));
        $display("issue dispatch: stalls by FU busy: lsu %d (%2.2f cycles per issue)",
            pipeline_perf_if.dispatch_stalls[`EX_LSU],
            $itor(pipeline_perf_if.dispatch_stalls[`EX_LSU]) / $itor(pipeline_perf_if.dispatch_fires[`EX_LSU]));
        $display("issue dispatch: stalls by FU busy: sfu %d (%2.2f cycles per issue)",
            pipeline_perf_if.dispatch_stalls[`EX_SFU],
            $itor(pipeline_perf_if.dispatch_stalls[`EX_SFU]) / $itor(pipeline_perf_if.dispatch_fires[`EX_SFU]));
        $display("issue dispatch: fires: total %d",
            dispatch_fires_total);
        $display("issue dispatch: fires: alu %d",
            pipeline_perf_if.dispatch_fires[`EX_ALU]);
        $display("issue dispatch: fires: fpu %d",
            pipeline_perf_if.dispatch_fires[`EX_FPU]);
        $display("issue dispatch: fires: lsu %d",
            pipeline_perf_if.dispatch_fires[`EX_LSU]);
        $display("issue dispatch: fires: sfu %d",
            pipeline_perf_if.dispatch_fires[`EX_SFU]);
        $display("issue dispatch: cycles fired: %d (%.2f%%)",
            pipeline_perf_if.dispatch_any_fire_cycles,
            $itor(pipeline_perf_if.dispatch_any_fire_cycles) / $itor(cycles) * 100.0);

        // ---- memory ----
        $display("ifetches: %d", perf_ifetches);
        $display("ifetch latency: %f cycles",
            $itor(icache_lat) / $itor(ifetches));
        $display("dcache loads: %d", perf_loads);
        $display("dcache load latency: %f cycles",
            $itor(dcache_lat) / $itor(loads));
        $display("dcache stores: %d", perf_stores);
    end
end

View File

@@ -22,6 +22,9 @@ module VX_dispatch import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
output wire [`PERF_CTR_BITS-1:0] perf_stalls [`NUM_EX_UNITS],
output wire [`PERF_CTR_BITS-1:0] perf_valids [`NUM_EX_UNITS],
output wire [`PERF_CTR_BITS-1:0] perf_fires [`NUM_EX_UNITS],
output wire [`PERF_CTR_BITS-1:0] perf_any_fire_cycles,
`endif
// inputs
VX_operands_if.slave operands_if [`ISSUE_WIDTH],
@@ -176,43 +179,95 @@ module VX_dispatch import VX_gpu_pkg::*; #(
end
`ifdef PERF_ENABLE
wire [`NUM_EX_UNITS-1:0] perf_unit_stalls_per_cycle, perf_unit_stalls_per_cycle_r;
wire [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_stalls_per_cycle_r;
wire [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_valids_per_cycle_r;
wire [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_fires_per_cycle_r;
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_stalls_per_cycle;
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_valids_per_cycle;
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_unit_fires_per_cycle;
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_unit_stalls_per_cycle;
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_unit_valids_per_cycle;
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_unit_fires_per_cycle;
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_r;
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_valids_r;
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_fires_r;
reg [`PERF_CTR_BITS-1:0] perf_any_fire_cycles_r;
// Per issue slot: build one-hot (by ex_type) flags describing what the
// operands_if[i] handshake did this cycle.
//   stalls : valid && ~ready
//   valids : valid
//   fires  : valid && ready
// All three vectors default to zero and at most one bit (the slot's
// data.ex_type) is set in each.
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
always @(*) begin
perf_issue_unit_stalls_per_cycle[i] = '0;
perf_issue_unit_valids_per_cycle[i] = '0;
perf_issue_unit_fires_per_cycle[i] = '0;
// valid but not accepted
if (operands_if[i].valid && ~operands_if[i].ready) begin
perf_issue_unit_stalls_per_cycle[i][operands_if[i].data.ex_type] = 1;
end
// valid regardless of acceptance
if (operands_if[i].valid) begin
perf_issue_unit_valids_per_cycle[i][operands_if[i].data.ex_type] = 1;
end
// valid and accepted
if (operands_if[i].valid && operands_if[i].ready) begin
perf_issue_unit_fires_per_cycle[i][operands_if[i].data.ex_type] = 1;
end
end
end
VX_reduce #(
.DATAW_IN (`NUM_EX_UNITS),
.N (`ISSUE_WIDTH),
.OP ("|")
) reduce (
.data_in (perf_issue_unit_stalls_per_cycle),
.data_out (perf_unit_stalls_per_cycle)
);
// For every execution unit, add up the per-issue-slot stall / valid / fire
// flags so that each issue slot contributes one count per cycle.
for (genvar ex = 0; ex < `NUM_EX_UNITS; ++ex) begin
    always @(*) begin
        perf_unit_stalls_per_cycle[ex] = '0;
        perf_unit_valids_per_cycle[ex] = '0;
        perf_unit_fires_per_cycle[ex]  = '0;
        for (integer slot = 0; slot < `ISSUE_WIDTH; ++slot) begin
            perf_unit_stalls_per_cycle[ex] += perf_issue_unit_stalls_per_cycle[slot][ex];
            perf_unit_valids_per_cycle[ex] += perf_issue_unit_valids_per_cycle[slot][ex];
            perf_unit_fires_per_cycle[ex]  += perf_issue_unit_fires_per_cycle[slot][ex];
        end
    end
end
// VX_reduce #(
// .DATAW_IN (`NUM_EX_UNITS),
// .N (`ISSUE_WIDTH),
// .OP ("|")
// ) reduce (
// .data_in (perf_issue_unit_stalls_per_cycle),
// .data_out (perf_unit_stalls_per_cycle)
// );
`BUFFER(perf_unit_stalls_per_cycle_r, perf_unit_stalls_per_cycle);
`BUFFER(perf_unit_valids_per_cycle_r, perf_unit_valids_per_cycle);
`BUFFER(perf_unit_fires_per_cycle_r, perf_unit_fires_per_cycle);
// Set when at least one execution unit has a non-zero buffered fire count.
reg perf_any_fire_per_cycle;
always @(*) begin
    perf_any_fire_per_cycle = 1'b0;
    for (integer ex = 0; ex < `NUM_EX_UNITS; ++ex) begin
        if (|perf_unit_fires_per_cycle_r[ex]) begin
            perf_any_fire_per_cycle = 1'b1;
        end
    end
end
// Running per-unit totals of the buffered per-cycle stall / valid / fire
// counts. One always block is generated per execution unit, and each block
// drives only its own [i] slice.
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
    always @(posedge clk) begin
        if (reset) begin
            perf_stalls_r[i] <= '0;
            perf_valids_r[i] <= '0;
            perf_fires_r[i]  <= '0;
        end else begin
            perf_stalls_r[i] <= perf_stalls_r[i] + `PERF_CTR_BITS'(perf_unit_stalls_per_cycle_r[i]);
            perf_valids_r[i] <= perf_valids_r[i] + `PERF_CTR_BITS'(perf_unit_valids_per_cycle_r[i]);
            perf_fires_r[i]  <= perf_fires_r[i] + `PERF_CTR_BITS'(perf_unit_fires_per_cycle_r[i]);
        end
    end
end

// Number of cycles in which perf_any_fire_per_cycle was set.
// This counter is a single scalar, so it is kept outside the generate loop:
// inside the loop every generated always block would drive the same register
// (multiple drivers), which lint and synthesis tools reject.
always @(posedge clk) begin
    if (reset) begin
        perf_any_fire_cycles_r <= '0;
    end else begin
        perf_any_fire_cycles_r <= perf_any_fire_cycles_r + `PERF_CTR_BITS'(perf_any_fire_per_cycle);
    end
end
// Drive the unpacked perf_* output ports element by element from the packed
// accumulator registers.
for (genvar i=0; i < `NUM_EX_UNITS; ++i) begin
assign perf_stalls[i] = perf_stalls_r[i];
assign perf_valids[i] = perf_valids_r[i];
assign perf_fires[i] = perf_fires_r[i];
end
// Scalar any-fire cycle counter output.
assign perf_any_fire_cycles = perf_any_fire_cycles_r;
`endif
`ifdef DBG_TRACE_CORE_PIPELINE_VCS

View File

@@ -61,6 +61,8 @@ module VX_issue #(
.reset (scoreboard_reset),
`ifdef PERF_ENABLE
.perf_scb_stalls(perf_issue_if.scb_stalls),
.perf_scb_fires (perf_issue_if.scb_fires),
.perf_scb_any_fire_cycles (perf_issue_if.scb_any_fire_cycles),
.perf_units_uses(perf_issue_if.units_uses),
.perf_sfu_uses (perf_issue_if.sfu_uses),
`endif
@@ -86,6 +88,10 @@ module VX_issue #(
.reset (dispatch_reset),
`ifdef PERF_ENABLE
`UNUSED_PIN (perf_stalls),
.perf_stalls (perf_issue_if.dispatch_stalls),
.perf_valids (perf_issue_if.dispatch_valids),
.perf_fires (perf_issue_if.dispatch_fires),
.perf_any_fire_cycles (perf_issue_if.dispatch_any_fire_cycles),
`endif
.operands_if (operands_if),
.alu_dispatch_if(alu_dispatch_if),

View File

@@ -197,6 +197,9 @@ module VX_operands import VX_gpu_pkg::*; #(
assign stg_valid_in = scoreboard_if[i].valid && data_ready;
assign scoreboard_if[i].ready = stg_ready_in && data_ready;
// NOTE(hansung): toggle_buffer is 1-reg pipe without flow, halving
// throughput. Wouldn't this cap overall IPC? Or OK as long as
// ISSUE_WIDTH > 1?
VX_toggle_buffer #(
.DATAW (DATAW)
) staging_buffer (

View File

@@ -413,28 +413,28 @@ module VX_schedule import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_sched_idles;
reg [`PERF_CTR_BITS-1:0] perf_sched_stalls;
reg [`PERF_CTR_BITS-1:0] perf_sched_barrier_stalls;
reg [`PERF_CTR_BITS-1:0] perf_sched_barrier_idles;
wire schedule_idle = ~schedule_valid;
wire schedule_stall = schedule_if.valid && ~schedule_if.ready;
wire [`CLOG2(`NUM_WARPS+1)-1:0] schedule_barrier_stall;
`POP_COUNT(schedule_barrier_stall, barrier_stalls);
wire [`CLOG2(`NUM_WARPS+1)-1:0] schedule_barrier_idle;
`POP_COUNT(schedule_barrier_idle, barrier_stalls);
always @(posedge clk) begin
if (reset) begin
perf_sched_idles <= '0;
perf_sched_barrier_idles <= '0;
perf_sched_stalls <= '0;
perf_sched_barrier_stalls <= '0;
end else begin
perf_sched_idles <= perf_sched_idles + `PERF_CTR_BITS'(schedule_idle);
perf_sched_barrier_idles <= perf_sched_barrier_idles + `PERF_CTR_BITS'(schedule_barrier_idle);
perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(schedule_stall);
perf_sched_barrier_stalls <= perf_sched_barrier_stalls + `PERF_CTR_BITS'(schedule_barrier_stall);
end
end
assign perf_schedule_if.sched_idles = perf_sched_idles;
assign perf_schedule_if.sched_barrier_idles = perf_sched_barrier_idles;
assign perf_schedule_if.sched_stalls = perf_sched_stalls;
assign perf_schedule_if.sched_barrier_stalls = perf_sched_barrier_stalls;
`endif
endmodule

View File

@@ -21,6 +21,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls,
output reg [`PERF_CTR_BITS-1:0] perf_scb_fires,
output reg [`PERF_CTR_BITS-1:0] perf_scb_any_fire_cycles,
output reg [`PERF_CTR_BITS-1:0] perf_units_uses [`NUM_EX_UNITS],
output reg [`PERF_CTR_BITS-1:0] perf_sfu_uses [`NUM_SFU_UNITS],
`endif
@@ -34,46 +36,79 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_units_per_cycle;
wire [`NUM_EX_UNITS-1:0] perf_units_per_cycle, perf_units_per_cycle_r;
wire [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_units_per_cycle_r;
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_units_per_cycle;
reg [`ISSUE_WIDTH-1:0][`NUM_SFU_UNITS-1:0] perf_issue_sfu_per_cycle;
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
wire [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_sfu_per_cycle_r;
reg [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_sfu_per_cycle;
wire [`ISSUE_WIDTH-1:0] perf_issue_stalls_per_cycle;
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle, perf_stalls_per_cycle_r;
wire [`ISSUE_WIDTH-1:0] perf_issue_fires_per_cycle;
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_fires_per_cycle, perf_fires_per_cycle_r;
wire perf_any_fire_per_cycle, perf_any_fire_per_cycle_r;
reg [`PERF_CTR_BITS-1:0] perf_scb_empty;
`POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle);
`POP_COUNT(perf_fires_per_cycle, perf_issue_fires_per_cycle);
assign perf_any_fire_per_cycle = |perf_issue_fires_per_cycle;
// Per execution unit: number of issue slots whose perf_issue_units_per_cycle
// bit for that unit is set this cycle.
for (genvar ex = 0; ex < `NUM_EX_UNITS; ++ex) begin
    always @(*) begin
        perf_units_per_cycle[ex] = '0;
        for (integer slot = 0; slot < `ISSUE_WIDTH; ++slot) begin
            perf_units_per_cycle[ex] += perf_issue_units_per_cycle[slot][ex];
        end
    end
end
// Per SFU sub-unit: same summation over perf_issue_sfu_per_cycle.
for (genvar sfu = 0; sfu < `NUM_SFU_UNITS; ++sfu) begin
    always @(*) begin
        perf_sfu_per_cycle[sfu] = '0;
        for (integer slot = 0; slot < `ISSUE_WIDTH; ++slot) begin
            perf_sfu_per_cycle[sfu] += perf_issue_sfu_per_cycle[slot][sfu];
        end
    end
end
// NOTE(hansung): With the OR-reduce below, a unit was counted only once per
// cycle even when multiple warps were using the same execution unit type in
// that cycle, which could result in an overall undercount. The per-unit sums
// above add one count per issue slot instead.
VX_reduce #(
.DATAW_IN (`NUM_EX_UNITS),
.N (`ISSUE_WIDTH),
.OP ("|")
) perf_units_reduce (
.data_in (perf_issue_units_per_cycle),
.data_out (perf_units_per_cycle)
);
// VX_reduce #(
// .DATAW_IN (`NUM_EX_UNITS),
// .N (`ISSUE_WIDTH),
// .OP ("|")
// ) perf_units_reduce (
// .data_in (perf_issue_units_per_cycle),
// .data_out (perf_units_per_cycle)
// );
VX_reduce #(
.DATAW_IN (`NUM_SFU_UNITS),
.N (`ISSUE_WIDTH),
.OP ("|")
) perf_sfu_reduce (
.data_in (perf_issue_sfu_per_cycle),
.data_out (perf_sfu_per_cycle)
);
// VX_reduce #(
// .DATAW_IN (`NUM_SFU_UNITS),
// .N (`ISSUE_WIDTH),
// .OP ("|")
// ) perf_sfu_reduce (
// .data_in (perf_issue_sfu_per_cycle),
// .data_out (perf_sfu_per_cycle)
// );
`BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle);
`BUFFER(perf_fires_per_cycle_r, perf_fires_per_cycle);
`BUFFER(perf_any_fire_per_cycle_r, perf_any_fire_per_cycle);
`BUFFER(perf_units_per_cycle_r, perf_units_per_cycle);
`BUFFER(perf_sfu_per_cycle_r, perf_sfu_per_cycle);
// Scoreboard running totals, cleared on reset:
//   perf_scb_stalls          += buffered count of stalled issue slots
//   perf_scb_fires           += buffered count of fired issue slots
//   perf_scb_any_fire_cycles += 1 when any issue slot fired
// The *_r inputs come from the `BUFFER macro applied to the per-cycle signals
// (presumably a one-cycle register stage — confirm against the macro).
always @(posedge clk) begin
if (reset) begin
perf_scb_stalls <= '0;
perf_scb_fires <= '0;
perf_scb_any_fire_cycles <= '0;
end else begin
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r);
perf_scb_fires <= perf_scb_fires + `PERF_CTR_BITS'(perf_fires_per_cycle_r);
perf_scb_any_fire_cycles <= perf_scb_any_fire_cycles + `PERF_CTR_BITS'(perf_any_fire_per_cycle_r);
end
end
@@ -153,6 +188,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
end
assign perf_issue_stalls_per_cycle[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
assign perf_issue_fires_per_cycle[i] = ibuffer_if[i].valid && ibuffer_if[i].ready;
`endif
// NOTE(hansung): why is inuse_rd checked? to prevent WAW?
@@ -229,4 +265,19 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
`ifdef PERF_ENABLE
// Counts cycles in which none of the ibuffer_if[i].valid signals is asserted.
// NOTE(review): perf_scb_empty is not driven onto any port in the code visible
// here — confirm it is read elsewhere (e.g. via waveform or hierarchy probe).
wire [`ISSUE_WIDTH-1:0] ibuffer_valids;
// Collect the per-slot valid bits into one vector so they can be reduced.
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign ibuffer_valids[i] = ibuffer_if[i].valid;
end
always @(posedge clk) begin
if (reset) begin
perf_scb_empty <= '0;
end else begin
// ~| is a reduction NOR: 1 only when every valid bit is 0.
perf_scb_empty <= perf_scb_empty + `PERF_CTR_BITS'(~|ibuffer_valids);
end
end
`endif
endmodule

View File

@@ -16,11 +16,17 @@
interface VX_pipeline_perf_if ();
wire [`PERF_CTR_BITS-1:0] sched_idles;
wire [`PERF_CTR_BITS-1:0] sched_stalls;
wire [`PERF_CTR_BITS-1:0] sched_barrier_stalls;
wire [`PERF_CTR_BITS-1:0] sched_barrier_idles;
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
wire [`PERF_CTR_BITS-1:0] scb_stalls;
wire [`PERF_CTR_BITS-1:0] scb_fires;
wire [`PERF_CTR_BITS-1:0] scb_any_fire_cycles;
wire [`PERF_CTR_BITS-1:0] units_uses [`NUM_EX_UNITS];
wire [`PERF_CTR_BITS-1:0] sfu_uses [`NUM_SFU_UNITS];
wire [`PERF_CTR_BITS-1:0] dispatch_stalls [`NUM_EX_UNITS];
wire [`PERF_CTR_BITS-1:0] dispatch_valids [`NUM_EX_UNITS];
wire [`PERF_CTR_BITS-1:0] dispatch_fires [`NUM_EX_UNITS];
wire [`PERF_CTR_BITS-1:0] dispatch_any_fire_cycles;
wire [`PERF_CTR_BITS-1:0] ifetches;
wire [`PERF_CTR_BITS-1:0] loads;
@@ -30,25 +36,37 @@ interface VX_pipeline_perf_if ();
modport schedule (
output sched_idles,
output sched_barrier_stalls,
output sched_barrier_idles,
output sched_stalls
);
modport issue (
output ibf_stalls,
output scb_stalls,
output scb_fires,
output scb_any_fire_cycles,
output units_uses,
output sfu_uses
output sfu_uses,
output dispatch_stalls,
output dispatch_valids,
output dispatch_fires,
output dispatch_any_fire_cycles
);
modport slave (
input sched_idles,
input sched_barrier_stalls,
input sched_barrier_idles,
input sched_stalls,
input ibf_stalls,
input scb_stalls,
input scb_fires,
input scb_any_fire_cycles,
input units_uses,
input sfu_uses,
input dispatch_stalls,
input dispatch_valids,
input dispatch_fires,
input dispatch_any_fire_cycles,
input ifetches,
input loads,
input stores,