profiling optimizations
minor updates
This commit is contained in:
@@ -436,7 +436,7 @@
|
|||||||
|
|
||||||
// Number of Banks
|
// Number of Banks
|
||||||
`ifndef DCACHE_NUM_BANKS
|
`ifndef DCACHE_NUM_BANKS
|
||||||
`define DCACHE_NUM_BANKS (`NUM_LSU_LANES)
|
`define DCACHE_NUM_BANKS `MIN(`NUM_LSU_LANES, 4)
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
// Core Response Queue Size
|
// Core Response Queue Size
|
||||||
|
|||||||
@@ -174,30 +174,38 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
|||||||
|| (sfu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU));
|
|| (sfu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU));
|
||||||
end
|
end
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_n, perf_stalls_r;
|
wire [`NUM_EX_UNITS-1:0] perf_unit_stalls_per_cycle, perf_unit_stalls_per_cycle_r;
|
||||||
wire [`ISSUE_WIDTH-1:0] operands_stall;
|
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_unit_stalls_per_cycle;
|
||||||
wire [`ISSUE_WIDTH-1:0][`EX_BITS-1:0] operands_ex_type;
|
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_r;
|
||||||
|
|
||||||
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
|
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
|
||||||
assign operands_stall[i] = operands_if[i].valid && ~operands_if[i].ready;
|
always @(*) begin
|
||||||
assign operands_ex_type[i] = operands_if[i].data.ex_type;
|
perf_issue_unit_stalls_per_cycle[i] = '0;
|
||||||
end
|
if (operands_if[i].valid && ~operands_if[i].ready) begin
|
||||||
|
perf_issue_unit_stalls_per_cycle[i][operands_if[i].data.ex_type] = 1;
|
||||||
always @(*) begin
|
|
||||||
perf_stalls_n = perf_stalls_r;
|
|
||||||
for (integer i=0; i < `ISSUE_WIDTH; ++i) begin
|
|
||||||
if (operands_stall[i]) begin
|
|
||||||
perf_stalls_n[operands_ex_type[i]] += `PERF_CTR_BITS'(1);
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
always @(posedge clk) begin
|
VX_reduce #(
|
||||||
if (reset) begin
|
.DATAW_IN (`NUM_EX_UNITS),
|
||||||
perf_stalls_r <= '0;
|
.N (`ISSUE_WIDTH),
|
||||||
end else begin
|
.OP ("|")
|
||||||
perf_stalls_r <= perf_stalls_n;
|
) reduce (
|
||||||
|
.data_in (perf_issue_unit_stalls_per_cycle),
|
||||||
|
.data_out (perf_unit_stalls_per_cycle)
|
||||||
|
);
|
||||||
|
|
||||||
|
`BUFFER(perf_unit_stalls_per_cycle_r, perf_unit_stalls_per_cycle);
|
||||||
|
|
||||||
|
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
|
||||||
|
always @(posedge clk) begin
|
||||||
|
if (reset) begin
|
||||||
|
perf_stalls_r[i] <= '0;
|
||||||
|
end else begin
|
||||||
|
perf_stalls_r[i] <= perf_stalls_r[i] + `PERF_CTR_BITS'(perf_unit_stalls_per_cycle_r[i]);
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|||||||
@@ -32,19 +32,20 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1;
|
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1;
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
wire [`NUM_EX_UNITS-1:0] scoreboard_uses_per_cycle;
|
wire [`NUM_EX_UNITS-1:0] perf_uses_per_cycle;
|
||||||
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_stalls_per_cycle;
|
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle;
|
||||||
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] scoreboard_uses;
|
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_uses_per_cycle;
|
||||||
wire [`ISSUE_WIDTH-1:0] scoreboard_stalls;
|
wire [`ISSUE_WIDTH-1:0] perf_issue_stalls_per_cycle;
|
||||||
|
|
||||||
|
`POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle);
|
||||||
|
|
||||||
`POP_COUNT(scoreboard_stalls_per_cycle, scoreboard_stalls);
|
|
||||||
VX_reduce #(
|
VX_reduce #(
|
||||||
.DATAW_IN (`NUM_EX_UNITS),
|
.DATAW_IN (`NUM_EX_UNITS),
|
||||||
.N (`ISSUE_WIDTH),
|
.N (`ISSUE_WIDTH),
|
||||||
.OP ("|")
|
.OP ("|")
|
||||||
) reduce (
|
) reduce (
|
||||||
.data_in (scoreboard_uses),
|
.data_in (perf_issue_uses_per_cycle),
|
||||||
.data_out (scoreboard_uses_per_cycle)
|
.data_out (perf_uses_per_cycle)
|
||||||
);
|
);
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
@@ -62,23 +63,23 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_BITS-1:0] inuse_units;
|
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_BITS-1:0] inuse_units;
|
||||||
always @(*) begin
|
always @(*) begin
|
||||||
scoreboard_uses[i] = '0;
|
perf_issue_uses_per_cycle[i] = '0;
|
||||||
if (ibuffer_if[i].valid) begin
|
if (ibuffer_if[i].valid) begin
|
||||||
if (inuse_rd) begin
|
if (inuse_rd) begin
|
||||||
scoreboard_uses[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
|
perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
|
||||||
end
|
end
|
||||||
if (inuse_rs1) begin
|
if (inuse_rs1) begin
|
||||||
scoreboard_uses[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
|
perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
|
||||||
end
|
end
|
||||||
if (inuse_rs2) begin
|
if (inuse_rs2) begin
|
||||||
scoreboard_uses[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
|
perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
|
||||||
end
|
end
|
||||||
if (inuse_rs3) begin
|
if (inuse_rs3) begin
|
||||||
scoreboard_uses[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
|
perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
assign scoreboard_stalls[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
|
assign perf_issue_stalls_per_cycle[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
reg [DATAW-1:0] data_out_r;
|
reg [DATAW-1:0] data_out_r;
|
||||||
@@ -164,19 +165,26 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
end
|
end
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
|
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle_r;
|
||||||
|
wire [`NUM_EX_UNITS-1:0] perf_uses_per_cycle_r;
|
||||||
|
|
||||||
|
`BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle);
|
||||||
|
`BUFFER(perf_uses_per_cycle_r, perf_uses_per_cycle);
|
||||||
|
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
perf_scb_stalls <= '0;
|
perf_scb_stalls <= '0;
|
||||||
end else begin
|
end else begin
|
||||||
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(scoreboard_stalls_per_cycle);
|
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r);
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
|
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
perf_scb_uses[i] <= '0;
|
perf_scb_uses[i] <= '0;
|
||||||
end else begin
|
end else begin
|
||||||
perf_scb_uses[i] <= perf_scb_uses[i] + `PERF_CTR_BITS'(scoreboard_uses_per_cycle[i]);
|
perf_scb_uses[i] <= perf_scb_uses[i] + `PERF_CTR_BITS'(perf_uses_per_cycle_r[i]);
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -183,21 +183,23 @@ module VX_stream_xbar #(
|
|||||||
per_cycle_collision = 0;
|
per_cycle_collision = 0;
|
||||||
for (integer i = 0; i < NUM_INPUTS; ++i) begin
|
for (integer i = 0; i < NUM_INPUTS; ++i) begin
|
||||||
for (integer j = 1; j < (NUM_INPUTS-i); ++j) begin
|
for (integer j = 1; j < (NUM_INPUTS-i); ++j) begin
|
||||||
if (valid_in[i] && valid_in[j+i] && sel_in[i] == sel_in[j+i]) begin
|
per_cycle_collision[i] |= valid_in[i]
|
||||||
per_cycle_collision[i] |= ready_in[i] | ready_in[j+i];
|
&& valid_in[j+i]
|
||||||
end
|
&& (sel_in[i] == sel_in[j+i])
|
||||||
|
&& (ready_in[i] | ready_in[j+i]);
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
wire [`CLOG2(NUM_INPUTS+1)-1:0] collision_count;
|
wire [`CLOG2(NUM_INPUTS+1)-1:0] collision_count, collision_count_r;
|
||||||
`POP_COUNT(collision_count, per_cycle_collision);
|
`POP_COUNT(collision_count, per_cycle_collision);
|
||||||
|
`BUFFER(collision_count_r, collision_count);
|
||||||
|
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
collisions_r <= '0;
|
collisions_r <= '0;
|
||||||
end else begin
|
end else begin
|
||||||
collisions_r <= collisions_r + PERF_CTR_BITS'(collision_count);
|
collisions_r <= collisions_r + PERF_CTR_BITS'(collision_count_r);
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|||||||
@@ -245,14 +245,19 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
|
|||||||
reg [`PERF_CTR_BITS-1:0] perf_writes;
|
reg [`PERF_CTR_BITS-1:0] perf_writes;
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
|
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
|
||||||
|
|
||||||
|
wire [`CLOG2(NUM_REQS+1)-1:0] perf_reads_per_cycle_r;
|
||||||
|
wire [`CLOG2(NUM_REQS+1)-1:0] perf_writes_per_cycle_r;
|
||||||
|
`BUFFER(perf_reads_per_cycle_r, perf_reads_per_cycle);
|
||||||
|
`BUFFER(perf_writes_per_cycle_r, perf_writes_per_cycle);
|
||||||
|
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
perf_reads <= '0;
|
perf_reads <= '0;
|
||||||
perf_writes <= '0;
|
perf_writes <= '0;
|
||||||
perf_crsp_stalls <= '0;
|
perf_crsp_stalls <= '0;
|
||||||
end else begin
|
end else begin
|
||||||
perf_reads <= perf_reads + `PERF_CTR_BITS'(perf_reads_per_cycle);
|
perf_reads <= perf_reads + `PERF_CTR_BITS'(perf_reads_per_cycle_r);
|
||||||
perf_writes <= perf_writes + `PERF_CTR_BITS'(perf_writes_per_cycle);
|
perf_writes <= perf_writes + `PERF_CTR_BITS'(perf_writes_per_cycle_r);
|
||||||
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
|
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
Reference in New Issue
Block a user