scoreboard optimization & profiling

This commit is contained in:
Blaise Tine
2023-11-27 05:53:36 -08:00
parent 4b68235389
commit 24973ffca0
8 changed files with 206 additions and 112 deletions

View File

@@ -100,15 +100,13 @@ module VX_commit import VX_gpu_pkg::*; #(
// CSRs update // CSRs update
wire [`ISSUE_WIDTH-1:0][COMMIT_SIZEW-1:0] commit_size, commit_size_r; wire [`ISSUE_WIDTH-1:0][COMMIT_SIZEW-1:0] commit_size, commit_size_r;
wire [COMMIT_ALL_SIZEW-1:0] commit_size_all, commit_size_all_r; wire [COMMIT_ALL_SIZEW-1:0] commit_size_all_r, commit_size_all_rr;
wire commit_fire_any, commit_fire_any_r, commit_fire_any_rr; wire commit_fire_any, commit_fire_any_r, commit_fire_any_rr;
assign commit_fire_any = (| commit_fire); assign commit_fire_any = (| commit_fire);
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
wire [COMMIT_SIZEW-1:0] pop_count; `POP_COUNT(commit_size[i], commit_tmask[i]);
`POP_COUNT(pop_count, commit_tmask[i]);
assign commit_size[i] = pop_count;
end end
VX_pipe_register #( VX_pipe_register #(
@@ -129,7 +127,7 @@ module VX_commit import VX_gpu_pkg::*; #(
.OP ("+") .OP ("+")
) commit_size_reduce ( ) commit_size_reduce (
.data_in (commit_size_r), .data_in (commit_size_r),
.data_out (commit_size_all) .data_out (commit_size_all_r)
); );
VX_pipe_register #( VX_pipe_register #(
@@ -139,26 +137,26 @@ module VX_commit import VX_gpu_pkg::*; #(
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.enable (1'b1), .enable (1'b1),
.data_in ({commit_fire_any_r, commit_size_all}), .data_in ({commit_fire_any_r, commit_size_all_r}),
.data_out ({commit_fire_any_rr, commit_size_all_r}) .data_out ({commit_fire_any_rr, commit_size_all_rr})
); );
reg [`PERF_CTR_BITS-1:0] instret; reg [`PERF_CTR_BITS-1:0] instret;
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
instret <= '0; instret <= '0;
end else begin end else begin
if (commit_fire_any_rr) begin if (commit_fire_any_rr) begin
instret <= instret + `PERF_CTR_BITS'(commit_size_all_r); instret <= instret + `PERF_CTR_BITS'(commit_size_all_rr);
end end
end end
end end
assign commit_csr_if.instret = instret; assign commit_csr_if.instret = instret;
// Committed instructions // Committed instructions
wire [`ISSUE_WIDTH-1:0] committed = commit_fire & commit_eop;
VX_pipe_register #( VX_pipe_register #(
.DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)), .DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
.RESETW (`ISSUE_WIDTH) .RESETW (`ISSUE_WIDTH)
@@ -166,7 +164,7 @@ module VX_commit import VX_gpu_pkg::*; #(
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.enable (1'b1), .enable (1'b1),
.data_in ({(commit_fire & commit_eop), commit_wid}), .data_in ({committed, commit_wid}),
.data_out ({commit_sched_if.committed, commit_sched_if.committed_wid}) .data_out ({commit_sched_if.committed, commit_sched_if.committed_wid})
); );
@@ -182,7 +180,7 @@ module VX_commit import VX_gpu_pkg::*; #(
assign writeback_if[i].data.data = commit_if[i].data.data; assign writeback_if[i].data.data = commit_if[i].data.data;
assign writeback_if[i].data.sop = commit_if[i].data.sop; assign writeback_if[i].data.sop = commit_if[i].data.sop;
assign writeback_if[i].data.eop = commit_if[i].data.eop; assign writeback_if[i].data.eop = commit_if[i].data.eop;
assign commit_if[i].ready = 1'b1; assign commit_if[i].ready = 1'b1; // writeback has no backpressure
end end
// simulation helper signal to get RISC-V tests Pass/Fail status // simulation helper signal to get RISC-V tests Pass/Fail status

View File

@@ -118,6 +118,10 @@ module VX_core import VX_gpu_pkg::*; #(
.clk (clk), .clk (clk),
.reset (schedule_reset), .reset (schedule_reset),
`ifdef PERF_ENABLE
.perf_schedule_if (pipeline_perf_if.schedule),
`endif
.base_dcrs (base_dcrs), .base_dcrs (base_dcrs),
.warp_ctl_if (warp_ctl_if), .warp_ctl_if (warp_ctl_if),

View File

@@ -179,14 +179,18 @@ import VX_fpu_pkg::*;
default: begin default: begin
read_addr_valid_r = 0; read_addr_valid_r = 0;
if ((read_addr >= `VX_CSR_MPM_BASE && read_addr < (`VX_CSR_MPM_BASE + 32)) if ((read_addr >= `VX_CSR_MPM_USER && read_addr < (`VX_CSR_MPM_USER + 32))
|| (read_addr >= `VX_CSR_MPM_BASE_H && read_addr < (`VX_CSR_MPM_BASE_H + 32))) begin || (read_addr >= `VX_CSR_MPM_USER_H && read_addr < (`VX_CSR_MPM_USER_H + 32))) begin
read_addr_valid_r = 1; read_addr_valid_r = 1;
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
case (base_dcrs.mpm_class) case (base_dcrs.mpm_class)
`VX_DCR_MPM_CLASS_CORE: begin `VX_DCR_MPM_CLASS_CORE: begin
case (read_addr) case (read_addr)
// PERF: pipeline // PERF: pipeline
`VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0];
`VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_FETCH_ST : read_data_ro_r = pipeline_perf_if.fetch_stalls[31:0];
`VX_CSR_MPM_FETCH_ST_H : read_data_ro_r = 32'(pipeline_perf_if.fetch_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0]; `VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0]; `VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
@@ -204,6 +208,19 @@ import VX_fpu_pkg::*;
`endif `endif
`VX_CSR_MPM_SFU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_SFU][31:0]; `VX_CSR_MPM_SFU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_SFU][31:0];
`VX_CSR_MPM_SFU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_SFU][`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SFU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_SFU][`PERF_CTR_BITS-1:32]);
// PERF: scoreboard per-unit usage counters.
// The base CSR address returns the low 32 bits and the *_H address returns the
// upper bits, the same convention used by every other counter in this case.
`VX_CSR_MPM_SCRB_ALU : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_ALU][31:0];
`VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_ALU][`PERF_CTR_BITS-1:32]);
`ifdef EXT_F_ENABLE
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_FPU][31:0];
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_FPU][`PERF_CTR_BITS-1:32]);
`else
// no FPU in this configuration: both halves read as zero
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0;
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0;
`endif
`VX_CSR_MPM_SCRB_LSU : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_LSU][31:0];
`VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_LSU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_SFU : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_SFU][31:0];
`VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_SFU][`PERF_CTR_BITS-1:32]);
// PERF: memory // PERF: memory
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0]; `VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
@@ -225,6 +242,8 @@ import VX_fpu_pkg::*;
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0]; `VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0];
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ICACHE_MSHR_ST : read_data_ro_r = mem_perf_if.icache.mshr_stalls[31:0];
`VX_CSR_MPM_ICACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.icache.mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: dcache // PERF: dcache
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0]; `VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0];
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]);

View File

@@ -59,6 +59,10 @@ module VX_issue #(
) scoreboard ( ) scoreboard (
.clk (clk), .clk (clk),
.reset (scoreboard_reset), .reset (scoreboard_reset),
`ifdef PERF_ENABLE
.perf_scb_stalls(perf_issue_if.scb_stalls),
.perf_scb_uses (perf_issue_if.scb_uses),
`endif
.writeback_if (writeback_if), .writeback_if (writeback_if),
.ibuffer_if (ibuffer_if), .ibuffer_if (ibuffer_if),
.scoreboard_if (scoreboard_if) .scoreboard_if (scoreboard_if)
@@ -152,29 +156,17 @@ module VX_issue #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls; reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
reg [`PERF_CTR_BITS-1:0] perf_scb_stalls;
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_stalls_per_cycle;
reg [`ISSUE_WIDTH-1:0] scoreboard_stalls;
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
assign scoreboard_stalls[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
end
`POP_COUNT(scoreboard_stalls_per_cycle, scoreboard_stalls);
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
perf_ibf_stalls <= '0; perf_ibf_stalls <= '0;
perf_scb_stalls <= '0;
end else begin end else begin
if (decode_if.valid && ~decode_if.ready) begin if (decode_if.valid && ~decode_if.ready) begin
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(1); perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(1);
end end
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(scoreboard_stalls_per_cycle);
end end
end end
assign perf_issue_if.ibf_stalls = perf_ibf_stalls; assign perf_issue_if.ibf_stalls = perf_ibf_stalls;
assign perf_issue_if.scb_stalls = perf_scb_stalls;
`endif `endif
endmodule endmodule

View File

@@ -19,6 +19,10 @@ module VX_schedule import VX_gpu_pkg::*; #(
input wire clk, input wire clk,
input wire reset, input wire reset,
`ifdef PERF_ENABLE
VX_pipeline_perf_if.schedule perf_schedule_if,
`endif
// configuration // configuration
input base_dcrs_t base_dcrs, input base_dcrs_t base_dcrs,
@@ -376,4 +380,21 @@ module VX_schedule import VX_gpu_pkg::*; #(
end end
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps)); `RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps));
`ifdef PERF_ENABLE
    // Scheduler profiling counters, exported through perf_schedule_if.
    reg [`PERF_CTR_BITS-1:0] perf_sched_stalls;
    reg [`PERF_CTR_BITS-1:0] perf_fetch_stalls;

    // per-cycle increment conditions (1 bit each)
    wire perf_sched_stall_inc = !schedule_valid;                           // schedule_valid is low this cycle
    wire perf_fetch_stall_inc = schedule_if.valid && !schedule_if.ready;   // valid output not accepted

    // cycles in which schedule_valid was low
    always @(posedge clk) begin
        if (reset) begin
            perf_sched_stalls <= '0;
        end else begin
            perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(perf_sched_stall_inc);
        end
    end

    // cycles in which schedule_if was valid but not ready
    always @(posedge clk) begin
        if (reset) begin
            perf_fetch_stalls <= '0;
        end else begin
            perf_fetch_stalls <= perf_fetch_stalls + `PERF_CTR_BITS'(perf_fetch_stall_inc);
        end
    end

    assign perf_schedule_if.fetch_stalls = perf_fetch_stalls;
    assign perf_schedule_if.sched_stalls = perf_sched_stalls;
`endif
endmodule endmodule

View File

@@ -19,6 +19,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
input wire clk, input wire clk,
input wire reset, input wire reset,
`ifdef PERF_ENABLE
output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls,
output reg [`PERF_CTR_BITS-1:0] perf_scb_uses [`NUM_EX_UNITS],
`endif
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH], VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_ibuffer_if.slave ibuffer_if [`ISSUE_WIDTH], VX_ibuffer_if.slave ibuffer_if [`ISSUE_WIDTH],
VX_ibuffer_if.master scoreboard_if [`ISSUE_WIDTH] VX_ibuffer_if.master scoreboard_if [`ISSUE_WIDTH]
@@ -26,81 +31,102 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`UNUSED_PARAM (CORE_ID) `UNUSED_PARAM (CORE_ID)
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1; localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1;
`ifdef PERF_ENABLE
// Per-cycle population counts that feed the profiling counters:
// one count per execution unit plus one for stalled issue slots.
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_alu_per_cycle;
`ifdef EXT_F_ENABLE
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_fpu_per_cycle;
`endif
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_lsu_per_cycle;
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_sfu_per_cycle;
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_stalls_per_cycle;
// One ISSUE_WIDTH-bit mask per execution unit, indexed by unit id (`EX_ALU, `EX_LSU, ...).
// The first dimension must hold one entry per unit, so it is sized by `NUM_EX_UNITS
// (like perf_scb_uses) and not by `EX_BITS, which is the bit-width of a unit id
// (inuse_units stores ex_type in `EX_BITS bits).
// NOTE(review): assumes `EX_BITS == clog2(`NUM_EX_UNITS) - confirm in VX_define.vh.
// NOTE(review): this vector is cleared and written from an always block inside the
// per-issue-slot generate loop; confirm that does not create multiple drivers
// when `ISSUE_WIDTH > 1.
reg [`NUM_EX_UNITS-1:0][`ISSUE_WIDTH-1:0] scoreboard_uses;
// bit i is set when issue slot i has a valid instruction that is not accepted
wire [`ISSUE_WIDTH-1:0] scoreboard_stalls;
`POP_COUNT(scoreboard_stalls_per_cycle, scoreboard_stalls);
`POP_COUNT(scoreboard_alu_per_cycle, scoreboard_uses[`EX_ALU]);
`ifdef EXT_F_ENABLE
`POP_COUNT(scoreboard_fpu_per_cycle, scoreboard_uses[`EX_FPU]);
`endif
`POP_COUNT(scoreboard_lsu_per_cycle, scoreboard_uses[`EX_LSU]);
`POP_COUNT(scoreboard_sfu_per_cycle, scoreboard_uses[`EX_SFU]);
`endif
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n; reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs;
reg [3:0] ready_masks, ready_masks_n;
VX_ibuffer_if staging_if(); VX_ibuffer_if staging_if();
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop; wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
wire inuse_rd = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd];
wire inuse_rs1 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1];
wire inuse_rs2 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2];
wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3];
`ifdef PERF_ENABLE
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_BITS-1:0] inuse_units;
always @(*) begin always @(*) begin
inuse_regs_n = inuse_regs; scoreboard_uses = '0;
ready_masks_n = ready_masks; if (ibuffer_if[i].valid) begin
if (writeback_fire) begin if (inuse_rd) begin
inuse_regs_n[writeback_if[i].data.wis][writeback_if[i].data.rd] = 0; scoreboard_uses[inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]][i] = 1;
ready_masks_n |= {4{(ISSUE_RATIO == 0) || writeback_if[i].data.wis == staging_if.data.wis}}
& {(writeback_if[i].data.rd == staging_if.data.rd),
(writeback_if[i].data.rd == staging_if.data.rs1),
(writeback_if[i].data.rd == staging_if.data.rs2),
(writeback_if[i].data.rd == staging_if.data.rs3)};
end end
if (staging_if.valid && staging_if.ready && staging_if.data.wb) begin if (inuse_rs1) begin
inuse_regs_n[staging_if.data.wis][staging_if.data.rd] = 1; scoreboard_uses[inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]][i] = 1;
ready_masks_n = '0;
end end
if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin if (inuse_rs2) begin
ready_masks_n = ~{inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd], scoreboard_uses[inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]][i] = 1;
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1], end
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2], if (inuse_rs3) begin
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]}; scoreboard_uses[inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]][i] = 1;
end end
end end
end
assign scoreboard_stalls[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
`endif
reg [DATAW-1:0] data_out_r;
reg valid_out_r;
wire [3:0] ready_masks = ~{inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
wire deps_ready = (& ready_masks);
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
valid_out_r <= 0;
inuse_regs <= '0; inuse_regs <= '0;
ready_masks <= '0;
end else begin end else begin
inuse_regs <= inuse_regs_n; if (writeback_fire) begin
ready_masks <= ready_masks_n; inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0;
end
if (~valid_out_r) begin
valid_out_r <= ibuffer_if[i].valid && deps_ready;
end else if (staging_if.ready) begin
if (staging_if.data.wb) begin
inuse_regs[staging_if.data.wis][staging_if.data.rd] <= 1;
`ifdef PERF_ENABLE
inuse_units[staging_if.data.wis][staging_if.data.rd] <= staging_if.data.ex_type;
`endif
end
valid_out_r <= 0;
end
end
if (~valid_out_r) begin
data_out_r <= ibuffer_if[i].data;
end end
end end
// staging buffer assign ibuffer_if[i].ready = ~valid_out_r && deps_ready;
assign staging_if.valid = valid_out_r;
`RESET_RELAY (stg_buf_reset, reset); assign staging_if.data = data_out_r;
VX_elastic_buffer #(
.DATAW (DATAW)
) stg_buf (
.clk (clk),
.reset (stg_buf_reset),
.valid_in (ibuffer_if[i].valid),
.ready_in (ibuffer_if[i].ready),
.data_in (ibuffer_if[i].data),
.data_out (staging_if.data),
.valid_out (staging_if.valid),
.ready_out (staging_if.ready)
);
// output buffer
wire valid_stg, ready_stg;
wire regs_ready = (& ready_masks);
assign valid_stg = staging_if.valid && regs_ready;
assign staging_if.ready = ready_stg && regs_ready;
`RESET_RELAY (out_buf_reset, reset);
VX_elastic_buffer #( VX_elastic_buffer #(
.DATAW (DATAW), .DATAW (DATAW),
.SIZE (2), .SIZE (0),
.OUT_REG (2) .OUT_REG (2)
) out_buf ( ) out_buf (
.clk (clk), .clk (clk),
.reset (out_buf_reset), .reset (reset),
.valid_in (valid_stg), .valid_in (staging_if.valid),
.ready_in (ready_stg), .ready_in (staging_if.ready),
.data_in (staging_if.data), .data_in (staging_if.data),
.data_out (scoreboard_if[i].data), .data_out (scoreboard_if[i].data),
.valid_out (scoreboard_if[i].valid), .valid_out (scoreboard_if[i].valid),
@@ -114,14 +140,14 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
if (reset) begin if (reset) begin
timeout_ctr <= '0; timeout_ctr <= '0;
end else begin end else begin
if (staging_if.valid && ~regs_ready) begin if (ibuffer_if[i].valid && ~ibuffer_if[i].ready) begin
`ifdef DBG_TRACE_CORE_PIPELINE `ifdef DBG_TRACE_CORE_PIPELINE
`TRACE(3, ("%d: *** core%0d-scoreboard-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n", `TRACE(3, ("%d: *** core%0d-scoreboard-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
$time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr, $time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
~ready_masks, staging_if.data.uuid)); ~ready_masks, ibuffer_if[i].data.uuid));
`endif `endif
timeout_ctr <= timeout_ctr + 1; timeout_ctr <= timeout_ctr + 1;
end else if (staging_if.valid && staging_if.ready) begin end else if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin
timeout_ctr <= '0; timeout_ctr <= '0;
end end
end end
@@ -129,8 +155,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT), `RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)", ("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
$time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr, $time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
~ready_masks, staging_if.data.uuid)); ~ready_masks, ibuffer_if[i].data.uuid));
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0, `RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0,
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)", ("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
@@ -139,4 +165,26 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end end
`ifdef PERF_ENABLE
    // Total scoreboard stalls: accumulates the number of stalled issue slots each cycle.
    always @(posedge clk) begin
        if (reset) begin
            perf_scb_stalls <= '0;
        end else begin
            perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(scoreboard_stalls_per_cycle);
        end
    end

    // Per-execution-unit usage: accumulates each unit's per-cycle pop-count.
    // The FPU entry only exists when EXT_F_ENABLE is defined.
    always @(posedge clk) begin
        if (reset) begin
            perf_scb_uses[`EX_SFU] <= '0;
            perf_scb_uses[`EX_LSU] <= '0;
        `ifdef EXT_F_ENABLE
            perf_scb_uses[`EX_FPU] <= '0;
        `endif
            perf_scb_uses[`EX_ALU] <= '0;
        end else begin
            perf_scb_uses[`EX_SFU] <= perf_scb_uses[`EX_SFU] + `PERF_CTR_BITS'(scoreboard_sfu_per_cycle);
            perf_scb_uses[`EX_LSU] <= perf_scb_uses[`EX_LSU] + `PERF_CTR_BITS'(scoreboard_lsu_per_cycle);
        `ifdef EXT_F_ENABLE
            perf_scb_uses[`EX_FPU] <= perf_scb_uses[`EX_FPU] + `PERF_CTR_BITS'(scoreboard_fpu_per_cycle);
        `endif
            perf_scb_uses[`EX_ALU] <= perf_scb_uses[`EX_ALU] + `PERF_CTR_BITS'(scoreboard_alu_per_cycle);
        end
    end
`endif
endmodule endmodule

View File

@@ -14,8 +14,11 @@
`include "VX_define.vh" `include "VX_define.vh"
interface VX_pipeline_perf_if (); interface VX_pipeline_perf_if ();
wire [`PERF_CTR_BITS-1:0] sched_stalls;
wire [`PERF_CTR_BITS-1:0] fetch_stalls;
wire [`PERF_CTR_BITS-1:0] ibf_stalls; wire [`PERF_CTR_BITS-1:0] ibf_stalls;
wire [`PERF_CTR_BITS-1:0] scb_stalls; wire [`PERF_CTR_BITS-1:0] scb_stalls;
wire [`PERF_CTR_BITS-1:0] scb_uses [`NUM_EX_UNITS];
wire [`PERF_CTR_BITS-1:0] dsp_stalls [`NUM_EX_UNITS]; wire [`PERF_CTR_BITS-1:0] dsp_stalls [`NUM_EX_UNITS];
wire [`PERF_CTR_BITS-1:0] ifetches; wire [`PERF_CTR_BITS-1:0] ifetches;
@@ -24,15 +27,24 @@ interface VX_pipeline_perf_if ();
wire [`PERF_CTR_BITS-1:0] ifetch_latency; wire [`PERF_CTR_BITS-1:0] ifetch_latency;
wire [`PERF_CTR_BITS-1:0] load_latency; wire [`PERF_CTR_BITS-1:0] load_latency;
modport schedule (
output sched_stalls,
output fetch_stalls
);
modport issue ( modport issue (
output ibf_stalls, output ibf_stalls,
output scb_stalls, output scb_stalls,
output scb_uses,
output dsp_stalls output dsp_stalls
); );
modport slave ( modport slave (
input sched_stalls,
input fetch_stalls,
input ibf_stalls, input ibf_stalls,
input scb_stalls, input scb_stalls,
input scb_uses,
input dsp_stalls, input dsp_stalls,
input ifetches, input ifetches,
input loads, input loads,

View File

@@ -173,8 +173,8 @@ module VX_stream_xbar #(
end end
// compute inputs collision // compute inputs collision
// we have a collision when there exists a valid transfer with mutiple input candicates // we have a collision when there exists a valid transfer with multiple input candidates
// we caount the unique duplicates each cycle. // we count the unique duplicates each cycle.
reg [PERF_CTR_BITS-1:0] collisions_r; reg [PERF_CTR_BITS-1:0] collisions_r;
reg [NUM_INPUTS-1:0] per_cycle_collision; reg [NUM_INPUTS-1:0] per_cycle_collision;