diff --git a/hw/rtl/core/VX_commit.sv b/hw/rtl/core/VX_commit.sv index e5dbe97c..a76f395b 100644 --- a/hw/rtl/core/VX_commit.sv +++ b/hw/rtl/core/VX_commit.sv @@ -44,7 +44,7 @@ module VX_commit import VX_gpu_pkg::*; #( VX_commit_if commit_if[`ISSUE_WIDTH](); - wire [`ISSUE_WIDTH-1:0] commit_fire; + wire [`ISSUE_WIDTH-1:0] commit_fire; wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] commit_wid; wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] commit_tmask; wire [`ISSUE_WIDTH-1:0] commit_eop; @@ -91,24 +91,22 @@ module VX_commit import VX_gpu_pkg::*; #( `UNUSED_PIN (sel_out) ); - assign commit_fire[i] = commit_if[i].valid && commit_if[i].ready; - assign commit_tmask[i] = {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask; - assign commit_wid[i] = commit_if[i].data.wid; - assign commit_eop[i] = commit_if[i].data.eop; + assign commit_fire[i] = commit_if[i].valid && commit_if[i].ready; + assign commit_tmask[i]= {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask; + assign commit_wid[i] = commit_if[i].data.wid; + assign commit_eop[i] = commit_if[i].data.eop; end // CSRs update wire [`ISSUE_WIDTH-1:0][COMMIT_SIZEW-1:0] commit_size, commit_size_r; - wire [COMMIT_ALL_SIZEW-1:0] commit_size_all, commit_size_all_r; + wire [COMMIT_ALL_SIZEW-1:0] commit_size_all_r, commit_size_all_rr; wire commit_fire_any, commit_fire_any_r, commit_fire_any_rr; assign commit_fire_any = (| commit_fire); for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin - wire [COMMIT_SIZEW-1:0] pop_count; - `POP_COUNT(pop_count, commit_tmask[i]); - assign commit_size[i] = pop_count; + `POP_COUNT(commit_size[i], commit_tmask[i]); end VX_pipe_register #( @@ -129,7 +127,7 @@ module VX_commit import VX_gpu_pkg::*; #( .OP ("+") ) commit_size_reduce ( .data_in (commit_size_r), - .data_out (commit_size_all) + .data_out (commit_size_all_r) ); VX_pipe_register #( @@ -139,26 +137,26 @@ module VX_commit import VX_gpu_pkg::*; #( .clk (clk), .reset (reset), .enable (1'b1), - .data_in ({commit_fire_any_r, commit_size_all}), - 
.data_out ({commit_fire_any_rr, commit_size_all_r}) + .data_in ({commit_fire_any_r, commit_size_all_r}), + .data_out ({commit_fire_any_rr, commit_size_all_rr}) ); reg [`PERF_CTR_BITS-1:0] instret; - always @(posedge clk) begin if (reset) begin instret <= '0; end else begin if (commit_fire_any_rr) begin - instret <= instret + `PERF_CTR_BITS'(commit_size_all_r); + instret <= instret + `PERF_CTR_BITS'(commit_size_all_rr); end end end - assign commit_csr_if.instret = instret; // Committed instructions + wire [`ISSUE_WIDTH-1:0] committed = commit_fire & commit_eop; + VX_pipe_register #( .DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)), .RESETW (`ISSUE_WIDTH) @@ -166,23 +164,23 @@ module VX_commit import VX_gpu_pkg::*; #( .clk (clk), .reset (reset), .enable (1'b1), - .data_in ({(commit_fire & commit_eop), commit_wid}), + .data_in ({committed, commit_wid}), .data_out ({commit_sched_if.committed, commit_sched_if.committed_wid}) ); // Writeback for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin - assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb; + assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb; assign writeback_if[i].data.uuid = commit_if[i].data.uuid; - assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid); - assign writeback_if[i].data.PC = commit_if[i].data.PC; - assign writeback_if[i].data.tmask = commit_if[i].data.tmask; - assign writeback_if[i].data.rd = commit_if[i].data.rd; + assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid); + assign writeback_if[i].data.PC = commit_if[i].data.PC; + assign writeback_if[i].data.tmask= commit_if[i].data.tmask; + assign writeback_if[i].data.rd = commit_if[i].data.rd; assign writeback_if[i].data.data = commit_if[i].data.data; - assign writeback_if[i].data.sop = commit_if[i].data.sop; - assign writeback_if[i].data.eop = commit_if[i].data.eop; - assign commit_if[i].ready = 1'b1; + assign writeback_if[i].data.sop = commit_if[i].data.sop; + assign 
writeback_if[i].data.eop = commit_if[i].data.eop; + assign commit_if[i].ready = 1'b1; // writeback has no backpressure end // simulation helper signal to get RISC-V tests Pass/Fail status diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index 8aaea911..684a9b84 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -116,7 +116,11 @@ module VX_core import VX_gpu_pkg::*; #( .CORE_ID (CORE_ID) ) schedule ( .clk (clk), - .reset (schedule_reset), + .reset (schedule_reset), + + `ifdef PERF_ENABLE + .perf_schedule_if (pipeline_perf_if.schedule), + `endif .base_dcrs (base_dcrs), diff --git a/hw/rtl/core/VX_csr_data.sv b/hw/rtl/core/VX_csr_data.sv index 9ba0ffd0..44e997ff 100644 --- a/hw/rtl/core/VX_csr_data.sv +++ b/hw/rtl/core/VX_csr_data.sv @@ -179,14 +179,18 @@ import VX_fpu_pkg::*; default: begin read_addr_valid_r = 0; - if ((read_addr >= `VX_CSR_MPM_BASE && read_addr < (`VX_CSR_MPM_BASE + 32)) - || (read_addr >= `VX_CSR_MPM_BASE_H && read_addr < (`VX_CSR_MPM_BASE_H + 32))) begin + if ((read_addr >= `VX_CSR_MPM_USER && read_addr < (`VX_CSR_MPM_USER + 32)) + || (read_addr >= `VX_CSR_MPM_USER_H && read_addr < (`VX_CSR_MPM_USER_H + 32))) begin read_addr_valid_r = 1; `ifdef PERF_ENABLE case (base_dcrs.mpm_class) `VX_DCR_MPM_CLASS_CORE: begin case (read_addr) - // PERF: pipeline + // PERF: pipeline + `VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0]; + `VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_FETCH_ST : read_data_ro_r = pipeline_perf_if.fetch_stalls[31:0]; + `VX_CSR_MPM_FETCH_ST_H : read_data_ro_r = 32'(pipeline_perf_if.fetch_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0]; `VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0]; @@ -204,6 +208,19 @@ import VX_fpu_pkg::*; `endif 
`VX_CSR_MPM_SFU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_SFU][31:0]; `VX_CSR_MPM_SFU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_SFU][`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_SCRB_ALU : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_ALU][31:0]; + `VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_ALU][`PERF_CTR_BITS-1:32]); + `ifdef EXT_F_ENABLE + `VX_CSR_MPM_SCRB_FPU : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_FPU][31:0]; + `VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_FPU][`PERF_CTR_BITS-1:32]); + `else + `VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0; + `VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0; + `endif + `VX_CSR_MPM_SCRB_LSU : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_LSU][31:0]; + `VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_LSU][`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_SCRB_SFU : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_SFU][31:0]; + `VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_SFU][`PERF_CTR_BITS-1:32]); // PERF: memory `VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0]; `VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]); @@ -214,7 +231,7 @@ import VX_fpu_pkg::*; `VX_CSR_MPM_IFETCH_LAT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0]; `VX_CSR_MPM_IFETCH_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_LOAD_LAT : read_data_ro_r = pipeline_perf_if.load_latency[31:0]; - `VX_CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]); default:; endcase end @@ -225,6 +242,8 @@ import VX_fpu_pkg::*; `VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r =
mem_perf_if.icache.read_misses[31:0]; `VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_ICACHE_MSHR_ST : read_data_ro_r = mem_perf_if.icache.mshr_stalls[31:0]; + `VX_CSR_MPM_ICACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.icache.mshr_stalls[`PERF_CTR_BITS-1:32]); // PERF: dcache `VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0]; `VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]); diff --git a/hw/rtl/core/VX_issue.sv b/hw/rtl/core/VX_issue.sv index af00014e..53701cc8 100644 --- a/hw/rtl/core/VX_issue.sv +++ b/hw/rtl/core/VX_issue.sv @@ -59,6 +59,10 @@ module VX_issue #( ) scoreboard ( .clk (clk), .reset (scoreboard_reset), + `ifdef PERF_ENABLE + .perf_scb_stalls(perf_issue_if.scb_stalls), + .perf_scb_uses (perf_issue_if.scb_uses), + `endif .writeback_if (writeback_if), .ibuffer_if (ibuffer_if), .scoreboard_if (scoreboard_if) @@ -152,29 +156,17 @@ module VX_issue #( `ifdef PERF_ENABLE reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls; - reg [`PERF_CTR_BITS-1:0] perf_scb_stalls; - - wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_stalls_per_cycle; - reg [`ISSUE_WIDTH-1:0] scoreboard_stalls; - for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin - assign scoreboard_stalls[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready; - end - `POP_COUNT(scoreboard_stalls_per_cycle, scoreboard_stalls); - always @(posedge clk) begin if (reset) begin perf_ibf_stalls <= '0; - perf_scb_stalls <= '0; end else begin if (decode_if.valid && ~decode_if.ready) begin perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(1); end - perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(scoreboard_stalls_per_cycle); end end assign perf_issue_if.ibf_stalls = perf_ibf_stalls; - assign perf_issue_if.scb_stalls = perf_scb_stalls; `endif endmodule diff --git a/hw/rtl/core/VX_schedule.sv b/hw/rtl/core/VX_schedule.sv index ea96178e..0ffeafc2 100644 --- a/hw/rtl/core/VX_schedule.sv +++ 
b/hw/rtl/core/VX_schedule.sv @@ -19,6 +19,10 @@ module VX_schedule import VX_gpu_pkg::*; #( input wire clk, input wire reset, +`ifdef PERF_ENABLE + VX_pipeline_perf_if.schedule perf_schedule_if, +`endif + // configuration input base_dcrs_t base_dcrs, @@ -376,4 +380,21 @@ module VX_schedule import VX_gpu_pkg::*; #( end `RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps)); +`ifdef PERF_ENABLE + reg [`PERF_CTR_BITS-1:0] perf_sched_stalls; + reg [`PERF_CTR_BITS-1:0] perf_fetch_stalls; + always @(posedge clk) begin + if (reset) begin + perf_sched_stalls <= '0; + perf_fetch_stalls <= '0; + end else begin + perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(!schedule_valid); + perf_fetch_stalls <= perf_fetch_stalls + `PERF_CTR_BITS'(schedule_if.valid && !schedule_if.ready); + end + end + + assign perf_schedule_if.sched_stalls = perf_sched_stalls; + assign perf_schedule_if.fetch_stalls = perf_fetch_stalls; +`endif + endmodule diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv index 90a58134..e3eaa44a 100644 --- a/hw/rtl/core/VX_scoreboard.sv +++ b/hw/rtl/core/VX_scoreboard.sv @@ -19,6 +19,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #( input wire clk, input wire reset, +`ifdef PERF_ENABLE + output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls, + output reg [`PERF_CTR_BITS-1:0] perf_scb_uses [`NUM_EX_UNITS], +`endif + VX_writeback_if.slave writeback_if [`ISSUE_WIDTH], VX_ibuffer_if.slave ibuffer_if [`ISSUE_WIDTH], VX_ibuffer_if.master scoreboard_if [`ISSUE_WIDTH] @@ -26,81 +31,102 @@ module VX_scoreboard import VX_gpu_pkg::*; #( `UNUSED_PARAM (CORE_ID) localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1; +`ifdef PERF_ENABLE + wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_alu_per_cycle; +`ifdef EXT_F_ENABLE + wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_fpu_per_cycle; 
+`endif + wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_lsu_per_cycle; + wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_sfu_per_cycle; + wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_stalls_per_cycle; + reg [`NUM_EX_UNITS-1:0][`ISSUE_WIDTH-1:0] scoreboard_uses; + wire [`ISSUE_WIDTH-1:0] scoreboard_stalls; + `POP_COUNT(scoreboard_stalls_per_cycle, scoreboard_stalls); + `POP_COUNT(scoreboard_alu_per_cycle, scoreboard_uses[`EX_ALU]); +`ifdef EXT_F_ENABLE + `POP_COUNT(scoreboard_fpu_per_cycle, scoreboard_uses[`EX_FPU]); +`endif + `POP_COUNT(scoreboard_lsu_per_cycle, scoreboard_uses[`EX_LSU]); + `POP_COUNT(scoreboard_sfu_per_cycle, scoreboard_uses[`EX_SFU]); +`endif + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin - reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n; - reg [3:0] ready_masks, ready_masks_n; + reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs; VX_ibuffer_if staging_if(); - + wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop; + wire inuse_rd = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]; + wire inuse_rs1 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]; + wire inuse_rs2 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]; + wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]; + + `ifdef PERF_ENABLE + reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_BITS-1:0] inuse_units; always @(*) begin - inuse_regs_n = inuse_regs; - ready_masks_n = ready_masks; - if (writeback_fire) begin - inuse_regs_n[writeback_if[i].data.wis][writeback_if[i].data.rd] = 0; - ready_masks_n |= {4{(ISSUE_RATIO == 0) || writeback_if[i].data.wis == staging_if.data.wis}} - & {(writeback_if[i].data.rd == staging_if.data.rd), - (writeback_if[i].data.rd == staging_if.data.rs1), - (writeback_if[i].data.rd == staging_if.data.rs2), - (writeback_if[i].data.rd == staging_if.data.rs3)}; - end - if (staging_if.valid && staging_if.ready && staging_if.data.wb) begin -
inuse_regs_n[staging_if.data.wis][staging_if.data.rd] = 1; - ready_masks_n = '0; + scoreboard_uses = '0; + if (ibuffer_if[i].valid) begin + if (inuse_rd) begin + scoreboard_uses[inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]][i] = 1; + end + if (inuse_rs1) begin + scoreboard_uses[inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]][i] = 1; + end + if (inuse_rs2) begin + scoreboard_uses[inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]][i] = 1; + end + if (inuse_rs3) begin + scoreboard_uses[inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]][i] = 1; + end end - if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin - ready_masks_n = ~{inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd], - inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1], - inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2], - inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]}; - end - end + end + assign scoreboard_stalls[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready; + `endif + + reg [DATAW-1:0] data_out_r; + reg valid_out_r; + + wire [3:0] ready_masks = ~{inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3}; + wire deps_ready = (& ready_masks); always @(posedge clk) begin if (reset) begin - inuse_regs <= '0; - ready_masks <= '0; - end else begin - inuse_regs <= inuse_regs_n; - ready_masks <= ready_masks_n; + valid_out_r <= 0; + inuse_regs <= '0; + end else begin + if (writeback_fire) begin + inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0; + end + if (~valid_out_r) begin + valid_out_r <= ibuffer_if[i].valid && deps_ready; + end else if (staging_if.ready) begin + if (staging_if.data.wb) begin + inuse_regs[staging_if.data.wis][staging_if.data.rd] <= 1; + `ifdef PERF_ENABLE + inuse_units[staging_if.data.wis][staging_if.data.rd] <= staging_if.data.ex_type; + `endif + end + valid_out_r <= 0; + end + end + if (~valid_out_r) begin + data_out_r <= ibuffer_if[i].data; end end - // staging buffer - - `RESET_RELAY 
(stg_buf_reset, reset); - - VX_elastic_buffer #( - .DATAW (DATAW) - ) stg_buf ( - .clk (clk), - .reset (stg_buf_reset), - .valid_in (ibuffer_if[i].valid), - .ready_in (ibuffer_if[i].ready), - .data_in (ibuffer_if[i].data), - .data_out (staging_if.data), - .valid_out (staging_if.valid), - .ready_out (staging_if.ready) - ); - - // output buffer - - wire valid_stg, ready_stg; - wire regs_ready = (& ready_masks); - assign valid_stg = staging_if.valid && regs_ready; - assign staging_if.ready = ready_stg && regs_ready; - - `RESET_RELAY (out_buf_reset, reset); + assign ibuffer_if[i].ready = ~valid_out_r && deps_ready; + assign staging_if.valid = valid_out_r; + assign staging_if.data = data_out_r; VX_elastic_buffer #( .DATAW (DATAW), - .SIZE (2), + .SIZE (0), .OUT_REG (2) ) out_buf ( .clk (clk), - .reset (out_buf_reset), - .valid_in (valid_stg), - .ready_in (ready_stg), + .reset (reset), + .valid_in (staging_if.valid), + .ready_in (staging_if.ready), .data_in (staging_if.data), .data_out (scoreboard_if[i].data), .valid_out (scoreboard_if[i].valid), @@ -108,29 +134,29 @@ module VX_scoreboard import VX_gpu_pkg::*; #( ); `ifdef SIMULATION - reg [31:0] timeout_ctr; - + reg [31:0] timeout_ctr; + always @(posedge clk) begin if (reset) begin timeout_ctr <= '0; end else begin - if (staging_if.valid && ~regs_ready) begin + if (ibuffer_if[i].valid && ~ibuffer_if[i].ready) begin `ifdef DBG_TRACE_CORE_PIPELINE `TRACE(3, ("%d: *** core%0d-scoreboard-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n", - $time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr, - ~ready_masks, staging_if.data.uuid)); + $time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr, + ~ready_masks, ibuffer_if[i].data.uuid)); `endif timeout_ctr <= timeout_ctr + 1; - end else if (staging_if.valid && staging_if.ready) begin + end else if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin 
timeout_ctr <= '0; end end - end - + end + `RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT), ("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)", - $time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr, - ~ready_masks, staging_if.data.uuid)); + $time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr, + ~ready_masks, ibuffer_if[i].data.uuid)); `RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0, ("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)", @@ -139,4 +165,26 @@ module VX_scoreboard import VX_gpu_pkg::*; #( end +`ifdef PERF_ENABLE + always @(posedge clk) begin + if (reset) begin + perf_scb_stalls <= '0; + perf_scb_uses[`EX_ALU] <= '0; + `ifdef EXT_F_ENABLE + perf_scb_uses[`EX_FPU] <= '0; + `endif + perf_scb_uses[`EX_LSU] <= '0; + perf_scb_uses[`EX_SFU] <= '0; + end else begin + perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(scoreboard_stalls_per_cycle); + perf_scb_uses[`EX_ALU] <= perf_scb_uses[`EX_ALU] + `PERF_CTR_BITS'(scoreboard_alu_per_cycle); + `ifdef EXT_F_ENABLE + perf_scb_uses[`EX_FPU] <= perf_scb_uses[`EX_FPU] + `PERF_CTR_BITS'(scoreboard_fpu_per_cycle); + `endif + perf_scb_uses[`EX_LSU] <= perf_scb_uses[`EX_LSU] + `PERF_CTR_BITS'(scoreboard_lsu_per_cycle); + perf_scb_uses[`EX_SFU] <= perf_scb_uses[`EX_SFU] + `PERF_CTR_BITS'(scoreboard_sfu_per_cycle); + end + end +`endif + endmodule diff --git a/hw/rtl/interfaces/VX_pipeline_perf_if.sv b/hw/rtl/interfaces/VX_pipeline_perf_if.sv index b6123b7f..4f6ffb5d 100644 --- a/hw/rtl/interfaces/VX_pipeline_perf_if.sv +++ b/hw/rtl/interfaces/VX_pipeline_perf_if.sv @@ -14,8 +14,11 @@ `include "VX_define.vh" interface VX_pipeline_perf_if (); + wire [`PERF_CTR_BITS-1:0] sched_stalls; + wire [`PERF_CTR_BITS-1:0] fetch_stalls; wire [`PERF_CTR_BITS-1:0] ibf_stalls; wire 
[`PERF_CTR_BITS-1:0] scb_stalls; + wire [`PERF_CTR_BITS-1:0] scb_uses [`NUM_EX_UNITS]; wire [`PERF_CTR_BITS-1:0] dsp_stalls [`NUM_EX_UNITS]; wire [`PERF_CTR_BITS-1:0] ifetches; @@ -24,15 +27,24 @@ interface VX_pipeline_perf_if (); wire [`PERF_CTR_BITS-1:0] ifetch_latency; wire [`PERF_CTR_BITS-1:0] load_latency; + modport schedule ( + output sched_stalls, + output fetch_stalls + ); + modport issue ( output ibf_stalls, output scb_stalls, + output scb_uses, output dsp_stalls - ); + ); modport slave ( + input sched_stalls, + input fetch_stalls, input ibf_stalls, input scb_stalls, + input scb_uses, input dsp_stalls, input ifetches, input loads, diff --git a/hw/rtl/libs/VX_stream_xbar.sv b/hw/rtl/libs/VX_stream_xbar.sv index a7de9341..7c1f0f7a 100644 --- a/hw/rtl/libs/VX_stream_xbar.sv +++ b/hw/rtl/libs/VX_stream_xbar.sv @@ -22,7 +22,7 @@ module VX_stream_xbar #( parameter OUT_WIDTH = `LOG2UP(NUM_OUTPUTS), parameter ARBITER = "P", parameter LOCK_ENABLE = 0, - parameter OUT_REG = 0, + parameter OUT_REG = 0, parameter MAX_FANOUT = `MAX_FANOUT, parameter PERF_CTR_BITS = `CLOG2(NUM_INPUTS+1) ) ( @@ -173,8 +173,8 @@ module VX_stream_xbar #( end // compute inputs collision - // we have a collision when there exists a valid transfer with mutiple input candicates - // we caount the unique duplicates each cycle. + // we have a collision when there exists a valid transfer with multiple input candidates + // we count the unique duplicates each cycle. reg [PERF_CTR_BITS-1:0] collisions_r; reg [NUM_INPUTS-1:0] per_cycle_collision;