diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh
index 1f2e6545..67a11baf 100644
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@@ -264,7 +264,7 @@
 
 // Number of SFU units
 `ifndef NUM_SFU_LANES
-`define NUM_SFU_LANES `NUM_THREADS
+`define NUM_SFU_LANES `MIN(`NUM_THREADS, 4)
 `endif
 
 // Size of Instruction Buffer
diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv
index 0c9a7ac1..5c898a93 100644
--- a/hw/rtl/core/VX_core.sv
+++ b/hw/rtl/core/VX_core.sv
@@ -434,6 +434,8 @@ module VX_core import VX_gpu_pkg::*; #(
                 $itor(pipeline_perf_if.scb_any_fire_cycles) / $itor(cycles) * 100.0);
             $display("issue scoreboard: stalls total:\t%d across ISSUE_WIDTH=%d",
                 pipeline_perf_if.scb_stalls, `ISSUE_WIDTH);
+            $display("issue scoreboard: stalls by operand hazard: total %d across ISSUE_WIDTH=%d",
+                pipeline_perf_if.scb_any_unit_uses, `ISSUE_WIDTH);
             $display("issue scoreboard: stalls by operand hazard: alu %d (%2.2f cycles per issue)",
                 scrb_alu_per_core,
                 $itor(scrb_alu_per_core) / $itor(pipeline_perf_if.dispatch_fires[`EX_ALU]));
diff --git a/hw/rtl/core/VX_issue.sv b/hw/rtl/core/VX_issue.sv
index 4e79ce70..14b1a17d 100644
--- a/hw/rtl/core/VX_issue.sv
+++ b/hw/rtl/core/VX_issue.sv
@@ -61,6 +61,7 @@ module VX_issue #(
         .reset (scoreboard_reset),
     `ifdef PERF_ENABLE
         .perf_scb_stalls(perf_issue_if.scb_stalls),
+        .perf_scb_any_unit_uses(perf_issue_if.scb_any_unit_uses),
         .perf_scb_fires (perf_issue_if.scb_fires),
         .perf_scb_any_fire_cycles (perf_issue_if.scb_any_fire_cycles),
         .perf_units_uses(perf_issue_if.units_uses),
@@ -71,8 +72,13 @@ module VX_issue #(
         .scoreboard_if (scoreboard_if)
     );
 
+`ifdef GPR_DUPLICATED
+    VX_operands_dup #(
+`else
     VX_operands #(
-        .CORE_ID (CORE_ID)
+`endif
+        .CORE_ID (CORE_ID),
+        .CACHE_ENABLE (0)
     ) operands (
         .clk (clk),
         .reset (operands_reset),
diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv
index 28f1edf1..c5182ff7 100644
--- a/hw/rtl/core/VX_operands.sv
+++ b/hw/rtl/core/VX_operands.sv
@@ -13,6 +13,8 @@
 
 `include "VX_define.vh"
 
+`ifndef GPR_DUPLICATED
+
 module VX_operands import VX_gpu_pkg::*; #(
     parameter CORE_ID = 0,
     parameter CACHE_ENABLE = 0
@@ -197,9 +199,10 @@ module VX_operands import VX_gpu_pkg::*; #(
         assign stg_valid_in = scoreboard_if[i].valid && data_ready;
         assign scoreboard_if[i].ready = stg_ready_in && data_ready;
 
-        // NOTE(hansung): toggle_buffer is 1-reg pipe without flow, halving
-        // throughput. Wouldn't this cap overall IPC? Or OK as long as
-        // ISSUE_WIDTH > 1?
+        // NOTE(hansung): Cannot use stream_buffer here for full throughput
+        // because data registers (rs1_data, ...) are single-buffered. This
+        // will probably cap IPC at 50% (notwithstanding the 1-operand-per-cycle
+        // limit.)
         VX_toggle_buffer #(
             .DATAW (DATAW)
         ) staging_buffer (
@@ -295,3 +298,5 @@ module VX_operands import VX_gpu_pkg::*; #(
     end
 
 endmodule
+
+`endif
diff --git a/hw/rtl/core/VX_operands_dup.sv b/hw/rtl/core/VX_operands_dup.sv
new file mode 100644
index 00000000..b43f0976
--- /dev/null
+++ b/hw/rtl/core/VX_operands_dup.sv
@@ -0,0 +1,246 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_define.vh"
+
+`ifdef GPR_DUPLICATED
+
+// Operand stage variant selected by GPR_DUPLICATED. For every issue slot and
+// thread it keeps three register-file RAMs (gpr_ram_rs1/rs2/rs3) that receive
+// the same writeback and are each read at one source-register address, so the
+// rs1, rs2 and rs3 reads do not share a RAM read port.
+// VX_issue instantiates it with the same parameter and port connections as
+// VX_operands.
+module VX_operands_dup import VX_gpu_pkg::*; #(
+    parameter CORE_ID = 0,
+    parameter CACHE_ENABLE = 0
+) (
+    input wire clk,
+    input wire reset,
+
+    VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
+    VX_ibuffer_if.slave scoreboard_if [`ISSUE_WIDTH],
+    VX_operands_if.master operands_if [`ISSUE_WIDTH]
+);
+    `UNUSED_PARAM (CORE_ID)
+    // CACHE_ENABLE is accepted only because VX_issue passes it; it is not used here.
+    `UNUSED_PARAM (CACHE_ENABLE)
+    localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS;
+    localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO);
+
+    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
+        // Instruction metadata is staged through a VX_stream_buffer, not the
+        // VX_toggle_buffer used by VX_operands, so the toggle_buffer
+        // throughput note in VX_operands.sv does not describe this instance.
+        VX_stream_buffer #(
+            .DATAW (DATAW)
+        ) staging_buffer (
+            .clk (clk),
+            .reset (reset),
+            .valid_in (scoreboard_if[i].valid),
+            .data_in ({
+                scoreboard_if[i].data.uuid,
+                scoreboard_if[i].data.wis,
+                scoreboard_if[i].data.tmask,
+                scoreboard_if[i].data.PC,
+                scoreboard_if[i].data.wb,
+                scoreboard_if[i].data.ex_type,
+                scoreboard_if[i].data.op_type,
+                scoreboard_if[i].data.op_mod,
+                scoreboard_if[i].data.use_PC,
+                scoreboard_if[i].data.use_imm,
+                scoreboard_if[i].data.imm,
+                scoreboard_if[i].data.rd
+            }),
+            .ready_in (scoreboard_if[i].ready),
+            .valid_out (operands_if[i].valid),
+            .data_out ({
+                operands_if[i].data.uuid,
+                operands_if[i].data.wis,
+                operands_if[i].data.tmask,
+                operands_if[i].data.PC,
+                operands_if[i].data.wb,
+                operands_if[i].data.ex_type,
+                operands_if[i].data.op_type,
+                operands_if[i].data.op_mod,
+                operands_if[i].data.use_PC,
+                operands_if[i].data.use_imm,
+                operands_if[i].data.imm,
+                operands_if[i].data.rd
+            }),
+            .ready_out (operands_if[i].ready)
+        );
+
+        wire [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data;
+        wire [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data;
+        wire [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data;
+
+        // Per-thread operand data is staged in separate buffers driven by the
+        // same valid_in/ready_out as staging_buffer; their own ready_in and
+        // valid_out are left unconnected.
+        // NOTE(review): assumes these stay in lock-step with staging_buffer.
+        for (genvar j = 0; j < `NUM_THREADS; ++j) begin
+            VX_stream_buffer #(
+                .DATAW (`XLEN + `XLEN + `XLEN)
+            ) staging_data_buffer (
+                .clk (clk),
+                .reset (reset),
+                .valid_in (scoreboard_if[i].valid),
+                .data_in ({
+                    rs1_data[j], rs2_data[j], rs3_data[j]
+                }),
+                `UNUSED_PIN (ready_in),
+                `UNUSED_PIN (valid_out),
+                .data_out ({
+                    operands_if[i].data.rs1_data[j],
+                    operands_if[i].data.rs2_data[j],
+                    operands_if[i].data.rs3_data[j]
+                }),
+                .ready_out (operands_if[i].ready)
+            );
+        end
+
+        // GPR banks: per thread, three RAM copies share one write port
+        // (gpr_wr_addr) and are each read at one of the rs1/rs2/rs3
+        // addresses below.
+
+        wire [RAM_ADDRW-1:0] gpr_rd_addr_rs1;
+        wire [RAM_ADDRW-1:0] gpr_rd_addr_rs2;
+        wire [RAM_ADDRW-1:0] gpr_rd_addr_rs3;
+        wire [RAM_ADDRW-1:0] gpr_wr_addr;
+        if (ISSUE_WIS != 0) begin
+            assign gpr_wr_addr = {writeback_if[i].data.wis, writeback_if[i].data.rd};
+            assign gpr_rd_addr_rs1 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs1};
+            assign gpr_rd_addr_rs2 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs2};
+            assign gpr_rd_addr_rs3 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3};
+            // always @(posedge clk) begin
+            //     if (reset) begin
+            //         gpr_rd_addr_rs1 <= '0;
+            //         gpr_rd_addr_rs2 <= '0;
+            //         gpr_rd_addr_rs3 <= '0;
+            //     end else begin
+            //         // if (!(operands_if[i].valid && !operands_if[i].ready)) begin
+            //         if (scoreboard_if[i].valid && scoreboard_if[i].ready) begin
+            //             gpr_rd_addr_rs1 <= {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs1};
+            //             gpr_rd_addr_rs2 <= {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs2};
+            //             gpr_rd_addr_rs3 <= {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3};
+            //         end
+            //     end
+            // end
+        end else begin
+            assign gpr_wr_addr = writeback_if[i].data.rd;
+            assign gpr_rd_addr_rs1 = scoreboard_if[i].data.rs1;
+            assign gpr_rd_addr_rs2 = scoreboard_if[i].data.rs2;
+            assign gpr_rd_addr_rs3 = scoreboard_if[i].data.rs3;
+            // always @(posedge clk) begin
+            //     if (reset) begin
+            //         gpr_rd_addr_rs1 <= '0;
+            //         gpr_rd_addr_rs2 <= '0;
+            //         gpr_rd_addr_rs3 <= '0;
+            //     end else begin
+            //         // if (!(operands_if[i].valid && !operands_if[i].ready)) begin
+            //         if (scoreboard_if[i].valid && scoreboard_if[i].ready) begin
+            //             gpr_rd_addr_rs1 <= scoreboard_if[i].data.rs1;
+            //             gpr_rd_addr_rs2 <= scoreboard_if[i].data.rs2;
+            //             gpr_rd_addr_rs3 <= scoreboard_if[i].data.rs3;
+            //         end
+            //     end
+            // end
+        end
+
+    `ifdef GPR_RESET
+        // wr_enabled starts at 0 and latches to 1 once reset has been
+        // asserted; it gates the GPR write enables below.
+        reg wr_enabled = 0;
+        always @(posedge clk) begin
+            if (reset) begin
+                wr_enabled <= 1;
+            end
+        end
+    `endif
+
+        for (genvar j = 0; j < `NUM_THREADS; ++j) begin
+            VX_dp_ram #(
+                .DATAW (`XLEN),
+                .SIZE (`NUM_REGS * ISSUE_RATIO),
+            `ifdef GPR_RESET
+                .INIT_ENABLE (1),
+                .INIT_VALUE (0),
+            `endif
+                .NO_RWCHECK (1)
+            ) gpr_ram_rs1 (
+                .clk (clk),
+                .read (1'b1),
+                `UNUSED_PIN (wren),
+            `ifdef GPR_RESET
+                .write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
+            `else
+                .write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
+            `endif
+                .waddr (gpr_wr_addr),
+                .wdata (writeback_if[i].data.data[j]),
+                .raddr (gpr_rd_addr_rs1),
+                .rdata (rs1_data[j])
+            );
+
+            VX_dp_ram #(
+                .DATAW (`XLEN),
+                .SIZE (`NUM_REGS * ISSUE_RATIO),
+            `ifdef GPR_RESET
+                .INIT_ENABLE (1),
+                .INIT_VALUE (0),
+            `endif
+                .NO_RWCHECK (1)
+            ) gpr_ram_rs2(
+                .clk (clk),
+                .read (1'b1),
+                `UNUSED_PIN (wren),
+            `ifdef GPR_RESET
+                .write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
+            `else
+                .write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
+            `endif
+                .waddr (gpr_wr_addr),
+                .wdata (writeback_if[i].data.data[j]),
+                .raddr (gpr_rd_addr_rs2),
+                .rdata (rs2_data[j])
+            );
+
+            VX_dp_ram #(
+                .DATAW (`XLEN),
+                .SIZE (`NUM_REGS * ISSUE_RATIO),
+            `ifdef GPR_RESET
+                .INIT_ENABLE (1),
+                .INIT_VALUE (0),
+            `endif
+                .NO_RWCHECK (1)
+            ) gpr_ram_rs3 (
+                .clk (clk),
+                .read (1'b1),
+                `UNUSED_PIN (wren),
+            `ifdef GPR_RESET
+                .write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
+            `else
+                .write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
+            `endif
+                .waddr (gpr_wr_addr),
+                .wdata (writeback_if[i].data.data[j]),
+                .raddr (gpr_rd_addr_rs3),
+                .rdata (rs3_data[j])
+            );
+        end
+    end
+
+endmodule
+
+`endif
diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv
index fe038fb5..c63a5dcb 100644
--- a/hw/rtl/core/VX_scoreboard.sv
+++ b/hw/rtl/core/VX_scoreboard.sv
@@ -21,6 +21,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
 
 `ifdef PERF_ENABLE
     output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls,
+    output reg [`PERF_CTR_BITS-1:0] perf_scb_any_unit_uses,
     output reg [`PERF_CTR_BITS-1:0] perf_scb_fires,
     output reg [`PERF_CTR_BITS-1:0] perf_scb_any_fire_cycles,
     output reg [`PERF_CTR_BITS-1:0] perf_units_uses [`NUM_EX_UNITS],
@@ -45,6 +46,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
 
     wire [`ISSUE_WIDTH-1:0] perf_issue_stalls_per_cycle;
     wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle, perf_stalls_per_cycle_r;
+    reg [`ISSUE_WIDTH-1:0] perf_issue_any_unit_per_cycle;
+    wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_any_unit_per_cycle, perf_any_unit_per_cycle_r;
     wire [`ISSUE_WIDTH-1:0] perf_issue_fires_per_cycle;
     wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_fires_per_cycle, perf_fires_per_cycle_r;
 
@@ -53,6 +56,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
     reg [`PERF_CTR_BITS-1:0] perf_scb_empty;
 
     `POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle);
+    `POP_COUNT(perf_any_unit_per_cycle, perf_issue_any_unit_per_cycle);
     `POP_COUNT(perf_fires_per_cycle, perf_issue_fires_per_cycle);
     assign perf_any_fire_per_cycle = |perf_issue_fires_per_cycle;
 
@@ -95,6 +99,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
     // );
 
     `BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle);
+    `BUFFER(perf_any_unit_per_cycle_r, perf_any_unit_per_cycle);
     `BUFFER(perf_fires_per_cycle_r, perf_fires_per_cycle);
     `BUFFER(perf_any_fire_per_cycle_r, perf_any_fire_per_cycle);
     `BUFFER(perf_units_per_cycle_r, perf_units_per_cycle);
@@ -103,10 +108,12 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
     always @(posedge clk) begin
         if (reset) begin
             perf_scb_stalls <= '0;
+            perf_scb_any_unit_uses <= '0;
             perf_scb_fires <= '0;
             perf_scb_any_fire_cycles <= '0;
         end else begin
             perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r);
+            perf_scb_any_unit_uses <= perf_scb_any_unit_uses + `PERF_CTR_BITS'(perf_any_unit_per_cycle_r);
             perf_scb_fires <= perf_scb_fires + `PERF_CTR_BITS'(perf_fires_per_cycle_r);
             perf_scb_any_fire_cycles <= perf_scb_any_fire_cycles + `PERF_CTR_BITS'(perf_any_fire_per_cycle_r);
         end
@@ -159,27 +166,32 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
 
         always @(*) begin
             perf_issue_units_per_cycle[i] = '0;
+            perf_issue_any_unit_per_cycle[i] = '0;
             perf_issue_sfu_per_cycle[i] = '0;
             if (ibuffer_if[i].valid) begin
                 if (inuse_rd) begin
+                    perf_issue_any_unit_per_cycle[i] = '1;
                     perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
                     if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] == `EX_SFU) begin
                         perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
                     end
                 end
                 if (inuse_rs1) begin
+                    perf_issue_any_unit_per_cycle[i] = '1;
                     perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
                     if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1] == `EX_SFU) begin
                         perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
                     end
                 end
                 if (inuse_rs2) begin
+                    perf_issue_any_unit_per_cycle[i] = '1;
                     perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
                     if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2] == `EX_SFU) begin
                         perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
                     end
                 end
                 if (inuse_rs3) begin
+                    perf_issue_any_unit_per_cycle[i] = '1;
                     perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
                     if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3] == `EX_SFU) begin
                         perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
diff --git a/hw/rtl/interfaces/VX_pipeline_perf_if.sv b/hw/rtl/interfaces/VX_pipeline_perf_if.sv
index 874778b8..fc57cad9 100644
--- a/hw/rtl/interfaces/VX_pipeline_perf_if.sv
+++ b/hw/rtl/interfaces/VX_pipeline_perf_if.sv
@@ -19,6 +19,7 @@ interface VX_pipeline_perf_if ();
     wire [`PERF_CTR_BITS-1:0] sched_barrier_idles;
     wire [`PERF_CTR_BITS-1:0] ibf_stalls;
     wire [`PERF_CTR_BITS-1:0] scb_stalls;
+    wire [`PERF_CTR_BITS-1:0] scb_any_unit_uses;
     wire [`PERF_CTR_BITS-1:0] scb_fires;
     wire [`PERF_CTR_BITS-1:0] scb_any_fire_cycles;
     wire [`PERF_CTR_BITS-1:0] units_uses [`NUM_EX_UNITS];
@@ -43,6 +44,7 @@ interface VX_pipeline_perf_if ();
     modport issue (
         output ibf_stalls,
         output scb_stalls,
+        output scb_any_unit_uses,
         output scb_fires,
         output scb_any_fire_cycles,
         output units_uses,
@@ -59,6 +61,7 @@ interface VX_pipeline_perf_if ();
         input sched_stalls,
         input ibf_stalls,
         input scb_stalls,
+        input scb_any_unit_uses,
         input scb_fires,
         input scb_any_fire_cycles,
         input units_uses,