This commit is contained in:
Richard Yan
2024-04-16 23:03:04 -07:00
7 changed files with 263 additions and 5 deletions

View File

@@ -264,7 +264,7 @@
// Number of SFU units
`ifndef NUM_SFU_LANES
`define NUM_SFU_LANES `NUM_THREADS
`define NUM_SFU_LANES `MIN(`NUM_THREADS, 4)
`endif
// Size of Instruction Buffer

View File

@@ -434,6 +434,8 @@ module VX_core import VX_gpu_pkg::*; #(
$itor(pipeline_perf_if.scb_any_fire_cycles) / $itor(cycles) * 100.0);
$display("issue scoreboard: stalls total:\t%d across ISSUE_WIDTH=%d",
pipeline_perf_if.scb_stalls, `ISSUE_WIDTH);
$display("issue scoreboard: stalls by operand hazard: total %d across ISSUE_WIDTH=%d",
pipeline_perf_if.scb_any_unit_uses, `ISSUE_WIDTH);
$display("issue scoreboard: stalls by operand hazard: alu %d (%2.2f cycles per issue)",
scrb_alu_per_core,
$itor(scrb_alu_per_core) / $itor(pipeline_perf_if.dispatch_fires[`EX_ALU]));

View File

@@ -61,6 +61,7 @@ module VX_issue #(
.reset (scoreboard_reset),
`ifdef PERF_ENABLE
.perf_scb_stalls(perf_issue_if.scb_stalls),
.perf_scb_any_unit_uses(perf_issue_if.scb_any_unit_uses),
.perf_scb_fires (perf_issue_if.scb_fires),
.perf_scb_any_fire_cycles (perf_issue_if.scb_any_fire_cycles),
.perf_units_uses(perf_issue_if.units_uses),
@@ -71,8 +72,13 @@ module VX_issue #(
.scoreboard_if (scoreboard_if)
);
`ifdef GPR_DUPLICATED
VX_operands_dup #(
`else
VX_operands #(
.CORE_ID (CORE_ID)
`endif
.CORE_ID (CORE_ID),
.CACHE_ENABLE (0)
) operands (
.clk (clk),
.reset (operands_reset),

View File

@@ -13,6 +13,8 @@
`include "VX_define.vh"
`ifndef GPR_DUPLICATED
module VX_operands import VX_gpu_pkg::*; #(
parameter CORE_ID = 0,
parameter CACHE_ENABLE = 0
@@ -197,9 +199,10 @@ module VX_operands import VX_gpu_pkg::*; #(
assign stg_valid_in = scoreboard_if[i].valid && data_ready;
assign scoreboard_if[i].ready = stg_ready_in && data_ready;
// NOTE(hansung): toggle_buffer is 1-reg pipe without flow, halving
// throughput. Wouldn't this cap overall IPC? Or OK as long as
// ISSUE_WIDTH > 1?
// NOTE(hansung): Cannot use stream_buffer here for full throughput
// because data registers (rs1_data, ...) are single-buffered. This
// will probably cap IPC at 50% (notwithstanding the 1-operand-per-cycle
// limit.)
VX_toggle_buffer #(
.DATAW (DATAW)
) staging_buffer (
@@ -295,3 +298,5 @@ module VX_operands import VX_gpu_pkg::*; #(
end
endmodule
`endif

View File

@@ -0,0 +1,230 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
`ifdef GPR_DUPLICATED
// VX_operands_dup: operand-fetch stage built on a replicated register file.
//
// For every issue slot and every thread lane this module instantiates three
// VX_dp_ram instances (gpr_ram_rs1 / gpr_ram_rs2 / gpr_ram_rs3). All three are
// written with the same writeback address/data, and each one is read with a
// different source-register address, so rs1, rs2 and rs3 are looked up in
// parallel instead of sharing a single read port.
//
// Parameters:
//   CORE_ID      - core identifier; not referenced by this implementation.
//   CACHE_ENABLE - not referenced by this implementation; kept so the module
//                  has the same parameter list as VX_operands.
//
// Ports:
//   clk, reset    - clock and reset.
//   writeback_if  - per-issue-slot register writeback (write side of the GPRs).
//   scoreboard_if - per-issue-slot incoming instruction (source of read
//                   addresses and instruction metadata).
//   operands_if   - per-issue-slot outgoing instruction with rs1/rs2/rs3 data.
module VX_operands_dup import VX_gpu_pkg::*; #(
    parameter CORE_ID = 0,
    parameter CACHE_ENABLE = 0
) (
    input wire clk,
    input wire reset,

    VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
    VX_ibuffer_if.slave scoreboard_if [`ISSUE_WIDTH],
    VX_operands_if.master operands_if [`ISSUE_WIDTH]
);
    `UNUSED_PARAM (CORE_ID)
    // CACHE_ENABLE is accepted but has no effect here; mark it as
    // intentionally unused so lint does not flag the parameter.
    `UNUSED_PARAM (CACHE_ENABLE)

    // Width of the instruction metadata bundle carried through staging_buffer:
    // uuid, wis, tmask, PC, wb, ex_type, op_type, op_mod, use_PC, use_imm, imm, rd.
    localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS;
    // Address width of one register-file RAM (NUM_REGS entries per warp
    // handled by this issue slot).
    localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO);

    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin

        // Metadata staging buffer. This variant stages through
        // VX_stream_buffer rather than VX_toggle_buffer.
        // NOTE(review): presumably chosen to avoid the halved throughput
        // attributed to VX_toggle_buffer elsewhere in the project - confirm.
        VX_stream_buffer #(
            .DATAW (DATAW)
        ) staging_buffer (
            .clk (clk),
            .reset (reset),
            .valid_in (scoreboard_if[i].valid),
            .data_in ({
                scoreboard_if[i].data.uuid,
                scoreboard_if[i].data.wis,
                scoreboard_if[i].data.tmask,
                scoreboard_if[i].data.PC,
                scoreboard_if[i].data.wb,
                scoreboard_if[i].data.ex_type,
                scoreboard_if[i].data.op_type,
                scoreboard_if[i].data.op_mod,
                scoreboard_if[i].data.use_PC,
                scoreboard_if[i].data.use_imm,
                scoreboard_if[i].data.imm,
                scoreboard_if[i].data.rd
            }),
            .ready_in (scoreboard_if[i].ready),
            .valid_out (operands_if[i].valid),
            .data_out ({
                operands_if[i].data.uuid,
                operands_if[i].data.wis,
                operands_if[i].data.tmask,
                operands_if[i].data.PC,
                operands_if[i].data.wb,
                operands_if[i].data.ex_type,
                operands_if[i].data.op_type,
                operands_if[i].data.op_mod,
                operands_if[i].data.use_PC,
                operands_if[i].data.use_imm,
                operands_if[i].data.imm,
                operands_if[i].data.rd
            }),
            .ready_out (operands_if[i].ready)
        );

        // Per-thread register read data coming out of the GPR RAMs.
        wire [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data;
        wire [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data;
        wire [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data;

        // Operand data staging: one buffer per thread lane, driven by the
        // same valid_in / ready_out as staging_buffer. Its own ready_in and
        // valid_out are left unconnected; the handshake seen by the
        // neighbouring stages comes from staging_buffer only.
        // NOTE(review): this relies on VX_stream_buffer accepting and
        // draining entries identically in both instances when given the same
        // valid_in / ready_out - confirm against its implementation.
        for (genvar j = 0; j < `NUM_THREADS; ++j) begin
            VX_stream_buffer #(
                .DATAW (`XLEN + `XLEN + `XLEN)
            ) staging_data_buffer (
                .clk (clk),
                .reset (reset),
                .valid_in (scoreboard_if[i].valid),
                .data_in ({
                    rs1_data[j], rs2_data[j], rs3_data[j]
                }),
                `UNUSED_PIN (ready_in),
                `UNUSED_PIN (valid_out),
                .data_out ({
                    operands_if[i].data.rs1_data[j],
                    operands_if[i].data.rs2_data[j],
                    operands_if[i].data.rs3_data[j]
                }),
                .ready_out (operands_if[i].ready)
            );
        end

        // GPR banks

        wire [RAM_ADDRW-1:0] gpr_rd_addr_rs1;
        wire [RAM_ADDRW-1:0] gpr_rd_addr_rs2;
        wire [RAM_ADDRW-1:0] gpr_rd_addr_rs3;
        wire [RAM_ADDRW-1:0] gpr_wr_addr;

        // Read and write addresses are driven combinationally from the
        // scoreboard / writeback interfaces. When an issue slot serves more
        // than one warp (ISSUE_WIS != 0) the warp-in-slot index is prepended
        // to the register number.
        // NOTE(review): rsN_data is pushed into staging_data_buffer in the
        // same cycle as the metadata, which assumes VX_dp_ram presents rdata
        // for raddr without an extra cycle of latency - confirm.
        if (ISSUE_WIS != 0) begin
            assign gpr_wr_addr = {writeback_if[i].data.wis, writeback_if[i].data.rd};
            assign gpr_rd_addr_rs1 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs1};
            assign gpr_rd_addr_rs2 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs2};
            assign gpr_rd_addr_rs3 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3};
        end else begin
            assign gpr_wr_addr = writeback_if[i].data.rd;
            assign gpr_rd_addr_rs1 = scoreboard_if[i].data.rs1;
            assign gpr_rd_addr_rs2 = scoreboard_if[i].data.rs2;
            assign gpr_rd_addr_rs3 = scoreboard_if[i].data.rs3;
        end

    `ifdef GPR_RESET
        // Write gate: starts at 0 and is set to 1 the first time reset is
        // seen; it is never cleared afterwards. Writes to the RAMs are
        // suppressed until then.
        reg wr_enabled = 0;
        always @(posedge clk) begin
            if (reset) begin
                wr_enabled <= 1;
            end
        end
    `endif

        // Three identically-written RAMs per thread lane, one per source
        // operand. Writes are qualified by the writeback thread mask so only
        // active lanes are updated.
        for (genvar j = 0; j < `NUM_THREADS; ++j) begin
            VX_dp_ram #(
                .DATAW (`XLEN),
                .SIZE (`NUM_REGS * ISSUE_RATIO),
            `ifdef GPR_RESET
                .INIT_ENABLE (1),
                .INIT_VALUE (0),
            `endif
                .NO_RWCHECK (1)
            ) gpr_ram_rs1 (
                .clk (clk),
                .read (1'b1),
                `UNUSED_PIN (wren),
            `ifdef GPR_RESET
                .write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
            `else
                .write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
            `endif
                .waddr (gpr_wr_addr),
                .wdata (writeback_if[i].data.data[j]),
                .raddr (gpr_rd_addr_rs1),
                .rdata (rs1_data[j])
            );

            VX_dp_ram #(
                .DATAW (`XLEN),
                .SIZE (`NUM_REGS * ISSUE_RATIO),
            `ifdef GPR_RESET
                .INIT_ENABLE (1),
                .INIT_VALUE (0),
            `endif
                .NO_RWCHECK (1)
            ) gpr_ram_rs2 (
                .clk (clk),
                .read (1'b1),
                `UNUSED_PIN (wren),
            `ifdef GPR_RESET
                .write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
            `else
                .write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
            `endif
                .waddr (gpr_wr_addr),
                .wdata (writeback_if[i].data.data[j]),
                .raddr (gpr_rd_addr_rs2),
                .rdata (rs2_data[j])
            );

            VX_dp_ram #(
                .DATAW (`XLEN),
                .SIZE (`NUM_REGS * ISSUE_RATIO),
            `ifdef GPR_RESET
                .INIT_ENABLE (1),
                .INIT_VALUE (0),
            `endif
                .NO_RWCHECK (1)
            ) gpr_ram_rs3 (
                .clk (clk),
                .read (1'b1),
                `UNUSED_PIN (wren),
            `ifdef GPR_RESET
                .write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
            `else
                .write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
            `endif
                .waddr (gpr_wr_addr),
                .wdata (writeback_if[i].data.data[j]),
                .raddr (gpr_rd_addr_rs3),
                .rdata (rs3_data[j])
            );
        end
    end

endmodule
`endif

View File

@@ -21,6 +21,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls,
output reg [`PERF_CTR_BITS-1:0] perf_scb_any_unit_uses,
output reg [`PERF_CTR_BITS-1:0] perf_scb_fires,
output reg [`PERF_CTR_BITS-1:0] perf_scb_any_fire_cycles,
output reg [`PERF_CTR_BITS-1:0] perf_units_uses [`NUM_EX_UNITS],
@@ -45,6 +46,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
wire [`ISSUE_WIDTH-1:0] perf_issue_stalls_per_cycle;
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle, perf_stalls_per_cycle_r;
reg [`ISSUE_WIDTH-1:0] perf_issue_any_unit_per_cycle;
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_any_unit_per_cycle, perf_any_unit_per_cycle_r;
wire [`ISSUE_WIDTH-1:0] perf_issue_fires_per_cycle;
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_fires_per_cycle, perf_fires_per_cycle_r;
@@ -53,6 +56,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
reg [`PERF_CTR_BITS-1:0] perf_scb_empty;
`POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle);
`POP_COUNT(perf_any_unit_per_cycle, perf_issue_any_unit_per_cycle);
`POP_COUNT(perf_fires_per_cycle, perf_issue_fires_per_cycle);
assign perf_any_fire_per_cycle = |perf_issue_fires_per_cycle;
@@ -95,6 +99,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
// );
`BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle);
`BUFFER(perf_any_unit_per_cycle_r, perf_any_unit_per_cycle);
`BUFFER(perf_fires_per_cycle_r, perf_fires_per_cycle);
`BUFFER(perf_any_fire_per_cycle_r, perf_any_fire_per_cycle);
`BUFFER(perf_units_per_cycle_r, perf_units_per_cycle);
@@ -103,10 +108,12 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
always @(posedge clk) begin
if (reset) begin
perf_scb_stalls <= '0;
perf_scb_any_unit_uses <= '0;
perf_scb_fires <= '0;
perf_scb_any_fire_cycles <= '0;
end else begin
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r);
perf_scb_any_unit_uses <= perf_scb_any_unit_uses + `PERF_CTR_BITS'(perf_any_unit_per_cycle_r);
perf_scb_fires <= perf_scb_fires + `PERF_CTR_BITS'(perf_fires_per_cycle_r);
perf_scb_any_fire_cycles <= perf_scb_any_fire_cycles + `PERF_CTR_BITS'(perf_any_fire_per_cycle_r);
end
@@ -159,27 +166,32 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
always @(*) begin
perf_issue_units_per_cycle[i] = '0;
perf_issue_any_unit_per_cycle[i] = '0;
perf_issue_sfu_per_cycle[i] = '0;
if (ibuffer_if[i].valid) begin
if (inuse_rd) begin
perf_issue_any_unit_per_cycle[i] = '1;
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
end
end
if (inuse_rs1) begin
perf_issue_any_unit_per_cycle[i] = '1;
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
end
end
if (inuse_rs2) begin
perf_issue_any_unit_per_cycle[i] = '1;
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
end
end
if (inuse_rs3) begin
perf_issue_any_unit_per_cycle[i] = '1;
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;

View File

@@ -19,6 +19,7 @@ interface VX_pipeline_perf_if ();
wire [`PERF_CTR_BITS-1:0] sched_barrier_idles;
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
wire [`PERF_CTR_BITS-1:0] scb_stalls;
wire [`PERF_CTR_BITS-1:0] scb_any_unit_uses;
wire [`PERF_CTR_BITS-1:0] scb_fires;
wire [`PERF_CTR_BITS-1:0] scb_any_fire_cycles;
wire [`PERF_CTR_BITS-1:0] units_uses [`NUM_EX_UNITS];
@@ -43,6 +44,7 @@ interface VX_pipeline_perf_if ();
modport issue (
output ibf_stalls,
output scb_stalls,
output scb_any_unit_uses,
output scb_fires,
output scb_any_fire_cycles,
output units_uses,
@@ -59,6 +61,7 @@ interface VX_pipeline_perf_if ();
input sched_stalls,
input ibf_stalls,
input scb_stalls,
input scb_any_unit_uses,
input scb_fires,
input scb_any_fire_cycles,
input units_uses,