Merge branch 'rtl' of https://github.com/hansungk/vortex-private into rtl
This commit is contained in:
@@ -264,7 +264,7 @@
|
||||
|
||||
// Number of SFU units
|
||||
`ifndef NUM_SFU_LANES
|
||||
`define NUM_SFU_LANES `NUM_THREADS
|
||||
`define NUM_SFU_LANES `MIN(`NUM_THREADS, 4)
|
||||
`endif
|
||||
|
||||
// Size of Instruction Buffer
|
||||
|
||||
@@ -434,6 +434,8 @@ module VX_core import VX_gpu_pkg::*; #(
|
||||
$itor(pipeline_perf_if.scb_any_fire_cycles) / $itor(cycles) * 100.0);
|
||||
$display("issue scoreboard: stalls total:\t%d across ISSUE_WIDTH=%d",
|
||||
pipeline_perf_if.scb_stalls, `ISSUE_WIDTH);
|
||||
$display("issue scoreboard: stalls by operand hazard: total %d across ISSUE_WIDTH=%d",
|
||||
pipeline_perf_if.scb_any_unit_uses, `ISSUE_WIDTH);
|
||||
$display("issue scoreboard: stalls by operand hazard: alu %d (%2.2f cycles per issue)",
|
||||
scrb_alu_per_core,
|
||||
$itor(scrb_alu_per_core) / $itor(pipeline_perf_if.dispatch_fires[`EX_ALU]));
|
||||
|
||||
@@ -61,6 +61,7 @@ module VX_issue #(
|
||||
.reset (scoreboard_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_scb_stalls(perf_issue_if.scb_stalls),
|
||||
.perf_scb_any_unit_uses(perf_issue_if.scb_any_unit_uses),
|
||||
.perf_scb_fires (perf_issue_if.scb_fires),
|
||||
.perf_scb_any_fire_cycles (perf_issue_if.scb_any_fire_cycles),
|
||||
.perf_units_uses(perf_issue_if.units_uses),
|
||||
@@ -71,8 +72,13 @@ module VX_issue #(
|
||||
.scoreboard_if (scoreboard_if)
|
||||
);
|
||||
|
||||
`ifdef GPR_DUPLICATED
|
||||
VX_operands_dup #(
|
||||
`else
|
||||
VX_operands #(
|
||||
.CORE_ID (CORE_ID)
|
||||
`endif
|
||||
.CORE_ID (CORE_ID),
|
||||
.CACHE_ENABLE (0)
|
||||
) operands (
|
||||
.clk (clk),
|
||||
.reset (operands_reset),
|
||||
|
||||
@@ -13,6 +13,8 @@
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
`ifndef GPR_DUPLICATED
|
||||
|
||||
module VX_operands import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0,
|
||||
parameter CACHE_ENABLE = 0
|
||||
@@ -197,9 +199,10 @@ module VX_operands import VX_gpu_pkg::*; #(
|
||||
assign stg_valid_in = scoreboard_if[i].valid && data_ready;
|
||||
assign scoreboard_if[i].ready = stg_ready_in && data_ready;
|
||||
|
||||
// NOTE(hansung): toggle_buffer is 1-reg pipe without flow, halving
|
||||
// throughput. Wouldn't this cap overall IPC? Or OK as long as
|
||||
// ISSUE_WIDTH > 1?
|
||||
// NOTE(hansung): Cannot use stream_buffer here for full throughput
|
||||
// because data registers (rs1_data, ...) are single-buffered. This
|
||||
// will probably cap IPC at 50% (notwithstanding the 1-operand-per-cycle
|
||||
// limit).
|
||||
VX_toggle_buffer #(
|
||||
.DATAW (DATAW)
|
||||
) staging_buffer (
|
||||
@@ -295,3 +298,5 @@ module VX_operands import VX_gpu_pkg::*; #(
|
||||
end
|
||||
|
||||
endmodule
|
||||
|
||||
`endif
|
||||
|
||||
230
hw/rtl/core/VX_operands_dup.sv
Normal file
230
hw/rtl/core/VX_operands_dup.sv
Normal file
@@ -0,0 +1,230 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
`ifdef GPR_DUPLICATED
|
||||
|
||||
// Operand stage variant that instantiates three GPR RAMs per thread
// (gpr_ram_rs1 / gpr_ram_rs2 / gpr_ram_rs3). All three RAMs of a thread are
// written with the same address, data and write enable, and each one is read
// with its own source-register address, so rs1, rs2 and rs3 are fetched
// through separate read ports.
//
// Parameters:
//   CORE_ID      - not referenced inside this module.
//   CACHE_ENABLE - not referenced inside this module; kept so the module can
//                  be instantiated with the same parameter list as
//                  VX_operands.
//
// Ports:
//   clk, reset    - clock and reset, forwarded to the staging buffers.
//   writeback_if  - per-issue-slot writeback stream; drives the GPR write port.
//   scoreboard_if - per-issue-slot incoming instruction stream; supplies the
//                   instruction metadata and the rs1/rs2/rs3 read addresses.
//   operands_if   - per-issue-slot outgoing stream carrying the metadata plus
//                   the per-thread rs1/rs2/rs3 data.
module VX_operands_dup import VX_gpu_pkg::*; #(
    parameter CORE_ID = 0,
    parameter CACHE_ENABLE = 0
) (
    input wire              clk,
    input wire              reset,

    VX_writeback_if.slave   writeback_if [`ISSUE_WIDTH],
    VX_ibuffer_if.slave     scoreboard_if [`ISSUE_WIDTH],
    VX_operands_if.master   operands_if [`ISSUE_WIDTH]
);
    `UNUSED_PARAM (CORE_ID)
    // CACHE_ENABLE is never used below; mark it like CORE_ID so that lint
    // does not flag an unused parameter.
    `UNUSED_PARAM (CACHE_ENABLE)

    // Width of the metadata word carried by staging_buffer. The terms are
    // listed in the same order as the fields concatenated on data_in/data_out
    // (presumably one term per field; the struct widths are declared elsewhere).
    localparam DATAW = `UUID_WIDTH      // uuid
                     + ISSUE_WIS_W      // wis
                     + `NUM_THREADS     // tmask
                     + `XLEN            // PC
                     + 1                // wb
                     + `EX_BITS         // ex_type
                     + `INST_OP_BITS    // op_type
                     + `INST_MOD_BITS   // op_mod
                     + 1                // use_PC
                     + 1                // use_imm
                     + `XLEN            // imm
                     + `NR_BITS;        // rd
    localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO);

    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin

        // Staging buffer for the per-instruction metadata. This instance owns
        // the valid/ready handshake between scoreboard_if[i] and operands_if[i].
        VX_stream_buffer #(
            .DATAW (DATAW)
        ) staging_buffer (
            .clk       (clk),
            .reset     (reset),
            .valid_in  (scoreboard_if[i].valid),
            .data_in   ({
                scoreboard_if[i].data.uuid,
                scoreboard_if[i].data.wis,
                scoreboard_if[i].data.tmask,
                scoreboard_if[i].data.PC,
                scoreboard_if[i].data.wb,
                scoreboard_if[i].data.ex_type,
                scoreboard_if[i].data.op_type,
                scoreboard_if[i].data.op_mod,
                scoreboard_if[i].data.use_PC,
                scoreboard_if[i].data.use_imm,
                scoreboard_if[i].data.imm,
                scoreboard_if[i].data.rd
            }),
            .ready_in  (scoreboard_if[i].ready),
            .valid_out (operands_if[i].valid),
            .data_out  ({
                operands_if[i].data.uuid,
                operands_if[i].data.wis,
                operands_if[i].data.tmask,
                operands_if[i].data.PC,
                operands_if[i].data.wb,
                operands_if[i].data.ex_type,
                operands_if[i].data.op_type,
                operands_if[i].data.op_mod,
                operands_if[i].data.use_PC,
                operands_if[i].data.use_imm,
                operands_if[i].data.imm,
                operands_if[i].data.rd
            }),
            .ready_out (operands_if[i].ready)
        );

        // Per-thread read data coming out of the three GPR RAMs.
        wire [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data;
        wire [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data;
        wire [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data;

        // One data staging buffer per thread. It is driven by the same
        // valid_in / ready_out signals as staging_buffer above; its own
        // ready_in / valid_out outputs are left unconnected.
        // NOTE(review): this relies on the RAM read data being valid in the
        // same cycle the metadata is accepted, and on both buffers advancing
        // in lockstep - confirm against VX_dp_ram / VX_stream_buffer.
        for (genvar j = 0; j < `NUM_THREADS; ++j) begin
            VX_stream_buffer #(
                .DATAW (`XLEN + `XLEN + `XLEN)
            ) staging_data_buffer (
                .clk       (clk),
                .reset     (reset),
                .valid_in  (scoreboard_if[i].valid),
                .data_in   ({
                    rs1_data[j], rs2_data[j], rs3_data[j]
                }),
                `UNUSED_PIN (ready_in),
                `UNUSED_PIN (valid_out),
                .data_out  ({
                    operands_if[i].data.rs1_data[j],
                    operands_if[i].data.rs2_data[j],
                    operands_if[i].data.rs3_data[j]
                }),
                .ready_out (operands_if[i].ready)
            );
        end

        // GPR banks

        wire [RAM_ADDRW-1:0] gpr_rd_addr_rs1;
        wire [RAM_ADDRW-1:0] gpr_rd_addr_rs2;
        wire [RAM_ADDRW-1:0] gpr_rd_addr_rs3;
        wire [RAM_ADDRW-1:0] gpr_wr_addr;

        // Read and write addresses are driven combinationally from the
        // scoreboard / writeback interfaces. When ISSUE_WIS is non-zero the
        // wis field is prepended to the register index.
        // (A registered-address variant, latched on scoreboard_if valid &&
        // ready, was previously kept here as commented-out code.)
        if (ISSUE_WIS != 0) begin
            assign gpr_wr_addr     = {writeback_if[i].data.wis, writeback_if[i].data.rd};
            assign gpr_rd_addr_rs1 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs1};
            assign gpr_rd_addr_rs2 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs2};
            assign gpr_rd_addr_rs3 = {scoreboard_if[i].data.wis, scoreboard_if[i].data.rs3};
        end else begin
            assign gpr_wr_addr     = writeback_if[i].data.rd;
            assign gpr_rd_addr_rs1 = scoreboard_if[i].data.rs1;
            assign gpr_rd_addr_rs2 = scoreboard_if[i].data.rs2;
            assign gpr_rd_addr_rs3 = scoreboard_if[i].data.rs3;
        end

    `ifdef GPR_RESET
        // wr_enabled starts at 0 and is set the first time reset is sampled
        // high; it is never cleared afterwards. It gates every GPR write.
        reg wr_enabled = 0;
        always @(posedge clk) begin
            if (reset) begin
                wr_enabled <= 1;
            end
        end
    `endif

        for (genvar j = 0; j < `NUM_THREADS; ++j) begin

            // Write enable shared by the three RAM copies of thread j:
            // a valid writeback whose thread mask selects this thread.
        `ifdef GPR_RESET
            wire gpr_wr_en = wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j];
        `else
            wire gpr_wr_en = writeback_if[i].valid && writeback_if[i].data.tmask[j];
        `endif

            // Copy read through the rs1 address.
            VX_dp_ram #(
                .DATAW (`XLEN),
                .SIZE (`NUM_REGS * ISSUE_RATIO),
            `ifdef GPR_RESET
                .INIT_ENABLE (1),
                .INIT_VALUE (0),
            `endif
                .NO_RWCHECK (1)
            ) gpr_ram_rs1 (
                .clk   (clk),
                .read  (1'b1),
                `UNUSED_PIN (wren),
                .write (gpr_wr_en),
                .waddr (gpr_wr_addr),
                .wdata (writeback_if[i].data.data[j]),
                .raddr (gpr_rd_addr_rs1),
                .rdata (rs1_data[j])
            );

            // Copy read through the rs2 address.
            VX_dp_ram #(
                .DATAW (`XLEN),
                .SIZE (`NUM_REGS * ISSUE_RATIO),
            `ifdef GPR_RESET
                .INIT_ENABLE (1),
                .INIT_VALUE (0),
            `endif
                .NO_RWCHECK (1)
            ) gpr_ram_rs2 (
                .clk   (clk),
                .read  (1'b1),
                `UNUSED_PIN (wren),
                .write (gpr_wr_en),
                .waddr (gpr_wr_addr),
                .wdata (writeback_if[i].data.data[j]),
                .raddr (gpr_rd_addr_rs2),
                .rdata (rs2_data[j])
            );

            // Copy read through the rs3 address.
            VX_dp_ram #(
                .DATAW (`XLEN),
                .SIZE (`NUM_REGS * ISSUE_RATIO),
            `ifdef GPR_RESET
                .INIT_ENABLE (1),
                .INIT_VALUE (0),
            `endif
                .NO_RWCHECK (1)
            ) gpr_ram_rs3 (
                .clk   (clk),
                .read  (1'b1),
                `UNUSED_PIN (wren),
                .write (gpr_wr_en),
                .waddr (gpr_wr_addr),
                .wdata (writeback_if[i].data.data[j]),
                .raddr (gpr_rd_addr_rs3),
                .rdata (rs3_data[j])
            );
        end
    end

endmodule
|
||||
|
||||
`endif
|
||||
@@ -21,6 +21,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls,
|
||||
output reg [`PERF_CTR_BITS-1:0] perf_scb_any_unit_uses,
|
||||
output reg [`PERF_CTR_BITS-1:0] perf_scb_fires,
|
||||
output reg [`PERF_CTR_BITS-1:0] perf_scb_any_fire_cycles,
|
||||
output reg [`PERF_CTR_BITS-1:0] perf_units_uses [`NUM_EX_UNITS],
|
||||
@@ -45,6 +46,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
|
||||
wire [`ISSUE_WIDTH-1:0] perf_issue_stalls_per_cycle;
|
||||
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle, perf_stalls_per_cycle_r;
|
||||
reg [`ISSUE_WIDTH-1:0] perf_issue_any_unit_per_cycle;
|
||||
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_any_unit_per_cycle, perf_any_unit_per_cycle_r;
|
||||
|
||||
wire [`ISSUE_WIDTH-1:0] perf_issue_fires_per_cycle;
|
||||
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_fires_per_cycle, perf_fires_per_cycle_r;
|
||||
@@ -53,6 +56,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
reg [`PERF_CTR_BITS-1:0] perf_scb_empty;
|
||||
|
||||
`POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle);
|
||||
`POP_COUNT(perf_any_unit_per_cycle, perf_issue_any_unit_per_cycle);
|
||||
`POP_COUNT(perf_fires_per_cycle, perf_issue_fires_per_cycle);
|
||||
assign perf_any_fire_per_cycle = |perf_issue_fires_per_cycle;
|
||||
|
||||
@@ -95,6 +99,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
// );
|
||||
|
||||
`BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle);
|
||||
`BUFFER(perf_any_unit_per_cycle_r, perf_any_unit_per_cycle);
|
||||
`BUFFER(perf_fires_per_cycle_r, perf_fires_per_cycle);
|
||||
`BUFFER(perf_any_fire_per_cycle_r, perf_any_fire_per_cycle);
|
||||
`BUFFER(perf_units_per_cycle_r, perf_units_per_cycle);
|
||||
@@ -103,10 +108,12 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_scb_stalls <= '0;
|
||||
perf_scb_any_unit_uses <= '0;
|
||||
perf_scb_fires <= '0;
|
||||
perf_scb_any_fire_cycles <= '0;
|
||||
end else begin
|
||||
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r);
|
||||
perf_scb_any_unit_uses <= perf_scb_any_unit_uses + `PERF_CTR_BITS'(perf_any_unit_per_cycle_r);
|
||||
perf_scb_fires <= perf_scb_fires + `PERF_CTR_BITS'(perf_fires_per_cycle_r);
|
||||
perf_scb_any_fire_cycles <= perf_scb_any_fire_cycles + `PERF_CTR_BITS'(perf_any_fire_per_cycle_r);
|
||||
end
|
||||
@@ -159,27 +166,32 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
||||
|
||||
always @(*) begin
|
||||
perf_issue_units_per_cycle[i] = '0;
|
||||
perf_issue_any_unit_per_cycle[i] = '0;
|
||||
perf_issue_sfu_per_cycle[i] = '0;
|
||||
if (ibuffer_if[i].valid) begin
|
||||
if (inuse_rd) begin
|
||||
perf_issue_any_unit_per_cycle[i] = '1;
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
|
||||
end
|
||||
end
|
||||
if (inuse_rs1) begin
|
||||
perf_issue_any_unit_per_cycle[i] = '1;
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
|
||||
end
|
||||
end
|
||||
if (inuse_rs2) begin
|
||||
perf_issue_any_unit_per_cycle[i] = '1;
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
|
||||
end
|
||||
end
|
||||
if (inuse_rs3) begin
|
||||
perf_issue_any_unit_per_cycle[i] = '1;
|
||||
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
|
||||
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3] == `EX_SFU) begin
|
||||
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
|
||||
|
||||
@@ -19,6 +19,7 @@ interface VX_pipeline_perf_if ();
|
||||
wire [`PERF_CTR_BITS-1:0] sched_barrier_idles;
|
||||
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] scb_any_unit_uses;
|
||||
wire [`PERF_CTR_BITS-1:0] scb_fires;
|
||||
wire [`PERF_CTR_BITS-1:0] scb_any_fire_cycles;
|
||||
wire [`PERF_CTR_BITS-1:0] units_uses [`NUM_EX_UNITS];
|
||||
@@ -43,6 +44,7 @@ interface VX_pipeline_perf_if ();
|
||||
modport issue (
|
||||
output ibf_stalls,
|
||||
output scb_stalls,
|
||||
output scb_any_unit_uses,
|
||||
output scb_fires,
|
||||
output scb_any_fire_cycles,
|
||||
output units_uses,
|
||||
@@ -59,6 +61,7 @@ interface VX_pipeline_perf_if ();
|
||||
input sched_stalls,
|
||||
input ibf_stalls,
|
||||
input scb_stalls,
|
||||
input scb_any_unit_uses,
|
||||
input scb_fires,
|
||||
input scb_any_fire_cycles,
|
||||
input units_uses,
|
||||
|
||||
Reference in New Issue
Block a user