Merge remote-tracking branch 'upstream/master' into vortex2

This commit is contained in:
Hansung Kim
2024-03-07 14:45:48 -08:00
38 changed files with 553 additions and 390 deletions

View File

@@ -37,8 +37,8 @@ jobs:
script:
- rm -rf $HOME/build32 && cp -r $PWD $HOME/build32
- rm -rf $HOME/build64 && cp -r $PWD $HOME/build64
- make -C $HOME/build32
- XLEN=64 make -C $HOME/build64
- make -C $HOME/build32 > /dev/null
- XLEN=64 make -C $HOME/build64 > /dev/null
- stage: test
name: unittest
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --unittest

View File

@@ -35,7 +35,7 @@ Vortex is a full-stack open-source RISC-V GPGPU.
## Build Instructions
More detailed build instructions can be found [here](docs/install_vortex.md).
### Supported OS Platforms
- Ubuntu 18.04
- Ubuntu 18.04, 20.04
- Centos 7
### Toolchain Dependencies
- [POCL](http://portablecl.org/)
@@ -54,9 +54,9 @@ More detailed build instructions can be found [here](docs/install_vortex.md).
$ git clone --recursive https://github.com/vortexgpgpu/vortex.git
$ cd Vortex
### Install prebuilt toolchain
By default, the toolchain will install to /opt folder.
You can install the toolchain to a different directory by overriding TOOLDIR (e.g. export TOOLDIR=$HOME/tools).
By default, the toolchain will install to /opt folder which requires sudo access.
You can install the toolchain to a different location of your choice by setting TOOLDIR (e.g. export TOOLDIR=$HOME/tools).
$ export TOOLDIR=/opt
$ ./ci/toolchain_install.sh --all
$ source ./ci/toolchain_env.sh
### Build Vortex sources

View File

@@ -1,4 +0,0 @@
Release Notes!
* 07/01/2020 - LKG FPGA build - Passed basic, demo, vecadd kernels.

23
TODO
View File

@@ -1,23 +0,0 @@
Functionality:
1) vx_cl_warpSpawn()
-> To be used by pocl->ops->run
2) newlib Integration (LoadFile(""))
-> To be used by the Rhinio benchmarks
3) POCL OPS Vortex Suite
Performance:
1) Icache doesn't need SEND_MEM_REQUEST Stage
-> Blocks are never dirty, so why not evict right away
2) Branch not taken speculation
3) Runtime -02 not running on RTL, and -03 not running on RTL and Emulator
Vector:
1) Cycle accurate simulator (would require Cache Simulator)

View File

@@ -22,7 +22,7 @@ rm -f blackbox.*.cache
unittest()
{
make -C tests/unittest run
make -C hw/unittest
make -C hw/unittest > /dev/null
}
isa()
@@ -31,33 +31,36 @@ echo "begin isa tests..."
make -C tests/riscv/isa run-simx
make -C tests/riscv/isa run-rtlsim
CONFIGS="-DDPI_DISABLE" make -C tests/riscv/isa run-rtlsim
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim
make -C sim/rtlsim clean && CONFIGS="-DDPI_DISABLE" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32f
if [ "$XLEN" == "64" ]
then
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64f
make -C sim/rtlsim clean && CONFIGS="-DEXT_D_ENABLE -DFPU_FPNEW" make -C sim/rtlsim
make -C sim/rtlsim clean && CONFIGS="-DEXT_D_ENABLE -DFPU_FPNEW" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64d || true
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64fx
fi
make -C sim/rtlsim clean && make -C sim/rtlsim
# restore default prebuilt configuration
make -C sim/rtlsim clean && make -C sim/rtlsim > /dev/null
echo "isa tests done!"
}
@@ -134,15 +137,16 @@ debug()
echo "begin debugging tests..."
# test CSV trace generation
make -C sim/simx clean && DEBUG=3 make -C sim/simx
make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim
make -C sim/simx clean && DEBUG=3 make -C sim/simx > /dev/null
make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-simx-32im > run_simx.log
make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log
./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv
./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
diff trace_rtlsim.csv trace_simx.csv
make -C sim/simx clean && make -C sim/simx
make -C sim/rtlsim clean && make -C sim/rtlsim
# restore default prebuilt configuration
make -C sim/simx clean && make -C sim/simx > /dev/null
make -C sim/rtlsim clean && make -C sim/rtlsim > /dev/null
./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --perf=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --perf=1 --app=demo --args="-n1"

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python
# Copyright © 2019-2023
# Copyright 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -34,11 +34,11 @@ def monitor(stop):
break
def execute(command):
process = subprocess.Popen(command, stdout=subprocess.PIPE)
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
while True:
output = process.stdout.readline()
if output:
line = output.decode('ascii').rstrip()
line = output.decode('utf-8').rstrip()
print(">>> " + line)
process.stdout.flush()
ret = process.poll()

View File

@@ -136,6 +136,18 @@
`endif
`endif
`ifdef L2_ENABLE
`define L2_LINE_SIZE `MEM_BLOCK_SIZE
`else
`define L2_LINE_SIZE `L1_LINE_SIZE
`endif
`ifdef L3_ENABLE
`define L3_LINE_SIZE `MEM_BLOCK_SIZE
`else
`define L3_LINE_SIZE `L2_LINE_SIZE
`endif
`ifdef XLEN_64
`ifndef STARTUP_ADDR

View File

@@ -291,16 +291,11 @@
///////////////////////////////////////////////////////////////////////////////
`ifdef L2_ENABLE
`define L2_LINE_SIZE `MEM_BLOCK_SIZE
`else
`define L2_LINE_SIZE `L1_LINE_SIZE
`ifdef ICACHE_ENABLE
`define L1_ENABLE
`endif
`ifdef L3_ENABLE
`define L3_LINE_SIZE `MEM_BLOCK_SIZE
`else
`define L3_LINE_SIZE `L2_LINE_SIZE
`ifdef DCACHE_ENABLE
`define L1_ENABLE
`endif
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE

View File

@@ -130,20 +130,20 @@ module VX_cache_bypass #(
assign core_req_valid_in_nc = core_req_valid_in & core_req_nc_idxs;
wire core_req_in_fire = | (core_req_valid_in & core_req_ready_in);
wire core_req_nc_ready = ~mem_req_valid_in && mem_req_ready_out;
VX_generic_arbiter #(
.NUM_REQS (NUM_REQS),
.TYPE (PASSTHRU ? "R" : "P"),
.LOCK_ENABLE (1)
) req_arb (
) core_req_nc_arb (
.clk (clk),
.reset (reset),
.unlock (core_req_in_fire),
.requests (core_req_valid_in_nc),
.grant_index (core_req_nc_idx),
.grant_onehot (core_req_nc_sel),
.grant_valid (core_req_nc_valid)
.grant_valid (core_req_nc_valid),
.grant_unlock (core_req_nc_ready)
);
assign core_req_valid_out = core_req_valid_in & ~core_req_nc_idxs;
@@ -164,7 +164,7 @@ module VX_cache_bypass #(
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_ready_in[i] = core_req_valid_in_nc[i] ? (~mem_req_valid_in && mem_req_ready_out && core_req_nc_sel[i])
assign core_req_ready_in[i] = core_req_valid_in_nc[i] ? (core_req_nc_ready && core_req_nc_sel[i])
: core_req_ready_out[i];
end

View File

@@ -533,8 +533,9 @@ module VX_decode #(
assign decode_sched_if.valid = fetch_fire;
assign decode_sched_if.wid = fetch_if.data.wid;
assign decode_sched_if.is_wstall = is_wstall;
`ifndef L1_ENABLE
assign fetch_if.ibuf_pop = decode_if.ibuf_pop;
`endif
`ifdef DBG_TRACE_CORE_PIPELINE_VCS
always @(posedge clk) begin

View File

@@ -32,7 +32,6 @@ module VX_fetch import VX_gpu_pkg::*; #(
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_VAR (reset)
localparam ISW_WIDTH = `LOG2UP(`ISSUE_WIDTH);
wire icache_req_valid;
wire [ICACHE_ADDR_WIDTH-1:0] icache_req_addr;
@@ -44,8 +43,6 @@ module VX_fetch import VX_gpu_pkg::*; #(
wire icache_req_fire = icache_req_valid && icache_req_ready;
wire [ISW_WIDTH-1:0] schedule_isw = wid_to_isw(schedule_if.data.wid);
assign req_tag = schedule_if.data.wid;
assign {rsp_uuid, rsp_tag} = icache_bus_if.rsp_data.tag;
@@ -68,9 +65,12 @@ module VX_fetch import VX_gpu_pkg::*; #(
.rdata ({rsp_PC, rsp_tmask})
);
`ifndef L1_ENABLE
// Ensure that the ibuffer doesn't fill up.
// This resolves potential deadlock if ibuffer fills and the LSU stalls the execute stage due to pending dcache request.
// This issue is particularly prevalent when the icache and dcache is disabled and both requests share the same bus.
wire [ISSUE_ISW-1:0] schedule_isw = wid_to_isw(schedule_if.data.wid);
wire [`ISSUE_WIDTH-1:0] pending_ibuf_full;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
VX_pending_size #(
@@ -85,13 +85,16 @@ module VX_fetch import VX_gpu_pkg::*; #(
`UNUSED_PIN (empty)
);
end
wire ibuf_ready = ~pending_ibuf_full[schedule_isw];
`else
wire ibuf_ready = 1'b1;
`endif
`RUNTIME_ASSERT((!schedule_if.valid || schedule_if.data.PC != 0),
("%t: *** invalid PC=0x%0h, wid=%0d, tmask=%b (#%0d)", $time, schedule_if.data.PC, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.uuid))
// Icache Request
wire ibuf_ready = ~pending_ibuf_full[schedule_isw];
assign icache_req_valid = schedule_if.valid && ibuf_ready;
assign icache_req_addr = schedule_if.data.PC[`MEM_ADDR_WIDTH-1:2];
assign icache_req_tag = {schedule_if.data.uuid, req_tag};

View File

@@ -66,8 +66,9 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
.valid_out (ibuffer_if[i].valid),
.ready_out(ibuffer_if[i].ready)
);
`ifndef L1_ENABLE
assign decode_if.ibuf_pop[i] = ibuffer_if[i].valid && ibuffer_if[i].ready;
`endif
end
endmodule

View File

@@ -47,8 +47,6 @@ module VX_operands import VX_gpu_pkg::*; #(
reg [`NUM_THREADS-1:0] cache_tmask_n [ISSUE_RATIO-1:0];
reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n;
reg valid_out_r;
reg [DATAW-1:0] data_out_r;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data, rs2_data_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n;
@@ -60,7 +58,7 @@ module VX_operands import VX_gpu_pkg::*; #(
reg rs3_ready, rs3_ready_n;
reg data_ready, data_ready_n;
wire ready_out = operands_if[i].ready;
wire stg_valid_in, stg_ready_in;
wire is_rs1_zero = (scoreboard_if[i].data.rs1 == 0);
wire is_rs2_zero = (scoreboard_if[i].data.rs2 == 0);
@@ -85,7 +83,7 @@ module VX_operands import VX_gpu_pkg::*; #(
case (state)
STATE_IDLE: begin
if (valid_out_r && ready_out) begin
if (operands_if[i].valid && operands_if[i].ready) begin
data_ready_n = 0;
end
if (scoreboard_if[i].valid && data_ready_n == 0) begin
@@ -177,33 +175,11 @@ module VX_operands import VX_gpu_pkg::*; #(
state <= STATE_IDLE;
cache_eop <= {ISSUE_RATIO{1'b1}};
data_ready <= 0;
valid_out_r <= 0;
end else begin
state <= state_n;
cache_eop <= cache_eop_n;
data_ready <= data_ready_n;
if (~valid_out_r) begin
valid_out_r <= scoreboard_if[i].valid && data_ready;
end else if (ready_out) begin
valid_out_r <= 0;
end
end
if (~valid_out_r) begin
data_out_r <= {scoreboard_if[i].data.uuid,
scoreboard_if[i].data.wis,
scoreboard_if[i].data.tmask,
scoreboard_if[i].data.PC,
scoreboard_if[i].data.wb,
scoreboard_if[i].data.ex_type,
scoreboard_if[i].data.op_type,
scoreboard_if[i].data.op_mod,
scoreboard_if[i].data.use_PC,
scoreboard_if[i].data.use_imm,
scoreboard_if[i].data.imm,
scoreboard_if[i].data.rd};
end
gpr_rd_rid <= gpr_rd_rid_n;
gpr_rd_wis <= gpr_rd_wis_n;
rs2_ready <= rs2_ready_n;
@@ -218,8 +194,33 @@ module VX_operands import VX_gpu_pkg::*; #(
cache_tmask <= cache_tmask_n;
end
assign operands_if[i].valid = valid_out_r;
assign {operands_if[i].data.uuid,
assign stg_valid_in = scoreboard_if[i].valid && data_ready;
assign scoreboard_if[i].ready = stg_ready_in && data_ready;
VX_toggle_buffer #(
.DATAW (DATAW)
) staging_buffer (
.clk (clk),
.reset (reset),
.valid_in (stg_valid_in),
.data_in ({
scoreboard_if[i].data.uuid,
scoreboard_if[i].data.wis,
scoreboard_if[i].data.tmask,
scoreboard_if[i].data.PC,
scoreboard_if[i].data.wb,
scoreboard_if[i].data.ex_type,
scoreboard_if[i].data.op_type,
scoreboard_if[i].data.op_mod,
scoreboard_if[i].data.use_PC,
scoreboard_if[i].data.use_imm,
scoreboard_if[i].data.imm,
scoreboard_if[i].data.rd
}),
.ready_in (stg_ready_in),
.valid_out (operands_if[i].valid),
.data_out ({
operands_if[i].data.uuid,
operands_if[i].data.wis,
operands_if[i].data.tmask,
operands_if[i].data.PC,
@@ -230,13 +231,15 @@ module VX_operands import VX_gpu_pkg::*; #(
operands_if[i].data.use_PC,
operands_if[i].data.use_imm,
operands_if[i].data.imm,
operands_if[i].data.rd} = data_out_r;
operands_if[i].data.rd
}),
.ready_out (operands_if[i].ready)
);
assign operands_if[i].data.rs1_data = rs1_data;
assign operands_if[i].data.rs2_data = rs2_data;
assign operands_if[i].data.rs3_data = rs3_data;
assign scoreboard_if[i].ready = ~valid_out_r && data_ready;
// GPR banks
reg [RAM_ADDRW-1:0] gpr_rd_addr;

View File

@@ -111,7 +111,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
reg [`SFU_WIDTH-1:0] sfu_type;
always @(*) begin
case (scoreboard_if[i].data.op_type)
case (ibuffer_if[i].data.op_type)
`INST_SFU_CSRRW,
`INST_SFU_CSRRS,
`INST_SFU_CSRRC: sfu_type = `SFU_CSRS;
@@ -152,50 +152,46 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
assign perf_issue_stalls_per_cycle[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
`endif
reg [DATAW-1:0] data_out_r;
reg valid_out_r;
wire ready_out;
wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
wire operands_ready = ~(| operands_busy);
wire [3:0] ready_masks = ~{inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
wire deps_ready = (& ready_masks);
wire stg_valid_in, stg_ready_in;
assign stg_valid_in = ibuffer_if[i].valid && operands_ready;
assign ibuffer_if[i].ready = stg_ready_in && operands_ready;
wire valid_in = ibuffer_if[i].valid && deps_ready;
wire ready_in = ~valid_out_r && deps_ready;
wire [DATAW-1:0] data_in = ibuffer_if[i].data;
assign ready_out = scoreboard_if[i].ready;
VX_stream_buffer #(
.DATAW (DATAW)
) staging_buffer (
.clk (clk),
.reset (reset),
.valid_in (stg_valid_in),
.data_in (ibuffer_if[i].data),
.ready_in (stg_ready_in),
.valid_out (scoreboard_if[i].valid),
.data_out (scoreboard_if[i].data),
.ready_out (scoreboard_if[i].ready)
);
always @(posedge clk) begin
if (reset) begin
valid_out_r <= 0;
inuse_regs <= '0;
end else begin
if (writeback_fire) begin
inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0;
end
if (~valid_out_r) begin
valid_out_r <= valid_in;
end else if (ready_out) begin
if (scoreboard_if[i].data.wb) begin
inuse_regs[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= 1;
if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin
inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1;
end
end
`ifdef PERF_ENABLE
inuse_units[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= scoreboard_if[i].data.ex_type;
if (scoreboard_if[i].data.ex_type == `EX_SFU) begin
inuse_sfu[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= sfu_type;
if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin
inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= ibuffer_if[i].data.ex_type;
if (ibuffer_if[i].data.ex_type == `EX_SFU) begin
inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= sfu_type;
end
end
`endif
end
valid_out_r <= 0;
end
end
if (~valid_out_r) begin
data_out_r <= data_in;
end
end
assign ibuffer_if[i].ready = ready_in;
assign scoreboard_if[i].valid = valid_out_r;
assign scoreboard_if[i].data = data_out_r;
`ifdef SIMULATION
reg [31:0] timeout_ctr;
@@ -208,7 +204,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`ifdef DBG_TRACE_CORE_PIPELINE
`TRACE(3, ("%d: *** core%0d-scoreboard-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
~ready_masks, ibuffer_if[i].data.uuid));
operands_busy, ibuffer_if[i].data.uuid));
`endif
timeout_ctr <= timeout_ctr + 1;
end else if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin
@@ -220,7 +216,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
~ready_masks, ibuffer_if[i].data.uuid));
operands_busy, ibuffer_if[i].data.uuid));
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0,
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",

View File

@@ -36,21 +36,26 @@ interface VX_decode_if ();
logic valid;
data_t data;
logic ready;
wire [`ISSUE_WIDTH-1:0] ibuf_pop;
`ifndef L1_ENABLE
logic [`ISSUE_WIDTH-1:0] ibuf_pop;
`endif
modport master (
output valid,
output data,
input ibuf_pop,
input ready
`ifndef L1_ENABLE
, input ibuf_pop
`endif
);
modport slave (
input valid,
input data,
output ibuf_pop,
output ready
`ifndef L1_ENABLE
, output ibuf_pop
`endif
);
endinterface

View File

@@ -26,21 +26,26 @@ interface VX_fetch_if ();
logic valid;
data_t data;
logic ready;
`ifndef L1_ENABLE
logic [`ISSUE_WIDTH-1:0] ibuf_pop;
`endif
modport master (
output valid,
output data,
input ibuf_pop,
input ready
`ifndef L1_ENABLE
, input ibuf_pop
`endif
);
modport slave (
input valid,
input data,
output ibuf_pop,
output ready
`ifndef L1_ENABLE
, output ibuf_pop
`endif
);
endinterface

View File

@@ -11,6 +11,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// A bypass elastic buffer operates at full bandwidth where pop can happen if the buffer is empty but is going full
// It has the following benefits:
// + Full-bandwidth throughput
// + use only one register for storage
// It has the following limitations:
// + data_out is not registered
// + ready_in and ready_out are coupled
`include "VX_platform.vh"
`TRACING_OFF
@@ -35,29 +43,26 @@ module VX_bypass_buffer #(
assign data_out = data_in;
end else begin
reg [DATAW-1:0] buffer;
reg buffer_valid;
reg has_data;
always @(posedge clk) begin
if (reset) begin
buffer_valid <= 0;
has_data <= 0;
end else begin
if (ready_out) begin
buffer_valid <= 0;
end
if (valid_in && ~ready_out) begin
`ASSERT(!buffer_valid, ("runtime error"));
buffer_valid <= 1;
has_data <= 0;
end else if (~has_data) begin
has_data <= valid_in;
end
end
if (valid_in && ~ready_out) begin
if (~has_data) begin
buffer <= data_in;
end
end
assign ready_in = ready_out || !buffer_valid;
assign data_out = buffer_valid ? buffer : data_in;
assign valid_out = valid_in || buffer_valid;
assign ready_in = ready_out || ~has_data;
assign data_out = has_data ? buffer : data_in;
assign valid_out = valid_in || has_data;
end
endmodule

View File

@@ -22,14 +22,11 @@ module VX_cyclic_arbiter #(
input wire clk,
input wire reset,
input wire [NUM_REQS-1:0] requests,
input wire unlock,
output wire [LOG_NUM_REQS-1:0] grant_index,
output wire [NUM_REQS-1:0] grant_onehot,
output wire grant_valid
output wire grant_valid,
input wire grant_unlock
);
`UNUSED_PARAM (LOCK_ENABLE)
`UNUSED_VAR (unlock)
if (NUM_REQS == 1) begin
`UNUSED_VAR (clk)
@@ -51,7 +48,7 @@ module VX_cyclic_arbiter #(
end else begin
if (!IS_POW2 && grant_index_r == LOG_NUM_REQS'(NUM_REQS-1)) begin
grant_index_r <= '0;
end else begin
end else if (!LOCK_ENABLE || ~grant_valid || grant_unlock) begin
grant_index_r <= grant_index_r + LOG_NUM_REQS'(1);
end
end

View File

@@ -42,34 +42,33 @@ module VX_elastic_buffer #(
end else if (SIZE == 1) begin
wire stall = valid_out && ~ready_out;
VX_pipe_register #(
.DATAW (1 + DATAW),
.RESETW (1)
) pipe_register (
VX_pipe_buffer #(
.DATAW (DATAW)
) pipe_buffer (
.clk (clk),
.reset (reset),
.enable (~stall),
.data_in ({valid_in, data_in}),
.data_out ({valid_out, data_out})
.valid_in (valid_in),
.data_in (data_in),
.ready_in (ready_in),
.valid_out (valid_out),
.data_out (data_out),
.ready_out (ready_out)
);
assign ready_in = ~stall;
end else if (SIZE == 2) begin
VX_skid_buffer #(
.DATAW (DATAW),
.FULL_BW (OUT_REG != 2),
.OUT_REG (OUT_REG)
) skid_buffer (
.clk (clk),
.reset (reset),
.valid_in (valid_in),
.ready_in (ready_in),
.data_in (data_in),
.data_out (data_out),
.ready_in (ready_in),
.valid_out (valid_out),
.data_out (data_out),
.ready_out (ready_out)
);
@@ -111,10 +110,10 @@ module VX_elastic_buffer #(
.clk (clk),
.reset (reset),
.valid_in (~empty),
.ready_in (ready_out_t),
.data_in (data_out_t),
.data_out (data_out),
.ready_in (ready_out_t),
.valid_out (valid_out),
.data_out (data_out),
.ready_out (ready_out)
);

View File

@@ -21,17 +21,17 @@ module VX_fair_arbiter #(
) (
input wire clk,
input wire reset,
input wire unlock,
input wire [NUM_REQS-1:0] requests,
output wire [LOG_NUM_REQS-1:0] grant_index,
output wire [NUM_REQS-1:0] grant_onehot,
output wire grant_valid
output wire grant_valid,
input wire grant_unlock
);
if (NUM_REQS == 1) begin
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
`UNUSED_VAR (unlock)
`UNUSED_VAR (grant_unlock)
assign grant_index = '0;
assign grant_onehot = requests;
@@ -48,18 +48,14 @@ module VX_fair_arbiter #(
always @(posedge clk) begin
if (reset) begin
buffer <= '0;
end else if (!LOCK_ENABLE || unlock) begin
end else if (!LOCK_ENABLE || grant_unlock) begin
buffer <= buffer_n;
end
end
VX_priority_arbiter #(
.NUM_REQS (NUM_REQS),
.LOCK_ENABLE (LOCK_ENABLE)
.NUM_REQS (NUM_REQS)
) priority_arbiter (
.clk (clk),
.reset (reset),
.unlock (unlock),
.requests (requests_qual),
.grant_index (grant_index),
.grant_onehot (grant_onehot),

View File

@@ -22,21 +22,22 @@ module VX_generic_arbiter #(
) (
input wire clk,
input wire reset,
input wire unlock,
input wire [NUM_REQS-1:0] requests,
output wire [LOG_NUM_REQS-1:0] grant_index,
output wire [NUM_REQS-1:0] grant_onehot,
output wire grant_valid
output wire grant_valid,
input wire grant_unlock
);
if (TYPE == "P") begin
`UNUSED_PARAM (LOCK_ENABLE)
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
`UNUSED_VAR (grant_unlock)
VX_priority_arbiter #(
.NUM_REQS (NUM_REQS),
.LOCK_ENABLE (LOCK_ENABLE)
.NUM_REQS (NUM_REQS)
) priority_arbiter (
.clk (clk),
.reset (reset),
.unlock (unlock),
.requests (requests),
.grant_valid (grant_valid),
.grant_index (grant_index),
@@ -51,11 +52,11 @@ module VX_generic_arbiter #(
) rr_arbiter (
.clk (clk),
.reset (reset),
.unlock (unlock),
.requests (requests),
.grant_valid (grant_valid),
.grant_index (grant_index),
.grant_onehot (grant_onehot)
.grant_onehot (grant_onehot),
.grant_unlock (grant_unlock)
);
end else if (TYPE == "F") begin
@@ -66,11 +67,11 @@ module VX_generic_arbiter #(
) fair_arbiter (
.clk (clk),
.reset (reset),
.unlock (unlock),
.requests (requests),
.grant_valid (grant_valid),
.grant_index (grant_index),
.grant_onehot (grant_onehot)
.grant_onehot (grant_onehot),
.grant_unlock (grant_unlock)
);
end else if (TYPE == "M") begin
@@ -81,11 +82,11 @@ module VX_generic_arbiter #(
) matrix_arbiter (
.clk (clk),
.reset (reset),
.unlock (unlock),
.requests (requests),
.grant_valid (grant_valid),
.grant_index (grant_index),
.grant_onehot (grant_onehot)
.grant_onehot (grant_onehot),
.grant_unlock (grant_unlock)
);
end else if (TYPE == "C") begin
@@ -96,11 +97,11 @@ module VX_generic_arbiter #(
) cyclic_arbiter (
.clk (clk),
.reset (reset),
.unlock (unlock),
.requests (requests),
.grant_valid (grant_valid),
.grant_index (grant_index),
.grant_onehot (grant_onehot)
.grant_onehot (grant_onehot),
.grant_unlock (grant_unlock)
);
end else begin

View File

@@ -21,17 +21,17 @@ module VX_matrix_arbiter #(
) (
input wire clk,
input wire reset,
input wire unlock,
input wire [NUM_REQS-1:0] requests,
output wire [LOG_NUM_REQS-1:0] grant_index,
output wire [NUM_REQS-1:0] grant_onehot,
output wire grant_valid
output wire grant_valid,
input wire grant_unlock
);
if (NUM_REQS == 1) begin
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
`UNUSED_VAR (unlock)
`UNUSED_VAR (grant_unlock)
assign grant_index = '0;
assign grant_onehot = requests;
@@ -71,18 +71,18 @@ module VX_matrix_arbiter #(
end
if (LOCK_ENABLE == 0) begin
`UNUSED_VAR (unlock)
`UNUSED_VAR (grant_unlock)
assign grant_onehot = grant_unqual;
end else begin
reg [NUM_REQS-1:0] grant_unqual_prev;
always @(posedge clk) begin
if (reset) begin
grant_unqual_prev <= '0;
end else if (unlock) begin
end else if (grant_unlock) begin
grant_unqual_prev <= grant_unqual;
end
end
assign grant_onehot = unlock ? grant_unqual : grant_unqual_prev;
assign grant_onehot = grant_unlock ? grant_unqual : grant_unqual_prev;
end
VX_onehot_encoder #(

View File

@@ -46,18 +46,20 @@ input wire clk,
wire [LOG_NUM_REQS-1:0] grant_index;
wire grant_valid;
wire rsp_fire;
wire grant_ready;
VX_priority_arbiter #(
.NUM_REQS (NUM_REQS)
VX_generic_arbiter #(
.NUM_REQS (NUM_REQS),
.LOCK_ENABLE (1),
.TYPE ("P")
) arbiter (
.clk (clk),
.reset (reset),
.unlock (rsp_fire),
.requests (rsp_valid_in),
.grant_valid (grant_valid),
.grant_index (grant_index),
`UNUSED_PIN (grant_onehot)
`UNUSED_PIN (grant_onehot),
.grant_unlock(grant_ready)
);
reg [NUM_REQS-1:0] rsp_valid_sel;
@@ -78,7 +80,7 @@ input wire clk,
end
end
assign rsp_fire = grant_valid && rsp_ready_unqual;
assign grant_ready = rsp_ready_unqual;
VX_elastic_buffer #(
.DATAW (NUM_REQS + TAG_WIDTH + (NUM_REQS * DATA_WIDTH)),

View File

@@ -0,0 +1,63 @@
// Copyright 2024 blaise
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// A pipelined elastic buffer operates at full bandwidth where push can happen if the buffer is not empty but is going empty
// It has the following benefits:
// + Full-bandwidth throughput
// + use only one register for storage
// + data_out is fully registered
// It has the following limitations:
// + ready_in and ready_out are coupled
`include "VX_platform.vh"
`TRACING_OFF
module VX_pipe_buffer #(
parameter DATAW = 1,
parameter PASSTHRU = 0
) (
input wire clk,
input wire reset,
input wire valid_in,
output wire ready_in,
input wire [DATAW-1:0] data_in,
output wire [DATAW-1:0] data_out,
input wire ready_out,
output wire valid_out
);
if (PASSTHRU != 0) begin
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
assign ready_in = ready_out;
assign valid_out = valid_in;
assign data_out = data_in;
end else begin
wire stall = valid_out && ~ready_out;
VX_pipe_register #(
.DATAW (1 + DATAW),
.RESETW (1)
) pipe_register (
.clk (clk),
.reset (reset),
.enable (~stall),
.data_in ({valid_in, data_in}),
.data_out ({valid_out, data_out})
);
assign ready_in = ~stall;
end
endmodule
`TRACING_ON

View File

@@ -16,22 +16,13 @@
`TRACING_OFF
module VX_priority_arbiter #(
parameter NUM_REQS = 1,
parameter LOCK_ENABLE = 0,
parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS)
) (
input wire clk,
input wire reset,
input wire [NUM_REQS-1:0] requests,
input wire unlock,
output wire [LOG_NUM_REQS-1:0] grant_index,
output wire [NUM_REQS-1:0] grant_onehot,
output wire grant_valid
);
`UNUSED_PARAM (LOCK_ENABLE)
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
`UNUSED_VAR (unlock)
if (NUM_REQS == 1) begin
assign grant_index = '0;

View File

@@ -22,17 +22,17 @@ module VX_rr_arbiter #(
) (
input wire clk,
input wire reset,
input wire unlock,
input wire [NUM_REQS-1:0] requests,
output wire [LOG_NUM_REQS-1:0] grant_index,
output wire [NUM_REQS-1:0] grant_onehot,
output wire grant_valid
output wire grant_valid,
input wire grant_unlock
);
if (NUM_REQS == 1) begin
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
`UNUSED_VAR (unlock)
`UNUSED_VAR (grant_unlock)
assign grant_index = '0;
assign grant_onehot = requests;
@@ -55,7 +55,7 @@ module VX_rr_arbiter #(
always @(posedge clk) begin
if (reset) begin
state <= '0;
end else if (!LOCK_ENABLE || unlock) begin
end else if (!LOCK_ENABLE || grant_unlock) begin
state <= grant_index_r;
end
end
@@ -85,7 +85,7 @@ module VX_rr_arbiter #(
always @(posedge clk) begin
if (reset) begin
state <= '0;
end else if (!LOCK_ENABLE || unlock) begin
end else if (!LOCK_ENABLE || grant_unlock) begin
state <= grant_index_r;
end
end
@@ -121,7 +121,7 @@ module VX_rr_arbiter #(
always @(posedge clk) begin
if (reset) begin
state <= '0;
end else if (!LOCK_ENABLE || unlock) begin
end else if (!LOCK_ENABLE || grant_unlock) begin
state <= grant_index_r;
end
end
@@ -165,7 +165,7 @@ module VX_rr_arbiter #(
always @(posedge clk) begin
if (reset) begin
state <= '0;
end else if (!LOCK_ENABLE || unlock) begin
end else if (!LOCK_ENABLE || grant_unlock) begin
state <= grant_index_r;
end
end
@@ -219,7 +219,7 @@ module VX_rr_arbiter #(
always @(posedge clk) begin
if (reset) begin
state <= '0;
end else if (!LOCK_ENABLE || unlock) begin
end else if (!LOCK_ENABLE || grant_unlock) begin
state <= grant_index_r;
end
end
@@ -285,7 +285,7 @@ module VX_rr_arbiter #(
always @(posedge clk) begin
if (reset) begin
state <= '0;
end else if (!LOCK_ENABLE || unlock) begin
end else if (!LOCK_ENABLE || grant_unlock) begin
state <= grant_index_r;
end
end
@@ -365,7 +365,7 @@ module VX_rr_arbiter #(
always @(posedge clk) begin
if (reset) begin
state <= '0;
end else if (!LOCK_ENABLE || unlock) begin
end else if (!LOCK_ENABLE || grant_unlock) begin
state <= grant_index_r;
end
end
@@ -399,7 +399,7 @@ module VX_rr_arbiter #(
always @(posedge clk) begin
if (reset) begin
pointer_reg <= {NUM_REQS{1'b1}};
end else if (!LOCK_ENABLE || unlock) begin
end else if (!LOCK_ENABLE || grant_unlock) begin
if (|req_masked) begin
pointer_reg <= mask_higher_pri_regs;
end else if (|requests) begin
@@ -443,7 +443,7 @@ module VX_rr_arbiter #(
always @(posedge clk) begin
if (reset) begin
state <= '0;
end else if (!LOCK_ENABLE || unlock) begin
end else if (!LOCK_ENABLE || grant_unlock) begin
state <= grant_index_r;
end
end

View File

@@ -17,6 +17,7 @@
module VX_skid_buffer #(
parameter DATAW = 32,
parameter PASSTHRU = 0,
parameter FULL_BW = 0,
parameter OUT_REG = 0
) (
input wire clk,
@@ -30,8 +31,6 @@ module VX_skid_buffer #(
input wire ready_out,
output wire valid_out
);
`STATIC_ASSERT ((OUT_REG <= 2), ("invalid parameter"))
if (PASSTHRU != 0) begin
`UNUSED_VAR (clk)
@@ -41,112 +40,36 @@ module VX_skid_buffer #(
assign data_out = data_in;
assign ready_in = ready_out;
end else if (OUT_REG == 0) begin
end else if (FULL_BW != 0) begin
reg [1:0][DATAW-1:0] shift_reg;
reg valid_out_r, ready_in_r, rd_ptr_r;
wire push = valid_in && ready_in;
wire pop = valid_out_r && ready_out;
always @(posedge clk) begin
if (reset) begin
valid_out_r <= 0;
ready_in_r <= 1;
rd_ptr_r <= 1;
end else begin
if (push) begin
if (!pop) begin
ready_in_r <= rd_ptr_r;
valid_out_r <= 1;
end
end else if (pop) begin
ready_in_r <= 1;
valid_out_r <= rd_ptr_r;
end
rd_ptr_r <= rd_ptr_r ^ (push ^ pop);
end
end
always @(posedge clk) begin
if (push) begin
shift_reg[1] <= shift_reg[0];
shift_reg[0] <= data_in;
end
end
assign ready_in = ready_in_r;
assign valid_out = valid_out_r;
assign data_out = shift_reg[rd_ptr_r];
end else if (OUT_REG == 1) begin
// Full-bandwidth operation: input is consummed every cycle.
// However, data_out register has an additional multiplexer.
reg [DATAW-1:0] data_out_r;
reg [DATAW-1:0] buffer;
reg valid_out_r;
reg use_buffer;
wire push = valid_in && ready_in;
wire stall_out = valid_out_r && ~ready_out;
always @(posedge clk) begin
if (reset) begin
valid_out_r <= 0;
use_buffer <= 0;
end else begin
if (ready_out) begin
use_buffer <= 0;
end else if (valid_in && valid_out) begin
use_buffer <= 1;
end
if (~stall_out) begin
valid_out_r <= valid_in || use_buffer;
end
end
end
always @(posedge clk) begin
if (push) begin
buffer <= data_in;
end
if (~stall_out) begin
data_out_r <= use_buffer ? buffer : data_in;
end
end
assign ready_in = ~use_buffer;
assign valid_out = valid_out_r;
assign data_out = data_out_r;
VX_stream_buffer #(
.DATAW (DATAW),
.OUT_REG (OUT_REG)
) stream_buffer (
.clk (clk),
.reset (reset),
.valid_in (valid_in),
.data_in (data_in),
.ready_in (ready_in),
.valid_out (valid_out),
.data_out (data_out),
.ready_out (ready_out)
);
end else begin
// Half-bandwidth operation: input is consummed every other cycle.
// However, data_out register has no additional multiplexer.
reg [DATAW-1:0] data_out_r;
reg has_data;
always @(posedge clk) begin
if (reset) begin
has_data <= 0;
end else begin
if (~has_data) begin
has_data <= valid_in;
end else if (ready_out) begin
has_data <= 0;
end
end
if (~has_data) begin
data_out_r <= data_in;
end
end
assign ready_in = ~has_data;
assign valid_out = has_data;
assign data_out = data_out_r;
VX_toggle_buffer #(
.DATAW (DATAW)
) toggle_buffer (
.clk (clk),
.reset (reset),
.valid_in (valid_in),
.data_in (data_in),
.ready_in (ready_in),
.valid_out (valid_out),
.data_out (data_out),
.ready_out (ready_out)
);
end

View File

@@ -19,7 +19,6 @@ module VX_stream_arb #(
parameter NUM_OUTPUTS = 1,
parameter DATAW = 1,
parameter `STRING ARBITER = "P",
parameter LOCK_ENABLE = 1,
parameter MAX_FANOUT = `MAX_FANOUT,
parameter OUT_REG = 0 ,
parameter NUM_REQS = (NUM_INPUTS + NUM_OUTPUTS - 1) / NUM_OUTPUTS,
@@ -57,7 +56,6 @@ module VX_stream_arb #(
.NUM_OUTPUTS (1),
.DATAW (DATAW),
.ARBITER (ARBITER),
.LOCK_ENABLE (LOCK_ENABLE),
.MAX_FANOUT (MAX_FANOUT),
.OUT_REG (OUT_REG)
) arb_slice (
@@ -102,7 +100,6 @@ module VX_stream_arb #(
.NUM_OUTPUTS (1),
.DATAW (DATAW),
.ARBITER (ARBITER),
.LOCK_ENABLE (LOCK_ENABLE),
.MAX_FANOUT (MAX_FANOUT),
.OUT_REG (OUT_REG)
) fanout_slice_arb (
@@ -129,7 +126,6 @@ module VX_stream_arb #(
.NUM_OUTPUTS (1),
.DATAW (DATAW + LOG_NUM_REQS2),
.ARBITER (ARBITER),
.LOCK_ENABLE (LOCK_ENABLE),
.MAX_FANOUT (MAX_FANOUT),
.OUT_REG (OUT_REG)
) fanout_join_arb (
@@ -158,25 +154,25 @@ module VX_stream_arb #(
wire arb_valid;
wire [NUM_REQS_W-1:0] arb_index;
wire [NUM_REQS-1:0] arb_onehot;
wire arb_unlock;
wire arb_ready;
VX_generic_arbiter #(
.NUM_REQS (NUM_REQS),
.LOCK_ENABLE (LOCK_ENABLE),
.LOCK_ENABLE (1),
.TYPE (ARBITER)
) arbiter (
.clk (clk),
.reset (reset),
.requests (valid_in),
.unlock (arb_unlock),
.grant_valid (arb_valid),
.grant_index (arb_index),
.grant_onehot (arb_onehot)
.grant_onehot (arb_onehot),
.grant_unlock (arb_ready)
);
assign valid_in_r = arb_valid;
assign data_in_r = data_in[arb_index];
assign arb_unlock = | (valid_in_r & ready_in_r);
assign arb_ready = ready_in_r;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign ready_in[i] = ready_in_r & arb_onehot[i];
@@ -217,7 +213,6 @@ module VX_stream_arb #(
.NUM_OUTPUTS (BATCH_SIZE),
.DATAW (DATAW),
.ARBITER (ARBITER),
.LOCK_ENABLE (LOCK_ENABLE),
.MAX_FANOUT (MAX_FANOUT),
.OUT_REG (OUT_REG)
) arb_slice (
@@ -252,7 +247,6 @@ module VX_stream_arb #(
.NUM_OUTPUTS (NUM_BATCHES),
.DATAW (DATAW),
.ARBITER (ARBITER),
.LOCK_ENABLE (LOCK_ENABLE),
.MAX_FANOUT (MAX_FANOUT),
.OUT_REG (OUT_REG)
) fanout_fork_arb (
@@ -280,7 +274,6 @@ module VX_stream_arb #(
.NUM_OUTPUTS (BATCH_SIZE),
.DATAW (DATAW),
.ARBITER (ARBITER),
.LOCK_ENABLE (LOCK_ENABLE),
.MAX_FANOUT (MAX_FANOUT),
.OUT_REG (OUT_REG)
) fanout_slice_arb (
@@ -305,24 +298,24 @@ module VX_stream_arb #(
wire [NUM_OUTPUTS-1:0] arb_requests;
wire arb_valid;
wire [NUM_OUTPUTS-1:0] arb_onehot;
wire arb_unlock;
wire arb_ready;
VX_generic_arbiter #(
.NUM_REQS (NUM_OUTPUTS),
.LOCK_ENABLE (LOCK_ENABLE),
.LOCK_ENABLE (1),
.TYPE (ARBITER)
) arbiter (
.clk (clk),
.reset (reset),
.requests (arb_requests),
.unlock (arb_unlock),
.grant_valid (arb_valid),
`UNUSED_PIN (grant_index),
.grant_onehot (arb_onehot)
.grant_onehot (arb_onehot),
.grant_unlock (arb_ready)
);
assign arb_requests = ready_in_r;
assign arb_unlock = | (valid_in & ready_in);
assign arb_ready = valid_in[0];
assign ready_in = arb_valid;
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin

View File

@@ -0,0 +1,128 @@
// Copyright 2024 blaise
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// A stream elastic buffer operates at full-bandwidth where push and pop can happen simultaneously
// It has the following benefits:
// + full-bandwidth throughput
// + ready_in and ready_out are decoupled
// + data_out can be fully registered
// It has the following limitations:
// - requires two registers for storage
`include "VX_platform.vh"
`TRACING_OFF
module VX_stream_buffer #(
parameter DATAW = 1,
parameter OUT_REG = 0,
parameter PASSTHRU = 0
) (
input wire clk,
input wire reset,
input wire valid_in,
output wire ready_in,
input wire [DATAW-1:0] data_in,
output wire [DATAW-1:0] data_out,
input wire ready_out,
output wire valid_out
);
if (PASSTHRU != 0) begin
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
assign ready_in = ready_out;
assign valid_out = valid_in;
assign data_out = data_in;
end else begin
if (OUT_REG != 0) begin
reg [DATAW-1:0] data_out_r;
reg [DATAW-1:0] buffer;
reg valid_out_r;
reg use_buffer;
wire push = valid_in && ready_in;
wire stall_out = valid_out_r && ~ready_out;
always @(posedge clk) begin
if (reset) begin
valid_out_r <= 0;
use_buffer <= 0;
end else begin
if (ready_out) begin
use_buffer <= 0;
end else if (valid_in && valid_out) begin
use_buffer <= 1;
end
if (~stall_out) begin
valid_out_r <= valid_in || use_buffer;
end
end
end
always @(posedge clk) begin
if (push) begin
buffer <= data_in;
end
if (~stall_out) begin
data_out_r <= use_buffer ? buffer : data_in;
end
end
assign ready_in = ~use_buffer;
assign valid_out = valid_out_r;
assign data_out = data_out_r;
end else begin
reg [1:0][DATAW-1:0] shift_reg;
reg valid_out_r, ready_in_r, rd_ptr_r;
wire push = valid_in && ready_in;
wire pop = valid_out_r && ready_out;
always @(posedge clk) begin
if (reset) begin
valid_out_r <= 0;
ready_in_r <= 1;
rd_ptr_r <= 1;
end else begin
if (push) begin
if (!pop) begin
ready_in_r <= rd_ptr_r;
valid_out_r <= 1;
end
end else if (pop) begin
ready_in_r <= 1;
valid_out_r <= rd_ptr_r;
end
rd_ptr_r <= rd_ptr_r ^ (push ^ pop);
end
end
always @(posedge clk) begin
if (push) begin
shift_reg[1] <= shift_reg[0];
shift_reg[0] <= data_in;
end
end
assign ready_in = ready_in_r;
assign valid_out = valid_out_r;
assign data_out = shift_reg[rd_ptr_r];
end
end
endmodule
`TRACING_ON

View File

@@ -21,7 +21,6 @@ module VX_stream_xbar #(
parameter IN_WIDTH = `LOG2UP(NUM_INPUTS),
parameter OUT_WIDTH = `LOG2UP(NUM_OUTPUTS),
parameter ARBITER = "P",
parameter LOCK_ENABLE = 0,
parameter OUT_REG = 0,
parameter MAX_FANOUT = `MAX_FANOUT,
parameter PERF_CTR_BITS = `CLOG2(NUM_INPUTS+1)
@@ -66,7 +65,6 @@ module VX_stream_xbar #(
.NUM_OUTPUTS (1),
.DATAW (DATAW),
.ARBITER (ARBITER),
.LOCK_ENABLE (LOCK_ENABLE),
.MAX_FANOUT (MAX_FANOUT),
.OUT_REG (OUT_REG)
) xbar_arb (
@@ -95,7 +93,6 @@ module VX_stream_xbar #(
.NUM_OUTPUTS (1),
.DATAW (DATAW),
.ARBITER (ARBITER),
.LOCK_ENABLE (LOCK_ENABLE),
.MAX_FANOUT (MAX_FANOUT),
.OUT_REG (OUT_REG)
) xbar_arb (

View File

@@ -0,0 +1,70 @@
// Copyright 2024 blaise
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// A toggle elastic buffer operates at half-bandwidth where push can only trigger after pop
// It has the following benefits:
// + use only one register for storage
// + ready_in and ready_out are decoupled
// + data_out is fully registered
// It has the following limitations:
// - Half-bandwidth throughput
`include "VX_platform.vh"
`TRACING_OFF
module VX_toggle_buffer #(
parameter DATAW = 1,
parameter PASSTHRU = 0
) (
input wire clk,
input wire reset,
input wire valid_in,
output wire ready_in,
input wire [DATAW-1:0] data_in,
output wire [DATAW-1:0] data_out,
input wire ready_out,
output wire valid_out
);
if (PASSTHRU != 0) begin
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
assign ready_in = ready_out;
assign valid_out = valid_in;
assign data_out = data_in;
end else begin
reg [DATAW-1:0] buffer;
reg has_data;
always @(posedge clk) begin
if (reset) begin
has_data <= 0;
end else begin
if (~has_data) begin
has_data <= valid_in;
end else if (ready_out) begin
has_data <= 0;
end
end
if (~has_data) begin
buffer <= data_in;
end
end
assign ready_in = ~has_data;
assign valid_out = has_data;
assign data_out = buffer;
end
endmodule
`TRACING_ON

View File

@@ -64,8 +64,8 @@ Cluster::Cluster(const SimContext& ctx,
!L2_ENABLED,
log2ceil(L2_CACHE_SIZE),// C
log2ceil(MEM_BLOCK_SIZE),// L
log2ceil(L2_NUM_WAYS), // W
0, // A
log2ceil(L1_LINE_SIZE), // W
log2ceil(L2_NUM_WAYS), // A
log2ceil(L2_NUM_BANKS), // B
XLEN, // address bits
1, // number of ports

View File

@@ -210,7 +210,7 @@ void Core::schedule() {
void Core::fetch() {
perf_stats_.ifetch_latency += pending_ifetches_;
// handle icache reponse
// handle icache response
auto& icache_rsp_port = icache_rsp_ports.at(0);
if (!icache_rsp_port.empty()){
auto& mem_rsp = icache_rsp_port.front();

View File

@@ -207,7 +207,7 @@ void LsuUnit::tick() {
for (uint32_t t = 1; t < num_lanes_; ++t) {
if (!trace->tmask.test(t0 + t))
continue;
auto mem_addr = trace_data->mem_addrs.at(t).addr & ~addr_mask;
auto mem_addr = trace_data->mem_addrs.at(t + t0).addr & ~addr_mask;
matches += (addr0 == mem_addr);
}
#ifdef LSU_DUP_ENABLE
@@ -229,7 +229,7 @@ void LsuUnit::tick() {
continue;
auto& dcache_req_port = core_->smem_demuxs_.at(t)->ReqIn;
auto mem_addr = trace_data->mem_addrs.at(t);
auto mem_addr = trace_data->mem_addrs.at(t + t0);
auto type = core_->get_addr_type(mem_addr.addr);
MemReq mem_req;

View File

@@ -339,7 +339,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
break;
}
case 1: {
// RV64I: SLLI
// RV32I: SLLI
rddata[t].i = rsdata[t][0].i << immsrc;
break;
}
@@ -360,11 +360,11 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
}
case 5: {
if (func7) {
// RV64I: SRAI
// RV32I: SRAI
Word result = rsdata[t][0].i >> immsrc;
rddata[t].i = result;
} else {
// RV64I: SRLI
// RV32I: SRLI
Word result = rsdata[t][0].u >> immsrc;
rddata[t].i = result;
}

View File

@@ -34,7 +34,7 @@ static void show_usage() {
uint32_t num_threads = NUM_THREADS;
uint32_t num_warps = NUM_WARPS;
uint32_t num_cores = NUM_CORES;
bool showStats = false;;
bool showStats = false;
bool riscv_test = false;
const char* program = nullptr;

View File

@@ -33,8 +33,8 @@ ProcessorImpl::ProcessorImpl(const Arch& arch)
!L3_ENABLED,
log2ceil(L3_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // L
log2ceil(L3_NUM_WAYS), // W
0, // A
log2ceil(L2_LINE_SIZE), // W
log2ceil(L3_NUM_WAYS), // A
log2ceil(L3_NUM_BANKS), // B
XLEN, // address bits
1, // number of ports
@@ -58,7 +58,7 @@ ProcessorImpl::ProcessorImpl(const Arch& arch)
l3cache_->CoreRspPorts.at(i).bind(&clusters_.at(i)->mem_rsp_port);
}
// set up memory perf recording
// set up memory profiling
memsim_->MemReqPort.tx_callback([&](const MemReq& req, uint64_t cycle){
__unused (cycle);
perf_mem_reads_ += !req.write;

View File

@@ -44,7 +44,7 @@ Socket::Socket(const SimContext& ctx,
XLEN, // address bits
1, // number of ports
1, // number of inputs
true, // write-through
false, // write-through
false, // write response
(uint8_t)arch.num_warps(), // mshr
2, // pipeline latency