This commit is contained in:
felsabbagh3
2020-06-29 23:00:53 -07:00
18 changed files with 160 additions and 47 deletions

View File

@@ -1,23 +1,34 @@
# Vortex RISC-V GPGPU
Vortex currently supports the RISC-V RV32I ISA
Vortex is a full-system RISCV-based GPGPU processor.
/benchmarks contains test benchmarks
Specifications
--------------
/docs contains documentation.
- Support RISC-V RV32I ISA
- Fully scalable: 1 to 16 cores with optional L2 and L3 caches
- OpenCL 1.2 Support
- FPGA target: Intel Arria 10 @ 200 MHz peak Freq
/hw contains hardware sources.
Directory structure
-------------------
/driver contains the driver software.
- benchmarks: OpenCL and RISC-V benchmarks
- docs: documentation.
/runtime contains the kernel runtime software.
- hw: hardware sources.
/SimX contains a cycle-approximate simulator for Vortex.
- driver: driver software.
/evaluation contains the synthesis/runtime reports.
- runtime: runtime software for kernels.
Basic Instructions to run OpenCL Benchmarks on Vortex
-----------------------------------------------------
- simX: Vortex cycle-approximate simulator.
- evaluation: synthesis and performance data.
Basic Installation
------------------
Install development tools
@@ -55,8 +66,8 @@ Install Vortex
$ cd Vortex
$ make
Run SGEMM OpenCL Benchmark
Quick Test running SGEMM kernel
$ cd Vortex/benchmarks/opencl/sgemm
$ cd /Vortex/benchmarks/opencl/sgemm
$ make
$ make run

View File

@@ -13,11 +13,11 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM
DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
DBG_FLAGS += $(DBG_PRINT_FLAGS)
#DBG_FLAGS += $(DBG_PRINT_FLAGS)
DBG_FLAGS += -DDBG_CORE_REQ_INFO
#CONFIGS += -DNUM_CLUSTERS=2 -DNUM_CORES=4
#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=4
#CONFIGS += -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1
#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1
#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=2
#DEBUG=1
@@ -38,8 +38,7 @@ RTL_INCLUDE = -I../../hw/rtl -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS)
VL_FLAGS += -Wno-DECLFILENAME
VL_FLAGS += --x-initial unique
VL_FLAGS += --x-assign unique
VL_FLAGS += --x-initial unique --x-assign unique
# Enable Verilator multithreaded simulation
#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')

View File

@@ -2,7 +2,7 @@ ASE_BUILD_DIR=build_ase
FPGA_BUILD_DIR=build_fpga
all: ase-2c
all: ase-1c
ase-1c: setup-ase-1c
make -C $(ASE_BUILD_DIR)_1c
@@ -10,47 +10,72 @@ ase-1c: setup-ase-1c
ase-2c: setup-ase-2c
make -C $(ASE_BUILD_DIR)_2c
ase-4c: setup-ase-4c
make -C $(ASE_BUILD_DIR)_4c
setup-ase-1c: $(ASE_BUILD_DIR)_1c/Makefile
setup-ase-2c: $(ASE_BUILD_DIR)_2c/Makefile
setup-ase-4c: $(ASE_BUILD_DIR)_4c/Makefile
$(ASE_BUILD_DIR)_1c/Makefile:
afu_sim_setup -s sources_1c.txt $(ASE_BUILD_DIR)_1c
$(ASE_BUILD_DIR)_2c/Makefile:
afu_sim_setup -s sources_2c.txt $(ASE_BUILD_DIR)_2c
$(ASE_BUILD_DIR)_4c/Makefile:
afu_sim_setup -s sources_4c.txt $(ASE_BUILD_DIR)_4c
fpga-1c: setup-fpga-1c
cd $(FPGA_BUILD_DIR)_1c && qsub-synth
fpga-2c: setup-fpga-2c
cd $(FPGA_BUILD_DIR)_2c && qsub-synth
fpga-4c: setup-fpga-4c
cd $(FPGA_BUILD_DIR)_4c && qsub-synth
setup-fpga-1c: $(FPGA_BUILD_DIR)_1c/build/dcp.qpf
setup-fpga-2c: $(FPGA_BUILD_DIR)_2c/build/dcp.qpf
setup-fpga-4c: $(FPGA_BUILD_DIR)_4c/build/dcp.qpf
$(FPGA_BUILD_DIR)_1c/build/dcp.qpf:
afu_synth_setup -s sources_1c.txt $(FPGA_BUILD_DIR)_1c
$(FPGA_BUILD_DIR)_2c/build/dcp.qpf:
afu_synth_setup -s sources_2c.txt $(FPGA_BUILD_DIR)_2c
$(FPGA_BUILD_DIR)_4c/build/dcp.qpf:
afu_synth_setup -s sources_4c.txt $(FPGA_BUILD_DIR)_4c
run-ase-1c:
cd $(ASE_BUILD_DIR)_1c && make sim
run-ase-2c:
cd $(ASE_BUILD_DIR)_2c && make sim
run-ase-4c:
cd $(ASE_BUILD_DIR)_4c && make sim
clean-ase-1c:
rm -rf $(ASE_BUILD_DIR)_1c
clean-ase-2c:
rm -rf $(ASE_BUILD_DIR)_2c
clean-ase-4c:
rm -rf $(ASE_BUILD_DIR)_4c
clean-fpga-1c:
rm -rf $(FPGA_BUILD_DIR)_1c
clean-fpga-2c:
rm -rf $(FPGA_BUILD_DIR)_2c
clean-fpga-4c:
rm -rf $(FPGA_BUILD_DIR)_4c

View File

@@ -62,6 +62,7 @@ make ase
# tests
./run_ase.sh build_ase_1c ../../driver/tests/basic/basic
./run_ase.sh build_ase_1c ../../driver/tests/demo/demo
./run_ase.sh build_ase_1c ../../benchmarks/opencl/vecadd/vecadd
# modify "vsim_run.tcl" to dump VCD trace
vcd file vortex.vcd
@@ -90,5 +91,6 @@ lsof +D build_ase_1c
# kick off cache synthesis
make -C pipeline > pipeline/build.log 2>&1 &
make -C cache > cache/build.log 2>&1 &
make -C core > core/build.log 2>&1 &
make -C vortex > vortex/build.log 2>&1 &
make -C top > top/build.log 2>&1 &

View File

@@ -7,6 +7,9 @@ BUILD_DIR=$1
PROGRAM=$(basename "$2")
PROGRAM_DIR=`dirname $2`
POCL_RT_PATH=$SCRIPT_DIR/../../benchmarks/opencl/runtime/lib
VORTEX_DRV_PATH=$SCRIPT_DIR/../../driver/opae/ase
# Export ASE_WORKDIR variable
export ASE_WORKDIR=$SCRIPT_DIR/$BUILD_DIR/work
@@ -33,5 +36,5 @@ done
# run application
pushd $PROGRAM_DIR
echo " [DBG] running ./$PROGRAM $*"
ASE_LOG=0 LD_LIBRARY_PATH=../../opae/ase:$LD_LIBRARY_PATH ./$PROGRAM $*
ASE_LOG=0 LD_LIBRARY_PATH=$POCL_RT_PATH:$VORTEX_DRV_PATH:$LD_LIBRARY_PATH ./$PROGRAM $*
popd

View File

@@ -67,6 +67,7 @@ QI:vortex_afu.qsf
../rtl/libs/VX_priority_encoder.v
../rtl/libs/VX_generic_queue.v
../rtl/libs/VX_indexable_queue.v
../rtl/libs/VX_fair_arbiter.v
../rtl/libs/VX_fixed_arbiter.v
../rtl/libs/VX_rr_arbiter.v
../rtl/libs/VX_countones.v

4
hw/opae/sources_4c.txt Normal file
View File

@@ -0,0 +1,4 @@
+define+NUM_CORES=4
+define+L2_ENABLE=0
C:sources.txt

View File

@@ -13,9 +13,6 @@ module VX_alu_unit (
output reg [31:0] alu_result,
output reg alu_stall
);
localparam DIV_PIPELINE_LEN = 20;
localparam MUL_PIPELINE_LEN = 8;
wire[31:0] div_result_unsigned;
wire[31:0] div_result_signed;
@@ -37,11 +34,11 @@ module VX_alu_unit (
`ALU_DIV,
`ALU_DIVU,
`ALU_REM,
`ALU_REMU: inst_delay = DIV_PIPELINE_LEN;
`ALU_REMU: inst_delay = `DIV_LATENCY;
`ALU_MUL,
`ALU_MULH,
`ALU_MULHSU,
`ALU_MULHU: inst_delay = MUL_PIPELINE_LEN;
`ALU_MULHU: inst_delay = `MUL_LATENCY;
default: inst_delay = 0;
endcase
end
@@ -91,7 +88,7 @@ module VX_alu_unit (
.WIDTHD(32),
.NSIGNED(0),
.DSIGNED(0),
.PIPELINE(DIV_PIPELINE_LEN)
.PIPELINE(`DIV_LATENCY)
) udiv (
.clk(clk),
.reset(reset),
@@ -106,7 +103,7 @@ module VX_alu_unit (
.WIDTHD(32),
.NSIGNED(1),
.DSIGNED(1),
.PIPELINE(DIV_PIPELINE_LEN)
.PIPELINE(`DIV_LATENCY)
) sdiv (
.clk(clk),
.reset(reset),
@@ -124,7 +121,7 @@ module VX_alu_unit (
.WIDTHB(33),
.WIDTHP(64),
.SIGNED(1),
.PIPELINE(MUL_PIPELINE_LEN)
.PIPELINE(`MUL_LATENCY)
) multiplier (
.clk(clk),
.reset(reset),

View File

@@ -8,7 +8,7 @@
`endif
`ifndef NUM_CORES
`define NUM_CORES 1
`define NUM_CORES 4
`endif
`ifndef NUM_WARPS
@@ -52,7 +52,7 @@
`endif
`ifndef L2_ENABLE
`define L2_ENABLE (`NUM_CORES > 2)
`define L2_ENABLE 0
`endif
`ifndef L3_ENABLE

View File

@@ -72,6 +72,10 @@
`define CSR_WIDTH 12
`define DIV_LATENCY 18
`define MUL_LATENCY 2
///////////////////////////////////////////////////////////////////////////////
`define BYTE_EN_NO 3'h7

View File

@@ -116,7 +116,7 @@ module VX_lsu_unit #(
end
if (mrq_pop_part) begin
mem_rsp_mask[mrq_read_addr] <= mem_rsp_mask_upd;
assert(mrq_read_addr == dbg_mrq_write_addr);
assert(($time < 2) || mrq_read_addr == dbg_mrq_write_addr);
end
end

View File

@@ -11,7 +11,6 @@ module VX_fair_arbiter #(
output wire grant_valid
);
if (N == 1) begin
`UNUSED_VAR (clk)
@@ -20,8 +19,7 @@ module VX_fair_arbiter #(
assign grant_onehot = requests;
assign grant_valid = requests[0];
end else begin
end else begin
reg [N-1:0] requests_use;
wire [N-1:0] update_value;
@@ -48,7 +46,7 @@ module VX_fair_arbiter #(
reg [N-1:0] grant_onehot_r;
VX_priority_encoder # (
VX_priority_encoder #(
.N(N)
) priority_encoder (
.data_in (requests_use),
@@ -61,7 +59,7 @@ module VX_fair_arbiter #(
grant_onehot_r[grant_index] = 1;
end
assign grant_onehot = grant_onehot_r;
assign late_value = ((refill_original ^ requests) & ~refill_original);
assign late_value = ((refill_original ^ requests) & ~refill_original);
assign update_value = (requests_use & ~grant_onehot_r) | late_value;
end

View File

@@ -3,7 +3,7 @@
module VX_generic_queue #(
parameter DATAW,
parameter SIZE = 16,
parameter BUFFERED_OUTPUT = (SIZE > 8)
parameter BUFFERED_OUTPUT = 1
) (
input wire clk,
input wire reset,

View File

@@ -1,5 +1,5 @@
#MULTICORE += -DNUM_CLUSTERS=2 -DNUM_CORES=4
#MULTICORE += -DNUM_CLUSTERS=1 -DNUM_CORES=4
#MULTICORE += -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1
#MULTICORE += -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1
MULTICORE += -DNUM_CLUSTERS=1 -DNUM_CORES=2
# control RTL debug print states
@@ -12,7 +12,7 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM
DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
DBG_FLAGS += $(DBG_PRINT_FLAGS)
#DBG_FLAGS += $(DBG_PRINT_FLAGS)
DBG_FLAGS += -DDBG_CORE_REQ_INFO
INCLUDE = -I../rtl/ -I../rtl/libs -I../rtl/interfaces -I../rtl/pipe_regs -I../rtl/cache -I../rtl/simulate
@@ -25,7 +25,7 @@ CF += -std=c++11 -fms-extensions -I../..
VF += --language 1800-2009 --assert -Wall -Wpedantic
VF += -Wno-DECLFILENAME
VF += --x-initial unique
VF += --x-initial unique --x-assign unique
VF += -exe $(SRCS) $(INCLUDE)
DBG += -DVCD_OUTPUT $(DBG_FLAGS)

View File

@@ -9,13 +9,9 @@ double sc_time_stamp() {
return timestamp;
}
Simulator::Simulator() {
#ifdef NDEBUG
Simulator::Simulator() {
// force random values for uninitialized signals
Verilated::randReset(2);
Verilated::assertOn(false);
#endif
ram_ = nullptr;
vortex_ = new VVortex();

View File

@@ -8,4 +8,7 @@
!/vortex/Makefile
/pipeline/*
!/pipeline/Makefile
!/pipeline/Makefile
/core/*
!/core/Makefile

View File

@@ -0,0 +1,70 @@
# Quartus build flow for the VX_core top-level entity.
#
# The stages are chained through their report files:
#   project files -> smart.log -> syn -> fit -> asm / sta
# The default goal (`all`) builds up to the timing-analysis report.

PROJECT = Core
TOP_LEVEL_ENTITY = VX_core
SRC_FILE = VX_core.v
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

# Extra prerequisites of the synthesis report. This Makefile does not set a
# value, so the list is empty unless supplied on the command line or through
# the environment.
SOURCE_FILES ?=

# Part, Family
FAMILY = "Arria 10"
DEVICE = 10AX115N3F40E2SG

# Executable Configuration
SYN_ARGS = --parallel --read_settings_files=on
FIT_ARGS = --part=$(DEVICE) --read_settings_files=on
ASM_ARGS =
STA_ARGS = --do_report_timing

# These names are commands, not files: declare them phony so that a stray file
# called e.g. `clean` or `all` in this directory cannot mask the rule.
.PHONY: all syn fit asm sta smart program clean

# Build targets
all: $(PROJECT).sta.rpt

syn: $(PROJECT).syn.rpt

fit: $(PROJECT).fit.rpt

asm: $(PROJECT).asm.rpt

sta: $(PROJECT).sta.rpt

smart: smart.log

# Target implementations

# Writes "done" into the stamp file named after it; used to refresh the *.chg
# markers that trigger the downstream stages.
STAMP = echo done >

# Synthesis; on success, refresh fit.chg so the fitter reruns.
$(PROJECT).syn.rpt: smart.log syn.chg $(SOURCE_FILES)
	quartus_syn $(PROJECT) $(SYN_ARGS)
	$(STAMP) fit.chg

# Place and route; on success, refresh the assembler and timing markers.
$(PROJECT).fit.rpt: smart.log fit.chg $(PROJECT).syn.rpt
	quartus_fit $(PROJECT) $(FIT_ARGS)
	$(STAMP) asm.chg
	$(STAMP) sta.chg

# Assembler stage.
$(PROJECT).asm.rpt: smart.log asm.chg $(PROJECT).fit.rpt
	quartus_asm $(PROJECT) $(ASM_ARGS)

# Static timing analysis stage.
$(PROJECT).sta.rpt: smart.log sta.chg $(PROJECT).fit.rpt
	quartus_sta $(PROJECT) $(STA_ARGS)

# Captures the output of Quartus' smart-action query in smart.log.
smart.log: $(PROJECT_FILES)
	quartus_sh --determine_smart_action $(PROJECT) > smart.log

# Project initialization
# Generates the .qpf/.qsf project files from the shared project.tcl script.
$(PROJECT_FILES):
	quartus_sh -t ../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src $(SRC_FILE) -sdc ../project.sdc -inc "../../../rtl;../../../rtl/libs;../../../rtl/interfaces;../../../rtl/pipe_regs;../../../rtl/cache"

# Initial creation of the stamp files; they have no prerequisites, so these
# rules only run when the file does not exist yet.
syn.chg:
	$(STAMP) syn.chg

fit.chg:
	$(STAMP) fit.chg

sta.chg:
	$(STAMP) sta.chg

asm.chg:
	$(STAMP) asm.chg

# Program the device over JTAG.
# NOTE(review): no rule in this Makefile names $(PROJECT).sof as a target;
# presumably it is produced by the assembler stage, so run `make asm` first.
program: $(PROJECT).sof
	quartus_pgm --no_banner --mode=jtag -o "$(PROJECT).sof"

# Remove generated reports, stamp files, project files and Quartus databases.
clean:
	rm -rf bin *.rpt *.chg *.qsf *.qpf *.qws *.log *.htm *.eqn *.pin *.sof *.pof qdb incremental_db tmp-clearbox

View File

@@ -1,6 +1,6 @@
set_time_format -unit ns -decimal_places 3
create_clock -name {clk} -period "250 MHz" -waveform { 0.0 1.0 } [get_ports {clk}]
create_clock -name {clk} -period "200 MHz" -waveform { 0.0 1.0 } [get_ports {clk}]
derive_pll_clocks -create_base_clocks
derive_clock_uncertainty