Merge branch 'master' of https://github.gatech.edu/casl/Vortex
This commit is contained in:
35
README.md
35
README.md
@@ -1,23 +1,34 @@
|
||||
# Vortex RISC-V GPGPU
|
||||
|
||||
Vortex currently supports the RISC-V RV32I ISA
|
||||
Vortex is a full-system RISC-V-based GPGPU processor.
|
||||
|
||||
/benchmarks contains test benchmarks
|
||||
Specifications
|
||||
--------------
|
||||
|
||||
/docs contains documentation.
|
||||
- Support RISC-V RV32I ISA
|
||||
- Fully scalable: 1 to 16 cores with optional L2 and L3 caches
|
||||
- OpenCL 1.2 Support
|
||||
- FPGA target: Intel Arria 10 @ 200 MHz peak frequency
|
||||
|
||||
/hw contains hardware sources.
|
||||
Directory structure
|
||||
-------------------
|
||||
|
||||
/driver contains the driver software.
|
||||
- benchmarks: OpenCL and RISC-V benchmarks
|
||||
|
||||
- docs: documentation.
|
||||
|
||||
/runtime contains the kernel runtime software.
|
||||
- hw: hardware sources.
|
||||
|
||||
/SimX contains a cycle-approximate simulator for Vortex.
|
||||
- driver: driver software.
|
||||
|
||||
/evaluation contains the synthesis/runtime reports.
|
||||
- runtime: runtime software for kernels.
|
||||
|
||||
Basic Instructions to run OpenCL Benchmarks on Vortex
|
||||
-----------------------------------------------------
|
||||
- simX: Vortex cycle-approximate simulator.
|
||||
|
||||
- evaluation: synthesis and performance data.
|
||||
|
||||
Basic Installation
|
||||
------------------
|
||||
|
||||
Install development tools
|
||||
|
||||
@@ -55,8 +66,8 @@ Install Vortex
|
||||
$ cd Vortex
|
||||
$ make
|
||||
|
||||
Run SGEMM OpenCL Benchmark
|
||||
Quick Test running SGEMM kernel
|
||||
|
||||
$ cd Vortex/benchmarks/opencl/sgemm
|
||||
$ cd /Vortex/benchmarks/opencl/sgemm
|
||||
$ make
|
||||
$ make run
|
||||
|
||||
@@ -13,11 +13,11 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
|
||||
|
||||
DBG_FLAGS += $(DBG_PRINT_FLAGS)
|
||||
#DBG_FLAGS += $(DBG_PRINT_FLAGS)
|
||||
DBG_FLAGS += -DDBG_CORE_REQ_INFO
|
||||
|
||||
#CONFIGS += -DNUM_CLUSTERS=2 -DNUM_CORES=4
|
||||
#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=4
|
||||
#CONFIGS += -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1
|
||||
#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1
|
||||
#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=2
|
||||
|
||||
#DEBUG=1
|
||||
@@ -38,8 +38,7 @@ RTL_INCLUDE = -I../../hw/rtl -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../
|
||||
|
||||
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS)
|
||||
VL_FLAGS += -Wno-DECLFILENAME
|
||||
VL_FLAGS += --x-initial unique
|
||||
VL_FLAGS += --x-assign unique
|
||||
VL_FLAGS += --x-initial unique --x-assign unique
|
||||
|
||||
# Enable Verilator multithreaded simulation
|
||||
#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
|
||||
|
||||
@@ -2,7 +2,7 @@ ASE_BUILD_DIR=build_ase
|
||||
|
||||
FPGA_BUILD_DIR=build_fpga
|
||||
|
||||
all: ase-2c
|
||||
all: ase-1c
|
||||
|
||||
ase-1c: setup-ase-1c
|
||||
make -C $(ASE_BUILD_DIR)_1c
|
||||
@@ -10,47 +10,72 @@ ase-1c: setup-ase-1c
|
||||
ase-2c: setup-ase-2c
|
||||
make -C $(ASE_BUILD_DIR)_2c
|
||||
|
||||
ase-4c: setup-ase-4c
|
||||
make -C $(ASE_BUILD_DIR)_4c
|
||||
|
||||
setup-ase-1c: $(ASE_BUILD_DIR)_1c/Makefile
|
||||
|
||||
setup-ase-2c: $(ASE_BUILD_DIR)_2c/Makefile
|
||||
|
||||
setup-ase-4c: $(ASE_BUILD_DIR)_4c/Makefile
|
||||
|
||||
$(ASE_BUILD_DIR)_1c/Makefile:
|
||||
afu_sim_setup -s sources_1c.txt $(ASE_BUILD_DIR)_1c
|
||||
|
||||
$(ASE_BUILD_DIR)_2c/Makefile:
|
||||
afu_sim_setup -s sources_2c.txt $(ASE_BUILD_DIR)_2c
|
||||
|
||||
$(ASE_BUILD_DIR)_4c/Makefile:
|
||||
afu_sim_setup -s sources_4c.txt $(ASE_BUILD_DIR)_4c
|
||||
|
||||
fpga-1c: setup-fpga-1c
|
||||
cd $(FPGA_BUILD_DIR)_1c && qsub-synth
|
||||
|
||||
fpga-2c: setup-fpga-2c
|
||||
cd $(FPGA_BUILD_DIR)_2c && qsub-synth
|
||||
|
||||
fpga-4c: setup-fpga-4c
|
||||
cd $(FPGA_BUILD_DIR)_4c && qsub-synth
|
||||
|
||||
setup-fpga-1c: $(FPGA_BUILD_DIR)_1c/build/dcp.qpf
|
||||
|
||||
setup-fpga-2c: $(FPGA_BUILD_DIR)_2c/build/dcp.qpf
|
||||
|
||||
setup-fpga-4c: $(FPGA_BUILD_DIR)_4c/build/dcp.qpf
|
||||
|
||||
$(FPGA_BUILD_DIR)_1c/build/dcp.qpf:
|
||||
afu_synth_setup -s sources_1c.txt $(FPGA_BUILD_DIR)_1c
|
||||
|
||||
$(FPGA_BUILD_DIR)_2c/build/dcp.qpf:
|
||||
afu_synth_setup -s sources_2c.txt $(FPGA_BUILD_DIR)_2c
|
||||
|
||||
$(FPGA_BUILD_DIR)_4c/build/dcp.qpf:
|
||||
afu_synth_setup -s sources_4c.txt $(FPGA_BUILD_DIR)_4c
|
||||
|
||||
run-ase-1c:
|
||||
cd $(ASE_BUILD_DIR)_1c && make sim
|
||||
|
||||
run-ase-2c:
|
||||
cd $(ASE_BUILD_DIR)_2c && make sim
|
||||
|
||||
run-ase-4c:
|
||||
cd $(ASE_BUILD_DIR)_4c && make sim
|
||||
|
||||
clean-ase-1c:
|
||||
rm -rf $(ASE_BUILD_DIR)_1c
|
||||
|
||||
clean-ase-2c:
|
||||
rm -rf $(ASE_BUILD_DIR)_2c
|
||||
|
||||
clean-ase-4c:
|
||||
rm -rf $(ASE_BUILD_DIR)_4c
|
||||
|
||||
clean-fpga-1c:
|
||||
rm -rf $(FPGA_BUILD_DIR)_1c
|
||||
|
||||
clean-fpga-2c:
|
||||
rm -rf $(FPGA_BUILD_DIR)_2c
|
||||
|
||||
clean-fpga-4c:
|
||||
rm -rf $(FPGA_BUILD_DIR)_4c
|
||||
|
||||
|
||||
@@ -62,6 +62,7 @@ make ase
|
||||
# tests
|
||||
./run_ase.sh build_ase_1c ../../driver/tests/basic/basic
|
||||
./run_ase.sh build_ase_1c ../../driver/tests/demo/demo
|
||||
./run_ase.sh build_ase_1c ../../benchmarks/opencl/vecadd/vecadd
|
||||
|
||||
# modify "vsim_run.tcl" to dump VCD trace
|
||||
vcd file vortex.vcd
|
||||
@@ -90,5 +91,6 @@ lsof +D build_ase_1c
|
||||
# kick off cache synthesis
|
||||
make -C pipeline > pipeline/build.log 2>&1 &
|
||||
make -C cache > cache/build.log 2>&1 &
|
||||
make -C core > core/build.log 2>&1 &
|
||||
make -C vortex > vortex/build.log 2>&1 &
|
||||
make -C top > top/build.log 2>&1 &
|
||||
|
||||
@@ -7,6 +7,9 @@ BUILD_DIR=$1
|
||||
PROGRAM=$(basename "$2")
|
||||
PROGRAM_DIR=`dirname $2`
|
||||
|
||||
POCL_RT_PATH=$SCRIPT_DIR/../../benchmarks/opencl/runtime/lib
|
||||
VORTEX_DRV_PATH=$SCRIPT_DIR/../../driver/opae/ase
|
||||
|
||||
# Export ASE_WORKDIR variable
|
||||
export ASE_WORKDIR=$SCRIPT_DIR/$BUILD_DIR/work
|
||||
|
||||
@@ -33,5 +36,5 @@ done
|
||||
# run application
|
||||
pushd $PROGRAM_DIR
|
||||
echo " [DBG] running ./$PROGRAM $*"
|
||||
ASE_LOG=0 LD_LIBRARY_PATH=../../opae/ase:$LD_LIBRARY_PATH ./$PROGRAM $*
|
||||
ASE_LOG=0 LD_LIBRARY_PATH=$POCL_RT_PATH:$VORTEX_DRV_PATH:$LD_LIBRARY_PATH ./$PROGRAM $*
|
||||
popd
|
||||
@@ -67,6 +67,7 @@ QI:vortex_afu.qsf
|
||||
../rtl/libs/VX_priority_encoder.v
|
||||
../rtl/libs/VX_generic_queue.v
|
||||
../rtl/libs/VX_indexable_queue.v
|
||||
../rtl/libs/VX_fair_arbiter.v
|
||||
../rtl/libs/VX_fixed_arbiter.v
|
||||
../rtl/libs/VX_rr_arbiter.v
|
||||
../rtl/libs/VX_countones.v
|
||||
|
||||
4
hw/opae/sources_4c.txt
Normal file
4
hw/opae/sources_4c.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
+define+NUM_CORES=4
|
||||
+define+L2_ENABLE=0
|
||||
|
||||
C:sources.txt
|
||||
@@ -13,9 +13,6 @@ module VX_alu_unit (
|
||||
output reg [31:0] alu_result,
|
||||
output reg alu_stall
|
||||
);
|
||||
localparam DIV_PIPELINE_LEN = 20;
|
||||
localparam MUL_PIPELINE_LEN = 8;
|
||||
|
||||
wire[31:0] div_result_unsigned;
|
||||
wire[31:0] div_result_signed;
|
||||
|
||||
@@ -37,11 +34,11 @@ module VX_alu_unit (
|
||||
`ALU_DIV,
|
||||
`ALU_DIVU,
|
||||
`ALU_REM,
|
||||
`ALU_REMU: inst_delay = DIV_PIPELINE_LEN;
|
||||
`ALU_REMU: inst_delay = `DIV_LATENCY;
|
||||
`ALU_MUL,
|
||||
`ALU_MULH,
|
||||
`ALU_MULHSU,
|
||||
`ALU_MULHU: inst_delay = MUL_PIPELINE_LEN;
|
||||
`ALU_MULHU: inst_delay = `MUL_LATENCY;
|
||||
default: inst_delay = 0;
|
||||
endcase
|
||||
end
|
||||
@@ -91,7 +88,7 @@ module VX_alu_unit (
|
||||
.WIDTHD(32),
|
||||
.NSIGNED(0),
|
||||
.DSIGNED(0),
|
||||
.PIPELINE(DIV_PIPELINE_LEN)
|
||||
.PIPELINE(`DIV_LATENCY)
|
||||
) udiv (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
@@ -106,7 +103,7 @@ module VX_alu_unit (
|
||||
.WIDTHD(32),
|
||||
.NSIGNED(1),
|
||||
.DSIGNED(1),
|
||||
.PIPELINE(DIV_PIPELINE_LEN)
|
||||
.PIPELINE(`DIV_LATENCY)
|
||||
) sdiv (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
@@ -124,7 +121,7 @@ module VX_alu_unit (
|
||||
.WIDTHB(33),
|
||||
.WIDTHP(64),
|
||||
.SIGNED(1),
|
||||
.PIPELINE(MUL_PIPELINE_LEN)
|
||||
.PIPELINE(`MUL_LATENCY)
|
||||
) multiplier (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
`endif
|
||||
|
||||
`ifndef NUM_CORES
|
||||
`define NUM_CORES 1
|
||||
`define NUM_CORES 4
|
||||
`endif
|
||||
|
||||
`ifndef NUM_WARPS
|
||||
@@ -52,7 +52,7 @@
|
||||
`endif
|
||||
|
||||
`ifndef L2_ENABLE
|
||||
`define L2_ENABLE (`NUM_CORES > 2)
|
||||
`define L2_ENABLE 0
|
||||
`endif
|
||||
|
||||
`ifndef L3_ENABLE
|
||||
|
||||
@@ -72,6 +72,10 @@
|
||||
|
||||
`define CSR_WIDTH 12
|
||||
|
||||
`define DIV_LATENCY 18
|
||||
|
||||
`define MUL_LATENCY 2
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define BYTE_EN_NO 3'h7
|
||||
|
||||
@@ -116,7 +116,7 @@ module VX_lsu_unit #(
|
||||
end
|
||||
if (mrq_pop_part) begin
|
||||
mem_rsp_mask[mrq_read_addr] <= mem_rsp_mask_upd;
|
||||
assert(mrq_read_addr == dbg_mrq_write_addr);
|
||||
assert(($time < 2) || mrq_read_addr == dbg_mrq_write_addr);
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
@@ -11,7 +11,6 @@ module VX_fair_arbiter #(
|
||||
output wire grant_valid
|
||||
);
|
||||
|
||||
|
||||
if (N == 1) begin
|
||||
|
||||
`UNUSED_VAR (clk)
|
||||
@@ -20,8 +19,7 @@ module VX_fair_arbiter #(
|
||||
assign grant_onehot = requests;
|
||||
assign grant_valid = requests[0];
|
||||
|
||||
end else begin
|
||||
|
||||
end else begin
|
||||
|
||||
reg [N-1:0] requests_use;
|
||||
wire [N-1:0] update_value;
|
||||
@@ -48,7 +46,7 @@ module VX_fair_arbiter #(
|
||||
|
||||
reg [N-1:0] grant_onehot_r;
|
||||
|
||||
VX_priority_encoder # (
|
||||
VX_priority_encoder #(
|
||||
.N(N)
|
||||
) priority_encoder (
|
||||
.data_in (requests_use),
|
||||
@@ -61,7 +59,7 @@ module VX_fair_arbiter #(
|
||||
grant_onehot_r[grant_index] = 1;
|
||||
end
|
||||
assign grant_onehot = grant_onehot_r;
|
||||
assign late_value = ((refill_original ^ requests) & ~refill_original);
|
||||
assign late_value = ((refill_original ^ requests) & ~refill_original);
|
||||
assign update_value = (requests_use & ~grant_onehot_r) | late_value;
|
||||
|
||||
end
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
module VX_generic_queue #(
|
||||
parameter DATAW,
|
||||
parameter SIZE = 16,
|
||||
parameter BUFFERED_OUTPUT = (SIZE > 8)
|
||||
parameter BUFFERED_OUTPUT = 1
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#MULTICORE += -DNUM_CLUSTERS=2 -DNUM_CORES=4
|
||||
#MULTICORE += -DNUM_CLUSTERS=1 -DNUM_CORES=4
|
||||
#MULTICORE += -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1
|
||||
#MULTICORE += -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1
|
||||
MULTICORE += -DNUM_CLUSTERS=1 -DNUM_CORES=2
|
||||
|
||||
# control RTL debug print states
|
||||
@@ -12,7 +12,7 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
|
||||
|
||||
DBG_FLAGS += $(DBG_PRINT_FLAGS)
|
||||
#DBG_FLAGS += $(DBG_PRINT_FLAGS)
|
||||
DBG_FLAGS += -DDBG_CORE_REQ_INFO
|
||||
|
||||
INCLUDE = -I../rtl/ -I../rtl/libs -I../rtl/interfaces -I../rtl/pipe_regs -I../rtl/cache -I../rtl/simulate
|
||||
@@ -25,7 +25,7 @@ CF += -std=c++11 -fms-extensions -I../..
|
||||
|
||||
VF += --language 1800-2009 --assert -Wall -Wpedantic
|
||||
VF += -Wno-DECLFILENAME
|
||||
VF += --x-initial unique
|
||||
VF += --x-initial unique --x-assign unique
|
||||
VF += -exe $(SRCS) $(INCLUDE)
|
||||
|
||||
DBG += -DVCD_OUTPUT $(DBG_FLAGS)
|
||||
|
||||
@@ -9,13 +9,9 @@ double sc_time_stamp() {
|
||||
return timestamp;
|
||||
}
|
||||
|
||||
Simulator::Simulator() {
|
||||
|
||||
#ifdef NDEBUG
|
||||
Simulator::Simulator() {
|
||||
// force random values for uninitialized signals
|
||||
Verilated::randReset(2);
|
||||
Verilated::assertOn(false);
|
||||
#endif
|
||||
|
||||
ram_ = nullptr;
|
||||
vortex_ = new VVortex();
|
||||
|
||||
5
hw/syn/quartus/.gitignore
vendored
5
hw/syn/quartus/.gitignore
vendored
@@ -8,4 +8,7 @@
|
||||
!/vortex/Makefile
|
||||
|
||||
/pipeline/*
|
||||
!/pipeline/Makefile
|
||||
!/pipeline/Makefile
|
||||
|
||||
/core/*
|
||||
!/core/Makefile
|
||||
|
||||
70
hw/syn/quartus/core/Makefile
Normal file
70
hw/syn/quartus/core/Makefile
Normal file
@@ -0,0 +1,70 @@
|
||||
PROJECT = Core
|
||||
TOP_LEVEL_ENTITY = VX_core
|
||||
SRC_FILE = VX_core.v
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
# Part, Family
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
|
||||
# Executable Configuration
|
||||
SYN_ARGS = --parallel --read_settings_files=on
|
||||
FIT_ARGS = --part=$(DEVICE) --read_settings_files=on
|
||||
ASM_ARGS =
|
||||
STA_ARGS = --do_report_timing
|
||||
|
||||
# Build targets
|
||||
all: $(PROJECT).sta.rpt
|
||||
|
||||
syn: $(PROJECT).syn.rpt
|
||||
|
||||
fit: $(PROJECT).fit.rpt
|
||||
|
||||
asm: $(PROJECT).asm.rpt
|
||||
|
||||
sta: $(PROJECT).sta.rpt
|
||||
|
||||
smart: smart.log
|
||||
|
||||
# Target implementations
|
||||
STAMP = echo done >
|
||||
|
||||
$(PROJECT).syn.rpt: smart.log syn.chg $(SOURCE_FILES)
|
||||
quartus_syn $(PROJECT) $(SYN_ARGS)
|
||||
$(STAMP) fit.chg
|
||||
|
||||
$(PROJECT).fit.rpt: smart.log fit.chg $(PROJECT).syn.rpt
|
||||
quartus_fit $(PROJECT) $(FIT_ARGS)
|
||||
$(STAMP) asm.chg
|
||||
$(STAMP) sta.chg
|
||||
|
||||
$(PROJECT).asm.rpt: smart.log asm.chg $(PROJECT).fit.rpt
|
||||
quartus_asm $(PROJECT) $(ASM_ARGS)
|
||||
|
||||
$(PROJECT).sta.rpt: smart.log sta.chg $(PROJECT).fit.rpt
|
||||
quartus_sta $(PROJECT) $(STA_ARGS)
|
||||
|
||||
smart.log: $(PROJECT_FILES)
|
||||
quartus_sh --determine_smart_action $(PROJECT) > smart.log
|
||||
|
||||
# Project initialization
|
||||
$(PROJECT_FILES):
|
||||
quartus_sh -t ../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src $(SRC_FILE) -sdc ../project.sdc -inc "../../../rtl;../../../rtl/libs;../../../rtl/interfaces;../../../rtl/pipe_regs;../../../rtl/cache"
|
||||
|
||||
syn.chg:
|
||||
$(STAMP) syn.chg
|
||||
|
||||
fit.chg:
|
||||
$(STAMP) fit.chg
|
||||
|
||||
sta.chg:
|
||||
$(STAMP) sta.chg
|
||||
|
||||
asm.chg:
|
||||
$(STAMP) asm.chg
|
||||
|
||||
program: $(PROJECT).sof
|
||||
quartus_pgm --no_banner --mode=jtag -o "$(PROJECT).sof"
|
||||
|
||||
clean:
|
||||
rm -rf bin *.rpt *.chg *.qsf *.qpf *.qws *.log *.htm *.eqn *.pin *.sof *.pof qdb incremental_db tmp-clearbox
|
||||
@@ -1,6 +1,6 @@
|
||||
set_time_format -unit ns -decimal_places 3
|
||||
|
||||
create_clock -name {clk} -period "250 MHz" -waveform { 0.0 1.0 } [get_ports {clk}]
|
||||
create_clock -name {clk} -period "200 MHz" -waveform { 0.0 1.0 } [get_ports {clk}]
|
||||
|
||||
derive_pll_clocks -create_base_clocks
|
||||
derive_clock_uncertainty
|
||||
|
||||
Reference in New Issue
Block a user