diff --git a/README.md b/README.md index 5ec60343..e4f27ad7 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,34 @@ # Vortex RISC-V GPGPU -Vortex currently supported RISC-V RV32I ISA +Vortex is a full-system RISC-V-based GPGPU processor. -/benchmarks containts test benchmarks +Specifications +-------------- -/docs contains documentation. +- Supports the RISC-V RV32I ISA +- Fully scalable: 1 to 16 cores with optional L2 and L3 caches +- OpenCL 1.2 support +- FPGA target: Intel Arria 10 @ 200 MHz peak frequency -/hw constains hardware sources. +Directory structure +------------------- -/driver contains the driver software. +- benchmarks: OpenCL and RISC-V benchmarks. + +- docs: documentation. -/runtime contains the kernel runtime software. +- hw: hardware sources. -/SimX contains a cycle-approximate simulator for Vortex. +- driver: driver software. -/evaluation contains the synthesis/runtime reports. +- runtime: runtime software for kernels. -Basic Instructions to run OpenCL Benchmarks on Vortex ------------------------------------------------------ +- simX: Vortex cycle-approximate simulator. + +- evaluation: synthesis and performance data. 
+ +Basic Installation +------------------ Install development tools @@ -55,8 +66,8 @@ Install Vortex $ cd Vortex $ make -Run SGEMM OpenCL Benchmark +Quick test: running the SGEMM kernel - $ cd Vortex/benchmarks/opencl/sgemm + $ cd Vortex/benchmarks/opencl/sgemm $ make $ make run diff --git a/driver/rtlsim/Makefile b/driver/rtlsim/Makefile index bafa89bc..4129ade5 100644 --- a/driver/rtlsim/Makefile +++ b/driver/rtlsim/Makefile @@ -13,11 +13,11 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE -DBG_FLAGS += $(DBG_PRINT_FLAGS) +#DBG_FLAGS += $(DBG_PRINT_FLAGS) DBG_FLAGS += -DDBG_CORE_REQ_INFO -#CONFIGS += -DNUM_CLUSTERS=2 -DNUM_CORES=4 -#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=4 +#CONFIGS += -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1 +#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1 #CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=2 #DEBUG=1 @@ -38,8 +38,7 @@ RTL_INCLUDE = -I../../hw/rtl -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../ VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS) VL_FLAGS += -Wno-DECLFILENAME -VL_FLAGS += --x-initial unique -VL_FLAGS += --x-assign unique +VL_FLAGS += --x-initial unique --x-assign unique # Enable Verilator multithreaded simulation #THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))') diff --git a/hw/opae/Makefile b/hw/opae/Makefile index 039f01d2..eddd6849 100644 --- a/hw/opae/Makefile +++ b/hw/opae/Makefile @@ -2,7 +2,7 @@ ASE_BUILD_DIR=build_ase FPGA_BUILD_DIR=build_fpga -all: ase-2c +all: ase-1c ase-1c: setup-ase-1c make -C $(ASE_BUILD_DIR)_1c @@ -10,47 +10,72 @@ ase-1c: setup-ase-1c ase-2c: setup-ase-2c make -C $(ASE_BUILD_DIR)_2c +ase-4c: setup-ase-4c + make -C $(ASE_BUILD_DIR)_4c + setup-ase-1c: $(ASE_BUILD_DIR)_1c/Makefile setup-ase-2c: $(ASE_BUILD_DIR)_2c/Makefile +setup-ase-4c: $(ASE_BUILD_DIR)_4c/Makefile + $(ASE_BUILD_DIR)_1c/Makefile: afu_sim_setup -s sources_1c.txt $(ASE_BUILD_DIR)_1c 
$(ASE_BUILD_DIR)_2c/Makefile: afu_sim_setup -s sources_2c.txt $(ASE_BUILD_DIR)_2c +$(ASE_BUILD_DIR)_4c/Makefile: + afu_sim_setup -s sources_4c.txt $(ASE_BUILD_DIR)_4c + fpga-1c: setup-fpga-1c cd $(FPGA_BUILD_DIR)_1c && qsub-synth fpga-2c: setup-fpga-2c cd $(FPGA_BUILD_DIR)_2c && qsub-synth + +fpga-4c: setup-fpga-4c + cd $(FPGA_BUILD_DIR)_4c && qsub-synth setup-fpga-1c: $(FPGA_BUILD_DIR)_1c/build/dcp.qpf setup-fpga-2c: $(FPGA_BUILD_DIR)_2c/build/dcp.qpf +setup-fpga-4c: $(FPGA_BUILD_DIR)_4c/build/dcp.qpf + $(FPGA_BUILD_DIR)_1c/build/dcp.qpf: afu_synth_setup -s sources_1c.txt $(FPGA_BUILD_DIR)_1c $(FPGA_BUILD_DIR)_2c/build/dcp.qpf: afu_synth_setup -s sources_2c.txt $(FPGA_BUILD_DIR)_2c +$(FPGA_BUILD_DIR)_4c/build/dcp.qpf: + afu_synth_setup -s sources_4c.txt $(FPGA_BUILD_DIR)_4c + run-ase-1c: cd $(ASE_BUILD_DIR)_1c && make sim run-ase-2c: cd $(ASE_BUILD_DIR)_2c && make sim +run-ase-4c: + cd $(ASE_BUILD_DIR)_4c && make sim + clean-ase-1c: rm -rf $(ASE_BUILD_DIR)_1c clean-ase-2c: rm -rf $(ASE_BUILD_DIR)_2c +clean-ase-4c: + rm -rf $(ASE_BUILD_DIR)_4c + clean-fpga-1c: rm -rf $(FPGA_BUILD_DIR)_1c clean-fpga-2c: rm -rf $(FPGA_BUILD_DIR)_2c +clean-fpga-4c: + rm -rf $(FPGA_BUILD_DIR)_4c + diff --git a/hw/opae/README b/hw/opae/README index b59242c5..86e6f862 100644 --- a/hw/opae/README +++ b/hw/opae/README @@ -62,6 +62,7 @@ make ase # tests ./run_ase.sh build_ase_1c ../../driver/tests/basic/basic ./run_ase.sh build_ase_1c ../../driver/tests/demo/demo +./run_ase.sh build_ase_1c ../../benchmarks/opencl/vecadd/vecadd # modify "vsim_run.tcl" to dump VCD trace vcd file vortex.vcd @@ -90,5 +91,6 @@ lsof +D build_ase_1c # quick off cache synthesis make -C pipeline > pipeline/build.log 2>&1 & make -C cache > cache/build.log 2>&1 & +make -C core > core/build.log 2>&1 & make -C vortex > vortex/build.log 2>&1 & make -C top > top/build.log 2>&1 & diff --git a/hw/opae/run_ase.sh b/hw/opae/run_ase.sh index 9a0fbf64..9e3b5d6c 100755 --- a/hw/opae/run_ase.sh +++ b/hw/opae/run_ase.sh @@ -7,6 
+7,9 @@ BUILD_DIR=$1 PROGRAM=$(basename "$2") PROGRAM_DIR=`dirname $2` +POCL_RT_PATH=$SCRIPT_DIR/../../benchmarks/opencl/runtime/lib +VORTEX_DRV_PATH=$SCRIPT_DIR/../../driver/opae/ase + # Export ASE_WORKDIR variable export ASE_WORKDIR=$SCRIPT_DIR/$BUILD_DIR/work @@ -33,5 +36,5 @@ done # run application pushd $PROGRAM_DIR echo " [DBG] running ./$PROGRAM $*" -ASE_LOG=0 LD_LIBRARY_PATH=../../opae/ase:$LD_LIBRARY_PATH ./$PROGRAM $* +ASE_LOG=0 LD_LIBRARY_PATH=$POCL_RT_PATH:$VORTEX_DRV_PATH:$LD_LIBRARY_PATH ./$PROGRAM $* popd \ No newline at end of file diff --git a/hw/opae/sources.txt b/hw/opae/sources.txt index 6058a31a..0263e558 100644 --- a/hw/opae/sources.txt +++ b/hw/opae/sources.txt @@ -67,6 +67,7 @@ QI:vortex_afu.qsf ../rtl/libs/VX_priority_encoder.v ../rtl/libs/VX_generic_queue.v ../rtl/libs/VX_indexable_queue.v +../rtl/libs/VX_fair_arbiter.v ../rtl/libs/VX_fixed_arbiter.v ../rtl/libs/VX_rr_arbiter.v ../rtl/libs/VX_countones.v diff --git a/hw/opae/sources_4c.txt b/hw/opae/sources_4c.txt new file mode 100644 index 00000000..03959c74 --- /dev/null +++ b/hw/opae/sources_4c.txt @@ -0,0 +1,4 @@ ++define+NUM_CORES=4 ++define+L2_ENABLE=0 + +C:sources.txt \ No newline at end of file diff --git a/hw/rtl/VX_alu_unit.v b/hw/rtl/VX_alu_unit.v index c2fdee80..b0d0c20f 100644 --- a/hw/rtl/VX_alu_unit.v +++ b/hw/rtl/VX_alu_unit.v @@ -13,9 +13,6 @@ module VX_alu_unit ( output reg [31:0] alu_result, output reg alu_stall ); - localparam DIV_PIPELINE_LEN = 20; - localparam MUL_PIPELINE_LEN = 8; - wire[31:0] div_result_unsigned; wire[31:0] div_result_signed; @@ -37,11 +34,11 @@ module VX_alu_unit ( `ALU_DIV, `ALU_DIVU, `ALU_REM, - `ALU_REMU: inst_delay = DIV_PIPELINE_LEN; + `ALU_REMU: inst_delay = `DIV_LATENCY; `ALU_MUL, `ALU_MULH, `ALU_MULHSU, - `ALU_MULHU: inst_delay = MUL_PIPELINE_LEN; + `ALU_MULHU: inst_delay = `MUL_LATENCY; default: inst_delay = 0; endcase end @@ -91,7 +88,7 @@ module VX_alu_unit ( .WIDTHD(32), .NSIGNED(0), .DSIGNED(0), - .PIPELINE(DIV_PIPELINE_LEN) + 
.PIPELINE(`DIV_LATENCY) ) udiv ( .clk(clk), .reset(reset), @@ -106,7 +103,7 @@ module VX_alu_unit ( .WIDTHD(32), .NSIGNED(1), .DSIGNED(1), - .PIPELINE(DIV_PIPELINE_LEN) + .PIPELINE(`DIV_LATENCY) ) sdiv ( .clk(clk), .reset(reset), @@ -124,7 +121,7 @@ module VX_alu_unit ( .WIDTHB(33), .WIDTHP(64), .SIGNED(1), - .PIPELINE(MUL_PIPELINE_LEN) + .PIPELINE(`MUL_LATENCY) ) multiplier ( .clk(clk), .reset(reset), diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 0478a981..4f6377a0 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -8,7 +8,7 @@ `endif `ifndef NUM_CORES -`define NUM_CORES 1 +`define NUM_CORES 4 `endif `ifndef NUM_WARPS @@ -52,7 +52,7 @@ `endif `ifndef L2_ENABLE -`define L2_ENABLE (`NUM_CORES > 2) +`define L2_ENABLE 0 `endif `ifndef L3_ENABLE diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index b36c1751..3ddc4dff 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -72,6 +72,10 @@ `define CSR_WIDTH 12 +`define DIV_LATENCY 18 + +`define MUL_LATENCY 2 + /////////////////////////////////////////////////////////////////////////////// `define BYTE_EN_NO 3'h7 diff --git a/hw/rtl/VX_lsu_unit.v b/hw/rtl/VX_lsu_unit.v index 60a6e7b2..640640b3 100644 --- a/hw/rtl/VX_lsu_unit.v +++ b/hw/rtl/VX_lsu_unit.v @@ -116,7 +116,7 @@ module VX_lsu_unit #( end if (mrq_pop_part) begin mem_rsp_mask[mrq_read_addr] <= mem_rsp_mask_upd; - assert(mrq_read_addr == dbg_mrq_write_addr); + assert(($time < 2) || mrq_read_addr == dbg_mrq_write_addr); end end diff --git a/hw/rtl/libs/VX_fair_arbiter.v b/hw/rtl/libs/VX_fair_arbiter.v index ca32a5fc..5ac5749e 100644 --- a/hw/rtl/libs/VX_fair_arbiter.v +++ b/hw/rtl/libs/VX_fair_arbiter.v @@ -11,7 +11,6 @@ module VX_fair_arbiter #( output wire grant_valid ); - if (N == 1) begin `UNUSED_VAR (clk) @@ -20,8 +19,7 @@ module VX_fair_arbiter #( assign grant_onehot = requests; assign grant_valid = requests[0]; - end else begin - + end else begin reg [N-1:0] requests_use; wire [N-1:0] update_value; @@ -48,7 +46,7 @@ 
module VX_fair_arbiter #( reg [N-1:0] grant_onehot_r; - VX_priority_encoder # ( + VX_priority_encoder #( .N(N) ) priority_encoder ( .data_in (requests_use), @@ -61,7 +59,7 @@ module VX_fair_arbiter #( grant_onehot_r[grant_index] = 1; end assign grant_onehot = grant_onehot_r; - assign late_value = ((refill_original ^ requests) & ~refill_original); + assign late_value = ((refill_original ^ requests) & ~refill_original); assign update_value = (requests_use & ~grant_onehot_r) | late_value; end diff --git a/hw/rtl/libs/VX_generic_queue.v b/hw/rtl/libs/VX_generic_queue.v index 0c1fb742..52e12831 100644 --- a/hw/rtl/libs/VX_generic_queue.v +++ b/hw/rtl/libs/VX_generic_queue.v @@ -3,7 +3,7 @@ module VX_generic_queue #( parameter DATAW, parameter SIZE = 16, - parameter BUFFERED_OUTPUT = (SIZE > 8) + parameter BUFFERED_OUTPUT = 1 ) ( input wire clk, input wire reset, diff --git a/hw/simulate/Makefile b/hw/simulate/Makefile index eba2d281..21814760 100644 --- a/hw/simulate/Makefile +++ b/hw/simulate/Makefile @@ -1,5 +1,5 @@ -#MULTICORE += -DNUM_CLUSTERS=2 -DNUM_CORES=4 -#MULTICORE += -DNUM_CLUSTERS=1 -DNUM_CORES=4 +#MULTICORE += -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1 +#MULTICORE += -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1 MULTICORE += -DNUM_CLUSTERS=1 -DNUM_CORES=2 # control RTL debug print states @@ -12,7 +12,7 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE -DBG_FLAGS += $(DBG_PRINT_FLAGS) +#DBG_FLAGS += $(DBG_PRINT_FLAGS) DBG_FLAGS += -DDBG_CORE_REQ_INFO INCLUDE = -I../rtl/ -I../rtl/libs -I../rtl/interfaces -I../rtl/pipe_regs -I../rtl/cache -I../rtl/simulate @@ -25,7 +25,7 @@ CF += -std=c++11 -fms-extensions -I../.. 
VF += --language 1800-2009 --assert -Wall -Wpedantic VF += -Wno-DECLFILENAME -VF += --x-initial unique +VF += --x-initial unique --x-assign unique VF += -exe $(SRCS) $(INCLUDE) DBG += -DVCD_OUTPUT $(DBG_FLAGS) diff --git a/hw/simulate/simulator.cpp b/hw/simulate/simulator.cpp index 09a9b72c..2976a43c 100644 --- a/hw/simulate/simulator.cpp +++ b/hw/simulate/simulator.cpp @@ -9,13 +9,9 @@ double sc_time_stamp() { return timestamp; } -Simulator::Simulator() { - -#ifdef NDEBUG +Simulator::Simulator() { // force random values for unitialized signals Verilated::randReset(2); - Verilated::assertOn(false); -#endif ram_ = nullptr; vortex_ = new VVortex(); diff --git a/hw/syn/quartus/.gitignore b/hw/syn/quartus/.gitignore index fedeee42..eac68fed 100644 --- a/hw/syn/quartus/.gitignore +++ b/hw/syn/quartus/.gitignore @@ -8,4 +8,7 @@ !/vortex/Makefile /pipeline/* -!/pipeline/Makefile \ No newline at end of file +!/pipeline/Makefile + +/core/* +!/core/Makefile diff --git a/hw/syn/quartus/core/Makefile b/hw/syn/quartus/core/Makefile new file mode 100644 index 00000000..69ab23ab --- /dev/null +++ b/hw/syn/quartus/core/Makefile @@ -0,0 +1,70 @@ +PROJECT = Core +TOP_LEVEL_ENTITY = VX_core +SRC_FILE = VX_core.v +PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf + +# Part, Family +FAMILY = "Arria 10" +DEVICE = 10AX115N3F40E2SG + +# Executable Configuration +SYN_ARGS = --parallel --read_settings_files=on +FIT_ARGS = --part=$(DEVICE) --read_settings_files=on +ASM_ARGS = +STA_ARGS = --do_report_timing + +# Build targets +all: $(PROJECT).sta.rpt + +syn: $(PROJECT).syn.rpt + +fit: $(PROJECT).fit.rpt + +asm: $(PROJECT).asm.rpt + +sta: $(PROJECT).sta.rpt + +smart: smart.log + +# Target implementations +STAMP = echo done > + +$(PROJECT).syn.rpt: smart.log syn.chg $(SOURCE_FILES) + quartus_syn $(PROJECT) $(SYN_ARGS) + $(STAMP) fit.chg + +$(PROJECT).fit.rpt: smart.log fit.chg $(PROJECT).syn.rpt + quartus_fit $(PROJECT) $(FIT_ARGS) + $(STAMP) asm.chg + $(STAMP) sta.chg + +$(PROJECT).asm.rpt: 
smart.log asm.chg $(PROJECT).fit.rpt + quartus_asm $(PROJECT) $(ASM_ARGS) + +$(PROJECT).sta.rpt: smart.log sta.chg $(PROJECT).fit.rpt + quartus_sta $(PROJECT) $(STA_ARGS) + +smart.log: $(PROJECT_FILES) + quartus_sh --determine_smart_action $(PROJECT) > smart.log + +# Project initialization +$(PROJECT_FILES): + quartus_sh -t ../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src $(SRC_FILE) -sdc ../project.sdc -inc "../../../rtl;../../../rtl/libs;../../../rtl/interfaces;../../../rtl/pipe_regs;../../../rtl/cache" + +syn.chg: + $(STAMP) syn.chg + +fit.chg: + $(STAMP) fit.chg + +sta.chg: + $(STAMP) sta.chg + +asm.chg: + $(STAMP) asm.chg + +program: $(PROJECT).sof + quartus_pgm --no_banner --mode=jtag -o "$(PROJECT).sof" + +clean: + rm -rf bin *.rpt *.chg *.qsf *.qpf *.qws *.log *.htm *.eqn *.pin *.sof *.pof qdb incremental_db tmp-clearbox diff --git a/hw/syn/quartus/project.sdc b/hw/syn/quartus/project.sdc index 3c588f3b..59686a41 100644 --- a/hw/syn/quartus/project.sdc +++ b/hw/syn/quartus/project.sdc @@ -1,6 +1,6 @@ set_time_format -unit ns -decimal_places 3 -create_clock -name {clk} -period "250 MHz" -waveform { 0.0 1.0 } [get_ports {clk}] +create_clock -name {clk} -period "200 MHz" -waveform { 0.0 1.0 } [get_ports {clk}] derive_pll_clocks -create_base_clocks derive_clock_uncertainty