diff --git a/driver/sw/rtlsim/Makefile b/driver/sw/rtlsim/Makefile index ca3991b4..c64f1b12 100644 --- a/driver/sw/rtlsim/Makefile +++ b/driver/sw/rtlsim/Makefile @@ -1,5 +1,5 @@ -CFLAGS += -std=c++11 -O3 -Wall -Wextra -pedantic -Wfatal-errors -#CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -pedantic -Wfatal-errors +#CFLAGS += -std=c++11 -O3 -Wall -Wextra -pedantic -Wfatal-errors +CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -pedantic -Wfatal-errors USE_MULTICORE=1 @@ -23,10 +23,9 @@ SRCS = vortex.cpp ../vx_utils.cpp ../../../rtl/simulate/$(RTL_TOP).cpp RTL_INCLUDE = -I../../../rtl -I../../../rtl/interfaces -I../../../rtl/cache -I../../../rtl/VX_cache -I../../../rtl/shared_memory -I../../../rtl/pipe_regs -I../../../rtl/compat -THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))') -VL_FLAGS += --threads $(THREADS) - -VL_FLAGS += -Wno-UNOPTFLAT -Wno-WIDTH +# Enable Verilator multithreaded simulation +#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))') +#VL_FLAGS += --threads $(THREADS) VL_FLAGS += -Wno-UNDRIVEN --Wno-PINMISSING -Wno-STMTDLY -Wno-WIDTH -Wno-UNSIGNED -Wno-UNOPTFLAT -Wno-LITENDIAN diff --git a/driver/tests/demo/Makefile b/driver/tests/demo/Makefile index 9d3049f9..48effb87 100644 --- a/driver/tests/demo/Makefile +++ b/driver/tests/demo/Makefile @@ -43,16 +43,16 @@ $(PROJECT): $(SRCS) $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -L../../sw/simx -lvortex -o $@ run-fpga: $(PROJECT) - LD_LIBRARY_PATH=../../sw/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) -f kernel.bin + LD_LIBRARY_PATH=../../sw/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) -f kernel.bin -n 16 run-ase: $(PROJECT) - LD_LIBRARY_PATH=../../sw/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) -f kernel.bin + LD_LIBRARY_PATH=../../sw/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) -f kernel.bin -n 16 run-rtlsim: $(PROJECT) - LD_LIBRARY_PATH=../../sw/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) -f kernel.bin + LD_LIBRARY_PATH=../../sw/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) -f kernel.bin -n 16 run-simx: $(PROJECT) - LD_LIBRARY_PATH=../../sw/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) -f kernel.bin + LD_LIBRARY_PATH=../../sw/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) -f kernel.bin -n 16 .depend: $(SRCS) $(CXX) $(CXXFLAGS) -MM $^ > .depend; diff --git a/driver/tests/demo/demo b/driver/tests/demo/demo index 58215168..72482b23 100755 Binary files a/driver/tests/demo/demo and b/driver/tests/demo/demo differ diff --git a/driver/tests/demo/demo.cpp b/driver/tests/demo/demo.cpp index 9e486d82..386b8ff4 100644 --- a/driver/tests/demo/demo.cpp +++ b/driver/tests/demo/demo.cpp @@ -178,6 +178,66 @@ int main(int argc, char *argv[]) { return -1; } + // flush the destination buffer caches + std::cout << "flush the destination buffer caches" << std::endl; + ret = vx_flush_caches(device, kernel_arg.dst_ptr, buf_size); + if (ret != 0) { + cleanup(); + return -1; + } + + // download destination buffer + std::cout << "download destination buffer" << std::endl; + ret = vx_copy_from_dev(buffer, kernel_arg.dst_ptr, buf_size, 0); + if (ret != 0) { + cleanup(); + return -1; + } + + // verify result + std::cout << "verify result" << std::endl; + { + auto buf_ptr = (int*)vx_host_ptr(buffer); + for (uint32_t i = 0; i < num_points; ++i) { + int ref = i * i; + int cur = buf_ptr[i]; + if (cur != ref) { + ++errors; + } + } + } + + if (errors != 0) { + printf("Found %d errors!\n", errors); + printf("FAILED!\n"); + cleanup(); + return -1; + } + + // start device + std::cout << "start device" << std::endl; + ret = vx_start(device); + if (ret != 0) { + cleanup(); + return -1; + } + + // wait for completion + std::cout << "wait for completion" << std::endl; + ret = vx_ready_wait(device, -1); + if (ret != 0) { + cleanup(); + return -1; + } + + // flush the destination buffer caches + std::cout << "flush the destination buffer caches" << std::endl; + ret = vx_flush_caches(device, kernel_arg.dst_ptr, buf_size); + if (ret != 0) { + cleanup(); + return -1; + } + // download destination buffer std::cout << "download destination buffer" << std::endl; ret = vx_copy_from_dev(buffer, kernel_arg.dst_ptr, buf_size, 0); diff --git a/driver/tests/demo/kernel.bin b/driver/tests/demo/kernel.bin index 80593892..2e2205bb 100755 Binary files a/driver/tests/demo/kernel.bin and b/driver/tests/demo/kernel.bin differ diff --git a/rtl/simulate/Vortex.cpp b/rtl/simulate/Vortex.cpp index 050b4fd1..7de2504e 100644 --- a/rtl/simulate/Vortex.cpp +++ b/rtl/simulate/Vortex.cpp @@ -257,7 +257,7 @@ bool Vortex::is_busy() { void Vortex::send_snoops(uint32_t mem_addr, uint32_t size) { // align address to LLC block boundaries - auto aligned_addr_start = GLOBAL_BLOCK_SIZE_BYTES * ((mem_addr + GLOBAL_BLOCK_SIZE_BYTES - 1) / GLOBAL_BLOCK_SIZE_BYTES); + auto aligned_addr_start = GLOBAL_BLOCK_SIZE_BYTES * (mem_addr / GLOBAL_BLOCK_SIZE_BYTES); auto aligned_addr_end = GLOBAL_BLOCK_SIZE_BYTES * ((mem_addr + size + GLOBAL_BLOCK_SIZE_BYTES - 1) / GLOBAL_BLOCK_SIZE_BYTES); // submit snoop requests for the needed blocks diff --git a/rtl/simulate/Vortex_SOC.cpp b/rtl/simulate/Vortex_SOC.cpp index d12c2d40..8bfc94be 100644 --- a/rtl/simulate/Vortex_SOC.cpp +++ b/rtl/simulate/Vortex_SOC.cpp @@ -195,7 +195,7 @@ bool Vortex_SOC::is_busy() { void Vortex_SOC::send_snoops(uint32_t mem_addr, uint32_t size) { // align address to LLC block boundaries - auto aligned_addr_start = GLOBAL_BLOCK_SIZE_BYTES * ((mem_addr + GLOBAL_BLOCK_SIZE_BYTES - 1) / GLOBAL_BLOCK_SIZE_BYTES); + auto aligned_addr_start = GLOBAL_BLOCK_SIZE_BYTES * (mem_addr / GLOBAL_BLOCK_SIZE_BYTES); auto aligned_addr_end = GLOBAL_BLOCK_SIZE_BYTES * ((mem_addr + size + GLOBAL_BLOCK_SIZE_BYTES - 1) / GLOBAL_BLOCK_SIZE_BYTES); // submit snoop requests for the needed blocks