diff --git a/doc/execute_opencl_on_vortex.md b/doc/execute_opencl_on_vortex.md new file mode 100644 index 00000000..fe7e74b8 --- /dev/null +++ b/doc/execute_opencl_on_vortex.md @@ -0,0 +1,128 @@ +# Execute OpenCL on Vortex backend + +## Requirements +- [Vortex](https://github.com/vortexgpgpu/vortex) +- [POCL for Vortex](https://github.com/vortexgpgpu/pocl) +- [riscv-toolchain](https://github.com/riscv-collab/riscv-gnu-toolchain) +- [llvm-riscv](https://github.com/llvm-mirror/llvm) + +For installation details, please see [Basic Installation](https://github.com/vortexgpgpu/vortex#basic-installation). + +**For Ubuntu 18.04 users, you can directly download pre-built toolchains with the [toolchain_install.sh](https://github.com/vortexgpgpu/vortex/blob/master/ci/toolchain_install.sh) script.** +```bash +# please modify the DESTDIR variable in the script before execution +bash toolchain_install.sh -all +``` +Assuming we have installed all dependencies under the `/opt` path, we get the following environment: +```bash +tree -L 2 /opt +''' +/opt/ +├── llvm-riscv +│ ├── bin +│ ├── include +│ ├── lib +│ ├── libexec +│ └── share +├── pocl +│ ├── compiler +│ └── runtime +├── riscv-gnu-toolchain +│ ├── bin +│ ├── drops +│ ├── include +│ ├── lib +│ ├── libexec +│ ├── riscv32-unknown-elf +│ ├── share +│ └── var +└── verilator + ├── bin + ├── examples + ├── include + ├── verilator-config.cmake + └── verilator-config-version.cmake +''' +``` +## Execute OpenCL on Vortex +In this tutorial, we show an example of executing a vecadd program on the SIMX backend. +To execute an OpenCL program on Vortex, we follow these steps: +- Compile the [OpenCL kernels](https://github.com/vortexgpgpu/vortex/blob/master/tests/opencl/vecadd/kernel.cl) into a RISC-V binary with the POCL compiler. +- Compile the [OpenCL host](https://github.com/vortexgpgpu/vortex/blob/master/tests/opencl/vecadd/main.cc) and link it with the Vortex driver (```-lvortex```). +- Execute the compiled host program on a backend. 
+ +Thus, we can write a Makefile as follows: +```Makefile +LLVM_PREFIX ?= /opt/llvm-riscv +RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain +SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf +POCL_CC_PATH ?= /opt/pocl/compiler +POCL_RT_PATH ?= /opt/pocl/runtime + +OPTS ?= -n64 + +# please edit these two variables to match your environment +VORTEX_DRV_PATH ?= $(realpath ../../../driver) +VORTEX_RT_PATH ?= $(realpath ../../../runtime) + +K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small" +K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections" +K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm" + +CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors + +CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter + +CXXFLAGS += -I$(POCL_RT_PATH)/include + +LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex + +PROJECT = vecadd + +SRCS = main.cc + +all: $(PROJECT) kernel.pocl + +kernel.pocl: kernel.cl + LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl + +$(PROJECT): $(SRCS) + $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ + +run-fpga: $(PROJECT) kernel.pocl + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +run-asesim: $(PROJECT) kernel.pocl + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +run-vlsim: $(PROJECT) kernel.pocl + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) 
$(OPTS) + +run-simx: $(PROJECT) kernel.pocl + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +run-rtlsim: $(PROJECT) kernel.pocl + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +.depend: $(SRCS) + $(CXX) $(CXXFLAGS) -MM $^ > .depend; + +clean: + rm -rf $(PROJECT) *.o .depend + +clean-all: clean + rm -rf *.pocl *.dump + +ifneq ($(MAKECMDGOALS),clean) + -include .depend +endif +``` + +First, build the host program. +```bash +make all +``` +If we want to execute on SIMX, we can execute the command below. +```bash +make run-simx +``` diff --git a/hw/rtl/VX_decode.sv b/hw/rtl/VX_decode.sv index 7186a553..89d70d7a 100644 --- a/hw/rtl/VX_decode.sv +++ b/hw/rtl/VX_decode.sv @@ -194,7 +194,6 @@ module VX_decode #( end `INST_F: begin ex_type = `EX_LSU; - op_type = `INST_OP_BITS'(func3[0]); op_mod = `INST_MOD_BITS'(1); end `INST_SYS : begin @@ -387,6 +386,12 @@ module VX_decode #( `USED_IREG (rs3); end `endif + 3'h6: begin + ex_type = `EX_LSU; + op_type = `INST_OP_BITS'(`INST_LSU_LW); + op_mod = `INST_MOD_BITS'(2); + `USED_IREG (rs1); + end default:; endcase end diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index c5c27899..c3706000 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -154,7 +154,8 @@ `define INST_LSU_BITS 4 `define INST_LSU_FMT(x) x[2:0] `define INST_LSU_WSIZE(x) x[1:0] -`define INST_LSU_IS_FENCE(x) x[0] +`define INST_LSU_IS_FENCE(x) (3'h1 == x) +`define INST_LSU_IS_PREFETCH(x) (3'h2 == x) `define INST_FENCE_BITS 1 `define INST_FENCE_D 1'h0 diff --git a/hw/rtl/VX_instr_demux.sv b/hw/rtl/VX_instr_demux.sv index 60261245..24161f47 100644 --- a/hw/rtl/VX_instr_demux.sv +++ b/hw/rtl/VX_instr_demux.sv @@ -60,17 +60,18 @@ module VX_instr_demux ( wire lsu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_LSU); wire [`INST_LSU_BITS-1:0] lsu_op_type = `INST_LSU_BITS'(ibuffer_if.op_type); wire lsu_is_fence = 
`INST_LSU_IS_FENCE(ibuffer_if.op_mod); + wire lsu_is_prefetch = `INST_LSU_IS_PREFETCH(ibuffer_if.op_mod); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32)), + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32) + 1), .OUT_REG (1) ) lsu_buffer ( .clk (clk), .reset (reset), .valid_in (lsu_req_valid), .ready_in (lsu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, lsu_op_type, lsu_is_fence, ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}), - .data_out ({lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.op_type, lsu_req_if.is_fence, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data}), + .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, lsu_op_type, lsu_is_fence, ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, lsu_is_prefetch}), + .data_out ({lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.op_type, lsu_req_if.is_fence, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data, lsu_req_if.is_prefetch}), .valid_out (lsu_req_if.valid), .ready_out (lsu_req_if.ready) ); diff --git a/hw/rtl/VX_lsu_unit.sv b/hw/rtl/VX_lsu_unit.sv index cf4d58cf..8541f4c6 100644 --- a/hw/rtl/VX_lsu_unit.sv +++ b/hw/rtl/VX_lsu_unit.sv @@ -40,6 +40,7 @@ module VX_lsu_unit #( wire [`NW_BITS-1:0] req_wid; wire [31:0] req_pc; wire req_is_dup; + wire req_is_prefetch; wire mbuf_empty; @@ -79,15 +80,17 @@ module VX_lsu_unit #( wire lsu_valid = lsu_req_if.valid && ~fence_wait; + wire lsu_wb = lsu_req_if.wb | lsu_req_if.is_prefetch; + VX_pipe_register #( - .DATAW (1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * ADDR_TYPEW) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)), + .DATAW (1 + 1 + 1 + `NW_BITS + `NUM_THREADS + 32 + 
(`NUM_THREADS * 32) + (`NUM_THREADS * ADDR_TYPEW) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)), .RESETW (1) ) req_pipe_reg ( .clk (clk), .reset (reset), .enable (!stall_in), - .data_in ({lsu_valid, lsu_is_dup, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.store_data}), - .data_out ({req_valid, req_is_dup, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data}) + .data_in ({lsu_valid, lsu_is_dup, lsu_req_if.is_prefetch, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_wb, lsu_req_if.store_data}), + .data_out ({req_valid, req_is_dup, req_is_prefetch, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data}) ); // Can accept new request? @@ -99,8 +102,10 @@ module VX_lsu_unit #( wire rsp_wb; wire [`INST_LSU_BITS-1:0] rsp_type; wire rsp_is_dup; + wire rsp_is_prefetch; `UNUSED_VAR (rsp_type) + `UNUSED_VAR (rsp_is_prefetch) reg [`LSUQ_SIZE-1:0][`NUM_THREADS-1:0] rsp_rem_mask; wire [`NUM_THREADS-1:0] rsp_rem_mask_n; @@ -131,10 +136,13 @@ module VX_lsu_unit #( wire mbuf_pop = dcache_rsp_fire && (0 == rsp_rem_mask_n); assign mbuf_raddr = dcache_rsp_if.tag[ADDR_TYPEW +: `LSUQ_ADDR_BITS]; - `UNUSED_VAR (dcache_rsp_if.tag) + `UNUSED_VAR (dcache_rsp_if.tag) + + // do not writeback from software prefetch + wire req_wb2 = req_wb && ~req_is_prefetch; VX_index_buffer #( - .DATAW (`NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1), + .DATAW (`NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1 + 1), .SIZE (`LSUQ_SIZE) ) req_metadata ( .clk (clk), @@ -142,8 +150,8 @@ module VX_lsu_unit #( .write_addr (mbuf_waddr), .acquire_slot (mbuf_push), .read_addr (mbuf_raddr), - .write_data ({req_wid, req_pc, req_tmask, req_rd, req_wb, req_type, req_offset, req_is_dup}), - .read_data 
({rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup}), + .write_data ({req_wid, req_pc, req_tmask, req_rd, req_wb2, req_type, req_offset, req_is_dup, req_is_prefetch}), + .read_data ({rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup, rsp_is_prefetch}), .release_addr (mbuf_raddr), .release_slot (mbuf_pop), .full (mbuf_full), @@ -346,7 +354,7 @@ module VX_lsu_unit #( `TRACE_ARRAY1D(dcache_req_if.data, `NUM_THREADS); dpi_trace("\n"); end else begin - dpi_trace("%d: D$%0d Rd Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire); + dpi_trace("%d: D$%0d Rd Req: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire); `TRACE_ARRAY1D(req_addr, `NUM_THREADS); dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen); `TRACE_ARRAY1D(req_addr_type, `NUM_THREADS); @@ -354,8 +362,8 @@ module VX_lsu_unit #( end end if (dcache_rsp_fire) begin - dpi_trace("%d: D$%0d Rsp: wid=%0d, PC=%0h, tmask=%b, tag=%0h, rd=%0d, data=", - $time, CORE_ID, rsp_wid, rsp_pc, dcache_rsp_if.tmask, mbuf_raddr, rsp_rd); + dpi_trace("%d: D$%0d Rsp: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, tag=%0h, rd=%0d, data=", + $time, CORE_ID, rsp_is_prefetch, rsp_wid, rsp_pc, dcache_rsp_if.tmask, mbuf_raddr, rsp_rd); `TRACE_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS); dpi_trace(", is_dup=%b\n", rsp_is_dup); end diff --git a/hw/rtl/interfaces/VX_lsu_req_if.sv b/hw/rtl/interfaces/VX_lsu_req_if.sv index 36b4e778..4f31b17c 100644 --- a/hw/rtl/interfaces/VX_lsu_req_if.sv +++ b/hw/rtl/interfaces/VX_lsu_req_if.sv @@ -17,6 +17,7 @@ interface VX_lsu_req_if (); wire [`NR_BITS-1:0] rd; wire wb; wire ready; + wire is_prefetch; modport master ( output valid, @@ -30,6 +31,7 @@ interface VX_lsu_req_if (); output offset, output rd, output wb, + output is_prefetch, input ready ); @@ -45,6 +47,7 @@ interface VX_lsu_req_if (); input offset, input rd, input wb, + input 
is_prefetch, output ready ); diff --git a/runtime/include/vx_intrinsics.h b/runtime/include/vx_intrinsics.h index f41bb0a3..9c3149d7 100644 --- a/runtime/include/vx_intrinsics.h +++ b/runtime/include/vx_intrinsics.h @@ -149,6 +149,11 @@ inline void vx_barrier(unsigned barried_id, unsigned num_warps) { asm volatile (".insn s 0x6b, 4, %1, 0(%0)" :: "r"(barried_id), "r"(num_warps)); } +// Prefetch +inline void vx_prefetch(unsigned addr) { + asm volatile (".insn s 0x6b, 6, x0, 0(%0)" :: "r"(addr) ); +} + // Return active warp's thread id inline int vx_thread_id() { int result; diff --git a/sim/simX/decode.cpp b/sim/simX/decode.cpp index 01570aab..dbc7115a 100644 --- a/sim/simX/decode.cpp +++ b/sim/simX/decode.cpp @@ -182,6 +182,7 @@ static const char* op_string(const Instr &instr) { case 2: return "SPLIT"; case 3: return "JOIN"; case 4: return "BAR"; + case 6: return "PREFETCH"; default: std::abort(); } diff --git a/sim/simX/execute.cpp b/sim/simX/execute.cpp index 01271e59..47bf4e04 100644 --- a/sim/simX/execute.cpp +++ b/sim/simX/execute.cpp @@ -712,6 +712,11 @@ void Warp::execute(const Instr &instr, Pipeline *pipeline) { pipeline->stall_warp = true; runOnce = true; } break; + case 6: { + // PREFETCH + int addr = rsdata[0]; + printf("*** PREFETCHED %d ***\n", addr); + } break; default: std::abort(); }