From f8e304bc193593e7bd1197a69364cdce3b21aa28 Mon Sep 17 00:00:00 2001 From: "Lyons, Ethan Tyler" Date: Thu, 21 Nov 2019 21:41:11 -0500 Subject: [PATCH 1/8] Synthesis Compatible --- rtl/modelsim/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/rtl/modelsim/Makefile b/rtl/modelsim/Makefile index e906236f..3a514cf2 100644 --- a/rtl/modelsim/Makefile +++ b/rtl/modelsim/Makefile @@ -7,6 +7,7 @@ SRC = \ vortex_dpi.cpp \ vortex_tb.v \ ../VX_define.v \ +../VX_define_synth.v \ ../interfaces/VX_branch_response_inter.v \ ../interfaces/VX_csr_req_inter.v \ ../interfaces/VX_csr_wb_inter.v \ From fe241bfa30859ab11dff8f9c044479bcb2047b76 Mon Sep 17 00:00:00 2001 From: "Lyons, Ethan Tyler" Date: Thu, 21 Nov 2019 21:41:41 -0500 Subject: [PATCH 2/8] Synthesis Compatible --- rtl/cache/VX_d_cache.v | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/rtl/cache/VX_d_cache.v b/rtl/cache/VX_d_cache.v index fd6c9641..78b407f7 100644 --- a/rtl/cache/VX_d_cache.v +++ b/rtl/cache/VX_d_cache.v @@ -304,9 +304,15 @@ module VX_d_cache // 0; wire[1:0] byte_select = bank_addr[1:0]; + wire[TAG_SIZE_END:TAG_SIZE_START] cache_tag = bank_addr[ADDR_TAG_END:ADDR_TAG_START]; + + `ifdef SYN_FUNC + wire[OFFSET_SIZE_END:OFFSET_SIZE_START] cache_offset = 0; + wire[IND_SIZE_END:IND_SIZE_START] cache_index = 0; + `else wire[OFFSET_SIZE_END:OFFSET_SIZE_START] cache_offset = bank_addr[ADDR_OFFSET_END:ADDR_OFFSET_START]; wire[IND_SIZE_END:IND_SIZE_START] cache_index = bank_addr[ADDR_IND_END:ADDR_IND_START]; - wire[TAG_SIZE_END:TAG_SIZE_START] cache_tag = bank_addr[ADDR_TAG_END:ADDR_TAG_START]; + `endif wire normal_valid_in = valid_per_bank[bank_id]; From 2efc954916ccd1ccc3830221f287ad2ed57736e5 Mon Sep 17 00:00:00 2001 From: "Lyons, Ethan Tyler" Date: Thu, 21 Nov 2019 21:42:34 -0500 Subject: [PATCH 3/8] Synthesis Compatible --- rtl/VX_alu.v | 70 +++++++++++++++++++++++++++++++++++++++++-- rtl/VX_define.v | 8 ++--- rtl/VX_define_synth.v | 2 ++ 3 files changed, 73 insertions(+), 7 deletions(-) create mode 100644 rtl/VX_define_synth.v diff --git a/rtl/VX_alu.v b/rtl/VX_alu.v index 3b308297..9688aad2 100644 --- a/rtl/VX_alu.v +++ b/rtl/VX_alu.v @@ -1,4 +1,3 @@ - `include "VX_define.v" module VX_alu( @@ -13,6 +12,71 @@ module VX_alu( ); + `ifdef SYN_FUNC + wire which_in2; + + wire[31:0] ALU_in1; + wire[31:0] ALU_in2; + wire[63:0] ALU_in1_mult; + wire[63:0] ALU_in2_mult; + wire[31:0] upper_immed; + wire[31:0] div_result; + wire[31:0] rem_result; + + + assign which_in2 = in_rs2_src == `RS2_IMMED; + + assign ALU_in1 = in_1; + + assign ALU_in2 = which_in2 ? in_itype_immed : in_2; + + + assign upper_immed = {in_upper_immed, {12{1'b0}}}; + + + + //always @(posedge `MUL) begin + + + /* verilator lint_off UNUSED */ + + + wire[63:0] alu_in1_signed = {{32{ALU_in1[31]}}, ALU_in1}; + wire[63:0] alu_in2_signed = {{32{ALU_in2[31]}}, ALU_in2}; + assign ALU_in1_mult = (in_alu_op == `MULHU || in_alu_op == `DIVU || in_alu_op == `REMU) ? {32'b0, ALU_in1} : alu_in1_signed; + assign ALU_in2_mult = (in_alu_op == `MULHU || in_alu_op == `MULHSU || in_alu_op == `DIVU || in_alu_op == `REMU) ? {32'b0, ALU_in2} : alu_in2_signed; + wire[63:0] mult_result = ALU_in1_mult * ALU_in2_mult; + + /* verilator lint_on UNUSED */ + + always @(in_alu_op or ALU_in1 or ALU_in2) begin + case(in_alu_op) + `ADD: out_alu_result = $signed(ALU_in1) + $signed(ALU_in2); + `SUB: out_alu_result = $signed(ALU_in1) - $signed(ALU_in2); + `SLLA: out_alu_result = ALU_in1 << ALU_in2[4:0]; + `SLT: out_alu_result = ($signed(ALU_in1) < $signed(ALU_in2)) ? 32'h1 : 32'h0; + `SLTU: out_alu_result = ALU_in1 < ALU_in2 ? 32'h1 : 32'h0; + `XOR: out_alu_result = ALU_in1 ^ ALU_in2; + `SRL: out_alu_result = ALU_in1 >> ALU_in2[4:0]; + `SRA: out_alu_result = $signed(ALU_in1) >>> ALU_in2[4:0]; + `OR: out_alu_result = ALU_in1 | ALU_in2; + `AND: out_alu_result = ALU_in2 & ALU_in1; + `SUBU: out_alu_result = (ALU_in1 >= ALU_in2) ? 32'h0 : 32'hffffffff; + `LUI_ALU: out_alu_result = upper_immed; + `AUIPC_ALU: out_alu_result = $signed(in_curr_PC) + $signed(upper_immed); + `MUL: out_alu_result = mult_result[31:0]; + `MULH: out_alu_result = mult_result[63:32]; + `MULHSU: out_alu_result = mult_result[63:32]; + `MULHU: out_alu_result = mult_result[63:32]; + `DIV: out_alu_result = (ALU_in2 == 0) ? 32'hffffffff : $signed($signed(ALU_in1) / $signed(ALU_in2)); + `DIVU: out_alu_result = (ALU_in2 == 0) ? 32'hffffffff : ALU_in1 / ALU_in2; + `REM: out_alu_result = (ALU_in2 == 0) ? ALU_in1 : $signed($signed(ALU_in1) % $signed(ALU_in2)); + `REMU: out_alu_result = (ALU_in2 == 0) ? ALU_in1 : ALU_in1 % ALU_in2; + default: out_alu_result = 32'h0; + endcase // in_alu_op + end + + `else wire which_in2; wire[31:0] ALU_in1; @@ -69,7 +133,7 @@ module VX_alu( `REMU: out_alu_result = (ALU_in2 == 0) ? ALU_in1 : ALU_in1 % ALU_in2; default: out_alu_result = 32'h0; endcase // in_alu_op - end - + end + `endif endmodule // VX_alu \ No newline at end of file diff --git a/rtl/VX_define.v b/rtl/VX_define.v index 8f77fdb3..8262bf46 100644 --- a/rtl/VX_define.v +++ b/rtl/VX_define.v @@ -1,11 +1,10 @@ +`include "./VX_define_synth.v" + + -`define NT 4 `define NT_M1 (`NT-1) // NW_M1 is actually log2(NW) -//`define NW_M1 (4-1) - -`define NW 8 `define NW_M1 (`CLOG2(`NW)) // Uncomment the below line if NW=1 @@ -13,6 +12,7 @@ // `define SYN 1 //`define ASIC 1 +//`define SYN_FUNC 1 `define NUM_BARRIERS 4 diff --git a/rtl/VX_define_synth.v b/rtl/VX_define_synth.v new file mode 100644 index 00000000..0444fe94 --- /dev/null +++ b/rtl/VX_define_synth.v @@ -0,0 +1,2 @@ +`define NT 4 +`define NW 8 From d95506edd223e8997294671c5d6214b2f613df3e Mon Sep 17 00:00:00 2001 From: "Lyons, Ethan Tyler" Date: Thu, 21 Nov 2019 21:43:43 -0500 Subject: [PATCH 4/8] Synthesis Compatible --- syn/Makefile | 32 +++++++++++++++++++++++--- syn/esyn.tcl | 53 +++++++++++++++++++++++++++++++++++++++++++ syn/run_mult_synth.sh | 28 +++++++++++++++++++++++ 3 files changed, 110 insertions(+), 3 deletions(-) create mode 100644 syn/esyn.tcl create mode 100644 syn/run_mult_synth.sh diff --git a/syn/Makefile b/syn/Makefile index 1cfe585c..ca3094bc 100644 --- a/syn/Makefile +++ b/syn/Makefile @@ -1,7 +1,33 @@ -all: syn +SCRIPT_DIR=./scripts + +all: dc -syn: - dc_shell-t -f fsyn.tcl 2>&1 | tee vortex_syn.log \ No newline at end of file +#syn: + #dc_shell-t -f esyn.tcl 2>&1 | tee vortex_syn.log + #dc_shell -f esyn.tcl 2>&1 | tee vortex_syn.log + #dc_shell -f $(SCRIPT_DIR)/dc/dc_script.tcl + +dc: + rm -rf rpt + mkdir rpt + dc_shell -f esyn.tcl 2>&1 | tee vortex_syn.log + +clean: + rm -f simv + rm -f *.vcd + rm -f *.key + rm -rf csrc/ + rm -rf *.rpt + rm -rf *.log + rm -rf *.svf + rm -rf *.ddc + rm -rf results_synthesized.v + rm -rf results_synthesized.sdc + rm -rf alib-52/ + rm -rf rpt/ + rm -rf simv.daidir/ + rm -rf encounter* + rm -rf ./synth_out \ No newline at end of file diff --git a/syn/esyn.tcl b/syn/esyn.tcl new file mode 100644 index 00000000..400023b1 --- /dev/null +++ b/syn/esyn.tcl @@ -0,0 +1,53 @@ +#set search_path [concat /nethome/dshim8/Desktop/GTCAD-3DPKG-v3/example/tech/cln28hpm/2d_db/ /nethome/dshim8/Desktop/GTCAD-3DPKG-v3/example/tech/cln28hpm/2d_hard_db/ ../rtl/ ../rtl/interfaces ../rtl/pipe_regs ../rtl/shared_memory ../rtl/cache ../models/memory/cln28hpm/2d_hardmacro_db] +set search_path [concat ../rtl/ ../rtl/interfaces ../rtl/pipe_regs ../rtl/shared_memory ../rtl/cache ../models/memory/cln28hpm/2d_hardmacro_db] +set link_library [concat ./NanGate_15nm_OCL.db] +set symbol_library {} +set target_library [concat ./NanGate_15nm_OCL.db] + +set verilog_files [ list VX_countones.v VX_priority_encoder_w_mask.v VX_dram_req_rsp_inter.v VX_cache_data_per_index.v VX_Cache_Bank.v VX_cache_data.v VX_d_cache.v VX_bank_valids.v VX_priority_encoder_sm.v VX_shared_memory.v VX_shared_memory_block.v VX_dmem_controller.v VX_generic_priority_encoder.v VX_generic_stack.v VX_join_inter.v VX_csr_wrapper.v VX_csr_req_inter.v VX_csr_wb_inter.v VX_gpgpu_inst.v VX_gpu_inst_req_inter.v VX_wstall_inter.v VX_inst_exec_wb_inter.v VX_lsu.v VX_execute_unit.v VX_lsu_addr_gen.v VX_inst_multiplex.v VX_exec_unit_req_inter.v VX_lsu_req_inter.v VX_alu.v VX_back_end.v VX_gpr_stage.v VX_gpr_data_inter.v VX_csr_handler.v VX_decode.v VX_define.v VX_define_synth.v VX_scheduler.v VX_fetch.v VX_front_end.v VX_generic_register.v VX_gpr.v VX_gpr_wrapper.v VX_priority_encoder.v VX_warp_scheduler.v VX_writeback.v byte_enabled_simple_dual_port_ram.v VX_branch_response_inter.v VX_dcache_request_inter.v VX_dcache_response_inter.v VX_frE_to_bckE_req_inter.v VX_gpr_clone_inter.v VX_gpr_jal_inter.v VX_gpr_read_inter.v VX_gpr_wspawn_inter.v VX_icache_request_inter.v VX_icache_response_inter.v VX_inst_mem_wb_inter.v VX_inst_meta_inter.v VX_jal_response_inter.v VX_mem_req_inter.v VX_mw_wb_inter.v VX_warp_ctl_inter.v VX_wb_inter.v VX_d_e_reg.v VX_f_d_reg.v Vortex.v VX_cache_bank_valid.v \ + ] +# set verilog_files [ list Vortex.v VX_countones.v VX_priority_encoder_w_mask.v VX_dram_req_rsp_inter.v cache_set.v VX_Cache_Bank.v VX_Cache_Block_DM.v VX_cache_data.v VX_d_cache.v VX_generic_pc.v VX_bank_valids.v VX_priority_encoder_sm.v VX_shared_memory.v VX_shared_memory_block.v VX_dmem_controller.v VX_generic_priority_encoder.v VX_generic_stack.v VX_join_inter.v VX_csr_wrapper.v VX_csr_req_inter.v VX_csr_wb_inter.v VX_gpgpu_inst.v VX_gpu_inst_req_inter.v VX_wstall_inter.v VX_inst_exec_wb_inter.v VX_lsu.v VX_execute_unit.v VX_lsu_addr_gen.v VX_inst_multiplex.v VX_exec_unit_req_inter.v VX_lsu_req_inter.v VX_alu.v VX_back_end.v VX_gpr_stage.v VX_gpr_data_inter.v VX_csr_handler.v VX_decode.v VX_define.v VX_scheduler.v VX_fetch.v VX_front_end.v VX_generic_register.v VX_gpr.v VX_gpr_wrapper.v VX_one_counter.v VX_priority_encoder.v VX_warp_scheduler.v VX_writeback.v byte_enabled_simple_dual_port_ram.v VX_branch_response_inter.v VX_dcache_request_inter.v VX_dcache_response_inter.v VX_frE_to_bckE_req_inter.v VX_gpr_clone_inter.v VX_gpr_jal_inter.v VX_gpr_read_inter.v VX_gpr_wspawn_inter.v VX_icache_request_inter.v VX_icache_response_inter.v VX_inst_mem_wb_inter.v VX_inst_meta_inter.v VX_jal_response_inter.v VX_mem_req_inter.v VX_mw_wb_inter.v VX_warp_ctl_inter.v VX_wb_inter.v VX_d_e_reg.v VX_f_d_reg.v \ +# ] + +set top_level Vortex +analyze -format sverilog $verilog_files +#analyze -format sverilog -error=LINT-66 $verilog_files +elaborate Vortex +link + +set clk_freq 0.4 +set clk_period [expr 1000.0 / $clk_freq / 1.0] +create_clock [get_ports clk] -period $clk_period +set_max_fanout 20 [get_ports clk] +set_ideal_network [get_ports clk] + +set_max_fanout 20 [get_ports reset] +set_false_path -from [get_ports reset] +all_high_fanout -net -threshold 20 + +# set_register_merging Vortex FALSE +# set compile_seqmap_propagate_constants false +# set compile_seqmap_propagate_high_effort false + +check_design +compile_ultra -no_autoungroup +ungroup -all -flatten +uniquify + +define_name_rules verilog -remove_internal_net_bus -remove_port_bus +change_names -rule verilog -hierarchy + +report_qor +report_area +report_hierarchy +report_cell +report_reference +report_port +report_power + +write -hierarchy -format verilog -output Vortex.netlist.v +remove_ideal_network [get_ports clk] +set_propagated_clock [get_ports clk] +write_sdc -version 1.9 Vortex.sdc +write_file -format ddc -output Vortex.ddc +exit \ No newline at end of file diff --git a/syn/run_mult_synth.sh b/syn/run_mult_synth.sh new file mode 100644 index 00000000..81509aa7 --- /dev/null +++ b/syn/run_mult_synth.sh @@ -0,0 +1,28 @@ +#!/bin/bash +set top_level = Vortex + +source /tools/synopsys/synthesis/j201409/cshrc.syn +set cur_dir = `pwd` +echo $cur_dir + +for number_of_warps in 2 4 8 16 32; do + for number_of_threads in 2 4 8 16 32; do + + echo "Warp Count: $number_of_warps Thread Count: $number_of_threads Launched" + echo "\`define NT $number_of_threads" > ../rtl/VX_define_synth.v + echo "\`define NW $number_of_warps" >> ../rtl/VX_define_synth.v + make dc | tee run.log + sleep 30 + moved_filename="${number_of_warps}_Warps__${number_of_threads}_threads__400MHz.log" + mv ./vortex_syn.log ../../$moved_filename + sleep 30 + + + + + echo "Warp Count: $number_of_warps Thread Count: $number_of_threads Finished" + done +done + + +echo "Done!" From 1788207aa6db316fc1a41ed69d25831b440615d8 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Thu, 21 Nov 2019 23:32:06 -0500 Subject: [PATCH 5/8] bfs benchmark --- benchmarks/opencl/bfs/CLHelper.h | 814 +++++++++++++++++++++++++++++++ benchmarks/opencl/bfs/Makefile | 4 +- benchmarks/opencl/bfs/kernel.cl | 53 ++ benchmarks/opencl/bfs/libbfs.a | Bin 0 -> 7846 bytes benchmarks/opencl/bfs/main.cc | 299 ++++++++++++ benchmarks/opencl/bfs/run | 1 + benchmarks/opencl/bfs/timer.cc | 78 +++ benchmarks/opencl/bfs/timer.h | 128 +++++ benchmarks/opencl/bfs/util.h | 72 +++ 9 files changed, 1447 insertions(+), 2 deletions(-) create mode 100755 benchmarks/opencl/bfs/CLHelper.h create mode 100755 benchmarks/opencl/bfs/kernel.cl create mode 100644 benchmarks/opencl/bfs/libbfs.a create mode 100755 benchmarks/opencl/bfs/main.cc create mode 100755 benchmarks/opencl/bfs/run create mode 100755 benchmarks/opencl/bfs/timer.cc create mode 100755 benchmarks/opencl/bfs/timer.h create mode 100755 benchmarks/opencl/bfs/util.h diff --git a/benchmarks/opencl/bfs/CLHelper.h b/benchmarks/opencl/bfs/CLHelper.h new file mode 100755 index 00000000..b9a873e4 --- /dev/null +++ b/benchmarks/opencl/bfs/CLHelper.h @@ -0,0 +1,814 @@ +//------------------------------------------ +//--cambine:helper function for OpenCL +//--programmer: Jianbin Fang +//--date: 27/12/2010 +//------------------------------------------ +#ifndef _CL_HELPER_ +#define _CL_HELPER_ + +#include +#include +#include +#include +#include + +using std::string; +using std::ifstream; +using std::cerr; +using std::endl; +using std::cout; +//#pragma OPENCL EXTENSION cl_nv_compiler_options:enable +#define WORK_DIM 2 //work-items dimensions + +struct oclHandleStruct +{ + cl_context context; + cl_device_id *devices; + cl_command_queue queue; + cl_program program; + cl_int cl_status; + std::string error_str; + std::vector kernel; +}; + +struct oclHandleStruct oclHandles; + +char kernel_file[100] = "Kernels.cl"; +int total_kernels = 2; +string kernel_names[2] = {"BFS_1", "BFS_2"}; +int work_group_size = 512; +int device_id_inused = 0; //deviced id used (default : 0) + +/* + * Converts the contents of a file into a string + */ +string FileToString(const string fileName) +{ + ifstream f(fileName.c_str(), ifstream::in | ifstream::binary); + + try + { + size_t size; + char* str; + string s; + + if(f.is_open()) + { + size_t fileSize; + f.seekg(0, ifstream::end); + size = fileSize = f.tellg(); + f.seekg(0, ifstream::beg); + + str = new char[size+1]; + if (!str) throw(string("Could not allocate memory")); + + f.read(str, fileSize); + f.close(); + str[size] = '\0'; + + s = str; + delete [] str; + return s; + } + } + catch(std::string msg) + { + cerr << "Exception caught in FileToString(): " << msg << endl; + if(f.is_open()) + f.close(); + } + catch(...) + { + cerr << "Exception caught in FileToString()" << endl; + if(f.is_open()) + f.close(); + } + string errorMsg = "FileToString()::Error: Unable to open file " + + fileName; + throw(errorMsg); +} +//--------------------------------------- +//Read command line parameters +// +void _clCmdParams(int argc, char* argv[]){ + for (int i =0; i < argc; ++i) + { + switch (argv[i][1]) + { + case 'g': //--g stands for size of work group + if (++i < argc) + { + sscanf(argv[i], "%u", &work_group_size); + } + else + { + std::cerr << "Could not read argument after option " << argv[i-1] << std::endl; + throw; + } + break; + case 'd': //--d stands for device id used in computaion + if (++i < argc) + { + sscanf(argv[i], "%u", &device_id_inused); + } + else + { + std::cerr << "Could not read argument after option " << argv[i-1] << std::endl; + throw; + } + break; + default: + ; + } + } + +} + +//--------------------------------------- +//Initlize CL objects +//--description: there are 5 steps to initialize all the OpenCL objects needed +//--revised on 04/01/2011: get the number of devices and +// devices have no relationship with context +void _clInit() +{ + int DEVICE_ID_INUSED = device_id_inused; + cl_int resultCL; + + oclHandles.context = NULL; + oclHandles.devices = NULL; + oclHandles.queue = NULL; + oclHandles.program = NULL; + + cl_uint deviceListSize; + + //----------------------------------------------- + //--cambine-1: find the available platforms and select one + + cl_uint numPlatforms; + cl_platform_id targetPlatform = NULL; + + resultCL = clGetPlatformIDs(0, NULL, &numPlatforms); + if (resultCL != CL_SUCCESS) + throw (string("InitCL()::Error: Getting number of platforms (clGetPlatformIDs)")); + //printf("number of platforms:%d\n",numPlatforms); //by cambine + + if (!(numPlatforms > 0)) + throw (string("InitCL()::Error: No platforms found (clGetPlatformIDs)")); + + cl_platform_id* allPlatforms = (cl_platform_id*) malloc(numPlatforms * sizeof(cl_platform_id)); + + resultCL = clGetPlatformIDs(numPlatforms, allPlatforms, NULL); + if (resultCL != CL_SUCCESS) + throw (string("InitCL()::Error: Getting platform ids (clGetPlatformIDs)")); + + /* Select the target platform. Default: first platform */ + targetPlatform = allPlatforms[0]; + for (int i = 0; i < numPlatforms; i++) + { + char pbuff[128]; + resultCL = clGetPlatformInfo( allPlatforms[i], + CL_PLATFORM_VENDOR, + sizeof(pbuff), + pbuff, + NULL); + if (resultCL != CL_SUCCESS) + throw (string("InitCL()::Error: Getting platform info (clGetPlatformInfo)")); + + //printf("vedor is %s\n",pbuff); + + } + free(allPlatforms); + + //----------------------------------------------- + //--cambine-2: create an OpenCL context + cl_context_properties cprops[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)targetPlatform, 0 }; + oclHandles.context = clCreateContextFromType(cprops, + CL_DEVICE_TYPE_GPU, + NULL, + NULL, + &resultCL); + + if ((resultCL != CL_SUCCESS) || (oclHandles.context == NULL)) + throw (string("InitCL()::Error: Creating Context (clCreateContextFromType)")); + //----------------------------------------------- + //--cambine-3: detect OpenCL devices + /* First, get the size of device list */ + oclHandles.cl_status = clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &deviceListSize); + if(oclHandles.cl_status!=CL_SUCCESS){ + throw(string("exception in _clInit -> clGetDeviceIDs")); + } + if (deviceListSize == 0) + throw(string("InitCL()::Error: No devices found.")); + + //std::cout<<"device number:"< clGetDeviceIDs-2")); + } + //----------------------------------------------- + //--cambine-4: Create an OpenCL command queue + oclHandles.queue = clCreateCommandQueue(oclHandles.context, + oclHandles.devices[DEVICE_ID_INUSED], + 0, + &resultCL); + + if ((resultCL != CL_SUCCESS) || (oclHandles.queue == NULL)) + throw(string("InitCL()::Creating Command Queue. (clCreateCommandQueue)")); + //----------------------------------------------- + //--cambine-5: Load CL file, build CL program object, create CL kernel object + std::string source_str = FileToString(kernel_file); + const char * source = source_str.c_str(); + size_t sourceSize[] = { source_str.length() }; + + oclHandles.program = clCreateProgramWithSource(oclHandles.context, + 1, + &source, + sourceSize, + &resultCL); + + if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL)) + throw(string("InitCL()::Error: Loading Binary into cl_program. (clCreateProgramWithBinary)")); + //insert debug information + //std::string options= "-cl-nv-verbose"; //Doesn't work on AMD machines + //options += " -cl-nv-opt-level=3"; + resultCL = clBuildProgram(oclHandles.program, deviceListSize, oclHandles.devices, NULL, NULL,NULL); + + if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL)) + { + cerr << "InitCL()::Error: In clBuildProgram" << endl; + + size_t length; + resultCL = clGetProgramBuildInfo(oclHandles.program, + oclHandles.devices[DEVICE_ID_INUSED], + CL_PROGRAM_BUILD_LOG, + 0, + NULL, + &length); + if(resultCL != CL_SUCCESS) + throw(string("InitCL()::Error: Getting Program build info(clGetProgramBuildInfo)")); + + char* buffer = (char*)malloc(length); + resultCL = clGetProgramBuildInfo(oclHandles.program, + oclHandles.devices[DEVICE_ID_INUSED], + CL_PROGRAM_BUILD_LOG, + length, + buffer, + NULL); + if(resultCL != CL_SUCCESS) + throw(string("InitCL()::Error: Getting Program build info(clGetProgramBuildInfo)")); + + cerr << buffer << endl; + free(buffer); + + throw(string("InitCL()::Error: Building Program (clBuildProgram)")); + } + + //get program information in intermediate representation + #ifdef PTX_MSG + size_t binary_sizes[deviceListSize]; + char * binaries[deviceListSize]; + //figure out number of devices and the sizes of the binary for each device. + oclHandles.cl_status = clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*deviceListSize, &binary_sizes, NULL ); + if(oclHandles.cl_status!=CL_SUCCESS){ + throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-2")); + } + + std::cout<<"--cambine:"< clGetProgramInfo-3")); + } + for(int i=0;i getting resource information")); + } + + build_log = (char *)malloc(ret_val_size+1); + oclHandles.cl_status = clGetProgramBuildInfo(oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); + if(oclHandles.cl_status!=CL_SUCCESS){ + throw(string("exceptions in _InitCL -> getting resources allocation information-2")); + } + build_log[ret_val_size] = '\0'; + std::cout<<"--cambine:"<d_mem_pinned")); + #endif + //------------ + d_mem = clCreateBuffer(oclHandles.context, CL_MEM_READ_ONLY, \ + size, NULL, &oclHandles.cl_status); + #ifdef ERRMSG + if(oclHandles.cl_status != CL_SUCCESS) + throw(string("excpetion in _clCreateAndCpyMem() -> d_mem ")); + #endif + //---------- + h_mem_pinned = (cl_float *)clEnqueueMapBuffer(oclHandles.queue, d_mem_pinned, CL_TRUE, \ + CL_MAP_WRITE, 0, size, 0, NULL, \ + NULL, &oclHandles.cl_status); + #ifdef ERRMSG + if(oclHandles.cl_status != CL_SUCCESS) + throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueMapBuffer")); + #endif + int element_number = size/sizeof(float); + #pragma omp parallel for + for(int i=0;i clEnqueueWriteBuffer")); + #endif + + return d_mem; +} + + +//-------------------------------------------------------- +//--cambine:create write only buffer on device +cl_mem _clMallocWO(int size) throw(string){ + cl_mem d_mem; + d_mem = clCreateBuffer(oclHandles.context, CL_MEM_WRITE_ONLY, size, 0, &oclHandles.cl_status); + #ifdef ERRMSG + if(oclHandles.cl_status != CL_SUCCESS) + throw(string("excpetion in _clCreateMem()")); + #endif + return d_mem; +} + +//-------------------------------------------------------- +//transfer data from device to host +void _clMemcpyD2H(cl_mem d_mem, int size, void * h_mem) throw(string){ + oclHandles.cl_status = clEnqueueReadBuffer(oclHandles.queue, d_mem, CL_TRUE, 0, size, h_mem, 0,0,0); + #ifdef ERRMSG + oclHandles.error_str = "excpetion in _clCpyMemD2H -> "; + switch(oclHandles.cl_status){ + case CL_INVALID_COMMAND_QUEUE: + oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE"; + break; + case CL_INVALID_CONTEXT: + oclHandles.error_str += "CL_INVALID_CONTEXT"; + break; + case CL_INVALID_MEM_OBJECT: + oclHandles.error_str += "CL_INVALID_MEM_OBJECT"; + break; + case CL_INVALID_VALUE: + oclHandles.error_str += "CL_INVALID_VALUE"; + break; + case CL_INVALID_EVENT_WAIT_LIST: + oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST"; + break; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + break; + case CL_OUT_OF_HOST_MEMORY: + oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY"; + break; + default: + oclHandles.error_str += "Unknown reason"; + break; + } + if(oclHandles.cl_status != CL_SUCCESS) + throw(oclHandles.error_str); + #endif +} + +//-------------------------------------------------------- +//set kernel arguments +void _clSetArgs(int kernel_id, int arg_idx, void * d_mem, int size = 0) throw(string){ + if(!size){ + oclHandles.cl_status = clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx, sizeof(d_mem), &d_mem); + #ifdef ERRMSG + oclHandles.error_str = "excpetion in _clSetKernelArg() "; + switch(oclHandles.cl_status){ + case CL_INVALID_KERNEL: + oclHandles.error_str += "CL_INVALID_KERNEL"; + break; + case CL_INVALID_ARG_INDEX: + oclHandles.error_str += "CL_INVALID_ARG_INDEX"; + break; + case CL_INVALID_ARG_VALUE: + oclHandles.error_str += "CL_INVALID_ARG_VALUE"; + break; + case CL_INVALID_MEM_OBJECT: + oclHandles.error_str += "CL_INVALID_MEM_OBJECT"; + break; + case CL_INVALID_SAMPLER: + oclHandles.error_str += "CL_INVALID_SAMPLER"; + break; + case CL_INVALID_ARG_SIZE: + oclHandles.error_str += "CL_INVALID_ARG_SIZE"; + break; + case CL_OUT_OF_RESOURCES: + oclHandles.error_str += "CL_OUT_OF_RESOURCES"; + break; + case CL_OUT_OF_HOST_MEMORY: + oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY"; + break; + default: + oclHandles.error_str += "Unknown reason"; + break; + } + if(oclHandles.cl_status != CL_SUCCESS) + throw(oclHandles.error_str); + #endif + } + else{ + oclHandles.cl_status = clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx, size, d_mem); + #ifdef ERRMSG + oclHandles.error_str = "excpetion in _clSetKernelArg() "; + switch(oclHandles.cl_status){ + case CL_INVALID_KERNEL: + oclHandles.error_str += "CL_INVALID_KERNEL"; + break; + case CL_INVALID_ARG_INDEX: + oclHandles.error_str += "CL_INVALID_ARG_INDEX"; + break; + case CL_INVALID_ARG_VALUE: + oclHandles.error_str += "CL_INVALID_ARG_VALUE"; + break; + case CL_INVALID_MEM_OBJECT: + oclHandles.error_str += "CL_INVALID_MEM_OBJECT"; + break; + case CL_INVALID_SAMPLER: + oclHandles.error_str += "CL_INVALID_SAMPLER"; + break; + case CL_INVALID_ARG_SIZE: + oclHandles.error_str += "CL_INVALID_ARG_SIZE"; + break; + case CL_OUT_OF_RESOURCES: + oclHandles.error_str += "CL_OUT_OF_RESOURCES"; + break; + case CL_OUT_OF_HOST_MEMORY: + oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY"; + break; + default: + oclHandles.error_str += "Unknown reason"; + break; + } + if(oclHandles.cl_status != CL_SUCCESS) + throw(oclHandles.error_str); + #endif + } +} +void _clFinish() throw(string){ + oclHandles.cl_status = clFinish(oclHandles.queue); + #ifdef ERRMSG + oclHandles.error_str = "excpetion in _clFinish"; + switch(oclHandles.cl_status){ + case CL_INVALID_COMMAND_QUEUE: + oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE"; + break; + case CL_OUT_OF_RESOURCES: + oclHandles.error_str += "CL_OUT_OF_RESOURCES"; + break; + case CL_OUT_OF_HOST_MEMORY: + oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY"; + break; + default: + oclHandles.error_str += "Unknown reasons"; + break; + + } + if(oclHandles.cl_status!=CL_SUCCESS){ + throw(oclHandles.error_str); + } + #endif +} +//-------------------------------------------------------- +//--cambine:enqueue kernel +void _clInvokeKernel(int kernel_id, int work_items, int work_group_size) throw(string){ + cl_uint work_dim = WORK_DIM; + cl_event e[1]; + if(work_items%work_group_size != 0) //process situations that work_items cannot be divided by work_group_size + work_items = work_items + (work_group_size-(work_items%work_group_size)); + size_t local_work_size[] = {work_group_size, 1}; + size_t global_work_size[] = {work_items, 1}; + oclHandles.cl_status = clEnqueueNDRangeKernel(oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0, \ + global_work_size, local_work_size, 0 , 0, &(e[0]) ); + #ifdef ERRMSG + oclHandles.error_str = "excpetion in _clInvokeKernel() -> "; + switch(oclHandles.cl_status) + { + case CL_INVALID_PROGRAM_EXECUTABLE: + oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE"; + break; + case CL_INVALID_COMMAND_QUEUE: + oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE"; + break; + case CL_INVALID_KERNEL: + oclHandles.error_str += "CL_INVALID_KERNEL"; + break; + case CL_INVALID_CONTEXT: + oclHandles.error_str += "CL_INVALID_CONTEXT"; + break; + case CL_INVALID_KERNEL_ARGS: + oclHandles.error_str += "CL_INVALID_KERNEL_ARGS"; + break; + case CL_INVALID_WORK_DIMENSION: + oclHandles.error_str += "CL_INVALID_WORK_DIMENSION"; + break; + case CL_INVALID_GLOBAL_WORK_SIZE: + oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE"; + break; + case CL_INVALID_WORK_GROUP_SIZE: + oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE"; + break; + case CL_INVALID_WORK_ITEM_SIZE: + oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE"; + break; + case CL_INVALID_GLOBAL_OFFSET: + oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET"; + break; + case CL_OUT_OF_RESOURCES: + oclHandles.error_str += "CL_OUT_OF_RESOURCES"; + break; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + break; + case CL_INVALID_EVENT_WAIT_LIST: + oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST"; + break; + case CL_OUT_OF_HOST_MEMORY: + oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY"; + break; + default: + oclHandles.error_str += "Unkown reseason"; + break; + } + if(oclHandles.cl_status != CL_SUCCESS) + throw(oclHandles.error_str); + #endif + //_clFinish(); + // oclHandles.cl_status = clWaitForEvents(1, &e[0]); + // #ifdef ERRMSG + // if (oclHandles.cl_status!= CL_SUCCESS) + // throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents")); + // #endif +} +void _clInvokeKernel2D(int kernel_id, int range_x, int range_y, int group_x, int group_y) throw(string){ + cl_uint work_dim = WORK_DIM; + size_t local_work_size[] = {group_x, group_y}; + size_t global_work_size[] = {range_x, range_y}; + cl_event e[1]; + /*if(work_items%work_group_size != 0) //process situations that work_items cannot be divided by work_group_size + work_items = work_items + (work_group_size-(work_items%work_group_size));*/ + oclHandles.cl_status = clEnqueueNDRangeKernel(oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0, \ + global_work_size, local_work_size, 0 , 0, &(e[0]) ); + #ifdef ERRMSG + oclHandles.error_str = "excpetion in _clInvokeKernel() -> "; + switch(oclHandles.cl_status) + { + case CL_INVALID_PROGRAM_EXECUTABLE: + oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE"; + break; + case CL_INVALID_COMMAND_QUEUE: + oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE"; + break; + case CL_INVALID_KERNEL: + oclHandles.error_str += "CL_INVALID_KERNEL"; + break; + case CL_INVALID_CONTEXT: + oclHandles.error_str += "CL_INVALID_CONTEXT"; + break; + case CL_INVALID_KERNEL_ARGS: + oclHandles.error_str += "CL_INVALID_KERNEL_ARGS"; + break; + case CL_INVALID_WORK_DIMENSION: + oclHandles.error_str += "CL_INVALID_WORK_DIMENSION"; + break; + case CL_INVALID_GLOBAL_WORK_SIZE: + oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE"; + break; + case CL_INVALID_WORK_GROUP_SIZE: + oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE"; + break; + case CL_INVALID_WORK_ITEM_SIZE: + oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE"; + break; + case CL_INVALID_GLOBAL_OFFSET: + oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET"; + break; + case CL_OUT_OF_RESOURCES: + oclHandles.error_str += "CL_OUT_OF_RESOURCES"; + break; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + break; + case CL_INVALID_EVENT_WAIT_LIST: + oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST"; + break; + case CL_OUT_OF_HOST_MEMORY: + oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY"; + break; + default: + oclHandles.error_str += "Unkown reseason"; + break; + } + if(oclHandles.cl_status != CL_SUCCESS) + throw(oclHandles.error_str); + #endif + //_clFinish(); + /*oclHandles.cl_status = clWaitForEvents(1, &e[0]); + + #ifdef ERRMSG + + if (oclHandles.cl_status!= CL_SUCCESS) + + throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents")); + + #endif*/ +} + +//-------------------------------------------------------- +//release OpenCL objects +void _clFree(cl_mem ob) throw(string){ + if(ob!=NULL) + oclHandles.cl_status = clReleaseMemObject(ob); + #ifdef ERRMSG + oclHandles.error_str = "excpetion in _clFree() ->"; + switch(oclHandles.cl_status) + { + case CL_INVALID_MEM_OBJECT: + oclHandles.error_str += "CL_INVALID_MEM_OBJECT"; + break; + case CL_OUT_OF_RESOURCES: + oclHandles.error_str += "CL_OUT_OF_RESOURCES"; + break; + case CL_OUT_OF_HOST_MEMORY: + oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY"; + break; + default: + oclHandles.error_str += "Unkown reseason"; + break; + } + if (oclHandles.cl_status!= CL_SUCCESS) + throw(oclHandles.error_str); + #endif +} +#endif //_CL_HELPER_ diff --git a/benchmarks/opencl/bfs/Makefile b/benchmarks/opencl/bfs/Makefile index 5ad87059..0472f69f 100644 --- a/benchmarks/opencl/bfs/Makefile +++ b/benchmarks/opencl/bfs/Makefile @@ -29,12 +29,12 @@ CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sectio LIBS = -lOpenCL -PROJECT=saxpy +PROJECT=bfs all: $(PROJECT).dump $(PROJECT).hex lib$(PROJECT).a: kernel.cl - POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl + POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl $(PROJECT).elf: main.cc lib$(PROJECT).a $(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf diff --git a/benchmarks/opencl/bfs/kernel.cl b/benchmarks/opencl/bfs/kernel.cl new file mode 100755 index 00000000..51ce5a08 --- /dev/null +++ b/benchmarks/opencl/bfs/kernel.cl @@ -0,0 +1,53 @@ +/* ============================================================ +//--cambine: kernel funtion of Breadth-First-Search +//--author: created by Jianbin Fang +//--date: 06/12/2010 +============================================================ */ + +//#pragma OPENCL EXTENSION cl_khr_byte_addressable_store: enable + +//Structure to hold a node information +typedef struct{ + int starting; + int no_of_edges; +} Node; + +//--7 parameters +__kernel void BFS_1( const __global Node* g_graph_nodes, + const __global int* g_graph_edges, + __global char* g_graph_mask, + __global char* g_updating_graph_mask, + __global char* g_graph_visited, + __global int* g_cost, + const int no_of_nodes){ + int tid = get_global_id(0); + if( tid^6+YL`c5uiN76aXa1Suwg1nk&;u^ms-L^=W+izrjKX+o%vXU7Q{PMq3? z!76QGj}W>ds^TKL?oS07FovdTt~5>R4>C1?*wp^mk4_ar)vRpQG;K@}hT0PEJJ0Vq z&k2yAb(My=(S7&abM8I&p7-5%@BN%7zpSR?yT4N1)>xVRmTF11J{ubY(V)>{S2Yni ziCo8L@)ui)N?937#godOM0zlhRG!-Wj1r)^D&_mB^q#(SYVXhkjY_YY$Fo?~ULY^Tv9;e!nl(R$Vr=$GnMNZ!F;r zMZKX8UnW&y?G1b7h&L2s5AAY=sVC%($Gq(kuNw9GdyS55R%kE}Yt@QTxV<#XKr|FC zO?OxIkGi@y3n2hlDG9nsGc}^Dqoi-wi*-~Bt6PYor}^{bS#yxwb|E5Y<90acB-;FS zm`H+NYaHixq7U`i?LDny(`KLFpl@!(K@qGMca>8wb12=;A<99wP@nf}hbbTtl@^*F z1W`Px4)!(fOQbXXslmpm&+iK~uG^i>4rRX7*4Edb-Mu&Fi>C(KlF5Ao-huvfI+bqI z`}kmNj3s*HNKbn#&=F6Bqfxmx9t?#eiH@E`IM5r8$^PCRe>@fmwan|2{p&8{!QPPU zkH&(5a6IM@s(w{X1R`=s4z)*nq8*`FDCQ4^bFB)Vdz;l_53~>I7lRxP5pNvuQ2p1oBExQ8~7z7v25yhL0)0#SvK`a$o0H0BC%}h zUx8e|h_ys4n|iJbiJwG^s$G%;--IunNUFa6!TzkGrqk*HQg&<$1eGn_Pe0Y!tti`+ zKrpE8&8C!eqOU)bO{C3THv83J*FK|rHnBfTyt)SxgV;Ki`2PJ$UosU_lgd6dt+03U zWn!5O`S_viSnwEuu6k~8)fbPG zY4V2R-d@$4z(%clOD3LLR*m5`9T+H;J!)2+i$+LM_A5#kUVMrhaJPUjP{MQ}&lm75 z8B0KK{!3J@SiUb)?1GhLW7$zV znqz+e*&GXES^g=Mthcr^J{4|OZZ`zk>hD8~joX=X`mNzF$Vwz`XU4>J9*Z;UYLU%x zZ$LJEKrGAL)~bIM9XLLYm*-=~`wht4){o5T7V=xVRC<`>HbY?9OmA2#P@2^+1gX22 zf@+xCj$vGWFFY!ttyuLTy=@uOVHg?rrAf~eOFO-g@v}sMTJB5X%D$5_ezxH>Jucb z-%5q8zrX8tZH2Cd@-Lhut+g6*E9H-#pxleqBweVXynR$?Z`&#UW4kSHe_rH{c2n*r z4OF=H+8wLCaNItfTe^Yd7vX0N;~ZK_p2nY1{?K9J8Gc)6>)~&`3;rxVF243j>c*-m zx3J&UHguA-0e7SI<7g*@*4ing)`LQ885dHEuxV>LZMZwxaCfo+1F3IVzdHd7;IIZb ztOgE^0btPyEPz9+3M_y_GklyH5%S^R*|akglz-_GN#`7R?-N368WGYj9HcdM3GF>`cJCfIxq%DHCna$}9iBkxjtXOji5+B(LoYWLc<{1Z%YhV=kr&9~-B1yrd1|zzHC%?*vW%bTjq^)K=7z^OQ*tPsGu((t#V=;1yv`Y?rJ5FGMv6cagqj}O8 zi>sAbXw5a`KG95h$7!s6tQD*y*?zW+g**Z*M!J9p@V<8Dj@8ase7^z1z0UWijdm#Xbiv>!W}SU-${<_MZEm+6I<}kl zDE-_HJ?y#t|G>+)e`>rOuS@6U=6&x``gT7nUd~JOF)yFt&5wEc9Pa;F@N&y9sGlk? z4~CUkK=DTvHKeo$D|mS@?2QGy{-{?Cc}u3*hmbGxn3wY&`vJ1`C^OHoA?P3T^8bRD zcbLEWmx%eiyj(VZfNz}tclYhaiy$7eueYQ@b zKXbX|MfOqHF$qow&a#F1L)CP?>Rs{-|6Alo-_c9Y@NMD2%G3MoUVaxm_+#+kA~lcWj6YIh*+ ze}K143fWQ6OV9A1@f!eQU;{q!a_&OaFS`*ADFK{y|`vNWCZtMx;1tMdIBbDJ;C=2jQ6FD#T>4O>Xh{kYYe YXtNM*HDo1tn6!B&&E{6z?q1yLKT3|^-T(jq literal 0 HcmV?d00001 diff --git a/benchmarks/opencl/bfs/main.cc b/benchmarks/opencl/bfs/main.cc new file mode 100755 index 00000000..eacc9cbf --- /dev/null +++ b/benchmarks/opencl/bfs/main.cc @@ -0,0 +1,299 @@ +//--by Jianbin Fang + +#define __CL_ENABLE_EXCEPTIONS +#include +#include +#include +#include + +#ifdef PROFILING +#include "timer.h" +#endif + +#include "CLHelper.h" +#include "util.h" + +#define MAX_THREADS_PER_BLOCK 256 + +//Structure to hold a node information +struct Node +{ + int starting; + int no_of_edges; +}; + + +//---------------------------------------------------------- +//--bfs on cpu +//--programmer: jianbin +//--date: 26/01/2011 +//--note: width is changed to the new_width +//---------------------------------------------------------- +void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ + int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \ + char *h_graph_visited, int *h_cost_ref){ + char stop; + int k = 0; + do{ + //if no thread changes this value then the loop stops + stop=false; + for(int tid = 0; tid < no_of_nodes; tid++ ) + { + if (h_graph_mask[tid] == true){ + h_graph_mask[tid]=false; + for(int i=h_graph_nodes[tid].starting; i<(h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting); i++){ + int id = h_graph_edges[i]; //--cambine: node id is connected with node tid + if(!h_graph_visited[id]){ //--cambine: if node id has not been visited, enter the body below + h_cost_ref[id]=h_cost_ref[tid]+1; + h_updating_graph_mask[id]=true; + } + } + } + } + + for(int tid=0; tid< no_of_nodes ; tid++ ) + { + if (h_updating_graph_mask[tid] == true){ + h_graph_mask[tid]=true; + h_graph_visited[tid]=true; + stop=true; + h_updating_graph_mask[tid]=false; + } + } + k++; + } + while(stop); +} +//---------------------------------------------------------- +//--breadth first search on GPUs +//---------------------------------------------------------- +void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ + int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \ + char *h_graph_visited, int *h_cost) + throw(std::string){ + + //int number_elements = height*width; + char h_over; + cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \ + d_graph_visited, d_cost, d_over; + try{ + //--1 transfer data from host to device + _clInit(); + d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes); + d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges); + d_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_graph_mask); + d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_updating_graph_mask); + d_graph_visited = _clMallocRW(no_of_nodes*sizeof(char), h_graph_visited); + + + d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost); + d_over = _clMallocRW(sizeof(char), &h_over); + + _clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes); + _clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges); + _clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(char), h_graph_mask); + _clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(char), h_updating_graph_mask); + _clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(char), h_graph_visited); + _clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost); + + //--2 invoke kernel +#ifdef PROFILING + timer kernel_timer; + double kernel_time = 0.0; + kernel_timer.reset(); + kernel_timer.start(); +#endif + do{ + h_over = false; + _clMemcpyH2D(d_over, sizeof(char), &h_over); + //--kernel 0 + int kernel_id = 0; + int kernel_idx = 0; + _clSetArgs(kernel_id, kernel_idx++, d_graph_nodes); + _clSetArgs(kernel_id, kernel_idx++, d_graph_edges); + _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); + _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); + _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); + _clSetArgs(kernel_id, kernel_idx++, d_cost); + _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); + + //int work_items = no_of_nodes; + _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); + + //--kernel 1 + kernel_id = 1; + kernel_idx = 0; + _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); + _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); + _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); + _clSetArgs(kernel_id, kernel_idx++, d_over); + _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); + + //work_items = no_of_nodes; + _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); + + _clMemcpyD2H(d_over,sizeof(char), &h_over); + }while(h_over); + + _clFinish(); +#ifdef PROFILING + kernel_timer.stop(); + kernel_time = kernel_timer.getTimeInSeconds(); +#endif + //--3 transfer data from device to host + _clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost); + //--statistics +#ifdef PROFILING + std::cout<<"kernel time(s):"<\n", argv[0]); + +} +//---------------------------------------------------------- +//--cambine: main function +//--author: created by Jianbin Fang +//--date: 25/01/2011 +//---------------------------------------------------------- +int main(int argc, char * argv[]) +{ + int no_of_nodes; + int edge_list_size; + FILE *fp; + Node* h_graph_nodes; + char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited; + try{ + char *input_f; + if(argc!=2){ + Usage(argc, argv); + exit(0); + } + + input_f = argv[1]; + printf("Reading File\n"); + //Read in Graph from a file + fp = fopen(input_f,"r"); + if(!fp){ + printf("Error Reading graph file\n"); + return 0; + } + + int source = 0; + + fscanf(fp,"%d",&no_of_nodes); + + int num_of_blocks = 1; + int num_of_threads_per_block = no_of_nodes; + + //Make execution Parameters according to the number of nodes + //Distribute threads across multiple Blocks if necessary + if(no_of_nodes>MAX_THREADS_PER_BLOCK){ + num_of_blocks = (int)ceil(no_of_nodes/(double)MAX_THREADS_PER_BLOCK); + num_of_threads_per_block = MAX_THREADS_PER_BLOCK; + } + work_group_size = num_of_threads_per_block; + // allocate host memory + h_graph_nodes = (Node*) malloc(sizeof(Node)*no_of_nodes); + h_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes); + h_updating_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes); + h_graph_visited = (char*) malloc(sizeof(char)*no_of_nodes); + + int start, edgeno; + // initalize the memory + for(int i = 0; i < no_of_nodes; i++){ + fscanf(fp,"%d %d",&start,&edgeno); + h_graph_nodes[i].starting = start; + h_graph_nodes[i].no_of_edges = edgeno; + h_graph_mask[i]=false; + h_updating_graph_mask[i]=false; + h_graph_visited[i]=false; + } + //read the source node from the file + fscanf(fp,"%d",&source); + source=0; + //set the source node as true in the mask + h_graph_mask[source]=true; + h_graph_visited[source]=true; + fscanf(fp,"%d",&edge_list_size); + int id,cost; + int* h_graph_edges = (int*) malloc(sizeof(int)*edge_list_size); + for(int i=0; i < edge_list_size ; i++){ + fscanf(fp,"%d",&id); + fscanf(fp,"%d",&cost); + h_graph_edges[i] = id; + } + + if(fp) + fclose(fp); + // allocate mem for the result on host side + int *h_cost = (int*) malloc(sizeof(int)*no_of_nodes); + int *h_cost_ref = (int*)malloc(sizeof(int)*no_of_nodes); + for(int i=0;i(h_cost_ref, h_cost, no_of_nodes); + //release host memory + free(h_graph_nodes); + free(h_graph_mask); + free(h_updating_graph_mask); + free(h_graph_visited); + + } + catch(std::string msg){ + std::cout<<"--cambine: exception in main ->"< +#include +#include +#include + +#include "timer.h" + + +using namespace std; + +double timer::CPU_speed_in_MHz = timer::get_CPU_speed_in_MHz(); + + +double timer::get_CPU_speed_in_MHz() +{ +#if defined __linux__ + ifstream infile("/proc/cpuinfo"); + char buffer[256], *colon; + + while (infile.good()) { + infile.getline(buffer, 256); + + if (strncmp("cpu MHz", buffer, 7) == 0 && (colon = strchr(buffer, ':')) != 0) + return atof(colon + 2); + } +#endif + + return 0.0; +} + + +void timer::print_time(ostream &str, const char *which, double time) const +{ + static const char *units[] = { " ns", " us", " ms", " s", " ks", 0 }; + const char **unit = units; + + time = 1000.0 * time / CPU_speed_in_MHz; + + while (time >= 999.5 && unit[1] != 0) { + time /= 1000.0; + ++ unit; + } + + str << which << " = " << setprecision(3) << setw(4) << time << *unit; +} + + +ostream &timer::print(ostream &str) +{ + str << left << setw(25) << (name != 0 ? name : "timer") << ": " << right; + + if (CPU_speed_in_MHz == 0) + str << "could not determine CPU speed\n"; + else if (count > 0) { + double total = static_cast(total_time); + + print_time(str, "avg", total / static_cast(count)); + print_time(str, ", total", total); + str << ", count = " << setw(9) << count << '\n'; + } + else + str << "not used\n"; + + return str; +} + + +ostream &operator << (ostream &str, class timer &timer) +{ + return timer.print(str); +} + +double timer::getTimeInSeconds() +{ + double total = static_cast(total_time); + double res = (total / 1000000.0) / CPU_speed_in_MHz; + return res; +} diff --git a/benchmarks/opencl/bfs/timer.h b/benchmarks/opencl/bfs/timer.h new file mode 100755 index 00000000..e5efdc18 --- /dev/null +++ b/benchmarks/opencl/bfs/timer.h @@ -0,0 +1,128 @@ +#ifndef timer_h +#define timer_h + +#include + + +class timer { + public: + timer(const char *name = 0); + timer(const char *name, std::ostream &write_on_exit); + + ~timer(); + + void start(), stop(); + void reset(); + std::ostream &print(std::ostream &); + + double getTimeInSeconds(); + + private: + void print_time(std::ostream &, const char *which, double time) const; + + union { + long long total_time; + struct { +#if defined __PPC__ + int high, low; +#else + int low, high; +#endif + }; + }; + + unsigned long long count; + const char *const name; + std::ostream *const write_on_exit; + + static double CPU_speed_in_MHz, get_CPU_speed_in_MHz(); +}; + + +std::ostream &operator << (std::ostream &, class timer &); + + +inline void timer::reset() +{ + total_time = 0; + count = 0; +} + + +inline timer::timer(const char *name) +: + name(name), + write_on_exit(0) +{ + reset(); +} + + +inline timer::timer(const char *name, std::ostream &write_on_exit) +: + name(name), + write_on_exit(&write_on_exit) +{ + reset(); +} + + +inline timer::~timer() +{ + if (write_on_exit != 0) + print(*write_on_exit); +} + + +inline void timer::start() +{ +#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64) + unsigned eax, edx; + + asm volatile ("rdtsc" : "=a" (eax), "=d" (edx)); + + total_time -= ((unsigned long long) edx << 32) + eax; +#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64) + asm volatile + ( + "rdtsc\n\t" + "subl %%eax, %0\n\t" + "sbbl %%edx, %1" + : + "+m" (low), "+m" (high) + : + : + "eax", "edx" + ); +#else +#error Compiler/Architecture not recognized +#endif +} + + +inline void timer::stop() +{ +#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64) + unsigned eax, edx; + + asm volatile ("rdtsc" : "=a" (eax), "=d" (edx)); + + total_time += ((unsigned long long) edx << 32) + eax; +#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64) + asm volatile + ( + "rdtsc\n\t" + "addl %%eax, %0\n\t" + "adcl %%edx, %1" + : + "+m" (low), "+m" (high) + : + : + "eax", "edx" + ); +#endif + + ++ count; +} + +#endif diff --git a/benchmarks/opencl/bfs/util.h b/benchmarks/opencl/bfs/util.h new file mode 100755 index 00000000..425edfba --- /dev/null +++ b/benchmarks/opencl/bfs/util.h @@ -0,0 +1,72 @@ +#ifndef _C_UTIL_ +#define _C_UTIL_ +#include +#include + +//------------------------------------------------------------------- +//--initialize array with maximum limit +//------------------------------------------------------------------- +template +void fill(datatype *A, const int n, const datatype maxi){ + for (int j = 0; j < n; j++) + { + A[j] = ((datatype) maxi * (rand() / (RAND_MAX + 1.0f))); + } +} + +//--print matrix +template +void print_matrix(datatype *A, int height, int width){ + for(int i=0; i +void verify_array(const datatype *cpuResults, const datatype *gpuResults, const int size){ + + char passed = true; +#pragma omp parallel for + for (int i=0; i MAX_RELATIVE_ERROR){ + passed = false; + } + } + if (passed){ + std::cout << "--cambine:passed:-)" << endl; + } + else{ + std::cout << "--cambine: failed:-(" << endl; + } + return ; +} +template +void compare_results(const datatype *cpu_results, const datatype *gpu_results, const int size){ + + char passed = true; +//#pragma omp parallel for + for (int i=0; i Date: Fri, 22 Nov 2019 00:15:34 -0500 Subject: [PATCH 6/8] kmeans benchmark --- benchmarks/opencl/aes/Makefile | 4 ++-- benchmarks/opencl/bfs/Makefile | 2 +- benchmarks/opencl/fft/Makefile | 4 ++-- benchmarks/opencl/hotspot/Makefile | 4 ++-- benchmarks/opencl/kmeans/Makefile | 6 +++--- benchmarks/opencl/sort/Makefile | 4 ++-- benchmarks/opencl/spmv/Makefile | 2 +- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/benchmarks/opencl/aes/Makefile b/benchmarks/opencl/aes/Makefile index 5ad87059..7e10d5fe 100644 --- a/benchmarks/opencl/aes/Makefile +++ b/benchmarks/opencl/aes/Makefile @@ -34,7 +34,7 @@ PROJECT=saxpy all: $(PROJECT).dump $(PROJECT).hex lib$(PROJECT).a: kernel.cl - POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl + POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl $(PROJECT).elf: main.cc lib$(PROJECT).a $(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf @@ -49,4 +49,4 @@ run: $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug clean: - rm -rf *.elf *.dump *.hex *.a *.pocl + rm -rf *.elf *.dump *.hex *.a *.pocl *.o diff --git a/benchmarks/opencl/bfs/Makefile b/benchmarks/opencl/bfs/Makefile index 0472f69f..ad748266 100644 --- a/benchmarks/opencl/bfs/Makefile +++ b/benchmarks/opencl/bfs/Makefile @@ -37,7 +37,7 @@ lib$(PROJECT).a: kernel.cl POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl $(PROJECT).elf: main.cc lib$(PROJECT).a - $(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf + $(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc timer.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf $(PROJECT).hex: $(PROJECT).elf $(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex diff --git a/benchmarks/opencl/fft/Makefile b/benchmarks/opencl/fft/Makefile index 5ad87059..7e10d5fe 100644 --- a/benchmarks/opencl/fft/Makefile +++ b/benchmarks/opencl/fft/Makefile @@ -34,7 +34,7 @@ PROJECT=saxpy all: $(PROJECT).dump $(PROJECT).hex lib$(PROJECT).a: kernel.cl - POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl + POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl $(PROJECT).elf: main.cc lib$(PROJECT).a $(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf @@ -49,4 +49,4 @@ run: $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug clean: - rm -rf *.elf *.dump *.hex *.a *.pocl + rm -rf *.elf *.dump *.hex *.a *.pocl *.o diff --git a/benchmarks/opencl/hotspot/Makefile b/benchmarks/opencl/hotspot/Makefile index 5ad87059..7e10d5fe 100644 --- a/benchmarks/opencl/hotspot/Makefile +++ b/benchmarks/opencl/hotspot/Makefile @@ -34,7 +34,7 @@ PROJECT=saxpy all: $(PROJECT).dump $(PROJECT).hex lib$(PROJECT).a: kernel.cl - POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl + POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl $(PROJECT).elf: main.cc lib$(PROJECT).a $(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf @@ -49,4 +49,4 @@ run: $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug clean: - rm -rf *.elf *.dump *.hex *.a *.pocl + rm -rf *.elf *.dump *.hex *.a *.pocl *.o diff --git a/benchmarks/opencl/kmeans/Makefile b/benchmarks/opencl/kmeans/Makefile index 5ad87059..8cd607fe 100644 --- a/benchmarks/opencl/kmeans/Makefile +++ b/benchmarks/opencl/kmeans/Makefile @@ -29,15 +29,15 @@ CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sectio LIBS = -lOpenCL -PROJECT=saxpy +PROJECT=kmeans all: $(PROJECT).dump $(PROJECT).hex lib$(PROJECT).a: kernel.cl - POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl + POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl $(PROJECT).elf: main.cc lib$(PROJECT).a - $(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf + $(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc rmse.c read_input.c cluster.c kmeans_clustering.c -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf $(PROJECT).hex: $(PROJECT).elf $(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex diff --git a/benchmarks/opencl/sort/Makefile b/benchmarks/opencl/sort/Makefile index 5ad87059..7e10d5fe 100644 --- a/benchmarks/opencl/sort/Makefile +++ b/benchmarks/opencl/sort/Makefile @@ -34,7 +34,7 @@ PROJECT=saxpy all: $(PROJECT).dump $(PROJECT).hex lib$(PROJECT).a: kernel.cl - POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl + POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl $(PROJECT).elf: main.cc lib$(PROJECT).a $(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf @@ -49,4 +49,4 @@ run: $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug clean: - rm -rf *.elf *.dump *.hex *.a *.pocl + rm -rf *.elf *.dump *.hex *.a *.pocl *.o diff --git a/benchmarks/opencl/spmv/Makefile b/benchmarks/opencl/spmv/Makefile index 5ad87059..12aad1c4 100644 --- a/benchmarks/opencl/spmv/Makefile +++ b/benchmarks/opencl/spmv/Makefile @@ -34,7 +34,7 @@ PROJECT=saxpy all: $(PROJECT).dump $(PROJECT).hex lib$(PROJECT).a: kernel.cl - POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl + POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl $(PROJECT).elf: main.cc lib$(PROJECT).a $(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf From 842a202d1968bdb48ca81f51154e50d4142af492 Mon Sep 17 00:00:00 2001 From: "Lyons, Ethan Tyler" Date: Fri, 22 Nov 2019 09:20:20 -0500 Subject: [PATCH 7/8] Fixed GPR Stage to be Generic when ASIC is defined --- rtl/VX_gpr_stage.v | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rtl/VX_gpr_stage.v b/rtl/VX_gpr_stage.v index 89316cbb..3d556a83 100644 --- a/rtl/VX_gpr_stage.v +++ b/rtl/VX_gpr_stage.v @@ -116,7 +116,7 @@ module VX_gpr_stage ( wire store_curr_real = !delayed_lsu_last_cycle && stall_lsu; - VX_generic_register #(.N(256)) lsu_data( + VX_generic_register #(.N(`NT*32*2)) lsu_data( .clk (clk), .reset(reset), .stall(!store_curr_real), @@ -133,7 +133,7 @@ module VX_gpr_stage ( assign VX_lsu_req.base_address = (delayed_lsu_last_cycle) ? temp_base_address : real_base_address; - VX_generic_register #(.N(77 + `NW_M1 + 65*(1 + `NT))) lsu_reg( + VX_generic_register #(.N(77 + `NW_M1 + 1 + (`NT))) lsu_reg( .clk (clk), .reset(reset), .stall(stall_lsu), @@ -142,7 +142,7 @@ module VX_gpr_stage ( .out ({VX_lsu_req.valid , VX_lsu_req.lsu_pc ,VX_lsu_req.warp_num , VX_lsu_req.offset , VX_lsu_req.mem_read , VX_lsu_req.mem_write , VX_lsu_req.rd , VX_lsu_req.wb }) ); - VX_generic_register #(.N(224 + `NW_M1 + 1 + 65*(`NT))) exec_unit_reg( + VX_generic_register #(.N(224 + `NW_M1 + 1 + (`NT))) exec_unit_reg( .clk (clk), .reset(reset), .stall(stall_rest), @@ -154,7 +154,7 @@ module VX_gpr_stage ( assign VX_exec_unit_req.a_reg_data = real_base_address; assign VX_exec_unit_req.b_reg_data = real_store_data; - VX_generic_register #(.N(68 + `NW_M1 + 1 + 33*(`NT))) gpu_inst_reg( + VX_generic_register #(.N(36 + `NW_M1 + 1 + (`NT))) gpu_inst_reg( .clk (clk), .reset(reset), .stall(stall_rest), From aa274f2d209b46cbc9495270e66a9d31cec1b66e Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Fri, 22 Nov 2019 12:30:21 -0500 Subject: [PATCH 8/8] update --- benchmarks/opencl/aes/Makefile | 10 +--------- benchmarks/opencl/bfs/Makefile | 4 ---- benchmarks/opencl/fft/Makefile | 10 +--------- benchmarks/opencl/hotspot/Makefile | 8 -------- benchmarks/opencl/kmeans/Makefile | 10 ---------- benchmarks/opencl/sort/Makefile | 8 -------- benchmarks/opencl/spmv/Makefile | 4 ---- rtl/VX_define.v | 5 ----- syn/esyn.tcl | 4 ---- 9 files changed, 2 insertions(+), 61 deletions(-) diff --git a/benchmarks/opencl/aes/Makefile b/benchmarks/opencl/aes/Makefile index 3ea71be6..47711c90 100644 --- a/benchmarks/opencl/aes/Makefile +++ b/benchmarks/opencl/aes/Makefile @@ -34,11 +34,7 @@ PROJECT=saxpy all: $(PROJECT).dump $(PROJECT).hex lib$(PROJECT).a: kernel.cl -<<<<<<< HEAD POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl -======= - POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl ->>>>>>> d4f6a7e3b221ae64441558037b40b87dbf432798 $(PROJECT).elf: main.cc lib$(PROJECT).a $(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf @@ -53,8 +49,4 @@ run: $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug clean: -<<<<<<< HEAD - rm -rf *.elf *.dump *.hex *.a *.pocl *.o -======= - rm -rf *.elf *.dump *.hex *.a *.pocl ->>>>>>> d4f6a7e3b221ae64441558037b40b87dbf432798 + rm -rf *.elf *.dump *.hex *.a *.pocl *.o \ No newline at end of file diff --git a/benchmarks/opencl/bfs/Makefile b/benchmarks/opencl/bfs/Makefile index e9d36202..ad748266 100644 --- a/benchmarks/opencl/bfs/Makefile +++ b/benchmarks/opencl/bfs/Makefile @@ -37,11 +37,7 @@ lib$(PROJECT).a: kernel.cl POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl $(PROJECT).elf: main.cc lib$(PROJECT).a -<<<<<<< HEAD $(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc timer.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf -======= - $(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf ->>>>>>> d4f6a7e3b221ae64441558037b40b87dbf432798 $(PROJECT).hex: $(PROJECT).elf $(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex diff --git a/benchmarks/opencl/fft/Makefile b/benchmarks/opencl/fft/Makefile index 3ea71be6..47711c90 100644 --- a/benchmarks/opencl/fft/Makefile +++ b/benchmarks/opencl/fft/Makefile @@ -34,11 +34,7 @@ PROJECT=saxpy all: $(PROJECT).dump $(PROJECT).hex lib$(PROJECT).a: kernel.cl -<<<<<<< HEAD POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl -======= - POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl ->>>>>>> d4f6a7e3b221ae64441558037b40b87dbf432798 $(PROJECT).elf: main.cc lib$(PROJECT).a $(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf @@ -53,8 +49,4 @@ run: $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug clean: -<<<<<<< HEAD - rm -rf *.elf *.dump *.hex *.a *.pocl *.o -======= - rm -rf *.elf *.dump *.hex *.a *.pocl ->>>>>>> d4f6a7e3b221ae64441558037b40b87dbf432798 + rm -rf *.elf *.dump *.hex *.a *.pocl *.o \ No newline at end of file diff --git a/benchmarks/opencl/hotspot/Makefile b/benchmarks/opencl/hotspot/Makefile index 3ea71be6..7e10d5fe 100644 --- a/benchmarks/opencl/hotspot/Makefile +++ b/benchmarks/opencl/hotspot/Makefile @@ -34,11 +34,7 @@ PROJECT=saxpy all: $(PROJECT).dump $(PROJECT).hex lib$(PROJECT).a: kernel.cl -<<<<<<< HEAD POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl -======= - POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl ->>>>>>> d4f6a7e3b221ae64441558037b40b87dbf432798 $(PROJECT).elf: main.cc lib$(PROJECT).a $(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf @@ -53,8 +49,4 @@ run: $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug clean: -<<<<<<< HEAD rm -rf *.elf *.dump *.hex *.a *.pocl *.o -======= - rm -rf *.elf *.dump *.hex *.a *.pocl ->>>>>>> d4f6a7e3b221ae64441558037b40b87dbf432798 diff --git a/benchmarks/opencl/kmeans/Makefile b/benchmarks/opencl/kmeans/Makefile index 73034561..ef4d8ecc 100644 --- a/benchmarks/opencl/kmeans/Makefile +++ b/benchmarks/opencl/kmeans/Makefile @@ -29,26 +29,16 @@ CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sectio LIBS = -lOpenCL -<<<<<<< HEAD PROJECT=kmeans -======= PROJECT=saxpy ->>>>>>> d4f6a7e3b221ae64441558037b40b87dbf432798 all: $(PROJECT).dump $(PROJECT).hex lib$(PROJECT).a: kernel.cl -<<<<<<< HEAD POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl $(PROJECT).elf: main.cc lib$(PROJECT).a $(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc rmse.c read_input.c cluster.c kmeans_clustering.c -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf -======= - POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl - -$(PROJECT).elf: main.cc lib$(PROJECT).a - $(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf ->>>>>>> d4f6a7e3b221ae64441558037b40b87dbf432798 $(PROJECT).hex: $(PROJECT).elf $(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex diff --git a/benchmarks/opencl/sort/Makefile b/benchmarks/opencl/sort/Makefile index 3ea71be6..7e10d5fe 100644 --- a/benchmarks/opencl/sort/Makefile +++ b/benchmarks/opencl/sort/Makefile @@ -34,11 +34,7 @@ PROJECT=saxpy all: $(PROJECT).dump $(PROJECT).hex lib$(PROJECT).a: kernel.cl -<<<<<<< HEAD POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl -======= - POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl ->>>>>>> d4f6a7e3b221ae64441558037b40b87dbf432798 $(PROJECT).elf: main.cc lib$(PROJECT).a $(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf @@ -53,8 +49,4 @@ run: $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug clean: -<<<<<<< HEAD rm -rf *.elf *.dump *.hex *.a *.pocl *.o -======= - rm -rf *.elf *.dump *.hex *.a *.pocl ->>>>>>> d4f6a7e3b221ae64441558037b40b87dbf432798 diff --git a/benchmarks/opencl/spmv/Makefile b/benchmarks/opencl/spmv/Makefile index f6895a96..12aad1c4 100644 --- a/benchmarks/opencl/spmv/Makefile +++ b/benchmarks/opencl/spmv/Makefile @@ -34,11 +34,7 @@ PROJECT=saxpy all: $(PROJECT).dump $(PROJECT).hex lib$(PROJECT).a: kernel.cl -<<<<<<< HEAD POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl -======= - POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl ->>>>>>> d4f6a7e3b221ae64441558037b40b87dbf432798 $(PROJECT).elf: main.cc lib$(PROJECT).a $(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf diff --git a/rtl/VX_define.v b/rtl/VX_define.v index 02bc5926..b40567f8 100644 --- a/rtl/VX_define.v +++ b/rtl/VX_define.v @@ -11,13 +11,8 @@ // `define ONLY // `define SYN 1 -<<<<<<< HEAD -//`define ASIC 1 -//`define SYN_FUNC 1 -======= // `define ASIC 1 // `define SYN_FUNC 1 ->>>>>>> d4f6a7e3b221ae64441558037b40b87dbf432798 `define NUM_BARRIERS 4 diff --git a/syn/esyn.tcl b/syn/esyn.tcl index 522fb262..10fa09d9 100644 --- a/syn/esyn.tcl +++ b/syn/esyn.tcl @@ -37,11 +37,7 @@ uniquify define_name_rules verilog -remove_internal_net_bus -remove_port_bus change_names -rule verilog -hierarchy -<<<<<<< HEAD -report_qor -======= # report_qor ->>>>>>> d4f6a7e3b221ae64441558037b40b87dbf432798 report_area report_hierarchy report_cell