From 76417b0561d563541d0102e19ec7868c2db7c242 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sat, 23 Nov 2019 08:36:00 -0500 Subject: [PATCH] update --- benchmarks/opencl/bfs/Makefile | 68 ++-- benchmarks/opencl/bfs/libbfs.a | Bin 7846 -> 7846 bytes benchmarks/opencl/bfs/main.cc | 500 ++++++++++++++--------------- benchmarks/opencl/bfs/timer.h | 141 ++++---- benchmarks/opencl/kmeans/Makefile | 83 +++-- benchmarks/opencl/saxpy/Makefile | 2 +- benchmarks/opencl/sfilter/Makefile | 2 +- benchmarks/opencl/sgemm/Makefile | 2 +- benchmarks/opencl/vecadd/Makefile | 2 +- 9 files changed, 404 insertions(+), 396 deletions(-) diff --git a/benchmarks/opencl/bfs/Makefile b/benchmarks/opencl/bfs/Makefile index ad748266..b085b35f 100644 --- a/benchmarks/opencl/bfs/Makefile +++ b/benchmarks/opencl/bfs/Makefile @@ -1,33 +1,35 @@ +RISCV_TOOL_PATH = $(wildcard ~/dev/riscv-gnu-toolchain/drops) +POCL_CC_PATH = $(wildcard ~/dev/pocl/drops_riscv_cc) +POCL_INC_PATH = $(wildcard ../include) +POCL_LIB_PATH = $(wildcard ../lib) +VX_RT_PATH = $(wildcard ../../../runtime) +VX_SIMX_PATH = $(wildcard ../../../simX/obj_dir) -RISCV_TOOL_PATH=$(wildcard ~/dev/riscv-gnu-toolchain/drops) +CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc +CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++ +DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump +HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy +GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb -POCL_CC_PATH=$(wildcard ~/dev/pocl/drops_riscv_cc) -POCL_RT_PATH=$(wildcard ~/dev/pocl/drops_riscv_rt) +VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c +VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s +VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s +VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c +VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s +VX_SRCS += $(VX_RT_PATH)/tests/tests.c +VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c +VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) -VX_RT_PATH=$(wildcard ../../../runtime) -VX_SIMX_PATH=$(wildcard ../../../simX/obj_dir) +VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld -CC=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc -CXX=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++ -DMP=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump -HEX=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy -NEWLIB_PATH=$(RISCV_TOOL_PATH)/riscv32-unknown-elf/lib - -VX_NEWLIB = $(VX_RT_PATH)/newlib/newlib.c -VX_STR = $(VX_RT_PATH)/startup/vx_start.s -VX_INT = $(VX_RT_PATH)/intrinsics/vx_intrinsics.s -VX_IO = $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c -VX_FIO = $(VX_RT_PATH)/fileio/fileio.s -VX_API = $(VX_RT_PATH)/vx_api/vx_api.c - -VX_SRCS = $(VX_STR) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) - -CXXFLAGS = -g -O0 -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld -march=rv32im -mabi=ilp32 +CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32 CXXFLAGS += -ffreestanding # program may not begin at main() -CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections +CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions +CXXFLAGS += -I$(POCL_INC_PATH) -LIBS = -lOpenCL +VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a +QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a PROJECT=bfs @@ -37,7 +39,10 @@ lib$(PROJECT).a: kernel.cl POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl $(PROJECT).elf: main.cc lib$(PROJECT).a - $(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc timer.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf + $(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $(PROJECT).elf + +$(PROJECT).qemu: main.cc lib$(PROJECT).a + $(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $(PROJECT).qemu $(PROJECT).hex: $(PROJECT).elf $(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex @@ -45,8 +50,17 @@ $(PROJECT).hex: $(PROJECT).elf $(PROJECT).dump: $(PROJECT).elf $(DMP) -D $(PROJECT).elf > $(PROJECT).dump -run: - $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug +run: $(PROJECT).hex + POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug + +qemu: $(PROJECT).qemu + POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -strace -d in_asm -D debug.log $(PROJECT).qemu + +gdb-s: $(PROJECT).qemu + POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu + +gdb-c: $(PROJECT).qemu + $(GDB) $(PROJECT).qemu clean: - rm -rf *.elf *.dump *.hex *.a *.pocl + rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu \ No newline at end of file diff --git a/benchmarks/opencl/bfs/libbfs.a b/benchmarks/opencl/bfs/libbfs.a index 99a5c1aeb94f58c89a1e4fbebc702f1ead3b84dc..2025b6cd989ca8793ac8994439a1854c899700bb 100644 GIT binary patch delta 209 zcmY+-F%E(-6o+vc)WpTn&A2lBv9$EH9k>9Oc!5Wom}oH3GqJdsa1kfZBcsgUFZsrL ztlv%6I+zF{7=t9maNa%R?Vc{zh%SH#y@$0(R6K@7O@yFPSp*O77fH3S1f>xPE>5qk zS?68gNUSuIaHR^7)&FOPGbcf%rsHbS$VIF?kEw*z?}ImV zL-#&qV$djTKu9pA=#QJH-tOsgRk#362nzC`ws`cjnrc)Hh+qv|HA`ng3~&v^!=&+* zot8O8PQvP$gj*?MT>L*2kpwzT1g(twb-&GR&7oxDS%{sTaa$vlFdjm1cH)O@{O&%h EAD^r}_5c6? diff --git a/benchmarks/opencl/bfs/main.cc b/benchmarks/opencl/bfs/main.cc index eacc9cbf..a63f1ea3 100755 --- a/benchmarks/opencl/bfs/main.cc +++ b/benchmarks/opencl/bfs/main.cc @@ -1,12 +1,14 @@ //--by Jianbin Fang -#define __CL_ENABLE_EXCEPTIONS #include +#include #include #include -#include +#include +#include +#include -#ifdef PROFILING +#ifdef PROFILING #include "timer.h" #endif @@ -15,285 +17,279 @@ #define MAX_THREADS_PER_BLOCK 256 -//Structure to hold a node information -struct Node -{ - int starting; - int no_of_edges; +// Structure to hold a node information +struct Node { + int starting; + int no_of_edges; }; - //---------------------------------------------------------- //--bfs on cpu //--programmer: jianbin //--date: 26/01/2011 //--note: width is changed to the new_width //---------------------------------------------------------- -void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ - int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \ - char *h_graph_visited, int *h_cost_ref){ - char stop; - int k = 0; - do{ - //if no thread changes this value then the loop stops - stop=false; - for(int tid = 0; tid < no_of_nodes; tid++ ) - { - if (h_graph_mask[tid] == true){ - h_graph_mask[tid]=false; - for(int i=h_graph_nodes[tid].starting; i<(h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting); i++){ - int id = h_graph_edges[i]; //--cambine: node id is connected with node tid - if(!h_graph_visited[id]){ //--cambine: if node id has not been visited, enter the body below - h_cost_ref[id]=h_cost_ref[tid]+1; - h_updating_graph_mask[id]=true; - } - } - } - } +void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, + int *h_graph_edges, char *h_graph_mask, + char *h_updating_graph_mask, char *h_graph_visited, + int *h_cost_ref) { + char stop; + int k = 0; + do { + // if no thread changes this value then the loop stops + stop = false; + for (int tid = 0; tid < no_of_nodes; tid++) { + if (h_graph_mask[tid] == true) { + h_graph_mask[tid] = false; + for (int i = h_graph_nodes[tid].starting; + i < (h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting); + i++) { + int id = + h_graph_edges[i]; //--cambine: node id is connected with node tid + if (!h_graph_visited[id]) { //--cambine: if node id has not been + //visited, enter the body below + h_cost_ref[id] = h_cost_ref[tid] + 1; + h_updating_graph_mask[id] = true; + } + } + } + } - for(int tid=0; tid< no_of_nodes ; tid++ ) - { - if (h_updating_graph_mask[tid] == true){ - h_graph_mask[tid]=true; - h_graph_visited[tid]=true; - stop=true; - h_updating_graph_mask[tid]=false; - } - } - k++; - } - while(stop); + for (int tid = 0; tid < no_of_nodes; tid++) { + if (h_updating_graph_mask[tid] == true) { + h_graph_mask[tid] = true; + h_graph_visited[tid] = true; + stop = true; + h_updating_graph_mask[tid] = false; + } + } + k++; + } while (stop); } //---------------------------------------------------------- //--breadth first search on GPUs //---------------------------------------------------------- -void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ - int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \ - char *h_graph_visited, int *h_cost) - throw(std::string){ +void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, + int *h_graph_edges, char *h_graph_mask, + char *h_updating_graph_mask, char *h_graph_visited, + int *h_cost) throw(std::string) { - //int number_elements = height*width; - char h_over; - cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \ - d_graph_visited, d_cost, d_over; - try{ - //--1 transfer data from host to device - _clInit(); - d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes); - d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges); - d_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_graph_mask); - d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_updating_graph_mask); - d_graph_visited = _clMallocRW(no_of_nodes*sizeof(char), h_graph_visited); + // int number_elements = height*width; + char h_over; + cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, + d_graph_visited, d_cost, d_over; + try { + //--1 transfer data from host to device + _clInit(); + d_graph_nodes = _clMalloc(no_of_nodes * sizeof(Node), h_graph_nodes); + d_graph_edges = _clMalloc(edge_list_size * sizeof(int), h_graph_edges); + d_graph_mask = _clMallocRW(no_of_nodes * sizeof(char), h_graph_mask); + d_updating_graph_mask = + _clMallocRW(no_of_nodes * sizeof(char), h_updating_graph_mask); + d_graph_visited = _clMallocRW(no_of_nodes * sizeof(char), h_graph_visited); + d_cost = _clMallocRW(no_of_nodes * sizeof(int), h_cost); + d_over = _clMallocRW(sizeof(char), &h_over); - d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost); - d_over = _clMallocRW(sizeof(char), &h_over); - - _clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes); - _clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges); - _clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(char), h_graph_mask); - _clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(char), h_updating_graph_mask); - _clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(char), h_graph_visited); - _clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost); - - //--2 invoke kernel -#ifdef PROFILING - timer kernel_timer; - double kernel_time = 0.0; - kernel_timer.reset(); - kernel_timer.start(); + _clMemcpyH2D(d_graph_nodes, no_of_nodes * sizeof(Node), h_graph_nodes); + _clMemcpyH2D(d_graph_edges, edge_list_size * sizeof(int), h_graph_edges); + _clMemcpyH2D(d_graph_mask, no_of_nodes * sizeof(char), h_graph_mask); + _clMemcpyH2D(d_updating_graph_mask, no_of_nodes * sizeof(char), + h_updating_graph_mask); + _clMemcpyH2D(d_graph_visited, no_of_nodes * sizeof(char), h_graph_visited); + _clMemcpyH2D(d_cost, no_of_nodes * sizeof(int), h_cost); + +//--2 invoke kernel +#ifdef PROFILING + timer kernel_timer; + double kernel_time = 0.0; + kernel_timer.reset(); + kernel_timer.start(); #endif - do{ - h_over = false; - _clMemcpyH2D(d_over, sizeof(char), &h_over); - //--kernel 0 - int kernel_id = 0; - int kernel_idx = 0; - _clSetArgs(kernel_id, kernel_idx++, d_graph_nodes); - _clSetArgs(kernel_id, kernel_idx++, d_graph_edges); - _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); - _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); - _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); - _clSetArgs(kernel_id, kernel_idx++, d_cost); - _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); - - //int work_items = no_of_nodes; - _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); - - //--kernel 1 - kernel_id = 1; - kernel_idx = 0; - _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); - _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); - _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); - _clSetArgs(kernel_id, kernel_idx++, d_over); - _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); - - //work_items = no_of_nodes; - _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); - - _clMemcpyD2H(d_over,sizeof(char), &h_over); - }while(h_over); - - _clFinish(); -#ifdef PROFILING - kernel_timer.stop(); - kernel_time = kernel_timer.getTimeInSeconds(); + do { + h_over = false; + _clMemcpyH2D(d_over, sizeof(char), &h_over); + //--kernel 0 + int kernel_id = 0; + int kernel_idx = 0; + _clSetArgs(kernel_id, kernel_idx++, d_graph_nodes); + _clSetArgs(kernel_id, kernel_idx++, d_graph_edges); + _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); + _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); + _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); + _clSetArgs(kernel_id, kernel_idx++, d_cost); + _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); + + // int work_items = no_of_nodes; + _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); + + //--kernel 1 + kernel_id = 1; + kernel_idx = 0; + _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); + _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); + _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); + _clSetArgs(kernel_id, kernel_idx++, d_over); + _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); + + // work_items = no_of_nodes; + _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); + + _clMemcpyD2H(d_over, sizeof(char), &h_over); + } while (h_over); + + _clFinish(); +#ifdef PROFILING + kernel_timer.stop(); + kernel_time = kernel_timer.getTimeInSeconds(); #endif - //--3 transfer data from device to host - _clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost); - //--statistics -#ifdef PROFILING - std::cout<<"kernel time(s):"<\n", argv[0]); - -} //---------------------------------------------------------- //--cambine: main function //--author: created by Jianbin Fang //--date: 25/01/2011 //---------------------------------------------------------- -int main(int argc, char * argv[]) -{ - int no_of_nodes; - int edge_list_size; - FILE *fp; - Node* h_graph_nodes; - char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited; - try{ - char *input_f; - if(argc!=2){ - Usage(argc, argv); - exit(0); - } - - input_f = argv[1]; - printf("Reading File\n"); - //Read in Graph from a file - fp = fopen(input_f,"r"); - if(!fp){ - printf("Error Reading graph file\n"); - return 0; - } +int main(int argc, char *argv[]) { + printf("enter demo main\n"); - int source = 0; + int no_of_nodes; + int edge_list_size; + FILE *fp; + Node *h_graph_nodes; + char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited; - fscanf(fp,"%d",&no_of_nodes); + try { + char *input_f = "../data/bfs/graph1MW_6.txt"; + printf("Reading File\n"); + // Read in Graph from a file + fp = fopen(input_f, "r"); + if (!fp) { + printf("Error Reading graph file\n"); + return 0; + } - int num_of_blocks = 1; - int num_of_threads_per_block = no_of_nodes; + int source = 0; - //Make execution Parameters according to the number of nodes - //Distribute threads across multiple Blocks if necessary - if(no_of_nodes>MAX_THREADS_PER_BLOCK){ - num_of_blocks = (int)ceil(no_of_nodes/(double)MAX_THREADS_PER_BLOCK); - num_of_threads_per_block = MAX_THREADS_PER_BLOCK; - } - work_group_size = num_of_threads_per_block; - // allocate host memory - h_graph_nodes = (Node*) malloc(sizeof(Node)*no_of_nodes); - h_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes); - h_updating_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes); - h_graph_visited = (char*) malloc(sizeof(char)*no_of_nodes); - - int start, edgeno; - // initalize the memory - for(int i = 0; i < no_of_nodes; i++){ - fscanf(fp,"%d %d",&start,&edgeno); - h_graph_nodes[i].starting = start; - h_graph_nodes[i].no_of_edges = edgeno; - h_graph_mask[i]=false; - h_updating_graph_mask[i]=false; - h_graph_visited[i]=false; - } - //read the source node from the file - fscanf(fp,"%d",&source); - source=0; - //set the source node as true in the mask - h_graph_mask[source]=true; - h_graph_visited[source]=true; - fscanf(fp,"%d",&edge_list_size); - int id,cost; - int* h_graph_edges = (int*) malloc(sizeof(int)*edge_list_size); - for(int i=0; i < edge_list_size ; i++){ - fscanf(fp,"%d",&id); - fscanf(fp,"%d",&cost); - h_graph_edges[i] = id; - } + fscanf(fp, "%d", &no_of_nodes); - if(fp) - fclose(fp); - // allocate mem for the result on host side - int *h_cost = (int*) malloc(sizeof(int)*no_of_nodes); - int *h_cost_ref = (int*)malloc(sizeof(int)*no_of_nodes); - for(int i=0;i(h_cost_ref, h_cost, no_of_nodes); - //release host memory - free(h_graph_nodes); - free(h_graph_mask); - free(h_updating_graph_mask); - free(h_graph_visited); + int num_of_blocks = 1; + int num_of_threads_per_block = no_of_nodes; - } - catch(std::string msg){ - std::cout<<"--cambine: exception in main ->"< MAX_THREADS_PER_BLOCK) { + num_of_blocks = (int)ceil(no_of_nodes / (double)MAX_THREADS_PER_BLOCK); + num_of_threads_per_block = MAX_THREADS_PER_BLOCK; + } + work_group_size = num_of_threads_per_block; + // allocate host memory + h_graph_nodes = (Node *)malloc(sizeof(Node) * no_of_nodes); + h_graph_mask = (char *)malloc(sizeof(char) * no_of_nodes); + h_updating_graph_mask = (char *)malloc(sizeof(char) * no_of_nodes); + h_graph_visited = (char *)malloc(sizeof(char) * no_of_nodes); + + int start, edgeno; + // initalize the memory + for (int i = 0; i < no_of_nodes; i++) { + fscanf(fp, "%d %d", &start, &edgeno); + h_graph_nodes[i].starting = start; + h_graph_nodes[i].no_of_edges = edgeno; + h_graph_mask[i] = false; + h_updating_graph_mask[i] = false; + h_graph_visited[i] = false; + } + // read the source node from the file + fscanf(fp, "%d", &source); + source = 0; + // set the source node as true in the mask + h_graph_mask[source] = true; + h_graph_visited[source] = true; + fscanf(fp, "%d", &edge_list_size); + int id, cost; + int *h_graph_edges = (int *)malloc(sizeof(int) * edge_list_size); + for (int i = 0; i < edge_list_size; i++) { + fscanf(fp, "%d", &id); + fscanf(fp, "%d", &cost); + h_graph_edges[i] = id; + } + + if (fp) + fclose(fp); + // allocate mem for the result on host side + int *h_cost = (int *)malloc(sizeof(int) * no_of_nodes); + int *h_cost_ref = (int *)malloc(sizeof(int) * no_of_nodes); + for (int i = 0; i < no_of_nodes; i++) { + h_cost[i] = -1; + h_cost_ref[i] = -1; + } + h_cost[source] = 0; + h_cost_ref[source] = 0; + //--------------------------------------------------------- + //--gpu entry + run_bfs_gpu(no_of_nodes, h_graph_nodes, edge_list_size, h_graph_edges, + h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost); + //--------------------------------------------------------- + //--cpu entry + // initalize the memory again + for (int i = 0; i < no_of_nodes; i++) { + h_graph_mask[i] = false; + h_updating_graph_mask[i] = false; + h_graph_visited[i] = false; + } + // set the source node as true in the mask + source = 0; + h_graph_mask[source] = true; + h_graph_visited[source] = true; + run_bfs_cpu(no_of_nodes, h_graph_nodes, edge_list_size, h_graph_edges, + h_graph_mask, h_updating_graph_mask, h_graph_visited, + h_cost_ref); + //--------------------------------------------------------- + //--result varification + compare_results(h_cost_ref, h_cost, no_of_nodes); + // release host memory + free(h_graph_nodes); + free(h_graph_mask); + free(h_updating_graph_mask); + free(h_graph_visited); + + } catch (std::string msg) { + std::cout << "--cambine: exception in main ->" << msg << std::endl; + // release host memory + free(h_graph_nodes); + free(h_graph_mask); + free(h_updating_graph_mask); + free(h_graph_visited); + } + + return 0; } diff --git a/benchmarks/opencl/bfs/timer.h b/benchmarks/opencl/bfs/timer.h index e5efdc18..b142e279 100755 --- a/benchmarks/opencl/bfs/timer.h +++ b/benchmarks/opencl/bfs/timer.h @@ -3,126 +3,99 @@ #include - class timer { - public: - timer(const char *name = 0); - timer(const char *name, std::ostream &write_on_exit); +public: + timer(const char *name = 0); + timer(const char *name, std::ostream &write_on_exit); - ~timer(); + ~timer(); - void start(), stop(); - void reset(); - std::ostream &print(std::ostream &); + void start(), stop(); + void reset(); + std::ostream &print(std::ostream &); - double getTimeInSeconds(); + double getTimeInSeconds(); - private: - void print_time(std::ostream &, const char *which, double time) const; +private: + void print_time(std::ostream &, const char *which, double time) const; - union { - long long total_time; - struct { + union { + long long total_time; + struct { #if defined __PPC__ - int high, low; + int high, low; #else - int low, high; + int low, high; #endif - }; - }; + }; + }; - unsigned long long count; - const char *const name; - std::ostream *const write_on_exit; + unsigned long long count; + const char *const name; + std::ostream *const write_on_exit; - static double CPU_speed_in_MHz, get_CPU_speed_in_MHz(); + static double CPU_speed_in_MHz, get_CPU_speed_in_MHz(); }; +std::ostream &operator<<(std::ostream &, class timer &); -std::ostream &operator << (std::ostream &, class timer &); - - -inline void timer::reset() -{ - total_time = 0; - count = 0; +inline void timer::reset() { + total_time = 0; + count = 0; } - -inline timer::timer(const char *name) -: - name(name), - write_on_exit(0) -{ - reset(); +inline timer::timer(const char *name) : name(name), write_on_exit(0) { + reset(); } - inline timer::timer(const char *name, std::ostream &write_on_exit) -: - name(name), - write_on_exit(&write_on_exit) -{ - reset(); + : name(name), write_on_exit(&write_on_exit) { + reset(); } - -inline timer::~timer() -{ - if (write_on_exit != 0) - print(*write_on_exit); +inline timer::~timer() { + if (write_on_exit != 0) + print(*write_on_exit); } - -inline void timer::start() -{ +inline void timer::start() { #if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64) - unsigned eax, edx; + unsigned eax, edx; - asm volatile ("rdtsc" : "=a" (eax), "=d" (edx)); + asm volatile("rdtsc" : "=a"(eax), "=d"(edx)); - total_time -= ((unsigned long long) edx << 32) + eax; -#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64) - asm volatile - ( - "rdtsc\n\t" - "subl %%eax, %0\n\t" - "sbbl %%edx, %1" - : - "+m" (low), "+m" (high) - : - : - "eax", "edx" - ); + total_time -= ((unsigned long long)edx << 32) + eax; +#elif (defined __GNUC__ || defined __INTEL_COMPILER) && \ + (defined __i386 || defined __x86_64) + asm volatile("rdtsc\n\t" + "subl %%eax, %0\n\t" + "sbbl %%edx, %1" + : "+m"(low), "+m"(high) + : + : "eax", "edx"); #else #error Compiler/Architecture not recognized #endif } - -inline void timer::stop() -{ +inline void timer::stop() { #if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64) - unsigned eax, edx; + unsigned eax, edx; - asm volatile ("rdtsc" : "=a" (eax), "=d" (edx)); + asm volatile("rdtsc" : "=a"(eax), "=d"(edx)); - total_time += ((unsigned long long) edx << 32) + eax; -#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64) - asm volatile - ( - "rdtsc\n\t" - "addl %%eax, %0\n\t" - "adcl %%edx, %1" - : - "+m" (low), "+m" (high) - : - : - "eax", "edx" - ); + total_time += ((unsigned long long)edx << 32) + eax; +#elif (defined __GNUC__ || defined __INTEL_COMPILER) && \ + (defined __i386 || defined __x86_64) + asm volatile("rdtsc\n\t" + "addl %%eax, %0\n\t" + "adcl %%edx, %1" + : "+m"(low), "+m"(high) + : + : "eax", "edx"); #endif - ++ count; + ++count; } #endif diff --git a/benchmarks/opencl/kmeans/Makefile b/benchmarks/opencl/kmeans/Makefile index ef4d8ecc..b60de117 100644 --- a/benchmarks/opencl/kmeans/Makefile +++ b/benchmarks/opencl/kmeans/Makefile @@ -1,44 +1,60 @@ +RISCV_TOOL_PATH = $(wildcard ~/dev/riscv-gnu-toolchain/drops) +POCL_CC_PATH = $(wildcard ~/dev/pocl/drops_riscv_cc) +POCL_INC_PATH = $(wildcard ../include) +POCL_LIB_PATH = $(wildcard ../lib) +VX_RT_PATH = $(wildcard ../../../runtime) +VX_SIMX_PATH = $(wildcard ../../../simX/obj_dir) -RISCV_TOOL_PATH=$(wildcard ~/dev/riscv-gnu-toolchain/drops) +CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc +CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++ +DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump +HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy +GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb -POCL_CC_PATH=$(wildcard ~/dev/pocl/drops_riscv_cc) -POCL_RT_PATH=$(wildcard ~/dev/pocl/drops_riscv_rt) +VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c +VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s +VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s +VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c +VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s +VX_SRCS += $(VX_RT_PATH)/tests/tests.c +VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c +VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) -VX_RT_PATH=$(wildcard ../../../runtime) -VX_SIMX_PATH=$(wildcard ../../../simX/obj_dir) +VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld -CC=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc -CXX=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++ -DMP=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump -HEX=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy -NEWLIB_PATH=$(RISCV_TOOL_PATH)/riscv32-unknown-elf/lib - -VX_NEWLIB = $(VX_RT_PATH)/newlib/newlib.c -VX_STR = $(VX_RT_PATH)/startup/vx_start.s -VX_INT = $(VX_RT_PATH)/intrinsics/vx_intrinsics.s -VX_IO = $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c -VX_FIO = $(VX_RT_PATH)/fileio/fileio.s -VX_API = $(VX_RT_PATH)/vx_api/vx_api.c - -VX_SRCS = $(VX_STR) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) - -CXXFLAGS = -g -O0 -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld -march=rv32im -mabi=ilp32 +CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32 CXXFLAGS += -ffreestanding # program may not begin at main() -CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections +CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions +CXXFLAGS += -I$(POCL_INC_PATH) -LIBS = -lOpenCL +VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a +QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a PROJECT=kmeans -PROJECT=saxpy all: $(PROJECT).dump $(PROJECT).hex lib$(PROJECT).a: kernel.cl POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl -$(PROJECT).elf: main.cc lib$(PROJECT).a - $(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc rmse.c read_input.c cluster.c kmeans_clustering.c -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf +kmeans_clustering.o: kmeans_clustering.c + $(CC) $(CXXFLAGS) -c kmeans_clustering.c + +cluster.o: cluster.c + $(CC) $(CXXFLAGS) -c cluster.c + +read_input.o: read_input.c + $(CC) $(CXXFLAGS) -c read_input.c + +rmse.o: rmse.c + $(CC) $(CXXFLAGS) -c rmse.c + +$(PROJECT).elf: main.cc lib$(PROJECT).a read_input.o rmse.o cluster.o kmeans_clustering.o + $(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc read_input.o rmse.o cluster.o kmeans_clustering.o $(VX_LIBS) -o $(PROJECT).elf + +$(PROJECT).qemu: main.cc lib$(PROJECT).a read_input.o rmse.o cluster.o kmeans_clustering.o + $(CXX) $(CXXFLAGS) main.cc read_input.o rmse.o cluster.o kmeans_clustering.o $(QEMU_LIBS) -o $(PROJECT).qemu $(PROJECT).hex: $(PROJECT).elf $(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex @@ -46,8 +62,17 @@ $(PROJECT).hex: $(PROJECT).elf $(PROJECT).dump: $(PROJECT).elf $(DMP) -D $(PROJECT).elf > $(PROJECT).dump -run: - $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug +run: $(PROJECT).hex + POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug + +qemu: $(PROJECT).qemu + POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -strace -d in_asm -D debug.log $(PROJECT).qemu + +gdb-s: $(PROJECT).qemu + POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu + +gdb-c: $(PROJECT).qemu + $(GDB) $(PROJECT).qemu clean: - rm -rf *.elf *.dump *.hex *.a *.pocl + rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu \ No newline at end of file diff --git a/benchmarks/opencl/saxpy/Makefile b/benchmarks/opencl/saxpy/Makefile index 13343b75..de0784f1 100644 --- a/benchmarks/opencl/saxpy/Makefile +++ b/benchmarks/opencl/saxpy/Makefile @@ -63,4 +63,4 @@ gdb-c: $(PROJECT).qemu $(GDB) $(PROJECT).qemu clean: - rm -rf *.elf *.dump *.hex *.a *.pocl *.qemu \ No newline at end of file + rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu \ No newline at end of file diff --git a/benchmarks/opencl/sfilter/Makefile b/benchmarks/opencl/sfilter/Makefile index 2c8b716a..72b9ae51 100644 --- a/benchmarks/opencl/sfilter/Makefile +++ b/benchmarks/opencl/sfilter/Makefile @@ -63,4 +63,4 @@ gdb-c: $(PROJECT).qemu $(GDB) $(PROJECT).qemu clean: - rm -rf *.elf *.dump *.hex *.a *.pocl *.qemu \ No newline at end of file + rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu \ No newline at end of file diff --git a/benchmarks/opencl/sgemm/Makefile b/benchmarks/opencl/sgemm/Makefile index cd568322..fda3cbc5 100644 --- a/benchmarks/opencl/sgemm/Makefile +++ b/benchmarks/opencl/sgemm/Makefile @@ -63,4 +63,4 @@ gdb-c: $(PROJECT).qemu $(GDB) $(PROJECT).qemu clean: - rm -rf *.elf *.dump *.hex *.a *.pocl *.qemu \ No newline at end of file + rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu \ No newline at end of file diff --git a/benchmarks/opencl/vecadd/Makefile b/benchmarks/opencl/vecadd/Makefile index c257e649..f8b76430 100644 --- a/benchmarks/opencl/vecadd/Makefile +++ b/benchmarks/opencl/vecadd/Makefile @@ -63,4 +63,4 @@ gdb-c: $(PROJECT).qemu $(GDB) $(PROJECT).qemu clean: - rm -rf *.elf *.dump *.hex *.a *.pocl *.qemu \ No newline at end of file + rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu \ No newline at end of file