merging with master

This commit is contained in:
fares
2019-11-23 20:40:19 -05:00
16 changed files with 29131 additions and 415 deletions

View File

@@ -131,6 +131,7 @@ void _clCmdParams(int argc, char* argv[]){
// devices have no relationship with context
void _clInit()
{
printf("_clInit()\n");
int DEVICE_ID_INUSED = device_id_inused;
cl_int resultCL;
@@ -225,15 +226,18 @@ void _clInit()
throw(string("InitCL()::Creating Command Queue. (clCreateCommandQueue)"));
//-----------------------------------------------
//--cambine-5: Load CL file, build CL program object, create CL kernel object
std::string source_str = FileToString(kernel_file);
/*std::string source_str = FileToString(kernel_file);
const char * source = source_str.c_str();
size_t sourceSize[] = { source_str.length() };
size_t sourceSize[] = { source_str.length() };*/
oclHandles.program = clCreateProgramWithSource(oclHandles.context,
oclHandles.program =
clCreateProgramWithBuiltInKernels(oclHandles.context, 1, &oclHandles.devices[DEVICE_ID_INUSED], "BFS_1, BFS_2", &resultCL);
/*oclHandles.program = clCreateProgramWithSource(oclHandles.context,
1,
&source,
sourceSize,
&resultCL);
&resultCL);*/
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL))
throw(string("InitCL()::Error: Loading Binary into cl_program. (clCreateProgramWithBinary)"));

View File

@@ -1,33 +1,35 @@
RISCV_TOOL_PATH = $(wildcard ~/dev/riscv-gnu-toolchain/drops)
POCL_CC_PATH = $(wildcard ~/dev/pocl/drops_riscv_cc)
POCL_INC_PATH = $(wildcard ../include)
POCL_LIB_PATH = $(wildcard ../lib)
VX_RT_PATH = $(wildcard ../../../runtime)
VX_SIMX_PATH = $(wildcard ../../../simX/obj_dir)
RISCV_TOOL_PATH=$(wildcard ~/dev/riscv-gnu-toolchain/drops)
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
POCL_CC_PATH=$(wildcard ~/dev/pocl/drops_riscv_cc)
POCL_RT_PATH=$(wildcard ~/dev/pocl/drops_riscv_rt)
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
VX_RT_PATH=$(wildcard ../../../runtime)
VX_SIMX_PATH=$(wildcard ../../../simX/obj_dir)
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
CC=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
NEWLIB_PATH=$(RISCV_TOOL_PATH)/riscv32-unknown-elf/lib
VX_NEWLIB = $(VX_RT_PATH)/newlib/newlib.c
VX_STR = $(VX_RT_PATH)/startup/vx_start.s
VX_INT = $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_IO = $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_FIO = $(VX_RT_PATH)/fileio/fileio.s
VX_API = $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS = $(VX_STR) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
CXXFLAGS = -g -O0 -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld -march=rv32im -mabi=ilp32
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH)
LIBS = -lOpenCL
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a
PROJECT=bfs
@@ -37,7 +39,10 @@ lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
$(PROJECT).elf: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc timer.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
@@ -45,8 +50,17 @@ $(PROJECT).hex: $(PROJECT).elf
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run:
$(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -strace -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean:
rm -rf *.elf *.dump *.hex *.a *.pocl
rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu

28677
benchmarks/opencl/bfs/graph4096.txt Executable file

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@@ -1,12 +1,14 @@
//--by Jianbin Fang
#define __CL_ENABLE_EXCEPTIONS
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
#include <cstring>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#ifdef PROFILING
#ifdef PROFILING
#include "timer.h"
#endif
@@ -15,285 +17,281 @@
#define MAX_THREADS_PER_BLOCK 256
//Structure to hold a node information
struct Node
{
int starting;
int no_of_edges;
// Structure to hold a node information
struct Node {
int starting;
int no_of_edges;
};
//----------------------------------------------------------
//--bfs on cpu
//--programmer: jianbin
//--date: 26/01/2011
//--note: width is changed to the new_width
//----------------------------------------------------------
void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \
int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \
char *h_graph_visited, int *h_cost_ref){
char stop;
int k = 0;
do{
//if no thread changes this value then the loop stops
stop=false;
for(int tid = 0; tid < no_of_nodes; tid++ )
{
if (h_graph_mask[tid] == true){
h_graph_mask[tid]=false;
for(int i=h_graph_nodes[tid].starting; i<(h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting); i++){
int id = h_graph_edges[i]; //--cambine: node id is connected with node tid
if(!h_graph_visited[id]){ //--cambine: if node id has not been visited, enter the body below
h_cost_ref[id]=h_cost_ref[tid]+1;
h_updating_graph_mask[id]=true;
}
}
}
}
void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
int *h_graph_edges, char *h_graph_mask,
char *h_updating_graph_mask, char *h_graph_visited,
int *h_cost_ref) {
char stop;
int k = 0;
do {
// if no thread changes this value then the loop stops
stop = false;
for (int tid = 0; tid < no_of_nodes; tid++) {
if (h_graph_mask[tid] == true) {
h_graph_mask[tid] = false;
for (int i = h_graph_nodes[tid].starting;
i < (h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting);
i++) {
int id =
h_graph_edges[i]; //--cambine: node id is connected with node tid
if (!h_graph_visited[id]) { //--cambine: if node id has not been
//visited, enter the body below
h_cost_ref[id] = h_cost_ref[tid] + 1;
h_updating_graph_mask[id] = true;
}
}
}
}
for(int tid=0; tid< no_of_nodes ; tid++ )
{
if (h_updating_graph_mask[tid] == true){
h_graph_mask[tid]=true;
h_graph_visited[tid]=true;
stop=true;
h_updating_graph_mask[tid]=false;
}
}
k++;
}
while(stop);
for (int tid = 0; tid < no_of_nodes; tid++) {
if (h_updating_graph_mask[tid] == true) {
h_graph_mask[tid] = true;
h_graph_visited[tid] = true;
stop = true;
h_updating_graph_mask[tid] = false;
}
}
k++;
} while (stop);
}
//----------------------------------------------------------
//--breadth first search on GPUs
//----------------------------------------------------------
void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \
int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \
char *h_graph_visited, int *h_cost)
throw(std::string){
void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
int *h_graph_edges, char *h_graph_mask,
char *h_updating_graph_mask, char *h_graph_visited,
int *h_cost) throw(std::string) {
//int number_elements = height*width;
char h_over;
cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \
d_graph_visited, d_cost, d_over;
try{
//--1 transfer data from host to device
_clInit();
d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes);
d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges);
d_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_graph_mask);
d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_updating_graph_mask);
d_graph_visited = _clMallocRW(no_of_nodes*sizeof(char), h_graph_visited);
// int number_elements = height*width;
char h_over;
cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask,
d_graph_visited, d_cost, d_over;
try {
//--1 transfer data from host to device
_clInit();
d_graph_nodes = _clMalloc(no_of_nodes * sizeof(Node), h_graph_nodes);
d_graph_edges = _clMalloc(edge_list_size * sizeof(int), h_graph_edges);
d_graph_mask = _clMallocRW(no_of_nodes * sizeof(char), h_graph_mask);
d_updating_graph_mask =
_clMallocRW(no_of_nodes * sizeof(char), h_updating_graph_mask);
d_graph_visited = _clMallocRW(no_of_nodes * sizeof(char), h_graph_visited);
d_cost = _clMallocRW(no_of_nodes * sizeof(int), h_cost);
d_over = _clMallocRW(sizeof(char), &h_over);
d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost);
d_over = _clMallocRW(sizeof(char), &h_over);
_clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes);
_clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges);
_clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(char), h_graph_mask);
_clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(char), h_updating_graph_mask);
_clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(char), h_graph_visited);
_clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost);
//--2 invoke kernel
#ifdef PROFILING
timer kernel_timer;
double kernel_time = 0.0;
kernel_timer.reset();
kernel_timer.start();
_clMemcpyH2D(d_graph_nodes, no_of_nodes * sizeof(Node), h_graph_nodes);
_clMemcpyH2D(d_graph_edges, edge_list_size * sizeof(int), h_graph_edges);
_clMemcpyH2D(d_graph_mask, no_of_nodes * sizeof(char), h_graph_mask);
_clMemcpyH2D(d_updating_graph_mask, no_of_nodes * sizeof(char),
h_updating_graph_mask);
_clMemcpyH2D(d_graph_visited, no_of_nodes * sizeof(char), h_graph_visited);
_clMemcpyH2D(d_cost, no_of_nodes * sizeof(int), h_cost);
//--2 invoke kernel
#ifdef PROFILING
timer kernel_timer;
double kernel_time = 0.0;
kernel_timer.reset();
kernel_timer.start();
#endif
do{
h_over = false;
_clMemcpyH2D(d_over, sizeof(char), &h_over);
//--kernel 0
int kernel_id = 0;
int kernel_idx = 0;
_clSetArgs(kernel_id, kernel_idx++, d_graph_nodes);
_clSetArgs(kernel_id, kernel_idx++, d_graph_edges);
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
_clSetArgs(kernel_id, kernel_idx++, d_cost);
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
//int work_items = no_of_nodes;
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
//--kernel 1
kernel_id = 1;
kernel_idx = 0;
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
_clSetArgs(kernel_id, kernel_idx++, d_over);
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
//work_items = no_of_nodes;
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
_clMemcpyD2H(d_over,sizeof(char), &h_over);
}while(h_over);
_clFinish();
#ifdef PROFILING
kernel_timer.stop();
kernel_time = kernel_timer.getTimeInSeconds();
do {
h_over = false;
_clMemcpyH2D(d_over, sizeof(char), &h_over);
//--kernel 0
int kernel_id = 0;
int kernel_idx = 0;
_clSetArgs(kernel_id, kernel_idx++, d_graph_nodes);
_clSetArgs(kernel_id, kernel_idx++, d_graph_edges);
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
_clSetArgs(kernel_id, kernel_idx++, d_cost);
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
// int work_items = no_of_nodes;
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
//--kernel 1
kernel_id = 1;
kernel_idx = 0;
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
_clSetArgs(kernel_id, kernel_idx++, d_over);
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
// work_items = no_of_nodes;
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
_clMemcpyD2H(d_over, sizeof(char), &h_over);
} while (h_over);
_clFinish();
#ifdef PROFILING
kernel_timer.stop();
kernel_time = kernel_timer.getTimeInSeconds();
#endif
//--3 transfer data from device to host
_clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost);
//--statistics
#ifdef PROFILING
std::cout<<"kernel time(s):"<<kernel_time<<std::endl;
//--3 transfer data from device to host
_clMemcpyD2H(d_cost, no_of_nodes * sizeof(int), h_cost);
//--statistics
#ifdef PROFILING
std::cout << "kernel time(s):" << kernel_time << std::endl;
#endif
//--4 release cl resources.
_clFree(d_graph_nodes);
_clFree(d_graph_edges);
_clFree(d_graph_mask);
_clFree(d_updating_graph_mask);
_clFree(d_graph_visited);
_clFree(d_cost);
_clFree(d_over);
_clRelease();
}
catch(std::string msg){
_clFree(d_graph_nodes);
_clFree(d_graph_edges);
_clFree(d_graph_mask);
_clFree(d_updating_graph_mask);
_clFree(d_graph_visited);
_clFree(d_cost);
_clFree(d_over);
_clRelease();
std::string e_str = "in run_transpose_gpu -> ";
e_str += msg;
throw(e_str);
}
return ;
//--4 release cl resources.
_clFree(d_graph_nodes);
_clFree(d_graph_edges);
_clFree(d_graph_mask);
_clFree(d_updating_graph_mask);
_clFree(d_graph_visited);
_clFree(d_cost);
_clFree(d_over);
_clRelease();
} catch (std::string msg) {
_clFree(d_graph_nodes);
_clFree(d_graph_edges);
_clFree(d_graph_mask);
_clFree(d_updating_graph_mask);
_clFree(d_graph_visited);
_clFree(d_cost);
_clFree(d_over);
_clRelease();
std::string e_str = "in run_transpose_gpu -> ";
e_str += msg;
throw(e_str);
}
return;
}
void Usage(int argc, char**argv){
fprintf(stderr,"Usage: %s <input_file>\n", argv[0]);
}
//----------------------------------------------------------
//--cambine: main function
//--author: created by Jianbin Fang
//--date: 25/01/2011
//----------------------------------------------------------
int main(int argc, char * argv[])
{
int no_of_nodes;
int edge_list_size;
FILE *fp;
Node* h_graph_nodes;
char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited;
try{
char *input_f;
if(argc!=2){
Usage(argc, argv);
exit(0);
}
input_f = argv[1];
printf("Reading File\n");
//Read in Graph from a file
fp = fopen(input_f,"r");
if(!fp){
printf("Error Reading graph file\n");
return 0;
}
int main(int argc, char *argv[]) {
printf("enter demo main\n");
int source = 0;
int no_of_nodes;
int edge_list_size;
FILE *fp;
Node *h_graph_nodes;
char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited;
fscanf(fp,"%d",&no_of_nodes);
try {
char *input_f = "graph4096.txt";
printf("Reading File\n");
// Read in Graph from a file
fp = fopen(input_f, "r");
if (!fp) {
printf("Error Reading graph file\n");
return 0;
}
int num_of_blocks = 1;
int num_of_threads_per_block = no_of_nodes;
printf("Reading File completed!\n");
//Make execution Parameters according to the number of nodes
//Distribute threads across multiple Blocks if necessary
if(no_of_nodes>MAX_THREADS_PER_BLOCK){
num_of_blocks = (int)ceil(no_of_nodes/(double)MAX_THREADS_PER_BLOCK);
num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
}
work_group_size = num_of_threads_per_block;
// allocate host memory
h_graph_nodes = (Node*) malloc(sizeof(Node)*no_of_nodes);
h_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes);
h_updating_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes);
h_graph_visited = (char*) malloc(sizeof(char)*no_of_nodes);
int start, edgeno;
// initalize the memory
for(int i = 0; i < no_of_nodes; i++){
fscanf(fp,"%d %d",&start,&edgeno);
h_graph_nodes[i].starting = start;
h_graph_nodes[i].no_of_edges = edgeno;
h_graph_mask[i]=false;
h_updating_graph_mask[i]=false;
h_graph_visited[i]=false;
}
//read the source node from the file
fscanf(fp,"%d",&source);
source=0;
//set the source node as true in the mask
h_graph_mask[source]=true;
h_graph_visited[source]=true;
fscanf(fp,"%d",&edge_list_size);
int id,cost;
int* h_graph_edges = (int*) malloc(sizeof(int)*edge_list_size);
for(int i=0; i < edge_list_size ; i++){
fscanf(fp,"%d",&id);
fscanf(fp,"%d",&cost);
h_graph_edges[i] = id;
}
int source = 0;
if(fp)
fclose(fp);
// allocate mem for the result on host side
int *h_cost = (int*) malloc(sizeof(int)*no_of_nodes);
int *h_cost_ref = (int*)malloc(sizeof(int)*no_of_nodes);
for(int i=0;i<no_of_nodes;i++){
h_cost[i]=-1;
h_cost_ref[i] = -1;
}
h_cost[source]=0;
h_cost_ref[source]=0;
//---------------------------------------------------------
//--gpu entry
run_bfs_gpu(no_of_nodes,h_graph_nodes,edge_list_size,h_graph_edges, h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost);
//---------------------------------------------------------
//--cpu entry
// initalize the memory again
for(int i = 0; i < no_of_nodes; i++){
h_graph_mask[i]=false;
h_updating_graph_mask[i]=false;
h_graph_visited[i]=false;
}
//set the source node as true in the mask
source=0;
h_graph_mask[source]=true;
h_graph_visited[source]=true;
run_bfs_cpu(no_of_nodes,h_graph_nodes,edge_list_size,h_graph_edges, h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost_ref);
//---------------------------------------------------------
//--result varification
compare_results<int>(h_cost_ref, h_cost, no_of_nodes);
//release host memory
free(h_graph_nodes);
free(h_graph_mask);
free(h_updating_graph_mask);
free(h_graph_visited);
fscanf(fp, "%d", &no_of_nodes);
}
catch(std::string msg){
std::cout<<"--cambine: exception in main ->"<<msg<<std::endl;
//release host memory
free(h_graph_nodes);
free(h_graph_mask);
free(h_updating_graph_mask);
free(h_graph_visited);
}
return 0;
int num_of_blocks = 1;
int num_of_threads_per_block = no_of_nodes;
// Make execution Parameters according to the number of nodes
// Distribute threads across multiple Blocks if necessary
if (no_of_nodes > MAX_THREADS_PER_BLOCK) {
num_of_blocks = (int)ceil(no_of_nodes / (double)MAX_THREADS_PER_BLOCK);
num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
}
work_group_size = num_of_threads_per_block;
// allocate host memory
h_graph_nodes = (Node *)malloc(sizeof(Node) * no_of_nodes);
h_graph_mask = (char *)malloc(sizeof(char) * no_of_nodes);
h_updating_graph_mask = (char *)malloc(sizeof(char) * no_of_nodes);
h_graph_visited = (char *)malloc(sizeof(char) * no_of_nodes);
int start, edgeno;
// initalize the memory
for (int i = 0; i < no_of_nodes; i++) {
fscanf(fp, "%d %d", &start, &edgeno);
h_graph_nodes[i].starting = start;
h_graph_nodes[i].no_of_edges = edgeno;
h_graph_mask[i] = false;
h_updating_graph_mask[i] = false;
h_graph_visited[i] = false;
}
// read the source node from the file
fscanf(fp, "%d", &source);
source = 0;
// set the source node as true in the mask
h_graph_mask[source] = true;
h_graph_visited[source] = true;
fscanf(fp, "%d", &edge_list_size);
int id, cost;
int *h_graph_edges = (int *)malloc(sizeof(int) * edge_list_size);
for (int i = 0; i < edge_list_size; i++) {
fscanf(fp, "%d", &id);
fscanf(fp, "%d", &cost);
h_graph_edges[i] = id;
}
if (fp)
fclose(fp);
// allocate mem for the result on host side
int *h_cost = (int *)malloc(sizeof(int) * no_of_nodes);
int *h_cost_ref = (int *)malloc(sizeof(int) * no_of_nodes);
for (int i = 0; i < no_of_nodes; i++) {
h_cost[i] = -1;
h_cost_ref[i] = -1;
}
h_cost[source] = 0;
h_cost_ref[source] = 0;
//---------------------------------------------------------
//--gpu entry
run_bfs_gpu(no_of_nodes, h_graph_nodes, edge_list_size, h_graph_edges,
h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost);
//---------------------------------------------------------
//--cpu entry
// initalize the memory again
for (int i = 0; i < no_of_nodes; i++) {
h_graph_mask[i] = false;
h_updating_graph_mask[i] = false;
h_graph_visited[i] = false;
}
// set the source node as true in the mask
source = 0;
h_graph_mask[source] = true;
h_graph_visited[source] = true;
run_bfs_cpu(no_of_nodes, h_graph_nodes, edge_list_size, h_graph_edges,
h_graph_mask, h_updating_graph_mask, h_graph_visited,
h_cost_ref);
//---------------------------------------------------------
//--result varification
compare_results<int>(h_cost_ref, h_cost, no_of_nodes);
// release host memory
free(h_graph_nodes);
free(h_graph_mask);
free(h_updating_graph_mask);
free(h_graph_visited);
} catch (std::string msg) {
std::cout << "--cambine: exception in main ->" << msg << std::endl;
// release host memory
free(h_graph_nodes);
free(h_graph_mask);
free(h_updating_graph_mask);
free(h_graph_visited);
}
return 0;
}

View File

@@ -3,126 +3,99 @@
#include <iostream>
class timer {
public:
timer(const char *name = 0);
timer(const char *name, std::ostream &write_on_exit);
public:
timer(const char *name = 0);
timer(const char *name, std::ostream &write_on_exit);
~timer();
~timer();
void start(), stop();
void reset();
std::ostream &print(std::ostream &);
void start(), stop();
void reset();
std::ostream &print(std::ostream &);
double getTimeInSeconds();
double getTimeInSeconds();
private:
void print_time(std::ostream &, const char *which, double time) const;
private:
void print_time(std::ostream &, const char *which, double time) const;
union {
long long total_time;
struct {
union {
long long total_time;
struct {
#if defined __PPC__
int high, low;
int high, low;
#else
int low, high;
int low, high;
#endif
};
};
};
};
unsigned long long count;
const char *const name;
std::ostream *const write_on_exit;
unsigned long long count;
const char *const name;
std::ostream *const write_on_exit;
static double CPU_speed_in_MHz, get_CPU_speed_in_MHz();
static double CPU_speed_in_MHz, get_CPU_speed_in_MHz();
};
std::ostream &operator<<(std::ostream &, class timer &);
std::ostream &operator << (std::ostream &, class timer &);
inline void timer::reset()
{
total_time = 0;
count = 0;
inline void timer::reset() {
total_time = 0;
count = 0;
}
inline timer::timer(const char *name)
:
name(name),
write_on_exit(0)
{
reset();
inline timer::timer(const char *name) : name(name), write_on_exit(0) {
reset();
}
inline timer::timer(const char *name, std::ostream &write_on_exit)
:
name(name),
write_on_exit(&write_on_exit)
{
reset();
: name(name), write_on_exit(&write_on_exit) {
reset();
}
inline timer::~timer()
{
if (write_on_exit != 0)
print(*write_on_exit);
inline timer::~timer() {
if (write_on_exit != 0)
print(*write_on_exit);
}
inline void timer::start()
{
inline void timer::start() {
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
unsigned eax, edx;
unsigned eax, edx;
asm volatile ("rdtsc" : "=a" (eax), "=d" (edx));
asm volatile("rdtsc" : "=a"(eax), "=d"(edx));
total_time -= ((unsigned long long) edx << 32) + eax;
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64)
asm volatile
(
"rdtsc\n\t"
"subl %%eax, %0\n\t"
"sbbl %%edx, %1"
:
"+m" (low), "+m" (high)
:
:
"eax", "edx"
);
total_time -= ((unsigned long long)edx << 32) + eax;
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && \
(defined __i386 || defined __x86_64)
asm volatile("rdtsc\n\t"
"subl %%eax, %0\n\t"
"sbbl %%edx, %1"
: "+m"(low), "+m"(high)
:
: "eax", "edx");
#else
#error Compiler/Architecture not recognized
#endif
}
inline void timer::stop()
{
inline void timer::stop() {
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
unsigned eax, edx;
unsigned eax, edx;
asm volatile ("rdtsc" : "=a" (eax), "=d" (edx));
asm volatile("rdtsc" : "=a"(eax), "=d"(edx));
total_time += ((unsigned long long) edx << 32) + eax;
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64)
asm volatile
(
"rdtsc\n\t"
"addl %%eax, %0\n\t"
"adcl %%edx, %1"
:
"+m" (low), "+m" (high)
:
:
"eax", "edx"
);
total_time += ((unsigned long long)edx << 32) + eax;
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && \
(defined __i386 || defined __x86_64)
asm volatile("rdtsc\n\t"
"addl %%eax, %0\n\t"
"adcl %%edx, %1"
: "+m"(low), "+m"(high)
:
: "eax", "edx");
#endif
++ count;
++count;
}
#endif

View File

@@ -1,44 +1,60 @@
RISCV_TOOL_PATH = $(wildcard ~/dev/riscv-gnu-toolchain/drops)
POCL_CC_PATH = $(wildcard ~/dev/pocl/drops_riscv_cc)
POCL_INC_PATH = $(wildcard ../include)
POCL_LIB_PATH = $(wildcard ../lib)
VX_RT_PATH = $(wildcard ../../../runtime)
VX_SIMX_PATH = $(wildcard ../../../simX/obj_dir)
RISCV_TOOL_PATH=$(wildcard ~/dev/riscv-gnu-toolchain/drops)
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
POCL_CC_PATH=$(wildcard ~/dev/pocl/drops_riscv_cc)
POCL_RT_PATH=$(wildcard ~/dev/pocl/drops_riscv_rt)
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
VX_RT_PATH=$(wildcard ../../../runtime)
VX_SIMX_PATH=$(wildcard ../../../simX/obj_dir)
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
CC=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX=$(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
NEWLIB_PATH=$(RISCV_TOOL_PATH)/riscv32-unknown-elf/lib
VX_NEWLIB = $(VX_RT_PATH)/newlib/newlib.c
VX_STR = $(VX_RT_PATH)/startup/vx_start.s
VX_INT = $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_IO = $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_FIO = $(VX_RT_PATH)/fileio/fileio.s
VX_API = $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS = $(VX_STR) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
CXXFLAGS = -g -O0 -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld -march=rv32im -mabi=ilp32
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH)
LIBS = -lOpenCL
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a
PROJECT=kmeans
PROJECT=saxpy
all: $(PROJECT).dump $(PROJECT).hex
lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
$(PROJECT).elf: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc rmse.c read_input.c cluster.c kmeans_clustering.c -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf
kmeans_clustering.o: kmeans_clustering.c
$(CC) $(CXXFLAGS) -c kmeans_clustering.c
cluster.o: cluster.c
$(CC) $(CXXFLAGS) -c cluster.c
read_input.o: read_input.c
$(CC) $(CXXFLAGS) -c read_input.c
rmse.o: rmse.c
$(CC) $(CXXFLAGS) -c rmse.c
$(PROJECT).elf: main.cc lib$(PROJECT).a read_input.o rmse.o cluster.o kmeans_clustering.o
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc read_input.o rmse.o cluster.o kmeans_clustering.o $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: main.cc lib$(PROJECT).a read_input.o rmse.o cluster.o kmeans_clustering.o
$(CXX) $(CXXFLAGS) main.cc read_input.o rmse.o cluster.o kmeans_clustering.o $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
@@ -46,8 +62,17 @@ $(PROJECT).hex: $(PROJECT).elf
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run:
$(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -strace -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean:
rm -rf *.elf *.dump *.hex *.a *.pocl
rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu

View File

@@ -63,4 +63,4 @@ gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean:
rm -rf *.elf *.dump *.hex *.a *.pocl *.qemu
rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu

View File

@@ -63,4 +63,4 @@ gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean:
rm -rf *.elf *.dump *.hex *.a *.pocl *.qemu
rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu

View File

@@ -63,4 +63,4 @@ gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean:
rm -rf *.elf *.dump *.hex
rm -rf *.elf *.dump *.hex

View File

@@ -63,4 +63,4 @@ gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean:
rm -rf *.elf *.dump *.hex *.a *.pocl *.qemu
rm -rf *.o *.elf *.dump *.hex *.a *.pocl *.qemu

View File

@@ -34,8 +34,8 @@ HEX: ELF
$(CPY) -O ihex $(VX_MAIN).elf $(VX_MAIN).hex
ELF:
$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC1) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
# $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC2) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
# $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC1) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC2) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
# $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC3) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
# $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC4) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
# $(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC5) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf~

View File

@@ -6,14 +6,17 @@
int main()
{
vx_tmc(1);
int n = 5;
int scalar = 10;
int *a = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
int *b = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
int *c = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 2; c[i] = 5; }
#if 1
#if 0
//---------------------------------------------------------------
/* vvaddint32
* # vector-vector add routine of 32-bit integers
@@ -43,7 +46,6 @@ int main()
/* # vector-scalar add
# for (i=0; i<N; i++) { C[i] = A[i] + B; } // 32-bit ints */
for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 1;}
int scalar = 10;
printf("vsadd...scalar:%d\na[%d]: ", scalar, n);
for(int i = 0; i < n; ++i) printf("%d \n", a[i]);
printf("\nb: %d", scalar);
@@ -78,10 +80,12 @@ int main()
if(a[i] != b[i])
{
printf("\n<memcpy> failed at <index: %d>! \n", i);
return;
return 1;
}
}
printf("\nPASSED.......................... <memcpy> \n");
#endif
#if 1
//---------------------------------------------------------------
/* # void saxpy(size_t n, const float a, const float *x, float *y)
# ==> convert to int!!
@@ -99,16 +103,22 @@ int main()
vx_vec_saxpy(n, scalar, a, b);
printf("saxpy\na[%d]: ", n);
for(int i = 0; i < n; ++i) printf("%d \n", a[i]);
printf("\nb[%d]: ", n);
for(int i = 0; i < n; ++i) printf("%d \n", b[i]);
for(int i = 0; i < n; ++i)
{
if(b[i] != ((a[i] * scalar) + c[i]))
{
printf("\n<saxpy> failed at <index: %d>! \n", i);
return;
return 1;
}
}
printf("\nPASSED.......................... <saxpy> \n");
#endif
#if 0
//---------------------------------------------------------------
/* # void sgemm_nn(size_t n, size_t m, size_t k, const float*a, // m * k matrix
# size_t lda, const float*b, // k * n matrix

View File

@@ -5,10 +5,10 @@
extern "C" {
#endif
void vx_vec_vvaddint32(int n, int* a, int* b, int *c);
//void vx_vec_vvaddint32(int n, int* a, int* b, int *c);
//void vx_vec_vsadd(int n, int* a, int scalar);
//void vx_vec_memcpy(int* a, int* b, int n);
//void vx_vec_saxpy(int n, int scalar, int* a, int* b);
void vx_vec_saxpy(int n, int scalar, int* a, int* b);
//void vx_vec_sgemm_nn(int n, int m, int k, int* a1, int lda, int* b1, int ldb, int* c1, int ldc);
#ifdef __cplusplus

View File

@@ -13,16 +13,31 @@
# fa0 a
# a1 x
# a2 y
#vx_vec_saxpy:
# vsetvli a4, a0, e32, m8
#saxpy:
# vlw.v v0, (a1)
# sub a0, a0, a4
# slli a4, a4, 2
# add a1, a1, a4
# vlw.v v8, (a2)
# vfmacc.vf v8, fa0, v0
# vsw.v v8, (a2)
# add a2, a2, a4
# bnez a0, saxpy
# ret
# a0 n, rs1 a, a2 x, a3 y
vx_vec_saxpy:
vsetvli a4, a0, e32, m8
saxpy:
vlw.v v0, (a1)
vlw.v v0, (a2)
sub a0, a0, a4
slli a4, a4, 2
add a1, a1, a4
vlw.v v8, (a2)
vfmacc.vf v8, fa0, v0
vsw.v v8, (a2)
add a2, a2, a4
vlw.v v1, (a3)
vmacc.vx v1, rs1, v0
vsw.v v1, (a3)
add a3, a3, a4
bnez a0, saxpy
ret
ret