project directories reorganization
This commit is contained in:
816
benchmarks/old_opencl/bfs/CLHelper.h
Executable file
816
benchmarks/old_opencl/bfs/CLHelper.h
Executable file
@@ -0,0 +1,816 @@
|
||||
//------------------------------------------
|
||||
//--cambine:helper function for OpenCL
|
||||
//--programmer: Jianbin Fang
|
||||
//--date: 27/12/2010
|
||||
//------------------------------------------
|
||||
#ifndef _CL_HELPER_
|
||||
#define _CL_HELPER_
|
||||
|
||||
#include <CL/cl.h>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
|
||||
using std::string;
|
||||
using std::ifstream;
|
||||
using std::cerr;
|
||||
using std::endl;
|
||||
using std::cout;
|
||||
//#pragma OPENCL EXTENSION cl_nv_compiler_options:enable
|
||||
#define WORK_DIM 2 // work-items dimensions
|
||||
|
||||
struct oclHandleStruct {
|
||||
cl_context context;
|
||||
cl_device_id *devices;
|
||||
cl_command_queue queue;
|
||||
cl_program program;
|
||||
cl_int cl_status;
|
||||
std::string error_str;
|
||||
std::vector<cl_kernel> kernel;
|
||||
};
|
||||
|
||||
struct oclHandleStruct oclHandles;
|
||||
|
||||
char kernel_file[100] = "Kernels.cl";
|
||||
int total_kernels = 2;
|
||||
string kernel_names[2] = {"BFS_1", "BFS_2"};
|
||||
int work_group_size = 512;
|
||||
int device_id_inused = 0; // deviced id used (default : 0)
|
||||
|
||||
/*
|
||||
* Converts the contents of a file into a string
|
||||
*/
|
||||
string FileToString(const string fileName) {
|
||||
ifstream f(fileName.c_str(), ifstream::in | ifstream::binary);
|
||||
|
||||
try {
|
||||
size_t size;
|
||||
char *str;
|
||||
string s;
|
||||
|
||||
if (f.is_open()) {
|
||||
size_t fileSize;
|
||||
f.seekg(0, ifstream::end);
|
||||
size = fileSize = f.tellg();
|
||||
f.seekg(0, ifstream::beg);
|
||||
|
||||
str = new char[size + 1];
|
||||
if (!str)
|
||||
throw(string("Could not allocate memory"));
|
||||
|
||||
f.read(str, fileSize);
|
||||
f.close();
|
||||
str[size] = '\0';
|
||||
|
||||
s = str;
|
||||
delete[] str;
|
||||
return s;
|
||||
}
|
||||
} catch (std::string msg) {
|
||||
cerr << "Exception caught in FileToString(): " << msg << endl;
|
||||
if (f.is_open())
|
||||
f.close();
|
||||
} catch (...) {
|
||||
cerr << "Exception caught in FileToString()" << endl;
|
||||
if (f.is_open())
|
||||
f.close();
|
||||
}
|
||||
string errorMsg = "FileToString()::Error: Unable to open file " + fileName;
|
||||
throw(errorMsg);
|
||||
}
|
||||
//---------------------------------------
|
||||
// Read command line parameters
|
||||
//
|
||||
void _clCmdParams(int argc, char *argv[]) {
|
||||
for (int i = 0; i < argc; ++i) {
|
||||
switch (argv[i][1]) {
|
||||
case 'g': //--g stands for size of work group
|
||||
if (++i < argc) {
|
||||
sscanf(argv[i], "%u", &work_group_size);
|
||||
} else {
|
||||
std::cerr << "Could not read argument after option " << argv[i - 1]
|
||||
<< std::endl;
|
||||
throw;
|
||||
}
|
||||
break;
|
||||
case 'd': //--d stands for device id used in computaion
|
||||
if (++i < argc) {
|
||||
sscanf(argv[i], "%u", &device_id_inused);
|
||||
} else {
|
||||
std::cerr << "Could not read argument after option " << argv[i - 1]
|
||||
<< std::endl;
|
||||
throw;
|
||||
}
|
||||
break;
|
||||
default:;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------
|
||||
// Initlize CL objects
|
||||
//--description: there are 5 steps to initialize all the OpenCL objects needed
|
||||
//--revised on 04/01/2011: get the number of devices and
|
||||
// devices have no relationship with context
|
||||
void _clInit() {
|
||||
printf("_clInit()\n");
|
||||
|
||||
int DEVICE_ID_INUSED = device_id_inused;
|
||||
cl_int resultCL;
|
||||
|
||||
oclHandles.context = NULL;
|
||||
oclHandles.devices = NULL;
|
||||
oclHandles.queue = NULL;
|
||||
oclHandles.program = NULL;
|
||||
|
||||
cl_uint deviceListSize;
|
||||
|
||||
//-----------------------------------------------
|
||||
//--cambine-1: find the available platforms and select one
|
||||
|
||||
cl_uint numPlatforms = 1;
|
||||
cl_platform_id targetPlatform = NULL;
|
||||
|
||||
cl_platform_id *allPlatforms =
|
||||
(cl_platform_id *)malloc(numPlatforms * sizeof(cl_platform_id));
|
||||
|
||||
resultCL = clGetPlatformIDs(numPlatforms, allPlatforms, NULL);
|
||||
if (resultCL != CL_SUCCESS)
|
||||
throw(string("InitCL()::Error: Getting platform ids (clGetPlatformIDs)"));
|
||||
|
||||
// Select the target platform. Default: first platform
|
||||
targetPlatform = allPlatforms[0];
|
||||
|
||||
/*for (int i = 0; i < numPlatforms; i++)
|
||||
{
|
||||
char pbuff[128];
|
||||
resultCL = clGetPlatformInfo( allPlatforms[i],
|
||||
CL_PLATFORM_VENDOR,
|
||||
sizeof(pbuff),
|
||||
pbuff,
|
||||
NULL);
|
||||
if (resultCL != CL_SUCCESS)
|
||||
throw (string("InitCL()::Error: Getting platform info (clGetPlatformInfo)"));
|
||||
|
||||
//printf("vedor is %s\n",pbuff);
|
||||
|
||||
}
|
||||
free(allPlatforms);*/
|
||||
|
||||
//-----------------------------------------------
|
||||
//--cambine-2: create an OpenCL context
|
||||
/*cl_context_properties cprops[3] = { CL_CONTEXT_PLATFORM,
|
||||
(cl_context_properties)targetPlatform, 0 };
|
||||
oclHandles.context = clCreateContextFromType(cprops,
|
||||
CL_DEVICE_TYPE_GPU,
|
||||
NULL,
|
||||
NULL,
|
||||
&resultCL);
|
||||
|
||||
if ((resultCL != CL_SUCCESS) || (oclHandles.context == NULL))
|
||||
throw (string("InitCL()::Error: Creating Context
|
||||
(clCreateContextFromType)"));
|
||||
|
||||
//-----------------------------------------------
|
||||
//--cambine-3: detect OpenCL devices
|
||||
// First, get the size of device list
|
||||
oclHandles.cl_status = clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_GPU, 0,
|
||||
NULL, &deviceListSize);
|
||||
if(oclHandles.cl_status!=CL_SUCCESS){
|
||||
throw(string("exception in _clInit -> clGetDeviceIDs"));
|
||||
}
|
||||
if (deviceListSize == 0)
|
||||
throw(string("InitCL()::Error: No devices found."));
|
||||
|
||||
printf("OK1()\n");
|
||||
|
||||
//std::cout<<"device number:"<<deviceListSize<<std::endl;*/
|
||||
|
||||
// Now, allocate the device list
|
||||
deviceListSize = 1;
|
||||
oclHandles.devices =
|
||||
(cl_device_id *)malloc(deviceListSize * sizeof(cl_device_id));
|
||||
if (oclHandles.devices == 0)
|
||||
throw(string("InitCL()::Error: Could not allocate memory."));
|
||||
|
||||
//* Next, get the device list data
|
||||
oclHandles.cl_status =
|
||||
clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_DEFAULT, deviceListSize,
|
||||
oclHandles.devices, NULL);
|
||||
if (oclHandles.cl_status != CL_SUCCESS) {
|
||||
throw(string("exception in _clInit -> clGetDeviceIDs-2"));
|
||||
}
|
||||
|
||||
oclHandles.context = clCreateContext(NULL, deviceListSize, oclHandles.devices,
|
||||
NULL, NULL, &resultCL);
|
||||
if ((resultCL != CL_SUCCESS) || (oclHandles.context == NULL))
|
||||
throw(string("InitCL()::Error: Creating Context (clCreateContext)"));
|
||||
|
||||
//-----------------------------------------------
|
||||
//--cambine-4: Create an OpenCL command queue
|
||||
oclHandles.queue = clCreateCommandQueue(
|
||||
oclHandles.context, oclHandles.devices[DEVICE_ID_INUSED], 0, &resultCL);
|
||||
printf("resultCL=%d, queue=0x%x\n", resultCL, oclHandles.queue);
|
||||
|
||||
if ((resultCL != CL_SUCCESS) || (oclHandles.queue == NULL))
|
||||
throw(string("InitCL()::Creating Command Queue. (clCreateCommandQueue)"));
|
||||
//-----------------------------------------------
|
||||
//--cambine-5: Load CL file, build CL program object, create CL kernel object
|
||||
/*std::string source_str = FileToString(kernel_file);
|
||||
const char * source = source_str.c_str();
|
||||
size_t sourceSize[] = { source_str.length() };*/
|
||||
|
||||
oclHandles.program = clCreateProgramWithBuiltInKernels(
|
||||
oclHandles.context, 1, &oclHandles.devices[DEVICE_ID_INUSED],
|
||||
"BFS_1;BFS_2", &resultCL);
|
||||
/*oclHandles.program = clCreateProgramWithSource(oclHandles.context,
|
||||
1,
|
||||
&source,
|
||||
sourceSize,
|
||||
&resultCL);*/
|
||||
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL))
|
||||
throw(string("InitCL()::Error: Loading Binary into cl_program. "
|
||||
"(clCreateProgramWithBinary)"));
|
||||
|
||||
// insert debug information
|
||||
// std::string options= "-cl-nv-verbose"; //Doesn't work on AMD machines
|
||||
// options += " -cl-nv-opt-level=3";
|
||||
resultCL = clBuildProgram(oclHandles.program, deviceListSize,
|
||||
oclHandles.devices, NULL, NULL, NULL);
|
||||
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL)) {
|
||||
cerr << "InitCL()::Error: In clBuildProgram" << endl;
|
||||
|
||||
size_t length;
|
||||
resultCL = clGetProgramBuildInfo(oclHandles.program,
|
||||
oclHandles.devices[DEVICE_ID_INUSED],
|
||||
CL_PROGRAM_BUILD_LOG, 0, NULL, &length);
|
||||
if (resultCL != CL_SUCCESS)
|
||||
throw(string("InitCL()::Error: Getting Program build "
|
||||
"info(clGetProgramBuildInfo)"));
|
||||
|
||||
char *buffer = (char *)malloc(length);
|
||||
resultCL = clGetProgramBuildInfo(
|
||||
oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED],
|
||||
CL_PROGRAM_BUILD_LOG, length, buffer, NULL);
|
||||
if (resultCL != CL_SUCCESS)
|
||||
throw(string("InitCL()::Error: Getting Program build "
|
||||
"info(clGetProgramBuildInfo)"));
|
||||
|
||||
cerr << buffer << endl;
|
||||
free(buffer);
|
||||
|
||||
throw(string("InitCL()::Error: Building Program (clBuildProgram)"));
|
||||
}
|
||||
|
||||
// get program information in intermediate representation
|
||||
#ifdef PTX_MSG
|
||||
size_t binary_sizes[deviceListSize];
|
||||
char *binaries[deviceListSize];
|
||||
// figure out number of devices and the sizes of the binary for each device.
|
||||
oclHandles.cl_status =
|
||||
clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARY_SIZES,
|
||||
sizeof(size_t) * deviceListSize, &binary_sizes, NULL);
|
||||
if (oclHandles.cl_status != CL_SUCCESS) {
|
||||
throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-2"));
|
||||
}
|
||||
|
||||
std::cout << "--cambine:" << binary_sizes << std::endl;
|
||||
// copy over all of the generated binaries.
|
||||
for (int i = 0; i < deviceListSize; i++)
|
||||
binaries[i] = (char *)malloc(sizeof(char) * (binary_sizes[i] + 1));
|
||||
oclHandles.cl_status =
|
||||
clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARIES,
|
||||
sizeof(char *) * deviceListSize, binaries, NULL);
|
||||
if (oclHandles.cl_status != CL_SUCCESS) {
|
||||
throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-3"));
|
||||
}
|
||||
for (int i = 0; i < deviceListSize; i++)
|
||||
binaries[i][binary_sizes[i]] = '\0';
|
||||
std::cout << "--cambine:writing ptd information..." << std::endl;
|
||||
FILE *ptx_file = fopen("cl.ptx", "w");
|
||||
if (ptx_file == NULL) {
|
||||
throw(string("exceptions in allocate ptx file."));
|
||||
}
|
||||
fprintf(ptx_file, "%s", binaries[DEVICE_ID_INUSED]);
|
||||
fclose(ptx_file);
|
||||
std::cout << "--cambine:writing ptd information done." << std::endl;
|
||||
for (int i = 0; i < deviceListSize; i++)
|
||||
free(binaries[i]);
|
||||
#endif
|
||||
|
||||
for (int nKernel = 0; nKernel < total_kernels; nKernel++) {
|
||||
/* get a kernel object handle for a kernel with the given name */
|
||||
cl_kernel kernel = clCreateKernel(
|
||||
oclHandles.program, (kernel_names[nKernel]).c_str(), &resultCL);
|
||||
|
||||
if ((resultCL != CL_SUCCESS) || (kernel == NULL)) {
|
||||
string errorMsg = "InitCL()::Error: Creating Kernel (clCreateKernel) \"" +
|
||||
kernel_names[nKernel] + "\"";
|
||||
throw(errorMsg);
|
||||
}
|
||||
|
||||
oclHandles.kernel.push_back(kernel);
|
||||
}
|
||||
// get resource alocation information
|
||||
#ifdef RES_MSG
|
||||
char *build_log;
|
||||
size_t ret_val_size;
|
||||
oclHandles.cl_status = clGetProgramBuildInfo(
|
||||
oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED],
|
||||
CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
|
||||
if (oclHandles.cl_status != CL_SUCCESS) {
|
||||
throw(string("exceptions in _InitCL -> getting resource information"));
|
||||
}
|
||||
|
||||
build_log = (char *)malloc(ret_val_size + 1);
|
||||
oclHandles.cl_status = clGetProgramBuildInfo(
|
||||
oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED],
|
||||
CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
|
||||
if (oclHandles.cl_status != CL_SUCCESS) {
|
||||
throw(string(
|
||||
"exceptions in _InitCL -> getting resources allocation information-2"));
|
||||
}
|
||||
build_log[ret_val_size] = '\0';
|
||||
std::cout << "--cambine:" << build_log << std::endl;
|
||||
free(build_log);
|
||||
#endif
|
||||
}
|
||||
|
||||
//---------------------------------------
|
||||
// release CL objects
|
||||
void _clRelease() {
|
||||
char errorFlag = false;
|
||||
|
||||
for (int nKernel = 0; nKernel < oclHandles.kernel.size(); nKernel++) {
|
||||
if (oclHandles.kernel[nKernel] != NULL) {
|
||||
cl_int resultCL = clReleaseKernel(oclHandles.kernel[nKernel]);
|
||||
if (resultCL != CL_SUCCESS) {
|
||||
cerr << "ReleaseCL()::Error: In clReleaseKernel" << endl;
|
||||
errorFlag = true;
|
||||
}
|
||||
oclHandles.kernel[nKernel] = NULL;
|
||||
}
|
||||
oclHandles.kernel.clear();
|
||||
}
|
||||
|
||||
if (oclHandles.program != NULL) {
|
||||
cl_int resultCL = clReleaseProgram(oclHandles.program);
|
||||
if (resultCL != CL_SUCCESS) {
|
||||
cerr << "ReleaseCL()::Error: In clReleaseProgram" << endl;
|
||||
errorFlag = true;
|
||||
}
|
||||
oclHandles.program = NULL;
|
||||
}
|
||||
|
||||
if (oclHandles.queue != NULL) {
|
||||
cl_int resultCL = clReleaseCommandQueue(oclHandles.queue);
|
||||
if (resultCL != CL_SUCCESS) {
|
||||
cerr << "ReleaseCL()::Error: In clReleaseCommandQueue" << endl;
|
||||
errorFlag = true;
|
||||
}
|
||||
oclHandles.queue = NULL;
|
||||
}
|
||||
|
||||
free(oclHandles.devices);
|
||||
|
||||
if (oclHandles.context != NULL) {
|
||||
cl_int resultCL = clReleaseContext(oclHandles.context);
|
||||
if (resultCL != CL_SUCCESS) {
|
||||
cerr << "ReleaseCL()::Error: In clReleaseContext" << endl;
|
||||
errorFlag = true;
|
||||
}
|
||||
oclHandles.context = NULL;
|
||||
}
|
||||
|
||||
if (errorFlag)
|
||||
throw(string("ReleaseCL()::Error encountered."));
|
||||
}
|
||||
//--------------------------------------------------------
|
||||
//--cambine:create buffer and then copy data from host to device
|
||||
cl_mem _clCreateAndCpyMem(int size, void *h_mem_source) throw(string) {
|
||||
cl_mem d_mem;
|
||||
d_mem = clCreateBuffer(oclHandles.context,
|
||||
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, size,
|
||||
h_mem_source, &oclHandles.cl_status);
|
||||
#ifdef ERRMSG
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(string("excpetion in _clCreateAndCpyMem()"));
|
||||
#endif
|
||||
return d_mem;
|
||||
}
|
||||
//-------------------------------------------------------
|
||||
//--cambine: create read only buffer for devices
|
||||
//--date: 17/01/2011
|
||||
cl_mem _clMallocRW(int size, void *h_mem_ptr) throw(string) {
|
||||
cl_mem d_mem;
|
||||
d_mem = clCreateBuffer(oclHandles.context,
|
||||
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size,
|
||||
h_mem_ptr, &oclHandles.cl_status);
|
||||
#ifdef ERRMSG
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(string("excpetion in _clMallocRW"));
|
||||
#endif
|
||||
return d_mem;
|
||||
}
|
||||
//-------------------------------------------------------
|
||||
//--cambine: create read and write buffer for devices
|
||||
//--date: 17/01/2011
|
||||
cl_mem _clMalloc(int size, void *h_mem_ptr) throw(string) {
|
||||
cl_mem d_mem;
|
||||
d_mem = clCreateBuffer(oclHandles.context,
|
||||
CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, size,
|
||||
h_mem_ptr, &oclHandles.cl_status);
|
||||
#ifdef ERRMSG
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(string("excpetion in _clMalloc"));
|
||||
#endif
|
||||
return d_mem;
|
||||
}
|
||||
|
||||
//-------------------------------------------------------
|
||||
//--cambine: transfer data from host to device
|
||||
//--date: 17/01/2011
|
||||
void _clMemcpyH2D(cl_mem d_mem, int size, const void *h_mem_ptr) throw(string) {
|
||||
oclHandles.cl_status = clEnqueueWriteBuffer(
|
||||
oclHandles.queue, d_mem, CL_TRUE, 0, size, h_mem_ptr, 0, NULL, NULL);
|
||||
#ifdef ERRMSG
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(string("excpetion in _clMemcpyH2D"));
|
||||
#endif
|
||||
}
|
||||
//--------------------------------------------------------
|
||||
//--cambine:create buffer and then copy data from host to device with pinned
|
||||
// memory
|
||||
cl_mem _clCreateAndCpyPinnedMem(int size, float *h_mem_source) throw(string) {
|
||||
cl_mem d_mem, d_mem_pinned;
|
||||
float *h_mem_pinned = NULL;
|
||||
d_mem_pinned = clCreateBuffer(oclHandles.context,
|
||||
CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size,
|
||||
NULL, &oclHandles.cl_status);
|
||||
#ifdef ERRMSG
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(string("excpetion in _clCreateAndCpyMem()->d_mem_pinned"));
|
||||
#endif
|
||||
//------------
|
||||
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_READ_ONLY, size, NULL,
|
||||
&oclHandles.cl_status);
|
||||
#ifdef ERRMSG
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(string("excpetion in _clCreateAndCpyMem() -> d_mem "));
|
||||
#endif
|
||||
//----------
|
||||
h_mem_pinned = (cl_float *)clEnqueueMapBuffer(
|
||||
oclHandles.queue, d_mem_pinned, CL_TRUE, CL_MAP_WRITE, 0, size, 0, NULL,
|
||||
NULL, &oclHandles.cl_status);
|
||||
#ifdef ERRMSG
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueMapBuffer"));
|
||||
#endif
|
||||
int element_number = size / sizeof(float);
|
||||
#pragma omp parallel for
|
||||
for (int i = 0; i < element_number; i++) {
|
||||
h_mem_pinned[i] = h_mem_source[i];
|
||||
}
|
||||
//----------
|
||||
oclHandles.cl_status = clEnqueueWriteBuffer(
|
||||
oclHandles.queue, d_mem, CL_TRUE, 0, size, h_mem_pinned, 0, NULL, NULL);
|
||||
#ifdef ERRMSG
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueWriteBuffer"));
|
||||
#endif
|
||||
|
||||
return d_mem;
|
||||
}
|
||||
|
||||
//--------------------------------------------------------
|
||||
//--cambine:create write only buffer on device
|
||||
cl_mem _clMallocWO(int size) throw(string) {
|
||||
cl_mem d_mem;
|
||||
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_WRITE_ONLY, size, 0,
|
||||
&oclHandles.cl_status);
|
||||
#ifdef ERRMSG
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(string("excpetion in _clCreateMem()"));
|
||||
#endif
|
||||
return d_mem;
|
||||
}
|
||||
|
||||
//--------------------------------------------------------
|
||||
// transfer data from device to host
|
||||
void _clMemcpyD2H(cl_mem d_mem, int size, void *h_mem) throw(string) {
|
||||
oclHandles.cl_status = clEnqueueReadBuffer(oclHandles.queue, d_mem, CL_TRUE,
|
||||
0, size, h_mem, 0, 0, 0);
|
||||
#ifdef ERRMSG
|
||||
oclHandles.error_str = "excpetion in _clCpyMemD2H -> ";
|
||||
switch (oclHandles.cl_status) {
|
||||
case CL_INVALID_COMMAND_QUEUE:
|
||||
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
|
||||
break;
|
||||
case CL_INVALID_CONTEXT:
|
||||
oclHandles.error_str += "CL_INVALID_CONTEXT";
|
||||
break;
|
||||
case CL_INVALID_MEM_OBJECT:
|
||||
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
|
||||
break;
|
||||
case CL_INVALID_VALUE:
|
||||
oclHandles.error_str += "CL_INVALID_VALUE";
|
||||
break;
|
||||
case CL_INVALID_EVENT_WAIT_LIST:
|
||||
oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
|
||||
break;
|
||||
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
|
||||
oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
|
||||
break;
|
||||
case CL_OUT_OF_HOST_MEMORY:
|
||||
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||
break;
|
||||
default:
|
||||
oclHandles.error_str += "Unknown reason";
|
||||
break;
|
||||
}
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(oclHandles.error_str);
|
||||
#endif
|
||||
}
|
||||
|
||||
//--------------------------------------------------------
|
||||
// set kernel arguments
|
||||
void _clSetArgs(int kernel_id, int arg_idx, void *d_mem,
|
||||
int size = 0) throw(string) {
|
||||
if (!size) {
|
||||
oclHandles.cl_status = clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx,
|
||||
sizeof(d_mem), &d_mem);
|
||||
#ifdef ERRMSG
|
||||
oclHandles.error_str = "excpetion in _clSetKernelArg() ";
|
||||
switch (oclHandles.cl_status) {
|
||||
case CL_INVALID_KERNEL:
|
||||
oclHandles.error_str += "CL_INVALID_KERNEL";
|
||||
break;
|
||||
case CL_INVALID_ARG_INDEX:
|
||||
oclHandles.error_str += "CL_INVALID_ARG_INDEX";
|
||||
break;
|
||||
case CL_INVALID_ARG_VALUE:
|
||||
oclHandles.error_str += "CL_INVALID_ARG_VALUE";
|
||||
break;
|
||||
case CL_INVALID_MEM_OBJECT:
|
||||
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
|
||||
break;
|
||||
case CL_INVALID_SAMPLER:
|
||||
oclHandles.error_str += "CL_INVALID_SAMPLER";
|
||||
break;
|
||||
case CL_INVALID_ARG_SIZE:
|
||||
oclHandles.error_str += "CL_INVALID_ARG_SIZE";
|
||||
break;
|
||||
case CL_OUT_OF_RESOURCES:
|
||||
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
|
||||
break;
|
||||
case CL_OUT_OF_HOST_MEMORY:
|
||||
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||
break;
|
||||
default:
|
||||
oclHandles.error_str += "Unknown reason";
|
||||
break;
|
||||
}
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(oclHandles.error_str);
|
||||
#endif
|
||||
} else {
|
||||
oclHandles.cl_status =
|
||||
clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx, size, d_mem);
|
||||
#ifdef ERRMSG
|
||||
oclHandles.error_str = "excpetion in _clSetKernelArg() ";
|
||||
switch (oclHandles.cl_status) {
|
||||
case CL_INVALID_KERNEL:
|
||||
oclHandles.error_str += "CL_INVALID_KERNEL";
|
||||
break;
|
||||
case CL_INVALID_ARG_INDEX:
|
||||
oclHandles.error_str += "CL_INVALID_ARG_INDEX";
|
||||
break;
|
||||
case CL_INVALID_ARG_VALUE:
|
||||
oclHandles.error_str += "CL_INVALID_ARG_VALUE";
|
||||
break;
|
||||
case CL_INVALID_MEM_OBJECT:
|
||||
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
|
||||
break;
|
||||
case CL_INVALID_SAMPLER:
|
||||
oclHandles.error_str += "CL_INVALID_SAMPLER";
|
||||
break;
|
||||
case CL_INVALID_ARG_SIZE:
|
||||
oclHandles.error_str += "CL_INVALID_ARG_SIZE";
|
||||
break;
|
||||
case CL_OUT_OF_RESOURCES:
|
||||
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
|
||||
break;
|
||||
case CL_OUT_OF_HOST_MEMORY:
|
||||
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||
break;
|
||||
default:
|
||||
oclHandles.error_str += "Unknown reason";
|
||||
break;
|
||||
}
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(oclHandles.error_str);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
void _clFinish() throw(string) {
|
||||
oclHandles.cl_status = clFinish(oclHandles.queue);
|
||||
#ifdef ERRMSG
|
||||
oclHandles.error_str = "excpetion in _clFinish";
|
||||
switch (oclHandles.cl_status) {
|
||||
case CL_INVALID_COMMAND_QUEUE:
|
||||
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
|
||||
break;
|
||||
case CL_OUT_OF_RESOURCES:
|
||||
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
|
||||
break;
|
||||
case CL_OUT_OF_HOST_MEMORY:
|
||||
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||
break;
|
||||
default:
|
||||
oclHandles.error_str += "Unknown reasons";
|
||||
break;
|
||||
}
|
||||
if (oclHandles.cl_status != CL_SUCCESS) {
|
||||
throw(oclHandles.error_str);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
//--------------------------------------------------------
|
||||
//--cambine:enqueue kernel
|
||||
void _clInvokeKernel(int kernel_id, int work_items,
|
||||
int work_group_size) throw(string) {
|
||||
cl_uint work_dim = WORK_DIM;
|
||||
cl_event e[1];
|
||||
if (work_items % work_group_size != 0) // process situations that work_items
|
||||
// cannot be divided by work_group_size
|
||||
work_items =
|
||||
work_items + (work_group_size - (work_items % work_group_size));
|
||||
size_t local_work_size[] = {work_group_size, 1};
|
||||
size_t global_work_size[] = {work_items, 1};
|
||||
oclHandles.cl_status = clEnqueueNDRangeKernel(
|
||||
oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0,
|
||||
global_work_size, local_work_size, 0, 0, &(e[0]));
|
||||
#ifdef ERRMSG
|
||||
oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
|
||||
switch (oclHandles.cl_status) {
|
||||
case CL_INVALID_PROGRAM_EXECUTABLE:
|
||||
oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE";
|
||||
break;
|
||||
case CL_INVALID_COMMAND_QUEUE:
|
||||
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
|
||||
break;
|
||||
case CL_INVALID_KERNEL:
|
||||
oclHandles.error_str += "CL_INVALID_KERNEL";
|
||||
break;
|
||||
case CL_INVALID_CONTEXT:
|
||||
oclHandles.error_str += "CL_INVALID_CONTEXT";
|
||||
break;
|
||||
case CL_INVALID_KERNEL_ARGS:
|
||||
oclHandles.error_str += "CL_INVALID_KERNEL_ARGS";
|
||||
break;
|
||||
case CL_INVALID_WORK_DIMENSION:
|
||||
oclHandles.error_str += "CL_INVALID_WORK_DIMENSION";
|
||||
break;
|
||||
case CL_INVALID_GLOBAL_WORK_SIZE:
|
||||
oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE";
|
||||
break;
|
||||
case CL_INVALID_WORK_GROUP_SIZE:
|
||||
oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE";
|
||||
break;
|
||||
case CL_INVALID_WORK_ITEM_SIZE:
|
||||
oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE";
|
||||
break;
|
||||
case CL_INVALID_GLOBAL_OFFSET:
|
||||
oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET";
|
||||
break;
|
||||
case CL_OUT_OF_RESOURCES:
|
||||
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
|
||||
break;
|
||||
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
|
||||
oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
|
||||
break;
|
||||
case CL_INVALID_EVENT_WAIT_LIST:
|
||||
oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
|
||||
break;
|
||||
case CL_OUT_OF_HOST_MEMORY:
|
||||
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||
break;
|
||||
default:
|
||||
oclHandles.error_str += "Unkown reseason";
|
||||
break;
|
||||
}
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(oclHandles.error_str);
|
||||
#endif
|
||||
//_clFinish();
|
||||
// oclHandles.cl_status = clWaitForEvents(1, &e[0]);
|
||||
// #ifdef ERRMSG
|
||||
// if (oclHandles.cl_status!= CL_SUCCESS)
|
||||
// throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents"));
|
||||
// #endif
|
||||
}
|
||||
void _clInvokeKernel2D(int kernel_id, int range_x, int range_y, int group_x,
|
||||
int group_y) throw(string) {
|
||||
cl_uint work_dim = WORK_DIM;
|
||||
size_t local_work_size[] = {group_x, group_y};
|
||||
size_t global_work_size[] = {range_x, range_y};
|
||||
cl_event e[1];
|
||||
/*if(work_items%work_group_size != 0) //process situations that work_items
|
||||
cannot be divided by work_group_size
|
||||
work_items = work_items + (work_group_size-(work_items%work_group_size));*/
|
||||
oclHandles.cl_status = clEnqueueNDRangeKernel(
|
||||
oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0,
|
||||
global_work_size, local_work_size, 0, 0, &(e[0]));
|
||||
#ifdef ERRMSG
|
||||
oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
|
||||
switch (oclHandles.cl_status) {
|
||||
case CL_INVALID_PROGRAM_EXECUTABLE:
|
||||
oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE";
|
||||
break;
|
||||
case CL_INVALID_COMMAND_QUEUE:
|
||||
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
|
||||
break;
|
||||
case CL_INVALID_KERNEL:
|
||||
oclHandles.error_str += "CL_INVALID_KERNEL";
|
||||
break;
|
||||
case CL_INVALID_CONTEXT:
|
||||
oclHandles.error_str += "CL_INVALID_CONTEXT";
|
||||
break;
|
||||
case CL_INVALID_KERNEL_ARGS:
|
||||
oclHandles.error_str += "CL_INVALID_KERNEL_ARGS";
|
||||
break;
|
||||
case CL_INVALID_WORK_DIMENSION:
|
||||
oclHandles.error_str += "CL_INVALID_WORK_DIMENSION";
|
||||
break;
|
||||
case CL_INVALID_GLOBAL_WORK_SIZE:
|
||||
oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE";
|
||||
break;
|
||||
case CL_INVALID_WORK_GROUP_SIZE:
|
||||
oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE";
|
||||
break;
|
||||
case CL_INVALID_WORK_ITEM_SIZE:
|
||||
oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE";
|
||||
break;
|
||||
case CL_INVALID_GLOBAL_OFFSET:
|
||||
oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET";
|
||||
break;
|
||||
case CL_OUT_OF_RESOURCES:
|
||||
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
|
||||
break;
|
||||
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
|
||||
oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
|
||||
break;
|
||||
case CL_INVALID_EVENT_WAIT_LIST:
|
||||
oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
|
||||
break;
|
||||
case CL_OUT_OF_HOST_MEMORY:
|
||||
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||
break;
|
||||
default:
|
||||
oclHandles.error_str += "Unkown reseason";
|
||||
break;
|
||||
}
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(oclHandles.error_str);
|
||||
#endif
|
||||
//_clFinish();
|
||||
/*oclHandles.cl_status = clWaitForEvents(1, &e[0]);
|
||||
|
||||
#ifdef ERRMSG
|
||||
|
||||
if (oclHandles.cl_status!= CL_SUCCESS)
|
||||
|
||||
throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents"));
|
||||
|
||||
#endif*/
|
||||
}
|
||||
|
||||
//--------------------------------------------------------
|
||||
// release OpenCL objects
|
||||
void _clFree(cl_mem ob) throw(string) {
|
||||
if (ob != NULL)
|
||||
oclHandles.cl_status = clReleaseMemObject(ob);
|
||||
#ifdef ERRMSG
|
||||
oclHandles.error_str = "excpetion in _clFree() ->";
|
||||
switch (oclHandles.cl_status) {
|
||||
case CL_INVALID_MEM_OBJECT:
|
||||
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
|
||||
break;
|
||||
case CL_OUT_OF_RESOURCES:
|
||||
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
|
||||
break;
|
||||
case CL_OUT_OF_HOST_MEMORY:
|
||||
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||
break;
|
||||
default:
|
||||
oclHandles.error_str += "Unkown reseason";
|
||||
break;
|
||||
}
|
||||
if (oclHandles.cl_status != CL_SUCCESS)
|
||||
throw(oclHandles.error_str);
|
||||
#endif
|
||||
}
|
||||
#endif //_CL_HELPER_
|
||||
68
benchmarks/old_opencl/bfs/Makefile
Normal file
68
benchmarks/old_opencl/bfs/Makefile
Normal file
@@ -0,0 +1,68 @@
|
||||
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
|
||||
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
|
||||
POCL_INC_PATH ?= $(wildcard ../include)
|
||||
POCL_LIB_PATH ?= $(wildcard ../lib)
|
||||
VX_RT_PATH ?= $(wildcard ../../../runtime)
|
||||
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
|
||||
|
||||
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
|
||||
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
|
||||
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
|
||||
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
|
||||
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
|
||||
|
||||
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
|
||||
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
|
||||
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
|
||||
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
|
||||
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
|
||||
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
|
||||
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
|
||||
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
|
||||
|
||||
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
|
||||
|
||||
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
|
||||
CXXFLAGS += -ffreestanding # program may not begin at main()
|
||||
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
|
||||
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
|
||||
CXXFLAGS += -I$(POCL_INC_PATH)
|
||||
|
||||
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||
|
||||
PROJECT = bfs
|
||||
|
||||
SRCS = main.cc
|
||||
|
||||
all: $(PROJECT).dump $(PROJECT).hex
|
||||
|
||||
lib$(PROJECT).a: kernel.cl
|
||||
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
|
||||
|
||||
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
|
||||
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
|
||||
|
||||
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
|
||||
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
|
||||
|
||||
$(PROJECT).hex: $(PROJECT).elf
|
||||
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
|
||||
|
||||
$(PROJECT).dump: $(PROJECT).elf
|
||||
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
|
||||
|
||||
run: $(PROJECT).hex
|
||||
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
|
||||
|
||||
qemu: $(PROJECT).qemu
|
||||
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -strace -d in_asm -D debug.log $(PROJECT).qemu
|
||||
|
||||
gdb-s: $(PROJECT).qemu
|
||||
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
|
||||
|
||||
gdb-c: $(PROJECT).qemu
|
||||
$(GDB) $(PROJECT).qemu
|
||||
|
||||
clean:
|
||||
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
|
||||
0
benchmarks/old_opencl/bfs/README
Normal file
0
benchmarks/old_opencl/bfs/README
Normal file
28677
benchmarks/old_opencl/bfs/graph4096.txt
Executable file
28677
benchmarks/old_opencl/bfs/graph4096.txt
Executable file
File diff suppressed because it is too large
Load Diff
53
benchmarks/old_opencl/bfs/kernel.cl
Executable file
53
benchmarks/old_opencl/bfs/kernel.cl
Executable file
@@ -0,0 +1,53 @@
|
||||
/* ============================================================
|
||||
//--cambine: kernel funtion of Breadth-First-Search
|
||||
//--author: created by Jianbin Fang
|
||||
//--date: 06/12/2010
|
||||
============================================================ */
|
||||
|
||||
//#pragma OPENCL EXTENSION cl_khr_byte_addressable_store: enable
|
||||
|
||||
//Structure to hold a node information
|
||||
typedef struct{
|
||||
int starting;
|
||||
int no_of_edges;
|
||||
} Node;
|
||||
|
||||
//--7 parameters
|
||||
__kernel void BFS_1( const __global Node* g_graph_nodes,
|
||||
const __global int* g_graph_edges,
|
||||
__global char* g_graph_mask,
|
||||
__global char* g_updating_graph_mask,
|
||||
__global char* g_graph_visited,
|
||||
__global int* g_cost,
|
||||
const int no_of_nodes){
|
||||
int tid = get_global_id(0);
|
||||
if( tid<no_of_nodes && g_graph_mask[tid]){
|
||||
g_graph_mask[tid]=false;
|
||||
for(int i=g_graph_nodes[tid].starting; i<(g_graph_nodes[tid].no_of_edges + g_graph_nodes[tid].starting); i++){
|
||||
int id = g_graph_edges[i];
|
||||
if(!g_graph_visited[id]){
|
||||
g_cost[id]=g_cost[tid]+1;
|
||||
g_updating_graph_mask[id]=true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//--5 parameters
|
||||
__kernel void BFS_2(__global char* g_graph_mask,
|
||||
__global char* g_updating_graph_mask,
|
||||
__global char* g_graph_visited,
|
||||
__global char* g_over,
|
||||
const int no_of_nodes
|
||||
) {
|
||||
int tid = get_global_id(0);
|
||||
if( tid<no_of_nodes && g_updating_graph_mask[tid]){
|
||||
|
||||
g_graph_mask[tid]=true;
|
||||
g_graph_visited[tid]=true;
|
||||
*g_over=true;
|
||||
g_updating_graph_mask[tid]=false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
BIN
benchmarks/old_opencl/bfs/libbfs.a
Normal file
BIN
benchmarks/old_opencl/bfs/libbfs.a
Normal file
Binary file not shown.
297
benchmarks/old_opencl/bfs/main.cc
Executable file
297
benchmarks/old_opencl/bfs/main.cc
Executable file
@@ -0,0 +1,297 @@
|
||||
//--by Jianbin Fang
|
||||
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
#ifdef PROFILING
|
||||
#include "timer.h"
|
||||
#endif
|
||||
|
||||
#include "CLHelper.h"
|
||||
#include "util.h"
|
||||
|
||||
#define MAX_THREADS_PER_BLOCK 256
|
||||
|
||||
// Structure to hold a node information
|
||||
struct Node {
|
||||
int starting;
|
||||
int no_of_edges;
|
||||
};
|
||||
|
||||
//----------------------------------------------------------
|
||||
//--bfs on cpu
|
||||
//--programmer: jianbin
|
||||
//--date: 26/01/2011
|
||||
//--note: width is changed to the new_width
|
||||
//----------------------------------------------------------
|
||||
void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
|
||||
int *h_graph_edges, char *h_graph_mask,
|
||||
char *h_updating_graph_mask, char *h_graph_visited,
|
||||
int *h_cost_ref) {
|
||||
char stop;
|
||||
int k = 0;
|
||||
do {
|
||||
// if no thread changes this value then the loop stops
|
||||
stop = false;
|
||||
for (int tid = 0; tid < no_of_nodes; tid++) {
|
||||
if (h_graph_mask[tid] == true) {
|
||||
h_graph_mask[tid] = false;
|
||||
for (int i = h_graph_nodes[tid].starting;
|
||||
i < (h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting);
|
||||
i++) {
|
||||
int id =
|
||||
h_graph_edges[i]; //--cambine: node id is connected with node tid
|
||||
if (!h_graph_visited[id]) { //--cambine: if node id has not been
|
||||
//visited, enter the body below
|
||||
h_cost_ref[id] = h_cost_ref[tid] + 1;
|
||||
h_updating_graph_mask[id] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int tid = 0; tid < no_of_nodes; tid++) {
|
||||
if (h_updating_graph_mask[tid] == true) {
|
||||
h_graph_mask[tid] = true;
|
||||
h_graph_visited[tid] = true;
|
||||
stop = true;
|
||||
h_updating_graph_mask[tid] = false;
|
||||
}
|
||||
}
|
||||
k++;
|
||||
} while (stop);
|
||||
}
|
||||
//----------------------------------------------------------
|
||||
//--breadth first search on GPUs
|
||||
//----------------------------------------------------------
|
||||
void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
|
||||
int *h_graph_edges, char *h_graph_mask,
|
||||
char *h_updating_graph_mask, char *h_graph_visited,
|
||||
int *h_cost) throw(std::string) {
|
||||
|
||||
// int number_elements = height*width;
|
||||
char h_over;
|
||||
cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask,
|
||||
d_graph_visited, d_cost, d_over;
|
||||
try {
|
||||
//--1 transfer data from host to device
|
||||
_clInit();
|
||||
d_graph_nodes = _clMalloc(no_of_nodes * sizeof(Node), h_graph_nodes);
|
||||
d_graph_edges = _clMalloc(edge_list_size * sizeof(int), h_graph_edges);
|
||||
d_graph_mask = _clMallocRW(no_of_nodes * sizeof(char), h_graph_mask);
|
||||
d_updating_graph_mask =
|
||||
_clMallocRW(no_of_nodes * sizeof(char), h_updating_graph_mask);
|
||||
d_graph_visited = _clMallocRW(no_of_nodes * sizeof(char), h_graph_visited);
|
||||
|
||||
d_cost = _clMallocRW(no_of_nodes * sizeof(int), h_cost);
|
||||
d_over = _clMallocRW(sizeof(char), &h_over);
|
||||
|
||||
_clMemcpyH2D(d_graph_nodes, no_of_nodes * sizeof(Node), h_graph_nodes);
|
||||
_clMemcpyH2D(d_graph_edges, edge_list_size * sizeof(int), h_graph_edges);
|
||||
_clMemcpyH2D(d_graph_mask, no_of_nodes * sizeof(char), h_graph_mask);
|
||||
_clMemcpyH2D(d_updating_graph_mask, no_of_nodes * sizeof(char),
|
||||
h_updating_graph_mask);
|
||||
_clMemcpyH2D(d_graph_visited, no_of_nodes * sizeof(char), h_graph_visited);
|
||||
_clMemcpyH2D(d_cost, no_of_nodes * sizeof(int), h_cost);
|
||||
|
||||
//--2 invoke kernel
|
||||
#ifdef PROFILING
|
||||
timer kernel_timer;
|
||||
double kernel_time = 0.0;
|
||||
kernel_timer.reset();
|
||||
kernel_timer.start();
|
||||
#endif
|
||||
do {
|
||||
h_over = false;
|
||||
_clMemcpyH2D(d_over, sizeof(char), &h_over);
|
||||
//--kernel 0
|
||||
int kernel_id = 0;
|
||||
int kernel_idx = 0;
|
||||
_clSetArgs(kernel_id, kernel_idx++, d_graph_nodes);
|
||||
_clSetArgs(kernel_id, kernel_idx++, d_graph_edges);
|
||||
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
|
||||
_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
|
||||
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
|
||||
_clSetArgs(kernel_id, kernel_idx++, d_cost);
|
||||
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
|
||||
|
||||
// int work_items = no_of_nodes;
|
||||
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
|
||||
|
||||
//--kernel 1
|
||||
kernel_id = 1;
|
||||
kernel_idx = 0;
|
||||
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
|
||||
_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
|
||||
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
|
||||
_clSetArgs(kernel_id, kernel_idx++, d_over);
|
||||
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
|
||||
|
||||
// work_items = no_of_nodes;
|
||||
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
|
||||
|
||||
_clMemcpyD2H(d_over, sizeof(char), &h_over);
|
||||
} while (h_over);
|
||||
|
||||
_clFinish();
|
||||
#ifdef PROFILING
|
||||
kernel_timer.stop();
|
||||
kernel_time = kernel_timer.getTimeInSeconds();
|
||||
#endif
|
||||
//--3 transfer data from device to host
|
||||
_clMemcpyD2H(d_cost, no_of_nodes * sizeof(int), h_cost);
|
||||
//--statistics
|
||||
#ifdef PROFILING
|
||||
std::cout << "kernel time(s):" << kernel_time << std::endl;
|
||||
#endif
|
||||
//--4 release cl resources.
|
||||
_clFree(d_graph_nodes);
|
||||
_clFree(d_graph_edges);
|
||||
_clFree(d_graph_mask);
|
||||
_clFree(d_updating_graph_mask);
|
||||
_clFree(d_graph_visited);
|
||||
_clFree(d_cost);
|
||||
_clFree(d_over);
|
||||
_clRelease();
|
||||
} catch (std::string msg) {
|
||||
_clFree(d_graph_nodes);
|
||||
_clFree(d_graph_edges);
|
||||
_clFree(d_graph_mask);
|
||||
_clFree(d_updating_graph_mask);
|
||||
_clFree(d_graph_visited);
|
||||
_clFree(d_cost);
|
||||
_clFree(d_over);
|
||||
_clRelease();
|
||||
std::string e_str = "in run_transpose_gpu -> ";
|
||||
e_str += msg;
|
||||
throw(e_str);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------
|
||||
//--cambine: main function
|
||||
//--author: created by Jianbin Fang
|
||||
//--date: 25/01/2011
|
||||
//----------------------------------------------------------
|
||||
int main(int argc, char *argv[]) {
|
||||
printf("enter demo main\n");
|
||||
|
||||
int no_of_nodes;
|
||||
int edge_list_size;
|
||||
FILE *fp;
|
||||
Node *h_graph_nodes;
|
||||
char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited;
|
||||
|
||||
try {
|
||||
char *input_f = "graph4096.txt";
|
||||
printf("Reading File\n");
|
||||
// Read in Graph from a file
|
||||
fp = fopen(input_f, "r");
|
||||
if (!fp) {
|
||||
printf("Error Reading graph file\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
printf("Reading File completed!\n");
|
||||
|
||||
int source = 0;
|
||||
|
||||
fscanf(fp, "%d", &no_of_nodes);
|
||||
|
||||
int num_of_blocks = 1;
|
||||
int num_of_threads_per_block = no_of_nodes;
|
||||
|
||||
// Make execution Parameters according to the number of nodes
|
||||
// Distribute threads across multiple Blocks if necessary
|
||||
if (no_of_nodes > MAX_THREADS_PER_BLOCK) {
|
||||
num_of_blocks = (int)ceil(no_of_nodes / (double)MAX_THREADS_PER_BLOCK);
|
||||
num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
|
||||
}
|
||||
work_group_size = num_of_threads_per_block;
|
||||
// allocate host memory
|
||||
h_graph_nodes = (Node *)malloc(sizeof(Node) * no_of_nodes);
|
||||
h_graph_mask = (char *)malloc(sizeof(char) * no_of_nodes);
|
||||
h_updating_graph_mask = (char *)malloc(sizeof(char) * no_of_nodes);
|
||||
h_graph_visited = (char *)malloc(sizeof(char) * no_of_nodes);
|
||||
|
||||
int start, edgeno;
|
||||
// initalize the memory
|
||||
for (int i = 0; i < no_of_nodes; i++) {
|
||||
fscanf(fp, "%d %d", &start, &edgeno);
|
||||
h_graph_nodes[i].starting = start;
|
||||
h_graph_nodes[i].no_of_edges = edgeno;
|
||||
h_graph_mask[i] = false;
|
||||
h_updating_graph_mask[i] = false;
|
||||
h_graph_visited[i] = false;
|
||||
}
|
||||
// read the source node from the file
|
||||
fscanf(fp, "%d", &source);
|
||||
source = 0;
|
||||
// set the source node as true in the mask
|
||||
h_graph_mask[source] = true;
|
||||
h_graph_visited[source] = true;
|
||||
fscanf(fp, "%d", &edge_list_size);
|
||||
int id, cost;
|
||||
int *h_graph_edges = (int *)malloc(sizeof(int) * edge_list_size);
|
||||
for (int i = 0; i < edge_list_size; i++) {
|
||||
fscanf(fp, "%d", &id);
|
||||
fscanf(fp, "%d", &cost);
|
||||
h_graph_edges[i] = id;
|
||||
}
|
||||
|
||||
if (fp)
|
||||
fclose(fp);
|
||||
// allocate mem for the result on host side
|
||||
int *h_cost = (int *)malloc(sizeof(int) * no_of_nodes);
|
||||
int *h_cost_ref = (int *)malloc(sizeof(int) * no_of_nodes);
|
||||
for (int i = 0; i < no_of_nodes; i++) {
|
||||
h_cost[i] = -1;
|
||||
h_cost_ref[i] = -1;
|
||||
}
|
||||
h_cost[source] = 0;
|
||||
h_cost_ref[source] = 0;
|
||||
//---------------------------------------------------------
|
||||
//--gpu entry
|
||||
run_bfs_gpu(no_of_nodes, h_graph_nodes, edge_list_size, h_graph_edges,
|
||||
h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost);
|
||||
//---------------------------------------------------------
|
||||
//--cpu entry
|
||||
// initalize the memory again
|
||||
for (int i = 0; i < no_of_nodes; i++) {
|
||||
h_graph_mask[i] = false;
|
||||
h_updating_graph_mask[i] = false;
|
||||
h_graph_visited[i] = false;
|
||||
}
|
||||
// set the source node as true in the mask
|
||||
source = 0;
|
||||
h_graph_mask[source] = true;
|
||||
h_graph_visited[source] = true;
|
||||
run_bfs_cpu(no_of_nodes, h_graph_nodes, edge_list_size, h_graph_edges,
|
||||
h_graph_mask, h_updating_graph_mask, h_graph_visited,
|
||||
h_cost_ref);
|
||||
//---------------------------------------------------------
|
||||
//--result varification
|
||||
compare_results<int>(h_cost_ref, h_cost, no_of_nodes);
|
||||
// release host memory
|
||||
free(h_graph_nodes);
|
||||
free(h_graph_mask);
|
||||
free(h_updating_graph_mask);
|
||||
free(h_graph_visited);
|
||||
|
||||
} catch (std::string msg) {
|
||||
std::cout << "--cambine: exception in main ->" << msg << std::endl;
|
||||
// release host memory
|
||||
free(h_graph_nodes);
|
||||
free(h_graph_mask);
|
||||
free(h_updating_graph_mask);
|
||||
free(h_graph_visited);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
1
benchmarks/old_opencl/bfs/run
Executable file
1
benchmarks/old_opencl/bfs/run
Executable file
@@ -0,0 +1 @@
|
||||
./bfs ../../data/bfs/graph1MW_6.txt
|
||||
78
benchmarks/old_opencl/bfs/timer.cc
Executable file
78
benchmarks/old_opencl/bfs/timer.cc
Executable file
@@ -0,0 +1,78 @@
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iomanip>
|
||||
|
||||
#include "timer.h"
|
||||
|
||||
|
||||
using namespace std;
|
||||
|
||||
double timer::CPU_speed_in_MHz = timer::get_CPU_speed_in_MHz();
|
||||
|
||||
|
||||
double timer::get_CPU_speed_in_MHz()
|
||||
{
|
||||
#if defined __linux__
|
||||
ifstream infile("/proc/cpuinfo");
|
||||
char buffer[256], *colon;
|
||||
|
||||
while (infile.good()) {
|
||||
infile.getline(buffer, 256);
|
||||
|
||||
if (strncmp("cpu MHz", buffer, 7) == 0 && (colon = strchr(buffer, ':')) != 0)
|
||||
return atof(colon + 2);
|
||||
}
|
||||
#endif
|
||||
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
|
||||
void timer::print_time(ostream &str, const char *which, double time) const
|
||||
{
|
||||
static const char *units[] = { " ns", " us", " ms", " s", " ks", 0 };
|
||||
const char **unit = units;
|
||||
|
||||
time = 1000.0 * time / CPU_speed_in_MHz;
|
||||
|
||||
while (time >= 999.5 && unit[1] != 0) {
|
||||
time /= 1000.0;
|
||||
++ unit;
|
||||
}
|
||||
|
||||
str << which << " = " << setprecision(3) << setw(4) << time << *unit;
|
||||
}
|
||||
|
||||
|
||||
ostream &timer::print(ostream &str)
|
||||
{
|
||||
str << left << setw(25) << (name != 0 ? name : "timer") << ": " << right;
|
||||
|
||||
if (CPU_speed_in_MHz == 0)
|
||||
str << "could not determine CPU speed\n";
|
||||
else if (count > 0) {
|
||||
double total = static_cast<double>(total_time);
|
||||
|
||||
print_time(str, "avg", total / static_cast<double>(count));
|
||||
print_time(str, ", total", total);
|
||||
str << ", count = " << setw(9) << count << '\n';
|
||||
}
|
||||
else
|
||||
str << "not used\n";
|
||||
|
||||
return str;
|
||||
}
|
||||
|
||||
|
||||
ostream &operator << (ostream &str, class timer &timer)
|
||||
{
|
||||
return timer.print(str);
|
||||
}
|
||||
|
||||
double timer::getTimeInSeconds()
|
||||
{
|
||||
double total = static_cast<double>(total_time);
|
||||
double res = (total / 1000000.0) / CPU_speed_in_MHz;
|
||||
return res;
|
||||
}
|
||||
101
benchmarks/old_opencl/bfs/timer.h
Executable file
101
benchmarks/old_opencl/bfs/timer.h
Executable file
@@ -0,0 +1,101 @@
|
||||
#ifndef timer_h
|
||||
#define timer_h
|
||||
|
||||
#include <iostream>
|
||||
|
||||
class timer {
|
||||
public:
|
||||
timer(const char *name = 0);
|
||||
timer(const char *name, std::ostream &write_on_exit);
|
||||
|
||||
~timer();
|
||||
|
||||
void start(), stop();
|
||||
void reset();
|
||||
std::ostream &print(std::ostream &);
|
||||
|
||||
double getTimeInSeconds();
|
||||
|
||||
private:
|
||||
void print_time(std::ostream &, const char *which, double time) const;
|
||||
|
||||
union {
|
||||
long long total_time;
|
||||
struct {
|
||||
#if defined __PPC__
|
||||
int high, low;
|
||||
#else
|
||||
int low, high;
|
||||
#endif
|
||||
};
|
||||
};
|
||||
|
||||
unsigned long long count;
|
||||
const char *const name;
|
||||
std::ostream *const write_on_exit;
|
||||
|
||||
static double CPU_speed_in_MHz, get_CPU_speed_in_MHz();
|
||||
};
|
||||
|
||||
std::ostream &operator<<(std::ostream &, class timer &);
|
||||
|
||||
inline void timer::reset() {
|
||||
total_time = 0;
|
||||
count = 0;
|
||||
}
|
||||
|
||||
inline timer::timer(const char *name) : name(name), write_on_exit(0) {
|
||||
reset();
|
||||
}
|
||||
|
||||
inline timer::timer(const char *name, std::ostream &write_on_exit)
|
||||
: name(name), write_on_exit(&write_on_exit) {
|
||||
reset();
|
||||
}
|
||||
|
||||
inline timer::~timer() {
|
||||
if (write_on_exit != 0)
|
||||
print(*write_on_exit);
|
||||
}
|
||||
|
||||
inline void timer::start() {
|
||||
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
|
||||
unsigned eax, edx;
|
||||
|
||||
asm volatile("rdtsc" : "=a"(eax), "=d"(edx));
|
||||
|
||||
total_time -= ((unsigned long long)edx << 32) + eax;
|
||||
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && \
|
||||
(defined __i386 || defined __x86_64)
|
||||
asm volatile("rdtsc\n\t"
|
||||
"subl %%eax, %0\n\t"
|
||||
"sbbl %%edx, %1"
|
||||
: "+m"(low), "+m"(high)
|
||||
:
|
||||
: "eax", "edx");
|
||||
#else
|
||||
#error Compiler/Architecture not recognized
|
||||
#endif
|
||||
}
|
||||
|
||||
inline void timer::stop() {
|
||||
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
|
||||
unsigned eax, edx;
|
||||
|
||||
asm volatile("rdtsc" : "=a"(eax), "=d"(edx));
|
||||
|
||||
total_time += ((unsigned long long)edx << 32) + eax;
|
||||
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && \
|
||||
(defined __i386 || defined __x86_64)
|
||||
asm volatile("rdtsc\n\t"
|
||||
"addl %%eax, %0\n\t"
|
||||
"adcl %%edx, %1"
|
||||
: "+m"(low), "+m"(high)
|
||||
:
|
||||
: "eax", "edx");
|
||||
#endif
|
||||
|
||||
++count;
|
||||
}
|
||||
|
||||
#endif
|
||||
72
benchmarks/old_opencl/bfs/util.h
Executable file
72
benchmarks/old_opencl/bfs/util.h
Executable file
@@ -0,0 +1,72 @@
|
||||
#ifndef _C_UTIL_
|
||||
#define _C_UTIL_
|
||||
#include <math.h>
|
||||
#include <iostream>
|
||||
|
||||
//-------------------------------------------------------------------
|
||||
//--initialize array with maximum limit
|
||||
//-------------------------------------------------------------------
|
||||
template<typename datatype>
|
||||
void fill(datatype *A, const int n, const datatype maxi){
|
||||
for (int j = 0; j < n; j++)
|
||||
{
|
||||
A[j] = ((datatype) maxi * (rand() / (RAND_MAX + 1.0f)));
|
||||
}
|
||||
}
|
||||
|
||||
//--print matrix
|
||||
template<typename datatype>
|
||||
void print_matrix(datatype *A, int height, int width){
|
||||
for(int i=0; i<height; i++){
|
||||
for(int j=0; j<width; j++){
|
||||
int idx = i*width + j;
|
||||
std::cout<<A[idx]<<" ";
|
||||
}
|
||||
std::cout<<std::endl;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
//-------------------------------------------------------------------
|
||||
//--verify results
|
||||
//-------------------------------------------------------------------
|
||||
#define MAX_RELATIVE_ERROR .002
|
||||
template<typename datatype>
|
||||
void verify_array(const datatype *cpuResults, const datatype *gpuResults, const int size){
|
||||
|
||||
char passed = true;
|
||||
#pragma omp parallel for
|
||||
for (int i=0; i<size; i++){
|
||||
if (fabs(cpuResults[i] - gpuResults[i]) / cpuResults[i] > MAX_RELATIVE_ERROR){
|
||||
passed = false;
|
||||
}
|
||||
}
|
||||
if (passed){
|
||||
std::cout << "--cambine:passed:-)" << endl;
|
||||
}
|
||||
else{
|
||||
std::cout << "--cambine: failed:-(" << endl;
|
||||
}
|
||||
return ;
|
||||
}
|
||||
template<typename datatype>
|
||||
void compare_results(const datatype *cpu_results, const datatype *gpu_results, const int size){
|
||||
|
||||
char passed = true;
|
||||
//#pragma omp parallel for
|
||||
for (int i=0; i<size; i++){
|
||||
if (cpu_results[i]!=gpu_results[i]){
|
||||
passed = false;
|
||||
}
|
||||
}
|
||||
if (passed){
|
||||
std::cout << "--cambine:passed:-)" << endl;
|
||||
}
|
||||
else{
|
||||
std::cout << "--cambine: failed:-(" << endl;
|
||||
}
|
||||
return ;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user