project directories reorganization

2020-04-14 06:35:20 -04:00
parent 1de06fd9c0
commit fc155e1223
1056 changed files with 8120 additions and 8120 deletions
--- a/benchmarks/old_opencl/bfs/CLHelper.h
+++ b/benchmarks/old_opencl/bfs/CLHelper.h
@@ -0,0 +1,816 @@
+//------------------------------------------
+//--cambine:helper function for OpenCL
+//--programmer:	Jianbin Fang
+//--date:	27/12/2010
+//------------------------------------------
+#ifndef _CL_HELPER_
+#define _CL_HELPER_
+
+#include <CL/cl.h>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+
+using std::string;
+using std::ifstream;
+using std::cerr;
+using std::endl;
+using std::cout;
+//#pragma OPENCL EXTENSION cl_nv_compiler_options:enable
+#define WORK_DIM 2 // work-items dimensions
+
+// Bundle of the OpenCL objects and status fields shared by every helper in
+// this header; the helpers access it through the global `oclHandles`.
+struct oclHandleStruct {
+  // Context created by _clInit() and released by _clRelease().
+  cl_context context;
+  // Device list allocated with malloc() in _clInit(), freed in _clRelease().
+  cl_device_id *devices;
+  // Command queue every transfer/kernel helper enqueues on.
+  cl_command_queue queue;
+  // Program object the kernels are created from.
+  cl_program program;
+  // Status code written by the helpers after their OpenCL call.
+  cl_int cl_status;
+  // Error text assembled by the helpers when ERRMSG is defined.
+  std::string error_str;
+  // Kernel objects pushed by _clInit(), in kernel_names[] order.
+  std::vector<cl_kernel> kernel;
+};
+
+// Global OpenCL state used by all helpers below.
+struct oclHandleStruct oclHandles;
+
+// Kernel source file name. _clInit() creates the program from built-in
+// kernels and does not read this file.
+char kernel_file[100] = "Kernels.cl";
+// Number of entries of kernel_names[] that _clInit() turns into kernels.
+int total_kernels = 2;
+// Kernel entry points; the index is the kernel_id taken by the helpers.
+string kernel_names[2] = {"BFS_1", "BFS_2"};
+// Work-group size; overwritten by the -g option in _clCmdParams().
+int work_group_size = 512;
+// Overwritten by the -d option in _clCmdParams().
+int device_id_inused = 0; // device id used (default: 0)
+
+/*
+ * Reads the whole content of a file into a string.
+ *
+ * fileName: path of the file; it is opened in binary mode, so the bytes are
+ *           returned unmodified (embedded '\0' bytes included).
+ * Returns:  the file content; an empty string for an empty file.
+ * Throws:   std::string ("FileToString()::Error: Unable to open file <name>")
+ *           when the file cannot be opened, its size cannot be determined,
+ *           or an exception (e.g. allocation failure) occurs while reading.
+ */
+std::string FileToString(const std::string &fileName) {
+  std::ifstream f(fileName.c_str(),
+                  std::ifstream::in | std::ifstream::binary);
+
+  if (f.is_open()) {
+    try {
+      f.seekg(0, std::ifstream::end);
+      // tellg() reports -1 on failure; keep it signed so that case is
+      // detected instead of being turned into a gigantic allocation.
+      const std::streamoff fileSize = f.tellg();
+      f.seekg(0, std::ifstream::beg);
+
+      if (fileSize >= 0) {
+        // Read straight into the string: no manual buffer to leak.
+        std::string contents(static_cast<std::string::size_type>(fileSize),
+                             '\0');
+        if (fileSize > 0) {
+          f.read(&contents[0], static_cast<std::streamsize>(fileSize));
+          // Keep only what was actually read if the read came up short.
+          contents.resize(static_cast<std::string::size_type>(f.gcount()));
+        }
+        return contents;
+      }
+      std::cerr << "Exception caught in FileToString(): "
+                << "Could not determine file size" << std::endl;
+    } catch (...) {
+      std::cerr << "Exception caught in FileToString()" << std::endl;
+    }
+  }
+  // The stream is closed by its destructor on every path.
+  const std::string errorMsg =
+      "FileToString()::Error: Unable to open file " + fileName;
+  throw(errorMsg);
+}
+//---------------------------------------
+// Read command line parameters
+//   -g <n>   size of a work group  -> work_group_size
+//   -d <n>   device id to use      -> device_id_inused
+// Every argument that is not "-g" / "-d" style (no leading '-', or another
+// letter) is ignored, so positional arguments are never mistaken for options.
+// A value that is not a number is reported on stderr and the previous
+// setting is kept.
+// Throws std::string when an option is the last argument and therefore has
+// no value.
+void _clCmdParams(int argc, char *argv[]) {
+  for (int i = 0; i < argc; ++i) {
+    const char *option = argv[i];
+    // Only look at "-x..." arguments; this also guarantees option[1] exists.
+    if (option == NULL || option[0] != '-' || option[1] == '\0')
+      continue;
+
+    int *target = NULL;
+    switch (option[1]) {
+    case 'g': // size of work group
+      target = &work_group_size;
+      break;
+    case 'd': // device id used in computation
+      target = &device_id_inused;
+      break;
+    default:
+      break;
+    }
+    if (target == NULL)
+      continue;
+
+    if (++i >= argc) {
+      std::cerr << "Could not read argument after option " << option
+                << std::endl;
+      // A bare `throw;` without an active exception would call
+      // std::terminate(); throw something the caller can catch instead.
+      throw(string("_clCmdParams()::Error: missing value after option ") +
+            option);
+    }
+    // Both targets are int, so scan with %d.
+    if (sscanf(argv[i], "%d", target) != 1) {
+      std::cerr << "Could not parse value \"" << argv[i] << "\" for option "
+                << option << std::endl;
+    }
+  }
+}
+
+//---------------------------------------
+// Initialize CL objects
+//--description: there are 5 steps to initialize all the OpenCL objects needed
+//--revised on 04/01/2011: get the number of devices  and
+//  devices have no relationship with context
+//
+// Fills the global oclHandles: device list, context, command queue, program
+// (created from the built-in kernels "BFS_1;BFS_2") and one kernel object per
+// kernel_names[] entry.
+// Throws std::string on any failure. Objects created before the failure stay
+// in oclHandles, so _clRelease() can still be used to clean up.
+void _clInit() {
+  printf("_clInit()\n");
+
+  cl_int resultCL;
+
+  oclHandles.context = NULL;
+  oclHandles.devices = NULL;
+  oclHandles.queue = NULL;
+  oclHandles.program = NULL;
+
+  //-----------------------------------------------
+  //--cambine-1: find the available platforms and select one
+  // Only the first platform is requested, so no heap-allocated list is needed.
+  cl_platform_id targetPlatform = NULL;
+  resultCL = clGetPlatformIDs(1, &targetPlatform, NULL);
+  if (resultCL != CL_SUCCESS)
+    throw(string("InitCL()::Error: Getting platform ids (clGetPlatformIDs)"));
+
+  //-----------------------------------------------
+  //--cambine-2/3: detect the OpenCL device and create a context for it
+  // Exactly one device (the platform default) is queried.
+  const cl_uint deviceListSize = 1;
+
+  // The selected device id indexes the device list below; reject anything
+  // outside of it instead of reading past the allocation.
+  if (device_id_inused < 0 ||
+      static_cast<cl_uint>(device_id_inused) >= deviceListSize)
+    throw(string("InitCL()::Error: Device id out of range."));
+  const cl_uint DEVICE_ID_INUSED = static_cast<cl_uint>(device_id_inused);
+
+  // Now, allocate the device list
+  oclHandles.devices =
+      (cl_device_id *)malloc(deviceListSize * sizeof(cl_device_id));
+  if (oclHandles.devices == 0)
+    throw(string("InitCL()::Error: Could not allocate memory."));
+
+  // Next, get the device list data
+  oclHandles.cl_status =
+      clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_DEFAULT, deviceListSize,
+                     oclHandles.devices, NULL);
+  if (oclHandles.cl_status != CL_SUCCESS) {
+    throw(string("exception in _clInit -> clGetDeviceIDs-2"));
+  }
+
+  oclHandles.context = clCreateContext(NULL, deviceListSize, oclHandles.devices,
+                                       NULL, NULL, &resultCL);
+  if ((resultCL != CL_SUCCESS) || (oclHandles.context == NULL))
+    throw(string("InitCL()::Error: Creating Context (clCreateContext)"));
+
+  //-----------------------------------------------
+  //--cambine-4: Create an OpenCL command queue
+  oclHandles.queue = clCreateCommandQueue(
+      oclHandles.context, oclHandles.devices[DEVICE_ID_INUSED], 0, &resultCL);
+  // The queue is a pointer-sized handle: print it with %p, not %x.
+  printf("resultCL=%d, queue=%p\n", resultCL, (void *)oclHandles.queue);
+
+  if ((resultCL != CL_SUCCESS) || (oclHandles.queue == NULL))
+    throw(string("InitCL()::Creating Command Queue. (clCreateCommandQueue)"));
+
+  //-----------------------------------------------
+  //--cambine-5: build CL program object, create CL kernel object
+  // The program is created from kernels built into the device rather than
+  // from kernel source text.
+  oclHandles.program = clCreateProgramWithBuiltInKernels(
+      oclHandles.context, 1, &oclHandles.devices[DEVICE_ID_INUSED],
+      "BFS_1;BFS_2", &resultCL);
+  if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL))
+    throw(string("InitCL()::Error: Creating cl_program from built-in kernels. "
+                 "(clCreateProgramWithBuiltInKernels)"));
+
+  resultCL = clBuildProgram(oclHandles.program, deviceListSize,
+                            oclHandles.devices, NULL, NULL, NULL);
+  if (resultCL != CL_SUCCESS) {
+    cerr << "InitCL()::Error: In clBuildProgram" << endl;
+
+    size_t length = 0;
+    resultCL = clGetProgramBuildInfo(oclHandles.program,
+                                     oclHandles.devices[DEVICE_ID_INUSED],
+                                     CL_PROGRAM_BUILD_LOG, 0, NULL, &length);
+    if (resultCL != CL_SUCCESS)
+      throw(string("InitCL()::Error: Getting Program build "
+                   "info(clGetProgramBuildInfo)"));
+
+    // One spare byte keeps the log terminated; the vector frees itself on
+    // every exit path, including the throws below.
+    std::vector<char> buffer(length + 1, '\0');
+    resultCL = clGetProgramBuildInfo(
+        oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED],
+        CL_PROGRAM_BUILD_LOG, length, &buffer[0], NULL);
+    if (resultCL != CL_SUCCESS)
+      throw(string("InitCL()::Error: Getting Program build "
+                   "info(clGetProgramBuildInfo)"));
+
+    cerr << &buffer[0] << endl;
+
+    throw(string("InitCL()::Error: Building Program (clBuildProgram)"));
+  }
+
+// get program information in intermediate representation
+#ifdef PTX_MSG
+  std::vector<size_t> binary_sizes(deviceListSize);
+  // figure out the sizes of the binary for each device.
+  oclHandles.cl_status =
+      clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARY_SIZES,
+                       sizeof(size_t) * deviceListSize, &binary_sizes[0], NULL);
+  if (oclHandles.cl_status != CL_SUCCESS) {
+    throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-2"));
+  }
+
+  std::cout << "--cambine:" << binary_sizes[DEVICE_ID_INUSED] << std::endl;
+  // copy over all of the generated binaries (one extra byte each so they can
+  // be written out as C strings).
+  std::vector<std::vector<char> > binaries(deviceListSize);
+  std::vector<char *> binary_ptrs(deviceListSize);
+  for (cl_uint i = 0; i < deviceListSize; i++) {
+    binaries[i].assign(binary_sizes[i] + 1, '\0');
+    binary_ptrs[i] = &binaries[i][0];
+  }
+  oclHandles.cl_status =
+      clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARIES,
+                       sizeof(char *) * deviceListSize, &binary_ptrs[0], NULL);
+  if (oclHandles.cl_status != CL_SUCCESS) {
+    throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-3"));
+  }
+  std::cout << "--cambine:writing ptd information..." << std::endl;
+  FILE *ptx_file = fopen("cl.ptx", "w");
+  if (ptx_file == NULL) {
+    throw(string("exceptions in allocate ptx file."));
+  }
+  fprintf(ptx_file, "%s", binary_ptrs[DEVICE_ID_INUSED]);
+  fclose(ptx_file);
+  std::cout << "--cambine:writing ptd information done." << std::endl;
+#endif
+
+  for (int nKernel = 0; nKernel < total_kernels; nKernel++) {
+    /* get a kernel object handle for a kernel with the given name */
+    cl_kernel kernel = clCreateKernel(
+        oclHandles.program, (kernel_names[nKernel]).c_str(), &resultCL);
+
+    if ((resultCL != CL_SUCCESS) || (kernel == NULL)) {
+      string errorMsg = "InitCL()::Error: Creating Kernel (clCreateKernel) \"" +
+                        kernel_names[nKernel] + "\"";
+      throw(errorMsg);
+    }
+
+    oclHandles.kernel.push_back(kernel);
+  }
+// get resource allocation information
+#ifdef RES_MSG
+  size_t ret_val_size = 0;
+  oclHandles.cl_status = clGetProgramBuildInfo(
+      oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED],
+      CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
+  if (oclHandles.cl_status != CL_SUCCESS) {
+    throw(string("exceptions in _InitCL -> getting resource information"));
+  }
+
+  std::vector<char> build_log(ret_val_size + 1, '\0');
+  oclHandles.cl_status = clGetProgramBuildInfo(
+      oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED],
+      CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
+  if (oclHandles.cl_status != CL_SUCCESS) {
+    throw(string(
+        "exceptions in _InitCL -> getting resources allocation information-2"));
+  }
+  std::cout << "--cambine:" << &build_log[0] << std::endl;
+#endif
+}
+
+//---------------------------------------
+// release CL objects
+// Releases every kernel, then the program, the command queue, the device
+// list and finally the context held in oclHandles, resetting each handle so
+// a second call is harmless.
+// All objects are attempted even if one release fails; in that case a
+// std::string is thrown after the last one.
+void _clRelease() {
+  bool errorFlag = false;
+
+  for (size_t nKernel = 0; nKernel < oclHandles.kernel.size(); nKernel++) {
+    if (oclHandles.kernel[nKernel] != NULL) {
+      cl_int resultCL = clReleaseKernel(oclHandles.kernel[nKernel]);
+      if (resultCL != CL_SUCCESS) {
+        cerr << "ReleaseCL()::Error: In clReleaseKernel" << endl;
+        errorFlag = true;
+      }
+      oclHandles.kernel[nKernel] = NULL;
+    }
+  }
+  // Clear only after the loop: clearing inside it would end the loop after
+  // the first kernel and leak all the others.
+  oclHandles.kernel.clear();
+
+  if (oclHandles.program != NULL) {
+    cl_int resultCL = clReleaseProgram(oclHandles.program);
+    if (resultCL != CL_SUCCESS) {
+      cerr << "ReleaseCL()::Error: In clReleaseProgram" << endl;
+      errorFlag = true;
+    }
+    oclHandles.program = NULL;
+  }
+
+  if (oclHandles.queue != NULL) {
+    cl_int resultCL = clReleaseCommandQueue(oclHandles.queue);
+    if (resultCL != CL_SUCCESS) {
+      cerr << "ReleaseCL()::Error: In clReleaseCommandQueue" << endl;
+      errorFlag = true;
+    }
+    oclHandles.queue = NULL;
+  }
+
+  // free(NULL) is a no-op; reset the pointer to avoid a double free.
+  free(oclHandles.devices);
+  oclHandles.devices = NULL;
+
+  if (oclHandles.context != NULL) {
+    cl_int resultCL = clReleaseContext(oclHandles.context);
+    if (resultCL != CL_SUCCESS) {
+      cerr << "ReleaseCL()::Error: In clReleaseContext" << endl;
+      errorFlag = true;
+    }
+    oclHandles.context = NULL;
+  }
+
+  if (errorFlag)
+    throw(string("ReleaseCL()::Error encountered."));
+}
+//--------------------------------------------------------
+//--cambine:create buffer and then copy data from host to device
+// Creates a CL_MEM_READ_ONLY buffer of `size` bytes whose content is copied
+// from h_mem_source at creation time (CL_MEM_COPY_HOST_PTR).
+// Returns the buffer handle; the OpenCL status is left in
+// oclHandles.cl_status. Throws std::string on failure only when ERRMSG is
+// defined.
+// (The dynamic exception specification was dropped: it is not valid C++17.)
+cl_mem _clCreateAndCpyMem(int size, void *h_mem_source) {
+  cl_mem d_mem = clCreateBuffer(oclHandles.context,
+                                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, size,
+                                h_mem_source, &oclHandles.cl_status);
+#ifdef ERRMSG
+  if (oclHandles.cl_status != CL_SUCCESS)
+    throw(string("excpetion in _clCreateAndCpyMem()"));
+#endif
+  return d_mem;
+}
+//-------------------------------------------------------
+//--cambine:	create read and write buffer for devices
+//--date:	17/01/2011
+// Creates a CL_MEM_READ_WRITE buffer of `size` bytes whose content is copied
+// from h_mem_ptr at creation time (CL_MEM_COPY_HOST_PTR).
+// Returns the buffer handle; the OpenCL status is left in
+// oclHandles.cl_status. Throws std::string on failure only when ERRMSG is
+// defined.
+// (The dynamic exception specification was dropped: it is not valid C++17.)
+cl_mem _clMallocRW(int size, void *h_mem_ptr) {
+  cl_mem d_mem = clCreateBuffer(oclHandles.context,
+                                CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size,
+                                h_mem_ptr, &oclHandles.cl_status);
+#ifdef ERRMSG
+  if (oclHandles.cl_status != CL_SUCCESS)
+    throw(string("excpetion in _clMallocRW"));
+#endif
+  return d_mem;
+}
+//-------------------------------------------------------
+//--cambine:	create write only buffer for devices, initialized from host
+//--date:	17/01/2011
+// Creates a CL_MEM_WRITE_ONLY buffer of `size` bytes whose content is copied
+// from h_mem_ptr at creation time (CL_MEM_COPY_HOST_PTR).
+// Returns the buffer handle; the OpenCL status is left in
+// oclHandles.cl_status. Throws std::string on failure only when ERRMSG is
+// defined.
+// (The dynamic exception specification was dropped: it is not valid C++17.)
+cl_mem _clMalloc(int size, void *h_mem_ptr) {
+  cl_mem d_mem = clCreateBuffer(oclHandles.context,
+                                CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, size,
+                                h_mem_ptr, &oclHandles.cl_status);
+#ifdef ERRMSG
+  if (oclHandles.cl_status != CL_SUCCESS)
+    throw(string("excpetion in _clMalloc"));
+#endif
+  return d_mem;
+}
+
+//-------------------------------------------------------
+//--cambine:	transfer data from host to device
+//--date:	17/01/2011
+// Blocking write (CL_TRUE) of `size` bytes from h_mem_ptr to the start of
+// d_mem on oclHandles.queue. The OpenCL status is left in
+// oclHandles.cl_status. Throws std::string on failure only when ERRMSG is
+// defined.
+// (The dynamic exception specification was dropped: it is not valid C++17.)
+void _clMemcpyH2D(cl_mem d_mem, int size, const void *h_mem_ptr) {
+  oclHandles.cl_status = clEnqueueWriteBuffer(
+      oclHandles.queue, d_mem, CL_TRUE, 0, size, h_mem_ptr, 0, NULL, NULL);
+#ifdef ERRMSG
+  if (oclHandles.cl_status != CL_SUCCESS)
+    throw(string("excpetion in _clMemcpyH2D"));
+#endif
+}
+//--------------------------------------------------------
+//--cambine:create buffer and then copy data from host to device with pinned
+// memory
+// Steps: allocate a host-accessible staging buffer (CL_MEM_ALLOC_HOST_PTR),
+// allocate the CL_MEM_READ_ONLY device buffer, map the staging buffer, copy
+// h_mem_source into it, then do a blocking write from the mapped pointer to
+// the device buffer.
+// The staging buffer is unmapped and released before returning, so only the
+// returned device buffer has to be freed by the caller.
+// `size` is in bytes; size / sizeof(float) elements are copied.
+// The OpenCL status of the last transfer step is left in
+// oclHandles.cl_status. Throws std::string on failure only when ERRMSG is
+// defined.
+// (The dynamic exception specification was dropped: it is not valid C++17.)
+cl_mem _clCreateAndCpyPinnedMem(int size, float *h_mem_source) {
+  cl_mem d_mem, d_mem_pinned;
+  float *h_mem_pinned = NULL;
+  d_mem_pinned = clCreateBuffer(oclHandles.context,
+                                CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size,
+                                NULL, &oclHandles.cl_status);
+#ifdef ERRMSG
+  if (oclHandles.cl_status != CL_SUCCESS)
+    throw(string("excpetion in _clCreateAndCpyMem()->d_mem_pinned"));
+#endif
+  //------------
+  d_mem = clCreateBuffer(oclHandles.context, CL_MEM_READ_ONLY, size, NULL,
+                         &oclHandles.cl_status);
+#ifdef ERRMSG
+  if (oclHandles.cl_status != CL_SUCCESS) {
+    if (d_mem_pinned != NULL)
+      clReleaseMemObject(d_mem_pinned);
+    throw(string("excpetion in _clCreateAndCpyMem() -> d_mem "));
+  }
+#endif
+  //----------
+  h_mem_pinned = (cl_float *)clEnqueueMapBuffer(
+      oclHandles.queue, d_mem_pinned, CL_TRUE, CL_MAP_WRITE, 0, size, 0, NULL,
+      NULL, &oclHandles.cl_status);
+#ifdef ERRMSG
+  if (oclHandles.cl_status != CL_SUCCESS) {
+    if (d_mem_pinned != NULL)
+      clReleaseMemObject(d_mem_pinned);
+    if (d_mem != NULL)
+      clReleaseMemObject(d_mem);
+    throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueMapBuffer"));
+  }
+#endif
+  // Without ERRMSG a failed mapping is only visible as a NULL pointer (and
+  // in oclHandles.cl_status); never copy through it.
+  if (h_mem_pinned != NULL) {
+    int element_number = size / sizeof(float);
+#pragma omp parallel for
+    for (int i = 0; i < element_number; i++) {
+      h_mem_pinned[i] = h_mem_source[i];
+    }
+    //----------
+    // Blocking write: the data has left the staging area when this returns.
+    oclHandles.cl_status = clEnqueueWriteBuffer(
+        oclHandles.queue, d_mem, CL_TRUE, 0, size, h_mem_pinned, 0, NULL, NULL);
+    clEnqueueUnmapMemObject(oclHandles.queue, d_mem_pinned, h_mem_pinned, 0,
+                            NULL, NULL);
+  }
+  // The staging buffer is not returned to the caller; release it here so it
+  // does not leak on every call.
+  if (d_mem_pinned != NULL)
+    clReleaseMemObject(d_mem_pinned);
+#ifdef ERRMSG
+  if (oclHandles.cl_status != CL_SUCCESS) {
+    if (d_mem != NULL)
+      clReleaseMemObject(d_mem);
+    throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueWriteBuffer"));
+  }
+#endif
+
+  return d_mem;
+}
+
+//--------------------------------------------------------
+//--cambine:create write only buffer on device
+// Creates an uninitialized CL_MEM_WRITE_ONLY buffer of `size` bytes.
+// Returns the buffer handle; the OpenCL status is left in
+// oclHandles.cl_status. Throws std::string on failure only when ERRMSG is
+// defined.
+// (The dynamic exception specification was dropped: it is not valid C++17.)
+cl_mem _clMallocWO(int size) {
+  cl_mem d_mem = clCreateBuffer(oclHandles.context, CL_MEM_WRITE_ONLY, size,
+                                0, &oclHandles.cl_status);
+#ifdef ERRMSG
+  if (oclHandles.cl_status != CL_SUCCESS)
+    throw(string("excpetion in _clCreateMem()"));
+#endif
+  return d_mem;
+}
+
+//--------------------------------------------------------
+// transfer data from device to host
+// Blocking read (CL_TRUE) of `size` bytes from the start of d_mem into h_mem
+// on oclHandles.queue. The OpenCL status is left in oclHandles.cl_status.
+// When ERRMSG is defined, a failure is decoded into oclHandles.error_str and
+// thrown as std::string.
+// (The dynamic exception specification was dropped: it is not valid C++17.)
+void _clMemcpyD2H(cl_mem d_mem, int size, void *h_mem) {
+  oclHandles.cl_status = clEnqueueReadBuffer(oclHandles.queue, d_mem, CL_TRUE,
+                                             0, size, h_mem, 0, 0, 0);
+#ifdef ERRMSG
+  // Build the message only when there is something to report.
+  if (oclHandles.cl_status != CL_SUCCESS) {
+    oclHandles.error_str = "excpetion in _clCpyMemD2H -> ";
+    switch (oclHandles.cl_status) {
+    case CL_INVALID_COMMAND_QUEUE:
+      oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
+      break;
+    case CL_INVALID_CONTEXT:
+      oclHandles.error_str += "CL_INVALID_CONTEXT";
+      break;
+    case CL_INVALID_MEM_OBJECT:
+      oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
+      break;
+    case CL_INVALID_VALUE:
+      oclHandles.error_str += "CL_INVALID_VALUE";
+      break;
+    case CL_INVALID_EVENT_WAIT_LIST:
+      oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
+      break;
+    case CL_MEM_OBJECT_ALLOCATION_FAILURE:
+      oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
+      break;
+    case CL_OUT_OF_HOST_MEMORY:
+      oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
+      break;
+    default:
+      oclHandles.error_str += "Unknown reason";
+      break;
+    }
+    throw(oclHandles.error_str);
+  }
+#endif
+}
+
+//--------------------------------------------------------
+// set kernel arguments
+// kernel_id: index into oclHandles.kernel
+// arg_idx:   index of the kernel argument to set
+// d_mem / size:
+//   size == 0 -> d_mem itself is the argument value (e.g. a cl_mem handle);
+//                its address and sizeof(d_mem) are passed to clSetKernelArg.
+//   size != 0 -> d_mem points to `size` bytes holding the argument value.
+// The OpenCL status is left in oclHandles.cl_status. When ERRMSG is defined,
+// a failure is decoded into oclHandles.error_str and thrown as std::string.
+// (The dynamic exception specification was dropped: it is not valid C++17.)
+void _clSetArgs(int kernel_id, int arg_idx, void *d_mem, int size = 0) {
+  if (!size) {
+    oclHandles.cl_status = clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx,
+                                          sizeof(d_mem), &d_mem);
+  } else {
+    oclHandles.cl_status =
+        clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx, size, d_mem);
+  }
+#ifdef ERRMSG
+  // Both call forms report errors the same way, so decode once.
+  if (oclHandles.cl_status != CL_SUCCESS) {
+    oclHandles.error_str = "excpetion in _clSetKernelArg() ";
+    switch (oclHandles.cl_status) {
+    case CL_INVALID_KERNEL:
+      oclHandles.error_str += "CL_INVALID_KERNEL";
+      break;
+    case CL_INVALID_ARG_INDEX:
+      oclHandles.error_str += "CL_INVALID_ARG_INDEX";
+      break;
+    case CL_INVALID_ARG_VALUE:
+      oclHandles.error_str += "CL_INVALID_ARG_VALUE";
+      break;
+    case CL_INVALID_MEM_OBJECT:
+      oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
+      break;
+    case CL_INVALID_SAMPLER:
+      oclHandles.error_str += "CL_INVALID_SAMPLER";
+      break;
+    case CL_INVALID_ARG_SIZE:
+      oclHandles.error_str += "CL_INVALID_ARG_SIZE";
+      break;
+    case CL_OUT_OF_RESOURCES:
+      oclHandles.error_str += "CL_OUT_OF_RESOURCES";
+      break;
+    case CL_OUT_OF_HOST_MEMORY:
+      oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
+      break;
+    default:
+      oclHandles.error_str += "Unknown reason";
+      break;
+    }
+    throw(oclHandles.error_str);
+  }
+#endif
+}
+// Blocks until every command queued on oclHandles.queue has completed
+// (clFinish). The OpenCL status is left in oclHandles.cl_status. When ERRMSG
+// is defined, a failure is decoded into oclHandles.error_str and thrown as
+// std::string.
+// (The dynamic exception specification was dropped: it is not valid C++17.)
+void _clFinish() {
+  oclHandles.cl_status = clFinish(oclHandles.queue);
+#ifdef ERRMSG
+  // Build the message only when there is something to report.
+  if (oclHandles.cl_status != CL_SUCCESS) {
+    oclHandles.error_str = "excpetion in _clFinish";
+    switch (oclHandles.cl_status) {
+    case CL_INVALID_COMMAND_QUEUE:
+      oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
+      break;
+    case CL_OUT_OF_RESOURCES:
+      oclHandles.error_str += "CL_OUT_OF_RESOURCES";
+      break;
+    case CL_OUT_OF_HOST_MEMORY:
+      oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
+      break;
+    default:
+      oclHandles.error_str += "Unknown reasons";
+      break;
+    }
+    throw(oclHandles.error_str);
+  }
+#endif
+}
+//--------------------------------------------------------
+//--cambine:enqueue kernel
+// Enqueues kernel `kernel_id` as a WORK_DIM-dimensional range whose second
+// dimension is 1.
+// work_items:      requested number of work-items; rounded up to the next
+//                  multiple of work_group_size.
+// work_group_size: local size of the first dimension; must be positive.
+// The call only enqueues the kernel; use _clFinish() to wait for completion.
+// The OpenCL status is left in oclHandles.cl_status. Throws std::string for
+// a non-positive work_group_size and, when ERRMSG is defined, for an enqueue
+// failure.
+// (The dynamic exception specification was dropped: it is not valid C++17.)
+void _clInvokeKernel(int kernel_id, int work_items, int work_group_size) {
+  // Guard the modulo below against division by zero.
+  if (work_group_size <= 0)
+    throw(string("excpetion in _clInvokeKernel() -> invalid work_group_size"));
+
+  cl_uint work_dim = WORK_DIM;
+  if (work_items % work_group_size != 0) // process situations that work_items
+                                         // cannot be divided by work_group_size
+    work_items =
+        work_items + (work_group_size - (work_items % work_group_size));
+  // Explicit casts: int -> size_t inside braces is a narrowing conversion.
+  size_t local_work_size[] = {static_cast<size_t>(work_group_size), 1};
+  size_t global_work_size[] = {static_cast<size_t>(work_items), 1};
+  // No event is requested: nothing waits on it, and an unreleased event
+  // would leak on every invocation.
+  oclHandles.cl_status = clEnqueueNDRangeKernel(
+      oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0,
+      global_work_size, local_work_size, 0, 0, NULL);
+#ifdef ERRMSG
+  if (oclHandles.cl_status != CL_SUCCESS) {
+    oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
+    switch (oclHandles.cl_status) {
+    case CL_INVALID_PROGRAM_EXECUTABLE:
+      oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE";
+      break;
+    case CL_INVALID_COMMAND_QUEUE:
+      oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
+      break;
+    case CL_INVALID_KERNEL:
+      oclHandles.error_str += "CL_INVALID_KERNEL";
+      break;
+    case CL_INVALID_CONTEXT:
+      oclHandles.error_str += "CL_INVALID_CONTEXT";
+      break;
+    case CL_INVALID_KERNEL_ARGS:
+      oclHandles.error_str += "CL_INVALID_KERNEL_ARGS";
+      break;
+    case CL_INVALID_WORK_DIMENSION:
+      oclHandles.error_str += "CL_INVALID_WORK_DIMENSION";
+      break;
+    case CL_INVALID_GLOBAL_WORK_SIZE:
+      oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE";
+      break;
+    case CL_INVALID_WORK_GROUP_SIZE:
+      oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE";
+      break;
+    case CL_INVALID_WORK_ITEM_SIZE:
+      oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE";
+      break;
+    case CL_INVALID_GLOBAL_OFFSET:
+      oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET";
+      break;
+    case CL_OUT_OF_RESOURCES:
+      oclHandles.error_str += "CL_OUT_OF_RESOURCES";
+      break;
+    case CL_MEM_OBJECT_ALLOCATION_FAILURE:
+      oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
+      break;
+    case CL_INVALID_EVENT_WAIT_LIST:
+      oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
+      break;
+    case CL_OUT_OF_HOST_MEMORY:
+      oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
+      break;
+    default:
+      oclHandles.error_str += "Unkown reseason";
+      break;
+    }
+    throw(oclHandles.error_str);
+  }
+#endif
+}
+// Enqueues kernel `kernel_id` as a WORK_DIM-dimensional range.
+// range_x / range_y: global work size per dimension (used as given; it is
+//                    not rounded up to a multiple of the group size).
+// group_x / group_y: local work size per dimension.
+// The call only enqueues the kernel; use _clFinish() to wait for completion.
+// The OpenCL status is left in oclHandles.cl_status. When ERRMSG is defined,
+// a failure is decoded into oclHandles.error_str and thrown as std::string.
+// (The dynamic exception specification was dropped: it is not valid C++17.)
+void _clInvokeKernel2D(int kernel_id, int range_x, int range_y, int group_x,
+                       int group_y) {
+  cl_uint work_dim = WORK_DIM;
+  // Explicit casts: int -> size_t inside braces is a narrowing conversion.
+  size_t local_work_size[] = {static_cast<size_t>(group_x),
+                              static_cast<size_t>(group_y)};
+  size_t global_work_size[] = {static_cast<size_t>(range_x),
+                               static_cast<size_t>(range_y)};
+  // No event is requested: nothing waits on it, and an unreleased event
+  // would leak on every invocation.
+  oclHandles.cl_status = clEnqueueNDRangeKernel(
+      oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0,
+      global_work_size, local_work_size, 0, 0, NULL);
+#ifdef ERRMSG
+  if (oclHandles.cl_status != CL_SUCCESS) {
+    oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
+    switch (oclHandles.cl_status) {
+    case CL_INVALID_PROGRAM_EXECUTABLE:
+      oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE";
+      break;
+    case CL_INVALID_COMMAND_QUEUE:
+      oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
+      break;
+    case CL_INVALID_KERNEL:
+      oclHandles.error_str += "CL_INVALID_KERNEL";
+      break;
+    case CL_INVALID_CONTEXT:
+      oclHandles.error_str += "CL_INVALID_CONTEXT";
+      break;
+    case CL_INVALID_KERNEL_ARGS:
+      oclHandles.error_str += "CL_INVALID_KERNEL_ARGS";
+      break;
+    case CL_INVALID_WORK_DIMENSION:
+      oclHandles.error_str += "CL_INVALID_WORK_DIMENSION";
+      break;
+    case CL_INVALID_GLOBAL_WORK_SIZE:
+      oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE";
+      break;
+    case CL_INVALID_WORK_GROUP_SIZE:
+      oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE";
+      break;
+    case CL_INVALID_WORK_ITEM_SIZE:
+      oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE";
+      break;
+    case CL_INVALID_GLOBAL_OFFSET:
+      oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET";
+      break;
+    case CL_OUT_OF_RESOURCES:
+      oclHandles.error_str += "CL_OUT_OF_RESOURCES";
+      break;
+    case CL_MEM_OBJECT_ALLOCATION_FAILURE:
+      oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
+      break;
+    case CL_INVALID_EVENT_WAIT_LIST:
+      oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
+      break;
+    case CL_OUT_OF_HOST_MEMORY:
+      oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
+      break;
+    default:
+      oclHandles.error_str += "Unkown reseason";
+      break;
+    }
+    throw(oclHandles.error_str);
+  }
+#endif
+}
+
+//--------------------------------------------------------
+// release OpenCL objects
+// Releases the memory object `ob`; a NULL handle is ignored.
+// The OpenCL status is left in oclHandles.cl_status. When ERRMSG is defined,
+// a failure is decoded into oclHandles.error_str and thrown as std::string.
+// (The dynamic exception specification was dropped: it is not valid C++17.)
+void _clFree(cl_mem ob) {
+  // Nothing to release; returning here also avoids judging this call by a
+  // stale cl_status left behind by an earlier helper.
+  if (ob == NULL)
+    return;
+
+  oclHandles.cl_status = clReleaseMemObject(ob);
+#ifdef ERRMSG
+  if (oclHandles.cl_status != CL_SUCCESS) {
+    oclHandles.error_str = "excpetion in _clFree() ->";
+    switch (oclHandles.cl_status) {
+    case CL_INVALID_MEM_OBJECT:
+      oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
+      break;
+    case CL_OUT_OF_RESOURCES:
+      oclHandles.error_str += "CL_OUT_OF_RESOURCES";
+      break;
+    case CL_OUT_OF_HOST_MEMORY:
+      oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
+      break;
+    default:
+      oclHandles.error_str += "Unkown reseason";
+      break;
+    }
+    throw(oclHandles.error_str);
+  }
+#endif
+}
+#endif //_CL_HELPER_
--- a/benchmarks/old_opencl/bfs/Makefile
+++ b/benchmarks/old_opencl/bfs/Makefile
@@ -0,0 +1,68 @@
+# Tool and library locations. Each can be overridden from the environment or
+# the make command line (?=); $(wildcard ...) yields an empty value when the
+# default path does not exist.
+RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
+POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
+POCL_INC_PATH ?= $(wildcard ../include)
+POCL_LIB_PATH ?= $(wildcard ../lib)
+VX_RT_PATH ?= $(wildcard ../../../runtime)
+VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
+
+# RISC-V cross toolchain binaries.
+CC  = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
+CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
+DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
+HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
+GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
+
+# Runtime sources compiled into the .elf together with $(SRCS).
+VX_SRCS =  $(VX_RT_PATH)/newlib/newlib.c
+VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
+VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
+VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
+VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
+VX_SRCS += $(VX_RT_PATH)/tests/tests.c
+VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
+# None of the variables below is defined in this Makefile; they expand to
+# nothing unless set on the command line or in the environment.
+VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
+
+# Link flags for the .elf: no default startup files, static link, custom
+# linker script.
+VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
+
+CXXFLAGS =  -g -O0 -march=rv32im -mabi=ilp32 
+CXXFLAGS += -ffreestanding # program may not begin at main()
+CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
+CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI; trapping instructions must not throw (ordinary C++ exceptions stay enabled)
+CXXFLAGS += -I$(POCL_INC_PATH)
+
+# lib$(PROJECT).a (the compiled kernels) is linked as a whole archive.
+# $(PROJECT) is defined further down; that works because these are
+# recursively expanded (=) variables.
+VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
+QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
+
+# Base name of every generated artifact.
+PROJECT = bfs
+
+# Host-side sources of the benchmark.
+SRCS = main.cc
+
+# These targets name actions, not files: declare them phony so that a stray
+# file called e.g. "clean" or "run" cannot make them appear up to date.
+.PHONY: all run qemu gdb-s gdb-c clean
+
+# Default target: disassembly listing and Intel-HEX image.
+all: $(PROJECT).dump $(PROJECT).hex
+
+# Compile the OpenCL kernels into a static library with poclcc.
+lib$(PROJECT).a: kernel.cl
+	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
+
+# Executable linked with the runtime sources and the custom linker script.
+$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
+	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
+
+# Executable for qemu-riscv32 (uses the qemu variant of vx_api.c).
+$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
+	$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
+
+$(PROJECT).hex: $(PROJECT).elf
+	$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
+
+$(PROJECT).dump: $(PROJECT).elf
+	$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
+
+# Run the HEX image on the simX emulator; stdout goes to emulator.debug.
+run: $(PROJECT).hex
+	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
+
+# Run under qemu with syscall tracing; the instruction log goes to debug.log.
+qemu: $(PROJECT).qemu
+	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -strace -d in_asm -D debug.log $(PROJECT).qemu
+
+# gdb-s starts qemu as a gdb server on port 1234; gdb-c starts the client.
+gdb-s: $(PROJECT).qemu
+	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
+
+gdb-c: $(PROJECT).qemu
+	$(GDB) $(PROJECT).qemu
+
+# Note: lib$(PROJECT).a is not removed by clean.
+clean:
+	rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
--- a/benchmarks/old_opencl/bfs/README
+++ b/benchmarks/old_opencl/bfs/README
--- a/benchmarks/old_opencl/bfs/graph4096.txt
+++ b/benchmarks/old_opencl/bfs/graph4096.txt
--- a/benchmarks/old_opencl/bfs/kernel.cl
+++ b/benchmarks/old_opencl/bfs/kernel.cl
@@ -0,0 +1,53 @@
+/* ============================================================
+//--cambine: kernel function of Breadth-First-Search
+//--author:	created by Jianbin Fang
+//--date:	06/12/2010
+============================================================ */
+
+//#pragma OPENCL EXTENSION cl_khr_byte_addressable_store: enable
+
+//Structure to hold a node information
+//The node's outgoing edges are the entries
+//[starting, starting + no_of_edges) of the edge array passed to BFS_1.
+typedef struct{
+	int starting;     // index of the node's first entry in the edge array
+	int no_of_edges;  // number of consecutive edge entries owned by the node
+} Node;
+
+//--BFS_1 (7 parameters): expand the current frontier.
+//  One work-item per node. A work-item whose node is flagged in g_graph_mask
+//  clears that flag, then for every neighbour that is not yet visited writes
+//  cost = own cost + 1 and flags the neighbour in g_updating_graph_mask.
+__kernel void BFS_1( const __global Node* g_graph_nodes,
+					const __global int* g_graph_edges, 
+					__global char* g_graph_mask, 
+					__global char* g_updating_graph_mask, 
+					__global char* g_graph_visited, 
+					__global int* g_cost, 
+					const  int no_of_nodes){
+	const int tid = get_global_id(0);
+
+	// Work-items past the end of the graph, or not on the frontier, do nothing.
+	if( tid>=no_of_nodes )
+		return;
+	if( !g_graph_mask[tid] )
+		return;
+
+	g_graph_mask[tid]=false;
+
+	// The node table is read-only, so the edge range can be read once.
+	const int first_edge = g_graph_nodes[tid].starting;
+	const int edge_end   = g_graph_nodes[tid].no_of_edges + first_edge;
+
+	for(int e=first_edge; e<edge_end; e++){
+		const int neighbour = g_graph_edges[e];
+		if( g_graph_visited[neighbour] )
+			continue;
+		g_cost[neighbour]=g_cost[tid]+1;
+		g_updating_graph_mask[neighbour]=true;
+	}
+}
+
+//--BFS_2 (5 parameters): commit the nodes discovered by BFS_1.
+//  One work-item per node. A node flagged in g_updating_graph_mask becomes
+//  part of the next frontier, is marked visited, and *g_over is set so the
+//  host knows another iteration is required.
+__kernel void BFS_2(__global char* g_graph_mask, 
+					__global char* g_updating_graph_mask, 
+					__global char* g_graph_visited, 
+					__global char* g_over,
+					const  int no_of_nodes
+					) {
+	const int tid = get_global_id(0);
+
+	// Nothing to do for padding work-items or nodes that were not discovered.
+	if( tid>=no_of_nodes )
+		return;
+	if( !g_updating_graph_mask[tid] )
+		return;
+
+	g_graph_mask[tid]=true;
+	g_graph_visited[tid]=true;
+	*g_over=true;
+	g_updating_graph_mask[tid]=false;
+}
+
+
--- a/benchmarks/old_opencl/bfs/libbfs.a
+++ b/benchmarks/old_opencl/bfs/libbfs.a
--- a/benchmarks/old_opencl/bfs/main.cc
+++ b/benchmarks/old_opencl/bfs/main.cc
@@ -0,0 +1,297 @@
+//--by Jianbin Fang
+
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+
+#ifdef PROFILING
+#include "timer.h"
+#endif
+
+#include "CLHelper.h"
+#include "util.h"
+
+#define MAX_THREADS_PER_BLOCK 256
+
+// Structure to hold a node information.
+// A node's outgoing edges are the entries
+// [starting, starting + no_of_edges) of the edge array.
+struct Node {
+  int starting;    // index of the node's first entry in the edge array
+  int no_of_edges; // number of consecutive edge entries owned by the node
+};
+
+//----------------------------------------------------------
+//--reference BFS on the CPU
+//--programmer:	jianbin
+//--date:	26/01/2011
+//
+// Level-synchronous BFS. Each round has two passes over all nodes:
+//   1. every node flagged in h_graph_mask is unflagged, and each of its
+//      unvisited neighbours gets cost = own cost + 1 and is flagged in
+//      h_updating_graph_mask;
+//   2. every node flagged in h_updating_graph_mask is moved into
+//      h_graph_mask and marked visited.
+// Rounds repeat until pass 2 moves no node. The masks, the visited array and
+// h_cost_ref are updated in place. edge_list_size is not used.
+//----------------------------------------------------------
+void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
+                 int *h_graph_edges, char *h_graph_mask,
+                 char *h_updating_graph_mask, char *h_graph_visited,
+                 int *h_cost_ref) {
+  (void)edge_list_size;
+
+  // Starts true so that at least one round is executed.
+  bool frontier_grew = true;
+  while (frontier_grew) {
+    frontier_grew = false;
+
+    // Pass 1: expand the current frontier.
+    for (int node = 0; node < no_of_nodes; node++) {
+      if (h_graph_mask[node] != true)
+        continue;
+      h_graph_mask[node] = false;
+
+      const int first_edge = h_graph_nodes[node].starting;
+      const int edge_end = h_graph_nodes[node].no_of_edges + first_edge;
+      for (int e = first_edge; e < edge_end; e++) {
+        const int neighbour = h_graph_edges[e];
+        if (h_graph_visited[neighbour])
+          continue;
+        h_cost_ref[neighbour] = h_cost_ref[node] + 1;
+        h_updating_graph_mask[neighbour] = true;
+      }
+    }
+
+    // Pass 2: promote the newly discovered nodes to the next frontier.
+    for (int node = 0; node < no_of_nodes; node++) {
+      if (h_updating_graph_mask[node] != true)
+        continue;
+      h_graph_mask[node] = true;
+      h_graph_visited[node] = true;
+      frontier_grew = true;
+      h_updating_graph_mask[node] = false;
+    }
+  }
+}
+//----------------------------------------------------------
+//--breadth first search on GPUs
+//----------------------------------------------------------
+
+// Releases one device buffer if it is allocated. The handle is cleared
+// before _clFree() is called, so a later cleanup pass can never release the
+// same buffer twice, even if _clFree() throws.
+static void release_bfs_buffer(cl_mem &buffer) {
+  cl_mem doomed = buffer;
+  buffer = NULL;
+  if (doomed)
+    _clFree(doomed);
+}
+
+// Runs the BFS on the OpenCL device.
+//
+// Uploads the graph, the three per-node flag arrays and the cost array,
+// then alternates kernel 0 (BFS_1) and kernel 1 (BFS_2) until BFS_2 stops
+// raising the "over" flag, and finally copies the cost array back to h_cost.
+//
+// Throws std::string (prefixed with this function's name) when one of the
+// _cl* helpers reports an error. The dynamic exception specification of the
+// original was dropped: it is ill-formed since C++17 and callers are
+// unaffected.
+void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
+                 int *h_graph_edges, char *h_graph_mask,
+                 char *h_updating_graph_mask, char *h_graph_visited,
+                 int *h_cost) {
+
+  // Initialised because its address is handed to _clMallocRW() below.
+  char h_over = false;
+
+  // All handles start out NULL so the error path only releases what was
+  // actually allocated.
+  cl_mem d_graph_nodes = NULL;
+  cl_mem d_graph_edges = NULL;
+  cl_mem d_graph_mask = NULL;
+  cl_mem d_updating_graph_mask = NULL;
+  cl_mem d_graph_visited = NULL;
+  cl_mem d_cost = NULL;
+  cl_mem d_over = NULL;
+  try {
+    //--1 transfer data from host to device
+    _clInit();
+    d_graph_nodes = _clMalloc(no_of_nodes * sizeof(Node), h_graph_nodes);
+    d_graph_edges = _clMalloc(edge_list_size * sizeof(int), h_graph_edges);
+    d_graph_mask = _clMallocRW(no_of_nodes * sizeof(char), h_graph_mask);
+    d_updating_graph_mask =
+        _clMallocRW(no_of_nodes * sizeof(char), h_updating_graph_mask);
+    d_graph_visited = _clMallocRW(no_of_nodes * sizeof(char), h_graph_visited);
+
+    d_cost = _clMallocRW(no_of_nodes * sizeof(int), h_cost);
+    d_over = _clMallocRW(sizeof(char), &h_over);
+
+    _clMemcpyH2D(d_graph_nodes, no_of_nodes * sizeof(Node), h_graph_nodes);
+    _clMemcpyH2D(d_graph_edges, edge_list_size * sizeof(int), h_graph_edges);
+    _clMemcpyH2D(d_graph_mask, no_of_nodes * sizeof(char), h_graph_mask);
+    _clMemcpyH2D(d_updating_graph_mask, no_of_nodes * sizeof(char),
+                 h_updating_graph_mask);
+    _clMemcpyH2D(d_graph_visited, no_of_nodes * sizeof(char), h_graph_visited);
+    _clMemcpyH2D(d_cost, no_of_nodes * sizeof(int), h_cost);
+
+//--2 invoke kernel
+#ifdef PROFILING
+    timer kernel_timer;
+    double kernel_time = 0.0;
+    kernel_timer.reset();
+    kernel_timer.start();
+#endif
+    do {
+      // Cleared before every round; BFS_2 sets it when it promotes a node.
+      h_over = false;
+      _clMemcpyH2D(d_over, sizeof(char), &h_over);
+      //--kernel 0
+      int kernel_id = 0;
+      int kernel_idx = 0;
+      _clSetArgs(kernel_id, kernel_idx++, d_graph_nodes);
+      _clSetArgs(kernel_id, kernel_idx++, d_graph_edges);
+      _clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
+      _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
+      _clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
+      _clSetArgs(kernel_id, kernel_idx++, d_cost);
+      _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
+
+      _clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
+
+      //--kernel 1
+      kernel_id = 1;
+      kernel_idx = 0;
+      _clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
+      _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
+      _clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
+      _clSetArgs(kernel_id, kernel_idx++, d_over);
+      _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
+
+      _clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
+
+      _clMemcpyD2H(d_over, sizeof(char), &h_over);
+    } while (h_over);
+
+    _clFinish();
+#ifdef PROFILING
+    kernel_timer.stop();
+    kernel_time = kernel_timer.getTimeInSeconds();
+#endif
+    //--3 transfer data from device to host
+    _clMemcpyD2H(d_cost, no_of_nodes * sizeof(int), h_cost);
+//--statistics
+#ifdef PROFILING
+    std::cout << "kernel time(s):" << kernel_time << std::endl;
+#endif
+    //--4 release cl resources.
+    release_bfs_buffer(d_graph_nodes);
+    release_bfs_buffer(d_graph_edges);
+    release_bfs_buffer(d_graph_mask);
+    release_bfs_buffer(d_updating_graph_mask);
+    release_bfs_buffer(d_graph_visited);
+    release_bfs_buffer(d_cost);
+    release_bfs_buffer(d_over);
+    _clRelease();
+  } catch (const std::string &msg) {
+    // Release whatever is still allocated, then rethrow with context.
+    release_bfs_buffer(d_graph_nodes);
+    release_bfs_buffer(d_graph_edges);
+    release_bfs_buffer(d_graph_mask);
+    release_bfs_buffer(d_updating_graph_mask);
+    release_bfs_buffer(d_graph_visited);
+    release_bfs_buffer(d_cost);
+    release_bfs_buffer(d_over);
+    _clRelease();
+    std::string e_str = "in run_bfs_gpu -> ";
+    e_str += msg;
+    throw(e_str);
+  }
+  return;
+}
+
+//----------------------------------------------------------
+//--cambine:	main function
+//--author:		created by Jianbin Fang
+//--date:		25/01/2011
+//
+// Reads the graph from "graph4096.txt", runs the BFS on the OpenCL device
+// and on the CPU starting from node 0, and compares the two cost arrays.
+// Command-line arguments are ignored. Always returns 0, as before.
+//
+// Graph file layout as consumed here: node count; one "start degree" pair
+// per node; a source node id (read but overridden with 0); edge count; one
+// "target cost" pair per edge (the cost is read and discarded).
+//----------------------------------------------------------
+int main(int argc, char *argv[]) {
+  printf("enter demo main\n");
+
+  int no_of_nodes = 0;
+  int edge_list_size = 0;
+  FILE *fp = NULL;
+  // Every host buffer starts out NULL so the single cleanup section at the
+  // end can free them unconditionally (free(NULL) is a no-op).
+  Node *h_graph_nodes = NULL;
+  char *h_graph_mask = NULL;
+  char *h_updating_graph_mask = NULL;
+  char *h_graph_visited = NULL;
+  int *h_graph_edges = NULL;
+  int *h_cost = NULL;
+  int *h_cost_ref = NULL;
+
+  try {
+    const char *input_f = "graph4096.txt";
+    printf("Reading File\n");
+    // Read in Graph from a file
+    fp = fopen(input_f, "r");
+    if (!fp) {
+      printf("Error Reading graph file\n");
+      return 0;
+    }
+
+    printf("Reading File completed!\n");
+
+    int source = 0;
+
+    if (fscanf(fp, "%d", &no_of_nodes) != 1 || no_of_nodes <= 0)
+      throw std::string("invalid node count in graph file");
+
+    // One work-item per node, capped at MAX_THREADS_PER_BLOCK per group.
+    work_group_size = (no_of_nodes > MAX_THREADS_PER_BLOCK)
+                          ? MAX_THREADS_PER_BLOCK
+                          : no_of_nodes;
+
+    // allocate host memory
+    h_graph_nodes = (Node *)malloc(sizeof(Node) * no_of_nodes);
+    h_graph_mask = (char *)malloc(sizeof(char) * no_of_nodes);
+    h_updating_graph_mask = (char *)malloc(sizeof(char) * no_of_nodes);
+    h_graph_visited = (char *)malloc(sizeof(char) * no_of_nodes);
+    if (!h_graph_nodes || !h_graph_mask || !h_updating_graph_mask ||
+        !h_graph_visited)
+      throw std::string("out of host memory for node arrays");
+
+    int start, edgeno;
+    // initialize the memory
+    for (int i = 0; i < no_of_nodes; i++) {
+      if (fscanf(fp, "%d %d", &start, &edgeno) != 2)
+        throw std::string("truncated node table in graph file");
+      h_graph_nodes[i].starting = start;
+      h_graph_nodes[i].no_of_edges = edgeno;
+      h_graph_mask[i] = false;
+      h_updating_graph_mask[i] = false;
+      h_graph_visited[i] = false;
+    }
+    // The file's source node is read to advance the stream, but the search
+    // always starts from node 0 (unchanged from the original behaviour).
+    if (fscanf(fp, "%d", &source) != 1)
+      throw std::string("missing source node in graph file");
+    source = 0;
+    // set the source node as true in the mask
+    h_graph_mask[source] = true;
+    h_graph_visited[source] = true;
+
+    if (fscanf(fp, "%d", &edge_list_size) != 1 || edge_list_size < 0)
+      throw std::string("invalid edge count in graph file");
+
+    // Every node's edge range must lie inside the edge list, otherwise both
+    // BFS implementations would read past the end of h_graph_edges.
+    for (int i = 0; i < no_of_nodes; i++) {
+      const int first = h_graph_nodes[i].starting;
+      const int degree = h_graph_nodes[i].no_of_edges;
+      if (first < 0 || degree < 0 || first > edge_list_size ||
+          degree > edge_list_size - first)
+        throw std::string("node edge range exceeds the edge list");
+    }
+
+    int id, cost;
+    h_graph_edges = (int *)malloc(sizeof(int) * edge_list_size);
+    if (edge_list_size > 0 && !h_graph_edges)
+      throw std::string("out of host memory for edge list");
+    for (int i = 0; i < edge_list_size; i++) {
+      if (fscanf(fp, "%d", &id) != 1 || fscanf(fp, "%d", &cost) != 1)
+        throw std::string("truncated edge list in graph file");
+      // Edge targets index the per-node arrays, so they must be valid ids.
+      if (id < 0 || id >= no_of_nodes)
+        throw std::string("edge target out of range in graph file");
+      h_graph_edges[i] = id;
+    }
+
+    fclose(fp);
+    fp = NULL;
+
+    // allocate mem for the result on host side
+    h_cost = (int *)malloc(sizeof(int) * no_of_nodes);
+    h_cost_ref = (int *)malloc(sizeof(int) * no_of_nodes);
+    if (!h_cost || !h_cost_ref)
+      throw std::string("out of host memory for cost arrays");
+    for (int i = 0; i < no_of_nodes; i++) {
+      h_cost[i] = -1;
+      h_cost_ref[i] = -1;
+    }
+    h_cost[source] = 0;
+    h_cost_ref[source] = 0;
+    //---------------------------------------------------------
+    //--gpu entry
+    run_bfs_gpu(no_of_nodes, h_graph_nodes, edge_list_size, h_graph_edges,
+                h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost);
+    //---------------------------------------------------------
+    //--cpu entry
+    // initialize the memory again
+    for (int i = 0; i < no_of_nodes; i++) {
+      h_graph_mask[i] = false;
+      h_updating_graph_mask[i] = false;
+      h_graph_visited[i] = false;
+    }
+    // set the source node as true in the mask
+    source = 0;
+    h_graph_mask[source] = true;
+    h_graph_visited[source] = true;
+    run_bfs_cpu(no_of_nodes, h_graph_nodes, edge_list_size, h_graph_edges,
+                h_graph_mask, h_updating_graph_mask, h_graph_visited,
+                h_cost_ref);
+    //---------------------------------------------------------
+    //--result verification
+    compare_results<int>(h_cost_ref, h_cost, no_of_nodes);
+  } catch (const std::string &msg) {
+    std::cout << "--cambine: exception in main ->" << msg << std::endl;
+  }
+
+  // Single cleanup path for both the normal and the exception case.
+  if (fp)
+    fclose(fp);
+  free(h_graph_nodes);
+  free(h_graph_mask);
+  free(h_updating_graph_mask);
+  free(h_graph_visited);
+  free(h_graph_edges);
+  free(h_cost);
+  free(h_cost_ref);
+
+  return 0;
+}
--- a/benchmarks/old_opencl/bfs/run
+++ b/benchmarks/old_opencl/bfs/run
@@ -0,0 +1 @@
+./bfs ../../data/bfs/graph1MW_6.txt
--- a/benchmarks/old_opencl/bfs/timer.cc
+++ b/benchmarks/old_opencl/bfs/timer.cc
@@ -0,0 +1,78 @@
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+
+#include "timer.h"
+
+
+using namespace std;
+
+// Clock frequency used to convert time-stamp-counter ticks to time; filled in
+// once during static initialisation. 0 means "could not be determined".
+double timer::CPU_speed_in_MHz = timer::get_CPU_speed_in_MHz();
+
+
+// Returns the CPU frequency in MHz taken from the first "cpu MHz" line of
+// /proc/cpuinfo on Linux. Returns 0.0 on other platforms or when no such
+// line is found.
+double timer::get_CPU_speed_in_MHz()
+{
+#if defined __linux__
+    ifstream infile("/proc/cpuinfo");
+    char     buffer[256], *colon;
+
+    while (infile.good()) {
+        infile.getline(buffer, 256);
+
+        // Parse from the character right after the ':'. atof() skips leading
+        // white space itself, and colon + 1 is at worst the terminating NUL,
+        // whereas colon + 2 could point past the end of the string.
+        if (strncmp("cpu MHz", buffer, 7) == 0 && (colon = strchr(buffer, ':')) != 0)
+            return atof(colon + 1);
+    }
+#endif
+
+    return 0.0;
+}
+
+
+// Writes "<which> = <value><unit>" to str.
+// `time` is a tick count; it is converted with CPU_speed_in_MHz and then
+// scaled by factors of 1000 into the largest fitting unit (ns up to ks).
+// The caller must make sure CPU_speed_in_MHz is non-zero.
+void timer::print_time(ostream &str, const char *which, double time) const
+{
+    static const char *const suffixes[] = { " ns", " us", " ms", "  s", " ks" };
+    const int last_suffix = static_cast<int>(sizeof(suffixes) / sizeof(suffixes[0])) - 1;
+
+    double scaled = 1000.0 * time / CPU_speed_in_MHz;
+    int    chosen = 0;
+
+    // Move to the next larger unit while the value would print as >= 1000.
+    while (scaled >= 999.5 && chosen < last_suffix) {
+        scaled /= 1000.0;
+        ++chosen;
+    }
+
+    str << which << " = " << setprecision(3) << setw(4) << scaled << suffixes[chosen];
+}
+
+
+// Prints a one-line summary of this timer to str and returns str.
+// The line starts with the timer's name ("timer" when unnamed), followed by
+// either an error note, "not used", or the average, total and count.
+ostream &timer::print(ostream &str)
+{
+    const char *label = name;
+    if (label == 0)
+        label = "timer";
+
+    str << left << setw(25) << label << ": " << right;
+
+    // Without a known clock frequency ticks cannot be converted to time.
+    if (CPU_speed_in_MHz == 0) {
+        str << "could not determine CPU speed\n";
+        return str;
+    }
+
+    // count is unsigned, so "not greater than zero" means exactly zero.
+    if (count == 0) {
+        str << "not used\n";
+        return str;
+    }
+
+    const double ticks = static_cast<double>(total_time);
+
+    print_time(str, "avg", ticks / static_cast<double>(count));
+    print_time(str, ", total", ticks);
+    str << ", count = " << setw(9) << count << '\n';
+
+    return str;
+}
+
+
+// Stream insertion for timers: forwards to timer::print() and returns the
+// stream it returns.
+ostream &operator << (ostream &str, class timer &timer)
+{
+    return timer.print(str);
+}
+
+// Returns the accumulated tick count converted to seconds:
+// ticks / 1e6 / (frequency in MHz). No check is made for a zero
+// CPU_speed_in_MHz.
+double timer::getTimeInSeconds()
+{
+    const double mega_ticks = static_cast<double>(total_time) / 1000000.0;
+    return mega_ticks / CPU_speed_in_MHz;
+}
--- a/benchmarks/old_opencl/bfs/timer.h
+++ b/benchmarks/old_opencl/bfs/timer.h
@@ -0,0 +1,101 @@
+#ifndef timer_h
+#define timer_h
+
+#include <iostream>
+
+// Accumulating stopwatch. On the x86 targets handled by start()/stop() it
+// counts time-stamp-counter ticks: start() subtracts the current counter
+// from the running total and stop() adds it back, so the total grows by the
+// elapsed ticks of every start()/stop() pair.
+class timer {
+public:
+  // name is used as the label by print(); a null name prints as "timer".
+  timer(const char *name = 0);
+  // As above, and additionally prints the summary to write_on_exit when the
+  // timer is destroyed.
+  timer(const char *name, std::ostream &write_on_exit);
+
+  ~timer();
+
+  void start(), stop();
+  // Clears the accumulated total and the interval count.
+  void reset();
+  // Writes a one-line summary (average, total, count) and returns the stream.
+  std::ostream &print(std::ostream &);
+
+  // Accumulated total converted to seconds using CPU_speed_in_MHz.
+  double getTimeInSeconds();
+
+private:
+  void print_time(std::ostream &, const char *which, double time) const;
+
+  // The 64-bit total is overlaid with two ints so the inline assembly in
+  // start()/stop() can update it with 32-bit add/subtract-with-carry. The
+  // member order is swapped on __PPC__.
+  // NOTE(review): presumably to follow that target's byte order - confirm.
+  union {
+    long long total_time;
+    struct {
+#if defined __PPC__
+      int high, low;
+#else
+      int low, high;
+#endif
+    };
+  };
+
+  unsigned long long count;          // incremented by every stop()
+  const char *const name;            // label for print(); may be null
+  std::ostream *const write_on_exit; // destructor prints here when non-null
+
+  // Frequency used to convert ticks to time, and the function that computes
+  // its initial value.
+  static double CPU_speed_in_MHz, get_CPU_speed_in_MHz();
+};
+
+std::ostream &operator<<(std::ostream &, class timer &);
+
+// Puts the timer back into its unused state: no intervals, zero total.
+inline void timer::reset() {
+  count = 0;
+  total_time = 0;
+}
+
+// Creates an unused timer labelled `name` that does not print on
+// destruction.
+inline timer::timer(const char *name) : name(name), write_on_exit(0) {
+  // Same effect as reset(): start with zero total and zero count.
+  total_time = 0;
+  count = 0;
+}
+
+// Creates an unused timer labelled `name` whose summary is printed to
+// `write_on_exit` when the timer is destroyed.
+inline timer::timer(const char *name, std::ostream &write_on_exit)
+    : name(name), write_on_exit(&write_on_exit) {
+  // Same effect as reset(): start with zero total and zero count.
+  total_time = 0;
+  count = 0;
+}
+
+// Prints the summary to the stream given at construction, if there was one.
+inline timer::~timer() {
+  if (write_on_exit == 0)
+    return;
+  print(*write_on_exit);
+}
+
+// Begins a timed interval by subtracting the current time-stamp counter
+// from the running total; the matching stop() adds the counter back.
+// Only x86/x86-64 with PathScale, GCC-compatible or Intel compilers is
+// supported; any other target fails to compile.
+inline void timer::start() {
+#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
+  unsigned eax, edx;
+
+  // rdtsc delivers the counter in edx (upper half) and eax (lower half).
+  asm volatile("rdtsc" : "=a"(eax), "=d"(edx));
+
+  total_time -= ((unsigned long long)edx << 32) + eax;
+#elif (defined __GNUC__ || defined __INTEL_COMPILER) &&                        \
+    (defined __i386 || defined __x86_64)
+  // 64-bit subtraction done in two 32-bit steps directly on the halves of
+  // total_time: subl on the low word, sbbl (subtract with borrow) on the
+  // high word.
+  asm volatile("rdtsc\n\t"
+               "subl %%eax, %0\n\t"
+               "sbbl %%edx, %1"
+               : "+m"(low), "+m"(high)
+               :
+               : "eax", "edx");
+#else
+#error Compiler/Architecture not recognized
+#endif
+}
+
+// Ends a timed interval by adding the current time-stamp counter to the
+// running total, then counts the interval.
+// NOTE(review): unlike start(), there is no #else/#error branch here; on an
+// unrecognised target only the count would be updated.
+inline void timer::stop() {
+#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
+  unsigned eax, edx;
+
+  // rdtsc delivers the counter in edx (upper half) and eax (lower half).
+  asm volatile("rdtsc" : "=a"(eax), "=d"(edx));
+
+  total_time += ((unsigned long long)edx << 32) + eax;
+#elif (defined __GNUC__ || defined __INTEL_COMPILER) &&                        \
+    (defined __i386 || defined __x86_64)
+  // 64-bit addition done in two 32-bit steps directly on the halves of
+  // total_time: addl on the low word, adcl (add with carry) on the high word.
+  asm volatile("rdtsc\n\t"
+               "addl %%eax, %0\n\t"
+               "adcl %%edx, %1"
+               : "+m"(low), "+m"(high)
+               :
+               : "eax", "edx");
+#endif
+
+  ++count;
+}
+
+#endif
--- a/benchmarks/old_opencl/bfs/util.h
+++ b/benchmarks/old_opencl/bfs/util.h
@@ -0,0 +1,72 @@
+#ifndef _C_UTIL_
+#define _C_UTIL_
+#include <math.h>
+#include <iostream>
+
+//-------------------------------------------------------------------
+//--initialize array with maximum limit
+//  Fills A[0..n) with pseudo-random values drawn with rand() and scaled
+//  into [0, maxi).
+//-------------------------------------------------------------------
+template<typename datatype>
+void fill(datatype *A, const int n, const datatype maxi){
+    for (int j = 0; j < n; j++) 
+    {
+        // The divisor is computed in double on purpose. In float,
+        // RAND_MAX + 1.0f cannot be represented exactly when RAND_MAX is
+        // 2^31 - 1, and large rand() values round up to the same float, so
+        // the fraction could reach 1.0 and produce maxi itself.
+        const double fraction = rand() / (RAND_MAX + 1.0);
+        A[j] = (datatype)(maxi * fraction);
+    }
+}
+
+//--print matrix
+//  Prints the row-major height x width matrix A to std::cout, one row per
+//  line, each element followed by a single space.
+template<typename datatype>
+void print_matrix(datatype *A, int height, int width){
+	int next = 0;	// linear index of the next element, equals row*width + col
+	for(int row=0; row<height; row++){
+		for(int col=0; col<width; col++, next++){
+			std::cout<<A[next]<<" ";
+		}
+		std::cout<<std::endl;
+	}
+}
+//-------------------------------------------------------------------
+//--verify results
+//  Compares two arrays element by element with a relative tolerance and
+//  prints a pass/fail line to std::cout.
+//-------------------------------------------------------------------
+#define MAX_RELATIVE_ERROR  .002
+template<typename datatype>
+void verify_array(const datatype *cpuResults, const datatype *gpuResults, const int size){
+
+    bool passed = true; 
+    // Logical-AND reduction: each thread works on its own copy of the flag,
+    // so there is no unsynchronised write to a shared variable.
+#pragma omp parallel for reduction(&&:passed)
+    for (int i=0; i<size; i++){
+      const double expected = static_cast<double>(cpuResults[i]);
+      const double actual   = static_cast<double>(gpuResults[i]);
+      // |actual - expected| <= tol * |expected| is the division-free form of
+      // the relative-error test. It is well defined when expected is zero
+      // and does not change sign for negative references. Written with !(<=)
+      // so that a NaN on either side counts as a failure.
+      if (!(fabs(actual - expected) <= MAX_RELATIVE_ERROR * fabs(expected))){
+         passed = false; 
+      }
+    }
+    if (passed){
+        std::cout << "--cambine:passed:-)" << std::endl;
+    }
+    else{
+        std::cout << "--cambine: failed:-(" << std::endl;
+    }
+    return ;
+}
+// Exact element-wise comparison of two arrays of `size` elements; prints a
+// pass/fail line to std::cout.
+template<typename datatype>
+void compare_results(const datatype *cpu_results, const datatype *gpu_results, const int size){
+
+    bool passed = true; 
+    for (int i=0; i<size; i++){
+      if (cpu_results[i]!=gpu_results[i]){
+         passed = false; 
+         break;   // one mismatch decides the outcome
+      }
+    }
+    // std::endl is fully qualified so this header does not depend on a
+    // using-declaration made by whichever header happens to be included first.
+    if (passed){
+        std::cout << "--cambine:passed:-)" << std::endl;
+    }
+    else{
+        std::cout << "--cambine: failed:-(" << std::endl;
+    }
+    return ;
+}
+
+#endif
+