Files
kernels/tests/opencl/kmeans/main.cc
2021-06-13 17:42:04 -07:00

383 lines
11 KiB
C++
Executable File

#include "kmeans.h"
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#ifdef WIN
#include <windows.h>
#else
#include <pthread.h>
#include <sys/time.h>
double gettime() {
struct timeval t;
gettimeofday(&t, NULL);
return t.tv_sec + t.tv_usec * 1e-6;
}
#endif
#ifdef NV
#include <oclUtils.h>
#else
#include <CL/cl.h>
#endif
#ifndef FLT_MAX
#define FLT_MAX 3.40282347e+38
#endif
#ifdef RD_WG_SIZE_0_0
#define BLOCK_SIZE RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define BLOCK_SIZE RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE RD_WG_SIZE
#else
#define BLOCK_SIZE 256
#endif
#ifdef RD_WG_SIZE_1_0
#define BLOCK_SIZE2 RD_WG_SIZE_1_0
#elif defined(RD_WG_SIZE_1)
#define BLOCK_SIZE2 RD_WG_SIZE_1
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE2 RD_WG_SIZE
#else
#define BLOCK_SIZE2 256
#endif
// local variables
static cl_context context;
static cl_command_queue cmd_queue;
static cl_device_type device_type;
static cl_device_id *device_list;
static cl_int num_devices;
static int initialize(int use_gpu) {
cl_int result;
size_t size;
/*// create OpenCL context
cl_platform_id platform_id;
if (clGetPlatformIDs(1, &platform_id, NULL) != CL_SUCCESS) {
printf("ERROR: clGetPlatformIDs(1,*,0) failed\n");
return -1;
}
cl_context_properties ctxprop[] = {CL_CONTEXT_PLATFORM,
(cl_context_properties)platform_id, 0};
device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
context = clCreateContextFromType(ctxprop, device_type, NULL, NULL, NULL);
if (!context) {
printf("ERROR: clCreateContextFromType(%s) failed\n",
use_gpu ? "GPU" : "CPU");
return -1;
}
// get the list of GPUs
result = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size);
num_devices = (int)(size / sizeof(cl_device_id));
if (result != CL_SUCCESS || num_devices < 1) {
printf("ERROR: clGetContextInfo() failed\n");
return -1;
}
device_list = new cl_device_id[num_devices];
if (!device_list) {
printf("ERROR: new cl_device_id[] failed\n");
return -1;
}
result =
clGetContextInfo(context, CL_CONTEXT_DEVICES, size, device_list, NULL);
if (result != CL_SUCCESS) {
printf("ERROR: clGetContextInfo() failed\n");
return -1;
}*/
cl_platform_id platform_id;
num_devices = 1;
device_list = new cl_device_id[num_devices];
result = clGetPlatformIDs(1, &platform_id, NULL);
result = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, device_list, NULL);
context = clCreateContext(NULL, 1, device_list, NULL, NULL, &result);
// create command queue for the first device
cmd_queue = clCreateCommandQueue(context, device_list[0], 0, NULL);
if (!cmd_queue) {
printf("ERROR: clCreateCommandQueue() failed\n");
return -1;
}
return 0;
}
static int shutdown() {
// release resources
if (cmd_queue)
clReleaseCommandQueue(cmd_queue);
if (context)
clReleaseContext(context);
if (device_list)
delete device_list;
// reset all variables
cmd_queue = 0;
context = 0;
device_list = 0;
num_devices = 0;
device_type = 0;
return 0;
}
cl_mem d_feature;
cl_mem d_feature_swap;
cl_mem d_cluster;
cl_mem d_membership;
cl_kernel kernel;
cl_kernel kernel_s;
cl_kernel kernel2;
int *membership_OCL;
int *membership_d;
float *feature_d;
float *clusters_d;
float *center_d;
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
if (nullptr == filename || nullptr == data || 0 == size)
return -1;
FILE* fp = fopen(filename, "r");
if (NULL == fp) {
fprintf(stderr, "Failed to load kernel.");
return -1;
}
fseek(fp , 0 , SEEK_END);
long fsize = ftell(fp);
rewind(fp);
*data = (uint8_t*)malloc(fsize);
*size = fread(*data, 1, fsize, fp);
fclose(fp);
return 0;
}
int allocate(int n_points, int n_features, int n_clusters, float **feature) {
/*int sourcesize = 1024 * 1024;
char *source = (char *)calloc(sourcesize, sizeof(char));
if (!source) {
printf("ERROR: calloc(%d) failed\n", sourcesize);
return -1;
}
// read the kernel core source
char *tempchar = "./kmeans.cl";
FILE *fp = fopen(tempchar, "rb");
if (!fp) {
printf("ERROR: unable to open '%s'\n", tempchar);
return -1;
}
fread(source + strlen(source), sourcesize, 1, fp);
fclose(fp);*/
// OpenCL initialization
int use_gpu = 1;
if (initialize(use_gpu))
return -1;
// compile kernel
cl_int err = 0;
//const char *slist[2] = {source, 0};
//cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
cl_program prog = clCreateProgramWithBuiltInKernels(context, 1, device_list, "kmeans_kernel_c;kmeans_swap", &err);
if (err != CL_SUCCESS) {
printf("ERROR: clCreateProgramWithSource() => %d\n", err);
return -1;
}
err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
{ // show warnings/errors
// static char log[65536]; memset(log, 0, sizeof(log));
// cl_device_id device_id = 0;
// err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id),
//&device_id, NULL);
// clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG,
// sizeof(log)-1, log, NULL);
// if(err || strstr(log,"warning:") || strstr(log, "error:"))
// printf("<<<<\n%s\n>>>>\n", log);
}
if (err != CL_SUCCESS) {
printf("ERROR: clBuildProgram() => %d\n", err);
return -1;
}
char *kernel_kmeans_c = "kmeans_kernel_c";
char *kernel_swap = "kmeans_swap";
kernel_s = clCreateKernel(prog, kernel_kmeans_c, &err);
if (err != CL_SUCCESS) {
printf("ERROR: clCreateKernel() 0 => %d\n", err);
return -1;
}
kernel2 = clCreateKernel(prog, kernel_swap, &err);
if (err != CL_SUCCESS) {
printf("ERROR: clCreateKernel() 0 => %d\n", err);
return -1;
}
clReleaseProgram(prog);
d_feature = clCreateBuffer(context, CL_MEM_READ_WRITE,
n_points * n_features * sizeof(float), NULL, &err);
if (err != CL_SUCCESS) {
printf("ERROR: clCreateBuffer d_feature (size:%d) => %d\n",
n_points * n_features, err);
return -1;
}
d_feature_swap =
clCreateBuffer(context, CL_MEM_READ_WRITE,
n_points * n_features * sizeof(float), NULL, &err);
if (err != CL_SUCCESS) {
printf("ERROR: clCreateBuffer d_feature_swap (size:%d) => %d\n",
n_points * n_features, err);
return -1;
}
d_cluster =
clCreateBuffer(context, CL_MEM_READ_WRITE,
n_clusters * n_features * sizeof(float), NULL, &err);
if (err != CL_SUCCESS) {
printf("ERROR: clCreateBuffer d_cluster (size:%d) => %d\n",
n_clusters * n_features, err);
return -1;
}
d_membership = clCreateBuffer(context, CL_MEM_READ_WRITE,
n_points * sizeof(int), NULL, &err);
if (err != CL_SUCCESS) {
printf("ERROR: clCreateBuffer d_membership (size:%d) => %d\n", n_points,
err);
return -1;
}
// write buffers
err = clEnqueueWriteBuffer(cmd_queue, d_feature, 1, 0,
n_points * n_features * sizeof(float), feature[0],
0, 0, 0);
if (err != CL_SUCCESS) {
printf("ERROR: clEnqueueWriteBuffer d_feature (size:%d) => %d\n",
n_points * n_features, err);
return -1;
}
clSetKernelArg(kernel2, 0, sizeof(void *), (void *)&d_feature);
clSetKernelArg(kernel2, 1, sizeof(void *), (void *)&d_feature_swap);
clSetKernelArg(kernel2, 2, sizeof(cl_int), (void *)&n_points);
clSetKernelArg(kernel2, 3, sizeof(cl_int), (void *)&n_features);
size_t global_work[3] = {n_points, 1, 1};
/// Ke Wang adjustable local group size 2013/08/07 10:37:33
size_t local_work_size = BLOCK_SIZE; // work group size is defined by
// RD_WG_SIZE_0 or RD_WG_SIZE_0_0
// 2014/06/10 17:00:51
if (global_work[0] % local_work_size != 0)
global_work[0] = (global_work[0] / local_work_size + 1) * local_work_size;
err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 1, NULL, global_work,
&local_work_size, 0, 0, 0);
if (err != CL_SUCCESS) {
printf("ERROR: clEnqueueNDRangeKernel()=>%d failed\n", err);
return -1;
}
membership_OCL = (int *)malloc(n_points * sizeof(int));
}
void deallocateMemory() {
clReleaseMemObject(d_feature);
clReleaseMemObject(d_feature_swap);
clReleaseMemObject(d_cluster);
clReleaseMemObject(d_membership);
free(membership_OCL);
}
int main(int argc, char **argv) {
printf("WG size of kernel_swap = %d, WG size of kernel_kmeans = %d \n",
BLOCK_SIZE, BLOCK_SIZE2);
setup(argc, argv);
shutdown();
}
int kmeansOCL(float **feature, /* in: [npoints][nfeatures] */
int n_features, int n_points, int n_clusters, int *membership,
float **clusters, int *new_centers_len, float **new_centers) {
int delta = 0;
int i, j, k;
cl_int err = 0;
size_t global_work[3] = {n_points, 1, 1};
/// Ke Wang adjustable local group size 2013/08/07 10:37:33
size_t local_work_size = BLOCK_SIZE2; // work group size is defined by
// RD_WG_SIZE_1 or RD_WG_SIZE_1_0
// 2014/06/10 17:00:41
if (global_work[0] % local_work_size != 0)
global_work[0] = (global_work[0] / local_work_size + 1) * local_work_size;
err = clEnqueueWriteBuffer(cmd_queue, d_cluster, 1, 0,
n_clusters * n_features * sizeof(float),
clusters[0], 0, 0, 0);
if (err != CL_SUCCESS) {
printf("ERROR: clEnqueueWriteBuffer d_cluster (size:%d) => %d\n", n_points,
err);
return -1;
}
int size = 0;
int offset = 0;
clSetKernelArg(kernel_s, 0, sizeof(void *), (void *)&d_feature_swap);
clSetKernelArg(kernel_s, 1, sizeof(void *), (void *)&d_cluster);
clSetKernelArg(kernel_s, 2, sizeof(void *), (void *)&d_membership);
clSetKernelArg(kernel_s, 3, sizeof(cl_int), (void *)&n_points);
clSetKernelArg(kernel_s, 4, sizeof(cl_int), (void *)&n_clusters);
clSetKernelArg(kernel_s, 5, sizeof(cl_int), (void *)&n_features);
clSetKernelArg(kernel_s, 6, sizeof(cl_int), (void *)&offset);
clSetKernelArg(kernel_s, 7, sizeof(cl_int), (void *)&size);
err = clEnqueueNDRangeKernel(cmd_queue, kernel_s, 1, NULL, global_work,
&local_work_size, 0, 0, 0);
if (err != CL_SUCCESS) {
printf("ERROR: clEnqueueNDRangeKernel()=>%d failed\n", err);
return -1;
}
clFinish(cmd_queue);
err = clEnqueueReadBuffer(cmd_queue, d_membership, 1, 0,
n_points * sizeof(int), membership_OCL, 0, 0, 0);
if (err != CL_SUCCESS) {
printf("ERROR: Memcopy Out\n");
return -1;
}
delta = 0;
for (i = 0; i < n_points; i++) {
int cluster_id = membership_OCL[i];
new_centers_len[cluster_id]++;
if (membership_OCL[i] != membership[i]) {
delta++;
membership[i] = membership_OCL[i];
}
for (j = 0; j < n_features; j++) {
new_centers[cluster_id][j] += feature[i][j];
}
}
return delta;
}