From 61e3442ef80b5c7db66e436e08855529d9869af0 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 14 Nov 2023 22:31:30 -0800 Subject: [PATCH] adding opencl convolution benchmark --- tests/opencl/Makefile | 6 + tests/opencl/convolution/Makefile | 7 + tests/opencl/convolution/kernel.cl | 32 ++++ tests/opencl/convolution/main.cc | 258 +++++++++++++++++++++++++++++ tests/opencl/matmul/Makefile | 2 +- tests/opencl/matmul/kernel.cl | 59 +++---- tests/opencl/matmul/main.cc | 51 +++--- tests/opencl/oclprintf/main.cc | 2 +- tests/opencl/psort/main.cc | 3 +- tests/opencl/sgemm/common.h | 6 +- tests/opencl/sgemm/main.cc | 125 ++++++++------ tests/opencl/vecadd/main.cc | 4 +- tests/regression/demo/main.cpp | 46 ++--- tests/regression/tensor/Makefile | 2 +- tests/regression/tensor/kernel.cpp | 4 +- tests/regression/tensor/main.cpp | 53 +++--- 16 files changed, 490 insertions(+), 170 deletions(-) create mode 100644 tests/opencl/convolution/Makefile create mode 100644 tests/opencl/convolution/kernel.cl create mode 100644 tests/opencl/convolution/main.cc diff --git a/tests/opencl/Makefile b/tests/opencl/Makefile index 2cee5c5d..c838c3de 100644 --- a/tests/opencl/Makefile +++ b/tests/opencl/Makefile @@ -18,6 +18,7 @@ all: $(MAKE) -C oclprintf $(MAKE) -C blackscholes $(MAKE) -C matmul + $(MAKE) -C convolution run-simx: $(MAKE) -C vecadd run-simx @@ -37,6 +38,7 @@ run-simx: $(MAKE) -C blackscholes run-simx $(MAKE) -C matmul run-simx $(MAKE) -C transpose run-simx + $(MAKE) -C convolution run-simx # $(MAKE) -C vectorhypot run-simx # $(MAKE) -C mri-q run-simx @@ -58,6 +60,7 @@ run-rtlsim: $(MAKE) -C oclprintf run-rtlsim $(MAKE) -C blackscholes run-rtlsim $(MAKE) -C matmul run-rtlsim + $(MAKE) -C convolution run-rtlsim # $(MAKE) -C vectorhypot run-rtlsim # $(MAKE) -C mri-q run-rtlsim @@ -79,6 +82,7 @@ run-opae: $(MAKE) -C oclprintf run-opae $(MAKE) -C blackscholes run-opae $(MAKE) -C matmul run-opae + $(MAKE) -C convolution run-opae # $(MAKE) -C vectorhypot run-opae # $(MAKE) -C mri-q run-opae @@ -102,6 +106,7 @@ clean: $(MAKE) -C oclprintf clean $(MAKE) -C blackscholes clean $(MAKE) -C matmul clean + $(MAKE) -C convolution clean clean-all: $(MAKE) -C vecadd clean-all @@ -124,3 +129,4 @@ clean-all: $(MAKE) -C oclprintf clean-all $(MAKE) -C blackscholes clean-all $(MAKE) -C matmul clean-all + $(MAKE) -C convolution clean-all diff --git a/tests/opencl/convolution/Makefile b/tests/opencl/convolution/Makefile new file mode 100644 index 00000000..42a577d2 --- /dev/null +++ b/tests/opencl/convolution/Makefile @@ -0,0 +1,7 @@ +PROJECT = convolution + +SRCS = main.cc + +OPTS ?= -n32 + +include ../common.mk diff --git a/tests/opencl/convolution/kernel.cl b/tests/opencl/convolution/kernel.cl new file mode 100644 index 00000000..2ef31040 --- /dev/null +++ b/tests/opencl/convolution/kernel.cl @@ -0,0 +1,32 @@ +__kernel void conv3x3(__global float* output, + __global float* input, + __global float* weights, + const int width, + const int height) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + // Adjust for padded borders + int paddedWidth = width + 2; + int paddedX = x + 1; + int paddedY = y + 1; + + // Compute the convolution sum + float sum = 0.0f; + + sum += input[(paddedY - 1) * paddedWidth + (paddedX - 1)] * weights[0]; // Top-left + sum += input[(paddedY - 1) * paddedWidth + paddedX] * weights[1]; // Top-center + sum += input[(paddedY - 1) * paddedWidth + (paddedX + 1)] * weights[2]; // Top-right + + sum += input[paddedY * paddedWidth + (paddedX - 1)] * weights[3]; // Middle-left + sum += input[paddedY * paddedWidth + paddedX] * weights[4]; // Center + sum += input[paddedY * paddedWidth + (paddedX + 1)] * weights[5]; // Middle-right + + sum += input[(paddedY + 1) * paddedWidth + (paddedX - 1)] * weights[6]; // Bottom-left + sum += input[(paddedY + 1) * paddedWidth + paddedX] * weights[7]; // Bottom-center + sum += input[(paddedY + 1) * paddedWidth + (paddedX + 1)] * weights[8]; // Bottom-right + + // Store the result in the output array + output[y * width + x] = sum; +} \ No newline at end of file diff --git a/tests/opencl/convolution/main.cc b/tests/opencl/convolution/main.cc new file mode 100644 index 00000000..d7487c2f --- /dev/null +++ b/tests/opencl/convolution/main.cc @@ -0,0 +1,258 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define FLOAT_ULP 6 + +#define KERNEL_NAME "conv3x3" + +#define CL_CHECK(_expr) \ + do { \ + cl_int _err = _expr; \ + if (_err == CL_SUCCESS) \ + break; \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } while (0) + +#define CL_CHECK2(_expr) \ + ({ \ + cl_int _err = CL_INVALID_VALUE; \ + decltype(_expr) _ret = _expr; \ + if (_err != CL_SUCCESS) { \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } \ + _ret; \ + }) + +static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) { + if (nullptr == filename || nullptr == data || 0 == size) + return -1; + + FILE* fp = fopen(filename, "r"); + if (NULL == fp) { + fprintf(stderr, "Failed to load kernel."); + return -1; + } + + fseek(fp , 0 , SEEK_END); + long fsize = ftell(fp); + rewind(fp); + + *data = (uint8_t*)malloc(fsize); + *size = fread(*data, 1, fsize, fp); + + fclose(fp); + + return 0; +} + +static bool compare_equal(float a, float b) { + union fi_t { float f; int32_t i; }; + fi_t fa, fb; + fa.f = a; + fb.f = b; + auto d = std::abs(fa.i - fb.i); + return d <= FLOAT_ULP; +} + +static void convolution_cpu(float *O, float *I, float *W, int32_t width, int32_t height) { + int paddedWidth = width + 2; + for (int32_t y = 0; y < height; ++y) { + for (int32_t x = 0; x < width; ++x) { + int paddedY = y + 1; + int paddedX = x + 1; + float sum = 0.0f; + for (int32_t ky = -1; ky <= 1; ++ky) { + for (int32_t kx = -1; kx <= 1; ++kx) { + int32_t iy = paddedY + ky; + int32_t ix = paddedX + kx; + float value = I[iy * paddedWidth + ix]; + float weight = W[(ky + 1) * 3 + (kx + 1)]; + sum += value * weight; + } + } + O[y * width + x] = sum; + } + } +} + +cl_device_id device_id = NULL; +cl_context context = NULL; +cl_command_queue commandQueue = NULL; +cl_program program = NULL; +cl_kernel kernel = NULL; +cl_mem i_memobj = NULL; +cl_mem w_memobj = NULL; +cl_mem o_memobj = NULL; +uint8_t* kernel_bin = NULL; + +static void cleanup() { + if (commandQueue) clReleaseCommandQueue(commandQueue); + if (kernel) clReleaseKernel(kernel); + if (program) clReleaseProgram(program); + if (i_memobj) clReleaseMemObject(i_memobj); + if (w_memobj) clReleaseMemObject(w_memobj); + if (o_memobj) clReleaseMemObject(o_memobj); + if (context) clReleaseContext(context); + if (device_id) clReleaseDevice(device_id); + if (kernel_bin) free(kernel_bin); +} + +int size = 32; + +static void show_usage() { + printf("Usage: [-n size] [-h: help]\n"); +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:h?")) != -1) { + switch (c) { + case 'n': + size = atoi(optarg); + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } +} + +int main (int argc, char **argv) { + // parse command arguments + parse_args(argc, argv); + + printf("Matrix size=%d\n", size); + + uint32_t o_points = size * size; + uint32_t i_points = (size+2) * (size+2); + uint32_t w_points = 3 * 3; + + cl_platform_id platform_id; + size_t kernel_size; + + // Getting platform and device information + CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL)); + CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL)); + + printf("Create context\n"); + context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err)); + + char device_string[1024]; + clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL); + printf("Using device: %s\n", device_string); + + printf("Allocate device buffers\n"); + size_t i_nbytes = i_points * sizeof(float); + size_t w_nbytes = w_points * sizeof(float); + size_t o_nbytes = o_points * sizeof(float); + i_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, i_nbytes, NULL, &_err)); + w_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, w_nbytes, NULL, &_err)); + o_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, o_nbytes, NULL, &_err)); + + printf("Create program from kernel source\n"); +#ifdef HOSTGPU + if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size)) + return -1; + program = CL_CHECK2(clCreateProgramWithSource( + context, 1, (const char**)&kernel_bin, &kernel_size, &_err)); +#else + if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size)) + return -1; + program = CL_CHECK2(clCreateProgramWithBinary( + context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err)); +#endif + if (program == NULL) { + cleanup(); + return -1; + } + + // Build program + CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL)); + + // Create kernel + kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err)); + + size_t global_size[2] = {size, size}; + + // Set kernel arguments + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&o_memobj)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&i_memobj)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&w_memobj)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(uint32_t), &size)); + + // Allocate memories for input arrays and output arrays. + std::vector h_i(i_points); + std::vector h_w(w_points); + std::vector h_o(o_points, 0.0f); + + // Generate input values + for (int32_t y = -1; y < size+1; ++y) { + for (int32_t x = -1; x < size+1; ++x) { + if (x >= 0 && x < size && y >= 0 && y < size) { + h_i[(y+1) * (size+2) + (x+1)] = static_cast(rand()) / RAND_MAX; + } else { + h_i[(y+1) * (size+2) + (x+1)] = 0; + } + } + } + for (uint32_t i = 0; i < w_points; ++i) { + h_w[i] = static_cast(rand()) / RAND_MAX; + } + + // Creating command queue + commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err)); + + printf("Upload source buffers\n"); + CL_CHECK(clEnqueueWriteBuffer(commandQueue, i_memobj, CL_TRUE, 0, i_nbytes, h_i.data(), 0, NULL, NULL)); + CL_CHECK(clEnqueueWriteBuffer(commandQueue, w_memobj, CL_TRUE, 0, w_nbytes, h_w.data(), 0, NULL, NULL)); + + printf("Execute the kernel\n"); + auto time_start = std::chrono::high_resolution_clock::now(); + CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL)); + CL_CHECK(clFinish(commandQueue)); + auto time_end = std::chrono::high_resolution_clock::now(); + double elapsed = std::chrono::duration_cast(time_end - time_start).count(); + printf("Elapsed time: %lg ms\n", elapsed); + + printf("Download destination buffer\n"); + CL_CHECK(clEnqueueReadBuffer(commandQueue, o_memobj, CL_TRUE, 0, o_nbytes, h_o.data(), 0, NULL, NULL)); + + printf("Verify result\n"); + std::vector ref_vec(o_points); + convolution_cpu(ref_vec.data(), h_i.data(), h_w.data(), size, size); + int errors = 0; + for (uint32_t i = 0; i < o_points; ++i) { + if (!compare_equal(h_o[i], ref_vec[i])) { + if (errors < 100) + printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_o[i]); + ++errors; + } + } + if (errors != 0) { + printf("FAILED! - %d errors\n", errors); + } else { + printf("PASSED!\n"); + } + + // Clean up + cleanup(); + + return errors; +} diff --git a/tests/opencl/matmul/Makefile b/tests/opencl/matmul/Makefile index 0d1d136a..39b92b36 100644 --- a/tests/opencl/matmul/Makefile +++ b/tests/opencl/matmul/Makefile @@ -2,6 +2,6 @@ PROJECT = matmul SRCS = main.cc -OPTS ?= -n16 +OPTS ?= -n32 include ../common.mk diff --git a/tests/opencl/matmul/kernel.cl b/tests/opencl/matmul/kernel.cl index a0ef2d81..02aa074c 100644 --- a/tests/opencl/matmul/kernel.cl +++ b/tests/opencl/matmul/kernel.cl @@ -7,43 +7,41 @@ __kernel void matmul(__global float *A, { int globalRow = get_global_id(1); int globalCol = get_global_id(0); - int localRow = get_local_id(1); - int localCol = get_local_id(0); + int localRow = get_local_id(1); + int localCol = get_local_id(0); int localSize = get_local_size(0); // assuming square local size float sum = 0.0f; - // Load initial blocks of A and B into local memory - int k = 0; - localA[localRow * localSize + localCol] = A[globalRow * N + k + localCol]; - localB[localRow * localSize + localCol] = B[(k + localRow) * N + globalCol]; + // Loop over all blocks of both matrices + for (int k = 0; k < N; k += localSize) { + // Load block of matrix A to local memory + localA[localRow * localSize + localCol] = A[globalRow * N + k + localCol]; - // Iterate over blocks - for (k = 0; k < N; k += 16) { - // Ensure the initial block is loaded + // Load block of matrix B to local memory, adjusting for column-major access + localB[localRow * localSize + localCol] = B[(k + localRow) * N + globalCol]; + + // Synchronize to make sure the tiles are loaded barrier(CLK_LOCAL_MEM_FENCE); - // Compute multiplication for this block - for (int j = 0; j < 16; j++) { + // Multiply the two matrix blocks and accumulate result + for (int j = 0; j < localSize; j++) { sum += localA[localRow * localSize + j] * localB[j * localSize + localCol]; } - - // Load the next block of matrix A into local memory - if (k + 16 < N) { - localA[localRow * localSize + localCol] = A[globalRow * N + k + 16 + localCol]; - localB[localRow * localSize + localCol] = B[(k + 16 + localRow) * N + globalCol]; - } } C[globalRow * N + globalCol] = sum; } -/*__kernel void matmul(__global float *A, __global float *B, __global float *C, const unsigned int N) +/*__kernel void matmul(__global float *A, + __global float *B, + __global float *C, + const unsigned int N) { int globalRow = get_global_id(1); int globalCol = get_global_id(0); - int localRow = get_local_id(1); - int localCol = get_local_id(0); + int localRow = get_local_id(1); + int localCol = get_local_id(0); // Static local memory declaration __local float localA[16][16]; @@ -51,26 +49,21 @@ __kernel void matmul(__global float *A, float sum = 0.0f; - // Load initial blocks of A and B into local memory - int k = 0; - localA[localRow][localCol] = A[globalRow * N + k + localCol]; - localB[localRow][localCol] = B[(k + localRow) * N + globalCol]; - // Iterate over blocks - for (k = 0; k < N; k += 16) { - // Ensure the initial block is loaded + for (int k = 0; k < N; k += 16) { + // Load a block of matrix A into local memory + localA[localRow][localCol] = A[globalRow * N + k + localCol]; + + // Load a block of matrix B into local memory + localB[localRow][localCol] = B[(k + localRow) * N + globalCol]; + + // Ensure the entire block is loaded barrier(CLK_LOCAL_MEM_FENCE); // Compute multiplication for this block for (int j = 0; j < 16; j++) { sum += localA[localRow][j] * localB[j][localCol]; } - - // Load the next block of matrix A into local memory - if (k + 16 < N) { - localA[localRow][localCol] = A[globalRow * N + k + 16 + localCol]; - localB[localRow][localCol] = B[(k + 16 + localRow) * N + globalCol]; - } } C[globalRow * N + globalCol] = sum; diff --git a/tests/opencl/matmul/main.cc b/tests/opencl/matmul/main.cc index f7714dd7..3d26ff0c 100644 --- a/tests/opencl/matmul/main.cc +++ b/tests/opencl/matmul/main.cc @@ -10,6 +10,8 @@ #define LOCAL_SIZE 16 +#define FLOAT_ULP 6 + #define KERNEL_NAME "matmul" #define CL_CHECK(_expr) \ @@ -56,15 +58,16 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) return 0; } -static bool compare_equal(float a, float b, int ulp = 21) { - union fi_t { int i; float f; }; +static bool compare_equal(float a, float b) { + union fi_t { float f; int32_t i; }; fi_t fa, fb; fa.f = a; fb.f = b; - return std::abs(fa.i - fb.i) <= ulp; + auto d = std::abs(fa.i - fb.i); + return d <= FLOAT_ULP; } -static void matrix_multiply_cpu(float *A, float *B, float *C, int N) { +static void matmul_cpu(float *C, float *A, float *B, int N) { for (int i = 0; i < N; i++) { for (int j = 0; j < N; j++) { float sum = 0.0f; @@ -98,7 +101,7 @@ static void cleanup() { if (kernel_bin) free(kernel_bin); } -int size = 64; +int size = 32; static void show_usage() { printf("Usage: [-n size] [-h: help]\n"); @@ -106,7 +109,7 @@ static void show_usage() { static void parse_args(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "fn:h?")) != -1) { + while ((c = getopt(argc, argv, "n:h?")) != -1) { switch (c) { case 'n': size = atoi(optarg); @@ -127,6 +130,8 @@ int main (int argc, char **argv) { // parse command arguments parse_args(argc, argv); + uint32_t num_points = size * size; + printf("Matrix size=%d\n", size); if ((size / LOCAL_SIZE) * LOCAL_SIZE != size) { printf("Error: matrix size must be a multiple of %d\n", LOCAL_SIZE); @@ -148,7 +153,7 @@ int main (int argc, char **argv) { printf("Using device: %s\n", device_string); printf("Allocate device buffers\n"); - size_t nbytes = size * size * sizeof(float); + size_t nbytes = num_points * sizeof(float); a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err)); b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err)); c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err)); @@ -176,32 +181,26 @@ int main (int argc, char **argv) { // Create kernel kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err)); - size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE}; size_t global_size[2] = {size, size}; + size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE}; // Set kernel arguments CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj)); CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size)); - //CL_CHECK(clSetKernelArg(kernel, 4, local_size[0]*local_size[1]*sizeof(float), NULL)); - //CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL)); + CL_CHECK(clSetKernelArg(kernel, 4, local_size[0]*local_size[1]*sizeof(float), NULL)); + CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL)); // Allocate memories for input arrays and output arrays. - std::vector h_a(size * size); - std::vector h_b(size * size); - std::vector h_c(size * size); + std::vector h_a(num_points); + std::vector h_b(num_points); + std::vector h_c(num_points); - // Initialize values for array members. - for (int i = 0; i < (size * size); ++i) { - #ifdef USE_FLOAT - h_a[i] = (float)rand() / (float)RAND_MAX; - h_b[i] = (float)rand() / (float)RAND_MAX; - #else - h_a[i] = rand(); - h_b[i] = rand(); - #endif - h_c[i] = 0xdeadbeef; + // Generate input values + for (uint32_t i = 0; i < num_points; ++i) { + h_a[i] = static_cast(rand()) / RAND_MAX; + h_b[i] = static_cast(rand()) / RAND_MAX; } // Creating command queue @@ -223,10 +222,10 @@ int main (int argc, char **argv) { CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c.data(), 0, NULL, NULL)); printf("Verify result\n"); - std::vector ref_vec(size * size); - matrix_multiply_cpu(h_a.data(), h_b.data(), ref_vec.data(), size); + std::vector ref_vec(num_points); + matmul_cpu(ref_vec.data(), h_a.data(), h_b.data(), size); int errors = 0; - for (int i = 0; i < (size * size); i++) { + for (uint32_t i = 0; i < num_points; ++i) { if (!compare_equal(h_c[i], ref_vec[i])) { if (errors < 100) printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]); diff --git a/tests/opencl/oclprintf/main.cc b/tests/opencl/oclprintf/main.cc index 7c0463cf..184eec96 100644 --- a/tests/opencl/oclprintf/main.cc +++ b/tests/opencl/oclprintf/main.cc @@ -143,7 +143,7 @@ int main (int argc, char **argv) { // Allocate memories for input arrays and output arrays. h_a = (int*)malloc(nbytes); - // Initialize values for array members. + // Generate input values for (int i = 0; i < size; ++i) { h_a[i] = -1 + i; } diff --git a/tests/opencl/psort/main.cc b/tests/opencl/psort/main.cc index 26a42807..b627ceee 100644 --- a/tests/opencl/psort/main.cc +++ b/tests/opencl/psort/main.cc @@ -155,9 +155,8 @@ int main (int argc, char **argv) { h_a = (int*)malloc(nbytes); h_c = (int*)malloc(nbytes); - // Initialize values for array members. + // Generate input values for (int i = 0; i < size; ++i) { - h_c[i] = 0xdeadbeef; if (float_enable) { float value = sinf(i)*sinf(i); h_a[i] = *(int*)&value; diff --git a/tests/opencl/sgemm/common.h b/tests/opencl/sgemm/common.h index 01f68d48..fdb40bce 100644 --- a/tests/opencl/sgemm/common.h +++ b/tests/opencl/sgemm/common.h @@ -1,12 +1,8 @@ #ifndef COMMON_H #define COMMON_H -#define USE_FLOAT - -#ifdef USE_FLOAT +#ifndef TYPE #define TYPE float -#else -#define TYPE int #endif #endif // COMMON_H \ No newline at end of file diff --git a/tests/opencl/sgemm/main.cc b/tests/opencl/sgemm/main.cc index 3ca14792..7a02929f 100644 --- a/tests/opencl/sgemm/main.cc +++ b/tests/opencl/sgemm/main.cc @@ -11,6 +11,8 @@ #define KERNEL_NAME "sgemm" +#define FLOAT_ULP 6 + #define CL_CHECK(_expr) \ do { \ cl_int _err = _expr; \ @@ -33,6 +35,66 @@ _ret; \ }) +template +class Comparator {}; + +template <> +class Comparator { +public: + static const char* type_str() { + return "integer"; + } + static int generate() { + return rand(); + } + static bool compare(int a, int b, int index, int errors) { + if (a != b) { + if (errors < 100) { + printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b); + } + return false; + } + return true; + } +}; + +template <> +class Comparator { +public: + static const char* type_str() { + return "float"; + } + static int generate() { + return static_cast(rand()) / RAND_MAX; + } + static bool compare(float a, float b, int index, int errors) { + union fi_t { float f; int32_t i; }; + fi_t fa, fb; + fa.f = a; + fb.f = b; + auto d = std::abs(fa.i - fb.i); + if (d > FLOAT_ULP) { + if (errors < 100) { + printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b); + } + return false; + } + return true; + } +}; + +/*static void sgemm_cpu(TYPE *C, const TYPE* A, const TYPE *B, int M, int N, int K) { + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + TYPE acc = 0; + for (int k = 0; k < K; ++k) { + acc += A[k * M + m] * B[n * K + k]; + } + C[n * M + m] = acc; + } + } +}*/ + static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) { if (nullptr == filename || nullptr == data || 0 == size) return -1; @@ -54,32 +116,6 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) return 0; } -/*static void matmul(TYPE *C, const TYPE* A, const TYPE *B, int M, int N, int K) { - for (int m = 0; m < M; ++m) { - for (int n = 0; n < N; ++n) { - TYPE acc = 0; - for (int k = 0; k < K; ++k) { - acc += A[k * M + m] * B[n * K + k]; - } - C[n * M + m] = acc; - } - } -}*/ - -#ifdef USE_FLOAT -static bool compare_equal(float a, float b, int ulp = 21) { - union fi_t { int i; float f; }; - fi_t fa, fb; - fa.f = a; - fb.f = b; - return std::abs(fa.i - fb.i) <= ulp; -} -#else -static bool compare_equal(int a, int b, int ulp = 21) { - return (a == b); -} -#endif - cl_device_id device_id = NULL; cl_context context = NULL; cl_command_queue commandQueue = NULL; @@ -145,6 +181,8 @@ int main (int argc, char **argv) { // parse command arguments parse_args(argc, argv); + uint32_t num_points = size * size; + cl_platform_id platform_id; size_t kernel_size; cl_int binary_status; @@ -163,7 +201,7 @@ int main (int argc, char **argv) { context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err)); // Allocate device buffers - size_t nbytes = size * size * sizeof(TYPE); + size_t nbytes = num_points * sizeof(TYPE); a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err)); b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err)); c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err)); @@ -194,23 +232,17 @@ int main (int argc, char **argv) { h_b = (TYPE*)malloc(nbytes); h_c = (TYPE*)malloc(nbytes); - // Initialize values for array members. - for (int i = 0; i < (size * size); ++i) { - #ifdef USE_FLOAT - h_a[i] = (float)rand() / (float)RAND_MAX; - h_b[i] = (float)rand() / (float)RAND_MAX; - #else - h_a[i] = rand(); - h_b[i] = rand(); - #endif - h_c[i] = 0xdeadbeef; + // Generate input values + for (uint32_t i = 0; i < num_points; ++i) { + h_a[i] = Comparator::generate(); + h_b[i] = Comparator::generate(); } size_t global_offset[2] = {0, 0}; size_t global_work_size[2] = {size, size}; size_t local_work_size[2] = {1, 1}; - std::vector ref_vec(size * size); + std::vector ref_vec(num_points); // reference generation size_t num_groups_y = global_work_size[1] / local_work_size[1]; @@ -228,12 +260,7 @@ int main (int argc, char **argv) { TYPE acc = 0; for (int k = 0; k < width; k++) { acc += h_a[k * width + r] * h_b[c * width + k]; - } - /*#ifdef USE_FLOAT - printf("*** r=%d, c=%d, v=%f\n", r, c, acc); - #else - printf("*** r=%d, c=%d, v=%d\n", r, c, acc); - #endif*/ + } ref_vec[c * width + r] = acc; } } @@ -260,14 +287,8 @@ int main (int argc, char **argv) { printf("Verify result\n"); int errors = 0; - for (int i = 0; i < (size * size); i++) { - if (!compare_equal(h_c[i], ref_vec[i])) { - if (errors < 100) - #ifdef USE_FLOAT - printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]); - #else - printf("*** error: [%d] expected=%d, actual=%d\n", i, ref_vec[i], h_c[i]); - #endif + for (uint32_t i = 0; i < num_points; ++i) { + if (!Comparator::compare(h_c[i], ref_vec[i], i, errors)) { ++errors; } } diff --git a/tests/opencl/vecadd/main.cc b/tests/opencl/vecadd/main.cc index 23aa49b4..992e88be 100644 --- a/tests/opencl/vecadd/main.cc +++ b/tests/opencl/vecadd/main.cc @@ -166,12 +166,10 @@ int main (int argc, char **argv) { h_b = (float*)malloc(nbytes); h_c = (float*)malloc(nbytes); - // Initialize values for array members. + // Generate input values for (int i = 0; i < size; ++i) { h_a[i] = sinf(i)*sinf(i); h_b[i] = cosf(i)*cosf(i); - h_c[i] = 0xdeadbeef; - //printf("*** [%d]: h_a=%f, h_b=%f\n", i, h_a[i], h_b[i]); } // Creating command queue diff --git a/tests/regression/demo/main.cpp b/tests/regression/demo/main.cpp index 63556a5f..f14f66c3 100644 --- a/tests/regression/demo/main.cpp +++ b/tests/regression/demo/main.cpp @@ -19,16 +19,6 @@ /////////////////////////////////////////////////////////////////////////////// -union Float_t { - float f; - int i; - struct { - uint32_t man : 23; - uint32_t exp : 8; - uint32_t sign : 1; - } parts; -}; - template class Comparator {}; @@ -38,22 +28,41 @@ public: static const char* type_str() { return "integer"; } - static bool compare(int a, int b) { - return a == b; + static int generate() { + return rand(); + } + static bool compare(int a, int b, int index, int errors) { + if (a != b) { + if (errors < 100) { + printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b); + } + return false; + } + return true; } }; template <> class Comparator { +private: + union Float_t { float f; int i; }; public: static const char* type_str() { return "float"; } - static bool compare(float a, float b) { - Float_t fa{a}, fb{b}; + static int generate() { + return static_cast(rand()) / RAND_MAX; + } + static bool compare(float a, float b, int index, int errors) { + union fi_t { float f; int32_t i; }; + fi_t fa, fb; + fa.f = a; + fb.f = b; auto d = std::abs(fa.i - fb.i); if (d > FLOAT_ULP) { - std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << fa.i << ", ib=" << fb.i << std::endl; + if (errors < 100) { + printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b); + } return false; } return true; @@ -127,9 +136,7 @@ int run_test(const kernel_arg_t& kernel_arg, for (uint32_t i = 0; i < num_points; ++i) { auto ref = source_data[2 * i + 0] + source_data[2 * i + 1]; auto cur = buf_ptr[i]; - if (!Comparator::compare(cur, ref)) { - std::cout << "error at result #" << std::dec << i - << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl; + if (!Comparator::compare(cur, ref, i, errors)) { ++errors; } } @@ -196,8 +203,7 @@ int main(int argc, char *argv[]) { // generate source data source_data.resize(2 * num_points); for (uint32_t i = 0; i < source_data.size(); ++i) { - auto r = static_cast(std::rand()) / RAND_MAX; - source_data[i] = static_cast(r * 2 * num_points); + source_data[i] = Comparator::generate(); } // upload source buffer0 diff --git a/tests/regression/tensor/Makefile b/tests/regression/tensor/Makefile index 790664dc..dbb70c3b 100644 --- a/tests/regression/tensor/Makefile +++ b/tests/regression/tensor/Makefile @@ -4,6 +4,6 @@ SRCS = main.cpp VX_SRCS = kernel.cpp -OPTS ?= -s16 +OPTS ?= -n32 include ../common.mk \ No newline at end of file diff --git a/tests/regression/tensor/kernel.cpp b/tests/regression/tensor/kernel.cpp index 5cf0851c..b0e8f69e 100644 --- a/tests/regression/tensor/kernel.cpp +++ b/tests/regression/tensor/kernel.cpp @@ -12,10 +12,10 @@ inline uint32_t log2_fast(uint32_t x) { } void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) { - auto size = arg->size; - auto A = reinterpret_cast(arg->A_addr); + auto A = reinterpret_cast(arg->A_addr); auto B = reinterpret_cast(arg->B_addr); auto C = reinterpret_cast(arg->C_addr); + auto size = arg->size; uint32_t row, col; if (is_log2(size)) { diff --git a/tests/regression/tensor/main.cpp b/tests/regression/tensor/main.cpp index d93f3177..81103c10 100644 --- a/tests/regression/tensor/main.cpp +++ b/tests/regression/tensor/main.cpp @@ -19,16 +19,6 @@ /////////////////////////////////////////////////////////////////////////////// -union Float_t { - float f; - int i; - struct { - uint32_t man : 23; - uint32_t exp : 8; - uint32_t sign : 1; - } parts; -}; - template class Comparator {}; @@ -38,8 +28,17 @@ public: static const char* type_str() { return "integer"; } - static bool compare(int a, int b) { - return a == b; + static int generate() { + return rand(); + } + static bool compare(int a, int b, int index, int errors) { + if (a != b) { + if (errors < 100) { + printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b); + } + return false; + } + return true; } }; @@ -49,18 +48,26 @@ public: static const char* type_str() { return "float"; } - static bool compare(float a, float b) { - Float_t fa{a}, fb{b}; + static int generate() { + return static_cast(rand()) / RAND_MAX; + } + static bool compare(float a, float b, int index, int errors) { + union fi_t { float f; int32_t i; }; + fi_t fa, fb; + fa.f = a; + fb.f = b; auto d = std::abs(fa.i - fb.i); if (d > FLOAT_ULP) { - std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << fa.i << ", ib=" << fb.i << std::endl; + if (errors < 100) { + printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b); + } return false; } return true; } }; -static void cpuMatrixMultiply(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) { +static void matmul_cpu(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) { for (uint32_t row = 0; row < height; ++row) { for (uint32_t col = 0; col < width; ++col) { TYPE sum(0); @@ -73,7 +80,7 @@ static void cpuMatrixMultiply(TYPE* out, const TYPE* A, const TYPE* B, uint32_t } const char* kernel_file = "kernel.bin"; -uint32_t size = 16; +uint32_t size = 32; vx_device_h device = nullptr; std::vector staging_buf; @@ -81,14 +88,14 @@ kernel_arg_t kernel_arg = {}; static void show_usage() { std::cout << "Vortex Test." << std::endl; - std::cout << "Usage: [-k: kernel] [-s size] [-h: help]" << std::endl; + std::cout << "Usage: [-k: kernel] [-n size] [-h: help]" << std::endl; } static void parse_args(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "s:k:h?")) != -1) { + while ((c = getopt(argc, argv, "n:k:h?")) != -1) { switch (c) { - case 's': + case 'n': size = atoi(optarg); break; case 'k': @@ -138,9 +145,7 @@ int run_test(const kernel_arg_t& kernel_arg, for (uint32_t i = 0; i < refs.size(); ++i) { auto ref = refs[i]; auto cur = buf_ptr[i]; - if (!Comparator::compare(cur, ref)) { - std::cout << "error at result #" << std::dec << i - << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl; + if (!Comparator::compare(cur, ref, i, errors)) { ++errors; } } @@ -208,7 +213,7 @@ int main(int argc, char *argv[]) { src_A[i] = static_cast(a * size); src_B[i] = static_cast(b * size); } - cpuMatrixMultiply(refs.data(), src_A.data(), src_B.data(), size, size); + matmul_cpu(refs.data(), src_A.data(), src_B.data(), size, size); // upload source buffer0 {