From 61e3442ef80b5c7db66e436e08855529d9869af0 Mon Sep 17 00:00:00 2001
From: Blaise Tine <tinebp@yahoo.com>
Date: Tue, 14 Nov 2023 22:31:30 -0800
Subject: [PATCH] adding opencl convolution benchmark

---
 tests/opencl/Makefile              |   6 +
 tests/opencl/convolution/Makefile  |   7 +
 tests/opencl/convolution/kernel.cl |  32 ++++
 tests/opencl/convolution/main.cc   | 258 +++++++++++++++++++++++++++++
 tests/opencl/matmul/Makefile       |   2 +-
 tests/opencl/matmul/kernel.cl      |  59 +++----
 tests/opencl/matmul/main.cc        |  51 +++---
 tests/opencl/oclprintf/main.cc     |   2 +-
 tests/opencl/psort/main.cc         |   3 +-
 tests/opencl/sgemm/common.h        |   6 +-
 tests/opencl/sgemm/main.cc         | 125 ++++++++------
 tests/opencl/vecadd/main.cc        |   4 +-
 tests/regression/demo/main.cpp     |  46 ++---
 tests/regression/tensor/Makefile   |   2 +-
 tests/regression/tensor/kernel.cpp |   4 +-
 tests/regression/tensor/main.cpp   |  53 +++---
 16 files changed, 490 insertions(+), 170 deletions(-)
 create mode 100644 tests/opencl/convolution/Makefile
 create mode 100644 tests/opencl/convolution/kernel.cl
 create mode 100644 tests/opencl/convolution/main.cc

diff --git a/tests/opencl/Makefile b/tests/opencl/Makefile
index 2cee5c5d..c838c3de 100644
--- a/tests/opencl/Makefile
+++ b/tests/opencl/Makefile
@@ -18,6 +18,7 @@ all:
 	$(MAKE) -C oclprintf
 	$(MAKE) -C blackscholes
 	$(MAKE) -C matmul
+	$(MAKE) -C convolution
 
 run-simx:
 	$(MAKE) -C vecadd run-simx
@@ -37,6 +38,7 @@ run-simx:
 	$(MAKE) -C blackscholes run-simx
 	$(MAKE) -C matmul run-simx
 	$(MAKE) -C transpose run-simx
+	$(MAKE) -C convolution run-simx
 #	$(MAKE) -C vectorhypot run-simx	
 #	$(MAKE) -C mri-q run-simx
 
@@ -58,6 +60,7 @@ run-rtlsim:
 	$(MAKE) -C oclprintf run-rtlsim
 	$(MAKE) -C blackscholes run-rtlsim
 	$(MAKE) -C matmul run-rtlsim
+	$(MAKE) -C convolution run-rtlsim
 #	$(MAKE) -C vectorhypot run-rtlsim
 #	$(MAKE) -C mri-q run-rtlsim
 
@@ -79,6 +82,7 @@ run-opae:
 	$(MAKE) -C oclprintf run-opae
 	$(MAKE) -C blackscholes run-opae
 	$(MAKE) -C matmul run-opae
+	$(MAKE) -C convolution run-opae
 #	$(MAKE) -C vectorhypot run-opae
 #	$(MAKE) -C mri-q run-opae
 
@@ -102,6 +106,7 @@ clean:
 	$(MAKE) -C oclprintf clean
 	$(MAKE) -C blackscholes clean
 	$(MAKE) -C matmul clean
+	$(MAKE) -C convolution clean
 
 clean-all:
 	$(MAKE) -C vecadd clean-all
@@ -124,3 +129,4 @@ clean-all:
 	$(MAKE) -C oclprintf clean-all
 	$(MAKE) -C blackscholes clean-all
 	$(MAKE) -C matmul clean-all
+	$(MAKE) -C convolution clean-all
diff --git a/tests/opencl/convolution/Makefile b/tests/opencl/convolution/Makefile
new file mode 100644
index 00000000..42a577d2
--- /dev/null
+++ b/tests/opencl/convolution/Makefile
@@ -0,0 +1,7 @@
+PROJECT = convolution
+
+SRCS = main.cc
+
+OPTS ?= -n32
+
+include ../common.mk
diff --git a/tests/opencl/convolution/kernel.cl b/tests/opencl/convolution/kernel.cl
new file mode 100644
index 00000000..2ef31040
--- /dev/null
+++ b/tests/opencl/convolution/kernel.cl
@@ -0,0 +1,32 @@
+__kernel void conv3x3(__global float* output, 
+                      __global float* input,
+                      __global float* weights, 
+                      const int width, 
+                      const int height) 
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    // Adjust for padded borders
+    int paddedWidth = width + 2;
+    int paddedX = x + 1;
+    int paddedY = y + 1;
+
+    // Compute the convolution sum
+    float sum = 0.0f;
+
+    sum += input[(paddedY - 1) * paddedWidth + (paddedX - 1)] * weights[0]; // Top-left
+    sum += input[(paddedY - 1) * paddedWidth + paddedX] * weights[1];       // Top-center
+    sum += input[(paddedY - 1) * paddedWidth + (paddedX + 1)] * weights[2]; // Top-right
+
+    sum += input[paddedY * paddedWidth + (paddedX - 1)] * weights[3];       // Middle-left
+    sum += input[paddedY * paddedWidth + paddedX] * weights[4];             // Center
+    sum += input[paddedY * paddedWidth + (paddedX + 1)] * weights[5];       // Middle-right
+
+    sum += input[(paddedY + 1) * paddedWidth + (paddedX - 1)] * weights[6]; // Bottom-left
+    sum += input[(paddedY + 1) * paddedWidth + paddedX] * weights[7];       // Bottom-center
+    sum += input[(paddedY + 1) * paddedWidth + (paddedX + 1)] * weights[8]; // Bottom-right
+
+    // Store the result in the output array
+    output[y * width + x] = sum;
+}
\ No newline at end of file
diff --git a/tests/opencl/convolution/main.cc b/tests/opencl/convolution/main.cc
new file mode 100644
index 00000000..d7487c2f
--- /dev/null
+++ b/tests/opencl/convolution/main.cc
@@ -0,0 +1,258 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <CL/opencl.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h> 
+#include <chrono>
+#include <vector>
+
+#define FLOAT_ULP 6
+
+#define KERNEL_NAME "conv3x3"
+
+#define CL_CHECK(_expr)                                                \
+   do {                                                                \
+     cl_int _err = _expr;                                              \
+     if (_err == CL_SUCCESS)                                           \
+       break;                                                          \
+     printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);   \
+	 cleanup();			                                                     \
+     exit(-1);                                                         \
+   } while (0)
+
+#define CL_CHECK2(_expr)                                               \
+   ({                                                                  \
+     cl_int _err = CL_INVALID_VALUE;                                   \
+     decltype(_expr) _ret = _expr;                                     \
+     if (_err != CL_SUCCESS) {                                         \
+       printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
+	   cleanup();			                                                   \
+       exit(-1);                                                       \
+     }                                                                 \
+     _ret;                                                             \
+   })
+
+static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
+  if (nullptr == filename || nullptr == data || 0 == size)
+    return -1;
+
+  FILE* fp = fopen(filename, "r");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to load kernel.");
+    return -1;
+  }
+  
+  fseek(fp , 0 , SEEK_END);
+  long fsize = ftell(fp);
+  rewind(fp);
+
+  *data = (uint8_t*)malloc(fsize);
+  *size = fread(*data, 1, fsize, fp);
+  
+  fclose(fp);
+  
+  return 0;
+}
+
+static bool compare_equal(float a, float b) {
+  union fi_t { float f; int32_t i; };
+  fi_t fa, fb;
+  fa.f = a;
+  fb.f = b;
+  auto d = std::abs(fa.i - fb.i);
+  return d <= FLOAT_ULP;
+}
+
+static void convolution_cpu(float *O, float *I, float *W, int32_t width, int32_t height) {
+  int paddedWidth = width + 2;
+  for (int32_t y = 0; y < height; ++y) {
+    for (int32_t x = 0; x < width; ++x) {
+      int paddedY = y + 1;
+      int paddedX = x + 1;
+      float sum = 0.0f;
+      for (int32_t ky = -1; ky <= 1; ++ky) {
+        for (int32_t kx = -1; kx <= 1; ++kx) {
+          int32_t iy = paddedY + ky;
+          int32_t ix = paddedX + kx;
+          float value = I[iy * paddedWidth + ix];
+          float weight = W[(ky + 1) * 3 + (kx + 1)];
+          sum += value * weight;
+        }
+      }
+      O[y * width + x] = sum;
+    }
+  }
+}
+
+cl_device_id device_id = NULL;
+cl_context context = NULL;
+cl_command_queue commandQueue = NULL;
+cl_program program = NULL;
+cl_kernel kernel = NULL;
+cl_mem i_memobj = NULL;
+cl_mem w_memobj = NULL;
+cl_mem o_memobj = NULL;
+uint8_t* kernel_bin = NULL;
+
+static void cleanup() {
+  if (commandQueue) clReleaseCommandQueue(commandQueue);
+  if (kernel) clReleaseKernel(kernel);
+  if (program) clReleaseProgram(program);
+  if (i_memobj) clReleaseMemObject(i_memobj);
+  if (w_memobj) clReleaseMemObject(w_memobj);
+  if (o_memobj) clReleaseMemObject(o_memobj);
+  if (context) clReleaseContext(context);
+  if (device_id) clReleaseDevice(device_id);  
+  if (kernel_bin) free(kernel_bin);
+}
+
+int size = 32;
+
+static void show_usage() {
+  printf("Usage: [-n size] [-h: help]\n");
+}
+
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:h?")) != -1) {
+    switch (c) {
+    case 'n':
+      size = atoi(optarg);
+      break;
+    case 'h':
+    case '?': {
+      show_usage();
+      exit(0);
+    } break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+}
+
+int main (int argc, char **argv) {
+  // parse command arguments
+  parse_args(argc, argv);
+
+  printf("Matrix size=%d\n", size);
+
+  uint32_t o_points = size * size;
+  uint32_t i_points = (size+2) * (size+2);
+  uint32_t w_points = 3 * 3;
+  
+  cl_platform_id platform_id;
+  size_t kernel_size;
+  
+  // Getting platform and device information
+  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
+  CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
+
+  printf("Create context\n");
+  context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL,  &_err));
+
+  char device_string[1024];
+  clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
+  printf("Using device: %s\n", device_string);
+
+  printf("Allocate device buffers\n");  
+  size_t i_nbytes = i_points * sizeof(float);
+  size_t w_nbytes = w_points * sizeof(float);
+  size_t o_nbytes = o_points * sizeof(float);
+  i_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, i_nbytes, NULL, &_err));
+  w_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, w_nbytes, NULL, &_err));
+  o_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, o_nbytes, NULL, &_err));
+
+  printf("Create program from kernel source\n");
+#ifdef HOSTGPU
+  if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK2(clCreateProgramWithSource(
+    context, 1, (const char**)&kernel_bin, &kernel_size, &_err));  
+#else
+  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK2(clCreateProgramWithBinary(
+    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
+#endif  
+  if (program == NULL) {
+    cleanup();
+    return -1;
+  }
+
+  // Build program
+  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
+  
+  // Create kernel
+  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
+
+  size_t global_size[2] = {size, size};
+
+  // Set kernel arguments
+  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&o_memobj));	
+  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&i_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&w_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size));
+  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(uint32_t), &size));
+
+  // Allocate memories for input arrays and output arrays.
+  std::vector<float> h_i(i_points);
+  std::vector<float> h_w(w_points);
+  std::vector<float> h_o(o_points, 0.0f);
+	
+  // Generate input values
+  for (int32_t y = -1; y < size+1; ++y) {
+    for (int32_t x = -1; x < size+1; ++x) {
+      if (x >= 0 && x < size && y >= 0 && y < size) {
+        h_i[(y+1) * (size+2) + (x+1)] = static_cast<float>(rand()) / RAND_MAX;
+      } else {
+        h_i[(y+1) * (size+2) + (x+1)] = 0;
+      }
+    }
+  }
+  for (uint32_t i = 0; i < w_points; ++i) {
+    h_w[i] = static_cast<float>(rand()) / RAND_MAX;
+  }
+
+  // Creating command queue
+  commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));  
+
+	printf("Upload source buffers\n");
+  CL_CHECK(clEnqueueWriteBuffer(commandQueue, i_memobj, CL_TRUE, 0, i_nbytes, h_i.data(), 0, NULL, NULL));
+  CL_CHECK(clEnqueueWriteBuffer(commandQueue, w_memobj, CL_TRUE, 0, w_nbytes, h_w.data(), 0, NULL, NULL));
+
+  printf("Execute the kernel\n");
+  auto time_start = std::chrono::high_resolution_clock::now();
+  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
+  CL_CHECK(clFinish(commandQueue));
+  auto time_end = std::chrono::high_resolution_clock::now();
+  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
+  printf("Elapsed time: %lg ms\n", elapsed);
+
+  printf("Download destination buffer\n");
+  CL_CHECK(clEnqueueReadBuffer(commandQueue, o_memobj, CL_TRUE, 0, o_nbytes, h_o.data(), 0, NULL, NULL));
+
+  printf("Verify result\n");
+  std::vector<float> ref_vec(o_points);
+  convolution_cpu(ref_vec.data(), h_i.data(), h_w.data(), size, size);
+  int errors = 0;
+  for (uint32_t i = 0; i < o_points; ++i) {
+    if (!compare_equal(h_o[i], ref_vec[i])) {
+      if (errors < 100) 
+        printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_o[i]);
+      ++errors;
+    }
+  }
+  if (errors != 0) {
+    printf("FAILED! - %d errors\n", errors);    
+  } else {
+    printf("PASSED!\n");
+  }
+
+  // Clean up		
+  cleanup();  
+
+  return errors;
+}
diff --git a/tests/opencl/matmul/Makefile b/tests/opencl/matmul/Makefile
index 0d1d136a..39b92b36 100644
--- a/tests/opencl/matmul/Makefile
+++ b/tests/opencl/matmul/Makefile
@@ -2,6 +2,6 @@ PROJECT = matmul
 
 SRCS = main.cc
 
-OPTS ?= -n16
+OPTS ?= -n32
 
 include ../common.mk
diff --git a/tests/opencl/matmul/kernel.cl b/tests/opencl/matmul/kernel.cl
index a0ef2d81..02aa074c 100644
--- a/tests/opencl/matmul/kernel.cl
+++ b/tests/opencl/matmul/kernel.cl
@@ -7,43 +7,41 @@ __kernel void matmul(__global float *A,
 {
     int globalRow = get_global_id(1);
     int globalCol = get_global_id(0);
-    int localRow = get_local_id(1);
-    int localCol = get_local_id(0);
+    int localRow  = get_local_id(1);
+    int localCol  = get_local_id(0);
     int localSize = get_local_size(0);  // assuming square local size
 
     float sum = 0.0f;
 
-    // Load initial blocks of A and B into local memory
-    int k = 0;
-    localA[localRow * localSize + localCol] = A[globalRow * N + k + localCol];
-    localB[localRow * localSize + localCol] = B[(k + localRow) * N + globalCol];
+    // Loop over all blocks of both matrices
+    for (int k = 0; k < N; k += localSize) {
+        // Load block of matrix A to local memory
+        localA[localRow * localSize + localCol] = A[globalRow * N + k + localCol];
 
-    // Iterate over blocks
-    for (k = 0; k < N; k += 16) {
-        // Ensure the initial block is loaded
+        // Load block of matrix B to local memory, adjusting for column-major access
+        localB[localRow * localSize + localCol] = B[(k + localRow) * N + globalCol];
+
+        // Synchronize to make sure the tiles are loaded
         barrier(CLK_LOCAL_MEM_FENCE);
 
-        // Compute multiplication for this block
-        for (int j = 0; j < 16; j++) {
+        // Multiply the two matrix blocks and accumulate result
+        for (int j = 0; j < localSize; j++) {
             sum += localA[localRow * localSize + j] * localB[j * localSize + localCol];
         }
-
-        // Load the next block of matrix A into local memory
-        if (k + 16 < N) {
-            localA[localRow * localSize + localCol] = A[globalRow * N + k + 16 + localCol];
-            localB[localRow * localSize + localCol] = B[(k + 16 + localRow) * N + globalCol];
-        }
     }
 
     C[globalRow * N + globalCol] = sum;
 }
 
-/*__kernel void matmul(__global float *A, __global float *B, __global float *C, const unsigned int N)
+/*__kernel void matmul(__global float *A, 
+                       __global float *B, 
+                       __global float *C, 
+                       const unsigned int N)
 {
     int globalRow = get_global_id(1);
     int globalCol = get_global_id(0);
-    int localRow = get_local_id(1);
-    int localCol = get_local_id(0);
+    int localRow  = get_local_id(1);
+    int localCol  = get_local_id(0);
 
     // Static local memory declaration
     __local float localA[16][16];
@@ -51,26 +49,21 @@ __kernel void matmul(__global float *A,
 
     float sum = 0.0f;
 
-    // Load initial blocks of A and B into local memory
-    int k = 0;
-    localA[localRow][localCol] = A[globalRow * N + k + localCol];
-    localB[localRow][localCol] = B[(k + localRow) * N + globalCol];
-
     // Iterate over blocks
-    for (k = 0; k < N; k += 16) {
-        // Ensure the initial block is loaded
+    for (int k = 0; k < N; k += 16) {
+        // Load a block of matrix A into local memory
+        localA[localRow][localCol] = A[globalRow * N + k + localCol];
+
+        // Load a block of matrix B into local memory
+        localB[localRow][localCol] = B[(k + localRow) * N + globalCol];
+
+        // Ensure the entire block is loaded
         barrier(CLK_LOCAL_MEM_FENCE);
 
         // Compute multiplication for this block
         for (int j = 0; j < 16; j++) {
             sum += localA[localRow][j] * localB[j][localCol];
         }
-
-        // Load the next block of matrix A into local memory
-        if (k + 16 < N) {
-            localA[localRow][localCol] = A[globalRow * N + k + 16 + localCol];
-            localB[localRow][localCol] = B[(k + 16 + localRow) * N + globalCol];
-        }
     }
 
     C[globalRow * N + globalCol] = sum;
diff --git a/tests/opencl/matmul/main.cc b/tests/opencl/matmul/main.cc
index f7714dd7..3d26ff0c 100644
--- a/tests/opencl/matmul/main.cc
+++ b/tests/opencl/matmul/main.cc
@@ -10,6 +10,8 @@
 
 #define LOCAL_SIZE 16
 
+#define FLOAT_ULP 6
+
 #define KERNEL_NAME "matmul"
 
 #define CL_CHECK(_expr)                                                \
@@ -56,15 +58,16 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
   return 0;
 }
 
-static bool compare_equal(float a, float b, int ulp = 21) {
-  union fi_t { int i; float f; };
+static bool compare_equal(float a, float b) {
+  union fi_t { float f; int32_t i; };
   fi_t fa, fb;
   fa.f = a;
   fb.f = b;
-  return std::abs(fa.i - fb.i) <= ulp;
+  auto d = std::abs(fa.i - fb.i);
+  return d <= FLOAT_ULP;
 }
 
-static void matrix_multiply_cpu(float *A, float *B, float *C, int N) {
+static void matmul_cpu(float *C, float *A, float *B, int N) {
     for (int i = 0; i < N; i++) {
         for (int j = 0; j < N; j++) {
             float sum = 0.0f;
@@ -98,7 +101,7 @@ static void cleanup() {
   if (kernel_bin) free(kernel_bin);
 }
 
-int size = 64;
+int size = 32;
 
 static void show_usage() {
   printf("Usage: [-n size] [-h: help]\n");
@@ -106,7 +109,7 @@ static void show_usage() {
 
 static void parse_args(int argc, char **argv) {
   int c;
-  while ((c = getopt(argc, argv, "fn:h?")) != -1) {
+  while ((c = getopt(argc, argv, "n:h?")) != -1) {
     switch (c) {
     case 'n':
       size = atoi(optarg);
@@ -127,6 +130,8 @@ int main (int argc, char **argv) {
   // parse command arguments
   parse_args(argc, argv);
 
+  uint32_t num_points = size * size;
+
   printf("Matrix size=%d\n", size);
   if ((size / LOCAL_SIZE) * LOCAL_SIZE != size) {
     printf("Error: matrix size must be a multiple of %d\n", LOCAL_SIZE);
@@ -148,7 +153,7 @@ int main (int argc, char **argv) {
   printf("Using device: %s\n", device_string);
 
   printf("Allocate device buffers\n");
-  size_t nbytes = size * size * sizeof(float);
+  size_t nbytes = num_points * sizeof(float);
   a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
   b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
   c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
@@ -176,32 +181,26 @@ int main (int argc, char **argv) {
   // Create kernel
   kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
 
-  size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};
   size_t global_size[2] = {size, size};
+  size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};  
 
   // Set kernel arguments
   CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));	
   CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj));
   CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj));
   CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size));
-  //CL_CHECK(clSetKernelArg(kernel, 4, local_size[0]*local_size[1]*sizeof(float), NULL));
-  //CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL));
+  CL_CHECK(clSetKernelArg(kernel, 4, local_size[0]*local_size[1]*sizeof(float), NULL));
+  CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL));
 
  // Allocate memories for input arrays and output arrays.
- std::vector<float> h_a(size * size);
- std::vector<float> h_b(size * size);
- std::vector<float> h_c(size * size);
+ std::vector<float> h_a(num_points);
+ std::vector<float> h_b(num_points);
+ std::vector<float> h_c(num_points);
 	
-  // Initialize values for array members.  
-  for (int i = 0; i < (size * size); ++i) {
-  #ifdef USE_FLOAT
-    h_a[i] = (float)rand() / (float)RAND_MAX;
-    h_b[i] = (float)rand() / (float)RAND_MAX;
-  #else
-    h_a[i] = rand();
-    h_b[i] = rand();
-  #endif
-    h_c[i] = 0xdeadbeef;
+  // Generate input values
+  for (uint32_t i = 0; i < num_points; ++i) {
+    h_a[i] = static_cast<float>(rand()) / RAND_MAX;
+    h_b[i] = static_cast<float>(rand()) / RAND_MAX;
   }
 
   // Creating command queue
@@ -223,10 +222,10 @@ int main (int argc, char **argv) {
   CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c.data(), 0, NULL, NULL));
 
   printf("Verify result\n");
-  std::vector<float> ref_vec(size * size);
-  matrix_multiply_cpu(h_a.data(), h_b.data(), ref_vec.data(), size);
+  std::vector<float> ref_vec(num_points);
+  matmul_cpu(ref_vec.data(), h_a.data(), h_b.data(), size);
   int errors = 0;
-  for (int i = 0; i < (size * size); i++) {
+  for (uint32_t i = 0; i < num_points; ++i) {
     if (!compare_equal(h_c[i], ref_vec[i])) {
       if (errors < 100) 
         printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]);
diff --git a/tests/opencl/oclprintf/main.cc b/tests/opencl/oclprintf/main.cc
index 7c0463cf..184eec96 100644
--- a/tests/opencl/oclprintf/main.cc
+++ b/tests/opencl/oclprintf/main.cc
@@ -143,7 +143,7 @@ int main (int argc, char **argv) {
   // Allocate memories for input arrays and output arrays.    
   h_a = (int*)malloc(nbytes);
 	
-  // Initialize values for array members.  
+  // Generate input values
   for (int i = 0; i < size; ++i) {
     h_a[i] = -1 + i;
   }
diff --git a/tests/opencl/psort/main.cc b/tests/opencl/psort/main.cc
index 26a42807..b627ceee 100644
--- a/tests/opencl/psort/main.cc
+++ b/tests/opencl/psort/main.cc
@@ -155,9 +155,8 @@ int main (int argc, char **argv) {
   h_a = (int*)malloc(nbytes);
   h_c = (int*)malloc(nbytes);	
 	
-  // Initialize values for array members.  
+  // Generate input values
   for (int i = 0; i < size; ++i) {
-    h_c[i] = 0xdeadbeef;
     if (float_enable) {
       float value = sinf(i)*sinf(i);
       h_a[i] = *(int*)&value;
diff --git a/tests/opencl/sgemm/common.h b/tests/opencl/sgemm/common.h
index 01f68d48..fdb40bce 100644
--- a/tests/opencl/sgemm/common.h
+++ b/tests/opencl/sgemm/common.h
@@ -1,12 +1,8 @@
 #ifndef COMMON_H
 #define COMMON_H
 
-#define USE_FLOAT
-
-#ifdef USE_FLOAT
+#ifndef TYPE
 #define TYPE float
-#else
-#define TYPE int
 #endif
 
 #endif // COMMON_H
\ No newline at end of file
diff --git a/tests/opencl/sgemm/main.cc b/tests/opencl/sgemm/main.cc
index 3ca14792..7a02929f 100644
--- a/tests/opencl/sgemm/main.cc
+++ b/tests/opencl/sgemm/main.cc
@@ -11,6 +11,8 @@
 
 #define KERNEL_NAME "sgemm"
 
+#define FLOAT_ULP 6
+
 #define CL_CHECK(_expr)                                                \
    do {                                                                \
      cl_int _err = _expr;                                              \
@@ -33,6 +35,66 @@
      _ret;                                                             \
    })
 
+template <typename Type>
+class Comparator {};
+
+template <>
+class Comparator<int> {
+public:
+  static const char* type_str() {
+    return "integer";
+  }
+  static int generate() { 
+    return rand(); 
+  }
+  static bool compare(int a, int b, int index, int errors) { 
+    if (a != b) {
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
+      }
+      return false;
+    }
+    return true;
+  }  
+};
+
+template <>
+class Comparator<float> {
+public:
+  static const char* type_str() {
+    return "float";
+  }
+  static int generate() { 
+    return static_cast<float>(rand()) / RAND_MAX;
+  }
+  static bool compare(float a, float b, int index, int errors) { 
+    union fi_t { float f; int32_t i; };
+    fi_t fa, fb;
+    fa.f = a;
+    fb.f = b;
+    auto d = std::abs(fa.i - fb.i);
+    if (d > FLOAT_ULP) {
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
+      }
+      return false;
+    }
+    return true;
+  }  
+};
+
+/*static void sgemm_cpu(TYPE *C, const TYPE* A, const TYPE *B, int M, int N, int K) {
+  for (int m = 0; m < M; ++m) {
+    for (int n = 0; n < N; ++n) {
+      TYPE acc = 0;
+      for (int k = 0; k < K; ++k) {
+          acc += A[k * M + m] * B[n * K + k];
+      }
+      C[n * M + m] = acc;
+    }
+  }
+}*/
+
 static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
   if (nullptr == filename || nullptr == data || 0 == size)
     return -1;
@@ -54,32 +116,6 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
   return 0;
 }
 
-/*static void matmul(TYPE *C, const TYPE* A, const TYPE *B, int M, int N, int K) {
-  for (int m = 0; m < M; ++m) {
-    for (int n = 0; n < N; ++n) {
-      TYPE acc = 0;
-      for (int k = 0; k < K; ++k) {
-          acc += A[k * M + m] * B[n * K + k];
-      }
-      C[n * M + m] = acc;
-    }
-  }
-}*/
-
-#ifdef USE_FLOAT
-static bool compare_equal(float a, float b, int ulp = 21) {
-  union fi_t { int i; float f; };
-  fi_t fa, fb;
-  fa.f = a;
-  fb.f = b;
-  return std::abs(fa.i - fb.i) <= ulp;
-}
-#else
-static bool compare_equal(int a, int b, int ulp = 21) {
-  return (a == b);
-}
-#endif
-
 cl_device_id device_id = NULL;
 cl_context context = NULL;
 cl_command_queue commandQueue = NULL;
@@ -145,6 +181,8 @@ int main (int argc, char **argv) {
   // parse command arguments
   parse_args(argc, argv);
 
+  uint32_t num_points = size * size;
+
   cl_platform_id platform_id;
   size_t kernel_size;
   cl_int binary_status;
@@ -163,7 +201,7 @@ int main (int argc, char **argv) {
   context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL,  &_err));
 
   // Allocate device buffers
-  size_t nbytes = size * size * sizeof(TYPE);
+  size_t nbytes = num_points * sizeof(TYPE);
   a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
   b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
   c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
@@ -194,23 +232,17 @@ int main (int argc, char **argv) {
   h_b = (TYPE*)malloc(nbytes);
   h_c = (TYPE*)malloc(nbytes);	
 	
-  // Initialize values for array members.  
-  for (int i = 0; i < (size * size); ++i) {
-  #ifdef USE_FLOAT
-    h_a[i] = (float)rand() / (float)RAND_MAX;
-    h_b[i] = (float)rand() / (float)RAND_MAX;
-  #else
-    h_a[i] = rand();
-    h_b[i] = rand();
-  #endif
-    h_c[i] = 0xdeadbeef;
+  // Generate input values 
+  for (uint32_t i = 0; i < num_points; ++i) {
+    h_a[i] = Comparator<TYPE>::generate();
+    h_b[i] = Comparator<TYPE>::generate();
   }
 
   size_t global_offset[2] = {0, 0};
   size_t global_work_size[2] = {size, size};
   size_t local_work_size[2] = {1, 1};
 
-  std::vector<float> ref_vec(size * size);
+  std::vector<float> ref_vec(num_points);
 
   // reference generation
   size_t num_groups_y = global_work_size[1] / local_work_size[1];
@@ -228,12 +260,7 @@ int main (int argc, char **argv) {
           TYPE acc = 0;
           for (int k = 0; k < width; k++) {
             acc += h_a[k * width + r] * h_b[c * width + k];
-          }
-        /*#ifdef USE_FLOAT
-          printf("*** r=%d, c=%d, v=%f\n", r, c, acc);
-        #else
-          printf("*** r=%d, c=%d, v=%d\n", r, c, acc);
-        #endif*/                   
+          }                  
           ref_vec[c * width + r] = acc;         
         }
       }
@@ -260,14 +287,8 @@ int main (int argc, char **argv) {
 
   printf("Verify result\n");
   int errors = 0;
-  for (int i = 0; i < (size * size); i++) {
-    if (!compare_equal(h_c[i], ref_vec[i])) {
-      if (errors < 100) 
-      #ifdef USE_FLOAT
-        printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]);
-      #else
-        printf("*** error: [%d] expected=%d, actual=%d\n", i, ref_vec[i], h_c[i]);
-      #endif
+  for (uint32_t i = 0; i < num_points; ++i) {
+    if (!Comparator<TYPE>::compare(h_c[i], ref_vec[i], i, errors)) {
       ++errors;
     }
   }
diff --git a/tests/opencl/vecadd/main.cc b/tests/opencl/vecadd/main.cc
index 23aa49b4..992e88be 100644
--- a/tests/opencl/vecadd/main.cc
+++ b/tests/opencl/vecadd/main.cc
@@ -166,12 +166,10 @@ int main (int argc, char **argv) {
   h_b = (float*)malloc(nbytes);
   h_c = (float*)malloc(nbytes);	
 	
-  // Initialize values for array members.  
+  // Generate input values
   for (int i = 0; i < size; ++i) {
     h_a[i] = sinf(i)*sinf(i);
     h_b[i] = cosf(i)*cosf(i);
-    h_c[i] = 0xdeadbeef;
-    //printf("*** [%d]: h_a=%f, h_b=%f\n", i, h_a[i], h_b[i]);
   }
 
   // Creating command queue
diff --git a/tests/regression/demo/main.cpp b/tests/regression/demo/main.cpp
index 63556a5f..f14f66c3 100644
--- a/tests/regression/demo/main.cpp
+++ b/tests/regression/demo/main.cpp
@@ -19,16 +19,6 @@
 
 ///////////////////////////////////////////////////////////////////////////////
 
-union Float_t {    
-    float f;
-    int   i;
-    struct {
-        uint32_t man  : 23;
-        uint32_t exp  : 8;
-        uint32_t sign : 1;
-    } parts;
-};
-
 template <typename Type>
 class Comparator {};
 
@@ -38,22 +28,41 @@ public:
   static const char* type_str() {
     return "integer";
   }
-  static bool compare(int a, int b) { 
-    return a == b; 
+  static int generate() { 
+    return rand(); 
+  }
+  static bool compare(int a, int b, int index, int errors) { 
+    if (a != b) {
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
+      }
+      return false;
+    }
+    return true;
   }  
 };
 
 template <>
 class Comparator<float> {
+private:
+  union Float_t { float f; int i; };
 public:
   static const char* type_str() {
     return "float";
   }
-  static bool compare(float a, float b) { 
-    Float_t fa{a}, fb{b};
+  static int generate() { 
+    return static_cast<float>(rand()) / RAND_MAX;
+  }
+  static bool compare(float a, float b, int index, int errors) {     
+    union fi_t { float f; int32_t i; };
+    fi_t fa, fb;
+    fa.f = a;
+    fb.f = b;
     auto d = std::abs(fa.i - fb.i);
     if (d > FLOAT_ULP) {
-      std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << fa.i << ", ib=" << fb.i << std::endl;
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
+      }
       return false;
     }
     return true;
@@ -127,9 +136,7 @@ int run_test(const kernel_arg_t& kernel_arg,
     for (uint32_t i = 0; i < num_points; ++i) {
       auto ref = source_data[2 * i + 0] + source_data[2 * i + 1];
       auto cur = buf_ptr[i];
-      if (!Comparator<TYPE>::compare(cur, ref)) {
-        std::cout << "error at result #" << std::dec << i
-                  << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
+      if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
         ++errors;
       }
     }
@@ -196,8 +203,7 @@ int main(int argc, char *argv[]) {
   // generate source data
   source_data.resize(2 * num_points);
   for (uint32_t i = 0; i < source_data.size(); ++i) {
-    auto r = static_cast<float>(std::rand()) / RAND_MAX;
-    source_data[i] = static_cast<TYPE>(r * 2 * num_points);
+    source_data[i] = Comparator<TYPE>::generate();
   }
 
   // upload source buffer0
diff --git a/tests/regression/tensor/Makefile b/tests/regression/tensor/Makefile
index 790664dc..dbb70c3b 100644
--- a/tests/regression/tensor/Makefile
+++ b/tests/regression/tensor/Makefile
@@ -4,6 +4,6 @@ SRCS = main.cpp
 
 VX_SRCS = kernel.cpp
 
-OPTS ?= -s16
+OPTS ?= -n32
 
 include ../common.mk
\ No newline at end of file
diff --git a/tests/regression/tensor/kernel.cpp b/tests/regression/tensor/kernel.cpp
index 5cf0851c..b0e8f69e 100644
--- a/tests/regression/tensor/kernel.cpp
+++ b/tests/regression/tensor/kernel.cpp
@@ -12,10 +12,10 @@ inline uint32_t log2_fast(uint32_t x) {
 }
 
 void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) {
-	auto size  = arg->size;
-    auto A = reinterpret_cast<TYPE*>(arg->A_addr);
+	auto A = reinterpret_cast<TYPE*>(arg->A_addr);
 	auto B = reinterpret_cast<TYPE*>(arg->B_addr);
 	auto C = reinterpret_cast<TYPE*>(arg->C_addr);
+    auto size  = arg->size;
 
     uint32_t row, col;
     if (is_log2(size)) {
diff --git a/tests/regression/tensor/main.cpp b/tests/regression/tensor/main.cpp
index d93f3177..81103c10 100644
--- a/tests/regression/tensor/main.cpp
+++ b/tests/regression/tensor/main.cpp
@@ -19,16 +19,6 @@
 
 ///////////////////////////////////////////////////////////////////////////////
 
-union Float_t {    
-    float f;
-    int   i;
-    struct {
-        uint32_t man  : 23;
-        uint32_t exp  : 8;
-        uint32_t sign : 1;
-    } parts;
-};
-
 template <typename Type>
 class Comparator {};
 
@@ -38,8 +28,17 @@ public:
   static const char* type_str() {
     return "integer";
   }
-  static bool compare(int a, int b) { 
-    return a == b; 
+  static int generate() { 
+    return rand(); 
+  }
+  static bool compare(int a, int b, int index, int errors) { 
+    if (a != b) {
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
+      }
+      return false;
+    }
+    return true;
   }  
 };
 
@@ -49,18 +48,26 @@ public:
   static const char* type_str() {
     return "float";
   }
-  static bool compare(float a, float b) { 
-    Float_t fa{a}, fb{b};
+  static int generate() { 
+    return static_cast<float>(rand()) / RAND_MAX;
+  }
+  static bool compare(float a, float b, int index, int errors) { 
+    union fi_t { float f; int32_t i; };
+    fi_t fa, fb;
+    fa.f = a;
+    fb.f = b;
     auto d = std::abs(fa.i - fb.i);
     if (d > FLOAT_ULP) {
-      std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << fa.i << ", ib=" << fb.i << std::endl;
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
+      }
       return false;
     }
     return true;
   }  
 };
 
-static void cpuMatrixMultiply(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) {
+static void matmul_cpu(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) {
   for (uint32_t row = 0; row < height; ++row) {
     for (uint32_t col = 0; col < width; ++col) {
       TYPE sum(0);
@@ -73,7 +80,7 @@ static void cpuMatrixMultiply(TYPE* out, const TYPE* A, const TYPE* B, uint32_t
 }
 
 const char* kernel_file = "kernel.bin";
-uint32_t size = 16;
+uint32_t size = 32;
 
 vx_device_h device = nullptr;
 std::vector<uint8_t> staging_buf;
@@ -81,14 +88,14 @@ kernel_arg_t kernel_arg = {};
 
 static void show_usage() {
    std::cout << "Vortex Test." << std::endl;
-   std::cout << "Usage: [-k: kernel] [-s size] [-h: help]" << std::endl;
+   std::cout << "Usage: [-k: kernel] [-n size] [-h: help]" << std::endl;
 }
 
 static void parse_args(int argc, char **argv) {
   int c;
-  while ((c = getopt(argc, argv, "s:k:h?")) != -1) {
+  while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
     switch (c) {
-    case 's':
+    case 'n':
       size = atoi(optarg);
       break;
     case 'k':
@@ -138,9 +145,7 @@ int run_test(const kernel_arg_t& kernel_arg,
     for (uint32_t i = 0; i < refs.size(); ++i) {
       auto ref = refs[i];
       auto cur = buf_ptr[i];
-      if (!Comparator<TYPE>::compare(cur, ref)) {
-        std::cout << "error at result #" << std::dec << i
-                  << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
+      if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
         ++errors;
       }
     }
@@ -208,7 +213,7 @@ int main(int argc, char *argv[]) {
     src_A[i] = static_cast<TYPE>(a * size);
     src_B[i] = static_cast<TYPE>(b * size);
   }
-  cpuMatrixMultiply(refs.data(), src_A.data(), src_B.data(), size, size);
+  matmul_cpu(refs.data(), src_A.data(), src_B.data(), size, size);
 
   // upload source buffer0
   {