diff --git a/tests/opencl/vecadd-loop/.gitignore b/tests/opencl/vecadd-loop/.gitignore
new file mode 100644
index 00000000..c2f4a18b
--- /dev/null
+++ b/tests/opencl/vecadd-loop/.gitignore
@@ -0,0 +1,2 @@
+vecadd-loop
+*.ll
diff --git a/tests/opencl/vecadd-loop/Makefile b/tests/opencl/vecadd-loop/Makefile
new file mode 100644
index 00000000..bed44ce1
--- /dev/null
+++ b/tests/opencl/vecadd-loop/Makefile
@@ -0,0 +1,72 @@
+XLEN ?= 32
+# XLEN selects the linker script (vx_link$(XLEN).ld). The paths below are ?= defaults and can be overridden.
+LLVM_PREFIX ?= /opt/llvm-riscv
+RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
+SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
+POCL_CC_PATH ?= /opt/pocl/compiler
+POCL_RT_PATH ?= /opt/pocl/runtime
+# Arguments handed to the test program by the run-* targets.
+OPTS ?= -n64
+# Vortex driver and runtime trees, resolved relative to this directory.
+VORTEX_DRV_PATH ?= $(realpath ../../../driver)
+VORTEX_RT_PATH ?= $(realpath ../../../runtime)
+# Kernel-side flags for poclcc; the kernel.pocl rule passes K_CFLAGS and K_LDFLAGS only (K_LLCFLAGS is not referenced in this Makefile).
+K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
+K_CFLAGS   += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
+K_LDFLAGS  += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
+# Host-side compiler and linker flags.
+CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
+
+CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing
+
+CXXFLAGS += -I$(POCL_RT_PATH)/include
+
+LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
+
+# Debugging: defining DEBUG (e.g. `make DEBUG=1`) builds the host program with -g -O0 instead of -O2 -DNDEBUG.
+ifdef DEBUG
+	CXXFLAGS += -g -O0
+else    
+	CXXFLAGS += -O2 -DNDEBUG
+endif
+
+PROJECT = vecadd-loop
+SRCS = main.cc
+
+.PHONY: all run-fpga run-asesim run-vlsim run-simx run-rtlsim  # goals that name actions, not files
+all: $(PROJECT) kernel.pocl
+# Offline-compile the OpenCL kernel into a POCL binary with poclcc.
+kernel.pocl: kernel.cl
+	LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
+# Build the host program.
+$(PROJECT): $(SRCS)
+	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
+# Run the test; the targets differ only in which Vortex driver directory is put on LD_LIBRARY_PATH.
+run-fpga: $(PROJECT) kernel.pocl
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-asesim: $(PROJECT) kernel.pocl
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-vlsim: $(PROJECT) kernel.pocl
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-simx: $(PROJECT) kernel.pocl
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+	cp -f args.bin $(PROJECT).args.bin
+
+run-rtlsim: $(PROJECT) kernel.pocl
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+.depend: $(SRCS)  # compiler-generated header dependencies of the host sources
+	$(CXX) $(CXXFLAGS) -MM $^ > .depend
+
+.PHONY: clean clean-all
+clean:  # remove host build products
+	rm -rf $(PROJECT) *.o .depend
+clean-all: clean  # additionally remove compiled kernels and dumps
+	rm -rf *.pocl *.dump
+
+ifeq ($(filter clean clean-all,$(MAKECMDGOALS)),)  # cleaning goals must not regenerate .depend just to delete it
+    -include .depend
+endif
diff --git a/tests/opencl/vecadd-loop/README b/tests/opencl/vecadd-loop/README
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/opencl/vecadd-loop/kernel.alll1hit.loop1000.cl b/tests/opencl/vecadd-loop/kernel.alll1hit.loop1000.cl
new file mode 100644
index 00000000..a610b183
--- /dev/null
+++ b/tests/opencl/vecadd-loop/kernel.alll1hit.loop1000.cl
@@ -0,0 +1,12 @@
+// Sums A+B over 1000 iterations per work-item, alternating between element gid and gid+1.
+// NOTE(review): the last work-item's gid+1 read looks out of bounds for size-element buffers - confirm.
+__kernel void vecadd (__global const float *A, __global const float *B, __global float *C)
+{
+  const int gid = get_global_id(0);
+  float acc = 0.;
+  for (int iter = 0; iter < 1000; ++iter) {
+    const int idx = gid + (iter % 2);  // even passes: gid, odd passes: gid + 1
+    acc += A[idx] + B[idx];
+  }
+  C[gid] = acc;  // single store of the accumulated value
+}
diff --git a/tests/opencl/vecadd-loop/kernel.cl b/tests/opencl/vecadd-loop/kernel.cl
new file mode 100644
index 00000000..535b2cf8
--- /dev/null
+++ b/tests/opencl/vecadd-loop/kernel.cl
@@ -0,0 +1,13 @@
+// Vector add repeated 500 times per work-item. The volatile qualifiers make every
+// iteration really reload A and B and store C. The store is a plain assignment, not
+// `+=`: the host creates C write-only, never uploads it, and verifies C == A + B,
+// so the kernel must not read C or accumulate into it.
+__kernel void vecadd_loop (__global volatile const float *A,
+	                    __global volatile const float *B,
+	                    __global volatile float *C)
+{
+  int gid = get_global_id(0);
+  for (int i = 0; i < 500; i++) {
+    C[gid] = A[gid] + B[gid];
+  }
+}
diff --git a/tests/opencl/vecadd-loop/kernel.cl.loop b/tests/opencl/vecadd-loop/kernel.cl.loop
new file mode 100644
index 00000000..9a4c2882
--- /dev/null
+++ b/tests/opencl/vecadd-loop/kernel.cl.loop
@@ -0,0 +1,9 @@
+// 100-iteration variant: each work-item recomputes C[idx] = A[idx] + B[idx] on every pass.
+__kernel void vecadd_loop (__global volatile const float *A, __global volatile const float *B,
+                    __global volatile float *C)
+{
+  const int idx = get_global_id(0);
+  for (int pass = 0; pass < 100; ++pass) {
+    C[idx] = A[idx] + B[idx];
+  }
+}
diff --git a/tests/opencl/vecadd-loop/kernel.pocl b/tests/opencl/vecadd-loop/kernel.pocl
new file mode 100644
index 00000000..8422ab30
Binary files /dev/null and b/tests/opencl/vecadd-loop/kernel.pocl differ
diff --git a/tests/opencl/vecadd-loop/main.cc b/tests/opencl/vecadd-loop/main.cc
new file mode 100644
index 00000000..f0eec447
--- /dev/null
+++ b/tests/opencl/vecadd-loop/main.cc
@@ -0,0 +1,250 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <math.h>
+#include <CL/opencl.h>
+#include <unistd.h> 
+#include <string.h>
+#include <chrono>
+
+#define KERNEL_NAME "vecadd_loop"  // name passed to clCreateKernel(); matches the __kernel function in kernel.cl
+
+// Runs an OpenCL call that returns cl_int; on failure prints it, calls cleanup() and exits with -1.
+#define CL_CHECK(_expr)                                                \
+   do {                                                                \
+     const cl_int _err = (_expr);                                      \
+     if (_err != CL_SUCCESS) {                                         \
+       printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
+       cleanup(); exit(-1);                                            \
+     }                                                                 \
+   } while (0)
+
+// For calls that report status through a trailing `&_err` argument: yields the call's result, or prints/cleans up/exits on failure (GNU statement expression).
+#define CL_CHECK2(_expr)                                               \
+   ({                                                                  \
+     cl_int _err = CL_INVALID_VALUE;                                   \
+     auto _handle = (_expr);                                           \
+     if (_err != CL_SUCCESS) {                                         \
+       printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
+       cleanup(); exit(-1);                                            \
+     }                                                                 \
+     _handle;                                                          \
+   })
+
+// Loads the whole of `filename` into a malloc'd buffer. On success stores the buffer in *data
+// (caller frees) and its length in *size and returns 0; returns -1 on bad arguments or any I/O failure.
+static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
+  if (nullptr == filename || nullptr == data || nullptr == size) return -1;
+  FILE* fp = fopen(filename, "rb");  // binary mode: the kernel image is not text
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to load kernel.\n");
+    return -1;
+  }
+  // Measure the file; ftell() reports -1 on error, which must never reach malloc().
+  long fsize = (0 == fseek(fp, 0, SEEK_END)) ? ftell(fp) : -1;
+  rewind(fp);
+  uint8_t* buf = (fsize > 0) ? (uint8_t*)malloc(fsize) : NULL;
+  // A short read means a truncated image, so it is treated as a failure too.
+  bool ok = (NULL != buf) && (fread(buf, 1, fsize, fp) == (size_t)fsize);
+  fclose(fp);
+  if (!ok) { fprintf(stderr, "Failed to load kernel.\n"); free(buf); return -1; }
+  *data = buf; *size = (size_t)fsize;
+  return 0;
+}
+
+// Writes `size` bytes from `data` to `filename`, replacing any existing file.
+// Returns 0 on success, -1 on bad arguments or any I/O error.
+static int write_operand_file(const char* filename, void* data, size_t size) {
+  if (nullptr == filename || nullptr == data || 0 == size)
+    return -1;
+  FILE* fp = fopen(filename, "wb");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to write operand data.\n");
+    return -1;
+  }
+
+  // Close on every path so the handle is not leaked and buffered data reaches the file;
+  // fclose() is also where deferred write errors are reported.
+  bool ok = (fwrite(data, size, 1, fp) == 1);
+  ok = (0 == fclose(fp)) && ok;
+  if (!ok) fprintf(stderr, "Failed to write operand data.\n");
+  return ok ? 0 : -1;
+}
+
+// True when a and b are at most `ulp` units in the last place apart (default 4).
+static bool almost_equal(float a, float b, int ulp = 4) {
+  // Map each bit pattern (memcpy, not union punning) onto a signed scale that is monotonic in the float value, so +0/-0 coincide; 64-bit math cannot overflow.
+  auto ordered = [](float f) -> long long { int32_t i; memcpy(&i, &f, sizeof(i)); return (i < 0) ? (long long)INT32_MIN - i : i; };
+  const long long dist = ordered(a) - ordered(b);
+  return (dist < 0 ? -dist : dist) <= ulp;
+}
+
+cl_device_id device_id = NULL;  // OpenCL handles are file-scope so cleanup() can release them
+cl_context context = NULL;
+cl_command_queue commandQueue = NULL;
+cl_program program = NULL;  // created in main() from the kernel_bin image
+cl_kernel kernel = NULL;
+cl_mem a_memobj = NULL;  // device buffer bound to kernel argument 0 (A)
+cl_mem b_memobj = NULL;  // device buffer bound to kernel argument 1 (B)
+cl_mem c_memobj = NULL;  // device buffer bound to kernel argument 2 (C)
+float *h_a = NULL;  // host copy of input A
+float *h_b = NULL;  // host copy of input B
+float *h_c = NULL;  // host buffer receiving the result C
+uint8_t *kernel_bin = NULL;  // malloc'd contents of kernel.pocl
+
+// Releases whichever OpenCL objects and host buffers exist so far; handles that are
+// still NULL are skipped, so this is callable from any failure point.
+static void cleanup() {
+  if (NULL != commandQueue) clReleaseCommandQueue(commandQueue);
+  if (NULL != kernel)       clReleaseKernel(kernel);
+  if (NULL != program)      clReleaseProgram(program);
+  if (NULL != a_memobj)     clReleaseMemObject(a_memobj);
+  if (NULL != b_memobj)     clReleaseMemObject(b_memobj);
+  if (NULL != c_memobj)     clReleaseMemObject(c_memobj);
+  if (NULL != context)      clReleaseContext(context);
+  if (NULL != device_id)    clReleaseDevice(device_id);
+  // Host allocations: free() accepts NULL, so no guards are needed.
+  free(kernel_bin);
+  free(h_a); free(h_b); free(h_c);
+}
+
+// Number of vector elements (and work-items) to process; set by the -n option.
+int size = 64;
+
+// Prints the command-line synopsis to stdout.
+static void show_usage() { fputs("Usage: [-n size] [-h: help]\n", stdout); }
+
+// Parses the command line: -n <size> sets the element count, -h/-? print usage. Exits on help, an unknown option or an invalid size.
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:h?")) != -1) {
+    switch (c) {
+    case 'n':
+      size = atoi(optarg);
+      if (size <= 0) {  // atoi() yields 0 for non-numeric text; a non-positive count cannot size the buffers
+        fprintf(stderr, "Error: invalid size '%s' (expected a positive integer)\n", optarg); exit(-1);
+      }
+      break;
+    case 'h':
+    case '?':  // getopt() also returns '?' for an unrecognized option
+      show_usage(); exit(0);
+    default:
+      show_usage(); exit(-1);
+    }
+  }
+  printf("Workload size=%d\n", size);
+}
+
+// Host side of the test: loads kernel.pocl, runs KERNEL_NAME over `size` elements with OpenCL
+// and checks C[i] == A[i] + B[i]. Returns EXIT_SUCCESS only when every element matches.
+int main (int argc, char **argv) {
+  parse_args(argc, argv);  // parse command arguments
+  cl_platform_id platform_id;
+  size_t kernel_size;
+  cl_int binary_status;
+
+  // read kernel binary from file
+  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+    return -1;
+
+  // Getting platform and device information
+  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
+  CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
+
+  printf("Create context\n");
+  context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL,  &_err));
+
+  printf("Allocate device buffers\n");
+  size_t nbytes = size * sizeof(float);
+  a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
+  b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
+  c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
+
+  printf("Create program from kernel source\n");
+  program = CL_CHECK2(clCreateProgramWithBinary(
+    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
+  if (program == NULL) {
+    cleanup();
+    return -1;
+  }
+
+  // Build program
+  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
+
+  // Create kernel
+  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
+
+  // Set kernel arguments
+  // NOTE(hansung): clSetKernelArg doesn't seem to incur any device-specific
+  // operation
+  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj));
+
+  // Allocate memories for input arrays and output arrays.
+  h_a = (float*)malloc(nbytes);
+  h_b = (float*)malloc(nbytes);
+  h_c = (float*)malloc(nbytes);
+  if (!h_a || !h_b || !h_c) { fprintf(stderr, "Failed to allocate host buffers.\n"); cleanup(); return -1; }
+  // Initialize values for array members.
+  for (int i = 0; i < size; ++i) {
+    h_a[i] = sinf(i)*sinf(i);
+    h_b[i] = cosf(i)*cosf(i);
+    h_c[i] = 0xdeadbeef;  // sentinel, overwritten by the read-back below
+    //printf("*** [%d]: h_a=%f, h_b=%f\n", i, h_a[i], h_b[i]);
+  }
+
+  // NOTE(hansung): Dump operand buffer to a file
+  if (write_operand_file("vecadd.input.a.bin", h_a, nbytes) != 0 ||
+      write_operand_file("vecadd.input.b.bin", h_b, nbytes) != 0) {
+    cleanup(); return EXIT_FAILURE;  // release everything created above before bailing out
+  }
+
+  // Creating command queue
+  // NOTE(hansung): The 3rd properties arg is a bit-field, where fields like
+  // CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE can be set.  With value of 0,
+  // nothing is set and the commands in the queue will be completed in-order.
+  // See OpenCL 1.2 spec, section 5.1
+  commandQueue = CL_CHECK2(clCreateCommandQueue(
+      context, device_id, 0 /* command-queue properties */, &_err));
+
+  printf("Upload source buffers\n");
+  CL_CHECK(clEnqueueWriteBuffer(commandQueue, a_memobj, CL_TRUE, 0, nbytes, h_a, 0, NULL, NULL));
+  CL_CHECK(clEnqueueWriteBuffer(commandQueue, b_memobj, CL_TRUE, 0, nbytes, h_b, 0, NULL, NULL));
+
+  printf("Execute the kernel\n");
+  size_t global_work_size[1] = {static_cast<size_t>(size)};  // explicit cast: int -> size_t inside braces is a narrowing conversion
+  size_t local_work_size[1] = {1};
+  auto time_start = std::chrono::high_resolution_clock::now();
+  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+  // NOTE(hansung): clFinish blocks until all kernels in the command queue are
+  // finished.  This seems to be what actually kicks off kernel execution.
+  CL_CHECK(clFinish(commandQueue));
+  auto time_end = std::chrono::high_resolution_clock::now();
+  double elapsed = std::chrono::duration<double, std::milli>(time_end - time_start).count();  // fractional ms, not truncated
+  printf("Elapsed time: %lg ms\n", elapsed);
+
+  printf("Download destination buffer\n");
+  CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c, 0, NULL, NULL));
+
+  printf("Verify result\n");
+  int errors = 0;
+  for (int i = 0; i < size; ++i) {
+    float ref = h_a[i] + h_b[i];
+    if (!almost_equal(h_c[i], ref)) {
+      if (errors < 100)
+        printf("*** error: [%d] expected=%f, actual=%f, a=%f, b=%f\n", i, ref, h_c[i], h_a[i], h_b[i]);
+      ++errors;
+    }
+  }
+  if (0 == errors) {
+    printf("PASSED!\n");
+  } else {
+    printf("FAILED! - %d errors\n", errors);
+  }
+
+  // Clean up
+  cleanup();
+  // Exit statuses keep only the low 8 bits, so a raw count such as 256 would read as success.
+  return (0 == errors) ? EXIT_SUCCESS : EXIT_FAILURE;
+}