Add vecadd-loop

This is the same kernel as vecadd but repeated in a for-loop many times so that the runtime overhead at the startup is amortized.
Force per-global-thread stack allocation
2023-10-31 23:35:11 -07:00 · 2023-10-22 01:56:14 -07:00 · 2023-10-07 21:22:45 -07:00 · 2023-10-07 21:18:27 -07:00 · 2023-10-07 19:15:10 -07:00 · 2023-10-06 13:58:25 -07:00
60 changed files with 5581 additions and 48 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,10 @@
+*.dump
+*.o
+*.bin
+*.elf
+.depend
+*.a
+*.so
+*.log
+*.vcd
+blackbox.*.cache
--- a/driver/common/vx_malloc.h
+++ b/driver/common/vx_malloc.h
@@ -29,6 +29,10 @@ public:
        }
    }

+    // NOTE(hansung): This is code running on the CPU, but CPU is still the one
+    // that keeps track of allocation of the GPU memory.  GPU kernel simply runs
+    // assuming that CPU has done the right thing and returned a safe and valid
+    // chunk of memory.
    int allocate(uint64_t size, uint64_t* addr) {
        if (size == 0 || addr == nullptr)
            return -1;
--- a/driver/common/vx_utils.h
+++ b/driver/common/vx_utils.h
@@ -7,5 +7,12 @@ uint64_t aligned_size(uint64_t size, uint64_t alignment);
 bool is_aligned(uint64_t addr, uint64_t alignment);

 #define CACHE_BLOCK_SIZE    64
-#define ALLOC_BASE_ADDR     0x00000000
-#define LOCAL_MEM_SIZE      4294967296     // 4 GB 
+// NOTE(hansung): This is changed to something more akin to be in a heap area
+// for a CPU userspace program, since that works better with Chipyard's default
+// memory mapping scheme (0x80000000 and above).  This gives us a pretty small
+// space though.
+#define ALLOC_BASE_ADDR     0xc0000000ul
+#define LOCAL_MEM_SIZE      0x40000000ul     // 1 GB 
+// #define ALLOC_BASE_ADDR     0x00000000
+// #define LOCAL_MEM_SIZE      4294967296     // 4 GB 
+#define DEVICE_MAX_ADDR     0xfffffffful
--- a/driver/rtlsim/vortex.cpp
+++ b/driver/rtlsim/vortex.cpp
@@ -86,7 +86,7 @@ public:

    int upload(const void* src, uint64_t dest_addr, uint64_t size, uint64_t src_offset) {
        uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
-        if (dest_addr + asize > LOCAL_MEM_SIZE)
+        if (dest_addr + asize > DEVICE_MAX_ADDR)
            return -1;

        /*printf("VXDRV: upload %ld bytes from 0x%lx:", size, uintptr_t((uint8_t*)src + src_offset));
@@ -104,7 +104,7 @@ public:

    int download(void* dest, uint64_t src_addr, uint64_t size, uint64_t dest_offset) {
        uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
-        if (src_addr + asize > LOCAL_MEM_SIZE)
+        if (src_addr + asize > DEVICE_MAX_ADDR)
            return -1;

        ram_.read((uint8_t*)dest + dest_offset, src_addr, asize);
--- a/driver/simx/vortex.cpp
+++ b/driver/simx/vortex.cpp
@@ -93,7 +93,7 @@ public:

    int upload(const void* src, uint64_t dest_addr, uint64_t size, uint64_t src_offset) {
        uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
-        if (dest_addr + asize > LOCAL_MEM_SIZE)
+        if (dest_addr + asize > DEVICE_MAX_ADDR)
            return -1;

        ram_.write((const uint8_t*)src + src_offset, dest_addr, asize);
@@ -108,7 +108,7 @@ public:

    int download(void* dest, uint64_t src_addr, uint64_t size, uint64_t dest_offset) {
        uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
-        if (src_addr + asize > LOCAL_MEM_SIZE)
+        if (src_addr + asize > DEVICE_MAX_ADDR)
            return -1;

        ram_.read((uint8_t*)dest + dest_offset, src_addr, asize);
--- a/env.my-pocl-riscv32.sh
+++ b/env.my-pocl-riscv32.sh
@@ -0,0 +1,7 @@
+export VORTEX_ENV="my-pocl-riscv32"
+export LLVM_PREFIX=/scratch/hansung/build/llvm-riscv32-unknown-linux-gnu-10.0.1
+export POCL_CC_PATH=/scratch/hansung/build/pocl-riscv32/compiler
+export POCL_RT_PATH=/scratch/hansung/build/pocl-riscv32/runtime
+export VERILATOR_ROOT=/scratch/hansung/build/vortex-toolchain-prebuilt/verilator
+export RISCV_TOOLCHAIN_PATH=/scratch/hansung/build/vortex-toolchain-prebuilt
+export PATH="/scratch/hansung/build/vortex-toolchain-prebuilt/verilator/bin:$PATH"
--- a/env.my-pocl.sh
+++ b/env.my-pocl.sh
@@ -0,0 +1,7 @@
+export VORTEX_ENV="my-pocl"
+export LLVM_PREFIX=/scratch/hansung/build/vortex-toolchain-prebuilt/llvm-riscv/
+export POCL_CC_PATH=/scratch/hansung/build/pocl-vortex/compiler
+export POCL_RT_PATH=/scratch/hansung/build/pocl-vortex/runtime
+export VERILATOR_ROOT=/scratch/hansung/build/vortex-toolchain-prebuilt/verilator
+export RISCV_TOOLCHAIN_PATH=/scratch/hansung/build/vortex-toolchain-prebuilt
+export PATH="/scratch/hansung/build/vortex-toolchain-prebuilt/verilator/bin:$PATH"
--- a/env.vortex-prebuilt.sh
+++ b/env.vortex-prebuilt.sh
@@ -0,0 +1,17 @@
+if [ -n "$VORTEX_ENV" ]
+then
+    echo "VORTEX_ENV already set. Exiting."
+    return
+fi
+
+# PREBUILT_DIR=/scratch/hansung/build/vortex-toolchain-prebuilt-d2ba5df-230831
+PREBUILT_DIR=/scratch/hansung/build/vortex-toolchain-prebuilt-230831
+
+export VORTEX_ENV="vortex-prebuilt"
+export LLVM_PREFIX=$PREBUILT_DIR/llvm-riscv/
+export POCL_CC_PATH=$PREBUILT_DIR/pocl/compiler
+export POCL_RT_PATH=$PREBUILT_DIR/pocl/runtime
+export VERILATOR_ROOT=$PREBUILT_DIR/verilator
+export RISCV_TOOLCHAIN_PATH=$PREBUILT_DIR/
+export PATH="$BUILDDIR/vortex-toolchain-prebuilt-d2ba5df-230831/verilator/bin:$PATH"
+export PS1="($VORTEX_ENV) $PS1"
--- a/env.vortex-prebuilt2.sh
+++ b/env.vortex-prebuilt2.sh
@@ -0,0 +1,7 @@
+export VORTEX_ENV="vortex-prebuilt2"
+export LLVM_PREFIX=/scratch/hansung/build/vortex-toolchain-prebuilt/llvm-riscv2
+export POCL_CC_PATH=/scratch/hansung/build/vortex-toolchain-prebuilt/pocl2/compiler
+export POCL_RT_PATH=/scratch/hansung/build/vortex-toolchain-prebuilt/pocl2/runtime
+export VERILATOR_ROOT=/scratch/hansung/build/vortex-toolchain-prebuilt/verilator
+export RISCV_TOOLCHAIN_PATH=/scratch/hansung/build/vortex-toolchain-prebuilt
+export PATH="/scratch/hansung/build/vortex-toolchain-prebuilt/verilator/bin:$PATH"
--- a/hw/rtl/VX_execute.sv
+++ b/hw/rtl/VX_execute.sv
@@ -9,6 +9,7 @@ module VX_execute #(
    input wire reset,    

    // Dcache interface
+    // NOTE(hansung): this comes out of VX_lsu_unit
    VX_dcache_req_if.master dcache_req_if,
    VX_dcache_rsp_if.slave dcache_rsp_if,

--- a/hw/rtl/VX_lsu_unit.sv
+++ b/hw/rtl/VX_lsu_unit.sv
@@ -34,7 +34,7 @@ module VX_lsu_unit #(
    wire [`INST_LSU_BITS-1:0]     req_type;
    wire [`NUM_THREADS-1:0][31:0] req_data;   
    wire [`NR_BITS-1:0]           req_rd;
-    wire                          req_wb;
+    wire                          req_wb; // NOTE(hansung): 0:load, 1:store
    wire [`NW_BITS-1:0]           req_wid;
    wire [31:0]                   req_pc;
    wire                          req_is_dup;
--- a/hw/rtl/cache/VX_cache.sv
+++ b/hw/rtl/cache/VX_cache.sv
@@ -250,6 +250,8 @@ module VX_cache #(
    wire [MEM_TAG_IN_WIDTH-1:0]     mem_rsp_tag_c;
    wire                            mem_rsp_ready_c;

+    // NOTE(hansung): non-cacheable addresses. Although is this applied for
+    // all address range?
    if (NC_ENABLE) begin
        VX_nc_bypass #( 
            .NUM_PORTS         (NUM_PORTS),
--- a/hw/rtl/cache/VX_cache_define.vh
+++ b/hw/rtl/cache/VX_cache_define.vh
@@ -55,6 +55,7 @@

 ///////////////////////////////////////////////////////////////////////////////

+// NOTE(hansung): what does CORE_TAG_ID_BITS == 0 mean?
 `define CORE_RSP_TAGS           ((CORE_TAG_ID_BITS != 0) ? 1 : NUM_REQS)

 `define LINE_TO_MEM_ADDR(x, i)  {x, `BANK_SELECT_BITS'(i)}
--- a/hw/rtl/cache/VX_core_req_bank_sel.sv
+++ b/hw/rtl/cache/VX_core_req_bank_sel.sv
@@ -53,6 +53,7 @@ module VX_core_req_bank_sel #(
        
    wire [NUM_REQS-1:0][`LINE_ADDR_WIDTH-1:0]       core_req_line_addr;
    wire [NUM_REQS-1:0][`UP(`WORD_SELECT_BITS)-1:0] core_req_wsel;
+    // NOTE(hansung): "bank id"
    wire [NUM_REQS-1:0][`UP(`BANK_SELECT_BITS)-1:0] core_req_bid;

    for (genvar i = 0; i < NUM_REQS; i++) begin    
@@ -123,6 +124,9 @@ module VX_core_req_bank_sel #(
                    per_bank_core_req_tid_r   = 'x;
                    req_select_table_r        = 'x;

+                    // NOTE(hansung): if we're simply overwriting assignment in
+                    // a loop with decrementing index, wouldn't this be unfair
+                    // for reqs with higher index?
                    for (integer i = NUM_REQS-1; i >= 0; --i) begin
                        if (core_req_valid[i]) begin                    
                            per_bank_core_req_valid_r[core_req_bid[i]]                 = 1;
@@ -184,6 +188,8 @@ module VX_core_req_bank_sel #(
            end

        end else begin
+            // NOTE(hansung): this is what the default config elaborates, i.e.
+            // NUM_REQS > 1, NUM_PORTS == 1

            always @(*) begin
                per_bank_core_req_valid_r = 0;
@@ -204,6 +210,8 @@ module VX_core_req_bank_sel #(
                        per_bank_core_req_byteen_r[core_req_bid[i]]= core_req_byteen[i];
                        per_bank_core_req_data_r[core_req_bid[i]]  = core_req_data[i];
                        per_bank_core_req_tag_r[core_req_bid[i]]   = core_req_tag[i];
+                        // NOTE(hansung): this marks which req 'won' mapping
+                        // to this bank eventually
                        per_bank_core_req_tid_r[core_req_bid[i]]   = `REQS_BITS'(i);
                    end
                end
@@ -216,6 +224,7 @@ module VX_core_req_bank_sel #(
                    core_req_ready_r = 0;
                    for (integer i = 0; i < NUM_BANKS; ++i) begin
                        if (per_bank_core_req_valid_r[i]) begin
+                            // NOTE(hansung): this flows back to upstream
                            core_req_ready_r[per_bank_core_req_tid_r[i]] = per_bank_core_req_ready[i];
                        end
                    end
--- a/runtime/Makefile
+++ b/runtime/Makefile
@@ -1,9 +1,9 @@
 XLEN ?= 32

 ifeq ($(XLEN),32)
-RISCV_TOOLCHAIN_PATH = /opt/riscv-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
 else
-RISCV_TOOLCHAIN_PATH = /opt/riscv64-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
 endif

 RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf-
--- a/runtime/src/vx_spawn.c
+++ b/runtime/src/vx_spawn.c
@@ -97,6 +97,8 @@ static void spawn_tasks_rem_cb(int thread_mask) {
  vx_tmc(1);
 }

+// NOTE(hansung): where is this used? The main section in the POCL binary calls
+// `vx_spawn_kernel` but not this one
 void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) {
 	// device specs
  int NC = vx_num_cores();
@@ -282,8 +284,11 @@ void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg) {

  //--
  wspawn_kernel_args_t wspawn_args = {
-    ctx, callback, arg, core_id * wgs_per_core, fW, rW, 0, isXYpow2, isXpow2, log2XY, log2X 
-  };
+      ctx,      callback, arg,      core_id * wgs_per_core /*offset*/,
+      fW /*N*/, rW /*R*/, 0 /*NW*/, isXYpow2,
+      isXpow2,  log2XY,   log2X};
+
+  // NOTE(hansung): core_id is capped at NUM_CORES_MAX = 32
  g_wspawn_args[core_id] = &wspawn_args;

  //--
--- a/runtime/src/vx_start.S
+++ b/runtime/src/vx_start.S
@@ -77,11 +77,14 @@ init_regs:
  # allocate stack region for a threads on the processor 
  # set stack pointer
  li sp, SMEM_BASE_ADDR # load stack base address
-  #if SM_ENABLE
-  csrr a0, CSR_LTID    # get local thread id
-  #else
+  # NOTE(hansung): Force per-global-thread stack allocation, since
+  # we're experimenting with different memory hierarchy (i.e. no private cache)
+  # and it's easy to miss setting SM_ENABLE accordingly.
+  # #if SM_ENABLE
+  # csrr a0, CSR_LTID    # get local thread id
+  # #else
  csrr a0, CSR_GTID    # get global thread id
-  #endif
+  # #endif
  sll  a1, a0, STACK_LOG2_SIZE
  sub  sp, sp, a1

--- a/sim/simx/execute.cpp
+++ b/sim/simx/execute.cpp
@@ -691,7 +691,13 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
        uint64_t mem_data = 0;
        core_->dcache_read(&mem_data, mem_addr, mem_bytes);
        trace->mem_addrs.at(t).push_back({mem_addr, mem_bytes});
-        DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << mem_addr << ", DATA=0x" << mem_data);
+        DP(2, "LOAD MEM: CYCLE=" << std::dec << SimPlatform::instance().cycles()
+              << ", CORE=" << core_->id()
+              << ", WARP=" << id_
+              << ", THREAD=" << t
+              << ", ADDRESS=0x" << std::hex << mem_addr
+              << ", DATA=0x" << mem_data << std::dec
+              << ", BYTES=" << mem_bytes);
        switch (func3) {
        case 0:
          // RV32I: LB
@@ -731,7 +737,14 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
          core_->dcache_read(&mem_data, mem_addr, 4);
          Word *result_ptr = (Word *)(vd.data() + i);
          *result_ptr = mem_data;
-          DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << mem_addr << ", DATA=0x" << mem_data);        
+          DP(2, "LOAD MEM: CYCLE=" << std::dec << SimPlatform::instance().cycles()
+                << ", CORE=" << core_->id()
+                << ", WARP=" << id_
+                << ", VLEN=" << vl_
+                << ", VID=" << i
+                << ", ADDRESS=0x" << std::hex << mem_addr
+                << ", DATA=0x" << mem_data << std::dec
+                << ", BYTES=" << 4);
        }
        break;
      } 
@@ -762,7 +775,13 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
          mem_data &= mask;
        }
        trace->mem_addrs.at(t).push_back({mem_addr, mem_bytes});        
-        DP(4, "STORE MEM: ADDRESS=0x" << std::hex << mem_addr << ", DATA=0x" << mem_data);
+        DP(2, "STORE MEM: CYCLE=" << std::dec << SimPlatform::instance().cycles()
+              << ", CORE=" << core_->id()
+              << ", WARP=" << id_
+              << ", THREAD=" << t
+              << ", ADDRESS=0x" << std::hex << mem_addr
+              << ", DATA=0x" << mem_data << std::dec
+              << ", BYTES=" << mem_bytes);
        switch (func3) {
        case 0:
        case 1:
@@ -782,7 +801,14 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
          // store word and unit strided (not checking for unit stride)          
          uint32_t mem_data = *(uint32_t *)(vreg_file_.at(instr.getVs3()).data() + i);
          core_->dcache_write(&mem_data, mem_addr, 4);
-          DP(4, "STORE MEM: ADDRESS=0x" << std::hex << mem_addr << ", DATA=0x" << mem_data);
+          DP(2, "STORE MEM: CYCLE=" << std::dec << SimPlatform::instance().cycles()
+                << ", CORE=" << core_->id()
+                << ", WARP=" << id_
+                << ", VLEN=" << vl_
+                << ", VID=" << i
+                << ", ADDRESS=0x" << std::hex << mem_addr
+                << ", DATA=0x" << mem_data << std::dec
+                << ", BYTES=" << 4);
          break;
        } 
        default:
@@ -878,6 +904,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
  case FENCE: {
    trace->exe_type = ExeType::LSU;    
    trace->lsu.type = LsuType::FENCE;
+    DP(2, "FENCE MEM");
    break;
  }   
  case FCI: {       
@@ -1304,6 +1331,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
      } else {
        tmask_.reset();
        for (uint32_t t = 0; t < num_threads; ++t) {
+          // NOTE(hansung): `ts` is the left-most lane currently enabled.
+          // Doing this only respects the operand of that lane, even though
+          // every lane might have different operand for the tmask.
          tmask_.set(t, rsdata.at(ts)[0].i & (1 << t));
        }
      }
@@ -1397,6 +1427,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
      trace->exe_type = ExeType::LSU; 
      trace->lsu.type = LsuType::PREFETCH; 
      trace->used_iregs.set(rsrc0);
+      DP(2, "PREFETCH MEM");
      for (uint32_t t = 0; t < num_threads; ++t) {
        if (!tmask_.test(t))
          continue;
--- a/sim/simx/tex_unit.h
+++ b/sim/simx/tex_unit.h
@@ -1,5 +1,6 @@
 #pragma once

+#include <array>
 #include "types.h"

 namespace vortex {
--- a/tests/opencl/DotProduct/Makefile
+++ b/tests/opencl/DotProduct/Makefile
@@ -37,7 +37,7 @@ PROJECT=DotProduct
 all: $(PROJECT).dump $(PROJECT).hex

 lib$(PROJECT).a: DotProduct.cl
-	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
+	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib:$(LLVM_PREFIX)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a DotProduct.cl

 $(PROJECT).elf: main.cc lib$(PROJECT).a
 	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $(PROJECT).elf
--- a/tests/opencl/convolution/.depend
+++ b/tests/opencl/convolution/.depend
@@ -0,0 +1,6 @@
+main.o: main.cpp \
+ /scratch/hansung/build/vortex-toolchain-prebuilt/pocl/runtime/include/CL/cl.h \
+ /scratch/hansung/build/vortex-toolchain-prebuilt/pocl/runtime/include/CL/cl_version.h \
+ /scratch/hansung/build/vortex-toolchain-prebuilt/pocl/runtime/include/CL/cl_platform.h \
+ utils.h
+utils.o: utils.cpp utils.h
--- a/tests/opencl/convolution/Makefile
+++ b/tests/opencl/convolution/Makefile
@@ -17,7 +17,7 @@ CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors

 CXXFLAGS += -I$(POCL_RT_PATH)/include

-LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/simx -lOpenCL -lvortex
+LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/simx -Wl,-rpath $(VORTEX_DRV_PATH)/simx -lOpenCL -lvortex

 # Debugigng
 ifdef DEBUG
--- a/tests/opencl/reduce0/Makefile
+++ b/tests/opencl/reduce0/Makefile
@@ -37,7 +37,7 @@ PROJECT=reduce0
 all: $(PROJECT).dump $(PROJECT).hex

 lib$(PROJECT).a: oclReduction_kernel.cl
-	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
+	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib:$(LLVM_PREFIX)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a oclReduction_kernel.cl

 $(PROJECT).elf: main.cc lib$(PROJECT).a
 	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $(PROJECT).elf
--- a/tests/opencl/saxpy/Makefile
+++ b/tests/opencl/saxpy/Makefile
@@ -37,7 +37,7 @@ SRCS = main.cc
 all: $(PROJECT) kernel.pocl

 kernel.pocl: kernel.cl
-	LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
+	LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
 
 $(PROJECT): $(SRCS)
 	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
--- a/tests/opencl/saxpy/main.cc
+++ b/tests/opencl/saxpy/main.cc
@@ -78,6 +78,25 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
  return 0;
 }

+static int write_operand_file(const char* filename, void* data, size_t size) {
+  if (nullptr == filename || nullptr == data || 0 == size)
+    return -1;
+
+  FILE* fp = fopen(filename, "wb");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to write operand data.\n");
+    return -1;
+  }
+
+  size_t wsize = fwrite(data, size, 1, fp);
+  if (wsize != 1) {
+    fprintf(stderr, "Failed to write operand data.\n");
+    return -1;
+  }
+
+  return 0;
+}
+
 uint8_t *kernel_bin = NULL;

 ///
@@ -209,6 +228,11 @@ int main(int argc, char **argv) {
  for (int i = 0; i < size; i++) {
    h_src[i] = ((float)rand() / (float)(RAND_MAX)) * 100.0;
  }
+
+  // NOTE(hansung): Dump operand buffer to a file
+  if (write_operand_file("saxpy.input.src.bin", h_src, nbytes) != 0)
+    return EXIT_FAILURE;
+
  CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, 0, nbytes, h_src, 0, NULL, NULL));
  free(h_src);

--- a/tests/opencl/sgemm/Makefile
+++ b/tests/opencl/sgemm/Makefile
@@ -37,7 +37,7 @@ SRCS = main.cc
 all: $(PROJECT) kernel.pocl

 kernel.pocl: kernel.cl
-	LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
+	LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
 
 $(PROJECT): $(SRCS)
 	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
--- a/tests/opencl/sgemm/main.cc
+++ b/tests/opencl/sgemm/main.cc
@@ -52,6 +52,25 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
  return 0;
 }

+static int write_operand_file(const char* filename, void* data, size_t size) {
+  if (nullptr == filename || nullptr == data || 0 == size)
+    return -1;
+
+  FILE* fp = fopen(filename, "wb");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to write operand data.\n");
+    return -1;
+  }
+
+  size_t wsize = fwrite(data, size, 1, fp);
+  if (wsize != 1) {
+    fprintf(stderr, "Failed to write operand data.\n");
+    return -1;
+  }
+
+  return 0;
+}
+
 static void matmul(float *C, const float* A, const float *B, int M, int N, int K) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
@@ -194,6 +213,12 @@ int main (int argc, char **argv) {
    //printf("*** [%d]: h_a=%f, h_b=%f\n", i, h_a[i], h_b[i]);
  }

+  // NOTE(hansung): Dump operand buffer to a file
+  if (write_operand_file("sgemm.input.a.bin", h_a, nbytes) != 0)
+    return EXIT_FAILURE;
+  if (write_operand_file("sgemm.input.b.bin", h_b, nbytes) != 0)
+    return EXIT_FAILURE;
+
  // Creating command queue
  commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));  

--- a/tests/opencl/tid/.gitignore
+++ b/tests/opencl/tid/.gitignore
@@ -0,0 +1 @@
+tid
--- a/tests/opencl/tid/Makefile
+++ b/tests/opencl/tid/Makefile
@@ -0,0 +1,71 @@
+XLEN ?= 32
+
+LLVM_PREFIX ?= /opt/llvm-riscv
+RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
+SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
+POCL_CC_PATH ?= /opt/pocl/compiler
+POCL_RT_PATH ?= /opt/pocl/runtime
+
+OPTS ?= -n64
+
+VORTEX_DRV_PATH ?= $(realpath ../../../driver)
+VORTEX_RT_PATH ?= $(realpath ../../../runtime)
+
+K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
+K_CFLAGS   += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
+K_LDFLAGS  += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
+
+CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
+
+CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing
+
+CXXFLAGS += -I$(POCL_RT_PATH)/include
+
+LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
+
+# Debugigng
+ifdef DEBUG
+	CXXFLAGS += -g -O0
+else    
+	CXXFLAGS += -O2 -DNDEBUG
+endif
+
+PROJECT = tid
+
+SRCS = main.cc
+
+all: $(PROJECT) kernel.pocl
+
+kernel.pocl: kernel.cl
+	LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
+
+$(PROJECT): $(SRCS)
+	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
+
+run-fpga: $(PROJECT) kernel.pocl   
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-asesim: $(PROJECT) kernel.pocl   
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-vlsim: $(PROJECT) kernel.pocl   
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-simx: $(PROJECT) kernel.pocl   
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-rtlsim: $(PROJECT) kernel.pocl   
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+.depend: $(SRCS)
+	$(CXX) $(CXXFLAGS) -MM $^ > .depend;
+
+clean:
+	rm -rf $(PROJECT) *.o .depend 
+
+clean-all: clean
+	rm -rf *.pocl *.dump
+
+ifneq ($(MAKECMDGOALS),clean)
+    -include .depend
+endif
--- a/tests/opencl/tid/kernel.cl
+++ b/tests/opencl/tid/kernel.cl
@@ -0,0 +1,6 @@
+__kernel void tid()
+{
+  __global int *out = (__global int *)0xc0000000;
+  int gid = get_global_id(0);
+  out[gid] = gid;
+}
--- a/tests/opencl/tid/main.cc
+++ b/tests/opencl/tid/main.cc
@@ -0,0 +1,221 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <math.h>
+#include <CL/opencl.h>
+#include <unistd.h> 
+#include <string.h>
+#include <chrono>
+
+#define KERNEL_NAME "tid"
+
+#define CL_CHECK(_expr)                                                \
+   do {                                                                \
+     cl_int _err = _expr;                                              \
+     if (_err == CL_SUCCESS)                                           \
+       break;                                                          \
+     printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);   \
+	 cleanup();			                                                     \
+     exit(-1);                                                         \
+   } while (0)
+
+#define CL_CHECK2(_expr)                                               \
+   ({                                                                  \
+     cl_int _err = CL_INVALID_VALUE;                                   \
+     decltype(_expr) _ret = _expr;                                     \
+     if (_err != CL_SUCCESS) {                                         \
+       printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
+	   cleanup();			                                                   \
+       exit(-1);                                                       \
+     }                                                                 \
+     _ret;                                                             \
+   })
+
+static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
+  if (nullptr == filename || nullptr == data || 0 == size)
+    return -1;
+
+  FILE* fp = fopen(filename, "r");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to load kernel.");
+    return -1;
+  }
+  fseek(fp , 0 , SEEK_END);
+  long fsize = ftell(fp);
+  rewind(fp);
+
+  *data = (uint8_t*)malloc(fsize);
+  *size = fread(*data, 1, fsize, fp);
+  
+  fclose(fp);
+  
+  return 0;
+}
+
+static int write_operand_file(const char* filename, void* data, size_t size) {
+  if (nullptr == filename || nullptr == data || 0 == size)
+    return -1;
+
+  FILE* fp = fopen(filename, "wb");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to write operand data.\n");
+    return -1;
+  }
+
+  size_t wsize = fwrite(data, size, 1, fp);
+  if (wsize != 1) {
+    fprintf(stderr, "Failed to write operand data.\n");
+    return -1;
+  }
+
+  return 0;
+}
+
+cl_device_id device_id = NULL;
+cl_context context = NULL;
+cl_command_queue commandQueue = NULL;
+cl_program program = NULL;
+cl_kernel kernel = NULL;
+cl_mem a_memobj = NULL;
+cl_mem b_memobj = NULL;
+cl_mem c_memobj = NULL;  
+float *h_a = NULL;
+float *h_b = NULL;
+float *h_c = NULL;
+uint8_t *kernel_bin = NULL;
+
+static void cleanup() {
+  if (commandQueue) clReleaseCommandQueue(commandQueue);
+  if (kernel) clReleaseKernel(kernel);
+  if (program) clReleaseProgram(program);
+  if (a_memobj) clReleaseMemObject(a_memobj);
+  if (b_memobj) clReleaseMemObject(b_memobj);
+  if (c_memobj) clReleaseMemObject(c_memobj);  
+  if (context) clReleaseContext(context);
+  if (device_id) clReleaseDevice(device_id);
+  
+  if (kernel_bin) free(kernel_bin);
+  if (h_a) free(h_a);
+  if (h_b) free(h_b);
+  if (h_c) free(h_c);
+}
+
+int size = 64;
+
+static void show_usage() {
+  printf("Usage: [-n size] [-h: help]\n");
+}
+
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:h?")) != -1) {
+    switch (c) {
+    case 'n':
+      size = atoi(optarg);
+      break;
+    case 'h':
+    case '?': {
+      show_usage();
+      exit(0);
+    } break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+
+  printf("Workload size=%d\n", size);
+}
+
+int main (int argc, char **argv) {
+  // parse command arguments
+  parse_args(argc, argv);
+  
+  cl_platform_id platform_id;
+  size_t kernel_size;
+  cl_int binary_status;
+
+  // read kernel binary from file  
+  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+    return -1;
+
+  // Getting platform and device information
+  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
+  CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
+
+  printf("Create context\n");
+  context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL,  &_err));
+
+  printf("Allocate device buffers\n");
+  size_t nbytes = size * sizeof(float);
+  // a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
+  // b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
+  // c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
+
+  printf("Create program from kernel source\n");
+  program = CL_CHECK2(clCreateProgramWithBinary(
+    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
+  if (program == NULL) {
+    cleanup();
+    return -1;
+  }
+
+  // Build program
+  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
+
+  // Create kernel
+  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
+
+  // Set kernel arguments
+  // CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));	
+  // CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj));	
+  // CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj));
+
+  // Allocate memories for input arrays and output arrays.    
+  h_a = (float*)malloc(nbytes);
+  h_b = (float*)malloc(nbytes);
+  h_c = (float*)malloc(nbytes);	
+
+  // Creating command queue
+  commandQueue = CL_CHECK2(clCreateCommandQueue(
+      context, device_id, 0 /* command-queue properties */, &_err));
+
+  // printf("Upload source buffers\n");
+  // CL_CHECK(clEnqueueWriteBuffer(commandQueue, a_memobj, CL_TRUE, 0, nbytes, h_a, 0, NULL, NULL));
+  // CL_CHECK(clEnqueueWriteBuffer(commandQueue, b_memobj, CL_TRUE, 0, nbytes, h_b, 0, NULL, NULL));
+
+  printf("Execute the kernel\n");
+  size_t global_work_size[1] = {size};
+  size_t local_work_size[1] = {1};
+  auto time_start = std::chrono::high_resolution_clock::now();
+  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+  CL_CHECK(clFinish(commandQueue));
+  auto time_end = std::chrono::high_resolution_clock::now();
+  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
+  printf("Elapsed time: %lg ms\n", elapsed);
+
+  // printf("Download destination buffer\n");
+  // CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c, 0, NULL, NULL));
+
+  // printf("Verify result\n");
+  // int errors = 0;
+  // for (int i = 0; i < size; ++i) {
+  //   float ref = h_a[i] + h_b[i];
+  //   if (!almost_equal(h_c[i], ref)) {
+  //     if (errors < 100) 
+  //       printf("*** error: [%d] expected=%f, actual=%f, a=%f, b=%f\n", i, ref, h_c[i], h_a[i], h_b[i]);
+  //     ++errors;
+  //   }
+  // }
+  // if (0 == errors) {
+  //   printf("PASSED!\n");
+  // } else {
+  //   printf("FAILED! - %d errors\n", errors);    
+  // }
+
+  // Clean up		
+  cleanup();  
+
+  // return errors;
+  return 0;
+}
--- a/tests/opencl/vecadd-loop/.gitignore
+++ b/tests/opencl/vecadd-loop/.gitignore
@@ -0,0 +1,2 @@
+vecadd-loop
+*.ll
--- a/tests/opencl/vecadd-loop/Makefile
+++ b/tests/opencl/vecadd-loop/Makefile
@@ -0,0 +1,72 @@
+XLEN ?= 32
+
+LLVM_PREFIX ?= /opt/llvm-riscv
+RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
+SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
+POCL_CC_PATH ?= /opt/pocl/compiler
+POCL_RT_PATH ?= /opt/pocl/runtime
+
+OPTS ?= -n64
+
+VORTEX_DRV_PATH ?= $(realpath ../../../driver)
+VORTEX_RT_PATH ?= $(realpath ../../../runtime)
+
+K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
+K_CFLAGS   += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
+K_LDFLAGS  += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
+
+CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
+
+CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing
+
+CXXFLAGS += -I$(POCL_RT_PATH)/include
+
+LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
+
+# Debugigng
+ifdef DEBUG
+	CXXFLAGS += -g -O0
+else    
+	CXXFLAGS += -O2 -DNDEBUG
+endif
+
+PROJECT = vecadd-loop
+
+SRCS = main.cc
+
+all: $(PROJECT) kernel.pocl
+
+kernel.pocl: kernel.cl
+	LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
+ 
+$(PROJECT): $(SRCS)
+	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
+
+run-fpga: $(PROJECT) kernel.pocl   
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-asesim: $(PROJECT) kernel.pocl   
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+	
+run-vlsim: $(PROJECT) kernel.pocl   
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-simx: $(PROJECT) kernel.pocl   
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+	cp -f args.bin $(PROJECT).args.bin
+
+run-rtlsim: $(PROJECT) kernel.pocl   
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+.depend: $(SRCS)
+	$(CXX) $(CXXFLAGS) -MM $^ > .depend;
+
+clean:
+	rm -rf $(PROJECT) *.o .depend 
+
+clean-all: clean
+	rm -rf *.pocl *.dump
+
+ifneq ($(MAKECMDGOALS),clean)
+    -include .depend
+endif
--- a/tests/opencl/vecadd-loop/README
+++ b/tests/opencl/vecadd-loop/README
--- a/tests/opencl/vecadd-loop/kernel.alll1hit.loop1000.cl
+++ b/tests/opencl/vecadd-loop/kernel.alll1hit.loop1000.cl
@@ -0,0 +1,12 @@
+__kernel void vecadd (__global const float *A,
+	                    __global const float *B,
+	                    __global float *C)
+{
+  int gid = get_global_id(0);
+  float sum = 0.;
+  for (int i = 0; i < 1000; i++) {
+	  int addr = gid + (i % 2);
+	  sum += A[addr] + B[addr];
+  }
+  C[gid] = sum;
+}
--- a/tests/opencl/vecadd-loop/kernel.cl
+++ b/tests/opencl/vecadd-loop/kernel.cl
@@ -0,0 +1,13 @@
+__kernel void vecadd_loop (__global volatile const float *A,
+	                    __global volatile const float *B,
+	                    __global volatile float *C)
+{
+  int gid = get_global_id(0);
+  float sum = 0.;
+  for (int i = 0; i < 500; i++) {
+      // int addr = gid + (i % 2);
+	  int addr = gid;
+      C[addr] += A[addr] + B[addr];
+  }
+  // C[gid] = sum;
+}
--- a/tests/opencl/vecadd-loop/kernel.cl.loop
+++ b/tests/opencl/vecadd-loop/kernel.cl.loop
@@ -0,0 +1,9 @@
+__kernel void vecadd_loop (__global volatile const float *A,
+                    __global volatile const float *B,
+                    __global volatile float *C)
+{
+  int gid = get_global_id(0);
+  for (int i = 0; i < 100; i++) {
+    C[gid] = A[gid] + B[gid];
+  }
+}
--- a/tests/opencl/vecadd-loop/kernel.pocl
+++ b/tests/opencl/vecadd-loop/kernel.pocl
--- a/tests/opencl/vecadd-loop/main.cc
+++ b/tests/opencl/vecadd-loop/main.cc
@@ -0,0 +1,250 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <math.h>
+#include <CL/opencl.h>
+#include <unistd.h> 
+#include <string.h>
+#include <chrono>
+
+#define KERNEL_NAME "vecadd_loop"
+
+#define CL_CHECK(_expr)                                                \
+   do {                                                                \
+     cl_int _err = _expr;                                              \
+     if (_err == CL_SUCCESS)                                           \
+       break;                                                          \
+     printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);   \
+	 cleanup();			                                                     \
+     exit(-1);                                                         \
+   } while (0)
+
+#define CL_CHECK2(_expr)                                               \
+   ({                                                                  \
+     cl_int _err = CL_INVALID_VALUE;                                   \
+     decltype(_expr) _ret = _expr;                                     \
+     if (_err != CL_SUCCESS) {                                         \
+       printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
+	   cleanup();			                                                   \
+       exit(-1);                                                       \
+     }                                                                 \
+     _ret;                                                             \
+   })
+
+static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
+  if (nullptr == filename || nullptr == data || 0 == size)
+    return -1;
+
+  FILE* fp = fopen(filename, "r");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to load kernel.");
+    return -1;
+  }
+  fseek(fp , 0 , SEEK_END);
+  long fsize = ftell(fp);
+  rewind(fp);
+
+  *data = (uint8_t*)malloc(fsize);
+  *size = fread(*data, 1, fsize, fp);
+  
+  fclose(fp);
+  
+  return 0;
+}
+
+static int write_operand_file(const char* filename, void* data, size_t size) {
+  if (nullptr == filename || nullptr == data || 0 == size)
+    return -1;
+
+  FILE* fp = fopen(filename, "wb");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to write operand data.\n");
+    return -1;
+  }
+
+  size_t wsize = fwrite(data, size, 1, fp);
+  if (wsize != 1) {
+    fprintf(stderr, "Failed to write operand data.\n");
+    return -1;
+  }
+
+  return 0;
+}
+
+static bool almost_equal(float a, float b, int ulp = 4) {
+  union fi_t { int i; float f; };
+  fi_t fa, fb;
+  fa.f = a;
+  fb.f = b;
+  return std::abs(fa.i - fb.i) <= ulp;
+}
+
+cl_device_id device_id = NULL;
+cl_context context = NULL;
+cl_command_queue commandQueue = NULL;
+cl_program program = NULL;
+cl_kernel kernel = NULL;
+cl_mem a_memobj = NULL;
+cl_mem b_memobj = NULL;
+cl_mem c_memobj = NULL;  
+float *h_a = NULL;
+float *h_b = NULL;
+float *h_c = NULL;
+uint8_t *kernel_bin = NULL;
+
+static void cleanup() {
+  if (commandQueue) clReleaseCommandQueue(commandQueue);
+  if (kernel) clReleaseKernel(kernel);
+  if (program) clReleaseProgram(program);
+  if (a_memobj) clReleaseMemObject(a_memobj);
+  if (b_memobj) clReleaseMemObject(b_memobj);
+  if (c_memobj) clReleaseMemObject(c_memobj);  
+  if (context) clReleaseContext(context);
+  if (device_id) clReleaseDevice(device_id);
+  
+  if (kernel_bin) free(kernel_bin);
+  if (h_a) free(h_a);
+  if (h_b) free(h_b);
+  if (h_c) free(h_c);
+}
+
+int size = 64;
+
+static void show_usage() {
+  printf("Usage: [-n size] [-h: help]\n");
+}
+
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:h?")) != -1) {
+    switch (c) {
+    case 'n':
+      size = atoi(optarg);
+      break;
+    case 'h':
+    case '?': {
+      show_usage();
+      exit(0);
+    } break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+
+  printf("Workload size=%d\n", size);
+}
+
+int main (int argc, char **argv) {
+  // parse command arguments
+  parse_args(argc, argv);
+  
+  cl_platform_id platform_id;
+  size_t kernel_size;
+  cl_int binary_status;
+
+  // read kernel binary from file  
+  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+    return -1;
+  
+  // Getting platform and device information
+  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
+  CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
+
+  printf("Create context\n");
+  context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL,  &_err));
+
+  printf("Allocate device buffers\n");
+  size_t nbytes = size * sizeof(float);
+  a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
+  b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
+  c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
+
+  printf("Create program from kernel source\n");
+  program = CL_CHECK2(clCreateProgramWithBinary(
+    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
+  if (program == NULL) {
+    cleanup();
+    return -1;
+  }
+
+  // Build program
+  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
+  
+  // Create kernel
+  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
+
+  // Set kernel arguments
+  // NOTE(hansung): clSetKernelArg doesn't seem to incur any device-specific
+  // operation
+  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));	
+  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj));	
+  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj));
+
+  // Allocate memories for input arrays and output arrays.    
+  h_a = (float*)malloc(nbytes);
+  h_b = (float*)malloc(nbytes);
+  h_c = (float*)malloc(nbytes);	
+	
+  // Initialize values for array members.  
+  for (int i = 0; i < size; ++i) {
+    h_a[i] = sinf(i)*sinf(i);
+    h_b[i] = cosf(i)*cosf(i);
+    h_c[i] = 0xdeadbeef;
+    //printf("*** [%d]: h_a=%f, h_b=%f\n", i, h_a[i], h_b[i]);
+  }
+
+  // NOTE(hansung): Dump operand buffer to a file
+  if (write_operand_file("vecadd.input.a.bin", h_a, nbytes) != 0)
+    return EXIT_FAILURE;
+  if (write_operand_file("vecadd.input.b.bin", h_b, nbytes) != 0)
+    return EXIT_FAILURE;
+
+  // Creating command queue
+  // NOTE(hansung): The 3rd properties arg is a bit-field, where fields like
+  // CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE can be set.  With value of 0,
+  // nothing is set and the commands in the queue will be completed in-order.
+  // See OpenCL 1.2 spec, section 5.1
+  commandQueue = CL_CHECK2(clCreateCommandQueue(
+      context, device_id, 0 /* command-queue properties */, &_err));
+
+  printf("Upload source buffers\n");
+  CL_CHECK(clEnqueueWriteBuffer(commandQueue, a_memobj, CL_TRUE, 0, nbytes, h_a, 0, NULL, NULL));
+  CL_CHECK(clEnqueueWriteBuffer(commandQueue, b_memobj, CL_TRUE, 0, nbytes, h_b, 0, NULL, NULL));
+
+  printf("Execute the kernel\n");
+  size_t global_work_size[1] = {size};
+  size_t local_work_size[1] = {1};
+  auto time_start = std::chrono::high_resolution_clock::now();
+  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+  // NOTE(hansung): clFinish blocks until all kernels in the command queue are
+  // finished.  This seems to be what actually kicks off kernel execution.
+  CL_CHECK(clFinish(commandQueue));
+  auto time_end = std::chrono::high_resolution_clock::now();
+  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
+  printf("Elapsed time: %lg ms\n", elapsed);
+
+  printf("Download destination buffer\n");
+  CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c, 0, NULL, NULL));
+
+  printf("Verify result\n");
+  int errors = 0;
+  for (int i = 0; i < size; ++i) {
+    float ref = h_a[i] + h_b[i];
+    if (!almost_equal(h_c[i], ref)) {
+      if (errors < 100) 
+        printf("*** error: [%d] expected=%f, actual=%f, a=%f, b=%f\n", i, ref, h_c[i], h_a[i], h_b[i]);
+      ++errors;
+    }
+  }
+  if (0 == errors) {
+    printf("PASSED!\n");
+  } else {
+    printf("FAILED! - %d errors\n", errors);    
+  }
+
+  // Clean up		
+  cleanup();  
+
+  return errors;
+}
--- a/tests/opencl/vecadd/Makefile
+++ b/tests/opencl/vecadd/Makefile
@@ -37,7 +37,7 @@ SRCS = main.cc
 all: $(PROJECT) kernel.pocl

 kernel.pocl: kernel.cl
-	LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
+	LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
 
 $(PROJECT): $(SRCS)
 	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
@@ -53,6 +53,7 @@ run-vlsim: $(PROJECT) kernel.pocl

 run-simx: $(PROJECT) kernel.pocl   
 	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+	cp -f args.bin $(PROJECT).args.bin

 run-rtlsim: $(PROJECT) kernel.pocl   
 	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
--- a/tests/opencl/vecadd/kernel.alll1hit.loop1000.cl
+++ b/tests/opencl/vecadd/kernel.alll1hit.loop1000.cl
@@ -0,0 +1,12 @@
+__kernel void vecadd (__global const float *A,
+	                    __global const float *B,
+	                    __global float *C)
+{
+  int gid = get_global_id(0);
+  float sum = 0.;
+  for (int i = 0; i < 1000; i++) {
+	  int addr = gid + (i % 2);
+	  sum += A[addr] + B[addr];
+  }
+  C[gid] = sum;
+}
--- a/tests/opencl/vecadd/main.cc
+++ b/tests/opencl/vecadd/main.cc
@@ -52,6 +52,25 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
  return 0;
 }

+static int write_operand_file(const char* filename, void* data, size_t size) {
+  if (nullptr == filename || nullptr == data || 0 == size)
+    return -1;
+
+  FILE* fp = fopen(filename, "wb");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to write operand data.\n");
+    return -1;
+  }
+
+  size_t wsize = fwrite(data, size, 1, fp);
+  if (wsize != 1) {
+    fprintf(stderr, "Failed to write operand data.\n");
+    return -1;
+  }
+
+  return 0;
+}
+
 static bool almost_equal(float a, float b, int ulp = 4) {
  union fi_t { int i; float f; };
  fi_t fa, fb;
@@ -156,6 +175,8 @@ int main (int argc, char **argv) {
  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));

  // Set kernel arguments
+  // NOTE(hansung): clSetKernelArg doesn't seem to incur any device-specific
+  // operation
  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));	
  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj));	
  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj));
@@ -173,8 +194,19 @@ int main (int argc, char **argv) {
    //printf("*** [%d]: h_a=%f, h_b=%f\n", i, h_a[i], h_b[i]);
  }

+  // NOTE(hansung): Dump operand buffer to a file
+  if (write_operand_file("vecadd.input.a.bin", h_a, nbytes) != 0)
+    return EXIT_FAILURE;
+  if (write_operand_file("vecadd.input.b.bin", h_b, nbytes) != 0)
+    return EXIT_FAILURE;
+
  // Creating command queue
-  commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));  
+  // NOTE(hansung): The 3rd properties arg is a bit-field, where fields like
+  // CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE can be set.  With value of 0,
+  // nothing is set and the commands in the queue will be completed in-order.
+  // See OpenCL 1.2 spec, section 5.1
+  commandQueue = CL_CHECK2(clCreateCommandQueue(
+      context, device_id, 0 /* command-queue properties */, &_err));

  printf("Upload source buffers\n");
  CL_CHECK(clEnqueueWriteBuffer(commandQueue, a_memobj, CL_TRUE, 0, nbytes, h_a, 0, NULL, NULL));
@@ -185,6 +217,8 @@ int main (int argc, char **argv) {
  size_t local_work_size[1] = {1};
  auto time_start = std::chrono::high_resolution_clock::now();
  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+  // NOTE(hansung): clFinish blocks until all kernels in the command queue are
+  // finished.  This seems to be what actually kicks off kernel execution.
  CL_CHECK(clFinish(commandQueue));
  auto time_end = std::chrono::high_resolution_clock::now();
  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
--- a/tests/opencl/vecadd/vecadd.dump.annotated
+++ b/tests/opencl/vecadd/vecadd.dump.annotated
--- a/tests/regression/Makefile
+++ b/tests/regression/Makefile
@@ -11,6 +11,7 @@ all:
 	$(MAKE) -C no_mf_ext
 	$(MAKE) -C no_smem
 	$(MAKE) -C prefetch
+	$(MAKE) -C relu

 run-simx:
 	$(MAKE) -C basic run-simx
@@ -25,6 +26,7 @@ run-simx:
 	$(MAKE) -C no_mf_ext run-simx
 	$(MAKE) -C no_smem run-simx
 	$(MAKE) -C prefetch run-simx
+	$(MAKE) -C relu run-simx

 run-rtlsim:
 	$(MAKE) -C basic run-rtlsim
@@ -39,6 +41,7 @@ run-rtlsim:
 	$(MAKE) -C no_mf_ext run-rtlsim
 	$(MAKE) -C no_smem run-rtlsim
 	$(MAKE) -C prefetch run-rtlsim
+	$(MAKE) -C relu run-rtlsim

 run-vlsim:
 	$(MAKE) -C basic run-vlsim
@@ -53,6 +56,7 @@ run-vlsim:
 	$(MAKE) -C no_mf_ext run-vlsim
 	$(MAKE) -C no_smem run-vlsim
 	$(MAKE) -C prefetch run-vlsim
+	$(MAKE) -C relu run-vlsim

 clean:
 	$(MAKE) -C basic clean
@@ -67,6 +71,7 @@ clean:
 	$(MAKE) -C no_mf_ext clean
 	$(MAKE) -C no_smem clean
 	$(MAKE) -C prefetch clean
+	$(MAKE) -C relu clean

 clean-all:
 	$(MAKE) -C basic clean-all
@@ -81,3 +86,4 @@ clean-all:
 	$(MAKE) -C no_mf_ext clean-all
 	$(MAKE) -C no_smem clean-all
 	$(MAKE) -C prefetch clean-all
+	$(MAKE) -C relu clean-all
--- a/tests/regression/relu/.depend
+++ b/tests/regression/relu/.depend
@@ -0,0 +1,2 @@
+main.o: main.cpp \
+ /home/eecs/nicolas.a.castaneda/vortex/driver/include/vortex.h common.h
--- a/tests/regression/relu/Makefile
+++ b/tests/regression/relu/Makefile
@@ -0,0 +1,77 @@
+XLEN ?= 32
+
+VORTEX_DRV_PATH ?= $(realpath ../../../driver)
+VORTEX_RT_PATH ?= $(realpath ../../../runtime)
+
+OPTS ?= -n64
+
+VX_CC  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
+VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
+VX_DP  = $(LLVM_PREFIX)/bin/llvm-objdump
+VX_CP  = $(LLVM_PREFIX)/bin/llvm-objcopy
+
+VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
+VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
+
+VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
+
+VX_SRCS = kernel.c
+
+CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
+
+CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
+
+LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
+
+# Debugigng
+ifdef DEBUG
+	CXXFLAGS += -g -O0
+else    
+	CXXFLAGS += -O2 -DNDEBUG
+endif
+
+PROJECT = relu
+
+SRCS = main.cpp
+
+all: $(PROJECT) kernel.bin kernel.dump
+ 
+kernel.dump: kernel.elf
+	$(VX_DP) -D kernel.elf > kernel.dump
+
+kernel.bin: kernel.elf
+	$(VX_CP) -O binary kernel.elf kernel.bin
+
+kernel.elf: $(VX_SRCS)
+	$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
+
+$(PROJECT): $(SRCS)
+	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
+
+run-simx: $(PROJECT) kernel.bin   
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+	
+run-fpga: $(PROJECT) kernel.bin   
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-asesim: $(PROJECT) kernel.bin   
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+	
+run-vlsim: $(PROJECT) kernel.bin   
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-rtlsim: $(PROJECT) kernel.bin   
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+.depend: $(SRCS)
+	$(CXX) $(CXXFLAGS) -MM $^ > .depend;
+
+clean:
+	rm -rf $(PROJECT) *.o .depend
+
+clean-all: clean
+	rm -rf *.elf *.bin *.dump
+
+ifneq ($(MAKECMDGOALS),clean)
+    -include .depend
+endif
--- a/tests/regression/relu/common.h
+++ b/tests/regression/relu/common.h
@@ -0,0 +1,12 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
+
+typedef struct {
+  uint32_t num_points;
+  uint32_t src_addr;
+  uint32_t dst_addr;  
+} kernel_arg_t;
+
+#endif
--- a/tests/regression/relu/kernel.bin
+++ b/tests/regression/relu/kernel.bin
--- a/tests/regression/relu/kernel.c
+++ b/tests/regression/relu/kernel.c
@@ -0,0 +1,28 @@
+#include <stdint.h>
+#include <vx_intrinsics.h>
+#include <vx_spawn.h>
+#include "common.h"
+
+void kernel_body(int __DIVERGENT__ task_id, kernel_arg_t* arg) {
+	uint32_t num_points = arg->num_points;
+	uint32_t points_per_core = num_points / vx_num_warps();
+	int tid = vx_thread_lid();
+	int32_t* src_ptr = (int32_t*)arg->src_addr;
+	int32_t* dst_ptr = (int32_t*)arg->dst_addr;
+
+	int32_t ref_value = src_ptr[task_id];
+	int ref_negative = ref_value < 0;
+	vx_split(ref_negative);
+	if (ref_negative) {
+		ref_value = 0;
+	}
+	vx_join();
+	
+	dst_ptr[task_id] = ref_value;
+}
+
+void main() {
+	kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
+	int num_warps = vx_num_warps();
+	vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
+}
--- a/tests/regression/relu/kernel.dump
+++ b/tests/regression/relu/kernel.dump
--- a/tests/regression/relu/kernel.elf
+++ b/tests/regression/relu/kernel.elf
--- a/tests/regression/relu/main.cpp
+++ b/tests/regression/relu/main.cpp
@@ -0,0 +1,218 @@
+#include <iostream>
+#include <unistd.h>
+#include <string.h>
+#include <vortex.h>
+#include <vector>
+#include "common.h"
+
+#define RT_CHECK(_expr)                                         \
+   do {                                                         \
+     int _ret = _expr;                                          \
+     if (0 == _ret)                                             \
+       break;                                                   \
+     printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);   \
+	 cleanup();			                                              \
+     exit(-1);                                                  \
+   } while (false)
+
+///////////////////////////////////////////////////////////////////////////////
+
+const char* kernel_file = "kernel.bin";
+uint32_t count = 0;
+
+std::vector<int32_t> src_data;
+std::vector<int32_t> ref_data;
+
+vx_device_h device = nullptr;
+vx_buffer_h staging_buf = nullptr;
+kernel_arg_t kernel_arg;
+
+static void show_usage() {
+   std::cout << "Vortex Test." << std::endl;
+   std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
+}
+
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
+    switch (c) {
+    case 'n':
+      count = atoi(optarg);
+      break;
+    case 'k':
+      kernel_file = optarg;
+      break;
+    case 'h':
+    case '?': {
+      show_usage();
+      exit(0);
+    } break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+}
+
+void cleanup() {
+  if (staging_buf) {
+    vx_buf_free(staging_buf);
+  }
+  if (device) {
+    vx_mem_free(device, kernel_arg.src_addr);
+    vx_mem_free(device, kernel_arg.dst_addr);
+    vx_dev_close(device);
+  }
+}
+
+void gen_input_data(uint32_t num_points) {
+  src_data.resize(num_points);
+
+  for (uint32_t i = 0; i < src_data.size(); ++i) {
+    int value = std::rand() - (RAND_MAX / 2);
+    src_data[i] = value;
+  }
+}
+
+void gen_ref_data(uint32_t num_points) {
+  ref_data.resize(num_points);
+
+  for (uint32_t i = 0; i < num_points; ++i) {
+    int32_t ref_value = src_data.at(i);
+    ref_data.at(i) = std::max(0, ref_value);
+  }
+}
+
+int run_test(const kernel_arg_t& kernel_arg,
+             uint32_t buf_size, 
+             uint32_t num_points) {
+  // start device
+  std::cout << "start device" << std::endl;
+  RT_CHECK(vx_start(device));
+
+  // wait for completion
+  std::cout << "wait for completion" << std::endl;
+  RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
+
+  // download destination buffer
+  std::cout << "download destination buffer" << std::endl;
+  RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
+
+  // verify result
+  std::cout << "verify result" << std::endl;  
+  {
+    int errors = 0;
+    auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
+    for (uint32_t i = 0; i < num_points; ++i) {
+      int ref = ref_data.at(i);
+      int cur = buf_ptr[i];
+      if (cur != ref) {
+        std::cout << "error at result #" << std::dec << i
+                  << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
+        ++errors;
+      }
+    }
+    if (errors != 0) {
+      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
+      std::cout << "FAILED!" << std::endl;
+      return 1;  
+    }
+  }
+
+  return 0;
+}
+
+int main(int argc, char *argv[]) {
+  size_t value;
+  
+  // parse command arguments
+  parse_args(argc, argv);
+
+  if (count == 0) {
+    count = 1;
+  }
+
+  std::srand(50);
+
+  // open device connection
+  std::cout << "open device connection" << std::endl;  
+  RT_CHECK(vx_dev_open(&device));
+
+  uint32_t num_points = 256;
+
+  // generate input data
+  gen_input_data(num_points);
+
+  // generate reference data
+  gen_ref_data(num_points);
+
+  uint32_t src_buf_size = src_data.size() * sizeof(int32_t);  
+  uint32_t dst_buf_size = ref_data.size() * sizeof(int32_t);
+
+  std::cout << "number of points: " << num_points << std::endl;
+  std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
+
+  // upload program
+  std::cout << "upload program" << std::endl;  
+  RT_CHECK(vx_upload_kernel_file(device, kernel_file));
+
+  // allocate device memory
+  std::cout << "allocate device memory" << std::endl;  
+
+  RT_CHECK(vx_mem_alloc(device, src_buf_size, &value));
+  kernel_arg.src_addr = value;
+  RT_CHECK(vx_mem_alloc(device, dst_buf_size, &value));
+  kernel_arg.dst_addr = value;
+
+  kernel_arg.num_points = num_points;
+
+  std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
+  std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
+  
+  // allocate shared memory  
+  std::cout << "allocate shared memory" << std::endl;    
+  uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
+                                  std::max<uint32_t>(dst_buf_size, 
+                                    sizeof(kernel_arg_t)));
+  RT_CHECK(vx_buf_alloc(device, staging_buf_size, &staging_buf));
+  
+  // upload kernel argument
+  std::cout << "upload kernel argument" << std::endl;
+  {
+    auto buf_ptr = (int*)vx_host_ptr(staging_buf);
+    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
+    RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
+  }
+
+  // upload source buffer
+  {
+    auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
+    for (uint32_t i = 0; i < num_points; ++i) {
+      buf_ptr[i] = src_data.at(i);
+    }
+  }
+  std::cout << "upload source buffer" << std::endl;      
+  RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_addr, src_buf_size, 0));
+
+  // clear destination buffer
+  {
+    auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
+    for (uint32_t i = 0; i < num_points; ++i) {
+      buf_ptr[i] = 0xdeadbeef;
+    }
+  }
+  std::cout << "clear destination buffer" << std::endl;      
+  RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, dst_buf_size, 0));  
+
+  // run tests
+  std::cout << "run tests" << std::endl;
+  RT_CHECK(run_test(kernel_arg, dst_buf_size, num_points));
+
+  // cleanup
+  std::cout << "cleanup" << std::endl;  
+  cleanup();
+
+  std::cout << "PASSED!" << std::endl;
+
+  return 0;
+}
--- a/tests/regression/relu/ramulator.ddr4.log
+++ b/tests/regression/relu/ramulator.ddr4.log
@@ -0,0 +1,278 @@
+               ramulator.active_cycles_0                1072                                      # Total active cycles for level _0
+                 ramulator.busy_cycles_0                1072                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0
+            ramulator.serving_requests_0                1496                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0
+    ramulator.average_serving_requests_0            0.130336                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _0
+             ramulator.active_cycles_0_0                1072                                      # Total active cycles for level _0_0
+               ramulator.busy_cycles_0_0                1384                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0
+          ramulator.serving_requests_0_0                1496                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0
+  ramulator.average_serving_requests_0_0            0.130336                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0
+           ramulator.active_cycles_0_0_0                1030                                      # Total active cycles for level _0_0_0
+             ramulator.busy_cycles_0_0_0                1030                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_0
+        ramulator.serving_requests_0_0_0                1440                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0
+ramulator.average_serving_requests_0_0_0            0.125457                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0
+         ramulator.active_cycles_0_0_0_0                1030                                      # Total active cycles for level _0_0_0_0
+           ramulator.busy_cycles_0_0_0_0                1030                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_0_0
+      ramulator.serving_requests_0_0_0_0                1440                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_0
+ramulator.average_serving_requests_0_0_0_0            0.125457                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_0
+         ramulator.active_cycles_0_0_0_1                   0                                      # Total active cycles for level _0_0_0_1
+           ramulator.busy_cycles_0_0_0_1                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_0_1
+      ramulator.serving_requests_0_0_0_1                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_1
+ramulator.average_serving_requests_0_0_0_1            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_1
+         ramulator.active_cycles_0_0_0_2                   0                                      # Total active cycles for level _0_0_0_2
+           ramulator.busy_cycles_0_0_0_2                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_0_2
+      ramulator.serving_requests_0_0_0_2                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_2
+ramulator.average_serving_requests_0_0_0_2            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_2
+         ramulator.active_cycles_0_0_0_3                   0                                      # Total active cycles for level _0_0_0_3
+           ramulator.busy_cycles_0_0_0_3                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_0_3
+      ramulator.serving_requests_0_0_0_3                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_3
+ramulator.average_serving_requests_0_0_0_3            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_3
+           ramulator.active_cycles_0_0_1                   0                                      # Total active cycles for level _0_0_1
+             ramulator.busy_cycles_0_0_1                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_1
+        ramulator.serving_requests_0_0_1                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1
+ramulator.average_serving_requests_0_0_1            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1
+         ramulator.active_cycles_0_0_1_0                   0                                      # Total active cycles for level _0_0_1_0
+           ramulator.busy_cycles_0_0_1_0                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_1_0
+      ramulator.serving_requests_0_0_1_0                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_0
+ramulator.average_serving_requests_0_0_1_0            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_0
+         ramulator.active_cycles_0_0_1_1                   0                                      # Total active cycles for level _0_0_1_1
+           ramulator.busy_cycles_0_0_1_1                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_1_1
+      ramulator.serving_requests_0_0_1_1                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_1
+ramulator.average_serving_requests_0_0_1_1            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_1
+         ramulator.active_cycles_0_0_1_2                   0                                      # Total active cycles for level _0_0_1_2
+           ramulator.busy_cycles_0_0_1_2                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_1_2
+      ramulator.serving_requests_0_0_1_2                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_2
+ramulator.average_serving_requests_0_0_1_2            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_2
+         ramulator.active_cycles_0_0_1_3                   0                                      # Total active cycles for level _0_0_1_3
+           ramulator.busy_cycles_0_0_1_3                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_1_3
+      ramulator.serving_requests_0_0_1_3                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_3
+ramulator.average_serving_requests_0_0_1_3            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_3
+           ramulator.active_cycles_0_0_2                   0                                      # Total active cycles for level _0_0_2
+             ramulator.busy_cycles_0_0_2                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_2
+        ramulator.serving_requests_0_0_2                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2
+ramulator.average_serving_requests_0_0_2            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2
+         ramulator.active_cycles_0_0_2_0                   0                                      # Total active cycles for level _0_0_2_0
+           ramulator.busy_cycles_0_0_2_0                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_2_0
+      ramulator.serving_requests_0_0_2_0                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_0
+ramulator.average_serving_requests_0_0_2_0            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_0
+         ramulator.active_cycles_0_0_2_1                   0                                      # Total active cycles for level _0_0_2_1
+           ramulator.busy_cycles_0_0_2_1                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_2_1
+      ramulator.serving_requests_0_0_2_1                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_1
+ramulator.average_serving_requests_0_0_2_1            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_1
+         ramulator.active_cycles_0_0_2_2                   0                                      # Total active cycles for level _0_0_2_2
+           ramulator.busy_cycles_0_0_2_2                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_2_2
+      ramulator.serving_requests_0_0_2_2                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_2
+ramulator.average_serving_requests_0_0_2_2            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_2
+         ramulator.active_cycles_0_0_2_3                   0                                      # Total active cycles for level _0_0_2_3
+           ramulator.busy_cycles_0_0_2_3                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_2_3
+      ramulator.serving_requests_0_0_2_3                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_3
+ramulator.average_serving_requests_0_0_2_3            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_3
+           ramulator.active_cycles_0_0_3                  42                                      # Total active cycles for level _0_0_3
+             ramulator.busy_cycles_0_0_3                  42                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_3
+        ramulator.serving_requests_0_0_3                  56                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3
+ramulator.average_serving_requests_0_0_3            0.004879                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3
+         ramulator.active_cycles_0_0_3_0                   0                                      # Total active cycles for level _0_0_3_0
+           ramulator.busy_cycles_0_0_3_0                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_3_0
+      ramulator.serving_requests_0_0_3_0                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_0
+ramulator.average_serving_requests_0_0_3_0            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_0
+         ramulator.active_cycles_0_0_3_1                   0                                      # Total active cycles for level _0_0_3_1
+           ramulator.busy_cycles_0_0_3_1                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_3_1
+      ramulator.serving_requests_0_0_3_1                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_1
+ramulator.average_serving_requests_0_0_3_1            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_1
+         ramulator.active_cycles_0_0_3_2                   0                                      # Total active cycles for level _0_0_3_2
+           ramulator.busy_cycles_0_0_3_2                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_3_2
+      ramulator.serving_requests_0_0_3_2                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_2
+ramulator.average_serving_requests_0_0_3_2            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_2
+         ramulator.active_cycles_0_0_3_3                  42                                      # Total active cycles for level _0_0_3_3
+           ramulator.busy_cycles_0_0_3_3                  42                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_3_3
+      ramulator.serving_requests_0_0_3_3                  56                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_3
+ramulator.average_serving_requests_0_0_3_3            0.004879                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_3
+      ramulator.read_transaction_bytes_0                3712                                      # The total byte of read transaction per channel
+     ramulator.write_transaction_bytes_0               16000                                      # The total byte of write transaction per channel
+       ramulator.row_hits_channel_0_core                 296                                      # Number of row hits per channel per core
+     ramulator.row_misses_channel_0_core                   3                                      # Number of row misses per channel per core
+  ramulator.row_conflicts_channel_0_core                   9                                      # Number of row conflicts per channel per core
+  ramulator.read_row_hits_channel_0_core                  51                                      # Number of row hits for read requests per channel per core
+                                     [0]                51.0                                      # 
+ramulator.read_row_misses_channel_0_core                   2                                      # Number of row misses for read requests per channel per core
+                                     [0]                 2.0                                      # 
+ramulator.read_row_conflicts_channel_0_core                   5                                      # Number of row conflicts for read requests per channel per core
+                                     [0]                 5.0                                      # 
+ ramulator.write_row_hits_channel_0_core                 245                                      # Number of row hits for write requests per channel per core
+                                     [0]               245.0                                      # 
+ramulator.write_row_misses_channel_0_core                   1                                      # Number of row misses for write requests per channel per core
+                                     [0]                 1.0                                      # 
+ramulator.write_row_conflicts_channel_0_core                   4                                      # Number of row conflicts for write requests per channel per core
+                                     [0]                 4.0                                      # 
+      ramulator.useless_activates_0_core                   0                                      # Number of useless activations. E.g, ACT -> PRE w/o RD or WR
+            ramulator.read_latency_avg_0           41.689655                                      # The average memory latency cycles (in memory time domain) per request for all read requests in this channel
+            ramulator.read_latency_sum_0                2418                                      # The memory latency cycles (in memory time domain) sum for all read requests in this channel
+        ramulator.req_queue_length_avg_0            0.993466                                      # Average of read and write queue length per memory cycle per channel.
+        ramulator.req_queue_length_sum_0               11403                                      # Sum of read and write queue length per memory cycle per channel.
+   ramulator.read_req_queue_length_avg_0            0.200906                                      # Read queue length average per memory cycle per channel.
+   ramulator.read_req_queue_length_sum_0                2306                                      # Read queue length sum per memory cycle per channel.
+  ramulator.write_req_queue_length_avg_0            0.792560                                      # Write queue length average per memory cycle per channel.
+  ramulator.write_req_queue_length_sum_0                9097                                      # Write queue length sum per memory cycle per channel.
+              ramulator.record_read_hits                 0.0                                      # record read hit count for this core when it reaches request limit or to the end
+                                     [0]                 0.0                                      # 
+            ramulator.record_read_misses                 0.0                                      # record_read_miss count for this core when it reaches request limit or to the end
+                                     [0]                 0.0                                      # 
+         ramulator.record_read_conflicts                 0.0                                      # record read conflict count for this core when it reaches request limit or to the end
+                                     [0]                 0.0                                      # 
+             ramulator.record_write_hits                 0.0                                      # record write hit count for this core when it reaches request limit or to the end
+                                     [0]                 0.0                                      # 
+           ramulator.record_write_misses                 0.0                                      # record write miss count for this core when it reaches request limit or to the end
+                                     [0]                 0.0                                      # 
+        ramulator.record_write_conflicts                 0.0                                      # record write conflict for this core when it reaches request limit or to the end
+                                     [0]                 0.0                                      # 
+               ramulator.active_cycles_1                1071                                      # Total active cycles for level _1
+                 ramulator.busy_cycles_1                1071                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1
+            ramulator.serving_requests_1                1472                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1
+    ramulator.average_serving_requests_1            0.128245                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _1
+             ramulator.active_cycles_1_0                1071                                      # Total active cycles for level _1_0
+               ramulator.busy_cycles_1_0                1383                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0
+          ramulator.serving_requests_1_0                1472                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0
+  ramulator.average_serving_requests_1_0            0.128245                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0
+           ramulator.active_cycles_1_0_0                1071                                      # Total active cycles for level _1_0_0
+             ramulator.busy_cycles_1_0_0                1071                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_0
+        ramulator.serving_requests_1_0_0                1472                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0
+ramulator.average_serving_requests_1_0_0            0.128245                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0
+         ramulator.active_cycles_1_0_0_0                1071                                      # Total active cycles for level _1_0_0_0
+           ramulator.busy_cycles_1_0_0_0                1071                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_0_0
+      ramulator.serving_requests_1_0_0_0                1472                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_0
+ramulator.average_serving_requests_1_0_0_0            0.128245                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_0
+         ramulator.active_cycles_1_0_0_1                   0                                      # Total active cycles for level _1_0_0_1
+           ramulator.busy_cycles_1_0_0_1                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_0_1
+      ramulator.serving_requests_1_0_0_1                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_1
+ramulator.average_serving_requests_1_0_0_1            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_1
+         ramulator.active_cycles_1_0_0_2                   0                                      # Total active cycles for level _1_0_0_2
+           ramulator.busy_cycles_1_0_0_2                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_0_2
+      ramulator.serving_requests_1_0_0_2                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_2
+ramulator.average_serving_requests_1_0_0_2            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_2
+         ramulator.active_cycles_1_0_0_3                   0                                      # Total active cycles for level _1_0_0_3
+           ramulator.busy_cycles_1_0_0_3                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_0_3
+      ramulator.serving_requests_1_0_0_3                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_3
+ramulator.average_serving_requests_1_0_0_3            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_3
+           ramulator.active_cycles_1_0_1                   0                                      # Total active cycles for level _1_0_1
+             ramulator.busy_cycles_1_0_1                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_1
+        ramulator.serving_requests_1_0_1                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1
+ramulator.average_serving_requests_1_0_1            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1
+         ramulator.active_cycles_1_0_1_0                   0                                      # Total active cycles for level _1_0_1_0
+           ramulator.busy_cycles_1_0_1_0                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_1_0
+      ramulator.serving_requests_1_0_1_0                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_0
+ramulator.average_serving_requests_1_0_1_0            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_0
+         ramulator.active_cycles_1_0_1_1                   0                                      # Total active cycles for level _1_0_1_1
+           ramulator.busy_cycles_1_0_1_1                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_1_1
+      ramulator.serving_requests_1_0_1_1                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_1
+ramulator.average_serving_requests_1_0_1_1            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_1
+         ramulator.active_cycles_1_0_1_2                   0                                      # Total active cycles for level _1_0_1_2
+           ramulator.busy_cycles_1_0_1_2                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_1_2
+      ramulator.serving_requests_1_0_1_2                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_2
+ramulator.average_serving_requests_1_0_1_2            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_2
+         ramulator.active_cycles_1_0_1_3                   0                                      # Total active cycles for level _1_0_1_3
+           ramulator.busy_cycles_1_0_1_3                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_1_3
+      ramulator.serving_requests_1_0_1_3                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_3
+ramulator.average_serving_requests_1_0_1_3            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_3
+           ramulator.active_cycles_1_0_2                   0                                      # Total active cycles for level _1_0_2
+             ramulator.busy_cycles_1_0_2                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_2
+        ramulator.serving_requests_1_0_2                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2
+ramulator.average_serving_requests_1_0_2            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2
+         ramulator.active_cycles_1_0_2_0                   0                                      # Total active cycles for level _1_0_2_0
+           ramulator.busy_cycles_1_0_2_0                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_2_0
+      ramulator.serving_requests_1_0_2_0                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_0
+ramulator.average_serving_requests_1_0_2_0            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_0
+         ramulator.active_cycles_1_0_2_1                   0                                      # Total active cycles for level _1_0_2_1
+           ramulator.busy_cycles_1_0_2_1                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_2_1
+      ramulator.serving_requests_1_0_2_1                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_1
+ramulator.average_serving_requests_1_0_2_1            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_1
+         ramulator.active_cycles_1_0_2_2                   0                                      # Total active cycles for level _1_0_2_2
+           ramulator.busy_cycles_1_0_2_2                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_2_2
+      ramulator.serving_requests_1_0_2_2                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_2
+ramulator.average_serving_requests_1_0_2_2            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_2
+         ramulator.active_cycles_1_0_2_3                   0                                      # Total active cycles for level _1_0_2_3
+           ramulator.busy_cycles_1_0_2_3                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_2_3
+      ramulator.serving_requests_1_0_2_3                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_3
+ramulator.average_serving_requests_1_0_2_3            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_3
+           ramulator.active_cycles_1_0_3                   0                                      # Total active cycles for level _1_0_3
+             ramulator.busy_cycles_1_0_3                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_3
+        ramulator.serving_requests_1_0_3                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3
+ramulator.average_serving_requests_1_0_3            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3
+         ramulator.active_cycles_1_0_3_0                   0                                      # Total active cycles for level _1_0_3_0
+           ramulator.busy_cycles_1_0_3_0                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_3_0
+      ramulator.serving_requests_1_0_3_0                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_0
+ramulator.average_serving_requests_1_0_3_0            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_0
+         ramulator.active_cycles_1_0_3_1                   0                                      # Total active cycles for level _1_0_3_1
+           ramulator.busy_cycles_1_0_3_1                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_3_1
+      ramulator.serving_requests_1_0_3_1                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_1
+ramulator.average_serving_requests_1_0_3_1            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_1
+         ramulator.active_cycles_1_0_3_2                   0                                      # Total active cycles for level _1_0_3_2
+           ramulator.busy_cycles_1_0_3_2                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_3_2
+      ramulator.serving_requests_1_0_3_2                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_2
+ramulator.average_serving_requests_1_0_3_2            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_2
+         ramulator.active_cycles_1_0_3_3                   0                                      # Total active cycles for level _1_0_3_3
+           ramulator.busy_cycles_1_0_3_3                   0                                      # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_3_3
+      ramulator.serving_requests_1_0_3_3                   0                                      # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_3
+ramulator.average_serving_requests_1_0_3_3            0.000000                                      # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_3
+      ramulator.read_transaction_bytes_1                3584                                      # The total byte of read transaction per channel
+     ramulator.write_transaction_bytes_1               14848                                      # The total byte of write transaction per channel
+       ramulator.row_hits_channel_1_core                 276                                      # Number of row hits per channel per core
+     ramulator.row_misses_channel_1_core                   2                                      # Number of row misses per channel per core
+  ramulator.row_conflicts_channel_1_core                  10                                      # Number of row conflicts per channel per core
+  ramulator.read_row_hits_channel_1_core                  49                                      # Number of row hits for read requests per channel per core
+                                     [0]                49.0                                      # 
+ramulator.read_row_misses_channel_1_core                   1                                      # Number of row misses for read requests per channel per core
+                                     [0]                 1.0                                      # 
+ramulator.read_row_conflicts_channel_1_core                   6                                      # Number of row conflicts for read requests per channel per core
+                                     [0]                 6.0                                      # 
+ ramulator.write_row_hits_channel_1_core                 227                                      # Number of row hits for write requests per channel per core
+                                     [0]               227.0                                      # 
+ramulator.write_row_misses_channel_1_core                   1                                      # Number of row misses for write requests per channel per core
+                                     [0]                 1.0                                      # 
+ramulator.write_row_conflicts_channel_1_core                   4                                      # Number of row conflicts for write requests per channel per core
+                                     [0]                 4.0                                      # 
+      ramulator.useless_activates_1_core                   0                                      # Number of useless activations. E.g, ACT -> PRE w/o RD or WR
+            ramulator.read_latency_avg_1           34.642857                                      # The average memory latency cycles (in memory time domain) per request for all read requests in this channel
+            ramulator.read_latency_sum_1                1940                                      # The memory latency cycles (in memory time domain) sum for all read requests in this channel
+        ramulator.req_queue_length_avg_1            0.524830                                      # Average of read and write queue length per memory cycle per channel.
+        ramulator.req_queue_length_sum_1                6024                                      # Sum of read and write queue length per memory cycle per channel.
+   ramulator.read_req_queue_length_avg_1            0.159261                                      # Read queue length average per memory cycle per channel.
+   ramulator.read_req_queue_length_sum_1                1828                                      # Read queue length sum per memory cycle per channel.
+  ramulator.write_req_queue_length_avg_1            0.365569                                      # Write queue length average per memory cycle per channel.
+  ramulator.write_req_queue_length_sum_1                4196                                      # Write queue length sum per memory cycle per channel.
+              ramulator.record_read_hits                 0.0                                      # record read hit count for this core when it reaches request limit or to the end
+                                     [0]                 0.0                                      # 
+            ramulator.record_read_misses                 0.0                                      # record_read_miss count for this core when it reaches request limit or to the end
+                                     [0]                 0.0                                      # 
+         ramulator.record_read_conflicts                 0.0                                      # record read conflict count for this core when it reaches request limit or to the end
+                                     [0]                 0.0                                      # 
+             ramulator.record_write_hits                 0.0                                      # record write hit count for this core when it reaches request limit or to the end
+                                     [0]                 0.0                                      # 
+           ramulator.record_write_misses                 0.0                                      # record write miss count for this core when it reaches request limit or to the end
+                                     [0]                 0.0                                      # 
+        ramulator.record_write_conflicts                 0.0                                      # record write conflict for this core when it reaches request limit or to the end
+                                     [0]                 0.0                                      # 
+                 ramulator.dram_capacity          8589934592                                      # Number of bytes in simulated DRAM
+                   ramulator.dram_cycles               11478                                      # Number of DRAM cycles simulated
+             ramulator.incoming_requests                 596                                      # Number of incoming requests to DRAM
+                 ramulator.read_requests                 114                                      # Number of incoming read requests to DRAM per core
+                                     [0]               114.0                                      # 
+                ramulator.write_requests                 482                                      # Number of incoming write requests to DRAM per core
+                                     [0]               482.0                                      # 
+       ramulator.ramulator_active_cycles                2049                                      # The total number of cycles that the DRAM part is active (serving R/W)
+ ramulator.incoming_requests_per_channel               596.0                                      # Number of incoming requests to each DRAM channel
+                                     [0]               308.0                                      # 
+                                     [1]               288.0                                      # 
+ramulator.incoming_read_reqs_per_channel               114.0                                      # Number of incoming read requests to each DRAM channel
+                                     [0]                58.0                                      # 
+                                     [1]                56.0                                      # 
+     ramulator.physical_page_replacement                   0                                      # The number of times that physical page replacement happens.
+             ramulator.maximum_bandwidth         38400000000                                      # The theoretical maximum bandwidth (Bps)
+          ramulator.in_queue_req_num_sum               17427                                      # Sum of read/write queue length
+     ramulator.in_queue_read_req_num_sum                4134                                      # Sum of read queue length
+    ramulator.in_queue_write_req_num_sum               13293                                      # Sum of write queue length
+          ramulator.in_queue_req_num_avg            1.518296                                      # Average of read/write queue length per memory cycle
+     ramulator.in_queue_read_req_num_avg            0.360167                                      # Average of read queue length per memory cycle
+    ramulator.in_queue_write_req_num_avg            1.158129                                      # Average of write queue length per memory cycle
+          ramulator.record_read_requests                 0.0                                      # record read requests for this core when it reaches request limit or to the end
+                                     [0]                 0.0                                      # 
+         ramulator.record_write_requests                 0.0                                      # record write requests for this core when it reaches request limit or to the end
+                                     [0]                 0.0                                      # 
--- a/tests/regression/relu/relu
+++ b/tests/regression/relu/relu
--- a/tests/runtime/fibonacci/Makefile
+++ b/tests/runtime/fibonacci/Makefile
@@ -1,9 +1,9 @@
 XLEN ?= 32

 ifeq ($(XLEN),32)
-RISCV_TOOLCHAIN_PATH = /opt/riscv-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
 else
-RISCV_TOOLCHAIN_PATH = /opt/riscv64-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
 endif

 RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf-
--- a/tests/runtime/hello/Makefile
+++ b/tests/runtime/hello/Makefile
@@ -1,9 +1,9 @@
 XLEN ?= 32

 ifeq ($(XLEN),32)
-RISCV_TOOLCHAIN_PATH = /opt/riscv-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
 else
-RISCV_TOOLCHAIN_PATH = /opt/riscv64-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
 endif

 RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf-
--- a/tests/runtime/simple/Makefile
+++ b/tests/runtime/simple/Makefile
@@ -1,9 +1,9 @@
 XLEN ?= 32

 ifeq ($(XLEN),32)
-RISCV_TOOLCHAIN_PATH = /opt/riscv-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
 else
-RISCV_TOOLCHAIN_PATH = /opt/riscv64-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
 endif

 RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf-
--- a/vortex-rtlsim.rc
+++ b/vortex-rtlsim.rc
@@ -0,0 +1,219 @@
+Magic 271485
+Revision Verdi_S-2021.09-SP1-1
+
+; Window Layout <x> <y> <width> <height> <signalwidth> <valuewidth>
+viewPort 0 33 3840 1560 374 148
+
+; File list:
+; openDirFile [-d delimiter] [-s time_offset] [-rf auto_bus_rule_file] path_name file_name
+openDirFile -d / "" "/scratch/hansung/src/vortex/trace.vcd.fsdb"
+
+; file time scale:
+; fileTimeScale ### s|ms|us|ns|ps
+
+; signal spacing:
+signalSpacing 5
+
+; windowTimeUnit is used for zoom, cursor & marker
+; waveform viewport range
+zoom 75133.753950 75225.192159
+cursor 75155.000000
+marker 0.000000
+
+; user define markers
+; userMarker time_pos marker_name color linestyle
+; visible top row signal index
+top 42
+; marker line index
+markerPos 78
+
+; event list
+; addEvent event_name event_expression
+; curEvent event_name
+
+
+
+COMPLEX_EVENT_BEGIN
+
+
+COMPLEX_EVENT_END
+
+
+
+; toolbar current search type
+; curSTATUS search_type
+curSTATUS ByValue
+
+
+addGroup "G1"
+activeDirFile "" "/scratch/hansung/src/vortex/trace.vcd.fsdb"
+addSignal -h 30 /TOP/clk
+addSignal -h 30 -holdScope reset
+addSignal -h 30 /TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/dcache_req_valid[3:0]
+addSignal -h 30 -holdScope dcache_rsp_ready
+addSubGroup "Issue"
+addSubGroup "Ibuffer"
+addSignal -h 30 /TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/issue/ibuffer/ibuffer_if/PC[31:0]
+addSignal -h 30 -holdScope rs1[5:0]
+addSignal -h 30 -holdScope wid[1:0]
+endSubGroup "Ibuffer"
+addSubGroup "gpr_rsp_if"
+addSignal -h 30 -UNSIGNED -HEX /TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/issue/dispatch/gpr_rsp_if/\rs1_data[0] [31:0]
+addSignal -h 30 -holdScope \rs1_data[1] [31:0]
+addSignal -h 30 -holdScope \rs1_data[2] [31:0]
+addSignal -h 30 -holdScope \rs1_data[3] [31:0]
+endSubGroup "gpr_rsp_if"
+addSubGroup "Dispatch"
+endSubGroup "Dispatch"
+endSubGroup "Issue"
+addSubGroup "Execute"
+addSubGroup "LSU"
+addSubGroup "lsu_req_if" -e FALSE
+addSignal -h 30 /TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/execute/lsu_req_if/ready
+addSignal -h 30 -holdScope valid
+addSignal -h 30 -UNSIGNED -HEX -holdScope PC[31:0]
+addSignal -h 30 /TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/execute/lsu_unit/lsu_req_if/\base_addr[0] [31:0]
+addSignal -h 30 -holdScope \base_addr[1] [31:0]
+addSignal -h 30 -holdScope \base_addr[2] [31:0]
+addSignal -h 30 -holdScope \base_addr[3] [31:0]
+addSignal -h 30 -holdScope offset[31:0]
+addSignal -h 30 /TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/execute/lsu_req_if/tmask[3:0]
+addSignal -h 30 -holdScope op_type[3:0]
+endSubGroup "lsu_req_if"
+addSignal -h 30 /TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/execute/lsu_unit/req_valid
+addSignal -h 30 -holdScope req_pc[31:0]
+addSignal -h 30 -holdScope dcache_req_ready
+addSignal -h 30 -holdScope req_sent_mask[3:0]
+endSubGroup "LSU"
+addSubGroup "dcache_req_if"
+addSignal -h 30 -UNSIGNED -HEX /TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/execute/dcache_req_if/ready[3:0]
+addSignal -h 30 -holdScope valid[3:0]
+addSignal -h 30 /TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/execute/lsu_unit/dcache_req_if/\addr[0] [29:0]
+addSignal -h 30 -holdScope \addr[1] [29:0]
+addSignal -h 30 -holdScope \addr[2] [29:0]
+addSignal -h 30 -holdScope \addr[3] [29:0]
+addSignal -h 30 /TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/mem_unit/dcache/\per_bank_core_req_addr[0] [25:0]
+addSignal -h 30 -holdScope \per_bank_core_req_addr[1] [25:0]
+addSignal -h 30 -holdScope \per_bank_core_req_addr[2] [25:0]
+addSignal -h 30 -holdScope \per_bank_core_req_addr[3] [25:0]
+endSubGroup "dcache_req_if"
+addSubGroup "dcache_rsp_if"
+addSignal -h 30 /TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/dcache_rsp_if/ready
+addSignal -h 30 -holdScope valid
+addSignal -h 30 /TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/execute/lsu_unit/dcache_rsp_if/tag[48:0]
+addSignal -h 30 /TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/dcache_rsp_if/tmask[3:0]
+endSubGroup "dcache_rsp_if"
+addSubGroup "alu_req_if" -e FALSE
+addSignal -h 30 /TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/execute/alu_req_if/PC[31:0]
+addSignal -h 30 -holdScope tmask[3:0]
+addSignal -h 30 -holdScope ready
+addSignal -h 30 -holdScope valid
+endSubGroup "alu_req_if"
+endSubGroup "Execute"
+addSubGroup "Decode" -e FALSE
+addSignal -h 30 -UNSIGNED -HEX /TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/decode/decode_if/tmask[3:0]
+endSubGroup "Decode"
+addGroup "L1 Dcache"
+addSignal -h 30 /TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/mem_unit/dcache/core_req_ready[3:0]
+addSignal -h 30 -UNSIGNED -HEX -holdScope core_req_valid[3:0]
+addSignal -h 30 -holdScope core_req_rw[3:0]
+addSignal -h 30 -holdScope \core_req_addr[0] [29:0]
+addSignal -h 30 -holdScope \core_req_addr[1] [29:0]
+addSignal -h 30 -holdScope \core_req_addr[2] [29:0]
+addSignal -h 30 -holdScope \core_req_addr[3] [29:0]
+addSubGroup "BankSel"
+addSignal -h 30 -UNSIGNED -HEX /TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/mem_unit/dcache/core_req_bank_sel/\core_req_bid[0] [1:0]
+addSignal -h 30 -holdScope \core_req_bid[1] [1:0]
+addSignal -h 30 -holdScope \core_req_bid[2] [1:0]
+addSignal -h 30 -holdScope \core_req_bid[3] [1:0]
+addSignal -h 30 -holdScope \core_req_line_addr[0] [25:0]
+addSignal -h 30 -holdScope \core_req_line_addr[1] [25:0]
+addSignal -h 30 -holdScope \core_req_line_addr[2] [25:0]
+addSignal -h 30 -holdScope \core_req_line_addr[3] [25:0]
+addSignal -h 30 -holdScope \per_bank_core_req_tid_r[0][0] [1:0]
+addSignal -h 30 -holdScope \per_bank_core_req_tid_r[1][0] [1:0]
+addSignal -h 30 -holdScope \per_bank_core_req_tid_r[2][0] [1:0]
+addSignal -h 30 -holdScope \per_bank_core_req_tid_r[3][0] [1:0]
+addSignal -h 30 -UNSIGNED -BIN -holdScope per_bank_core_req_valid[3:0]
+addSignal -h 30 -holdScope core_req_ready[3:0]
+endSubGroup "BankSel"
+addSignal -h 30 /TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/mem_unit/dcache/core_rsp_valid[0:0]
+addSignal -h 30 -holdScope mem_req_valid
+addSignal -h 30 -holdScope mem_req_rw
+addSignal -h 30 -holdScope mem_rsp_valid
+addGroup "L2"
+addSignal -h 30 -UNSIGNED -HEX /TOP/Vortex/\genblk2[0] /cluster/genblk3/l2cache/core_req_ready[1:0]
+addSignal -h 30 -holdScope core_req_valid[1:0]
+addSignal -h 30 -holdScope mem_req_valid
+addGroup "DRAM"
+addSignal -h 30 /TOP/Vortex/\genblk2[0] /cluster/genblk3/l2cache/mem_req_valid
+addSignal -h 30 -holdScope mem_rsp_valid
+addGroup "G3"
+
+; getSignalForm Scope Hierarchy Status
+; active file of getSignalForm
+activeDirFile "" "/scratch/hansung/src/vortex/trace.vcd.fsdb"
+
+GETSIGNALFORM_SCOPE_HIERARCHY_BEGIN
+getSignalForm close
+
+"/TOP"
+"/TOP/Vortex"
+"/TOP/Vortex/\genblk2[0] "
+"/TOP/Vortex/\genblk2[0] /cluster"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] "
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/dcache_rsp_if"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/mem_unit"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/mem_unit/dcache"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/mem_unit/dcache/\genblk7[0] "
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/mem_unit/dcache/core_req_bank_sel"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/mem_unit/dcache/mem_req_arb"
+"/TOP/Vortex/genblk3"
+
+SCOPE_LIST_BEGIN
+"/TOP"
+"/TOP/Vortex"
+"/TOP/Vortex/genblk3"
+"/TOP/Vortex/\genblk2[0]"
+"/TOP/Vortex/\genblk2[0] "
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/dcache_rsp_if"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/dcache_req_if"
+"/TOP/Vortex/\genblk2[0] /cluster"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] "
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/execute/lsu_unit"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/execute"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/execute/lsu_unit/dcache_rsp_if"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/execute/lsu_unit/dcache_req_if"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/mem_unit"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/mem_unit/dcache"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/mem_unit/dcache_mem_req_if"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/mem_unit/dcache/mem_req_arb"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/mem_unit/dcache/\genblk7[0] "
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/mem_unit/dcache/mem_req_arb/genblk1"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/lsu_req_if"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/issue"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/execute/lsu_req_if"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/execute/lsu_unit/lsu_req_if"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/issue/dispatch"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/issue/dispatch/ibuffer_if"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/issue/dispatch/gpr_rsp_if"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/issue/ibuffer"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/issue/ibuffer_if"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/issue/ibuffer/ibuffer_if"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/issue/ibuffer/decode_if"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/issue/gpr_rsp_if"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/pipeline/issue/gpr_req_if"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/mem_unit/dcache/core_req_bank_sel"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/mem_unit/dcache/core_req_bank_sel/genblk5"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/mem_unit/dcache/core_rsp_merge"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/mem_unit/dcache/core_req_bank_sel/genblk5/genblk1"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/mem_unit/dcache/core_req_bank_sel/genblk5/genblk1/genblk1"
+"/TOP/Vortex/\genblk2[0] /cluster/\genblk2[0] /core/mem_unit/dcache/core_req_bank_sel/genblk5/genblk1/genblk1/unnamedblk7"
+SCOPE_LIST_END
+
+GETSIGNALFORM_SCOPE_HIERARCHY_END
+
+
Author	SHA1	Message	Date
Hansung Kim	2f4fd11c93	Add vecadd-loop This is the same kernel as vecadd but repeated in a for-loop many times so that the runtime overhead at the startup is amortized.	2023-10-31 23:35:11 -07:00
Hansung Kim	6563ed696e	Force per-global-thread stack allocation It's easy to miss setting SM_ENABLE according to our memory hierarchy, which leads to data races in the stack that's hard to catch. So be safe and force enable per-core stack allocation.	2023-10-22 01:56:14 -07:00
Hansung Kim	92ed21f83f	[driver] Set different base address for device malloc Change the target area of malloc to something more akin to the heap area for a CPU userspace program, since that works better with Chipyard's default memory mapping scheme (0x80000000 and above).	2023-10-07 21:22:45 -07:00
Hansung Kim	46a60cf58e	[driver] Fix bug in addr range check for upload/download Device address should not be compared against LOCAL_MEM_SIZE but against an absolute max address. Introduce new DEVICE_MAX_ADDR for this.	2023-10-07 21:18:27 -07:00
Hansung Kim	6ffb8c37e9	[tests] Make an args.bin copy in vecadd Makefile	2023-10-07 19:15:10 -07:00
Hansung Kim	7da7a1a983	Merge remote-tracking branch 'ncastaneda02/master' Adds relu kernel.	2023-10-06 13:58:25 -07:00
Nico Castaneda	5b89ff2741	added split/join to relu	2023-10-06 13:23:13 -07:00
Nico Castaneda	8296e6be0f	relu test added	2023-10-06 13:20:31 -07:00
Hansung Kim	719b8048ab	[debug] Print warp id for memtraces	2023-09-26 13:36:22 -07:00
Hansung Kim	d34177ea9c	[debug] Elevate DEBUG_LEVEL for load/store; trace prefetch and fence	2023-09-26 11:44:20 -07:00
Hansung Kim	b97e94b8ed	[tests] vecadd\|sgemm\|saxpy: save input buffers to file	2023-09-25 13:45:03 -07:00
Hansung Kim	576e7aab78	[tests] Remove -LLCFLAGS from Makefile	2023-09-25 13:28:05 -07:00
Hansung Kim	905b1877fb	Add simple tid kernel that has zero arguments	2023-09-20 17:31:14 -07:00
Hansung Kim	3846d2ae59	Add annotated assembly dump for vecadd	2023-09-20 14:43:01 -07:00
Hansung Kim	c90fe56588	More doc comments	2023-09-20 14:42:56 -07:00
Hansung Kim	9efdd2ebb7	Change path to prebuilt toolchain	2023-09-10 14:43:54 -07:00
Hansung Kim	0caf3ad471	Add more comments to vx_spawn.c	2023-09-04 19:04:22 -07:00
Hansung Kim	2fa94b9c21	Hardcode paths into env script instead of args	2023-08-31 14:24:58 -07:00
Hansung Kim	62ebe0312f	[tests] Add comment on in-order command queue dispatch	2023-08-28 11:19:11 -07:00
Hansung Kim	19734fc5b6	Accept build dir and avoid dup run in env script	2023-08-24 13:53:44 -07:00
Hansung Kim	fdc0fdc958	[rtl] Add doc comments	2023-07-08 09:54:51 -07:00
Hansung Kim	3e290f6321	Add Verdi signal file	2023-07-08 00:23:31 -07:00
Hansung Kim	c24916b5e0	[tests] Add compute-bound variant of vecadd This loops 1000 times over `sum += A[i] + B[i]`, making every memory op hit at L1 cache.	2023-07-05 21:24:53 -07:00
Hansung Kim	0c40864522	Update PS1 in env script for clarity	2023-07-02 21:54:13 -07:00
Hansung Kim	f2f1249b93	Make base build directory a variable in script	2023-07-02 13:48:39 -07:00
Hansung Kim	f46383f350	Add #include <array> to fix compile error gcc complains std::array being undeclared when trying to build a fresh clone.	2023-07-02 13:47:41 -07:00
Hansung Kim	8caf476b1a	Merge remote-tracking branch 'upstream/master'	2023-07-02 13:27:08 -07:00
Hansung Kim	9cf5a29917	simx: add cycle and core id to load/store memory debug trace	2023-02-17 18:31:29 -08:00
Hansung Kim	d81e4085e2	simx: add thread ID and vlen to load/store memory debug trace	2023-02-17 17:59:56 -08:00
Hansung Kim	b9d1684582	Add sourceme scripts for env variables	2023-02-17 16:04:34 -08:00
Hansung Kim	f8fc305cbd	Add sourceme.sh Needed to set env variables for a custom setup where the prebuilt binaries are located outside of /opt.	2023-01-23 18:01:18 -08:00
Hansung Kim	70e1e2089d	Add .gitignore	2023-01-20 15:20:01 -08:00
Hansung Kim	547216d43f	[tests] reduce0, DotProduct: include LLVM_PREFIX in LD_LIBRARY_PATH Without this, poclcc fails with `error while loading shared libraries: libclangCodeGen.so.10: cannot open shared object file: No such file or directory`. Also fix wrong kernel file name.	2023-01-17 19:04:09 -08:00
Hansung Kim	bb4f38d000	[tests] opencl/convolution: fix linking of libsimx.so Fixes linker error by following suggestion of `/usr/bin/ld: warning: libsimx.so, needed by /scratch/hansung/src/vortex/driver/simx/libvortex.so, not found (try using -rpath or -rpath-link)`.	2023-01-17 19:04:09 -08:00
Hansung Kim	7c39cc2b5b	Makefile: respect RISCV_TOOLCHAIN_PATH if already set in env	2023-01-17 19:04:03 -08:00