diff --git a/.travis.yml b/.travis.yml
index d652b27f..b07f5f45 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -19,7 +19,7 @@ install:
   - export PATH=$VERILATOR_ROOT/bin:$PATH
   # Install toolchain
   - ci/toolchain_install.sh -all
-  # clone build directory
+  # build project
   - make -s
 
 # stages ordering
@@ -30,19 +30,16 @@ jobs:
   include:
     - stage: test
       name: coverage
-      script: cp -r $PWD ../build_coverage && cd ../build_coverage && ./ci/travis_run.py ./ci/regression.sh -coverage
-    - stage: test
-      name: tex
-      script: cp -r $PWD ../build_tex && cd ../build_tex && ./ci/travis_run.py ./ci/regression.sh -tex
+      script: cp -r $PWD ../build_coverage && cd ../build_coverage && ./ci/travis_run.py ./ci/regression.sh -coverage
     - stage: test
       name: cluster
-      script: cp -r $PWD ../build_cluster && cd ../build_cluster && ./ci/travis_run.py ./ci/regression.sh -cluster
-    - stage: test
-      name: debug
-      script: cp -r $PWD ../build_debug && cd ../build_debug && ./ci/travis_run.py ./ci/regression.sh -debug
+      script: cp -r $PWD ../build_cluster && cd ../build_cluster && ./ci/travis_run.py ./ci/regression.sh -cluster
     - stage: test
       name: config
       script: cp -r $PWD ../build_config && cd ../build_config && ./ci/travis_run.py ./ci/regression.sh -config
+    - stage: test
+      name: debug
+      script: cp -r $PWD ../build_debug && cd ../build_debug && ./ci/travis_run.py ./ci/regression.sh -debug
     - stage: test
       name: stress0
       script: cp -r $PWD ../build_stress0 && cd ../build_stress0 && ./ci/travis_run.py ./ci/regression.sh -stress0
@@ -52,6 +49,12 @@ jobs:
     - stage: test
       name: compiler
       script: cp -r $PWD ../build_compiler && cd ../build_compiler && ./ci/travis_run.py ./ci/test_compiler.sh
+    - stage: test
+      name: tex
+      script: cp -r $PWD ../build_tex && cd ../build_tex && ./ci/travis_run.py ./ci/regression.sh -tex
+    - stage: test
+      name: unittest
+      script: cp -r $PWD ../build_unittest && cd ../build_unittest && ./ci/travis_run.py ./ci/regression.sh -unittest
   
 after_success:
   # Gather code coverage
diff --git a/ci/regression.sh b/ci/regression.sh
index 2b485a63..6976b4e3 100755
--- a/ci/regression.sh
+++ b/ci/regression.sh
@@ -185,7 +185,8 @@ while [ "$1" != "" ]; do
         -stress ) stress0
                   stress1
                 ;;
-        -all ) coverage
+        -all ) unittest
+               coverage
                tex
                cluster
                debug
diff --git a/hw/rtl/VX_lsu_unit.sv b/hw/rtl/VX_lsu_unit.sv
index 5116035f..85cec545 100644
--- a/hw/rtl/VX_lsu_unit.sv
+++ b/hw/rtl/VX_lsu_unit.sv
@@ -310,7 +310,7 @@ module VX_lsu_unit #(
 
 `ifndef SYNTHESIS
     reg [`LSUQ_SIZE-1:0][(`NW_BITS + 32 + `NR_BITS + `UUID_BITS + 64 + 1)-1:0] pending_reqs;
-    wire [63:0] delay_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE));
+    wire [63:0] delay_timeout = 40000 * (1 ** (`L2_ENABLE + `L3_ENABLE));
 
     always @(posedge clk) begin
         if (reset) begin
diff --git a/hw/rtl/cache/VX_nc_bypass.sv b/hw/rtl/cache/VX_nc_bypass.sv
index 21eb440a..19cd3921 100644
--- a/hw/rtl/cache/VX_nc_bypass.sv
+++ b/hw/rtl/cache/VX_nc_bypass.sv
@@ -100,7 +100,6 @@ module VX_nc_bypass #(
     localparam CORE_LDATAW = $clog2(CORE_DATA_WIDTH);
     localparam MEM_LDATAW  = $clog2(MEM_DATA_WIDTH);
     localparam D = MEM_LDATAW - CORE_LDATAW;
-    localparam P = 2**D;
 
     // core request handling
 
diff --git a/hw/rtl/libs/VX_popcount.sv b/hw/rtl/libs/VX_popcount.sv
index 8c8b08d3..b1500af4 100644
--- a/hw/rtl/libs/VX_popcount.sv
+++ b/hw/rtl/libs/VX_popcount.sv
@@ -9,6 +9,8 @@ module VX_popcount #(
     input  wire [N-1:0] in_i,
     output wire [M-1:0] cnt_o
 );
+    `UNUSED_PARAM (MODEL)
+
 `ifndef SYNTHESIS
     assign cnt_o = $countones(in_i);
 `else
diff --git a/sim/rtlsim/Makefile b/sim/rtlsim/Makefile
index 607dcf41..50c032c1 100644
--- a/sim/rtlsim/Makefile
+++ b/sim/rtlsim/Makefile
@@ -54,16 +54,14 @@ VL_FLAGS += $(CONFIGS)
 CXXFLAGS += $(CONFIGS)
 
 # Enable Verilator multithreaded simulation
-#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
+THREADS ?= $(shell getconf _NPROCESSORS_ONLN 2>/dev/null || echo 1)
+VL_FLAGS += -j $(THREADS)
 #VL_FLAGS += --threads $(THREADS)
 
-# Enable VCD trace
-VCD_TRACE = -DVCD_OUTPUT
-
 # Debugigng
 ifdef DEBUG
-	VL_FLAGS += $(VCD_TRACE) --trace --trace-structs $(DBG_FLAGS)
-	CXXFLAGS += -g -O0 $(VCD_TRACE) $(DBG_FLAGS)
+	VL_FLAGS += --trace --trace-structs -DVCD_OUTPUT $(DBG_FLAGS)
+	CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS)
 else    
 	VL_FLAGS += -DNDEBUG
 	CXXFLAGS += -O2 -DNDEBUG
@@ -83,8 +81,6 @@ VL_FLAGS += -DIDIV_DPI
 FPU_CORE ?= FPU_DPI
 VL_FLAGS += -D$(FPU_CORE)
 
-THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
-
 PROJECT = rtlsim
 
 all: $(PROJECT)
diff --git a/sim/vlsim/Makefile b/sim/vlsim/Makefile
index bd34e60f..663c764e 100644
--- a/sim/vlsim/Makefile
+++ b/sim/vlsim/Makefile
@@ -51,16 +51,14 @@ VL_FLAGS += $(CONFIGS)
 CXXFLAGS += $(CONFIGS)
 
 # Enable Verilator multithreaded simulation
-#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
+THREADS ?= $(shell getconf _NPROCESSORS_ONLN 2>/dev/null || echo 1)
+VL_FLAGS += -j $(THREADS)
 #VL_FLAGS += --threads $(THREADS)
 
-# Enable VCD trace
-#VCD_TRACE = -DVCD_OUTPUT
-
 # Debugigng
 ifdef DEBUG
-	VL_FLAGS += $(VCD_TRACE) --trace --trace-structs $(DBG_FLAGS)
-	CXXFLAGS += -g -O0 $(VCD_TRACE) $(DBG_FLAGS)
+	VL_FLAGS += --trace --trace-structs -DVCD_OUTPUT $(DBG_FLAGS)
+	#CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS)
 else    
 	VL_FLAGS += -DNDEBUG
 	CXXFLAGS += -O2 -DNDEBUG
diff --git a/tests/regression/basic/kernel_scheduler.h b/tests/regression/basic/kernel_scheduler.h
deleted file mode 100644
index 8fd7dc28..00000000
--- a/tests/regression/basic/kernel_scheduler.h
+++ /dev/null
@@ -1,153 +0,0 @@
-#include <iostream>
-#include <assert.h>
-
-#define NUM_CORES_MAX 32
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-
-struct context_t {
-  uint32_t num_groups[3];
-  uint32_t global_offset[3];
-  uint32_t local_size[3];
-  char * printf_buffer;
-  uint32_t *printf_buffer_position;
-  uint32_t printf_buffer_capacity;
-  uint32_t work_dim;
-};
-
-typedef void (*vx_pocl_workgroup_func) (
-  const void * /* args */,
-	const struct context_t * /* context */,
-	uint32_t /* group_x */,
-	uint32_t /* group_y */,
-	uint32_t /* group_z */
-);
-
-typedef struct {
-  struct context_t * ctx;
-  vx_pocl_workgroup_func pfn;
-  const void * args;
-  int offset; 
-  int N;
-  int R;
-} wspawn_args_t;
-
-void kernel_spawn_callback(int core_id, int NW, int NT, int nW, wspawn_args_t* p_wspawn_args) {
-  assert(nW <= NW);
-  for (int wid = 0; wid < nW; ++wid) {
-    for (int tid = 0; tid < NT; ++tid) {
-      int wK = (p_wspawn_args->N * wid) + MIN(p_wspawn_args->R, wid);
-      int tK = p_wspawn_args->N + (wid < p_wspawn_args->R);
-      int offset = p_wspawn_args->offset + (wK * NT) + (tid * tK);
-
-      int X = p_wspawn_args->ctx->num_groups[0];
-      int Y = p_wspawn_args->ctx->num_groups[1];
-      int XY = X * Y;
-
-      for (int wg_id = offset, N = wg_id + tK; wg_id < N; ++wg_id) {    
-        int k = wg_id / XY;
-        int wg_2d = wg_id - k * XY;
-        int j = wg_2d / X;
-        int i = wg_2d - j * X;
-
-        int gid0 = p_wspawn_args->ctx->global_offset[0] + i;
-        int gid1 = p_wspawn_args->ctx->global_offset[1] + j;
-        int gid2 = p_wspawn_args->ctx->global_offset[2] + k;
-
-        printf("c%d w%d t%d: g={%d, %d, %d}\n", core_id, wid, tid, gid0, gid1, gid2);
-      }
-    }
-  }
-}
-
-void kernel_spawn_remaining_callback(int core_id, int NW, int NT, int wid, int nT, wspawn_args_t* p_wspawn_args) {    
-  assert(wid < NW);
-  assert(nT <= NT);
-  for (int t = 0; t < nT; ++t) {
-    int tid = core_id * NW * NT + wid * NT + t;
-
-    int wg_id = p_wspawn_args->offset + tid;
-
-    int X = p_wspawn_args->ctx->num_groups[0];
-    int Y = p_wspawn_args->ctx->num_groups[1];
-    int XY = X * Y;
-    
-    int k = wg_id / XY;
-    int wg_2d = wg_id - k * XY;
-    int j = wg_2d / X;
-    int i = wg_2d - j * X;
-
-    int gid0 = p_wspawn_args->ctx->global_offset[0] + i;
-    int gid1 = p_wspawn_args->ctx->global_offset[1] + j;
-    int gid2 = p_wspawn_args->ctx->global_offset[2] + k;
-
-    printf("c%d w%d t%d: g={%d, %d, %d}\n", core_id, wid, tid, gid0, gid1, gid2);
-  }
-}
-
-void kernel_run_once(context_t* ctx, int NC, int NW, int NT, int core_id) {
-    // total number of WGs
-    int X = ctx->num_groups[0];
-    int Y = ctx->num_groups[1];
-    int Z = ctx->num_groups[2];
-    int Q = X * Y * Z;
-
-    // current core id
-    if (core_id >= NUM_CORES_MAX)
-      return;
-
-    // calculate necessary active cores
-    int WT = NW * NT;
-    int nC = (Q > WT) ? (Q / WT) : 1;
-    int nc = MIN(nC, NC);
-    if (core_id >= nc)
-      return; // terminate extra cores
-
-    // number of workgroups per core
-    int wgs_per_core = Q / nc;
-    int wgs_per_core0 = wgs_per_core;  
-    if (core_id == (NC-1)) {    
-      int QC_r = Q - (nc * wgs_per_core0); 
-      wgs_per_core0 += QC_r; // last core executes remaining WGs
-    }
-
-    // number of workgroups per warp
-    int nW = wgs_per_core0 / NT;              // total warps per core
-    int rT = wgs_per_core0 - (nW * NT);       // remaining threads
-    int fW = (nW >= NW) ? (nW / NW) : 0;      // full warps iterations
-    int rW = (fW != 0) ? (nW - fW * NW) : 0;  // reamining full warps
-    if (0 == fW)
-      fW = 1;
-
-    //--
-    wspawn_args_t wspawn_args = { ctx, NULL, NULL, core_id * wgs_per_core, fW, rW };
-
-    //--
-    if (nW >= 1)	{ 
-      int nw = MIN(nW, NW);
-      kernel_spawn_callback(core_id, NW, NT, nw, &wspawn_args);
-    }  
-
-    //--    
-    if (rT != 0) {
-      wspawn_args.offset = wgs_per_core0 - rT;
-      kernel_spawn_remaining_callback(core_id, NW, NT, 0, rT, &wspawn_args);
-    }
-  }
-
-  void kernel_run(int X, int Y, int Z, int NC, int NW, int NT) {
-    context_t ctx;
-
-    ctx.num_groups[0] = X;
-    ctx.num_groups[1] = Y;
-    ctx.num_groups[2] = Z;
-    ctx.global_offset[0] = 0;
-    ctx.global_offset[1] = 0;
-    ctx.global_offset[2] = 0;
-
-    for (int cid = 0; cid < NC; ++cid) {
-      kernel_run_once(&ctx, NC, NW, NT, cid);
-    }
-
-    exit (0);
-  }
\ No newline at end of file
diff --git a/tests/regression/basic/main.cpp b/tests/regression/basic/main.cpp
index 5183b04c..bf993834 100755
--- a/tests/regression/basic/main.cpp
+++ b/tests/regression/basic/main.cpp
@@ -4,7 +4,6 @@
 #include <vortex.h>
 #include <chrono>
 #include "common.h"
-#include "kernel_scheduler.h"
 
 #define RT_CHECK(_expr)                                         \
    do {                                                         \