diff --git a/.travis.yml b/.travis.yml index d652b27f..b07f5f45 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,7 +19,7 @@ install: - export PATH=$VERILATOR_ROOT/bin:$PATH # Install toolchain - ci/toolchain_install.sh -all - # clone build directory + # build project - make -s # stages ordering @@ -30,19 +30,16 @@ jobs: include: - stage: test name: coverage - script: cp -r $PWD ../build_coverage && cd ../build_coverage && ./ci/travis_run.py ./ci/regression.sh -coverage - - stage: test - name: tex - script: cp -r $PWD ../build_tex && cd ../build_tex && ./ci/travis_run.py ./ci/regression.sh -tex + script: cp -r $PWD ../build_coverage && cd ../build_coverage && ./ci/travis_run.py ./ci/regression.sh -coverage - stage: test name: cluster - script: cp -r $PWD ../build_cluster && cd ../build_cluster && ./ci/travis_run.py ./ci/regression.sh -cluster - - stage: test - name: debug - script: cp -r $PWD ../build_debug && cd ../build_debug && ./ci/travis_run.py ./ci/regression.sh -debug + script: cp -r $PWD ../build_cluster && cd ../build_cluster && ./ci/travis_run.py ./ci/regression.sh -cluster - stage: test name: config script: cp -r $PWD ../build_config && cd ../build_config && ./ci/travis_run.py ./ci/regression.sh -config + - stage: test + name: debug + script: cp -r $PWD ../build_debug && cd ../build_debug && ./ci/travis_run.py ./ci/regression.sh -debug - stage: test name: stress0 script: cp -r $PWD ../build_stress0 && cd ../build_stress0 && ./ci/travis_run.py ./ci/regression.sh -stress0 @@ -52,6 +49,12 @@ jobs: - stage: test name: compiler script: cp -r $PWD ../build_compiler && cd ../build_compiler && ./ci/travis_run.py ./ci/test_compiler.sh + - stage: test + name: tex + script: cp -r $PWD ../build_tex && cd ../build_tex && ./ci/travis_run.py ./ci/regression.sh -tex + - stage: test + name: unittest + script: cp -r $PWD ../build_unittest && cd ../build_unittest && ./ci/travis_run.py ./ci/regression.sh -unittest after_success: # Gather code coverage diff --git 
a/ci/regression.sh b/ci/regression.sh index 2b485a63..6976b4e3 100755 --- a/ci/regression.sh +++ b/ci/regression.sh @@ -185,7 +185,8 @@ while [ "$1" != "" ]; do -stress ) stress0 stress1 ;; - -all ) coverage + -all ) unittest + coverage tex cluster debug diff --git a/hw/rtl/VX_lsu_unit.sv b/hw/rtl/VX_lsu_unit.sv index 5116035f..85cec545 100644 --- a/hw/rtl/VX_lsu_unit.sv +++ b/hw/rtl/VX_lsu_unit.sv @@ -310,7 +310,7 @@ module VX_lsu_unit #( `ifndef SYNTHESIS reg [`LSUQ_SIZE-1:0][(`NW_BITS + 32 + `NR_BITS + `UUID_BITS + 64 + 1)-1:0] pending_reqs; - wire [63:0] delay_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE)); + wire [63:0] delay_timeout = 40000 * (1 ** (`L2_ENABLE + `L3_ENABLE)); always @(posedge clk) begin if (reset) begin diff --git a/hw/rtl/cache/VX_nc_bypass.sv b/hw/rtl/cache/VX_nc_bypass.sv index 21eb440a..19cd3921 100644 --- a/hw/rtl/cache/VX_nc_bypass.sv +++ b/hw/rtl/cache/VX_nc_bypass.sv @@ -100,7 +100,6 @@ module VX_nc_bypass #( localparam CORE_LDATAW = $clog2(CORE_DATA_WIDTH); localparam MEM_LDATAW = $clog2(MEM_DATA_WIDTH); localparam D = MEM_LDATAW - CORE_LDATAW; - localparam P = 2**D; // core request handling diff --git a/hw/rtl/libs/VX_popcount.sv b/hw/rtl/libs/VX_popcount.sv index 8c8b08d3..b1500af4 100644 --- a/hw/rtl/libs/VX_popcount.sv +++ b/hw/rtl/libs/VX_popcount.sv @@ -9,6 +9,8 @@ module VX_popcount #( input wire [N-1:0] in_i, output wire [M-1:0] cnt_o ); + `UNUSED_PARAM (MODEL) + `ifndef SYNTHESIS assign cnt_o = $countones(in_i); `else diff --git a/sim/rtlsim/Makefile b/sim/rtlsim/Makefile index 607dcf41..50c032c1 100644 --- a/sim/rtlsim/Makefile +++ b/sim/rtlsim/Makefile @@ -54,16 +54,14 @@ VL_FLAGS += $(CONFIGS) CXXFLAGS += $(CONFIGS) # Enable Verilator multithreaded simulation -#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))') +THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count())') +VL_FLAGS += -j $(THREADS) #VL_FLAGS += --threads $(THREADS) -# Enable VCD 
trace -VCD_TRACE = -DVCD_OUTPUT - # Debugigng ifdef DEBUG - VL_FLAGS += $(VCD_TRACE) --trace --trace-structs $(DBG_FLAGS) - CXXFLAGS += -g -O0 $(VCD_TRACE) $(DBG_FLAGS) + VL_FLAGS += --trace --trace-structs -DVCD_OUTPUT $(DBG_FLAGS) + CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS) else VL_FLAGS += -DNDEBUG CXXFLAGS += -O2 -DNDEBUG @@ -83,8 +81,6 @@ VL_FLAGS += -DIDIV_DPI FPU_CORE ?= FPU_DPI VL_FLAGS += -D$(FPU_CORE) -THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))') - PROJECT = rtlsim all: $(PROJECT) diff --git a/sim/vlsim/Makefile b/sim/vlsim/Makefile index bd34e60f..663c764e 100644 --- a/sim/vlsim/Makefile +++ b/sim/vlsim/Makefile @@ -51,16 +51,14 @@ VL_FLAGS += $(CONFIGS) CXXFLAGS += $(CONFIGS) # Enable Verilator multithreaded simulation -#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))') +THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count())') +VL_FLAGS += -j $(THREADS) #VL_FLAGS += --threads $(THREADS) -# Enable VCD trace -#VCD_TRACE = -DVCD_OUTPUT - # Debugigng ifdef DEBUG - VL_FLAGS += $(VCD_TRACE) --trace --trace-structs $(DBG_FLAGS) - CXXFLAGS += -g -O0 $(VCD_TRACE) $(DBG_FLAGS) + VL_FLAGS += --trace --trace-structs -DVCD_OUTPUT $(DBG_FLAGS) + CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS) else VL_FLAGS += -DNDEBUG CXXFLAGS += -O2 -DNDEBUG diff --git a/tests/regression/basic/kernel_scheduler.h b/tests/regression/basic/kernel_scheduler.h deleted file mode 100644 index 8fd7dc28..00000000 --- a/tests/regression/basic/kernel_scheduler.h +++ /dev/null @@ -1,153 +0,0 @@ -#include -#include - -#define NUM_CORES_MAX 32 - -#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) - -struct context_t { - uint32_t num_groups[3]; - uint32_t global_offset[3]; - uint32_t local_size[3]; - char * printf_buffer; - uint32_t *printf_buffer_position; - uint32_t printf_buffer_capacity; - uint32_t work_dim; -}; - -typedef void (*vx_pocl_workgroup_func) ( - const void * /* args */, - const struct context_t * /* context */, - uint32_t /* group_x */, - uint32_t /* group_y */, - uint32_t /* group_z */ -); - -typedef struct { - struct context_t * ctx; - vx_pocl_workgroup_func pfn; - const void * args; - int offset; - int N; - int R; -} wspawn_args_t; - -void kernel_spawn_callback(int core_id, int NW, int NT, int nW, wspawn_args_t* p_wspawn_args) { - assert(nW <= NW); - for (int wid = 0; wid < nW; ++wid) { - for (int tid = 0; tid < NT; ++tid) { - int wK = (p_wspawn_args->N * wid) + MIN(p_wspawn_args->R, wid); - int tK = p_wspawn_args->N + (wid < p_wspawn_args->R); - int offset = p_wspawn_args->offset + (wK * NT) + (tid * tK); - - int X = p_wspawn_args->ctx->num_groups[0]; - int Y = p_wspawn_args->ctx->num_groups[1]; - int XY = X * Y; - - for (int wg_id = offset, N = wg_id + tK; wg_id < N; ++wg_id) { - int k = wg_id / XY; - int wg_2d = wg_id - k * XY; - int j = wg_2d / X; - int i = wg_2d - j * X; - - int gid0 = p_wspawn_args->ctx->global_offset[0] + i; - int gid1 = p_wspawn_args->ctx->global_offset[1] + j; - int gid2 = p_wspawn_args->ctx->global_offset[2] + k; - - printf("c%d w%d t%d: g={%d, %d, %d}\n", core_id, wid, tid, gid0, gid1, gid2); - } - } - } -} - -void kernel_spawn_remaining_callback(int core_id, int NW, int NT, int wid, int nT, wspawn_args_t* p_wspawn_args) { - assert(wid < NW); - assert(nT <= NT); - for (int t = 0; t < nT; ++t) { - int tid = core_id * NW * NT + wid * NT + t; - - int wg_id = p_wspawn_args->offset + tid; - - int X = p_wspawn_args->ctx->num_groups[0]; - int Y = p_wspawn_args->ctx->num_groups[1]; - int XY = X * Y; - - int k = wg_id / XY; - int wg_2d = wg_id - k * XY; - int j = wg_2d / X; - int i = wg_2d - j * X; - - int 
gid0 = p_wspawn_args->ctx->global_offset[0] + i; - int gid1 = p_wspawn_args->ctx->global_offset[1] + j; - int gid2 = p_wspawn_args->ctx->global_offset[2] + k; - - printf("c%d w%d t%d: g={%d, %d, %d}\n", core_id, wid, tid, gid0, gid1, gid2); - } -} - -void kernel_run_once(context_t* ctx, int NC, int NW, int NT, int core_id) { - // total number of WGs - int X = ctx->num_groups[0]; - int Y = ctx->num_groups[1]; - int Z = ctx->num_groups[2]; - int Q = X * Y * Z; - - // current core id - if (core_id >= NUM_CORES_MAX) - return; - - // calculate necessary active cores - int WT = NW * NT; - int nC = (Q > WT) ? (Q / WT) : 1; - int nc = MIN(nC, NC); - if (core_id >= nc) - return; // terminate extra cores - - // number of workgroups per core - int wgs_per_core = Q / nc; - int wgs_per_core0 = wgs_per_core; - if (core_id == (NC-1)) { - int QC_r = Q - (nc * wgs_per_core0); - wgs_per_core0 += QC_r; // last core executes remaining WGs - } - - // number of workgroups per warp - int nW = wgs_per_core0 / NT; // total warps per core - int rT = wgs_per_core0 - (nW * NT); // remaining threads - int fW = (nW >= NW) ? (nW / NW) : 0; // full warps iterations - int rW = (fW != 0) ? 
(nW - fW * NW) : 0; // reamining full warps - if (0 == fW) - fW = 1; - - //-- - wspawn_args_t wspawn_args = { ctx, NULL, NULL, core_id * wgs_per_core, fW, rW }; - - //-- - if (nW >= 1) { - int nw = MIN(nW, NW); - kernel_spawn_callback(core_id, NW, NT, nw, &wspawn_args); - } - - //-- - if (rT != 0) { - wspawn_args.offset = wgs_per_core0 - rT; - kernel_spawn_remaining_callback(core_id, NW, NT, 0, rT, &wspawn_args); - } - } - - void kernel_run(int X, int Y, int Z, int NC, int NW, int NT) { - context_t ctx; - - ctx.num_groups[0] = X; - ctx.num_groups[1] = Y; - ctx.num_groups[2] = Z; - ctx.global_offset[0] = 0; - ctx.global_offset[1] = 0; - ctx.global_offset[2] = 0; - - for (int cid = 0; cid < NC; ++cid) { - kernel_run_once(&ctx, NC, NW, NT, cid); - } - - exit (0); - } \ No newline at end of file diff --git a/tests/regression/basic/main.cpp b/tests/regression/basic/main.cpp index 5183b04c..bf993834 100755 --- a/tests/regression/basic/main.cpp +++ b/tests/regression/basic/main.cpp @@ -4,7 +4,6 @@ #include #include #include "common.h" -#include "kernel_scheduler.h" #define RT_CHECK(_expr) \ do { \