minor updates
This commit is contained in:
17
.travis.yml
17
.travis.yml
@@ -19,7 +19,7 @@ install:
|
|||||||
- export PATH=$VERILATOR_ROOT/bin:$PATH
|
- export PATH=$VERILATOR_ROOT/bin:$PATH
|
||||||
# Install toolchain
|
# Install toolchain
|
||||||
- ci/toolchain_install.sh -all
|
- ci/toolchain_install.sh -all
|
||||||
# clone build directory
|
# build project
|
||||||
- make -s
|
- make -s
|
||||||
|
|
||||||
# stages ordering
|
# stages ordering
|
||||||
@@ -31,18 +31,15 @@ jobs:
|
|||||||
- stage: test
|
- stage: test
|
||||||
name: coverage
|
name: coverage
|
||||||
script: cp -r $PWD ../build_coverage && cd ../build_coverage && ./ci/travis_run.py ./ci/regression.sh -coverage
|
script: cp -r $PWD ../build_coverage && cd ../build_coverage && ./ci/travis_run.py ./ci/regression.sh -coverage
|
||||||
- stage: test
|
|
||||||
name: tex
|
|
||||||
script: cp -r $PWD ../build_tex && cd ../build_tex && ./ci/travis_run.py ./ci/regression.sh -tex
|
|
||||||
- stage: test
|
- stage: test
|
||||||
name: cluster
|
name: cluster
|
||||||
script: cp -r $PWD ../build_cluster && cd ../build_cluster && ./ci/travis_run.py ./ci/regression.sh -cluster
|
script: cp -r $PWD ../build_cluster && cd ../build_cluster && ./ci/travis_run.py ./ci/regression.sh -cluster
|
||||||
- stage: test
|
|
||||||
name: debug
|
|
||||||
script: cp -r $PWD ../build_debug && cd ../build_debug && ./ci/travis_run.py ./ci/regression.sh -debug
|
|
||||||
- stage: test
|
- stage: test
|
||||||
name: config
|
name: config
|
||||||
script: cp -r $PWD ../build_config && cd ../build_config && ./ci/travis_run.py ./ci/regression.sh -config
|
script: cp -r $PWD ../build_config && cd ../build_config && ./ci/travis_run.py ./ci/regression.sh -config
|
||||||
|
- stage: test
|
||||||
|
name: debug
|
||||||
|
script: cp -r $PWD ../build_debug && cd ../build_debug && ./ci/travis_run.py ./ci/regression.sh -debug
|
||||||
- stage: test
|
- stage: test
|
||||||
name: stress0
|
name: stress0
|
||||||
script: cp -r $PWD ../build_stress0 && cd ../build_stress0 && ./ci/travis_run.py ./ci/regression.sh -stress0
|
script: cp -r $PWD ../build_stress0 && cd ../build_stress0 && ./ci/travis_run.py ./ci/regression.sh -stress0
|
||||||
@@ -52,6 +49,12 @@ jobs:
|
|||||||
- stage: test
|
- stage: test
|
||||||
name: compiler
|
name: compiler
|
||||||
script: cp -r $PWD ../build_compiler && cd ../build_compiler && ./ci/travis_run.py ./ci/test_compiler.sh
|
script: cp -r $PWD ../build_compiler && cd ../build_compiler && ./ci/travis_run.py ./ci/test_compiler.sh
|
||||||
|
- stage: test
|
||||||
|
name: tex
|
||||||
|
script: cp -r $PWD ../build_tex && cd ../build_tex && ./ci/travis_run.py ./ci/regression.sh -tex
|
||||||
|
- stage: test
|
||||||
|
name: unittest
|
||||||
|
# Copy the checkout into its own build_unittest directory so the cd that follows enters a directory that exists.
script: cp -r $PWD ../build_unittest && cd ../build_unittest && ./ci/travis_run.py ./ci/regression.sh -unittest
|
||||||
|
|
||||||
after_success:
|
after_success:
|
||||||
# Gather code coverage
|
# Gather code coverage
|
||||||
|
|||||||
@@ -185,7 +185,8 @@ while [ "$1" != "" ]; do
|
|||||||
-stress ) stress0
|
-stress ) stress0
|
||||||
stress1
|
stress1
|
||||||
;;
|
;;
|
||||||
-all ) coverage
|
-all ) unittest
|
||||||
|
coverage
|
||||||
tex
|
tex
|
||||||
cluster
|
cluster
|
||||||
debug
|
debug
|
||||||
|
|||||||
@@ -310,7 +310,7 @@ module VX_lsu_unit #(
|
|||||||
|
|
||||||
`ifndef SYNTHESIS
|
`ifndef SYNTHESIS
|
||||||
reg [`LSUQ_SIZE-1:0][(`NW_BITS + 32 + `NR_BITS + `UUID_BITS + 64 + 1)-1:0] pending_reqs;
|
reg [`LSUQ_SIZE-1:0][(`NW_BITS + 32 + `NR_BITS + `UUID_BITS + 64 + 1)-1:0] pending_reqs;
|
||||||
wire [63:0] delay_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE));
|
wire [63:0] delay_timeout = 40000 * (1 ** (`L2_ENABLE + `L3_ENABLE));
|
||||||
|
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
|
|||||||
1
hw/rtl/cache/VX_nc_bypass.sv
vendored
1
hw/rtl/cache/VX_nc_bypass.sv
vendored
@@ -100,7 +100,6 @@ module VX_nc_bypass #(
|
|||||||
localparam CORE_LDATAW = $clog2(CORE_DATA_WIDTH);
|
localparam CORE_LDATAW = $clog2(CORE_DATA_WIDTH);
|
||||||
localparam MEM_LDATAW = $clog2(MEM_DATA_WIDTH);
|
localparam MEM_LDATAW = $clog2(MEM_DATA_WIDTH);
|
||||||
localparam D = MEM_LDATAW - CORE_LDATAW;
|
localparam D = MEM_LDATAW - CORE_LDATAW;
|
||||||
localparam P = 2**D;
|
|
||||||
|
|
||||||
// core request handling
|
// core request handling
|
||||||
|
|
||||||
|
|||||||
@@ -9,6 +9,8 @@ module VX_popcount #(
|
|||||||
input wire [N-1:0] in_i,
|
input wire [N-1:0] in_i,
|
||||||
output wire [M-1:0] cnt_o
|
output wire [M-1:0] cnt_o
|
||||||
);
|
);
|
||||||
|
`UNUSED_PARAM (MODEL)
|
||||||
|
|
||||||
`ifndef SYNTHESIS
|
`ifndef SYNTHESIS
|
||||||
assign cnt_o = $countones(in_i);
|
assign cnt_o = $countones(in_i);
|
||||||
`else
|
`else
|
||||||
|
|||||||
@@ -54,16 +54,14 @@ VL_FLAGS += $(CONFIGS)
|
|||||||
CXXFLAGS += $(CONFIGS)
|
CXXFLAGS += $(CONFIGS)
|
||||||
|
|
||||||
# Enable Verilator multithreaded simulation
|
# Enable Verilator multithreaded simulation
|
||||||
#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
|
THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count())')
|
||||||
|
VL_FLAGS += -j $(THREADS)
|
||||||
#VL_FLAGS += --threads $(THREADS)
|
#VL_FLAGS += --threads $(THREADS)
|
||||||
|
|
||||||
# Enable VCD trace
|
|
||||||
VCD_TRACE = -DVCD_OUTPUT
|
|
||||||
|
|
||||||
# Debugging
|
# Debugging
|
||||||
ifdef DEBUG
|
ifdef DEBUG
|
||||||
VL_FLAGS += $(VCD_TRACE) --trace --trace-structs $(DBG_FLAGS)
|
VL_FLAGS += --trace --trace-structs -DVCD_OUTPUT $(DBG_FLAGS)
|
||||||
CXXFLAGS += -g -O0 $(VCD_TRACE) $(DBG_FLAGS)
|
CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS)
|
||||||
else
|
else
|
||||||
VL_FLAGS += -DNDEBUG
|
VL_FLAGS += -DNDEBUG
|
||||||
CXXFLAGS += -O2 -DNDEBUG
|
CXXFLAGS += -O2 -DNDEBUG
|
||||||
@@ -83,8 +81,6 @@ VL_FLAGS += -DIDIV_DPI
|
|||||||
FPU_CORE ?= FPU_DPI
|
FPU_CORE ?= FPU_DPI
|
||||||
VL_FLAGS += -D$(FPU_CORE)
|
VL_FLAGS += -D$(FPU_CORE)
|
||||||
|
|
||||||
THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
|
|
||||||
|
|
||||||
PROJECT = rtlsim
|
PROJECT = rtlsim
|
||||||
|
|
||||||
all: $(PROJECT)
|
all: $(PROJECT)
|
||||||
|
|||||||
@@ -51,16 +51,14 @@ VL_FLAGS += $(CONFIGS)
|
|||||||
CXXFLAGS += $(CONFIGS)
|
CXXFLAGS += $(CONFIGS)
|
||||||
|
|
||||||
# Enable Verilator multithreaded simulation
|
# Enable Verilator multithreaded simulation
|
||||||
#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
|
THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count())')
|
||||||
|
VL_FLAGS += -j $(THREADS)
|
||||||
#VL_FLAGS += --threads $(THREADS)
|
#VL_FLAGS += --threads $(THREADS)
|
||||||
|
|
||||||
# Enable VCD trace
|
|
||||||
#VCD_TRACE = -DVCD_OUTPUT
|
|
||||||
|
|
||||||
# Debugging
|
# Debugging
|
||||||
ifdef DEBUG
|
ifdef DEBUG
|
||||||
VL_FLAGS += $(VCD_TRACE) --trace --trace-structs $(DBG_FLAGS)
|
VL_FLAGS += --trace --trace-structs -DVCD_OUTPUT $(DBG_FLAGS)
|
||||||
CXXFLAGS += -g -O0 $(VCD_TRACE) $(DBG_FLAGS)
|
#CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS)
|
||||||
else
|
else
|
||||||
VL_FLAGS += -DNDEBUG
|
VL_FLAGS += -DNDEBUG
|
||||||
CXXFLAGS += -O2 -DNDEBUG
|
CXXFLAGS += -O2 -DNDEBUG
|
||||||
|
|||||||
@@ -1,153 +0,0 @@
|
|||||||
#include <iostream>
|
|
||||||
#include <assert.h>
|
|
||||||
|
|
||||||
// Core ids at or above this bound are rejected by kernel_run_once.
#define NUM_CORES_MAX 32

// Smaller of two values. This is a function-like macro, so the selected
// argument is evaluated twice: do not pass expressions with side effects.
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
|
||||||
|
|
||||||
// Launch context handed to the scheduling helpers in this file.
// Only num_groups and global_offset are read by the code here; the remaining
// members are not referenced in this file.
// NOTE(review): the layout presumably mirrors a context structure defined by
// the OpenCL runtime this was written against -- confirm before reordering.
struct context_t {
  uint32_t  num_groups[3];           // workgroup count along x, y, z
  uint32_t  global_offset[3];        // added to the decomposed x, y, z group index
  uint32_t  local_size[3];           // unused in this file
  char     *printf_buffer;           // unused in this file
  uint32_t *printf_buffer_position;  // unused in this file
  uint32_t  printf_buffer_capacity;  // unused in this file
  uint32_t  work_dim;                // unused in this file
};
|
|
||||||
|
|
||||||
// Signature of a workgroup entry point: it receives an opaque argument block,
// the launch context and the x/y/z index of the group to execute.
typedef void (*vx_pocl_workgroup_func)(const void *args,
                                       const struct context_t *context,
                                       uint32_t group_x,
                                       uint32_t group_y,
                                       uint32_t group_z);
|
|
||||||
|
|
||||||
typedef struct {
|
|
||||||
struct context_t * ctx;
|
|
||||||
vx_pocl_workgroup_func pfn;
|
|
||||||
const void * args;
|
|
||||||
int offset;
|
|
||||||
int N;
|
|
||||||
int R;
|
|
||||||
} wspawn_args_t;
|
|
||||||
|
|
||||||
void kernel_spawn_callback(int core_id, int NW, int NT, int nW, wspawn_args_t* p_wspawn_args) {
|
|
||||||
assert(nW <= NW);
|
|
||||||
for (int wid = 0; wid < nW; ++wid) {
|
|
||||||
for (int tid = 0; tid < NT; ++tid) {
|
|
||||||
int wK = (p_wspawn_args->N * wid) + MIN(p_wspawn_args->R, wid);
|
|
||||||
int tK = p_wspawn_args->N + (wid < p_wspawn_args->R);
|
|
||||||
int offset = p_wspawn_args->offset + (wK * NT) + (tid * tK);
|
|
||||||
|
|
||||||
int X = p_wspawn_args->ctx->num_groups[0];
|
|
||||||
int Y = p_wspawn_args->ctx->num_groups[1];
|
|
||||||
int XY = X * Y;
|
|
||||||
|
|
||||||
for (int wg_id = offset, N = wg_id + tK; wg_id < N; ++wg_id) {
|
|
||||||
int k = wg_id / XY;
|
|
||||||
int wg_2d = wg_id - k * XY;
|
|
||||||
int j = wg_2d / X;
|
|
||||||
int i = wg_2d - j * X;
|
|
||||||
|
|
||||||
int gid0 = p_wspawn_args->ctx->global_offset[0] + i;
|
|
||||||
int gid1 = p_wspawn_args->ctx->global_offset[1] + j;
|
|
||||||
int gid2 = p_wspawn_args->ctx->global_offset[2] + k;
|
|
||||||
|
|
||||||
printf("c%d w%d t%d: g={%d, %d, %d}\n", core_id, wid, tid, gid0, gid1, gid2);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void kernel_spawn_remaining_callback(int core_id, int NW, int NT, int wid, int nT, wspawn_args_t* p_wspawn_args) {
|
|
||||||
assert(wid < NW);
|
|
||||||
assert(nT <= NT);
|
|
||||||
for (int t = 0; t < nT; ++t) {
|
|
||||||
int tid = core_id * NW * NT + wid * NT + t;
|
|
||||||
|
|
||||||
int wg_id = p_wspawn_args->offset + tid;
|
|
||||||
|
|
||||||
int X = p_wspawn_args->ctx->num_groups[0];
|
|
||||||
int Y = p_wspawn_args->ctx->num_groups[1];
|
|
||||||
int XY = X * Y;
|
|
||||||
|
|
||||||
int k = wg_id / XY;
|
|
||||||
int wg_2d = wg_id - k * XY;
|
|
||||||
int j = wg_2d / X;
|
|
||||||
int i = wg_2d - j * X;
|
|
||||||
|
|
||||||
int gid0 = p_wspawn_args->ctx->global_offset[0] + i;
|
|
||||||
int gid1 = p_wspawn_args->ctx->global_offset[1] + j;
|
|
||||||
int gid2 = p_wspawn_args->ctx->global_offset[2] + k;
|
|
||||||
|
|
||||||
printf("c%d w%d t%d: g={%d, %d, %d}\n", core_id, wid, tid, gid0, gid1, gid2);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Simulates what a single core does when a kernel is launched: it decides
// whether the core is needed at all, computes the share of workgroups the core
// owns, splits that share into full warps plus a tail of leftover threads and
// hands both parts to the spawn callbacks (which print the group ids).
//
//   ctx      launch context; num_groups[0..2] give the workgroup grid
//   NC       number of cores available
//   NW, NT   warps per core / threads per warp (must both be positive)
//   core_id  core being simulated
void kernel_run_once(context_t* ctx, int NC, int NW, int NT, int core_id) {
  // A core without warps or threads cannot run anything; bailing out here also
  // keeps the divisions by WT and NT below well defined.
  if (NW <= 0 || NT <= 0)
    return;

  // total number of WGs
  int X = ctx->num_groups[0];
  int Y = ctx->num_groups[1];
  int Z = ctx->num_groups[2];
  int Q = X * Y * Z;

  // current core id
  if (core_id >= NUM_CORES_MAX)
    return;

  // calculate necessary active cores
  int WT = NW * NT;
  int nC = (Q > WT) ? (Q / WT) : 1;
  int nc = MIN(nC, NC);
  if (core_id >= nc)
    return; // terminate extra cores

  // number of workgroups per core
  int wgs_per_core = Q / nc;
  int wgs_per_core0 = wgs_per_core;
  // The leftover WGs go to the last *active* core (nc - 1). Comparing against
  // NC - 1 dropped them whenever fewer than NC cores were active, because
  // every core >= nc has already returned above.
  if (core_id == (nc - 1)) {
    int QC_r = Q - (nc * wgs_per_core0);
    wgs_per_core0 += QC_r; // last core executes remaining WGs
  }

  // number of workgroups per warp
  int nW = wgs_per_core0 / NT;              // total warps per core
  int rT = wgs_per_core0 - (nW * NT);       // remaining threads
  int fW = (nW >= NW) ? (nW / NW) : 0;      // full warps iterations
  int rW = (fW != 0) ? (nW - fW * NW) : 0;  // remaining full warps
  if (0 == fW)
    fW = 1;

  // Descriptor shared by both callbacks: this core's first WG, the base number
  // of WGs per thread (N = fW) and the number of warps taking one extra (R = rW).
  wspawn_args_t wspawn_args = { ctx, NULL, NULL, core_id * wgs_per_core, fW, rW };

  // full warps
  if (nW >= 1) {
    int nw = MIN(nW, NW);
    kernel_spawn_callback(core_id, NW, NT, nw, &wspawn_args);
  }

  // leftover threads that do not fill a warp
  if (rT != 0) {
    // NOTE(review): this offset is relative to the core's own share, while the
    // callback adds a global thread id (core_id * NW * NT + ...). The two only
    // line up for core 0 or when wgs_per_core == NW * NT -- confirm intent.
    wspawn_args.offset = wgs_per_core0 - rT;
    kernel_spawn_remaining_callback(core_id, NW, NT, 0, rT, &wspawn_args);
  }
}
|
|
||||||
|
|
||||||
void kernel_run(int X, int Y, int Z, int NC, int NW, int NT) {
|
|
||||||
context_t ctx;
|
|
||||||
|
|
||||||
ctx.num_groups[0] = X;
|
|
||||||
ctx.num_groups[1] = Y;
|
|
||||||
ctx.num_groups[2] = Z;
|
|
||||||
ctx.global_offset[0] = 0;
|
|
||||||
ctx.global_offset[1] = 0;
|
|
||||||
ctx.global_offset[2] = 0;
|
|
||||||
|
|
||||||
for (int cid = 0; cid < NC; ++cid) {
|
|
||||||
kernel_run_once(&ctx, NC, NW, NT, cid);
|
|
||||||
}
|
|
||||||
|
|
||||||
exit (0);
|
|
||||||
}
|
|
||||||
@@ -4,7 +4,6 @@
|
|||||||
#include <vortex.h>
|
#include <vortex.h>
|
||||||
#include <chrono>
|
#include <chrono>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "kernel_scheduler.h"
|
|
||||||
|
|
||||||
#define RT_CHECK(_expr) \
|
#define RT_CHECK(_expr) \
|
||||||
do { \
|
do { \
|
||||||
|
|||||||
Reference in New Issue
Block a user