Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes

minor update

minor update

minor update

minor update

minor update

minor update

cleanup

cleanup

cache bindings and memory perf refactoring

minor update

minor update

hw unit tests fixes

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor updates

minor updates

minor update

minor update
This commit is contained in:
Blaise Tine
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit c1e168fdbe
1309 changed files with 247412 additions and 311463 deletions

View File

@@ -0,0 +1,52 @@
# Build script for the "conform" test program (see PROJECT below).
# XLEN selects the RISC-V register width. It defaults to 32; setting it to 64
# switches the default toolchain path and the -march/-mabi flags.
XLEN ?= 32
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
CFLAGS += -march=rv64imafd -mabi=lp64d
else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
CFLAGS += -march=rv32imaf -mabi=ilp32f
endif
# Tool name prefix derived from XLEN (riscv32-unknown-elf or riscv64-unknown-elf).
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
# Kernel runtime directory; it supplies include/, linker/ and libvortexrt.a below.
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
# Cross tools: compiler, archiver, objdump and objcopy.
# NOTE(review): AR is not used by any rule visible in this file.
CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
# Directory holding the simulators used by the run-* targets.
SIM_DIR = ../../../sim
CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
# Static link with section garbage collection, using the XLEN-specific linker
# script, with STARTUP_ADDR defined as 0x80000000, against libvortexrt.a.
LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a
PROJECT = conform
SRCS = main.cpp tests.cpp
# Default target: ELF image, raw binary and disassembly listing.
all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump
# Full disassembly of the ELF image.
$(PROJECT).dump: $(PROJECT).elf
$(DP) -D $(PROJECT).elf > $(PROJECT).dump
# Raw binary image extracted from the ELF.
$(PROJECT).bin: $(PROJECT).elf
$(CP) -O binary $(PROJECT).elf $(PROJECT).bin
# Compiles and links all sources in a single compiler invocation.
$(PROJECT).elf: $(SRCS)
$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf
# Runs the binary on the rtlsim simulator.
run-rtlsim: $(PROJECT).bin
$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin
# Runs the binary on the simx simulator.
run-simx: $(PROJECT).bin
$(SIM_DIR)/simx/simx $(PROJECT).bin
# Generates header dependencies with -MM.
# NOTE(review): no `include .depend` is visible in this file, so the generated
# file appears to be unused - confirm.
.depend: $(SRCS)
$(CC) $(CFLAGS) -MM $^ > .depend;
# Removes all build outputs and the dependency file.
clean:
rm -rf *.elf *.bin *.dump .depend

View File

@@ -0,0 +1,36 @@
#include "tests.h"
#include <vx_print.h>
// Entry point of the conformance program: runs every test in a fixed order,
// prints the overall verdict and returns the total number of errors reported
// by the tests (0 means success).
int main() {
  int failures = 0;

  // Each test returns its own error count; the order below is the execution order.
  failures += test_global_memory();
  failures += test_shared_memory();
  failures += test_tmc();
  failures += test_pred();
  failures += test_divergence();
  failures += test_wsapwn();
  failures += test_spawn_tasks();
  failures += test_serial();
  failures += test_tmask();
  failures += test_barrier();
  failures += test_tls();

  if (failures != 0) {
    PRINTF("Failed!\n");
  } else {
    PRINTF("Passed!\n");
  }
  return failures;
}

View File

@@ -0,0 +1,339 @@
#include "tests.h"
#include <stdio.h>
#include <algorithm>
#include <vx_intrinsics.h>
#include <vx_print.h>
#include <vx_spawn.h>
int __attribute__((noinline)) check_error(const int* buffer, int offset, int size) {
int errors = 0;
for (int i = offset; i < size; i++) {
int value = buffer[i];
int ref_value = 65 + i;
if (value == ref_value) {
//PRINTF("[%d] %c\n", i, value);
} else {
PRINTF("*** error: [%d] 0x%x, expected 0x%x\n", i, value, ref_value);
++errors;
}
}
return errors;
}
// Builds a mask in which only bit `tid` is set.
int __attribute__((noinline)) make_select_tmask(int tid) {
  const int single_bit = 1 << tid;
  return single_bit;
}
// Builds a mask with the low `num_threads` bits set.
//
// The shift is done in unsigned arithmetic and the count is clamped, so the
// result is well defined for every input:
//   - num_threads <= 0            -> 0 (no bits set)
//   - num_threads >= bits in int  -> -1 (all bits set)
// A plain `(1 << num_threads) - 1` is undefined once the count reaches the
// width of int, and overflows one bit below that.
int __attribute__((noinline)) make_full_tmask(int num_threads) {
  const int mask_bits = static_cast<int>(sizeof(unsigned) * 8);
  if (num_threads <= 0) {
    return 0;
  }
  if (num_threads >= mask_bits) {
    return -1;
  }
  return static_cast<int>((1u << num_threads) - 1u);
}
///////////////////////////////////////////////////////////////////////////////
#define GLOBAL_MEM_SZ 8
int global_buffer[GLOBAL_MEM_SZ];
// Fills global_buffer with the pattern 65 + index and checks it back with
// check_error(). Returns the number of mismatching entries.
int test_global_memory() {
  PRINTF("Global Memory Test\n");
  int idx = 0;
  while (idx < GLOBAL_MEM_SZ) {
    global_buffer[idx] = 65 + idx;
    ++idx;
  }
  return check_error(global_buffer, 0, GLOBAL_MEM_SZ);
}
///////////////////////////////////////////////////////////////////////////////
int* smem_addr = (int*)SMEM_BASE_ADDR;
int smem_buffer[8];
// Stores 65 + thread id into the calling thread's slot of the region that
// starts at SMEM_BASE_ADDR.
void __attribute__((noinline)) do_smem_wr() {
  const unsigned lane = vx_thread_id();
  int* const slot = smem_addr + lane;
  *slot = 65 + lane;
}
// Copies the calling thread's slot of the SMEM_BASE_ADDR region into the
// same index of smem_buffer.
void __attribute__((noinline)) do_smem_rd() {
  const unsigned lane = vx_thread_id();
  const int value = smem_addr[lane];
  smem_buffer[lane] = value;
}
// Round trip through the SMEM_BASE_ADDR region: with up to 8 threads enabled,
// each thread writes its pattern value (do_smem_wr) and then copies it into
// smem_buffer (do_smem_rd); the result is verified with check_error().
// Returns the number of mismatching entries.
int test_shared_memory() {
  PRINTF("Shared Memory Test\n");
  // Clamp to 8, the capacity of smem_buffer.
  int num_threads = std::min(vx_num_threads(), 8);
  int tmask = make_full_tmask(num_threads);
  // Apply the full mask for the helpers, then call vx_tmc_one() before
  // checking. NOTE(review): presumably this restores single-thread
  // execution - confirm against the vx_tmc_one() definition.
  vx_tmc(tmask);
  do_smem_wr();
  do_smem_rd();
  vx_tmc_one();
  return check_error(smem_buffer, 0, num_threads);
}
///////////////////////////////////////////////////////////////////////////////
int tmc_buffer[8];
// Records 65 + thread id in the calling thread's slot of tmc_buffer.
void __attribute__((noinline)) do_tmc() {
  const unsigned lane = vx_thread_id();
  const unsigned value = 65 + lane;
  tmc_buffer[lane] = value;
}
int test_tmc() {
PRINTF("TMC Test\n");
int num_threads = std::min(vx_num_threads(), 8);
int tmask = make_full_tmask(num_threads);
vx_tmc(tmask);
do_tmc();
vx_tmc_one();
return check_error(tmc_buffer, 0, num_threads);
}
///////////////////////////////////////////////////////////////////////////////
int pred_buffer[8];
// Calls vx_pred() with a condition that holds only for thread 0, then stores
// 65 at the calling thread's index.
// NOTE(review): the test expects only slot 0 to receive 65, so vx_pred()
// presumably disables the threads whose condition is false - confirm.
void __attribute__((noinline)) do_pred() {
  const unsigned lane = vx_thread_id();
  const bool is_first_lane = (lane == 0);
  vx_pred(is_first_lane, 1);
  pred_buffer[lane] = 65;
}
int test_pred() {
PRINTF("PRED Test\n");
int num_threads = std::min(vx_num_threads(), 8);
int tmask = make_full_tmask(num_threads);
for (int i = 1; i < num_threads; i++) {
pred_buffer[i] = 65 + i;
}
vx_tmc(tmask);
do_pred();
vx_tmc_one();
return check_error(pred_buffer, 0, num_threads);
}
///////////////////////////////////////////////////////////////////////////////
int wspawn_buffer[8];
// Warp entry point: records 65 + warp id in wspawn_buffer, then calls
// vx_tmc() with 1 on warp 0 and 0 on every other warp.
// NOTE(review): presumably a zero mask stops the spawned warps - confirm.
void wspawn_kernel() {
  const unsigned warp = vx_warp_id();
  wspawn_buffer[warp] = 65 + warp;
  const bool is_warp0 = (warp == 0);
  vx_tmc(is_warp0);
}
int test_wsapwn() {
PRINTF("Wspawn Test\n");
int num_warps = std::min(vx_num_warps(), 8);
vx_wspawn(num_warps, wspawn_kernel);
wspawn_kernel();
return check_error(wspawn_buffer, 0, num_warps);
}
///////////////////////////////////////////////////////////////////////////////
int dvg_buffer[4];
// Exercises nested vx_split()/vx_join() pairs. Each branch stores a distinct
// value so that, for thread ids 0..3, dvg_buffer ends up as 65, 66, 67, 68
// (A, B, C, D), which is the 65 + index pattern check_error() expects.
// Every vx_split() result is passed to the matching vx_join() after the
// branch it guards. The function is pinned to O1 and noinline.
// NOTE(review): presumably this keeps the compiler from restructuring the
// branches around the split/join calls - confirm.
void __attribute__((noinline)) __attribute__((optimize("O1"))) do_divergence() {
int tid = vx_thread_id();
// Outer split: threads 0-1 take the first branch, the rest take the else branch.
int cond1 = tid < 2;
int sp1 = vx_split(cond1);
if (cond1) {
{
// Inner split: thread 0 writes A, thread 1 writes B.
int cond2 = tid < 1;
int sp2 = vx_split(cond2);
if (cond2) {
dvg_buffer[tid] = 65; // A
} else {
dvg_buffer[tid] = 66; // B
}
vx_join(sp2);
}
{
// Split on a condition that is false for every non-negative thread id, so
// the store below must not overwrite the A/B values written above.
int cond3 = tid < 0;
int sp3 = vx_split(cond3);
if (cond3) {
dvg_buffer[tid] = 67; // C
}
vx_join(sp3);
}
} else {
{
// Inner split: thread 2 writes C, any higher thread writes D.
int cond2 = tid < 3;
int sp2 = vx_split(cond2);
if (cond2) {
dvg_buffer[tid] = 67; // C
} else {
dvg_buffer[tid] = 68; // D
}
vx_join(sp2);
}
}
// Close the outer split.
vx_join(sp1);
}
int test_divergence() {
PRINTF("Control Divergence Test\n");
int num_threads = std::min(vx_num_threads(), 4);
int tmask = make_full_tmask(num_threads);
vx_tmc(tmask);
do_divergence();
vx_tmc_one();
return check_error(dvg_buffer, 0, num_threads);
}
///////////////////////////////////////////////////////////////////////////////
#define ST_BUF_SZ 8
// Argument block handed to st_kernel: source and destination arrays.
typedef struct {
  int * src;
  int * dst;
} st_args_t;
int st_buffer_src[ST_BUF_SZ];
int st_buffer_dst[ST_BUF_SZ];
// Task callback: copies element `task_id` from arg->src to arg->dst.
void st_kernel(int task_id, const st_args_t * __UNIFORM__ arg) {
  const int value = arg->src[task_id];
  arg->dst[task_id] = value;
}
// Task-spawn test: fills the source buffer with the 65 + index pattern,
// passes st_kernel to vx_spawn_tasks() for ST_BUF_SZ tasks and checks that
// the destination buffer received the same pattern.
// Returns the number of mismatching entries.
int test_spawn_tasks() {
  PRINTF("SpawnTasks Test\n");
  st_args_t arg = { st_buffer_src, st_buffer_dst };
  for (int idx = 0; idx < ST_BUF_SZ; ++idx) {
    st_buffer_src[idx] = 65 + idx;
  }
  vx_spawn_tasks(ST_BUF_SZ, (vx_spawn_tasks_cb)st_kernel, &arg);
  return check_error(st_buffer_dst, 0, ST_BUF_SZ);
}
///////////////////////////////////////////////////////////////////////////////
// Argument block handed to sr_kernel: the array to fill.
typedef struct {
  int * buf;
} sr_args_t;
int sr_buffer[8];
// Callback for vx_serial(): stores 65 + thread id at the calling thread's
// index of arg->buf.
void sr_kernel(const sr_args_t * arg) {
  const int lane = vx_thread_id();
  int* const out = arg->buf;
  out[lane] = 65 + lane;
}
// Packs sr_buffer into an argument block and passes sr_kernel to vx_serial().
void __attribute__((noinline)) do_serial() {
  sr_args_t arg = { sr_buffer };
  vx_serial((vx_serial_cb)sr_kernel, &arg);
}
int test_serial() {
PRINTF("Serial Test\n");
int num_threads = std::min(vx_num_threads(), 8);
int tmask = make_full_tmask(num_threads);
vx_tmc(tmask);
do_serial();
vx_tmc_one();
return check_error(sr_buffer, 0, num_threads);
}
///////////////////////////////////////////////////////////////////////////////
int tmask_buffer[8];
// Compares the mask reported by vx_thread_mask() with the single-bit mask of
// the calling thread. Stores 65 + thread id on a match and 0 otherwise, and
// returns the calling thread's id plus one.
int __attribute__((noinline)) do_tmask() {
  const int lane = vx_thread_id();
  const int expected_mask = make_select_tmask(lane);
  const int active_mask = vx_thread_mask();
  int result = 0;
  if (active_mask == expected_mask) {
    result = 65 + lane;
  }
  tmask_buffer[lane] = result;
  return lane + 1;
}
// Thread-mask readback test: visits thread indices one at a time, applies a
// single-bit mask for each and lets do_tmask() record whether
// vx_thread_mask() reports that same mask. Returns the number of mismatching
// entries in tmask_buffer.
int test_tmask() {
PRINTF("Thread Mask Test\n");
// activate all threads so that the locals below are populated in every thread
vx_tmc(-1);
int num_threads = std::min(vx_num_threads(), 8);
int tid = 0;
// NOTE(review): the goto loop, and taking the next index from do_tmask()
// (which returns vx_thread_id() + 1) instead of incrementing a local, look
// deliberate. Locals are presumably per-thread once the mask changes -
// confirm before restructuring.
l_start:
int tmask = make_select_tmask(tid);
vx_tmc(tmask);
tid = do_tmask();
if (tid < num_threads)
goto l_start;
vx_tmc_one();
return check_error(tmask_buffer, 0, num_threads);
}
///////////////////////////////////////////////////////////////////////////////
int barrier_buffer[8];
volatile int barrier_ctr;
volatile int barrier_stall;
// Warp entry point for the barrier test: spins for (warp id * 256) + 1
// increments of barrier_stall, so the amount of work grows with the warp id.
// It then records 65 + warp id, calls vx_barrier(0, barrier_ctr) and finally
// calls vx_tmc() with 1 on warp 0 and 0 on every other warp.
void barrier_kernel() {
  const unsigned warp = vx_warp_id();
  const unsigned spin_limit = warp * 256;
  for (unsigned spin = 0; spin <= spin_limit; ++spin) {
    ++barrier_stall;
  }
  barrier_buffer[warp] = 65 + warp;
  vx_barrier(0, barrier_ctr);
  const bool is_warp0 = (warp == 0);
  vx_tmc(is_warp0);
}
int test_barrier() {
PRINTF("Barrier Test\n");
int num_warps = std::min(vx_num_warps(), 8);
barrier_ctr = num_warps;
barrier_stall = 0;
vx_wspawn(num_warps, barrier_kernel);
barrier_kernel();
return check_error(barrier_buffer, 0, num_warps);
}
///////////////////////////////////////////////////////////////////////////////
int tls_buffer[8];
__thread int tls_var;
// Stores 65 + the current value of the thread-local tls_var at the calling
// warp's index of tls_buffer.
__attribute__((noinline)) void print_tls_var() {
  const unsigned warp = vx_warp_id();
  const int tagged = 65 + tls_var;
  tls_buffer[warp] = tagged;
}
// Warp entry point for the TLS test: writes the warp id into tls_var, has
// print_tls_var() read it back into tls_buffer, then calls vx_tmc() with 1 on
// warp 0 and 0 on every other warp.
void tls_kernel() {
  const unsigned warp = vx_warp_id();
  tls_var = warp;
  print_tls_var();
  const bool is_warp0 = (warp == 0);
  vx_tmc(is_warp0);
}
int test_tls() {
PRINTF("TLS Test\n");
int num_warps = std::min(vx_num_warps(), 8);
vx_wspawn(num_warps, tls_kernel);
tls_kernel();
return check_error(tls_buffer, 0, num_warps);
}

View File

@@ -0,0 +1,28 @@
// Declarations of the conformance tests. Each test returns the number of
// errors it detected (0 on success).
#ifndef TESTS
#define TESTS
// All test output goes through vx_printf.
#define PRINTF vx_printf
int test_global_memory();
int test_shared_memory();
int test_tmc();
int test_pred();
int test_divergence();
// NOTE(review): "wsapwn" looks like a typo for "wspawn"; renaming it means
// updating the definition and the caller as well.
int test_wsapwn();
int test_spawn_tasks();
int test_serial();
int test_tmask();
int test_barrier();
int test_tls();
#endif