redesigned driver demo, fixed startup code, removed --cpu from simx,

This commit is contained in:
Blaise Tine
2020-03-29 00:38:17 -04:00
parent 2d5cf89e00
commit c8a6470595
63 changed files with 40963 additions and 364160 deletions

View File

@@ -8,29 +8,27 @@ extern "C" {
#endif
// Spawns Warps
void vx_wspawn (unsigned numWarps, unsigned PC_spawn);
void vx_wspawn(unsigned numWarps, unsigned PC_spawn);
// Changes thread mask (activates/deactivates threads)
void vx_tmc (unsigned numThreads);
void vx_tmc(unsigned numThreads);
// Warp Barrier
void vx_barrier(unsigned barriedID, unsigned numWarps);
// split on a predicate
void vx_split (unsigned predicate);
void vx_split(unsigned predicate);
// Join
void vx_join (void);
void vx_join(void);
// Get Hardware thread ID
unsigned vx_threadID(void);
// Get hardware warp ID
unsigned vx_warpID(void);
// Get global warp number
unsigned vx_warpNum(void);
// Get number of cycles / instructions
@@ -39,16 +37,13 @@ unsigned vx_getInst(void);
void vx_resetStack(void);
// Divergent-branch helper macros built on vx_split() / vx_join().
// Presumably meant to be written as: __if (cond) { ... } __else { ... } __endif
//
// __if(b) expands to TWO statements: "vx_split(b);" followed by "if (b)".
// NOTE: b is evaluated twice, so it must not have side effects, and because
// the expansion is not a single statement, __if must not be used as the
// unbraced body of another if/else/loop.
#define __if(b) vx_split(b); \
if (b)
// Plain alias for the else keyword.
#define __else else
// Closes the region opened by __if: expands to "vx_join();" (the trailing
// semicolon is part of the macro).
#define __endif vx_join();
#ifdef __cplusplus
}
#endif

View File

@@ -1,9 +1,5 @@
.section .text
.type vx_wspawn, @function
.global vx_wspawn
vx_wspawn:
@@ -16,7 +12,6 @@ vx_tmc:
.word 0x0005006b # tmc a0
ret
.type vx_barrier, @function
.global vx_barrier
vx_barrier:
@@ -35,12 +30,12 @@ vx_join:
.word 0x0000306b #join
ret
# unsigned vx_warpID(void)
# Returns the contents of CSR 0x21 (the hardware warp ID) in a0.
# Leaf routine: touches a0 only.
        .global vx_warpID
        .type   vx_warpID, @function
vx_warpID:
        csrr    a0, 0x21                # a0 = warp ID
        ret
.type vx_warpNum, @function
.global vx_warpNum
vx_warpNum:
@@ -59,14 +54,12 @@ vx_getCycles:
csrr a0, 0x26 # read thread IDs
ret
# unsigned vx_getInst(void)
# Returns the contents of CSR 0x25 in a0. Going by the function name this is
# presumably the executed-instruction counter - confirm against the CSR map.
# Leaf routine: touches a0 only.
        .global vx_getInst
        .type   vx_getInst, @function
vx_getInst:
        csrr    a0, 0x25                # a0 = CSR 0x25 (instruction count)
        ret
.type vx_resetStack, @function
.global vx_resetStack
vx_resetStack:
@@ -89,4 +82,4 @@ vx_resetStack:
.word 0x0005006b # tmc 0
RETURN:
ret

View File

@@ -11,7 +11,7 @@ DMP = $(TOOLPATH)/riscv32-unknown-elf-objdump
CPY = $(TOOLPATH)/riscv32-unknown-elf-objcopy
VX_STR = ../../startup/vx_start.s
VX_STR = ../../startup/vx_start.S
VX_INT = ../../intrinsics/vx_intrinsics.s
VX_IO = ../../io/vx_io.s ../../io/vx_io.c
VX_API = ../../vx_api/vx_api.c

View File

@@ -7,12 +7,12 @@ CC_FLAGS = -ffreestanding -O0 -Wl,--gc-sections -nostartfiles -nostdlib -nostart
DMP = ../../../../riscv-gnu-toolchain/drops/bin/riscv32-unknown-elf-objdump
CPY = ../../../../riscv-gnu-toolchain/drops/bin/riscv32-unknown-elf-objcopy
# VX_STR = ../../startup/vx_start.s
# VX_STR = ../../startup/vx_start.S
NEWLIB = ../../newlib/newlib.c
VX_STR = ../../startup/vx_start.s
VX_STR = ../../startup/vx_start.S
VX_INT = ../../intrinsics/vx_intrinsics.s
VX_IO = ../../io/vx_io.s ../../io/vx_io.c
VX_API = ../../vx_api/vx_api.c

View File

@@ -8,7 +8,7 @@ CPY = /opt/riscv/bin/riscv32-unknown-elf-objcopy
NEWLIB = ../../newlib/newlib.c ../../newlib/newlib_notimp.c ../../newlib/newlib.s
VX_STR = ../../startup/vx_start.s
VX_STR = ../../startup/vx_start.S
VX_INT = ../../intrinsics/vx_intrinsics.s
VX_IO = ../../io/vx_io.s ../../io/vx_io.c
VX_API = ../../vx_api/vx_api.c

View File

@@ -7,7 +7,7 @@ CPY = /opt/riscv-new/drops/bin/riscv32-unknown-elf-objcopy
NEWLIB = ../../newlib/newlib.c
VX_STR = ../../startup/vx_start.s
VX_STR = ../../startup/vx_start.S
VX_INT = ../../intrinsics/vx_intrinsics.s
VX_IO = ../../io/vx_io.s ../../io/vx_io.c
VX_API = ../../vx_api/vx_api.c

View File

@@ -7,12 +7,12 @@ CC_FLAGS = -ffreestanding -O0 -Wl,--gc-sections -nostartfiles -nostdlib -nostart
DMP = ../../../../riscv-gnu-toolchain/drops/bin/riscv32-unknown-elf-objdump
CPY = ../../../../riscv-gnu-toolchain/drops/bin/riscv32-unknown-elf-objcopy
# VX_STR = ../../startup/vx_start.s
# VX_STR = ../../startup/vx_start.S
NEWLIB = ../../newlib/newlib.c
VX_STR = ../../startup/vx_start.s
VX_STR = ../../startup/vx_start.S
VX_INT = ../../intrinsics/vx_intrinsics.s
VX_IO = ../../io/vx_io.s ../../io/vx_io.c
VX_API = ../../vx_api/vx_api.c

View File

@@ -9,12 +9,12 @@ DMP = /home/priya/dev/riscv_vec/riscv-gnu/bin/riscv32-unknown-elf-objdump
CPY = /home/priya/dev/riscv_vec/riscv-gnu/bin/riscv32-unknown-elf-objcopy
# VX_STR = ../../startup/vx_start.s
# VX_STR = ../../startup/vx_start.S
NEWLIB = ../../newlib/newlib.c
VX_STR = ../../startup/vx_start.s
VX_STR = ../../startup/vx_start.S
VX_INT = ../../intrinsics/vx_intrinsics.s
VX_IO = ../../io/vx_io.s ../../io/vx_io.c
VX_API = ../../vx_api/vx_api.c

View File

@@ -0,0 +1,63 @@
#include "../config.h"
# Program entry point, placed in the allocatable and executable .init section.
# Sequence: spawn NW warps at vx_set_sp, run vx_set_sp on this warp as well,
# drop back to a single thread, zero the bss, register the fini array with
# atexit, run the init array, then call main and pass its result to exit.
# NW is not defined in this file; presumably it comes from the included
# config header - TODO confirm.
.section .init, "ax"
.global _start
.type _start, @function
_start:
# Spawn the warps: a0 = warp count, a1 = address the spawned warps start at.
la a1, vx_set_sp
li a0, NW # activate all warps
.word 0x00b5106b # wspawn a0(numWarps), a1(PC SPAWN)
# Run the same setup (thread mask, gp, sp) on the current warp.
jal vx_set_sp
# Only one thread executes the rest of the startup code and main.
li a0, 1
.word 0x0005006b # back to single thread
# Initialize global pointer (gp is loaded in vx_set_sp; the call below is commented out)
# call __cxx_global_var_init
# Clear the bss segment: memset(_edata, 0, _end - _edata)
la a0, _edata
la a2, _end
sub a2, a2, a0
li a1, 0
call memset
la a0, __libc_fini_array # Register global termination functions
call atexit # to be called upon exit
call __libc_init_array # Run global initialization functions
# a0 and a1 are not set up as argc and argv; main receives whatever is left in them.
call main
# Tail-call exit: a0 still holds the value returned by main.
tail exit
.size _start, .-_start
# vx_set_sp: per-warp setup. _start passes its address to wspawn and also
# calls it directly. Steps:
#   - request NT active threads (tmc)
#   - load gp with the address of __global_pointer$
#   - give every thread its own stack pointer:
#       sp = 0x6ffff000 - (warp_num << 26) - (tid << 10) + (tid << 2)
#   - if wid (CSR 0x21) is non-zero, issue tmc 0 (presumably halting that
#     warp); when wid is zero, return to the caller.
# Clobbers a0-a3, gp and sp. NT is not defined in this file; presumably it
# comes from the included config header - TODO confirm.
.section .text
.type vx_set_sp, @function
.global vx_set_sp
vx_set_sp:
li a0, NT
.word 0x0005006b # activate all threads
# Load gp. Relaxation is switched off for this pair so the linker cannot
# rewrite it as a gp-relative access, which would depend on gp itself.
.option push
.option norelax
1:auipc gp, %pcrel_hi(__global_pointer$)
addi gp, gp, %pcrel_lo(1b)
.option pop
# Per-thread stack pointer.
csrr a3, 0x22 # get global warp number
slli a3, a3, 0x1a # a3 = warp number << 26 (64 MiB stride per warp)
csrr a2, 0x20 # get tid
slli a1, a2, 10 # multiply tid by 1024
slli a2, a2, 2 # multiply tid by 4
lui sp, 0x6ffff # load base sp (0x6ffff000)
sub sp, sp, a1 # sub sp - (1024*tid)
sub sp, sp, a3 # shift per warp
add sp, sp, a2 # shift sp for better performance
# NOTE(review): the tid*4 term leaves sp only 4-byte aligned when tid is not
# a multiple of 4 - confirm this is acceptable for the ABI in use.
csrr a3, 0x21 # get wid
beqz a3, RETURN
li a0, 0
.word 0x0005006b # tmc 0
RETURN:
ret
# Weak definition of __dso_handle: a single zero-initialized .long in .data.
# Presumably supplied so that code referencing __dso_handle (e.g. C++ runtime
# support) links without a hosted crt - TODO confirm.
.section .data
.global __dso_handle
.weak __dso_handle
__dso_handle:
.long 0

View File

@@ -1,87 +0,0 @@
# .section .init, "ax"
# .global _start
# _start:
# .cfi_startproc
# .cfi_undefined ra
# .option push
# .option norelax
# la gp, __global_pointer$
# .option pop
# la sp, __stack_top
# add s0, sp, zero
# jal zero, main
# .cfi_endproc
# .end
# Program entry point (earlier revision), placed in the .init section.
# Spawns 4 warps at vx_set_sp, runs vx_set_sp on the current warp, issues
# tmc with a0 = 4, then calls main and passes its result to exit.
# The bss clearing and the libc init/fini array calls are commented out, so
# none of that runs before main in this version.
.section .init, "ax"
.global _start
.type _start, @function
_start:
# Disabled experiment: per-thread load from 0x80000000 + tid*4.
# li a0, 4
# .word 0x0005006b # tmc 4
# csrr a2, 0x20 # get tid
# slli a2, a2, 2
# la a3, 0x80000000
# add a3, a3, a2
# lw a4, 0(a3)
# Spawn the warps: a0 = warp count (hard-coded 4), a1 = start address.
la a1, vx_set_sp
li a0, 4
.word 0x00b5106b # wspawn a0(numWarps), a1(PC SPAWN)
# Run the same setup (thread mask, gp, sp) on the current warp.
jal vx_set_sp
##########################################
# Disabled single-thread C runtime initialization:
# li a0, 1
# .word 0x0005006b # tmc 1
# # Initialize global pointer
# call __cxx_global_var_init
# # Clear the bss segment
# la a0, _edata
# la a2, _end
# sub a2, a2, a0
# li a1, 0
# call memset
# la a0, __libc_fini_array # Register global termination functions
# call atexit # to be called upon exit
# call __libc_init_array # Run global initialization functions
# main is entered after a tmc with a0 = 4 rather than a single thread.
li a0, 4
.word 0x0005006b # tmc 4
##############################################
# a0 is 4 and a1 is unspecified at this point; argc/argv are not set up.
call main
# Tail-call exit: a0 still holds the value returned by main.
tail exit
.size _start, .-_start
# vx_set_sp (earlier revision): per-warp setup. _start passes its address to
# wspawn and also calls it directly. Steps:
#   - issue tmc with a0 = 4 (hard-coded thread count)
#   - load gp with the address of __global_pointer$
#   - give every thread its own stack pointer:
#       sp = 0x6ffff000 - (CSR 0x22 << 26) - (tid << 10) + (tid << 2)
#   - if wid (CSR 0x21) is non-zero, issue tmc 0 (presumably halting that
#     warp); when wid is zero, return to the caller.
# Clobbers a0-a3, gp and sp.
.section .text
.type vx_set_sp, @function
.global vx_set_sp
vx_set_sp:
li a0, 4
.word 0x0005006b # tmc 4
# Load gp. Relaxation is switched off for this pair so the linker cannot
# rewrite it as a gp-relative access, which would depend on gp itself.
.option push
.option norelax
1:auipc gp, %pcrel_hi(__global_pointer$)
addi gp, gp, %pcrel_lo(1b)
.option pop
# Per-thread stack pointer.
csrr a3, 0x22 # CSR 0x22: presumably the global warp number (wid is read from 0x21 below) - TODO confirm
slli a3, a3, 0x1a # a3 = CSR 0x22 value << 26 (64 MiB stride)
csrr a2, 0x20 # get tid
slli a1, a2, 10 # multiply tid by 1024
slli a2, a2, 2 # multiply tid by 4
lui sp, 0x6ffff # load base sp (0x6ffff000)
sub sp, sp, a1 # sub sp - (1024*tid)
sub sp, sp, a3 # shift per warp
add sp, sp, a2 # shift sp for better performance
# NOTE(review): the tid*4 term leaves sp only 4-byte aligned when tid is not
# a multiple of 4 - confirm this is acceptable for the ABI in use.
csrr a3, 0x21 # get wid
beqz a3, RETURN
li a0, 0
.word 0x0005006b # tmc 0
RETURN:
ret
# Weak definition of __dso_handle: a single zero-initialized .long in .data.
# Presumably supplied so that code referencing __dso_handle (e.g. C++ runtime
# support) links without a hosted crt - TODO confirm.
.section .data
.global __dso_handle
.weak __dso_handle
__dso_handle:
.long 0

View File

@@ -1,4 +1,4 @@
#include "../config.h"
#include "../intrinsics/vx_intrinsics.h"
#include "vx_api.h"
#include <inttypes.h>
@@ -11,24 +11,27 @@ func_t global_function_pointer;
void * global_argument_struct;
unsigned global_num_threads;
void setup_call() {
void spawn_warp_runonce() {
// activate all threads
vx_tmc(global_num_threads);
// call user routine
global_function_pointer(global_argument_struct);
// resume single-thread execution on exit
unsigned wid = vx_warpID();
if (wid != 0) {
vx_tmc(0); // Halt Warp Execution
} else {
vx_tmc(1); // Only activate one thread
}
unsigned tmask = (0 == wid) ? 0x1 : 0x0;
vx_tmc(tmask);
}
void vx_spawnWarps(unsigned numWarps, unsigned numThreads, func_t func_ptr, void * args) {
global_function_pointer = func_ptr;
global_argument_struct = args;
global_num_threads = numThreads;
vx_wspawn(numWarps, (unsigned) setup_call);
setup_call();
if (numWarps > 1) {
vx_wspawn(numWarps, (unsigned)spawn_warp_runonce);
}
spawn_warp_runonce();
}
unsigned pocl_threads;
@@ -36,20 +39,20 @@ struct context_t * pocl_ctx;
vx_pocl_workgroup_func pocl_pfn;
const void * pocl_args;
void pocl_spawn_runonce() {
void pocl_spawn_warp_runonce() {
// activate all threads
vx_tmc(pocl_threads);
int x = vx_threadID();
int y = vx_warpID();
int y = vx_warpNum();
// call kernel routine
(pocl_pfn)(pocl_args, pocl_ctx, x, y, 0);
if (y != 0) {
vx_tmc(0);
}
vx_tmc(1);
// resume single-thread execution on exit
int wid = vx_warpID();
unsigned tmask = (0 == wid) ? 0x1 : 0x0;
vx_tmc(tmask);
}
void pocl_spawn(struct context_t * ctx, vx_pocl_workgroup_func pfn, const void * args) {
@@ -64,10 +67,10 @@ void pocl_spawn(struct context_t * ctx, vx_pocl_workgroup_func pfn, const void *
pocl_args = args;
if (ctx->num_groups[1] > 1) {
vx_wspawn(ctx->num_groups[1], (unsigned)&pocl_spawn_runonce);
vx_wspawn(ctx->num_groups[1], (unsigned)&pocl_spawn_warp_runonce);
}
pocl_spawn_runonce();
pocl_spawn_warp_runonce();
}
#ifdef __cplusplus