redesigned driver demo, fixed startup code, removed --cpu from simx,
This commit is contained in:
@@ -8,29 +8,27 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
// Spawns Warps
|
||||
void vx_wspawn (unsigned numWarps, unsigned PC_spawn);
|
||||
void vx_wspawn(unsigned numWarps, unsigned PC_spawn);
|
||||
|
||||
// Changes thread mask (activated/deactivates threads)
|
||||
void vx_tmc (unsigned numThreads);
|
||||
void vx_tmc(unsigned numThreads);
|
||||
|
||||
// Warp Barrier
|
||||
void vx_barrier(unsigned barriedID, unsigned numWarps);
|
||||
|
||||
// split on a predicate
|
||||
void vx_split (unsigned predicate);
|
||||
|
||||
void vx_split(unsigned predicate);
|
||||
|
||||
// Join
|
||||
void vx_join (void);
|
||||
|
||||
void vx_join(void);
|
||||
|
||||
// Get Hardware thread ID
|
||||
unsigned vx_threadID(void);
|
||||
|
||||
|
||||
// Get hardware warp ID
|
||||
unsigned vx_warpID(void);
|
||||
|
||||
// Get global warp number
|
||||
unsigned vx_warpNum(void);
|
||||
|
||||
// Get Number cycles/Inst
|
||||
@@ -39,16 +37,13 @@ unsigned vx_getInst(void);
|
||||
|
||||
void vx_resetStack(void);
|
||||
|
||||
|
||||
#define __if(b) vx_split(b); \
|
||||
if (b)
|
||||
|
||||
#define __else else
|
||||
|
||||
|
||||
#define __endif vx_join();
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -1,9 +1,5 @@
|
||||
|
||||
|
||||
|
||||
.section .text
|
||||
|
||||
|
||||
.type vx_wspawn, @function
|
||||
.global vx_wspawn
|
||||
vx_wspawn:
|
||||
@@ -16,7 +12,6 @@ vx_tmc:
|
||||
.word 0x0005006b # tmc a0
|
||||
ret
|
||||
|
||||
|
||||
.type vx_barrier, @function
|
||||
.global vx_barrier
|
||||
vx_barrier:
|
||||
@@ -35,12 +30,12 @@ vx_join:
|
||||
.word 0x0000306b #join
|
||||
ret
|
||||
|
||||
|
||||
.type vx_warpID, @function
|
||||
.global vx_warpID
|
||||
vx_warpID:
|
||||
csrr a0, 0x21 # read warp IDs
|
||||
ret
|
||||
|
||||
.type vx_warpNum, @function
|
||||
.global vx_warpNum
|
||||
vx_warpNum:
|
||||
@@ -59,14 +54,12 @@ vx_getCycles:
|
||||
csrr a0, 0x26 # read thread IDs
|
||||
ret
|
||||
|
||||
|
||||
.type vx_getInst, @function
|
||||
.global vx_getInst
|
||||
vx_getInst:
|
||||
csrr a0, 0x25 # read thread IDs
|
||||
ret
|
||||
|
||||
|
||||
.type vx_resetStack, @function
|
||||
.global vx_resetStack
|
||||
vx_resetStack:
|
||||
@@ -89,4 +82,4 @@ vx_resetStack:
|
||||
.word 0x0005006b # tmc 0
|
||||
RETURN:
|
||||
ret
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ DMP = $(TOOLPATH)/riscv32-unknown-elf-objdump
|
||||
CPY = $(TOOLPATH)/riscv32-unknown-elf-objcopy
|
||||
|
||||
|
||||
VX_STR = ../../startup/vx_start.s
|
||||
VX_STR = ../../startup/vx_start.S
|
||||
VX_INT = ../../intrinsics/vx_intrinsics.s
|
||||
VX_IO = ../../io/vx_io.s ../../io/vx_io.c
|
||||
VX_API = ../../vx_api/vx_api.c
|
||||
|
||||
@@ -7,12 +7,12 @@ CC_FLAGS = -ffreestanding -O0 -Wl,--gc-sections -nostartfiles -nostdlib -nostart
|
||||
DMP = ../../../../riscv-gnu-toolchain/drops/bin/riscv32-unknown-elf-objdump
|
||||
CPY = ../../../../riscv-gnu-toolchain/drops/bin/riscv32-unknown-elf-objcopy
|
||||
|
||||
# VX_STR = ../../startup/vx_start.s
|
||||
# VX_STR = ../../startup/vx_start.S
|
||||
|
||||
|
||||
|
||||
NEWLIB = ../../newlib/newlib.c
|
||||
VX_STR = ../../startup/vx_start.s
|
||||
VX_STR = ../../startup/vx_start.S
|
||||
VX_INT = ../../intrinsics/vx_intrinsics.s
|
||||
VX_IO = ../../io/vx_io.s ../../io/vx_io.c
|
||||
VX_API = ../../vx_api/vx_api.c
|
||||
|
||||
@@ -8,7 +8,7 @@ CPY = /opt/riscv/bin/riscv32-unknown-elf-objcopy
|
||||
|
||||
|
||||
NEWLIB = ../../newlib/newlib.c ../../newlib/newlib_notimp.c ../../newlib/newlib.s
|
||||
VX_STR = ../../startup/vx_start.s
|
||||
VX_STR = ../../startup/vx_start.S
|
||||
VX_INT = ../../intrinsics/vx_intrinsics.s
|
||||
VX_IO = ../../io/vx_io.s ../../io/vx_io.c
|
||||
VX_API = ../../vx_api/vx_api.c
|
||||
|
||||
@@ -7,7 +7,7 @@ CPY = /opt/riscv-new/drops/bin/riscv32-unknown-elf-objcopy
|
||||
|
||||
|
||||
NEWLIB = ../../newlib/newlib.c
|
||||
VX_STR = ../../startup/vx_start.s
|
||||
VX_STR = ../../startup/vx_start.S
|
||||
VX_INT = ../../intrinsics/vx_intrinsics.s
|
||||
VX_IO = ../../io/vx_io.s ../../io/vx_io.c
|
||||
VX_API = ../../vx_api/vx_api.c
|
||||
|
||||
@@ -7,12 +7,12 @@ CC_FLAGS = -ffreestanding -O0 -Wl,--gc-sections -nostartfiles -nostdlib -nostart
|
||||
DMP = ../../../../riscv-gnu-toolchain/drops/bin/riscv32-unknown-elf-objdump
|
||||
CPY = ../../../../riscv-gnu-toolchain/drops/bin/riscv32-unknown-elf-objcopy
|
||||
|
||||
# VX_STR = ../../startup/vx_start.s
|
||||
# VX_STR = ../../startup/vx_start.S
|
||||
|
||||
|
||||
|
||||
NEWLIB = ../../newlib/newlib.c
|
||||
VX_STR = ../../startup/vx_start.s
|
||||
VX_STR = ../../startup/vx_start.S
|
||||
VX_INT = ../../intrinsics/vx_intrinsics.s
|
||||
VX_IO = ../../io/vx_io.s ../../io/vx_io.c
|
||||
VX_API = ../../vx_api/vx_api.c
|
||||
|
||||
@@ -9,12 +9,12 @@ DMP = /home/priya/dev/riscv_vec/riscv-gnu/bin/riscv32-unknown-elf-objdump
|
||||
CPY = /home/priya/dev/riscv_vec/riscv-gnu/bin/riscv32-unknown-elf-objcopy
|
||||
|
||||
|
||||
# VX_STR = ../../startup/vx_start.s
|
||||
# VX_STR = ../../startup/vx_start.S
|
||||
|
||||
|
||||
|
||||
NEWLIB = ../../newlib/newlib.c
|
||||
VX_STR = ../../startup/vx_start.s
|
||||
VX_STR = ../../startup/vx_start.S
|
||||
VX_INT = ../../intrinsics/vx_intrinsics.s
|
||||
VX_IO = ../../io/vx_io.s ../../io/vx_io.c
|
||||
VX_API = ../../vx_api/vx_api.c
|
||||
|
||||
63
runtime/startup/vx_start.S
Normal file
63
runtime/startup/vx_start.S
Normal file
@@ -0,0 +1,63 @@
|
||||
#include "../config.h"
|
||||
|
||||
.section .init, "ax"
|
||||
.global _start
|
||||
.type _start, @function
|
||||
_start:
|
||||
la a1, vx_set_sp
|
||||
li a0, NW # activate all warps
|
||||
.word 0x00b5106b # wspawn a0(numWarps), a1(PC SPAWN)
|
||||
jal vx_set_sp
|
||||
li a0, 1
|
||||
.word 0x0005006b # back to single thread
|
||||
# Initialize global pointerp
|
||||
# call __cxx_global_var_init
|
||||
# Clear the bss segment
|
||||
la a0, _edata
|
||||
la a2, _end
|
||||
sub a2, a2, a0
|
||||
li a1, 0
|
||||
call memset
|
||||
la a0, __libc_fini_array # Register global termination functions
|
||||
call atexit # to be called upon exit
|
||||
call __libc_init_array # Run global initialization functions
|
||||
call main
|
||||
tail exit
|
||||
.size _start, .-_start
|
||||
|
||||
.section .text
|
||||
.type vx_set_sp, @function
|
||||
.global vx_set_sp
|
||||
vx_set_sp:
|
||||
li a0, NT
|
||||
.word 0x0005006b # activate all threads
|
||||
|
||||
.option push
|
||||
.option norelax
|
||||
1:auipc gp, %pcrel_hi(__global_pointer$)
|
||||
addi gp, gp, %pcrel_lo(1b)
|
||||
.option pop
|
||||
|
||||
csrr a3, 0x22 # get global warp number
|
||||
slli a3, a3, 0x1a # shift by wid
|
||||
csrr a2, 0x20 # get tid
|
||||
slli a1, a2, 10 # multiply tid by 1024
|
||||
slli a2, a2, 2 # multiply tid by 4
|
||||
lui sp, 0x6ffff # load base sp
|
||||
sub sp, sp, a1 # sub sp - (1024*tid)
|
||||
sub sp, sp, a3 # shoft per warp
|
||||
add sp, sp, a2 # shift sp for better performance
|
||||
|
||||
csrr a3, 0x21 # get wid
|
||||
beqz a3, RETURN
|
||||
li a0, 0
|
||||
.word 0x0005006b # tmc 0
|
||||
RETURN:
|
||||
ret
|
||||
|
||||
.section .data
|
||||
.global __dso_handle
|
||||
.weak __dso_handle
|
||||
__dso_handle:
|
||||
.long 0
|
||||
|
||||
@@ -1,87 +0,0 @@
|
||||
# .section .init, "ax"
|
||||
# .global _start
|
||||
# _start:
|
||||
# .cfi_startproc
|
||||
# .cfi_undefined ra
|
||||
# .option push
|
||||
# .option norelax
|
||||
# la gp, __global_pointer$
|
||||
# .option pop
|
||||
# la sp, __stack_top
|
||||
# add s0, sp, zero
|
||||
# jal zero, main
|
||||
# .cfi_endproc
|
||||
# .end
|
||||
|
||||
.section .init, "ax"
|
||||
.global _start
|
||||
.type _start, @function
|
||||
_start:
|
||||
# li a0, 4
|
||||
# .word 0x0005006b # tmc 4
|
||||
# csrr a2, 0x20 # get tid
|
||||
# slli a2, a2, 2
|
||||
# la a3, 0x80000000
|
||||
# add a3, a3, a2
|
||||
# lw a4, 0(a3)
|
||||
la a1, vx_set_sp
|
||||
li a0, 4
|
||||
.word 0x00b5106b # wspawn a0(numWarps), a1(PC SPAWN)
|
||||
jal vx_set_sp
|
||||
##########################################
|
||||
# li a0, 1
|
||||
# .word 0x0005006b # tmc 1
|
||||
# # Initialize global pointerp
|
||||
# call __cxx_global_var_init
|
||||
# # Clear the bss segment
|
||||
# la a0, _edata
|
||||
# la a2, _end
|
||||
# sub a2, a2, a0
|
||||
# li a1, 0
|
||||
# call memset
|
||||
# la a0, __libc_fini_array # Register global termination functions
|
||||
# call atexit # to be called upon exit
|
||||
# call __libc_init_array # Run global initialization functions
|
||||
li a0, 4
|
||||
.word 0x0005006b # tmc 4
|
||||
##############################################
|
||||
call main
|
||||
tail exit
|
||||
.size _start, .-_start
|
||||
|
||||
.section .text
|
||||
.type vx_set_sp, @function
|
||||
.global vx_set_sp
|
||||
vx_set_sp:
|
||||
li a0, 4
|
||||
.word 0x0005006b # tmc 4
|
||||
|
||||
.option push
|
||||
.option norelax
|
||||
1:auipc gp, %pcrel_hi(__global_pointer$)
|
||||
addi gp, gp, %pcrel_lo(1b)
|
||||
.option pop
|
||||
|
||||
csrr a3, 0x22 # get wid
|
||||
slli a3, a3, 0x1a # shift by wid
|
||||
csrr a2, 0x20 # get tid
|
||||
slli a1, a2, 10 # multiply tid by 1024
|
||||
slli a2, a2, 2 # multiply tid by 4
|
||||
lui sp, 0x6ffff # load base sp
|
||||
sub sp, sp, a1 # sub sp - (1024*tid)
|
||||
sub sp, sp, a3 # shoft per warp
|
||||
add sp, sp, a2 # shift sp for better performance
|
||||
|
||||
csrr a3, 0x21 # get wid
|
||||
beqz a3, RETURN
|
||||
li a0, 0
|
||||
.word 0x0005006b # tmc 0
|
||||
RETURN:
|
||||
ret
|
||||
|
||||
.section .data
|
||||
.global __dso_handle
|
||||
.weak __dso_handle
|
||||
__dso_handle:
|
||||
.long 0
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
|
||||
#include "../config.h"
|
||||
#include "../intrinsics/vx_intrinsics.h"
|
||||
#include "vx_api.h"
|
||||
#include <inttypes.h>
|
||||
@@ -11,24 +11,27 @@ func_t global_function_pointer;
|
||||
void * global_argument_struct;
|
||||
unsigned global_num_threads;
|
||||
|
||||
void setup_call() {
|
||||
void spawn_warp_runonce() {
|
||||
// active all threads
|
||||
vx_tmc(global_num_threads);
|
||||
|
||||
// call user routine
|
||||
global_function_pointer(global_argument_struct);
|
||||
|
||||
// resume single-thread execution on exit
|
||||
unsigned wid = vx_warpID();
|
||||
if (wid != 0) {
|
||||
vx_tmc(0); // Halt Warp Execution
|
||||
} else {
|
||||
vx_tmc(1); // Only activate one thread
|
||||
}
|
||||
unsigned tmask = (0 == wid) ? 0x1 : 0x0;
|
||||
vx_tmc(tmask);
|
||||
}
|
||||
|
||||
void vx_spawnWarps(unsigned numWarps, unsigned numThreads, func_t func_ptr, void * args) {
|
||||
global_function_pointer = func_ptr;
|
||||
global_argument_struct = args;
|
||||
global_num_threads = numThreads;
|
||||
vx_wspawn(numWarps, (unsigned) setup_call);
|
||||
setup_call();
|
||||
if (numWarps > 1) {
|
||||
vx_wspawn(numWarps, (unsigned)spawn_warp_runonce);
|
||||
}
|
||||
spawn_warp_runonce();
|
||||
}
|
||||
|
||||
unsigned pocl_threads;
|
||||
@@ -36,20 +39,20 @@ struct context_t * pocl_ctx;
|
||||
vx_pocl_workgroup_func pocl_pfn;
|
||||
const void * pocl_args;
|
||||
|
||||
void pocl_spawn_runonce() {
|
||||
|
||||
void pocl_spawn_warp_runonce() {
|
||||
// active all threads
|
||||
vx_tmc(pocl_threads);
|
||||
|
||||
int x = vx_threadID();
|
||||
int y = vx_warpID();
|
||||
int y = vx_warpNum();
|
||||
|
||||
// call kernel routine
|
||||
(pocl_pfn)(pocl_args, pocl_ctx, x, y, 0);
|
||||
|
||||
if (y != 0) {
|
||||
vx_tmc(0);
|
||||
}
|
||||
|
||||
vx_tmc(1);
|
||||
// resume single-thread execution on exit
|
||||
int wid = vx_warpID();
|
||||
unsigned tmask = (0 == wid) ? 0x1 : 0x0;
|
||||
vx_tmc(tmask);
|
||||
}
|
||||
|
||||
void pocl_spawn(struct context_t * ctx, vx_pocl_workgroup_func pfn, const void * args) {
|
||||
@@ -64,10 +67,10 @@ void pocl_spawn(struct context_t * ctx, vx_pocl_workgroup_func pfn, const void *
|
||||
pocl_args = args;
|
||||
|
||||
if (ctx->num_groups[1] > 1) {
|
||||
vx_wspawn(ctx->num_groups[1], (unsigned)&pocl_spawn_runonce);
|
||||
vx_wspawn(ctx->num_groups[1], (unsigned)&pocl_spawn_warp_runonce);
|
||||
}
|
||||
|
||||
pocl_spawn_runonce();
|
||||
pocl_spawn_warp_runonce();
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
Reference in New Issue
Block a user