From 3602d287b46c4b0affaf87c2299e7edc6e26f1d9 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 25 Jan 2021 07:04:54 -0800 Subject: [PATCH] wspawn fix for small sets --- driver/tests/basic/basic.cpp | 4 + driver/tests/basic/kernel_scheduler.h | 153 ++++++++++++++++++++++++++ runtime/src/vx_spawn.c | 6 +- 3 files changed, 160 insertions(+), 3 deletions(-) create mode 100644 driver/tests/basic/kernel_scheduler.h diff --git a/driver/tests/basic/basic.cpp b/driver/tests/basic/basic.cpp index f33db409..fb232c7c 100755 --- a/driver/tests/basic/basic.cpp +++ b/driver/tests/basic/basic.cpp @@ -4,6 +4,7 @@ #include #include #include "common.h" +#include "kernel_scheduler.h" #define RT_CHECK(_expr) \ do { \ @@ -212,6 +213,7 @@ int run_kernel_test(const kernel_arg_t& kernel_arg, } int main(int argc, char *argv[]) { + size_t value; kernel_arg_t kernel_arg; @@ -222,6 +224,8 @@ int main(int argc, char *argv[]) { count = 1; } + //kernel_run(count, 1, 1, test, 4, 4); + // open device connection std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); diff --git a/driver/tests/basic/kernel_scheduler.h b/driver/tests/basic/kernel_scheduler.h new file mode 100644 index 00000000..8fd7dc28 --- /dev/null +++ b/driver/tests/basic/kernel_scheduler.h @@ -0,0 +1,153 @@ +#include +#include + +#define NUM_CORES_MAX 32 + +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) + +struct context_t { + uint32_t num_groups[3]; + uint32_t global_offset[3]; + uint32_t local_size[3]; + char * printf_buffer; + uint32_t *printf_buffer_position; + uint32_t printf_buffer_capacity; + uint32_t work_dim; +}; + +typedef void (*vx_pocl_workgroup_func) ( + const void * /* args */, + const struct context_t * /* context */, + uint32_t /* group_x */, + uint32_t /* group_y */, + uint32_t /* group_z */ +); + +typedef struct { + struct context_t * ctx; + vx_pocl_workgroup_func pfn; + const void * args; + int offset; + int N; + int R; +} wspawn_args_t; + +void kernel_spawn_callback(int core_id, int NW, int NT, int nW, wspawn_args_t* p_wspawn_args) { + assert(nW <= NW); + for (int wid = 0; wid < nW; ++wid) { + for (int tid = 0; tid < NT; ++tid) { + int wK = (p_wspawn_args->N * wid) + MIN(p_wspawn_args->R, wid); + int tK = p_wspawn_args->N + (wid < p_wspawn_args->R); + int offset = p_wspawn_args->offset + (wK * NT) + (tid * tK); + + int X = p_wspawn_args->ctx->num_groups[0]; + int Y = p_wspawn_args->ctx->num_groups[1]; + int XY = X * Y; + + for (int wg_id = offset, N = wg_id + tK; wg_id < N; ++wg_id) { + int k = wg_id / XY; + int wg_2d = wg_id - k * XY; + int j = wg_2d / X; + int i = wg_2d - j * X; + + int gid0 = p_wspawn_args->ctx->global_offset[0] + i; + int gid1 = p_wspawn_args->ctx->global_offset[1] + j; + int gid2 = p_wspawn_args->ctx->global_offset[2] + k; + + printf("c%d w%d t%d: g={%d, %d, %d}\n", core_id, wid, tid, gid0, gid1, gid2); + } + } + } +} + +void kernel_spawn_remaining_callback(int core_id, int NW, int NT, int wid, int nT, wspawn_args_t* p_wspawn_args) { + assert(wid < NW); + assert(nT <= NT); + for (int t = 0; t < nT; ++t) { + int tid = core_id * NW * NT + wid * NT + t; + + int wg_id = p_wspawn_args->offset + tid; + + int X = p_wspawn_args->ctx->num_groups[0]; + int Y = p_wspawn_args->ctx->num_groups[1]; + int XY = X * Y; + + int k = wg_id / XY; + int wg_2d = wg_id - k * XY; + int j = wg_2d / X; + int i = wg_2d - j * X; + + int 
gid0 = p_wspawn_args->ctx->global_offset[0] + i; + int gid1 = p_wspawn_args->ctx->global_offset[1] + j; + int gid2 = p_wspawn_args->ctx->global_offset[2] + k; + + printf("c%d w%d t%d: g={%d, %d, %d}\n", core_id, wid, tid, gid0, gid1, gid2); + } +} + +void kernel_run_once(context_t* ctx, int NC, int NW, int NT, int core_id) { + // total number of WGs + int X = ctx->num_groups[0]; + int Y = ctx->num_groups[1]; + int Z = ctx->num_groups[2]; + int Q = X * Y * Z; + + // current core id + if (core_id >= NUM_CORES_MAX) + return; + + // calculate necessary active cores + int WT = NW * NT; + int nC = (Q > WT) ? (Q / WT) : 1; + int nc = MIN(nC, NC); + if (core_id >= nc) + return; // terminate extra cores + + // number of workgroups per core + int wgs_per_core = Q / nc; + int wgs_per_core0 = wgs_per_core; + if (core_id == (NC-1)) { + int QC_r = Q - (nc * wgs_per_core0); + wgs_per_core0 += QC_r; // last core executes remaining WGs + } + + // number of workgroups per warp + int nW = wgs_per_core0 / NT; // total warps per core + int rT = wgs_per_core0 - (nW * NT); // remaining threads + int fW = (nW >= NW) ? (nW / NW) : 0; // full warps iterations + int rW = (fW != 0) ? 
(nW - fW * NW) : 0; // remaining full warps + if (0 == fW) + fW = 1; + + //-- + wspawn_args_t wspawn_args = { ctx, NULL, NULL, core_id * wgs_per_core, fW, rW }; + + //-- + if (nW >= 1) { + int nw = MIN(nW, NW); + kernel_spawn_callback(core_id, NW, NT, nw, &wspawn_args); + } + + //-- + if (rT != 0) { + wspawn_args.offset = wgs_per_core0 - rT; + kernel_spawn_remaining_callback(core_id, NW, NT, 0, rT, &wspawn_args); + } + } + + void kernel_run(int X, int Y, int Z, int NC, int NW, int NT) { + context_t ctx; + + ctx.num_groups[0] = X; + ctx.num_groups[1] = Y; + ctx.num_groups[2] = Z; + ctx.global_offset[0] = 0; + ctx.global_offset[1] = 0; + ctx.global_offset[2] = 0; + + for (int cid = 0; cid < NC; ++cid) { + kernel_run_once(&ctx, NC, NW, NT, cid); + } + + exit (0); + } \ No newline at end of file diff --git a/runtime/src/vx_spawn.c b/runtime/src/vx_spawn.c index b73817f2..e172fee2 100644 --- a/runtime/src/vx_spawn.c +++ b/runtime/src/vx_spawn.c @@ -6,7 +6,7 @@ extern "C" { #endif -#define NUM_CORES_MAX 16 +#define NUM_CORES_MAX 32 #define MIN(a, b) ((a) < (b) ? (a) : (b)) @@ -71,7 +71,7 @@ void vx_spawn_tasks(int num_tasks, pfn_callback callback , void * args) { int nC = (num_tasks > WT) ? (num_tasks / WT) : 1; int nc = MIN(nC, NC); if (core_id >= nc) - return; // terminate unused cores + return; // terminate extra cores // number of tasks per core int tasks_per_core = num_tasks / nc; @@ -94,7 +94,7 @@ void vx_spawn_tasks(int num_tasks, pfn_callback callback , void * args) { g_wspawn_args[core_id] = &wspawn_args; //-- - if (nW > 1) { + if (nW >= 1) { int nw = MIN(nW, NW); vx_wspawn(nw, (unsigned)&spawn_tasks_callback); spawn_tasks_callback();