// Dump a raw operand buffer to `filename` (binary mode, truncating any
// existing file).
//
// filename: path of the file to create; must not be null.
// data:     start of the buffer to dump; must not be null.
// size:     number of bytes to write; must be non-zero.
//
// Returns 0 when all `size` bytes were written and the stream was closed
// successfully, -1 on invalid arguments or on any I/O failure.
static int write_operand_file(const char* filename, void* data, size_t size) {
  if (nullptr == filename || nullptr == data || 0 == size)
    return -1;

  FILE* fp = fopen(filename, "wb");
  if (NULL == fp) {
    fprintf(stderr, "Failed to write operand data.\n");
    return -1;
  }

  // The buffer is written as a single item, so a short write yields 0 items.
  const bool written = (fwrite(data, size, 1, fp) == 1);

  // Close on every path so a failed write does not leak the stream. fclose
  // flushes buffered data, so its failure also means the file is incomplete.
  const bool closed = (fclose(fp) == 0);

  if (!written || !closed) {
    fprintf(stderr, "Failed to write operand data.\n");
    return -1;
  }

  return 0;
}
files changed, 245 insertions(+) create mode 100644 tests/opencl/sharedmem/.depend create mode 100644 tests/opencl/sharedmem/.gitignore create mode 100644 tests/opencl/sharedmem/Makefile create mode 100644 tests/opencl/sharedmem/README create mode 100644 tests/opencl/sharedmem/kernel.cl create mode 100644 tests/opencl/sharedmem/main.cc diff --git a/tests/opencl/sharedmem/.depend b/tests/opencl/sharedmem/.depend new file mode 100644 index 00000000..6f7bdaac --- /dev/null +++ b/tests/opencl/sharedmem/.depend @@ -0,0 +1,8 @@ +main.o: main.cc \ + /scratch/hansung/build/pocl-vortex2/runtime/include/CL/opencl.h \ + /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl.h \ + /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_version.h \ + /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_platform.h \ + /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_gl.h \ + /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_ext.h \ + /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_ext_pocl.h diff --git a/tests/opencl/sharedmem/.gitignore b/tests/opencl/sharedmem/.gitignore new file mode 100644 index 00000000..ae170236 --- /dev/null +++ b/tests/opencl/sharedmem/.gitignore @@ -0,0 +1,5 @@ +sharedmem +*.bin* +*.pocl +*.dump +*.o diff --git a/tests/opencl/sharedmem/Makefile b/tests/opencl/sharedmem/Makefile new file mode 100644 index 00000000..bc0e3197 --- /dev/null +++ b/tests/opencl/sharedmem/Makefile @@ -0,0 +1,7 @@ +PROJECT = sharedmem + +SRCS = main.cc + +OPTS ?= -n64 + +include ../common.mk diff --git a/tests/opencl/sharedmem/README b/tests/opencl/sharedmem/README new file mode 100644 index 00000000..e69de29b diff --git a/tests/opencl/sharedmem/kernel.cl b/tests/opencl/sharedmem/kernel.cl new file mode 100644 index 00000000..76bf54cc --- /dev/null +++ b/tests/opencl/sharedmem/kernel.cl @@ -0,0 +1,13 @@ +__kernel void sharedmem (__global volatile const float *src, + __global volatile float *dst, + __local volatile float *smem) +{ + int gid = 
// Load a whole kernel binary into a freshly malloc'ed buffer.
//
// filename: path of the file to read; must not be null.
// data:     receives the buffer on success; the caller owns it and must
//           free() it. Left untouched on failure.
// size:     receives the number of bytes read on success. Left untouched on
//           failure.
//
// Returns 0 on success, -1 on invalid arguments or any I/O / allocation
// failure.
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (nullptr == filename || nullptr == data || nullptr == size)
    return -1;

  // The kernel image is binary data, so open it in binary mode.
  FILE* fp = fopen(filename, "rb");
  if (NULL == fp) {
    fprintf(stderr, "Failed to load kernel.");
    return -1;
  }

  int rc = -1;
  uint8_t* buf = NULL;

  // Determine the file length; ftell() reports -1 on error, which must not
  // be turned into an allocation size.
  long fsize = -1;
  if (0 == fseek(fp, 0, SEEK_END))
    fsize = ftell(fp);

  if (fsize >= 0 && 0 == fseek(fp, 0, SEEK_SET)) {
    // Allocate at least one byte so an empty file still yields a valid,
    // freeable pointer.
    buf = (uint8_t*)malloc(fsize > 0 ? (size_t)fsize : 1);
    if (buf != NULL && fread(buf, 1, (size_t)fsize, fp) == (size_t)fsize) {
      *data = buf;
      *size = (size_t)fsize;
      rc = 0;
    }
  }

  if (rc != 0) {
    fprintf(stderr, "Failed to read kernel file.\n");
    free(buf);
  }

  fclose(fp);

  return rc;
}

// Compare two floats by the distance between their bit patterns.
//
// Returns true when a == b (which also covers +0 vs -0) or when the integer
// representations differ by at most `ulp`.
static bool almost_equal(float a, float b, int ulp = 4) {
  if (a == b)
    return true;

  union fi_t { int i; float f; };
  fi_t fa, fb;
  fa.f = a;
  fb.f = b;

  // Subtract in 64 bits: the 32-bit difference overflows when the operands
  // have opposite signs, and negating INT_MIN is undefined.
  int64_t diff = (int64_t)fa.i - (int64_t)fb.i;
  if (diff < 0)
    diff = -diff;
  return diff <= (int64_t)ulp;
}
// Number of elements processed by the test; overridden with -n.
int size = 64;

// Print the command-line synopsis.
static void show_usage() {
  printf("Usage: [-n size] [-h: help]\n");
}

// Parse the command line and update the global `size`.
//
// -n <size> sets the workload size; it must be a positive decimal integer
// that fits in an int. -h (and getopt's '?' result) prints the usage text
// and exits with status 0. An invalid -n value prints a diagnostic plus the
// usage text and exits with status -1.
static void parse_args(int argc, char **argv) {
  int c;
  while ((c = getopt(argc, argv, "n:h?")) != -1) {
    switch (c) {
    case 'n': {
      // strtol reports where parsing stopped, so trailing garbage and empty
      // input are rejected instead of silently becoming 0.
      char *end = NULL;
      long value = strtol(optarg, &end, 10);
      if (end == optarg || *end != '\0' || value <= 0 || value != (long)(int)value) {
        fprintf(stderr, "Invalid workload size: '%s'\n", optarg);
        show_usage();
        exit(-1);
      }
      size = (int)value;
    } break;
    case 'h':
    case '?': {
      show_usage();
      exit(0);
    } break;
    default:
      show_usage();
      exit(-1);
    }
  }

  printf("Workload size=%d\n", size);
}
&_err)); + + // store entire array to sharedmem + size_t local_size = size; + + // Set kernel arguments + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&src_memobj)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&dst_memobj)); + CL_CHECK(clSetKernelArg(kernel, 2, local_size*sizeof(float), NULL)); + + // Allocate memories for input arrays and output arrays. + h_src = (float*)malloc(nbytes); + h_dst = (float*)malloc(nbytes); + + // Initialize values for array members. + for (int i = 0; i < size; ++i) { + h_src[i] = sinf(i)*sinf(i); + h_dst[i] = 0xdeadbeef; + //printf("*** [%d]: h_src=%f, h_dst=%f\n", i, h_src[i], h_dst[i]); + } + + // Creating command queue + commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err)); + + printf("Upload source buffers\n"); + CL_CHECK(clEnqueueWriteBuffer(commandQueue, src_memobj, CL_TRUE, 0, nbytes, h_src, 0, NULL, NULL)); + + printf("Execute the kernel\n"); + size_t global_work_size[1] = {size}; + size_t local_work_size[1] = {1}; + auto time_start = std::chrono::high_resolution_clock::now(); + CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clFinish(commandQueue)); + auto time_end = std::chrono::high_resolution_clock::now(); + double elapsed = std::chrono::duration_cast(time_end - time_start).count(); + printf("Elapsed time: %lg ms\n", elapsed); + + printf("Download destination buffer\n"); + CL_CHECK(clEnqueueReadBuffer(commandQueue, dst_memobj, CL_TRUE, 0, nbytes, h_dst, 0, NULL, NULL)); + + printf("Verify result\n"); + int errors = 0; + for (int i = 0; i < size; ++i) { + float ref = h_src[i]; + if (!almost_equal(h_dst[i], ref)) { + if (errors < 100) + printf("*** error: [%d] expected=%f, actual=%f, src=%f\n", i, ref, h_dst[i], h_src[i]); + ++errors; + } + } + if (0 == errors) { + printf("PASSED!\n"); + } else { + printf("FAILED! 
// Dump a raw operand buffer to `filename` (binary mode, truncating any
// existing file).
//
// filename: path of the file to create; must not be null.
// data:     start of the buffer to dump; must not be null.
// size:     number of bytes to write; must be non-zero.
//
// Returns 0 when all `size` bytes were written and the stream was closed
// successfully, -1 on invalid arguments or on any I/O failure.
static int write_operand_file(const char* filename, void* data, size_t size) {
  if (nullptr == filename || nullptr == data || 0 == size)
    return -1;

  FILE* fp = fopen(filename, "wb");
  if (NULL == fp) {
    fprintf(stderr, "Failed to write operand data.\n");
    return -1;
  }

  // The buffer is written as a single item, so a short write yields 0 items.
  const bool written = (fwrite(data, size, 1, fp) == 1);

  // Close on every path so a failed write does not leak the stream. fclose
  // flushes buffered data, so its failure also means the file is incomplete.
  const bool closed = (fclose(fp) == 0);

  if (!written || !closed) {
    fprintf(stderr, "Failed to write operand data.\n");
    return -1;
  }

  return 0;
}
a/tests/opencl/sharedmem/kernel.cl +++ b/tests/opencl/sharedmem/kernel.cl @@ -6,7 +6,7 @@ __kernel void sharedmem (__global volatile const float *src, smem[gid] = src[gid]; float read; __attribute__((opencl_unroll_hint)) - for (int i = 0; i < 500; i++) { + for (int i = 0; i < 5000; i++) { read = smem[gid]; } dst[gid] = read; From a5c50b60c86c38b18498a7d0d8e6f9def836c1cb Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 2 Jan 2024 20:28:40 -0800 Subject: [PATCH 05/75] Add 'smemcoherence' kernel --- tests/opencl/flops/.depend | 8 + tests/opencl/flops/.gitignore | 6 + tests/opencl/flops/Makefile | 7 + tests/opencl/flops/README | 0 tests/opencl/flops/kernel.cl | 13 ++ tests/opencl/flops/main.cc | 237 +++++++++++++++++++++++++ tests/opencl/smemcoherence/.depend | 8 + tests/opencl/smemcoherence/.gitignore | 5 + tests/opencl/smemcoherence/Makefile | 7 + tests/opencl/smemcoherence/README | 0 tests/opencl/smemcoherence/kernel.cl | 33 ++++ tests/opencl/smemcoherence/main.cc | 238 ++++++++++++++++++++++++++ 12 files changed, 562 insertions(+) create mode 100644 tests/opencl/flops/.depend create mode 100644 tests/opencl/flops/.gitignore create mode 100644 tests/opencl/flops/Makefile create mode 100644 tests/opencl/flops/README create mode 100644 tests/opencl/flops/kernel.cl create mode 100644 tests/opencl/flops/main.cc create mode 100644 tests/opencl/smemcoherence/.depend create mode 100644 tests/opencl/smemcoherence/.gitignore create mode 100644 tests/opencl/smemcoherence/Makefile create mode 100644 tests/opencl/smemcoherence/README create mode 100644 tests/opencl/smemcoherence/kernel.cl create mode 100644 tests/opencl/smemcoherence/main.cc diff --git a/tests/opencl/flops/.depend b/tests/opencl/flops/.depend new file mode 100644 index 00000000..6f7bdaac --- /dev/null +++ b/tests/opencl/flops/.depend @@ -0,0 +1,8 @@ +main.o: main.cc \ + /scratch/hansung/build/pocl-vortex2/runtime/include/CL/opencl.h \ + /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl.h \ + 
// Floating-point add microbenchmark kernel.
//
// Every work-item reads src[0] once, adds it to a running sum 5000 times and
// stores the sum to dst[get_global_id(0)]. The result therefore depends only
// on src[0], not on the work-item's own src element.
//
// src:  input buffer; only element 0 is read.
// dst:  output buffer; one element is written per work-item.
// smem: local-memory argument; not referenced by the kernel body.
__kernel void flops (__global volatile const float *src,
                     __global volatile float *dst,
                     __local volatile float *smem)
{
  int gid = get_global_id(0);
  float f = 0.0f;
  // Single read of the increment; the loop below touches no memory.
  float incr = src[0];
  __attribute__((opencl_unroll_hint))
  for (int i = 0; i < 5000; i++) {
    f += incr;
  }
  dst[gid] = f;
}
// Load a whole kernel binary into a freshly malloc'ed buffer.
//
// filename: path of the file to read; must not be null.
// data:     receives the buffer on success; the caller owns it and must
//           free() it. Left untouched on failure.
// size:     receives the number of bytes read on success. Left untouched on
//           failure.
//
// Returns 0 on success, -1 on invalid arguments or any I/O / allocation
// failure.
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (nullptr == filename || nullptr == data || nullptr == size)
    return -1;

  // The kernel image is binary data, so open it in binary mode.
  FILE* fp = fopen(filename, "rb");
  if (NULL == fp) {
    fprintf(stderr, "Failed to load kernel.");
    return -1;
  }

  int rc = -1;
  uint8_t* buf = NULL;

  // Determine the file length; ftell() reports -1 on error, which must not
  // be turned into an allocation size.
  long fsize = -1;
  if (0 == fseek(fp, 0, SEEK_END))
    fsize = ftell(fp);

  if (fsize >= 0 && 0 == fseek(fp, 0, SEEK_SET)) {
    // Allocate at least one byte so an empty file still yields a valid,
    // freeable pointer.
    buf = (uint8_t*)malloc(fsize > 0 ? (size_t)fsize : 1);
    if (buf != NULL && fread(buf, 1, (size_t)fsize, fp) == (size_t)fsize) {
      *data = buf;
      *size = (size_t)fsize;
      rc = 0;
    }
  }

  if (rc != 0) {
    fprintf(stderr, "Failed to read kernel file.\n");
    free(buf);
  }

  fclose(fp);

  return rc;
}

// Dump a raw operand buffer to `filename` (binary mode, truncating any
// existing file).
//
// Returns 0 when all `size` bytes were written and the stream was closed
// successfully, -1 on invalid arguments (null filename, null data, zero
// size) or on any I/O failure.
static int write_operand_file(const char* filename, void* data, size_t size) {
  if (nullptr == filename || nullptr == data || 0 == size)
    return -1;

  FILE* fp = fopen(filename, "wb");
  if (NULL == fp) {
    fprintf(stderr, "Failed to write operand data.\n");
    return -1;
  }

  // The buffer is written as a single item, so a short write yields 0 items.
  const bool written = (fwrite(data, size, 1, fp) == 1);

  // Close on every path so a failed write does not leak the stream. fclose
  // flushes buffered data, so its failure also means the file is incomplete.
  const bool closed = (fclose(fp) == 0);

  if (!written || !closed) {
    fprintf(stderr, "Failed to write operand data.\n");
    return -1;
  }

  return 0;
}

// Compare two floats by the distance between their bit patterns.
//
// Returns true when a == b (which also covers +0 vs -0) or when the integer
// representations differ by at most `ulp`.
static bool almost_equal(float a, float b, int ulp = 4) {
  if (a == b)
    return true;

  union fi_t { int i; float f; };
  fi_t fa, fb;
  fa.f = a;
  fb.f = b;

  // Subtract in 64 bits: the 32-bit difference overflows when the operands
  // have opposite signs, and negating INT_MIN is undefined.
  int64_t diff = (int64_t)fa.i - (int64_t)fb.i;
  if (diff < 0)
    diff = -diff;
  return diff <= (int64_t)ulp;
}
free(h_dst); +} + +int size = 64; + +static void show_usage() { + printf("Usage: [-n size] [-h: help]\n"); +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:h?")) != -1) { + switch (c) { + case 'n': + size = atoi(optarg); + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } + + printf("Workload size=%d\n", size); +} + +int main (int argc, char **argv) { + // parse command arguments + parse_args(argc, argv); + + cl_platform_id platform_id; + size_t kernel_size; + cl_int binary_status; + + // read kernel binary from file + if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size)) + return -1; + + // Getting platform and device information + CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL)); + CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL)); + + printf("Create context\n"); + context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err)); + + printf("Allocate device buffers\n"); + size_t nbytes = size * sizeof(float); + src_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err)); + dst_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err)); + + printf("Create program from kernel source\n"); + cl_int _err; + program = clCreateProgramWithBinary( + context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err); + if (program == NULL) { + cleanup(); + return -1; + } + + // Build program + CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL)); + + // Create kernel + kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err)); + + // store entire array to sharedmem + size_t local_size = size; + + // Set kernel arguments + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&src_memobj)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&dst_memobj)); + CL_CHECK(clSetKernelArg(kernel, 
2, local_size*sizeof(float), NULL)); + + // Allocate memories for input arrays and output arrays. + h_src = (float*)malloc(nbytes); + h_dst = (float*)malloc(nbytes); + + // Initialize values for array members. + for (int i = 0; i < size; ++i) { + h_src[i] = sinf(i)*sinf(i); + h_dst[i] = 0xdeadbeef; + //printf("*** [%d]: h_src=%f, h_dst=%f\n", i, h_src[i], h_dst[i]); + } + + // NOTE(hansung): Dump operand buffer to a file + if (write_operand_file("flops.input.src.bin", h_src, nbytes) != 0) + return EXIT_FAILURE; + + // Creating command queue + commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err)); + + printf("Upload source buffers\n"); + CL_CHECK(clEnqueueWriteBuffer(commandQueue, src_memobj, CL_TRUE, 0, nbytes, h_src, 0, NULL, NULL)); + + printf("Execute the kernel\n"); + size_t global_work_size[1] = {size}; + size_t local_work_size[1] = {1}; + auto time_start = std::chrono::high_resolution_clock::now(); + CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clFinish(commandQueue)); + auto time_end = std::chrono::high_resolution_clock::now(); + double elapsed = std::chrono::duration_cast(time_end - time_start).count(); + printf("Elapsed time: %lg ms\n", elapsed); + + printf("Download destination buffer\n"); + CL_CHECK(clEnqueueReadBuffer(commandQueue, dst_memobj, CL_TRUE, 0, nbytes, h_dst, 0, NULL, NULL)); + + printf("Verify result\n"); + int errors = 0; + for (int i = 0; i < size; ++i) { + float ref = h_src[i]; + if (!almost_equal(h_dst[i], ref)) { + if (errors < 100) + printf("*** error: [%d] expected=%f, actual=%f, src=%f\n", i, ref, h_dst[i], h_src[i]); + ++errors; + } + } + if (0 == errors) { + printf("PASSED!\n"); + } else { + printf("FAILED! 
- %d errors\n", errors); + } + + // Clean up + cleanup(); + + return errors; +} diff --git a/tests/opencl/smemcoherence/.depend b/tests/opencl/smemcoherence/.depend new file mode 100644 index 00000000..6f7bdaac --- /dev/null +++ b/tests/opencl/smemcoherence/.depend @@ -0,0 +1,8 @@ +main.o: main.cc \ + /scratch/hansung/build/pocl-vortex2/runtime/include/CL/opencl.h \ + /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl.h \ + /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_version.h \ + /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_platform.h \ + /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_gl.h \ + /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_ext.h \ + /scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_ext_pocl.h diff --git a/tests/opencl/smemcoherence/.gitignore b/tests/opencl/smemcoherence/.gitignore new file mode 100644 index 00000000..95d1c091 --- /dev/null +++ b/tests/opencl/smemcoherence/.gitignore @@ -0,0 +1,5 @@ +smemcoherence +*.bin* +*.pocl +*.dump +*.o diff --git a/tests/opencl/smemcoherence/Makefile b/tests/opencl/smemcoherence/Makefile new file mode 100644 index 00000000..0ee5beae --- /dev/null +++ b/tests/opencl/smemcoherence/Makefile @@ -0,0 +1,7 @@ +PROJECT = smemcoherence + +SRCS = main.cc + +OPTS ?= -n64 + +include ../common.mk diff --git a/tests/opencl/smemcoherence/README b/tests/opencl/smemcoherence/README new file mode 100644 index 00000000..e69de29b diff --git a/tests/opencl/smemcoherence/kernel.cl b/tests/opencl/smemcoherence/kernel.cl new file mode 100644 index 00000000..ace1bbcd --- /dev/null +++ b/tests/opencl/smemcoherence/kernel.cl @@ -0,0 +1,33 @@ +__kernel void smemcoherence (__global volatile const int *src, + __global volatile int *dst, + __local volatile int *smem, + int n) +{ + __local volatile int *markers = (__local int *)((__local unsigned char *)smem + 0x1000); + int gid = get_global_id(0); + + // assumes total store ordering on smem + markers[gid] = 0; + smem[gid] = 
// Load a whole kernel binary into a freshly malloc'ed buffer.
//
// filename: path of the file to read; must not be null.
// data:     receives the buffer on success; the caller owns it and must
//           free() it. Left untouched on failure.
// size:     receives the number of bytes read on success. Left untouched on
//           failure.
//
// Returns 0 on success, -1 on invalid arguments or any I/O / allocation
// failure.
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (nullptr == filename || nullptr == data || nullptr == size)
    return -1;

  // The kernel image is binary data, so open it in binary mode.
  FILE* fp = fopen(filename, "rb");
  if (NULL == fp) {
    fprintf(stderr, "Failed to load kernel.");
    return -1;
  }

  int rc = -1;
  uint8_t* buf = NULL;

  // Determine the file length; ftell() reports -1 on error, which must not
  // be turned into an allocation size.
  long fsize = -1;
  if (0 == fseek(fp, 0, SEEK_END))
    fsize = ftell(fp);

  if (fsize >= 0 && 0 == fseek(fp, 0, SEEK_SET)) {
    // Allocate at least one byte so an empty file still yields a valid,
    // freeable pointer.
    buf = (uint8_t*)malloc(fsize > 0 ? (size_t)fsize : 1);
    if (buf != NULL && fread(buf, 1, (size_t)fsize, fp) == (size_t)fsize) {
      *data = buf;
      *size = (size_t)fsize;
      rc = 0;
    }
  }

  if (rc != 0) {
    fprintf(stderr, "Failed to read kernel file.\n");
    free(buf);
  }

  fclose(fp);

  return rc;
}
write operand data.\n"); + return -1; + } + + fclose(fp); + + return 0; +} + +static bool almost_equal(float a, float b, int ulp = 4) { + union fi_t { int i; float f; }; + fi_t fa, fb; + fa.f = a; + fb.f = b; + return std::abs(fa.i - fb.i) <= ulp; +} + +cl_device_id device_id = NULL; +cl_context context = NULL; +cl_command_queue commandQueue = NULL; +cl_program program = NULL; +cl_kernel kernel = NULL; +cl_mem src_memobj = NULL; +cl_mem dst_memobj = NULL; +int *h_src = NULL; +int *h_dst = NULL; +uint8_t *kernel_bin = NULL; + +static void cleanup() { + if (commandQueue) clReleaseCommandQueue(commandQueue); + if (kernel) clReleaseKernel(kernel); + if (program) clReleaseProgram(program); + if (src_memobj) clReleaseMemObject(src_memobj); + if (dst_memobj) clReleaseMemObject(dst_memobj); + if (context) clReleaseContext(context); + if (device_id) clReleaseDevice(device_id); + + if (kernel_bin) free(kernel_bin); + if (h_src) free(h_src); + if (h_dst) free(h_dst); +} + +int size = 64; + +static void show_usage() { + printf("Usage: [-n size] [-h: help]\n"); +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:h?")) != -1) { + switch (c) { + case 'n': + size = atoi(optarg); + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } + + printf("Workload size=%d\n", size); +} + +int main (int argc, char **argv) { + // parse command arguments + parse_args(argc, argv); + + cl_platform_id platform_id; + size_t kernel_size; + cl_int binary_status; + + // read kernel binary from file + if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size)) + return -1; + + // Getting platform and device information + CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL)); + CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL)); + + printf("Create context\n"); + context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err)); + + 
printf("Allocate device buffers\n"); + // + 1 for the trial value + size_t nbytes = (size + 1) * sizeof(int); + src_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err)); + dst_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err)); + + printf("Create program from kernel source\n"); + cl_int _err; + program = clCreateProgramWithBinary( + context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err); + if (program == NULL) { + cleanup(); + return -1; + } + + // Build program + CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL)); + + // Create kernel + kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err)); + + size_t local_nbytes = 0x2000; + + // Set kernel arguments + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&src_memobj)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&dst_memobj)); + CL_CHECK(clSetKernelArg(kernel, 2, local_nbytes, NULL)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size)); + + // Allocate memories for input arrays and output arrays. + h_src = (int*)malloc(nbytes); + h_dst = (int*)malloc(nbytes); + + // Initialize values for array members. 
+ for (int i = 0; i < size; ++i) { + h_src[i] = i; + h_dst[i] = 0xdeadbeef; + //printf("*** [%d]: h_src=%f, h_dst=%f\n", i, h_src[i], h_dst[i]); + } + + // NOTE(hansung): Dump operand buffer to a file + if (write_operand_file("smemcoherence.input.src.bin", h_src, nbytes) != 0) + return EXIT_FAILURE; + + // Creating command queue + commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err)); + + printf("Upload source buffers\n"); + CL_CHECK(clEnqueueWriteBuffer(commandQueue, src_memobj, CL_TRUE, 0, nbytes, h_src, 0, NULL, NULL)); + + printf("Execute the kernel\n"); + size_t global_work_size[1] = {size}; + size_t local_work_size[1] = {1}; + auto time_start = std::chrono::high_resolution_clock::now(); + CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clFinish(commandQueue)); + auto time_end = std::chrono::high_resolution_clock::now(); + double elapsed = std::chrono::duration_cast(time_end - time_start).count(); + printf("Elapsed time: %lg ms\n", elapsed); + + printf("Download destination buffer\n"); + CL_CHECK(clEnqueueReadBuffer(commandQueue, dst_memobj, CL_TRUE, 0, nbytes, h_dst, 0, NULL, NULL)); + + printf("Verify result\n"); + int errors = 0; + for (int i = 0; i < size; ++i) { + int ref = i; + if (h_dst[i] != ref) { + printf("*** error: [%d] expected=%d, actual=%d\n", i, ref, h_dst[i]); + ++errors; + } + } + printf("smem check re-trial count: %d\n", h_dst[size]); + if (0 == errors) { + printf("PASSED!\n"); + } else { + printf("FAILED! 
- %d errors\n", errors); + } + + // Clean up + cleanup(); + + return errors; +} From 7f6f1d605f4735967106cbb5c00b413a535681db Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 24 Jan 2024 16:24:19 -0800 Subject: [PATCH 06/75] Add bare mmio kernel --- tests/kernel/gemmini_mmio/Makefile | 52 ++++++++++++++++++++++++++++++ tests/kernel/gemmini_mmio/main.cpp | 35 ++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 tests/kernel/gemmini_mmio/Makefile create mode 100644 tests/kernel/gemmini_mmio/main.cpp diff --git a/tests/kernel/gemmini_mmio/Makefile b/tests/kernel/gemmini_mmio/Makefile new file mode 100644 index 00000000..0591bf72 --- /dev/null +++ b/tests/kernel/gemmini_mmio/Makefile @@ -0,0 +1,52 @@ +XLEN ?= 32 + +ifeq ($(XLEN),64) +RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain +CFLAGS += -march=rv64imafd -mabi=lp64d +else +RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain +CFLAGS += -march=rv32imaf -mabi=ilp32f +endif + +RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf + +VORTEX_KN_PATH ?= $(realpath ../../../kernel) + +CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc +AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar +DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump +CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy + +SIM_DIR = ../../../sim + +CFLAGS += -O3 -v -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections +CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw + +LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a + +PROJECT = gemmini_mmio + +SRCS = main.cpp + +all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump + +$(PROJECT).dump: $(PROJECT).elf + $(DP) -D $(PROJECT).elf > $(PROJECT).dump + +$(PROJECT).bin: $(PROJECT).elf + $(CP) -O binary $(PROJECT).elf $(PROJECT).bin + +$(PROJECT).elf: $(SRCS) + $(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf + +run-rtlsim: 
$(PROJECT).bin + $(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin + +run-simx: $(PROJECT).bin + $(SIM_DIR)/simx/simx $(PROJECT).bin + +.depend: $(SRCS) + $(CC) $(CFLAGS) -MM $^ > .depend; + +clean: + rm -rf *.elf *.bin *.dump .depend diff --git a/tests/kernel/gemmini_mmio/main.cpp b/tests/kernel/gemmini_mmio/main.cpp new file mode 100644 index 00000000..5b83ff25 --- /dev/null +++ b/tests/kernel/gemmini_mmio/main.cpp @@ -0,0 +1,35 @@ +#include +#include +#include +#include + +#define ADDR_LEN 32 +#define XCUSTOM_ACC 3 +#define k_MVOUT_SPAD 23 + +// fence +#define gemmini_fence() asm volatile("fence") + +#define ROCC_INSTRUCTION_RS1_RS2(x, rs1, rs2, funct) { \ + /* printf("function %d\n", funct); */ \ + uint32_t instruction = (0x7B) | (0 << 7) | (3 << 12) | (1 << 15) | (2 << 20) | ((uint32_t) funct << 25); \ + *((volatile uint64_t*) 0xff002010) = (uint64_t) (rs1); \ + *((volatile uint64_t*) 0xff002018) = (uint64_t) (rs2); \ + /* gemmini_fence(); */ \ + *((volatile uint32_t*) 0xff002000) = instruction; \ +} + +#define gemmini_extended_mvout_spad(dst_addr, dst_stride, src_addr, cols, rows) \ + ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(dst_stride) << 32) | (uint64_t)(dst_addr), ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (uint64_t)(src_addr), k_MVOUT_SPAD) + +#define gemmini_mvout_spad(dst_addr, src_addr, cols, rows) \ + gemmini_extended_mvout_spad(dst_addr, 1, src_addr, cols, rows) + +int main() { + gemmini_mvout_spad(0xff000000, 0xff000100, 4, 4); + + // volatile uint32_t *ptr_cmd = (volatile uint32_t *)0xff100000; + // *ptr_cmd = 0xdeadbeef; + + return 0; +} From 0462a91953eb30e8f03616dd1958b6c87782512b Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 1 Feb 2024 13:52:29 -0800 Subject: [PATCH 07/75] Update mmio kernel to do single gemm --- tests/kernel/gemmini_mmio/main.cpp | 67 ++++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 13 deletions(-) diff --git a/tests/kernel/gemmini_mmio/main.cpp 
b/tests/kernel/gemmini_mmio/main.cpp index 5b83ff25..ee65ad43 100644 --- a/tests/kernel/gemmini_mmio/main.cpp +++ b/tests/kernel/gemmini_mmio/main.cpp @@ -2,13 +2,11 @@ #include #include #include +#include -#define ADDR_LEN 32 -#define XCUSTOM_ACC 3 -#define k_MVOUT_SPAD 23 - -// fence -#define gemmini_fence() asm volatile("fence") +// #define ADDR_LEN 32 +// #define XCUSTOM_ACC 3 +// #define k_MVOUT_SPAD 23 #define ROCC_INSTRUCTION_RS1_RS2(x, rs1, rs2, funct) { \ /* printf("function %d\n", funct); */ \ @@ -19,17 +17,60 @@ *((volatile uint32_t*) 0xff002000) = instruction; \ } -#define gemmini_extended_mvout_spad(dst_addr, dst_stride, src_addr, cols, rows) \ - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(dst_stride) << 32) | (uint64_t)(dst_addr), ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (uint64_t)(src_addr), k_MVOUT_SPAD) +// #define gemmini_extended_mvout_spad(dst_addr, dst_stride, src_addr, cols, rows) \ +// ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(dst_stride) << 32) | (uint64_t)(dst_addr), ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (uint64_t)(src_addr), k_MVOUT_SPAD) -#define gemmini_mvout_spad(dst_addr, src_addr, cols, rows) \ - gemmini_extended_mvout_spad(dst_addr, 1, src_addr, cols, rows) +// #define gemmini_mvout_spad(dst_addr, src_addr, cols, rows) \ +// gemmini_extended_mvout_spad(dst_addr, 1, src_addr, cols, rows) int main() { - gemmini_mvout_spad(0xff000000, 0xff000100, 4, 4); + volatile uint64_t *bogus = (uint64_t *)0x00001000; - // volatile uint32_t *ptr_cmd = (volatile uint32_t *)0xff100000; - // *ptr_cmd = 0xdeadbeef; + gemmini_config_ld(0); + gemmini_config_st(0); + gemmini_extended_config_ex(WEIGHT_STATIONARY, 0, 0, 1, 0, 0); + + // bogus loop to give slack for MMIO to settle without fences + for (int i = 0; i < 10; i++) { + *bogus = 0xdeadbeef; + } + + // load up A and B and C + float *A = (float *)0xff000000; + float *B = (float *)0xff000100; + float *C = (float 
*)0xff000200; + float *D = (float *)0xff000300; + for (int i = 0; i < DIM; i++) { + for (int j = 0; j < DIM; j++) { + A[i * DIM + j] = 1.0f; + B[i * DIM + j] = 1.0f; + C[i * DIM + j] = 0.0f; + D[i * DIM + j] = 0.0f; + } + } + + for (int i = 0; i < 10; i++) { + *bogus = 0xdeadbeef; + } + + gemmini_extended_preload(B, C, DIM, DIM, DIM, DIM); + + for (int i = 0; i < 10; i++) { + *bogus = 0xdeadbeef; + } + + gemmini_extended_compute_preloaded(A, D, DIM, DIM, DIM, DIM); + + for (int i = 0; i < 10; i++) { + *bogus = 0xdeadbeef; + } + + // gemmini_extended_mvout(0xc0000000, 0xff000000, DIM, DIM); + gemmini_mvout_spad(0x00000000, 0x00000200/*C*/, DIM, DIM); + + for (int i = 0; i < 100; i++) { + *bogus = 0xdeadbeef; + } return 0; } From b5bfa7d4b9ce635bc0f89948dde85de9fc8b2f5f Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 1 Feb 2024 14:05:13 -0800 Subject: [PATCH 08/75] Fix bogus spad address --- tests/kernel/gemmini_mmio/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/kernel/gemmini_mmio/main.cpp b/tests/kernel/gemmini_mmio/main.cpp index ee65ad43..f988ae8e 100644 --- a/tests/kernel/gemmini_mmio/main.cpp +++ b/tests/kernel/gemmini_mmio/main.cpp @@ -24,7 +24,7 @@ // gemmini_extended_mvout_spad(dst_addr, 1, src_addr, cols, rows) int main() { - volatile uint64_t *bogus = (uint64_t *)0x00001000; + volatile uint32_t *bogus = (uint32_t *)0xff001ff0; gemmini_config_ld(0); gemmini_config_st(0); From ad8bf9b223e61cf9321d8533fdd6268a74b31436 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 7 Feb 2024 21:31:08 -0800 Subject: [PATCH 09/75] Add sgemm_wg C kernel --- tests/regression/sgemm_wg/.gitignore | 5 + tests/regression/sgemm_wg/Makefile | 9 ++ tests/regression/sgemm_wg/common.h | 12 ++ tests/regression/sgemm_wg/kernel.cpp | 18 +++ tests/regression/sgemm_wg/main.cpp | 210 +++++++++++++++++++++++++++ 5 files changed, 254 insertions(+) create mode 100644 tests/regression/sgemm_wg/.gitignore create mode 100644 
tests/regression/sgemm_wg/Makefile create mode 100644 tests/regression/sgemm_wg/common.h create mode 100644 tests/regression/sgemm_wg/kernel.cpp create mode 100644 tests/regression/sgemm_wg/main.cpp diff --git a/tests/regression/sgemm_wg/.gitignore b/tests/regression/sgemm_wg/.gitignore new file mode 100644 index 00000000..7c35ba59 --- /dev/null +++ b/tests/regression/sgemm_wg/.gitignore @@ -0,0 +1,5 @@ +*.bin +*.dump +*.elf +sgemm_wg +.depend diff --git a/tests/regression/sgemm_wg/Makefile b/tests/regression/sgemm_wg/Makefile new file mode 100644 index 00000000..6fbe650b --- /dev/null +++ b/tests/regression/sgemm_wg/Makefile @@ -0,0 +1,9 @@ +PROJECT = sgemm_wg + +SRCS = main.cpp + +VX_SRCS = kernel.cpp + +OPTS ?= -n256 + +include ../common.mk diff --git a/tests/regression/sgemm_wg/common.h b/tests/regression/sgemm_wg/common.h new file mode 100644 index 00000000..c150a28f --- /dev/null +++ b/tests/regression/sgemm_wg/common.h @@ -0,0 +1,12 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 + +typedef struct { + uint32_t num_points; + uint64_t src_addr; + uint64_t dst_addr; +} kernel_arg_t; + +#endif diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp new file mode 100644 index 00000000..da824888 --- /dev/null +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -0,0 +1,18 @@ +#include +#include +#include +#include "common.h" + +void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { + uint32_t num_points = arg->num_points; + float* src_ptr = (float*)arg->src_addr; + float* dst_ptr = (float*)arg->dst_addr; + + dst_ptr[task_id] = 2 * src_ptr[task_id]; +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg); + return 0; +} diff --git a/tests/regression/sgemm_wg/main.cpp b/tests/regression/sgemm_wg/main.cpp new file mode 100644 index 00000000..b52b6e6a --- /dev/null +++ 
b/tests/regression/sgemm_wg/main.cpp @@ -0,0 +1,210 @@ +#include +#include +#include +#include +#include +#include "common.h" + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +const char* kernel_file = "kernel.bin"; +uint32_t count = 0; + +std::vector src_data; +std::vector ref_data; + +vx_device_h device = nullptr; +std::vector staging_buf; +kernel_arg_t kernel_arg = {}; + +static void show_usage() { + std::cout << "Vortex Test." << std::endl; + std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl; +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:k:h?")) != -1) { + switch (c) { + case 'n': + count = atoi(optarg); + break; + case 'k': + kernel_file = optarg; + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } +} + +void cleanup() { + if (device) { + vx_mem_free(device, kernel_arg.src_addr); + vx_mem_free(device, kernel_arg.dst_addr); + vx_dev_close(device); + } +} + +void gen_input_data(uint32_t len) { + src_data.resize(len); + + for (uint32_t i = 0; i < len; ++i) { + src_data[i] = (float)i; + std::cout << i << ": value=" << src_data[i] << std::endl; + } +} + +void gen_ref_data(uint32_t num_points) { + ref_data.resize(num_points); + + for (uint32_t i = 0; i < num_points; ++i) { + float ref_value = 2 * src_data.at(i); + ref_data.at(i) = ref_value; + } +} + +int run_test(const kernel_arg_t& kernel_arg, + uint32_t buf_size, + uint32_t num_points) { + // start device + std::cout << "start device" << std::endl; + RT_CHECK(vx_start(device)); + + // wait for completion + std::cout << "wait for completion" << std::endl; + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + // download destination buffer + 
std::cout << "download destination buffer" << std::endl; + RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size)); + + // verify result + std::cout << "verify result" << std::endl; + { + int errors = 0; + auto buf_ptr = (float*)staging_buf.data(); + for (uint32_t i = 0; i < num_points; ++i) { + float ref = ref_data.at(i); + float cur = buf_ptr[i]; + if (cur != ref) { + std::cout << "error at result #" << std::dec << i + << std::hex << ": actual=" << cur << ", expected=" << ref << std::endl; + ++errors; + } + } + if (errors != 0) { + std::cout << "Found " << std::dec << errors << " errors!" << std::endl; + std::cout << "FAILED!" << std::endl; + return 1; + } + } + + return 0; +} + +int main(int argc, char *argv[]) { + // parse command arguments + parse_args(argc, argv); + + if (count == 0) { + count = 1; + } + + std::srand(50); + + // open device connection + std::cout << "open device connection" << std::endl; + RT_CHECK(vx_dev_open(&device)); + + uint32_t num_points = count; + + // generate input data + gen_input_data(num_points); + + // generate reference data + gen_ref_data(num_points); + + uint32_t src_buf_size = src_data.size() * sizeof(int32_t); + uint32_t dst_buf_size = ref_data.size() * sizeof(int32_t); + + std::cout << "number of points: " << num_points << std::endl; + std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl; + + // upload program + std::cout << "upload program" << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file)); + + // allocate device memory + std::cout << "allocate device memory" << std::endl; + RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr)); + RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr)); + + kernel_arg.num_points = num_points; + + std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl; + std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl; + + // allocate 
staging buffer + { + std::cout << "allocate staging buffer" << std::endl; + uint32_t staging_buf_size = std::max(src_buf_size, + std::max(dst_buf_size, + sizeof(kernel_arg_t))); + staging_buf.resize(staging_buf_size); + } + + // upload kernel argument + { + std::cout << "upload kernel argument" << std::endl; + auto buf_ptr = staging_buf.data(); + memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); + RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); + } + + // upload source buffer + { + std::cout << "upload source buffer" << std::endl; + auto buf_ptr = staging_buf.data(); + memcpy(buf_ptr, src_data.data(), num_points * sizeof(float)); + RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), src_buf_size)); + } + + // clear destination buffer + { + std::cout << "clear destination buffer" << std::endl; + auto buf_ptr = (int32_t*)staging_buf.data(); + for (uint32_t i = 0; i < num_points; ++i) { + buf_ptr[i] = 0xdeadbeef; + } + RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size)); + } + + // run tests + std::cout << "run tests" << std::endl; + RT_CHECK(run_test(kernel_arg, dst_buf_size, num_points)); + + // cleanup + std::cout << "cleanup" << std::endl; + cleanup(); + + std::cout << "PASSED!" 
<< std::endl; + + return 0; +} From 12bdab80430f9a96c35b5a3423daa3fb96c7e2d6 Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Thu, 8 Feb 2024 17:00:19 -0800 Subject: [PATCH 10/75] update gemmini matmul kernel --- tests/kernel/gemmini_mmio/main.cpp | 76 ++++++++++++++++++------------ 1 file changed, 47 insertions(+), 29 deletions(-) diff --git a/tests/kernel/gemmini_mmio/main.cpp b/tests/kernel/gemmini_mmio/main.cpp index f988ae8e..5bffa15f 100644 --- a/tests/kernel/gemmini_mmio/main.cpp +++ b/tests/kernel/gemmini_mmio/main.cpp @@ -8,13 +8,16 @@ // #define XCUSTOM_ACC 3 // #define k_MVOUT_SPAD 23 +#define pfence() { for (int i = 0; i < 10; i++) *((uint32_t *) 0xffff0000) = 0xdeadbeef; } + #define ROCC_INSTRUCTION_RS1_RS2(x, rs1, rs2, funct) { \ /* printf("function %d\n", funct); */ \ uint32_t instruction = (0x7B) | (0 << 7) | (3 << 12) | (1 << 15) | (2 << 20) | ((uint32_t) funct << 25); \ - *((volatile uint64_t*) 0xff002010) = (uint64_t) (rs1); \ - *((volatile uint64_t*) 0xff002018) = (uint64_t) (rs2); \ + *((volatile uint64_t*) 0xff100010) = (uint64_t) (rs1); \ + *((volatile uint64_t*) 0xff100018) = (uint64_t) (rs2); \ + pfence(); \ /* gemmini_fence(); */ \ - *((volatile uint32_t*) 0xff002000) = instruction; \ + *((volatile uint32_t*) 0xff100000) = instruction; \ } // #define gemmini_extended_mvout_spad(dst_addr, dst_stride, src_addr, cols, rows) \ @@ -24,52 +27,67 @@ // gemmini_extended_mvout_spad(dst_addr, 1, src_addr, cols, rows) int main() { - volatile uint32_t *bogus = (uint32_t *)0xff001ff0; + + char *print_buf = ((char *) 0xff005000); + sprintf(print_buf, "hello world\n"); gemmini_config_ld(0); gemmini_config_st(0); gemmini_extended_config_ex(WEIGHT_STATIONARY, 0, 0, 1, 0, 0); // bogus loop to give slack for MMIO to settle without fences - for (int i = 0; i < 10; i++) { - *bogus = 0xdeadbeef; - } // load up A and B and C - float *A = (float *)0xff000000; - float *B = (float *)0xff000100; - float *C = (float *)0xff000200; - float *D = (float *)0xff000300; + 
float *smem_A = (float *)0xff000000; // byte addressed + uint32_t spad_A = 0x00000000; + float *smem_B = (float *)0xff000040; + uint32_t spad_B = 0x00000004; // 16B word addressed + float *smem_C = (float *)0xff000080; + uint32_t acc_C = 0x80000000; + uint32_t spad_C = 0x00000008; + float *smem_D = (float *)0xff0000c0; + uint32_t spad_D = 0x0000000c; + for (int i = 0; i < DIM; i++) { for (int j = 0; j < DIM; j++) { - A[i * DIM + j] = 1.0f; - B[i * DIM + j] = 1.0f; - C[i * DIM + j] = 0.0f; - D[i * DIM + j] = 0.0f; + smem_A[i * DIM + j] = 1.0f; + smem_B[i * DIM + j] = 1.0f; + smem_C[i * DIM + j] = 0.0f; + smem_D[i * DIM + j] = 0.0f; } } - - for (int i = 0; i < 10; i++) { - *bogus = 0xdeadbeef; + pfence(); + sprintf(print_buf, "\nC before\n"); + for (int i = 0; i < DIM; i++) { + for (int j = 0; j < DIM; j++) { + sprintf(print_buf, "%d ", (int) (smem_C[i * DIM + j])); + } + sprintf(print_buf, "\n"); } - gemmini_extended_preload(B, C, DIM, DIM, DIM, DIM); + pfence(); - for (int i = 0; i < 10; i++) { - *bogus = 0xdeadbeef; - } + gemmini_extended_preload(spad_B, acc_C, DIM, DIM, DIM, DIM); - gemmini_extended_compute_preloaded(A, D, DIM, DIM, DIM, DIM); + pfence(); - for (int i = 0; i < 10; i++) { - *bogus = 0xdeadbeef; - } + gemmini_extended_compute_preloaded(spad_A, spad_D, DIM, DIM, DIM, DIM); + + pfence(); // gemmini_extended_mvout(0xc0000000, 0xff000000, DIM, DIM); - gemmini_mvout_spad(0x00000000, 0x00000200/*C*/, DIM, DIM); + gemmini_mvout_spad(spad_C, acc_C, DIM, DIM); - for (int i = 0; i < 100; i++) { - *bogus = 0xdeadbeef; + pfence(); + + + sprintf(print_buf, "\nC after\n"); + + for (int i = 0; i < DIM; i++) { + for (int j = 0; j < DIM; j++) { + sprintf(print_buf, "%d ", (int) (100 * smem_C[i * DIM + j])); + } + sprintf(print_buf, "\n"); } return 0; From 5a216ef9ca8978dc77725399f739abf3bf20b884 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 12 Feb 2024 20:46:50 -0800 Subject: [PATCH 11/75] flops: unroll by 16 --- tests/opencl/flops/kernel.cl | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/opencl/flops/kernel.cl b/tests/opencl/flops/kernel.cl index ab26c745..181e1171 100644 --- a/tests/opencl/flops/kernel.cl +++ b/tests/opencl/flops/kernel.cl @@ -5,7 +5,7 @@ __kernel void flops (__global volatile const float *src, int gid = get_global_id(0); float f = 0.0f; float incr = src[0]; - __attribute__((opencl_unroll_hint)) + __attribute__((opencl_unroll_hint(16))) for (int i = 0; i < 5000; i++) { f += incr; } From f586ab28df4dc47033ea0d07c64289aa83dac7ca Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 12 Feb 2024 20:47:36 -0800 Subject: [PATCH 12/75] vecadd: save operand to file --- tests/opencl/vecadd/main.cc | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/tests/opencl/vecadd/main.cc b/tests/opencl/vecadd/main.cc index 23aa49b4..9e03ac42 100644 --- a/tests/opencl/vecadd/main.cc +++ b/tests/opencl/vecadd/main.cc @@ -52,6 +52,27 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) return 0; } +static int write_operand_file(const char* filename, void* data, size_t size) { + if (nullptr == filename || nullptr == data || 0 == size) + return -1; + + FILE* fp = fopen(filename, "wb"); + if (NULL == fp) { + fprintf(stderr, "Failed to write operand data.\n"); + return -1; + } + + size_t wsize = fwrite(data, size, 1, fp); + if (wsize != 1) { + fprintf(stderr, "Failed to write operand data.\n"); + return -1; + } + + fclose(fp); + + return 0; +} + static bool almost_equal(float a, float b, int ulp = 4) { union fi_t { int i; float f; }; fi_t fa, fb; @@ -174,6 +195,12 @@ int main (int argc, char **argv) { //printf("*** [%d]: h_a=%f, h_b=%f\n", i, h_a[i], h_b[i]); } + // NOTE(hansung): Dump operand buffer to a file + if (write_operand_file("vecadd.input.a.size64.bin", h_a, nbytes) != 0) + return EXIT_FAILURE; + if (write_operand_file("vecadd.input.b.size64.bin", h_b, nbytes) != 0) + return EXIT_FAILURE; + // Creating command queue 
commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err)); @@ -183,8 +210,9 @@ int main (int argc, char **argv) { printf("Execute the kernel\n"); size_t global_work_size[1] = {size}; + size_t local_work_size[1] = {1}; auto time_start = std::chrono::high_resolution_clock::now(); - CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL)); CL_CHECK(clFinish(commandQueue)); auto time_end = std::chrono::high_resolution_clock::now(); double elapsed = std::chrono::duration_cast(time_end - time_start).count(); From 6a1a506b6489db2b6a09e0e92cd9d26e31f5c056 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 12 Feb 2024 20:48:23 -0800 Subject: [PATCH 13/75] sgemm_wg: save args and input bin --- tests/regression/sgemm_wg/Makefile | 2 +- tests/regression/sgemm_wg/common.h | 3 ++- tests/regression/sgemm_wg/kernel.cpp | 18 ++++++++++---- tests/regression/sgemm_wg/main.cpp | 36 +++++++++++++++++++++++----- 4 files changed, 46 insertions(+), 13 deletions(-) diff --git a/tests/regression/sgemm_wg/Makefile b/tests/regression/sgemm_wg/Makefile index 6fbe650b..f57f6124 100644 --- a/tests/regression/sgemm_wg/Makefile +++ b/tests/regression/sgemm_wg/Makefile @@ -4,6 +4,6 @@ SRCS = main.cpp VX_SRCS = kernel.cpp -OPTS ?= -n256 +OPTS ?= -n16 include ../common.mk diff --git a/tests/regression/sgemm_wg/common.h b/tests/regression/sgemm_wg/common.h index c150a28f..b82ea12f 100644 --- a/tests/regression/sgemm_wg/common.h +++ b/tests/regression/sgemm_wg/common.h @@ -1,7 +1,8 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 +#define KERNEL_ARG_DEV_MEM_ADDR 0x7fff0000 +#define DEV_SMEM_START_ADDR 0xff000000 typedef struct { uint32_t num_points; diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index da824888..574798f6 100644 --- 
a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -4,15 +4,23 @@ #include "common.h" void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { - uint32_t num_points = arg->num_points; - float* src_ptr = (float*)arg->src_addr; - float* dst_ptr = (float*)arg->dst_addr; + uint32_t num_points = arg->num_points; + float *src_ptr = (float *)arg->src_addr; + float *dst_ptr = (float *)arg->dst_addr; - dst_ptr[task_id] = 2 * src_ptr[task_id]; + float *local_a = (float *)DEV_SMEM_START_ADDR; + + local_a[num_points - 1 - task_id] = 2 * src_ptr[num_points - 1 - task_id]; + // local_a[task_id] = 2 * src_ptr[task_id]; + + vx_barrier(0, vx_num_warps()); + + dst_ptr[task_id] = local_a[task_id]; } int main() { kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; - vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg); + int threads_per_core = vx_num_warps() * vx_num_threads(); + vx_spawn_tasks(threads_per_core, (vx_spawn_tasks_cb)kernel_body, arg); return 0; } diff --git a/tests/regression/sgemm_wg/main.cpp b/tests/regression/sgemm_wg/main.cpp index b52b6e6a..f03a44c0 100644 --- a/tests/regression/sgemm_wg/main.cpp +++ b/tests/regression/sgemm_wg/main.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -66,15 +67,15 @@ void gen_input_data(uint32_t len) { src_data.resize(len); for (uint32_t i = 0; i < len; ++i) { - src_data[i] = (float)i; + src_data[i] = static_cast(i); std::cout << i << ": value=" << src_data[i] << std::endl; } } -void gen_ref_data(uint32_t num_points) { - ref_data.resize(num_points); +void gen_ref_data(uint32_t len) { + ref_data.resize(len); - for (uint32_t i = 0; i < num_points; ++i) { + for (uint32_t i = 0; i < len; ++i) { float ref_value = 2 * src_data.at(i); ref_data.at(i) = ref_value; } @@ -141,8 +142,8 @@ int main(int argc, char *argv[]) { // generate reference data gen_ref_data(num_points); - uint32_t src_buf_size = src_data.size() * sizeof(int32_t); - uint32_t dst_buf_size = 
ref_data.size() * sizeof(int32_t); + uint32_t src_buf_size = src_data.size() * sizeof(src_data[0]); + uint32_t dst_buf_size = ref_data.size() * sizeof(src_data[0]); std::cout << "number of points: " << num_points << std::endl; std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl; @@ -176,6 +177,18 @@ int main(int argc, char *argv[]) { auto buf_ptr = staging_buf.data(); memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); + + std::cout << "uploading argument buffer to device, device mem address=" + << std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec + << sizeof(kernel_arg_t) << " bytes\n"; + std::ofstream file("args.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open args.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(staging_buf.data()), + sizeof(kernel_arg_t)); + file.close(); } // upload source buffer @@ -184,6 +197,17 @@ int main(int argc, char *argv[]) { auto buf_ptr = staging_buf.data(); memcpy(buf_ptr, src_data.data(), num_points * sizeof(float)); RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), src_buf_size)); + + std::cout << "uploading source buffer to device, device mem address=" + << std::hex << kernel_arg.src_addr << ", size=" << std::dec + << src_buf_size << " bytes\n"; + std::ofstream file("input.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open args.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(buf_ptr), src_buf_size); + file.close(); } // clear destination buffer From 6b420aceb6e58afc9fd93e004fcc03cf0edb3e85 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 12 Feb 2024 22:22:28 -0800 Subject: [PATCH 14/75] sgemm_wg: write simple C=A*A matmul --- tests/regression/sgemm_wg/common.h | 8 ++-- tests/regression/sgemm_wg/kernel.cpp | 26 ++++++++--- 
tests/regression/sgemm_wg/main.cpp | 70 ++++++++++++++-------------- 3 files changed, 59 insertions(+), 45 deletions(-) diff --git a/tests/regression/sgemm_wg/common.h b/tests/regression/sgemm_wg/common.h index b82ea12f..ef1e85a8 100644 --- a/tests/regression/sgemm_wg/common.h +++ b/tests/regression/sgemm_wg/common.h @@ -1,13 +1,15 @@ #ifndef _COMMON_H_ #define _COMMON_H_ +#include + #define KERNEL_ARG_DEV_MEM_ADDR 0x7fff0000 #define DEV_SMEM_START_ADDR 0xff000000 typedef struct { - uint32_t num_points; - uint64_t src_addr; - uint64_t dst_addr; + uint32_t matrix_dim; + uint64_t addr_a; + uint64_t addr_c; } kernel_arg_t; #endif diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index 574798f6..ee09cf99 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -4,18 +4,30 @@ #include "common.h" void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { - uint32_t num_points = arg->num_points; - float *src_ptr = (float *)arg->src_addr; - float *dst_ptr = (float *)arg->dst_addr; + const float *global_a = (const float *)arg->addr_a; + float *global_c = (float *)arg->addr_c; - float *local_a = (float *)DEV_SMEM_START_ADDR; + // assumes NT == NW == matrix_dim + const uint32_t dim = arg->matrix_dim; + const uint32_t row = vx_warp_id(); + const uint32_t col = vx_thread_id(); - local_a[num_points - 1 - task_id] = 2 * src_ptr[num_points - 1 - task_id]; - // local_a[task_id] = 2 * src_ptr[task_id]; + float *local_c = (float *)DEV_SMEM_START_ADDR; + float *local_a = (float *)DEV_SMEM_START_ADDR + (dim * dim); + float *local_b = (float *)DEV_SMEM_START_ADDR + 2 * (dim * dim); + + local_a[dim * row + col] = global_a[dim * row + col]; + local_c[dim * row + col] = 0.0f; vx_barrier(0, vx_num_warps()); - dst_ptr[task_id] = local_a[task_id]; + for (uint32_t k = 0; k < dim; k++) { + local_c[dim * row + col] += local_a[dim * row + k] * local_a[dim * k + col]; + } + + vx_barrier(0, vx_num_warps()); + + 
global_c[dim * row + col] = local_c[dim * row + col]; } int main() { diff --git a/tests/regression/sgemm_wg/main.cpp b/tests/regression/sgemm_wg/main.cpp index f03a44c0..9252b7b0 100644 --- a/tests/regression/sgemm_wg/main.cpp +++ b/tests/regression/sgemm_wg/main.cpp @@ -57,8 +57,8 @@ static void parse_args(int argc, char **argv) { void cleanup() { if (device) { - vx_mem_free(device, kernel_arg.src_addr); - vx_mem_free(device, kernel_arg.dst_addr); + vx_mem_free(device, kernel_arg.addr_a); + vx_mem_free(device, kernel_arg.addr_c); vx_dev_close(device); } } @@ -69,20 +69,20 @@ void gen_input_data(uint32_t len) { for (uint32_t i = 0; i < len; ++i) { src_data[i] = static_cast(i); std::cout << i << ": value=" << src_data[i] << std::endl; - } + } } void gen_ref_data(uint32_t len) { ref_data.resize(len); for (uint32_t i = 0; i < len; ++i) { - float ref_value = 2 * src_data.at(i); + float ref_value = src_data.at(i); ref_data.at(i) = ref_value; } } int run_test(const kernel_arg_t& kernel_arg, - uint32_t buf_size, + uint32_t buf_size, uint32_t num_points) { // start device std::cout << "start device" << std::endl; @@ -94,10 +94,10 @@ int run_test(const kernel_arg_t& kernel_arg, // download destination buffer std::cout << "download destination buffer" << std::endl; - RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size)); + RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_c, buf_size)); // verify result - std::cout << "verify result" << std::endl; + std::cout << "verify result" << std::endl; { int errors = 0; auto buf_ptr = (float*)staging_buf.data(); @@ -113,14 +113,14 @@ int run_test(const kernel_arg_t& kernel_arg, if (errors != 0) { std::cout << "Found " << std::dec << errors << " errors!" << std::endl; std::cout << "FAILED!" 
<< std::endl; - return 1; + return 1; } } return 0; } -int main(int argc, char *argv[]) { +int main(int argc, char *argv[]) { // parse command arguments parse_args(argc, argv); @@ -131,47 +131,47 @@ int main(int argc, char *argv[]) { std::srand(50); // open device connection - std::cout << "open device connection" << std::endl; + std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - uint32_t num_points = count; + uint32_t matrix_size = count; // generate input data - gen_input_data(num_points); + gen_input_data(matrix_size); // generate reference data - gen_ref_data(num_points); + gen_ref_data(matrix_size); - uint32_t src_buf_size = src_data.size() * sizeof(src_data[0]); + uint32_t src_buf_size = src_data.size() * sizeof(src_data[0]); uint32_t dst_buf_size = ref_data.size() * sizeof(src_data[0]); - std::cout << "number of points: " << num_points << std::endl; + std::cout << "number of elements: " << matrix_size << std::endl; std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl; // upload program - std::cout << "upload program" << std::endl; + std::cout << "upload program" << std::endl; RT_CHECK(vx_upload_kernel_file(device, kernel_file)); // allocate device memory std::cout << "allocate device memory" << std::endl; - RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr)); - RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr)); + RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a)); + RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c)); - kernel_arg.num_points = num_points; + kernel_arg.matrix_dim = 4; // FIXME: hardcoded - std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl; - std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl; - - // allocate staging buffer + std::cout << "dev_src=0x" << std::hex << kernel_arg.addr_a << std::endl; + std::cout 
<< "dev_dst=0x" << std::hex << kernel_arg.addr_c << std::endl; + + // allocate staging buffer { - std::cout << "allocate staging buffer" << std::endl; + std::cout << "allocate staging buffer" << std::endl; uint32_t staging_buf_size = std::max(src_buf_size, - std::max(dst_buf_size, + std::max(dst_buf_size, sizeof(kernel_arg_t))); staging_buf.resize(staging_buf_size); } - - // upload kernel argument + + // upload kernel argument { std::cout << "upload kernel argument" << std::endl; auto buf_ptr = staging_buf.data(); @@ -195,11 +195,11 @@ int main(int argc, char *argv[]) { { std::cout << "upload source buffer" << std::endl; auto buf_ptr = staging_buf.data(); - memcpy(buf_ptr, src_data.data(), num_points * sizeof(float)); - RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), src_buf_size)); + memcpy(buf_ptr, src_data.data(), matrix_size * sizeof(float)); + RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(), src_buf_size)); std::cout << "uploading source buffer to device, device mem address=" - << std::hex << kernel_arg.src_addr << ", size=" << std::dec + << std::hex << kernel_arg.addr_a << ", size=" << std::dec << src_buf_size << " bytes\n"; std::ofstream file("input.bin", std::ios::binary | std::ios::out); if (!file) { @@ -214,18 +214,18 @@ int main(int argc, char *argv[]) { { std::cout << "clear destination buffer" << std::endl; auto buf_ptr = (int32_t*)staging_buf.data(); - for (uint32_t i = 0; i < num_points; ++i) { + for (uint32_t i = 0; i < matrix_size; ++i) { buf_ptr[i] = 0xdeadbeef; - } - RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size)); + } + RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_c, staging_buf.data(), dst_buf_size)); } // run tests std::cout << "run tests" << std::endl; - RT_CHECK(run_test(kernel_arg, dst_buf_size, num_points)); + RT_CHECK(run_test(kernel_arg, dst_buf_size, matrix_size)); // cleanup - std::cout << "cleanup" << std::endl; + std::cout << "cleanup" << 
std::endl; cleanup(); std::cout << "PASSED!" << std::endl; From 5f79e8a3f1fdbf6c78248094bbce3bb836c9d4c4 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 12 Feb 2024 22:29:38 -0800 Subject: [PATCH 15/75] sgemm_wg: reference matmul in cpu --- tests/regression/sgemm_wg/main.cpp | 37 ++++++++++++++++-------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/tests/regression/sgemm_wg/main.cpp b/tests/regression/sgemm_wg/main.cpp index 9252b7b0..d12216e4 100644 --- a/tests/regression/sgemm_wg/main.cpp +++ b/tests/regression/sgemm_wg/main.cpp @@ -63,27 +63,32 @@ void cleanup() { } } -void gen_input_data(uint32_t len) { - src_data.resize(len); +void generate_source_matrix(uint32_t dim) { + src_data.resize(dim * dim); - for (uint32_t i = 0; i < len; ++i) { + for (uint32_t i = 0; i < dim * dim; ++i) { src_data[i] = static_cast(i); std::cout << i << ": value=" << src_data[i] << std::endl; } } -void gen_ref_data(uint32_t len) { - ref_data.resize(len); +void generate_reference_matmul(uint32_t dim) { + ref_data.resize(dim * dim); - for (uint32_t i = 0; i < len; ++i) { - float ref_value = src_data.at(i); - ref_data.at(i) = ref_value; + for (uint32_t i = 0; i < dim; ++i) { + for (uint32_t j = 0; j < dim; ++j) { + float ref = 0.0f; + for (uint32_t k = 0; k < dim; ++k) { + ref += src_data[dim * i + k] * src_data[dim * k + j]; + } + ref_data.at(dim * i + j) = ref; + } } } int run_test(const kernel_arg_t& kernel_arg, uint32_t buf_size, - uint32_t num_points) { + uint32_t dim) { // start device std::cout << "start device" << std::endl; RT_CHECK(vx_start(device)); @@ -101,7 +106,7 @@ int run_test(const kernel_arg_t& kernel_arg, { int errors = 0; auto buf_ptr = (float*)staging_buf.data(); - for (uint32_t i = 0; i < num_points; ++i) { + for (uint32_t i = 0; i < dim * dim; ++i) { float ref = ref_data.at(i); float cur = buf_ptr[i]; if (cur != ref) { @@ -135,12 +140,10 @@ int main(int argc, char *argv[]) { RT_CHECK(vx_dev_open(&device)); uint32_t matrix_size = count; 
+ uint32_t matrix_dim = 4; // FIXME: hardcoded - // generate input data - gen_input_data(matrix_size); - - // generate reference data - gen_ref_data(matrix_size); + generate_source_matrix(matrix_dim); + generate_reference_matmul(matrix_dim); uint32_t src_buf_size = src_data.size() * sizeof(src_data[0]); uint32_t dst_buf_size = ref_data.size() * sizeof(src_data[0]); @@ -157,7 +160,7 @@ int main(int argc, char *argv[]) { RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a)); RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c)); - kernel_arg.matrix_dim = 4; // FIXME: hardcoded + kernel_arg.matrix_dim = matrix_dim; std::cout << "dev_src=0x" << std::hex << kernel_arg.addr_a << std::endl; std::cout << "dev_dst=0x" << std::hex << kernel_arg.addr_c << std::endl; @@ -222,7 +225,7 @@ int main(int argc, char *argv[]) { // run tests std::cout << "run tests" << std::endl; - RT_CHECK(run_test(kernel_arg, dst_buf_size, matrix_size)); + RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.matrix_dim)); // cleanup std::cout << "cleanup" << std::endl; From 301f1ca26097f932bb3b4a37b3fbc93b67d5452a Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 16 Feb 2024 16:20:45 -0800 Subject: [PATCH 16/75] sgemm_wg: Implement blocking over k-dimension --- tests/regression/sgemm_wg/common.h | 5 +- tests/regression/sgemm_wg/kernel.cpp | 51 ++++++++---- tests/regression/sgemm_wg/main.cpp | 120 +++++++++++++++++---------- 3 files changed, 113 insertions(+), 63 deletions(-) diff --git a/tests/regression/sgemm_wg/common.h b/tests/regression/sgemm_wg/common.h index ef1e85a8..74941562 100644 --- a/tests/regression/sgemm_wg/common.h +++ b/tests/regression/sgemm_wg/common.h @@ -7,8 +7,11 @@ #define DEV_SMEM_START_ADDR 0xff000000 typedef struct { - uint32_t matrix_dim; + uint32_t dim_m; + uint32_t dim_n; + uint32_t dim_k; uint64_t addr_a; + uint64_t addr_b; uint64_t addr_c; } kernel_arg_t; diff --git 
a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index ee09cf99..368d9270 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -5,34 +5,49 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { const float *global_a = (const float *)arg->addr_a; + const float *global_b = (const float *)arg->addr_b; float *global_c = (float *)arg->addr_c; // assumes NT == NW == matrix_dim - const uint32_t dim = arg->matrix_dim; - const uint32_t row = vx_warp_id(); - const uint32_t col = vx_thread_id(); + const uint32_t dim_m = arg->dim_m; + const uint32_t dim_n = arg->dim_n; + const uint32_t dim_k = arg->dim_k; + const uint32_t block_dim = vx_num_warps(); + const uint32_t local_row = vx_warp_id(); + const uint32_t local_col = vx_thread_id(); - float *local_c = (float *)DEV_SMEM_START_ADDR; - float *local_a = (float *)DEV_SMEM_START_ADDR + (dim * dim); - float *local_b = (float *)DEV_SMEM_START_ADDR + 2 * (dim * dim); + // each thread generates one output element + float reg_c = 0.0f; - local_a[dim * row + col] = global_a[dim * row + col]; - local_c[dim * row + col] = 0.0f; + for (uint32_t k = 0; k < dim_k; k += block_dim) { + float *local_a = (float *)DEV_SMEM_START_ADDR; + float *local_b = (float *)DEV_SMEM_START_ADDR + (block_dim * block_dim); - vx_barrier(0, vx_num_warps()); + // FIXME: assumes local block size is square shape + // TODO: "local_row" should be global_row + uint32_t offset_global_a = dim_k * local_row + (k + local_col); + uint32_t offset_global_b = dim_n * (local_row + k) + local_col; + local_a[block_dim * local_row + local_col] = global_a[offset_global_a]; + local_b[block_dim * local_row + local_col] = global_b[offset_global_b]; - for (uint32_t k = 0; k < dim; k++) { - local_c[dim * row + col] += local_a[dim * row + k] * local_a[dim * k + col]; + vx_barrier(0, vx_num_warps()); + vx_fence(); + + for (uint32_t local_k = 0; local_k < block_dim; local_k++) { + reg_c += local_a[block_dim 
* local_row + local_k] * + local_b[block_dim * local_k + local_col]; + } + + vx_barrier(0, vx_num_warps()); + vx_fence(); } - vx_barrier(0, vx_num_warps()); - - global_c[dim * row + col] = local_c[dim * row + col]; + global_c[dim_n * local_row + local_col] = reg_c; } int main() { - kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; - int threads_per_core = vx_num_warps() * vx_num_threads(); - vx_spawn_tasks(threads_per_core, (vx_spawn_tasks_cb)kernel_body, arg); - return 0; + kernel_arg_t *arg = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR; + int threads_per_core = vx_num_warps() * vx_num_threads(); + vx_spawn_tasks(threads_per_core, (vx_spawn_tasks_cb)kernel_body, arg); + return 0; } diff --git a/tests/regression/sgemm_wg/main.cpp b/tests/regression/sgemm_wg/main.cpp index d12216e4..a6babcb0 100644 --- a/tests/regression/sgemm_wg/main.cpp +++ b/tests/regression/sgemm_wg/main.cpp @@ -21,7 +21,8 @@ const char* kernel_file = "kernel.bin"; uint32_t count = 0; -std::vector src_data; +std::vector src_a_data; +std::vector src_b_data; std::vector ref_data; vx_device_h device = nullptr; @@ -58,37 +59,43 @@ static void parse_args(int argc, char **argv) { void cleanup() { if (device) { vx_mem_free(device, kernel_arg.addr_a); + vx_mem_free(device, kernel_arg.addr_b); vx_mem_free(device, kernel_arg.addr_c); vx_dev_close(device); } } -void generate_source_matrix(uint32_t dim) { - src_data.resize(dim * dim); +void generate_source_matrix(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) { + src_a_data.resize(dim_m * dim_k); + src_b_data.resize(dim_k * dim_n); - for (uint32_t i = 0; i < dim * dim; ++i) { - src_data[i] = static_cast(i); - std::cout << i << ": value=" << src_data[i] << std::endl; + for (uint32_t i = 0; i < src_a_data.size(); ++i) { + src_a_data[i] = static_cast(i); + std::cout << "A: " << i << ": value=" << src_a_data[i] << std::endl; + } + for (uint32_t i = 0; i < src_b_data.size(); ++i) { + src_b_data[i] = static_cast(i); + std::cout << "B: " << i << ": value=" 
<< src_b_data[i] << std::endl; } } -void generate_reference_matmul(uint32_t dim) { - ref_data.resize(dim * dim); +void generate_reference_matmul(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) { + ref_data.resize(dim_m * dim_n); - for (uint32_t i = 0; i < dim; ++i) { - for (uint32_t j = 0; j < dim; ++j) { + for (uint32_t i = 0; i < dim_m; ++i) { + for (uint32_t j = 0; j < dim_n; ++j) { float ref = 0.0f; - for (uint32_t k = 0; k < dim; ++k) { - ref += src_data[dim * i + k] * src_data[dim * k + j]; + for (uint32_t k = 0; k < dim_k; ++k) { + ref += src_a_data[dim_k * i + k] * src_b_data[dim_n * k + j]; } - ref_data.at(dim * i + j) = ref; + ref_data.at(dim_n * i + j) = ref; } } } int run_test(const kernel_arg_t& kernel_arg, uint32_t buf_size, - uint32_t dim) { + uint32_t dim_m, uint32_t dim_n) { // start device std::cout << "start device" << std::endl; RT_CHECK(vx_start(device)); @@ -106,7 +113,7 @@ int run_test(const kernel_arg_t& kernel_arg, { int errors = 0; auto buf_ptr = (float*)staging_buf.data(); - for (uint32_t i = 0; i < dim * dim; ++i) { + for (uint32_t i = 0; i < dim_m * dim_n; ++i) { float ref = ref_data.at(i); float cur = buf_ptr[i]; if (cur != ref) { @@ -139,16 +146,17 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - uint32_t matrix_size = count; - uint32_t matrix_dim = 4; // FIXME: hardcoded + uint32_t dim_m = 4; // FIXME: hardcoded + uint32_t dim_n = 4; // FIXME: hardcoded + uint32_t dim_k = 128; // FIXME: hardcoded - generate_source_matrix(matrix_dim); - generate_reference_matmul(matrix_dim); + generate_source_matrix(dim_m, dim_n, dim_k); + generate_reference_matmul(dim_m, dim_n, dim_k); - uint32_t src_buf_size = src_data.size() * sizeof(src_data[0]); - uint32_t dst_buf_size = ref_data.size() * sizeof(src_data[0]); + uint32_t src_a_buf_size = src_a_data.size() * sizeof(src_a_data[0]); + uint32_t src_b_buf_size = src_b_data.size() * sizeof(src_b_data[0]); + uint32_t dst_buf_size 
= ref_data.size() * sizeof(src_a_data[0]); - std::cout << "number of elements: " << matrix_size << std::endl; std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl; // upload program @@ -157,20 +165,26 @@ int main(int argc, char *argv[]) { // allocate device memory std::cout << "allocate device memory" << std::endl; - RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a)); + RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a)); + RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b)); RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c)); - kernel_arg.matrix_dim = matrix_dim; + kernel_arg.dim_m = dim_m; + kernel_arg.dim_n = dim_n; + kernel_arg.dim_k = dim_k; - std::cout << "dev_src=0x" << std::hex << kernel_arg.addr_a << std::endl; - std::cout << "dev_dst=0x" << std::hex << kernel_arg.addr_c << std::endl; + std::cout << "dev_addr_a=0x" << std::hex << kernel_arg.addr_a << std::endl; + std::cout << "dev_addr_b=0x" << std::hex << kernel_arg.addr_b << std::endl; + std::cout << "dev_addr_c=0x" << std::hex << kernel_arg.addr_c << std::endl; // allocate staging buffer { std::cout << "allocate staging buffer" << std::endl; - uint32_t staging_buf_size = std::max(src_buf_size, - std::max(dst_buf_size, - sizeof(kernel_arg_t))); + uint32_t staging_buf_size = std::max( + src_a_buf_size, + std::max( + src_b_buf_size, + std::max(dst_buf_size, sizeof(kernel_arg_t)))); staging_buf.resize(staging_buf_size); } @@ -196,28 +210,47 @@ int main(int argc, char *argv[]) { // upload source buffer { - std::cout << "upload source buffer" << std::endl; - auto buf_ptr = staging_buf.data(); - memcpy(buf_ptr, src_data.data(), matrix_size * sizeof(float)); - RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(), src_buf_size)); + { + auto buf_ptr = staging_buf.data(); + memcpy(buf_ptr, src_a_data.data(), src_a_data.size() * sizeof(float)); + 
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(), + src_a_buf_size)); - std::cout << "uploading source buffer to device, device mem address=" - << std::hex << kernel_arg.addr_a << ", size=" << std::dec - << src_buf_size << " bytes\n"; - std::ofstream file("input.bin", std::ios::binary | std::ios::out); - if (!file) { + std::cout << "uploading source A matrix to device, device mem address=" + << std::hex << kernel_arg.addr_a << ", size=" << std::dec + << src_a_buf_size << " bytes\n"; + std::ofstream file("input.a.bin", std::ios::binary | std::ios::out); + if (!file) { std::cerr << "error: failed to open args.bin for writing\n"; exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(buf_ptr), src_a_buf_size); + file.close(); + } + { + auto buf_ptr = staging_buf.data(); + memcpy(buf_ptr, src_b_data.data(), src_b_data.size() * sizeof(float)); + RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_b, staging_buf.data(), + src_b_buf_size)); + + std::cout << "uploading source B matrix to device, device mem address=" + << std::hex << kernel_arg.addr_b << ", size=" << std::dec + << src_b_buf_size << " bytes\n"; + std::ofstream file("input.b.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open args.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(buf_ptr), src_b_buf_size); + file.close(); } - file.write(reinterpret_cast(buf_ptr), src_buf_size); - file.close(); } // clear destination buffer { std::cout << "clear destination buffer" << std::endl; auto buf_ptr = (int32_t*)staging_buf.data(); - for (uint32_t i = 0; i < matrix_size; ++i) { + for (uint32_t i = 0; i < ref_data.size(); ++i) { buf_ptr[i] = 0xdeadbeef; } RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_c, staging_buf.data(), dst_buf_size)); @@ -225,13 +258,12 @@ int main(int argc, char *argv[]) { // run tests std::cout << "run tests" << std::endl; - RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.matrix_dim)); + 
RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.dim_m, kernel_arg.dim_n)); + std::cout << "PASSED!" << std::endl; // cleanup std::cout << "cleanup" << std::endl; cleanup(); - std::cout << "PASSED!" << std::endl; - return 0; } From d2da0d339494cad4a1fea7c8b14f2b263aba4872 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sat, 17 Feb 2024 18:05:59 -0800 Subject: [PATCH 17/75] sgemm_wg: Parameterize threadblock dimensions --- tests/regression/sgemm_wg/kernel.cpp | 71 ++++++++++++++++++++-------- tests/regression/sgemm_wg/main.cpp | 9 ++-- 2 files changed, 56 insertions(+), 24 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index 368d9270..ec207821 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -3,7 +3,13 @@ #include #include "common.h" -void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { +inline void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, + const uint32_t tid_in_threadblock_x, + const uint32_t tid_in_threadblock_y, + const uint32_t threadblock_dim_x, + const uint32_t threadblock_dim_y, + const uint32_t threadblock_id_x, + const uint32_t threadblock_id_y) { const float *global_a = (const float *)arg->addr_a; const float *global_b = (const float *)arg->addr_b; float *global_c = (float *)arg->addr_c; @@ -12,42 +18,67 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { const uint32_t dim_m = arg->dim_m; const uint32_t dim_n = arg->dim_n; const uint32_t dim_k = arg->dim_k; - const uint32_t block_dim = vx_num_warps(); - const uint32_t local_row = vx_warp_id(); - const uint32_t local_col = vx_thread_id(); + + // FIXME: assumes local block size is square shape + const uint32_t local_row = tid_in_threadblock_y; + const uint32_t local_col = tid_in_threadblock_x; + const uint32_t global_row = threadblock_id_y * threadblock_dim_y + local_row; + const uint32_t global_col = threadblock_id_x * threadblock_dim_x + local_col; // each thread 
generates one output element float reg_c = 0.0f; - for (uint32_t k = 0; k < dim_k; k += block_dim) { + for (uint32_t k = 0; k < dim_k; k += threadblock_dim_x) { float *local_a = (float *)DEV_SMEM_START_ADDR; - float *local_b = (float *)DEV_SMEM_START_ADDR + (block_dim * block_dim); + size_t local_a_elems = threadblock_dim_x * threadblock_dim_y; + float *local_b = (float *)DEV_SMEM_START_ADDR + local_a_elems; - // FIXME: assumes local block size is square shape - // TODO: "local_row" should be global_row - uint32_t offset_global_a = dim_k * local_row + (k + local_col); - uint32_t offset_global_b = dim_n * (local_row + k) + local_col; - local_a[block_dim * local_row + local_col] = global_a[offset_global_a]; - local_b[block_dim * local_row + local_col] = global_b[offset_global_b]; + uint32_t offset_global_a = dim_k * global_row + (k + local_col); + uint32_t offset_global_b = dim_n * (local_row + k) + global_col; + // local_a: threadblock_dim_y rows, threadblock_dim_x cols + // local_b: threadblock_dim_x rows, threadblock_dim_y cols + // threadblock_dim_x == block_k, threadblock_dim_y == block_m == block_n + local_a[threadblock_dim_x * local_row + local_col] = global_a[offset_global_a]; + local_b[threadblock_dim_y * local_col + local_row] = global_b[offset_global_b]; - vx_barrier(0, vx_num_warps()); + vx_barrier(0, threadblock_dim_y); vx_fence(); - for (uint32_t local_k = 0; local_k < block_dim; local_k++) { - reg_c += local_a[block_dim * local_row + local_k] * - local_b[block_dim * local_k + local_col]; + for (uint32_t local_k = 0; local_k < threadblock_dim_x; local_k++) { + reg_c += local_a[threadblock_dim_x * local_row + local_k] * + local_b[threadblock_dim_y * local_col + local_k]; } - vx_barrier(0, vx_num_warps()); + vx_barrier(0, threadblock_dim_y); vx_fence(); } - global_c[dim_n * local_row + local_col] = reg_c; + global_c[dim_n * global_row + global_col] = reg_c; +} + +void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { + const uint32_t dim_n = 
arg->dim_n; + int tid_x = task_id % dim_n; + int tid_y = task_id / dim_n; + + const uint32_t threadblock_dim_x = vx_num_threads(); + const uint32_t threadblock_dim_y = vx_num_warps(); + const uint32_t threads_per_threadblock = threadblock_dim_x * threadblock_dim_y; + const int threadblock_id = task_id / threads_per_threadblock; + + const uint32_t dim_n_in_blocks = dim_n / threadblock_dim_x; + const int threadblock_id_x = threadblock_id % dim_n_in_blocks; + const int threadblock_id_y = threadblock_id / dim_n_in_blocks; + + const int tid_in_threadblock_x = vx_thread_id(); + const int tid_in_threadblock_y = vx_warp_id() % threadblock_dim_y; + thread_block_gemm(arg, tid_in_threadblock_x, tid_in_threadblock_y, threadblock_dim_x, + threadblock_dim_y, threadblock_id_x, threadblock_id_y); } int main() { kernel_arg_t *arg = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR; - int threads_per_core = vx_num_warps() * vx_num_threads(); - vx_spawn_tasks(threads_per_core, (vx_spawn_tasks_cb)kernel_body, arg); + const uint32_t grid_size = arg->dim_m * arg->dim_n; + vx_spawn_tasks(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); return 0; } diff --git a/tests/regression/sgemm_wg/main.cpp b/tests/regression/sgemm_wg/main.cpp index a6babcb0..c6252991 100644 --- a/tests/regression/sgemm_wg/main.cpp +++ b/tests/regression/sgemm_wg/main.cpp @@ -116,7 +116,7 @@ int run_test(const kernel_arg_t& kernel_arg, for (uint32_t i = 0; i < dim_m * dim_n; ++i) { float ref = ref_data.at(i); float cur = buf_ptr[i]; - if (cur != ref) { + if (std::abs((cur - ref) / ref) > 1e-6) { std::cout << "error at result #" << std::dec << i << std::hex << ": actual=" << cur << ", expected=" << ref << std::endl; ++errors; @@ -146,9 +146,10 @@ int main(int argc, char *argv[]) { std::cout << "open device connection" << std::endl; RT_CHECK(vx_dev_open(&device)); - uint32_t dim_m = 4; // FIXME: hardcoded - uint32_t dim_n = 4; // FIXME: hardcoded - uint32_t dim_k = 128; // FIXME: hardcoded + // FIXME: hardcoded + uint32_t dim_m 
= 16; + uint32_t dim_n = 16; + uint32_t dim_k = 32; generate_source_matrix(dim_m, dim_n, dim_k); generate_reference_matmul(dim_m, dim_n, dim_k); From be7d87c82ddede45d06901b7ede3fd46b04bd3cd Mon Sep 17 00:00:00 2001 From: Sungwoong Ha Date: Thu, 22 Feb 2024 16:31:42 -0800 Subject: [PATCH 18/75] temp --- ci/toolchain_env.sh | 4 ++++ hw/rtl/core/VX_core.sv | 31 +++++++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/ci/toolchain_env.sh b/ci/toolchain_env.sh index 440a899e..3d4e2d41 100644 --- a/ci/toolchain_env.sh +++ b/ci/toolchain_env.sh @@ -24,3 +24,7 @@ export PATH=$SV2V_PATH/bin:$PATH export YOSYS_PATH=$TOOLDIR/yosys export PATH=$YOSYS_PATH/bin:$PATH + +export LLVM_VORTEX=$TOOLDIR/llvm-vortex +export POCL_CC_PATH=$TOOLDIR/pocl/compiler +export POCL_RT_PATH=$TOOLDIR/pocl/runtime \ No newline at end of file diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index dde085a8..e5e57d99 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -45,7 +45,7 @@ module VX_core import VX_gpu_pkg::*; #( output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value, // Status - output wire busy + output wire busy // stays 1 while busy, 0 when done; termination is detected on the negative edge ); VX_schedule_if schedule_if(); VX_fetch_if fetch_if(); @@ -258,7 +258,7 @@ module VX_core import VX_gpu_pkg::*; #( `endif -`ifdef PERF_ENABLE +`ifdef PERF_ENABLE // expose these perf counters to the console using $display with $time; flag: --perf=0? 
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle; wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle; @@ -333,6 +333,33 @@ module VX_core import VX_gpu_pkg::*; #( assign pipeline_perf_if.ifetch_latency = perf_icache_lat; assign pipeline_perf_if.load_latency = perf_dcache_lat; + + always @(negedge busy) begin + if (!reset) begin + $display("time : %t", $time); + $display("perf_dcache_rd_req_per_cycle: %h", perf_dcache_rd_req_per_cycle); + $display("perf_dcache_wr_req_per_cycle: %h", perf_dcache_wr_req_per_cycle); + $display("perf_dcache_rsp_per_cycle: %h", perf_dcache_rsp_per_cycle); + $display("perf_icache_pending_read_cycle: %h", perf_icache_pending_read_cycle); + $display("perf_dcache_pending_read_cycle: %h", perf_dcache_pending_read_cycle); + $display("perf_icache_pending_reads: %h", perf_icache_pending_reads); + $display("perf_dcache_pending_reads: %h", perf_dcache_pending_reads); + $display("perf_ifetches: %h", perf_ifetches); + $display("perf_loads: %h", perf_loads); + $display("perf_stores: %h", perf_stores); + $display("perf_icache_req_fire: %b", perf_icache_req_fire); + $display("perf_icache_rsp_fire: %b", perf_icache_rsp_fire); + $display("perf_dcache_rd_req_fire: %b", perf_dcache_rd_req_fire); + $display("perf_dcache_rd_req_fire_r: %b", perf_dcache_rd_req_fire_r); + $display("perf_dcache_wr_req_fire: %b", perf_dcache_wr_req_fire); + $display("perf_dcache_wr_req_fire_r: %b", perf_dcache_wr_req_fire_r); + $display("perf_dcache_rsp_fire: %b", perf_dcache_rsp_fire); + $display("scheduler idle: %d", pipeline_perf_if.sched_idles[31:0]); + $display("Instruction: %d",commit_csr_if.instret[31:0]); + $display("Cycle: %d",sched_csr_if.cycles); + end + end + `endif endmodule From 914864206af76214c6bf77d2d4869b710336d323 Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Sat, 24 Feb 2024 00:27:16 -0800 Subject: [PATCH 19/75] MMIO gemmini matmul kernel --- tests/kernel/gemmini_mmio/Makefile | 4 +- tests/kernel/gemmini_mmio/gemmini_mmio.h 
| 120 ++++++++++++++++++++ tests/kernel/gemmini_mmio/main.cpp | 138 +++++++++++++---------- 3 files changed, 199 insertions(+), 63 deletions(-) create mode 100644 tests/kernel/gemmini_mmio/gemmini_mmio.h diff --git a/tests/kernel/gemmini_mmio/Makefile b/tests/kernel/gemmini_mmio/Makefile index 0591bf72..03059e47 100644 --- a/tests/kernel/gemmini_mmio/Makefile +++ b/tests/kernel/gemmini_mmio/Makefile @@ -12,6 +12,8 @@ RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf VORTEX_KN_PATH ?= $(realpath ../../../kernel) +GEMMINI_SW_PATH ?= $(realpath /scratch/yrh/chipyard/generators/gemmini/software/gemmini-rocc-tests) + CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump @@ -20,7 +22,7 @@ CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy SIM_DIR = ../../../sim CFLAGS += -O3 -v -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections -CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw +CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw -I$(GEMMINI_SW_PATH) LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a diff --git a/tests/kernel/gemmini_mmio/gemmini_mmio.h b/tests/kernel/gemmini_mmio/gemmini_mmio.h new file mode 100644 index 00000000..e09b0489 --- /dev/null +++ b/tests/kernel/gemmini_mmio/gemmini_mmio.h @@ -0,0 +1,120 @@ +#ifndef GEMMINI_MMIO_H +#define GEMMINI_MMIO_H +#ifndef GEMMINI_PARAMS_H + #error INCLUDE GEMMINI.H FIRST +#endif + +#define SMEM_BASE 0xff000000 +#define SMEM_SIZE 0x4000 +#define SMEM_MASK (SMEM_SIZE - 1) +#define SMEM_ADDR_END 0xff008000 + +#define SPAD_BASE 0x0 +#define SPAD_ROW_SIZE (DIM * sizeof(elem_t)) +#define SPAD_NUM_ROWS (SMEM_SIZE / SPAD_ROW_SIZE) +#define SPAD_MASK (SPAD_NUM_ROWS - 1) + +#define SMEM_GARBAGE_ADDR 0xffff0000 +#define PRINT_BUF SMEM_ADDR_END +#define 
GEMMINI_RS1_ADDR 0xff007010 +#define GEMMINI_RS2_ADDR 0xff007018 +#define GEMMINI_INST_ADDR 0xff007000 + +#define SMEM_TO_SPAD(smem_addr) (SPAD_BASE + ((smem_addr) & SMEM_MASK) / SPAD_ROW_SIZE) +#define SPAD_TO_SMEM(spad_addr) (SMEM_BASE + ((spad_addr) & SPAD_MASK) * SPAD_ROW_SIZE) + +// convert normal matrix i,j into tiled smem offset +// top_in_tiles = i / DIM +// left_in_tiles = j / DIM +// num_tiles_before_current = top_in_tiles * (J / DIM) + left_in_tiles +// smem_addr = num_tiles_before_current * DIM * DIM + (i % DIM) * DIM + (j % DIM) +#define SMEM_MAT_OFFSET(i, j, J) \ + (((i) / DIM * (J) / DIM + (j) / DIM) * DIM * DIM + ((i) % DIM) * DIM + ((j) % DIM)) + +#define pfence() { for (int i = 0; i < 5; i++) *((volatile uint32_t *) SMEM_GARBAGE_ADDR) = 0xdeadbeef; } + +#define ROCC_INSTRUCTION_RS1_RS2(x, rs1, rs2, funct) { \ + /* printf("function %d\n", funct); */ \ + uint32_t instruction = (0x7B) | (0 << 7) | (3 << 12) | (1 << 15) | (2 << 20) | ((uint32_t) (funct) << 25); \ + *((volatile uint64_t*) GEMMINI_RS1_ADDR) = (uint64_t) (rs1); \ + *((volatile uint64_t*) GEMMINI_RS2_ADDR) = (uint64_t) (rs2); \ + /* *((volatile uint32_t*) GEMMINI_RS2_ADDR) = (uint32_t) ((uint64_t) (rs2) & 0xFFFFFFFFULL); */ \ + /* *((volatile uint32_t*) (GEMMINI_RS2_ADDR + 4)) = (uint32_t) ((uint64_t) (rs2) >> 32); */ \ + pfence(); \ + /* gemmini_fence(); */ \ + *((volatile uint32_t*) GEMMINI_INST_ADDR) = instruction; \ + /* sprintf((char *) PRINT_BUF, "%llx %llx %d\n", rs1, rs2, funct); */ \ +} + +static void sp_tiled_matmul_full_spad_ws(const uint32_t A_sp_addr_start, const uint32_t B_sp_addr_start, + const uint32_t D_sp_addr_start, const uint32_t C_dst_sp_addr_start, + size_t I, size_t J, size_t K, size_t pad_I, size_t pad_J, size_t pad_K, + bool a_transpose, bool b_transpose, + bool full_C, bool low_D, + bool no_bias, bool repeating_bias, + int act) { + + // const uint32_t A_sp_addr_start = 0; + // const uint32_t B_sp_addr_start = BANK_NUM * BANK_ROWS - K * J * DIM; + // const 
uint32_t D_sp_addr_start = 1 << (ADDR_LEN-1); + const uint32_t C_sp_addr_start = 3 << (ADDR_LEN-2) | (full_C << (ADDR_LEN-3)); + // const int D_blocks = low_D ? (J <= MAX_BLOCK_LEN ? J : MAX_BLOCK_LEN) : + // (J <= MAX_BLOCK_LEN_ACC ? J : MAX_BLOCK_LEN_ACC); + const int C_blocks = 1; //full_C ? 1 : (J <= MAX_BLOCK_LEN ? J : MAX_BLOCK_LEN); + // const size_t sizeof_D = low_D ? sizeof(elem_t) : sizeof(acc_t); + const size_t sizeof_C = full_C ? sizeof(acc_t) : sizeof(elem_t); + + for (size_t k = 0; k < K; k++) { + for (size_t j = 0; j < J; j++) { + for (size_t i = 0; i < I; i++) { + const uint32_t A_sp_addr = a_transpose ? (A_sp_addr_start + (k*I + i)*DIM) : + (A_sp_addr_start + (i*K + k)*DIM); + const uint32_t B_sp_addr = b_transpose ? (B_sp_addr_start + (j*K + k)*DIM) : + (B_sp_addr_start + (k*J + j)*DIM); + const uint32_t C_sp_addr = C_sp_addr_start + (i*J + j)*DIM; + // Compute + { + uint32_t pre_sp_addr = i == 0 ? B_sp_addr : GARBAGE_ADDR; + uint32_t out_sp_addr = C_sp_addr; + // If we're not using a bias, then we want to overwrite what's in the + // accumulator, rather than accumulating onto it + int no_bias_new_matrix = (k == 0); // no_bias && D != NULL && k == 0; + if (no_bias_new_matrix) { + out_sp_addr &= ~(1 << (ADDR_LEN-2)); + } + const size_t A_cols = DIM; // - (k == K - 1 ? pad_K : 0); + const size_t A_rows = DIM; // - (i == I - 1 ? pad_I : 0); + const size_t B_cols = DIM; // - (j == J - 1 ? pad_J : 0); + const size_t B_rows = DIM; // - (k == K - 1 ? pad_K : 0); + const size_t C_cols = DIM; // - (j == J - 1 ? pad_J : 0); + const size_t C_rows = DIM; // - (i == I - 1 ? 
pad_I : 0); + gemmini_extended_preload(pre_sp_addr, out_sp_addr, B_cols, B_rows, DIM, DIM); + if (i == 0) { // First iteration + gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, A_cols, A_rows, DIM, DIM); + } else { // All other iterations + gemmini_extended_compute_accumulated(A_sp_addr, GARBAGE_ADDR, A_cols, A_rows, DIM, DIM); + } + } + if (k == K - 1) { + // Move-out C (if not normalizing) + // if (((act != LAYERNORM) && (act != SOFTMAX)) && (j == J-1 || j % C_blocks == C_blocks-1)) { + const size_t rounded_j = (j / C_blocks) * C_blocks; + const uint32_t rounded_C_sp_addr = C_sp_addr_start + (i*J + rounded_j)*DIM; + + uint32_t C_dst_sp_addr = ((uint32_t) C_dst_sp_addr_start) + (i * J + rounded_j) * DIM; // * DIM * sizeof_C; + + const size_t blocks = rounded_j + C_blocks <= J ? C_blocks : J-rounded_j; + const size_t cols = DIM; // blocks * DIM - (rounded_j + blocks >= J ? pad_J : 0); + const size_t rows = DIM; // DIM - (i == I - 1 ? pad_I : 0); + + gemmini_extended_mvout_spad(C_dst_sp_addr, 1, rounded_C_sp_addr, cols, rows); + // } + } + } + } + } + pfence(); +} + + +#endif diff --git a/tests/kernel/gemmini_mmio/main.cpp b/tests/kernel/gemmini_mmio/main.cpp index 5bffa15f..8ef76e96 100644 --- a/tests/kernel/gemmini_mmio/main.cpp +++ b/tests/kernel/gemmini_mmio/main.cpp @@ -3,92 +3,106 @@ #include #include #include - -// #define ADDR_LEN 32 -// #define XCUSTOM_ACC 3 -// #define k_MVOUT_SPAD 23 - -#define pfence() { for (int i = 0; i < 10; i++) *((uint32_t *) 0xffff0000) = 0xdeadbeef; } - -#define ROCC_INSTRUCTION_RS1_RS2(x, rs1, rs2, funct) { \ - /* printf("function %d\n", funct); */ \ - uint32_t instruction = (0x7B) | (0 << 7) | (3 << 12) | (1 << 15) | (2 << 20) | ((uint32_t) funct << 25); \ - *((volatile uint64_t*) 0xff100010) = (uint64_t) (rs1); \ - *((volatile uint64_t*) 0xff100018) = (uint64_t) (rs2); \ - pfence(); \ - /* gemmini_fence(); */ \ - *((volatile uint32_t*) 0xff100000) = instruction; \ -} - -// #define 
gemmini_extended_mvout_spad(dst_addr, dst_stride, src_addr, cols, rows) \ -// ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(dst_stride) << 32) | (uint64_t)(dst_addr), ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (uint64_t)(src_addr), k_MVOUT_SPAD) - -// #define gemmini_mvout_spad(dst_addr, src_addr, cols, rows) \ -// gemmini_extended_mvout_spad(dst_addr, 1, src_addr, cols, rows) +#include "gemmini_mmio.h" int main() { - char *print_buf = ((char *) 0xff005000); - sprintf(print_buf, "hello world\n"); + char *print_buf = (char *) PRINT_BUF; + + sprintf(print_buf, "\n%d\n", DIM); gemmini_config_ld(0); - gemmini_config_st(0); gemmini_extended_config_ex(WEIGHT_STATIONARY, 0, 0, 1, 0, 0); - // bogus loop to give slack for MMIO to settle without fences - // load up A and B and C - float *smem_A = (float *)0xff000000; // byte addressed uint32_t spad_A = 0x00000000; - float *smem_B = (float *)0xff000040; - uint32_t spad_B = 0x00000004; // 16B word addressed - float *smem_C = (float *)0xff000080; - uint32_t acc_C = 0x80000000; - uint32_t spad_C = 0x00000008; - float *smem_D = (float *)0xff0000c0; - uint32_t spad_D = 0x0000000c; + uint32_t spad_B = 0x00000100; // 16B word addressed + uint32_t acc_C = 0x80000000; // accmem + accumulate + uint32_t spad_C = 0x00000200; - for (int i = 0; i < DIM; i++) { - for (int j = 0; j < DIM; j++) { - smem_A[i * DIM + j] = 1.0f; - smem_B[i * DIM + j] = 1.0f; - smem_C[i * DIM + j] = 0.0f; - smem_D[i * DIM + j] = 0.0f; + float *smem_A = (float *) SPAD_TO_SMEM(spad_A); // 0xff000000; // byte addressed + float *smem_B = (float *) SPAD_TO_SMEM(spad_B); // 0xff000200; + float *smem_C = (float *) SPAD_TO_SMEM(spad_C); // 0xff000400; + + int I = 5; + int J = 5; + int K = 5; + + gemmini_config_st(DIM * 4 * J) + + // load A with 128->1 in row-major order + for (int i = 0; i < I; i++) { + for (int k = 0; k < K; k++) { + int tile_byte_offset = (i * K + k) * DIM * DIM; + for (int x = 0; x < DIM; x++) + for (int y = 0; y < 
DIM; y++) + smem_A[tile_byte_offset + x * DIM + y] = (float) ((I * K * DIM * DIM - ((i * DIM + x) * DIM * K + (k * DIM + y))) % 64); } } - pfence(); - sprintf(print_buf, "\nC before\n"); - for (int i = 0; i < DIM; i++) { - for (int j = 0; j < DIM; j++) { - sprintf(print_buf, "%d ", (int) (smem_C[i * DIM + j])); + + // load B with 0->191 in row-major order + for (int k = 0; k < K; k++) { + for (int j = 0; j < J; j++) { + int tile_byte_offset = (k * J + j) * DIM * DIM; + for (int x = 0; x < DIM; x++) + for (int y = 0; y < DIM; y++) + smem_B[tile_byte_offset + x * DIM + y] = (float) (((k * DIM + x) * DIM * J + (j * DIM + y)) % 64); } - sprintf(print_buf, "\n"); } - pfence(); - - gemmini_extended_preload(spad_B, acc_C, DIM, DIM, DIM, DIM); + for (int i = 0; i < I * J * DIM * DIM; i++) smem_C[i] = 0.f; pfence(); - gemmini_extended_compute_preloaded(spad_A, spad_D, DIM, DIM, DIM, DIM); + // sprintf(print_buf, "\nA in\n"); + // for (int i = I * DIM - 1; i < I * DIM; i++) { + // for (int j = 0; j < K * DIM; j++) { + // sprintf(print_buf, "%d ", (int) (smem_A[SMEM_MAT_OFFSET(i, j, K * DIM)])); + // } + // sprintf(print_buf, "\n"); + // } - pfence(); + // sprintf(print_buf, "\nB in\n"); + // for (int i = 0; i < K * DIM; i++) { + // for (int j = 0; j < J * DIM; j++) { + // sprintf(print_buf, "%d ", (int) (smem_B[SMEM_MAT_OFFSET(i, j, J * DIM)])); + // } + // sprintf(print_buf, "\n"); + // if (i == 2) i = K * DIM - 3; + // } + // gemmini_extended_preload(spad_B, acc_C, DIM, DIM, DIM, DIM); + // gemmini_extended_compute_preloaded(spad_A, GARBAGE_ADDR, DIM, DIM, DIM, DIM); // gemmini_extended_mvout(0xc0000000, 0xff000000, DIM, DIM); - gemmini_mvout_spad(spad_C, acc_C, DIM, DIM); - - pfence(); - - - sprintf(print_buf, "\nC after\n"); + // gemmini_extended_mvout_spad(spad_C, 1, acc_C, DIM, DIM); - for (int i = 0; i < DIM; i++) { - for (int j = 0; j < DIM; j++) { - sprintf(print_buf, "%d ", (int) (100 * smem_C[i * DIM + j])); + sp_tiled_matmul_full_spad_ws(spad_A, spad_B, 
/*spad_D=*/0, spad_C, + /*I=*/I, /*J=*/J, /*K=*/K, /*pad_I=*/0, /*pad_J=*/0, /*pad_K=*/0, + /*a_transpose=*/0, /*b_transpose=*/0, /*full_C=*/0, /*low_D=*/0, + /*no_bias=*/1, /*repeating_bias=*/0, /*act=*/NO_ACTIVATION); + + for (int i = 0; i < 32; i++) pfence(); + + // check results + for (int i = 0; i < I * DIM; i++) { + for (int j = 0; j < J * DIM; j++) { + int sum = 0; + for (int k = 0; k < K * DIM; k++) sum += ((I * K * DIM * DIM - i * K * DIM - k) % 64) * ((k * J * DIM + j) % 64); + if ((int) (smem_C[SMEM_MAT_OFFSET(i, j, J * DIM)] * 10) != (int) (sum * 10)) { + sprintf(print_buf, "TEST FAILED (actual/reference)\n"); + for (int ii = 0; ii < I * DIM; ii++) { + for (int jj = 0; jj < J * DIM; jj++) { + sum = 0; + for (int k = 0; k < K * DIM; k++) sum += ((I * K * DIM * DIM - ii * K * DIM - k) % 64) * ((k * J * DIM + jj) % 64); + sprintf(print_buf, "%d/%d ", (int) (smem_C[SMEM_MAT_OFFSET(ii, jj, J * DIM)]), (int) sum); + } + sprintf(print_buf, "\n"); + } + return 1; + } } - sprintf(print_buf, "\n"); } + sprintf(print_buf, "TEST PASSED\n"); return 0; } From f1e7407d3a689d943316af08c5eff28f345a7e8f Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 27 Feb 2024 15:44:04 -0800 Subject: [PATCH 20/75] sgemm_wg: Run multiple threadblock per core --- tests/regression/sgemm_wg/kernel.cpp | 34 +++++++++++++++++++++------- tests/regression/sgemm_wg/main.cpp | 4 ++-- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index ec207821..4609b9e6 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -9,7 +9,9 @@ inline void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, const uint32_t threadblock_dim_x, const uint32_t threadblock_dim_y, const uint32_t threadblock_id_x, - const uint32_t threadblock_id_y) { + const uint32_t threadblock_id_y, + const uint32_t threadblock_id_in_core, + float *sharedmem_per_threadblock) { const float *global_a = (const 
float *)arg->addr_a; const float *global_b = (const float *)arg->addr_b; float *global_c = (float *)arg->addr_c; @@ -29,19 +31,24 @@ inline void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, float reg_c = 0.0f; for (uint32_t k = 0; k < dim_k; k += threadblock_dim_x) { - float *local_a = (float *)DEV_SMEM_START_ADDR; + float *local_a = sharedmem_per_threadblock; size_t local_a_elems = threadblock_dim_x * threadblock_dim_y; - float *local_b = (float *)DEV_SMEM_START_ADDR + local_a_elems; + float *local_b = sharedmem_per_threadblock + local_a_elems; uint32_t offset_global_a = dim_k * global_row + (k + local_col); uint32_t offset_global_b = dim_n * (local_row + k) + global_col; + // FIXME: threadblocks size must be BM*BN, not BM*BK or BN*BK. This means + // there is a mismatch between the number of elements in the A/B tile and + // the C tile. This is handled by each thread computing multiple result + // elements. + // // local_a: threadblock_dim_y rows, threadblock_dim_x cols // local_b: threadblock_dim_x rows, threadblock_dim_y cols // threadblock_dim_x == block_k, threadblock_dim_y == block_m == block_n local_a[threadblock_dim_x * local_row + local_col] = global_a[offset_global_a]; local_b[threadblock_dim_y * local_col + local_row] = global_b[offset_global_b]; - vx_barrier(0, threadblock_dim_y); + vx_barrier(threadblock_id_in_core, threadblock_dim_y); vx_fence(); for (uint32_t local_k = 0; local_k < threadblock_dim_x; local_k++) { @@ -49,7 +56,7 @@ inline void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, local_b[threadblock_dim_y * local_col + local_k]; } - vx_barrier(0, threadblock_dim_y); + vx_barrier(threadblock_id_in_core, threadblock_dim_y); vx_fence(); } @@ -57,14 +64,19 @@ inline void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, } void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { + // @perf: All threads are running these compute whose result is mostly same + // across the threadblock + const uint32_t dim_n = arg->dim_n; int tid_x = 
task_id % dim_n; int tid_y = task_id / dim_n; + const uint32_t threadblocks_per_core = 2; const uint32_t threadblock_dim_x = vx_num_threads(); - const uint32_t threadblock_dim_y = vx_num_warps(); + const uint32_t threadblock_dim_y = vx_num_warps() / threadblocks_per_core; const uint32_t threads_per_threadblock = threadblock_dim_x * threadblock_dim_y; const int threadblock_id = task_id / threads_per_threadblock; + const int threadblock_id_in_core = threadblock_id % threadblocks_per_core; const uint32_t dim_n_in_blocks = dim_n / threadblock_dim_x; const int threadblock_id_x = threadblock_id % dim_n_in_blocks; @@ -72,8 +84,14 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { const int tid_in_threadblock_x = vx_thread_id(); const int tid_in_threadblock_y = vx_warp_id() % threadblock_dim_y; - thread_block_gemm(arg, tid_in_threadblock_x, tid_in_threadblock_y, threadblock_dim_x, - threadblock_dim_y, threadblock_id_x, threadblock_id_y); + + float *sharedmem_per_threadblock = + (float *)DEV_SMEM_START_ADDR + + (2 * threadblock_dim_x * threadblock_dim_y) * threadblock_id_in_core; + thread_block_gemm(arg, tid_in_threadblock_x, tid_in_threadblock_y, + threadblock_dim_x, threadblock_dim_y, threadblock_id_x, + threadblock_id_y, threadblock_id_in_core, + sharedmem_per_threadblock); } int main() { diff --git a/tests/regression/sgemm_wg/main.cpp b/tests/regression/sgemm_wg/main.cpp index c6252991..229463ef 100644 --- a/tests/regression/sgemm_wg/main.cpp +++ b/tests/regression/sgemm_wg/main.cpp @@ -147,8 +147,8 @@ int main(int argc, char *argv[]) { RT_CHECK(vx_dev_open(&device)); // FIXME: hardcoded - uint32_t dim_m = 16; - uint32_t dim_n = 16; + uint32_t dim_m = 32; + uint32_t dim_n = 32; uint32_t dim_k = 32; generate_source_matrix(dim_m, dim_n, dim_k); From 2b1b5fe5377b3eb756d4e8e2a076ab65f55d832d Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 27 Feb 2024 15:45:22 -0800 Subject: [PATCH 21/75] convolution: Fix write_operand_file after upstream merge --- 
tests/opencl/convolution/main.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/opencl/convolution/main.cc b/tests/opencl/convolution/main.cc index 5c62b56e..dded468f 100644 --- a/tests/opencl/convolution/main.cc +++ b/tests/opencl/convolution/main.cc @@ -238,9 +238,9 @@ int main (int argc, char **argv) { } // NOTE(hansung): Dump operand buffer to a file - if (write_operand_file("matmul.input.a.bin", h_a.data(), nbytes) != 0) + if (write_operand_file("convolution.input.input.bin", h_i.data(), i_nbytes) != 0) return EXIT_FAILURE; - if (write_operand_file("matmul.input.b.bin", h_b.data(), nbytes) != 0) + if (write_operand_file("convolution.input.weights.bin", h_w.data(), w_nbytes) != 0) return EXIT_FAILURE; // Creating command queue From a2ea27b2b522bcd3e45e18d8d67a201fb71aa204 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 27 Feb 2024 15:46:02 -0800 Subject: [PATCH 22/75] vx_spawn: Add spawn_tasks_contiguous_all_stub Spawns tasks in a way that the threads in a warp see contiguous thread_ids, unlike the original variant where each thread was allocated a range of thread_ids that spans the number of batches. E.g. in a 4-thread config, instead of mapping IDs (0,2,4,6)->(1,3,5,7), map (0,1,2,3)->(4,5,6,7). TODO: remaining logic not implemented.
--- kernel/src/vx_spawn.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c index fd8258e1..eb0bdb90 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -74,6 +74,27 @@ static void __attribute__ ((noinline)) spawn_tasks_all_stub() { } } +static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() { + int NT = vx_num_threads(); + int NW = vx_num_warps(); + int cid = vx_core_id(); + int wid = vx_warp_id(); + int tid = vx_thread_id(); + + wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; + + // FIXME: handle RW + int waves = p_wspawn_args->NWs; + int offset = p_wspawn_args->offset + (NT * wid + tid); + + vx_spawn_tasks_cb callback = p_wspawn_args->callback; + void* arg = p_wspawn_args->arg; + for (int wave_id = 0; wave_id < waves; ++wave_id) { + int task_id = offset + (wave_id * NT * NW); + callback(task_id, arg); + } +} + static void __attribute__ ((noinline)) spawn_tasks_rem_stub() { int cid = vx_core_id(); int tid = vx_thread_id(); @@ -88,7 +109,8 @@ static void __attribute__ ((noinline)) spawn_tasks_all_cb() { vx_tmc(-1); // call stub routine - spawn_tasks_all_stub(); + // spawn_tasks_all_stub(); + spawn_tasks_contiguous_all_stub(); // disable warp vx_tmc_zero(); @@ -141,7 +163,7 @@ void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) { vx_tmc(-1); // call stub routine - spawn_tasks_all_stub(); + spawn_tasks_contiguous_all_stub(); // back to single-threaded vx_tmc_one(); From 27646bb507645169fbade129dbe4b055528b8a3a Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 27 Feb 2024 22:06:01 -0800 Subject: [PATCH 23/75] sgemm_wg: Implement multiple C per thread with sliding A/B blocks --- tests/regression/sgemm_wg/kernel.cpp | 103 ++++++++++++++++----------- 1 file changed, 63 insertions(+), 40 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index 
4609b9e6..58d54b36 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -3,7 +3,10 @@ #include #include "common.h" -inline void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, +#define MAX_TM 4 + +void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, + const uint32_t tid_in_threadblock, const uint32_t tid_in_threadblock_x, const uint32_t tid_in_threadblock_y, const uint32_t threadblock_dim_x, @@ -12,83 +15,103 @@ inline void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, const uint32_t threadblock_id_y, const uint32_t threadblock_id_in_core, float *sharedmem_per_threadblock) { - const float *global_a = (const float *)arg->addr_a; - const float *global_b = (const float *)arg->addr_b; - float *global_c = (float *)arg->addr_c; + const float *A = (const float *)arg->addr_a; + const float *B = (const float *)arg->addr_b; + float *C = (float *)arg->addr_c; - // assumes NT == NW == matrix_dim const uint32_t dim_m = arg->dim_m; const uint32_t dim_n = arg->dim_n; const uint32_t dim_k = arg->dim_k; - // FIXME: assumes local block size is square shape - const uint32_t local_row = tid_in_threadblock_y; - const uint32_t local_col = tid_in_threadblock_x; - const uint32_t global_row = threadblock_id_y * threadblock_dim_y + local_row; - const uint32_t global_col = threadblock_id_x * threadblock_dim_x + local_col; + // FIXME: Output block size is assumed to be square, i.e. 
BM == BN + // const uint32_t BM = threadblock_dim_y; + // const uint32_t BN = threadblock_dim_y; + // const uint32_t BK = threadblock_dim_x; + constexpr uint32_t BM = 8; + constexpr uint32_t BN = 8; + constexpr uint32_t BK = 4; + constexpr uint32_t TM = 2; + + const uint32_t local_a_row = tid_in_threadblock / BK; + const uint32_t local_a_col = tid_in_threadblock % BK; + const uint32_t local_b_row = tid_in_threadblock / BN; + const uint32_t local_b_col = tid_in_threadblock % BN; + const uint32_t global_a_row = BM * threadblock_id_y + local_a_row; + const uint32_t global_b_col = BN * threadblock_id_x + local_b_col; + + A += dim_k * BM * threadblock_id_y; + B += BN * threadblock_id_x; + C += dim_n * BM * threadblock_id_y + BN * threadblock_id_x; // each thread generates one output element - float reg_c = 0.0f; + float reg_c[MAX_TM] = { 0.0f }; - for (uint32_t k = 0; k < dim_k; k += threadblock_dim_x) { + for (uint32_t k = 0; k < dim_k; k += BK) { float *local_a = sharedmem_per_threadblock; size_t local_a_elems = threadblock_dim_x * threadblock_dim_y; float *local_b = sharedmem_per_threadblock + local_a_elems; - uint32_t offset_global_a = dim_k * global_row + (k + local_col); - uint32_t offset_global_b = dim_n * (local_row + k) + global_col; - // FIXME: threadblocks size must be BM*BN, not BM*BK or BN*BK. This means - // there is a mismatch between the number of elements in the A/B tile and - // the C tile. This is handled by each thread computing multiple result - // elements. - // - // local_a: threadblock_dim_y rows, threadblock_dim_x cols - // local_b: threadblock_dim_x rows, threadblock_dim_y cols - // threadblock_dim_x == block_k, threadblock_dim_y == block_m == block_n - local_a[threadblock_dim_x * local_row + local_col] = global_a[offset_global_a]; - local_b[threadblock_dim_y * local_col + local_row] = global_b[offset_global_b]; + // NOTE: local_b is transposed to column-major to facilitate better memory + // access. 
+ local_a[BK * local_a_row + local_a_col] = A[dim_k * local_a_row + local_a_col]; + local_b[BN * local_b_row + local_b_col] = B[dim_n * local_b_row + local_b_col]; + + // Advance A and B block + A += BK; + B += dim_n * BK; vx_barrier(threadblock_id_in_core, threadblock_dim_y); vx_fence(); - for (uint32_t local_k = 0; local_k < threadblock_dim_x; local_k++) { - reg_c += local_a[threadblock_dim_x * local_row + local_k] * - local_b[threadblock_dim_y * local_col + local_k]; + for (uint32_t local_k = 0; local_k < BK; local_k++) { + // Compute multiple result elements (TM) per thread + const float local_b_tmp = local_b[BN * local_k + local_b_col]; +#pragma GCC unroll 1 + for (uint32_t result_idx = 0; result_idx < TM; result_idx++) { + reg_c[result_idx] += + local_a[BK * (TM * local_b_row + result_idx) + local_k] * + local_b_tmp; + } } vx_barrier(threadblock_id_in_core, threadblock_dim_y); vx_fence(); } - global_c[dim_n * global_row + global_col] = reg_c; +#pragma GCC unroll 1 + for (uint32_t result_idx = 0; result_idx < TM; result_idx++) { + C[dim_n * (TM * local_b_row + result_idx) + local_b_col] = reg_c[result_idx]; + } } void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { // @perf: All threads are running these compute whose result is mostly same // across the threadblock - const uint32_t dim_n = arg->dim_n; - int tid_x = task_id % dim_n; - int tid_y = task_id / dim_n; - - const uint32_t threadblocks_per_core = 2; + const uint32_t threadblocks_per_core = 1; const uint32_t threadblock_dim_x = vx_num_threads(); const uint32_t threadblock_dim_y = vx_num_warps() / threadblocks_per_core; const uint32_t threads_per_threadblock = threadblock_dim_x * threadblock_dim_y; const int threadblock_id = task_id / threads_per_threadblock; const int threadblock_id_in_core = threadblock_id % threadblocks_per_core; - const uint32_t dim_n_in_blocks = dim_n / threadblock_dim_x; - const int threadblock_id_x = threadblock_id % dim_n_in_blocks; - const int threadblock_id_y = 
threadblock_id / dim_n_in_blocks; - + const int tid_in_threadblock = task_id % threads_per_threadblock; const int tid_in_threadblock_x = vx_thread_id(); const int tid_in_threadblock_y = vx_warp_id() % threadblock_dim_y; + const uint32_t dim_m = arg->dim_m; + const uint32_t dim_n = arg->dim_n; + const uint32_t BN = 8; + const uint32_t dim_n_in_blocks = dim_n / BN; + const int threadblock_id_x = threadblock_id % dim_n_in_blocks; + const int threadblock_id_y = threadblock_id / dim_n_in_blocks; + // const int threadblock_id_x = dim_n / threadblock_dim_x; + // const int threadblock_id_y = dim_m / threadblock_dim_y / 1; + float *sharedmem_per_threadblock = (float *)DEV_SMEM_START_ADDR + - (2 * threadblock_dim_x * threadblock_dim_y) * threadblock_id_in_core; - thread_block_gemm(arg, tid_in_threadblock_x, tid_in_threadblock_y, + (2 * threads_per_threadblock) * threadblock_id_in_core; + thread_block_gemm(arg, tid_in_threadblock, tid_in_threadblock_x, tid_in_threadblock_y, threadblock_dim_x, threadblock_dim_y, threadblock_id_x, threadblock_id_y, threadblock_id_in_core, sharedmem_per_threadblock); @@ -96,7 +119,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { int main() { kernel_arg_t *arg = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR; - const uint32_t grid_size = arg->dim_m * arg->dim_n; + const uint32_t grid_size = arg->dim_m * arg->dim_n / 2; vx_spawn_tasks(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); return 0; } From 46f242e520ac0d7f1b174c7f98986871001d6f5a Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 27 Feb 2024 22:23:25 -0800 Subject: [PATCH 24/75] sgemm_wg: Constantify BM/BN/BK/TM, computationally set gridsize and TB/core --- tests/regression/sgemm_wg/kernel.cpp | 51 ++++++++++++---------------- 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index 58d54b36..69ef9f14 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ 
b/tests/regression/sgemm_wg/kernel.cpp @@ -3,12 +3,13 @@ #include #include "common.h" -#define MAX_TM 4 +#define BM 8 +#define BN BM +#define BK 8 +#define TM (BM/BK) void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, const uint32_t tid_in_threadblock, - const uint32_t tid_in_threadblock_x, - const uint32_t tid_in_threadblock_y, const uint32_t threadblock_dim_x, const uint32_t threadblock_dim_y, const uint32_t threadblock_id_x, @@ -19,6 +20,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, const float *B = (const float *)arg->addr_b; float *C = (float *)arg->addr_c; + // assumes NT == NW == matrix_dim const uint32_t dim_m = arg->dim_m; const uint32_t dim_n = arg->dim_n; const uint32_t dim_k = arg->dim_k; @@ -27,10 +29,9 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, // const uint32_t BM = threadblock_dim_y; // const uint32_t BN = threadblock_dim_y; // const uint32_t BK = threadblock_dim_x; - constexpr uint32_t BM = 8; - constexpr uint32_t BN = 8; - constexpr uint32_t BK = 4; - constexpr uint32_t TM = 2; + // constexpr uint32_t BM = 8; + // constexpr uint32_t BN = 8; + // constexpr uint32_t BK = 2; const uint32_t local_a_row = tid_in_threadblock / BK; const uint32_t local_a_col = tid_in_threadblock % BK; @@ -39,26 +40,21 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, const uint32_t global_a_row = BM * threadblock_id_y + local_a_row; const uint32_t global_b_col = BN * threadblock_id_x + local_b_col; - A += dim_k * BM * threadblock_id_y; - B += BN * threadblock_id_x; - C += dim_n * BM * threadblock_id_y + BN * threadblock_id_x; - // each thread generates one output element - float reg_c[MAX_TM] = { 0.0f }; + float reg_c[TM] = { 0.0f }; for (uint32_t k = 0; k < dim_k; k += BK) { float *local_a = sharedmem_per_threadblock; size_t local_a_elems = threadblock_dim_x * threadblock_dim_y; float *local_b = sharedmem_per_threadblock + local_a_elems; + uint32_t global_a_offset = dim_k * global_a_row + (k + local_a_col); + uint32_t global_b_offset 
= dim_n * (k + local_b_row) + global_b_col; + // NOTE: local_b is transposed to column-major to facilitate better memory // access. - local_a[BK * local_a_row + local_a_col] = A[dim_k * local_a_row + local_a_col]; - local_b[BN * local_b_row + local_b_col] = B[dim_n * local_b_row + local_b_col]; - - // Advance A and B block - A += BK; - B += dim_n * BK; + local_a[BK * local_a_row + local_a_col] = A[global_a_offset]; + local_b[BN * local_b_row + local_b_col] = B[global_b_offset]; vx_barrier(threadblock_id_in_core, threadblock_dim_y); vx_fence(); @@ -66,7 +62,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, for (uint32_t local_k = 0; local_k < BK; local_k++) { // Compute multiple result elements (TM) per thread const float local_b_tmp = local_b[BN * local_k + local_b_col]; -#pragma GCC unroll 1 +#pragma GCC unroll 4 for (uint32_t result_idx = 0; result_idx < TM; result_idx++) { reg_c[result_idx] += local_a[BK * (TM * local_b_row + result_idx) + local_k] * @@ -78,9 +74,10 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, vx_fence(); } -#pragma GCC unroll 1 +#pragma GCC unroll 4 for (uint32_t result_idx = 0; result_idx < TM; result_idx++) { - C[dim_n * (TM * local_b_row + result_idx) + local_b_col] = reg_c[result_idx]; + C[dim_n * (BM * threadblock_id_y + TM * local_b_row + result_idx) + + global_b_col] = reg_c[result_idx]; } } @@ -88,30 +85,24 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { // @perf: All threads are running these compute whose result is mostly same // across the threadblock - const uint32_t threadblocks_per_core = 1; + const uint32_t threadblocks_per_core = vx_num_threads() * vx_num_warps() / (BM*BK); const uint32_t threadblock_dim_x = vx_num_threads(); const uint32_t threadblock_dim_y = vx_num_warps() / threadblocks_per_core; const uint32_t threads_per_threadblock = threadblock_dim_x * threadblock_dim_y; const int threadblock_id = task_id / threads_per_threadblock; const int threadblock_id_in_core = threadblock_id % 
threadblocks_per_core; - const int tid_in_threadblock = task_id % threads_per_threadblock; - const int tid_in_threadblock_x = vx_thread_id(); - const int tid_in_threadblock_y = vx_warp_id() % threadblock_dim_y; const uint32_t dim_m = arg->dim_m; const uint32_t dim_n = arg->dim_n; - const uint32_t BN = 8; const uint32_t dim_n_in_blocks = dim_n / BN; const int threadblock_id_x = threadblock_id % dim_n_in_blocks; const int threadblock_id_y = threadblock_id / dim_n_in_blocks; - // const int threadblock_id_x = dim_n / threadblock_dim_x; - // const int threadblock_id_y = dim_m / threadblock_dim_y / 1; float *sharedmem_per_threadblock = (float *)DEV_SMEM_START_ADDR + (2 * threads_per_threadblock) * threadblock_id_in_core; - thread_block_gemm(arg, tid_in_threadblock, tid_in_threadblock_x, tid_in_threadblock_y, + thread_block_gemm(arg, tid_in_threadblock, threadblock_dim_x, threadblock_dim_y, threadblock_id_x, threadblock_id_y, threadblock_id_in_core, sharedmem_per_threadblock); @@ -119,7 +110,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { int main() { kernel_arg_t *arg = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR; - const uint32_t grid_size = arg->dim_m * arg->dim_n / 2; + const uint32_t grid_size = arg->dim_m * arg->dim_n / TM; vx_spawn_tasks(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); return 0; } From a06b2dd20ea702f4e3824cb519a0081d46312cfc Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 28 Feb 2024 21:17:42 -0800 Subject: [PATCH 25/75] sgemm_wg: Cleanup & proper unroll --- tests/regression/sgemm_wg/kernel.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index 69ef9f14..9b767d35 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -40,30 +40,30 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, const uint32_t global_a_row = BM * threadblock_id_y + local_a_row; const uint32_t 
global_b_col = BN * threadblock_id_x + local_b_col; - // each thread generates one output element + // each thread generates TM output element float reg_c[TM] = { 0.0f }; - for (uint32_t k = 0; k < dim_k; k += BK) { - float *local_a = sharedmem_per_threadblock; - size_t local_a_elems = threadblock_dim_x * threadblock_dim_y; - float *local_b = sharedmem_per_threadblock + local_a_elems; + volatile float *local_a = sharedmem_per_threadblock; + const size_t local_a_elems = threadblock_dim_x * threadblock_dim_y; + volatile float *local_b = sharedmem_per_threadblock + local_a_elems; + for (uint32_t k = 0; k < dim_k; k += BK) { uint32_t global_a_offset = dim_k * global_a_row + (k + local_a_col); uint32_t global_b_offset = dim_n * (k + local_b_row) + global_b_col; - // NOTE: local_b is transposed to column-major to facilitate better memory - // access. local_a[BK * local_a_row + local_a_col] = A[global_a_offset]; local_b[BN * local_b_row + local_b_col] = B[global_b_offset]; vx_barrier(threadblock_id_in_core, threadblock_dim_y); vx_fence(); +#pragma GCC unroll TM for (uint32_t local_k = 0; local_k < BK; local_k++) { // Compute multiple result elements (TM) per thread const float local_b_tmp = local_b[BN * local_k + local_b_col]; -#pragma GCC unroll 4 +#pragma GCC unroll TM for (uint32_t result_idx = 0; result_idx < TM; result_idx++) { + // NOTE use of local_b_row reg_c[result_idx] += local_a[BK * (TM * local_b_row + result_idx) + local_k] * local_b_tmp; @@ -74,8 +74,9 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, vx_fence(); } -#pragma GCC unroll 4 +#pragma GCC unroll TM for (uint32_t result_idx = 0; result_idx < TM; result_idx++) { + // NOTE use of local_b_row and global_b_col here C[dim_n * (BM * threadblock_id_y + TM * local_b_row + result_idx) + global_b_col] = reg_c[result_idx]; } From 6f4dfe5a0e4fa8b8530986b0d1921d8cdfd29068 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 29 Feb 2024 14:40:54 -0800 Subject: [PATCH 26/75] sgemm_wg: Implement 2D 
threadtiling --- tests/regression/sgemm_wg/kernel.cpp | 88 ++++++++++++++++++++-------- tests/regression/sgemm_wg/main.cpp | 6 +- 2 files changed, 66 insertions(+), 28 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index 9b767d35..7f33a90a 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -5,8 +5,11 @@ #define BM 8 #define BN BM -#define BK 8 -#define TM (BM/BK) +#define BK 2 +// #define TM (BM/BK) +// #define TN (BN/BK) +#define TM 4 +#define TN 4 void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, const uint32_t tid_in_threadblock, @@ -40,33 +43,63 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, const uint32_t global_a_row = BM * threadblock_id_y + local_a_row; const uint32_t global_b_col = BN * threadblock_id_x + local_b_col; + const uint32_t local_c_row = tid_in_threadblock / (BN / TN); + const uint32_t local_c_col = tid_in_threadblock % (BN / TN); + // each thread generates TM output element - float reg_c[TM] = { 0.0f }; + float reg_c[TM * TN] = { 0.0f }; + float reg_a[TM] = { 0.0f }; + float reg_b[TN] = { 0.0f }; volatile float *local_a = sharedmem_per_threadblock; - const size_t local_a_elems = threadblock_dim_x * threadblock_dim_y; + // const size_t local_a_elems = threadblock_dim_x * threadblock_dim_y; + const size_t local_a_elems = (BM * BK); volatile float *local_b = sharedmem_per_threadblock + local_a_elems; - for (uint32_t k = 0; k < dim_k; k += BK) { - uint32_t global_a_offset = dim_k * global_a_row + (k + local_a_col); - uint32_t global_b_offset = dim_n * (k + local_b_row) + global_b_col; + constexpr uint32_t stride_a = (BM * BN) / BK / (TM * TN); + constexpr uint32_t stride_b = (BM * BN) / BN / (TM * TN); - local_a[BK * local_a_row + local_a_col] = A[global_a_offset]; - local_b[BN * local_b_row + local_b_col] = B[global_b_offset]; + for (uint32_t k = 0; k < dim_k; k += BK) { + for (uint32_t load_offset = 0; load_offset < BM; load_offset += 
stride_a) { + const uint32_t global_a_offset = + dim_k * (global_a_row + load_offset) + (k + local_a_col); + local_a[BK * (local_a_row + load_offset) + local_a_col] = + A[global_a_offset]; + } + for (uint32_t load_offset = 0; load_offset < BK; load_offset += stride_b) { + const uint32_t global_b_offset = + dim_n * (k + local_b_row + load_offset) + global_b_col; + local_b[BN * (local_b_row + load_offset) + local_b_col] = + B[global_b_offset]; + } vx_barrier(threadblock_id_in_core, threadblock_dim_y); vx_fence(); -#pragma GCC unroll TM for (uint32_t local_k = 0; local_k < BK; local_k++) { - // Compute multiple result elements (TM) per thread - const float local_b_tmp = local_b[BN * local_k + local_b_col]; #pragma GCC unroll TM - for (uint32_t result_idx = 0; result_idx < TM; result_idx++) { - // NOTE use of local_b_row - reg_c[result_idx] += - local_a[BK * (TM * local_b_row + result_idx) + local_k] * - local_b_tmp; + for (uint32_t res_idx_m = 0; res_idx_m < TM; res_idx_m++) { + reg_a[res_idx_m] = + local_a[BK * (TM * local_c_row + res_idx_m) + local_k]; + } +#pragma GCC unroll TN + for (uint32_t res_idx_n = 0; res_idx_n < TN; res_idx_n++) { + reg_b[res_idx_n] = + local_b[BN * local_k + (TN * local_c_col + res_idx_n)]; + } + + // Compute multiple result elements (TM) per thread +#pragma GCC unroll TM + for (uint32_t res_idx_m = 0; res_idx_m < TM; res_idx_m++) { +#pragma GCC unroll TN + for (uint32_t res_idx_n = 0; res_idx_n < TN; res_idx_n++) { + // NOTE use of local_b_row + reg_c[TN * res_idx_m + res_idx_n] += + reg_a[res_idx_m] * reg_b[res_idx_n]; + // reg_c[TN * res_idx_m + res_idx_n] += + // local_a[BK * (TM * local_c_row + res_idx_m) + local_k] * + // local_b[BN * local_k + (TN * local_c_col + res_idx_n)]; + } } } @@ -75,10 +108,14 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, } #pragma GCC unroll TM - for (uint32_t result_idx = 0; result_idx < TM; result_idx++) { - // NOTE use of local_b_row and global_b_col here - C[dim_n * (BM * threadblock_id_y + 
TM * local_b_row + result_idx) + - global_b_col] = reg_c[result_idx]; + for (uint32_t res_idx_m = 0; res_idx_m < TM; res_idx_m++) { +#pragma GCC unroll TN + for (uint32_t res_idx_n = 0; res_idx_n < TN; res_idx_n++) { + // NOTE use of local_b_row and global_b_col here + C[dim_n * (BM * threadblock_id_y + TM * local_c_row + res_idx_m) + + (BN * threadblock_id_x + TN * local_c_col + res_idx_n)] = + reg_c[TN * res_idx_m + res_idx_n]; + } } } @@ -86,10 +123,11 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { // @perf: All threads are running these compute whose result is mostly same // across the threadblock - const uint32_t threadblocks_per_core = vx_num_threads() * vx_num_warps() / (BM*BK); + const uint32_t threads_per_threadblock = ((BM * BN) / (TM * TN)); + const uint32_t threadblocks_per_core = + vx_num_threads() * vx_num_warps() / threads_per_threadblock; const uint32_t threadblock_dim_x = vx_num_threads(); const uint32_t threadblock_dim_y = vx_num_warps() / threadblocks_per_core; - const uint32_t threads_per_threadblock = threadblock_dim_x * threadblock_dim_y; const int threadblock_id = task_id / threads_per_threadblock; const int threadblock_id_in_core = threadblock_id % threadblocks_per_core; const int tid_in_threadblock = task_id % threads_per_threadblock; @@ -102,7 +140,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { float *sharedmem_per_threadblock = (float *)DEV_SMEM_START_ADDR + - (2 * threads_per_threadblock) * threadblock_id_in_core; + (2 * BM * BK) * threadblock_id_in_core; thread_block_gemm(arg, tid_in_threadblock, threadblock_dim_x, threadblock_dim_y, threadblock_id_x, threadblock_id_y, threadblock_id_in_core, @@ -111,7 +149,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { int main() { kernel_arg_t *arg = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR; - const uint32_t grid_size = arg->dim_m * arg->dim_n / TM; + const uint32_t grid_size = arg->dim_m * arg->dim_n / (TM * TN); vx_spawn_tasks(grid_size, 
(vx_spawn_tasks_cb)kernel_body, arg); return 0; } diff --git a/tests/regression/sgemm_wg/main.cpp b/tests/regression/sgemm_wg/main.cpp index 229463ef..c86f7aaf 100644 --- a/tests/regression/sgemm_wg/main.cpp +++ b/tests/regression/sgemm_wg/main.cpp @@ -147,9 +147,9 @@ int main(int argc, char *argv[]) { RT_CHECK(vx_dev_open(&device)); // FIXME: hardcoded - uint32_t dim_m = 32; - uint32_t dim_n = 32; - uint32_t dim_k = 32; + uint32_t dim_m = 64; + uint32_t dim_n = 64; + uint32_t dim_k = 64; generate_source_matrix(dim_m, dim_n, dim_k); generate_reference_matmul(dim_m, dim_n, dim_k); From a9709edae238a3cea3020f96c4ea006b79be4fd6 Mon Sep 17 00:00:00 2001 From: Sungwoong Ha Date: Fri, 1 Mar 2024 21:05:52 -0800 Subject: [PATCH 27/75] first pass --- hw/rtl/core/VX_core.sv | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index e5e57d99..e239ea4b 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -331,22 +331,25 @@ module VX_core import VX_gpu_pkg::*; #( assign pipeline_perf_if.stores = perf_stores; assign pipeline_perf_if.load_latency = perf_dcache_lat; assign pipeline_perf_if.ifetch_latency = perf_icache_lat; - assign pipeline_perf_if.load_latency = perf_dcache_lat; + real instrs = commit_csr_if.instret; + real cycles = sched_csr_if.cycles; + real icache_lat = perf_icache_lat; + real ifetches = perf_ifetches; + real dcache_lat = perf_dcache_lat; + real loads = perf_loads; always @(negedge busy) begin if (!reset) begin + $display("====================CORE : %d===================",CORE_ID); $display("time : %t", $time); - $display("perf_dcache_rd_req_per_cycle: %h", perf_dcache_rd_req_per_cycle); - $display("perf_dcache_wr_req_per_cycle: %h", perf_dcache_wr_req_per_cycle); - $display("perf_dcache_rsp_per_cycle: %h", perf_dcache_rsp_per_cycle); - $display("perf_icache_pending_read_cycle: %h", perf_icache_pending_read_cycle); - 
$display("perf_dcache_pending_read_cycle: %h", perf_dcache_pending_read_cycle); - $display("perf_icache_pending_reads: %h", perf_icache_pending_reads); - $display("perf_dcache_pending_reads: %h", perf_dcache_pending_reads); - $display("perf_ifetches: %h", perf_ifetches); - $display("perf_loads: %h", perf_loads); - $display("perf_stores: %h", perf_stores); + $display("perf_dcache_rd_req_per_cycle: %d", perf_dcache_rd_req_per_cycle); + $display("perf_dcache_wr_req_per_cycle: %d", perf_dcache_wr_req_per_cycle); + $display("perf_dcache_rsp_per_cycle: %d", perf_dcache_rsp_per_cycle); + $display("perf_icache_pending_read_cycle: %d", perf_icache_pending_read_cycle); + $display("perf_dcache_pending_read_cycle: %d", perf_dcache_pending_read_cycle); + $display("perf_icache_pending_reads: %d", perf_icache_pending_reads); + $display("perf_dcache_pending_reads: %d", perf_dcache_pending_reads); $display("perf_icache_req_fire: %b", perf_icache_req_fire); $display("perf_icache_rsp_fire: %b", perf_icache_rsp_fire); $display("perf_dcache_rd_req_fire: %b", perf_dcache_rd_req_fire); @@ -354,9 +357,18 @@ module VX_core import VX_gpu_pkg::*; #( $display("perf_dcache_wr_req_fire: %b", perf_dcache_wr_req_fire); $display("perf_dcache_wr_req_fire_r: %b", perf_dcache_wr_req_fire_r); $display("perf_dcache_rsp_fire: %b", perf_dcache_rsp_fire); - $display("scheduler idle: %d", pipeline_perf_if.sched_idles[31:0]); - $display("Instruction: %d",commit_csr_if.instret[31:0]); - $display("Cycle: %d",sched_csr_if.cycles); + + $display("Instructions: %d, Cycles: %d, IPC: %f", commit_csr_if.instret, sched_csr_if.cycles, instrs/cycles); + $display("scheduler idle: %d", pipeline_perf_if.sched_idles); + $display("scheduler stalls: %d", pipeline_perf_if.sched_stalls); + $display("ibuffer stalls: %d",pipeline_perf_if.ibf_stalls); + $display("issue stalls: %d",pipeline_perf_if.scb_stalls); + $display("sfu stalls: %d",pipeline_perf_if.units_uses[2]); + $display("ifetches: %d", perf_ifetches); + 
$display("ifetch latency: %f Cycles", icache_lat/ifetches); + $display("loads: %d", perf_loads); + $display("load latency: %f Cycles", dcache_lat/loads); + $display("stores: %d", perf_stores); end end From 3c2a266d379d9eda658048248a487da04eac4a0b Mon Sep 17 00:00:00 2001 From: Sungwoong Ha Date: Fri, 1 Mar 2024 21:27:26 -0800 Subject: [PATCH 28/75] second pass --- hw/rtl/core/VX_core.sv | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index e239ea4b..453ebb03 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -337,7 +337,18 @@ module VX_core import VX_gpu_pkg::*; #( real ifetches = perf_ifetches; real dcache_lat = perf_dcache_lat; real loads = perf_loads; + real scheduler_idles = pipeline_perf_if.sched_idles; + real scheduler_stalls = pipeline_perf_if.sched_stalls; + real ibuf_stalls = pipeline_perf_if.ibf_stalls; + real scrb_alu_per_core = pipeline_perf_if.units_uses[`EX_ALU]; + real scrb_fpu_per_core = pipeline_perf_if.units_uses[`EX_FPU]; + real scrb_lsu_per_core = pipeline_perf_if.units_uses[`EX_LSU]; + real scrb_sfu_per_core = pipeline_perf_if.units_uses[`EX_SFU]; + real scrb_tot = scrb_alu_per_core+scrb_fpu_per_core+scrb_lsu_per_core+scrb_sfu_per_core; + real scrb_wctl_per_core = pipeline_perf_if.sfu_uses[`SFU_WCTL]; + real scrb_csrs_per_core = pipeline_perf_if.sfu_uses[`SFU_CSRS]; + real sfu_tot = scrb_wctl_per_core+scrb_csrs_per_core; always @(negedge busy) begin if (!reset) begin @@ -359,11 +370,11 @@ module VX_core import VX_gpu_pkg::*; #( $display("perf_dcache_rsp_fire: %b", perf_dcache_rsp_fire); $display("Instructions: %d, Cycles: %d, IPC: %f", commit_csr_if.instret, sched_csr_if.cycles, instrs/cycles); - $display("scheduler idle: %d", pipeline_perf_if.sched_idles); - $display("scheduler stalls: %d", pipeline_perf_if.sched_stalls); - $display("ibuffer stalls: %d",pipeline_perf_if.ibf_stalls); - $display("issue stalls: 
%d",pipeline_perf_if.scb_stalls); - $display("sfu stalls: %d",pipeline_perf_if.units_uses[2]); + $display("scheduler idle: %d (%f)", pipeline_perf_if.sched_idles, scheduler_idles/cycles); + $display("scheduler stalls: %d (%f)", pipeline_perf_if.sched_stalls, scheduler_stalls/cycles); + $display("ibuffer stalls: %d (%f)",pipeline_perf_if.ibf_stalls, ibuf_stalls/cycles); + $display("issue stalls: %d(alu=%f, fpu=%f, lsu=%f, sfu=%f)",pipeline_perf_if.scb_stalls, scrb_alu_per_core/scrb_tot, scrb_fpu_per_core/scrb_tot, scrb_lsu_per_core/scrb_tot, scrb_sfu_per_core/scrb_tot); + $display("sfu stalls: %d (scrs=%f, wctl=%f)",pipeline_perf_if.units_uses[`EX_SFU], scrb_csrs_per_core/sfu_tot, scrb_wctl_per_core/sfu_tot); $display("ifetches: %d", perf_ifetches); $display("ifetch latency: %f Cycles", icache_lat/ifetches); $display("loads: %d", perf_loads); From fbe872c8314a7a6f1e79ca19c6f6dbeb66f46d90 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 12 Mar 2024 15:34:17 -0700 Subject: [PATCH 29/75] sgemm_wg: Add missing makefile dep to common.h --- tests/regression/sgemm_wg/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/regression/sgemm_wg/Makefile b/tests/regression/sgemm_wg/Makefile index f57f6124..289369d2 100644 --- a/tests/regression/sgemm_wg/Makefile +++ b/tests/regression/sgemm_wg/Makefile @@ -1,6 +1,6 @@ PROJECT = sgemm_wg -SRCS = main.cpp +SRCS = main.cpp common.h VX_SRCS = kernel.cpp From 510a834db529f20bac2fdea09f7a92372462402e Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 12 Mar 2024 15:34:42 -0700 Subject: [PATCH 30/75] sgemm_wg: Implement software barrier for inter-core synchronization --- tests/regression/sgemm_wg/kernel.cpp | 45 +++++++++++++++++++++++----- tests/regression/sgemm_wg/main.cpp | 6 ++-- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index 7f33a90a..a65e1e5f 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ 
b/tests/regression/sgemm_wg/kernel.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include "common.h" @@ -8,8 +9,35 @@ #define BK 2 // #define TM (BM/BK) // #define TN (BN/BK) -#define TM 4 -#define TN 4 +#define TM 2 +#define TN 2 + +#define DEV_BARRIER_MMIO_BASE_ADDR 0xff003f00UL +#define CORES_PER_CLUSTER 4 + +void threadblock_barrier(unsigned int barrier_id, unsigned int count) { + vx_barrier(barrier_id, count); + vx_fence(); + +#if CORES_PER_CLUSTER != 1 + if (vx_thread_id() == 0) { + volatile uint32_t *mmio = (volatile uint32_t *)(DEV_BARRIER_MMIO_BASE_ADDR); + int core_id = vx_core_id(); + const uint32_t barrier_stride = CORES_PER_CLUSTER; + const uint32_t barrier_offset = barrier_stride * barrier_id; + // 1 : 0x00 is reserved for mmio read reg + mmio[barrier_offset + 1 + core_id] = 1; + vx_printf("========== barrier written! barrier_id=%u, count=%u\n", barrier_id, count); + + // wait for other cores in the cluster to finish by waiting on the + // all-synced read-only mmio reg + while (mmio[barrier_offset] == 0); + + // reset per-core flag back to zero for the next barrier + mmio[barrier_offset + 1 + core_id] = 0; + } +#endif +} void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, const uint32_t tid_in_threadblock, @@ -73,8 +101,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, B[global_b_offset]; } - vx_barrier(threadblock_id_in_core, threadblock_dim_y); - vx_fence(); + threadblock_barrier(threadblock_id_in_core, threadblock_dim_y); for (uint32_t local_k = 0; local_k < BK; local_k++) { #pragma GCC unroll TM @@ -103,8 +130,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, } } - vx_barrier(threadblock_id_in_core, threadblock_dim_y); - vx_fence(); + threadblock_barrier(threadblock_id_in_core, threadblock_dim_y); } #pragma GCC unroll TM @@ -123,7 +149,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { // @perf: All threads are running these compute whose result is mostly same // across the threadblock - const 
uint32_t threads_per_threadblock = ((BM * BN) / (TM * TN)); + const uint32_t threads_per_threadblock = (BM * BN) / (TM * TN); const uint32_t threadblocks_per_core = vx_num_threads() * vx_num_warps() / threads_per_threadblock; const uint32_t threadblock_dim_x = vx_num_threads(); @@ -138,6 +164,11 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { const int threadblock_id_x = threadblock_id % dim_n_in_blocks; const int threadblock_id_y = threadblock_id / dim_n_in_blocks; + // initialize barrier MMIO + volatile uint32_t *barrier_mmio = (volatile uint32_t *)(DEV_BARRIER_MMIO_BASE_ADDR); + *barrier_mmio = 0; + vx_fence(); + float *sharedmem_per_threadblock = (float *)DEV_SMEM_START_ADDR + (2 * BM * BK) * threadblock_id_in_core; diff --git a/tests/regression/sgemm_wg/main.cpp b/tests/regression/sgemm_wg/main.cpp index c86f7aaf..229463ef 100644 --- a/tests/regression/sgemm_wg/main.cpp +++ b/tests/regression/sgemm_wg/main.cpp @@ -147,9 +147,9 @@ int main(int argc, char *argv[]) { RT_CHECK(vx_dev_open(&device)); // FIXME: hardcoded - uint32_t dim_m = 64; - uint32_t dim_n = 64; - uint32_t dim_k = 64; + uint32_t dim_m = 32; + uint32_t dim_n = 32; + uint32_t dim_k = 32; generate_source_matrix(dim_m, dim_n, dim_k); generate_reference_matmul(dim_m, dim_n, dim_k); From 2036d37840e2ec72f49dff3a3e06b593dbd3d610 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 13 Mar 2024 21:32:57 -0700 Subject: [PATCH 31/75] sgemm_wg: Prevent run-ahead using ternary flags; reduce mem accesses --- tests/regression/sgemm_wg/kernel.cpp | 61 ++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index a65e1e5f..44299934 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -13,29 +13,61 @@ #define TN 2 #define DEV_BARRIER_MMIO_BASE_ADDR 0xff003f00UL -#define CORES_PER_CLUSTER 4 +#define CORES_PER_CLUSTER 2 +#define BARRIER_STRIDE 
4 -void threadblock_barrier(unsigned int barrier_id, unsigned int count) { +void threadblock_barrier(unsigned int tid_in_threadblock, unsigned int barrier_id, unsigned int count) { vx_barrier(barrier_id, count); vx_fence(); -#if CORES_PER_CLUSTER != 1 - if (vx_thread_id() == 0) { + // vx_printf("========== barrier! barrier_id=%u, count=%u\n", barrier_id, count); + +#if CORES_PER_CLUSTER != 0 + // this code doesn't work without the memory-mapped register implemented in + // hardware, hence the #ifdef. + + if (tid_in_threadblock == 0) { volatile uint32_t *mmio = (volatile uint32_t *)(DEV_BARRIER_MMIO_BASE_ADDR); int core_id = vx_core_id(); - const uint32_t barrier_stride = CORES_PER_CLUSTER; + // FIXME: hardcoded + const uint32_t barrier_stride = BARRIER_STRIDE; const uint32_t barrier_offset = barrier_stride * barrier_id; - // 1 : 0x00 is reserved for mmio read reg + + // wait for the barrier to be initialized + while (mmio[barrier_offset + 1 + core_id] != 0); + + // signal internal-core synchronization done mmio[barrier_offset + 1 + core_id] = 1; - vx_printf("========== barrier written! barrier_id=%u, count=%u\n", barrier_id, count); // wait for other cores in the cluster to finish by waiting on the // all-synced read-only mmio reg while (mmio[barrier_offset] == 0); - // reset per-core flag back to zero for the next barrier - mmio[barrier_offset + 1 + core_id] = 0; + // need to signal that this core passed the barrier; otherwise, if we + // reset this to 0 right away, the other core still waiting for the + // barrier might never see the all-sync mmio reg as 1. 
+ mmio[barrier_offset + 1 + core_id] = 2; + + // // if this core is the last one passing the barrier, reset all per-core + // // flags to 0 to get ready for the next barrier + // bool all_passed = true; + // for (int i = 0; i < CORES_PER_CLUSTER; i++) { + // // if (i == core_id) continue; + // // NOTE: this requires coherent access of store-to-load to the same + // // address + // if (mmio[barrier_offset + 1 + i] != 2) { + // all_passed = false; + // break; + // } + // } + // if (all_passed) { + // for (int i = 0; i < CORES_PER_CLUSTER; i++) { + // mmio[barrier_offset + 1 + i] = 0; + // } + // } } + + vx_barrier(barrier_id, count); #endif } @@ -101,7 +133,8 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, B[global_b_offset]; } - threadblock_barrier(threadblock_id_in_core, threadblock_dim_y); + threadblock_barrier(tid_in_threadblock, threadblock_id_in_core, + threadblock_dim_y); for (uint32_t local_k = 0; local_k < BK; local_k++) { #pragma GCC unroll TM @@ -130,7 +163,8 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, } } - threadblock_barrier(threadblock_id_in_core, threadblock_dim_y); + threadblock_barrier(tid_in_threadblock, threadblock_id_in_core, + threadblock_dim_y); } #pragma GCC unroll TM @@ -164,11 +198,6 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { const int threadblock_id_x = threadblock_id % dim_n_in_blocks; const int threadblock_id_y = threadblock_id / dim_n_in_blocks; - // initialize barrier MMIO - volatile uint32_t *barrier_mmio = (volatile uint32_t *)(DEV_BARRIER_MMIO_BASE_ADDR); - *barrier_mmio = 0; - vx_fence(); - float *sharedmem_per_threadblock = (float *)DEV_SMEM_START_ADDR + (2 * BM * BK) * threadblock_id_in_core; From 12ee2a3a0fe2d2456461f912852e9732749c54a4 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 18 Mar 2024 16:40:02 -0700 Subject: [PATCH 32/75] Write cluster-aware thread scheduling NOTE: cores per cluster is hardcoded as a constant --- kernel/src/vx_spawn.c | 36 
+++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c index eb0bdb90..c4c00a06 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -15,6 +15,8 @@ #include #include +#define CORES_PER_CLUSTER 2 + #ifdef __cplusplus extern "C" { #endif @@ -95,6 +97,30 @@ static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() { } } +static void __attribute__ ((noinline)) spawn_tasks_cluster_all_stub() { + int NT = vx_num_threads(); + int NW = vx_num_warps(); + int cid = vx_core_id(); + int wid = vx_warp_id(); + int tid = vx_thread_id(); + + const int core_id_in_cluster = vx_core_id() % CORES_PER_CLUSTER; + const int cluster_wid = CORES_PER_CLUSTER * wid + core_id_in_cluster; + + wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; + + // FIXME: handle RW + int waves = p_wspawn_args->NWs; + int offset = p_wspawn_args->offset + (NT * cluster_wid + tid); + + vx_spawn_tasks_cb callback = p_wspawn_args->callback; + void* arg = p_wspawn_args->arg; + for (int wave_id = 0; wave_id < waves; ++wave_id) { + int task_id = offset + (wave_id * NT * NW * CORES_PER_CLUSTER); + callback(task_id, arg); + } +} + static void __attribute__ ((noinline)) spawn_tasks_rem_stub() { int cid = vx_core_id(); int tid = vx_thread_id(); @@ -110,7 +136,7 @@ static void __attribute__ ((noinline)) spawn_tasks_all_cb() { // call stub routine // spawn_tasks_all_stub(); - spawn_tasks_contiguous_all_stub(); + spawn_tasks_cluster_all_stub(); // disable warp vx_tmc_zero(); @@ -151,7 +177,11 @@ void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) { rW = TW - fW * NW; // remaining warps } - wspawn_tasks_args_t wspawn_args = { callback, arg, core_id * tasks_per_core, fW, rW }; + int cluster_id = core_id / CORES_PER_CLUSTER; + const int tasks_per_cluster = tasks_per_core * CORES_PER_CLUSTER; + const int offset = cluster_id * tasks_per_cluster; + 
wspawn_tasks_args_t wspawn_args = { callback, arg, offset, fW, rW }; + // wspawn_tasks_args_t wspawn_args = { callback, arg, core_id * tasks_per_core, fW, rW }; g_wspawn_args[core_id] = &wspawn_args; if (TW >= 1) { @@ -163,7 +193,7 @@ void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) { vx_tmc(-1); // call stub routine - spawn_tasks_contiguous_all_stub(); + spawn_tasks_cluster_all_stub(); // back to single-threaded vx_tmc_one(); From 94ad1850a925ec3a3c1fd0029b35a72b90ce213f Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Wed, 20 Mar 2024 15:18:31 -0700 Subject: [PATCH 33/75] implement correct gemmini fence and loop fsm support --- tests/kernel/gemmini_mmio/Makefile | 2 +- tests/kernel/gemmini_mmio/gemmini_mmio.h | 133 +++++++++++++++-------- 2 files changed, 87 insertions(+), 48 deletions(-) diff --git a/tests/kernel/gemmini_mmio/Makefile b/tests/kernel/gemmini_mmio/Makefile index 03059e47..390b7f81 100644 --- a/tests/kernel/gemmini_mmio/Makefile +++ b/tests/kernel/gemmini_mmio/Makefile @@ -21,7 +21,7 @@ CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy SIM_DIR = ../../../sim -CFLAGS += -O3 -v -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections +CFLAGS += -O3 -funroll-loops -v -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw -I$(GEMMINI_SW_PATH) LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a diff --git a/tests/kernel/gemmini_mmio/gemmini_mmio.h b/tests/kernel/gemmini_mmio/gemmini_mmio.h index e09b0489..b7712441 100644 --- a/tests/kernel/gemmini_mmio/gemmini_mmio.h +++ b/tests/kernel/gemmini_mmio/gemmini_mmio.h @@ -14,11 +14,11 @@ #define SPAD_NUM_ROWS (SMEM_SIZE / SPAD_ROW_SIZE) #define SPAD_MASK (SPAD_NUM_ROWS - 1) -#define SMEM_GARBAGE_ADDR 0xffff0000 #define PRINT_BUF SMEM_ADDR_END #define 
GEMMINI_RS1_ADDR 0xff007010 #define GEMMINI_RS2_ADDR 0xff007018 #define GEMMINI_INST_ADDR 0xff007000 +#define GEMMINI_BUSY_ADDR 0xff007020 #define SMEM_TO_SPAD(smem_addr) (SPAD_BASE + ((smem_addr) & SMEM_MASK) / SPAD_ROW_SIZE) #define SPAD_TO_SMEM(spad_addr) (SMEM_BASE + ((spad_addr) & SPAD_MASK) * SPAD_ROW_SIZE) @@ -31,16 +31,17 @@ #define SMEM_MAT_OFFSET(i, j, J) \ (((i) / DIM * (J) / DIM + (j) / DIM) * DIM * DIM + ((i) % DIM) * DIM + ((j) % DIM)) -#define pfence() { for (int i = 0; i < 5; i++) *((volatile uint32_t *) SMEM_GARBAGE_ADDR) = 0xdeadbeef; } +// #define fence() { for (int i = 0; i < 10; i++) *((volatile uint32_t *) (0xFFFF0000)) = 0xdeadbeef; } +#define fence() { while (*((volatile uint32_t *) GEMMINI_BUSY_ADDR)) asm volatile ("nop"); } +#undef ROCC_INSTRUCTION_RS1_RS2 #define ROCC_INSTRUCTION_RS1_RS2(x, rs1, rs2, funct) { \ /* printf("function %d\n", funct); */ \ uint32_t instruction = (0x7B) | (0 << 7) | (3 << 12) | (1 << 15) | (2 << 20) | ((uint32_t) (funct) << 25); \ - *((volatile uint64_t*) GEMMINI_RS1_ADDR) = (uint64_t) (rs1); \ - *((volatile uint64_t*) GEMMINI_RS2_ADDR) = (uint64_t) (rs2); \ + *((volatile uint64_t *) GEMMINI_RS1_ADDR) = (volatile uint64_t) (rs1); \ + *((volatile uint64_t *) GEMMINI_RS2_ADDR) = (volatile uint64_t) (rs2); \ /* *((volatile uint32_t*) GEMMINI_RS2_ADDR) = (uint32_t) ((uint64_t) (rs2) & 0xFFFFFFFFULL); */ \ /* *((volatile uint32_t*) (GEMMINI_RS2_ADDR + 4)) = (uint32_t) ((uint64_t) (rs2) >> 32); */ \ - pfence(); \ /* gemmini_fence(); */ \ *((volatile uint32_t*) GEMMINI_INST_ADDR) = instruction; \ /* sprintf((char *) PRINT_BUF, "%llx %llx %d\n", rs1, rs2, funct); */ \ @@ -54,66 +55,104 @@ static void sp_tiled_matmul_full_spad_ws(const uint32_t A_sp_addr_start, const u bool no_bias, bool repeating_bias, int act) { + gemmini_loop_ws_spad(I, J, K, pad_I, pad_J, pad_K, + A_sp_addr_start, B_sp_addr_start + K * J * DIM, NULL, C_dst_sp_addr_start, + a_transpose, b_transpose, + full_C, low_D, false, + act, 0, 0, false); + 
return; + + // const uint32_t A_sp_addr_start = 0; // const uint32_t B_sp_addr_start = BANK_NUM * BANK_ROWS - K * J * DIM; // const uint32_t D_sp_addr_start = 1 << (ADDR_LEN-1); - const uint32_t C_sp_addr_start = 3 << (ADDR_LEN-2) | (full_C << (ADDR_LEN-3)); + const uint32_t C_sp_addr_start = 2 << (ADDR_LEN-2) | (full_C << (ADDR_LEN-3)); // const int D_blocks = low_D ? (J <= MAX_BLOCK_LEN ? J : MAX_BLOCK_LEN) : // (J <= MAX_BLOCK_LEN_ACC ? J : MAX_BLOCK_LEN_ACC); const int C_blocks = 1; //full_C ? 1 : (J <= MAX_BLOCK_LEN ? J : MAX_BLOCK_LEN); // const size_t sizeof_D = low_D ? sizeof(elem_t) : sizeof(acc_t); const size_t sizeof_C = full_C ? sizeof(acc_t) : sizeof(elem_t); + fence(); - for (size_t k = 0; k < K; k++) { - for (size_t j = 0; j < J; j++) { - for (size_t i = 0; i < I; i++) { - const uint32_t A_sp_addr = a_transpose ? (A_sp_addr_start + (k*I + i)*DIM) : - (A_sp_addr_start + (i*K + k)*DIM); - const uint32_t B_sp_addr = b_transpose ? (B_sp_addr_start + (j*K + k)*DIM) : - (B_sp_addr_start + (k*J + j)*DIM); - const uint32_t C_sp_addr = C_sp_addr_start + (i*J + j)*DIM; - // Compute - { + if (a_transpose || b_transpose || (I < 4)) { + for (size_t k = 0; k < K; k++) { + for (size_t j = 0; j < J; j++) { + for (size_t i = 0; i < I; i++) { + const uint32_t A_sp_addr = a_transpose ? (A_sp_addr_start + (k*I + i)*DIM) : + (A_sp_addr_start + (i*K + k)*DIM); + const uint32_t B_sp_addr = b_transpose ? (B_sp_addr_start + (j*K + k)*DIM) : + (B_sp_addr_start + (k*J + j)*DIM); + const uint32_t C_sp_addr = C_sp_addr_start + (i*J + j)*DIM; + // Compute uint32_t pre_sp_addr = i == 0 ? B_sp_addr : GARBAGE_ADDR; - uint32_t out_sp_addr = C_sp_addr; - // If we're not using a bias, then we want to overwrite what's in the - // accumulator, rather than writing over it - int no_bias_new_matrix = (k == 0); // no_bias && D != NULL && k == 0; - if (no_bias_new_matrix) { - out_sp_addr &= ~(1 << (ADDR_LEN-2)); - } - const size_t A_cols = DIM; // - (k == K - 1 ? 
pad_K : 0); - const size_t A_rows = DIM; // - (i == I - 1 ? pad_I : 0); - const size_t B_cols = DIM; // - (j == J - 1 ? pad_J : 0); - const size_t B_rows = DIM; // - (k == K - 1 ? pad_K : 0); - const size_t C_cols = DIM; // - (j == J - 1 ? pad_J : 0); - const size_t C_rows = DIM; // - (i == I - 1 ? pad_I : 0); - gemmini_extended_preload(pre_sp_addr, out_sp_addr, B_cols, B_rows, DIM, DIM); + uint32_t out_sp_addr = C_sp_addr | ((k == 0 ? 0 : 1) << (ADDR_LEN-2)); + gemmini_extended_preload(pre_sp_addr, out_sp_addr, DIM, DIM, DIM, DIM); if (i == 0) { // First iteration - gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, A_cols, A_rows, DIM, DIM); + gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, DIM, DIM, DIM, DIM); } else { // All other iterations - gemmini_extended_compute_accumulated(A_sp_addr, GARBAGE_ADDR, A_cols, A_rows, DIM, DIM); + gemmini_extended_compute_accumulated(A_sp_addr, GARBAGE_ADDR, DIM, DIM, DIM, DIM); + } + if (k == K - 1) { + // Move-out C (if not normalizing) + // if (((act != LAYERNORM) && (act != SOFTMAX)) && (j == J-1 || j % C_blocks == C_blocks-1)) { + const size_t rounded_j = j; // (j / C_blocks) * C_blocks; + const uint32_t rounded_C_sp_addr = C_sp_addr; // C_sp_addr_start + (i*J + rounded_j)*DIM; + + const uint32_t C_dst_sp_addr = ((uint32_t) C_dst_sp_addr_start) + (i * J + rounded_j) * DIM; // * DIM * sizeof_C; + + // const size_t blocks = rounded_j + C_blocks <= J ? C_blocks : J-rounded_j; + constexpr size_t cols = DIM; // blocks * DIM - (rounded_j + blocks >= J ? pad_J : 0); + constexpr size_t rows = DIM; // DIM - (i == I - 1 ? 
pad_I : 0); + + gemmini_extended_mvout_spad(C_dst_sp_addr, 1, rounded_C_sp_addr, cols, rows); + // } } } - if (k == K - 1) { - // Move-out C (if not normalizing) - // if (((act != LAYERNORM) && (act != SOFTMAX)) && (j == J-1 || j % C_blocks == C_blocks-1)) { - const size_t rounded_j = (j / C_blocks) * C_blocks; - const uint32_t rounded_C_sp_addr = C_sp_addr_start + (i*J + rounded_j)*DIM; - - uint32_t C_dst_sp_addr = ((uint32_t) C_dst_sp_addr_start) + (i * J + rounded_j) * DIM; // * DIM * sizeof_C; - - const size_t blocks = rounded_j + C_blocks <= J ? C_blocks : J-rounded_j; - const size_t cols = DIM; // blocks * DIM - (rounded_j + blocks >= J ? pad_J : 0); - const size_t rows = DIM; // DIM - (i == I - 1 ? pad_I : 0); - - gemmini_extended_mvout_spad(C_dst_sp_addr, 1, rounded_C_sp_addr, cols, rows); - // } + } + } + } else { + for (size_t k = 0; k < K; k++) { + for (size_t j = 0; j < J; j++) { + uint32_t A_sp_addr = A_sp_addr_start + k * DIM; // (i*K + k)*DIM; + const uint32_t B_sp_addr = B_sp_addr_start + (k*J + j)*DIM; + uint32_t C_sp_addr = C_sp_addr_start + j * DIM; // (i*J + j)*DIM; + for (size_t i = 0; i < I; i += 4) { + // Compute + // constexpr uint32_t pre_sp_addr = i == 0 ? B_sp_addr : GARBAGE_ADDR; + const uint32_t out_sp_addr = C_sp_addr | ((k == 0 ? 
0 : 1) << (ADDR_LEN-2)); + if (i == 0) { // First iteration + gemmini_extended_preload(B_sp_addr, out_sp_addr, DIM, DIM, DIM, DIM); + gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, DIM, DIM, DIM, DIM); + gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + J * DIM, DIM, DIM, DIM, DIM); + gemmini_extended_compute_accumulated(A_sp_addr + K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM); + gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + 2 * J * DIM, DIM, DIM, DIM, DIM); + gemmini_extended_compute_accumulated(A_sp_addr + 2 * K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM); + gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + 3 * J * DIM, DIM, DIM, DIM, DIM); + gemmini_extended_compute_accumulated(A_sp_addr + 3 * K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM); + } else { // All other iterations + gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr, DIM, DIM, DIM, DIM); + gemmini_extended_compute_accumulated(A_sp_addr, GARBAGE_ADDR, DIM, DIM, DIM, DIM); + gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + J * DIM, DIM, DIM, DIM, DIM); + gemmini_extended_compute_accumulated(A_sp_addr + K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM); + gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + 2 * J * DIM, DIM, DIM, DIM, DIM); + gemmini_extended_compute_accumulated(A_sp_addr + 2 * K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM); + gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + 3 * J * DIM, DIM, DIM, DIM, DIM); + gemmini_extended_compute_accumulated(A_sp_addr + 3 * K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM); + } + if (k == K - 1) { + for (int x = 0; x < 3; x++) fence(); + gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + (i * J + j) * DIM, 1, C_sp_addr, DIM, DIM); + gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + ((i + 1) * J + j) * DIM, 1, C_sp_addr + J * DIM, DIM, DIM); + gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + ((i + 2) * J + j) * DIM, 1, C_sp_addr + 2 * J * DIM, DIM, DIM); + gemmini_extended_mvout_spad((uint32_t) 
C_dst_sp_addr_start + ((i + 3) * J + j) * DIM, 1, C_sp_addr + 3 * J * DIM, DIM, DIM); + } + A_sp_addr += 4 * K * DIM; + C_sp_addr += 4 * J * DIM; } } } } - pfence(); + fence(); } From c18267443f5e4f7804d07d3ca61a8074a511b9f3 Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Wed, 20 Mar 2024 15:22:25 -0700 Subject: [PATCH 34/75] matmul kernel switch to proper fence and fsm --- tests/kernel/gemmini_mmio/main.cpp | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/tests/kernel/gemmini_mmio/main.cpp b/tests/kernel/gemmini_mmio/main.cpp index 8ef76e96..053e0c25 100644 --- a/tests/kernel/gemmini_mmio/main.cpp +++ b/tests/kernel/gemmini_mmio/main.cpp @@ -16,19 +16,23 @@ int main() { // load up A and B and C uint32_t spad_A = 0x00000000; - uint32_t spad_B = 0x00000100; // 16B word addressed + uint32_t spad_B = 0x00000080; // 16B word addressed uint32_t acc_C = 0x80000000; // accmem + accumulate - uint32_t spad_C = 0x00000200; + uint32_t spad_C = 0x00000100; - float *smem_A = (float *) SPAD_TO_SMEM(spad_A); // 0xff000000; // byte addressed + volatile float *smem_A = (float *) SPAD_TO_SMEM(spad_A); // 0xff000000; // byte addressed float *smem_B = (float *) SPAD_TO_SMEM(spad_B); // 0xff000200; float *smem_C = (float *) SPAD_TO_SMEM(spad_C); // 0xff000400; - int I = 5; - int J = 5; - int K = 5; + int I = 32 / DIM; + int J = 32 / DIM; + int K = 32 / DIM; - gemmini_config_st(DIM * 4 * J) + sprintf(print_buf, "A spad: 0x%x-0x%x, smem: 0x%x-%x\n", spad_A, spad_A + I * K * DIM, (uint32_t) smem_A, (uint32_t) smem_A + sizeof(float) * I * K * DIM * DIM); + sprintf(print_buf, "B spad: 0x%x-0x%x, smem: 0x%x-%x\n", spad_B, spad_B + K * J * DIM, (uint32_t) smem_B, (uint32_t) smem_B + sizeof(float) * K * J * DIM * DIM); + sprintf(print_buf, "C spad: 0x%x-0x%x, smem: 0x%x-%x\n", spad_C, spad_C + I * J * DIM, (uint32_t) smem_C, (uint32_t) smem_C + sizeof(float) * I * J * DIM * DIM); + + gemmini_config_st(DIM * 4 * J); // load A with 128->1 in 
row-major order for (int i = 0; i < I; i++) { @@ -50,12 +54,12 @@ int main() { } } - for (int i = 0; i < I * J * DIM * DIM; i++) smem_C[i] = 0.f; + for (int i = 0; i < I * J * DIM * DIM; i++) smem_C[i] = 1.f; - pfence(); + fence(); // sprintf(print_buf, "\nA in\n"); - // for (int i = I * DIM - 1; i < I * DIM; i++) { + // for (int i = 0; i < I * DIM; i++) { // for (int j = 0; j < K * DIM; j++) { // sprintf(print_buf, "%d ", (int) (smem_A[SMEM_MAT_OFFSET(i, j, K * DIM)])); // } @@ -81,7 +85,7 @@ int main() { /*a_transpose=*/0, /*b_transpose=*/0, /*full_C=*/0, /*low_D=*/0, /*no_bias=*/1, /*repeating_bias=*/0, /*act=*/NO_ACTIVATION); - for (int i = 0; i < 32; i++) pfence(); + fence(); // check results for (int i = 0; i < I * DIM; i++) { From f590c4b41744bce9d3340f3c063c8a468c1d329c Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 24 Mar 2024 01:44:49 -0700 Subject: [PATCH 35/75] Add vx_spawn.h as dependency to kernel/Makefile --- kernel/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/Makefile b/kernel/Makefile index 07b8c97b..575707f8 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -51,10 +51,10 @@ $(PROJECT).dump: $(PROJECT).a %.S.o: src/%.S $(CC) $(CFLAGS) -c $< -o $@ -%.cpp.o: src/%.cpp +%.cpp.o: src/%.cpp include/vx_spawn.h $(CXX) $(CFLAGS) -c $< -o $@ -%.c.o: src/%.c +%.c.o: src/%.c include/vx_spawn.h $(CC) $(CFLAGS) -c $< -o $@ $(PROJECT).a: $(OBJS) From 8f3474b15167e1fdbc04c00c3da2c2e90fd07972 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 24 Mar 2024 01:45:08 -0700 Subject: [PATCH 36/75] Don't clean *.bin --- tests/regression/common.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/regression/common.mk b/tests/regression/common.mk index 6a858edc..d38df853 100644 --- a/tests/regression/common.mk +++ b/tests/regression/common.mk @@ -115,7 +115,7 @@ clean: rm -rf $(PROJECT) *.o .depend clean-all: clean - rm -rf *.elf *.bin *.dump + rm -rf *.elf *.dump ifneq ($(MAKECMDGOALS),clean) 
-include .depend From 7d177492b2f99f4dd388caf2ca9d9be167f02036 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 24 Mar 2024 01:45:30 -0700 Subject: [PATCH 37/75] Move CORES_PER_CLUSTER to vx_spawn.h --- kernel/include/vx_spawn.h | 2 ++ kernel/src/vx_spawn.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/include/vx_spawn.h b/kernel/include/vx_spawn.h index 2584b997..321e3f83 100644 --- a/kernel/include/vx_spawn.h +++ b/kernel/include/vx_spawn.h @@ -17,6 +17,8 @@ #include #include +#define CORES_PER_CLUSTER 2 + #ifdef __cplusplus extern "C" { #endif diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c index c4c00a06..c57e55f2 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -15,8 +15,6 @@ #include #include -#define CORES_PER_CLUSTER 2 - #ifdef __cplusplus extern "C" { #endif From ff401bdec0eb4a916d0fc126136620ceb0c0531c Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 24 Mar 2024 01:47:00 -0700 Subject: [PATCH 38/75] Cleanup tests/.gitignore --- tests/.gitignore | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/.gitignore b/tests/.gitignore index a9884992..30ca0fa4 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1 +1,7 @@ **/*.log +.depend +*.bin +*.dump +*.elf +*.o +*.ll From cc7b34ec5b9dc92200acb80f78c34a689219a52b Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 26 Mar 2024 10:44:02 -0700 Subject: [PATCH 39/75] vecaddx: Write args.bin and input.bin --- tests/regression/vecaddx/main.cpp | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/tests/regression/vecaddx/main.cpp b/tests/regression/vecaddx/main.cpp index 117f3470..4f3b77af 100644 --- a/tests/regression/vecaddx/main.cpp +++ b/tests/regression/vecaddx/main.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -201,10 +202,19 @@ int main(int argc, char *argv[]) { memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); RT_CHECK(vx_copy_to_dev(device, 
KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); + std::ofstream file("args.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open args.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(staging_buf.data()), sizeof(kernel_arg_t)); + file.close(); + // generate source data source_data.resize(2 * num_points); for (uint32_t i = 0; i < source_data.size(); ++i) { - source_data[i] = Comparator::generate(); + // source_data[i] = Comparator::generate(); + source_data[i] = static_cast(i); } // upload source buffer0 @@ -215,6 +225,14 @@ int main(int argc, char *argv[]) { buf_ptr[i] = source_data[2 * i + 0]; } RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size)); + + std::ofstream file("input.a.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open input.a.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(buf_ptr), buf_size); + file.close(); } // upload source buffer1 @@ -225,6 +243,14 @@ int main(int argc, char *argv[]) { buf_ptr[i] = source_data[2 * i + 1]; } RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size)); + + std::ofstream file("input.b.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open input.b.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(buf_ptr), buf_size); + file.close(); } // clear destination buffer @@ -243,4 +269,4 @@ int main(int argc, char *argv[]) { std::cout << "PASSED!" 
<< std::endl; return 0; -} \ No newline at end of file +} From 7f00e6c37665662471ecfa54a9a2f8a3e04d2253 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 26 Mar 2024 10:44:33 -0700 Subject: [PATCH 40/75] vecaddx: Change arg device address to 7fff0000 --- tests/regression/vecaddx/common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/regression/vecaddx/common.h b/tests/regression/vecaddx/common.h index 2b8f164a..a7b26936 100644 --- a/tests/regression/vecaddx/common.h +++ b/tests/regression/vecaddx/common.h @@ -1,7 +1,7 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 +#define KERNEL_ARG_DEV_MEM_ADDR 0x7fff0000 #ifndef TYPE #define TYPE float From f050a08d77792c7d4ad2d83a1725776a2c929835 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 26 Mar 2024 10:45:14 -0700 Subject: [PATCH 41/75] Write vx_spawn_tasks_cluster This scheduling logic tries to evenly distribute warps across *all* cores, instead of trying to fill up the first cores as much as possible. This scheme is necessary for the intra-cluster cores which are assumed to have equal workloads distributed. 
--- kernel/include/vx_spawn.h | 1 + kernel/src/vx_spawn.c | 81 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/kernel/include/vx_spawn.h b/kernel/include/vx_spawn.h index 321e3f83..06a85af7 100644 --- a/kernel/include/vx_spawn.h +++ b/kernel/include/vx_spawn.h @@ -50,6 +50,7 @@ void vx_wspawn_wait(); void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg); void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback, void * arg); +void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void * arg); void vx_serial(vx_serial_cb callback, void * arg); diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c index c57e55f2..04b58253 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -140,6 +140,87 @@ static void __attribute__ ((noinline)) spawn_tasks_all_cb() { vx_tmc_zero(); } +void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg) { + // device specs + int NC = vx_num_cores(); + int NW = vx_num_warps(); + int NT = vx_num_threads(); + + // current core id + int core_id = vx_core_id(); + if (core_id >= NUM_CORES_MAX) + return; + + // Distribute threads equally across as many cores as possible, even if they + // don't fill up NW*NT in a single core. This makes sure the warps get evenly + // distributed in a single cluster + // + // TODO: Try to contain in a single cluster if possible? + int num_active_cores = (num_tasks > NT) ? 
(num_tasks / NT) : 1; + num_active_cores = MIN(num_active_cores, NC); + if (core_id >= num_active_cores) + return; // terminate extra cores + + int tasks_per_core = num_tasks / num_active_cores; + int tasks_per_core_last = tasks_per_core; + if (core_id == (num_active_cores - 1)) { + int rem = num_tasks % num_active_cores; + tasks_per_core_last += rem; // last core also executes remaining tasks + } + + int num_full_warps = tasks_per_core_last / NT; + int rem_threads_in_last_warp = tasks_per_core_last % NT; + // sequential iterations + int num_full_waves = 1; + int rem_warps_in_last_wave = 0; + if (num_full_warps >= NW) { + // this division will result in the same value for both the last core and + // the rest + num_full_waves = num_full_warps / NW; + rem_warps_in_last_wave = num_full_warps % NW; + } + + int cluster_id = core_id / CORES_PER_CLUSTER; + const int tasks_per_cluster = tasks_per_core * CORES_PER_CLUSTER; + const int offset = cluster_id * tasks_per_cluster; + wspawn_tasks_args_t wspawn_args = {callback, arg, offset, num_full_waves, rem_warps_in_last_wave}; + g_wspawn_args[core_id] = &wspawn_args; + + if (num_full_warps >= 1) { + // execute callback on other warps + int nw = MIN(num_full_warps, NW); + vx_wspawn(nw, spawn_tasks_all_cb); + + // activate all threads + vx_tmc(-1); + + // call stub routine + spawn_tasks_cluster_all_stub(); + + // back to single-threaded + vx_tmc_one(); + + // wait for spawn warps to terminate + vx_wspawn_wait(); + } + + if (rem_threads_in_last_warp != 0) { + // adjust offset + wspawn_args.offset += (tasks_per_core_last - rem_threads_in_last_warp); + + // activate remaining threads + int tmask = (1 << rem_threads_in_last_warp) - 1; + vx_tmc(tmask); + + // call stub routine + // FIXME: unimplemented for cluster! 
+ spawn_tasks_rem_stub(); + + // back to single-threaded + vx_tmc_one(); + } +} + void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) { // device specs int NC = vx_num_cores(); From 3729a05adccd890a991a1b6984204861de4a9ef2 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 26 Mar 2024 16:36:57 -0700 Subject: [PATCH 42/75] vx_spawn.c: Separate cluster-based scheduling code from original --- kernel/include/vx_spawn.h | 2 ++ kernel/src/vx_spawn.c | 23 +++++++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/kernel/include/vx_spawn.h b/kernel/include/vx_spawn.h index 06a85af7..8ebbab09 100644 --- a/kernel/include/vx_spawn.h +++ b/kernel/include/vx_spawn.h @@ -17,7 +17,9 @@ #include #include +#ifndef CORES_PER_CLUSTER #define CORES_PER_CLUSTER 2 +#endif #ifdef __cplusplus extern "C" { diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c index 04b58253..87688e1c 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -128,7 +128,7 @@ static void __attribute__ ((noinline)) spawn_tasks_rem_stub() { (p_wspawn_args->callback)(task_id, p_wspawn_args->arg); } -static void __attribute__ ((noinline)) spawn_tasks_all_cb() { +static void __attribute__ ((noinline)) spawn_tasks_cluster_all_cb() { // activate all threads vx_tmc(-1); @@ -140,6 +140,17 @@ static void __attribute__ ((noinline)) spawn_tasks_all_cb() { vx_tmc_zero(); } +static void __attribute__ ((noinline)) spawn_tasks_all_cb() { + // activate all threads + vx_tmc(-1); + + // call stub routine + spawn_tasks_all_stub(); + + // disable warp + vx_tmc_zero(); +} + void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg) { // device specs int NC = vx_num_cores(); @@ -189,7 +200,7 @@ void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg if (num_full_warps >= 1) { // execute callback on other warps int nw = MIN(num_full_warps, NW); - vx_wspawn(nw, spawn_tasks_all_cb); + vx_wspawn(nw, 
spawn_tasks_cluster_all_cb); // activate all threads + vx_tmc(-1); @@ -256,11 +267,7 @@ void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) { rW = TW - fW * NW; // remaining warps } - int cluster_id = core_id / CORES_PER_CLUSTER; - const int tasks_per_cluster = tasks_per_core * CORES_PER_CLUSTER; - const int offset = cluster_id * tasks_per_cluster; - wspawn_tasks_args_t wspawn_args = { callback, arg, offset, fW, rW }; - // wspawn_tasks_args_t wspawn_args = { callback, arg, core_id * tasks_per_core, fW, rW }; + wspawn_tasks_args_t wspawn_args = { callback, arg, core_id * tasks_per_core, fW, rW }; g_wspawn_args[core_id] = &wspawn_args; if (TW >= 1) { @@ -272,7 +279,7 @@ void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) { vx_tmc(-1); // call stub routine - spawn_tasks_cluster_all_stub(); + spawn_tasks_all_stub(); // back to single-threaded vx_tmc_one(); From 4d2c0084d126b1d14252c716e320b9ede808d295 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 26 Mar 2024 16:37:44 -0700 Subject: [PATCH 43/75] common.mk: Compile separate cluster ELF ... using -DRADIANCE, which the kernel C code uses explicitly to switch between vx_spawn_tasks and vx_spawn_tasks_cluster. This is to ease running both simX and Chipyard simulations without mixing up binaries. 
--- tests/regression/common.mk | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/regression/common.mk b/tests/regression/common.mk index d38df853..087561b0 100644 --- a/tests/regression/common.mk +++ b/tests/regression/common.mk @@ -78,17 +78,23 @@ endif endif endif -all: $(PROJECT) kernel.bin kernel.dump +all: $(PROJECT) kernel.bin kernel.dump kernel.radiance.dump kernel.dump: kernel.elf $(VX_DP) -D kernel.elf > kernel.dump -kernel.bin: kernel.elf +kernel.radiance.dump: kernel.radiance.elf + $(VX_DP) -D kernel.radiance.elf > kernel.radiance.dump + +kernel.bin: kernel.elf kernel.radiance.elf $(VX_CP) -O binary kernel.elf kernel.bin kernel.elf: $(VX_SRCS) $(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf +kernel.radiance.elf: $(VX_SRCS) + $(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -DRADIANCE -o kernel.radiance.elf + $(PROJECT): $(SRCS) $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ From b54580949604e19093610c1d9d5f6dc1697b6703 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 26 Mar 2024 16:42:36 -0700 Subject: [PATCH 44/75] vecaddx: Use -DRADIANCE --- tests/regression/vecaddx/kernel.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/regression/vecaddx/kernel.cpp b/tests/regression/vecaddx/kernel.cpp index 6ed42164..6e782586 100644 --- a/tests/regression/vecaddx/kernel.cpp +++ b/tests/regression/vecaddx/kernel.cpp @@ -13,6 +13,10 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { int main() { kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; +#ifdef RADIANCE + vx_spawn_tasks_cluster(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg); +#else vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg); +#endif return 0; } From b88dbd7a83dde81ddf50b07466fe58ae5374127a Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Tue, 26 Mar 2024 16:43:49 -0700 Subject: [PATCH 45/75] add cycle count and multi core support --- tests/kernel/gemmini_mmio/main.cpp | 11 ++++++++++- 1 file 
changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/kernel/gemmini_mmio/main.cpp b/tests/kernel/gemmini_mmio/main.cpp index 053e0c25..3eaf3621 100644 --- a/tests/kernel/gemmini_mmio/main.cpp +++ b/tests/kernel/gemmini_mmio/main.cpp @@ -79,13 +79,22 @@ int main() { // gemmini_extended_compute_preloaded(spad_A, GARBAGE_ADDR, DIM, DIM, DIM, DIM); // gemmini_extended_mvout(0xc0000000, 0xff000000, DIM, DIM); // gemmini_extended_mvout_spad(spad_C, 1, acc_C, DIM, DIM); - + + uint32_t core_id; + asm volatile ("csrr %0, 0xcc2" : "=r" (core_id)); + printf("core id %d\n", core_id); + if (core_id > 0) return 0; + + uint32_t start_cycles, end_cycles; + asm volatile ("csrr %0, mcycle" : "=r" (start_cycles)); sp_tiled_matmul_full_spad_ws(spad_A, spad_B, /*spad_D=*/0, spad_C, /*I=*/I, /*J=*/J, /*K=*/K, /*pad_I=*/0, /*pad_J=*/0, /*pad_K=*/0, /*a_transpose=*/0, /*b_transpose=*/0, /*full_C=*/0, /*low_D=*/0, /*no_bias=*/1, /*repeating_bias=*/0, /*act=*/NO_ACTIVATION); fence(); + asm volatile ("csrr %0, mcycle" : "=r" (end_cycles)); + sprintf(print_buf, "gemmini cycles taken: %d\n", end_cycles - start_cycles); // check results for (int i = 0; i < I * DIM; i++) { From df1f7f242a05d2d8fa21e3cd29994943545a121f Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 26 Mar 2024 23:51:59 -0700 Subject: [PATCH 46/75] vx_spawn.c: Implement spawn_tasks_cluster_rem_stub --- kernel/src/vx_spawn.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c index 87688e1c..fb36b0bc 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -102,14 +102,15 @@ static void __attribute__ ((noinline)) spawn_tasks_cluster_all_stub() { int wid = vx_warp_id(); int tid = vx_thread_id(); - const int core_id_in_cluster = vx_core_id() % CORES_PER_CLUSTER; - const int cluster_wid = CORES_PER_CLUSTER * wid + core_id_in_cluster; + const int core_id_in_cluster = cid % CORES_PER_CLUSTER; + // round-robin warp_id 
allocation across cores in cluster + const int wid_in_cluster = CORES_PER_CLUSTER * wid + core_id_in_cluster; wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; // FIXME: handle RW int waves = p_wspawn_args->NWs; - int offset = p_wspawn_args->offset + (NT * cluster_wid + tid); + int offset = p_wspawn_args->offset + (NT * wid_in_cluster + tid); vx_spawn_tasks_cb callback = p_wspawn_args->callback; void* arg = p_wspawn_args->arg; @@ -128,6 +129,25 @@ static void __attribute__ ((noinline)) spawn_tasks_rem_stub() { (p_wspawn_args->callback)(task_id, p_wspawn_args->arg); } +static void __attribute__ ((noinline)) spawn_tasks_cluster_rem_stub() { + int NT = vx_num_threads(); + int cid = vx_core_id(); + int tid = vx_thread_id(); + int wid = vx_warp_id(); + + const int core_id_in_cluster = cid % CORES_PER_CLUSTER; + // round-robin warp_id allocation across cores in cluster + const int wid_in_cluster = CORES_PER_CLUSTER * wid + core_id_in_cluster; + + wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; + // FIXME: This assumes that all cores but the last one are working with full + // warps, and only the last core has a partially-filled warp. + int offset = p_wspawn_args->offset + (NT * wid_in_cluster + tid); + + int task_id = offset; + (p_wspawn_args->callback)(task_id, p_wspawn_args->arg); +} + static void __attribute__ ((noinline)) spawn_tasks_cluster_all_cb() { // activate all threads vx_tmc(-1); @@ -224,8 +244,7 @@ void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg vx_tmc(tmask); // call stub routine - // FIXME: unimplemented for cluster! 
- spawn_tasks_rem_stub(); + spawn_tasks_cluster_rem_stub(); // back to single-threaded vx_tmc_one(); From 4e834f21035c72e93a16cc0cfbf7b190b16324a1 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 27 Mar 2024 15:09:45 -0700 Subject: [PATCH 47/75] vx_spawn.c: Rewrite cluster-based vx_spawn_tasks variant Implements round-robin allocation of warps to cores & maintains contiguous thread ID allocation to neighboring threads. Also handles partially-enabled remainder warp logic. TODO: Hardcodes only 1 cluster in the system. --- kernel/src/vx_spawn.c | 88 ++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 48 deletions(-) diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c index fb36b0bc..8e5002f4 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -74,27 +74,6 @@ static void __attribute__ ((noinline)) spawn_tasks_all_stub() { } } -static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() { - int NT = vx_num_threads(); - int NW = vx_num_warps(); - int cid = vx_core_id(); - int wid = vx_warp_id(); - int tid = vx_thread_id(); - - wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; - - // FIXME: handle RW - int waves = p_wspawn_args->NWs; - int offset = p_wspawn_args->offset + (NT * wid + tid); - - vx_spawn_tasks_cb callback = p_wspawn_args->callback; - void* arg = p_wspawn_args->arg; - for (int wave_id = 0; wave_id < waves; ++wave_id) { - int task_id = offset + (wave_id * NT * NW); - callback(task_id, arg); - } -} - static void __attribute__ ((noinline)) spawn_tasks_cluster_all_stub() { int NT = vx_num_threads(); int NW = vx_num_warps(); @@ -109,11 +88,13 @@ static void __attribute__ ((noinline)) spawn_tasks_cluster_all_stub() { wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; // FIXME: handle RW - int waves = p_wspawn_args->NWs; + int waves = p_wspawn_args->NWs + (wid < p_wspawn_args->RWs); int offset = p_wspawn_args->offset + (NT * wid_in_cluster + 
tid); vx_spawn_tasks_cb callback = p_wspawn_args->callback; void* arg = p_wspawn_args->arg; + + // sequential iterations for (int wave_id = 0; wave_id < waves; ++wave_id) { int task_id = offset + (wave_id * NT * NW * CORES_PER_CLUSTER); callback(task_id, arg); @@ -171,6 +152,9 @@ static void __attribute__ ((noinline)) spawn_tasks_all_cb() { vx_tmc_zero(); } +// This function runs in every core, but with only 1 warp and 1 thread enabled. +// The logic in this function figures out how many warps/threads this particular +// core has to enable to fulfill an entire grid of computation. void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg) { // device specs int NC = vx_num_cores(); @@ -181,45 +165,49 @@ void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg int core_id = vx_core_id(); if (core_id >= NUM_CORES_MAX) return; + const int cluster_id = core_id / CORES_PER_CLUSTER; + const int core_id_in_cluster = core_id % CORES_PER_CLUSTER; // Distribute threads equally across as many cores as possible, even if they // don't fill up NW*NT in a single core. This makes sure the warps get evenly // distributed in a single cluster // // TODO: Try to contain in a single cluster if possible? - int num_active_cores = (num_tasks > NT) ? (num_tasks / NT) : 1; - num_active_cores = MIN(num_active_cores, NC); + const int num_active_cores = (num_tasks + (NT - 1)) / NT; if (core_id >= num_active_cores) return; // terminate extra cores - int tasks_per_core = num_tasks / num_active_cores; - int tasks_per_core_last = tasks_per_core; - if (core_id == (num_active_cores - 1)) { - int rem = num_tasks % num_active_cores; - tasks_per_core_last += rem; // last core also executes remaining tasks + // FIXME: HARDCODES 1 CLUSTER! 
+ const int num_tasks_this_cluster = num_tasks; + const int num_full_warps = num_tasks_this_cluster / NT; + const int rem_threads_in_last_warp = num_tasks_this_cluster % NT; + // const int num_warps = (num_tasks_this_cluster + (NT - 1)) / NT; + + int num_warps_this_core = num_full_warps / CORES_PER_CLUSTER; + const int num_warps_in_last_row = num_full_warps % CORES_PER_CLUSTER; + if (core_id_in_cluster < num_warps_in_last_row) { + num_warps_this_core++; + } + // if 0, last warp is full-threads enabled + int rem_threads_in_last_warp_this_core = 0; + if (rem_threads_in_last_warp != 0) { + if (core_id_in_cluster == num_warps_in_last_row - 1) { + rem_threads_in_last_warp_this_core = rem_threads_in_last_warp; + } } - int num_full_warps = tasks_per_core_last / NT; - int rem_threads_in_last_warp = tasks_per_core_last % NT; // sequential iterations - int num_full_waves = 1; - int rem_warps_in_last_wave = 0; - if (num_full_warps >= NW) { - // this division will result in the same value for both the last core and - // the rest - num_full_waves = num_full_warps / NW; - rem_warps_in_last_wave = num_full_warps % NW; - } + const int num_full_waves = num_warps_this_core / NW; + const int rem_full_warps_in_last_wave = num_warps_this_core % NW; - int cluster_id = core_id / CORES_PER_CLUSTER; - const int tasks_per_cluster = tasks_per_core * CORES_PER_CLUSTER; - const int offset = cluster_id * tasks_per_cluster; - wspawn_tasks_args_t wspawn_args = {callback, arg, offset, num_full_waves, rem_warps_in_last_wave}; + const const int offset = cluster_id * num_tasks_this_cluster; + wspawn_tasks_args_t wspawn_args = {callback, arg, offset, num_full_waves, + rem_full_warps_in_last_wave}; g_wspawn_args[core_id] = &wspawn_args; - if (num_full_warps >= 1) { + if (num_warps_this_core > 0) { // execute callback on other warps - int nw = MIN(num_full_warps, NW); + const int nw = MIN(num_warps_this_core, NW); vx_wspawn(nw, spawn_tasks_cluster_all_cb); // activate all threads @@ -235,12 +223,16 @@ 
void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg vx_wspawn_wait(); } - if (rem_threads_in_last_warp != 0) { + // TODO: Instead of launching an additional wave just to work on remaining + // threads, handle this in the last wave amongst other full warps. + if (rem_threads_in_last_warp != 0 && core_id_in_cluster == 0) { // adjust offset - wspawn_args.offset += (tasks_per_core_last - rem_threads_in_last_warp); + // FIXME: consider cluster_id here + // FIXME: use rem_threads_in_last_warp_this_core + wspawn_args.offset += (num_tasks_this_cluster - rem_threads_in_last_warp); // activate remaining threads - int tmask = (1 << rem_threads_in_last_warp) - 1; + const int tmask = (1 << rem_threads_in_last_warp) - 1; vx_tmc(tmask); // call stub routine From fa6adceb7e48b3a70454acc0f138ce2b705ea437 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 27 Mar 2024 15:15:38 -0700 Subject: [PATCH 48/75] vecaddx: Hardcode args/input device address to match chipyard Don't use mem_alloc/mem_free API --- tests/regression/vecaddx/main.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/regression/vecaddx/main.cpp b/tests/regression/vecaddx/main.cpp index 4f3b77af..e25ad5b4 100644 --- a/tests/regression/vecaddx/main.cpp +++ b/tests/regression/vecaddx/main.cpp @@ -107,9 +107,9 @@ static void parse_args(int argc, char **argv) { void cleanup() { if (device) { - vx_mem_free(device, kernel_arg.src0_addr); - vx_mem_free(device, kernel_arg.src1_addr); - vx_mem_free(device, kernel_arg.dst_addr); + // vx_mem_free(device, kernel_arg.src0_addr); + // vx_mem_free(device, kernel_arg.src1_addr); + // vx_mem_free(device, kernel_arg.dst_addr); vx_dev_close(device); } } @@ -182,9 +182,12 @@ int main(int argc, char *argv[]) { // allocate device memory std::cout << "allocate device memory" << std::endl; - RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr)); - RT_CHECK(vx_mem_alloc(device, buf_size, 
VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr)); - RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr)); + // RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr)); + // RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr)); + // RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr)); + kernel_arg.src0_addr = 0x20000UL; + kernel_arg.src1_addr = 0x28000UL; + kernel_arg.dst_addr = 0xc0000000UL; kernel_arg.num_points = num_points; From 870846f20fbe3d3ab307a32b6bbbd8d009901983 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 27 Mar 2024 15:38:52 -0700 Subject: [PATCH 49/75] vx_spawn.c: Create separate vx_spawn_tasks_contiguous --- kernel/include/vx_spawn.h | 1 + kernel/src/vx_spawn.c | 105 +++++++++++++++++++++++++++++++++++++- 2 files changed, 104 insertions(+), 2 deletions(-) diff --git a/kernel/include/vx_spawn.h b/kernel/include/vx_spawn.h index 8ebbab09..84dad2bc 100644 --- a/kernel/include/vx_spawn.h +++ b/kernel/include/vx_spawn.h @@ -53,6 +53,7 @@ void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg); void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback, void * arg); void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void * arg); +void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void * arg); void vx_serial(vx_serial_cb callback, void * arg); diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c index 8e5002f4..278516a3 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -74,6 +74,26 @@ static void __attribute__ ((noinline)) spawn_tasks_all_stub() { } } +static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() { + int NT = vx_num_threads(); + int NW = vx_num_warps(); + int cid = vx_core_id(); + int wid = vx_warp_id(); + int tid = vx_thread_id(); + + wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; + + 
int waves = p_wspawn_args->NWs + (wid < p_wspawn_args->RWs); + int offset = p_wspawn_args->offset + (NT * wid + tid); + + vx_spawn_tasks_cb callback = p_wspawn_args->callback; + void* arg = p_wspawn_args->arg; + for (int wave_id = 0; wave_id < waves; ++wave_id) { + int task_id = offset + (wave_id * NT * NW); + callback(task_id, arg); + } +} + static void __attribute__ ((noinline)) spawn_tasks_cluster_all_stub() { int NT = vx_num_threads(); int NW = vx_num_warps(); @@ -87,7 +107,6 @@ static void __attribute__ ((noinline)) spawn_tasks_cluster_all_stub() { wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; - // FIXME: handle RW int waves = p_wspawn_args->NWs + (wid < p_wspawn_args->RWs); int offset = p_wspawn_args->offset + (NT * wid_in_cluster + tid); @@ -129,12 +148,22 @@ static void __attribute__ ((noinline)) spawn_tasks_cluster_rem_stub() { (p_wspawn_args->callback)(task_id, p_wspawn_args->arg); } +static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_cb() { + // activate all threads + vx_tmc(-1); + + // call stub routine + spawn_tasks_contiguous_all_stub(); + + // disable warp + vx_tmc_zero(); +} + static void __attribute__ ((noinline)) spawn_tasks_cluster_all_cb() { // activate all threads vx_tmc(-1); // call stub routine - // spawn_tasks_all_stub(); spawn_tasks_cluster_all_stub(); // disable warp @@ -243,6 +272,78 @@ void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg } } +void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void * arg) { + // device specs + int NC = vx_num_cores(); + int NW = vx_num_warps(); + int NT = vx_num_threads(); + + // current core id + int core_id = vx_core_id(); + if (core_id >= NUM_CORES_MAX) + return; + + // calculate necessary active cores + int WT = NW * NT; + int nC = (num_tasks > WT) ? 
(num_tasks / WT) : 1; + int nc = MIN(nC, NC); + if (core_id >= nc) + return; // terminate extra cores + + // number of tasks per core + int tasks_per_core = num_tasks / nc; + int tasks_per_core_n1 = tasks_per_core; + if (core_id == (nc-1)) { + int rem = num_tasks - (nc * tasks_per_core); + tasks_per_core_n1 += rem; // last core also executes remaining tasks + } + + // number of tasks per warp + int TW = tasks_per_core_n1 / NT; // occupied warps + int rT = tasks_per_core_n1 - TW * NT; // remaining threads + int fW = 1, rW = 0; + if (TW >= NW) { + fW = TW / NW; // full warps iterations + rW = TW - fW * NW; // remaining warps + } + + wspawn_tasks_args_t wspawn_args = { callback, arg, core_id * tasks_per_core, fW, rW }; + g_wspawn_args[core_id] = &wspawn_args; + + if (TW >= 1) { + // execute callback on other warps + int nw = MIN(TW, NW); + vx_wspawn(nw, spawn_tasks_contiguous_all_cb); + + // activate all threads + vx_tmc(-1); + + // call stub routine + spawn_tasks_contiguous_all_stub(); + + // back to single-threaded + vx_tmc_one(); + + // wait for spawn warps to terminate + vx_wspawn_wait(); + } + + if (rT != 0) { + // adjust offset + wspawn_args.offset += (tasks_per_core_n1 - rT); + + // activate remaining threads + int tmask = (1 << rT) - 1; + vx_tmc(tmask); + + // call stub routine + spawn_tasks_rem_stub(); + + // back to single-threaded + vx_tmc_one(); + } +} + void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) { // device specs int NC = vx_num_cores(); From 09822764e7f3931edea74198eb86917ccd758086 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 27 Mar 2024 22:43:25 -0700 Subject: [PATCH 50/75] sgemm_wg: Remove software-based barrier implementation Intra-cluster barrier is now implemented in hardware, transparent to the ISA. 
--- tests/regression/sgemm_wg/kernel.cpp | 54 ---------------------------- 1 file changed, 54 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index 44299934..78f056fa 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -12,63 +12,9 @@ #define TM 2 #define TN 2 -#define DEV_BARRIER_MMIO_BASE_ADDR 0xff003f00UL -#define CORES_PER_CLUSTER 2 -#define BARRIER_STRIDE 4 - void threadblock_barrier(unsigned int tid_in_threadblock, unsigned int barrier_id, unsigned int count) { - vx_barrier(barrier_id, count); vx_fence(); - - // vx_printf("========== barrier! barrier_id=%u, count=%u\n", barrier_id, count); - -#if CORES_PER_CLUSTER != 0 - // this code doesn't work without the memory-mapped register implemented in - // hardware, hence the #ifdef. - - if (tid_in_threadblock == 0) { - volatile uint32_t *mmio = (volatile uint32_t *)(DEV_BARRIER_MMIO_BASE_ADDR); - int core_id = vx_core_id(); - // FIXME: hardcoded - const uint32_t barrier_stride = BARRIER_STRIDE; - const uint32_t barrier_offset = barrier_stride * barrier_id; - - // wait for the barrier to be initialized - while (mmio[barrier_offset + 1 + core_id] != 0); - - // signal internal-core synchronization done - mmio[barrier_offset + 1 + core_id] = 1; - - // wait for other cores in the cluster to finish by waiting on the - // all-synced read-only mmio reg - while (mmio[barrier_offset] == 0); - - // need to signal that this core passed the barrier; otherwise, if we - // reset this to 0 right away, the other core still waiting for the - // barrier might never see the all-sync mmio reg as 1. 
- mmio[barrier_offset + 1 + core_id] = 2; - - // // if this core is the last one passing the barrier, reset all per-core - // // flags to 0 to get ready for the next barrier - // bool all_passed = true; - // for (int i = 0; i < CORES_PER_CLUSTER; i++) { - // // if (i == core_id) continue; - // // NOTE: this requires coherent access of store-to-load to the same - // // address - // if (mmio[barrier_offset + 1 + i] != 2) { - // all_passed = false; - // break; - // } - // } - // if (all_passed) { - // for (int i = 0; i < CORES_PER_CLUSTER; i++) { - // mmio[barrier_offset + 1 + i] = 0; - // } - // } - } - vx_barrier(barrier_id, count); -#endif } void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, From 9555b790e71b2df7d79ea70ba49ba6d34809b552 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 27 Mar 2024 22:45:51 -0700 Subject: [PATCH 51/75] sgemm_wg: ifdef-guard cluster specific code --- tests/regression/sgemm_wg/kernel.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index 78f056fa..d34861a7 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -130,8 +130,13 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { // across the threadblock const uint32_t threads_per_threadblock = (BM * BN) / (TM * TN); +#ifdef RADIANCE + const uint32_t threadblocks_per_core = + vx_num_threads() * vx_num_warps() / (threads_per_threadblock / CORES_PER_CLUSTER); +#else const uint32_t threadblocks_per_core = vx_num_threads() * vx_num_warps() / threads_per_threadblock; +#endif const uint32_t threadblock_dim_x = vx_num_threads(); const uint32_t threadblock_dim_y = vx_num_warps() / threadblocks_per_core; const int threadblock_id = task_id / threads_per_threadblock; @@ -156,6 +161,12 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { int main() { kernel_arg_t *arg = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR; const 
uint32_t grid_size = arg->dim_m * arg->dim_n / (TM * TN); - vx_spawn_tasks(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); +#ifdef RADIANCE + vx_spawn_tasks_cluster(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); +#else + // NOTE: This kernel assumes contiguous thread scheduling for threadblock + // allocation, and therefore does not work with original vx_spawn_tasks + vx_spawn_tasks_contiguous(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); +#endif return 0; } From 9673db4e8cae4dd6fc66682761c0f310c0b95f66 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 28 Mar 2024 17:35:47 -0700 Subject: [PATCH 52/75] sgemm_wg: Fix possible divide-by-0 --- tests/regression/sgemm_wg/kernel.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index d34861a7..5fc1b8b8 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -132,7 +132,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { const uint32_t threads_per_threadblock = (BM * BN) / (TM * TN); #ifdef RADIANCE const uint32_t threadblocks_per_core = - vx_num_threads() * vx_num_warps() / (threads_per_threadblock / CORES_PER_CLUSTER); + vx_num_threads() * vx_num_warps() / threads_per_threadblock * CORES_PER_CLUSTER; #else const uint32_t threadblocks_per_core = vx_num_threads() * vx_num_warps() / threads_per_threadblock; @@ -149,13 +149,13 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { const int threadblock_id_x = threadblock_id % dim_n_in_blocks; const int threadblock_id_y = threadblock_id / dim_n_in_blocks; + // "static" shared memory allocation. 
This would determine threadblock + // occupancy of a single cluster float *sharedmem_per_threadblock = - (float *)DEV_SMEM_START_ADDR + - (2 * BM * BK) * threadblock_id_in_core; - thread_block_gemm(arg, tid_in_threadblock, - threadblock_dim_x, threadblock_dim_y, threadblock_id_x, - threadblock_id_y, threadblock_id_in_core, - sharedmem_per_threadblock); + (float *)DEV_SMEM_START_ADDR + (2 * BM * BK) * threadblock_id_in_core; + thread_block_gemm(arg, tid_in_threadblock, threadblock_dim_x, + threadblock_dim_y, threadblock_id_x, threadblock_id_y, + threadblock_id_in_core, sharedmem_per_threadblock); } int main() { From a9b0814211b760b2b0299614ead326af1e989c46 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 28 Mar 2024 18:17:00 -0700 Subject: [PATCH 53/75] sgemm_wg: Document tiling parameter constraints --- tests/regression/sgemm_wg/kernel.cpp | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index 5fc1b8b8..11612db1 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -4,11 +4,20 @@ #include #include "common.h" +// Constraints on parameters: +// * Memory: +// (BM + BN) * BK * sizeof(float) <= sharedmem size. +// BM * BK == BN * BK >= threadblock size >= NT * CORES_PER_CLUSTER +// When larger, the kernel runs a sequential loop to read into sharedmem; +// but smaller case is not handled. +// * Compute: +// ( M* N) / (TM*TN) == grid size >= NC*NW*NT +// (BM*BN) / (TM*TN) == threadblock size >= NT * CORES_PER_CLUSTER +// * Combining BM * BK >= (BM*BN) / (TM*TN) == threadblock yields +// BM <= BK*TM*TN. 
#define BM 8 #define BN BM #define BK 2 -// #define TM (BM/BK) -// #define TN (BN/BK) #define TM 2 #define TN 2 @@ -82,7 +91,9 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, threadblock_barrier(tid_in_threadblock, threadblock_id_in_core, threadblock_dim_y); + // Compute single tile*tile matmul for (uint32_t local_k = 0; local_k < BK; local_k++) { + // First, pump data from SMEM->RF #pragma GCC unroll TM for (uint32_t res_idx_m = 0; res_idx_m < TM; res_idx_m++) { reg_a[res_idx_m] = @@ -94,7 +105,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, local_b[BN * local_k + (TN * local_c_col + res_idx_n)]; } - // Compute multiple result elements (TM) per thread + // Next, compute multiple result elements (TM*TN) by reusing data in RF #pragma GCC unroll TM for (uint32_t res_idx_m = 0; res_idx_m < TM; res_idx_m++) { #pragma GCC unroll TN @@ -113,6 +124,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, threadblock_dim_y); } + // Store result data from RF to GMEM #pragma GCC unroll TM for (uint32_t res_idx_m = 0; res_idx_m < TM; res_idx_m++) { #pragma GCC unroll TN From e4eec8ab4d2260eafcfbd98d18addb840dcb37c1 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 28 Mar 2024 20:16:44 -0700 Subject: [PATCH 54/75] vx_spawn.c: Handle num_clusters > 1 WIP: still assumes num_tasks is divisible by num_cluster --- kernel/src/vx_spawn.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c index 278516a3..9ea45ded 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -148,7 +148,7 @@ static void __attribute__ ((noinline)) spawn_tasks_cluster_rem_stub() { (p_wspawn_args->callback)(task_id, p_wspawn_args->arg); } -static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_cb() { +static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_cb() { // activate all threads vx_tmc(-1); @@ -159,7 +159,7 @@ static void __attribute__ ((noinline)) 
spawn_tasks_contiguous_all_cb() { vx_tmc_zero(); } -static void __attribute__ ((noinline)) spawn_tasks_cluster_all_cb() { +static void __attribute__ ((noinline)) spawn_tasks_cluster_all_cb() { // activate all threads vx_tmc(-1); @@ -186,9 +186,11 @@ static void __attribute__ ((noinline)) spawn_tasks_all_cb() { // core has to enable to fulfill an entire grid of computation. void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg) { // device specs - int NC = vx_num_cores(); - int NW = vx_num_warps(); - int NT = vx_num_threads(); + const int NC = vx_num_cores(); + const int NW = vx_num_warps(); + const int NT = vx_num_threads(); + // NOTE: assumes divisible + const int num_cluster = NC / CORES_PER_CLUSTER; // current core id int core_id = vx_core_id(); @@ -206,8 +208,8 @@ void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg if (core_id >= num_active_cores) return; // terminate extra cores - // FIXME: HARDCODES 1 CLUSTER! - const int num_tasks_this_cluster = num_tasks; + // FIXME: assumes num_tasks is divisible by num_cluster + const int num_tasks_this_cluster = num_tasks / num_cluster; const int num_full_warps = num_tasks_this_cluster / NT; const int rem_threads_in_last_warp = num_tasks_this_cluster % NT; // const int num_warps = (num_tasks_this_cluster + (NT - 1)) / NT; From 537b97eb202b248ea3d0e228a62240cce862468f Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 28 Mar 2024 20:17:26 -0700 Subject: [PATCH 55/75] common.mk: Don't clean all *.elf --- tests/regression/common.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/regression/common.mk b/tests/regression/common.mk index 087561b0..8f4c4db1 100644 --- a/tests/regression/common.mk +++ b/tests/regression/common.mk @@ -121,7 +121,7 @@ clean: rm -rf $(PROJECT) *.o .depend clean-all: clean - rm -rf *.elf *.dump + rm -rf kernel.elf kernel.radiance.elf *.dump ifneq ($(MAKECMDGOALS),clean) -include .depend From 
fa2b6e2ad0d27da4dfae778626a714281c2ad505 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 29 Mar 2024 02:48:29 -0700 Subject: [PATCH 56/75] sgemm_wg: Explicitly limit unroll to reduce stack spilling This needs to be done case-by-case for different BK/TM/TN combinations and examining the assembly. --- tests/regression/sgemm_wg/kernel.cpp | 40 +++++++++++++++++----------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index 11612db1..4833154c 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -12,14 +12,15 @@ // but smaller case is not handled. // * Compute: // ( M* N) / (TM*TN) == grid size >= NC*NW*NT +// (BM*BN) / (TM*TN) == threadblock size < NT * NW * CORES_PER_CLUSTER // (BM*BN) / (TM*TN) == threadblock size >= NT * CORES_PER_CLUSTER // * Combining BM * BK >= (BM*BN) / (TM*TN) == threadblock yields -// BM <= BK*TM*TN. -#define BM 8 +// BM <= BK*TM*TN +#define BM 16 #define BN BM -#define BK 2 -#define TM 2 -#define TN 2 +#define BK 4 +#define TM 4 +#define TN 4 void threadblock_barrier(unsigned int tid_in_threadblock, unsigned int barrier_id, unsigned int count) { vx_fence(); @@ -32,7 +33,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, const uint32_t threadblock_dim_y, const uint32_t threadblock_id_x, const uint32_t threadblock_id_y, - const uint32_t threadblock_id_in_core, + const uint32_t threadblock_id_in_cluster, float *sharedmem_per_threadblock) { const float *A = (const float *)arg->addr_a; const float *B = (const float *)arg->addr_b; @@ -75,12 +76,17 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, constexpr uint32_t stride_b = (BM * BN) / BN / (TM * TN); for (uint32_t k = 0; k < dim_k; k += BK) { + // Data move from GMEM to SMEM + // + // Make sure global offset values for A and B are contiguous between + // neighboring threads to ensure GMEM coalescing. 
for (uint32_t load_offset = 0; load_offset < BM; load_offset += stride_a) { const uint32_t global_a_offset = dim_k * (global_a_row + load_offset) + (k + local_a_col); local_a[BK * (local_a_row + load_offset) + local_a_col] = A[global_a_offset]; } +// #pragma GCC unroll 1 for (uint32_t load_offset = 0; load_offset < BK; load_offset += stride_b) { const uint32_t global_b_offset = dim_n * (k + local_b_row + load_offset) + global_b_col; @@ -88,10 +94,11 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, B[global_b_offset]; } - threadblock_barrier(tid_in_threadblock, threadblock_id_in_core, + threadblock_barrier(tid_in_threadblock, threadblock_id_in_cluster, threadblock_dim_y); // Compute single tile*tile matmul +#pragma GCC unroll 2 for (uint32_t local_k = 0; local_k < BK; local_k++) { // First, pump data from SMEM->RF #pragma GCC unroll TM @@ -120,7 +127,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, } } - threadblock_barrier(tid_in_threadblock, threadblock_id_in_core, + threadblock_barrier(tid_in_threadblock, threadblock_id_in_cluster, threadblock_dim_y); } @@ -137,14 +144,15 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, } } -void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { +void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) { // @perf: All threads are running these compute whose result is mostly same // across the threadblock const uint32_t threads_per_threadblock = (BM * BN) / (TM * TN); #ifdef RADIANCE - const uint32_t threadblocks_per_core = - vx_num_threads() * vx_num_warps() / threads_per_threadblock * CORES_PER_CLUSTER; + const uint32_t threadblocks_per_core = vx_num_threads() * vx_num_warps() / + threads_per_threadblock * + CORES_PER_CLUSTER; #else const uint32_t threadblocks_per_core = vx_num_threads() * vx_num_warps() / threads_per_threadblock; @@ -152,7 +160,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { const uint32_t threadblock_dim_x = vx_num_threads(); const uint32_t 
threadblock_dim_y = vx_num_warps() / threadblocks_per_core; const int threadblock_id = task_id / threads_per_threadblock; - const int threadblock_id_in_core = threadblock_id % threadblocks_per_core; + const int threadblock_id_in_cluster = threadblock_id % threadblocks_per_core; const int tid_in_threadblock = task_id % threads_per_threadblock; const uint32_t dim_m = arg->dim_m; @@ -164,10 +172,10 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { // "static" shared memory allocation. This would determine threadblock // occupancy of a single cluster float *sharedmem_per_threadblock = - (float *)DEV_SMEM_START_ADDR + (2 * BM * BK) * threadblock_id_in_core; + (float *)DEV_SMEM_START_ADDR + (2 * BM * BK) * threadblock_id_in_cluster; thread_block_gemm(arg, tid_in_threadblock, threadblock_dim_x, threadblock_dim_y, threadblock_id_x, threadblock_id_y, - threadblock_id_in_core, sharedmem_per_threadblock); + threadblock_id_in_cluster, sharedmem_per_threadblock); } int main() { @@ -176,8 +184,8 @@ int main() { #ifdef RADIANCE vx_spawn_tasks_cluster(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); #else - // NOTE: This kernel assumes contiguous thread scheduling for threadblock - // allocation, and therefore does not work with original vx_spawn_tasks + // NOTE: This kernel assumes contiguous thread scheduling for efficient shared + // memory allocation, and therefore does not work with original vx_spawn_tasks vx_spawn_tasks_contiguous(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); #endif return 0; From b0c1f773889936934db2552f66921022bc89d6e1 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 29 Mar 2024 12:24:55 -0700 Subject: [PATCH 57/75] vx_start.S: Swizzle stack space Striding stack space for threads by power-of-two risks possibilities of bank conflicts or cache aliasing problems. Add an extra offset of 4 bytes to avoid this. 
--- kernel/src/vx_start.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/src/vx_start.S b/kernel/src/vx_start.S index b5065c95..d2a81707 100644 --- a/kernel/src/vx_start.S +++ b/kernel/src/vx_start.S @@ -102,6 +102,8 @@ init_regs: #endif csrr t0, VX_CSR_MHARTID sll t1, t0, STACK_LOG2_SIZE + sll t2, t0, 2 + add t1, t1, t2 sub sp, sp, t1 # set thread pointer register From 84a31f338493b5091d809f7f1408ebdbc0dd9e27 Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Mon, 1 Apr 2024 11:10:32 -0700 Subject: [PATCH 58/75] thread parallel data loading for word strided bank --- tests/kernel/gemmini_mmio/main.cpp | 86 +++++++++++++++++++----------- 1 file changed, 55 insertions(+), 31 deletions(-) diff --git a/tests/kernel/gemmini_mmio/main.cpp b/tests/kernel/gemmini_mmio/main.cpp index 3eaf3621..a3979829 100644 --- a/tests/kernel/gemmini_mmio/main.cpp +++ b/tests/kernel/gemmini_mmio/main.cpp @@ -2,23 +2,25 @@ #include #include #include +#include #include #include "gemmini_mmio.h" +#define rd_cycles(x) asm volatile ("csrr %0, mcycle" : "=r" (x)) + int main() { - char *print_buf = (char *) PRINT_BUF; + int cid; + asm volatile ("csrr %0, 0xcc2" : "=r" (cid)); + if (cid > 0) return 0; - sprintf(print_buf, "\n%d\n", DIM); - - gemmini_config_ld(0); - gemmini_extended_config_ex(WEIGHT_STATIONARY, 0, 0, 1, 0, 0); + vx_tmc(0xff); // load up A and B and C - uint32_t spad_A = 0x00000000; - uint32_t spad_B = 0x00000080; // 16B word addressed - uint32_t acc_C = 0x80000000; // accmem + accumulate - uint32_t spad_C = 0x00000100; + const uint32_t spad_A = 0x00000000; + const uint32_t spad_B = 0x00000080; // 16B word addressed + const uint32_t acc_C = 0x80000000; // accmem + accumulate + const uint32_t spad_C = 0x00000100; volatile float *smem_A = (float *) SPAD_TO_SMEM(spad_A); // 0xff000000; // byte addressed float *smem_B = (float *) SPAD_TO_SMEM(spad_B); // 0xff000200; @@ -28,33 +30,62 @@ int main() { int J = 32 / DIM; int K = 32 / DIM; - sprintf(print_buf, "A spad: 0x%x-0x%x, 
smem: 0x%x-%x\n", spad_A, spad_A + I * K * DIM, (uint32_t) smem_A, (uint32_t) smem_A + sizeof(float) * I * K * DIM * DIM); - sprintf(print_buf, "B spad: 0x%x-0x%x, smem: 0x%x-%x\n", spad_B, spad_B + K * J * DIM, (uint32_t) smem_B, (uint32_t) smem_B + sizeof(float) * K * J * DIM * DIM); - sprintf(print_buf, "C spad: 0x%x-0x%x, smem: 0x%x-%x\n", spad_C, spad_C + I * J * DIM, (uint32_t) smem_C, (uint32_t) smem_C + sizeof(float) * I * J * DIM * DIM); + char *print_buf = (char *) PRINT_BUF; + // int cid = vx_core_id(); + int nc = vx_num_cores(); + int nt = vx_num_threads(); + int tid = vx_thread_id(); + + vx_tmc_one(); + gemmini_config_ld(0); + gemmini_extended_config_ex(WEIGHT_STATIONARY, 0, 0, 1, 0, 0); gemmini_config_st(DIM * 4 * J); + sprintf(print_buf, "DIM %d\n", DIM); + sprintf(print_buf, "num cores %d\n", nc); + sprintf(print_buf, "num threads %d\n", nt); + sprintf(print_buf, "thread ids "); + vx_tmc(-1); + sprintf(print_buf, "%d", tid); + uint32_t start_cycles, end_cycles; + /* sprintf(print_buf, "A spad: 0x%x-0x%x, smem: 0x%x-%x\n", spad_A, spad_A + I * K * DIM, (uint32_t) smem_A, (uint32_t) smem_A + sizeof(float) * I * K * DIM * DIM); + sprintf(print_buf, "B spad: 0x%x-0x%x, smem: 0x%x-%x\n", spad_B, spad_B + K * J * DIM, (uint32_t) smem_B, (uint32_t) smem_B + sizeof(float) * K * J * DIM * DIM); + sprintf(print_buf, "C spad: 0x%x-0x%x, smem: 0x%x-%x\n", spad_C, spad_C + I * J * DIM, (uint32_t) smem_C, (uint32_t) smem_C + sizeof(float) * I * J * DIM * DIM); */ + + rd_cycles(start_cycles); // load A with 128->1 in row-major order - for (int i = 0; i < I; i++) { + for (int t = 0; t < DIM * DIM / nt; t++) { + int n = tid + t * nt; + int x = n / DIM; + int y = n % DIM; for (int k = 0; k < K; k++) { - int tile_byte_offset = (i * K + k) * DIM * DIM; - for (int x = 0; x < DIM; x++) - for (int y = 0; y < DIM; y++) - smem_A[tile_byte_offset + x * DIM + y] = (float) ((I * K * DIM * DIM - ((i * DIM + x) * DIM * K + (k * DIM + y))) % 64); + for (int i = 0; i < I; i++) { + 
int tile_byte_offset = (i * K + k) * DIM * DIM; + smem_A[tile_byte_offset + n] = (float) ((I * K * DIM * DIM - ((i * DIM + x) * DIM * K + (k * DIM + y))) % 64); + // smem_A[tile_byte_offset + x * DIM + y] = (float) ((I * K * DIM * DIM - ((i * DIM + x) * DIM * K + (k * DIM + y))) % 64); + } } } // load B with 0->191 in row-major order - for (int k = 0; k < K; k++) { - for (int j = 0; j < J; j++) { - int tile_byte_offset = (k * J + j) * DIM * DIM; - for (int x = 0; x < DIM; x++) - for (int y = 0; y < DIM; y++) - smem_B[tile_byte_offset + x * DIM + y] = (float) (((k * DIM + x) * DIM * J + (j * DIM + y)) % 64); + for (int t = 0; t < DIM * DIM / nt; t++) { + int n = tid + t * nt; + int x = n / DIM; + int y = n % DIM; + for (int k = 0; k < K; k++) { + for (int j = 0; j < J; j++) { + int tile_byte_offset = (k * J + j) * DIM * DIM; + smem_B[tile_byte_offset + n] = (float) (((k * DIM + x) * DIM * J + (j * DIM + y)) % 64); + } + // smem_B[tile_byte_offset + x * DIM + y] = (float) (((k * DIM + x) * DIM * J + (j * DIM + y)) % 64); } } + rd_cycles(end_cycles); - for (int i = 0; i < I * J * DIM * DIM; i++) smem_C[i] = 1.f; + // for (int i = 0; i < I * J * DIM * DIM; i++) smem_C[i] = 1.f; + vx_tmc_one(); + sprintf(print_buf, "\ndata loading took %d cycles for %d floats\n", end_cycles - start_cycles, DIM * DIM * (I * K + J * K)); fence(); @@ -65,7 +96,6 @@ int main() { // } // sprintf(print_buf, "\n"); // } - // sprintf(print_buf, "\nB in\n"); // for (int i = 0; i < K * DIM; i++) { // for (int j = 0; j < J * DIM; j++) { @@ -80,12 +110,6 @@ int main() { // gemmini_extended_mvout(0xc0000000, 0xff000000, DIM, DIM); // gemmini_extended_mvout_spad(spad_C, 1, acc_C, DIM, DIM); - uint32_t core_id; - asm volatile ("csrr %0, 0xcc2" : "=r" (core_id)); - printf("core id %d\n", core_id); - if (core_id > 0) return 0; - - uint32_t start_cycles, end_cycles; asm volatile ("csrr %0, mcycle" : "=r" (start_cycles)); sp_tiled_matmul_full_spad_ws(spad_A, spad_B, /*spad_D=*/0, spad_C, /*I=*/I, /*J=*/J, 
/*K=*/K, /*pad_I=*/0, /*pad_J=*/0, /*pad_K=*/0, From 93a00101ae6b8cc1275ea25db7311f3e52b7d14c Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 4 Apr 2024 21:05:06 -0700 Subject: [PATCH 59/75] sgemm_wg: revert to faster params --- tests/regression/sgemm_wg/kernel.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index 4833154c..e9f898a0 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -16,11 +16,11 @@ // (BM*BN) / (TM*TN) == threadblock size >= NT * CORES_PER_CLUSTER // * Combining BM * BK >= (BM*BN) / (TM*TN) == threadblock yields // BM <= BK*TM*TN -#define BM 16 +#define BM 8 #define BN BM -#define BK 4 -#define TM 4 -#define TN 4 +#define BK 2 +#define TM 2 +#define TN 2 void threadblock_barrier(unsigned int tid_in_threadblock, unsigned int barrier_id, unsigned int count) { vx_fence(); @@ -80,6 +80,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, // // Make sure global offset values for A and B are contiguous between // neighboring threads to ensure GMEM coalescing. 
+// #pragma GCC unroll 1 for (uint32_t load_offset = 0; load_offset < BM; load_offset += stride_a) { const uint32_t global_a_offset = dim_k * (global_a_row + load_offset) + (k + local_a_col); @@ -98,7 +99,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, threadblock_dim_y); // Compute single tile*tile matmul -#pragma GCC unroll 2 +// #pragma GCC unroll 2 for (uint32_t local_k = 0; local_k < BK; local_k++) { // First, pump data from SMEM->RF #pragma GCC unroll TM @@ -136,7 +137,6 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, for (uint32_t res_idx_m = 0; res_idx_m < TM; res_idx_m++) { #pragma GCC unroll TN for (uint32_t res_idx_n = 0; res_idx_n < TN; res_idx_n++) { - // NOTE use of local_b_row and global_b_col here C[dim_n * (BM * threadblock_id_y + TM * local_c_row + res_idx_m) + (BN * threadblock_id_x + TN * local_c_col + res_idx_n)] = reg_c[TN * res_idx_m + res_idx_n]; From 7bf72c95688a8916ecec7e89d2f6aaee8a87f4ab Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Tue, 9 Apr 2024 19:53:17 -0700 Subject: [PATCH 60/75] cycle counting for fence --- tests/kernel/gemmini_mmio/main.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/tests/kernel/gemmini_mmio/main.cpp b/tests/kernel/gemmini_mmio/main.cpp index a3979829..3d660b7e 100644 --- a/tests/kernel/gemmini_mmio/main.cpp +++ b/tests/kernel/gemmini_mmio/main.cpp @@ -105,20 +105,17 @@ int main() { // if (i == 2) i = K * DIM - 3; // } - // gemmini_extended_preload(spad_B, acc_C, DIM, DIM, DIM, DIM); - // gemmini_extended_compute_preloaded(spad_A, GARBAGE_ADDR, DIM, DIM, DIM, DIM); - // gemmini_extended_mvout(0xc0000000, 0xff000000, DIM, DIM); - // gemmini_extended_mvout_spad(spad_C, 1, acc_C, DIM, DIM); - - asm volatile ("csrr %0, mcycle" : "=r" (start_cycles)); + uint32_t fence_cycles; + rd_cycles(start_cycles); sp_tiled_matmul_full_spad_ws(spad_A, spad_B, /*spad_D=*/0, spad_C, /*I=*/I, /*J=*/J, /*K=*/K, /*pad_I=*/0, /*pad_J=*/0, /*pad_K=*/0, /*a_transpose=*/0, 
/*b_transpose=*/0, /*full_C=*/0, /*low_D=*/0, /*no_bias=*/1, /*repeating_bias=*/0, /*act=*/NO_ACTIVATION); + rd_cycles(fence_cycles); fence(); - asm volatile ("csrr %0, mcycle" : "=r" (end_cycles)); - sprintf(print_buf, "gemmini cycles taken: %d\n", end_cycles - start_cycles); + rd_cycles(end_cycles); + sprintf(print_buf, "gemmini cycles taken: %d, fence cycles: %d\n", end_cycles - start_cycles, end_cycles - fence_cycles); // check results for (int i = 0; i < I * DIM; i++) { From 3383b7073273a46d2230178b658ed7208ed71c8f Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 14 Apr 2024 12:33:02 -0700 Subject: [PATCH 61/75] sgemm_wg: Hardcode device address --- tests/regression/sgemm_wg/main.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/regression/sgemm_wg/main.cpp b/tests/regression/sgemm_wg/main.cpp index 229463ef..c8816e0a 100644 --- a/tests/regression/sgemm_wg/main.cpp +++ b/tests/regression/sgemm_wg/main.cpp @@ -58,9 +58,9 @@ static void parse_args(int argc, char **argv) { void cleanup() { if (device) { - vx_mem_free(device, kernel_arg.addr_a); - vx_mem_free(device, kernel_arg.addr_b); - vx_mem_free(device, kernel_arg.addr_c); + // vx_mem_free(device, kernel_arg.addr_a); + // vx_mem_free(device, kernel_arg.addr_b); + // vx_mem_free(device, kernel_arg.addr_c); vx_dev_close(device); } } @@ -166,9 +166,12 @@ int main(int argc, char *argv[]) { // allocate device memory std::cout << "allocate device memory" << std::endl; - RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a)); - RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b)); - RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c)); + // RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a)); + // RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b)); + // RT_CHECK(vx_mem_alloc(device, dst_buf_size, 
VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c)); + kernel_arg.addr_a = 0x20000UL; + kernel_arg.addr_b = 0x28000UL; + kernel_arg.addr_c = 0xc0000000UL; kernel_arg.dim_m = dim_m; kernel_arg.dim_n = dim_n; From 37a60b1141b9d62c635f54528f78b03b2fa231ca Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 14 Apr 2024 12:33:27 -0700 Subject: [PATCH 62/75] sgemm_wg: Output C result to binary --- tests/regression/sgemm_wg/main.cpp | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tests/regression/sgemm_wg/main.cpp b/tests/regression/sgemm_wg/main.cpp index c8816e0a..93152896 100644 --- a/tests/regression/sgemm_wg/main.cpp +++ b/tests/regression/sgemm_wg/main.cpp @@ -108,6 +108,17 @@ int run_test(const kernel_arg_t& kernel_arg, std::cout << "download destination buffer" << std::endl; RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_c, buf_size)); + std::cout << "downloading result C matrix from device, device mem address=" + << std::hex << kernel_arg.addr_c << ", size=" << std::dec + << buf_size << " bytes\n"; + std::ofstream file("output.c.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open output.c.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(staging_buf.data()), buf_size); + file.close(); + // verify result std::cout << "verify result" << std::endl; { @@ -225,7 +236,7 @@ int main(int argc, char *argv[]) { << src_a_buf_size << " bytes\n"; std::ofstream file("input.a.bin", std::ios::binary | std::ios::out); if (!file) { - std::cerr << "error: failed to open args.bin for writing\n"; + std::cerr << "error: failed to open input.a.bin for writing\n"; exit(EXIT_FAILURE); } file.write(reinterpret_cast(buf_ptr), src_a_buf_size); @@ -242,7 +253,7 @@ int main(int argc, char *argv[]) { << src_b_buf_size << " bytes\n"; std::ofstream file("input.b.bin", std::ios::binary | std::ios::out); if (!file) { - std::cerr << "error: failed to open args.bin for writing\n"; + 
std::cerr << "error: failed to open input.b.bin for writing\n"; exit(EXIT_FAILURE); } file.write(reinterpret_cast(buf_ptr), src_b_buf_size); From d8eddb21ea21b1e6827f1f8f106df46acc63c876 Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Mon, 15 Apr 2024 10:04:54 -0700 Subject: [PATCH 63/75] add gemmini dependency --- .gitmodules | 3 +++ third_party/gemmini-rocc-tests | 1 + 2 files changed, 4 insertions(+) create mode 160000 third_party/gemmini-rocc-tests diff --git a/.gitmodules b/.gitmodules index af1d1a47..6bc2bb4c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ [submodule "third_party/ramulator"] path = third_party/ramulator url = https://github.com/CMU-SAFARI/ramulator.git +[submodule "third_party/gemmini-rocc-tests"] + path = third_party/gemmini-rocc-tests + url = https://github.com/ucb-bar/gemmini-rocc-tests diff --git a/third_party/gemmini-rocc-tests b/third_party/gemmini-rocc-tests new file mode 160000 index 00000000..62106286 --- /dev/null +++ b/third_party/gemmini-rocc-tests @@ -0,0 +1 @@ +Subproject commit 62106286e5b7479065025666cdc5f6bc020be764 From 0bb7aeb45b61503044177e69033700864c39fc5e Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Mon, 15 Apr 2024 10:10:20 -0700 Subject: [PATCH 64/75] add gpu+gemmini gemm kernel --- tests/regression/common.mk | 3 +- tests/regression/sgemm_gemmini/.gitignore | 5 + tests/regression/sgemm_gemmini/Makefile | 9 + tests/regression/sgemm_gemmini/common.h | 18 ++ tests/regression/sgemm_gemmini/kernel.cpp | 269 ++++++++++++++++++ tests/regression/sgemm_gemmini/main.cpp | 274 +++++++++++++++++++ tests/regression/sgemm_gemmini/sgemm_gemmini | Bin 0 -> 28448 bytes 7 files changed, 577 insertions(+), 1 deletion(-) create mode 100644 tests/regression/sgemm_gemmini/.gitignore create mode 100644 tests/regression/sgemm_gemmini/Makefile create mode 100644 tests/regression/sgemm_gemmini/common.h create mode 100644 tests/regression/sgemm_gemmini/kernel.cpp create mode 100644 tests/regression/sgemm_gemmini/main.cpp create 
mode 100755 tests/regression/sgemm_gemmini/sgemm_gemmini diff --git a/tests/regression/common.mk b/tests/regression/common.mk index 8f4c4db1..81df3139 100644 --- a/tests/regression/common.mk +++ b/tests/regression/common.mk @@ -22,6 +22,7 @@ RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX) VORTEX_RT_PATH ?= $(realpath ../../../runtime) VORTEX_KN_PATH ?= $(realpath ../../../kernel) +GEMMINI_SW_PATH ?= $(realpath ../../../third_party/gemmini-rocc-tests) FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae @@ -49,7 +50,7 @@ VX_CP = $(LLVM_VORTEX)/bin/llvm-objcopy VX_CFLAGS += -v -O3 -std=c++17 VX_CFLAGS += -mcmodel=medany -fno-rtti -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections -VX_CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw +VX_CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw -I$(GEMMINI_SW_PATH) VX_CFLAGS += -DNDEBUG -DLLVM_VORTEX VX_LDFLAGS += -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(VORTEX_KN_PATH)/libvortexrt.a diff --git a/tests/regression/sgemm_gemmini/.gitignore b/tests/regression/sgemm_gemmini/.gitignore new file mode 100644 index 00000000..7c35ba59 --- /dev/null +++ b/tests/regression/sgemm_gemmini/.gitignore @@ -0,0 +1,5 @@ +*.bin +*.dump +*.elf +sgemm_wg +.depend diff --git a/tests/regression/sgemm_gemmini/Makefile b/tests/regression/sgemm_gemmini/Makefile new file mode 100644 index 00000000..a36f6d21 --- /dev/null +++ b/tests/regression/sgemm_gemmini/Makefile @@ -0,0 +1,9 @@ +PROJECT = sgemm_gemmini + +SRCS = main.cpp common.h + +VX_SRCS = kernel.cpp + +OPTS ?= -n16 + +include ../common.mk diff --git a/tests/regression/sgemm_gemmini/common.h b/tests/regression/sgemm_gemmini/common.h new file mode 100644 index 00000000..74941562 --- /dev/null +++ b/tests/regression/sgemm_gemmini/common.h @@ -0,0 +1,18 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#include + +#define KERNEL_ARG_DEV_MEM_ADDR 0x7fff0000 +#define DEV_SMEM_START_ADDR 
0xff000000 + +typedef struct { + uint32_t dim_m; + uint32_t dim_n; + uint32_t dim_k; + uint64_t addr_a; + uint64_t addr_b; + uint64_t addr_c; +} kernel_arg_t; + +#endif diff --git a/tests/regression/sgemm_gemmini/kernel.cpp b/tests/regression/sgemm_gemmini/kernel.cpp new file mode 100644 index 00000000..34c72d00 --- /dev/null +++ b/tests/regression/sgemm_gemmini/kernel.cpp @@ -0,0 +1,269 @@ +#include +#include +#include +#include +#include "common.h" +#include "include/gemmini.h" +#include "gemmini_mmio.h" + +#define MATRIX_M 64 // TODO: remove hardcode +#define MATRIX_N 64 +#define MATRIX_K 64 +#define TILE_M 32 // tile size = SMEM size / 2 (double buffering) / 4 (A, B, C, Psum) +#define TILE_N 32 +#define TILE_K 32 +#define TILE_MN 1024 +#define TILE_MK 1024 +#define TILE_NK 1024 + +#define NUM_CLUSTERS 1 +#define TB_M (MATRIX_M / NUM_CLUSTERS) +#define TB_N MATRIX_N +#define TB_SIZE (TB_M * TB_N) +#define NUM_TILE_ROWS_PER_TB (TB_M / TILE_M) +#define THREAD_ELEMS 8 // elements per thread in a tile +#define THREAD_STRIDE 8 // threads per core + +#define SMEM_ADDR_0K ((float *) 0xff000000) +#define SMEM_ADDR_4K ((float *) 0xff001000) +#define SMEM_ADDR_8K ((float *) 0xff002000) +#define SMEM_ADDR_12K ((float *) 0xff003000) + +#define SPAD_ADDR_0K 0x0 +#define SPAD_ADDR_4K 0x80 +#define SPAD_ADDR_8K 0x100 +#define SPAD_ADDR_12K 0x180 + +// #define DEBUG_PRINT +#define rd_cycles(x) asm volatile ("csrr %0, mcycle" : "=r" (x)) + +void threadblock_barrier(unsigned int tid_in_threadblock, unsigned int barrier_id, unsigned int count) { + vx_fence(); + vx_barrier(barrier_id, count); +} + +void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, + const uint32_t threadblock_id, + const uint32_t tid_in_threadblock) { + const float * const A = (const float * const) arg->addr_a; + const float * const B = (const float * const) arg->addr_b; + float * const C = (float * const) arg->addr_c; + + const uint32_t dim_m = arg->dim_m; + const uint32_t dim_n = arg->dim_n; + const 
uint32_t dim_k = arg->dim_k; + const uint32_t num_tiles_n = dim_n / TILE_N; + const uint32_t num_tiles_k = dim_k / TILE_K; + // TODO: make this into constexpr by subbing architectural params with macros + const uint32_t num_threads_in_cluster = vx_num_threads() * vx_num_warps() * CORES_PER_CLUSTER; + const uint32_t hw_tid = tid_in_threadblock % num_threads_in_cluster; + const uint32_t a_elems_per_thread = TILE_MK / num_threads_in_cluster; + const uint32_t b_elems_per_thread = TILE_NK / num_threads_in_cluster; + const uint32_t c_elems_per_thread = TILE_MN / num_threads_in_cluster; + const uint32_t thread_load_offset = hw_tid; + const uint32_t thread_load_stride = num_threads_in_cluster; + + uint32_t marker0, marker1, marker2, marker3, marker4; + uint32_t marker5, marker6, marker7, marker8, marker9; + + if (hw_tid == 0) { + gemmini_config_ld(0); + gemmini_extended_config_ex(WEIGHT_STATIONARY, 0, 0, 1, 0, 0); + gemmini_config_st(0); + sprintf(PRINT_BUF, "start\n"); + } + + + // TODO: check for tb id + rd_cycles(marker0); + + for (int tile_i = NUM_TILE_ROWS_PER_TB * threadblock_id; + tile_i < NUM_TILE_ROWS_PER_TB * (threadblock_id + 1); + tile_i += 1) { + for (int tile_j = 0; tile_j < num_tiles_n; tile_j += 1) { + float * const smem_c_tile_start = SMEM_ADDR_4K; + float * const dram_c_tile_start = C + tile_i * TILE_M * dim_n + tile_j * TILE_N; + + for (int tile_k = 0; tile_k < num_tiles_k; tile_k += 1) { + // TODO: double buffer + const float * const dram_a_tile_start = A + tile_i * TILE_M * dim_k + tile_k * TILE_K; + const float * const dram_b_tile_start = B + tile_k * TILE_K * dim_n + tile_j * TILE_N; + float * const smem_a_tile_start = SMEM_ADDR_0K; + float * const smem_b_tile_start = SMEM_ADDR_12K; + + rd_cycles(marker1); + + // preload A matrix +#pragma GCC unroll 8 // TODO: macro computed + for (int thread_i = 0; thread_i < a_elems_per_thread; thread_i++) { + uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i; + 
smem_a_tile_start[SMEM_MAT_OFFSET(elem_offset / TILE_K, elem_offset % TILE_K, TILE_K)] = \ + dram_a_tile_start[elem_offset / TILE_K * dim_k + elem_offset % TILE_K]; + } + +#ifdef DEBUG_PRINT + if (hw_tid == 0) { + sprintf(PRINT_BUF, "\nA %d %d\n", tile_i, tile_k); + for (int i = 0; i < TILE_M; i += 8) { + for (int j = 0; j < TILE_K; j += 8) { + uint32_t mat_offset = SMEM_MAT_OFFSET(i, j, TILE_K); + sprintf(PRINT_BUF, "%x %x ", + (int) (smem_a_tile_start[mat_offset]), + (int) (smem_a_tile_start[mat_offset + 4]) + ); + } + sprintf(PRINT_BUF, "\n"); + } + } +#endif + + // preload B matrix +#pragma GCC unroll 8 + for (int thread_i = 0; thread_i < b_elems_per_thread; thread_i++) { + uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i; + smem_b_tile_start[SMEM_MAT_OFFSET(elem_offset / TILE_N, elem_offset % TILE_N, TILE_N)] = \ + dram_b_tile_start[elem_offset / TILE_N * dim_n + elem_offset % TILE_N]; + } + +#ifdef DEBUG_PRINT + if (hw_tid == 0) { + sprintf(PRINT_BUF, "\nB %d %d\n", tile_k, tile_j); + for (int i = 0; i < TILE_K; i += 8) { + for (int j = 0; j < TILE_N; j += 8) { + uint32_t mat_offset = SMEM_MAT_OFFSET(i, j, TILE_N); + sprintf(PRINT_BUF, "%x %x ", + (int) (smem_b_tile_start[mat_offset]), + (int) (smem_b_tile_start[mat_offset + 4]) + ); + } + sprintf(PRINT_BUF, "\n"); + } + } +#endif + rd_cycles(marker2); + + // cluster wide barrier to wait for A and B loads to complete + threadblock_barrier(0, /*barrier_id=*/threadblock_id, /*count=*/num_threads_in_cluster); + rd_cycles(marker3); + if (hw_tid == 0) { + sp_tiled_matmul_full_spad_ws(SPAD_ADDR_0K, SPAD_ADDR_12K, /*spad_D=*/0, SPAD_ADDR_4K, + /*I=*/TILE_M / DIM, /*J=*/TILE_N / DIM, /*K=*/TILE_K / DIM, /*pad_I=*/0, /*pad_J=*/0, /*pad_K=*/0, + /*a_transpose=*/0, /*b_transpose=*/0, /*full_C=*/0, /*low_D=*/0, + /*no_bias=*/1, /*repeating_bias=*/0, /*act=*/NO_ACTIVATION); + gemmini_fence(); + } + rd_cycles(marker4); + threadblock_barrier(0, /*barrier_id=*/threadblock_id, 
/*count=*/num_threads_in_cluster); + rd_cycles(marker5); + + // accumulate C matrix + if (tile_k == 0) { +#pragma GCC unroll 8 + for (int thread_i = 0; thread_i < c_elems_per_thread; thread_i++) { + uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i; + *(SMEM_ADDR_8K + elem_offset) = smem_c_tile_start[elem_offset]; + } + } else { +#pragma GCC unroll 8 + for (int thread_i = 0; thread_i < c_elems_per_thread; thread_i++) { + uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i; + *(SMEM_ADDR_8K + elem_offset) += smem_c_tile_start[elem_offset]; + } + } + + rd_cycles(marker6); +#ifdef DEBUG_PRINT + if (hw_tid == 0) { + sprintf(PRINT_BUF, "\nC %d %d %d\n", tile_i, tile_j, tile_k); + for (int i = 0; i < TILE_M; i += 8) { + for (int j = 0; j < TILE_N; j += 8) { + uint32_t mat_offset = SMEM_MAT_OFFSET(i, j, TILE_N); + sprintf(PRINT_BUF, "%d %d ", + (int) (smem_c_tile_start[mat_offset]), + (int) (smem_c_tile_start[mat_offset + 4]) + ); + } + sprintf(PRINT_BUF, "\n"); + } + } +#endif + } + + rd_cycles(marker7); + // move out to dram + #pragma GCC unroll 8 // TODO: macro computed + for (int thread_i = 0; thread_i < c_elems_per_thread; thread_i++) { + uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i; + dram_c_tile_start[elem_offset / TILE_N * dim_n + elem_offset % TILE_N] = \ + *(SMEM_ADDR_8K + SMEM_MAT_OFFSET(elem_offset / TILE_N, elem_offset % TILE_N, TILE_N)); + } + + rd_cycles(marker8); + /* if (hw_tid == 0) { + sprintf(PRINT_BUF, "\nC %d %d\n", tile_i, tile_j); + for (int i = 0; i < TILE_M; i += 8) { + for (int j = 0; j < TILE_N; j += 8) { + uint32_t mat_offset = SMEM_MAT_OFFSET(i, j, TILE_N); + sprintf(PRINT_BUF, "%d %d ", + (int) (C[(tile_i * TILE_M + i) * dim_n + tile_j * TILE_N + j]), + (int) (C[(tile_i * TILE_M + i) * dim_n + tile_j * TILE_N + j + 4]) + ); + } + sprintf(PRINT_BUF, "\n"); + } + } */ + } + } + // last thread block complete + if (threadblock_id == NUM_CLUSTERS - 1) { + 
threadblock_barrier(0, /*barrier_id=*/0, /*count=*/num_threads_in_cluster); + rd_cycles(marker9); + if (hw_tid == 0) { + sprintf(PRINT_BUF, "complete\n"); + sprintf(PRINT_BUF, "total cycles: %d\n", marker9 - marker0); + sprintf(PRINT_BUF, "single tile cycles: %d\n", marker6 - marker1); + sprintf(PRINT_BUF, "A/B tile load cycles: %d\n", marker2 - marker1); + sprintf(PRINT_BUF, "gemmini cycles: %d\n", marker4 - marker3); + sprintf(PRINT_BUF, "first barrier: %d\n", marker3 - marker2); + sprintf(PRINT_BUF, "second barrier: %d\n", marker5 - marker4); + sprintf(PRINT_BUF, "accumulation cycles: %d\n", marker6 - marker5); + sprintf(PRINT_BUF, "dram mvout cycles: %d\n", marker8 - marker7); + } + threadblock_barrier(0, /*barrier_id=*/0, /*count=*/num_threads_in_cluster); + if (hw_tid == num_threads_in_cluster - 1) { + sprintf(PRINT_BUF, "single tile cycles: %d\n", marker6 - marker1); + sprintf(PRINT_BUF, "A/B tile load cycles: %d\n", marker2 - marker1); + sprintf(PRINT_BUF, "gemmini cycles: %d\n", marker4 - marker3); + sprintf(PRINT_BUF, "first barrier: %d\n", marker3 - marker2); + sprintf(PRINT_BUF, "second barrier: %d\n", marker5 - marker4); + } + vx_tmc_one(); + } + vx_tmc(0); +} + +void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) { + // @perf: All threads are running these compute whose result is mostly same + // across the threadblock + + const int threadblock_id = task_id / TB_SIZE; + const int tid_in_threadblock = task_id % TB_SIZE; + + thread_block_matmul_gemmini(arg, threadblock_id, tid_in_threadblock); +} + +int main() { + kernel_arg_t *arg = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR; + sprintf(PRINT_BUF, "m=%d, n=%d\n", arg->dim_m, arg->dim_n); + + const uint32_t num_threads_in_cluster = vx_num_threads() * vx_num_warps() * CORES_PER_CLUSTER; + const uint32_t grid_size = num_threads_in_cluster * NUM_CLUSTERS; +#ifdef RADIANCE + vx_spawn_tasks_cluster(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); +#else + // NOTE: This kernel assumes contiguous thread 
scheduling for efficient shared + // memory allocation, and therefore does not work with original vx_spawn_tasks + vx_spawn_tasks_contiguous(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); +#endif + return 0; +} \ No newline at end of file diff --git a/tests/regression/sgemm_gemmini/main.cpp b/tests/regression/sgemm_gemmini/main.cpp new file mode 100644 index 00000000..54531062 --- /dev/null +++ b/tests/regression/sgemm_gemmini/main.cpp @@ -0,0 +1,274 @@ +#include +#include +#include +#include +#include +#include +#include "common.h" + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +const char* kernel_file = "kernel.bin"; +uint32_t count = 0; + +std::vector src_a_data; +std::vector src_b_data; +std::vector ref_data; + +vx_device_h device = nullptr; +std::vector staging_buf; +kernel_arg_t kernel_arg = {}; + +static void show_usage() { + std::cout << "Vortex Test." 
<< std::endl; + std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl; +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:k:h?")) != -1) { + switch (c) { + case 'n': + count = atoi(optarg); + break; + case 'k': + kernel_file = optarg; + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } +} + +void cleanup() { + if (device) { + vx_mem_free(device, kernel_arg.addr_a); + vx_mem_free(device, kernel_arg.addr_b); + vx_mem_free(device, kernel_arg.addr_c); + vx_dev_close(device); + } +} + +void generate_source_matrix(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) { + src_a_data.resize(dim_m * dim_k); + src_b_data.resize(dim_k * dim_n); + + for (uint32_t i = 0; i < src_a_data.size(); ++i) { + src_a_data[i] = static_cast(i); + std::cout << "A: " << i << ": value=" << src_a_data[i] << std::endl; + } + for (uint32_t i = 0; i < src_b_data.size(); ++i) { + src_b_data[i] = static_cast(i); + std::cout << "B: " << i << ": value=" << src_b_data[i] << std::endl; + } +} + +void generate_reference_matmul(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) { + ref_data.resize(dim_m * dim_n); + + for (uint32_t i = 0; i < dim_m; ++i) { + for (uint32_t j = 0; j < dim_n; ++j) { + float ref = 0.0f; + for (uint32_t k = 0; k < dim_k; ++k) { + ref += src_a_data[dim_k * i + k] * src_b_data[dim_n * k + j]; + } + ref_data.at(dim_n * i + j) = ref; + } + } +} + +int run_test(const kernel_arg_t& kernel_arg, + uint32_t buf_size, + uint32_t dim_m, uint32_t dim_n) { + // start device + std::cout << "start device" << std::endl; + RT_CHECK(vx_start(device)); + + // wait for completion + std::cout << "wait for completion" << std::endl; + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + // download destination buffer + std::cout << "download destination buffer" << std::endl; + RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_c, buf_size)); + + // verify 
result + std::cout << "verify result" << std::endl; + { + int errors = 0; + auto buf_ptr = (float*)staging_buf.data(); + for (uint32_t i = 0; i < dim_m * dim_n; ++i) { + float ref = ref_data.at(i); + float cur = buf_ptr[i]; + if (std::abs((cur - ref) / ref) > 1e-6) { + std::cout << "error at result #" << std::dec << i + << std::hex << ": actual=" << cur << ", expected=" << ref << std::endl; + ++errors; + } + } + if (errors != 0) { + std::cout << "Found " << std::dec << errors << " errors!" << std::endl; + std::cout << "FAILED!" << std::endl; + return 1; + } + } + + return 0; +} + +int main(int argc, char *argv[]) { + // parse command arguments + parse_args(argc, argv); + + if (count == 0) { + count = 1; + } + + std::srand(50); + + // open device connection + std::cout << "open device connection" << std::endl; + RT_CHECK(vx_dev_open(&device)); + + // FIXME: hardcoded + uint32_t dim_m = 64; + uint32_t dim_n = 64; + uint32_t dim_k = 64; + + generate_source_matrix(dim_m, dim_n, dim_k); + generate_reference_matmul(dim_m, dim_n, dim_k); + + uint32_t src_a_buf_size = src_a_data.size() * sizeof(src_a_data[0]); + uint32_t src_b_buf_size = src_b_data.size() * sizeof(src_b_data[0]); + uint32_t dst_buf_size = ref_data.size() * sizeof(src_a_data[0]); + + std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl; + + // upload program + std::cout << "upload program" << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file)); + + // allocate device memory + std::cout << "allocate device memory" << std::endl; + RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a)); + RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b)); + RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c)); + + kernel_arg.dim_m = dim_m; + kernel_arg.dim_n = dim_n; + kernel_arg.dim_k = dim_k; + + std::cout << "dev_addr_a=0x" << std::hex << kernel_arg.addr_a << std::endl; + std::cout << 
"dev_addr_b=0x" << std::hex << kernel_arg.addr_b << std::endl; + std::cout << "dev_addr_c=0x" << std::hex << kernel_arg.addr_c << std::endl; + + // allocate staging buffer + { + std::cout << "allocate staging buffer" << std::endl; + uint32_t staging_buf_size = std::max( + src_a_buf_size, + std::max( + src_b_buf_size, + std::max(dst_buf_size, sizeof(kernel_arg_t)))); + staging_buf.resize(staging_buf_size); + } + + // upload kernel argument + { + std::cout << "upload kernel argument" << std::endl; + auto buf_ptr = staging_buf.data(); + kernel_arg.addr_a = (uint64_t) 0x20000; + kernel_arg.addr_b = (uint64_t) 0x28000; + kernel_arg.addr_c = (uint64_t) 0xc0000000ULL; + memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); + + std::cout << "uploading argument buffer to device, device mem address=" + << std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec + << sizeof(kernel_arg_t) << " bytes\n"; + std::ofstream file("args.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open args.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(staging_buf.data()), + sizeof(kernel_arg_t)); + file.close(); + + RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); + } + + // upload source buffer + { + { + auto buf_ptr = staging_buf.data(); + memcpy(buf_ptr, src_a_data.data(), src_a_data.size() * sizeof(float)); + RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(), + src_a_buf_size)); + + std::cout << "uploading source A matrix to device, device mem address=" + << std::hex << kernel_arg.addr_a << ", size=" << std::dec + << src_a_buf_size << " bytes\n"; + std::ofstream file("input.a.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open args.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(buf_ptr), src_a_buf_size); + file.close(); + } + { + auto buf_ptr = staging_buf.data(); + 
memcpy(buf_ptr, src_b_data.data(), src_b_data.size() * sizeof(float)); + RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_b, staging_buf.data(), + src_b_buf_size)); + + std::cout << "uploading source B matrix to device, device mem address=" + << std::hex << kernel_arg.addr_b << ", size=" << std::dec + << src_b_buf_size << " bytes\n"; + std::ofstream file("input.b.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open args.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(buf_ptr), src_b_buf_size); + file.close(); + } + } + + // clear destination buffer + { + std::cout << "clear destination buffer" << std::endl; + auto buf_ptr = (int32_t*)staging_buf.data(); + for (uint32_t i = 0; i < ref_data.size(); ++i) { + buf_ptr[i] = 0xdeadbeef; + } + RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_c, staging_buf.data(), dst_buf_size)); + } + + // run tests + std::cout << "run tests" << std::endl; + RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.dim_m, kernel_arg.dim_n)); + std::cout << "PASSED!" 
<< std::endl; + + // cleanup + std::cout << "cleanup" << std::endl; + cleanup(); + + return 0; +} diff --git a/tests/regression/sgemm_gemmini/sgemm_gemmini b/tests/regression/sgemm_gemmini/sgemm_gemmini new file mode 100755 index 0000000000000000000000000000000000000000..67ade61b5a3759e630c5a613c78401d05b717369 GIT binary patch literal 28448 zcmeHwdwf*Ywg1UuBqC&jk~S)p5rZZIF#&=J?MNmMIHMCuAt+k$WHOnM)MO@|2Lyjw zqaoV4(`oV6TKlW^hppbLZT$(owL+{IKnV3#M68P91FPtXh+q&zeB^%Dew>^+%=BY> z@9*~)`}4_Jd#%0J+H0@zCP+CZkf!0?b;)p#D0k5qPM31zE|O~!a!o=` zLWl6Dgwmg6BmGqfy;kTz8U+V?DvDKZsgSe6TEQuX zD-z25z709@%U2De+Lc8jzjXDmL6loUY9|yf>1c1BGy9T`mKh!GfoS)P?ut1x=FBb$ z2TNwMbh3-`pf~@v!n|uUF1jCMrauK;3kge`n{^&T%!r|J5gFhWb3`=HD}}h9#zW zJckU5e>cWnMRPaSYc2kee|dX2;tw^}*K`B}{zh+ehhNdO z<(*PJgH7?dS;l2w`chpjdpbey)D`e z{!qZ*p|!RnFR+FD-j-F`N^g6FFeuV|9l3a->T{DI|>Hq9Rj1w##2 z_$Z^r!H#fUJL8+}543c+eTySyWxh6VNQ;DESr`V?Edp&;vp3xC(}Lj$%b3Sj*dL0xJBdHocydCEIU!Qf9t>;H>7VTmv`1>o>Q-HqS9FBk zU#%{Dji`xojn`;kWhXP<;@}$T;F+_fKZDVT7Hri*-oSFd*4i0?{Ef_uvwcY7O1@GV zMCDxxYrjg7VG6RMCZ)O21{R{(b_QGg)O45od|H@VZkeJj39M`nv}lX`;bZ*2YWhJwe z1s-=bP>^Sr%q&ep%gU0_N;Age+RT#KMyzZ!I&P-@FS{2?GTH+A5z;lfoAI79IW zkql@$2s))V;qQeQlV@ZWXk-I&Inrra;^=sE4DQ+a+T^_@+4rm{iklj|nA zUQx%>D2zjE1VJ5n&_x_BRpiHqq>c-&O{eg$kH~L=#;k%1136K--v`ACc2y>P|_|F z-Kcxe_n7EtKt|YSqK`GGilUq7=paToY@&}dsETsjL`Q=j4WhotMw;WvkYl3TB@r;s zM5q2HLxG7tK@tHSCc1e(SY)D$Ee|GDYNFHnQ-%r?Jzo+5t4#E-ndqvC{&f?*!9*`G z(U+R&XPf9vCi*!hdYg%Uu8H1dqE9l>yG`_OnCLww`gtb$4@~sQCi)r^-C?5NW1^pL zqOUd4r zPI=&z2Tpn5lm||E;J@pEBlh!7sJ$QLsBzmX^Ke|#74-b9Mz-i$>JmfWSTJR4KecQlkp+7w2GH_bp4-b_YI4$glht4-} zTF?&_{)oiY*qeGA1_Cv9NFNYzDWE6zp1ux{#y{89uB%*2T#c^S=S$SY zj9bse&tufs5%ty&iZGJ3sPXKnw6&wgRu>k4P&S~(-dAI9)Ivo)+Ia8rkw{bT@f#vn z+%lo}_`K+JeI*90vJrhPI$-bdK-8heop-^Yej3-xj_DgfkJ)~ME^*+o-s5eN=k)my z=zVTP4y8!xW+=Zd*5J&DeL1kU+J$0JxXN=(7heK*L{*Cp7#cB(4)M?+i4{^N> zCU|0B!n#f1&{x5_etKm<$9(8eW82i&pk7UeqSj7)Mh2J9LP|dk>=3(y#HO%(!`y2fCG#TLg+(%C+9w7aQ_;LGv1F^7xeCXtanFFthzv-KzYKk51)X)S=A-VCy+vY+ge=Rb>>SnEf;_f{K^n0yT#64eIqwLl#D@O>D?v)%dEK 
zxUMiy)@3HuWgQHM)OUk+Y;|D~h6KYR5&9t8;o|A zi>(cDWVR7nka8_yS#5f}PL2h21-cC?rH__k`YOiUoy?3npocuM4@J9G6Td6WL+vLy z>FHz4Nrg;yP_H9>!bvk2?U4R$(8x*Xjc90pbM?x7C0!^Cm5r1 zBE^HSAU?hxT54nO#`CL4Yr%Lcf>Cr6<2 zS7^isehH=bbGhAA+Aomvfrr_mHnweO9{fOYckJn*8WBfJ9l8i{`{W)N%hnTY{hM$u zqq5ygw#1#U<18IzdzXRo8cGR4HApXT(nOFj$e#{C0j_DrD-o0tn9nGKw$DXWzm$vK z!$miO495Q}857Qnxab3tGHAPBihhowBG+YH^m;D3STZJ@A7CvDWo}6sw9S{Io4M!| z5`|rG53?&#ejHIyjv5r_cCO`sr1UV$%vk$rrAE9I$rdz|~rP_*c@C2Ftp1pQh= zjqQ(&Lp5FqF@57lXe});al7qF@MSE6tYVfQT8Tty5B5U(X@39>>m5uNVc#{R5#h}v$%1{sZSW}EU??c9F-$iK?!VqbkXMk$wl(~25suLiGv^I^zg-eDt`#GQXcThq@*oZk5Tkn2AST9DBj)WQ?a zT#|X1@!B#|<#okqUc(qaWL_`LfZPO;*Ut&YotGo8P0Do_+VnOMD8tJrLrk41!&`ZV z_mN-)&+tyfPb%E2R3cuuVpHM#B)9euFI*AyyW>|oP*+c)ZeycoPhmvP`1P z$u)9T?BJO@NXZk-THC}_nJ*j7JYp)XJ{@uwh|GUWdAyHhP&ION*&qFD-){Tr6&Pr1 z6U`%aWyk8uP}MW2e;haq?b@AKkWpXxo_+O|U~$I=)i~b9@%$M`SBtl(n@?nVDo@yN z-;L~!RNM2ipP=W&4fdiPN9@-HE50nd2+8#F_l&L}*sSxp58=>6`T3-TOvCdgnxNAh z?1Y>i&{~7NNcA2lJz~#;el-q0STvxIgFE|aKM#7|1|&T+Yt$EWW|+j7TkVGiAh1Q> zOP1k9g3Qpc-SIzVP{T%Fw6A^xZ;&3Wfg;uRO)G0WvFBX9+cWgv9OSh=6UIt!_Wd5K z;kv$K(I(70C!re_nckRez3qv4kTVrWe}zg(27BT}ON_6;YkU)1AjHSOh*HcdB6pHN z&d4s}$+2%be4vMqYjPob7L_gGeEniHg>I(M_B`UXw6b&#B9_X(c9gY0hdbvLWGH53 z6-=JmL6z=_KOlPcJNDJDqm8(FZYrd%{UeIoH$@6-*!=1P?t~d|hMZxnA&m*=<;Yy$ zPfm*4E@Os}TYty2s(lCTtM7oOo|{Q$4r2CAQ5r2KlGH+>^<}_8I1e?6~VH`=i@w(+e*YdYANmfKEDoJ8BuD zdTIiAXQ!}DFeTQZO|!Sfha|1U)QUA==Du8h5| zj1Myo)mFmt=r7R83z9ndYS^kMqyjlXOQ_$28BxL+!lWPGTzml4m&m?dt=w#1JqZrQ zBL8vB4uz-!X^*EQ}E*dn2aokuk`zkMZVT&qWpj*dKiz@BccB@K*aB zAqd-V^WX{PlwP(M-p1x+<@kBPJc)8_UMSH8L$tEN;Ka+1Az~#PA$oNx;-G%U5Osoa zg?P#2KrB&%b<)KMGC*_mx<598v8T55Os(O=WMQ3E3OEC4;W*4HbutC1ser@&!{bNvRxZ}gTC;lff%3+xQ1C4An z&SQpvy70tC^f^q&7%?bUm3k@<+gD%66M1(!8wky(;G5I2r$N3s2H$*hd=@a&QVWzu zYle=ZK&&0|nF{*&4exQk907F4>!Bcj*F~d#+5Y$ZQiJh8`sM9H;g|J#HypvX6rDRk z)ML;r{gOi?2=~h(aM3H+?3Z==Ypj|^{bHKe-3#0K)JOQ`6a74(a(-RB)r0Lt?4E{@Jv-xJ{!?RLV!Ts} zx9isuy#f;YIuw-V&#}W>4gGvRJ&)$ndLb*)~?PgJl1acB++c*l)WFlAg*V^|7NKd(DCRjHB+#XFc`>L+;J{GvOtD 
z9_**Cl*leBI|vulVmCMYA)enhSNz3bSJVOt&epS=A||Yt>q2s5ZjOw?Kj=GAM0nV>YYEqJ3@c%9rLT1<?N8~JPBpkJ#c4iKX4x_KgK6| zFaH#NBENMa{lLG$Up~Ak_5bsBN`JYo_9UEpu~qN73Tj)ZV%ZO2W~O zh{C>Vb9f^HcU+*%b$ES|sJFwZOn3OZyZpY0zs0G{4@Ltm4#mOv!{;mWU2acZ?fHsp zt^*P)yd6=$Q>g|Ln0v$Awt4tO$R7}efnk9F)R1A~i#r&D+uv(!9>n?oq5c3G)$VD1cv=w+EI> zcZhuWM}`i3)`ah-0uhv*%y9~9#wth%mo(#BruG0nZ!7Vd;>|4X>+pL+uLgu(|YZ~gb1)hb~F3)tw zg+iB6I4{$fR$uc!s;@b{zLs$0AJyg)<>k(y1}od9Xt|C^P;`dr0y^Lr2i2QD9Cix# z@(w=NK_B>G#DEOznX);vfrCE5L#ox3;ayj4 z?ILs&-kp+~5!H9FwP@7gTBKMM$7qih{iD3R<$2R+PO50qIqIKqFdFjt9WF^ z{)+xdQ(mh*&FSqKZ73=2ss4Xr4|--Q&zl<>_c(?hjfNjnZJ@m*wMSD~TiQFd&ZKdG zD1l;gw{P*cdhzPeyj>W@TVy-Kgya?(Czv?+*xFCFue&Vc+^lat0G#$CD_$8H*#o#? z@5l)K5@H8n0bs=&BO?>D6=f;l8Z1=`FdLo@yaKC*0q_@L!Q279Zot)mI^ey4D)twf zuvWSUo2b77r1OkY(02gV5+C4Fz#JSq=){jhspH`5rsFo~*8$f6Qkckrl5yiAC8Ikp zJ()MEe8& zeqf@J&K*RDtqp)mZEjxXg4`UU@5A2*pnCz8#@xJ{b8{3G(c}25g`XR-&{O8;=KVO+ zlUwkktm<4xBHNu?^g~;1Zt3l(sks%sW9oC8vSwzD&8=|dmb!9_z*3!CP@S7sotrcN z3@Quxya4u;0V+#K7Z`-DYD1TcbX}fXm06QYlY8s-zdXD1IW@Pccg*dl{m_=k{!!MC zGc#{YnQiSEtPFobe*3W3R!{k|GSpJObyNnHFATo@G-TI1W`1r%=I)fb++0@<>f#-2 z=Q(J}%yiWymq9h@s>-@IGo`Z2nLiq#ONBqqC%djm_D3E0gV`nh@x7Fq7MzjkP35P$ zlz!rE6}h1uH+Wb}9`lPwa87rwY|qp#sO=7*-EBM_UjUDRajcxF>WbC-<+=ODW&J7# z=GB~3@h;jD#;!99raUW0^MtgyD#1zZaW?)Y!3GCl_7*%4=p6Y6@cTQM&Ywpt0i+edpMJ!-&2FQfnLkI{>Vn*=Kq|MfZ>>Loo(AZ_8r}#aBN^Xp{1-Un zv(qpB=~>F~*c~w7rE%jQIZOHVcxqtBa>w6)mhww`4}>?KX+!*_Gi^xy_{_=Rhj{iG zDD(gSfa56^*x-bEE#)Y=kAf8*g%ReDn=D1soOd;NSo!D=h56kCiFNaY%kA zPRBnKXk4OzUxKh8zYC`^lmgWyg`;1N1PQJc`N_IiD;Q`vqOeD#^Vx}_JT1m+YIhW< z%}}uVUWh3@*Yk8a#LDtg8>Aq|*spU`$q@yR^tHJ>LE7CW9EFy`g3SLsflEH@JhSqF zosqEX$naglk$7J+T-u4_6^6Hn^iKsY{UX%=j{$kFnN?w_gHx&mY!I+Xz%BuM1Y9HF zS^?JyI3VCI0rv@bSU_B+#X_Ed4gpIAtP-$6z$O8^1nd!Tjeu(fTqodwfV%|TC*WZL z<)u1#Q@9-t0ZRp}60kwQCIPzy>=AH{fLzNz{>*9OGFjEhvbZy2uA^v4b2Jc%Ix2BX zM(K>QD8tIWKeM!?6xU`Hb9%B|Dl7}VyFfNH6GImEASzPQvy`PN>DkInDd{$&ADh(U zc8b%C{+mo6qsV@qOwUnd`%k8yuE>6qOdqSr{+Uca!|2D!bc|{aGlj>qaI2ytB|TSJ 
zo1%WR6uYu6B|T4ZOiiVKf+ELFb7~gdG*xA$@OTzx%kq9cb7~geoNE@CDUv^5S#w1y z`qvaWKATgsl&>ptTu!DJD5=JUER6PY98KoODBonJ@OYN#{wZ^6mT{}(C^ZAoY~`bo zQ3{XIc%O;4NnRPpY2HGC-ZMm(es&NMf!t1XdASCgKY&(+zYIuq_z+=n4i^HX%9nP* zNl!y5=aWyG!xO#7LZ?BS=<>ZQ`TG!|eAhK|3YC+>UHHpL;r}NS-(~z0l^>>P52?!gF=djXy)DKIFq38h4#r?SXu62e2r>e)vjGm`73IBM7z_c{{m82hbw)r@t^z%H>&quxmO`I<6{7xD@ zy=myzLh#JYlgI5}rs4lB=wwfhFFD`egPyAW{AU_He@#PwI}M%IQ4`P(^DN~(Ci1o3 z_csBKO6C7)pp*T1K5oBkhqKf0)BZ^+J%#8O)IJ?zoV%FklsRemf0BlNb{c!0PQyP$`NE$)B40i}B6sE=X~if!4|E!T6u~c3 zCSlyjgPtx?FOohjjh=aF=(m7Q?XdJRu7Qusit=z8{{5g+zE;0|l!m_$?~7FJ!OiIC z*J9o17X`Qmbn?$(G0w>L=1-$1A^01FA0+=hY4{&WL*EH{s`tfvj34j!N^YPdi$fcY z1HxuAm;}1ROc3#kH2e*Wj`3gk0nLMjCeX>A!{Yr?hWe)PFKP7LoreBc8u}K{9ZX~L zHhVj? z792!{HE*04G0`}}fxFk}z zW8wVyi|cSFsH@sjM_Na(XsoJY7bN1ML%PS1T^7o()f5T*=0twmb+^{l z)y`x$Va_aH?hp7wI5NeJsW|ub2Yh}FCnKHFj_-c=yW&E}(QN4IM9oj7s`)vWWrig4bvVQ5J zQ!w-PnS3Tj7klE?&(Tuj#!buRuI9^j&DV#H%24}4K}CCD7u6)Inm9$mRz*va9EUrob~sw15T2q+ih8oOTN09ZWd1BL-SzQ=K8`> z_QIgw1JUnP=wO{GFy9s}FNIFhfSTvOgJ5GWIxXGCD>$ULQ(OUERb{%CH)(8;FBBRZ zv}|O zlmvnif64Mdv;;qT!ZqEIRVGq%v>pEvqrF9taEDN=eJA0J7jP65=WN ztyh?ScE#8Pq9HBh@9>g!oy)qXFibjBv-_Xv3jcL;;YN8~#x zFV|)5ppl4_m;3z^%5yxhuweE7_dumHGAS?51te@0a?*auC*k*zPG@csm*)ot6!K(0 z*(vQ8WF>(J6;aB|a|Q|JIiSp6%FFW87ZxO6jXw(VTv0-K4oUK~XK9tcA2jlb)UP;z zP;js(R%pq=+J5duvRU5Y;QSKy$btzq*g0oG!hZ*5mR~!Cr%NdLIFZ3NIY>E4dkPVA{_^}qLV2Do`K|ujCgcq_RS8Z3 zb)#Z9-J1WiAXENQUY_Sj_y89(p4R;LTIA)qkA&6^FWZk)xZf%-&I9B*(E`aRAlYgT zp8%nwZ_(|Ib0qoyoUG-Q`Ae7ug?5v?Ja1a+;kigYj*Yf|Nw-6uTrBgK=Th?jb2)@# zssB=ta?-Bff<|SM^78zu=St#MlpafgrM!gnyEL=B(!|8@!x0W00+m3@Nu17~h$i!| zYT_vlAzvyJgg}*m5?mniFO>uVrTjDtoouCGhO+JP6#i@w`FD#Sc+l?_X5mlTFUK!B kRfBuzjhmHZ(;=a_C0u%OC+0X9c<#Q*>R literal 0 HcmV?d00001 From 041d49fb58a7d0fb8087d20b6d4ef48c4ce35738 Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Mon, 15 Apr 2024 10:22:00 -0700 Subject: [PATCH 65/75] update gemmini only kernel --- tests/kernel/gemmini_mmio/gemmini_mmio.h | 13 ++++++++----- tests/kernel/gemmini_mmio/main.cpp | 18 ++++++++++-------- 2 
files changed, 18 insertions(+), 13 deletions(-) diff --git a/tests/kernel/gemmini_mmio/gemmini_mmio.h b/tests/kernel/gemmini_mmio/gemmini_mmio.h index b7712441..e2876927 100644 --- a/tests/kernel/gemmini_mmio/gemmini_mmio.h +++ b/tests/kernel/gemmini_mmio/gemmini_mmio.h @@ -14,7 +14,7 @@ #define SPAD_NUM_ROWS (SMEM_SIZE / SPAD_ROW_SIZE) #define SPAD_MASK (SPAD_NUM_ROWS - 1) -#define PRINT_BUF SMEM_ADDR_END +#define PRINT_BUF ((char *) (SMEM_ADDR_END)) #define GEMMINI_RS1_ADDR 0xff007010 #define GEMMINI_RS2_ADDR 0xff007018 #define GEMMINI_INST_ADDR 0xff007000 @@ -32,7 +32,8 @@ (((i) / DIM * (J) / DIM + (j) / DIM) * DIM * DIM + ((i) % DIM) * DIM + ((j) % DIM)) // #define fence() { for (int i = 0; i < 10; i++) *((volatile uint32_t *) (0xFFFF0000)) = 0xdeadbeef; } -#define fence() { while (*((volatile uint32_t *) GEMMINI_BUSY_ADDR)) asm volatile ("nop"); } +#undef gemmini_fence +#define gemmini_fence() { while (*((volatile uint32_t *) GEMMINI_BUSY_ADDR)) asm volatile ("nop"); } #undef ROCC_INSTRUCTION_RS1_RS2 #define ROCC_INSTRUCTION_RS1_RS2(x, rs1, rs2, funct) { \ @@ -60,6 +61,7 @@ static void sp_tiled_matmul_full_spad_ws(const uint32_t A_sp_addr_start, const u a_transpose, b_transpose, full_C, low_D, false, act, 0, 0, false); + /* return; @@ -72,7 +74,7 @@ static void sp_tiled_matmul_full_spad_ws(const uint32_t A_sp_addr_start, const u const int C_blocks = 1; //full_C ? 1 : (J <= MAX_BLOCK_LEN ? J : MAX_BLOCK_LEN); // const size_t sizeof_D = low_D ? sizeof(elem_t) : sizeof(acc_t); const size_t sizeof_C = full_C ? 
sizeof(acc_t) : sizeof(elem_t); - fence(); + gemmini_fence(); if (a_transpose || b_transpose || (I < 4)) { for (size_t k = 0; k < K; k++) { @@ -140,7 +142,7 @@ static void sp_tiled_matmul_full_spad_ws(const uint32_t A_sp_addr_start, const u gemmini_extended_compute_accumulated(A_sp_addr + 3 * K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM); } if (k == K - 1) { - for (int x = 0; x < 3; x++) fence(); + for (int x = 0; x < 3; x++) gemmini_fence(); gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + (i * J + j) * DIM, 1, C_sp_addr, DIM, DIM); gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + ((i + 1) * J + j) * DIM, 1, C_sp_addr + J * DIM, DIM, DIM); gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + ((i + 2) * J + j) * DIM, 1, C_sp_addr + 2 * J * DIM, DIM, DIM); @@ -152,7 +154,8 @@ static void sp_tiled_matmul_full_spad_ws(const uint32_t A_sp_addr_start, const u } } } - fence(); + gemmini_fence(); + */ } diff --git a/tests/kernel/gemmini_mmio/main.cpp b/tests/kernel/gemmini_mmio/main.cpp index 3d660b7e..d5be2558 100644 --- a/tests/kernel/gemmini_mmio/main.cpp +++ b/tests/kernel/gemmini_mmio/main.cpp @@ -3,7 +3,7 @@ #include #include #include -#include +#include "include/gemmini.h" #include "gemmini_mmio.h" #define rd_cycles(x) asm volatile ("csrr %0, mcycle" : "=r" (x)) @@ -12,7 +12,7 @@ int main() { int cid; asm volatile ("csrr %0, 0xcc2" : "=r" (cid)); - if (cid > 0) return 0; + if (cid > 0) vx_tmc(0); vx_tmc(0xff); @@ -40,7 +40,11 @@ int main() { vx_tmc_one(); gemmini_config_ld(0); gemmini_extended_config_ex(WEIGHT_STATIONARY, 0, 0, 1, 0, 0); - gemmini_config_st(DIM * 4 * J); + gemmini_config_st(0); + /* sprintf(print_buf, "A spad: 0x%x-0x%x, smem: 0x%x-%x\n", spad_A, spad_A + I * K * DIM, (uint32_t) smem_A, (uint32_t) smem_A + sizeof(float) * I * K * DIM * DIM); + sprintf(print_buf, "B spad: 0x%x-0x%x, smem: 0x%x-%x\n", spad_B, spad_B + K * J * DIM, (uint32_t) smem_B, (uint32_t) smem_B + sizeof(float) * K * J * DIM * DIM); + 
sprintf(print_buf, "C spad: 0x%x-0x%x, smem: 0x%x-%x\n", spad_C, spad_C + I * J * DIM, (uint32_t) smem_C, (uint32_t) smem_C + sizeof(float) * I * J * DIM * DIM); */ + sprintf(print_buf, "DIM %d\n", DIM); sprintf(print_buf, "num cores %d\n", nc); sprintf(print_buf, "num threads %d\n", nt); @@ -49,9 +53,6 @@ int main() { sprintf(print_buf, "%d", tid); uint32_t start_cycles, end_cycles; - /* sprintf(print_buf, "A spad: 0x%x-0x%x, smem: 0x%x-%x\n", spad_A, spad_A + I * K * DIM, (uint32_t) smem_A, (uint32_t) smem_A + sizeof(float) * I * K * DIM * DIM); - sprintf(print_buf, "B spad: 0x%x-0x%x, smem: 0x%x-%x\n", spad_B, spad_B + K * J * DIM, (uint32_t) smem_B, (uint32_t) smem_B + sizeof(float) * K * J * DIM * DIM); - sprintf(print_buf, "C spad: 0x%x-0x%x, smem: 0x%x-%x\n", spad_C, spad_C + I * J * DIM, (uint32_t) smem_C, (uint32_t) smem_C + sizeof(float) * I * J * DIM * DIM); */ rd_cycles(start_cycles); // load A with 128->1 in row-major order @@ -87,7 +88,7 @@ int main() { vx_tmc_one(); sprintf(print_buf, "\ndata loading took %d cycles for %d floats\n", end_cycles - start_cycles, DIM * DIM * (I * K + J * K)); - fence(); + gemmini_fence(); // sprintf(print_buf, "\nA in\n"); // for (int i = 0; i < I * DIM; i++) { @@ -113,7 +114,7 @@ int main() { /*no_bias=*/1, /*repeating_bias=*/0, /*act=*/NO_ACTIVATION); rd_cycles(fence_cycles); - fence(); + gemmini_fence(); rd_cycles(end_cycles); sprintf(print_buf, "gemmini cycles taken: %d, fence cycles: %d\n", end_cycles - start_cycles, end_cycles - fence_cycles); @@ -138,5 +139,6 @@ int main() { } sprintf(print_buf, "TEST PASSED\n"); + vx_tmc(0); return 0; } From 449d99f0bb4b3e978e5bae107e9f59f61e8c58b6 Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Tue, 16 Apr 2024 17:15:22 -0700 Subject: [PATCH 66/75] dram gemm kernel --- kernel/include/gemmini_mmio.h | 162 ++++++++++++ tests/regression/sgemm_gemmini/kernel.cpp | 286 +++++++++++++++++----- 2 files changed, 387 insertions(+), 61 deletions(-) create mode 100644 
kernel/include/gemmini_mmio.h diff --git a/kernel/include/gemmini_mmio.h b/kernel/include/gemmini_mmio.h new file mode 100644 index 00000000..b9dde44d --- /dev/null +++ b/kernel/include/gemmini_mmio.h @@ -0,0 +1,162 @@ +#ifndef GEMMINI_MMIO_H +#define GEMMINI_MMIO_H +#ifndef GEMMINI_PARAMS_H +#error INCLUDE GEMMINI.H FIRST +#endif + +#define SMEM_BASE 0xff000000 +#define SMEM_SIZE 0x4000 +#define SMEM_MASK (SMEM_SIZE - 1) +#define SMEM_ADDR_END 0xff008000 + +#define SPAD_BASE 0x0 +#define SPAD_ROW_SIZE (DIM * sizeof(elem_t)) +#define SPAD_NUM_ROWS (SMEM_SIZE / SPAD_ROW_SIZE) +#define SPAD_MASK (SPAD_NUM_ROWS - 1) + +#define PRINT_BUF ((char *) (SMEM_ADDR_END)) +#define GEMMINI_RS1_ADDR 0xff007010 +#define GEMMINI_RS2_ADDR 0xff007018 +#define GEMMINI_INST_ADDR 0xff007000 +#define GEMMINI_BUSY_ADDR 0xff007020 + +#define SMEM_TO_SPAD(smem_addr) (SPAD_BASE + ((smem_addr) & SMEM_MASK) / SPAD_ROW_SIZE) +#define SPAD_TO_SMEM(spad_addr) (SMEM_BASE + ((spad_addr) & SPAD_MASK) * SPAD_ROW_SIZE) + +// convert normal matrix i,j into tiled smem offset +// top_in_tiles = i / DIM +// left_in_tiles = j / DIM +// num_tiles_before_current = top_in_tiles * (J / DIM) + left_in_tiles +// smem_addr = num_tiles_before_current * DIM * DIM + (i % DIM) * DIM + (j % DIM) +#define SMEM_MAT_OFFSET(i, j, J) \ + (((i) / DIM * (J) / DIM + (j) / DIM) * DIM * DIM + ((i) % DIM) * DIM + ((j) % DIM)) + +// #define fence() { for (int i = 0; i < 10; i++) *((volatile uint32_t *) (0xFFFF0000)) = 0xdeadbeef; } +#undef gemmini_fence +#define gemmini_fence() { while (*((volatile uint32_t *) GEMMINI_BUSY_ADDR)) asm volatile ("nop"); } + +#undef ROCC_INSTRUCTION_RS1_RS2 +#define ROCC_INSTRUCTION_RS1_RS2(x, rs1, rs2, funct) { \ + /* printf("function %d\n", funct); */ \ + uint32_t instruction = (0x7B) | (0 << 7) | (3 << 12) | (1 << 15) | (2 << 20) | ((uint32_t) (funct) << 25); \ + *((volatile uint64_t *) GEMMINI_RS1_ADDR) = (volatile uint64_t) (rs1); \ + *((volatile uint64_t *) GEMMINI_RS2_ADDR) = (volatile 
uint64_t) (rs2); \ + /* *((volatile uint32_t*) GEMMINI_RS2_ADDR) = (uint32_t) ((uint64_t) (rs2) & 0xFFFFFFFFULL); */ \ + /* *((volatile uint32_t*) (GEMMINI_RS2_ADDR + 4)) = (uint32_t) ((uint64_t) (rs2) >> 32); */ \ + /* gemmini_fence(); */ \ + *((volatile uint32_t*) GEMMINI_INST_ADDR) = instruction; \ + /* sprintf((char *) PRINT_BUF, "%llx %llx %d\n", rs1, rs2, funct); */ \ +} + +static void sp_tiled_matmul_full_spad_ws(const uint32_t A_sp_addr_start, const uint32_t B_sp_addr_start, + const uint32_t D_sp_addr_start, const uint32_t C_dst_sp_addr_start, + size_t I, size_t J, size_t K, size_t pad_I, size_t pad_J, size_t pad_K, + bool a_transpose, bool b_transpose, + bool full_C, bool low_D, + bool no_bias, bool repeating_bias, + int act) { + + gemmini_loop_ws_spad(I, J, K, pad_I, pad_J, pad_K, + A_sp_addr_start, B_sp_addr_start + K * J * DIM, NULL, C_dst_sp_addr_start, + a_transpose, b_transpose, + full_C, low_D, false, + act, 0, 0, false); + /* + return; + + + // const uint32_t A_sp_addr_start = 0; + // const uint32_t B_sp_addr_start = BANK_NUM * BANK_ROWS - K * J * DIM; + // const uint32_t D_sp_addr_start = 1 << (ADDR_LEN-1); + const uint32_t C_sp_addr_start = 2 << (ADDR_LEN-2) | (full_C << (ADDR_LEN-3)); + // const int D_blocks = low_D ? (J <= MAX_BLOCK_LEN ? J : MAX_BLOCK_LEN) : + // (J <= MAX_BLOCK_LEN_ACC ? J : MAX_BLOCK_LEN_ACC); + const int C_blocks = 1; //full_C ? 1 : (J <= MAX_BLOCK_LEN ? J : MAX_BLOCK_LEN); + // const size_t sizeof_D = low_D ? sizeof(elem_t) : sizeof(acc_t); + const size_t sizeof_C = full_C ? sizeof(acc_t) : sizeof(elem_t); + gemmini_fence(); + + if (a_transpose || b_transpose || (I < 4)) { + for (size_t k = 0; k < K; k++) { + for (size_t j = 0; j < J; j++) { + for (size_t i = 0; i < I; i++) { + const uint32_t A_sp_addr = a_transpose ? (A_sp_addr_start + (k*I + i)*DIM) : + (A_sp_addr_start + (i*K + k)*DIM); + const uint32_t B_sp_addr = b_transpose ? 
(B_sp_addr_start + (j*K + k)*DIM) : + (B_sp_addr_start + (k*J + j)*DIM); + const uint32_t C_sp_addr = C_sp_addr_start + (i*J + j)*DIM; + // Compute + uint32_t pre_sp_addr = i == 0 ? B_sp_addr : GARBAGE_ADDR; + uint32_t out_sp_addr = C_sp_addr | ((k == 0 ? 0 : 1) << (ADDR_LEN-2)); + gemmini_extended_preload(pre_sp_addr, out_sp_addr, DIM, DIM, DIM, DIM); + if (i == 0) { // First iteration + gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, DIM, DIM, DIM, DIM); + } else { // All other iterations + gemmini_extended_compute_accumulated(A_sp_addr, GARBAGE_ADDR, DIM, DIM, DIM, DIM); + } + if (k == K - 1) { + // Move-out C (if not normalizing) + // if (((act != LAYERNORM) && (act != SOFTMAX)) && (j == J-1 || j % C_blocks == C_blocks-1)) { + const size_t rounded_j = j; // (j / C_blocks) * C_blocks; + const uint32_t rounded_C_sp_addr = C_sp_addr; // C_sp_addr_start + (i*J + rounded_j)*DIM; + + const uint32_t C_dst_sp_addr = ((uint32_t) C_dst_sp_addr_start) + (i * J + rounded_j) * DIM; // * DIM * sizeof_C; + + // const size_t blocks = rounded_j + C_blocks <= J ? C_blocks : J-rounded_j; + constexpr size_t cols = DIM; // blocks * DIM - (rounded_j + blocks >= J ? pad_J : 0); + constexpr size_t rows = DIM; // DIM - (i == I - 1 ? pad_I : 0); + + gemmini_extended_mvout_spad(C_dst_sp_addr, 1, rounded_C_sp_addr, cols, rows); + // } + } + } + } + } + } else { + for (size_t k = 0; k < K; k++) { + for (size_t j = 0; j < J; j++) { + uint32_t A_sp_addr = A_sp_addr_start + k * DIM; // (i*K + k)*DIM; + const uint32_t B_sp_addr = B_sp_addr_start + (k*J + j)*DIM; + uint32_t C_sp_addr = C_sp_addr_start + j * DIM; // (i*J + j)*DIM; + for (size_t i = 0; i < I; i += 4) { + // Compute + // constexpr uint32_t pre_sp_addr = i == 0 ? B_sp_addr : GARBAGE_ADDR; + const uint32_t out_sp_addr = C_sp_addr | ((k == 0 ? 
0 : 1) << (ADDR_LEN-2)); + if (i == 0) { // First iteration + gemmini_extended_preload(B_sp_addr, out_sp_addr, DIM, DIM, DIM, DIM); + gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, DIM, DIM, DIM, DIM); + gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + J * DIM, DIM, DIM, DIM, DIM); + gemmini_extended_compute_accumulated(A_sp_addr + K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM); + gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + 2 * J * DIM, DIM, DIM, DIM, DIM); + gemmini_extended_compute_accumulated(A_sp_addr + 2 * K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM); + gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + 3 * J * DIM, DIM, DIM, DIM, DIM); + gemmini_extended_compute_accumulated(A_sp_addr + 3 * K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM); + } else { // All other iterations + gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr, DIM, DIM, DIM, DIM); + gemmini_extended_compute_accumulated(A_sp_addr, GARBAGE_ADDR, DIM, DIM, DIM, DIM); + gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + J * DIM, DIM, DIM, DIM, DIM); + gemmini_extended_compute_accumulated(A_sp_addr + K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM); + gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + 2 * J * DIM, DIM, DIM, DIM, DIM); + gemmini_extended_compute_accumulated(A_sp_addr + 2 * K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM); + gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + 3 * J * DIM, DIM, DIM, DIM, DIM); + gemmini_extended_compute_accumulated(A_sp_addr + 3 * K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM); + } + if (k == K - 1) { + for (int x = 0; x < 3; x++) gemmini_fence(); + gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + (i * J + j) * DIM, 1, C_sp_addr, DIM, DIM); + gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + ((i + 1) * J + j) * DIM, 1, C_sp_addr + J * DIM, DIM, DIM); + gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + ((i + 2) * J + j) * DIM, 1, C_sp_addr + 2 * J * DIM, DIM, DIM); + gemmini_extended_mvout_spad((uint32_t) 
C_dst_sp_addr_start + ((i + 3) * J + j) * DIM, 1, C_sp_addr + 3 * J * DIM, DIM, DIM); + } + A_sp_addr += 4 * K * DIM; + C_sp_addr += 4 * J * DIM; + } + } + } + } + gemmini_fence(); + */ +} + + +#endif diff --git a/tests/regression/sgemm_gemmini/kernel.cpp b/tests/regression/sgemm_gemmini/kernel.cpp index 34c72d00..dfe15327 100644 --- a/tests/regression/sgemm_gemmini/kernel.cpp +++ b/tests/regression/sgemm_gemmini/kernel.cpp @@ -24,16 +24,20 @@ #define THREAD_ELEMS 8 // elements per thread in a tile #define THREAD_STRIDE 8 // threads per core -#define SMEM_ADDR_0K ((float *) 0xff000000) -#define SMEM_ADDR_4K ((float *) 0xff001000) -#define SMEM_ADDR_8K ((float *) 0xff002000) -#define SMEM_ADDR_12K ((float *) 0xff003000) +#define SMEM_ADDR_0K ((float * const) 0xff000000) +#define SMEM_ADDR_4K ((float * const) 0xff001000) +#define SMEM_ADDR_8K ((float * const) 0xff002000) +#define SMEM_ADDR_12K ((float * const) 0xff003000) #define SPAD_ADDR_0K 0x0 #define SPAD_ADDR_4K 0x80 #define SPAD_ADDR_8K 0x100 #define SPAD_ADDR_12K 0x180 +#define HARDCODE +#define PRINTF(...) sprintf(PRINT_BUF, __VA_ARGS__) +//#define PRINTF(...) 
vx_printf(__VA_ARGS__) + // #define DEBUG_PRINT #define rd_cycles(x) asm volatile ("csrr %0, mcycle" : "=r" (x)) @@ -55,13 +59,27 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, const uint32_t num_tiles_n = dim_n / TILE_N; const uint32_t num_tiles_k = dim_k / TILE_K; // TODO: make this into constexpr by subbing architectural params with macros - const uint32_t num_threads_in_cluster = vx_num_threads() * vx_num_warps() * CORES_PER_CLUSTER; + // const uint32_t num_threads_in_cluster = vx_num_threads() * vx_num_warps() * CORES_PER_CLUSTER; + constexpr uint32_t num_threads_in_cluster = 128; + constexpr uint32_t a_elems_per_thread = TILE_MK / num_threads_in_cluster; + constexpr uint32_t b_elems_per_thread = TILE_NK / num_threads_in_cluster; + constexpr uint32_t c_elems_per_thread = TILE_MN / num_threads_in_cluster; const uint32_t hw_tid = tid_in_threadblock % num_threads_in_cluster; - const uint32_t a_elems_per_thread = TILE_MK / num_threads_in_cluster; - const uint32_t b_elems_per_thread = TILE_NK / num_threads_in_cluster; - const uint32_t c_elems_per_thread = TILE_MN / num_threads_in_cluster; const uint32_t thread_load_offset = hw_tid; - const uint32_t thread_load_stride = num_threads_in_cluster; + constexpr uint32_t thread_load_stride = num_threads_in_cluster; + + // the dram coordinates are (i1 + i0, j1 + j0). i0 and j0 are both spatially mapped only. + const uint32_t j0 = hw_tid % DIM; + const uint32_t i0 = (hw_tid / DIM) % DIM; + + // j1 is both spatially and temporally mapped. j1 increases every iteration. + const uint32_t j1_idx = (hw_tid / DIM / DIM) * DIM; // A: % TILE_K, B: % TILE_N, C: % TILE_N + // every iteratioon, j1 increases by j1_stride + constexpr uint32_t j1_stride = (num_threads_in_cluster / DIM / DIM) * DIM; // mod TILE_W after stride + + // i1 is only temporally mapped. 
i1 increments every one or more iterations + constexpr uint32_t i1_stride = DIM; // step per increment (increment doesnt happen every iteration) + constexpr uint32_t i1_iters = (DIM * DIM * (TILE_K / DIM)) / num_threads_in_cluster; // num of iters before striding uint32_t marker0, marker1, marker2, marker3, marker4; uint32_t marker5, marker6, marker7, marker8, marker9; @@ -70,10 +88,9 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, gemmini_config_ld(0); gemmini_extended_config_ex(WEIGHT_STATIONARY, 0, 0, 1, 0, 0); gemmini_config_st(0); - sprintf(PRINT_BUF, "start\n"); + PRINTF("start\n"); } - // TODO: check for tb id rd_cycles(marker0); @@ -82,6 +99,7 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, tile_i += 1) { for (int tile_j = 0; tile_j < num_tiles_n; tile_j += 1) { float * const smem_c_tile_start = SMEM_ADDR_4K; + float * const smem_acc_tile_start = SMEM_ADDR_8K; float * const dram_c_tile_start = C + tile_i * TILE_M * dim_n + tile_j * TILE_N; for (int tile_k = 0; tile_k < num_tiles_k; tile_k += 1) { @@ -93,57 +111,153 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, rd_cycles(marker1); + #ifdef HARDCODE + #if (TILE_MK / NUM_THREADS / NUM_WARPS / CORES_PER_CLUSTER) != 8 + #error CANNOT UNROLL + #endif // preload A matrix -#pragma GCC unroll 8 // TODO: macro computed - for (int thread_i = 0; thread_i < a_elems_per_thread; thread_i++) { - uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i; - smem_a_tile_start[SMEM_MAT_OFFSET(elem_offset / TILE_K, elem_offset % TILE_K, TILE_K)] = \ - dram_a_tile_start[elem_offset / TILE_K * dim_k + elem_offset % TILE_K]; - } + { + constexpr uint32_t every_iter = j1_stride; + const uint32_t every_2iters = i1_stride * dim_k; + const uint32_t runtime_const = i0 * dim_k + j1_idx + j0; + smem_a_tile_start[0 * num_threads_in_cluster + hw_tid] = \ + dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 0]; + smem_a_tile_start[1 * 
num_threads_in_cluster + hw_tid] = \ + dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 0]; + smem_a_tile_start[2 * num_threads_in_cluster + hw_tid] = \ + dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 1]; + smem_a_tile_start[3 * num_threads_in_cluster + hw_tid] = \ + dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 1]; + smem_a_tile_start[4 * num_threads_in_cluster + hw_tid] = \ + dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 2]; + smem_a_tile_start[5 * num_threads_in_cluster + hw_tid] = \ + dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 2]; + smem_a_tile_start[6 * num_threads_in_cluster + hw_tid] = \ + dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 3]; + smem_a_tile_start[7 * num_threads_in_cluster + hw_tid] = \ + dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 3]; + /* const float v0 = dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 0]; + const float v1 = dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 0]; + const float v2 = dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 1]; + const float v3 = dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 1]; + const float v4 = dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 2]; + const float v5 = dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 2]; + const float v6 = dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 3]; + const float v7 = dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 3]; -#ifdef DEBUG_PRINT + smem_a_tile_start[0 * num_threads_in_cluster + hw_tid] = v0; + smem_a_tile_start[1 * num_threads_in_cluster + hw_tid] = v1; + smem_a_tile_start[2 * num_threads_in_cluster + hw_tid] = v2; + smem_a_tile_start[3 * num_threads_in_cluster + hw_tid] = v3; + smem_a_tile_start[4 * num_threads_in_cluster + hw_tid] = v4; + smem_a_tile_start[5 * 
num_threads_in_cluster + hw_tid] = v5; + smem_a_tile_start[6 * num_threads_in_cluster + hw_tid] = v6; + smem_a_tile_start[7 * num_threads_in_cluster + hw_tid] = v7; */ + } + #else + #pragma GCC unroll 8 // TODO: macro computed + for (uint32_t thread_i = 0, j1 = 0, i1 = 0; + thread_i < a_elems_per_thread; + thread_i += 1, + j1 = (j1 + j1_stride) % TILE_K, + i1 = (thread_i % i1_iters == 0) ? i1 + i1_stride : i1) { + smem_a_tile_start[thread_i * num_threads_in_cluster + hw_tid] = \ + dram_a_tile_start[(0 + i0) * dim_k + j1 + j1_idx + j0]; + } + // for (int thread_i = 0; thread_i < a_elems_per_thread; thread_i++) { + // uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i; + // smem_a_tile_start[SMEM_MAT_OFFSET(elem_offset / TILE_K, elem_offset % TILE_K, TILE_K)] = \ + // dram_a_tile_start[elem_offset / TILE_K * dim_k + elem_offset % TILE_K]; + // } + #endif + + #ifdef DEBUG_PRINT if (hw_tid == 0) { - sprintf(PRINT_BUF, "\nA %d %d\n", tile_i, tile_k); + PRINTF("\nA %d %d\n", tile_i, tile_k); for (int i = 0; i < TILE_M; i += 8) { for (int j = 0; j < TILE_K; j += 8) { uint32_t mat_offset = SMEM_MAT_OFFSET(i, j, TILE_K); - sprintf(PRINT_BUF, "%x %x ", + PRINTF("%x %x ", (int) (smem_a_tile_start[mat_offset]), (int) (smem_a_tile_start[mat_offset + 4]) ); } - sprintf(PRINT_BUF, "\n"); + PRINTF("\n"); } } -#endif + #endif + + threadblock_barrier(0, /*barrier_id=*/threadblock_id, /*count=*/NUM_WARPS); // preload B matrix -#pragma GCC unroll 8 + #ifdef HARDCODE + #if (TILE_NK / NUM_THREADS / NUM_WARPS / CORES_PER_CLUSTER) != 8 + #error CANNOT UNROLL + #endif + constexpr uint32_t every_iter = j1_stride; + const uint32_t every_2iters = i1_stride * dim_n; + const uint32_t runtime_const = i0 * dim_n + j1_idx + j0; + smem_b_tile_start[0 * num_threads_in_cluster + hw_tid] = \ + dram_b_tile_start[runtime_const + every_iter * 0 + every_2iters * 0]; + smem_b_tile_start[1 * num_threads_in_cluster + hw_tid] = \ + dram_b_tile_start[runtime_const + every_iter * 1 + 
every_2iters * 0]; + smem_b_tile_start[2 * num_threads_in_cluster + hw_tid] = \ + dram_b_tile_start[runtime_const + every_iter * 0 + every_2iters * 1]; + smem_b_tile_start[3 * num_threads_in_cluster + hw_tid] = \ + dram_b_tile_start[runtime_const + every_iter * 1 + every_2iters * 1]; + smem_b_tile_start[4 * num_threads_in_cluster + hw_tid] = \ + dram_b_tile_start[runtime_const + every_iter * 0 + every_2iters * 2]; + smem_b_tile_start[5 * num_threads_in_cluster + hw_tid] = \ + dram_b_tile_start[runtime_const + every_iter * 1 + every_2iters * 2]; + smem_b_tile_start[6 * num_threads_in_cluster + hw_tid] = \ + dram_b_tile_start[runtime_const + every_iter * 0 + every_2iters * 3]; + smem_b_tile_start[7 * num_threads_in_cluster + hw_tid] = \ + dram_b_tile_start[runtime_const + every_iter * 1 + every_2iters * 3]; + /* const float v0 = dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 0]; + const float v1 = dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 0]; + const float v2 = dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 1]; + const float v3 = dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 1]; + const float v4 = dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 2]; + const float v5 = dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 2]; + const float v6 = dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 3]; + const float v7 = dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 3]; + + smem_a_tile_start[0 * num_threads_in_cluster + hw_tid] = v0; + smem_a_tile_start[1 * num_threads_in_cluster + hw_tid] = v1; + smem_a_tile_start[2 * num_threads_in_cluster + hw_tid] = v2; + smem_a_tile_start[3 * num_threads_in_cluster + hw_tid] = v3; + smem_a_tile_start[4 * num_threads_in_cluster + hw_tid] = v4; + smem_a_tile_start[5 * num_threads_in_cluster + hw_tid] = v5; + smem_a_tile_start[6 * num_threads_in_cluster + hw_tid] = v6; + smem_a_tile_start[7 * 
num_threads_in_cluster + hw_tid] = v7; */ + #else + #pragma GCC unroll 8 for (int thread_i = 0; thread_i < b_elems_per_thread; thread_i++) { uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i; smem_b_tile_start[SMEM_MAT_OFFSET(elem_offset / TILE_N, elem_offset % TILE_N, TILE_N)] = \ dram_b_tile_start[elem_offset / TILE_N * dim_n + elem_offset % TILE_N]; } + #endif -#ifdef DEBUG_PRINT + #ifdef DEBUG_PRINT if (hw_tid == 0) { - sprintf(PRINT_BUF, "\nB %d %d\n", tile_k, tile_j); + PRINTF("\nB %d %d\n", tile_k, tile_j); for (int i = 0; i < TILE_K; i += 8) { for (int j = 0; j < TILE_N; j += 8) { uint32_t mat_offset = SMEM_MAT_OFFSET(i, j, TILE_N); - sprintf(PRINT_BUF, "%x %x ", + PRINTF("%x %x ", (int) (smem_b_tile_start[mat_offset]), (int) (smem_b_tile_start[mat_offset + 4]) ); } - sprintf(PRINT_BUF, "\n"); + PRINTF("\n"); } } -#endif - rd_cycles(marker2); + #endif + rd_cycles(marker2); // cluster wide barrier to wait for A and B loads to complete - threadblock_barrier(0, /*barrier_id=*/threadblock_id, /*count=*/num_threads_in_cluster); + threadblock_barrier(0, /*barrier_id=*/threadblock_id, /*count=*/NUM_WARPS); rd_cycles(marker3); if (hw_tid == 0) { sp_tiled_matmul_full_spad_ws(SPAD_ADDR_0K, SPAD_ADDR_12K, /*spad_D=*/0, SPAD_ADDR_4K, @@ -153,57 +267,92 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, gemmini_fence(); } rd_cycles(marker4); - threadblock_barrier(0, /*barrier_id=*/threadblock_id, /*count=*/num_threads_in_cluster); + threadblock_barrier(0, /*barrier_id=*/threadblock_id, /*count=*/NUM_WARPS); rd_cycles(marker5); // accumulate C matrix if (tile_k == 0) { -#pragma GCC unroll 8 + #pragma GCC ivdep + #pragma GCC unroll 8 for (int thread_i = 0; thread_i < c_elems_per_thread; thread_i++) { uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i; - *(SMEM_ADDR_8K + elem_offset) = smem_c_tile_start[elem_offset]; + smem_acc_tile_start[elem_offset] = smem_c_tile_start[elem_offset]; } } else { -#pragma GCC 
unroll 8 - for (int thread_i = 0; thread_i < c_elems_per_thread; thread_i++) { - uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i; - *(SMEM_ADDR_8K + elem_offset) += smem_c_tile_start[elem_offset]; + #if (TILE_NK / NUM_THREADS / NUM_WARPS / CORES_PER_CLUSTER) != 8 + #error CANNOT UNROLL + #endif + for (int thread_i = 0; thread_i < c_elems_per_thread; thread_i += 8) { + constexpr uint32_t s = num_threads_in_cluster; + smem_acc_tile_start[hw_tid + s * 0] += smem_c_tile_start[hw_tid + s * 0]; + smem_acc_tile_start[hw_tid + s * 1] += smem_c_tile_start[hw_tid + s * 1]; + smem_acc_tile_start[hw_tid + s * 2] += smem_c_tile_start[hw_tid + s * 2]; + smem_acc_tile_start[hw_tid + s * 3] += smem_c_tile_start[hw_tid + s * 3]; + smem_acc_tile_start[hw_tid + s * 4] += smem_c_tile_start[hw_tid + s * 4]; + smem_acc_tile_start[hw_tid + s * 5] += smem_c_tile_start[hw_tid + s * 5]; + smem_acc_tile_start[hw_tid + s * 6] += smem_c_tile_start[hw_tid + s * 6]; + smem_acc_tile_start[hw_tid + s * 7] += smem_c_tile_start[hw_tid + s * 7]; } } rd_cycles(marker6); -#ifdef DEBUG_PRINT + #ifdef DEBUG_PRINT if (hw_tid == 0) { - sprintf(PRINT_BUF, "\nC %d %d %d\n", tile_i, tile_j, tile_k); + PRINTF("\nC %d %d %d\n", tile_i, tile_j, tile_k); for (int i = 0; i < TILE_M; i += 8) { for (int j = 0; j < TILE_N; j += 8) { uint32_t mat_offset = SMEM_MAT_OFFSET(i, j, TILE_N); - sprintf(PRINT_BUF, "%d %d ", + PRINTF("%d %d ", (int) (smem_c_tile_start[mat_offset]), (int) (smem_c_tile_start[mat_offset + 4]) ); } - sprintf(PRINT_BUF, "\n"); + PRINTF("\n"); } } -#endif + #endif } rd_cycles(marker7); // move out to dram - #pragma GCC unroll 8 // TODO: macro computed + + #ifdef HARDCODE + #if (TILE_MN / NUM_THREADS / NUM_WARPS / CORES_PER_CLUSTER) != 8 + #error CANNOT UNROLL + #endif + constexpr uint32_t every_iter = j1_stride; + const uint32_t every_2iters = i1_stride * dim_n; + const uint32_t runtime_const = i0 * dim_n + j1_idx + j0; + dram_c_tile_start[runtime_const + every_iter * 0 + 
every_2iters * 0] = \ + smem_acc_tile_start[0 * num_threads_in_cluster + hw_tid]; + dram_c_tile_start[runtime_const + every_iter * 1 + every_2iters * 0] = \ + smem_acc_tile_start[1 * num_threads_in_cluster + hw_tid]; + dram_c_tile_start[runtime_const + every_iter * 0 + every_2iters * 1] = \ + smem_acc_tile_start[2 * num_threads_in_cluster + hw_tid]; + dram_c_tile_start[runtime_const + every_iter * 1 + every_2iters * 1] = \ + smem_acc_tile_start[3 * num_threads_in_cluster + hw_tid]; + dram_c_tile_start[runtime_const + every_iter * 0 + every_2iters * 2] = \ + smem_acc_tile_start[4 * num_threads_in_cluster + hw_tid]; + dram_c_tile_start[runtime_const + every_iter * 1 + every_2iters * 2] = \ + smem_acc_tile_start[5 * num_threads_in_cluster + hw_tid]; + dram_c_tile_start[runtime_const + every_iter * 0 + every_2iters * 3] = \ + smem_acc_tile_start[6 * num_threads_in_cluster + hw_tid]; + dram_c_tile_start[runtime_const + every_iter * 1 + every_2iters * 3] = \ + smem_acc_tile_start[7 * num_threads_in_cluster + hw_tid]; + #else + #pragma GCC unroll 8 for (int thread_i = 0; thread_i < c_elems_per_thread; thread_i++) { uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i; dram_c_tile_start[elem_offset / TILE_N * dim_n + elem_offset % TILE_N] = \ *(SMEM_ADDR_8K + SMEM_MAT_OFFSET(elem_offset / TILE_N, elem_offset % TILE_N, TILE_N)); } + #endif rd_cycles(marker8); /* if (hw_tid == 0) { sprintf(PRINT_BUF, "\nC %d %d\n", tile_i, tile_j); for (int i = 0; i < TILE_M; i += 8) { for (int j = 0; j < TILE_N; j += 8) { - uint32_t mat_offset = SMEM_MAT_OFFSET(i, j, TILE_N); sprintf(PRINT_BUF, "%d %d ", (int) (C[(tile_i * TILE_M + i) * dim_n + tile_j * TILE_N + j]), (int) (C[(tile_i * TILE_M + i) * dim_n + tile_j * TILE_N + j + 4]) @@ -216,26 +365,42 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, } // last thread block complete if (threadblock_id == NUM_CLUSTERS - 1) { - threadblock_barrier(0, /*barrier_id=*/0, /*count=*/num_threads_in_cluster); + 
threadblock_barrier(0, /*barrier_id=*/0, /*count=*/NUM_WARPS); rd_cycles(marker9); if (hw_tid == 0) { - sprintf(PRINT_BUF, "complete\n"); - sprintf(PRINT_BUF, "total cycles: %d\n", marker9 - marker0); - sprintf(PRINT_BUF, "single tile cycles: %d\n", marker6 - marker1); - sprintf(PRINT_BUF, "A/B tile load cycles: %d\n", marker2 - marker1); - sprintf(PRINT_BUF, "gemmini cycles: %d\n", marker4 - marker3); - sprintf(PRINT_BUF, "first barrier: %d\n", marker3 - marker2); - sprintf(PRINT_BUF, "second barrier: %d\n", marker5 - marker4); - sprintf(PRINT_BUF, "accumulation cycles: %d\n", marker6 - marker5); - sprintf(PRINT_BUF, "dram mvout cycles: %d\n", marker8 - marker7); + PRINTF("\ncomplete\n"); + PRINTF("total cycles: %d\n", marker9 - marker0); + PRINTF("tile start: %d\n", marker1); + PRINTF("single tile cycles: %d\n", marker6 - marker1); + PRINTF("A/B tile load cycles: %d\n", marker2 - marker1); + PRINTF("first barrier: %d\n", marker3 - marker2); + PRINTF("gemmini cycles: %d\n", marker4 - marker3); + PRINTF("second barrier: %d\n", marker5 - marker4); + PRINTF("accumulation cycles: %d\n", marker6 - marker5); + PRINTF("dram mvout cycles: %d\n", marker8 - marker7); } - threadblock_barrier(0, /*barrier_id=*/0, /*count=*/num_threads_in_cluster); + threadblock_barrier(0, /*barrier_id=*/1, /*count=*/NUM_WARPS); if (hw_tid == num_threads_in_cluster - 1) { - sprintf(PRINT_BUF, "single tile cycles: %d\n", marker6 - marker1); - sprintf(PRINT_BUF, "A/B tile load cycles: %d\n", marker2 - marker1); - sprintf(PRINT_BUF, "gemmini cycles: %d\n", marker4 - marker3); - sprintf(PRINT_BUF, "first barrier: %d\n", marker3 - marker2); - sprintf(PRINT_BUF, "second barrier: %d\n", marker5 - marker4); + PRINTF("\ntile start: %d\n", marker1); + PRINTF("single tile cycles: %d\n", marker6 - marker1); + PRINTF("A/B tile load cycles: %d\n", marker2 - marker1); + PRINTF("gemmini cycles: %d\n", marker4 - marker3); + PRINTF("first barrier: %d\n", marker3 - marker2); + PRINTF("second barrier: %d\n", 
marker5 - marker4); + PRINTF("accumulation cycles: %d\n", marker6 - marker5); + PRINTF("dram mvout cycles: %d\n", marker8 - marker7); + } + threadblock_barrier(0, /*barrier_id=*/2, /*count=*/NUM_WARPS); + if (hw_tid == 0) { + for (int i = 0; i < dim_m; i += 8) { + for (int j = 0; j < dim_n; j += 8) { + sprintf(PRINT_BUF, "%d %d ", + (int) (C[i * dim_n + j]), + (int) (C[i * dim_n + j + 4]) + ); + } + PRINTF("\n"); + } } vx_tmc_one(); } @@ -254,7 +419,6 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) { int main() { kernel_arg_t *arg = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR; - sprintf(PRINT_BUF, "m=%d, n=%d\n", arg->dim_m, arg->dim_n); const uint32_t num_threads_in_cluster = vx_num_threads() * vx_num_warps() * CORES_PER_CLUSTER; const uint32_t grid_size = num_threads_in_cluster * NUM_CLUSTERS; From 4e9855dc3323e4df1845bf2fb0576be40ab3f72f Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Tue, 16 Apr 2024 22:19:30 -0700 Subject: [PATCH 67/75] highly unrolled a/b load --- tests/regression/sgemm_gemmini/kernel.cpp | 199 +++++++++++----------- 1 file changed, 95 insertions(+), 104 deletions(-) diff --git a/tests/regression/sgemm_gemmini/kernel.cpp b/tests/regression/sgemm_gemmini/kernel.cpp index dfe15327..7029c511 100644 --- a/tests/regression/sgemm_gemmini/kernel.cpp +++ b/tests/regression/sgemm_gemmini/kernel.cpp @@ -104,99 +104,76 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, for (int tile_k = 0; tile_k < num_tiles_k; tile_k += 1) { // TODO: double buffer - const float * const dram_a_tile_start = A + tile_i * TILE_M * dim_k + tile_k * TILE_K; - const float * const dram_b_tile_start = B + tile_k * TILE_K * dim_n + tile_j * TILE_N; - float * const smem_a_tile_start = SMEM_ADDR_0K; - float * const smem_b_tile_start = SMEM_ADDR_12K; - rd_cycles(marker1); #ifdef HARDCODE #if (TILE_MK / NUM_THREADS / NUM_WARPS / CORES_PER_CLUSTER) != 8 #error CANNOT UNROLL #endif - // preload A matrix - { - constexpr uint32_t every_iter = j1_stride; - 
const uint32_t every_2iters = i1_stride * dim_k; - const uint32_t runtime_const = i0 * dim_k + j1_idx + j0; - smem_a_tile_start[0 * num_threads_in_cluster + hw_tid] = \ - dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 0]; - smem_a_tile_start[1 * num_threads_in_cluster + hw_tid] = \ - dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 0]; - smem_a_tile_start[2 * num_threads_in_cluster + hw_tid] = \ - dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 1]; - smem_a_tile_start[3 * num_threads_in_cluster + hw_tid] = \ - dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 1]; - smem_a_tile_start[4 * num_threads_in_cluster + hw_tid] = \ - dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 2]; - smem_a_tile_start[5 * num_threads_in_cluster + hw_tid] = \ - dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 2]; - smem_a_tile_start[6 * num_threads_in_cluster + hw_tid] = \ - dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 3]; - smem_a_tile_start[7 * num_threads_in_cluster + hw_tid] = \ - dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 3]; - /* const float v0 = dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 0]; - const float v1 = dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 0]; - const float v2 = dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 1]; - const float v3 = dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 1]; - const float v4 = dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 2]; - const float v5 = dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 2]; - const float v6 = dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 3]; - const float v7 = dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 3]; + // preload A B matrix - smem_a_tile_start[0 * num_threads_in_cluster + hw_tid] = v0; - smem_a_tile_start[1 * 
num_threads_in_cluster + hw_tid] = v1; - smem_a_tile_start[2 * num_threads_in_cluster + hw_tid] = v2; - smem_a_tile_start[3 * num_threads_in_cluster + hw_tid] = v3; - smem_a_tile_start[4 * num_threads_in_cluster + hw_tid] = v4; - smem_a_tile_start[5 * num_threads_in_cluster + hw_tid] = v5; - smem_a_tile_start[6 * num_threads_in_cluster + hw_tid] = v6; - smem_a_tile_start[7 * num_threads_in_cluster + hw_tid] = v7; */ - } - #else - #pragma GCC unroll 8 // TODO: macro computed - for (uint32_t thread_i = 0, j1 = 0, i1 = 0; - thread_i < a_elems_per_thread; - thread_i += 1, - j1 = (j1 + j1_stride) % TILE_K, - i1 = (thread_i % i1_iters == 0) ? i1 + i1_stride : i1) { - smem_a_tile_start[thread_i * num_threads_in_cluster + hw_tid] = \ - dram_a_tile_start[(0 + i0) * dim_k + j1 + j1_idx + j0]; - } - // for (int thread_i = 0; thread_i < a_elems_per_thread; thread_i++) { - // uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i; - // smem_a_tile_start[SMEM_MAT_OFFSET(elem_offset / TILE_K, elem_offset % TILE_K, TILE_K)] = \ - // dram_a_tile_start[elem_offset / TILE_K * dim_k + elem_offset % TILE_K]; - // } - #endif - - #ifdef DEBUG_PRINT - if (hw_tid == 0) { - PRINTF("\nA %d %d\n", tile_i, tile_k); - for (int i = 0; i < TILE_M; i += 8) { - for (int j = 0; j < TILE_K; j += 8) { - uint32_t mat_offset = SMEM_MAT_OFFSET(i, j, TILE_K); - PRINTF("%x %x ", - (int) (smem_a_tile_start[mat_offset]), - (int) (smem_a_tile_start[mat_offset + 4]) - ); - } - PRINTF("\n"); - } - } - #endif - - threadblock_barrier(0, /*barrier_id=*/threadblock_id, /*count=*/NUM_WARPS); - - // preload B matrix - #ifdef HARDCODE - #if (TILE_NK / NUM_THREADS / NUM_WARPS / CORES_PER_CLUSTER) != 8 - #error CANNOT UNROLL - #endif constexpr uint32_t every_iter = j1_stride; - const uint32_t every_2iters = i1_stride * dim_n; - const uint32_t runtime_const = i0 * dim_n + j1_idx + j0; + const uint32_t every_2iters_a = i1_stride * dim_k; + const uint32_t runtime_const_a = i0 * dim_k + j1_idx + j0; + 
const uint32_t every_2iters_b = i1_stride * dim_n; + const uint32_t runtime_const_b = i0 * dim_n + j1_idx + j0; + + const float * const dram_a_tile_start = A + tile_i * TILE_M * dim_k + tile_k * TILE_K + runtime_const_a; + const float * const dram_b_tile_start = B + tile_k * TILE_K * dim_n + tile_j * TILE_N + runtime_const_b; + float * const smem_a_tile_start = SMEM_ADDR_0K + hw_tid; + float * const smem_b_tile_start = SMEM_ADDR_12K + hw_tid; + + const float v0 = dram_a_tile_start[every_iter * 0 + every_2iters_a * 0]; + const float w0 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 0]; + const float v1 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 0]; + const float w1 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 0]; + const float v2 = dram_a_tile_start[every_iter * 0 + every_2iters_a * 1]; + const float w2 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 1]; + const float v3 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 1]; + const float w3 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 1]; + const float v4 = dram_a_tile_start[every_iter * 0 + every_2iters_a * 2]; + const float w4 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 2]; + const float v5 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 2]; + const float w5 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 2]; + const float v6 = dram_a_tile_start[every_iter * 0 + every_2iters_a * 3]; + const float w6 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 3]; + const float v7 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 3]; + const float w7 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 3]; + + smem_a_tile_start[0 * num_threads_in_cluster] = v0; + smem_b_tile_start[0 * num_threads_in_cluster] = w0; + smem_a_tile_start[1 * num_threads_in_cluster] = v1; + smem_b_tile_start[1 * num_threads_in_cluster] = w1; + smem_a_tile_start[2 * num_threads_in_cluster] = v2; + smem_b_tile_start[2 * num_threads_in_cluster] = w2; + smem_a_tile_start[3 * 
num_threads_in_cluster] = v3; + smem_b_tile_start[3 * num_threads_in_cluster] = w3; + smem_a_tile_start[4 * num_threads_in_cluster] = v4; + smem_b_tile_start[4 * num_threads_in_cluster] = w4; + smem_a_tile_start[5 * num_threads_in_cluster] = v5; + smem_b_tile_start[5 * num_threads_in_cluster] = w5; + smem_a_tile_start[6 * num_threads_in_cluster] = v6; + smem_b_tile_start[6 * num_threads_in_cluster] = w6; + smem_a_tile_start[7 * num_threads_in_cluster] = v7; + smem_b_tile_start[7 * num_threads_in_cluster] = w7; + + /* smem_a_tile_start[0 * num_threads_in_cluster + hw_tid] = \ + dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 0]; + smem_a_tile_start[1 * num_threads_in_cluster + hw_tid] = \ + dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 0]; + smem_a_tile_start[2 * num_threads_in_cluster + hw_tid] = \ + dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 1]; + smem_a_tile_start[3 * num_threads_in_cluster + hw_tid] = \ + dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 1]; + smem_a_tile_start[4 * num_threads_in_cluster + hw_tid] = \ + dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 2]; + smem_a_tile_start[5 * num_threads_in_cluster + hw_tid] = \ + dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 2]; + smem_a_tile_start[6 * num_threads_in_cluster + hw_tid] = \ + dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 3]; + smem_a_tile_start[7 * num_threads_in_cluster + hw_tid] = \ + dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 3]; + smem_b_tile_start[0 * num_threads_in_cluster + hw_tid] = \ dram_b_tile_start[runtime_const + every_iter * 0 + every_2iters * 0]; smem_b_tile_start[1 * num_threads_in_cluster + hw_tid] = \ @@ -212,25 +189,27 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, smem_b_tile_start[6 * num_threads_in_cluster + hw_tid] = \ dram_b_tile_start[runtime_const + every_iter * 0 + every_2iters * 3]; 
smem_b_tile_start[7 * num_threads_in_cluster + hw_tid] = \ - dram_b_tile_start[runtime_const + every_iter * 1 + every_2iters * 3]; - /* const float v0 = dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 0]; - const float v1 = dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 0]; - const float v2 = dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 1]; - const float v3 = dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 1]; - const float v4 = dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 2]; - const float v5 = dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 2]; - const float v6 = dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 3]; - const float v7 = dram_a_tile_start[runtime_const + every_iter * 1 + every_2iters * 3]; - - smem_a_tile_start[0 * num_threads_in_cluster + hw_tid] = v0; - smem_a_tile_start[1 * num_threads_in_cluster + hw_tid] = v1; - smem_a_tile_start[2 * num_threads_in_cluster + hw_tid] = v2; - smem_a_tile_start[3 * num_threads_in_cluster + hw_tid] = v3; - smem_a_tile_start[4 * num_threads_in_cluster + hw_tid] = v4; - smem_a_tile_start[5 * num_threads_in_cluster + hw_tid] = v5; - smem_a_tile_start[6 * num_threads_in_cluster + hw_tid] = v6; - smem_a_tile_start[7 * num_threads_in_cluster + hw_tid] = v7; */ + dram_b_tile_start[runtime_const + every_iter * 1 + every_2iters * 3]; */ #else + const float * const dram_a_tile_start = A + tile_i * TILE_M * dim_k + tile_k * TILE_K; + const float * const dram_b_tile_start = B + tile_k * TILE_K * dim_n + tile_j * TILE_N; + float * const smem_a_tile_start = SMEM_ADDR_0K; + float * const smem_b_tile_start = SMEM_ADDR_12K; + + #pragma GCC unroll 8 // TODO: macro computed + for (uint32_t thread_i = 0, j1 = 0, i1 = 0; + thread_i < a_elems_per_thread; + thread_i += 1, + j1 = (j1 + j1_stride) % TILE_K, + i1 = (thread_i % i1_iters == 0) ? 
i1 + i1_stride : i1) { + smem_a_tile_start[thread_i * num_threads_in_cluster + hw_tid] = \ + dram_a_tile_start[(0 + i0) * dim_k + j1 + j1_idx + j0]; + } + // for (int thread_i = 0; thread_i < a_elems_per_thread; thread_i++) { + // uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i; + // smem_a_tile_start[SMEM_MAT_OFFSET(elem_offset / TILE_K, elem_offset % TILE_K, TILE_K)] = \ + // dram_a_tile_start[elem_offset / TILE_K * dim_k + elem_offset % TILE_K]; + // } #pragma GCC unroll 8 for (int thread_i = 0; thread_i < b_elems_per_thread; thread_i++) { uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i; @@ -241,6 +220,17 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, #ifdef DEBUG_PRINT if (hw_tid == 0) { + PRINTF("\nA %d %d\n", tile_i, tile_k); + for (int i = 0; i < TILE_M; i += 8) { + for (int j = 0; j < TILE_K; j += 8) { + uint32_t mat_offset = SMEM_MAT_OFFSET(i, j, TILE_K); + PRINTF("%x %x ", + (int) (smem_a_tile_start[mat_offset]), + (int) (smem_a_tile_start[mat_offset + 4]) + ); + } + PRINTF("\n"); + } PRINTF("\nB %d %d\n", tile_k, tile_j); for (int i = 0; i < TILE_K; i += 8) { for (int j = 0; j < TILE_N; j += 8) { @@ -255,6 +245,7 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, } #endif + rd_cycles(marker2); // cluster wide barrier to wait for A and B loads to complete threadblock_barrier(0, /*barrier_id=*/threadblock_id, /*count=*/NUM_WARPS); @@ -384,8 +375,8 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, PRINTF("\ntile start: %d\n", marker1); PRINTF("single tile cycles: %d\n", marker6 - marker1); PRINTF("A/B tile load cycles: %d\n", marker2 - marker1); - PRINTF("gemmini cycles: %d\n", marker4 - marker3); PRINTF("first barrier: %d\n", marker3 - marker2); + PRINTF("gemmini cycles: %d\n", marker4 - marker3); PRINTF("second barrier: %d\n", marker5 - marker4); PRINTF("accumulation cycles: %d\n", marker6 - marker5); PRINTF("dram mvout cycles: %d\n", marker8 - 
marker7); From 6cbfbfb856e7759de035639b4d5dba17e82f0c0f Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 14 Apr 2024 21:25:14 -0700 Subject: [PATCH 68/75] sgemm_wg: Output CPU data to binary --- kernel/src/vx_spawn.c | 1 - tests/regression/sgemm_wg/main.cpp | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c index 9ea45ded..ffbbaccb 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -258,7 +258,6 @@ void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg // threads, handle this in the last wave amongst other full warps. if (rem_threads_in_last_warp != 0 && core_id_in_cluster == 0) { // adjust offset - // FIXME: consider cluster_id here // FIXME: use rem_threads_in_last_warp_this_core wspawn_args.offset += (num_tasks_this_cluster - rem_threads_in_last_warp); diff --git a/tests/regression/sgemm_wg/main.cpp b/tests/regression/sgemm_wg/main.cpp index 93152896..709d804c 100644 --- a/tests/regression/sgemm_wg/main.cpp +++ b/tests/regression/sgemm_wg/main.cpp @@ -119,6 +119,14 @@ int run_test(const kernel_arg_t& kernel_arg, file.write(reinterpret_cast(staging_buf.data()), buf_size); file.close(); + std::ofstream ref_file("reference.c.bin", std::ios::binary | std::ios::out); + if (!ref_file) { + std::cerr << "error: failed to open reference.c.bin for writing\n"; + exit(EXIT_FAILURE); + } + ref_file.write(reinterpret_cast(ref_data.data()), buf_size); + ref_file.close(); + // verify result std::cout << "verify result" << std::endl; { From 689043b45e9edcb8808c2962a08dd3ac681a977a Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 16 Apr 2024 15:22:03 -0700 Subject: [PATCH 69/75] Add regression flops --- tests/regression/flops/.gitignore | 5 + tests/regression/flops/Makefile | 9 ++ tests/regression/flops/common.h | 15 ++ tests/regression/flops/flops | Bin 0 -> 37976 bytes tests/regression/flops/kernel.cpp | 41 +++++ tests/regression/flops/main.cpp | 252 
++++++++++++++++++++++++++++++ 6 files changed, 322 insertions(+) create mode 100644 tests/regression/flops/.gitignore create mode 100644 tests/regression/flops/Makefile create mode 100644 tests/regression/flops/common.h create mode 100755 tests/regression/flops/flops create mode 100644 tests/regression/flops/kernel.cpp create mode 100644 tests/regression/flops/main.cpp diff --git a/tests/regression/flops/.gitignore b/tests/regression/flops/.gitignore new file mode 100644 index 00000000..c791df5d --- /dev/null +++ b/tests/regression/flops/.gitignore @@ -0,0 +1,5 @@ +*.bin +*.dump +*.elf +flops +.depend diff --git a/tests/regression/flops/Makefile b/tests/regression/flops/Makefile new file mode 100644 index 00000000..b5d37285 --- /dev/null +++ b/tests/regression/flops/Makefile @@ -0,0 +1,9 @@ +PROJECT = flops + +SRCS = main.cpp common.h + +VX_SRCS = kernel.cpp + +OPTS ?= -n16 + +include ../common.mk diff --git a/tests/regression/flops/common.h b/tests/regression/flops/common.h new file mode 100644 index 00000000..a609a0b4 --- /dev/null +++ b/tests/regression/flops/common.h @@ -0,0 +1,15 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#include + +#define KERNEL_ARG_DEV_MEM_ADDR 0x7fff0000 +#define DEV_SMEM_START_ADDR 0xff000000 + +typedef struct { + uint32_t size; + uint32_t addr_src; + uint32_t addr_dst; +} kernel_arg_t; + +#endif diff --git a/tests/regression/flops/flops b/tests/regression/flops/flops new file mode 100755 index 0000000000000000000000000000000000000000..dfd6a6c8f97beb7149347256f244d09478eb282a GIT binary patch literal 37976 zcmeHw3v^V~x&NLd1Cd82DuGx;1`SFI!~_UUsF4IRa7HH>NkDwiVVF!vN|K2)6M6iL z1_Rb{n2OfhdaL%2y|!{&+tS;|RjeAS80__dZS|s6q&`nT1cM@4b^gC^KTggZ<|w!8 zuK!y1uAH@IzWsfBf6x8xefBwL=Ik3hz6Dth2UABjyORj|?c zp2Xa281NiU6ZO>spjI6FveIF$q-O&XUm+DTz_SIJDX1(YNPKoj#aW_8L6$FY;&W2b zO9JZ2-O!G`|NHg;ZbLqE?Z&hOjA zbA^qZTW##`v!T~xAY}6QWgB|4jov*rak$^cFNcjiH`&NPVH1aQZR8xYp|7!#|AvkI zGi~Jj%qBk9+t4d)=$F~}^}bDh{?5k!vu)_BY~p{0jr@QOJrD75<6}MM*ywGxiQA(# 
z?Y|Gfney&c7zm8Y2w6{$N`;3e8t7y0jry9*nK)40(f|il(rCO>0xA!xN-- z+#tme?hJL1OVLG%PE{3p~LMwzJ6z$$de`BfN(=Z#sh6~|J zf1SUzBN~dtyzL&(;)Zaes4(~_jw`#GGm3}B2&J;eB&umpRO3?EP%dl`CfG?6K{6bj zt7TtG(|(0D^o+|L8eH3Q6kk}+A!1K#+?k$?CgqR1X%_b z46hTT3p!e3Dq+gkL|S7Z@=b7`TG-Q)j>ZZN3k)lA6~oBLm~eS>TUWG24$Zcx7tUIT zTn4v-jTjx}elch}*0jrje#zM7-p9VA(JbzKJ?zLZ9?JY4A;icNJ^<#6x#biGfNYJK zXqL*ta97M9ZuUn49jikA=JpsITMtE)xu_Bv8EMui$l?__7F|FK{?<0J0q# zdMu1o;)5D9=UUA9?VW=Sbq==08-zib)Gvr1;9qh)1jwUrF*YUX+UWhFDs6{75)Q8LRYl?@hW zl$0A}=)g%e3)gnpxGtn`C;wJ=GKK-8W4NppZKxfc<%hA;BwcZ5vk{zs80bzz)=1E> zc;LriOI3169S&ScPQzM)u3g>u)Stf&cQ8h>y`a(IfV{r|QvC{iI!}BbD~SGPL_j(1<8mN-wr3TnM*D-9I(+V+oGC{X>A?N;OBVR z{*K$ehuYz6vc$W-(s~-jsZ_~3i|{OVk(8(ItElx*XDLaA-Euvo=-WyJeXWIFD%S6aA=( zKGsA(VWN*S(U}|<)F*`|daj8sZw>R>d=vdEruKy<`gjxFZKA7t(o{XgM4w=4UuvSC zW1?4>=o3x!Y7^aUqH8AlBon>fL_gO=UuL48XQHn#(Thy<788B4iQZ|VpKqeCHPJ6H z(YsCbDJJ^OCVH`nzSTshYjbtnX`)Y6M8It(`ZN=LyNNERcwW21M3;{;IQ>Brz04%1 z*F>LTqVF-$=`22U3`Jll0z(lPioj3={x?M6nCqNRw4Qf!wS@B}KV#bF-dI*@uh#QK z?&I9ORQVa-U~C{Y`Ah)UL`LmfsAi!5Z>dyjE5~VJKG6TXfzyI~pnsQv(?Wcp|5pZ1 z3-E#ddkvfx-UI!&88|Jt2l_V}I4$Z1`mZ%`T3`?K2MnAR)&u>Q8#pbf2l{IaoEFjp z{bdGD3+RFVNd`^}=Yjsy4V)It1N{yIr-kxB|NEaSe`$dX{|%fL#_-?3X+aGC4V)Ij z@ZZ2`0Sx~QoEEhi7N+puz-d7W{|%fLqVV6qX#ooV4V)I9@ZZ2` z!3qBjoEDn!-@s{s3I7e87MAeez-d7V{|%fLlJMWaX#olU4V*5R;lF{?LK6NPI4vO2 z|DP)V>B1iUuW*cx&DYTAsAYV706%LlP&lJ3tnKsM_X{G z1^?{I^#1+Of*-cvZ&~nHE%*x-{22?r+k!u0!GCMPe{R8lY{Bob;QwU7lNNll1>a!7 zyDWIS!nMS=iVF2+OrgXkOH}Jx{O%l4{#H@7mUN!Lk0GdD z!+AshJ4P7^snFlYe5A#X=<6U%{{UotE=v6eFxyfCx6Xu?%Y~N3TuW_Hy-;yY4hr8Y z(jfm%B_HQ4{VtLZsppZ@AClC5=;=>kPN(*z`g%gwlW4SR2jYvrqkqWZNBVL=qQ3=b zjQ^UYHA`w5YvP|S(UQ{>n4B84_%UtcyZJcpHED^Q$uxhFgSjA-^=k2hT6}*kq#pTu zD%Eo$68N>Y&xwvMx0a~dw2ZNxbe=0au5U+8 z+^bn;o;|tod4>8GG7H`n87h+H zPvZPXo~{rwo?1#WM%v5Jq>KX}!_09|594l^GR`I$ko5%WYARoi@7qa>p&Hlx-CAOF zA3C?^#3(KPtZUvMv_!!#fyaBb;up2bqpoj1jp69J@fY}3VOl~$lT{TkNPnD^Cab1V zc@=~|L~?FAft!U;;lzRaXf?EiSf20);=zXL&QBVp>qWm0A??>KNprqf$ zZ7O&Z$tHqy6=u<$vxHUWh#=jMsZXCl(M&k+LOD_X61oM^`wkE-yG2iURLXe@^|K&R 
zg{T=zGdaEnxDg)8JYW1h8Cxy+KoPQ?rG;tVM?6e3IB%bRCCM8j3~Jz^JEC`iMqxsH z5T-vPTE6%}U;H?w;!~su&_Mr<-N#d@M8SGUiSI}KwJ7MLoYc0U z&Jgr4WT@!IgrHnv{;m-8>0*?>20^OLGSTKuqK8GBeN;XZZIA=A$n?#rm{Y%>EPnyR z0zwy%hnIuI+4VkA$wXx_Rn7wG(BJ6l(HnoF|6WMpSEBLV{Xa&zxx_Vb6BLW}G`~Jg zR&9V%eH4WxQB`$0=_y#iL3xmBenh4v3uX$^EV8iASqwl2Y+8&78eQK2B1gG%HHa z6S@6|KQKEhudmAvd*jc4LyNzz^}KnszR{O-{tlg^B}dQYpz2PDC1K8wA2CD#BTE9aS74Bn71z{ey#X(Eq+wn_1;`<*NJS+@uc?r z=dm#m@PZVOn>rxIk>aOb>aOZSMzXG{OSGPsj(piUg zj^*Gr^b>#o8dNHKaPNUnl?Z$c*O{=dpM=B^7?fY7a^W#vejeqWnBb_7Jd;4L7i}H_ zoo-n4ABAF`MTx4>P)rG$+rJteWYI70cYKUY@g+td1n|at>x%bycb&-cRrak}9r4y7 zc;&}j`J?}W;aK+6;C#~TI?{VgbA7eM_^p${ z{MB~7ljTj0snja>#(t{pN@Z#Bf^M$y#GxO+E-n5!+-xROil5Zq!^sFHUBNALE(4He z^{?h3tLE32(*7%XtI?MLq}$&R9dA~f2bI&&h_rj-CxkcTpd0!J{W;l}H-s-@{mAXy zkDMVp7YI9_BQ)XM3n1-0bZ_&Z@2-i$-kK|X$*MCKVVNNp#vh;u@4!umCGi({-(!9J zcSPZ69gN)gZVRTbv}N)#KA-6UbT3l#kJJiR4RwhJh95;X)NMZO+O!Bu;hOFZ70k8i zlU!QQJlZuM5`T!c`nsdw!|A)`ed4bgpp^2UOpCtB;M;ko$f-H72t3~;o)39j=QlFx zdL5?)scWf8*9_ttEp@#>Fi~|AbUn(hIp3+@2LdUaK?=tUg|Fg13>1Fs1LG#4@G#0J zId|=#!k=R)qnz7Ce0Lua&NV@Pq5(^(@=p+GoYkN`mpmOW75@T4c3vbD{}MNNptzd^ zj1-DvC?}%10zi-XR0|c-z(PQPeO?|;elSP72Xz%2Ypy8 zXgGZ+0engI0NI-?9|m3_2OV=WxWTE#uPLhV#oLNB{rG#-%6Y`Y2Om}mxEO>a^nPps zW6kj`MN_Z{(cJt@T)za?K=;Gc zj4SBronjb}tSwrGT%rm$mXdlB?qC@M>-96hrSYnYQ= zn_e0Qua4`tqdk^HE$|CVqEFDV=)5YPaJbs_#b4KVaBjM?eey8m5HEfi!+2b2p_#m! 
zUqZ{!l~U-c^@pix()pWODEU*F2pwOQIx7*)=J*{&Q!t#!A>;BJ0dRw6L{F7v^c2dY z_h~JdDSC=Z(NkqaZxrue=LYC9bf^Gwn>Q6LL$GBG3n_+PbheD)vnZ+<9zRRO@MUr+ zafdM16vi8|(n1(7g+i=?S|GgVfD`>P&9!+eB8*_>Qzp|jJOwjIDFviZU-zyI=IxN6 zE=a!zZAkuhFNVYkuF)AlS&YT|r@(rMr9qy|w!uk->VQ&7Ru{2L=NsmMOdRtMxYf~*{dVDu7 zPjC&;2nMbA58AFzvb4&BuA44{%{VWO+K#WBpB;0ZURyNknCqHdDEW$VkGa0F%hl+- z0OR|Z>nbdPFLX{t`MP^|o=)nL<&R)p4P$T8VC?B|)0-HsVR4_Vih*Jo@O->}dI*i; zPiP6^z75>C`ytxBuEsS@41B4qKL)qygn~9u56koo(6Mu*(7l{=$NL~n|DL5`uHHz- z2t8tM&oSoqb_%nY+bNKI`lEX=_Z}3%ZiR_Gh^S*QhMRwn`!Na7Xv*D4@k}}o&LaYGk~Kww34a1ES$Yyr-S7q5%o7#zo0X zR93?I?X4~1_Eyq)j;!K$wkDAd@VIH>TLud?ZWYaG$|MKW{;^-0f+*@qQb#uM>}e}n zrauhtF{@o84bYHw%eWRbH^%ACSJL?})xs*8NA;hOOpF>jvl`b9uFaz{h!C}DxQ3vU zu)2b%@js45;3lY~?mwdIFq~$4tVj=o<58T_;Cyu{uC830R`NSzM^bxTj~?w`0z|AI z)eW4cv1psDa$+()Dmyj#Aj(ro(fL&{>FpI*}bH;5JQGR_b;q+0jNMTSPh z_!uOKJp4XNa*6SE;ud>7F+}vd>eoa@@W=6I=dPA-o}NO!AF;RitimgpY!Du;VP;C zET(>iooC@j3Pzo2dGTf)!bcRt?HA4F8V2`={$L?&JC zDxNP=?o~i2FV)@PQZ0L}o>;?GtR#_j{LTqaZOp-@+FGi89pR%p2n(q@c!C!APEx3m z4i)gPbA%?q`A92V;NA^Z{Q~+rm%g5P2Vb}mBrb>#pD}nrJb*IVk9wO_Roz4R!Y_#b zj>eB+v7XYOC*D`+>p$^@%-t^Xthg2#GYcefar{$Jd9|n z9h?ui$i!UoeN_Kk1VgqnUoUE^exgnN=Hsz7+CvmH920A<5Z4J_+(74d&g0LDE1oN` zyj_^D#b3~-UZ@rC_2R^ZD>m=0&$E2R$F<7G*UTyF?aR3lV-MGZy$(>iaEBx32VR_J zbLm>fG@+^aOxdYb?swgI2RMC|$LiuA`dsr5)j2-&RzBl%E$sL1dMk?tUlghu=gd;9g8sDbiD)2+VwWgCfc#P7%V? z4I+l#_-8eZGKMdT7+xl0_>5M(hllY!3S6-kf0@>b&*`tc0mY8Tz47JNhzWmrobD!F zm4~#ruDRyp{*sPmTx2;GsB4#XUFj)1?kRf{0=>zLreT403nbPhHZX5uNv^N>6AH>D zb(>RN^_t^QT|C6Rj#uj9UHK?lHPX6B^^Z6HA}y##BLY|^cpXn3x<}5FpVL}kGJ1yM z_6U?6`U&F2Fr4%wuV;)GzaCv(iPOSTh^<-Tjelr7u4w|f!B_klMN5BeKZ3FAv+UT! 
zvcJl__QeNqHxv(RYEolxt^YWB{nf66mgF*B15^Jzj&SjQqTg8{4GAAQh9WQ&fuRTt zMPMibLlGE?z)%E+A}|zzp$H5`U?>9rKO^8^S!^kPzl*yu6pfYOpZ9owiyJRjX$|t1 zu5f&cz>ArO-hwj6eZl1D1@1^F)`j=8G`S}?O&Y=IwJh%DaKs%9w|BOMVy)p0))ZdT zK`(NFxM&Rj%_pK8?+j@UMc6cV6#vw(V(yjeVxi~=hBp}Cg%c$!@gMz22n|9V!4Su; z3q@L+*Fjsft1ZU(YgpWYn8e-Zu{rKQFxC}ltAfbT+Rjig7HXXAq?W2>} zf*P;SQ#*-u%vn9BWiDPD5bBUl!r+b$aFN0CwK7yY<$UF{FqoS=2NJoxOuV7S-5Ck5 ziUitOppD)i6ALMscmY8;vW`K7KhV?^@kb-Us?xQCcvCb+xWR@fSHvOp865hlBPf`Fr|2!RP2X)B|KV)&NK>zpHhUR;VFeoB?^*A1^0Y+ zdmt8RUF#+tGVs$RbfW{@Br6n+RuMnXgE?+`xdbvc7IyOt7)&~zy7UqgD4^#VncJsj zrj>=%uT3UlG?$)S#^7^h~_r+E2SW9bk`o-ks zlwwOYm&W6G^2?BnV47Z#B*J9GUA$b$Xn3h-@gk4UU$c0jzt*$VU+1aw*VNW7#vl}l zm)0F*Pq=xC6pmi;$BLzRnfKHSDl28qQcT5&vF!tLDRW#^VxJsRl}8qT25l1=gYjkK zOC(6X8JQ%4N%lv>U6CLLEOPJ^Y|m`BX&hQiKSjHFrf2E{RHWIB!R)8OZ0Z%n+0-a) z6p3+T%EhKoa{$w)KhTM(t%>K>dfYw6&vR2L`44AKWXZ$7-S*LwJ1%yN&;I%YAmNz< z%l%_2bpUY6GpSTAo~Kj*x&gJlsniE}P;}<=snn-<5Lf}Y3Gn89snnPp#@4=+O6>t( z%gd=$4DBBT+yb}{un>HA{xy}F1*pA}N_7I#bCwF!PXVOoHX7gxz!t!5fZOp1V-)1? z!(+cv$k*^xa4jGmWG5Yt^^2KfZN6jtsNuO=!9#TX)`V?`O&3yw+Tr;Gitw3*kNYY3 z0+}o?e?ea1C9aWca=Y2wugv-C%%XEa<^0zG7CxOyaYiEIAh|Jo@`+CT@}M?b@Y#nO z`1q{`KZv#spTB^HpBeCjXurYdfQd#jDPHfw)_Oozo0p%pFfW(d9>Avv@l^bz`vZJ> z!CwYH*^<2cTeIfn6@DjsUY;3d3m|s$De%qnP;EV@~V4=-!yECGnw<9>|3+4){n^4xOOCew?Z5AJqR0r54B8# zjmt;mmy%B!`E+^Sw$rjV56@`nCG?wvY534#(pfyH6WnH((T?Ij3m+fy`#%74 zE`SXuWO_wQyusAX8X98T$aCg(?~k55S%UNT!^h z0YOZ@L;akje%_K(n>Xbqr;d~?h zVC@-plXFW>GW$DOJPxI3>!NWEm_vgQ2Y$c7=yx`4e@Ug@AOiLMi+QYo#p*w*~7zJ|2&Qz8SYGuV?WAu{&*bwd2Yq?W7*B8y_WsRIQF~I&W+<7yYif` zj&=Om^?$QK>N$g+;s5N+3s8RhOefkNKhp_GJ!38f<()B3$opW72jzzfV8`PHP844& zz*D$X!2}e4GnQJ2xTNkn-ErDtwA${#m!zc}#fyK0PP;94H2$4(AopR=|82zRU*xfu zNBjUz+>+-+`Tjg7YEvmk&PY748ajp|Fcg8I2ndA91co9o6oH`#3`Jll z0{_D!px(=?-ovZjyZb-ft^alAi@O)tW;P&mEQnT}BcFydqi$}XoEwV!H&v$ zNoE>u#n3_b!swuTS#((Mx=1!wdcCL@cK~s(+H#*+ieuwMGq#Ox9^z2+Ui>|P4rTWN z8K~0)+0c)FXYoU|!yhZSe`wB0_l~8T|l7FtS!!2Q{gw+z(OSnS9P6@ju+$!NV33o`? 
zE8$)V4@h`aLUx|?UqUyeG$AXH?eXR~D)={bz{+p!+>UVSw$nCr4J zE2#M(oj#nYc_W>k%hY&Er=P~uI83LHU~0Ul(?=TfLOLC@q}xmpac?6)v?9fo9_g!Jdrt0!HtIvL4xN zEK~Enxh@;Cr<&)}>4nDrN9pvlSf=?W8)pqQU#GXn*`?b|5#?;tK1$}=Y&OA+^0EWp zIqdzEwSvwoSvYHT(}NNmDtvT4Bf5$|Jx8MBVtj}`^#YC>-LFZyS{J<`s#y@w0XwJl z2m;wbM+Bct_G|#1i@iZrI8Bc?>7mELw%Ql_dgxmCHXle`azyV^7LM z`V%(GkhfQGIVa~^JLniiCmr9mp>G5IT=YvdB?b<)E_w*`O#6SmYa?fBR%X3(LC1YC->{(&4hxRYa85!`E~nVj zoPIXTmwB$%WiQysIR^TOwlU`lMQ_24i(hK|5J|u3Tp5W4Sf;f zO!FtR*gqHZa6aUh%JHK7x|;Wk!Ukd3hMu&cKLa}DdGA~yV-oby@sUmYnK<83KI|wH z?d}zT)qzfSTH{YI=gbthEuho5Q~UgMOF2*2wD0G1%zM?MVMw6&b3<9a*{1zAPRDt+ zMl?iv^J53-WKXv||CY&qf7wP(7EVB!?3`pnF9F>xb(ukx4gCtxsb98j6s^@b>adZ| zXm_=eU}q<`q4SlMMq99RW*7aon6^x;P`_SWd}-N?!JlsWW8eH{Nif{jB$=t?U+BAR zUX2eM!ga0md((CRrJe446@F~GNt^GnU?die;kV=^K{mJ}sXx|En}T+PqS*Tl`+%-$ z3$F~c`J3>Y+^9d$wU)_Wr-zzKDoST#le$ceu>W93D}Id}39Q5Jb}{^pyBWXn4*8q9 z+S}KG#f0%q5Yw9a{RCZs{I|)*~JEagLczadHxXn4)=>wCEbS>_|Rhb zVBM56U1R#Do#eKq-g_DL2yDX6palO?v@~%?Z_>gX#Y8y$1Q# zjQ`Mvt`_r<__q7erZ>gUj!*u%xct38{k|SCoH27%s3R1?pB1p7uKc3~ceh<`rfBhm z@4A>y!>Ead{VmwJQEbec_Dg@L_?rBq0ge4r-!%3lPfYv$VuMTk|DpDZ5|~_qorW`Q zmYR7B+%%~rl{|@z{Y29$F^%v)z5EyUjZNq0X{N@Q%1_$#6TKpabNa8sX`4#yVJU@R z1Y`e5%XX{gy^PJ9VGi=}(L>t>cHC?X% zD(ul}C7q&Uv|DN>)>GO1*JgWDi#W>;rLvZ9|A}6uKl;!=N-#BJQ*1T)|95W6n`Vy4 zWAmohC+*9d-X43Jsx5-WuGYqm#%b+%Bfgugm5jk<+T-`++AlkUeA$M%7KuFjib5gu?lBS-m}=!NNlMajCS$Wic)qeA+QKK)k*8* z$eI2y!;Lm?eXjN^Pg92RBDVyVWR*tz1aItMY&pH3e3nUXKn|$sUM;DNJ)Xa0MTNx6 zv_pKF)xtO08{R_pHc~1^u-ADs7K&iPp%rgqU?tAR>ANn2PwsatcS5Faqt&hGYm4z; zl;WB8=C)X6-D_D^%TUWnl5aDQS#ni-xI<((t&+*1%xQDWF;NpSg2%GM^_2{$cf$8W9v+$m0hiEP19SOBnewUPsg9- z7;oMZh_DkpWz)xT+8`RGH+E84$@$ z4z#yImynM6hT&4Io&U|$e0}(jmgzYNUBYHb0qVJ*g6g@xk}ofJ70MrT;?|78ubvMo z=r&r40&b6_9kYR%`Iq57Djfw_ zm5_W{Q1E`>W`6bjS3%Xzj(>-RUp*&QF#k*RKV;!o&odQtS^{9T=P?VvdhV>C`dyH+ z-x|N&l3(?|@qo2na1 +#include +#include +#include "common.h" + +void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) { + const float *A = (const float *)arg->addr_src; + float *C = (float *)arg->addr_dst; 
+ + int incr = A[task_id]; + float sum = 0.0f; + float sum1 = 0.0f; + float sum2 = 0.0f; + float sum3 = 0.0f; + float sum4 = 0.0f; + float sum5 = 0.0f; +#pragma unroll 8 + for (int i = 0; i < 5000; i++) { + sum1 = sum2 + 5.0f; + sum2 = sum3 + 5.0f; + sum3 = sum4 + 5.0f; + sum4 = sum5 + 5.0f; + sum5 = sum1 + 5.0f; + } + + sum = sum1 + sum2 + sum3 + sum4 + sum5; + C[task_id] = static_cast(sum); +} + +int main() { + kernel_arg_t *arg = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR; + const uint32_t grid_size = arg->size; +#ifdef RADIANCE + vx_spawn_tasks_cluster(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); +#else + // NOTE: This kernel assumes contiguous thread scheduling for efficient shared + // memory allocation, and therefore does not work with original vx_spawn_tasks + vx_spawn_tasks_contiguous(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); +#endif + return 0; +} diff --git a/tests/regression/flops/main.cpp b/tests/regression/flops/main.cpp new file mode 100644 index 00000000..72aa56ba --- /dev/null +++ b/tests/regression/flops/main.cpp @@ -0,0 +1,252 @@ +#include +#include +#include +#include +#include +#include +#include "common.h" + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +const char* kernel_file = "kernel.bin"; +uint32_t count = 0; + +std::vector src_data; +std::vector ref_data; + +vx_device_h device = nullptr; +std::vector staging_buf; +kernel_arg_t kernel_arg = {}; + +static void show_usage() { + std::cout << "Vortex Test." 
<< std::endl; + std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl; +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:k:h?")) != -1) { + switch (c) { + case 'n': + count = atoi(optarg); + break; + case 'k': + kernel_file = optarg; + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } +} + +void cleanup() { + if (device) { + // vx_mem_free(device, kernel_arg.addr_a); + // vx_mem_free(device, kernel_arg.addr_b); + // vx_mem_free(device, kernel_arg.addr_c); + vx_dev_close(device); + } +} + +void generate_source_data(size_t size) { + src_data.resize(size); + + for (uint32_t i = 0; i < src_data.size(); ++i) { + src_data[i] = static_cast(i); + } +} + +void generate_reference_data(size_t size) { + ref_data.resize(size); + + for (uint32_t i = 0; i < ref_data.size(); ++i) { + ref_data[i] = static_cast(i) * 1000.0f; + } +} + +int run_test(const kernel_arg_t& kernel_arg, + uint32_t buf_size, + uint32_t size) { + // start device + std::cout << "start device" << std::endl; + RT_CHECK(vx_start(device)); + + // wait for completion + std::cout << "wait for completion" << std::endl; + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + // download destination buffer + std::cout << "download destination buffer" << std::endl; + RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_dst, buf_size)); + + std::cout << "downloading result C matrix from device, device mem address=" + << std::hex << kernel_arg.addr_dst << ", size=" << std::dec + << buf_size << " bytes\n"; + std::ofstream file("output.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open output.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(staging_buf.data()), buf_size); + file.close(); + + std::ofstream ref_file("reference.bin", std::ios::binary | std::ios::out); + if (!ref_file) { + std::cerr << "error: 
failed to open reference.bin for writing\n"; + exit(EXIT_FAILURE); + } + ref_file.write(reinterpret_cast(ref_data.data()), buf_size); + ref_file.close(); + + // verify result + std::cout << "verify result" << std::endl; + { + int errors = 0; + auto buf_ptr = (float*)staging_buf.data(); + for (uint32_t i = 0; i < size; ++i) { + float ref = ref_data.at(i); + float cur = buf_ptr[i]; + if (std::abs((cur - ref) / ref) > 1e-6) { + std::cout << "error at result #" << std::dec << i + << std::hex << ": actual=" << cur << ", expected=" << ref << std::endl; + ++errors; + } + } + if (errors != 0) { + std::cout << "Found " << std::dec << errors << " errors!" << std::endl; + std::cout << "FAILED!" << std::endl; + return 1; + } + } + + return 0; +} + +int main(int argc, char *argv[]) { + // parse command arguments + parse_args(argc, argv); + + if (count == 0) { + count = 1; + } + + std::srand(50); + + // open device connection + std::cout << "open device connection" << std::endl; + RT_CHECK(vx_dev_open(&device)); + + size_t size = 64; + + generate_source_data(size); + generate_reference_data(size); + + uint32_t src_buf_size = src_data.size() * sizeof(src_data[0]); + uint32_t dst_buf_size = ref_data.size() * sizeof(ref_data[0]); + + std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl; + + // upload program + std::cout << "upload program" << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file)); + + // allocate device memory + std::cout << "allocate device memory" << std::endl; + // RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_src)); + // RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_dst)); + kernel_arg.addr_src = 0x20000UL; + kernel_arg.addr_dst = 0xc0000000UL; + kernel_arg.size = size; + + std::cout << "dev_addr_src=0x" << std::hex << kernel_arg.addr_src << std::endl; + std::cout << "dev_addr_dst=0x" << std::hex << kernel_arg.addr_dst << std::endl; + + // allocate staging buffer + 
{ + std::cout << "allocate staging buffer" << std::endl; + uint32_t staging_buf_size = std::max( + src_buf_size, + std::max( + src_buf_size, + std::max(dst_buf_size, sizeof(kernel_arg_t)))); + staging_buf.resize(staging_buf_size); + } + + // upload kernel argument + { + std::cout << "upload kernel argument" << std::endl; + auto buf_ptr = staging_buf.data(); + memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); + RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); + + std::cout << "uploading argument buffer to device, device mem address=" + << std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec + << sizeof(kernel_arg_t) << " bytes\n"; + std::ofstream file("args.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open args.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(staging_buf.data()), + sizeof(kernel_arg_t)); + file.close(); + } + + // upload source buffer + { + { + auto buf_ptr = staging_buf.data(); + memcpy(buf_ptr, src_data.data(), src_data.size() * sizeof(float)); + RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_src, staging_buf.data(), + src_buf_size)); + + std::cout << "uploading source data to device, device mem address=" + << std::hex << kernel_arg.addr_src << ", size=" << std::dec + << src_buf_size << " bytes\n"; + std::ofstream file("input.a.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open input.a.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(buf_ptr), src_buf_size); + file.close(); + } + } + + // clear destination buffer + { + std::cout << "clear destination buffer" << std::endl; + auto buf_ptr = (int32_t*)staging_buf.data(); + for (uint32_t i = 0; i < ref_data.size(); ++i) { + buf_ptr[i] = 0xdeadbeef; + } + RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_dst, staging_buf.data(), dst_buf_size)); + } + + // run tests + std::cout << "run tests" << 
std::endl; + RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.size)); + std::cout << "PASSED!" << std::endl; + + // cleanup + std::cout << "cleanup" << std::endl; + cleanup(); + + return 0; +} From 793779aa6cd0fe0e316ff455e3a5dbee4635a7a7 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 24 Apr 2024 21:08:31 -0700 Subject: [PATCH 70/75] sgemm_wg: 128x128 config --- tests/regression/sgemm_wg/kernel.cpp | 14 +++++++------- tests/regression/sgemm_wg/main.cpp | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index e9f898a0..86b7309d 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -16,11 +16,11 @@ // (BM*BN) / (TM*TN) == threadblock size >= NT * CORES_PER_CLUSTER // * Combining BM * BK >= (BM*BN) / (TM*TN) == threadblock yields // BM <= BK*TM*TN -#define BM 8 +#define BM 32 #define BN BM -#define BK 2 -#define TM 2 -#define TN 2 +#define BK 8 +#define TM 4 +#define TN 4 void threadblock_barrier(unsigned int tid_in_threadblock, unsigned int barrier_id, unsigned int count) { vx_fence(); @@ -80,14 +80,14 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, // // Make sure global offset values for A and B are contiguous between // neighboring threads to ensure GMEM coalescing. 
-// #pragma GCC unroll 1 +#pragma GCC unroll 2 for (uint32_t load_offset = 0; load_offset < BM; load_offset += stride_a) { const uint32_t global_a_offset = dim_k * (global_a_row + load_offset) + (k + local_a_col); local_a[BK * (local_a_row + load_offset) + local_a_col] = A[global_a_offset]; } -// #pragma GCC unroll 1 +#pragma GCC unroll 2 for (uint32_t load_offset = 0; load_offset < BK; load_offset += stride_b) { const uint32_t global_b_offset = dim_n * (k + local_b_row + load_offset) + global_b_col; @@ -99,7 +99,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, threadblock_dim_y); // Compute single tile*tile matmul -// #pragma GCC unroll 2 +#pragma GCC unroll 4 for (uint32_t local_k = 0; local_k < BK; local_k++) { // First, pump data from SMEM->RF #pragma GCC unroll TM diff --git a/tests/regression/sgemm_wg/main.cpp b/tests/regression/sgemm_wg/main.cpp index 709d804c..62625c44 100644 --- a/tests/regression/sgemm_wg/main.cpp +++ b/tests/regression/sgemm_wg/main.cpp @@ -166,9 +166,9 @@ int main(int argc, char *argv[]) { RT_CHECK(vx_dev_open(&device)); // FIXME: hardcoded - uint32_t dim_m = 32; - uint32_t dim_n = 32; - uint32_t dim_k = 32; + uint32_t dim_m = 128; + uint32_t dim_n = 128; + uint32_t dim_k = 128; generate_source_matrix(dim_m, dim_n, dim_k); generate_reference_matmul(dim_m, dim_n, dim_k); From df881fd69f08028b1e2fdd37267145a0033c35d0 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 24 Apr 2024 21:09:01 -0700 Subject: [PATCH 71/75] Generate separate ELF for radiance --- tests/regression/common.mk | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/regression/common.mk b/tests/regression/common.mk index 81df3139..e90b3635 100644 --- a/tests/regression/common.mk +++ b/tests/regression/common.mk @@ -79,7 +79,7 @@ endif endif endif -all: $(PROJECT) kernel.bin kernel.dump kernel.radiance.dump +all: $(PROJECT) kernel.bin kernel.dump kernel.radiance.dump kernel.radiance.$(CONFIG).dump kernel.dump: kernel.elf 
$(VX_DP) -D kernel.elf > kernel.dump @@ -87,15 +87,21 @@ kernel.dump: kernel.elf kernel.radiance.dump: kernel.radiance.elf $(VX_DP) -D kernel.radiance.elf > kernel.radiance.dump +kernel.radiance.$(CONFIG).dump: kernel.radiance.$(CONFIG).elf + $(VX_DP) -D kernel.radiance.$(CONFIG).elf > kernel.radiance.$(CONFIG).dump + kernel.bin: kernel.elf kernel.radiance.elf $(VX_CP) -O binary kernel.elf kernel.bin kernel.elf: $(VX_SRCS) $(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf -kernel.radiance.elf: $(VX_SRCS) +kernel.radiance.elf: kernel.elf $(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -DRADIANCE -o kernel.radiance.elf +kernel.radiance.$(CONFIG).elf: kernel.radiance.elf + cp $< $@ + $(PROJECT): $(SRCS) $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ @@ -122,7 +128,7 @@ clean: rm -rf $(PROJECT) *.o .depend clean-all: clean - rm -rf kernel.elf kernel.radiance.elf *.dump + rm -rf kernel.elf kernel.dump ifneq ($(MAKECMDGOALS),clean) -include .depend From 6eafa2de547bcc012620b151e5f8549cc5c20fec Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Wed, 24 Apr 2024 22:09:30 -0700 Subject: [PATCH 72/75] write operands to elf --- kernel/linker/vx_link32.ld | 18 ++++++++++++++++++ tests/regression/common.mk | 6 ++++++ 2 files changed, 24 insertions(+) diff --git a/kernel/linker/vx_link32.ld b/kernel/linker/vx_link32.ld index d8a50026..ea5c4e56 100644 --- a/kernel/linker/vx_link32.ld +++ b/kernel/linker/vx_link32.ld @@ -7,6 +7,13 @@ OUTPUT_FORMAT("elf32-littleriscv", "elf32-littleriscv", "elf32-littleriscv") OUTPUT_ARCH(riscv) ENTRY(_start) + +MEMORY { + DRAM0 (rwx): ORIGIN = 0x80000000, LENGTH = 512M + DRAM1 (rwx): ORIGIN = 0xa0000000, LENGTH = 32K + DRAM2 (rwx): ORIGIN = 0xa1000000, LENGTH = 32K +} + SECTIONS { . = STARTUP_ADDR; @@ -85,6 +92,7 @@ SECTIONS /* Adjust the address for the data segment. We want to adjust up to the same address within the page on the next page up. */ . 
= DATA_SEGMENT_ALIGN (CONSTANT (MAXPAGESIZE), CONSTANT (COMMONPAGESIZE)); + /* Exception handling */ .eh_frame : ONLY_IF_RW { KEEP (*(.eh_frame)) *(.eh_frame.*) } .gnu_extab : ONLY_IF_RW { *(.gnu_extab) } @@ -166,6 +174,7 @@ SECTIONS *(.data .data.* .gnu.linkonce.d.*) SORT(CONSTRUCTORS) } + .data1 : { *(.data1) } .got : { *(.got.plt) *(.igot.plt) *(.got) *(.igot) } /* We want the small data sections together, so single-instruction offsets @@ -200,6 +209,7 @@ SECTIONS } . = ALIGN(32 / 8); . = SEGMENT_START("ldata-segment", .); + . = ALIGN(32 / 8); __BSS_END__ = .; __global_pointer = MIN(__SDATA_BEGIN__ + 0x800, @@ -249,4 +259,12 @@ SECTIONS .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } /DISCARD/ : { *(.note.GNU-stack) *(.gnu_debuglink) *(.gnu.lto_*) } + .operand.a : { + *(.operand.a) + . += 32K; + }> DRAM1 + .operand.b : { + *(.operand.b) + . += 32K; + }> DRAM2 } diff --git a/tests/regression/common.mk b/tests/regression/common.mk index 81df3139..f279b79d 100644 --- a/tests/regression/common.mk +++ b/tests/regression/common.mk @@ -93,8 +93,14 @@ kernel.bin: kernel.elf kernel.radiance.elf kernel.elf: $(VX_SRCS) $(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf +OBJCOPY ?= "riscv32-unknown-elf-objcopy" +OBJCOPY_FLAGS ?= "LOAD,ALLOC,DATA,CONTENTS" kernel.radiance.elf: $(VX_SRCS) $(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -DRADIANCE -o kernel.radiance.elf + $(OBJCOPY) --set-section-flags .operand.a=$(OBJCOPY_FLAGS) kernel.radiance.elf + $(OBJCOPY) --set-section-flags .operand.b=$(OBJCOPY_FLAGS) kernel.radiance.elf + $(OBJCOPY) --update-section .operand.a=input.a.bin kernel.radiance.elf + $(OBJCOPY) --update-section .operand.b=input.b.bin kernel.radiance.elf $(PROJECT): $(SRCS) $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ From d21e7b92c7bf536f4b94890aef0e97771e34b2fd Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Thu, 25 Apr 2024 15:28:12 -0700 Subject: [PATCH 73/75] internal accumulation, forced rematerialization, better unrolling --- 
tests/regression/sgemm_gemmini/kernel.cpp | 273 +++++++++++++--------- 1 file changed, 163 insertions(+), 110 deletions(-) diff --git a/tests/regression/sgemm_gemmini/kernel.cpp b/tests/regression/sgemm_gemmini/kernel.cpp index 7029c511..243a2a24 100644 --- a/tests/regression/sgemm_gemmini/kernel.cpp +++ b/tests/regression/sgemm_gemmini/kernel.cpp @@ -16,6 +16,8 @@ #define TILE_MK 1024 #define TILE_NK 1024 +//#define EXT_ACCUMULATE + #define NUM_CLUSTERS 1 #define TB_M (MATRIX_M / NUM_CLUSTERS) #define TB_N MATRIX_N @@ -41,6 +43,8 @@ // #define DEBUG_PRINT #define rd_cycles(x) asm volatile ("csrr %0, mcycle" : "=r" (x)) +#define HW_TID() ({uint32_t gtid; asm ("csrr %0, mhartid" : "=r" (gtid)); gtid;}) + void threadblock_barrier(unsigned int tid_in_threadblock, unsigned int barrier_id, unsigned int count) { vx_fence(); vx_barrier(barrier_id, count); @@ -49,6 +53,7 @@ void threadblock_barrier(unsigned int tid_in_threadblock, unsigned int barrier_i void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, const uint32_t threadblock_id, const uint32_t tid_in_threadblock) { + __asm__("matmul_start:"); const float * const A = (const float * const) arg->addr_a; const float * const B = (const float * const) arg->addr_b; float * const C = (float * const) arg->addr_c; @@ -65,15 +70,13 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, constexpr uint32_t b_elems_per_thread = TILE_NK / num_threads_in_cluster; constexpr uint32_t c_elems_per_thread = TILE_MN / num_threads_in_cluster; const uint32_t hw_tid = tid_in_threadblock % num_threads_in_cluster; - const uint32_t thread_load_offset = hw_tid; - constexpr uint32_t thread_load_stride = num_threads_in_cluster; // the dram coordinates are (i1 + i0, j1 + j0). i0 and j0 are both spatially mapped only. - const uint32_t j0 = hw_tid % DIM; - const uint32_t i0 = (hw_tid / DIM) % DIM; + const uint32_t j0 = HW_TID() % DIM; + const uint32_t i0 = (HW_TID() / DIM) % DIM; // j1 is both spatially and temporally mapped. 
j1 increases every iteration. - const uint32_t j1_idx = (hw_tid / DIM / DIM) * DIM; // A: % TILE_K, B: % TILE_N, C: % TILE_N + const uint32_t j1_idx = (HW_TID() / DIM / DIM) * DIM; // A: % TILE_K, B: % TILE_N, C: % TILE_N // every iteratioon, j1 increases by j1_stride constexpr uint32_t j1_stride = (num_threads_in_cluster / DIM / DIM) * DIM; // mod TILE_W after stride @@ -84,7 +87,7 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, uint32_t marker0, marker1, marker2, marker3, marker4; uint32_t marker5, marker6, marker7, marker8, marker9; - if (hw_tid == 0) { + if (HW_TID() == 0) { gemmini_config_ld(0); gemmini_extended_config_ex(WEIGHT_STATIONARY, 0, 0, 1, 0, 0); gemmini_config_st(0); @@ -94,14 +97,20 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, // TODO: check for tb id rd_cycles(marker0); + __asm__("i_loop:"); for (int tile_i = NUM_TILE_ROWS_PER_TB * threadblock_id; tile_i < NUM_TILE_ROWS_PER_TB * (threadblock_id + 1); tile_i += 1) { + __asm__("j_loop:"); for (int tile_j = 0; tile_j < num_tiles_n; tile_j += 1) { float * const smem_c_tile_start = SMEM_ADDR_4K; - float * const smem_acc_tile_start = SMEM_ADDR_8K; - float * const dram_c_tile_start = C + tile_i * TILE_M * dim_n + tile_j * TILE_N; + #ifndef EXT_ACCUMULATE + float * const smem_acc_tile_start = SMEM_ADDR_0K + HW_TID(); + #else + float * const smem_acc_tile_start = SMEM_ADDR_8K + hw_tid; + #endif + __asm__("k_loop:"); for (int tile_k = 0; tile_k < num_tiles_k; tile_k += 1) { // TODO: double buffer rd_cycles(marker1); @@ -120,43 +129,51 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, const float * const dram_a_tile_start = A + tile_i * TILE_M * dim_k + tile_k * TILE_K + runtime_const_a; const float * const dram_b_tile_start = B + tile_k * TILE_K * dim_n + tile_j * TILE_N + runtime_const_b; - float * const smem_a_tile_start = SMEM_ADDR_0K + hw_tid; - float * const smem_b_tile_start = SMEM_ADDR_12K + hw_tid; - - const float v0 = 
dram_a_tile_start[every_iter * 0 + every_2iters_a * 0]; - const float w0 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 0]; - const float v1 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 0]; - const float w1 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 0]; - const float v2 = dram_a_tile_start[every_iter * 0 + every_2iters_a * 1]; - const float w2 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 1]; - const float v3 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 1]; - const float w3 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 1]; - const float v4 = dram_a_tile_start[every_iter * 0 + every_2iters_a * 2]; - const float w4 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 2]; - const float v5 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 2]; - const float w5 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 2]; - const float v6 = dram_a_tile_start[every_iter * 0 + every_2iters_a * 3]; - const float w6 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 3]; - const float v7 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 3]; - const float w7 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 3]; + float * const smem_a_tile_start = SMEM_ADDR_0K + HW_TID(); + float * const smem_b_tile_start = SMEM_ADDR_12K + HW_TID(); + __asm__("load_ab:"); + float v0 = dram_a_tile_start[every_iter * 0 + every_2iters_a * 0]; + float v1 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 0]; + float v2 = dram_a_tile_start[every_iter * 0 + every_2iters_a * 1]; + float v3 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 1]; smem_a_tile_start[0 * num_threads_in_cluster] = v0; - smem_b_tile_start[0 * num_threads_in_cluster] = w0; smem_a_tile_start[1 * num_threads_in_cluster] = v1; - smem_b_tile_start[1 * num_threads_in_cluster] = w1; smem_a_tile_start[2 * num_threads_in_cluster] = v2; - smem_b_tile_start[2 * num_threads_in_cluster] = w2; smem_a_tile_start[3 * num_threads_in_cluster] = v3; - smem_b_tile_start[3 * 
num_threads_in_cluster] = w3; - smem_a_tile_start[4 * num_threads_in_cluster] = v4; - smem_b_tile_start[4 * num_threads_in_cluster] = w4; - smem_a_tile_start[5 * num_threads_in_cluster] = v5; - smem_b_tile_start[5 * num_threads_in_cluster] = w5; - smem_a_tile_start[6 * num_threads_in_cluster] = v6; - smem_b_tile_start[6 * num_threads_in_cluster] = w6; - smem_a_tile_start[7 * num_threads_in_cluster] = v7; - smem_b_tile_start[7 * num_threads_in_cluster] = w7; + __asm__("load_ab1:"); + v0 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 0]; + v1 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 0]; + v2 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 1]; + v3 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 1]; + smem_b_tile_start[0 * num_threads_in_cluster] = v0; + smem_b_tile_start[1 * num_threads_in_cluster] = v1; + smem_b_tile_start[2 * num_threads_in_cluster] = v2; + smem_b_tile_start[3 * num_threads_in_cluster] = v3; + + __asm__("load_ab2:"); + v0 = dram_a_tile_start[every_iter * 0 + every_2iters_a * 2]; + v1 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 2]; + v2 = dram_a_tile_start[every_iter * 0 + every_2iters_a * 3]; + v3 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 3]; + smem_a_tile_start[4 * num_threads_in_cluster] = v0; + smem_a_tile_start[5 * num_threads_in_cluster] = v1; + smem_a_tile_start[6 * num_threads_in_cluster] = v2; + smem_a_tile_start[7 * num_threads_in_cluster] = v3; + + __asm__("load_ab3:"); + v0 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 2]; + v1 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 2]; + v2 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 3]; + v3 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 3]; + smem_b_tile_start[4 * num_threads_in_cluster] = v0; + smem_b_tile_start[5 * num_threads_in_cluster] = v1; + smem_b_tile_start[6 * num_threads_in_cluster] = v2; + smem_b_tile_start[7 * num_threads_in_cluster] = v3; + + __asm__("end_loadab:"); + #else /* 
smem_a_tile_start[0 * num_threads_in_cluster + hw_tid] = \ dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 0]; smem_a_tile_start[1 * num_threads_in_cluster + hw_tid] = \ @@ -190,7 +207,7 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, dram_b_tile_start[runtime_const + every_iter * 0 + every_2iters * 3]; smem_b_tile_start[7 * num_threads_in_cluster + hw_tid] = \ dram_b_tile_start[runtime_const + every_iter * 1 + every_2iters * 3]; */ - #else + const float * const dram_a_tile_start = A + tile_i * TILE_M * dim_k + tile_k * TILE_K; const float * const dram_b_tile_start = B + tile_k * TILE_K * dim_n + tile_j * TILE_N; float * const smem_a_tile_start = SMEM_ADDR_0K; @@ -248,26 +265,34 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, rd_cycles(marker2); // cluster wide barrier to wait for A and B loads to complete - threadblock_barrier(0, /*barrier_id=*/threadblock_id, /*count=*/NUM_WARPS); + threadblock_barrier(0, /*barrier_id=*/0, /*count=*/NUM_WARPS); rd_cycles(marker3); - if (hw_tid == 0) { + __asm__("gemmini:"); + if (HW_TID() == 0) { sp_tiled_matmul_full_spad_ws(SPAD_ADDR_0K, SPAD_ADDR_12K, /*spad_D=*/0, SPAD_ADDR_4K, /*I=*/TILE_M / DIM, /*J=*/TILE_N / DIM, /*K=*/TILE_K / DIM, /*pad_I=*/0, /*pad_J=*/0, /*pad_K=*/0, /*a_transpose=*/0, /*b_transpose=*/0, /*full_C=*/0, /*low_D=*/0, - /*no_bias=*/1, /*repeating_bias=*/0, /*act=*/NO_ACTIVATION); + #ifdef EXT_ACCUMULATE + /*acc=*/0, /*act=*/NO_ACTIVATION, /*skips=*/0x38U); + #else + /*acc=*/tile_k != 0, /*act=*/NO_ACTIVATION, /*skips=*/0xB8U); + #endif gemmini_fence(); } + __asm__("end_gemmini:"); rd_cycles(marker4); - threadblock_barrier(0, /*barrier_id=*/threadblock_id, /*count=*/NUM_WARPS); + threadblock_barrier(0, /*barrier_id=*/0, /*count=*/NUM_WARPS); rd_cycles(marker5); // accumulate C matrix + #ifdef EXT_ACCUMULATE + __asm__("accumulate:"); if (tile_k == 0) { #pragma GCC ivdep #pragma GCC unroll 8 for (int thread_i = 0; thread_i < c_elems_per_thread; 
thread_i++) { - uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i; - smem_acc_tile_start[elem_offset] = smem_c_tile_start[elem_offset]; + constexpr uint32_t s = num_threads_in_cluster; + smem_acc_tile_start[thread_i * s] = smem_c_tile_start[hw_tid + s * thread_i]; } } else { #if (TILE_NK / NUM_THREADS / NUM_WARPS / CORES_PER_CLUSTER) != 8 @@ -275,18 +300,19 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, #endif for (int thread_i = 0; thread_i < c_elems_per_thread; thread_i += 8) { constexpr uint32_t s = num_threads_in_cluster; - smem_acc_tile_start[hw_tid + s * 0] += smem_c_tile_start[hw_tid + s * 0]; - smem_acc_tile_start[hw_tid + s * 1] += smem_c_tile_start[hw_tid + s * 1]; - smem_acc_tile_start[hw_tid + s * 2] += smem_c_tile_start[hw_tid + s * 2]; - smem_acc_tile_start[hw_tid + s * 3] += smem_c_tile_start[hw_tid + s * 3]; - smem_acc_tile_start[hw_tid + s * 4] += smem_c_tile_start[hw_tid + s * 4]; - smem_acc_tile_start[hw_tid + s * 5] += smem_c_tile_start[hw_tid + s * 5]; - smem_acc_tile_start[hw_tid + s * 6] += smem_c_tile_start[hw_tid + s * 6]; - smem_acc_tile_start[hw_tid + s * 7] += smem_c_tile_start[hw_tid + s * 7]; + smem_acc_tile_start[s * 0] += smem_c_tile_start[hw_tid + s * 0]; + smem_acc_tile_start[s * 1] += smem_c_tile_start[hw_tid + s * 1]; + smem_acc_tile_start[s * 2] += smem_c_tile_start[hw_tid + s * 2]; + smem_acc_tile_start[s * 3] += smem_c_tile_start[hw_tid + s * 3]; + smem_acc_tile_start[s * 4] += smem_c_tile_start[hw_tid + s * 4]; + smem_acc_tile_start[s * 5] += smem_c_tile_start[hw_tid + s * 5]; + smem_acc_tile_start[s * 6] += smem_c_tile_start[hw_tid + s * 6]; + smem_acc_tile_start[s * 7] += smem_c_tile_start[hw_tid + s * 7]; } } + __asm__("end_accumulate:"); + #endif - rd_cycles(marker6); #ifdef DEBUG_PRINT if (hw_tid == 0) { PRINTF("\nC %d %d %d\n", tile_i, tile_j, tile_k); @@ -302,11 +328,34 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, } } #endif + rd_cycles(marker6); } + 
#ifndef EXT_ACCUMULATE + threadblock_barrier(0, /*barrier_id=*/0, /*count=*/NUM_WARPS); + rd_cycles(marker6); + __asm__("mvout_spad_ser:"); + // mvout to scratchpad for activation + if (HW_TID() == 0) { + __asm__("mvout_spad:"); + ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, 0, (4ULL << 32) | (4ULL << 16) | 4ULL, k_LOOP_WS_CONFIG_BOUNDS) + ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, 0, 0, k_LOOP_WS_CONFIG_SPAD_AB) + ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, 0, 0x78U, k_LOOP_WS) + /* #pragma gcc unroll 16 + for (int i = 0; i < TILE_MN / DIM; i += DIM) { + gemmini_mvout_spad(i, 0x80000000ULL + i); // FIXME: C is not necessarily at 0 + } */ + __asm__("mvout_spad_fence:"); + gemmini_fence(); + } + __asm__("mvout_spad_bar:"); + threadblock_barrier(0, /*barrier_id=*/0, /*count=*/NUM_WARPS); + __asm__("end_mvout_spad:"); + #endif rd_cycles(marker7); - // move out to dram + // move out to dram + __asm__("mvout_dram:"); #ifdef HARDCODE #if (TILE_MN / NUM_THREADS / NUM_WARPS / CORES_PER_CLUSTER) != 8 #error CANNOT UNROLL @@ -314,23 +363,44 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, constexpr uint32_t every_iter = j1_stride; const uint32_t every_2iters = i1_stride * dim_n; const uint32_t runtime_const = i0 * dim_n + j1_idx + j0; - dram_c_tile_start[runtime_const + every_iter * 0 + every_2iters * 0] = \ - smem_acc_tile_start[0 * num_threads_in_cluster + hw_tid]; - dram_c_tile_start[runtime_const + every_iter * 1 + every_2iters * 0] = \ - smem_acc_tile_start[1 * num_threads_in_cluster + hw_tid]; - dram_c_tile_start[runtime_const + every_iter * 0 + every_2iters * 1] = \ - smem_acc_tile_start[2 * num_threads_in_cluster + hw_tid]; - dram_c_tile_start[runtime_const + every_iter * 1 + every_2iters * 1] = \ - smem_acc_tile_start[3 * num_threads_in_cluster + hw_tid]; - dram_c_tile_start[runtime_const + every_iter * 0 + every_2iters * 2] = \ - smem_acc_tile_start[4 * num_threads_in_cluster + hw_tid]; - dram_c_tile_start[runtime_const + every_iter * 1 + every_2iters * 2] = \ - 
smem_acc_tile_start[5 * num_threads_in_cluster + hw_tid]; - dram_c_tile_start[runtime_const + every_iter * 0 + every_2iters * 3] = \ - smem_acc_tile_start[6 * num_threads_in_cluster + hw_tid]; - dram_c_tile_start[runtime_const + every_iter * 1 + every_2iters * 3] = \ - smem_acc_tile_start[7 * num_threads_in_cluster + hw_tid]; + float * const dram_c_tile_start = C + tile_i * TILE_M * dim_n + tile_j * TILE_N + runtime_const; + + float v0 = smem_acc_tile_start[0 * num_threads_in_cluster]; + float v1 = smem_acc_tile_start[1 * num_threads_in_cluster]; + float v2 = smem_acc_tile_start[2 * num_threads_in_cluster]; + float v3 = smem_acc_tile_start[3 * num_threads_in_cluster]; + dram_c_tile_start[every_iter * 0 + every_2iters * 0] = v0; + dram_c_tile_start[every_iter * 1 + every_2iters * 0] = v1; + dram_c_tile_start[every_iter * 0 + every_2iters * 1] = v2; + dram_c_tile_start[every_iter * 1 + every_2iters * 1] = v3; + + v0 = smem_acc_tile_start[4 * num_threads_in_cluster]; + v1 = smem_acc_tile_start[5 * num_threads_in_cluster]; + v2 = smem_acc_tile_start[6 * num_threads_in_cluster]; + v3 = smem_acc_tile_start[7 * num_threads_in_cluster]; + dram_c_tile_start[every_iter * 0 + every_2iters * 2] = v0; + dram_c_tile_start[every_iter * 1 + every_2iters * 2] = v1; + dram_c_tile_start[every_iter * 0 + every_2iters * 3] = v2; + dram_c_tile_start[every_iter * 1 + every_2iters * 3] = v3; + #else + /*dram_c_tile_start[runtime_const + every_iter * 0 + every_2iters * 0] = \ + smem_acc_tile_start[0 * num_threads_in_cluster]; + dram_c_tile_start[runtime_const + every_iter * 1 + every_2iters * 0] = \ + smem_acc_tile_start[1 * num_threads_in_cluster]; + dram_c_tile_start[runtime_const + every_iter * 0 + every_2iters * 1] = \ + smem_acc_tile_start[2 * num_threads_in_cluster]; + dram_c_tile_start[runtime_const + every_iter * 1 + every_2iters * 1] = \ + smem_acc_tile_start[3 * num_threads_in_cluster]; + dram_c_tile_start[runtime_const + every_iter * 0 + every_2iters * 2] = \ + 
smem_acc_tile_start[4 * num_threads_in_cluster]; + dram_c_tile_start[runtime_const + every_iter * 1 + every_2iters * 2] = \ + smem_acc_tile_start[5 * num_threads_in_cluster]; + dram_c_tile_start[runtime_const + every_iter * 0 + every_2iters * 3] = \ + smem_acc_tile_start[6 * num_threads_in_cluster]; + dram_c_tile_start[runtime_const + every_iter * 1 + every_2iters * 3] = \ + smem_acc_tile_start[7 * num_threads_in_cluster];*/ + #pragma GCC unroll 8 for (int thread_i = 0; thread_i < c_elems_per_thread; thread_i++) { uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i; @@ -338,62 +408,45 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, *(SMEM_ADDR_8K + SMEM_MAT_OFFSET(elem_offset / TILE_N, elem_offset % TILE_N, TILE_N)); } #endif + __asm__("end_mvout_dram:"); rd_cycles(marker8); - /* if (hw_tid == 0) { - sprintf(PRINT_BUF, "\nC %d %d\n", tile_i, tile_j); - for (int i = 0; i < TILE_M; i += 8) { - for (int j = 0; j < TILE_N; j += 8) { - sprintf(PRINT_BUF, "%d %d ", - (int) (C[(tile_i * TILE_M + i) * dim_n + tile_j * TILE_N + j]), - (int) (C[(tile_i * TILE_M + i) * dim_n + tile_j * TILE_N + j + 4]) - ); - } - sprintf(PRINT_BUF, "\n"); - } - } */ } } // last thread block complete if (threadblock_id == NUM_CLUSTERS - 1) { threadblock_barrier(0, /*barrier_id=*/0, /*count=*/NUM_WARPS); rd_cycles(marker9); - if (hw_tid == 0) { + if (HW_TID() == 0) { PRINTF("\ncomplete\n"); PRINTF("total cycles: %d\n", marker9 - marker0); - PRINTF("tile start: %d\n", marker1); - PRINTF("single tile cycles: %d\n", marker6 - marker1); - PRINTF("A/B tile load cycles: %d\n", marker2 - marker1); - PRINTF("first barrier: %d\n", marker3 - marker2); - PRINTF("gemmini cycles: %d\n", marker4 - marker3); - PRINTF("second barrier: %d\n", marker5 - marker4); - PRINTF("accumulation cycles: %d\n", marker6 - marker5); - PRINTF("dram mvout cycles: %d\n", marker8 - marker7); } - threadblock_barrier(0, /*barrier_id=*/1, /*count=*/NUM_WARPS); - if (hw_tid == 
num_threads_in_cluster - 1) { - PRINTF("\ntile start: %d\n", marker1); - PRINTF("single tile cycles: %d\n", marker6 - marker1); - PRINTF("A/B tile load cycles: %d\n", marker2 - marker1); - PRINTF("first barrier: %d\n", marker3 - marker2); - PRINTF("gemmini cycles: %d\n", marker4 - marker3); - PRINTF("second barrier: %d\n", marker5 - marker4); - PRINTF("accumulation cycles: %d\n", marker6 - marker5); - PRINTF("dram mvout cycles: %d\n", marker8 - marker7); + vx_tmc(0x81); + for (int x = 0; x < num_threads_in_cluster; x += num_threads_in_cluster - 1) { + if (HW_TID() == x) { + PRINTF("\ntile start: %d\n", marker1); + PRINTF("single tile cycles: %d\n", marker6 - marker1); + PRINTF("A/B tile load cycles: %d\n", marker2 - marker1); + PRINTF("first barrier: %d\n", marker3 - marker2); + PRINTF("gemmini cycles: %d\n", marker4 - marker3); + PRINTF("second barrier: %d\n", marker5 - marker4); + #ifdef EXT_ACCUMULATE + PRINTF("accumulation cycles: %d\n", marker6 - marker5); + #else + PRINTF("smem mvout cycles: %d %d-%d\n", marker7 - marker6, marker7, marker6); + #endif + PRINTF("dram mvout cycles: %d\n", marker8 - marker7); + } + threadblock_barrier(0, /*barrier_id=*/1, /*count=*/NUM_WARPS); } - threadblock_barrier(0, /*barrier_id=*/2, /*count=*/NUM_WARPS); - if (hw_tid == 0) { + if (HW_TID() == 0) { for (int i = 0; i < dim_m; i += 8) { for (int j = 0; j < dim_n; j += 8) { - sprintf(PRINT_BUF, "%d %d ", - (int) (C[i * dim_n + j]), - (int) (C[i * dim_n + j + 4]) - ); + PRINTF("%d %d ", (int) (C[i * dim_n + j]), (int) (C[i * dim_n + j + 4])); } PRINTF("\n"); } } - vx_tmc_one(); } vx_tmc(0); } From 01f4a69ae9025ae62fa870e52f9d03abf4aba208 Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Sun, 28 Apr 2024 01:18:51 -0700 Subject: [PATCH 74/75] dma mvout, double buffering & other opts --- kernel/include/gemmini_mmio.h | 34 +-- kernel/src/vx_start.S | 2 +- tests/regression/sgemm_gemmini/kernel.cpp | 241 ++++++++++++---------- third_party/gemmini-rocc-tests | 2 +- 4 files changed, 154 
insertions(+), 125 deletions(-) diff --git a/kernel/include/gemmini_mmio.h b/kernel/include/gemmini_mmio.h index b9dde44d..072fa8fc 100644 --- a/kernel/include/gemmini_mmio.h +++ b/kernel/include/gemmini_mmio.h @@ -37,30 +37,33 @@ #undef ROCC_INSTRUCTION_RS1_RS2 #define ROCC_INSTRUCTION_RS1_RS2(x, rs1, rs2, funct) { \ - /* printf("function %d\n", funct); */ \ - uint32_t instruction = (0x7B) | (0 << 7) | (3 << 12) | (1 << 15) | (2 << 20) | ((uint32_t) (funct) << 25); \ - *((volatile uint64_t *) GEMMINI_RS1_ADDR) = (volatile uint64_t) (rs1); \ - *((volatile uint64_t *) GEMMINI_RS2_ADDR) = (volatile uint64_t) (rs2); \ + /* printf("function %d\n", funct); */ \ + *((volatile uint64_t *) GEMMINI_RS1_ADDR) = (rs1); \ + *((volatile uint64_t *) GEMMINI_RS2_ADDR) = (rs2); \ /* *((volatile uint32_t*) GEMMINI_RS2_ADDR) = (uint32_t) ((uint64_t) (rs2) & 0xFFFFFFFFULL); */ \ /* *((volatile uint32_t*) (GEMMINI_RS2_ADDR + 4)) = (uint32_t) ((uint64_t) (rs2) >> 32); */ \ /* gemmini_fence(); */ \ - *((volatile uint32_t*) GEMMINI_INST_ADDR) = instruction; \ + *((volatile uint32_t*) GEMMINI_INST_ADDR) = (0x7B) | (0 << 7) | (3 << 12) | (1 << 15) | (2 << 20) | ((funct) << 25); \ /* sprintf((char *) PRINT_BUF, "%llx %llx %d\n", rs1, rs2, funct); */ \ } -static void sp_tiled_matmul_full_spad_ws(const uint32_t A_sp_addr_start, const uint32_t B_sp_addr_start, - const uint32_t D_sp_addr_start, const uint32_t C_dst_sp_addr_start, - size_t I, size_t J, size_t K, size_t pad_I, size_t pad_J, size_t pad_K, - bool a_transpose, bool b_transpose, - bool full_C, bool low_D, - bool no_bias, bool repeating_bias, - int act) { +#define sp_tiled_matmul_full_spad_ws(A_sp_addr_start, B_sp_addr_start, D_sp_addr_start, C_dst_sp_addr_start,\ + I, J, K, pad_I, pad_J, pad_K, a_transpose, b_transpose, full_C, low_D, acc, act, skips) \ + gemmini_loop_ws_spad(I, J, K, pad_I, pad_J, pad_K, A_sp_addr_start, (B_sp_addr_start) + (K) * (J) * DIM, NULL, \ + C_dst_sp_addr_start, a_transpose, b_transpose, full_C, low_D, acc, 
act, 0, 0, false, skips) + +/* inline static void sp_tiled_matmul_full_spad_ws(const uint32_t A_sp_addr_start, const uint32_t B_sp_addr_start, + const uint32_t D_sp_addr_start, const uint32_t C_dst_sp_addr_start, + size_t I, size_t J, size_t K, size_t pad_I, size_t pad_J, size_t pad_K, + bool a_transpose, bool b_transpose, + bool full_C, bool low_D, bool acc, + int act, int skip_mvout) { gemmini_loop_ws_spad(I, J, K, pad_I, pad_J, pad_K, A_sp_addr_start, B_sp_addr_start + K * J * DIM, NULL, C_dst_sp_addr_start, a_transpose, b_transpose, - full_C, low_D, false, - act, 0, 0, false); + full_C, low_D, acc, + act, 0, 0, false, skip_mvout); */ /* return; @@ -155,8 +158,7 @@ static void sp_tiled_matmul_full_spad_ws(const uint32_t A_sp_addr_start, const u } } gemmini_fence(); - */ -} +}*/ #endif diff --git a/kernel/src/vx_start.S b/kernel/src/vx_start.S index d2a81707..49e520b6 100644 --- a/kernel/src/vx_start.S +++ b/kernel/src/vx_start.S @@ -102,7 +102,7 @@ init_regs: #endif csrr t0, VX_CSR_MHARTID sll t1, t0, STACK_LOG2_SIZE - sll t2, t0, 2 + sll t2, t0, 4 add t1, t1, t2 sub sp, sp, t1 diff --git a/tests/regression/sgemm_gemmini/kernel.cpp b/tests/regression/sgemm_gemmini/kernel.cpp index 243a2a24..f1893f6d 100644 --- a/tests/regression/sgemm_gemmini/kernel.cpp +++ b/tests/regression/sgemm_gemmini/kernel.cpp @@ -6,46 +6,42 @@ #include "include/gemmini.h" #include "gemmini_mmio.h" -#define MATRIX_M 64 // TODO: remove hardcode -#define MATRIX_N 64 -#define MATRIX_K 64 -#define TILE_M 32 // tile size = SMEM size / 2 (double buffering) / 4 (A, B, C, Psum) +#define TILE_M 32 #define TILE_N 32 #define TILE_K 32 #define TILE_MN 1024 #define TILE_MK 1024 #define TILE_NK 1024 -//#define EXT_ACCUMULATE - #define NUM_CLUSTERS 1 -#define TB_M (MATRIX_M / NUM_CLUSTERS) -#define TB_N MATRIX_N -#define TB_SIZE (TB_M * TB_N) -#define NUM_TILE_ROWS_PER_TB (TB_M / TILE_M) -#define THREAD_ELEMS 8 // elements per thread in a tile -#define THREAD_STRIDE 8 // threads per core +#define 
NUM_THREADS_IN_CLUSTER 128 #define SMEM_ADDR_0K ((float * const) 0xff000000) #define SMEM_ADDR_4K ((float * const) 0xff001000) #define SMEM_ADDR_8K ((float * const) 0xff002000) #define SMEM_ADDR_12K ((float * const) 0xff003000) - #define SPAD_ADDR_0K 0x0 #define SPAD_ADDR_4K 0x80 #define SPAD_ADDR_8K 0x100 #define SPAD_ADDR_12K 0x180 -#define HARDCODE -#define PRINTF(...) sprintf(PRINT_BUF, __VA_ARGS__) -//#define PRINTF(...) vx_printf(__VA_ARGS__) - // #define DEBUG_PRINT -#define rd_cycles(x) asm volatile ("csrr %0, mcycle" : "=r" (x)) +// #define EXT_ACCUMULATE +#define HARDCODE +#define DBUF +// #define DETAILED_PERF -#define HW_TID() ({uint32_t gtid; asm ("csrr %0, mhartid" : "=r" (gtid)); gtid;}) +#define rd_cycles_force(x) asm volatile ("csrr %0, mcycle" : "=r" (x)) +#ifdef DETAILED_PERF + #define rd_cycles(x) rd_cycles_force(x) +#else + #define rd_cycles(x) +#endif +#define HW_TID() ({uint32_t gtid; asm volatile ("csrr %0, mhartid" : "=r" (gtid)); gtid;}) +#define PRINTF(...) sprintf(PRINT_BUF, __VA_ARGS__) +// #define PRINTF(...) 
vx_printf(__VA_ARGS__) -void threadblock_barrier(unsigned int tid_in_threadblock, unsigned int barrier_id, unsigned int count) { +inline void threadblock_barrier(unsigned int barrier_id, unsigned int count) { vx_fence(); vx_barrier(barrier_id, count); } @@ -58,14 +54,26 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, const float * const B = (const float * const) arg->addr_b; float * const C = (float * const) arg->addr_c; + if (HW_TID() == 0) { + gemmini_config_ld(0); + gemmini_extended_config_ex(WEIGHT_STATIONARY, 0, 0, 1, 0, 0); + gemmini_config_st(0); + PRINTF("start\n"); + } + + vx_fence(); + + uint32_t marker0, marker1, marker2, marker3, marker4; + uint32_t marker5, marker6, marker7, marker8, marker9; + rd_cycles_force(marker0); + const uint32_t dim_m = arg->dim_m; const uint32_t dim_n = arg->dim_n; const uint32_t dim_k = arg->dim_k; + const uint32_t num_tiles_m = dim_m / TILE_M; const uint32_t num_tiles_n = dim_n / TILE_N; const uint32_t num_tiles_k = dim_k / TILE_K; - // TODO: make this into constexpr by subbing architectural params with macros - // const uint32_t num_threads_in_cluster = vx_num_threads() * vx_num_warps() * CORES_PER_CLUSTER; - constexpr uint32_t num_threads_in_cluster = 128; + constexpr uint32_t num_threads_in_cluster = NUM_THREADS_IN_CLUSTER; constexpr uint32_t a_elems_per_thread = TILE_MK / num_threads_in_cluster; constexpr uint32_t b_elems_per_thread = TILE_NK / num_threads_in_cluster; constexpr uint32_t c_elems_per_thread = TILE_MN / num_threads_in_cluster; @@ -84,25 +92,13 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, constexpr uint32_t i1_stride = DIM; // step per increment (increment doesnt happen every iteration) constexpr uint32_t i1_iters = (DIM * DIM * (TILE_K / DIM)) / num_threads_in_cluster; // num of iters before striding - uint32_t marker0, marker1, marker2, marker3, marker4; - uint32_t marker5, marker6, marker7, marker8, marker9; - - if (HW_TID() == 0) { - gemmini_config_ld(0); - 
gemmini_extended_config_ex(WEIGHT_STATIONARY, 0, 0, 1, 0, 0); - gemmini_config_st(0); - PRINTF("start\n"); - } - - // TODO: check for tb id - rd_cycles(marker0); - - __asm__("i_loop:"); - for (int tile_i = NUM_TILE_ROWS_PER_TB * threadblock_id; - tile_i < NUM_TILE_ROWS_PER_TB * (threadblock_id + 1); - tile_i += 1) { - __asm__("j_loop:"); + const uint32_t num_tile_rows_per_tb = num_tiles_m / NUM_CLUSTERS; + for (uint32_t tile_i = num_tile_rows_per_tb * threadblock_id; + tile_i < num_tile_rows_per_tb * (threadblock_id + 1); + tile_i += 1) { + __asm__("i_loop:"); for (int tile_j = 0; tile_j < num_tiles_n; tile_j += 1) { + __asm__("j_loop:"); float * const smem_c_tile_start = SMEM_ADDR_4K; #ifndef EXT_ACCUMULATE float * const smem_acc_tile_start = SMEM_ADDR_0K + HW_TID(); @@ -119,7 +115,6 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, #if (TILE_MK / NUM_THREADS / NUM_WARPS / CORES_PER_CLUSTER) != 8 #error CANNOT UNROLL #endif - // preload A B matrix constexpr uint32_t every_iter = j1_stride; const uint32_t every_2iters_a = i1_stride * dim_k; @@ -129,50 +124,57 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, const float * const dram_a_tile_start = A + tile_i * TILE_M * dim_k + tile_k * TILE_K + runtime_const_a; const float * const dram_b_tile_start = B + tile_k * TILE_K * dim_n + tile_j * TILE_N + runtime_const_b; + #ifdef DBUF + float * const smem_a_tile_start = ((tile_k & 1) ? SMEM_ADDR_4K : SMEM_ADDR_0K) + HW_TID(); + float * const smem_b_tile_start = ((tile_k & 1) ? 
SMEM_ADDR_12K : SMEM_ADDR_8K) + HW_TID(); + #else float * const smem_a_tile_start = SMEM_ADDR_0K + HW_TID(); float * const smem_b_tile_start = SMEM_ADDR_12K + HW_TID(); + #endif - __asm__("load_ab:"); - float v0 = dram_a_tile_start[every_iter * 0 + every_2iters_a * 0]; - float v1 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 0]; - float v2 = dram_a_tile_start[every_iter * 0 + every_2iters_a * 1]; - float v3 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 1]; - smem_a_tile_start[0 * num_threads_in_cluster] = v0; - smem_a_tile_start[1 * num_threads_in_cluster] = v1; - smem_a_tile_start[2 * num_threads_in_cluster] = v2; - smem_a_tile_start[3 * num_threads_in_cluster] = v3; + { + __asm__("load_ab:"); + float v0 = dram_a_tile_start[every_iter * 0 + every_2iters_a * 0]; + float v1 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 0]; + float v2 = dram_a_tile_start[every_iter * 0 + every_2iters_a * 1]; + float v3 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 1]; + smem_a_tile_start[0 * num_threads_in_cluster] = v0; + smem_a_tile_start[1 * num_threads_in_cluster] = v1; + smem_a_tile_start[2 * num_threads_in_cluster] = v2; + smem_a_tile_start[3 * num_threads_in_cluster] = v3; - __asm__("load_ab1:"); - v0 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 0]; - v1 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 0]; - v2 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 1]; - v3 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 1]; - smem_b_tile_start[0 * num_threads_in_cluster] = v0; - smem_b_tile_start[1 * num_threads_in_cluster] = v1; - smem_b_tile_start[2 * num_threads_in_cluster] = v2; - smem_b_tile_start[3 * num_threads_in_cluster] = v3; + __asm__("load_ab1:"); + v0 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 0]; + v1 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 0]; + v2 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 1]; + v3 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 1]; + 
smem_b_tile_start[0 * num_threads_in_cluster] = v0; + smem_b_tile_start[1 * num_threads_in_cluster] = v1; + smem_b_tile_start[2 * num_threads_in_cluster] = v2; + smem_b_tile_start[3 * num_threads_in_cluster] = v3; - __asm__("load_ab2:"); - v0 = dram_a_tile_start[every_iter * 0 + every_2iters_a * 2]; - v1 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 2]; - v2 = dram_a_tile_start[every_iter * 0 + every_2iters_a * 3]; - v3 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 3]; - smem_a_tile_start[4 * num_threads_in_cluster] = v0; - smem_a_tile_start[5 * num_threads_in_cluster] = v1; - smem_a_tile_start[6 * num_threads_in_cluster] = v2; - smem_a_tile_start[7 * num_threads_in_cluster] = v3; + __asm__("load_ab2:"); + v0 = dram_a_tile_start[every_iter * 0 + every_2iters_a * 2]; + v1 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 2]; + v2 = dram_a_tile_start[every_iter * 0 + every_2iters_a * 3]; + v3 = dram_a_tile_start[every_iter * 1 + every_2iters_a * 3]; + smem_a_tile_start[4 * num_threads_in_cluster] = v0; + smem_a_tile_start[5 * num_threads_in_cluster] = v1; + smem_a_tile_start[6 * num_threads_in_cluster] = v2; + smem_a_tile_start[7 * num_threads_in_cluster] = v3; - __asm__("load_ab3:"); - v0 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 2]; - v1 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 2]; - v2 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 3]; - v3 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 3]; - smem_b_tile_start[4 * num_threads_in_cluster] = v0; - smem_b_tile_start[5 * num_threads_in_cluster] = v1; - smem_b_tile_start[6 * num_threads_in_cluster] = v2; - smem_b_tile_start[7 * num_threads_in_cluster] = v3; + __asm__("load_ab3:"); + v0 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 2]; + v1 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 2]; + v2 = dram_b_tile_start[every_iter * 0 + every_2iters_b * 3]; + v3 = dram_b_tile_start[every_iter * 1 + every_2iters_b * 3]; + smem_b_tile_start[4 * 
num_threads_in_cluster] = v0; + smem_b_tile_start[5 * num_threads_in_cluster] = v1; + smem_b_tile_start[6 * num_threads_in_cluster] = v2; + smem_b_tile_start[7 * num_threads_in_cluster] = v3; - __asm__("end_loadab:"); + __asm__("end_loadab:"); + } #else /* smem_a_tile_start[0 * num_threads_in_cluster + hw_tid] = \ dram_a_tile_start[runtime_const + every_iter * 0 + every_2iters * 0]; @@ -265,11 +267,20 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, rd_cycles(marker2); // cluster wide barrier to wait for A and B loads to complete - threadblock_barrier(0, /*barrier_id=*/0, /*count=*/NUM_WARPS); + threadblock_barrier(/*barrier_id=*/0, /*count=*/NUM_WARPS); rd_cycles(marker3); __asm__("gemmini:"); if (HW_TID() == 0) { - sp_tiled_matmul_full_spad_ws(SPAD_ADDR_0K, SPAD_ADDR_12K, /*spad_D=*/0, SPAD_ADDR_4K, + #ifdef DBUF + gemmini_fence(); + #endif + sp_tiled_matmul_full_spad_ws( + #ifdef DBUF + (tile_k & 1) ? SPAD_ADDR_4K : SPAD_ADDR_0K, (tile_k & 1) ? SPAD_ADDR_12K : SPAD_ADDR_8K, + #else + SPAD_ADDR_0K, SPAD_ADDR_12K, + #endif + /*spad_D=*/0, /*spad_C=*/SPAD_ADDR_4K, /*I=*/TILE_M / DIM, /*J=*/TILE_N / DIM, /*K=*/TILE_K / DIM, /*pad_I=*/0, /*pad_J=*/0, /*pad_K=*/0, /*a_transpose=*/0, /*b_transpose=*/0, /*full_C=*/0, /*low_D=*/0, #ifdef EXT_ACCUMULATE @@ -277,11 +288,13 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, #else /*acc=*/tile_k != 0, /*act=*/NO_ACTIVATION, /*skips=*/0xB8U); #endif + #ifndef DBUF gemmini_fence(); + #endif } __asm__("end_gemmini:"); rd_cycles(marker4); - threadblock_barrier(0, /*barrier_id=*/0, /*count=*/NUM_WARPS); + threadblock_barrier(/*barrier_id=*/0, /*count=*/NUM_WARPS); rd_cycles(marker5); // accumulate C matrix @@ -329,18 +342,30 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, } #endif rd_cycles(marker6); + + /* if (HW_TID() == 0) { + PRINTF("\ntile start: %d\n", marker1); + PRINTF("single tile cycles: %d\n", marker6 - marker1); + PRINTF("A/B tile load cycles: %d\n", marker2 - 
marker1); + PRINTF("first barrier: %d\n", marker3 - marker2); + PRINTF("gemmini cycles: %d\n", marker4 - marker3); + PRINTF("second barrier: %d\n", marker5 - marker4); + } */ + } #ifndef EXT_ACCUMULATE - threadblock_barrier(0, /*barrier_id=*/0, /*count=*/NUM_WARPS); + threadblock_barrier(/*barrier_id=*/0, /*count=*/NUM_WARPS); rd_cycles(marker6); __asm__("mvout_spad_ser:"); // mvout to scratchpad for activation if (HW_TID() == 0) { __asm__("mvout_spad:"); + #ifdef DBUF + gemmini_fence(); + #endif ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, 0, (4ULL << 32) | (4ULL << 16) | 4ULL, k_LOOP_WS_CONFIG_BOUNDS) - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, 0, 0, k_LOOP_WS_CONFIG_SPAD_AB) - ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, 0, 0x78U, k_LOOP_WS) + ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, 0, 0x278U, k_LOOP_WS) /* #pragma gcc unroll 16 for (int i = 0; i < TILE_MN / DIM; i += DIM) { gemmini_mvout_spad(i, 0x80000000ULL + i); // FIXME: C is not necessarily at 0 @@ -349,7 +374,7 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, gemmini_fence(); } __asm__("mvout_spad_bar:"); - threadblock_barrier(0, /*barrier_id=*/0, /*count=*/NUM_WARPS); + threadblock_barrier(/*barrier_id=*/0, /*count=*/NUM_WARPS); __asm__("end_mvout_spad:"); #endif rd_cycles(marker7); @@ -415,30 +440,32 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, } // last thread block complete if (threadblock_id == NUM_CLUSTERS - 1) { - threadblock_barrier(0, /*barrier_id=*/0, /*count=*/NUM_WARPS); - rd_cycles(marker9); + threadblock_barrier(/*barrier_id=*/0, /*count=*/NUM_WARPS); + rd_cycles_force(marker9); if (HW_TID() == 0) { PRINTF("\ncomplete\n"); PRINTF("total cycles: %d\n", marker9 - marker0); } - vx_tmc(0x81); - for (int x = 0; x < num_threads_in_cluster; x += num_threads_in_cluster - 1) { - if (HW_TID() == x) { - PRINTF("\ntile start: %d\n", marker1); - PRINTF("single tile cycles: %d\n", marker6 - marker1); - PRINTF("A/B tile load cycles: %d\n", marker2 - marker1); - PRINTF("first barrier: 
%d\n", marker3 - marker2); - PRINTF("gemmini cycles: %d\n", marker4 - marker3); - PRINTF("second barrier: %d\n", marker5 - marker4); - #ifdef EXT_ACCUMULATE - PRINTF("accumulation cycles: %d\n", marker6 - marker5); - #else - PRINTF("smem mvout cycles: %d %d-%d\n", marker7 - marker6, marker7, marker6); - #endif - PRINTF("dram mvout cycles: %d\n", marker8 - marker7); + #ifdef DETAILED_PERF + vx_tmc(0x81); + for (int x = 0; x < num_threads_in_cluster; x += num_threads_in_cluster - 1) { + if (HW_TID() == x) { + PRINTF("\ntile start: %d\n", marker1); + PRINTF("single tile cycles: %d\n", marker6 - marker1); + PRINTF("A/B tile load cycles: %d\n", marker2 - marker1); + PRINTF("first barrier: %d\n", marker3 - marker2); + PRINTF("gemmini cycles: %d\n", marker4 - marker3); + PRINTF("second barrier: %d\n", marker5 - marker4); + #ifdef EXT_ACCUMULATE + PRINTF("accumulation cycles: %d\n", marker6 - marker5); + #else + PRINTF("smem mvout cycles: %d %d-%d\n", marker7 - marker6, marker7, marker6); + #endif + PRINTF("dram mvout cycles: %d\n", marker8 - marker7); + } + threadblock_barrier(/*barrier_id=*/1, /*count=*/NUM_WARPS); } - threadblock_barrier(0, /*barrier_id=*/1, /*count=*/NUM_WARPS); - } + #endif if (HW_TID() == 0) { for (int i = 0; i < dim_m; i += 8) { for (int j = 0; j < dim_n; j += 8) { @@ -455,8 +482,8 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) { // @perf: All threads are running these compute whose result is mostly same // across the threadblock - const int threadblock_id = task_id / TB_SIZE; - const int tid_in_threadblock = task_id % TB_SIZE; + const int threadblock_id = task_id / NUM_THREADS_IN_CLUSTER; + const int tid_in_threadblock = task_id % NUM_THREADS_IN_CLUSTER; thread_block_matmul_gemmini(arg, threadblock_id, tid_in_threadblock); } diff --git a/third_party/gemmini-rocc-tests b/third_party/gemmini-rocc-tests index 62106286..6148fc0d 160000 --- a/third_party/gemmini-rocc-tests +++ b/third_party/gemmini-rocc-tests @@ -1 +1 @@ -Subproject 
commit 62106286e5b7479065025666cdc5f6bc020be764 +Subproject commit 6148fc0d2c7a91ec87e72bdd3c3808c6f985a77e From a606a9ef42708984e130b582042be04a3f783f66 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 29 Apr 2024 17:14:28 -0700 Subject: [PATCH 75/75] common.mk: properly handle unspecified CONFIG --- tests/regression/common.mk | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/regression/common.mk b/tests/regression/common.mk index a0873857..24a871eb 100644 --- a/tests/regression/common.mk +++ b/tests/regression/common.mk @@ -79,7 +79,10 @@ endif endif endif -all: $(PROJECT) kernel.bin kernel.dump kernel.radiance.dump kernel.radiance.$(CONFIG).dump +# CONFIG is supplied from the command line to differentiate ELF files with custom suffixes +CONFIGEXT = $(if $(CONFIG),.$(CONFIG),) + +all: $(PROJECT) kernel.bin kernel.dump kernel.radiance.dump kernel.radiance$(CONFIGEXT).dump kernel.dump: kernel.elf $(VX_DP) -D kernel.elf > kernel.dump @@ -87,8 +90,10 @@ kernel.dump: kernel.elf kernel.radiance.dump: kernel.radiance.elf $(VX_DP) -D kernel.radiance.elf > kernel.radiance.dump -kernel.radiance.$(CONFIG).dump: kernel.radiance.$(CONFIG).elf - $(VX_DP) -D kernel.radiance.$(CONFIG).elf > kernel.radiance.$(CONFIG).dump +ifneq ($(CONFIG),) +kernel.radiance$(CONFIGEXT).dump: kernel.radiance$(CONFIGEXT).elf + $(VX_DP) -D kernel.radiance$(CONFIGEXT).elf > kernel.radiance$(CONFIGEXT).dump +endif kernel.bin: kernel.elf kernel.radiance.elf $(VX_CP) -O binary kernel.elf kernel.bin @@ -105,8 +110,10 @@ kernel.radiance.elf: kernel.elf $(OBJCOPY) --update-section .operand.a=input.a.bin kernel.radiance.elf $(OBJCOPY) --update-section .operand.b=input.b.bin kernel.radiance.elf -kernel.radiance.$(CONFIG).elf: kernel.radiance.elf +ifneq ($(CONFIG),) +kernel.radiance$(CONFIGEXT).elf: kernel.radiance.elf cp $< $@ +endif $(PROJECT): $(SRCS) $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@