From b1e649563018f2f54ecca60b6cb441e2477e0f26 Mon Sep 17 00:00:00 2001 From: Richard Yan Date: Tue, 28 Jan 2025 16:37:22 -0800 Subject: [PATCH] update kernels --- kernel/include/gemmini_mmio.h | 71 ++++++++---------- tests/regression/sgemm_gemmini_dma/args/1024 | Bin 0 -> 40 bytes tests/regression/sgemm_gemmini_dma/args/128 | Bin 0 -> 40 bytes tests/regression/sgemm_gemmini_dma/args/256 | Bin 0 -> 40 bytes tests/regression/sgemm_gemmini_dma/args/512 | Bin 0 -> 40 bytes .../sgemm_gemmini_dma/compile_ampere.sh | 11 +++ .../sgemm_gemmini_dma/compile_hopper.sh | 11 +++ .../sgemm_gemmini_dma/generate_operands.py | 22 +++--- tests/regression/sgemm_gemmini_dma/kernel.cpp | 2 +- 9 files changed, 67 insertions(+), 50 deletions(-) create mode 100644 tests/regression/sgemm_gemmini_dma/args/1024 create mode 100644 tests/regression/sgemm_gemmini_dma/args/128 create mode 100644 tests/regression/sgemm_gemmini_dma/args/256 create mode 100644 tests/regression/sgemm_gemmini_dma/args/512 create mode 100755 tests/regression/sgemm_gemmini_dma/compile_ampere.sh create mode 100755 tests/regression/sgemm_gemmini_dma/compile_hopper.sh diff --git a/kernel/include/gemmini_mmio.h b/kernel/include/gemmini_mmio.h index 894d5fc6..08e1a792 100644 --- a/kernel/include/gemmini_mmio.h +++ b/kernel/include/gemmini_mmio.h @@ -4,6 +4,8 @@ #error INCLUDE GEMMINI.H FIRST #endif +/* shared memory constants and helpers */ +/* =================================== */ #define SMEM_BASE 0xff000000 // 16KB // #define SMEM_SIZE 0x4000 @@ -13,50 +15,19 @@ // #define SMEM_SIZE 0x20000 // 256KB #define SMEM_SIZE 0x20000 - #define SMEM_MASK (SMEM_SIZE - 1) #define SMEM_ADDR_END (SMEM_BASE + SMEM_SIZE) -static size_t gemmini_tile_idx[NUM_THREADS * NUM_WARPS * NUM_CORES * NUM_CLUSTERS] = {0}; - -#define HW_TID() ({uint32_t gtid; asm volatile ("csrr %0, mhartid" : "=r" (gtid)); gtid;}) -#define use_gemmini(i) {gemmini_tile_idx[HW_TID()] = (i);} -#define GEMMINI_TILE_IDX() (gemmini_tile_idx[HW_TID()]) -#define 
GEMMINI_CTRL (SMEM_BASE + SMEM_SIZE + 0x3000 + 0x100 * GEMMINI_TILE_IDX()) -#define GEMMINI_CISC_IMM(x, i) ((x) + 32 * (i)) - #define SPAD_BASE 0x0 #define SPAD_ROW_SIZE (DIM * sizeof(elem_t)) #define SPAD_NUM_ROWS (SMEM_SIZE / SPAD_ROW_SIZE) #define SPAD_MASK (SPAD_NUM_ROWS - 1) #define PRINT_BUF ((char *) (SMEM_ADDR_END)) -#define GEMMINI_RS1_ADDR (GEMMINI_CTRL + 0x10) -#define GEMMINI_RS2_ADDR (GEMMINI_CTRL + 0x18) -#define GEMMINI_INST_ADDR (GEMMINI_CTRL + 0x0) -#define GEMMINI_BUSY_ADDR (GEMMINI_CTRL + 0x20) - +#define HW_TID() ({uint32_t gtid; asm volatile ("csrr %0, mhartid" : "=r" (gtid)); gtid;}) #define SMEM_TO_SPAD(smem_addr) (SPAD_BASE + ((smem_addr) & SMEM_MASK) / SPAD_ROW_SIZE) #define SPAD_TO_SMEM(spad_addr) (SMEM_BASE + ((spad_addr) & SPAD_MASK) * SPAD_ROW_SIZE) - -// CISC instructions: -// 0, 1, 2: tile-sized matmuls -// 0: k = 0, no accumulation -// 1: k % 2 = 0, buffer regions 0 -// 2: k % 2 = 1, buffer regions 1 -// 8, 9, 10, 11: memory ops -// 8: tile-sized move-in stride -// 9: tile-sized move-out -// 10: tile-sized move-in, buffer regions 0 -// 11: tile-sized move-in, buffer regions 1 -// bits [4:0] is the opcode -// bits [7:5] is the target gemmini id, zero-indexed -// #define GEMMINI_CISC_CMD_I(x) asm("csrwi 0xacc, %0" :: "i" (x)) -#define GEMMINI_CISC_CMD_I(x) asm("csrw 0xacc, %0" :: "r" (x)) -#define GEMMINI_CISC_CMD_R(x) asm("csrw 0xacc, %0" :: "r" (x)) -#define GEMMINI_STATUS() ({uint32_t status; asm volatile ("csrr %0, 0xacc" : "=r" (status)); status;}) - // convert normal matrix i,j into tiled smem offset // top_in_tiles = i / DIM // left_in_tiles = j / DIM @@ -65,11 +36,17 @@ static size_t gemmini_tile_idx[NUM_THREADS * NUM_WARPS * NUM_CORES * NUM_CLUSTER #define SMEM_MAT_OFFSET(i, j, J) \ (((i) / DIM * (J) / DIM + (j) / DIM) * DIM * DIM + ((i) % DIM) * DIM + ((j) % DIM)) -// #define fence() { for (int i = 0; i < 10; i++) *((volatile uint32_t *) (0xFFFF0000)) = 0xdeadbeef; } -#undef gemmini_fence -//#define gemmini_fence() { while 
(GEMMINI_STATUS()); } -#define gemmini_fence() { while (*((volatile uint32_t *) GEMMINI_BUSY_ADDR)) asm volatile ("nop"); } - +/* gemmini mmio interface */ +/* ====================== */ +static size_t gemmini_tile_idx[NUM_THREADS * NUM_WARPS * NUM_CORES * NUM_CLUSTERS] = {0}; +#define use_gemmini(i) {gemmini_tile_idx[HW_TID()] = (i);} +#define GEMMINI_TILE_IDX() (gemmini_tile_idx[HW_TID()]) +#define GEMMINI_CISC_IMM(x, i) ((x) + 32 * (i)) +#define GEMMINI_CTRL (SMEM_BASE + SMEM_SIZE + 0x3000 + 0x100 * GEMMINI_TILE_IDX()) +#define GEMMINI_RS1_ADDR (GEMMINI_CTRL + 0x10) +#define GEMMINI_RS2_ADDR (GEMMINI_CTRL + 0x18) +#define GEMMINI_INST_ADDR (GEMMINI_CTRL + 0x0) +#define GEMMINI_BUSY_ADDR (GEMMINI_CTRL + 0x20) #undef ROCC_INSTRUCTION_RS1_RS2 #define ROCC_INSTRUCTION_RS1_RS2(x, rs1, rs2, funct) { \ *((volatile uint64_t *) GEMMINI_RS1_ADDR) = (rs1); \ @@ -77,6 +54,8 @@ static size_t gemmini_tile_idx[NUM_THREADS * NUM_WARPS * NUM_CORES * NUM_CLUSTER *((volatile uint32_t*) GEMMINI_INST_ADDR) = (0x7B) | (0 << 7) | (3 << 12) | (1 << 15) | (2 << 20) | ((funct) << 25); \ } +/* additional intrinsics */ +/* ===================== */ #define loop_matmul_skips(skip_lda, skip_ldb, skip_ldd, skip_ex, skip_stc) \ (((skip_lda) | ((skip_ldb) << 1) | ((skip_ldd) << 2) | ((skip_ex) << 3) | ((skip_stc) << 4)) << 3) @@ -85,6 +64,20 @@ static size_t gemmini_tile_idx[NUM_THREADS * NUM_WARPS * NUM_CORES * NUM_CLUSTER gemmini_loop_ws_spad(I, J, K, pad_I, pad_J, pad_K, A_sp_addr_start, (B_sp_addr_start) + (K) * (J) * DIM, NULL, \ C_dst_sp_addr_start, a_transpose, b_transpose, full_C, low_D, acc, act, 0, 0, false, skips) +#define gemmini_status() ({uint32_t status; asm volatile ("csrr %0, 0xacc" : "=r" (status)); status;}) + +#undef gemmini_fence +//#define gemmini_fence() { while (gemmini_status()); } +#define gemmini_fence() { while (*((volatile uint32_t *) GEMMINI_BUSY_ADDR)) asm volatile ("nop"); } + +/* cisc instructions */ +/* ================= */ + +// bits [4:0] is the opcode +// bits 
[7:5] is the target gemmini id, zero-indexed +// #define GEMMINI_CISC_CMD_I(x) asm("csrwi 0xacc, %0" :: "i" (x)) +#define GEMMINI_CISC_CMD_I(x) asm("csrw 0xacc, %0" :: "r" (x)) // use registers even for immediate calls for now +#define GEMMINI_CISC_CMD_R(x) asm("csrw 0xacc, %0" :: "r" (x)) #define GEMMINI_CISC_COMPUTE_HEXADECILES 0 #define GEMMINI_CISC_SET_AB_STRIDE 8 @@ -93,7 +86,8 @@ static size_t gemmini_tile_idx[NUM_THREADS * NUM_WARPS * NUM_CORES * NUM_CLUSTER #define GEMMINI_CISC_SET_DC_STRIDE 11 #define GEMMINI_CISC_STORE_TO_GMEM 12 -// cisc instruction wrappers +/* high level virgo routines */ +/* ========================= */ inline void gemmini_tile_load_ab(const elem_t * const a_addr, const elem_t * const b_addr, const uint32_t a_hexadecile, const uint32_t b_hexadecile, const uint32_t tile_idx_i, const uint32_t tile_idx_j, const uint32_t tile_idx_k, @@ -130,6 +124,7 @@ inline void gemmini_tile_store_c_gmem(elem_t * const c_addr, inline void gemmini_tile_store_c_spad(const uint32_t c_hexadecile) { GEMMINI_CISC_CMD_R(((uint32_t) (c_hexadecile << 8)) | GEMMINI_CISC_STORE_TO_SPAD); } + /* inline static void sp_tiled_matmul_full_spad_ws(const uint32_t A_sp_addr_start, const uint32_t B_sp_addr_start, const uint32_t D_sp_addr_start, const uint32_t C_dst_sp_addr_start, size_t I, size_t J, size_t K, size_t pad_I, size_t pad_J, size_t pad_K, diff --git a/tests/regression/sgemm_gemmini_dma/args/1024 b/tests/regression/sgemm_gemmini_dma/args/1024 new file mode 100644 index 0000000000000000000000000000000000000000..38f0be5410c0dbd313d2b529ba4398231d19dce3 GIT binary patch literal 40 bcmZQzVPIf@VIUI-7Jx|xhJ{f20Eh+v8rcF4 literal 0 HcmV?d00001 diff --git a/tests/regression/sgemm_gemmini_dma/args/128 b/tests/regression/sgemm_gemmini_dma/args/128 new file mode 100644 index 0000000000000000000000000000000000000000..dae91e7956de06dbee33997cef53875806dd77c8 GIT binary patch literal 40 bcmZo*U|?u~Vju+q3&11;!$K&107L@-Pc;LA literal 0 HcmV?d00001 diff --git 
a/tests/regression/sgemm_gemmini_dma/args/256 b/tests/regression/sgemm_gemmini_dma/args/256 new file mode 100644 index 0000000000000000000000000000000000000000..ed11ccde38a7fc35d41d5912e858ecb4711fb0c5 GIT binary patch literal 40 bcmZQzWME)`VIUI-7Jx|xhJ{f20Eh+v8J+?K literal 0 HcmV?d00001 diff --git a/tests/regression/sgemm_gemmini_dma/args/512 b/tests/regression/sgemm_gemmini_dma/args/512 new file mode 100644 index 0000000000000000000000000000000000000000..031177e33d4db50992ac1e0e7a222e79c09fab0f GIT binary patch literal 40 bcmZQzVqjo^VIUI-7Jx|xhJ{f20Eh+v8V3Rh literal 0 HcmV?d00001 diff --git a/tests/regression/sgemm_gemmini_dma/compile_ampere.sh b/tests/regression/sgemm_gemmini_dma/compile_ampere.sh new file mode 100755 index 00000000..9bcee2df --- /dev/null +++ b/tests/regression/sgemm_gemmini_dma/compile_ampere.sh @@ -0,0 +1,11 @@ +rm -f kernel.radiance.elf +rm -rf binaries +mkdir binaries +for a in args/*; do + cp -f "$a" args.bin + aa=$(basename "$a") + cp -f input.a/"$aa" input.a.bin + cp -f input.b/"$aa" input.b.bin + make > /dev/null || exit 1 + mv kernel.radiance.elf binaries/gemmini_fp16dma"$aa".elf +done diff --git a/tests/regression/sgemm_gemmini_dma/compile_hopper.sh b/tests/regression/sgemm_gemmini_dma/compile_hopper.sh new file mode 100755 index 00000000..7f8ada72 --- /dev/null +++ b/tests/regression/sgemm_gemmini_dma/compile_hopper.sh @@ -0,0 +1,11 @@ +rm -f kernel.radiance.elf +rm -rf binaries +mkdir binaries +for a in args/*; do + cp -f "$a" args.bin + aa=$(basename "$a") + cp -f input.a/"$aa" input.a.bin + cp -f input.b/"$aa" input.b.bin + make > /dev/null || exit 1 + mv kernel.radiance.elf binaries/gemmini_hopper_dma"$aa".elf +done diff --git a/tests/regression/sgemm_gemmini_dma/generate_operands.py b/tests/regression/sgemm_gemmini_dma/generate_operands.py index 414b07cf..21ca9b72 100644 --- a/tests/regression/sgemm_gemmini_dma/generate_operands.py +++ b/tests/regression/sgemm_gemmini_dma/generate_operands.py @@ -15,20 +15,20 @@ def truncated_matrix_multiplication(matrix_a, 
matrix_b, size): result = np.matmul(truncated_a, truncated_b) return result.astype(np.float16) -# Generate the 512x512 matrices -size = 512 -matrix_a = generate_fp16_matrix(size) -matrix_b = generate_fp16_matrix(size) - -# Save the operand matrices to binary files -save_matrix_to_bin("input.a.bin", matrix_a) -save_matrix_to_bin("input.b.bin", matrix_b) - -# Generate and save the reference matrices for 128x128, 256x256, and 512x512 sizes -sizes = [128, 256, 512] +# Generate and save the operand and reference matrices for 128x128, 256x256, 512x512, and 1024x1024 sizes +sizes = [128, 256, 512, 1024] for s in sizes: + np.random.seed(0) + matrix_a = generate_fp16_matrix(s) + matrix_b = generate_fp16_matrix(s) + + # Save the operand matrices to binary files + save_matrix_to_bin("input.a.bin", matrix_a) + save_matrix_to_bin(f"input.a/{s}", matrix_a) + save_matrix_to_bin("input.b.bin", matrix_b) + save_matrix_to_bin(f"input.b/{s}", matrix_b) + ref_matrix = truncated_matrix_multiplication(matrix_a, matrix_b, s) - print(ref_matrix) save_matrix_to_bin(f"ref{s}.bin", ref_matrix) print("All files generated successfully.") diff --git a/tests/regression/sgemm_gemmini_dma/kernel.cpp b/tests/regression/sgemm_gemmini_dma/kernel.cpp index 2e55cb5e..efcb5b52 100644 --- a/tests/regression/sgemm_gemmini_dma/kernel.cpp +++ b/tests/regression/sgemm_gemmini_dma/kernel.cpp @@ -107,7 +107,7 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, for (uint32_t tile_j = 0; tile_j < num_tiles_n; tile_j += 1) { for (uint32_t tile_k = 0; tile_k < num_tiles_k; tile_k += 1) { uint32_t a_hexadecile = (tile_k & 1) << 2; - uint32_t b_hexadecile = a_hexadecile + 8; + uint32_t b_hexadecile = a_hexadecile + 11; gemmini_tile_load_ab(A, B, a_hexadecile, b_hexadecile, tile_i, tile_j, tile_k, dim_m, dim_n, dim_k, TILE_M, TILE_N, TILE_K);