diff --git a/kernels/sgemm_gemmini/compile_ampere.sh b/kernels/sgemm_gemmini/compile_ampere.sh index 43dd96ae..a4713106 100755 --- a/kernels/sgemm_gemmini/compile_ampere.sh +++ b/kernels/sgemm_gemmini/compile_ampere.sh @@ -1,11 +1,16 @@ -rm kernel.radiance.elf -rm -rf binaries -mkdir binaries +#!/bin/sh + for a in args/*; do + echo "compiling GEMM kernel for Virgo with dim ${a}" cp -f $a args.bin aa=$(basename "$a") - cp -f input.a/"$aa" input.a.bin - cp -f input.b/"$aa" input.b.bin - make > /dev/null - mv kernel.radiance.elf binaries/gemmini_fp16nodma"$aa".elf + cp ../sgemm_gemmini_dma/input.a.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.a.bin + cp ../sgemm_gemmini_dma/input.b.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.b.bin + touch input.c.bin + + # touch source file to force re-building, as the Makefile does not track + # binary changes + touch kernel.cpp + + make CONFIG=gemm.virgo.ampere.nodma.dim${aa} done diff --git a/kernels/sgemm_gemmini/compile_hopper.sh b/kernels/sgemm_gemmini/compile_hopper.sh index 7b816d6f..b6a0a80e 100755 --- a/kernels/sgemm_gemmini/compile_hopper.sh +++ b/kernels/sgemm_gemmini/compile_hopper.sh @@ -1,11 +1,16 @@ -rm kernel.radiance.elf -rm -rf binaries -mkdir binaries +#!/bin/sh + for a in args/*; do + echo "compiling GEMM kernel for Virgo with dim ${a}" cp -f $a args.bin aa=$(basename "$a") - cp -f input.a/"$aa" input.a.bin - cp -f input.b/"$aa" input.b.bin - make > /dev/null - mv kernel.radiance.elf binaries/gemmini_hopper_nodma"$aa".elf + cp ../sgemm_gemmini_dma/input.a.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.a.bin + cp ../sgemm_gemmini_dma/input.b.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.b.bin + touch input.c.bin + + # touch source file to force re-building, as the Makefile does not track + # binary changes + touch kernel.cpp + + make CONFIG=gemm.virgo.hopper.nodma.dim${aa} done diff --git a/kernels/sgemm_gemmini/input.a b/kernels/sgemm_gemmini/input.a deleted file mode 120000 index 52e944e9..00000000 --- a/kernels/sgemm_gemmini/input.a +++ /dev/null @@ -1 +0,0 @@ -../sgemm_gemmini_dma/input.a \ No newline at end of file diff --git a/kernels/sgemm_gemmini/input.b b/kernels/sgemm_gemmini/input.b deleted file mode 120000 index d01681eb..00000000 --- a/kernels/sgemm_gemmini/input.b +++ /dev/null @@ -1 +0,0 @@ -../sgemm_gemmini_dma/input.b \ No newline at end of file diff --git a/kernels/sgemm_gemmini_dma/compile_ampere.sh b/kernels/sgemm_gemmini_dma/compile_ampere.sh deleted file mode 100755 index d623d2cf..00000000 --- a/kernels/sgemm_gemmini_dma/compile_ampere.sh +++ /dev/null @@ -1,12 +0,0 @@ -rm kernel.radiance.elf -rm -rf binaries -mkdir binaries -touch input.c.bin -for a in args/*; do - cp -f $a args.bin - aa=$(basename "$a") - cp -f input.a/"$aa" input.a.bin - cp -f input.b/"$aa" input.b.bin - make > /dev/null - mv kernel.radiance.elf binaries/gemmini_fp16dma"$aa".elf -done diff --git a/kernels/sgemm_gemmini_dma/compile_debug.sh b/kernels/sgemm_gemmini_dma/compile_debug.sh deleted file mode 100755 index c245b2bc..00000000 --- a/kernels/sgemm_gemmini_dma/compile_debug.sh +++ /dev/null @@ -1,12 +0,0 @@ -rm kernel.radiance.elf -rm -rf binaries -mkdir binaries -touch input.c.bin -for a in args/*; do - cp -f $a args.bin - aa=$(basename "$a") - cp -f input.a/"$aa" input.a.bin - cp -f input.b/"$aa" input.b.bin - make > /dev/null - mv kernel.radiance.elf binaries/gemmini_debug_dma"$aa".elf -done diff --git a/kernels/sgemm_gemmini_dma/compile_hopper.sh b/kernels/sgemm_gemmini_dma/compile_hopper.sh deleted file mode 100755 index 41b30728..00000000 --- a/kernels/sgemm_gemmini_dma/compile_hopper.sh +++ /dev/null @@ -1,12 +0,0 @@ -rm kernel.radiance.elf -rm -rf binaries -mkdir binaries -touch input.c.bin -for a in args/*; do - cp -f $a args.bin - aa=$(basename "$a") - cp -f input.a/"$aa" input.a.bin - cp -f input.b/"$aa" input.b.bin - make > /dev/null - mv kernel.radiance.elf binaries/gemmini_hopper_dma"$aa".elf -done diff --git a/kernels/sgemm_gemmini_dma/compile_virgo.sh b/kernels/sgemm_gemmini_dma/compile_virgo.sh new file mode 100755 index 00000000..f2deb04c --- /dev/null +++ b/kernels/sgemm_gemmini_dma/compile_virgo.sh @@ -0,0 +1,21 @@ +#!/bin/sh + +if [ ! -f input.a.rand01.fp16.m256n256k256.row.bin ]; then + echo "input binaries not found, generating operands" + python3 generate_operands.py +fi + +for a in args/*; do + echo "compiling GEMM kernel for Virgo with dim ${a}" + cp -f $a args.bin + aa=$(basename "$a") + cp -f input.a.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.a.bin + cp -f input.b.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.b.bin + touch input.c.bin + + # touch source file to force re-building, as the Makefile does not track + # binary changes + touch kernel.cpp + + make CONFIG=gemm.virgo.hopper.dim${aa} +done diff --git a/kernels/sgemm_gemmini_dma/compile_virgo_ampere.sh b/kernels/sgemm_gemmini_dma/compile_virgo_ampere.sh new file mode 100755 index 00000000..d71785f3 --- /dev/null +++ b/kernels/sgemm_gemmini_dma/compile_virgo_ampere.sh @@ -0,0 +1,25 @@ +#!/bin/sh +# +# This script generates the 8-core-per-cluster version of Virgo GEMM kernels. +# We use the 4-core version for final evaluation; the 8-core kernels should +# behave identically. + +if [ ! -f input.a.rand01.fp16.m256n256k256.row.bin ]; then + echo "input binaries not found, generating operands" + python3 generate_operands.py +fi + +for a in args/*; do + echo "compiling GEMM kernel for Virgo with dim ${a}" + cp -f $a args.bin + aa=$(basename "$a") + cp -f input.a.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.a.bin + cp -f input.b.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.b.bin + touch input.c.bin + + # touch source file to force re-building, as the Makefile does not track + # binary changes + touch kernel.cpp + + make CONFIG=gemm.virgo.ampere.dim${aa} +done diff --git a/kernels/sgemm_gemmini_dma/generate_operands.py b/kernels/sgemm_gemmini_dma/generate_operands.py index 21ca9b72..31767b4b 100644 --- a/kernels/sgemm_gemmini_dma/generate_operands.py +++ b/kernels/sgemm_gemmini_dma/generate_operands.py @@ -24,9 +24,9 @@ for s in sizes: # Save the operand matrices to binary files save_matrix_to_bin("input.a.bin", matrix_a) - save_matrix_to_bin(f"input.a/{s}", matrix_a) + save_matrix_to_bin(f"input.a.rand01.fp16.m{s}n{s}k{s}.row.bin", matrix_a) save_matrix_to_bin("input.b.bin", matrix_b) - save_matrix_to_bin(f"input.b/{s}", matrix_b) + save_matrix_to_bin(f"input.b.rand01.fp16.m{s}n{s}k{s}.row.bin", matrix_b) ref_matrix = truncated_matrix_multiplication(matrix_a, matrix_b, s) save_matrix_to_bin(f"ref{s}.bin", ref_matrix) diff --git a/kernels/sgemm_gemmini_dma/input.a/128 b/kernels/sgemm_gemmini_dma/input.a/128 deleted file mode 100644 index 66035f64..00000000 Binary files a/kernels/sgemm_gemmini_dma/input.a/128 and /dev/null differ diff --git a/kernels/sgemm_gemmini_dma/input.b/128 b/kernels/sgemm_gemmini_dma/input.b/128 deleted file mode 100644 index 5b6ff306..00000000 Binary files a/kernels/sgemm_gemmini_dma/input.b/128 and /dev/null differ diff --git a/kernels/sgemm_tcore/args.bin b/kernels/sgemm_tcore/args.bin deleted file mode 120000 index a4214e3e..00000000 --- a/kernels/sgemm_tcore/args.bin +++ /dev/null @@ -1 +0,0 @@ -args.m256n256k256.bin \ No newline at end of file diff --git a/kernels/sgemm_tcore/compile_tcore.sh b/kernels/sgemm_tcore/compile_tcore.sh new file mode 100755 index 00000000..0d3403d3 --- /dev/null +++ b/kernels/sgemm_tcore/compile_tcore.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +archs=("volta" "ampere" "hopper") +dims=("256" "512" "1024") + +if [ -z "$TOOLDIR" ]; then + echo "error: \$TOOLDIR not set. Did you run source ci/toolchain_env.sh?" + exit 1 +fi + +switch_binaries() { + local dim="$1" + local arch="$2" + dma=1 + [[ "$arch" == "volta" ]] && dma=0 + echo "dma is $dma" + if [ "$dma" == "1" ]; then + layout_a="row.swizzle_fp16" + layout_b="row" + else + layout_a="col.swizzle_fp16" + layout_b="row.swizzle_fp16" + fi + + args="args.m$1n$1k$1.bin" + input_a="input.a.rand01.fp16.m$1n$1k$1.$layout_a.bin" + input_b="input.b.rand01.fp16.m$1n$1k$1.$layout_b.bin" + check_exists "$args" + check_exists "$input_a" + check_exists "$input_b" + + ln -sf -v "$args" "args.bin" + ln -sf -v "$input_a" "input.a.bin" + ln -sf -v "$input_b" "input.b.bin" +} + +check_exists() { + if ! [ -f "$1" ]; then + echo "error: looked for file $1 that does not exist." + exit 1 + fi +} + +for arch in "${archs[@]}"; do + git checkout kernels-asplos-ae-$arch + + # re-compile libvortexrt.a + # FIXME after restructure + pushd ../../../kernel + make + popd + + for dim in "${dims[@]}"; do + echo "compiling GEMM kernel for $arch with dim $dim" + + switch_binaries $dim $arch + + # touch source file to force re-building, as the Makefile does not track + # binary changes + touch kernel.cpp + + make CONFIG=gemm.tcore.$arch.dim$dim + done +done diff --git a/kernels/sgemm_tcore/input.a.bin b/kernels/sgemm_tcore/input.a.bin deleted file mode 120000 index 25594851..00000000 --- a/kernels/sgemm_tcore/input.a.bin +++ /dev/null @@ -1 +0,0 @@ -input.a.rand01.fp16.m256n256k256.col.swizzle_fp16.bin \ No newline at end of file diff --git a/kernels/sgemm_gemmini_dma/input.a/1024 b/kernels/sgemm_tcore/input.a.rand01.fp16.m1024n1024k1024.row.swizzle_fp16.bin similarity index 100% rename from kernels/sgemm_gemmini_dma/input.a/1024 rename to kernels/sgemm_tcore/input.a.rand01.fp16.m1024n1024k1024.row.swizzle_fp16.bin diff --git a/kernels/sgemm_gemmini_dma/input.a/256 b/kernels/sgemm_tcore/input.a.rand01.fp16.m256n256k256.row.swizzle_fp16.bin similarity index 100% rename from kernels/sgemm_gemmini_dma/input.a/256 rename to kernels/sgemm_tcore/input.a.rand01.fp16.m256n256k256.row.swizzle_fp16.bin diff --git a/kernels/sgemm_gemmini_dma/input.a/512 b/kernels/sgemm_tcore/input.a.rand01.fp16.m512n512k512.row.swizzle_fp16.bin similarity index 100% rename from kernels/sgemm_gemmini_dma/input.a/512 rename to kernels/sgemm_tcore/input.a.rand01.fp16.m512n512k512.row.swizzle_fp16.bin diff --git a/kernels/sgemm_tcore/input.b.bin b/kernels/sgemm_tcore/input.b.bin deleted file mode 120000 index 69136c11..00000000 --- a/kernels/sgemm_tcore/input.b.bin +++ /dev/null @@ -1 +0,0 @@ -input.b.rand01.fp16.m256n256k256.row.swizzle_fp16.bin \ No newline at end of file diff --git a/kernels/sgemm_gemmini_dma/input.b/1024 b/kernels/sgemm_tcore/input.b.rand01.fp16.m1024n1024k1024.row.bin similarity index 100% rename from kernels/sgemm_gemmini_dma/input.b/1024 rename to kernels/sgemm_tcore/input.b.rand01.fp16.m1024n1024k1024.row.bin diff --git a/kernels/sgemm_gemmini_dma/input.b/256 b/kernels/sgemm_tcore/input.b.rand01.fp16.m256n256k256.row.bin similarity index 100% rename from kernels/sgemm_gemmini_dma/input.b/256 rename to kernels/sgemm_tcore/input.b.rand01.fp16.m256n256k256.row.bin diff --git a/kernels/sgemm_gemmini_dma/input.b/512 b/kernels/sgemm_tcore/input.b.rand01.fp16.m512n512k512.row.bin similarity index 100% rename from kernels/sgemm_gemmini_dma/input.b/512 rename to kernels/sgemm_tcore/input.b.rand01.fp16.m512n512k512.row.bin diff --git a/kernels/sgemm_tcore/kernel.cpp b/kernels/sgemm_tcore/kernel.cpp index d3bda941..38830b86 100644 --- a/kernels/sgemm_tcore/kernel.cpp +++ b/kernels/sgemm_tcore/kernel.cpp @@ -91,8 +91,6 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) { DEV_SMEM_START_ADDR + sizeof(float_type) * 2 * (2 * BM * BK) * threadblock_id_in_cluster); - MARK_BEG(); - // NOTE: hardcoded constexpr uint32_t quartile = (128 << 10) >> 2; // 128KB / 4 static_assert((quartile * 4) == SMEM_SIZE, "wrong quartile constant"); @@ -121,8 +119,6 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) { threadblocks_per_cluster, threadblock_id_in_cluster, sharedmem_per_threadblock); - MARK_END(); - float *gmem_tmp_d0 = reinterpret_cast(0xd0000000UL); float *gmem_tmp_d1 = reinterpret_cast(0xd1000000UL); float *gmem_tmp_d2 = reinterpret_cast(0xd2000000UL); diff --git a/kernels/sgemm_tcore/sgemm_impl.hpp b/kernels/sgemm_tcore/sgemm_impl.hpp index 76d3755d..10829db2 100644 --- a/kernels/sgemm_tcore/sgemm_impl.hpp +++ b/kernels/sgemm_tcore/sgemm_impl.hpp @@ -136,6 +136,10 @@ static_assert(WMITER * WNITER * TCM * TCN * NUM_WARPS * CORES_PER_CLUSTER == #error Unsupported smem size #endif +// timing markers +#define MARK_BEG() asm volatile ("slti x0, x1, -1047") +#define MARK_END() asm volatile ("slti x0, x1, -499") + enum class MemLayout { MN_major, K_major, @@ -1220,6 +1224,8 @@ inline void thread_block_gemm(const T *A, const T *B, float *C, for (uint32_t block_k = 0; (block_k * BK) < dim_k; block_k++) { asm volatile("loop_k_start_%=:" ::); + MARK_BEG(); + // producer code: GMEM->SMEM memory movement // --------------------------------------------------------------------- // @@ -1395,6 +1401,8 @@ inline void thread_block_gemm(const T *A, const T *B, float *C, threadblock_barrier(threadblock_id_in_cluster, warps_per_threadblock_per_core); + MARK_END(); + asm volatile("loop_k_end_%=:" ::); } @@ -1422,8 +1430,9 @@ inline void thread_block_gemm(const T *A, const T *B, float *C, asm volatile("move_out_end_%=:" ::); } + + asm volatile("loop_mn_end_%=:" ::); } - asm volatile("loop_mn_end_%=:" ::); } }