merge kernel changes from kernels-asplos-ae

This commit is contained in:
Richard Yan
2025-01-29 22:11:25 -08:00
parent a61bf257ff
commit 91a82c9f0f
24 changed files with 146 additions and 62 deletions

View File

@@ -1,11 +1,16 @@
rm kernel.radiance.elf #!/bin/sh
rm -rf binaries
mkdir binaries
for a in args/*; do for a in args/*; do
echo "compiling GEMM kernel for Virgo with dim ${a}"
cp -f $a args.bin cp -f $a args.bin
aa=$(basename "$a") aa=$(basename "$a")
cp -f input.a/"$aa" input.a.bin cp ../sgemm_gemmini_dma/input.a.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.a.bin
cp -f input.b/"$aa" input.b.bin cp ../sgemm_gemmini_dma/input.b.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.b.bin
make > /dev/null touch input.c.bin
mv kernel.radiance.elf binaries/gemmini_fp16nodma"$aa".elf
# touch source file to force re-building, as the Makefile does not track
# binary changes
touch kernel.cpp
make CONFIG=gemm.virgo.ampere.nodma.dim${aa}
done done

View File

@@ -1,11 +1,16 @@
rm kernel.radiance.elf #!/bin/sh
rm -rf binaries
mkdir binaries
for a in args/*; do for a in args/*; do
echo "compiling GEMM kernel for Virgo with dim ${a}"
cp -f $a args.bin cp -f $a args.bin
aa=$(basename "$a") aa=$(basename "$a")
cp -f input.a/"$aa" input.a.bin cp ../sgemm_gemmini_dma/input.a.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.a.bin
cp -f input.b/"$aa" input.b.bin cp ../sgemm_gemmini_dma/input.b.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.b.bin
make > /dev/null touch input.c.bin
mv kernel.radiance.elf binaries/gemmini_hopper_nodma"$aa".elf
# touch source file to force re-building, as the Makefile does not track
# binary changes
touch kernel.cpp
make CONFIG=gemm.virgo.hopper.nodma.dim${aa}
done done

View File

@@ -1 +0,0 @@
../sgemm_gemmini_dma/input.a

View File

@@ -1 +0,0 @@
../sgemm_gemmini_dma/input.b

View File

@@ -1,12 +0,0 @@
# Runs make once per argument file in args/ and collects the resulting ELF
# files under binaries/.
#
# For every args/<name>, the files input.a/<name> and input.b/<name> are
# staged as input.a.bin and input.b.bin, make is run, and kernel.radiance.elf
# is moved to binaries/gemmini_fp16dma<name>.elf.

# -f: a missing ELF from an earlier run is not an error.
rm -f kernel.radiance.elf
rm -rf binaries
mkdir binaries || exit 1
touch input.c.bin
for a in args/*; do
  # An unmatched glob is left as the literal pattern; skip it.
  [ -e "$a" ] || continue
  aa=$(basename "$a")
  # Stop at the first failing step so a failed copy never leaves the previous
  # iteration's args/operands in place for the next build.
  cp -f "$a" args.bin || exit 1
  cp -f input.a/"$aa" input.a.bin || exit 1
  cp -f input.b/"$aa" input.b.bin || exit 1
  make > /dev/null || exit 1
  mv kernel.radiance.elf binaries/gemmini_fp16dma"$aa".elf || exit 1
done

View File

@@ -1,12 +0,0 @@
# Runs make once per argument file in args/ and collects the resulting ELF
# files under binaries/.
#
# For every args/<name>, the files input.a/<name> and input.b/<name> are
# staged as input.a.bin and input.b.bin, make is run, and kernel.radiance.elf
# is moved to binaries/gemmini_debug_dma<name>.elf.

# -f: a missing ELF from an earlier run is not an error.
rm -f kernel.radiance.elf
rm -rf binaries
mkdir binaries || exit 1
touch input.c.bin
for a in args/*; do
  # An unmatched glob is left as the literal pattern; skip it.
  [ -e "$a" ] || continue
  aa=$(basename "$a")
  # Stop at the first failing step so a failed copy never leaves the previous
  # iteration's args/operands in place for the next build.
  cp -f "$a" args.bin || exit 1
  cp -f input.a/"$aa" input.a.bin || exit 1
  cp -f input.b/"$aa" input.b.bin || exit 1
  make > /dev/null || exit 1
  mv kernel.radiance.elf binaries/gemmini_debug_dma"$aa".elf || exit 1
done

View File

@@ -1,12 +0,0 @@
# Runs make once per argument file in args/ and collects the resulting ELF
# files under binaries/.
#
# For every args/<name>, the files input.a/<name> and input.b/<name> are
# staged as input.a.bin and input.b.bin, make is run, and kernel.radiance.elf
# is moved to binaries/gemmini_hopper_dma<name>.elf.

# -f: a missing ELF from an earlier run is not an error.
rm -f kernel.radiance.elf
rm -rf binaries
mkdir binaries || exit 1
touch input.c.bin
for a in args/*; do
  # An unmatched glob is left as the literal pattern; skip it.
  [ -e "$a" ] || continue
  aa=$(basename "$a")
  # Stop at the first failing step so a failed copy never leaves the previous
  # iteration's args/operands in place for the next build.
  cp -f "$a" args.bin || exit 1
  cp -f input.a/"$aa" input.a.bin || exit 1
  cp -f input.b/"$aa" input.b.bin || exit 1
  make > /dev/null || exit 1
  mv kernel.radiance.elf binaries/gemmini_hopper_dma"$aa".elf || exit 1
done

View File

@@ -0,0 +1,21 @@
#!/bin/sh
#
# Runs make once per argument file in args/, with
# CONFIG=gemm.virgo.hopper.dim<name>, where <name> is the basename of the
# argument file.
#
# Operands are read from the current directory as
# input.{a,b}.rand01.fp16.m<name>n<name>k<name>.row.bin. If the m256n256k256
# operand A file is missing, generate_operands.py is run first.

# Abort on the first failing command (or unset variable) so that a failed copy
# never leaves the previous iteration's args/operands in place for the build.
set -eu

if [ ! -f input.a.rand01.fp16.m256n256k256.row.bin ]; then
  echo "input binaries not found, generating operands"
  python3 generate_operands.py
fi

for a in args/*; do
  # An unmatched glob is left as the literal pattern; skip it.
  [ -e "$a" ] || continue
  aa=$(basename "$a")
  # Report the dimension itself rather than the args/ path.
  echo "compiling GEMM kernel for Virgo with dim ${aa}"
  cp -f "$a" args.bin
  cp -f "input.a.rand01.fp16.m${aa}n${aa}k${aa}.row.bin" input.a.bin
  cp -f "input.b.rand01.fp16.m${aa}n${aa}k${aa}.row.bin" input.b.bin
  touch input.c.bin
  # touch source file to force re-building, as the Makefile does not track
  # binary changes
  touch kernel.cpp
  make CONFIG="gemm.virgo.hopper.dim${aa}"
done

View File

@@ -0,0 +1,25 @@
#!/bin/sh
#
# This script generates the 8-core-per-cluster version of Virgo GEMM kernels.
# We use the 4-core version for final evaluation; the 8-core kernels should
# behave identically.
#
# make is run once per argument file in args/, with
# CONFIG=gemm.virgo.ampere.dim<name>, where <name> is the basename of the
# argument file. Operands are read from the current directory as
# input.{a,b}.rand01.fp16.m<name>n<name>k<name>.row.bin. If the m256n256k256
# operand A file is missing, generate_operands.py is run first.

# Abort on the first failing command (or unset variable) so that a failed copy
# never leaves the previous iteration's args/operands in place for the build.
set -eu

if [ ! -f input.a.rand01.fp16.m256n256k256.row.bin ]; then
  echo "input binaries not found, generating operands"
  python3 generate_operands.py
fi

for a in args/*; do
  # An unmatched glob is left as the literal pattern; skip it.
  [ -e "$a" ] || continue
  aa=$(basename "$a")
  # Report the dimension itself rather than the args/ path.
  echo "compiling GEMM kernel for Virgo with dim ${aa}"
  cp -f "$a" args.bin
  cp -f "input.a.rand01.fp16.m${aa}n${aa}k${aa}.row.bin" input.a.bin
  cp -f "input.b.rand01.fp16.m${aa}n${aa}k${aa}.row.bin" input.b.bin
  touch input.c.bin
  # touch source file to force re-building, as the Makefile does not track
  # binary changes
  touch kernel.cpp
  make CONFIG="gemm.virgo.ampere.dim${aa}"
done

View File

@@ -24,9 +24,9 @@ for s in sizes:
# Save the operand matrices to binary files # Save the operand matrices to binary files
save_matrix_to_bin("input.a.bin", matrix_a) save_matrix_to_bin("input.a.bin", matrix_a)
save_matrix_to_bin(f"input.a/{s}", matrix_a) save_matrix_to_bin(f"input.a.rand01.fp16.m{s}n{s}k{s}.row.bin", matrix_a)
save_matrix_to_bin("input.b.bin", matrix_b) save_matrix_to_bin("input.b.bin", matrix_b)
save_matrix_to_bin(f"input.b/{s}", matrix_b) save_matrix_to_bin(f"input.b.rand01.fp16.m{s}n{s}k{s}.row.bin", matrix_b)
ref_matrix = truncated_matrix_multiplication(matrix_a, matrix_b, s) ref_matrix = truncated_matrix_multiplication(matrix_a, matrix_b, s)
save_matrix_to_bin(f"ref{s}.bin", ref_matrix) save_matrix_to_bin(f"ref{s}.bin", ref_matrix)

View File

@@ -1 +0,0 @@
args.m256n256k256.bin

View File

@@ -0,0 +1,64 @@
#!/bin/bash
# Architectures and GEMM dimensions iterated over by the build loop at the end
# of this script.
archs=("volta" "ampere" "hopper")
dims=("256" "512" "1024")

# Refuse to start without the toolchain environment; the diagnostic goes to
# stderr so it is not mixed into regular build output.
if [[ -z "${TOOLDIR:-}" ]]; then
  echo "error: \$TOOLDIR not set. Did you run source ci/toolchain_env.sh?" >&2
  exit 1
fi
#######################################
# Point args.bin, input.a.bin and input.b.bin at the files for one build.
# Globals:   none written (all working variables are local)
# Arguments: $1 - GEMM dimension, used for m, n and k in the file names
#            $2 - architecture name; "volta" selects the dma=0 layouts
# Outputs:   the chosen dma flag and the verbose ln output on stdout
# Returns:   does not return if a required file is missing (check_exists
#            exits the script)
#######################################
switch_binaries() {
  local dim="$1"
  local arch="$2"
  local dma=1
  local layout_a layout_b
  local args input_a input_b

  if [[ "$arch" == "volta" ]]; then
    dma=0
  fi
  echo "dma is $dma"

  # The operand layouts differ depending on the dma flag.
  if [[ "$dma" == "1" ]]; then
    layout_a="row.swizzle_fp16"
    layout_b="row"
  else
    layout_a="col.swizzle_fp16"
    layout_b="row.swizzle_fp16"
  fi

  args="args.m${dim}n${dim}k${dim}.bin"
  input_a="input.a.rand01.fp16.m${dim}n${dim}k${dim}.${layout_a}.bin"
  input_b="input.b.rand01.fp16.m${dim}n${dim}k${dim}.${layout_b}.bin"

  # Verify every target first so no symlink is replaced with a dangling one.
  check_exists "$args"
  check_exists "$input_a"
  check_exists "$input_b"

  ln -sf -v "$args" "args.bin"
  ln -sf -v "$input_a" "input.a.bin"
  ln -sf -v "$input_b" "input.b.bin"
}
#######################################
# Abort the script unless the given path is an existing regular file.
# Arguments: $1 - path to check
# Outputs:   an error message on stderr when the file is missing
# Returns:   0 if the file exists; otherwise exits the shell with status 1
#######################################
check_exists() {
  if [[ ! -f "$1" ]]; then
    # Diagnostics go to stderr so they are not mixed into regular output.
    echo "error: looked for file $1 that does not exist." >&2
    exit 1
  fi
}
# Build every architecture/dimension combination. Any failing step aborts the
# script, since continuing would build on the wrong ref or in the wrong
# directory and leave binaries that do not match their label.
for arch in "${archs[@]}"; do
  git checkout "kernels-asplos-ae-$arch" || exit 1

  # re-compile libvortexrt.a
  # FIXME after restructure
  pushd ../../../kernel || exit 1
  make || exit 1
  popd || exit 1

  for dim in "${dims[@]}"; do
    echo "compiling GEMM kernel for $arch with dim $dim"
    switch_binaries "$dim" "$arch"
    # touch source file to force re-building, as the Makefile does not track
    # binary changes
    touch kernel.cpp
    make CONFIG="gemm.tcore.$arch.dim$dim" || exit 1
  done
done

View File

@@ -1 +0,0 @@
input.a.rand01.fp16.m256n256k256.col.swizzle_fp16.bin

View File

@@ -1 +0,0 @@
input.b.rand01.fp16.m256n256k256.row.swizzle_fp16.bin

View File

@@ -91,8 +91,6 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
DEV_SMEM_START_ADDR + DEV_SMEM_START_ADDR +
sizeof(float_type) * 2 * (2 * BM * BK) * threadblock_id_in_cluster); sizeof(float_type) * 2 * (2 * BM * BK) * threadblock_id_in_cluster);
MARK_BEG();
// NOTE: hardcoded // NOTE: hardcoded
constexpr uint32_t quartile = (128 << 10) >> 2; // 128KB / 4 constexpr uint32_t quartile = (128 << 10) >> 2; // 128KB / 4
static_assert((quartile * 4) == SMEM_SIZE, "wrong quartile constant"); static_assert((quartile * 4) == SMEM_SIZE, "wrong quartile constant");
@@ -121,8 +119,6 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
threadblocks_per_cluster, threadblock_id_in_cluster, threadblocks_per_cluster, threadblock_id_in_cluster,
sharedmem_per_threadblock); sharedmem_per_threadblock);
MARK_END();
float *gmem_tmp_d0 = reinterpret_cast<float *>(0xd0000000UL); float *gmem_tmp_d0 = reinterpret_cast<float *>(0xd0000000UL);
float *gmem_tmp_d1 = reinterpret_cast<float *>(0xd1000000UL); float *gmem_tmp_d1 = reinterpret_cast<float *>(0xd1000000UL);
float *gmem_tmp_d2 = reinterpret_cast<float *>(0xd2000000UL); float *gmem_tmp_d2 = reinterpret_cast<float *>(0xd2000000UL);

View File

@@ -136,6 +136,10 @@ static_assert(WMITER * WNITER * TCM * TCN * NUM_WARPS * CORES_PER_CLUSTER ==
#error Unsupported smem size #error Unsupported smem size
#endif #endif
// timing markers
#define MARK_BEG() asm volatile ("slti x0, x1, -1047")
#define MARK_END() asm volatile ("slti x0, x1, -499")
enum class MemLayout { enum class MemLayout {
MN_major, MN_major,
K_major, K_major,
@@ -1220,6 +1224,8 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
for (uint32_t block_k = 0; (block_k * BK) < dim_k; block_k++) { for (uint32_t block_k = 0; (block_k * BK) < dim_k; block_k++) {
asm volatile("loop_k_start_%=:" ::); asm volatile("loop_k_start_%=:" ::);
MARK_BEG();
// producer code: GMEM->SMEM memory movement // producer code: GMEM->SMEM memory movement
// --------------------------------------------------------------------- // ---------------------------------------------------------------------
// //
@@ -1395,6 +1401,8 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
threadblock_barrier(threadblock_id_in_cluster, threadblock_barrier(threadblock_id_in_cluster,
warps_per_threadblock_per_core); warps_per_threadblock_per_core);
MARK_END();
asm volatile("loop_k_end_%=:" ::); asm volatile("loop_k_end_%=:" ::);
} }
@@ -1422,8 +1430,9 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
asm volatile("move_out_end_%=:" ::); asm volatile("move_out_end_%=:" ::);
} }
asm volatile("loop_mn_end_%=:" ::);
} }
asm volatile("loop_mn_end_%=:" ::);
} }
} }