merge kernel changes from kernels-asplos-ae
This commit is contained in:
@@ -1,11 +1,16 @@
|
||||
rm kernel.radiance.elf
|
||||
rm -rf binaries
|
||||
mkdir binaries
|
||||
#!/bin/sh
|
||||
|
||||
for a in args/*; do
|
||||
echo "compiling GEMM kernel for Virgo with dim ${a}"
|
||||
cp -f $a args.bin
|
||||
aa=$(basename "$a")
|
||||
cp -f input.a/"$aa" input.a.bin
|
||||
cp -f input.b/"$aa" input.b.bin
|
||||
make > /dev/null
|
||||
mv kernel.radiance.elf binaries/gemmini_fp16nodma"$aa".elf
|
||||
cp ../sgemm_gemmini_dma/input.a.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.a.bin
|
||||
cp ../sgemm_gemmini_dma/input.b.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.b.bin
|
||||
touch input.c.bin
|
||||
|
||||
# touch source file to force re-building, as the Makefile does not track
|
||||
# binary changes
|
||||
touch kernel.cpp
|
||||
|
||||
make CONFIG=gemm.virgo.ampere.nodma.dim${aa}
|
||||
done
|
||||
|
||||
@@ -1,11 +1,16 @@
|
||||
rm kernel.radiance.elf
|
||||
rm -rf binaries
|
||||
mkdir binaries
|
||||
#!/bin/sh
|
||||
|
||||
for a in args/*; do
|
||||
echo "compiling GEMM kernel for Virgo with dim ${a}"
|
||||
cp -f $a args.bin
|
||||
aa=$(basename "$a")
|
||||
cp -f input.a/"$aa" input.a.bin
|
||||
cp -f input.b/"$aa" input.b.bin
|
||||
make > /dev/null
|
||||
mv kernel.radiance.elf binaries/gemmini_hopper_nodma"$aa".elf
|
||||
cp ../sgemm_gemmini_dma/input.a.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.a.bin
|
||||
cp ../sgemm_gemmini_dma/input.b.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.b.bin
|
||||
touch input.c.bin
|
||||
|
||||
# touch source file to force re-building, as the Makefile does not track
|
||||
# binary changes
|
||||
touch kernel.cpp
|
||||
|
||||
make CONFIG=gemm.virgo.hopper.nodma.dim${aa}
|
||||
done
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
../sgemm_gemmini_dma/input.a
|
||||
@@ -1 +0,0 @@
|
||||
../sgemm_gemmini_dma/input.b
|
||||
@@ -1,12 +0,0 @@
|
||||
rm kernel.radiance.elf
|
||||
rm -rf binaries
|
||||
mkdir binaries
|
||||
touch input.c.bin
|
||||
for a in args/*; do
|
||||
cp -f $a args.bin
|
||||
aa=$(basename "$a")
|
||||
cp -f input.a/"$aa" input.a.bin
|
||||
cp -f input.b/"$aa" input.b.bin
|
||||
make > /dev/null
|
||||
mv kernel.radiance.elf binaries/gemmini_fp16dma"$aa".elf
|
||||
done
|
||||
@@ -1,12 +0,0 @@
|
||||
rm kernel.radiance.elf
|
||||
rm -rf binaries
|
||||
mkdir binaries
|
||||
touch input.c.bin
|
||||
for a in args/*; do
|
||||
cp -f $a args.bin
|
||||
aa=$(basename "$a")
|
||||
cp -f input.a/"$aa" input.a.bin
|
||||
cp -f input.b/"$aa" input.b.bin
|
||||
make > /dev/null
|
||||
mv kernel.radiance.elf binaries/gemmini_debug_dma"$aa".elf
|
||||
done
|
||||
@@ -1,12 +0,0 @@
|
||||
rm kernel.radiance.elf
|
||||
rm -rf binaries
|
||||
mkdir binaries
|
||||
touch input.c.bin
|
||||
for a in args/*; do
|
||||
cp -f $a args.bin
|
||||
aa=$(basename "$a")
|
||||
cp -f input.a/"$aa" input.a.bin
|
||||
cp -f input.b/"$aa" input.b.bin
|
||||
make > /dev/null
|
||||
mv kernel.radiance.elf binaries/gemmini_hopper_dma"$aa".elf
|
||||
done
|
||||
21
kernels/sgemm_gemmini_dma/compile_virgo.sh
Executable file
21
kernels/sgemm_gemmini_dma/compile_virgo.sh
Executable file
@@ -0,0 +1,21 @@
|
||||
#!/bin/sh
#
# Builds the Virgo GEMM kernel with the gemm.virgo.hopper.dim<N> CONFIG once
# for every argument file found in args/. The basename of each args file is
# reused as <N> for all of m, n and k in the operand filenames and in the
# CONFIG name.

# Operands are generated on demand; the 256-dimension A operand serves as the
# marker for whether generate_operands.py has already been run.
if [ ! -f input.a.rand01.fp16.m256n256k256.row.bin ]; then
  echo "input binaries not found, generating operands"
  # Without operands none of the inputs below can be staged, so stop here.
  python3 generate_operands.py || exit 1
fi

for a in args/*; do
  echo "compiling GEMM kernel for Virgo with dim ${a}"
  aa=$(basename "$a")

  # Stage this dimension's inputs under the fixed names args.bin /
  # input.a.bin / input.b.bin. Abort if one is missing: carrying on would
  # build with whatever the previous iteration left behind.
  cp -f "$a" args.bin || exit 1
  cp -f "input.a.rand01.fp16.m${aa}n${aa}k${aa}.row.bin" input.a.bin || exit 1
  cp -f "input.b.rand01.fp16.m${aa}n${aa}k${aa}.row.bin" input.b.bin || exit 1
  # Make sure input.c.bin exists (created empty if absent).
  touch input.c.bin

  # touch source file to force re-building, as the Makefile does not track
  # binary changes
  touch kernel.cpp

  make CONFIG="gemm.virgo.hopper.dim${aa}"
done
|
||||
25
kernels/sgemm_gemmini_dma/compile_virgo_ampere.sh
Executable file
25
kernels/sgemm_gemmini_dma/compile_virgo_ampere.sh
Executable file
@@ -0,0 +1,25 @@
|
||||
#!/bin/sh
#
# This script generates the 8-core-per-cluster version of Virgo GEMM kernels.
# We use the 4-core version for final evaluation; the 8-core kernels should
# behave identically.
#
# One kernel is built with the gemm.virgo.ampere.dim<N> CONFIG for every
# argument file found in args/. The basename of each args file is reused as
# <N> for all of m, n and k in the operand filenames and in the CONFIG name.

# Operands are generated on demand; the 256-dimension A operand serves as the
# marker for whether generate_operands.py has already been run.
if [ ! -f input.a.rand01.fp16.m256n256k256.row.bin ]; then
  echo "input binaries not found, generating operands"
  # Without operands none of the inputs below can be staged, so stop here.
  python3 generate_operands.py || exit 1
fi

for a in args/*; do
  echo "compiling GEMM kernel for Virgo with dim ${a}"
  aa=$(basename "$a")

  # Stage this dimension's inputs under the fixed names args.bin /
  # input.a.bin / input.b.bin. Abort if one is missing: carrying on would
  # build with whatever the previous iteration left behind.
  cp -f "$a" args.bin || exit 1
  cp -f "input.a.rand01.fp16.m${aa}n${aa}k${aa}.row.bin" input.a.bin || exit 1
  cp -f "input.b.rand01.fp16.m${aa}n${aa}k${aa}.row.bin" input.b.bin || exit 1
  # Make sure input.c.bin exists (created empty if absent).
  touch input.c.bin

  # touch source file to force re-building, as the Makefile does not track
  # binary changes
  touch kernel.cpp

  make CONFIG="gemm.virgo.ampere.dim${aa}"
done
|
||||
@@ -24,9 +24,9 @@ for s in sizes:
|
||||
|
||||
# Save the operand matrices to binary files
|
||||
save_matrix_to_bin("input.a.bin", matrix_a)
|
||||
save_matrix_to_bin(f"input.a/{s}", matrix_a)
|
||||
save_matrix_to_bin(f"input.a.rand01.fp16.m{s}n{s}k{s}.row.bin", matrix_a)
|
||||
save_matrix_to_bin("input.b.bin", matrix_b)
|
||||
save_matrix_to_bin(f"input.b/{s}", matrix_b)
|
||||
save_matrix_to_bin(f"input.b.rand01.fp16.m{s}n{s}k{s}.row.bin", matrix_b)
|
||||
|
||||
ref_matrix = truncated_matrix_multiplication(matrix_a, matrix_b, s)
|
||||
save_matrix_to_bin(f"ref{s}.bin", ref_matrix)
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@@ -1 +0,0 @@
|
||||
args.m256n256k256.bin
|
||||
64
kernels/sgemm_tcore/compile_tcore.sh
Executable file
64
kernels/sgemm_tcore/compile_tcore.sh
Executable file
@@ -0,0 +1,64 @@
|
||||
#!/bin/bash
#
# Builds the tcore GEMM kernel for every architecture/dimension pair listed
# below.

# Architectures and GEMM dimensions to iterate over.
archs=("volta" "ampere" "hopper")
dims=("256" "512" "1024")

# Refuse to run unless the toolchain environment has been set up.
[ -n "$TOOLDIR" ] || {
  echo "error: \$TOOLDIR not set. Did you run source ci/toolchain_env.sh?"
  exit 1
}
|
||||
|
||||
# Point args.bin, input.a.bin and input.b.bin at the files for one build.
#
#   $1  GEMM dimension, used for all of m, n and k in the filenames
#   $2  architecture name; "volta" selects the non-DMA operand layouts,
#       every other value selects the DMA layouts
#
# Exits the script (via check_exists) if any of the three target files is
# missing; otherwise replaces the three symlinks in the current directory.
switch_binaries() {
  local dim="$1"
  local arch="$2"
  # Keep helper state local so it does not leak into the caller's scope.
  local dma layout_a layout_b args input_a input_b

  dma=1
  [[ "$arch" == "volta" ]] && dma=0
  echo "dma is $dma"

  # Operand layouts differ between the DMA and non-DMA kernel variants.
  if [ "$dma" == "1" ]; then
    layout_a="row.swizzle_fp16"
    layout_b="row"
  else
    layout_a="col.swizzle_fp16"
    layout_b="row.swizzle_fp16"
  fi

  args="args.m${dim}n${dim}k${dim}.bin"
  input_a="input.a.rand01.fp16.m${dim}n${dim}k${dim}.$layout_a.bin"
  input_b="input.b.rand01.fp16.m${dim}n${dim}k${dim}.$layout_b.bin"

  # Validate every target before touching any symlink.
  check_exists "$args"
  check_exists "$input_a"
  check_exists "$input_b"

  ln -sf -v "$args" "args.bin"
  ln -sf -v "$input_a" "input.a.bin"
  ln -sf -v "$input_b" "input.b.bin"
}
|
||||
|
||||
# Terminate the whole script with status 1 unless $1 names an existing
# regular file; return successfully otherwise.
check_exists() {
  [ -f "$1" ] && return
  echo "error: looked for file $1 that does not exist."
  exit 1
}
|
||||
|
||||
# Build the GEMM kernel for every architecture in archs and, within each,
# for every dimension in dims.
for arch in "${archs[@]}"; do
  # Each architecture is built from its own branch. Abort if the checkout
  # fails, otherwise the previously checked-out sources would be built under
  # this architecture's CONFIG name.
  git checkout "kernels-asplos-ae-$arch" || exit 1

  # re-compile libvortexrt.a
  # FIXME after restructure
  # Abort if the directory cannot be entered: `make` would otherwise run in
  # the current directory and `popd` would fail on an empty stack.
  pushd ../../../kernel || exit 1
  make || exit 1
  popd || exit 1

  for dim in "${dims[@]}"; do
    echo "compiling GEMM kernel for $arch with dim $dim"

    switch_binaries "$dim" "$arch"

    # touch source file to force re-building, as the Makefile does not track
    # binary changes
    touch kernel.cpp

    make CONFIG="gemm.tcore.$arch.dim$dim"
  done
done
|
||||
@@ -1 +0,0 @@
|
||||
input.a.rand01.fp16.m256n256k256.col.swizzle_fp16.bin
|
||||
@@ -1 +0,0 @@
|
||||
input.b.rand01.fp16.m256n256k256.row.swizzle_fp16.bin
|
||||
@@ -91,8 +91,6 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
|
||||
DEV_SMEM_START_ADDR +
|
||||
sizeof(float_type) * 2 * (2 * BM * BK) * threadblock_id_in_cluster);
|
||||
|
||||
MARK_BEG();
|
||||
|
||||
// NOTE: hardcoded
|
||||
constexpr uint32_t quartile = (128 << 10) >> 2; // 128KB / 4
|
||||
static_assert((quartile * 4) == SMEM_SIZE, "wrong quartile constant");
|
||||
@@ -121,8 +119,6 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
|
||||
threadblocks_per_cluster, threadblock_id_in_cluster,
|
||||
sharedmem_per_threadblock);
|
||||
|
||||
MARK_END();
|
||||
|
||||
float *gmem_tmp_d0 = reinterpret_cast<float *>(0xd0000000UL);
|
||||
float *gmem_tmp_d1 = reinterpret_cast<float *>(0xd1000000UL);
|
||||
float *gmem_tmp_d2 = reinterpret_cast<float *>(0xd2000000UL);
|
||||
|
||||
@@ -136,6 +136,10 @@ static_assert(WMITER * WNITER * TCM * TCN * NUM_WARPS * CORES_PER_CLUSTER ==
|
||||
#error Unsupported smem size
|
||||
#endif
|
||||
|
||||
// timing markers
|
||||
#define MARK_BEG() asm volatile ("slti x0, x1, -1047")
|
||||
#define MARK_END() asm volatile ("slti x0, x1, -499")
|
||||
|
||||
enum class MemLayout {
|
||||
MN_major,
|
||||
K_major,
|
||||
@@ -1220,6 +1224,8 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
|
||||
for (uint32_t block_k = 0; (block_k * BK) < dim_k; block_k++) {
|
||||
asm volatile("loop_k_start_%=:" ::);
|
||||
|
||||
MARK_BEG();
|
||||
|
||||
// producer code: GMEM->SMEM memory movement
|
||||
// ---------------------------------------------------------------------
|
||||
//
|
||||
@@ -1395,6 +1401,8 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
|
||||
threadblock_barrier(threadblock_id_in_cluster,
|
||||
warps_per_threadblock_per_core);
|
||||
|
||||
MARK_END();
|
||||
|
||||
asm volatile("loop_k_end_%=:" ::);
|
||||
}
|
||||
|
||||
@@ -1422,8 +1430,9 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
|
||||
|
||||
asm volatile("move_out_end_%=:" ::);
|
||||
}
|
||||
|
||||
asm volatile("loop_mn_end_%=:" ::);
|
||||
}
|
||||
asm volatile("loop_mn_end_%=:" ::);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user