merge kernel changes from kernels-asplos-ae
This commit is contained in:
@@ -1,11 +1,16 @@
|
|||||||
rm kernel.radiance.elf
|
#!/bin/sh
|
||||||
rm -rf binaries
|
|
||||||
mkdir binaries
|
|
||||||
for a in args/*; do
|
for a in args/*; do
|
||||||
|
echo "compiling GEMM kernel for Virgo with dim ${a}"
|
||||||
cp -f $a args.bin
|
cp -f $a args.bin
|
||||||
aa=$(basename "$a")
|
aa=$(basename "$a")
|
||||||
cp -f input.a/"$aa" input.a.bin
|
cp ../sgemm_gemmini_dma/input.a.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.a.bin
|
||||||
cp -f input.b/"$aa" input.b.bin
|
cp ../sgemm_gemmini_dma/input.b.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.b.bin
|
||||||
make > /dev/null
|
touch input.c.bin
|
||||||
mv kernel.radiance.elf binaries/gemmini_fp16nodma"$aa".elf
|
|
||||||
|
# touch source file to force re-building, as the Makefile does not track
|
||||||
|
# binary changes
|
||||||
|
touch kernel.cpp
|
||||||
|
|
||||||
|
make CONFIG=gemm.virgo.ampere.nodma.dim${aa}
|
||||||
done
|
done
|
||||||
|
|||||||
@@ -1,11 +1,16 @@
|
|||||||
rm kernel.radiance.elf
|
#!/bin/sh
|
||||||
rm -rf binaries
|
|
||||||
mkdir binaries
|
|
||||||
for a in args/*; do
|
for a in args/*; do
|
||||||
|
echo "compiling GEMM kernel for Virgo with dim ${a}"
|
||||||
cp -f $a args.bin
|
cp -f $a args.bin
|
||||||
aa=$(basename "$a")
|
aa=$(basename "$a")
|
||||||
cp -f input.a/"$aa" input.a.bin
|
cp ../sgemm_gemmini_dma/input.a.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.a.bin
|
||||||
cp -f input.b/"$aa" input.b.bin
|
cp ../sgemm_gemmini_dma/input.b.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.b.bin
|
||||||
make > /dev/null
|
touch input.c.bin
|
||||||
mv kernel.radiance.elf binaries/gemmini_hopper_nodma"$aa".elf
|
|
||||||
|
# touch source file to force re-building, as the Makefile does not track
|
||||||
|
# binary changes
|
||||||
|
touch kernel.cpp
|
||||||
|
|
||||||
|
make CONFIG=gemm.virgo.hopper.nodma.dim${aa}
|
||||||
done
|
done
|
||||||
|
|||||||
@@ -1 +0,0 @@
|
|||||||
../sgemm_gemmini_dma/input.a
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
../sgemm_gemmini_dma/input.b
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
rm kernel.radiance.elf
|
|
||||||
rm -rf binaries
|
|
||||||
mkdir binaries
|
|
||||||
touch input.c.bin
|
|
||||||
for a in args/*; do
|
|
||||||
cp -f $a args.bin
|
|
||||||
aa=$(basename "$a")
|
|
||||||
cp -f input.a/"$aa" input.a.bin
|
|
||||||
cp -f input.b/"$aa" input.b.bin
|
|
||||||
make > /dev/null
|
|
||||||
mv kernel.radiance.elf binaries/gemmini_fp16dma"$aa".elf
|
|
||||||
done
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
rm kernel.radiance.elf
|
|
||||||
rm -rf binaries
|
|
||||||
mkdir binaries
|
|
||||||
touch input.c.bin
|
|
||||||
for a in args/*; do
|
|
||||||
cp -f $a args.bin
|
|
||||||
aa=$(basename "$a")
|
|
||||||
cp -f input.a/"$aa" input.a.bin
|
|
||||||
cp -f input.b/"$aa" input.b.bin
|
|
||||||
make > /dev/null
|
|
||||||
mv kernel.radiance.elf binaries/gemmini_debug_dma"$aa".elf
|
|
||||||
done
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
rm kernel.radiance.elf
|
|
||||||
rm -rf binaries
|
|
||||||
mkdir binaries
|
|
||||||
touch input.c.bin
|
|
||||||
for a in args/*; do
|
|
||||||
cp -f $a args.bin
|
|
||||||
aa=$(basename "$a")
|
|
||||||
cp -f input.a/"$aa" input.a.bin
|
|
||||||
cp -f input.b/"$aa" input.b.bin
|
|
||||||
make > /dev/null
|
|
||||||
mv kernel.radiance.elf binaries/gemmini_hopper_dma"$aa".elf
|
|
||||||
done
|
|
||||||
21
kernels/sgemm_gemmini_dma/compile_virgo.sh
Executable file
21
kernels/sgemm_gemmini_dma/compile_virgo.sh
Executable file
@@ -0,0 +1,21 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
if [ ! -f input.a.rand01.fp16.m256n256k256.row.bin ]; then
|
||||||
|
echo "input binaries not found, generating operands"
|
||||||
|
python3 generate_operands.py
|
||||||
|
fi
|
||||||
|
|
||||||
|
for a in args/*; do
|
||||||
|
echo "compiling GEMM kernel for Virgo with dim ${a}"
|
||||||
|
cp -f $a args.bin
|
||||||
|
aa=$(basename "$a")
|
||||||
|
cp -f input.a.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.a.bin
|
||||||
|
cp -f input.b.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.b.bin
|
||||||
|
touch input.c.bin
|
||||||
|
|
||||||
|
# touch source file to force re-building, as the Makefile does not track
|
||||||
|
# binary changes
|
||||||
|
touch kernel.cpp
|
||||||
|
|
||||||
|
make CONFIG=gemm.virgo.hopper.dim${aa}
|
||||||
|
done
|
||||||
25
kernels/sgemm_gemmini_dma/compile_virgo_ampere.sh
Executable file
25
kernels/sgemm_gemmini_dma/compile_virgo_ampere.sh
Executable file
@@ -0,0 +1,25 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
#
|
||||||
|
# This script generates the 8-core-per-cluster version of Virgo GEMM kernels.
|
||||||
|
# We use the 4-core version for final evaluation; the 8-core kernels should
|
||||||
|
# behave identically.
|
||||||
|
|
||||||
|
if [ ! -f input.a.rand01.fp16.m256n256k256.row.bin ]; then
|
||||||
|
echo "input binaries not found, generating operands"
|
||||||
|
python3 generate_operands.py
|
||||||
|
fi
|
||||||
|
|
||||||
|
for a in args/*; do
|
||||||
|
echo "compiling GEMM kernel for Virgo with dim ${a}"
|
||||||
|
cp -f $a args.bin
|
||||||
|
aa=$(basename "$a")
|
||||||
|
cp -f input.a.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.a.bin
|
||||||
|
cp -f input.b.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.b.bin
|
||||||
|
touch input.c.bin
|
||||||
|
|
||||||
|
# touch source file to force re-building, as the Makefile does not track
|
||||||
|
# binary changes
|
||||||
|
touch kernel.cpp
|
||||||
|
|
||||||
|
make CONFIG=gemm.virgo.ampere.dim${aa}
|
||||||
|
done
|
||||||
@@ -24,9 +24,9 @@ for s in sizes:
|
|||||||
|
|
||||||
# Save the operand matrices to binary files
|
# Save the operand matrices to binary files
|
||||||
save_matrix_to_bin("input.a.bin", matrix_a)
|
save_matrix_to_bin("input.a.bin", matrix_a)
|
||||||
save_matrix_to_bin(f"input.a/{s}", matrix_a)
|
save_matrix_to_bin(f"input.a.rand01.fp16.m{s}n{s}k{s}.row.bin", matrix_a)
|
||||||
save_matrix_to_bin("input.b.bin", matrix_b)
|
save_matrix_to_bin("input.b.bin", matrix_b)
|
||||||
save_matrix_to_bin(f"input.b/{s}", matrix_b)
|
save_matrix_to_bin(f"input.b.rand01.fp16.m{s}n{s}k{s}.row.bin", matrix_b)
|
||||||
|
|
||||||
ref_matrix = truncated_matrix_multiplication(matrix_a, matrix_b, s)
|
ref_matrix = truncated_matrix_multiplication(matrix_a, matrix_b, s)
|
||||||
save_matrix_to_bin(f"ref{s}.bin", ref_matrix)
|
save_matrix_to_bin(f"ref{s}.bin", ref_matrix)
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
@@ -1 +0,0 @@
|
|||||||
args.m256n256k256.bin
|
|
||||||
64
kernels/sgemm_tcore/compile_tcore.sh
Executable file
64
kernels/sgemm_tcore/compile_tcore.sh
Executable file
@@ -0,0 +1,64 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
archs=("volta" "ampere" "hopper")
|
||||||
|
dims=("256" "512" "1024")
|
||||||
|
|
||||||
|
if [ -z "$TOOLDIR" ]; then
|
||||||
|
echo "error: \$TOOLDIR not set. Did you run source ci/toolchain_env.sh?"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
switch_binaries() {
|
||||||
|
local dim="$1"
|
||||||
|
local arch="$2"
|
||||||
|
dma=1
|
||||||
|
[[ "$arch" == "volta" ]] && dma=0
|
||||||
|
echo "dma is $dma"
|
||||||
|
if [ "$dma" == "1" ]; then
|
||||||
|
layout_a="row.swizzle_fp16"
|
||||||
|
layout_b="row"
|
||||||
|
else
|
||||||
|
layout_a="col.swizzle_fp16"
|
||||||
|
layout_b="row.swizzle_fp16"
|
||||||
|
fi
|
||||||
|
|
||||||
|
args="args.m$1n$1k$1.bin"
|
||||||
|
input_a="input.a.rand01.fp16.m$1n$1k$1.$layout_a.bin"
|
||||||
|
input_b="input.b.rand01.fp16.m$1n$1k$1.$layout_b.bin"
|
||||||
|
check_exists "$args"
|
||||||
|
check_exists "$input_a"
|
||||||
|
check_exists "$input_b"
|
||||||
|
|
||||||
|
ln -sf -v "$args" "args.bin"
|
||||||
|
ln -sf -v "$input_a" "input.a.bin"
|
||||||
|
ln -sf -v "$input_b" "input.b.bin"
|
||||||
|
}
|
||||||
|
|
||||||
|
check_exists() {
|
||||||
|
if ! [ -f "$1" ]; then
|
||||||
|
echo "error: looked for file $1 that does not exist."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
for arch in "${archs[@]}"; do
|
||||||
|
git checkout kernels-asplos-ae-$arch
|
||||||
|
|
||||||
|
# re-compile libvortexrt.a
|
||||||
|
# FIXME after restructure
|
||||||
|
pushd ../../../kernel
|
||||||
|
make
|
||||||
|
popd
|
||||||
|
|
||||||
|
for dim in "${dims[@]}"; do
|
||||||
|
echo "compiling GEMM kernel for $arch with dim $dim"
|
||||||
|
|
||||||
|
switch_binaries $dim $arch
|
||||||
|
|
||||||
|
# touch source file to force re-building, as the Makefile does not track
|
||||||
|
# binary changes
|
||||||
|
touch kernel.cpp
|
||||||
|
|
||||||
|
make CONFIG=gemm.tcore.$arch.dim$dim
|
||||||
|
done
|
||||||
|
done
|
||||||
@@ -1 +0,0 @@
|
|||||||
input.a.rand01.fp16.m256n256k256.col.swizzle_fp16.bin
|
|
||||||
@@ -1 +0,0 @@
|
|||||||
input.b.rand01.fp16.m256n256k256.row.swizzle_fp16.bin
|
|
||||||
@@ -91,8 +91,6 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
|
|||||||
DEV_SMEM_START_ADDR +
|
DEV_SMEM_START_ADDR +
|
||||||
sizeof(float_type) * 2 * (2 * BM * BK) * threadblock_id_in_cluster);
|
sizeof(float_type) * 2 * (2 * BM * BK) * threadblock_id_in_cluster);
|
||||||
|
|
||||||
MARK_BEG();
|
|
||||||
|
|
||||||
// NOTE: hardcoded
|
// NOTE: hardcoded
|
||||||
constexpr uint32_t quartile = (128 << 10) >> 2; // 128KB / 4
|
constexpr uint32_t quartile = (128 << 10) >> 2; // 128KB / 4
|
||||||
static_assert((quartile * 4) == SMEM_SIZE, "wrong quartile constant");
|
static_assert((quartile * 4) == SMEM_SIZE, "wrong quartile constant");
|
||||||
@@ -121,8 +119,6 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
|
|||||||
threadblocks_per_cluster, threadblock_id_in_cluster,
|
threadblocks_per_cluster, threadblock_id_in_cluster,
|
||||||
sharedmem_per_threadblock);
|
sharedmem_per_threadblock);
|
||||||
|
|
||||||
MARK_END();
|
|
||||||
|
|
||||||
float *gmem_tmp_d0 = reinterpret_cast<float *>(0xd0000000UL);
|
float *gmem_tmp_d0 = reinterpret_cast<float *>(0xd0000000UL);
|
||||||
float *gmem_tmp_d1 = reinterpret_cast<float *>(0xd1000000UL);
|
float *gmem_tmp_d1 = reinterpret_cast<float *>(0xd1000000UL);
|
||||||
float *gmem_tmp_d2 = reinterpret_cast<float *>(0xd2000000UL);
|
float *gmem_tmp_d2 = reinterpret_cast<float *>(0xd2000000UL);
|
||||||
|
|||||||
@@ -136,6 +136,10 @@ static_assert(WMITER * WNITER * TCM * TCN * NUM_WARPS * CORES_PER_CLUSTER ==
|
|||||||
#error Unsupported smem size
|
#error Unsupported smem size
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// timing markers
|
||||||
|
#define MARK_BEG() asm volatile ("slti x0, x1, -1047")
|
||||||
|
#define MARK_END() asm volatile ("slti x0, x1, -499")
|
||||||
|
|
||||||
enum class MemLayout {
|
enum class MemLayout {
|
||||||
MN_major,
|
MN_major,
|
||||||
K_major,
|
K_major,
|
||||||
@@ -1220,6 +1224,8 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
|
|||||||
for (uint32_t block_k = 0; (block_k * BK) < dim_k; block_k++) {
|
for (uint32_t block_k = 0; (block_k * BK) < dim_k; block_k++) {
|
||||||
asm volatile("loop_k_start_%=:" ::);
|
asm volatile("loop_k_start_%=:" ::);
|
||||||
|
|
||||||
|
MARK_BEG();
|
||||||
|
|
||||||
// producer code: GMEM->SMEM memory movement
|
// producer code: GMEM->SMEM memory movement
|
||||||
// ---------------------------------------------------------------------
|
// ---------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
@@ -1395,6 +1401,8 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
|
|||||||
threadblock_barrier(threadblock_id_in_cluster,
|
threadblock_barrier(threadblock_id_in_cluster,
|
||||||
warps_per_threadblock_per_core);
|
warps_per_threadblock_per_core);
|
||||||
|
|
||||||
|
MARK_END();
|
||||||
|
|
||||||
asm volatile("loop_k_end_%=:" ::);
|
asm volatile("loop_k_end_%=:" ::);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1422,8 +1430,9 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
|
|||||||
|
|
||||||
asm volatile("move_out_end_%=:" ::);
|
asm volatile("move_out_end_%=:" ::);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
asm volatile("loop_mn_end_%=:" ::);
|
||||||
}
|
}
|
||||||
asm volatile("loop_mn_end_%=:" ::);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user