merge kernel changes from kernels-asplos-ae

2025-01-29 22:11:25 -08:00
parent a61bf257ff
commit 91a82c9f0f
24 changed files with 146 additions and 62 deletions
--- a/kernels/sgemm_gemmini/compile_ampere.sh
+++ b/kernels/sgemm_gemmini/compile_ampere.sh
@@ -1,11 +1,16 @@
-rm kernel.radiance.elf
+#!/bin/sh
-rm -rf binaries
+
 mkdir binaries
 for a in args/*; do
    echo "compiling GEMM kernel for Virgo with dim ${a}"
    cp -f $a args.bin
    aa=$(basename "$a")
-    cp -f input.a/"$aa" input.a.bin
+    cp ../sgemm_gemmini_dma/input.a.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.a.bin
-    cp -f input.b/"$aa" input.b.bin
+    cp ../sgemm_gemmini_dma/input.b.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.b.bin
-    make > /dev/null
+    touch input.c.bin
-    mv kernel.radiance.elf binaries/gemmini_fp16nodma"$aa".elf
+
    # touch source file to force re-building, as the Makefile does not track
    # binary changes
    touch kernel.cpp
    make CONFIG=gemm.virgo.ampere.nodma.dim${aa}
 done
--- a/kernels/sgemm_gemmini/compile_hopper.sh
+++ b/kernels/sgemm_gemmini/compile_hopper.sh
@@ -1,11 +1,16 @@
-rm kernel.radiance.elf
+#!/bin/sh
-rm -rf binaries
+
 mkdir binaries
 for a in args/*; do
    echo "compiling GEMM kernel for Virgo with dim ${a}"
    cp -f $a args.bin
    aa=$(basename "$a")
-    cp -f input.a/"$aa" input.a.bin
+    cp ../sgemm_gemmini_dma/input.a.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.a.bin
-    cp -f input.b/"$aa" input.b.bin
+    cp ../sgemm_gemmini_dma/input.b.rand01.fp16.m${aa}n${aa}k${aa}.row.bin input.b.bin
-    make > /dev/null
+    touch input.c.bin
-    mv kernel.radiance.elf binaries/gemmini_hopper_nodma"$aa".elf
+
    # touch source file to force re-building, as the Makefile does not track
    # binary changes
    touch kernel.cpp
    make CONFIG=gemm.virgo.hopper.nodma.dim${aa}
 done
--- a/kernels/sgemm_gemmini/input.a
+++ b/kernels/sgemm_gemmini/input.a
@@ -1 +0,0 @@
 ../sgemm_gemmini_dma/input.a
--- a/kernels/sgemm_gemmini/input.b
+++ b/kernels/sgemm_gemmini/input.b
@@ -1 +0,0 @@
 ../sgemm_gemmini_dma/input.b
--- a/kernels/sgemm_gemmini_dma/compile_ampere.sh
+++ b/kernels/sgemm_gemmini_dma/compile_ampere.sh
@@ -1,12 +0,0 @@
 rm kernel.radiance.elf
 rm -rf binaries
 mkdir binaries
 touch input.c.bin
 for a in args/*; do
    cp -f $a args.bin
    aa=$(basename "$a")
    cp -f input.a/"$aa" input.a.bin
    cp -f input.b/"$aa" input.b.bin
    make > /dev/null
    mv kernel.radiance.elf binaries/gemmini_fp16dma"$aa".elf
 done
--- a/kernels/sgemm_gemmini_dma/compile_debug.sh
+++ b/kernels/sgemm_gemmini_dma/compile_debug.sh
@@ -1,12 +0,0 @@
 rm kernel.radiance.elf
 rm -rf binaries
 mkdir binaries
 touch input.c.bin
 for a in args/*; do
    cp -f $a args.bin
    aa=$(basename "$a")
    cp -f input.a/"$aa" input.a.bin
    cp -f input.b/"$aa" input.b.bin
    make > /dev/null
    mv kernel.radiance.elf binaries/gemmini_debug_dma"$aa".elf
 done
--- a/kernels/sgemm_gemmini_dma/compile_hopper.sh
+++ b/kernels/sgemm_gemmini_dma/compile_hopper.sh
@@ -1,12 +0,0 @@
 rm kernel.radiance.elf
 rm -rf binaries
 mkdir binaries
 touch input.c.bin
 for a in args/*; do
    cp -f $a args.bin
    aa=$(basename "$a")
    cp -f input.a/"$aa" input.a.bin
    cp -f input.b/"$aa" input.b.bin
    make > /dev/null
    mv kernel.radiance.elf binaries/gemmini_hopper_dma"$aa".elf
 done
--- a/kernels/sgemm_gemmini_dma/compile_virgo.sh
+++ b/kernels/sgemm_gemmini_dma/compile_virgo.sh
@@ -0,0 +1,21 @@
 #!/bin/sh
 #
 # Builds one GEMM kernel per entry found in args/, passing
 # CONFIG=gemm.virgo.hopper.dim<entry> to make for each of them.
 # All paths are relative, so run this from the directory that holds args/,
 # generate_operands.py, kernel.cpp and the Makefile.
 #
 # The 256-dim A operand doubles as the "operands already generated" marker.
 if [ ! -f input.a.rand01.fp16.m256n256k256.row.bin ]; then
    echo "input binaries not found, generating operands"
    python3 generate_operands.py || exit 1
 fi
 for a in args/*; do
    # an unmatched glob stays as the literal pattern; there is nothing to build
    [ -e "$a" ] || continue
    echo "compiling GEMM kernel for Virgo with dim ${a}"
    # Abort when an input cannot be staged: otherwise make would pick up the
    # args/operands left over from the previous iteration and still produce a
    # build labelled with this dimension.
    cp -f "$a" args.bin || exit 1
    aa=$(basename "$a")
    cp -f "input.a.rand01.fp16.m${aa}n${aa}k${aa}.row.bin" input.a.bin || exit 1
    cp -f "input.b.rand01.fp16.m${aa}n${aa}k${aa}.row.bin" input.b.bin || exit 1
    touch input.c.bin
    # touch source file to force re-building, as the Makefile does not track
    # binary changes
    touch kernel.cpp
    make CONFIG="gemm.virgo.hopper.dim${aa}"
 done
--- a/kernels/sgemm_gemmini_dma/compile_virgo_ampere.sh
+++ b/kernels/sgemm_gemmini_dma/compile_virgo_ampere.sh
@@ -0,0 +1,25 @@
 #!/bin/sh
 #
 # This script generates the 8-core-per-cluster version of Virgo GEMM kernels.
 # We use the 4-core version for final evaluation; the 8-core kernels should
 # behave identically.
 #
 # One kernel is built per entry found in args/, passing
 # CONFIG=gemm.virgo.ampere.dim<entry> to make for each of them.
 # All paths are relative, so run this from the directory that holds args/,
 # generate_operands.py, kernel.cpp and the Makefile.
 #
 # The 256-dim A operand doubles as the "operands already generated" marker.
 if [ ! -f input.a.rand01.fp16.m256n256k256.row.bin ]; then
    echo "input binaries not found, generating operands"
    python3 generate_operands.py || exit 1
 fi
 for a in args/*; do
    # an unmatched glob stays as the literal pattern; there is nothing to build
    [ -e "$a" ] || continue
    echo "compiling GEMM kernel for Virgo with dim ${a}"
    # Abort when an input cannot be staged: otherwise make would pick up the
    # args/operands left over from the previous iteration and still produce a
    # build labelled with this dimension.
    cp -f "$a" args.bin || exit 1
    aa=$(basename "$a")
    cp -f "input.a.rand01.fp16.m${aa}n${aa}k${aa}.row.bin" input.a.bin || exit 1
    cp -f "input.b.rand01.fp16.m${aa}n${aa}k${aa}.row.bin" input.b.bin || exit 1
    touch input.c.bin
    # touch source file to force re-building, as the Makefile does not track
    # binary changes
    touch kernel.cpp
    make CONFIG="gemm.virgo.ampere.dim${aa}"
 done
--- a/kernels/sgemm_gemmini_dma/generate_operands.py
+++ b/kernels/sgemm_gemmini_dma/generate_operands.py
@@ -24,9 +24,9 @@ for s in sizes:
    # Save the operand matrices to binary files
    save_matrix_to_bin("input.a.bin", matrix_a)
-    save_matrix_to_bin(f"input.a/{s}", matrix_a)
+    save_matrix_to_bin(f"input.a.rand01.fp16.m{s}n{s}k{s}.row.bin", matrix_a)
    save_matrix_to_bin("input.b.bin", matrix_b)
-    save_matrix_to_bin(f"input.b/{s}", matrix_b)
+    save_matrix_to_bin(f"input.b.rand01.fp16.m{s}n{s}k{s}.row.bin", matrix_b)
    ref_matrix = truncated_matrix_multiplication(matrix_a, matrix_b, s)
    save_matrix_to_bin(f"ref{s}.bin", ref_matrix)
--- a/kernels/sgemm_gemmini_dma/input.a/128
+++ b/kernels/sgemm_gemmini_dma/input.a/128
--- a/kernels/sgemm_gemmini_dma/input.b/128
+++ b/kernels/sgemm_gemmini_dma/input.b/128
--- a/kernels/sgemm_tcore/args.bin
+++ b/kernels/sgemm_tcore/args.bin
@@ -1 +0,0 @@
 args.m256n256k256.bin
--- a/kernels/sgemm_tcore/compile_tcore.sh
+++ b/kernels/sgemm_tcore/compile_tcore.sh
@@ -0,0 +1,64 @@
 #!/bin/bash
 # Architectures and GEMM dimensions to build; every (arch, dim) pair is
 # compiled by the loops further down in this script.
 archs=("volta" "ampere" "hopper")
 dims=("256" "512" "1024")
 # Refuse to start without the toolchain environment.  TOOLDIR is not read
 # anywhere else in this script -- presumably the Makefiles consume it
 # (NOTE(review): confirm).
 if [[ -z "${TOOLDIR:-}" ]]; then
    # diagnostics belong on stderr so they survive stdout redirection
    echo "error: \$TOOLDIR not set.  Did you run source ci/toolchain_env.sh?" >&2
    exit 1
 fi
 # Re-point args.bin, input.a.bin and input.b.bin (symlinks in the current
 # directory) at the files belonging to one GEMM configuration.
 # Arguments: $1 - dimension, used for m, n and k in the file names
 #            $2 - architecture name; "volta" turns the dma flag off, which
 #                 selects a different pair of operand layouts
 # Outputs:   the chosen dma flag and ln's verbose report, on stdout
 # Exits:     through check_exists when any of the three targets is missing
 switch_binaries() {
    local dim="$1"
    local arch="$2"
    # every scratch variable is local so a call cannot clobber a same-named
    # variable of the caller
    local dma layout_a layout_b args input_a input_b
    dma=1
    [[ "$arch" == "volta" ]] && dma=0
    echo "dma is $dma"
    if [[ "$dma" == "1" ]]; then
        layout_a="row.swizzle_fp16"
        layout_b="row"
    else
        layout_a="col.swizzle_fp16"
        layout_b="row.swizzle_fp16"
    fi
    args="args.m${dim}n${dim}k${dim}.bin"
    input_a="input.a.rand01.fp16.m${dim}n${dim}k${dim}.${layout_a}.bin"
    input_b="input.b.rand01.fp16.m${dim}n${dim}k${dim}.${layout_b}.bin"
    # verify all three targets before touching any link
    check_exists "$args"
    check_exists "$input_a"
    check_exists "$input_b"
    ln -sf -v "$args" "args.bin"
    ln -sf -v "$input_a" "input.a.bin"
    ln -sf -v "$input_b" "input.b.bin"
 }
 # Terminate the script unless the given path is an existing regular file.
 # Arguments: $1 - path to verify
 # Outputs:   an error message on stderr when the file is missing
 # Exits:     status 1 on a missing file; this uses exit, not return, so the
 #            calling shell ends as well
 check_exists() {
    if ! [[ -f "$1" ]]; then
        echo "error: looked for file $1 that does not exist." >&2
        exit 1
    fi
 }
 # Build every (arch, dim) combination.  For each architecture the matching
 # kernels-asplos-ae-<arch> branch is checked out and ../../../kernel is
 # rebuilt before the per-dimension kernels are compiled.
 for arch in "${archs[@]}"; do
    # Stop on a failed checkout: carrying on would compile this architecture's
    # kernels from whatever branch happens to be checked out.
    if ! git checkout "kernels-asplos-ae-$arch"; then
        echo "error: could not check out kernels-asplos-ae-$arch" >&2
        exit 1
    fi
    # re-compile libvortexrt.a
    # FIXME after restructure
    # A failed pushd would run make in the wrong directory, so bail out.
    pushd ../../../kernel || exit 1
    make || exit 1
    popd || exit 1
    for dim in "${dims[@]}"; do
        echo "compiling GEMM kernel for $arch with dim $dim"
        switch_binaries "$dim" "$arch"
        # touch source file to force re-building, as the Makefile does not track
        # binary changes
        touch kernel.cpp
        make CONFIG="gemm.tcore.$arch.dim$dim"
    done
 done
--- a/kernels/sgemm_tcore/input.a.bin
+++ b/kernels/sgemm_tcore/input.a.bin
@@ -1 +0,0 @@
 input.a.rand01.fp16.m256n256k256.col.swizzle_fp16.bin
--- a/kernels/sgemm_tcore/input.a.rand01.fp16.m1024n1024k1024.row.swizzle_fp16.bin
+++ b/kernels/sgemm_tcore/input.a.rand01.fp16.m1024n1024k1024.row.swizzle_fp16.bin
--- a/kernels/sgemm_tcore/input.a.rand01.fp16.m256n256k256.row.swizzle_fp16.bin
+++ b/kernels/sgemm_tcore/input.a.rand01.fp16.m256n256k256.row.swizzle_fp16.bin
--- a/kernels/sgemm_tcore/input.a.rand01.fp16.m512n512k512.row.swizzle_fp16.bin
+++ b/kernels/sgemm_tcore/input.a.rand01.fp16.m512n512k512.row.swizzle_fp16.bin
--- a/kernels/sgemm_tcore/input.b.bin
+++ b/kernels/sgemm_tcore/input.b.bin
@@ -1 +0,0 @@
 input.b.rand01.fp16.m256n256k256.row.swizzle_fp16.bin
--- a/kernels/sgemm_tcore/input.b.rand01.fp16.m1024n1024k1024.row.bin
+++ b/kernels/sgemm_tcore/input.b.rand01.fp16.m1024n1024k1024.row.bin
--- a/kernels/sgemm_tcore/input.b.rand01.fp16.m256n256k256.row.bin
+++ b/kernels/sgemm_tcore/input.b.rand01.fp16.m256n256k256.row.bin
--- a/kernels/sgemm_tcore/input.b.rand01.fp16.m512n512k512.row.bin
+++ b/kernels/sgemm_tcore/input.b.rand01.fp16.m512n512k512.row.bin
--- a/kernels/sgemm_tcore/kernel.cpp
+++ b/kernels/sgemm_tcore/kernel.cpp
@@ -91,8 +91,6 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
      DEV_SMEM_START_ADDR +
      sizeof(float_type) * 2 * (2 * BM * BK) * threadblock_id_in_cluster);
  MARK_BEG();
  // NOTE: hardcoded
  constexpr uint32_t quartile = (128 << 10) >> 2; // 128KB / 4
  static_assert((quartile * 4) == SMEM_SIZE, "wrong quartile constant");
@@ -121,8 +119,6 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
                      threadblocks_per_cluster, threadblock_id_in_cluster,
                      sharedmem_per_threadblock);
  MARK_END();
  float *gmem_tmp_d0 = reinterpret_cast<float *>(0xd0000000UL);
  float *gmem_tmp_d1 = reinterpret_cast<float *>(0xd1000000UL);
  float *gmem_tmp_d2 = reinterpret_cast<float *>(0xd2000000UL);
--- a/kernels/sgemm_tcore/sgemm_impl.hpp
+++ b/kernels/sgemm_tcore/sgemm_impl.hpp
@@ -136,6 +136,10 @@ static_assert(WMITER * WNITER * TCM * TCN * NUM_WARPS * CORES_PER_CLUSTER ==
 #error Unsupported smem size
 #endif
 // timing markers
 #define MARK_BEG() asm volatile ("slti x0, x1, -1047")
 #define MARK_END() asm volatile ("slti x0, x1, -499")
 enum class MemLayout {
  MN_major,
  K_major,
@@ -1220,6 +1224,8 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
      for (uint32_t block_k = 0; (block_k * BK) < dim_k; block_k++) {
        asm volatile("loop_k_start_%=:" ::);
        MARK_BEG();
        // producer code: GMEM->SMEM memory movement
        // ---------------------------------------------------------------------
        //
@@ -1395,6 +1401,8 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
        threadblock_barrier(threadblock_id_in_cluster,
                            warps_per_threadblock_per_core);
        MARK_END();
        asm volatile("loop_k_end_%=:" ::);
      }
@@ -1422,8 +1430,9 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
        asm volatile("move_out_end_%=:" ::);
      }
      asm volatile("loop_mn_end_%=:" ::);
    }
    asm volatile("loop_mn_end_%=:" ::);
  }
 }
		`@@ -1 +0,0 @@`
			`input.a.rand01.fp16.m256n256k256.col.swizzle_fp16.bin`
		`@@ -1 +0,0 @@`
			`input.b.rand01.fp16.m256n256k256.row.swizzle_fp16.bin`