flash: Split impl to header file

2024-09-07 21:16:35 -07:00
parent 03308f8033
commit b3be271b88
2 changed files with 447 additions and 440 deletions
--- a/tests/regression/flash_attention/flash_impl.hpp
+++ b/tests/regression/flash_attention/flash_impl.hpp
@@ -0,0 +1,446 @@
 #ifndef _FLASH_IMPL_H_
 #define _FLASH_IMPL_H_
 #include <vx_spawn.h>
 #include <float.h>
 // B_ROW: number of tile rows walked by the softmax / rescale row loops; also
 // the length of the rowmax, rowsum and row-scale vectors (one float per row).
 #define B_ROW 64
 // B_COL: number of floats per row of the smem_S / smem_P tiles.
 #define B_COL 64
 // HEADDIM: number of floats per row of the smem_O tile.
 #define HEADDIM 64
 // Number of rowmax slots thread_block_init_sharedmem() seeds per thread.
 constexpr uint32_t ROWMAX_SETS = 3;
 // NOTE(review): not referenced by the helpers in this header; presumably
 // consumed by the kernel body -- confirm.
 constexpr bool DEBUG = true;
 // When true, the static_assert in thread_block_init_sharedmem() counts only
 // half of NUM_WARPS as available for initializing rowmax/rowsum.
 constexpr bool WARP_SPECIALIZED = true;
 // NOTE(review): the three settings below are not referenced by the helpers
 // in this header; their meaning has to be confirmed against the kernel body.
 constexpr uint32_t DEV_FAKE_SMEM_START_ADDR = 0xf0000000;
 constexpr bool GEMMINI_DMA_FAST = false;
 constexpr bool Q_IS_K_MAJOR = true;
 // temporary safety stop for wrong configs: compilation fails unless
 // NUM_CORES, NUM_THREADS (threads per warp) and NUM_WARPS have exactly these
 // values.
 static_assert(NUM_CORES == 4);
 static_assert(NUM_THREADS == 8);
 static_assert(NUM_WARPS == 8);
 // Resets the shared-memory state of one attention tile before the first
 // online-softmax step.
 //
 //   smem_rowmax       ROWMAX_SETS consecutive vectors of B_ROW floats; every
 //                     element is seeded with FLT_MIN
 //   smem_rowsum       B_ROW floats, cleared to 0
 //   smem_O_row_scale  B_ROW floats, cleared to 0
 //   smem_O            B_ROW x HEADDIM floats (row-major), cleared to 0
 //
 // tid_in_threadblock / threads_per_threadblock decide which elements the
 // calling thread writes. No barrier is issued here.
 // NOTE(review): the caller presumably synchronizes before the buffers are
 // consumed -- confirm.
 // NOTE(review): assumes smem_rowmax really holds ROWMAX_SETS * B_ROW floats
 // -- confirm against the allocation in the caller.
 inline void thread_block_init_sharedmem(const uint32_t tid_in_threadblock,
                                         const uint32_t threads_per_threadblock,
                                         float *smem_O, float *smem_rowmax,
                                         float *smem_rowsum,
                                         float *smem_O_row_scale) {
  asm volatile("threadblock_init_sharedmem_start_%=:" ::);

  const uint32_t tid_in_warp = tid_in_threadblock % NUM_THREADS;
  const uint32_t warp_id = tid_in_threadblock / NUM_THREADS;
  const uint32_t warps_in_threadblock = threads_per_threadblock / NUM_THREADS;

  static_assert((B_ROW % NUM_THREADS) == 0,
                "B_ROW must be a multiple of NUM_THREADS");
  static_assert(B_ROW < (NUM_THREADS * CORES_PER_CLUSTER *
                         (NUM_WARPS / (WARP_SPECIALIZED ? 2 : 1))),
                "not enough warps to initialize rowmax/rowsum");

  // each thread initializes one element in rowmax/rowsum
  // multiple warps participate for the whole vector
  constexpr uint32_t needed_warps = B_ROW / NUM_THREADS;
  if (warp_id < needed_warps /* more warps in HW than needed? */) {
    const uint32_t offset = NUM_THREADS * warp_id + tid_in_warp;
    // The rowmax sets are laid out back to back, B_ROW floats apart
    // (thread_block_online_softmax addresses the second set as
    // smem_rowmax + B_ROW), so the stride between sets is B_ROW.
    // NOTE(review): FLT_MIN is the smallest positive normalized float, not
    // the most negative value, so the running max starts at ~0. It is kept
    // as-is because the Taylor-approximated exp() downstream would overflow
    // with a -FLT_MAX seed -- confirm this is the intended seed.
 #pragma GCC unroll
    for (uint32_t set = 0; set < ROWMAX_SETS; set++) {
      smem_rowmax[offset + set * B_ROW] = FLT_MIN;
    }
    smem_rowsum[offset] = 0.0f;
    smem_O_row_scale[offset] = 0.0f;
  }

  // each warp clears out a row of smem_O; the tile has B_ROW rows of HEADDIM
  // floats
  // FIXME: dedup this pattern
  constexpr uint32_t per_row_iter = HEADDIM / NUM_THREADS;
 #pragma GCC unroll 1
  for (uint32_t row_offset = 0; row_offset < B_ROW;
       row_offset += warps_in_threadblock) {
    const uint32_t row = row_offset + warp_id;
    uint32_t thread_offset = HEADDIM * row + tid_in_warp;
 #pragma GCC unroll
    for (uint32_t i = 0; i < per_row_iter; i++) {
      smem_O[thread_offset] = 0.0f;
      thread_offset += NUM_THREADS;
    }
  }

  asm volatile("threadblock_init_sharedmem_finish_%=:" ::);
 }
 // Copies one B_ROW-element rowmax vector from `src` to `dest`, one float per
 // thread, then calls threadblock_barrier() for the threadblock.
 //
 // Only the first B_ROW / NUM_THREADS warps take part in the copy; every
 // calling warp reaches the barrier.
 inline void thread_block_copy_rowmax(const float *src, float *dest,
                                      const uint32_t tid_in_threadblock,
                                      const uint32_t threads_per_threadblock,
                                      const uint32_t threadblock_id_in_cluster) {
  asm volatile("threadblock_copy_rowmax_start_%=:" ::);

  const uint32_t lane = tid_in_threadblock % NUM_THREADS;
  const uint32_t warp = tid_in_threadblock / NUM_THREADS;
  const uint32_t warp_count = threads_per_threadblock / NUM_THREADS;
  // warp count handed to the barrier: this threadblock's warps per core
  const uint32_t barrier_warps = warp_count / CORES_PER_CLUSTER;

  // number of warps required to cover the whole vector
  constexpr uint32_t copy_warps = B_ROW / NUM_THREADS;
  const bool participates = warp < copy_warps;
  if (participates) {
    const uint32_t elem = NUM_THREADS * warp + lane;
    dest[elem] = src[elem];
  }

  threadblock_barrier(threadblock_id_in_cluster, barrier_warps);

  asm volatile("threadblock_copy_rowmax_finish_%=:" ::);
 }
 // Copies a row-major dim_row x dim_col tile of floats from `src` to `dest`.
 //
 // Warp w handles rows w, w + warps_in_threadblock, ...; inside a row the
 // threads of the warp copy elements NUM_THREADS apart. threadblock_barrier()
 // is called after every row step.
 template <uint32_t dim_row, uint32_t dim_col>
 inline void thread_block_copy_tile(const float *src, float *dest,
                                    const uint32_t tid_in_threadblock,
                                    const uint32_t threads_per_threadblock,
                                    const uint32_t threadblock_id_in_cluster) {
  asm volatile("threadblock_copy_tile_start_%=:" ::);

  const uint32_t lane = tid_in_threadblock % NUM_THREADS;
  const uint32_t warp = tid_in_threadblock / NUM_THREADS;
  const uint32_t warp_count = threads_per_threadblock / NUM_THREADS;
  const uint32_t barrier_warps = warp_count / CORES_PER_CLUSTER;
  // elements each thread copies per row
  constexpr uint32_t chunks_per_row = dim_col / NUM_THREADS;

  // FIXME: dedup this pattern
 #pragma GCC unroll 1
  for (int base_row = 0; base_row < dim_row; base_row += warp_count) {
    const uint32_t row_start = dim_col * (base_row + warp);
 #pragma GCC unroll
    for (int chunk = 0; chunk < chunks_per_row; chunk++) {
      const uint32_t idx = row_start + lane + chunk * NUM_THREADS;
      dest[idx] = src[idx];
    }

    threadblock_barrier(threadblock_id_in_cluster, barrier_warps);
  }

  asm volatile("threadblock_copy_tile_finish_%=:" ::);
 }
 // Returns the `order`-th term of the Taylor series of e^x around 0, i.e.
 // x^order / order!.
 //
 // Only orders 0..3 are implemented. Any other order is rejected at compile
 // time instead of silently evaluating to 1.0f.
 template <int order>
 inline float exponential_taylor_term(const float x) {
  static_assert(order >= 0 && order <= 3,
                "exponential_taylor_term: only orders 0..3 are implemented");
  asm volatile("exponential_taylor_term_start_%=:" ::);
  float res = 1.0f;
  if constexpr (order == 0) {
    // x^0 / 0! == 1, independent of x
    res = 1.0f;
  } else if constexpr (order == 1) {
    res = x;
  } else if constexpr (order == 2) {
    res = x * x;
    res /= 2.0f;
  } else if constexpr (order == 3) {
    res = x * x * x;
    res /= 6.0f;
  }
  asm volatile("exponential_taylor_term_end_%=:" ::);
  return res;
 }
 // Online-softmax update for one tile of attention scores.
 //
 // Each warp walks rows warp_id, warp_id + warps_in_threadblock, ... of the
 // B_ROW x B_COL tile `smem_S` and, for every row:
 //   1. reduces the row to its maximum, stores it in smem_rowmax[B_ROW + row]
 //      and folds it into the running maximum smem_rowmax[row];
 //   2. writes 1 + x + x^2/2 with x = S - rowmax_new (2nd-order Taylor
 //      approximation of exp) into `smem_P` at the same index as `smem_S`;
 //   3. sums the row of `smem_P` and updates smem_rowsum[row];
 //   4. stores 1 / (approximated exp(rowmax_prev - rowmax_new)) into
 //      smem_O_row_scale[row].
 //
 // smem_scratchpad is used as NUM_THREADS floats per warp and is reused for
 // both reductions. The phases are separated by threadblock_barrier() calls.
 // NOTE(review): presumably every warp counted by the barrier has to execute
 // this function -- confirm against threadblock_barrier().
 __attribute__((always_inline)) inline void thread_block_online_softmax(
    const float *smem_S, float *smem_P, const uint32_t tid_in_threadblock,
    const uint32_t threads_per_threadblock,
    const uint32_t threadblock_id_in_cluster, float *smem_scratchpad,
    float *smem_rowmax, float *smem_rowsum, float *smem_O_row_scale) {
  asm volatile("thread_block_online_softmax_start_%=:" ::);
  const uint32_t tid_in_warp = tid_in_threadblock % NUM_THREADS;
  const uint32_t warp_id = tid_in_threadblock / NUM_THREADS;
  const uint32_t warps_in_threadblock = threads_per_threadblock / NUM_THREADS;
  // warp count handed to every barrier below
  const uint32_t warps_per_threadblock_per_core =
      warps_in_threadblock / CORES_PER_CLUSTER;
  // second rowmax vector: holds the max of the current tile only
  float *smem_rowmax_this = smem_rowmax + B_ROW;
 #pragma GCC unroll 1
  for (int row_offset = 0; row_offset < B_ROW;
       row_offset += warps_in_threadblock) {
    const uint32_t row = row_offset + warp_id;
    const uint32_t first_thread_offset = B_COL * row;
    // rowmax
    //
    // two-level tree reduction: reduce each row into NUM_THREADS intermediate
    // maxes, then reduce it down to one row max
    // one warp handles one row in tile
    constexpr uint32_t per_row_iter = B_COL / NUM_THREADS;
    uint32_t thread_offset = first_thread_offset + tid_in_warp;
    // per-warp staging area of NUM_THREADS floats inside the scratchpad
    // FIXME: threadblock_id needs to be in here too
    float *warp_smem = smem_scratchpad + (warp_id * NUM_THREADS);
 // #define DUMB_ROWMAX
 #ifdef DUMB_ROWMAX
    // FIXME remove
    threadblock_barrier(threadblock_id_in_cluster,
                        warps_per_threadblock_per_core);
    // no tree reduction; a single thread in a warp does serialized max across
    // the entire row
    if (tid_in_warp == 0) {
      float rowmax = smem_S[first_thread_offset];
 #pragma GCC unroll 16
      for (int i = 0; i < B_COL; i++) {
        asm volatile("fmax.s %0, %1, %2"
                     : "=f"(rowmax)
                     : "f"(rowmax), "f"(smem_S[first_thread_offset + i]));
      }
      smem_rowmax_this[row] = rowmax;
      // update previous rowmax
      // i.e. mi_new = max(mi, mij)
      float prev_rowmax = smem_rowmax[row];
      // stage prev rowmax in scratchpad for warp-wide broadcast
      warp_smem[0] = prev_rowmax;
      asm volatile("fmax.s %0, %1, %2"
                   : "=f"(rowmax)
                   : "f"(rowmax), "f"(prev_rowmax));
      smem_rowmax[row] = rowmax;
    }
 #else
    static_assert((B_COL % NUM_THREADS) == 0,
                  "B_COL must be a multiple of NUM_THREADS");
    // NOTE(review): FLT_MIN is the smallest positive normalized float, so a
    // row whose scores are all negative reduces to FLT_MIN rather than to its
    // true maximum -- confirm this seed is intended.
    float per_thread_max = FLT_MIN;
    // first level: each thread takes the max over its strided share of the row
 #pragma GCC unroll
    for (int i = 0; i < per_row_iter; i++) {
      const float next = smem_S[thread_offset];
      asm volatile("fmax.s %0, %1, %2"
                   : "=f"(per_thread_max)
                   : "f"(per_thread_max), "f"(next));
      thread_offset += NUM_THREADS;
    }
    // stage per-thread max value in smem
    warp_smem[tid_in_warp] = per_thread_max;
    // sync writes to warp_smem
    threadblock_barrier(threadblock_id_in_cluster,
                        warps_per_threadblock_per_core);
 // #define PARALLEL_ROWMAX
 #ifndef PARALLEL_ROWMAX
    // second level:
    // elect 0-th thread to reduce all other thread's values in the warp
    if (tid_in_warp == 0) {
      float rowmax = per_thread_max;
      for (int i = 1; i < NUM_THREADS; i++) {
        float other = warp_smem[i];
        asm volatile("fmax.s %0, %1, %2"
                     : "=f"(rowmax)
                     : "f"(rowmax), "f"(other));
      }
      smem_rowmax_this[row] = rowmax;
      // update previous rowmax
      // i.e. mi_new = max(mi, mij)
      float prev_rowmax = smem_rowmax[row];
      // stage prev rowmax in scratchpad for warp-wide broadcast
      // (overwrites this warp's staged per-thread max in slot 0)
      warp_smem[0] = prev_rowmax;
      asm volatile("fmax.s %0, %1, %2"
                   : "=f"(rowmax)
                   : "f"(rowmax), "f"(prev_rowmax));
      smem_rowmax[row] = rowmax;
    }
 #else
    // alternative second level: each thread of the leading warps reduces the
    // staged values of one row on its own
    if (warp_id < warps_in_threadblock / NUM_THREADS) {
      const uint32_t row = row_offset + NUM_THREADS * warp_id + tid_in_warp;
      float *const thread_smem = smem_scratchpad + (tid_in_warp * NUM_THREADS);
      float rowmax = FLT_MIN;
 #pragma GCC unroll
      for (int i = 0; i < NUM_THREADS; i++) {
        const float f = thread_smem[i];
        asm volatile("fmax.s %0, %1, %2" : "=f"(rowmax) : "f"(rowmax), "f"(f));
      }
      smem_rowmax_this[row] = rowmax;
      // update previous rowmax
      // i.e. mi_new = max(mi, mij)
      float prev_rowmax = smem_rowmax[row];
      // stage prev rowmax in scratchpad for warp-wide broadcast
      thread_smem[0] = prev_rowmax;
      asm volatile("fmax.s %0, %1, %2"
                   : "=f"(rowmax)
                   : "f"(rowmax), "f"(prev_rowmax));
      smem_rowmax[row] = rowmax;
    }
 #endif // PARALLEL_ROWMAX
 #endif // DUMB_ROWMAX
    threadblock_barrier(threadblock_id_in_cluster,
                        warps_per_threadblock_per_core);
    // broadcast prev rowmax to all threads in the warp
    // NOTE: memory consistency is a little sketchy here
    const float rowmax_prev = warp_smem[0];
    const float rowmax_this = smem_rowmax_this[row];
    // exponential
    //
    // B_ROW / (B_ROW * B_COL / (exp_elem * threads_per_threadblock))
    // const uint32_t row_stride =
    //     (exp_elem_per_thread * threads_per_threadblock) / B_COL;
    // broadcast updated rowmax to all threads in the warp
    const float rowmax_new = smem_rowmax[row];
    asm volatile("flashattn_exp_p_start_%=:" ::);
    thread_offset = first_thread_offset + tid_in_warp;
 #pragma GCC unroll
    for (int i = 0; i < per_row_iter; i++) {
      float f0 = smem_S[thread_offset];
      f0 -= rowmax_new;
      // 2nd-order Taylor approximation
      float exp = 1.0f;
      exp += exponential_taylor_term<1>(f0);
      exp += exponential_taylor_term<2>(f0);
      // Store the result to the shared memory.
      // NOTE(review): the original comment said "transposed", but the store
      // uses the same index as the load from smem_S -- confirm.
      smem_P[thread_offset] = exp;
      thread_offset += NUM_THREADS;
    }
    asm volatile("flashattn_exp_p_end_%=:" ::);
    threadblock_barrier(threadblock_id_in_cluster,
                        warps_per_threadblock_per_core);
    // rowsum
    //
    // two-level tree reduction, similar to rowmax
    asm volatile("flashattn_rowsum_start_%=:" ::);
    float per_thread_sum = 0.0f;
    thread_offset = first_thread_offset + tid_in_warp;
 #pragma GCC unroll
    for (int i = 0; i < per_row_iter; i++) {
      per_thread_sum += smem_P[thread_offset];
      thread_offset += NUM_THREADS;
    }
    // stage per-thread sum value in smem
    // (warp_smem is recomputed to the same address it already holds)
    // FIXME: threadblock_id needs to be in here too
    warp_smem = smem_scratchpad + (warp_id * NUM_THREADS);
    warp_smem[tid_in_warp] = per_thread_sum;
    // sync writes to warp_smem
    threadblock_barrier(threadblock_id_in_cluster,
                        warps_per_threadblock_per_core);
    // 0-th thread collects all other thread's values in the warp
    if (tid_in_warp == 0) {
      float rowsum = per_thread_sum;
      for (int iter = 1; iter < NUM_THREADS; iter++) {
        float other = warp_smem[iter];
        rowsum += other;
      }
      // NOTE(review): the previous sum is rescaled with mi_prev - mi_this
      // (this tile's max), while the O rescale factor below uses
      // mi_prev - mi_new (the running max) -- confirm the difference is
      // intended.
      const float mi_prev = rowmax_prev;
      const float mi_this = rowmax_this;
      const float x = mi_prev - mi_this;
      // 2nd-order Taylor approximation
      float exp = 1.0f;
      exp += exponential_taylor_term<1>(x);
      exp += exponential_taylor_term<2>(x);
      // update rowsum
      const float rowsum_prev = smem_rowsum[row];
      float rowsum_new = exp * rowsum_prev + rowsum;
      smem_rowsum[row] = rowsum_new;
    }
    asm volatile("flashattn_rowsum_end_%=:" ::);
    threadblock_barrier(threadblock_id_in_cluster,
                        warps_per_threadblock_per_core);
    // compute Oi rescale factor
    // FIXME: parallelize this across threads
    //
    // Every iteration (and every thread of the warp) computes and stores the
    // same value to smem_O_row_scale[row]; thread_offset is advanced but not
    // used as an index in this loop.
    asm volatile("flashattn_rescale_factor_start_%=:" ::);
    thread_offset = first_thread_offset + tid_in_warp;
 #pragma GCC unroll
    for (int i = 0; i < per_row_iter; i++) {
      const float mi_prev = rowmax_prev;
      const float mi_new = rowmax_new;
      const float x = mi_prev - mi_new;
      // 2nd-order Taylor approximation
      float exp = 1.0f;
      exp += exponential_taylor_term<1>(x);
      exp += exponential_taylor_term<2>(x);
      // @perf: div vs. expansion on e(-x)?
      smem_O_row_scale[row] = 1.0f / exp;
      thread_offset += NUM_THREADS;
    }
    asm volatile("flashattn_rescale_factor_end_%=:" ::);
    threadblock_barrier(threadblock_id_in_cluster,
                        warps_per_threadblock_per_core);
  }
  asm volatile("thread_block_online_softmax_finish_%=:" ::);
 }
 __attribute__((always_inline)) inline void thread_block_O_rescale(
    const float *smem_O_in, float *smem_O_out, const float *smem_O_row_scale,
    const uint32_t tid_in_threadblock, const uint32_t threads_per_threadblock,
    const uint32_t threadblock_id_in_cluster) {
  asm volatile("thread_block_O_rescale_start_%=:" ::);
  const uint32_t tid_in_warp = tid_in_threadblock % NUM_THREADS;
  const uint32_t warp_id = tid_in_threadblock / NUM_THREADS;
  const uint32_t warps_in_threadblock = threads_per_threadblock / NUM_THREADS;
  const uint32_t warps_per_threadblock_per_core =
      warps_in_threadblock / CORES_PER_CLUSTER;
 #pragma GCC unroll 1
  for (int row_offset = 0; row_offset < B_ROW;
       row_offset += warps_in_threadblock) {
    const uint32_t row = row_offset + warp_id;
    const uint32_t first_thread_offset = B_COL * row;
    constexpr uint32_t per_row_iter = B_COL / NUM_THREADS;
    uint32_t thread_offset = first_thread_offset + tid_in_warp;
    // Oi rescale
    //
 #pragma GCC unroll
    for (int i = 0; i < per_row_iter; i++) {
      const float o = smem_O_in[thread_offset];
      const float scale = smem_O_row_scale[row];
      smem_O_out[thread_offset] = (o * scale);
      thread_offset += NUM_THREADS;
    }
  }
  asm volatile("thread_block_O_rescale_finish_%=:" ::);
 }
 #endif
--- a/tests/regression/flash_attention/kernel.cpp
+++ b/tests/regression/flash_attention/kernel.cpp
@@ -2,450 +2,11 @@
 #include <vx_intrinsics.h>
 #include <vx_print.h>
 #include <vx_spawn.h>
 #include <float.h>
 #include "common.h"
 #include "sgemm_impl.hpp"
 #include "include/gemmini.h"
 #include "gemmini_mmio.h"
-
+#include "flash_impl.hpp"
 // B_ROW: number of tile rows walked by the softmax / rescale row loops; also
 // the length of the rowmax, rowsum and row-scale vectors (one float per row).
 #define B_ROW 64
 // B_COL: number of floats per row of the smem_S / smem_P tiles.
 #define B_COL 64
 // HEADDIM: number of floats per row of the smem_O tile.
 #define HEADDIM 64
 // Number of rowmax slots thread_block_init_sharedmem() seeds per thread.
 constexpr uint32_t ROWMAX_SETS = 3;
 // NOTE(review): not referenced by the helper functions visible below;
 // presumably consumed by the kernel body -- confirm.
 constexpr bool DEBUG = false;
 // When true, the static_assert in thread_block_init_sharedmem() counts only
 // half of NUM_WARPS as available for initializing rowmax/rowsum.
 constexpr bool WARP_SPECIALIZED = true;
 // NOTE(review): the three settings below are not referenced by the helper
 // functions visible below; their meaning has to be confirmed against the
 // kernel body.
 constexpr uint32_t DEV_FAKE_SMEM_START_ADDR = 0xf0000000;
 constexpr bool GEMMINI_DMA_FAST = true;
 constexpr bool Q_IS_K_MAJOR = true;
 // temporary safety stop for wrong configs: compilation fails unless
 // NUM_CORES, NUM_THREADS (threads per warp) and NUM_WARPS have exactly these
 // values.
 static_assert(NUM_CORES == 4);
 static_assert(NUM_THREADS == 8);
 static_assert(NUM_WARPS == 8);
 // Resets the shared-memory state of one attention tile before the first
 // online-softmax step.
 //
 //   smem_rowmax       ROWMAX_SETS consecutive vectors of B_ROW floats; every
 //                     element is seeded with FLT_MIN
 //   smem_rowsum       B_ROW floats, cleared to 0
 //   smem_O_row_scale  B_ROW floats, cleared to 0
 //   smem_O            B_ROW x HEADDIM floats (row-major), cleared to 0
 //
 // tid_in_threadblock / threads_per_threadblock decide which elements the
 // calling thread writes. No barrier is issued here.
 // NOTE(review): the caller presumably synchronizes before the buffers are
 // consumed -- confirm.
 // NOTE(review): assumes smem_rowmax really holds ROWMAX_SETS * B_ROW floats
 // -- confirm against the allocation in the caller.
 inline void thread_block_init_sharedmem(const uint32_t tid_in_threadblock,
                                         const uint32_t threads_per_threadblock,
                                         float *smem_O, float *smem_rowmax,
                                         float *smem_rowsum,
                                         float *smem_O_row_scale) {
  asm volatile("threadblock_init_sharedmem_start_%=:" ::);

  const uint32_t tid_in_warp = tid_in_threadblock % NUM_THREADS;
  const uint32_t warp_id = tid_in_threadblock / NUM_THREADS;
  const uint32_t warps_in_threadblock = threads_per_threadblock / NUM_THREADS;

  static_assert((B_ROW % NUM_THREADS) == 0,
                "B_ROW must be a multiple of NUM_THREADS");
  static_assert(B_ROW < (NUM_THREADS * CORES_PER_CLUSTER *
                         (NUM_WARPS / (WARP_SPECIALIZED ? 2 : 1))),
                "not enough warps to initialize rowmax/rowsum");

  // each thread initializes one element in rowmax/rowsum
  // multiple warps participate for the whole vector
  constexpr uint32_t needed_warps = B_ROW / NUM_THREADS;
  if (warp_id < needed_warps /* more warps in HW than needed? */) {
    const uint32_t offset = NUM_THREADS * warp_id + tid_in_warp;
    // The rowmax sets are laid out back to back, B_ROW floats apart
    // (thread_block_online_softmax addresses the second set as
    // smem_rowmax + B_ROW), so the stride between sets is B_ROW.
    // NOTE(review): FLT_MIN is the smallest positive normalized float, not
    // the most negative value, so the running max starts at ~0. It is kept
    // as-is because the Taylor-approximated exp() downstream would overflow
    // with a -FLT_MAX seed -- confirm this is the intended seed.
 #pragma GCC unroll
    for (uint32_t set = 0; set < ROWMAX_SETS; set++) {
      smem_rowmax[offset + set * B_ROW] = FLT_MIN;
    }
    smem_rowsum[offset] = 0.0f;
    smem_O_row_scale[offset] = 0.0f;
  }

  // each warp clears out a row of smem_O; the tile has B_ROW rows of HEADDIM
  // floats
  // FIXME: dedup this pattern
  constexpr uint32_t per_row_iter = HEADDIM / NUM_THREADS;
 #pragma GCC unroll 1
  for (uint32_t row_offset = 0; row_offset < B_ROW;
       row_offset += warps_in_threadblock) {
    const uint32_t row = row_offset + warp_id;
    uint32_t thread_offset = HEADDIM * row + tid_in_warp;
 #pragma GCC unroll
    for (uint32_t i = 0; i < per_row_iter; i++) {
      smem_O[thread_offset] = 0.0f;
      thread_offset += NUM_THREADS;
    }
  }

  asm volatile("threadblock_init_sharedmem_finish_%=:" ::);
 }
 // Copies one B_ROW-element rowmax vector from `src` to `dest`, one float per
 // thread, then calls threadblock_barrier() for the threadblock.
 //
 // Only the first B_ROW / NUM_THREADS warps take part in the copy; every
 // calling warp reaches the barrier.
 inline void thread_block_copy_rowmax(const float *src, float *dest,
                                      const uint32_t tid_in_threadblock,
                                      const uint32_t threads_per_threadblock,
                                      const uint32_t threadblock_id_in_cluster) {
  asm volatile("threadblock_copy_rowmax_start_%=:" ::);

  const uint32_t lane = tid_in_threadblock % NUM_THREADS;
  const uint32_t warp = tid_in_threadblock / NUM_THREADS;
  const uint32_t warp_count = threads_per_threadblock / NUM_THREADS;
  // warp count handed to the barrier: this threadblock's warps per core
  const uint32_t barrier_warps = warp_count / CORES_PER_CLUSTER;

  // number of warps required to cover the whole vector
  constexpr uint32_t copy_warps = B_ROW / NUM_THREADS;
  const bool participates = warp < copy_warps;
  if (participates) {
    const uint32_t elem = NUM_THREADS * warp + lane;
    dest[elem] = src[elem];
  }

  threadblock_barrier(threadblock_id_in_cluster, barrier_warps);

  asm volatile("threadblock_copy_rowmax_finish_%=:" ::);
 }
 // Copies a row-major dim_row x dim_col tile of floats from `src` to `dest`.
 //
 // Warp w handles rows w, w + warps_in_threadblock, ...; inside a row the
 // threads of the warp copy elements NUM_THREADS apart. threadblock_barrier()
 // is called after every row step.
 template <uint32_t dim_row, uint32_t dim_col>
 inline void thread_block_copy_tile(const float *src, float *dest,
                                    const uint32_t tid_in_threadblock,
                                    const uint32_t threads_per_threadblock,
                                    const uint32_t threadblock_id_in_cluster) {
  asm volatile("threadblock_copy_tile_start_%=:" ::);

  const uint32_t lane = tid_in_threadblock % NUM_THREADS;
  const uint32_t warp = tid_in_threadblock / NUM_THREADS;
  const uint32_t warp_count = threads_per_threadblock / NUM_THREADS;
  const uint32_t barrier_warps = warp_count / CORES_PER_CLUSTER;
  // elements each thread copies per row
  constexpr uint32_t chunks_per_row = dim_col / NUM_THREADS;

  // FIXME: dedup this pattern
 #pragma GCC unroll 1
  for (int base_row = 0; base_row < dim_row; base_row += warp_count) {
    const uint32_t row_start = dim_col * (base_row + warp);
 #pragma GCC unroll
    for (int chunk = 0; chunk < chunks_per_row; chunk++) {
      const uint32_t idx = row_start + lane + chunk * NUM_THREADS;
      dest[idx] = src[idx];
    }

    threadblock_barrier(threadblock_id_in_cluster, barrier_warps);
  }

  asm volatile("threadblock_copy_tile_finish_%=:" ::);
 }
 // Returns the `order`-th term of the Taylor series of e^x around 0, i.e.
 // x^order / order!.
 //
 // Only orders 0..3 are implemented. Any other order is rejected at compile
 // time instead of silently evaluating to 1.0f.
 template <int order>
 inline float exponential_taylor_term(const float x) {
  static_assert(order >= 0 && order <= 3,
                "exponential_taylor_term: only orders 0..3 are implemented");
  asm volatile("exponential_taylor_term_start_%=:" ::);
  float res = 1.0f;
  if constexpr (order == 0) {
    // x^0 / 0! == 1, independent of x
    res = 1.0f;
  } else if constexpr (order == 1) {
    res = x;
  } else if constexpr (order == 2) {
    res = x * x;
    res /= 2.0f;
  } else if constexpr (order == 3) {
    res = x * x * x;
    res /= 6.0f;
  }
  asm volatile("exponential_taylor_term_end_%=:" ::);
  return res;
 }
 // Online-softmax update for one tile of attention scores.
 //
 // Each warp walks rows warp_id, warp_id + warps_in_threadblock, ... of the
 // B_ROW x B_COL tile `smem_S` and, for every row:
 //   1. reduces the row to its maximum, stores it in smem_rowmax[B_ROW + row]
 //      and folds it into the running maximum smem_rowmax[row];
 //   2. writes 1 + x + x^2/2 with x = S - rowmax_new (2nd-order Taylor
 //      approximation of exp) into `smem_P` at the same index as `smem_S`;
 //   3. sums the row of `smem_P` and updates smem_rowsum[row];
 //   4. stores 1 / (approximated exp(rowmax_prev - rowmax_new)) into
 //      smem_O_row_scale[row].
 //
 // smem_scratchpad is used as NUM_THREADS floats per warp and is reused for
 // both reductions. The phases are separated by threadblock_barrier() calls.
 // NOTE(review): presumably every warp counted by the barrier has to execute
 // this function -- confirm against threadblock_barrier().
 __attribute__((always_inline)) inline void thread_block_online_softmax(
    const float *smem_S, float *smem_P, const uint32_t tid_in_threadblock,
    const uint32_t threads_per_threadblock,
    const uint32_t threadblock_id_in_cluster, float *smem_scratchpad,
    float *smem_rowmax, float *smem_rowsum, float *smem_O_row_scale) {
  asm volatile("thread_block_online_softmax_start_%=:" ::);
  const uint32_t tid_in_warp = tid_in_threadblock % NUM_THREADS;
  const uint32_t warp_id = tid_in_threadblock / NUM_THREADS;
  const uint32_t warps_in_threadblock = threads_per_threadblock / NUM_THREADS;
  // warp count handed to every barrier below
  const uint32_t warps_per_threadblock_per_core =
      warps_in_threadblock / CORES_PER_CLUSTER;
  // second rowmax vector: holds the max of the current tile only
  float *smem_rowmax_this = smem_rowmax + B_ROW;
 #pragma GCC unroll 1
  for (int row_offset = 0; row_offset < B_ROW;
       row_offset += warps_in_threadblock) {
    const uint32_t row = row_offset + warp_id;
    const uint32_t first_thread_offset = B_COL * row;
    // rowmax
    //
    // two-level tree reduction: reduce each row into NUM_THREADS intermediate
    // maxes, then reduce it down to one row max
    // one warp handles one row in tile
    constexpr uint32_t per_row_iter = B_COL / NUM_THREADS;
    uint32_t thread_offset = first_thread_offset + tid_in_warp;
    // per-warp staging area of NUM_THREADS floats inside the scratchpad
    // FIXME: threadblock_id needs to be in here too
    float *warp_smem = smem_scratchpad + (warp_id * NUM_THREADS);
 // #define DUMB_ROWMAX
 #ifdef DUMB_ROWMAX
    // FIXME remove
    threadblock_barrier(threadblock_id_in_cluster,
                        warps_per_threadblock_per_core);
    // no tree reduction; a single thread in a warp does serialized max across
    // the entire row
    if (tid_in_warp == 0) {
      float rowmax = smem_S[first_thread_offset];
 #pragma GCC unroll 16
      for (int i = 0; i < B_COL; i++) {
        asm volatile("fmax.s %0, %1, %2"
                     : "=f"(rowmax)
                     : "f"(rowmax), "f"(smem_S[first_thread_offset + i]));
      }
      smem_rowmax_this[row] = rowmax;
      // update previous rowmax
      // i.e. mi_new = max(mi, mij)
      float prev_rowmax = smem_rowmax[row];
      // stage prev rowmax in scratchpad for warp-wide broadcast
      warp_smem[0] = prev_rowmax;
      asm volatile("fmax.s %0, %1, %2"
                   : "=f"(rowmax)
                   : "f"(rowmax), "f"(prev_rowmax));
      smem_rowmax[row] = rowmax;
    }
 #else
    static_assert((B_COL % NUM_THREADS) == 0,
                  "B_COL must be a multiple of NUM_THREADS");
    // NOTE(review): FLT_MIN is the smallest positive normalized float, so a
    // row whose scores are all negative reduces to FLT_MIN rather than to its
    // true maximum -- confirm this seed is intended.
    float per_thread_max = FLT_MIN;
    // first level: each thread takes the max over its strided share of the row
 #pragma GCC unroll
    for (int i = 0; i < per_row_iter; i++) {
      const float next = smem_S[thread_offset];
      asm volatile("fmax.s %0, %1, %2"
                   : "=f"(per_thread_max)
                   : "f"(per_thread_max), "f"(next));
      thread_offset += NUM_THREADS;
    }
    // stage per-thread max value in smem
    warp_smem[tid_in_warp] = per_thread_max;
    // sync writes to warp_smem
    threadblock_barrier(threadblock_id_in_cluster,
                        warps_per_threadblock_per_core);
 // #define PARALLEL_ROWMAX
 #ifndef PARALLEL_ROWMAX
    // second level:
    // elect 0-th thread to reduce all other thread's values in the warp
    if (tid_in_warp == 0) {
      float rowmax = per_thread_max;
      for (int i = 1; i < NUM_THREADS; i++) {
        float other = warp_smem[i];
        asm volatile("fmax.s %0, %1, %2"
                     : "=f"(rowmax)
                     : "f"(rowmax), "f"(other));
      }
      smem_rowmax_this[row] = rowmax;
      // update previous rowmax
      // i.e. mi_new = max(mi, mij)
      float prev_rowmax = smem_rowmax[row];
      // stage prev rowmax in scratchpad for warp-wide broadcast
      // (overwrites this warp's staged per-thread max in slot 0)
      warp_smem[0] = prev_rowmax;
      asm volatile("fmax.s %0, %1, %2"
                   : "=f"(rowmax)
                   : "f"(rowmax), "f"(prev_rowmax));
      smem_rowmax[row] = rowmax;
    }
 #else
    // alternative second level: each thread of the leading warps reduces the
    // staged values of one row on its own
    if (warp_id < warps_in_threadblock / NUM_THREADS) {
      const uint32_t row = row_offset + NUM_THREADS * warp_id + tid_in_warp;
      float *const thread_smem = smem_scratchpad + (tid_in_warp * NUM_THREADS);
      float rowmax = FLT_MIN;
 #pragma GCC unroll
      for (int i = 0; i < NUM_THREADS; i++) {
        const float f = thread_smem[i];
        asm volatile("fmax.s %0, %1, %2" : "=f"(rowmax) : "f"(rowmax), "f"(f));
      }
      smem_rowmax_this[row] = rowmax;
      // update previous rowmax
      // i.e. mi_new = max(mi, mij)
      float prev_rowmax = smem_rowmax[row];
      // stage prev rowmax in scratchpad for warp-wide broadcast
      thread_smem[0] = prev_rowmax;
      asm volatile("fmax.s %0, %1, %2"
                   : "=f"(rowmax)
                   : "f"(rowmax), "f"(prev_rowmax));
      smem_rowmax[row] = rowmax;
    }
 #endif // PARALLEL_ROWMAX
 #endif // DUMB_ROWMAX
    threadblock_barrier(threadblock_id_in_cluster,
                        warps_per_threadblock_per_core);
    // broadcast prev rowmax to all threads in the warp
    // NOTE: memory consistency is a little sketchy here
    const float rowmax_prev = warp_smem[0];
    const float rowmax_this = smem_rowmax_this[row];
    // exponential
    //
    // B_ROW / (B_ROW * B_COL / (exp_elem * threads_per_threadblock))
    // const uint32_t row_stride =
    //     (exp_elem_per_thread * threads_per_threadblock) / B_COL;
    // broadcast updated rowmax to all threads in the warp
    const float rowmax_new = smem_rowmax[row];
    asm volatile("flashattn_exp_p_start_%=:" ::);
    thread_offset = first_thread_offset + tid_in_warp;
 #pragma GCC unroll
    for (int i = 0; i < per_row_iter; i++) {
      float f0 = smem_S[thread_offset];
      f0 -= rowmax_new;
      // 2nd-order Taylor approximation
      float exp = 1.0f;
      exp += exponential_taylor_term<1>(f0);
      exp += exponential_taylor_term<2>(f0);
      // Store the result to the shared memory.
      // NOTE(review): the original comment said "transposed", but the store
      // uses the same index as the load from smem_S -- confirm.
      smem_P[thread_offset] = exp;
      thread_offset += NUM_THREADS;
    }
    asm volatile("flashattn_exp_p_end_%=:" ::);
    threadblock_barrier(threadblock_id_in_cluster,
                        warps_per_threadblock_per_core);
    // rowsum
    //
    // two-level tree reduction, similar to rowmax
    asm volatile("flashattn_rowsum_start_%=:" ::);
    float per_thread_sum = 0.0f;
    thread_offset = first_thread_offset + tid_in_warp;
 #pragma GCC unroll
    for (int i = 0; i < per_row_iter; i++) {
      per_thread_sum += smem_P[thread_offset];
      thread_offset += NUM_THREADS;
    }
    // stage per-thread sum value in smem
    // (warp_smem is recomputed to the same address it already holds)
    // FIXME: threadblock_id needs to be in here too
    warp_smem = smem_scratchpad + (warp_id * NUM_THREADS);
    warp_smem[tid_in_warp] = per_thread_sum;
    // sync writes to warp_smem
    threadblock_barrier(threadblock_id_in_cluster,
                        warps_per_threadblock_per_core);
    // 0-th thread collects all other thread's values in the warp
    if (tid_in_warp == 0) {
      float rowsum = per_thread_sum;
      for (int iter = 1; iter < NUM_THREADS; iter++) {
        float other = warp_smem[iter];
        rowsum += other;
      }
      // NOTE(review): the previous sum is rescaled with mi_prev - mi_this
      // (this tile's max), while the O rescale factor below uses
      // mi_prev - mi_new (the running max) -- confirm the difference is
      // intended.
      const float mi_prev = rowmax_prev;
      const float mi_this = rowmax_this;
      const float x = mi_prev - mi_this;
      // 2nd-order Taylor approximation
      float exp = 1.0f;
      exp += exponential_taylor_term<1>(x);
      exp += exponential_taylor_term<2>(x);
      // update rowsum
      const float rowsum_prev = smem_rowsum[row];
      float rowsum_new = exp * rowsum_prev + rowsum;
      smem_rowsum[row] = rowsum_new;
    }
    asm volatile("flashattn_rowsum_end_%=:" ::);
    threadblock_barrier(threadblock_id_in_cluster,
                        warps_per_threadblock_per_core);
    // compute Oi rescale factor
    // FIXME: parallelize this across threads
    //
    // Every iteration (and every thread of the warp) computes and stores the
    // same value to smem_O_row_scale[row]; thread_offset is advanced but not
    // used as an index in this loop.
    asm volatile("flashattn_rescale_factor_start_%=:" ::);
    thread_offset = first_thread_offset + tid_in_warp;
 #pragma GCC unroll
    for (int i = 0; i < per_row_iter; i++) {
      const float mi_prev = rowmax_prev;
      const float mi_new = rowmax_new;
      const float x = mi_prev - mi_new;
      // 2nd-order Taylor approximation
      float exp = 1.0f;
      exp += exponential_taylor_term<1>(x);
      exp += exponential_taylor_term<2>(x);
      // @perf: div vs. expansion on e(-x)?
      smem_O_row_scale[row] = 1.0f / exp;
      thread_offset += NUM_THREADS;
    }
    asm volatile("flashattn_rescale_factor_end_%=:" ::);
    threadblock_barrier(threadblock_id_in_cluster,
                        warps_per_threadblock_per_core);
  }
  asm volatile("thread_block_online_softmax_finish_%=:" ::);
 }
 __attribute__((always_inline)) inline void thread_block_O_rescale(
    const float *smem_O_in, float *smem_O_out, const float *smem_O_row_scale,
    const uint32_t tid_in_threadblock, const uint32_t threads_per_threadblock,
    const uint32_t threadblock_id_in_cluster) {
  asm volatile("thread_block_O_rescale_start_%=:" ::);
  const uint32_t tid_in_warp = tid_in_threadblock % NUM_THREADS;
  const uint32_t warp_id = tid_in_threadblock / NUM_THREADS;
  const uint32_t warps_in_threadblock = threads_per_threadblock / NUM_THREADS;
  const uint32_t warps_per_threadblock_per_core =
      warps_in_threadblock / CORES_PER_CLUSTER;
 #pragma GCC unroll 1
  for (int row_offset = 0; row_offset < B_ROW;
       row_offset += warps_in_threadblock) {
    const uint32_t row = row_offset + warp_id;
    const uint32_t first_thread_offset = B_COL * row;
    constexpr uint32_t per_row_iter = B_COL / NUM_THREADS;
    uint32_t thread_offset = first_thread_offset + tid_in_warp;
    // Oi rescale
    //
 #pragma GCC unroll
    for (int i = 0; i < per_row_iter; i++) {
      const float o = smem_O_in[thread_offset];
      const float scale = smem_O_row_scale[row];
      smem_O_out[thread_offset] = (o * scale);
      thread_offset += NUM_THREADS;
    }
  }
  asm volatile("thread_block_O_rescale_finish_%=:" ::);
 }
 void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
  // @perf: All threads are running these compute whose result is mostly same