diff --git a/kernels/flash_attention/compile_flash.sh b/kernels/flash_attention/compile_flash.sh
index 95dcb233..5808508c 100755
--- a/kernels/flash_attention/compile_flash.sh
+++ b/kernels/flash_attention/compile_flash.sh
@@ -26,7 +26,7 @@ ln -sf input.b.rand.fp32.seqlen1024headdim64.row.bin input.b.bin
 ln -sf input.c.rand.fp32.seqlen1024headdim64.row.bin input.c.bin
 
 for arch in "${archs[@]}"; do
-    git checkout ae-$arch
+    git checkout ae-flash-$arch
 
     # re-compile libvortexrt.a
     # FIXME after restructure
@@ -34,13 +34,11 @@ for arch in "${archs[@]}"; do
     make
     popd
 
-    for dim in "${dims[@]}"; do
-        echo "compiling flash_attn kernel for $arch with seqlen 1024, headdim 64"
+    echo "compiling flash_attn kernel for $arch with seqlen 1024, headdim 64"
 
-        # touch source file to force re-building, as the Makefile does not track
-        # binary changes
-        touch kernel.cpp
+    # touch source file to force re-building, as the Makefile does not track
+    # binary changes
+    touch kernel.cpp
 
-        make CONFIG=flash.$arch.seqlen1024.headdim64
-    done
+    make CONFIG=flash.$arch.seqlen1024.headdim64
 done
diff --git a/kernels/sgemm_gemmini_dma/kernel.cpp b/kernels/sgemm_gemmini_dma/kernel.cpp
index 8ae36bd8..5b6c3f71 100644
--- a/kernels/sgemm_gemmini_dma/kernel.cpp
+++ b/kernels/sgemm_gemmini_dma/kernel.cpp
@@ -53,7 +53,7 @@
 #define PRINTF(...) sprintf(PRINT_BUF, __VA_ARGS__)
 // #define PRINTF(...) vx_printf(__VA_ARGS__)
 #define SWISH(beta, x) ((x) / (1 + exp(-(beta) * (x))))
-// #define POWER
+#define POWER
 
 typedef uint16_t smem_elem_t;
 // typedef float smem_elem_t;