update gemmini only kernel

2024-04-15 10:22:00 -07:00
parent 0bb7aeb45b
commit 041d49fb58
2 changed files with 18 additions and 13 deletions
--- a/tests/kernel/gemmini_mmio/gemmini_mmio.h
+++ b/tests/kernel/gemmini_mmio/gemmini_mmio.h
@@ -14,7 +14,7 @@
 #define SPAD_NUM_ROWS (SMEM_SIZE / SPAD_ROW_SIZE)
 #define SPAD_MASK (SPAD_NUM_ROWS - 1)

-#define PRINT_BUF SMEM_ADDR_END
+#define PRINT_BUF ((char *) (SMEM_ADDR_END))
 #define GEMMINI_RS1_ADDR 0xff007010
 #define GEMMINI_RS2_ADDR 0xff007018
 #define GEMMINI_INST_ADDR 0xff007000
@@ -32,7 +32,8 @@
    (((i) / DIM * (J) / DIM + (j) / DIM) * DIM * DIM + ((i) % DIM) * DIM + ((j) % DIM))

 // #define fence() { for (int i = 0; i < 10; i++) *((volatile uint32_t *) (0xFFFF0000)) = 0xdeadbeef; }
-#define fence() { while (*((volatile uint32_t *) GEMMINI_BUSY_ADDR)) asm volatile ("nop"); }
+#undef gemmini_fence
+#define gemmini_fence() do { while (*((volatile uint32_t *) GEMMINI_BUSY_ADDR)) asm volatile ("nop"); } while (0) /* spin (nop) until the word at GEMMINI_BUSY_ADDR reads zero; do/while(0) keeps `gemmini_fence();` a single statement */

 #undef ROCC_INSTRUCTION_RS1_RS2
 #define ROCC_INSTRUCTION_RS1_RS2(x, rs1, rs2, funct) { \
@@ -60,6 +61,7 @@ static void sp_tiled_matmul_full_spad_ws(const uint32_t A_sp_addr_start, const u
    a_transpose, b_transpose,
    full_C, low_D, false,
    act, 0, 0, false);
+  /*
  return;


@@ -72,7 +74,7 @@ static void sp_tiled_matmul_full_spad_ws(const uint32_t A_sp_addr_start, const u
  const int C_blocks = 1; //full_C ? 1 : (J <= MAX_BLOCK_LEN ? J : MAX_BLOCK_LEN);
  // const size_t sizeof_D = low_D ? sizeof(elem_t) : sizeof(acc_t);
  const size_t sizeof_C = full_C ? sizeof(acc_t) : sizeof(elem_t);
-  fence();
+  gemmini_fence();

  if (a_transpose || b_transpose || (I < 4)) {
    for (size_t k = 0; k < K; k++) {
@@ -140,7 +142,7 @@ static void sp_tiled_matmul_full_spad_ws(const uint32_t A_sp_addr_start, const u
            gemmini_extended_compute_accumulated(A_sp_addr + 3 * K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
          }
          if (k == K - 1) {
-            for (int x = 0; x < 3; x++) fence();
+            for (int x = 0; x < 3; x++) gemmini_fence();
            gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + (i * J + j) * DIM, 1, C_sp_addr, DIM, DIM);
            gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + ((i + 1) * J + j) * DIM, 1, C_sp_addr + J * DIM, DIM, DIM);
            gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + ((i + 2) * J + j) * DIM, 1, C_sp_addr + 2 * J * DIM, DIM, DIM);
@@ -152,7 +154,8 @@ static void sp_tiled_matmul_full_spad_ws(const uint32_t A_sp_addr_start, const u
      }
    }
  }
-  fence();
+  gemmini_fence();
+  */
 }


--- a/tests/kernel/gemmini_mmio/main.cpp
+++ b/tests/kernel/gemmini_mmio/main.cpp
@@ -3,7 +3,7 @@
 #include <vx_intrinsics.h>
 #include <vx_print.h>
 #include <vx_spawn.h>
-#include <include/gemmini.h>
+#include "include/gemmini.h"
 #include "gemmini_mmio.h"

 #define rd_cycles(x) asm volatile ("csrr %0, mcycle" : "=r" (x))
@@ -12,7 +12,7 @@ int main() {

  int cid;
  asm volatile ("csrr %0, 0xcc2" : "=r" (cid));
-  if (cid > 0) return 0;
+  if (cid > 0) vx_tmc(0);

  vx_tmc(0xff);

@@ -40,7 +40,11 @@ int main() {
  vx_tmc_one();
  gemmini_config_ld(0);
  gemmini_extended_config_ex(WEIGHT_STATIONARY, 0, 0, 1, 0, 0);
-  gemmini_config_st(DIM * 4 * J);
+  gemmini_config_st(0);
+  /* sprintf(print_buf, "A spad: 0x%x-0x%x, smem: 0x%x-%x\n", spad_A, spad_A + I * K * DIM, (uint32_t) smem_A, (uint32_t) smem_A + sizeof(float) * I * K * DIM * DIM);
+  sprintf(print_buf, "B spad: 0x%x-0x%x, smem: 0x%x-%x\n", spad_B, spad_B + K * J * DIM, (uint32_t) smem_B, (uint32_t) smem_B + sizeof(float) * K * J * DIM * DIM);
+  sprintf(print_buf, "C spad: 0x%x-0x%x, smem: 0x%x-%x\n", spad_C, spad_C + I * J * DIM, (uint32_t) smem_C, (uint32_t) smem_C + sizeof(float) * I * J * DIM * DIM); */
+
  sprintf(print_buf, "DIM %d\n", DIM);
  sprintf(print_buf, "num cores %d\n", nc);
  sprintf(print_buf, "num threads %d\n", nt);
@@ -49,9 +53,6 @@ int main() {
  sprintf(print_buf, "%d", tid);

  uint32_t start_cycles, end_cycles;
-  /* sprintf(print_buf, "A spad: 0x%x-0x%x, smem: 0x%x-%x\n", spad_A, spad_A + I * K * DIM, (uint32_t) smem_A, (uint32_t) smem_A + sizeof(float) * I * K * DIM * DIM);
-  sprintf(print_buf, "B spad: 0x%x-0x%x, smem: 0x%x-%x\n", spad_B, spad_B + K * J * DIM, (uint32_t) smem_B, (uint32_t) smem_B + sizeof(float) * K * J * DIM * DIM);
-  sprintf(print_buf, "C spad: 0x%x-0x%x, smem: 0x%x-%x\n", spad_C, spad_C + I * J * DIM, (uint32_t) smem_C, (uint32_t) smem_C + sizeof(float) * I * J * DIM * DIM); */

  rd_cycles(start_cycles);
  // load A with 128->1 in row-major order
@@ -87,7 +88,7 @@ int main() {
  vx_tmc_one();
  sprintf(print_buf, "\ndata loading took %d cycles for %d floats\n", end_cycles - start_cycles, DIM * DIM * (I * K + J * K));

-  fence();
+  gemmini_fence();

  // sprintf(print_buf, "\nA in\n");
  // for (int i = 0; i < I * DIM; i++) {
@@ -113,7 +114,7 @@ int main() {
      /*no_bias=*/1, /*repeating_bias=*/0, /*act=*/NO_ACTIVATION);

  rd_cycles(fence_cycles);
-  fence();
+  gemmini_fence();
  rd_cycles(end_cycles);
  sprintf(print_buf, "gemmini cycles taken: %d, fence cycles: %d\n", end_cycles - start_cycles, end_cycles - fence_cycles);

@@ -138,5 +139,6 @@ int main() {
  }
  sprintf(print_buf, "TEST PASSED\n");

+  vx_tmc(0);
  return 0;
 }