From 3de51577ef09273c54e0695e9c2fa8e9cf536ab8 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Wed, 29 Jan 2025 17:08:32 -0800
Subject: [PATCH] Check-in gemmini headers instead of submodule

---
 .gitmodules                                 |    3 -
 gemmini/include/accumulator.h               |   24 +
 gemmini/include/character.h                 |   10 +
 gemmini/include/gemmini.h                   | 3611 +++++++++++++++++++
 gemmini/include/gemmini_counter.h           |   79 +
 gemmini/include/gemmini_nn.h                |  576 +++
 gemmini/include/gemmini_params.dim16fp16.h  |   90 +
 gemmini/include/gemmini_params.dim8fp32.h   |   92 +
 gemmini/include/gemmini_params.h            |    1 +
 gemmini/include/gemmini_testutils.h         |  285 ++
 gemmini/include/translator.h                |   13 +
 gemmini/rocc-software/.gitignore            |    3 +
 gemmini/rocc-software/CONTRIBUTING.md       |   46 +
 gemmini/rocc-software/LICENSE               |  201 ++
 gemmini/rocc-software/README.md             |    4 +
 gemmini/rocc-software/src/riscv_test_rocc.h |   28 +
 gemmini/rocc-software/src/xcustom.h         |  170 +
 tests/regression/common.mk                  |    2 +-
 third_party/gemmini-rocc-tests              |    1 -
 19 files changed, 5234 insertions(+), 5 deletions(-)
 create mode 100644 gemmini/include/accumulator.h
 create mode 100644 gemmini/include/character.h
 create mode 100644 gemmini/include/gemmini.h
 create mode 100644 gemmini/include/gemmini_counter.h
 create mode 100644 gemmini/include/gemmini_nn.h
 create mode 100644 gemmini/include/gemmini_params.dim16fp16.h
 create mode 100644 gemmini/include/gemmini_params.dim8fp32.h
 create mode 120000 gemmini/include/gemmini_params.h
 create mode 100644 gemmini/include/gemmini_testutils.h
 create mode 100644 gemmini/include/translator.h
 create mode 100644 gemmini/rocc-software/.gitignore
 create mode 100644 gemmini/rocc-software/CONTRIBUTING.md
 create mode 100644 gemmini/rocc-software/LICENSE
 create mode 100644 gemmini/rocc-software/README.md
 create mode 100644 gemmini/rocc-software/src/riscv_test_rocc.h
 create mode 100644 gemmini/rocc-software/src/xcustom.h
 delete mode 160000 third_party/gemmini-rocc-tests

diff --git a/.gitmodules b/.gitmodules
index 6bc2bb4c..af1d1a47 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,6 +7,3 @@
 [submodule "third_party/ramulator"]
 	path = third_party/ramulator
 	url = https://github.com/CMU-SAFARI/ramulator.git
-[submodule "third_party/gemmini-rocc-tests"]
-	path = third_party/gemmini-rocc-tests
-	url = https://github.com/ucb-bar/gemmini-rocc-tests
diff --git a/gemmini/include/accumulator.h b/gemmini/include/accumulator.h
new file mode 100644
index 00000000..4de3d428
--- /dev/null
+++ b/gemmini/include/accumulator.h
@@ -0,0 +1,24 @@
+// See LICENSE for license details.
+
+#ifndef SRC_MAIN_C_ACCUMULATOR_H
+#define SRC_MAIN_C_ACCUMULATOR_H
+
+#include "rocc-software/src/xcustom.h"
+
+#define k_DO_WRITE 0
+#define k_DO_READ 1
+#define k_DO_LOAD 2
+#define k_DO_ACCUM 3
+
+#define XCUSTOM_ACC 0
+
+#define doWrite(y, rocc_rd, data)                                       \
+  ROCC_INSTRUCTION(XCUSTOM_ACC, y, data, rocc_rd, k_DO_WRITE);
+#define doRead(y, rocc_rd)                                              \
+  ROCC_INSTRUCTION(XCUSTOM_ACC, y, 0, rocc_rd, k_DO_READ);
+#define doLoad(y, rocc_rd, mem_addr)                                    \
+  ROCC_INSTRUCTION(XCUSTOM_ACC, y, mem_addr, rocc_rd, k_DO_LOAD);
+#define doAccum(y, rocc_rd, data) \
+  ROCC_INSTRUCTION(XCUSTOM_ACC, y, data, rocc_rd, k_DO_ACCUM);
+
+#endif  // SRC_MAIN_C_ACCUMULATOR_H
diff --git a/gemmini/include/character.h b/gemmini/include/character.h
new file mode 100644
index 00000000..e46e21b4
--- /dev/null
+++ b/gemmini/include/character.h
@@ -0,0 +1,10 @@
+// See LICENSE for license details.
+
+#ifndef SRC_MAIN_C_CHARACTER_H
+#define SRC_MAIN_C_CHARACTER_H
+
+#include "rocc-software/src/xcustom.h"
+
+#define XCUSTOM_CHAR 2
+
+#endif  // SRC_MAIN_C_CHARACTER_H
diff --git a/gemmini/include/gemmini.h b/gemmini/include/gemmini.h
new file mode 100644
index 00000000..e76ace6b
--- /dev/null
+++ b/gemmini/include/gemmini.h
@@ -0,0 +1,3611 @@
+// See LICENSE for license details.
+
+#ifndef SRC_MAIN_C_GEMMINI_H
+#define SRC_MAIN_C_GEMMINI_H
+
+#undef abs
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <limits.h>
+#include <stdbool.h>
+
+#include "include/gemmini_params.h"
+
+#define GEMMINI_ASSERTIONS
+
+// Accelerator interface
+#include "rocc-software/src/xcustom.h"
+
+// Counter Definition
+#include "include/gemmini_counter.h"
+
+#define k_CONFIG 0
+#define k_MVIN2 1
+#define k_MVIN 2
+#define k_MVOUT 3
+#define k_COMPUTE_PRELOADED 4
+#define k_COMPUTE_ACCUMULATE 5
+#define k_PRELOAD 6
+#define k_FLUSH 7
+
+#define k_LOOP_WS 8
+#define k_LOOP_WS_CONFIG_BOUNDS 9
+#define k_LOOP_WS_CONFIG_ADDRS_AB 10
+#define k_LOOP_WS_CONFIG_ADDRS_DC 11
+#define k_LOOP_WS_CONFIG_STRIDES_AB 12
+#define k_LOOP_WS_CONFIG_STRIDES_DC 13
+
+#define k_MVIN3 14
+
+#define k_COUNTER 126
+
+#define k_LOOP_CONV_WS 15
+#define k_LOOP_CONV_WS_CONFIG_1 16
+#define k_LOOP_CONV_WS_CONFIG_2 17
+#define k_LOOP_CONV_WS_CONFIG_3 18
+#define k_LOOP_CONV_WS_CONFIG_4 19
+#define k_LOOP_CONV_WS_CONFIG_5 20
+#define k_LOOP_CONV_WS_CONFIG_6 21
+
+// CLKGATE_EN: 22
+#define k_MVOUT_SPAD 23
+#define k_LOOP_WS_CONFIG_SPAD_AB 24
+#define k_LOOP_WS_CONFIG_SPAD_C 25
+
+#define CONFIG_EX 0
+#define CONFIG_LD 1
+#define CONFIG_ST 2
+#define CONFIG_BERT 3
+
+#define GARBAGE_ADDR ((uint32_t)(-1))
+#define OUTPUT_STATIONARY 0
+#define WEIGHT_STATIONARY 1
+
+#define NO_ACTIVATION 0
+#define RELU 1
+#define LAYERNORM 2
+#define IGELU 3
+#define SOFTMAX 4
+
+#ifdef ELEM_T_IS_FLOAT
+elem_t elem_t_bits_to_elem_t(elem_t_bits x) {
+    union {
+        elem_t_bits b;
+        elem_t f;
+    } un;
+
+    un.b = x;
+    return un.f;
+}
+
+elem_t_bits elem_t_to_elem_t_bits(elem_t x) {
+    union {
+        elem_t_bits b;
+        elem_t f;
+    } un;
+
+    un.f = x;
+    return un.b;
+}
+
+acc_t acc_t_bits_to_acc_t(acc_t_bits x) {
+    union {
+        acc_t_bits b;
+        acc_t f;
+    } un;
+
+    un.b = x;
+    return un.f;
+}
+
+acc_t_bits acc_t_to_acc_t_bits(acc_t x) {
+    union {
+        acc_t_bits b;
+        acc_t f;
+    } un;
+
+    un.f = x;
+    return un.b;
+}
+
+bool elem_t_isnan(elem_t x) {
+    elem_t_bits bits = elem_t_to_elem_t_bits(x);
+    uint64_t exp = (bits >> (ELEM_T_SIG_BITS-1)) & (((uint64_t)1 << ELEM_T_EXP_BITS) - 1);
+    uint64_t sig = bits & (((uint64_t)1 << ELEM_T_SIG_BITS) - 1);
+    bool is_nan_or_inf = exp == (((uint64_t)1 << ELEM_T_EXP_BITS) - 1);
+    bool is_not_inf = sig != 0;
+    return is_nan_or_inf && is_not_inf;
+}
+
+bool acc_t_isnan(acc_t x) {
+    acc_t_bits bits = acc_t_to_acc_t_bits(x);
+    uint64_t exp = (bits >> (ACC_T_SIG_BITS-1)) & (((uint64_t)1 << ACC_T_EXP_BITS) - 1);
+    uint64_t sig = bits & (((uint64_t)1 << ACC_T_SIG_BITS) - 1);
+    bool is_nan_or_inf = exp == (((uint64_t)1 << ACC_T_EXP_BITS) - 1);
+    bool is_not_inf = sig != 0;
+    return is_nan_or_inf && is_not_inf;
+}
+#endif
+
+#ifdef HAS_MVIN_SCALE
+static scale_t scale_t_bits_to_scale_t(scale_t_bits x) {
+    union {
+        scale_t_bits b;
+        scale_t f;
+    } un;
+
+    un.b = x;
+    return un.f;
+}
+
+static scale_t_bits scale_t_to_scale_t_bits(scale_t x) {
+    union {
+        scale_t_bits b;
+        scale_t f;
+    } un;
+
+    un.f = x;
+    return un.b;
+}
+#else
+#define scale_t_to_scale_t_bits(x) 0
+#endif
+
+#ifdef HAS_MVIN_ACC_SCALE
+static scale_acc_t scale_acc_t_bits_to_scale_acc_t(scale_acc_t_bits x) {
+    union {
+        scale_acc_t_bits b;
+        scale_acc_t f;
+    } un;
+
+    un.b = x;
+    return un.f;
+}
+
+static scale_acc_t_bits scale_acc_t_to_scale_acc_t_bits(scale_acc_t x) {
+    union {
+        scale_acc_t_bits b;
+        scale_acc_t f;
+    } un;
+
+    un.f = x;
+    return un.b;
+}
+#endif
+
+static acc_scale_t acc_scale_t_bits_to_acc_scale_t(acc_scale_t_bits x) {
+    union {
+        acc_scale_t_bits b;
+        acc_scale_t f;
+    } un;
+
+    un.b = x;
+    return un.f;
+}
+
+static acc_scale_t_bits acc_scale_t_to_acc_scale_t_bits(acc_scale_t x) {
+    union {
+        acc_scale_t_bits b;
+        acc_scale_t f;
+    } un;
+
+    un.f = x;
+    return un.b;
+}
+
+#define ROCC_INSTRUCTION_RS1_RS2(x, rs1, rs2, funct) \
+  ROCC_INSTRUCTION_0_R_R(x, rs1, rs2, funct)
+
+// mvin and mvout
+#define gemmini_extended_mvin(dram_addr, spad_addr, cols, rows) \
+  ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, dram_addr, ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (spad_addr), k_MVIN)
+
+#define gemmini_extended_mvin2(dram_addr, spad_addr, cols, rows) \
+  ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, dram_addr, ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (spad_addr), k_MVIN2)
+
+#define gemmini_extended_mvin3(dram_addr, spad_addr, cols, rows) \
+  ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, dram_addr, ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (spad_addr), k_MVIN3)
+
+#define gemmini_block_mvin(dram_addr, spad_addr, len) \
+  gemmini_extended_mvin(dram_addr, spad_addr, (len) * DIM, DIM)
+
+#define gemmini_mvin(dram_addr, spad_addr) \
+  gemmini_extended_mvin(dram_addr, spad_addr, DIM, DIM)
+
+#define gemmini_extended_mvout(dram_addr, spad_addr, cols, rows) \
+  ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, dram_addr, ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (uint64_t)(spad_addr), k_MVOUT)
+
+#define gemmini_extended_mvout_spad(dst_addr, dst_stride, src_addr, cols, rows) \
+  ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(dst_stride) << 32) | (uint64_t)(dst_addr), ((uint64_t)(rows) << (ADDR_LEN + 16)) | ((uint64_t)(cols) << ADDR_LEN) | (uint64_t)(src_addr), k_MVOUT_SPAD)
+
+#define gemmini_mvout_spad(dst_addr, src_addr) \
+  gemmini_extended_mvout_spad(dst_addr, 1, src_addr, DIM, DIM)
+
+#define gemmini_mvout(dram_addr, spad_addr) \
+  gemmini_extended_mvout(dram_addr, spad_addr, DIM, DIM)
+
+// compute
+#define gemmini_extended_compute_preloaded(A, BD, A_cols, A_rows, BD_cols, BD_rows) \
+  ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_PRELOADED)
+
+#define gemmini_extended_compute_accumulated(A, BD, A_cols, A_rows, BD_cols, BD_rows) \
+  ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(A_rows) << (ADDR_LEN + 16)) | ((uint64_t)(A_cols) << ADDR_LEN) | (uint64_t)(A), ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), k_COMPUTE_ACCUMULATE)
+
+#define gemmini_compute_preloaded(A, BD) \
+  gemmini_extended_compute_preloaded(A, BD, DIM, DIM, DIM, DIM)
+
+#define gemmini_compute_accumulated(A, BD) \
+  gemmini_extended_compute_accumulated(A, BD, DIM, DIM, DIM, DIM)
+
+// preload
+#define gemmini_extended_preload(BD, C, BD_cols, BD_rows, C_cols, C_rows) \
+  ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(BD_rows) << (ADDR_LEN + 16)) | ((uint64_t)(BD_cols) << ADDR_LEN) | (uint64_t)(BD), ((uint64_t)(C_rows) << (ADDR_LEN + 16)) | ((uint64_t)(C_cols) << ADDR_LEN) | (uint64_t)(C), k_PRELOAD)
+
+#define gemmini_preload(BD, C) \
+  gemmini_extended_preload(BD, C, DIM, DIM, DIM, DIM)
+
+#define gemmini_preload_zeros(C) \
+  gemmini_preload(GARBAGE_ADDR, C)
+
+// config
+#define gemmini_extended3_config_ex(dataflow, sys_act, sys_shift, sys_acc_scale, C_stride, A_stride, A_transpose, B_transpose, set_only_strides) \
+    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)acc_scale_t_to_acc_scale_t_bits((acc_scale_t)sys_acc_scale) << 32) | ((uint64_t)(A_stride) << 16) | (B_transpose << 9) | (A_transpose << 8) | ((set_only_strides) << 7) | ((sys_act) << 3) | ((dataflow) << 2) | CONFIG_EX, ((uint64_t)(C_stride) << 48) | (sys_shift), k_CONFIG); \
+
+#define gemmini_extended2_config_ex(dataflow, sys_act, sys_shift, A_stride, A_transpose, B_transpose) \
+  gemmini_extended3_config_ex(dataflow, sys_act, sys_shift, ACC_SCALE_IDENTITY, 1, A_stride, A_transpose, B_transpose, false)
+
+#define gemmini_extended_config_ex(dataflow, sys_act, sys_shift, A_stride, A_transpose, B_transpose) \
+  gemmini_extended2_config_ex(dataflow, sys_act, sys_shift, A_stride, A_transpose, B_transpose)
+
+#define gemmini_config_ex(dataflow, sys_act, sys_shift) \
+    gemmini_extended_config_ex(dataflow, sys_act, sys_shift, 1, 0, 0)
+
+// Note: The "pixel_repeats" parameter below is still experimental, andthere is
+// a high chance that it will be removed in future releases.
+#define gemmini_extended5_config_ld(stride, scale, shrunk, block_mvin_stride, pixel_repeats, id) \
+  ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(scale_t_to_scale_t_bits(scale)) << 32) | ((uint64_t)(block_mvin_stride) << 16) | ((uint64_t)(pixel_repeats) << 8) | ((id) << 3) | ((shrunk) << 2) | CONFIG_LD, stride, k_CONFIG)
+
+#define gemmini_extended4_config_ld(stride, scale, shrunk, block_mvin_stride, id) \
+  gemmini_extended5_config_ld(stride, scale, shrunk, block_mvin_stride, 1, id) \
+
+#define gemmini_extended3_config_ld(stride, scale, shrunk, id) \
+  gemmini_extended4_config_ld(stride, scale, shrunk, DIM, id)
+
+#define gemmini_extended2_config_ld(stride, scale, shrunk) \
+  gemmini_extended3_config_ld(stride, scale, shrunk, 0)
+
+#define gemmini_extended_config_ld(stride, scale) \
+  gemmini_extended2_config_ld(stride, scale, false)
+
+#define gemmini_config_ld(stride) \
+  gemmini_extended_config_ld(stride, MVIN_SCALE_IDENTITY)
+
+#define gemmini_extended2_config_st(stride, acc_act, acc_scale, pool_stride, pool_size, pool_out_dim, porows, pocols, orows, ocols, upad, lpad) \
+  ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(ocols) << 56) | ((uint64_t)(orows) << 48) | ((uint64_t)(pocols) << 40) | ((uint64_t)(porows) << 32) | ((uint64_t)(pool_out_dim) << 24) | ((uint64_t)(lpad) << 10) | ((uint64_t)(upad) << 8) | ((uint64_t)(pool_size) << 6) | ((uint64_t)(pool_stride) << 4) | ((uint64_t)(acc_act) << 2) | CONFIG_ST, ((uint64_t)acc_scale_t_to_acc_scale_t_bits((acc_scale_t)acc_scale) << 32) | ((uint32_t)stride), k_CONFIG)
+
+#define gemmini_extended_config_st(stride, acc_act, acc_scale) \
+    gemmini_extended2_config_st(stride, acc_act, acc_scale, 0, 0, 0, 0, 0, 0, 0, 0, 0)
+
+#define gemmini_config_st(stride) \
+    gemmini_extended_config_st(stride, NO_ACTIVATION, ACC_SCALE_IDENTITY)
+
+#define gemmini_config_norm(q_const, q_const_type, set_stats_id_only, act_msb, stat_id, igelu_qb, igelu_qc) \
+    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, (((uint64_t) ((uint32_t) q_const)) << 32) | ((q_const_type & 1) << 18) | ((set_stats_id_only & 1) << 17) | ((act_msb & 1) << 16) | ((uint64_t)stat_id << 8) | CONFIG_BERT, ((uint64_t)((uint32_t)(igelu_qc)) << 32) | ((uint64_t)((uint32_t)(igelu_qb))), k_CONFIG)
+
+// flush
+#define gemmini_flush(skip) \
+  ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, skip, 0, k_FLUSH)
+
+// fence
+#define gemmini_fence() asm volatile("fence")
+
+// Counter access
+#define gemmini_counter_access(rd, config_reg) \
+  { \
+    uint32_t _placeholder; \
+    ROCC_INSTRUCTION(XCUSTOM_ACC, rd, config_reg, _placeholder, k_COUNTER) \
+  }
+
+// Read counter
+static uint32_t counter_read(size_t index) {
+  uint32_t config_reg = (index & 0x7) << 4;
+  uint32_t res;
+  gemmini_counter_access(res, config_reg);
+  return res;
+}
+
+// Configure counter to take a new signal
+static void counter_configure(size_t index, size_t counter_code) {
+  int non_incremental = counter_code > INCREMENTAL_COUNTERS;
+  if (non_incremental) {
+    counter_code -= INCREMENTAL_COUNTERS;
+  }
+
+  uint32_t config_reg = (index & 0x7) << 4 | 0x8 | (counter_code & 0x3f) << 12 | non_incremental << 31;
+  uint32_t placeholder;
+  gemmini_counter_access(placeholder, config_reg);
+}
+
+// Take a snapshot
+static void counter_snapshot_take() {
+  uint32_t config_reg = 0x4;
+  uint32_t placeholder;
+  gemmini_counter_access(placeholder, config_reg);
+}
+
+// Counter snapshot reset
+static void counter_snapshot_reset() {
+  uint32_t config_reg = 0x2;
+  uint32_t placeholder;
+  gemmini_counter_access(placeholder, config_reg);
+}
+
+// Counter module reset
+static void counter_reset() {
+  uint32_t config_reg = 0x1;
+  uint32_t placeholder;
+  gemmini_counter_access(placeholder, config_reg);
+}
+
+int ceil_divide_int(int a, int b){
+    int c = (a % b == 0) ? ((int)(a/b)) :(((int)(a/b)) + 1); 
+    if(a < b) c = 1;
+    return c;
+}
+
+// weight-stationary matmul loop
+#define gemmini_loop_ws(I, J, K, pad_I, pad_J, pad_K, A, B, D, C, A_stride, B_stride, D_stride, C_stride, A_transpose, B_transpose, full_C, low_D, ex_accumulate, act, a_spad_id, b_spad_id, is_resadd) \
+  { \
+    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(pad_K) << 32) | ((uint64_t)(pad_J) << 16) | (uint64_t)(pad_I), ((uint64_t)(K) << 32) | ((uint64_t)(J) << 16) | (uint64_t)(I), k_LOOP_WS_CONFIG_BOUNDS) \
+    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, A, B, k_LOOP_WS_CONFIG_ADDRS_AB) \
+    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, D, C, k_LOOP_WS_CONFIG_ADDRS_DC) \
+    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, A_stride, B_stride, k_LOOP_WS_CONFIG_STRIDES_AB) \
+    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, D_stride, C_stride, k_LOOP_WS_CONFIG_STRIDES_DC) \
+    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(a_spad_id) << 18) | ((uint64_t)(b_spad_id) << 16) | ((uint64_t)(act) << 8) | ((low_D) << 2) | ((full_C) << 1) | (ex_accumulate), ((is_resadd) << 2) | ((B_transpose) << 1) | (A_transpose), k_LOOP_WS) \
+  }
+
+#define gemmini_loop_ws_spad(I, J, K, pad_I, pad_J, pad_K, A, B, D, C, A_transpose, B_transpose, full_C, low_D, ex_accumulate, act, a_spad_id, b_spad_id, is_resadd, skips) \
+  { \
+    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(pad_K) << 32) | ((uint64_t)(pad_J) << 16) | (uint64_t)(pad_I), ((uint64_t)(K) << 32) | ((uint64_t)(J) << 16) | (uint64_t)(I), k_LOOP_WS_CONFIG_BOUNDS) \
+    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, A, B, k_LOOP_WS_CONFIG_SPAD_AB) \
+    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(a_spad_id) << 18) | ((uint64_t)(b_spad_id) << 16) | ((uint64_t)(act) << 8) | ((low_D) << 2) | ((full_C) << 1) | (ex_accumulate), ((uint64_t)(C) << 32) | 0x200U | (skips) | ((is_resadd) << 2) | ((B_transpose) << 1) | (A_transpose), k_LOOP_WS) \
+  }
+
+// weight-stationary conv loop
+#define gemmini_loop_conv_ws(batch_size, in_row_dim, in_col_dim, in_channels, out_channels, out_row_dim, out_col_dim, pool_out_row_dim, pool_out_col_dim, stride, padding, kernel_dim, kernel_dilation, pool_size, pool_stride, pool_padding, batches, porows, pocols, pochs, krows, kcols, kchs, lpad, rpad, upad, dpad, plpad, prpad, pupad, pdpad, orows, ocols, weights, output, bias, input, no_bias, no_pool, downsample, wrot180, input_dilated, activation, trans_output_1203, trans_weight_1203, trans_weight_0132, trans_input_3120, max_pixels_per_row, in_stride, weight_stride, out_stride, dw, a_spad_id, b_spad_id) \
+  { \
+    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(out_channels) << 48) | ((uint64_t)(in_channels) << 32) | ((uint64_t)(in_row_dim) << 16) | (uint64_t)(batch_size), \
+      ((uint64_t)(padding) << 56) | ((uint64_t)(stride) << 48) | ((uint64_t)(out_col_dim) << 32) | ((uint64_t)(pool_out_row_dim) << 16) | (uint64_t)(out_row_dim), k_LOOP_CONV_WS_CONFIG_1) \
+    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(kernel_dim) << 48) | ((uint64_t)(pool_out_col_dim) << 32) | ((uint64_t)(pool_size) << 16) | ((uint64_t)(pool_stride) << 8) | (uint64_t)(pool_padding), \
+      ((uint64_t)(batches) << 48) | ((uint64_t)(porows) << 32) | ((uint64_t)(pocols) << 16) | (uint64_t)(pochs), k_LOOP_CONV_WS_CONFIG_2) \
+    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(krows) << 48) | ((uint64_t)(kcols) << 32) | ((uint64_t)(kchs) << 16) | (uint64_t)(lpad), \
+      ((uint64_t)(rpad) << 48) | ((uint64_t)(upad) << 32) | ((uint64_t)(dpad) << 24) | ((uint64_t)(plpad) << 16) | ((uint64_t)(in_col_dim)), k_LOOP_CONV_WS_CONFIG_3) \
+    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(orows) << 48) | ((uint64_t)(prpad) << 32) | ((uint64_t)(pupad) << 21) | ((uint64_t)(pdpad) << 10) | (uint64_t)(kernel_dilation), \
+      ((uint64_t)(in_stride) << 48) | ((uint64_t)(weight_stride) << 32) | ((uint64_t)(out_stride) << 16) | (uint64_t)(ocols), k_LOOP_CONV_WS_CONFIG_4) \
+    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, weights, \
+      output, k_LOOP_CONV_WS_CONFIG_5) \
+    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, bias, \
+      input, k_LOOP_CONV_WS_CONFIG_6) \
+    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(a_spad_id) << 18) | ((uint64_t)(b_spad_id) << 16) | ((uint64_t)(max_pixels_per_row) << 8) | ((dw) << 6) | ((trans_input_3120) << 5) | ((trans_weight_0132) << 4) | ((trans_weight_1203) << 3) | ((trans_output_1203) << 2) | ((wrot180) << 1) | (no_bias), \
+      ((activation) << 3)| ((input_dilated) << 2) | ((downsample) << 1) | (no_pool), \
+      k_LOOP_CONV_WS) \
+  }
+
+// Tiling functions
+static void sp_tiled_matmul_os(const elem_t * A, const elem_t * B, const void * D, void * C,
+        scale_t A_scale_factor, scale_t B_scale_factor, scale_acc_t D_scale_factor,
+        size_t I, size_t J, size_t K, size_t pad_I, size_t pad_J, size_t pad_K,
+        size_t A_row_stride, size_t B_row_stride, size_t D_row_stride, size_t C_row_stride,
+        bool a_transpose, bool b_transpose,
+        bool full_C, bool low_D,
+        bool no_bias, bool repeating_bias,
+        int act,
+        int a_spad_id, int b_spad_id) {
+
+  const uint32_t A_sp_addr_start = 0;
+  const uint32_t B_sp_addr_start = BANK_NUM * BANK_ROWS - K * J * DIM;
+  const uint32_t D_sp_addr_start = 1 << (ADDR_LEN-1);
+  const uint32_t C_sp_addr_start = (3 << (ADDR_LEN-2)) | (full_C << (ADDR_LEN-3));
+
+  const int A_blocks = K <= MAX_BLOCK_LEN ? K : MAX_BLOCK_LEN;
+  const int B_blocks = J <= MAX_BLOCK_LEN ? J : MAX_BLOCK_LEN;
+  const int D_blocks = J <= MAX_BLOCK_LEN_ACC ? J : MAX_BLOCK_LEN_ACC;
+
+  // Move-in D
+  if (D != NULL && !no_bias) {
+    const size_t D_stride = repeating_bias ? 0 : D_row_stride * sizeof(acc_t);
+    gemmini_extended_config_ld(D_stride, D_scale_factor);
+
+    for (size_t i = 0; i < I; i++) {
+      for (size_t j = 0; j < J; j += D_blocks) {
+        const size_t bias_row = repeating_bias ? 0 : i;
+        const acc_t * const D_dram_addr = (acc_t *)D + (bias_row * D_row_stride + j)*DIM;
+
+        const uint32_t D_sp_addr_acc = D_sp_addr_start + (i*J + j)*DIM;
+
+        const size_t blocks = j + D_blocks <= J ? D_blocks : J-j;
+
+        const size_t cols = blocks * DIM - (j + blocks >= J ? pad_J : 0);
+        const size_t rows = DIM - (i == I-1 ? pad_I : 0);
+
+        gemmini_extended_mvin(D_dram_addr, D_sp_addr_acc, cols, rows);
+      }
+    }
+  }
+
+  // Move-in B
+  gemmini_extended_config_ld(B_row_stride * sizeof(elem_t), B_scale_factor);
+  for (size_t j = 0; j < J; j += B_blocks) {
+    for (size_t k = 0; k < K; k++) {
+      const elem_t * const B_dram_addr = B + (k*B_row_stride + j)*DIM;
+      const uint32_t B_sp_addr = B_sp_addr_start + (k*J + j)*DIM;
+      const size_t blocks = j + B_blocks <= J ? B_blocks : J-j;
+      const size_t cols = blocks * DIM - (j + blocks >= J ? pad_J : 0);
+      const size_t rows = DIM - (k == K-1 ? pad_K : 0);
+      gemmini_extended_mvin(B_dram_addr, B_sp_addr, cols, rows);
+    }
+  }
+
+  // Move-in A
+  gemmini_extended_config_ld(A_row_stride * sizeof(elem_t), A_scale_factor);
+  for (size_t i = 0; i < I; i++) {
+    for (size_t k = 0; k < K; k += A_blocks) {
+      const elem_t * const A_dram_addr = A + (i*A_row_stride + k)*DIM;
+      const uint32_t A_sp_addr = A_sp_addr_start + (i*K + k)*DIM;
+      const size_t blocks = k + A_blocks <= K ? A_blocks : K-k;
+      const size_t cols = blocks * DIM - (k + blocks >= K ? pad_K : 0);
+      const size_t rows = DIM - (i == I-1 ? pad_I : 0);
+      gemmini_extended_mvin(A_dram_addr, A_sp_addr, cols, rows);
+    }
+  }
+
+  for (size_t i = 0; i < I; i++) {
+    for (size_t j = 0; j < J; j++) {
+      const uint32_t C_sp_addr = C_sp_addr_start + (i*J + j)*DIM;
+
+      for (size_t k = 0; k < K; k++) {
+
+        const uint32_t A_sp_addr = A_sp_addr_start + (i*K + k)*DIM;
+        const uint32_t B_sp_addr = B_sp_addr_start + (k*J + j)*DIM;
+
+        uint32_t out_sp_addr = k == K-1 ? C_sp_addr : GARBAGE_ADDR;
+
+        // If we're not using a bias, then we want to overwrite what's in the
+        // accumulator, rather than writing over it
+        int no_bias_new_matrix = no_bias && D != NULL && k == K-1;
+        if (no_bias_new_matrix) {
+          out_sp_addr &= ~(1 << (ADDR_LEN-2));
+        }
+
+        const size_t A_cols = DIM - (k == K - 1 ? pad_K : 0);
+        const size_t A_rows = DIM - (i == I - 1 ? pad_I : 0);
+        const size_t B_cols = DIM - (j == J - 1 ? pad_J : 0);
+        const size_t B_rows = DIM - (k == K - 1 ? pad_K : 0);
+        const size_t C_cols = DIM - (j == J - 1 ? pad_J : 0);
+        const size_t C_rows = DIM - (i == I - 1 ? pad_I : 0);
+
+        gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr, DIM, DIM, C_cols, C_rows);
+
+        if (k == 0) { // First iteration
+          gemmini_extended_compute_preloaded(A_sp_addr, B_sp_addr, A_cols, A_rows, B_cols, B_rows);
+        } else { // All other iterations
+          gemmini_extended_compute_accumulated(A_sp_addr, B_sp_addr, A_cols, A_rows, B_cols, B_rows);
+        }
+      }
+    }
+  }
+
+  // Move-out C
+  if (C != NULL) {
+    const size_t sizeof_C = full_C ? sizeof(acc_t) : sizeof(elem_t);
+
+    for (size_t i = 0; i < I; i++) {
+      for (size_t j = 0; j < J; j++) {
+        void * const C_dram_addr = (int8_t*)C + (i*C_row_stride + j)*DIM*sizeof_C;
+        const uint32_t C_sp_addr = C_sp_addr_start + (i*J + j)*DIM;
+
+        const size_t C_cols = DIM - (j == J - 1 ? pad_J : 0);
+        const size_t C_rows = DIM - (i == I - 1 ? pad_I : 0);
+
+        gemmini_extended_mvout(C_dram_addr, C_sp_addr, C_cols, C_rows);
+      }
+    }
+  }
+}
+
+
+static void sp_tiled_matmul_ws(const elem_t * A, const elem_t * B,
+        const void * D, void * C,
+        scale_t A_scale_factor, scale_t B_scale_factor, scale_acc_t D_scale_factor,
+        size_t I, size_t J, size_t K, size_t pad_I, size_t pad_J, size_t pad_K,
+        size_t A_row_stride, size_t B_row_stride, size_t D_row_stride, size_t C_row_stride,
+        bool a_transpose, bool b_transpose,
+        bool full_C, bool low_D,
+        bool no_bias, bool repeating_bias,
+        int act,
+        int a_spad_id, int b_spad_id) {
+/*
+  const uint32_t A_sp_addr_start = 0;
+  const uint32_t B_sp_addr_start = BANK_NUM * BANK_ROWS - K * J * DIM;
+  const uint32_t D_sp_addr_start = 1 << (ADDR_LEN-1);
+  const uint32_t C_sp_addr_start = 3 << (ADDR_LEN-2) | (full_C << (ADDR_LEN-3));
+  const int A_blocks = a_transpose ? (I <= MAX_BLOCK_LEN ? I : MAX_BLOCK_LEN) :
+    (K <= MAX_BLOCK_LEN ? K : MAX_BLOCK_LEN);
+  const int B_blocks = b_transpose ? (K <= MAX_BLOCK_LEN ? K : MAX_BLOCK_LEN) :
+    (J <= MAX_BLOCK_LEN ? J : MAX_BLOCK_LEN);
+  const int D_blocks = low_D ? (J <= MAX_BLOCK_LEN ? J : MAX_BLOCK_LEN) :
+    (J <= MAX_BLOCK_LEN_ACC ? J : MAX_BLOCK_LEN_ACC);
+  const int C_blocks = full_C ? 1 : (J <= MAX_BLOCK_LEN ? J : MAX_BLOCK_LEN);
+  const size_t sizeof_D = low_D ? sizeof(elem_t) : sizeof(acc_t);
+  const size_t sizeof_C = full_C ? sizeof(acc_t) : sizeof(elem_t);
+  // Move-in D
+  if (D != NULL && !no_bias) {
+    for (size_t i = 0; i < I; i++) {
+      const size_t rows = DIM - (i == I-1 ? pad_I : 0);
+      for (size_t j = 0; j < J; j += D_blocks) {
+        const size_t bias_row = repeating_bias ? 0 : i;
+        const void * const D_dram_addr = (int8_t *)D + (bias_row * D_row_stride + j)*DIM*sizeof_D;
+        const uint32_t D_sp_addr_acc = D_sp_addr_start + (i*J + j)*DIM;
+        size_t blocks = j + D_blocks <= J ? D_blocks : J-j;
+        const size_t cols = blocks * DIM - (j + blocks >= J ? pad_J : 0);
+        gemmini_extended_mvin3(D_dram_addr, D_sp_addr_acc, cols, rows);
+      }
+    }
+  }
+  for (size_t k = 0; k < K; k++) {
+    for (size_t j = 0; j < J; j++) {
+      for (size_t i = 0; i < I; i++) {
+        const uint32_t A_sp_addr = a_transpose ? (A_sp_addr_start + (k*I + i)*DIM) :
+          (A_sp_addr_start + (i*K + k)*DIM);
+        const uint32_t B_sp_addr = b_transpose ? (B_sp_addr_start + (j*K + k)*DIM) :
+          (B_sp_addr_start + (k*J + j)*DIM);
+        const uint32_t C_sp_addr = C_sp_addr_start + (i*J + j)*DIM;
+        // Mvin A
+        if (a_transpose) {
+          if (j == 0 && i % A_blocks == 0) {
+            const elem_t * const A_dram_addr = A + (k*A_row_stride + i)*DIM;
+            const size_t blocks = i + A_blocks <= I ? A_blocks : I-i;
+            const size_t cols = blocks * DIM - (i + blocks >= I ? pad_I : 0);
+            const size_t rows = DIM - (k == K-1 ? pad_K : 0);
+            gemmini_extended_mvin(A_dram_addr, A_sp_addr, cols, rows);
+          }
+        } else {
+          if (j == 0 && k % A_blocks == 0) {
+            const elem_t * const A_dram_addr = A + (i*A_row_stride + k)*DIM;
+            const size_t blocks = k + A_blocks <= K ? A_blocks : K-k;
+            const size_t cols = blocks * DIM - (k + blocks >= K ? pad_K : 0);
+            const size_t rows = DIM - (i == I-1 ? pad_I : 0);
+            gemmini_extended_mvin(A_dram_addr, A_sp_addr, cols, rows);
+          }
+        }
+        // Mvin B
+        if (b_transpose) {
+          if (i == 0 && k % B_blocks == 0) {
+            const elem_t * const B_dram_addr = B + (j*B_row_stride + k)*DIM;
+            const size_t blocks = k + B_blocks <= K ? B_blocks : K-k;
+            const size_t cols = blocks * DIM - (k + blocks >= K ? pad_K : 0);
+            const size_t rows = DIM - (j == J-1 ? pad_J : 0);
+            gemmini_extended_mvin2(B_dram_addr, B_sp_addr, cols, rows);
+          }
+        } else {
+          if (i == 0 && j % B_blocks == 0) {
+            const elem_t * const B_dram_addr = B + (k*B_row_stride + j)*DIM;
+            const size_t blocks = j + B_blocks <= J ? B_blocks : J-j;
+            const size_t cols = blocks * DIM - (j + blocks >= J ? pad_J : 0);
+            const size_t rows = DIM - (k == K-1 ? pad_K : 0);
+            gemmini_extended_mvin2(B_dram_addr, B_sp_addr, cols, rows);
+          }
+        }
+        // Compute
+        {
+          uint32_t pre_sp_addr = i == 0 ? B_sp_addr : GARBAGE_ADDR;
+          uint32_t out_sp_addr = C_sp_addr;
+          // If we're not using a bias, then we want to overwrite what's in the
+          // accumulator, rather than writing over it
+          int no_bias_new_matrix = no_bias && D != NULL && k == 0;
+          if (no_bias_new_matrix) {
+            out_sp_addr &= ~(1 << (ADDR_LEN-2));
+          }
+          const size_t A_cols = DIM - (k == K - 1 ? pad_K : 0);
+          const size_t A_rows = DIM - (i == I - 1 ? pad_I : 0);
+          const size_t B_cols = DIM - (j == J - 1 ? pad_J : 0);
+          const size_t B_rows = DIM - (k == K - 1 ? pad_K : 0);
+          const size_t C_cols = DIM - (j == J - 1 ? pad_J : 0);
+          const size_t C_rows = DIM - (i == I - 1 ? pad_I : 0);
+          gemmini_extended_preload(pre_sp_addr, out_sp_addr, B_cols, B_rows, C_cols, C_rows);
+          if (i == 0) { // First iteration
+            gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, A_cols, A_rows, DIM, DIM);
+          } else { // All other iterations
+            gemmini_extended_compute_accumulated(A_sp_addr, GARBAGE_ADDR, A_cols, A_rows, DIM, DIM);
+          }
+        }
+        if (C != NULL && k == K-1) {
+          // Move-out C (if not normalizing)
+          if (((act != LAYERNORM) && (act != SOFTMAX)) && (j == J-1 || j % C_blocks == C_blocks-1)) {
+            const size_t rounded_j = (j / C_blocks) * C_blocks;
+            const uint32_t rounded_C_sp_addr = C_sp_addr_start + (i*J + rounded_j)*DIM;
+            void * const C_dram_addr = (int8_t*)C + (i*C_row_stride + rounded_j)*DIM*sizeof_C;
+            const size_t blocks = rounded_j + C_blocks <= J ? C_blocks : J-rounded_j;
+            const size_t cols = blocks * DIM - (rounded_j + blocks >= J ? pad_J : 0);
+            const size_t rows = DIM - (i == I - 1 ? pad_I : 0);
+            gemmini_extended_mvout(C_dram_addr, rounded_C_sp_addr, cols, rows);
+          }
+          // Move-out C (if normalizing)
+          if (act == LAYERNORM && j == J - 1) {
+            uint32_t norm_cmds[][2] = {{1,2},{3,4},{0,0}};
+            const int norm_cmds_size = sizeof(norm_cmds) / sizeof(norm_cmds[0]);
+            const size_t rows = DIM - (i == I-1 ? pad_I : 0);
+            for (size_t row = 0; row < rows; row += NORM_STAT_IDS) {
+              const size_t stat_ids = rows - row > NORM_STAT_IDS ?
+                NORM_STAT_IDS : rows - row;
+              for (int cmd = 0; cmd < norm_cmds_size; cmd++) {
+                for (size_t stat_id = 0; stat_id < stat_ids; stat_id++) {
+                  gemmini_config_norm(0, 0, 0, 0, stat_id, 0, 0);
+                  const size_t r = row + stat_id;
+                  for (size_t jj = 0; jj < J; jj += C_blocks) {
+                    uint32_t norm_C_sp_addr = C_sp_addr_start + (i*J + jj)*DIM + r;
+                    if (jj + C_blocks >= J) {
+                      norm_C_sp_addr |= (norm_cmds[cmd][1] << 26); // Final mean/inv-std-dev calculation
+                    } else {
+                      norm_C_sp_addr |= (norm_cmds[cmd][0] << 26); // Accumulate sum/variance
+                    }
+                    void * const C_dram_addr = (int8_t*)C +
+                      (i*C_row_stride + jj) * DIM * sizeof_C +
+                      r * C_row_stride * sizeof_C;
+                    const size_t blocks = jj + C_blocks <= J ? C_blocks : J-jj;
+                    const size_t cols = blocks * DIM - (jj + blocks >= J ? pad_J : 0);
+                    gemmini_extended_mvout(C_dram_addr, norm_C_sp_addr, cols, 1);
+                  }
+                }
+              }
+            }
+          } else if (act == SOFTMAX && j == J - 1) {
+            uint32_t norm_cmds[][2] = {{5,5},{6,7},{0,0}};
+            const int norm_cmds_size = sizeof(norm_cmds) / sizeof(norm_cmds[0]);
+            const size_t rows = DIM - (i == I-1 ? pad_I : 0);
+            for (size_t row = 0; row < rows; row += NORM_STAT_IDS) {
+              const size_t stat_ids = rows - row > NORM_STAT_IDS ?
+                NORM_STAT_IDS : rows - row;
+              for (int cmd = 0; cmd < norm_cmds_size; cmd++) {
+                for (size_t stat_id = 0; stat_id < stat_ids; stat_id++) {
+                  // set stat id only
+                  gemmini_config_norm(0, 0, 1, 0, stat_id, 0, 0);
+                  const size_t r = row + stat_id;
+                  for (size_t jj = 0; jj < J; jj += C_blocks) {
+                    uint32_t norm_C_sp_addr = C_sp_addr_start + (i*J + jj)*DIM + r;
+                    if (jj + C_blocks >= J) {
+                      norm_C_sp_addr |= (norm_cmds[cmd][1] << 26); // Final mean/inv-std-dev calculation
+                    } else {
+                      norm_C_sp_addr |= (norm_cmds[cmd][0] << 26); // Accumulate sum/variance
+                    }
+                    void * const C_dram_addr = (int8_t*)C +
+                      (i*C_row_stride + jj) * DIM * sizeof_C +
+                      r * C_row_stride * sizeof_C;
+                    const size_t blocks = jj + C_blocks <= J ? C_blocks : J-jj;
+                    const size_t cols = blocks * DIM - (jj + blocks >= J ? pad_J : 0);
+                    gemmini_extended_mvout(C_dram_addr, norm_C_sp_addr, cols, 1);
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+*/
+
+  // Combined loop
+  gemmini_loop_ws(I, J, K, pad_I, pad_J, pad_K, A, B, no_bias ? NULL : D, C,
+    A_row_stride, B_row_stride, repeating_bias ? 0 : D_row_stride, C_row_stride,
+    a_transpose, b_transpose,
+    full_C, low_D, !no_bias || D == NULL,
+    act, a_spad_id, b_spad_id, false);
+}
+
+
+static void tiled_matmul_outer(size_t dim_I, size_t dim_J, size_t dim_K,
+        const elem_t* A, const elem_t* B,
+        const void * D, void * C,
+        size_t stride_A, size_t stride_B, size_t stride_D, size_t stride_C,
+        scale_t A_scale_factor, scale_t B_scale_factor, scale_acc_t D_scale_factor,
+        size_t tile_I, size_t tile_J, size_t tile_K,
+        int act, acc_scale_t scale, acc_scale_t bert_scale,
+        bool repeating_bias,
+        bool a_transpose, bool b_transpose,
+        bool full_C, bool low_D,
+        uint8_t weightA,
+        int dataflow) {
+
+  const size_t dim_I_padded = (dim_I / DIM + (dim_I % DIM != 0)) * DIM;
+  const size_t dim_J_padded = (dim_J / DIM + (dim_J % DIM != 0)) * DIM;
+  const size_t dim_K_padded = (dim_K / DIM + (dim_K % DIM != 0)) * DIM;
+
+  const size_t I0 = dim_I_padded / (tile_I*DIM) + (dim_I_padded % (tile_I*DIM) != 0);
+  const size_t J0 = dim_J_padded / (tile_J*DIM) + (dim_J_padded % (tile_J*DIM) != 0);
+  const size_t K0 = dim_K_padded / (tile_K*DIM) + (dim_K_padded % (tile_K*DIM) != 0);
+
+  // These lines here are supposed to help us deal with when the dimensions of
+  // the systolic array aren't divisible by the tiling factors
+  const size_t last_I = dim_I_padded % (tile_I*DIM) == 0 ? tile_I : (dim_I_padded/DIM) % tile_I;
+  const size_t last_J = dim_J_padded % (tile_J*DIM) == 0 ? tile_J : (dim_J_padded/DIM) % tile_J;
+  const size_t last_K = dim_K_padded % (tile_K*DIM) == 0 ? tile_K : (dim_K_padded/DIM) % tile_K;
+
+  // These lines are supposed to figure out how much padding the hardware is
+  // supposed to add for the final tile
+  const size_t padding_I = dim_I_padded - dim_I;
+  const size_t padding_J = dim_J_padded - dim_J;
+  const size_t padding_K = dim_K_padded - dim_K;
+
+  const bool no_bias = D == NULL;
+
+  if (no_bias) {
+    D = (void*) 1; // Dummy address which isn't NULL
+  }
+
+  const size_t sizeof_D = low_D ? sizeof(elem_t) : sizeof(acc_t) ;
+  const size_t sizeof_C = full_C ? sizeof(acc_t) : sizeof(elem_t);
+
+  gemmini_extended_config_ex(dataflow, act & 3, 0, 1, a_transpose, b_transpose);
+  gemmini_extended_config_st(stride_C * sizeof_C, act & 3, scale);
+  gemmini_extended3_config_ld(stride_A * sizeof(elem_t), A_scale_factor, false, 0);
+  gemmini_extended3_config_ld(stride_B * sizeof(elem_t), B_scale_factor, false, 1)
+  gemmini_extended3_config_ld(repeating_bias ? 0 : (stride_D * sizeof_D), D_scale_factor, low_D, 2);
+
+  if (act == IGELU) {
+    const acc_scale_t sqrt_2 = 1.41421356237;
+    const acc_scale_t S = bert_scale;
+    const acc_scale_t S_erf = (-0.2888 * ((S*S) / 2));
+
+    const acc_t qb = -1.769 / (S / sqrt_2);
+    const acc_t qc = 1.0 / S_erf;
+
+    gemmini_config_norm(0, 0, 0, 0, 0, qb, qc);
+  }
+
+  if (act == SOFTMAX) {
+    const scale_t a = 0.3585;
+    const scale_t b = 1.353;
+    const scale_t c = 0.344;
+
+    const acc_t qln2 = (int) (0.693147 / bert_scale);
+    const acc_t qln2_inv = 65536 / qln2;
+    const acc_t qb = b / bert_scale;
+    const acc_t qc = c / (a*bert_scale*bert_scale);
+
+    gemmini_config_norm(qln2, 0, 0, 1, 0, qb, qc);
+    gemmini_config_norm(qln2_inv, 1, 0, 1, 0, qb, qc);
+  }
+
+  void (*inner)(const elem_t *, const elem_t *, const void *, void *,
+        scale_t, scale_t, scale_acc_t,
+        size_t, size_t, size_t, size_t, size_t, size_t,
+        size_t, size_t, size_t, size_t,
+        bool, bool,
+        bool, bool,
+        bool, bool,
+        int, int, int);
+
+  if (dataflow == OUTPUT_STATIONARY) {
+    inner = &sp_tiled_matmul_os;
+  } else /* if (dataflow == WEIGHT_STATIONARY) */ {
+    inner = &sp_tiled_matmul_ws;
+  }
+
+  // reuse operand if it fits scratchpad
+  int a_spad_id = 0;
+  int b_spad_id = 0;
+  bool b_reuse = (J0 * K0 <= 2) && (dataflow == WEIGHT_STATIONARY);
+  bool a_reuse = (I0 * K0 <= 2) && (dataflow == WEIGHT_STATIONARY);
+
+  for (size_t i0 = 0; i0 < I0; i0++)
+    for (size_t j0 = 0; j0 < J0; j0++)
+      for (size_t k0 = 0; k0 < K0; k0++) {
+        if(a_reuse)
+          a_spad_id = ((i0+k0) == 0) ? 1 : 2;
+        if(b_reuse)
+          b_spad_id = ((j0+k0) == 0) ? 1 : 2;
+
+        const void * pre;
+        if (k0 != 0) {
+          pre = NULL;
+        } else {
+          size_t bias_row = repeating_bias ? 0 : i0*tile_I*DIM;
+          // pre = &(((acc_t*)D)[bias_row * stride_D + j0 * tile_J * DIM]);
+          pre = (int8_t*)D + (bias_row * stride_D + j0 * tile_J * DIM)*sizeof_D;
+        }
+
+        void * out = k0 == K0-1 ? (int8_t*)C + (i0*tile_I*DIM*stride_C + j0*tile_J*DIM)*sizeof_C : NULL;
+
+        const size_t I = i0 < I0-1 ? tile_I : last_I;
+        const size_t J = j0 < J0-1 ? tile_J : last_J;
+        const size_t K = k0 < K0-1 ? tile_K : last_K;
+
+        const size_t pad_I = i0 == I0-1 ? padding_I : 0;
+        const size_t pad_J = j0 == J0-1 ? padding_J : 0;
+        const size_t pad_K = k0 == K0-1 ? padding_K : 0;
+
+        const elem_t * a = a_transpose ? (A + k0*tile_K*DIM*stride_A + i0*tile_I*DIM)
+          : (A + i0*tile_I*DIM*stride_A + k0*tile_K*DIM);
+
+        const elem_t * b = b_transpose ? (B + j0*tile_J*DIM*stride_B + k0*tile_K*DIM)
+          : (B + k0*tile_K*DIM*stride_B + j0*tile_J*DIM);
+
+        if(a_reuse && j0 >= 1) a = NULL;
+        if(b_reuse && i0 >= 1) b = NULL;
+        //printf("a_reuse: %d, b_reuse: %d, a_spad_id: %d, b_spad_id: %d, a: %llu, b: %llu \n", a_reuse, b_reuse, a_spad_id, b_spad_id, a, b);
+        (*inner)(a, b, pre, out,
+            A_scale_factor, B_scale_factor, D_scale_factor,
+            I, J, K,
+            pad_I, pad_J, pad_K,
+            stride_A, stride_B, stride_D, stride_C,
+            a_transpose, b_transpose,
+            full_C, low_D,
+            no_bias, repeating_bias,
+            act, a_spad_id, b_spad_id);
+      }
+
+  gemmini_fence();
+}
+
+
+static acc_t int_sqrt(acc_t n) {
+  if (n == 0) return 0;
+
+  int bits = 0;
+  for (acc_t x = n; x > 0; x /= 2)
+    bits++;
+
+  acc_t x_prev = 1 << ((bits + 1) / 2);
+
+  while (1) {
+    acc_t x_next = (x_prev + n / x_prev) / 2;
+    if (x_next >= x_prev) return x_prev;
+    x_prev = x_next;
+  };
+}
+
+
+static elem_t scale_and_sat(acc_t x, int act, acc_scale_t scale, acc_scale_t bert_scale) {
+  // Apply I-GELU if needed
+  if (act == IGELU) {
+    const acc_scale_t sqrt_2 = 1.41421356237;
+
+    const acc_scale_t S = bert_scale;
+
+    const acc_scale_t S_erf = (-0.2888 * (S/sqrt_2)*(S/sqrt_2));
+    const acc_t q1 = 1 / S_erf;
+    const acc_t qb = -1.769 / (S / sqrt_2);
+    const acc_t qc = 1.0 / (-0.2888 * (S / sqrt_2) * (S / sqrt_2));
+
+    const acc_t q = x;
+
+    const acc_t q_sign = q < 0 ? -1 : 1;
+    const acc_t q_clipped = abs(q) > (-qb) ? (-qb) : abs(q);
+    const acc_t q_poly = (q_clipped + qb)*(q_clipped + qb) + qc;
+    const acc_t q_erf = q_sign * q_poly;
+
+    x = q * (q_erf + q1);
+  }
+
+  // Scale value down and round it
+  x = ACC_SCALE(x, scale);
+  // Clip result
+  x = x > elem_t_max ? elem_t_max : (x < elem_t_min ? elem_t_min : x);
+  // Apply activation function
+  if (act == RELU) {
+    x = x < 0 ? 0 : x;
+  }
+  return x;
+}
+
+#ifdef HAS_MVIN_SCALE
+#define GEMMINI_SCALE(x, scale) MVIN_SCALE((x), (scale))
+#else
+#define GEMMINI_SCALE(x, scale) (x)
+#endif
+
+#ifdef HAS_MVIN_ACC_SCALE
+#define GEMMINI_ACC_SCALE(x, scale) MVIN_SCALE_ACC((x), (scale))
+#else
+#define GEMMINI_ACC_SCALE(x, scale) (x)
+#endif
+
+static void matmul_cpu(bool transA, bool transB, size_t DIM_I, size_t DIM_J, size_t DIM_K,
+        const elem_t* A, const elem_t* B, const acc_t * D,
+        elem_t* C,
+        size_t stride_A, size_t stride_B, size_t stride_D, size_t stride_C,
+        scale_t A_scale_factor, scale_t B_scale_factor, scale_acc_t D_scale_factor,
+        int act, acc_scale_t scale, acc_scale_t bert_scale, bool repeating_bias) {
+
+  const int no_bias = D == NULL;
+  if (act != LAYERNORM && act != SOFTMAX && !transA && !transB && DIM_I % 4 == 0 && DIM_J % 4 == 0) {
+    for (size_t i = 0; i < DIM_I; i += 4) {
+      for (size_t j = 0; j < DIM_J; j += 4) {
+
+        acc_t result[4][4]; // = {{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}};
+
+        for (size_t ii = 0; ii < 4; ii++)
+          for (size_t jj = 0; jj < 4; jj++) {
+            const size_t bias_row = repeating_bias ? 0 : i + ii;
+            result[ii][jj] = no_bias ? 0 :
+              GEMMINI_ACC_SCALE(*(D + bias_row*stride_D + j + jj), D_scale_factor);
+          }
+
+        for (size_t k = 0; k < DIM_K; k++) {
+          result[0][0] +=
+                GEMMINI_SCALE(*(A + i*stride_A + k), A_scale_factor) *
+                GEMMINI_SCALE(*(B + k*stride_B + j), B_scale_factor);
+          result[0][1] +=
+                GEMMINI_SCALE(*(A + i*stride_A + k), A_scale_factor) *
+                GEMMINI_SCALE(*(B + k*stride_B + j+1), B_scale_factor);
+          result[0][2] +=
+                GEMMINI_SCALE(*(A + i*stride_A + k), A_scale_factor) *
+                GEMMINI_SCALE(*(B + k*stride_B + j+2), B_scale_factor);
+          result[0][3] +=
+                GEMMINI_SCALE(*(A + i*stride_A + k), A_scale_factor) *
+                GEMMINI_SCALE(*(B + k*stride_B + j+3), B_scale_factor);
+          result[1][0] +=
+                GEMMINI_SCALE(*(A + (i+1)*stride_A + k), A_scale_factor) *
+                GEMMINI_SCALE(*(B + k*stride_B + j), B_scale_factor);
+          result[1][1] +=
+                GEMMINI_SCALE(*(A + (i+1)*stride_A + k), A_scale_factor) *
+                GEMMINI_SCALE(*(B + k*stride_B + j+1), B_scale_factor);
+          result[1][2] +=
+                GEMMINI_SCALE(*(A + (i+1)*stride_A + k), A_scale_factor) *
+                GEMMINI_SCALE(*(B + k*stride_B + j+2), B_scale_factor);
+          result[1][3] +=
+                GEMMINI_SCALE(*(A + (i+1)*stride_A + k), A_scale_factor) *
+                GEMMINI_SCALE(*(B + k*stride_B + j+3), B_scale_factor);
+          result[2][0] +=
+                GEMMINI_SCALE(*(A + (i+2)*stride_A + k), A_scale_factor) *
+                GEMMINI_SCALE(*(B + k*stride_B + j), B_scale_factor);
+          result[2][1] +=
+                GEMMINI_SCALE(*(A + (i+2)*stride_A + k), A_scale_factor) *
+                GEMMINI_SCALE(*(B + k*stride_B + j+1), B_scale_factor);
+          result[2][2] +=
+                GEMMINI_SCALE(*(A + (i+2)*stride_A + k), A_scale_factor) *
+                GEMMINI_SCALE(*(B + k*stride_B + j+2), B_scale_factor);
+          result[2][3] +=
+                GEMMINI_SCALE(*(A + (i+2)*stride_A + k), A_scale_factor) *
+                GEMMINI_SCALE(*(B + k*stride_B + j+3), B_scale_factor);
+          result[3][0] +=
+                GEMMINI_SCALE(*(A + (i+3)*stride_A + k), A_scale_factor) *
+                GEMMINI_SCALE(*(B + k*stride_B + j), B_scale_factor);
+          result[3][1] +=
+                GEMMINI_SCALE(*(A + (i+3)*stride_A + k), A_scale_factor) *
+                GEMMINI_SCALE(*(B + k*stride_B + j+1), B_scale_factor);
+          result[3][2] +=
+                GEMMINI_SCALE(*(A + (i+3)*stride_A + k), A_scale_factor) *
+                GEMMINI_SCALE(*(B + k*stride_B + j+2), B_scale_factor);
+          result[3][3] +=
+                GEMMINI_SCALE(*(A + (i+3)*stride_A + k), A_scale_factor) *
+                GEMMINI_SCALE(*(B + k*stride_B + j+3), B_scale_factor);
+        }
+
+        *(C + i*stride_C + j) =
+             scale_and_sat(result[0][0], act, scale, bert_scale);
+        *(C + i*stride_C + j+1) =
+             scale_and_sat(result[0][1], act, scale, bert_scale);
+        *(C + i*stride_C + j+2) =
+             scale_and_sat(result[0][2], act, scale, bert_scale);
+        *(C + i*stride_C + j+3) =
+             scale_and_sat(result[0][3], act, scale, bert_scale);
+        *(C + (i+1)*stride_C + j) =
+             scale_and_sat(result[1][0], act, scale, bert_scale);
+        *(C + (i+1)*stride_C + j+1) =
+             scale_and_sat(result[1][1], act, scale, bert_scale);
+        *(C + (i+1)*stride_C + j+2) =
+             scale_and_sat(result[1][2], act, scale, bert_scale);
+        *(C + (i+1)*stride_C + j+3) =
+             scale_and_sat(result[1][3], act, scale, bert_scale);
+        *(C + (i+2)*stride_C + j) =
+             scale_and_sat(result[2][0], act, scale, bert_scale);
+        *(C + (i+2)*stride_C + j+1) =
+             scale_and_sat(result[2][1], act, scale, bert_scale);
+        *(C + (i+2)*stride_C + j+2) =
+             scale_and_sat(result[2][2], act, scale, bert_scale);
+        *(C + (i+2)*stride_C + j+3) =
+             scale_and_sat(result[2][3], act, scale, bert_scale);
+        *(C + (i+3)*stride_C + j) =
+             scale_and_sat(result[3][0], act, scale, bert_scale);
+        *(C + (i+3)*stride_C + j+1) =
+             scale_and_sat(result[3][1], act, scale, bert_scale);
+        *(C + (i+3)*stride_C + j+2) =
+             scale_and_sat(result[3][2], act, scale, bert_scale);
+        *(C + (i+3)*stride_C + j+3) =
+             scale_and_sat(result[3][3], act, scale, bert_scale);
+      }
+    }
+  } else {
+    size_t A_dim_strides[2] = {!transA ? stride_A : 1, !transA ? 1 : stride_A}; // i, j stride
+    size_t B_dim_strides[2] = {!transB ? 1 : stride_B, !transB ? stride_B : 1}; // j, k stride
+
+    // We also create a buffer that we can use for layernorms and softmaxes
+    static acc_t c_buffer[1024];
+    const size_t c_buffer_sz = sizeof(c_buffer)/sizeof(c_buffer[0]);
+    if ((act == LAYERNORM || act == SOFTMAX) && DIM_J > c_buffer_sz) {
+      printf("Matmul is too large to normalize\n");
+      exit(1);
+    }
+
+    for (size_t i = 0; i < DIM_I; i++) {
+      for (size_t j = 0; j < DIM_J; j++) {
+        elem_t* c = C + (i * stride_C) + j;
+
+        const size_t bias_row = repeating_bias ? 0 : i;
+        acc_t sum = no_bias ? 0 : GEMMINI_ACC_SCALE(*(D + bias_row * stride_D + j), D_scale_factor);
+
+        for (size_t k = 0; k < DIM_K; k++) {
+          const elem_t* a = A + i * A_dim_strides[0] + k * A_dim_strides[1];
+          const elem_t* b = B + j * B_dim_strides[0] + k * B_dim_strides[1];
+          sum += (GEMMINI_SCALE(*a, A_scale_factor) * GEMMINI_SCALE(*b, B_scale_factor));
+        }
+
+        if (act == LAYERNORM || act == SOFTMAX)
+          c_buffer[j] = sum;
+        else
+          *c = scale_and_sat(sum, act, scale, bert_scale);
+      }
+
+#ifdef HAS_NORMALIZATIONS
+      if (act == LAYERNORM) {
+        acc_t sum = 0;
+        for (size_t j = 0; j < DIM_J; j++)
+          sum += c_buffer[j];
+        acc_t mean = sum / (acc_t)DIM_J;
+
+        acc_t total_err_sq = 0;
+        for (size_t j = 0; j < DIM_J; j++)
+          total_err_sq += (c_buffer[j] - mean)*(c_buffer[j] - mean);
+        acc_t variance = total_err_sq / (acc_t)DIM_J;
+
+        acc_t stddev = int_sqrt(variance);
+        if (variance == 0) stddev = 1;
+
+        for (size_t j = 0; j < DIM_J; j++) {
+          c_buffer[j] -= mean;
+          // c_buffer[j] /= stddev;
+          c_buffer[j] = ROUND_NEAR_EVEN((double)c_buffer[j] / stddev); // TODO I don't think I-BERT uses round-near-even, so we shouldn't either. We just use this rounding mode here in order to match the hardware.
+
+          elem_t* c = C + (i * stride_C) + j;
+          *c = scale_and_sat(c_buffer[j], act, scale, bert_scale);
+        }
+      } else if (act == SOFTMAX) {
+        const scale_t a = 0.3585;
+        const scale_t b = 1.353;
+        const scale_t c = 0.344;
+
+        // is SCALE supposed to be input scale?
+        const acc_t qln2 = (acc_t) (0.693147 / bert_scale);
+        const acc_t qln2_inv = 65536 / qln2;
+        const acc_t qb = b / bert_scale;
+        const acc_t qc = c / (a*bert_scale*bert_scale);
+
+        // pass 1: get max_q
+        acc_t max_q = -2147483648;
+        for (size_t j = 0; j < DIM_J; j++) {
+          if (c_buffer[j] > max_q) max_q = c_buffer[j];
+        }
+
+        // pass 2: calculate iexp(q_tilde) and sum(q_tilde)
+        acc_t sum_exp = 0;
+        for (size_t j = 0; j < DIM_J; j++) {
+          acc_t q = c_buffer[j] - max_q;
+          acc_t z = (acc_t) (-q * qln2_inv) >> 16;
+          acc_t qp = q + z * qln2;
+          acc_t q_exp = (qp + qb)*(qp + qb) + qc;
+          c_buffer[j] = q_exp >> z;
+          sum_exp += c_buffer[j];
+        }
+
+        // pass 3: divide by sum
+        scale_t factor = (127.f) / (float) sum_exp; // what corresponds to 1 in output?
+        for (size_t j = 0; j < DIM_J; j++) {
+          elem_t* c = C + (i * stride_C) + j;
+          *c = scale_and_sat(c_buffer[j], act, factor, bert_scale);
+        }
+      }
+#endif
+    }
+  }
+}
+
+#undef GEMMINI_SCALE
+
+// General matmul which can be run with different dataflows, or on the CPU
+enum tiled_matmul_type_t {OS, WS, CPU}; // TODO rename this so it's name also applies to convs
+
+// This function runs a tiled matrix mulctiplication, with hardcoded tiling
+// factors
+static void tiled_matmul(size_t dim_I, size_t dim_J, size_t dim_K,
+        const elem_t* A, const elem_t* B,
+        const void * D, void* C,
+        size_t stride_A, size_t stride_B, size_t stride_D, size_t stride_C,
+        scale_t A_scale_factor, scale_t B_scale_factor, scale_acc_t D_scale_factor,
+        int act, acc_scale_t scale, acc_scale_t bert_scale,
+        bool repeating_bias,
+        size_t tile_I, size_t tile_J, size_t tile_K,
+        bool transpose_A, bool transpose_B,
+        bool full_C, bool low_D,
+        uint8_t weightA,
+        enum tiled_matmul_type_t tiled_matmul_type) {
+
+#ifdef GEMMINI_ASSERTIONS
+  // Make sure that the tiling factors make sense
+  if (tile_I <= 0) {
+    printf("tile_I is non-positive\n");
+    exit(1);
+  } else if (tile_J <= 0) {
+    printf("tile_J is non-positive\n");
+    exit(1);
+  } else if (tile_K <= 0) {
+    printf("tile_K is non-positive\n");
+    exit(1);
+  }
+
+  const size_t dim_I_padded = (dim_I / DIM + (dim_I % DIM != 0)) * DIM;
+  const size_t dim_J_padded = (dim_J / DIM + (dim_J % DIM != 0)) * DIM;
+  const size_t dim_K_padded = (dim_K / DIM + (dim_K % DIM != 0)) * DIM;
+
+  if (tile_I * DIM > dim_I_padded) {
+    printf("tile_I is too large (tile_I * DIM > dim_I_padded)\n");
+    exit(1);
+  } else if (tile_J * DIM > dim_J_padded) {
+    printf("tile_J is too large (tile_J * DIM > dim_J_padded)\n");
+    exit(1);
+  } else if (tile_K * DIM > dim_K_padded) {
+    printf("tile_K is too large (tile_K * DIM > dim_K_padded)\n");
+    exit(1);
+  }
+
+  const bool double_buffered = tiled_matmul_type == WS;
+
+  const size_t total_spad_size = double_buffered ? BANK_NUM * BANK_ROWS / 2 :
+      BANK_NUM * BANK_ROWS;
+  const size_t total_acc_size = double_buffered ? ACC_ROWS / 2 : ACC_ROWS;
+
+  const size_t total_spad_rows =
+      (tile_I * tile_K * DIM) +   // Rows to store A
+      (tile_K * tile_J * DIM);    // Rows to store B
+
+  if (total_spad_rows > total_spad_size) {
+    printf("Not enough space in scratchpad to store A and B matrices\n");
+    exit(1);
+  }
+
+  const size_t total_acc_rows =
+      tile_I * tile_J * DIM;      // Rows to store C
+
+  if (total_acc_rows > total_acc_size) {
+    printf("Not enough space in accumulator to store C\n");
+    exit(1);
+  }
+
+  if (tile_I > 65535 || tile_J > 65535 || tile_K > 65535) {
+    printf("I, J, and K tiling factors must be less than 65535, to fit within the bounds of the LOOP_WS function");
+    exit(1);
+  }
+
+  char matmul_type_str[][4] = {"OS", "WS", "CPU"};
+
+  // Check if transpose options are correct
+  if (((tiled_matmul_type == OS) && (transpose_A || transpose_B)) ||
+    (tiled_matmul_type == WS && transpose_A && transpose_B)) {
+    printf("Not implemented: %s matmul, a_transpose=%d, b_transpose=%d\n", matmul_type_str[tiled_matmul_type], transpose_A, transpose_B);
+    exit(1);
+  }
+
+  // Check if full_C options are correct
+  if ((tiled_matmul_type == CPU && (full_C || low_D)) ||
+      (tiled_matmul_type == OS && low_D)) {
+    printf("Not implemented: %s matmul, full_C=%d, low_D=%d\n", matmul_type_str[tiled_matmul_type], full_C, low_D);
+  }
+
+  if (act == LAYERNORM || act == SOFTMAX) {
+    if (tiled_matmul_type == OS) {
+      printf("Not implemented: %s matmul, act=%d\n", matmul_type_str[tiled_matmul_type], act);
+    }
+    if (tile_J * DIM < dim_J) {
+      printf("When doing layernorm or softmax, the full J dimension of the matrix must fit in the accumulator\n");
+    }
+  }
+#endif
+
+  // Run a tiled matrix multiplication on either Gemmini or the CPU
+  if (tiled_matmul_type == OS || tiled_matmul_type == WS) {
+    tiled_matmul_outer(dim_I, dim_J, dim_K,
+        A, B, D, C,
+        stride_A, stride_B, stride_D, stride_C,
+        A_scale_factor, B_scale_factor, D_scale_factor,
+        tile_I, tile_J, tile_K,
+        act, scale, bert_scale, repeating_bias,
+        transpose_A, transpose_B,
+        full_C, low_D,
+        weightA,
+        (int)tiled_matmul_type);
+  } else /*if (tiled_matmul_type == CPU)*/ {
+    matmul_cpu(transpose_A, transpose_B, dim_I, dim_J, dim_K,
+            A, B, (const acc_t*) D, (elem_t*)C,
+            stride_A, stride_B, stride_D, stride_C,
+            A_scale_factor, B_scale_factor, D_scale_factor,
+            act, scale, bert_scale, repeating_bias);
+  }
+}
+
+
+static size_t tiled_matmul_total_spad_rows(size_t I, size_t J, size_t K) {
+  return (I * K + K * J) * DIM;
+}
+
+
+static size_t tiled_matmul_total_acc_rows(size_t I, size_t J) {
+  return (I * J) * DIM;
+}
+
+// This function runs a tiled matrix multiplication, with automatically
+// calculated tiling factors
+static void tiled_matmul_auto(size_t dim_I, size_t dim_J, size_t dim_K,
+        const elem_t* A, const elem_t* B,
+        const void * D, void * C,
+        size_t stride_A, size_t stride_B, size_t stride_D, size_t stride_C,
+        scale_t A_scale_factor, scale_t B_scale_factor, scale_acc_t D_scale_factor,
+        int act, acc_scale_t scale, acc_scale_t bert_scale,
+        bool repeating_bias,
+        bool transpose_A, bool transpose_B,
+        bool full_C, bool low_D,
+        uint8_t weightA,
+        enum tiled_matmul_type_t tiled_matmul_type) {
+
+#define partition_rows (BANK_NUM * BANK_ROWS / 2)
+#define mats_in_partition (partition_rows / DIM)
+#define mats_in_acc (ACC_ROWS / DIM)
+#define max_tile_i_j ((size_t)sqrt(mats_in_acc))
+#define max_tile_k (mats_in_partition / max_tile_i_j)
+
+    // "db_" means "double-buffered"
+#define db_partition_rows ((BANK_NUM * BANK_ROWS / 2) / 2)
+#define db_mats_in_partition (db_partition_rows / DIM)
+#define db_mats_in_acc ((ACC_ROWS / 2) / DIM)
+#define db_max_tile_i_j ((size_t)sqrt(db_mats_in_acc))
+#define db_max_tile_k (db_mats_in_partition / db_max_tile_i_j)
+
+    const size_t dim_I_padded = (dim_I / DIM + (dim_I % DIM != 0)) * DIM;
+    const size_t dim_J_padded = (dim_J / DIM + (dim_J % DIM != 0)) * DIM;
+    const size_t dim_K_padded = (dim_K / DIM + (dim_K % DIM != 0)) * DIM;
+
+    const bool double_buffered = tiled_matmul_type == WS;
+
+    const size_t max_spad_rows = double_buffered ? BANK_NUM * BANK_ROWS / 2 :
+      BANK_NUM * BANK_ROWS;
+    const size_t max_acc_rows = double_buffered ? ACC_ROWS / 2 : ACC_ROWS;
+
+    size_t tile_I, tile_J, tile_K;
+
+    if (act == LAYERNORM || act == SOFTMAX) {
+       tile_I = 1;
+       tile_J = dim_J_padded/DIM;
+       tile_K = 1;
+    } else if (double_buffered) {
+       tile_I = dim_I_padded/DIM < db_max_tile_i_j ? dim_I_padded/DIM : db_max_tile_i_j;
+       tile_J = dim_J_padded/DIM < db_max_tile_i_j ? dim_J_padded/DIM : db_max_tile_i_j;
+       tile_K = dim_K_padded/DIM < db_max_tile_k ? dim_K_padded/DIM : db_max_tile_k;
+    } else {
+       tile_I = dim_I_padded/DIM < max_tile_i_j ? dim_I_padded/DIM : max_tile_i_j;
+       tile_J = dim_J_padded/DIM < max_tile_i_j ? dim_J_padded/DIM : max_tile_i_j;
+       tile_K = dim_K_padded/DIM < max_tile_k ? dim_K_padded/DIM : max_tile_k;
+    }
+
+    // Fill scratchpad as much as possible
+    while (true) {
+      bool increased = false;
+
+      if (tiled_matmul_total_spad_rows(tile_I, tile_J+1, tile_K) <= max_spad_rows &&
+          tiled_matmul_total_acc_rows(tile_I, tile_J+1) <= max_acc_rows &&
+          (tile_J+1) * DIM <= dim_J_padded) {
+        tile_J++;
+        increased = true;
+      }
+
+      if (tiled_matmul_total_spad_rows(tile_I+1, tile_J, tile_K) <= max_spad_rows &&
+          tiled_matmul_total_acc_rows(tile_I+1, tile_J) <= max_acc_rows &&
+          (tile_I+1) * DIM <= dim_I_padded) {
+        tile_I++;
+        increased = true;
+      }
+
+      if (tiled_matmul_total_spad_rows(tile_I, tile_J, tile_K+1) <= max_spad_rows &&
+          (tile_K+1) * DIM <= dim_K_padded) {
+        tile_K++;
+        increased = true;
+      }
+
+      if (!increased)
+        break;
+    }
+
+#ifdef PRINT_TILE
+#if PRINT_TILE
+    const int spad_rows = tiled_matmul_total_spad_rows(tile_I, tile_J, tile_K);
+    const int acc_rows = tiled_matmul_total_acc_rows(tile_I, tile_J);
+
+    printf("tile_I: %d\n", tile_I);
+    printf("tile_J: %d\n", tile_J);
+    printf("tile_K: %d\n\n", tile_K);
+
+    printf("spad_rows: %d\n", spad_rows);
+    printf("acc_rows: %d\n\n", acc_rows);
+
+    printf("spad_row utilization: %d%%\n", (spad_rows * 100) / max_spad_rows);
+    printf("acc_row utilization: %d%%\n\n", (acc_rows * 100) / max_acc_rows);
+
+    exit(EXIT_SUCCESS);
+#endif
+#endif
+
+    tiled_matmul(dim_I, dim_J, dim_K,
+        A, B, D, C,
+        stride_A, stride_B, stride_D, stride_C,
+        A_scale_factor, B_scale_factor, D_scale_factor,
+        act, scale, bert_scale, repeating_bias,
+        tile_I, tile_J, tile_K,
+        transpose_A, transpose_B,
+        full_C, low_D,
+        weightA,
+        tiled_matmul_type);
+
+#undef partition_rows
+#undef mats_in_partition
+#undef mats_in_acc
+#undef max_tile_i_j
+#undef max_tile_k
+}
+
+
+static void sp_tiled_conv(
+        int batch_size, int in_row_dim, int in_col_dim, int in_channels,
+        int out_channels, int out_row_dim, int out_col_dim,
+        int pool_out_row_dim, int pool_out_col_dim,
+
+        int stride, int padding, int kernel_dim, int kernel_dilation,
+        int in_stride, int weight_stride, int out_stride,
+
+        int pool_size, int pool_stride, int pool_padding,
+
+        int batches,
+        int porows, int pocols, int pochs,
+        int krows, int kcols, int kchs,
+
+        int lpad, int rpad, int upad, int dpad,
+        int plpad, int prpad, int pupad, int pdpad,
+
+        const elem_t * input,
+        const elem_t * weights,
+        elem_t * output,
+        const acc_t * bias,
+
+        int act, acc_scale_t scale,
+
+        bool wrot180, bool trans_output_1203, bool trans_input_3120,
+        bool trans_weight_1203, bool trans_weight_0132,
+
+        bool no_bias, bool no_pool, bool downsample, bool input_dilated,
+        bool dw, int a_spad_id, int b_spad_id) {
+
+  // When dw convs are true, we assume that kchs and ochs are 1
+  if (dw) { kchs = 1; pochs = 1; }
+
+  const int orows = porows * pool_stride + pool_size - 1 - pupad - pdpad;
+  const int ocols = pocols * pool_stride + pool_size - 1 - plpad - prpad;
+  const int ochs = pochs;
+
+  // Calculate image dimensions
+  // Note: "irows" and "icols" includes padding
+  const int dilated_krows = krows + (kernel_dilation - 1)*(krows - 1);
+  const int dilated_kcols = kcols + (kernel_dilation - 1)*(kcols - 1);
+  int irows = orows * stride + dilated_krows - 1;
+  int icols = ocols * stride + dilated_kcols - 1;
+  int irows_unpadded = irows - upad - dpad;
+  int icols_unpadded = icols - lpad - rpad;
+  const int ichs = kchs;
+
+#define UNDILATED(x) ((input_dilated) ? (((x)+1)/2) : (x))
+
+  if (input_dilated) {
+    irows_unpadded = (irows_unpadded+1)/2;
+    icols_unpadded = (icols_unpadded+1)/2;
+
+    irows = irows_unpadded + UNDILATED(upad) + UNDILATED(dpad);
+    icols = icols_unpadded + UNDILATED(lpad) + UNDILATED(rpad);
+  }
+
+#ifdef HAS_FIRST_LAYER_OPTIMIZATIONS
+  const bool transposed = trans_output_1203 || trans_input_3120 ||
+      trans_weight_1203 || trans_weight_0132;
+  int max_pixels_per_row = transposed || wrot180 || downsample ||
+      input_dilated || kernel_dilation > 1 ||
+      ichs > DIM ? 1 : DIM/ichs;
+  if (max_pixels_per_row > kcols) max_pixels_per_row = kcols;
+#else
+  const int max_pixels_per_row = 1;
+#endif
+
+  // Calculate spad address offsets
+  const int out_channels_per_bank = ochs / DIM + (ochs % DIM != 0);
+  const int in_channels_per_bank = kchs / DIM + (kchs % DIM != 0);
+  const int B_rows = trans_weight_0132 ?
+    in_channels_per_bank * kcols * krows * ochs :
+    out_channels_per_bank * kcols * krows * kchs;
+
+  static uint32_t D_sp_addr_row = 0;
+  static uint32_t C_sp_addr_row = 0;
+
+  const uint32_t A_sp_addr_start = 0;
+  const uint32_t B_sp_addr_start = BANK_NUM * BANK_ROWS - B_rows;
+  const uint32_t D_sp_addr_start = (1 << (ADDR_LEN - 1)) + D_sp_addr_row;
+  const uint32_t C_sp_addr_start = (3 << (ADDR_LEN - 2)) + C_sp_addr_row;
+
+  if (bias != 0) {
+    D_sp_addr_row = (D_sp_addr_row + ACC_ROWS / 2) % ACC_ROWS;
+  }
+
+  if (output != 0) {
+    C_sp_addr_row = (C_sp_addr_row + ACC_ROWS / 2) % ACC_ROWS;
+  }
+
+  gemmini_loop_conv_ws(batch_size, in_row_dim, in_col_dim, in_channels, out_channels, out_row_dim, out_col_dim, pool_out_row_dim, pool_out_col_dim, stride, padding, kernel_dim, kernel_dilation, pool_size, pool_stride, pool_padding, batches, porows, pocols, pochs, krows, kcols, kchs, lpad, rpad, upad, dpad, plpad, prpad, pupad, pdpad, orows, ocols, weights, output, bias, input, no_bias, no_pool, downsample, wrot180, input_dilated, act, trans_output_1203, trans_weight_1203, trans_weight_0132, trans_input_3120, max_pixels_per_row, in_stride, weight_stride, out_stride, dw, a_spad_id, b_spad_id);
+
+/*
+  if (!no_pool) {
+    printf("Pooling with rectangular convolutions is currently not supported.\n");
+    exit(1);
+  }
+
+  // Only rectangular convolutions will use the following C code
+
+  // mvin bias
+  if (bias != NULL) {
+    // TODO we probably don't need quite this many nested loops for this part
+
+    const int max_ochs_per_mvin = ochs < MAX_BLOCK_LEN_ACC * DIM ? ochs :
+        MAX_BLOCK_LEN_ACC * DIM;
+
+    gemmini_extended4_config_ld(0, MVIN_SCALE_IDENTITY, false, batches * orows * ocols, 2);
+
+    for (int b = 0; b < batches; b++)
+      for (int orow = 0; orow < orows; orow++)
+        for (int ocol = 0; ocol < ocols; ocol += DIM) {
+          const int I = ocols - ocol > DIM ? DIM : ocols - ocol;
+
+          for (int och = 0; och < ochs; och += max_ochs_per_mvin) {
+            const int J = ochs - och > max_ochs_per_mvin ? max_ochs_per_mvin : ochs - och;
+
+            const uint32_t D_sp_addr = D_sp_addr_start + (och / DIM) * batches * orows * ocols + b * orows * ocols + orow * ocols + ocol;
+
+            const acc_t * bias_dram_addr = no_bias ? NULL : bias + och;
+
+            gemmini_extended_mvin3(bias_dram_addr,
+                    D_sp_addr,
+                    J, I);
+          }
+        }
+  }
+
+  // mvin input
+  if (input != NULL){
+    int max_chs_per_mvin = ichs < MAX_BLOCK_LEN * DIM ? ichs :
+      MAX_BLOCK_LEN * DIM;
+    if (trans_input_3120) {
+      max_chs_per_mvin = batches < MAX_BLOCK_LEN * DIM ? batches :
+        MAX_BLOCK_LEN * DIM;
+    }
+
+    const int dram_stride = trans_input_3120 ?
+      batch_size * sizeof(elem_t) :
+      in_channels * sizeof(elem_t);
+
+    const int spad_stride = trans_input_3120 ?
+      ichs * (irows >> downsample) * (icols >> downsample) :
+      batches * (irows >> downsample) * (icols >> downsample);
+
+    gemmini_extended5_config_ld(dram_stride << downsample, MVIN_SCALE_IDENTITY, false, spad_stride, max_pixels_per_row, 0);
+
+    const int b_it = trans_input_3120 ? max_chs_per_mvin : 1;
+    const int ich_it = trans_input_3120 ? 1 : max_chs_per_mvin;
+
+    for (int b = 0; b < batches; b += b_it)
+      for (int irow = -UNDILATED(upad); irow < irows_unpadded + UNDILATED(dpad); irow += 1 + downsample) {
+        const int irow_padded = irow + UNDILATED(upad);
+
+        for (int icol = -UNDILATED(lpad); icol < icols_unpadded + UNDILATED(rpad);) {
+          // TODO There might be some unnecessary mvins here at the edge of the image
+
+          int I = icols_unpadded - icol > (DIM << downsample) ?
+            (DIM << downsample) : icols_unpadded - icol;
+
+          if (icol < 0) {
+            I = -icol > DIM ? DIM : -icol;
+          } else if (icol >= icols_unpadded) {
+            I = icols_unpadded + UNDILATED(rpad) - icol > DIM ? DIM : icols_unpadded + UNDILATED(rpad) - icol;
+          }
+
+          const int icol_padded = icol + UNDILATED(lpad);
+
+          for (int ich = 0; ich < ichs; ich += ich_it) {
+            int K = ichs - ich > max_chs_per_mvin ?
+              max_chs_per_mvin : ichs - ich;
+            if (trans_input_3120) {
+              K = batches - b > max_chs_per_mvin ?
+                max_chs_per_mvin : batches - b;
+            }
+
+#define DS(x) ((x) >> (downsample))
+
+            uint32_t A_sp_addr = A_sp_addr_start + (ich / DIM) * batches * DS(irows) * DS(icols) + b * DS(irows) * DS(icols) + DS(irow_padded) * DS(icols) + DS(icol_padded);
+            if (trans_input_3120) {
+              A_sp_addr = A_sp_addr_start + (b / DIM) * ichs * DS(irows) * DS(icols) + ich * DS(irows) * DS(icols) + DS(irow_padded) * DS(icols) + DS(icol_padded);
+            }
+
+            const bool is_zeros = irow < 0 || irow >= irows_unpadded || icol < 0 || icol >= icols_unpadded;
+
+            const elem_t * in = input + (b*in_row_dim*in_col_dim + irow*in_col_dim + icol) * in_stride + ich;
+            if (is_zeros) {
+              in = NULL;
+            } else if (trans_input_3120) {
+              in = input + (ich*in_row_dim*in_col_dim + irow*in_col_dim + icol) * batch_size + b;
+            }
+
+            gemmini_extended_mvin(in,
+                A_sp_addr,
+                K, I >> downsample);
+          }
+
+          icol += I;
+        }
+      }
+  }
+
+  // mvin weights
+  if (weights != NULL) {
+    int max_chs_per_mvin = ochs < MAX_BLOCK_LEN * DIM ? ochs :
+        MAX_BLOCK_LEN * DIM;
+    if (trans_weight_0132) {
+      max_chs_per_mvin = kchs < MAX_BLOCK_LEN * DIM ? kchs :
+          MAX_BLOCK_LEN * DIM;
+    }
+
+    size_t dram_stride = weight_stride * sizeof(elem_t);
+    if (dw) {
+      dram_stride = sizeof(elem_t);
+    } else if (trans_weight_1203) {
+      dram_stride = kernel_dim * kernel_dim * out_channels * sizeof(elem_t);
+    } else if (trans_weight_0132) {
+      dram_stride = in_channels * sizeof(elem_t);
+    }
+
+    const size_t spad_block_stride = trans_weight_0132 ?
+      krows * kcols * ochs : krows * kcols * kchs;
+
+    gemmini_extended4_config_ld(dram_stride, MVIN_SCALE_IDENTITY, false, spad_block_stride, 1);
+
+    const size_t och_it = trans_weight_0132 ? DIM : max_chs_per_mvin;
+    const size_t kch_it = trans_weight_0132 ? max_chs_per_mvin : DIM;
+
+    for (int och = 0; och < ochs; och += och_it) {
+      for (int krow = 0; krow < krows; krow++)
+        for (int kcol = 0; kcol < kcols; kcol++)
+          for (int kch = 0; kch < kchs; kch += kch_it) {
+            int K = kchs - kch > DIM ? DIM : kchs - kch;
+            int J = ochs - och > max_chs_per_mvin ? max_chs_per_mvin : ochs - och;
+            if (trans_weight_0132) {
+              K = ochs - och > DIM ? DIM : ochs - och;
+              J = kchs - kch > max_chs_per_mvin ? max_chs_per_mvin : kchs - kch;
+            }
+
+            uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * krows * kcols * kchs + krow * kcols * kchs + kcol * kchs + kch;
+            if (trans_weight_0132) {
+              B_sp_addr = B_sp_addr_start + (kch / DIM) * krows * kcols * ochs + krow * kcols * ochs + kcol * ochs + och;
+            }
+
+            const elem_t * w = weights + (krow*kernel_dim*in_channels + kcol*in_channels + kch) * weight_stride + och;
+            if (dw) {
+              w = weights + krow * kernel_dim + kcol;
+            } else if (trans_weight_1203) {
+              w = weights + (kch * kernel_dim * kernel_dim + krow * kernel_dim + kcol) * out_channels + och;
+            } else if (trans_weight_0132) {
+              w = weights + (krow * kernel_dim * out_channels + kcol * out_channels + och) * in_channels + kch;
+            }
+
+            gemmini_extended_mvin2(w, B_sp_addr, J, K);
+          }
+    }
+  }
+
+  // Compute
+  {
+    const int b_it = trans_input_3120 ? DIM : 1;
+    const int ocol_it = trans_input_3120 ? 1 : (DIM << input_dilated);
+
+    if (trans_input_3120) {
+      gemmini_extended3_config_ex(0, 0, 0, 0, orows * ocols, irows * icols, 0, 0, true);
+    }
+
+    for (int och = 0; och < ochs; och += DIM) {
+      for (int krow = 0; krow < krows; krow++) {
+        for (int kcol = 0; kcol < kcols; kcol += max_pixels_per_row) {
+          for (int kch = 0; kch < kchs; kch += DIM) {
+            bool new_weights = true;
+
+            for (int b = 0; b < batches; b += b_it) {
+              for (int orow = 0; orow < orows; orow++) {
+                // Skip some kernel rows due to input-dilation
+                if (input_dilated && ((krow * kernel_dilation + orow * stride - upad) % 2 != 0)) {
+                  continue;
+                }
+
+                for (int ocol = 0; ocol < ocols;) {
+                  // Skip some cols dimensions due to input-dilation
+                  if (input_dilated && ((kcol + ocol * stride - lpad) % 2 != 0)) {
+                    ocol++;
+                    continue;
+                  }
+
+                  int irow = orow * stride + krow * kernel_dilation;
+                  int icol = ocol * stride + kcol * kernel_dilation;
+
+                  if (input_dilated) {
+                    irow = (irow + 1) / 2;
+                    icol = (icol + 1) / 2;
+                  }
+
+                  const int pixels = kcols - kcol > max_pixels_per_row ?
+                    max_pixels_per_row : kcols - kcol;
+
+                  const uint32_t C_sp_addr = C_sp_addr_start + (och / DIM) * batches * orows * ocols + b * orows * ocols + orow * ocols + ocol;
+
+                  // Over here, construct a new matrix
+                  //
+                  // Let us assume that we only ever operate on
+                  // one pixel in one row.
+                  // Thus, krows == kcols == 1
+                  //
+                  // Then, for every set of I, J, and K values
+                  //     - I = ocols
+                  //     - J = ochs
+                  //     - K = kchs
+
+                  int I = UNDILATED(ocols - ocol > (DIM << input_dilated) ? (DIM << input_dilated) : ocols - ocol);
+                  const int J = ochs - och > DIM ? DIM : ochs - och;
+                  const int K = pixels * (kchs - kch > DIM ? DIM : kchs - kch);
+
+                  if (trans_input_3120) {
+                    I = batches - b > DIM ? DIM : batches - b;
+                  }
+
+                  uint32_t A_sp_addr = A_sp_addr_start + (kch / DIM) * batches * DS(irows) * DS(icols) + b * DS(irows) * DS(icols) + DS(irow) * DS(icols) + DS(icol);
+                  if (trans_input_3120) {
+                    A_sp_addr = A_sp_addr_start + (b / DIM) * kchs * DS(irows) * DS(icols) + kch * DS(irows) * DS(icols) + DS(irow) * DS(icols) + DS(icol);
+                  }
+
+                  const int krow_ = wrot180 ? krows - krow - 1 : krow;
+                  const int kcol_ = wrot180 ? kcols - kcol - 1 : kcol;
+
+                  uint32_t B_sp_addr = B_sp_addr_start + (och / DIM) * krows * kcols * kchs + krow_ * kcols * kchs + kcol_ * kchs + kch;
+                  if (trans_weight_0132) {
+                    B_sp_addr = B_sp_addr_start + (kch / DIM) * krows * kcols * ochs + krow_ * kcols * ochs + kcol_ * ochs + och;
+                  }
+
+                  const uint32_t pre_sp_addr = new_weights ?
+                    B_sp_addr : GARBAGE_ADDR;
+
+                  // perform matmul
+                  gemmini_extended_preload(pre_sp_addr, C_sp_addr, J, K, J, I);
+
+                  if (new_weights) {
+                    gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, K, I, J, I);
+                  } else {
+                    gemmini_extended_compute_accumulated(A_sp_addr, GARBAGE_ADDR, K, I, J, I);
+                  }
+
+                  ocol += ocol_it;
+                  new_weights = false;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+#undef DS
+#undef UNDILATED
+
+  // mvout output
+  if (output != NULL) {
+    if (no_pool) {
+      for (int b = 0; b < batches; b++)
+        for (int orow = 0; orow < orows; orow++)
+          for (int ocol = 0; ocol < ocols; ocol += DIM) {
+            const int I = ocols - ocol > DIM ? DIM : ocols - ocol;
+
+            for (int och = 0; och < ochs; och += DIM) {
+              const int J = ochs - och > DIM ? DIM : ochs - och;
+
+              const uint32_t C_sp_addr = C_sp_addr_start + (och / DIM) * batches * orows * ocols + b * orows * ocols + orow * ocols + ocol;
+
+              elem_t * out = output + (b*out_row_dim*out_col_dim + orow*out_col_dim + ocol) * out_stride + och;
+              if (trans_output_1203) {
+                out = output + (orow*out_col_dim*batch_size + ocol*batch_size + b) * out_channels + och;
+              }
+
+              gemmini_extended_mvout(out,
+                  C_sp_addr,
+                  J, I);
+            }
+          }
+    } else {
+      printf("Pooling with rectangular convolutions is currently not supported.\n");
+      exit(1);
+*/
+      /*
+      gemmini_extended2_config_st(out_channels * sizeof(elem_t), act, scale, pool_stride, pool_size, pool_out_row_dim, porows, pocols, orows, ocols, pupad, plpad);
+
+      for (int b = 0; b < batches; b++) {
+        for (int poch = 0; poch < pochs; poch += DIM) {
+          const int channels = poch + DIM >= pochs ? pochs - poch : DIM;
+
+          elem_t * pout = output + (b * pool_out_row_dim * pool_out_col_dim)*out_channels + poch;
+
+          const uint32_t C_sp_addr = C_sp_addr_start + (poch / DIM) * batches * orows * ocols + b * orows * ocols;
+
+          gemmini_extended_mvout(pout,
+              C_sp_addr,
+              channels, 0);
+        }
+      }
+
+      gemmini_extended_config_st(out_channels * sizeof(elem_t), act, scale);
+<<<<<<< HEAD
+      */
+//    }
+//  } 
+  //  }
+  //}
+}
+
+
+static int tiled_conv_total_spad_rows_dw(bool acc, bool weight,
+        int stride,
+        int batches,
+        int porows, int pocols, int ochs,
+        int krows, int kcols, int kchs,
+        int pool_size, int pool_stride) {
+
+    const int orows = porows * pool_stride + pool_size - 1;
+    const int ocols = pocols * pool_stride + pool_size - 1;
+
+    const int irows = orows * stride + krows - 1; // - 2 * padding;
+    const int icols = ocols * stride + kcols - 1; // - 2 * padding;
+    const int ichs = kchs;
+
+    const int in_channels_per_bank = ichs / DIM + (ichs % DIM != 0);
+    const int out_channels_per_bank = ochs / DIM + (ochs % DIM != 0);
+
+    const int A_rows = in_channels_per_bank * batches * irows * icols;
+    const int B_rows = out_channels_per_bank * kcols * krows * kchs;
+    const int C_rows = out_channels_per_bank * batches * orows * ocols;
+
+    if (acc)
+        return C_rows;
+    else if(weight)
+        return B_rows;
+    else
+        return A_rows;
+}
+
+
+static int tiled_conv_total_spad_rows(bool acc,
+        int stride,
+        int input_dilation,
+        int kernel_dilation,
+        bool downsample,
+        bool trans_weight_0132,
+        bool trans_input_3120,
+        int batches,
+        int porows, int pocols, int ochs,
+        int krows, int kcols, int kchs,
+        int pool_size, int pool_stride) {
+
+    const int orows = porows * pool_stride + pool_size - 1;
+    const int ocols = pocols * pool_stride + pool_size - 1;
+
+    const int krows_dilated = krows + (kernel_dilation - 1)*(krows - 1);
+    const int kcols_dilated = kcols + (kernel_dilation - 1)*(kcols - 1);
+
+    int irows = orows * stride + krows_dilated - 1; // - 2 * padding;
+    int icols = ocols * stride + kcols_dilated - 1; // - 2 * padding;
+    const int ichs = kchs;
+
+    irows = irows / input_dilation + (irows % input_dilation != 0);
+    icols = icols / input_dilation + (icols % input_dilation != 0);
+
+    const int in_channels_per_bank = ichs / DIM + (ichs % DIM != 0);
+    const int out_channels_per_bank = ochs / DIM + (ochs % DIM != 0);
+    const int batches_per_bank = batches / DIM + (batches % DIM != 0);
+
+    const int A_rows = trans_input_3120 ?
+        (batches_per_bank * ichs * (irows >> downsample) * (icols >> downsample)) :
+        (in_channels_per_bank * batches * (irows >> downsample) * (icols >> downsample));
+
+    const int B_rows = trans_weight_0132 ?
+      in_channels_per_bank * kcols * krows * ochs :
+      out_channels_per_bank * kcols * krows * kchs;
+
+    const int C_rows = out_channels_per_bank * batches * orows * ocols;
+
+    return acc ? C_rows : A_rows + B_rows;
+}
+
+
+static void conv_cpu_without_pool(
+        int batch_size, int in_row_dim, int in_col_dim, int in_channels,
+        int out_channels, int out_row_dim, int out_col_dim,
+        int stride, int input_dilation, int kernel_dilation, int padding, int kernel_dim,
+        int in_stride, int weight_stride, int out_stride,
+        bool wrot180, bool trans_output_1203, bool trans_input_3120,
+        bool trans_weight_1203, bool trans_weight_0132,
+
+        const elem_t * input,
+        const elem_t * weights,
+        const acc_t * bias,
+        elem_t * output,
+
+        int act, acc_scale_t scale) {
+
+  bool no_bias = bias == NULL;
+
+  for (int b = 0; b < batch_size; b++) {
+    for (int orow = 0; orow < out_row_dim; orow++) {
+      for (int ocol = 0; ocol < out_col_dim; ocol++) {
+        for (int och = 0; och < out_channels; och++) {
+
+          acc_t opixel = no_bias ? 0 : bias[och];
+
+          for (int krow = 0; krow < kernel_dim; krow++) {
+            if ((orow * stride + krow * kernel_dilation - padding) % input_dilation != 0)
+              continue;
+
+            const int irow = (orow * stride + krow * kernel_dilation - padding) / input_dilation;
+
+            for (int kcol = 0; kcol < kernel_dim; kcol++) {
+              if ((ocol * stride + kcol * kernel_dilation - padding) % input_dilation != 0)
+                continue;
+
+              const int icol = (ocol * stride + kcol * kernel_dilation - padding) / input_dilation;
+
+              for (int kch = 0; kch < in_channels; kch++) {
+                const elem_t *in = input + (b * in_row_dim * in_col_dim + irow * in_col_dim + icol) * in_stride + kch;
+                if (trans_input_3120) {
+                  // NHWC to CHWN
+                  in = input + (kch * in_row_dim * in_col_dim + irow * in_col_dim + icol) * batch_size + b;
+                }
+
+                elem_t ipixel = irow < 0 || irow >= in_row_dim || icol < 0 || icol >= in_col_dim ?
+                    0 : *in;
+
+                const int krow_ = wrot180 ? kernel_dim - krow - 1 : krow;
+                const int kcol_ = wrot180 ? kernel_dim - kcol - 1 : kcol;
+
+                elem_t weight = *(weights + (krow_ * kernel_dim * in_channels + kcol_ * in_channels + kch) * weight_stride + och);
+                if (trans_weight_1203) {
+                  // HWIO to WIHO
+                  weight = *(weights + (kch * kernel_dim * kernel_dim  + krow_ * kernel_dim + kcol_) * out_channels + och);
+                } else if (trans_weight_0132) {
+                  // HWIO to HWOI
+                  weight = *(weights + (krow_ * kernel_dim * out_channels + kcol_ * out_channels + och) * in_channels + kch);
+                }
+
+                opixel += weight * ipixel;
+              }
+            }
+          }
+
+          elem_t *out = output + (b * out_row_dim * out_col_dim + orow * out_col_dim + ocol) * out_stride + och;
+          if (trans_output_1203) {
+            // NHWC to HWNC
+            out = output + (orow * out_col_dim * batch_size + ocol * batch_size + b) * out_channels + och;
+          }
+
+          *out = scale_and_sat(opixel, act, scale, 0);
+        }
+      }
+    }
+  }
+}
+
+
+static void conv_dw_cpu_without_pool(
+        int batch_size, int in_row_dim, int in_col_dim,
+        int channels, int out_row_dim, int out_col_dim,
+        int stride, int padding, int kernel_dim,
+
+        const elem_t * input,
+        const elem_t * weights,
+        const acc_t * bias,
+        elem_t * output,
+
+        int act, acc_scale_t scale) {
+
+  bool no_bias = bias == NULL;
+
+  for (int b = 0; b < batch_size; b++) {
+    for (int orow = 0; orow < out_row_dim; orow++) {
+      for (int ocol = 0; ocol < out_col_dim; ocol++) {
+        for (int ch = 0; ch < channels; ch++) {
+          acc_t opixel = no_bias ? 0 : bias[ch];
+
+          for (int krow = 0; krow < kernel_dim; krow++) {
+            const int irow = orow * stride + krow - padding;
+
+            for (int kcol = 0; kcol < kernel_dim; kcol++) {
+              const int icol = ocol * stride + kcol - padding;
+
+              const elem_t * in = input + (b * in_row_dim * in_col_dim + irow * in_col_dim + icol) * channels + ch;
+
+              const elem_t ipixel = irow < 0 || irow >= in_row_dim || icol < 0 || icol >= in_col_dim ?
+                  0 : *in;
+
+              const elem_t weight = *(weights + (ch * kernel_dim + krow) * kernel_dim  + kcol);
+
+              opixel += weight * ipixel;
+            }
+          }
+
+          elem_t *out = output + (b * out_row_dim * out_col_dim + orow * out_col_dim + ocol) * channels + ch;
+
+          *out = scale_and_sat(opixel, act, scale, 0);
+        }
+      }
+    }
+  }
+}
+
+
+static void conv_cpu(
+        int batch_size, int in_row_dim, int in_col_dim, int in_channels,
+        int out_channels, int out_row_dim, int out_col_dim,
+        int stride, int input_dilation, int kernel_dilation, int padding, int kernel_dim,
+        int in_stride, int weight_stride, int out_stride,
+        bool wrot180, bool trans_output_1203, bool trans_input_3120,
+        bool trans_weight_1203, bool trans_weight_0132,
+
+        const elem_t * input,
+        const elem_t * weights,
+        const acc_t * bias,
+        elem_t * output,
+
+        int act, acc_scale_t scale,
+        int pool_size, int pool_stride, int pool_padding) {
+
+  const bool no_pool = pool_stride == 0;
+  if (no_pool) {
+    conv_cpu_without_pool(
+        batch_size, in_row_dim, in_col_dim, in_channels,
+        out_channels, out_row_dim, out_col_dim,
+        stride, input_dilation, kernel_dilation, padding, kernel_dim,
+        in_stride, weight_stride, out_stride,
+        wrot180, trans_output_1203, trans_input_3120,
+        trans_weight_1203, trans_weight_0132,
+        input, weights, bias, output,
+        act, scale);
+    return;
+  }
+
+  const bool no_bias = bias == NULL;
+  const int pool_out_row_dim = (out_row_dim + 2 * pool_padding - pool_size) / pool_stride + 1;
+  const int pool_out_col_dim = (out_col_dim + 2 * pool_padding - pool_size) / pool_stride + 1;
+
+  for (int b = 0; b < batch_size; b++) {
+    for (int porow = 0; porow < pool_out_row_dim; porow++) {
+      for (int pocol = 0; pocol < pool_out_col_dim; pocol++) {
+        for (int poch = 0; poch < out_channels; poch++) {
+
+          elem_t running_max = 0;
+          bool running_max_initialized = false;
+
+          for (int pwrow = 0; pwrow < pool_size; pwrow++) {
+            const int orow = porow * pool_stride + pwrow - pool_padding;
+
+            for (int pwcol = 0; pwcol < pool_size; pwcol++) {
+              const int ocol = pocol * pool_stride + pwcol - pool_padding;
+
+              if (orow < 0 || orow >= out_row_dim || ocol < 0 || ocol >= out_col_dim) {
+                if (!running_max_initialized || running_max < 0) {
+                  running_max = 0;
+                  running_max_initialized = true;
+                }
+              } else {
+
+                acc_t opixel = no_bias ? 0 : bias[poch];
+
+                for (int krow = 0; krow < kernel_dim; krow++) {
+                  if ((orow * stride + krow * kernel_dilation - padding) % input_dilation != 0)
+                    continue;
+
+                  const int irow = (orow * stride + krow * kernel_dilation - padding) / input_dilation;
+
+                  for (int kcol = 0; kcol < kernel_dim; kcol++) {
+                    if ((ocol * stride + kcol * kernel_dilation - padding) % input_dilation != 0)
+                      continue;
+
+                    const int icol = (ocol * stride + kcol * kernel_dilation - padding) / input_dilation;
+
+                    for (int kch = 0; kch < in_channels; kch++) {
+                      const elem_t * in = input + (b * in_row_dim * in_col_dim + irow * in_col_dim + icol) * in_stride + kch;
+                      if (trans_input_3120) {
+                        // NHWC to CHWN
+                        in = input + (kch * in_row_dim * in_col_dim + irow * in_col_dim + icol) * batch_size + b;
+                      }
+
+                      elem_t ipixel = irow < 0 || irow >= in_row_dim || icol < 0 || icol >= in_col_dim ?
+                          0 : *in;
+
+                      const int krow_ = wrot180 ? kernel_dim - krow - 1 : krow;
+                      const int kcol_ = wrot180 ? kernel_dim - kcol - 1 : kcol;
+
+                      elem_t weight = *(weights + (krow_ * kernel_dim * in_channels + kcol_ * in_channels + kch) * weight_stride + poch);
+                      if (trans_weight_1203) {
+                        // HWIO to WIHO
+                        weight = *(weights + (kch * kernel_dim * kernel_dim  + krow_ * kernel_dim + kcol_) * out_channels + poch);
+                      } else if (trans_weight_0132) {
+                        // HWIO to HWOI
+                        weight = *(weights + (krow_ * kernel_dim * out_channels + kcol_ * out_channels + poch) * in_channels + kch);
+                      }
+
+                      opixel += weight * ipixel;
+                    }
+                  }
+                }
+
+                opixel = scale_and_sat(opixel, act, scale, 0);
+                if (!running_max_initialized || opixel > running_max) {
+                  running_max = opixel;
+                  running_max_initialized = true;
+                }
+              }
+
+              if (pwrow == pool_size - 1 && pwcol == pool_size - 1) {
+                elem_t * out = output + (b * pool_out_row_dim * pool_out_col_dim + porow * pool_out_col_dim + pocol) * out_stride + poch;
+                if (trans_output_1203) {
+                  // NHWC to HWNC
+                  out = output + (porow * pool_out_col_dim * batch_size + pocol * batch_size + b) * out_channels + poch;
+                }
+
+                *out = running_max;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+
+static void conv_dw_cpu(
+        int batch_size, int in_row_dim, int in_col_dim,
+        int channels, int out_row_dim, int out_col_dim,
+        int stride, int padding, int kernel_dim,
+
+        const elem_t * input,
+        const elem_t * weights,
+        const acc_t * bias,
+        elem_t * output,
+
+        int act, acc_scale_t scale,
+        int pool_size, int pool_stride, int pool_padding) {
+
+  const bool no_pool = pool_stride == 0;
+  if (no_pool) {
+    conv_dw_cpu_without_pool(
+        batch_size, in_row_dim, in_col_dim,
+        channels, out_row_dim, out_col_dim,
+        stride, padding, kernel_dim,
+        input, weights, bias, output,
+        act, scale);
+    return;
+  }
+
+  const bool no_bias = bias == NULL;
+  const int pool_out_row_dim = (out_row_dim + 2 * pool_padding - pool_size) / pool_stride + 1;
+  const int pool_out_col_dim = (out_col_dim + 2 * pool_padding - pool_size) / pool_stride + 1;
+
+  for (int b = 0; b < batch_size; b++) {
+    for (int porow = 0; porow < pool_out_row_dim; porow++) {
+      for (int pocol = 0; pocol < pool_out_col_dim; pocol++) {
+        for (int ch = 0; ch < channels; ch++) {
+
+          elem_t running_max = 0;
+          bool running_max_initialized = false;
+
+          for (int pwrow = 0; pwrow < pool_size; pwrow++) {
+            const int orow = porow * pool_stride + pwrow - pool_padding;
+
+            for (int pwcol = 0; pwcol < pool_size; pwcol++) {
+              const int ocol = pocol * pool_stride + pwcol - pool_padding;
+
+              if (orow < 0 || orow >= out_row_dim || ocol < 0 || ocol >= out_col_dim) {
+                if (!running_max_initialized || running_max < 0) {
+                  running_max = 0;
+                  running_max_initialized = true;
+                }
+              } else {
+
+                acc_t opixel = no_bias ? 0 : bias[ch];
+
+                for (int krow = 0; krow < kernel_dim; krow++) {
+                  const int irow = orow * stride + krow - padding;
+
+                  for (int kcol = 0; kcol < kernel_dim; kcol++) {
+                    const int icol = ocol * stride + kcol - padding;
+
+                    const elem_t * in = input + (b * in_row_dim * in_col_dim + irow * in_col_dim + icol) * channels + ch;
+
+                    elem_t ipixel = irow < 0 || irow >= in_row_dim || icol < 0 || icol >= in_col_dim ?
+                        0 : *in;
+
+                    const elem_t weight = *(weights + (ch * kernel_dim + krow) * kernel_dim  + kcol);
+
+                    opixel += weight * ipixel;
+                  }
+                }
+
+                opixel = scale_and_sat(opixel, act, scale, 0);
+                if (!running_max_initialized || opixel > running_max) {
+                  running_max = opixel;
+                  running_max_initialized = true;
+                }
+              }
+
+              if (pwrow == pool_size - 1 && pwcol == pool_size - 1) {
+                elem_t * out = output + (b * pool_out_row_dim * pool_out_col_dim + porow * pool_out_col_dim + pocol) * channels + ch;
+
+                *out = running_max;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+
+static void tiled_conv(
+        int batch_size,
+        int in_row_dim, int in_col_dim, int in_channels,
+        int out_channels, int out_row_dim, int out_col_dim,
+        int stride, int input_dilation, int kernel_dilation, int padding, int kernel_dim,
+        int in_stride, int weight_stride, int out_stride,
+        bool wrot180, bool trans_output_1203, bool trans_input_3120,
+        bool trans_weight_1203, bool trans_weight_0132,
+
+        int batches,
+        int porows, int pocols, int pochs,
+        int krows, int kcols, int kchs,
+
+        const elem_t * input,
+        const elem_t * weights,
+        const acc_t * bias,
+        elem_t * output,
+
+        int act, acc_scale_t scale,
+        int pool_size, int pool_stride, int pool_padding,
+
+        enum tiled_matmul_type_t tiled_conv_type) {
+
+#ifdef GEMMINI_ASSERTIONS
+  if (trans_weight_1203 && trans_weight_0132) {
+    printf("Only one weight transformation can be applied at a time\n");
+    exit(1);
+  }
+#endif
+
+    if (tiled_conv_type == CPU) {
+      if (pool_size == 1 && pool_stride == 1 && pool_padding == 0) {
+        pool_stride = 0;
+      }
+
+      // assume in_dim_rows = in_dim_cols
+      // and out_dim_rows = out_dim_cols for now
+      conv_cpu(
+        batch_size, in_row_dim, in_col_dim, in_channels,
+        out_channels, out_row_dim, out_col_dim,
+        stride, input_dilation, kernel_dilation, padding, kernel_dim,
+        in_stride, weight_stride, out_stride,
+        wrot180, trans_output_1203, trans_input_3120,
+        trans_weight_1203, trans_weight_0132,
+        input, weights, bias, output,
+        act, scale,
+        pool_size, pool_stride, pool_padding);
+      return;
+    } else if (tiled_conv_type == OS) {
+      printf("Gemmini convs do not currently support OS\n");
+      exit(1);
+    }
+
+    // TODO move everything below this into a tiled_conv_outer function to match the tiled_matmul function
+
+    bool no_bias = false;
+    if (bias == NULL) {
+        bias = (acc_t*)1;
+        no_bias = true;
+    }
+
+    bool no_pool = pool_stride == 0;
+    if (no_pool) {
+        pool_size = 1;
+        pool_stride = 1;
+        pool_padding = 0;
+    }
+
+    const bool downsample = stride == 2 && kernel_dim == 1 && in_row_dim % 2 == 0 && in_col_dim % 2 == 0
+      && padding == 0 && no_pool && input_dilation == 1 && !trans_input_3120;
+
+    const int input_dilated = input_dilation == 2;
+
+#ifdef GEMMINI_ASSERTIONS
+    {
+        // const int orows = porows * pool_stride + pool_size - 1;
+        // const int ocols = pocols * pool_stride + pool_size - 1;
+
+        // Check that data will fit in scratchpad
+        const int spad_rows = tiled_conv_total_spad_rows(false,
+            stride, input_dilation, kernel_dilation, downsample, trans_weight_0132, trans_input_3120,
+            batches, porows, pocols, pochs, krows, kcols, kchs, pool_size, pool_stride);
+        const int acc_rows = tiled_conv_total_spad_rows(true,
+            stride, input_dilation, kernel_dilation, downsample, trans_weight_0132, trans_input_3120,
+            batches, porows, pocols, pochs, krows, kcols, kchs, pool_size, pool_stride);
+
+        if (spad_rows > BANK_NUM * BANK_ROWS / 2) {
+            printf("not enough scratchpad space to store inputs and weights, %d\n", spad_rows);
+            exit(1);
+        }
+        if (acc_rows > ACC_ROWS / 2) {
+            printf("not enough accumulator space to store outputs\n");
+            exit(1);
+        }
+        if (kernel_dim <= padding) {
+            printf("kernel_dim must be larger than padding\n");
+            exit(1);
+        }
+        if (input_dilation > 2) {
+            printf("input_dilation > 2 is only supported on CPU\n");
+            exit(1);
+        }
+        if (input_dilation > 1 && stride > 1) {
+            printf("input input_dilation is only supported when stride == 1\n");
+            exit(1);
+        }
+        if (trans_output_1203 && !no_pool) {
+            printf("Output can only be transposed when pooling is disabled\n");
+            exit(1);
+        }
+        if (trans_input_3120 && trans_weight_0132) {
+            printf("Cannot transpose innermost dimensions of both inputs and weights on WS.\n");
+            exit(1);
+        }
+    }
+#endif
+
+    const size_t st_dram_stride = trans_output_1203 ?
+        batch_size * out_channels * sizeof(elem_t) :
+        out_stride * sizeof(elem_t);
+    gemmini_extended_config_st(st_dram_stride, act, scale);
+
+    gemmini_extended3_config_ex(WEIGHT_STATIONARY, 0, 0, 0, input_dilation, stride >> downsample, trans_input_3120, trans_weight_0132, false);
+
+    const int pool_out_row_dim = (out_row_dim + 2 * pool_padding - pool_size) / pool_stride + 1;
+    const int pool_out_col_dim = (out_col_dim + 2 * pool_padding - pool_size) / pool_stride + 1;
+    const int dilated_in_row_dim = in_row_dim + (input_dilation - 1) * (in_row_dim- 1);
+    const int dilated_in_col_dim = in_col_dim + (input_dilation - 1) * (in_col_dim- 1);
+
+    size_t a_spad_id = 0;
+    size_t b_spad_id = 0;
+
+    int porow_end = pool_out_row_dim;
+	int porow_start = 0;
+    bool a_reuse = false;
+    bool b_reuse = false;
+    size_t num_kch = ceil_divide_int(in_channels, kchs);
+    size_t num_poch = ceil_divide_int(out_channels, pochs);
+    size_t num_b = ceil_divide_int(batch_size, batches);
+    size_t num_porow = ceil_divide_int((porow_end - porow_start), porows);
+    size_t num_pocol = ceil_divide_int(pool_out_col_dim, pocols);
+    size_t num_krow = ceil_divide_int(kernel_dim, krows);
+    size_t num_kcol = ceil_divide_int(kernel_dim, kcols);
+
+
+//    printf("num_kch: %d, num_poch: %d, num_b: %d, num_porow: %d, num_pocol: %d, num_krow: %d, num_kcol: %d\n", num_kch, num_poch, num_b, num_porow, num_pocol, num_krow, num_kcol);
+
+    if(num_kch * num_poch * num_krow * num_kcol <= 2) 
+      b_reuse = true;
+    if(num_kch * num_krow * num_kcol * num_b * num_porow * num_pocol <= 2)
+      a_reuse = true;
+
+    for (int b = 0; b < batch_size; b += batches) {
+        for (int porow = porow_start; porow < porow_end; porow += porows) {
+            const int orow = porow * pool_stride - pool_padding;
+
+            for (int pocol = 0; pocol < pool_out_col_dim; pocol += pocols) {
+                const int ocol = pocol * pool_stride - pool_padding;
+
+                for (int poch = 0; poch < out_channels; poch += pochs) {
+                    for (int krow = 0; krow < kernel_dim; krow += krows) {
+                        const int orow_floored = orow < 0 ? 0 : orow;
+                        int irow = orow_floored * stride + krow * kernel_dilation - padding;
+
+                        for (int kcol = 0; kcol < kernel_dim; kcol += kcols) {
+                            const int ocol_floored = ocol < 0 ? 0 : ocol;
+                            int icol = ocol_floored * stride + kcol * kernel_dilation - padding;
+
+                            for (int kch = 0; kch < in_channels; kch += kchs) {
+                                if(a_reuse)
+						           a_spad_id = (kch + krow + kcol + b + (porow - porow_start) + pocol) == 0 ? 1 : 2;
+					            if(b_reuse)
+						           b_spad_id = (kch + poch + krow + kcol) == 0 ? 1 : 2;
+                                elem_t * out = output + (b * pool_out_row_dim * pool_out_col_dim + porow * pool_out_col_dim + pocol) * out_stride + poch;
+                                if (trans_output_1203) {
+                                    out = output + (porow * pool_out_col_dim * batch_size + pocol * batch_size + b) * out_channels + poch;
+                                }
+
+                                if (krow + krows < kernel_dim ||
+                                        kcol + kcols < kernel_dim ||
+                                        kch + kchs < in_channels) {
+                                    out = NULL;
+                                }
+
+                                const acc_t * bias_ = bias + poch;
+                                if (krow > 0 ||
+                                        kcol > 0 ||
+                                        kch > 0) {
+                                    bias_ = NULL;
+                                }
+
+                                const int batches_ = batch_size - b > batches ? batches : batch_size - b;
+                                const int porows_ = pool_out_row_dim - porow > porows ? porows : pool_out_row_dim - porow;
+                                const int pocols_ = pool_out_col_dim - pocol > pocols ? pocols : pool_out_col_dim - pocol;
+                                const int pochs_ = out_channels - poch > pochs ? pochs : out_channels - poch;
+                                const int krows_ = kernel_dim - krow > krows ? krows : kernel_dim - krow;
+                                const int kcols_ = kernel_dim - kcol > kcols ? kcols : kernel_dim - kcol;
+                                const int kchs_ = in_channels - kch > kchs ? kchs : in_channels - kch;
+
+                                const int ocols_ = pocols_ * pool_stride + pool_size - 1;
+                                const int orows_ = porows_ * pool_stride + pool_size - 1;
+
+                                const int plpad = ocol < 0 ? -ocol : 0;
+                                const int prpad = ocol + ocols_ > out_col_dim ? ocol + ocols_ - out_col_dim : 0;
+                                const int pupad = orow < 0 ? -orow : 0;
+                                const int pdpad = orow + orows_ > out_row_dim ? orow + orows_ - out_row_dim : 0;
+
+                                const int dilated_krows_ = krows_ + (kernel_dilation - 1)*(krows_ - 1);
+                                const int dilated_kcols_ = kcols_ + (kernel_dilation - 1)*(kcols_ - 1);
+
+                                const int icols_ = (ocols_ - plpad - prpad) * stride + dilated_kcols_ - 1;
+                                const int irows_ = (orows_ - pupad - pdpad) * stride + dilated_krows_ - 1;
+
+                                int lpad = icol < 0 ? -icol : 0;
+                                int rpad = icol + icols_ > dilated_in_col_dim ? icol + icols_ - dilated_in_col_dim : 0;
+                                int upad = irow < 0 ? -irow : 0;
+                                int dpad = irow + irows_ > dilated_in_row_dim ? irow + irows_ - dilated_in_row_dim : 0;
+
+                                if (input_dilated) {
+                                  lpad += lpad == 0 && icol % 2 != 0;
+                                  rpad += rpad == 0 && (icol + icols_) % 2 != 1;
+                                  upad += upad == 0 && irow % 2 != 0;
+                                  dpad += dpad == 0 && (irow + irows_) % 2 != 1;
+                                }
+
+                                int krow_ = krow;
+                                int kcol_ = kcol;
+                                if (wrot180) {
+                                  krow_ = kernel_dim - krow - krows_;
+                                  kcol_ = kernel_dim - kcol - kcols_;
+                                }
+
+                                const elem_t * weights_slice = weights + (krow_*kernel_dim*in_channels + kcol_*in_channels + kch) * weight_stride + poch;
+                                if (trans_weight_1203) {
+                                  weights_slice = weights + (kch*kernel_dim*kernel_dim + krow_*kernel_dim+kcol_) * out_channels + poch;
+                                } else if (trans_weight_0132) {
+                                  weights_slice = weights + (krow_*kernel_dim*out_channels + kcol_*out_channels + poch) * in_channels + kch;
+                                }
+
+                                const elem_t * in = input + (b *in_row_dim * in_col_dim + ((irow+upad)>>input_dilated) * in_col_dim + ((icol+lpad)>>input_dilated)) * in_stride + kch;
+                                if (trans_input_3120) {
+                                  in = input + (kch * in_row_dim * in_col_dim + ((irow+upad)>>input_dilated) * in_col_dim + ((icol+lpad)>>input_dilated)) * batch_size + b;
+                                }
+                                if(b_reuse && (pocol + (porow - porow_start) + b > 0)) weights_slice = NULL;
+							    if(a_reuse && (poch > 0)) in = NULL;
+                                //printf("a_reuse: %d, b_reuse: %d, a_spad_id: %d, b_spad_id: %d, in: %llu, weight: %llu \n", a_reuse, b_reuse, a_spad_id, b_spad_id, in, weights_slice);
+ 
+                                sp_tiled_conv(
+                                    batch_size, in_row_dim, in_col_dim, in_channels,
+                                    out_channels, out_row_dim, out_col_dim,
+                                    pool_out_row_dim, pool_out_col_dim,
+
+                                    stride, padding, kernel_dim, kernel_dilation,
+                                    in_stride, weight_stride, out_stride,
+
+                                    pool_size, pool_stride, pool_padding,
+
+                                    batches_,
+                                    porows_, pocols_, pochs_,
+                                    krows_, kcols_, kchs_,
+
+                                    lpad, rpad, upad, dpad,
+                                    plpad, prpad, pupad, pdpad,
+
+                                    in,
+                                    weights_slice,
+                                    out,
+                                    bias_,
+
+                                    act, scale,
+
+                                    wrot180, trans_output_1203, trans_input_3120,
+                                    trans_weight_1203, trans_weight_0132,
+
+                                    no_bias, no_pool, downsample, input_dilated,
+                                    false, a_spad_id, b_spad_id);
+
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+
+static void tiled_conv_dw(
+    int batch_size, int in_row_dim, int in_col_dim,
+    int channels, int out_row_dim, int out_col_dim,
+    int stride, int padding, int kernel_dim,
+
+    int batches,
+    int porows, int pocols,
+    int krows, int kcols,
+
+    const elem_t * input,
+    const elem_t * weights,
+    const acc_t * bias,
+    elem_t * output,
+
+    int act, acc_scale_t scale,
+    int pool_size, int pool_stride, int pool_padding,
+
+    enum tiled_matmul_type_t tiled_conv_type) {
+
+    if (tiled_conv_type == CPU) {
+      if (pool_size == 1 && pool_stride == 1 && pool_padding == 0) {
+        pool_stride = 0;
+      }
+
+      conv_dw_cpu(
+        batch_size, in_row_dim, in_col_dim,
+        channels, out_row_dim, out_col_dim,
+        stride, padding, kernel_dim,
+        input, weights, bias, output,
+        act, scale,
+        pool_size, pool_stride, pool_padding);
+      return;
+    } else if (tiled_conv_type == OS) {
+      printf("Gemmini convs do not currently support OS\n");
+      exit(1);
+    }
+
+    // TODO move everything below this into a tiled_conv_outer function to match the tiled_matmul function
+
+    bool no_bias = false;
+    if (bias == NULL) {
+        bias = (acc_t*)1;
+        no_bias = true;
+    }
+
+    bool no_pool = pool_stride == 0;
+    if (no_pool) {
+        pool_size = 1;
+        pool_stride = 1;
+        pool_padding = 0;
+    }
+
+#ifdef GEMMINI_ASSERTIONS
+    {
+        // const int orows = porows * pool_stride + pool_size - 1;
+        // const int ocols = pocols * pool_stride + pool_size - 1;
+
+        // Check that data will fit in scratchpad
+        const int spad_rows = tiled_conv_total_spad_rows(false,
+            stride, 1, 1, false, false, false,
+            batches, porows, pocols, 1, krows, kcols, 1, pool_size, pool_stride);
+        const int acc_rows = tiled_conv_total_spad_rows(true,
+            stride, 1, 1, false, false, false,
+            batches, porows, pocols, 1, krows, kcols, 1, pool_size, pool_stride);
+
+        if (spad_rows > BANK_NUM * BANK_ROWS / 2) {
+            printf("not enough scratchpad space to store inputs and weights, %d\n", spad_rows);
+            exit(1);
+        }
+        if (acc_rows > ACC_ROWS / 2) {
+            printf("not enough accumulator space to store outputs\n");
+            exit(1);
+        }
+        if (kernel_dim <= padding) {
+            printf("kernel_dim must be larger than padding\n");
+            exit(1);
+        }
+    }
+#endif
+
+    const size_t st_dram_stride = channels * sizeof(elem_t);
+    gemmini_extended_config_st(st_dram_stride, act, scale);
+
+    gemmini_extended3_config_ex(WEIGHT_STATIONARY, 0, 0, 0, 1, stride, false, false, false);
+
+    const int pool_out_row_dim = (out_row_dim + 2 * pool_padding - pool_size) / pool_stride + 1;
+    const int pool_out_col_dim = (out_col_dim + 2 * pool_padding - pool_size) / pool_stride + 1;
+
+    for (int b = 0; b < batch_size; b += batches) {
+        for (int porow = 0; porow < pool_out_row_dim; porow += porows) {
+            const int orow = porow * pool_stride - pool_padding;
+
+            for (int pocol = 0; pocol < pool_out_col_dim; pocol += pocols) {
+                const int ocol = pocol * pool_stride - pool_padding;
+
+                for (int ch = 0; ch < channels; ch++) {
+                    for (int krow = 0; krow < kernel_dim; krow += krows) {
+                        const int orow_floored = orow < 0 ? 0 : orow;
+                        int irow = orow_floored * stride + krow - padding;
+
+                        for (int kcol = 0; kcol < kernel_dim; kcol += kcols) {
+                            const int ocol_floored = ocol < 0 ? 0 : ocol;
+                            int icol = ocol_floored * stride + kcol - padding;
+
+                            elem_t * out = output + (b * pool_out_row_dim * pool_out_col_dim + porow * pool_out_col_dim + pocol) * channels + ch;
+
+                            if (krow + krows < kernel_dim ||
+                                    kcol + kcols < kernel_dim) {
+                                out = NULL;
+                            }
+
+                            const acc_t * bias_ = bias + ch;
+                            if (krow > 0 ||
+                                    kcol > 0) {
+                                bias_ = NULL;
+                            }
+
+                            const int batches_ = batch_size - b > batches ? batches : batch_size - b;
+                            const int porows_ = pool_out_row_dim - porow > porows ? porows : pool_out_row_dim - porow;
+                            const int pocols_ = pool_out_col_dim - pocol > pocols ? pocols : pool_out_col_dim - pocol;
+                            const int krows_ = kernel_dim - krow > krows ? krows : kernel_dim - krow;
+                            const int kcols_ = kernel_dim - kcol > kcols ? kcols : kernel_dim - kcol;
+
+                            const int ocols_ = pocols_ * pool_stride + pool_size - 1;
+                            const int orows_ = porows_ * pool_stride + pool_size - 1;
+
+                            const int plpad = ocol < 0 ? -ocol : 0;
+                            const int prpad = ocol + ocols_ > out_col_dim ? ocol + ocols_ - out_col_dim : 0;
+                            const int pupad = orow < 0 ? -orow : 0;
+                            const int pdpad = orow + orows_ > out_row_dim ? orow + orows_ - out_row_dim : 0;
+
+                            const int icols_ = (ocols_ - plpad - prpad) * stride + kcols_ - 1;
+                            const int irows_ = (orows_ - pupad - pdpad) * stride + krows_ - 1;
+
+                            int lpad = icol < 0 ? -icol : 0;
+                            int rpad = icol + icols_ > in_col_dim ? icol + icols_ - in_col_dim : 0;
+                            int upad = irow < 0 ? -irow : 0;
+                            int dpad = irow + irows_ > in_row_dim ? irow + irows_ - in_row_dim : 0;
+
+                            const elem_t * weights_slice = weights + (ch*kernel_dim + krow) * kernel_dim + kcol;
+
+                            const elem_t *in = input + (b * in_row_dim * in_col_dim + (irow+upad) * in_col_dim + (icol+lpad)) * channels + ch;
+
+                            sp_tiled_conv(
+                                batch_size, in_row_dim, in_col_dim, channels,
+                                channels, out_row_dim, out_col_dim,
+                                pool_out_row_dim, pool_out_col_dim,
+
+                                stride, padding, kernel_dim, 1,
+                                channels, 1, channels,
+
+                                pool_size, pool_stride, pool_padding,
+
+                                batches_,
+                                porows_, pocols_, 1,
+                                krows_, kcols_, 1,
+
+                                lpad, rpad, upad, dpad,
+                                plpad, prpad, pupad, pdpad,
+
+                                in,
+                                weights_slice,
+                                out,
+                                bias_,
+
+                                act, scale,
+
+                                false, false, false,
+                                false, false,
+
+                                no_bias, no_pool, false, false,
+                                true, 0, 0);
+
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+// need to specify each operand/output's stride
+// stride only for trans == false, wrot == false
+static void tiled_conv_stride_auto(
+        int batch_size, int in_row_dim, int in_col_dim, int in_channels,
+        int out_channels, int out_row_dim, int out_col_dim,
+        int stride, int input_dilation, int kernel_dilation, int padding, int kernel_dim,
+        int in_stride, int weight_stride, int out_stride, // specify in/output's stride
+        bool wrot180, bool trans_output_1203, bool trans_input_3120,
+        bool trans_weight_1203, bool trans_weight_0132,
+
+        const elem_t * input,
+        const elem_t * weights,
+        const acc_t * bias,
+        elem_t * output,
+
+        int act, acc_scale_t scale,
+        int pool_size, int pool_stride, int pool_padding,
+
+        enum tiled_matmul_type_t tiled_conv_type) {
+
+    const bool no_pool = pool_stride == 0;
+    if (no_pool) {
+        pool_size = 1;
+        pool_stride = 1;
+        pool_padding = 0;
+    }
+
+    const int pool_out_row_dim = (out_row_dim + 2 * pool_padding - pool_size) / pool_stride + 1;
+    const int pool_out_col_dim = (out_col_dim + 2 * pool_padding - pool_size) / pool_stride + 1;
+
+    const bool downsample = stride == 2 && kernel_dim == 1 && padding == 0 && no_pool && in_row_dim % 2 == 0 && in_col_dim % 2 == 0;
+
+    // Tile convolution params
+
+    // int args[] = {batch_size, porows, pocols, pochs, krows, kcols, kchs};
+    int args[] = {batch_size, pool_out_row_dim, pool_out_col_dim, out_channels, kernel_dim, kernel_dim, in_channels};
+    const int max_args[] = {batch_size, pool_out_row_dim, pool_out_col_dim, out_channels, kernel_dim, kernel_dim, in_channels};
+
+    const int orows_idx = 1;
+    const int ocols_idx = 2;
+    const int out_channels_idx = 3;
+    const int in_channels_idx = 6;
+
+    // We divide by 2 for the sake of double-buffering
+    const int max_spad_rows = (BANK_NUM*BANK_ROWS / 2);
+    const int max_acc_rows = (ACC_ROWS / 2);
+
+    int spad_rows = tiled_conv_total_spad_rows(false,
+        stride, input_dilation, kernel_dilation, downsample, trans_weight_0132, trans_input_3120,
+        args[0], args[1], args[2], args[3], args[4], args[5], args[6], pool_size, pool_stride);
+    int acc_rows = tiled_conv_total_spad_rows(true,
+        stride, input_dilation, kernel_dilation, downsample, trans_weight_0132, trans_input_3120,
+        args[0], args[1], args[2], args[3], args[4], args[5], args[6], pool_size, pool_stride);
+
+    while (spad_rows > max_spad_rows || acc_rows > max_acc_rows) {
+        int max_val = -1;
+        int max_idx = -1;
+
+        for (size_t i = 0; i < sizeof(args)/sizeof(args[0]); i++) {
+            // We avoid reducing ocols when possible to keep the spatial array fully utilized
+            if (!(i == ocols_idx && args[i] <= DIM && args[orows_idx] > 1)
+                    && args[i] > max_val) {
+                max_val = args[i];
+                max_idx = i;
+            }
+        }
+
+        if (max_idx == out_channels_idx || max_idx == in_channels_idx) {
+            // For input and output channels, there's no point in subtracting by just one
+            if (args[max_idx] % DIM != 0) {
+                args[max_idx] = (args[max_idx] / DIM) * DIM;
+            } else {
+                args[max_idx] -= DIM;
+            }
+            args[max_idx] = args[max_idx] == 0 ? 1 : args[max_idx];
+        } else {
+            args[max_idx]--;
+        }
+
+        spad_rows = tiled_conv_total_spad_rows(false,
+            stride, input_dilation, kernel_dilation, downsample, trans_weight_0132, trans_input_3120,
+            args[0], args[1], args[2], args[3], args[4], args[5], args[6], pool_size, pool_stride);
+        acc_rows = tiled_conv_total_spad_rows(true,
+            stride, input_dilation, kernel_dilation, downsample, trans_weight_0132, trans_input_3120,
+            args[0], args[1], args[2], args[3], args[4], args[5], args[6], pool_size, pool_stride);
+    }
+
+    // Check if we can increase ocols
+    bool not_increased = false;
+    while (!not_increased) {
+        not_increased = true;
+
+        int args_candidate[] = {args[0], args[1], args[2], args[3], args[4], args[5], args[6]};
+        args_candidate[ocols_idx]++;
+
+        if (args_candidate[ocols_idx] > max_args[ocols_idx])
+            continue;
+
+        spad_rows = tiled_conv_total_spad_rows(false,
+            stride, input_dilation, kernel_dilation, downsample, trans_weight_0132, trans_input_3120,
+            args_candidate[0], args_candidate[1], args_candidate[2], args_candidate[3], args_candidate[4], args_candidate[5], args_candidate[6], pool_size, pool_stride);
+        acc_rows = tiled_conv_total_spad_rows(true,
+            stride, input_dilation, kernel_dilation, downsample, trans_weight_0132, trans_input_3120,
+            args_candidate[0], args_candidate[1], args_candidate[2], args_candidate[3], args_candidate[4], args_candidate[5], args_candidate[6], pool_size, pool_stride);
+
+        if (spad_rows <= max_spad_rows && acc_rows <= max_acc_rows) {
+            args[ocols_idx] = args_candidate[ocols_idx];
+            not_increased = false;
+        }
+    }
+
+    // Check if there are any parameters that we can currently still increase
+    bool nothing_increased = false;
+    while (!nothing_increased) {
+        nothing_increased = true;
+
+        for (size_t i = 0; i < sizeof(args)/sizeof(args[0]); i++) {
+            int args_candidate[] = {args[0], args[1], args[2], args[3], args[4], args[5], args[6]};
+            args_candidate[i]++;
+
+            if (args_candidate[i] > max_args[i])
+                continue;
+
+            spad_rows = tiled_conv_total_spad_rows(false,
+                stride, input_dilation, kernel_dilation, downsample, trans_weight_0132, trans_input_3120,
+                args_candidate[0], args_candidate[1], args_candidate[2], args_candidate[3], args_candidate[4], args_candidate[5], args_candidate[6], pool_size, pool_stride);
+            acc_rows = tiled_conv_total_spad_rows(true,
+                stride, input_dilation, kernel_dilation, downsample, trans_weight_0132, trans_input_3120,
+                args_candidate[0], args_candidate[1], args_candidate[2], args_candidate[3], args_candidate[4], args_candidate[5], args_candidate[6], pool_size, pool_stride);
+
+            if (spad_rows <= max_spad_rows && acc_rows <= max_acc_rows) {
+                args[i] = args_candidate[i];
+                nothing_increased = false;
+            }
+        }
+    }
+
+    const int batches = args[0];
+    const int orows = args[1];
+    const int ocols = args[2];
+    const int ochs = args[3];
+    const int krows = args[4];
+    const int kcols = args[5];
+    const int kchs = args[6];
+
+    /*
+    spad_rows = tiled_conv_total_spad_rows(false,
+        stride, input_dilation, kernel_dilation, downsample, trans_weight_0132, trans_input_3120,
+        args[0], args[1], args[2], args[3], args[4], args[5], args[6], pool_size, pool_stride);
+    acc_rows = tiled_conv_total_spad_rows(true,
+        stride, input_dilation, kernel_dilation, downsample, trans_weight_0132, trans_input_3120,
+        args[0], args[1], args[2], args[3], args[4], args[5], args[6], pool_size, pool_stride);
+    */
+
+#ifdef PRINT_TILE
+#if PRINT_TILE
+    printf("batches = %d\n", batches);
+    printf("orows   = %d\n", orows);
+    printf("ocols   = %d\n", ocols);
+    printf("ochs    = %d\n", ochs);
+    printf("krows   = %d\n", krows);
+    printf("kcols   = %d\n", kcols);
+    printf("kchs    = %d\n\n", kchs);
+
+    printf("total spad_rows reserved: %d\n", spad_rows);
+    printf("total acc_rows reserved: %d\n\n", acc_rows);
+
+    printf("scratchpad row utilization: %d%%\n", (spad_rows*100) / max_spad_rows);
+    printf("accumulator row utilization: %d%%\n\n", (acc_rows*100) / max_acc_rows);
+
+    printf("inner matmul size: i=%d, j=%d, k=%d\n\n", ocols, ochs, kchs);
+#endif
+#endif
+
+    tiled_conv(
+        batch_size, in_row_dim, in_col_dim, in_channels,
+        out_channels, out_row_dim, out_col_dim,
+        stride, input_dilation, kernel_dilation, padding, kernel_dim,
+        in_stride, weight_stride, out_stride,
+        wrot180, trans_output_1203, trans_input_3120,
+        trans_weight_1203, trans_weight_0132,
+
+        batches,
+        orows, ocols, ochs,
+        krows, kcols, kchs,
+
+        input,
+        weights,
+        bias,
+        output,
+
+        act, scale,
+        pool_size, no_pool ? 0 : pool_stride, pool_padding,
+
+        tiled_conv_type);
+}
+
+
+static void tiled_conv_auto(
+        int batch_size, int in_row_dim, int in_col_dim, int in_channels,
+        int out_channels, int out_row_dim, int out_col_dim,
+        int stride, int input_dilation, int kernel_dilation, int padding, int kernel_dim, 
+        bool wrot180, bool trans_output_1203, bool trans_input_3120,
+        bool trans_weight_1203, bool trans_weight_0132,
+
+        const elem_t * input,
+        const elem_t * weights,
+        const acc_t * bias,
+        elem_t * output,
+
+        int act, acc_scale_t scale,
+        int pool_size, int pool_stride, int pool_padding,
+
+        enum tiled_matmul_type_t tiled_conv_type) {
+
+    int in_stride = in_channels;
+    int out_stride = out_channels;
+    int weight_stride = out_channels;
+    tiled_conv_stride_auto(
+        batch_size, in_row_dim, in_col_dim, in_channels,
+        out_channels, out_row_dim, out_col_dim,
+        stride, input_dilation, kernel_dilation, padding, kernel_dim,
+        in_stride, weight_stride, out_stride,
+        wrot180, trans_output_1203, trans_input_3120,
+        trans_weight_1203, trans_weight_0132,
+        
+        input, weights, bias, output,
+
+        act, scale, pool_size, pool_stride, pool_padding,
+        tiled_conv_type);
+
+}
+
+// This function is for a convolution with kernel_dim=1, stride==2, padding=0, and no pooling
+static void tiled_conv_downsample(
+        int batch_size, int in_row_dim, int in_col_dim, int in_channels,
+        int out_channels, int out_row_dim, int out_col_dim,
+        int in_stride, int weight_stride, int out_stride,
+
+        const elem_t * input,
+        const elem_t * weights,
+        const acc_t * bias,
+        elem_t * output,
+
+        int act, acc_scale_t scale,
+
+        enum tiled_matmul_type_t tiled_conv_type) {
+
+    // Rectangular dimensions for this function are currently not supported
+    if (in_row_dim != in_col_dim || out_row_dim != out_col_dim) {
+        printf("Rectangular convolutions for tiled_conv_downsample are currently not supported.\n");
+        exit(1);
+    }
+
+    const int in_dim = in_row_dim;
+    const int out_dim = out_row_dim;
+
+    const int stride = 2;
+
+    for (int b = 0; b < batch_size; b++) {
+        for (int irow = 0; irow < in_row_dim; irow += stride) {
+            const int orow = irow / stride;
+
+            const int I = in_col_dim / stride; // number of columns in row
+            const int J = out_channels;
+            const int K = in_channels;
+
+            const elem_t * A = input + (b * in_dim + irow) * in_dim * in_stride;
+            const elem_t * B = weights;
+            const acc_t * D = bias;
+            elem_t * C = output + (b * out_dim + orow) * out_dim * out_stride;
+
+            const int A_stride = in_stride * 2;
+            const int B_stride = weight_stride;
+            const int D_stride = out_stride;
+            const int C_stride = out_stride;
+
+            tiled_matmul_auto(I, J, K, A, B, (void*)D, (void*)C,
+                    A_stride, B_stride, D_stride, C_stride,
+                    MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
+                    MVIN_SCALE_IDENTITY, act, scale, 0,
+                    true, false, false, false, false, 0, tiled_conv_type);
+        }
+    }
+}
+
+//for mobilenet's depthwise convs
+static void tiled_conv_dw_auto(
+    int batch_size, int in_row_dim, int in_col_dim,
+    int channels, int out_row_dim, int out_col_dim,
+    int stride, int padding, int kernel_dim,
+
+    elem_t * input,
+    elem_t * weights,
+    acc_t * bias,
+    elem_t * output,
+
+    int act, acc_scale_t scale,
+    int pool_size, int pool_stride, int pool_padding,
+
+    enum tiled_matmul_type_t tiled_conv_type) {
+
+    const bool no_pool = pool_stride == 0;
+    if (no_pool) {
+        pool_size = 1;
+        pool_stride = 1;
+        pool_padding = 0;
+    }
+
+    const int pool_out_row_dim = (out_row_dim + 2 * pool_padding - pool_size) / pool_stride + 1;
+    const int pool_out_col_dim = (out_col_dim + 2 * pool_padding - pool_size) / pool_stride + 1;
+
+    // Tile convolution params
+
+    // int args[] = {batch_size, porows, pocols, pochs, krows, kcols, kchs};
+    int args[] = {batch_size, pool_out_row_dim, pool_out_col_dim, 1, kernel_dim, kernel_dim, 1};
+    const int max_args[] = {batch_size, pool_out_row_dim, pool_out_col_dim, 1, kernel_dim, kernel_dim, 1};
+
+    const int orows_idx = 1;
+    const int ocols_idx = 2;
+    const int out_channels_idx = 3;
+
+    // We divide by 2 for the sake of double-buffering
+    const int max_spad_rows = (BANK_NUM*BANK_ROWS / 2);
+    const int max_acc_rows = (ACC_ROWS / 2);
+
+    int spad_rows = tiled_conv_total_spad_rows(false,
+        stride, 1, 1, false, false, false,
+        args[0], args[1], args[2], args[3], args[4], args[5], args[6], pool_size, pool_stride);
+    int acc_rows = tiled_conv_total_spad_rows(true,
+        stride, 1, 1, false, false, false,
+        args[0], args[1], args[2], args[3], args[4], args[5], args[6], pool_size, pool_stride);
+
+    while (spad_rows > max_spad_rows || acc_rows > max_acc_rows) {
+        int max_val = -1;
+        int max_idx = -1;
+
+        for (size_t i = 0; i < sizeof(args)/sizeof(args[0]); i++) {
+            // We avoid reducing ocols when possible to keep the spatial array fully utilized
+            if (!(i == ocols_idx && args[i] <= DIM && args[orows_idx] > 1)
+                    && args[i] > max_val) {
+                max_val = args[i];
+                max_idx = i;
+            }
+        }
+
+        if (max_idx == out_channels_idx) {
+            // For input and output channels, there's no point in subtracting by just one
+            if (args[max_idx] % DIM != 0) {
+                args[max_idx] = (args[max_idx] / DIM) * DIM;
+            } else {
+                args[max_idx] -= DIM;
+            }
+            args[max_idx] = args[max_idx] == 0 ? 1 : args[max_idx];
+        } else {
+            args[max_idx]--;
+        }
+
+        spad_rows = tiled_conv_total_spad_rows(false,
+            stride, 1, 1, false, false, false,
+            args[0], args[1], args[2], args[3], args[4], args[5], args[6], pool_size, pool_stride);
+        acc_rows = tiled_conv_total_spad_rows(true,
+            stride, 1, 1, false, false, false,
+            args[0], args[1], args[2], args[3], args[4], args[5], args[6], pool_size, pool_stride);
+    }
+
+    // Check if we can increase ocols
+    bool not_increased = false;
+    while (!not_increased) {
+        not_increased = true;
+
+        int args_candidate[] = {args[0], args[1], args[2], args[3], args[4], args[5], args[6]};
+        args_candidate[ocols_idx]++;
+
+        if (args_candidate[ocols_idx] > max_args[ocols_idx])
+            continue;
+
+        spad_rows = tiled_conv_total_spad_rows(false,
+            stride, 1, 1, false, false, false,
+            args_candidate[0], args_candidate[1], args_candidate[2], args_candidate[3], args_candidate[4], args_candidate[5], args_candidate[6], pool_size, pool_stride);
+        acc_rows = tiled_conv_total_spad_rows(true,
+            stride, 1, 1, false, false, false,
+            args_candidate[0], args_candidate[1], args_candidate[2], args_candidate[3], args_candidate[4], args_candidate[5], args_candidate[6], pool_size, pool_stride);
+
+        if (spad_rows <= max_spad_rows && acc_rows <= max_acc_rows) {
+            args[ocols_idx] = args_candidate[ocols_idx];
+            not_increased = false;
+        }
+    }
+
+    // Check if there are any parameters that we can currently still increase
+    bool nothing_increased = false;
+    while (!nothing_increased) {
+        nothing_increased = true;
+
+        for (size_t i = 0; i < sizeof(args)/sizeof(args[0]); i++) {
+            int args_candidate[] = {args[0], args[1], args[2], args[3], args[4], args[5], args[6]};
+            args_candidate[i]++;
+
+            if (args_candidate[i] > max_args[i])
+                continue;
+
+            spad_rows = tiled_conv_total_spad_rows(false,
+                stride, 1, 1, false, false, false,
+                args_candidate[0], args_candidate[1], args_candidate[2], args_candidate[3], args_candidate[4], args_candidate[5], args_candidate[6], pool_size, pool_stride);
+            acc_rows = tiled_conv_total_spad_rows(true,
+                stride, 1, 1, false, false, false,
+                args_candidate[0], args_candidate[1], args_candidate[2], args_candidate[3], args_candidate[4], args_candidate[5], args_candidate[6], pool_size, pool_stride);
+
+            if (spad_rows <= max_spad_rows && acc_rows <= max_acc_rows) {
+                args[i] = args_candidate[i];
+                nothing_increased = false;
+            }
+        }
+    }
+
+    const int batches = args[0];
+    const int orows = args[1];
+    const int ocols = args[2];
+    const int ochs = 1; // args[3];
+    const int krows = args[4];
+    const int kcols = args[5];
+    const int kchs = 1; // args[6];
+
+    /*
+    spad_rows = tiled_conv_total_spad_rows(false,
+        stride, 1, 1, false, false, false,
+        args[0], args[1], args[2], args[3], args[4], args[5], args[6], pool_size, pool_stride);
+    acc_rows = tiled_conv_total_spad_rows(true,
+        stride, 1, 1, false, false, false,
+        args[0], args[1], args[2], args[3], args[4], args[5], args[6], pool_size, pool_stride);
+
+    printf("batches = %d\n", batches);
+    printf("orows   = %d\n", orows);
+    printf("ocols   = %d\n", ocols);
+    printf("ochs    = %d\n", ochs);
+    printf("krows   = %d\n", krows);
+    printf("kcols   = %d\n", kcols);
+    printf("kchs    = %d\n\n", kchs);
+
+    printf("total spad_rows reserved: %d\n", spad_rows);
+    printf("total acc_rows reserved: %d\n\n", acc_rows);
+
+    printf("scratchpad row utilization: %d%%\n", (spad_rows*100) / max_spad_rows);
+    printf("accumulator row utilization: %d%%\n\n", (acc_rows*100) / max_acc_rows);
+
+    printf("inner matmul size: i=%d, j=%d, k=%d\n\n", ocols, ochs, kchs);
+    */
+
+    tiled_conv_dw(
+        batch_size, in_row_dim, in_col_dim,
+        channels, out_row_dim, out_col_dim,
+        stride, padding, kernel_dim,
+
+        batches,
+        orows, ocols,
+        krows, kcols,
+
+        input,
+        weights,
+        bias,
+        output,
+
+        act, scale,
+        pool_size, no_pool ? 0 : pool_stride, pool_padding,
+
+        tiled_conv_type);
+}
+
+
+static void resadd_cpu(const size_t I, const size_t J,
+        const size_t stride,
+        const scale_t A_scale,
+        const scale_t B_scale,
+        const acc_scale_t C_scale,
+        const elem_t * A,
+        const elem_t * B,
+        elem_t * C,
+        bool relu) {
+
+	const int minimum = relu ? 0 : elem_t_min;
+
+    for (size_t i = 0; i < I; i++) {
+        for (size_t j = 0; j < J; j++) {
+            const elem_t * a = A + i * stride + j;
+            const elem_t * b = B + i * stride + j;
+            elem_t * c = C + i * stride + j;
+
+            acc_t result = MVIN_SCALE(*a, A_scale) + MVIN_SCALE(*b, B_scale);
+            result = ACC_SCALE(result, C_scale);
+            result = result > elem_t_max ? elem_t_max :
+                (result < minimum ? minimum : result);
+
+            *c = result;
+        }
+    }
+}
+
+
+static void sp_tiled_resadd(const size_t I, const size_t J,
+        const scale_t A_scale,
+        const scale_t B_scale,
+        const elem_t * A, const elem_t * B, elem_t * C,
+        size_t A_row_stride, size_t B_row_stride, size_t C_row_stride,
+        bool relu) {
+
+    int pad_I = ((I%DIM) == 0) ? 0 : DIM - (I % DIM);
+    int pad_J = ((J%DIM) == 0) ? 0 : DIM - (J % DIM);
+    int tile_I = (I%DIM == 0) ? (int)(I/DIM) : (int)(I/DIM) + 1;
+    int tile_J = (J%DIM == 0) ? (int)(J/DIM) : (int)(J/DIM) + 1;
+    //printf("pad I: %d, pad_J: %d, tile_I: %d, tile_J: %d\n", pad_I, pad_J, tile_I, tile_J);
+    gemmini_loop_ws(tile_I, tile_J, 0, pad_I, pad_J, 0, A, B, NULL, C, A_row_stride, B_row_stride, 0, C_row_stride, false, false, false, false, false, relu, 0, 0, true);
+    /*
+    // Use the new mvin2 command to overlap mvin A, mvin B, and mvout C
+
+    size_t blocks = (J/DIM + (J % DIM != 0));
+    if (blocks > MAX_BLOCK_LEN) blocks = MAX_BLOCK_LEN;
+
+    const uint32_t D_sp_addr_start = 1 << (ADDR_LEN-1);
+    const uint32_t C_sp_addr_start = 3 << (ADDR_LEN-2);
+
+    const size_t rounded_up_J = (J / DIM + (J % DIM != 0)) * DIM;
+
+    // Mvin A
+    // printf("Mving A\n");
+    for (size_t i = 0; i < I; i += DIM) {
+        for (size_t j = 0; j < J; j += blocks * DIM) {
+            const size_t cols = j + blocks*DIM <= J ? blocks*DIM : J-j;
+            const size_t rows = i + DIM <= I ? DIM : I-i;
+
+            const elem_t * const A_dram_addr = A + i * A_row_stride + j;
+            const uint32_t A_sp_addr = D_sp_addr_start + i * (rounded_up_J/DIM) + j;
+
+            gemmini_extended_mvin(A_dram_addr, A_sp_addr, cols, rows);
+        }
+    }
+
+    // Mvin B
+     printf("Mving B\n");
+    for (size_t i = 0; i < I; i += DIM) {
+        for (size_t j = 0; j < J; j += blocks * DIM) {
+            const size_t cols = j + blocks*DIM <= J ? blocks*DIM : J-j;
+            const size_t rows = i + DIM <= I ? DIM : I-i;
+
+            const elem_t * const B_dram_addr = B + i * B_row_stride + j;
+            const uint32_t B_sp_addr = C_sp_addr_start + i * (rounded_up_J/DIM) + j;
+            gemmini_extended_mvin2(B_dram_addr, B_sp_addr, cols, rows);
+        }
+    }
+
+    // Mvout C from accumulator
+    // printf("Mvout C from accumulator\n");
+    for (size_t i = 0; i < I; i += DIM) {
+        for (size_t j = 0; j < J; j += blocks * DIM) {
+            const size_t cols = j + blocks*DIM <= J ? blocks*DIM : J-j;
+            const size_t rows = i + DIM <= I ? DIM : I-i;
+
+            elem_t * const C_dram_addr = C + i * C_row_stride + j;
+            const uint32_t C_sp_addr = D_sp_addr_start + i * (rounded_up_J/DIM) + j;
+            gemmini_extended_mvout(C_dram_addr, C_sp_addr, cols, rows);
+        }
+    }
+    */
+}
+
+// Compute MVIN_SCALE(A, A_scale) + MVIN_SCALE(B, B_scale) = C
+static void tiled_resadd(const size_t I, const size_t J,
+        const size_t stride,
+        const size_t tile_I, const size_t tile_J,
+        const scale_t A_scale,
+        const scale_t B_scale,
+        const acc_scale_t C_scale,
+        const elem_t * A,
+        const elem_t * B,
+        elem_t * C,
+        bool relu,
+        enum tiled_matmul_type_t matadd_type) {
+
+    gemmini_extended_config_st(stride * sizeof(elem_t), relu ? RELU : NO_ACTIVATION, C_scale);
+    gemmini_config_ex(WS, 0, 0);
+
+    gemmini_extended4_config_ld(stride * sizeof(elem_t), A_scale, true, DIM, 0);
+    gemmini_extended4_config_ld(stride * sizeof(elem_t), B_scale, true, DIM, 1);
+
+    for (size_t i = 0; i < I; i += tile_I) {
+        for (size_t j = 0; j < J; j += tile_J) {
+            const size_t I_tile = i + tile_I <= I ? tile_I : I - i;
+            const size_t J_tile = j + tile_J <= J ? tile_J : J - j;
+
+            const elem_t * a = A + i * stride + j;
+            const elem_t * b = B + i * stride + j;
+            elem_t * c = C + i * stride + j;
+
+            sp_tiled_resadd(I_tile, J_tile,
+                    A_scale, B_scale, a, b, c,
+                    stride, stride, stride,
+                    relu);
+        }
+    }
+
+    gemmini_fence();
+}
+
+// Compute (A >> A_shift) + B = C
+// specify stride
+static void tiled_resadd_stride_auto(const size_t I, const size_t J,
+        const scale_t A_scale,
+        const scale_t B_scale,
+        const acc_scale_t C_scale,
+        const size_t stride,
+        const elem_t * A,
+        const elem_t * B,
+        elem_t * C,
+        bool relu,
+        enum tiled_matmul_type_t matadd_type) {
+
+    if (matadd_type == CPU) {
+        resadd_cpu(I, J, stride,
+            A_scale, B_scale, C_scale, A, B, C,
+            relu);
+        return;
+    }
+
+    size_t tile_I = I, tile_J = J;
+
+    // size_t total_spad_rows = 2 * (tile_I / DIM + (tile_I % DIM != 0))*DIM * (tile_J / DIM + (tile_J % DIM != 0));
+    size_t total_acc_rows = (tile_I / DIM + (tile_I % DIM != 0))*DIM * (tile_J / DIM + (tile_J % DIM != 0));
+
+    // TODO this is a very inefficient way of doing this...
+    while (total_acc_rows > ACC_ROWS / 2) {
+        //if(tile_J > MAX_BLOCK_LEN * DIM)
+        //    tile_J = MAX_BLOCK_LEN * DIM;
+        //else 
+        if (tile_I >= tile_J || tile_J <= DIM)
+            tile_I /= 2;
+        else
+            tile_J -= DIM;
+
+        total_acc_rows = (tile_I / DIM + (tile_I % DIM != 0))*DIM * (tile_J / DIM + (tile_J % DIM != 0));
+    }
+
+    // printf("tile_I: %llu\n", tile_I);
+    // printf("tile_J: %llu\n", tile_J);
+
+    if (matadd_type == WS) {
+      tiled_resadd(I, J, stride, tile_I, tile_J,
+            A_scale, B_scale, C_scale, A, B, C,
+            relu, matadd_type);
+    }
+    else {
+      printf("Unsupported type\n");
+      exit(1);
+    }
+}
+
+static void tiled_resadd_auto(const size_t I, const size_t J,
+        const scale_t A_scale,
+        const scale_t B_scale,
+        const acc_scale_t C_scale,
+        const elem_t * A,
+        const elem_t * B,
+        elem_t * C,
+        bool relu,
+        enum tiled_matmul_type_t matadd_type) {
+    tiled_resadd_stride_auto(I, J, 
+        A_scale, B_scale, C_scale,
+        J,
+        A, B, C,
+        relu, matadd_type);
+}
+
+static void global_average_cpu(const elem_t * input, elem_t * output,
+    int batches, int channels, int dim) {
+  const int count = dim * dim;
+
+  for (int batch = 0; batch < batches; batch++) {
+    for (int channel = 0; channel < channels; channel++) {
+      acc_t sum = 0;
+      for (int row = 0; row < dim; row++) {
+        for (int col = 0; col < dim; col++) {
+          size_t pixel = batch * dim * dim + row * dim + col;
+
+          sum += input[pixel * channels + channel];
+        }
+      }
+
+#ifdef ELEM_T_IS_FLOAT
+      output[batch * channels + channel] = sum / count;
+#else
+      output[batch * channels + channel] = (sum + count/2) / count;
+#endif
+    }
+  }
+}
+
+
+static void sp_tiled_global_average(const elem_t * input, elem_t * output,
+    int batches, int channels, int dim, int channel_tile_size) {
+  const uint32_t C_acc_addr_start = ((uint32_t)1 << 31);
+
+  size_t blocks = channel_tile_size/DIM + (channel_tile_size % DIM != 0);
+  if (blocks > MAX_BLOCK_LEN) blocks = MAX_BLOCK_LEN;
+
+  for (int channel = 0; channel < channel_tile_size; channel += blocks*DIM) {
+    for (int row = 0; row < dim; row++) {
+      for (int col = 0; col < dim; col++) {
+        const elem_t * in = input +
+          (row * dim + col) * channels +
+          channel;
+
+        const uint32_t acc_addr_start = C_acc_addr_start |
+          ((row != 0 || col != 0) << 30);
+
+        const uint32_t acc_addr = acc_addr_start + channel / DIM;
+
+        const size_t cols = channel + blocks*DIM <= channel_tile_size ?
+          blocks*DIM : channel_tile_size - channel;
+
+        const size_t rows = 1;
+
+        gemmini_extended_mvin(in, acc_addr, cols, rows);
+      }
+    }
+  }
+
+  for (int channel = 0; channel < channel_tile_size; channel += DIM) {
+    elem_t * out = output + channel;
+
+    const uint32_t acc_addr = C_acc_addr_start + channel / DIM;
+
+    const size_t cols = channel + DIM <= channel_tile_size ?
+      DIM : channel_tile_size - channel;
+
+    const size_t rows = 1; // TODO we should move out more than just one row here
+
+    gemmini_extended_mvout(out, acc_addr, cols, rows);
+  }
+}
+
+
+static void tiled_global_average(const elem_t * input, elem_t * output,
+    int batches, int channels, int dim,
+    int channel_tile_size) {
+
+  gemmini_extended4_config_ld(DIM*sizeof(elem_t), MVIN_SCALE_IDENTITY, true, 1, 0);
+  gemmini_config_ex(0, NO_ACTIVATION, 0);
+  gemmini_extended_config_st(0, NO_ACTIVATION, 1.0 / (dim*dim));
+
+  for (int batch = 0; batch < batches; batch++) {
+    for (int channel = 0; channel < channels; channel += channel_tile_size) {
+      const int tile_size = channel + channel_tile_size <= channels ?
+        channel_tile_size : channels - channel;
+
+      sp_tiled_global_average(input + batch * dim * dim * channels + channel,
+          output + batch * channels + channel,
+          batches, channels, dim, tile_size);
+    }
+  }
+}
+
+
+static void tiled_global_average_auto(const elem_t * input, elem_t * output,
+    int batches, int channels, int dim,
+    enum tiled_matmul_type_t type) {
+  if (type == CPU) {
+    return global_average_cpu(input, output, batches, channels, dim);
+  }
+
+  int channel_tile_size = channels;
+
+  int acc_rows = channel_tile_size / DIM + (channel_tile_size % DIM != 0);
+  while (acc_rows > ACC_ROWS) {
+    channel_tile_size--;
+    acc_rows = channel_tile_size / DIM + (channel_tile_size % DIM != 0);
+  }
+
+  tiled_global_average(input, output, batches, channels, dim,
+      channel_tile_size);
+}
+
+static void sp_tiled_norm(const size_t I, const size_t J,
+        const acc_t * in, elem_t * out,
+        size_t A_row_stride, size_t C_row_stride,
+        int act) {
+#ifdef HAS_NORMALIZATIONS
+    size_t A_blocks = (J/DIM + (J % DIM != 0));
+    if (A_blocks > MAX_BLOCK_LEN_ACC) A_blocks = MAX_BLOCK_LEN_ACC;
+    size_t C_blocks = (J/DIM + (J % DIM != 0));
+    if (C_blocks > MAX_BLOCK_LEN) C_blocks = MAX_BLOCK_LEN;
+
+    const uint32_t D_sp_addr_start = 1 << (ADDR_LEN-1);
+    const uint32_t C_sp_addr_start = 3 << (ADDR_LEN-2);
+
+    const size_t rounded_up_J = (J / DIM + (J % DIM != 0)) * DIM;
+
+    for (size_t i = 0; i < I; i += DIM) {
+        // Mvin
+        for (size_t j = 0; j < J; j += A_blocks * DIM) {
+            const size_t cols = j + A_blocks*DIM <= J ? A_blocks*DIM : J-j;
+            const size_t rows = i + DIM <= I ? DIM : I-i;
+
+            const acc_t * const A_dram_addr = in + i * A_row_stride + j;
+            const uint32_t A_sp_addr = D_sp_addr_start + i * (rounded_up_J/DIM) + j;
+
+            gemmini_extended_mvin(A_dram_addr, A_sp_addr, cols, rows);
+        }
+
+        // Mvout
+        if (act == LAYERNORM) {
+            uint32_t norm_cmds[][2] = {{1,2},{3,4},{0,0}};
+            const int norm_cmds_size = sizeof(norm_cmds) / sizeof(norm_cmds[0]);
+            const size_t rows = I - i < DIM ? I - i : DIM;
+            for (size_t row = 0; row < rows; row += NORM_STAT_IDS) {
+                const size_t stat_ids = rows - row > NORM_STAT_IDS ?
+                    NORM_STAT_IDS : rows - row;
+                for (int cmd = 0; cmd < norm_cmds_size; cmd++) {
+                    for (size_t stat_id = 0; stat_id < stat_ids; stat_id++) {
+                        gemmini_config_norm(0, 0, 0, 0, stat_id, 0, 0);
+                        const size_t r = row + stat_id;
+                        for (size_t jj = 0; jj < J; jj += C_blocks * DIM) {
+                            uint32_t norm_C_sp_addr = C_sp_addr_start + i * (rounded_up_J/DIM) + jj + r;
+                            if (jj + C_blocks*DIM >= J) {
+                                norm_C_sp_addr |= (norm_cmds[cmd][1] << 26); // Final mean/inv-std-dev calculation
+                            } else {
+                                norm_C_sp_addr |= (norm_cmds[cmd][0] << 26); // Accumulate sum/variance
+                            }
+                            void * const C_dram_addr = (int8_t*)out +
+                                (i*C_row_stride + jj) * sizeof(elem_t) +
+                                r * C_row_stride * sizeof(elem_t);
+                            const size_t cols = J - jj < C_blocks * DIM ? J - jj : C_blocks * DIM;
+                            gemmini_extended_mvout(C_dram_addr, norm_C_sp_addr, cols, 1);
+                        }
+                    }
+                }
+            }
+        } else if (act == SOFTMAX) {
+            uint32_t norm_cmds[][2] = {{5,5},{6,7},{0,0}};
+            const int norm_cmds_size = sizeof(norm_cmds) / sizeof(norm_cmds[0]);
+            const size_t rows = I - i < DIM ? I - i : DIM;
+            for (size_t row = 0; row < rows; row += NORM_STAT_IDS) {
+                const size_t stat_ids = rows - row > NORM_STAT_IDS ?
+                    NORM_STAT_IDS : rows - row;
+                for (int cmd = 0; cmd < norm_cmds_size; cmd++) {
+                    for (size_t stat_id = 0; stat_id < stat_ids; stat_id++) {
+                        // set stat id only
+                        gemmini_config_norm(0, 0, 1, 0, stat_id, 0, 0);
+                        const size_t r = row + stat_id;
+                        for (size_t jj = 0; jj < J; jj += C_blocks * DIM) {
+                            uint32_t norm_C_sp_addr = C_sp_addr_start + i * (rounded_up_J/DIM) + jj + r;
+                            if (jj + C_blocks*DIM >= J) {
+                                norm_C_sp_addr |= (norm_cmds[cmd][1] << 26); // Final mean/inv-std-dev calculation
+                            } else {
+                                norm_C_sp_addr |= (norm_cmds[cmd][0] << 26); // Accumulate sum/variance
+                            }
+                            void * const C_dram_addr = (int8_t*)out +
+                                (i*C_row_stride + jj) * sizeof(elem_t) +
+                                r * C_row_stride * sizeof(elem_t);
+                            const size_t cols = J - jj < C_blocks * DIM ? J - jj : C_blocks * DIM;
+                            gemmini_extended_mvout(C_dram_addr, norm_C_sp_addr, cols, 1);
+                        }
+                    }
+                }
+            }
+        }
+
+    }
+#else
+    printf("Normalizations not supported in this Gemmini config\n");
+    exit(1);
+#endif
+}
+
+static void tiled_norm(const size_t I, const size_t J,
+        const size_t tile_I, const size_t tile_J,
+        const acc_t * in,
+        elem_t * out,
+        const acc_scale_t C_scale,
+        int act,
+        enum tiled_matmul_type_t norm_type) {
+
+    gemmini_extended_config_st(J * sizeof(elem_t), act & 3, C_scale);
+    gemmini_config_ex(WS, 0, 0); // TODO is this actually required?
+
+    gemmini_extended4_config_ld(J * sizeof(acc_t), MVIN_SCALE_IDENTITY, false, DIM, 0);
+    gemmini_extended4_config_ld(J * sizeof(acc_t), MVIN_SCALE_IDENTITY, false, DIM, 1);
+
+    if (act == SOFTMAX) {
+        const scale_t a = 0.3585;
+        const scale_t b = 1.353;
+        const scale_t c = 0.344;
+
+        // TODO let bert-scale be set by the programmer
+        acc_scale_t bert_scale = 0.05;
+        const acc_t qln2 = (int) (0.693147 / bert_scale);
+        const acc_t qln2_inv = 65536 / qln2;
+        const acc_t qb = b / bert_scale;
+        const acc_t qc = c / (a*bert_scale*bert_scale);
+
+        gemmini_config_norm(qln2, 0, 0, 1, 0, qb, qc);
+        gemmini_config_norm(qln2_inv, 1, 0, 1, 0, qb, qc);
+    }
+
+    for (size_t i = 0; i < I; i += tile_I) {
+        for (size_t j = 0; j < J; j += tile_J) {
+            const size_t I_tile = i + tile_I <= I ? tile_I : I - i;
+            const size_t J_tile = j + tile_J <= J ? tile_J : J - j;
+
+            const acc_t * in_ = in + i * J + j;
+            elem_t * out_ = out + i * J + j;
+
+            sp_tiled_norm(I_tile, J_tile,
+                    in_, out_,
+                    J, J,
+                    act);
+        }
+    }
+
+    gemmini_fence();
+}
+
+static void tiled_norm_auto(const size_t I, const size_t J,
+        const acc_t * in,
+        elem_t * out,
+        const acc_scale_t C_scale,
+        int act,
+        enum tiled_matmul_type_t norm_type) {
+
+    size_t tile_I = I, tile_J = J;
+    size_t total_acc_rows = (tile_I / DIM + (tile_I % DIM != 0))*DIM * (tile_J / DIM + (tile_J % DIM != 0));
+
+    while (total_acc_rows > ACC_ROWS) {
+        if (tile_I > 1) {
+            tile_I--;
+        } else {
+            // TODO we should be able to tile over J as well to avoid this issue
+            printf("Can't fit pre-normalized tensor into accumulator");
+            exit(1);
+        }
+
+        total_acc_rows = (tile_I / DIM + (tile_I % DIM != 0))*DIM * (tile_J / DIM + (tile_J % DIM != 0));
+    }
+
+    if (norm_type) {
+      tiled_norm(I, J, tile_I, tile_J,
+            in, out,
+            C_scale, act, norm_type);
+    } else {
+      printf("Unsupported type\n");
+      exit(1);
+    }
+}
+
+#undef abs
+
+#endif // SRC_MAIN_C_GEMMINI_H
+
diff --git a/gemmini/include/gemmini_counter.h b/gemmini/include/gemmini_counter.h
new file mode 100644
index 00000000..6050ed74
--- /dev/null
+++ b/gemmini/include/gemmini_counter.h
@@ -0,0 +1,79 @@
+// See LICENSE for license details.
+
+#ifndef COUNTER_H_
+#define COUNTER_H_
+
+#define DISABLE 0
+
+#define INCREMENTAL_COUNTERS 44
+
+// All existing Gemmini performance counters
+
+#define MAIN_LD_CYCLES 1
+#define MAIN_ST_CYCLES 2
+#define MAIN_EX_CYCLES 3
+#define MAIN_LD_ST_CYCLES 4
+#define MAIN_LD_EX_CYCLES 5
+#define MAIN_ST_EX_CYCLES 6
+#define MAIN_LD_ST_EX_CYCLES 7
+
+#define LOAD_DMA_WAIT_CYCLE 8
+#define LOAD_ACTIVE_CYCLE 9
+#define LOAD_SCRATCHPAD_WAIT_CYCLE 10
+
+#define STORE_DMA_WAIT_CYCLE 11
+#define STORE_ACTIVE_CYCLE 12
+#define STORE_POOLING_CYCLE 13
+#define STORE_SCRATCHPAD_WAIT_CYCLE 14
+
+#define DMA_TLB_MISS_CYCLE 15
+#define DMA_TLB_HIT_REQ 16
+#define DMA_TLB_TOTAL_REQ 17
+
+#define RDMA_ACTIVE_CYCLE 18
+#define RDMA_TLB_WAIT_CYCLES 19
+#define RDMA_TL_WAIT_CYCLES 20
+
+#define WDMA_ACTIVE_CYCLE 21
+#define WDMA_TLB_WAIT_CYCLES 22
+#define WDMA_TL_WAIT_CYCLES 23
+
+#define EXE_ACTIVE_CYCLE 24
+#define EXE_FLUSH_CYCLE 25
+#define EXE_CONTROL_Q_BLOCK_CYCLE 26
+#define EXE_PRELOAD_HAZ_CYCLE 27
+#define EXE_OVERLAP_HAZ_CYCLE 28
+
+#define SCRATCHPAD_A_WAIT_CYCLE 29
+#define SCRATCHPAD_B_WAIT_CYCLE 30
+#define SCRATCHPAD_D_WAIT_CYCLE 31
+
+#define ACC_A_WAIT_CYCLE 32
+#define ACC_B_WAIT_CYCLE 33
+#define ACC_D_WAIT_CYCLE 34
+
+#define A_GARBAGE_CYCLES 35
+#define B_GARBAGE_CYCLES 36
+#define D_GARBAGE_CYCLES 37
+
+#define IM2COL_MEM_CYCLES 38
+#define IM2COL_ACTIVE_CYCLES 39
+#define IM2COL_TRANSPOSER_WAIT_CYCLE 40
+
+#define RESERVATION_STATION_FULL_CYCLES 41
+#define RESERVATION_STATION_ACTIVE_CYCLES 42
+
+#define LOOP_MATMUL_ACTIVE_CYCLES 43
+#define TRANSPOSE_PRELOAD_UNROLLER_ACTIVE_CYCLES 44
+
+#define RESERVATION_STATION_LD_COUNT (INCREMENTAL_COUNTERS + 1)
+#define RESERVATION_STATION_ST_COUNT (INCREMENTAL_COUNTERS + 2)
+#define RESERVATION_STATION_EX_COUNT (INCREMENTAL_COUNTERS + 3)
+
+#define RDMA_BYTES_REC (INCREMENTAL_COUNTERS + 4)
+#define WDMA_BYTES_SENT (INCREMENTAL_COUNTERS + 5)
+
+#define RDMA_TOTAL_LATENCY (INCREMENTAL_COUNTERS + 6)
+#define WDMA_TOTAL_LATENCY (INCREMENTAL_COUNTERS + 7)
+
+#endif
diff --git a/gemmini/include/gemmini_nn.h b/gemmini/include/gemmini_nn.h
new file mode 100644
index 00000000..44c42ffa
--- /dev/null
+++ b/gemmini/include/gemmini_nn.h
@@ -0,0 +1,576 @@
+#ifndef GEMMINI_NN_H
+#define GEMMINI_NN_H
+
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+#ifndef BAREMETAL
+#include <sys/mman.h>
+#endif
+#include "include/gemmini.h"
+#include "include/gemmini_testutils.h"
+
+struct ConvParams {
+    int batch_size;
+    int in_row_dim;
+    int in_col_dim;
+    int out_row_dim;
+    int out_col_dim;
+    int kernel_size;
+    int in_channels;
+    int out_channels;
+    int in_stride;
+    int weight_stride;
+    int out_stride;
+    int stride;
+    int padding;
+    bool bias;
+    bool depthwise;
+    int n_patches;
+    int patch_size;
+    acc_scale_t output_scale;
+    scale_t res_scale;
+    int pool_size, pool_stride, pool_padding, out_dim_pooled;
+    
+    int I, J, K;
+};
+
+struct FcParams {
+    int batch_size;
+    int in_features;
+    int out_features;
+    acc_scale_t output_scale;
+    bool bias;
+
+    int I, J, K;
+};
+
+#define HIST_IMAGES(IMAGES) \
+    for (int num = -128; num <= 127; num++) { \
+        int count = 0; \
+        for (int i = 0; i < sizeof(IMAGES)/sizeof(IMAGES[0]); i++) { \
+            for (int j = 0; j < sizeof(IMAGES[0])/sizeof(IMAGES[0][0]); j++) { \
+                for (int k = 0; k < sizeof(IMAGES[0][0])/sizeof(IMAGES[0][0][0]); k++) { \
+                    for (int l = 0; l < sizeof(IMAGES[0][0][0])/sizeof(IMAGES[0][0][0][0]); l++) { \
+                        if (IMAGES[i][j][k][l] == num) { \
+                            count++; \
+                        } \
+                    } \
+                } \
+            } \
+        } \
+        if (count > 0) \
+            printf("%d: %d times\n", num, count); \
+    }
+
+#define HIST_MATRIX(MATRIX) \
+    for (int num = -128; num <= 127; num++) { \
+        int count = 0; \
+        for (int i = 0; i < sizeof(MATRIX)/sizeof(MATRIX[0]); i++) { \
+            for (int j = 0; j < sizeof(MATRIX[0])/sizeof(MATRIX[0][0]); j++) { \
+                if (MATRIX[i][j] == num) { \
+                    count++; \
+                } \
+            } \
+        } \
+        if (count > 0) \
+            printf("%d: %d times\n", num, count); \
+    }
+
+// This function runs a tiled matrix multiplication, with explicit tiling
+// factors
+static void tiled_matmul_nn(size_t dim_I, size_t dim_J, size_t dim_K,
+        const elem_t A[dim_I][dim_K], const elem_t B[dim_K][dim_J],
+        const void * D, elem_t C[dim_I][dim_J],
+        int act, acc_scale_t scale, bool repeating_bias,
+        size_t tile_I, size_t tile_J, size_t tile_K,
+        enum tiled_matmul_type_t tiled_matmul_type,
+        bool check, char * layer_name)
+{
+    if (check)
+        printf("%s: gemmini\n", layer_name);
+
+    tiled_matmul(dim_I, dim_J, dim_K,
+        (elem_t*)A, (elem_t*)B, D, (elem_t*)C, 
+        dim_K, dim_J, dim_J, dim_J,
+        MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
+        act, scale, 0, repeating_bias,
+        tile_I, tile_J, tile_K,
+        false, false,
+        false, false,
+        0,
+        tiled_matmul_type);
+
+    if (check) {
+        printf("%s: CPU\n", layer_name);
+        elem_t gold[dim_I][dim_J];
+        tiled_matmul_auto(dim_I, dim_J, dim_K,
+            (elem_t*)A, (elem_t*)B, D, (elem_t*)gold, 
+            dim_K, dim_J, dim_J, dim_J,
+            MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
+            act, scale, 0, repeating_bias,
+            false, false,
+            false, false,
+            0,
+            CPU);
+
+        if (!MAT_IS_EQUAL(dim_I, dim_J, C, gold)) {
+            printf("Layer calculated incorrectly: %s\n", layer_name);
+            exit(1);
+        }
+    }
+}
+
+// This function runs a tiled matrix multiplication, with automatically
+// calculated tiling factors
+// With default auto-stride calc (A_stride = dim_K, B_stride/C_stride/D_stride = dim_J)
+static void tiled_matmul_nn_auto(size_t dim_I, size_t dim_J, size_t dim_K, 
+        const elem_t A[dim_I][dim_K], const elem_t B[dim_K][dim_J],
+        const void * D, elem_t C[dim_I][dim_J],
+        int act, acc_scale_t scale, bool repeating_bias,
+        enum tiled_matmul_type_t tiled_matmul_type,
+        bool check, char * layer_name)
+{
+    if (check)
+        printf("%s: gemmini\n", layer_name);
+
+    tiled_matmul_auto(dim_I, dim_J, dim_K,
+        (elem_t*)A, (elem_t*)B, D, (elem_t*)C,
+        dim_K, dim_J, dim_J, dim_J,
+        MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
+        act, scale, 0, repeating_bias,
+        false, false,
+        false, false,
+        0,
+        tiled_matmul_type);
+
+    if (check) {
+        printf("%s: CPU\n", layer_name);
+        elem_t gold[dim_I][dim_J];
+        tiled_matmul_auto(dim_I, dim_J, dim_K,
+            (elem_t*)A, (elem_t*)B, D, (elem_t*)gold, 
+            dim_K, dim_J, dim_J, dim_J,
+            MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
+            act, scale, 0, repeating_bias,
+            false, false,
+            false, false,
+            0,
+            CPU);
+
+        if (!MAT_IS_EQUAL(dim_I, dim_J, C, gold)) {
+            printf("Layer calculated incorrectly: %s\n", layer_name);
+            exit(1);
+        }
+    }
+}
+
+// need to specify stride
+// auto tiling calc
+static void tiled_matmul_nn_stride_auto(size_t dim_I, size_t dim_J, size_t dim_K,
+        const size_t A_stride, const size_t B_stride, const size_t C_stride,
+        const elem_t * A, const elem_t * B, const void * D, const elem_t * C,
+        int act, acc_scale_t scale, bool repeating_bias,
+        enum tiled_matmul_type_t tiled_matmul_type)
+{
+
+    tiled_matmul_auto(dim_I, dim_J, dim_K,
+        (elem_t*)A, (elem_t*)B, D, (elem_t*)C,
+        A_stride, B_stride, C_stride, C_stride,
+        MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
+        act, scale, 0, repeating_bias,
+        false, false,
+        false, false,
+        0,
+        tiled_matmul_type);
+}
+static void conv_dw(size_t I, size_t J,
+    const size_t batch_size, const size_t channels,
+    const size_t in_row_dim, const size_t in_col_dim,
+    const size_t out_row_dim, const size_t out_col_dim,
+    const size_t kernel_size,
+    const elem_t input[batch_size][in_row_dim][in_col_dim][channels],
+    const elem_t weight[channels][kernel_size][kernel_size],
+    const acc_t * bias,
+    // elem_t output [batch_size][out_row_dim][out_col_dim][channels],
+    elem_t output [I][J],
+    const struct ConvParams * params)
+{
+    for (int batch = 0; batch < batch_size; batch++) {
+        for (int channel = 0; channel < channels; channel++) {
+            for (int out_row = 0; out_row < out_row_dim; out_row++) {
+                for (int out_col = 0; out_col < out_col_dim; out_col++) {
+                    int in_row = out_row * params->stride - params->padding;
+
+                    acc_t result = 0;
+                    if (params->bias) {
+                        result = bias[channel];
+                    }
+
+                    for (int kernel_row = 0; kernel_row < params->kernel_size; kernel_row++) {
+                        int in_col = out_col * params->stride - params->padding;
+
+                        for (int kernel_col = 0; kernel_col < params->kernel_size; kernel_col++) {
+                            if (in_row >= 0 && in_row < params->in_row_dim && in_col >= 0 && in_col < params->in_col_dim) {
+                                result += input[batch][in_row][in_col][channel] * weight[channel][kernel_row][kernel_col];
+                            }
+
+                            in_col++;
+                        }
+
+                        in_row++;
+                    }
+
+                    if (result < 0) {
+                        result = 0;
+                    }
+                    
+                    acc_t scaled = ACC_SCALE(result, params->output_scale);
+
+                    if (scaled > elem_t_max) {
+                        scaled = elem_t_max;
+                    } else if (scaled < elem_t_min) {
+                        scaled = elem_t_min;
+                    }
+                    
+                    size_t r = batch * params->out_row_dim * params->out_col_dim + out_row * params->out_col_dim + out_col;
+                    output[r][channel] = scaled;
+                    // output[batch][out_row][out_col][channel] = scaled;
+                }
+            }
+        }
+    }
+}
+
+static void conv_dw_with_col2im(size_t prev_I, size_t prev_J, size_t I, size_t J,
+    const size_t batch_size, const size_t channels,
+    const size_t out_row_dim, const size_t out_col_dim, const size_t kernel_size,
+    const elem_t input[prev_I][prev_J],
+    const elem_t weight[channels][kernel_size][kernel_size],
+    const acc_t * bias,
+    // elem_t output [batch_size][out_dim][out_dim][channels],
+    elem_t output [I][J],
+    const struct ConvParams * params)
+{
+    for (int batch = 0; batch < batch_size; batch++) {
+        for (int channel = 0; channel < channels; channel++) {
+            for (int out_row = 0; out_row < out_row_dim; out_row++) {
+                for (int out_col = 0; out_col < out_col_dim; out_col++) {
+                    int in_row = out_row * params->stride - params->padding;
+
+                    acc_t result = 0;
+                    if (params->bias) {
+                        result = bias[channel];
+                    }
+
+                    for (int kernel_row = 0; kernel_row < params->kernel_size; kernel_row++) {
+                        int in_col = out_col * params->stride - params->padding;
+
+                        for (int kernel_col = 0; kernel_col < params->kernel_size; kernel_col++) {
+                            if (in_row >= 0 && in_row < params->in_row_dim && in_col >= 0 && in_col < params->in_col_dim) {
+                                // result += input[batch][in_row][in_col][channel] * weight[channel][kernel_row][kernel_col];
+
+                                size_t r = batch * params->in_row_dim * params->in_col_dim + in_row * params->in_col_dim + in_col;
+
+                                result += input[r][channel] * weight[channel][kernel_row][kernel_col];
+                            }
+
+                            in_col++;
+                        }
+
+                        in_row++;
+                    }
+
+                    if (result < 0) {
+                        result = 0;
+                    }
+                    
+                    acc_t scaled = ACC_SCALE(result, params->output_scale);
+
+                    if (scaled > elem_t_max) {
+                        scaled = elem_t_max;
+                    } else if (scaled < elem_t_min) {
+                        scaled = elem_t_min;
+                    }
+                    
+                    size_t r = batch * params->out_row_dim * params->out_col_dim + out_row * params->out_col_dim + out_col;
+                    output[r][channel] = scaled;
+                    // output[batch][out_row][out_col][channel] = scaled;
+                }
+            }
+        }
+    }
+}
+
+static void im2col(size_t batch_size, size_t channels, size_t im_row_dim, size_t im_col_dim,
+    size_t I, size_t K,
+    const elem_t input[batch_size][im_row_dim][im_col_dim][channels],
+    elem_t output[I][K],
+    const struct ConvParams * params)
+{
+    int patch_row = 0;
+
+    for (int n_batch = 0; n_batch < params->batch_size; n_batch++) {
+        for (int im_row = -params->padding; im_row < params->in_row_dim - params->kernel_size + params->padding + 1; im_row += params->stride) {
+            for (int im_col = -params->padding; im_col < params->in_col_dim - params->kernel_size + params->padding + 1; im_col += params->stride) {
+                int patch_col = 0;
+
+                for (int filter_row = 0; filter_row < params->kernel_size; filter_row++) {
+                    for (int filter_col = 0; filter_col < params->kernel_size; filter_col++) {
+                        for (int im_channel = 0; im_channel < params->in_channels; im_channel++) {
+                            int pixel_row = im_row + filter_row;
+                            int pixel_col = im_col + filter_col;
+                            
+                            if (pixel_row < 0 || pixel_row >= params->in_row_dim
+                                || pixel_col < 0 || pixel_col >= params->in_col_dim) {
+                                // output[patch_row][patch_col] = 0;
+                            } else {
+                                output[patch_row][patch_col] = input[n_batch][pixel_row][pixel_col][im_channel];
+                            }
+
+                            patch_col++;
+                        }
+                    }
+                }
+                
+                patch_row++;
+            }
+        }
+    }
+}
+
+static void im2col_with_col2im(size_t prev_I, size_t prev_J,
+    size_t next_I, size_t next_K,
+    const elem_t input[prev_I][prev_J],
+    elem_t output[next_I][next_K],
+    const struct ConvParams * params)
+{
+    int out_row = 0;
+
+    for (int n_batch = 0; n_batch < params->batch_size; n_batch++) {
+        for (int im_row = -params->padding; im_row < params->in_row_dim - params->kernel_size + params->padding + 1; im_row += params->stride) {
+            for (int im_col = -params->padding; im_col < params->in_col_dim - params->kernel_size + params->padding + 1; im_col += params->stride) {
+                int out_col = 0;
+
+                for (int filter_row = 0; filter_row < params->kernel_size; filter_row++) {
+                    for (int filter_col = 0; filter_col < params->kernel_size; filter_col++) {
+                        for (int im_channel = 0; im_channel < params->in_channels; im_channel++) {
+                            int pixel_row = im_row + filter_row;
+                            int pixel_col = im_col + filter_col;
+
+                            if (pixel_row < 0 || pixel_row >= params->in_row_dim
+                                || pixel_col < 0 || pixel_col >= params->in_col_dim) {
+                                // output[out_row][out_col] = 0;
+                            } else {
+                                int in_row = n_batch * params->in_row_dim * params->in_col_dim + pixel_row * params->in_col_dim + pixel_col;
+                                int in_col = im_channel;
+
+                                output[out_row][out_col] = input[in_row][in_col];
+                            }
+
+                            out_col++;
+                        }
+                    }
+                }
+
+                out_row++;
+            }
+        }
+    }
+}
+
+// Compute C = A + B with saturating add
+void vecadd(size_t len, const elem_t * A, const elem_t * B, elem_t * C, scale_t A_shift) {
+    for (size_t i = 0; i < len; i++) {
+        acc_t result = MVIN_SCALE(A[i], A_shift) + B[i];
+
+        if (result > elem_t_max) {
+            result = elem_t_max;
+        } else if (result < elem_t_min) {
+            result = elem_t_min;
+        }
+
+        C[i] = result;
+    }
+}
+
+void resadd1(const size_t batch_size, const size_t channels, const size_t im_dim,
+    const elem_t A[batch_size][im_dim][im_dim][channels],
+    const elem_t B[batch_size][im_dim][im_dim][channels],
+    elem_t C[batch_size][im_dim][im_dim][channels],
+    bool relu,
+    const struct ConvParams * params) {
+
+    const int minimum = relu ? 0 : elem_t_min;
+
+    for (size_t batch = 0; batch < params->batch_size; batch++) {
+        for (size_t row = 0; row < params->out_dim_pooled; row++) {
+            for (size_t col = 0; col < params->out_dim_pooled; col++) {
+                for (size_t channel = 0; channel < params->out_channels; channel++) {
+                    acc_t result = MVIN_SCALE(A[batch][row][col][channel], params->res_scale) + B[batch][row][col][channel];
+
+                    if (result > elem_t_max) {
+                        result = elem_t_max;
+                    } else if (result < minimum) {
+                        result = minimum;
+                    }
+
+                    C[batch][row][col][channel] = result;
+                }
+            }
+        }
+    }
+}
+
+void resadd2(const size_t I, const size_t J,
+    const size_t batch_size, const size_t channels, const size_t im_dim,
+    const elem_t A[I][J],
+    const elem_t B[batch_size][im_dim][im_dim][channels],
+    elem_t C[batch_size][im_dim][im_dim][channels],
+    bool relu,
+    const struct ConvParams * params) {
+
+    const int minimum = relu ? 0 : elem_t_min;
+
+    for (size_t batch = 0; batch < params->batch_size; batch++) {
+        for (size_t row = 0; row < params->out_dim_pooled; row++) {
+            for (size_t col = 0; col < params->out_dim_pooled; col++) {
+                for (size_t channel = 0; channel < params->out_channels; channel++) {
+                    size_t r = batch * params->out_dim_pooled * params->out_dim_pooled + row * params->out_dim_pooled + col;
+
+                    acc_t result = MVIN_SCALE(A[r][channel], params->res_scale) + B[batch][row][col][channel];
+
+                    if (result > elem_t_max) {
+                        result = elem_t_max;
+                    } else if (result < minimum) {
+                        result = minimum;
+                    }
+
+                    C[batch][row][col][channel] = result;
+                }
+            }
+        }
+    }
+}
+
+void resadd3(const size_t I, const size_t J,
+    const elem_t A[I][J],
+    const elem_t B[I][J],
+    elem_t C[I][J],
+    bool relu,
+    const struct ConvParams * params) {
+
+    const int minimum = relu ? 0 : elem_t_min;
+
+    for (size_t batch = 0; batch < params->batch_size; batch++) {
+        for (size_t row = 0; row < params->out_dim_pooled; row++) {
+            for (size_t col = 0; col < params->out_dim_pooled; col++) {
+                for (size_t channel = 0; channel < params->out_channels; channel++) {
+                    size_t r = batch * params->out_dim_pooled * params->out_dim_pooled + row * params->out_dim_pooled + col;
+
+                    acc_t result = MVIN_SCALE(A[r][channel], params->res_scale) + B[r][channel];
+
+                    if (result > elem_t_max) {
+                        result = elem_t_max;
+                    } else if (result < minimum) {
+                        result = minimum;
+                    }
+
+                    C[r][channel] = result;
+                }
+            }
+        }
+    }
+}
+
+// Pooling
+void pool(size_t batch_size, size_t channels, size_t in_row_dim, size_t in_col_dim,
+    size_t out_row_dim, size_t out_col_dim,
+    elem_t input[batch_size][in_row_dim][in_col_dim][channels],
+    elem_t output[batch_size][out_row_dim][out_col_dim][channels],
+    const struct ConvParams * params)
+{
+    size_t kernel_size = params->pool_size;
+    size_t stride = params->pool_stride;
+    // size_t in_dim = params->out_dim;
+    size_t padding = params->pool_padding;
+
+    for (int batch = 0; batch < batch_size; batch++) {
+        for (int channel = 0; channel < channels; channel++) {
+            for (int out_row = 0; out_row < out_row_dim; out_row++) {
+                for (int out_col = 0; out_col < out_col_dim; out_col++) {
+                    int in_row = out_row * stride - padding;
+
+                    elem_t result = elem_t_min;
+
+                    for (int kernel_row = 0; kernel_row < kernel_size; kernel_row++) {
+                        int in_col = out_col * stride - padding;
+
+                        for (int kernel_col = 0; kernel_col < kernel_size; kernel_col++) {
+                            if (in_row >= 0 && in_row < in_row_dim && in_col >= 0 && in_col < in_col_dim) {
+                                if (input[batch][in_row][in_col][channel] > result) {
+                                    result = input[batch][in_row][in_col][channel];
+                                }
+                            } else if (0 > result) {
+                                result = 0;
+                            }
+
+                            in_col++;
+                        }
+
+                        in_row++;
+                    }
+                    
+                    output[batch][out_row][out_col][channel] = result;
+                }
+            }
+        }
+    }
+}
+
+void pool_with_col2im(size_t I, size_t J,
+    size_t batch_size, size_t channels, size_t out_row_dim, size_t out_col_dim,
+    elem_t input[I][J],
+    elem_t output[batch_size][out_row_dim][out_col_dim][channels],
+    const struct ConvParams * params)
+{
+    size_t kernel_size = params->pool_size;
+    size_t stride = params->pool_stride;
+    size_t in_row_dim = params->out_row_dim;
+    size_t in_col_dim = params->out_col_dim;
+    size_t padding = params->pool_padding;
+
+    for (int batch = 0; batch < batch_size; batch++) {
+        for (int channel = 0; channel < channels; channel++) {
+            for (int out_row = 0; out_row < out_row_dim; out_row++) {
+                for (int out_col = 0; out_col < out_col_dim; out_col++) {
+                    int in_row = out_row * stride - padding;
+
+                    elem_t result = elem_t_min;
+
+                    for (int kernel_row = 0; kernel_row < kernel_size; kernel_row++) {
+                        int in_col = out_col * stride - padding;
+
+                        for (int kernel_col = 0; kernel_col < kernel_size; kernel_col++) {
+                            if (in_row >= 0 && in_row < in_row_dim && in_col >= 0 && in_col < in_col_dim) {
+                                if (input[batch * in_row_dim * in_col_dim + in_row * in_col_dim + in_col][channel] > result) {
+                                    result = input[batch * in_row_dim * in_col_dim + in_row * in_col_dim + in_col][channel];
+                                }
+                            } else if (0 > result) {
+                                result = 0;
+                            }
+
+                            in_col++;
+                        }
+
+                        in_row++;
+                    }
+
+                    output[batch][out_row][out_col][channel] = result;
+                }
+            }
+        }
+    }
+}
+
+#endif // GEMMINI_NN_H
+
diff --git a/gemmini/include/gemmini_params.dim16fp16.h b/gemmini/include/gemmini_params.dim16fp16.h
new file mode 100644
index 00000000..c19739aa
--- /dev/null
+++ b/gemmini/include/gemmini_params.dim16fp16.h
@@ -0,0 +1,90 @@
+#ifndef GEMMINI_PARAMS_H
+#define GEMMINI_PARAMS_H
+
+#include <stdint.h>
+#include <limits.h>
+
+#define XCUSTOM_ACC 3
+#define DIM 16
+#define ADDR_LEN 32
+#define BANK_NUM 4
+#define BANK_ROWS 1024
+#define ACC_ROWS 1024
+#define MAX_BYTES 64
+#define MAX_BLOCK_LEN (MAX_BYTES/(DIM*2))
+#define MAX_BLOCK_LEN_ACC (MAX_BYTES/(DIM*2))
+
+typedef uint16_t elem_t;
+#define ELEM_T_IS_LOWPREC_FLOAT
+static const float elem_t_max = 65504.0;
+static const float elem_t_min = -65504.0;
+typedef uint16_t acc_t;
+typedef double full_t;
+
+#define ELEM_T_IS_FLOAT
+#define ELEM_T_EXP_BITS 5
+#define ELEM_T_SIG_BITS 11
+#define ACC_T_EXP_BITS 5
+#define ACC_T_SIG_BITS 11
+typedef uint16_t elem_t_bits;
+typedef uint16_t acc_t_bits;
+
+#define HAS_MVIN_SCALE
+typedef uint16_t scale_t;
+typedef uint16_t scale_t_bits;
+
+typedef int32_t scale_acc_t;
+typedef uint32_t scale_acc_t_bits;
+
+typedef uint16_t acc_scale_t;
+typedef uint16_t acc_scale_t_bits;
+
+#define row_align(blocks) __attribute__((aligned(blocks*DIM*sizeof(elem_t))))
+#define row_align_acc(blocks) __attribute__((aligned(blocks*DIM*sizeof(acc_t))))
+
+#define MVIN_SCALE_IDENTITY 0x3c00
+
+#define ACC_SCALE_IDENTITY 1.0
+
+#define ROUNDING_RIGHT_SHIFT(x, shift) \
+    ((x) / (1 << (shift)))
+
+#ifdef __cplusplus
+#define SAME_TYPE(x) decltype(x)
+#else
+#define SAME_TYPE(x) typeof(x)
+#endif
+
+#define ROUND_NEAR_EVEN(x) \
+    ({ const SAME_TYPE(x) x_ = (x); \
+         const long long i = x_; \
+         const long long next = x_ < 0 ? x_ - 1 : x_ + 1; \
+         SAME_TYPE(x) rem = x_ - i; \
+         rem = rem < 0 ? -rem : rem; \
+         SAME_TYPE(x) result = rem < 0.5 ? i : (rem > 0.5 ? next : ( \
+                     i % 2 == 0 ? i : next)); \
+         result; })
+
+// Rounding right shift equation: https://riscv.github.io/documents/riscv-v-spec/#_vector_fixed_point_rounding_mode_register_vxrm
+#define ROUNDING_RIGHT_SHIFT_BITS(x, shift) \
+((shift) > 0 ? (((x) >> (shift)) + \
+    (((shift) == 0 ? 0 : (((x) >> ((shift)-1)) & 1)) & \
+         ((((shift) <= 1 ? 0 : ((x) & ((1 << ((shift)-1)) - 1))) != 0) | (((x) >> (shift)) & 1)))) : ((x) << (-(shift))))
+
+#define ACC_SCALE(x, scale) \
+    ((x))
+
+#define MVIN_SCALE(x, scale) \
+    ((x) * (scale))
+
+#define MVIN_SCALE_ACC(x, scale) (x)
+
+#define ACC_SCALE_T_IS_FLOAT
+#define ACC_SCALE_EXP_BITS 5
+#define ACC_SCALE_SIG_BITS 11
+
+#define ACC_READ_SMALL_WIDTH
+
+#define HAS_FIRST_LAYER_OPTIMIZATIONS
+
+#endif // GEMMINI_PARAMS_H
diff --git a/gemmini/include/gemmini_params.dim8fp32.h b/gemmini/include/gemmini_params.dim8fp32.h
new file mode 100644
index 00000000..c2770aaa
--- /dev/null
+++ b/gemmini/include/gemmini_params.dim8fp32.h
@@ -0,0 +1,92 @@
+#ifndef GEMMINI_PARAMS_H
+#define GEMMINI_PARAMS_H
+
+#include <stdint.h>
+#include <limits.h>
+
+#define XCUSTOM_ACC 3
+#define DIM 8
+#define ADDR_LEN 32
+#define BANK_NUM 8
+#define BANK_ROWS 1024
+#define ACC_ROWS 512
+#define MAX_BYTES 64
+#define MAX_BLOCK_LEN (MAX_BYTES/(DIM*4))
+#define MAX_BLOCK_LEN_ACC (MAX_BYTES/(DIM*4))
+
+typedef float elem_t;
+static const elem_t elem_t_max = 3.4028235E38;
+static const elem_t elem_t_min = -3.4028235E38;
+typedef float acc_t;
+typedef double full_t;
+
+#define ELEM_T_IS_FLOAT
+#define ELEM_T_EXP_BITS 8
+#define ELEM_T_SIG_BITS 24
+#define ACC_T_EXP_BITS 8
+#define ACC_T_SIG_BITS 24
+typedef uint32_t elem_t_bits;
+typedef uint32_t acc_t_bits;
+
+#define HAS_MVIN_SCALE
+typedef float scale_t;
+typedef uint32_t scale_t_bits;
+
+#define HAS_MVIN_ACC_SCALE
+typedef float scale_acc_t;
+typedef uint32_t scale_acc_t_bits;
+
+typedef float acc_scale_t;
+typedef uint32_t acc_scale_t_bits;
+
+#define row_align(blocks) __attribute__((aligned(blocks*DIM*sizeof(elem_t))))
+#define row_align_acc(blocks) __attribute__((aligned(blocks*DIM*sizeof(acc_t))))
+
+#define MVIN_SCALE_IDENTITY 1.0
+
+#define ACC_SCALE_IDENTITY 1.0
+
+#define ROUNDING_RIGHT_SHIFT(x, shift) \
+    ((x) / (1 << (shift)))
+
+#ifdef __cplusplus
+#define SAME_TYPE(x) decltype(x)
+#else
+#define SAME_TYPE(x) typeof(x)
+#endif
+
+#define ROUND_NEAR_EVEN(x) \
+    ({ const SAME_TYPE(x) x_ = (x); \
+         const long long i = x_; \
+         const long long next = x_ < 0 ? x_ - 1 : x_ + 1; \
+         SAME_TYPE(x) rem = x_ - i; \
+         rem = rem < 0 ? -rem : rem; \
+         SAME_TYPE(x) result = rem < 0.5 ? i : (rem > 0.5 ? next : ( \
+                     i % 2 == 0 ? i : next)); \
+         result; })
+
+// Rounding right shift equation: https://riscv.github.io/documents/riscv-v-spec/#_vector_fixed_point_rounding_mode_register_vxrm
+#define ROUNDING_RIGHT_SHIFT_BITS(x, shift) \
+((shift) > 0 ? (((x) >> (shift)) + \
+    (((shift) == 0 ? 0 : (((x) >> ((shift)-1)) & 1)) & \
+         ((((shift) <= 1 ? 0 : ((x) & ((1 << ((shift)-1)) - 1))) != 0) | (((x) >> (shift)) & 1)))) : ((x) << (-(shift))))
+
+#define ACC_SCALE(x, scale) \
+    ((x) * (scale))
+
+#define MVIN_SCALE(x, scale) \
+    ((x) * (scale))
+
+#define MVIN_SCALE_ACC(x, scale) \
+    ((x) * (scale))
+
+#define ACC_SCALE_T_IS_FLOAT
+#define ACC_SCALE_EXP_BITS 8
+#define ACC_SCALE_SIG_BITS 24
+
+#define ACC_READ_SMALL_WIDTH
+#define ACC_READ_FULL_WIDTH
+
+#define HAS_FIRST_LAYER_OPTIMIZATIONS
+
+#endif // GEMMINI_PARAMS_H
diff --git a/gemmini/include/gemmini_params.h b/gemmini/include/gemmini_params.h
new file mode 120000
index 00000000..cf16fe06
--- /dev/null
+++ b/gemmini/include/gemmini_params.h
@@ -0,0 +1 @@
+gemmini_params.dim16fp16.h
\ No newline at end of file
diff --git a/gemmini/include/gemmini_testutils.h b/gemmini/include/gemmini_testutils.h
new file mode 100644
index 00000000..71af8747
--- /dev/null
+++ b/gemmini/include/gemmini_testutils.h
@@ -0,0 +1,285 @@
+// See LICENSE for license details.
+
+#ifndef SRC_MAIN_C_GEMMINI_TESTUTILS_H
+#define SRC_MAIN_C_GEMMINI_TESTUTILS_H
+
+#undef abs
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <limits.h>
+#include <stdbool.h>
+
+#include "include/gemmini_params.h"
+#include "include/gemmini.h"
+
+#ifdef BAREMETAL
+#undef assert
+#define assert(expr) \
+    if (!(expr)) { \
+      printf("Failed assertion: " #expr "\n  " __FILE__ ":%u\n", __LINE__); \
+      exit(1); \
+    }
+#endif
+
+// #define GEMMINI_ASSERTIONS
+
+// Matmul utility functions
+static void matmul(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
+  for (size_t r = 0; r < DIM; r++)
+    for (size_t c = 0; c < DIM; c++) {
+      C_full[r][c] = D[r][c];
+      for (size_t k = 0; k < DIM; k++)
+        C_full[r][c] += A[r][k]*B[k][c];
+    }
+}
+
+static void matmul_short(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], elem_t C[DIM][DIM]) {
+  for (size_t r = 0; r < DIM; r++)
+    for (size_t c = 0; c < DIM; c++) {
+      C[r][c] = D[r][c];
+      for (size_t k = 0; k < DIM; k++)
+        C[r][c] += A[r][k]*B[k][c];
+    }
+}
+
+static void matmul_full(elem_t A[DIM][DIM], elem_t B[DIM][DIM], full_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
+  // Identical to the other matmul function, but with a 64-bit bias
+  for (size_t r = 0; r < DIM; r++)
+    for (size_t c = 0; c < DIM; c++) {
+      C_full[r][c] = D[r][c];
+      for (size_t k = 0; k < DIM; k++)
+        C_full[r][c] += A[r][k]*B[k][c];
+    }
+}
+
+static void matmul_A_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
+  for (size_t r = 0; r < DIM; r++)
+    for (size_t c = 0; c < DIM; c++) {
+      C_full[r][c] = D[r][c];
+      for (size_t k = 0; k < DIM; k++)
+        C_full[r][c] += A[k][r]*B[k][c];
+    }
+}
+
+static void matmul_short_A_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], elem_t C[DIM][DIM]) {
+  for (size_t r = 0; r < DIM; r++)
+    for (size_t c = 0; c < DIM; c++) {
+      C[r][c] = D[r][c];
+      for (size_t k = 0; k < DIM; k++)
+        C[r][c] += A[k][r]*B[k][c];
+    }
+}
+
+static void matmul_full_A_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], full_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
+  for (size_t r = 0; r < DIM; r++)
+    for (size_t c = 0; c < DIM; c++) {
+      C_full[r][c] = D[r][c];
+      for (size_t k = 0; k < DIM; k++)
+        C_full[r][c] += A[k][r]*B[k][c];
+    }
+}
+
+static void matmul_B_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
+  for (size_t r = 0; r < DIM; r++)
+    for (size_t c = 0; c < DIM; c++) {
+      C_full[r][c] = D[r][c];
+      for (size_t k = 0; k < DIM; k++)
+        C_full[r][c] += A[r][k]*B[c][k];
+    }
+}
+
+static void matmul_short_B_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], elem_t C[DIM][DIM]) {
+  for (size_t r = 0; r < DIM; r++)
+    for (size_t c = 0; c < DIM; c++) {
+      C[r][c] = D[r][c];
+      for (size_t k = 0; k < DIM; k++)
+        C[r][c] += A[r][k]*B[c][k];
+    }
+}
+
+static void matmul_full_B_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], full_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
+  for (size_t r = 0; r < DIM; r++)
+    for (size_t c = 0; c < DIM; c++) {
+      C_full[r][c] = D[r][c];
+      for (size_t k = 0; k < DIM; k++)
+        C_full[r][c] += A[r][k]*B[c][k];
+    }
+}
+
+static void matmul_AB_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
+  for (size_t r = 0; r < DIM; r++)
+    for (size_t c = 0; c < DIM; c++) {
+      C_full[r][c] = D[r][c];
+      for (size_t k = 0; k < DIM; k++)
+        C_full[r][c] += A[k][r]*B[c][k];
+    }
+}
+
+static void matmul_short_AB_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], elem_t C[DIM][DIM]) {
+  for (size_t r = 0; r < DIM; r++)
+    for (size_t c = 0; c < DIM; c++) {
+      C[r][c] = D[r][c];
+      for (size_t k = 0; k < DIM; k++)
+        C[r][c] += A[k][r]*B[c][k];
+    }
+}
+
+static void matmul_full_AB_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], full_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
+  for (size_t r = 0; r < DIM; r++)
+    for (size_t c = 0; c < DIM; c++) {
+      C_full[r][c] = D[r][c];
+      for (size_t k = 0; k < DIM; k++)
+        C_full[r][c] += A[k][r]*B[c][k];
+    }
+}
+
+static void matadd(full_t sum[DIM][DIM], full_t m1[DIM][DIM], full_t m2[DIM][DIM]) {
+  for (size_t r = 0; r < DIM; r++)
+    for (size_t c = 0; c < DIM; c++)
+      sum[r][c] = m1[r][c] + m2[r][c];
+}
+
+// THIS IS A ROUNDING SHIFT! It also performs a saturating cast
+static void matshift(full_t full[DIM][DIM], elem_t out[DIM][DIM], int shift) {
+  for (size_t r = 0; r < DIM; r++)
+    for (size_t c = 0; c < DIM; c++) {
+      // Bitshift and round element
+      full_t shifted = ROUNDING_RIGHT_SHIFT(full[r][c], shift);
+
+      // Saturate and cast element
+#ifndef ELEM_T_IS_FLOAT
+      full_t elem = shifted > elem_t_max ? elem_t_max : (shifted < elem_t_min ? elem_t_min : shifted);
+      out[r][c] = elem;
+#else
+      out[r][c] = shifted; // TODO should we also saturate when using floats?
+#endif
+    }
+}
+
+static void matscale(full_t full[DIM][DIM], elem_t out[DIM][DIM], acc_scale_t scale) {
+  for (size_t r = 0; r < DIM; r++)
+    for (size_t c = 0; c < DIM; c++) {
+      // Bitshift and round element
+      full_t scaled = ACC_SCALE(full[r][c], scale);
+
+      // Saturate and cast element
+#ifndef ELEM_T_IS_FLOAT
+      full_t elem = scaled > elem_t_max ? elem_t_max : (scaled < elem_t_min ? elem_t_min : scaled);
+      out[r][c] = elem;
+#else
+      out[r][c] = scaled; // TODO should we also saturate when using floats?
+#endif
+    }
+}
+
+static void matrelu(elem_t in[DIM][DIM], elem_t out[DIM][DIM]) {
+  for (size_t r = 0; r < DIM; r++)
+    for (size_t c = 0; c < DIM; c++)
+      out[r][c] = in[r][c] > 0 ? in[r][c] : 0;
+}
+
+static void transpose(elem_t in[DIM][DIM], elem_t out[DIM][DIM]) {
+  for (size_t r = 0; r < DIM; r++)
+    for (size_t c = 0; c < DIM; c++)
+      out[c][r] = in[r][c];
+}
+
+int rand() {
+  static uint32_t x = 777;
+  x = x * 1664525 + 1013904223;
+  return x >> 24;
+}
+
+
+#ifdef ELEM_T_IS_FLOAT
+double rand_double() {
+    double a = (double)(rand() % 128) / (double)(1 + (rand() % 64));
+    double b = (double)(rand() % 128) / (double)(1 + (rand() % 64));
+    return a - b;
+}
+#endif
+
+static void printMatrix(elem_t m[DIM][DIM]) {
+  for (size_t i = 0; i < DIM; ++i) {
+    for (size_t j = 0; j < DIM; ++j)
+#ifndef ELEM_T_IS_FLOAT
+      printf("%d ", m[i][j]);
+#else
+      printf("%x ", elem_t_to_elem_t_bits(m[i][j]));
+#endif
+    printf("\n");
+  }
+}
+
+static void printMatrixAcc(acc_t m[DIM][DIM]) {
+  for (size_t i = 0; i < DIM; ++i) {
+    for (size_t j = 0; j < DIM; ++j)
+#ifndef ELEM_T_IS_FLOAT
+      printf("%d ", m[i][j]);
+#else
+      printf("%x ", acc_t_to_acc_t_bits(m[i][j]));
+#endif
+    printf("\n");
+  }
+}
+
+static int is_equal(elem_t x[DIM][DIM], elem_t y[DIM][DIM]) {
+  for (size_t i = 0; i < DIM; ++i)
+    for (size_t j = 0; j < DIM; ++j) {
+#ifndef ELEM_T_IS_FLOAT
+      if (x[i][j] != y[i][j])
+#else
+      bool isnanx = elem_t_isnan(x[i][j]);
+      bool isnany = elem_t_isnan(y[i][j]);
+
+      if (x[i][j] != y[i][j] && !(isnanx && isnany))
+#endif
+          return 0;
+    }
+  return 1;
+}
+
+static int is_equal_transposed(elem_t x[DIM][DIM], elem_t y[DIM][DIM]) {
+  for (size_t i = 0; i < DIM; ++i)
+    for (size_t j = 0; j < DIM; ++j) {
+#ifndef ELEM_T_IS_FLOAT
+      if (x[i][j] != y[j][i])
+#else
+      bool isnanx = elem_t_isnan(x[i][j]);
+      bool isnany = elem_t_isnan(y[j][i]);
+
+      if (x[i][j] != y[j][i] && !(isnanx && isnany))
+#endif
+          return 0;
+    }
+  return 1;
+}
+
+// This is a GNU extension known as statment expressions
+#define MAT_IS_EQUAL(dim_i, dim_j, x, y) \
+    ({int result = 1; \
+      for (size_t i = 0; i < dim_i; i++) \
+        for (size_t j = 0; j < dim_j; ++j) { \
+          if (x[i][j] != y[i][j]) { \
+            result = 0; \
+            break; \
+          } \
+        } \
+      result;})
+
+static uint64_t read_cycles() {
+    uint64_t cycles;
+    asm volatile ("rdcycle %0" : "=r" (cycles));
+    return cycles;
+
+    // const uint32_t * mtime = (uint32_t *)(33554432 + 0xbff8);
+    // const uint32_t * mtime = (uint32_t *)(33554432 + 0xbffc);
+    // return *mtime;
+}
+
+#undef abs
+
+#endif  // SRC_MAIN_C_GEMMINI_TESTUTILS_H
diff --git a/gemmini/include/translator.h b/gemmini/include/translator.h
new file mode 100644
index 00000000..27357db8
--- /dev/null
+++ b/gemmini/include/translator.h
@@ -0,0 +1,13 @@
+// See LICENSE for license details.
+
+#ifndef SRC_MAIN_C_TRANSLATOR_H
+#define SRC_MAIN_C_TRANSLATOR_H
+
+#include "rocc-software/src/xcustom.h"
+
+#define XCUSTOM_TRANS 1
+
+#define doTranslate(y, vaddr)                                \
+    ROCC_INSTRUCTION(XCUSTOM_TRANS, y, vaddr, 0, 0);
+
+#endif  // SRC_MAIN_C_TRANSLATOR_H
diff --git a/gemmini/rocc-software/.gitignore b/gemmini/rocc-software/.gitignore
new file mode 100644
index 00000000..59914d9e
--- /dev/null
+++ b/gemmini/rocc-software/.gitignore
@@ -0,0 +1,3 @@
+*~
+*#
+*.#*
\ No newline at end of file
diff --git a/gemmini/rocc-software/CONTRIBUTING.md b/gemmini/rocc-software/CONTRIBUTING.md
new file mode 100644
index 00000000..6e97de44
--- /dev/null
+++ b/gemmini/rocc-software/CONTRIBUTING.md
@@ -0,0 +1,46 @@
+All contributors must agree to the Developer Certificate of Origin Version 1.1. (DCO 1.1) by signing their commits with:
+
+```
+DCO 1.1 Signed-off-by: [NAME] <[EMAIL]>
+```
+
+The full text of the DCO 1.1 is as follows:
+
+```
+Developer Certificate of Origin
+Version 1.1
+
+Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
+660 York Street, Suite 102,
+San Francisco, CA 94110 USA
+
+Everyone is permitted to copy and distribute verbatim copies of this
+license document, but changing it is not allowed.
+
+
+Developer's Certificate of Origin 1.1
+
+By making a contribution to this project, I certify that:
+
+(a) The contribution was created in whole or in part by me and I
+have the right to submit it under the open source license
+indicated in the file; or
+
+(b) The contribution is based upon previous work that, to the best
+of my knowledge, is covered under an appropriate open source
+license and I have the right under that license to submit that
+work with modifications, whether created in whole or in part
+by me, under the same open source license (unless I am
+permitted to submit under a different license), as indicated
+in the file; or
+
+(c) The contribution was provided directly to me by some other
+person who certified (a), (b) or (c) and I have not modified
+it.
+
+(d) I understand and agree that this project and the contribution
+are public and that a record of the contribution (including all
+personal information I submit with it, including my sign-off) is
+maintained indefinitely and may be redistributed consistent with
+this project or the open source license(s) involved.
+```
diff --git a/gemmini/rocc-software/LICENSE b/gemmini/rocc-software/LICENSE
new file mode 100644
index 00000000..8dada3ed
--- /dev/null
+++ b/gemmini/rocc-software/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/gemmini/rocc-software/README.md b/gemmini/rocc-software/README.md
new file mode 100644
index 00000000..237b3902
--- /dev/null
+++ b/gemmini/rocc-software/README.md
@@ -0,0 +1,4 @@
+Rocket Custom Coprocessor (RoCC) Software
+========================================
+
+This is a set of C and RISC-V Assembly macros that help with emitting custom RISC-V instructions for talking with Rocket Custom Coprocessors (RoCCs).
diff --git a/gemmini/rocc-software/src/riscv_test_rocc.h b/gemmini/rocc-software/src/riscv_test_rocc.h
new file mode 100644
index 00000000..453b9663
--- /dev/null
+++ b/gemmini/rocc-software/src/riscv_test_rocc.h
@@ -0,0 +1,28 @@
+// Copyright 2018 IBM
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ROCC_SOFTWARE_SRC_RISCV_TEST_ROCC_H_
+#define ROCC_SOFTWARE_SRC_RISCV_TEST_ROCC_H_
+
+
+#define RVTEST_XS_ENABLE                        \
+  li a0, MSTATUS_XS & (MSTATUS_XS >> 1);        \
+  csrs mstatus, a0;
+
+#define RVTEST_WITH_ROCC                        \
+  .macro init;                                  \
+  RVTEST_XS_ENABLE                              \
+  .endm
+
+#endif  // ROCC_SOFTWARE_SRC_RISCV_TEST_ROCC_H_
diff --git a/gemmini/rocc-software/src/xcustom.h b/gemmini/rocc-software/src/xcustom.h
new file mode 100644
index 00000000..c1eb257a
--- /dev/null
+++ b/gemmini/rocc-software/src/xcustom.h
@@ -0,0 +1,170 @@
+// Copyright 2018--2020 IBM
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ROCC_SOFTWARE_SRC_XCUSTOM_H_
+#define ROCC_SOFTWARE_SRC_XCUSTOM_H_
+
+#define STR1(x) #x
+#ifndef STR
+#define STR(x) STR1(x)
+#endif
+
+#define CAT_(A, B) A##B
+#define CAT(A, B) CAT_(A, B)
+
+/** Assembly macro for creating "raw" Rocket Custom Coproessor (RoCC)
+  * assembly language instructions that will return data in rd. These
+  * are to be used only in assembly language programs (not C/C++).
+  *
+  * Example:
+  *
+  * Consider the following macro consisting of a CUSTOM_0 instruction
+  * with func7 "42" that is doing some operation of "a0 = op(a1, a2)":
+  *
+  *     ROCC_INSTRUCTION_RAW_R_R_R(0, a0, a1, a2, 42)
+  *
+  * This will produce the following pseudo assembly language
+  * instruction:
+  *
+  *     .insn r CUSTOM_0, 7, 42, a0, a1, a2
+  *
+  * @param x the custom instruction number: 0, 1, 2, or 3
+  * @param rd the destination register, e.g., a0 or x10
+  * @param rs1 the first source register, e.g., a0 or x10
+  * @param rs2 the second source register, e.g., a0 or x10
+  * @param func7 the value of the func7 field
+  * @return a raw .insn RoCC instruction
+  */
+#define ROCC_INSTRUCTION_RAW_R_R_R(x, rd, rs1, rs2, func7) \
+  .insn r CAT(CUSTOM_, x), 7, func7, rd, rs1, rs2
+
+/** Assembly macro for creating "raw" Rocket Custom Coproessor (RoCC)
+  * assembly language instructions that will *NOT* return data in rd.
+  * These are to be used only in assembly language programs (not
+  * C/C++).
+  *
+  * Example:
+  *
+  * Consider the following macro consisting of a CUSTOM_1 instruction
+  * with func7 "42" that is doing some operation of "op(a1, a2)". *NO*
+  * data is returned:
+  *
+  *     ROCC_INSTRUCTION_RAW_R_R_R(1, a1, a2, 42)
+  *
+  * This will produce the following pseudo assembly language
+  * instruction:
+  *
+  *     .insn r CUSTOM_1, 3, 42, x0, a1, a2
+  *
+  * @param x the custom instruction number: 0, 1, 2, or 3
+  * @param rs1 the first source register, e.g., a0 or x10
+  * @param rs2 the second source register, e.g., a0 or x10
+  * @param func7 the value of the func7 field
+  * @return a raw .insn RoCC instruction
+  */
+#define ROCC_INSTRUCTION_RAW_0_R_R(x, rs1, rs2, func7) \
+  .insn r CAT(CUSTOM_, x), 3, func7, x0, rs1, rs2
+
+/** C/C++ inline assembly macro for creating Rocket Custom Coprocessor
+  * (RoCC) instructions that return data in rd. These are to be used
+  * only in C/C++ programs (not bare assembly).
+  *
+  * This is equivalent to ROCC_INSTRUCTION_R_R_R. See it's
+  * documentation.
+  */
+#define ROCC_INSTRUCTION(x, rd, rs1, rs2, func7) \
+  ROCC_INSTRUCTION_R_R_R(x, rd, rs1, rs2, func7)
+
+/** C/C++ inline assembly macro for creating Rocket Custom Coprocessor
+  * (RoCC) instructions that return data in C variable rd. 
+  * These are to be used only in C/C++ programs (not bare assembly).
+  *
+  * Example:
+  *
+  * Consider the following macro consisting of a CUSTOM_2 instruction
+  * with func7 "42" that is doing some operation of "a0 = op(a1, a2)"
+  * (where a0, a1, and a2 are variables defined in C):
+  *
+  *     ROCC_INSTRUCTION(2, a0, a1, a2, 42)
+  *
+  * This will produce the following inline assembly:
+  *
+  *     asm volatile(
+  *         ".insn r CUSTOM_2, 0x7, 42, %0, %1, %2"
+  *         : "=r"(rd)
+  *         : "r"(rs1), "r"(rs2));
+  *
+  * @param x the custom instruction number: 0, 1, 2, or 3
+  * @param rd the C variable to capture as destination operand
+  * @param rs1 the C variable to capture for first source register
+  * @param rs2 the C variable to capture for second source register
+  * @param func7 the value of the func7 field
+  * @return an inline assembly RoCC instruction
+  */
+#define ROCC_INSTRUCTION_R_R_R(x, rd, rs1, rs2, func7)                               \
+  {                                                                                  \
+    asm volatile(                                                                    \
+        ".insn r " STR(CAT(CUSTOM_, x)) ", " STR(0x7) ", " STR(func7) ", %0, %1, %2" \
+        : "=r"(rd)                                                                   \
+        : "r"(rs1), "r"(rs2));                                                       \
+  }
+
+/** C/C++ inline assembly macro for creating Rocket Custom Coprocessor
+  * (RoCC) instructions that return data in C variable rd.
+  * These are to be used only in C/C++ programs (not bare assembly).
+  *
+  * Example:
+  *
+  * Consider the following macro consisting of a CUSTOM_3 instruction
+  * with func7 "42" that is doing some operation of "a0 = op(a1, a2)"
+  * (where a0, a1, and a2 are variables defined in C):
+  *
+  *     ROCC_INSTRUCTION(3, a0, a1, a2, 42)
+  *
+  * This will produce the following inline assembly:
+  *
+  *     asm volatile(
+  *         ".insn r CUSTOM_3, 0x7, 42, %0, %1, %2"
+  *         :: "r"(rs1), "r"(rs2));
+  *
+  * @param x the custom instruction number: 0, 1, 2, or 3
+  * @param rs1 the C variable to capture for first source register
+  * @param rs2 the C variable to capture for second source register
+  * @param funct7 the value of the funct7 f
+  * @return an inline assembly RoCC instruction
+  */
+#define ROCC_INSTRUCTION_0_R_R(x, rs1, rs2, func7)                                   \
+  {                                                                                  \
+    asm volatile(                                                                    \
+        ".insn r " STR(CAT(CUSTOM_, x)) ", " STR(0x3) ", " STR(func7) ", x0, %0, %1" \
+        :                                                                            \
+        : "r"(rs1), "r"(rs2));                                                       \
+  }
+
+// [TODO] fix these to align with the above approach
+// Macro to pass rs2_ as an immediate
+/*
+#define ROCC_INSTRUCTION_R_R_I(XCUSTOM_, rd_, rs1_, rs2_, funct_) \
+  asm volatile (XCUSTOM_" %[rd], %[rs1], %[rs2], %[funct]"        \
+                : [rd] "=r" (rd_)                                 \
+                : [rs1] "r" (rs1_), [rs2] "i" (rs2_), [funct] "i" (funct_))
+
+// Macro to pass rs1_ and rs2_ as immediates
+#define ROCC_INSTRUCTION_R_I_I(XCUSTOM_, rd_, rs1_, rs2_, funct_) \
+  asm volatile (XCUSTOM_" %[rd], %[rs1], %[rs2], %[funct]"        \
+                : [rd] "=r" (rd_)                                 \
+                : [rs1] "i" (rs1_), [rs2] "i" (rs2_), [funct] "i" (funct_))
+*/
+
+#endif  // ROCC_SOFTWARE_SRC_XCUSTOM_H_
diff --git a/tests/regression/common.mk b/tests/regression/common.mk
index f000dcf6..96c5965a 100644
--- a/tests/regression/common.mk
+++ b/tests/regression/common.mk
@@ -22,7 +22,7 @@ RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
 
 VORTEX_RT_PATH ?= $(realpath ../../../runtime)
 VORTEX_KN_PATH ?= $(realpath ../../../kernel)
-GEMMINI_SW_PATH ?= $(realpath ../../../third_party/gemmini-rocc-tests)
+GEMMINI_SW_PATH ?= $(realpath ../../../gemmini)
 
 FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae
 
diff --git a/third_party/gemmini-rocc-tests b/third_party/gemmini-rocc-tests
deleted file mode 160000
index 6148fc0d..00000000
--- a/third_party/gemmini-rocc-tests
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 6148fc0d2c7a91ec87e72bdd3c3808c6f985a77e