Vortex 2.0 changes:

+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit d47cccc157
1300 changed files with 247321 additions and 311189 deletions
--- a/sim/Makefile
+++ b/sim/Makefile
@@ -1,9 +1,9 @@
 all:
 	$(MAKE) -C simx
 	$(MAKE) -C rtlsim
-	$(MAKE) -C vlsim
+	$(MAKE) -C opaesim
 	
 clean:
 	$(MAKE) -C simx clean
 	$(MAKE) -C rtlsim clean
-	$(MAKE) -C vlsim clean
+	$(MAKE) -C opaesim clean
--- a/sim/common/bitmanip.h
+++ b/sim/common/bitmanip.h
@@ -1,7 +1,19 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include <cstdint>
-#include <algorithm>
 #include <assert.h>

 constexpr uint32_t count_leading_zeros(uint32_t value) {
@@ -77,5 +89,15 @@ T sext(const T& word, uint32_t width) {
  if (width == (sizeof(T) * 8)) 
    return word;
  T mask((static_cast<T>(1) << width) - 1);
-  return ((word >> (width - 1)) & 0x1) ? (word | ~mask) : word;
-}
+  return ((word >> (width - 1)) & 0x1) ? (word | ~mask) : (word & mask);
+}
+
+template <typename T = uint32_t>
+T zext(const T& word, uint32_t width) {
+  assert(width > 1);
+  assert(width <= (sizeof(T) * 8));
+  if (width == (sizeof(T) * 8)) 
+    return word;
+  T mask((static_cast<T>(1) << width) - 1);
+  return word & mask;
+}
--- a/sim/common/mem.cpp
+++ b/sim/common/mem.cpp
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include "mem.h"
 #include <vector>
 #include <iostream>
@@ -20,8 +33,9 @@ RamMemDevice::RamMemDevice(const char *filename, uint32_t wordSize)
    contents_.push_back(input.get());
  } while (input);

-  while (contents_.size() & (wordSize-1))
+  while (contents_.size() & (wordSize-1)) {
    contents_.push_back(0x00);
+  }
 }

 RamMemDevice::RamMemDevice(uint64_t size, uint32_t wordSize)
@@ -29,7 +43,7 @@ RamMemDevice::RamMemDevice(uint64_t size, uint32_t wordSize)
  , wordSize_(wordSize)
 {}

-void RamMemDevice::read(void *data, uint64_t addr, uint64_t size) {
+void RamMemDevice::read(void* data, uint64_t addr, uint64_t size) {
  auto addr_end = addr + size;
  if ((addr & (wordSize_-1))
   || (addr_end & (wordSize_-1)) 
@@ -44,7 +58,7 @@ void RamMemDevice::read(void *data, uint64_t addr, uint64_t size) {
  }
 }

-void RamMemDevice::write(const void *data, uint64_t addr, uint64_t size) {
+void RamMemDevice::write(const void* data, uint64_t addr, uint64_t size) {
  auto addr_end = addr + size;
  if ((addr & (wordSize_-1))
   || (addr_end & (wordSize_-1)) 
@@ -68,26 +82,26 @@ void RomMemDevice::write(const void* /*data*/, uint64_t /*addr*/, uint64_t /*siz

 ///////////////////////////////////////////////////////////////////////////////

-bool MemoryUnit::ADecoder::lookup(uint64_t a, uint32_t wordSize, mem_accessor_t* ma) {
-  uint64_t e = a + (wordSize - 1);
-  assert(e >= a);
+bool MemoryUnit::ADecoder::lookup(uint64_t addr, uint32_t wordSize, mem_accessor_t* ma) {
+  uint64_t end = addr + (wordSize - 1);
+  assert(end >= addr);
  for (auto iter = entries_.rbegin(), iterE = entries_.rend(); iter != iterE; ++iter) {
-    if (a >= iter->start && e <= iter->end) {
+    if (addr >= iter->start && end <= iter->end) {
      ma->md   = iter->md;
-      ma->addr = a - iter->start;
+      ma->addr = addr - iter->start;
      return true;
    }
  }
  return false;
 }

-void MemoryUnit::ADecoder::map(uint64_t a, uint64_t e, MemDevice &m) {
-  assert(e >= a);
-  entry_t entry{&m, a, e};
+void MemoryUnit::ADecoder::map(uint64_t start, uint64_t end, MemDevice &md) {
+  assert(end >= start);
+  entry_t entry{&md, start, end};
  entries_.emplace_back(entry);
 }

-void MemoryUnit::ADecoder::read(void *data, uint64_t addr, uint64_t size) {
+void MemoryUnit::ADecoder::read(void* data, uint64_t addr, uint64_t size) {
  mem_accessor_t ma;
  if (!this->lookup(addr, size, &ma)) {
    std::cout << "lookup of 0x" << std::hex << addr << " failed.\n";
@@ -96,7 +110,7 @@ void MemoryUnit::ADecoder::read(void *data, uint64_t addr, uint64_t size) {
  ma.md->read(data, ma.addr, size);
 }

-void MemoryUnit::ADecoder::write(const void *data, uint64_t addr, uint64_t size) {
+void MemoryUnit::ADecoder::write(const void* data, uint64_t addr, uint64_t size) {
  mem_accessor_t ma;
  if (!this->lookup(addr, size, &ma)) {
    std::cout << "lookup of 0x" << std::hex << addr << " failed.\n";
@@ -107,11 +121,11 @@ void MemoryUnit::ADecoder::write(const void *data, uint64_t addr, uint64_t size)

 ///////////////////////////////////////////////////////////////////////////////

-MemoryUnit::MemoryUnit(uint64_t pageSize, uint64_t addrBytes, bool disableVm)
+MemoryUnit::MemoryUnit(uint64_t pageSize)
  : pageSize_(pageSize)
-  , addrBytes_(addrBytes)
-  , disableVM_(disableVm) {
-  if (!disableVm) {
+  , enableVM_(pageSize != 0)
+  , amo_reservation_({0x0, false}) {
+  if (pageSize != 0) {
    tlb_[0] = TLBEntry(0, 077);
  }
 }
@@ -133,30 +147,38 @@ MemoryUnit::TLBEntry MemoryUnit::tlbLookup(uint64_t vAddr, uint32_t flagMask) {
  }
 }

-void MemoryUnit::read(void *data, uint64_t addr, uint64_t size, bool sup) {
+uint64_t MemoryUnit::toPhyAddr(uint64_t addr, uint32_t flagMask) {
  uint64_t pAddr;
-  if (disableVM_) {
-    pAddr = addr;
-  } else {
-    uint32_t flagMask = sup ? 8 : 1;
+  if (enableVM_) {
    TLBEntry t = this->tlbLookup(addr, flagMask);
    pAddr = t.pfn * pageSize_ + addr % pageSize_;
+  } else {
+    pAddr = addr;    
  }
+  return pAddr;
+}
+
+void MemoryUnit::read(void* data, uint64_t addr, uint64_t size, bool sup) {
+  uint64_t pAddr = this->toPhyAddr(addr, sup ? 8 : 1);
  return decoder_.read(data, pAddr, size);
 }

-void MemoryUnit::write(const void *data, uint64_t addr, uint64_t size, bool sup) {
-  uint64_t pAddr;
-  if (disableVM_) {
-    pAddr = addr;
-  } else {
-    uint32_t flagMask = sup ? 16 : 2;
-    TLBEntry t = tlbLookup(addr, flagMask);
-    pAddr = t.pfn * pageSize_ + addr % pageSize_;
-  }
+void MemoryUnit::write(const void* data, uint64_t addr, uint64_t size, bool sup) {
+  uint64_t pAddr = this->toPhyAddr(addr, sup ? 16 : 2); // flag masks for a write: 16 = supervisor, 2 = user, as in the code this hunk replaces (1 is the user flag read() passes)
  decoder_.write(data, pAddr, size);
+  amo_reservation_.valid = false; // every write through this unit clears the reservation recorded by amo_reserve()
 }

+void MemoryUnit::amo_reserve(uint64_t addr) {
+  uint64_t pAddr = this->toPhyAddr(addr, 1);
+  amo_reservation_.addr = pAddr;
+  amo_reservation_.valid = true;
+}
+
+bool MemoryUnit::amo_check(uint64_t addr) {
+  uint64_t pAddr = this->toPhyAddr(addr, 1);
+  return amo_reservation_.valid && (amo_reservation_.addr == pAddr);
+}
 void MemoryUnit::tlbAdd(uint64_t virt, uint64_t phys, uint32_t flags) {
  tlb_[virt / pageSize_] = TLBEntry(phys / pageSize_, flags);
 }
@@ -168,12 +190,14 @@ void MemoryUnit::tlbRm(uint64_t va) {

 ///////////////////////////////////////////////////////////////////////////////

-RAM::RAM(uint32_t page_size) 
-  : size_(0)
+RAM::RAM(uint32_t page_size, uint64_t capacity) 
+  : capacity_(capacity)
  , page_bits_(log2ceil(page_size))
  , last_page_(nullptr)
  , last_page_index_(0) {    
   assert(ispow2(page_size));
+   assert(0 == capacity || ispow2(capacity));
+   assert(0 == (capacity % page_size));
 }

 RAM::~RAM() {
@@ -191,6 +215,9 @@ uint64_t RAM::size() const {
 }

 uint8_t *RAM::get(uint64_t address) const {
+  if (capacity_ != 0 && address >= capacity_) {
+    throw OutOfRange();
+  }
  uint32_t page_size   = 1 << page_bits_;  
  uint32_t page_offset = address & (page_size - 1);
  uint64_t page_index  = address >> page_bits_;
@@ -218,14 +245,14 @@ uint8_t *RAM::get(uint64_t address) const {
  return page + page_offset;
 }

-void RAM::read(void *data, uint64_t addr, uint64_t size) {
+void RAM::read(void* data, uint64_t addr, uint64_t size) {
  uint8_t* d = (uint8_t*)data;
  for (uint64_t i = 0; i < size; i++) {
    d[i] = *this->get(addr + i);
  }
 }

-void RAM::write(const void *data, uint64_t addr, uint64_t size) {
+void RAM::write(const void* data, uint64_t addr, uint64_t size) {
  const uint8_t* d = (const uint8_t*)data;
  for (uint64_t i = 0; i < size; i++) {
    *this->get(addr + i) = d[i];
@@ -236,6 +263,7 @@ void RAM::loadBinImage(const char* filename, uint64_t destination) {
  std::ifstream ifs(filename);
  if (!ifs) {
    std::cout << "error: " << filename << " not found" << std::endl;
+    std::abort();
  }

  ifs.seekg(0, ifs.end);
@@ -268,6 +296,7 @@ void RAM::loadHexImage(const char* filename) {
  std::ifstream ifs(filename);
  if (!ifs) {
    std::cout << "error: " << filename << " not found" << std::endl;
+    std::abort();
  }

  ifs.seekg(0, ifs.end);
@@ -313,4 +342,4 @@ void RAM::loadHexImage(const char* filename) {
    ++line;
    --size;
  }
-}
+}
--- a/sim/common/mem.h
+++ b/sim/common/mem.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include <cstdint>
@@ -7,13 +20,14 @@

 namespace vortex {
 struct BadAddress {};
+struct OutOfRange {};

 class MemDevice {
 public:
  virtual ~MemDevice() {}
  virtual uint64_t size() const = 0;
-  virtual void read(void *data, uint64_t addr, uint64_t size) = 0;
-  virtual void write(const void *data, uint64_t addr, uint64_t size) = 0;
+  virtual void read(void* data, uint64_t addr, uint64_t size) = 0;
+  virtual void write(const void* data, uint64_t addr, uint64_t size) = 0;
 };

 ///////////////////////////////////////////////////////////////////////////////
@@ -21,11 +35,11 @@ public:
 class RamMemDevice : public MemDevice {
 public:
  RamMemDevice(uint64_t size, uint32_t wordSize);
-  RamMemDevice(const char *filename, uint32_t wordSize);
+  RamMemDevice(const char* filename, uint32_t wordSize);
  ~RamMemDevice() {}

-  void read(void *data, uint64_t addr, uint64_t size) override;  
-  void write(const void *data, uint64_t addr, uint64_t size) override;
+  void read(void* data, uint64_t addr, uint64_t size) override;  
+  void write(const void* data, uint64_t addr, uint64_t size) override;

  virtual uint64_t size() const {
    return contents_.size();
@@ -50,7 +64,7 @@ public:
  
  ~RomMemDevice();

-  void write(const void *data, uint64_t addr, uint64_t size) override;
+  void write(const void* data, uint64_t addr, uint64_t size) override;
 };

 ///////////////////////////////////////////////////////////////////////////////
@@ -63,47 +77,56 @@ public:
      : faultAddr(a)
      , notFound(nf) 
    {}
-    uint64_t faultAddr;
-    bool notFound;
+    uint64_t  faultAddr;
+    bool      notFound;
  };

-  MemoryUnit(uint64_t pageSize, uint64_t addrBytes, bool disableVm = false);
+  MemoryUnit(uint64_t pageSize = 0);

  void attach(MemDevice &m, uint64_t start, uint64_t end);

-  void read(void *data, uint64_t addr, uint64_t size, bool sup);  
-  void write(const void *data, uint64_t addr, uint64_t size, bool sup);
+  void read(void* data, uint64_t addr, uint64_t size, bool sup);
+  void write(const void* data, uint64_t addr, uint64_t size, bool sup);
+
+  void amo_reserve(uint64_t addr);
+  bool amo_check(uint64_t addr);

  void tlbAdd(uint64_t virt, uint64_t phys, uint32_t flags);
-  void tlbRm(uint64_t va);
+  void tlbRm(uint64_t vaddr);
  void tlbFlush() {
    tlb_.clear();
  }
+
 private:

+  struct amo_reservation_t {
+    uint64_t addr;
+    bool     valid;
+  };
+
  class ADecoder {
  public:
    ADecoder() {}
    
-    void read(void *data, uint64_t addr, uint64_t size);
-    void write(const void *data, uint64_t addr, uint64_t size);
+    void read(void* data, uint64_t addr, uint64_t size);
+    void write(const void* data, uint64_t addr, uint64_t size);
    
    void map(uint64_t start, uint64_t end, MemDevice &md);

  private:

    struct mem_accessor_t {
-      MemDevice* md;
-      uint64_t addr;
+      MemDevice*  md;
+      uint64_t    addr;
    };
    
    struct entry_t {
-      MemDevice *md;
-      uint64_t      start;
-      uint64_t      end;        
+      MemDevice*  md;
+      uint64_t    start;
+      uint64_t    end;        
    };

-    bool lookup(uint64_t a, uint32_t wordSize, mem_accessor_t*);
+    bool lookup(uint64_t addr, uint32_t wordSize, mem_accessor_t*);

    std::vector<entry_t> entries_;
  };
@@ -120,11 +143,14 @@ private:

  TLBEntry tlbLookup(uint64_t vAddr, uint32_t flagMask);

+  uint64_t toPhyAddr(uint64_t vAddr, uint32_t flagMask);
+
  std::unordered_map<uint64_t, TLBEntry> tlb_;
-  uint64_t pageSize_;
-  uint64_t addrBytes_;
-  ADecoder decoder_;  
-  bool disableVM_;
+  uint64_t  pageSize_;
+  ADecoder  decoder_;  
+  bool      enableVM_;
+
+  amo_reservation_t amo_reservation_;
 };

 ///////////////////////////////////////////////////////////////////////////////
@@ -132,15 +158,15 @@ private:
 class RAM : public MemDevice {
 public:
  
-  RAM(uint32_t page_size);
+   RAM(uint32_t page_size, uint64_t capacity = 0);
  ~RAM();

  void clear();

  uint64_t size() const override;

-  void read(void *data, uint64_t addr, uint64_t size) override;  
-  void write(const void *data, uint64_t addr, uint64_t size) override;
+  void read(void* data, uint64_t addr, uint64_t size) override;  
+  void write(const void* data, uint64_t addr, uint64_t size) override;

  void loadBinImage(const char* filename, uint64_t destination);
  void loadHexImage(const char* filename);
@@ -157,11 +183,11 @@ private:

  uint8_t *get(uint64_t address) const;

-  uint64_t size_;
+  uint64_t capacity_;
  uint32_t page_bits_;  
  mutable std::unordered_map<uint64_t, uint8_t*> pages_;
  mutable uint8_t* last_page_;
  mutable uint64_t last_page_index_;
 };

-} // namespace vortex
+} // namespace vortex
--- a/sim/common/mempool.h
+++ b/sim/common/mempool.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include <stack>
@@ -18,8 +31,9 @@ public:
  void* allocate() {
    void* mem;
    if (!free_list_.empty()) {
-      mem = static_cast<void*>(free_list_.top());
+      auto entry = free_list_.top();
      free_list_.pop();
+      mem = static_cast<void*>(entry);      
    } else {
      mem = ::operator new(sizeof(T));
    }
@@ -36,12 +50,13 @@ public:

  void flush() {
    while (!free_list_.empty()) {
-      ::operator delete(free_list_.top());
+      auto entry = free_list_.top();
      free_list_.pop();
+      ::operator delete(entry);      
    }
  }

 private:
-  std::stack<void*> free_list_;
+  std::stack<T*> free_list_;
  uint32_t max_size_;
-};
+};
--- a/sim/common/rvfloats.cpp
+++ b/sim/common/rvfloats.cpp
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include "rvfloats.h"
 #include <stdio.h>

@@ -16,12 +29,9 @@ inline float64_t to_float64_t(uint64_t x) { return float64_t{x}; }
 inline uint32_t from_float32_t(float32_t x) { return uint32_t(x.v); }
 inline uint64_t from_float64_t(float64_t x) { return uint64_t(x.v); }

-inline uint32_t get_fflags() {
-  uint32_t fflags = softfloat_exceptionFlags;
-  if (fflags) {
-    softfloat_exceptionFlags = 0; 
-  }
-  return fflags;
+inline void rv_init(uint32_t frm) {
+  softfloat_exceptionFlags = 0;
+  softfloat_roundingMode = frm;
 }

 #ifdef __cplusplus
@@ -29,289 +39,296 @@ extern "C" {
 #endif

 uint32_t rv_fadd_s(uint32_t a, uint32_t b, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = f32_add(to_float32_t(a), to_float32_t(b));
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float32_t(r);
 }

 uint64_t rv_fadd_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = f64_add(to_float64_t(a), to_float64_t(b));
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float64_t(r);
 }

 uint32_t rv_fsub_s(uint32_t a, uint32_t b, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = f32_sub(to_float32_t(a), to_float32_t(b));
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float32_t(r);
 }

 uint64_t rv_fsub_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = f64_sub(to_float64_t(a), to_float64_t(b));
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float64_t(r);
 }

 uint32_t rv_fmul_s(uint32_t a, uint32_t b, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = f32_mul(to_float32_t(a), to_float32_t(b));
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float32_t(r);
 }

 uint64_t rv_fmul_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = f64_mul(to_float64_t(a), to_float64_t(b));
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float64_t(r);
 }

 uint32_t rv_fmadd_s(uint32_t a, uint32_t b, uint32_t c, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = f32_mulAdd(to_float32_t(a), to_float32_t(b), to_float32_t(c));
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float32_t(r);
 }

 uint64_t rv_fmadd_d(uint64_t a, uint64_t b, uint64_t c, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = f64_mulAdd(to_float64_t(a), to_float64_t(b), to_float64_t(c));
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float64_t(r);
 }

 uint32_t rv_fmsub_s(uint32_t a, uint32_t b, uint32_t c, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto c_neg = c ^ F32_SIGN;
  auto r = f32_mulAdd(to_float32_t(a), to_float32_t(b), to_float32_t(c_neg));
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float32_t(r);
 }

 uint64_t rv_fmsub_d(uint64_t a, uint64_t b, uint64_t c, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto c_neg = c ^ F64_SIGN;
  auto r = f64_mulAdd(to_float64_t(a), to_float64_t(b), to_float64_t(c_neg));
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float64_t(r);
 }

 uint32_t rv_fnmadd_s(uint32_t a, uint32_t b, uint32_t c, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto a_neg = a ^ F32_SIGN;
  auto c_neg = c ^ F32_SIGN;
  auto r = f32_mulAdd(to_float32_t(a_neg), to_float32_t(b), to_float32_t(c_neg));
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float32_t(r);
 }

 uint64_t rv_fnmadd_d(uint64_t a, uint64_t b, uint64_t c, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto a_neg = a ^ F64_SIGN;
  auto c_neg = c ^ F64_SIGN;
  auto r = f64_mulAdd(to_float64_t(a_neg), to_float64_t(b), to_float64_t(c_neg));
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float64_t(r);
 }

 uint32_t rv_fnmsub_s(uint32_t a, uint32_t b, uint32_t c, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto a_neg = a ^ F32_SIGN;
  auto r = f32_mulAdd(to_float32_t(a_neg), to_float32_t(b), to_float32_t(c));
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float32_t(r);
 }

 uint64_t rv_fnmsub_d(uint64_t a, uint64_t b, uint64_t c, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto a_neg = a ^ F64_SIGN;
  auto r = f64_mulAdd(to_float64_t(a_neg), to_float64_t(b), to_float64_t(c));
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float64_t(r);
 }

 uint32_t rv_fdiv_s(uint32_t a, uint32_t b, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = f32_div(to_float32_t(a), to_float32_t(b));
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float32_t(r);
 }

 uint64_t rv_fdiv_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = f64_div(to_float64_t(a), to_float64_t(b));
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float64_t(r);
 }

 uint32_t rv_fsqrt_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = f32_sqrt(to_float32_t(a));
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float32_t(r);
 }

 uint64_t rv_fsqrt_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = f64_sqrt(to_float64_t(a));
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float64_t(r);
 }

 uint32_t rv_ftoi_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = f32_to_i32(to_float32_t(a), frm, true);
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return r;
 }

 uint32_t rv_ftoi_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = f64_to_i32(to_float64_t(a), frm, true);
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return r;
 }

 uint32_t rv_ftou_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = f32_to_ui32(to_float32_t(a), frm, true);
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return r;
 }

 uint32_t rv_ftou_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = f64_to_ui32(to_float64_t(a), frm, true);
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return r;
 }

 uint64_t rv_ftol_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = f32_to_i64(to_float32_t(a), frm, true);
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return r;
 }

 uint64_t rv_ftol_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = f64_to_i64(to_float64_t(a), frm, true);
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return r;
 }

 uint64_t rv_ftolu_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = f32_to_ui64(to_float32_t(a), frm, true);
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return r;
 }

 uint64_t rv_ftolu_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = f64_to_ui64(to_float64_t(a), frm, true);
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return r;
 }

 uint32_t rv_itof_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = i32_to_f32(a);
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float32_t(r);
 }

 uint64_t rv_itof_d(uint32_t a, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = i32_to_f64(a);
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float64_t(r);
 }

 uint32_t rv_utof_s(uint32_t a, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = ui32_to_f32(a);
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float32_t(r);
 }

 uint64_t rv_utof_d(uint32_t a, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = ui32_to_f64(a);
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float64_t(r);
 }

 uint32_t rv_ltof_s(uint64_t a, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = i64_to_f32(a);
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float32_t(r);
 }

 uint64_t rv_ltof_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = i64_to_f64(a);
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float64_t(r);
 }

 uint32_t rv_lutof_s(uint64_t a, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = ui64_to_f32(a);
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float32_t(r);
 }

 uint64_t rv_lutof_d(uint64_t a, uint32_t frm, uint32_t* fflags) {
-  softfloat_roundingMode = frm;
+  rv_init(frm);
  auto r = ui64_to_f64(a);
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return from_float64_t(r);
 }

 bool rv_flt_s(uint32_t a, uint32_t b, uint32_t* fflags) {
+  rv_init(0);
  auto r = f32_lt(to_float32_t(a), to_float32_t(b));
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return r;
 }

 bool rv_flt_d(uint64_t a, uint64_t b, uint32_t* fflags) {
+  rv_init(0);
  auto r = f64_lt(to_float64_t(a), to_float64_t(b));
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return r;
 }

 bool rv_fle_s(uint32_t a, uint32_t b, uint32_t* fflags) {
+  rv_init(0);
  auto r = f32_le(to_float32_t(a), to_float32_t(b));
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return r;
 }

 bool rv_fle_d(uint64_t a, uint64_t b, uint32_t* fflags) {
+  rv_init(0);
  auto r = f64_le(to_float64_t(a), to_float64_t(b));
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return r;
 }

 bool rv_feq_s(uint32_t a, uint32_t b, uint32_t* fflags) {
+  rv_init(0);
  auto r = f32_eq(to_float32_t(a), to_float32_t(b));
-  if (fflags) { *fflags = get_fflags(); }  
+  if (fflags) { *fflags = softfloat_exceptionFlags; }  
  return r;
 }

 bool rv_feq_d(uint64_t a, uint64_t b, uint32_t* fflags) {
+  rv_init(0);
  auto r = f64_eq(to_float64_t(a), to_float64_t(b));
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return r;
 }

 uint32_t rv_fmin_s(uint32_t a, uint32_t b, uint32_t* fflags) {  
  uint32_t r;
+  rv_init(0);
  if (isNaNF32UI(a) && isNaNF32UI(b)) {
    r = defaultNaNF32UI;   
  } else {
@@ -324,12 +341,13 @@ uint32_t rv_fmin_s(uint32_t a, uint32_t b, uint32_t* fflags) {
      r = b;
    }
  }
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return r;
 }

 uint64_t rv_fmin_d(uint64_t a, uint64_t b, uint32_t* fflags) {  
  uint64_t r;
+  rv_init(0);
  if (isNaNF64UI(a) && isNaNF64UI(b)) {
    r = defaultNaNF64UI;   
  } else {
@@ -342,12 +360,13 @@ uint64_t rv_fmin_d(uint64_t a, uint64_t b, uint32_t* fflags) {
      r = b;
    }
  }
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return r;
 }

 uint32_t rv_fmax_s(uint32_t a, uint32_t b, uint32_t* fflags) {
  uint32_t r;
+  rv_init(0);
  if (isNaNF32UI(a) && isNaNF32UI(b)) {
    r = defaultNaNF32UI;   
  } else {
@@ -360,12 +379,13 @@ uint32_t rv_fmax_s(uint32_t a, uint32_t b, uint32_t* fflags) {
      r = b;
    }
  }
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return r;
 }

 uint64_t rv_fmax_d(uint64_t a, uint64_t b, uint32_t* fflags) {
  uint64_t r;
+  rv_init(0);
  if (isNaNF64UI(a) && isNaNF64UI(b)) {
    r = defaultNaNF64UI;   
  } else {
@@ -378,7 +398,7 @@ uint64_t rv_fmax_d(uint64_t a, uint64_t b, uint32_t* fflags) {
      r = b;
    }
  }
-  if (fflags) { *fflags = get_fflags(); }
+  if (fflags) { *fflags = softfloat_exceptionFlags; }
  return r;
 }

--- a/sim/common/rvfloats.h
+++ b/sim/common/rvfloats.h
@@ -1,5 +1,17 @@
-#ifndef RVFLOATS_H
-#define RVFLOATS_H
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once

 #include <cstdint>

@@ -78,5 +90,3 @@ uint64_t rv_ftod(uint32_t a);
 #ifdef __cplusplus
 }
 #endif
-
-#endif
--- a/sim/common/simobject.h
+++ b/sim/common/simobject.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include <functional>
@@ -84,33 +97,39 @@ public:
  }

  uint64_t pop() {
-    auto cycle = queue_.front().cycle;
+    auto cycles = queue_.front().cycles;
    queue_.pop();
-    return cycle;
+    return cycles;
  }  

  void tx_callback(const TxCallback& callback) {
    tx_cb_ = callback;
  }

+  // Returns the `cycles` stamp of the packet at the head of the queue without
+  // removing it, or 0 when the queue is empty.
+  // NOTE(review): a head packet stamped with 0 cannot be told apart from an
+  // empty queue by the return value alone - confirm callers check for
+  // emptiness separately if that distinction matters.
+  uint64_t arrival_time() const {
+    if (queue_.empty())
+      return 0;
+    return queue_.front().cycles;
+  }
+
 protected:
  struct timed_pkt_t {
    Pkt      pkt;
-    uint64_t cycle;
+    uint64_t cycles;
  };

  std::queue<timed_pkt_t> queue_;
  SimPort*   peer_;
  TxCallback tx_cb_;

-  void push(const Pkt& data, uint64_t cycle) {
+  void push(const Pkt& data, uint64_t cycles) {
    if (tx_cb_) {
-      tx_cb_(data, cycle);
+      tx_cb_(data, cycles);
    }
    if (peer_) {
-      peer_->push(data, cycle);
+      peer_->push(data, cycles);
    } else {
-      queue_.push({data, cycle});
+      queue_.push({data, cycles});
    }
  }

@@ -129,14 +148,14 @@ public:
  
  virtual void fire() const = 0;

-  uint64_t time() const {
-    return time_;
+  uint64_t cycles() const {
+    return cycles_;
  }

 protected:
-  SimEventBase(uint64_t time) : time_(time) {}
+  SimEventBase(uint64_t cycles) : cycles_(cycles) {}

-  uint64_t time_;
+  uint64_t cycles_;
 };

 ///////////////////////////////////////////////////////////////////////////////
@@ -150,8 +169,8 @@ public:

  typedef std::function<void (const Pkt&)> Func;

-  SimCallEvent(const Func& func, const Pkt& pkt, uint64_t time) 
-    : SimEventBase(time)
+  SimCallEvent(const Func& func, const Pkt& pkt, uint64_t cycles) 
+    : SimEventBase(cycles)
    , func_(func)
    , pkt_(pkt)
  {}
@@ -180,11 +199,11 @@ template <typename Pkt>
 class SimPortEvent : public SimEventBase {
 public:
  void fire() const override {
-    const_cast<SimPort<Pkt>*>(port_)->push(pkt_, time_);
+    const_cast<SimPort<Pkt>*>(port_)->push(pkt_, cycles_);
  }

-  SimPortEvent(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t time) 
-    : SimEventBase(time) 
+  SimPortEvent(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t cycles) 
+    : SimEventBase(cycles) 
    , port_(port)
    , pkt_(pkt)
  {}
@@ -330,7 +349,7 @@ public:
    auto evt_it_end = events_.end();
    while (evt_it != evt_it_end) {
      auto& event = *evt_it;
-      if (cycles_ >= event->time()) {        
+      if (cycles_ >= event->cycles()) {        
        event->fire();
        evt_it = events_.erase(evt_it);
      } else {        
@@ -395,5 +414,5 @@ void SimPort<Pkt>::send(const Pkt& pkt, uint64_t delay) const {
    reinterpret_cast<const SimPort<Pkt>*>(peer_)->send(pkt, delay);    
  } else {
    SimPlatform::instance().schedule(this, pkt, delay);
-  }  
-}
+  } 
+}
--- a/sim/common/stringutil.h
+++ b/sim/common/stringutil.h
@@ -0,0 +1,78 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <iostream>
+#include  <iomanip>
+
+// Non-owning view over a raw byte buffer that can be written to an output
+// stream as one contiguous hexadecimal string.
+// The buffer is borrowed, not copied: it must stay alive for as long as the
+// ByteStream object is used.
+class ByteStream : public std::istream {
+public:
+  // buf:  first byte of the buffer to print (never written through).
+  // size: number of bytes in the buffer.
+  // The std::istream base is initialized explicitly with no stream buffer:
+  // std::istream has no standard default constructor, so leaving the base out
+  // of the initializer list only builds on libraries that add one.
+  ByteStream(const void *buf, std::size_t size)
+    : std::istream(nullptr)
+    , buf_(buf)
+    , size_(size)
+  {}
+
+  // Writes every byte as two zero-padded hex digits, starting with the byte
+  // at the highest address and ending with the byte at the lowest address.
+  // The stream's flags, width and fill are restored before returning.
+  friend std::ostream& operator<<(std::ostream& os, const ByteStream& obj) {
+    const auto oldflags = os.flags();
+    const auto oldwidth = os.width();
+    const auto oldfill  = os.fill();
+    // read through a const byte pointer; unsigned char avoids sign extension
+    const unsigned char* bytes = static_cast<const unsigned char*>(obj.buf_);
+    for (std::size_t i = obj.size_; i-- > 0;) {
+      // setw() only applies to the next insertion, so it is repeated per byte
+      os << std::hex << std::setw(2) << std::setfill('0') << static_cast<int>(bytes[i]);
+    }
+    os.fill(oldfill);
+    os.width(oldwidth);
+    os.flags(oldflags);
+    return os;
+  }
+
+private:
+  const void *buf_;   // borrowed buffer start
+  std::size_t size_;  // buffer length in bytes
+};
+
+// Stream buffer filter that forwards every character to a destination buffer
+// and inserts a fixed run of spaces in front of each non-empty line.
+// When constructed from an std::ostream it installs itself as that stream's
+// buffer and puts the original buffer back on destruction.
+class IndentStream : public std::streambuf {
+public:
+  // Filters into `dest` directly; no stream is modified.
+  // indent: number of spaces inserted at the start of each non-empty line.
+  explicit IndentStream(std::streambuf* dest, int indent = 4)
+    : dest_(dest)
+    , isBeginLine_(true)
+    , indent_(indent, ' ')
+    , owner_(nullptr)
+  {}
+  
+  // Captures the current buffer of `dest` as the destination and replaces it
+  // with this filter, so everything written to `dest` gets indented.
+  explicit IndentStream(std::ostream& dest, int indent = 4)
+    : dest_(dest.rdbuf())
+    , isBeginLine_(true)
+    , indent_(indent, ' ')
+    , owner_(&dest) {
+      owner_->rdbuf(this);
+  }
+
+  // Restores the owning stream's original buffer, if a stream was hooked.
+  ~IndentStream() override {
+    if (owner_)
+        owner_->rdbuf(dest_);
+  }
+
+protected:
+  // Called for every character because no put area is ever set up.
+  // Returns the result of writing `ch` to the destination buffer.
+  int overflow(int ch) override {
+    // eof is a "nothing to write" request, not a character: report success
+    // without emitting a byte or touching the line state
+    if (traits_type::eq_int_type(ch, traits_type::eof())) {
+      return traits_type::not_eof(ch);
+    }
+    const bool isNewline = (ch == '\n');
+    // empty lines are passed through without trailing indentation
+    if (isBeginLine_ && !isNewline) {
+      dest_->sputn(indent_.data(), indent_.size());
+    }
+    isBeginLine_ = isNewline;
+    return dest_->sputc(traits_type::to_char_type(ch));
+  }
+
+private:
+  std::streambuf* dest_;         // where filtered output goes
+  bool            isBeginLine_;  // true until the first character of a line is seen
+  std::string     indent_;       // the spaces inserted per line
+  std::ostream*   owner_;        // stream whose buffer was replaced, or nullptr
+};
--- a/sim/common/texturing.h
+++ b/sim/common/texturing.h
@@ -1,237 +0,0 @@
-#pragma once
-
-#include <cstdint>
-#include <cocogfx/include/fixed.h>
-#include <bitmanip.h>
-
-using namespace cocogfx;
-
-enum class WrapMode {
-  Clamp,
-  Repeat,
-  Mirror,
-};
-
-enum class TexFormat {
-  A8R8G8B8,
-  R5G6B5,
-  A1R5G5B5,
-  A4R4G4B4,
-  A8L8,
-  L8,  
-  A8,  
-};
-
-template <uint32_t F, typename T = int32_t>
-T Clamp(Fixed<F,T> fx, WrapMode mode) {
-  switch (mode) {
-  case WrapMode::Clamp:  return (fx.data() < 0) ? 0 : ((fx.data() > Fixed<F,T>::MASK) ? Fixed<F,T>::MASK : fx.data());
-  case WrapMode::Repeat: return (fx.data() & Fixed<F,T>::MASK);
-  case WrapMode::Mirror: return (bit_get(fx.data(), Fixed<F,T>::FRAC) ? ~fx.data() : fx.data());
-  default: 
-    std::abort();
-    return 0;    
-  }
-}
-
-inline uint32_t Stride(TexFormat format) {
-  switch (format) {
-  case TexFormat::A8R8G8B8: 
-    return 4;
-  case TexFormat::R5G6B5:
-  case TexFormat::A1R5G5B5:
-  case TexFormat::A4R4G4B4:
-  case TexFormat::A8L8:
-    return 2;
-  case TexFormat::L8:
-  case TexFormat::A8:
-    return 1;
-  default: 
-    std::abort();
-    return 0;
-  }
-}
-
-inline void Unpack8888(TexFormat format, 
-                       uint32_t texel, 
-                       uint32_t* lo, 
-                       uint32_t* hi) {
-  uint32_t r, g, b, a;
-  switch (format) {
-  case TexFormat::A8R8G8B8:    
-    r = (texel >> 16) & 0xff;
-    g = (texel >> 8) & 0xff;
-    b = texel & 0xff;
-    a = texel >> 24;
-    break;
-  case TexFormat::R5G6B5: 
-    r = ((texel >> 11) << 3) | (texel >> 13);    
-    g = ((texel >> 3) & 0xfc) | ((texel >> 9) & 0x3);
-    b = ((texel & 0x1f) << 3) | ((texel & 0x1c) >> 2);    
-    a = 0xff;
-    break;
-  case TexFormat::A1R5G5B5:         
-    r = ((texel >> 7) & 0xf8) | ((texel << 1) >> 13);
-    g = ((texel >> 2) & 0xf8) | ((texel >> 7) & 7);
-    b = ((texel & 0x1f) << 3) | ((texel & 0x1c) >> 2);
-    a = 0xff * (texel >> 15);
-    break;
-  case TexFormat::A4R4G4B4:   
-    r = ((texel >> 4) & 0xf0) | ((texel >> 8) & 0x0f);
-    g = ((texel & 0xf0) >> 0) | ((texel & 0xf0) >> 4);
-    b = ((texel & 0x0f) << 4) | ((texel & 0x0f) >> 0);
-    a = ((texel >> 8) & 0xf0) | (texel >> 12);
-    break;
-  case TexFormat::A8L8:
-    r = texel & 0xff;
-    g = r;
-    b = r;
-    a = texel >> 8;
-    break;
-  case TexFormat::L8:
-    r = texel & 0xff;
-    g = r;
-    b = r;
-    a = 0xff;
-    break;
-  case TexFormat::A8:
-    r = 0xff;
-    g = 0xff;
-    b = 0xff;
-    a = texel & 0xff;
-    break;
-  default: 
-    std::abort();
-  } 
-  *lo = (r << 16) + b;
-  *hi = (a << 16) + g;
-}
-
-inline void Unpack8888(uint32_t texel, uint32_t* lo, uint32_t* hi) {
-  *lo = texel & 0x00ff00ff;
-  *hi = (texel >> 8) & 0x00ff00ff;
-}
-
-inline uint32_t Pack8888(uint32_t lo, uint32_t hi) {
-  return (hi << 8) | lo;
-}
-
-inline uint32_t Lerp8888(uint32_t a, uint32_t b, uint32_t f) {
-  return (a + (((b - a) * f) >> 8)) & 0x00ff00ff;
-}
-
-template <uint32_t F, typename T = int32_t>
-void TexAddressLinear(Fixed<F,T> fu, 
-                      Fixed<F,T> fv, 
-                      uint32_t log_width,
-                      uint32_t log_height,
-                      WrapMode wrapu,
-                      WrapMode wrapv,
-                      uint32_t* addr00,
-                      uint32_t* addr01,
-                      uint32_t* addr10,
-                      uint32_t* addr11,
-                      uint32_t* alpha,
-                      uint32_t* beta
-) {
-  auto delta_x = Fixed<F,T>::make(Fixed<F,T>::HALF >> log_width);
-  auto delta_y = Fixed<F,T>::make(Fixed<F,T>::HALF >> log_height);
-
-  uint32_t u0 = Clamp(fu - delta_x, wrapu);    
-  uint32_t u1 = Clamp(fu + delta_x, wrapu);
-  uint32_t v0 = Clamp(fv - delta_y, wrapv);     
-  uint32_t v1 = Clamp(fv + delta_y, wrapv);
-
-  uint32_t shift_u = (Fixed<F,T>::FRAC - log_width);
-  uint32_t shift_v = (Fixed<F,T>::FRAC - log_height);
-
-  uint32_t x0s = (u0 << 8) >> shift_u;
-  uint32_t y0s = (v0 << 8) >> shift_v;
-
-  uint32_t x0 = x0s >> 8;
-  uint32_t y0 = y0s >> 8;
-  uint32_t x1 = u1 >> shift_u;
-  uint32_t y1 = v1 >> shift_v;
-
-  *addr00 = x0 + (y0 << log_width);
-  *addr01 = x1 + (y0 << log_width);
-  *addr10 = x0 + (y1 << log_width);
-  *addr11 = x1 + (y1 << log_width);
-
-  *alpha  = x0s & 0xff;
-  *beta   = y0s & 0xff;
-
-  //printf("*** fu=0x%x, fv=0x%x, u0=0x%x, u1=0x%x, v0=0x%x, v1=0x%x, x0=0x%x, x1=0x%x, y0=0x%x, y1=0x%x, addr00=0x%x, addr01=0x%x, addr10=0x%x, addr11=0x%x\n", fu.data(), fv.data(), u0, u1, v0, v1, x0, x1, y0, y1, *addr00, *addr01, *addr10, *addr11);
-}
-
-template <uint32_t F, typename T = int32_t>
-void TexAddressPoint(Fixed<F,T> fu, 
-                     Fixed<F,T> fv, 
-                     uint32_t log_width,
-                     uint32_t log_height,
-                     WrapMode wrapu,
-                     WrapMode wrapv,
-                     uint32_t* addr
-) {
-  uint32_t u = Clamp(fu, wrapu);
-  uint32_t v = Clamp(fv, wrapv);
-  
-  uint32_t x = u >> (Fixed<F,T>::FRAC - log_width);
-  uint32_t y = v >> (Fixed<F,T>::FRAC - log_height);
-  
-  *addr = x + (y << log_width);
-
-  //printf("*** fu=0x%x, fv=0x%x, u=0x%x, v=0x%x, x=0x%x, y=0x%x, addr=0x%x\n", fu.data(), fv.data(), u, v, x, y, *addr);
-}
-
-inline uint32_t TexFilterLinear(
-  TexFormat format,
-  uint32_t texel00,  
-  uint32_t texel01,
-  uint32_t texel10,
-  uint32_t texel11,
-  uint32_t alpha,
-  uint32_t beta
-) {
-  uint32_t c01l, c01h;
-  {
-    uint32_t c0l, c0h, c1l, c1h;
-    Unpack8888(format, texel00, &c0l, &c0h);
-    Unpack8888(format, texel01, &c1l, &c1h);
-    c01l = Lerp8888(c0l, c1l, alpha);
-    c01h = Lerp8888(c0h, c1h, alpha);
-  }
-
-  uint32_t c23l, c23h;
-  {
-    uint32_t c2l, c2h, c3l, c3h;
-    Unpack8888(format, texel10, &c2l, &c2h);
-    Unpack8888(format, texel11, &c3l, &c3h);
-    c23l = Lerp8888(c2l, c3l, alpha);
-    c23h = Lerp8888(c2h, c3h, alpha);
-  }
-
-  uint32_t color;
-  {
-    uint32_t cl = Lerp8888(c01l, c23l, beta);
-    uint32_t ch = Lerp8888(c01h, c23h, beta);
-    color = Pack8888(cl, ch);
-  }
-
-  //printf("*** texel00=0x%x, texel01=0x%x, texel10=0x%x, texel11=0x%x, color=0x%x\n", texel00, texel01, texel10, texel11, color);
-
-  return color;
-}
-
-inline uint32_t TexFilterPoint(TexFormat format, uint32_t texel) {
-  uint32_t color;
-  {
-    uint32_t cl, ch;
-    Unpack8888(format, texel, &cl, &ch);
-    color = Pack8888(cl, ch);
-  }
-
-  //printf("*** texel=0x%x, color=0x%x\n", texel, color);
-
-  return color;
-}
--- a/sim/common/util.cpp
+++ b/sim/common/util.cpp
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include "util.h"
 #include <string.h>

@@ -7,4 +20,20 @@ const char* fileExtension(const char* filepath) {
    if (ext == NULL || ext == filepath) 
      return "";
    return ext + 1;
+}
+
+// Allocates `size` bytes starting at an address that is a multiple of
+// `alignment`, which must be a non-zero power of two.
+// Returns NULL when the request cannot be satisfied (out of memory, or the
+// padded size does not fit in size_t).
+// The returned pointer is offset from the underlying malloc() block, so it
+// must be released with aligned_free() and never with free().
+void* aligned_malloc(size_t size, size_t alignment) {
+  assert(alignment != 0 && (alignment & (alignment - 1)) == 0);   // Power of 2 alignment.
+  // reserve margin for alignment and storing of unaligned address
+  size_t margin = (alignment-1) + sizeof(void*);
+  // reject requests whose padded size would wrap around; the first test also
+  // catches alignment == 0 in builds where the assert is compiled out
+  if (margin < sizeof(void*) || size + margin < size)
+    return NULL;
+  void *unaligned_addr = malloc(size + margin);
+  // without a block there is nothing to align and nowhere to store the header
+  if (unaligned_addr == NULL)
+    return NULL;
+  // round down from (block + margin) so the slot just below the result is
+  // still inside the block and can hold the original pointer
+  void **aligned_addr = (void**)((uintptr_t)(((uint8_t*)unaligned_addr) + margin) & ~(alignment-1));
+  aligned_addr[-1] = unaligned_addr;
+  return aligned_addr;
+}
+
+// Releases a block returned by aligned_malloc().
+// Passing NULL is a no-op, matching the behavior of free().
+void aligned_free(void *ptr) {
+  // a NULL pointer has no header slot in front of it to read
+  if (ptr == NULL)
+    return;
+  // retrieve the stored unaligned address and use it to free the allocation
+  void* unaligned_addr = ((void**)ptr)[-1];
+  free(unaligned_addr);
 }
--- a/sim/common/util.h
+++ b/sim/common/util.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include <cstdint>
@@ -49,4 +62,7 @@ const char* fileExtension(const char* filepath);
 #define DISABLE_WARNING_UNUSED_PARAMETER
 #define DISABLE_WARNING_UNREFERENCED_FUNCTION
 #define DISABLE_WARNING_ANONYMOUS_STRUCT
-#endif
+#endif
+
+void *aligned_malloc(size_t size, size_t alignment);
+void aligned_free(void *ptr);
--- a/sim/common/uuid_gen.h
+++ b/sim/common/uuid_gen.h
@@ -0,0 +1,55 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#pragma once
+
+#include <unordered_map>
+
+namespace vortex {
+
+// Produces a 32-bit tag for each program counter value it is asked about.
+// The low 16 bits hold an id assigned the first time a PC is seen; the high
+// 16 bits hold how many times that PC has been requested again since then.
+// NOTE(review): both fields are 16 bits wide, so more than 65536 distinct PCs
+// or more than 65535 repeats of one PC wrap around - confirm that is
+// acceptable for the traces this is used with.
+class UUIDGenerator {
+public:    
+    UUIDGenerator() : ids_(0) {}
+    virtual ~UUIDGenerator() {}
+
+    // Returns the tag for `PC` and records it as the latest tag for that PC.
+    uint32_t get_uuid(uint64_t PC) {
+        auto entry = uuid_map_.find(PC);
+        if (entry == uuid_map_.end()) {
+            // first request for this PC: next free id, repeat count of zero
+            const uint32_t fresh = ids_++;
+            uuid_map_[PC] = fresh;
+            return fresh;
+        }
+        // seen before: keep the id, bump the repeat count stored above it
+        const uint32_t previous = entry->second;
+        const uint32_t id       = previous & 0xffff;
+        const uint32_t repeats  = (previous >> 16) + 1;
+        const uint32_t packed   = (repeats << 16) | id;
+        entry->second = packed;
+        return packed;
+    }
+
+    // Forgets every PC and restarts id assignment from zero.
+    void reset() {
+        ids_ = 0;
+        uuid_map_.clear();
+    }
+
+private:
+
+    std::unordered_map<uint64_t, uint32_t> uuid_map_;  // PC -> last tag returned
+    uint32_t ids_;                                     // next id to hand out
+};
+
+}
--- a/sim/opaesim/.gitignore
+++ b/sim/opaesim/.gitignore
--- a/sim/opaesim/Makefile
+++ b/sim/opaesim/Makefile
@@ -0,0 +1,138 @@
+XLEN ?= 32
+DESTDIR ?= .
+RTL_DIR = ../../hw/rtl
+DPI_DIR = ../../hw/dpi
+AFU_DIR = $(RTL_DIR)/afu/opae
+SCRIPT_DIR = ../../hw/scripts
+THIRD_PARTY_DIR = ../../third_party
+
+CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds
+CXXFLAGS += -fPIC -Wno-maybe-uninitialized
+CXXFLAGS += -I.. -I../../../hw -I../../common -I$(abspath $(DESTDIR))
+CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include
+CXXFLAGS += -I../$(THIRD_PARTY_DIR)
+CXXFLAGS += -DXLEN_$(XLEN)
+
+LDFLAGS += -shared ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
+LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator -pthread
+
+# control RTL debug tracing states
+DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_PIPELINE  
+DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_ICACHE
+DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_DCACHE
+DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_MEM
+DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_BANK 
+DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_MSHR
+DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_TAG
+DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_DATA
+DBG_TRACE_FLAGS += -DDBG_TRACE_AFU
+DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
+DBG_TRACE_FLAGS += -DDBG_TRACE_TEX
+DBG_TRACE_FLAGS += -DDBG_TRACE_RASTER
+DBG_TRACE_FLAGS += -DDBG_TRACE_ROP
+DBG_TRACE_FLAGS += -DDBG_TRACE_GBAR
+
+# Control logic analyzer monitors
+DBG_SCOPE_FLAGS += -DDBG_SCOPE_AFU
+DBG_SCOPE_FLAGS += -DDBG_SCOPE_ISSUE
+DBG_SCOPE_FLAGS += -DDBG_SCOPE_FETCH
+DBG_SCOPE_FLAGS += -DDBG_SCOPE_LSU
+DBG_SCOPE_FLAGS += -DDBG_SCOPE_RASTER
+DBG_SCOPE_FLAGS += -DDBG_SCOPE_MSCHED
+
+# AFU parameters
+CONFIGS += -DPLATFORM_PROVIDES_LOCAL_MEMORY
+ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_BANKS,$(CONFIGS)))
+	CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=2
+endif
+ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH,$(CONFIGS)))
+	CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=26
+endif
+ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH,$(CONFIGS)))
+	CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH=512
+endif
+ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH,$(CONFIGS)))
+	CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH=4
+endif
+
+DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)
+
+SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
+SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
+SRCS += fpga.cpp opae_sim.cpp
+
+RTL_PKGS = $(AFU_DIR)/local_mem_cfg_pkg.sv $(AFU_DIR)/ccip/ccip_if_pkg.sv
+RTL_PKGS += $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv
+
+# FPU sources: the in-tree FPU directory is always on the include path; the
+# fpnew packages and include directories are added only when CONFIGS selects
+# FPU_FPNEW. Every package is named with its full .sv file name so the list
+# does not depend on the tool guessing a missing extension.
+FPU_INCLUDE = -I$(RTL_DIR)/fpu
+ifneq (,$(findstring FPU_FPNEW,$(CONFIGS)))
+	RTL_PKGS += $(THIRD_PARTY_DIR)/fpnew/src/fpnew_pkg.sv $(THIRD_PARTY_DIR)/fpnew/src/common_cells/src/cf_math_pkg.sv $(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl/defs_div_sqrt_mvp.sv
+	FPU_INCLUDE += -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -I$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(THIRD_PARTY_DIR)/fpnew/src
+endif
+RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache $(FPU_INCLUDE)
+RTL_INCLUDE += -I$(AFU_DIR) -I$(AFU_DIR)/ccip
+
+TOP = vortex_afu_shim
+
+VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
+VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
+VL_FLAGS += --x-initial unique --x-assign unique
+VL_FLAGS += -DSIMULATION
+VL_FLAGS += -DXLEN_$(XLEN)
+VL_FLAGS += $(CONFIGS)
+VL_FLAGS += verilator.vlt
+VL_FLAGS += $(RTL_INCLUDE)
+VL_FLAGS += $(RTL_PKGS)
+VL_FLAGS += $(DBG_SCOPE_FLAGS)
+
+CXXFLAGS += $(CONFIGS)
+
+# Enable Verilator multithreaded simulation
+THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count())')
+VL_FLAGS += -j $(THREADS)
+#VL_FLAGS += --threads $(THREADS)
+
+# Debugging
+ifdef DEBUG
+	VL_FLAGS += --trace --trace-structs $(DBG_FLAGS)
+	CXXFLAGS += -g -O0 $(DBG_FLAGS)
+else    
+	VL_FLAGS += -DNDEBUG
+	CXXFLAGS += -O3 -DNDEBUG
+endif
+
+# Enable scope analyzer
+ifdef SCOPE
+	VL_FLAGS += -DSCOPE
+	CXXFLAGS += -DSCOPE
+	SCOPE_JSON = $(DESTDIR)/scope.json
+endif
+
+# Enable perf counters
+ifdef PERF
+	VL_FLAGS += -DPERF_ENABLE
+	CXXFLAGS += -DPERF_ENABLE
+endif
+
+# use our OPAE shim
+VL_FLAGS += -DNOPAE
+CXXFLAGS += -DNOPAE
+
+PROJECT = libopae-c-sim.so
+
+# Default goal: build the simulator library. The prerequisite carries the same
+# $(DESTDIR)/ prefix as the rule that produces the library, so the goal still
+# resolves when DESTDIR is overridden on the command line.
+all: $(DESTDIR)/$(PROJECT)
+
+# all and clean name actions, not files
+.PHONY: all clean
+
+$(DESTDIR)/vortex.xml:
+	verilator --xml-only -O0 $(VL_FLAGS) $(TOP) --xml-output $(DESTDIR)/vortex.xml
+
+$(DESTDIR)/scope.json: $(DESTDIR)/vortex.xml
+	$(SCRIPT_DIR)/scope.py $(DESTDIR)/vortex.xml -o $(DESTDIR)/scope.json
+
+$(DESTDIR)/vortex_afu.h : $(AFU_DIR)/vortex_afu.vh
+	$(SCRIPT_DIR)/gen_config.py -i $(AFU_DIR)/vortex_afu.vh -o $(DESTDIR)/vortex_afu.h
+
+$(DESTDIR)/$(PROJECT): $(SRCS) $(DESTDIR)/vortex_afu.h $(SCOPE_JSON)
+	verilator --build --exe -O3 $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(DESTDIR)/$(PROJECT)
+
+clean:
+	rm -rf obj_dir $(DESTDIR)/vortex.xml $(DESTDIR)/scope.json $(DESTDIR)/vortex_afu.h $(DESTDIR)/$(PROJECT)
--- a/sim/opaesim/fpga.cpp
+++ b/sim/opaesim/fpga.cpp
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include <stdint.h>
 #include <iostream>
 #include <stdio.h>
@@ -8,10 +21,61 @@
 #include "fpga.h"
 #include "opae_sim.h"
 #include <VX_config.h>
+#include <util.h>

 using namespace vortex;

+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Simulator stub: ignores both arguments, does not write to *prop, and always
+// returns FPGA_OK.
+extern fpga_result fpgaGetProperties(fpga_token token, fpga_properties *prop) {  
+  __unused (token, prop);
+  return FPGA_OK;
+}
+
+// Simulator stub: the requested object type is not recorded anywhere; always
+// returns FPGA_OK.
+extern fpga_result fpgaPropertiesSetObjectType(fpga_properties prop, fpga_objtype objtype) {  
+  __unused (prop, objtype);
+  return FPGA_OK;  
+}
+
+// Simulator stub: the GUID is not recorded anywhere; always returns FPGA_OK.
+extern fpga_result fpgaPropertiesSetGUID(fpga_properties prop, fpga_guid guid) {  
+  __unused (prop, guid);
+  return FPGA_OK;  
+}
+
+// Simulator stub: nothing is released and *prop is left untouched; always
+// returns FPGA_OK.
+extern fpga_result fpgaDestroyProperties(fpga_properties *prop) {
+  __unused (prop);  
+  return FPGA_OK;  
+}
+
+// Simulator stand-in for device enumeration: the filters are ignored and
+// exactly one match is always reported.
+// tokens/max_tokens: when there is room for at least one entry, tokens[0] is
+//   set to NULL so the caller receives a determinate token value.
+// num_matches: when non-NULL, receives 1.
+// Always returns FPGA_OK.
+extern fpga_result fpgaEnumerate(const fpga_properties *filters, uint32_t num_filters, fpga_token *tokens, uint32_t max_tokens, uint32_t *num_matches) {  
+  __unused (filters, num_filters);
+  if (tokens && max_tokens != 0) {
+    // one match is reported below, so fill the matching slot instead of
+    // leaving the caller's token uninitialized
+    tokens[0] = NULL;
+  }
+  if (num_matches) {
+    *num_matches = 1;
+  }
+  return FPGA_OK;  
+}
+
+// Simulator stub: nothing is released and *token is left untouched; always
+// returns FPGA_OK.
+extern fpga_result fpgaDestroyToken(fpga_token *token) {  
+  __unused (token);
+  return FPGA_OK;  
+}
+
+// Reports a fixed local memory size through *lms when lms is non-NULL:
+// 8 GB when XLEN is 64, 4 GB otherwise. `filters` is ignored and the result
+// is always FPGA_OK.
+// NOTE(review): the build passes -DXLEN_<n>, not -DXLEN; this presumably
+// relies on an included header defining XLEN - confirm, because an undefined
+// XLEN evaluates to 0 in #if and silently selects the 4 GB branch.
+extern fpga_result fpgaPropertiesGetLocalMemorySize(const fpga_properties *filters, uint64_t* lms) {  
+  __unused (filters);
+  if (lms) {
+  #if (XLEN == 64)
+    *lms = 0x200000000; // 8 GB
+  #else
+    *lms = 0x100000000; // 4 GB
+  #endif
+  }
+  return FPGA_OK;  
+}
+
 extern fpga_result fpgaOpen(fpga_token token, fpga_handle *handle, int flags) {
+  __unused (token);
  if (NULL == handle || flags != 0)
    return FPGA_INVALID_PARAM;
  auto sim = new opae_sim();    
@@ -83,4 +147,8 @@ extern fpga_result fpgaReadMMIO64(fpga_handle handle, uint32_t mmio_num, uint64_

 extern const char *fpgaErrStr(fpga_result e) {
  return "";
-}
+}
+
+#ifdef __cplusplus
+}
+#endif
--- a/sim/opaesim/fpga.h
+++ b/sim/opaesim/fpga.h
@@ -1,7 +1,20 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #ifndef __FPGA_H__
 #define __FPGA_H__

-#include <stdio.h>
+#include <stdint.h>

 #ifdef __cplusplus
 extern "C" {
@@ -21,28 +34,21 @@ typedef enum {
 	FPGA_RECONF_ERROR    /**< Error while reconfiguring FPGA */
 } fpga_result;

+typedef enum { 
+	FPGA_DEVICE = 0,
+	FPGA_ACCELERATOR
+} fpga_objtype;
+
 typedef void *fpga_handle;

 typedef void *fpga_token;

-fpga_result fpgaOpen(fpga_token token, fpga_handle *handle, int flags);
+typedef void *fpga_properties;

-fpga_result fpgaClose(fpga_handle handle);
-
-fpga_result fpgaPrepareBuffer(fpga_handle handle, uint64_t len, void **buf_addr, uint64_t *wsid, int flags);
-
-fpga_result fpgaReleaseBuffer(fpga_handle handle, uint64_t wsid);
-
-fpga_result fpgaGetIOAddress(fpga_handle handle, uint64_t wsid, uint64_t *ioaddr);
-
-fpga_result fpgaWriteMMIO64(fpga_handle handle, uint32_t mmio_num, uint64_t offset, uint64_t value);
-
-fpga_result fpgaReadMMIO64(fpga_handle handle, uint32_t mmio_num, uint64_t offset, uint64_t *value);
-
-const char *fpgaErrStr(fpga_result e);
+typedef uint8_t fpga_guid[16];

 #ifdef __cplusplus
-} // extern "C"
-#endif // __cplusplus
+}
+#endif

 #endif // __FPGA_H__
--- a/sim/opaesim/opae_sim.cpp
+++ b/sim/opaesim/opae_sim.cpp
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include "opae_sim.h"

 #include <verilated.h>
@@ -25,6 +38,7 @@
 #include <list>
 #include <queue>
 #include <unordered_map>
+#include <util.h>

 #ifndef MEMORY_BANKS 
  #ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS
@@ -62,6 +76,8 @@

 #define RAM_PAGE_SIZE 4096

+#define CPU_GPU_LATENCY 200
+
 using namespace vortex;

 static uint64_t timestamp = 0;
@@ -70,23 +86,6 @@ double sc_time_stamp() {
  return timestamp;
 }

-static void *__aligned_malloc(size_t alignment, size_t size) {
-  // reserve margin for alignment and storing of unaligned address
-  size_t margin = (alignment-1) + sizeof(void*);
-  void *unaligned_addr = malloc(size + margin);
-  void **aligned_addr = (void**)((uintptr_t)(((uint8_t*)unaligned_addr) + margin) & ~(alignment-1));
-  aligned_addr[-1] = unaligned_addr;
-  return aligned_addr;
-}
-
-static void __aligned_free(void *ptr) {
-  // retreive the stored unaligned address and use it to free the allocation
-  void* unaligned_addr = ((void**)ptr)[-1];
-  free(unaligned_addr);
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
 static bool trace_enabled = false;
 static uint64_t trace_start_time = TRACE_START_TIME;
 static uint64_t trace_stop_time = TRACE_STOP_TIME;
@@ -158,7 +157,7 @@ public:
      future_.wait();
    } 
    for (auto& buffer : host_buffers_) {
-      __aligned_free(buffer.second.data);
+      aligned_free(buffer.second.data);
    }   
  #ifdef VCD_OUTPUT
    trace_->close();
@@ -176,9 +175,13 @@ public:
  }

  int prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags) {
-    auto alloc = __aligned_malloc(CACHE_BLOCK_SIZE, len);
+    auto alloc = aligned_malloc(len, CACHE_BLOCK_SIZE);
    if (alloc == NULL)
      return -1;
+    // set uninitialized data to "baadf00d"
+    for (uint32_t i = 0; i < len; ++i) {
+        ((uint8_t*)alloc)[i] = (0xbaadf00d >> ((i & 0x3) * 8)) & 0xff;
+    }
    host_buffer_t buffer;
    buffer.data   = (uint64_t*)alloc;
    buffer.size   = len;
@@ -193,7 +196,7 @@ public:
  void release_buffer(uint64_t wsid) {
    auto it = host_buffers_.find(wsid);
    if (it != host_buffers_.end()) {
-      __aligned_free(it->second.data);
+      aligned_free(it->second.data);
      host_buffers_.erase(it);
    }
  }
@@ -205,6 +208,11 @@ public:
  void read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) {
    std::lock_guard<std::mutex> guard(mutex_);

+    // simulate CPU-GPU latency
+    for (uint32_t i = 0; i < CPU_GPU_LATENCY; ++i) 
+      this->tick();
+
+    // simulate mmio request
    device_->vcp2af_sRxPort_c0_mmioRdValid = 1;
    device_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4;
    device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1;
@@ -217,7 +225,12 @@ public:

  void write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value) {
    std::lock_guard<std::mutex> guard(mutex_);
+
+    // simulate CPU-GPU latency
+    for (uint32_t i = 0; i < CPU_GPU_LATENCY; ++i) 
+      this->tick();
    
+    // simulate mmio request
    device_->vcp2af_sRxPort_c0_mmioWrValid = 1;  
    device_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4;
    device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1;
@@ -254,7 +267,14 @@ private:
      this->eval();
    } 

-    device_->reset = 0;
+    device_->reset = 0;    
+
+    for (int i = 0; i < RESET_DELAY; ++i) {
+      device_->clk = 0;
+      this->eval();
+      device_->clk = 1;
+      this->eval();
+    }
    
    // Turn on assertion after reset
    Verilated::assertOn(true);
@@ -289,7 +309,7 @@ private:
  #endif
  }

-  void eval() {  
+  void eval() {
    device_->eval();
  #ifdef VCD_OUTPUT
    if (sim_trace_enabled()) {
@@ -396,10 +416,10 @@ private:

      // process memory requests
      assert(!device_->avs_read[b] || !device_->avs_write[b]);
-      unsigned byte_addr = device_->avs_address[b] * MEM_BLOCK_SIZE;
+      unsigned byte_addr = (device_->avs_address[b] * MEMORY_BANKS + b) * MEM_BLOCK_SIZE;
      if (device_->avs_write[b]) {           
        uint64_t byteen = device_->avs_byteenable[b];        
-        uint8_t* data = (uint8_t*)device_->avs_writedata[b].data();
+        uint8_t* data = (uint8_t*)(device_->avs_writedata[b].data());
        for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
          if ((byteen >> i) & 0x1) {            
            (*ram_)[byte_addr + i] = data[i];
@@ -419,8 +439,7 @@ private:
          0
        );
        dram_queue_.push(dram_req);
-      }
-
+      } else
      if (device_->avs_read[b]) {
        auto mem_req = new mem_rd_req_t();
        mem_req->addr = device_->avs_address[b];
@@ -491,7 +510,7 @@ private:

  std::mutex mutex_;

-  RAM *ram_;
+  RAM* ram_;

  ramulator::Gem5Wrapper* dram_;

@@ -531,4 +550,4 @@ void opae_sim::write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value)

 void opae_sim::read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) {
  impl_->read_mmio64(mmio_num, offset, value);
-}
+}
--- a/sim/opaesim/opae_sim.h
+++ b/sim/opaesim/opae_sim.h
@@ -0,0 +1,43 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <stdint.h>
+namespace vortex {
+
+class RAM;
+
+class opae_sim { // simulator front-end; its only data member is the opaque Impl pointer below
+public:
+  
+  opae_sim();
+  virtual ~opae_sim();
+  // NOTE(review): the buffer calls below presumably mirror the OPAE shared-buffer API — confirm.
+  int prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags);
+
+  void release_buffer(uint64_t wsid);
+
+  void get_io_address(uint64_t wsid, uint64_t *ioaddr);
+  // 64-bit MMIO accessors; read_mmio64 takes value by pointer (presumably the out-parameter).
+  void write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value);
+
+  void read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value);
+
+private: 
+
+  class Impl; // only forward-declared here; the definition is not in this header
+  Impl* impl_;  
+};
+
+}
--- a/sim/opaesim/verilator.vlt
+++ b/sim/opaesim/verilator.vlt
@@ -0,0 +1,8 @@
+`verilator_config
+
+lint_off -rule BLKANDNBLK -file "*/fpnew/src/*"
+lint_off -rule UNOPTFLAT -file "*/fpnew/src/*"
+lint_off -file "*/fpnew/src/*"
+
+lint_off -file "*/afu/opae/ccip/ccip_if_pkg.sv"
+lint_off -file "*/afu/opae/local_mem_cfg_pkg.sv"
--- a/sim/opaesim/vortex_afu_shim.sv
+++ b/sim/opaesim/vortex_afu_shim.sv
@@ -1,16 +1,24 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 `include "VX_platform.vh"
 `IGNORE_WARNINGS_BEGIN
 `include "vortex_afu.vh"
 `IGNORE_WARNINGS_END

-/* verilator lint_off IMPORTSTAR */ 
-import ccip_if_pkg::*;
-import local_mem_cfg_pkg::*;
-/* verilator lint_on IMPORTSTAR */
-
 `include "VX_define.vh"

-module vortex_afu_shim (
+module vortex_afu_shim import local_mem_cfg_pkg::*; import ccip_if_pkg::*; (
  // global signals
  input clk,
  input reset,
@@ -167,4 +175,4 @@ assign af2cp_sTxPort_c2_hdr_tid = af2cp_sTxPort.c2.hdr.tid;
 assign af2cp_sTxPort_c2_mmioRdValid = af2cp_sTxPort.c2.mmioRdValid;  
 assign af2cp_sTxPort_c2_data = af2cp_sTxPort.c2.data;

-endmodule
+endmodule
--- a/sim/rtlsim/Makefile
+++ b/sim/rtlsim/Makefile
@@ -1,3 +1,4 @@
+XLEN ?= 32
 DESTDIR ?= .
 RTL_DIR = ../../hw/rtl
 DPI_DIR = ../../hw/dpi
@@ -8,6 +9,7 @@ CXXFLAGS += -fPIC -Wno-maybe-uninitialized
 CXXFLAGS += -I../../../hw -I../../common
 CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include
 CXXFLAGS += -I../$(THIRD_PARTY_DIR)
+CXXFLAGS += -DXLEN_$(XLEN)

 LDFLAGS += ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
 LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator
@@ -24,12 +26,20 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_DATA
 DBG_TRACE_FLAGS += -DDBG_TRACE_AFU
 DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
 DBG_TRACE_FLAGS += -DDBG_TRACE_TEX
+DBG_TRACE_FLAGS += -DDBG_TRACE_RASTER
+DBG_TRACE_FLAGS += -DDBG_TRACE_ROP
+DBG_TRACE_FLAGS += -DDBG_TRACE_GBAR

-DBG_FLAGS += $(DBG_TRACE_FLAGS)
+DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)

-FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -I$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(THIRD_PARTY_DIR)/fpnew/src
-TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
-RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/simulate $(FPU_INCLUDE) $(TEX_INCLUDE)
+RTL_PKGS = $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv
+
+FPU_INCLUDE = -I$(RTL_DIR)/fpu
+ifneq (,$(findstring FPU_FPNEW,$(CONFIGS)))
+	RTL_PKGS += $(THIRD_PARTY_DIR)/fpnew/src/fpnew_pkg.sv $(THIRD_PARTY_DIR)/fpnew/src/common_cells/src/cf_math_pkg $(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl/defs_div_sqrt_mvp.sv
+	FPU_INCLUDE += -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -I$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(THIRD_PARTY_DIR)/fpnew/src
+endif
+RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache $(FPU_INCLUDE)

 SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
 SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
@@ -42,14 +52,18 @@ else
 	TOP = Vortex
 endif

-VL_FLAGS = --exe --cc $(TOP) --top-module $(TOP)
-VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic
-VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO -Wno-EOFNEWLINE
+VL_FLAGS = --exe
+VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
+VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
 VL_FLAGS += --x-initial unique --x-assign unique
 VL_FLAGS += verilator.vlt
-VL_FLAGS += $(RTL_INCLUDE)
-
+VL_FLAGS += -DSIMULATION
+VL_FLAGS += -DXLEN_$(XLEN)
 VL_FLAGS += $(CONFIGS)
+VL_FLAGS += $(RTL_INCLUDE)
+VL_FLAGS += $(RTL_PKGS)
+VL_FLAGS += --cc $(TOP) --top-module $(TOP)
+
 CXXFLAGS += $(CONFIGS)

 # Enable Verilator multithreaded simulation
@@ -59,8 +73,8 @@ VL_FLAGS += -j $(THREADS)

 # Debugigng
 ifdef DEBUG
-	VL_FLAGS += --trace --trace-structs -DVCD_OUTPUT $(DBG_FLAGS)
-	CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS)
+	VL_FLAGS += --trace --trace-structs $(DBG_FLAGS)
+	CXXFLAGS += -g -O0 $(DBG_FLAGS)
 else    
 	VL_FLAGS += -DNDEBUG
 	CXXFLAGS += -O2 -DNDEBUG
@@ -72,20 +86,12 @@ ifdef PERF
 	CXXFLAGS += -DPERF_ENABLE
 endif

-# ALU backend
-VL_FLAGS += -DIMUL_DPI
-VL_FLAGS += -DIDIV_DPI
-
-# FPU backend
-FPU_CORE ?= FPU_DPI
-VL_FLAGS += -D$(FPU_CORE)
-
 PROJECT = rtlsim

 all: $(PROJECT)

 $(DESTDIR)/$(PROJECT): $(SRCS) main.cpp
-	verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$@
+	verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS) -DSTARTUP_ADDR=0x80000000' -LDFLAGS '$(LDFLAGS)' -o ../$@
 	
 $(DESTDIR)/lib$(PROJECT).so: $(SRCS)
 	verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS)' -LDFLAGS '-shared $(LDFLAGS)' -o ../$@
--- a/sim/rtlsim/main.cpp
+++ b/sim/rtlsim/main.cpp
@@ -1,11 +1,24 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include <iostream>
 #include <fstream>
 #include <iomanip>
 #include <unistd.h>
-#include <unistd.h>
 #include <util.h>
 #include <mem.h>
 #include <VX_config.h>
+#include <VX_types.h>
 #include "processor.h"

 #define RAM_PAGE_SIZE 4096
@@ -13,11 +26,11 @@
 using namespace vortex;

 static void show_usage() {
-   std::cout << "Usage: [-r] [-h: help] programs.." << std::endl;
+   std::cout << "Usage: [-r: riscv-test] [-h: help] <program>" << std::endl;
 }

 bool riscv_test = false;
-std::vector<const char*> programs;
+const char* program = nullptr;

 static void parse_args(int argc, char **argv) {
  	int c;
@@ -35,56 +48,68 @@ static void parse_args(int argc, char **argv) {
      		show_usage();
      		exit(-1);
    	}
-  	}	
+	}

-	for (int i = optind; i < argc; ++i) {
-		programs.push_back(argv[i]);	
+	if (optind < argc) {
+		program = argv[optind];
+		std::cout << "Running " << program << "..." << std::endl;
+	} else {
+		show_usage();
+      	exit(-1);
 	}
 }

 int main(int argc, char **argv) {
-
 	int exitcode = 0;
-	bool failed = false;
 	
-	parse_args(argc, argv);
+	parse_args(argc, argv);	

+	// create memory module
 	vortex::RAM ram(RAM_PAGE_SIZE);
+
+	// create processor
 	vortex::Processor processor;
+
+	// attach memory module
 	processor.attach_ram(&ram);

-	for (auto program : programs) {
-		std::cout << "Running " << program << "..." << std::endl;		
+	// setup base DCRs
+	const uint64_t startup_addr(STARTUP_ADDR);
+	processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff);
+#if (XLEN == 64)
+    processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32);
+#endif
+	processor.write_dcr(VX_DCR_BASE_MPM_CLASS, 0);	

+	// load program
+	{		
 		std::string program_ext(fileExtension(program));
 		if (program_ext == "bin") {
-			ram.loadBinImage(program, STARTUP_ADDR);
+			ram.loadBinImage(program, startup_addr);
 		} else if (program_ext == "hex") {
 			ram.loadHexImage(program);
 		} else {
 			std::cout << "*** error: only *.bin or *.hex images supported." << std::endl;
 			return -1;
 		}
-
-		exitcode = processor.run();
-		
-		if (riscv_test) {
-			if (1 == exitcode) {
-				std::cout << "Passed" << std::endl;
-			} else {
-				std::cout << "Failed: exitcode=" << exitcode << std::endl;
-				failed = true;
-			}
-		} else {
-			if (exitcode != 0) {
-				std::cout << "*** error: exitcode=" << exitcode << std::endl;
-				failed = true;
-			}
-		}	
-		
-		if (failed)
-			break;
 	}

-	return failed ? exitcode : 0;
+	// run simulation
+	exitcode = processor.run();
+	
+	if (riscv_test) {
+		if (1 == exitcode) {
+			std::cout << "Passed" << std::endl;
+			exitcode = 0;
+		} else {
+			std::cout << "Failed" << std::endl;
+			exitcode = 1;
+		}
+	} else {
+		if (exitcode != 0) {
+			std::cout << "*** error: exitcode=" << exitcode << std::endl;
+		}
+	}
+
+	return exitcode;
 }
--- a/sim/rtlsim/processor.cpp
+++ b/sim/rtlsim/processor.cpp
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include "processor.h"

 #include <verilated.h>
@@ -56,6 +69,14 @@
 #define VERILATOR_RESET_VALUE 2
 #endif

+#if (XLEN == 32)
+typedef uint32_t Word;
+#elif (XLEN == 64)
+typedef uint64_t Word;
+#else
+#error unsupported XLEN
+#endif
+
 #define VL_WDATA_GETW(lwp, i, n, w) \
  VL_SEL_IWII(0, n * w, 0, 0, lwp, i * w, w)

@@ -71,7 +92,7 @@ double sc_time_stamp() {

 static bool trace_enabled = false;
 static uint64_t trace_start_time = TRACE_START_TIME;
-static uint64_t trace_stop_time = TRACE_STOP_TIME;
+static uint64_t trace_stop_time  = TRACE_STOP_TIME;

 bool sim_trace_enabled() {
  if (timestamp >= trace_start_time 
@@ -126,6 +147,9 @@ public:

    // reset the device
    this->reset();
+    
+    // Turn on assertion after reset
+    Verilated::assertOn(true);
  }

  ~Impl() {
@@ -165,27 +189,46 @@ public:
    std::cout << std::dec << timestamp << ": [sim] run()" << std::endl;
  #endif

-    // reset device
-    this->reset();
+    // start execution
+    running_ = true;
+    device_->reset = 0;

-    // execute program
+    // wait on device to go busy
+    while (!device_->busy) {
+      this->tick();
+    }
+
+    // wait on device to go idle
    while (device_->busy) {
      if (get_ebreak()) {
-        exitcode = get_last_wb_value(3);
+        exitcode = (int)get_last_wb_value(3);
        break;  
      }
      this->tick();
    }
+    
+    // reset device
+    this->reset();

-    // wait 5 cycles to flush the pipeline
-    this->wait(5);  
+    this->cout_flush();

    return exitcode;
  }

+  void write_dcr(uint32_t addr, uint32_t value) { // drive one write on the device's dcr_wr_* ports
+    device_->dcr_wr_valid = 1;
+    device_->dcr_wr_addr  = addr;
+    device_->dcr_wr_data  = value;
+    while (device_->dcr_wr_valid) { // keep clocking until valid is deasserted (eval_dcr_bus(true) clears it)
+      this->tick();
+    }
+  }
+
 private:

-  void reset() { 
+  void reset() {
+    running_ = false;
+
    print_bufs_.clear();

    pending_mem_reqs_.clear();
@@ -199,6 +242,8 @@ private:
    this->reset_avs_bus();
  #endif

+    this->reset_dcr_bus();
+
    device_->reset = 1;

    for (int i = 0; i < RESET_DELAY; ++i) {
@@ -206,14 +251,7 @@ private:
      this->eval();
      device_->clk = 1;
      this->eval();
-    }  
-
-    device_->reset = 0;
-    
-    // Turn on assertion after reset
-    Verilated::assertOn(true);
-
-    this->cout_flush();
+    }
  }

  void tick() {
@@ -226,6 +264,7 @@ private:
  #else
    this->eval_avs_bus(0);
  #endif
+    this->eval_dcr_bus(0);

    device_->clk = 1;
    this->eval();
@@ -235,6 +274,7 @@ private:
  #else
    this->eval_avs_bus(1);
  #endif
+    this->eval_dcr_bus(1);

    if (MEM_CYCLE_RATIO > 0) { 
      auto cycle = timestamp / 2;
@@ -260,6 +300,8 @@ private:
  #ifdef VCD_OUTPUT
    if (sim_trace_enabled()) {
      trace_->dump(timestamp);
+    } else {
+      exit(-1);
    }
  #endif
    ++timestamp;
@@ -268,30 +310,30 @@ private:
 #ifdef AXI_BUS

  void reset_axi_bus() {    
-    device_->m_axi_wready  = 0;
-    device_->m_axi_awready = 0;
-    device_->m_axi_arready = 0;  
-    device_->m_axi_rvalid  = 0;
-    device_->m_axi_bvalid  = 0;
+    device_->m_axi_wready[0]  = 0;
+    device_->m_axi_awready[0] = 0;
+    device_->m_axi_arready[0] = 0;  
+    device_->m_axi_rvalid[0]  = 0;
+    device_->m_axi_bvalid[0]  = 0;
  }
    
  void eval_axi_bus(bool clk) {
    if (!clk) {
-      mem_rd_rsp_ready_ = device_->m_axi_rready;
-      mem_wr_rsp_ready_ = device_->m_axi_bready;
+      mem_rd_rsp_ready_ = device_->m_axi_rready[0];
+      mem_wr_rsp_ready_ = device_->m_axi_bready[0];
      return;
    }

    if (ram_ == nullptr) {
-      device_->m_axi_wready  = 0;
-      device_->m_axi_awready = 0;
-      device_->m_axi_arready = 0;  
+      device_->m_axi_wready[0]  = 0;
+      device_->m_axi_awready[0] = 0;
+      device_->m_axi_arready[0] = 0;  
      return;
    }

    // process memory responses
    if (mem_rd_rsp_active_
-    && device_->m_axi_rvalid && mem_rd_rsp_ready_) {
+    && device_->m_axi_rvalid[0] && mem_rd_rsp_ready_) {
      mem_rd_rsp_active_ = false;
    }    
    if (!mem_rd_rsp_active_) {      
@@ -299,30 +341,30 @@ private:
       && (*pending_mem_reqs_.begin())->ready 
       && !(*pending_mem_reqs_.begin())->write) {      
        auto mem_rsp_it = pending_mem_reqs_.begin();
-        auto mem_req = *mem_rsp_it;
+        auto mem_rsp = *mem_rsp_it;
        /*
-          printf("%0ld: [sim] MEM Rd Rsp: bank=%d, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_req->addr);
+          printf("%0ld: [sim] MEM Rd Rsp: bank=%d, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_rsp->addr);
          for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
-            printf("%02x", mem_req->block[(MEM_BLOCK_SIZE-1)-i]);
+            printf("%02x", mem_rsp->block[(MEM_BLOCK_SIZE-1)-i]);
          }
          printf("\n");
        */      
-        device_->m_axi_rvalid = 1;
-        device_->m_axi_rid    = mem_req->tag;   
-        device_->m_axi_rresp  = 0;
-        device_->m_axi_rlast  = 1;
-        memcpy((uint8_t*)device_->m_axi_rdata, mem_req->block.data(), MEM_BLOCK_SIZE);
+        device_->m_axi_rvalid[0] = 1;
+        device_->m_axi_rid[0]    = mem_rsp->tag;   
+        device_->m_axi_rresp[0]  = 0;
+        device_->m_axi_rlast[0]  = 1;
+        memcpy(device_->m_axi_rdata[0].data(), mem_rsp->block.data(), MEM_BLOCK_SIZE);
        pending_mem_reqs_.erase(mem_rsp_it);
        mem_rd_rsp_active_ = true;
-        delete mem_req;
+        delete mem_rsp;
      } else {
-        device_->m_axi_rvalid = 0;
+        device_->m_axi_rvalid[0] = 0;
      }
    }

    // send memory write response  
    if (mem_wr_rsp_active_
-    && device_->m_axi_bvalid && mem_wr_rsp_ready_) {
+    && device_->m_axi_bvalid[0] && mem_wr_rsp_ready_) {
      mem_wr_rsp_active_ = false;
    }
    if (!mem_wr_rsp_active_) {
@@ -330,34 +372,34 @@ private:
       && (*pending_mem_reqs_.begin())->ready 
       && (*pending_mem_reqs_.begin())->write) {
        auto mem_rsp_it = pending_mem_reqs_.begin();
-        auto mem_req = *mem_rsp_it;
+        auto mem_rsp = *mem_rsp_it;
        /*
-          printf("%0ld: [sim] MEM Wr Rsp: bank=%d, addr=%0lx\n", timestamp, last_mem_rsp_bank_, mem_req->addr);        
+          printf("%0ld: [sim] MEM Wr Rsp: bank=%d, addr=%0lx\n", timestamp, last_mem_rsp_bank_, mem_rsp->addr);        
        */
-        device_->m_axi_bvalid = 1;      
-        device_->m_axi_bid    = mem_req->tag;
-        device_->m_axi_bresp  = 0;
+        device_->m_axi_bvalid[0] = 1;      
+        device_->m_axi_bid[0]    = mem_rsp->tag;
+        device_->m_axi_bresp[0]  = 0;
        pending_mem_reqs_.erase(mem_rsp_it);        
        mem_wr_rsp_active_ = true;
-        delete mem_req;
+        delete mem_rsp;
      } else {
-        device_->m_axi_bvalid = 0;
+        device_->m_axi_bvalid[0] = 0;
      }      
    }

    // select the memory bank
-    uint32_t req_addr = device_->m_axi_wvalid ? device_->m_axi_awaddr : device_->m_axi_araddr;
+    uint32_t req_addr = device_->m_axi_wvalid[0] ? device_->m_axi_awaddr[0] : device_->m_axi_araddr[0];
    
    // process memory requests
-    if (device_->m_axi_wvalid || device_->m_axi_arvalid) {
-      if (device_->m_axi_wvalid) {        
-        uint64_t byteen = device_->m_axi_wstrb;
-        unsigned base_addr = device_->m_axi_awaddr;
-        uint8_t* data = (uint8_t*)(device_->m_axi_wdata);
+    if ((device_->m_axi_wvalid[0] || device_->m_axi_arvalid[0]) && running_) {
+      if (device_->m_axi_wvalid[0]) {        
+        uint64_t byteen = device_->m_axi_wstrb[0];
+        uint64_t base_addr = device_->m_axi_awaddr[0];
+        uint8_t* data = (uint8_t*)device_->m_axi_wdata[0].data();

        // check console output
-        if (base_addr >= IO_COUT_ADDR 
-         && base_addr < (IO_COUT_ADDR + IO_COUT_SIZE)) {          
+        if (base_addr >= uint64_t(IO_COUT_ADDR)
+         && base_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {          
          for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
            if ((byteen >> i) & 0x1) {            
              auto& ss_buf = print_bufs_[i];
@@ -384,15 +426,15 @@ private:
          }  

          auto mem_req = new mem_req_t();
-          mem_req->tag   = device_->m_axi_awid;
-          mem_req->addr  = device_->m_axi_awaddr;        
+          mem_req->tag   = device_->m_axi_awid[0];
+          mem_req->addr  = device_->m_axi_awaddr[0];        
          mem_req->write = true;
          mem_req->ready = true;
          pending_mem_reqs_.emplace_back(mem_req);

          // send dram request
          ramulator::Request dram_req( 
-            device_->m_axi_awaddr,
+            device_->m_axi_awaddr[0],
            ramulator::Request::Type::WRITE,
            0
          );
@@ -401,18 +443,18 @@ private:
      } else {
        // process reads
        auto mem_req = new mem_req_t();
-        mem_req->tag  = device_->m_axi_arid;   
-        mem_req->addr = device_->m_axi_araddr;
-        ram_->read(mem_req->block.data(), device_->m_axi_araddr, MEM_BLOCK_SIZE);
+        mem_req->tag  = device_->m_axi_arid[0];
+        mem_req->addr = device_->m_axi_araddr[0];
+        ram_->read(mem_req->block.data(), device_->m_axi_araddr[0], MEM_BLOCK_SIZE);
        mem_req->write = false;
        mem_req->ready = false;
        pending_mem_reqs_.emplace_back(mem_req);

        // send dram request
        ramulator::Request dram_req( 
-          device_->m_axi_araddr,
+          device_->m_axi_araddr[0],
          ramulator::Request::Type::READ,
-          std::bind([](ramulator::Request& dram_req, mem_req_t* mem_req) {
+          std::bind([&](ramulator::Request& dram_req, mem_req_t* mem_req) {
              mem_req->ready = true;
            }, placeholders::_1, mem_req),
          0
@@ -421,9 +463,9 @@ private:
      } 
    } 

-    device_->m_axi_wready  = 1;
-    device_->m_axi_awready = 1;
-    device_->m_axi_arready = 1;     
+    device_->m_axi_wready[0]  = running_;
+    device_->m_axi_awready[0] = running_;
+    device_->m_axi_arready[0] = running_;     
  }

 #else
@@ -454,35 +496,35 @@ private:
       && (*pending_mem_reqs_.begin())->ready) {
        device_->mem_rsp_valid = 1;      
        auto mem_rsp_it = pending_mem_reqs_.begin();
-        auto mem_req = *mem_rsp_it;
+        auto mem_rsp = *mem_rsp_it;
        /*
-          printf("%0ld: [sim] MEM Rd: bank=%d, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_req->addr);
+          printf("%0ld: [sim] MEM Rd: bank=%d, tag=%0lx, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_rsp->tag, mem_rsp->addr);
          for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
-            printf("%02x", mem_req->block[(MEM_BLOCK_SIZE-1)-i]);
+            printf("%02x", mem_rsp->block[(MEM_BLOCK_SIZE-1)-i]);
          }
          printf("\n");
        */
-        memcpy(device_->mem_rsp_data.data(), mem_req->block.data(), MEM_BLOCK_SIZE);
-        device_->mem_rsp_tag = mem_req->tag;   
+        memcpy(device_->mem_rsp_data.data(), mem_rsp->block.data(), MEM_BLOCK_SIZE);
+        device_->mem_rsp_tag = mem_rsp->tag;   
        pending_mem_reqs_.erase(mem_rsp_it);
        mem_rd_rsp_active_ = true;
-        delete mem_req;
+        delete mem_rsp;
      } else {
        device_->mem_rsp_valid = 0;
      }
    }

    // process memory requests    
-    if (device_->mem_req_valid) {
-      uint32_t byte_addr = (device_->mem_req_addr * MEM_BLOCK_SIZE);
+    if (device_->mem_req_valid && running_) {
+      uint64_t byte_addr = (device_->mem_req_addr * MEM_BLOCK_SIZE);
      if (device_->mem_req_rw) {        
        // process writes
        uint64_t byteen = device_->mem_req_byteen;        
-        uint8_t* data = (uint8_t*)device_->mem_req_data.data();
+        uint8_t* data = (uint8_t*)(device_->mem_req_data.data());

        // check console output
-        if (byte_addr >= IO_COUT_ADDR 
-         && byte_addr < (IO_COUT_ADDR + IO_COUT_SIZE)) {          
+        if (byte_addr >= uint64_t(IO_COUT_ADDR)
+         && byte_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {    
          for (int i = 0; i < IO_COUT_SIZE; i++) {
            if ((byteen >> i) & 0x1) {            
              auto& ss_buf = print_bufs_[i];
@@ -496,7 +538,7 @@ private:
          }   
        } else {
          /*
-            printf("%0ld: [sim] MEM Wr: addr=%0x, byteen=%0lx, data=", timestamp, byte_addr, byteen);
+            printf("%0ld: [sim] MEM Wr: tag=%0lx, addr=%0x, byteen=%0lx, data=", timestamp, device_->mem_req_tag, byte_addr, byteen);
            for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
              printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]);
            }
@@ -515,7 +557,7 @@ private:
            0
          );
          dram_queue_.push(dram_req);
-        }
+        }         
      } else {
        // process reads
        auto mem_req = new mem_req_t();
@@ -526,11 +568,13 @@ private:
        ram_->read(mem_req->block.data(), byte_addr, MEM_BLOCK_SIZE);
        pending_mem_reqs_.emplace_back(mem_req);

+        //printf("%0ld: [sim] MEM Rd Req: addr=%0x, tag=%0lx\n", timestamp, byte_addr, device_->mem_req_tag);
+
        // send dram request
        ramulator::Request dram_req( 
          byte_addr,
          ramulator::Request::Type::READ,
-          std::bind([](ramulator::Request& dram_req, mem_req_t* mem_req) {
+          std::bind([&](ramulator::Request& dram_req, mem_req_t* mem_req) {
              mem_req->ready = true;
            }, placeholders::_1, mem_req),
          0
@@ -539,11 +583,24 @@ private:
      }
    }   

-    device_->mem_req_ready = 1;
+    device_->mem_req_ready = running_;
  }

 #endif

+  void  reset_dcr_bus() { // put the DCR write port in its idle state
+    device_->dcr_wr_valid = 0; // only valid is cleared; addr/data are left untouched
+  }
+
+  void  eval_dcr_bus(bool clk) { // per-phase DCR bus update; clk is false/true for the two clock phases
+    if (!clk) {
+      return; // nothing to do on the clock-low phase
+    }
+    if (device_->dcr_wr_valid) {
+      device_->dcr_wr_valid = 0; // drop valid so a pending write_dcr() loop can terminate
+    }
+  }
+
  void wait(uint32_t cycles) {
    for (int i = 0; i < cycles; ++i) {
      this->tick();
@@ -552,17 +609,17 @@ private:

  bool get_ebreak() const {
  #ifdef AXI_BUS
-    return (bool)device_->Vortex_axi->vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->execute->ebreak;
+    return (bool)device_->Vortex_axi->vortex->sim_ebreak;
  #else
-    return (bool)device_->Vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->execute->ebreak;
+    return (bool)device_->Vortex->sim_ebreak;
  #endif
  }

-  int get_last_wb_value(int reg) const {
+  uint64_t get_last_wb_value(int reg) const {
  #ifdef AXI_BUS
-    return (int)device_->Vortex_axi->vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->commit->writeback->last_wb_value[reg];
+    return ((Word*)device_->Vortex_axi->vortex->sim_wb_value.data())[reg];
  #else
-    return (int)device_->Vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->commit->writeback->last_wb_value[reg];
+    return ((Word*)device_->Vortex->sim_wb_value.data())[reg];
  #endif
  }

@@ -600,6 +657,8 @@ private:
  ramulator::Gem5Wrapper* dram_;

  std::queue<ramulator::Request> dram_queue_;
+
+  bool running_;
 };

 ///////////////////////////////////////////////////////////////////////////////
@@ -618,4 +677,8 @@ void Processor::attach_ram(RAM* mem) {

 int Processor::run() {
  return impl_->run();
+}
+
+void Processor::write_dcr(uint32_t addr, uint32_t value) { // public entry point; forwards unchanged to Impl
+  return impl_->write_dcr(addr, value); // Impl::write_dcr returns void, so this is a plain forwarding call
 }
--- a/sim/rtlsim/processor.h
+++ b/sim/rtlsim/processor.h
@@ -1,5 +1,20 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

+#include <stdint.h>
+
 namespace vortex {

 class RAM;
@@ -14,6 +29,8 @@ public:

  int run();

+  void write_dcr(uint32_t addr, uint32_t value);
+
 private:

  class Impl;
--- a/sim/rtlsim/verilator.vlt
+++ b/sim/rtlsim/verilator.vlt
@@ -1,10 +1,5 @@
 `verilator_config

-lint_off -rule BLKANDNBLK -file "../../hw/rtl/fp_cores/fpnew/*"
-lint_off -rule UNOPTFLAT -file "../../hw/rtl/fp_cores/fpnew/*"
-lint_off -rule WIDTH -file "../../hw/rtl/fp_cores/fpnew/*"
-lint_off -rule UNUSED -file "../../hw/rtl/fp_cores/fpnew/*"
-lint_off -rule LITENDIAN -file "../../hw/rtl/fp_cores/fpnew/*"
-lint_off -rule IMPORTSTAR -file "../../hw/rtl/fp_cores/fpnew/*"
-lint_off -rule PINCONNECTEMPTY -file "../../hw/rtl/fp_cores/fpnew/*"
-lint_off -file "../../hw/rtl/fp_cores/fpnew/*"
+lint_off -rule BLKANDNBLK -file "*/fpnew/src/*"
+lint_off -rule UNOPTFLAT -file "*/fpnew/src/*"
+lint_off -file "*/fpnew/src/*"
--- a/sim/simx/Makefile
+++ b/sim/simx/Makefile
@@ -1,45 +1,36 @@
+XLEN ?= 32
 DESTDIR ?= .
 RTL_DIR = ../hw/rtl
 THIRD_PARTY_DIR = ../../third_party

-CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
+CXXFLAGS += -std=c++17 -Wall -Wextra -Wfatal-errors
 CXXFLAGS += -fPIC -Wno-maybe-uninitialized
 CXXFLAGS += -I. -I../common -I../../hw
 CXXFLAGS += -I$(THIRD_PARTY_DIR)/softfloat/source/include
 CXXFLAGS += -I$(THIRD_PARTY_DIR)
+CXXFLAGS += -DXLEN_$(XLEN)
 CXXFLAGS += $(CONFIGS)

-LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a 
-LDFLAGS += -L$(THIRD_PARTY_DIR)/cocogfx -lcocogfx 
+LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
 LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator

 SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
-SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp tex_unit.cpp processor.cpp
-
-OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS)))
-VPATH := $(sort $(dir $(SRCS)))
-
-#$(info OBJS is $(OBJS))
-#$(info VPATH is $(VPATH))
+SRCS += processor.cpp cluster.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp

 # Debugigng
 ifdef DEBUG
 	CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG)
+	#CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG) -fsanitize=address -fno-omit-frame-pointer
 else    
 	CXXFLAGS += -O2 -DNDEBUG
 endif

-# XLEN parameterization
-ifdef XLEN
-	CXXFLAGS += -DXLEN=$(XLEN)
-endif
-
 PROJECT = simx

 all: $(DESTDIR)/$(PROJECT)
 	
 $(DESTDIR)/$(PROJECT): $(SRCS) main.cpp
-	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
+	$(CXX) $(CXXFLAGS) -DSTARTUP_ADDR=0x80000000 $^ $(LDFLAGS) -o $@

 $(DESTDIR)/lib$(PROJECT).so: $(SRCS)
 	$(CXX) $(CXXFLAGS) $^ -shared $(LDFLAGS) -o $@
@@ -48,4 +39,4 @@ $(DESTDIR)/lib$(PROJECT).so: $(SRCS)
 	$(CXX) $(CXXFLAGS) -MM $^ > .depend;

 clean:
-	rm -rf obj_dir $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so
+	rm -rf $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so
--- a/sim/simx/arch.h
+++ b/sim/simx/arch.h
@@ -0,0 +1,87 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <sstream>
+
+#include <cstdlib>
+#include <stdio.h>
+#include "types.h"
+
+namespace vortex {
+
+class Arch {  // bundle of architecture parameters; exposes const getters only
+private:
+  uint16_t num_threads_;  // caller-supplied
+  uint16_t num_warps_;    // caller-supplied
+  uint16_t num_cores_;    // caller-supplied
+  uint16_t num_clusters_; // caller-supplied
+  uint16_t vsize_;        // set to 16 by the constructor
+  uint16_t num_regs_;     // set to 32 by the constructor
+  uint16_t num_csrs_;     // set to 4096 by the constructor
+  uint16_t num_barriers_; // set from NUM_BARRIERS (defined elsewhere)
+  uint16_t ipdom_size_;   // derived from num_threads in the constructor
+  
+public:
+  Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores, uint16_t num_clusters)   
+    : num_threads_(num_threads)
+    , num_warps_(num_warps)
+    , num_cores_(num_cores)
+    , num_clusters_(num_clusters)
+    , vsize_(16)
+    , num_regs_(32)
+    , num_csrs_(4096)
+    , num_barriers_(NUM_BARRIERS)
+    , ipdom_size_((num_threads-1) * 2) // computed in int, then narrowed: num_threads == 0 yields 65534
+  {}
+  // Read-only accessors, one per field.
+  uint16_t vsize() const { 
+    return vsize_; 
+  }
+
+  uint16_t num_regs() const {
+    return num_regs_;
+  }
+
+  uint16_t num_csrs() const {
+    return num_csrs_;
+  }
+
+  uint16_t num_barriers() const {
+    return num_barriers_;
+  }
+
+  uint16_t ipdom_size() const {
+    return ipdom_size_;
+  }
+
+  uint16_t num_threads() const {
+    return num_threads_;
+  }
+
+  uint16_t num_warps() const {
+    return num_warps_;
+  }
+
+  uint16_t num_cores() const {
+    return num_cores_;
+  }
+  
+  uint16_t num_clusters() const {
+    return num_clusters_;
+  }
+};
+
+}
--- a/sim/simx/archdef.h
+++ b/sim/simx/archdef.h
@@ -1,70 +0,0 @@
-#pragma once
-
-#include <string>
-#include <sstream>
-
-#include <cstdlib>
-#include <stdio.h>
-#include "types.h"
-
-namespace vortex {
-
-class ArchDef {  
-private:
-  uint16_t num_cores_;
-  uint16_t num_warps_;
-  uint16_t num_threads_;
-  uint16_t wsize_;
-  uint16_t vsize_;
-  uint16_t num_regs_;
-  uint16_t num_csrs_;
-  uint16_t num_barriers_;
-  
-public:
-  ArchDef(uint16_t num_cores, 
-          uint16_t num_warps, 
-          uint16_t num_threads)   
-    : num_cores_(num_cores)
-    , num_warps_(num_warps)
-    , num_threads_(num_threads)
-    , wsize_(4)
-    , vsize_(16)
-    , num_regs_(32)
-    , num_csrs_(4096)
-    , num_barriers_(NUM_BARRIERS)
-  {}
-
-  uint16_t wsize() const { 
-    return wsize_; 
-  }
-
-  uint16_t vsize() const { 
-    return vsize_; 
-  }
-
-  uint16_t num_regs() const {
-    return num_regs_;
-  }
-
-  uint16_t num_csrs() const {
-    return num_csrs_;
-  }
-
-  uint16_t num_barriers() const {
-    return num_barriers_;
-  }
-
-  uint16_t num_threads() const {
-    return num_threads_;
-  }
-
-  uint16_t num_warps() const {
-    return num_warps_;
-  }
-
-  uint16_t num_cores() const {
-    return num_cores_;
-  }
-};
-
-}
--- a/sim/simx/args.cpp
+++ b/sim/simx/args.cpp
@@ -1,47 +0,0 @@
-#include <iostream>
-#include <string>
-#include "args.h"
-
-using namespace vortex;
-using std::string;
-
-std::string CommandLineArg::helpString_;
-std::unordered_map<string, CommandLineArg *> CommandLineArg::longArgs_;
-std::unordered_map<string, CommandLineArg *> CommandLineArg::shortArgs_;
-
-CommandLineArg::CommandLineArg(string s, string l, const char *helpText) {
-  helpString_ += helpText;
-  longArgs_[l] = this;
-  shortArgs_[s] = this;
-}
-
-CommandLineArg::CommandLineArg(string l, const char *helpText) {
-  helpString_ += helpText;
-  longArgs_[l] = this;
-}
-
-void CommandLineArg::readArgs(int argc, char **argv) {
-  for (int i = 0; i < argc; i++) {
-    std::unordered_map<string, CommandLineArg *>::iterator 
-      s = shortArgs_.find(std::string(argv[i])), 
-      l = longArgs_.find(std::string(argv[i]));
-
-    if (s != shortArgs_.end()) {
-      i += s->second->read(argc - i, &argv[i]);
-    } else if (l != longArgs_.end()) {
-      i += l->second->read(argc - i, &argv[i]);
-    } else {
-      throw BadArg(string(argv[i]));
-    }
-  }
-}
-
-void CommandLineArg::clearArgs() {
-  shortArgs_.clear();
-  longArgs_.clear();
-  helpString_ = "";
-}
-
-void CommandLineArg::showHelp(std::ostream &os) {
-  os << helpString_;
-}
--- a/sim/simx/args.h
+++ b/sim/simx/args.h
@@ -1,64 +0,0 @@
-#pragma once
-
-#include <iostream>
-#include <string>
-#include <sstream>
-#include <unordered_map>
-#include <util.h>
-
-namespace vortex {
-
-struct BadArg { BadArg(std::string s) : arg(s) {} std::string arg; };
-
-class CommandLineArg {
-public:
-  CommandLineArg(std::string s, std::string l, const char *helpText);
-  CommandLineArg(std::string l, const char *helpText);
-  virtual int read(int argc, char** argv) = 0;
-
-  static void readArgs(int argc, char **argv);
-  static void clearArgs();
-  static void showHelp(std::ostream &os);
-
-private:
-  static std::string helpString_;
-  static std::unordered_map<std::string, CommandLineArg *> longArgs_;
-  static std::unordered_map<std::string, CommandLineArg *> shortArgs_;
-};
-
-template <typename T> class CommandLineArgSetter : public CommandLineArg {
-public:
-  CommandLineArgSetter(std::string s, std::string l, const char *ht, T &x) :
-    CommandLineArg(s, l, ht), arg_(x) {}
-
-  CommandLineArgSetter(std::string l, const char *ht, T &x) :
-    CommandLineArg(l, ht), arg_(x) {}
-
-  int read(int argc, char **argv) {
-    __unused (argc);
-    std::istringstream iss(argv[1]);
-    iss >> arg_;
-    return 1;
-  }
-private:
-  T &arg_;
-};
-
-class CommandLineArgFlag : public CommandLineArg {
-public:
-  CommandLineArgFlag(std::string s, std::string l, const char *ht, bool &x) :
-    CommandLineArg(s, l, ht), arg_(x) { arg_ = false; }
-
-  CommandLineArgFlag(std::string l, const char *ht, bool &x) :
-    CommandLineArg(l, ht), arg_(x) { arg_ = false; }
-
-  int read(int argc, char **argv) { 
-    __unused (argc, argv);
-    arg_ = true; 
-    return 0; 
-  }
-private:
-  bool &arg_;
-};
-  
-}
--- a/sim/simx/cache.cpp
+++ b/sim/simx/cache.cpp
@@ -1,637 +0,0 @@
-#include "cache.h"
-#include "debug.h"
-#include "types.h"
-#include <util.h>
-#include <unordered_map>
-#include <vector>
-#include <list>
-#include <queue>
-
-using namespace vortex;
-
-struct params_t {
-    uint32_t sets_per_bank;
-    uint32_t blocks_per_set;    
-    uint32_t words_per_block;
-    uint32_t log2_num_inputs;
-
-    uint32_t word_select_addr_start;
-    uint32_t word_select_addr_end;
-
-    uint32_t bank_select_addr_start;
-    uint32_t bank_select_addr_end;
-
-    uint32_t set_select_addr_start;
-    uint32_t set_select_addr_end;
-
-    uint32_t tag_select_addr_start;
-    uint32_t tag_select_addr_end;
-
-    params_t(const Cache::Config& config) {
-        uint32_t bank_bits   = log2ceil(config.num_banks);
-        uint32_t offset_bits = config.B - config.W;
-        uint32_t log2_bank_size  = config.C - bank_bits;
-        uint32_t index_bits  = log2_bank_size - (config.B << config.A);        
-        assert(log2_bank_size >= config.B);   
-
-        this->log2_num_inputs = log2ceil(config.num_inputs);
-
-        this->words_per_block = 1 << offset_bits;
-        this->blocks_per_set  = 1 << config.A;
-        this->sets_per_bank   = 1 << index_bits;
-
-        assert(config.ports_per_bank <= this->words_per_block);
-                
-        // Word select
-        this->word_select_addr_start = config.W;
-        this->word_select_addr_end = (this->word_select_addr_start+offset_bits-1);
-
-        // Bank select
-        this->bank_select_addr_start = (1+this->word_select_addr_end);
-        this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1);
-
-        // Set select
-        this->set_select_addr_start = (1+this->bank_select_addr_end);
-        this->set_select_addr_end = (this->set_select_addr_start+index_bits-1);
-
-        // Tag select
-        this->tag_select_addr_start = (1+this->set_select_addr_end);
-        this->tag_select_addr_end = (config.addr_width-1);
-    }
-
-    uint32_t addr_bank_id(uint64_t word_addr) const {
-        if (bank_select_addr_end >= bank_select_addr_start)
-            return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end);
-        else    
-            return 0;
-    }
-
-    uint32_t addr_set_id(uint64_t word_addr) const {
-        if (set_select_addr_end >= set_select_addr_start)
-            return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end);
-        else
-            return 0;
-    }
-
-    uint64_t addr_tag(uint64_t word_addr) const {
-        if (tag_select_addr_end >= tag_select_addr_start)
-            return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end);
-        else    
-            return 0;
-    }
-    
-    uint64_t mem_addr(uint32_t bank_id, uint32_t set_id, uint64_t tag) const {
-        uint64_t addr(0);
-        if (bank_select_addr_end >= bank_select_addr_start)            
-            addr = bit_setw(addr, bank_select_addr_start, bank_select_addr_end, bank_id);
-        if (set_select_addr_end >= set_select_addr_start)
-            addr = bit_setw(addr, set_select_addr_start, set_select_addr_end, set_id);
-        if (tag_select_addr_end >= tag_select_addr_start)
-            addr = bit_setw(addr, tag_select_addr_start, tag_select_addr_end, tag);
-        return addr;
-    }
-};
-
-struct block_t {
-    bool     valid;
-    bool     dirty;        
-    uint64_t tag;
-    uint32_t lru_ctr;
-};
-
-struct set_t {
-    std::vector<block_t> blocks;    
-    set_t(uint32_t size) : blocks(size) {}
-
-    void clear() {
-        for (auto& block : blocks) {
-            block.valid = false;
-        }
-    }
-};
-
-struct bank_req_info_t {
-    bool     valid;    
-    uint32_t req_id;
-    uint64_t req_tag;
-};
-
-struct bank_req_t {
-    bool valid;
-    bool write;
-    bool mshr_replay;
-    uint64_t tag;
-    uint32_t set_id;
-    uint32_t core_id;
-    uint64_t uuid;
-    std::vector<bank_req_info_t> infos;
-
-    bank_req_t(uint32_t size) 
-        : valid(false)
-        , write(false)
-        , mshr_replay(false)
-        , tag(0)
-        , set_id(0)
-        , core_id(0)
-        , uuid(0)
-        , infos(size)
-    {}
-};
-
-struct mshr_entry_t : public bank_req_t {
-    uint32_t block_id;
-
-    mshr_entry_t(uint32_t size = 0) 
-        : bank_req_t(size) 
-        , block_id(0)
-    {}
-};
-
-class MSHR {
-private:
-    std::vector<mshr_entry_t> entries_;
-    uint32_t size_;
-
-public:    
-    MSHR(uint32_t size)
-        : entries_(size)
-        , size_(0) 
-    {}
-
-    bool empty() const {
-        return (0 == size_);
-    }
-    
-    bool full() const {
-        return (size_ == entries_.size());
-    }
-
-    int lookup(const bank_req_t& bank_req) {
-         for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
-            auto& entry = entries_.at(i);
-            if (entry.valid 
-             && entry.set_id == bank_req.set_id 
-             && entry.tag == bank_req.tag) {
-                return i;
-            }
-        }
-        return -1;
-    }
-
-    int allocate(const bank_req_t& bank_req, uint32_t block_id) {
-        for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
-            auto& entry = entries_.at(i);
-            if (!entry.valid) {
-                *(bank_req_t*)&entry = bank_req;
-                entry.valid = true;
-                entry.mshr_replay = false;
-                entry.block_id = block_id;  
-                ++size_;              
-                return i;
-            }
-        }
-        return -1;
-    }
-
-    mshr_entry_t& replay(uint32_t id) {
-        auto& root_entry = entries_.at(id);
-        assert(root_entry.valid);
-        // make all related mshr entries for replay
-        for (auto& entry : entries_) {
-            if (entry.valid 
-             && entry.set_id == root_entry.set_id 
-             && entry.tag == root_entry.tag) {
-                entry.mshr_replay = true;
-            }
-        }
-        return root_entry;
-    }
-
-    bool pop(bank_req_t* out) {
-        for (auto& entry : entries_) {
-            if (entry.valid && entry.mshr_replay) {
-                *out = entry;
-                entry.valid = false;
-                --size_;
-                return true;
-            }
-        }
-        return false;
-    }
-
-    void clear() {
-        for (auto& entry : entries_) {
-            if (entry.valid && entry.mshr_replay) {
-                entry.valid = false;
-            }
-        }
-        size_ = 0;
-    }
-};
-
-struct bank_t {
-    std::vector<set_t>  sets;    
-    MSHR                mshr;
-
-    bank_t(const Cache::Config& config, 
-           const params_t& params) 
-        : sets(params.sets_per_bank, params.blocks_per_set)
-        , mshr(config.mshr_size)
-    {}
-
-    void clear() {
-        mshr.clear();
-        for (auto& set : sets) {
-            set.clear();
-        }
-    }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-class Cache::Impl {
-private:
-    Cache* const simobject_;
-    Config config_;
-    params_t params_;
-    std::vector<bank_t> banks_;
-    Switch<MemReq, MemRsp>::Ptr mem_switch_;    
-    Switch<MemReq, MemRsp>::Ptr bypass_switch_;
-    std::vector<SimPort<MemReq>> mem_req_ports_;
-    std::vector<SimPort<MemRsp>>  mem_rsp_ports_;
-    uint32_t flush_cycles_;
-    PerfStats perf_stats_;
-    uint64_t pending_read_reqs_;
-    uint64_t pending_write_reqs_;
-    uint64_t pending_fill_reqs_;    
-
-public:
-    Impl(Cache* simobject, const Config& config) 
-        : simobject_(simobject)
-        , config_(config)
-        , params_(config)
-        , banks_(config.num_banks, {config, params_})
-        , mem_req_ports_(config.num_banks, simobject)
-        , mem_rsp_ports_(config.num_banks, simobject)
-    {
-        bypass_switch_ = Switch<MemReq, MemRsp>::Create("bypass_arb", ArbiterType::Priority, 2);
-        bypass_switch_->ReqOut.bind(&simobject->MemReqPort);
-        simobject->MemRspPort.bind(&bypass_switch_->RspIn);
-
-        if (config.num_banks > 1) {
-            mem_switch_ = Switch<MemReq, MemRsp>::Create("mem_arb", ArbiterType::RoundRobin, config.num_banks);
-            for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
-                mem_req_ports_.at(i).bind(&mem_switch_->ReqIn.at(i));
-                mem_switch_->RspOut.at(i).bind(&mem_rsp_ports_.at(i));
-            }    
-            mem_switch_->ReqOut.bind(&bypass_switch_->ReqIn.at(0));
-            bypass_switch_->RspOut.at(0).bind(&mem_switch_->RspIn);
-        } else {
-            mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0));
-            bypass_switch_->RspOut.at(0).bind(&mem_rsp_ports_.at(0));
-        }
-
-        // calculate tag flush cycles
-        flush_cycles_ = params_.sets_per_bank * params_.blocks_per_set;
-    }
-
-    void reset() {
-        for (auto& bank : banks_) {
-            bank.clear();
-        }
-        perf_stats_ = PerfStats();
-        pending_read_reqs_ = 0;
-        pending_write_reqs_ = 0;
-        pending_fill_reqs_ = 0;
-    }
-
-    void tick() {
-        // wait on flush cycles
-        if (flush_cycles_ != 0) {
-            --flush_cycles_;
-            return;
-        }
-
-        // per-bank pipeline request
-        std::vector<bank_req_t> pipeline_reqs(config_.num_banks, config_.ports_per_bank);
-
-        // calculate memory latency
-        perf_stats_.mem_latency += pending_fill_reqs_;
-
-        // handle bypasss responses
-        auto& bypass_port = bypass_switch_->RspOut.at(1);            
-        if (!bypass_port.empty()) {
-            auto& mem_rsp = bypass_port.front();
-            uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1);                
-            uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs;
-            MemRsp core_rsp{tag, mem_rsp.core_id, mem_rsp.uuid};
-            simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency);
-            DT(3, simobject_->name() << "-" << core_rsp);
-            bypass_port.pop();
-        }        
-
-        // handle MSHR replay
-        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
-            auto& bank = banks_.at(bank_id);
-            auto& pipeline_req = pipeline_reqs.at(bank_id);
-            bank.mshr.pop(&pipeline_req);
-        }       
-
-        // handle memory fills
-        std::vector<bool> pending_fill_req(config_.num_banks, false);
-        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
-            auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
-            if (!mem_rsp_port.empty()) {
-                auto& mem_rsp = mem_rsp_port.front();
-                this->processMemoryFill(bank_id, mem_rsp.tag);                
-                pending_fill_req.at(bank_id) = true;
-                mem_rsp_port.pop();
-            }
-        }
-        
-        // handle incoming core requests
-        for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
-            auto& core_req_port = simobject_->CoreReqPorts.at(req_id);            
-            if (core_req_port.empty())
-                continue;
-
-            auto& core_req = core_req_port.front();
-
-            // check cache bypassing
-            if (core_req.non_cacheable) {
-                // send IO request
-                this->processIORequest(core_req, req_id);
-
-                // remove request
-                core_req_port.pop();
-                continue;
-            }
-
-            auto bank_id = params_.addr_bank_id(core_req.addr);
-            auto set_id  = params_.addr_set_id(core_req.addr);
-            auto tag     = params_.addr_tag(core_req.addr);
-            auto port_id = req_id % config_.ports_per_bank;
-            
-            // create bank request
-            bank_req_t bank_req(config_.ports_per_bank);
-            bank_req.valid = true;
-            bank_req.write = core_req.write;
-            bank_req.mshr_replay = false;
-            bank_req.tag = tag;            
-            bank_req.set_id = set_id;       
-            bank_req.core_id = core_req.core_id;
-            bank_req.uuid = core_req.uuid;
-            bank_req.infos.at(port_id) = {true, req_id, core_req.tag};
-
-            auto& bank = banks_.at(bank_id);            
-            auto& pipeline_req = pipeline_reqs.at(bank_id);
-
-            // check pending MSHR replay
-            if (pipeline_req.valid 
-             && pipeline_req.mshr_replay) {
-                 // stall
-                continue;
-            }    
-
-            // check pending fill request
-            if (pending_fill_req.at(bank_id)) {
-                // stall
-                continue;
-            }
-            
-            // check MSHR capacity if read or writeback
-            if ((!core_req.write || !config_.write_through)
-             && bank.mshr.full()) {
-                ++perf_stats_.mshr_stalls;
-                continue;
-            }    
-
-            // check bank conflicts
-            if (pipeline_req.valid) {
-                // check port conflict
-                if (pipeline_req.write != core_req.write
-                 || pipeline_req.set_id != set_id
-                 || pipeline_req.tag != tag
-                 || pipeline_req.infos[port_id].valid) {
-                    ++perf_stats_.bank_stalls;
-                    continue;
-                }
-                // update pending request infos
-                pipeline_req.infos[port_id] = bank_req.infos[port_id];
-            } else {
-                // schedule new request
-                pipeline_req = bank_req;
-            }
-
-            if (core_req.write)
-                ++perf_stats_.writes;
-            else
-                ++perf_stats_.reads;
-
-            // remove request
-            auto time = core_req_port.pop();
-            perf_stats_.pipeline_stalls += (SimPlatform::instance().cycles() - time);
-        }
-    
-        // process active request        
-        this->processBankRequest(pipeline_reqs);
-    } 
-
-    const PerfStats& perf_stats() const {
-        return perf_stats_;
-    }
-
-private:
-    
-    void processIORequest(const MemReq& core_req, uint32_t req_id) {
-        {
-            MemReq mem_req(core_req);
-            mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id;
-            bypass_switch_->ReqIn.at(1).send(mem_req, 1);
-            DT(3, simobject_->name() << "-" << mem_req);
-        }
-
-        if (core_req.write && config_.write_reponse) {
-            MemRsp core_rsp{core_req.tag, core_req.core_id, core_req.uuid};
-            simobject_->CoreRspPorts.at(req_id).send(core_rsp, 1);            
-            DT(3, simobject_->name() << "-" << core_rsp);
-        }
-    }
-
-    void processMemoryFill(uint32_t bank_id, uint32_t mshr_id) {
-        // update block
-        auto& bank  = banks_.at(bank_id);
-        auto& entry = bank.mshr.replay(mshr_id);
-        auto& set   = bank.sets.at(entry.set_id);
-        auto& block = set.blocks.at(entry.block_id);
-        block.valid = true;
-        block.tag   = entry.tag;
-        --pending_fill_reqs_;
-    }
-
-    void processBankRequest(const std::vector<bank_req_t>& pipeline_reqs) {
-        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
-            auto& pipeline_req = pipeline_reqs.at(bank_id);
-            if (!pipeline_req.valid)
-                continue;
-
-            auto& bank = banks_.at(bank_id);
-            auto& set = bank.sets.at(pipeline_req.set_id);
-
-            if (pipeline_req.mshr_replay) {
-                // send core response
-                for (auto& info : pipeline_req.infos) {
-                    MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
-                    simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);  
-                    DT(3, simobject_->name() << "-" << core_rsp);         
-                }
-            } else {        
-                bool hit = false;
-                bool found_free_block = false;            
-                uint32_t hit_block_id = 0;
-                uint32_t repl_block_id = 0;            
-                uint32_t max_cnt = 0;
-                
-                for (uint32_t i = 0, n = set.blocks.size(); i < n; ++i) {
-                    auto& block = set.blocks.at(i);
-                    if (block.valid) {
-                        if (block.tag == pipeline_req.tag) {
-                            block.lru_ctr = 0;                        
-                            hit_block_id = i;
-                            hit = true;
-                        } else {
-                            ++block.lru_ctr;
-                        }
-                        if (max_cnt < block.lru_ctr) {
-                            max_cnt = block.lru_ctr;
-                            repl_block_id = i;
-                        }
-                    } else {                    
-                        found_free_block = true;
-                        repl_block_id = i;
-                    }
-                }
-
-                if (hit) {     
-                    //
-                    // Hit handling   
-                    //                
-                    if (pipeline_req.write) {
-                        // handle write hit
-                        auto& hit_block = set.blocks.at(hit_block_id);
-                        if (config_.write_through) {
-                            // forward write request to memory
-                            MemReq mem_req;
-                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, hit_block.tag);
-                            mem_req.write = true;
-                            mem_req.core_id = pipeline_req.core_id;
-                            mem_req.uuid = pipeline_req.uuid;
-                            mem_req_ports_.at(bank_id).send(mem_req, 1);
-                            DT(3, simobject_->name() << "-" << mem_req);
-                        } else {
-                            // mark block as dirty
-                            hit_block.dirty = true;
-                        }
-                    }
-                    // send core response
-                    if (!pipeline_req.write || config_.write_reponse) {
-                        for (auto& info : pipeline_req.infos) {     
-                            MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
-                            simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
-                            DT(3, simobject_->name() << "-" << core_rsp);
-                        }
-                    }
-                } else {     
-                    //
-                    // Miss handling   
-                    //
-                    if (pipeline_req.write)
-                        ++perf_stats_.write_misses;
-                    else
-                        ++perf_stats_.read_misses;
-
-                    if (!found_free_block && !config_.write_through) {
-                        // write back dirty block
-                        auto& repl_block = set.blocks.at(repl_block_id);
-                        if (repl_block.dirty) {                       
-                            MemReq mem_req;
-                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, repl_block.tag);
-                            mem_req.write = true;
-                            mem_req.core_id = pipeline_req.core_id;
-                            mem_req_ports_.at(bank_id).send(mem_req, 1);
-                            DT(3, simobject_->name() << "-" << mem_req);
-                            ++perf_stats_.evictions;
-                        }
-                    }
-
-                    if (pipeline_req.write && config_.write_through) {
-                        // forward write request to memory
-                        {
-                            MemReq mem_req;
-                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
-                            mem_req.write = true;
-                            mem_req.core_id = pipeline_req.core_id;
-                            mem_req.uuid = pipeline_req.uuid;
-                            mem_req_ports_.at(bank_id).send(mem_req, 1);
-                            DT(3, simobject_->name() << "-" << mem_req);
-                        }
-                        // send core response
-                        if (config_.write_reponse) {
-                            for (auto& info : pipeline_req.infos) {         
-                                MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
-                                simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
-                                DT(3, simobject_->name() << "-" << core_rsp);
-                            }
-                        }
-                    } else {
-                        // MSHR lookup
-                        int pending = bank.mshr.lookup(pipeline_req);
-
-                        // allocate MSHR
-                        int mshr_id = bank.mshr.allocate(pipeline_req, repl_block_id);
-                        
-                        // send fill request
-                        if (pending == -1) {
-                            MemReq mem_req;
-                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
-                            mem_req.write = false;
-                            mem_req.tag   = mshr_id;
-                            mem_req.core_id = pipeline_req.core_id;
-                            mem_req.uuid = pipeline_req.uuid;
-                            mem_req_ports_.at(bank_id).send(mem_req, 1);
-                            DT(3, simobject_->name() << "-" << mem_req);
-                            ++pending_fill_reqs_;
-                        }
-                    }
-                }
-            }
-        }
-    }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-Cache::Cache(const SimContext& ctx, const char* name, const Config& config) 
-    : SimObject<Cache>(ctx, name)    
-    , CoreReqPorts(config.num_inputs, this)
-    , CoreRspPorts(config.num_inputs, this)
-    , MemReqPort(this)
-    , MemRspPort(this)
-    , impl_(new Impl(this, config))
-{}
-
-Cache::~Cache() {
-    delete impl_;
-}
-
-void Cache::reset() {
-    impl_->reset();
-}
-
-void Cache::tick() {
-    impl_->tick();
-}
-
-const Cache::PerfStats& Cache::perf_stats() const {
-    return impl_->perf_stats();
-}
--- a/sim/simx/cache_cluster.h
+++ b/sim/simx/cache_cluster.h
@@ -0,0 +1,106 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "cache_sim.h"
+
+namespace vortex {
+
+class CacheCluster : public SimObject<CacheCluster> { // Creates a group of CacheSim instances and the Switch network that connects per-unit core ports to them and to a single memory port.
+public:
+    std::vector<std::vector<SimPort<MemReq>>> CoreReqPorts; // indexed [unit][request]; sized num_units x num_requests
+    std::vector<std::vector<SimPort<MemRsp>>> CoreRspPorts; // indexed [unit][request]; sized num_units x num_requests
+    SimPort<MemReq> MemReqPort; // wired to cache_arb->ReqOut.at(0) in the constructor
+    SimPort<MemRsp> MemRspPort; // wired to cache_arb->RspOut.at(0) in the constructor
+
+    CacheCluster(const SimContext& ctx, 
+                 const char* name, // used as the prefix of every generated switch/cache name
+                 uint32_t num_units, // number of port rows and of per-unit switches
+                 uint32_t num_caches, // number of CacheSim instances; 0 creates a single cache with bypass enabled
+                 uint32_t num_requests, // number of core ports per unit
+                 const CacheSim::Config& config) // copied for each cache; config.num_inputs is the number of core ports used per cache
+        : SimObject(ctx, name)        
+        , CoreReqPorts(num_units, std::vector<SimPort<MemReq>>(num_requests, this))
+        , CoreRspPorts(num_units, std::vector<SimPort<MemRsp>>(num_requests, this))
+        , MemReqPort(this)
+        , MemRspPort(this)
+        , caches_(MAX(num_caches, 0x1)) { // at least one slot, matching the num_caches == 0 fallback below
+
+        CacheSim::Config config2(config); // local copy so the bypass flag can be changed without touching the caller's config
+        if (0 == num_caches) {
+            num_caches = 1;
+            config2.bypass = true;
+        }
+
+        char sname[100]; // scratch buffer for generated names; snprintf truncates anything longer than 99 characters
+        
+        std::vector<Switch<MemReq, MemRsp>::Ptr> unit_arbs(num_units); // one switch per unit; ReqIn is indexed by request, ReqOut by cache input
+        for (uint32_t u = 0; u < num_units; ++u) {
+            snprintf(sname, 100, "%s-unit-arb-%d", name, u); // NOTE(review): u is uint32_t, so %u would match the argument type
+            unit_arbs.at(u) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs);
+            for (uint32_t i = 0; i < num_requests; ++i) {
+                this->CoreReqPorts.at(u).at(i).bind(&unit_arbs.at(u)->ReqIn.at(i)); // presumably bind() forwards from the calling port to the argument port; confirm in SimPort
+                unit_arbs.at(u)->RspIn.at(i).bind(&this->CoreRspPorts.at(u).at(i));
+            }
+        }
+
+        std::vector<Switch<MemReq, MemRsp>::Ptr> mem_arbs(config.num_inputs); // one switch per cache input; ReqIn is indexed by unit, ReqOut by cache
+        for (uint32_t i = 0; i < config.num_inputs; ++i) {
+            snprintf(sname, 100, "%s-mem-arb-%d", name, i);
+            mem_arbs.at(i) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_units, num_caches);
+            for (uint32_t u = 0; u < num_units; ++u) {              
+                unit_arbs.at(u)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(u));
+                mem_arbs.at(i)->RspIn.at(u).bind(&unit_arbs.at(u)->RspOut.at(i));
+            }            
+        }
+
+        snprintf(sname, 100, "%s-cache-arb", name);
+        auto cache_arb = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_caches, 1); // ReqIn is indexed by cache; only output 0 is used
+
+        for (uint32_t i = 0; i < num_caches; ++i) {
+            snprintf(sname, 100, "%s-cache%d", name, i);
+            caches_.at(i) = CacheSim::Create(sname, config2); // uses config2 so the bypass flag set above applies
+
+            for (uint32_t j = 0; j < config.num_inputs; ++j) {
+                mem_arbs.at(j)->ReqOut.at(i).bind(&caches_.at(i)->CoreReqPorts.at(j));
+                caches_.at(i)->CoreRspPorts.at(j).bind(&mem_arbs.at(j)->RspOut.at(i));
+            }
+
+            caches_.at(i)->MemReqPort.bind(&cache_arb->ReqIn.at(i));
+            cache_arb->RspIn.at(i).bind(&caches_.at(i)->MemRspPort);
+        }
+
+        cache_arb->ReqOut.at(0).bind(&this->MemReqPort);
+        this->MemRspPort.bind(&cache_arb->RspOut.at(0));
+    }
+
+    ~CacheCluster() {}
+
+    void reset() {} // intentionally empty: this class keeps no state of its own to reset
+    
+    void tick() {} // intentionally empty: no per-cycle work is done in this class
+
+    CacheSim::PerfStats perf_stats() const { // returns the sum (via operator+=) of perf_stats() over every cache in caches_
+        CacheSim::PerfStats perf;
+        for (auto cache : caches_) { // NOTE(review): iterates by value, so each CacheSim::Ptr element is copied per iteration
+            perf += cache->perf_stats();
+        }   
+        return perf;
+    }
+    
+private:
+    std::vector<CacheSim::Ptr> caches_; // sized in the initializer list, filled in the constructor body
+};
+
+}
--- a/sim/simx/cache_sim.cpp
+++ b/sim/simx/cache_sim.cpp
@@ -0,0 +1,707 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cache_sim.h"
+#include "debug.h"
+#include "types.h"
+#include <util.h>
+#include <unordered_map>
+#include <vector>
+#include <list>
+#include <queue>
+
+using namespace vortex;
+
+// Address-decoding parameters derived from a CacheSim::Config.
+//
+// The constructor splits an address into four consecutive bit ranges, from
+// least to most significant: word select, bank select, set select and tag.
+// Each range is stored as an inclusive [start, end] pair of bit positions.
+// A range with zero bits ends up with end == start - 1 (computed in unsigned
+// arithmetic); the accessors below test `end >= start` before using a range.
+struct params_t {
+    uint32_t sets_per_bank;     // 1 << index_bits
+    uint32_t lines_per_set;     // 1 << config.A
+    uint32_t words_per_line;    // 1 << (config.B - config.W)
+    uint32_t log2_num_inputs;   // log2ceil(config.num_inputs)
+
+    uint32_t word_select_addr_start;
+    uint32_t word_select_addr_end;
+
+    uint32_t bank_select_addr_start;
+    uint32_t bank_select_addr_end;
+
+    uint32_t set_select_addr_start;
+    uint32_t set_select_addr_end;
+
+    uint32_t tag_select_addr_start;
+    uint32_t tag_select_addr_end;
+
+    // Derives the geometry and the address bit ranges from `config`.
+    // Asserts that the bank size is positive and that neither the word-offset
+    // width nor the set-index width is negative.
+    params_t(const CacheSim::Config& config) {
+        int32_t bank_bits = log2ceil(config.num_banks);
+        int32_t offset_bits = config.B - config.W;
+        int32_t log2_bank_size = config.C - bank_bits;
+        int32_t index_bits = log2_bank_size - (config.B + config.A);        
+        assert(log2_bank_size > 0);
+        assert(offset_bits >= 0);
+        assert(index_bits >= 0);
+
+        this->log2_num_inputs = log2ceil(config.num_inputs);
+
+        this->words_per_line = 1 << offset_bits;
+        this->lines_per_set  = 1 << config.A;
+        this->sets_per_bank   = 1 << index_bits;
+
+        // a bank cannot expose more ports than there are words in a line
+        assert(config.ports_per_bank <= this->words_per_line);
+                
+        // Word select
+        this->word_select_addr_start = config.W;
+        this->word_select_addr_end = (this->word_select_addr_start+offset_bits-1);
+
+        // Bank select
+        this->bank_select_addr_start = (1+this->word_select_addr_end);
+        this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1);
+
+        // Set select
+        this->set_select_addr_start = (1+this->bank_select_addr_end);
+        this->set_select_addr_end = (this->set_select_addr_start+index_bits-1);
+
+        // Tag select
+        this->tag_select_addr_start = (1+this->set_select_addr_end);
+        this->tag_select_addr_end = (config.addr_width-1);
+    }
+
+    // Extracts the bank-select field of `word_addr`; returns 0 when the field is empty.
+    uint32_t addr_bank_id(uint64_t word_addr) const {
+        if (bank_select_addr_end >= bank_select_addr_start)
+            return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end);
+        else    
+            return 0;
+    }
+
+    // Extracts the set-select field of `word_addr`; returns 0 when the field is empty.
+    uint32_t addr_set_id(uint64_t word_addr) const {
+        if (set_select_addr_end >= set_select_addr_start)
+            return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end);
+        else
+            return 0;
+    }
+
+    // Extracts the tag field of `word_addr`; returns 0 when the field is empty.
+    uint64_t addr_tag(uint64_t word_addr) const {
+        if (tag_select_addr_end >= tag_select_addr_start)
+            return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end);
+        else    
+            return 0;
+    }
+    
+    // Rebuilds an address from its bank, set and tag fields. The word-select
+    // bits are never written and therefore stay zero.
+    // NOTE(review): presumably bit_setw returns `addr` with bits [start, end]
+    // replaced by the given value - confirm against the bit_setw helper.
+    uint64_t mem_addr(uint32_t bank_id, uint32_t set_id, uint64_t tag) const {
+        uint64_t addr(0);
+        if (bank_select_addr_end >= bank_select_addr_start)            
+            addr = bit_setw(addr, bank_select_addr_start, bank_select_addr_end, bank_id);
+        if (set_select_addr_end >= set_select_addr_start)
+            addr = bit_setw(addr, set_select_addr_start, set_select_addr_end, set_id);
+        if (tag_select_addr_end >= tag_select_addr_start)
+            addr = bit_setw(addr, tag_select_addr_start, tag_select_addr_end, tag);
+        return addr;
+    }
+};
+
+// State tracked for a single cache line (one way of a set).
+struct line_t {  
+    uint64_t tag;       // tag of the block held by this line (meaningful only when valid)
+    uint32_t lru_ctr;   // replacement age: zeroed on a hit, incremented on a non-matching lookup
+    bool     valid;     // true once the line holds a block
+    bool     dirty;     // true when the block was written and not yet written back
+
+    // Returns the line to its initial state.
+    // The tag and the replacement age are reset together with the flags so
+    // that a cleared cache does not carry replacement history from before the
+    // clear into the next run.
+    void clear() {
+        tag     = 0;
+        lru_ctr = 0;
+        valid   = false;
+        dirty   = false;
+    }
+};
+
+// One cache set: a fixed-size group of lines (ways).
+struct set_t {
+    std::vector<line_t> lines;
+
+    // Creates a set holding `num_ways` lines.
+    set_t(uint32_t num_ways) 
+        : lines(num_ways) 
+    {}
+
+    // Clears every line of the set.
+    void clear() {
+        for (auto it = lines.begin(); it != lines.end(); ++it) {
+            it->clear();
+        }
+    }
+};
+
+// One port slot of a bank request: identifies the cache input that issued
+// the access and the tag to echo back in its response.
+struct bank_req_port_t {
+    uint32_t req_id;    // index of the cache input the request arrived on
+    uint64_t req_tag;   // tag of the core request, returned in the core response
+    bool     valid;     // true when this slot carries a request
+
+    // Marks the slot as unused; req_id and req_tag are left untouched.
+    void clear() {
+        this->valid = false;
+    }
+};
+
+// A request travelling through one bank's pipeline during a cycle.
+struct bank_req_t {
+
+    // Kind of work the request represents.
+    enum ReqType {
+        None   = 0,     // slot is empty
+        Fill   = 1,     // memory response filling a line
+        Replay = 2,     // MSHR entry replayed after its fill
+        Core   = 3      // request coming from a cache input
+    };
+
+    std::vector<bank_req_port_t> ports;  // one slot per bank port
+    uint64_t tag;
+    uint32_t set_id;
+    uint32_t cid;
+    uint64_t uuid;
+    ReqType  type;
+    bool     write;
+
+    // Creates an empty request with `num_ports` port slots.
+    // Every scalar member is given a defined value: instances are copied by
+    // vector fill constructors and `type` is inspected by the MSHR, so none of
+    // them may be left indeterminate.
+    bank_req_t(uint32_t num_ports)
+        : ports(num_ports) 
+        , tag(0)
+        , set_id(0)
+        , cid(0)
+        , uuid(0)
+        , type(ReqType::None)
+        , write(false)
+    {}
+
+    // Invalidates all port slots and marks the request as empty. The remaining
+    // fields keep their previous values.
+    void clear() {
+        for (auto& port : ports) {
+            port.clear();
+        }
+        type = ReqType::None;
+    }
+};
+
+// One MSHR slot: the request that missed plus the line chosen to receive the fill.
+struct mshr_entry_t {
+    bank_req_t bank_req;
+    uint32_t   line_id;     // way index inside the set that the fill will overwrite
+
+    // Creates an empty entry whose request has `num_ports` port slots.
+    // line_id is zeroed so that copies made while building the entry table
+    // never read an indeterminate value.
+    mshr_entry_t(uint32_t num_ports) 
+        : bank_req(num_ports) 
+        , line_id(0)
+    {}
+
+    // Marks the entry as free by clearing the stored request.
+    void clear() {
+        bank_req.clear();
+    }
+};
+
+// Fixed-capacity table of outstanding misses for one bank.
+// A slot is free when its request type is bank_req_t::None.
+class MSHR {
+private:
+    std::vector<mshr_entry_t> entries_;
+    uint32_t size_;     // number of slots currently in use
+
+public:    
+    // Creates `size` free slots, each with `num_ports` port slots.
+    MSHR(uint32_t size, uint32_t num_ports)
+        : entries_(size, num_ports)
+        , size_(0) 
+    {}
+
+    // True when no slot is in use.
+    bool empty() const {
+        return (size_ == 0);
+    }
+    
+    // True when every slot is in use.
+    bool full() const {
+        return (entries_.size() == size_);
+    }
+
+    // Returns true when an in-use slot already targets the same set and tag
+    // as `bank_req`.
+    bool lookup(const bank_req_t& bank_req) {
+        for (auto it = entries_.begin(); it != entries_.end(); ++it) {
+            const auto& pending = it->bank_req;
+            if (pending.type == bank_req_t::None)
+                continue;
+            if (pending.set_id == bank_req.set_id
+             && pending.tag == bank_req.tag)
+                return true;
+        }
+        return false;
+    }
+
+    // Stores `bank_req` and `line_id` in the first free slot and returns the
+    // slot index, or -1 when no slot is free.
+    int allocate(const bank_req_t& bank_req, uint32_t line_id) {
+        uint32_t slot = 0;
+        for (auto& entry : entries_) {
+            if (entry.bank_req.type == bank_req_t::None) {
+                entry.bank_req = bank_req;
+                entry.line_id  = line_id;
+                ++size_;
+                return slot;
+            }
+            ++slot;
+        }
+        return -1;
+    }
+
+    // Marks slot `id` and every other Core-type slot with the same set and
+    // tag for replay, then returns slot `id`.
+    mshr_entry_t& replay(uint32_t id) {
+        auto& root_entry = entries_.at(id);
+        assert(root_entry.bank_req.type == bank_req_t::Core);
+        const auto root_set = root_entry.bank_req.set_id;
+        const auto root_tag = root_entry.bank_req.tag;
+        for (auto it = entries_.begin(); it != entries_.end(); ++it) {
+            auto& req = it->bank_req;
+            if (req.type != bank_req_t::Core)
+                continue;
+            if (req.set_id == root_set && req.tag == root_tag) {
+                req.type = bank_req_t::Replay;
+            }
+        }
+        return root_entry;
+    }
+
+    // Copies the first slot marked for replay into `*out`, frees that slot and
+    // returns true; returns false (leaving `*out` untouched) when none is marked.
+    bool pop(bank_req_t* out) {
+        for (auto& entry : entries_) {
+            if (entry.bank_req.type != bank_req_t::Replay)
+                continue;
+            *out = entry.bank_req;
+            entry.bank_req.type = bank_req_t::None;
+            --size_;
+            return true;
+        }
+        return false;
+    }
+
+    // Frees every slot.
+    void clear() {
+        for (auto it = entries_.begin(); it != entries_.end(); ++it) {
+            it->clear();
+        }
+        size_ = 0;
+    }
+};
+
+// One cache bank: its sets and its MSHR.
+struct bank_t {
+    std::vector<set_t> sets;    
+    MSHR               mshr;
+
+    // Sizes the bank from the derived geometry (`params`) and from the MSHR
+    // and port settings in `config`.
+    bank_t(const CacheSim::Config& config, 
+           const params_t& params) 
+        : sets(params.sets_per_bank, params.lines_per_set)
+        , mshr(config.mshr_size, config.ports_per_bank)
+    {}
+
+    // Clears the MSHR and every set of the bank.
+    void clear() {        
+        mshr.clear();
+        for (auto it = sets.begin(); it != sets.end(); ++it) {
+            it->clear();
+        }
+    }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+class CacheSim::Impl {
+private:
+    CacheSim* const simobject_;
+    Config config_;
+    params_t params_;
+    std::vector<bank_t> banks_;
+    Switch<MemReq, MemRsp>::Ptr bank_switch_;    
+    Switch<MemReq, MemRsp>::Ptr bypass_switch_;
+    std::vector<SimPort<MemReq>> mem_req_ports_;
+    std::vector<SimPort<MemRsp>> mem_rsp_ports_;
+    std::vector<bank_req_t> pipeline_reqs_;
+    uint32_t init_cycles_;
+    PerfStats perf_stats_;
+    uint64_t pending_read_reqs_;
+    uint64_t pending_write_reqs_;
+    uint64_t pending_fill_reqs_;
+
+public:
+    // Builds the cache model and wires its internal ports.
+    //
+    // In bypass mode every core input is connected through a round-robin
+    // switch straight to the memory ports and nothing else is set up.
+    // Otherwise a two-input priority switch sits in front of the memory
+    // ports: input 0 carries bank traffic (through a round-robin bank switch
+    // when there is more than one bank), input 1 carries bypassed requests.
+    //
+    // The cycle and pending-request counters are zero-initialised here so that
+    // they hold defined values even if reset() has not run yet, including on
+    // the early-return bypass path.
+    Impl(CacheSim* simobject, const Config& config) 
+        : simobject_(simobject)
+        , config_(config)
+        , params_(config)
+        , banks_(config.num_banks, {config, params_})
+        , mem_req_ports_(config.num_banks, simobject)
+        , mem_rsp_ports_(config.num_banks, simobject)
+        , pipeline_reqs_(config.num_banks, config.ports_per_bank)
+        , init_cycles_(0)
+        , pending_read_reqs_(0)
+        , pending_write_reqs_(0)
+        , pending_fill_reqs_(0)
+    {
+        char sname[100];
+        snprintf(sname, 100, "%s-bypass-arb", simobject->name().c_str());
+
+        if (config_.bypass) {            
+            bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config_.num_inputs);            
+            for (uint32_t i = 0; i < config_.num_inputs; ++i) {
+               simobject->CoreReqPorts.at(i).bind(&bypass_switch_->ReqIn.at(i));
+               bypass_switch_->RspIn.at(i).bind(&simobject->CoreRspPorts.at(i));
+            }
+            bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort);
+            simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0));
+            return;
+        }
+        
+        bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::Priority, 2);
+        bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort);
+        simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0));
+
+        if (config.num_banks > 1) {
+            snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str());
+            bank_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config.num_banks);
+            for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
+                mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i));
+                bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i));
+            }    
+            bank_switch_->ReqOut.at(0).bind(&bypass_switch_->ReqIn.at(0));
+            bypass_switch_->RspIn.at(0).bind(&bank_switch_->RspOut.at(0));
+        } else {
+            // single bank: connect it to the bypass switch directly
+            mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0));
+            bypass_switch_->RspIn.at(0).bind(&mem_rsp_ports_.at(0));
+        }
+
+        // calculate cache initialization cycles
+        init_cycles_ = params_.sets_per_bank * params_.lines_per_set;
+    }
+
+    // Clears all banks, the performance counters and the pending-request
+    // counters. Does nothing in bypass mode.
+    void reset() {
+        if (!config_.bypass) {
+            for (auto it = banks_.begin(); it != banks_.end(); ++it) {
+                it->clear();
+            }
+            perf_stats_ = PerfStats();
+            pending_read_reqs_  = 0;
+            pending_write_reqs_ = 0;
+            pending_fill_reqs_  = 0;
+        }
+    }
+
+    // Advances the cache model by one cycle.
+    //
+    // For every bank at most one request is selected per cycle, in this
+    // priority order: an MSHR replay, then a memory fill, then core requests.
+    // Several core requests can share the bank in the same cycle when they
+    // have the same write flag, set and tag and use different port slots.
+    // The selected requests are executed at the end by processBankRequests().
+    // Does nothing in bypass mode or while the initialization countdown runs.
+    void tick() {
+        if (config_.bypass)
+            return;
+
+        // wait on cache initialization cycles
+        if (init_cycles_ != 0) {
+            --init_cycles_;
+            return;
+        }
+
+        // handle cache bypass responses
+        // (input 1 of the bypass switch is the one processBypassRequest() sends on;
+        //  at most one response is forwarded per cycle)
+        {
+            auto& bypass_port = bypass_switch_->RspIn.at(1);            
+            if (!bypass_port.empty()) {
+                auto& mem_rsp = bypass_port.front();
+                this->processBypassResponse(mem_rsp);
+                bypass_port.pop();
+            }
+        }
+
+        // initialize pipeline request
+        for (auto& pipeline_req : pipeline_reqs_) {
+            pipeline_req.clear();
+        }
+
+        // schedule MSHR replay
+        // (pop() leaves pipeline_req empty when the bank has nothing to replay)
+        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
+            auto& bank = banks_.at(bank_id);
+            auto& pipeline_req = pipeline_reqs_.at(bank_id);
+            bank.mshr.pop(&pipeline_req);
+        }
+
+        // schedule memory fill
+        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
+            auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
+            if (mem_rsp_port.empty())
+                continue;
+
+            // the bank is already taken by a replay: keep the response queued
+            auto& pipeline_req = pipeline_reqs_.at(bank_id);
+            if (pipeline_req.type != bank_req_t::None)
+                continue;
+
+            auto& mem_rsp = mem_rsp_port.front();            
+            DT(3, simobject_->name() << "-dram-" << mem_rsp);
+            pipeline_req.type = bank_req_t::Fill;
+            // the response tag is the MSHR slot index written into the fill request
+            pipeline_req.tag = mem_rsp.tag;
+            mem_rsp_port.pop();
+        }
+
+        // schedule core requests        
+        for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
+            auto& core_req_port = simobject_->CoreReqPorts.at(req_id);
+            if (core_req_port.empty())
+                continue;
+
+            auto& core_req = core_req_port.front();
+
+            // check cache bypassing
+            if (core_req.type == AddrType::IO) {
+                // send bypass request
+                this->processBypassRequest(core_req, req_id);
+                // remove request
+                core_req_port.pop();
+                continue;
+            }
+
+            auto bank_id = params_.addr_bank_id(core_req.addr);
+            auto set_id  = params_.addr_set_id(core_req.addr);
+            auto tag     = params_.addr_tag(core_req.addr);
+            auto port_id = req_id % config_.ports_per_bank;
+
+            auto& bank = banks_.at(bank_id);
+            auto& pipeline_req = pipeline_reqs_.at(bank_id);
+
+            // check MSHR capacity
+            // (write-through writes are exempt: they never allocate an MSHR entry)
+            if ((!core_req.write || !config_.write_through)
+             && bank.mshr.full()) {
+                ++perf_stats_.mshr_stalls;
+                ++perf_stats_.bank_stalls;
+                continue;
+            }            
+
+            // check bank conflicts
+            if (pipeline_req.type == bank_req_t::Core) {
+                // check port conflict
+                // (merging requires the same write flag, set and tag, and a free port slot)
+                if (pipeline_req.write != core_req.write
+                 || pipeline_req.set_id != set_id
+                 || pipeline_req.tag != tag
+                 || pipeline_req.ports.at(port_id).valid) {
+                    ++perf_stats_.bank_stalls;
+                    continue;
+                }
+                // extend request ports
+                pipeline_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
+            } else if (pipeline_req.type == bank_req_t::None) {
+                // schedule new request
+                bank_req_t bank_req(config_.ports_per_bank);
+                bank_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
+                bank_req.tag   = tag;            
+                bank_req.set_id = set_id;       
+                bank_req.cid   = core_req.cid;
+                bank_req.uuid  = core_req.uuid;
+                bank_req.type  = bank_req_t::Core;
+                bank_req.write = core_req.write;
+                pipeline_req   = bank_req;
+            } else {
+                // bank in use
+                ++perf_stats_.bank_stalls;
+                continue;
+            }
+
+            if (core_req.write)
+                ++perf_stats_.writes;
+            else
+                ++perf_stats_.reads;
+
+            // remove request
+            DT(3, simobject_->name() << "-core-" << core_req);
+            // NOTE(review): pop() presumably returns the cycle at which the request
+            // became available on the port - confirm against SimPort.
+            auto time = core_req_port.pop();
+            perf_stats_.pipeline_stalls += (SimPlatform::instance().cycles() - time);
+        }
+    
+        // process active request        
+        this->processBankRequests();
+    } 
+
+    // Returns a reference to the counters accumulated so far.
+    const PerfStats& perf_stats() const {
+        return perf_stats_;
+    }
+
+private:
+    
+    // Routes a response for a bypassed request back to the input that issued
+    // it. The low log2_num_inputs bits of the memory tag hold the input index;
+    // the remaining upper bits are the original core tag.
+    void processBypassResponse(const MemRsp& mem_rsp) {
+        const uint32_t input_bits = params_.log2_num_inputs;
+        const uint32_t req_id  = mem_rsp.tag & ((1 << input_bits)-1);
+        const uint64_t core_tag = mem_rsp.tag >> input_bits;
+        MemRsp core_rsp{core_tag, mem_rsp.cid, mem_rsp.uuid};
+        simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency);
+        DT(3, simobject_->name() << "-core-" << core_rsp);
+    }
+
+    // Forwards a core request around the cache. The input index is packed
+    // into the low bits of the outgoing tag so the response can be routed
+    // back. A write is acknowledged immediately when write responses are enabled.
+    void processBypassRequest(const MemReq& core_req, uint32_t req_id) {
+        DT(3, simobject_->name() << "-core-" << core_req);
+
+        // send the request on the bypass input of the switch
+        MemReq mem_req(core_req);
+        mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id;
+        bypass_switch_->ReqIn.at(1).send(mem_req, 1);
+        DT(3, simobject_->name() << "-dram-" << mem_req);
+
+        if (!core_req.write || !config_.write_reponse)
+            return;
+
+        // acknowledge the write to the core
+        MemRsp core_rsp{core_req.tag, core_req.cid, core_req.uuid};
+        simobject_->CoreRspPorts.at(req_id).send(core_rsp, 1);            
+        DT(3, simobject_->name() << "-core-" << core_rsp);
+    }
+
+    // Executes the request selected for each bank in the current cycle.
+    //
+    //  - Fill:   installs the returned block in the line recorded in the MSHR
+    //            entry and marks all entries waiting on that block for replay.
+    //  - Replay: sends the core responses of a request whose block has arrived.
+    //  - Core:   performs the tag lookup, then handles the hit or the miss
+    //            (write-back of a dirty victim, write-through forwarding, or
+    //            MSHR allocation plus fill request).
+    //
+    // Finally adds the number of outstanding fills to the memory-latency counter.
+    void processBankRequests() {
+        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
+            auto& bank = banks_.at(bank_id);
+            // read-only view: the request is never modified here, so avoid
+            // copying it (and its port vector) every cycle
+            const auto& pipeline_req = pipeline_reqs_.at(bank_id);
+            
+            switch (pipeline_req.type) {
+            case bank_req_t::None:
+                break;
+            case bank_req_t::Fill: {
+                // update cache line
+                auto& entry = bank.mshr.replay(pipeline_req.tag);
+                auto& set   = bank.sets.at(entry.bank_req.set_id);
+                auto& line  = set.lines.at(entry.line_id);
+                line.valid  = true;
+                line.tag    = entry.bank_req.tag;
+                // The new block must not inherit the state of the block it
+                // replaces: it starts as most recently used, and is dirty only
+                // when the miss that triggered the fill was a write.
+                // NOTE(review): writes merged behind a read miss do not mark
+                // the line dirty - confirm whether the model needs that.
+                line.lru_ctr = 0;
+                line.dirty   = entry.bank_req.write;
+                --pending_fill_reqs_;
+            } break;
+            case bank_req_t::Replay: {
+                // send core response
+                if (!pipeline_req.write || config_.write_reponse) {
+                    for (const auto& info : pipeline_req.ports) {
+                        if (!info.valid)
+                            continue;
+                        MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid};
+                        simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);  
+                        DT(3, simobject_->name() << "-core-" << core_rsp);         
+                    }
+                }
+            } break;
+            case bank_req_t::Core: {        
+                bool hit = false;
+                bool found_free_line = false;            
+                uint32_t hit_line_id = 0;
+                uint32_t free_line_id = 0;
+                uint32_t repl_line_id = 0;            
+                uint32_t max_cnt = 0;
+
+                auto& set = bank.sets.at(pipeline_req.set_id);
+
+                // tag lookup                
+                for (uint32_t i = 0, n = set.lines.size(); i < n; ++i) {
+                    auto& line = set.lines.at(i);
+                    if (line.valid) {
+                        if (line.tag == pipeline_req.tag) {
+                            line.lru_ctr = 0;                        
+                            hit_line_id = i;
+                            hit = true;
+                        } else {
+                            ++line.lru_ctr;
+                        }
+                        // remember the oldest valid line as eviction candidate
+                        if (max_cnt < line.lru_ctr) {
+                            max_cnt = line.lru_ctr;
+                            repl_line_id = i;
+                        }
+                    } else {                    
+                        found_free_line = true;
+                        free_line_id = i;
+                    }
+                }
+
+                // A free line always wins over evicting a valid one. Keeping
+                // the two candidates apart guarantees that the write-back
+                // check below and the line handed to the MSHR agree.
+                if (found_free_line) {
+                    repl_line_id = free_line_id;
+                }
+
+                if (hit) {     
+                    //
+                    // Hit handling   
+                    //                
+                    if (pipeline_req.write) {
+                        // handle write hit
+                        auto& hit_line = set.lines.at(hit_line_id);
+                        if (config_.write_through) {
+                            // forward write request to memory
+                            MemReq mem_req;
+                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, hit_line.tag);
+                            mem_req.write = true;
+                            mem_req.cid = pipeline_req.cid;
+                            mem_req.uuid = pipeline_req.uuid;
+                            mem_req_ports_.at(bank_id).send(mem_req, 1);
+                            DT(3, simobject_->name() << "-dram-" << mem_req);
+                        } else {
+                            // mark line as dirty
+                            hit_line.dirty = true;
+                        }
+                    }
+                    // send core response
+                    if (!pipeline_req.write || config_.write_reponse) {
+                        for (const auto& info : pipeline_req.ports) {     
+                            if (!info.valid)
+                                continue;
+                            MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid};
+                            simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
+                            DT(3, simobject_->name() << "-core-" << core_rsp);
+                        }
+                    }
+                } else {     
+                    //
+                    // Miss handling   
+                    //
+                    if (pipeline_req.write)
+                        ++perf_stats_.write_misses;
+                    else
+                        ++perf_stats_.read_misses;
+
+                    if (!found_free_line && !config_.write_through) {
+                        // write back dirty line
+                        auto& repl_line = set.lines.at(repl_line_id);
+                        if (repl_line.dirty) {                       
+                            MemReq mem_req;
+                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, repl_line.tag);
+                            mem_req.write = true;
+                            mem_req.cid = pipeline_req.cid;
+                            mem_req_ports_.at(bank_id).send(mem_req, 1);
+                            DT(3, simobject_->name() << "-dram-" << mem_req);
+                            ++perf_stats_.evictions;
+                        }
+                    }
+
+                    if (pipeline_req.write && config_.write_through) {
+                        // forward write request to memory
+                        {
+                            MemReq mem_req;
+                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
+                            mem_req.write = true;
+                            mem_req.cid = pipeline_req.cid;
+                            mem_req.uuid = pipeline_req.uuid;
+                            mem_req_ports_.at(bank_id).send(mem_req, 1);
+                            DT(3, simobject_->name() << "-dram-" << mem_req);
+                        }
+                        // send core response
+                        if (config_.write_reponse) {
+                            for (const auto& info : pipeline_req.ports) {
+                                if (!info.valid)
+                                    continue;       
+                                MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid};
+                                simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
+                                DT(3, simobject_->name() << "-core-" << core_rsp);
+                            }
+                        }
+                    } else {
+                        // MSHR lookup
+                        // (must run before the allocation, which would otherwise match itself)
+                        auto mshr_pending = bank.mshr.lookup(pipeline_req);
+
+                        // allocate MSHR
+                        auto mshr_id = bank.mshr.allocate(pipeline_req, repl_line_id);
+                        
+                        // send fill request
+                        // (skipped when the block is already being fetched; the
+                        //  slot index travels in the tag so the fill can find its entry)
+                        if (!mshr_pending) {
+                            MemReq mem_req;
+                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
+                            mem_req.write = false;
+                            mem_req.tag   = mshr_id;
+                            mem_req.cid = pipeline_req.cid;
+                            mem_req.uuid = pipeline_req.uuid;
+                            mem_req_ports_.at(bank_id).send(mem_req, 1);
+                            DT(3, simobject_->name() << "-dram-" << mem_req);
+                            ++pending_fill_reqs_;
+                        }
+                    }
+                }
+            } break;
+            }
+        }
+        // calculate memory latency
+        perf_stats_.mem_latency += pending_fill_reqs_;
+    }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Creates the public ports (one request/response pair per configured input,
+// plus the memory-side pair) and then the private implementation, which wires
+// those ports together.
+CacheSim::CacheSim(const SimContext& ctx, const char* name, const Config& config) 
+    : SimObject<CacheSim>(ctx, name)    
+    , CoreReqPorts(config.num_inputs, this)
+    , CoreRspPorts(config.num_inputs, this)
+    , MemReqPort(this)
+    , MemRspPort(this)
+    , impl_(new Impl(this, config))
+{}
+
+// Releases the implementation object allocated by the constructor.
+CacheSim::~CacheSim() {
+    delete impl_;
+}
+
+// Forwards the reset to the implementation.
+void CacheSim::reset() {
+    impl_->reset();
+}
+
+// Forwards the cycle tick to the implementation.
+void CacheSim::tick() {
+    impl_->tick();
+}
+
+// Returns a reference to the counters kept by the implementation.
+const CacheSim::PerfStats& CacheSim::perf_stats() const {
+    return impl_->perf_stats();
+}
--- a/sim/simx/cache_sim.h
+++ b/sim/simx/cache_sim.h
@@ -1,13 +1,27 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include <simobject.h>
-#include "memsim.h"
+#include "mem_sim.h"

 namespace vortex {

-class Cache : public SimObject<Cache> {
+class CacheSim : public SimObject<CacheSim> {
 public:
    struct Config {
+        bool    bypass;         // cache bypass
        uint8_t C;              // log2 cache size
        uint8_t B;              // log2 block size
        uint8_t W;              // log2 word size
@@ -45,6 +59,19 @@ public:
            , mshr_stalls(0)
            , mem_latency(0)
        {}
+
+        // Adds every counter of `rhs` to the matching counter of this object
+        // and returns *this so that additions can be chained.
+        PerfStats& operator+=(const PerfStats& rhs) {
+            this->reads += rhs.reads;
+            this->writes += rhs.writes;
+            this->read_misses += rhs.read_misses;
+            this->write_misses += rhs.write_misses;
+            this->evictions += rhs.evictions;
+            this->pipeline_stalls += rhs.pipeline_stalls;
+            this->bank_stalls += rhs.bank_stalls;
+            this->mshr_stalls += rhs.mshr_stalls;
+            this->mem_latency += rhs.mem_latency;
+            return *this;
+        }
    };

    std::vector<SimPort<MemReq>> CoreReqPorts;
@@ -52,8 +79,8 @@ public:
    SimPort<MemReq>              MemReqPort;
    SimPort<MemRsp>              MemRspPort;

-    Cache(const SimContext& ctx, const char* name, const Config& config);
-    ~Cache();
+    CacheSim(const SimContext& ctx, const char* name, const Config& config);
+    ~CacheSim();

    void reset();
    
--- a/sim/simx/cluster.cpp
+++ b/sim/simx/cluster.cpp
@@ -0,0 +1,222 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "cluster.h"
+
+using namespace vortex;
+
+// Builds one cluster: an L2 cache, the instruction and data cache clusters
+// that feed it, one shared memory per core, and the cores themselves.
+//
+// Wiring established here:
+//   - the L2 memory side is bound to this cluster's mem_req_port / mem_rsp_port;
+//   - the instruction caches use L2 input 0, the data caches use L2 input 1;
+//   - each core's instruction port goes to its instruction-cache input;
+//   - each LSU lane of each core goes through a demux that steers requests
+//     either to the data caches or to the core's shared memory.
+Cluster::Cluster(const SimContext& ctx, 
+                 uint32_t cluster_id,
+                 ProcessorImpl* processor, 
+                 const Arch &arch, 
+                 const DCRS &dcrs) 
+  : SimObject(ctx, "cluster")
+  , mem_req_port(this)
+  , mem_rsp_port(this)
+  , cluster_id_(cluster_id)
+  , cores_(arch.num_cores())  
+  , barriers_(arch.num_barriers(), 0)
+  , sharedmems_(arch.num_cores())
+  , processor_(processor)
+{
+  auto num_cores = arch.num_cores();
+  
+  char sname[100];
+  snprintf(sname, 100, "cluster%d-l2cache", cluster_id);
+  // The L2 word is an L1 line and the associativity comes from L2_NUM_WAYS,
+  // mirroring how the L1 configurations below fill the W and A fields.
+  // NOTE(review): five inputs are requested but only inputs 0 and 1 are bound
+  // in this constructor - confirm whether the remaining ones are still needed.
+  l2cache_ = CacheSim::Create(sname, CacheSim::Config{
+    !L2_ENABLED,
+    log2ceil(L2_CACHE_SIZE), // C
+    log2ceil(MEM_BLOCK_SIZE), // B
+    log2ceil(L1_LINE_SIZE), // W
+    log2ceil(L2_NUM_WAYS),  // A
+    XLEN,                   // address bits  
+    L2_NUM_BANKS,           // number of banks
+    1,                      // number of ports
+    5,                      // number of inputs
+    true,                   // write-through
+    false,                  // write response
+    0,                      // victim size
+    L2_MSHR_SIZE,           // mshr
+    2,                      // pipeline latency
+  });
+
+  l2cache_->MemReqPort.bind(&this->mem_req_port);
+  this->mem_rsp_port.bind(&l2cache_->MemRspPort);
+
+  snprintf(sname, 100, "cluster%d-icaches", cluster_id);
+  icaches_ = CacheCluster::Create(sname, num_cores, NUM_ICACHES, 1, CacheSim::Config{
+    !ICACHE_ENABLED,
+    log2ceil(ICACHE_SIZE),  // C
+    log2ceil(L1_LINE_SIZE), // B
+    log2ceil(sizeof(uint32_t)), // W
+    log2ceil(ICACHE_NUM_WAYS),// A
+    XLEN,                   // address bits    
+    1,                      // number of banks
+    1,                      // number of ports
+    1,                      // number of inputs
+    true,                   // write-through
+    false,                  // write response
+    0,                      // victim size
+    (uint8_t)arch.num_warps(), // mshr
+    2,                      // pipeline latency
+  });
+
+  icaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(0));
+  l2cache_->CoreRspPorts.at(0).bind(&icaches_->MemRspPort);
+
+  snprintf(sname, 100, "cluster%d-dcaches", cluster_id);
+  dcaches_ = CacheCluster::Create(sname, num_cores, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
+    !DCACHE_ENABLED,
+    log2ceil(DCACHE_SIZE),  // C
+    log2ceil(L1_LINE_SIZE), // B
+    log2ceil(sizeof(Word)), // W
+    log2ceil(DCACHE_NUM_WAYS),// A
+    XLEN,                   // address bits    
+    DCACHE_NUM_BANKS,       // number of banks
+    1,                      // number of ports
+    DCACHE_NUM_BANKS,       // number of inputs
+    true,                   // write-through
+    false,                  // write response
+    0,                      // victim size
+    DCACHE_MSHR_SIZE,       // mshr
+    4,                      // pipeline latency
+  });
+
+  dcaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(1));
+  l2cache_->CoreRspPorts.at(1).bind(&dcaches_->MemRspPort);
+
+  ///////////////////////////////////////////////////////////////////////////
+
+  // create shared memory blocks
+  for (uint32_t i = 0; i < num_cores; ++i) {
+    snprintf(sname, 100, "cluster%d-shared_mem%d", cluster_id, i);
+    sharedmems_.at(i) = SharedMem::Create(sname, SharedMem::Config{
+      (1 << SMEM_LOG_SIZE),
+      sizeof(Word),
+      NUM_LSU_LANES, 
+      NUM_LSU_LANES,
+      false
+    });
+  }
+
+  // create cores
+
+  for (uint32_t i = 0; i < num_cores; ++i) {  
+    // core ids are global across clusters
+    uint32_t core_id = cluster_id * num_cores + i;
+    cores_.at(i) = Core::Create(core_id, 
+                                this, 
+                                arch, 
+                                dcrs, 
+                                sharedmems_.at(i));
+
+    cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
+    icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));      
+
+    // one demux per LSU lane selects between the data caches and shared memory
+    for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) {
+      snprintf(sname, 100, "cluster%d-smem_demux%d_%d", cluster_id, i, j);
+      auto smem_demux = SMemDemux::Create(sname);
+      
+      cores_.at(i)->dcache_req_ports.at(j).bind(&smem_demux->ReqIn);
+      smem_demux->RspIn.bind(&cores_.at(i)->dcache_rsp_ports.at(j));        
+      
+      smem_demux->ReqDc.bind(&dcaches_->CoreReqPorts.at(i).at(j));
+      dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDc);
+
+      smem_demux->ReqSm.bind(&sharedmems_.at(i)->Inputs.at(j));
+      sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSm);
+    }
+  }
+}
+
+// Destructor: no explicit cleanup is performed here.
+Cluster::~Cluster() {
+  //--
+}
+
+// Clears the arrival state of every barrier.
+void Cluster::reset() {  
+  for (auto it = barriers_.begin(); it != barriers_.end(); ++it) {
+    it->reset();
+  }
+}
+
+// No per-cycle work is done at the cluster level.
+void Cluster::tick() {
+  //--
+}
+
+// Hands `ram` to every core of the cluster.
+void Cluster::attach_ram(RAM* ram) {
+  for (auto it = cores_.begin(); it != cores_.end(); ++it) {
+    (*it)->attach_ram(ram);
+  }
+}
+
+// Returns true as soon as one core reports that it is still running.
+bool Cluster::running() const {
+  for (auto it = cores_.begin(); it != cores_.end(); ++it) {
+    if ((*it)->running()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Queries every core for its exit status. Writes the bitwise OR of the exit
+// codes of the cores that have exited to `*exitcode` and returns true only
+// when all cores have exited. Every core is queried, even after one reports
+// that it has not exited.
+bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
+  Word combined = 0;
+  bool all_exited = true;
+  for (auto it = cores_.begin(); it != cores_.end(); ++it) {
+    Word core_code;
+    if (!(*it)->check_exit(&core_code, riscv_test)) {
+      all_exited = false;
+      continue;
+    }
+    combined |= core_code;
+  }
+  *exitcode = combined;
+  return all_exited;
+}
+
+// Registers the arrival of core `core_id` at barrier `bar_id`. Once `count`
+// cores have arrived, every core recorded at the barrier is resumed and the
+// barrier is cleared.
+void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
+  auto& arrivals = barriers_.at(bar_id);
+
+  // arrivals are tracked by the core's index inside this cluster
+  const uint32_t local_core_id = core_id % cores_.size();
+  arrivals.set(local_core_id);
+
+  DP(3, "*** Suspend core #" << core_id << " at barrier #" << bar_id);
+
+  if (arrivals.count() != (size_t)count)
+    return;
+
+  // resume all suspended cores
+  for (uint32_t i = 0; i < cores_.size(); ++i) {
+    if (!arrivals.test(i))
+      continue;
+    DP(3, "*** Resume core #" << i << " at barrier #" << bar_id);
+    cores_.at(i)->resume();
+  }
+  arrivals.reset();
+}
+
+// Returns the processor pointer passed to the constructor.
+ProcessorImpl* Cluster::processor() const {
+  return processor_;
+}
+
+// Collects the performance counters of every memory component of the cluster.
+//
+// The constructor in this file only creates the instruction caches, the data
+// caches and the L2; the three remaining cache clusters are therefore read
+// only when they exist, and their counters otherwise keep the values given by
+// the PerfStats default constructor.
+Cluster::PerfStats Cluster::perf_stats() const {
+  Cluster::PerfStats perf;
+  perf.icache = icaches_->perf_stats();
+  perf.dcache = dcaches_->perf_stats();    
+  if (tcaches_) {
+    perf.tcache = tcaches_->perf_stats();
+  }
+  if (ocaches_) {
+    perf.ocache = ocaches_->perf_stats();
+  }
+  if (rcaches_) {
+    perf.rcache = rcaches_->perf_stats();
+  }
+  perf.l2cache = l2cache_->perf_stats();
+
+  for (const auto& sharedmem : sharedmems_) {
+    perf.sharedmem += sharedmem->perf_stats();
+  }
+  
+  return perf;
+}
--- a/sim/simx/cluster.h
+++ b/sim/simx/cluster.h
@@ -0,0 +1,92 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <simobject.h>
+#include "dcrs.h"
+#include "arch.h"
+#include "cache_cluster.h"
+#include "shared_mem.h"
+#include "core.h"
+#include "constants.h"
+
+namespace vortex {
+
+class ProcessorImpl;
+
+class Cluster : public SimObject<Cluster> {
+public:
+  struct PerfStats {
+    CacheSim::PerfStats   icache;
+    CacheSim::PerfStats   dcache;
+    SharedMem::PerfStats  sharedmem;
+    CacheSim::PerfStats   l2cache;
+    CacheSim::PerfStats   tcache;
+    CacheSim::PerfStats   ocache;
+    CacheSim::PerfStats   rcache;
+    // Member-wise accumulation of another statistics record into this one.
+    PerfStats& operator+=(const PerfStats& rhs) {
+      this->icache      += rhs.icache;
+      this->dcache      += rhs.dcache;
+      this->sharedmem   += rhs.sharedmem;
+      this->l2cache     += rhs.l2cache;
+      this->tcache      += rhs.tcache;
+      this->ocache      += rhs.ocache;
+      this->rcache      += rhs.rcache;
+      return *this;
+    }
+  };
+  // Ports carrying MemReq / MemRsp traffic for this cluster.
+  SimPort<MemReq> mem_req_port;
+  SimPort<MemRsp> mem_rsp_port;
+
+  Cluster(const SimContext& ctx, 
+          uint32_t cluster_id,
+          ProcessorImpl* processor, 
+          const Arch &arch, 
+          const DCRS &dcrs);
+
+  ~Cluster();
+  // Resets every entry of barriers_.
+  void reset();
+  // Per-cycle hook; its definition in cluster.cpp currently has an empty body.
+  void tick();
+  // Passes `ram` on to attach_ram() of each core.
+  void attach_ram(RAM* ram);
+  // True if any core reports running().
+  bool running() const;
+  // True only if every core reports exit; *exitcode receives the OR of the exited cores' codes.
+  bool check_exit(Word* exitcode, bool riscv_test) const;  
+  // Records core_id at barrier bar_id; once `count` cores are recorded, resumes them and clears the barrier.
+  void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
+  // Returns the stored processor_ pointer.
+  ProcessorImpl* processor() const;
+  // Returns the per-cache statistics plus the sum over all shared memories.
+  Cluster::PerfStats perf_stats() const;
+  
+private:
+  uint32_t                     cluster_id_;  
+  std::vector<Core::Ptr>       cores_;  
+  std::vector<CoreMask>        barriers_;    // indexed by barrier id; bits are local core indices
+  CacheSim::Ptr                l2cache_;
+  CacheCluster::Ptr            icaches_;
+  CacheCluster::Ptr            dcaches_;
+  std::vector<SharedMem::Ptr>  sharedmems_;
+  CacheCluster::Ptr            tcaches_;
+  CacheCluster::Ptr            ocaches_;
+  CacheCluster::Ptr            rcaches_;
+  ProcessorImpl*               processor_;
+};
+
+} // namespace vortex
--- a/sim/simx/constants.h
+++ b/sim/simx/constants.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #ifndef RAM_PAGE_SIZE
@@ -10,14 +23,4 @@

 #ifndef MEMORY_BANKS
 #define MEMORY_BANKS 2
-#endif
-
-namespace vortex {
-
-enum Constants {
-
-    SMEM_BANK_OFFSET = log2ceil(sizeof(uint32_t)) + log2ceil(STACK_SIZE / sizeof(uint32_t)),
-
-};
-
-}
+#endif
--- a/sim/simx/core.cpp
+++ b/sim/simx/core.cpp
--- a/sim/simx/core.h
+++ b/sim/simx/core.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include <string>
@@ -11,101 +24,104 @@
 #include <simobject.h>
 #include "debug.h"
 #include "types.h"
-#include "archdef.h"
+#include "arch.h"
 #include "decode.h"
 #include "mem.h"
 #include "warp.h"
 #include "pipeline.h"
-#include "cache.h"
-#include "sharedmem.h"
+#include "cache_sim.h"
+#include "shared_mem.h"
 #include "ibuffer.h"
 #include "scoreboard.h"
-#include "exeunit.h"
-#include "tex_unit.h"
+#include "operand.h"
+#include "dispatcher.h"
+#include "exe_unit.h"
+#include "dcrs.h"

 namespace vortex {

+class Cluster;
+
 class Core : public SimObject<Core> {
 public:
  struct PerfStats {
+    uint64_t cycles;
    uint64_t instrs;
    uint64_t ibuf_stalls;
    uint64_t scrb_stalls;
    uint64_t alu_stalls;
    uint64_t lsu_stalls;
-    uint64_t csr_stalls;
    uint64_t fpu_stalls;
-    uint64_t gpu_stalls;
+    uint64_t sfu_stalls;
+    uint64_t ifetches;
    uint64_t loads;
    uint64_t stores;
-    uint64_t branches;
-    uint64_t mem_reads;
-    uint64_t mem_writes;
-    uint64_t mem_latency;
-    uint64_t tex_reads;
-    uint64_t tex_latency;
+    uint64_t ifetch_latency;
+    uint64_t load_latency;

    PerfStats() 
-      : instrs(0)
+      : cycles(0)
+      , instrs(0)
      , ibuf_stalls(0)
      , scrb_stalls(0)
      , alu_stalls(0)
      , lsu_stalls(0)
-      , csr_stalls(0)
      , fpu_stalls(0)
-      , gpu_stalls(0)
+      , sfu_stalls(0)
+      , ifetches(0)
      , loads(0)
      , stores(0)
-      , branches(0)
-      , mem_reads(0)
-      , mem_writes(0)
-      , mem_latency(0)
-      , tex_reads(0)
-      , tex_latency(0)
+      , ifetch_latency(0)
+      , load_latency(0)
    {}
  };

-  SimPort<MemRsp> MemRspPort;
-  SimPort<MemReq> MemReqPort;
+  std::vector<SimPort<MemReq>> icache_req_ports;
+  std::vector<SimPort<MemRsp>> icache_rsp_ports;
+
+  std::vector<SimPort<MemReq>> dcache_req_ports;
+  std::vector<SimPort<MemRsp>> dcache_rsp_ports;
+
+  Core(const SimContext& ctx, 
+       uint32_t core_id, 
+       Cluster* cluster,
+       const Arch &arch, 
+       const DCRS &dcrs,
+       SharedMem::Ptr  sharedmem);

-  Core(const SimContext& ctx, const ArchDef &arch, uint32_t id);
  ~Core();

-  void attach_ram(RAM* ram);
-
-  bool running() const;
-
  void reset();

  void tick();

+  void attach_ram(RAM* ram);
+
+  bool running() const;
+
+  void resume();
+
  uint32_t id() const {
-    return id_;
+    return core_id_;
  }

-  const Decoder& decoder() {
-    return decoder_;
-  }
-
-  const ArchDef& arch() const {
+  const Arch& arch() const {
    return arch_;
  }

-  const PerfStats& perf_stats() const {
-    return perf_stats_;
-  } 
-
-  uint32_t getIRegValue(int reg) const {
-    return warps_.at(0)->getIRegValue(reg);
+  const DCRS& dcrs() const {
+    return dcrs_;
  }

  uint32_t get_csr(uint32_t addr, uint32_t tid, uint32_t wid);
  
  void set_csr(uint32_t addr, uint32_t value, uint32_t tid, uint32_t wid);

-  WarpMask wspawn(uint32_t num_warps, uint32_t nextPC);
+  void wspawn(uint32_t num_warps, Word nextPC);
  
-  WarpMask barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id);
+  void barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id);
+
+  AddrType get_addr_type(uint64_t addr);

  void icache_read(void* data, uint64_t addr, uint32_t size);

@@ -113,19 +129,22 @@ public:

  void dcache_write(const void* data, uint64_t addr, uint32_t size);

-  uint32_t tex_read(uint32_t unit, uint32_t lod, uint32_t u, uint32_t v, std::vector<mem_addr_size_t>* mem_addrs);
+  void dcache_amo_reserve(uint64_t addr);
+
+  bool dcache_amo_check(uint64_t addr);

  void trigger_ecall();

  void trigger_ebreak();

-  bool check_exit() const;
+  bool check_exit(Word* exitcode, bool riscv_test) const;

 private:

  void schedule();
  void fetch();
  void decode();
+  void issue();
  void execute();
  void commit();
  
@@ -133,49 +152,55 @@ private:

  void cout_flush();

-  uint32_t id_;
-  const ArchDef arch_;
+  uint32_t core_id_;
+  const Arch& arch_;
+  const DCRS &dcrs_;
+  
  const Decoder decoder_;
  MemoryUnit mmu_;
-  RAM smem_;
-  std::vector<TexUnit> tex_units_;

  std::vector<std::shared_ptr<Warp>> warps_;  
-  std::vector<WarpMask> barriers_;  
-  std::vector<uint32_t> csrs_;
+  std::vector<WarpMask> barriers_;
  std::vector<Byte> fcsrs_;
  std::vector<IBuffer> ibuffers_;
  Scoreboard scoreboard_;
+  std::vector<Operand::Ptr> operands_;
+  std::vector<Dispatcher::Ptr> dispatchers_;
  std::vector<ExeUnit::Ptr> exe_units_;
-  Cache::Ptr icache_;
-  Cache::Ptr dcache_;
-  SharedMem::Ptr shared_mem_;
-  Switch<MemReq, MemRsp>::Ptr l1_mem_switch_;
-  std::vector<Switch<MemReq, MemRsp>::Ptr> dcache_switch_;
+  SharedMem::Ptr sharedmem_;

  PipelineLatch fetch_latch_;
  PipelineLatch decode_latch_;
  
  HashTable<pipeline_trace_t*> pending_icache_;
+  std::vector<pipeline_trace_t*> committed_traces_;
  WarpMask active_warps_;
  WarpMask stalled_warps_;
-  uint32_t last_schedule_wid_;
  uint64_t issued_instrs_;
  uint64_t committed_instrs_;
-  uint32_t csr_tex_unit_;
-  bool ecall_;
-  bool ebreak_;
+  bool exited_;
+
+  uint64_t pending_ifetches_;

  std::unordered_map<int, std::stringstream> print_bufs_;
+
+  std::vector<std::vector<CSRs>> csrs_;
  
  PerfStats perf_stats_;
-  uint64_t perf_mem_pending_reads_;
+  
+  Cluster* cluster_;

+  uint32_t commit_exe_;
+
+  friend class Warp;
  friend class LsuUnit;
  friend class AluUnit;
-  friend class CsrUnit;
  friend class FpuUnit;
-  friend class GpuUnit;
+  friend class SfuUnit;
+  friend class TexUnit;
+  friend class RasterAgent;
+  friend class RopAgent;
+  friend class TexAgent;
 };

-} // namespace vortex
+} // namespace vortex
--- a/sim/simx/dcrs.cpp
+++ b/sim/simx/dcrs.cpp
@@ -0,0 +1,28 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dcrs.h"
+#include <iostream>
+
+using namespace vortex;
+
+// Routes a DCR write to the base register bank; aborts on any other address.
+void DCRS::write(uint32_t addr, uint32_t value) {
+  const bool in_base_range = (addr >= VX_DCR_BASE_STATE_BEGIN)
+                          && (addr < VX_DCR_BASE_STATE_END);
+  if (!in_base_range) {
+    std::cout << std::hex << "Error: invalid global DCR addr=0x" << addr << std::endl;
+    std::abort();
+  }
+  base_dcrs.write(addr, value);
+}
--- a/sim/simx/dcrs.h
+++ b/sim/simx/dcrs.h
@@ -0,0 +1,45 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <util.h>
+#include <VX_types.h>
+#include <array>
+
+namespace vortex {
+
+class BaseDCRS { // storage for the base DCR states, indexed through VX_DCR_BASE_STATE()
+public:
+    uint32_t read(uint32_t addr) const {
+        uint32_t state = VX_DCR_BASE_STATE(addr);
+        return states_.at(state); // at() bounds-checks the index
+    }
+    // Stores `value` in the slot that VX_DCR_BASE_STATE(addr) selects (bounds-checked by at()).
+    void write(uint32_t addr, uint32_t value) {
+        uint32_t state = VX_DCR_BASE_STATE(addr);
+        states_.at(state) = value;
+    }
+
+private:
+    std::array<uint32_t, VX_DCR_BASE_STATE_COUNT> states_{}; // zero-filled: a read before the first write returns 0
+};
+
+class DCRS { // top-level container for device configuration registers
+public:
+    void write(uint32_t addr, uint32_t value); // forwards base-range addresses to base_dcrs, aborts otherwise
+    // bank that receives writes with addr in [VX_DCR_BASE_STATE_BEGIN, VX_DCR_BASE_STATE_END)
+    BaseDCRS base_dcrs;
+};
+
+}
--- a/sim/simx/debug.h
+++ b/sim/simx/debug.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #ifndef DEBUG_LEVEL
--- a/sim/simx/decode.cpp
+++ b/sim/simx/decode.cpp
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include <iostream>
 #include <string>
 #include <stdlib.h>
@@ -9,41 +22,36 @@
 #include "debug.h"
 #include "types.h"
 #include "decode.h"
-#include "archdef.h"
+#include "arch.h"
 #include "instr.h"

 using namespace vortex;

-struct InstTableEntry_t {
-  bool controlFlow;
-  InstType iType;
-};
-
-static const std::unordered_map<Opcode, struct InstTableEntry_t> sc_instTable = {
-  {Opcode::NOP,        {false, InstType::N_TYPE}},
-  {Opcode::R_INST,     {false, InstType::R_TYPE}},
-  {Opcode::L_INST,     {false, InstType::I_TYPE}},
-  {Opcode::I_INST,     {false, InstType::I_TYPE}},
-  {Opcode::S_INST,     {false, InstType::S_TYPE}},
-  {Opcode::B_INST,     {true , InstType::B_TYPE}},
-  {Opcode::LUI_INST,   {false, InstType::U_TYPE}},
-  {Opcode::AUIPC_INST, {false, InstType::U_TYPE}},
-  {Opcode::JAL_INST,   {true , InstType::J_TYPE}},
-  {Opcode::JALR_INST,  {true , InstType::I_TYPE}},
-  {Opcode::SYS_INST,   {true , InstType::I_TYPE}},
-  {Opcode::FENCE,      {true , InstType::I_TYPE}},
-  {Opcode::FL,         {false, InstType::I_TYPE}},
-  {Opcode::FS,         {false, InstType::S_TYPE}},
-  {Opcode::FCI,        {false, InstType::R_TYPE}}, 
-  {Opcode::FMADD,      {false, InstType::R4_TYPE}},
-  {Opcode::FMSUB,      {false, InstType::R4_TYPE}},
-  {Opcode::FMNMADD,    {false, InstType::R4_TYPE}},
-  {Opcode::FMNMSUB,    {false, InstType::R4_TYPE}},  
-  {Opcode::VSET,       {false, InstType::V_TYPE}}, 
-  {Opcode::GPGPU,      {false, InstType::R_TYPE}},
-  {Opcode::GPU,        {false, InstType::R4_TYPE}},
-  {Opcode::R_INST_W,   {false, InstType::R_TYPE}},
-  {Opcode::I_INST_W,   {false, InstType::I_TYPE}},
+static const std::unordered_map<Opcode, InstType> sc_instTable = {
+  {Opcode::R_INST,     InstType::R_TYPE},
+  {Opcode::L_INST,     InstType::I_TYPE},
+  {Opcode::I_INST,     InstType::I_TYPE},
+  {Opcode::S_INST,     InstType::S_TYPE},
+  {Opcode::B_INST,     InstType::B_TYPE},
+  {Opcode::LUI_INST,   InstType::U_TYPE},
+  {Opcode::AUIPC_INST, InstType::U_TYPE},
+  {Opcode::JAL_INST,   InstType::J_TYPE},
+  {Opcode::JALR_INST,  InstType::I_TYPE},
+  {Opcode::SYS_INST,   InstType::I_TYPE},
+  {Opcode::FENCE,      InstType::I_TYPE},
+  {Opcode::AMO,        InstType::R_TYPE},
+  {Opcode::FL,         InstType::I_TYPE},
+  {Opcode::FS,         InstType::S_TYPE},
+  {Opcode::FCI,        InstType::R_TYPE}, 
+  {Opcode::FMADD,      InstType::R4_TYPE},
+  {Opcode::FMSUB,      InstType::R4_TYPE},
+  {Opcode::FMNMADD,    InstType::R4_TYPE},
+  {Opcode::FMNMSUB,    InstType::R4_TYPE},  
+  {Opcode::VSET,       InstType::V_TYPE},
+  {Opcode::EXT1,       InstType::R_TYPE},
+  {Opcode::EXT2,       InstType::R4_TYPE},
+  {Opcode::R_INST_W,   InstType::R_TYPE},
+  {Opcode::I_INST_W,   InstType::I_TYPE},
 };

 enum Constants {
@@ -58,6 +66,8 @@ enum Constants {
  width_i_imm = 12,
  width_j_imm = 20,
  width_v_imm = 11,
+  width_aq    = 1,
+  width_rl    = 1,

  shift_opcode= 0,
  shift_rd    = width_opcode,
@@ -72,15 +82,15 @@ enum Constants {
  shift_func6 = shift_func7 + width_vmask,
  shift_vset  = shift_func7 + width_func6,

-  mask_opcode = (1<<width_opcode)-1,  
-  mask_reg    = (1<<width_reg)-1,
-  mask_func2  = (1<<width_func2)-1,
-  mask_func3  = (1<<width_func3)-1,
-  mask_func6  = (1<<width_func6)-1,
-  mask_func7  = (1<<width_func7)-1,
-  mask_i_imm  = (1<<width_i_imm)-1,
-  mask_j_imm  = (1<<width_j_imm)-1,
-  mask_v_imm  = (1<<width_v_imm)-1,
+  mask_opcode = (1 << width_opcode) - 1,  
+  mask_reg    = (1 << width_reg)   - 1,
+  mask_func2  = (1 << width_func2) - 1,
+  mask_func3  = (1 << width_func3) - 1,
+  mask_func6  = (1 << width_func6) - 1,
+  mask_func7  = (1 << width_func7) - 1,
+  mask_i_imm  = (1 << width_i_imm) - 1,
+  mask_j_imm  = (1 << width_j_imm) - 1,
+  mask_v_imm  = (1 << width_v_imm) - 1,
 };

 static const char* op_string(const Instr &instr) {
@@ -92,7 +102,6 @@ static const char* op_string(const Instr &instr) {
  auto imm    = instr.getImm();

  switch (opcode) {
-  case Opcode::NOP:        return "NOP";
  case Opcode::LUI_INST:   return "LUI";
  case Opcode::AUIPC_INST: return "AUIPC";
  case Opcode::R_INST:
@@ -116,7 +125,7 @@ static const char* op_string(const Instr &instr) {
      case 2: return "SLT";
      case 3: return "SLTU";
      case 4: return "XOR";
-      case 5: return func7 ? "SRA" : "SRL";
+      case 5: return (func7 & 0x20) ? "SRA" : "SRL";
      case 6: return "OR";
      case 7: return "AND";
      default:
@@ -130,7 +139,7 @@ static const char* op_string(const Instr &instr) {
    case 2: return "SLTI";
    case 3: return "SLTIU";
    case 4: return "XORI";
-    case 5: return func7 ? "SRAI" : "SRLI";
+    case 5: return (func7 & 0x20) ? "SRAI" : "SRLI";
    case 6: return "ORI";
    case 7: return "ANDI";
    default:
@@ -151,8 +160,8 @@ static const char* op_string(const Instr &instr) {
  case Opcode::JALR_INST:  return "JALR";
  case Opcode::L_INST:
    switch (func3) {
-    case 0: return "LBI";
-    case 1: return "LHI";
+    case 0: return "LB";
+    case 1: return "LH";
    case 2: return "LW";
    case 3: return "LD";
    case 4: return "LBU";
@@ -192,11 +201,11 @@ static const char* op_string(const Instr &instr) {
    }
  case Opcode::I_INST_W:
    switch (func3) {
-      case 0: return "ADDIW";
-      case 1: return "SLLIW";
-      case 5: return func7 ? "SRAIW" : "SRLIW";
-      default:
-        std::abort();
+    case 0: return "ADDIW";
+    case 1: return "SLLIW";
+    case 5: return func7 ? "SRAIW" : "SRLIW";
+    default:
+      std::abort();
    }
  case Opcode::SYS_INST: 
    switch (func3) {
@@ -222,20 +231,59 @@ static const char* op_string(const Instr &instr) {
  case Opcode::FENCE: return "FENCE";
  case Opcode::FL: 
    switch (func3) {
-      case 0x1: return "VL";
-      case 0x2: return "FLW";
-      case 0x3: return "FLD";
-      default: 
-        std::abort();
+    case 0x1: return "VL";
+    case 0x2: return "FLW";
+    case 0x3: return "FLD";
+    default: 
+      std::abort();
    }
  case Opcode::FS: 
    switch (func3) {
-      case 0x1: return "VS";
-      case 0x2: return "FSW";
-      case 0x3: return "FSD";
+    case 0x1: return "VS";
+    case 0x2: return "FSW";
+    case 0x3: return "FSD";
+    default: 
+      std::abort();
+    }
+  case Opcode::AMO: {
+    auto amo_type = func7 >> 2;
+    switch (func3) {
+      case 0x2:
+        switch (amo_type) {
+        case 0x00: return "AMOADD.W";
+        case 0x01: return "AMOSWAP.W";
+        case 0x02: return "LR.W";
+        case 0x03: return "SC.W";
+        case 0x04: return "AMOXOR.W";
+        case 0x08: return "AMOOR.W";
+        case 0x0c: return "AMOAND.W";
+        case 0x10: return "AMOMIN.W";
+        case 0x14: return "AMOMAX.W";
+        case 0x18: return "AMOMINU.W";
+        case 0x1c: return "AMOMAXU.W";
+        default:
+          std::abort();
+        }
+      case 0x3:
+        switch (amo_type) {
+        case 0x00: return "AMOADD.D";
+        case 0x01: return "AMOSWAP.D";
+        case 0x02: return "LR.D";
+        case 0x03: return "SC.D";
+        case 0x04: return "AMOXOR.D";
+        case 0x08: return "AMOOR.D";
+        case 0x0c: return "AMOAND.D";
+        case 0x10: return "AMOMIN.D";
+        case 0x14: return "AMOMAX.D";
+        case 0x18: return "AMOMINU.D";
+        case 0x1c: return "AMOMAXU.D";
+        default:
+          std::abort();
+        }
      default: 
        std::abort();
    }
+  }
  case Opcode::FCI: 
    switch (func7) {
    case 0x00: return "FADD.S";
@@ -332,9 +380,9 @@ static const char* op_string(const Instr &instr) {
      default:
        std::abort();
      }
-    case 0x70: return func3 ? "FCLASS.S" : "FMV.X.W";
+    case 0x70: return func3 ? "FCLASS.S" : "FMV.X.S";
    case 0x71: return func3 ? "FCLASS.D" : "FMV.X.D";
-    case 0x78: return "FMV.W.X";
+    case 0x78: return "FMV.S.X";
    case 0x79: return "FMV.D.X";
    default:
      std::abort();
@@ -344,23 +392,36 @@ static const char* op_string(const Instr &instr) {
  case Opcode::FMNMADD: return func2 ? "FNMADD.D" : "FNMADD.S";
  case Opcode::FMNMSUB: return func2 ? "FNMSUB.D" : "FNMSUB.S";
  case Opcode::VSET:    return "VSET";
-  case Opcode::GPGPU:
-    switch (func3) {            
-    case 0: return "TMC";
-    case 1: return "WSPAWN";
-    case 2: return "SPLIT";
-    case 3: return "JOIN";
-    case 4: return "BAR";
-    case 5: return "PREFETCH";
+  case Opcode::EXT1:
+    switch (func7) {
+    case 0:
+      switch (func3) {            
+      case 0: return "TMC";
+      case 1: return "WSPAWN";
+      case 2: return "SPLIT";
+      case 3: return "JOIN";
+      case 4: return "BAR";
+      case 5: return "PRED";
+      default:
+        std::abort();
+      }
+    case 1:
+      switch (func3) {
+      case 0: return "RASTER";      
+      default:
+        std::abort();
+      }
    default:
      std::abort();
    }
-  case Opcode::GPU:
+  case Opcode::EXT2:
    switch (func3) {
-    case 0: return "TEX";
+    case 0:
+      return "TEX";
    case 1: {
      switch (func2) {
      case 0: return "CMOV";
+      case 1: return "ROP";      
      default:
        std::abort();
      }
@@ -375,43 +436,36 @@ static const char* op_string(const Instr &instr) {

 namespace vortex {
 std::ostream &operator<<(std::ostream &os, const Instr &instr) {  
-  auto opcode = instr.getOpcode();    
-  auto func2  = instr.getFunc2();
+  auto opcode = instr.getOpcode();
  auto func3  = instr.getFunc3();

-  os << op_string(instr) << ": ";
-
-  if (opcode == S_INST 
-   || opcode == FS) {     
-     os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "] <- ";
-     os << instr.getRSType(1) << std::dec << instr.getRSrc(1);
-  } else 
-  if (opcode == L_INST 
-   || opcode == FL) {     
-     os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
-     os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "]";
-  } else {
-    if (instr.getRDType() != RegType::None) {
-      os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
-    }
-    uint32_t i = 0;
-    for (; i < instr.getNRSrc(); ++i) {    
-      if (i) os << ", ";
-      os << instr.getRSType(i) << std::dec << instr.getRSrc(i);
-    }    
-    if (instr.hasImm()) {
-      if (i) os << ", ";
-      os << "imm=0x" << std::hex << instr.getImm();
-    }
-    if (opcode == GPU && func3 == 0) {
-      os << ", unit=" << std::dec << func2;
-    }
+  os << op_string(instr);
+  
+  int sep = 0;
+  if (instr.getRDType() != RegType::None) {
+    if (sep++ != 0) { os << ", "; } else { os << " "; }
+    os << instr.getRDType() << std::dec << instr.getRDest();
+  }
+  for (uint32_t i = 0; i < instr.getNRSrc(); ++i) {    
+    if (instr.getRSType(i) == RegType::None)
+      continue;
+    if (sep++ != 0) { os << ", "; } else { os << " "; }
+    os << instr.getRSType(i) << std::dec << instr.getRSrc(i);
+  }
+  if (instr.hasImm()) {
+    if (sep++ != 0) { os << ", "; } else { os << " "; }
+    os << "0x" << std::hex << instr.getImm();
+  }
+  if (opcode == Opcode::SYS_INST && func3 >= 5) {
+    // CSRs with immediate values
+    if (sep++ != 0) { os << ", "; } else { os << " "; }
+    os << "0x" << std::hex << instr.getRSrc(0);
  }
  return os;
 }
 }

-Decoder::Decoder(const ArchDef&) {}
+Decoder::Decoder(const Arch&) {}

 std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {  
  auto instr = std::make_shared<Instr>();
@@ -434,7 +488,7 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
    return nullptr;
  }

-  auto iType = op_it->second.iType;
+  auto iType = op_it->second;
  if (op == Opcode::FL || op == Opcode::FS) { 
    if (func3 != 0x2 && func3 != 0x3) {
      iType = InstType::V_TYPE;
@@ -442,57 +496,97 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
  }

  switch (iType) {
-  case InstType::N_TYPE:
-    break;
-
  case InstType::R_TYPE:
-    if (op == Opcode::FCI) {
-      switch (func7) {      
+    switch (op) {
+    case Opcode::FCI:
+      switch (func7) {  
+      case 0x2c: // FSQRT.S
+      case 0x2d: // FSQRT.D
+        instr->setDestReg(rd, RegType::Float);
+        instr->addSrcReg(rs1, RegType::Float);
+        break;    
      case 0x50: // FLE.S, FLT.S, FEQ.S
      case 0x51: // FLE.D, FLT.D, FEQ.D
        instr->setDestReg(rd, RegType::Integer);
-        instr->setSrcReg(rs1, RegType::Float);
-        instr->setSrcReg(rs2, RegType::Float);
+        instr->addSrcReg(rs1, RegType::Float);
+        instr->addSrcReg(rs2, RegType::Float);
        break;
      case 0x60: // FCVT.W.D, FCVT.WU.D, FCVT.L.D, FCVT.LU.D
      case 0x61: // FCVT.WU.S, FCVT.W.S, FCVT.L.S, FCVT.LU.S
        instr->setDestReg(rd, RegType::Integer);
-        instr->setSrcReg(rs1, RegType::Float);
-        instr->setSrcReg(rs2, RegType::Integer);
+        instr->addSrcReg(rs1, RegType::Float);
+        instr->addSrcReg(rs2, RegType::None);
        break;
      case 0x68: // FCVT.S.W, FCVT.S.WU, FCVT.S.L, FCVT.S.LU
      case 0x69: // FCVT.D.W, FCVT.D.WU, FCVT.D.L, FCVT.D.LU
        instr->setDestReg(rd, RegType::Float);
-        instr->setSrcReg(rs1, RegType::Integer);
-        instr->setSrcReg(rs2, RegType::Integer);
+        instr->addSrcReg(rs1, RegType::Integer);
+        instr->addSrcReg(rs2, RegType::None);
        break;
-      case 0x70: // FCLASS.S, FMV.X.W
+      case 0x70: // FCLASS.S, FMV.X.S
      case 0x71: // FCLASS.D, FMV.X.D        
        instr->setDestReg(rd, RegType::Integer);
-        instr->setSrcReg(rs1, RegType::Float);
+        instr->addSrcReg(rs1, RegType::Float);
        break;
-      case 0x78: // FMV.W.X
+      case 0x78: // FMV.S.X
      case 0x79: // FMV.D.X        
        instr->setDestReg(rd, RegType::Float);
-        instr->setSrcReg(rs1, RegType::Integer);
+        instr->addSrcReg(rs1, RegType::Integer);
        break;
      default:
        instr->setDestReg(rd, RegType::Float);
-        instr->setSrcReg(rs1, RegType::Float);
-        instr->setSrcReg(rs2, RegType::Float);        
+        instr->addSrcReg(rs1, RegType::Float);
+        instr->addSrcReg(rs2, RegType::Float);        
        break;
      }
-    } else {
+      break;
+    case Opcode::EXT1:
+      switch (func7) {
+      case 0:
+        switch (func3) {         
+        case 0: // TMC
+        case 3: // JOIN
+          instr->addSrcReg(rs1, RegType::Integer);
+          break;
+        case 1: // WSPAWN        
+        case 4: // BAR
+        case 5: // PRED
+          instr->addSrcReg(rs1, RegType::Integer);
+          instr->addSrcReg(rs2, RegType::Integer);
+          break;
+        case 2: // SPLIT
+          instr->setDestReg(rd, RegType::Integer);
+          instr->addSrcReg(rs1, RegType::Integer);
+          break;
+        default:
+          std::abort();
+        }
+        break;
+      case 1:
+        switch (func3) {
+        case 0: // RASTER
+          instr->setDestReg(rd, RegType::Integer);
+          break;
+        default:
+          std::abort();
+        }
+        break;
+      default:
+        std::abort();
+      }
+      break;
+    default:
      instr->setDestReg(rd, RegType::Integer);
-      instr->setSrcReg(rs1, RegType::Integer);
-      instr->setSrcReg(rs2, RegType::Integer);
+      instr->addSrcReg(rs1, RegType::Integer);
+      instr->addSrcReg(rs2, RegType::Integer);
+      break;
    }
    instr->setFunc3(func3);
    instr->setFunc7(func7);
    break;

  case InstType::I_TYPE: {
-    instr->setSrcReg(rs1, RegType::Integer);
+    instr->addSrcReg(rs1, RegType::Integer);
    if (op == Opcode::FL) {
      instr->setDestReg(rd, RegType::Float);      
    } else {
@@ -503,15 +597,23 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
    switch (op) {
    case Opcode::SYS_INST:
      if (func3 != 0) {
-        // RV32I: CSR*
-        instr->setDestReg(rd, RegType::Integer);
-      }
+        // RV32I: CSR
+        if (func3 >= 5) {
+          // rs1 holds zimm
+          instr->setSrcReg(0, rs1, RegType::None);
+        }        
+      } else {        
+        instr->setDestReg(rd, RegType::None);
+        instr->setSrcReg(0, rs1, RegType::None);
+      }      
      // uint12
      instr->setImm(code >> shift_rs2);
      break;
    case Opcode::FENCE:
      // uint12
      instr->setImm(code >> shift_rs2);
+      instr->setDestReg(rd, RegType::None);
+      instr->setSrcReg(0, rs1, RegType::None);
      break;
    case Opcode::I_INST:
    case Opcode::I_INST_W:
@@ -538,11 +640,11 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
    }
  } break;
  case InstType::S_TYPE: {    
-    instr->setSrcReg(rs1, RegType::Integer);
+    instr->addSrcReg(rs1, RegType::Integer);
    if (op == Opcode::FS) {
-      instr->setSrcReg(rs2, RegType::Float);
+      instr->addSrcReg(rs2, RegType::Float);
    } else {
-      instr->setSrcReg(rs2, RegType::Integer);
+      instr->addSrcReg(rs2, RegType::Integer);
    }
    instr->setFunc3(func3);
    auto imm = (func7 << width_reg) | rd;
@@ -550,8 +652,8 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
  } break;

  case InstType::B_TYPE: {
-    instr->setSrcReg(rs1, RegType::Integer);
-    instr->setSrcReg(rs2, RegType::Integer);
+    instr->addSrcReg(rs1, RegType::Integer);
+    instr->addSrcReg(rs2, RegType::Integer);
    instr->setFunc3(func3);
    auto bit_11   = rd & 0x1;
    auto bits_4_1 = rd >> 1;
@@ -581,8 +683,8 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
  case InstType::V_TYPE:
    switch (op) {
    case Opcode::VSET: {
-      instr->setDestVReg(rd);
-      instr->setSrcVReg(rs1);
+      instr->setDestReg(rd, RegType::Vector);
+      instr->addSrcReg(rs1, RegType::Vector);
      instr->setFunc3(func3);
      if (func3 == 7) {
        instr->setImm(!(code >> shift_vset));
@@ -593,20 +695,20 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
          instr->setVediv((immed >> 4) & 0x3);
          instr->setVsew((immed >> 2) & 0x3);
        } else {
-          instr->setSrcVReg(rs2);
+          instr->addSrcReg(rs2, RegType::Vector);
        }
      } else {
-        instr->setSrcVReg(rs2);
+        instr->addSrcReg(rs2, RegType::Vector);
        instr->setVmask((code >> shift_func7) & 0x1);
        instr->setFunc6(func6);
      }
    } break;

    case Opcode::FL:
-      instr->setDestVReg(rd);
-      instr->setSrcVReg(rs1);
+      instr->setDestReg(rd, RegType::Vector);
+      instr->addSrcReg(rs1, RegType::Vector);
      instr->setVlsWidth(func3);
-      instr->setSrcVReg(rs2);
+      instr->addSrcReg(rs2, RegType::Vector);
      instr->setVmask(code >> shift_func7);
      instr->setVmop((code >> shift_vmop) & mask_func3);
      instr->setVnf((code >> shift_vnf) & mask_func3);
@@ -614,9 +716,9 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {

    case Opcode::FS:
      instr->setVs3(rd);
-      instr->setSrcVReg(rs1);
+      instr->addSrcReg(rs1, RegType::Vector);
      instr->setVlsWidth(func3);
-      instr->setSrcVReg(rs2);
+      instr->addSrcReg(rs2, RegType::Vector);
      instr->setVmask(code >> shift_func7);
      instr->setVmop((code >> shift_vmop) & mask_func3);
      instr->setVnf((code >> shift_vnf) & mask_func3);
@@ -627,16 +729,28 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
    }
    break;
  case R4_TYPE:
-    if (op == Opcode::GPU) {
-      instr->setDestReg(rd, RegType::Integer);
-      instr->setSrcReg(rs1, RegType::Integer);
-      instr->setSrcReg(rs2, RegType::Integer);
-      instr->setSrcReg(rs3, RegType::Integer);
+    if (op == Opcode::EXT2) {
+      switch (func3) {
+      case 1:
+        switch (func2) {
+        case 0: // CMOV
+          instr->setDestReg(rd, RegType::Integer);
+          instr->addSrcReg(rs1, RegType::Integer);
+          instr->addSrcReg(rs2, RegType::Integer);
+          instr->addSrcReg(rs3, RegType::Integer);
+          break;
+        default:
+          std::abort();
+        }
+        break;
+      default:
+        std::abort();
+      }
    } else {
      instr->setDestReg(rd, RegType::Float);
-      instr->setSrcReg(rs1, RegType::Float);
-      instr->setSrcReg(rs2, RegType::Float);
-      instr->setSrcReg(rs3, RegType::Float);
+      instr->addSrcReg(rs1, RegType::Float);
+      instr->addSrcReg(rs2, RegType::Float);
+      instr->addSrcReg(rs3, RegType::Float);
    }
    instr->setFunc2(func2);
    instr->setFunc3(func3);
--- a/sim/simx/decode.h
+++ b/sim/simx/decode.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include <vector>
@@ -5,12 +18,12 @@

 namespace vortex {

-class ArchDef;
+class Arch;
 class Instr;

 class Decoder {
 public:
-  Decoder(const ArchDef &);    
+  Decoder(const Arch &);    
  
  std::shared_ptr<Instr> decode(uint32_t code) const;
 };
--- a/sim/simx/dispatcher.h
+++ b/sim/simx/dispatcher.h
@@ -0,0 +1,141 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "pipeline.h"
+#include <queue>
+
+namespace vortex {
+
+class Dispatcher : public SimObject<Dispatcher> { // drains per-issue-slot queues and forwards traces to Outputs, one batch of block_size_ slots at a time
+public:
+    std::vector<SimPort<pipeline_trace_t*>> Outputs; // ISSUE_WIDTH output ports, indexed by issue slot
+
+    Dispatcher(const SimContext& ctx, const Arch& arch, uint32_t buf_size, uint32_t block_size, uint32_t num_lanes) 
+        : SimObject<Dispatcher>(ctx, "Dispatcher") 
+        , Outputs(ISSUE_WIDTH, this)
+        , Inputs_(ISSUE_WIDTH, this)
+        , arch_(arch)
+        , queues_(ISSUE_WIDTH, std::queue<pipeline_trace_t*>())
+        , buf_size_(buf_size)        // per-queue capacity enforced by push()
+        , block_size_(block_size)        // number of issue slots served per tick()
+        , num_lanes_(num_lanes)        // threads carried by one dispatched packet
+        , batch_count_(ISSUE_WIDTH / block_size) // NOTE(review): integer division; presumably block_size is non-zero and divides ISSUE_WIDTH - confirm
+        , pid_count_(arch.num_threads() / num_lanes) // packets needed to cover all threads of a warp
+        , batch_idx_(0)
+        , start_p_(block_size, 0)
+    {}
+    
+    virtual ~Dispatcher() {}
+
+    virtual void reset() { // restarts batch selection and per-slot packet progress; queues_ are not cleared here
+        batch_idx_ = 0;
+        for (uint32_t b = 0; b < block_size_; ++b) {
+            start_p_.at(b) = 0;
+        }
+    }
+
+    virtual void tick() { // stage 1: move queue heads into Inputs_; stage 2: dispatch the current batch to Outputs
+        for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
+            auto& queue = queues_.at(i);
+            if (queue.empty())
+                continue;
+            auto trace = queue.front();
+            Inputs_.at(i).send(trace, 1);
+            queue.pop();
+        }
+
+        uint32_t block_sent = 0; // slots of the current batch that have nothing left to send
+        for (uint32_t b = 0; b < block_size_; ++b) {
+            uint32_t i = batch_idx_ * block_size_ + b; // issue slot covered by batch entry b
+            auto& input = Inputs_.at(i);            
+            if (input.empty()) {
+                ++block_sent; // an idle slot counts as done
+                continue;
+            }
+            auto& output = Outputs.at(i);
+            auto trace = input.front();
+            if (pid_count_ != 1) { // the warp is wider than num_lanes_: split the trace into per-packet copies
+                auto start_p = start_p_.at(b);
+                if (start_p == -1) { // this slot already emitted its last packet; wait for the rest of the batch
+                    ++block_sent;
+                    continue;       
+                }             
+                int start(-1), end(-1); // first and last active thread at or after packet start_p
+                for (uint32_t j = start_p * num_lanes_, n = arch_.num_threads(); j < n; ++j) {
+                    if (!trace->tmask.test(j))
+                        continue;
+                    if (start == -1)
+                        start = j;
+                    end = j;
+                }                
+                start /= num_lanes_; // thread index -> packet index; NOTE(review): if no thread is active start stays -1 and this int/uint32_t division is evaluated unsigned - confirm tmask is never empty here
+                end /= num_lanes_;
+                auto new_trace = new pipeline_trace_t(*trace); // each packet is sent as its own heap-allocated copy
+                new_trace->tmask.reset();
+                for (int j = start * num_lanes_, n = j + num_lanes_; j < n; ++j) {
+                    new_trace->tmask[j] = trace->tmask[j]; // keep only the thread bits of packet 'start'
+                }                
+                new_trace->pid = start;
+                new_trace->sop = (start_p == 0); // start-of-packet flag: set only when the scan began at packet 0
+                if (start == end) { // no active thread beyond this packet: this is the last one
+                    new_trace->eop = 1;
+                    start_p_.at(b) = -1;
+                    input.pop();
+                    ++block_sent;
+                    delete trace; // the original is fully replaced by the copies sent downstream
+                } else {
+                    new_trace->eop = 0;
+                    start_p_.at(b) = start + 1; // resume scanning after this packet on the next tick
+                }                
+                output.send(new_trace, 1);
+                DT(3, "pipeline-dispatch: " << *new_trace);
+            } else { // a single packet covers the whole warp: forward the trace unchanged
+                trace->pid = 0;
+                input.pop();
+                output.send(trace, 1);
+                DT(3, "pipeline-dispatch: " << *trace);
+                ++block_sent;
+            }            
+        }
+        if (block_sent == block_size_) { // the whole batch is done: advance to the next batch, wrapping around
+            batch_idx_ = (batch_idx_ + 1) % batch_count_;
+            for (uint32_t b = 0; b < block_size_; ++b) {
+                start_p_.at(b) = 0;
+            }
+        }
+    };
+
+    bool push(uint32_t issue_index, pipeline_trace_t* trace) { // enqueues trace for the given issue slot; returns false when that queue already holds buf_size_ entries
+        auto& queue = queues_.at(issue_index);
+        if (queue.size() >= buf_size_)
+            return false;
+        queue.push(trace);        
+        return true;
+    }
+
+private:
+    std::vector<SimPort<pipeline_trace_t*>> Inputs_; // internal ports fed from queues_ by tick()
+    const Arch& arch_;
+    std::vector<std::queue<pipeline_trace_t*>> queues_; // one bounded queue per issue slot, filled by push()
+    uint32_t buf_size_;
+    uint32_t block_size_;
+    uint32_t num_lanes_;
+    uint32_t batch_count_;
+    uint32_t pid_count_;
+    uint32_t batch_idx_; // batch currently being dispatched
+    std::vector<int> start_p_; // per batch entry: next packet index to scan from, or -1 once the last packet was sent
+};
+
+}
--- a/sim/simx/exe_unit.cpp
+++ b/sim/simx/exe_unit.cpp
@@ -0,0 +1,341 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "exe_unit.h"
+#include <iostream>
+#include <iomanip>
+#include <string.h>
+#include <assert.h>
+#include <util.h>
+#include "debug.h"
+#include "core.h"
+#include "constants.h"
+#include "cache_sim.h"
+
+using namespace vortex;
+
+AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {} // registers the unit under the name "ALU"; no ALU-specific state
+    
+void AluUnit::tick() {    // forwards at most one trace per issue slot to the matching output, with a send argument chosen by alu_type
+    for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
+        auto& input = Inputs.at(i);
+        if (input.empty()) 
+            continue;
+        auto& output = Outputs.at(i);
+        auto trace = input.front();
+        switch (trace->alu_type) {
+        case AluType::ARITH:        
+        case AluType::BRANCH:
+        case AluType::SYSCALL:
+        case AluType::IMUL: // NOTE(review): ARITH/BRANCH/SYSCALL fall through and share the IMUL latency - confirm this is intended
+            output.send(trace, LATENCY_IMUL+1);
+            break;
+        case AluType::IDIV:
+            output.send(trace, XLEN+1);
+            break;
+        default:
+            std::abort(); // any other alu_type is treated as fatal
+        }
+        DT(3, "pipeline-execute: op=" << trace->alu_type << ", " << *trace);
+        if (trace->eop && trace->fetch_stall) { // only the packet flagged eop releases the warp marked as stalled
+            assert(core_->stalled_warps_.test(trace->wid));
+            core_->stalled_warps_.reset(trace->wid);
+        }
+        auto time = input.pop(); // presumably the cycle at which the trace was queued - TODO confirm against SimPort::pop
+        core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time);
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {} // registers the unit under the name "FPU"; no FPU-specific state
+    
+void FpuUnit::tick() { // forwards at most one trace per issue slot to the matching output, with a send argument chosen by fpu_type
+    for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
+        auto& input = Inputs.at(i);
+        if (input.empty()) 
+            continue;
+        auto& output = Outputs.at(i);
+        auto trace = input.front();
+        switch (trace->fpu_type) {
+        case FpuType::FNCP:
+            output.send(trace, 2); // the only type with a hard-coded value instead of a LATENCY_* constant
+            break;
+        case FpuType::FMA:
+            output.send(trace, LATENCY_FMA+1);
+            break;
+        case FpuType::FDIV:
+            output.send(trace, LATENCY_FDIV+1);
+            break;
+        case FpuType::FSQRT:
+            output.send(trace, LATENCY_FSQRT+1);
+            break;
+        case FpuType::FCVT:
+            output.send(trace, LATENCY_FCVT+1);
+            break;
+        default:
+            std::abort(); // any other fpu_type is treated as fatal
+        }    
+        DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace);
+        auto time = input.pop(); // presumably the cycle at which the trace was queued - TODO confirm against SimPort::pop
+        core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+LsuUnit::LsuUnit(const SimContext& ctx, Core* core) // registers the unit under the name "LSU" and sets its bookkeeping to an idle state
+    : ExeUnit(ctx, core, "LSU")
+    , pending_rd_reqs_(LSUQ_SIZE)
+    , num_lanes_(NUM_LSU_LANES)     
+    , pending_loads_(0)
+    , fence_lock_(false) // fence_state_ is not initialized here; tick() assigns it together with fence_lock_ = true before reading it
+    , input_idx_(0)
+{}
+
+void LsuUnit::reset() { // drops outstanding request bookkeeping and any fence lock; input_idx_ and fence_state_ are left as they are
+    pending_rd_reqs_.clear();
+    pending_loads_ = 0;
+    fence_lock_ = false;
+}
+
+void LsuUnit::tick() {    // per cycle: retire memory responses, release a pending fence, then issue at most one input trace
+    core_->perf_stats_.load_latency += pending_loads_; // accumulates the number of outstanding requests every cycle
+
+    // handle dcache response    
+    for (uint32_t t = 0; t < num_lanes_; ++t) {
+        auto& dcache_rsp_port = core_->dcache_rsp_ports.at(t);
+        if (dcache_rsp_port.empty())
+            continue;
+        auto& mem_rsp = dcache_rsp_port.front();
+        auto& entry = pending_rd_reqs_.at(mem_rsp.tag);          
+        auto trace = entry.trace;
+        DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type 
+            << ", tid=" << t << ", " << *trace);  
+        assert(entry.count);
+        --entry.count; // track remaining addresses 
+        if (0 == entry.count) { // last outstanding address: the trace can leave the unit
+            int iw = trace->wid % ISSUE_WIDTH; // issue slot is derived from the warp id
+            auto& output = Outputs.at(iw);
+            output.send(trace, 1);
+            pending_rd_reqs_.release(mem_rsp.tag);
+        } 
+        dcache_rsp_port.pop();
+        --pending_loads_;
+    }
+
+    // handle shared memory response
+    for (uint32_t t = 0; t < num_lanes_; ++t) {
+        auto& smem_rsp_port = core_->sharedmem_->Outputs.at(t);
+        if (smem_rsp_port.empty())
+            continue;
+        auto& mem_rsp = smem_rsp_port.front();
+        auto& entry = pending_rd_reqs_.at(mem_rsp.tag);          
+        auto trace = entry.trace;
+        DT(3, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type << ", tid=" << t << ", " << *trace);
+        assert(entry.count);
+        --entry.count; // track remaining addresses 
+        if (0 == entry.count) { // same completion rule as the dcache path above
+            int iw = trace->wid % ISSUE_WIDTH;
+            auto& output = Outputs.at(iw);
+            output.send(trace, 1);
+            pending_rd_reqs_.release(mem_rsp.tag);
+        } 
+        smem_rsp_port.pop();  
+        --pending_loads_;
+    }
+
+    if (fence_lock_) {
+        // wait for all pending memory operations to complete
+        if (!pending_rd_reqs_.empty())
+            return; // note: this early exit also skips the input_idx_ advance at the end of tick()
+        int iw = fence_state_->wid % ISSUE_WIDTH;
+        auto& output = Outputs.at(iw);
+        output.send(fence_state_, 1);
+        fence_lock_ = false;
+        DT(3, "fence-unlock: " << *fence_state_); // log the trace itself (as fence-lock does), not its pointer value
+    }    
+
+    // check input queue
+    for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
+        int iw = (input_idx_ + i) % ISSUE_WIDTH; // scan slots starting at input_idx_
+        auto& input = Inputs.at(iw);
+        if (input.empty())
+            continue;
+        auto& output = Outputs.at(iw);
+        auto trace = input.front();
+        auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
+
+        auto t0 = trace->pid * num_lanes_; // first thread index covered by this packet
+
+        if (trace->lsu_type == LsuType::FENCE) {
+            // schedule fence lock
+            fence_state_ = trace;
+            fence_lock_ = true;        
+            DT(3, "fence-lock: " << *trace);
+            // remove input
+            auto time = input.pop(); 
+            core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
+            break;
+        }
+
+        // check pending queue capacity    
+        if (pending_rd_reqs_.full()) {
+            if (!trace->log_once(true)) {
+                DT(3, "*** " << this->name() << "-lsu-queue-stall: " << *trace);
+            }
+            break; // the trace stays at the head of its input and is retried on a later tick
+        } else {
+            trace->log_once(false);
+        }
+        
+        bool is_write = (trace->lsu_type == LsuType::STORE);
+
+        // duplicates detection
+        bool is_dup = false;
+        if (trace->tmask.test(t0)) {
+            uint64_t addr_mask = sizeof(uint32_t)-1; // compare addresses at 4-byte word granularity
+            auto addr0 = trace_data->mem_addrs.at(0).addr & ~addr_mask; // same width as the lane addresses below, so no truncation before the comparison
+            uint32_t matches = 1;
+            for (uint32_t t = 1; t < num_lanes_; ++t) {
+                if (!trace->tmask.test(t0 + t))
+                    continue;
+                auto mem_addr = trace_data->mem_addrs.at(t).addr & ~addr_mask;
+                matches += (addr0 == mem_addr);
+            }
+            is_dup = (matches == trace->tmask.count()); // every active thread hits the same word
+        }
+
+        uint32_t addr_count; // number of responses to wait for before the trace completes
+        if (is_dup) {
+            addr_count = 1;
+        } else {
+            addr_count = trace->tmask.count();
+        }
+
+        auto tag = pending_rd_reqs_.allocate({trace, addr_count});
+
+        for (uint32_t t = 0; t < num_lanes_; ++t) {
+            if (!trace->tmask.test(t0 + t))
+                continue;
+            
+            auto& dcache_req_port = core_->dcache_req_ports.at(t);
+            auto mem_addr = trace_data->mem_addrs.at(t);
+            auto type = core_->get_addr_type(mem_addr.addr);
+
+            MemReq mem_req;
+            mem_req.addr  = mem_addr.addr;
+            mem_req.write = is_write;
+            mem_req.type  = type; 
+            mem_req.tag   = tag;
+            mem_req.cid   = trace->cid;
+            mem_req.uuid  = trace->uuid;        
+                
+            dcache_req_port.send(mem_req, 2);
+            DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag 
+                << ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.type << ", " << *trace);
+
+            ++pending_loads_; // NOTE(review): also incremented (and counted in 'loads') for stores - confirm stores receive responses
+            ++core_->perf_stats_.loads;        
+            if (is_dup)
+                break; // a single request serves all threads when the addresses coincide
+        }
+
+        // do not wait on writes
+        if (is_write) {
+            pending_rd_reqs_.release(tag);
+            output.send(trace, 1);
+            ++core_->perf_stats_.stores;
+        }
+
+        // remove input
+        auto time = input.pop();
+        core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
+
+        break; // single block
+    }
+    ++input_idx_; // rotate the starting slot for the next tick
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+SfuUnit::SfuUnit(const SimContext& ctx, Core* core) // registers the unit under the name "SFU"
+    : ExeUnit(ctx, core, "SFU"), input_idx_(0) // input_idx_ is read by tick() for slot rotation, so it must not start indeterminate
+{}
+    
+void SfuUnit::tick() { // per cycle: forward pending responses, then issue at most one input trace
+    // handle pending responses
+    for (auto pending_rsp : pending_rsps_) {
+        if (pending_rsp->empty())
+            continue;
+        auto trace = pending_rsp->front();
+        if (trace->cid != core_->id()) // the head response carries another core id: leave it in place
+            continue;
+        int iw = trace->wid % ISSUE_WIDTH; // issue slot is derived from the warp id
+        auto& output = Outputs.at(iw);
+        output.send(trace, 1);
+        pending_rsp->pop();
+    }
+
+    // check input queue
+    for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
+        int iw = (input_idx_ + i) % ISSUE_WIDTH;        // scan slots starting at input_idx_
+        auto& input = Inputs.at(iw);
+        if (input.empty())
+            continue;
+        auto& output = Outputs.at(iw);
+        auto trace = input.front();
+        auto sfu_type = trace->sfu_type;
+        bool release_warp = trace->fetch_stall; // whether to clear the warp's stalled flag after issuing
+
+        switch  (sfu_type) {
+        case SfuType::TMC: 
+        case SfuType::WSPAWN:
+        case SfuType::SPLIT:
+        case SfuType::JOIN:
+        case SfuType::PRED:
+        case SfuType::CSRRW:
+        case SfuType::CSRRS:
+        case SfuType::CSRRC:
+            output.send(trace, 1);
+            break;
+        case SfuType::BAR: {
+            output.send(trace, 1);
+            auto trace_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
+            if (trace->eop) { // only the packet flagged eop notifies the core barrier
+                core_->barrier(trace_data->bar.id, trace_data->bar.count, trace->wid);
+            }
+            release_warp = false; // the stalled flag is not cleared here for barriers
+        }   break;
+        case SfuType::CMOV:
+            output.send(trace, 3);
+            break;
+        default:
+            std::abort(); // any other sfu_type is treated as fatal
+        }
+
+        DT(3, "pipeline-execute: op=" << trace->sfu_type << ", " << *trace);
+        if (trace->eop && release_warp)  {
+            assert(core_->stalled_warps_.test(trace->wid));
+            core_->stalled_warps_.reset(trace->wid);
+        }
+
+        auto time = input.pop(); // presumably the cycle at which the trace was queued - TODO confirm against SimPort::pop
+        auto stalls = (SimPlatform::instance().cycles() - time);
+
+        core_->perf_stats_.sfu_stalls += stalls;
+
+        break; // single block
+    }
+    ++input_idx_; // rotate the starting slot for the next tick
+}
--- a/sim/simx/exe_unit.h
+++ b/sim/simx/exe_unit.h
@@ -1,8 +1,21 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include <simobject.h>
 #include "pipeline.h"
-#include "cache.h"
+#include "cache_sim.h"

 namespace vortex {

@@ -10,13 +23,13 @@ class Core;

 class ExeUnit : public SimObject<ExeUnit> {
 public:
-    SimPort<pipeline_trace_t*> Input;
-    SimPort<pipeline_trace_t*> Output;
+    std::vector<SimPort<pipeline_trace_t*>> Inputs;
+    std::vector<SimPort<pipeline_trace_t*>> Outputs;

    ExeUnit(const SimContext& ctx, Core* core, const char* name) 
        : SimObject<ExeUnit>(ctx, name) 
-        , Input(this)
-        , Output(this)
+        , Inputs(ISSUE_WIDTH, this)
+        , Outputs(ISSUE_WIDTH, this)
        , core_(core)
    {}
    
@@ -32,28 +45,25 @@ protected:

 ///////////////////////////////////////////////////////////////////////////////

-class NopUnit : public ExeUnit {
-public:
-    NopUnit(const SimContext& ctx, Core*);
-    
-    void tick();
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
 class LsuUnit : public ExeUnit {
-private:    
-    uint32_t num_threads_;
-    HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_rd_reqs_;
-    pipeline_trace_t* fence_state_;
-    bool fence_lock_;
-
 public:
    LsuUnit(const SimContext& ctx, Core*);

    void reset();

    void tick();
+
+private:    
+    struct pending_req_t {
+      pipeline_trace_t* trace;
+      uint32_t count;
+    };
+    HashTable<pending_req_t> pending_rd_reqs_;    
+    uint32_t num_lanes_;
+    pipeline_trace_t* fence_state_;
+    uint64_t pending_loads_;
+    bool fence_lock_;
+    uint32_t input_idx_;
 };

 ///////////////////////////////////////////////////////////////////////////////
@@ -67,15 +77,6 @@ public:

 ///////////////////////////////////////////////////////////////////////////////

-class CsrUnit : public ExeUnit {
-public:
-    CsrUnit(const SimContext& ctx, Core*);
-    
-    void tick();
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
 class FpuUnit : public ExeUnit {
 public:
    FpuUnit(const SimContext& ctx, Core*);
@@ -85,19 +86,15 @@ public:

 ///////////////////////////////////////////////////////////////////////////////

-class GpuUnit : public ExeUnit {
-private:
-    uint32_t num_threads_;
-    HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_tex_reqs_;
-
-    bool processTexRequest(pipeline_trace_t* trace);
-    
+class SfuUnit : public ExeUnit {
 public:
-    GpuUnit(const SimContext& ctx, Core*);
-
-    void reset();
+    SfuUnit(const SimContext& ctx, Core*);
    
    void tick();
+
+private:
+  std::vector<SimPort<pipeline_trace_t*>*> pending_rsps_;
+  uint32_t input_idx_;
 };

 }
--- a/sim/simx/execute.cpp
+++ b/sim/simx/execute.cpp
--- a/sim/simx/exeunit.cpp
+++ b/sim/simx/exeunit.cpp
@@ -1,383 +0,0 @@
-#include "exeunit.h"
-#include <iostream>
-#include <iomanip>
-#include <string.h>
-#include <assert.h>
-#include <util.h>
-#include "debug.h"
-#include "core.h"
-#include "constants.h"
-
-using namespace vortex;
-
-NopUnit::NopUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "NOP") {}
-    
-void NopUnit::tick() {
-    if (Input.empty()) 
-        return;
-    auto trace = Input.front();
-    Output.send(trace, 1);
-    Input.pop();
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-LsuUnit::LsuUnit(const SimContext& ctx, Core* core) 
-    : ExeUnit(ctx, core, "LSU")
-    , num_threads_(core->arch().num_threads()) 
-    , pending_rd_reqs_(LSUQ_SIZE)
-    , fence_lock_(false)
-{}
-
-void LsuUnit::reset() {
-    pending_rd_reqs_.clear();
-    fence_lock_ = false;
-}
-
-void LsuUnit::tick() {
-    // handle dcache response
-    for (uint32_t t = 0; t < num_threads_; ++t) {
-        auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0);
-        if (dcache_rsp_port.empty())
-            continue;
-        auto& mem_rsp = dcache_rsp_port.front();
-        auto& entry = pending_rd_reqs_.at(mem_rsp.tag);          
-        auto trace = entry.first;
-        DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type 
-            << ", tid=" << t << ", " << *trace);  
-        assert(entry.second);
-        --entry.second; // track remaining blocks 
-        if (0 == entry.second) {
-            Output.send(trace, 1);
-            pending_rd_reqs_.release(mem_rsp.tag);
-        } 
-        dcache_rsp_port.pop();  
-    }
-
-    // handle shared memory response
-    for (uint32_t t = 0; t < num_threads_; ++t) {
-        auto& smem_rsp_port = core_->shared_mem_->Outputs.at(t);
-        if (smem_rsp_port.empty())
-            continue;
-        auto& mem_rsp = smem_rsp_port.front();
-        auto& entry = pending_rd_reqs_.at(mem_rsp.tag);          
-        auto trace = entry.first;
-        DT(3, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type 
-            << ", tid=" << t << ", " << *trace);  
-        assert(entry.second);
-        --entry.second; // track remaining blocks 
-        if (0 == entry.second) {
-            Output.send(trace, 1);
-            pending_rd_reqs_.release(mem_rsp.tag);
-        } 
-        smem_rsp_port.pop();  
-    }
-
-    if (fence_lock_) {
-        // wait for all pending memory operations to complete
-        if (!pending_rd_reqs_.empty())
-            return;
-        Output.send(fence_state_, 1);
-        fence_lock_ = false;
-        DT(3, "fence-unlock: " << fence_state_);
-    }
-
-    // check input queue
-    if (Input.empty())
-        return;
-
-    auto trace = Input.front();
-
-    if (trace->lsu.type == LsuType::FENCE) {
-        // schedule fence lock
-        fence_state_ = trace;
-        fence_lock_ = true;        
-        DT(3, "fence-lock: " << *trace);
-        // remove input
-        auto time = Input.pop(); 
-        core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
-        return;
-    }
-
-    // check pending queue capacity    
-    if (pending_rd_reqs_.full()) {
-        if (!trace->suspend()) {
-            DT(3, "*** lsu-queue-stall: " << *trace);
-        }
-        return;
-    } else {
-        trace->resume();
-    }
-    
-    bool is_write = (trace->lsu.type == LsuType::STORE);
-
-    // duplicates detection
-    bool is_dup = false;
-    if (trace->tmask.test(0)) {
-        uint64_t addr_mask = sizeof(uint32_t)-1;
-        uint32_t addr0 = trace->mem_addrs.at(0).at(0).addr & ~addr_mask;
-        uint32_t matches = 1;
-        for (uint32_t t = 1; t < num_threads_; ++t) {
-            if (!trace->tmask.test(t))
-                continue;
-            auto mem_addr = trace->mem_addrs.at(t).at(0).addr & ~addr_mask;
-            matches += (addr0 == mem_addr);
-        }
-        is_dup = (matches == trace->tmask.count());
-    }
-
-    uint32_t valid_addrs = 0;
-    if (is_dup) {
-        valid_addrs = 1;
-    } else {
-        for (auto& mem_addr : trace->mem_addrs) {
-            valid_addrs += mem_addr.size();
-        }
-    }
-
-    auto tag = pending_rd_reqs_.allocate({trace, valid_addrs});
-
-    for (uint32_t t = 0; t < num_threads_; ++t) {
-        if (!trace->tmask.test(t))
-            continue;
-        
-        auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0);        
-        auto mem_addr = trace->mem_addrs.at(t).at(0);
-        auto type = get_addr_type(mem_addr.addr, mem_addr.size);
-
-        MemReq mem_req;
-        mem_req.addr  = mem_addr.addr;
-        mem_req.write = is_write;
-        mem_req.non_cacheable = (type == AddrType::IO); 
-        mem_req.tag   = tag;
-        mem_req.core_id = trace->cid;
-        mem_req.uuid = trace->uuid;
-        
-        if (type == AddrType::Shared) {
-            core_->shared_mem_->Inputs.at(t).send(mem_req, 2);
-            DT(3, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag 
-                << ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
-        } else {            
-            dcache_req_port.send(mem_req, 2);
-            DT(3, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag 
-                << ", type=" << trace->lsu.type << ", tid=" << t << ", nc=" << mem_req.non_cacheable << ", " << *trace);
-        }        
-        
-        if (is_dup)
-            break;
-    }
-
-    // do not wait on writes
-    if (is_write) {        
-        pending_rd_reqs_.release(tag);
-        Output.send(trace, 1);
-    }
-
-    // remove input
-    auto time = Input.pop();
-    core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {}
-    
-void AluUnit::tick() {    
-    if (Input.empty())
-        return;
-    auto trace = Input.front();    
-    switch (trace->alu.type) {
-    case AluType::ARITH:        
-    case AluType::BRANCH:
-    case AluType::SYSCALL:
-    case AluType::CMOV:
-        Output.send(trace, 1);
-        break;
-    case AluType::IMUL:
-        Output.send(trace, LATENCY_IMUL+1);
-        break;
-    case AluType::IDIV:
-        Output.send(trace, XLEN+1);
-        break;
-    default:
-        std::abort();
-    }
-    DT(3, "pipeline-execute: op=" << trace->alu.type << ", " << *trace);
-    if (trace->fetch_stall) {
-        core_->stalled_warps_.reset(trace->wid);
-    }
-    auto time = Input.pop();
-    core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time);
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-CsrUnit::CsrUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "CSR") {}
-    
-void CsrUnit::tick() {
-    if (Input.empty()) 
-        return;
-    auto trace = Input.front();
-    Output.send(trace, 1);
-    auto time = Input.pop();
-    core_->perf_stats_.csr_stalls += (SimPlatform::instance().cycles() - time);
-    DT(3, "pipeline-execute: op=CSR, " << *trace);
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {}
-    
-void FpuUnit::tick() {
-    if (Input.empty()) 
-        return;
-    auto trace = Input.front();
-    switch (trace->fpu.type) {
-    case FpuType::FNCP:
-        Output.send(trace, 2);
-        break;
-    case FpuType::FMA:
-        Output.send(trace, LATENCY_FMA+1);
-        break;
-    case FpuType::FDIV:
-        Output.send(trace, LATENCY_FDIV+1);
-        break;
-    case FpuType::FSQRT:
-        Output.send(trace, LATENCY_FSQRT+1);
-        break;
-    case FpuType::FCVT:
-        Output.send(trace, LATENCY_FCVT+1);
-        break;
-    default:
-        std::abort();
-    }    
-    DT(3, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace);
-    auto time = Input.pop();
-    core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-GpuUnit::GpuUnit(const SimContext& ctx, Core* core) 
-    : ExeUnit(ctx, core, "GPU")
-    , num_threads_(core->arch().num_threads()) 
-    , pending_tex_reqs_(TEXQ_SIZE)
-{}
-
-void GpuUnit::reset() {
-    pending_tex_reqs_.clear();
-}
-    
-void GpuUnit::tick() {
-#ifdef EXT_TEX_ENABLE
-    // handle memory response
-    for (uint32_t t = 0; t < num_threads_; ++t) {
-        auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(1);
-        if (dcache_rsp_port.empty())
-            continue;
-        auto& mem_rsp = dcache_rsp_port.front();
-        auto& entry = pending_tex_reqs_.at(mem_rsp.tag);  
-        auto trace = entry.first;
-        DT(3, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace);  
-        assert(entry.second);
-        --entry.second; // track remaining blocks 
-        if (0 == entry.second) {
-            Output.send(trace, 1);
-            pending_tex_reqs_.release(mem_rsp.tag);
-        }   
-        dcache_rsp_port.pop();
-    }
-#endif
-
-    // check input queue
-    if (Input.empty())
-        return;
-
-    auto trace = Input.front();
-
-    bool issued = false;
-
-    switch  (trace->gpu.type) {
-    case GpuType::TMC:
-        Output.send(trace, 1);
-        core_->active_warps_.set(trace->wid, trace->gpu.active_warps.test(trace->wid));
-        issued = true;
-        break;
-    case GpuType::WSPAWN:
-        Output.send(trace, 1);
-        core_->active_warps_ = trace->gpu.active_warps;        
-        issued = true;
-        break;
-    case GpuType::SPLIT:
-    case GpuType::JOIN:
-        Output.send(trace, 1);
-        issued = true;
-        break;
-    case GpuType::BAR:
-        Output.send(trace, 1);
-        if (trace->gpu.active_warps != 0) 
-            core_->active_warps_ |= trace->gpu.active_warps;
-        else
-            core_->active_warps_.reset(trace->wid);
-        issued = true;
-        break;
-    case GpuType::TEX:
-        if (this->processTexRequest(trace))
-           issued = true;
-        break;
-    default:
-        std::abort();
-    }
-
-    if (issued) {    
-        DT(3, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace);
-        if (trace->fetch_stall)  {
-            core_->stalled_warps_.reset(trace->wid);
-        }
-        auto time = Input.pop();
-        core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
-    }
-}
-
-bool GpuUnit::processTexRequest(pipeline_trace_t* trace) {    
-    // check pending queue capacity    
-    if (pending_tex_reqs_.full()) {
-        if (!trace->suspend()) {
-            DT(3, "*** tex-queue-stall: " << *trace);
-        }
-        return false;
-    } else {
-        trace->resume();
-    }
-
-    // send memory request
-
-    uint32_t valid_addrs = 0;
-    for (auto& mem_addr : trace->mem_addrs) {
-        valid_addrs += mem_addr.size();
-    }
-
-    auto tag = pending_tex_reqs_.allocate({trace, valid_addrs});
-
-    for (uint32_t t = 0; t < num_threads_; ++t) {
-        if (!trace->tmask.test(t))
-            continue;
-
-        auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1);
-        for (auto& mem_addr : trace->mem_addrs.at(t)) {
-            MemReq mem_req;
-            mem_req.addr  = mem_addr.addr;
-            mem_req.write = (trace->lsu.type == LsuType::STORE);
-            mem_req.tag   = tag;
-            mem_req.core_id = core_->id();
-            mem_req.uuid = trace->uuid;
-            dcache_req_port.send(mem_req, 3);
-            DT(3, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag 
-                << ", tid=" << t << ", "<< trace);
-            ++ core_->perf_stats_.tex_reads;
-            ++ core_->perf_stats_.tex_latency += pending_tex_reqs_.size();
-        }
-    }
-
-    return true;
-}
--- a/sim/simx/ibuffer.h
+++ b/sim/simx/ibuffer.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include "pipeline.h"
@@ -6,10 +19,6 @@
 namespace vortex {

 class IBuffer {
-private:
-    std::queue<pipeline_trace_t*> entries_;
-    uint32_t capacity_;
-
 public:    
    IBuffer(uint32_t size) 
        : capacity_(size)
@@ -39,6 +48,10 @@ public:
        std::queue<pipeline_trace_t*> empty;
        std::swap(entries_, empty );
    }
+
+private:
+    std::queue<pipeline_trace_t*> entries_;
+    uint32_t capacity_;
 };

 }
--- a/sim/simx/instr.h
+++ b/sim/simx/instr.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include "types.h"
@@ -7,7 +20,7 @@ namespace vortex {
 class Warp;

 enum Opcode {   
-  NOP       = 0,    
+  NONE      = 0,    
  R_INST    = 0x33,
  L_INST    = 0x3,
  I_INST    = 0x13,
@@ -19,6 +32,7 @@ enum Opcode {
  JALR_INST = 0x67,
  SYS_INST  = 0x73,
  FENCE     = 0x0f,
+  AMO       = 0x2f,
  // F Extension
  FL        = 0x7,
  FS        = 0x27,
@@ -26,19 +40,20 @@ enum Opcode {
  FMADD     = 0x43,
  FMSUB     = 0x47,
  FMNMSUB   = 0x4b,
-  FMNMADD   = 0x4f,
-  // Vector Extension  
-  VSET      = 0x57,
-  // GPGPU Extension
-  GPGPU     = 0x6b,
-  GPU       = 0x5b,
-  // RV64 Standard Extensions
+  FMNMADD   = 0x4f,  
+  // RV64 Standard Extension
  R_INST_W  = 0x3b,
  I_INST_W  = 0x1b,
+  // Vector Extension  
+  VSET      = 0x57,
+  // Custom Extensions
+  EXT1      = 0x0b,
+  EXT2      = 0x2b,
+  EXT3      = 0x5b,
+  EXT4      = 0x7b
 };

-enum InstType { 
-  N_TYPE, 
+enum InstType {
  R_TYPE, 
  I_TYPE, 
  S_TYPE, 
@@ -52,25 +67,45 @@ enum InstType {
 class Instr {
 public:
  Instr() 
-    : opcode_(Opcode::NOP)
+    : opcode_(Opcode::NONE)
    , num_rsrcs_(0)
    , has_imm_(false)
    , rdest_type_(RegType::None)
+    , imm_(0)
    , rdest_(0)
    , func2_(0)
    , func3_(0)
    , func6_(0)
-    , func7_(0) {
+    , func7_(0)
+    , vmask_(0)
+    , vlsWidth_(0)
+    , vMop_(0)
+    , vNf_(0)
+    , vs3_(0)
+    , vlmul_(0)
+    , vsew_(0)
+    , vediv_(0)   {
    for (uint32_t i = 0; i < MAX_REG_SOURCES; ++i) {
       rsrc_type_[i] = RegType::None;
+       rsrc_[i] = 0;
    }
  }

  void setOpcode(Opcode opcode)  { opcode_ = opcode; }
-  void setDestReg(uint32_t destReg, RegType type) { rdest_type_ = type; rdest_ = destReg; }
-  void setSrcReg(uint32_t srcReg, RegType type) { rsrc_type_[num_rsrcs_] = type; rsrc_[num_rsrcs_++] = srcReg; }
-  void setDestVReg(uint32_t destReg) { rdest_type_ = RegType::Vector; rdest_ = destReg; }
-  void setSrcVReg(uint32_t srcReg) { rsrc_type_[num_rsrcs_] = RegType::Vector; rsrc_[num_rsrcs_++] = srcReg;  }
+  void setDestReg(uint32_t destReg, RegType type) { // Records the destination register index together with its register type.
+    rdest_type_ = type; 
+    rdest_ = destReg; 
+  }
+  void addSrcReg(uint32_t srcReg, RegType type) { // Appends a source register in the next free slot (index num_rsrcs_).
+    rsrc_type_[num_rsrcs_] = type; // no bounds check here: callers must not add more sources than the arrays hold
+    rsrc_[num_rsrcs_] = srcReg; 
+    ++num_rsrcs_;
+  }
+  void setSrcReg(uint32_t index, uint32_t srcReg, RegType type) { // Overwrites source slot `index`; the index is not bounds-checked.
+    rsrc_type_[index] = type; 
+    rsrc_[index] = srcReg; 
+    num_rsrcs_ = std::max<uint32_t>(num_rsrcs_, index+1); // grows the source count to cover slot `index`; never shrinks it
+  }
  void setFunc2(uint32_t func2) { func2_ = func2; }
  void setFunc3(uint32_t func3) { func3_ = func3; }
  void setFunc7(uint32_t func7) { func7_ = func7; }
@@ -85,17 +120,17 @@ public:
  void setVediv(uint32_t ediv) { vediv_ = 1 << ediv; }
  void setFunc6(uint32_t func6) { func6_ = func6; }

-  Opcode getOpcode() const { return opcode_; }
+  Opcode   getOpcode() const { return opcode_; }
  uint32_t getFunc2() const { return func2_; }
  uint32_t getFunc3() const { return func3_; }
  uint32_t getFunc6() const { return func6_; }
  uint32_t getFunc7() const { return func7_; }
  uint32_t getNRSrc() const { return num_rsrcs_; }
  uint32_t getRSrc(uint32_t i) const { return rsrc_[i]; }
-  RegType getRSType(uint32_t i) const { return rsrc_type_[i]; }
+  RegType  getRSType(uint32_t i) const { return rsrc_type_[i]; }
  uint32_t getRDest() const { return rdest_; }  
-  RegType getRDType() const { return rdest_type_; }  
-  bool hasImm() const { return has_imm_; }
+  RegType  getRDType() const { return rdest_type_; }  
+  bool     hasImm() const { return has_imm_; }
  uint32_t getImm() const { return imm_; }
  uint32_t getVlsWidth() const { return vlsWidth_; }
  uint32_t getVmop() const { return vMop_; }
--- a/sim/simx/main.cpp
+++ b/sim/simx/main.cpp
@@ -1,98 +1,132 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include <iostream>
 #include <iomanip>
 #include <string>
 #include <sstream>
 #include <fstream>
 #include <stdlib.h>
+#include <unistd.h>
 #include <sys/stat.h>
 #include "processor.h"
-#include "archdef.h"
 #include "mem.h"
 #include "constants.h"
 #include <util.h>
-#include "args.h"
 #include "core.h"

 using namespace vortex;

+static void show_usage() { // Prints the command-line synopsis to stdout, including -g, which the getopt string "t:w:c:g:rsh?" accepts.
+   std::cout << "Usage: [-c <cores>] [-w <warps>] [-t <threads>] [-g <clusters>] [-r: riscv-test] [-s: stats] [-h: help] <program>" << std::endl;
+}
+
+uint32_t num_threads = NUM_THREADS;   // set by -t; defaults to the build-time NUM_THREADS
+uint32_t num_warps = NUM_WARPS;       // set by -w
+uint32_t num_cores = NUM_CORES;       // set by -c
+uint32_t num_clusters = NUM_CLUSTERS; // set by -g
+bool showStats = false;               // set by -s
+bool riscv_test = false;              // set by -r; forwarded to Processor::run()
+const char* program = nullptr;        // first non-option argument (path of the program image)
+
+static void parse_args(int argc, char **argv) { // Parses argv with getopt into the file-level option variables; exits on -h, on an invalid option, or when no program is given.
+  	int c;
+  	while ((c = getopt(argc, argv, "t:w:c:g:rsh?")) != -1) {
+    	switch (c) {
+      case 't':
+        num_threads = atoi(optarg); // atoi does no validation: non-numeric text yields 0
+        break;
+      case 'w':
+        num_warps = atoi(optarg);
+        break;
+		  case 'c':
+        num_cores = atoi(optarg);
+        break;
+		  case 'g':
+        num_clusters = atoi(optarg);
+        break;
+      case 'r':
+        riscv_test = true;
+        break;
+      case 's':
+        showStats = true;
+        break;
+    	case 'h':
+    	case '?': // getopt also returns '?' for unrecognized options and missing arguments, so those exit with status 0 too -- NOTE(review): confirm intended
+      		show_usage();
+      		exit(0);
+    		break; // not reached: exit() does not return
+    	default:
+      		show_usage();
+      		exit(-1);
+    	}
+	}
+
+	if (optind < argc) { // the first non-option argument is taken as the program image
+		program = argv[optind];
+    std::cout << "Running " << program << "..." << std::endl;
+	} else {
+		show_usage();
+    exit(-1);
+	}
+}
+
 int main(int argc, char **argv) {
  int exitcode = 0;

-  std::string imgFileName;
-  int num_cores(NUM_CORES * NUM_CLUSTERS);
-  int num_warps(NUM_WARPS);
-  int num_threads(NUM_THREADS);  
-  bool showHelp(false);
-  bool showStats(false);
-  bool riscv_test(false);
+  parse_args(argc, argv);

-  // parse the command line arguments
-  CommandLineArgFlag fh("-h", "--help", "show command line options", showHelp);
-  CommandLineArgSetter<std::string> fi("-i", "--image", "program binary", imgFileName);
-  CommandLineArgSetter<int> fc("-c", "--cores", "number of cores", num_cores);
-  CommandLineArgSetter<int> fw("-w", "--warps", "number  of warps", num_warps);
-  CommandLineArgSetter<int> ft("-t", "--threads", "number of threads", num_threads);
-  CommandLineArgFlag fr("-r", "--riscv", "enable riscv tests", riscv_test);
-  CommandLineArgFlag fs("-s", "--stats", "show stats", showStats);
-
-  CommandLineArg::readArgs(argc - 1, argv + 1);
-
-  if (showHelp || imgFileName.empty()) {
-    std::cout << "Vortex emulator command line arguments:\n"
-                 "  -i, --image <filename> Program RAM image\n"
-                 "  -c, --cores <num> Number of cores\n"
-                 "  -w, --warps <num> Number of warps\n"
-                 "  -t, --threads <num> Number of threads\n"
-                 "  -r, --riscv riscv test\n"
-                 "  -s, --stats Print stats on exit.\n";
-    return 0;
-  }
-
-  std::cout << "Running " << imgFileName << "..." << std::endl;
-  
  {
    // create processor configuation
-    ArchDef arch(num_cores, num_warps, num_threads);
+    Arch arch(num_threads, num_warps, num_cores, num_clusters);

    // create memory module
    RAM ram(RAM_PAGE_SIZE);

+    // create processor
+    Processor processor(arch);
+  
+    // attach memory module
+    processor.attach_ram(&ram); 
+
+	  // setup base DCRs
+    const uint64_t startup_addr(STARTUP_ADDR);
+    processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff);
+  #if (XLEN == 64)
+    processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32);
+  #endif
+	  processor.write_dcr(VX_DCR_BASE_MPM_CLASS, 0);
+
    // load program
-    {
-      std::string program_ext(fileExtension(imgFileName.c_str()));
+    {      
+      std::string program_ext(fileExtension(program));
      if (program_ext == "bin") {
-        ram.loadBinImage(imgFileName.c_str(), STARTUP_ADDR);
+        ram.loadBinImage(program, startup_addr);
      } else if (program_ext == "hex") {
-        ram.loadHexImage(imgFileName.c_str());
+        ram.loadHexImage(program);
      } else {
        std::cout << "*** error: only *.bin or *.hex images supported." << std::endl;
        return -1;
      }
    }

-    // create processor
-    Processor processor(arch);
-  
-    // attach memory module
-    processor.attach_ram(&ram);   
-
    // run simulation
-    exitcode = processor.run();
+    exitcode = processor.run(riscv_test);
+  }   

+  if (exitcode != 0) {
+    std::cout << "*** error: exitcode=" << exitcode << std::endl;
  } 

-  if (riscv_test) {
-    if (1 == exitcode) {
-      std::cout << "Passed." << std::endl;
-      exitcode = 0;
-    } else {
-      std::cout << "Failed." << std::endl;
-    }
-  } else {
-    if (exitcode != 0) {
-      std::cout << "*** error: exitcode=" << exitcode << std::endl;
-    }
-  }  
-
  return exitcode;
 }
--- a/sim/simx/mem_sim.cpp
+++ b/sim/simx/mem_sim.cpp
@@ -1,4 +1,17 @@
-#include "memsim.h"
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mem_sim.h"
 #include <vector>
 #include <queue>
 #include <stdlib.h>
@@ -83,7 +96,7 @@ public:
            mem_req.addr,
            mem_req.write ? ramulator::Request::Type::WRITE : ramulator::Request::Type::READ,
            std::bind(&Impl::dram_callback, this, placeholders::_1, mem_req.tag, mem_req.uuid),
-            mem_req.core_id
+            mem_req.cid
        );

        if (!dram_->send(dram_req))
--- a/sim/simx/mem_sim.h
+++ b/sim/simx/mem_sim.h
@@ -1,8 +1,20 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include <simobject.h>
 #include "types.h"
-#include <vector>

 namespace vortex {

--- a/sim/simx/operand.h
+++ b/sim/simx/operand.h
@@ -0,0 +1,61 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "pipeline.h"
+#include <queue>
+
+namespace vortex {
+
+class Operand : public SimObject<Operand> { // Pipeline stage: forwards each trace from Input to Output after a register-dependent delay.
+public:
+    SimPort<pipeline_trace_t*> Input;
+    SimPort<pipeline_trace_t*> Output;
+
+    Operand(const SimContext& ctx) 
+        : SimObject<Operand>(ctx, "Operand") 
+        , Input(this)
+        , Output(this)
+    {}
+    
+    virtual ~Operand() {}
+
+    virtual void reset() {} // nothing to reset: the class declares no members besides its two ports
+
+    virtual void tick() { // Handles at most one trace per call: the one at the front of Input.
+        if (Input.empty())
+            return;
+        auto trace = Input.front();
+
+        int delay = 1; // baseline delay handed to Output.send(); grows by one per counted register index
+        for (int i = 0; i < MAX_NUM_REGS; ++i) {
+            bool is_iregs = trace->used_iregs.test(i);
+            bool is_fregs = trace->used_fregs.test(i);
+            bool is_vregs = trace->used_vregs.test(i);
+            if (is_iregs || is_fregs || is_vregs) { // an index set in several masks is still counted only once
+                if (is_iregs && i == 0) // integer register 0 adds no delay. NOTE(review): this also skips index 0 of used_fregs/used_vregs when set in the same trace -- confirm intended
+                    continue;
+                ++delay;
+            }
+        }
+
+        Output.send(trace, delay);
+        
+        DT(3, "pipeline-operands: " << *trace);
+
+        Input.pop(); // the trace is dequeued only after it has been sent
+    };
+};
+
+}
--- a/sim/simx/pipeline.h
+++ b/sim/simx/pipeline.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+

 #pragma once

@@ -5,14 +18,38 @@
 #include <iostream>
 #include <util.h>
 #include "types.h"
-#include "archdef.h"
+#include "arch.h"
 #include "debug.h"

 namespace vortex {

+class ITraceData { // Polymorphic base for the payload objects referenced by pipeline_trace_t::data.
+public:
+    using Ptr = std::shared_ptr<ITraceData>;
+    ITraceData() {}
+    virtual ~ITraceData() {} // virtual so derived payloads are destroyed correctly through an ITraceData pointer
+};
+
+struct LsuTraceData : public ITraceData { // Trace payload carrying a vector of mem_addr_size_t entries.
+  using Ptr = std::shared_ptr<LsuTraceData>;
+  std::vector<mem_addr_size_t> mem_addrs; // constructed with num_threads elements -- presumably one per thread; confirm against the LSU code
+  LsuTraceData(uint32_t num_threads) : mem_addrs(num_threads) {}
+};
+
+struct SFUTraceData : public ITraceData { // Trace payload carrying a `bar` id/count pair.
+  using Ptr = std::shared_ptr<SFUTraceData>;
+  struct {
+    uint32_t id;
+    uint32_t count;
+  } bar; // presumably barrier parameters (id and count) -- TODO confirm against the SFU code
+  SFUTraceData(uint32_t bar_id, uint32_t bar_count) : bar{bar_id, bar_count} {}
+};
+
 struct pipeline_trace_t {
+public:
  //--
-  uint64_t    uuid;
+  const uint64_t uuid;
+  const Arch&    arch;
  
  //--
  uint32_t    cid;
@@ -21,12 +58,9 @@ struct pipeline_trace_t {
  Word        PC;

  //--
-  bool        fetch_stall;
-
-  //--
-  bool        wb;  
-  RegType     rdest_type;
  uint32_t    rdest;
+  RegType     rdest_type;
+  bool        wb;

  //--
  RegMask     used_iregs;
@@ -36,73 +70,104 @@ struct pipeline_trace_t {
  //- 
  ExeType     exe_type; 

-  //--
-  std::vector<std::vector<mem_addr_size_t>> mem_addrs;
-  
  //--
  union {
-    struct {        
-      LsuType type;
-    } lsu;
-    struct {
-      AluType type;
-    } alu;
-    struct {
-      FpuType type;
-    } fpu;
-    struct {
-      GpuType type;
-      WarpMask active_warps;
-    } gpu;
+    uint32_t unit_type;
+    LsuType  lsu_type;
+    AluType  alu_type;
+    FpuType  fpu_type;
+    SfuType  sfu_type;
  };

-  bool stalled;
+  ITraceData::Ptr data;

-  pipeline_trace_t(uint64_t uuid_, const ArchDef& arch) {
-    uuid = uuid_;
-    cid = 0;
-    wid = 0;
-    tmask.reset();
-    PC = 0;
-    fetch_stall = false;
-    wb  = false;
-    rdest = 0;
-    rdest_type = RegType::None;
-    used_iregs.reset();
-    used_fregs.reset();
-    used_vregs.reset();
-    exe_type = ExeType::NOP;
-    mem_addrs.resize(arch.num_threads());
-    stalled = false;
-  }
+  int pid;
+  bool sop;
+  bool eop;

-  bool suspend() {
-    bool old = stalled;
-    stalled = true;
+  bool fetch_stall;
+
+  pipeline_trace_t(uint64_t uuid, const Arch& arch) 
+    : uuid(uuid)
+    , arch(arch)
+    , cid(0)
+    , wid(0)
+    , tmask(0)
+    , PC(0)    
+    , rdest(0)
+    , rdest_type(RegType::None)
+    , wb(false)
+    , used_iregs(0)
+    , used_fregs(0)
+    , used_vregs(0)
+    , exe_type(ExeType::ALU)
+    , unit_type(0)
+    , data(nullptr)
+    , pid(-1)
+    , sop(true)
+    , eop(true)
+    , fetch_stall(false)
+    , log_once_(false) 
+  {}
+
+  pipeline_trace_t(const pipeline_trace_t& rhs) 
+    : uuid(rhs.uuid)
+    , arch(rhs.arch)
+    , cid(rhs.cid)
+    , wid(rhs.wid)
+    , tmask(rhs.tmask)
+    , PC(rhs.PC)    
+    , rdest(rhs.rdest)
+    , rdest_type(rhs.rdest_type)
+    , wb(rhs.wb)    
+    , used_iregs(rhs.used_iregs)
+    , used_fregs(rhs.used_fregs)
+    , used_vregs(rhs.used_vregs)
+    , exe_type(rhs.exe_type)
+    , unit_type(rhs.unit_type)
+    , data(rhs.data)
+    , pid(rhs.pid)
+    , sop(rhs.sop)
+    , eop(rhs.eop)
+    , fetch_stall(rhs.fetch_stall)
+    , log_once_(false) 
+  {}
+  
+  ~pipeline_trace_t() {}
+
+  bool log_once(bool enable) {
+    bool old = log_once_;
+    log_once_ = enable;
    return old;
  }

-  void resume() {
-    stalled = false;
-  }
+private:
+  bool log_once_;
 };

 inline std::ostream &operator<<(std::ostream &os, const pipeline_trace_t& state) {
-  os << "coreid=" << state.cid << ", wid=" << state.wid << ", PC=" << std::hex << state.PC;
+  os << "cid=" << state.cid;
+  os << ", wid=" << state.wid;
+  os << ", tmask=";
+  for (uint32_t i = 0, n = state.arch.num_threads(); i < n; ++i) {
+      os << state.tmask.test(i);
+  }  
+  os << ", PC=0x" << std::hex << state.PC;
  os << ", wb=" << state.wb;
  if (state.wb) {
     os << ", rd=" << state.rdest_type << std::dec << state.rdest;
  }
  os << ", ex=" << state.exe_type;
+  if (state.pid != -1) {
+    os << ", pid=" << state.pid;
+    os << ", sop=" << state.sop;
+    os << ", eop=" << state.eop;
+  }
  os << " (#" << std::dec << state.uuid << ")";
  return os;
 }

 class PipelineLatch {
-protected:
-  const char* name_;
-  std::queue<pipeline_trace_t*> queue_;
-
 public:
  PipelineLatch(const char* name = nullptr) 
    : name_(name) 
@@ -132,6 +197,10 @@ public:
    std::queue<pipeline_trace_t*> empty;
    std::swap(queue_, empty );
  }
+
+protected:
+  const char* name_;
+  std::queue<pipeline_trace_t*> queue_;
 };

 }
--- a/sim/simx/processor.cpp
+++ b/sim/simx/processor.cpp
@@ -1,168 +1,141 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include "processor.h"
-#include "core.h"
-#include "constants.h"
+#include "processor_impl.h"

 using namespace vortex;

-class Processor::Impl {
-private:
-  std::vector<Core::Ptr> cores_;
-  std::vector<Cache::Ptr> l2caches_;
-  std::vector<Switch<MemReq, MemRsp>::Ptr> l2_mem_switches_;
-  Cache::Ptr l3cache_;
-  Switch<MemReq, MemRsp>::Ptr l3_mem_switch_;
+ProcessorImpl::ProcessorImpl(const Arch& arch) 
+  : arch_(arch)
+  , clusters_(arch.num_clusters())
+{
+  SimPlatform::instance().initialize();

-public:
-  Impl(const ArchDef& arch) 
-    : cores_(arch.num_cores())
-    , l2caches_(NUM_CLUSTERS)
-    , l2_mem_switches_(NUM_CLUSTERS)
-  {
-    SimPlatform::instance().initialize();
+  // create memory simulator
+  memsim_ = MemSim::Create("dram", MemSim::Config{
+    MEMORY_BANKS,
+    uint32_t(arch.num_cores()) * arch.num_clusters()
+  });

-    uint32_t num_cores = arch.num_cores();
-    uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS;
-
-    // create cores
-    for (uint32_t i = 0; i < num_cores; ++i) {
-        cores_.at(i) = Core::Create(arch, i);
+  // create L3 cache
+  l3cache_ = CacheSim::Create("l3cache", CacheSim::Config{
+    !L3_ENABLED,
+    log2ceil(L3_CACHE_SIZE),  // C
+    log2ceil(MEM_BLOCK_SIZE), // B
+    log2ceil(L3_NUM_WAYS),  // W
+    0,                      // A
+    XLEN,                   // address bits  
+    L3_NUM_BANKS,           // number of banks
+    1,                      // number of ports
+    uint8_t(arch.num_clusters()), // request size 
+    true,                   // write-through
+    false,                  // write response
+    0,                      // victim size
+    L3_MSHR_SIZE,           // mshr
+    2,                      // pipeline latency
    }
+  );        
+  
+  // connect L3 memory ports
+  l3cache_->MemReqPort.bind(&memsim_->MemReqPort);
+  memsim_->MemRspPort.bind(&l3cache_->MemRspPort);

-     // setup memory simulator
-    auto memsim = MemSim::Create("dram", MemSim::Config{
-      MEMORY_BANKS,
-      arch.num_cores()
-    });
-    
-    std::vector<SimPort<MemReq>*> mem_req_ports(1, &memsim->MemReqPort);
-    std::vector<SimPort<MemRsp>*> mem_rsp_ports(1, &memsim->MemRspPort);
-
-    if (L3_ENABLE) {
-      l3cache_ = Cache::Create("l3cache", Cache::Config{
-        log2ceil(L3_CACHE_SIZE),  // C
-        log2ceil(MEM_BLOCK_SIZE), // B
-        2,                      // W
-        0,                      // A
-        32,                     // address bits  
-        L3_NUM_BANKS,           // number of banks
-        L3_NUM_PORTS,           // number of ports
-        NUM_CLUSTERS,           // request size 
-        true,                   // write-through
-        false,                  // write response
-        0,                      // victim size
-        L3_MSHR_SIZE,           // mshr
-        2,                      // pipeline latency
-        }
-      );        
-      l3cache_->MemReqPort.bind(mem_req_ports.at(0));
-      mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort);
-
-      mem_req_ports.resize(NUM_CLUSTERS);
-      mem_rsp_ports.resize(NUM_CLUSTERS);
-
-      for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
-        mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i);
-        mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i);
-      }
-    } else if (NUM_CLUSTERS > 1) {
-      l3_mem_switch_ = Switch<MemReq, MemRsp>::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS);
-      l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0));      
-      mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn);
-
-      mem_req_ports.resize(NUM_CLUSTERS);
-      mem_rsp_ports.resize(NUM_CLUSTERS);
-
-      for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
-        mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i);
-        mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i);
-      }
-    }
-
-    for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
-      std::vector<SimPort<MemReq>*> cluster_mem_req_ports(cores_per_cluster); 
-      std::vector<SimPort<MemRsp>*> cluster_mem_rsp_ports(cores_per_cluster);
-
-      if (L2_ENABLE) {
-        auto& l2cache = l2caches_.at(i);
-        l2cache = Cache::Create("l2cache", Cache::Config{
-          log2ceil(L2_CACHE_SIZE),  // C
-          log2ceil(MEM_BLOCK_SIZE), // B
-          2,                      // W
-          0,                      // A
-          32,                     // address bits  
-          L2_NUM_BANKS,           // number of banks
-          L2_NUM_PORTS,           // number of ports
-          (uint8_t)cores_per_cluster, // request size 
-          true,                   // write-through
-          false,                  // write response
-          0,                      // victim size
-          L2_MSHR_SIZE,           // mshr
-          2,                      // pipeline latency
-        });
-        l2cache->MemReqPort.bind(mem_req_ports.at(i));
-        mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort);
-
-        for (uint32_t j = 0; j < cores_per_cluster; ++j) {
-          cluster_mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j);
-          cluster_mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j);
-        }
-      } else {
-        auto& l2_mem_switch = l2_mem_switches_.at(i);
-        l2_mem_switch = Switch<MemReq, MemRsp>::Create("l2_arb", ArbiterType::RoundRobin, cores_per_cluster);
-        l2_mem_switch->ReqOut.bind(mem_req_ports.at(i));
-        mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn);
-
-        for (uint32_t j = 0; j < cores_per_cluster; ++j) {
-          cluster_mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j);
-          cluster_mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j);
-        }
-      }
-
-      for (uint32_t j = 0; j < cores_per_cluster; ++j) {
-        auto& core = cores_.at((i * cores_per_cluster) + j);
-        core->MemReqPort.bind(cluster_mem_req_ports.at(j));
-        cluster_mem_rsp_ports.at(j)->bind(&core->MemRspPort);
-      }
-    }
+  // create clusters
+  for (uint32_t i = 0; i < arch.num_clusters(); ++i) {
+    clusters_.at(i) = Cluster::Create(i, this, arch, dcrs_);
+    // connect L3 core ports
+    clusters_.at(i)->mem_req_port.bind(&l3cache_->CoreReqPorts.at(i));
+    l3cache_->CoreRspPorts.at(i).bind(&clusters_.at(i)->mem_rsp_port);
  }

-  ~Impl() {
-    SimPlatform::instance().finalize();
-  }
+  // set up memory perf recording
+  memsim_->MemReqPort.tx_callback([&](const MemReq& req, uint64_t cycle){
+    __unused (cycle);
+    perf_mem_reads_   += !req.write;
+    perf_mem_writes_  += req.write;
+    perf_mem_pending_reads_ += !req.write;
+  });
+  memsim_->MemRspPort.tx_callback([&](const MemRsp&, uint64_t cycle){
+    __unused (cycle);
+    --perf_mem_pending_reads_;
+  });

-  void attach_ram(RAM* ram) {
-    for (auto core : cores_) {
-      core->attach_ram(ram);
-    }
-  }
+  this->reset();
+}

-  int run() {
-    SimPlatform::instance().reset();
-    bool running;
-    int exitcode = 0;
-    do {
-      SimPlatform::instance().tick();
-      running = false;
-      for (auto& core : cores_) {
-        if (core->running()) {
-          running = true;
-        }
-        if (core->check_exit()) {
-          exitcode = core->getIRegValue(3);
-          running = false;
-          break;
+ProcessorImpl::~ProcessorImpl() { // Finalizes the SimPlatform singleton that the constructor initialized.
+  SimPlatform::instance().finalize();
+}
+
+void ProcessorImpl::attach_ram(RAM* ram) { // Passes the same RAM pointer to every cluster.
+  for (const auto& cluster : clusters_) { // bind by reference: clusters_ stores shared_ptrs, so a by-value loop variable copies one per iteration
+    cluster->attach_ram(ram);
+  }
+}
+
+int ProcessorImpl::run(bool riscv_test) {
+  SimPlatform::instance().reset();
+  this->reset();
+  
+  bool done;
+  Word exitcode = 0;
+  do {
+    SimPlatform::instance().tick();
+    done = true;
+    for (auto cluster : clusters_) {
+      if (cluster->running()) {
+        Word ec;   
+        if (cluster->check_exit(&ec, riscv_test)) {
+          exitcode |= ec;
+        } else {
+          done = false;
        }
      }
-    } while (running);
+    }
+    perf_mem_latency_ += perf_mem_pending_reads_;
+  } while (!done);

-    return exitcode;
-  }
-};
+  return exitcode;
+}
+ 
+void ProcessorImpl::reset() { // Zeroes the memory perf counters; called at the end of the constructor and at the start of run().
+  perf_mem_reads_ = 0;
+  perf_mem_writes_ = 0;
+  perf_mem_latency_ = 0;
+  perf_mem_pending_reads_ = 0;
+}
+
+void ProcessorImpl::write_dcr(uint32_t addr, uint32_t value) { // Stores `value` at `addr` in dcrs_, the DCRS object handed to each cluster at creation.
+  dcrs_.write(addr, value);
+}
+
+ProcessorImpl::PerfStats ProcessorImpl::perf_stats() const { // Returns a snapshot: the memory counters, the L3 cache stats and the summed cluster stats.
+  ProcessorImpl::PerfStats perf;
+  perf.mem_reads   = perf_mem_reads_;
+  perf.mem_writes  = perf_mem_writes_;
+  perf.mem_latency = perf_mem_latency_;
+  perf.l3cache     = l3cache_->perf_stats();
+  for (const auto& cluster : clusters_) { // by reference: avoids copying a shared_ptr per cluster
+    perf.clusters += cluster->perf_stats();
+  }
+  return perf;
+}

 ///////////////////////////////////////////////////////////////////////////////

-Processor::Processor(const ArchDef& arch) 
-  : impl_(new Impl(arch))
+Processor::Processor(const Arch& arch) 
+  : impl_(new ProcessorImpl(arch))
 {}

 Processor::~Processor() {
@@ -173,6 +146,10 @@ void Processor::attach_ram(RAM* mem) {
  impl_->attach_ram(mem);
 }

-int Processor::run() {
-  return impl_->run();
+int Processor::run(bool riscv_test) {
+  return impl_->run(riscv_test);
+}
+
+void Processor::write_dcr(uint32_t addr, uint32_t value) {
+  return impl_->write_dcr(addr, value);
 }
--- a/sim/simx/processor.h
+++ b/sim/simx/processor.h
@@ -1,22 +1,39 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

+#include <stdint.h>
+
 namespace vortex {

-class ArchDef;
+class Arch;
 class RAM;
+class ProcessorImpl;

 class Processor {
 public:
-  Processor(const ArchDef& arch);
+  Processor(const Arch& arch);
  ~Processor();

  void attach_ram(RAM* mem);

-  int run();
+  int run(bool riscv_test);
+
+  void write_dcr(uint32_t addr, uint32_t value);

 private:
-  class Impl;
-  Impl* impl_;
+  ProcessorImpl* impl_;
 };

-}
+}
--- a/sim/simx/processor_impl.h
+++ b/sim/simx/processor_impl.h
@@ -0,0 +1,66 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "mem_sim.h"
+#include "cache_sim.h"
+#include "constants.h"
+#include "dcrs.h"
+#include "cluster.h"
+
+namespace vortex {
+
+class ProcessorImpl { // Implementation object behind Processor: holds the clusters, the L3 cache, the memory simulator and the DCRs.
+public:
+  struct PerfStats { // Value snapshot returned by perf_stats().
+    uint64_t mem_reads;
+    uint64_t mem_writes;
+    uint64_t mem_latency;
+    CacheSim::PerfStats l3cache;
+    Cluster::PerfStats clusters;
+
+    PerfStats()
+      : mem_reads(0)
+      , mem_writes(0)
+      , mem_latency(0)
+    {}
+  };
+
+  ProcessorImpl(const Arch& arch);
+  ~ProcessorImpl();
+
+  void attach_ram(RAM* mem);
+
+  int run(bool riscv_test);
+
+  void write_dcr(uint32_t addr, uint32_t value);
+
+  ProcessorImpl::PerfStats perf_stats() const;
+
+private:
+ 
+  void reset();
+
+  const Arch& arch_; // stored as a reference: the Arch passed to the constructor must outlive this object
+  std::vector<std::shared_ptr<Cluster>> clusters_;
+  DCRS dcrs_;
+  MemSim::Ptr   memsim_;
+  CacheSim::Ptr l3cache_;
+  uint64_t perf_mem_reads_;         // non-write requests transmitted on memsim_'s MemReqPort
+  uint64_t perf_mem_writes_;        // write requests transmitted on memsim_'s MemReqPort
+  uint64_t perf_mem_latency_;       // run() adds perf_mem_pending_reads_ to this once per simulated tick
+  uint64_t perf_mem_pending_reads_; // incremented per non-write request, decremented per response on memsim_'s MemRspPort
+};
+
+}
--- a/sim/simx/scoreboard.h
+++ b/sim/simx/scoreboard.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include "pipeline.h"
@@ -6,20 +19,15 @@
 namespace vortex {

 class Scoreboard {
-private:
+public:
+
    struct reg_use_t {
        RegType  type;
        uint32_t reg;        
        uint64_t owner;
    };
-
-    std::vector<RegMask> in_use_iregs_;
-    std::vector<RegMask> in_use_fregs_;
-    std::vector<RegMask> in_use_vregs_;
-    std::unordered_map<uint32_t, uint64_t> owners_; 
-
-public:    
-    Scoreboard(const ArchDef &arch) 
+        
+    Scoreboard(const Arch &arch) 
        : in_use_iregs_(arch.num_warps())
        , in_use_fregs_(arch.num_warps())
        , in_use_vregs_(arch.num_warps())
@@ -84,8 +92,7 @@ public:
    }
    
    void reserve(pipeline_trace_t* state) {
-        if (!state->wb)
-            return;  
+        assert(state->wb);  
        switch (state->rdest_type) {
        case RegType::Integer:            
            in_use_iregs_.at(state->wid).set(state->rdest);
@@ -105,8 +112,7 @@ public:
    }

    void release(pipeline_trace_t* state) {
-        if (!state->wb)
-            return;       
+        assert(state->wb);      
        switch (state->rdest_type) {
        case RegType::Integer:
            in_use_iregs_.at(state->wid).reset(state->rdest);
@@ -123,6 +129,13 @@ public:
        uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
        owners_.erase(tag);
    }
+
+private:
+
+    std::vector<RegMask> in_use_iregs_;
+    std::vector<RegMask> in_use_fregs_;
+    std::vector<RegMask> in_use_vregs_;
+    std::unordered_map<uint32_t, uint64_t> owners_;
 };

 }
--- a/sim/simx/shared_mem.cpp
+++ b/sim/simx/shared_mem.cpp
@@ -0,0 +1,138 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "shared_mem.h"
+#include "core.h"
+#include <bitmanip.h>
+#include <vector>
+#include "types.h"
+
+using namespace vortex;
+
+class SharedMem::Impl { // private implementation of SharedMem: backing RAM, per-cycle bank arbitration and perf counters
+protected:
+    SharedMem* simobject_; // owning SharedMem; tick() reaches its Inputs/Outputs ports through this pointer
+    Config    config_;
+    RAM       ram_;
+    uint32_t  bank_sel_addr_start_; // lowest address bit used to select a bank
+    uint32_t  bank_sel_addr_end_;   // highest bank-select bit; unsigned, so it wraps to UINT32_MAX if log2ceil(num_banks) is 0
+    PerfStats perf_stats_;
+
+    uint64_t to_local_addr(uint64_t addr) { // keeps only the low log2ceil(capacity / line_size) bits of addr
+        uint32_t total_lines = config_.capacity / config_.line_size;        
+        uint32_t line_bits = log2ceil(total_lines);
+        uint32_t offset = bit_getw(addr, 0, line_bits-1); // NOTE(review): the width is the line count, not the byte capacity - confirm this is the intended local address range
+        return offset;
+    }
+
+public:
+    Impl(SharedMem* simobject, const Config& config) 
+        : simobject_(simobject)
+        , config_(config)
+        , ram_(config.capacity, config.capacity)
+        , bank_sel_addr_start_(0)
+        , bank_sel_addr_end_(0 + log2ceil(config.num_banks)-1)
+    {}    
+    
+    virtual ~Impl() {}
+
+    void reset() { // clears the perf counters only; ram_ is not touched here
+        perf_stats_ = PerfStats();
+    }
+
+    void read(void* data, uint64_t addr, uint32_t size) { // direct data read from ram_ at the translated address; independent of tick()
+        auto s_addr = to_local_addr(addr);        
+        DPH(3, "Shared Mem addr=0x" << std::hex << s_addr << std::endl);
+        ram_.read(data, s_addr, size);
+    }
+
+    void write(const void* data, uint64_t addr, uint32_t size) { // direct data write to ram_ at the translated address; independent of tick()
+        auto s_addr = to_local_addr(addr);        
+        DPH(3, "Shared Mem addr=0x" << std::hex << s_addr << std::endl);
+        ram_.write(data, s_addr, size);
+    }
+
+    void tick() { // one simulation step: serves at most one request per bank; never accesses ram_
+        std::vector<bool> in_used_banks(config_.num_banks);
+        for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) {
+            auto& core_req_port = simobject_->Inputs.at(req_id);            
+            if (core_req_port.empty())
+                continue;
+
+            auto& core_req = core_req_port.front();
+
+            uint32_t bank_id = 0; // a single-bank configuration always maps to bank 0
+            if (config_.num_banks > 1) { // explicit count check: comparing the unsigned start/end bits cannot detect the single-bank case
+                bank_id = (uint32_t)bit_getw(core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_);
+            }
+
+            // bank conflict check
+            if (in_used_banks.at(bank_id)) { // request stays queued on its port and is retried on a later tick
+                ++perf_stats_.bank_stalls;
+                continue;
+            }
+
+            in_used_banks.at(bank_id) = true;
+
+            if (!core_req.write || config_.write_reponse) { // reads always get a response; writes only when write_reponse is set
+                // send response
+                MemRsp core_rsp{core_req.tag, core_req.cid};
+                simobject_->Outputs.at(req_id).send(core_rsp, 1);
+            }
+
+            // update perf counters
+            perf_stats_.reads += !core_req.write;            
+            perf_stats_.writes += core_req.write;
+
+            // remove input
+            core_req_port.pop();
+        }
+    }
+
+    const PerfStats& perf_stats() const { // reference to the live counters
+        return perf_stats_; 
+    }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+SharedMem::SharedMem(const SimContext& ctx, const char* name, const Config& config) 
+    : SimObject<SharedMem>(ctx, name)   
+    , Inputs(config.num_reqs, this) // one request port per config.num_reqs
+    , Outputs(config.num_reqs, this) // one response port per config.num_reqs
+    , impl_(new Impl(this, config)) // raw owning pointer; released in ~SharedMem()
+{}
+
+SharedMem::~SharedMem() { // frees the Impl allocated by the constructor
+    delete impl_;
+}
+
+void SharedMem::reset() { // forwards to Impl::reset()
+    impl_->reset();
+}
+
+void SharedMem::read(void* data, uint64_t addr, uint32_t size) { // forwards to Impl::read(); arguments are passed through unchanged
+    impl_->read(data, addr, size);
+}
+
+void SharedMem::write(const void* data, uint64_t addr, uint32_t size) { // forwards to Impl::write(); arguments are passed through unchanged
+    impl_->write(data, addr, size);
+}
+
+void SharedMem::tick() { // forwards to Impl::tick()
+    impl_->tick();
+}
+
+const SharedMem::PerfStats& SharedMem::perf_stats() const { // returns Impl's counters by reference, valid only while this object lives
+    return impl_->perf_stats();
+}
--- a/sim/simx/shared_mem.h
+++ b/sim/simx/shared_mem.h
@@ -0,0 +1,72 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <simobject.h>
+#include "types.h"
+
+namespace vortex {
+
+class SharedMem : public SimObject<SharedMem> { // simulation object for the banked shared memory; logic lives in the Impl class
+public:
+  struct Config { // construction-time parameters
+    uint32_t capacity;
+    uint32_t line_size;
+    uint32_t num_reqs; // sizes both the Inputs and the Outputs port vectors
+    uint32_t num_banks;
+    bool write_reponse; // when true, write requests are answered with a MemRsp as well
+  };
+
+  struct PerfStats { // counters, all starting at zero
+    uint64_t reads;
+    uint64_t writes;
+    uint64_t bank_stalls;
+
+    PerfStats() 
+      : reads(0)
+      , writes(0)
+      , bank_stalls(0)
+    {}
+
+    PerfStats& operator+=(const PerfStats& rhs) { // field-wise accumulation of rhs into this object
+      this->reads += rhs.reads;
+      this->writes += rhs.writes;
+      this->bank_stalls += rhs.bank_stalls;
+      return *this;
+    }
+  };
+
+  std::vector<SimPort<MemReq>> Inputs; // request ports, indexed by requester id
+  std::vector<SimPort<MemRsp>> Outputs; // response ports, same index as the matching Inputs entry
+
+  SharedMem(const SimContext& ctx, const char* name, const Config& config);    
+  virtual ~SharedMem();
+
+  void reset();
+
+  void read(void* data, uint64_t addr, uint32_t size);
+
+  void write(const void* data, uint64_t addr, uint32_t size);
+
+  void tick();
+
+  const PerfStats& perf_stats() const;
+
+protected:
+
+  class Impl; // defined in shared_mem.cpp
+  Impl* impl_; // raw owning pointer; NOTE(review): no copy control is declared in this class - confirm SimObject prevents copying
+};
+
+}
--- a/sim/simx/sharedmem.h
+++ b/sim/simx/sharedmem.h
@@ -1,96 +0,0 @@
-#pragma once
-
-#include <simobject.h>
-#include <bitmanip.h>
-#include <vector>
-#include "types.h"
-
-namespace vortex {
-
-class Core;
-
-class SharedMem : public SimObject<SharedMem> {
-public:
-    struct Config {
-        uint32_t num_reqs;
-        uint32_t num_banks; 
-        uint32_t bank_offset;
-        uint32_t latency;
-        bool     write_reponse;
-    };
-
-    struct PerfStats {
-        uint64_t reads;
-        uint64_t writes;
-        uint64_t bank_stalls;
-
-        PerfStats() 
-            : reads(0)
-            , writes(0)
-            , bank_stalls(0)
-        {}
-    };
-
-    std::vector<SimPort<MemReq>> Inputs;
-    std::vector<SimPort<MemRsp>> Outputs;
-
-    SharedMem(const SimContext& ctx, const char* name, const Config& config) 
-        : SimObject<SharedMem>(ctx, name)
-        , Inputs(config.num_reqs, this)
-        , Outputs(config.num_reqs, this)
-        , config_(config)
-        , bank_sel_addr_start_(config.bank_offset)
-        , bank_sel_addr_end_(config.bank_offset + log2up(config.num_banks)-1)
-    {}    
-    
-    virtual ~SharedMem() {}
-
-    void reset() {
-        perf_stats_ = PerfStats();
-    }
-
-    void tick() {
-        std::vector<bool> in_used_banks(config_.num_banks);
-        for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) {
-            auto& core_req_port = this->Inputs.at(req_id);            
-            if (core_req_port.empty())
-                continue;
-
-            auto& core_req = core_req_port.front();
-
-            uint32_t bank_id = (uint32_t)bit_getw(
-                core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_);
-
-            // bank conflict check
-            if (in_used_banks.at(bank_id))
-                continue;
-
-            in_used_banks.at(bank_id) = true;
-
-            if (!core_req.write || config_.write_reponse) {
-                // send response
-                MemRsp core_rsp{core_req.tag, core_req.core_id};
-                this->Outputs.at(req_id).send(core_rsp, 1);
-            }
-
-            // update perf counters
-            perf_stats_.reads += !core_req.write;            
-            perf_stats_.writes += core_req.write;
-
-            // remove input
-            core_req_port.pop();
-        }
-    }
-
-    const PerfStats& perf_stats() const { 
-        return perf_stats_; 
-    }
-
-protected:
-    Config    config_;
-    uint32_t  bank_sel_addr_start_;
-    uint32_t  bank_sel_addr_end_;
-    PerfStats perf_stats_;
-};
-
-}
--- a/sim/simx/tex_unit.cpp
+++ b/sim/simx/tex_unit.cpp
@@ -1,100 +0,0 @@
-#include "tex_unit.h"
-#include "core.h"
-#include <texturing.h>
-#include <VX_config.h>
-
-using namespace vortex;
-using namespace cocogfx;
-
-enum class FilterMode {
-  Point,
-  Bilinear,
-  Trilinear,
-};
-
-TexUnit::TexUnit(Core* core) : core_(core) {}
-
-TexUnit::~TexUnit() {}
-
-void TexUnit::clear() {
-  for (auto& state : states_) {
-    state = 0;
-  }
-}
-
-uint32_t TexUnit::get_state(uint32_t state) {
-  return states_.at(state);
-}
-  
-void TexUnit::set_state(uint32_t state, uint32_t value) {
-  states_.at(state) = value;
-}
-
-uint32_t TexUnit::read(int32_t u, 
-                       int32_t v, 
-                       int32_t lod, 
-                       std::vector<mem_addr_size_t>* mem_addrs) {
-  //--
-  auto xu = Fixed<TEX_FXD_FRAC>::make(u);
-  auto xv = Fixed<TEX_FXD_FRAC>::make(v);
-  uint32_t base_addr  = states_.at(TEX_STATE_ADDR) + states_.at(TEX_STATE_MIPOFF(lod));
-  uint32_t log_width  = std::max<int32_t>(states_.at(TEX_STATE_WIDTH) - lod, 0);
-  uint32_t log_height = std::max<int32_t>(states_.at(TEX_STATE_HEIGHT) - lod, 0);
-  auto format         = (TexFormat)states_.at(TEX_STATE_FORMAT);    
-  auto filter         = (FilterMode)states_.at(TEX_STATE_FILTER);    
-  auto wrapu          = (WrapMode)states_.at(TEX_STATE_WRAPU);
-  auto wrapv          = (WrapMode)states_.at(TEX_STATE_WRAPV);
-
-  auto stride = Stride(format);
-  
-  switch (filter) {
-  case FilterMode::Bilinear: {
-    // addressing
-    uint32_t offset00, offset01, offset10, offset11;
-    uint32_t alpha, beta;
-    TexAddressLinear(xu, xv, log_width, log_height, wrapu, wrapv, 
-      &offset00, &offset01, &offset10, &offset11, &alpha, &beta);
-
-    uint32_t addr00 = base_addr + offset00 * stride;
-    uint32_t addr01 = base_addr + offset01 * stride;
-    uint32_t addr10 = base_addr + offset10 * stride;
-    uint32_t addr11 = base_addr + offset11 * stride;
-
-    // memory lookup
-    uint32_t texel00(0), texel01(0), texel10(0), texel11(0);
-    core_->dcache_read(&texel00, addr00, stride);
-    core_->dcache_read(&texel01, addr01, stride);
-    core_->dcache_read(&texel10, addr10, stride);
-    core_->dcache_read(&texel11, addr11, stride);
-
-    mem_addrs->push_back({addr00, stride});
-    mem_addrs->push_back({addr01, stride});
-    mem_addrs->push_back({addr10, stride});
-    mem_addrs->push_back({addr11, stride});
-
-    // filtering
-    auto color = TexFilterLinear(
-      format, texel00, texel01, texel10, texel11, alpha, beta);
-    return color;
-  }
-  case FilterMode::Point: {
-    // addressing
-    uint32_t offset;
-    TexAddressPoint(xu, xv, log_width, log_height, wrapu, wrapv, &offset);
-    
-    uint32_t addr = base_addr + offset * stride;
-
-    // memory lookup
-    uint32_t texel(0);
-    core_->dcache_read(&texel, addr, stride);
-    mem_addrs->push_back({addr, stride});
-
-    // filtering
-    auto color = TexFilterPoint(format, texel);
-    return color;
-  }
-  default:
-    std::abort();
-    return 0;
-  }
-}
--- a/sim/simx/tex_unit.h
+++ b/sim/simx/tex_unit.h
@@ -1,28 +0,0 @@
-#pragma once
-
-#include "types.h"
-
-namespace vortex {
-
-class Core;
-
-class TexUnit {
-public:
-    TexUnit(Core* core);
-    ~TexUnit();
-
-    void clear();
-
-    uint32_t get_state(uint32_t state);
-  
-    void set_state(uint32_t state, uint32_t value);
-
-    uint32_t read(int32_t u, int32_t v, int32_t lod, std::vector<mem_addr_size_t>* mem_addrs);
-
-private:
-
-    std::array<uint32_t, NUM_TEX_STATES> states_;
-    Core* core_;
-};
-
-}
--- a/sim/simx/types.h
+++ b/sim/simx/types.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #pragma once

 #include <stdint.h>
@@ -5,31 +18,42 @@
 #include <queue>
 #include <unordered_map>
 #include <util.h>
+#include <stringutil.h>
 #include <VX_config.h>
 #include <simobject.h>
+#include "uuid_gen.h"
+#include "debug.h"

 namespace vortex {

 typedef uint8_t Byte;
-#if XLEN == 32
+#if (XLEN == 32)
 typedef uint32_t Word;
 typedef int32_t  WordI;
 typedef uint64_t DWord;
 typedef int64_t  DWordI;
-#elif XLEN == 64
+typedef uint32_t WordF; // same width as Word for this XLEN
+#elif (XLEN == 64)
 typedef uint64_t Word;
 typedef int64_t  WordI;
 typedef __uint128_t DWord;
 typedef __int128_t DWordI;
+typedef uint64_t WordF; // same width as Word for this XLEN
 #else
 #error unsupported XLEN
 #endif
 
-typedef uint64_t FWord;
+#define MAX_NUM_CORES   1024
+#define MAX_NUM_THREADS 32
+#define MAX_NUM_WARPS   32
+#define MAX_NUM_REGS    32
 
-typedef std::bitset<32> RegMask;
-typedef std::bitset<32> ThreadMask;
-typedef std::bitset<32> WarpMask;
+typedef std::bitset<MAX_NUM_CORES>   CoreMask; // mask widths are fixed by the MAX_NUM_* macros
+typedef std::bitset<MAX_NUM_REGS>    RegMask;
+typedef std::bitset<MAX_NUM_THREADS> ThreadMask;
+typedef std::bitset<MAX_NUM_WARPS>   WarpMask;
+
+typedef std::unordered_map<uint32_t, uint32_t> CSRs; // 32-bit key to 32-bit value; presumably CSR address to value - confirm at use sites

 ///////////////////////////////////////////////////////////////////////////////

@@ -40,8 +64,8 @@ enum class RegType {
  Vector
 };

-inline std::ostream &operator<<(std::ostream &os, const RegType& clss) {
-  switch (clss) {
+inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
+  switch (type) {
  case RegType::None: break;
  case RegType::Integer: os << "x"; break;  
  case RegType::Float:   os << "f"; break;
@@ -53,23 +77,19 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& clss) {
 ///////////////////////////////////////////////////////////////////////////////

 enum class ExeType {
-  NOP,
   ALU,
   LSU,
-  CSR,
   FPU,
-  GPU,
+  SFU, // takes the place of the removed GPU entry
   MAX,
 };

 inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
  switch (type) {
-  case ExeType::NOP: os << "NOP"; break;
  case ExeType::ALU: os << "ALU"; break;
  case ExeType::LSU: os << "LSU"; break;
-  case ExeType::CSR: os << "CSR"; break;
  case ExeType::FPU: os << "FPU"; break;
-  case ExeType::GPU: os << "GPU"; break;
+  case ExeType::SFU: os << "SFU"; break;
  case ExeType::MAX: break;
  }
  return os;
@@ -82,8 +102,7 @@ enum class AluType {
  BRANCH,
  SYSCALL,
  IMUL,
-  IDIV,    
-  CMOV,
+  IDIV
 };

 inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
@@ -93,7 +112,6 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
  case AluType::SYSCALL: os << "SYSCALL"; break;
  case AluType::IMUL:    os << "IMUL"; break;
  case AluType::IDIV:    os << "IDIV"; break;
-  case AluType::CMOV:    os << "CMOV"; break;
  }
  return os;
 }
@@ -103,16 +121,14 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
 enum class LsuType {
   LOAD,
   STORE,
-  FENCE,
-  PREFETCH,    
+  FENCE // last enumerator; PREFETCH is no longer part of the enum
 };

 inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
   switch (type) {
-  case LsuType::LOAD:     os << "LOAD"; break;
-  case LsuType::STORE:    os << "STORE"; break;
-  case LsuType::FENCE:    os << "FENCE"; break;
-  case LsuType::PREFETCH: os << "PREFETCH"; break;
+  case LsuType::LOAD:  os << "LOAD"; break; // each enumerator prints its own identifier
+  case LsuType::STORE: os << "STORE"; break;
+  case LsuType::FENCE: os << "FENCE"; break;
   }
   return os;
 }
@@ -141,21 +157,6 @@ struct mem_addr_size_t {
  uint32_t size;
 };

-inline AddrType get_addr_type(Word addr, uint32_t size) {
-  __unused (size);
-  if (SM_ENABLE) {
-    if (addr >= (SMEM_BASE_ADDR - SMEM_SIZE)
-    &&  addr < SMEM_BASE_ADDR) {      
-      assert((addr + size) <= SMEM_BASE_ADDR);
-      return AddrType::Shared;
-    }
-  }
-  if (addr >= IO_BASE_ADDR) {
-     return AddrType::IO;
-  }
-  return AddrType::Global;
-}
-
 ///////////////////////////////////////////////////////////////////////////////

 enum class FpuType {
@@ -179,23 +180,37 @@ inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {

 ///////////////////////////////////////////////////////////////////////////////

-enum class GpuType {
+enum class SfuType { // renamed from GpuType; adds PRED, the three CSR operations, RASTER, ROP and CMOV
   TMC,
   WSPAWN,
   SPLIT,
   JOIN,
   BAR,
+  PRED,
+  CSRRW,
+  CSRRS,
+  CSRRC,
   TEX,
+  RASTER,
+  ROP,    
+  CMOV  
 };

-inline std::ostream &operator<<(std::ostream &os, const GpuType& type) {
+inline std::ostream &operator<<(std::ostream &os, const SfuType& type) { // streams the enumerator's identifier as text
   switch (type) {
-  case GpuType::TMC:    os << "TMC"; break;
-  case GpuType::WSPAWN: os << "WSPAWN"; break;
-  case GpuType::SPLIT:  os << "SPLIT"; break;
-  case GpuType::JOIN:   os << "JOIN"; break;
-  case GpuType::BAR:    os << "BAR"; break;
-  case GpuType::TEX:    os << "TEX"; break;
+  case SfuType::TMC:    os << "TMC"; break;
+  case SfuType::WSPAWN: os << "WSPAWN"; break;
+  case SfuType::SPLIT:  os << "SPLIT"; break;
+  case SfuType::JOIN:   os << "JOIN"; break;
+  case SfuType::BAR:    os << "BAR"; break;
+  case SfuType::PRED:   os << "PRED"; break;
+  case SfuType::CSRRW:  os << "CSRRW"; break;
+  case SfuType::CSRRS:  os << "CSRRS"; break;
+  case SfuType::CSRRC:  os << "CSRRC"; break;
+  case SfuType::TEX:    os << "TEX"; break;
+  case SfuType::RASTER: os << "RASTER"; break;
+  case SfuType::ROP:    os << "ROP"; break;
+  case SfuType::CMOV:   os << "CMOV"; break; // no default case: a value outside the enum prints nothing
   }
   return os;
 }
@@ -218,31 +233,32 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
 ///////////////////////////////////////////////////////////////////////////////

 struct MemReq {
-    uint64_t addr;
-    bool write;
-    bool non_cacheable;
-    uint32_t tag;
-    uint32_t core_id;    
-    uint64_t uuid;
+  uint64_t addr;
+  bool write; // true for a write request, false for a read
+  AddrType type; // replaces the former non_cacheable flag
+  uint32_t tag; // NOTE(review): 32-bit here, but the constructor takes a 64-bit _tag and MemRsp::tag is 64-bit - upper bits are dropped
+  uint32_t cid;    
+  uint64_t uuid;
 
-    MemReq(uint64_t _addr = 0, 
-           bool _write = false,
-           bool _non_cacheable = false,
-           uint64_t _tag = 0, 
-           uint32_t _core_id = 0,
-           uint64_t _uuid = 0
-    )   : addr(_addr)
-        , write(_write)
-        , non_cacheable(_non_cacheable)
-        , tag(_tag)
-        , core_id(_core_id)
-        , uuid(_uuid)
-    {}
+  MemReq(uint64_t _addr = 0, 
+          bool _write = false,
+          AddrType _type = AddrType::Global,
+          uint64_t _tag = 0, 
+          uint32_t _cid = 0,
+          uint64_t _uuid = 0
+  ) : addr(_addr) // every parameter is defaulted, so MemReq is default-constructible
+    , write(_write)
+    , type(_type)
+    , tag(_tag)
+    , cid(_cid)
+    , uuid(_uuid)
+  {}
 };

 inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
   os << "mem-" << (req.write ? "wr" : "rd") << ": ";
-  os << "addr=" << std::hex << req.addr << std::dec << ", tag=" << req.tag << ", core_id=" << req.core_id;
+  os << "addr=0x" << std::hex << req.addr << ", type=" << req.type; // std::hex is still active while req.type is streamed
+  os << std::dec << ", tag=" << req.tag << ", cid=" << req.cid; // back to decimal for tag and cid
   os << " (#" << std::dec << req.uuid << ")";
   return os;
 }
@@ -250,18 +266,19 @@ inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
 ///////////////////////////////////////////////////////////////////////////////

 struct MemRsp {
-    uint64_t tag;    
-    uint32_t core_id;
-    uint64_t uuid;
-    MemRsp(uint64_t _tag = 0, uint32_t _core_id = 0, uint64_t _uuid = 0)
-      : tag (_tag) 
-      , core_id(_core_id)
-      , uuid(_uuid)
-    {}
+  uint64_t tag;    
+  uint32_t cid;
+  uint64_t uuid;
+  
+  MemRsp(uint64_t _tag = 0, uint32_t _cid = 0, uint64_t _uuid = 0) // every parameter is defaulted, so MemRsp is default-constructible
+    : tag (_tag) 
+    , cid(_cid)
+    , uuid(_uuid)
+  {}
 };

 inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {
-  os << "mem-rsp: tag=" << rsp.tag << ", core_id=" << rsp.core_id;
+  os << "mem-rsp: tag=" << rsp.tag << ", cid=" << rsp.cid; // no base manipulator here: tag and cid use the stream's current base
   os << " (#" << std::dec << rsp.uuid << ")";
   return os;
 }
@@ -270,10 +287,6 @@ inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {

 template <typename T>
 class HashTable {
-private:
-  std::vector<std::pair<bool, T>> entries_;
-  uint32_t size_;
-
 public:    
  HashTable(uint32_t capacity)
    : entries_(capacity)
@@ -336,92 +349,180 @@ public:
    }
    size_ = 0;
  }
+
+private:
+  std::vector<std::pair<bool, T>> entries_;
+  uint32_t size_;
 };

 ///////////////////////////////////////////////////////////////////////////////

-template <typename Req, typename Rsp, uint32_t MaxInputs = 32>
+template <typename Req, typename Rsp>
 class Switch : public SimObject<Switch<Req, Rsp>> {
-private:
-  ArbiterType type_;
-  uint32_t delay_;  
-  uint32_t cursor_;
-  uint32_t tag_shift_;
-
 public:
+  std::vector<SimPort<Req>>  ReqIn; // input side: requests are popped from here in tick()
+  std::vector<SimPort<Rsp>>  RspIn; // input side: responses are sent back through here
+
+  std::vector<SimPort<Req>>  ReqOut;  
+  std::vector<SimPort<Rsp>>  RspOut; // output side: responses are popped from here in tick()
+
   Switch(
     const SimContext& ctx, 
     const char* name, 
     ArbiterType type, 
-    uint32_t num_inputs, 
+    uint32_t num_inputs = 1, 
+    uint32_t num_outputs = 1,
     uint32_t delay = 1
   ) 
-    : SimObject<Switch<Req, Rsp, MaxInputs>>(ctx, name)    
+    : SimObject<Switch<Req, Rsp>>(ctx, name)    
+    , ReqIn(num_inputs,   this)
+    , RspIn(num_inputs,   this)
+    , ReqOut(num_outputs, this)    
+    , RspOut(num_outputs, this)
     , type_(type)
     , delay_(delay)
-    , cursor_(0)
-    , tag_shift_(log2ceil(num_inputs))
-    , ReqIn(num_inputs, this)
-    , ReqOut(this)
-    , RspIn(this)    
-    , RspOut(num_inputs, this)
+    , cursors_(num_outputs, 0)
+    , lg_num_reqs_(log2ceil((num_inputs + num_outputs - 1) / num_outputs)) // ceiling division: trailing inputs stay reachable when inputs are not a multiple of outputs
   {
-    assert(delay_ != 0);
-    assert(num_inputs <= MaxInputs);
-    if (num_inputs == 1) {
-      // bypass
-      ReqIn.at(0).bind(&ReqOut);
-      RspIn.bind(&RspOut.at(0));
+    assert(delay != 0);    
+    assert(num_inputs <= 32);
+    assert(num_outputs <= 32);
+    assert(num_inputs >= num_outputs);
+
+    if (num_inputs == num_outputs) {
+      // bypass mode
+      for (uint32_t i = 0; i < num_inputs; ++i) {
+        ReqIn.at(i).bind(&ReqOut.at(i));
+        RspOut.at(i).bind(&RspIn.at(i));
+      }
     }
   }
 
   void reset() {
-    cursor_ = 0;
+    for (auto& cursor : cursors_) {
+      cursor = 0;
+    }
   }
 
-  void tick() {  
-    if (ReqIn.size() == 1)
+  void tick() {
+    uint32_t I = ReqIn.size();
+    uint32_t O = ReqOut.size();
+    uint32_t R = 1 << lg_num_reqs_; // inputs per output group, rounded up to a power of two
+
+    // skip bypass mode
+    if (I == O)
       return;
         
-    // process incomming requests    
-    for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) {      
-      uint32_t j = (cursor_ + i) % n;
-      auto& req_in = ReqIn.at(j);      
-      if (!req_in.empty()) {
-        auto& req = req_in.front();
-        if (tag_shift_) {
-          req.tag = (req.tag << tag_shift_) | j;
+    // process incoming requests        
+    for (uint32_t o = 0; o < O; ++o) {
+      for (uint32_t r = 0; r < R; ++r) {
+        uint32_t i = (cursors_.at(o) + r) & (R-1);
+        uint32_t j = o * R + i;
+        if (j >= I) // slot has no matching input because the group is only partially populated
+          continue;
+        
+        auto& req_in = ReqIn.at(j);
+        if (!req_in.empty()) {
+          auto& req = req_in.front();
+          if (lg_num_reqs_ != 0) {
+            req.tag = (req.tag << lg_num_reqs_) | i; // low tag bits record the input slot so the response can be routed back
+          }
+          DT(4, this->name() << "-" << req);
+          ReqOut.at(o).send(req, delay_);                
+          req_in.pop();
+          this->update_cursor(o, i);
+          break;
         }
-        ReqOut.send(req, delay_);                
-        req_in.pop();
-        this->update_cursor(j);
-        break;
       }
-    } 
-
-    // process incoming reponses
-    if (!RspIn.empty()) {
-      auto& rsp = RspIn.front();    
-      uint32_t port_id = 0;
-      if (tag_shift_) {
-        port_id = rsp.tag & ((1 << tag_shift_)-1);
-        rsp.tag >>= tag_shift_;
-      }      
-      RspOut.at(port_id).send(rsp, 1);
-      RspIn.pop();
+      
+      // process incoming responses
+      if (!RspOut.at(o).empty()) {
+        auto& rsp = RspOut.at(o).front();
+        uint32_t i = 0;
+        if (lg_num_reqs_ != 0) {
+          i = rsp.tag & (R-1);
+          rsp.tag >>= lg_num_reqs_; // restores the tag the requester originally sent
+        }      
+        DT(4, this->name() << "-" << rsp);
+        uint32_t j = o * R + i;
+        RspIn.at(j).send(rsp, 1);      
+        RspOut.at(o).pop();
+      }
     }
   }
 
-  void update_cursor(uint32_t grant) {
+  void update_cursor(uint32_t index, uint32_t grant) {
     if (type_ == ArbiterType::RoundRobin) {
-      cursor_ = grant + 1;
+      cursors_.at(index) = grant + 1; // may equal R; tick() masks the cursor with R-1 before use
     }
   }
 
-  std::vector<SimPort<Req>>  ReqIn;
-  SimPort<Req>              ReqOut;
-  SimPort<Rsp>               RspIn;    
-  std::vector<SimPort<Rsp>> RspOut;
+private:
+  ArbiterType type_;
+  uint32_t delay_;  
+  std::vector<uint32_t> cursors_; // one arbitration cursor per output
+  uint32_t lg_num_reqs_; // log2 of the group size; also the number of tag bits used for routing
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+class SMemDemux : public SimObject<SMemDemux> { // routes each request to the shared-memory or the other port pair based on req.type
+public:
+  SimPort<MemReq>  ReqIn;
+  SimPort<MemRsp>  RspIn; // responses from both downstream sides are sent back through this port
+
+  SimPort<MemReq>  ReqSm; // receives requests whose type is AddrType::Shared
+  SimPort<MemRsp>  RspSm;
+
+  SimPort<MemReq>  ReqDc; // receives every request whose type is not AddrType::Shared
+  SimPort<MemRsp>  RspDc;
+
+  SMemDemux(
+    const SimContext& ctx, 
+    const char* name, 
+    uint32_t delay = 1
+  ) : SimObject<SMemDemux>(ctx, name)    
+    , ReqIn(this)
+    , RspIn(this)
+    , ReqSm(this)
+    , RspSm(this)
+    , ReqDc(this)
+    , RspDc(this)
+    , delay_(delay)
+  {}
+
+  void reset() {}
+
+  void tick() {
+    // process incoming requests  
+    if (!ReqIn.empty()) { // at most one request is forwarded per tick
+      auto& req = ReqIn.front();
+      DT(4, this->name() << "-" << req);
+      if (req.type == AddrType::Shared) {
+        ReqSm.send(req, delay_);
+      } else {
+        ReqDc.send(req, delay_);
+      }
+      ReqIn.pop();
+    }   
+      
+    // process incoming responses
+    if (!RspSm.empty()) {
+      auto& rsp = RspSm.front();
+      DT(4, this->name() << "-" << rsp);
+      RspIn.send(rsp, 1);
+      RspSm.pop();
+    }
+    if (!RspDc.empty()) { // not an else-branch: both sides can forward into RspIn within the same tick
+      auto& rsp = RspDc.front();
+      DT(4, this->name() << "-" << rsp);
+      RspIn.send(rsp, 1);
+      RspDc.pop();
+    }
+  }
+
+private:
+  uint32_t delay_; // applied to forwarded requests only; responses are always sent with a delay of 1
+};

 }
--- a/sim/simx/warp.cpp
+++ b/sim/simx/warp.cpp
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include <iostream>
 #include <stdlib.h>
 #include <unistd.h>
@@ -10,21 +23,25 @@

 using namespace vortex;

-Warp::Warp(Core *core, uint32_t id)
-    : id_(id)
+Warp::Warp(Core *core, uint32_t warp_id)
+    : warp_id_(warp_id)
+    , arch_(core->arch()) // keeps core->arch() in a member so later code can use arch_ directly
     , core_(core)
     , ireg_file_(core->arch().num_threads(), std::vector<Word>(core->arch().num_regs()))
-    , freg_file_(core->arch().num_threads(), std::vector<FWord>(core->arch().num_regs()))
+    , freg_file_(core->arch().num_threads(), std::vector<uint64_t>(core->arch().num_regs())) // float registers are 64-bit regardless of XLEN
     , vreg_file_(core->arch().num_threads(), std::vector<Byte>(core->arch().vsize()))
 {
-  this->clear();
+  this->reset(); // reset() dereferences core_, which the initializer list above has already set
 }

-void Warp::clear() {
-  active_ = false;
-  PC_ = STARTUP_ADDR;
+void Warp::reset() {
+  PC_ = core_->dcrs().base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR0);
+#if (XLEN == 64)
+  PC_ = (uint64_t(core_->dcrs().base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32) | PC_;
+#endif
  tmask_.reset();  
-  for (uint32_t i = 0, n = core_->arch().num_threads(); i < n; ++i) {
+  issued_instrs_ = 0;
+  for (uint32_t i = 0, n = arch_.num_threads(); i < n; ++i) {
    for (auto& reg : ireg_file_.at(i)) {
      reg = 0;
    }
@@ -35,31 +52,44 @@ void Warp::clear() {
      reg = 0;
    }
  }
+  uui_gen_.reset();
 }

-void Warp::eval(pipeline_trace_t *trace) {
+pipeline_trace_t* Warp::eval() {
  assert(tmask_.any());

-  DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask=");
-  for (uint32_t i = 0, n = core_->arch().num_threads(); i < n; ++i)
-    DPN(2, tmask_.test(n-i-1));
-  DPN(2, ", PC=0x" << std::hex << PC_ << " (#" << std::dec << trace->uuid << ")" << std::endl);
-
-  /* Fetch and decode. */    
+#ifndef NDEBUG
+  uint32_t instr_uuid = uui_gen_.get_uuid(PC_);
+  uint32_t g_wid = core_->id() * arch_.num_warps() + warp_id_;
+  uint32_t instr_id  = instr_uuid & 0xffff;
+  uint32_t instr_ref = instr_uuid >> 16;
+  uint64_t uuid = (uint64_t(instr_ref) << 32) | (g_wid << 16) | instr_id;
+#else
+  uint64_t uuid = 0;
+#endif
+  
+  DPH(1, "Fetch: cid=" << core_->id() << ", wid=" << warp_id_ << ", tmask=");
+  for (uint32_t i = 0, n = arch_.num_threads(); i < n; ++i)
+    DPN(1, tmask_.test(i));
+  DPN(1, ", PC=0x" << std::hex << PC_ << " (#" << std::dec << uuid << ")" << std::endl);

+  // Fetch
  uint32_t instr_code = 0;
  core_->icache_read(&instr_code, PC_, sizeof(uint32_t));
-  auto instr = core_->decoder().decode(instr_code);
+
+  // Decode
+  auto instr = core_->decoder_.decode(instr_code);
  if (!instr) {
-    std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=" << PC_ << std::endl;
+    std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=0x" << PC_ << " (#" << std::dec << uuid << ")" << std::endl;
    std::abort();
  }  

-  DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr);
+  DP(1, "Instr 0x" << std::hex << instr_code << ": " << *instr);

-  // Update trace
+  // Create trace
+  auto trace = new pipeline_trace_t(uuid, arch_);
  trace->cid   = core_->id();
-  trace->wid   = id_;
+  trace->wid   = warp_id_;
  trace->PC    = PC_;
  trace->tmask = tmask_;
  trace->rdest = instr->getRDest();
@@ -68,18 +98,20 @@ void Warp::eval(pipeline_trace_t *trace) {
  // Execute
  this->execute(*instr, trace);

-  DP(4, "Register state:");
-  for (uint32_t i = 0; i < core_->arch().num_regs(); ++i) {
-    DPN(4, "  %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
+  DP(5, "Register state:");
+  for (uint32_t i = 0; i < arch_.num_regs(); ++i) {
+    DPN(5, "  %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
    // Integer register file
-    for (uint32_t j = 0; j < core_->arch().num_threads(); ++j) {
-      DPN(4, ' ' << std::setfill('0') << std::setw(XLEN/4) << std::hex << ireg_file_.at(j).at(i) << std::setfill(' ') << ' ');
+    for (uint32_t j = 0; j < arch_.num_threads(); ++j) {
+      DPN(5, ' ' << std::setfill('0') << std::setw(XLEN/4) << std::hex << ireg_file_.at(j).at(i) << std::setfill(' ') << ' ');
    }
-    DPN(4, '|');
+    DPN(5, '|');
    // Floating point register file
-    for (uint32_t j = 0; j < core_->arch().num_threads(); ++j) {
-      DPN(4, ' ' << std::setfill('0') << std::setw(16) << std::hex << freg_file_.at(j).at(i) << std::setfill(' ') << ' ');
+    for (uint32_t j = 0; j < arch_.num_threads(); ++j) {
+      DPN(5, ' ' << std::setfill('0') << std::setw(16) << std::hex << freg_file_.at(j).at(i) << std::setfill(' ') << ' ');
    }
-    DPN(4, std::endl);
+    DPN(5, std::endl);
  }  
+
+  return trace;
 }
--- a/sim/simx/warp.h
+++ b/sim/simx/warp.h
@@ -1,3 +1,16 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #ifndef __WARP_H
 #define __WARP_H

@@ -7,28 +20,26 @@

 namespace vortex {

+class Arch;
 class Core;
 class Instr;
 class pipeline_trace_t;
+
 struct DomStackEntry {
  DomStackEntry(const ThreadMask &tmask, Word PC) 
    : tmask(tmask)
    , PC(PC)
-    , fallThrough(false)
-    , unanimous(false) 
+    , fallthrough(false)
  {}

-  DomStackEntry(const ThreadMask &tmask)
-      : tmask(tmask)
-      , PC(0)
-      , fallThrough(true)
-      , unanimous(false) 
+  DomStackEntry(const ThreadMask &tmask) 
+    : tmask(tmask)
+    , fallthrough(true)
  {}

  ThreadMask tmask;
  Word PC;
-  bool fallThrough;
-  bool unanimous;
+  bool fallthrough;
 };

 struct vtype {
@@ -40,72 +51,58 @@ struct vtype {

 class Warp {
 public:
-  Warp(Core *core, uint32_t id);
+  Warp(Core *core, uint32_t warp_id);

-  void clear();
-  
-  bool active() const {
-    return active_;
-  }
-
-  void suspend() {
-    active_ = false;
-  }
-
-  void activate() {
-    active_ = true;
-  }
-
-  std::size_t getActiveThreads() const {
-    if (active_)
-      return tmask_.count();
-    return 0;
-  }
+  void reset();

  uint32_t id() const {
-    return id_;
+    return warp_id_;
  }

-  uint32_t getPC() const {
+  Word getPC() const {
    return PC_;
  }

-  void setPC(uint32_t PC) {
+  void setPC(Word PC) {
    PC_ = PC;
  }

  void setTmask(size_t index, bool value) {
    tmask_.set(index, value);
-    active_ = tmask_.any();
  }

-  uint32_t getTmask() const {
-    if (active_)
-      return tmask_.to_ulong();
-    return 0;
+  uint64_t getTmask() const {
+    return tmask_.to_ulong();
  }

-  uint32_t getIRegValue(uint32_t reg) const {
+  Word getIRegValue(uint32_t reg) const {
    return ireg_file_.at(0).at(reg);
  }

-  void eval(pipeline_trace_t *);
+  uint64_t incr_instrs() {
+    return issued_instrs_++;
+  }
+
+  pipeline_trace_t* eval();

 private:

  void execute(const Instr &instr, pipeline_trace_t *trace);
+
+  UUIDGenerator uui_gen_;
  
-  uint32_t id_;
+  uint32_t warp_id_;
+  const Arch& arch_;
  Core *core_;
-  bool active_;
+  uint64_t issued_instrs_;
  
  Word PC_;
-  ThreadMask tmask_;  
-  
-  std::vector<std::vector<Word>> ireg_file_;
-  std::vector<std::vector<FWord>> freg_file_;
-  std::vector<std::vector<Byte>> vreg_file_;
-  std::stack<DomStackEntry> dom_stack_;
+  ThreadMask tmask_;
+
+  std::vector<std::vector<Word>>     ireg_file_;
+  std::vector<std::vector<uint64_t>> freg_file_;
+  std::vector<std::vector<Byte>>     vreg_file_;
+  std::stack<DomStackEntry>          ipdom_stack_;

  struct vtype vtype_;
  uint32_t vl_;
--- a/sim/vlsim/Makefile
+++ b/sim/vlsim/Makefile
@@ -1,101 +0,0 @@
-DESTDIR ?= .
-RTL_DIR = ../../hw/rtl
-DPI_DIR = ../../hw/dpi
-SCRIPT_DIR = ../../hw/scripts
-THIRD_PARTY_DIR = ../../third_party
-
-CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds
-CXXFLAGS += -fPIC -Wno-maybe-uninitialized
-CXXFLAGS += -I.. -I../../../hw -I../../common
-CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include
-CXXFLAGS += -I../$(THIRD_PARTY_DIR)
-
-LDFLAGS += -shared ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
-LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator
-
-# control RTL debug tracing states
-DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_PIPELINE  
-DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_ICACHE
-DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_DCACHE
-DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_MEM
-DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_BANK 
-DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_MSHR
-DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_TAG
-DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_DATA
-DBG_TRACE_FLAGS += -DDBG_TRACE_AFU
-DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
-DBG_TRACE_FLAGS += -DDBG_TRACE_TEX
-
-DBG_FLAGS += $(DBG_TRACE_FLAGS)
-
-SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
-SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
-SRCS += fpga.cpp opae_sim.cpp
-
-FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -I$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(THIRD_PARTY_DIR)/fpnew/src
-TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
-RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) $(TEX_INCLUDE)
-RTL_INCLUDE += -I$(RTL_DIR)/afu -I$(RTL_DIR)/afu/ccip
-
-TOP = vortex_afu_shim
-
-VL_FLAGS = --exe --cc $(TOP) --top-module $(TOP)
-VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic
-VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO -Wno-EOFNEWLINE
-VL_FLAGS += --x-initial unique --x-assign unique
-VL_FLAGS += verilator.vlt
-VL_FLAGS += $(RTL_INCLUDE)
-
-VL_FLAGS += $(CONFIGS)
-CXXFLAGS += $(CONFIGS)
-
-# Enable Verilator multithreaded simulation
-THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count())')
-VL_FLAGS += -j $(THREADS)
-#VL_FLAGS += --threads $(THREADS)
-
-# Debugigng
-ifdef DEBUG
-	VL_FLAGS += --trace --trace-structs -DVCD_OUTPUT $(DBG_FLAGS)
-	CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS)
-else    
-	VL_FLAGS += -DNDEBUG
-	CXXFLAGS += -O2 -DNDEBUG
-endif
-
-# Enable scope analyzer
-ifdef SCOPE
-	VL_FLAGS += -DSCOPE
-	CXXFLAGS += -DSCOPE
-endif
-
-# Enable perf counters
-ifdef PERF
-	VL_FLAGS += -DPERF_ENABLE
-	CXXFLAGS += -DPERF_ENABLE
-endif
-
-# use our OPAE shim
-VL_FLAGS += -DNOPAE
-CXXFLAGS += -DNOPAE
-
-# ALU backend
-VL_FLAGS += -DIMUL_DPI
-VL_FLAGS += -DIDIV_DPI
-
-# FPU backend
-FPU_CORE ?= FPU_DPI
-VL_FLAGS += -D$(FPU_CORE)
-
-PROJECT = libopae-c-vlsim.so
-
-all: $(PROJECT)
-
-vortex_afu.h : $(RTL_DIR)/afu/vortex_afu.vh
-	$(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/afu/vortex_afu.vh -o vortex_afu.h
-
-$(DESTDIR)/$(PROJECT): $(SRCS) vortex_afu.h
-	verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(DESTDIR)/$(PROJECT)
-
-clean:
-	rm -rf obj_dir $(DESTDIR)/$(PROJECT)
--- a/sim/vlsim/opae_sim.h
+++ b/sim/vlsim/opae_sim.h
@@ -1,30 +0,0 @@
-#pragma once
-
-#include <stdint.h>
-namespace vortex {
-
-class RAM;
-
-class opae_sim {
-public:
-  
-  opae_sim();
-  virtual ~opae_sim();
-
-  int prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags);
-
-  void release_buffer(uint64_t wsid);
-
-  void get_io_address(uint64_t wsid, uint64_t *ioaddr);
-
-  void write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value);
-
-  void read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value);
-
-private: 
-
-  class Impl;
-  Impl* impl_;  
-};
-
-}
--- a/sim/vlsim/verilator.vlt
+++ b/sim/vlsim/verilator.vlt
@@ -1,10 +0,0 @@
-`verilator_config
-
-lint_off -rule BLKANDNBLK -file "../../../hw/rtl/fp_cores/fpnew/*"
-lint_off -rule UNOPTFLAT -file "../../../hw/rtl/fp_cores/fpnew/*"
-lint_off -rule WIDTH -file "../../../hw/rtl/fp_cores/fpnew/*"
-lint_off -rule UNUSED -file "../../../hw/rtl/fp_cores/fpnew/*"
-lint_off -rule LITENDIAN -file "../../../hw/rtl/fp_cores/fpnew/*"
-lint_off -rule IMPORTSTAR -file "../../../hw/rtl/fp_cores/fpnew/*"
-lint_off -rule PINCONNECTEMPTY -file "../../../hw/rtl/fp_cores/fpnew/*"
-lint_off -file "../rtl/fp_cores/fpnew/*"