SimX timing simulation

2021-11-13 01:41:12 -05:00
parent 009e897cab
commit c2721fd545
26 changed files with 3690 additions and 1639 deletions
--- a/sim/common/simobject.h
+++ b/sim/common/simobject.h
@@ -0,0 +1,427 @@
+#pragma once
+
+#include <functional>
+#include <iostream>
+#include <memory>
+#include <vector>
+#include <list>
+#include <assert.h>
+
+namespace vortex {
+
+class SimObjectBase;
+
+// Abstract base of a deferred simulation event: a countdown of remaining
+// cycles plus a callback supplied by the concrete subclass.
+class SimEventBase {
+public:
+  using Ptr = std::shared_ptr<SimEventBase>;
+
+  virtual ~SimEventBase() = default;
+
+  // Runs the callback carried by the concrete event.
+  virtual void fire() const = 0;
+
+  // Consumes one cycle of the remaining delay and reports whether the event
+  // is now due. The counter is unsigned, so an event built with a zero delay
+  // wraps around instead of firing.
+  bool step() {
+    --delay_;
+    return (delay_ == 0);
+  }
+
+protected:
+  SimEventBase(uint64_t delay) : delay_(delay) {}
+
+  uint64_t delay_;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Event that hands a stored packet to a one-argument callback when fired.
+template <typename Pkt>
+class SimSimpleEvent : public SimEventBase {
+public:
+  using Func = std::function<void (const Pkt&)>;
+
+  // Allocates an event that will call func(pkt) once `delay` cycles elapsed.
+  template <typename... Args>
+  static Ptr Create(const Func& func, const Pkt& pkt, uint64_t delay) {
+    return std::make_shared<SimSimpleEvent<Pkt>>(func, pkt, delay);
+  }
+
+  // Copies both the callback and the packet into the event.
+  SimSimpleEvent(const Func& func, const Pkt& pkt, uint64_t delay)
+    : SimEventBase(delay), func_(func), pkt_(pkt) {}
+
+  // Delivers the stored packet to the stored callback.
+  void fire() const override {
+    func_(pkt_);
+  }
+
+protected:
+  Func func_;
+  Pkt  pkt_;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Event that hands a stored packet, together with the id of the port it is
+// addressed to, to a two-argument callback when fired.
+template <typename Pkt>
+class SimPortEvent : public SimEventBase {
+public:
+  using Func = std::function<void (const Pkt&, uint32_t)>;
+
+  // Allocates an event that will call func(pkt, port_id) once `delay` cycles
+  // elapsed.
+  template <typename... Args>
+  static Ptr Create(const Func& func, const Pkt& pkt, uint32_t port_id, uint64_t delay) {
+    return std::make_shared<SimPortEvent<Pkt>>(func, pkt, port_id, delay);
+  }
+
+  // Copies the callback, the packet and the port id into the event.
+  SimPortEvent(const Func& func, const Pkt& pkt, uint32_t port_id, uint64_t delay)
+    : SimEventBase(delay), func_(func), pkt_(pkt), port_id_(port_id) {}
+
+  // Delivers the stored packet and port id to the stored callback.
+  void fire() const override {
+    func_(pkt_, port_id_);
+  }
+
+private:
+  Func     func_;
+  Pkt      pkt_;
+  uint32_t port_id_;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Type-erased base of all ports. A port belongs to one module, carries the id
+// the module assigned to it, and may point at a single peer port.
+class SimPortBase {
+public:
+  typedef std::shared_ptr<SimPortBase> Ptr;  
+
+  virtual ~SimPortBase() {}
+  
+  // Module that owns this port.
+  SimObjectBase* module() const {
+    return module_;
+  }
+  
+  // Id returned by the owning module's allocate_port() at construction.
+  uint32_t port_id() const {
+    return port_id_;
+  }
+
+  // Peer this port forwards to, or nullptr when unconnected.
+  SimPortBase* peer() const {
+    return peer_;
+  }
+
+  bool connected() const {
+    return (peer_ != nullptr);
+  }
+
+  // Flag given at construction; senders use it to pick the peer's concrete
+  // type before forwarding a packet.
+  bool is_slave() const {
+    return is_slave_;
+  }
+
+protected:
+
+  // Defined out of line because it needs the full SimObjectBase definition.
+  SimPortBase(SimObjectBase* module, bool is_slave);
+
+  // Links this port to `peer`; the port must not already be connected.
+  void connect(SimPortBase* peer) {
+    assert(peer_ == nullptr);
+    peer_ = peer;
+  }
+
+  // Drops the link to the peer. The previous `assert(peer_ == nullptr)` was
+  // inverted (it fired whenever there was something to disconnect), so the
+  // call is now simply idempotent.
+  void disconnect() { 
+    peer_ = nullptr;
+  }
+
+  SimObjectBase* module_;
+  uint32_t       port_id_;
+  bool           is_slave_;
+  SimPortBase*   peer_;
+
+  template <typename Pkt> friend class MasterPort;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Receiving end of a connection. A slave port either owns a handler that is
+// scheduled for every packet sent to it, or (handler-less form) forwards
+// packets to a peer port.
+template <typename Pkt>
+class SlavePort : public SimPortBase {
+public:
+  // Fixed: the pointee was spelled SlavePort<Ptr>, which named an unrelated
+  // specialization and made both Create() overloads ill-formed.
+  typedef std::shared_ptr<SlavePort<Pkt>> Ptr;
+  typedef std::function<void (const Pkt&, uint32_t)> Func;
+
+  // Creates a heap-allocated port that delivers packets to `func`.
+  static Ptr Create(SimObjectBase* module, const Func& func) {
+    return std::make_shared<SlavePort<Pkt>>(module, func);
+  }
+
+  // Creates a heap-allocated port that delivers packets to obj->entry().
+  template <typename T>
+  static Ptr Create(SimObjectBase* module, T *obj, void (T::*entry)(const Pkt&, uint32_t)) {
+    return std::make_shared<SlavePort<Pkt>>(module, obj, entry);
+  } 
+
+  // Terminal port: packets are handed to `func`.
+  SlavePort(SimObjectBase* module, const Func& func)
+    : SimPortBase(module, true)
+    , func_(func)
+  {}
+
+  // Terminal port: packets are handed to the member function obj->entry().
+  template <typename T>
+  SlavePort(SimObjectBase* module, T *obj, void (T::*entry)(const Pkt&, uint32_t))
+    : SimPortBase(module, true)
+    , func_(std::bind(entry, obj, std::placeholders::_1, std::placeholders::_2))
+  {}
+
+  // Forwarding port: no handler, packets are passed on to `peer`.
+  // NOTE(review): this form registers itself with is_slave == false, so
+  // senders treat it as a MasterPort when forwarding - confirm this is intended.
+  SlavePort(SimObjectBase* module, SlavePort* peer) 
+    : SimPortBase(module, false) 
+  {
+    this->connect(peer);
+  }
+
+  // Delivers `pkt` after `delay` cycles; defined after SimPlatform.
+  void send(const Pkt& pkt, uint64_t delay) const;
+
+  const Func& func() const {
+    return func_;
+  }
+
+protected:
+  // Declared but not defined: assignment is not supported.
+  SlavePort& operator=(const SlavePort&);
+  Func func_;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Sending end of a connection. A master port has no handler of its own; it
+// forwards every packet to the port it is bound to.
+template <typename Pkt>
+class MasterPort : public SimPortBase {
+public:
+  // Fixed: the pointee was spelled MasterPort<Ptr> instead of MasterPort<Pkt>.
+  typedef std::shared_ptr<MasterPort<Pkt>> Ptr;
+  typedef std::function<void (const Pkt&, uint32_t)> Func;
+
+  // Creates a heap-allocated port owned by `module`.
+  // Fixed: the factory referenced `module` without declaring it as a
+  // parameter, so it could never be instantiated.
+  static Ptr Create(SimObjectBase* module) {
+    return std::make_shared<MasterPort<Pkt>>(module);
+  }  
+
+  MasterPort(SimObjectBase* module) : SimPortBase(module, false) {}
+
+  // Creates a port that `peer` forwards to (the link is stored on `peer`).
+  MasterPort(SimObjectBase* module, MasterPort* peer) 
+    : SimPortBase(module, false) 
+  {
+    peer->connect(this);
+  }
+
+  // Connects this port to the slave port that should receive its packets.
+  void bind(SlavePort<Pkt>* peer) {
+    this->connect(peer);
+  }
+
+  // Undoes bind(). Only this side holds the link (bind() never connects the
+  // peer back), so the peer is left untouched; the previous version also
+  // disconnected the peer, which could sever the peer's own forwarding link.
+  void unbind() {    
+    assert(peer_ != nullptr);
+    peer_ = nullptr;
+  }
+
+  // Forwards `pkt` to the bound peer, to be delivered after `delay` cycles.
+  void send(const Pkt& pkt, uint64_t delay) const {
+    assert(peer_ != nullptr);
+    // the peer's is_slave flag selects which concrete type to forward through
+    if (peer_->is_slave()) {
+      auto slave = reinterpret_cast<const SlavePort<Pkt>*>(peer_);
+      slave->send(pkt, delay);
+    } else {
+      auto master = reinterpret_cast<const MasterPort<Pkt>*>(peer_);
+      master->send(pkt, delay);
+    }  
+  }
+
+private:
+  // Declared but not defined: assignment is not supported.
+  MasterPort& operator=(const MasterPort&);
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+class SimContext;
+
+// Base of every simulated component: a named object that owns a list of ports
+// and is stepped once per simulated cycle.
+class SimObjectBase {
+public:
+  using Ptr = std::shared_ptr<SimObjectBase>;
+
+  virtual ~SimObjectBase() = default;
+
+  // Schedules obj->entry(pkt) to run after `delay` cycles (defined below).
+  template <typename T, typename Pkt>
+  void schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay);
+
+  // Advances the component by one cycle.
+  virtual void step(uint64_t cycle) = 0;
+
+  const std::string& name() const {
+    return name_;
+  }
+
+protected:
+
+  // The SimContext argument is unused; it can only be built by
+  // SimObject<Impl>::Create.
+  SimObjectBase(const SimContext& ctx, const char* name);
+
+  // Records `port` and returns its index in the port list, which becomes the
+  // port's id.
+  uint32_t allocate_port(SimPortBase* port) {
+      const uint32_t next_id = ports_.size();
+      ports_.push_back(port);
+      return next_id;
+  }
+
+private:
+  std::string name_;
+  std::vector<SimPortBase*> ports_;
+
+  friend class SimPlatform;
+  friend class SimPortBase;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// CRTP helper: derive a component as `class Foo : public SimObject<Foo>` to
+// get a Create() factory that also registers the instance with SimPlatform.
+template <typename Impl>
+class SimObject : public SimObjectBase {
+public:
+  using Ptr = std::shared_ptr<Impl>;
+
+  // Builds an Impl from (SimContext, args...) and registers it (defined below).
+  template <typename... Args>
+  static Ptr Create(Args&&... args);
+
+protected:
+
+  SimObject(const SimContext& ctx, const char* name)
+    : SimObjectBase(ctx, name)
+  {}
+
+  // Forwards the per-cycle call to the derived class's step().
+  void step(uint64_t cycle) override {
+    Impl& self = this->impl();
+    self.step(cycle);
+  }
+
+private:
+
+  // Downcasts to the derived type (CRTP).
+  const Impl& impl() const {
+    return static_cast<const Impl&>(*this);
+  }
+
+  Impl& impl() {
+    return static_cast<Impl&>(*this);
+  }
+};
+
+// Construction token for simulation objects. Its constructor is private and
+// only SimObject<Impl>::Create is a friend, so code outside that factory
+// cannot produce the SimContext that component constructors require.
+class SimContext {
+private:    
+  SimContext() {}
+  // grants the factory (for every Impl / argument list) access to the ctor
+  template <typename Impl> template <typename... Args> 
+  friend typename SimObject<Impl>::Ptr SimObject<Impl>::Create(Args&&... args);
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Process-wide simulation kernel: owns the registered components and the list
+// of pending events, and advances both one cycle at a time.
+class SimPlatform {
+public:
+  // Returns the single platform instance (function-local static).
+  static SimPlatform& instance() {
+    static SimPlatform s_inst;
+    return s_inst;
+  }
+
+  // Currently a no-op that always reports success.
+  bool initialize() {
+    //--
+    return true;
+  }
+
+  // Drops all registered objects and pending events.
+  void finalize() {
+    instance().clear();
+  }
+
+  // Adds a component to the set stepped by step(); the platform shares
+  // ownership of it.
+  void register_object(const SimObjectBase::Ptr& obj) {
+    objects_.push_back(obj);
+  }
+
+  // Queues callback(pkt) to fire after `delay` cycles; `delay` must be
+  // non-zero (checked by assert only).
+  template <typename Pkt>
+  void schedule(const typename SimSimpleEvent<Pkt>::Func& callback, 
+                const Pkt& pkt, 
+                uint64_t delay) {    
+    auto evt = SimSimpleEvent<Pkt>::Create(callback, pkt, delay);
+    assert(delay != 0);
+    events_.emplace_back(evt);
+  }
+
+  // Queues callback(pkt, port_id) to fire after `delay` cycles; `delay` must
+  // be non-zero (checked by assert only).
+  template <typename Pkt>
+  void schedule(const typename SimPortEvent<Pkt>::Func& callback, 
+                const Pkt& pkt, 
+                uint32_t port_id, 
+                uint64_t delay) {
+    auto evt = SimPortEvent<Pkt>::Create(callback, pkt, port_id, delay);
+    assert(delay != 0);
+    events_.emplace_back(evt);
+  }
+
+  // Runs one simulated cycle: ages every pending event (firing and removing
+  // the ones that reach zero), then steps every component, then bumps the
+  // cycle counter.
+  // NOTE(review): events appended by a callback during this pass sit at the
+  // tail of the same list, so they appear to be aged within the same pass -
+  // confirm this is the intended timing.
+  void step() {
+    // evaluate events
+    auto evt_it = events_.begin();
+    auto evt_it_end = events_.end();
+    while (evt_it != evt_it_end) {
+      auto& event = *evt_it;
+      if (event->step()) {        
+        event->fire();
+        evt_it = events_.erase(evt_it);
+      } else {        
+        ++evt_it;
+      }
+    }
+    // evaluate components
+    for (auto& object : objects_) {
+      object->step(cycles_);
+    }
+    // advance clock    
+    ++cycles_;
+  }
+
+  // Number of cycles completed so far.
+  uint64_t cycles() const {
+    return cycles_;
+  }
+
+private:
+
+  // Private: the only instance is the one returned by instance().
+  SimPlatform() : cycles_(0) {}
+
+  virtual ~SimPlatform() {
+    this->clear();
+  }
+
+  // Releases the platform's references to all objects and events.
+  void clear() {
+    objects_.clear();
+    events_.clear();
+  }
+
+  std::vector<SimObjectBase::Ptr> objects_;
+  std::list<SimEventBase::Ptr> events_;
+  uint64_t cycles_;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Registers the new port with its owning module and stores the id the module
+// hands back; the port starts out unconnected. `module` is dereferenced, so it
+// must not be null.
+inline SimPortBase::SimPortBase(SimObjectBase* module, bool is_slave) 
+  : module_(module)  
+  , port_id_(module->allocate_port(this))
+  , is_slave_(is_slave)
+  , peer_(nullptr) 
+{}
+
+// Stores the component name. The SimContext parameter is intentionally
+// unnamed and unused: it only restricts who can call this constructor.
+inline SimObjectBase::SimObjectBase(const SimContext&, const char* name) 
+  : name_(name) 
+{}
+
+// Factory for components: constructs Impl with a fresh SimContext followed by
+// the forwarded arguments, registers the instance with the platform so it is
+// stepped every cycle, and returns the shared pointer.
+template <typename Impl>
+template <typename... Args>
+typename SimObject<Impl>::Ptr SimObject<Impl>::Create(Args&&... args) {
+  typename SimObject<Impl>::Ptr instance =
+      std::make_shared<Impl>(SimContext{}, std::forward<Args>(args)...);
+  SimPlatform::instance().register_object(instance);
+  return instance;
+}
+
+// Delivers `pkt` after `delay` cycles. A port with a handler schedules the
+// handler on the platform; a handler-less port forwards to its peer.
+template <typename Pkt>
+void SlavePort<Pkt>::send(const Pkt& pkt, uint64_t delay) const {
+  // terminal port: schedule our own handler
+  if (func_) {
+    SimPlatform::instance().schedule(func_, pkt, port_id_, delay);
+    return;
+  }
+
+  // forwarding port: hand over to the peer using its concrete type
+  assert(peer_ != nullptr);
+  if (!peer_->is_slave()) {
+    auto next_master = reinterpret_cast<const MasterPort<Pkt>*>(peer_);
+    next_master->send(pkt, delay);
+    return;
+  }
+  auto next_slave = reinterpret_cast<const SlavePort<Pkt>*>(peer_);
+  next_slave->send(pkt, delay);
+}
+
+// Schedules the member function obj->entry(pkt) on the platform so that it
+// runs after `delay` cycles.
+template <typename T, typename Pkt>
+void SimObjectBase::schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay) {
+  auto invoke_member = [obj, entry](const Pkt& arg) { (obj->*entry)(arg); };
+  SimPlatform::instance().schedule(invoke_member, pkt, delay);
+}
+
+}
--- a/sim/common/util.h
+++ b/sim/common/util.h
@@ -1,6 +1,7 @@
 #pragma once

 #include <cstdint>
+#include <algorithm>
 #include <assert.h>

 template <typename... Args>
@@ -8,24 +9,83 @@ void unused(Args&&...) {}

 #define __unused(...) unused(__VA_ARGS__)

-constexpr bool ispow2(uint64_t value) {
+// Number of zero bits above the most significant set bit of `value`;
+// returns 32 for zero, where the compiler builtin is not defined.
+constexpr uint32_t count_leading_zeros(uint32_t value) {
+  return (value == 0) ? 32 : __builtin_clz(value);
+}
+
+// Number of zero bits below the least significant set bit of `value`;
+// returns 32 for zero, where the compiler builtin is not defined.
+constexpr uint32_t count_trailing_zeros(uint32_t value) {
+  return (value == 0) ? 32 : __builtin_ctz(value);
+}
+
+// True when exactly one bit of `value` is set; zero is not a power of two.
+constexpr bool ispow2(uint32_t value) {
  return value && !(value & (value - 1));
 }

-constexpr unsigned log2ceil(uint32_t value) {
-  return 32 - __builtin_clz(value - 1);
+// Ceiling of log2(value) for value >= 1 (log2ceil(1) == 0). For value == 0
+// the unsigned subtraction wraps and the result is 32.
+constexpr uint32_t log2ceil(uint32_t value) {
+  return 32 - count_leading_zeros(value - 1);
 }

-inline uint64_t align_size(uint64_t size, uint64_t alignment) {        
+// Same as log2ceil() but never below 1, so log2up(1) is 1 rather than 0.
+inline unsigned log2up(uint32_t value) {
+  const uint32_t bits = log2ceil(value);
+  return (bits < 1) ? 1 : bits;
+}
+
+// Index of the most significant set bit, i.e. floor(log2(value)) for
+// value >= 1. For value == 0 the unsigned subtraction wraps around.
+constexpr unsigned log2floor(uint32_t value) {
+  return 31 - count_leading_zeros(value);
+}
+
+// Returns the position just above the most significant set bit (0 for zero),
+// i.e. how many bits are needed to represent `value`.
+// NOTE(review): the name suggests "round up to a power of two", but the code
+// returns a bit count - confirm against callers.
+constexpr unsigned ceil2(uint32_t value) {
+  return 32 - count_leading_zeros(value);
+}
+
+// Returns `bits` with bit `index` (0..63) cleared.
+inline uint64_t bit_clr(uint64_t bits, uint32_t index) {
+    assert(index <= 63);
+    const uint64_t selector = 1ull << index;
+    return bits & ~selector;
+}
+
+// Returns `bits` with bit `index` (0..63) set.
+inline uint64_t bit_set(uint64_t bits, uint32_t index) {
+    assert(index <= 63);
+    const uint64_t selector = 1ull << index;
+    return bits | selector;
+}
+
+// Returns the value of bit `index` (0..63) of `bits`.
+inline bool bit_get(uint64_t bits, uint32_t index) {
+    assert(index <= 63);
+    const uint64_t shifted = bits >> index;
+    return shifted & 0x1;
+}
+
+// Returns `bits` with the inclusive bit window [start, end] cleared
+// (start <= end <= 63).
+inline uint64_t bit_clrw(uint64_t bits, uint32_t start, uint32_t end) {
+    assert(end >= start);
+    assert(end <= 63);
+    const uint32_t top_gap = 63 - end;
+    // shift out everything below `start`, then drop everything above `end`
+    const uint64_t window = (0xffffffffffffffff << (top_gap + start)) >> top_gap;
+    return bits & ~window;
+}
+
+// Returns `bits` with the inclusive bit window [start, end] replaced by the
+// low bits of `value`; bits of `value` that do not fit the window are dropped.
+inline uint64_t bit_setw(uint64_t bits, uint32_t start, uint32_t end, uint64_t value) {
+    assert(end >= start);
+    assert(end <= 63);
+    const uint32_t top_gap = 63 - end;
+    // align `value` to `start` and truncate it to the window width
+    const uint64_t field = (value << (top_gap + start)) >> top_gap;
+    const uint64_t cleared = bit_clrw(bits, start, end);
+    return cleared | field;
+}
+
+// Extracts the inclusive bit window [start, end] of `bits`, right-aligned
+// (start <= end <= 63).
+inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) {
+    assert(end >= start);
+    assert(end <= 63);
+    const uint32_t top_gap = 63 - end;
+    // discard the bits above `end`, then the bits below `start`
+    const uint64_t left_aligned = bits << top_gap;
+    return left_aligned >> (top_gap + start);
+}
+
+// Rounds `size` up to the next multiple of `alignment`, which must be a power
+// of two. `alignment` stays 64-bit: with a 32-bit parameter `~(alignment - 1)`
+// is computed in 32 bits and zero-extended, which cleared the upper half of
+// any size of 4 GiB or more.
+inline uint64_t aligned_size(uint64_t size, uint64_t alignment) {
    assert(0 == (alignment & (alignment - 1)));
    return (size + alignment - 1) & ~(alignment - 1);
 }

 // Apply integer sign extension
-inline uint32_t signExt(uint32_t w, uint32_t bit, uint32_t mask) {
-  if (w >> (bit - 1))
-    w |= ~mask;
-  return w;
+// Sign-extends the low `width` bits of `word` to 32 bits: when bit
+// (width - 1) is set, every bit at or above `width` is set as well.
+// `width` must be in 2..32.
+inline uint32_t sext32(uint32_t word, uint32_t width) {
+  assert(width > 1);
+  assert(width <= 32);
+  // build the mask in 64 bits: shifting a 32-bit 1 by 32 (allowed by the
+  // assert above) is undefined behaviour
+  uint32_t mask = static_cast<uint32_t>((uint64_t(1) << width) - 1);
+  return ((word >> (width - 1)) & 0x1) ? (word | ~mask) : word;
 }

 // return file extension
--- a/sim/simX/Makefile
+++ b/sim/simX/Makefile
@@ -11,7 +11,7 @@ LDFLAGS += ../common/softfloat/build/Linux-x86_64-GCC/softfloat.a
 TOP = vx_cache_sim

 SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp 
-SRCS += args.cpp pipeline.cpp warp.cpp core.cpp decode.cpp execute.cpp main.cpp
+SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp main.cpp

 OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS)))
 VPATH := $(sort $(dir $(SRCS)))
--- a/sim/simX/archdef.h
+++ b/sim/simX/archdef.h
@@ -10,20 +10,30 @@
 namespace vortex {

 class ArchDef {  
+private:
+  int num_cores_;
+  int num_warps_;
+  int num_threads_;
+  int wsize_;
+  int vsize_;
+  int num_regs_;
+  int num_csrs_;
+  int num_barriers_;
+  
 public:
  ArchDef(const std::string& /*arch*/,
          int num_cores, 
          int num_warps, 
-          int num_threads) {         
-    wsize_       = 4;
-    vsize_       = 16;
-    num_regs_    = 32;
-    num_csrs_    = 4096;
-    num_barriers_= NUM_BARRIERS;
-    num_cores_   = num_cores;
-    num_warps_   = num_warps;
-    num_threads_ = num_threads;
-  }
+          int num_threads)   
+    : num_cores_(num_cores)
+    , num_warps_(num_warps)
+    , num_threads_(num_threads)
+    , wsize_(4)
+    , vsize_(16)
+    , num_regs_(32)
+    , num_csrs_(4096)
+    , num_barriers_(NUM_BARRIERS)
+  {}

  int wsize() const { 
    return wsize_; 
@@ -56,17 +66,6 @@ public:
  int num_cores() const {
    return num_cores_;
  }
-  
-private:
-
-  int wsize_;
-  int vsize_;
-  int num_regs_;
-  int num_csrs_;
-  int num_barriers_;
-  int num_threads_;
-  int num_warps_;
-  int num_cores_;
 };

 }
--- a/sim/simX/cache.cpp
+++ b/sim/simX/cache.cpp
@@ -0,0 +1,497 @@
+#include "cache.h"
+#include "debug.h"
+#include <util.h>
+#include <unordered_map>
+#include <vector>
+#include <list>
+#include <queue>
+
+using namespace vortex;
+
+// Geometry derived from a CacheConfig: per-bank dimensions and the inclusive
+// bit ranges of an address that select the word, bank, set and tag. Ranges are
+// laid out back to back starting at bit config.W, in the order
+// word -> bank -> set -> tag. An empty field has end == start - 1.
+struct params_t {
+    uint32_t sets_per_bank;
+    uint32_t blocks_per_set;    
+    uint32_t words_per_block;
+
+    uint32_t word_select_addr_start;
+    uint32_t word_select_addr_end;
+
+    uint32_t bank_select_addr_start;
+    uint32_t bank_select_addr_end;
+
+    uint32_t set_select_addr_start;
+    uint32_t set_select_addr_end;
+
+    uint32_t tag_select_addr_start;
+    uint32_t tag_select_addr_end;
+
+    params_t(const CacheConfig& config) {
+        uint32_t bank_bits   = log2ceil(config.num_banks);
+        uint32_t offset_bits = config.B - config.W;
+        uint32_t log2_bank_size  = config.C - bank_bits;
+        // A bank holds sets * ways * block bytes, so in log2 terms
+        // index_bits = log2_bank_size - (B + A). The previous `B << A`
+        // mixed a shift into log-domain arithmetic.
+        uint32_t way_block_bits = uint32_t(config.B) + uint32_t(config.A);
+        assert(log2_bank_size >= way_block_bits);
+        uint32_t index_bits  = log2_bank_size - way_block_bits;
+        
+        this->words_per_block = 1 << offset_bits;
+        this->blocks_per_set  = 1 << config.A;
+        this->sets_per_bank   = 1 << index_bits;
+
+        assert(config.ports_per_bank <= this->words_per_block);
+                
+        // Word select
+        this->word_select_addr_start = config.W;
+        this->word_select_addr_end = (this->word_select_addr_start+offset_bits-1);
+
+        // Bank select
+        this->bank_select_addr_start = (1+this->word_select_addr_end);
+        this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1);
+
+        // Set select
+        this->set_select_addr_start = (1+this->bank_select_addr_end);
+        this->set_select_addr_end = (this->set_select_addr_start+index_bits-1);
+
+        // Tag select
+        this->tag_select_addr_start = (1+this->set_select_addr_end);
+        this->tag_select_addr_end = (config.addr_width-1);
+    }
+
+    // Bank index encoded in `word_addr`, or 0 when the bank field is empty.
+    uint32_t addr_bank_id(uint64_t word_addr) const {
+        if (bank_select_addr_end >= bank_select_addr_start)
+            return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end);
+        else    
+            return 0;
+    }
+
+    // Set index encoded in `word_addr`, or 0 when the set field is empty.
+    uint32_t addr_set_id(uint64_t word_addr) const {
+        if (set_select_addr_end >= set_select_addr_start)
+            return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end);
+        else
+            return 0;
+    }
+
+    // Tag encoded in `word_addr`, or 0 when the tag field is empty.
+    uint64_t addr_tag(uint64_t word_addr) const {
+        if (tag_select_addr_end >= tag_select_addr_start)
+            return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end);
+        else    
+            return 0;
+    }
+    
+    // Rebuilds a block address from its bank, set and tag fields; the word
+    // select bits are left at zero.
+    uint64_t mem_addr(uint32_t bank_id, uint32_t set_id, uint64_t tag) const {
+        uint64_t addr(0);
+        if (bank_select_addr_end >= bank_select_addr_start)            
+            addr = bit_setw(addr, bank_select_addr_start, bank_select_addr_end, bank_id);
+        if (set_select_addr_end >= set_select_addr_start)
+            addr = bit_setw(addr, set_select_addr_start, set_select_addr_end, set_id);
+        if (tag_select_addr_end >= tag_select_addr_start)
+            addr = bit_setw(addr, tag_select_addr_start, tag_select_addr_end, tag);
+        return addr;
+    }
+};
+
+// State of one cache block (way) inside a set. No data is stored; the model
+// only tracks presence and replacement metadata.
+struct block_t {
+    bool     valid;    // block currently holds a line
+    bool     dirty;        // set on write hits when the cache is not write-through
+    uint64_t tag;      // tag of the cached line
+    uint32_t lru_ctr;  // reset to 0 on a hit, incremented on a lookup that misses this block
+};
+
+// One cache set: `size` blocks (ways), value-initialized by the vector.
+struct set_t {
+    std::vector<block_t> blocks;    
+    set_t(uint32_t size) : blocks(size) {}
+};
+
+// Per-port slot of a bank request, identifying the core request merged into it.
+struct bank_req_info_t {
+    bool     valid;    // slot is occupied
+    uint32_t req_id;   // index of the cache input the request arrived on
+    uint32_t req_tag;  // tag of the core request, echoed in the response
+};
+
+// A request being processed by a bank: one cache line (set + tag) plus one
+// info slot per bank port, so several core requests to the same line can be
+// served together.
+struct bank_req_t {
+    bool valid;        // request is live
+    bool write;        // write (true) or read (false)
+    bool mshr_replay;  // request was popped from the MSHR after its fill arrived
+    uint64_t tag;
+    uint32_t set_id;
+    std::vector<bank_req_info_t> infos;  // one slot per bank port
+
+    // Creates an empty (invalid) request with `size` port slots.
+    bank_req_t(uint32_t size) 
+        : valid(false)
+        , write(false)
+        , mshr_replay(false)
+        , tag(0)
+        , set_id(0)
+        , infos(size)
+    {}
+};
+
+// MSHR entry: a pending bank request plus the block (way) chosen to receive
+// the fill.
+struct mshr_entry_t : public bank_req_t {
+    uint32_t block_id;
+
+    // Default size 0 lets std::vector<mshr_entry_t>(n) build empty entries.
+    mshr_entry_t(uint32_t size = 0) 
+        : bank_req_t(size) 
+        , block_id(0)
+    {}
+};
+
+// Miss status holding registers of one bank: a fixed pool of entries tracking
+// requests that wait for a memory fill.
+class MSHR {
+private:
+    std::vector<mshr_entry_t> entries_;
+    uint32_t capacity_;  // number of entries currently in use
+
+public:    
+    // Creates a pool of `size` unused entries.
+    MSHR(uint32_t size)
+        : entries_(size)
+        , capacity_(0) 
+    {}
+
+    // True when no entry is in use.
+    bool empty() const {
+        return (capacity_ == 0);
+    }
+    
+    // True when every entry is in use.
+    bool full() const {
+        return (entries_.size() == capacity_);
+    }
+
+    // Index of the first in-use entry for the same set and tag as `bank_req`,
+    // or -1 when there is none.
+    int lookup(const bank_req_t& bank_req) {
+        int index = 0;
+        for (auto& candidate : entries_) {
+            const bool same_line = candidate.valid
+                                && candidate.set_id == bank_req.set_id
+                                && candidate.tag == bank_req.tag;
+            if (same_line) {
+                return index;
+            }
+            ++index;
+        }
+        return -1;
+    }
+
+    // Stores `bank_req` in the first unused entry together with the block
+    // chosen for the fill; returns the entry index, or -1 when the pool is full.
+    int allocate(const bank_req_t& bank_req, uint32_t block_id) {
+        int index = 0;
+        for (auto& slot : entries_) {
+            if (slot.valid) {
+                ++index;
+                continue;
+            }
+            // overwrite only the bank_req_t part of the entry
+            static_cast<bank_req_t&>(slot) = bank_req;
+            slot.valid = true;
+            slot.mshr_replay = false;
+            slot.block_id = block_id;
+            ++capacity_;
+            return index;
+        }
+        return -1;
+    }
+
+    // Marks entry `id` and every other in-use entry for the same set and tag
+    // as ready for replay; returns entry `id`.
+    mshr_entry_t& replay(uint32_t id) {
+        auto& root_entry = entries_.at(id);
+        assert(root_entry.valid);
+        for (auto& candidate : entries_) {
+            if (!candidate.valid) {
+                continue;
+            }
+            if (candidate.set_id != root_entry.set_id) {
+                continue;
+            }
+            if (candidate.tag != root_entry.tag) {
+                continue;
+            }
+            candidate.mshr_replay = true;
+        }
+        return root_entry;
+    }
+
+    // Copies the first entry that is ready for replay into `*out` and frees
+    // it; returns false (leaving `*out` untouched) when no entry is ready.
+    bool try_pop(bank_req_t* out) {
+        for (auto& candidate : entries_) {
+            if (!(candidate.valid && candidate.mshr_replay)) {
+                continue;
+            }
+            *out = candidate;
+            candidate.valid = false;
+            --capacity_;
+            return true;
+        }
+        return false;
+    }
+};
+
+// State of one cache bank: its sets, its MSHR, requests that could not be
+// accepted yet, and the request processed in the current cycle.
+struct bank_t {
+    std::vector<set_t>      sets;    
+    MSHR                    mshr;
+    std::queue<bank_req_t>  stall_buffer;
+    bank_req_t              active_req;
+
+    // `sets` is filled with sets_per_bank copies of a set_t implicitly built
+    // from blocks_per_set.
+    bank_t(const CacheConfig& config, 
+           const params_t& params) 
+        : sets(params.sets_per_bank, params.blocks_per_set)
+        , mshr(config.mshr_size)
+        , active_req(config.ports_per_bank) 
+    {}
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Private implementation of the Cache timing model. Core requests and memory
+// responses are latched by the port handlers and consumed by step(), which
+// runs once per simulated cycle.
+class Cache::Impl {
+private:
+    Cache* const simobject_;
+    CacheConfig config_;
+    params_t params_;
+    std::vector<bank_t> banks_;
+    std::vector<std::pair<bool, MemReq>> core_reqs_;  // latched request per input
+    std::pair<bool, MemRsp> mem_rsp_;                 // latched memory response
+    std::vector<std::queue<uint32_t>> core_rsps_;     // pending response tags per input
+
+    // Queues a core response for every occupied port slot of `bank_req`.
+    // Unoccupied slots are value-initialized (req_id 0), so answering them
+    // would send spurious responses to input 0.
+    void sendCoreResponses(const bank_req_t& bank_req) {
+        for (auto& info : bank_req.infos) {
+            if (!info.valid)
+                continue;
+            core_rsps_.at(info.req_id).emplace(info.req_tag);
+        }
+    }
+
+public:
+    Impl(Cache* simobject, const CacheConfig& config) 
+        : simobject_(simobject)
+        , config_(config)
+        , params_(config)
+        , banks_(config.num_banks, {config, params_})
+        , core_reqs_(config.num_inputs)
+        , core_rsps_(config.num_inputs)
+    {}    
+
+    // Port handler: latches a memory response until the next step().
+    void handleMemResponse(const MemRsp& response, uint32_t) {        
+        mem_rsp_ = {true, response};
+    }
+
+    // Port handler: latches a core request in the slot indexed by `port_id`.
+    void handleCoreRequest(const MemReq& request, uint32_t port_id) {
+        core_reqs_.at(port_id) = {true, request};
+    }
+
+    // Runs one cache cycle: drain one response per input, pick replayed or
+    // stalled requests, apply a memory fill, accept new core requests, then
+    // process the active request of every bank.
+    void step(uint64_t /*cycle*/) {
+        // process core response
+        for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
+            auto& core_rsp = core_rsps_.at(req_id);
+            if (!core_rsp.empty()) {
+                simobject_->CoreRspPorts.at(req_id).send(MemRsp{core_rsp.front()}, config_.latency);
+                core_rsp.pop();
+            }
+        }
+
+        for (auto& bank : banks_) {
+            auto& active_req = bank.active_req;
+
+            // try schedule mshr replay
+            if (!active_req.valid) {
+                bank.mshr.try_pop(&active_req);
+            }
+
+            // try schedule stall replay
+            if (!active_req.valid 
+             && !bank.stall_buffer.empty()) {            
+                active_req = bank.stall_buffer.front();
+                bank.stall_buffer.pop();
+            }
+        }
+
+        // handle memory fills
+        if (mem_rsp_.first) {
+            mem_rsp_.first = false;
+            // the response tag carries the bank id in bits 0-15 and the MSHR
+            // id in bits 16-31 (see the fill request in processBankRequest)
+            auto bank_id = bit_getw(mem_rsp_.second.tag, 0, 15);
+            auto mshr_id = bit_getw(mem_rsp_.second.tag, 16, 31);
+            this->processMemoryFill(bank_id, mshr_id);        
+        }
+        
+        // handle incoming core requests
+        for (uint32_t i = 0, n = core_reqs_.size(); i < n; ++i) {
+            auto& entry = core_reqs_.at(i);
+            if (!entry.first)
+                continue;
+                
+            entry.first = false;
+
+            auto& core_req = entry.second;
+            auto bank_id   = params_.addr_bank_id(core_req.addr);
+            auto set_id    = params_.addr_set_id(core_req.addr);
+            auto tag       = params_.addr_tag(core_req.addr);
+            auto port_id   = i % config_.ports_per_bank;
+            
+            // create bank request
+            bank_req_t bank_req(config_.ports_per_bank);
+            bank_req.valid = true;
+            bank_req.write = core_req.write;
+            bank_req.mshr_replay = false;
+            bank_req.tag = tag;            
+            bank_req.set_id = set_id;       
+            bank_req.infos.at(port_id) = {true, i, core_req.tag};
+
+            auto& bank = banks_.at(bank_id);
+            
+            // check MSHR capacity
+            if (bank.mshr.full()) {
+                // add to stall buffer
+                bank.stall_buffer.emplace(bank_req);
+                continue;
+            }
+
+            auto& active_req = bank.active_req;
+
+            // check pending MSHR request
+            if (active_req.valid 
+             && active_req.mshr_replay) {
+                // add to stall buffer
+                bank.stall_buffer.emplace(bank_req);
+                continue;
+            }        
+
+            // check bank conflicts
+            if (active_req.valid) {
+                // check port conflict: only requests of the same kind to the
+                // same line on a free port can be merged
+                if (active_req.write != core_req.write
+                 || active_req.set_id != set_id
+                 || active_req.tag != tag
+                 || active_req.infos[port_id].valid) {
+                    // add to stall buffer
+                    bank.stall_buffer.emplace(bank_req);
+                    continue;
+                }
+                // update pending request infos
+                active_req.infos[port_id] = bank_req.infos[port_id];
+            } else {
+                // schedule new request
+                active_req = bank_req;
+            }
+        }
+    
+        // process active request
+        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
+            this->processBankRequest(bank_id);
+        }
+    }
+
+    // Applies a memory fill: marks the MSHR entries waiting for the line as
+    // ready for replay and installs the line in the block reserved for it.
+    // NOTE(review): the block's dirty flag is not reset here - confirm whether
+    // a freshly filled block should start clean.
+    void processMemoryFill(uint32_t bank_id, uint32_t mshr_id) {
+        // update block
+        auto& bank = banks_.at(bank_id);
+        auto& root_entry = bank.mshr.replay(mshr_id);
+        auto& set   = bank.sets.at(root_entry.set_id);
+        auto& block = set.blocks.at(root_entry.block_id);
+        block.valid = true;
+        block.tag   = root_entry.tag;
+    }
+
+    // Processes the active request of bank `bank_id`, if any: replayed
+    // requests are answered directly; new requests are looked up in the set
+    // and handled as a hit or a miss.
+    void processBankRequest(uint32_t bank_id) {
+        auto& bank = banks_.at(bank_id);
+        auto& active_req = bank.active_req;
+        if (!active_req.valid)
+            return;
+
+        active_req.valid = false;
+
+        auto& set = bank.sets.at(active_req.set_id);
+
+        if (active_req.mshr_replay) {
+            // the fill already arrived: send core response
+            this->sendCoreResponses(active_req);
+        } else {        
+            bool hit = false;
+            int hit_block_id = 0;
+            int free_block_id = -1;
+            int lru_block_id = 0;
+            uint32_t max_cnt = 0;
+            
+            for (int i = 0, n = set.blocks.size(); i < n; ++i) {
+                auto& block = set.blocks.at(i);
+                if (block.valid) {
+                    if (block.tag == active_req.tag) {
+                        block.lru_ctr = 0;                        
+                        hit_block_id = i;
+                        hit = true;
+                    } else {
+                        ++block.lru_ctr;
+                    }
+                    if (max_cnt < block.lru_ctr) {
+                        max_cnt = block.lru_ctr;
+                        lru_block_id = i;
+                    }
+                } else {                    
+                    free_block_id = i;
+                }
+            }
+
+            // Prefer a free block over evicting a valid one. Free and LRU
+            // candidates are tracked separately: sharing one variable let a
+            // later valid block replace the free choice, and that block was
+            // then evicted without a write-back.
+            const bool found_free_block = (free_block_id != -1);
+            const int repl_block_id = found_free_block ? free_block_id : lru_block_id;
+
+            if (hit) {     
+                //
+                // HIT handling   
+                //                
+                if (active_req.write) {
+                    // handle write hit
+                    auto& hit_block = set.blocks.at(hit_block_id);
+                    if (config_.write_through) {
+                        // forward write request to memory
+                        MemReq mem_req;
+                        mem_req.addr  = params_.mem_addr(bank_id, active_req.set_id, hit_block.tag);
+                        mem_req.write = true;
+                        mem_req.tag   = 0;
+                        simobject_->MemReqPort.send(mem_req, 1);
+                    } else {
+                        // mark block as dirty
+                        hit_block.dirty = true;
+                    }
+                }
+                // send core response
+                this->sendCoreResponses(active_req);
+            } else {     
+                //
+                // MISS handling   
+                //                 
+                if (!found_free_block && !config_.write_through) {
+                     // write back dirty block
+                    auto& repl_block = set.blocks.at(repl_block_id);
+                    if (repl_block.dirty) {                       
+                        MemReq mem_req;
+                        mem_req.addr  = params_.mem_addr(bank_id, active_req.set_id, repl_block.tag);
+                        mem_req.write = true;
+                        mem_req.tag   = 0; // as for the other write requests
+                        simobject_->MemReqPort.send(mem_req, 1);
+                    }
+                }
+
+                if (active_req.write && config_.write_through) {
+                    // forward write request to memory
+                    {
+                        MemReq mem_req;
+                        mem_req.addr  = params_.mem_addr(bank_id, active_req.set_id, active_req.tag);
+                        mem_req.write = true;
+                        mem_req.tag   = 0;
+                        simobject_->MemReqPort.send(mem_req, 1);
+                    }
+                    // send core response
+                    this->sendCoreResponses(active_req);
+                } else {
+                    // lookup
+                    int pending = bank.mshr.lookup(active_req);
+
+                    // allocate MSHR
+                    int mshr_id = bank.mshr.allocate(active_req, repl_block_id);
+                    // requests replayed from the stall buffer skip the
+                    // full() check in step(); a failed allocation would drop
+                    // the request and encode a bogus id in the fill tag
+                    assert(mshr_id != -1);
+                    
+                    // send fill request, unless one is already in flight for this line
+                    if (pending == -1) {
+                        MemReq mem_req;
+                        mem_req.addr  = params_.mem_addr(bank_id, active_req.set_id, active_req.tag);
+                        mem_req.write = active_req.write;
+                        mem_req.tag = bit_setw(0,            0, 15, bank_id);
+                        mem_req.tag = bit_setw(mem_req.tag, 16, 31, mshr_id);
+                        simobject_->MemReqPort.send(mem_req, 1);
+                    }
+                }
+            }
+        }
+    }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Builds the implementation object and the ports: one request and one
+// response port per cache input, plus a single memory request/response pair.
+// NOTE(review): both port vectors are filled with `num_inputs` copies of one
+// temporary port, so every element appears to carry the temporary's port id
+// (and the module's port list holds the temporary's address) - verify that
+// handleCoreRequest receives distinct port ids.
+Cache::Cache(const SimContext& ctx, const char* name, const CacheConfig& config) 
+    : SimObject<Cache>(ctx, name)
+    , impl_(new Impl(this, config))
+    , CoreReqPorts(config.num_inputs, {this, impl_, &Cache::Impl::handleCoreRequest})
+    , CoreRspPorts(config.num_inputs, this)
+    , MemReqPort(this)
+    , MemRspPort(this, impl_, &Impl::handleMemResponse)
+{}
+
+// Releases the implementation object allocated in the constructor.
+Cache::~Cache() {
+    delete impl_;
+}
+
+// Advances the cache by one cycle by delegating to the implementation.
+void Cache::step(uint64_t cycle) {
+    impl_->step(cycle);
+}
--- a/sim/simX/cache.h
+++ b/sim/simX/cache.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <simobject.h>
+#include "memsim.h"
+
+namespace vortex {
+
+struct CacheConfig { // cache parameters; Core brace-initializes this positionally, so the field order is part of the contract
+    uint8_t C;              // log2 of the cache size
+    uint8_t B;              // log2 of the block size
+    uint8_t W;              // log2 of the word size
+    uint8_t A;              // log2 of the associativity
+    uint8_t addr_width;     // number of word address bits
+    uint8_t num_banks;      // number of banks
+    uint8_t ports_per_bank; // number of ports per bank
+    uint8_t num_inputs;     // number of inputs (sizes CoreReqPorts and CoreRspPorts)
+    bool    write_through;  // true when the cache is write-through
+    uint16_t victim_size;   // victim cache size
+    uint16_t mshr_size;     // MSHR buffer size
+    uint8_t latency;        // pipeline latency
+};
+
+class Cache : public SimObject<Cache> { // simulation object exposing core-side and memory-side ports; the logic lives in Impl
+private:
+    class Impl; // only declared here
+    Impl* impl_; // owned raw pointer (new in ctor, delete in dtor); declared before the ports because their initializers use it
+    
+public:
+    Cache(const SimContext& ctx, const char* name, const CacheConfig& config);
+    ~Cache(); // NOTE(review): copy operations are not deleted; copying a Cache would double-delete impl_
+
+    void step(uint64_t cycle);
+
+    std::vector<SlavePort<MemReq>>  CoreReqPorts; // incoming core requests, config.num_inputs entries
+    std::vector<MasterPort<MemRsp>> CoreRspPorts; // outgoing core responses, config.num_inputs entries
+    MasterPort<MemReq>              MemReqPort;   // outgoing requests to the next memory level
+    SlavePort<MemRsp>               MemRspPort;   // incoming responses from the next memory level
+};
+
+}
--- a/sim/simX/constants.h
+++ b/sim/simX/constants.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "types.h"
+
+#ifndef MEM_LATENCY
+#define MEM_LATENCY 18
+#endif
+
+namespace vortex {
+
+struct Constants { // compile-time link delays between components (units not stated here; presumably cycles)
+
+static constexpr uint32_t CORE_TO_DCACHE_DELAY = 1 + SM_ENABLE; // needs SM_ENABLE to expand to an integer constant; TODO confirm it is always defined
+static constexpr uint32_t CORE_TO_ICACHE_DELAY = 1;
+
+static constexpr uint32_t ICACHE_TO_MEM_DELAY = 2;
+static constexpr uint32_t DCACHE_TO_MEM_DELAY = 2;
+
+};
+
+}
--- a/sim/simX/core.cpp
+++ b/sim/simX/core.cpp
@@ -12,34 +12,92 @@

 using namespace vortex;

-Core::Core(const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id)
-    : id_(id)
+// Builds one core: warps, register scoreboard, execute units, L1 caches and
+// the switch that merges both caches onto the core's memory ports.
+//
+// Members are initialized in the order they are declared in the class, not in
+// the order listed here. The execute units are therefore created inside the
+// exe_units_ initializer: dcache_rsp_port_ (declared after exe_units_) binds
+// its callback to the LSU unit's address, so that unit must already exist.
+// Creating the units later, in the constructor body, would bind the dcache
+// response ports to a null LsuUnit.
+Core::Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id)
+    : SimObject(ctx, "Core")
+    , id_(id)
    , arch_(arch)
    , decoder_(decoder)
    , mem_(mem)
    , shared_mem_(1, SMEM_SIZE)
-    , inst_in_schedule_("schedule")
-    , inst_in_fetch_("fetch")
-    , inst_in_decode_("decode")
-    , inst_in_issue_("issue")
-    , inst_in_execute_("execute")
-    , inst_in_writeback_("writeback") {
-  in_use_iregs_.resize(arch.num_warps(), 0);
-  in_use_fregs_.resize(arch.num_warps(), 0);
-  in_use_vregs_.reset();
-
-  csrs_.resize(arch_.num_csrs(), 0);
-
-  fcsrs_.resize(arch_.num_warps(), 0);
-
-  barriers_.resize(arch_.num_barriers(), 0);
-
-  warps_.resize(arch_.num_warps());
+    , warps_(arch.num_warps())
+    , barriers_(arch.num_barriers(), 0)
+    , csrs_(arch.num_csrs(), 0)
+    , fcsrs_(arch.num_warps(), 0)
+    , ibuffers_(arch.num_warps(), IBUF_SIZE)
+    , scoreboard_(arch_) 
+    , exe_units_([this]() {
+        // register execute units (arch_ is already initialized at this point,
+        // which is all LsuUnit's constructor reads from the core)
+        std::vector<ExeUnit::Ptr> units((int)ExeType::MAX);
+        units.at((int)ExeType::ALU) = std::make_shared<AluUnit>(this);
+        units.at((int)ExeType::LSU) = std::make_shared<LsuUnit>(this);
+        units.at((int)ExeType::CSR) = std::make_shared<CsrUnit>(this);
+        units.at((int)ExeType::FPU) = std::make_shared<FpuUnit>(this);
+        units.at((int)ExeType::GPU) = std::make_shared<GpuUnit>(this);
+        return units;
+      }())
+    , icache_(Cache::Create("Icache", CacheConfig{
+        log2ceil(ICACHE_SIZE),  // C
+        log2ceil(L1_BLOCK_SIZE),// B
+        2,                      // W
+        0,                      // A
+        32,                     // address bits    
+        1,                      // number of banks
+        1,                      // number of ports
+        1,                      // request size   
+        true,                   // write-through
+        0,                      // victim size
+        NUM_WARPS,              // mshr
+        2,                      // pipeline latency
+      }))
+    , dcache_(Cache::Create("Dcache", CacheConfig{
+        log2ceil(DCACHE_SIZE),  // C
+        log2ceil(L1_BLOCK_SIZE),// B
+        2,                      // W
+        0,                      // A
+        32,                     // address bits    
+        DCACHE_NUM_BANKS,       // number of banks
+        DCACHE_NUM_PORTS,       // number of ports
+        (uint8_t)arch.num_threads(), // request size   
+        true,                   // write-through
+        0,                      // victim size
+        DCACHE_MSHR_SIZE,       // mshr
+        2,                      // pipeline latency
+      }))
+    , l1_mem_switch_(Switch<MemReq, MemRsp>::Create("l1_arb", ArbiterType::Priority, 2)) 
+    , icache_rsp_port_(this, this, &Core::icache_handleCacheReponse)
+    , dcache_rsp_port_(arch.num_threads(), {this, static_cast<LsuUnit*>(exe_units_.at((int)ExeType::LSU).get()) , &LsuUnit::handleCacheReponse})
+    , fetch_stage_("fetch")
+    , decode_stage_("decode")
+    , issue_stage_("issue")
+    , execute_stage_("execute")
+    , commit_stage_("writeback")
+    , pending_icache_(arch_.num_warps())
+    , stalled_warps_(0)
+    , last_schedule_wid_(0)
+    , pending_instrs_(0)
+    , ebreak_(false)   
+    , stats_insts_(0)
+    , stats_loads_(0)
+    , stats_stores_(0)
+    , MemRspPort(this, &l1_mem_switch_->RspIn)
+    , MemReqPort(this, &l1_mem_switch_->ReqOut)    
+{  
  for (int i = 0; i < arch_.num_warps(); ++i) {
-    warps_[i] = std::make_shared<Warp>(this, i);
+    warps_.at(i) = std::make_shared<Warp>(this, i);
  }

-  this->clear();
+  // connect l1 caches: one icache response port, one dcache port per thread
+  icache_->CoreRspPorts.at(0).bind(&icache_rsp_port_);
+  for (int i = 0; i < arch_.num_threads(); ++i) {
+    dcache_->CoreRspPorts.at(i).bind(&dcache_rsp_port_.at(i));
+  }
+
+  // connect l1 switch: input 0 is the icache, input 1 is the dcache
+  icache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[0]);
+  dcache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[1]);
+  l1_mem_switch_->RspOut[0].bind(&icache_->MemRspPort);  
+  l1_mem_switch_->RspOut[1].bind(&dcache_->MemRspPort);
+
+  // activate warp0
+  warps_.at(0)->setTmask(0, true);
 }

 Core::~Core() {
@@ -51,79 +109,41 @@ Core::~Core() {
  }
 }

-void Core::clear() {
-  for (int w = 0; w < arch_.num_warps(); ++w) {    
-    in_use_iregs_[w].reset();
-    in_use_fregs_[w].reset();    
-  }
-  stalled_warps_.reset();
-
-  in_use_vregs_.reset();
-  
-  for (auto& csr : csrs_) {
-    csr = 0;
+// Icache response callback: retrieves the pipeline state that fetch() parked
+// in pending_icache_ under the response tag, replaces its stored issue
+// timestamp with the elapsed cycle count, and hands it to the decode stage.
+void Core::icache_handleCacheReponse(const MemRsp& response, uint32_t /*port_id*/) {
+  pipeline_state_t fetched;
+  pending_icache_.remove(response.tag, &fetched);
+
+  // icache_latency held the request cycle; turn it into a duration
+  const auto now = SimPlatform::instance().cycles();
+  fetched.icache_latency = now - fetched.icache_latency;
+
+  // advance to decode stage
+  decode_stage_.push(fetched);
 }

-  for (auto& fcsr : fcsrs_) {
-    fcsr = 0;
-  }
-
-  for (auto& barrier : barriers_) {
-    barrier.reset();
-  }
-  
-  for (auto warp : warps_) {
-    warp->clear();
-  }  
-
-  inst_in_schedule_.clear();
-  inst_in_fetch_.clear();
-  inst_in_decode_.clear();
-  inst_in_issue_.clear();
-  inst_in_execute_.clear();
-  inst_in_writeback_.clear();
-  print_bufs_.clear();
-
-  steps_  = 0;
-  insts_  = 0;
-  loads_  = 0;
-  stores_ = 0;
-
-  inst_in_schedule_.valid = true;
-  warps_[0]->setTmask(0, true);
-
-  ebreak_ = false;
-}
-
-void Core::step() {
+void Core::step(uint64_t cycle) { // runs every pipeline stage once for the given cycle
+    __unused (cycle); // cycle is otherwise only referenced inside the D() trace macro below
  D(2, "###########################################################");
+  D(2, std::dec << "Core" << id_ << ": cycle: " << cycle);

-  steps_++;
-  D(2, std::dec << "Core" << id_ << ": cycle: " << steps_);
-
-  this->writeback();
+  this->commit(); // stages are called from last to first; presumably so a state advances at most one stage per call
  this->execute();
  this->issue();
  this->decode();
  this->fetch();
-  this->schedule();

  DPN(2, std::flush);
 }

-void Core::schedule() {
-  if (!inst_in_schedule_.enter(&inst_in_fetch_))
-    return;
-
+void Core::warp_scheduler() {
  bool foundSchedule = false;
-  int scheduled_warp = inst_in_schedule_.wid;
+  int scheduled_warp = last_schedule_wid_;

-  for (size_t wid = 0; wid < warps_.size(); ++wid) {
  // round robin scheduling
+  for (size_t wid = 0; wid < warps_.size(); ++wid) {    
    scheduled_warp = (scheduled_warp + 1) % warps_.size();
-    bool is_active = warps_[scheduled_warp]->active();
-    bool stalled = stalled_warps_[scheduled_warp];
-    if (is_active && !stalled) {
+    bool warp_active  = warps_.at(scheduled_warp)->active();
+    bool warp_stalled = stalled_warps_.test(scheduled_warp); 
+    if (warp_active && !warp_stalled) {      
+      last_schedule_wid_ = scheduled_warp;
      foundSchedule = true;
      break;
    }
@@ -132,113 +152,121 @@ void Core::schedule() {
  if (!foundSchedule)
    return;  

-  D(2, "Schedule: wid=" << scheduled_warp);
-  inst_in_schedule_.wid = scheduled_warp;
+  // suspend warp until decode
+  stalled_warps_.set(scheduled_warp);

-  // advance pipeline
-  inst_in_schedule_.next(&inst_in_fetch_);
+  auto& warp = warps_.at(scheduled_warp);  
+  stats_insts_ += warp->getActiveThreads();
+  
+  pipeline_state_t state;
+  warp->eval(&state);
+
+  D(4, state);  
+
+  // advance to fetch stage
+  ++pending_instrs_;
+  fetch_stage_.push(state);
 }

 void Core::fetch() {
-  if (!inst_in_fetch_.enter(&inst_in_issue_))
-    return;
-
-  int wid = inst_in_fetch_.wid;
-  
-  auto active_threads_b = warps_[wid]->getActiveThreads();    
-  warps_[wid]->step(&inst_in_fetch_);
-  auto active_threads_a = warps_[wid]->getActiveThreads();   
-
-  insts_ += active_threads_b;
-  if (active_threads_b != active_threads_a) {
-    D(3, "*** warp#" << wid << " active threads changed to " << active_threads_a);
+  // send an icache read for the state at the head of the fetch stage, if any
+  pipeline_state_t state;
+  if (fetch_stage_.try_pop(&state)) {
+    state.icache_latency = SimPlatform::instance().cycles(); // request timestamp; the response handler turns it into a duration
+    MemReq mem_req;
+    mem_req.addr  = state.PC;
+    mem_req.write = false;
+    mem_req.tag   = pending_icache_.allocate(state); // park the state; the returned id travels as the request tag
+    icache_->CoreReqPorts.at(0).send(mem_req, 1);
  }  

-  if (inst_in_fetch_.stall_warp) {
-    D(3, "*** warp#" << wid << " fetch stalled");
-    stalled_warps_[wid] = true;
-  }
-  
-  D(4, inst_in_fetch_);
-
-  // advance pipeline
-  inst_in_fetch_.next(&inst_in_issue_);
+  // pick the next warp; it is consumed by a later fetch() call
+  this->warp_scheduler();  
 }

 void Core::decode() {
-  if (!inst_in_decode_.enter(&inst_in_issue_))
+  pipeline_state_t state; // filled from the decode stage, which the icache response handler feeds
+  if (!decode_stage_.try_pop(&state))
    return;    
  
-  // advance pipeline
-  inst_in_decode_.next(&inst_in_issue_);
+  if (state.stall_warp) { // warp stays stalled here; execute() clears the bit when the instruction completes
+    D(3, "*** warp#" << state.wid << " fetch stalled");
+  } else {
+    // release the warp that warp_scheduler() suspended
+    stalled_warps_.reset(state.wid);
+  }
+  
+  // advance to issue stage
+  issue_stage_.push(state);
 }

 void Core::issue() {
-  if (!inst_in_issue_.enter(&inst_in_execute_))
-    return;
-
-  bool in_use_regs = (inst_in_issue_.used_iregs & in_use_iregs_[inst_in_issue_.wid]) != 0 
-                  || (inst_in_issue_.used_fregs & in_use_fregs_[inst_in_issue_.wid]) != 0 
-                  || (inst_in_issue_.used_vregs & in_use_vregs_) != 0;
-  
-  if (in_use_regs) {      
-    D(3, "*** Issue: registers not ready!");
-    inst_in_issue_.stalled = true;
-    return;
+  if (!issue_stage_.empty()) {
+    // move the head of the issue stage into its warp's ibuffer
+    auto& state = issue_stage_.top();
+    auto& ibuffer = ibuffers_.at(state.wid);
+    if (!ibuffer.full()) { // when full, the state stays at the head of issue_stage_ and is retried on the next call
+      ibuffer.push(state);
+      issue_stage_.pop();
+    }
  }
    
-  switch (inst_in_issue_.rdest_type) {
-  case 1:
-    if (inst_in_issue_.rdest)
-      in_use_iregs_[inst_in_issue_.wid][inst_in_issue_.rdest] = 1;
-    break;
-  case 2:
-    in_use_fregs_[inst_in_issue_.wid][inst_in_issue_.rdest] = 1;
-    break;
-  case 3:
-    in_use_vregs_[inst_in_issue_.rdest] = 1;
-    break;
-  default:  
+  // issue at most one instruction per call: the first ibuffer head that passes the scoreboard
+  for (auto& ibuffer : ibuffers_) {
+    if (ibuffer.empty())
+      continue;
+
+    auto& state = ibuffer.top();
+
+    // skip this warp while the scoreboard reports the state as in use
+    if (scoreboard_.in_use(state))
+      continue;
+
+    // reserve in the scoreboard; commit() releases it
+    scoreboard_.reserve(state);
+
+    // advance to execute stage
+    execute_stage_.push(state);
+
+    ibuffer.pop(); // state is a reference into the ibuffer, so pop only after the push above
    break;
  }
-
-  // advance pipeline
-  inst_in_issue_.next(&inst_in_execute_);
 }

 void Core::execute() {
-  if (!inst_in_execute_.enter(&inst_in_writeback_))
+  // dispatch the head of the execute stage to the unit selected by its exe_type
+  if (!execute_stage_.empty()) {
+    auto& state = execute_stage_.top();
+    auto& exe_unit = exe_units_.at((int)state.exe_type);
+    exe_unit->push_input(state);
+    execute_stage_.pop(); // state is a reference into the stage, so pop only after push_input
+  }
+
+  // advance every execute unit by one step
+  for (auto& exe_unit : exe_units_) {
+    exe_unit->step();
+  }  
+  
+  // collect at most one completed instruction from each unit
+  for (auto& exe_unit : exe_units_) {
+    pipeline_state_t state;
+    if (exe_unit->pop_output(&state)) {
+      if (state.stall_warp) { // decode() left this warp stalled; release it now
+        stalled_warps_.reset(state.wid);
+      }
+      // advance to commit stage
+      commit_stage_.push(state);      
+    }
+  }
+}
+
+// Retires at most one completed instruction per call: releases its scoreboard
+// reservation and drops it from the in-flight count.
+//
+// warp_scheduler() increments pending_instrs_ for every instruction it
+// launches and running() reports that counter, so it has to be decremented
+// here; otherwise running() would stay true forever once the first
+// instruction has been scheduled.
+void Core::commit() {
+  pipeline_state_t state;
+  if (!commit_stage_.try_pop(&state))
    return;

-  // advance pipeline
-  inst_in_execute_.next(&inst_in_writeback_);
-}
-
-void Core::writeback() {
-  if (!inst_in_writeback_.enter(NULL))
-    return;
-
-  switch (inst_in_writeback_.rdest_type) {
-  case 1:
-    in_use_iregs_[inst_in_writeback_.wid][inst_in_writeback_.rdest] = 0;
-    break;
-  case 2:
-    in_use_fregs_[inst_in_writeback_.wid][inst_in_writeback_.rdest] = 0;
-    break;
-  case 3:
-    in_use_vregs_[inst_in_writeback_.rdest] = 0;
-    break;
-  default:  
-    break;
-  }
-
-  if (inst_in_writeback_.stall_warp) {
-    stalled_warps_[inst_in_writeback_.wid] = false;
-    D(3, "*** warp#" << inst_in_writeback_.wid << " fetch released");
-  }
-
-  // advance pipeline
-  inst_in_writeback_.next(NULL);
+  // update scoreboard
+  scoreboard_.release(state);
+
+  // the instruction has left the pipeline
+  assert(pending_instrs_ > 0);
+  --pending_instrs_;
 }

 Word Core::get_csr(Addr addr, int tid, int wid) {
@@ -281,16 +309,16 @@ Word Core::get_csr(Addr addr, int tid, int wid) {
    return arch_.num_cores();
  } else if (addr == CSR_MINSTRET) {
    // NumInsts
-    return insts_;
+    return stats_insts_;
  } else if (addr == CSR_MINSTRET_H) {
    // NumInsts
-    return (Word)(insts_ >> 32);
+    return (Word)(stats_insts_ >> 32);
  } else if (addr == CSR_MCYCLE) {
    // NumCycles
-    return (Word)steps_;
+    return (Word)SimPlatform::instance().cycles();
  } else if (addr == CSR_MCYCLE_H) {
    // NumCycles
-    return (Word)(steps_ >> 32);
+    return (Word)(SimPlatform::instance().cycles() >> 32);
  } else {
    return csrs_.at(addr);
  }
@@ -328,7 +356,7 @@ Word Core::icache_fetch(Addr addr) {
 }

 Word Core::dcache_read(Addr addr, Size size) {
-  ++loads_;
+  ++stats_loads_;
  Word data = 0;
 #ifdef SM_ENABLE
  if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE))
@@ -342,7 +370,7 @@ Word Core::dcache_read(Addr addr, Size size) {
 }

 void Core::dcache_write(Addr addr, Word data, Size size) {
-  ++stores_;
+  ++stats_stores_;
 #ifdef SM_ENABLE
  if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE))
   && ((addr + 3) < SMEM_BASE_ADDR)) {
@@ -359,23 +387,19 @@ void Core::dcache_write(Addr addr, Word data, Size size) {
 }

 bool Core::running() const {
-  return inst_in_fetch_.valid 
-      || inst_in_decode_.valid 
-      || inst_in_issue_.valid 
-      || inst_in_execute_.valid 
-      || inst_in_writeback_.valid;
+  return pending_instrs_;
 }

 void Core::printStats() const {
-  std::cout << "Steps : " << steps_ << std::endl
-            << "Insts : " << insts_ << std::endl
-            << "Loads : " << loads_ << std::endl
-            << "Stores: " << stores_ << std::endl;
+  std::cout << "Cycles: " << SimPlatform::instance().cycles() << std::endl
+            << "Insts : " << stats_insts_ << std::endl
+            << "Loads : " << stats_loads_ << std::endl
+            << "Stores: " << stats_stores_ << std::endl;
 }

 void Core::writeToStdOut(Addr addr, Word data) {
  uint32_t tid = (addr - IO_COUT_ADDR) & (IO_COUT_SIZE-1);
-  auto& ss_buf = print_bufs_[tid];
+  auto& ss_buf = print_bufs_.at(tid);
  char c = (char)data;
  ss_buf << c;
  if (c == '\n') {
--- a/sim/simX/core.h
+++ b/sim/simX/core.h
@@ -4,10 +4,11 @@
 #include <vector>
 #include <list>
 #include <stack>
+#include <queue>
 #include <unordered_map>
 #include <memory>
 #include <set>
-
+#include <simobject.h>
 #include "debug.h"
 #include "types.h"
 #include "archdef.h"
@@ -15,20 +16,21 @@
 #include "mem.h"
 #include "warp.h"
 #include "pipeline.h"
+#include "cache.h"
+#include "ibuffer.h"
+#include "scoreboard.h"
+#include "exeunit.h"

 namespace vortex {

-class Core {
+class Core : public SimObject<Core> {
 public:
-  Core(const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id);
-
+  Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id);
  ~Core();

-  void clear();
-
  bool running() const;

-  void step();
+  void step(uint64_t cycle);

  void printStats() const;

@@ -40,7 +42,7 @@ public:
    return *warps_.at(i);
  }

-  Decoder& decoder() {
+  const Decoder& decoder() {
    return decoder_;
  }

@@ -48,16 +50,12 @@ public:
    return arch_;
  }

-  unsigned long num_insts() const {
-    return insts_;
-  }
-
-  unsigned long num_steps() const {
-    return steps_;
+  unsigned long stats_insts() const {
+    return stats_insts_;
  } 

  Word getIRegValue(int reg) const {
-    return warps_[0]->getIRegValue(reg);
+    return warps_.at(0)->getIRegValue(reg);
  }

  Word get_csr(Addr addr, int tid, int wid);
@@ -73,50 +71,66 @@ public:
  void dcache_write(Addr, Word, Size);

  void trigger_ebreak();
+
  bool check_ebreak() const;

 private:

-  void schedule();
  void fetch();
  void decode();
  void issue();
  void execute();
-  void writeback();
+  void commit();
+
+  void warp_scheduler();
+
+  void icache_handleCacheReponse(const MemRsp& response, uint32_t port_id);

  void writeToStdOut(Addr addr, Word data);

-  std::vector<RegMask> in_use_iregs_;
-  std::vector<RegMask> in_use_fregs_;
-  RegMask in_use_vregs_;
-  WarpMask stalled_warps_;
-  std::vector<std::shared_ptr<Warp>> warps_;  
-  std::vector<WarpMask> barriers_;  
-  std::vector<Word> csrs_;
-  std::vector<Byte> fcsrs_;
-  std::unordered_map<int, std::stringstream> print_bufs_;
-
  Word id_;
  const ArchDef& arch_;
-  Decoder &decoder_;
+  const Decoder& decoder_;
  MemoryUnit& mem_;
 #ifdef SM_ENABLE
  RAM shared_mem_;
 #endif 

+  std::vector<std::shared_ptr<Warp>> warps_;  
+  std::vector<WarpMask> barriers_;  
+  std::vector<Word> csrs_;
+  std::vector<Byte> fcsrs_;
+  std::vector<IBuffer> ibuffers_;
+  Scoreboard scoreboard_;
+  std::vector<ExeUnit::Ptr> exe_units_;
+  Cache::Ptr icache_;
+  Cache::Ptr dcache_;
+  Switch<MemReq, MemRsp>::Ptr l1_mem_switch_;
+  SlavePort<MemRsp> icache_rsp_port_;
+  std::vector<SlavePort<MemRsp>> dcache_rsp_port_;
+
+  PipelineStage fetch_stage_;
+  PipelineStage decode_stage_;
+  PipelineStage issue_stage_;
+  PipelineStage execute_stage_;
+  PipelineStage commit_stage_;  
+  
+  HashTable<pipeline_state_t> pending_icache_;
+  WarpMask stalled_warps_;  
+  uint32_t last_schedule_wid_;
+  uint32_t pending_instrs_;
  bool ebreak_;

-  Pipeline inst_in_schedule_;
-  Pipeline inst_in_fetch_;
-  Pipeline inst_in_decode_;
-  Pipeline inst_in_issue_;
-  Pipeline inst_in_execute_;
-  Pipeline inst_in_writeback_;
+  std::unordered_map<int, std::stringstream> print_bufs_;
+  uint64_t stats_insts_;
+  uint64_t stats_loads_;
+  uint64_t stats_stores_;

-  uint64_t steps_;
-  uint64_t insts_;
-  uint64_t loads_;
-  uint64_t stores_; 
+  friend class LsuUnit;
+
+public:
+  SlavePort<MemRsp>  MemRspPort;
+  MasterPort<MemReq> MemReqPort;
 };

 } // namespace vortex
--- a/sim/simX/decode.cpp
+++ b/sim/simX/decode.cpp
@@ -281,7 +281,7 @@ Decoder::Decoder(const ArchDef &arch) {
  v_imm_mask_  = 0x7ff;  
 }

-std::shared_ptr<Instr> Decoder::decode(Word code, Word PC) {  
+std::shared_ptr<Instr> Decoder::decode(Word code, Word PC) const {  
  auto instr = std::make_shared<Instr>();
  Opcode op = (Opcode)((code >> shift_opcode_) & opcode_mask_);
  instr->setOpcode(op);
@@ -351,9 +351,9 @@ std::shared_ptr<Instr> Decoder::decode(Word code, Word PC) {
    instr->setFunc3(func3);
    instr->setFunc7(func7);    
    if ((func3 == 5) && (op != L_INST) && (op != Opcode::FL)) {
-      instr->setImm(signExt(rs2, 5, reg_mask_));
+      instr->setImm(sext32(rs2, 5));
    } else {
-      instr->setImm(signExt(code >> shift_rs2_, 12, i_imm_mask_));
+      instr->setImm(sext32(code >> shift_rs2_, 12));
    }
  } break;

@@ -366,7 +366,7 @@ std::shared_ptr<Instr> Decoder::decode(Word code, Word PC) {
    }
    instr->setFunc3(func3);
    Word imeed = (func7 << reg_s_) | rd;
-    instr->setImm(signExt(imeed, 12, s_imm_mask_));
+    instr->setImm(sext32(imeed, 12));
  } break;

  case InstType::B_TYPE: {
@@ -378,12 +378,12 @@ std::shared_ptr<Instr> Decoder::decode(Word code, Word PC) {
    Word bit_10_5 = func7 & 0x3f;
    Word bit_12   = func7 >> 6;
    Word imeed = (bits_4_1 << 1) | (bit_10_5 << 5) | (bit_11 << 11) | (bit_12 << 12);
-    instr->setImm(signExt(imeed, 13, b_imm_mask_));
+    instr->setImm(sext32(imeed, 13));
  } break;

  case InstType::U_TYPE:
    instr->setDestReg(rd);
-    instr->setImm(signExt(code >> shift_func3_, 20, u_imm_mask_));
+    instr->setImm(sext32(code >> shift_func3_, 20));
    break;

  case InstType::J_TYPE: {
--- a/sim/simX/decode.h
+++ b/sim/simX/decode.h
@@ -13,7 +13,7 @@ class Decoder {
 public:
  Decoder(const ArchDef &);    
  
-  std::shared_ptr<Instr> decode(Word code, Word PC);
+  std::shared_ptr<Instr> decode(Word code, Word PC) const;

 private:

--- a/sim/simX/execute.cpp
+++ b/sim/simX/execute.cpp
--- a/sim/simX/exeunit.cpp
+++ b/sim/simX/exeunit.cpp
@@ -0,0 +1,152 @@
+#include "exeunit.h"
+#include <iostream>
+#include <iomanip>
+#include <string.h>
+#include <assert.h>
+#include <util.h>
+#include "debug.h"
+#include "core.h"
+
+using namespace vortex;
+
+LsuUnit::LsuUnit(Core* core) // load/store execute unit; keeps a pointer to the owning core
+    : ExeUnit("LSU")
+    , core_(core)
+    , num_threads_(core->arch().num_threads()) // reads the core's ArchDef during construction
+    , pending_dcache_(LSUQ_SIZE) // table of in-flight dcache requests, constructed with LSUQ_SIZE
+    , fence_lock_(false) // no fence pending initially
+{}
+
+// Dcache response callback. response.tag identifies the pending LSU request
+// (as returned by pending_dcache_.allocate in step()) and port_id is the
+// thread lane the response arrived on. Each response clears that lane's bit
+// in the request's outstanding-lane mask; when no bit remains set the
+// instruction is scheduled as an output and its table entry is released.
+void LsuUnit::handleCacheReponse(const MemRsp& response, uint32_t port_id) {
+    // Bind by reference so the cleared bit is recorded in the table itself.
+    // Working on a copy would restart every response from the full thread
+    // mask, and a request with more than one active lane would never finish.
+    // NOTE(review): relies on HashTable::at returning a mutable reference.
+    auto& entry = pending_dcache_.at(response.tag);
+    entry.second.reset(port_id); // track remaining blocks
+    if (entry.second.any())
+        return; // other lanes are still outstanding
+
+    // dcache_latency held the issue cycle; turn it into the elapsed cycles
+    auto latency = (SimPlatform::instance().cycles() - entry.first.dcache_latency);
+    entry.first.dcache_latency = latency;
+    this->schedule_output(entry.first, 1);
+
+    // entry must not be used after this point
+    pending_dcache_.release(response.tag);
+}
+
+void LsuUnit::step() { // handles at most one queued instruction per call, honoring a pending fence first
+    if (fence_lock_) {
+        // wait for all pending memory operations to complete
+        if (!pending_dcache_.empty())
+            return;
+        this->schedule_output(fence_state_, 1); // the fence itself completes once the table has drained
+        fence_lock_ = false;
+    }
+
+    if (inputs_.empty())
+        return;
+
+    auto state = inputs_.top(); // local copy: dcache_latency is written below before the state is stored
+
+    if (state.lsu.fence) {
+        // remember the fence and block further requests until pending ones drain
+        fence_state_ = state;
+        fence_lock_ = true;
+        inputs_.pop();
+        return;
+    }
+
+    // send dcache requests; when the table is full the instruction stays queued and is retried on the next call
+    if (!pending_dcache_.full()) {   
+        state.dcache_latency = SimPlatform::instance().cycles(); // issue timestamp, converted to a duration by the response handler
+        auto tag = pending_dcache_.allocate({state, state.tmask}); // the thread mask doubles as the set of outstanding lanes
+        for (uint32_t t = 0; t < num_threads_; ++t) {
+            if (!state.tmask.test(t)) // NOTE(review): with an empty tmask nothing is sent and the entry is never released
+                continue;
+            MemReq mem_req;
+            mem_req.addr  = state.mem_addrs.at(t);
+            mem_req.write = state.lsu.store;
+            mem_req.tag   = tag;
+            core_->dcache_->CoreReqPorts.at(t).send(mem_req, 1); // lane t uses dcache port t
+        }            
+        inputs_.pop();
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// The ALU unit keeps no reference to the core.
+AluUnit::AluUnit(Core*) : ExeUnit("ALU") {}
+
+// Takes at most one instruction from the input queue and schedules it as an
+// output after a delay selected by its ALU operation type.
+void AluUnit::step() {
+    pipeline_state_t instr;
+    if (!inputs_.try_pop(&instr))
+        return;
+
+    uint32_t delay = 0;
+    switch (instr.alu.type) {
+    case AluType::ARITH:
+    case AluType::BRANCH:
+        delay = 1;
+        break;
+    case AluType::IMUL:
+        delay = LATENCY_IMUL;
+        break;
+    case AluType::IDIV:
+        delay = XLEN;
+        break;
+    default:
+        // unrecognized type: the popped instruction produces no output
+        return;
+    }
+    this->schedule_output(instr, delay);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// The CSR unit keeps no reference to the core.
+CsrUnit::CsrUnit(Core*) : ExeUnit("CSR") {}
+
+// Takes at most one instruction from the input queue and schedules it as an
+// output with a delay of 1.
+void CsrUnit::step() {
+    pipeline_state_t instr;
+    if (inputs_.try_pop(&instr)) {
+        this->schedule_output(instr, 1);
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// The FPU unit keeps no reference to the core.
+FpuUnit::FpuUnit(Core*) : ExeUnit("FPU") {}
+
+// Takes at most one instruction from the input queue and schedules it as an
+// output after a delay selected by its FPU operation type.
+void FpuUnit::step() {
+    pipeline_state_t instr;
+    if (!inputs_.try_pop(&instr))
+        return;
+
+    uint32_t delay = 0;
+    switch (instr.fpu.type) {
+    case FpuType::FNCP:  delay = 1;             break;
+    case FpuType::FMA:   delay = LATENCY_FMA;   break;
+    case FpuType::FDIV:  delay = LATENCY_FDIV;  break;
+    case FpuType::FSQRT: delay = LATENCY_FSQRT; break;
+    case FpuType::FCVT:  delay = LATENCY_FCVT;  break;
+    default:
+        // unrecognized type: the popped instruction produces no output
+        return;
+    }
+    this->schedule_output(instr, delay);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// The GPU unit keeps no reference to the core.
+GpuUnit::GpuUnit(Core*) : ExeUnit("GPU") {}
+
+// Takes at most one instruction from the input queue. Every type except TEX
+// is scheduled as an output with a delay of 1. TEX is not implemented: the
+// instruction is consumed without producing an output.
+void GpuUnit::step() {
+    pipeline_state_t instr;
+    if (!inputs_.try_pop(&instr))
+        return;
+
+    bool completes = false;
+    switch (instr.gpu.type) {
+    case GpuType::TMC:
+    case GpuType::WSPAWN:
+    case GpuType::SPLIT:
+    case GpuType::JOIN:
+    case GpuType::BAR:
+        completes = true;
+        break;
+    case GpuType::TEX:
+        /* TODO */
+        break;
+    }
+    if (completes) {
+        this->schedule_output(instr, 1);
+    }
+}
--- a/sim/simX/exeunit.h
+++ b/sim/simX/exeunit.h
@@ -0,0 +1,103 @@
+#pragma once
+
+#include <simobject.h>
+#include "pipeline.h"
+#include "cache.h"
+
+namespace vortex {
+
+class Core;
+
+// Base class of the core's execute units. Instructions enter through
+// push_input(), the concrete unit processes them in step(), and finished
+// instructions are collected with pop_output().
+class ExeUnit {
+public:    
+    typedef std::shared_ptr<ExeUnit> Ptr;
+
+    ExeUnit(const char* name) : name_(name) {}
+    
+    virtual ~ExeUnit() {}
+
+    // Queues an instruction for this unit.
+    void push_input(const pipeline_state_t& state) {
+        inputs_.push(state);
+    }
+
+    // Fetches one finished instruction into *state; returns false when the
+    // output queue has nothing to offer.
+    bool pop_output(pipeline_state_t* state) {
+        return outputs_.try_pop(state);
+    }
+
+    // Advances the unit by one step.
+    virtual void step() = 0;
+
+protected:
+    const char* name_;
+    Queue<pipeline_state_t> inputs_;
+    Queue<pipeline_state_t> outputs_;
+
+    // Makes `state` available on the output queue. A delay of 0 or 1 queues
+    // it immediately; a larger delay defers the push through the platform
+    // scheduler by (delay - 1).
+    void schedule_output(const pipeline_state_t& state, uint32_t delay) {
+        if (delay <= 1) {
+            outputs_.push(state);
+            return;
+        }
+        // the callback runs later, so it captures only the unit itself
+        SimPlatform::instance().schedule(
+            [this](const pipeline_state_t& req) { 
+                outputs_.push(req); 
+            },
+            state,
+            (delay - 1)
+        );
+    }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Load/store execute unit: sends per-thread dcache requests and completes an
+// instruction once every requested lane has been answered.
+class LsuUnit : public ExeUnit {
+private:
+    Core* core_;            // owning core, used to reach its dcache ports
+    uint32_t num_threads_;  // number of thread lanes, read from the core's ArchDef
+    // in-flight requests: the instruction plus the mask of lanes still waiting
+    HashTable<std::pair<pipeline_state_t, ThreadMask>> pending_dcache_;
+    pipeline_state_t fence_state_; // fence instruction held while fence_lock_ is set
+    bool fence_lock_;              // true while a fence waits for pending requests to drain
+
+public:
+    LsuUnit(Core*);
+
+    // Dcache response callback; port_id is the lane the response arrived on.
+    void handleCacheReponse(const MemRsp& response, uint32_t port_id);
+
+    // `override` makes the compiler reject any drift from ExeUnit::step().
+    void step() override;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Integer ALU execute unit; completion latency depends on the ALU operation type.
+class AluUnit : public ExeUnit {
+public:
+    AluUnit(Core*);
+    
+    // `override` makes the compiler reject any drift from ExeUnit::step().
+    void step() override;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// CSR execute unit; every instruction completes with a delay of 1.
+class CsrUnit : public ExeUnit {
+public:
+    CsrUnit(Core*);
+    
+    // `override` makes the compiler reject any drift from ExeUnit::step().
+    void step() override;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Floating-point execute unit; completion latency depends on the FPU operation type.
+class FpuUnit : public ExeUnit {
+public:
+    FpuUnit(Core*);
+    
+    // `override` makes the compiler reject any drift from ExeUnit::step().
+    void step() override;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Execute unit for the GPU-type instructions (TMC, WSPAWN, SPLIT, JOIN, BAR, TEX).
+class GpuUnit : public ExeUnit {
+public:
+    GpuUnit(Core*);
+    
+    // `override` makes the compiler reject any drift from ExeUnit::step().
+    void step() override;
+};
+
+}
--- a/sim/simX/ibuffer.h
+++ b/sim/simX/ibuffer.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include "pipeline.h"
+#include <queue>
+
+namespace vortex {
+
+// FIFO of pipeline states with a nominal capacity. The capacity is only
+// reported through full(); push() does not enforce it, so callers check
+// full() before pushing.
+class IBuffer {
+public:    
+    // Deliberately not explicit: Core builds its vector of buffers from a
+    // plain size value.
+    IBuffer(uint32_t size) : capacity_(size) {}
+
+    // True when no state is buffered.
+    bool empty() const { return entries_.empty(); }
+    
+    // True when the number of buffered states equals the capacity.
+    bool full() const { return (capacity_ == entries_.size()); }
+
+    // Oldest buffered state; the buffer must not be empty.
+    const pipeline_state_t& top() const { return entries_.front(); }
+
+    // Appends a copy of `state` at the back.
+    void push(const pipeline_state_t& state) { entries_.push(state); }
+
+    // Drops the oldest buffered state.
+    void pop() { entries_.pop(); }
+
+private:
+    std::queue<pipeline_state_t> entries_;
+    uint32_t capacity_;
+};
+
+}
--- a/sim/simX/instr.h
+++ b/sim/simX/instr.h
@@ -113,15 +113,12 @@ private:
  int num_rsrcs_;
  bool has_imm_;
  int rdest_type_;
-  int isrc_mask_;
-  int fsrc_mask_;  
-  int vsrc_mask_;
  Word imm_;
  int rsrc_type_[MAX_REG_SOURCES];
  int rsrc_[MAX_REG_SOURCES];  
  int rdest_;
  Word func3_;
-  Word func7_;
+  Word func6_;

  //Vector
  Word vmask_;
@@ -132,7 +129,7 @@ private:
  Word vlmul_;
  Word vsew_;
  Word vediv_;
-  Word func6_;
+  Word func7_;  

  friend std::ostream &operator<<(std::ostream &, const Instr&);
 };
--- a/sim/simX/main.cpp
+++ b/sim/simX/main.cpp
@@ -5,28 +5,26 @@
 #include <fstream>
 #include <stdlib.h>
 #include <sys/stat.h>
-
-#include "debug.h"
-#include "types.h"
-#include "core.h"
+#include "processor.h"
 #include "args.h"

 using namespace vortex;

 int main(int argc, char **argv) {
+  int ret;

-  std::string archString("rv32imf");
+  std::string archStr("rv32imf");
+  std::string imgFileName;
  int num_cores(NUM_CORES * NUM_CLUSTERS);
  int num_warps(NUM_WARPS);
  int num_threads(NUM_THREADS);  
-  std::string imgFileName;
  bool showHelp(false);
  bool showStats(false);
  bool riscv_test(false);

  /* Read the command line arguments. */
  CommandLineArgFlag fh("-h", "--help", "", showHelp);
-  CommandLineArgSetter<std::string> fa("-a", "--arch", "", archString);  
+  CommandLineArgSetter<std::string> fa("-a", "--arch", "", archStr);  
  CommandLineArgSetter<std::string> fi("-i", "--image", "", imgFileName);
  CommandLineArgSetter<int> fc("-c", "--cores", "", num_cores);
  CommandLineArgSetter<int> fw("-w", "--warps", "", num_warps);
@@ -48,62 +46,18 @@ int main(int argc, char **argv) {
    return 0;
  }

-  ArchDef arch(archString, num_cores, num_warps, num_threads);
+  std::cout << "Running " << imgFileName << "..." << std::endl;
  
-  Decoder decoder(arch);
-  MemoryUnit mu(0, arch.wsize(), true);
-  
-  RAM ram((1<<12), (1<<20));
-
-  std::string program_ext(fileExtension(imgFileName.c_str()));
-  if (program_ext == "bin") {
-    ram.loadBinImage(imgFileName.c_str(), STARTUP_ADDR);
-  } else if (program_ext == "hex") {
-    ram.loadHexImage(imgFileName.c_str());
-  } else {
-    std::cout << "*** error: only *.bin or *.hex images supported." << std::endl;
+  if (!SimPlatform::instance().initialize())
    return -1;
+
+  {
+    ArchDef arch(archStr, num_cores, num_warps, num_threads);
+    Processor processor(arch);
+    ret = processor.run(imgFileName, riscv_test, showStats);
  }  

-  mu.attach(ram, 0, 0xFFFFFFFF);
+  SimPlatform::instance().finalize();

-  struct stat hello;
-  fstat(0, &hello);
-
-  std::vector<std::shared_ptr<Core>> cores(num_cores);
-  for (int i = 0; i < num_cores; ++i) {
-    cores[i] = std::make_shared<Core>(arch, decoder, mu, i);
-  }
-
-  bool running;
-  int exitcode = 0;
-  do {
-    running = false;
-    for (auto& core : cores) {            
-      core->step();
-      if (core->running()) {
-          running = true;
-      }
-      if (core->check_ebreak()) {
-        exitcode = core->getIRegValue(3);
-        running = false;
-        break;
-      }
-    }
-  } while (running);
-
-  if (riscv_test) {
-    if (1 == exitcode) {
-      std::cout << "Passed." << std::endl;
-      exitcode = 0;
-    } else {
-      std::cout << "Failed." << std::endl;
-    }
-  } else {
-    if (exitcode != 0) {
-      std::cout << "*** error: exitcode=" << exitcode << std::endl;
-    }
-  }
-
-  return exitcode;
+  return ret;
 }
--- a/sim/simX/memsim.cpp
+++ b/sim/simX/memsim.cpp
@@ -0,0 +1,58 @@
+#include "memsim.h"
+#include <vector>
+#include <queue>
+#include "constants.h"
+
+using namespace vortex;
+
+// Private implementation of MemSim: keeps one FIFO of pending requests per
+// bank and answers reads after a fixed latency.
+class MemSim::Impl {
+public:
+    Impl(MemSim* simobject, uint32_t num_banks, uint32_t latency)
+        : simobject_(simobject)
+        , inputs_(num_banks)
+        , latency_(latency)
+    {}
+
+    // Request-port callback: queue the request on the bank it arrived at.
+    // An out-of-range port_id throws std::out_of_range (vector::at).
+    void handleMemRequest(const MemReq& mem_req, uint32_t port_id) {
+        auto& pending = inputs_.at(port_id);
+        pending.push(mem_req);
+    }
+
+    // Retires at most one queued request per bank per call. A read produces a
+    // response carrying the request tag, sent on the same-numbered response
+    // port with the configured latency; a write is dropped without a reply.
+    void step(uint64_t /*cycle*/) {
+        uint32_t bank = 0;
+        for (auto& pending : inputs_) {
+            if (!pending.empty()) {
+                const MemReq& head = pending.front();
+                if (!head.write) {
+                    MemRsp reply;
+                    reply.tag = head.tag;
+                    simobject_->MemRspPorts.at(bank).send(reply, latency_);
+                }
+                pending.pop();
+            }
+            ++bank;
+        }
+    }
+
+private:
+    MemSim* simobject_;                       // owning MemSim (not owned here)
+    std::vector<std::queue<MemReq>> inputs_;  // pending requests, one queue per bank
+    uint32_t latency_;                        // delay handed to send() for read responses
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Creates the memory model with `num_banks` request/response port pairs; the
+// `latency` value is forwarded to the implementation object.
+// Members are initialised in declaration order, and the request ports capture
+// impl_ as their callback target, so impl_ must stay declared before the ports.
+MemSim::MemSim(const SimContext& ctx, 
+               uint32_t num_banks,
+               uint32_t latency) 
+    : SimObject<MemSim>(ctx, "MemSim")
+    , impl_(new Impl(this, num_banks, latency))
+    , MemReqPorts(num_banks, {this, impl_, &Impl::handleMemRequest}) 
+    , MemRspPorts(num_banks, this)
+{}
+
+// Releases the implementation object allocated with `new` in the constructor.
+MemSim::~MemSim() {
+    delete impl_;
+}
+
+// Advances the memory model by one step; all work is delegated to Impl::step.
+void MemSim::step(uint64_t cycle) {
+    impl_->step(cycle);
+}
--- a/sim/simX/memsim.h
+++ b/sim/simX/memsim.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <simobject.h>
+#include <vector>
+#include <list>
+
+namespace vortex {
+
+// Memory request packet exchanged over MemReq ports.
+struct MemReq {
+    uint64_t addr;   // target address of the request
+    uint32_t tag;    // identifier copied into the matching MemRsp
+    bool write;      // true for a write; MemSim sends no response for writes
+};
+
+// Memory response packet; carries only the tag of the request it answers.
+struct MemRsp {
+    uint32_t tag;
+};
+
+// Fixed-latency memory model exposed as a simulation object with one
+// request/response port pair per bank. The behaviour lives in a private
+// implementation class defined in memsim.cpp.
+class MemSim : public SimObject<MemSim>{
+private:
+    class Impl;
+    // Owned implementation object, deleted in ~MemSim(). It must stay declared
+    // before the ports: the request ports capture it during construction.
+    Impl* impl_;
+
+public:
+
+    // num_banks: number of request/response port pairs to create.
+    // latency:   delay applied to read responses.
+    MemSim(const SimContext& ctx, uint32_t num_banks, uint32_t latency);
+    ~MemSim();
+
+    // impl_ is an owning raw pointer, so a copy would delete it twice.
+    MemSim(const MemSim&) = delete;
+    MemSim& operator=(const MemSim&) = delete;
+
+    // Advances the model by one step.
+    void step(uint64_t cycle);
+
+    std::vector<SlavePort<MemReq>>  MemReqPorts;  // incoming requests, one per bank
+    std::vector<MasterPort<MemRsp>> MemRspPorts;  // outgoing responses, one per bank
+};
+
+};
--- a/sim/simX/pipeline.cpp
+++ b/sim/simX/pipeline.cpp
@@ -1,63 +0,0 @@
-#include <iostream>
-#include "pipeline.h"
-
-using namespace vortex;
-
-namespace vortex {
-std::ostream &operator<<(std::ostream &os, const Pipeline& pipeline) {
-  os << pipeline.name_ << ": valid=" << pipeline.valid << std::endl;
-  os << pipeline.name_ << ": stalled=" << pipeline.stalled << std::endl;
-  os << pipeline.name_ << ": stall_warp=" << pipeline.stall_warp << std::endl;      
-  os << pipeline.name_ << ": wid=" << pipeline.wid << std::endl;
-  os << pipeline.name_ << ": PC=" << std::hex << pipeline.PC << std::endl;
-  os << pipeline.name_ << ": used_iregs=" << pipeline.used_iregs << std::endl;
-  os << pipeline.name_ << ": used_fregs=" << pipeline.used_fregs << std::endl;
-  os << pipeline.name_ << ": used_vregs=" << pipeline.used_vregs << std::endl;
-  return os;
-}
-}
-
-Pipeline::Pipeline(const char* name) 
-: name_(name) {
-  this->clear();
-}
-
-void Pipeline::clear() {
-  valid = false;
-  stalled = false;
-  stall_warp = false;
-  wid = 0;
-  PC = 0;
-  used_iregs.reset();
-  used_fregs.reset();
-  used_vregs.reset();
-}
-
-bool Pipeline::enter(Pipeline *drain) {
-  if (drain) {
-    if (drain->stalled) {
-      this->stalled = true;
-      return false;
-    }
-    drain->valid = false;
-  }
-  this->stalled = false;
-  if (!this->valid)
-    return false;
-  return true;
-}
-
-void Pipeline::next(Pipeline *drain) {
-  if (drain) {
-    drain->valid = this->valid;
-    drain->stalled = this->stalled;
-    drain->stall_warp = this->stall_warp;
-    drain->wid = this->wid;
-    drain->PC = this->PC;
-    drain->rdest = this->rdest;
-    drain->rdest_type = this->rdest_type;
-    drain->used_iregs = this->used_iregs;
-    drain->used_fregs = this->used_fregs;
-    drain->used_vregs = this->used_vregs;
-  }
-}
--- a/sim/simX/pipeline.h
+++ b/sim/simX/pipeline.h
@@ -2,47 +2,75 @@
 #pragma once

 #include <memory>
+#include <iostream>
 #include <util.h>
 #include "types.h"
 #include "debug.h"

 namespace vortex {

-class Instr;
-
-class Pipeline {
-public:
-  Pipeline(const char* name);
-
-  void clear();
-
-  bool enter(Pipeline* drain);
-
-  void next(Pipeline* drain);
-
-  //--
-  bool      valid;
-
-  //--
-  bool      stalled;
-  bool      stall_warp;
-
+struct pipeline_state_t {
  //--    
  int         wid;  
+  ThreadMask  tmask;
  Word        PC;

  //--
+  bool        stall_warp;
  int         rdest_type;
  int         rdest;
  RegMask     used_iregs;
  RegMask     used_fregs;
  RegMask     used_vregs;

-private:
+  //- 
+  ExeType     exe_type; 
+  std::vector<uint64_t> mem_addrs;
  
-  const char* name_;
-
-  friend std::ostream &operator<<(std::ostream &, const Pipeline&);
+  //--
+  union {
+    struct {        
+      uint8_t load : 1;
+      uint8_t store: 1;
+      uint8_t fence : 1;
+      uint8_t prefetch: 1;
+    } lsu;
+    struct {
+      AluType type;
+    } alu;
+    struct {
+      FpuType type;
+    } fpu;
+    struct {
+      GpuType type;
+    } gpu;
  };

+  // stats
+  uint64_t icache_latency;
+  uint64_t dcache_latency;
+};
+
+// A pipeline stage: a FIFO of pipeline_state_t entries carrying a name.
+class PipelineStage : public Queue<pipeline_state_t> {
+public:
+  // `name` is stored as given (the pointer is kept, the text is not copied).
+  PipelineStage(const char* name = nullptr) : name_(name) {}
+
+protected:
+  friend std::ostream &operator<<(std::ostream &, const pipeline_state_t&);
+
+  const char* name_;
+};
+
+// Writes a one-line summary of `state` (stall_warp, wid, PC in hexadecimal and
+// the three used-register masks) followed by std::endl, and returns `os`.
+// The stream's format flags are restored before returning so that the
+// hexadecimal base used for PC does not leak into the caller's later output.
+inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state) {
+  const std::ios_base::fmtflags saved_flags = os.flags();
+  os << "stall_warp="   << state.stall_warp;
+  os << ", wid="        << state.wid;
+  os << ", PC="         << std::hex << state.PC;
+  os.flags(saved_flags);  // std::hex is sticky; undo it right after PC
+  os << ", used_iregs=" << state.used_iregs;
+  os << ", used_fregs=" << state.used_fregs;
+  os << ", used_vregs=" << state.used_vregs;
+  os << std::endl;
+  return os;
+}
+
 }
--- a/sim/simX/processor.h
+++ b/sim/simX/processor.h
@@ -0,0 +1,189 @@
+#pragma once
+
+#include "constants.h"
+#include "debug.h"
+#include "types.h"
+#include "core.h"
+
+namespace vortex {
+
+// Top-level simulated processor. Owns the functional memory (RAM attached to
+// a memory unit), the cores, and the timing-model memory hierarchy:
+//   cores -> per-cluster L2 cache or arbiter (optional)
+//         -> L3 cache or arbiter (optional) -> MemSim.
+class Processor {
+private:
+  ArchDef arch_; 
+  Decoder decoder_;
+  MemoryUnit mu_;
+  RAM ram_;
+  std::vector<Core::Ptr> cores_;  
+  std::vector<Cache::Ptr> l2caches_;  
+  std::vector<Switch<MemReq, MemRsp>::Ptr> l2_mem_switches_;
+  Cache::Ptr l3cache_;
+  Switch<MemReq, MemRsp>::Ptr l3_mem_switch_;
+  MemSim::Ptr memsim_;
+
+public:
+  // Builds the cores and wires the memory hierarchy.
+  // Cores are split evenly across NUM_CLUSTERS; cluster i owns cores
+  // [i * cores_per_cluster, (i + 1) * cores_per_cluster).
+  // NOTE(review): when arch.num_cores() is not a multiple of NUM_CLUSTERS the
+  // remaining cores are created but never connected to memory.
+  Processor(const ArchDef& arch) 
+    : arch_(arch)
+    , decoder_(arch)
+    , mu_(0, arch.wsize(), true)
+    , ram_((1<<12), (1<<20)) 
+    , cores_(arch.num_cores())
+    , l2caches_(NUM_CLUSTERS)
+    , l2_mem_switches_(NUM_CLUSTERS)
+  {
+    uint32_t num_cores = arch.num_cores();
+    uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS;
+    
+    // bind RAM to memory unit
+    mu_.attach(ram_, 0, 0xFFFFFFFF);    
+
+    // create cores
+    for (uint32_t i = 0; i < num_cores; ++i) {
+      cores_.at(i) = Core::Create(arch, decoder_, mu_, i);
+    }
+    
+    // connect memory sub-system
+    // mem_req_ports / mem_rsp_ports always describe the ports the next level
+    // down (towards the cores) has to attach to. They start as the single
+    // MemSim port pair and, after the L3 stage, hold one pair per cluster.
+    memsim_ = MemSim::Create(1, MEM_LATENCY);
+    std::vector<SlavePort<MemReq>*>  mem_req_ports(1); 
+    std::vector<MasterPort<MemRsp>*> mem_rsp_ports(1);
+    mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0);
+    mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0);
+
+    if (L3_ENABLE) {
+      l3cache_ = Cache::Create("l3cache", CacheConfig{
+        log2ceil(L3_CACHE_SIZE),  // C
+        log2ceil(MEM_BLOCK_SIZE), // B
+        2,                      // W
+        0,                      // A
+        32,                     // address bits    
+        L3_NUM_BANKS,           // number of banks
+        L3_NUM_PORTS,           // number of ports
+        NUM_CLUSTERS,           // request size   
+        true,                   // write-through
+        0,                      // victim size
+        L3_MSHR_SIZE,           // mshr
+        2,                      // pipeline latency
+      });
+      mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort);
+      l3cache_->MemReqPort.bind(mem_req_ports.at(0));
+
+      mem_req_ports.resize(NUM_CLUSTERS);
+      mem_rsp_ports.resize(NUM_CLUSTERS);
+      for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
+        mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i);
+        mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i);
+      }
+    } else if (NUM_CLUSTERS > 1) {
+      l3_mem_switch_ = Switch<MemReq, MemRsp>::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS);
+      mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn);
+      l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0));      
+
+      mem_req_ports.resize(NUM_CLUSTERS);
+      mem_rsp_ports.resize(NUM_CLUSTERS);
+      for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
+        mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i);
+        mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i);
+      }
+    }
+
+    for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {      
+      // Ports the cores of this cluster attach to. They are kept separate
+      // from mem_*_ports so that wiring one cluster never overwrites the
+      // upstream ports that the following clusters still need.
+      std::vector<SlavePort<MemReq>*>  cluster_req_ports(cores_per_cluster);
+      std::vector<MasterPort<MemRsp>*> cluster_rsp_ports(cores_per_cluster);
+
+      if (L2_ENABLE) {
+        auto& l2cache = l2caches_.at(i);
+        l2cache = Cache::Create("l2cache", CacheConfig{
+          log2ceil(L2_CACHE_SIZE),  // C
+          log2ceil(MEM_BLOCK_SIZE), // B
+          2,                      // W
+          0,                      // A
+          32,                     // address bits    
+          L2_NUM_BANKS,           // number of banks
+          L2_NUM_PORTS,           // number of ports
+          NUM_CORES,              // request size   
+          true,                   // write-through
+          0,                      // victim size
+          L2_MSHR_SIZE,           // mshr
+          2,                      // pipeline latency
+        });
+        mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort);
+        l2cache->MemReqPort.bind(mem_req_ports.at(i));
+
+        for (uint32_t j = 0; j < cores_per_cluster; ++j) {
+          cluster_req_ports.at(j) = &l2cache->CoreReqPorts.at(j);
+          cluster_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j);
+        }
+      } else if (cores_per_cluster > 1) {
+        auto& l2_mem_switch = l2_mem_switches_.at(i);
+        l2_mem_switch = Switch<MemReq, MemRsp>::Create("l2_arb", ArbiterType::RoundRobin, NUM_CORES);
+        mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn);
+        l2_mem_switch->ReqOut.bind(mem_req_ports.at(i));  
+
+        for (uint32_t j = 0; j < cores_per_cluster; ++j) {
+          cluster_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j);
+          cluster_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j);
+        }
+      } else if (cores_per_cluster == 1) {
+        // No L2 stage: the single core talks to the cluster's upstream ports.
+        cluster_req_ports.at(0) = mem_req_ports.at(i);
+        cluster_rsp_ports.at(0) = mem_rsp_ports.at(i);
+      }
+
+      // Attach every core of cluster i to its own request/response port pair.
+      for (uint32_t j = 0; j < cores_per_cluster; ++j) {
+        auto& core = cores_.at((i * cores_per_cluster) + j);        
+        cluster_rsp_ports.at(j)->bind(&core->MemRspPort);
+        core->MemReqPort.bind(cluster_req_ports.at(j));
+      }
+    }
+  }
+
+  ~Processor() {}
+
+  // Loads `program` (a *.bin or *.hex image) into RAM and steps the simulation
+  // platform until no core is running or a core reports an ebreak.
+  // Returns -1 for an unsupported file extension. Otherwise the exit code is
+  // integer register 3 of the core that hit ebreak (0 if none did). When
+  // `riscv_test` is set, an exit code of 1 prints "Passed." and is reported
+  // as 0; any other value prints "Failed." and is returned unchanged.
+  int run(const std::string& program, bool riscv_test, bool /*showStats*/) {
+    {
+      std::string program_ext(fileExtension(program.c_str()));
+      if (program_ext == "bin") {
+        ram_.loadBinImage(program.c_str(), STARTUP_ADDR);
+      } else if (program_ext == "hex") {
+        ram_.loadHexImage(program.c_str());
+      } else {
+        std::cout << "*** error: only *.bin or *.hex images supported." << std::endl;
+        return -1;
+      }
+    }
+
+    bool running;
+    int exitcode = 0;
+    do {
+      SimPlatform::instance().step();
+      
+      running = false;
+      for (auto& core : cores_) {
+        if (core->running()) {
+          running = true;
+        }
+        if (core->check_ebreak()) {
+          // an ebreak on any core ends the whole simulation
+          exitcode = core->getIRegValue(3);
+          running = false;
+          break;
+        }
+      }
+    } while (running);
+
+    // get error status
+
+    if (riscv_test) {
+      if (1 == exitcode) {
+        std::cout << "Passed." << std::endl;
+        exitcode = 0;
+      } else {
+        std::cout << "Failed." << std::endl;
+      }
+    } else {
+      if (exitcode != 0) {
+        std::cout << "*** error: exitcode=" << exitcode << std::endl;
+      }
+    }
+
+    return exitcode;
+  }
+
+};
+
+}
--- a/sim/simX/scoreboard.h
+++ b/sim/simX/scoreboard.h
@@ -0,0 +1,71 @@
+#pragma once
+
+#include "pipeline.h"
+#include <queue>
+
+namespace vortex {
+
+// Per-warp record of destination registers that have been reserved and not
+// yet released, kept as one bit mask per register file (rdest_type 1, 2 and 3
+// select in_use_iregs_, in_use_fregs_ and in_use_vregs_ respectively).
+class Scoreboard {
+public:    
+    // One mask per warp for each register file; std::bitset starts all-clear.
+    Scoreboard(const ArchDef &arch) 
+        : in_use_iregs_(arch.num_warps())
+        , in_use_fregs_(arch.num_warps())
+        , in_use_vregs_(arch.num_warps())
+    {}
+
+    // True when any register named in the state's used_* masks is currently
+    // reserved for the state's warp.
+    bool in_use(const pipeline_state_t& state) const {
+        if ((state.used_iregs & in_use_iregs_.at(state.wid)).any())
+            return true;
+        if ((state.used_fregs & in_use_fregs_.at(state.wid)).any())
+            return true;
+        return (state.used_vregs & in_use_vregs_.at(state.wid)).any();
+    }
+    
+    // Marks the state's destination register as busy.
+    // Destination index 0 is never tracked, whatever the register file.
+    // NOTE(review): that also exempts register 0 of the non-integer files;
+    // confirm this is intended.
+    void reserve(const pipeline_state_t& state) {
+        if (state.rdest == 0)
+            return;
+        if (RegMask* mask = dest_mask(state))
+            mask->set(state.rdest);
+    }
+
+    // Clears the busy bit set by reserve() for the same state.
+    void release(const pipeline_state_t& state) {
+        if (state.rdest == 0)
+            return;
+        if (RegMask* mask = dest_mask(state))
+            mask->reset(state.rdest);
+    }
+
+private:
+    // Mask of the register file selected by rdest_type for the state's warp,
+    // or nullptr when rdest_type is not 1, 2 or 3.
+    RegMask* dest_mask(const pipeline_state_t& state) {
+        switch (state.rdest_type) {
+        case 1:  return &in_use_iregs_.at(state.wid);
+        case 2:  return &in_use_fregs_.at(state.wid);
+        case 3:  return &in_use_vregs_.at(state.wid);
+        default: return nullptr;
+        }
+    }
+
+    std::vector<RegMask> in_use_iregs_;
+    std::vector<RegMask> in_use_fregs_;
+    std::vector<RegMask> in_use_vregs_;
+};
+
+}
--- a/sim/simX/types.h
+++ b/sim/simX/types.h
@@ -2,7 +2,10 @@

 #include <stdint.h>
 #include <bitset>
+#include <queue>
+#include <unordered_map>
 #include <VX_config.h>
+#include <simobject.h>

 namespace vortex {

@@ -14,9 +17,242 @@ typedef uint32_t Addr;
 typedef uint32_t Size;

 typedef std::bitset<32> RegMask;
-
 typedef std::bitset<32> ThreadMask;
-
 typedef std::bitset<32> WarpMask;

+// Category tag stored in pipeline_state_t::exe_type.
+// NOTE(review): MAX looks like a count/sentinel rather than a real category;
+// confirm against its users.
+enum class ExeType {
+  ALU,
+  LSU,
+  CSR,
+  FPU,
+  GPU,
+  MAX,
+};
+
+// Sub-type carried in pipeline_state_t::alu.type.
+enum class AluType {
+  ARITH,
+  BRANCH,
+  IMUL,
+  IDIV,    
+};
+
+// Sub-type carried in pipeline_state_t::fpu.type.
+enum class FpuType {
+  FNCP,
+  FMA,
+  FDIV,
+  FSQRT,
+  FCVT,
+};
+
+// Sub-type carried in pipeline_state_t::gpu.type.
+enum class GpuType {
+  TMC,
+  WSPAWN,
+  SPLIT,
+  JOIN,
+  BAR,
+  TEX,
+};
+
+// Arbitration policy of Switch. With RoundRobin the scan start moves to the
+// input after the last grant; with Priority the scan start never moves from
+// its initial value of 0, so the lowest-numbered valid input is granted.
+enum class ArbiterType {
+  Priority,
+  RoundRobin
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Thin FIFO wrapper around std::queue that adds a non-failing try_pop().
+template <typename T>
+class Queue {
+public:
+  Queue() {}
+
+  // True when nothing is queued.
+  bool empty() const { return queue_.empty(); }
+
+  // Oldest queued element. The queue must not be empty.
+  const T& top() const { return queue_.front(); }
+
+  // Appends a copy of `value` at the back.
+  void push(const T& value) { queue_.push(value); }
+
+  // Discards the oldest element. The queue must not be empty.
+  void pop() { queue_.pop(); }
+
+  // Moves the oldest element into *value and removes it.
+  // Returns false, leaving *value untouched, when the queue is empty.
+  bool try_pop(T* value) {
+    const bool has_item = !queue_.empty();
+    if (has_item) {
+      *value = queue_.front();
+      queue_.pop();
+    }
+    return has_item;
+  }
+
+protected:
+  std::queue<T> queue_;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Fixed-size slot table: allocate() stores a value in the first free slot and
+// returns the slot index, which is then used as the handle for at(),
+// release() and remove().
+template <typename T>
+class HashTable {
+private:
+  std::vector<std::pair<bool, T>> entries_;  // (occupied flag, value) per slot
+  uint32_t capacity_;                        // number of occupied slots
+
+public:    
+  // Creates a table with `size` free slots.
+  HashTable(uint32_t size)
+    : entries_(size)
+    , capacity_(0) 
+  {}
+
+  // True when no slot is occupied.
+  bool empty() const {
+    return (0 == capacity_);
+  }
+  
+  // True when every slot is occupied.
+  bool full() const {
+    return (capacity_ == entries_.size());
+  }
+
+  // True when slot `index` is occupied; throws std::out_of_range for an
+  // index past the end of the table.
+  bool contains(uint32_t index) const {
+    return entries_.at(index).first;
+  }
+
+  // Value stored in slot `index`, which must be occupied (asserted).
+  const T& at(uint32_t index) const {
+    auto& entry = entries_.at(index);
+    assert(entry.first);
+    return entry.second;
+  }
+
+  T& at(uint32_t index) {
+    auto& entry = entries_.at(index);
+    assert(entry.first);
+    return entry.second;
+  }
+
+  // Stores `value` in the lowest-numbered free slot and returns its index.
+  // Returns uint32_t(-1) when the table is full.
+  uint32_t allocate(const T& value) {
+    for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
+      auto& entry = entries_.at(i);
+      if (!entry.first) {
+        entry.first = true;
+        entry.second = value;
+        ++capacity_;              
+        return i;
+      }
+    }
+    return -1;
+  }
+
+  // Frees slot `index`, which must be occupied (asserted).
+  void release(uint32_t index) {
+    auto& entry = entries_.at(index);
+    assert(entry.first);
+    entry.first = false;
+    --capacity_;  // keep empty()/full() in step with the occupied slots
+  }
+
+  // Copies the value of slot `index` into *value and frees the slot, which
+  // must be occupied (asserted).
+  void remove(uint32_t index, T* value) {
+    auto& entry = entries_.at(index);
+    assert(entry.first);
+    *value = entry.second;
+    entry.first = false;
+    --capacity_;  // keep empty()/full() in step with the occupied slots
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// N-to-1 request arbiter with response routing.
+// Requests arriving on the ReqIn ports are latched per step, forwarded one at
+// a time through ReqOut, and the tag of each forwarded request is remembered
+// so that the response arriving on RspIn can be sent back out of the RspOut
+// port with the same index as the originating input.
+// The CRTP base is instantiated with the full Switch type, including
+// MaxInputs, so that it matches the base named in the constructor.
+template <typename Req, typename Rsp, uint32_t MaxInputs = 32>
+class Switch : public SimObject<Switch<Req, Rsp, MaxInputs>> {
+private:
+  // Requests latched from the inputs: one slot per input plus a valid bit.
+  struct req_t {  
+    std::vector<Req>       data;
+    std::bitset<MaxInputs> valid;
+    req_t() {} 
+    req_t(uint32_t size) : data(size) {} 
+  };
+
+  // ReqIn callback: latch the request in the slot of its input port.
+  void handleIncomingRequest(const Req& req, uint32_t port_id) {
+    cur_req_.data.at(port_id) = req;
+    cur_req_.valid.set(port_id);
+  }
+
+  // RspIn callback: queue the response for routing in step().
+  void handleIncomingResponse(const Rsp& rsp, uint32_t) {
+    rsps_.push(rsp);
+  }
+
+  ArbiterType type_;
+  std::queue<req_t> reqs_;   // latched request batches awaiting arbitration
+  std::queue<Rsp> rsps_;     // responses awaiting routing
+  req_t cur_req_;            // requests received since the last step()
+  uint32_t delay_;           // delay applied when forwarding a request
+  uint32_t cursor_;          // input index at which the arbitration scan starts
+  // request tag -> input index; an entry is overwritten when a tag is reused
+  // and is not erased once the response has been routed
+  std::unordered_map<uint32_t, uint32_t> addr_table_;
+
+public:
+  // num_inputs must not exceed MaxInputs and delay must be non-zero (asserted).
+  Switch(
+    const SimContext& ctx, 
+    const char* name, 
+    ArbiterType type, 
+    uint32_t num_inputs, 
+    uint32_t delay = 1
+  ) 
+    : SimObject<Switch<Req, Rsp, MaxInputs>>(ctx, name)    
+    , type_(type)
+    , cur_req_(num_inputs)
+    , delay_(delay)
+    , cursor_(0)
+    , ReqIn(num_inputs, {this, this, &Switch<Req, Rsp, MaxInputs>::handleIncomingRequest})
+    , ReqOut(this)
+    , RspIn(this, this, &Switch<Req, Rsp, MaxInputs>::handleIncomingResponse)    
+    , RspOut(num_inputs, this)
+  {
+    assert(delay_ != 0);
+    assert(num_inputs <= MaxInputs);
+  }
+
+  // One step: queue the batch latched since the previous call, forward at most
+  // one pending request, and route at most one pending response.
+  void step(uint64_t /*cycle*/) {    
+    if (cur_req_.valid.any()) {
+      reqs_.push(cur_req_);      
+      cur_req_.valid.reset();
+    }
+
+    // Serve the oldest batch first; a batch is dropped once it has no valid
+    // request left, and the scan then moves on to the next batch.
+    while (!reqs_.empty()) {
+      auto& entry = reqs_.front();
+      bool found = false;
+      for (uint32_t i = 0, n = entry.data.size(); i < n; ++i) {
+        auto j = (cursor_ + i) % n;        
+        if (entry.valid.test(j)) {
+          auto& req = entry.data.at(j);
+          addr_table_[req.tag] = j;  // remember where the response goes
+          ReqOut.send(req, delay_);
+          entry.valid.reset(j);
+          this->update_cursor(j);
+          found = true;
+          break;
+        }
+      }
+      if (found)
+        break;
+      reqs_.pop();
+    } 
+
+    if (!rsps_.empty()) {
+      auto& rsp = rsps_.front();
+      // throws std::out_of_range for a tag that was never forwarded
+      auto port_id = addr_table_.at(rsp.tag);
+      RspOut.at(port_id).send(rsp, 1);
+      rsps_.pop();
+    }
+  }
+
+  // Round-robin: the next scan starts after the input just granted. The value
+  // may equal the input count; step() reduces it modulo that count.
+  void update_cursor(uint32_t grant) {
+    if (type_ == ArbiterType::RoundRobin) {
+      cursor_ = grant + 1;
+    }
+  }
+
+  std::vector<SlavePort<Req>>  ReqIn;   // one request input per requester
+  MasterPort<Req>              ReqOut;  // arbitrated request output
+  SlavePort<Rsp>               RspIn;   // response input from downstream
+  std::vector<MasterPort<Rsp>> RspOut;  // one response output per requester
+};
+
 }
--- a/sim/simX/warp.cpp
+++ b/sim/simX/warp.cpp
@@ -12,25 +12,21 @@ using namespace vortex;

 Warp::Warp(Core *core, Word id)
    : id_(id)
-    , core_(core) {
+    , core_(core)
+    , active_(false)
+    , PC_(STARTUP_ADDR)
+    , tmask_(0) {
  iRegFile_.resize(core_->arch().num_threads(), std::vector<Word>(core_->arch().num_regs(), 0));
  fRegFile_.resize(core_->arch().num_threads(), std::vector<Word>(core_->arch().num_regs(), 0));
  vRegFile_.resize(core_->arch().num_regs(), std::vector<Byte>(core_->arch().vsize(), 0));
-  this->clear();
 }

-void Warp::clear() {
-  PC_ = STARTUP_ADDR;
-  tmask_.reset();
-  active_ = false;
-}
-
-void Warp::step(Pipeline *pipeline) {
+void Warp::eval(pipeline_state_t *pipeline_state) {
  assert(tmask_.any());

  DPH(2, "Step: wid=" << id_ << ", PC=0x" << std::hex << PC_ << ", tmask=");
  for (int i = 0, n = core_->arch().num_threads(); i < n; ++i)
-    DPN(2, tmask_[n-i-1]);
+    DPN(2, tmask_.test(n-i-1));
  DPN(2, "\n");

  /* Fetch and decode. */    
@@ -38,55 +34,24 @@ void Warp::step(Pipeline *pipeline) {
  Word fetched = core_->icache_fetch(PC_);
  auto instr = core_->decoder().decode(fetched, PC_);

-  // Update pipeline
-  pipeline->valid = true;
-  pipeline->PC = PC_;
-  pipeline->rdest = instr->getRDest();
-  pipeline->rdest_type = instr->getRDType();
-  pipeline->used_iregs.reset();
-  pipeline->used_fregs.reset();
-  pipeline->used_vregs.reset();
-
-  switch (pipeline->rdest_type) {
-  case 1:
-    pipeline->used_iregs[pipeline->rdest] = 1;
-    break;
-  case 2:
-    pipeline->used_fregs[pipeline->rdest] = 1;
-    break;
-  case 3:
-    pipeline->used_vregs[pipeline->rdest] = 1;
-    break;
-  default:
-    break;
-  }
-
-  for (int i = 0; i < instr->getNRSrc(); ++i) {
-    int type = instr->getRSType(i);
-    int reg = instr->getRSrc(i);
-    switch (type) {
-    case 1:
-      pipeline->used_iregs[reg] = 1;
-      break;
-    case 2:
-      pipeline->used_fregs[reg] = 1;
-      break;
-    case 3:
-      pipeline->used_vregs[reg] = 1;
-      break;
-    default:
-      break;
-    }
-  }
+  // Update state
+  pipeline_state->wid   = id_;
+  pipeline_state->PC    = PC_;
+  pipeline_state->tmask = tmask_;
+  pipeline_state->rdest = instr->getRDest();
+  pipeline_state->rdest_type = instr->getRDType();
+  pipeline_state->used_iregs.reset();
+  pipeline_state->used_fregs.reset();
+  pipeline_state->used_vregs.reset();
  
  // Execute
-  this->execute(*instr, pipeline);
+  this->execute(*instr, pipeline_state);

  D(4, "Register state:");
  for (int i = 0; i < core_->arch().num_regs(); ++i) {
    DPN(4, "  %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
    for (int j = 0; j < core_->arch().num_threads(); ++j) {
-      DPN(4, ' ' << std::setfill('0') << std::setw(8) << std::hex << iRegFile_[j][i] << std::setfill(' ') << ' ');
+      DPN(4, ' ' << std::setfill('0') << std::setw(8) << std::hex << iRegFile_.at(j).at(i) << std::setfill(' ') << ' ');
    }
    DPN(4, std::endl);
  }  
--- a/sim/simX/warp.h
+++ b/sim/simX/warp.h
@@ -9,7 +9,7 @@ namespace vortex {

 class Core;
 class Instr;
-class Pipeline;
+class pipeline_state_t;
 struct DomStackEntry {
  DomStackEntry(const ThreadMask &tmask, Word PC) 
    : tmask(tmask)
@@ -42,8 +42,6 @@ class Warp {
 public:
  Warp(Core *core, Word id);
  
-  void clear();
-  
  bool active() const {
    return active_;
  }
@@ -71,7 +69,7 @@ public:
  }

  void setTmask(size_t index, bool value) {
-    tmask_[index] = value;
+    tmask_.set(index, value);
    active_ = tmask_.any();
  }

@@ -82,18 +80,18 @@ public:
  }

  Word getIRegValue(int reg) const {
-    return iRegFile_[0][reg];
+    return iRegFile_.at(0).at(reg);
  }

-  void step(Pipeline *);
+  void eval(pipeline_state_t *);

 private:

-  void execute(const Instr &instr, Pipeline *);
+  void execute(const Instr &instr, pipeline_state_t *pipeline_state);
  
  Word id_;
-  bool active_;
  Core *core_;
+  bool active_;
  
  Word PC_;
  ThreadMask tmask_;