fixes: texture unit mem access sometimes going to smem, bilinear texture filtering; new: cache req_id,

2021-11-24 00:00:17 -05:00
parent 1501360f4b
commit 18762dffce
70 changed files with 3818 additions and 1727 deletions
--- a/sim/common/bitmanip.h
+++ b/sim/common/bitmanip.h
@@ -0,0 +1,79 @@
+#pragma once
+
+#include <cstdint>
+#include <algorithm>
+#include <assert.h>
+
+// Counts the zero bits above the highest set bit of `value`.
+// A zero input returns 32 without invoking the builtin.
+constexpr uint32_t count_leading_zeros(uint32_t value) {
+  return (value == 0) ? 32 : __builtin_clz(value);
+}
+
+// Counts the zero bits below the lowest set bit of `value`.
+// A zero input returns 32 without invoking the builtin.
+constexpr uint32_t count_trailing_zeros(uint32_t value) {
+  return (value == 0) ? 32 : __builtin_ctz(value);
+}
+
+// True when exactly one bit of `value` is set; zero is rejected.
+constexpr bool ispow2(uint32_t value) {
+  return (value != 0) && ((value & (value - 1)) == 0);
+}
+
+// Ceiling of log2(value), computed as 32 minus the leading-zero count of
+// (value - 1). value == 1 yields 0. value == 0 wraps (value - 1) to
+// 0xffffffff and therefore yields 32.
+constexpr uint32_t log2ceil(uint32_t value) {
+  return 32 - count_leading_zeros(value - 1);
+}
+
+// Same as log2ceil(), except the result is never smaller than 1.
+inline unsigned log2up(uint32_t value) {
+  const uint32_t bits = log2ceil(value);
+  return (bits < 1) ? 1 : bits;
+}
+
+// Floor of log2(value): the index of the highest set bit.
+// For value == 0 the leading-zero count is 32, so the unsigned subtraction
+// wraps around; callers are expected to pass a non-zero value.
+constexpr unsigned log2floor(uint32_t value) {
+  return 31 - count_leading_zeros(value);
+}
+
+// Returns 32 minus the leading-zero count of `value`, i.e. the number of bits
+// needed to represent it (index of the highest set bit plus one; 0 for 0).
+// NOTE(review): the name suggests "round up to a power of two" - confirm that
+// callers expect a bit width.
+constexpr unsigned ceil2(uint32_t value) {
+  return 32 - count_leading_zeros(value);
+}
+
+// Returns `bits` with bit number `index` (0 = least significant) forced to 0.
+inline uint64_t bit_clr(uint64_t bits, uint32_t index) {
+    assert(index <= 63);
+    const uint64_t selector = 1ull << index;
+    return bits & ~selector;
+}
+
+// Returns `bits` with bit number `index` (0 = least significant) forced to 1.
+inline uint64_t bit_set(uint64_t bits, uint32_t index) {
+    assert(index <= 63);
+    const uint64_t selector = 1ull << index;
+    return bits | selector;
+}
+
+// Reports whether bit number `index` (0 = least significant) of `bits` is set.
+inline bool bit_get(uint64_t bits, uint32_t index) {
+    assert(index <= 63);
+    const uint64_t shifted = bits >> index;
+    return (shifted & 1ull) != 0;
+}
+
+// Returns `bits` with the inclusive bit window [start, end] cleared.
+inline uint64_t bit_clrw(uint64_t bits, uint32_t start, uint32_t end) {
+    assert(end >= start);
+    assert(end <= 63);
+    const uint32_t top_gap = 63 - end;
+    // Push (top_gap + start) ones off the top, then slide the survivors back
+    // down by top_gap so the remaining ones cover exactly [start, end].
+    uint64_t window = 0xffffffffffffffff << (top_gap + start);
+    window >>= top_gap;
+    return bits & ~window;
+}
+
+// Returns `bits` with the inclusive bit window [start, end] replaced by the
+// low-order bits of `value`.
+inline uint64_t bit_setw(uint64_t bits, uint32_t start, uint32_t end, uint64_t value) {
+    assert(end >= start);
+    assert(end <= 63);
+    const uint32_t top_gap = 63 - end;
+    // Shifting up and back down discards the bits of `value` that do not fit
+    // in the window and leaves the rest positioned at `start`.
+    const uint64_t field = (value << (top_gap + start)) >> top_gap;
+    const uint64_t cleared = bit_clrw(bits, start, end);
+    return cleared | field;
+}
+
+// Extracts the inclusive bit window [start, end] of `bits`, right-aligned.
+inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) {
+    assert(end >= start);
+    assert(end <= 63);
+    const uint32_t top_gap = 63 - end;
+    // Left-align the window to drop everything above `end`...
+    const uint64_t aligned = bits << top_gap;
+    // ...then shift down so bit `start` lands at position 0.
+    return aligned >> (top_gap + start);
+}
+
+// Apply integer sign extension.
+// Treats the low `width` bits of `word` as a signed value and replicates bit
+// (width - 1) into all higher bits. `width` must be in [2, 32].
+inline uint32_t sext32(uint32_t word, uint32_t width) {
+  assert(width > 1);
+  assert(width <= 32);
+  // A 32-bit field is already fully extended; returning early also avoids
+  // shifting by the full type width, which is undefined behaviour.
+  if (width == 32)
+    return word;
+  uint32_t mask = (1u << width) - 1;
+  return ((word >> (width - 1)) & 0x1) ? (word | ~mask) : word;
+}
--- a/sim/common/fixed.h
+++ b/sim/common/fixed.h
@@ -0,0 +1,419 @@
+#pragma once
+
+#include <cstdint>
+#include <cstdlib>
+#include <assert.h>
+
+template <uint32_t F, typename T = int32_t>
+class Fixed {
+private:
+
+  // Converts the raw bits of a Fixed<F2, T2> into this type's representation
+  // (F fractional bits stored in a T). The overload is selected at compile
+  // time from two facts: whether the source storage type is wider than T, and
+  // whether the source carries more fractional bits than F.
+  template <uint32_t F2, typename T2> 
+  struct Cast {
+  private:
+    // First flag: sizeof(T2) > sizeof(T). Second flag: F2 > F.
+    template <bool isT2Bigger, bool isF2Bigger> struct Tag {};
+
+    // Source not wider, F2 <= F: convert to T first, then scale up.
+    inline static T Convert(T2 value, Tag<false, false>) {
+      return static_cast<T>(value) << (F - F2);
+    }
+
+    // Source not wider, F2 > F: convert to T first, then drop fraction bits.
+    inline static T Convert(T2 value, Tag<false, true>) {
+      return static_cast<T>(value) >> (F2 - F);
+    }
+
+    // Source wider, F2 <= F: scale up in the wider type, then narrow to T.
+    inline static T Convert(T2 value, Tag<true, false>) {
+      return static_cast<T>(value << (F - F2));
+    }
+
+    // Source wider, F2 > F: drop fraction bits in the wider type, then narrow.
+    inline static T Convert(T2 value, Tag<true, true>) {
+      return static_cast<T>(value >> (F2 - F));
+    }
+
+  public:    
+    // Entry point: dispatches to the matching overload above.
+    inline static T Convert(T2 value) {
+      return Convert(value, Tag<(sizeof(T2) > sizeof(T)), (F2 > F)>{});
+    }  
+  };
+
+public:
+  using data_type = T;
+
+  static constexpr uint32_t FRAC = F;
+  static constexpr uint32_t INT = sizeof(T) * 8 - FRAC;
+  static constexpr uint32_t HFRAC = FRAC >> 1;
+  static constexpr T ONE = static_cast<T>(1) << FRAC;
+  static constexpr T MASK = ONE - 1;
+  static constexpr T IMASK = ~MASK;
+  static constexpr T HALF = ONE >> 1;
+  static constexpr T TWO = ONE << 1;
+
+  Fixed() {}
+
+  explicit Fixed(int64_t rhs)
+      : data_(static_cast<T>(rhs << FRAC)) {
+    assert((static_cast<int64_t>(rhs) << FRAC) == data_);
+  }
+
+  explicit Fixed(uint64_t rhs)
+      : data_(static_cast<T>(rhs << FRAC)) {
+    assert((static_cast<int64_t>(rhs) << FRAC) == data_);
+  }
+
+  explicit Fixed(int32_t rhs)
+      : data_(static_cast<T>(rhs << FRAC)) {
+    assert((static_cast<int64_t>(rhs) << FRAC) == data_);
+  }
+
+  explicit Fixed(uint32_t rhs)
+      : data_(static_cast<T>(rhs << FRAC)) {
+    assert((static_cast<int64_t>(rhs) << FRAC) == data_);
+  }
+
+  explicit Fixed(int16_t rhs)
+      : data_(static_cast<T>(rhs << FRAC)) {
+    assert((static_cast<int64_t>(rhs) << FRAC) == data_);
+  }
+
+  explicit Fixed(uint16_t rhs)
+      : data_(static_cast<T>(rhs << FRAC)) {
+    assert((static_cast<int64_t>(rhs) << FRAC) == data_);
+  }
+
+  explicit Fixed(int8_t rhs)
+      : data_(static_cast<T>(rhs << FRAC)) {
+    assert((static_cast<int64_t>(rhs) << FRAC) == data_);
+  }
+
+  explicit Fixed(uint8_t rhs)
+      : data_(static_cast<T>(rhs << FRAC)) {
+    assert((static_cast<int64_t>(rhs) << FRAC) == data_);
+  }
+
+  template <uint32_t F2, typename T2>
+  explicit Fixed(Fixed<F2, T2> rhs)
+    : data_(Cast<F2, T2>::Convert(rhs.data()))
+  {}
+
+  explicit Fixed(float rhs)
+      : data_(static_cast<T>(rhs * ONE)) {
+    assert(data_ == static_cast<T>(rhs * ONE));
+  }
+
+  bool operator==(Fixed rhs) const {
+    return (data_ == rhs.data_);
+  }
+
+  bool operator!=(Fixed rhs) const {
+    return (data_ != rhs.data_);
+  }
+
+  bool operator<(Fixed rhs) const {
+    return (data_ < rhs.data_);
+  }
+
+  bool operator<=(Fixed rhs) const {
+    return (data_ <= rhs.data_);
+  }
+
+  bool operator>(Fixed rhs) const {
+    return (data_ > rhs.data_);
+  }
+
+  bool operator>=(Fixed rhs) const {
+    return (data_ >= rhs.data_);
+  }
+
+  Fixed operator-() const {
+    return make(-data_);
+  }
+
+  Fixed operator+=(Fixed rhs) {
+    *this = (*this) + rhs;
+    return *this;
+  }
+
+  Fixed operator-=(Fixed rhs) {
+    *this = (*this) - rhs;
+    return *this;
+  }
+
+  Fixed operator*=(Fixed rhs) {
+    *this = (*this) * rhs;
+    return *this;
+  }
+
+  Fixed operator/=(Fixed rhs) {
+    *this = (*this) / rhs;
+    return *this;
+  }
+
+  template <uint32_t F2, typename T2>
+  Fixed operator*=(Fixed<F2, T2> rhs) {
+    *this = (*this) * rhs;
+    return *this;
+  }
+
+  template <uint32_t F2, typename T2>
+  Fixed operator/=(Fixed<F2, T2> rhs) {
+    *this = (*this) / rhs;
+    return *this;
+  }
+
+  Fixed operator*=(int32_t rhs) {
+    *this = (*this) * rhs;
+    return *this;
+  }
+
+  Fixed operator*=(uint32_t rhs) {
+    *this = (*this) * rhs;
+    return *this;
+  }
+
+  Fixed operator*=(float rhs) {
+    *this = (*this) * rhs;
+    return *this;
+  }
+
+  Fixed operator/=(int32_t rhs) {
+    *this = (*this) / rhs;
+    return *this;
+  }
+
+  Fixed operator/=(uint32_t rhs) {
+    *this = (*this) / rhs;
+    return *this;
+  }
+
+  Fixed operator/=(float rhs) {
+    *this = (*this) / rhs;
+    return *this;
+  }
+
+  friend Fixed operator+(Fixed lhs, Fixed rhs) {
+    assert((static_cast<int64_t>(lhs.data_) + rhs.data_) ==
+           (lhs.data_ + rhs.data_));
+    return Fixed::make(lhs.data_ + rhs.data_);
+  }
+
+  friend Fixed operator-(Fixed lhs, Fixed rhs) {
+    assert((static_cast<int64_t>(lhs.data_) - rhs.data_) ==
+           (lhs.data_ - rhs.data_));
+    return Fixed::make(lhs.data_ - rhs.data_);
+  }
+
+  friend Fixed operator*(Fixed lhs, Fixed rhs) {
+    return Fixed::make((static_cast<int64_t>(lhs.data_) * rhs.data_) >> FRAC);
+  }
+
+  template <uint32_t F2, typename T2>
+  friend Fixed operator*(Fixed lhs, Fixed<F2, T2> rhs) {
+    return Fixed::make((static_cast<int64_t>(lhs.data_) * rhs.data()) >> F2);
+  }
+
+  friend Fixed operator/(Fixed lhs, Fixed rhs) {
+    assert(rhs.data_ != 0);
+    return Fixed::make((static_cast<int64_t>(lhs.data_) << FRAC) / rhs.data_);
+  }
+
+  template <uint32_t F2, typename T2>
+  friend Fixed operator/(Fixed lhs, Fixed<F2, T2> rhs) {
+    assert(rhs.data() != 0);
+    return Fixed::make((static_cast<int64_t>(lhs.data_) << F2) / rhs.data());
+  }
+
+  // Fixed * float, evaluated in floating point. The product goes back through
+  // the explicit Fixed(float) constructor: returning the bare float cannot
+  // compile because every Fixed constructor is explicit.
+  friend Fixed operator*(Fixed lhs, float rhs) {
+    return Fixed(static_cast<float>(lhs) * rhs);
+  }
+
+  // float * Fixed, evaluated in floating point. The product goes back through
+  // the explicit Fixed(float) constructor: returning the bare float cannot
+  // compile because every Fixed constructor is explicit.
+  friend Fixed operator*(float lhs, Fixed rhs) {
+    return Fixed(lhs * static_cast<float>(rhs));
+  }
+
+  // Fixed / float, evaluated in floating point. The quotient goes back through
+  // the explicit Fixed(float) constructor: returning the bare float cannot
+  // compile because every Fixed constructor is explicit.
+  friend Fixed operator/(Fixed lhs, float rhs) {
+    return Fixed(static_cast<float>(lhs) / rhs);
+  }
+
+  // float / Fixed, evaluated in floating point. The quotient goes back through
+  // the explicit Fixed(float) constructor: returning the bare float cannot
+  // compile because every Fixed constructor is explicit.
+  friend Fixed operator/(float lhs, Fixed rhs) {
+    return Fixed(lhs / static_cast<float>(rhs));
+  }
+
+  friend Fixed operator*(Fixed lhs, char rhs) {
+    return lhs * static_cast<int32_t>(rhs);
+  }
+
+  friend Fixed operator*(char lhs, Fixed rhs) {
+    return rhs * lhs;
+  }
+
+  friend Fixed operator/(Fixed lhs, char rhs) {
+    return lhs / static_cast<int32_t>(rhs);
+  }
+
+  // char / Fixed: the integer is the dividend. It is converted to Fixed first;
+  // division is not commutative, so `rhs / lhs` would compute the inverse.
+  friend Fixed operator/(char lhs, Fixed rhs) {
+    return Fixed(static_cast<int32_t>(lhs)) / rhs;
+  }
+
+  friend Fixed operator*(Fixed lhs, uint8_t rhs) {
+    return lhs * static_cast<int32_t>(rhs);
+  }
+
+  friend Fixed operator*(uint8_t lhs, Fixed rhs) {
+    return rhs * lhs;
+  }
+
+  friend Fixed operator/(Fixed lhs, uint8_t rhs) {
+    return lhs / static_cast<int32_t>(rhs);
+  }
+
+  // uint8_t / Fixed: the integer is the dividend. It is converted to Fixed
+  // first; division is not commutative, so `rhs / lhs` would compute the inverse.
+  friend Fixed operator/(uint8_t lhs, Fixed rhs) {
+    return Fixed(static_cast<int32_t>(lhs)) / rhs;
+  }
+
+  friend Fixed operator*(Fixed lhs, short rhs) {
+    return lhs * static_cast<int32_t>(rhs);
+  }
+
+  friend Fixed operator*(short lhs, Fixed rhs) {
+    return rhs * lhs;
+  }
+
+  friend Fixed operator/(Fixed lhs, short rhs) {
+    return lhs / static_cast<int32_t>(rhs);
+  }
+
+  // short / Fixed: the integer is the dividend. It is converted to Fixed
+  // first; division is not commutative, so `rhs / lhs` would compute the inverse.
+  friend Fixed operator/(short lhs, Fixed rhs) {
+    return Fixed(static_cast<int32_t>(lhs)) / rhs;
+  }
+
+  friend Fixed operator*(Fixed lhs, uint16_t rhs) {
+    return lhs * static_cast<int32_t>(rhs);
+  }
+
+  friend Fixed operator*(uint16_t lhs, Fixed rhs) {
+    return rhs * lhs;
+  }
+
+  friend Fixed operator/(Fixed lhs, uint16_t rhs) {
+    return lhs / static_cast<int32_t>(rhs);
+  }
+
+  // uint16_t / Fixed: the integer is the dividend. It is converted to Fixed
+  // first; division is not commutative, so `rhs / lhs` would compute the inverse.
+  friend Fixed operator/(uint16_t lhs, Fixed rhs) {
+    return Fixed(static_cast<int32_t>(lhs)) / rhs;
+  }
+
+  friend Fixed operator*(Fixed lhs, int32_t rhs) {
+    auto value = static_cast<T>(lhs.data_ * rhs);
+    assert((lhs.data_ * static_cast<int64_t>(rhs)) == value);
+    return Fixed::make(value);
+  }
+
+  friend Fixed operator*(int32_t lhs, Fixed rhs) {
+    return rhs * lhs;
+  }
+
+  friend Fixed operator/(Fixed lhs, int32_t rhs) {
+    assert(rhs);
+    auto value = static_cast<T>(lhs.data_ / rhs);
+    return Fixed::make(value);
+  }
+
+  // int32_t / Fixed: the integer is the dividend. It is converted to Fixed
+  // first; division is not commutative, so `rhs / lhs` would compute the inverse.
+  friend Fixed operator/(int32_t lhs, Fixed rhs) {
+    return Fixed(lhs) / rhs;
+  }
+
+  // Fixed * unsigned integer: scales the raw value by rhs (a true multiply,
+  // not a shift). The product is formed in 64 bits so a negative raw value is
+  // not reinterpreted as unsigned; the assert catches results that do not fit
+  // in T.
+  friend Fixed operator*(Fixed lhs, uint32_t rhs) {
+    auto wide = static_cast<int64_t>(lhs.data_) * static_cast<int64_t>(rhs);
+    auto value = static_cast<T>(wide);
+    assert(wide == value);
+    return Fixed::make(value);
+  }
+
+  friend Fixed operator*(uint32_t lhs, Fixed rhs) {
+    return rhs * lhs;
+  }
+
+  // Fixed / unsigned integer: divides the raw value by rhs (rhs must be
+  // non-zero). Both operands are widened to signed 64 bits first; dividing a
+  // signed raw value directly by a uint32_t would convert it to unsigned and
+  // give a wrong quotient for negative values.
+  friend Fixed operator/(Fixed lhs, uint32_t rhs) {
+    assert(rhs);
+    auto value = static_cast<T>(static_cast<int64_t>(lhs.data_) / static_cast<int64_t>(rhs));
+    return Fixed::make(value);
+  }
+
+  // uint32_t / Fixed: the integer is the dividend. It is converted to Fixed
+  // first; division is not commutative, so `rhs / lhs` would compute the inverse.
+  friend Fixed operator/(uint32_t lhs, Fixed rhs) {
+    return Fixed(lhs) / rhs;
+  }
+
+  friend Fixed operator<<(Fixed lhs, int32_t rhs) {
+    auto value = static_cast<T>(lhs.data_ << rhs);
+    assert((lhs.data_ << static_cast<int64_t>(rhs)) == value);
+    return Fixed::make(value);
+  }
+
+  friend Fixed operator>>(Fixed lhs, int32_t rhs) {
+    auto value = static_cast<T>(lhs.data_ >> rhs);
+    return Fixed::make(value);
+  }
+
+  friend Fixed operator<<(Fixed lhs, uint32_t rhs) {
+    auto value = static_cast<T>(lhs.data_ << rhs);
+    assert((lhs.data_ << static_cast<int64_t>(rhs)) == value);
+    return Fixed::make(value);
+  }
+
+  friend Fixed operator>>(Fixed lhs, uint32_t rhs) {
+    auto value = static_cast<T>(lhs.data_ >> rhs);
+    return Fixed::make(value);
+  }
+
+  static Fixed make(T value) {
+    Fixed ret;
+    ret.data_ = value;
+    return ret;
+  }
+
+  explicit operator int64_t() const {
+    return static_cast<int64_t>(data_ >> F);
+  }
+
+  explicit operator uint64_t() const {
+    return static_cast<uint64_t>(data_ >> F);
+  }
+
+  explicit operator int32_t() const {
+    return static_cast<int32_t>(data_ >> F);
+  }
+
+  explicit operator uint32_t() const {
+    return static_cast<uint32_t>(data_ >> F);
+  }
+
+  explicit operator int16_t() const {
+    return static_cast<int16_t>(data_ >> F);
+  }
+
+  explicit operator uint16_t() const {
+    return static_cast<uint16_t>(data_ >> F);
+  }
+
+  explicit operator int8_t() const {
+    return static_cast<int8_t>(data_ >> F);
+  }
+
+  explicit operator uint8_t() const {
+    return static_cast<uint8_t>(data_ >> F);
+  }
+
+  template <uint32_t F2, typename T2>
+  explicit operator Fixed<F2, T2>() const {
+    return Fixed<F2, T2>(*this);
+  }
+
+  explicit operator float() const {
+    return static_cast<float>(data_) / (static_cast<T>(1) << F);
+  }
+
+  T data() const {
+    return data_;
+  }
+
+private:
+  T data_;
+};
--- a/sim/common/simobject.h
+++ b/sim/common/simobject.h
@@ -5,10 +5,9 @@
 #include <memory>
 #include <vector>
 #include <list>
+#include <queue>
 #include <assert.h>

-namespace vortex {
-
 class SimObjectBase;

 ///////////////////////////////////////////////////////////////////////////////
@@ -59,32 +58,44 @@ protected:
 template <typename Pkt>
 class SimPort : public SimPortBase {
 public:
-  void send(const Pkt& pkt, uint64_t delay) const; 
+  void send(const Pkt& pkt, uint64_t delay) const;

-  bool read(Pkt* out) {
-    if (!valid_)
-      return false;
-    *out = data_;
-    valid_ = false;
-    return true;
+  void bind(SimPort<Pkt>* peer) {
+    this->connect(peer);
  }

+  void unbind() {    
+    this->disconnect();
+  }
+
+  bool empty() const {
+    return queue_.empty();
+  }
+
+  const Pkt& top() const {
+    return queue_.front();
+  }
+
+  Pkt& top() {
+    return queue_.front();
+  }
+
+  void pop() {
+    queue_.pop();
+  } 
+
 protected:
  SimPort(SimObjectBase* module)
    : SimPortBase(module)
-    , valid_(false)
  {}

-  void write(const Pkt& data) {
-    assert(!valid_);
-    data_  = data;
-    valid_ = true;
+  void push(const Pkt& data) {
+    queue_.push(data);
  }

  SimPort& operator=(const SimPort&) = delete;

-  Pkt data_;
-  bool valid_;
+  std::queue<Pkt> queue_;

  template <typename U> friend class SimPortEvent;
 };
@@ -94,15 +105,7 @@ protected:
 template <typename Pkt>
 class SlavePort : public SimPort<Pkt> {
 public:
-  SlavePort(SimObjectBase* module) : SimPort<Pkt>(module) {}
-
-  void bind(SlavePort<Pkt>* peer) {
-    this->connect(peer);
-  }
-
-  void unbind() {    
-    this->disconnect();
-  }
+  SlavePort(SimObjectBase* module) : SimPort<Pkt>(module) {}  

 protected:
  SlavePort& operator=(const SlavePort&) = delete;
@@ -115,18 +118,6 @@ class MasterPort : public SimPort<Pkt> {
 public:
  MasterPort(SimObjectBase* module) : SimPort<Pkt>(module) {}

-  void bind(SlavePort<Pkt>* peer) {
-    this->connect(peer);
-  }
-
-  void bind(MasterPort<Pkt>* peer) {
-    this->connect(peer);
-  }
-
-  void unbind() {    
-    this->disconnect();
-  }
-
 protected:
  MasterPort& operator=(const MasterPort&) = delete;
 };
@@ -194,7 +185,7 @@ public:
  {}
  
  void fire() const override {
-    const_cast<SimPort<Pkt>*>(port_)->write(pkt_);
+    const_cast<SimPort<Pkt>*>(port_)->push(pkt_);
  }

 private:  
@@ -382,6 +373,4 @@ template <typename T, typename Pkt>
 void SimObjectBase::schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay) {
  auto callback = std::bind(entry, obj, std::placeholders::_1);
  SimPlatform::instance().schedule(callback, pkt, delay);
-}
-
 }
--- a/sim/common/texturing.h
+++ b/sim/common/texturing.h
@@ -0,0 +1,221 @@
+#pragma once
+
+#include <cstdint>
+#include <cstdlib>
+#include <fixed.h>
+#include <bitmanip.h>
+
+enum class WrapMode {
+  Clamp,
+  Repeat,
+  Mirror,
+};
+
+enum class TexFormat {
+  R8G8B8A8,
+  R5G6B5,  
+  R4G4B4A4,
+  L8A8,
+  L8,  
+  A8,  
+};
+
+// Applies a texture wrap mode to a fixed-point coordinate and returns the
+// resulting raw fixed-point bits.
+//  - Clamp:  raw value limited to the range [0, MASK].
+//  - Repeat: keeps only the fractional bits (raw & MASK).
+//  - Mirror: returns the bitwise complement of the raw value when bit FRAC
+//            (the lowest integer bit) is set, otherwise the raw value itself;
+//            the result is not masked here.
+// Any other mode aborts the program.
+template <uint32_t F, typename T = int32_t>
+T Clamp(Fixed<F,T> fx, WrapMode mode) {
+  switch (mode) {
+  case WrapMode::Clamp:  return (fx.data() < 0) ? 0 : ((fx.data() > Fixed<F,T>::MASK) ? Fixed<F,T>::MASK : fx.data());
+  case WrapMode::Repeat: return (fx.data() & Fixed<F,T>::MASK);
+  case WrapMode::Mirror: return (bit_get(fx.data(), Fixed<F,T>::FRAC) ? ~fx.data() : fx.data());
+  default: 
+    std::abort();
+    return 0;    
+  }
+}
+
+// Returns the per-texel stride for a texture format: 4 for R8G8B8A8, 2 for
+// the R5G6B5 / R4G4B4A4 / L8A8 formats and 1 for L8 / A8 (presumably bytes
+// per texel - confirm against callers). Unknown formats abort.
+inline uint32_t Stride(TexFormat format) {
+  if (format == TexFormat::R8G8B8A8)
+    return 4;
+  if (format == TexFormat::R5G6B5
+   || format == TexFormat::R4G4B4A4
+   || format == TexFormat::L8A8)
+    return 2;
+  if (format == TexFormat::L8 || format == TexFormat::A8)
+    return 1;
+  std::abort();
+  return 0;
+}
+
+// Splits a texel into the two words (*lo, *hi) consumed by Lerp8888 and
+// Pack8888. Unknown formats abort.
+//  - R8G8B8A8: *lo receives bytes 0 and 2, *hi receives bytes 1 and 3, each
+//    placed in bits 0-7 and 16-23 (mask 0x00ff00ff).
+//  - R5G6B5 / R4G4B4A4: the texel is passed through unchanged in *lo.
+//  - L8A8 / L8 / A8: the texel is OR-ed with a shifted copy of itself and
+//    masked; *hi is 0.
+// NOTE(review): the L8 and A8 masks (0x07e0f81f, 0x0f0f0f0f) look like 5:6:5
+// and 4:4:4:4 bit-spreading masks - confirm they are attached to the intended
+// formats.
+inline void Unpack8888(TexFormat format, 
+                       uint32_t texel, 
+                       uint32_t* lo, 
+                       uint32_t* hi) {
+  switch (format) {
+  case TexFormat::R8G8B8A8: 
+    *lo = texel & 0x00ff00ff;
+    *hi = (texel >> 8) & 0x00ff00ff;
+    break;
+  case TexFormat::R5G6B5:
+  case TexFormat::R4G4B4A4:
+    *lo = texel;
+    *hi= 0;
+    break;
+  case TexFormat::L8A8:
+    *lo = (texel | (texel << 8)) & 0x00ff00ff;
+    *hi = 0;
+    break;
+  case TexFormat::L8:
+    *lo = (texel | (texel << 16)) & 0x07e0f81f;
+    *hi = 0;
+    break;
+  case TexFormat::A8:
+    *lo = (texel | (texel << 12)) & 0x0f0f0f0f;
+    *hi = 0;
+    break;
+  default: 
+    std::abort();
+  }
+}
+
+// Recombines a (lo, hi) word pair into a single texel value for `format`.
+// Unknown formats abort.
+//  - R8G8B8A8: re-interleaves the words as (hi << 8) | lo.
+//  - R5G6B5 / R4G4B4A4: returns lo unchanged.
+//  - L8A8 / L8 / A8: folds lo onto a right-shifted copy of itself and keeps
+//    the low 16 bits; hi is ignored.
+inline uint32_t Pack8888(TexFormat format, uint32_t lo, uint32_t hi) {
+  switch (format) {
+  case TexFormat::R8G8B8A8: 
+    return (hi << 8) | lo;
+  case TexFormat::R5G6B5:
+  case TexFormat::R4G4B4A4:
+    return lo;
+  case TexFormat::L8A8:
+    return (lo | (lo >> 8)) & 0xffff;
+  case TexFormat::L8:
+    return (lo | (lo >> 16)) & 0xffff;
+  case TexFormat::A8:
+    return (lo | (lo >> 12)) & 0xffff;  
+  default: 
+    std::abort();
+    return 0;
+  }
+}
+
+// Blends colour A (al, ah) towards colour B (bl, bh). Each word carries two
+// channels in bits 0-7 and 16-23, so one multiply handles both of them.
+// Per word the result is a + (((b - a) * frac) >> 8), masked to 0x00ff00ff;
+// frac == 0 returns A.
+inline void Lerp8888(uint32_t al, 
+                     uint32_t ah, 
+                     uint32_t bl, 
+                     uint32_t bh, 
+                     uint32_t frac, 
+                     uint32_t* lo, 
+                     uint32_t* hi) {
+    constexpr uint32_t kChannelMask = 0x00ff00ff;
+    const uint32_t step_lo = ((bl - al) * frac) >> 8;
+    const uint32_t step_hi = ((bh - ah) * frac) >> 8;
+    *lo = (al + step_lo) & kChannelMask;
+    *hi = (ah + step_hi) & kChannelMask;
+}
+
+// Computes the four texel indices and the two blend weights for a bilinear
+// fetch at coordinate (fu, fv).
+//  - log_width / log_height: log2 of the texture dimensions.
+//  - wrapu / wrapv: wrap mode applied to each axis via Clamp().
+//  - addr00..addr11: outputs, texel index x + (y << log_width) for the
+//    (x0,y0), (x1,y0), (x0,y1) and (x1,y1) samples respectively.
+//  - alpha / beta: outputs, 8-bit fractional position along u and v, taken
+//    from the lower sample coordinate.
+template <uint32_t F, typename T = int32_t>
+void TexAddressLinear(Fixed<F,T> fu, 
+                      Fixed<F,T> fv, 
+                      uint32_t log_width,
+                      uint32_t log_height,
+                      WrapMode wrapu,
+                      WrapMode wrapv,
+                      uint32_t* addr00,
+                      uint32_t* addr01,
+                      uint32_t* addr10,
+                      uint32_t* addr11,
+                      uint32_t* alpha,
+                      uint32_t* beta
+) {
+  // Half of one texel step along each axis (one texel is ONE >> log_size).
+  auto delta_x = Fixed<F,T>::make(Fixed<F,T>::HALF >> log_width);
+  auto delta_y = Fixed<F,T>::make(Fixed<F,T>::HALF >> log_height);
+
+  // Sample positions half a texel either side of the coordinate, wrapped.
+  uint32_t u0 = Clamp(fu - delta_x, wrapu);    
+  uint32_t u1 = Clamp(fu + delta_x, wrapu);
+  uint32_t v0 = Clamp(fv - delta_y, wrapv);     
+  uint32_t v1 = Clamp(fv + delta_y, wrapv);
+
+  // Right shift that turns a wrapped coordinate into an integer texel index.
+  uint32_t shift_u = (Fixed<F,T>::FRAC - log_width);
+  uint32_t shift_v = (Fixed<F,T>::FRAC - log_height);
+
+  // Lower sample position kept with 8 extra fractional bits for the weights.
+  uint32_t x0s = (u0 << 8) >> shift_u;
+  uint32_t y0s = (v0 << 8) >> shift_v;
+
+  uint32_t x0 = x0s >> 8;
+  uint32_t y0 = y0s >> 8;
+  uint32_t x1 = u1 >> shift_u;
+  uint32_t y1 = v1 >> shift_v;
+
+  *addr00 = x0 + (y0 << log_width);
+  *addr01 = x1 + (y0 << log_width);
+  *addr10 = x0 + (y1 << log_width);
+  *addr11 = x1 + (y1 << log_width);
+
+  *alpha  = x0s & 0xff;
+  *beta   = y0s & 0xff;
+
+  //printf("*** fu=0x%x, fv=0x%x, u0=0x%x, u1=0x%x, v0=0x%x, v1=0x%x, x0=0x%x, x1=0x%x, y0=0x%x, y1=0x%x, addr00=0x%x, addr01=0x%x, addr10=0x%x, addr11=0x%x\n", fu.data(), fv.data(), u0, u1, v0, v1, x0, x1, y0, y1, *addr00, *addr01, *addr10, *addr11);
+}
+
+// Computes the single texel index for a point-sampled fetch at (fu, fv).
+//  - log_width / log_height: log2 of the texture dimensions.
+//  - wrapu / wrapv: wrap mode applied to each axis via Clamp().
+//  - addr: output, texel index x + (y << log_width).
+template <uint32_t F, typename T = int32_t>
+void TexAddressPoint(Fixed<F,T> fu, 
+                     Fixed<F,T> fv, 
+                     uint32_t log_width,
+                     uint32_t log_height,
+                     WrapMode wrapu,
+                     WrapMode wrapv,
+                     uint32_t* addr
+) {
+  uint32_t u = Clamp(fu, wrapu);
+  uint32_t v = Clamp(fv, wrapv);
+  
+  // Shift the wrapped coordinates down to integer texel indices.
+  uint32_t x = u >> (Fixed<F,T>::FRAC - log_width);
+  uint32_t y = v >> (Fixed<F,T>::FRAC - log_height);
+  
+  *addr = x + (y << log_width);
+
+  //printf("*** fu=0x%x, fv=0x%x, u=0x%x, v=0x%x, x=0x%x, y=0x%x, addr=0x%x\n", fu.data(), fv.data(), u, v, x, y, *addr);
+}
+
+// Bilinearly filters four texels and returns the blended colour.
+// texel00/texel01 are blended with weight `alpha`, texel10/texel11 likewise,
+// and the two intermediate results are then blended with weight `beta`.
+// Texels are unpacked according to `format`; the result is always packed
+// with the R8G8B8A8 layout.
+inline uint32_t TexFilterLinear(
+  TexFormat format,
+  uint32_t texel00,  
+  uint32_t texel01,
+  uint32_t texel10,
+  uint32_t texel11,
+  uint32_t alpha,
+  uint32_t beta
+) {
+  // First pair: texel00 -> texel01.
+  uint32_t c01l, c01h;
+  {
+    uint32_t c0l, c0h;  
+    uint32_t c1l, c1h;
+    Unpack8888(format, texel00, &c0l, &c0h);
+    Unpack8888(format, texel01, &c1l, &c1h);
+    Lerp8888(c0l, c0h, c1l, c1h, alpha, &c01l, &c01h);
+  }
+
+  // Second pair: texel10 -> texel11.
+  uint32_t c23l, c23h;
+  {
+    uint32_t c2l, c2h;  
+    uint32_t c3l, c3h;
+    Unpack8888(format, texel10, &c2l, &c2h);
+    Unpack8888(format, texel11, &c3l, &c3h);
+    Lerp8888(c2l, c2h, c3l, c3h, alpha, &c23l, &c23h);
+  }
+
+  // Blend the two intermediate colours with beta.
+  uint32_t cl, ch;
+  Lerp8888(c01l, c01h, c23l, c23h, beta, &cl, &ch);
+  uint32_t color = Pack8888(TexFormat::R8G8B8A8, cl, ch);
+
+  //printf("*** texel00=0x%x, texel01=0x%x, texel10=0x%x, texel11=0x%x, color=0x%x\n", texel00, texel01, texel10, texel11, color);
+
+  return color;
+}
+
+// Point filter: unpacks a single texel according to `format` and returns it
+// re-packed with the R8G8B8A8 layout; no blending is involved.
+inline uint32_t TexFilterPoint(TexFormat format, uint32_t texel) {
+  uint32_t cl, ch;  
+  Unpack8888(format, texel, &cl, &ch);
+  uint32_t color = Pack8888(TexFormat::R8G8B8A8, cl, ch);
+
+  //printf("*** texel=0x%x, color=0x%x\n", texel, color);
+
+  return color;
+}
--- a/sim/common/util.h
+++ b/sim/common/util.h
@@ -3,85 +3,12 @@
 #include <cstdint>
 #include <algorithm>
 #include <assert.h>
+#include <bitmanip.h>

 template <typename... Args>
 void unused(Args&&...) {}

 #define __unused(...) unused(__VA_ARGS__)

-constexpr uint32_t count_leading_zeros(uint32_t value) {
-  return value ? __builtin_clz(value) : 32;
-}
-
-constexpr uint32_t count_trailing_zeros(uint32_t value) {
-  return value ? __builtin_ctz(value) : 32;
-}
-
-constexpr bool ispow2(uint32_t value) {
-  return value && !(value & (value - 1));
-}
-
-constexpr uint32_t log2ceil(uint32_t value) {
-  return 32 - count_leading_zeros(value - 1);
-}
-
-inline unsigned log2up(uint32_t value) {
-  return std::max<uint32_t>(1, log2ceil(value));
-}
-
-constexpr unsigned log2floor(uint32_t value) {
-  return 31 - count_leading_zeros(value);
-}
-
-constexpr unsigned ceil2(uint32_t value) {
-  return 32 - count_leading_zeros(value);
-}
-
-inline uint64_t bit_clr(uint64_t bits, uint32_t index) {
-    assert(index <= 63);
-    return bits & ~(1ull << index);
-}
-
-inline uint64_t bit_set(uint64_t bits, uint32_t index) {
-    assert(index <= 63);
-    return bits | (1ull << index);
-}
-
-inline bool bit_get(uint64_t bits, uint32_t index) {
-    assert(index <= 63);
-    return (bits >> index) & 0x1;
-}
-
-inline uint64_t bit_clrw(uint64_t bits, uint32_t start, uint32_t end) {
-    assert(end >= start);
-    assert(end <= 63);
-    uint32_t shift = 63 - end;
-    uint64_t mask = (0xffffffffffffffff << (shift + start)) >> shift;
-    return bits & ~mask;
-}
-
-inline uint64_t bit_setw(uint64_t bits, uint32_t start, uint32_t end, uint64_t value) {
-    assert(end >= start);
-    assert(end <= 63);
-    uint32_t shift = 63 - end;
-    uint64_t dirty = (value << (shift + start)) >> shift;
-    return bit_clrw(bits, start, end) | dirty;
-}
-
-inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) {
-    assert(end >= start);
-    assert(end <= 63);
-    uint32_t shift = 63 - end;
-    return (bits << shift) >> (shift + start);
-}
-
-// Apply integer sign extension
-inline uint32_t sext32(uint32_t word, uint32_t width) {
-  assert(width > 1);
-  assert(width <= 32);
-  uint32_t mask = (1 << width) - 1;
-  return ((word >> (width - 1)) & 0x1) ? (word | ~mask) : word;
-}
-
 // return file extension
 const char* fileExtension(const char* filepath);
--- a/sim/rtlsim/Makefile
+++ b/sim/rtlsim/Makefile
@@ -23,8 +23,6 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
 DBG_TRACE_FLAGS += -DDBG_TRACE_TEX

 DBG_FLAGS += $(DBG_TRACE_FLAGS)
-DBG_FLAGS += -DDBG_CACHE_REQ_INFO
-DBG_FLAGS += -DVCD_OUTPUT

 FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
 TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
@@ -51,10 +49,17 @@ VL_FLAGS += $(RTL_INCLUDE)
 VL_FLAGS += $(CONFIGS)
 CXXFLAGS += $(CONFIGS)

+# Enable Verilator multithreaded simulation
+#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
+#VL_FLAGS += --threads $(THREADS)
+
+# Enable VCD trace
+VCD_TRACE = -DVCD_OUTPUT
+
 # Debugigng
 ifdef DEBUG
-	VL_FLAGS += -DVCD_OUTPUT --trace --trace-structs $(DBG_FLAGS)
-	CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS)
+	VL_FLAGS += $(VCD_TRACE) --trace --trace-structs $(DBG_FLAGS)
+	CXXFLAGS += -g -O0 $(VCD_TRACE) $(DBG_FLAGS)
 else    
 	VL_FLAGS += -DNDEBUG
 	CXXFLAGS += -O2 -DNDEBUG
--- a/sim/simX/Makefile
+++ b/sim/simX/Makefile
@@ -11,7 +11,7 @@ LDFLAGS += ../common/softfloat/build/Linux-x86_64-GCC/softfloat.a
 TOP = vx_cache_sim

 SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp 
-SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp processor.cpp main.cpp
+SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp tex_unit.cpp processor.cpp main.cpp

 OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS)))
 VPATH := $(sort $(dir $(SRCS)))
--- a/sim/simX/cache.cpp
+++ b/sim/simX/cache.cpp
@@ -13,6 +13,7 @@ struct params_t {
    uint32_t sets_per_bank;
    uint32_t blocks_per_set;    
    uint32_t words_per_block;
+    uint32_t log2_num_inputs;

    uint32_t word_select_addr_start;
    uint32_t word_select_addr_end;
@@ -31,8 +32,10 @@ struct params_t {
        uint32_t offset_bits = config.B - config.W;
        uint32_t log2_bank_size  = config.C - bank_bits;
        uint32_t index_bits  = log2_bank_size - (config.B << config.A);        
-        assert(log2_bank_size >= config.B);        
-        
+        assert(log2_bank_size >= config.B);   
+
+        this->log2_num_inputs = log2ceil(config.num_inputs);
+
        this->words_per_block = 1 << offset_bits;
        this->blocks_per_set  = 1 << config.A;
        this->sets_per_bank   = 1 << index_bits;
@@ -104,7 +107,7 @@ struct set_t {
 struct bank_req_info_t {
    bool     valid;    
    uint32_t req_id;
-    uint32_t req_tag;
+    uint64_t req_tag;
 };

 struct bank_req_t {
@@ -194,7 +197,7 @@ public:
        return root_entry;
    }

-    bool try_pop(bank_req_t* out) {
+    bool pop(bank_req_t* out) {
        for (auto& entry : entries_) {
            if (entry.valid && entry.mshr_replay) {
                *out = entry;
@@ -208,16 +211,13 @@ public:
 };

 struct bank_t {
-    std::vector<set_t>      sets;    
-    MSHR                    mshr;
-    std::queue<bank_req_t>  stall_buffer;
-    bank_req_t              active_req;
+    std::vector<set_t>  sets;    
+    MSHR                mshr;

    bank_t(const CacheConfig& config, 
           const params_t& params) 
        : sets(params.sets_per_bank, params.blocks_per_set)
        , mshr(config.mshr_size)
-        , active_req(config.ports_per_bank) 
    {}
 };

@@ -229,8 +229,8 @@ private:
    CacheConfig config_;
    params_t params_;
    std::vector<bank_t> banks_;
-    std::vector<std::queue<uint32_t>> core_rsps_;
-    Switch<MemReq, MemRsp>::Ptr mem_switch_;
+    Switch<MemReq, MemRsp>::Ptr mem_switch_;    
+    Switch<MemReq, MemRsp>::Ptr bypass_switch_;
    std::vector<MasterPort<MemReq>> mem_req_ports_;
    std::vector<SlavePort<MemRsp>>  mem_rsp_ports_;

@@ -240,241 +240,270 @@ public:
        , config_(config)
        , params_(config)
        , banks_(config.num_banks, {config, params_})
-        , core_rsps_(config.num_inputs)
        , mem_req_ports_(config.num_banks, simobject)
        , mem_rsp_ports_(config.num_banks, simobject)
    {
+        bypass_switch_ = Switch<MemReq, MemRsp>::Create("bypass_arb", ArbiterType::Priority, 2);
+        bypass_switch_->ReqOut.bind(&simobject->MemReqPort);
+        simobject->MemRspPort.bind(&bypass_switch_->RspIn);
+
        if (config.num_banks > 1) {
            mem_switch_ = Switch<MemReq, MemRsp>::Create("mem_arb", ArbiterType::RoundRobin, config.num_banks);
            for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
                mem_req_ports_.at(i).bind(&mem_switch_->ReqIn.at(i));
                mem_switch_->RspOut.at(i).bind(&mem_rsp_ports_.at(i));
            }    
-            mem_switch_->ReqOut.bind(&simobject->MemReqPort);
-            simobject->MemRspPort.bind(&mem_switch_->RspIn);
+            mem_switch_->ReqOut.bind(&bypass_switch_->ReqIn.at(0));
+            bypass_switch_->RspOut.at(0).bind(&mem_switch_->RspIn);
        } else {
-            mem_req_ports_.at(0).bind(&simobject->MemReqPort);
-            simobject->MemRspPort.bind(&mem_rsp_ports_.at(0));
+            mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0));
+            bypass_switch_->RspOut.at(0).bind(&mem_rsp_ports_.at(0));
        }
    }

    void step(uint64_t /*cycle*/) {
-        // process core response
-        for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
-            auto& core_rsp = core_rsps_.at(req_id);
-            if (!core_rsp.empty()) {
-                simobject_->CoreRspPorts.at(req_id).send(MemRsp{core_rsp.front()}, config_.latency);
-                core_rsp.pop();
-            }
+        // handle bypass responses
+        auto& bypass_port = bypass_switch_->RspOut.at(1);            
+        if (!bypass_port.empty()) {
+            auto& mem_rsp = bypass_port.top();
+            uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1);                
+            uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs;
+            MemRsp core_rsp(tag);
+            simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency);
+            bypass_port.pop();
        }

-        for (auto& bank : banks_) {
-            auto& active_req = bank.active_req;
+        std::vector<bank_req_t> pipeline_reqs(config_.num_banks, config_.ports_per_bank);

-            // try chedule mshr replay
-            if (!active_req.valid) {
-                bank.mshr.try_pop(&active_req);
-            }
-
-            // try schedule stall queue if MSHR has space
-            if (!active_req.valid 
-             && !bank.stall_buffer.empty()
-             && !bank.mshr.full()) {            
-                active_req = bank.stall_buffer.front();
-                bank.stall_buffer.pop();
-            }
-        }
+        // handle MSHR replay
+        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
+            auto& bank = banks_.at(bank_id);
+            auto& pipeline_req = pipeline_reqs.at(bank_id);
+            bank.mshr.pop(&pipeline_req);
+        }       

        // handle memory fills
-        for (uint32_t i = 0, n = config_.num_banks; i < n; ++i) {
-            MemRsp mem_rsp;
-            if (mem_rsp_ports_.at(i).read(&mem_rsp)) {
-                this->processMemoryFill(i, mem_rsp.tag);
+        std::vector<bool> pending_fill_req(config_.num_banks, false);
+        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
+            auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
+            if (!mem_rsp_port.empty()) {
+                auto& mem_rsp = mem_rsp_port.top();
+                this->processMemoryFill(bank_id, mem_rsp.tag);                
+                pending_fill_req.at(bank_id) = true;
+                mem_rsp_port.pop();
            }
        }
        
        // handle incoming core requests
-        for (uint32_t i = 0, n = config_.num_inputs; i < n; ++i) {
-            MemReq core_req;
-            if (!simobject_->CoreReqPorts.at(i).read(&core_req))
+        for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
+            auto& core_req_port = simobject_->CoreReqPorts.at(req_id);            
+            if (core_req_port.empty())
                continue;

-            auto bank_id   = params_.addr_bank_id(core_req.addr);
-            auto set_id    = params_.addr_set_id(core_req.addr);
-            auto tag       = params_.addr_tag(core_req.addr);
-            auto port_id   = i % config_.ports_per_bank;
+            auto& core_req = core_req_port.top();
+
+            // check cache bypassing
+            if (core_req.is_io) {
+                // send IO request
+                this->processIORequest(core_req, req_id);
+
+                // remove request
+                core_req_port.pop();
+                continue;
+            }
+
+            auto bank_id = params_.addr_bank_id(core_req.addr);
+            auto set_id  = params_.addr_set_id(core_req.addr);
+            auto tag     = params_.addr_tag(core_req.addr);
+            auto port_id = req_id % config_.ports_per_bank;
            
-            // create abnk request
+            // create bank request
            bank_req_t bank_req(config_.ports_per_bank);
            bank_req.valid = true;
            bank_req.write = core_req.write;
            bank_req.mshr_replay = false;
            bank_req.tag = tag;            
            bank_req.set_id = set_id;       
-            bank_req.infos.at(port_id) = {true, i, core_req.tag};
+            bank_req.infos.at(port_id) = {true, req_id, core_req.tag};

-            auto& bank = banks_.at(bank_id);
-            
-            // check MSHR capacity
-            if (bank.mshr.full()) {
-                // add to stall buffer
-                bank.stall_buffer.emplace(bank_req);
+            auto& bank = banks_.at(bank_id);            
+            auto& pipeline_req = pipeline_reqs.at(bank_id);
+
+            // check pending MSHR replay
+            if (pipeline_req.valid 
+             && pipeline_req.mshr_replay) {
+                 // stall
+                continue;
+            }    
+
+            // check pending fill request
+            if (pending_fill_req.at(bank_id)) {
+                // stall
                continue;
            }
-
-            auto& active_req = bank.active_req;
-
-            // check pending MSHR request
-            if (active_req.valid 
-             && active_req.mshr_replay) {
-                // add to stall buffer
-                bank.stall_buffer.emplace(bank_req);
+            
+            // check MSHR capacity if read or writeback
+            if ((!core_req.write || !config_.write_through)
+             && bank.mshr.full()) {
+                 // stall
                continue;
-            }        
+            }    

            // check bank conflicts
-            if (active_req.valid) {
+            if (pipeline_req.valid) {
                // check port conflict
-                if (active_req.write != core_req.write
-                 || active_req.set_id != set_id
-                 || active_req.tag != tag
-                 || active_req.infos[port_id].valid) {
-                    // add to stall buffer
-                    bank.stall_buffer.emplace(bank_req);
+                if (pipeline_req.write != core_req.write
+                 || pipeline_req.set_id != set_id
+                 || pipeline_req.tag != tag
+                 || pipeline_req.infos[port_id].valid) {
+                    // stall
                    continue;
                }
                // update pending request infos
-                active_req.infos[port_id] = bank_req.infos[port_id];
+                pipeline_req.infos[port_id] = bank_req.infos[port_id];
            } else {
                // schedule new request
-                active_req = bank_req;
+                pipeline_req = bank_req;
            }
+            // remove request
+            core_req_port.pop();
        }
    
-        // process active request
-        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
-            this->processBankRequest(bank_id);
+        // process the pipeline requests of all banks
+        this->processBankRequest(pipeline_reqs);
+    }
+    
+    void processIORequest(const MemReq& core_req, uint32_t req_id) {
+        {
+            MemReq mem_req(core_req);
+            mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id;
+            bypass_switch_->ReqIn.at(1).send(mem_req, 1);
+        }
+
+        if (core_req.write && config_.write_reponse) {
+            simobject_->CoreRspPorts.at(req_id).send(MemRsp{core_req.tag}, 1);            
        }
    }

    void processMemoryFill(uint32_t bank_id, uint32_t mshr_id) {
        // update block
-        auto& bank = banks_.at(bank_id);
-        auto& root_entry = bank.mshr.replay(mshr_id);
-        auto& set   = bank.sets.at(root_entry.set_id);
-        auto& block = set.blocks.at(root_entry.block_id);
+        auto& bank  = banks_.at(bank_id);
+        auto& entry = bank.mshr.replay(mshr_id);
+        auto& set   = bank.sets.at(entry.set_id);
+        auto& block = set.blocks.at(entry.block_id);
        block.valid = true;
-        block.tag   = root_entry.tag;
+        block.tag   = entry.tag;
    }

-    void processBankRequest(uint32_t bank_id) {
-        auto& bank = banks_.at(bank_id);
-        auto& active_req = bank.active_req;
-        if (!active_req.valid)
-            return;
+    void processBankRequest(const std::vector<bank_req_t>& pipeline_reqs) {
+        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
+            auto& pipeline_req = pipeline_reqs.at(bank_id);
+            if (!pipeline_req.valid)
+                continue;

-        active_req.valid = false;
+            auto& bank = banks_.at(bank_id);
+            auto& set = bank.sets.at(pipeline_req.set_id);

-        auto& set = bank.sets.at(active_req.set_id);
-
-        if (active_req.mshr_replay) {
-            // send core response
-            for (auto& info : active_req.infos) {
-                core_rsps_.at(info.req_id).emplace(info.req_tag);            
-            }
-        } else {        
-            bool hit = false;
-            bool found_free_block = false;            
-            int hit_block_id = 0;
-            int repl_block_id = 0;            
-            uint32_t max_cnt = 0;
-            
-            for (int i = 0, n = set.blocks.size(); i < n; ++i) {
-                auto& block = set.blocks.at(i);
-                if (block.valid) {
-                    if (block.tag == active_req.tag) {
-                        block.lru_ctr = 0;                        
-                        hit_block_id = i;
-                        hit = true;
-                    } else {
-                        ++block.lru_ctr;
-                    }
-                    if (max_cnt < block.lru_ctr) {
-                        max_cnt = block.lru_ctr;
+            if (pipeline_req.mshr_replay) {
+                // send core response
+                for (auto& info : pipeline_req.infos) {
+                    simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency);           
+                }
+            } else {        
+                bool hit = false;
+                bool found_free_block = false;            
+                int hit_block_id = 0;
+                int repl_block_id = 0;            
+                uint32_t max_cnt = 0;
+                
+                for (int i = 0, n = set.blocks.size(); i < n; ++i) {
+                    auto& block = set.blocks.at(i);
+                    if (block.valid) {
+                        if (block.tag == pipeline_req.tag) {
+                            block.lru_ctr = 0;                        
+                            hit_block_id = i;
+                            hit = true;
+                        } else {
+                            ++block.lru_ctr;
+                        }
+                        if (max_cnt < block.lru_ctr) {
+                            max_cnt = block.lru_ctr;
+                            repl_block_id = i;
+                        }
+                    } else {                    
+                        found_free_block = true;
                        repl_block_id = i;
                    }
-                } else {                    
-                    found_free_block = true;
-                    repl_block_id = i;
-                }
-            }
-
-            if (hit) {     
-                //
-                // MISS handling   
-                //                
-                if (active_req.write) {
-                    // handle write hit
-                    auto& hit_block = set.blocks.at(hit_block_id);
-                    if (config_.write_through) {
-                        // forward write request to memory
-                        MemReq mem_req;
-                        mem_req.addr  = params_.mem_addr(bank_id, active_req.set_id, hit_block.tag);
-                        mem_req.write = true;
-                        mem_req.tag   = 0;
-                        mem_req_ports_.at(bank_id).send(mem_req, 1);
-                    } else {
-                        // mark block as dirty
-                        hit_block.dirty = true;
-                    }
-                }
-                // send core response
-                for (auto& info : active_req.infos) {
-                    core_rsps_.at(info.req_id).emplace(info.req_tag);            
-                }
-            } else {     
-                //
-                // MISS handling   
-                //                 
-                if (!found_free_block && !config_.write_through) {
-                     // write back dirty block
-                    auto& repl_block = set.blocks.at(repl_block_id);
-                    if (repl_block.dirty) {                       
-                        MemReq mem_req;
-                        mem_req.addr  = params_.mem_addr(bank_id, active_req.set_id, repl_block.tag);
-                        mem_req.write = true;
-                        mem_req.tag   = 0;
-                        mem_req_ports_.at(bank_id).send(mem_req, 1);
-                    }
                }

-                if (active_req.write && config_.write_through) {
-                    // forward write request to memory
-                    {
-                        MemReq mem_req;
-                        mem_req.addr  = params_.mem_addr(bank_id, active_req.set_id, active_req.tag);
-                        mem_req.write = true;
-                        mem_req.tag   = 0;
-                        mem_req_ports_.at(bank_id).send(mem_req, 1);
+                if (hit) {     
+                    //
+                    // HIT handling
+                    //                
+                    if (pipeline_req.write) {
+                        // handle write hit
+                        auto& hit_block = set.blocks.at(hit_block_id);
+                        if (config_.write_through) {
+                            // forward write request to memory
+                            MemReq mem_req;
+                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, hit_block.tag);
+                            mem_req.write = true;
+                            mem_req_ports_.at(bank_id).send(mem_req, 1);
+                        } else {
+                            // mark block as dirty
+                            hit_block.dirty = true;
+                        }
                    }
                    // send core response
-                    for (auto& info : active_req.infos) {
-                        core_rsps_.at(info.req_id).emplace(info.req_tag);            
+                    if (!pipeline_req.write || config_.write_reponse) {
+                        for (auto& info : pipeline_req.infos) {          
+                            simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency);
+                        }
+                    }
+                } else {     
+                    //
+                    // MISS handling   
+                    //                 
+                    if (!found_free_block && !config_.write_through) {
+                        // write back dirty block
+                        auto& repl_block = set.blocks.at(repl_block_id);
+                        if (repl_block.dirty) {                       
+                            MemReq mem_req;
+                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, repl_block.tag);
+                            mem_req.write = true;
+                            mem_req_ports_.at(bank_id).send(mem_req, 1);
+                        }
                    }
-                } else {
-                    // lookup
-                    int pending = bank.mshr.lookup(active_req);

-                    // allocate MSHR
-                    int mshr_id = bank.mshr.allocate(active_req, repl_block_id);
-                    
-                    // send fill request
-                    if (pending == -1) {
-                        MemReq mem_req;
-                        mem_req.addr  = params_.mem_addr(bank_id, active_req.set_id, active_req.tag);
-                        mem_req.write = active_req.write;
-                        mem_req.tag   = mshr_id;
-                        mem_req_ports_.at(bank_id).send(mem_req, 1);
+                    if (pipeline_req.write && config_.write_through) {
+                        // forward write request to memory
+                        {
+                            MemReq mem_req;
+                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
+                            mem_req.write = true;
+                            mem_req_ports_.at(bank_id).send(mem_req, 1);
+                        }
+                        // send core response
+                        if (config_.write_reponse) {
+                            for (auto& info : pipeline_req.infos) {            
+                                simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency);
+                            }
+                        }
+                    } else {
+                        // MSHR lookup
+                        int pending = bank.mshr.lookup(pipeline_req);
+
+                        // allocate MSHR
+                        int mshr_id = bank.mshr.allocate(pipeline_req, repl_block_id);
+                        
+                        // send fill request
+                        if (pending == -1) {
+                            MemReq mem_req;
+                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
+                            mem_req.write = pipeline_req.write;
+                            mem_req.tag   = mshr_id;
+                            mem_req_ports_.at(bank_id).send(mem_req, 1);
+                        }
                    }
                }
            }
--- a/sim/simX/cache.h
+++ b/sim/simX/cache.h
@@ -14,7 +14,8 @@ struct CacheConfig {
    uint8_t num_banks;      // number of banks
    uint8_t ports_per_bank; // number of ports per bank
    uint8_t num_inputs;     // number of inputs
-    bool    write_through;  // is write-through cache
+    bool    write_through;  // is write-through
+    bool    write_reponse;  // enable write response
    uint16_t victim_size;   // victim cache size
    uint16_t mshr_size;     // MSHR buffer size
    uint8_t latency;        // pipeline latency 
--- a/sim/simX/constants.h
+++ b/sim/simX/constants.h
@@ -10,11 +10,7 @@ namespace vortex {

 struct Constants {

-static constexpr uint32_t CORE_TO_DCACHE_DELAY = 1 + SM_ENABLE;
-static constexpr uint32_t CORE_TO_ICACHE_DELAY = 1;
-
-static constexpr uint32_t ICACHE_TO_MEM_DELAY = 2;
-static constexpr uint32_t DCACHE_TO_MEM_DELAY = 2;
+static constexpr uint32_t SMEM_DELAY = 1 + SM_ENABLE;

 };

--- a/sim/simX/core.cpp
+++ b/sim/simX/core.cpp
@@ -19,6 +19,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
    , decoder_(arch)
    , mmu_(0, arch.wsize(), true)
    , shared_mem_(4096)
+    , tex_units_(NUM_TEX_UNITS, this)
    , warps_(arch.num_warps())
    , barriers_(arch.num_barriers(), 0)
    , csrs_(arch.num_csrs(), 0)
@@ -35,7 +36,8 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
        1,                      // number of banks
        1,                      // number of ports
        1,                      // request size   
-        true,                   // write-throught
+        true,                   // write-through
+        false,                  // write response
        0,                      // victim size
        NUM_WARPS,              // mshr
        2,                      // pipeline latency
@@ -49,12 +51,14 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
        DCACHE_NUM_BANKS,       // number of banks
        DCACHE_NUM_PORTS,       // number of ports
        (uint8_t)arch.num_threads(), // request size   
-        true,                   // write-throught
+        true,                   // write-through
+        false,                  // write response
        0,                      // victim size
        DCACHE_MSHR_SIZE,       // mshr
        2,                      // pipeline latency
      }))
-    , l1_mem_switch_(Switch<MemReq, MemRsp>::Create("l1_arb", ArbiterType::Priority, 2))
+    , l1_mem_switch_(Switch<MemReq, MemRsp>::Create("l1_arb", ArbiterType::Priority, 2)) 
+    , dcache_switch_(arch.num_threads())
    , fetch_stage_("fetch")
    , decode_stage_("decode")
    , issue_stage_("issue")
@@ -65,10 +69,9 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
    , last_schedule_wid_(0)
    , issued_instrs_(0)
    , committed_instrs_(0)
+    , ecall_(false)
    , ebreak_(false)   
    , stats_insts_(0)
-    , stats_loads_(0)
-    , stats_stores_(0)
    , MemRspPort(this)
    , MemReqPort(this)    
 {  
@@ -92,6 +95,18 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
  this->MemRspPort.bind(&l1_mem_switch_->RspIn);
  l1_mem_switch_->ReqOut.bind(&this->MemReqPort);

+  // lsu/tex switch
+  for (uint32_t i = 0, n = arch.num_threads(); i < n; ++i) {
+    auto& sw = dcache_switch_.at(i);
+#ifdef EXT_TEX_ENABLE
+    sw = Switch<MemReq, MemRsp>::Create("lsu_arb", ArbiterType::Priority, 2);
+#else
+    sw = Switch<MemReq, MemRsp>::Create("lsu_arb", ArbiterType::Priority, 1);
+#endif        
+    sw->ReqOut.bind(&dcache_->CoreReqPorts.at(i));
+    dcache_->CoreRspPorts.at(i).bind(&sw->RspIn);
+  }
+
  // activate warp0
  warps_.at(0)->setTmask(0, true);
 }
@@ -147,44 +162,41 @@ void Core::warp_scheduler(uint64_t cycle) {
  auto& warp = warps_.at(scheduled_warp);  
  stats_insts_ += warp->getActiveThreads();
  
-  pipeline_state_t state;
-  state.clear();
-  state.id = (issued_instrs_++ * arch_.num_cores()) + id_;
+  auto trace = new pipeline_trace_t((issued_instrs_++ * arch_.num_cores()) + id_, arch_);

-  warp->eval(&state);
+  warp->eval(trace);

-  DT(3, cycle, "pipeline-schedule: " << state);
+  DT(3, cycle, "pipeline-schedule: " << *trace);

  // advance to fetch stage  
-  fetch_stage_.push(state);
+  fetch_stage_.push(trace);
 }

 void Core::fetch(uint64_t cycle) {
  // handle icache reponse
-  {
-    MemRsp mem_rsp;
-    if (icache_->CoreRspPorts.at(0).read(&mem_rsp)){
-      pipeline_state_t state;
-      pending_icache_.remove(mem_rsp.tag, &state);
-      auto latency = (SimPlatform::instance().cycles() - state.icache_latency);
-      state.icache_latency = latency;
-      decode_stage_.push(state);
-      DT(3, cycle, "icache-rsp: addr=" << std::hex << state.PC << ", tag=" << mem_rsp.tag << ", " << state);
-    }
+  auto& icache_rsp_port = icache_->CoreRspPorts.at(0);      
+  if (!icache_rsp_port.empty()){
+    auto& mem_rsp = icache_rsp_port.top();
+    auto trace = pending_icache_.at(mem_rsp.tag);
+    auto latency = (SimPlatform::instance().cycles() - trace->icache_latency);
+    trace->icache_latency = latency;
+    decode_stage_.push(trace);
+    DT(3, cycle, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace);
+    pending_icache_.release(mem_rsp.tag);
+    icache_rsp_port.pop();
  }

  // send icache request
-  {
-    pipeline_state_t state;
-    if (fetch_stage_.try_pop(&state)) {
-      state.icache_latency = SimPlatform::instance().cycles();
-      MemReq mem_req;
-      mem_req.addr  = state.PC;
-      mem_req.write = false;
-      mem_req.tag   = pending_icache_.allocate(state);    
-      icache_->CoreReqPorts.at(0).send(mem_req, 1);
-      DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << state);
-    }
+  if (!fetch_stage_.empty()) {
+    auto trace = fetch_stage_.top();
+    trace->icache_latency = SimPlatform::instance().cycles();
+    MemReq mem_req;
+    mem_req.addr  = trace->PC;
+    mem_req.write = false;
+    mem_req.tag   = pending_icache_.allocate(trace);    
+    icache_->CoreReqPorts.at(0).send(mem_req, 1);
+    DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
+    fetch_stage_.pop();
  }  

  // schedule next warp
@@ -194,19 +206,21 @@ void Core::fetch(uint64_t cycle) {
 void Core::decode(uint64_t cycle) {
  __unused (cycle);

-  pipeline_state_t state;
-  if (!decode_stage_.try_pop(&state))
-    return;    
+  if (decode_stage_.empty())
+    return;
+
+  auto trace = decode_stage_.top();
  
  // release warp
-  if (!state.stall_warp) {
-    stalled_warps_.reset(state.wid);
+  if (!trace->fetch_stall) {
+    stalled_warps_.reset(trace->wid);
  }

-  DT(3, cycle, "pipeline-decode: " << state);
+  DT(3, cycle, "pipeline-decode: " << *trace);
  
  // advance to issue stage
-  issue_stage_.push(state);
+  issue_stage_.push(trace);
+  decode_stage_.pop();
 }

 void Core::issue(uint64_t cycle) {
@@ -214,12 +228,13 @@ void Core::issue(uint64_t cycle) {

  if (!issue_stage_.empty()) {
    // insert to ibuffer 
-    auto& state = issue_stage_.top();
-    auto& ibuffer = ibuffers_.at(state.wid);
-    if (ibuffer.full()) {
-      DT(3, cycle, "*** ibuffer-stall: " << state);
-    } else {
-      ibuffer.push(state);
+    auto trace = issue_stage_.top();
+    auto& ibuffer = ibuffers_.at(trace->wid);
+    if (!trace->check_stalled(ibuffer.full())) {
+      DT(3, cycle, "*** ibuffer-stall: " << *trace);
+    }
+    if (!ibuffer.full()) {
+      ibuffer.push(trace);
      issue_stage_.pop();
    }
  }
@@ -229,27 +244,30 @@ void Core::issue(uint64_t cycle) {
    if (ibuffer.empty())
      continue;

-    auto& state = ibuffer.top();
+    auto trace = ibuffer.top();

    // check scoreboard
-    if (scoreboard_.in_use(state)) {
+    if (!trace->check_stalled(scoreboard_.in_use(trace))) {
      DTH(3, cycle, "*** scoreboard-stall: dependents={");
-      auto owners = scoreboard_.owners(state);
-      for (uint32_t i = 0, n = owners.size(); i < n; ++i) {
-        if (i) DTN(3, ", ");
-        DTN(3, "#" << owners.at(i));  
+      auto uses = scoreboard_.get_uses(trace);
+      for (uint32_t i = 0, n = uses.size(); i < n; ++i) {
+        auto& use = uses.at(i);
+        __unused(use);
+        if (i) DTN(3, ", ");        
+        DTN(3, use.type << use.reg << "(#" << use.owner << ")");  
      }
-      DTN(3, "}, " << state << std::endl);
-      continue;
+      DTN(3, "}, " << *trace << std::endl);
    }
+    if (scoreboard_.in_use(trace))
+      continue;

-    DT(3, cycle, "pipeline-issue: " << state);
+    DT(3, cycle, "pipeline-issue: " << *trace);

    // update scoreboard
-    scoreboard_.reserve(state);
+    scoreboard_.reserve(trace);

    // advance to execute stage
-    execute_stage_.push(state);
+    execute_stage_.push(trace);

    ibuffer.pop();
    break;
@@ -259,11 +277,11 @@ void Core::issue(uint64_t cycle) {
 void Core::execute(uint64_t cycle) {
  // process stage inputs
  if (!execute_stage_.empty()) {
-    auto& state = execute_stage_.top();
-    auto& exe_unit = exe_units_.at((int)state.exe_type);
-    exe_unit->push_input(state);
+    auto trace = execute_stage_.top();
+    auto& exe_unit = exe_units_.at((int)trace->exe_type);
+    exe_unit->push(trace);    
+    DT(3, cycle, "pipeline-execute: " << *trace);
    execute_stage_.pop();
-    DT(3, cycle, "pipeline-execute: " << state);
  }

  // advance execute units
@@ -273,13 +291,14 @@ void Core::execute(uint64_t cycle) {
  
  // commit completed instructions
  for (auto& exe_unit : exe_units_) {
-    pipeline_state_t state;
-    if (exe_unit->pop_output(&state)) {
-      if (state.stall_warp) {
-        stalled_warps_.reset(state.wid);
+    if (!exe_unit->empty()) {
+      auto trace = exe_unit->top();
+      if (trace->fetch_stall) {
+        stalled_warps_.reset(trace->wid);
      }
      // advance to commit stage
-      commit_stage_.push(state);   
+      commit_stage_.push(trace);   
+      exe_unit->pop();
    }
  }
 }
@@ -287,21 +306,28 @@ void Core::execute(uint64_t cycle) {
 void Core::commit(uint64_t cycle) {
  __unused (cycle);
  
-  pipeline_state_t state;
-  if (!commit_stage_.try_pop(&state))
+  if (commit_stage_.empty())
    return;

-  DT(3, cycle, "pipeline-commit: " << state);
+  auto trace = commit_stage_.top();
+
+  DT(3, cycle, "pipeline-commit: " << *trace);

  // update scoreboard
-  scoreboard_.release(state);
+  scoreboard_.release(trace);

  assert(committed_instrs_ <= issued_instrs_);
  ++committed_instrs_;
+
+  commit_stage_.pop();
+
+  // delete the trace
+  delete trace;
 }

 bool Core::running() const {
-  return (committed_instrs_ != issued_instrs_);
+  bool is_running = (committed_instrs_ != issued_instrs_);
+  return is_running;
 }

 Word Core::get_csr(Addr addr, int tid, int wid) {
@@ -355,6 +381,12 @@ Word Core::get_csr(Addr addr, int tid, int wid) {
    // NumCycles
    return (Word)(SimPlatform::instance().cycles() >> 32);
  } else {
+    if (addr >= CSR_TEX(0,0)
+     && addr < CSR_TEX(NUM_TEX_UNITS,0)) {
+      uint32_t unit = CSR_TEX_UNIT(addr);
+      uint32_t state = CSR_TEX_STATE(addr);
+      return tex_units_.at(unit).get_state(state);
+    }
    return csrs_.at(addr);
  }
 }
@@ -367,6 +399,13 @@ void Core::set_csr(Addr addr, Word value, int /*tid*/, int wid) {
  } else if (addr == CSR_FCSR) {
    fcsrs_.at(wid) = value & 0xff;
  } else {
+    if (addr >= CSR_TEX(0,0)
+     && addr < CSR_TEX(NUM_TEX_UNITS,0)) {
+      uint32_t unit = CSR_TEX_UNIT(addr);
+      uint32_t state = CSR_TEX_STATE(addr);
+      tex_units_.at(unit).set_state(state, value);
+      return;
+    }
    csrs_.at(addr) = value;
  }
 }
@@ -390,29 +429,27 @@ Word Core::icache_read(Addr addr, Size size) {
  return data;
 }

-Word Core::dcache_read(Addr addr, Size size) {
-  ++stats_loads_;
+Word Core::dcache_read(Addr addr, Size size) {  
  Word data = 0;
-#ifdef SM_ENABLE
-  if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE))
-   && ((addr + 3) < SMEM_BASE_ADDR)) {
-     shared_mem_.read(&data, addr & (SMEM_SIZE-1), size);
-     return data;
+  if (SM_ENABLE) {
+    if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE))
+    && ((addr + 3) < SMEM_BASE_ADDR)) {
+      shared_mem_.read(&data, addr & (SMEM_SIZE-1), size);
+      return data;
+    }
  }
-#endif
  mmu_.read(&data, addr, size, 0);
  return data;
 }

-void Core::dcache_write(Addr addr, Word data, Size size) {
-  ++stats_stores_;
-#ifdef SM_ENABLE
-  if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE))
-   && ((addr + 3) < SMEM_BASE_ADDR)) {
-     shared_mem_.write(&data, addr & (SMEM_SIZE-1), size);
-     return;
+void Core::dcache_write(Addr addr, Word data, Size size) {  
+  if (SM_ENABLE) {
+    if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE))
+    && ((addr + 3) < SMEM_BASE_ADDR)) {
+      shared_mem_.write(&data, addr & (SMEM_SIZE-1), size);
+      return;
+    }
  }
-#endif
  if (addr >= IO_COUT_ADDR 
   && addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) {
     this->writeToStdOut(addr, data);
@@ -421,11 +458,8 @@ void Core::dcache_write(Addr addr, Word data, Size size) {
  mmu_.write(&data, addr, size, 0);
 }

-void Core::printStats() const {
-  std::cout << "Cycles: " << SimPlatform::instance().cycles() << std::endl
-            << "Insts : " << stats_insts_ << std::endl
-            << "Loads : " << stats_loads_ << std::endl
-            << "Stores: " << stats_stores_ << std::endl;
+Word Core::tex_read(uint32_t unit, Word u, Word v, Word lod, std::vector<uint64_t>* mem_addrs) { // thin forwarder to the selected texture unit
+  return tex_units_.at(unit).read(u, v, lod, mem_addrs); // vector::at() throws std::out_of_range on a bad unit index
 }

 void Core::writeToStdOut(Addr addr, Word data) {
@@ -439,10 +473,14 @@ void Core::writeToStdOut(Addr addr, Word data) {
  }
 }

+void Core::trigger_ecall() { // sets the ECALL flag that check_exit() reads
+  ecall_ = true;
+}
+
 void Core::trigger_ebreak() {
  ebreak_ = true;
 }

-bool Core::check_ebreak() const {
-  return ebreak_;
+bool Core::check_exit() const { // reports whether the EBREAK or the ECALL flag is set
+  return ebreak_ || ecall_;
 }
--- a/sim/simX/core.h
+++ b/sim/simX/core.h
@@ -20,6 +20,7 @@
 #include "ibuffer.h"
 #include "scoreboard.h"
 #include "exeunit.h"
+#include "tex_unit.h"

 namespace vortex {

@@ -34,8 +35,6 @@ public:

  void step(uint64_t cycle);

-  void printStats() const;
-
  Word id() const {
    return id_;
  }
@@ -72,9 +71,13 @@ public:

  void dcache_write(Addr, Word, Size);

+  Word tex_read(uint32_t unit, Word u, Word v, Word lod, std::vector<uint64_t>* mem_addrs); // texture read on `unit`; operand names follow the Core::tex_read definition (u, v, lod)
+
+  void trigger_ecall();
+
  void trigger_ebreak();

-  bool check_ebreak() const;
+  bool check_exit() const;

 private:

@@ -92,10 +95,8 @@ private:
  const ArchDef arch_;
  const Decoder decoder_;
  MemoryUnit mmu_;
-
-#ifdef SM_ENABLE
  RAM shared_mem_;
-#endif 
+  std::vector<TexUnit> tex_units_;

  std::vector<std::shared_ptr<Warp>> warps_;  
  std::vector<WarpMask> barriers_;  
@@ -107,6 +108,7 @@ private:
  Cache::Ptr icache_;
  Cache::Ptr dcache_;
  Switch<MemReq, MemRsp>::Ptr l1_mem_switch_;
+  std::vector<Switch<MemReq, MemRsp>::Ptr> dcache_switch_;

  PipelineStage fetch_stage_;
  PipelineStage decode_stage_;
@@ -114,20 +116,20 @@ private:
  PipelineStage execute_stage_;
  PipelineStage commit_stage_;  
  
-  HashTable<pipeline_state_t> pending_icache_;
+  HashTable<pipeline_trace_t*> pending_icache_;
  WarpMask stalled_warps_;  
  uint32_t last_schedule_wid_;
  uint32_t issued_instrs_;
  uint32_t committed_instrs_;
+  bool ecall_;
  bool ebreak_;

  std::unordered_map<int, std::stringstream> print_bufs_;
  
  uint64_t stats_insts_;
-  uint64_t stats_loads_;
-  uint64_t stats_stores_;

  friend class LsuUnit;
+  friend class GpuUnit;

 public:
  SlavePort<MemRsp>  MemRspPort;
--- a/sim/simX/decode.cpp
+++ b/sim/simX/decode.cpp
@@ -41,14 +41,18 @@ static const std::unordered_map<int, struct InstTableEntry_t> sc_instTable = {
  {Opcode::FMNMSUB,    {false, InstType::R4_TYPE}},  
  {Opcode::VSET,       {false, InstType::V_TYPE}}, 
  {Opcode::GPGPU,      {false, InstType::R_TYPE}},
+  {Opcode::GPU,        {false, InstType::R4_TYPE}},
 };

-static const char* op_string(const Instr &instr) {  
-  Word func3 = instr.getFunc3();
-  Word func7 = instr.getFunc7();
-  Word rs2   = instr.getRSrc(1);
-  Word imm   = instr.getImm();
-  switch (instr.getOpcode()) {
+static const char* op_string(const Instr &instr) {
+  auto opcode = instr.getOpcode();
+  Word func2  = instr.getFunc2();
+  Word func3  = instr.getFunc3();
+  Word func7  = instr.getFunc7();
+  Word rs2    = instr.getRSrc(1);
+  Word imm    = instr.getImm();
+
+  switch (opcode) {
  case Opcode::NOP:        return "NOP";
  case Opcode::LUI_INST:   return "LUI";
  case Opcode::AUIPC_INST: return "AUIPC";
@@ -120,7 +124,16 @@ static const char* op_string(const Instr &instr) {
    }
  case Opcode::SYS_INST: 
    switch (func3) {
-    case 0: return imm ? "EBREAK" : "ECALL";
+    case 0:
+      switch (imm) {
+      case 0x000: return "ECALL";
+      case 0x001: return "EBREAK";
+      case 0x002: return "URET";
+      case 0x102: return "SRET";
+      case 0x302: return "MRET";
+      default:
+        std::abort();      
+      }
    case 1: return "CSRRW";
    case 2: return "CSRRS";
    case 3: return "CSRRC";
@@ -181,29 +194,43 @@ static const char* op_string(const Instr &instr) {
    case 1: return "WSPAWN";
    case 2: return "SPLIT";
    case 3: return "JOIN";
-    case 4: return "BAR"; 
-    case 6: return "PREFETCH";
+    case 4: return "BAR";
+    default:
+      std::abort();
+    }
+  case Opcode::GPU:
+    switch (func3) {
+    case 0: return "TEX";
+    case 1: {
+      switch (func2) {
+      case 0: return "CMOV";
+      default:
+        std::abort();
+      }
+    }
    default:
      std::abort();
    }
  default:
    std::abort();
-  }  
+  }
 }

 namespace vortex {
-std::ostream &operator<<(std::ostream &os, const Instr &instr) {
-  os << op_string(instr) << ": ";
+std::ostream &operator<<(std::ostream &os, const Instr &instr) {  
  auto opcode = instr.getOpcode();    
+  Word func2  = instr.getFunc2();
+  Word func3  = instr.getFunc3();
+
+  os << op_string(instr) << ": ";
+
  if (opcode == S_INST 
-   || opcode == FS
-   || opcode == VS) {     
+   || opcode == FS) {     
     os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "] <- ";
     os << instr.getRSType(1) << std::dec << instr.getRSrc(1);
  } else 
  if (opcode == L_INST 
-   || opcode == FL
-   || opcode == VL) {     
+   || opcode == FL) {     
     os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
     os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "]";
  } else {
@@ -219,8 +246,10 @@ std::ostream &operator<<(std::ostream &os, const Instr &instr) {
      if (i) os << ", ";
      os << "imm=0x" << std::hex << instr.getImm();
    }
-  } 
-
+    if (opcode == GPU && func3 == 0) {
+      os << ", unit=" << std::dec << func2;
+    }
+  }
  return os;
 }
 }
@@ -239,6 +268,7 @@ Decoder::Decoder(const ArchDef &arch) {
  shift_func3_  = shift_rd_ + reg_s_;
  shift_rs1_    = shift_func3_ + func3_s_;
  shift_rs2_    = shift_rs1_ + reg_s_;
+  shift_func2_  = shift_rs2_ + reg_s_;
  shift_func7_  = shift_rs2_ + reg_s_;
  shift_rs3_    = shift_func7_ + func2_s_;
  shift_vmop_   = shift_func7_ + vmask_s_;
@@ -247,7 +277,7 @@ Decoder::Decoder(const ArchDef &arch) {
  shift_vset_   = shift_func7_ + 6;

  reg_mask_    = 0x1f;
-  func2_mask_  = 0x2;
+  func2_mask_  = 0x3;
  func3_mask_  = 0x7;
  func6_mask_  = 0x3f;
  func7_mask_  = 0x7f;
@@ -265,6 +295,7 @@ std::shared_ptr<Instr> Decoder::decode(Word code) const {
  Opcode op = (Opcode)((code >> shift_opcode_) & opcode_mask_);
  instr->setOpcode(op);

+  Word func2 = (code >> shift_func2_) & func2_mask_;
  Word func3 = (code >> shift_func3_) & func3_mask_;
  Word func6 = (code >> shift_func6_) & func6_mask_;
  Word func7 = (code >> shift_func7_) & func7_mask_;
@@ -403,7 +434,7 @@ std::shared_ptr<Instr> Decoder::decode(Word code) const {
      }
    } break;

-    case Opcode::VL:
+    case Opcode::FL:
      instr->setDestVReg(rd);
      instr->setSrcVReg(rs1);
      instr->setVlsWidth(func3);
@@ -413,7 +444,7 @@ std::shared_ptr<Instr> Decoder::decode(Word code) const {
      instr->setVnf((code >> shift_vnf_) & func3_mask_);
      break;

-    case Opcode::VS:
+    case Opcode::FS:
      instr->setVs3(rd);
      instr->setSrcVReg(rs1);
      instr->setVlsWidth(func3);
@@ -428,10 +459,18 @@ std::shared_ptr<Instr> Decoder::decode(Word code) const {
    }
    break;
  case R4_TYPE:
-    instr->setDestFReg(rd);
-    instr->setSrcFReg(rs1);
-    instr->setSrcFReg(rs2);
-    instr->setSrcFReg(rs3);
+    if (op == Opcode::GPU) {
+      instr->setDestReg(rd);
+      instr->setSrcReg(rs1);
+      instr->setSrcReg(rs2);
+      instr->setSrcReg(rs3);
+    } else {
+      instr->setDestFReg(rd);
+      instr->setSrcFReg(rs1);
+      instr->setSrcFReg(rs2);
+      instr->setSrcFReg(rs3);
+    }
+    instr->setFunc2(func2);
    instr->setFunc3(func3);
    break;
  default:
--- a/sim/simX/execute.cpp
+++ b/sim/simX/execute.cpp
@@ -49,11 +49,12 @@ inline void update_fcrs(uint32_t fflags, Core* core, uint32_t tid, uint32_t wid)
  }
 }

-void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
+void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
  assert(tmask_.any());

  Word nextPC = PC_ + core_->arch().wsize();

+  Word func2  = instr.getFunc2();
  Word func3  = instr.getFunc3();
  Word func6  = instr.getFunc6();
  Word func7  = instr.getFunc7();
@@ -117,8 +118,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
  case NOP:
    break;
  case LUI_INST:
-    pipeline_state->exe_type = ExeType::ALU;
-    pipeline_state->alu.type = AluType::ARITH;
+    trace->exe_type = ExeType::ALU;
+    trace->alu.type = AluType::ARITH;
    for (int t = 0; t < num_threads; ++t) {
      if (!tmask_.test(t))
        continue;
@@ -127,8 +128,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
    rd_write = true;
    break;
  case AUIPC_INST:
-    pipeline_state->exe_type = ExeType::ALU;
-    pipeline_state->alu.type = AluType::ARITH;
+    trace->exe_type = ExeType::ALU;
+    trace->alu.type = AluType::ARITH;
    for (int t = 0; t < num_threads; ++t) {
      if (!tmask_.test(t))
        continue;
@@ -137,10 +138,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
    rd_write = true;
    break;
  case R_INST:
-    pipeline_state->exe_type = ExeType::ALU;    
-    pipeline_state->alu.type = AluType::ARITH;
-    pipeline_state->used_iregs[rsrc0] = 1;
-    pipeline_state->used_iregs[rsrc1] = 1;
+    trace->exe_type = ExeType::ALU;    
+    trace->alu.type = AluType::ARITH;
+    trace->used_iregs.set(rsrc0);
+    trace->used_iregs.set(rsrc1);
    for (int t = 0; t < num_threads; ++t) {
      if (!tmask_.test(t))
        continue;
@@ -149,7 +150,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
        case 0:
          // MUL
          rddata[t] = ((WordI)rsdata[t][0]) * ((WordI)rsdata[t][1]);
-          pipeline_state->alu.type = AluType::IMUL;
+          trace->alu.type = AluType::IMUL;
          break;
        case 1: {
          // MULH
@@ -163,7 +164,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
          }
          uint64_t result = first * second;
          rddata[t] = (result >> 32) & 0xFFFFFFFF;
-          pipeline_state->alu.type = AluType::IMUL;
+          trace->alu.type = AluType::IMUL;
        } break;
        case 2: {
          // MULHSU          
@@ -173,14 +174,14 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
          }
          int64_t second = (int64_t)rsdata[t][1];
          rddata[t] = ((first * second) >> 32) & 0xFFFFFFFF;
-          pipeline_state->alu.type = AluType::IMUL;
+          trace->alu.type = AluType::IMUL;
        } break;
        case 3: {
          // MULHU
          uint64_t first = (uint64_t)rsdata[t][0];
          uint64_t second = (uint64_t)rsdata[t][1];
          rddata[t] = ((first * second) >> 32) & 0xFFFFFFFF;
-          pipeline_state->alu.type = AluType::IMUL;
+          trace->alu.type = AluType::IMUL;
        } break;
        case 4: {
          // DIV
@@ -193,7 +194,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
          } else {
            rddata[t] = dividen / divisor;
          }
-          pipeline_state->alu.type = AluType::IDIV;
+          trace->alu.type = AluType::IDIV;
        } break;
        case 5: {
          // DIVU
@@ -204,7 +205,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
          } else {
            rddata[t] = dividen / divisor;
          }
-          pipeline_state->alu.type = AluType::IDIV;
+          trace->alu.type = AluType::IDIV;
        } break;
        case 6: {
          // REM
@@ -217,7 +218,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
          } else {
            rddata[t] = dividen % divisor;
          }
-          pipeline_state->alu.type = AluType::IDIV;
+          trace->alu.type = AluType::IDIV;
        } break;
        case 7: {
          // REMU
@@ -228,7 +229,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
          } else {
            rddata[t] = dividen % divisor;
          }
-          pipeline_state->alu.type = AluType::IDIV;
+          trace->alu.type = AluType::IDIV;
        } break;
        default:
          std::abort();
@@ -285,9 +286,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
    rd_write = true;
    break;
  case I_INST:
-    pipeline_state->exe_type = ExeType::ALU;    
-    pipeline_state->alu.type = AluType::ARITH;    
-    pipeline_state->used_iregs[rsrc0] = 1;
+    trace->exe_type = ExeType::ALU;    
+    trace->alu.type = AluType::ARITH;    
+    trace->used_iregs.set(rsrc0);
    for (int t = 0; t < num_threads; ++t) {
      if (!tmask_.test(t))
        continue;
@@ -336,10 +337,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
    rd_write = true;
    break;
  case B_INST:    
-    pipeline_state->exe_type = ExeType::ALU;    
-    pipeline_state->alu.type = AluType::BRANCH;    
-    pipeline_state->used_iregs[rsrc0] = 1;
-    pipeline_state->used_iregs[rsrc1] = 1;
+    trace->exe_type = ExeType::ALU;    
+    trace->alu.type = AluType::BRANCH;    
+    trace->used_iregs.set(rsrc0);
+    trace->used_iregs.set(rsrc1);
    for (int t = 0; t < num_threads; ++t) {
      if (!tmask_.test(t))
        continue;
@@ -385,107 +386,149 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
      }
      break; // runonce
    }
-    pipeline_state->stall_warp = true;
+    trace->fetch_stall = true;
    break;
  case JAL_INST:    
-    pipeline_state->exe_type = ExeType::ALU;    
-    pipeline_state->alu.type = AluType::BRANCH;
+    trace->exe_type = ExeType::ALU;    
+    trace->alu.type = AluType::BRANCH;
    for (int t = 0; t < num_threads; ++t) {
      if (!tmask_.test(t))
        continue;
      rddata[t] = nextPC;
      nextPC = PC_ + immsrc;  
-      pipeline_state->stall_warp = true;
+      trace->fetch_stall = true;
      break; // runonce
    }
    rd_write = true;
    break;
  case JALR_INST:
-    pipeline_state->exe_type = ExeType::ALU;    
-    pipeline_state->alu.type = AluType::BRANCH;    
-    pipeline_state->used_iregs[rsrc0] = 1;
+    trace->exe_type = ExeType::ALU;    
+    trace->alu.type = AluType::BRANCH;    
+    trace->used_iregs.set(rsrc0);
    for (int t = 0; t < num_threads; ++t) {
      if (!tmask_.test(t))
        continue;
      rddata[t] = nextPC;
      nextPC = rsdata[t][0] + immsrc;
-      pipeline_state->stall_warp = true;
+      trace->fetch_stall = true;
      break; // runOnce
    }
    rd_write = true;
    break;
  case L_INST:
-    pipeline_state->exe_type = ExeType::LSU;    
-    pipeline_state->lsu.type = LsuType::LOAD;
-    pipeline_state->used_iregs[rsrc0] = 1;
-    pipeline_state->mem_addrs.resize(num_threads);
-    for (int t = 0; t < num_threads; ++t) {
-      if (!tmask_.test(t))
-        continue;
-      Word memAddr   = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned
-      Word shift_by  = ((rsdata[t][0] + immsrc) & 0x00000003) * 8;
-      Word data_read = core_->dcache_read(memAddr, 4);
-      pipeline_state->mem_addrs.at(t) = memAddr;
-      DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
-      switch (func3) {
-      case 0:
-        // LBI
-        rddata[t] = sext32((data_read >> shift_by) & 0xFF, 8);
-        break;
-      case 1:
-        // LHI
-        rddata[t] = sext32((data_read >> shift_by) & 0xFFFF, 16);
-        break;
-      case 2:
-        // LW
-        rddata[t] = data_read;
-        break;
-      case 4:
-        // LBU
-        rddata[t] = Word((data_read >> shift_by) & 0xFF);
-        break;
-      case 5:
-        // LHU
-        rddata[t] = Word((data_read >> shift_by) & 0xFFFF);
-        break; 
-      default:
-        std::abort();      
+  case FL:
+    trace->exe_type = ExeType::LSU;    
+    trace->lsu.type = LsuType::LOAD;
+    trace->used_iregs.set(rsrc0);
+    if (opcode == L_INST 
+    || (opcode == FL && func3 == 2)) {
+      for (int t = 0; t < num_threads; ++t) {
+        if (!tmask_.test(t))
+          continue;
+        Word memAddr   = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned
+        Word shift_by  = ((rsdata[t][0] + immsrc) & 0x00000003) * 8;
+        Word data_read = core_->dcache_read(memAddr, 4);
+        trace->mem_addrs.at(t).push_back(memAddr);
+        DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
+        switch (func3) {
+        case 0:
+          // LBI
+          rddata[t] = sext32((data_read >> shift_by) & 0xFF, 8);
+          break;
+        case 1:
+          // LHI
+          rddata[t] = sext32((data_read >> shift_by) & 0xFFFF, 16);
+          break;
+        case 2:
+          // LW
+          rddata[t] = data_read;
+          break;
+        case 4:
+          // LBU
+          rddata[t] = Word((data_read >> shift_by) & 0xFF);
+          break;
+        case 5:
+          // LHU
+          rddata[t] = Word((data_read >> shift_by) & 0xFFFF);
+          break; 
+        default:
+          std::abort();      
+        }
      }
-    }
-    rd_write = true;
-    break;
-  case S_INST:     
-    pipeline_state->exe_type = ExeType::LSU;    
-    pipeline_state->lsu.type = LsuType::STORE;
-    pipeline_state->used_iregs[rsrc0] = 1;
-    pipeline_state->used_iregs[rsrc1] = 1;
-    pipeline_state->mem_addrs.resize(num_threads);
-    for (int t = 0; t < num_threads; ++t) {
-      if (!tmask_.test(t))
-        continue;
-      Word memAddr = rsdata[t][0] + immsrc;
-      pipeline_state->mem_addrs.at(t) = memAddr;
-      DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
-      switch (func3) {
-      case 0:
-        // SB
-        core_->dcache_write(memAddr, rsdata[t][1] & 0x000000FF, 1);
-        break;
-      case 1:
-        // SH
-        core_->dcache_write(memAddr, rsdata[t][1], 2);
-        break;
-      case 2:
-        // SW
-        core_->dcache_write(memAddr, rsdata[t][1], 4);
-        break;
+    } else {
+      DP(4, "Executing vector load");      
+      DP(4, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew);
+      DP(4, "dest: v" << rdest);
+      DP(4, "width" << instr.getVlsWidth());
+      auto &vd = vRegFile_.at(rdest);
+      switch (instr.getVlsWidth()) {
+      case 6: { 
+        // load word and unit strided (not checking for unit stride)
+        for (int i = 0; i < vl_; i++) {
+          Word memAddr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8);
+          DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr);
+          Word data_read = core_->dcache_read(memAddr, 4);
+          DP(4, "Mem addr: " << std::hex << memAddr << " Data read " << data_read);
+          int *result_ptr = (int *)(vd.data() + i);
+          *result_ptr = data_read;            
+        }
+      } break;
      default:
        std::abort();
      }
    }
+    rd_write = true;
+    break;
+  case S_INST:   
+  case FS:  
+    trace->exe_type = ExeType::LSU;    
+    trace->lsu.type = LsuType::STORE;
+    trace->used_iregs.set(rsrc0);
+    trace->used_iregs.set(rsrc1);
+    if (opcode == S_INST 
+    || (opcode == FS && func3 == 2)) {
+      for (int t = 0; t < num_threads; ++t) {
+        if (!tmask_.test(t))
+          continue;
+        Word memAddr = rsdata[t][0] + immsrc;
+        trace->mem_addrs.at(t).push_back(memAddr);
+        DP(4, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
+        switch (func3) {
+        case 0:
+          // SB
+          core_->dcache_write(memAddr, rsdata[t][1] & 0x000000FF, 1);
+          break;
+        case 1:
+          // SH
+          core_->dcache_write(memAddr, rsdata[t][1], 2);
+          break;
+        case 2:
+          // SW
+          core_->dcache_write(memAddr, rsdata[t][1], 4);
+          break;
+        default:
+          std::abort();
+        }
+      }
+    } else {
+      for (int i = 0; i < vl_; i++) {
+        Word memAddr = rsdata[i][0] + (i * vtype_.vsew / 8);
+        DP(4, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
+        switch (instr.getVlsWidth()) {
+        case 6: {
+          // store word and unit strided (not checking for unit stride)          
+          uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i);
+          core_->dcache_write(memAddr, value, 4);
+          DP(4, "store: " << memAddr << " value:" << value);
+        } break;
+        default:
+          std::abort();
+        }          
+      }
+    }
    break;
  case SYS_INST:
-    pipeline_state->exe_type = ExeType::CSR;
+    trace->exe_type = ExeType::CSR;
    for (int t = 0; t < num_threads; ++t) {
      if (!tmask_.test(t))
        continue;
@@ -493,30 +536,40 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
      Word csr_value = core_->get_csr(csr_addr, t, id_);
      switch (func3) {
      case 0:
-        if (csr_addr < 2) {
-          // ECALL/EBREAK
+        switch (csr_addr) {
+        case 0: // ECALL
+          core_->trigger_ecall();
+          break;
+        case 1: // EBREAK
          core_->trigger_ebreak();
-        }
+          break;
+        case 0x002: // URET
+        case 0x102: // SRET
+        case 0x302: // MRET
+          break;
+        default:
+          std::abort();
+        }            
        break;
      case 1:
        // CSRRW
        rddata[t] = csr_value;
        core_->set_csr(csr_addr, rsdata[t][0], t, id_);        
-        pipeline_state->used_iregs[rsrc0] = 1;
+        trace->used_iregs.set(rsrc0);
        rd_write = true;
        break;
      case 2:
        // CSRRS
        rddata[t] = csr_value;
        core_->set_csr(csr_addr, csr_value | rsdata[t][0], t, id_);
-        pipeline_state->used_iregs[rsrc0] = 1;
+        trace->used_iregs.set(rsrc0);
        rd_write = true;
        break;
      case 3:
        // CSRRC
        rddata[t] = csr_value;
        core_->set_csr(csr_addr, csr_value & ~rsdata[t][0], t, id_);
-        pipeline_state->used_iregs[rsrc0] = 1;
+        trace->used_iregs.set(rsrc0);
        rd_write = true;
        break;
      case 5:
@@ -543,88 +596,12 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
    } 
    break;
  case FENCE:
-    pipeline_state->exe_type = ExeType::LSU;    
-    pipeline_state->lsu.type = LsuType::FENCE;
-    pipeline_state->stall_warp = true;
-    break;
-  case (FL | VL):
-    pipeline_state->exe_type = ExeType::LSU;       
-    pipeline_state->lsu.type = LsuType::LOAD;
-    pipeline_state->used_iregs[rsrc0] = 1;    
-    if (func3 == 0x2) {
-      pipeline_state->mem_addrs.resize(num_threads);
-      for (int t = 0; t < num_threads; ++t) {
-        if (!tmask_.test(t))
-          continue;
-        Word memAddr = rsdata[t][0] + immsrc;
-        pipeline_state->mem_addrs.at(t) = memAddr;
-        Word data_read = core_->dcache_read(memAddr, 4);        
-        DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
-        rddata[t] = data_read;
-      }
-    } else {  
-      DP(3, "Executing vector load");      
-      DP(3, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew);
-      DP(3, "dest: v" << rdest);
-      DP(3, "width" << instr.getVlsWidth());
-      pipeline_state->mem_addrs.resize(vl_);
-      auto &vd = vRegFile_.at(rdest);
-      switch (instr.getVlsWidth()) {
-      case 6: { 
-        // load word and unit strided (not checking for unit stride)
-        for (int i = 0; i < vl_; i++) {
-          Word memAddr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8);
-          pipeline_state->mem_addrs.at(i) = memAddr;
-          DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
-          Word data_read = core_->dcache_read(memAddr, 4);
-          DP(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read);
-          int *result_ptr = (int *)(vd.data() + i);
-          *result_ptr = data_read;            
-        }
-      } break;
-      default:
-        std::abort();
-      }
-      break;
-    }
-    rd_write = true;
-    break;
-  case (FS | VS):
-    pipeline_state->exe_type = ExeType::LSU;       
-    pipeline_state->lsu.type = LsuType::STORE;
-    pipeline_state->used_iregs[rsrc0] = 1;
-    pipeline_state->used_iregs[rsrc1] = 1;    
-    if (func3 == 0x2) {
-      pipeline_state->mem_addrs.resize(num_threads);
-      for (int t = 0; t < num_threads; ++t) {
-        if (!tmask_.test(t))
-          continue;
-        Word memAddr = rsdata[t][0] + immsrc;
-        pipeline_state->mem_addrs.at(t) = memAddr;
-        core_->dcache_write(memAddr, rsdata[t][1], 4);
-        DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
-      }
-    } else {      
-      pipeline_state->mem_addrs.resize(vl_);
-      for (int i = 0; i < vl_; i++) {
-        Word memAddr = rsdata[i][0] + (i * vtype_.vsew / 8);
-        pipeline_state->mem_addrs.at(i) = memAddr;
-        DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
-        switch (instr.getVlsWidth()) {
-        case 6: {
-          //store word and unit strided (not checking for unit stride)          
-          uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i);
-          core_->dcache_write(memAddr, value, 4);
-          DP(3, "store: " << memAddr << " value:" << value);
-        } break;
-        default:
-          std::abort();
-        }          
-      }
-    }
-    break;    
+    trace->exe_type = ExeType::LSU;    
+    trace->lsu.type = LsuType::FENCE;
+    trace->fetch_stall = true;
+    break;   
  case FCI:        
-    pipeline_state->exe_type = ExeType::FPU;     
+    trace->exe_type = ExeType::FPU;     
    for (int t = 0; t < num_threads; ++t) {
      if (!tmask_.test(t))
        continue; 
@@ -633,32 +610,32 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
      switch (func7) {
      case 0x00: //FADD
        rddata[t] = rv_fadd(rsdata[t][0], rsdata[t][1], frm, &fflags);
-        pipeline_state->fpu.type = FpuType::FMA;
-        pipeline_state->used_fregs[rsrc0] = 1;
-        pipeline_state->used_fregs[rsrc1] = 1;
+        trace->fpu.type = FpuType::FMA;
+        trace->used_fregs.set(rsrc0);
+        trace->used_fregs.set(rsrc1);
        break;
      case 0x04: //FSUB
        rddata[t] = rv_fsub(rsdata[t][0], rsdata[t][1], frm, &fflags);
-        pipeline_state->fpu.type = FpuType::FMA;
-        pipeline_state->used_fregs[rsrc0] = 1;
-        pipeline_state->used_fregs[rsrc1] = 1;
+        trace->fpu.type = FpuType::FMA;
+        trace->used_fregs.set(rsrc0);
+        trace->used_fregs.set(rsrc1);
        break;
      case 0x08: //FMUL
        rddata[t] = rv_fmul(rsdata[t][0], rsdata[t][1], frm, &fflags);
-        pipeline_state->fpu.type = FpuType::FMA;
-        pipeline_state->used_fregs[rsrc0] = 1;
-        pipeline_state->used_fregs[rsrc1] = 1;
+        trace->fpu.type = FpuType::FMA;
+        trace->used_fregs.set(rsrc0);
+        trace->used_fregs.set(rsrc1);
        break;
      case 0x0c: //FDIV
        rddata[t] = rv_fdiv(rsdata[t][0], rsdata[t][1], frm, &fflags);
-        pipeline_state->fpu.type = FpuType::FDIV;
-        pipeline_state->used_fregs[rsrc0] = 1;
-        pipeline_state->used_fregs[rsrc1] = 1;
+        trace->fpu.type = FpuType::FDIV;
+        trace->used_fregs.set(rsrc0);
+        trace->used_fregs.set(rsrc1);
        break;
      case 0x2c: //FSQRT
        rddata[t] = rv_fsqrt(rsdata[t][0], frm, &fflags);
-        pipeline_state->fpu.type = FpuType::FSQRT;
-        pipeline_state->used_fregs[rsrc0] = 1;
+        trace->fpu.type = FpuType::FSQRT;
+        trace->used_fregs.set(rsrc0);
        break;        
      case 0x10:
        switch (func3) {            
@@ -672,9 +649,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
          rddata[t] = rv_fsgnjx(rsdata[t][0], rsdata[t][1]);
          break;
        }
-        pipeline_state->fpu.type = FpuType::FNCP;
-        pipeline_state->used_fregs[rsrc0] = 1;
-        pipeline_state->used_fregs[rsrc1] = 1;
+        trace->fpu.type = FpuType::FNCP;
+        trace->used_fregs.set(rsrc0);
+        trace->used_fregs.set(rsrc1);
        break;
      case 0x14:              
        if (func3) {
@@ -684,9 +661,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
          // FMIN.S
          rddata[t] = rv_fmin(rsdata[t][0], rsdata[t][1], &fflags);
        }
-        pipeline_state->fpu.type = FpuType::FNCP;
-        pipeline_state->used_fregs[rsrc0] = 1;
-        pipeline_state->used_fregs[rsrc1] = 1;        
+        trace->fpu.type = FpuType::FNCP;
+        trace->used_fregs.set(rsrc0);
+        trace->used_fregs.set(rsrc1);        
        break;
      case 0x60:
        if (rsrc1 == 0) { 
@@ -696,8 +673,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
          // FCVT.WU.S
          rddata[t] = rv_ftou(rsdata[t][0], frm, &fflags);
        }
-        pipeline_state->fpu.type = FpuType::FCVT;
-        pipeline_state->used_fregs[rsrc0] = 1;
+        trace->fpu.type = FpuType::FCVT;
+        trace->used_fregs.set(rsrc0);
        break;
      case 0x70:      
        if (func3) {
@@ -707,8 +684,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
          // FMV.X.W
          rddata[t] = rsdata[t][0];
        }        
-        pipeline_state->fpu.type = FpuType::FNCP;
-        pipeline_state->used_fregs[rsrc0] = 1;
+        trace->fpu.type = FpuType::FNCP;
+        trace->used_fregs.set(rsrc0);
        break;
      case 0x50:             
        switch(func3) {              
@@ -725,9 +702,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
          rddata[t] = rv_feq(rsdata[t][0], rsdata[t][1], &fflags);
          break;
        } 
-        pipeline_state->fpu.type = FpuType::FNCP;
-        pipeline_state->used_fregs[rsrc0] = 1;
-        pipeline_state->used_fregs[rsrc1] = 1;
+        trace->fpu.type = FpuType::FNCP;
+        trace->used_fregs.set(rsrc0);
+        trace->used_fregs.set(rsrc1);
        break;        
      case 0x68:
        if (rsrc1) {
@@ -737,14 +714,14 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
          // FCVT.S.W:
          rddata[t] = rv_itof(rsdata[t][0], frm, &fflags);
        }
-        pipeline_state->fpu.type = FpuType::FCVT;
-        pipeline_state->used_iregs[rsrc0] = 1;
+        trace->fpu.type = FpuType::FCVT;
+        trace->used_iregs.set(rsrc0);
        break;
      case 0x78:
        // FMV.W.X
        rddata[t] = rsdata[t][0];
-        pipeline_state->fpu.type = FpuType::FNCP;
-        pipeline_state->used_iregs[rsrc0] = 1;
+        trace->fpu.type = FpuType::FNCP;
+        trace->used_iregs.set(rsrc0);
        break;
      }
      update_fcrs(fflags, core_, t, id_);
@@ -755,10 +732,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
  case FMSUB:      
  case FMNMADD:
  case FMNMSUB: 
-    pipeline_state->fpu.type = FpuType::FMA;
-    pipeline_state->used_fregs[rsrc0] = 1;
-    pipeline_state->used_fregs[rsrc1] = 1;
-    pipeline_state->used_fregs[rsrc2] = 1;
+    trace->fpu.type = FpuType::FMA;
+    trace->used_fregs.set(rsrc0);
+    trace->used_fregs.set(rsrc1);
+    trace->used_fregs.set(rsrc2);
    for (int t = 0; t < num_threads; ++t) {
      if (!tmask_.test(t))
        continue;
@@ -784,8 +761,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
    }
    rd_write = true;
    break;
-  case GPGPU: {
-    pipeline_state->exe_type = ExeType::GPU;
+  case GPGPU: {    
    int ts = 0;
    for (int t = 0; t < num_threads; ++t) {
      if (tmask_.test(t)) {
@@ -795,10 +771,11 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
    }
    switch (func3) {
    case 0: {
-      // TMC        
-      pipeline_state->gpu.type = GpuType::TMC;
-      pipeline_state->used_iregs[rsrc0] = 1;
-      pipeline_state->stall_warp = true;
+      // TMC   
+      trace->exe_type = ExeType::GPU;     
+      trace->gpu.type = GpuType::TMC;
+      trace->used_iregs.set(rsrc0);
+      trace->fetch_stall = true;
      if (rsrc1) {
        // predicate mode
        ThreadMask pred;
@@ -823,10 +800,11 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
    } break;
    case 1: {
      // WSPAWN
-      pipeline_state->gpu.type = GpuType::WSPAWN;
-      pipeline_state->used_iregs[rsrc0] = 1;
-      pipeline_state->used_iregs[rsrc1] = 1;
-      pipeline_state->stall_warp = true;
+      trace->exe_type = ExeType::GPU;
+      trace->gpu.type = GpuType::WSPAWN;
+      trace->used_iregs.set(rsrc0);
+      trace->used_iregs.set(rsrc1);
+      trace->fetch_stall = true;
      int active_warps = std::min<int>(rsdata.at(ts)[0], core_->arch().num_warps());
      DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(ts)[1]);
      for (int i = 1; i < active_warps; ++i) {
@@ -837,9 +815,10 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
    } break;
    case 2: {
      // SPLIT    
-      pipeline_state->gpu.type = GpuType::SPLIT;
-      pipeline_state->used_iregs[rsrc0] = 1;
-      pipeline_state->stall_warp = true;
+      trace->exe_type = ExeType::GPU;
+      trace->gpu.type = GpuType::SPLIT;
+      trace->used_iregs.set(rsrc0);
+      trace->fetch_stall = true;
      if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) {          
        ThreadMask tmask;
        for (int i = 0; i < num_threads; ++i) {
@@ -868,8 +847,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
    } break;
    case 3: {
      // JOIN
-      pipeline_state->gpu.type = GpuType::JOIN;        
-      pipeline_state->stall_warp = true;        
+      trace->exe_type = ExeType::GPU;
+      trace->gpu.type = GpuType::JOIN;        
+      trace->fetch_stall = true;        
      if (!domStack_.empty() && domStack_.top().unanimous) {
        DP(3, "*** Uninimous branch at join");
        tmask_ = domStack_.top().tmask;
@@ -893,18 +873,19 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
    } break;
    case 4: {
      // BAR
-      pipeline_state->gpu.type = GpuType::BAR;
-      pipeline_state->used_iregs[rsrc0] = 1;
-      pipeline_state->used_iregs[rsrc1] = 1;
-      pipeline_state->stall_warp = true; 
+      trace->exe_type = ExeType::GPU; 
+      trace->gpu.type = GpuType::BAR;
+      trace->used_iregs.set(rsrc0);
+      trace->used_iregs.set(rsrc1);
+      trace->fetch_stall = true; 
      active_ = false;
      core_->barrier(rsdata[ts][0], rsdata[ts][1], id_); 
    } break;
-    case 6: {
+    case 5: {
      // PREFETCH
-      pipeline_state->exe_type = ExeType::LSU; 
-      pipeline_state->lsu.type = LsuType::PREFETCH; 
-      pipeline_state->used_iregs[rsrc0] = 1;
+      trace->exe_type = ExeType::LSU; 
+      trace->lsu.type = LsuType::PREFETCH; 
+      trace->used_iregs.set(rsrc0);
      for (int t = 0; t < num_threads; ++t) {
        if (!tmask_.test(t))
          continue;
@@ -915,7 +896,50 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
    default:
      std::abort();
    }
-    }  break;
+  }  break;
+  case GPU: {    
+    switch (func3) {
+    case 0: { // TEX
+      trace->exe_type = ExeType::GPU; 
+      trace->gpu.type = GpuType::TEX;
+      trace->used_iregs.set(rsrc0);
+      trace->used_iregs.set(rsrc1);
+      trace->used_iregs.set(rsrc2);
+      for (int t = 0; t < num_threads; ++t) {
+        if (!tmask_.test(t))
+          continue;        
+        auto unit  = func2;
+        auto u     = rsdata[t][0];
+        auto v     = rsdata[t][1];
+        auto lod   = rsdata[t][2];
+        auto color = core_->tex_read(unit, u, v, lod, &trace->mem_addrs.at(t));
+        rddata[t] = color;
+      }
+      rd_write = true;
+    } break;
+    case 1: 
+      switch (func2) {
+      case 0: { // CMOV
+        trace->exe_type = ExeType::ALU;
+        trace->alu.type = AluType::CMOV;
+        trace->used_iregs.set(rsrc0);
+        trace->used_iregs.set(rsrc1);
+        trace->used_iregs.set(rsrc2);
+        for (int t = 0; t < num_threads; ++t) {
+          if (!tmask_.test(t))
+            continue;     
+          rddata[t] = rsdata[t][0] ? rsdata[t][1] : rsdata[t][2];
+        }
+        rd_write = true;
+      } break;
+      default:
+        std::abort();
+      }
+      break;
+    default:
+      std::abort();
+    }
+  } break;
  case VSET: {
    int VLEN = core_->arch().vsize() * 8;
    int VLMAX = (instr.getVlmul() * VLEN) / instr.getVsew();
@@ -966,7 +990,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
        }                
      } break;
      case 24: {
-        //vmseq
+        // vmseq
        auto &vr1 = vRegFile_.at(rsrc0);
        auto &vr2 = vRegFile_.at(rsrc1);
        auto &vd = vRegFile_.at(rdest);
@@ -997,7 +1021,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
        }
      } break;
      case 25: { 
-        //vmsne
+        // vmsne
        auto &vr1 = vRegFile_.at(rsrc0);
        auto &vr2 = vRegFile_.at(rsrc1);
        auto &vd = vRegFile_.at(rdest);
@@ -1028,7 +1052,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
        }
      } break;
      case 26: {
-        //vmsltu
+        // vmsltu
        auto &vr1 = vRegFile_.at(rsrc0);
        auto &vr2 = vRegFile_.at(rsrc1);
        auto &vd = vRegFile_.at(rdest);
@@ -1059,7 +1083,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
        }
      } break;
      case 27: {
-        //vmslt
+        // vmslt
        auto &vr1 = vRegFile_.at(rsrc0);
        auto &vr2 = vRegFile_.at(rsrc1);
        auto &vd = vRegFile_.at(rdest);
@@ -1090,7 +1114,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
        }
      } break;
      case 28: {
-        //vmsleu
+        // vmsleu
        auto &vr1 = vRegFile_.at(rsrc0);
        auto &vr2 = vRegFile_.at(rsrc1);
        auto &vd = vRegFile_.at(rdest);
@@ -1121,7 +1145,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
        }
      } break;
      case 29: {
-        //vmsle
+        // vmsle
        auto &vr1 = vRegFile_.at(rsrc0);
        auto &vr2 = vRegFile_.at(rsrc1);
        auto &vd = vRegFile_.at(rdest);
@@ -1152,7 +1176,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
        }
      } break;
      case 30: {
-        //vmsgtu
+        // vmsgtu
        auto &vr1 = vRegFile_.at(rsrc0);
        auto &vr2 = vRegFile_.at(rsrc1);
        auto &vd = vRegFile_.at(rdest);
@@ -1183,7 +1207,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
        }
      } break;
      case 31: {
-        //vmsgt
+        // vmsgt
        auto &vr1 = vRegFile_.at(rsrc0);
        auto &vr2 = vRegFile_.at(rsrc1);
        auto &vd = vRegFile_.at(rdest);
@@ -1356,7 +1380,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
        }
      } break;
      case 27: { 
-        //vmxor
+        // vmxor
        auto &vr1 = vRegFile_.at(rsrc0);
        auto &vr2 = vRegFile_.at(rsrc1);
        auto &vd = vRegFile_.at(rdest);
@@ -1402,7 +1426,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
        }
      } break;
      case 28: {
-        //vmornot
+        // vmornot
        auto &vr1 = vRegFile_.at(rsrc0);
        auto &vr2 = vRegFile_.at(rsrc1);
        auto &vd = vRegFile_.at(rdest);
@@ -1448,7 +1472,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
        }
      } break;
      case 29: {
-        //vmnand
+        // vmnand
        auto &vr1 = vRegFile_.at(rsrc0);
        auto &vr2 = vRegFile_.at(rsrc1);
        auto &vd = vRegFile_.at(rdest);
@@ -1494,7 +1518,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
        }
      } break;
      case 30: {
-        //vmnor
+        // vmnor
        auto &vr1 = vRegFile_.at(rsrc0);
        auto &vr2 = vRegFile_.at(rsrc1);
        auto &vd = vRegFile_.at(rdest);
@@ -1540,7 +1564,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
        }
      } break;
      case 31: {
-        //vmxnor
+        // vmxnor
        auto &vr1 = vRegFile_.at(rsrc0);
        auto &vr2 = vRegFile_.at(rsrc1);
        auto &vd = vRegFile_.at(rdest);
@@ -1586,7 +1610,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
        }
      } break;
      case 37: {
-        //vmul
+        // vmul
        auto &vr1 = vRegFile_.at(rsrc0);
        auto &vr2 = vRegFile_.at(rsrc1);
        auto &vd = vRegFile_.at(rdest);
@@ -1769,7 +1793,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
  }

  if (rd_write) {
-    pipeline_state->wb = true;
+    trace->wb = true;
    DPH(2, "Dest Reg: ");
    auto rdt = instr.getRDType();    
    switch (rdt) {
@@ -1786,7 +1810,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
          DPN(2, "0x" << std::hex << rddata[t]);         
        }
        DPN(2, "}" << std::endl);
-        pipeline_state->used_iregs[rdest] = 1;
+        trace->used_iregs[rdest] = 1;
      }
      break;
    case RegType::Float:
@@ -1801,7 +1825,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
        DPN(2, "0x" << std::hex << rddata[t]);         
      }
      DPN(2, "}" << std::endl);
-      pipeline_state->used_fregs[rdest] = 1;
+      trace->used_fregs[rdest] = 1;
      break;
    default:
      std::abort();
--- a/sim/simX/exeunit.cpp
+++ b/sim/simX/exeunit.cpp
@@ -6,16 +6,18 @@
 #include <util.h>
 #include "debug.h"
 #include "core.h"
+#include "constants.h"

 using namespace vortex;

 NopUnit::NopUnit(Core*) : ExeUnit("NOP") {}
    
 void NopUnit::step(uint64_t /*cycle*/) {
-    pipeline_state_t state;
-    if (!inputs_.try_pop(&state))
+    if (inputs_.empty()) 
        return;
-    this->schedule_output(state, 1);
+    auto trace = inputs_.top();
+    this->schedule_output(trace, 1);
+    inputs_.pop();
 }

 ///////////////////////////////////////////////////////////////////////////////
@@ -33,19 +35,23 @@ void LsuUnit::step(uint64_t cycle) {

    // handle dcache response
    for (uint32_t t = 0; t < num_threads_; ++t) {
-        MemRsp mem_rsp;
-        if (!core_->dcache_->CoreRspPorts.at(t).read(&mem_rsp))
+        auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0);
+        if (dcache_rsp_port.empty())
            continue;
-        auto& entry = pending_dcache_.at(mem_rsp.tag);  
-        DT(3, cycle, "dcache-rsp: addr=" << std::hex << entry.first.mem_addrs.at(t) << ", tag=" << mem_rsp.tag << ", type=" << entry.first.lsu.type << ", tid=" << t << ", " << entry.first);  
-        assert(entry.second.test(t));
-        entry.second.reset(t); // track remaining blocks        
-        if (!entry.second.any()) {        
-            auto latency = (SimPlatform::instance().cycles() - entry.first.dcache_latency);
-            entry.first.dcache_latency = latency;
-            this->schedule_output(entry.first, 1);
+        auto& mem_rsp = dcache_rsp_port.top();
+        auto& entry = pending_dcache_.at(mem_rsp.tag);          
+        auto trace = entry.first;
+        DT(3, cycle, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type 
+            << ", tid=" << t << ", " << *trace);  
+        assert(entry.second);
+        --entry.second; // track remaining blocks 
+        if (0 == entry.second) {        
+            auto latency = (SimPlatform::instance().cycles() - trace->dcache_latency);
+            trace->dcache_latency = latency;
+            this->schedule_output(trace, 1);
            pending_dcache_.release(mem_rsp.tag);
-        }   
+        } 
+        dcache_rsp_port.pop();  
    }

    if (fence_lock_) {
@@ -61,36 +67,83 @@ void LsuUnit::step(uint64_t cycle) {
    if (inputs_.empty())
        return;

-    auto state = inputs_.top();
+    auto trace = inputs_.top();

-    if (state.lsu.type == LsuType::FENCE) {
+    if (trace->lsu.type == LsuType::FENCE) {
        // schedule fence lock
-        fence_state_ = state;
-        fence_lock_ = true;
-        inputs_.pop();
-        DT(3, cycle, "fence-lock: " << state);
+        fence_state_ = trace;
+        fence_lock_ = true;        
+        DT(3, cycle, "fence-lock: " << *trace);
+        // remove input
+        inputs_.pop(); 
        return;
    }

    // check pending queue capacity
-    if (pending_dcache_.full()) {
-        DT(3, cycle, "*** lsu-queue-stall: " << state);
+    if (!trace->check_stalled(pending_dcache_.full())) {
+        DT(3, cycle, "*** lsu-queue-stall: " << *trace);
+    }
+    if (pending_dcache_.full())
        return;
+
+    // send memory request
+    bool has_shared_memory = false;
+    bool mem_rsp_pending = false;
+    bool is_write = (trace->lsu.type == LsuType::STORE);
+
+    // shared-memory addresses are skipped below: no dcache request is sent for them
+    auto is_smem_addr = [&](uint64_t addr) {
+        return SM_ENABLE && (addr >= (SMEM_BASE_ADDR-SMEM_SIZE)) && (addr < SMEM_BASE_ADDR);
+    };
+
+    // count only the requests really sent, else the pending entry would never drain
+    uint32_t valid_addrs = 0;
+    for (uint32_t t = 0; t < num_threads_; ++t) {
+        if (!trace->tmask.test(t))
+            continue;
+        for (auto mem_addr : trace->mem_addrs.at(t))
+            valid_addrs += is_smem_addr(mem_addr) ? 0 : 1;
+    }
+
+    trace->dcache_latency = SimPlatform::instance().cycles();
+    auto tag = pending_dcache_.allocate({trace, valid_addrs});
+
+    for (uint32_t t = 0; t < num_threads_; ++t) {
+        if (!trace->tmask.test(t))
+            continue;
+        auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0);
+        for (auto mem_addr : trace->mem_addrs.at(t)) {
+            if (is_smem_addr(mem_addr)) {
+                DT(3, cycle, "smem-access: addr=" << std::hex << mem_addr << ", tag=" << tag 
+                    << ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
+                has_shared_memory = true;
+                continue;
+            }
+            bool is_io = (mem_addr >= IO_BASE_ADDR);
+            MemReq mem_req;
+            mem_req.addr  = mem_addr;
+            mem_req.write = is_write;
+            mem_req.tag   = tag;
+            mem_req.is_io = is_io; 
+            dcache_req_port.send(mem_req, 1);
+            DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr << ", tag=" << tag 
+                << ", type=" << trace->lsu.type << ", tid=" << t << ", io=" << is_io << ", "<< *trace);
+            // do not wait on writes
+            mem_rsp_pending = !is_write;
+        }
    }

-    // send dcache request 
-    state.dcache_latency = SimPlatform::instance().cycles();
-    auto tag = pending_dcache_.allocate({state, state.tmask});         
-    for (uint32_t t = 0; t < num_threads_; ++t) {
-        if (!state.tmask.test(t))
-            continue;
-        MemReq mem_req;
-        mem_req.addr  = state.mem_addrs.at(t);
-        mem_req.write = (state.lsu.type == LsuType::STORE);
-        mem_req.tag   = tag;
-        core_->dcache_->CoreReqPorts.at(t).send(mem_req, 1);
-        DT(3, cycle, "dcache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", type=" << state.lsu.type << ", tid=" << t << ", " << state);
-    }            
+    // do not wait 
+    if (!mem_rsp_pending) {        
+        pending_dcache_.release(tag);
+        uint32_t delay = 1;
+        if (has_shared_memory) {
+            // all threads accessed shared memory
+            delay += Constants::SMEM_DELAY;
+        }
+        this->schedule_output(trace, delay);
+    }
+
+    // remove input
    inputs_.pop();
 }

@@ -98,23 +151,27 @@ void LsuUnit::step(uint64_t cycle) {

 AluUnit::AluUnit(Core*) : ExeUnit("ALU") {}
    
-void AluUnit::step(uint64_t /*cycle*/) {
-    pipeline_state_t state;
-    if (!inputs_.try_pop(&state))
+void AluUnit::step(uint64_t /*cycle*/) {    
+    if (inputs_.empty())
        return;
-    switch  (state.alu.type) {
-    case AluType::ARITH:
-        this->schedule_output(state, 1);
-        break;
+    auto trace = inputs_.top();    
+    switch (trace->alu.type) {
+    case AluType::ARITH:        
    case AluType::BRANCH:
-        this->schedule_output(state, 1);
+    case AluType::CMOV:
+        this->schedule_output(trace, 1);
+        inputs_.pop();
        break;
    case AluType::IMUL:
-        this->schedule_output(state, LATENCY_IMUL);
+        this->schedule_output(trace, LATENCY_IMUL);
+        inputs_.pop();
        break;
    case AluType::IDIV:
-        this->schedule_output(state, XLEN);
+        this->schedule_output(trace, XLEN);
+        inputs_.pop();
        break;
+    default:
+        std::abort();
    }
 }

@@ -123,10 +180,11 @@ void AluUnit::step(uint64_t /*cycle*/) {
 CsrUnit::CsrUnit(Core*) : ExeUnit("CSR") {}
    
 void CsrUnit::step(uint64_t /*cycle*/) {
-    pipeline_state_t state;
-    if (!inputs_.try_pop(&state))
+    if (inputs_.empty()) 
        return;
-    this->schedule_output(state, 1);
+    auto trace = inputs_.top();
+    this->schedule_output(trace, 1);
+    inputs_.pop();
 }

 ///////////////////////////////////////////////////////////////////////////////
@@ -134,46 +192,127 @@ void CsrUnit::step(uint64_t /*cycle*/) {
 FpuUnit::FpuUnit(Core*) : ExeUnit("FPU") {}
    
 void FpuUnit::step(uint64_t /*cycle*/) {
-    pipeline_state_t state;
-    if (!inputs_.try_pop(&state))
+    if (inputs_.empty()) 
        return;
-    switch  (state.fpu.type) {
+    auto trace = inputs_.top();
+    switch (trace->fpu.type) {
    case FpuType::FNCP:
-        this->schedule_output(state, 1);
+        this->schedule_output(trace, 1);
+        inputs_.pop();
        break;
    case FpuType::FMA:
-        this->schedule_output(state, LATENCY_FMA);
+        this->schedule_output(trace, LATENCY_FMA);
+        inputs_.pop();
        break;
    case FpuType::FDIV:
-        this->schedule_output(state, LATENCY_FDIV);
+        this->schedule_output(trace, LATENCY_FDIV);
+        inputs_.pop();
        break;
    case FpuType::FSQRT:
-        this->schedule_output(state, LATENCY_FSQRT);
+        this->schedule_output(trace, LATENCY_FSQRT);
+        inputs_.pop();
        break;
    case FpuType::FCVT:
-        this->schedule_output(state, LATENCY_FCVT);
+        this->schedule_output(trace, LATENCY_FCVT);
+        inputs_.pop();
        break;
+    default:
+        std::abort();
    }
 }

 ///////////////////////////////////////////////////////////////////////////////

-GpuUnit::GpuUnit(Core*) : ExeUnit("GPU") {}
+GpuUnit::GpuUnit(Core* core) 
+    : ExeUnit("GPU") 
+    , core_(core)
+    , num_threads_(core->arch().num_threads()) 
+    , pending_tex_reqs_(TEXQ_SIZE)
+{}
    
-void GpuUnit::step(uint64_t /*cycle*/) {
-    pipeline_state_t state;
-    if (!inputs_.try_pop(&state))
+void GpuUnit::step(uint64_t cycle) {
+    __unused (cycle);
+#ifdef EXT_TEX_ENABLE
+    // handle texture memory responses (dcache switch port 1, one port per thread)
+    for (uint32_t t = 0; t < num_threads_; ++t) {
+        auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(1);
+        if (dcache_rsp_port.empty())
+            continue;
+        auto& mem_rsp = dcache_rsp_port.top();
+        auto& entry = pending_tex_reqs_.at(mem_rsp.tag);
+        auto trace = entry.first;
+        DT(3, cycle, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace);
+        assert(entry.second);
+        --entry.second; // track remaining responses for this request
+        if (0 == entry.second) { // last response: the request is complete
+            auto latency = (SimPlatform::instance().cycles() - trace->tex_latency); // start cycle was stamped in processTexRequest
+            trace->tex_latency = latency;
+            this->schedule_output(trace, 1);
+            pending_tex_reqs_.release(mem_rsp.tag);
+        }
+        dcache_rsp_port.pop();
+    }
+#endif
+
+    // check input queue
+    if (inputs_.empty())
        return;
-    switch  (state.gpu.type) {
+
+    auto trace = inputs_.top();
+
+    switch  (trace->gpu.type) {
    case GpuType::TMC:
    case GpuType::WSPAWN:
    case GpuType::SPLIT:
    case GpuType::JOIN:
    case GpuType::BAR:
-        this->schedule_output(state, 1);
-        break;
-    case GpuType::TEX:
-        /* TODO */
+        this->schedule_output(trace, 1);
+        inputs_.pop();
        break;
+    case GpuType::TEX: {
+        if (this->processTexRequest(cycle, trace))
+            inputs_.pop();
+    }   break;
+    default:
+        std::abort();
    }
+}
+
+bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
+    __unused (cycle);
+    // Issues the texture fetches of 'trace'; returns false when the pending queue is full.
+    // check pending queue capacity
+    if (!trace->check_stalled(pending_tex_reqs_.full())) {
+        DT(3, cycle, "*** tex-queue-stall: " << *trace);
+    }
+    if (pending_tex_reqs_.full())
+        return false;
+
+    // send memory request
+
+    uint32_t valid_addrs = 0;
+    for (auto& mem_addr : trace->mem_addrs) {
+        valid_addrs += mem_addr.size();
+    }
+
+    trace->tex_latency = SimPlatform::instance().cycles();
+    auto tag = pending_tex_reqs_.allocate({trace, valid_addrs});
+
+    for (uint32_t t = 0; t < num_threads_; ++t) {
+        if (!trace->tmask.test(t))
+            continue;
+
+        auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1);
+        for (auto mem_addr : trace->mem_addrs.at(t)) {
+            MemReq mem_req;
+            mem_req.addr  = mem_addr;
+            mem_req.write = false; // texture fetches are reads; lsu.type is never set for a TEX trace
+            mem_req.tag   = tag;
+            dcache_req_port.send(mem_req, 1);
+            DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr << ", tag=" << tag 
+                << ", tid=" << t << ", "<< *trace);
+        }
+    }
+
+    return true;
 }
--- a/sim/simX/exeunit.h
+++ b/sim/simX/exeunit.h
@@ -11,36 +11,43 @@ class Core;
 class ExeUnit {
 protected:
    const char* name_;
-    Queue<pipeline_state_t> inputs_;
-    Queue<pipeline_state_t> outputs_;
+    Queue<pipeline_trace_t*> inputs_;
+    Queue<pipeline_trace_t*> outputs_;

-    void schedule_output(const pipeline_state_t& state, uint32_t delay) {
+    void schedule_output(pipeline_trace_t* trace, uint32_t delay) {
        if (delay > 1) {
            SimPlatform::instance().schedule(
-                [&](const pipeline_state_t& req) { 
+                [&](pipeline_trace_t* req) { 
                    outputs_.push(req); 
                },
-                state,
+                trace,
                (delay - 1)
            );
        } else {
-            outputs_.push(state);
+            outputs_.push(trace);
        }
    }

 public:    
    typedef std::shared_ptr<ExeUnit> Ptr;

-    ExeUnit(const char* name) : name_(name) {}
-    
+    ExeUnit(const char* name) : name_(name) {}    
    virtual ~ExeUnit() {}

-    void push_input(const pipeline_state_t& state) {
-        inputs_.push(state);
+    void push(pipeline_trace_t* trace) {
+        inputs_.push(trace);
    }

-    bool pop_output(pipeline_state_t* state) {
-        return outputs_.try_pop(state);
+    bool empty() const {
+        return outputs_.empty();
+    }
+
+    pipeline_trace_t* top() const {
+        return outputs_.top();
+    }
+
+    void pop() {
+        outputs_.pop();
    }

    virtual void step(uint64_t cycle) = 0;
@@ -61,8 +68,8 @@ class LsuUnit : public ExeUnit {
 private:
    Core* core_;
    uint32_t num_threads_;
-    HashTable<std::pair<pipeline_state_t, ThreadMask>> pending_dcache_;
-    pipeline_state_t fence_state_;
+    HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_dcache_;
+    pipeline_trace_t* fence_state_;
    bool fence_lock_;

 public:
@@ -101,6 +108,13 @@ public:
 ///////////////////////////////////////////////////////////////////////////////

 class GpuUnit : public ExeUnit {
+private:
+    Core* core_;
+    uint32_t num_threads_;
+    HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_tex_reqs_;
+
+    bool processTexRequest(uint64_t cycle, pipeline_trace_t* trace);
+    
 public:
    GpuUnit(Core*);
    
--- a/sim/simX/ibuffer.h
+++ b/sim/simX/ibuffer.h
@@ -7,7 +7,7 @@ namespace vortex {

 class IBuffer {
 private:
-    std::queue<pipeline_state_t> entries_;
+    std::queue<pipeline_trace_t*> entries_;
    uint32_t capacity_;

 public:    
@@ -23,12 +23,12 @@ public:
        return (entries_.size() == capacity_);
    }

-    const pipeline_state_t& top() const {
+    pipeline_trace_t* top() const {
        return entries_.front();
    }

-    void push(const pipeline_state_t& state) {
-        entries_.emplace(state);
+    void push(pipeline_trace_t* trace) {
+        entries_.emplace(trace);
    }

    void pop() {
--- a/sim/simX/instr.h
+++ b/sim/simX/instr.h
@@ -29,10 +29,9 @@ enum Opcode {
  FMNMADD   = 0x4f,
  // Vector Extension  
  VSET      = 0x57,
-  VL        = 0x7,
-  VS        = 0x27,
  // GPGPU Extension
  GPGPU     = 0x6b,
+  GPU       = 0x5b,
 };

 enum InstType { 
@@ -70,6 +69,7 @@ public:
  void setSrcFReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Float; rsrc_[num_rsrcs_++] = srcReg;  }
  void setDestVReg(int destReg) { rdest_type_ = RegType::Vector; rdest_ = destReg; }
  void setSrcVReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Vector; rsrc_[num_rsrcs_++] = srcReg;  }
+  void setFunc2(Word func2) { func2_ = func2; }
  void setFunc3(Word func3) { func3_ = func3; }
  void setFunc7(Word func7) { func7_ = func7; }
  void setImm(Word imm) { has_imm_ = true; imm_ = imm; }
@@ -85,6 +85,7 @@ public:

  /* Getters used by encoders. */
  Opcode getOpcode() const { return opcode_; }
+  Word getFunc2() const { return func2_; }
  Word getFunc3() const { return func3_; }
  Word getFunc6() const { return func6_; }
  Word getFunc7() const { return func7_; }
@@ -118,6 +119,7 @@ private:
  RegType rsrc_type_[MAX_REG_SOURCES];
  int rsrc_[MAX_REG_SOURCES];  
  int rdest_;
+  Word func2_;
  Word func3_;
  Word func6_;

--- a/sim/simX/memsim.cpp
+++ b/sim/simX/memsim.cpp
@@ -20,14 +20,16 @@ public:

    void step(uint64_t /*cycle*/) {
        for (uint32_t i = 0, n = num_banks_; i < n; ++i) {
-            MemReq mem_req;     
-            if (!simobject_->MemReqPorts.at(i).read(&mem_req))
+            auto& mem_req_port = simobject_->MemReqPorts.at(i); 
+            if (mem_req_port.empty())
                continue;
+            auto& mem_req = mem_req_port.top();
            if (!mem_req.write) {
                MemRsp mem_rsp;
                mem_rsp.tag = mem_req.tag;
                simobject_->MemRspPorts.at(i).send(mem_rsp, latency_);
            }
+            mem_req_port.pop();
        }
    }
 };
--- a/sim/simX/memsim.h
+++ b/sim/simX/memsim.h
@@ -10,10 +10,22 @@ struct MemReq {
    uint64_t addr;
    uint32_t tag;
    bool write;
+    bool is_io;
+
+    MemReq(uint64_t _addr = 0, 
+           uint64_t _tag = 0, 
+           bool _write = false, 
+           bool _is_io = false
+    )   : addr(_addr)
+        , tag(_tag)
+        , write(_write)
+        , is_io(_is_io) 
+    {}
 };

 struct MemRsp {
-    uint32_t tag;
+    uint64_t tag;    
+    MemRsp(uint64_t _tag = 0) : tag (_tag) {}
 };

 class MemSim : public SimObject<MemSim>{
--- a/sim/simX/pipeline.h
+++ b/sim/simX/pipeline.h
@@ -5,11 +5,12 @@
 #include <iostream>
 #include <util.h>
 #include "types.h"
+#include "archdef.h"
 #include "debug.h"

 namespace vortex {

-struct pipeline_state_t {
+struct pipeline_trace_t {
  //--
  uint64_t    id;
  
@@ -20,17 +21,24 @@ struct pipeline_state_t {
  Word        PC;

  //--
-  bool        stall_warp;
+  bool        fetch_stall;
+  bool        pipeline_stall;
+
+  //--
  bool        wb;  
  RegType     rdest_type;
  int         rdest;
+
+  //--
  RegMask     used_iregs;
  RegMask     used_fregs;
  RegMask     used_vregs;

  //- 
  ExeType     exe_type; 
-  std::vector<uint64_t> mem_addrs;
+
+  //--
+  std::vector<std::vector<uint64_t>> mem_addrs;
  
  //--
  union {
@@ -51,27 +59,37 @@ struct pipeline_state_t {
  // stats
  uint64_t icache_latency;
  uint64_t dcache_latency;
+  uint64_t tex_latency;

-  void clear() {
+  pipeline_trace_t(uint64_t id_, const ArchDef& arch) {
+    id  = id_;
    cid = 0;
    wid = 0;
    tmask.reset();
-    PC = 0;
-    stall_warp = false;
-    wb = false;
+    PC  = 0;
+    fetch_stall = false;
+    pipeline_stall = false;
+    wb  = false;
    rdest = 0;
    rdest_type = RegType::None;
    used_iregs.reset();
    used_fregs.reset();
    used_vregs.reset();
    exe_type = ExeType::NOP;
-    mem_addrs.clear();    
+    mem_addrs.resize(arch.num_threads());    
    icache_latency = 0;
    dcache_latency = 0;
+    tex_latency    = 0;
+  }
+
+  bool check_stalled(bool stall) {
+    bool old = pipeline_stall;
+    pipeline_stall = stall;
+    return stall ? old : true;
  }
 };

-inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state) {
+inline std::ostream &operator<<(std::ostream &os, const pipeline_trace_t& state) {
  os << "coreid=" << state.cid << ", wid=" << state.wid << ", PC=" << std::hex << state.PC;
  os << ", wb=" << state.wb;
  if (state.wb) {
@@ -82,10 +100,9 @@ inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state)
  return os;
 }

-class PipelineStage : public Queue<pipeline_state_t> {
+class PipelineStage : public Queue<pipeline_trace_t*> {
 protected:
  const char* name_;
-  friend std::ostream &operator<<(std::ostream &, const pipeline_state_t&);

 public:
  PipelineStage(const char* name = nullptr) 
--- a/sim/simX/processor.cpp
+++ b/sim/simX/processor.cpp
@@ -33,7 +33,8 @@ Processor::Processor(const ArchDef& arch)
      L3_NUM_BANKS,           // number of banks
      L3_NUM_PORTS,           // number of ports
      NUM_CLUSTERS,           // request size   
-      true,                   // write-throught
+      true,                   // write-through
+      false,                  // write response
      0,                      // victim size
      L3_MSHR_SIZE,           // mshr
      2,                      // pipeline latency
@@ -74,7 +75,8 @@ Processor::Processor(const ArchDef& arch)
        L2_NUM_BANKS,           // number of banks
        L2_NUM_PORTS,           // number of ports
        NUM_CORES,              // request size   
-        true,                   // write-throught
+        true,                   // write-through
+        false,                  // write response
        0,                      // victim size
        L2_MSHR_SIZE,           // mshr
        2,                      // pipeline latency
@@ -129,7 +131,7 @@ int Processor::run() {
      if (core->running()) {
        running = true;
      }
-      if (core->check_ebreak()) {
+      if (core->check_exit()) {
        exitcode = core->getIRegValue(3);
        running = false;
        break;
@@ -137,5 +139,7 @@ int Processor::run() {
    }
  } while (running);

+  std::cout << std::flush;
+
  return exitcode;
 }
--- a/sim/simX/scoreboard.h
+++ b/sim/simX/scoreboard.h
@@ -7,6 +7,12 @@ namespace vortex {

 class Scoreboard {
 private:
+    struct reg_use_t {
+        RegType  type;
+        uint32_t reg;        
+        uint64_t owner;
+    };
+
    std::vector<RegMask> in_use_iregs_;
    std::vector<RegMask> in_use_fregs_;
    std::vector<RegMask> in_use_vregs_;
@@ -25,21 +31,21 @@ public:
        }
    }

-    bool in_use(const pipeline_state_t& state) const {
-        return (state.used_iregs & in_use_iregs_.at(state.wid)) != 0 
-            || (state.used_fregs & in_use_fregs_.at(state.wid)) != 0
-            || (state.used_vregs & in_use_vregs_.at(state.wid)) != 0;
+    bool in_use(pipeline_trace_t* state) const {
+        return (state->used_iregs & in_use_iregs_.at(state->wid)) != 0 
+            || (state->used_fregs & in_use_fregs_.at(state->wid)) != 0
+            || (state->used_vregs & in_use_vregs_.at(state->wid)) != 0;
    }

-    std::vector<uint64_t> owners(const pipeline_state_t& state) const {
-        std::vector<uint64_t> out;        
+    std::vector<reg_use_t> get_uses(pipeline_trace_t* state) const {
+        std::vector<reg_use_t> out;        
        {
            uint32_t r = 0;
-            auto used_iregs = state.used_iregs & in_use_iregs_.at(state.wid);        
+            auto used_iregs = state->used_iregs & in_use_iregs_.at(state->wid);        
            while (used_iregs.any()) {
                if (used_iregs.test(0)) {
-                    uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Integer;
-                    out.push_back(owners_.at(tag));
+                    uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Integer;
+                    out.push_back({RegType::Integer, r, owners_.at(tag)});
                }
                used_iregs >>= 1;
                ++r;
@@ -47,11 +53,11 @@ public:
        }
        {
            uint32_t r = 0;
-            auto used_fregs = state.used_fregs & in_use_fregs_.at(state.wid);
+            auto used_fregs = state->used_fregs & in_use_fregs_.at(state->wid);
            while (used_fregs.any()) {
                if (used_fregs.test(0)) {
-                    uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Float;
-                    out.push_back(owners_.at(tag));
+                    uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Float;
+                    out.push_back({RegType::Float, r, owners_.at(tag)});
                }
                used_fregs >>= 1;
                ++r;
@@ -59,11 +65,11 @@ public:
        }
        {
            uint32_t r = 0;
-            auto used_vregs = state.used_vregs & in_use_vregs_.at(state.wid);
+            auto used_vregs = state->used_vregs & in_use_vregs_.at(state->wid);
            while (used_vregs.any()) {
                if (used_vregs.test(0)) {
-                    uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Vector;
-                    out.push_back(owners_.at(tag));
+                    uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Vector;
+                    out.push_back({RegType::Vector, r, owners_.at(tag)});
                }
                used_vregs >>= 1;
                ++r;
@@ -72,44 +78,44 @@ public:
        return std::move(out);
    }
    
-    void reserve(const pipeline_state_t& state) {
-        if (!state.wb)
+    void reserve(pipeline_trace_t* state) {
+        if (!state->wb)
            return;  
-        switch (state.rdest_type) {
+        switch (state->rdest_type) {
        case RegType::Integer:            
-            in_use_iregs_.at(state.wid).set(state.rdest);
+            in_use_iregs_.at(state->wid).set(state->rdest);
            break;
        case RegType::Float:
-            in_use_fregs_.at(state.wid).set(state.rdest);
+            in_use_fregs_.at(state->wid).set(state->rdest);
            break;
        case RegType::Vector:
-            in_use_vregs_.at(state.wid).set(state.rdest);
+            in_use_vregs_.at(state->wid).set(state->rdest);
            break;
        default:  
            break;
        }      
-        uint32_t tag = (state.rdest << 16) | (state.wid << 4) | (int)state.rdest_type;
+        uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
        assert(owners_.count(tag) == 0);
-        owners_[tag] = state.id;
+        owners_[tag] = state->id;
    }

-    void release(const pipeline_state_t& state) {
-        if (!state.wb)
+    void release(pipeline_trace_t* state) {
+        if (!state->wb)
            return;       
-        switch (state.rdest_type) {
+        switch (state->rdest_type) {
        case RegType::Integer:
-            in_use_iregs_.at(state.wid).reset(state.rdest);
+            in_use_iregs_.at(state->wid).reset(state->rdest);
            break;
        case RegType::Float:
-            in_use_fregs_.at(state.wid).reset(state.rdest);
+            in_use_fregs_.at(state->wid).reset(state->rdest);
            break;
        case RegType::Vector:
-            in_use_vregs_.at(state.wid).reset(state.rdest);
+            in_use_vregs_.at(state->wid).reset(state->rdest);
            break;
        default:  
            break;
        }      
-        uint32_t tag = (state.rdest << 16) | (state.wid << 4) | (int)state.rdest_type;
+        uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
        owners_.erase(tag);
    }
 };
--- a/sim/simX/tex_unit.cpp
+++ b/sim/simX/tex_unit.cpp
@@ -0,0 +1,109 @@
+#include "tex_unit.h"
+#include "core.h"
+#include <texturing.h>
+#include <VX_config.h>
+#include <algorithm>
+#include <cstdlib>
+
+using namespace vortex;
+
+// Texture filter selection. The raw TEX_STATE_FILTER value is cast to this
+// enum in TexUnit::read(), so the enumerator order defines the encoding.
+// Only Point and Bilinear are handled there; any other value aborts.
+enum class FilterMode {
+  Point,
+  Bilinear,
+  Trilinear,
+};
+
+// Value-initialize states_ so that every state register reads as zero until
+// set_state() is called, instead of holding an indeterminate value.
+TexUnit::TexUnit(Core* core) : states_{}, core_(core) {}
+
+TexUnit::~TexUnit() {}
+
+// Returns the value of state register `state`. std::array::at() throws
+// std::out_of_range when the index is outside the states_ array.
+uint32_t TexUnit::get_state(uint32_t state) {
+  return states_.at(state);
+}
+
+// Sets state register `state` to `value`, bounds-checked like get_state().
+void TexUnit::set_state(uint32_t state, uint32_t value) {
+  states_.at(state) = value;
+}
+
+// Samples the texture described by the state registers.
+//   u, v      - texture coordinates, wrapped into Fixed<TEX_FXD_FRAC>
+//   lod       - mip level; selects the TEX_STATE_MIPOFF(lod) offset and is
+//               subtracted from the WIDTH/HEIGHT states (clamped at 0)
+//   mem_addrs - must be non-null; receives every texel address that was read
+//               (one for point filtering, four for bilinear)
+// Returns the filtered color; aborts on an unsupported filter mode.
+uint32_t TexUnit::read(int32_t u,
+                       int32_t v,
+                       int32_t lod,
+                       std::vector<uint64_t>* mem_addrs) {
+  // decode coordinates and sampler state
+  auto xu = Fixed<TEX_FXD_FRAC>::make(u);
+  auto xv = Fixed<TEX_FXD_FRAC>::make(v);
+  uint32_t base_addr  = states_.at(TEX_STATE_ADDR) + states_.at(TEX_STATE_MIPOFF(lod));
+  uint32_t log_width  = std::max<int32_t>(states_.at(TEX_STATE_WIDTH) - lod, 0);
+  uint32_t log_height = std::max<int32_t>(states_.at(TEX_STATE_HEIGHT) - lod, 0);
+  auto format         = (TexFormat)states_.at(TEX_STATE_FORMAT);
+  auto filter         = (FilterMode)states_.at(TEX_STATE_FILTER);
+  auto wrapu          = (WrapMode)states_.at(TEX_STATE_WRAPU);
+  auto wrapv          = (WrapMode)states_.at(TEX_STATE_WRAPV);
+
+  auto stride = Stride(format);
+
+  switch (filter) {
+  case FilterMode::Bilinear: {
+    // addressing
+    uint32_t offset00, offset01, offset10, offset11;
+    uint32_t alpha, beta;
+    TexAddressLinear(xu, xv, log_width, log_height, wrapu, wrapv,
+      &offset00, &offset01, &offset10, &offset11, &alpha, &beta);
+
+    uint32_t addr00 = base_addr + offset00 * stride;
+    uint32_t addr01 = base_addr + offset01 * stride;
+    uint32_t addr10 = base_addr + offset10 * stride;
+    uint32_t addr11 = base_addr + offset11 * stride;
+
+    // memory lookup
+    uint32_t texel00 = core_->dcache_read(addr00, stride);
+    uint32_t texel01 = core_->dcache_read(addr01, stride);
+    uint32_t texel10 = core_->dcache_read(addr10, stride);
+    uint32_t texel11 = core_->dcache_read(addr11, stride);
+
+    mem_addrs->push_back(addr00);
+    mem_addrs->push_back(addr01);
+    mem_addrs->push_back(addr10);
+    mem_addrs->push_back(addr11);
+
+    // filtering
+    auto color = TexFilterLinear(
+      format, texel00, texel01, texel10, texel11, alpha, beta);
+    return color;
+  }
+  case FilterMode::Point: {
+    // addressing
+    uint32_t offset;
+    TexAddressPoint(xu, xv, log_width, log_height, wrapu, wrapv, &offset);
+
+    uint32_t addr = base_addr + offset * stride;
+
+    // memory lookup
+    uint32_t texel = core_->dcache_read(addr, stride);
+    mem_addrs->push_back(addr);
+
+    // filtering
+    auto color = TexFilterPoint(format, texel);
+    return color;
+  }
+  default:
+    // Trilinear (or an unrecognized filter state value) is not implemented.
+    std::abort();
+    return 0;
+  }
+}
--- a/sim/simX/tex_unit.h
+++ b/sim/simX/tex_unit.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "types.h"
+#include <array>
+#include <cstdint>
+#include <vector>
+
+namespace vortex {
+
+class Core;
+
+// Texture unit model: holds NUM_TEX_STATES 32-bit state registers and reads
+// texels through the Core passed to the constructor (Core::dcache_read).
+class TexUnit {
+public:
+    TexUnit(Core* core);
+    ~TexUnit();
+
+    // Returns the value of state register `state` (bounds-checked).
+    uint32_t get_state(uint32_t state);
+
+    // Sets state register `state` to `value` (bounds-checked).
+    void set_state(uint32_t state, uint32_t value);
+
+    // Samples the texture at (u, v) for mip level `lod`, appends the address
+    // of every texel it read to `mem_addrs`, and returns the filtered color.
+    uint32_t read(int32_t u, int32_t v, int32_t lod, std::vector<uint64_t>* mem_addrs);
+
+private:
+
+    std::array<uint32_t, NUM_TEX_STATES> states_;
+    Core* core_;
+};
+
+}
--- a/sim/simX/types.h
+++ b/sim/simX/types.h
@@ -66,6 +66,7 @@ enum class AluType {
  BRANCH,
  IMUL,
  IDIV,    
+  CMOV,
 };

 inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
@@ -74,6 +75,7 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
  case AluType::BRANCH: os << "BRANCH"; break;
  case AluType::IMUL:   os << "IMUL"; break;
  case AluType::IDIV:   os << "IDIV"; break;
+  case AluType::CMOV:   os << "CMOV"; break;
  }
  return os;
 }
@@ -155,8 +157,6 @@ class Queue {
 protected:
  std::queue<T> queue_;

-  uint32_t count;
-
 public:
  Queue() {}

@@ -168,21 +168,16 @@ public:
    return queue_.front();
  }

-  void push(const T& value) {
-    ++count;
-    queue_.push(value);
+  T& top() {
+    return queue_.front();
  }

  void pop() {
    queue_.pop();
  }

-  bool try_pop(T* value) {
-    if (queue_.empty())
-      return false;
-    *value = queue_.front();
-    queue_.pop();
-    return true;
+  void push(const T& value) {    
+    queue_.push(value);
  }
 };

@@ -244,14 +239,6 @@ public:
    entry.first = false;
    --capacity_;
  }
-
-  void remove(uint32_t index, T* value) {
-    auto& entry = entries_.at(index);
-    assert(entry.first);
-    *value = entry.second;
-    entry.first = false;
-    --capacity_;
-  }
 };

 ///////////////////////////////////////////////////////////////////////////////
@@ -259,18 +246,7 @@ public:
 template <typename Req, typename Rsp, uint32_t MaxInputs = 32>
 class Switch : public SimObject<Switch<Req, Rsp>> {
 private:
-  struct req_batch_t {  
-    std::vector<Req>       data;
-    std::bitset<MaxInputs> valid;
-    req_batch_t() {} 
-    req_batch_t(uint32_t size) 
-      : data(size)
-      , valid(0)
-    {} 
-  };
-
  ArbiterType type_;
-  std::queue<req_batch_t> reqq_;
  uint32_t delay_;  
  uint32_t cursor_;
  uint32_t tag_shift_;
@@ -295,55 +271,43 @@ public:
  {
    assert(delay_ != 0);
    assert(num_inputs <= MaxInputs);
+    if (num_inputs == 1) {
+      // bypass
+      ReqIn.at(0).bind(&ReqOut);
+      RspIn.bind(&RspOut.at(0));
+    }
  }

-  void step(uint64_t /*cycle*/) {    
-    // process incomming requests
-    {
-      req_batch_t req_batch(ReqIn.size());
-      for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) {
-        Req req;
-        if (ReqIn.at(i).read(&req)) {
-          req_batch.data.at(i) = req;
-          req_batch.valid.set(i);
+  void step(uint64_t /*cycle*/) {  
+    if (ReqIn.size() == 1)
+      return;
+        
+    // process incoming requests
+    for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) {      
+      uint32_t j = (cursor_ + i) % n;
+      auto& req_in = ReqIn.at(j);      
+      if (!req_in.empty()) {
+        auto& req = req_in.top();
+        if (tag_shift_) {
+          req.tag = (req.tag << tag_shift_) | j;
        }
+        ReqOut.send(req, delay_);                
+        req_in.pop();
+        this->update_cursor(j);
+        break;
      }
-      if (req_batch.valid.any()) {
-        reqq_.push(req_batch);
-      }
-    }
-
-    // apply arbitration
-    if (!reqq_.empty()) {
-      auto& req_batch = reqq_.front();
-      for (uint32_t i = 0, n = req_batch.data.size(); i < n; ++i) {
-        auto j = (cursor_ + i) % n;        
-        if (req_batch.valid.test(j)) {
-          auto& req = req_batch.data.at(j);
-          if (tag_shift_) {
-            req.tag = (req.tag << tag_shift_) | j;
-          }
-          ReqOut.send(req, delay_);
-          req_batch.valid.reset(j);
-          this->update_cursor(j);
-          if (!req_batch.valid.any())
-            reqq_.pop(); // pop when empty
-          break;
-        }
-      }      
    } 

    // process incoming reponses
-    {
-      Rsp rsp;
-      if (RspIn.read(&rsp)) {    
-        uint32_t port_id = 0;
-        if (tag_shift_) {
-          port_id = rsp.tag & ((1 << tag_shift_)-1);
-          rsp.tag >>= tag_shift_;
-        }      
-        RspOut.at(port_id).send(rsp, 1);
-      }
+    if (!RspIn.empty()) {
+      auto& rsp = RspIn.top();    
+      uint32_t port_id = 0;
+      if (tag_shift_) {
+        port_id = rsp.tag & ((1 << tag_shift_)-1);
+        rsp.tag >>= tag_shift_;
+      }      
+      RspOut.at(port_id).send(rsp, 1);
+      RspIn.pop();
    }
  }

--- a/sim/simX/warp.cpp
+++ b/sim/simX/warp.cpp
@@ -21,7 +21,7 @@ Warp::Warp(Core *core, Word id)
  vRegFile_.resize(core_->arch().num_regs(), std::vector<Byte>(core_->arch().vsize(), 0));
 }

-void Warp::eval(pipeline_state_t *pipeline_state) {
+void Warp::eval(pipeline_trace_t *trace) {
  assert(tmask_.any());

  DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask=");
@@ -38,18 +38,18 @@ void Warp::eval(pipeline_state_t *pipeline_state) {
    std::abort();
  }  

-  DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr);
+  DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr << " (#" << trace->id << ")");

-  // Update state
-  pipeline_state->cid   = core_->id();
-  pipeline_state->wid   = id_;
-  pipeline_state->PC    = PC_;
-  pipeline_state->tmask = tmask_;
-  pipeline_state->rdest = instr->getRDest();
-  pipeline_state->rdest_type = instr->getRDType();
+  // Update trace
+  trace->cid   = core_->id();
+  trace->wid   = id_;
+  trace->PC    = PC_;
+  trace->tmask = tmask_;
+  trace->rdest = instr->getRDest();
+  trace->rdest_type = instr->getRDType();
    
  // Execute
-  this->execute(*instr, pipeline_state);
+  this->execute(*instr, trace);

  DP(4, "Register state:");
  for (int i = 0; i < core_->arch().num_regs(); ++i) {
--- a/sim/simX/warp.h
+++ b/sim/simX/warp.h
@@ -9,7 +9,7 @@ namespace vortex {

 class Core;
 class Instr;
-class pipeline_state_t;
+class pipeline_trace_t;
 struct DomStackEntry {
  DomStackEntry(const ThreadMask &tmask, Word PC) 
    : tmask(tmask)
@@ -83,11 +83,11 @@ public:
    return iRegFile_.at(0).at(reg);
  }

-  void eval(pipeline_state_t *);
+  void eval(pipeline_trace_t *);

 private:

-  void execute(const Instr &instr, pipeline_state_t *pipeline_state);
+  void execute(const Instr &instr, pipeline_trace_t *trace);
  
  Word id_;
  Core *core_;
--- a/sim/vlsim/Makefile
+++ b/sim/vlsim/Makefile
@@ -24,7 +24,6 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
 DBG_TRACE_FLAGS += -DDBG_TRACE_TEX

 DBG_FLAGS += $(DBG_TRACE_FLAGS)
-DBG_FLAGS += -DDBG_CACHE_REQ_INFO

 SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
 SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
@@ -51,10 +50,13 @@ CXXFLAGS += $(CONFIGS)
 #THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
 #VL_FLAGS += --threads $(THREADS)

+# Enable VCD trace
+#VCD_TRACE = -DVCD_OUTPUT
+
 # Debugigng
 ifdef DEBUG
-	VL_FLAGS += -DVCD_OUTPUT --trace --trace-structs $(DBG_FLAGS)
-	CXXFLAGS += -g -O0 -DVCD_OUTPUT $(DBG_FLAGS)
+	VL_FLAGS += $(VCD_TRACE) --trace --trace-structs $(DBG_FLAGS)
+	CXXFLAGS += -g -O0 $(VCD_TRACE) $(DBG_FLAGS)
 else    
 	VL_FLAGS += -DNDEBUG
 	CXXFLAGS += -O2 -DNDEBUG