Merge remote-tracking branch 'upstream/master' into vortex2

2024-02-01 23:35:58 -08:00
parent a15f4fd483 b31d868a27
commit 48558982f7
203 changed files with 4383 additions and 21981 deletions
--- a/sim/opaesim/Makefile
+++ b/sim/opaesim/Makefile
@@ -1,20 +1,22 @@
 XLEN ?= 32
-DESTDIR ?= .
-RTL_DIR = ../../hw/rtl
-DPI_DIR = ../../hw/dpi
+DESTDIR ?= $(CURDIR)
+HW_DIR = $(abspath ../../hw)
+COMMON_DIR = $(abspath ../common)
+THIRD_PARTY_DIR = $(abspath ../../third_party)
+RTL_DIR = $(HW_DIR)/rtl
+DPI_DIR = $(HW_DIR)/dpi
 AFU_DIR = $(RTL_DIR)/afu/opae
-SCRIPT_DIR = ../../hw/scripts
-THIRD_PARTY_DIR = ../../third_party
+SCRIPT_DIR = $(HW_DIR)/scripts

 CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds
 CXXFLAGS += -fPIC -Wno-maybe-uninitialized
-CXXFLAGS += -I.. -I../../../hw -I../../common -I$(abspath $(DESTDIR))
-CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include
-CXXFLAGS += -I../$(THIRD_PARTY_DIR)
+CXXFLAGS += -I$(CURDIR) -I$(HW_DIR) -I$(COMMON_DIR) -I$(DESTDIR)
+CXXFLAGS += -I$(THIRD_PARTY_DIR)/softfloat/source/include
+CXXFLAGS += -I$(THIRD_PARTY_DIR)
 CXXFLAGS += -DXLEN_$(XLEN)

-LDFLAGS += -shared ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
-LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator -pthread
+LDFLAGS += -shared $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
+LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator -pthread

 # control RTL debug tracing states
 DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_PIPELINE  
@@ -53,9 +55,9 @@ endif

 DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)

-SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
+SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp
 SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
-SRCS += fpga.cpp opae_sim.cpp
+SRCS += $(CURDIR)/fpga.cpp $(CURDIR)/opae_sim.cpp

 RTL_PKGS = $(AFU_DIR)/local_mem_cfg_pkg.sv $(AFU_DIR)/ccip/ccip_if_pkg.sv
 RTL_PKGS += $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv
@@ -73,7 +75,7 @@ TOP = vortex_afu_shim
 VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
 VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
 VL_FLAGS += --x-initial unique --x-assign unique
-VL_FLAGS += -DSIMULATION
+VL_FLAGS += -DSIMULATION -DSV_DPI
 VL_FLAGS += -DXLEN_$(XLEN)
 VL_FLAGS += $(CONFIGS)
 VL_FLAGS += verilator.vlt
@@ -119,16 +121,16 @@ PROJECT = libopae-c-sim.so
 all: $(DESTDIR)/$(PROJECT)

 $(DESTDIR)/vortex.xml:
-	verilator --xml-only -O0 $(VL_FLAGS) $(TOP) --xml-output $(DESTDIR)/vortex.xml
+	verilator --xml-only -O0 $(VL_FLAGS) $(TOP) --xml-output $@

 $(DESTDIR)/scope.json: $(DESTDIR)/vortex.xml
-	$(SCRIPT_DIR)/scope.py $(DESTDIR)/vortex.xml -o $(DESTDIR)/scope.json
+	$(SCRIPT_DIR)/scope.py $^ -o $@

 $(DESTDIR)/vortex_afu.h : $(AFU_DIR)/vortex_afu.vh
-	$(SCRIPT_DIR)/gen_config.py -i $(AFU_DIR)/vortex_afu.vh -o $(DESTDIR)/vortex_afu.h
+	$(SCRIPT_DIR)/gen_config.py -i $^ -o $@

 $(DESTDIR)/$(PROJECT): $(SRCS) $(DESTDIR)/vortex_afu.h $(SCOPE_JSON)
-	verilator --build --exe -O3 $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(DESTDIR)/$(PROJECT)
+	verilator --build --exe -O3 $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' --Mdir $(DESTDIR)/obj_dir -o $@

 clean:
-	rm -rf obj_dir $(DESTDIR)/vortex.xml $(DESTDIR)/scope.json $(DESTDIR)/vortex_afu.h $(DESTDIR)/$(PROJECT)
+	rm -rf $(DESTDIR)/obj_dir $(DESTDIR)/vortex.xml $(DESTDIR)/scope.json $(DESTDIR)/vortex_afu.h $(DESTDIR)/$(PROJECT)
--- a/sim/rtlsim/.gitignore
+++ b/sim/rtlsim/.gitignore
@@ -1,2 +1 @@
-VX_config.h
 /obj_dir/*
--- a/sim/rtlsim/Makefile
+++ b/sim/rtlsim/Makefile
@@ -1,18 +1,20 @@
 XLEN ?= 32
-DESTDIR ?= .
-RTL_DIR = ../../hw/rtl
-DPI_DIR = ../../hw/dpi
-THIRD_PARTY_DIR = ../../third_party
+DESTDIR ?= $(CURDIR)
+HW_DIR = $(abspath ../../hw)
+COMMON_DIR = $(abspath ../common)
+THIRD_PARTY_DIR = $(abspath ../../third_party)
+RTL_DIR = $(HW_DIR)/rtl
+DPI_DIR = $(HW_DIR)/dpi

 CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds
 CXXFLAGS += -fPIC -Wno-maybe-uninitialized
-CXXFLAGS += -I../../../hw -I../../common
-CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include
-CXXFLAGS += -I../$(THIRD_PARTY_DIR)
+CXXFLAGS += -I$(HW_DIR) -I$(COMMON_DIR)
+CXXFLAGS += -I$(THIRD_PARTY_DIR)/softfloat/source/include
+CXXFLAGS += -I$(THIRD_PARTY_DIR)
 CXXFLAGS += -DXLEN_$(XLEN)

-LDFLAGS += ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
-LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator
+LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
+LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator

 # control RTL debug tracing states
 DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_PIPELINE  
@@ -38,9 +40,9 @@ ifneq (,$(findstring FPU_FPNEW,$(CONFIGS)))
 endif
 RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache $(FPU_INCLUDE)

-SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
+SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp
 SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
-SRCS += processor.cpp
+SRCS += $(CURDIR)/processor.cpp

 ifdef AXI_BUS
 	TOP = Vortex_axi
@@ -54,7 +56,7 @@ VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
 VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
 VL_FLAGS += --x-initial unique --x-assign unique
 VL_FLAGS += verilator.vlt
-VL_FLAGS += -DSIMULATION
+VL_FLAGS += -DSIMULATION -DSV_DPI
 VL_FLAGS += -DXLEN_$(XLEN)
 VL_FLAGS += $(CONFIGS)
 VL_FLAGS += $(RTL_INCLUDE)
@@ -87,11 +89,11 @@ PROJECT = rtlsim

 all: $(DESTDIR)/$(PROJECT)

-$(DESTDIR)/$(PROJECT): $(SRCS) main.cpp
-	verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS) -DSTARTUP_ADDR=0x80000000' -LDFLAGS '$(LDFLAGS)' -o ../$@
+$(DESTDIR)/$(PROJECT): $(SRCS) $(CURDIR)/main.cpp
+	verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS) -DSTARTUP_ADDR=0x80000000' -LDFLAGS '$(LDFLAGS)' --Mdir $(DESTDIR)/obj_dir -o $@
 	
 $(DESTDIR)/lib$(PROJECT).so: $(SRCS)
-	verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS)' -LDFLAGS '-shared $(LDFLAGS)' -o ../$@
+	verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS)' -LDFLAGS '-shared $(LDFLAGS)' --Mdir $(DESTDIR)/obj_dir -o $@

 clean:
-	rm -rf obj_dir $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so
+	rm -rf $(DESTDIR)/obj_dir $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so
--- a/sim/simx/Makefile
+++ b/sim/simx/Makefile
@@ -1,11 +1,12 @@
 XLEN ?= 32
-DESTDIR ?= .
-RTL_DIR = ../hw/rtl
-THIRD_PARTY_DIR = ../../third_party
+DESTDIR ?= $(CURDIR)
+HW_DIR = $(abspath ../../hw)
+COMMON_DIR = $(abspath ../common)
+THIRD_PARTY_DIR = $(abspath ../../third_party)

 CXXFLAGS += -std=c++17 -Wall -Wextra -Wfatal-errors
 CXXFLAGS += -fPIC -Wno-maybe-uninitialized
-CXXFLAGS += -I. -I../common -I../../hw
+CXXFLAGS += -I$(CURDIR) -I$(COMMON_DIR) -I$(HW_DIR)
 CXXFLAGS += -I$(THIRD_PARTY_DIR)/softfloat/source/include
 CXXFLAGS += -I$(THIRD_PARTY_DIR)
 CXXFLAGS += -DXLEN_$(XLEN)
@@ -14,8 +15,8 @@ CXXFLAGS += $(CONFIGS)
 LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
 LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator

-SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
-SRCS += processor.cpp cluster.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp
+SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp
+SRCS += processor.cpp cluster.cpp socket.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp

 # Debugigng
 ifdef DEBUG
--- a/sim/simx/arch.h
+++ b/sim/simx/arch.h
@@ -28,6 +28,7 @@ private:
  uint16_t num_warps_;
  uint16_t num_cores_;  
  uint16_t num_clusters_;  
+  uint16_t socket_size_;
  uint16_t vsize_;
  uint16_t num_regs_;
  uint16_t num_csrs_;
@@ -35,11 +36,12 @@ private:
  uint16_t ipdom_size_;
  
 public:
-  Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores, uint16_t num_clusters)   
+  Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores)   
    : num_threads_(num_threads)
    , num_warps_(num_warps)
    , num_cores_(num_cores)
-    , num_clusters_(num_clusters)
+    , num_clusters_(NUM_CLUSTERS)
+    , socket_size_(SOCKET_SIZE)
    , vsize_(16)
    , num_regs_(32)
    , num_csrs_(4096)
@@ -82,6 +84,10 @@ public:
  uint16_t num_clusters() const {
    return num_clusters_;
  }
+
+  uint16_t socket_size() const {
+    return socket_size_;
+  }
 };

 }
--- a/sim/simx/cache_cluster.h
+++ b/sim/simx/cache_cluster.h
@@ -45,20 +45,20 @@ public:

        char sname[100];
        
-        std::vector<Switch<MemReq, MemRsp>::Ptr> unit_arbs(num_units);
+        std::vector<MemSwitch::Ptr> unit_arbs(num_units);
        for (uint32_t u = 0; u < num_units; ++u) {
            snprintf(sname, 100, "%s-unit-arb-%d", name, u);
-            unit_arbs.at(u) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs);
+            unit_arbs.at(u) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs);
            for (uint32_t i = 0; i < num_requests; ++i) {
                this->CoreReqPorts.at(u).at(i).bind(&unit_arbs.at(u)->ReqIn.at(i));
                unit_arbs.at(u)->RspIn.at(i).bind(&this->CoreRspPorts.at(u).at(i));
            }
        }

-        std::vector<Switch<MemReq, MemRsp>::Ptr> mem_arbs(config.num_inputs);
+        std::vector<MemSwitch::Ptr> mem_arbs(config.num_inputs);
        for (uint32_t i = 0; i < config.num_inputs; ++i) {
            snprintf(sname, 100, "%s-mem-arb-%d", name, i);
-            mem_arbs.at(i) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_units, num_caches);
+            mem_arbs.at(i) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_units, num_caches);
            for (uint32_t u = 0; u < num_units; ++u) {              
                unit_arbs.at(u)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(u));
                mem_arbs.at(i)->RspIn.at(u).bind(&unit_arbs.at(u)->RspOut.at(i));
@@ -66,7 +66,7 @@ public:
        }

        snprintf(sname, 100, "%s-cache-arb", name);
-        auto cache_arb = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_caches, 1);
+        auto cache_arb = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_caches, 1);

        for (uint32_t i = 0; i < num_caches; ++i) {
            snprintf(sname, 100, "%s-cache%d", name, i);
--- a/sim/simx/cache_sim.cpp
+++ b/sim/simx/cache_sim.cpp
@@ -41,19 +41,16 @@ struct params_t {
    uint32_t tag_select_addr_end;

    params_t(const CacheSim::Config& config) {
-        int32_t bank_bits = log2ceil(config.num_banks);
-        int32_t offset_bits = config.B - config.W;
-        int32_t log2_bank_size = config.C - bank_bits;
-        int32_t index_bits = log2_bank_size - (config.B + config.A);        
-        assert(log2_bank_size > 0);
+        int32_t offset_bits = config.L - config.W;
+        int32_t index_bits = config.C - (config.L + config.A + config.B);
        assert(offset_bits >= 0);
        assert(index_bits >= 0);

        this->log2_num_inputs = log2ceil(config.num_inputs);

-        this->words_per_line = 1 << offset_bits;
+        this->sets_per_bank  = 1 << index_bits;        
        this->lines_per_set  = 1 << config.A;
-        this->sets_per_bank   = 1 << index_bits;
+        this->words_per_line = 1 << offset_bits;        

        assert(config.ports_per_bank <= this->words_per_line);
                
@@ -63,7 +60,7 @@ struct params_t {

        // Bank select
        this->bank_select_addr_start = (1+this->word_select_addr_end);
-        this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1);
+        this->bank_select_addr_end = (this->bank_select_addr_start+config.B-1);

        // Set select
        this->set_select_addr_start = (1+this->bank_select_addr_end);
@@ -74,23 +71,23 @@ struct params_t {
        this->tag_select_addr_end = (config.addr_width-1);
    }

-    uint32_t addr_bank_id(uint64_t word_addr) const {
+    uint32_t addr_bank_id(uint64_t addr) const {
        if (bank_select_addr_end >= bank_select_addr_start)
-            return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end);
+            return (uint32_t)bit_getw(addr, bank_select_addr_start, bank_select_addr_end);
        else    
            return 0;
    }

-    uint32_t addr_set_id(uint64_t word_addr) const {
+    uint32_t addr_set_id(uint64_t addr) const {
        if (set_select_addr_end >= set_select_addr_start)
-            return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end);
+            return (uint32_t)bit_getw(addr, set_select_addr_start, set_select_addr_end);
        else
            return 0;
    }

-    uint64_t addr_tag(uint64_t word_addr) const {
+    uint64_t addr_tag(uint64_t addr) const {
        if (tag_select_addr_end >= tag_select_addr_start)
-            return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end);
+            return bit_getw(addr, tag_select_addr_start, tag_select_addr_end);
        else    
            return 0;
    }
@@ -288,8 +285,8 @@ private:
    Config config_;
    params_t params_;
    std::vector<bank_t> banks_;
-    Switch<MemReq, MemRsp>::Ptr bank_switch_;    
-    Switch<MemReq, MemRsp>::Ptr bypass_switch_;
+    MemSwitch::Ptr bank_switch_;    
+    MemSwitch::Ptr bypass_switch_;
    std::vector<SimPort<MemReq>> mem_req_ports_;
    std::vector<SimPort<MemRsp>> mem_rsp_ports_;
    std::vector<bank_req_t> pipeline_reqs_;
@@ -304,16 +301,16 @@ public:
        : simobject_(simobject)
        , config_(config)
        , params_(config)
-        , banks_(config.num_banks, {config, params_})
-        , mem_req_ports_(config.num_banks, simobject)
-        , mem_rsp_ports_(config.num_banks, simobject)
-        , pipeline_reqs_(config.num_banks, config.ports_per_bank)
+        , banks_((1 << config.B), {config, params_})
+        , mem_req_ports_((1 << config.B), simobject)
+        , mem_rsp_ports_((1 << config.B), simobject)
+        , pipeline_reqs_((1 << config.B), config.ports_per_bank)
    {
        char sname[100];
        snprintf(sname, 100, "%s-bypass-arb", simobject->name().c_str());

        if (config_.bypass) {            
-            bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config_.num_inputs);            
+            bypass_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, config_.num_inputs);            
            for (uint32_t i = 0; i < config_.num_inputs; ++i) {
               simobject->CoreReqPorts.at(i).bind(&bypass_switch_->ReqIn.at(i));
               bypass_switch_->RspIn.at(i).bind(&simobject->CoreRspPorts.at(i));
@@ -323,14 +320,14 @@ public:
            return;
        }
        
-        bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::Priority, 2);
+        bypass_switch_ = MemSwitch::Create(sname, ArbiterType::Priority, 2);
        bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort);
        simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0));

-        if (config.num_banks > 1) {
+        if (config.B != 0) {
            snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str());
-            bank_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config.num_banks);
-            for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
+            bank_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, (1 << config.B));
+            for (uint32_t i = 0, n = (1 << config.B); i < n; ++i) {
                mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i));
                bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i));
            }    
@@ -383,20 +380,22 @@ public:
            pipeline_req.clear();
        }

-        // schedule MSHR replay
-        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
+        // first: schedule MSHR replay (flush MSHR queue)
+        for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) {
            auto& bank = banks_.at(bank_id);
            auto& pipeline_req = pipeline_reqs_.at(bank_id);
            bank.mshr.pop(&pipeline_req);
        }

-        // schedule memory fill
-        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
+        // second: schedule memory fill (flush memory queue)
+        for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) {
            auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
            if (mem_rsp_port.empty())
                continue;

            auto& pipeline_req = pipeline_reqs_.at(bank_id);
+
+            // skip if bank already busy
            if (pipeline_req.type != bank_req_t::None)
                continue;

@@ -407,7 +406,7 @@ public:
            mem_rsp_port.pop();
        }

-        // schedule core requests        
+        // last: schedule core requests (flush core queue)      
        for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
            auto& core_req_port = simobject_->CoreReqPorts.at(req_id);
            if (core_req_port.empty())
@@ -425,18 +424,21 @@ public:
            }

            auto bank_id = params_.addr_bank_id(core_req.addr);
-            auto set_id  = params_.addr_set_id(core_req.addr);
-            auto tag     = params_.addr_tag(core_req.addr);
-            auto port_id = req_id % config_.ports_per_bank;
-
            auto& bank = banks_.at(bank_id);
            auto& pipeline_req = pipeline_reqs_.at(bank_id);

+            // skip if bank already busy
+            if (pipeline_req.type != bank_req_t::None)
+                continue;
+
+            auto set_id  = params_.addr_set_id(core_req.addr);
+            auto tag     = params_.addr_tag(core_req.addr);
+            auto port_id = req_id % config_.ports_per_bank;           
+
            // check MSHR capacity
            if ((!core_req.write || !config_.write_through)
             && bank.mshr.full()) {
                ++perf_stats_.mshr_stalls;
-                ++perf_stats_.bank_stalls;
                continue;
            }            

@@ -452,7 +454,7 @@ public:
                }
                // extend request ports
                pipeline_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
-            } else if (pipeline_req.type == bank_req_t::None) {
+            } else {
                // schedule new request
                bank_req_t bank_req(config_.ports_per_bank);
                bank_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
@@ -463,10 +465,6 @@ public:
                bank_req.type  = bank_req_t::Core;
                bank_req.write = core_req.write;
                pipeline_req   = bank_req;
-            } else {
-                // bank in use
-                ++perf_stats_.bank_stalls;
-                continue;
            }

            if (core_req.write)
@@ -516,7 +514,7 @@ private:
    }

    void processBankRequests() {
-        for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
+        for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) {
            auto& bank = banks_.at(bank_id);
            auto pipeline_req = pipeline_reqs_.at(bank_id);
            
@@ -545,11 +543,10 @@ private:
                    }
                }
            } break;
-            case bank_req_t::Core: {        
-                bool hit = false;
-                bool found_free_line = false;            
-                uint32_t hit_line_id = 0;
-                uint32_t repl_line_id = 0;            
+            case bank_req_t::Core: {
+                int32_t hit_line_id  = -1;                
+                int32_t free_line_id = -1;
+                int32_t repl_line_id = 0;
                uint32_t max_cnt = 0;

                auto& set = bank.sets.at(pipeline_req.set_id);
@@ -557,38 +554,34 @@ private:
                // tag lookup                
                for (uint32_t i = 0, n = set.lines.size(); i < n; ++i) {
                    auto& line = set.lines.at(i);
+                    if (max_cnt < line.lru_ctr) {
+                        max_cnt = line.lru_ctr;
+                        repl_line_id = i;
+                    }
                    if (line.valid) {
-                        if (line.tag == pipeline_req.tag) {
-                            line.lru_ctr = 0;                        
+                        if (line.tag == pipeline_req.tag) {   
                            hit_line_id = i;
-                            hit = true;
+                            line.lru_ctr = 0;
                        } else {
                            ++line.lru_ctr;
                        }
-                        if (max_cnt < line.lru_ctr) {
-                            max_cnt = line.lru_ctr;
-                            repl_line_id = i;
-                        }
                    } else {                    
-                        found_free_line = true;
-                        repl_line_id = i;
+                        free_line_id = i;
                    }
                }

-                if (hit) {     
-                    //
-                    // Hit handling   
-                    //                
+                if (hit_line_id != -1) {
+                    // Hit handling
                    if (pipeline_req.write) {
-                        // handle write hit
+                        // handle write has_hit
                        auto& hit_line = set.lines.at(hit_line_id);
                        if (config_.write_through) {
                            // forward write request to memory
                            MemReq mem_req;
-                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, hit_line.tag);
+                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
                            mem_req.write = true;
-                            mem_req.cid = pipeline_req.cid;
-                            mem_req.uuid = pipeline_req.uuid;
+                            mem_req.cid   = pipeline_req.cid;
+                            mem_req.uuid  = pipeline_req.uuid;
                            mem_req_ports_.at(bank_id).send(mem_req, 1);
                            DT(3, simobject_->name() << "-dram-" << mem_req);
                        } else {
@@ -606,23 +599,21 @@ private:
                            DT(3, simobject_->name() << "-core-" << core_rsp);
                        }
                    }
-                } else {     
-                    //
-                    // Miss handling   
-                    //
+                } else {
+                    // Miss handling
                    if (pipeline_req.write)
                        ++perf_stats_.write_misses;
                    else
                        ++perf_stats_.read_misses;

-                    if (!found_free_line && !config_.write_through) {
+                    if (free_line_id == -1 && !config_.write_through) {
                        // write back dirty line
                        auto& repl_line = set.lines.at(repl_line_id);
                        if (repl_line.dirty) {                       
                            MemReq mem_req;
                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, repl_line.tag);
                            mem_req.write = true;
-                            mem_req.cid = pipeline_req.cid;
+                            mem_req.cid   = pipeline_req.cid;
                            mem_req_ports_.at(bank_id).send(mem_req, 1);
                            DT(3, simobject_->name() << "-dram-" << mem_req);
                            ++perf_stats_.evictions;
@@ -635,8 +626,8 @@ private:
                            MemReq mem_req;
                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
                            mem_req.write = true;
-                            mem_req.cid = pipeline_req.cid;
-                            mem_req.uuid = pipeline_req.uuid;
+                            mem_req.cid   = pipeline_req.cid;
+                            mem_req.uuid  = pipeline_req.uuid;
                            mem_req_ports_.at(bank_id).send(mem_req, 1);
                            DT(3, simobject_->name() << "-dram-" << mem_req);
                        }
@@ -655,7 +646,7 @@ private:
                        auto mshr_pending = bank.mshr.lookup(pipeline_req);

                        // allocate MSHR
-                        auto mshr_id = bank.mshr.allocate(pipeline_req, repl_line_id);
+                        auto mshr_id = bank.mshr.allocate(pipeline_req, (free_line_id != -1) ? free_line_id : repl_line_id);
                        
                        // send fill request
                        if (!mshr_pending) {
@@ -663,8 +654,8 @@ private:
                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
                            mem_req.write = false;
                            mem_req.tag   = mshr_id;
-                            mem_req.cid = pipeline_req.cid;
-                            mem_req.uuid = pipeline_req.uuid;
+                            mem_req.cid   = pipeline_req.cid;
+                            mem_req.uuid  = pipeline_req.uuid;
                            mem_req_ports_.at(bank_id).send(mem_req, 1);
                            DT(3, simobject_->name() << "-dram-" << mem_req);
                            ++pending_fill_reqs_;
--- a/sim/simx/cache_sim.h
+++ b/sim/simx/cache_sim.h
@@ -23,16 +23,15 @@ public:
    struct Config {
        bool    bypass;         // cache bypass
        uint8_t C;              // log2 cache size
-        uint8_t B;              // log2 block size
+        uint8_t L;              // log2 line size
        uint8_t W;              // log2 word size
        uint8_t A;              // log2 associativity
-        uint8_t addr_width;     // word address bits
-        uint8_t num_banks;      // number of banks
+        uint8_t B;              // log2 number of banks
+        uint8_t addr_width;     // word address bits        
        uint8_t ports_per_bank; // number of ports per bank
        uint8_t num_inputs;     // number of inputs
        bool    write_through;  // is write-through
        bool    write_reponse;  // enable write response
-        uint16_t victim_size;   // victim cache size
        uint16_t mshr_size;     // MSHR buffer size
        uint8_t latency;        // pipeline latency
    };
--- a/sim/simx/cluster.cpp
+++ b/sim/simx/cluster.cpp
@@ -18,34 +18,60 @@ using namespace vortex;
 Cluster::Cluster(const SimContext& ctx, 
                 uint32_t cluster_id,
                 ProcessorImpl* processor, 
-                 const Arch &arch, const 
-                 DCRS &dcrs) 
+                 const Arch &arch, 
+                 const DCRS &dcrs) 
  : SimObject(ctx, "cluster")
  , mem_req_port(this)
  , mem_rsp_port(this)
  , cluster_id_(cluster_id)
-  , cores_(arch.num_cores())  
-  , barriers_(arch.num_barriers(), 0)
-  , sharedmems_(arch.num_cores())
  , processor_(processor)
+  , sockets_(NUM_SOCKETS)
+  , barriers_(arch.num_barriers(), 0)
+  , cores_per_socket_(arch.socket_size())
 {
-  auto num_cores = arch.num_cores();
-  
  char sname[100];
+
+  uint32_t sockets_per_cluster = sockets_.size();
+
+  // create sockets
+
+  snprintf(sname, 100, "cluster%d-icache-arb", cluster_id);
+  auto icache_switch = MemSwitch::Create(sname, ArbiterType::RoundRobin, sockets_per_cluster);
+
+  snprintf(sname, 100, "cluster%d-dcache-arb", cluster_id);
+  auto dcache_switch = MemSwitch::Create(sname, ArbiterType::RoundRobin, sockets_per_cluster);
+
+  for (uint32_t i = 0; i < sockets_per_cluster; ++i) {
+    uint32_t socket_id = cluster_id * sockets_per_cluster + i;
+    auto socket = Socket::Create(socket_id, 
+                                 this, 
+                                 arch, 
+                                 dcrs);
+
+    socket->icache_mem_req_port.bind(&icache_switch->ReqIn.at(i));
+    icache_switch->RspIn.at(i).bind(&socket->icache_mem_rsp_port);
+
+    socket->dcache_mem_req_port.bind(&dcache_switch->ReqIn.at(i));
+    dcache_switch->RspIn.at(i).bind(&socket->dcache_mem_rsp_port);
+
+    sockets_.at(i) = socket;
+  }
+
+  // Create l2cache
+  
  snprintf(sname, 100, "cluster%d-l2cache", cluster_id);
  l2cache_ = CacheSim::Create(sname, CacheSim::Config{
    !L2_ENABLED,
    log2ceil(L2_CACHE_SIZE), // C
-    log2ceil(MEM_BLOCK_SIZE), // B
+    log2ceil(MEM_BLOCK_SIZE), // L
    log2ceil(L2_NUM_WAYS),  // W
    0,                      // A
+    log2ceil(L2_NUM_BANKS), // B
    XLEN,                   // address bits  
-    L2_NUM_BANKS,           // number of banks
    1,                      // number of ports
-    5,                      // request size 
+    2,                      // number of inputs
    true,                   // write-through
    false,                  // write response
-    0,                      // victim size
    L2_MSHR_SIZE,           // mshr
    2,                      // pipeline latency
  });
@@ -53,89 +79,11 @@ Cluster::Cluster(const SimContext& ctx,
  l2cache_->MemReqPort.bind(&this->mem_req_port);
  this->mem_rsp_port.bind(&l2cache_->MemRspPort);

-  snprintf(sname, 100, "cluster%d-icaches", cluster_id);
-  icaches_ = CacheCluster::Create(sname, num_cores, NUM_ICACHES, 1, CacheSim::Config{
-    !ICACHE_ENABLED,
-    log2ceil(ICACHE_SIZE),  // C
-    log2ceil(L1_LINE_SIZE), // B
-    log2ceil(sizeof(uint32_t)), // W
-    log2ceil(ICACHE_NUM_WAYS),// A
-    XLEN,                   // address bits    
-    1,                      // number of banks
-    1,                      // number of ports
-    1,                      // number of inputs
-    true,                   // write-through
-    false,                  // write response
-    0,                      // victim size
-    (uint8_t)arch.num_warps(), // mshr
-    2,                      // pipeline latency
-  });
+  icache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(0));
+  l2cache_->CoreRspPorts.at(0).bind(&icache_switch->RspOut.at(0));

-  icaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(0));
-  l2cache_->CoreRspPorts.at(0).bind(&icaches_->MemRspPort);
-
-  snprintf(sname, 100, "cluster%d-dcaches", cluster_id);
-  dcaches_ = CacheCluster::Create(sname, num_cores, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
-    !DCACHE_ENABLED,
-    log2ceil(DCACHE_SIZE),  // C
-    log2ceil(L1_LINE_SIZE), // B
-    log2ceil(sizeof(Word)), // W
-    log2ceil(DCACHE_NUM_WAYS),// A
-    XLEN,                   // address bits    
-    DCACHE_NUM_BANKS,       // number of banks
-    1,                      // number of ports
-    DCACHE_NUM_BANKS,       // number of inputs
-    true,                   // write-through
-    false,                  // write response
-    0,                      // victim size
-    DCACHE_MSHR_SIZE,       // mshr
-    4,                      // pipeline latency
-  });
-
-  dcaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(1));
-  l2cache_->CoreRspPorts.at(1).bind(&dcaches_->MemRspPort);
-
-  ///////////////////////////////////////////////////////////////////////////
-
-  // create shared memory blocks
-  for (uint32_t i = 0; i < num_cores; ++i) {
-    snprintf(sname, 100, "cluster%d-shared_mem%d", cluster_id, i);
-    sharedmems_.at(i) = SharedMem::Create(sname, SharedMem::Config{
-      (1 << SMEM_LOG_SIZE),
-      sizeof(Word),
-      NUM_LSU_LANES, 
-      NUM_LSU_LANES,
-      false
-    });
-  }
-
-  // create cores
-
-  for (uint32_t i = 0; i < num_cores; ++i) {  
-    uint32_t core_id = cluster_id * num_cores + i;
-    cores_.at(i) = Core::Create(core_id, 
-                                this, 
-                                arch, 
-                                dcrs, 
-                                sharedmems_.at(i));
-
-    cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
-    icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));      
-
-    for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) {
-      snprintf(sname, 100, "cluster%d-smem_demux%d_%d", cluster_id, i, j);
-      auto smem_demux = SMemDemux::Create(sname);
-      
-      cores_.at(i)->dcache_req_ports.at(j).bind(&smem_demux->ReqIn);
-      smem_demux->RspIn.bind(&cores_.at(i)->dcache_rsp_ports.at(j));        
-      
-      smem_demux->ReqDc.bind(&dcaches_->CoreReqPorts.at(i).at(j));
-      dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDc);
-
-      smem_demux->ReqSm.bind(&sharedmems_.at(i)->Inputs.at(j));
-      sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSm);
-    }
-  }
+  dcache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(1));
+  l2cache_->CoreRspPorts.at(1).bind(&dcache_switch->RspOut.at(0));
 }

 Cluster::~Cluster() {
@@ -153,14 +101,14 @@ void Cluster::tick() {
 }

 void Cluster::attach_ram(RAM* ram) {
-  for (auto core : cores_) {
-    core->attach_ram(ram);
+  for (auto& socket : sockets_) {
+    socket->attach_ram(ram);
  }
 }

 bool Cluster::running() const {
-  for (auto& core : cores_) {
-    if (core->running())
+  for (auto& socket : sockets_) {
+    if (socket->running())
      return true;
  }
  return false;
@@ -169,9 +117,9 @@ bool Cluster::running() const {
 bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
  bool done = true;
  Word exitcode_ = 0;
-  for (auto& core : cores_) {
+  for (auto& socket : sockets_) {
    Word ec;
-    if (core->check_exit(&ec, riscv_test)) {
+    if (socket->check_exit(&ec, riscv_test)) {
      exitcode_ |= ec;
    } else {
      done = false;
@@ -184,36 +132,32 @@ bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
 void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
  auto& barrier = barriers_.at(bar_id);

-  uint32_t local_core_id = core_id % cores_.size();
+  auto sockets_per_cluster = sockets_.size();
+  auto cores_per_socket = cores_per_socket_;
+
+  uint32_t cores_per_cluster = sockets_per_cluster * cores_per_socket;
+  uint32_t local_core_id = core_id % cores_per_cluster;
  barrier.set(local_core_id);

  DP(3, "*** Suspend core #" << core_id << " at barrier #" << bar_id);

  if (barrier.count() == (size_t)count) {
      // resume all suspended cores
-      for (uint32_t i = 0; i < cores_.size(); ++i) {
-        if (barrier.test(i)) {
-          DP(3, "*** Resume core #" << i << " at barrier #" << bar_id);
-          cores_.at(i)->resume();
+      for (uint32_t s = 0; s < sockets_per_cluster; ++s) {
+        for (uint32_t c = 0; c < cores_per_socket; ++c) {
+          uint32_t i = s * cores_per_socket + c;
+          if (barrier.test(i)) {
+            DP(3, "*** Resume core #" << i << " at barrier #" << bar_id);
+            sockets_.at(s)->resume(c);
+          }
        }
      }
      barrier.reset();
    }
 }

-ProcessorImpl* Cluster::processor() const {
-  return processor_;
-}
-
 Cluster::PerfStats Cluster::perf_stats() const {
-  Cluster::PerfStats perf;
-  perf.icache = icaches_->perf_stats();
-  perf.dcache = dcaches_->perf_stats();
-  perf.l2cache = l2cache_->perf_stats();
-
-  for (auto sharedmem : sharedmems_) {
-    perf.sharedmem += sharedmem->perf_stats();
-  }
-  
-  return perf;
+  PerfStats perf_stats;
+  perf_stats.l2cache = l2cache_->perf_stats();
+  return perf_stats;
 }
--- a/sim/simx/cluster.h
+++ b/sim/simx/cluster.h
@@ -19,6 +19,7 @@
 #include "cache_cluster.h"
 #include "shared_mem.h"
 #include "core.h"
+#include "socket.h"
 #include "constants.h"

 namespace vortex {
@@ -28,18 +29,7 @@ class ProcessorImpl;
 class Cluster : public SimObject<Cluster> {
 public:
  struct PerfStats {
-    CacheSim::PerfStats   icache;
-    CacheSim::PerfStats   dcache;
-    SharedMem::PerfStats  sharedmem;
-    CacheSim::PerfStats   l2cache;
-
-    PerfStats& operator+=(const PerfStats& rhs) {
-      this->icache      += rhs.icache;
-      this->dcache      += rhs.dcache;
-      this->sharedmem   += rhs.sharedmem;
-      this->l2cache     += rhs.l2cache;
-      return *this;
-    }
+    CacheSim::PerfStats l2cache;
  };

  SimPort<MemReq> mem_req_port;
@@ -53,6 +43,14 @@ public:

  ~Cluster();

+  uint32_t id() const {
+    return cluster_id_;
+  }
+
+  ProcessorImpl* processor() const {
+    return processor_;
+  }
+
  void reset();

  void tick();
@@ -65,22 +63,15 @@ public:

  void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);

-  ProcessorImpl* processor() const;
-
-  Cluster::PerfStats perf_stats() const;
+  PerfStats perf_stats() const;
  
 private:
-  uint32_t                     cluster_id_;  
-  std::vector<Core::Ptr>       cores_;  
-  std::vector<CoreMask>        barriers_;
-  CacheSim::Ptr                l2cache_;
-  CacheCluster::Ptr            icaches_;
-  CacheCluster::Ptr            dcaches_;
-  std::vector<SharedMem::Ptr>  sharedmems_;
-  CacheCluster::Ptr            tcaches_;
-  CacheCluster::Ptr            ocaches_;
-  CacheCluster::Ptr            rcaches_;
-  ProcessorImpl*               processor_;
+  uint32_t                    cluster_id_;
+  ProcessorImpl*              processor_;
+  std::vector<Socket::Ptr>    sockets_;  
+  std::vector<CoreMask>       barriers_;
+  CacheSim::Ptr               l2cache_;
+  uint32_t                    cores_per_socket_;
 };

 } // namespace vortex
--- a/sim/simx/core.cpp
+++ b/sim/simx/core.cpp
@@ -21,6 +21,7 @@
 #include "mem.h"
 #include "decode.h"
 #include "core.h"
+#include "socket.h"
 #include "debug.h"
 #include "constants.h"
 #include "processor_impl.h"
@@ -29,35 +30,36 @@ using namespace vortex;

 Core::Core(const SimContext& ctx, 
           uint32_t core_id, 
-           Cluster* cluster,
+           Socket* socket,
           const Arch &arch, 
-           const DCRS &dcrs,
-           SharedMem::Ptr  sharedmem)
+           const DCRS &dcrs)
    : SimObject(ctx, "core")
    , icache_req_ports(1, this)
    , icache_rsp_ports(1, this)
    , dcache_req_ports(NUM_LSU_LANES, this)
    , dcache_rsp_ports(NUM_LSU_LANES, this)
    , core_id_(core_id)
+    , socket_(socket)
    , arch_(arch)
    , dcrs_(dcrs)
    , decoder_(arch)
    , warps_(arch.num_warps())
    , barriers_(arch.num_barriers(), 0)
    , fcsrs_(arch.num_warps(), 0)
-    , ibuffers_(ISSUE_WIDTH, IBUF_SIZE)
-    , scoreboard_(arch_) 
+    , ibuffers_(arch.num_warps(), IBUF_SIZE)
+    , scoreboard_(arch_)
    , operands_(ISSUE_WIDTH)
-    , dispatchers_((uint32_t)ExeType::MAX)
-    , exe_units_((uint32_t)ExeType::MAX)
-    , sharedmem_(sharedmem)
+    , dispatchers_((uint32_t)ExeType::ExeTypeCount)
+    , exe_units_((uint32_t)ExeType::ExeTypeCount)
+    , smem_demuxs_(NUM_LSU_LANES)
    , fetch_latch_("fetch")
    , decode_latch_("decode")
    , pending_icache_(arch_.num_warps())
-    , committed_traces_(ISSUE_WIDTH, nullptr)
-    , csrs_(arch.num_warps())
-    , cluster_(cluster)
-{  
+    , csrs_(arch.num_warps())  
+    , commit_arbs_(ISSUE_WIDTH)
+{
+  char sname[100];
+
  for (uint32_t i = 0; i < arch_.num_warps(); ++i) {
    csrs_.at(i).resize(arch.num_threads());
  }
@@ -70,6 +72,28 @@ Core::Core(const SimContext& ctx,
    operands_.at(i) = SimPlatform::instance().create_object<Operand>();
  }

+  // initialize shared memory
+  snprintf(sname, 100, "core%d-shared_mem", core_id);
+  shared_mem_ = SharedMem::Create(sname, SharedMem::Config{
+    (1 << SMEM_LOG_SIZE),
+    sizeof(Word),
+    NUM_LSU_LANES, 
+    NUM_LSU_LANES,
+    false
+  });
+  for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
+    snprintf(sname, 100, "core%d-smem_demux%d", core_id, i);
+    auto smem_demux = SMemDemux::Create(sname);
+    
+    smem_demux->ReqDC.bind(&dcache_req_ports.at(i));
+    dcache_rsp_ports.at(i).bind(&smem_demux->RspDC);
+
+    smem_demux->ReqSM.bind(&shared_mem_->Inputs.at(i));
+    shared_mem_->Outputs.at(i).bind(&smem_demux->RspSM);
+
+    smem_demuxs_.at(i) = smem_demux;
+  }
+
  // initialize dispatchers
  dispatchers_.at((int)ExeType::ALU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_ALU_BLOCKS, NUM_ALU_LANES);
  dispatchers_.at((int)ExeType::FPU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_FPU_BLOCKS, NUM_FPU_LANES);
@@ -82,6 +106,16 @@ Core::Core(const SimContext& ctx,
  exe_units_.at((int)ExeType::LSU) = SimPlatform::instance().create_object<LsuUnit>(this);
  exe_units_.at((int)ExeType::SFU) = SimPlatform::instance().create_object<SfuUnit>(this);

+  // bind commit arbiters
+  for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {    
+    snprintf(sname, 100, "core%d-commit-arb%d", core_id, i);
+    auto arbiter = TraceSwitch::Create(sname, ArbiterType::RoundRobin, (uint32_t)ExeType::ExeTypeCount, 1);
+    for (uint32_t j = 0; j < (uint32_t)ExeType::ExeTypeCount; ++j) {
+      exe_units_.at(j)->Outputs.at(i).bind(&arbiter->Inputs.at(j));
+    }
+    commit_arbs_.at(i) = arbiter;
+  }
+
  this->reset();
 }

@@ -99,8 +133,12 @@ void Core::reset() {
  for (auto& exe_unit : exe_units_) {
    exe_unit->reset();
  }
+ 
+  for (auto& commit_arb : commit_arbs_) {
+    commit_arb->reset();
+  }
  
-  for ( auto& barrier : barriers_) {
+  for (auto& barrier : barriers_) {
    barrier.reset();
  }
  
@@ -112,7 +150,7 @@ void Core::reset() {
    ibuf.clear();
  }

-  commit_exe_= 0;
+  ibuffer_idx_ = 0;

  scoreboard_.clear();
  fetch_latch_.clear();
@@ -150,8 +188,10 @@ void Core::schedule() {
      break;
    }
  }
-  if (scheduled_warp == -1)
+  if (scheduled_warp == -1) {
+    ++perf_stats_.sched_idle;
    return;
+  }

  // suspend warp until decode
  stalled_warps_.set(scheduled_warp);
@@ -192,11 +232,11 @@ void Core::fetch() {
  mem_req.tag   = pending_icache_.allocate(trace);    
  mem_req.cid   = trace->cid;
  mem_req.uuid  = trace->uuid;
-  icache_req_ports.at(0).send(mem_req, 1);    
+  icache_req_ports.at(0).send(mem_req, 2);    
  DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);    
-  fetch_latch_.pop();    
-  ++pending_ifetches_;   
+  fetch_latch_.pop();
  ++perf_stats_.ifetches;
+  ++pending_ifetches_;
 }

 void Core::decode() {
@@ -206,7 +246,7 @@ void Core::decode() {
  auto trace = decode_latch_.front();

  // check ibuffer capacity
-  auto& ibuffer = ibuffers_.at(trace->wid % ISSUE_WIDTH);
+  auto& ibuffer = ibuffers_.at(trace->wid);
  if (ibuffer.full()) {
    if (!trace->log_once(true)) {
      DT(3, "*** ibuffer-stall: " << *trace);
@@ -223,13 +263,6 @@ void Core::decode() {
    stalled_warps_.reset(trace->wid);
  }

-  // update perf counters
-  uint32_t active_threads = trace->tmask.count();
-  if (trace->exe_type == ExeType::LSU && trace->lsu_type == LsuType::LOAD)
-    perf_stats_.loads += active_threads;
-  if (trace->exe_type == ExeType::LSU && trace->lsu_type == LsuType::STORE) 
-    perf_stats_.stores += active_threads;
-
  DT(3, "pipeline-decode: " << *trace);

  // insert to ibuffer 
@@ -239,7 +272,7 @@ void Core::decode() {
 }

 void Core::issue() {   
-  // operands to dispatch
+  // operands to dispatchers
  for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
    auto& operand = operands_.at(i);    
    if (operand->Output.empty())
@@ -257,7 +290,8 @@ void Core::issue() {

  // issue ibuffer instructions
  for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
-    auto& ibuffer = ibuffers_.at(i);
+    uint32_t ii = (ibuffer_idx_ + i) % ibuffers_.size();
+    auto& ibuffer = ibuffers_.at(ii);
    if (ibuffer.empty())
      continue;

@@ -265,17 +299,41 @@ void Core::issue() {

    // check scoreboard
    if (scoreboard_.in_use(trace)) {
+      auto uses = scoreboard_.get_uses(trace);
      if (!trace->log_once(true)) {
-        DTH(3, "*** scoreboard-stall: dependents={");
-        auto uses = scoreboard_.get_uses(trace);
+        DTH(3, "*** scoreboard-stall: dependents={");        
        for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
          auto& use = uses.at(j);
          __unused (use);
          if (j) DTN(3, ", ");
-          DTN(3, use.type << use.reg << "(#" << use.owner << ")");
+          DTN(3, use.reg_type << use.reg_id << "(#" << use.uuid << ")");
        }
        DTN(3, "}, " << *trace << std::endl);
      }
+      for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
+        auto& use = uses.at(j);
+        switch (use.exe_type) {        
+        case ExeType::ALU: ++perf_stats_.scrb_alu; break;
+        case ExeType::FPU: ++perf_stats_.scrb_fpu; break;
+        case ExeType::LSU: ++perf_stats_.scrb_lsu; break;        
+        case ExeType::SFU: {
+          ++perf_stats_.scrb_sfu;
+          switch (use.sfu_type) {
+          case SfuType::TMC:
+          case SfuType::WSPAWN:
+          case SfuType::SPLIT:
+          case SfuType::JOIN:
+          case SfuType::BAR:
+          case SfuType::PRED: ++perf_stats_.scrb_wctl; break;
+          case SfuType::CSRRW:
+          case SfuType::CSRRS:
+          case SfuType::CSRRC: ++perf_stats_.scrb_csrs; break;
+          default: assert(false);
+          }
+        } break;
+        default: assert(false);
+        }        
+      }
      ++perf_stats_.scrb_stalls;
      continue;
    } else {
@@ -294,10 +352,11 @@ void Core::issue() {

    ibuffer.pop();
  }
+  ibuffer_idx_ += ISSUE_WIDTH;
 }

 void Core::execute() {
-  for (uint32_t i = 0; i < (uint32_t)ExeType::MAX; ++i) {
+  for (uint32_t i = 0; i < (uint32_t)ExeType::ExeTypeCount; ++i) {
    auto& dispatch = dispatchers_.at(i);
    auto& exe_unit = exe_units_.at(i);
    for (uint32_t j = 0; j < ISSUE_WIDTH; ++j) {
@@ -313,10 +372,10 @@ void Core::execute() {
 void Core::commit() {
  // process completed instructions 
  for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
-    auto trace = committed_traces_.at(i);
-    if (!trace)
+    auto& commit_arb = commit_arbs_.at(i);
+    if (commit_arb->Outputs.at(0).empty())
      continue;
-    committed_traces_.at(i) = nullptr;
+    auto trace = commit_arb->Outputs.at(0).front();

    // advance to commit stage
    DT(3, "pipeline-commit: " << *trace);
@@ -334,27 +393,11 @@ void Core::commit() {
      perf_stats_.instrs += trace->tmask.count();
    }

+    commit_arb->Outputs.at(0).pop();
+
    // delete the trace
    delete trace;
  }
- 
-  // select completed instructions
- for (uint32_t i = 0; i < (uint32_t)ExeType::MAX; ++i) {
-    uint32_t ii = (commit_exe_ + i) % (uint32_t)ExeType::MAX;
-    auto& exe_unit = exe_units_.at(ii);
-    for (uint32_t j = 0; j < ISSUE_WIDTH; ++j) {
-      auto committed_trace = committed_traces_.at(j); 
-      if (committed_trace)
-        continue;
-      auto& output = exe_unit->Outputs.at(j);
-      if (output.empty())
-        continue;
-      auto trace = output.front();
-      committed_traces_.at(j) = trace;
-      output.pop();
-    }
-  }
-  ++commit_exe_;
 }

 void Core::wspawn(uint32_t num_warps, Word nextPC) {
@@ -379,7 +422,7 @@ void Core::barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id) {
  if (is_global) {
    // global barrier handling
    if (barrier.count() == active_warps_.count()) {
-      cluster_->barrier(bar_idx, count, core_id_);
+      socket_->barrier(bar_idx, count, core_id_);
      barrier.reset();
    }    
  } else {
@@ -416,7 +459,7 @@ AddrType Core::get_addr_type(uint64_t addr) {
 void Core::dcache_read(void *data, uint64_t addr, uint32_t size) {  
  auto type = this->get_addr_type(addr);
  if (type == AddrType::Shared) {
-    sharedmem_->read(data, addr, size);
+    shared_mem_->read(data, addr, size);
  } else {  
    mmu_.read(data, addr, size, 0);
  }
@@ -431,7 +474,7 @@ void Core::dcache_write(const void* data, uint64_t addr, uint32_t size) {
     this->writeToStdOut(data, addr, size);
  } else {
    if (type == AddrType::Shared) {
-      sharedmem_->write(data, addr, size);
+      shared_mem_->write(data, addr, size);
    } else {
      mmu_.write(data, addr, size, 0);
    }
@@ -533,71 +576,76 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
        break;    
      case VX_DCR_MPM_CLASS_CORE: {
        switch (addr) {
+        case VX_CSR_MPM_SCHED_ID:  return perf_stats_.sched_idle & 0xffffffff; 
+        case VX_CSR_MPM_SCHED_ID_H:return perf_stats_.sched_idle >> 32;
+        case VX_CSR_MPM_SCHED_ST:  return perf_stats_.sched_stalls & 0xffffffff; 
+        case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32;
        case VX_CSR_MPM_IBUF_ST:   return perf_stats_.ibuf_stalls & 0xffffffff; 
        case VX_CSR_MPM_IBUF_ST_H: return perf_stats_.ibuf_stalls >> 32; 
-        case VX_CSR_MPM_SCRB_ST:   return perf_stats_.scrb_stalls & 0xffffffff; 
-        case VX_CSR_MPM_SCRB_ST_H: return perf_stats_.scrb_stalls >> 32; 
-        case VX_CSR_MPM_ALU_ST:    return perf_stats_.alu_stalls & 0xffffffff; 
-        case VX_CSR_MPM_ALU_ST_H:  return perf_stats_.alu_stalls >> 32; 
-        case VX_CSR_MPM_LSU_ST:    return perf_stats_.lsu_stalls & 0xffffffff; 
-        case VX_CSR_MPM_LSU_ST_H:  return perf_stats_.lsu_stalls >> 32;
-        case VX_CSR_MPM_FPU_ST:    return perf_stats_.fpu_stalls & 0xffffffff; 
-        case VX_CSR_MPM_FPU_ST_H:  return perf_stats_.fpu_stalls >> 32; 
-        case VX_CSR_MPM_SFU_ST:    return perf_stats_.sfu_stalls & 0xffffffff; 
-        case VX_CSR_MPM_SFU_ST_H:  return perf_stats_.sfu_stalls >> 32; 
-        
+        case VX_CSR_MPM_SCRB_ST:   return perf_stats_.scrb_stalls & 0xffffffff;
+        case VX_CSR_MPM_SCRB_ST_H: return perf_stats_.scrb_stalls >> 32;
+        case VX_CSR_MPM_SCRB_ALU:  return perf_stats_.scrb_alu & 0xffffffff;
+        case VX_CSR_MPM_SCRB_ALU_H:return perf_stats_.scrb_alu >> 32;
+        case VX_CSR_MPM_SCRB_FPU:  return perf_stats_.scrb_fpu & 0xffffffff;
+        case VX_CSR_MPM_SCRB_FPU_H:return perf_stats_.scrb_fpu >> 32;
+        case VX_CSR_MPM_SCRB_LSU:  return perf_stats_.scrb_lsu & 0xffffffff;
+        case VX_CSR_MPM_SCRB_LSU_H:return perf_stats_.scrb_lsu >> 32;
+        case VX_CSR_MPM_SCRB_SFU:  return perf_stats_.scrb_sfu & 0xffffffff;
+        case VX_CSR_MPM_SCRB_SFU_H:return perf_stats_.scrb_sfu >> 32;
+        case VX_CSR_MPM_SCRB_WCTL: return perf_stats_.scrb_wctl & 0xffffffff;
+        case VX_CSR_MPM_SCRB_WCTL_H: return perf_stats_.scrb_wctl >> 32;
+        case VX_CSR_MPM_SCRB_CSRS: return perf_stats_.scrb_csrs & 0xffffffff;
+        case VX_CSR_MPM_SCRB_CSRS_H: return perf_stats_.scrb_csrs >> 32;
        case VX_CSR_MPM_IFETCHES:  return perf_stats_.ifetches & 0xffffffff; 
        case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32; 
        case VX_CSR_MPM_LOADS:     return perf_stats_.loads & 0xffffffff; 
        case VX_CSR_MPM_LOADS_H:   return perf_stats_.loads >> 32; 
        case VX_CSR_MPM_STORES:    return perf_stats_.stores & 0xffffffff; 
        case VX_CSR_MPM_STORES_H:  return perf_stats_.stores >> 32;
-        case VX_CSR_MPM_IFETCH_LAT: return perf_stats_.ifetch_latency & 0xffffffff; 
-        case VX_CSR_MPM_IFETCH_LAT_H: return perf_stats_.ifetch_latency >> 32; 
-        case VX_CSR_MPM_LOAD_LAT:  return perf_stats_.load_latency & 0xffffffff; 
-        case VX_CSR_MPM_LOAD_LAT_H: return perf_stats_.load_latency >> 32;
+        case VX_CSR_MPM_IFETCH_LT: return perf_stats_.ifetch_latency & 0xffffffff; 
+        case VX_CSR_MPM_IFETCH_LT_H: return perf_stats_.ifetch_latency >> 32; 
+        case VX_CSR_MPM_LOAD_LT:   return perf_stats_.load_latency & 0xffffffff; 
+        case VX_CSR_MPM_LOAD_LT_H: return perf_stats_.load_latency >> 32;
       }
      } break; 
      case VX_DCR_MPM_CLASS_MEM: {
-        auto proc_perf = cluster_->processor()->perf_stats();
+        auto proc_perf = socket_->cluster()->processor()->perf_stats();
+        auto cluster_perf = socket_->cluster()->perf_stats();
+        auto socket_perf = socket_->perf_stats();
+        auto smem_perf = shared_mem_->perf_stats();
        switch (addr) {
-        case VX_CSR_MPM_ICACHE_READS:    return proc_perf.clusters.icache.reads & 0xffffffff; 
-        case VX_CSR_MPM_ICACHE_READS_H:  return proc_perf.clusters.icache.reads >> 32; 
-        case VX_CSR_MPM_ICACHE_MISS_R:   return proc_perf.clusters.icache.read_misses & 0xffffffff;
-        case VX_CSR_MPM_ICACHE_MISS_R_H: return proc_perf.clusters.icache.read_misses >> 32;
+        case VX_CSR_MPM_ICACHE_READS:     return socket_perf.icache.reads & 0xffffffff; 
+        case VX_CSR_MPM_ICACHE_READS_H:   return socket_perf.icache.reads >> 32; 
+        case VX_CSR_MPM_ICACHE_MISS_R:    return socket_perf.icache.read_misses & 0xffffffff;
+        case VX_CSR_MPM_ICACHE_MISS_R_H:  return socket_perf.icache.read_misses >> 32;
+        case VX_CSR_MPM_ICACHE_MSHR_ST:   return socket_perf.icache.mshr_stalls & 0xffffffff; 
+        case VX_CSR_MPM_ICACHE_MSHR_ST_H: return socket_perf.icache.mshr_stalls >> 32;
        
-        case VX_CSR_MPM_DCACHE_READS:    return proc_perf.clusters.dcache.reads & 0xffffffff; 
-        case VX_CSR_MPM_DCACHE_READS_H:  return proc_perf.clusters.dcache.reads >> 32; 
-        case VX_CSR_MPM_DCACHE_WRITES:   return proc_perf.clusters.dcache.writes & 0xffffffff; 
-        case VX_CSR_MPM_DCACHE_WRITES_H: return proc_perf.clusters.dcache.writes >> 32; 
-        case VX_CSR_MPM_DCACHE_MISS_R:   return proc_perf.clusters.dcache.read_misses & 0xffffffff; 
-        case VX_CSR_MPM_DCACHE_MISS_R_H: return proc_perf.clusters.dcache.read_misses >> 32; 
-        case VX_CSR_MPM_DCACHE_MISS_W:   return proc_perf.clusters.dcache.write_misses & 0xffffffff; 
-        case VX_CSR_MPM_DCACHE_MISS_W_H: return proc_perf.clusters.dcache.write_misses >> 32; 
-        case VX_CSR_MPM_DCACHE_BANK_ST:  return proc_perf.clusters.dcache.bank_stalls & 0xffffffff; 
-        case VX_CSR_MPM_DCACHE_BANK_ST_H:return proc_perf.clusters.dcache.bank_stalls >> 32;
-        case VX_CSR_MPM_DCACHE_MSHR_ST:  return proc_perf.clusters.dcache.mshr_stalls & 0xffffffff; 
-        case VX_CSR_MPM_DCACHE_MSHR_ST_H:return proc_perf.clusters.dcache.mshr_stalls >> 32;
-        
-        case VX_CSR_MPM_SMEM_READS:    return proc_perf.clusters.sharedmem.reads & 0xffffffff;
-        case VX_CSR_MPM_SMEM_READS_H:  return proc_perf.clusters.sharedmem.reads >> 32;
-        case VX_CSR_MPM_SMEM_WRITES:   return proc_perf.clusters.sharedmem.writes & 0xffffffff;
-        case VX_CSR_MPM_SMEM_WRITES_H: return proc_perf.clusters.sharedmem.writes >> 32;
-        case VX_CSR_MPM_SMEM_BANK_ST:  return proc_perf.clusters.sharedmem.bank_stalls & 0xffffffff; 
-        case VX_CSR_MPM_SMEM_BANK_ST_H:return proc_perf.clusters.sharedmem.bank_stalls >> 32; 
+        case VX_CSR_MPM_DCACHE_READS:     return socket_perf.dcache.reads & 0xffffffff; 
+        case VX_CSR_MPM_DCACHE_READS_H:   return socket_perf.dcache.reads >> 32; 
+        case VX_CSR_MPM_DCACHE_WRITES:    return socket_perf.dcache.writes & 0xffffffff; 
+        case VX_CSR_MPM_DCACHE_WRITES_H:  return socket_perf.dcache.writes >> 32; 
+        case VX_CSR_MPM_DCACHE_MISS_R:    return socket_perf.dcache.read_misses & 0xffffffff; 
+        case VX_CSR_MPM_DCACHE_MISS_R_H:  return socket_perf.dcache.read_misses >> 32; 
+        case VX_CSR_MPM_DCACHE_MISS_W:    return socket_perf.dcache.write_misses & 0xffffffff; 
+        case VX_CSR_MPM_DCACHE_MISS_W_H:  return socket_perf.dcache.write_misses >> 32; 
+        case VX_CSR_MPM_DCACHE_BANK_ST:   return socket_perf.dcache.bank_stalls & 0xffffffff; 
+        case VX_CSR_MPM_DCACHE_BANK_ST_H: return socket_perf.dcache.bank_stalls >> 32;
+        case VX_CSR_MPM_DCACHE_MSHR_ST:   return socket_perf.dcache.mshr_stalls & 0xffffffff; 
+        case VX_CSR_MPM_DCACHE_MSHR_ST_H: return socket_perf.dcache.mshr_stalls >> 32;

-        case VX_CSR_MPM_L2CACHE_READS:    return proc_perf.clusters.l2cache.reads & 0xffffffff; 
-        case VX_CSR_MPM_L2CACHE_READS_H:  return proc_perf.clusters.l2cache.reads >> 32; 
-        case VX_CSR_MPM_L2CACHE_WRITES:   return proc_perf.clusters.l2cache.writes & 0xffffffff; 
-        case VX_CSR_MPM_L2CACHE_WRITES_H: return proc_perf.clusters.l2cache.writes >> 32; 
-        case VX_CSR_MPM_L2CACHE_MISS_R:   return proc_perf.clusters.l2cache.read_misses & 0xffffffff; 
-        case VX_CSR_MPM_L2CACHE_MISS_R_H: return proc_perf.clusters.l2cache.read_misses >> 32; 
-        case VX_CSR_MPM_L2CACHE_MISS_W:   return proc_perf.clusters.l2cache.write_misses & 0xffffffff; 
-        case VX_CSR_MPM_L2CACHE_MISS_W_H: return proc_perf.clusters.l2cache.write_misses >> 32; 
-        case VX_CSR_MPM_L2CACHE_BANK_ST:  return proc_perf.clusters.l2cache.bank_stalls & 0xffffffff; 
-        case VX_CSR_MPM_L2CACHE_BANK_ST_H:return proc_perf.clusters.l2cache.bank_stalls >> 32;
-        case VX_CSR_MPM_L2CACHE_MSHR_ST:  return proc_perf.clusters.l2cache.mshr_stalls & 0xffffffff; 
-        case VX_CSR_MPM_L2CACHE_MSHR_ST_H:return proc_perf.clusters.l2cache.mshr_stalls >> 32;
+        case VX_CSR_MPM_L2CACHE_READS:    return cluster_perf.l2cache.reads & 0xffffffff; 
+        case VX_CSR_MPM_L2CACHE_READS_H:  return cluster_perf.l2cache.reads >> 32; 
+        case VX_CSR_MPM_L2CACHE_WRITES:   return cluster_perf.l2cache.writes & 0xffffffff; 
+        case VX_CSR_MPM_L2CACHE_WRITES_H: return cluster_perf.l2cache.writes >> 32; 
+        case VX_CSR_MPM_L2CACHE_MISS_R:   return cluster_perf.l2cache.read_misses & 0xffffffff; 
+        case VX_CSR_MPM_L2CACHE_MISS_R_H: return cluster_perf.l2cache.read_misses >> 32; 
+        case VX_CSR_MPM_L2CACHE_MISS_W:   return cluster_perf.l2cache.write_misses & 0xffffffff; 
+        case VX_CSR_MPM_L2CACHE_MISS_W_H: return cluster_perf.l2cache.write_misses >> 32; 
+        case VX_CSR_MPM_L2CACHE_BANK_ST:  return cluster_perf.l2cache.bank_stalls & 0xffffffff; 
+        case VX_CSR_MPM_L2CACHE_BANK_ST_H:return cluster_perf.l2cache.bank_stalls >> 32;
+        case VX_CSR_MPM_L2CACHE_MSHR_ST:  return cluster_perf.l2cache.mshr_stalls & 0xffffffff; 
+        case VX_CSR_MPM_L2CACHE_MSHR_ST_H:return cluster_perf.l2cache.mshr_stalls >> 32;

        case VX_CSR_MPM_L3CACHE_READS:    return proc_perf.l3cache.reads & 0xffffffff; 
        case VX_CSR_MPM_L3CACHE_READS_H:  return proc_perf.l3cache.reads >> 32; 
@@ -612,14 +660,25 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
        case VX_CSR_MPM_L3CACHE_MSHR_ST:  return proc_perf.l3cache.mshr_stalls & 0xffffffff; 
        case VX_CSR_MPM_L3CACHE_MSHR_ST_H:return proc_perf.l3cache.mshr_stalls >> 32;

-        case VX_CSR_MPM_MEM_READS:   return proc_perf.mem_reads & 0xffffffff; 
-        case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32; 
-        case VX_CSR_MPM_MEM_WRITES:  return proc_perf.mem_writes & 0xffffffff; 
-        case VX_CSR_MPM_MEM_WRITES_H:return proc_perf.mem_writes >> 32; 
-        case VX_CSR_MPM_MEM_LAT:     return proc_perf.mem_latency & 0xffffffff; 
-        case VX_CSR_MPM_MEM_LAT_H:   return proc_perf.mem_latency >> 32;
+        case VX_CSR_MPM_MEM_READS:        return proc_perf.mem_reads & 0xffffffff; 
+        case VX_CSR_MPM_MEM_READS_H:      return proc_perf.mem_reads >> 32;
+        case VX_CSR_MPM_MEM_WRITES:       return proc_perf.mem_writes & 0xffffffff; 
+        case VX_CSR_MPM_MEM_WRITES_H:     return proc_perf.mem_writes >> 32; 
+        case VX_CSR_MPM_MEM_LT:           return proc_perf.mem_latency & 0xffffffff; 
+        case VX_CSR_MPM_MEM_LT_H :        return proc_perf.mem_latency >> 32;
+         
+        case VX_CSR_MPM_SMEM_READS:       return smem_perf.reads & 0xffffffff;
+        case VX_CSR_MPM_SMEM_READS_H:     return smem_perf.reads >> 32;
+        case VX_CSR_MPM_SMEM_WRITES:      return smem_perf.writes & 0xffffffff;
+        case VX_CSR_MPM_SMEM_WRITES_H:    return smem_perf.writes >> 32;
+        case VX_CSR_MPM_SMEM_BANK_ST:     return smem_perf.bank_stalls & 0xffffffff; 
+        case VX_CSR_MPM_SMEM_BANK_ST_H:   return smem_perf.bank_stalls >> 32; 
        }
      } break;
+      default: {
+        std::cout << std::dec << "Error: invalid MPM CLASS: value=" << perf_class << std::endl;
+        std::abort();
+      } break;
      }
    } else {
      std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl;
--- a/sim/simx/core.h
+++ b/sim/simx/core.h
@@ -22,11 +22,11 @@
 #include <memory>
 #include <set>
 #include <simobject.h>
+#include <mem.h>
 #include "debug.h"
 #include "types.h"
 #include "arch.h"
 #include "decode.h"
-#include "mem.h"
 #include "warp.h"
 #include "pipeline.h"
 #include "cache_sim.h"
@@ -40,19 +40,25 @@

 namespace vortex {

-class Cluster;
+class Socket;
+
+using TraceSwitch = Mux<pipeline_trace_t*>;

 class Core : public SimObject<Core> {
 public:
  struct PerfStats {
    uint64_t cycles;
    uint64_t instrs;
+    uint64_t sched_idle;
+    uint64_t sched_stalls;
    uint64_t ibuf_stalls;
    uint64_t scrb_stalls;
-    uint64_t alu_stalls;
-    uint64_t lsu_stalls;
-    uint64_t fpu_stalls;
-    uint64_t sfu_stalls;
+    uint64_t scrb_alu;
+    uint64_t scrb_fpu;
+    uint64_t scrb_lsu;
+    uint64_t scrb_sfu;
+    uint64_t scrb_wctl;
+    uint64_t scrb_csrs;
    uint64_t ifetches;
    uint64_t loads;
    uint64_t stores;
@@ -62,12 +68,16 @@ public:
    PerfStats() 
      : cycles(0)
      , instrs(0)
+      , sched_idle(0)
+      , sched_stalls(0)
      , ibuf_stalls(0)
      , scrb_stalls(0)
-      , alu_stalls(0)
-      , lsu_stalls(0)
-      , fpu_stalls(0)
-      , sfu_stalls(0)
+      , scrb_alu(0)
+      , scrb_fpu(0)
+      , scrb_lsu(0)
+      , scrb_sfu(0)
+      , scrb_wctl(0)
+      , scrb_csrs(0)
      , ifetches(0)
      , loads(0)
      , stores(0)
@@ -84,10 +94,9 @@ public:

  Core(const SimContext& ctx, 
       uint32_t core_id, 
-       Cluster* cluster,
+       Socket* socket,
       const Arch &arch, 
-       const DCRS &dcrs,
-       SharedMem::Ptr  sharedmem);
+       const DCRS &dcrs);

  ~Core();

@@ -105,6 +114,10 @@ public:
    return core_id_;
  }

+  Socket* socket() const {
+    return socket_;
+  }
+
  const Arch& arch() const {
    return arch_;
  }
@@ -153,6 +166,7 @@ private:
  void cout_flush();

  uint32_t core_id_;
+  Socket* socket_;
  const Arch& arch_;
  const DCRS &dcrs_;
  
@@ -167,13 +181,13 @@ private:
  std::vector<Operand::Ptr> operands_;
  std::vector<Dispatcher::Ptr> dispatchers_;
  std::vector<ExeUnit::Ptr> exe_units_;
-  SharedMem::Ptr sharedmem_;
+  SharedMem::Ptr shared_mem_;
+  std::vector<SMemDemux::Ptr> smem_demuxs_;

  PipelineLatch fetch_latch_;
  PipelineLatch decode_latch_;
  
  HashTable<pipeline_trace_t*> pending_icache_;
-  std::vector<pipeline_trace_t*> committed_traces_;
  WarpMask active_warps_;
  WarpMask stalled_warps_;
  uint64_t issued_instrs_;
@@ -188,9 +202,10 @@ private:
  
  PerfStats perf_stats_;
  
-  Cluster* cluster_;
+  std::vector<TraceSwitch::Ptr> commit_arbs_;

  uint32_t commit_exe_;
+  uint32_t ibuffer_idx_;

  friend class Warp;
  friend class LsuUnit;
--- a/sim/simx/dispatcher.h
+++ b/sim/simx/dispatcher.h
@@ -66,6 +66,7 @@ public:
            }
            auto& output = Outputs.at(i);
            auto trace = input.front();
+            auto new_trace = trace;
            if (pid_count_ != 1) {
                auto start_p = start_p_.at(b);
                if (start_p == -1) {
@@ -81,33 +82,30 @@ public:
                    end = j;
                }                
                start /= num_lanes_;
-                end /= num_lanes_;
-                auto new_trace = new pipeline_trace_t(*trace);
-                new_trace->tmask.reset();
-                for (int j = start * num_lanes_, n = j + num_lanes_; j < n; ++j) {
-                    new_trace->tmask[j] = trace->tmask[j];
-                }                
-                new_trace->pid = start;
-                new_trace->sop = (start_p == 0);
-                if (start == end) {
-                    new_trace->eop = 1;
+                end /= num_lanes_;                
+                if (start != end) {
+                    new_trace = new pipeline_trace_t(*trace);
+                    new_trace->eop = false;
+                    start_p_.at(b) = start + 1;
+                } else {
                    start_p_.at(b) = -1;
                    input.pop();
                    ++block_sent;
-                    delete trace;
-                } else {
-                    new_trace->eop = 0;
-                    start_p_.at(b) = start + 1;
-                }                
-                output.send(new_trace, 1);
-                DT(3, "pipeline-dispatch: " << *new_trace);
+                }
+                new_trace->pid = start;
+                new_trace->sop = (0 == start_p);
+                ThreadMask tmask;
+                for (int j = start * num_lanes_, n = j + num_lanes_; j < n; ++j) {
+                    tmask[j] = trace->tmask[j];
+                }
+                new_trace->tmask = tmask;                
            } else {
-                trace->pid = 0;
+                new_trace->pid = 0;
                input.pop();
-                output.send(trace, 1);
-                DT(3, "pipeline-dispatch: " << *trace);
                ++block_sent;
-            }            
+            }
+            DT(3, "pipeline-dispatch: " << *new_trace);
+            output.send(new_trace, 1);
        }
        if (block_sent == block_size_) {
            batch_idx_ = (batch_idx_ + 1) % batch_count_;
@@ -138,4 +136,4 @@ private:
    std::vector<int> start_p_;
 };

-}
+}
--- a/sim/simx/exe_unit.cpp
+++ b/sim/simx/exe_unit.cpp
@@ -51,8 +51,7 @@ void AluUnit::tick() {
            assert(core_->stalled_warps_.test(trace->wid));
            core_->stalled_warps_.reset(trace->wid);
        }
-        auto time = input.pop();
-        core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time);
+        input.pop();
    }
 }

@@ -87,8 +86,7 @@ void FpuUnit::tick() {
            std::abort();
        }    
        DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace);
-        auto time = input.pop();
-        core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
+        input.pop();
    }
 }

@@ -114,7 +112,7 @@ void LsuUnit::tick() {

    // handle dcache response    
    for (uint32_t t = 0; t < num_lanes_; ++t) {
-        auto& dcache_rsp_port = core_->dcache_rsp_ports.at(t);
+        auto& dcache_rsp_port = core_->smem_demuxs_.at(t)->RspIn;
        if (dcache_rsp_port.empty())
            continue;
        auto& mem_rsp = dcache_rsp_port.front();
@@ -136,7 +134,7 @@ void LsuUnit::tick() {

    // handle shared memory response
    for (uint32_t t = 0; t < num_lanes_; ++t) {
-        auto& smem_rsp_port = core_->sharedmem_->Outputs.at(t);
+        auto& smem_rsp_port = core_->shared_mem_->Outputs.at(t);
        if (smem_rsp_port.empty())
            continue;
        auto& mem_rsp = smem_rsp_port.front();
@@ -184,8 +182,7 @@ void LsuUnit::tick() {
            fence_lock_ = true;        
            DT(3, "fence-lock: " << *trace);
            // remove input
-            auto time = input.pop(); 
-            core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
+            input.pop(); 
            break;
        }

@@ -213,7 +210,9 @@ void LsuUnit::tick() {
                auto mem_addr = trace_data->mem_addrs.at(t).addr & ~addr_mask;
                matches += (addr0 == mem_addr);
            }
+        #ifdef LSU_DUP_ENABLE
            is_dup = (matches == trace->tmask.count());
+        #endif
        }

        uint32_t addr_count;
@@ -229,7 +228,7 @@ void LsuUnit::tick() {
            if (!trace->tmask.test(t0 + t))
                continue;
            
-            auto& dcache_req_port = core_->dcache_req_ports.at(t);
+            auto& dcache_req_port = core_->smem_demuxs_.at(t)->ReqIn;
            auto mem_addr = trace_data->mem_addrs.at(t);
            auto type = core_->get_addr_type(mem_addr.addr);

@@ -241,12 +240,16 @@ void LsuUnit::tick() {
            mem_req.cid   = trace->cid;
            mem_req.uuid  = trace->uuid;        
                
-            dcache_req_port.send(mem_req, 2);
+            dcache_req_port.send(mem_req, 1);
            DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag 
                << ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.type << ", " << *trace);

-            ++pending_loads_;
-            ++core_->perf_stats_.loads;        
+            if (is_write) {
+                ++core_->perf_stats_.stores;
+            } else {                
+                ++core_->perf_stats_.loads;
+                ++pending_loads_;
+            }
            if (is_dup)
                break;
        }
@@ -254,13 +257,11 @@ void LsuUnit::tick() {
        // do not wait on writes
        if (is_write) {
            pending_rd_reqs_.release(tag);
-            output.send(trace, 1);
-            ++core_->perf_stats_.stores;
+            output.send(trace, 1);            
        }

        // remove input
-        auto time = input.pop();
-        core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
+        input.pop();

        break; // single block
    }
@@ -318,10 +319,7 @@ void SfuUnit::tick() {
            core_->stalled_warps_.reset(trace->wid);
        }

-        auto time = input.pop();
-        auto stalls = (SimPlatform::instance().cycles() - time);
-
-        core_->perf_stats_.sfu_stalls += stalls;
+        input.pop();

        break; // single block
    }
--- a/sim/simx/main.cpp
+++ b/sim/simx/main.cpp
@@ -34,14 +34,13 @@ static void show_usage() {
 uint32_t num_threads = NUM_THREADS;
 uint32_t num_warps = NUM_WARPS;
 uint32_t num_cores = NUM_CORES;
-uint32_t num_clusters = NUM_CLUSTERS;
 bool showStats = false;;
 bool riscv_test = false;
 const char* program = nullptr;

 static void parse_args(int argc, char **argv) {
  	int c;
-  	while ((c = getopt(argc, argv, "t:w:c:g:rsh?")) != -1) {
+  	while ((c = getopt(argc, argv, "t:w:c:rsh?")) != -1) {
    	switch (c) {
      case 't':
        num_threads = atoi(optarg);
@@ -51,9 +50,6 @@ static void parse_args(int argc, char **argv) {
        break;
 		  case 'c':
        num_cores = atoi(optarg);
-        break;
-		  case 'g':
-        num_clusters = atoi(optarg);
        break;
      case 'r':
        riscv_test = true;
@@ -88,7 +84,7 @@ int main(int argc, char **argv) {

  {
    // create processor configuation
-    Arch arch(num_threads, num_warps, num_cores, num_clusters);
+    Arch arch(num_threads, num_warps, num_cores);

    // create memory module
    RAM ram(RAM_PAGE_SIZE);
--- a/sim/simx/processor.cpp
+++ b/sim/simx/processor.cpp
@@ -32,18 +32,17 @@ ProcessorImpl::ProcessorImpl(const Arch& arch)
  l3cache_ = CacheSim::Create("l3cache", CacheSim::Config{
    !L3_ENABLED,
    log2ceil(L3_CACHE_SIZE),  // C
-    log2ceil(MEM_BLOCK_SIZE), // B
-    log2ceil(L3_NUM_WAYS),  // W
-    0,                      // A
-    XLEN,                   // address bits  
-    L3_NUM_BANKS,           // number of banks
-    1,                      // number of ports
+    log2ceil(MEM_BLOCK_SIZE), // L
+    log2ceil(L3_NUM_WAYS),    // W
+    0,                        // A
+    log2ceil(L3_NUM_BANKS),   // B
+    XLEN,                     // address bits      
+    1,                        // number of ports
    uint8_t(arch.num_clusters()), // request size 
-    true,                   // write-through
-    false,                  // write response
-    0,                      // victim size
-    L3_MSHR_SIZE,           // mshr
-    2,                      // pipeline latency
+    true,                     // write-through
+    false,                    // write response
+    L3_MSHR_SIZE,             // mshr
+    2,                        // pipeline latency
    }
  );        
  
@@ -114,6 +113,7 @@ void ProcessorImpl::reset() {
  perf_mem_writes_ = 0;
  perf_mem_latency_ = 0;
  perf_mem_pending_reads_ = 0;
+  
 }

 void ProcessorImpl::write_dcr(uint32_t addr, uint32_t value) {
@@ -126,9 +126,6 @@ ProcessorImpl::PerfStats ProcessorImpl::perf_stats() const {
  perf.mem_writes  = perf_mem_writes_;
  perf.mem_latency = perf_mem_latency_;
  perf.l3cache     = l3cache_->perf_stats();
-  for (auto cluster : clusters_) {
-    perf.clusters += cluster->perf_stats();
-  }   
  return perf;
 }

--- a/sim/simx/processor_impl.h
+++ b/sim/simx/processor_impl.h
@@ -24,17 +24,10 @@ namespace vortex {
 class ProcessorImpl {
 public:
  struct PerfStats {
+    CacheSim::PerfStats l3cache;
    uint64_t mem_reads;
    uint64_t mem_writes;
    uint64_t mem_latency;
-    CacheSim::PerfStats l3cache;
-    Cluster::PerfStats clusters;
-
-    PerfStats()
-      : mem_reads(0)
-      , mem_writes(0)
-      , mem_latency(0)
-    {}
  };

  ProcessorImpl(const Arch& arch);
@@ -46,7 +39,7 @@ public:

  void write_dcr(uint32_t addr, uint32_t value);

-  ProcessorImpl::PerfStats perf_stats() const;
+  PerfStats perf_stats() const;

 private:
 
@@ -55,7 +48,7 @@ private:
  const Arch& arch_;
  std::vector<std::shared_ptr<Cluster>> clusters_;
  DCRS dcrs_;
-  MemSim::Ptr   memsim_;
+  MemSim::Ptr memsim_;
  CacheSim::Ptr l3cache_;
  uint64_t perf_mem_reads_;
  uint64_t perf_mem_writes_;
--- a/sim/simx/scoreboard.h
+++ b/sim/simx/scoreboard.h
@@ -22,9 +22,11 @@ class Scoreboard {
 public:

    struct reg_use_t {
-        RegType  type;
-        uint32_t reg;        
-        uint64_t owner;
+        RegType  reg_type;
+        uint32_t reg_id; 
+        ExeType  exe_type;
+        SfuType  sfu_type;        
+        uint64_t uuid;
    };
        
    Scoreboard(const Arch &arch) 
@@ -44,89 +46,81 @@ public:
        owners_.clear();
    }

-    bool in_use(pipeline_trace_t* state) const {
-        return (state->used_iregs & in_use_iregs_.at(state->wid)) != 0 
-            || (state->used_fregs & in_use_fregs_.at(state->wid)) != 0
-            || (state->used_vregs & in_use_vregs_.at(state->wid)) != 0;
+    bool in_use(pipeline_trace_t* trace) const {
+        return (trace->used_iregs & in_use_iregs_.at(trace->wid)) != 0 
+            || (trace->used_fregs & in_use_fregs_.at(trace->wid)) != 0
+            || (trace->used_vregs & in_use_vregs_.at(trace->wid)) != 0;
    }

-    std::vector<reg_use_t> get_uses(pipeline_trace_t* state) const {
-        std::vector<reg_use_t> out;        
-        {
-            uint32_t r = 0;
-            auto used_iregs = state->used_iregs & in_use_iregs_.at(state->wid);        
-            while (used_iregs.any()) {
-                if (used_iregs.test(0)) {
-                    uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Integer;
-                    out.push_back({RegType::Integer, r, owners_.at(tag)});
-                }
-                used_iregs >>= 1;
-                ++r;
+    std::vector<reg_use_t> get_uses(pipeline_trace_t* trace) const {
+        std::vector<reg_use_t> out;  
+        
+        auto used_iregs = trace->used_iregs & in_use_iregs_.at(trace->wid);
+        auto used_fregs = trace->used_fregs & in_use_fregs_.at(trace->wid);
+        auto used_vregs = trace->used_vregs & in_use_vregs_.at(trace->wid);
+
+        for (uint32_t r = 0; r < MAX_NUM_REGS; ++r) {
+            if (used_iregs.test(r)) {
+                uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Integer;
+                auto owner = owners_.at(tag);
+                out.push_back({RegType::Integer, r, owner->exe_type, owner->sfu_type, owner->uuid});
            }
        }
-        {
-            uint32_t r = 0;
-            auto used_fregs = state->used_fregs & in_use_fregs_.at(state->wid);
-            while (used_fregs.any()) {
-                if (used_fregs.test(0)) {
-                    uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Float;
-                    out.push_back({RegType::Float, r, owners_.at(tag)});
-                }
-                used_fregs >>= 1;
-                ++r;
+
+        for (uint32_t r = 0; r < MAX_NUM_REGS; ++r) {
+            if (used_fregs.test(r)) {
+                uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Float;
+                auto owner = owners_.at(tag);
+                out.push_back({RegType::Float, r, owner->exe_type, owner->sfu_type, owner->uuid});
            }
        }
-        {
-            uint32_t r = 0;
-            auto used_vregs = state->used_vregs & in_use_vregs_.at(state->wid);
-            while (used_vregs.any()) {
-                if (used_vregs.test(0)) {
-                    uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Vector;
-                    out.push_back({RegType::Vector, r, owners_.at(tag)});
-                }
-                used_vregs >>= 1;
-                ++r;
+
+        for (uint32_t r = 0; r < MAX_NUM_REGS; ++r) {
+            if (used_vregs.test(r)) {
+                uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Vector;
+                auto owner = owners_.at(tag);
+                out.push_back({RegType::Vector, r, owner->exe_type, owner->sfu_type, owner->uuid});
            }
        }
+
        return out;
    }
    
-    void reserve(pipeline_trace_t* state) {
-        assert(state->wb);  
-        switch (state->rdest_type) {
+    void reserve(pipeline_trace_t* trace) {
+        assert(trace->wb);  
+        switch (trace->rdest_type) {
        case RegType::Integer:            
-            in_use_iregs_.at(state->wid).set(state->rdest);
+            in_use_iregs_.at(trace->wid).set(trace->rdest);
            break;
        case RegType::Float:
-            in_use_fregs_.at(state->wid).set(state->rdest);
+            in_use_fregs_.at(trace->wid).set(trace->rdest);
            break;
        case RegType::Vector:
-            in_use_vregs_.at(state->wid).set(state->rdest);
-            break;
-        default:  
+            in_use_vregs_.at(trace->wid).set(trace->rdest);
            break;
+        default: assert(false);
        }      
-        uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
+        uint32_t tag = (trace->rdest << 16) | (trace->wid << 4) | (int)trace->rdest_type;
        assert(owners_.count(tag) == 0);
-        owners_[tag] = state->uuid;
+        owners_[tag] = trace;
+        assert((int)trace->exe_type < 5);
    }

-    void release(pipeline_trace_t* state) {
-        assert(state->wb);      
-        switch (state->rdest_type) {
+    void release(pipeline_trace_t* trace) {
+        assert(trace->wb);      
+        switch (trace->rdest_type) {
        case RegType::Integer:
-            in_use_iregs_.at(state->wid).reset(state->rdest);
+            in_use_iregs_.at(trace->wid).reset(trace->rdest);
            break;
        case RegType::Float:
-            in_use_fregs_.at(state->wid).reset(state->rdest);
+            in_use_fregs_.at(trace->wid).reset(trace->rdest);
            break;
        case RegType::Vector:
-            in_use_vregs_.at(state->wid).reset(state->rdest);
-            break;
-        default:  
+            in_use_vregs_.at(trace->wid).reset(trace->rdest);
            break;
+        default: assert(false);
        }      
-        uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
+        uint32_t tag = (trace->rdest << 16) | (trace->wid << 4) | (int)trace->rdest_type;
        owners_.erase(tag);
    }

@@ -135,7 +129,7 @@ private:
    std::vector<RegMask> in_use_iregs_;
    std::vector<RegMask> in_use_fregs_;
    std::vector<RegMask> in_use_vregs_;
-    std::unordered_map<uint32_t, uint64_t> owners_;
+    std::unordered_map<uint32_t, pipeline_trace_t*> owners_;
 };

 }
--- a/sim/simx/socket.cpp
+++ b/sim/simx/socket.cpp
@@ -0,0 +1,149 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "socket.h"
+#include "cluster.h"
+
+using namespace vortex;
+
+// Builds one socket: creates the instruction- and data-cache clusters used by
+// this socket's cores, exposes the clusters' memory-side ports through the
+// socket's own ports, then creates the cores and wires each of them to its
+// cache-cluster ports.
+//   ctx       : simulation context forwarded to the SimObject base
+//   socket_id : index of this socket; used in cache names and to derive core ids
+//   cluster   : owning cluster (target of barrier() forwarding)
+//   arch      : supplies socket_size() (cores per socket) and num_warps()
+//   dcrs      : passed through unchanged to every Core::Create call
+Socket::Socket(const SimContext& ctx, 
+                uint32_t socket_id,
+                Cluster* cluster, 
+                const Arch &arch, 
+                const DCRS &dcrs) 
+  : SimObject(ctx, "socket")
+  , icache_mem_req_port(this)
+  , icache_mem_rsp_port(this)
+  , dcache_mem_req_port(this)
+  , dcache_mem_rsp_port(this)
+  , socket_id_(socket_id)
+  , cluster_(cluster)
+  , cores_(arch.socket_size())  
+{
+  auto cores_per_socket = cores_.size();
+  
+  // socket_id is unsigned, so it is formatted with %u; the buffer size is
+  // taken from the array itself instead of repeating a magic number.
+  char sname[100];
+  snprintf(sname, sizeof(sname), "socket%u-icaches", socket_id);
+  icaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_ICACHES, 1, CacheSim::Config{
+    !ICACHE_ENABLED,
+    log2ceil(ICACHE_SIZE),  // C
+    log2ceil(L1_LINE_SIZE), // L
+    log2ceil(sizeof(uint32_t)), // W
+    log2ceil(ICACHE_NUM_WAYS),// A
+    1,                      // B
+    XLEN,                   // address bits
+    1,                      // number of ports
+    1,                      // number of inputs
+    true,                   // write-through
+    false,                  // write response
+    (uint8_t)arch.num_warps(), // mshr
+    2,                      // pipeline latency
+  });
+
+  // route the icache cluster's memory traffic through the socket ports
+  icaches_->MemReqPort.bind(&icache_mem_req_port);
+  icache_mem_rsp_port.bind(&icaches_->MemRspPort);
+
+  snprintf(sname, sizeof(sname), "socket%u-dcaches", socket_id);
+  dcaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
+    !DCACHE_ENABLED,
+    log2ceil(DCACHE_SIZE),  // C
+    log2ceil(L1_LINE_SIZE), // L
+    log2ceil(sizeof(Word)), // W
+    log2ceil(DCACHE_NUM_WAYS),// A
+    log2ceil(DCACHE_NUM_BANKS), // B
+    XLEN,                   // address bits
+    1,                      // number of ports
+    DCACHE_NUM_BANKS,       // number of inputs
+    true,                   // write-through
+    false,                  // write response
+    DCACHE_MSHR_SIZE,       // mshr
+    2,                      // pipeline latency
+  });
+
+  // route the dcache cluster's memory traffic through the socket ports
+  dcaches_->MemReqPort.bind(&dcache_mem_req_port);
+  dcache_mem_rsp_port.bind(&dcaches_->MemRspPort);
+
+  // create cores
+
+  for (uint32_t i = 0; i < cores_per_socket; ++i) {  
+    // core ids are numbered consecutively across sockets
+    uint32_t core_id = socket_id * cores_per_socket + i;
+    cores_.at(i) = Core::Create(core_id, 
+                                this, 
+                                arch, 
+                                dcrs);
+
+    // one instruction-cache request/response port pair per core
+    cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
+    icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));      
+
+    // one data-cache request/response port pair per LSU lane
+    for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) {
+      cores_.at(i)->dcache_req_ports.at(j).bind(&dcaches_->CoreReqPorts.at(i).at(j));
+      dcaches_->CoreRspPorts.at(i).at(j).bind(&cores_.at(i)->dcache_rsp_ports.at(j));
+    }
+  }
+}
+
+// Destructor: no explicit cleanup is performed here.
+Socket::~Socket() {
+  //--
+}
+
+// Reset hook: the body is empty, so no socket-level state is changed here.
+void Socket::reset() {  
+  //--
+}
+
+// Per-cycle hook: the body is empty, so the socket does no work of its own here.
+void Socket::tick() {
+  //--
+}
+
+// Hands the given RAM to every core of this socket, in core order.
+void Socket::attach_ram(RAM* ram) {
+  for (size_t idx = 0, count = cores_.size(); idx < count; ++idx) {
+    cores_.at(idx)->attach_ram(ram);
+  }
+}
+
+// Returns true as soon as one core reports that it is running; false when
+// none does. Cores after the first running one are not queried.
+bool Socket::running() const {
+  bool any_running = false;
+  for (auto it = cores_.begin(); it != cores_.end() && !any_running; ++it) {
+    any_running = (*it)->running();
+  }
+  return any_running;
+}
+
+// Queries every core for its exit status.
+//   exitcode   : receives the bitwise OR of the exit codes of all cores whose
+//                check_exit() returned true (0 when none did)
+//   riscv_test : forwarded unchanged to each core
+// Returns true only when every core's check_exit() returned true.
+bool Socket::check_exit(Word* exitcode, bool riscv_test) const {
+  Word merged_code = 0;
+  bool all_done = true;
+  for (auto& core : cores_) {
+    Word core_code;
+    if (!core->check_exit(&core_code, riscv_test)) {
+      all_done = false;
+      continue;
+    }
+    merged_code |= core_code;
+  }
+  *exitcode = merged_code;
+  return all_done;
+}
+
+// Forwards a barrier request to the owning cluster, offsetting core_id by
+// socket_id_ * cores_.size() before passing it on.
+void Socket::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
+  auto cores_per_socket = cores_.size();
+  auto offset_core_id = socket_id_ * cores_per_socket + core_id;
+  cluster_->barrier(bar_id, count, offset_core_id);
+}
+
+// Resumes the core stored at position core_index of this socket's core list
+// (bounds-checked through vector::at).
+void Socket::resume(uint32_t core_index) {
+  auto& target_core = cores_.at(core_index);
+  target_core->resume();
+}
+
+// Collects the performance counters of the socket's instruction- and
+// data-cache clusters into a single PerfStats record.
+Socket::PerfStats Socket::perf_stats() const {
+  PerfStats stats;
+  auto icache_stats = icaches_->perf_stats();
+  stats.icache = icache_stats;
+  auto dcache_stats = dcaches_->perf_stats();
+  stats.dcache = dcache_stats;
+  return stats;
+}
--- a/sim/simx/socket.h
+++ b/sim/simx/socket.h
@@ -0,0 +1,81 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <simobject.h>
+#include "dcrs.h"
+#include "arch.h"
+#include "cache_cluster.h"
+#include "shared_mem.h"
+#include "core.h"
+#include "constants.h"
+
+namespace vortex {
+
+class Cluster;
+
+// A socket groups a set of cores together with the instruction- and
+// data-cache clusters they are wired to, and exposes the caches' memory-side
+// traffic through its own ports.
+class Socket : public SimObject<Socket> {
+public:
+  // Performance counters reported by perf_stats().
+  struct PerfStats {
+    CacheSim::PerfStats icache; // instruction-cache cluster counters
+    CacheSim::PerfStats dcache; // data-cache cluster counters
+  };
+
+  // memory-side request/response ports of the instruction-cache cluster
+  SimPort<MemReq> icache_mem_req_port;
+  SimPort<MemRsp> icache_mem_rsp_port;
+
+  // memory-side request/response ports of the data-cache cluster
+  SimPort<MemReq> dcache_mem_req_port;
+  SimPort<MemRsp> dcache_mem_rsp_port;
+
+  // Creates the caches and cores of socket `socket_id` belonging to `cluster`.
+  Socket(const SimContext& ctx, 
+         uint32_t socket_id,
+         Cluster* cluster, 
+         const Arch &arch, 
+         const DCRS &dcrs);
+
+  ~Socket();
+
+  // Index of this socket, as given to the constructor.
+  uint32_t id() const {
+    return socket_id_;
+  }
+
+  // Owning cluster, as given to the constructor.
+  Cluster* cluster() const {
+    return cluster_;
+  }
+
+  void reset();
+
+  void tick();
+
+  // Attaches `ram` to every core of this socket.
+  void attach_ram(RAM* ram);
+
+  // True when at least one core of this socket is running.
+  bool running() const;
+
+  // True when every core reports exit; *exitcode receives the OR of the
+  // exit codes of the cores that did.
+  bool check_exit(Word* exitcode, bool riscv_test) const;  
+
+  // Forwards a barrier request to the owning cluster.
+  void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
+
+  // Resumes one core; the argument is used as an index into this socket's
+  // core list by the definition.
+  void resume(uint32_t core_id);
+
+  // Returns the counters of both cache clusters.
+  PerfStats perf_stats() const;
+  
+private:
+  uint32_t                socket_id_; // index of this socket
+  Cluster*                cluster_;   // owning cluster (not owned by the socket)
+  std::vector<Core::Ptr>  cores_;     // cores of this socket
+  CacheCluster::Ptr       icaches_;   // instruction-cache cluster
+  CacheCluster::Ptr       dcaches_;   // data-cache cluster
+};
+
+} // namespace vortex
--- a/sim/simx/types.h
+++ b/sim/simx/types.h
@@ -70,6 +70,7 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
  case RegType::Integer: os << "x"; break;  
  case RegType::Float:   os << "f"; break;
  case RegType::Vector:  os << "v"; break;
+  default: assert(false);
  }
  return os;
 }
@@ -81,7 +82,7 @@ enum class ExeType {
  LSU,
  FPU,
  SFU,
-  MAX,
+  ExeTypeCount
 };

 inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
@@ -90,7 +91,7 @@ inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
  case ExeType::LSU: os << "LSU"; break;
  case ExeType::FPU: os << "FPU"; break;
  case ExeType::SFU: os << "SFU"; break;
-  case ExeType::MAX: break;
+  default: assert(false);
  }
  return os;
 }
@@ -112,6 +113,7 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
  case AluType::SYSCALL: os << "SYSCALL"; break;
  case AluType::IMUL:    os << "IMUL"; break;
  case AluType::IDIV:    os << "IDIV"; break;
+  default: assert(false);
  }
  return os;
 }
@@ -129,6 +131,7 @@ inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
  case LsuType::LOAD:  os << "LOAD"; break;
  case LsuType::STORE: os << "STORE"; break;
  case LsuType::FENCE: os << "FENCE"; break;
+  default: assert(false);
  }
  return os;
 }
@@ -138,7 +141,7 @@ inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
 enum class AddrType {
  Global,
  Shared,
-  IO,
+  IO
 };

 inline std::ostream &operator<<(std::ostream &os, const AddrType& type) {
@@ -146,6 +149,7 @@ inline std::ostream &operator<<(std::ostream &os, const AddrType& type) {
  case AddrType::Global: os << "Global"; break;
  case AddrType::Shared: os << "Shared"; break;
  case AddrType::IO:     os << "IO"; break;
+  default: assert(false);
  }
  return os;
 }
@@ -164,7 +168,7 @@ enum class FpuType {
  FMA,
  FDIV,
  FSQRT,
-  FCVT,
+  FCVT
 };

 inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {
@@ -174,6 +178,7 @@ inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {
  case FpuType::FDIV:  os << "FDIV"; break;
  case FpuType::FSQRT: os << "FSQRT"; break;
  case FpuType::FCVT:  os << "FCVT"; break;
+  default: assert(false);
  }
  return os;
 }
@@ -190,7 +195,7 @@ enum class SfuType {
  CSRRW,
  CSRRS,
  CSRRC,
-  CMOV  
+  CMOV
 };

 inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
@@ -205,6 +210,7 @@ inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
  case SfuType::CSRRS:  os << "CSRRS"; break;
  case SfuType::CSRRC:  os << "CSRRC"; break;
  case SfuType::CMOV:   os << "CMOV"; break;
+  default: assert(false);
  }
  return os;
 }
@@ -220,6 +226,7 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
  switch (type) {
  case ArbiterType::Priority:   os << "Priority"; break;
  case ArbiterType::RoundRobin: os << "RoundRobin"; break;
+  default: assert(false);
  }
  return os;
 }
@@ -351,6 +358,92 @@ private:

 ///////////////////////////////////////////////////////////////////////////////

+// Many-to-few arbiter for one-way traffic: forwards packets from the input
+// ports onto the output ports.
+//
+// Inputs are split into consecutive groups of R = num_inputs / num_outputs
+// ports; output `o` arbitrates among inputs [o*R, o*R + R). At most one packet
+// per output is forwarded on each tick. With ArbiterType::RoundRobin the
+// search for the next packet starts just after the last granted input;
+// otherwise the cursor is never advanced and the search always starts at the
+// first input of the group.
+//
+// When num_inputs == num_outputs the mux is in bypass mode: each input is
+// bound directly to the matching output and tick() does nothing.
+//
+// NOTE(review): when num_inputs is not a multiple of num_outputs, inputs with
+// index >= num_outputs * R are never selected by tick().
+template <typename Type>
+class Mux : public SimObject<Mux<Type>> {
+public:
+  std::vector<SimPort<Type>> Inputs;
+  std::vector<SimPort<Type>> Outputs;
+
+  // ctx/name    : forwarded to the SimObject base
+  // type        : arbitration policy (see class comment)
+  // num_inputs  : number of input ports (<= 32, >= num_outputs)
+  // num_outputs : number of output ports (<= 32)
+  // delay       : delay passed to send() when forwarding a packet (non-zero)
+  Mux(
+    const SimContext& ctx, 
+    const char* name, 
+    ArbiterType type, 
+    uint32_t num_inputs, 
+    uint32_t num_outputs = 1,
+    uint32_t delay = 1
+  ) : SimObject<Mux<Type>>(ctx, name)    
+    , Inputs(num_inputs, this)
+    , Outputs(num_outputs, this)
+    , type_(type)
+    , delay_(delay)
+    , cursors_(num_outputs, 0)
+    , num_reqs_(num_inputs / num_outputs)
+  {
+    assert(delay != 0);    
+    assert(num_inputs <= 32);
+    assert(num_outputs <= 32);
+    assert(num_inputs >= num_outputs);
+
+    // bypass mode
+    if (num_inputs == num_outputs) {      
+      for (uint32_t i = 0; i < num_inputs; ++i) {
+        Inputs.at(i).bind(&Outputs.at(i));
+      }
+    }
+  }
+
+  // Restarts arbitration of every output at the first input of its group.
+  void reset() {
+    for (auto& cursor : cursors_) {
+      cursor = 0;
+    }
+  }
+
+  // Forwards at most one pending packet per output.
+  void tick() {
+    uint32_t I = Inputs.size();
+    uint32_t O = Outputs.size();
+    uint32_t R = num_reqs_;
+
+    // skip bypass mode
+    if (I == O)
+      return;
+        
+    // process inputs       
+    for (uint32_t o = 0; o < O; ++o) {
+      for (uint32_t r = 0; r < R; ++r) {
+        // Wrap with a modulo rather than a bit mask: R is not required to be
+        // a power of two, and a mask of (R-1) would then skip some inputs and
+        // visit others twice. For power-of-two R both forms are identical.
+        uint32_t i = (cursors_.at(o) + r) % R;
+        uint32_t j = o * R + i;
+        if (j >= I)
+          continue;
+        
+        auto& req_in = Inputs.at(j);
+        if (!req_in.empty()) {
+          auto& req = req_in.front();
+          DT(4, this->name() << "-" << req);
+          Outputs.at(o).send(req, delay_);                
+          req_in.pop();
+          this->update_cursor(o, i);
+          break;
+        }
+      }
+    }
+  }
+
+private:
+
+  // Records where the next search of output `index` starts. The stored value
+  // may equal the group size; tick() wraps it when it is read.
+  void update_cursor(uint32_t index, uint32_t grant) {
+    if (type_ == ArbiterType::RoundRobin) {
+      cursors_.at(index) = grant + 1;
+    }
+  }
+
+  ArbiterType type_;
+  uint32_t delay_;  
+  std::vector<uint32_t> cursors_; // per-output search start within its group
+  uint32_t num_reqs_;             // inputs per output (R)
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
 template <typename Req, typename Rsp>
 class Switch : public SimObject<Switch<Req, Rsp>> {
 public:
@@ -364,13 +457,13 @@ public:
    const SimContext& ctx, 
    const char* name, 
    ArbiterType type, 
-    uint32_t num_inputs = 1, 
+    uint32_t num_inputs, 
    uint32_t num_outputs = 1,
    uint32_t delay = 1
  ) 
    : SimObject<Switch<Req, Rsp>>(ctx, name)    
-    , ReqIn(num_inputs,   this)
-    , RspIn(num_inputs,   this)
+    , ReqIn(num_inputs, this)
+    , RspIn(num_inputs, this)
    , ReqOut(num_outputs, this)    
    , RspOut(num_outputs, this)
    , type_(type)
@@ -383,8 +476,8 @@ public:
    assert(num_outputs <= 32);
    assert(num_inputs >= num_outputs);

+    // bypass mode    
    if (num_inputs == num_outputs) {
-      // bypass mode
      for (uint32_t i = 0; i < num_inputs; ++i) {
        ReqIn.at(i).bind(&ReqOut.at(i));
        RspOut.at(i).bind(&RspIn.at(i));
@@ -462,14 +555,14 @@ private:

 class SMemDemux : public SimObject<SMemDemux> {
 public:
-  SimPort<MemReq>  ReqIn;
-  SimPort<MemRsp>  RspIn;
+  SimPort<MemReq> ReqIn;
+  SimPort<MemRsp> RspIn;

-  SimPort<MemReq>  ReqSm;
-  SimPort<MemRsp>  RspSm;
+  SimPort<MemReq> ReqSM;
+  SimPort<MemRsp> RspSM;

-  SimPort<MemReq>  ReqDc;
-  SimPort<MemRsp>  RspDc;
+  SimPort<MemReq> ReqDC;
+  SimPort<MemRsp> RspDC;

  SMemDemux(
    const SimContext& ctx, 
@@ -478,45 +571,49 @@ public:
  ) : SimObject<SMemDemux>(ctx, name)    
    , ReqIn(this)
    , RspIn(this)
-    , ReqSm(this)
-    , RspSm(this)
-    , ReqDc(this)
-    , RspDc(this)
+    , ReqSM(this)
+    , RspSM(this)
+    , ReqDC(this)
+    , RspDC(this)
    , delay_(delay)
  {}

  void reset() {}

-  void tick() {
+  void tick() {      
+    // process incoming responses
+    if (!RspSM.empty()) {
+      auto& rsp = RspSM.front();
+      DT(4, this->name() << "-" << rsp);
+      RspIn.send(rsp, 1);
+      RspSM.pop();
+    }
+    if (!RspDC.empty()) {
+      auto& rsp = RspDC.front();
+      DT(4, this->name() << "-" << rsp);
+      RspIn.send(rsp, 1);
+      RspDC
+      .pop();
+    }
    // process incomming requests  
    if (!ReqIn.empty()) {
      auto& req = ReqIn.front();
      DT(4, this->name() << "-" << req);
      if (req.type == AddrType::Shared) {
-        ReqSm.send(req, delay_);
+        ReqSM.send(req, delay_);
      } else {
-        ReqDc.send(req, delay_);
+        ReqDC.send(req, delay_);
      }
      ReqIn.pop();
    }   
-      
-    // process incoming reponses
-    if (!RspSm.empty()) {
-      auto& rsp = RspSm.front();
-      DT(4, this->name() << "-" << rsp);
-      RspIn.send(rsp, 1);
-      RspSm.pop();
-    }
-    if (!RspDc.empty()) {
-      auto& rsp = RspDc.front();
-      DT(4, this->name() << "-" << rsp);
-      RspIn.send(rsp, 1);
-      RspDc.pop();
-    }
  }

 private:
  uint32_t delay_;
 };

-}
+///////////////////////////////////////////////////////////////////////////////
+
+using MemSwitch = Switch<MemReq, MemRsp>;
+
+}