Merge remote-tracking branch 'upstream/master' into vortex2

This commit is contained in:
Hansung Kim
2024-02-01 23:35:58 -08:00
203 changed files with 4383 additions and 21981 deletions

View File

@@ -1,20 +1,22 @@
XLEN ?= 32
DESTDIR ?= .
RTL_DIR = ../../hw/rtl
DPI_DIR = ../../hw/dpi
DESTDIR ?= $(CURDIR)
HW_DIR = $(abspath ../../hw)
COMMON_DIR = $(abspath ../common)
THIRD_PARTY_DIR = $(abspath ../../third_party)
RTL_DIR = $(HW_DIR)/rtl
DPI_DIR = $(HW_DIR)/dpi
AFU_DIR = $(RTL_DIR)/afu/opae
SCRIPT_DIR = ../../hw/scripts
THIRD_PARTY_DIR = ../../third_party
SCRIPT_DIR = $(HW_DIR)/scripts
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I.. -I../../../hw -I../../common -I$(abspath $(DESTDIR))
CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include
CXXFLAGS += -I../$(THIRD_PARTY_DIR)
CXXFLAGS += -I$(CURDIR) -I$(HW_DIR) -I$(COMMON_DIR) -I$(DESTDIR)
# THIRD_PARTY_DIR is already an absolute path (built with $(abspath ...)),
# so it follows -I directly; an extra leading '/' would yield '-I//...'.
CXXFLAGS += -I$(THIRD_PARTY_DIR)/softfloat/source/include
CXXFLAGS += -I$(THIRD_PARTY_DIR)
CXXFLAGS += -DXLEN_$(XLEN)
LDFLAGS += -shared ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator -pthread
LDFLAGS += -shared $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator -pthread
# control RTL debug tracing states
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_PIPELINE
@@ -53,9 +55,9 @@ endif
DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
SRCS += fpga.cpp opae_sim.cpp
SRCS += $(CURDIR)/fpga.cpp $(CURDIR)/opae_sim.cpp
RTL_PKGS = $(AFU_DIR)/local_mem_cfg_pkg.sv $(AFU_DIR)/ccip/ccip_if_pkg.sv
RTL_PKGS += $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv
@@ -73,7 +75,7 @@ TOP = vortex_afu_shim
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += -DSIMULATION
VL_FLAGS += -DSIMULATION -DSV_DPI
VL_FLAGS += -DXLEN_$(XLEN)
VL_FLAGS += $(CONFIGS)
VL_FLAGS += verilator.vlt
@@ -119,16 +121,16 @@ PROJECT = libopae-c-sim.so
all: $(DESTDIR)/$(PROJECT)
$(DESTDIR)/vortex.xml:
verilator --xml-only -O0 $(VL_FLAGS) $(TOP) --xml-output $(DESTDIR)/vortex.xml
verilator --xml-only -O0 $(VL_FLAGS) $(TOP) --xml-output $@
$(DESTDIR)/scope.json: $(DESTDIR)/vortex.xml
$(SCRIPT_DIR)/scope.py $(DESTDIR)/vortex.xml -o $(DESTDIR)/scope.json
$(SCRIPT_DIR)/scope.py $^ -o $@
$(DESTDIR)/vortex_afu.h : $(AFU_DIR)/vortex_afu.vh
$(SCRIPT_DIR)/gen_config.py -i $(AFU_DIR)/vortex_afu.vh -o $(DESTDIR)/vortex_afu.h
$(SCRIPT_DIR)/gen_config.py -i $^ -o $@
$(DESTDIR)/$(PROJECT): $(SRCS) $(DESTDIR)/vortex_afu.h $(SCOPE_JSON)
verilator --build --exe -O3 $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(DESTDIR)/$(PROJECT)
verilator --build --exe -O3 $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' --Mdir $(DESTDIR)/obj_dir -o $@
clean:
rm -rf obj_dir $(DESTDIR)/vortex.xml $(DESTDIR)/scope.json $(DESTDIR)/vortex_afu.h $(DESTDIR)/$(PROJECT)
rm -rf $(DESTDIR)/obj_dir $(DESTDIR)/vortex.xml $(DESTDIR)/scope.json $(DESTDIR)/vortex_afu.h $(DESTDIR)/$(PROJECT)

View File

@@ -1,2 +1 @@
VX_config.h
/obj_dir/*

View File

@@ -1,18 +1,20 @@
XLEN ?= 32
DESTDIR ?= .
RTL_DIR = ../../hw/rtl
DPI_DIR = ../../hw/dpi
THIRD_PARTY_DIR = ../../third_party
DESTDIR ?= $(CURDIR)
HW_DIR = $(abspath ../../hw)
COMMON_DIR = $(abspath ../common)
THIRD_PARTY_DIR = $(abspath ../../third_party)
RTL_DIR = $(HW_DIR)/rtl
DPI_DIR = $(HW_DIR)/dpi
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I../../../hw -I../../common
CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include
CXXFLAGS += -I../$(THIRD_PARTY_DIR)
CXXFLAGS += -I$(HW_DIR) -I$(COMMON_DIR)
CXXFLAGS += -I$(THIRD_PARTY_DIR)/softfloat/source/include
CXXFLAGS += -I$(THIRD_PARTY_DIR)
CXXFLAGS += -DXLEN_$(XLEN)
LDFLAGS += ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator
LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator
# control RTL debug tracing states
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_PIPELINE
@@ -38,9 +40,9 @@ ifneq (,$(findstring FPU_FPNEW,$(CONFIGS)))
endif
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache $(FPU_INCLUDE)
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
SRCS += processor.cpp
SRCS += $(CURDIR)/processor.cpp
ifdef AXI_BUS
TOP = Vortex_axi
@@ -54,7 +56,7 @@ VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += verilator.vlt
VL_FLAGS += -DSIMULATION
VL_FLAGS += -DSIMULATION -DSV_DPI
VL_FLAGS += -DXLEN_$(XLEN)
VL_FLAGS += $(CONFIGS)
VL_FLAGS += $(RTL_INCLUDE)
@@ -87,11 +89,11 @@ PROJECT = rtlsim
all: $(DESTDIR)/$(PROJECT)
$(DESTDIR)/$(PROJECT): $(SRCS) main.cpp
verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS) -DSTARTUP_ADDR=0x80000000' -LDFLAGS '$(LDFLAGS)' -o ../$@
$(DESTDIR)/$(PROJECT): $(SRCS) $(CURDIR)/main.cpp
verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS) -DSTARTUP_ADDR=0x80000000' -LDFLAGS '$(LDFLAGS)' --Mdir $(DESTDIR)/obj_dir -o $@
$(DESTDIR)/lib$(PROJECT).so: $(SRCS)
verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS)' -LDFLAGS '-shared $(LDFLAGS)' -o ../$@
verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS)' -LDFLAGS '-shared $(LDFLAGS)' --Mdir $(DESTDIR)/obj_dir -o $@
clean:
rm -rf obj_dir $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so
rm -rf $(DESTDIR)/obj_dir $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so

View File

@@ -1,11 +1,12 @@
XLEN ?= 32
DESTDIR ?= .
RTL_DIR = ../hw/rtl
THIRD_PARTY_DIR = ../../third_party
DESTDIR ?= $(CURDIR)
HW_DIR = $(abspath ../../hw)
COMMON_DIR = $(abspath ../common)
THIRD_PARTY_DIR = $(abspath ../../third_party)
CXXFLAGS += -std=c++17 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I. -I../common -I../../hw
CXXFLAGS += -I$(CURDIR) -I$(COMMON_DIR) -I$(HW_DIR)
CXXFLAGS += -I$(THIRD_PARTY_DIR)/softfloat/source/include
CXXFLAGS += -I$(THIRD_PARTY_DIR)
CXXFLAGS += -DXLEN_$(XLEN)
@@ -14,8 +15,8 @@ CXXFLAGS += $(CONFIGS)
LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS += processor.cpp cluster.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp
SRCS += processor.cpp cluster.cpp socket.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp
# Debugging
ifdef DEBUG

View File

@@ -28,6 +28,7 @@ private:
uint16_t num_warps_;
uint16_t num_cores_;
uint16_t num_clusters_;
uint16_t socket_size_;
uint16_t vsize_;
uint16_t num_regs_;
uint16_t num_csrs_;
@@ -35,11 +36,12 @@ private:
uint16_t ipdom_size_;
public:
Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores, uint16_t num_clusters)
Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores)
: num_threads_(num_threads)
, num_warps_(num_warps)
, num_cores_(num_cores)
, num_clusters_(num_clusters)
, num_clusters_(NUM_CLUSTERS)
, socket_size_(SOCKET_SIZE)
, vsize_(16)
, num_regs_(32)
, num_csrs_(4096)
@@ -82,6 +84,10 @@ public:
uint16_t num_clusters() const {
return num_clusters_;
}
uint16_t socket_size() const {
return socket_size_;
}
};
}

View File

@@ -45,20 +45,20 @@ public:
char sname[100];
std::vector<Switch<MemReq, MemRsp>::Ptr> unit_arbs(num_units);
std::vector<MemSwitch::Ptr> unit_arbs(num_units);
for (uint32_t u = 0; u < num_units; ++u) {
snprintf(sname, 100, "%s-unit-arb-%d", name, u);
unit_arbs.at(u) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs);
unit_arbs.at(u) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs);
for (uint32_t i = 0; i < num_requests; ++i) {
this->CoreReqPorts.at(u).at(i).bind(&unit_arbs.at(u)->ReqIn.at(i));
unit_arbs.at(u)->RspIn.at(i).bind(&this->CoreRspPorts.at(u).at(i));
}
}
std::vector<Switch<MemReq, MemRsp>::Ptr> mem_arbs(config.num_inputs);
std::vector<MemSwitch::Ptr> mem_arbs(config.num_inputs);
for (uint32_t i = 0; i < config.num_inputs; ++i) {
snprintf(sname, 100, "%s-mem-arb-%d", name, i);
mem_arbs.at(i) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_units, num_caches);
mem_arbs.at(i) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_units, num_caches);
for (uint32_t u = 0; u < num_units; ++u) {
unit_arbs.at(u)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(u));
mem_arbs.at(i)->RspIn.at(u).bind(&unit_arbs.at(u)->RspOut.at(i));
@@ -66,7 +66,7 @@ public:
}
snprintf(sname, 100, "%s-cache-arb", name);
auto cache_arb = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_caches, 1);
auto cache_arb = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_caches, 1);
for (uint32_t i = 0; i < num_caches; ++i) {
snprintf(sname, 100, "%s-cache%d", name, i);

View File

@@ -41,19 +41,16 @@ struct params_t {
uint32_t tag_select_addr_end;
params_t(const CacheSim::Config& config) {
int32_t bank_bits = log2ceil(config.num_banks);
int32_t offset_bits = config.B - config.W;
int32_t log2_bank_size = config.C - bank_bits;
int32_t index_bits = log2_bank_size - (config.B + config.A);
assert(log2_bank_size > 0);
int32_t offset_bits = config.L - config.W;
int32_t index_bits = config.C - (config.L + config.A + config.B);
assert(offset_bits >= 0);
assert(index_bits >= 0);
this->log2_num_inputs = log2ceil(config.num_inputs);
this->words_per_line = 1 << offset_bits;
this->sets_per_bank = 1 << index_bits;
this->lines_per_set = 1 << config.A;
this->sets_per_bank = 1 << index_bits;
this->words_per_line = 1 << offset_bits;
assert(config.ports_per_bank <= this->words_per_line);
@@ -63,7 +60,7 @@ struct params_t {
// Bank select
this->bank_select_addr_start = (1+this->word_select_addr_end);
this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1);
this->bank_select_addr_end = (this->bank_select_addr_start+config.B-1);
// Set select
this->set_select_addr_start = (1+this->bank_select_addr_end);
@@ -74,23 +71,23 @@ struct params_t {
this->tag_select_addr_end = (config.addr_width-1);
}
uint32_t addr_bank_id(uint64_t word_addr) const {
uint32_t addr_bank_id(uint64_t addr) const {
if (bank_select_addr_end >= bank_select_addr_start)
return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end);
return (uint32_t)bit_getw(addr, bank_select_addr_start, bank_select_addr_end);
else
return 0;
}
uint32_t addr_set_id(uint64_t word_addr) const {
uint32_t addr_set_id(uint64_t addr) const {
if (set_select_addr_end >= set_select_addr_start)
return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end);
return (uint32_t)bit_getw(addr, set_select_addr_start, set_select_addr_end);
else
return 0;
}
uint64_t addr_tag(uint64_t word_addr) const {
uint64_t addr_tag(uint64_t addr) const {
if (tag_select_addr_end >= tag_select_addr_start)
return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end);
return bit_getw(addr, tag_select_addr_start, tag_select_addr_end);
else
return 0;
}
@@ -288,8 +285,8 @@ private:
Config config_;
params_t params_;
std::vector<bank_t> banks_;
Switch<MemReq, MemRsp>::Ptr bank_switch_;
Switch<MemReq, MemRsp>::Ptr bypass_switch_;
MemSwitch::Ptr bank_switch_;
MemSwitch::Ptr bypass_switch_;
std::vector<SimPort<MemReq>> mem_req_ports_;
std::vector<SimPort<MemRsp>> mem_rsp_ports_;
std::vector<bank_req_t> pipeline_reqs_;
@@ -304,16 +301,16 @@ public:
: simobject_(simobject)
, config_(config)
, params_(config)
, banks_(config.num_banks, {config, params_})
, mem_req_ports_(config.num_banks, simobject)
, mem_rsp_ports_(config.num_banks, simobject)
, pipeline_reqs_(config.num_banks, config.ports_per_bank)
, banks_((1 << config.B), {config, params_})
, mem_req_ports_((1 << config.B), simobject)
, mem_rsp_ports_((1 << config.B), simobject)
, pipeline_reqs_((1 << config.B), config.ports_per_bank)
{
char sname[100];
snprintf(sname, 100, "%s-bypass-arb", simobject->name().c_str());
if (config_.bypass) {
bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config_.num_inputs);
bypass_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, config_.num_inputs);
for (uint32_t i = 0; i < config_.num_inputs; ++i) {
simobject->CoreReqPorts.at(i).bind(&bypass_switch_->ReqIn.at(i));
bypass_switch_->RspIn.at(i).bind(&simobject->CoreRspPorts.at(i));
@@ -323,14 +320,14 @@ public:
return;
}
bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::Priority, 2);
bypass_switch_ = MemSwitch::Create(sname, ArbiterType::Priority, 2);
bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort);
simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0));
if (config.num_banks > 1) {
if (config.B != 0) {
snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str());
bank_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config.num_banks);
for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
bank_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, (1 << config.B));
for (uint32_t i = 0, n = (1 << config.B); i < n; ++i) {
mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i));
bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i));
}
@@ -383,20 +380,22 @@ public:
pipeline_req.clear();
}
// schedule MSHR replay
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
// first: schedule MSHR replay (flush MSHR queue)
for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) {
auto& bank = banks_.at(bank_id);
auto& pipeline_req = pipeline_reqs_.at(bank_id);
bank.mshr.pop(&pipeline_req);
}
// schedule memory fill
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
// second: schedule memory fill (flush memory queue)
for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) {
auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
if (mem_rsp_port.empty())
continue;
auto& pipeline_req = pipeline_reqs_.at(bank_id);
// skip if bank already busy
if (pipeline_req.type != bank_req_t::None)
continue;
@@ -407,7 +406,7 @@ public:
mem_rsp_port.pop();
}
// schedule core requests
// last: schedule core requests (flush core queue)
for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
auto& core_req_port = simobject_->CoreReqPorts.at(req_id);
if (core_req_port.empty())
@@ -425,18 +424,21 @@ public:
}
auto bank_id = params_.addr_bank_id(core_req.addr);
auto set_id = params_.addr_set_id(core_req.addr);
auto tag = params_.addr_tag(core_req.addr);
auto port_id = req_id % config_.ports_per_bank;
auto& bank = banks_.at(bank_id);
auto& pipeline_req = pipeline_reqs_.at(bank_id);
// skip if bank already busy
if (pipeline_req.type != bank_req_t::None)
continue;
auto set_id = params_.addr_set_id(core_req.addr);
auto tag = params_.addr_tag(core_req.addr);
auto port_id = req_id % config_.ports_per_bank;
// check MSHR capacity
if ((!core_req.write || !config_.write_through)
&& bank.mshr.full()) {
++perf_stats_.mshr_stalls;
++perf_stats_.bank_stalls;
continue;
}
@@ -452,7 +454,7 @@ public:
}
// extend request ports
pipeline_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
} else if (pipeline_req.type == bank_req_t::None) {
} else {
// schedule new request
bank_req_t bank_req(config_.ports_per_bank);
bank_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
@@ -463,10 +465,6 @@ public:
bank_req.type = bank_req_t::Core;
bank_req.write = core_req.write;
pipeline_req = bank_req;
} else {
// bank in use
++perf_stats_.bank_stalls;
continue;
}
if (core_req.write)
@@ -516,7 +514,7 @@ private:
}
void processBankRequests() {
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) {
auto& bank = banks_.at(bank_id);
auto pipeline_req = pipeline_reqs_.at(bank_id);
@@ -545,11 +543,10 @@ private:
}
}
} break;
case bank_req_t::Core: {
bool hit = false;
bool found_free_line = false;
uint32_t hit_line_id = 0;
uint32_t repl_line_id = 0;
case bank_req_t::Core: {
int32_t hit_line_id = -1;
int32_t free_line_id = -1;
int32_t repl_line_id = 0;
uint32_t max_cnt = 0;
auto& set = bank.sets.at(pipeline_req.set_id);
@@ -557,38 +554,34 @@ private:
// tag lookup
for (uint32_t i = 0, n = set.lines.size(); i < n; ++i) {
auto& line = set.lines.at(i);
if (max_cnt < line.lru_ctr) {
max_cnt = line.lru_ctr;
repl_line_id = i;
}
if (line.valid) {
if (line.tag == pipeline_req.tag) {
line.lru_ctr = 0;
if (line.tag == pipeline_req.tag) {
hit_line_id = i;
hit = true;
line.lru_ctr = 0;
} else {
++line.lru_ctr;
}
if (max_cnt < line.lru_ctr) {
max_cnt = line.lru_ctr;
repl_line_id = i;
}
} else {
found_free_line = true;
repl_line_id = i;
free_line_id = i;
}
}
if (hit) {
//
// Hit handling
//
if (hit_line_id != -1) {
// Hit handling
if (pipeline_req.write) {
// handle write hit
// handle write hit
auto& hit_line = set.lines.at(hit_line_id);
if (config_.write_through) {
// forward write request to memory
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_line.tag);
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = true;
mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid;
mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req);
} else {
@@ -606,23 +599,21 @@ private:
DT(3, simobject_->name() << "-core-" << core_rsp);
}
}
} else {
//
// Miss handling
//
} else {
// Miss handling
if (pipeline_req.write)
++perf_stats_.write_misses;
else
++perf_stats_.read_misses;
if (!found_free_line && !config_.write_through) {
if (free_line_id == -1 && !config_.write_through) {
// write back dirty line
auto& repl_line = set.lines.at(repl_line_id);
if (repl_line.dirty) {
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_line.tag);
mem_req.write = true;
mem_req.cid = pipeline_req.cid;
mem_req.cid = pipeline_req.cid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req);
++perf_stats_.evictions;
@@ -635,8 +626,8 @@ private:
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = true;
mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid;
mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req);
}
@@ -655,7 +646,7 @@ private:
auto mshr_pending = bank.mshr.lookup(pipeline_req);
// allocate MSHR
auto mshr_id = bank.mshr.allocate(pipeline_req, repl_line_id);
auto mshr_id = bank.mshr.allocate(pipeline_req, (free_line_id != -1) ? free_line_id : repl_line_id);
// send fill request
if (!mshr_pending) {
@@ -663,8 +654,8 @@ private:
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = false;
mem_req.tag = mshr_id;
mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid;
mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req);
++pending_fill_reqs_;

View File

@@ -23,16 +23,15 @@ public:
struct Config {
bool bypass; // cache bypass
uint8_t C; // log2 cache size
uint8_t B; // log2 block size
uint8_t L; // log2 line size
uint8_t W; // log2 word size
uint8_t A; // log2 associativity
uint8_t addr_width; // word address bits
uint8_t num_banks; // number of banks
uint8_t B; // log2 number of banks
uint8_t addr_width; // word address bits
uint8_t ports_per_bank; // number of ports per bank
uint8_t num_inputs; // number of inputs
bool write_through; // is write-through
bool write_reponse; // enable write response
uint16_t victim_size; // victim cache size
uint16_t mshr_size; // MSHR buffer size
uint8_t latency; // pipeline latency
};

View File

@@ -18,34 +18,60 @@ using namespace vortex;
Cluster::Cluster(const SimContext& ctx,
uint32_t cluster_id,
ProcessorImpl* processor,
const Arch &arch, const
DCRS &dcrs)
const Arch &arch,
const DCRS &dcrs)
: SimObject(ctx, "cluster")
, mem_req_port(this)
, mem_rsp_port(this)
, cluster_id_(cluster_id)
, cores_(arch.num_cores())
, barriers_(arch.num_barriers(), 0)
, sharedmems_(arch.num_cores())
, processor_(processor)
, sockets_(NUM_SOCKETS)
, barriers_(arch.num_barriers(), 0)
, cores_per_socket_(arch.socket_size())
{
auto num_cores = arch.num_cores();
char sname[100];
uint32_t sockets_per_cluster = sockets_.size();
// create sockets
snprintf(sname, 100, "cluster%d-icache-arb", cluster_id);
auto icache_switch = MemSwitch::Create(sname, ArbiterType::RoundRobin, sockets_per_cluster);
snprintf(sname, 100, "cluster%d-dcache-arb", cluster_id);
auto dcache_switch = MemSwitch::Create(sname, ArbiterType::RoundRobin, sockets_per_cluster);
for (uint32_t i = 0; i < sockets_per_cluster; ++i) {
uint32_t socket_id = cluster_id * sockets_per_cluster + i;
auto socket = Socket::Create(socket_id,
this,
arch,
dcrs);
socket->icache_mem_req_port.bind(&icache_switch->ReqIn.at(i));
icache_switch->RspIn.at(i).bind(&socket->icache_mem_rsp_port);
socket->dcache_mem_req_port.bind(&dcache_switch->ReqIn.at(i));
dcache_switch->RspIn.at(i).bind(&socket->dcache_mem_rsp_port);
sockets_.at(i) = socket;
}
// Create l2cache
snprintf(sname, 100, "cluster%d-l2cache", cluster_id);
l2cache_ = CacheSim::Create(sname, CacheSim::Config{
!L2_ENABLED,
log2ceil(L2_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B
log2ceil(MEM_BLOCK_SIZE), // L
log2ceil(L2_NUM_WAYS), // W
0, // A
log2ceil(L2_NUM_BANKS), // B
XLEN, // address bits
L2_NUM_BANKS, // number of banks
1, // number of ports
5, // request size
2, // request size
true, // write-through
false, // write response
0, // victim size
L2_MSHR_SIZE, // mshr
2, // pipeline latency
});
@@ -53,89 +79,11 @@ Cluster::Cluster(const SimContext& ctx,
l2cache_->MemReqPort.bind(&this->mem_req_port);
this->mem_rsp_port.bind(&l2cache_->MemRspPort);
snprintf(sname, 100, "cluster%d-icaches", cluster_id);
icaches_ = CacheCluster::Create(sname, num_cores, NUM_ICACHES, 1, CacheSim::Config{
!ICACHE_ENABLED,
log2ceil(ICACHE_SIZE), // C
log2ceil(L1_LINE_SIZE), // B
log2ceil(sizeof(uint32_t)), // W
log2ceil(ICACHE_NUM_WAYS),// A
XLEN, // address bits
1, // number of banks
1, // number of ports
1, // number of inputs
true, // write-through
false, // write response
0, // victim size
(uint8_t)arch.num_warps(), // mshr
2, // pipeline latency
});
icache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(0));
l2cache_->CoreRspPorts.at(0).bind(&icache_switch->RspOut.at(0));
icaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(0));
l2cache_->CoreRspPorts.at(0).bind(&icaches_->MemRspPort);
snprintf(sname, 100, "cluster%d-dcaches", cluster_id);
dcaches_ = CacheCluster::Create(sname, num_cores, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
!DCACHE_ENABLED,
log2ceil(DCACHE_SIZE), // C
log2ceil(L1_LINE_SIZE), // B
log2ceil(sizeof(Word)), // W
log2ceil(DCACHE_NUM_WAYS),// A
XLEN, // address bits
DCACHE_NUM_BANKS, // number of banks
1, // number of ports
DCACHE_NUM_BANKS, // number of inputs
true, // write-through
false, // write response
0, // victim size
DCACHE_MSHR_SIZE, // mshr
4, // pipeline latency
});
dcaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(1));
l2cache_->CoreRspPorts.at(1).bind(&dcaches_->MemRspPort);
///////////////////////////////////////////////////////////////////////////
// create shared memory blocks
for (uint32_t i = 0; i < num_cores; ++i) {
snprintf(sname, 100, "cluster%d-shared_mem%d", cluster_id, i);
sharedmems_.at(i) = SharedMem::Create(sname, SharedMem::Config{
(1 << SMEM_LOG_SIZE),
sizeof(Word),
NUM_LSU_LANES,
NUM_LSU_LANES,
false
});
}
// create cores
for (uint32_t i = 0; i < num_cores; ++i) {
uint32_t core_id = cluster_id * num_cores + i;
cores_.at(i) = Core::Create(core_id,
this,
arch,
dcrs,
sharedmems_.at(i));
cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));
for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) {
snprintf(sname, 100, "cluster%d-smem_demux%d_%d", cluster_id, i, j);
auto smem_demux = SMemDemux::Create(sname);
cores_.at(i)->dcache_req_ports.at(j).bind(&smem_demux->ReqIn);
smem_demux->RspIn.bind(&cores_.at(i)->dcache_rsp_ports.at(j));
smem_demux->ReqDc.bind(&dcaches_->CoreReqPorts.at(i).at(j));
dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDc);
smem_demux->ReqSm.bind(&sharedmems_.at(i)->Inputs.at(j));
sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSm);
}
}
dcache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(1));
l2cache_->CoreRspPorts.at(1).bind(&dcache_switch->RspOut.at(0));
}
Cluster::~Cluster() {
@@ -153,14 +101,14 @@ void Cluster::tick() {
}
void Cluster::attach_ram(RAM* ram) {
for (auto core : cores_) {
core->attach_ram(ram);
for (auto& socket : sockets_) {
socket->attach_ram(ram);
}
}
bool Cluster::running() const {
for (auto& core : cores_) {
if (core->running())
for (auto& socket : sockets_) {
if (socket->running())
return true;
}
return false;
@@ -169,9 +117,9 @@ bool Cluster::running() const {
bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
bool done = true;
Word exitcode_ = 0;
for (auto& core : cores_) {
for (auto& socket : sockets_) {
Word ec;
if (core->check_exit(&ec, riscv_test)) {
if (socket->check_exit(&ec, riscv_test)) {
exitcode_ |= ec;
} else {
done = false;
@@ -184,36 +132,32 @@ bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
auto& barrier = barriers_.at(bar_id);
uint32_t local_core_id = core_id % cores_.size();
auto sockets_per_cluster = sockets_.size();
auto cores_per_socket = cores_per_socket_;
uint32_t cores_per_cluster = sockets_per_cluster * cores_per_socket;
uint32_t local_core_id = core_id % cores_per_cluster;
barrier.set(local_core_id);
DP(3, "*** Suspend core #" << core_id << " at barrier #" << bar_id);
if (barrier.count() == (size_t)count) {
// resume all suspended cores
for (uint32_t i = 0; i < cores_.size(); ++i) {
if (barrier.test(i)) {
DP(3, "*** Resume core #" << i << " at barrier #" << bar_id);
cores_.at(i)->resume();
for (uint32_t s = 0; s < sockets_per_cluster; ++s) {
for (uint32_t c = 0; c < cores_per_socket; ++c) {
uint32_t i = s * cores_per_socket + c;
if (barrier.test(i)) {
DP(3, "*** Resume core #" << i << " at barrier #" << bar_id);
sockets_.at(s)->resume(c);
}
}
}
barrier.reset();
}
}
ProcessorImpl* Cluster::processor() const {
return processor_;
}
Cluster::PerfStats Cluster::perf_stats() const {
Cluster::PerfStats perf;
perf.icache = icaches_->perf_stats();
perf.dcache = dcaches_->perf_stats();
perf.l2cache = l2cache_->perf_stats();
for (auto sharedmem : sharedmems_) {
perf.sharedmem += sharedmem->perf_stats();
}
return perf;
PerfStats perf_stats;
perf_stats.l2cache = l2cache_->perf_stats();
return perf_stats;
}

View File

@@ -19,6 +19,7 @@
#include "cache_cluster.h"
#include "shared_mem.h"
#include "core.h"
#include "socket.h"
#include "constants.h"
namespace vortex {
@@ -28,18 +29,7 @@ class ProcessorImpl;
class Cluster : public SimObject<Cluster> {
public:
struct PerfStats {
CacheSim::PerfStats icache;
CacheSim::PerfStats dcache;
SharedMem::PerfStats sharedmem;
CacheSim::PerfStats l2cache;
PerfStats& operator+=(const PerfStats& rhs) {
this->icache += rhs.icache;
this->dcache += rhs.dcache;
this->sharedmem += rhs.sharedmem;
this->l2cache += rhs.l2cache;
return *this;
}
CacheSim::PerfStats l2cache;
};
SimPort<MemReq> mem_req_port;
@@ -53,6 +43,14 @@ public:
~Cluster();
uint32_t id() const {
return cluster_id_;
}
ProcessorImpl* processor() const {
return processor_;
}
void reset();
void tick();
@@ -65,22 +63,15 @@ public:
void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
ProcessorImpl* processor() const;
Cluster::PerfStats perf_stats() const;
PerfStats perf_stats() const;
private:
uint32_t cluster_id_;
std::vector<Core::Ptr> cores_;
std::vector<CoreMask> barriers_;
CacheSim::Ptr l2cache_;
CacheCluster::Ptr icaches_;
CacheCluster::Ptr dcaches_;
std::vector<SharedMem::Ptr> sharedmems_;
CacheCluster::Ptr tcaches_;
CacheCluster::Ptr ocaches_;
CacheCluster::Ptr rcaches_;
ProcessorImpl* processor_;
uint32_t cluster_id_;
ProcessorImpl* processor_;
std::vector<Socket::Ptr> sockets_;
std::vector<CoreMask> barriers_;
CacheSim::Ptr l2cache_;
uint32_t cores_per_socket_;
};
} // namespace vortex

View File

@@ -21,6 +21,7 @@
#include "mem.h"
#include "decode.h"
#include "core.h"
#include "socket.h"
#include "debug.h"
#include "constants.h"
#include "processor_impl.h"
@@ -29,35 +30,36 @@ using namespace vortex;
Core::Core(const SimContext& ctx,
uint32_t core_id,
Cluster* cluster,
Socket* socket,
const Arch &arch,
const DCRS &dcrs,
SharedMem::Ptr sharedmem)
const DCRS &dcrs)
: SimObject(ctx, "core")
, icache_req_ports(1, this)
, icache_rsp_ports(1, this)
, dcache_req_ports(NUM_LSU_LANES, this)
, dcache_rsp_ports(NUM_LSU_LANES, this)
, core_id_(core_id)
, socket_(socket)
, arch_(arch)
, dcrs_(dcrs)
, decoder_(arch)
, warps_(arch.num_warps())
, barriers_(arch.num_barriers(), 0)
, fcsrs_(arch.num_warps(), 0)
, ibuffers_(ISSUE_WIDTH, IBUF_SIZE)
, scoreboard_(arch_)
, ibuffers_(arch.num_warps(), IBUF_SIZE)
, scoreboard_(arch_)
, operands_(ISSUE_WIDTH)
, dispatchers_((uint32_t)ExeType::MAX)
, exe_units_((uint32_t)ExeType::MAX)
, sharedmem_(sharedmem)
, dispatchers_((uint32_t)ExeType::ExeTypeCount)
, exe_units_((uint32_t)ExeType::ExeTypeCount)
, smem_demuxs_(NUM_LSU_LANES)
, fetch_latch_("fetch")
, decode_latch_("decode")
, pending_icache_(arch_.num_warps())
, committed_traces_(ISSUE_WIDTH, nullptr)
, csrs_(arch.num_warps())
, cluster_(cluster)
{
, csrs_(arch.num_warps())
, commit_arbs_(ISSUE_WIDTH)
{
char sname[100];
for (uint32_t i = 0; i < arch_.num_warps(); ++i) {
csrs_.at(i).resize(arch.num_threads());
}
@@ -70,6 +72,28 @@ Core::Core(const SimContext& ctx,
operands_.at(i) = SimPlatform::instance().create_object<Operand>();
}
// initialize shared memory
snprintf(sname, 100, "core%d-shared_mem", core_id);
shared_mem_ = SharedMem::Create(sname, SharedMem::Config{
(1 << SMEM_LOG_SIZE),
sizeof(Word),
NUM_LSU_LANES,
NUM_LSU_LANES,
false
});
for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
snprintf(sname, 100, "core%d-smem_demux%d", core_id, i);
auto smem_demux = SMemDemux::Create(sname);
smem_demux->ReqDC.bind(&dcache_req_ports.at(i));
dcache_rsp_ports.at(i).bind(&smem_demux->RspDC);
smem_demux->ReqSM.bind(&shared_mem_->Inputs.at(i));
shared_mem_->Outputs.at(i).bind(&smem_demux->RspSM);
smem_demuxs_.at(i) = smem_demux;
}
// initialize dispatchers
dispatchers_.at((int)ExeType::ALU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_ALU_BLOCKS, NUM_ALU_LANES);
dispatchers_.at((int)ExeType::FPU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_FPU_BLOCKS, NUM_FPU_LANES);
@@ -82,6 +106,16 @@ Core::Core(const SimContext& ctx,
exe_units_.at((int)ExeType::LSU) = SimPlatform::instance().create_object<LsuUnit>(this);
exe_units_.at((int)ExeType::SFU) = SimPlatform::instance().create_object<SfuUnit>(this);
// bind commit arbiters
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
snprintf(sname, 100, "core%d-commit-arb%d", core_id, i);
auto arbiter = TraceSwitch::Create(sname, ArbiterType::RoundRobin, (uint32_t)ExeType::ExeTypeCount, 1);
for (uint32_t j = 0; j < (uint32_t)ExeType::ExeTypeCount; ++j) {
exe_units_.at(j)->Outputs.at(i).bind(&arbiter->Inputs.at(j));
}
commit_arbs_.at(i) = arbiter;
}
this->reset();
}
@@ -99,8 +133,12 @@ void Core::reset() {
for (auto& exe_unit : exe_units_) {
exe_unit->reset();
}
for (auto& commit_arb : commit_arbs_) {
commit_arb->reset();
}
for ( auto& barrier : barriers_) {
for (auto& barrier : barriers_) {
barrier.reset();
}
@@ -112,7 +150,7 @@ void Core::reset() {
ibuf.clear();
}
commit_exe_= 0;
ibuffer_idx_ = 0;
scoreboard_.clear();
fetch_latch_.clear();
@@ -150,8 +188,10 @@ void Core::schedule() {
break;
}
}
if (scheduled_warp == -1)
if (scheduled_warp == -1) {
++perf_stats_.sched_idle;
return;
}
// suspend warp until decode
stalled_warps_.set(scheduled_warp);
@@ -192,11 +232,11 @@ void Core::fetch() {
mem_req.tag = pending_icache_.allocate(trace);
mem_req.cid = trace->cid;
mem_req.uuid = trace->uuid;
icache_req_ports.at(0).send(mem_req, 1);
icache_req_ports.at(0).send(mem_req, 2);
DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
fetch_latch_.pop();
++pending_ifetches_;
fetch_latch_.pop();
++perf_stats_.ifetches;
++pending_ifetches_;
}
void Core::decode() {
@@ -206,7 +246,7 @@ void Core::decode() {
auto trace = decode_latch_.front();
// check ibuffer capacity
auto& ibuffer = ibuffers_.at(trace->wid % ISSUE_WIDTH);
auto& ibuffer = ibuffers_.at(trace->wid);
if (ibuffer.full()) {
if (!trace->log_once(true)) {
DT(3, "*** ibuffer-stall: " << *trace);
@@ -223,13 +263,6 @@ void Core::decode() {
stalled_warps_.reset(trace->wid);
}
// update perf counters
uint32_t active_threads = trace->tmask.count();
if (trace->exe_type == ExeType::LSU && trace->lsu_type == LsuType::LOAD)
perf_stats_.loads += active_threads;
if (trace->exe_type == ExeType::LSU && trace->lsu_type == LsuType::STORE)
perf_stats_.stores += active_threads;
DT(3, "pipeline-decode: " << *trace);
// insert to ibuffer
@@ -239,7 +272,7 @@ void Core::decode() {
}
void Core::issue() {
// operands to dispatch
// operands to dispatchers
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto& operand = operands_.at(i);
if (operand->Output.empty())
@@ -257,7 +290,8 @@ void Core::issue() {
// issue ibuffer instructions
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto& ibuffer = ibuffers_.at(i);
uint32_t ii = (ibuffer_idx_ + i) % ibuffers_.size();
auto& ibuffer = ibuffers_.at(ii);
if (ibuffer.empty())
continue;
@@ -265,17 +299,41 @@ void Core::issue() {
// check scoreboard
if (scoreboard_.in_use(trace)) {
auto uses = scoreboard_.get_uses(trace);
if (!trace->log_once(true)) {
DTH(3, "*** scoreboard-stall: dependents={");
auto uses = scoreboard_.get_uses(trace);
DTH(3, "*** scoreboard-stall: dependents={");
for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
auto& use = uses.at(j);
__unused (use);
if (j) DTN(3, ", ");
DTN(3, use.type << use.reg << "(#" << use.owner << ")");
DTN(3, use.reg_type << use.reg_id << "(#" << use.uuid << ")");
}
DTN(3, "}, " << *trace << std::endl);
}
for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
auto& use = uses.at(j);
switch (use.exe_type) {
case ExeType::ALU: ++perf_stats_.scrb_alu; break;
case ExeType::FPU: ++perf_stats_.scrb_fpu; break;
case ExeType::LSU: ++perf_stats_.scrb_lsu; break;
case ExeType::SFU: {
++perf_stats_.scrb_sfu;
switch (use.sfu_type) {
case SfuType::TMC:
case SfuType::WSPAWN:
case SfuType::SPLIT:
case SfuType::JOIN:
case SfuType::BAR:
case SfuType::PRED: ++perf_stats_.scrb_wctl; break;
case SfuType::CSRRW:
case SfuType::CSRRS:
case SfuType::CSRRC: ++perf_stats_.scrb_csrs; break;
default: assert(false);
}
} break;
default: assert(false);
}
}
++perf_stats_.scrb_stalls;
continue;
} else {
@@ -294,10 +352,11 @@ void Core::issue() {
ibuffer.pop();
}
ibuffer_idx_ += ISSUE_WIDTH;
}
void Core::execute() {
for (uint32_t i = 0; i < (uint32_t)ExeType::MAX; ++i) {
for (uint32_t i = 0; i < (uint32_t)ExeType::ExeTypeCount; ++i) {
auto& dispatch = dispatchers_.at(i);
auto& exe_unit = exe_units_.at(i);
for (uint32_t j = 0; j < ISSUE_WIDTH; ++j) {
@@ -313,10 +372,10 @@ void Core::execute() {
void Core::commit() {
// process completed instructions
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto trace = committed_traces_.at(i);
if (!trace)
auto& commit_arb = commit_arbs_.at(i);
if (commit_arb->Outputs.at(0).empty())
continue;
committed_traces_.at(i) = nullptr;
auto trace = commit_arb->Outputs.at(0).front();
// advance to commit stage
DT(3, "pipeline-commit: " << *trace);
@@ -334,27 +393,11 @@ void Core::commit() {
perf_stats_.instrs += trace->tmask.count();
}
commit_arb->Outputs.at(0).pop();
// delete the trace
delete trace;
}
// select completed instructions
for (uint32_t i = 0; i < (uint32_t)ExeType::MAX; ++i) {
uint32_t ii = (commit_exe_ + i) % (uint32_t)ExeType::MAX;
auto& exe_unit = exe_units_.at(ii);
for (uint32_t j = 0; j < ISSUE_WIDTH; ++j) {
auto committed_trace = committed_traces_.at(j);
if (committed_trace)
continue;
auto& output = exe_unit->Outputs.at(j);
if (output.empty())
continue;
auto trace = output.front();
committed_traces_.at(j) = trace;
output.pop();
}
}
++commit_exe_;
}
void Core::wspawn(uint32_t num_warps, Word nextPC) {
@@ -379,7 +422,7 @@ void Core::barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id) {
if (is_global) {
// global barrier handling
if (barrier.count() == active_warps_.count()) {
cluster_->barrier(bar_idx, count, core_id_);
socket_->barrier(bar_idx, count, core_id_);
barrier.reset();
}
} else {
@@ -416,7 +459,7 @@ AddrType Core::get_addr_type(uint64_t addr) {
void Core::dcache_read(void *data, uint64_t addr, uint32_t size) {
auto type = this->get_addr_type(addr);
if (type == AddrType::Shared) {
sharedmem_->read(data, addr, size);
shared_mem_->read(data, addr, size);
} else {
mmu_.read(data, addr, size, 0);
}
@@ -431,7 +474,7 @@ void Core::dcache_write(const void* data, uint64_t addr, uint32_t size) {
this->writeToStdOut(data, addr, size);
} else {
if (type == AddrType::Shared) {
sharedmem_->write(data, addr, size);
shared_mem_->write(data, addr, size);
} else {
mmu_.write(data, addr, size, 0);
}
@@ -533,71 +576,76 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
break;
case VX_DCR_MPM_CLASS_CORE: {
switch (addr) {
case VX_CSR_MPM_SCHED_ID: return perf_stats_.sched_idle & 0xffffffff;
case VX_CSR_MPM_SCHED_ID_H:return perf_stats_.sched_idle >> 32;
case VX_CSR_MPM_SCHED_ST: return perf_stats_.sched_stalls & 0xffffffff;
case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32;
case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff;
case VX_CSR_MPM_IBUF_ST_H: return perf_stats_.ibuf_stalls >> 32;
case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff;
case VX_CSR_MPM_SCRB_ST_H: return perf_stats_.scrb_stalls >> 32;
case VX_CSR_MPM_ALU_ST: return perf_stats_.alu_stalls & 0xffffffff;
case VX_CSR_MPM_ALU_ST_H: return perf_stats_.alu_stalls >> 32;
case VX_CSR_MPM_LSU_ST: return perf_stats_.lsu_stalls & 0xffffffff;
case VX_CSR_MPM_LSU_ST_H: return perf_stats_.lsu_stalls >> 32;
case VX_CSR_MPM_FPU_ST: return perf_stats_.fpu_stalls & 0xffffffff;
case VX_CSR_MPM_FPU_ST_H: return perf_stats_.fpu_stalls >> 32;
case VX_CSR_MPM_SFU_ST: return perf_stats_.sfu_stalls & 0xffffffff;
case VX_CSR_MPM_SFU_ST_H: return perf_stats_.sfu_stalls >> 32;
case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff;
case VX_CSR_MPM_SCRB_ST_H: return perf_stats_.scrb_stalls >> 32;
case VX_CSR_MPM_SCRB_ALU: return perf_stats_.scrb_alu & 0xffffffff;
case VX_CSR_MPM_SCRB_ALU_H:return perf_stats_.scrb_alu >> 32;
case VX_CSR_MPM_SCRB_FPU: return perf_stats_.scrb_fpu & 0xffffffff;
case VX_CSR_MPM_SCRB_FPU_H:return perf_stats_.scrb_fpu >> 32;
case VX_CSR_MPM_SCRB_LSU: return perf_stats_.scrb_lsu & 0xffffffff;
case VX_CSR_MPM_SCRB_LSU_H:return perf_stats_.scrb_lsu >> 32;
case VX_CSR_MPM_SCRB_SFU: return perf_stats_.scrb_sfu & 0xffffffff;
case VX_CSR_MPM_SCRB_SFU_H:return perf_stats_.scrb_sfu >> 32;
case VX_CSR_MPM_SCRB_WCTL: return perf_stats_.scrb_wctl & 0xffffffff;
case VX_CSR_MPM_SCRB_WCTL_H: return perf_stats_.scrb_wctl >> 32;
case VX_CSR_MPM_SCRB_CSRS: return perf_stats_.scrb_csrs & 0xffffffff;
case VX_CSR_MPM_SCRB_CSRS_H: return perf_stats_.scrb_csrs >> 32;
case VX_CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff;
case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32;
case VX_CSR_MPM_LOADS: return perf_stats_.loads & 0xffffffff;
case VX_CSR_MPM_LOADS_H: return perf_stats_.loads >> 32;
case VX_CSR_MPM_STORES: return perf_stats_.stores & 0xffffffff;
case VX_CSR_MPM_STORES_H: return perf_stats_.stores >> 32;
case VX_CSR_MPM_IFETCH_LAT: return perf_stats_.ifetch_latency & 0xffffffff;
case VX_CSR_MPM_IFETCH_LAT_H: return perf_stats_.ifetch_latency >> 32;
case VX_CSR_MPM_LOAD_LAT: return perf_stats_.load_latency & 0xffffffff;
case VX_CSR_MPM_LOAD_LAT_H: return perf_stats_.load_latency >> 32;
case VX_CSR_MPM_IFETCH_LT: return perf_stats_.ifetch_latency & 0xffffffff;
case VX_CSR_MPM_IFETCH_LT_H: return perf_stats_.ifetch_latency >> 32;
case VX_CSR_MPM_LOAD_LT: return perf_stats_.load_latency & 0xffffffff;
case VX_CSR_MPM_LOAD_LT_H: return perf_stats_.load_latency >> 32;
}
} break;
case VX_DCR_MPM_CLASS_MEM: {
auto proc_perf = cluster_->processor()->perf_stats();
auto proc_perf = socket_->cluster()->processor()->perf_stats();
auto cluster_perf = socket_->cluster()->perf_stats();
auto socket_perf = socket_->perf_stats();
auto smem_perf = shared_mem_->perf_stats();
switch (addr) {
case VX_CSR_MPM_ICACHE_READS: return proc_perf.clusters.icache.reads & 0xffffffff;
case VX_CSR_MPM_ICACHE_READS_H: return proc_perf.clusters.icache.reads >> 32;
case VX_CSR_MPM_ICACHE_MISS_R: return proc_perf.clusters.icache.read_misses & 0xffffffff;
case VX_CSR_MPM_ICACHE_MISS_R_H: return proc_perf.clusters.icache.read_misses >> 32;
case VX_CSR_MPM_ICACHE_READS: return socket_perf.icache.reads & 0xffffffff;
case VX_CSR_MPM_ICACHE_READS_H: return socket_perf.icache.reads >> 32;
case VX_CSR_MPM_ICACHE_MISS_R: return socket_perf.icache.read_misses & 0xffffffff;
case VX_CSR_MPM_ICACHE_MISS_R_H: return socket_perf.icache.read_misses >> 32;
case VX_CSR_MPM_ICACHE_MSHR_ST: return socket_perf.icache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_ICACHE_MSHR_ST_H: return socket_perf.icache.mshr_stalls >> 32;
case VX_CSR_MPM_DCACHE_READS: return proc_perf.clusters.dcache.reads & 0xffffffff;
case VX_CSR_MPM_DCACHE_READS_H: return proc_perf.clusters.dcache.reads >> 32;
case VX_CSR_MPM_DCACHE_WRITES: return proc_perf.clusters.dcache.writes & 0xffffffff;
case VX_CSR_MPM_DCACHE_WRITES_H: return proc_perf.clusters.dcache.writes >> 32;
case VX_CSR_MPM_DCACHE_MISS_R: return proc_perf.clusters.dcache.read_misses & 0xffffffff;
case VX_CSR_MPM_DCACHE_MISS_R_H: return proc_perf.clusters.dcache.read_misses >> 32;
case VX_CSR_MPM_DCACHE_MISS_W: return proc_perf.clusters.dcache.write_misses & 0xffffffff;
case VX_CSR_MPM_DCACHE_MISS_W_H: return proc_perf.clusters.dcache.write_misses >> 32;
case VX_CSR_MPM_DCACHE_BANK_ST: return proc_perf.clusters.dcache.bank_stalls & 0xffffffff;
case VX_CSR_MPM_DCACHE_BANK_ST_H:return proc_perf.clusters.dcache.bank_stalls >> 32;
case VX_CSR_MPM_DCACHE_MSHR_ST: return proc_perf.clusters.dcache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_DCACHE_MSHR_ST_H:return proc_perf.clusters.dcache.mshr_stalls >> 32;
case VX_CSR_MPM_SMEM_READS: return proc_perf.clusters.sharedmem.reads & 0xffffffff;
case VX_CSR_MPM_SMEM_READS_H: return proc_perf.clusters.sharedmem.reads >> 32;
case VX_CSR_MPM_SMEM_WRITES: return proc_perf.clusters.sharedmem.writes & 0xffffffff;
case VX_CSR_MPM_SMEM_WRITES_H: return proc_perf.clusters.sharedmem.writes >> 32;
case VX_CSR_MPM_SMEM_BANK_ST: return proc_perf.clusters.sharedmem.bank_stalls & 0xffffffff;
case VX_CSR_MPM_SMEM_BANK_ST_H:return proc_perf.clusters.sharedmem.bank_stalls >> 32;
case VX_CSR_MPM_DCACHE_READS: return socket_perf.dcache.reads & 0xffffffff;
case VX_CSR_MPM_DCACHE_READS_H: return socket_perf.dcache.reads >> 32;
case VX_CSR_MPM_DCACHE_WRITES: return socket_perf.dcache.writes & 0xffffffff;
case VX_CSR_MPM_DCACHE_WRITES_H: return socket_perf.dcache.writes >> 32;
case VX_CSR_MPM_DCACHE_MISS_R: return socket_perf.dcache.read_misses & 0xffffffff;
case VX_CSR_MPM_DCACHE_MISS_R_H: return socket_perf.dcache.read_misses >> 32;
case VX_CSR_MPM_DCACHE_MISS_W: return socket_perf.dcache.write_misses & 0xffffffff;
case VX_CSR_MPM_DCACHE_MISS_W_H: return socket_perf.dcache.write_misses >> 32;
case VX_CSR_MPM_DCACHE_BANK_ST: return socket_perf.dcache.bank_stalls & 0xffffffff;
case VX_CSR_MPM_DCACHE_BANK_ST_H: return socket_perf.dcache.bank_stalls >> 32;
case VX_CSR_MPM_DCACHE_MSHR_ST: return socket_perf.dcache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_DCACHE_MSHR_ST_H: return socket_perf.dcache.mshr_stalls >> 32;
case VX_CSR_MPM_L2CACHE_READS: return proc_perf.clusters.l2cache.reads & 0xffffffff;
case VX_CSR_MPM_L2CACHE_READS_H: return proc_perf.clusters.l2cache.reads >> 32;
case VX_CSR_MPM_L2CACHE_WRITES: return proc_perf.clusters.l2cache.writes & 0xffffffff;
case VX_CSR_MPM_L2CACHE_WRITES_H: return proc_perf.clusters.l2cache.writes >> 32;
case VX_CSR_MPM_L2CACHE_MISS_R: return proc_perf.clusters.l2cache.read_misses & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MISS_R_H: return proc_perf.clusters.l2cache.read_misses >> 32;
case VX_CSR_MPM_L2CACHE_MISS_W: return proc_perf.clusters.l2cache.write_misses & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MISS_W_H: return proc_perf.clusters.l2cache.write_misses >> 32;
case VX_CSR_MPM_L2CACHE_BANK_ST: return proc_perf.clusters.l2cache.bank_stalls & 0xffffffff;
case VX_CSR_MPM_L2CACHE_BANK_ST_H:return proc_perf.clusters.l2cache.bank_stalls >> 32;
case VX_CSR_MPM_L2CACHE_MSHR_ST: return proc_perf.clusters.l2cache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MSHR_ST_H:return proc_perf.clusters.l2cache.mshr_stalls >> 32;
case VX_CSR_MPM_L2CACHE_READS: return cluster_perf.l2cache.reads & 0xffffffff;
case VX_CSR_MPM_L2CACHE_READS_H: return cluster_perf.l2cache.reads >> 32;
case VX_CSR_MPM_L2CACHE_WRITES: return cluster_perf.l2cache.writes & 0xffffffff;
case VX_CSR_MPM_L2CACHE_WRITES_H: return cluster_perf.l2cache.writes >> 32;
case VX_CSR_MPM_L2CACHE_MISS_R: return cluster_perf.l2cache.read_misses & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MISS_R_H: return cluster_perf.l2cache.read_misses >> 32;
case VX_CSR_MPM_L2CACHE_MISS_W: return cluster_perf.l2cache.write_misses & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MISS_W_H: return cluster_perf.l2cache.write_misses >> 32;
case VX_CSR_MPM_L2CACHE_BANK_ST: return cluster_perf.l2cache.bank_stalls & 0xffffffff;
case VX_CSR_MPM_L2CACHE_BANK_ST_H:return cluster_perf.l2cache.bank_stalls >> 32;
case VX_CSR_MPM_L2CACHE_MSHR_ST: return cluster_perf.l2cache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MSHR_ST_H:return cluster_perf.l2cache.mshr_stalls >> 32;
case VX_CSR_MPM_L3CACHE_READS: return proc_perf.l3cache.reads & 0xffffffff;
case VX_CSR_MPM_L3CACHE_READS_H: return proc_perf.l3cache.reads >> 32;
@@ -612,14 +660,25 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case VX_CSR_MPM_L3CACHE_MSHR_ST: return proc_perf.l3cache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_L3CACHE_MSHR_ST_H:return proc_perf.l3cache.mshr_stalls >> 32;
case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff;
case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff;
case VX_CSR_MPM_MEM_WRITES_H:return proc_perf.mem_writes >> 32;
case VX_CSR_MPM_MEM_LAT: return proc_perf.mem_latency & 0xffffffff;
case VX_CSR_MPM_MEM_LAT_H: return proc_perf.mem_latency >> 32;
case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff;
case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff;
case VX_CSR_MPM_MEM_WRITES_H: return proc_perf.mem_writes >> 32;
case VX_CSR_MPM_MEM_LT: return proc_perf.mem_latency & 0xffffffff;
case VX_CSR_MPM_MEM_LT_H : return proc_perf.mem_latency >> 32;
case VX_CSR_MPM_SMEM_READS: return smem_perf.reads & 0xffffffff;
case VX_CSR_MPM_SMEM_READS_H: return smem_perf.reads >> 32;
case VX_CSR_MPM_SMEM_WRITES: return smem_perf.writes & 0xffffffff;
case VX_CSR_MPM_SMEM_WRITES_H: return smem_perf.writes >> 32;
case VX_CSR_MPM_SMEM_BANK_ST: return smem_perf.bank_stalls & 0xffffffff;
case VX_CSR_MPM_SMEM_BANK_ST_H: return smem_perf.bank_stalls >> 32;
}
} break;
default: {
std::cout << std::dec << "Error: invalid MPM CLASS: value=" << perf_class << std::endl;
std::abort();
} break;
}
} else {
std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl;

View File

@@ -22,11 +22,11 @@
#include <memory>
#include <set>
#include <simobject.h>
#include <mem.h>
#include "debug.h"
#include "types.h"
#include "arch.h"
#include "decode.h"
#include "mem.h"
#include "warp.h"
#include "pipeline.h"
#include "cache_sim.h"
@@ -40,19 +40,25 @@
namespace vortex {
class Cluster;
class Socket;
using TraceSwitch = Mux<pipeline_trace_t*>;
class Core : public SimObject<Core> {
public:
struct PerfStats {
uint64_t cycles;
uint64_t instrs;
uint64_t sched_idle;
uint64_t sched_stalls;
uint64_t ibuf_stalls;
uint64_t scrb_stalls;
uint64_t alu_stalls;
uint64_t lsu_stalls;
uint64_t fpu_stalls;
uint64_t sfu_stalls;
uint64_t scrb_alu;
uint64_t scrb_fpu;
uint64_t scrb_lsu;
uint64_t scrb_sfu;
uint64_t scrb_wctl;
uint64_t scrb_csrs;
uint64_t ifetches;
uint64_t loads;
uint64_t stores;
@@ -62,12 +68,16 @@ public:
PerfStats()
: cycles(0)
, instrs(0)
, sched_idle(0)
, sched_stalls(0)
, ibuf_stalls(0)
, scrb_stalls(0)
, alu_stalls(0)
, lsu_stalls(0)
, fpu_stalls(0)
, sfu_stalls(0)
, scrb_alu(0)
, scrb_fpu(0)
, scrb_lsu(0)
, scrb_sfu(0)
, scrb_wctl(0)
, scrb_csrs(0)
, ifetches(0)
, loads(0)
, stores(0)
@@ -84,10 +94,9 @@ public:
Core(const SimContext& ctx,
uint32_t core_id,
Cluster* cluster,
Socket* socket,
const Arch &arch,
const DCRS &dcrs,
SharedMem::Ptr sharedmem);
const DCRS &dcrs);
~Core();
@@ -105,6 +114,10 @@ public:
return core_id_;
}
// Accessor: returns the Socket pointer held in the socket_ member.
// The object is const-qualified here, but the returned pointer itself is
// non-const, so callers may invoke non-const Socket methods through it.
Socket* socket() const {
return socket_;
}
// Accessor: returns the arch_ member by const reference (no copy is made).
const Arch& arch() const {
return arch_;
}
@@ -153,6 +166,7 @@ private:
void cout_flush();
uint32_t core_id_;
Socket* socket_;
const Arch& arch_;
const DCRS &dcrs_;
@@ -167,13 +181,13 @@ private:
std::vector<Operand::Ptr> operands_;
std::vector<Dispatcher::Ptr> dispatchers_;
std::vector<ExeUnit::Ptr> exe_units_;
SharedMem::Ptr sharedmem_;
SharedMem::Ptr shared_mem_;
std::vector<SMemDemux::Ptr> smem_demuxs_;
PipelineLatch fetch_latch_;
PipelineLatch decode_latch_;
HashTable<pipeline_trace_t*> pending_icache_;
std::vector<pipeline_trace_t*> committed_traces_;
WarpMask active_warps_;
WarpMask stalled_warps_;
uint64_t issued_instrs_;
@@ -188,9 +202,10 @@ private:
PerfStats perf_stats_;
Cluster* cluster_;
std::vector<TraceSwitch::Ptr> commit_arbs_;
uint32_t commit_exe_;
uint32_t ibuffer_idx_;
friend class Warp;
friend class LsuUnit;

View File

@@ -66,6 +66,7 @@ public:
}
auto& output = Outputs.at(i);
auto trace = input.front();
auto new_trace = trace;
if (pid_count_ != 1) {
auto start_p = start_p_.at(b);
if (start_p == -1) {
@@ -81,33 +82,30 @@ public:
end = j;
}
start /= num_lanes_;
end /= num_lanes_;
auto new_trace = new pipeline_trace_t(*trace);
new_trace->tmask.reset();
for (int j = start * num_lanes_, n = j + num_lanes_; j < n; ++j) {
new_trace->tmask[j] = trace->tmask[j];
}
new_trace->pid = start;
new_trace->sop = (start_p == 0);
if (start == end) {
new_trace->eop = 1;
end /= num_lanes_;
if (start != end) {
new_trace = new pipeline_trace_t(*trace);
new_trace->eop = false;
start_p_.at(b) = start + 1;
} else {
start_p_.at(b) = -1;
input.pop();
++block_sent;
delete trace;
} else {
new_trace->eop = 0;
start_p_.at(b) = start + 1;
}
output.send(new_trace, 1);
DT(3, "pipeline-dispatch: " << *new_trace);
}
new_trace->pid = start;
new_trace->sop = (0 == start_p);
ThreadMask tmask;
for (int j = start * num_lanes_, n = j + num_lanes_; j < n; ++j) {
tmask[j] = trace->tmask[j];
}
new_trace->tmask = tmask;
} else {
trace->pid = 0;
new_trace->pid = 0;
input.pop();
output.send(trace, 1);
DT(3, "pipeline-dispatch: " << *trace);
++block_sent;
}
}
DT(3, "pipeline-dispatch: " << *new_trace);
output.send(new_trace, 1);
}
if (block_sent == block_size_) {
batch_idx_ = (batch_idx_ + 1) % batch_count_;
@@ -138,4 +136,4 @@ private:
std::vector<int> start_p_;
};
}
}

View File

@@ -51,8 +51,7 @@ void AluUnit::tick() {
assert(core_->stalled_warps_.test(trace->wid));
core_->stalled_warps_.reset(trace->wid);
}
auto time = input.pop();
core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time);
input.pop();
}
}
@@ -87,8 +86,7 @@ void FpuUnit::tick() {
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace);
auto time = input.pop();
core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
input.pop();
}
}
@@ -114,7 +112,7 @@ void LsuUnit::tick() {
// handle dcache response
for (uint32_t t = 0; t < num_lanes_; ++t) {
auto& dcache_rsp_port = core_->dcache_rsp_ports.at(t);
auto& dcache_rsp_port = core_->smem_demuxs_.at(t)->RspIn;
if (dcache_rsp_port.empty())
continue;
auto& mem_rsp = dcache_rsp_port.front();
@@ -136,7 +134,7 @@ void LsuUnit::tick() {
// handle shared memory response
for (uint32_t t = 0; t < num_lanes_; ++t) {
auto& smem_rsp_port = core_->sharedmem_->Outputs.at(t);
auto& smem_rsp_port = core_->shared_mem_->Outputs.at(t);
if (smem_rsp_port.empty())
continue;
auto& mem_rsp = smem_rsp_port.front();
@@ -184,8 +182,7 @@ void LsuUnit::tick() {
fence_lock_ = true;
DT(3, "fence-lock: " << *trace);
// remove input
auto time = input.pop();
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
input.pop();
break;
}
@@ -213,7 +210,9 @@ void LsuUnit::tick() {
auto mem_addr = trace_data->mem_addrs.at(t).addr & ~addr_mask;
matches += (addr0 == mem_addr);
}
#ifdef LSU_DUP_ENABLE
is_dup = (matches == trace->tmask.count());
#endif
}
uint32_t addr_count;
@@ -229,7 +228,7 @@ void LsuUnit::tick() {
if (!trace->tmask.test(t0 + t))
continue;
auto& dcache_req_port = core_->dcache_req_ports.at(t);
auto& dcache_req_port = core_->smem_demuxs_.at(t)->ReqIn;
auto mem_addr = trace_data->mem_addrs.at(t);
auto type = core_->get_addr_type(mem_addr.addr);
@@ -241,12 +240,16 @@ void LsuUnit::tick() {
mem_req.cid = trace->cid;
mem_req.uuid = trace->uuid;
dcache_req_port.send(mem_req, 2);
dcache_req_port.send(mem_req, 1);
DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
<< ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.type << ", " << *trace);
++pending_loads_;
++core_->perf_stats_.loads;
if (is_write) {
++core_->perf_stats_.stores;
} else {
++core_->perf_stats_.loads;
++pending_loads_;
}
if (is_dup)
break;
}
@@ -254,13 +257,11 @@ void LsuUnit::tick() {
// do not wait on writes
if (is_write) {
pending_rd_reqs_.release(tag);
output.send(trace, 1);
++core_->perf_stats_.stores;
output.send(trace, 1);
}
// remove input
auto time = input.pop();
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
input.pop();
break; // single block
}
@@ -318,10 +319,7 @@ void SfuUnit::tick() {
core_->stalled_warps_.reset(trace->wid);
}
auto time = input.pop();
auto stalls = (SimPlatform::instance().cycles() - time);
core_->perf_stats_.sfu_stalls += stalls;
input.pop();
break; // single block
}

View File

@@ -34,14 +34,13 @@ static void show_usage() {
uint32_t num_threads = NUM_THREADS;
uint32_t num_warps = NUM_WARPS;
uint32_t num_cores = NUM_CORES;
uint32_t num_clusters = NUM_CLUSTERS;
bool showStats = false;;
bool riscv_test = false;
const char* program = nullptr;
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "t:w:c:g:rsh?")) != -1) {
while ((c = getopt(argc, argv, "t:w:c:rsh?")) != -1) {
switch (c) {
case 't':
num_threads = atoi(optarg);
@@ -51,9 +50,6 @@ static void parse_args(int argc, char **argv) {
break;
case 'c':
num_cores = atoi(optarg);
break;
case 'g':
num_clusters = atoi(optarg);
break;
case 'r':
riscv_test = true;
@@ -88,7 +84,7 @@ int main(int argc, char **argv) {
{
// create processor configuration
Arch arch(num_threads, num_warps, num_cores, num_clusters);
Arch arch(num_threads, num_warps, num_cores);
// create memory module
RAM ram(RAM_PAGE_SIZE);

View File

@@ -32,18 +32,17 @@ ProcessorImpl::ProcessorImpl(const Arch& arch)
l3cache_ = CacheSim::Create("l3cache", CacheSim::Config{
!L3_ENABLED,
log2ceil(L3_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B
log2ceil(L3_NUM_WAYS), // W
0, // A
XLEN, // address bits
L3_NUM_BANKS, // number of banks
1, // number of ports
log2ceil(MEM_BLOCK_SIZE), // L
log2ceil(L3_NUM_WAYS), // W
0, // A
log2ceil(L3_NUM_BANKS), // B
XLEN, // address bits
1, // number of ports
uint8_t(arch.num_clusters()), // request size
true, // write-through
false, // write response
0, // victim size
L3_MSHR_SIZE, // mshr
2, // pipeline latency
true, // write-through
false, // write response
L3_MSHR_SIZE, // mshr
2, // pipeline latency
}
);
@@ -114,6 +113,7 @@ void ProcessorImpl::reset() {
perf_mem_writes_ = 0;
perf_mem_latency_ = 0;
perf_mem_pending_reads_ = 0;
}
void ProcessorImpl::write_dcr(uint32_t addr, uint32_t value) {
@@ -126,9 +126,6 @@ ProcessorImpl::PerfStats ProcessorImpl::perf_stats() const {
perf.mem_writes = perf_mem_writes_;
perf.mem_latency = perf_mem_latency_;
perf.l3cache = l3cache_->perf_stats();
for (auto cluster : clusters_) {
perf.clusters += cluster->perf_stats();
}
return perf;
}

View File

@@ -24,17 +24,10 @@ namespace vortex {
class ProcessorImpl {
public:
struct PerfStats {
CacheSim::PerfStats l3cache;
uint64_t mem_reads;
uint64_t mem_writes;
uint64_t mem_latency;
CacheSim::PerfStats l3cache;
Cluster::PerfStats clusters;
PerfStats()
: mem_reads(0)
, mem_writes(0)
, mem_latency(0)
{}
};
ProcessorImpl(const Arch& arch);
@@ -46,7 +39,7 @@ public:
void write_dcr(uint32_t addr, uint32_t value);
ProcessorImpl::PerfStats perf_stats() const;
PerfStats perf_stats() const;
private:
@@ -55,7 +48,7 @@ private:
const Arch& arch_;
std::vector<std::shared_ptr<Cluster>> clusters_;
DCRS dcrs_;
MemSim::Ptr memsim_;
MemSim::Ptr memsim_;
CacheSim::Ptr l3cache_;
uint64_t perf_mem_reads_;
uint64_t perf_mem_writes_;

View File

@@ -22,9 +22,11 @@ class Scoreboard {
public:
struct reg_use_t {
RegType type;
uint32_t reg;
uint64_t owner;
RegType reg_type;
uint32_t reg_id;
ExeType exe_type;
SfuType sfu_type;
uint64_t uuid;
};
Scoreboard(const Arch &arch)
@@ -44,89 +46,81 @@ public:
owners_.clear();
}
bool in_use(pipeline_trace_t* state) const {
return (state->used_iregs & in_use_iregs_.at(state->wid)) != 0
|| (state->used_fregs & in_use_fregs_.at(state->wid)) != 0
|| (state->used_vregs & in_use_vregs_.at(state->wid)) != 0;
// Reports whether `trace` touches a register that is currently marked in use.
//
// For the trace's warp (trace->wid), each of the trace's used-register masks
// (used_iregs, used_fregs, used_vregs) is AND-ed with the matching per-warp
// in-use mask; the result is true as soon as any of the three intersections
// is non-zero.
//
// The per-warp masks are looked up with .at(), so a wid outside the vectors'
// bounds raises std::out_of_range rather than reading out of range.
bool in_use(pipeline_trace_t* trace) const {
return (trace->used_iregs & in_use_iregs_.at(trace->wid)) != 0
|| (trace->used_fregs & in_use_fregs_.at(trace->wid)) != 0
|| (trace->used_vregs & in_use_vregs_.at(trace->wid)) != 0;
}
std::vector<reg_use_t> get_uses(pipeline_trace_t* state) const {
std::vector<reg_use_t> out;
{
uint32_t r = 0;
auto used_iregs = state->used_iregs & in_use_iregs_.at(state->wid);
while (used_iregs.any()) {
if (used_iregs.test(0)) {
uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Integer;
out.push_back({RegType::Integer, r, owners_.at(tag)});
}
used_iregs >>= 1;
++r;
std::vector<reg_use_t> get_uses(pipeline_trace_t* trace) const {
std::vector<reg_use_t> out;
auto used_iregs = trace->used_iregs & in_use_iregs_.at(trace->wid);
auto used_fregs = trace->used_fregs & in_use_fregs_.at(trace->wid);
auto used_vregs = trace->used_vregs & in_use_vregs_.at(trace->wid);
for (uint32_t r = 0; r < MAX_NUM_REGS; ++r) {
if (used_iregs.test(r)) {
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Integer;
auto owner = owners_.at(tag);
out.push_back({RegType::Integer, r, owner->exe_type, owner->sfu_type, owner->uuid});
}
}
{
uint32_t r = 0;
auto used_fregs = state->used_fregs & in_use_fregs_.at(state->wid);
while (used_fregs.any()) {
if (used_fregs.test(0)) {
uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Float;
out.push_back({RegType::Float, r, owners_.at(tag)});
}
used_fregs >>= 1;
++r;
for (uint32_t r = 0; r < MAX_NUM_REGS; ++r) {
if (used_fregs.test(r)) {
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Float;
auto owner = owners_.at(tag);
out.push_back({RegType::Float, r, owner->exe_type, owner->sfu_type, owner->uuid});
}
}
{
uint32_t r = 0;
auto used_vregs = state->used_vregs & in_use_vregs_.at(state->wid);
while (used_vregs.any()) {
if (used_vregs.test(0)) {
uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Vector;
out.push_back({RegType::Vector, r, owners_.at(tag)});
}
used_vregs >>= 1;
++r;
for (uint32_t r = 0; r < MAX_NUM_REGS; ++r) {
if (used_vregs.test(r)) {
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Vector;
auto owner = owners_.at(tag);
out.push_back({RegType::Vector, r, owner->exe_type, owner->sfu_type, owner->uuid});
}
}
return out;
}
void reserve(pipeline_trace_t* state) {
assert(state->wb);
switch (state->rdest_type) {
void reserve(pipeline_trace_t* trace) {
assert(trace->wb);
switch (trace->rdest_type) {
case RegType::Integer:
in_use_iregs_.at(state->wid).set(state->rdest);
in_use_iregs_.at(trace->wid).set(trace->rdest);
break;
case RegType::Float:
in_use_fregs_.at(state->wid).set(state->rdest);
in_use_fregs_.at(trace->wid).set(trace->rdest);
break;
case RegType::Vector:
in_use_vregs_.at(state->wid).set(state->rdest);
break;
default:
in_use_vregs_.at(trace->wid).set(trace->rdest);
break;
default: assert(false);
}
uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
uint32_t tag = (trace->rdest << 16) | (trace->wid << 4) | (int)trace->rdest_type;
assert(owners_.count(tag) == 0);
owners_[tag] = state->uuid;
owners_[tag] = trace;
assert((int)trace->exe_type < 5);
}
void release(pipeline_trace_t* state) {
assert(state->wb);
switch (state->rdest_type) {
void release(pipeline_trace_t* trace) {
assert(trace->wb);
switch (trace->rdest_type) {
case RegType::Integer:
in_use_iregs_.at(state->wid).reset(state->rdest);
in_use_iregs_.at(trace->wid).reset(trace->rdest);
break;
case RegType::Float:
in_use_fregs_.at(state->wid).reset(state->rdest);
in_use_fregs_.at(trace->wid).reset(trace->rdest);
break;
case RegType::Vector:
in_use_vregs_.at(state->wid).reset(state->rdest);
break;
default:
in_use_vregs_.at(trace->wid).reset(trace->rdest);
break;
default: assert(false);
}
uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
uint32_t tag = (trace->rdest << 16) | (trace->wid << 4) | (int)trace->rdest_type;
owners_.erase(tag);
}
@@ -135,7 +129,7 @@ private:
std::vector<RegMask> in_use_iregs_;
std::vector<RegMask> in_use_fregs_;
std::vector<RegMask> in_use_vregs_;
std::unordered_map<uint32_t, uint64_t> owners_;
std::unordered_map<uint32_t, pipeline_trace_t*> owners_;
};
}

149
sim/simx/socket.cpp Normal file
View File

@@ -0,0 +1,149 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "socket.h"
#include "cluster.h"
using namespace vortex;
// Builds one socket: arch.socket_size() cores that share one instruction-cache
// cluster and one data-cache cluster.
//
//   ctx       - simulation context forwarded to the SimObject base.
//   socket_id - index of this socket; used to name the cache clusters and to
//               derive the ids handed to the cores it creates.
//   cluster   - owning cluster, stored and later used to forward barriers.
//   arch      - supplies the socket size and the warp count (icache MSHR size).
//   dcrs      - passed through unchanged to every Core::Create call.
Socket::Socket(const SimContext& ctx,
               uint32_t socket_id,
               Cluster* cluster,
               const Arch &arch,
               const DCRS &dcrs)
  : SimObject(ctx, "socket")
  , icache_mem_req_port(this)
  , icache_mem_rsp_port(this)
  , dcache_mem_req_port(this)
  , dcache_mem_rsp_port(this)
  , socket_id_(socket_id)
  , cluster_(cluster)
  , cores_(arch.socket_size())
{
  auto cores_per_socket = cores_.size();

  char sname[100];

  // socket_id is unsigned: format it with %u (not %d), and bound the write by
  // the real buffer size instead of a repeated literal.
  snprintf(sname, sizeof(sname), "socket%u-icaches", (unsigned)socket_id);
  icaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_ICACHES, 1, CacheSim::Config{
    !ICACHE_ENABLED,            // NOTE(review): presumably the bypass flag - confirm against CacheSim::Config
    log2ceil(ICACHE_SIZE),      // C
    log2ceil(L1_LINE_SIZE),     // L
    log2ceil(sizeof(uint32_t)), // W
    log2ceil(ICACHE_NUM_WAYS),  // A
    1,                          // B
    XLEN,                       // address bits
    1,                          // number of ports
    1,                          // number of inputs
    true,                       // write-through
    false,                      // write response
    (uint8_t)arch.num_warps(),  // mshr
    2,                          // pipeline latency
  });

  // expose the icache cluster's memory side through the socket's own ports
  icaches_->MemReqPort.bind(&icache_mem_req_port);
  icache_mem_rsp_port.bind(&icaches_->MemRspPort);

  snprintf(sname, sizeof(sname), "socket%u-dcaches", (unsigned)socket_id);
  dcaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
    !DCACHE_ENABLED,            // NOTE(review): presumably the bypass flag - confirm against CacheSim::Config
    log2ceil(DCACHE_SIZE),      // C
    log2ceil(L1_LINE_SIZE),     // L
    log2ceil(sizeof(Word)),     // W
    log2ceil(DCACHE_NUM_WAYS),  // A
    log2ceil(DCACHE_NUM_BANKS), // B
    XLEN,                       // address bits
    1,                          // number of ports
    DCACHE_NUM_BANKS,           // number of inputs
    true,                       // write-through
    false,                      // write response
    DCACHE_MSHR_SIZE,           // mshr
    2,                          // pipeline latency
  });

  // expose the dcache cluster's memory side through the socket's own ports
  dcaches_->MemReqPort.bind(&dcache_mem_req_port);
  dcache_mem_rsp_port.bind(&dcaches_->MemRspPort);

  // create cores
  for (uint32_t i = 0; i < cores_per_socket; ++i) {
    // ids are assigned contiguously: socket s owns [s * size, (s + 1) * size)
    uint32_t core_id = socket_id * cores_per_socket + i;
    cores_.at(i) = Core::Create(core_id,
                                this,
                                arch,
                                dcrs);

    // a single icache request/response port pair per core
    cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
    icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));

    // one dcache request/response port pair per LSU lane
    for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) {
      cores_.at(i)->dcache_req_ports.at(j).bind(&dcaches_->CoreReqPorts.at(i).at(j));
      dcaches_->CoreRspPorts.at(i).at(j).bind(&cores_.at(i)->dcache_rsp_ports.at(j));
    }
  }
}
// Destructor: performs no explicit cleanup; members are destroyed by their
// own destructors.
Socket::~Socket() {
//--
}
// Reset hook: the socket itself does nothing on reset.
// NOTE(review): cores and caches are presumably reset as their own SimObjects - confirm.
void Socket::reset() {
//--
}
// Per-cycle hook: the socket itself performs no work each tick.
// NOTE(review): cores and caches are presumably ticked as their own SimObjects - confirm.
void Socket::tick() {
//--
}
// Forwards the given RAM pointer to every core owned by this socket.
void Socket::attach_ram(RAM* ram) {
  for (auto it = cores_.begin(); it != cores_.end(); ++it) {
    (*it)->attach_ram(ram);
  }
}
bool Socket::running() const {
for (auto& core : cores_) {
if (core->running())
return true;
}
return false;
}
// Queries every core's check_exit() and merges the results.
//   exitcode   - receives the bitwise OR of the codes from all cores whose
//                check_exit() returned true (0 if none did).
//   riscv_test - forwarded unchanged to each core.
// Returns true only if check_exit() returned true for every core.
bool Socket::check_exit(Word* exitcode, bool riscv_test) const {
  Word merged_code = 0;
  bool all_exited = true;
  for (auto it = cores_.begin(); it != cores_.end(); ++it) {
    Word core_code;
    if (!(*it)->check_exit(&core_code, riscv_test)) {
      // keep polling the remaining cores; only the final verdict changes
      all_exited = false;
      continue;
    }
    merged_code |= core_code;
  }
  *exitcode = merged_code;
  return all_exited;
}
// Forwards a barrier request to the owning cluster, translating the
// socket-local core_id into socket_id_ * (cores per socket) + core_id.
void Socket::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
  auto cluster_core_id = socket_id_ * cores_.size() + core_id;
  cluster_->barrier(bar_id, count, cluster_core_id);
}
// Calls resume() on the core stored at position core_index in this socket.
// The bounds-checked at() throws if core_index is out of range.
void Socket::resume(uint32_t core_index) {
  auto& target = cores_.at(core_index);
  target->resume();
}
// Collects the performance counters of the socket's two cache clusters
// (instruction caches first, then data caches) into one PerfStats value.
Socket::PerfStats Socket::perf_stats() const {
  PerfStats stats;
  stats.icache = icaches_->perf_stats();
  stats.dcache = dcaches_->perf_stats();
  return stats;
}

81
sim/simx/socket.h Normal file
View File

@@ -0,0 +1,81 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <simobject.h>
#include "dcrs.h"
#include "arch.h"
#include "cache_cluster.h"
#include "shared_mem.h"
#include "core.h"
#include "constants.h"
namespace vortex {
class Cluster;
// A socket groups a set of cores together with the instruction- and
// data-cache clusters they share, and exposes the caches' memory-side
// traffic through four public ports.
class Socket : public SimObject<Socket> {
public:
// Performance counters of the two cache clusters owned by the socket.
struct PerfStats {
CacheSim::PerfStats icache;
CacheSim::PerfStats dcache;
};
// Memory-side request/response ports of the instruction-cache cluster.
SimPort<MemReq> icache_mem_req_port;
SimPort<MemRsp> icache_mem_rsp_port;
// Memory-side request/response ports of the data-cache cluster.
SimPort<MemReq> dcache_mem_req_port;
SimPort<MemRsp> dcache_mem_rsp_port;
// Creates the caches and cores of socket `socket_id` inside `cluster`.
Socket(const SimContext& ctx,
uint32_t socket_id,
Cluster* cluster,
const Arch &arch,
const DCRS &dcrs);
~Socket();
// Index of this socket, as passed to the constructor.
uint32_t id() const {
return socket_id_;
}
// Owning cluster, as passed to the constructor.
Cluster* cluster() const {
return cluster_;
}
void reset();
void tick();
// Attaches `ram` to every core of the socket.
void attach_ram(RAM* ram);
// True if at least one core reports running().
bool running() const;
// True if every core's check_exit() returns true; *exitcode receives the
// OR of the codes from the cores for which it did.
bool check_exit(Word* exitcode, bool riscv_test) const;
// Forwards a barrier to the cluster; core_id is the socket-local core number.
void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
// Resumes one core; the definition uses the argument as an index into cores_.
void resume(uint32_t core_id);
PerfStats perf_stats() const;
private:
// NOTE: declaration order below determines member initialization order.
uint32_t socket_id_;
Cluster* cluster_;
std::vector<Core::Ptr> cores_;
CacheCluster::Ptr icaches_;
CacheCluster::Ptr dcaches_;
};
} // namespace vortex

View File

@@ -70,6 +70,7 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
case RegType::Integer: os << "x"; break;
case RegType::Float: os << "f"; break;
case RegType::Vector: os << "v"; break;
default: assert(false);
}
return os;
}
@@ -81,7 +82,7 @@ enum class ExeType {
LSU,
FPU,
SFU,
MAX,
ExeTypeCount
};
inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
@@ -90,7 +91,7 @@ inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
case ExeType::LSU: os << "LSU"; break;
case ExeType::FPU: os << "FPU"; break;
case ExeType::SFU: os << "SFU"; break;
case ExeType::MAX: break;
default: assert(false);
}
return os;
}
@@ -112,6 +113,7 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
case AluType::SYSCALL: os << "SYSCALL"; break;
case AluType::IMUL: os << "IMUL"; break;
case AluType::IDIV: os << "IDIV"; break;
default: assert(false);
}
return os;
}
@@ -129,6 +131,7 @@ inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
case LsuType::LOAD: os << "LOAD"; break;
case LsuType::STORE: os << "STORE"; break;
case LsuType::FENCE: os << "FENCE"; break;
default: assert(false);
}
return os;
}
@@ -138,7 +141,7 @@ inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
enum class AddrType {
Global,
Shared,
IO,
IO
};
inline std::ostream &operator<<(std::ostream &os, const AddrType& type) {
@@ -146,6 +149,7 @@ inline std::ostream &operator<<(std::ostream &os, const AddrType& type) {
case AddrType::Global: os << "Global"; break;
case AddrType::Shared: os << "Shared"; break;
case AddrType::IO: os << "IO"; break;
default: assert(false);
}
return os;
}
@@ -164,7 +168,7 @@ enum class FpuType {
FMA,
FDIV,
FSQRT,
FCVT,
FCVT
};
inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {
@@ -174,6 +178,7 @@ inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {
case FpuType::FDIV: os << "FDIV"; break;
case FpuType::FSQRT: os << "FSQRT"; break;
case FpuType::FCVT: os << "FCVT"; break;
default: assert(false);
}
return os;
}
@@ -190,7 +195,7 @@ enum class SfuType {
CSRRW,
CSRRS,
CSRRC,
CMOV
CMOV
};
inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
@@ -205,6 +210,7 @@ inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
case SfuType::CSRRS: os << "CSRRS"; break;
case SfuType::CSRRC: os << "CSRRC"; break;
case SfuType::CMOV: os << "CMOV"; break;
default: assert(false);
}
return os;
}
@@ -220,6 +226,7 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
switch (type) {
case ArbiterType::Priority: os << "Priority"; break;
case ArbiterType::RoundRobin: os << "RoundRobin"; break;
default: assert(false);
}
return os;
}
@@ -351,6 +358,92 @@ private:
///////////////////////////////////////////////////////////////////////////////
// N-to-M multiplexer. The inputs are split into consecutive groups, one group
// per output; on every tick each output forwards at most one pending item
// from its group, chosen by the configured arbitration policy.
//
// When num_inputs == num_outputs the mux is a pure bypass: every input is
// bound directly to the output with the same index and tick() does nothing.
template <typename Type>
class Mux : public SimObject<Mux<Type>> {
public:
  std::vector<SimPort<Type>> Inputs;
  std::vector<SimPort<Type>> Outputs;

  // ctx         - simulation context forwarded to the SimObject base.
  // name        - object name (also used in trace output).
  // type        - arbitration policy; with RoundRobin the scan start moves
  //               past the last granted input, otherwise it stays at 0.
  // num_inputs  - number of input ports (1..32, >= num_outputs).
  // num_outputs - number of output ports (1..32).
  // delay       - delay passed to send() for forwarded items; must be non-zero.
  Mux(
    const SimContext& ctx,
    const char* name,
    ArbiterType type,
    uint32_t num_inputs,
    uint32_t num_outputs = 1,
    uint32_t delay = 1
  ) : SimObject<Mux<Type>>(ctx, name)
    , Inputs(num_inputs, this)
    , Outputs(num_outputs, this)
    , type_(type)
    , delay_(delay)
    , cursors_(num_outputs, 0)
    // Ceiling division so that trailing inputs are still assigned to a group
    // when num_inputs is not a multiple of num_outputs; the zero guard avoids
    // dividing by zero before the assertions below can fire.
    , num_reqs_(num_outputs ? (num_inputs + num_outputs - 1) / num_outputs : 0)
  {
    assert(delay != 0);
    assert(num_outputs != 0);
    assert(num_inputs <= 32);
    assert(num_outputs <= 32);
    assert(num_inputs >= num_outputs);

    // bypass mode
    if (num_inputs == num_outputs) {
      for (uint32_t i = 0; i < num_inputs; ++i) {
        Inputs.at(i).bind(&Outputs.at(i));
      }
    }
  }

  // Restarts arbitration: every output scans its group from slot 0 again.
  void reset() {
    for (auto& cursor : cursors_) {
      cursor = 0;
    }
  }

  // Forwards at most one pending item per output for this cycle.
  void tick() {
    uint32_t I = Inputs.size();
    uint32_t O = Outputs.size();
    uint32_t R = num_reqs_;

    // skip bypass mode
    if (I == O)
      return;

    // process inputs
    for (uint32_t o = 0; o < O; ++o) {
      for (uint32_t r = 0; r < R; ++r) {
        // Wrap with modulo rather than a bit mask: the mask is only a valid
        // wrap-around when R is a power of two.
        uint32_t i = (cursors_.at(o) + r) % R;
        uint32_t j = o * R + i;
        // the last group may be only partially populated
        if (j >= I)
          continue;

        auto& req_in = Inputs.at(j);
        if (!req_in.empty()) {
          auto& req = req_in.front();
          DT(4, this->name() << "-" << req);
          Outputs.at(o).send(req, delay_);
          req_in.pop();
          this->update_cursor(o, i);
          break;
        }
      }
    }
  }

private:
  // Records the grant for output `index`. Under round-robin the next scan
  // starts just after the granted slot (tick() wraps the value); under the
  // other policy the cursor is left untouched.
  void update_cursor(uint32_t index, uint32_t grant) {
    if (type_ == ArbiterType::RoundRobin) {
      cursors_.at(index) = grant + 1;
    }
  }

  ArbiterType type_;
  uint32_t delay_;
  std::vector<uint32_t> cursors_;  // per-output scan start within its group
  uint32_t num_reqs_;              // inputs per output group
};
///////////////////////////////////////////////////////////////////////////////
template <typename Req, typename Rsp>
class Switch : public SimObject<Switch<Req, Rsp>> {
public:
@@ -364,13 +457,13 @@ public:
const SimContext& ctx,
const char* name,
ArbiterType type,
uint32_t num_inputs = 1,
uint32_t num_inputs,
uint32_t num_outputs = 1,
uint32_t delay = 1
)
: SimObject<Switch<Req, Rsp>>(ctx, name)
, ReqIn(num_inputs, this)
, RspIn(num_inputs, this)
, ReqIn(num_inputs, this)
, RspIn(num_inputs, this)
, ReqOut(num_outputs, this)
, RspOut(num_outputs, this)
, type_(type)
@@ -383,8 +476,8 @@ public:
assert(num_outputs <= 32);
assert(num_inputs >= num_outputs);
// bypass mode
if (num_inputs == num_outputs) {
// bypass mode
for (uint32_t i = 0; i < num_inputs; ++i) {
ReqIn.at(i).bind(&ReqOut.at(i));
RspOut.at(i).bind(&RspIn.at(i));
@@ -462,14 +555,14 @@ private:
class SMemDemux : public SimObject<SMemDemux> {
public:
SimPort<MemReq> ReqIn;
SimPort<MemRsp> RspIn;
SimPort<MemReq> ReqIn;
SimPort<MemRsp> RspIn;
SimPort<MemReq> ReqSm;
SimPort<MemRsp> RspSm;
SimPort<MemReq> ReqSM;
SimPort<MemRsp> RspSM;
SimPort<MemReq> ReqDc;
SimPort<MemRsp> RspDc;
SimPort<MemReq> ReqDC;
SimPort<MemRsp> RspDC;
SMemDemux(
const SimContext& ctx,
@@ -478,45 +571,49 @@ public:
) : SimObject<SMemDemux>(ctx, name)
, ReqIn(this)
, RspIn(this)
, ReqSm(this)
, RspSm(this)
, ReqDc(this)
, RspDc(this)
, ReqSM(this)
, RspSM(this)
, ReqDC(this)
, RspDC(this)
, delay_(delay)
{}
void reset() {}
void tick() {
void tick() {
// process incoming responses
if (!RspSM.empty()) {
auto& rsp = RspSM.front();
DT(4, this->name() << "-" << rsp);
RspIn.send(rsp, 1);
RspSM.pop();
}
if (!RspDC.empty()) {
auto& rsp = RspDC.front();
DT(4, this->name() << "-" << rsp);
RspIn.send(rsp, 1);
RspDC
.pop();
}
// process incoming requests
if (!ReqIn.empty()) {
auto& req = ReqIn.front();
DT(4, this->name() << "-" << req);
if (req.type == AddrType::Shared) {
ReqSm.send(req, delay_);
ReqSM.send(req, delay_);
} else {
ReqDc.send(req, delay_);
ReqDC.send(req, delay_);
}
ReqIn.pop();
}
// process incoming reponses
if (!RspSm.empty()) {
auto& rsp = RspSm.front();
DT(4, this->name() << "-" << rsp);
RspIn.send(rsp, 1);
RspSm.pop();
}
if (!RspDc.empty()) {
auto& rsp = RspDc.front();
DT(4, this->name() << "-" << rsp);
RspIn.send(rsp, 1);
RspDc.pop();
}
}
private:
uint32_t delay_;
};
}
///////////////////////////////////////////////////////////////////////////////
using MemSwitch = Switch<MemReq, MemRsp>;
}