Merge remote-tracking branch 'upstream/master' into vortex2
This commit is contained in:
@@ -1,20 +1,22 @@
|
||||
XLEN ?= 32
|
||||
DESTDIR ?= .
|
||||
RTL_DIR = ../../hw/rtl
|
||||
DPI_DIR = ../../hw/dpi
|
||||
DESTDIR ?= $(CURDIR)
|
||||
HW_DIR = $(abspath ../../hw)
|
||||
COMMON_DIR = $(abspath ../common)
|
||||
THIRD_PARTY_DIR = $(abspath ../../third_party)
|
||||
RTL_DIR = $(HW_DIR)/rtl
|
||||
DPI_DIR = $(HW_DIR)/dpi
|
||||
AFU_DIR = $(RTL_DIR)/afu/opae
|
||||
SCRIPT_DIR = ../../hw/scripts
|
||||
THIRD_PARTY_DIR = ../../third_party
|
||||
SCRIPT_DIR = $(HW_DIR)/scripts
|
||||
|
||||
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds
|
||||
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
|
||||
CXXFLAGS += -I.. -I../../../hw -I../../common -I$(abspath $(DESTDIR))
|
||||
CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include
|
||||
CXXFLAGS += -I../$(THIRD_PARTY_DIR)
|
||||
CXXFLAGS += -I$(CURDIR) -I$(HW_DIR) -I$(COMMON_DIR) -I$(DESTDIR)
|
||||
CXXFLAGS += -I/$(THIRD_PARTY_DIR)/softfloat/source/include
|
||||
CXXFLAGS += -I/$(THIRD_PARTY_DIR)
|
||||
CXXFLAGS += -DXLEN_$(XLEN)
|
||||
|
||||
LDFLAGS += -shared ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
|
||||
LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator -pthread
|
||||
LDFLAGS += -shared $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
|
||||
LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator -pthread
|
||||
|
||||
# control RTL debug tracing states
|
||||
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_PIPELINE
|
||||
@@ -53,9 +55,9 @@ endif
|
||||
|
||||
DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)
|
||||
|
||||
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
|
||||
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp
|
||||
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
|
||||
SRCS += fpga.cpp opae_sim.cpp
|
||||
SRCS += $(CURDIR)/fpga.cpp $(CURDIR)/opae_sim.cpp
|
||||
|
||||
RTL_PKGS = $(AFU_DIR)/local_mem_cfg_pkg.sv $(AFU_DIR)/ccip/ccip_if_pkg.sv
|
||||
RTL_PKGS += $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv
|
||||
@@ -73,7 +75,7 @@ TOP = vortex_afu_shim
|
||||
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
|
||||
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
|
||||
VL_FLAGS += --x-initial unique --x-assign unique
|
||||
VL_FLAGS += -DSIMULATION
|
||||
VL_FLAGS += -DSIMULATION -DSV_DPI
|
||||
VL_FLAGS += -DXLEN_$(XLEN)
|
||||
VL_FLAGS += $(CONFIGS)
|
||||
VL_FLAGS += verilator.vlt
|
||||
@@ -119,16 +121,16 @@ PROJECT = libopae-c-sim.so
|
||||
all: $(DESTDIR)/$(PROJECT)
|
||||
|
||||
$(DESTDIR)/vortex.xml:
|
||||
verilator --xml-only -O0 $(VL_FLAGS) $(TOP) --xml-output $(DESTDIR)/vortex.xml
|
||||
verilator --xml-only -O0 $(VL_FLAGS) $(TOP) --xml-output $@
|
||||
|
||||
$(DESTDIR)/scope.json: $(DESTDIR)/vortex.xml
|
||||
$(SCRIPT_DIR)/scope.py $(DESTDIR)/vortex.xml -o $(DESTDIR)/scope.json
|
||||
$(SCRIPT_DIR)/scope.py $^ -o $@
|
||||
|
||||
$(DESTDIR)/vortex_afu.h : $(AFU_DIR)/vortex_afu.vh
|
||||
$(SCRIPT_DIR)/gen_config.py -i $(AFU_DIR)/vortex_afu.vh -o $(DESTDIR)/vortex_afu.h
|
||||
$(SCRIPT_DIR)/gen_config.py -i $^ -o $@
|
||||
|
||||
$(DESTDIR)/$(PROJECT): $(SRCS) $(DESTDIR)/vortex_afu.h $(SCOPE_JSON)
|
||||
verilator --build --exe -O3 $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(DESTDIR)/$(PROJECT)
|
||||
verilator --build --exe -O3 $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' --Mdir $(DESTDIR)/obj_dir -o $@
|
||||
|
||||
clean:
|
||||
rm -rf obj_dir $(DESTDIR)/vortex.xml $(DESTDIR)/scope.json $(DESTDIR)/vortex_afu.h $(DESTDIR)/$(PROJECT)
|
||||
rm -rf $(DESTDIR)/obj_dir $(DESTDIR)/vortex.xml $(DESTDIR)/scope.json $(DESTDIR)/vortex_afu.h $(DESTDIR)/$(PROJECT)
|
||||
|
||||
1
sim/rtlsim/.gitignore
vendored
1
sim/rtlsim/.gitignore
vendored
@@ -1,2 +1 @@
|
||||
VX_config.h
|
||||
/obj_dir/*
|
||||
@@ -1,18 +1,20 @@
|
||||
XLEN ?= 32
|
||||
DESTDIR ?= .
|
||||
RTL_DIR = ../../hw/rtl
|
||||
DPI_DIR = ../../hw/dpi
|
||||
THIRD_PARTY_DIR = ../../third_party
|
||||
DESTDIR ?= $(CURDIR)
|
||||
HW_DIR = $(abspath ../../hw)
|
||||
COMMON_DIR = $(abspath ../common)
|
||||
THIRD_PARTY_DIR = $(abspath ../../third_party)
|
||||
RTL_DIR = $(HW_DIR)/rtl
|
||||
DPI_DIR = $(HW_DIR)/dpi
|
||||
|
||||
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds
|
||||
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
|
||||
CXXFLAGS += -I../../../hw -I../../common
|
||||
CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include
|
||||
CXXFLAGS += -I../$(THIRD_PARTY_DIR)
|
||||
CXXFLAGS += -I$(HW_DIR) -I$(COMMON_DIR)
|
||||
CXXFLAGS += -I$(THIRD_PARTY_DIR)/softfloat/source/include
|
||||
CXXFLAGS += -I$(THIRD_PARTY_DIR)
|
||||
CXXFLAGS += -DXLEN_$(XLEN)
|
||||
|
||||
LDFLAGS += ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
|
||||
LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator
|
||||
LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
|
||||
LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator
|
||||
|
||||
# control RTL debug tracing states
|
||||
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_PIPELINE
|
||||
@@ -38,9 +40,9 @@ ifneq (,$(findstring FPU_FPNEW,$(CONFIGS)))
|
||||
endif
|
||||
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache $(FPU_INCLUDE)
|
||||
|
||||
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
|
||||
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp
|
||||
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
|
||||
SRCS += processor.cpp
|
||||
SRCS += $(CURDIR)/processor.cpp
|
||||
|
||||
ifdef AXI_BUS
|
||||
TOP = Vortex_axi
|
||||
@@ -54,7 +56,7 @@ VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
|
||||
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
|
||||
VL_FLAGS += --x-initial unique --x-assign unique
|
||||
VL_FLAGS += verilator.vlt
|
||||
VL_FLAGS += -DSIMULATION
|
||||
VL_FLAGS += -DSIMULATION -DSV_DPI
|
||||
VL_FLAGS += -DXLEN_$(XLEN)
|
||||
VL_FLAGS += $(CONFIGS)
|
||||
VL_FLAGS += $(RTL_INCLUDE)
|
||||
@@ -87,11 +89,11 @@ PROJECT = rtlsim
|
||||
|
||||
all: $(DESTDIR)/$(PROJECT)
|
||||
|
||||
$(DESTDIR)/$(PROJECT): $(SRCS) main.cpp
|
||||
verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS) -DSTARTUP_ADDR=0x80000000' -LDFLAGS '$(LDFLAGS)' -o ../$@
|
||||
$(DESTDIR)/$(PROJECT): $(SRCS) $(CURDIR)/main.cpp
|
||||
verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS) -DSTARTUP_ADDR=0x80000000' -LDFLAGS '$(LDFLAGS)' --Mdir $(DESTDIR)/obj_dir -o $@
|
||||
|
||||
$(DESTDIR)/lib$(PROJECT).so: $(SRCS)
|
||||
verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS)' -LDFLAGS '-shared $(LDFLAGS)' -o ../$@
|
||||
verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS)' -LDFLAGS '-shared $(LDFLAGS)' --Mdir $(DESTDIR)/obj_dir -o $@
|
||||
|
||||
clean:
|
||||
rm -rf obj_dir $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so
|
||||
rm -rf $(DESTDIR)/obj_dir $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
XLEN ?= 32
|
||||
DESTDIR ?= .
|
||||
RTL_DIR = ../hw/rtl
|
||||
THIRD_PARTY_DIR = ../../third_party
|
||||
DESTDIR ?= $(CURDIR)
|
||||
HW_DIR = $(abspath ../../hw)
|
||||
COMMON_DIR = $(abspath ../common)
|
||||
THIRD_PARTY_DIR = $(abspath ../../third_party)
|
||||
|
||||
CXXFLAGS += -std=c++17 -Wall -Wextra -Wfatal-errors
|
||||
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
|
||||
CXXFLAGS += -I. -I../common -I../../hw
|
||||
CXXFLAGS += -I$(CURDIR) -I$(COMMON_DIR) -I$(HW_DIR)
|
||||
CXXFLAGS += -I$(THIRD_PARTY_DIR)/softfloat/source/include
|
||||
CXXFLAGS += -I$(THIRD_PARTY_DIR)
|
||||
CXXFLAGS += -DXLEN_$(XLEN)
|
||||
@@ -14,8 +15,8 @@ CXXFLAGS += $(CONFIGS)
|
||||
LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
|
||||
LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator
|
||||
|
||||
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
|
||||
SRCS += processor.cpp cluster.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp
|
||||
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp
|
||||
SRCS += processor.cpp cluster.cpp socket.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp
|
||||
|
||||
# Debugging
|
||||
ifdef DEBUG
|
||||
|
||||
@@ -28,6 +28,7 @@ private:
|
||||
uint16_t num_warps_;
|
||||
uint16_t num_cores_;
|
||||
uint16_t num_clusters_;
|
||||
uint16_t socket_size_;
|
||||
uint16_t vsize_;
|
||||
uint16_t num_regs_;
|
||||
uint16_t num_csrs_;
|
||||
@@ -35,11 +36,12 @@ private:
|
||||
uint16_t ipdom_size_;
|
||||
|
||||
public:
|
||||
Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores, uint16_t num_clusters)
|
||||
Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores)
|
||||
: num_threads_(num_threads)
|
||||
, num_warps_(num_warps)
|
||||
, num_cores_(num_cores)
|
||||
, num_clusters_(num_clusters)
|
||||
, num_clusters_(NUM_CLUSTERS)
|
||||
, socket_size_(SOCKET_SIZE)
|
||||
, vsize_(16)
|
||||
, num_regs_(32)
|
||||
, num_csrs_(4096)
|
||||
@@ -82,6 +84,10 @@ public:
|
||||
uint16_t num_clusters() const {
|
||||
return num_clusters_;
|
||||
}
|
||||
|
||||
uint16_t socket_size() const {
|
||||
return socket_size_;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
@@ -45,20 +45,20 @@ public:
|
||||
|
||||
char sname[100];
|
||||
|
||||
std::vector<Switch<MemReq, MemRsp>::Ptr> unit_arbs(num_units);
|
||||
std::vector<MemSwitch::Ptr> unit_arbs(num_units);
|
||||
for (uint32_t u = 0; u < num_units; ++u) {
|
||||
snprintf(sname, 100, "%s-unit-arb-%d", name, u);
|
||||
unit_arbs.at(u) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs);
|
||||
unit_arbs.at(u) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs);
|
||||
for (uint32_t i = 0; i < num_requests; ++i) {
|
||||
this->CoreReqPorts.at(u).at(i).bind(&unit_arbs.at(u)->ReqIn.at(i));
|
||||
unit_arbs.at(u)->RspIn.at(i).bind(&this->CoreRspPorts.at(u).at(i));
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<Switch<MemReq, MemRsp>::Ptr> mem_arbs(config.num_inputs);
|
||||
std::vector<MemSwitch::Ptr> mem_arbs(config.num_inputs);
|
||||
for (uint32_t i = 0; i < config.num_inputs; ++i) {
|
||||
snprintf(sname, 100, "%s-mem-arb-%d", name, i);
|
||||
mem_arbs.at(i) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_units, num_caches);
|
||||
mem_arbs.at(i) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_units, num_caches);
|
||||
for (uint32_t u = 0; u < num_units; ++u) {
|
||||
unit_arbs.at(u)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(u));
|
||||
mem_arbs.at(i)->RspIn.at(u).bind(&unit_arbs.at(u)->RspOut.at(i));
|
||||
@@ -66,7 +66,7 @@ public:
|
||||
}
|
||||
|
||||
snprintf(sname, 100, "%s-cache-arb", name);
|
||||
auto cache_arb = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_caches, 1);
|
||||
auto cache_arb = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_caches, 1);
|
||||
|
||||
for (uint32_t i = 0; i < num_caches; ++i) {
|
||||
snprintf(sname, 100, "%s-cache%d", name, i);
|
||||
|
||||
@@ -41,19 +41,16 @@ struct params_t {
|
||||
uint32_t tag_select_addr_end;
|
||||
|
||||
params_t(const CacheSim::Config& config) {
|
||||
int32_t bank_bits = log2ceil(config.num_banks);
|
||||
int32_t offset_bits = config.B - config.W;
|
||||
int32_t log2_bank_size = config.C - bank_bits;
|
||||
int32_t index_bits = log2_bank_size - (config.B + config.A);
|
||||
assert(log2_bank_size > 0);
|
||||
int32_t offset_bits = config.L - config.W;
|
||||
int32_t index_bits = config.C - (config.L + config.A + config.B);
|
||||
assert(offset_bits >= 0);
|
||||
assert(index_bits >= 0);
|
||||
|
||||
this->log2_num_inputs = log2ceil(config.num_inputs);
|
||||
|
||||
this->words_per_line = 1 << offset_bits;
|
||||
this->sets_per_bank = 1 << index_bits;
|
||||
this->lines_per_set = 1 << config.A;
|
||||
this->sets_per_bank = 1 << index_bits;
|
||||
this->words_per_line = 1 << offset_bits;
|
||||
|
||||
assert(config.ports_per_bank <= this->words_per_line);
|
||||
|
||||
@@ -63,7 +60,7 @@ struct params_t {
|
||||
|
||||
// Bank select
|
||||
this->bank_select_addr_start = (1+this->word_select_addr_end);
|
||||
this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1);
|
||||
this->bank_select_addr_end = (this->bank_select_addr_start+config.B-1);
|
||||
|
||||
// Set select
|
||||
this->set_select_addr_start = (1+this->bank_select_addr_end);
|
||||
@@ -74,23 +71,23 @@ struct params_t {
|
||||
this->tag_select_addr_end = (config.addr_width-1);
|
||||
}
|
||||
|
||||
uint32_t addr_bank_id(uint64_t word_addr) const {
|
||||
uint32_t addr_bank_id(uint64_t addr) const {
|
||||
if (bank_select_addr_end >= bank_select_addr_start)
|
||||
return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end);
|
||||
return (uint32_t)bit_getw(addr, bank_select_addr_start, bank_select_addr_end);
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint32_t addr_set_id(uint64_t word_addr) const {
|
||||
uint32_t addr_set_id(uint64_t addr) const {
|
||||
if (set_select_addr_end >= set_select_addr_start)
|
||||
return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end);
|
||||
return (uint32_t)bit_getw(addr, set_select_addr_start, set_select_addr_end);
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint64_t addr_tag(uint64_t word_addr) const {
|
||||
uint64_t addr_tag(uint64_t addr) const {
|
||||
if (tag_select_addr_end >= tag_select_addr_start)
|
||||
return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end);
|
||||
return bit_getw(addr, tag_select_addr_start, tag_select_addr_end);
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
@@ -288,8 +285,8 @@ private:
|
||||
Config config_;
|
||||
params_t params_;
|
||||
std::vector<bank_t> banks_;
|
||||
Switch<MemReq, MemRsp>::Ptr bank_switch_;
|
||||
Switch<MemReq, MemRsp>::Ptr bypass_switch_;
|
||||
MemSwitch::Ptr bank_switch_;
|
||||
MemSwitch::Ptr bypass_switch_;
|
||||
std::vector<SimPort<MemReq>> mem_req_ports_;
|
||||
std::vector<SimPort<MemRsp>> mem_rsp_ports_;
|
||||
std::vector<bank_req_t> pipeline_reqs_;
|
||||
@@ -304,16 +301,16 @@ public:
|
||||
: simobject_(simobject)
|
||||
, config_(config)
|
||||
, params_(config)
|
||||
, banks_(config.num_banks, {config, params_})
|
||||
, mem_req_ports_(config.num_banks, simobject)
|
||||
, mem_rsp_ports_(config.num_banks, simobject)
|
||||
, pipeline_reqs_(config.num_banks, config.ports_per_bank)
|
||||
, banks_((1 << config.B), {config, params_})
|
||||
, mem_req_ports_((1 << config.B), simobject)
|
||||
, mem_rsp_ports_((1 << config.B), simobject)
|
||||
, pipeline_reqs_((1 << config.B), config.ports_per_bank)
|
||||
{
|
||||
char sname[100];
|
||||
snprintf(sname, 100, "%s-bypass-arb", simobject->name().c_str());
|
||||
|
||||
if (config_.bypass) {
|
||||
bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config_.num_inputs);
|
||||
bypass_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, config_.num_inputs);
|
||||
for (uint32_t i = 0; i < config_.num_inputs; ++i) {
|
||||
simobject->CoreReqPorts.at(i).bind(&bypass_switch_->ReqIn.at(i));
|
||||
bypass_switch_->RspIn.at(i).bind(&simobject->CoreRspPorts.at(i));
|
||||
@@ -323,14 +320,14 @@ public:
|
||||
return;
|
||||
}
|
||||
|
||||
bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::Priority, 2);
|
||||
bypass_switch_ = MemSwitch::Create(sname, ArbiterType::Priority, 2);
|
||||
bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort);
|
||||
simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0));
|
||||
|
||||
if (config.num_banks > 1) {
|
||||
if (config.B != 0) {
|
||||
snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str());
|
||||
bank_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config.num_banks);
|
||||
for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
|
||||
bank_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, (1 << config.B));
|
||||
for (uint32_t i = 0, n = (1 << config.B); i < n; ++i) {
|
||||
mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i));
|
||||
bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i));
|
||||
}
|
||||
@@ -383,20 +380,22 @@ public:
|
||||
pipeline_req.clear();
|
||||
}
|
||||
|
||||
// schedule MSHR replay
|
||||
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
|
||||
// first: schedule MSHR replay (flush MSHR queue)
|
||||
for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) {
|
||||
auto& bank = banks_.at(bank_id);
|
||||
auto& pipeline_req = pipeline_reqs_.at(bank_id);
|
||||
bank.mshr.pop(&pipeline_req);
|
||||
}
|
||||
|
||||
// schedule memory fill
|
||||
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
|
||||
// second: schedule memory fill (flush memory queue)
|
||||
for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) {
|
||||
auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
|
||||
if (mem_rsp_port.empty())
|
||||
continue;
|
||||
|
||||
auto& pipeline_req = pipeline_reqs_.at(bank_id);
|
||||
|
||||
// skip if bank already busy
|
||||
if (pipeline_req.type != bank_req_t::None)
|
||||
continue;
|
||||
|
||||
@@ -407,7 +406,7 @@ public:
|
||||
mem_rsp_port.pop();
|
||||
}
|
||||
|
||||
// schedule core requests
|
||||
// last: schedule core requests (flush core queue)
|
||||
for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
|
||||
auto& core_req_port = simobject_->CoreReqPorts.at(req_id);
|
||||
if (core_req_port.empty())
|
||||
@@ -425,18 +424,21 @@ public:
|
||||
}
|
||||
|
||||
auto bank_id = params_.addr_bank_id(core_req.addr);
|
||||
auto set_id = params_.addr_set_id(core_req.addr);
|
||||
auto tag = params_.addr_tag(core_req.addr);
|
||||
auto port_id = req_id % config_.ports_per_bank;
|
||||
|
||||
auto& bank = banks_.at(bank_id);
|
||||
auto& pipeline_req = pipeline_reqs_.at(bank_id);
|
||||
|
||||
// skip if bank already busy
|
||||
if (pipeline_req.type != bank_req_t::None)
|
||||
continue;
|
||||
|
||||
auto set_id = params_.addr_set_id(core_req.addr);
|
||||
auto tag = params_.addr_tag(core_req.addr);
|
||||
auto port_id = req_id % config_.ports_per_bank;
|
||||
|
||||
// check MSHR capacity
|
||||
if ((!core_req.write || !config_.write_through)
|
||||
&& bank.mshr.full()) {
|
||||
++perf_stats_.mshr_stalls;
|
||||
++perf_stats_.bank_stalls;
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -452,7 +454,7 @@ public:
|
||||
}
|
||||
// extend request ports
|
||||
pipeline_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
|
||||
} else if (pipeline_req.type == bank_req_t::None) {
|
||||
} else {
|
||||
// schedule new request
|
||||
bank_req_t bank_req(config_.ports_per_bank);
|
||||
bank_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
|
||||
@@ -463,10 +465,6 @@ public:
|
||||
bank_req.type = bank_req_t::Core;
|
||||
bank_req.write = core_req.write;
|
||||
pipeline_req = bank_req;
|
||||
} else {
|
||||
// bank in use
|
||||
++perf_stats_.bank_stalls;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (core_req.write)
|
||||
@@ -516,7 +514,7 @@ private:
|
||||
}
|
||||
|
||||
void processBankRequests() {
|
||||
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
|
||||
for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) {
|
||||
auto& bank = banks_.at(bank_id);
|
||||
auto pipeline_req = pipeline_reqs_.at(bank_id);
|
||||
|
||||
@@ -545,11 +543,10 @@ private:
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case bank_req_t::Core: {
|
||||
bool hit = false;
|
||||
bool found_free_line = false;
|
||||
uint32_t hit_line_id = 0;
|
||||
uint32_t repl_line_id = 0;
|
||||
case bank_req_t::Core: {
|
||||
int32_t hit_line_id = -1;
|
||||
int32_t free_line_id = -1;
|
||||
int32_t repl_line_id = 0;
|
||||
uint32_t max_cnt = 0;
|
||||
|
||||
auto& set = bank.sets.at(pipeline_req.set_id);
|
||||
@@ -557,38 +554,34 @@ private:
|
||||
// tag lookup
|
||||
for (uint32_t i = 0, n = set.lines.size(); i < n; ++i) {
|
||||
auto& line = set.lines.at(i);
|
||||
if (max_cnt < line.lru_ctr) {
|
||||
max_cnt = line.lru_ctr;
|
||||
repl_line_id = i;
|
||||
}
|
||||
if (line.valid) {
|
||||
if (line.tag == pipeline_req.tag) {
|
||||
line.lru_ctr = 0;
|
||||
if (line.tag == pipeline_req.tag) {
|
||||
hit_line_id = i;
|
||||
hit = true;
|
||||
line.lru_ctr = 0;
|
||||
} else {
|
||||
++line.lru_ctr;
|
||||
}
|
||||
if (max_cnt < line.lru_ctr) {
|
||||
max_cnt = line.lru_ctr;
|
||||
repl_line_id = i;
|
||||
}
|
||||
} else {
|
||||
found_free_line = true;
|
||||
repl_line_id = i;
|
||||
free_line_id = i;
|
||||
}
|
||||
}
|
||||
|
||||
if (hit) {
|
||||
//
|
||||
// Hit handling
|
||||
//
|
||||
if (hit_line_id != -1) {
|
||||
// Hit handling
|
||||
if (pipeline_req.write) {
|
||||
// handle write hit
|
||||
// handle write hit
|
||||
auto& hit_line = set.lines.at(hit_line_id);
|
||||
if (config_.write_through) {
|
||||
// forward write request to memory
|
||||
MemReq mem_req;
|
||||
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_line.tag);
|
||||
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
|
||||
mem_req.write = true;
|
||||
mem_req.cid = pipeline_req.cid;
|
||||
mem_req.uuid = pipeline_req.uuid;
|
||||
mem_req.cid = pipeline_req.cid;
|
||||
mem_req.uuid = pipeline_req.uuid;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
DT(3, simobject_->name() << "-dram-" << mem_req);
|
||||
} else {
|
||||
@@ -606,23 +599,21 @@ private:
|
||||
DT(3, simobject_->name() << "-core-" << core_rsp);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
//
|
||||
// Miss handling
|
||||
//
|
||||
} else {
|
||||
// Miss handling
|
||||
if (pipeline_req.write)
|
||||
++perf_stats_.write_misses;
|
||||
else
|
||||
++perf_stats_.read_misses;
|
||||
|
||||
if (!found_free_line && !config_.write_through) {
|
||||
if (free_line_id == -1 && !config_.write_through) {
|
||||
// write back dirty line
|
||||
auto& repl_line = set.lines.at(repl_line_id);
|
||||
if (repl_line.dirty) {
|
||||
MemReq mem_req;
|
||||
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_line.tag);
|
||||
mem_req.write = true;
|
||||
mem_req.cid = pipeline_req.cid;
|
||||
mem_req.cid = pipeline_req.cid;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
DT(3, simobject_->name() << "-dram-" << mem_req);
|
||||
++perf_stats_.evictions;
|
||||
@@ -635,8 +626,8 @@ private:
|
||||
MemReq mem_req;
|
||||
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
|
||||
mem_req.write = true;
|
||||
mem_req.cid = pipeline_req.cid;
|
||||
mem_req.uuid = pipeline_req.uuid;
|
||||
mem_req.cid = pipeline_req.cid;
|
||||
mem_req.uuid = pipeline_req.uuid;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
DT(3, simobject_->name() << "-dram-" << mem_req);
|
||||
}
|
||||
@@ -655,7 +646,7 @@ private:
|
||||
auto mshr_pending = bank.mshr.lookup(pipeline_req);
|
||||
|
||||
// allocate MSHR
|
||||
auto mshr_id = bank.mshr.allocate(pipeline_req, repl_line_id);
|
||||
auto mshr_id = bank.mshr.allocate(pipeline_req, (free_line_id != -1) ? free_line_id : repl_line_id);
|
||||
|
||||
// send fill request
|
||||
if (!mshr_pending) {
|
||||
@@ -663,8 +654,8 @@ private:
|
||||
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
|
||||
mem_req.write = false;
|
||||
mem_req.tag = mshr_id;
|
||||
mem_req.cid = pipeline_req.cid;
|
||||
mem_req.uuid = pipeline_req.uuid;
|
||||
mem_req.cid = pipeline_req.cid;
|
||||
mem_req.uuid = pipeline_req.uuid;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
DT(3, simobject_->name() << "-dram-" << mem_req);
|
||||
++pending_fill_reqs_;
|
||||
|
||||
@@ -23,16 +23,15 @@ public:
|
||||
struct Config {
|
||||
bool bypass; // cache bypass
|
||||
uint8_t C; // log2 cache size
|
||||
uint8_t B; // log2 block size
|
||||
uint8_t L; // log2 line size
|
||||
uint8_t W; // log2 word size
|
||||
uint8_t A; // log2 associativity
|
||||
uint8_t addr_width; // word address bits
|
||||
uint8_t num_banks; // number of banks
|
||||
uint8_t B; // log2 number of banks
|
||||
uint8_t addr_width; // word address bits
|
||||
uint8_t ports_per_bank; // number of ports per bank
|
||||
uint8_t num_inputs; // number of inputs
|
||||
bool write_through; // is write-through
|
||||
bool write_reponse; // enable write response
|
||||
uint16_t victim_size; // victim cache size
|
||||
uint16_t mshr_size; // MSHR buffer size
|
||||
uint8_t latency; // pipeline latency
|
||||
};
|
||||
|
||||
@@ -18,34 +18,60 @@ using namespace vortex;
|
||||
Cluster::Cluster(const SimContext& ctx,
|
||||
uint32_t cluster_id,
|
||||
ProcessorImpl* processor,
|
||||
const Arch &arch, const
|
||||
DCRS &dcrs)
|
||||
const Arch &arch,
|
||||
const DCRS &dcrs)
|
||||
: SimObject(ctx, "cluster")
|
||||
, mem_req_port(this)
|
||||
, mem_rsp_port(this)
|
||||
, cluster_id_(cluster_id)
|
||||
, cores_(arch.num_cores())
|
||||
, barriers_(arch.num_barriers(), 0)
|
||||
, sharedmems_(arch.num_cores())
|
||||
, processor_(processor)
|
||||
, sockets_(NUM_SOCKETS)
|
||||
, barriers_(arch.num_barriers(), 0)
|
||||
, cores_per_socket_(arch.socket_size())
|
||||
{
|
||||
auto num_cores = arch.num_cores();
|
||||
|
||||
char sname[100];
|
||||
|
||||
uint32_t sockets_per_cluster = sockets_.size();
|
||||
|
||||
// create sockets
|
||||
|
||||
snprintf(sname, 100, "cluster%d-icache-arb", cluster_id);
|
||||
auto icache_switch = MemSwitch::Create(sname, ArbiterType::RoundRobin, sockets_per_cluster);
|
||||
|
||||
snprintf(sname, 100, "cluster%d-dcache-arb", cluster_id);
|
||||
auto dcache_switch = MemSwitch::Create(sname, ArbiterType::RoundRobin, sockets_per_cluster);
|
||||
|
||||
for (uint32_t i = 0; i < sockets_per_cluster; ++i) {
|
||||
uint32_t socket_id = cluster_id * sockets_per_cluster + i;
|
||||
auto socket = Socket::Create(socket_id,
|
||||
this,
|
||||
arch,
|
||||
dcrs);
|
||||
|
||||
socket->icache_mem_req_port.bind(&icache_switch->ReqIn.at(i));
|
||||
icache_switch->RspIn.at(i).bind(&socket->icache_mem_rsp_port);
|
||||
|
||||
socket->dcache_mem_req_port.bind(&dcache_switch->ReqIn.at(i));
|
||||
dcache_switch->RspIn.at(i).bind(&socket->dcache_mem_rsp_port);
|
||||
|
||||
sockets_.at(i) = socket;
|
||||
}
|
||||
|
||||
// Create l2cache
|
||||
|
||||
snprintf(sname, 100, "cluster%d-l2cache", cluster_id);
|
||||
l2cache_ = CacheSim::Create(sname, CacheSim::Config{
|
||||
!L2_ENABLED,
|
||||
log2ceil(L2_CACHE_SIZE), // C
|
||||
log2ceil(MEM_BLOCK_SIZE), // B
|
||||
log2ceil(MEM_BLOCK_SIZE), // L
|
||||
log2ceil(L2_NUM_WAYS), // W
|
||||
0, // A
|
||||
log2ceil(L2_NUM_BANKS), // B
|
||||
XLEN, // address bits
|
||||
L2_NUM_BANKS, // number of banks
|
||||
1, // number of ports
|
||||
5, // request size
|
||||
2, // number of inputs
|
||||
true, // write-through
|
||||
false, // write response
|
||||
0, // victim size
|
||||
L2_MSHR_SIZE, // mshr
|
||||
2, // pipeline latency
|
||||
});
|
||||
@@ -53,89 +79,11 @@ Cluster::Cluster(const SimContext& ctx,
|
||||
l2cache_->MemReqPort.bind(&this->mem_req_port);
|
||||
this->mem_rsp_port.bind(&l2cache_->MemRspPort);
|
||||
|
||||
snprintf(sname, 100, "cluster%d-icaches", cluster_id);
|
||||
icaches_ = CacheCluster::Create(sname, num_cores, NUM_ICACHES, 1, CacheSim::Config{
|
||||
!ICACHE_ENABLED,
|
||||
log2ceil(ICACHE_SIZE), // C
|
||||
log2ceil(L1_LINE_SIZE), // B
|
||||
log2ceil(sizeof(uint32_t)), // W
|
||||
log2ceil(ICACHE_NUM_WAYS),// A
|
||||
XLEN, // address bits
|
||||
1, // number of banks
|
||||
1, // number of ports
|
||||
1, // number of inputs
|
||||
true, // write-through
|
||||
false, // write response
|
||||
0, // victim size
|
||||
(uint8_t)arch.num_warps(), // mshr
|
||||
2, // pipeline latency
|
||||
});
|
||||
icache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(0));
|
||||
l2cache_->CoreRspPorts.at(0).bind(&icache_switch->RspOut.at(0));
|
||||
|
||||
icaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(0));
|
||||
l2cache_->CoreRspPorts.at(0).bind(&icaches_->MemRspPort);
|
||||
|
||||
snprintf(sname, 100, "cluster%d-dcaches", cluster_id);
|
||||
dcaches_ = CacheCluster::Create(sname, num_cores, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
|
||||
!DCACHE_ENABLED,
|
||||
log2ceil(DCACHE_SIZE), // C
|
||||
log2ceil(L1_LINE_SIZE), // B
|
||||
log2ceil(sizeof(Word)), // W
|
||||
log2ceil(DCACHE_NUM_WAYS),// A
|
||||
XLEN, // address bits
|
||||
DCACHE_NUM_BANKS, // number of banks
|
||||
1, // number of ports
|
||||
DCACHE_NUM_BANKS, // number of inputs
|
||||
true, // write-through
|
||||
false, // write response
|
||||
0, // victim size
|
||||
DCACHE_MSHR_SIZE, // mshr
|
||||
4, // pipeline latency
|
||||
});
|
||||
|
||||
dcaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(1));
|
||||
l2cache_->CoreRspPorts.at(1).bind(&dcaches_->MemRspPort);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// create shared memory blocks
|
||||
for (uint32_t i = 0; i < num_cores; ++i) {
|
||||
snprintf(sname, 100, "cluster%d-shared_mem%d", cluster_id, i);
|
||||
sharedmems_.at(i) = SharedMem::Create(sname, SharedMem::Config{
|
||||
(1 << SMEM_LOG_SIZE),
|
||||
sizeof(Word),
|
||||
NUM_LSU_LANES,
|
||||
NUM_LSU_LANES,
|
||||
false
|
||||
});
|
||||
}
|
||||
|
||||
// create cores
|
||||
|
||||
for (uint32_t i = 0; i < num_cores; ++i) {
|
||||
uint32_t core_id = cluster_id * num_cores + i;
|
||||
cores_.at(i) = Core::Create(core_id,
|
||||
this,
|
||||
arch,
|
||||
dcrs,
|
||||
sharedmems_.at(i));
|
||||
|
||||
cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
|
||||
icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));
|
||||
|
||||
for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) {
|
||||
snprintf(sname, 100, "cluster%d-smem_demux%d_%d", cluster_id, i, j);
|
||||
auto smem_demux = SMemDemux::Create(sname);
|
||||
|
||||
cores_.at(i)->dcache_req_ports.at(j).bind(&smem_demux->ReqIn);
|
||||
smem_demux->RspIn.bind(&cores_.at(i)->dcache_rsp_ports.at(j));
|
||||
|
||||
smem_demux->ReqDc.bind(&dcaches_->CoreReqPorts.at(i).at(j));
|
||||
dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDc);
|
||||
|
||||
smem_demux->ReqSm.bind(&sharedmems_.at(i)->Inputs.at(j));
|
||||
sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSm);
|
||||
}
|
||||
}
|
||||
dcache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(1));
|
||||
l2cache_->CoreRspPorts.at(1).bind(&dcache_switch->RspOut.at(0));
|
||||
}
|
||||
|
||||
Cluster::~Cluster() {
|
||||
@@ -153,14 +101,14 @@ void Cluster::tick() {
|
||||
}
|
||||
|
||||
void Cluster::attach_ram(RAM* ram) {
|
||||
for (auto core : cores_) {
|
||||
core->attach_ram(ram);
|
||||
for (auto& socket : sockets_) {
|
||||
socket->attach_ram(ram);
|
||||
}
|
||||
}
|
||||
|
||||
bool Cluster::running() const {
|
||||
for (auto& core : cores_) {
|
||||
if (core->running())
|
||||
for (auto& socket : sockets_) {
|
||||
if (socket->running())
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
@@ -169,9 +117,9 @@ bool Cluster::running() const {
|
||||
bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
|
||||
bool done = true;
|
||||
Word exitcode_ = 0;
|
||||
for (auto& core : cores_) {
|
||||
for (auto& socket : sockets_) {
|
||||
Word ec;
|
||||
if (core->check_exit(&ec, riscv_test)) {
|
||||
if (socket->check_exit(&ec, riscv_test)) {
|
||||
exitcode_ |= ec;
|
||||
} else {
|
||||
done = false;
|
||||
@@ -184,36 +132,32 @@ bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
|
||||
void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
|
||||
auto& barrier = barriers_.at(bar_id);
|
||||
|
||||
uint32_t local_core_id = core_id % cores_.size();
|
||||
auto sockets_per_cluster = sockets_.size();
|
||||
auto cores_per_socket = cores_per_socket_;
|
||||
|
||||
uint32_t cores_per_cluster = sockets_per_cluster * cores_per_socket;
|
||||
uint32_t local_core_id = core_id % cores_per_cluster;
|
||||
barrier.set(local_core_id);
|
||||
|
||||
DP(3, "*** Suspend core #" << core_id << " at barrier #" << bar_id);
|
||||
|
||||
if (barrier.count() == (size_t)count) {
|
||||
// resume all suspended cores
|
||||
for (uint32_t i = 0; i < cores_.size(); ++i) {
|
||||
if (barrier.test(i)) {
|
||||
DP(3, "*** Resume core #" << i << " at barrier #" << bar_id);
|
||||
cores_.at(i)->resume();
|
||||
for (uint32_t s = 0; s < sockets_per_cluster; ++s) {
|
||||
for (uint32_t c = 0; c < cores_per_socket; ++c) {
|
||||
uint32_t i = s * cores_per_socket + c;
|
||||
if (barrier.test(i)) {
|
||||
DP(3, "*** Resume core #" << i << " at barrier #" << bar_id);
|
||||
sockets_.at(s)->resume(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier.reset();
|
||||
}
|
||||
}
|
||||
|
||||
ProcessorImpl* Cluster::processor() const {
|
||||
return processor_;
|
||||
}
|
||||
|
||||
Cluster::PerfStats Cluster::perf_stats() const {
|
||||
Cluster::PerfStats perf;
|
||||
perf.icache = icaches_->perf_stats();
|
||||
perf.dcache = dcaches_->perf_stats();
|
||||
perf.l2cache = l2cache_->perf_stats();
|
||||
|
||||
for (auto sharedmem : sharedmems_) {
|
||||
perf.sharedmem += sharedmem->perf_stats();
|
||||
}
|
||||
|
||||
return perf;
|
||||
PerfStats perf_stats;
|
||||
perf_stats.l2cache = l2cache_->perf_stats();
|
||||
return perf_stats;
|
||||
}
|
||||
@@ -19,6 +19,7 @@
|
||||
#include "cache_cluster.h"
|
||||
#include "shared_mem.h"
|
||||
#include "core.h"
|
||||
#include "socket.h"
|
||||
#include "constants.h"
|
||||
|
||||
namespace vortex {
|
||||
@@ -28,18 +29,7 @@ class ProcessorImpl;
|
||||
class Cluster : public SimObject<Cluster> {
|
||||
public:
|
||||
struct PerfStats {
|
||||
CacheSim::PerfStats icache;
|
||||
CacheSim::PerfStats dcache;
|
||||
SharedMem::PerfStats sharedmem;
|
||||
CacheSim::PerfStats l2cache;
|
||||
|
||||
PerfStats& operator+=(const PerfStats& rhs) {
|
||||
this->icache += rhs.icache;
|
||||
this->dcache += rhs.dcache;
|
||||
this->sharedmem += rhs.sharedmem;
|
||||
this->l2cache += rhs.l2cache;
|
||||
return *this;
|
||||
}
|
||||
CacheSim::PerfStats l2cache;
|
||||
};
|
||||
|
||||
SimPort<MemReq> mem_req_port;
|
||||
@@ -53,6 +43,14 @@ public:
|
||||
|
||||
~Cluster();
|
||||
|
||||
uint32_t id() const {
|
||||
return cluster_id_;
|
||||
}
|
||||
|
||||
ProcessorImpl* processor() const {
|
||||
return processor_;
|
||||
}
|
||||
|
||||
void reset();
|
||||
|
||||
void tick();
|
||||
@@ -65,22 +63,15 @@ public:
|
||||
|
||||
void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
|
||||
|
||||
ProcessorImpl* processor() const;
|
||||
|
||||
Cluster::PerfStats perf_stats() const;
|
||||
PerfStats perf_stats() const;
|
||||
|
||||
private:
|
||||
uint32_t cluster_id_;
|
||||
std::vector<Core::Ptr> cores_;
|
||||
std::vector<CoreMask> barriers_;
|
||||
CacheSim::Ptr l2cache_;
|
||||
CacheCluster::Ptr icaches_;
|
||||
CacheCluster::Ptr dcaches_;
|
||||
std::vector<SharedMem::Ptr> sharedmems_;
|
||||
CacheCluster::Ptr tcaches_;
|
||||
CacheCluster::Ptr ocaches_;
|
||||
CacheCluster::Ptr rcaches_;
|
||||
ProcessorImpl* processor_;
|
||||
uint32_t cluster_id_;
|
||||
ProcessorImpl* processor_;
|
||||
std::vector<Socket::Ptr> sockets_;
|
||||
std::vector<CoreMask> barriers_;
|
||||
CacheSim::Ptr l2cache_;
|
||||
uint32_t cores_per_socket_;
|
||||
};
|
||||
|
||||
} // namespace vortex
|
||||
@@ -21,6 +21,7 @@
|
||||
#include "mem.h"
|
||||
#include "decode.h"
|
||||
#include "core.h"
|
||||
#include "socket.h"
|
||||
#include "debug.h"
|
||||
#include "constants.h"
|
||||
#include "processor_impl.h"
|
||||
@@ -29,35 +30,36 @@ using namespace vortex;
|
||||
|
||||
Core::Core(const SimContext& ctx,
|
||||
uint32_t core_id,
|
||||
Cluster* cluster,
|
||||
Socket* socket,
|
||||
const Arch &arch,
|
||||
const DCRS &dcrs,
|
||||
SharedMem::Ptr sharedmem)
|
||||
const DCRS &dcrs)
|
||||
: SimObject(ctx, "core")
|
||||
, icache_req_ports(1, this)
|
||||
, icache_rsp_ports(1, this)
|
||||
, dcache_req_ports(NUM_LSU_LANES, this)
|
||||
, dcache_rsp_ports(NUM_LSU_LANES, this)
|
||||
, core_id_(core_id)
|
||||
, socket_(socket)
|
||||
, arch_(arch)
|
||||
, dcrs_(dcrs)
|
||||
, decoder_(arch)
|
||||
, warps_(arch.num_warps())
|
||||
, barriers_(arch.num_barriers(), 0)
|
||||
, fcsrs_(arch.num_warps(), 0)
|
||||
, ibuffers_(ISSUE_WIDTH, IBUF_SIZE)
|
||||
, scoreboard_(arch_)
|
||||
, ibuffers_(arch.num_warps(), IBUF_SIZE)
|
||||
, scoreboard_(arch_)
|
||||
, operands_(ISSUE_WIDTH)
|
||||
, dispatchers_((uint32_t)ExeType::MAX)
|
||||
, exe_units_((uint32_t)ExeType::MAX)
|
||||
, sharedmem_(sharedmem)
|
||||
, dispatchers_((uint32_t)ExeType::ExeTypeCount)
|
||||
, exe_units_((uint32_t)ExeType::ExeTypeCount)
|
||||
, smem_demuxs_(NUM_LSU_LANES)
|
||||
, fetch_latch_("fetch")
|
||||
, decode_latch_("decode")
|
||||
, pending_icache_(arch_.num_warps())
|
||||
, committed_traces_(ISSUE_WIDTH, nullptr)
|
||||
, csrs_(arch.num_warps())
|
||||
, cluster_(cluster)
|
||||
{
|
||||
, csrs_(arch.num_warps())
|
||||
, commit_arbs_(ISSUE_WIDTH)
|
||||
{
|
||||
char sname[100];
|
||||
|
||||
for (uint32_t i = 0; i < arch_.num_warps(); ++i) {
|
||||
csrs_.at(i).resize(arch.num_threads());
|
||||
}
|
||||
@@ -70,6 +72,28 @@ Core::Core(const SimContext& ctx,
|
||||
operands_.at(i) = SimPlatform::instance().create_object<Operand>();
|
||||
}
|
||||
|
||||
// initialize shared memory
|
||||
snprintf(sname, 100, "core%d-shared_mem", core_id);
|
||||
shared_mem_ = SharedMem::Create(sname, SharedMem::Config{
|
||||
(1 << SMEM_LOG_SIZE),
|
||||
sizeof(Word),
|
||||
NUM_LSU_LANES,
|
||||
NUM_LSU_LANES,
|
||||
false
|
||||
});
|
||||
for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
|
||||
snprintf(sname, 100, "core%d-smem_demux%d", core_id, i);
|
||||
auto smem_demux = SMemDemux::Create(sname);
|
||||
|
||||
smem_demux->ReqDC.bind(&dcache_req_ports.at(i));
|
||||
dcache_rsp_ports.at(i).bind(&smem_demux->RspDC);
|
||||
|
||||
smem_demux->ReqSM.bind(&shared_mem_->Inputs.at(i));
|
||||
shared_mem_->Outputs.at(i).bind(&smem_demux->RspSM);
|
||||
|
||||
smem_demuxs_.at(i) = smem_demux;
|
||||
}
|
||||
|
||||
// initialize dispatchers
|
||||
dispatchers_.at((int)ExeType::ALU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_ALU_BLOCKS, NUM_ALU_LANES);
|
||||
dispatchers_.at((int)ExeType::FPU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_FPU_BLOCKS, NUM_FPU_LANES);
|
||||
@@ -82,6 +106,16 @@ Core::Core(const SimContext& ctx,
|
||||
exe_units_.at((int)ExeType::LSU) = SimPlatform::instance().create_object<LsuUnit>(this);
|
||||
exe_units_.at((int)ExeType::SFU) = SimPlatform::instance().create_object<SfuUnit>(this);
|
||||
|
||||
// bind commit arbiters
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
snprintf(sname, 100, "core%d-commit-arb%d", core_id, i);
|
||||
auto arbiter = TraceSwitch::Create(sname, ArbiterType::RoundRobin, (uint32_t)ExeType::ExeTypeCount, 1);
|
||||
for (uint32_t j = 0; j < (uint32_t)ExeType::ExeTypeCount; ++j) {
|
||||
exe_units_.at(j)->Outputs.at(i).bind(&arbiter->Inputs.at(j));
|
||||
}
|
||||
commit_arbs_.at(i) = arbiter;
|
||||
}
|
||||
|
||||
this->reset();
|
||||
}
|
||||
|
||||
@@ -99,8 +133,12 @@ void Core::reset() {
|
||||
for (auto& exe_unit : exe_units_) {
|
||||
exe_unit->reset();
|
||||
}
|
||||
|
||||
for (auto& commit_arb : commit_arbs_) {
|
||||
commit_arb->reset();
|
||||
}
|
||||
|
||||
for ( auto& barrier : barriers_) {
|
||||
for (auto& barrier : barriers_) {
|
||||
barrier.reset();
|
||||
}
|
||||
|
||||
@@ -112,7 +150,7 @@ void Core::reset() {
|
||||
ibuf.clear();
|
||||
}
|
||||
|
||||
commit_exe_= 0;
|
||||
ibuffer_idx_ = 0;
|
||||
|
||||
scoreboard_.clear();
|
||||
fetch_latch_.clear();
|
||||
@@ -150,8 +188,10 @@ void Core::schedule() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (scheduled_warp == -1)
|
||||
if (scheduled_warp == -1) {
|
||||
++perf_stats_.sched_idle;
|
||||
return;
|
||||
}
|
||||
|
||||
// suspend warp until decode
|
||||
stalled_warps_.set(scheduled_warp);
|
||||
@@ -192,11 +232,11 @@ void Core::fetch() {
|
||||
mem_req.tag = pending_icache_.allocate(trace);
|
||||
mem_req.cid = trace->cid;
|
||||
mem_req.uuid = trace->uuid;
|
||||
icache_req_ports.at(0).send(mem_req, 1);
|
||||
icache_req_ports.at(0).send(mem_req, 2);
|
||||
DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
|
||||
fetch_latch_.pop();
|
||||
++pending_ifetches_;
|
||||
fetch_latch_.pop();
|
||||
++perf_stats_.ifetches;
|
||||
++pending_ifetches_;
|
||||
}
|
||||
|
||||
void Core::decode() {
|
||||
@@ -206,7 +246,7 @@ void Core::decode() {
|
||||
auto trace = decode_latch_.front();
|
||||
|
||||
// check ibuffer capacity
|
||||
auto& ibuffer = ibuffers_.at(trace->wid % ISSUE_WIDTH);
|
||||
auto& ibuffer = ibuffers_.at(trace->wid);
|
||||
if (ibuffer.full()) {
|
||||
if (!trace->log_once(true)) {
|
||||
DT(3, "*** ibuffer-stall: " << *trace);
|
||||
@@ -223,13 +263,6 @@ void Core::decode() {
|
||||
stalled_warps_.reset(trace->wid);
|
||||
}
|
||||
|
||||
// update perf counters
|
||||
uint32_t active_threads = trace->tmask.count();
|
||||
if (trace->exe_type == ExeType::LSU && trace->lsu_type == LsuType::LOAD)
|
||||
perf_stats_.loads += active_threads;
|
||||
if (trace->exe_type == ExeType::LSU && trace->lsu_type == LsuType::STORE)
|
||||
perf_stats_.stores += active_threads;
|
||||
|
||||
DT(3, "pipeline-decode: " << *trace);
|
||||
|
||||
// insert to ibuffer
|
||||
@@ -239,7 +272,7 @@ void Core::decode() {
|
||||
}
|
||||
|
||||
void Core::issue() {
|
||||
// operands to dispatch
|
||||
// operands to dispatchers
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
auto& operand = operands_.at(i);
|
||||
if (operand->Output.empty())
|
||||
@@ -257,7 +290,8 @@ void Core::issue() {
|
||||
|
||||
// issue ibuffer instructions
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
auto& ibuffer = ibuffers_.at(i);
|
||||
uint32_t ii = (ibuffer_idx_ + i) % ibuffers_.size();
|
||||
auto& ibuffer = ibuffers_.at(ii);
|
||||
if (ibuffer.empty())
|
||||
continue;
|
||||
|
||||
@@ -265,17 +299,41 @@ void Core::issue() {
|
||||
|
||||
// check scoreboard
|
||||
if (scoreboard_.in_use(trace)) {
|
||||
auto uses = scoreboard_.get_uses(trace);
|
||||
if (!trace->log_once(true)) {
|
||||
DTH(3, "*** scoreboard-stall: dependents={");
|
||||
auto uses = scoreboard_.get_uses(trace);
|
||||
DTH(3, "*** scoreboard-stall: dependents={");
|
||||
for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
|
||||
auto& use = uses.at(j);
|
||||
__unused (use);
|
||||
if (j) DTN(3, ", ");
|
||||
DTN(3, use.type << use.reg << "(#" << use.owner << ")");
|
||||
DTN(3, use.reg_type << use.reg_id << "(#" << use.uuid << ")");
|
||||
}
|
||||
DTN(3, "}, " << *trace << std::endl);
|
||||
}
|
||||
for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
|
||||
auto& use = uses.at(j);
|
||||
switch (use.exe_type) {
|
||||
case ExeType::ALU: ++perf_stats_.scrb_alu; break;
|
||||
case ExeType::FPU: ++perf_stats_.scrb_fpu; break;
|
||||
case ExeType::LSU: ++perf_stats_.scrb_lsu; break;
|
||||
case ExeType::SFU: {
|
||||
++perf_stats_.scrb_sfu;
|
||||
switch (use.sfu_type) {
|
||||
case SfuType::TMC:
|
||||
case SfuType::WSPAWN:
|
||||
case SfuType::SPLIT:
|
||||
case SfuType::JOIN:
|
||||
case SfuType::BAR:
|
||||
case SfuType::PRED: ++perf_stats_.scrb_wctl; break;
|
||||
case SfuType::CSRRW:
|
||||
case SfuType::CSRRS:
|
||||
case SfuType::CSRRC: ++perf_stats_.scrb_csrs; break;
|
||||
default: assert(false);
|
||||
}
|
||||
} break;
|
||||
default: assert(false);
|
||||
}
|
||||
}
|
||||
++perf_stats_.scrb_stalls;
|
||||
continue;
|
||||
} else {
|
||||
@@ -294,10 +352,11 @@ void Core::issue() {
|
||||
|
||||
ibuffer.pop();
|
||||
}
|
||||
ibuffer_idx_ += ISSUE_WIDTH;
|
||||
}
|
||||
|
||||
void Core::execute() {
|
||||
for (uint32_t i = 0; i < (uint32_t)ExeType::MAX; ++i) {
|
||||
for (uint32_t i = 0; i < (uint32_t)ExeType::ExeTypeCount; ++i) {
|
||||
auto& dispatch = dispatchers_.at(i);
|
||||
auto& exe_unit = exe_units_.at(i);
|
||||
for (uint32_t j = 0; j < ISSUE_WIDTH; ++j) {
|
||||
@@ -313,10 +372,10 @@ void Core::execute() {
|
||||
void Core::commit() {
|
||||
// process completed instructions
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
auto trace = committed_traces_.at(i);
|
||||
if (!trace)
|
||||
auto& commit_arb = commit_arbs_.at(i);
|
||||
if (commit_arb->Outputs.at(0).empty())
|
||||
continue;
|
||||
committed_traces_.at(i) = nullptr;
|
||||
auto trace = commit_arb->Outputs.at(0).front();
|
||||
|
||||
// advance to commit stage
|
||||
DT(3, "pipeline-commit: " << *trace);
|
||||
@@ -334,27 +393,11 @@ void Core::commit() {
|
||||
perf_stats_.instrs += trace->tmask.count();
|
||||
}
|
||||
|
||||
commit_arb->Outputs.at(0).pop();
|
||||
|
||||
// delete the trace
|
||||
delete trace;
|
||||
}
|
||||
|
||||
// select completed instructions
|
||||
for (uint32_t i = 0; i < (uint32_t)ExeType::MAX; ++i) {
|
||||
uint32_t ii = (commit_exe_ + i) % (uint32_t)ExeType::MAX;
|
||||
auto& exe_unit = exe_units_.at(ii);
|
||||
for (uint32_t j = 0; j < ISSUE_WIDTH; ++j) {
|
||||
auto committed_trace = committed_traces_.at(j);
|
||||
if (committed_trace)
|
||||
continue;
|
||||
auto& output = exe_unit->Outputs.at(j);
|
||||
if (output.empty())
|
||||
continue;
|
||||
auto trace = output.front();
|
||||
committed_traces_.at(j) = trace;
|
||||
output.pop();
|
||||
}
|
||||
}
|
||||
++commit_exe_;
|
||||
}
|
||||
|
||||
void Core::wspawn(uint32_t num_warps, Word nextPC) {
|
||||
@@ -379,7 +422,7 @@ void Core::barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id) {
|
||||
if (is_global) {
|
||||
// global barrier handling
|
||||
if (barrier.count() == active_warps_.count()) {
|
||||
cluster_->barrier(bar_idx, count, core_id_);
|
||||
socket_->barrier(bar_idx, count, core_id_);
|
||||
barrier.reset();
|
||||
}
|
||||
} else {
|
||||
@@ -416,7 +459,7 @@ AddrType Core::get_addr_type(uint64_t addr) {
|
||||
void Core::dcache_read(void *data, uint64_t addr, uint32_t size) {
|
||||
auto type = this->get_addr_type(addr);
|
||||
if (type == AddrType::Shared) {
|
||||
sharedmem_->read(data, addr, size);
|
||||
shared_mem_->read(data, addr, size);
|
||||
} else {
|
||||
mmu_.read(data, addr, size, 0);
|
||||
}
|
||||
@@ -431,7 +474,7 @@ void Core::dcache_write(const void* data, uint64_t addr, uint32_t size) {
|
||||
this->writeToStdOut(data, addr, size);
|
||||
} else {
|
||||
if (type == AddrType::Shared) {
|
||||
sharedmem_->write(data, addr, size);
|
||||
shared_mem_->write(data, addr, size);
|
||||
} else {
|
||||
mmu_.write(data, addr, size, 0);
|
||||
}
|
||||
@@ -533,71 +576,76 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
||||
break;
|
||||
case VX_DCR_MPM_CLASS_CORE: {
|
||||
switch (addr) {
|
||||
case VX_CSR_MPM_SCHED_ID: return perf_stats_.sched_idle & 0xffffffff;
|
||||
case VX_CSR_MPM_SCHED_ID_H:return perf_stats_.sched_idle >> 32;
|
||||
case VX_CSR_MPM_SCHED_ST: return perf_stats_.sched_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32;
|
||||
case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_IBUF_ST_H: return perf_stats_.ibuf_stalls >> 32;
|
||||
case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_ST_H: return perf_stats_.scrb_stalls >> 32;
|
||||
case VX_CSR_MPM_ALU_ST: return perf_stats_.alu_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_ALU_ST_H: return perf_stats_.alu_stalls >> 32;
|
||||
case VX_CSR_MPM_LSU_ST: return perf_stats_.lsu_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_LSU_ST_H: return perf_stats_.lsu_stalls >> 32;
|
||||
case VX_CSR_MPM_FPU_ST: return perf_stats_.fpu_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_FPU_ST_H: return perf_stats_.fpu_stalls >> 32;
|
||||
case VX_CSR_MPM_SFU_ST: return perf_stats_.sfu_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_SFU_ST_H: return perf_stats_.sfu_stalls >> 32;
|
||||
|
||||
case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_ST_H: return perf_stats_.scrb_stalls >> 32;
|
||||
case VX_CSR_MPM_SCRB_ALU: return perf_stats_.scrb_alu & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_ALU_H:return perf_stats_.scrb_alu >> 32;
|
||||
case VX_CSR_MPM_SCRB_FPU: return perf_stats_.scrb_fpu & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_FPU_H:return perf_stats_.scrb_fpu >> 32;
|
||||
case VX_CSR_MPM_SCRB_LSU: return perf_stats_.scrb_lsu & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_LSU_H:return perf_stats_.scrb_lsu >> 32;
|
||||
case VX_CSR_MPM_SCRB_SFU: return perf_stats_.scrb_sfu & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_SFU_H:return perf_stats_.scrb_sfu >> 32;
|
||||
case VX_CSR_MPM_SCRB_WCTL: return perf_stats_.scrb_wctl & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_WCTL_H: return perf_stats_.scrb_wctl >> 32;
|
||||
case VX_CSR_MPM_SCRB_CSRS: return perf_stats_.scrb_csrs & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_CSRS_H: return perf_stats_.scrb_csrs >> 32;
|
||||
case VX_CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff;
|
||||
case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32;
|
||||
case VX_CSR_MPM_LOADS: return perf_stats_.loads & 0xffffffff;
|
||||
case VX_CSR_MPM_LOADS_H: return perf_stats_.loads >> 32;
|
||||
case VX_CSR_MPM_STORES: return perf_stats_.stores & 0xffffffff;
|
||||
case VX_CSR_MPM_STORES_H: return perf_stats_.stores >> 32;
|
||||
case VX_CSR_MPM_IFETCH_LAT: return perf_stats_.ifetch_latency & 0xffffffff;
|
||||
case VX_CSR_MPM_IFETCH_LAT_H: return perf_stats_.ifetch_latency >> 32;
|
||||
case VX_CSR_MPM_LOAD_LAT: return perf_stats_.load_latency & 0xffffffff;
|
||||
case VX_CSR_MPM_LOAD_LAT_H: return perf_stats_.load_latency >> 32;
|
||||
case VX_CSR_MPM_IFETCH_LT: return perf_stats_.ifetch_latency & 0xffffffff;
|
||||
case VX_CSR_MPM_IFETCH_LT_H: return perf_stats_.ifetch_latency >> 32;
|
||||
case VX_CSR_MPM_LOAD_LT: return perf_stats_.load_latency & 0xffffffff;
|
||||
case VX_CSR_MPM_LOAD_LT_H: return perf_stats_.load_latency >> 32;
|
||||
}
|
||||
} break;
|
||||
case VX_DCR_MPM_CLASS_MEM: {
|
||||
auto proc_perf = cluster_->processor()->perf_stats();
|
||||
auto proc_perf = socket_->cluster()->processor()->perf_stats();
|
||||
auto cluster_perf = socket_->cluster()->perf_stats();
|
||||
auto socket_perf = socket_->perf_stats();
|
||||
auto smem_perf = shared_mem_->perf_stats();
|
||||
switch (addr) {
|
||||
case VX_CSR_MPM_ICACHE_READS: return proc_perf.clusters.icache.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_ICACHE_READS_H: return proc_perf.clusters.icache.reads >> 32;
|
||||
case VX_CSR_MPM_ICACHE_MISS_R: return proc_perf.clusters.icache.read_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_ICACHE_MISS_R_H: return proc_perf.clusters.icache.read_misses >> 32;
|
||||
case VX_CSR_MPM_ICACHE_READS: return socket_perf.icache.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_ICACHE_READS_H: return socket_perf.icache.reads >> 32;
|
||||
case VX_CSR_MPM_ICACHE_MISS_R: return socket_perf.icache.read_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_ICACHE_MISS_R_H: return socket_perf.icache.read_misses >> 32;
|
||||
case VX_CSR_MPM_ICACHE_MSHR_ST: return socket_perf.icache.mshr_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_ICACHE_MSHR_ST_H: return socket_perf.icache.mshr_stalls >> 32;
|
||||
|
||||
case VX_CSR_MPM_DCACHE_READS: return proc_perf.clusters.dcache.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_READS_H: return proc_perf.clusters.dcache.reads >> 32;
|
||||
case VX_CSR_MPM_DCACHE_WRITES: return proc_perf.clusters.dcache.writes & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_WRITES_H: return proc_perf.clusters.dcache.writes >> 32;
|
||||
case VX_CSR_MPM_DCACHE_MISS_R: return proc_perf.clusters.dcache.read_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_MISS_R_H: return proc_perf.clusters.dcache.read_misses >> 32;
|
||||
case VX_CSR_MPM_DCACHE_MISS_W: return proc_perf.clusters.dcache.write_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_MISS_W_H: return proc_perf.clusters.dcache.write_misses >> 32;
|
||||
case VX_CSR_MPM_DCACHE_BANK_ST: return proc_perf.clusters.dcache.bank_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_BANK_ST_H:return proc_perf.clusters.dcache.bank_stalls >> 32;
|
||||
case VX_CSR_MPM_DCACHE_MSHR_ST: return proc_perf.clusters.dcache.mshr_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_MSHR_ST_H:return proc_perf.clusters.dcache.mshr_stalls >> 32;
|
||||
|
||||
case VX_CSR_MPM_SMEM_READS: return proc_perf.clusters.sharedmem.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_SMEM_READS_H: return proc_perf.clusters.sharedmem.reads >> 32;
|
||||
case VX_CSR_MPM_SMEM_WRITES: return proc_perf.clusters.sharedmem.writes & 0xffffffff;
|
||||
case VX_CSR_MPM_SMEM_WRITES_H: return proc_perf.clusters.sharedmem.writes >> 32;
|
||||
case VX_CSR_MPM_SMEM_BANK_ST: return proc_perf.clusters.sharedmem.bank_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_SMEM_BANK_ST_H:return proc_perf.clusters.sharedmem.bank_stalls >> 32;
|
||||
case VX_CSR_MPM_DCACHE_READS: return socket_perf.dcache.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_READS_H: return socket_perf.dcache.reads >> 32;
|
||||
case VX_CSR_MPM_DCACHE_WRITES: return socket_perf.dcache.writes & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_WRITES_H: return socket_perf.dcache.writes >> 32;
|
||||
case VX_CSR_MPM_DCACHE_MISS_R: return socket_perf.dcache.read_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_MISS_R_H: return socket_perf.dcache.read_misses >> 32;
|
||||
case VX_CSR_MPM_DCACHE_MISS_W: return socket_perf.dcache.write_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_MISS_W_H: return socket_perf.dcache.write_misses >> 32;
|
||||
case VX_CSR_MPM_DCACHE_BANK_ST: return socket_perf.dcache.bank_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_BANK_ST_H: return socket_perf.dcache.bank_stalls >> 32;
|
||||
case VX_CSR_MPM_DCACHE_MSHR_ST: return socket_perf.dcache.mshr_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_MSHR_ST_H: return socket_perf.dcache.mshr_stalls >> 32;
|
||||
|
||||
case VX_CSR_MPM_L2CACHE_READS: return proc_perf.clusters.l2cache.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_READS_H: return proc_perf.clusters.l2cache.reads >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_WRITES: return proc_perf.clusters.l2cache.writes & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_WRITES_H: return proc_perf.clusters.l2cache.writes >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_R: return proc_perf.clusters.l2cache.read_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_R_H: return proc_perf.clusters.l2cache.read_misses >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_W: return proc_perf.clusters.l2cache.write_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_W_H: return proc_perf.clusters.l2cache.write_misses >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_BANK_ST: return proc_perf.clusters.l2cache.bank_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_BANK_ST_H:return proc_perf.clusters.l2cache.bank_stalls >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_MSHR_ST: return proc_perf.clusters.l2cache.mshr_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_MSHR_ST_H:return proc_perf.clusters.l2cache.mshr_stalls >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_READS: return cluster_perf.l2cache.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_READS_H: return cluster_perf.l2cache.reads >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_WRITES: return cluster_perf.l2cache.writes & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_WRITES_H: return cluster_perf.l2cache.writes >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_R: return cluster_perf.l2cache.read_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_R_H: return cluster_perf.l2cache.read_misses >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_W: return cluster_perf.l2cache.write_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_W_H: return cluster_perf.l2cache.write_misses >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_BANK_ST: return cluster_perf.l2cache.bank_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_BANK_ST_H:return cluster_perf.l2cache.bank_stalls >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_MSHR_ST: return cluster_perf.l2cache.mshr_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_MSHR_ST_H:return cluster_perf.l2cache.mshr_stalls >> 32;
|
||||
|
||||
case VX_CSR_MPM_L3CACHE_READS: return proc_perf.l3cache.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_L3CACHE_READS_H: return proc_perf.l3cache.reads >> 32;
|
||||
@@ -612,14 +660,25 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
||||
case VX_CSR_MPM_L3CACHE_MSHR_ST: return proc_perf.l3cache.mshr_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_L3CACHE_MSHR_ST_H:return proc_perf.l3cache.mshr_stalls >> 32;
|
||||
|
||||
case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff;
|
||||
case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
|
||||
case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff;
|
||||
case VX_CSR_MPM_MEM_WRITES_H:return proc_perf.mem_writes >> 32;
|
||||
case VX_CSR_MPM_MEM_LAT: return proc_perf.mem_latency & 0xffffffff;
|
||||
case VX_CSR_MPM_MEM_LAT_H: return proc_perf.mem_latency >> 32;
|
||||
case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff;
|
||||
case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
|
||||
case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff;
|
||||
case VX_CSR_MPM_MEM_WRITES_H: return proc_perf.mem_writes >> 32;
|
||||
case VX_CSR_MPM_MEM_LT: return proc_perf.mem_latency & 0xffffffff;
|
||||
case VX_CSR_MPM_MEM_LT_H : return proc_perf.mem_latency >> 32;
|
||||
|
||||
case VX_CSR_MPM_SMEM_READS: return smem_perf.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_SMEM_READS_H: return smem_perf.reads >> 32;
|
||||
case VX_CSR_MPM_SMEM_WRITES: return smem_perf.writes & 0xffffffff;
|
||||
case VX_CSR_MPM_SMEM_WRITES_H: return smem_perf.writes >> 32;
|
||||
case VX_CSR_MPM_SMEM_BANK_ST: return smem_perf.bank_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_SMEM_BANK_ST_H: return smem_perf.bank_stalls >> 32;
|
||||
}
|
||||
} break;
|
||||
default: {
|
||||
std::cout << std::dec << "Error: invalid MPM CLASS: value=" << perf_class << std::endl;
|
||||
std::abort();
|
||||
} break;
|
||||
}
|
||||
} else {
|
||||
std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl;
|
||||
|
||||
@@ -22,11 +22,11 @@
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <simobject.h>
|
||||
#include <mem.h>
|
||||
#include "debug.h"
|
||||
#include "types.h"
|
||||
#include "arch.h"
|
||||
#include "decode.h"
|
||||
#include "mem.h"
|
||||
#include "warp.h"
|
||||
#include "pipeline.h"
|
||||
#include "cache_sim.h"
|
||||
@@ -40,19 +40,25 @@
|
||||
|
||||
namespace vortex {
|
||||
|
||||
class Cluster;
|
||||
class Socket;
|
||||
|
||||
using TraceSwitch = Mux<pipeline_trace_t*>;
|
||||
|
||||
class Core : public SimObject<Core> {
|
||||
public:
|
||||
struct PerfStats {
|
||||
uint64_t cycles;
|
||||
uint64_t instrs;
|
||||
uint64_t sched_idle;
|
||||
uint64_t sched_stalls;
|
||||
uint64_t ibuf_stalls;
|
||||
uint64_t scrb_stalls;
|
||||
uint64_t alu_stalls;
|
||||
uint64_t lsu_stalls;
|
||||
uint64_t fpu_stalls;
|
||||
uint64_t sfu_stalls;
|
||||
uint64_t scrb_alu;
|
||||
uint64_t scrb_fpu;
|
||||
uint64_t scrb_lsu;
|
||||
uint64_t scrb_sfu;
|
||||
uint64_t scrb_wctl;
|
||||
uint64_t scrb_csrs;
|
||||
uint64_t ifetches;
|
||||
uint64_t loads;
|
||||
uint64_t stores;
|
||||
@@ -62,12 +68,16 @@ public:
|
||||
PerfStats()
|
||||
: cycles(0)
|
||||
, instrs(0)
|
||||
, sched_idle(0)
|
||||
, sched_stalls(0)
|
||||
, ibuf_stalls(0)
|
||||
, scrb_stalls(0)
|
||||
, alu_stalls(0)
|
||||
, lsu_stalls(0)
|
||||
, fpu_stalls(0)
|
||||
, sfu_stalls(0)
|
||||
, scrb_alu(0)
|
||||
, scrb_fpu(0)
|
||||
, scrb_lsu(0)
|
||||
, scrb_sfu(0)
|
||||
, scrb_wctl(0)
|
||||
, scrb_csrs(0)
|
||||
, ifetches(0)
|
||||
, loads(0)
|
||||
, stores(0)
|
||||
@@ -84,10 +94,9 @@ public:
|
||||
|
||||
Core(const SimContext& ctx,
|
||||
uint32_t core_id,
|
||||
Cluster* cluster,
|
||||
Socket* socket,
|
||||
const Arch &arch,
|
||||
const DCRS &dcrs,
|
||||
SharedMem::Ptr sharedmem);
|
||||
const DCRS &dcrs);
|
||||
|
||||
~Core();
|
||||
|
||||
@@ -105,6 +114,10 @@ public:
|
||||
return core_id_;
|
||||
}
|
||||
|
||||
Socket* socket() const {
|
||||
return socket_;
|
||||
}
|
||||
|
||||
const Arch& arch() const {
|
||||
return arch_;
|
||||
}
|
||||
@@ -153,6 +166,7 @@ private:
|
||||
void cout_flush();
|
||||
|
||||
uint32_t core_id_;
|
||||
Socket* socket_;
|
||||
const Arch& arch_;
|
||||
const DCRS &dcrs_;
|
||||
|
||||
@@ -167,13 +181,13 @@ private:
|
||||
std::vector<Operand::Ptr> operands_;
|
||||
std::vector<Dispatcher::Ptr> dispatchers_;
|
||||
std::vector<ExeUnit::Ptr> exe_units_;
|
||||
SharedMem::Ptr sharedmem_;
|
||||
SharedMem::Ptr shared_mem_;
|
||||
std::vector<SMemDemux::Ptr> smem_demuxs_;
|
||||
|
||||
PipelineLatch fetch_latch_;
|
||||
PipelineLatch decode_latch_;
|
||||
|
||||
HashTable<pipeline_trace_t*> pending_icache_;
|
||||
std::vector<pipeline_trace_t*> committed_traces_;
|
||||
WarpMask active_warps_;
|
||||
WarpMask stalled_warps_;
|
||||
uint64_t issued_instrs_;
|
||||
@@ -188,9 +202,10 @@ private:
|
||||
|
||||
PerfStats perf_stats_;
|
||||
|
||||
Cluster* cluster_;
|
||||
std::vector<TraceSwitch::Ptr> commit_arbs_;
|
||||
|
||||
uint32_t commit_exe_;
|
||||
uint32_t ibuffer_idx_;
|
||||
|
||||
friend class Warp;
|
||||
friend class LsuUnit;
|
||||
|
||||
@@ -66,6 +66,7 @@ public:
|
||||
}
|
||||
auto& output = Outputs.at(i);
|
||||
auto trace = input.front();
|
||||
auto new_trace = trace;
|
||||
if (pid_count_ != 1) {
|
||||
auto start_p = start_p_.at(b);
|
||||
if (start_p == -1) {
|
||||
@@ -81,33 +82,30 @@ public:
|
||||
end = j;
|
||||
}
|
||||
start /= num_lanes_;
|
||||
end /= num_lanes_;
|
||||
auto new_trace = new pipeline_trace_t(*trace);
|
||||
new_trace->tmask.reset();
|
||||
for (int j = start * num_lanes_, n = j + num_lanes_; j < n; ++j) {
|
||||
new_trace->tmask[j] = trace->tmask[j];
|
||||
}
|
||||
new_trace->pid = start;
|
||||
new_trace->sop = (start_p == 0);
|
||||
if (start == end) {
|
||||
new_trace->eop = 1;
|
||||
end /= num_lanes_;
|
||||
if (start != end) {
|
||||
new_trace = new pipeline_trace_t(*trace);
|
||||
new_trace->eop = false;
|
||||
start_p_.at(b) = start + 1;
|
||||
} else {
|
||||
start_p_.at(b) = -1;
|
||||
input.pop();
|
||||
++block_sent;
|
||||
delete trace;
|
||||
} else {
|
||||
new_trace->eop = 0;
|
||||
start_p_.at(b) = start + 1;
|
||||
}
|
||||
output.send(new_trace, 1);
|
||||
DT(3, "pipeline-dispatch: " << *new_trace);
|
||||
}
|
||||
new_trace->pid = start;
|
||||
new_trace->sop = (0 == start_p);
|
||||
ThreadMask tmask;
|
||||
for (int j = start * num_lanes_, n = j + num_lanes_; j < n; ++j) {
|
||||
tmask[j] = trace->tmask[j];
|
||||
}
|
||||
new_trace->tmask = tmask;
|
||||
} else {
|
||||
trace->pid = 0;
|
||||
new_trace->pid = 0;
|
||||
input.pop();
|
||||
output.send(trace, 1);
|
||||
DT(3, "pipeline-dispatch: " << *trace);
|
||||
++block_sent;
|
||||
}
|
||||
}
|
||||
DT(3, "pipeline-dispatch: " << *new_trace);
|
||||
output.send(new_trace, 1);
|
||||
}
|
||||
if (block_sent == block_size_) {
|
||||
batch_idx_ = (batch_idx_ + 1) % batch_count_;
|
||||
@@ -138,4 +136,4 @@ private:
|
||||
std::vector<int> start_p_;
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -51,8 +51,7 @@ void AluUnit::tick() {
|
||||
assert(core_->stalled_warps_.test(trace->wid));
|
||||
core_->stalled_warps_.reset(trace->wid);
|
||||
}
|
||||
auto time = input.pop();
|
||||
core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time);
|
||||
input.pop();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -87,8 +86,7 @@ void FpuUnit::tick() {
|
||||
std::abort();
|
||||
}
|
||||
DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace);
|
||||
auto time = input.pop();
|
||||
core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
|
||||
input.pop();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -114,7 +112,7 @@ void LsuUnit::tick() {
|
||||
|
||||
// handle dcache response
|
||||
for (uint32_t t = 0; t < num_lanes_; ++t) {
|
||||
auto& dcache_rsp_port = core_->dcache_rsp_ports.at(t);
|
||||
auto& dcache_rsp_port = core_->smem_demuxs_.at(t)->RspIn;
|
||||
if (dcache_rsp_port.empty())
|
||||
continue;
|
||||
auto& mem_rsp = dcache_rsp_port.front();
|
||||
@@ -136,7 +134,7 @@ void LsuUnit::tick() {
|
||||
|
||||
// handle shared memory response
|
||||
for (uint32_t t = 0; t < num_lanes_; ++t) {
|
||||
auto& smem_rsp_port = core_->sharedmem_->Outputs.at(t);
|
||||
auto& smem_rsp_port = core_->shared_mem_->Outputs.at(t);
|
||||
if (smem_rsp_port.empty())
|
||||
continue;
|
||||
auto& mem_rsp = smem_rsp_port.front();
|
||||
@@ -184,8 +182,7 @@ void LsuUnit::tick() {
|
||||
fence_lock_ = true;
|
||||
DT(3, "fence-lock: " << *trace);
|
||||
// remove input
|
||||
auto time = input.pop();
|
||||
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
|
||||
input.pop();
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -213,7 +210,9 @@ void LsuUnit::tick() {
|
||||
auto mem_addr = trace_data->mem_addrs.at(t).addr & ~addr_mask;
|
||||
matches += (addr0 == mem_addr);
|
||||
}
|
||||
#ifdef LSU_DUP_ENABLE
|
||||
is_dup = (matches == trace->tmask.count());
|
||||
#endif
|
||||
}
|
||||
|
||||
uint32_t addr_count;
|
||||
@@ -229,7 +228,7 @@ void LsuUnit::tick() {
|
||||
if (!trace->tmask.test(t0 + t))
|
||||
continue;
|
||||
|
||||
auto& dcache_req_port = core_->dcache_req_ports.at(t);
|
||||
auto& dcache_req_port = core_->smem_demuxs_.at(t)->ReqIn;
|
||||
auto mem_addr = trace_data->mem_addrs.at(t);
|
||||
auto type = core_->get_addr_type(mem_addr.addr);
|
||||
|
||||
@@ -241,12 +240,16 @@ void LsuUnit::tick() {
|
||||
mem_req.cid = trace->cid;
|
||||
mem_req.uuid = trace->uuid;
|
||||
|
||||
dcache_req_port.send(mem_req, 2);
|
||||
dcache_req_port.send(mem_req, 1);
|
||||
DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
|
||||
<< ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.type << ", " << *trace);
|
||||
|
||||
++pending_loads_;
|
||||
++core_->perf_stats_.loads;
|
||||
if (is_write) {
|
||||
++core_->perf_stats_.stores;
|
||||
} else {
|
||||
++core_->perf_stats_.loads;
|
||||
++pending_loads_;
|
||||
}
|
||||
if (is_dup)
|
||||
break;
|
||||
}
|
||||
@@ -254,13 +257,11 @@ void LsuUnit::tick() {
|
||||
// do not wait on writes
|
||||
if (is_write) {
|
||||
pending_rd_reqs_.release(tag);
|
||||
output.send(trace, 1);
|
||||
++core_->perf_stats_.stores;
|
||||
output.send(trace, 1);
|
||||
}
|
||||
|
||||
// remove input
|
||||
auto time = input.pop();
|
||||
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
|
||||
input.pop();
|
||||
|
||||
break; // single block
|
||||
}
|
||||
@@ -318,10 +319,7 @@ void SfuUnit::tick() {
|
||||
core_->stalled_warps_.reset(trace->wid);
|
||||
}
|
||||
|
||||
auto time = input.pop();
|
||||
auto stalls = (SimPlatform::instance().cycles() - time);
|
||||
|
||||
core_->perf_stats_.sfu_stalls += stalls;
|
||||
input.pop();
|
||||
|
||||
break; // single block
|
||||
}
|
||||
|
||||
@@ -34,14 +34,13 @@ static void show_usage() {
|
||||
uint32_t num_threads = NUM_THREADS;
|
||||
uint32_t num_warps = NUM_WARPS;
|
||||
uint32_t num_cores = NUM_CORES;
|
||||
uint32_t num_clusters = NUM_CLUSTERS;
|
||||
bool showStats = false;;
|
||||
bool riscv_test = false;
|
||||
const char* program = nullptr;
|
||||
|
||||
static void parse_args(int argc, char **argv) {
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "t:w:c:g:rsh?")) != -1) {
|
||||
while ((c = getopt(argc, argv, "t:w:c:rsh?")) != -1) {
|
||||
switch (c) {
|
||||
case 't':
|
||||
num_threads = atoi(optarg);
|
||||
@@ -51,9 +50,6 @@ static void parse_args(int argc, char **argv) {
|
||||
break;
|
||||
case 'c':
|
||||
num_cores = atoi(optarg);
|
||||
break;
|
||||
case 'g':
|
||||
num_clusters = atoi(optarg);
|
||||
break;
|
||||
case 'r':
|
||||
riscv_test = true;
|
||||
@@ -88,7 +84,7 @@ int main(int argc, char **argv) {
|
||||
|
||||
{
|
||||
// create processor configuation
|
||||
Arch arch(num_threads, num_warps, num_cores, num_clusters);
|
||||
Arch arch(num_threads, num_warps, num_cores);
|
||||
|
||||
// create memory module
|
||||
RAM ram(RAM_PAGE_SIZE);
|
||||
|
||||
@@ -32,18 +32,17 @@ ProcessorImpl::ProcessorImpl(const Arch& arch)
|
||||
l3cache_ = CacheSim::Create("l3cache", CacheSim::Config{
|
||||
!L3_ENABLED,
|
||||
log2ceil(L3_CACHE_SIZE), // C
|
||||
log2ceil(MEM_BLOCK_SIZE), // B
|
||||
log2ceil(L3_NUM_WAYS), // W
|
||||
0, // A
|
||||
XLEN, // address bits
|
||||
L3_NUM_BANKS, // number of banks
|
||||
1, // number of ports
|
||||
log2ceil(MEM_BLOCK_SIZE), // L
|
||||
log2ceil(L3_NUM_WAYS), // W
|
||||
0, // A
|
||||
log2ceil(L3_NUM_BANKS), // B
|
||||
XLEN, // address bits
|
||||
1, // number of ports
|
||||
uint8_t(arch.num_clusters()), // request size
|
||||
true, // write-through
|
||||
false, // write response
|
||||
0, // victim size
|
||||
L3_MSHR_SIZE, // mshr
|
||||
2, // pipeline latency
|
||||
true, // write-through
|
||||
false, // write response
|
||||
L3_MSHR_SIZE, // mshr
|
||||
2, // pipeline latency
|
||||
}
|
||||
);
|
||||
|
||||
@@ -114,6 +113,7 @@ void ProcessorImpl::reset() {
|
||||
perf_mem_writes_ = 0;
|
||||
perf_mem_latency_ = 0;
|
||||
perf_mem_pending_reads_ = 0;
|
||||
|
||||
}
|
||||
|
||||
void ProcessorImpl::write_dcr(uint32_t addr, uint32_t value) {
|
||||
@@ -126,9 +126,6 @@ ProcessorImpl::PerfStats ProcessorImpl::perf_stats() const {
|
||||
perf.mem_writes = perf_mem_writes_;
|
||||
perf.mem_latency = perf_mem_latency_;
|
||||
perf.l3cache = l3cache_->perf_stats();
|
||||
for (auto cluster : clusters_) {
|
||||
perf.clusters += cluster->perf_stats();
|
||||
}
|
||||
return perf;
|
||||
}
|
||||
|
||||
|
||||
@@ -24,17 +24,10 @@ namespace vortex {
|
||||
class ProcessorImpl {
|
||||
public:
|
||||
struct PerfStats {
|
||||
CacheSim::PerfStats l3cache;
|
||||
uint64_t mem_reads;
|
||||
uint64_t mem_writes;
|
||||
uint64_t mem_latency;
|
||||
CacheSim::PerfStats l3cache;
|
||||
Cluster::PerfStats clusters;
|
||||
|
||||
PerfStats()
|
||||
: mem_reads(0)
|
||||
, mem_writes(0)
|
||||
, mem_latency(0)
|
||||
{}
|
||||
};
|
||||
|
||||
ProcessorImpl(const Arch& arch);
|
||||
@@ -46,7 +39,7 @@ public:
|
||||
|
||||
void write_dcr(uint32_t addr, uint32_t value);
|
||||
|
||||
ProcessorImpl::PerfStats perf_stats() const;
|
||||
PerfStats perf_stats() const;
|
||||
|
||||
private:
|
||||
|
||||
@@ -55,7 +48,7 @@ private:
|
||||
const Arch& arch_;
|
||||
std::vector<std::shared_ptr<Cluster>> clusters_;
|
||||
DCRS dcrs_;
|
||||
MemSim::Ptr memsim_;
|
||||
MemSim::Ptr memsim_;
|
||||
CacheSim::Ptr l3cache_;
|
||||
uint64_t perf_mem_reads_;
|
||||
uint64_t perf_mem_writes_;
|
||||
|
||||
@@ -22,9 +22,11 @@ class Scoreboard {
|
||||
public:
|
||||
|
||||
struct reg_use_t {
|
||||
RegType type;
|
||||
uint32_t reg;
|
||||
uint64_t owner;
|
||||
RegType reg_type;
|
||||
uint32_t reg_id;
|
||||
ExeType exe_type;
|
||||
SfuType sfu_type;
|
||||
uint64_t uuid;
|
||||
};
|
||||
|
||||
Scoreboard(const Arch &arch)
|
||||
@@ -44,89 +46,81 @@ public:
|
||||
owners_.clear();
|
||||
}
|
||||
|
||||
bool in_use(pipeline_trace_t* state) const {
|
||||
return (state->used_iregs & in_use_iregs_.at(state->wid)) != 0
|
||||
|| (state->used_fregs & in_use_fregs_.at(state->wid)) != 0
|
||||
|| (state->used_vregs & in_use_vregs_.at(state->wid)) != 0;
|
||||
bool in_use(pipeline_trace_t* trace) const {
|
||||
return (trace->used_iregs & in_use_iregs_.at(trace->wid)) != 0
|
||||
|| (trace->used_fregs & in_use_fregs_.at(trace->wid)) != 0
|
||||
|| (trace->used_vregs & in_use_vregs_.at(trace->wid)) != 0;
|
||||
}
|
||||
|
||||
std::vector<reg_use_t> get_uses(pipeline_trace_t* state) const {
|
||||
std::vector<reg_use_t> out;
|
||||
{
|
||||
uint32_t r = 0;
|
||||
auto used_iregs = state->used_iregs & in_use_iregs_.at(state->wid);
|
||||
while (used_iregs.any()) {
|
||||
if (used_iregs.test(0)) {
|
||||
uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Integer;
|
||||
out.push_back({RegType::Integer, r, owners_.at(tag)});
|
||||
}
|
||||
used_iregs >>= 1;
|
||||
++r;
|
||||
std::vector<reg_use_t> get_uses(pipeline_trace_t* trace) const {
|
||||
std::vector<reg_use_t> out;
|
||||
|
||||
auto used_iregs = trace->used_iregs & in_use_iregs_.at(trace->wid);
|
||||
auto used_fregs = trace->used_fregs & in_use_fregs_.at(trace->wid);
|
||||
auto used_vregs = trace->used_vregs & in_use_vregs_.at(trace->wid);
|
||||
|
||||
for (uint32_t r = 0; r < MAX_NUM_REGS; ++r) {
|
||||
if (used_iregs.test(r)) {
|
||||
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Integer;
|
||||
auto owner = owners_.at(tag);
|
||||
out.push_back({RegType::Integer, r, owner->exe_type, owner->sfu_type, owner->uuid});
|
||||
}
|
||||
}
|
||||
{
|
||||
uint32_t r = 0;
|
||||
auto used_fregs = state->used_fregs & in_use_fregs_.at(state->wid);
|
||||
while (used_fregs.any()) {
|
||||
if (used_fregs.test(0)) {
|
||||
uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Float;
|
||||
out.push_back({RegType::Float, r, owners_.at(tag)});
|
||||
}
|
||||
used_fregs >>= 1;
|
||||
++r;
|
||||
|
||||
for (uint32_t r = 0; r < MAX_NUM_REGS; ++r) {
|
||||
if (used_fregs.test(r)) {
|
||||
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Float;
|
||||
auto owner = owners_.at(tag);
|
||||
out.push_back({RegType::Float, r, owner->exe_type, owner->sfu_type, owner->uuid});
|
||||
}
|
||||
}
|
||||
{
|
||||
uint32_t r = 0;
|
||||
auto used_vregs = state->used_vregs & in_use_vregs_.at(state->wid);
|
||||
while (used_vregs.any()) {
|
||||
if (used_vregs.test(0)) {
|
||||
uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Vector;
|
||||
out.push_back({RegType::Vector, r, owners_.at(tag)});
|
||||
}
|
||||
used_vregs >>= 1;
|
||||
++r;
|
||||
|
||||
for (uint32_t r = 0; r < MAX_NUM_REGS; ++r) {
|
||||
if (used_vregs.test(r)) {
|
||||
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Vector;
|
||||
auto owner = owners_.at(tag);
|
||||
out.push_back({RegType::Vector, r, owner->exe_type, owner->sfu_type, owner->uuid});
|
||||
}
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
void reserve(pipeline_trace_t* state) {
|
||||
assert(state->wb);
|
||||
switch (state->rdest_type) {
|
||||
void reserve(pipeline_trace_t* trace) {
|
||||
assert(trace->wb);
|
||||
switch (trace->rdest_type) {
|
||||
case RegType::Integer:
|
||||
in_use_iregs_.at(state->wid).set(state->rdest);
|
||||
in_use_iregs_.at(trace->wid).set(trace->rdest);
|
||||
break;
|
||||
case RegType::Float:
|
||||
in_use_fregs_.at(state->wid).set(state->rdest);
|
||||
in_use_fregs_.at(trace->wid).set(trace->rdest);
|
||||
break;
|
||||
case RegType::Vector:
|
||||
in_use_vregs_.at(state->wid).set(state->rdest);
|
||||
break;
|
||||
default:
|
||||
in_use_vregs_.at(trace->wid).set(trace->rdest);
|
||||
break;
|
||||
default: assert(false);
|
||||
}
|
||||
uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
|
||||
uint32_t tag = (trace->rdest << 16) | (trace->wid << 4) | (int)trace->rdest_type;
|
||||
assert(owners_.count(tag) == 0);
|
||||
owners_[tag] = state->uuid;
|
||||
owners_[tag] = trace;
|
||||
assert((int)trace->exe_type < 5);
|
||||
}
|
||||
|
||||
void release(pipeline_trace_t* state) {
|
||||
assert(state->wb);
|
||||
switch (state->rdest_type) {
|
||||
void release(pipeline_trace_t* trace) {
|
||||
assert(trace->wb);
|
||||
switch (trace->rdest_type) {
|
||||
case RegType::Integer:
|
||||
in_use_iregs_.at(state->wid).reset(state->rdest);
|
||||
in_use_iregs_.at(trace->wid).reset(trace->rdest);
|
||||
break;
|
||||
case RegType::Float:
|
||||
in_use_fregs_.at(state->wid).reset(state->rdest);
|
||||
in_use_fregs_.at(trace->wid).reset(trace->rdest);
|
||||
break;
|
||||
case RegType::Vector:
|
||||
in_use_vregs_.at(state->wid).reset(state->rdest);
|
||||
break;
|
||||
default:
|
||||
in_use_vregs_.at(trace->wid).reset(trace->rdest);
|
||||
break;
|
||||
default: assert(false);
|
||||
}
|
||||
uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
|
||||
uint32_t tag = (trace->rdest << 16) | (trace->wid << 4) | (int)trace->rdest_type;
|
||||
owners_.erase(tag);
|
||||
}
|
||||
|
||||
@@ -135,7 +129,7 @@ private:
|
||||
std::vector<RegMask> in_use_iregs_;
|
||||
std::vector<RegMask> in_use_fregs_;
|
||||
std::vector<RegMask> in_use_vregs_;
|
||||
std::unordered_map<uint32_t, uint64_t> owners_;
|
||||
std::unordered_map<uint32_t, pipeline_trace_t*> owners_;
|
||||
};
|
||||
|
||||
}
|
||||
149
sim/simx/socket.cpp
Normal file
149
sim/simx/socket.cpp
Normal file
@@ -0,0 +1,149 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "socket.h"
|
||||
#include "cluster.h"
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
// Builds one socket: an instruction-cache cluster, a data-cache cluster and
// `arch.socket_size()` cores, and wires the core-side cache ports to them.
//
// ctx       - simulation context forwarded to SimObject.
// socket_id - index of this socket; used in cache names and to derive core ids.
// cluster   - owning cluster (stored, not owned); barrier() forwards to it.
// arch      - architecture description (socket size, warp count).
// dcrs      - device configuration registers handed to every core.
Socket::Socket(const SimContext& ctx,
               uint32_t socket_id,
               Cluster* cluster,
               const Arch &arch,
               const DCRS &dcrs)
  : SimObject(ctx, "socket")
  , icache_mem_req_port(this)
  , icache_mem_rsp_port(this)
  , dcache_mem_req_port(this)
  , dcache_mem_rsp_port(this)
  , socket_id_(socket_id)
  , cluster_(cluster)
  , cores_(arch.socket_size())
{
  auto cores_per_socket = cores_.size();

  // socket_id is unsigned: format it with %u (the previous %d mismatched the
  // argument type) and let sizeof() track the buffer size.
  char sname[100];
  snprintf(sname, sizeof(sname), "socket%u-icaches", static_cast<unsigned>(socket_id));
  icaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_ICACHES, 1, CacheSim::Config{
    !ICACHE_ENABLED,
    log2ceil(ICACHE_SIZE),      // C
    log2ceil(L1_LINE_SIZE),     // L
    log2ceil(sizeof(uint32_t)), // W
    log2ceil(ICACHE_NUM_WAYS),  // A
    // NOTE(review): the dcache passes log2ceil(num banks) in this slot; a
    // literal 1 here would mean two banks if the field is a log2 - confirm.
    1,                          // B
    XLEN,                       // address bits
    1,                          // number of ports
    1,                          // number of inputs
    true,                       // write-through
    false,                      // write response
    (uint8_t)arch.num_warps(),  // mshr
    2,                          // pipeline latency
  });

  // expose the icache cluster's memory side through the socket's own ports
  icaches_->MemReqPort.bind(&icache_mem_req_port);
  icache_mem_rsp_port.bind(&icaches_->MemRspPort);

  snprintf(sname, sizeof(sname), "socket%u-dcaches", static_cast<unsigned>(socket_id));
  dcaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
    !DCACHE_ENABLED,
    log2ceil(DCACHE_SIZE),      // C
    log2ceil(L1_LINE_SIZE),     // L
    log2ceil(sizeof(Word)),     // W
    log2ceil(DCACHE_NUM_WAYS),  // A
    log2ceil(DCACHE_NUM_BANKS), // B
    XLEN,                       // address bits
    1,                          // number of ports
    DCACHE_NUM_BANKS,           // number of inputs
    true,                       // write-through
    false,                      // write response
    DCACHE_MSHR_SIZE,           // mshr
    2,                          // pipeline latency
  });

  // expose the dcache cluster's memory side through the socket's own ports
  dcaches_->MemReqPort.bind(&dcache_mem_req_port);
  dcache_mem_rsp_port.bind(&dcaches_->MemRspPort);

  // create cores

  for (uint32_t i = 0; i < cores_per_socket; ++i) {
    // core ids are numbered consecutively across sockets
    uint32_t core_id = socket_id * cores_per_socket + i;
    cores_.at(i) = Core::Create(core_id,
                                this,
                                arch,
                                dcrs);

    // single instruction-fetch port per core
    cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
    icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));

    // one data-cache request/response port pair per LSU lane
    for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) {
      cores_.at(i)->dcache_req_ports.at(j).bind(&dcaches_->CoreReqPorts.at(i).at(j));
      dcaches_->CoreRspPorts.at(i).at(j).bind(&cores_.at(i)->dcache_rsp_ports.at(j));
    }
  }
}
|
||||
|
||||
// Nothing to release explicitly; members clean up through their own destructors.
Socket::~Socket() = default;
|
||||
|
||||
// Reset hook: intentionally a no-op, the socket performs no work here.
void Socket::reset() {
  // nothing to do
}
|
||||
|
||||
// Per-cycle hook: intentionally a no-op, the socket performs no work here.
void Socket::tick() {
  // nothing to do
}
|
||||
|
||||
// Forwards the RAM pointer to every core in this socket.
// Iterates by reference (like running() and check_exit()) so that no core
// handle is copied per iteration.
void Socket::attach_ram(RAM* ram) {
  for (auto& core : cores_) {
    core->attach_ram(ram);
  }
}
|
||||
|
||||
bool Socket::running() const {
|
||||
for (auto& core : cores_) {
|
||||
if (core->running())
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Polls every core for its exit status.
// exitcode   - receives the bitwise OR of the exit codes of the cores whose
//              check_exit() returned true (0 if none did).
// riscv_test - forwarded unchanged to each core's check_exit().
// Returns true only when every core's check_exit() returned true.
bool Socket::check_exit(Word* exitcode, bool riscv_test) const {
  Word merged_code = 0;
  bool all_exited = true;
  for (auto it = cores_.begin(); it != cores_.end(); ++it) {
    Word core_code;
    if (!(*it)->check_exit(&core_code, riscv_test)) {
      // keep polling the remaining cores so their codes are still merged
      all_exited = false;
      continue;
    }
    merged_code |= core_code;
  }
  *exitcode = merged_code;
  return all_exited;
}
|
||||
|
||||
// Forwards a barrier request to the owning cluster, translating the
// socket-local core_id by the offset socket_id_ * (cores per socket).
void Socket::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
  const auto cluster_core_id = socket_id_ * cores_.size() + core_id;
  cluster_->barrier(bar_id, count, cluster_core_id);
}
|
||||
|
||||
// Resumes the core at position core_index within this socket.
// The lookup is bounds-checked through vector::at().
void Socket::resume(uint32_t core_index) {
  auto& target_core = cores_.at(core_index);
  target_core->resume();
}
|
||||
|
||||
// Collects the performance counters of the socket's instruction- and
// data-cache clusters into a single PerfStats value.
Socket::PerfStats Socket::perf_stats() const {
  PerfStats stats;
  stats.icache = icaches_->perf_stats();
  stats.dcache = dcaches_->perf_stats();
  return stats;
}
|
||||
81
sim/simx/socket.h
Normal file
81
sim/simx/socket.h
Normal file
@@ -0,0 +1,81 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <simobject.h>
|
||||
#include "dcrs.h"
|
||||
#include "arch.h"
|
||||
#include "cache_cluster.h"
|
||||
#include "shared_mem.h"
|
||||
#include "core.h"
|
||||
#include "constants.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
class Cluster;
|
||||
|
||||
// A socket groups a set of cores together with the instruction- and
// data-cache clusters they share, and exposes the memory side of those
// caches through four ports.
class Socket : public SimObject<Socket> {
public:
  // Cache counters reported by perf_stats().
  struct PerfStats {
    CacheSim::PerfStats icache;
    CacheSim::PerfStats dcache;
  };

  // Memory-side request/response ports of the instruction caches.
  SimPort<MemReq> icache_mem_req_port;
  SimPort<MemRsp> icache_mem_rsp_port;

  // Memory-side request/response ports of the data caches.
  SimPort<MemReq> dcache_mem_req_port;
  SimPort<MemRsp> dcache_mem_rsp_port;

  Socket(const SimContext& ctx,
         uint32_t socket_id,
         Cluster* cluster,
         const Arch &arch,
         const DCRS &dcrs);

  ~Socket();

  // Identifier passed at construction.
  uint32_t id() const { return socket_id_; }

  // Owning cluster passed at construction (not owned by the socket).
  Cluster* cluster() const { return cluster_; }

  void reset();

  void tick();

  // Hands the RAM pointer to every core.
  void attach_ram(RAM* ram);

  // True while at least one core is running.
  bool running() const;

  // True when all cores have exited; *exitcode receives the OR of their codes.
  bool check_exit(Word* exitcode, bool riscv_test) const;

  // Forwards a barrier request from a socket-local core to the cluster.
  void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);

  // Resumes the core at the given socket-local index.
  void resume(uint32_t core_id);

  PerfStats perf_stats() const;

private:
  uint32_t                socket_id_;
  Cluster*                cluster_;
  std::vector<Core::Ptr>  cores_;
  CacheCluster::Ptr       icaches_;
  CacheCluster::Ptr       dcaches_;
};
|
||||
|
||||
} // namespace vortex
|
||||
171
sim/simx/types.h
171
sim/simx/types.h
@@ -70,6 +70,7 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
|
||||
case RegType::Integer: os << "x"; break;
|
||||
case RegType::Float: os << "f"; break;
|
||||
case RegType::Vector: os << "v"; break;
|
||||
default: assert(false);
|
||||
}
|
||||
return os;
|
||||
}
|
||||
@@ -81,7 +82,7 @@ enum class ExeType {
|
||||
LSU,
|
||||
FPU,
|
||||
SFU,
|
||||
MAX,
|
||||
ExeTypeCount
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
|
||||
@@ -90,7 +91,7 @@ inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
|
||||
case ExeType::LSU: os << "LSU"; break;
|
||||
case ExeType::FPU: os << "FPU"; break;
|
||||
case ExeType::SFU: os << "SFU"; break;
|
||||
case ExeType::MAX: break;
|
||||
default: assert(false);
|
||||
}
|
||||
return os;
|
||||
}
|
||||
@@ -112,6 +113,7 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
|
||||
case AluType::SYSCALL: os << "SYSCALL"; break;
|
||||
case AluType::IMUL: os << "IMUL"; break;
|
||||
case AluType::IDIV: os << "IDIV"; break;
|
||||
default: assert(false);
|
||||
}
|
||||
return os;
|
||||
}
|
||||
@@ -129,6 +131,7 @@ inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
|
||||
case LsuType::LOAD: os << "LOAD"; break;
|
||||
case LsuType::STORE: os << "STORE"; break;
|
||||
case LsuType::FENCE: os << "FENCE"; break;
|
||||
default: assert(false);
|
||||
}
|
||||
return os;
|
||||
}
|
||||
@@ -138,7 +141,7 @@ inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
|
||||
enum class AddrType {
|
||||
Global,
|
||||
Shared,
|
||||
IO,
|
||||
IO
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const AddrType& type) {
|
||||
@@ -146,6 +149,7 @@ inline std::ostream &operator<<(std::ostream &os, const AddrType& type) {
|
||||
case AddrType::Global: os << "Global"; break;
|
||||
case AddrType::Shared: os << "Shared"; break;
|
||||
case AddrType::IO: os << "IO"; break;
|
||||
default: assert(false);
|
||||
}
|
||||
return os;
|
||||
}
|
||||
@@ -164,7 +168,7 @@ enum class FpuType {
|
||||
FMA,
|
||||
FDIV,
|
||||
FSQRT,
|
||||
FCVT,
|
||||
FCVT
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {
|
||||
@@ -174,6 +178,7 @@ inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {
|
||||
case FpuType::FDIV: os << "FDIV"; break;
|
||||
case FpuType::FSQRT: os << "FSQRT"; break;
|
||||
case FpuType::FCVT: os << "FCVT"; break;
|
||||
default: assert(false);
|
||||
}
|
||||
return os;
|
||||
}
|
||||
@@ -190,7 +195,7 @@ enum class SfuType {
|
||||
CSRRW,
|
||||
CSRRS,
|
||||
CSRRC,
|
||||
CMOV
|
||||
CMOV
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
|
||||
@@ -205,6 +210,7 @@ inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
|
||||
case SfuType::CSRRS: os << "CSRRS"; break;
|
||||
case SfuType::CSRRC: os << "CSRRC"; break;
|
||||
case SfuType::CMOV: os << "CMOV"; break;
|
||||
default: assert(false);
|
||||
}
|
||||
return os;
|
||||
}
|
||||
@@ -220,6 +226,7 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
|
||||
switch (type) {
|
||||
case ArbiterType::Priority: os << "Priority"; break;
|
||||
case ArbiterType::RoundRobin: os << "RoundRobin"; break;
|
||||
default: assert(false);
|
||||
}
|
||||
return os;
|
||||
}
|
||||
@@ -351,6 +358,92 @@ private:
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename Type>
|
||||
class Mux : public SimObject<Mux<Type>> {
|
||||
public:
|
||||
std::vector<SimPort<Type>> Inputs;
|
||||
std::vector<SimPort<Type>> Outputs;
|
||||
|
||||
Mux(
|
||||
const SimContext& ctx,
|
||||
const char* name,
|
||||
ArbiterType type,
|
||||
uint32_t num_inputs,
|
||||
uint32_t num_outputs = 1,
|
||||
uint32_t delay = 1
|
||||
) : SimObject<Mux<Type>>(ctx, name)
|
||||
, Inputs(num_inputs, this)
|
||||
, Outputs(num_outputs, this)
|
||||
, type_(type)
|
||||
, delay_(delay)
|
||||
, cursors_(num_outputs, 0)
|
||||
, num_reqs_(num_inputs / num_outputs)
|
||||
{
|
||||
assert(delay != 0);
|
||||
assert(num_inputs <= 32);
|
||||
assert(num_outputs <= 32);
|
||||
assert(num_inputs >= num_outputs);
|
||||
|
||||
// bypass mode
|
||||
if (num_inputs == num_outputs) {
|
||||
for (uint32_t i = 0; i < num_inputs; ++i) {
|
||||
Inputs.at(i).bind(&Outputs.at(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void reset() {
|
||||
for (auto& cursor : cursors_) {
|
||||
cursor = 0;
|
||||
}
|
||||
}
|
||||
|
||||
void tick() {
|
||||
uint32_t I = Inputs.size();
|
||||
uint32_t O = Outputs.size();
|
||||
uint32_t R = num_reqs_;
|
||||
|
||||
// skip bypass mode
|
||||
if (I == O)
|
||||
return;
|
||||
|
||||
// process inputs
|
||||
for (uint32_t o = 0; o < O; ++o) {
|
||||
for (uint32_t r = 0; r < R; ++r) {
|
||||
uint32_t i = (cursors_.at(o) + r) & (R-1);
|
||||
uint32_t j = o * R + i;
|
||||
if (j >= I)
|
||||
continue;
|
||||
|
||||
auto& req_in = Inputs.at(j);
|
||||
if (!req_in.empty()) {
|
||||
auto& req = req_in.front();
|
||||
DT(4, this->name() << "-" << req);
|
||||
Outputs.at(o).send(req, delay_);
|
||||
req_in.pop();
|
||||
this->update_cursor(o, i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
void update_cursor(uint32_t index, uint32_t grant) {
|
||||
if (type_ == ArbiterType::RoundRobin) {
|
||||
cursors_.at(index) = grant + 1;
|
||||
}
|
||||
}
|
||||
|
||||
ArbiterType type_;
|
||||
uint32_t delay_;
|
||||
std::vector<uint32_t> cursors_;
|
||||
uint32_t num_reqs_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename Req, typename Rsp>
|
||||
class Switch : public SimObject<Switch<Req, Rsp>> {
|
||||
public:
|
||||
@@ -364,13 +457,13 @@ public:
|
||||
const SimContext& ctx,
|
||||
const char* name,
|
||||
ArbiterType type,
|
||||
uint32_t num_inputs = 1,
|
||||
uint32_t num_inputs,
|
||||
uint32_t num_outputs = 1,
|
||||
uint32_t delay = 1
|
||||
)
|
||||
: SimObject<Switch<Req, Rsp>>(ctx, name)
|
||||
, ReqIn(num_inputs, this)
|
||||
, RspIn(num_inputs, this)
|
||||
, ReqIn(num_inputs, this)
|
||||
, RspIn(num_inputs, this)
|
||||
, ReqOut(num_outputs, this)
|
||||
, RspOut(num_outputs, this)
|
||||
, type_(type)
|
||||
@@ -383,8 +476,8 @@ public:
|
||||
assert(num_outputs <= 32);
|
||||
assert(num_inputs >= num_outputs);
|
||||
|
||||
// bypass mode
|
||||
if (num_inputs == num_outputs) {
|
||||
// bypass mode
|
||||
for (uint32_t i = 0; i < num_inputs; ++i) {
|
||||
ReqIn.at(i).bind(&ReqOut.at(i));
|
||||
RspOut.at(i).bind(&RspIn.at(i));
|
||||
@@ -462,14 +555,14 @@ private:
|
||||
|
||||
class SMemDemux : public SimObject<SMemDemux> {
|
||||
public:
|
||||
SimPort<MemReq> ReqIn;
|
||||
SimPort<MemRsp> RspIn;
|
||||
SimPort<MemReq> ReqIn;
|
||||
SimPort<MemRsp> RspIn;
|
||||
|
||||
SimPort<MemReq> ReqSm;
|
||||
SimPort<MemRsp> RspSm;
|
||||
SimPort<MemReq> ReqSM;
|
||||
SimPort<MemRsp> RspSM;
|
||||
|
||||
SimPort<MemReq> ReqDc;
|
||||
SimPort<MemRsp> RspDc;
|
||||
SimPort<MemReq> ReqDC;
|
||||
SimPort<MemRsp> RspDC;
|
||||
|
||||
SMemDemux(
|
||||
const SimContext& ctx,
|
||||
@@ -478,45 +571,49 @@ public:
|
||||
) : SimObject<SMemDemux>(ctx, name)
|
||||
, ReqIn(this)
|
||||
, RspIn(this)
|
||||
, ReqSm(this)
|
||||
, RspSm(this)
|
||||
, ReqDc(this)
|
||||
, RspDc(this)
|
||||
, ReqSM(this)
|
||||
, RspSM(this)
|
||||
, ReqDC(this)
|
||||
, RspDC(this)
|
||||
, delay_(delay)
|
||||
{}
|
||||
|
||||
void reset() {}
|
||||
|
||||
void tick() {
|
||||
void tick() {
|
||||
// process incoming reponses
|
||||
if (!RspSM.empty()) {
|
||||
auto& rsp = RspSM.front();
|
||||
DT(4, this->name() << "-" << rsp);
|
||||
RspIn.send(rsp, 1);
|
||||
RspSM.pop();
|
||||
}
|
||||
if (!RspDC.empty()) {
|
||||
auto& rsp = RspDC.front();
|
||||
DT(4, this->name() << "-" << rsp);
|
||||
RspIn.send(rsp, 1);
|
||||
RspDC
|
||||
.pop();
|
||||
}
|
||||
// process incomming requests
|
||||
if (!ReqIn.empty()) {
|
||||
auto& req = ReqIn.front();
|
||||
DT(4, this->name() << "-" << req);
|
||||
if (req.type == AddrType::Shared) {
|
||||
ReqSm.send(req, delay_);
|
||||
ReqSM.send(req, delay_);
|
||||
} else {
|
||||
ReqDc.send(req, delay_);
|
||||
ReqDC.send(req, delay_);
|
||||
}
|
||||
ReqIn.pop();
|
||||
}
|
||||
|
||||
// process incoming reponses
|
||||
if (!RspSm.empty()) {
|
||||
auto& rsp = RspSm.front();
|
||||
DT(4, this->name() << "-" << rsp);
|
||||
RspIn.send(rsp, 1);
|
||||
RspSm.pop();
|
||||
}
|
||||
if (!RspDc.empty()) {
|
||||
auto& rsp = RspDc.front();
|
||||
DT(4, this->name() << "-" << rsp);
|
||||
RspIn.send(rsp, 1);
|
||||
RspDc.pop();
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
uint32_t delay_;
|
||||
};
|
||||
|
||||
}
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
using MemSwitch = Switch<MemReq, MemRsp>;
|
||||
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user