diff --git a/driver/rtlsim/vortex.cpp b/driver/rtlsim/vortex.cpp index 7fc254db..1833cd93 100644 --- a/driver/rtlsim/vortex.cpp +++ b/driver/rtlsim/vortex.cpp @@ -185,7 +185,7 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) { *value = IMPLEMENTATION_ID; break; case VX_CAPS_MAX_CORES: - *value = NUM_CORES; + *value = NUM_CORES * NUM_CLUSTERS; break; case VX_CAPS_MAX_WARPS: *value = NUM_WARPS; diff --git a/driver/simx/Makefile b/driver/simx/Makefile index 58db553c..80fd2996 100644 --- a/driver/simx/Makefile +++ b/driver/simx/Makefile @@ -12,8 +12,8 @@ CXXFLAGS += -DDUMP_PERF_STATS #CONFIGS ?= -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1 #CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1 -CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 -#CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=1 +#CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 +CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=1 CXXFLAGS += $(CONFIGS) @@ -21,7 +21,7 @@ LDFLAGS += -shared -pthread #LDFLAGS += -dynamiclib -pthread SRCS = vortex.cpp ../common/vx_utils.cpp -SRCS += $(SIMX_DIR)/util.cpp $(SIMX_DIR)/args.cpp $(SIMX_DIR)/mem.cpp $(SIMX_DIR)/core.cpp $(SIMX_DIR)/warp.cpp $(SIMX_DIR)/instr.cpp $(SIMX_DIR)/decode.cpp $(SIMX_DIR)/execute.cpp +SRCS += $(SIMX_DIR)/util.cpp $(SIMX_DIR)/args.cpp $(SIMX_DIR)/mem.cpp $(SIMX_DIR)/warp.cpp $(SIMX_DIR)/core.cpp $(SIMX_DIR)/decode.cpp $(SIMX_DIR)/execute.cpp # Debugigng ifdef DEBUG diff --git a/driver/simx/vortex.cpp b/driver/simx/vortex.cpp index 5281119c..aa41a3f9 100644 --- a/driver/simx/vortex.cpp +++ b/driver/simx/vortex.cpp @@ -144,19 +144,18 @@ private: void run() { vortex::ArchDef arch("rv32i", NUM_CORES, NUM_WARPS, NUM_THREADS); vortex::Decoder decoder(arch); - vortex::MemoryUnit mu(PAGE_SIZE, arch.getWordSize(), true); + vortex::MemoryUnit mu(PAGE_SIZE, arch.wsize(), true); mu.attach(ram_, 0); - std::vector> cores(NUM_CORES); - for (size_t i = 0; i < NUM_CORES; ++i) { - cores[i] = std::make_shared(arch, decoder, mu); + std::vector> cores(arch.num_cores()); + for (int i = 0; i < arch.num_cores(); ++i) { + cores[i] = std::make_shared(arch, decoder, mu, i); } bool running; - do { running = false; - for (size_t i = 0; i < NUM_CORES; ++i) { + for (int i = 0; i < arch.num_cores(); ++i) { if (!cores[i]->running()) continue; running = true; @@ -236,7 +235,7 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) { *value = IMPLEMENTATION_ID; break; case VX_CAPS_MAX_CORES: - *value = NUM_CORES; + *value = NUM_CORES * NUM_CLUSTERS; break; case VX_CAPS_MAX_WARPS: *value = NUM_WARPS; diff --git a/simX/Makefile b/simX/Makefile index 6aa494d7..37d7ad04 100644 --- a/simX/Makefile +++ b/simX/Makefile @@ -13,7 +13,7 @@ RTL_DIR = ../hw/rtl PROJECT = simX -SRCS = util.cpp args.cpp mem.cpp core.cpp warp.cpp instr.cpp decode.cpp execute.cpp main.cpp +SRCS = util.cpp args.cpp mem.cpp warp.cpp core.cpp decode.cpp execute.cpp main.cpp # Debugigng ifdef DEBUG diff --git a/simX/archdef.h b/simX/archdef.h index 78d12ad1..28806ff6 100644 --- a/simX/archdef.h +++ b/simX/archdef.h @@ -11,156 +11,56 @@ namespace vortex { class ArchDef { public: - struct Undefined {}; - - ArchDef(const std::string &s, + ArchDef(const std::string &/*arch*/, int num_cores, int num_warps, - int num_threads) { - std::istringstream iss(s.c_str()); - wordSize_ = 4; - encChar_ = 'w'; - numRegs_ = 32; - numPRegs_ = 0; - numCores_ = num_cores; - numWarps_ = num_warps; - numThreads_ = num_threads; - extent_ = EXT_END; + int num_threads) { + wsize_ = 4; + vsize_ = 16; + num_regs_ = 32; + num_csrs_ = 4096; + num_cores_ = num_cores; + num_warps_ = num_warps; + num_threads_ = num_threads; } - operator std::string () const { - if (extent_ == EXT_NULL) - return ""; - - std::ostringstream oss; - if (extent_ >= EXT_WORDSIZE) oss << wordSize_; - if (extent_ >= EXT_ENC ) oss << encChar_; - if (extent_ >= EXT_REGS ) oss << numRegs_; - if (extent_ >= EXT_PREGS ) oss << '/' << numPRegs_; - if (extent_ >= EXT_THREADS ) oss << '/' << numThreads_; - if (extent_ >= EXT_WARPS ) oss << '/' << numWarps_; - if (extent_ >= EXT_CORES ) oss << '/' << numCores_; - - return oss.str(); + int wsize() const { + return wsize_; } - bool operator==(const ArchDef &r) const { - Extent minExtent(r.extent_ > extent_ ? extent_ : r.extent_); - - // Can't be equal if we can't specify a binary encoding at all. - if (minExtent < EXT_PREGS) - return false; - - if (minExtent >= EXT_WORDSIZE) { - if (wordSize_!=r.wordSize_) - return false; - } - - if (minExtent >= EXT_ENC) { - if (encChar_ != r.encChar_) - return false; - } - - if (minExtent >= EXT_REGS) { - if (numRegs_ != r.numRegs_) - return false; - } - - if (minExtent >= EXT_PREGS) { - if (numPRegs_ != r.numPRegs_) - return false; - } - - if (minExtent >= EXT_THREADS) { - if (numThreads_ != r.numThreads_) - return false; - } - - if (minExtent >= EXT_WARPS) { - if (numWarps_ != r.numWarps_) - return false; - } - - if (minExtent >= EXT_CORES) { - if (numCores_ != r.numCores_) - return false; - } - - return true; + int vsize() const { + return vsize_; } - bool operator!=(const ArchDef &r) const { - return !(*this == r); + int num_regs() const { + return num_regs_; } - Size getWordSize() const { - if (extent_ < EXT_WORDSIZE) - throw Undefined(); - return wordSize_; + int num_csrs() const { + return num_csrs_; } - char getEncChar() const { - if ((extent_ < EXT_ENC) || (encChar_ == 'x')) - throw Undefined(); - return encChar_; + int num_threads() const { + return num_threads_; } - RegNum getNumRegs() const { - if (extent_ < EXT_REGS) - throw Undefined(); - return numRegs_; + int num_warps() const { + return num_warps_; } - RegNum getNumPRegs() const { - if (extent_ < EXT_PREGS) - throw Undefined(); - return numPRegs_; - } - - ThdNum getNumThreads() const { - if (extent_ < EXT_THREADS) - throw Undefined(); - return numThreads_; - } - - ThdNum getNumWarps() const { - if (extent_ < EXT_WARPS) - throw Undefined(); - return numWarps_; - } - - ThdNum getNumCores() const { - if (extent_ < EXT_CORES) - throw Undefined(); - return numCores_; - } - - bool is_cpu_mode() const { - return cpu_mode_; + int num_cores() const { + return num_cores_; } private: - enum Extent { - EXT_NULL, - EXT_WORDSIZE, - EXT_ENC, - EXT_REGS, - EXT_PREGS, - EXT_THREADS, - EXT_WARPS, - EXT_CORES, - EXT_END - }; - Extent extent_; - Size wordSize_; - ThdNum numThreads_; - ThdNum numWarps_; - ThdNum numCores_; - RegNum numRegs_; - ThdNum numPRegs_; - char encChar_; - bool cpu_mode_; + int wsize_; + int vsize_; + int num_regs_; + int num_csrs_; + int num_threads_; + int num_warps_; + int num_cores_; }; } \ No newline at end of file diff --git a/simX/core.cpp b/simX/core.cpp index 87e8438f..869d088f 100644 --- a/simX/core.cpp +++ b/simX/core.cpp @@ -1,10 +1,7 @@ #include #include #include - -// #define USE_DEBUG 7 -// #define PRINT_ACTIVE_THREADS - +#include #include "types.h" #include "util.h" #include "archdef.h" @@ -14,21 +11,25 @@ #include "debug.h" #define INIT_TRACE(trace_inst) \ - trace_inst.valid_inst = false; \ - trace_inst.pc = 0; \ + trace_inst.valid = false; \ + trace_inst.PC = 0; \ trace_inst.wid = schedule_w_; \ - trace_inst.rs1 = -1; \ - trace_inst.rs2 = -1; \ - trace_inst.rd = -1; \ - trace_inst.vs1 = -1; \ - trace_inst.vs2 = -1; \ - trace_inst.vd = -1; \ + trace_inst.irs1 = -1; \ + trace_inst.irs2 = -1; \ + trace_inst.frs1 = -1; \ + trace_inst.frs2 = -1; \ + trace_inst.frs3 = -1; \ + trace_inst.frd = -1; \ + trace_inst.ird = -1; \ + trace_inst.vrs1 = -1; \ + trace_inst.vrs2 = -1; \ + trace_inst.vrd = -1; \ trace_inst.is_lw = false; \ trace_inst.is_sw = false; \ if (trace_inst.mem_addresses != NULL) \ free(trace_inst.mem_addresses); \ trace_inst.mem_addresses = (unsigned *)malloc(32 * sizeof(unsigned)); \ - for (ThdNum tid = 0; tid < arch_.getNumThreads(); tid++) \ + for (int tid = 0; tid < arch_.num_threads(); tid++) \ trace_inst.mem_addresses[tid] = 0xdeadbeef; \ trace_inst.mem_stall_cycles = 0; \ trace_inst.fetch_stall_cycles = 0; \ @@ -37,18 +38,22 @@ trace_inst.stalled = false; #define CPY_TRACE(drain, source) \ - drain.valid_inst = source.valid_inst; \ - drain.pc = source.pc; \ + drain.valid = source.valid; \ + drain.PC = source.PC; \ drain.wid = source.wid; \ - drain.rs1 = source.rs1; \ - drain.rs2 = source.rs2; \ - drain.rd = source.rd; \ - drain.vs1 = source.vs1; \ - drain.vs2 = source.vs2; \ - drain.vd = source.vd; \ + drain.irs1 = source.irs1; \ + drain.irs2 = source.irs2; \ + drain.ird = source.ird; \ + drain.frs1 = source.frs1; \ + drain.frs2 = source.frs2; \ + drain.frs3 = source.frs3; \ + drain.frd = source.frd; \ + drain.vrs1 = source.vrs1; \ + drain.vrs2 = source.vrs2; \ + drain.vrd = source.vrd; \ drain.is_lw = source.is_lw; \ drain.is_sw = source.is_sw; \ - for (ThdNum tid = 0; tid < arch_.getNumThreads(); tid++)\ + for (int tid = 0; tid < arch_.num_threads(); tid++) \ drain.mem_addresses[tid] = source.mem_addresses[tid]; \ drain.mem_stall_cycles = source.mem_stall_cycles; \ drain.fetch_stall_cycles = source.fetch_stall_cycles; \ @@ -60,17 +65,17 @@ using namespace vortex; void printTrace(trace_inst_t *trace, const char *stage_name) { __unused(trace, stage_name); - D(3, stage_name << ": valid=" << trace->valid_inst); - D(3, stage_name << ": PC=" << std::hex << trace->pc << std::dec); - D(3, stage_name << ": wid=" << trace->wid); - D(3, stage_name << ": rd=" << trace->rd << ", rs1=" << trace->rs1 << ", trs2=" << trace->rs2); - D(3, stage_name << ": is_lw=" << trace->is_lw); - D(3, stage_name << ": is_sw=" << trace->is_sw); - D(3, stage_name << ": fetch_stall_cycles=" << trace->fetch_stall_cycles); - D(3, stage_name << ": mem_stall_cycles=" << trace->mem_stall_cycles); - D(3, stage_name << ": stall_warp=" << trace->stall_warp); - D(3, stage_name << ": wspawn=" << trace->wspawn); - D(3, stage_name << ": stalled=" << trace->stalled); + D(4, stage_name << ": valid=" << trace->valid); + D(4, stage_name << ": PC=" << std::hex << trace->PC << std::dec); + D(4, stage_name << ": wid=" << trace->wid); + D(4, stage_name << ": rd=" << trace->ird << ", rs1=" << trace->irs1 << ", trs2=" << trace->irs2); + D(4, stage_name << ": is_lw=" << trace->is_lw); + D(4, stage_name << ": is_sw=" << trace->is_sw); + D(4, stage_name << ": fetch_stall_cycles=" << trace->fetch_stall_cycles); + D(4, stage_name << ": mem_stall_cycles=" << trace->mem_stall_cycles); + D(4, stage_name << ": stall_warp=" << trace->stall_warp); + D(4, stage_name << ": wspawn=" << trace->wspawn); + D(4, stage_name << ": stalled=" << trace->stalled); } Core::Core(const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id) @@ -79,8 +84,7 @@ Core::Core(const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id) , decoder_(decoder) , mem_(mem) , steps_(0) - , num_instructions_(0) { - release_warp_ = false; + , num_insts_(0) { foundSchedule_ = true; schedule_w_ = 0; @@ -98,23 +102,17 @@ Core::Core(const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id) INIT_TRACE(inst_in_lsu_); INIT_TRACE(inst_in_wb_); - for (int i = 0; i < 32; i++) { - stalled_warps_[i] = false; - for (int j = 0; j < 32; j++) { - renameTable_[i][j] = true; - } + iRenameTable_.resize(arch.num_warps(), std::vector(arch.num_regs(), false)); + fRenameTable_.resize(arch.num_warps(), std::vector(arch.num_regs(), false)); + vRenameTable_.resize(arch.num_regs(), false); + + stalled_warps_.resize(arch.num_warps(), false); + + for (int i = 0; i < arch_.num_warps(); ++i) { + warps_.emplace_back(this, i); } - for (int i = 0; i < 32; i++) { - vecRenameTable_[i] = true; - } - - for (unsigned i = 0; i < arch_.getNumWarps(); ++i) { - warps_.push_back(Warp(this, i)); - } - - warps_[0].setActiveThreads(1); - warps_[0].setSpawned(true); + warps_[0].setTmask(0, true); } Core::~Core() { @@ -125,32 +123,20 @@ void Core::step() { D(3, "###########################################################"); steps_++; - D(3, "cycle: " << steps_); + D(3, std::dec << "Core" << id_ << ": cycle: " << steps_); DPH(3, "stalled warps:"); - for (ThdNum widd = 0; widd < arch_.getNumWarps(); widd++) { - DPN(3, " " << stalled_warps_[widd]); + for (int i = 0; i < arch_.num_warps(); i++) { + DPN(3, " " << stalled_warps_[i]); } DPN(3, "\n"); - // cout << "About to call writeback" << std::endl; this->writeback(); - // cout << "About to call load_store" << std::endl; this->load_store(); - // cout << "About to call execute_unit" << std::endl; this->execute_unit(); - // cout << "About to call scheduler" << std::endl; this->scheduler(); - // cout << "About to call decode" << std::endl; this->decode(); - // D(3, "About to call fetch" << std::flush); this->fetch(); - // D(3, "Finished fetch" << std::flush); - - if (release_warp_) { - release_warp_ = false; - stalled_warps_[release_warp_num_] = false; - } DPN(3, std::flush); } @@ -161,10 +147,8 @@ void Core::warpScheduler() { for (size_t wid = 0; wid < warps_.size(); ++wid) { // round robin scheduling next_warp = (next_warp + 1) % warps_.size(); - - bool has_active_threads = (warps_[next_warp].getActiveThreads() > 0); + bool has_active_threads = warps_[next_warp].active(); bool stalled = stalled_warps_[next_warp]; - if (has_active_threads && !stalled) { foundSchedule_ = true; break; @@ -174,35 +158,28 @@ void Core::warpScheduler() { } void Core::fetch() { - - // D(-1, "Found schedule: " << foundSchedule_); - if ((!inst_in_scheduler_.stalled) && (inst_in_fetch_.fetch_stall_cycles == 0)) { - // CPY_TRACE(inst_in_decode_, inst_in_fetch_); - // if (warps_[schedule_w_].activeThreads) - { - INIT_TRACE(inst_in_fetch_); + INIT_TRACE(inst_in_fetch_); - if (foundSchedule_) { - auto active_threads_b = warps_[schedule_w_].getActiveThreads(); + if (foundSchedule_) { + auto active_threads_b = warps_[schedule_w_].getActiveThreads(); + num_insts_ = num_insts_ + warps_[schedule_w_].getActiveThreads(); - num_instructions_ = num_instructions_ + warps_[schedule_w_].getActiveThreads(); - warps_[schedule_w_].step(&inst_in_fetch_); + warps_[schedule_w_].step(&inst_in_fetch_); - auto active_threads_a = warps_[schedule_w_].getActiveThreads(); - if (active_threads_b != active_threads_a) { - D(3, "** warp #" << schedule_w_ << " active threads changed from " << active_threads_b << " to " << active_threads_a); - } - - this->getCacheDelays(&inst_in_fetch_); - - if (inst_in_fetch_.stall_warp) { - stalled_warps_[inst_in_fetch_.wid] = true; - } + auto active_threads_a = warps_[schedule_w_].getActiveThreads(); + if (active_threads_b != active_threads_a) { + D(3, "** warp #" << schedule_w_ << " active threads changed from " << active_threads_b << " to " << active_threads_a); + } + + this->getCacheDelays(&inst_in_fetch_); + + if (inst_in_fetch_.stall_warp) { + stalled_warps_[inst_in_fetch_.wid] = true; } - this->warpScheduler(); } + this->warpScheduler(); } else { inst_in_fetch_.stalled = false; if (inst_in_fetch_.fetch_stall_cycles > 0) @@ -218,7 +195,6 @@ void Core::decode() { CPY_TRACE(inst_in_decode_, inst_in_fetch_); INIT_TRACE(inst_in_fetch_); } - //printTrace(&inst_in_decode_, "Decode"); } void Core::scheduler() { @@ -226,136 +202,162 @@ void Core::scheduler() { CPY_TRACE(inst_in_scheduler_, inst_in_decode_); INIT_TRACE(inst_in_decode_); } - //printTrace(&inst_in_scheduler_, "Scheduler"); } void Core::load_store() { - if ((inst_in_lsu_.mem_stall_cycles > 0) || (inst_in_lsu_.stalled)) { + if ((inst_in_lsu_.mem_stall_cycles > 0) || inst_in_lsu_.stalled) { // LSU currently busy if ((inst_in_scheduler_.is_lw || inst_in_scheduler_.is_sw)) { inst_in_scheduler_.stalled = true; } } else { - // LSU not busy - if (inst_in_scheduler_.is_lw || inst_in_scheduler_.is_sw) { - // Scheduler has LSU inst - bool scheduler_srcs_ready = true; - if (inst_in_scheduler_.rs1 > 0) { - scheduler_srcs_ready = scheduler_srcs_ready && renameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.rs1]; - } + if (!inst_in_scheduler_.is_lw && !inst_in_scheduler_.is_sw) + return; - if (inst_in_scheduler_.rs2 > 0) { - scheduler_srcs_ready = scheduler_srcs_ready && renameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.rs2]; - } + // Scheduler has LSU inst + bool scheduler_srcs_busy = false; - if (inst_in_scheduler_.vs1 > 0) { - scheduler_srcs_ready = scheduler_srcs_ready && vecRenameTable_[inst_in_scheduler_.vs1]; - } - if (inst_in_scheduler_.vs2 > 0) { - scheduler_srcs_ready = scheduler_srcs_ready && vecRenameTable_[inst_in_scheduler_.vs2]; - } + if (inst_in_scheduler_.irs1 > 0) { + scheduler_srcs_busy = scheduler_srcs_busy || iRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.irs1]; + } - if (scheduler_srcs_ready) { - if (inst_in_scheduler_.rd != -1) - renameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.rd] = false; - if (inst_in_scheduler_.rd != -1) - vecRenameTable_[inst_in_scheduler_.vd] = false; - CPY_TRACE(inst_in_lsu_, inst_in_scheduler_); - INIT_TRACE(inst_in_scheduler_); - } else { - inst_in_scheduler_.stalled = true; - // INIT_TRACE(inst_in_lsu_); - } - } else { - // INIT_TRACE(inst_in_lsu_); + if (inst_in_scheduler_.irs2 > 0) { + scheduler_srcs_busy = scheduler_srcs_busy || iRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.irs2]; + } + + if (inst_in_scheduler_.frs1 >= 0) { + scheduler_srcs_busy = scheduler_srcs_busy || fRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.frs1]; + } + + if (inst_in_scheduler_.frs2 >= 0) { + scheduler_srcs_busy = scheduler_srcs_busy || fRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.frs2]; + } + + if (inst_in_scheduler_.frs3 >= 0) { + scheduler_srcs_busy = scheduler_srcs_busy || fRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.frs3]; + } + + if (inst_in_scheduler_.vrs1 >= 0) { + scheduler_srcs_busy = scheduler_srcs_busy || vRenameTable_[inst_in_scheduler_.vrs1]; + } + if (inst_in_scheduler_.vrs2 >= 0) { + scheduler_srcs_busy = scheduler_srcs_busy || vRenameTable_[inst_in_scheduler_.vrs2]; + } + + if (scheduler_srcs_busy) { + inst_in_scheduler_.stalled = true; + } else { + if (inst_in_scheduler_.ird > 0) + iRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.ird] = true; + + if (inst_in_scheduler_.frd >= 0) + fRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.frd] = true; + + if (inst_in_scheduler_.vrd >= 0) + vRenameTable_[inst_in_scheduler_.vrd] = true; + + CPY_TRACE(inst_in_lsu_, inst_in_scheduler_); + INIT_TRACE(inst_in_scheduler_); } } if (inst_in_lsu_.mem_stall_cycles > 0) inst_in_lsu_.mem_stall_cycles--; - - //printTrace(&inst_in_lsu_, "LSU"); } void Core::execute_unit() { - // EXEC is always not busy - if (inst_in_scheduler_.is_lw || inst_in_scheduler_.is_sw) { - // Not an execute instruction - // INIT_TRACE(inst_in_exe_); - } else { - bool scheduler_srcs_ready = true; - if (inst_in_scheduler_.rs1 > 0) { - scheduler_srcs_ready = scheduler_srcs_ready && renameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.rs1]; - // cout << "Rename RS1: " << inst_in_scheduler_.rs1 << " is " << renameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.rs1] << " wid: " << inst_in_scheduler_.wid << '\n'; - } + if (inst_in_scheduler_.is_lw || inst_in_scheduler_.is_sw) + return; + + bool scheduler_srcs_busy = false; - if (inst_in_scheduler_.rs2 > 0) { - scheduler_srcs_ready = scheduler_srcs_ready && renameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.rs2]; - // cout << "Rename RS2: " << inst_in_scheduler_.rs1 << " is " << renameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.rs2] << " wid: " << inst_in_scheduler_.wid << '\n'; - } - - // cout << "About to check vs*\n" << std::flush; - if (inst_in_scheduler_.vs1 > 0) { - scheduler_srcs_ready = scheduler_srcs_ready && vecRenameTable_[inst_in_scheduler_.vs1]; - } - if (inst_in_scheduler_.vs2 > 0) { - scheduler_srcs_ready = scheduler_srcs_ready && vecRenameTable_[inst_in_scheduler_.vs2]; - } - // cout << "Finished sources\n" << std::flush; - - if (scheduler_srcs_ready) { - if (inst_in_scheduler_.rd != -1) { - // cout << "rename setting rd: " << inst_in_scheduler_.rd << " to not useabel wid: " << inst_in_scheduler_.wid << '\n'; - renameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.rd] = false; - } - - // cout << "About to check vector wb: " << inst_in_scheduler_.vd << "\n" << std::flush; - if (inst_in_scheduler_.vd != -1) { - vecRenameTable_[inst_in_scheduler_.vd] = false; - } - // cout << "Finished wb checking" << "\n" << std::flush; - CPY_TRACE(inst_in_exe_, inst_in_scheduler_); - INIT_TRACE(inst_in_scheduler_); - // cout << "Finished trace copying and clearning" << "\n" << std::flush; - } else { - D(3, "Execute: srcs not ready!"); - inst_in_scheduler_.stalled = true; - // INIT_TRACE(inst_in_exe_); - } + if (inst_in_scheduler_.irs1 > 0) { + scheduler_srcs_busy = scheduler_srcs_busy || iRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.irs1]; } - //printTrace(&inst_in_exe_, "EXE"); - // INIT_TRACE(inst_in_exe_); + if (inst_in_scheduler_.irs2 > 0) { + scheduler_srcs_busy = scheduler_srcs_busy || iRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.irs2]; + } + + if (inst_in_scheduler_.frs1 >= 0) { + scheduler_srcs_busy = scheduler_srcs_busy || fRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.frs1]; + } + + if (inst_in_scheduler_.frs2 >= 0) { + scheduler_srcs_busy = scheduler_srcs_busy || fRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.frs2]; + } + + if (inst_in_scheduler_.frs3 >= 0) { + scheduler_srcs_busy = scheduler_srcs_busy || fRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.frs3]; + } + + if (inst_in_scheduler_.vrs1 >= 0) { + scheduler_srcs_busy = scheduler_srcs_busy || vRenameTable_[inst_in_scheduler_.vrs1]; + } + + if (inst_in_scheduler_.vrs2 >= 0) { + scheduler_srcs_busy = scheduler_srcs_busy || vRenameTable_[inst_in_scheduler_.vrs2]; + } + + if (scheduler_srcs_busy) { + D(3, "Execute: srcs not ready!"); + inst_in_scheduler_.stalled = true; + } else { + if (inst_in_scheduler_.ird > 0) { + iRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.ird] = true; + } + + if (inst_in_scheduler_.frd >= 0) { + fRenameTable_[inst_in_scheduler_.wid][inst_in_scheduler_.frd] = true; + } + + if (inst_in_scheduler_.vrd >= 0) { + vRenameTable_[inst_in_scheduler_.vrd] = true; + } + + CPY_TRACE(inst_in_exe_, inst_in_scheduler_); + INIT_TRACE(inst_in_scheduler_); + } } void Core::writeback() { - if (inst_in_wb_.rd > 0) - renameTable_[inst_in_wb_.wid][inst_in_wb_.rd] = true; - if (inst_in_wb_.vd > 0) - vecRenameTable_[inst_in_wb_.vd] = true; + if (inst_in_wb_.ird > 0) { + iRenameTable_[inst_in_wb_.wid][inst_in_wb_.ird] = false; + } + + if (inst_in_wb_.frd >= 0) { + fRenameTable_[inst_in_wb_.wid][inst_in_wb_.frd] = false; + } + + if (inst_in_wb_.vrd >= 0) { + vRenameTable_[inst_in_wb_.vrd] = false; + } if (inst_in_wb_.stall_warp) { stalled_warps_[inst_in_wb_.wid] = false; - // release_warp_ = true; - // release_warp_num_ = inst_in_wb_.wid; } INIT_TRACE(inst_in_wb_); bool serviced_exe = false; - if ((inst_in_exe_.rd > 0) || (inst_in_exe_.stall_warp)) { + if ((inst_in_exe_.ird > 0) + || (inst_in_exe_.frd >= 0) + || (inst_in_exe_.vrd >= 0) + || (inst_in_exe_.stall_warp)) { CPY_TRACE(inst_in_wb_, inst_in_exe_); INIT_TRACE(inst_in_exe_); serviced_exe = true; - // cout << "WRITEBACK SERVICED EXE\n"; } if (inst_in_lsu_.is_sw) { INIT_TRACE(inst_in_lsu_); } else { - if (((inst_in_lsu_.rd > 0) || (inst_in_lsu_.vd > 0)) && (inst_in_lsu_.mem_stall_cycles == 0)) { + if (((inst_in_lsu_.ird > 0) + || (inst_in_lsu_.frd >= 0) + || (inst_in_lsu_.vrd >= 0)) + && (inst_in_lsu_.mem_stall_cycles == 0)) { if (serviced_exe) { - D(3, "$$$$$$$$$$$$$$$$$$$$ Stalling LSU because EXE is being used"); + // Stalling LSU because EXE is busy inst_in_lsu_.stalled = true; } else { CPY_TRACE(inst_in_wb_, inst_in_lsu_); @@ -366,27 +368,28 @@ void Core::writeback() { } void Core::getCacheDelays(trace_inst_t *trace_inst) { - trace_inst->fetch_stall_cycles += 3; + trace_inst->fetch_stall_cycles += 1; if (trace_inst->is_sw || trace_inst->is_lw) { - trace_inst->mem_stall_cycles += 5; + trace_inst->mem_stall_cycles += 3; } } bool Core::running() const { - bool stages_have_valid = inst_in_fetch_.valid_inst - || inst_in_decode_.valid_inst - || inst_in_scheduler_.valid_inst - || inst_in_lsu_.valid_inst - || inst_in_exe_.valid_inst - || inst_in_wb_.valid_inst; + bool stages_have_valid = inst_in_fetch_.valid + || inst_in_decode_.valid + || inst_in_scheduler_.valid + || inst_in_lsu_.valid + || inst_in_exe_.valid + || inst_in_wb_.valid; if (stages_have_valid) return true; - for (unsigned i = 0; i < warps_.size(); ++i) - if (warps_[i].running()) { + for (unsigned i = 0; i < warps_.size(); ++i) { + if (warps_[i].active()) { return true; } + } return false; } diff --git a/simX/core.h b/simX/core.h index 003e637f..d38e6ca4 100644 --- a/simX/core.h +++ b/simX/core.h @@ -60,8 +60,8 @@ public: return interruptEntry_; } - unsigned long num_instructions() const { - return num_instructions_; + unsigned long num_insts() const { + return num_insts_; } unsigned long num_steps() const { @@ -70,9 +70,10 @@ public: private: - bool renameTable_[32][32]; - bool vecRenameTable_[32]; - bool stalled_warps_[32]; + std::vector> iRenameTable_; + std::vector> fRenameTable_; + std::vector vRenameTable_; + std::vector stalled_warps_; bool foundSchedule_; Word id_; @@ -83,10 +84,8 @@ private: std::unordered_map> barriers_; int schedule_w_; uint64_t steps_; - uint64_t num_instructions_; + uint64_t num_insts_; Word interruptEntry_; - bool release_warp_; - int release_warp_num_; trace_inst_t inst_in_fetch_; trace_inst_t inst_in_decode_; diff --git a/simX/debug.h b/simX/debug.h index 54277c74..f7723c32 100644 --- a/simX/debug.h +++ b/simX/debug.h @@ -1,6 +1,8 @@ #pragma once -//#define USE_DEBUG 9 +#define USE_DEBUG 3 +#define DEBUG_HEADER << "DEBUG " +//#define DEBUG_HEADER << "DEBUG " << __FILE__ << ':' << std::dec << __LINE__ << ": " #ifdef USE_DEBUG @@ -11,13 +13,13 @@ #define D(lvl, x) do { \ if ((lvl) <= USE_DEBUG) { \ - std::cout << "DEBUG " << __FILE__ << ':' << std::dec << __LINE__ << ": " << x << std::endl; \ + std::cout DEBUG_HEADER << x << std::endl; \ } \ } while(0) #define DPH(lvl, x) do { \ if ((lvl) <= USE_DEBUG) { \ - std::cout << "DEBUG " << __FILE__ << ':' << std::dec << __LINE__ << ": " << x; \ + std::cout DEBUG_HEADER << x; \ } \ } while(0) @@ -27,10 +29,6 @@ } \ } while(0) -#define D_RAW(x) do { \ - std::cout << x; \ -} while (0) - #else #define DX(x) diff --git a/simX/decode.cpp b/simX/decode.cpp index 6b7d8734..71487c1e 100644 --- a/simX/decode.cpp +++ b/simX/decode.cpp @@ -54,7 +54,7 @@ std::ostream &vortex::operator<<(std::ostream &os, Instr &instr) { } Decoder::Decoder(const ArchDef &arch) { - inst_s_ = arch.getWordSize() * 8; + inst_s_ = arch.wsize() * 8; opcode_s_ = 7; reg_s_ = 5; func2_s_ = 2; @@ -94,7 +94,11 @@ Decoder::Decoder(const ArchDef &arch) { v_imm_mask_ = 0x7ff; } -std::shared_ptr Decoder::decode(const std::vector &v, Size &idx, trace_inst_t *trace_inst) { +std::shared_ptr Decoder::decode( + const std::vector &v, + Size &idx, + trace_inst_t *trace_inst) +{ Word code(readWord(v, idx, inst_s_ / 8)); // std::cout << "code: " << (int) code << " v: " << v << " indx: " << idx << "\n"; @@ -107,12 +111,13 @@ std::shared_ptr Decoder::decode(const std::vector &v, Size &idx, tr Word imeed, dest_bits, imm_bits, bit_11, bits_4_1, bit_10_5, bit_12, bits_19_12, bits_10_1, bit_20, unordered, func3; - InstType curInstType = sc_instTable.at(op).iType; // get current inst type - if (op == Opcode::FL || op == Opcode::FS) { // need to find out whether it is vector or floating point inst + InstType curInstType = sc_instTable.at(op).iType; + if (op == Opcode::FL || op == Opcode::FS) { + // need to find out whether it is vector or floating point inst Word width_bits = (code >> shift_func3_) & func3_mask_; if ((width_bits == 0x1) || (width_bits == 0x2) || (width_bits == 0x3) || (width_bits == 0x4)) { - curInstType = (op == Opcode::FL)? InstType::I_TYPE : InstType::S_TYPE; + curInstType = (op == Opcode::FL) ? InstType::I_TYPE : InstType::S_TYPE; } } @@ -122,52 +127,50 @@ std::shared_ptr Decoder::decode(const std::vector &v, Size &idx, tr break; case InstType::R_TYPE: - instr->setDestReg((code >> shift_rd_) & reg_mask_); - instr->setSrcReg((code >> shift_rs1_) & reg_mask_); - instr->setSrcReg((code >> shift_rs2_) & reg_mask_); + if (op == Opcode::FCI) { + instr->setDestFReg((code >> shift_rd_) & reg_mask_); + instr->setSrcFReg((code >> shift_rs1_) & reg_mask_); + instr->setSrcFReg((code >> shift_rs2_) & reg_mask_); + } else { + instr->setDestReg((code >> shift_rd_) & reg_mask_); + instr->setSrcReg((code >> shift_rs1_) & reg_mask_); + instr->setSrcReg((code >> shift_rs2_) & reg_mask_); + } instr->setFunc3((code >> shift_func3_) & func3_mask_); instr->setFunc7((code >> shift_func7_) & func7_mask_); - - trace_inst->valid_inst = true; - trace_inst->rs1 = ((code >> shift_rs1_) & reg_mask_); - trace_inst->rs2 = ((code >> shift_rs2_) & reg_mask_); - trace_inst->rd = ((code >> shift_rd_) & reg_mask_); break; case InstType::I_TYPE: - instr->setDestReg((code >> shift_rd_) & reg_mask_); - instr->setSrcReg((code >> shift_rs1_) & reg_mask_); + if (op == Opcode::FCI || op == Opcode::FL) { + instr->setDestFReg((code >> shift_rd_) & reg_mask_); + instr->setSrcFReg((code >> shift_rs1_) & reg_mask_); + } else { + instr->setDestReg((code >> shift_rd_) & reg_mask_); + instr->setSrcReg((code >> shift_rs1_) & reg_mask_); + } instr->setFunc7((code >> shift_func7_) & func7_mask_); func3 = (code >> shift_func3_) & func3_mask_; instr->setFunc3(func3); - - if ((func3 == 5) && (op != L_INST) && (op != FL)) { - // std::cout << "func7: " << func7 << "\n"; + if ((func3 == 5) && (op != L_INST) && (op != Opcode::FL)) { instr->setSrcImm(signExt(((code >> shift_rs2_) & reg_mask_), 5, reg_mask_)); } else { instr->setSrcImm(signExt(code >> shift_i_immed_, 12, i_imm_mask_)); } - - trace_inst->valid_inst = true; - trace_inst->rs1 = ((code >> shift_rs1_) & reg_mask_); - trace_inst->rd = ((code >> shift_rd_) & reg_mask_); break; case InstType::S_TYPE: - // std::cout << "************STORE\n"; - instr->setSrcReg((code >> shift_rs1_) & reg_mask_); - instr->setSrcReg((code >> shift_rs2_) & reg_mask_); + if (op == Opcode::FS) { + instr->setSrcFReg((code >> shift_rs1_) & reg_mask_); + instr->setSrcFReg((code >> shift_rs2_) & reg_mask_); + } else { + instr->setSrcReg((code >> shift_rs1_) & reg_mask_); + instr->setSrcReg((code >> shift_rs2_) & reg_mask_); + } instr->setFunc3((code >> shift_func3_) & func3_mask_); - dest_bits = (code >> shift_rd_) & reg_mask_; imm_bits = (code >> shift_s_b_immed_ & func7_mask_); imeed = (imm_bits << reg_s_) | dest_bits; - // std::cout << "ENC: store imeed: " << imeed << "\n"; instr->setSrcImm(signExt(imeed, 12, s_imm_mask_)); - - trace_inst->valid_inst = true; - trace_inst->rs1 = ((code >> shift_rs1_) & reg_mask_); - trace_inst->rs2 = ((code >> shift_rs2_) & reg_mask_); break; case InstType::B_TYPE: @@ -184,51 +187,34 @@ std::shared_ptr Decoder::decode(const std::vector &v, Size &idx, tr bit_12 = imm_bits >> 6; imeed = 0 | (bits_4_1 << 1) | (bit_10_5 << 5) | (bit_11 << 11) | (bit_12 << 12); - instr->setSrcImm(signExt(imeed, 13, b_imm_mask_)); - - trace_inst->valid_inst = true; - trace_inst->rs1 = ((code >> shift_rs1_) & reg_mask_); - trace_inst->rs2 = ((code >> shift_rs2_) & reg_mask_); break; case InstType::U_TYPE: instr->setDestReg((code >> shift_rd_) & reg_mask_); instr->setSrcImm(signExt(code >> shift_j_u_immed_, 20, u_imm_mask_)); - trace_inst->valid_inst = true; - trace_inst->rd = ((code >> shift_rd_) & reg_mask_); break; case InstType::J_TYPE: instr->setDestReg((code >> shift_rd_) & reg_mask_); - - // [20 | 10:1 | 11 | 19:12] - unordered = code >> shift_j_u_immed_; - bits_19_12 = unordered & 0xff; bit_11 = (unordered >> 8) & 0x1; bits_10_1 = (unordered >> 9) & 0x3ff; bit_20 = (unordered >> 19) & 0x1; - imeed = 0 | (bits_10_1 << 1) | (bit_11 << 11) | (bits_19_12 << 12) | (bit_20 << 20); - if (bit_20) { imeed |= ~j_imm_mask_; } - instr->setSrcImm(imeed); - - trace_inst->valid_inst = true; - trace_inst->rd = ((code >> shift_rd_) & reg_mask_); break; case InstType::V_TYPE: D(3, "Entered here: instr type = vector" << op); switch (op) { case Opcode::VSET_ARITH: //TODO: arithmetic ops - instr->setDestReg((code >> shift_rd_) & reg_mask_); - instr->setSrcReg((code >> shift_rs1_) & reg_mask_); + instr->setDestVReg((code >> shift_rd_) & reg_mask_); + instr->setSrcVReg((code >> shift_rs1_) & reg_mask_); func3 = (code >> shift_func3_) & func3_mask_; instr->setFunc3(func3); D(3, "Entered here: instr type = vector"); @@ -247,53 +233,34 @@ std::shared_ptr Decoder::decode(const std::vector &v, Size &idx, tr instr->setVsew((immed >> 2) & 0x3); D(3, "sew " << ((immed >> 2) & 0x3)); } else { - instr->setSrcReg((code >> shift_rs2_) & reg_mask_); - trace_inst->rs2 = ((code >> shift_rs2_) & reg_mask_); + instr->setSrcVReg((code >> shift_rs2_) & reg_mask_); } - trace_inst->valid_inst = true; - trace_inst->rs1 = ((code >> shift_rs1_) & reg_mask_); - trace_inst->rd = ((code >> shift_rd_) & reg_mask_); } else { - instr->setSrcReg((code >> shift_rs2_) & reg_mask_); + instr->setSrcVReg((code >> shift_rs2_) & reg_mask_); instr->setVmask((code >> shift_vmask_) & 0x1); instr->setFunc6((code >> shift_func6_) & func6_mask_); - - trace_inst->valid_inst = true; - trace_inst->rs1 = ((code >> shift_rs1_) & reg_mask_); - trace_inst->rs2 = ((code >> shift_rs2_) & reg_mask_); - trace_inst->rd = ((code >> shift_rd_) & reg_mask_); } break; case Opcode::VL: D(3, "vector load instr"); - instr->setDestReg((code >> shift_rd_) & reg_mask_); - instr->setSrcReg((code >> shift_rs1_) & reg_mask_); + instr->setDestVReg((code >> shift_rd_) & reg_mask_); + instr->setSrcVReg((code >> shift_rs1_) & reg_mask_); instr->setVlsWidth((code >> shift_func3_) & func3_mask_); - instr->setSrcReg((code >> shift_rs2_) & reg_mask_); + instr->setSrcVReg((code >> shift_rs2_) & reg_mask_); instr->setVmask((code >> shift_vmask_)); instr->setVmop((code >> shift_vmop_) & func3_mask_); instr->setVnf((code >> shift_vnf_) & func3_mask_); - - trace_inst->valid_inst = true; - trace_inst->rs1 = ((code >> shift_rs1_) & reg_mask_); - trace_inst->vd = ((code >> shift_rd_) & reg_mask_); - //trace_inst->vs2 = ((code>>shift_rs2_) & reg_mask_); break; case Opcode::VS: instr->setVs3((code >> shift_rd_) & reg_mask_); - instr->setSrcReg((code >> shift_rs1_) & reg_mask_); + instr->setSrcVReg((code >> shift_rs1_) & reg_mask_); instr->setVlsWidth((code >> shift_func3_) & func3_mask_); - instr->setSrcReg((code >> shift_rs2_) & reg_mask_); + instr->setSrcVReg((code >> shift_rs2_) & reg_mask_); instr->setVmask((code >> shift_vmask_)); instr->setVmop((code >> shift_vmop_) & func3_mask_); instr->setVnf((code >> shift_vnf_) & func3_mask_); - - trace_inst->valid_inst = true; - trace_inst->rs1 = ((code >> shift_rs1_) & reg_mask_); - //trace_inst->vd = ((code>>shift_rd_) & reg_mask_); - trace_inst->vs1 = ((code >> shift_rd_) & reg_mask_); //vs3 break; default: @@ -303,23 +270,47 @@ std::shared_ptr Decoder::decode(const std::vector &v, Size &idx, tr break; case R4_TYPE: // RT: add R4_TYPE decoder - instr->setDestReg((code >> shift_rd_) & reg_mask_); - instr->setSrcReg((code >> shift_rs1_) & reg_mask_); - instr->setSrcReg((code >> shift_rs2_) & reg_mask_); - instr->setSrcReg((code >> shift_rs3_) & reg_mask_); + instr->setDestFReg((code >> shift_rd_) & reg_mask_); + instr->setSrcFReg((code >> shift_rs1_) & reg_mask_); + instr->setSrcFReg((code >> shift_rs2_) & reg_mask_); + instr->setSrcFReg((code >> shift_rs3_) & reg_mask_); instr->setFunc3((code >> shift_func3_) & func3_mask_); - - trace_inst->valid_inst = true; - trace_inst->rs1 = ((code >> shift_rs1_) & reg_mask_); - trace_inst->rs2 = ((code >> shift_rs2_) & reg_mask_); - trace_inst->rs3 = ((code >> shift_rs3_) & reg_mask_); - trace_inst->rd = ((code >> shift_rd_) & reg_mask_); break; default: std::cout << "Unrecognized argument class in word decoder.\n"; std::abort(); } + if (curInstType != InstType::N_TYPE) { + trace_inst->valid = true; + if (instr->hasRDest()) { + if (instr->is_FpDest()) { + trace_inst->frd = instr->getRDest(); + } else if (instr->is_VDest()) { + trace_inst->vrd = instr->getRDest(); + } else { + trace_inst->ird = instr->getRDest(); + } + } + + for (int i = 0; i < instr->getNRSrc(); ++i) { + if (instr->is_FpSrc(i)) { + if (i == 0) trace_inst->frs1 = instr->getRSrc(i); + else if (i == 1) trace_inst->frs2 = instr->getRSrc(i); + else if (i == 2) trace_inst->frs3 = instr->getRSrc(i); + else std::abort(); + } else if (instr->is_VSrc(i)) { + if (i == 0) trace_inst->vrs1 = instr->getRSrc(i); + else if (i == 1) trace_inst->vrs2 = instr->getRSrc(i); + else std::abort(); + } else { + if (i == 0) trace_inst->irs1 = instr->getRSrc(i); + else if (i == 1) trace_inst->irs2 = instr->getRSrc(i); + else std::abort(); + } + } + } + D(2, "Decoded instr 0x" << std::hex << code << " into: " << instr << std::flush); return instr; diff --git a/simX/execute.cpp b/simX/execute.cpp index 9dd926a1..f21f2f6b 100644 --- a/simX/execute.cpp +++ b/simX/execute.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include "util.h" #include "warp.h" #include "instr.h" @@ -18,8 +19,8 @@ using namespace vortex; struct DivergentBranchException {}; static bool checkUnanimous(unsigned p, - const std::vector>> &m, - const std::vector &tm) { + const std::vector> &m, + const ThreadMask &tm) { bool same; unsigned i; for (i = 0; i < m.size(); ++i) { @@ -57,9 +58,13 @@ float intregToFloat(uint32_t input) { // Frac_value= 1 + sum{i = 1}{23}{b_{23-i}*2^{-i}} double frac_value; if (exp == 0) { // subnormal - if (frac == 0) // zero - if (sign) return -0.0; - else return 0.0; + if (frac == 0) { + // zero + if (sign) + return -0.0; + else + return 0.0; + } frac_value = 0.0; } else frac_value = 1.0; @@ -112,12 +117,13 @@ uint8_t fpBinIsNan(uint32_t din) { uint32_t fraction = din & 0x007FFFFF; uint32_t bit_22 = din & 0x00400000; - if ((expo==0xFF) && (fraction!=0)) + if ((expo==0xFF) && (fraction!=0)) { // if (!fsign && (fraction == 0x00400000)) - if(!fsign && (bit_22)) + if (!fsign && (bit_22)) return 1; // quiet NaN, return 1 else return 2; // signaling NaN, return 2 + } return 0; } @@ -127,11 +133,12 @@ uint8_t fpBinIsZero(uint32_t din) { uint32_t expo = (din>>23) & 0x000000FF; uint32_t fraction = din & 0x007FFFFF; - if ((expo==0) && (fraction==0)) + if ((expo==0) && (fraction==0)) { if (fsign) return 1; // negative 0 else return 2; // positive 0 + } return 0; // not zero } @@ -141,50 +148,39 @@ uint8_t fpBinIsInf(uint32_t din) { uint32_t expo = (din>>23) & 0x000000FF; uint32_t fraction = din & 0x007FFFFF; - if ((expo==0xFF) && (fraction==0)) + if ((expo==0xFF) && (fraction==0)) { if (fsign) return 1; // negative infinity else return 2; // positive infinity + } return 0; // not infinity } void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { - Size nextActiveThreads = activeThreads_; - Size wordSz = core_->arch().getWordSize(); - Word nextPc = pc_; - - memAccesses_.clear(); - - bool sjOnce(true); // Has not yet split or joined once. - bool pcSet(false); // PC has already been set + assert(tmask_.any()); + Word nextPC = PC_; + bool updatePC = false; + bool runOnce = false; Word func3 = instr.getFunc3(); Word func6 = instr.getFunc6(); Word func7 = instr.getFunc7(); Opcode opcode = instr.getOpcode(); - RegNum rdest = instr.getRDest(); - RegNum rsrc0 = instr.getRSrc(0); - RegNum rsrc1 = instr.getRSrc(1); - RegNum rsrc2 = instr.getRSrc(2); + int rdest = instr.getRDest(); + int rsrc0 = instr.getRSrc(0); + int rsrc1 = instr.getRSrc(1); + int rsrc2 = instr.getRSrc(2); Word immsrc = instr.getImm(); bool vmask = instr.getVmask(); - for (Size t = 0; t < activeThreads_; t++) { - std::vector> &iregs = iRegFile_[t]; - std::vector> &fregs = fRegFile_[t]; - - bool is_gpgpu = (opcode == GPGPU); - bool is_tmc = is_gpgpu && (func3 == 0); - bool is_wspawn = is_gpgpu && (func3 == 1); - bool is_barrier = is_gpgpu && (func3 == 4); - - bool not_active = !tmask_[t]; - bool gpgpu_zero = (is_tmc || is_barrier || is_wspawn) && (t != 0); - - if (not_active || gpgpu_zero) + for (std::size_t t = 0; t < tmask_.count(); t++) { + if (runOnce) continue; + + auto &iregs = iRegFile_[t]; + auto &fregs = fRegFile_[t]; ++insts_; @@ -194,19 +190,18 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { break; case R_INST: { // std::cout << "R_INST\n"; - Word m_exten = func7 & 0x1; - if (m_exten) { + Word is_mul_ext = func7 & 0x1; + if (is_mul_ext) { // std::cout << "FOUND A MUL/DIV\n"; - switch (func3) { case 0: // MUL - D(3, "MUL: r" << rdest << " <- r" << rsrc0 << ", r" << rsrc1); + D(3, "MUL: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1]); iregs[rdest] = ((int)iregs[rsrc0]) * ((int)iregs[rsrc1]); break; case 1: // MULH - D(3, "MULH: r" << rdest << " <- r" << rsrc0 << ", r" << rsrc1); + D(3, "MULH: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1]); { int64_t first = (int64_t)iregs[rsrc0]; if (iregs[rsrc0] & 0x80000000) { @@ -224,7 +219,7 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { break; case 2: // MULHSU - D(3, "MULHSU: r" << rdest << " <- r" << rsrc0 << ", r" << rsrc1); + D(3, "MULHSU: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1]); { int64_t first = (int64_t)iregs[rsrc0]; if (iregs[rsrc0] & 0x80000000) { @@ -236,7 +231,7 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { break; case 3: // MULHU - D(3, "MULHU: r" << rdest << " <- r" << rsrc0 << ", r" << rsrc1); + D(3, "MULHU: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1]); { uint64_t first = (uint64_t)iregs[rsrc0]; uint64_t second = (uint64_t)iregs[rsrc1]; @@ -246,7 +241,7 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { break; case 4: // DIV - D(3, "DIV: r" << rdest << " <- r" << rsrc0 << ", r" << rsrc1); + D(3, "DIV: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1]); if (iregs[rsrc1] == 0) { iregs[rdest] = -1; break; @@ -257,7 +252,7 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { break; case 5: // DIVU - D(3, "DIVU: r" << rdest << " <- r" << rsrc0 << ", r" << rsrc1); + D(3, "DIVU: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1]); if (iregs[rsrc1] == 0) { iregs[rdest] = -1; break; @@ -266,7 +261,7 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { break; case 6: // REM - D(3, "REM: r" << rdest << " <- r" << rsrc0 << ", r" << rsrc1); + D(3, "REM: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1]); if (iregs[rsrc1] == 0) { iregs[rdest] = iregs[rsrc0]; break; @@ -275,7 +270,7 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { break; case 7: // REMU - D(3, "REMU: r" << rdest << " <- r" << rsrc0 << ", r" << rsrc1); + D(3, "REMU: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1]); if (iregs[rsrc1] == 0) { iregs[rdest] = iregs[rsrc0]; break; @@ -291,22 +286,19 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { switch (func3) { case 0: if (func7) { - D(3, "SUBI: r" << rdest << " <- r" << rsrc0 << ", r" << rsrc1); + D(3, "SUBI: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1]); iregs[rdest] = iregs[rsrc0] - iregs[rsrc1]; - iregs[rdest].trunc(wordSz); } else { - D(3, "ADDI: r" << rdest << " <- r" << rsrc0 << ", r" << rsrc1); + D(3, "ADDI: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1]); iregs[rdest] = iregs[rsrc0] + iregs[rsrc1]; - iregs[rdest].trunc(wordSz); } break; case 1: - D(3, "SLLI: r" << rdest << " <- r" << rsrc0 << ", r" << rsrc1); + D(3, "SLLI: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1]); iregs[rdest] = iregs[rsrc0] << iregs[rsrc1]; - iregs[rdest].trunc(wordSz); break; case 2: - D(3, "SLTI: r" << rdest << " <- r" << rsrc0 << ", r" << rsrc1); + D(3, "SLTI: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1]); if (int(iregs[rsrc0]) < int(iregs[rsrc1])) { iregs[rdest] = 1; } else { @@ -314,34 +306,32 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { } break; case 3: - D(3, "SLTU: r" << rdest << " <- r" << rsrc0 << ", r" << rsrc1); - if (Word_u(iregs[rsrc0]) < Word_u(iregs[rsrc1])) { + D(3, "SLTU: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1]); + if (Word(iregs[rsrc0]) < Word(iregs[rsrc1])) { iregs[rdest] = 1; } else { iregs[rdest] = 0; } break; case 4: - D(3, "XORI: r" << rdest << " <- r" << rsrc0 << ", r" << rsrc1); + D(3, "XORI: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1]); iregs[rdest] = iregs[rsrc0] ^ iregs[rsrc1]; break; case 5: if (func7) { - D(3, "SRLI: r" << rdest << " <- r" << rsrc0 << ", r" << rsrc1); + D(3, "SRLI: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1]); iregs[rdest] = int(iregs[rsrc0]) >> int(iregs[rsrc1]); - iregs[rdest].trunc(wordSz); } else { - D(3, "SRLU: r" << rdest << " <- r" << rsrc0 << ", r" << rsrc1); - iregs[rdest] = Word_u(iregs[rsrc0]) >> Word_u(iregs[rsrc1]); - iregs[rdest].trunc(wordSz); + D(3, "SRLU: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1]); + iregs[rdest] = Word(iregs[rsrc0]) >> Word(iregs[rsrc1]); } break; case 6: - D(3, "ORI: r" << rdest << " <- r" << rsrc0 << ", r" << rsrc1); + D(3, "ORI: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1]); iregs[rdest] = iregs[rsrc0] | iregs[rsrc1]; break; case 7: - D(3, "ANDI: r" << rdest << " <- r" << rsrc0 << ", r" << rsrc1); + D(3, "AND: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1]); iregs[rdest] = iregs[rsrc0] & iregs[rsrc1]; break; default: @@ -350,58 +340,17 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { } } } break; - case L_INST: { - Word memAddr = ((iregs[rsrc0] + immsrc) & 0xFFFFFFFC); - Word shift_by = ((iregs[rsrc0] + immsrc) & 0x00000003) * 8; - Word data_read = core_->mem().read(memAddr, 0); - trace_inst->is_lw = true; - trace_inst->mem_addresses[t] = memAddr; - switch (func3) { - case 0: - // LBI - D(3, "LBI: r" << rdest << " <- r" << rsrc0 << ", imm=0x" << std::hex << immsrc); - iregs[rdest] = signExt((data_read >> shift_by) & 0xFF, 8, 0xFF); - break; - case 1: - // LWI - D(3, "LWI: r" << rdest << " <- r" << rsrc0 << ", imm=0x" << std::hex << immsrc); - iregs[rdest] = signExt((data_read >> shift_by) & 0xFFFF, 16, 0xFFFF); - break; - case 2: - // LDI - D(3, "LDI: r" << rdest << " <- r" << rsrc0 << ", imm=0x" << std::hex << immsrc); - iregs[rdest] = int(data_read & 0xFFFFFFFF); - break; - case 4: - // LBU - D(3, "LBU: r" << rdest << " <- r" << rsrc0 << ", imm=0x" << std::hex << immsrc); - iregs[rdest] = unsigned((data_read >> shift_by) & 0xFF); - break; - case 5: - // LWU - D(3, "LWU: r" << rdest << " <- r" << rsrc0 << ", imm=0x" << std::hex << immsrc); - iregs[rdest] = unsigned((data_read >> shift_by) & 0xFFFF); - break; - default: - std::cout << "ERROR: UNSUPPORTED L INST\n"; - std::abort(); - } - D(3, "LOAD MEM ADDRESS: " << std::hex << memAddr); - D(3, "LOAD MEM DATA: " << std::hex << data_read); - memAccesses_.push_back(Warp::MemAccess(false, memAddr)); - } break; case I_INST: //std::cout << "I_INST\n"; switch (func3) { case 0: // ADDI - D(3, "ADDI: r" << rdest << " <- r" << rsrc0 << ", imm=" << immsrc); + D(3, "ADDI: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", imm=" << immsrc); iregs[rdest] = iregs[rsrc0] + immsrc; - iregs[rdest].trunc(wordSz); break; case 2: // SLTI - D(3, "SLTI: r" << rdest << " <- r" << rsrc0 << ", imm=" << immsrc); + D(3, "SLTI: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", imm=" << immsrc); if (int(iregs[rsrc0]) < int(immsrc)) { iregs[rdest] = 1; } else { @@ -410,7 +359,7 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { break; case 3: { // SLTIU - D(3, "SLTIU: r" << rdest << " <- r" << rsrc0 << ", imm=" << immsrc); + D(3, "SLTIU: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", imm=" << immsrc); if (unsigned(iregs[rsrc0]) < unsigned(immsrc)) { iregs[rdest] = 1; } else { @@ -419,39 +368,36 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { } break; case 4: // XORI - D(3, "XORI: r" << rdest << " <- r" << rsrc0 << ", imm=0x" << std::hex << immsrc); + D(3, "XORI: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", imm=0x" << std::hex << immsrc); iregs[rdest] = iregs[rsrc0] ^ immsrc; break; case 6: // ORI - D(3, "ORI: r" << rdest << " <- r" << rsrc0 << ", imm=0x" << std::hex << immsrc); + D(3, "ORI: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", imm=0x" << std::hex << immsrc); iregs[rdest] = iregs[rsrc0] | immsrc; break; case 7: // ANDI - D(3, "ANDI: r" << rdest << " <- r" << rsrc0 << ", imm=0x" << std::hex << immsrc); + D(3, "ANDI: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", imm=0x" << std::hex << immsrc); iregs[rdest] = iregs[rsrc0] & immsrc; break; case 1: // SLLI - D(3, "SLLI: r" << rdest << " <- r" << rsrc0 << ", imm=0x" << std::hex << immsrc); + D(3, "SLLI: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", imm=0x" << std::hex << immsrc); iregs[rdest] = iregs[rsrc0] << immsrc; - iregs[rdest].trunc(wordSz); break; case 5: if ((func7 == 0)) { // SRLI - D(3, "SRLI: r" << rdest << " <- r" << rsrc0 << ", imm=" << immsrc); - Word result = Word_u(iregs[rsrc0]) >> Word_u(immsrc); + D(3, "SRLI: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", imm=" << immsrc); + Word result = Word(iregs[rsrc0]) >> Word(immsrc); iregs[rdest] = result; - iregs[rdest].trunc(wordSz); } else { // SRAI - D(3, "SRAI: r" << rdest << " <- r" << rsrc0 << ", imm=" << immsrc); + D(3, "SRAI: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", imm=" << immsrc); Word op1 = iregs[rsrc0]; Word op2 = immsrc; iregs[rdest] = op1 >> op2; - iregs[rdest].trunc(wordSz); } break; default: @@ -459,31 +405,64 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { std::abort(); } break; + case L_INST: { + ++loads_; + Word memAddr = ((iregs[rsrc0] + immsrc) & 0xFFFFFFFC); + Word shift_by = ((iregs[rsrc0] + immsrc) & 0x00000003) * 8; + Word data_read = core_->mem().read(memAddr, 0); + trace_inst->is_lw = true; + trace_inst->mem_addresses[t] = memAddr; + switch (func3) { + case 0: + // LBI + D(3, "LBI: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", imm=0x" << std::hex << immsrc); + iregs[rdest] = signExt((data_read >> shift_by) & 0xFF, 8, 0xFF); + break; + case 1: + // LWI + D(3, "LHI: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", imm=0x" << std::hex << immsrc); + iregs[rdest] = signExt((data_read >> shift_by) & 0xFFFF, 16, 0xFFFF); + break; + case 2: + // LDI + D(3, "LWI: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", imm=0x" << std::hex << immsrc); + iregs[rdest] = int(data_read & 0xFFFFFFFF); + break; + case 4: + // LBU + D(3, "LBU: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", imm=0x" << std::hex << immsrc); + iregs[rdest] = unsigned((data_read >> shift_by) & 0xFF); + break; + case 5: + // LWU + D(3, "LHU: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", imm=0x" << std::hex << immsrc); + iregs[rdest] = unsigned((data_read >> shift_by) & 0xFFFF); + break; + default: + std::cout << "ERROR: UNSUPPORTED L INST\n"; + std::abort(); + } + D(3, "LOAD MEM ADDRESS: " << std::hex << memAddr); + } break; case S_INST: { ++stores_; Word memAddr = iregs[rsrc0] + immsrc; trace_inst->is_sw = true; trace_inst->mem_addresses[t] = memAddr; - // //std::cout << "FUNC3: " << func3 << "\n"; - if ((memAddr == 0x00010000) && (t == 0)) { - Word num = iregs[rsrc1]; - fprintf(stderr, "%c", (char)num); - break; - } switch (func3) { case 0: // SB - D(3, "SB: r" << rsrc1 << " <- r" << rsrc0 << ", imm=0x" << std::hex << immsrc); + D(3, "SB: r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1] << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", imm=0x" << std::hex << immsrc); core_->mem().write(memAddr, iregs[rsrc1] & 0x000000FF, 0, 1); break; case 1: // SH - D(3, "SH: r" << rsrc1 << " <- r" << rsrc0 << ", imm=0x" << std::hex << immsrc); + D(3, "SH: r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1] << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", imm=0x" << std::hex << immsrc); core_->mem().write(memAddr, iregs[rsrc1], 0, 2); break; case 2: - // SD - D(3, "SD: r" << rsrc1 << " <- r" << rsrc0 << ", imm=0x" << std::hex << immsrc); + // SW + D(3, "SW: r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1] << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", imm=0x" << std::hex << immsrc); core_->mem().write(memAddr, iregs[rsrc1], 0, 4); break; default: @@ -491,101 +470,100 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { std::abort(); } D(3, "STORE MEM ADDRESS: " << std::hex << memAddr); - memAccesses_.push_back(Warp::MemAccess(true, memAddr)); } break; case B_INST: trace_inst->stall_warp = true; switch (func3) { case 0: // BEQ - D(3, "BEQ: r" << rsrc0 << ", r" << rsrc1 << ", imm=0x" << std::hex << immsrc); + D(3, "BEQ: r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1] << ", imm=0x" << std::hex << immsrc); if (int(iregs[rsrc0]) == int(iregs[rsrc1])) { - if (!pcSet) - nextPc = (pc_ - 4) + immsrc; - pcSet = true; + if (!updatePC) + nextPC = (PC_ - 4) + immsrc; + updatePC = true; } break; case 1: // BNE - D(3, "BNE: r" << rsrc0 << ", r" << rsrc1 << ", imm=0x" << std::hex << immsrc); + D(3, "BNE: r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1] << ", imm=0x" << std::hex << immsrc); if (int(iregs[rsrc0]) != int(iregs[rsrc1])) { - if (!pcSet) - nextPc = (pc_ - 4) + immsrc; - pcSet = true; + if (!updatePC) + nextPC = (PC_ - 4) + immsrc; + updatePC = true; } break; case 4: // BLT - D(3, "BLT: r" << rsrc0 << ", r" << rsrc1 << ", imm=0x" << std::hex << immsrc); + D(3, "BLT: r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1] << ", imm=0x" << std::hex << immsrc); if (int(iregs[rsrc0]) < int(iregs[rsrc1])) { - if (!pcSet) - nextPc = (pc_ - 4) + immsrc; - pcSet = true; + if (!updatePC) + nextPC = (PC_ - 4) + immsrc; + updatePC = true; } break; case 5: // BGE - D(3, "BGE: r" << rsrc0 << ", r" << rsrc1 << ", imm=0x" << std::hex << immsrc); + D(3, "BGE: r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1] << ", imm=0x" << std::hex << immsrc); if (int(iregs[rsrc0]) >= int(iregs[rsrc1])) { - if (!pcSet) - nextPc = (pc_ - 4) + immsrc; - pcSet = true; + if (!updatePC) + nextPC = (PC_ - 4) + immsrc; + updatePC = true; } break; case 6: // BLTU - D(3, "BLTU: r" << rsrc0 << ", r" << rsrc1 << ", imm=0x" << std::hex << immsrc); - if (Word_u(iregs[rsrc0]) < Word_u(iregs[rsrc1])) { - if (!pcSet) - nextPc = (pc_ - 4) + immsrc; - pcSet = true; + D(3, "BLTU: r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1] << ", imm=0x" << std::hex << immsrc); + if (Word(iregs[rsrc0]) < Word(iregs[rsrc1])) { + if (!updatePC) + nextPC = (PC_ - 4) + immsrc; + updatePC = true; } break; case 7: // BGEU - D(3, "BGEU: r" << rsrc0 << ", r" << rsrc1 << ", imm=0x" << std::hex << immsrc); - if (Word_u(iregs[rsrc0]) >= Word_u(iregs[rsrc1])) { - if (!pcSet) - nextPc = (pc_ - 4) + immsrc; - pcSet = true; + D(3, "BGEU: r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1] << ", imm=0x" << std::hex << immsrc); + if (Word(iregs[rsrc0]) >= Word(iregs[rsrc1])) { + if (!updatePC) + nextPC = (PC_ - 4) + immsrc; + updatePC = true; } break; } break; case LUI_INST: - D(3, "LUI: r" << rdest << " <- imm=0x" << std::hex << immsrc); + D(3, "LUI: r" << std::dec << rdest << " <- imm=0x" << std::hex << immsrc); iregs[rdest] = (immsrc << 12) & 0xfffff000; break; case AUIPC_INST: - D(3, "AUIPC: r" << rdest << " <- imm=0x" << std::hex << immsrc); - iregs[rdest] = ((immsrc << 12) & 0xfffff000) + (pc_ - 4); + D(3, "AUIPC: r" << std::dec << rdest << " <- imm=0x" << std::hex << immsrc); + iregs[rdest] = ((immsrc << 12) & 0xfffff000) + (PC_ - 4); break; case JAL_INST: - D(3, "JAL: r" << rdest << " <- imm=0x" << std::hex << immsrc); + D(3, "JAL: r" << std::dec << rdest << " <- imm=0x" << std::hex << immsrc); trace_inst->stall_warp = true; - if (!pcSet) { - nextPc = (pc_ - 4) + immsrc; - //std::cout << "JAL... SETTING PC: " << nextPc << "\n"; + if (!updatePC) { + nextPC = (PC_ - 4) + immsrc; + //std::cout << "JAL... SETTING PC: " << nextPC << "\n"; } if (rdest != 0) { - iregs[rdest] = pc_; + iregs[rdest] = PC_; } - pcSet = true; + updatePC = true; break; case JALR_INST: - D(3, "JALR: r" << rdest << " <- r" << rsrc0 << ", imm=0x" << std::hex << immsrc); + D(3, "JALR: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", imm=0x" << std::hex << immsrc); trace_inst->stall_warp = true; - if (!pcSet) { - nextPc = iregs[rsrc0] + immsrc; - //std::cout << "JALR... SETTING PC: " << nextPc << "\n"; + if (!updatePC) { + nextPC = iregs[rsrc0] + immsrc; + //std::cout << "JALR... SETTING PC: " << nextPC << "\n"; } if (rdest != 0) { - iregs[rdest] = pc_; + iregs[rdest] = PC_; } - pcSet = true; + updatePC = true; break; case SYS_INST: { - D(3, "SYS_INST: r" << rdest << " <- r" << rsrc0 << ", imm=0x" << std::hex << immsrc); + D(3, "SYS_INST: r" << std::dec << rdest << " <- r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", imm=0x" << std::hex << immsrc); Word rs1 = iregs[rsrc0]; Word csr_addr = immsrc & 0x00000FFF; // GPGPU CSR extension @@ -594,37 +572,35 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { iregs[rdest] = t; } else if (csr_addr == CSR_LTID) { // Core threadID - iregs[rdest] = t + - id_ * core_->arch().getNumThreads(); + iregs[rdest] = t + (id_ * core_->arch().num_threads()); } else if (csr_addr == CSR_GTID) { // Processor threadID - iregs[rdest] = t + - id_ * core_->arch().getNumThreads() + - core_->arch().getNumThreads() * core_->arch().getNumWarps() * core_->id(); + iregs[rdest] = t + (id_ * core_->arch().num_threads()) + + (core_->arch().num_threads() * core_->arch().num_warps() * core_->id()); } else if (csr_addr == CSR_LWID) { // Core warpID iregs[rdest] = id_; } else if (csr_addr == CSR_GWID) { // Processor warpID - iregs[rdest] = id_ + core_->arch().getNumWarps() * core_->id(); + iregs[rdest] = id_ + (core_->arch().num_warps() * core_->id()); } else if (csr_addr == CSR_GCID) { // Processor coreID iregs[rdest] = core_->id(); } else if (csr_addr == CSR_NT) { // Number of threads per warp - iregs[rdest] = core_->arch().getNumThreads(); + iregs[rdest] = core_->arch().num_threads(); } else if (csr_addr == CSR_NW) { // Number of warps per core - iregs[rdest] = core_->arch().getNumWarps(); + iregs[rdest] = core_->arch().num_warps(); } else if (csr_addr == CSR_NC) { // Number of cores - iregs[rdest] = core_->arch().getNumCores(); + iregs[rdest] = core_->arch().num_cores(); } else if (csr_addr == CSR_INSTRET) { // NumInsts - iregs[rdest] = (Word)core_->num_instructions(); + iregs[rdest] = (Word)core_->num_insts(); } else if (csr_addr == CSR_INSTRET_H) { // NumInsts - iregs[rdest] = (Word)(core_->num_instructions() >> 32); + iregs[rdest] = (Word)(core_->num_insts() >> 32); } else if (csr_addr == CSR_CYCLE) { // NumCycles iregs[rdest] = (Word)core_->num_steps(); @@ -636,8 +612,7 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { case 0: if (csr_addr < 2) { // ECALL/EBREAK - nextActiveThreads = 0; - spawned_ = false; + tmask_.reset(); } break; case 1: @@ -691,1119 +666,899 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { D(3, "FENCE"); break; case PJ_INST: - D(3, "PJ_INST: r" << rsrc0 << ", r" << rsrc1); + D(3, "PJ_INST: r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1]); if (iregs[rsrc0]) { - if (!pcSet) - nextPc = iregs[rsrc1]; - pcSet = true; + if (!updatePC) + nextPC = iregs[rsrc1]; + updatePC = true; } break; case GPGPU: switch (func3) { - case 1: - // WSPAWN - D(3, "WSPAWN: r" << rsrc0 << ", r" << rsrc1); - trace_inst->wspawn = true; - if (sjOnce) { - sjOnce = false; - unsigned num_to_wspawn = std::min(iregs[rsrc0], core_->arch().getNumWarps()); - D(0, "Spawning " << num_to_wspawn << " new warps at PC: " << std::hex << iregs[rsrc1]); - for (unsigned i = 1; i < num_to_wspawn; ++i) { - Warp &newWarp(core_->warp(i)); - { - newWarp.set_pc(iregs[rsrc1]); - for (size_t kk = 0; kk < tmask_.size(); kk++) { - if (kk == 0) { - newWarp.setTmask(kk, true); - } else { - newWarp.setTmask(kk, false); - } - } - newWarp.setActiveThreads(1); - newWarp.setSpawned(true); - } - } - break; + case 0: { + // TMC + D(3, "TMC: r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0]); + trace_inst->stall_warp = true; + int active_threads = std::min(iregs[rsrc0], core_->arch().num_threads()); + tmask_.reset(); + for (int i = 0; i < active_threads; ++i) { + tmask_[i] = true; } - break; + runOnce = true; + } break; + case 1: { + // WSPAWN + D(3, "WSPAWN: r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0] << ", r" << std::dec << rsrc1 << "=0x" << std::hex << iregs[rsrc1]); + trace_inst->wspawn = true; + int active_warps = std::min(iregs[rsrc0], core_->arch().num_warps()); + D(0, "Spawning " << (active_warps-1) << " warps at PC: " << std::hex << iregs[rsrc1]); + + for (int i = 1; i < active_warps; ++i) { + Warp &newWarp = core_->warp(i); + newWarp.setPC(iregs[rsrc1]); + newWarp.setTmask(0, true); + } + runOnce = true; + } break; case 2: { // SPLIT - D(3, "SPLIT: r" << rsrc0); + D(3, "SPLIT: r" << std::dec << rsrc0 << "=0x" << std::hex << iregs[rsrc0]); trace_inst->stall_warp = true; - if (sjOnce) { - sjOnce = false; - if (checkUnanimous(rsrc0, iRegFile_, tmask_)) { - D(3, "Unanimous pred: " << rsrc0 << " val: " << iregs[rsrc0] << "\n"); - DomStackEntry e(tmask_); - e.uni = true; - domStack_.push(e); - break; - } - D(3, "Split: Original TM: "); - DX( for (auto y : tmask_) D(3, y << " "); ) - - DomStackEntry e(rsrc0, iRegFile_, tmask_, pc_); - domStack_.push(tmask_); + if (checkUnanimous(rsrc0, iRegFile_, tmask_)) { + D(3, "Unanimous pred: " << rsrc0 << " val: " << iregs[rsrc0] << "\n"); + DomStackEntry e(tmask_); + e.unanimous = true; domStack_.push(e); - for (unsigned i = 0; i < e.tmask.size(); ++i) { - tmask_[i] = !e.tmask[i] && tmask_[i]; - } - - D(3, "Split: New TM"); - DX( for (auto y : tmask_) D(3, y << " "); ) - D(3, "Split: Pushed TM PC: " << std::hex << e.pc << std::dec << "\n"); - DX( for (auto y : e.tmask) D(3, y << " "); ) + break; } - break; - } - case 3: + + D(3, "Split: Original TM: "); + DX( for (int i = 0; i < core_->arch().num_threads(); ++i) D(3, tmask_[i] << " "); ) + + ThreadMask tmask; + for (int i = 0; i < core_->arch().num_threads(); ++i) { + tmask[i] = tmask_[i] && !iRegFile_[i][rsrc0]; + } + + DomStackEntry e(tmask, PC_); + domStack_.push(tmask_); + domStack_.push(e); + for (unsigned i = 0; i < e.tmask.size(); ++i) { + tmask_[i] = !e.tmask[i] && tmask_[i]; + } + + D(3, "Split: New TM"); + DX( for (int i = 0; i < core_->arch().num_threads(); ++i) D(3, tmask_[i] << " "); ) + D(3, "Split: Pushed TM PC: " << std::hex << e.PC << std::dec << "\n"); + DX( for (int i = 0; i < core_->arch().num_threads(); ++i) D(3, e.tmask[i] << " "); ) + + runOnce = true; + } break; + case 3: { // JOIN D(3, "JOIN"); - if (sjOnce) { - sjOnce = false; - if (!domStack_.empty() && domStack_.top().uni) { - D(2, "Uni branch at join"); - printf("NEW DOMESTACK: \n"); - tmask_ = domStack_.top().tmask; - domStack_.pop(); - break; - } - if (!domStack_.top().fallThrough) { - if (!pcSet) { - nextPc = domStack_.top().pc; - D(3, "join: NOT FALLTHROUGH PC: " << std::hex << nextPc << std::dec); - } - pcSet = true; - } - - D(3, "Join: Old TM: "); - DX( for (auto y : tmask_) D(3, y << " "); ) - std::cout << "\n"; + if (!domStack_.empty() && domStack_.top().unanimous) { + D(2, "Uni branch at join"); + printf("NEW DOMESTACK: \n"); tmask_ = domStack_.top().tmask; - - D(3, "Join: New TM: "); - DX( for (auto y : tmask_) D(3, y << " "); ) - domStack_.pop(); + break; } - break; - case 4: - trace_inst->stall_warp = true; - // is_barrier - break; - case 0: - // TMC - D(3, "TMC: r" << rsrc0); - trace_inst->stall_warp = true; - nextActiveThreads = std::min(iregs[rsrc0], core_->arch().getNumThreads()); - { - for (size_t ff = 0; ff < tmask_.size(); ff++) { - if (ff < nextActiveThreads) { - tmask_[ff] = true; - } else { - tmask_[ff] = false; - } + + if (!domStack_.top().fallThrough) { + if (!updatePC) { + nextPC = domStack_.top().PC; + D(3, "join: NOT FALLTHROUGH PC: " << std::hex << nextPC << std::dec); } + updatePC = true; } - if (nextActiveThreads == 0) { - spawned_ = false; - } - break; + + D(3, "Join: Old TM: "); + DX( for (int i = 0; i < core_->arch().num_threads(); ++i) D(3, tmask_[i] << " "); ) + std::cout << "\n"; + tmask_ = domStack_.top().tmask; + + D(3, "Join: New TM: "); + DX( for (int i = 0; i < core_->arch().num_threads(); ++i) D(3, tmask_[i] << " "); ) + + domStack_.pop(); + runOnce = true; + } break; + case 4: { + // is_barrier + trace_inst->stall_warp = true; + runOnce = true; + } break; default: std::cout << "ERROR: UNSUPPORTED GPGPU INSTRUCTION " << instr << "\n"; } break; case VSET_ARITH: { D(3, "VSET_ARITH"); - int VLMAX = (instr.getVlmul() * VLEN_) / instr.getVsew(); + int VLEN = core_->arch().vsize() * 8; + int VLMAX = (instr.getVlmul() * VLEN) / instr.getVsew(); switch (func3) { case 0: // vector-vector - trace_inst->vs1 = rsrc0; - trace_inst->vs2 = rsrc1; - trace_inst->vd = rdest; switch (func6) { case 0: { - D(3, "Addition " << rsrc0 << " " << rsrc1 << " Dest:" << rdest); - std::vector> &vr1 = vregFile_[rsrc0]; - std::vector> &vr2 = vregFile_[rsrc1]; - std::vector> &vd = vregFile_[rdest]; - std::vector> &mask = vregFile_[0]; + D(4, "Addition " << rsrc0 << " " << rsrc1 << " Dest:" << rdest); + auto& vr1 = vRegFile_[rsrc0]; + auto& vr2 = vRegFile_[rsrc1]; + auto& vd = vRegFile_[rdest]; + auto& mask = vRegFile_[0]; if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { - uint8_t *mask_ptr = (uint8_t *)mask[i].value(); - uint8_t value = (*mask_ptr & 0x1); + uint8_t emask = *(uint8_t *)(mask.data() + i); + uint8_t value = emask & 0x1; if (vmask || (!vmask && value)) { - uint8_t *first_ptr = (uint8_t *)vr1[i].value(); - uint8_t *second_ptr = (uint8_t *)vr2[i].value(); - uint8_t result = *first_ptr + *second_ptr; - D(3, "Adding " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint8_t *result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = result; + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = first + second; + D(4, "Adding " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; } } - } else if (vtype_.vsew == 16) { for (int i = 0; i < vl_; i++) { - uint16_t *mask_ptr = (uint16_t *)mask[i].value(); - uint16_t value = (*mask_ptr & 0x1); + uint16_t emask = *(uint16_t *)(mask.data() + i); + uint16_t value = emask & 0x1; if (vmask || (!vmask && value)) { - uint16_t *first_ptr = (uint16_t *)vr1[i].value(); - uint16_t *second_ptr = (uint16_t *)vr2[i].value(); - uint16_t result = *first_ptr + *second_ptr; - D(3, "Adding " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint16_t *result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = result; + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = first + second; + D(4, "Adding " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; } } } else if (vtype_.vsew == 32) { - D(3, "Doing 32 bit vector addition"); + D(4, "Doing 32 bit vector addition"); for (int i = 0; i < vl_; i++) { - int *mask_ptr = (int *)mask[i].value(); - int value = (*mask_ptr & 0x1); + uint32_t emask = *(uint32_t *)(mask.data() + i); + uint32_t value = emask & 0x1; if (vmask || (!vmask && value)) { - int *first_ptr = (int *)vr1[i].value(); - int *second_ptr = (int *)vr2[i].value(); - int result = *first_ptr + *second_ptr; - D(3, "Adding " << *first_ptr << " + " << *second_ptr << " = " << result); - - int *result_ptr = (int *)vd[i].value(); - *result_ptr = result; + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = first + second; + D(4, "Adding " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; } } - } - - DX( - D(3, "Vector Register state after addition:"); - for (size_t i = 0; i < vregFile_.size(); i++) { - for (size_t j = 0; j < vregFile_[0].size(); j++) { - if (vtype_.vsew == 8) { - uint8_t *ptr_val = (uint8_t *)vregFile_[i][j].value(); - D(3, "reg[" << i << "][" << j << "] = " << *ptr_val); - } else if (vtype_.vsew == 16) { - uint16_t *ptr_val = (uint16_t *)vregFile_[i][j].value(); - D(3, "reg[" << i << "][" << j << "] = " << *ptr_val); - } else if (vtype_.vsew == 32) { - uint32_t *ptr_val = (uint32_t *)vregFile_[i][j].value(); - D(3, "reg[" << i << "][" << j << "] = " << *ptr_val); - } - } - } - D(3, "After vector register state after addition"); - ) - + } } break; case 24: //vmseq { - std::vector> &vr1 = vregFile_[rsrc0]; - std::vector> &vr2 = vregFile_[rsrc1]; - std::vector> &vd = vregFile_[rdest]; + auto &vr1 = vRegFile_[rsrc0]; + auto &vr2 = vRegFile_[rsrc1]; + auto &vd = vRegFile_[rdest]; if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { - uint8_t *first_ptr = (uint8_t *)vr1[i].value(); - uint8_t *second_ptr = (uint8_t *)vr2[i].value(); - uint8_t result = (*first_ptr == *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint8_t *result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = result; + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first == second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 16) { for (int i = 0; i < vl_; i++) { - uint16_t *first_ptr = (uint16_t *)vr1[i].value(); - uint16_t *second_ptr = (uint16_t *)vr2[i].value(); - uint16_t result = (*first_ptr == *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint16_t *result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = result; + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first == second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 32) { for (int i = 0; i < vl_; i++) { - uint32_t *first_ptr = (uint32_t *)vr1[i].value(); - uint32_t *second_ptr = (uint32_t *)vr2[i].value(); - uint32_t result = (*first_ptr == *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint32_t *result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = result; + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first == second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; } } } break; case 25: //vmsne { - std::vector> &vr1 = vregFile_[rsrc0]; - std::vector> &vr2 = vregFile_[rsrc1]; - std::vector> &vd = vregFile_[rdest]; + auto &vr1 = vRegFile_[rsrc0]; + auto &vr2 = vRegFile_[rsrc1]; + auto &vd = vRegFile_[rdest]; if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { - uint8_t *first_ptr = (uint8_t *)vr1[i].value(); - uint8_t *second_ptr = (uint8_t *)vr2[i].value(); - uint8_t result = (*first_ptr != *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint8_t *result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = result; + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first != second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 16) { for (int i = 0; i < vl_; i++) { - uint16_t *first_ptr = (uint16_t *)vr1[i].value(); - uint16_t *second_ptr = (uint16_t *)vr2[i].value(); - uint16_t result = (*first_ptr != *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint16_t *result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = result; + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first != second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 32) { for (int i = 0; i < vl_; i++) { - uint32_t *first_ptr = (uint32_t *)vr1[i].value(); - uint32_t *second_ptr = (uint32_t *)vr2[i].value(); - uint32_t result = (*first_ptr != *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint32_t *result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = result; + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first != second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; } } } break; case 26: //vmsltu { - std::vector> &vr1 = vregFile_[rsrc0]; - std::vector> &vr2 = vregFile_[rsrc1]; - std::vector> &vd = vregFile_[rdest]; + auto &vr1 = vRegFile_[rsrc0]; + auto &vr2 = vRegFile_[rsrc1]; + auto &vd = vRegFile_[rdest]; if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { - uint8_t *first_ptr = (uint8_t *)vr1[i].value(); - uint8_t *second_ptr = (uint8_t *)vr2[i].value(); - uint8_t result = (*first_ptr < *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint8_t *result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = result; + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first < second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 16) { for (int i = 0; i < vl_; i++) { - uint16_t *first_ptr = (uint16_t *)vr1[i].value(); - uint16_t *second_ptr = (uint16_t *)vr2[i].value(); - uint16_t result = (*first_ptr < *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint16_t *result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = result; + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first < second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 32) { for (int i = 0; i < vl_; i++) { - uint32_t *first_ptr = (uint32_t *)vr1[i].value(); - uint32_t *second_ptr = (uint32_t *)vr2[i].value(); - uint32_t result = (*first_ptr < *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint32_t *result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = result; + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first < second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; } } } break; case 27: //vmslt { - std::vector> &vr1 = vregFile_[rsrc0]; - std::vector> &vr2 = vregFile_[rsrc1]; - std::vector> &vd = vregFile_[rdest]; + auto &vr1 = vRegFile_[rsrc0]; + auto &vr2 = vRegFile_[rsrc1]; + auto &vd = vRegFile_[rdest]; if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { - int8_t *first_ptr = (int8_t *)vr1[i].value(); - int8_t *second_ptr = (int8_t *)vr2[i].value(); - int8_t result = (*first_ptr < *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - int8_t *result_ptr = (int8_t *)vd[i].value(); - *result_ptr = result; + int8_t first = *(int8_t *)(vr1.data() + i); + int8_t second = *(int8_t *)(vr2.data() + i); + int8_t result = (first < second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; } - } else if (vtype_.vsew == 16) { for (int i = 0; i < vl_; i++) { - int16_t *first_ptr = (int16_t *)vr1[i].value(); - int16_t *second_ptr = (int16_t *)vr2[i].value(); - int16_t result = (*first_ptr < *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - int16_t *result_ptr = (int16_t *)vd[i].value(); - *result_ptr = result; + int16_t first = *(int16_t *)(vr1.data() + i); + int16_t second = *(int16_t *)(vr2.data() + i); + int16_t result = (first < second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(int16_t *)(vd.data() + i) = result; } - } else if (vtype_.vsew == 32) { for (int i = 0; i < vl_; i++) { - int32_t *first_ptr = (int32_t *)vr1[i].value(); - int32_t *second_ptr = (int32_t *)vr2[i].value(); - int32_t result = (*first_ptr < *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - int32_t *result_ptr = (int32_t *)vd[i].value(); - *result_ptr = result; + int32_t first = *(int32_t *)(vr1.data() + i); + int32_t second = *(int32_t *)(vr2.data() + i); + int32_t result = (first < second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(int32_t *)(vd.data() + i) = result; } } } break; case 28: //vmsleu { - std::vector> &vr1 = vregFile_[rsrc0]; - std::vector> &vr2 = vregFile_[rsrc1]; - std::vector> &vd = vregFile_[rdest]; + auto &vr1 = vRegFile_[rsrc0]; + auto &vr2 = vRegFile_[rsrc1]; + auto &vd = vRegFile_[rdest]; if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { - uint8_t *first_ptr = (uint8_t *)vr1[i].value(); - uint8_t *second_ptr = (uint8_t *)vr2[i].value(); - uint8_t result = (*first_ptr <= *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint8_t *result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = result; + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first <= second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 16) { for (int i = 0; i < vl_; i++) { - uint16_t *first_ptr = (uint16_t *)vr1[i].value(); - uint16_t *second_ptr = (uint16_t *)vr2[i].value(); - uint16_t result = (*first_ptr <= *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint16_t *result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = result; + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first <= second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 32) { for (int i = 0; i < vl_; i++) { - uint32_t *first_ptr = (uint32_t *)vr1[i].value(); - uint32_t *second_ptr = (uint32_t *)vr2[i].value(); - uint32_t result = (*first_ptr <= *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint32_t *result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = result; + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first <= second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; } } } break; case 29: //vmsle { - std::vector> &vr1 = vregFile_[rsrc0]; - std::vector> &vr2 = vregFile_[rsrc1]; - std::vector> &vd = vregFile_[rdest]; + auto &vr1 = vRegFile_[rsrc0]; + auto &vr2 = vRegFile_[rsrc1]; + auto &vd = vRegFile_[rdest]; if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { - int8_t *first_ptr = (int8_t *)vr1[i].value(); - int8_t *second_ptr = (int8_t *)vr2[i].value(); - int8_t result = (*first_ptr <= *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - int8_t *result_ptr = (int8_t *)vd[i].value(); - *result_ptr = result; + int8_t first = *(int8_t *)(vr1.data() + i); + int8_t second = *(int8_t *)(vr2.data() + i); + int8_t result = (first <= second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 16) { for (int i = 0; i < vl_; i++) { - int16_t *first_ptr = (int16_t *)vr1[i].value(); - int16_t *second_ptr = (int16_t *)vr2[i].value(); - int16_t result = (*first_ptr <= *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - int16_t *result_ptr = (int16_t *)vd[i].value(); - *result_ptr = result; + int16_t first = *(int16_t *)(vr1.data() + i); + int16_t second = *(int16_t *)(vr2.data() + i); + int16_t result = (first <= second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(int16_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 32) { for (int i = 0; i < vl_; i++) { - int32_t *first_ptr = (int32_t *)vr1[i].value(); - int32_t *second_ptr = (int32_t *)vr2[i].value(); - int32_t result = (*first_ptr <= *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - int32_t *result_ptr = (int32_t *)vd[i].value(); - *result_ptr = result; + int32_t first = *(int32_t *)(vr1.data() + i); + int32_t second = *(int32_t *)(vr2.data() + i); + int32_t result = (first <= second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(int32_t *)(vd.data() + i) = result; } } } break; case 30: //vmsgtu { - std::vector> &vr1 = vregFile_[rsrc0]; - std::vector> &vr2 = vregFile_[rsrc1]; - std::vector> &vd = vregFile_[rdest]; + auto &vr1 = vRegFile_[rsrc0]; + auto &vr2 = vRegFile_[rsrc1]; + auto &vd = vRegFile_[rdest]; if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { - uint8_t *first_ptr = (uint8_t *)vr1[i].value(); - uint8_t *second_ptr = (uint8_t *)vr2[i].value(); - uint8_t result = (*first_ptr > *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint8_t *result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = result; + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first > second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 16) { for (int i = 0; i < vl_; i++) { - uint16_t *first_ptr = (uint16_t *)vr1[i].value(); - uint16_t *second_ptr = (uint16_t *)vr2[i].value(); - uint16_t result = (*first_ptr > *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint16_t *result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = result; + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first > second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 32) { for (int i = 0; i < vl_; i++) { - uint32_t *first_ptr = (uint32_t *)vr1[i].value(); - uint32_t *second_ptr = (uint32_t *)vr2[i].value(); - uint32_t result = (*first_ptr > *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint32_t *result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = result; + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first > second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; } } } break; case 31: //vmsgt { - std::vector> &vr1 = vregFile_[rsrc0]; - std::vector> &vr2 = vregFile_[rsrc1]; - std::vector> &vd = vregFile_[rdest]; + auto &vr1 = vRegFile_[rsrc0]; + auto &vr2 = vRegFile_[rsrc1]; + auto &vd = vRegFile_[rdest]; if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { - int8_t *first_ptr = (int8_t *)vr1[i].value(); - int8_t *second_ptr = (int8_t *)vr2[i].value(); - int8_t result = (*first_ptr > *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - int8_t *result_ptr = (int8_t *)vd[i].value(); - *result_ptr = result; + int8_t first = *(int8_t *)(vr1.data() + i); + int8_t second = *(int8_t *)(vr2.data() + i); + int8_t result = (first > second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 16) { for (int i = 0; i < vl_; i++) { - int16_t *first_ptr = (int16_t *)vr1[i].value(); - int16_t *second_ptr = (int16_t *)vr2[i].value(); - int16_t result = (*first_ptr > *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - int16_t *result_ptr = (int16_t *)vd[i].value(); - *result_ptr = result; + int16_t first = *(int16_t *)(vr1.data() + i); + int16_t second = *(int16_t *)(vr2.data() + i); + int16_t result = (first > second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(int16_t *)(vd.data() + i) = result; } } else if (vtype_.vsew == 32) { for (int i = 0; i < vl_; i++) { - int32_t *first_ptr = (int32_t *)vr1[i].value(); - int32_t *second_ptr = (int32_t *)vr2[i].value(); - int32_t result = (*first_ptr > *second_ptr) ? 1 : 0; - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - int32_t *result_ptr = (int32_t *)vd[i].value(); - *result_ptr = result; + int32_t first = *(int32_t *)(vr1.data() + i); + int32_t second = *(int32_t *)(vr2.data() + i); + int32_t result = (first > second) ? 1 : 0; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(int32_t *)(vd.data() + i) = result; } } } break; } break; case 2: { - trace_inst->vs1 = rsrc0; - trace_inst->vs2 = rsrc1; - trace_inst->vd = rdest; - switch (func6) { case 24: //vmandnot { D(3, "vmandnot"); - std::vector> &vr1 = vregFile_[rsrc0]; - std::vector> &vr2 = vregFile_[rsrc1]; - std::vector> &vd = vregFile_[rdest]; + auto &vr1 = vRegFile_[rsrc0]; + auto &vr2 = vRegFile_[rsrc1]; + auto &vd = vRegFile_[rdest]; if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { - uint8_t *first_ptr = (uint8_t *)vr1[i].value(); - uint8_t *second_ptr = (uint8_t *)vr2[i].value(); - uint8_t first_value = (*first_ptr & 0x1); - uint8_t second_value = (*second_ptr & 0x1); + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); uint8_t result = (first_value & !second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint8_t *result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = result; - } + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; + } for (int i = vl_; i < VLMAX; i++) { - uint8_t *result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = 0; + *(uint8_t *)(vd.data() + i) = 0; } - } else if (vtype_.vsew == 16) { for (int i = 0; i < vl_; i++) { - uint16_t *first_ptr = (uint16_t *)vr1[i].value(); - uint16_t *second_ptr = (uint16_t *)vr2[i].value(); - uint16_t first_value = (*first_ptr & 0x1); - uint16_t second_value = (*second_ptr & 0x1); + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); uint16_t result = (first_value & !second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint16_t *result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = result; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - uint16_t *result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = 0; + *(uint16_t *)(vd.data() + i) = 0; } } else if (vtype_.vsew == 32) { for (int i = 0; i < vl_; i++) { - uint32_t *first_ptr = (uint32_t *)vr1[i].value(); - uint32_t *second_ptr = (uint32_t *)vr2[i].value(); - uint32_t first_value = (*first_ptr & 0x1); - uint32_t second_value = (*second_ptr & 0x1); + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); uint32_t result = (first_value & !second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint32_t *result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = result; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - uint32_t *result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = 0; + *(uint32_t *)(vd.data() + i) = 0; } } } break; case 25: //vmand { D(3, "vmand"); - std::vector> &vr1 = vregFile_[rsrc0]; - std::vector> &vr2 = vregFile_[rsrc1]; - std::vector> &vd = vregFile_[rdest]; + auto &vr1 = vRegFile_[rsrc0]; + auto &vr2 = vRegFile_[rsrc1]; + auto &vd = vRegFile_[rdest]; if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { - uint8_t *first_ptr = (uint8_t *)vr1[i].value(); - uint8_t *second_ptr = (uint8_t *)vr2[i].value(); - uint8_t first_value = (*first_ptr & 0x1); - uint8_t second_value = (*second_ptr & 0x1); + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); uint8_t result = (first_value & second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint8_t *result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = result; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - uint8_t *result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = 0; + *(uint8_t *)(vd.data() + i) = 0; } } else if (vtype_.vsew == 16) { for (int i = 0; i < vl_; i++) { - uint16_t *first_ptr = (uint16_t *)vr1[i].value(); - uint16_t *second_ptr = (uint16_t *)vr2[i].value(); - uint16_t first_value = (*first_ptr & 0x1); - uint16_t second_value = (*second_ptr & 0x1); + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); uint16_t result = (first_value & second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint16_t *result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = result; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; } - for (int i = vl_; i < VLMAX; i++) { - uint16_t *result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = 0; + *(uint16_t *)(vd.data() + i) = 0; } - } else if (vtype_.vsew == 32) { for (int i = 0; i < vl_; i++) { - uint32_t *first_ptr = (uint32_t *)vr1[i].value(); - uint32_t *second_ptr = (uint32_t *)vr2[i].value(); - uint32_t first_value = (*first_ptr & 0x1); - uint32_t second_value = (*second_ptr & 0x1); + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); uint32_t result = (first_value & second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint32_t *result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = result; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; } - for (int i = vl_; i < VLMAX; i++) { - uint32_t *result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = 0; + *(uint32_t *)(vd.data() + i) = 0; } } } break; case 26: //vmor { D(3, "vmor"); - std::vector> &vr1 = vregFile_[rsrc0]; - std::vector> &vr2 = vregFile_[rsrc1]; - std::vector> &vd = vregFile_[rdest]; + auto &vr1 = vRegFile_[rsrc0]; + auto &vr2 = vRegFile_[rsrc1]; + auto &vd = vRegFile_[rdest]; if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { - uint8_t *first_ptr = (uint8_t *)vr1[i].value(); - uint8_t *second_ptr = (uint8_t *)vr2[i].value(); - uint8_t first_value = (*first_ptr & 0x1); - uint8_t second_value = (*second_ptr & 0x1); + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); uint8_t result = (first_value | second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint8_t *result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = result; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - uint8_t *result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = 0; + *(uint8_t *)(vd.data() + i) = 0; } } else if (vtype_.vsew == 16) { - uint16_t *result_ptr; for (int i = 0; i < vl_; i++) { - uint16_t *first_ptr = (uint16_t *)vr1[i].value(); - uint16_t *second_ptr = (uint16_t *)vr2[i].value(); - uint16_t first_value = (*first_ptr & 0x1); - uint16_t second_value = (*second_ptr & 0x1); + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); uint16_t result = (first_value | second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = result; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = 0; + *(uint16_t *)(vd.data() + i) = 0; } } else if (vtype_.vsew == 32) { - uint32_t *result_ptr; for (int i = 0; i < vl_; i++) { - uint32_t *first_ptr = (uint32_t *)vr1[i].value(); - uint32_t *second_ptr = (uint32_t *)vr2[i].value(); - uint32_t first_value = (*first_ptr & 0x1); - uint32_t second_value = (*second_ptr & 0x1); + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); uint32_t result = (first_value | second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = result; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; } - D(3, "VLMAX: " << VLMAX); for (int i = vl_; i < VLMAX; i++) { - result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = 0; + *(uint32_t *)(vd.data() + i) = 0; } } } break; case 27: //vmxor { D(3, "vmxor"); - std::vector> &vr1 = vregFile_[rsrc0]; - std::vector> &vr2 = vregFile_[rsrc1]; - std::vector> &vd = vregFile_[rdest]; + auto &vr1 = vRegFile_[rsrc0]; + auto &vr2 = vRegFile_[rsrc1]; + auto &vd = vRegFile_[rdest]; if (vtype_.vsew == 8) { - uint8_t *result_ptr; for (int i = 0; i < vl_; i++) { - uint8_t *first_ptr = (uint8_t *)vr1[i].value(); - uint8_t *second_ptr = (uint8_t *)vr2[i].value(); - uint8_t first_value = (*first_ptr & 0x1); - uint8_t second_value = (*second_ptr & 0x1); + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); uint8_t result = (first_value ^ second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = result; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = 0; + *(uint8_t *)(vd.data() + i) = 0; } } else if (vtype_.vsew == 16) { - uint16_t *result_ptr; for (int i = 0; i < vl_; i++) { - uint16_t *first_ptr = (uint16_t *)vr1[i].value(); - uint16_t *second_ptr = (uint16_t *)vr2[i].value(); - uint16_t first_value = (*first_ptr & 0x1); - uint16_t second_value = (*second_ptr & 0x1); + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); uint16_t result = (first_value ^ second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = result; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - uint16_t *result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = 0; + *(uint16_t *)(vd.data() + i) = 0; } } else if (vtype_.vsew == 32) { - uint32_t *result_ptr; - for (int i = 0; i < vl_; i++) { - uint32_t *first_ptr = (uint32_t *)vr1[i].value(); - uint32_t *second_ptr = (uint32_t *)vr2[i].value(); - uint32_t first_value = (*first_ptr & 0x1); - uint32_t second_value = (*second_ptr & 0x1); + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); uint32_t result = (first_value ^ second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = result; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - uint32_t *result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = 0; + *(uint32_t *)(vd.data() + i) = 0; } } } break; case 28: //vmornot { D(3, "vmornot"); - std::vector> &vr1 = vregFile_[rsrc0]; - std::vector> &vr2 = vregFile_[rsrc1]; - std::vector> &vd = vregFile_[rdest]; + auto &vr1 = vRegFile_[rsrc0]; + auto &vr2 = vRegFile_[rsrc1]; + auto &vd = vRegFile_[rdest]; if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { - uint8_t *first_ptr = (uint8_t *)vr1[i].value(); - uint8_t *second_ptr = (uint8_t *)vr2[i].value(); - uint8_t first_value = (*first_ptr & 0x1); - uint8_t second_value = (*second_ptr & 0x1); + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); uint8_t result = (first_value | !second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint8_t *result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = result; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - uint8_t *result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = 0; + *(uint8_t *)(vd.data() + i) = 0; } } else if (vtype_.vsew == 16) { for (int i = 0; i < vl_; i++) { - uint16_t *first_ptr = (uint16_t *)vr1[i].value(); - uint16_t *second_ptr = (uint16_t *)vr2[i].value(); - uint16_t first_value = (*first_ptr & 0x1); - uint16_t second_value = (*second_ptr & 0x1); + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); uint16_t result = (first_value | !second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint16_t *result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = result; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - uint16_t *result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = 0; + *(uint16_t *)(vd.data() + i) = 0; } } else if (vtype_.vsew == 32) { for (int i = 0; i < vl_; i++) { - uint32_t *first_ptr = (uint32_t *)vr1[i].value(); - uint32_t *second_ptr = (uint32_t *)vr2[i].value(); - uint32_t first_value = (*first_ptr & 0x1); - uint32_t second_value = (*second_ptr & 0x1); + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); uint32_t result = (first_value | !second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint32_t *result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = result; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - uint32_t *result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = 0; + *(uint32_t *)(vd.data() + i) = 0; } } } break; case 29: //vmnand { D(3, "vmnand"); - std::vector> &vr1 = vregFile_[rsrc0]; - std::vector> &vr2 = vregFile_[rsrc1]; - std::vector> &vd = vregFile_[rdest]; + auto &vr1 = vRegFile_[rsrc0]; + auto &vr2 = vRegFile_[rsrc1]; + auto &vd = vRegFile_[rdest]; if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { - uint8_t *first_ptr = (uint8_t *)vr1[i].value(); - uint8_t *second_ptr = (uint8_t *)vr2[i].value(); - uint8_t first_value = (*first_ptr & 0x1); - uint8_t second_value = (*second_ptr & 0x1); + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); uint8_t result = !(first_value & second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint8_t *result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = result; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - uint8_t *result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = 0; + *(uint8_t *)(vd.data() + i) = 0; } } else if (vtype_.vsew == 16) { for (int i = 0; i < vl_; i++) { - uint16_t *first_ptr = (uint16_t *)vr1[i].value(); - uint16_t *second_ptr = (uint16_t *)vr2[i].value(); - uint16_t first_value = (*first_ptr & 0x1); - uint16_t second_value = (*second_ptr & 0x1); + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); uint16_t result = !(first_value & second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint16_t *result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = result; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; } - for (int i = vl_; i < VLMAX; i++) { - uint16_t *result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = 0; + *(uint16_t *)(vd.data() + i) = 0; } - } else if (vtype_.vsew == 32) { for (int i = 0; i < vl_; i++) { - uint32_t *first_ptr = (uint32_t *)vr1[i].value(); - uint32_t *second_ptr = (uint32_t *)vr2[i].value(); - uint32_t first_value = (*first_ptr & 0x1); - uint32_t second_value = (*second_ptr & 0x1); + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); uint32_t result = !(first_value & second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint32_t *result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = result; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; } - for (int i = vl_; i < VLMAX; i++) { - uint32_t *result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = 0; + *(uint32_t *)(vd.data() + i) = 0; } } } break; case 30: //vmnor { D(3, "vmnor"); - std::vector> &vr1 = vregFile_[rsrc0]; - std::vector> &vr2 = vregFile_[rsrc1]; - std::vector> &vd = vregFile_[rdest]; + auto &vr1 = vRegFile_[rsrc0]; + auto &vr2 = vRegFile_[rsrc1]; + auto &vd = vRegFile_[rdest]; if (vtype_.vsew == 8) { - uint8_t *result_ptr; - for (int i = 0; i < vl_; i++) { - uint8_t *first_ptr = (uint8_t *)vr1[i].value(); - uint8_t *second_ptr = (uint8_t *)vr2[i].value(); - uint8_t first_value = (*first_ptr & 0x1); - uint8_t second_value = (*second_ptr & 0x1); + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); uint8_t result = !(first_value | second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = result; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = 0; + *(uint8_t *)(vd.data() + i) = 0; } } else if (vtype_.vsew == 16) { for (int i = 0; i < vl_; i++) { - uint16_t *first_ptr = (uint16_t *)vr1[i].value(); - uint16_t *second_ptr = (uint16_t *)vr2[i].value(); - uint16_t first_value = (*first_ptr & 0x1); - uint16_t second_value = (*second_ptr & 0x1); + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); uint16_t result = !(first_value | second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint16_t *result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = result; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - uint16_t *result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = 0; + *(uint16_t *)(vd.data() + i) = 0; } } else if (vtype_.vsew == 32) { for (int i = 0; i < vl_; i++) { - uint32_t *first_ptr = (uint32_t *)vr1[i].value(); - uint32_t *second_ptr = (uint32_t *)vr2[i].value(); - uint32_t first_value = (*first_ptr & 0x1); - uint32_t second_value = (*second_ptr & 0x1); + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); uint32_t result = !(first_value | second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - uint32_t *result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = result; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - uint32_t *result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = 0; + *(uint32_t *)(vd.data() + i) = 0; } } } break; case 31: //vmxnor { D(3, "vmxnor"); - uint8_t *result_ptr; - - std::vector> &vr1 = vregFile_[rsrc0]; - std::vector> &vr2 = vregFile_[rsrc1]; - std::vector> &vd = vregFile_[rdest]; + auto &vr1 = vRegFile_[rsrc0]; + auto &vr2 = vRegFile_[rsrc1]; + auto &vd = vRegFile_[rdest]; if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { - uint8_t *first_ptr = (uint8_t *)vr1[i].value(); - uint8_t *second_ptr = (uint8_t *)vr2[i].value(); - uint8_t first_value = (*first_ptr & 0x1); - uint8_t second_value = (*second_ptr & 0x1); + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t first_value = (first & 0x1); + uint8_t second_value = (second & 0x1); uint8_t result = !(first_value ^ second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = result; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = 0; + *(uint8_t *)(vd.data() + i) = 0; } } else if (vtype_.vsew == 16) { - uint16_t *result_ptr; for (int i = 0; i < vl_; i++) { - uint16_t *first_ptr = (uint16_t *)vr1[i].value(); - uint16_t *second_ptr = (uint16_t *)vr2[i].value(); - uint16_t first_value = (*first_ptr & 0x1); - uint16_t second_value = (*second_ptr & 0x1); + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t first_value = (first & 0x1); + uint16_t second_value = (second & 0x1); uint16_t result = !(first_value ^ second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = result; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = 0; + *(uint16_t *)(vd.data() + i) = 0; } } else if (vtype_.vsew == 32) { - uint32_t *result_ptr; - for (int i = 0; i < vl_; i++) { - uint32_t *first_ptr = (uint32_t *)vr1[i].value(); - uint32_t *second_ptr = (uint32_t *)vr2[i].value(); - uint32_t first_value = (*first_ptr & 0x1); - uint32_t second_value = (*second_ptr & 0x1); + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t first_value = (first & 0x1); + uint32_t second_value = (second & 0x1); uint32_t result = !(first_value ^ second_value); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = result; + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = 0; + *(uint32_t *)(vd.data() + i) = 0; } } } break; case 37: //vmul { D(3, "vmul"); - uint8_t *result_ptr; - - std::vector> &vr1 = vregFile_[rsrc0]; - std::vector> &vr2 = vregFile_[rsrc1]; - std::vector> &vd = vregFile_[rdest]; + auto &vr1 = vRegFile_[rsrc0]; + auto &vr2 = vRegFile_[rsrc1]; + auto &vd = vRegFile_[rdest]; if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { - uint8_t *first_ptr = (uint8_t *)vr1[i].value(); - uint8_t *second_ptr = (uint8_t *)vr2[i].value(); - uint8_t result = (*first_ptr * *second_ptr); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = result; + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first * second); + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = 0; + *(uint8_t *)(vd.data() + i) = 0; } } else if (vtype_.vsew == 16) { - uint16_t *result_ptr; for (int i = 0; i < vl_; i++) { - uint16_t *first_ptr = (uint16_t *)vr1[i].value(); - uint16_t *second_ptr = (uint16_t *)vr2[i].value(); - uint16_t result = (*first_ptr * *second_ptr); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = result; + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first * second); + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = 0; + *(uint16_t *)(vd.data() + i) = 0; } - } else if (vtype_.vsew == 32) { - uint32_t *result_ptr; - for (int i = 0; i < vl_; i++) { - uint32_t *first_ptr = (uint32_t *)vr1[i].value(); - uint32_t *second_ptr = (uint32_t *)vr2[i].value(); - uint32_t result = (*first_ptr * *second_ptr); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = result; + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first * second); + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = 0; + *(uint32_t *)(vd.data() + i) = 0; } } } break; case 45: //vmacc { D(3, "vmacc"); - uint8_t *result_ptr; - - std::vector> &vr1 = vregFile_[rsrc0]; - std::vector> &vr2 = vregFile_[rsrc1]; - std::vector> &vd = vregFile_[rdest]; + auto &vr1 = vRegFile_[rsrc0]; + auto &vr2 = vRegFile_[rsrc1]; + auto &vd = vRegFile_[rdest]; if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { - uint8_t *first_ptr = (uint8_t *)vr1[i].value(); - uint8_t *second_ptr = (uint8_t *)vr2[i].value(); - uint8_t result = (*first_ptr * *second_ptr); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - result_ptr = (uint8_t *)vd[i].value(); - *result_ptr += result; + uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (first * second); + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) += result; } for (int i = vl_; i < VLMAX; i++) { - result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = 0; + *(uint8_t *)(vd.data() + i) = 0; } } else if (vtype_.vsew == 16) { - uint16_t *result_ptr; for (int i = 0; i < vl_; i++) { - uint16_t *first_ptr = (uint16_t *)vr1[i].value(); - uint16_t *second_ptr = (uint16_t *)vr2[i].value(); - uint16_t result = (*first_ptr * *second_ptr); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - result_ptr = (uint16_t *)vd[i].value(); - *result_ptr += result; + uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (first * second); + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) += result; } for (int i = vl_; i < VLMAX; i++) { - result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = 0; + *(uint16_t *)(vd.data() + i) = 0; } - } else if (vtype_.vsew == 32) { - uint32_t *result_ptr; - for (int i = 0; i < vl_; i++) { - uint32_t *first_ptr = (uint32_t *)vr1[i].value(); - uint32_t *second_ptr = (uint32_t *)vr2[i].value(); - uint32_t result = (*first_ptr * *second_ptr); - D(3, "Comparing " << *first_ptr << " + " << *second_ptr << " = " << result); - - result_ptr = (uint32_t *)vd[i].value(); - *result_ptr += result; + uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (first * second); + D(4, "Comparing " << first << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) += result; } for (int i = vl_; i < VLMAX; i++) { - result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = 0; + *(uint32_t *)(vd.data() + i) = 0; } } } break; @@ -1813,112 +1568,84 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { switch (func6) { case 0: { D(3, "vmadd.vx"); - uint8_t *result_ptr; - - //vector> & vr1 = vregFile_[rsrc0]; - std::vector> &vr2 = vregFile_[rsrc1]; - std::vector> &vd = vregFile_[rdest]; + //vector & vr1 = vRegFile_[rsrc0]; + auto &vr2 = vRegFile_[rsrc1]; + auto &vd = vRegFile_[rdest]; if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { - //uint8_t *first_ptr = (uint8_t *)vr1[i].value(); - uint8_t *second_ptr = (uint8_t *)vr2[i].value(); - uint8_t result = (iregs[rsrc0] + *second_ptr); - D(3, "Comparing " << iregs[rsrc0] << " + " << *second_ptr << " = " << result); - - result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = result; + //uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (iregs[rsrc0] + second); + D(4, "Comparing " << iregs[rsrc0] << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = 0; + *(uint8_t *)(vd.data() + i) = 0; } } else if (vtype_.vsew == 16) { - uint16_t *result_ptr; for (int i = 0; i < vl_; i++) { - //uint16_t *first_ptr = (uint16_t *)vr1[i].value(); - uint16_t *second_ptr = (uint16_t *)vr2[i].value(); - uint16_t result = (iregs[rsrc0] + *second_ptr); - D(3, "Comparing " << iregs[rsrc0] << " + " << *second_ptr << " = " << result); - - result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = result; + //uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (iregs[rsrc0] + second); + D(4, "Comparing " << iregs[rsrc0] << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = 0; + *(uint16_t *)(vd.data() + i) = 0; } } else if (vtype_.vsew == 32) { - uint32_t *result_ptr; - for (int i = 0; i < vl_; i++) { - //uint32_t *first_ptr = (uint32_t *)vr1[i].value(); - uint32_t *second_ptr = (uint32_t *)vr2[i].value(); - uint32_t result = (iregs[rsrc0] + *second_ptr); - D(3, "Comparing " << iregs[rsrc0] << " + " << *second_ptr << " = " << result); - - result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = result; + //uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (iregs[rsrc0] + second); + D(4, "Comparing " << iregs[rsrc0] << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = 0; + *(uint32_t *)(vd.data() + i) = 0; } } } break; case 37: //vmul.vx { D(3, "vmul.vx"); - uint8_t *result_ptr; - - //vector> & vr1 = vregFile_[rsrc0]; - std::vector> &vr2 = vregFile_[rsrc1]; - std::vector> &vd = vregFile_[rdest]; + //vector & vr1 = vRegFile_[rsrc0]; + auto &vr2 = vRegFile_[rsrc1]; + auto &vd = vRegFile_[rdest]; if (vtype_.vsew == 8) { for (int i = 0; i < vl_; i++) { - //uint8_t *first_ptr = (uint8_t *)vr1[i].value(); - uint8_t *second_ptr = (uint8_t *)vr2[i].value(); - uint8_t result = (iregs[rsrc0] * *second_ptr); - D(3, "Comparing " << iregs[rsrc0] << " + " << *second_ptr << " = " << result); - - result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = result; + //uint8_t first = *(uint8_t *)(vr1.data() + i); + uint8_t second = *(uint8_t *)(vr2.data() + i); + uint8_t result = (iregs[rsrc0] * second); + D(4, "Comparing " << iregs[rsrc0] << " + " << second << " = " << result); + *(uint8_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - result_ptr = (uint8_t *)vd[i].value(); - *result_ptr = 0; + *(uint8_t *)(vd.data() + i) = 0; } } else if (vtype_.vsew == 16) { - uint16_t *result_ptr; for (int i = 0; i < vl_; i++) { - //uint16_t *first_ptr = (uint16_t *)vr1[i].value(); - uint16_t *second_ptr = (uint16_t *)vr2[i].value(); - uint16_t result = (iregs[rsrc0] * *second_ptr); - D(3, "Comparing " << iregs[rsrc0] << " + " << *second_ptr << " = " << result); - - result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = result; + //uint16_t first = *(uint16_t *)(vr1.data() + i); + uint16_t second = *(uint16_t *)(vr2.data() + i); + uint16_t result = (iregs[rsrc0] * second); + D(4, "Comparing " << iregs[rsrc0] << " + " << second << " = " << result); + *(uint16_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - result_ptr = (uint16_t *)vd[i].value(); - *result_ptr = 0; + *(uint16_t *)(vd.data() + i) = 0; } } else if (vtype_.vsew == 32) { - uint32_t *result_ptr; - for (int i = 0; i < vl_; i++) { - //uint32_t *first_ptr = (uint32_t *)vr1[i].value(); - uint32_t *second_ptr = (uint32_t *)vr2[i].value(); - uint32_t result = (iregs[rsrc0] * *second_ptr); - D(3, "Comparing " << iregs[rsrc0] << " + " << *second_ptr << " = " << result); - - result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = result; + //uint32_t first = *(uint32_t *)(vr1.data() + i); + uint32_t second = *(uint32_t *)(vr2.data() + i); + uint32_t result = (iregs[rsrc0] * second); + D(4, "Comparing " << iregs[rsrc0] << " + " << second << " = " << result); + *(uint32_t *)(vd.data() + i) = result; } for (int i = vl_; i < VLMAX; i++) { - result_ptr = (uint32_t *)vd[i].value(); - *result_ptr = 0; + *(uint32_t *)(vd.data() + i) = 0; } } } break; @@ -1933,38 +1660,22 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { D(3, "lmul:" << vtype_.vlmul << " sew:" << vtype_.vsew << " ediv: " << vtype_.vediv << "rsrc_" << iregs[rsrc0] << "VLMAX" << VLMAX); int s0 = iregs[rsrc0]; - if (s0 <= VLMAX) { vl_ = s0; } else if (s0 < (2 * VLMAX)) { vl_ = (int)ceil((s0 * 1.0) / 2.0); - D(3, "Length:" << vl_ << ceil(s0 / 2)); } else if (s0 >= (2 * VLMAX)) { vl_ = VLMAX; - } - + } iregs[rdest] = vl_; - D(3, "VL:" << iregs[rdest]); - - Word regNum(0); - - vregFile_.clear(); - for (int j = 0; j < 32; j++) { - vregFile_.push_back(std::vector>()); - for (Word i = 0; i < (VLEN_ / instr.getVsew()); ++i) { - int *elem_ptr = (int *)malloc(instr.getVsew() / 8); - for (Word f = 0; f < (instr.getVsew() / 32); f++) - elem_ptr[f] = 0; - vregFile_[j].push_back(Reg(id_, regNum++, (char *)elem_ptr)); - } - } } break; default: { - std::cout << "default???\n" << std::flush; + std::abort(); } } } break; case (FL | VL): + ++loads_; if ( func3==0x2 ) { //std::cout << "FL_INST\n"; // rs1 is integer is register! @@ -1972,70 +1683,62 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { D(9,"something weird happen!"); Word data_read = core_->mem().read(memAddr, 0); D(3, "Memaddr"); - D_RAW(' ' << setw(8) << hex << memAddr << endl); + DPN(3, ' ' << std::setw(8) << std::hex << memAddr << std::endl); trace_inst->is_lw = true; trace_inst->mem_addresses[t] = memAddr; // //std::cout < data_read: " << data_read << "\n"; switch (func3) { case 2: // FLW fregs[rdest] = data_read & 0xFFFFFFFF; - D(3, "fpReg[rd]"); - D_RAW(' ' << setw(8) << hex << fregs[rdest] << endl); + D(4, "fpReg[rd] " << std::setw(8) << std::hex << fregs[rdest] << std::endl); break; default: std::cout << "ERROR: UNSUPPORTED FL INST\n"; exit(1); } D(3, "LOAD MEM ADDRESS: " << std::hex << memAddr); - D(3, "LOAD MEM DATA: " << std::hex << data_read); - memAccesses_.push_back(Warp::MemAccess(false, memAddr)); - } else { + } else { + int VLEN = core_->arch().vsize() * 8; D(3, "Executing vector load"); - D(3, "lmul: " << vtype_.vlmul << " VLEN:" << VLEN_ << "sew: " << vtype_.vsew); - D(3, "src: " << rsrc0 << " " << iregs[rsrc0]); - D(3, "dest" << rdest); - D(3, "width" << instr.getVlsWidth()); + D(4, "lmul: " << vtype_.vlmul << " VLEN:" << VLEN << "sew: " << vtype_.vsew); + D(4, "src: " << rsrc0 << " " << iregs[rsrc0]); + D(4, "dest" << rdest); + D(4, "width" << instr.getVlsWidth()); - std::vector> &vd = vregFile_[rdest]; + auto &vd = vRegFile_[rdest]; switch (instr.getVlsWidth()) { case 6: { //load word and unit strided (not checking for unit stride) for (int i = 0; i < vl_; i++) { Word memAddr = ((iregs[rsrc0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8); Word data_read = core_->mem().read(memAddr, 0); - D(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read); - int *result_ptr = (int *)vd[i].value(); + D(4, "Mem addr: " << std::hex << memAddr << " Data read " << data_read); + int *result_ptr = (int *)(vd.data() + i); *result_ptr = data_read; trace_inst->is_lw = true; trace_inst->mem_addresses[i] = memAddr; + + D(3, "STORE MEM ADDRESS: " << std::hex << memAddr); } - D(3, "Vector Register state ----:"); // cout << "Finished loop" << std::endl; } break; default: - std::cout << "Serious default??\n" << std::flush; + std::abort(); } break; } break; case (FS | VS): + ++stores_; if ((func3 == 0x1) || (func3 == 0x2) || (func3 == 0x3) || (func3 == 0x4)) { //std::cout << "FS_INST\n"; - ++stores_; // base is integer register! Word memAddr = iregs[rsrc0] + immsrc; - D(3, "STORE MEM ADDRESS: " << std::hex << fregs[rsrc0] << " + " << immsrc << "\n"); - D(3, "STORE MEM ADDRESS: " << std::hex << memAddr); trace_inst->is_sw = true; trace_inst->mem_addresses[t] = memAddr; - // //std::cout << "FUNC3: " << func3 << "\n"; - if ((memAddr == 0x00010000) && (t == 0)) { // ** Is this protected mem space? - unsigned num = fregs[rsrc1]; - fprintf(stderr, "%c", (char) fregs[rsrc1]); - break; - } + switch (func3) { case 1: std::cout << "ERROR: UNSUPPORTED FS INST\n"; @@ -2062,13 +1765,11 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { default: std::cout << "ERROR: UNSUPPORTED FS INST\n"; exit(1); - } + } D(3, "STORE MEM ADDRESS: " << std::hex << memAddr); - memAccesses_.push_back(Warp::MemAccess(true, memAddr)); } else { for (int i = 0; i < vl_; i++) { // cout << "iter" << std::endl; - ++stores_; Word memAddr = iregs[rsrc0] + (i * vtype_.vsew / 8); // std::cout << "STORE MEM ADDRESS *** : " << std::hex << memAddr << "\n"; @@ -2078,16 +1779,14 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { switch (instr.getVlsWidth()) { case 6: //store word and unit strided (not checking for unit stride) { - uint32_t *ptr_val = (uint32_t *)vregFile_[instr.getVs3()][i].value(); - D(3, "value: " << std::flush << (*ptr_val) << std::flush); - core_->mem().write(memAddr, *ptr_val, 0, 4); - D(3, "store: " << memAddr << " value:" << *ptr_val << std::flush); + uint32_t value = *(uint32_t *)(vRegFile_[instr.getVs3()].data() + i); + core_->mem().write(memAddr, value, 0, 4); + D(4, "store: " << memAddr << " value:" << value); } break; default: - std::cout << "ERROR: UNSUPPORTED S INST\n" << std::flush; std::abort(); - } - // cout << "Loop finished" << std::endl; + } + D(3, "STORE MEM ADDRESS: " << std::hex << memAddr); } } break; @@ -2161,28 +1860,24 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { csrs_[0x001] = csrs_[0x001] | 0x10; // set NX bit } - D(3, "fpOut: " << fpOut); + D(4, "fpOut: " << fpOut); if (fpBinIsNan(floatToBin(fpOut)) == 0) { fregs[rdest] = floatToBin(fpOut); } else { // According to risc-v spec p.64 section 11.3 // If the result is NaN, it is the canonical NaN fregs[rdest] = 0x7fc00000; - } + } } } break; - // FSGNJ.S, FSGNJN.S FSGJNX.S + // FSGNJ.S, FSGNJN.S FSGNJX.S case 0x10: { - bool fsign1 = fregs[rsrc0] & 0x80000000; - uint32_t fdata1 = fregs[rsrc0] & 0x7FFFFFFF; - bool fsign2 = fregs[rsrc1] & 0x80000000; - uint32_t fdata2 = fregs[rsrc1] & 0x7FFFFFFF; - - D(3, "fdata1 " << hex << fdata1 << endl); - D(3, "fsign2 " << hex << fsign2 << endl); + bool fsign1 = fregs[rsrc0] & 0x80000000; + uint32_t fdata1 = fregs[rsrc0] & 0x7FFFFFFF; + bool fsign2 = fregs[rsrc1] & 0x80000000; - switch(func3) { + switch (func3) { case 0: // FSGNJ.S fregs[rdest] = (fsign2 << 31) | fdata1; break; @@ -2190,7 +1885,7 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { fsign2 = !fsign2; fregs[rdest] = (fsign2 << 31) | fdata1; break; - case 2: { // FSGJNX.S + case 2: { // FSGNJX.S bool sign = fsign1 ^ fsign2; fregs[rdest] = (sign << 31) | fdata1; } break; @@ -2219,10 +1914,10 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { // handle corner case that compare +0 and -0 if (func3) { // FMAX.S - fregs[rdest] = (sr1IsZero==2)? fregs[rsrc1] : fregs[rsrc0]; + fregs[rdest] = (sr1IsZero==2) ? fregs[rsrc1] : fregs[rsrc0]; } else { // FMIM.S - fregs[rdest] = (sr1IsZero==2)? fregs[rsrc0] : fregs[rsrc1]; + fregs[rdest] = (sr1IsZero==2) ? fregs[rsrc0] : fregs[rsrc1]; } } else { float rs1 = intregToFloat(fregs[rsrc0]); @@ -2243,8 +1938,6 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { // FCVT.W.S FCVT.WU.S case 0x60: { // TODO: Need to clip result if rounded result is not representable in the destination format - // typedef uint32_t Word_u; - // typedef int32_t Word_s; // FCVT.W.S // Convert floating point to 32-bit signed integer float fpSrc = intregToFloat(fregs[rsrc0]); @@ -2285,7 +1978,7 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { } } - //show_fe_exceptions(); // once shown, it will clear corresponding bits, just for debug + //show_fe_exceptions(); // fcsr defined in riscv if (fetestexcept(FE_INEXACT)) { @@ -2416,8 +2109,7 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { // FCVT.S.W // Convert 32-bit signed integer to floating point // iregs[rsrc0] is actually a unsigned number - data = (int) iregs[rsrc0]; - D(3, "data" << data); + data = (int)iregs[rsrc0]; fregs[rdest] = floatToBin(data); } } break; @@ -2443,13 +2135,13 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { csrs_[0x001] = csrs_[0x001] | 0x10; // set NV bit } - if (fpBinIsNan(fregs[rsrc0]) || fpBinIsNan(fregs[rsrc1]) || fpBinIsNan(fregs[rsrc2])) { // if one of op is NaN + if (fpBinIsNan(fregs[rsrc0]) || fpBinIsNan(fregs[rsrc1]) || fpBinIsNan(fregs[rsrc2])) { + // if one of op is NaN // if addend is not quiet NaN, them set FCSR if ((fpBinIsNan(fregs[rsrc0])==2) | (fpBinIsNan(fregs[rsrc1])==2) | (fpBinIsNan(fregs[rsrc1])==2)) { csrs_[0x003] = csrs_[0x003] | 0x10; // set NV bit csrs_[0x001] = csrs_[0x001] | 0x10; // set NV bit } - fregs[rdest] = 0x7fc00000; // canonical(quiet) NaN } else { float rs1 = intregToFloat(fregs[rsrc0]); @@ -2507,24 +2199,22 @@ void Warp::execute(Instr &instr, trace_inst_t *trace_inst) { } break; default: - D(3, "pc: " << std::hex << (pc_ - 4)); + D(3, "PC: " << std::hex << (PC_ - 4)); D(3, "ERROR: Unsupported instruction: " << instr); std::abort(); } + + if (instr.hasRDest()) { + if (instr.is_FpDest()) { + D(3, "r" << std::dec << rdest << "=0x" << std::hex << std::hex << fregs[rdest]); + } else { + D(3, "r" << std::dec << rdest << "=0x" << std::hex << std::hex << iregs[rdest]); + } + } } - activeThreads_ = nextActiveThreads; - - // This way, if pc was set by a side effect (such as interrupt), it will - // retain its new value. - if (pcSet) { - pc_ = nextPc; - D(3, "Next PC: " << std::hex << nextPc << std::dec); - } - - if (nextActiveThreads > iRegFile_.size()) { - std::cerr << "Error: attempt to spawn " << nextActiveThreads << " threads. " - << iRegFile_.size() << " available.\n"; - abort(); + if (updatePC) { + PC_ = nextPC; + D(3, "Next PC: " << std::hex << nextPC << std::dec); } } diff --git a/simX/instr.cpp b/simX/instr.cpp deleted file mode 100644 index 9f439a31..00000000 --- a/simX/instr.cpp +++ /dev/null @@ -1,23 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#include "instr.h" - -using namespace vortex; - -void Instr::setVlmul(Word lmul) { - vlmul_ = std::pow(2, lmul); -} - -void Instr::setVsew(Word sew) { - vsew_ = std::pow(2, 3+sew); -} - -void Instr::setVediv(Word ediv) { - vediv_ = std::pow(2,ediv); -} \ No newline at end of file diff --git a/simX/instr.h b/simX/instr.h index 04a9536b..972a68b4 100644 --- a/simX/instr.h +++ b/simX/instr.h @@ -52,9 +52,12 @@ public: Instr() : opcode_(Opcode::NOP) , nRsrc_(0) - , nPsrc_(0) , hasImmSrc_(false) , hasRDest_(false) + , is_FpDest_(false) + , is_VDest_(false) + , is_FpSrc_(0) + , is_VSrc_(0) , func2_(0) , func3_(0) , func7_(0) @@ -65,20 +68,24 @@ public: /* Setters used to "craft" the instruction. */ void setOpcode(Opcode opcode) { opcode_ = opcode; } - void setDestReg(RegNum destReg) { hasRDest_ = true; rdest_ = destReg; } - void setSrcReg(RegNum srcReg) { rsrc_[nRsrc_++] = srcReg; } + void setDestReg(int destReg) { hasRDest_ = true; rdest_ = destReg; } + void setSrcReg(int srcReg) { rsrc_[nRsrc_++] = srcReg; } + void setDestFReg(int destReg) { hasRDest_ = true; is_FpDest_ = true; rdest_ = destReg; } + void setSrcFReg(int srcReg) { is_FpSrc_ |= (1 << nRsrc_); rsrc_[nRsrc_++] = srcReg; } + void setDestVReg(int destReg) { hasRDest_ = true; is_VDest_ = true; rdest_ = destReg; } + void setSrcVReg(int srcReg) { is_VSrc_ |= (1 << nRsrc_); rsrc_[nRsrc_++] = srcReg; } void setFunc3(Word func3) { func3_ = func3; } void setFunc7(Word func7) { func7_ = func7; } void setSrcImm(Word srcImm) { hasImmSrc_ = true; immsrc_ = srcImm; } - void setVsetImm(Word vset_imm) { if(vset_imm) vsetImm_ = true; else vsetImm_ = false; } + void setVsetImm(Word vset_imm) { if (vset_imm) vsetImm_ = true; else vsetImm_ = false; } void setVlsWidth(Word width) { vlsWidth_ = width; } void setVmop(Word mop) { vMop_ = mop; } void setVnf(Word nf) { vNf_ = nf; } void setVmask(Word mask) { vmask_ = mask; } void setVs3(Word vs) { vs3_ = vs; } - void setVlmul(Word lmul); - void setVsew(Word sew); - void setVediv(Word ediv); + void setVlmul(Word lmul) { vlmul_ = 1 << lmul; } + void setVsew(Word sew) { vsew_ = 1 << (3+sew); } + void setVediv(Word ediv) { vediv_ = 1 << ediv; } void setFunc6(Word func6) { func6_ = func6; } /* Getters used by encoders. */ @@ -86,10 +93,10 @@ public: Word getFunc3() const { return func3_; } Word getFunc6() const { return func6_; } Word getFunc7() const { return func7_; } - RegNum getNRSrc() const { return nRsrc_; } - RegNum getRSrc(RegNum i) const { return rsrc_[i]; } + int getNRSrc() const { return nRsrc_; } + int getRSrc(int i) const { return rsrc_[i]; } bool hasRDest() const { return hasRDest_; } - RegNum getRDest() const { return rdest_; } + int getRDest() const { return rdest_; } bool hasImm() const { return hasImmSrc_; } Word getImm() const { return immsrc_; } bool getVsetImm() const { return vsetImm_; } @@ -102,6 +109,12 @@ public: Word getVsew() const { return vsew_; } Word getVediv() const { return vediv_; } + bool is_FpDest() const { return is_FpDest_; } + bool is_FpSrc(int i) const { return (is_FpSrc_ >> i) & 0x1; } + + bool is_VDest() const { return is_VDest_; } + bool is_VSrc(int i) const { return (is_VSrc_ >> i) & 0x1; } + private: enum { @@ -110,15 +123,18 @@ private: Opcode opcode_; int nRsrc_; - int nPsrc_; bool hasImmSrc_; - bool hasRDest_; + bool hasRDest_; + bool is_FpDest_; + bool is_VDest_; + int is_FpSrc_; + int is_VSrc_; Word immsrc_; Word func2_; Word func3_; Word func7_; - RegNum rsrc_[MAX_REG_SOURCES]; - RegNum rdest_; + int rsrc_[MAX_REG_SOURCES]; + int rdest_; //Vector bool vsetImm_; diff --git a/simX/main.cpp b/simX/main.cpp index 29888696..9013c44a 100644 --- a/simX/main.cpp +++ b/simX/main.cpp @@ -15,8 +15,8 @@ using namespace vortex; int main(int argc, char **argv) { - std::string archString("rv32i"); - int num_cores(1); + std::string archString("rv32imf"); + int num_cores(NUM_CORES * NUM_CLUSTERS); int num_warps(NUM_WARPS); int num_threads(NUM_THREADS); std::string imgFileName; @@ -48,7 +48,7 @@ int main(int argc, char **argv) { ArchDef arch(archString, num_cores, num_warps, num_threads); Decoder decoder(arch); - MemoryUnit mu(4096, arch.getWordSize(), true); + MemoryUnit mu(4096, arch.wsize(), true); RAM old_ram; old_ram.loadHexImpl(imgFileName.c_str()); @@ -59,7 +59,7 @@ int main(int argc, char **argv) { std::vector> cores(num_cores); for (int i = 0; i < num_cores; ++i) { - cores[i] = std::make_shared(arch, decoder, mu); + cores[i] = std::make_shared(arch, decoder, mu, i); } bool running; diff --git a/simX/simX.cpp b/simX/simX.cpp deleted file mode 100644 index 7c2dda86..00000000 --- a/simX/simX.cpp +++ /dev/null @@ -1,78 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#include "debug.h" -#include "types.h" -#include "core.h" -#include "args.h" - -using namespace vortex; - -int main(int argc, char **argv) { - - std::string archString("rv32i"); - int num_cores(1); - int num_warps(NUM_WARPS); - int num_threads(NUM_THREADS); - std::string imgFileName; - bool showHelp(false); - bool showStats(false); - - /* Read the command line arguments. */ - CommandLineArgFlag fh("-h", "--help", "", showHelp); - CommandLineArgSetter fa("-a", "--arch", "", archString); - CommandLineArgSetter fi("-i", "--image", "", imgFileName); - CommandLineArgSetter fc("-c", "--cores", "", num_cores); - CommandLineArgSetter fw("-w", "--warps", "", num_warps); - CommandLineArgSetter ft("-t", "--threads", "", num_threads); - CommandLineArgFlag fs("-s", "--stats", "", showStats); - - CommandLineArg::readArgs(argc - 1, argv + 1); - - if (showHelp || imgFileName.empty()) { - std::cout << "Vortex emulator command line arguments:\n" - " -i, --image Program RAM image\n" - " -c, --cores Number of cores\n" - " -w, --warps Number of warps\n" - " -t, --threads Number of threads\n" - " -a, --arch Architecture string\n" - " -s, --stats Print stats on exit.\n"; - return 0; - } - - ArchDef arch(archString, num_cores, num_warps, num_threads); - - Decoder decoder(arch); - MemoryUnit mu(4096, arch.getWordSize(), true); - - RAM old_ram; - old_ram.loadHexImpl(imgFileName.c_str()); - mu.attach(old_ram, 0); - - struct stat hello; - fstat(0, &hello); - - std::vector> cores(num_cores); - for (int i = 0; i < num_cores; ++i) { - cores[i] = std::make_shared(arch, decoder, mu); - } - - bool running; - - do { - running = false; - for (int i = 0; i < num_cores; ++i) { - if (!cores[i]->running()) - continue; - running = true; - cores[i]->step(); - } - } while (running); - - return 0; -} diff --git a/simX/trace.h b/simX/trace.h index 49dff68b..0bddd70b 100644 --- a/simX/trace.h +++ b/simX/trace.h @@ -5,22 +5,27 @@ namespace vortex { struct trace_inst_t { // Warp step - bool valid_inst; - unsigned pc; + bool valid; + unsigned PC; // Core scheduler int wid; // Encoder - int rs1; - int rs2; - int rs3; - int rd; + int irs1; + int irs2; + int ird; - //Encoder - int vs1; - int vs2; - int vd; + // Floating-point + int frs1; + int frs2; + int frs3; + int frd; + + // Vector extension + int vrs1; + int vrs2; + int vrd; // Instruction execute bool is_lw; diff --git a/simX/types.h b/simX/types.h index ac855983..359d6a9d 100644 --- a/simX/types.h +++ b/simX/types.h @@ -1,20 +1,18 @@ #pragma once #include +#include #include namespace vortex { typedef uint8_t Byte; typedef uint32_t Word; -typedef uint32_t Word_u; -typedef int32_t Word_s; -typedef Word_u Addr; -typedef Word_u Size; +typedef uint32_t Addr; +typedef uint32_t Size; -typedef unsigned RegNum; -typedef unsigned ThdNum; +typedef std::bitset<32> ThreadMask; enum MemFlags { RD_USR = 1, diff --git a/simX/util.cpp b/simX/util.cpp index e7afaeba..f33ebf34 100644 --- a/simX/util.cpp +++ b/simX/util.cpp @@ -12,15 +12,15 @@ Word vortex::signExt(Word w, Size bit, Word mask) { return w; } -void vortex::wordToBytes(Byte *b, Word_u w, Size wordSize) { +void vortex::wordToBytes(Byte *b, Word w, Size wordSize) { while (wordSize--) { *(b++) = w & 0xff; w >>= 8; } } -Word_u vortex::bytesToWord(const Byte *b, Size wordSize) { - Word_u w = 0; +Word vortex::bytesToWord(const Byte *b, Size wordSize) { + Word w = 0; b += wordSize-1; while (wordSize--) { w <<= 8; @@ -29,15 +29,15 @@ Word_u vortex::bytesToWord(const Byte *b, Size wordSize) { return w; } -Word_u vortex::flagsToWord(bool r, bool w, bool x) { - Word_u word = 0; +Word vortex::flagsToWord(bool r, bool w, bool x) { + Word word = 0; if (r) word |= RD_USR; if (w) word |= WR_USR; if (x) word |= EX_USR; return word; } -void vortex::wordToFlags(bool &r, bool &w, bool &x, Word_u f) { +void vortex::wordToFlags(bool &r, bool &w, bool &x, Word f) { r = f & RD_USR; w = f & WR_USR; x = f & EX_USR; @@ -49,10 +49,10 @@ Byte vortex::readByte(const std::vector &b, Size &n) { return b[n++]; } -Word_u vortex::readWord(const std::vector &b, Size &n, Size wordSize) { +Word vortex::readWord(const std::vector &b, Size &n, Size wordSize) { if (b.size() - n < wordSize) throw std::out_of_range("out of range"); - Word_u w(0); + Word w(0); n += wordSize; // std::cout << "wordSize: " << wordSize << "\n"; for (Size i = 0; i < wordSize; i++) { diff --git a/simX/util.h b/simX/util.h index b9bef8f3..984b475f 100644 --- a/simX/util.h +++ b/simX/util.h @@ -12,13 +12,13 @@ void unused(Args&&...) {} Word signExt(Word w, Size bit, Word mask); -Word_u bytesToWord(const Byte *b, Size wordSize); -void wordToBytes(Byte *b, Word_u w, Size wordSize); -Word_u flagsToWord(bool r, bool w, bool x); -void wordToFlags(bool &r, bool &w, bool &x, Word_u f); +Word bytesToWord(const Byte *b, Size wordSize); +void wordToBytes(Byte *b, Word w, Size wordSize); +Word flagsToWord(bool r, bool w, bool x); +void wordToFlags(bool &r, bool &w, bool &x, Word f); Byte readByte(const std::vector &b, Size &n); -Word_u readWord(const std::vector &b, Size &n, Size wordSize); +Word readWord(const std::vector &b, Size &n, Size wordSize); void writeByte(std::vector &p, Size &n, Byte b); void writeWord(std::vector &p, Size &n, Size wordSize, Word w); diff --git a/simX/warp.cpp b/simX/warp.cpp index 046c6ca1..56cf9c58 100644 --- a/simX/warp.cpp +++ b/simX/warp.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include "util.h" #include "instr.h" @@ -11,87 +12,67 @@ using namespace vortex; Warp::Warp(Core *core, Word id) : id_(id) + , active_(false) , core_(core) - , pc_(0x80000000) - , shadowPc_(0) - , activeThreads_(0) - , shadowActiveThreads_(0) - , shadowIReg_(core_->arch().getNumRegs()) - , VLEN_(1024) - , spawned_(false) + , PC_(0x80000000) , steps_(0) , insts_(0) , loads_(0) , stores_(0) { - D(3, "Creating a new thread with PC: " << std::hex << pc_); - /* Build the register file. */ - Word regNum(0); - for (Word j = 0; j < core_->arch().getNumThreads(); ++j) { - iRegFile_.push_back(std::vector>(0)); - for (Word i = 0; i < core_->arch().getNumRegs(); ++i) { - iRegFile_[j].push_back(Reg(id, regNum++)); - } - bool act = false; - if (j == 0) - act = true; - tmask_.push_back(act); - shadowTmask_.push_back(act); - } + tmask_.reset(); - for (Word i = 0; i < (1 << 12); i++) { - csrs_.push_back(Reg(id, regNum++)); - } - - /* Set initial register contents. */ - iRegFile_[0][0] = (core_->arch().getNumThreads() << (core_->arch().getWordSize() * 8 / 2)) | id; + iRegFile_.resize(core_->arch().num_threads(), std::vector(core_->arch().num_regs(), 0)); + fRegFile_.resize(core_->arch().num_threads(), std::vector(core_->arch().num_regs(), 0)); + vRegFile_.resize(core_->arch().num_regs(), std::vector(core_->arch().vsize(), 0)); + csrs_.resize(core_->arch().num_csrs()); } void Warp::step(trace_inst_t *trace_inst) { + assert(tmask_.any()); + Size fetchPos(0); Size decPos; - Size wordSize(core_->arch().getWordSize()); + Size wordSize(core_->arch().wsize()); std::vector fetchBuffer(wordSize); - if (activeThreads_ == 0) - return; - ++steps_; - D(3, "current PC=0x" << std::hex << pc_); + D(3, "current PC=0x" << std::hex << PC_); - // std::cout << "pc: " << std::hex << pc << "\n"; - trace_inst->pc = pc_; + // std::cout << "PC: " << std::hex << PC << "\n"; + trace_inst->PC = PC_; /* Fetch and decode. */ - if (wordSize < sizeof(pc_)) - pc_ &= ((1ll << (wordSize * 8)) - 1); + if (wordSize < sizeof(PC_)) + PC_ &= ((1ll << (wordSize * 8)) - 1); unsigned fetchSize = 4; fetchBuffer.resize(fetchSize); - Word fetched = core_->mem().fetch(pc_ + fetchPos, 0); + Word fetched = core_->mem().fetch(PC_ + fetchPos, 0); writeWord(fetchBuffer, fetchPos, fetchSize, fetched); decPos = 0; std::shared_ptr instr = core_->decoder().decode(fetchBuffer, decPos, trace_inst); - // Update pc - pc_ += decPos; + // Update PC + PC_ += decPos; // Execute this->execute(*instr, trace_inst); // At Debug Level 3, print debug info after each instruction. - D(3, "Register state:"); - for (unsigned i = 0; i < iRegFile_[0].size(); ++i) { - D_RAW(" %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':'); - for (unsigned j = 0; j < (activeThreads_); ++j) - D_RAW(' ' << std::setfill('0') << std::setw(8) << std::hex << iRegFile_[j][i] << std::setfill(' ') << ' '); - D_RAW('(' << shadowIReg_[i] << ')' << std::endl); + D(4, "Register state:"); + for (int i = 0; i < core_->arch().num_regs(); ++i) { + DPN(4, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':'); + for (int j = 0; j < core_->arch().num_threads(); ++j) { + DPN(4, ' ' << std::setfill('0') << std::setw(8) << std::hex << iRegFile_[j][i] << std::setfill(' ') << ' '); + } + DPN(4, std::endl); } DPH(3, "Thread mask:"); - for (unsigned i = 0; i < tmask_.size(); ++i) + for (int i = 0; i < core_->arch().num_threads(); ++i) DPN(3, " " << tmask_[i]); DPN(3, "\n"); } diff --git a/simX/warp.h b/simX/warp.h index 06f79101..764e8afa 100644 --- a/simX/warp.h +++ b/simX/warp.h @@ -7,69 +7,25 @@ namespace vortex { -template -class Reg { -public: - Reg() - : value_(0), cpuId_(0), regNum_(0) {} - Reg(Word c, Word n) - : value_(0), cpuId_(c), regNum_(n) {} - Reg(Word c, Word n, T v) - : value_(v), cpuId_(c), regNum_(n) {} - - const T &value() const { - return value_; - } - - Reg &operator=(T r) { - if (regNum_) { - value_ = r; - doWrite(); - } - return *this; - } - - operator T() const { - doRead(); - return value_; - } - - void trunc(Size s) { - Word mask((~0ull >> (sizeof(Word) - s) * 8)); - value_ &= mask; - } - -private: - T value_; - Word cpuId_, regNum_; - - void doWrite() const {} - void doRead() const {} -}; - -/////////////////////////////////////////////////////////////////////////////// - struct DomStackEntry { - DomStackEntry( - unsigned p, - const std::vector>> &m, - std::vector &tm, - Word pc - ) : pc(pc) - , fallThrough(false) - , uni(false) { - for (unsigned i = 0; i < m.size(); ++i) { - tmask.push_back(!bool(m[i][p]) && tm[i]); - } - } + DomStackEntry(const ThreadMask &tmask, Word PC) + : tmask(tmask) + , PC(PC) + , fallThrough(false) + , unanimous(false) + {} - DomStackEntry(const std::vector &tmask) - : tmask(tmask), fallThrough(true), uni(false) {} + DomStackEntry(const ThreadMask &tmask) + : tmask(tmask) + , PC(0) + , fallThrough(true) + , unanimous(false) + {} - std::vector tmask; - Word pc; + ThreadMask tmask; + Word PC; bool fallThrough; - bool uni; + bool unanimous; }; struct vtype { @@ -86,11 +42,13 @@ class trace_inst_t; class Warp { public: Warp(Core *core, Word id = 0); - - void step(trace_inst_t *); - bool running() const { - return (activeThreads_ != 0); + bool active() const { + return tmask_.any(); + } + + std::size_t getActiveThreads() const { + return tmask_.count(); } void printStats() const; @@ -103,68 +61,40 @@ public: return id_; } - Word get_pc() const { - return pc_; + Word getPC() const { + return PC_; } - void set_pc(Word pc) { - pc_ = pc; - } - - void setActiveThreads(Size activeThreads) { - activeThreads_ = activeThreads; - } - - Size getActiveThreads() const { - return activeThreads_; - } - - void setSpawned(bool spawned) { - spawned_ = spawned; + void setPC(Word PC) { + PC_ = PC; } void setTmask(size_t index, bool value) { tmask_[index] = value; } + void step(trace_inst_t *); + private: void execute(Instr &instr, trace_inst_t *); - - struct MemAccess { - MemAccess(bool w, Word a) - : wr(w), addr(a) {} - bool wr; - Word addr; - }; - std::vector memAccesses_; - Word id_; + bool active_; Core *core_; - Word pc_; - Word shadowPc_; - Size activeThreads_; - Size shadowActiveThreads_; - std::vector>> iRegFile_; - std::vector>> fRegFile_; - std::vector> csrs_; - - std::vector tmask_; - std::vector shadowTmask_; + + Word PC_; + ThreadMask tmask_; + + std::vector> iRegFile_; + std::vector> fRegFile_; + std::vector> vRegFile_; + std::vector csrs_; std::stack domStack_; - std::vector shadowIReg_; - std::vector shadowFReg_; - - struct vtype vtype_; // both of them are XLEN WIDE - int vl_; // both of them are XLEN WIDE - Word VLEN_; // total vector length - - std::vector>> vregFile_; // 32 vector registers - - bool spawned_; - + struct vtype vtype_; + int vl_; + unsigned long steps_; unsigned long insts_; unsigned long loads_;