Driver: per-core performance counters and CSR access for the rtlsim backend.

What this part of the patch does:
  * vx_get_perf() gains a core_id argument and reads the cycle / instret
    CSRs of that core instead of always reading core 0.
  * The OPAE and rtlsim drivers print per-core and aggregate perf stats on
    device teardown when built with -DDUMP_PERF_STATS (enabled in both
    Makefiles).
  * The rtlsim driver implements vx_csr_set() / vx_csr_get() by forwarding
    to the simulator; simx and stub only rename the commented parameters.

Review fixes folded into the added lines (hunk line counts are unchanged):
  * vx_get_perf() is declared (device, core_id, cycles, instrs), but every
    stat dump passed &instrs, &cycles. That swapped the two counters and
    inverted the printed IPC. The calls now pass &cycles, &instrs.
  * rtlsim vx_csr_get() now rejects a null `value` pointer, matching the
    OPAE implementation.

NOTE(review): the submitted text had lost its line breaks. Indentation and
the position of blank context lines below are reconstructed to satisfy the
hunk line counts - confirm against the tree before applying. The `index`
lines are left as submitted, so the post-image hashes of the two files that
were fixed (opae/vortex.cpp, rtlsim/vortex.cpp) are stale.

diff --git a/driver/common/vx_utils.cpp b/driver/common/vx_utils.cpp
index 4a4eb62d..0e590415 100644
--- a/driver/common/vx_utils.cpp
+++ b/driver/common/vx_utils.cpp
@@ -91,22 +91,22 @@ extern int vx_upload_kernel_file(vx_device_h device, const char* filename) {
     return err;
 }
 
-extern int vx_get_perf(vx_device_h device, size_t* cycles, size_t* instrs) {
+extern int vx_get_perf(vx_device_h device, int core_id, size_t* cycles, size_t* instrs) {
     int ret = 0;
 
     unsigned value;
 
     if (cycles) {
-        ret |= vx_csr_get(device, 0, CSR_CYCLE_H, &value);
+        ret |= vx_csr_get(device, core_id, CSR_CYCLE_H, &value);
         *cycles = value;
-        ret |= vx_csr_get(device, 0, CSR_CYCLE, &value);
+        ret |= vx_csr_get(device, core_id, CSR_CYCLE, &value);
         *cycles = (*cycles << 32) | value;
     }
 
     if (instrs) {
-        ret |= vx_csr_get(device, 0, CSR_INSTRET_H, &value);
+        ret |= vx_csr_get(device, core_id, CSR_INSTRET_H, &value);
         *instrs = value;
-        ret |= vx_csr_get(device, 0, CSR_INSTRET, &value);
+        ret |= vx_csr_get(device, core_id, CSR_INSTRET, &value);
         *instrs = (*instrs << 32) | value;
     }
 
diff --git a/driver/include/vortex.h b/driver/include/vortex.h
index ecdd0542..e5aa9eb6 100644
--- a/driver/include/vortex.h
+++ b/driver/include/vortex.h
@@ -58,10 +58,10 @@ int vx_start(vx_device_h hdevice);
 int vx_ready_wait(vx_device_h hdevice, long long timeout);
 
 // set device constant registers
-int vx_csr_set(vx_device_h hdevice, int core, int address, unsigned value);
+int vx_csr_set(vx_device_h hdevice, int core_id, int addr, unsigned value);
 
 // get device constant registers
-int vx_csr_get(vx_device_h hdevice, int core, int address, unsigned* value);
+int vx_csr_get(vx_device_h hdevice, int core_id, int addr, unsigned* value);
 
 ////////////////////////////// UTILITY FUNCIONS ///////////////////////////////
 
@@ -72,7 +72,7 @@ int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_t size)
 int vx_upload_kernel_file(vx_device_h device, const char* filename);
 
 // get performance counters
-int vx_get_perf(vx_device_h device, size_t* cycles, size_t* instrs);
+int vx_get_perf(vx_device_h device, int core_id, size_t* cycles, size_t* instrs);
 
 #ifdef __cplusplus
 }
diff --git a/driver/opae/Makefile b/driver/opae/Makefile
index b13b897d..9946470f 100644
--- a/driver/opae/Makefile
+++ b/driver/opae/Makefile
@@ -17,6 +17,9 @@ CXXFLAGS +=-fstack-protector
 # Position independent code
 CXXFLAGS += -fPIC
 
+# Dump perf stats
+CXXFLAGS += -DDUMP_PERF_STATS
+
 # Enable scope analyzer
 #CXXFLAGS += -DSCOPE
 
diff --git a/driver/opae/vortex.cpp b/driver/opae/vortex.cpp
index c8bf410b..e66b9afe 100755
--- a/driver/opae/vortex.cpp
+++ b/driver/opae/vortex.cpp
@@ -211,14 +211,29 @@ extern int vx_dev_close(vx_device_h hdevice) {
     vx_scope_stop(device->fpga, 0);
 #endif
 
-    {
-        // Dump perf stats
+#ifdef DUMP_PERF_STATS
+    // Dump perf stats
+    if (device->num_cores > 1) {
+        uint64_t total_instrs = 0, total_cycles = 0;
+        for (unsigned core_id = 0; core_id < device->num_cores; ++core_id) {
+            uint64_t instrs, cycles;
+            int ret = vx_get_perf(hdevice, core_id, &cycles, &instrs);
+            assert(ret == 0);
+            float IPC = (float)(double(instrs) / double(cycles));
+            fprintf(stdout, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs, cycles, IPC);
+            total_instrs += instrs;
+            total_cycles = std::max(total_cycles, cycles);
+        }
+        float IPC = (float)(double(total_instrs) / double(total_cycles));
+        fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, total_cycles, IPC);
+    } else {
         uint64_t instrs, cycles;
-        int ret = vx_get_perf(hdevice, &instrs, &cycles);
+        int ret = vx_get_perf(hdevice, 0, &cycles, &instrs);
         float IPC = (float)(double(instrs) / double(cycles));
-        fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
         assert(ret == 0);
+        fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
     }
+#endif
 
     fpgaClose(device->fpga);
 
@@ -480,7 +495,7 @@ extern int vx_start(vx_device_h hdevice) {
 }
 
 // set device constant registers
-extern int vx_csr_set(vx_device_h hdevice, int core, int address, unsigned value) {
+extern int vx_csr_set(vx_device_h hdevice, int core_id, int addr, unsigned value) {
     if (nullptr == hdevice)
         return -1;
 
@@ -491,8 +506,8 @@ extern int vx_csr_set(vx_device_h hdevice, int core, int address, unsigned value
         return -1;
 
     // write CSR value
-    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CORE, core));
-    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_ADDR, address));
+    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CORE, core_id));
+    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_ADDR, addr));
     CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA, value));
     CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_CSR_WRITE));
 
@@ -500,7 +515,7 @@ extern int vx_csr_set(vx_device_h hdevice, int core, int address, unsigned value
 }
 
 // get device constant registers
-extern int vx_csr_get(vx_device_h hdevice, int core, int address, unsigned* value) {
+extern int vx_csr_get(vx_device_h hdevice, int core_id, int addr, unsigned* value) {
     if (nullptr == hdevice || nullptr == value)
         return -1;
 
@@ -512,8 +527,8 @@ extern int vx_csr_get(vx_device_h hdevice, int core, int address, unsigned* valu
 
 
     // write CSR value
-    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CORE, core));
-    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_ADDR, address));
+    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CORE, core_id));
+    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_ADDR, addr));
     CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_CSR_READ));
 
     // Ensure ready for new command
diff --git a/driver/rtlsim/Makefile b/driver/rtlsim/Makefile
index ea4f3c22..d3d0df94 100644
--- a/driver/rtlsim/Makefile
+++ b/driver/rtlsim/Makefile
@@ -28,6 +28,8 @@ CFLAGS += -fPIC
 
 CFLAGS += -DUSE_RTLSIM $(CONFIGS)
 
+CFLAGS += -DDUMP_PERF_STATS
+
 LDFLAGS += -shared -pthread
 # LDFLAGS += -dynamiclib -pthread
 
diff --git a/driver/rtlsim/vortex.cpp b/driver/rtlsim/vortex.cpp
index 80167966..03d6c294 100644
--- a/driver/rtlsim/vortex.cpp
+++ b/driver/rtlsim/vortex.cpp
@@ -69,7 +69,28 @@ public:
     }
 
     ~vx_device() {
-        simulator_.print_stats(std::cout);
+    #ifdef DUMP_PERF_STATS
+        unsigned num_cores;
+        this->get_csr(0, CSR_NC, &num_cores);
+        if (num_cores > 1) {
+            uint64_t total_instrs = 0, total_cycles = 0;
+            for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
+                uint64_t instrs, cycles;
+                vx_get_perf(this, core_id, &cycles, &instrs);
+                float IPC = (float)(double(instrs) / double(cycles));
+                fprintf(stdout, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs, cycles, IPC);
+                total_instrs += instrs;
+                total_cycles = std::max(total_cycles, cycles);
+            }
+            float IPC = (float)(double(total_instrs) / double(total_cycles));
+            fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, total_cycles, IPC);
+        } else {
+            uint64_t instrs, cycles;
+            vx_get_perf(this, 0, &cycles, &instrs);
+            float IPC = (float)(double(instrs) / double(cycles));
+            fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
+        }
+    #endif
         if (future_.valid()) {
             future_.wait();
         }
@@ -152,6 +173,28 @@ public:
         return 0;
     }
 
+    int set_csr(int core_id, int addr, unsigned value) {
+        if (future_.valid()) {
+            future_.wait(); // ensure prior run completed
+        }
+        simulator_.set_csr(core_id, addr, value);
+        while (simulator_.is_busy()) {
+            simulator_.step();
+        };
+        return 0;
+    }
+
+    int get_csr(int core_id, int addr, unsigned *value) {
+        if (future_.valid()) {
+            future_.wait(); // ensure prior run completed
+        }
+        simulator_.get_csr(core_id, addr, value);
+        while (simulator_.is_busy()) {
+            simulator_.step();
+        };
+        return 0;
+    }
+
 private:
 
     size_t mem_allocation_;
@@ -324,10 +367,20 @@ extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
     return device->wait(timeout);
 }
 
-extern int vx_csr_set(vx_device_h /*hdevice*/, int /*core*/, int /*address*/, unsigned /*value*/) {
-    return -1;
+extern int vx_csr_set(vx_device_h hdevice, int core_id, int addr, unsigned value) {
+    if (nullptr == hdevice)
+        return -1;
+
+    vx_device *device = ((vx_device*)hdevice);
+
+    return device->set_csr(core_id, addr, value);
 }
 
-extern int vx_csr_get(vx_device_h /*hdevice*/, int /*core*/, int /*address*/, unsigned* /*value*/) {
-    return -1;
+extern int vx_csr_get(vx_device_h hdevice, int core_id, int addr, unsigned* value) {
+    if (nullptr == hdevice || nullptr == value)
+        return -1;
+
+    vx_device *device = ((vx_device*)hdevice);
+
+    return device->get_csr(core_id, addr, value);
 }
\ No newline at end of file
diff --git a/driver/simx/vortex.cpp b/driver/simx/vortex.cpp
index 292c410f..b499803d 100644
--- a/driver/simx/vortex.cpp
+++ b/driver/simx/vortex.cpp
@@ -358,10 +358,10 @@ extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
     return device->wait(timeout);
 }
 
-extern int vx_csr_set(vx_device_h /*hdevice*/, int /*core*/, int /*address*/, unsigned /*value*/) {
+extern int vx_csr_set(vx_device_h /*hdevice*/, int /*core_id*/, int /*addr*/, unsigned /*value*/) {
     return -1;
 }
 
-extern int vx_csr_get(vx_device_h /*hdevice*/, int /*core*/, int /*address*/, unsigned* /*value*/) {
+extern int vx_csr_get(vx_device_h /*hdevice*/, int /*core_id*/, int /*addr*/, unsigned* /*value*/) {
     return -1;
 }
\ No newline at end of file
diff --git a/driver/stub/vortex.cpp b/driver/stub/vortex.cpp
index 007bce0e..f4a101f0 100644
--- a/driver/stub/vortex.cpp
+++ b/driver/stub/vortex.cpp
@@ -48,10 +48,10 @@ extern int vx_ready_wait(vx_device_h /*hdevice*/, long long /*timeout*/) {
     return -1;
 }
 
-extern int vx_csr_set(vx_device_h /*hdevice*/, int /*core*/, int /*address*/, unsigned /*value*/) {
+extern int vx_csr_set(vx_device_h /*hdevice*/, int /*core_id*/, int /*addr*/, unsigned /*value*/) {
     return -1;
 }
 
-extern int vx_csr_get(vx_device_h /*hdevice*/, int /*core*/, int /*address*/, unsigned* /*value*/) {
+extern int vx_csr_get(vx_device_h /*hdevice*/, int /*core_id*/, int /*addr*/, unsigned* /*value*/) {
     return -1;
 }
\ No newline at end of file
Simulator: drive the CSR I/O bus from set_csr() / get_csr().

What this part of the patch does:
  * Adds Simulator::set_csr() / get_csr(), which raise a request on the
    csr_io_* ports and mark a CSR transaction as active.
  * Rewrites eval_csr_bus() to complete that transaction: a write ends when
    csr_io_req_ready is seen, a read ends when csr_io_rsp_valid is seen and
    the response data is stored through csr_rsp_value_.
  * is_busy() also reports true while a CSR transaction is active.
  * flush_caches() sets snp_req_active_ after the snoop request is set up.

Review fixes folded into the added lines (hunk line counts are unchanged):
  * eval_csr_bus() cleared vortex_->snp_req_valid (the snoop bus) when a CSR
    transaction completed; it now clears vortex_->csr_io_req_valid, the
    signal that set_csr() / get_csr() actually raised.
  * csr_rsp_value_ is declared `unsigned*`, matching the `unsigned *value`
    parameter of get_csr() that is assigned to it.

NOTE(review): the submitted text had lost its line breaks. Indentation and
the position of blank context lines below are reconstructed to satisfy the
hunk line counts - confirm against the tree before applying. The `index`
lines are left as submitted, so the post-image hashes are stale.

diff --git a/hw/simulate/simulator.cpp b/hw/simulate/simulator.cpp
index b4b1200e..99a5599f 100644
--- a/hw/simulate/simulator.cpp
+++ b/hw/simulate/simulator.cpp
@@ -22,6 +22,7 @@ Simulator::Simulator() {
 
   dram_rsp_active_ = false;
   snp_req_active_ = false;
+  csr_req_active_ = false;
 
 #ifdef VCD_OUTPUT
   Verilated::traceEverOn(true);
@@ -163,15 +164,6 @@ void Simulator::eval_io_bus() {
   vortex_->io_rsp_valid = 0;
 }
 
-void Simulator::eval_csr_bus() {
-  vortex_->csr_io_req_valid = 0;
-  vortex_->csr_io_req_coreid = 0;
-  vortex_->csr_io_req_addr = 0;
-  vortex_->csr_io_req_rw = 0;
-  vortex_->csr_io_req_data = 0;
-  vortex_->csr_io_rsp_ready = 1;
-}
-
 void Simulator::eval_snp_bus() {
   if (snp_req_active_) {
     if (vortex_->snp_rsp_valid) {
@@ -204,6 +196,27 @@ void Simulator::eval_snp_bus() {
   }
 }
 
+void Simulator::eval_csr_bus() {
+  if (csr_req_active_) {
+    if (vortex_->csr_io_req_rw) {
+      if (vortex_->csr_io_req_ready) {
+        vortex_->csr_io_req_valid = 0;
+        csr_req_active_ = false;
+      }
+    } else {
+      if (vortex_->csr_io_rsp_valid) {
+        *csr_rsp_value_ = vortex_->csr_io_rsp_data;
+        vortex_->csr_io_req_valid = 0;
+        vortex_->csr_io_rsp_ready = 0;
+        csr_req_active_ = false;
+      }
+    }
+  } else {
+    vortex_->csr_io_req_valid = 0;
+    vortex_->csr_io_rsp_ready = 0;
+  }
+}
+
 void Simulator::wait(uint32_t cycles) {
   for (int i = 0; i < cycles; ++i) {
     this->step();
@@ -211,7 +224,9 @@ void Simulator::wait(uint32_t cycles) {
 }
 
 bool Simulator::is_busy() const {
-  return vortex_->busy || snp_req_active_;
+  return vortex_->busy
+      || snp_req_active_
+      || csr_req_active_;
 }
 
 void Simulator::flush_caches(uint32_t mem_addr, uint32_t size) {
@@ -221,22 +236,52 @@ void Simulator::flush_caches(uint32_t mem_addr, uint32_t size) {
   if (0 == size)
     return;
 
-  snp_req_active_ = true;
-  snp_req_size_ = (size + GLOBAL_BLOCK_SIZE - 1) / GLOBAL_BLOCK_SIZE;
-
   vortex_->snp_req_addr = mem_addr / GLOBAL_BLOCK_SIZE;
   vortex_->snp_req_tag = 0;
   vortex_->snp_req_valid = 1;
   vortex_->snp_rsp_ready = 1;
 
+  snp_req_size_ = (size + GLOBAL_BLOCK_SIZE - 1) / GLOBAL_BLOCK_SIZE;
   --snp_req_size_;
   pending_snp_reqs_ = 1;
+
+  snp_req_active_ = true;
 
 #ifdef DBG_PRINT_CACHE_SNP
   std::cout << timestamp << ": [sim] snp req: addr=" << std::hex << vortex_->snp_req_addr << std::dec << " tag=" << vortex_->snp_req_tag << " remain=" << snp_req_size_ << std::endl;
 #endif
 }
 
+void Simulator::set_csr(int core_id, int addr, unsigned value) {
+#ifndef NDEBUG
+  std::cout << timestamp << ": [sim] set_csr()" << std::endl;
+#endif
+
+  vortex_->csr_io_req_valid = 1;
+  vortex_->csr_io_req_coreid = core_id;
+  vortex_->csr_io_req_addr = addr;
+  vortex_->csr_io_req_rw = 1;
+  vortex_->csr_io_req_data = value;
+  vortex_->csr_io_rsp_ready = 0;
+
+  csr_req_active_ = true;
+}
+
+void Simulator::get_csr(int core_id, int addr, unsigned *value) {
+#ifndef NDEBUG
+  std::cout << timestamp << ": [sim] get_csr()" << std::endl;
+#endif
+
+  vortex_->csr_io_req_valid = 1;
+  vortex_->csr_io_req_coreid = core_id;
+  vortex_->csr_io_req_addr = addr;
+  vortex_->csr_io_req_rw = 0;
+  vortex_->csr_io_rsp_ready = 1;
+
+  csr_rsp_value_ = value;
+  csr_req_active_ = true;
+}
+
 void Simulator::run() {
 #ifndef NDEBUG
   std::cout << timestamp << ": [sim] run()" << std::endl;
diff --git a/hw/simulate/simulator.h b/hw/simulate/simulator.h
index e104b66b..a16218bb 100644
--- a/hw/simulate/simulator.h
+++ b/hw/simulate/simulator.h
@@ -31,6 +31,8 @@ public:
   Simulator();
   virtual ~Simulator();
 
+  void attach_ram(RAM* ram);
+
   void load_bin(const char* program_file);
   void load_ihex(const char* program_file);
 
@@ -39,12 +41,14 @@ public:
   void reset();
   void step();
   void wait(uint32_t cycles);
+
   void flush_caches(uint32_t mem_addr, uint32_t size);
-
-  void attach_ram(RAM* ram);
+  void set_csr(int core_id, int addr, unsigned value);
+  void get_csr(int core_id, int addr, unsigned *value);
 
   void run();
   int get_last_wb_value(int reg) const;
+
   void print_stats(std::ostream& out);
 
 private:
@@ -60,8 +64,11 @@ private:
 
   int dram_rsp_active_;
   bool snp_req_active_;
+  bool csr_req_active_;
+
   uint32_t snp_req_size_;
   uint32_t pending_snp_reqs_;
+  unsigned* csr_rsp_value_;
 
   RAM *ram_;
   VVortex *vortex_;