#include #include #include #include #include #include #include #include #include #include #include #define CACHE_LINESIZE 64 #define ALLOC_BASE_ADDR 0x10000000 #define LOCAL_MEM_SIZE 0xffffffff /////////////////////////////////////////////////////////////////////////////// inline size_t align_size(size_t size, size_t alignment) { assert(0 == (alignment & (alignment - 1))); return (size + alignment - 1) & ~(alignment - 1); } /////////////////////////////////////////////////////////////////////////////// class vx_device; class vx_buffer { public: vx_buffer(size_t size, vx_device* device) : size_(size) , device_(device) { auto aligned_asize = align_size(size, CACHE_LINESIZE); data_ = malloc(aligned_asize); } ~vx_buffer() { if (data_) { free(data_); } } void* data() const { return data_; } size_t size() const { return size_; } vx_device* device() const { return device_; } private: size_t size_; vx_device* device_; void* data_; }; /////////////////////////////////////////////////////////////////////////////// class vx_device { public: vx_device() { mem_allocation_ = ALLOC_BASE_ADDR; } ~vx_device() { if (future_.valid()) { future_.wait(); } } int alloc_local_mem(size_t size, size_t* dev_maddr) { auto dev_mem_size = LOCAL_MEM_SIZE; size_t asize = align_size(size, CACHE_LINESIZE); if (mem_allocation_ + asize > dev_mem_size) return -1; *dev_maddr = mem_allocation_; mem_allocation_ += asize; return 0; } int upload(void* src, size_t dest_addr, size_t size, size_t src_offset) { size_t asize = align_size(size, CACHE_LINESIZE); if (dest_addr + asize > ram_.size()) return -1; /*printf("VXDRV: upload %d bytes to 0x%x\n", size, dest_addr); for (int i = 0; i < size; i += 4) { printf("mem-write: 0x%x <- 0x%x\n", uint32_t(dest_addr + i), *(uint32_t*)((uint8_t*)src + src_offset + i)); }*/ ram_.write(dest_addr, asize, (uint8_t*)src + src_offset); return 0; } int download(const void* dest, size_t src_addr, size_t size, size_t dest_offset) { size_t asize = align_size(size, CACHE_LINESIZE); if (src_addr + asize > ram_.size()) return -1; ram_.read(src_addr, asize, (uint8_t*)dest + dest_offset); /*printf("VXDRV: download %d bytes from 0x%x\n", size, src_addr); for (int i = 0; i < size; i += 4) { printf("mem-read: 0x%x -> 0x%x\n", uint32_t(src_addr + i), *(uint32_t*)((uint8_t*)dest + dest_offset + i)); }*/ return 0; } int start() { if (future_.valid()) { future_.wait(); // ensure prior run completed } simulator_.attach_ram(&ram_); future_ = std::async(std::launch::async, [&]{ simulator_.reset(); while (simulator_.is_busy()) { simulator_.step(); } }); return 0; } int wait(long long timeout) { if (!future_.valid()) return 0; auto timeout_sec = (timeout < 0) ? timeout : (timeout / 1000); std::chrono::seconds wait_time(1); for (;;) { auto status = future_.wait_for(wait_time); // wait for 1 sec and check status if (status == std::future_status::ready || 0 == timeout_sec--) break; } return 0; } int flush_caches(size_t dev_maddr, size_t size) { if (future_.valid()) { future_.wait(); // ensure prior run completed } simulator_.attach_ram(&ram_); simulator_.flush_caches(dev_maddr, size); while (simulator_.snp_req_active()) { simulator_.step(); }; simulator_.attach_ram(NULL); return 0; } int set_csr(int core_id, int addr, unsigned value) { if (future_.valid()) { future_.wait(); // ensure prior run completed } simulator_.set_csr(core_id, addr, value); while (simulator_.csr_req_active()) { simulator_.step(); }; return 0; } int get_csr(int core_id, int addr, unsigned *value) { if (future_.valid()) { future_.wait(); // ensure prior run completed } simulator_.get_csr(core_id, addr, value); while (simulator_.csr_req_active()) { simulator_.step(); }; return 0; } private: size_t mem_allocation_; RAM ram_; Simulator simulator_; std::future future_; }; /////////////////////////////////////////////////////////////////////////////// extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) { if (nullptr == hdevice) return -1; switch (caps_id) { case VX_CAPS_VERSION: *value = IMPLEMENTATION_ID; break; case VX_CAPS_MAX_CORES: *value = NUM_CORES; break; case VX_CAPS_MAX_WARPS: *value = NUM_WARPS; break; case VX_CAPS_MAX_THREADS: *value = NUM_THREADS; break; case VX_CAPS_CACHE_LINESIZE: *value = CACHE_LINESIZE; break; case VX_CAPS_LOCAL_MEM_SIZE: *value = 0xffffffff; break; case VX_CAPS_ALLOC_BASE_ADDR: *value = 0x10000000; break; case VX_CAPS_KERNEL_BASE_ADDR: *value = STARTUP_ADDR; break; default: std::cout << "invalid caps id: " << caps_id << std::endl; std::abort(); return -1; } return 0; } extern int vx_dev_open(vx_device_h* hdevice) { if (nullptr == hdevice) return -1; *hdevice = new vx_device(); return 0; } extern int vx_dev_close(vx_device_h hdevice) { if (nullptr == hdevice) return -1; vx_device *device = ((vx_device*)hdevice); #ifdef DUMP_PERF_STATS unsigned num_cores; vx_csr_get(hdevice, 0, CSR_NC, &num_cores); if (num_cores > 1) { uint64_t total_instrs = 0, total_cycles = 0; // ------------------------- #ifdef PERF_ENABLE // PERF: cache uint64_t total_r = 0; uint64_t total_w = 0; uint64_t dram_st = 0; uint64_t dram_lat = 0; uint64_t dram_rsp = 0; uint64_t msrq_st = 0; uint64_t total_st = 0; uint64_t r_miss = 0; uint64_t w_miss = 0; uint64_t core_rsp_st = 0; uint64_t total_evict = 0; // PERF: pipeline stalls uint64_t lsu_stall = 0; uint64_t fpu_stall = 0; uint64_t mul_stall = 0; uint64_t csr_stall = 0; uint64_t alu_stall = 0; uint64_t gpu_stall = 0; uint64_t ibuffer_stall = 0; uint64_t scoreboard_stall = 0; uint64_t icache_stall = 0; #endif // ------------------------- for (unsigned core_id = 0; core_id < num_cores; ++core_id) { uint64_t instrs, cycles; vx_get_perf(hdevice, core_id, &instrs, &cycles); float IPC = (float)(double(instrs) / double(cycles)); fprintf(stdout, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs, cycles, IPC); total_instrs += instrs; total_cycles = std::max(total_cycles, cycles); #ifdef PERF_ENABLE // PERF: cache // total_read uint64_t total_r_per_core; vx_csr_get_l(hdevice, core_id, CSR_TOTAL_R, CSR_TOTAL_R_H, &total_r_per_core); fprintf(stdout, "PERF: \t\ttotal_reads_per_core=%ld\n", total_r_per_core); total_r += total_r_per_core; // total_write uint64_t total_w_per_core; vx_csr_get_l(hdevice, core_id, CSR_TOTAL_W, CSR_TOTAL_W_H, &total_w_per_core); fprintf(stdout, "PERF: \t\ttotal_writes_per_core=%ld\n", total_w_per_core); total_w += total_w_per_core; // dram_stall uint64_t dram_st_per_core; vx_csr_get_l(hdevice, core_id, CSR_DRAM_ST, CSR_DRAM_ST_H, &dram_st_per_core); fprintf(stdout, "PERF: \t\tdram_stalls_per_core=%ld\n", dram_st_per_core); dram_st += dram_st_per_core; // dram_latency uint64_t dram_lat_per_core, dram_rsp_per_core; vx_csr_get_l(hdevice, core_id, CSR_DRAM_LAT, CSR_DRAM_LAT_H, &dram_lat_per_core); vx_csr_get_l(hdevice, core_id, CSR_DRAM_RSP, CSR_DRAM_RSP_H, &dram_rsp_per_core); fprintf(stdout, "PERF: \t\tdram_latency_per_core=%ld\n", dram_lat_per_core); fprintf(stdout, "PERF: \t\tdram_response_per_core=%ld\n", dram_rsp_per_core); dram_lat += dram_lat_per_core; dram_rsp += dram_rsp_per_core; float dram_lat_per_rsp_per_core = (float)(double(dram_lat_per_core) / double(dram_rsp_per_core)); fprintf(stdout, "PERF: \t\tdram_latency_per_response_per_core=%f\n", dram_lat_per_rsp_per_core); // miss_reserve_queue_stall uint64_t msrq_st_per_core; vx_csr_get_l(hdevice, core_id, CSR_MSRQ_ST, CSR_MSRQ_ST_H, &msrq_st_per_core); fprintf(stdout, "PERF: \t\tmsrq_stalls_per_core=%ld\n", msrq_st_per_core); msrq_st += msrq_st_per_core; // total_stall uint64_t total_st_per_core; vx_csr_get_l(hdevice, core_id, CSR_TOTAL_ST, CSR_TOTAL_ST_H, &total_st_per_core); fprintf(stdout, "PERF: \t\ttotal_stalls_per_core=%ld\n", total_st_per_core); total_st += total_st_per_core; // read_miss uint64_t r_miss_per_core; vx_csr_get_l(hdevice, core_id, CSR_R_MISS, CSR_R_MISS_H, &r_miss_per_core); fprintf(stdout, "PERF: \t\tread_misses_per_core=%ld\n", r_miss_per_core); r_miss += r_miss_per_core; // write_miss uint64_t w_miss_per_core; vx_csr_get_l(hdevice, core_id, CSR_W_MISS, CSR_W_MISS_H, &w_miss_per_core); fprintf(stdout, "PERF: \t\twrite_misses_per_core=%ld\n", w_miss_per_core); w_miss += w_miss_per_core; // core_rsp_stalls uint64_t core_rsp_st_per_core; vx_csr_get_l(hdevice, core_id, CSR_CORE_RSP_ST, CSR_CORE_RSP_ST_H, &core_rsp_st_per_core); fprintf(stdout, "PERF: \t\tcore_rsp_stalls_per_core=%ld\n", core_rsp_st_per_core); core_rsp_st += core_rsp_st_per_core; // total_evictions uint64_t total_evict_per_core; vx_csr_get_l(hdevice, core_id, CSR_TOTAL_EV, CSR_TOTAL_EV_H, &total_evict_per_core); fprintf(stdout, "PERF: \t\ttotal_evictions_per_core=%ld\n", total_evict_per_core); total_evict += total_evict_per_core; // PERF: pipeline stall // lsu_stall uint64_t lsu_stall_per_core; vx_csr_get_l(hdevice, core_id, CSR_LSU_ST, CSR_LSU_ST_H, &lsu_stall_per_core); fprintf(stdout, "PERF: \t\tlsu_stall=%ld\n", lsu_stall_per_core); lsu_stall += lsu_stall_per_core; // fpu_stall uint64_t fpu_stall_per_core; vx_csr_get_l(hdevice, core_id, CSR_FPU_ST, CSR_FPU_ST_H, &fpu_stall_per_core); fprintf(stdout, "PERF: \t\tfpu_stall=%ld\n", fpu_stall_per_core); fpu_stall += fpu_stall_per_core; // mul_stall uint64_t mul_stall_per_core; vx_csr_get_l(hdevice, core_id, CSR_MUL_ST, CSR_MUL_ST_H, &mul_stall_per_core); fprintf(stdout, "PERF: \t\tmul_stall=%ld\n", mul_stall_per_core); mul_stall += mul_stall_per_core; // csr_stall uint64_t csr_stall_per_core; vx_csr_get_l(hdevice, core_id, CSR_CSR_ST, CSR_CSR_ST_H, &csr_stall_per_core); fprintf(stdout, "PERF: \t\tcsr_stall=%ld\n", csr_stall_per_core); csr_stall += csr_stall_per_core; // alu_stall uint64_t alu_stall_per_core; vx_csr_get_l(hdevice, core_id, CSR_ALU_ST, CSR_ALU_ST_H, &alu_stall_per_core); fprintf(stdout, "PERF: \t\talu_stall=%ld\n", alu_stall_per_core); alu_stall += alu_stall_per_core; // gpu_stall uint64_t gpu_stall_per_core; vx_csr_get_l(hdevice, core_id, CSR_GPU_ST, CSR_GPU_ST_H, &gpu_stall_per_core); fprintf(stdout, "PERF: \t\tgpu_stall=%ld\n", gpu_stall_per_core); gpu_stall += gpu_stall_per_core; // ibuffer_stall uint64_t ibuffer_stall_per_core; vx_csr_get_l(hdevice, core_id, CSR_IBUF_ST, CSR_IBUF_ST_H, &ibuffer_stall_per_core); fprintf(stdout, "PERF: \t\tibuffer_stall=%ld\n", ibuffer_stall_per_core); ibuffer_stall += ibuffer_stall_per_core; // scoreboard_stall uint64_t scoreboard_stall_per_core; vx_csr_get_l(hdevice, core_id, CSR_SCRBRD_ST, CSR_SCRBRD_ST_H, &scoreboard_stall_per_core); fprintf(stdout, "PERF: \t\tscoreboard_stall=%ld\n", scoreboard_stall_per_core); scoreboard_stall += scoreboard_stall_per_core; // icache_stall uint64_t icache_stall_per_core; vx_csr_get_l(hdevice, core_id, CSR_ICACHE_ST, CSR_ICACHE_ST_H, &icache_stall_per_core); fprintf(stdout, "PERF: \t\ticache_stall=%ld\n", icache_stall_per_core); icache_stall += icache_stall_per_core; #endif // ------------------------- } float IPC = (float)(double(total_instrs) / double(total_cycles)); fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, total_cycles, IPC); #ifdef PERF_ENABLE // PERF: cache fprintf(stdout, "PERF: \t\ttotal_reads=%ld\n", total_r); fprintf(stdout, "PERF: \t\ttotal_writes=%ld\n", total_w); fprintf(stdout, "PERF: \t\tdram_stalls=%ld\n", dram_st); fprintf(stdout, "PERF: \t\tdram_latency=%ld\n", dram_lat); fprintf(stdout, "PERF: \t\tdram_response=%ld\n", dram_rsp); float dram_lat_per_rsp = (float)(double(dram_lat) / double(dram_rsp)); fprintf(stdout, "PERF: \t\tdram_latency_per_response=%f\n", dram_lat_per_rsp); fprintf(stdout, "PERF: \t\tmsrq_stalls=%ld\n", msrq_st); fprintf(stdout, "PERF: \t\ttotal_stalls=%ld\n", total_st); fprintf(stdout, "PERF: \t\tread_misses=%ld\n", r_miss); fprintf(stdout, "PERF: \t\twrite_misses=%ld\n", w_miss); fprintf(stdout, "PERF: \t\tcore_rsp_stalls=%ld\n", core_rsp_st); fprintf(stdout, "PERF: \t\ttotal_evictions=%ld\n", total_evict); // PERF: pipeline stall fprintf(stdout, "PERF: \t\tlsu_stall=%ld\n", lsu_stall); fprintf(stdout, "PERF: \t\tfpu_stall=%ld\n", fpu_stall); fprintf(stdout, "PERF: \t\tmul_stall=%ld\n", mul_stall); fprintf(stdout, "PERF: \t\tcsr_stall=%ld\n", csr_stall); fprintf(stdout, "PERF: \t\talu_stall=%ld\n", alu_stall); fprintf(stdout, "PERF: \t\tgpu_stall=%ld\n", gpu_stall); fprintf(stdout, "PERF: \t\tibuffer_stall=%ld\n", ibuffer_stall); fprintf(stdout, "PERF: \t\tscoreboard_stall=%ld\n", scoreboard_stall); fprintf(stdout, "PERF: \t\ticache_stall=%ld\n", icache_stall); #endif // ------------------------- } else { uint64_t instrs, cycles; vx_get_perf(hdevice, 0, &instrs, &cycles); float IPC = (float)(double(instrs) / double(cycles)); fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC); #ifdef PERF_ENABLE // PERF: cache // total_read uint64_t total_r; vx_csr_get_l(hdevice, 0, CSR_TOTAL_R, CSR_TOTAL_R_H, &total_r); fprintf(stdout, "PERF: \t\ttotal_reads=%ld\n", total_r); // total_write uint64_t total_w; vx_csr_get_l(hdevice, 0, CSR_TOTAL_W, CSR_TOTAL_W_H, &total_w); fprintf(stdout, "PERF: \t\ttotal_writes=%ld\n", total_w); // dram_stall uint64_t dram_st; vx_csr_get_l(hdevice, 0, CSR_DRAM_ST, CSR_DRAM_ST_H, &dram_st); fprintf(stdout, "PERF: \t\tdram_stalls=%ld\n", dram_st); // dram_latency uint64_t dram_lat, dram_rsp; vx_csr_get_l(hdevice, 0, CSR_DRAM_LAT, CSR_DRAM_LAT_H, &dram_lat); vx_csr_get_l(hdevice, 0, CSR_DRAM_RSP, CSR_DRAM_RSP_H, &dram_rsp); float dram_lat_per_rsp = (float)(double(dram_lat) / double(dram_rsp)); fprintf(stdout, "PERF: \t\tdram_latency=%ld\n", dram_lat); fprintf(stdout, "PERF: \t\tdram_response=%ld\n", dram_rsp); fprintf(stdout, "PERF: \t\tdram_latency_per_response=%f\n", dram_lat_per_rsp); // miss_reserve_queue_stall uint64_t msrq_st; vx_csr_get_l(hdevice, 0, CSR_MSRQ_ST, CSR_MSRQ_ST_H, &msrq_st); fprintf(stdout, "PERF: \t\tmsrq_stalls=%ld\n", msrq_st); // total_stall uint64_t total_st; vx_csr_get_l(hdevice, 0, CSR_TOTAL_ST, CSR_TOTAL_ST_H, &total_st); fprintf(stdout, "PERF: \t\ttotal_stalls=%ld\n", total_st); // read_miss uint64_t r_miss; vx_csr_get_l(hdevice, 0, CSR_R_MISS, CSR_R_MISS_H, &r_miss); fprintf(stdout, "PERF: \t\tread_misses=%ld\n", r_miss); // write_miss uint64_t w_miss; vx_csr_get_l(hdevice, 0, CSR_W_MISS, CSR_W_MISS_H, &w_miss); fprintf(stdout, "PERF: \t\twrite_misses=%ld\n", w_miss); // core_rsp_stalls uint64_t core_rsp_st; vx_csr_get_l(hdevice, 0, CSR_CORE_RSP_ST, CSR_CORE_RSP_ST_H, &core_rsp_st); fprintf(stdout, "PERF: \t\ttotal_stalls=%ld\n", core_rsp_st); // total_evictions uint64_t total_evict; vx_csr_get_l(hdevice, 0, CSR_TOTAL_EV, CSR_TOTAL_EV_H, &total_evict); fprintf(stdout, "PERF: \t\ttotal_evictions=%ld\n", total_evict); // PERF: pipeline stalls // TODO: // lsu_stall uint64_t lsu_stall; vx_csr_get_l(hdevice, 0, CSR_LSU_ST, CSR_LSU_ST_H, &lsu_stall); fprintf(stdout, "PERF: \t\tlsu_stall=%ld\n", lsu_stall); // fpu_stall uint64_t fpu_stall; vx_csr_get_l(hdevice, 0, CSR_FPU_ST, CSR_FPU_ST_H, &fpu_stall); fprintf(stdout, "PERF: \t\tfpu_stall=%ld\n", fpu_stall); // mul_stall uint64_t mul_stall; vx_csr_get_l(hdevice, 0, CSR_MUL_ST, CSR_MUL_ST_H, &mul_stall); fprintf(stdout, "PERF: \t\tmul_stall=%ld\n", mul_stall); // csr_stall uint64_t csr_stall; vx_csr_get_l(hdevice, 0, CSR_CSR_ST, CSR_CSR_ST_H, &csr_stall); fprintf(stdout, "PERF: \t\tcsr_stall=%ld\n", csr_stall); // alu_stall uint64_t alu_stall; vx_csr_get_l(hdevice, 0, CSR_ALU_ST, CSR_ALU_ST_H, &alu_stall); fprintf(stdout, "PERF: \t\talu_stall=%ld\n", alu_stall); // gpu_stall uint64_t gpu_stall; vx_csr_get_l(hdevice, 0, CSR_GPU_ST, CSR_GPU_ST_H, &gpu_stall); fprintf(stdout, "PERF: \t\tgpu_stall=%ld\n", gpu_stall); // ibuffer_stall uint64_t ibuffer_stall; vx_csr_get_l(hdevice, 0, CSR_IBUF_ST, CSR_IBUF_ST_H, &ibuffer_stall); fprintf(stdout, "PERF: \t\tibuffer_stall=%ld\n", ibuffer_stall); // scoreboard_stall uint64_t scoreboard_stall; vx_csr_get_l(hdevice, 0, CSR_SCRBRD_ST, CSR_SCRBRD_ST_H, &scoreboard_stall); fprintf(stdout, "PERF: \t\tscoreboard_stall=%ld\n", scoreboard_stall); // icache_stall uint64_t icache_stall; vx_csr_get_l(hdevice, 0, CSR_ICACHE_ST, CSR_ICACHE_ST_H, &icache_stall); fprintf(stdout, "PERF: \t\ticache_stall=%ld\n", icache_stall); #endif // ------------------------- } #endif delete device; return 0; } extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) { if (nullptr == hdevice || nullptr == dev_maddr || 0 >= size) return -1; vx_device *device = ((vx_device*)hdevice); return device->alloc_local_mem(size, dev_maddr); } extern int vx_flush_caches(vx_device_h hdevice, size_t dev_maddr, size_t size) { if (nullptr == hdevice || 0 >= size) return -1; vx_device *device = ((vx_device*)hdevice); return device->flush_caches(dev_maddr, size); } extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer) { if (nullptr == hdevice || 0 >= size || nullptr == hbuffer) return -1; vx_device *device = ((vx_device*)hdevice); auto buffer = new vx_buffer(size, device); if (nullptr == buffer->data()) { delete buffer; return -1; } *hbuffer = buffer; return 0; } extern void* vx_host_ptr(vx_buffer_h hbuffer) { if (nullptr == hbuffer) return nullptr; vx_buffer* buffer = ((vx_buffer*)hbuffer); return buffer->data(); } extern int vx_buf_release(vx_buffer_h hbuffer) { if (nullptr == hbuffer) return -1; vx_buffer* buffer = ((vx_buffer*)hbuffer); delete buffer; return 0; } extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset) { if (nullptr == hbuffer || 0 >= size) return -1; auto buffer = (vx_buffer*)hbuffer; if (size + src_offset > buffer->size()) return -1; return buffer->device()->upload(buffer->data(), dev_maddr, size, src_offset); } extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dest_offset) { if (nullptr == hbuffer || 0 >= size) return -1; auto buffer = (vx_buffer*)hbuffer; if (size + dest_offset > buffer->size()) return -1; return buffer->device()->download(buffer->data(), dev_maddr, size, dest_offset); } extern int vx_start(vx_device_h hdevice) { if (nullptr == hdevice) return -1; vx_device *device = ((vx_device*)hdevice); return device->start(); } extern int vx_ready_wait(vx_device_h hdevice, long long timeout) { if (nullptr == hdevice) return -1; vx_device *device = ((vx_device*)hdevice); return device->wait(timeout); } extern int vx_csr_set(vx_device_h hdevice, int core_id, int addr, unsigned value) { if (nullptr == hdevice) return -1; vx_device *device = ((vx_device*)hdevice); return device->set_csr(core_id, addr, value); } extern int vx_csr_get(vx_device_h hdevice, int core_id, int addr, unsigned* value) { if (nullptr == hdevice) return -1; vx_device *device = ((vx_device*)hdevice); return device->get_csr(core_id, addr, value); } extern int vx_csr_get_l(vx_device_h hdevice, int core_id, int addr, int addr_h, uint64_t* value) { if (nullptr == hdevice) return -1; unsigned csr_value; vx_csr_get(hdevice, core_id, addr_h, &csr_value); *value = csr_value; vx_csr_get(hdevice, core_id, addr, &csr_value); *value = (*value << 32) | csr_value; return 0; }