diff --git a/driver/common/vx_utils.cpp b/driver/common/vx_utils.cpp index d4fcf518..d7b8f829 100644 --- a/driver/common/vx_utils.cpp +++ b/driver/common/vx_utils.cpp @@ -4,31 +4,6 @@ #include #include -extern int vx_dev_caps(int caps_id) { - switch (caps_id) { - case VX_CAPS_VERSION: - return 0; - case VX_CAPS_MAX_CORES: - return NUM_CORES; - case VX_CAPS_MAX_WARPS: - return NUM_WARPS; - case VX_CAPS_MAX_THREADS: - return NUM_THREADS; - case VX_CAPS_CACHE_LINESIZE: - return 64; - case VX_CAPS_LOCAL_MEM_SIZE: - return 0xffffffff; - case VX_CAPS_ALLOC_BASE_ADDR: - return 0x10000000; - case VX_CAPS_KERNEL_BASE_ADDR: - return 0x80000000; - default: - std::cout << "invalid caps id: " << caps_id << std::endl; - std::abort(); - return 0; - } -} - extern int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_t size) { int err = 0; @@ -36,7 +11,10 @@ extern int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_ return -1; uint32_t buffer_transfer_size = 65536; - uint32_t kernel_base_addr = vx_dev_caps(VX_CAPS_KERNEL_BASE_ADDR); + unsigned kernel_base_addr; + err = vx_dev_caps(device, VX_CAPS_KERNEL_BASE_ADDR, &kernel_base_addr); + if (err != 0) + return -1; // allocate device buffer vx_buffer_h buffer; @@ -47,7 +25,7 @@ extern int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_ // get buffer address auto buf_ptr = (uint8_t*)vx_host_ptr(buffer); - #if defined(USE_SIMX) +#if defined(USE_SIMX) // default startup routine ((uint32_t*)buf_ptr)[0] = 0xf1401073; ((uint32_t*)buf_ptr)[1] = 0xf1401073; diff --git a/driver/include/vortex.h b/driver/include/vortex.h index 2f379b11..ded648db 100644 --- a/driver/include/vortex.h +++ b/driver/include/vortex.h @@ -21,15 +21,15 @@ typedef void* vx_buffer_h; #define VX_CAPS_ALLOC_BASE_ADDR 0x6 #define VX_CAPS_KERNEL_BASE_ADDR 0x7 -// return device configurations -int vx_dev_caps(int caps_id); - // open the device and connect to it int vx_dev_open(vx_device_h* hdevice); // Close the 
device when all the operations are done int vx_dev_close(vx_device_h hdevice); +// return device configurations +int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value); + // Allocate shared buffer with device int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer); @@ -58,10 +58,10 @@ int vx_start(vx_device_h hdevice); int vx_ready_wait(vx_device_h hdevice, long long timeout); // set device constant registers -int vx_set_regiters(int state, int value); +int vx_csr_set(vx_device_h hdevice, int address, int value); // get device constant registers -int vx_get_regiters(int state, int* value); +int vx_csr_get(vx_device_h hdevice, int address, int* value); ////////////////////////////// UTILITY FUNCIONS /////////////////////////////// diff --git a/driver/opae/scope.cpp b/driver/opae/scope.cpp index 1f642659..6053d353 100644 --- a/driver/opae/scope.cpp +++ b/driver/opae/scope.cpp @@ -18,8 +18,8 @@ return -1; \ } while (false) -#define MMIO_CSR_SCOPE_CMD (AFU_IMAGE_MMIO_CSR_SCOPE_CMD * 4) -#define MMIO_CSR_SCOPE_DATA (AFU_IMAGE_MMIO_CSR_SCOPE_DATA * 4) +#define MMIO_SCOPE_READ (AFU_IMAGE_MMIO_SCOPE_READ * 4) +#define MMIO_SCOPE_WRITE (AFU_IMAGE_MMIO_SCOPE_WRITE * 4) struct scope_signal_t { int width; @@ -136,7 +136,7 @@ int vx_scope_start(fpga_handle hfpga, uint64_t delay) { if (delay != uint64_t(-1)) { // set start delay uint64_t cmd_delay = ((delay << 3) | 4); - CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, cmd_delay)); + CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, cmd_delay)); std::cout << "scope start delay: " << delay << std::endl; } @@ -150,7 +150,7 @@ int vx_scope_stop(fpga_handle hfpga, uint64_t delay) { if (delay != uint64_t(-1)) { // stop recording uint64_t cmd_stop = ((delay << 3) | 5); - CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, cmd_stop)); + CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, cmd_stop)); std::cout << "scope stop delay: " << delay << std::endl; } @@ -170,9 +170,9 @@ 
int vx_scope_stop(fpga_handle hfpga, uint64_t delay) { uint64_t frame_width, max_frames, data_valid; // wait for recording to terminate - CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, 0)); + CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, 0)); do { - CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_CSR_SCOPE_DATA, &data_valid)); + CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &data_valid)); if (data_valid) break; std::this_thread::sleep_for(std::chrono::seconds(1)); @@ -180,15 +180,15 @@ int vx_scope_stop(fpga_handle hfpga, uint64_t delay) { std::cout << "scope trace dump begin..." << std::endl; - CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, 2)); - CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_CSR_SCOPE_DATA, &frame_width)); + CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, 2)); + CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &frame_width)); std::cout << "scope::frame_width=" << std::dec << frame_width << std::endl; - CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, 3)); - CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_CSR_SCOPE_DATA, &max_frames)); + CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, 3)); + CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &max_frames)); std::cout << "scope::max_frames=" << std::dec << max_frames << std::endl; - CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, 1)); + CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, 1)); if (fwidth != (int)frame_width) { std::cerr << "invalid frame_width: expecting " << std::dec << fwidth << "!" 
<< std::endl; @@ -209,7 +209,7 @@ int vx_scope_stop(fpga_handle hfpga, uint64_t delay) { ofs << "b1 0" << std::endl; uint64_t delta; - fpga_result res = fpgaReadMMIO64(hfpga, 0, MMIO_CSR_SCOPE_DATA, &delta); + fpga_result res = fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &delta); assert(res == FPGA_OK); while (delta != 0) { @@ -228,14 +228,14 @@ int vx_scope_stop(fpga_handle hfpga, uint64_t delay) { do { if (frame_no == (max_frames-1)) { // verify last frame is valid - CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, 0)); - CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_CSR_SCOPE_DATA, &data_valid)); + CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, 0)); + CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &data_valid)); assert(data_valid == 1); - CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, 1)); + CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, 1)); } uint64_t word; - CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_CSR_SCOPE_DATA, &word)); + CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &word)); do { int signal_width = scope_signals[signal_id-1].width; @@ -267,8 +267,8 @@ int vx_scope_stop(fpga_handle hfpga, uint64_t delay) { std::cout << "scope trace dump done! 
- " << (timestamp/2) << " cycles" << std::endl; // verify data not valid - CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, 0)); - CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_CSR_SCOPE_DATA, &data_valid)); + CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, 0)); + CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &data_valid)); assert(data_valid == 0); return 0; diff --git a/driver/opae/vortex.cpp b/driver/opae/vortex.cpp index ff020636..abb9c5b1 100755 --- a/driver/opae/vortex.cpp +++ b/driver/opae/vortex.cpp @@ -1,17 +1,24 @@ #include +#include #include #include +#include #include #include #include #include #include #include +#include #include "vortex_afu.h" #ifdef SCOPE #include "scope.h" #endif +#define CACHE_LINESIZE 64 +#define ALLOC_BASE_ADDR 0x10000000 +#define LOCAL_MEM_SIZE 0xffffffff + #define CHECK_RES(_expr) \ do { \ fpga_result res = _expr; \ @@ -24,22 +31,31 @@ /////////////////////////////////////////////////////////////////////////////// -#define CMD_TYPE_READ AFU_IMAGE_CMD_TYPE_READ -#define CMD_TYPE_WRITE AFU_IMAGE_CMD_TYPE_WRITE -#define CMD_TYPE_RUN AFU_IMAGE_CMD_TYPE_RUN -#define CMD_TYPE_CLFLUSH AFU_IMAGE_CMD_TYPE_CLFLUSH +#define CMD_MEM_READ AFU_IMAGE_CMD_MEM_READ +#define CMD_MEM_WRITE AFU_IMAGE_CMD_MEM_WRITE +#define CMD_RUN AFU_IMAGE_CMD_RUN +#define CMD_CLFLUSH AFU_IMAGE_CMD_CLFLUSH +#define CMD_CSR_READ AFU_IMAGE_CMD_CSR_READ +#define CMD_CSR_WRITE AFU_IMAGE_CMD_CSR_WRITE -#define MMIO_CSR_CMD (AFU_IMAGE_MMIO_CSR_CMD * 4) -#define MMIO_CSR_IO_ADDR (AFU_IMAGE_MMIO_CSR_IO_ADDR * 4) -#define MMIO_CSR_MEM_ADDR (AFU_IMAGE_MMIO_CSR_MEM_ADDR * 4) -#define MMIO_CSR_DATA_SIZE (AFU_IMAGE_MMIO_CSR_DATA_SIZE * 4) -#define MMIO_CSR_STATUS (AFU_IMAGE_MMIO_CSR_STATUS * 4) +#define MMIO_CMD_TYPE (AFU_IMAGE_MMIO_CMD_TYPE * 4) +#define MMIO_IO_ADDR (AFU_IMAGE_MMIO_IO_ADDR * 4) +#define MMIO_MEM_ADDR (AFU_IMAGE_MMIO_MEM_ADDR * 4) +#define MMIO_DATA_SIZE (AFU_IMAGE_MMIO_DATA_SIZE * 4) +#define MMIO_STATUS (AFU_IMAGE_MMIO_STATUS * 4) +#define 
MMIO_CSR_ADDR (AFU_IMAGE_MMIO_CSR_ADDR * 4) +#define MMIO_CSR_DATA (AFU_IMAGE_MMIO_CSR_DATA * 4) +#define MMIO_CSR_READ (AFU_IMAGE_MMIO_CSR_READ * 4) /////////////////////////////////////////////////////////////////////////////// typedef struct vx_device_ { fpga_handle fpga; size_t mem_allocation; + int implementation_id; + int num_cores; + int num_warps; + int num_threads; } vx_device_t; typedef struct vx_buffer_ { @@ -62,21 +78,58 @@ inline bool is_aligned(size_t addr, size_t alignment) { /////////////////////////////////////////////////////////////////////////////// +extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) { + if (nullptr == hdevice) + return -1; + + vx_device_t *device = ((vx_device_t*)hdevice); + + switch (caps_id) { + case VX_CAPS_VERSION: + *value = device->implementation_id; + break; + case VX_CAPS_MAX_CORES: + *value = device->num_cores; + break; + case VX_CAPS_MAX_WARPS: + *value = device->num_warps; + break; + case VX_CAPS_MAX_THREADS: + *value = device->num_threads; + break; + case VX_CAPS_CACHE_LINESIZE: + *value = CACHE_LINESIZE; + break; + case VX_CAPS_LOCAL_MEM_SIZE: + *value = LOCAL_MEM_SIZE; + break; + case VX_CAPS_ALLOC_BASE_ADDR: + *value = ALLOC_BASE_ADDR; + break; + case VX_CAPS_KERNEL_BASE_ADDR: + *value = STARTUP_ADDR; + break; + default: + fprintf(stderr, "invalid caps id: %d\n", caps_id); + std::abort(); + return -1; + } + + return 0; +} + extern int vx_dev_open(vx_device_h* hdevice) { + if (nullptr == hdevice) + return -1; + fpga_properties filter = nullptr; fpga_result res; fpga_guid guid; fpga_token accel_token; uint32_t num_matches; fpga_handle accel_handle; - vx_device_t* device; - - if (nullptr == hdevice) - return -1; - - // ensure that the block size 64 - assert(64 == vx_dev_caps(VX_CAPS_CACHE_LINESIZE)); - + vx_device_t* device; + // Set up a filter that will search for an accelerator fpgaGetProperties(nullptr, &filter); fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR); @@ -114,17 
+167,32 @@ extern int vx_dev_open(vx_device_h* hdevice) { } device->fpga = accel_handle; - device->mem_allocation = vx_dev_caps(VX_CAPS_ALLOC_BASE_ADDR); + device->mem_allocation = ALLOC_BASE_ADDR; - *hdevice = device; + { + // Load device CAPS + int ret = 0; + ret |= vx_csr_get(device, CSR_IMPL_ID, &device->implementation_id); + ret |= vx_csr_get(device, CSR_NC, &device->num_cores); + ret |= vx_csr_get(device, CSR_NW, &device->num_warps); + ret |= vx_csr_get(device, CSR_NT, &device->num_threads); + if (ret != 0) { + fpgaClose(accel_handle); + return ret; + } + } #ifdef SCOPE { - int ret = vx_scope_start(device->fpga, 0); - if (ret != 0) + int ret = vx_scope_start(accel_handle, 0); + if (ret != 0) { + fpgaClose(accel_handle); return ret; + } } -#endif +#endif + + *hdevice = device; return 0; } @@ -154,10 +222,8 @@ extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) vx_device_t *device = ((vx_device_t*)hdevice); - int line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE); - size_t dev_mem_size = vx_dev_caps(VX_CAPS_LOCAL_MEM_SIZE); - - size_t asize = align_size(size, line_size); + size_t dev_mem_size = LOCAL_MEM_SIZE; + size_t asize = align_size(size, CACHE_LINESIZE); if (device->mem_allocation + asize > dev_mem_size) return -1; @@ -182,9 +248,7 @@ extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hb vx_device_t *device = ((vx_device_t*)hdevice); - int line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE); - - size_t asize = align_size(size, line_size); + size_t asize = align_size(size, CACHE_LINESIZE); res = fpgaPrepareBuffer(device->fpga, asize, &host_ptr, &wsid, 0); if (FPGA_OK != res) { @@ -260,7 +324,7 @@ extern int vx_ready_wait(vx_device_h hdevice, long long timeout) { long long sleep_time_ms = (sleep_time.tv_sec * 1000) + (sleep_time.tv_nsec / 1000000); for (;;) { - CHECK_RES(fpgaReadMMIO64(device->fpga, 0, MMIO_CSR_STATUS, &data)); + CHECK_RES(fpgaReadMMIO64(device->fpga, 0, MMIO_STATUS, &data)); if (0 == data || 
0 == timeout) { if (data != 0) { fprintf(stdout, "ready-wait timed out: status=%ld\n", data); @@ -282,17 +346,15 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si vx_buffer_t *buffer = ((vx_buffer_t*)hbuffer); vx_device_t *device = ((vx_device_t*)buffer->hdevice); - int line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE); - size_t dev_mem_size = vx_dev_caps(VX_CAPS_LOCAL_MEM_SIZE); - - size_t asize = align_size(size, line_size); + size_t dev_mem_size = LOCAL_MEM_SIZE; + size_t asize = align_size(size, CACHE_LINESIZE); // check alignment - if (!is_aligned(dev_maddr, line_size)) + if (!is_aligned(dev_maddr, CACHE_LINESIZE)) return -1; - if (!is_aligned(buffer->io_addr + src_offset, line_size)) + if (!is_aligned(buffer->io_addr + src_offset, CACHE_LINESIZE)) return -1; - + // bound checking if (src_offset + asize > buffer->size) return -1; @@ -303,12 +365,12 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si if (vx_ready_wait(buffer->hdevice, -1) != 0) return -1; - auto ls_shift = (int)std::log2(line_size); + auto ls_shift = (int)std::log2(CACHE_LINESIZE); - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_IO_ADDR, (buffer->io_addr + src_offset) >> ls_shift)); - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_MEM_ADDR, dev_maddr >> ls_shift)); - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA_SIZE, asize >> ls_shift)); - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_WRITE)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_IO_ADDR, (buffer->io_addr + src_offset) >> ls_shift)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_MEM_ADDR, dev_maddr >> ls_shift)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_DATA_SIZE, asize >> ls_shift)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_WRITE)); // Wait for the write operation to finish if (vx_ready_wait(buffer->hdevice, -1) != 0) @@ -325,15 +387,13 @@ extern int vx_copy_from_dev(vx_buffer_h 
hbuffer, size_t dev_maddr, size_t size, vx_buffer_t *buffer = ((vx_buffer_t*)hbuffer); vx_device_t *device = ((vx_device_t*)buffer->hdevice); - int line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE); - size_t dev_mem_size = vx_dev_caps(VX_CAPS_LOCAL_MEM_SIZE); - - size_t asize = align_size(size, line_size); + size_t dev_mem_size = LOCAL_MEM_SIZE; + size_t asize = align_size(size, CACHE_LINESIZE); // check alignment - if (!is_aligned(dev_maddr, line_size)) + if (!is_aligned(dev_maddr, CACHE_LINESIZE)) return -1; - if (!is_aligned(buffer->io_addr + dest_offset, line_size)) + if (!is_aligned(buffer->io_addr + dest_offset, CACHE_LINESIZE)) return -1; // bound checking @@ -346,12 +406,12 @@ extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, if (vx_ready_wait(buffer->hdevice, -1) != 0) return -1; - auto ls_shift = (int)std::log2(line_size); + auto ls_shift = (int)std::log2(CACHE_LINESIZE); - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_IO_ADDR, (buffer->io_addr + dest_offset) >> ls_shift)); - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_MEM_ADDR, dev_maddr >> ls_shift)); - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA_SIZE, asize >> ls_shift)); - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_READ)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_IO_ADDR, (buffer->io_addr + dest_offset) >> ls_shift)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_MEM_ADDR, dev_maddr >> ls_shift)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_DATA_SIZE, asize >> ls_shift)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_READ)); // Wait for the write operation to finish if (vx_ready_wait(buffer->hdevice, -1) != 0) @@ -367,23 +427,21 @@ extern int vx_flush_caches(vx_device_h hdevice, size_t dev_maddr, size_t size) { vx_device_t* device = ((vx_device_t*)hdevice); - int line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE); - - size_t asize = align_size(size, line_size); + size_t asize = 
align_size(size, CACHE_LINESIZE); // check alignment - if (!is_aligned(dev_maddr, line_size)) + if (!is_aligned(dev_maddr, CACHE_LINESIZE)) return -1; // Ensure ready for new command if (vx_ready_wait(hdevice, -1) != 0) return -1; - auto ls_shift = (int)std::log2(line_size); + auto ls_shift = (int)std::log2(CACHE_LINESIZE); - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_MEM_ADDR, dev_maddr >> ls_shift)); - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA_SIZE, asize >> ls_shift)); - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_CLFLUSH)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_MEM_ADDR, dev_maddr >> ls_shift)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_DATA_SIZE, asize >> ls_shift)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_CLFLUSH)); // Wait for the write operation to finish if (vx_ready_wait(hdevice, -1) != 0) @@ -396,13 +454,59 @@ extern int vx_start(vx_device_h hdevice) { if (nullptr == hdevice) return -1; + vx_device_t *device = ((vx_device_t*)hdevice); + // Ensure ready for new command if (vx_ready_wait(hdevice, -1) != 0) return -1; - // start execution + // start execution + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_RUN)); + + return 0; +} + +// set device constant registers +extern int vx_csr_set(vx_device_h hdevice, int address, int value) { + if (nullptr == hdevice) + return -1; + vx_device_t *device = ((vx_device_t*)hdevice); - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_RUN)); + + // Ensure ready for new command + if (vx_ready_wait(hdevice, -1) != 0) + return -1; + + // write CSR value + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_ADDR, address)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA, value)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_CSR_WRITE)); + + return 0; +} + +// get device constant registers +extern int vx_csr_get(vx_device_h hdevice, int address, int* value) { + if (nullptr == 
hdevice || nullptr == value) + return -1; + + vx_device_t *device = ((vx_device_t*)hdevice); + + // Ensure ready for new command + if (vx_ready_wait(hdevice, -1) != 0) + return -1; + + // write CSR value + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_ADDR, address)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_CSR_READ)); + + // Ensure ready for new command + if (vx_ready_wait(hdevice, -1) != 0) + return -1; + + uint64_t value64; + CHECK_RES(fpgaReadMMIO64(device->fpga, 0, MMIO_CSR_READ, &value64)); + *value = (int)value64; return 0; } \ No newline at end of file diff --git a/driver/rtlsim/vortex.cpp b/driver/rtlsim/vortex.cpp index 5550c821..51c3bb8c 100644 --- a/driver/rtlsim/vortex.cpp +++ b/driver/rtlsim/vortex.cpp @@ -7,14 +7,19 @@ #include #include +#include #include #include +#define CACHE_LINESIZE 64 +#define ALLOC_BASE_ADDR 0x10000000 +#define LOCAL_MEM_SIZE 0xffffffff + /////////////////////////////////////////////////////////////////////////////// -static size_t align_size(size_t size) { - uint32_t cache_block_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE); - return cache_block_size * ((size + cache_block_size - 1) / cache_block_size); +inline size_t align_size(size_t size, size_t alignment) { + assert(0 == (alignment & (alignment - 1))); + return (size + alignment - 1) & ~(alignment - 1); } /////////////////////////////////////////////////////////////////////////////// @@ -26,7 +31,7 @@ public: vx_buffer(size_t size, vx_device* device) : size_(size) , device_(device) { - auto aligned_asize = align_size(size); + auto aligned_asize = align_size(size, CACHE_LINESIZE); data_ = malloc(aligned_asize); } @@ -59,7 +64,7 @@ private: class vx_device { public: vx_device() { - mem_allocation_ = vx_dev_caps(VX_CAPS_ALLOC_BASE_ADDR); + mem_allocation_ = ALLOC_BASE_ADDR; simulator_.attach_ram(&ram_); } @@ -70,8 +75,8 @@ public: } int alloc_local_mem(size_t size, size_t* dev_maddr) { - size_t asize = align_size(size); - auto dev_mem_size = 
vx_dev_caps(VX_CAPS_LOCAL_MEM_SIZE); + auto dev_mem_size = LOCAL_MEM_SIZE; + size_t asize = align_size(size, CACHE_LINESIZE); if (mem_allocation_ + asize > dev_mem_size) return -1; *dev_maddr = mem_allocation_; @@ -80,7 +85,7 @@ public: } int upload(void* src, size_t dest_addr, size_t size, size_t src_offset) { - size_t asize = align_size(size); + size_t asize = align_size(size, CACHE_LINESIZE); if (dest_addr + asize > ram_.size()) return -1; @@ -94,7 +99,7 @@ public: } int download(const void* dest, size_t src_addr, size_t size, size_t dest_offset) { - size_t asize = align_size(size); + size_t asize = align_size(size, CACHE_LINESIZE); if (src_addr + asize > ram_.size()) return -1; @@ -156,6 +161,44 @@ private: /////////////////////////////////////////////////////////////////////////////// +extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) { + if (nullptr == hdevice) + return -1; + + switch (caps_id) { + case VX_CAPS_VERSION: + *value = IMPLEMENTATION_ID; + break; + case VX_CAPS_MAX_CORES: + *value = NUM_CORES; + break; + case VX_CAPS_MAX_WARPS: + *value = NUM_WARPS; + break; + case VX_CAPS_MAX_THREADS: + *value = NUM_THREADS; + break; + case VX_CAPS_CACHE_LINESIZE: + *value = CACHE_LINESIZE; + break; + case VX_CAPS_LOCAL_MEM_SIZE: + *value = 0xffffffff; + break; + case VX_CAPS_ALLOC_BASE_ADDR: + *value = 0x10000000; + break; + case VX_CAPS_KERNEL_BASE_ADDR: + *value = STARTUP_ADDR; + break; + default: + std::cout << "invalid caps id: " << caps_id << std::endl; + std::abort(); + return -1; + } + + return 0; +} + extern int vx_dev_open(vx_device_h* hdevice) { if (nullptr == hdevice) return -1; diff --git a/driver/simx/vortex.cpp b/driver/simx/vortex.cpp index 394e1db2..981ca5c1 100644 --- a/driver/simx/vortex.cpp +++ b/driver/simx/vortex.cpp @@ -11,13 +11,16 @@ #include #include -#define PAGE_SIZE 4096 +#define CACHE_LINESIZE 64 +#define PAGE_SIZE 4096 +#define ALLOC_BASE_ADDR 0x10000000 +#define LOCAL_MEM_SIZE 0xffffffff 
/////////////////////////////////////////////////////////////////////////////// -static size_t align_size(size_t size) { - uint32_t cache_block_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE); - return cache_block_size * ((size + cache_block_size - 1) / cache_block_size); +inline size_t align_size(size_t size, size_t alignment) { + assert(0 == (alignment & (alignment - 1))); + return (size + alignment - 1) & ~(alignment - 1); } /////////////////////////////////////////////////////////////////////////////// @@ -29,7 +32,7 @@ public: vx_buffer(size_t size, vx_device* device) : size_(size) , device_(device) { - auto aligned_asize = align_size(size); + auto aligned_asize = align_size(size, CACHE_LINESIZE); data_ = malloc(aligned_asize); } @@ -65,7 +68,7 @@ public: : is_done_(false) , is_running_(false) , thread_(__thread_proc__, this) { - mem_allocation_ = vx_dev_caps(VX_CAPS_ALLOC_BASE_ADDR); + mem_allocation_ = ALLOC_BASE_ADDR; } ~vx_device() { @@ -77,8 +80,8 @@ public: } int alloc_local_mem(size_t size, size_t* dev_maddr) { - auto asize = align_size(size); - auto dev_mem_size = vx_dev_caps(VX_CAPS_LOCAL_MEM_SIZE); + auto dev_mem_size = LOCAL_MEM_SIZE; + auto asize = align_size(size, CACHE_LINESIZE); if (mem_allocation_ + asize > dev_mem_size) return -1; *dev_maddr = mem_allocation_; @@ -87,7 +90,7 @@ public: } int upload(void* src, size_t dest_addr, size_t size, size_t src_offset) { - auto asize = align_size(size); + auto asize = align_size(size, CACHE_LINESIZE); if (dest_addr + asize > ram_.size()) return -1; @@ -101,7 +104,7 @@ public: } int download(const void* dest, size_t src_addr, size_t size, size_t dest_offset) { - size_t asize = align_size(size); + size_t asize = align_size(size, CACHE_LINESIZE); if (src_addr + asize > ram_.size()) return -1; @@ -216,6 +219,44 @@ extern int vx_dev_close(vx_device_h hdevice) { return 0; } +extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) { + if (nullptr == hdevice) + return -1; + + switch (caps_id) { 
+ case VX_CAPS_VERSION: + *value = IMPLEMENTATION_ID; + break; + case VX_CAPS_MAX_CORES: + *value = NUM_CORES; + break; + case VX_CAPS_MAX_WARPS: + *value = NUM_WARPS; + break; + case VX_CAPS_MAX_THREADS: + *value = NUM_THREADS; + break; + case VX_CAPS_CACHE_LINESIZE: + *value = CACHE_LINESIZE; + break; + case VX_CAPS_LOCAL_MEM_SIZE: + *value = LOCAL_MEM_SIZE; + break; + case VX_CAPS_ALLOC_BASE_ADDR: + *value = ALLOC_BASE_ADDR; + break; + case VX_CAPS_KERNEL_BASE_ADDR: + *value = STARTUP_ADDR; + break; + default: + std::cout << "invalid caps id: " << caps_id << std::endl; + std::abort(); + return -1; + } + + return 0; +} + extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) { if (nullptr == hdevice || nullptr == dev_maddr diff --git a/driver/stub/vortex.cpp b/driver/stub/vortex.cpp index 13f09ae5..532e64d7 100644 --- a/driver/stub/vortex.cpp +++ b/driver/stub/vortex.cpp @@ -8,6 +8,10 @@ extern int vx_dev_close(vx_device_h /*hdevice*/) { return -1; } +extern int vx_dev_caps(vx_device_h /*hdevice*/, unsigned /*caps_id*/, unsigned* /*value*/) { + return -1; +} + extern int vx_alloc_dev_mem(vx_device_h /*hdevice*/, size_t /*size*/, size_t* /*dev_maddr*/) { return -1; } diff --git a/driver/tests/basic/basic.cpp b/driver/tests/basic/basic.cpp index 7301e12d..92a785f4 100755 --- a/driver/tests/basic/basic.cpp +++ b/driver/tests/basic/basic.cpp @@ -179,7 +179,12 @@ int main(int argc, char *argv[]) { count = 1; } - uint32_t max_cores = vx_dev_caps(VX_CAPS_MAX_CORES); + // open device connection + std::cout << "open device connection" << std::endl; + RT_CHECK(vx_dev_open(&device)); + + unsigned max_cores; + RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); uint32_t num_points = max_cores * count; uint32_t num_blocks = (num_points * sizeof(uint32_t) + 63) / 64; uint32_t buf_size = num_blocks * 64; @@ -187,10 +192,6 @@ int main(int argc, char *argv[]) { std::cout << "number of points: " << num_points << std::endl; std::cout << "buffer 
size: " << buf_size << " bytes" << std::endl; - // open device connection - std::cout << "open device connection" << std::endl; - RT_CHECK(vx_dev_open(&device)); - // allocate device memory RT_CHECK(vx_alloc_dev_mem(device, buf_size, &value)); kernel_arg.src_ptr = value; diff --git a/driver/tests/demo/demo.cpp b/driver/tests/demo/demo.cpp index 71f42e0b..2d5b47f8 100644 --- a/driver/tests/demo/demo.cpp +++ b/driver/tests/demo/demo.cpp @@ -110,9 +110,14 @@ int main(int argc, char *argv[]) { count = 1; } - uint32_t max_cores = vx_dev_caps(VX_CAPS_MAX_CORES); - uint32_t max_warps = vx_dev_caps(VX_CAPS_MAX_WARPS); - uint32_t max_threads = vx_dev_caps(VX_CAPS_MAX_THREADS); + // open device connection + std::cout << "open device connection" << std::endl; + RT_CHECK(vx_dev_open(&device)); + + unsigned max_cores, max_warps, max_threads; + RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads)); uint32_t num_points = count * max_cores * max_warps * max_threads; uint32_t buf_size = num_points * sizeof(uint32_t); @@ -120,10 +125,6 @@ int main(int argc, char *argv[]) { std::cout << "number of points: " << num_points << std::endl; std::cout << "buffer size: " << buf_size << " bytes" << std::endl; - // open device connection - std::cout << "open device connection" << std::endl; - RT_CHECK(vx_dev_open(&device)); - // upload program std::cout << "upload program" << std::endl; RT_CHECK(vx_upload_kernel_file(device, kernel_file)); diff --git a/hw/opae/vortex_afu.json b/hw/opae/vortex_afu.json index 98dcecfd..f198383c 100644 --- a/hw/opae/vortex_afu.json +++ b/hw/opae/vortex_afu.json @@ -5,18 +5,23 @@ "clock-frequency-high": "auto", "clock-frequency-low": "auto", - "mmio-csr-cmd": 10, - "mmio-csr-io-addr": 12, - "mmio-csr-mem-addr": 14, - "mmio-csr-data-size": 16, - "mmio-csr-status": 18, - "mmio-csr-scope-cmd": 20, - "mmio-csr-scope-data": 22, + 
"cmd-mem-read": 1, + "cmd-mem-write": 2, + "cmd-run": 3, + "cmd-clflush": 4, + "cmd-csr-read": 5, + "cmd-csr-write": 6, - "cmd-type-read": 1, - "cmd-type-write": 2, - "cmd-type-run": 3, - "cmd-type-clflush": 4, + "mmio-cmd-type": 10, + "mmio-io-addr": 12, + "mmio-mem-addr": 14, + "mmio-data-size": 16, + "mmio-status": 18, + "mmio-scope-read": 20, + "mmio-scope-write": 22, + "mmio-csr-addr": 24, + "mmio-csr-data": 26, + "mmio-csr-read": 28, "afu-top-interface": { diff --git a/hw/opae/vortex_afu.sv b/hw/opae/vortex_afu.sv index 0650c0ac..2216611e 100644 --- a/hw/opae/vortex_afu.sv +++ b/hw/opae/vortex_afu.sv @@ -53,19 +53,25 @@ localparam CCI_RW_QUEUE_SIZE = 1024; localparam AFU_ID_L = 16'h0002; // AFU ID Lower localparam AFU_ID_H = 16'h0004; // AFU ID Higher -localparam CMD_TYPE_READ = `AFU_IMAGE_CMD_TYPE_READ; -localparam CMD_TYPE_WRITE = `AFU_IMAGE_CMD_TYPE_WRITE; -localparam CMD_TYPE_RUN = `AFU_IMAGE_CMD_TYPE_RUN; -localparam CMD_TYPE_CLFLUSH = `AFU_IMAGE_CMD_TYPE_CLFLUSH; +localparam CMD_MEM_READ = `AFU_IMAGE_CMD_MEM_READ; +localparam CMD_MEM_WRITE = `AFU_IMAGE_CMD_MEM_WRITE; +localparam CMD_RUN = `AFU_IMAGE_CMD_RUN; +localparam CMD_CLFLUSH = `AFU_IMAGE_CMD_CLFLUSH; +localparam CMD_CSR_READ = `AFU_IMAGE_CMD_CSR_READ; +localparam CMD_CSR_WRITE = `AFU_IMAGE_CMD_CSR_WRITE; -localparam MMIO_CSR_CMD = `AFU_IMAGE_MMIO_CSR_CMD; -localparam MMIO_CSR_IO_ADDR = `AFU_IMAGE_MMIO_CSR_IO_ADDR; -localparam MMIO_CSR_MEM_ADDR = `AFU_IMAGE_MMIO_CSR_MEM_ADDR; -localparam MMIO_CSR_DATA_SIZE = `AFU_IMAGE_MMIO_CSR_DATA_SIZE; -localparam MMIO_CSR_STATUS = `AFU_IMAGE_MMIO_CSR_STATUS; +localparam MMIO_CMD_TYPE = `AFU_IMAGE_MMIO_CMD_TYPE; +localparam MMIO_IO_ADDR = `AFU_IMAGE_MMIO_IO_ADDR; +localparam MMIO_MEM_ADDR = `AFU_IMAGE_MMIO_MEM_ADDR; +localparam MMIO_DATA_SIZE = `AFU_IMAGE_MMIO_DATA_SIZE; +localparam MMIO_STATUS = `AFU_IMAGE_MMIO_STATUS; -localparam MMIO_CSR_SCOPE_CMD = `AFU_IMAGE_MMIO_CSR_SCOPE_CMD; -localparam MMIO_CSR_SCOPE_DATA= `AFU_IMAGE_MMIO_CSR_SCOPE_DATA; +localparam 
MMIO_SCOPE_READ = `AFU_IMAGE_MMIO_SCOPE_READ; +localparam MMIO_SCOPE_WRITE = `AFU_IMAGE_MMIO_SCOPE_WRITE; + +localparam MMIO_CSR_ADDR = `AFU_IMAGE_MMIO_CSR_ADDR; +localparam MMIO_CSR_DATA = `AFU_IMAGE_MMIO_CSR_DATA; +localparam MMIO_CSR_READ = `AFU_IMAGE_MMIO_CSR_READ; logic [127:0] afu_id = `AFU_ACCEL_UUID; @@ -75,7 +81,9 @@ typedef enum logic[3:0] { STATE_WRITE, STATE_START, STATE_RUN, - STATE_CLFLUSH + STATE_CLFLUSH, + STATE_CSR_READ, + STATE_CSR_WRITE } state_t; typedef logic [$clog2(CCI_RD_WINDOW_SIZE)-1:0] t_cci_rdq_tag; @@ -134,20 +142,24 @@ logic avs_rdq_empty; logic avs_rdq_full; `DEBUG_END -// CSR variables ////////////////////////////////////////////////////////////// +// CMD variables ////////////////////////////////////////////////////////////// -logic [2:0] csr_cmd; -t_ccip_clAddr csr_io_addr; -logic[DRAM_ADDR_WIDTH-1:0] csr_mem_addr; -logic[DRAM_ADDR_WIDTH-1:0] csr_data_size; +logic [2:0] cmd_type; +t_ccip_clAddr cmd_io_addr; +logic[DRAM_ADDR_WIDTH-1:0] cmd_mem_addr; +logic[DRAM_ADDR_WIDTH-1:0] cmd_data_size; `ifdef SCOPE -logic [63:0] csr_scope_cmd; -logic [63:0] csr_scope_data; -logic csr_scope_read; -logic csr_scope_write; +logic [63:0] cmd_scope_rdata; +logic [63:0] cmd_scope_wdata; +logic cmd_scope_read; +logic cmd_scope_write; `endif +logic [31:0] cmd_csr_addr; +logic [31:0] cmd_csr_rdata; +logic [31:0] cmd_csr_wdata; + // MMIO controller //////////////////////////////////////////////////////////// `IGNORE_WARNINGS_BEGIN @@ -159,9 +171,9 @@ t_if_ccip_c2_Tx mmio_tx; assign af2cp_sTxPort.c2 = mmio_tx; `ifdef SCOPE -assign csr_scope_cmd = 64'(cp2af_sRxPort.c0.data); -assign csr_scope_write = cp2af_sRxPort.c0.mmioWrValid && (MMIO_CSR_SCOPE_CMD == mmio_hdr.address); -assign csr_scope_read = cp2af_sRxPort.c0.mmioRdValid && (MMIO_CSR_SCOPE_DATA == mmio_hdr.address); +assign cmd_scope_wdata = 64'(cp2af_sRxPort.c0.data); +assign cmd_scope_read = cp2af_sRxPort.c0.mmioRdValid && (MMIO_SCOPE_READ == mmio_hdr.address); +assign cmd_scope_write = 
cp2af_sRxPort.c0.mmioWrValid && (MMIO_SCOPE_WRITE == mmio_hdr.address); `endif always_ff @(posedge clk) @@ -170,57 +182,63 @@ begin mmio_tx.hdr <= 0; mmio_tx.data <= 0; mmio_tx.mmioRdValid <= 0; - csr_cmd <= 0; - csr_io_addr <= 0; - csr_mem_addr <= 0; - csr_data_size <= 0; + cmd_type <= 0; + cmd_io_addr <= 0; + cmd_mem_addr <= 0; + cmd_data_size <= 0; end else begin - csr_cmd <= 0; + cmd_type <= 0; mmio_tx.mmioRdValid <= 0; // serve MMIO write request if (cp2af_sRxPort.c0.mmioWrValid) begin case (mmio_hdr.address) - MMIO_CSR_IO_ADDR: begin - csr_io_addr <= t_ccip_clAddr'(cp2af_sRxPort.c0.data); + MMIO_IO_ADDR: begin + cmd_io_addr <= t_ccip_clAddr'(cp2af_sRxPort.c0.data); `ifdef DBG_PRINT_OPAE - $display("%t: CSR_IO_ADDR: 0x%0h", $time, t_ccip_clAddr'(cp2af_sRxPort.c0.data)); + $display("%t: MMIO_IO_ADDR: 0x%0h", $time, t_ccip_clAddr'(cp2af_sRxPort.c0.data)); `endif end - MMIO_CSR_MEM_ADDR: begin - csr_mem_addr <= t_local_mem_addr'(cp2af_sRxPort.c0.data); + MMIO_MEM_ADDR: begin + cmd_mem_addr <= t_local_mem_addr'(cp2af_sRxPort.c0.data); `ifdef DBG_PRINT_OPAE - $display("%t: CSR_MEM_ADDR: 0x%0h", $time, t_local_mem_addr'(cp2af_sRxPort.c0.data)); + $display("%t: MMIO_MEM_ADDR: 0x%0h", $time, t_local_mem_addr'(cp2af_sRxPort.c0.data)); `endif end - MMIO_CSR_DATA_SIZE: begin - csr_data_size <= $bits(csr_data_size)'(cp2af_sRxPort.c0.data); + MMIO_DATA_SIZE: begin + cmd_data_size <= $bits(cmd_data_size)'(cp2af_sRxPort.c0.data); `ifdef DBG_PRINT_OPAE - $display("%t: CSR_DATA_SIZE: %0d", $time, $bits(csr_data_size)'(cp2af_sRxPort.c0.data)); + $display("%t: MMIO_DATA_SIZE: %0d", $time, $bits(cmd_data_size)'(cp2af_sRxPort.c0.data)); `endif end - MMIO_CSR_CMD: begin - csr_cmd <= $bits(csr_cmd)'(cp2af_sRxPort.c0.data); + MMIO_CMD_TYPE: begin + cmd_type <= $bits(cmd_type)'(cp2af_sRxPort.c0.data); `ifdef DBG_PRINT_OPAE - $display("%t: CSR_CMD: %0d", $time, $bits(csr_cmd)'(cp2af_sRxPort.c0.data)); + $display("%t: MMIO_CMD_TYPE: %0d", $time, $bits(cmd_type)'(cp2af_sRxPort.c0.data)); 
`endif end `ifdef SCOPE - MMIO_CSR_SCOPE_CMD: begin + MMIO_SCOPE_WRITE: begin `ifdef DBG_PRINT_OPAE - $display("%t: CSR_SCOPE_CMD: %0h", $time, 64'(cp2af_sRxPort.c0.data)); + $display("%t: MMIO_SCOPE_WRITE: %0h", $time, 64'(cp2af_sRxPort.c0.data)); `endif end `endif - default: begin - // user-defined CSRs - //if (mmio_hdr.addres >= MMIO_CSR_USER) begin - // write Vortex CRS - //end - end + MMIO_CSR_ADDR: begin + cmd_csr_addr <= $bits(cmd_csr_addr)'(cp2af_sRxPort.c0.data); + `ifdef DBG_PRINT_OPAE + $display("%t: MMIO_CSR_ADDR: %0h", $time, $bits(cmd_csr_addr)'(cp2af_sRxPort.c0.data)); + `endif + end + MMIO_CSR_DATA: begin + cmd_csr_wdata <= $bits(cmd_csr_wdata)'(cp2af_sRxPort.c0.data); + `ifdef DBG_PRINT_OPAE + $display("%t: MMIO_CSR_DATA: %0h", $time, $bits(cmd_csr_wdata)'(cp2af_sRxPort.c0.data)); + `endif + end endcase end @@ -243,22 +261,28 @@ begin AFU_ID_H: mmio_tx.data <= afu_id[127:64]; // afu id hi 16'h0006: mmio_tx.data <= 64'h0; // next AFU 16'h0008: mmio_tx.data <= 64'h0; // reserved - MMIO_CSR_STATUS: begin + MMIO_STATUS: begin `ifdef DBG_PRINT_OPAE if (state != state_t'(mmio_tx.data)) begin - $display("%t: STATUS: state=%0d", $time, state); + $display("%t: MMIO_STATUS: state=%0d", $time, state); end `endif mmio_tx.data <= 64'(state); end `ifdef SCOPE - MMIO_CSR_SCOPE_DATA: begin - mmio_tx.data <= csr_scope_data; + MMIO_SCOPE_READ: begin + mmio_tx.data <= cmd_scope_rdata; `ifdef DBG_PRINT_OPAE - $display("%t: SCOPE: data=%0h", $time, csr_scope_data); + $display("%t: MMIO_SCOPE_READ: data=%0h", $time, cmd_scope_rdata); `endif end `endif + MMIO_CSR_READ: begin + mmio_tx.data <= cmd_csr_rdata; + `ifdef DBG_PRINT_OPAE + $display("%t: MMIO_CSR_READ: data=%0h", $time, cmd_csr_rdata); + `endif + end default: mmio_tx.data <= 64'h0; endcase mmio_tx.mmioRdValid <= 1; // post response @@ -271,6 +295,8 @@ end logic cmd_read_done; logic cmd_write_done; logic cmd_clflush_done; +logic cmd_csr_read_done; +logic cmd_csr_write_done; logic cmd_run_done; always_ff @(posedge 
clk) @@ -285,32 +311,44 @@ begin case (state) STATE_IDLE: begin - case (csr_cmd) - CMD_TYPE_READ: begin + case (cmd_type) + CMD_MEM_READ: begin `ifdef DBG_PRINT_OPAE - $display("%t: STATE READ: ia=%0h da=%0h sz=%0d", $time, csr_io_addr, csr_mem_addr, csr_data_size); + $display("%t: STATE READ: ia=%0h addr=%0h size=%0d", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size); `endif state <= STATE_READ; end - CMD_TYPE_WRITE: begin + CMD_MEM_WRITE: begin `ifdef DBG_PRINT_OPAE - $display("%t: STATE WRITE: ia=%0h da=%0h sz=%0d", $time, csr_io_addr, csr_mem_addr, csr_data_size); + $display("%t: STATE WRITE: ia=%0h addr=%0h size=%0d", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size); `endif state <= STATE_WRITE; end - CMD_TYPE_RUN: begin + CMD_RUN: begin `ifdef DBG_PRINT_OPAE $display("%t: STATE START", $time); `endif vx_reset <= 1; state <= STATE_START; end - CMD_TYPE_CLFLUSH: begin + CMD_CLFLUSH: begin `ifdef DBG_PRINT_OPAE - $display("%t: STATE CFLUSH: da=%0h sz=%0d", $time, csr_mem_addr, csr_data_size); + $display("%t: STATE CFLUSH: addr=%0h size=%0d", $time, cmd_mem_addr, cmd_data_size); `endif state <= STATE_CLFLUSH; end + CMD_CSR_READ: begin + `ifdef DBG_PRINT_OPAE + $display("%t: STATE CSR_READ: addr=%0h", $time, cmd_csr_addr); + `endif + state <= STATE_CSR_READ; + end + CMD_CSR_WRITE: begin + `ifdef DBG_PRINT_OPAE + $display("%t: STATE CSR_WRITE: addr=%0h data=%0d", $time, cmd_csr_addr, cmd_csr_wdata); + `endif + state <= STATE_CSR_WRITE; + end default: begin state <= state; end @@ -345,6 +383,18 @@ begin end end + STATE_CSR_READ: begin + if (cmd_csr_read_done) begin + state <= STATE_IDLE; + end + end + + STATE_CSR_WRITE: begin + if (cmd_csr_write_done) begin + state <= STATE_IDLE; + end + end + default: begin state <= state; end @@ -385,7 +435,7 @@ assign cci_dram_rd_req_enable = (state == STATE_READ) assign cci_dram_wr_req_enable = (state == STATE_WRITE) && !cci_rdq_empty - && (cci_dram_wr_req_ctr < csr_data_size); + && (cci_dram_wr_req_ctr < cmd_data_size); assign 
vx_dram_req_enable = vortex_enabled && (avs_pending_reads < AVS_RD_QUEUE_SIZE); assign vx_dram_rd_req_enable = vx_dram_req_enable && vx_dram_req_valid && !vx_dram_req_rw; @@ -414,19 +464,19 @@ end always_comb begin case (state) - CMD_TYPE_READ: avs_address = cci_dram_rd_req_addr; - CMD_TYPE_WRITE: avs_address = cci_dram_wr_req_addr + ((DRAM_ADDR_WIDTH)'(t_cci_rdq_tag'(cci_rdq_dout))); + STATE_READ: avs_address = cci_dram_rd_req_addr; /* key on FSM state (STATE_*), not on the CMD_* command codes */ + STATE_WRITE: avs_address = cci_dram_wr_req_addr + ((DRAM_ADDR_WIDTH)'(t_cci_rdq_tag'(cci_rdq_dout))); default: avs_address = vx_dram_req_addr[`VX_DRAM_ADDR_WIDTH-1:`VX_DRAM_ADDR_WIDTH-DRAM_ADDR_WIDTH]; endcase case (state) - CMD_TYPE_READ: avs_byteenable = 64'hffffffffffffffff; - CMD_TYPE_WRITE: avs_byteenable = 64'hffffffffffffffff; + STATE_READ: avs_byteenable = 64'hffffffffffffffff; + STATE_WRITE: avs_byteenable = 64'hffffffffffffffff; default: avs_byteenable = vx_dram_req_byteen_; endcase case (state) - CMD_TYPE_WRITE: avs_writedata = cci_rdq_dout[$bits(t_ccip_clData) + $bits(t_cci_rdq_tag)-1:$bits(t_cci_rdq_tag)]; + STATE_WRITE: avs_writedata = cci_rdq_dout[$bits(t_ccip_clData) + $bits(t_cci_rdq_tag)-1:$bits(t_cci_rdq_tag)]; default: avs_writedata = (DRAM_LINE_WIDTH)'(vx_dram_req_data) << vx_dram_req_offset; endcase end @@ -434,7 +484,7 @@ end assign avs_read = cci_dram_rd_req_enable || vx_dram_rd_req_enable; assign avs_write = cci_dram_wr_req_enable || vx_dram_wr_req_enable; -assign cmd_write_done = (cci_dram_wr_req_ctr >= csr_data_size); +assign cmd_write_done = (cci_dram_wr_req_ctr >= cmd_data_size); always_ff @(posedge clk) begin @@ -451,12 +501,12 @@ begin else begin if (state == STATE_IDLE) begin - if (CMD_TYPE_READ == csr_cmd) begin - cci_dram_rd_req_addr <= csr_mem_addr; - cci_dram_rd_req_ctr <= csr_data_size; + if (CMD_MEM_READ == cmd_type) begin + cci_dram_rd_req_addr <= cmd_mem_addr; + cci_dram_rd_req_ctr <= cmd_data_size; end - else if (CMD_TYPE_WRITE == csr_cmd) begin - cci_dram_wr_req_addr <=
csr_mem_addr; + else if (CMD_MEM_WRITE == cmd_type) begin + cci_dram_wr_req_addr <= cmd_mem_addr; cci_dram_wr_req_ctr <= 0; end end @@ -598,17 +648,17 @@ begin else begin if ((STATE_IDLE == state) - && (CMD_TYPE_WRITE == csr_cmd)) begin - cci_rd_req_addr <= csr_io_addr; + && (CMD_MEM_WRITE == cmd_type)) begin + cci_rd_req_addr <= cmd_io_addr; cci_rd_req_ctr <= 0; cci_rd_rsp_ctr <= 0; cci_pending_reads <= 0; - cci_rd_req_enable <= (csr_data_size != 0); + cci_rd_req_enable <= (cmd_data_size != 0); cci_rd_req_wait <= 0; end cci_rd_req_enable <= (STATE_WRITE == state) - && (cci_rd_req_ctr_next < csr_data_size) + && (cci_rd_req_ctr_next < cmd_data_size) && (cci_pending_reads_next < CCI_RD_QUEUE_SIZE); if (cci_rd_req_fire) begin @@ -618,7 +668,7 @@ begin cci_rd_req_wait <= 1; // end current request batch end `ifdef DBG_PRINT_OPAE - $display("%t: CCI Rd Req: addr=%0h, rem=%0d, pending=%0d", $time, cci_rd_req_addr, (csr_data_size - cci_rd_req_ctr_next), cci_pending_reads_next); + $display("%t: CCI Rd Req: addr=%0h, rem=%0d, pending=%0d", $time, cci_rd_req_addr, (cmd_data_size - cci_rd_req_ctr_next), cci_pending_reads_next); `endif end @@ -695,9 +745,9 @@ begin else begin if ((STATE_IDLE == state) - && (CMD_TYPE_READ == csr_cmd)) begin - cci_wr_req_addr <= csr_io_addr; - cci_wr_req_ctr <= csr_data_size; + && (CMD_MEM_READ == cmd_type)) begin + cci_wr_req_addr <= cmd_io_addr; + cci_wr_req_ctr <= cmd_data_size; cci_pending_writes <= 0; end @@ -733,11 +783,11 @@ logic [`VX_DRAM_ADDR_WIDTH-1:0] snp_rsp_ctr, snp_rsp_ctr_next; logic vx_snp_req_fire, vx_snp_rsp_fire; if (`VX_DRAM_LINE_WIDTH != DRAM_LINE_WIDTH) begin - assign snp_req_baseaddr = {csr_mem_addr, (`VX_DRAM_ADDR_WIDTH - DRAM_ADDR_WIDTH)'(0)}; - assign snp_req_size = {csr_data_size, (`VX_DRAM_ADDR_WIDTH - DRAM_ADDR_WIDTH)'(0)}; + assign snp_req_baseaddr = {cmd_mem_addr, (`VX_DRAM_ADDR_WIDTH - DRAM_ADDR_WIDTH)'(0)}; + assign snp_req_size = {cmd_data_size, (`VX_DRAM_ADDR_WIDTH - DRAM_ADDR_WIDTH)'(0)}; end else begin - 
assign snp_req_baseaddr = csr_mem_addr; - assign snp_req_size = csr_data_size; + assign snp_req_baseaddr = cmd_mem_addr; + assign snp_req_size = cmd_data_size; end assign vx_snp_req_fire = vx_snp_req_valid && vx_snp_req_ready; @@ -761,7 +811,7 @@ begin else begin if ((STATE_IDLE == state) - && (CMD_TYPE_CLFLUSH == csr_cmd)) begin + && (CMD_CLFLUSH == cmd_type)) begin vx_snp_req_addr <= snp_req_baseaddr; vx_snp_req_tag <= 0; snp_req_ctr <= 0; @@ -802,6 +852,23 @@ begin end end +// CSRs/////////////////////////////////////////////////////////////////////// + +assign cmd_csr_read_done = 1; +assign cmd_csr_write_done = 1; + +always_comb begin + case (cmd_csr_addr) + `CSR_VEND_ID : cmd_csr_rdata = `VENDOR_ID; + `CSR_ARCH_ID : cmd_csr_rdata = `ARCHITECTURE_ID; + `CSR_IMPL_ID : cmd_csr_rdata = `IMPLEMENTATION_ID; + `CSR_NT : cmd_csr_rdata = `NUM_THREADS; + `CSR_NW : cmd_csr_rdata = `NUM_WARPS; + `CSR_NC : cmd_csr_rdata = `NUM_CORES * `NUM_CLUSTERS; + default : cmd_csr_rdata = 0; + endcase +end + // Vortex ///////////////////////////////////////////////////////////////////// assign cmd_run_done = !vx_busy; @@ -944,10 +1011,10 @@ VX_scope #( .stop (0), .changed (scope_data_in_ste[1]), .data_in (scope_data_in_ste[SCOPE_DATAW+1:2]), - .bus_in (csr_scope_cmd), - .bus_out (csr_scope_data), - .bus_read (csr_scope_read), - .bus_write(csr_scope_write) + .bus_in (cmd_scope_wdata), + .bus_out (cmd_scope_rdata), + .bus_read (cmd_scope_read), + .bus_write(cmd_scope_write) ); `endif diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 4f6377a0..6a5f7386 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -59,21 +59,33 @@ `define L3_ENABLE (`NUM_CLUSTERS > 1) `endif -`define CSR_LTID 12'h020 -`define CSR_LWID 12'h021 -`define CSR_GTID 12'hF14 // reserved Hardware Thread ID (mhartid) -`define CSR_GWID 12'h023 -`define CSR_GCID 12'h024 -`define CSR_NT 12'h025 -`define CSR_NW 12'h026 -`define CSR_NC 12'h027 +// Configuration Values 
======================================================= -`define CSR_CYCLL 12'hC00 -`define CSR_CYCLH 12'hC80 -`define CSR_INSTL 12'hC02 -`define CSR_INSTH 12'hC82 +`define VENDOR_ID 0 +`define ARCHITECTURE_ID 0 +`define IMPLEMENTATION_ID 0 -// ========================= Dcache Configurable Knobs ======================== +// CSR Addresses ============================================================== + +`define CSR_VEND_ID 12'hF11 +`define CSR_ARCH_ID 12'hF12 +`define CSR_IMPL_ID 12'hF13 +`define CSR_GTID 12'hF14 + +`define CSR_LTID 12'h020 +`define CSR_LWID 12'h021 +`define CSR_GWID 12'h023 +`define CSR_GCID 12'h024 +`define CSR_NT 12'h025 +`define CSR_NW 12'h026 +`define CSR_NC 12'h027 + +`define CSR_CYCLE_L 12'hC00 +`define CSR_CYCLE_H 12'hC80 +`define CSR_INSTR_L 12'hC02 +`define CSR_INSTR_H 12'hC82 + +// Dcache Configurable Knobs ================================================== // Size of cache in bytes `ifndef DCACHE_SIZE @@ -144,7 +156,7 @@ `define DPRFQ_STRIDE 0 `endif -// ========================== Icache Configurable Knobs ======================= +// Icache Configurable Knobs ================================================== // Size of cache in bytes `ifndef ICACHE_SIZE @@ -210,7 +222,7 @@ `define IPRFQ_STRIDE 0 `endif -// =========================== SM Configurable Knobs ========================== +// SM Configurable Knobs ====================================================== // Size of cache in bytes `ifndef SCACHE_SIZE @@ -247,7 +259,7 @@ `define SCWBQ_SIZE `SCREQ_SIZE `endif -// ======================== L2cache Configurable Knobs ======================== +// L2cache Configurable Knobs ================================================= // Size of cache in bytes `ifndef L2CACHE_SIZE @@ -318,7 +330,7 @@ `define L2PRFQ_STRIDE 0 `endif -// ======================== L3cache Configurable Knobs ======================== +// L3cache Configurable Knobs ================================================= // Size of cache in bytes `ifndef L3CACHE_SIZE diff --git 
a/hw/rtl/VX_csr_data.v b/hw/rtl/VX_csr_data.v index b8144358..7488129e 100644 --- a/hw/rtl/VX_csr_data.v +++ b/hw/rtl/VX_csr_data.v @@ -43,18 +43,21 @@ module VX_csr_data #( always @(*) begin case (read_addr) - `CSR_LWID : read_data = 32'(warp_num); - `CSR_GTID , - `CSR_GWID : read_data = CORE_ID * `NUM_WARPS + 32'(warp_num); - `CSR_GCID : read_data = CORE_ID; - `CSR_NT : read_data = `NUM_THREADS; - `CSR_NW : read_data = `NUM_WARPS; - `CSR_NC : read_data = `NUM_CORES * `NUM_CLUSTERS; - `CSR_CYCLL : read_data = num_cycles[31:0]; - `CSR_CYCLH : read_data = num_cycles[63:32]; - `CSR_INSTL : read_data = num_instrs[31:0]; - `CSR_INSTH : read_data = num_instrs[63:32]; - default: read_data = 32'(csr_table[rd_addr]); + `CSR_LWID : read_data = 32'(warp_num); + `CSR_GTID , + `CSR_GWID : read_data = CORE_ID * `NUM_WARPS + 32'(warp_num); + `CSR_GCID : read_data = CORE_ID; + `CSR_NT : read_data = `NUM_THREADS; + `CSR_NW : read_data = `NUM_WARPS; + `CSR_NC : read_data = `NUM_CORES * `NUM_CLUSTERS; + `CSR_CYCLE_L : read_data = num_cycles[31:0]; + `CSR_CYCLE_H : read_data = num_cycles[63:32]; + `CSR_INSTR_L : read_data = num_instrs[31:0]; + `CSR_INSTR_H : read_data = num_instrs[63:32]; + `CSR_VEND_ID : read_data = `VENDOR_ID; + `CSR_ARCH_ID : read_data = `ARCHITECTURE_ID; + `CSR_IMPL_ID : read_data = `IMPLEMENTATION_ID; + default : read_data = 32'(csr_table[rd_addr]); endcase end