From c2e9240b7dd29af16777551c0a40a0ed9712bdd5 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Fri, 8 May 2020 08:28:28 -0700 Subject: [PATCH] OPAE rtl fixes --- driver/opae/vortex.cpp | 37 ++++----- driver/tests/basic/basic.cpp | 122 ++++++++++-------------------- driver/tests/demo/demo.cpp | 119 +++++++++-------------------- hw/opae/README | 16 +++- hw/opae/run_ase.sh | 6 +- hw/opae/vortex_afu.sv | 88 +++++++++++++-------- hw/rtl/VX_back_end.v | 12 +-- hw/rtl/VX_dmem_ctrl.v | 5 +- hw/rtl/VX_icache_stage.v | 25 +++--- hw/rtl/VX_lsu_unit.v | 2 +- hw/rtl/Vortex.v | 5 +- hw/rtl/cache/VX_tag_data_access.v | 14 ++-- 12 files changed, 208 insertions(+), 243 deletions(-) diff --git a/driver/opae/vortex.cpp b/driver/opae/vortex.cpp index 43816c30..eba1a7c5 100755 --- a/driver/opae/vortex.cpp +++ b/driver/opae/vortex.cpp @@ -143,6 +143,7 @@ extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) size_t dev_mem_size = vx_dev_caps(VX_CAPS_LOCAL_MEM_SIZE); size_t asize = align_size(size, line_size); + if (device->mem_allocation + asize > dev_mem_size) return -1; @@ -189,11 +190,11 @@ extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hb return -1; } - buffer->wsid = wsid; + buffer->wsid = wsid; buffer->host_ptr = host_ptr; - buffer->io_addr = io_addr; - buffer->hdevice = hdevice; - buffer->size = size; + buffer->io_addr = io_addr; + buffer->hdevice = hdevice; + buffer->size = asize; *hbuffer = buffer; @@ -265,18 +266,18 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si int line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE); size_t dev_mem_size = vx_dev_caps(VX_CAPS_LOCAL_MEM_SIZE); + size_t asize = align_size(size, line_size); + // check alignment if (!is_aligned(dev_maddr, line_size)) return -1; - if (!is_aligned(size, line_size)) - return -1; if (!is_aligned(buffer->io_addr + src_offset, line_size)) return -1; // bound checking - if (size + src_offset > buffer->size) + if (src_offset + asize > buffer->size) return -1; - if (dev_maddr + size > dev_mem_size) + if (dev_maddr + asize > dev_mem_size) return -1; // Ensure ready for new command @@ -287,7 +288,7 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_IO_ADDR, (buffer->io_addr + src_offset) >> ls_shift)); CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_MEM_ADDR, (dev_maddr >> ls_shift) )); - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA_SIZE, size >> ls_shift)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA_SIZE, asize >> ls_shift)); CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_WRITE)); // Wait for the write operation to finish @@ -308,18 +309,18 @@ extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, int line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE); size_t dev_mem_size = vx_dev_caps(VX_CAPS_LOCAL_MEM_SIZE); + size_t asize = align_size(size, line_size); + // check alignment if (!is_aligned(dev_maddr, line_size)) return -1; - if (!is_aligned(size, line_size)) - return -1; if (!is_aligned(buffer->io_addr + dest_offset, line_size)) return -1; // bound checking - if (size + dest_offset > buffer->size) + if (dest_offset + asize > buffer->size) return -1; - if (dev_maddr + size > dev_mem_size) + if (dev_maddr + asize > dev_mem_size) return -1; // Ensure ready for new command @@ -330,7 +331,7 @@ extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_IO_ADDR, (buffer->io_addr + dest_offset) >> ls_shift)); CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_MEM_ADDR, (dev_maddr) >> ls_shift)); - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA_SIZE, size >> ls_shift)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA_SIZE, asize >> ls_shift)); CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_READ)); // Wait for the write operation to finish @@ -347,13 +348,13 @@ extern int vx_flush_caches(vx_device_h hdevice, size_t dev_maddr, size_t size) { vx_device_t* device = ((vx_device_t*)hdevice); - int line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE); + int line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE); + + size_t asize = align_size(size, line_size); // check alignment if (!is_aligned(dev_maddr, line_size)) return -1; - if (!is_aligned(size, line_size)) - return -1; // Ensure ready for new command if (vx_ready_wait(hdevice, -1) != 0) @@ -362,7 +363,7 @@ extern int vx_flush_caches(vx_device_h hdevice, size_t dev_maddr, size_t size) { auto ls_shift = (int)std::log2(line_size); CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_MEM_ADDR, dev_maddr >> ls_shift)); - CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA_SIZE, size >> ls_shift)); + CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA_SIZE, asize >> ls_shift)); CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_CLFLUSH)); // Wait for the write operation to finish diff --git a/driver/tests/basic/basic.cpp b/driver/tests/basic/basic.cpp index 684d655d..424c68dc 100755 --- a/driver/tests/basic/basic.cpp +++ b/driver/tests/basic/basic.cpp @@ -4,6 +4,16 @@ int test = -1; +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + static void parse_args(int argc, char **argv) { int c; while ((c = getopt(argc, argv, "t:h?")) != -1) { @@ -27,12 +37,27 @@ uint64_t shuffle(int i, uint64_t value) { return (value << i) | (value & ((1 << i)-1));; } +vx_device_h device = nullptr; +vx_buffer_h sbuf = nullptr; +vx_buffer_h dbuf = nullptr; + +void cleanup() { + if (sbuf) { + vx_buf_release(sbuf); + } + if (dbuf) { + vx_buf_release(dbuf); + } + if (device) { + vx_dev_close(device); + } +} + int run_memcopy_test(vx_buffer_h sbuf, vx_buffer_h dbuf, uint32_t address, uint64_t value, int num_blocks) { - int ret; int errors = 0; // write sbuf data @@ -42,15 +67,11 @@ int run_memcopy_test(vx_buffer_h sbuf, // write buffer to local memory std::cout << "write buffer to local memory" << std::endl; - ret = vx_copy_to_dev(sbuf, address, 64 * num_blocks, 0); - if (ret != 0) - return ret; + RT_CHECK(vx_copy_to_dev(sbuf, address, 64 * num_blocks, 0)); // read buffer from local memory std::cout << "read buffer from local memory" << std::endl; - ret = vx_copy_from_dev(dbuf, address, 64 * num_blocks, 0); - if (ret != 0) - return ret; + RT_CHECK(vx_copy_from_dev(dbuf, address, 64 * num_blocks, 0)); // verify result std::cout << "verify result" << std::endl; @@ -77,7 +98,6 @@ int run_kernel_test(vx_device_h device, vx_buffer_h sbuf, vx_buffer_h dbuf, const char* program) { - int ret; int errors = 0; uint64_t seed = 0x0badf00d40ff40ff; @@ -93,43 +113,27 @@ int run_kernel_test(vx_device_h device, // write buffer to local memory std::cout << "write buffer to local memory" << std::endl; - ret = vx_copy_to_dev(sbuf, src_dev_addr, 64 * num_blocks, 0); - if (ret != 0) - return ret; + RT_CHECK(vx_copy_to_dev(sbuf, src_dev_addr, 64 * num_blocks, 0)); // upload program std::cout << "upload program" << std::endl; - ret = vx_upload_kernel_file(device, program); - if (ret != 0) { - return ret; - } + RT_CHECK(vx_upload_kernel_file(device, program)); // start device std::cout << "start device" << std::endl; - ret = vx_start(device); - if (ret != 0) { - return ret; - } + RT_CHECK(vx_start(device)); // wait for completion std::cout << "wait for completion" << std::endl; - ret = vx_ready_wait(device, -1); - if (ret != 0) { - return ret; - } + RT_CHECK(vx_ready_wait(device, -1)); // flush the caches std::cout << "flush the caches" << std::endl; - ret = vx_flush_caches(device, dest_dev_addr, 64 * num_blocks); - if (ret != 0) { - return ret; - } + RT_CHECK(vx_flush_caches(device, dest_dev_addr, 64 * num_blocks)); // read buffer from local memory std::cout << "read buffer from local memory" << std::endl; - ret = vx_copy_from_dev(dbuf, dest_dev_addr, 64 * num_blocks, 0); - if (ret != 0) - return ret; + RT_CHECK(vx_copy_from_dev(dbuf, dest_dev_addr, 64 * num_blocks, 0)); // verify result std::cout << "verify result" << std::endl; @@ -152,75 +156,33 @@ int run_kernel_test(vx_device_h device, return 0; } -vx_device_h device = nullptr; -vx_buffer_h sbuf = nullptr; -vx_buffer_h dbuf = nullptr; - -void cleanup() { - if (sbuf) { - vx_buf_release(sbuf); - } - if (dbuf) { - vx_buf_release(dbuf); - } - if (device) { - vx_dev_close(device); - } -} - int main(int argc, char *argv[]) { - int ret; - // parse command arguments parse_args(argc, argv); // open device connection std::cout << "open device connection" << std::endl; vx_device_h device; - ret = vx_dev_open(&device); - if (ret != 0) - return ret; + RT_CHECK(vx_dev_open(&device)); // create source buffer std::cout << "create source buffer" << std::endl; - ret = vx_alloc_shared_mem(device, 4096, &sbuf); - if (ret != 0) { - cleanup(); - return ret; - } + RT_CHECK(vx_alloc_shared_mem(device, 4096, &sbuf)); // create destination buffer std::cout << "create destination buffer" << std::endl; - ret = vx_alloc_shared_mem(device, 4096, &dbuf); - if (ret != 0) { - cleanup(); - return ret; - } + RT_CHECK(vx_alloc_shared_mem(device, 4096, &dbuf)); // run tests - if (0 == test || -1 == test) { + /*9if (0 == test || -1 == test) { std::cout << "run memcopy test" << std::endl; - - ret = run_memcopy_test(sbuf, dbuf, 0x10000000, 0x0badf00d00ff00ff, 1); - if (ret != 0) { - cleanup(); - return ret; - } - - ret = run_memcopy_test(sbuf, dbuf, 0x20000000, 0x0badf00d40ff40ff, 8); - if (ret != 0) { - cleanup(); - return ret; - } - } + RT_CHECK(run_memcopy_test(sbuf, dbuf, 0x10000000, 0x0badf00d00ff00ff, 1)); + RT_CHECK(run_memcopy_test(sbuf, dbuf, 0x20000000, 0x0badf00d40ff40ff, 8)); + }*/ if (1 == test || -1 == test) { std::cout << "run kernel test" << std::endl; - ret = run_kernel_test(device, sbuf, dbuf, "kernel.bin"); - if (ret != 0) { - cleanup(); - return ret; - } + RT_CHECK(run_kernel_test(device, sbuf, dbuf, "kernel.bin")); } // cleanup diff --git a/driver/tests/demo/demo.cpp b/driver/tests/demo/demo.cpp index 32c30dbe..4f937d51 100644 --- a/driver/tests/demo/demo.cpp +++ b/driver/tests/demo/demo.cpp @@ -4,6 +4,16 @@ #include #include "common.h" +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + const char* program_file = "kernel.bin"; uint32_t data_stride = 0xffffffff; @@ -39,40 +49,38 @@ static void parse_args(int argc, char **argv) { } } +vx_device_h device = nullptr; +vx_buffer_h buffer = nullptr; + +void cleanup() { + if (buffer) { + vx_buf_release(buffer); + } + if (device) { + vx_dev_close(device); + } +} + int run_test(vx_device_h device, vx_buffer_h buffer, const kernel_arg_t& kernel_arg, uint32_t buf_size, uint32_t num_points) { - int ret; - // start device std::cout << "start device" << std::endl; - ret = vx_start(device); - if (ret != 0) { - return ret; - } + RT_CHECK(vx_start(device)); // wait for completion std::cout << "wait for completion" << std::endl; - ret = vx_ready_wait(device, -1); - if (ret != 0) { - return ret; - } + RT_CHECK(vx_ready_wait(device, -1)); // flush the destination buffer caches std::cout << "flush the destination buffer caches" << std::endl; - ret = vx_flush_caches(device, kernel_arg.dst_ptr, buf_size); - if (ret != 0) { - return ret; - } + RT_CHECK(vx_flush_caches(device, kernel_arg.dst_ptr, buf_size)); // download destination buffer std::cout << "download destination buffer" << std::endl; - ret = vx_copy_from_dev(buffer, kernel_arg.dst_ptr, buf_size, 0); - if (ret != 0) { - return ret; - } + RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_ptr, buf_size, 0)); // verify result std::cout << "verify result" << std::endl; @@ -96,20 +104,7 @@ int run_test(vx_device_h device, return 0; } -vx_device_h device = nullptr; -vx_buffer_h buffer = nullptr; - -void cleanup() { - if (buffer) { - vx_buf_release(buffer); - } - if (device) { - vx_dev_close(device); - } -} - int main(int argc, char *argv[]) { - int ret; size_t value; kernel_arg_t kernel_arg; @@ -132,50 +127,28 @@ int main(int argc, char *argv[]) { // open device connection std::cout << "open device connection" << std::endl; - ret = vx_dev_open(&device); - if (ret != 0) - return ret; + RT_CHECK(vx_dev_open(&device)); // upload program std::cout << "upload program" << std::endl; - ret = vx_upload_kernel_file(device, program_file); - if (ret != 0) { - cleanup(); - return ret; - } + RT_CHECK(vx_upload_kernel_file(device, program_file)); // allocate device memory std::cout << "allocate device memory" << std::endl; - ret = vx_alloc_dev_mem(device, buf_size, &value); - if (ret != 0) { - cleanup(); - return ret; - } + RT_CHECK(vx_alloc_dev_mem(device, buf_size, &value)); kernel_arg.src0_ptr = value; - ret = vx_alloc_dev_mem(device, buf_size, &value); - if (ret != 0) { - cleanup(); - return ret; - } + RT_CHECK(vx_alloc_dev_mem(device, buf_size, &value)); kernel_arg.src1_ptr = value; - ret = vx_alloc_dev_mem(device, buf_size, &value); - if (ret != 0) { - cleanup(); - return ret; - } + RT_CHECK(vx_alloc_dev_mem(device, buf_size, &value)); kernel_arg.dst_ptr = value; // allocate shared memory std::cout << "allocate shared memory" << std::endl; uint32_t alloc_size = std::max(buf_size, sizeof(kernel_arg_t)); - ret = vx_alloc_shared_mem(device, alloc_size, &buffer); - if (ret != 0) { - cleanup(); - return ret; - } + RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &buffer)); // populate source buffer values std::cout << "populate source buffer values" << std::endl; @@ -187,19 +160,9 @@ int main(int argc, char *argv[]) { } // upload source buffers - std::cout << "upload source buffers" << std::endl; - - ret = vx_copy_to_dev(buffer, kernel_arg.src0_ptr, buf_size, 0); - if (ret != 0) { - cleanup(); - return ret; - } - - ret = vx_copy_to_dev(buffer, kernel_arg.src1_ptr, buf_size, 0); - if (ret != 0) { - cleanup(); - return ret; - } + std::cout << "upload source buffers" << std::endl; + RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src0_ptr, buf_size, 0)); + RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src1_ptr, buf_size, 0)); // upload kernel argument std::cout << "upload kernel argument" << std::endl; @@ -210,20 +173,12 @@ int main(int argc, char *argv[]) { auto buf_ptr = (int*)vx_host_ptr(buffer); memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); - ret = vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0); - if (ret != 0) { - cleanup(); - return ret; - } + RT_CHECK(vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0)); } // run tests std::cout << "run tests" << std::endl; - ret = run_test(device, buffer, kernel_arg, buf_size, num_points); - if (ret != 0) { - cleanup(); - return ret; - } + RT_CHECK(run_test(device, buffer, kernel_arg, buf_size, num_points)); // cleanup std::cout << "cleanup" << std::endl; diff --git a/hw/opae/README b/hw/opae/README index 90307f37..e0ed668d 100644 --- a/hw/opae/README +++ b/hw/opae/README @@ -42,6 +42,8 @@ make # ASE build instructions # +source /export/fpga/bin/setup-fpga-env fpga-pac-a10 + # Acquire a sever node for running ASE simulations qsub-sim @@ -51,7 +53,17 @@ vcd add -r /*/Vortex/hw/rtl/* run -all # compress VCD trace -tar -zcvf vortex.vcd.tar.gz work/vortex.vcd +tar -zcvf vortex.vcd.tar.gz ./build_ase/work/vortex.vcd # decompress VCD trace -tar -zxvf vortex.vcd.tar.gz vortex.vcd \ No newline at end of file +tar -zxvf /mnt/c/Users/Blaise/Downloads/vortex.vcd.tar.gz + +# launch Gtkwave +gtkwave ./build_ase/work/vortex.vcd & + +# test +./run_ase.sh ../../driver/tests/basic/basic + +# kill process by Users +ps -u tinebp +kill -9 \ No newline at end of file diff --git a/hw/opae/run_ase.sh b/hw/opae/run_ase.sh index 0cd953db..147f1147 100755 --- a/hw/opae/run_ase.sh +++ b/hw/opae/run_ase.sh @@ -14,17 +14,19 @@ rm -rf $ASE_WORKDIR/.app_lock.pid $ASE_WORKDIR/.ase_ready.pid # Start Simulator in background pushd $SCRIPT_DIR/build_ase -make sim & +echo " [DBG] starting ASE simnulator" +nohup make sim & popd # Wait for simulator readiness # When .ase_ready is created in the $ASE_WORKDIR, ASE is ready for simulation -while [! -f $ASE_WORKDIR/.ase_ready.pid] +while [ ! -f $ASE_WORKDIR/.ase_ready.pid ] do sleep 1 done # run application pushd $PROGRAM_DIR +echo " [DBG] running ./$PROGRAM $*" ASE_LOG=0 LD_LIBRARY_PATH=../../opae/ase:$LD_LIBRARY_PATH ./$PROGRAM $* popd \ No newline at end of file diff --git a/hw/opae/vortex_afu.sv b/hw/opae/vortex_afu.sv index 3fa56557..0c4df51b 100644 --- a/hw/opae/vortex_afu.sv +++ b/hw/opae/vortex_afu.sv @@ -28,7 +28,12 @@ module vortex_afu #( output logic [$clog2(NUM_LOCAL_MEM_BANKS)-1:0] mem_bank_select ); -localparam DRAM_ADDR_WIDTH = (32 - `CLOG2(`GLOBAL_BLOCK_SIZE)); +localparam DRAM_ADDR_WIDTH = $bits(t_local_mem_addr); +localparam DRAM_LINE_WIDTH = $bits(t_local_mem_data); +localparam DRAM_TAG_WIDTH = `L3DRAM_TAG_WIDTH; + +`STATIC_ASSERT(DRAM_ADDR_WIDTH == `L3DRAM_ADDR_WIDTH, "invalid vortex dram bus!") +`STATIC_ASSERT(DRAM_LINE_WIDTH == `L3DRAM_LINE_WIDTH, "invalid vortex dram bus!") localparam AVS_RD_QUEUE_SIZE = 16; @@ -58,6 +63,7 @@ typedef enum logic[3:0] { STATE_IDLE, STATE_READ, STATE_WRITE, + STATE_START, STATE_RUN, STATE_CLFLUSH } state_t; @@ -72,13 +78,13 @@ state_t state; logic vx_dram_req_read; logic vx_dram_req_write; logic [DRAM_ADDR_WIDTH-1:0] vx_dram_req_addr; -logic [`GLOBAL_BLOCK_SIZE-1:0] vx_dram_req_data; -logic [`L3DRAM_TAG_WIDTH-1:0] vx_dram_req_tag; +logic [DRAM_LINE_WIDTH-1:0] vx_dram_req_data; +logic [DRAM_TAG_WIDTH-1:0] vx_dram_req_tag; logic vx_dram_req_ready; logic vx_dram_rsp_valid; -logic [`GLOBAL_BLOCK_SIZE-1:0] vx_dram_rsp_data; -logic [`L3DRAM_TAG_WIDTH-1:0] vx_dram_rsp_tag; +logic [DRAM_LINE_WIDTH-1:0] vx_dram_rsp_data; +logic [DRAM_TAG_WIDTH-1:0] vx_dram_rsp_tag; logic vx_dram_rsp_ready; logic vx_snp_req_valid; @@ -90,9 +96,9 @@ logic vx_busy; // AVS Queues ///////////////////////////////////////////////////////////////// logic avs_rtq_push; -t_local_mem_addr avs_rtq_din; +logic [DRAM_TAG_WIDTH-1:0] avs_rtq_din; logic avs_rtq_pop; -t_local_mem_addr avs_rtq_dout; +logic [DRAM_TAG_WIDTH-1:0] avs_rtq_dout; logic avs_rtq_empty; logic avs_rtq_full; @@ -229,7 +235,7 @@ begin CMD_TYPE_RUN: begin $display("%t: STATE START", $time); vx_reset <= 1; - state <= STATE_RUN; + state <= STATE_START; end CMD_TYPE_CLFLUSH: begin $display("%t: STATE CFLUSH: da=%h sz=%0d", $time, csr_mem_addr, csr_data_size); @@ -250,6 +256,10 @@ begin end end + STATE_START: begin // vortex reset cycle + state <= STATE_RUN; + end + STATE_RUN: begin if (!vx_busy) begin state <= STATE_IDLE; @@ -271,7 +281,7 @@ end logic cci_rdq_empty; t_cci_rdq_data cci_rdq_dout; logic cci_rdq_pop; -logic [`L3DRAM_TAG_WIDTH-1:0] dram_req_tag; +logic [DRAM_TAG_WIDTH-1:0] dram_req_tag; t_ccip_clAddr next_avs_address; always_comb @@ -372,7 +382,7 @@ end always_comb begin - vx_dram_rsp_valid = vortex_enabled && !avs_rdq_empty && vx_dram_rsp_ready; + vx_dram_rsp_valid = vortex_enabled && !avs_rdq_empty; vx_dram_rsp_tag = avs_rtq_dout; vx_dram_rsp_data = avs_rdq_dout; end @@ -389,7 +399,7 @@ begin end VX_generic_queue #( - .DATAW($bits(t_local_mem_addr)), + .DATAW(DRAM_TAG_WIDTH), .SIZE(AVS_RD_QUEUE_SIZE) ) avs_rd_req_queue ( .clk (clk), @@ -412,7 +422,7 @@ begin end VX_generic_queue #( - .DATAW($bits(t_local_mem_data)), + .DATAW(DRAM_LINE_WIDTH), .SIZE(AVS_RD_QUEUE_SIZE) ) avs_rd_rsp_queue ( .clk (clk), @@ -595,30 +605,46 @@ end // Vortex binding ///////////////////////////////////////////////////////////// Vortex_Socket #() vx_socket ( - .clk (clk), - .reset (SoftReset || vx_reset), + .clk (clk), + .reset (vx_reset), - // DRAM Req - .dram_req_write (vx_dram_req_write), - .dram_req_read (vx_dram_req_read), - .dram_req_addr (vx_dram_req_addr), - .dram_req_data (vx_dram_req_data), - .dram_req_tag (vx_dram_req_tag), - .dram_req_ready (vx_dram_req_ready), + // DRAM request + .dram_req_write (vx_dram_req_write), + .dram_req_read (vx_dram_req_read), + .dram_req_addr (vx_dram_req_addr), + .dram_req_data (vx_dram_req_data), + .dram_req_tag (vx_dram_req_tag), + .dram_req_ready (vx_dram_req_ready), - // DRAM Rsp - .dram_rsp_valid (vx_dram_rsp_valid), - .dram_rsp_data (vx_dram_rsp_data), - .dram_rsp_tag (vx_dram_rsp_tag), - .dram_rsp_ready (vx_dram_rsp_ready), + // DRAM response + .dram_rsp_valid (vx_dram_rsp_valid), + .dram_rsp_data (vx_dram_rsp_data), + .dram_rsp_tag (vx_dram_rsp_tag), + .dram_rsp_ready (vx_dram_rsp_ready), - // Cache Snooping Req - .snp_req_valid (vx_snp_req_valid), - .snp_req_addr (vx_snp_req_addr), - .snp_req_ready (vx_snp_req_ready), + // Cache snooping + .snp_req_valid (vx_snp_req_valid), + .snp_req_addr (vx_snp_req_addr), + .snp_req_ready (vx_snp_req_ready), + + // I/O request + .io_req_read (), + .io_req_write (), + .io_req_addr (), + .io_req_data (), + .io_req_byteen (), + .io_req_tag (), + .io_req_ready (0), + + // I/O response + .io_rsp_valid (0), + .io_rsp_data (0), + .io_rsp_tag (0), + .io_rsp_ready (), // status - .busy (vx_busy) + .busy (vx_busy), + .ebreak () ); endmodule diff --git a/hw/rtl/VX_back_end.v b/hw/rtl/VX_back_end.v index 8727db3c..5575d6cc 100644 --- a/hw/rtl/VX_back_end.v +++ b/hw/rtl/VX_back_end.v @@ -8,12 +8,8 @@ module VX_back_end #( input wire schedule_delay, - VX_cache_core_rsp_if dcache_rsp_if, VX_cache_core_req_if dcache_req_if, - - output wire mem_delay, - output wire exec_delay, - output wire gpr_stage_delay, + VX_cache_core_rsp_if dcache_rsp_if, VX_jal_rsp_if jal_rsp_if, VX_branch_rsp_if branch_rsp_if, @@ -22,6 +18,10 @@ module VX_back_end #( VX_warp_ctl_if warp_ctl_if, + output wire mem_delay, + output wire exec_delay, + output wire gpr_stage_delay, + output wire ebreak ); @@ -78,8 +78,8 @@ module VX_back_end #( .reset (reset), .lsu_req_if (lsu_req_if), .mem_wb_if (mem_wb_if), - .dcache_rsp_if (dcache_rsp_if), .dcache_req_if (dcache_req_if), + .dcache_rsp_if (dcache_rsp_if), .delay (mem_delay), .no_slot_mem (no_slot_mem) ); diff --git a/hw/rtl/VX_dmem_ctrl.v b/hw/rtl/VX_dmem_ctrl.v index 5c78aa47..8a8a9f4a 100644 --- a/hw/rtl/VX_dmem_ctrl.v +++ b/hw/rtl/VX_dmem_ctrl.v @@ -35,8 +35,11 @@ module VX_dmem_ctrl ( .CORE_TAG_ID_BITS(`CORE_TAG_ID_BITS) ) dcache_core_rsp_qual_if(), smem_core_rsp_if(); + // use "case equality" to handle uninitialized address value + wire smem_select = ((dcache_core_req_if.core_req_addr[0][31:24] == `SHARED_MEM_TOP_ADDR) === 1'b1); + VX_dcache_io_arb dcache_io_arb ( - .io_select (dcache_core_req_if.core_req_addr[0][31:24] == `SHARED_MEM_TOP_ADDR), + .io_select (smem_select), .core_req_if (dcache_core_req_if), .dcache_core_req_if (dcache_core_req_qual_if), .io_core_req_if (smem_core_req_if), diff --git a/hw/rtl/VX_icache_stage.v b/hw/rtl/VX_icache_stage.v index 8613eceb..edd197e4 100644 --- a/hw/rtl/VX_icache_stage.v +++ b/hw/rtl/VX_icache_stage.v @@ -9,19 +9,24 @@ module VX_icache_stage ( output wire[`NUM_THREADS-1:0] icache_stage_valids, VX_inst_meta_if fe_inst_meta_fi, VX_inst_meta_if fe_inst_meta_id, - - VX_cache_core_rsp_if icache_rsp_if, - VX_cache_core_req_if icache_req_if + + VX_cache_core_req_if icache_req_if, + VX_cache_core_rsp_if icache_rsp_if ); - reg[`NUM_THREADS-1:0] pending_threads[`NUM_WARPS-1:0]; + reg [`NUM_THREADS-1:0] valid_threads [`NUM_WARPS-1:0]; wire valid_inst = (| fe_inst_meta_fi.valid); +`DEBUG_BEGIN + wire [`CORE_REQ_TAG_WIDTH-1:0] core_req_tag = icache_req_if.core_req_tag; + wire [`CORE_REQ_TAG_WIDTH-1:0] core_rsp_tag = icache_rsp_if.core_rsp_tag; +`DEBUG_END + // Icache Request assign icache_req_if.core_req_valid = valid_inst && !total_freeze; assign icache_req_if.core_req_addr = fe_inst_meta_fi.inst_pc; - assign icache_req_if.core_req_data = 32'b0; + assign icache_req_if.core_req_data = 'z; assign icache_req_if.core_req_read = `BYTE_EN_LW; assign icache_req_if.core_req_write = `BYTE_EN_NO; assign icache_req_if.core_req_tag = {fe_inst_meta_fi.inst_pc, 2'b1, 5'b0, fe_inst_meta_fi.warp_num}; @@ -33,8 +38,8 @@ module VX_icache_stage ( assign {fe_inst_meta_id.inst_pc, rsp_wb, rsp_rd, fe_inst_meta_id.warp_num} = icache_rsp_if.core_rsp_tag; - assign fe_inst_meta_id.instruction = icache_rsp_if.core_rsp_data[0][31:0]; - assign fe_inst_meta_id.valid = icache_rsp_if.core_rsp_valid ? pending_threads[fe_inst_meta_id.warp_num] : 0; + assign fe_inst_meta_id.instruction = icache_rsp_if.core_rsp_data[0]; + assign fe_inst_meta_id.valid = icache_rsp_if.core_rsp_valid ? valid_threads[fe_inst_meta_id.warp_num] : 0; assign icache_stage_wid = fe_inst_meta_id.warp_num; assign icache_stage_valids = fe_inst_meta_id.valid & {`NUM_THREADS{!icache_stage_delay}}; @@ -49,12 +54,10 @@ module VX_icache_stage ( always @(posedge clk) begin if (reset) begin - for (i = 0; i < `NUM_WARPS; i = i + 1) begin - pending_threads[i] <= 0; - end + //-- end else begin if (icache_req_if.core_req_valid && icache_req_if.core_req_ready) begin - pending_threads[fe_inst_meta_fi.warp_num] <= fe_inst_meta_fi.valid; + valid_threads[fe_inst_meta_fi.warp_num] <= fe_inst_meta_fi.valid; end end end diff --git a/hw/rtl/VX_lsu_unit.v b/hw/rtl/VX_lsu_unit.v index f7e1dfd6..fcf0d928 100644 --- a/hw/rtl/VX_lsu_unit.v +++ b/hw/rtl/VX_lsu_unit.v @@ -11,8 +11,8 @@ module VX_lsu_unit ( VX_wb_if mem_wb_if, // Dcache interface - VX_cache_core_rsp_if dcache_rsp_if, VX_cache_core_req_if dcache_req_if, + VX_cache_core_rsp_if dcache_rsp_if, output wire delay ); diff --git a/hw/rtl/Vortex.v b/hw/rtl/Vortex.v index ba9c1de7..7b0ba93e 100644 --- a/hw/rtl/Vortex.v +++ b/hw/rtl/Vortex.v @@ -244,8 +244,11 @@ module Vortex #( .icache_dram_rsp_if (icache_dram_rsp_if) ); + // use "case equality" to handle uninitialized address value + wire io_select = ((dcache_io_core_req_if.core_req_addr[0] >= `IO_BUS_BASE_ADDR) === 1'b1); + VX_dcache_io_arb dcache_io_arb ( - .io_select (dcache_io_core_req_if.core_req_addr[0] >= `IO_BUS_BASE_ADDR), + .io_select (io_select), .core_req_if (dcache_io_core_req_if), .dcache_core_req_if (dcache_core_req_if), .io_core_req_if (io_core_req_if), diff --git a/hw/rtl/cache/VX_tag_data_access.v b/hw/rtl/cache/VX_tag_data_access.v index d76a7be8..3d216373 100644 --- a/hw/rtl/cache/VX_tag_data_access.v +++ b/hw/rtl/cache/VX_tag_data_access.v @@ -90,13 +90,11 @@ module VX_tag_data_access #( wire fill_sent; wire invalidate_line; + wire tags_match; wire real_writefill = writefill_st1e - && ((valid_req_st1e - && !use_read_valid_st1e) - || (valid_req_st1e - && use_read_valid_st1e - && (writeaddr_st1e[`TAG_LINE_ADDR_RNG] != use_read_tag_st1e))); + && ((valid_req_st1e && !use_read_valid_st1e) + || (valid_req_st1e && use_read_valid_st1e && !tags_match)); VX_tag_data_structure #( .CACHE_SIZE (CACHE_SIZE), @@ -256,14 +254,14 @@ module VX_tag_data_access #( assign data_write[i * `WORD_WIDTH +: `WORD_WIDTH] = force_write ? writedata_st1e[i * `WORD_WIDTH +: `WORD_WIDTH] : use_write_dat; end + end assign use_write_enable = (writefill_st1e && !real_writefill) ? 0 : we; assign use_write_data = data_write; - wire[`TAG_SELECT_BITS-1:0] writeaddr_tag = writeaddr_st1e[`TAG_LINE_ADDR_RNG]; - - wire tags_match = writeaddr_tag == use_read_tag_st1e; + // use "case equality" to handle uninitialized tag when block entry is not valid + assign tags_match = ((writeaddr_st1e[`TAG_LINE_ADDR_RNG] == use_read_tag_st1e) === 1'b1); wire snoop_hit = valid_req_st1e && is_snp_st1e && use_read_valid_st1e && tags_match && use_read_dirty_st1e; wire req_invalid = valid_req_st1e && !is_snp_st1e && !use_read_valid_st1e && !writefill_st1e;