From 0b355f228e4d1e881e3f3ad3bb8a0de42d283d4c Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sat, 22 Aug 2020 00:22:04 -0700 Subject: [PATCH] ibuffer addition --- driver/common/vx_utils.cpp | 22 ++ driver/include/vortex.h | 3 + driver/opae/vortex.cpp | 18 +- driver/rtlsim/vortex.cpp | 3 +- driver/tests/basic/basic.cpp | 6 +- driver/tests/demo/demo.cpp | 2 +- driver/tests/dogfood/testcases.h | 38 +-- hw/opae/README | 3 + hw/rtl/VX_alu_unit.v | 137 ++++++---- hw/rtl/VX_bru_unit.v | 56 ----- hw/rtl/VX_commit.v | 62 ++--- hw/rtl/VX_config.vh | 32 ++- hw/rtl/VX_csr_arb.v | 43 ++-- hw/rtl/VX_csr_data.v | 8 +- hw/rtl/VX_csr_unit.v | 25 +- hw/rtl/VX_decode.v | 167 ++++++------- hw/rtl/VX_define.vh | 75 +++--- hw/rtl/VX_execute.v | 30 +-- hw/rtl/VX_fpu_unit.v | 140 +++++++---- hw/rtl/VX_gpr_bypass.v | 53 ++++ hw/rtl/VX_gpr_fp_ctrl.v | 15 +- hw/rtl/VX_gpr_stage.v | 16 +- hw/rtl/VX_gpu_unit.v | 40 +-- hw/rtl/VX_ibuffer.v | 187 ++++++++++++++ hw/rtl/VX_icache_stage.v | 2 +- hw/rtl/VX_instr_demux.v | 233 ++++++++++++++++++ hw/rtl/VX_issue.v | 140 +++++------ hw/rtl/VX_issue_demux.v | 102 -------- hw/rtl/VX_lsu_unit.v | 182 +++++++------- hw/rtl/VX_mul_unit.v | 124 ++++++---- hw/rtl/VX_pipeline.v | 14 +- hw/rtl/VX_platform.vh | 4 + hw/rtl/VX_print_instr.vh | 67 ++--- hw/rtl/VX_scoreboard.v | 72 +++--- hw/rtl/VX_types.vh | 10 +- hw/rtl/VX_warp_sched.v | 189 +++++++------- hw/rtl/VX_writeback.v | 195 +++++---------- hw/rtl/cache/VX_bank.v | 24 +- hw/rtl/cache/VX_cache.v | 3 +- hw/rtl/cache/VX_cache_miss_resrv.v | 2 +- hw/rtl/cache/VX_snp_forwarder.v | 9 +- hw/rtl/cache/VX_tag_data_access.v | 9 +- hw/rtl/fp_cores/VX_fp_fpga.v | 91 ++++--- hw/rtl/fp_cores/VX_fp_noncomp.v | 55 +++-- hw/rtl/fp_cores/VX_fpnew.v | 16 +- hw/rtl/fp_cores/altera/VX_fp_add.v | 19 +- hw/rtl/fp_cores/altera/VX_fp_div.v | 19 +- hw/rtl/fp_cores/altera/VX_fp_ftoi.v | 17 +- hw/rtl/fp_cores/altera/VX_fp_ftou.v | 17 +- hw/rtl/fp_cores/altera/VX_fp_itof.v | 17 +- hw/rtl/fp_cores/altera/VX_fp_madd.v | 27 +- 
hw/rtl/fp_cores/altera/VX_fp_msub.v | 27 +- hw/rtl/fp_cores/altera/VX_fp_mul.v | 19 +- hw/rtl/fp_cores/altera/VX_fp_sqrt.v | 17 +- hw/rtl/fp_cores/altera/VX_fp_sub.v | 19 +- hw/rtl/fp_cores/altera/VX_fp_utof.v | 17 +- hw/rtl/interfaces/VX_alu_req_if.v | 17 +- hw/rtl/interfaces/VX_bru_req_if.v | 29 --- hw/rtl/interfaces/VX_cmt_to_issue_if.v | 36 --- hw/rtl/interfaces/VX_csr_req_if.v | 11 +- hw/rtl/interfaces/VX_csr_rsp_if.v | 15 -- ...X_csr_to_fpu_if.v => VX_csr_to_issue_if.v} | 6 +- hw/rtl/interfaces/VX_decode_if.v | 15 +- hw/rtl/interfaces/VX_exu_to_cmt_if.v | 11 +- hw/rtl/interfaces/VX_fpu_req_if.v | 10 +- hw/rtl/interfaces/VX_fpu_to_cmt_if.v | 15 +- hw/rtl/interfaces/VX_gpr_read_if.v | 3 +- hw/rtl/interfaces/VX_gpu_req_if.v | 6 +- hw/rtl/interfaces/VX_issue_if.v | 39 --- hw/rtl/interfaces/VX_lsu_req_if.v | 4 +- hw/rtl/interfaces/VX_mul_req_if.v | 7 +- hw/rtl/interfaces/VX_warp_ctl_if.v | 1 + .../{VX_wb_if.v => VX_writeback_if.v} | 8 +- hw/rtl/libs/VX_cam_buffer.v | 29 ++- hw/rtl/libs/VX_elastic_buffer.v | 64 ++--- hw/rtl/libs/VX_generic_queue.v | 1 - hw/rtl/libs/VX_rr_arbiter.v | 4 +- hw/rtl/libs/VX_skid_buffer.v | 65 +++++ hw/simulate/testbench.cpp | 2 +- hw/syn/quartus/project.sdc | 2 +- 80 files changed, 1811 insertions(+), 1528 deletions(-) delete mode 100644 hw/rtl/VX_bru_unit.v create mode 100644 hw/rtl/VX_gpr_bypass.v create mode 100644 hw/rtl/VX_ibuffer.v create mode 100644 hw/rtl/VX_instr_demux.v delete mode 100644 hw/rtl/VX_issue_demux.v delete mode 100644 hw/rtl/interfaces/VX_bru_req_if.v delete mode 100644 hw/rtl/interfaces/VX_cmt_to_issue_if.v delete mode 100644 hw/rtl/interfaces/VX_csr_rsp_if.v rename hw/rtl/interfaces/{VX_csr_to_fpu_if.v => VX_csr_to_issue_if.v} (64%) delete mode 100644 hw/rtl/interfaces/VX_issue_if.v rename hw/rtl/interfaces/{VX_wb_if.v => VX_writeback_if.v} (74%) create mode 100644 hw/rtl/libs/VX_skid_buffer.v diff --git a/driver/common/vx_utils.cpp b/driver/common/vx_utils.cpp index d7b8f829..eb3e93b1 100644 --- 
a/driver/common/vx_utils.cpp +++ b/driver/common/vx_utils.cpp @@ -89,4 +89,26 @@ extern int vx_upload_kernel_file(vx_device_h device, const char* filename) { delete[] content; return err; +} + +extern int vx_get_perf(vx_device_h device, uint64_t* cycles, uint64_t* instrs) { + int ret = 0; + + unsigned value = 0; + // each 64-bit counter is assembled from two 32-bit CSR reads (high half first); a NULL pointer skips that counter + if (cycles) { + ret |= vx_csr_get(device, 0, CSR_CYCLE_H, &value); + *cycles = value; + ret |= vx_csr_get(device, 0, CSR_CYCLE, &value); + *cycles = (*cycles << 32) | value; + } + + if (instrs) { + ret |= vx_csr_get(device, 0, CSR_INSTRET_H, &value); + *instrs = value; + ret |= vx_csr_get(device, 0, CSR_INSTRET, &value); + *instrs = (*instrs << 32) | value; + } + + return ret; } \ No newline at end of file diff --git a/driver/include/vortex.h b/driver/include/vortex.h index a9597253..7c331800 100644 --- a/driver/include/vortex.h +++ b/driver/include/vortex.h @@ -71,6 +71,9 @@ int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_t size) // upload kernel file to device int vx_upload_kernel_file(vx_device_h device, const char* filename); +// get performance counters +int vx_get_perf(vx_device_h device, uint64_t* cycles, uint64_t* instrs); + #ifdef __cplusplus } #endif diff --git a/driver/opae/vortex.cpp b/driver/opae/vortex.cpp index 02955678..c8bf410b 100755 --- a/driver/opae/vortex.cpp +++ b/driver/opae/vortex.cpp @@ -212,25 +212,11 @@ extern int vx_dev_close(vx_device_h hdevice) { #endif { - // Dump performance stats + // Dump perf stats uint64_t instrs, cycles; - unsigned value; - - int ret = 0; - ret |= vx_csr_get(hdevice, 0, CSR_INSTRET_H, &value); - instrs = value; - ret |= vx_csr_get(hdevice, 0, CSR_INSTRET, &value); - instrs = (instrs << 32) | value; - - ret |= vx_csr_get(hdevice, 0, CSR_CYCLE_H, &value); - cycles = value; - ret |= vx_csr_get(hdevice, 0, CSR_CYCLE, &value); - cycles = (cycles << 32) | value; - + int ret = vx_get_perf(hdevice, &cycles, &instrs); float IPC = (float)(double(instrs) / double(cycles)); - fprintf(stdout, 
"PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC); - assert(ret == 0); } diff --git a/driver/rtlsim/vortex.cpp b/driver/rtlsim/vortex.cpp index 51c3bb8c..b01eea34 100644 --- a/driver/rtlsim/vortex.cpp +++ b/driver/rtlsim/vortex.cpp @@ -68,7 +68,8 @@ public: simulator_.attach_ram(&ram_); } - ~vx_device() { + ~vx_device() { + simulator_.print_stats(std::cout); if (future_.valid()) { future_.wait(); } diff --git a/driver/tests/basic/basic.cpp b/driver/tests/basic/basic.cpp index ee16533d..69ba7d12 100755 --- a/driver/tests/basic/basic.cpp +++ b/driver/tests/basic/basic.cpp @@ -155,7 +155,7 @@ int run_kernel_test(const kernel_arg_t& kernel_arg, int32_t curr = ((int32_t*)vx_host_ptr(buffer))[i]; int32_t ref = i; if (curr != ref) { - std::cout << "error at value " << i + std::cout << "error at result #" << i << ": actual 0x" << curr << ", expected 0x" << ref << std::endl; ++errors; } @@ -238,7 +238,7 @@ int main(int argc, char *argv[]) { std::cout << "cleanup" << std::endl; cleanup(); - std::cout << "Test PASSED" << std::endl; - + std::cout << "Test PASSED" << std::endl; + return 0; } diff --git a/driver/tests/demo/demo.cpp b/driver/tests/demo/demo.cpp index 10d0b8ae..1a2a62af 100644 --- a/driver/tests/demo/demo.cpp +++ b/driver/tests/demo/demo.cpp @@ -86,7 +86,7 @@ int run_test(const kernel_arg_t& kernel_arg, int ref = i + i; int cur = buf_ptr[i]; if (cur != ref) { - std::cout << "error at value " << i + std::cout << "error at result #" << i << ": actual 0x" << cur << ", expected 0x" << ref << std::endl; ++errors; } diff --git a/driver/tests/dogfood/testcases.h b/driver/tests/dogfood/testcases.h index 311efc20..fdbb727b 100644 --- a/driver/tests/dogfood/testcases.h +++ b/driver/tests/dogfood/testcases.h @@ -57,7 +57,7 @@ public: for (int i = 0; i < n; ++i) { auto ref = a[i] + b[i]; if (c[i] != ref) { - std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at 
result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -85,7 +85,7 @@ public: for (int i = 0; i < n; ++i) { auto ref = a[i] * b[i]; if (c[i] != ref) { - std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -113,7 +113,7 @@ public: for (int i = 0; i < n; ++i) { auto ref = a[i] / b[i]; if (c[i] != ref) { - std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -143,7 +143,7 @@ public: auto y = a[i] * b[i]; auto ref = x + y; if (c[i] != ref) { - std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -171,7 +171,7 @@ public: for (int i = 0; i < n; ++i) { auto ref = a[i] + b[i]; if (!almost_equal(c[i], ref)) { - std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -199,7 +199,7 @@ public: for (int i = 0; i < n; ++i) { auto ref = a[i] - b[i]; if (!almost_equal(c[i], ref)) { - std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << i << ": expected " << ref << ", actual " 
<< c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -227,7 +227,7 @@ public: for (int i = 0; i < n; ++i) { auto ref = a[i] * b[i]; if (!almost_equal(c[i], ref)) { - std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -255,7 +255,7 @@ public: for (int i = 0; i < n; ++i) { auto ref = a[i] * b[i] + 0.5f; if (!almost_equal(c[i], ref)) { - std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -283,7 +283,7 @@ public: for (int i = 0; i < n; ++i) { auto ref = a[i] * b[i] - 0.5f; if (!almost_equal(c[i], ref)) { - std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -311,7 +311,7 @@ public: for (int i = 0; i < n; ++i) { auto ref = -a[i] * b[i] - 0.5f; if (!almost_equal(c[i], ref)) { - std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -339,7 +339,7 @@ public: for (int i = 0; i < n; ++i) { auto ref = -a[i] * b[i] + 0.5f; if (!almost_equal(c[i], ref)) { - std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << i << ": 
expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -369,7 +369,7 @@ public: auto y = a[i] * b[i] + 0.5f; auto ref = x + y; if (!almost_equal(c[i], ref)) { - std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -397,7 +397,7 @@ public: for (int i = 0; i < n; ++i) { auto ref = a[i] / b[i]; if (!almost_equal(c[i], ref)) { - std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -427,7 +427,7 @@ public: auto y = b[i] / a[i]; auto ref = x + y; if (!almost_equal(c[i], ref)) { - std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -456,7 +456,7 @@ public: for (int i = 0; i < n; ++i) { auto ref = sqrt(a[i] * b[i]); if (!almost_equal(c[i], ref)) { - std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -485,7 +485,7 @@ public: auto x = a[i] + b[i]; auto ref = (int32_t)x; if (c[i] != ref) { - std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << i << ": expected " << ref << ", actual " << 
c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -514,7 +514,7 @@ public: auto x = a[i] + b[i]; auto ref = (uint32_t)x; if (c[i] != ref) { - std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -543,7 +543,7 @@ public: auto x = a[i] + b[i]; auto ref = (float)x; if (!almost_equal(c[i], ref)) { - std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } @@ -572,7 +572,7 @@ public: auto x = a[i] + b[i]; auto ref = (float)x; if (!almost_equal(c[i], ref)) { - std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; + std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } } diff --git a/hw/opae/README b/hw/opae/README index 1b916d3c..cb1215db 100644 --- a/hw/opae/README +++ b/hw/opae/README @@ -75,6 +75,9 @@ tar -zcvf output_files_1c.tar.gz `find ./build_fpga_1c -type f \( -iname \*.rpt # compress VCD trace tar -zcvf vortex.vcd.tar.gz ./build_ase_1c/work/vortex.vcd +tar -zcvf trace.vcd.tar.gz obj_dir/trace.vcd +tar -zcvf trace.vcd.tar.gz trace.vcd +tar -zcvf run.log.tar.gz run.log # decompress VCD trace tar -zxvf /mnt/c/Users/Blaise/Downloads/vortex.vcd.tar.gz diff --git a/hw/rtl/VX_alu_unit.v b/hw/rtl/VX_alu_unit.v index bd3e478d..eea4cdbe 100644 --- a/hw/rtl/VX_alu_unit.v +++ b/hw/rtl/VX_alu_unit.v @@ -3,90 +3,143 @@ module VX_alu_unit #( parameter CORE_ID = 0 ) ( - input wire clk, - input wire reset, + input wire clk, + input 
wire reset, // Inputs - VX_alu_req_if alu_req_if, + VX_alu_req_if alu_req_if, // Outputs - VX_exu_to_cmt_if alu_commit_if + VX_branch_ctl_if branch_ctl_if, + VX_exu_to_cmt_if alu_commit_if ); - reg [`NUM_THREADS-1:0][31:0] alu_result; - - wire [`NUM_THREADS-1:0][31:0] addsub_result; - wire [`NUM_THREADS-1:0] less_result; - wire [`NUM_THREADS-1:0][31:0] shift_result; - reg [`NUM_THREADS-1:0][31:0] misc_result; + reg [`NUM_THREADS-1:0][31:0] alu_result; + reg [`NUM_THREADS-1:0][31:0] add_result; + reg [`NUM_THREADS-1:0][32:0] sub_result; + reg [`NUM_THREADS-1:0][31:0] shift_result; + reg [`NUM_THREADS-1:0][31:0] misc_result; + + wire valid_r; + wire [`NW_BITS-1:0] wid_r; + wire [`NUM_THREADS-1:0] thread_mask_r; + wire [31:0] curr_PC_r; + wire [`NR_BITS-1:0] rd_r; + wire wb_r; + wire [`NT_BITS-1:0] tid_r; + wire is_sub_r; + wire [`BR_BITS-1:0] br_op_r; + wire is_br_op_r, is_br_op_s; + wire [1:0] alu_op_class_r; + wire [31:0] next_PC_r; + + wire is_br_op = `IS_BR_OP(alu_req_if.op); + wire [`ALU_BITS-1:0] alu_op = `ALU_OP(alu_req_if.op); + wire [`BR_BITS-1:0] br_op = `BR_OP(alu_req_if.op); + wire alu_signed = `ALU_SIGNED(alu_op); + wire [1:0] alu_op_class = `ALU_OP_CLASS(alu_op); + wire is_sub = (alu_op == `ALU_SUB); - wire [`ALU_BITS-1:0] alu_op = `ALU_OP(alu_req_if.op); wire [`NUM_THREADS-1:0][31:0] alu_in1 = alu_req_if.rs1_data; wire [`NUM_THREADS-1:0][31:0] alu_in2 = alu_req_if.rs2_data; - wire [`NUM_THREADS-1:0][31:0] alu_in1_PC = alu_req_if.rs1_is_PC ? {`NUM_THREADS{alu_req_if.curr_PC}} : alu_in1; - wire [`NUM_THREADS-1:0][31:0] alu_in2_imm = alu_req_if.rs2_is_imm ? {`NUM_THREADS{alu_req_if.imm}} : alu_in2; - - wire negate_add = (alu_op == `ALU_SUB); - wire signed_less = (alu_op == `ALU_SLT); - wire signed_shift = (alu_op == `ALU_SRA); + wire [`NUM_THREADS-1:0][31:0] alu_in1_PC = alu_req_if.rs1_is_PC ? {`NUM_THREADS{alu_req_if.curr_PC}} : alu_in1; + wire [`NUM_THREADS-1:0][31:0] alu_in2_imm = alu_req_if.rs2_is_imm ? 
{`NUM_THREADS{alu_req_if.imm}} : alu_in2; + wire [`NUM_THREADS-1:0][31:0] alu_in2_less = (alu_req_if.rs2_is_imm && ~is_br_op) ? {`NUM_THREADS{alu_req_if.imm}} : alu_in2; for (genvar i = 0; i < `NUM_THREADS; i++) begin - wire [32:0] addsub_in1 = {alu_in1_PC[i], 1'b1}; - wire [32:0] addsub_in2 = {alu_in2_imm[i], 1'b0} ^ {33{negate_add}}; - `IGNORE_WARNINGS_BEGIN - wire [32:0] addsub_addd = addsub_in1 + addsub_in2; - `IGNORE_WARNINGS_END - assign addsub_result[i] = addsub_addd[32:1]; + always @(posedge clk) begin + if (alu_req_if.ready) add_result[i] <= alu_in1_PC[i] + alu_in2_imm[i]; // same enable as alu_shift_reg: hold data while stalled + end end for (genvar i = 0; i < `NUM_THREADS; i++) begin - wire [32:0] less_in1 = {signed_less & alu_in1[i][31], alu_in1[i]}; - wire [32:0] less_in2 = {signed_less & alu_in2_imm[i][31], alu_in2_imm[i]}; - assign less_result[i] = $signed(less_in1) < $signed(less_in2); + wire [32:0] sub_in1 = {alu_signed & alu_in1[i][31], alu_in1[i]}; + wire [32:0] sub_in2 = {alu_signed & alu_in2_less[i][31], alu_in2_less[i]}; + always @(posedge clk) begin + if (alu_req_if.ready) sub_result[i] <= $signed(sub_in1) - $signed(sub_in2); + end end for (genvar i = 0; i < `NUM_THREADS; i++) begin - wire [32:0] shift_in1 = {signed_shift & alu_in1[i][31], alu_in1[i]}; + wire [32:0] shift_in1 = {alu_signed & alu_in1[i][31], alu_in1[i]}; `IGNORE_WARNINGS_BEGIN wire [32:0] shift_value = $signed(shift_in1) >>> alu_in2_imm[i][4:0]; `IGNORE_WARNINGS_END - assign shift_result[i] = shift_value[31:0]; + always @(posedge clk) begin + if (alu_req_if.ready) shift_result[i] <= shift_value[31:0]; + end end for (genvar i = 0; i < `NUM_THREADS; i++) begin - always @(*) begin + always @(posedge clk) if (alu_req_if.ready) begin case (alu_op) - `ALU_AND: misc_result[i] = alu_in1[i] & alu_in2_imm[i]; - `ALU_OR: misc_result[i] = alu_in1[i] | alu_in2_imm[i]; - `ALU_XOR: misc_result[i] = alu_in1[i] ^ alu_in2_imm[i]; + `ALU_AND: misc_result[i] <= alu_in1[i] & alu_in2_imm[i]; + `ALU_OR: misc_result[i] <= alu_in1[i] | alu_in2_imm[i]; + `ALU_XOR: misc_result[i] <= alu_in1[i] ^ alu_in2_imm[i]; //`ALU_SLL, - default: 
misc_result[i] = alu_in1[i] << alu_in2_imm[i][4:0]; + default: misc_result[i] <= alu_in1[i] << alu_in2_imm[i][4:0]; endcase end end + // link address for JAL/JALR; declared as a net so it follows curr_PC continuously + wire [31:0] next_PC = alu_req_if.curr_PC + 4; + + VX_shift_register #( + .DATAW(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + `NT_BITS + 1 + 1 + `BR_BITS + 2 + 32), + .DEPTH(1) + ) alu_shift_reg ( + .clk(clk), + .reset(reset), + .enable(alu_req_if.ready), + .in({alu_req_if.valid, alu_req_if.wid, alu_req_if.thread_mask, alu_req_if.curr_PC, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, is_sub, is_br_op, br_op, alu_op_class, next_PC}), + .out({valid_r, wid_r, thread_mask_r, curr_PC_r, rd_r, wb_r, tid_r, is_sub_r, is_br_op_r, br_op_r, alu_op_class_r, next_PC_r}) + ); for (genvar i = 0; i < `NUM_THREADS; i++) begin always @(*) begin - case (`ALU_OP_CLASS(alu_op)) - 0: alu_result[i] = addsub_result[i]; - 1: alu_result[i] = {31'b0, less_result[i]}; + case (alu_op_class_r) + 0: alu_result[i] = is_sub_r ? sub_result[i][31:0] : add_result[i]; + 1: alu_result[i] = {31'b0, sub_result[i][32]}; 2: alu_result[i] = shift_result[i]; default: alu_result[i] = misc_result[i]; endcase end - end + end + + // branch handling + + wire br_neg = `BR_NEG(br_op_r); + wire br_less = `BR_LESS(br_op_r); + wire br_static = `BR_STATIC(br_op_r); + wire is_jal = is_br_op_r && (br_op_r == `BR_JAL || br_op_r == `BR_JALR); + + wire [31:0] br_dest = add_result[tid_r]; + wire [32:0] cmp_result = sub_result[tid_r]; + wire is_less = cmp_result[32]; + wire is_equal = ~(| cmp_result[31:0]); + wire br_taken = ((br_less ? is_less : is_equal) ^ br_neg) | br_static; + + wire [`NUM_THREADS-1:0][31:0] alu_jal_result = is_jal ? 
{`NUM_THREADS{next_PC_r}} : alu_result; + + // output + + wire stall_out = ~alu_commit_if.ready && alu_commit_if.valid; VX_generic_register #( - .N(1 + `ISTAG_BITS + (`NUM_THREADS * 32)) + .N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + 1 + 32) ) alu_reg ( .clk (clk), .reset (reset), - .stall (0), + .stall (stall_out), .flush (0), - .in ({alu_req_if.valid, alu_req_if.issue_tag, alu_result}), - .out ({alu_commit_if.valid, alu_commit_if.issue_tag, alu_commit_if.data}) + .in ({valid_r, wid_r, thread_mask_r, curr_PC_r, rd_r, wb_r, alu_jal_result, is_br_op_r, br_taken, br_dest}), + .out ({alu_commit_if.valid, alu_commit_if.wid, alu_commit_if.thread_mask, alu_commit_if.curr_PC, alu_commit_if.rd, alu_commit_if.wb, alu_commit_if.data, is_br_op_s, branch_ctl_if.taken, branch_ctl_if.dest}) ); - assign alu_req_if.ready = 1'b1; + assign branch_ctl_if.valid = alu_commit_if.valid && alu_commit_if.ready && is_br_op_s; + assign branch_ctl_if.wid = alu_commit_if.wid; + + // can accept new request? 
+ assign alu_req_if.ready = ~stall_out; endmodule \ No newline at end of file diff --git a/hw/rtl/VX_bru_unit.v b/hw/rtl/VX_bru_unit.v deleted file mode 100644 index 4d6bbde5..00000000 --- a/hw/rtl/VX_bru_unit.v +++ /dev/null @@ -1,56 +0,0 @@ -`include "VX_define.vh" - -module VX_bru_unit #( - parameter CORE_ID = 0 -) ( - input wire clk, - input wire reset, - - // Inputs - VX_bru_req_if bru_req_if, - - // Outputs - VX_branch_ctl_if branch_ctl_if, - VX_exu_to_cmt_if bru_commit_if -); - wire [`BRU_BITS-1:0] bru_op = bru_req_if.op; - wire bru_neg = `BRU_NEG(bru_op); - wire bru_less = `BRU_LESS(bru_op); - wire bru_signed = `BRU_SIGNED(bru_op); - wire bru_static = `BRU_STATIC(bru_op); - - wire [31:0] rs1_data = bru_req_if.rs1_data; - wire [31:0] rs2_data = bru_req_if.rs2_data; - - wire [32:0] signed_in1 = {bru_signed & rs1_data[31], rs1_data}; - wire [32:0] signed_in2 = {bru_signed & rs2_data[31], rs2_data}; - wire is_less = $signed(signed_in1) < $signed(signed_in2); - - wire is_equal = (rs1_data == rs2_data); - - wire taken = ((bru_less ? is_less : is_equal) ^ bru_neg) | bru_static; - - wire [31:0] base_addr = bru_req_if.rs1_is_PC ? 
bru_req_if.curr_PC : rs1_data; - wire [31:0] dest = base_addr + bru_req_if.offset; - - wire [31:0] jal_result = bru_req_if.curr_PC + 4; - wire [31:0] jal_result_r; - - VX_generic_register #( - .N(1 + `NW_BITS + `ISTAG_BITS + 1 + 32 + 32) - ) bru_reg ( - .clk (clk), - .reset (reset), - .stall (0), - .flush (0), - .in ({bru_req_if.valid, bru_req_if.wid, bru_req_if.issue_tag, taken, dest, jal_result}), - .out ({bru_commit_if.valid, branch_ctl_if.wid, bru_commit_if.issue_tag, branch_ctl_if.taken, branch_ctl_if.dest, jal_result_r}) - ); - - assign branch_ctl_if.valid = bru_commit_if.valid; - - assign bru_commit_if.data = {`NUM_THREADS{jal_result_r}}; - - assign bru_req_if.ready = 1'b1; - -endmodule \ No newline at end of file diff --git a/hw/rtl/VX_commit.v b/hw/rtl/VX_commit.v index f6f6c594..7134fd61 100644 --- a/hw/rtl/VX_commit.v +++ b/hw/rtl/VX_commit.v @@ -8,7 +8,6 @@ module VX_commit #( // inputs VX_exu_to_cmt_if alu_commit_if, - VX_exu_to_cmt_if bru_commit_if, VX_exu_to_cmt_if lsu_commit_if, VX_exu_to_cmt_if mul_commit_if, VX_exu_to_cmt_if csr_commit_if, @@ -16,15 +15,13 @@ module VX_commit #( VX_exu_to_cmt_if gpu_commit_if, // outputs - VX_cmt_to_issue_if cmt_to_issue_if, - VX_wb_if writeback_if, + VX_writeback_if writeback_if, VX_cmt_to_csr_if cmt_to_csr_if ); - // update CRSs + // CSRs update wire [`NUM_EXS-1:0] commited_mask; - assign commited_mask = {alu_commit_if.valid, - bru_commit_if.valid, + assign commited_mask = {alu_commit_if.valid, lsu_commit_if.valid, csr_commit_if.valid, mul_commit_if.valid, @@ -44,7 +41,7 @@ module VX_commit #( always @(*) begin fflags = 0; for (integer i = 0; i < `NUM_THREADS; i++) begin - if (cmt_to_issue_if.fpu_data.thread_mask[i]) begin + if (fpu_commit_if.thread_mask[i]) begin fflags.NX |= fpu_commit_if.fflags[i].NX; fflags.UF |= fpu_commit_if.fflags[i].UF; fflags.OF |= fpu_commit_if.fflags[i].OF; @@ -64,7 +61,7 @@ module VX_commit #( csr_update_r <= (| commited_mask); fflags_r <= fflags; has_fflags_r <= fpu_commit_if.valid 
&& fpu_commit_if.has_fflags; - wid_r <= cmt_to_issue_if.fpu_data.wid; + wid_r <= fpu_commit_if.wid; num_commits_r <= num_commits; end @@ -74,23 +71,7 @@ module VX_commit #( assign cmt_to_csr_if.has_fflags = has_fflags_r; assign cmt_to_csr_if.fflags = fflags_r; - // Notify issue stage - - assign cmt_to_issue_if.alu_valid = alu_commit_if.valid; - assign cmt_to_issue_if.bru_valid = bru_commit_if.valid; - assign cmt_to_issue_if.lsu_valid = lsu_commit_if.valid; - assign cmt_to_issue_if.csr_valid = csr_commit_if.valid; - assign cmt_to_issue_if.mul_valid = mul_commit_if.valid; - assign cmt_to_issue_if.fpu_valid = fpu_commit_if.valid; - assign cmt_to_issue_if.gpu_valid = gpu_commit_if.valid; - - assign cmt_to_issue_if.alu_tag = alu_commit_if.issue_tag; - assign cmt_to_issue_if.bru_tag = bru_commit_if.issue_tag; - assign cmt_to_issue_if.lsu_tag = lsu_commit_if.issue_tag; - assign cmt_to_issue_if.csr_tag = csr_commit_if.issue_tag; - assign cmt_to_issue_if.mul_tag = mul_commit_if.issue_tag; - assign cmt_to_issue_if.fpu_tag = fpu_commit_if.issue_tag; - assign cmt_to_issue_if.gpu_tag = gpu_commit_if.issue_tag; + // Writeback VX_writeback #( .CORE_ID(CORE_ID) @@ -99,41 +80,38 @@ module VX_commit #( .reset (reset), .alu_commit_if (alu_commit_if), - .bru_commit_if (bru_commit_if), .lsu_commit_if (lsu_commit_if), .csr_commit_if (csr_commit_if), .mul_commit_if (mul_commit_if), .fpu_commit_if (fpu_commit_if), .gpu_commit_if (gpu_commit_if), - .cmt_to_issue_if(cmt_to_issue_if), .writeback_if (writeback_if) ); `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin - if (alu_commit_if.valid) begin - $display("%t: Core%0d-commit: wid=%0d, PC=%0h, ex=ALU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.alu_data.wid, cmt_to_issue_if.alu_data.curr_PC, alu_commit_if.issue_tag, cmt_to_issue_if.alu_data.thread_mask, cmt_to_issue_if.alu_data.wb, cmt_to_issue_if.alu_data.rd, alu_commit_if.data); + if (alu_commit_if.valid && alu_commit_if.ready) begin + 
$display("%t: core%0d-commit: wid=%0d, PC=%0h, ex=ALU, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, alu_commit_if.wid, alu_commit_if.curr_PC, alu_commit_if.thread_mask, alu_commit_if.wb, alu_commit_if.rd, alu_commit_if.data); end - if (bru_commit_if.valid) begin - $display("%t: Core%0d-commit: wid=%0d, PC=%0h, ex=BRU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.bru_data.wid, cmt_to_issue_if.bru_data.curr_PC, bru_commit_if.issue_tag, cmt_to_issue_if.bru_data.thread_mask, cmt_to_issue_if.bru_data.wb, cmt_to_issue_if.bru_data.rd, bru_commit_if.data); + if (lsu_commit_if.valid && lsu_commit_if.ready) begin + $display("%t: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, lsu_commit_if.wid, lsu_commit_if.curr_PC, lsu_commit_if.thread_mask, lsu_commit_if.wb, lsu_commit_if.rd, lsu_commit_if.data); end - if (lsu_commit_if.valid) begin - $display("%t: Core%0d-commit: wid=%0d, PC=%0h, ex=LSU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.lsu_data.wid, cmt_to_issue_if.lsu_data.curr_PC, lsu_commit_if.issue_tag, cmt_to_issue_if.lsu_data.thread_mask, cmt_to_issue_if.lsu_data.wb, cmt_to_issue_if.lsu_data.rd, lsu_commit_if.data); - end - if (csr_commit_if.valid) begin - $display("%t: Core%0d-commit: wid=%0d, PC=%0h, ex=CSR, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.csr_data.wid, cmt_to_issue_if.csr_data.curr_PC, csr_commit_if.issue_tag, cmt_to_issue_if.csr_data.thread_mask, cmt_to_issue_if.csr_data.wb, cmt_to_issue_if.csr_data.rd, csr_commit_if.data); + if (csr_commit_if.valid && csr_commit_if.ready) begin + $display("%t: core%0d-commit: wid=%0d, PC=%0h, ex=CSR, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, csr_commit_if.wid, csr_commit_if.curr_PC, csr_commit_if.thread_mask, csr_commit_if.wb, csr_commit_if.rd, csr_commit_if.data); end - if (mul_commit_if.validy) begin - $display("%t: Core%0d-commit: wid=%0d, PC=%0h, 
ex=MUL, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.mul_data.wid, cmt_to_issue_if.mul_data.curr_PC, mul_commit_if.issue_tag, cmt_to_issue_if.mul_data.thread_mask, cmt_to_issue_if.mul_data.wb, cmt_to_issue_if.mul_data.rd, mul_commit_if.data); + if (mul_commit_if.valid && mul_commit_if.ready) begin + $display("%t: core%0d-commit: wid=%0d, PC=%0h, ex=MUL, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, mul_commit_if.wid, mul_commit_if.curr_PC, mul_commit_if.thread_mask, mul_commit_if.wb, mul_commit_if.rd, mul_commit_if.data); end - if (fpu_commit_if.valid) begin - $display("%t: Core%0d-commit: wid=%0d, PC=%0h, ex=FPU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.fpu_data.wid, cmt_to_issue_if.fpu_data.curr_PC, fpu_commit_if.issue_tag, cmt_to_issue_if.fpu_data.thread_mask, cmt_to_issue_if.fpu_data.wb, cmt_to_issue_if.fpu_data.rd, fpu_commit_if.data); + if (fpu_commit_if.valid && fpu_commit_if.ready) begin + $display("%t: core%0d-commit: wid=%0d, PC=%0h, ex=FPU, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, fpu_commit_if.wid, fpu_commit_if.curr_PC, fpu_commit_if.thread_mask, fpu_commit_if.wb, fpu_commit_if.rd, fpu_commit_if.data); end - if (gpu_commit_if.valid) begin - $display("%t: Core%0d-commit: wid=%0d, PC=%0h, ex=GPU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.gpu_data.wid, cmt_to_issue_if.gpu_data.curr_PC, gpu_commit_if.issue_tag, cmt_to_issue_if.gpu_data.thread_mask, cmt_to_issue_if.gpu_data.wb, cmt_to_issue_if.gpu_data.rd, gpu_commit_if.data); + if (gpu_commit_if.valid && gpu_commit_if.ready) begin + $display("%t: core%0d-commit: wid=%0d, PC=%0h, ex=GPU, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, gpu_commit_if.wid, gpu_commit_if.curr_PC, gpu_commit_if.thread_mask, gpu_commit_if.wb, gpu_commit_if.rd, gpu_commit_if.data); end end +`else + `UNUSED_FIELD(fpu_commit_if, curr_PC) `endif endmodule diff --git a/hw/rtl/VX_config.vh 
b/hw/rtl/VX_config.vh index 39037f51..ab4d4742 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -60,16 +60,6 @@ `define ARCHITECTURE_ID 0 `define IMPLEMENTATION_ID 0 -// Size of MUL Request Queue Size -`ifndef MULRQ_SIZE -`define MULRQ_SIZE 8 -`endif - -// Size of issue queue -`ifndef ISSUEQ_SIZE -`define ISSUEQ_SIZE (8 + `NUM_WARPS) -`endif - // CSR Addresses ////////////////////////////////////////////////////////////// `define CSR_FFLAGS 12'h001 @@ -109,6 +99,28 @@ `define CSR_MIMPID 12'hF13 `define CSR_MHARTID 12'hF14 +// Pipeline Queues ============================================================ + +// Size of instruction queue +`ifndef IBUF_SIZE +`define IBUF_SIZE 8 +`endif + +// Size of LSU Request Queue +`ifndef LSUQ_SIZE +`define LSUQ_SIZE 8 +`endif + +// Size of MUL Request Queue +`ifndef MULQ_SIZE +`define MULQ_SIZE 8 +`endif + +// Size of FPU Request Queue +`ifndef FPUQ_SIZE +`define FPUQ_SIZE 8 +`endif + // Dcache Configurable Knobs ================================================== // Size of cache in bytes diff --git a/hw/rtl/VX_csr_arb.v b/hw/rtl/VX_csr_arb.v index 4cee91ff..7eb109eb 100644 --- a/hw/rtl/VX_csr_arb.v +++ b/hw/rtl/VX_csr_arb.v @@ -1,9 +1,6 @@ `include "VX_define.vh" -module VX_csr_arb ( - input wire clk, - input wire reset, - +module VX_csr_arb ( // inputs VX_csr_req_if csr_core_req_if, VX_csr_io_req_if csr_io_req_if, @@ -12,7 +9,7 @@ module VX_csr_arb ( VX_csr_req_if csr_req_if, // input - VX_csr_rsp_if csr_rsp_if, + VX_exu_to_cmt_if csr_rsp_if, // outputs VX_exu_to_cmt_if csr_commit_if, @@ -21,33 +18,33 @@ module VX_csr_arb ( input wire select_io_req, input wire select_io_rsp ); - - `UNUSED_VAR (clk) - `UNUSED_VAR (reset) - // requests - assign csr_req_if.valid = (~select_io_req) ? csr_core_req_if.valid : csr_io_req_if.valid; - assign csr_req_if.issue_tag = (~select_io_req) ? csr_core_req_if.issue_tag : 0; - assign csr_req_if.wid = (~select_io_req) ? 
csr_core_req_if.wid : 0; - assign csr_req_if.curr_PC = (~select_io_req) ? csr_core_req_if.curr_PC : 0; - assign csr_req_if.op = (~select_io_req) ? csr_core_req_if.op : (csr_io_req_if.rw ? `CSR_RW : `CSR_RS); - assign csr_req_if.csr_addr = (~select_io_req) ? csr_core_req_if.csr_addr : csr_io_req_if.addr; - assign csr_req_if.csr_mask = (~select_io_req) ? csr_core_req_if.csr_mask : (csr_io_req_if.rw ? csr_io_req_if.data : 32'b0); - assign csr_req_if.rd = (~select_io_req) ? csr_core_req_if.rd : 0; - assign csr_req_if.wb = (~select_io_req) ? csr_core_req_if.wb : 0; + assign csr_req_if.valid = (~select_io_req) ? csr_core_req_if.valid : csr_io_req_if.valid; + assign csr_req_if.wid = (~select_io_req) ? csr_core_req_if.wid : 0; + assign csr_req_if.thread_mask = (~select_io_req) ? csr_core_req_if.thread_mask : 0; + assign csr_req_if.curr_PC = (~select_io_req) ? csr_core_req_if.curr_PC : 0; + assign csr_req_if.op = (~select_io_req) ? csr_core_req_if.op : (csr_io_req_if.rw ? `CSR_RW : `CSR_RS); + assign csr_req_if.csr_addr = (~select_io_req) ? csr_core_req_if.csr_addr : csr_io_req_if.addr; + assign csr_req_if.csr_mask = (~select_io_req) ? csr_core_req_if.csr_mask : (csr_io_req_if.rw ? csr_io_req_if.data : 32'b0); + assign csr_req_if.rd = (~select_io_req) ? csr_core_req_if.rd : 0; + assign csr_req_if.wb = (~select_io_req) ? 
csr_core_req_if.wb : 0; assign csr_req_if.is_io = select_io_req; assign csr_core_req_if.ready = csr_req_if.ready && (~select_io_req); - assign csr_io_req_if.ready = csr_req_if.ready && select_io_req; + assign csr_io_req_if.ready = csr_req_if.ready && select_io_req; // responses assign csr_io_rsp_if.valid = csr_rsp_if.valid & select_io_rsp; assign csr_io_rsp_if.data = csr_rsp_if.data[0]; - assign csr_commit_if.valid = csr_rsp_if.valid & ~select_io_rsp; - assign csr_commit_if.issue_tag= csr_rsp_if.issue_tag; - assign csr_commit_if.data = csr_rsp_if.data; + assign csr_commit_if.valid = csr_rsp_if.valid & ~select_io_rsp; + assign csr_commit_if.wid = csr_rsp_if.wid; + assign csr_commit_if.thread_mask = csr_rsp_if.thread_mask; + assign csr_commit_if.curr_PC = csr_rsp_if.curr_PC; + assign csr_commit_if.rd = csr_rsp_if.rd; + assign csr_commit_if.wb = csr_rsp_if.wb; + assign csr_commit_if.data = csr_rsp_if.data; - assign csr_rsp_if.ready = select_io_rsp ? csr_io_rsp_if.ready : 1'b1; + assign csr_rsp_if.ready = select_io_rsp ? 
csr_io_rsp_if.ready : csr_commit_if.ready; endmodule diff --git a/hw/rtl/VX_csr_data.v b/hw/rtl/VX_csr_data.v index bd426ba2..82fcd3b3 100644 --- a/hw/rtl/VX_csr_data.v +++ b/hw/rtl/VX_csr_data.v @@ -7,7 +7,7 @@ module VX_csr_data #( input wire reset, VX_cmt_to_csr_if cmt_to_csr_if, - VX_csr_to_fpu_if csr_to_fpu_if, + VX_csr_to_issue_if csr_to_issue_if, input wire[`NW_BITS-1:0] wid, @@ -129,11 +129,11 @@ module VX_csr_data #( `CSR_MIMPID : read_data = `IMPLEMENTATION_ID; default: begin - assert(~read_enable) else $error("%t: invalid CSR read address: %0h", $time, read_addr); - end + assert(~read_enable) else $error("%t: invalid CSR read address: %0h", $time, read_addr); + end endcase end - assign csr_to_fpu_if.frm = csr_frm[csr_to_fpu_if.wid]; + assign csr_to_issue_if.frm = csr_frm[csr_to_issue_if.wid]; endmodule \ No newline at end of file diff --git a/hw/rtl/VX_csr_unit.v b/hw/rtl/VX_csr_unit.v index c6a66b6c..e742cb40 100644 --- a/hw/rtl/VX_csr_unit.v +++ b/hw/rtl/VX_csr_unit.v @@ -7,7 +7,7 @@ module VX_csr_unit #( input wire reset, VX_cmt_to_csr_if cmt_to_csr_if, - VX_csr_to_fpu_if csr_to_fpu_if, + VX_csr_to_issue_if csr_to_issue_if, VX_csr_io_req_if csr_io_req_if, VX_csr_io_rsp_if csr_io_rsp_if, @@ -15,16 +15,13 @@ module VX_csr_unit #( VX_csr_req_if csr_req_if, VX_exu_to_cmt_if csr_commit_if ); - VX_csr_req_if csr_pipe_req_if(); - VX_csr_rsp_if csr_pipe_rsp_if(); + VX_csr_req_if csr_pipe_req_if(); + VX_exu_to_cmt_if csr_pipe_rsp_if(); wire select_io_req = csr_io_req_if.valid; wire select_io_rsp; VX_csr_arb csr_arb ( - .clk (clk), - .reset (reset), - .csr_core_req_if (csr_req_if), .csr_io_req_if (csr_io_req_if), .csr_req_if (csr_pipe_req_if), @@ -41,7 +38,6 @@ module VX_csr_unit #( wire [`CSR_ADDR_BITS-1:0] csr_addr_s1; wire [31:0] csr_read_data, csr_read_data_s1; wire [31:0] csr_updated_data_s1; - wire [`NW_BITS-1:0] wid_s1; VX_csr_data #( .CORE_ID(CORE_ID) @@ -49,7 +45,7 @@ module VX_csr_unit #( .clk (clk), .reset (reset), .cmt_to_csr_if (cmt_to_csr_if), - 
.csr_to_fpu_if (csr_to_fpu_if), + .csr_to_issue_if (csr_to_issue_if), .read_enable (csr_pipe_req_if.valid), .read_addr (csr_pipe_req_if.csr_addr), .read_data (csr_read_data), @@ -60,7 +56,7 @@ module VX_csr_unit #( ); wire csr_hazard = (csr_addr_s1 == csr_pipe_req_if.csr_addr) - && (wid_s1 == csr_pipe_req_if.wid) + && (csr_pipe_rsp_if.wid == csr_pipe_req_if.wid) && csr_pipe_rsp_if.valid; wire [31:0] csr_read_data_qual = csr_hazard ? csr_updated_data_s1 : csr_read_data; @@ -86,21 +82,21 @@ module VX_csr_unit #( end default: csr_updated_data = 32'hdeadbeef; endcase - end - + end + wire csr_we_s0 = csr_we_s0_unqual && csr_pipe_req_if.valid; wire stall = ~csr_pipe_rsp_if.ready && csr_pipe_rsp_if.valid; VX_generic_register #( - .N(1 + `ISTAG_BITS + `NW_BITS + 1 + `CSR_ADDR_BITS + 1 + 32 + 32) + .N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1 + `CSR_ADDR_BITS + 1 + 32 + 32) ) csr_reg ( .clk (clk), .reset (reset), .stall (stall), .flush (0), - .in ({csr_pipe_req_if.valid, csr_pipe_req_if.issue_tag, csr_pipe_req_if.wid, csr_we_s0, csr_pipe_req_if.csr_addr, csr_pipe_req_if.is_io, csr_read_data_qual, csr_updated_data}), - .out ({csr_pipe_rsp_if.valid, csr_pipe_rsp_if.issue_tag, wid_s1, csr_we_s1, csr_addr_s1, select_io_rsp, csr_read_data_s1, csr_updated_data_s1}) + .in ({csr_pipe_req_if.valid, csr_pipe_req_if.wid, csr_pipe_req_if.thread_mask, csr_pipe_req_if.curr_PC, csr_pipe_req_if.rd, csr_pipe_req_if.wb, csr_we_s0, csr_pipe_req_if.csr_addr, csr_pipe_req_if.is_io, csr_read_data_qual, csr_updated_data}), + .out ({csr_pipe_rsp_if.valid, csr_pipe_rsp_if.wid, csr_pipe_rsp_if.thread_mask, csr_pipe_rsp_if.curr_PC, csr_pipe_rsp_if.rd, csr_pipe_rsp_if.wb, csr_we_s1, csr_addr_s1, select_io_rsp, csr_read_data_s1, csr_updated_data_s1}) ); for (genvar i = 0; i < `NUM_THREADS; i++) begin @@ -109,6 +105,7 @@ module VX_csr_unit #( csr_read_data_s1; end + // can accept new request? 
assign csr_pipe_req_if.ready = ~stall; endmodule diff --git a/hw/rtl/VX_decode.v b/hw/rtl/VX_decode.v index 3b9bd6c1..36dc82c0 100644 --- a/hw/rtl/VX_decode.v +++ b/hw/rtl/VX_decode.v @@ -15,11 +15,13 @@ module VX_decode #( VX_wstall_if wstall_if, VX_join_if join_if ); - wire valid_in = ifetch_rsp_if.valid; - wire [31:0] instr = ifetch_rsp_if.instr; + `UNUSED_VAR (clk) + `UNUSED_VAR (reset) + + wire [31:0] instr = ifetch_rsp_if.instr; reg [`ALU_BITS-1:0] alu_op; - reg [`BRU_BITS-1:0] br_op; + reg [`BR_BITS-1:0] br_op; reg [`LSU_BITS-1:0] lsu_op; reg [`CSR_BITS-1:0] csr_op; reg [`MUL_BITS-1:0] mul_op; @@ -100,27 +102,27 @@ module VX_decode #( wire is_br = (is_btype || is_jal || is_jalr || is_jals); always @(*) begin - br_op = `BRU_OTHER; + br_op = `BR_OTHER; case (opcode) `INST_B: begin case (func3) - 3'h0: br_op = `BRU_EQ; - 3'h1: br_op = `BRU_NE; - 3'h4: br_op = `BRU_LT; - 3'h5: br_op = `BRU_GE; - 3'h6: br_op = `BRU_LTU; - 3'h7: br_op = `BRU_GEU; + 3'h0: br_op = `BR_EQ; + 3'h1: br_op = `BR_NE; + 3'h4: br_op = `BR_LT; + 3'h5: br_op = `BR_GE; + 3'h6: br_op = `BR_LTU; + 3'h7: br_op = `BR_GEU; default:; endcase end - `INST_JAL: br_op = `BRU_JAL; - `INST_JALR: br_op = `BRU_JALR; + `INST_JAL: br_op = `BR_JAL; + `INST_JALR: br_op = `BR_JALR; `INST_SYS: begin - if (is_jals && u_12 == 12'h000) br_op = `BRU_ECALL; - if (is_jals && u_12 == 12'h001) br_op = `BRU_EBREAK; - if (is_jals && u_12 == 12'h302) br_op = `BRU_MRET; - if (is_jals && u_12 == 12'h102) br_op = `BRU_SRET; - if (is_jals && u_12 == 12'h7B2) br_op = `BRU_DRET; + if (is_jals && u_12 == 12'h000) br_op = `BR_ECALL; + if (is_jals && u_12 == 12'h001) br_op = `BR_EBREAK; + if (is_jals && u_12 == 12'h302) br_op = `BR_MRET; + if (is_jals && u_12 == 12'h102) br_op = `BR_SRET; + if (is_jals && u_12 == 12'h7B2) br_op = `BR_DRET; end default:; endcase @@ -290,104 +292,93 @@ module VX_decode #( /////////////////////////////////////////////////////////////////////////// - VX_decode_if decode_tmp_if(); + assign 
decode_if.valid = ifetch_rsp_if.valid + && (decode_if.ex_type != `EX_NOP); // skip noop - assign decode_tmp_if.valid = ifetch_rsp_if.valid; - assign decode_tmp_if.wid = ifetch_rsp_if.wid; - assign decode_tmp_if.thread_mask = ifetch_rsp_if.thread_mask; - assign decode_tmp_if.curr_PC = ifetch_rsp_if.curr_PC; + assign decode_if.wid = ifetch_rsp_if.wid; + assign decode_if.thread_mask = ifetch_rsp_if.thread_mask; + assign decode_if.curr_PC = ifetch_rsp_if.curr_PC; - assign decode_tmp_if.ex_type = is_lsu ? `EX_LSU : - is_csr ? `EX_CSR : - is_mul ? `EX_MUL : - is_fpu ? `EX_FPU : - is_gpu ? `EX_GPU : - is_br ? `EX_BRU : - (is_rtype || is_itype || is_lui || is_auipc) ? `EX_ALU : - `EX_NOP; + assign decode_if.ex_type = is_lsu ? `EX_LSU : + is_csr ? `EX_CSR : + is_mul ? `EX_MUL : + is_fpu ? `EX_FPU : + is_gpu ? `EX_GPU : + is_br ? `EX_ALU : + (is_rtype || is_itype || is_lui || is_auipc) ? `EX_ALU : + `EX_NOP; - assign decode_tmp_if.ex_op = is_lsu ? `OP_BITS'(lsu_op) : - is_csr ? `OP_BITS'(csr_op) : - is_mul ? `OP_BITS'(mul_op) : - is_fpu ? `OP_BITS'(fpu_op) : - is_gpu ? `OP_BITS'(gpu_op) : - is_br ? `OP_BITS'(br_op) : - (is_rtype || is_itype || is_lui || is_auipc) ? `OP_BITS'(alu_op) : - 0; + assign decode_if.ex_op = is_lsu ? `OP_BITS'(lsu_op) : + is_csr ? `OP_BITS'(csr_op) : + is_mul ? `OP_BITS'(mul_op) : + is_fpu ? `OP_BITS'(fpu_op) : + is_gpu ? `OP_BITS'(gpu_op) : + is_br ? `OP_BITS'({1'b1, br_op}) : + (is_rtype || is_itype || is_lui || is_auipc) ? 
`OP_BITS'({1'b0, alu_op}) : + 0; - assign decode_tmp_if.wb = use_rd; + assign decode_if.wb = use_rd; `ifdef EXT_F_ENABLE - wire rd_is_fp = is_fpu && ~(is_fcmp || is_fcvti || (fpu_op == `FPU_MVXW || fpu_op == `FPU_CLASS)); wire rs1_is_fp = is_fr4 || (is_fci && ~(is_fcvtf || (fpu_op == `FPU_MVWX))); wire rs2_is_fp = is_fs || is_fr4 || is_fci; - assign decode_tmp_if.rd = {rd_is_fp, rd}; - assign decode_tmp_if.rs1 = {rs1_is_fp, rs1_qual}; - assign decode_tmp_if.rs2 = {rs2_is_fp, rs2}; - assign decode_tmp_if.rs3 = {1'b1, rs3}; + assign decode_if.rd = {rd_is_fp, rd}; + assign decode_if.rs1 = {rs1_is_fp, rs1_qual}; + assign decode_if.rs2 = {rs2_is_fp, rs2}; + assign decode_if.rs3 = {1'b1, rs3}; `else - assign decode_tmp_if.rd = rd; - assign decode_tmp_if.rs1 = rs1_qual; - assign decode_tmp_if.rs2 = rs2; - assign decode_tmp_if.rs3 = rs3; + assign decode_if.rd = rd; + assign decode_if.rs1 = rs1_qual; + assign decode_if.rs2 = rs2; + assign decode_if.rs3 = rs3; `endif - assign decode_tmp_if.use_rs3 = use_rs3; + assign decode_if.use_rs3 = use_rs3; - assign decode_tmp_if.reg_use_mask = ((`NUM_REGS)'(use_rd) << decode_tmp_if.rd) - | ((`NUM_REGS)'(use_rs1) << decode_tmp_if.rs1) - | ((`NUM_REGS)'(use_rs2) << decode_tmp_if.rs2) - | ((`NUM_REGS)'(use_rs3) << decode_tmp_if.rs3); + assign decode_if.used_regs = ((`NUM_REGS)'(use_rd) << decode_if.rd) + | ((`NUM_REGS)'(use_rs1) << decode_if.rs1) + | ((`NUM_REGS)'(use_rs2) << decode_if.rs2) + | ((`NUM_REGS)'(use_rs3) << decode_if.rs3); - assign decode_tmp_if.imm = (is_lui || is_auipc) ? {upper_imm, 12'(0)} : - (is_jal || is_jalr || is_jals) ? jalx_offset : - is_csr ? 32'(u_12) : - src2_imm; + assign decode_if.imm = (is_lui || is_auipc) ? {upper_imm, 12'(0)} : + (is_jal || is_jalr || is_jals) ? jalx_offset : + is_csr ? 
32'(u_12) : + src2_imm; - assign decode_tmp_if.rs1_is_PC = is_auipc || is_btype || is_jal || is_jals; - assign decode_tmp_if.rs2_is_imm = is_itype || is_lui || is_auipc || is_csr_imm; + assign decode_if.rs1_is_PC = is_auipc || is_btype || is_jal || is_jals; + assign decode_if.rs2_is_imm = is_itype || is_lui || is_auipc || is_csr_imm || is_br; - assign decode_tmp_if.frm = func3; + assign decode_if.frm = func3; - assign join_if.is_join = valid_in && is_gpu && (gpu_op == `GPU_JOIN); + /////////////////////////////////////////////////////////////////////////// + + wire decode_fire = decode_if.valid && decode_if.ready; + + assign join_if.is_join = decode_fire && is_gpu && (gpu_op == `GPU_JOIN); assign join_if.wid = ifetch_rsp_if.wid; - assign wstall_if.wstall = valid_in && (is_btype || is_jal || is_jalr || (is_gpu && (gpu_op == `GPU_TMC || gpu_op == `GPU_SPLIT || gpu_op == `GPU_BAR))); + assign wstall_if.wstall = decode_fire && (is_btype || is_jal || is_jalr + || (is_gpu && (gpu_op == `GPU_TMC + || gpu_op == `GPU_SPLIT + || gpu_op == `GPU_BAR))); assign wstall_if.wid = ifetch_rsp_if.wid; - wire stall = ~decode_if.ready && decode_if.valid; + /////////////////////////////////////////////////////////////////////////// - VX_generic_register #( - .N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + `EX_BITS + `OP_BITS + 1 + `NR_BITS + 1 + 1 + `FRM_BITS + `NUM_REGS) - ) decode_reg ( - .clk (clk), - .reset (reset), - .stall (stall), - .flush (0), - .in ({decode_tmp_if.valid, decode_tmp_if.wid, decode_tmp_if.thread_mask, decode_tmp_if.curr_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.ex_type, decode_tmp_if.ex_op, decode_tmp_if.wb, decode_tmp_if.rs3, decode_tmp_if.use_rs3, decode_tmp_if.frm, decode_tmp_if.reg_use_mask}), - .out ({decode_if.valid, decode_if.wid, decode_if.thread_mask, decode_if.curr_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, 
decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.ex_type, decode_if.ex_op, decode_if.wb, decode_if.rs3, decode_if.use_rs3, decode_if.frm, decode_if.reg_use_mask}) - ); - - assign ifetch_rsp_if.ready = ~stall; + assign ifetch_rsp_if.ready = decode_if.ready; `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin - if (decode_tmp_if.valid && ~stall) begin - $write("%t: Core%0d-Decode: wid=%0d, PC=%0h, ex=", $time, CORE_ID, decode_tmp_if.wid, decode_tmp_if.curr_PC); - print_ex_type(decode_tmp_if.ex_type); + if (decode_if.valid && decode_if.ready) begin + $write("%t: core%0d-decode: wid=%0d, PC=%0h, ex=", $time, CORE_ID, decode_if.wid, decode_if.curr_PC); + print_ex_type(decode_if.ex_type); $write(", op="); - print_ex_op(decode_tmp_if.ex_type, decode_tmp_if.ex_op); - $write(", tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b, frm=", decode_tmp_if.thread_mask, decode_tmp_if.wb, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.rs3, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm); - print_frm(decode_tmp_if.frm); - $write("\n"); - - // trap unsupported instructions - assert(~(~stall && (decode_tmp_if.ex_type == `EX_ALU) && `ALU_OP(decode_tmp_if.ex_op) == `ALU_OTHER)); - assert(~(~stall && (decode_tmp_if.ex_type == `EX_BRU) && `BRU_OP(decode_tmp_if.ex_op) == `BRU_OTHER)); - assert(~(~stall && (decode_tmp_if.ex_type == `EX_CSR) && `CSR_OP(decode_tmp_if.ex_op) == `CSR_OTHER)); - assert(~(~stall && (decode_tmp_if.ex_type == `EX_GPU) && `GPU_OP(decode_tmp_if.ex_op) == `GPU_OTHER)); + print_ex_op(decode_if.ex_type, decode_if.ex_op); + $write(", tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b, frm=", decode_if.thread_mask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm); + print_frm(decode_if.frm); + $write("\n"); end end `endif diff --git a/hw/rtl/VX_define.vh 
b/hw/rtl/VX_define.vh index f81b06ce..ae5dbe80 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -33,8 +33,6 @@ `define CSR_WIDTH 12 -`define ISTAG_BITS `LOG2UP(`ISSUEQ_SIZE) - /////////////////////////////////////////////////////////////////////////////// `define LATENCY_IDIV 33 @@ -98,15 +96,14 @@ `define EX_NOP 3'h0 `define EX_ALU 3'h1 -`define EX_BRU 3'h2 -`define EX_LSU 3'h3 -`define EX_CSR 3'h4 -`define EX_MUL 3'h5 -`define EX_FPU 3'h6 -`define EX_GPU 3'h7 +`define EX_LSU 3'h2 +`define EX_CSR 3'h3 +`define EX_MUL 3'h4 +`define EX_FPU 3'h5 +`define EX_GPU 3'h6 `define EX_BITS 3 -`define NUM_EXS 7 +`define NUM_EXS 6 `define NE_BITS `LOG2UP(`NUM_EXS) /////////////////////////////////////////////////////////////////////////////// @@ -117,8 +114,8 @@ `define ALU_SUB 4'b0001 `define ALU_LUI 4'b0010 `define ALU_AUIPC 4'b0011 -`define ALU_SLT 4'b0100 -`define ALU_SLTU 4'b0101 +`define ALU_SLTU 4'b0100 +`define ALU_SLT 4'b0101 `define ALU_SRL 4'b1000 `define ALU_SRA 4'b1001 `define ALU_AND 4'b1100 @@ -129,27 +126,31 @@ `define ALU_BITS 4 `define ALU_OP(x) x[`ALU_BITS-1:0] `define ALU_OP_CLASS(x) x[3:2] +`define ALU_SIGNED(x) x[0] -`define BRU_EQ 4'b0000 -`define BRU_NE 4'b0001 -`define BRU_LTU 4'b0010 -`define BRU_GEU 4'b0011 -`define BRU_LT 4'b0110 -`define BRU_GE 4'b0111 -`define BRU_JAL 4'b1000 -`define BRU_JALR 4'b1001 -`define BRU_ECALL 4'b1010 -`define BRU_EBREAK 4'b1011 -`define BRU_MRET 4'b1100 -`define BRU_SRET 4'b1101 -`define BRU_DRET 4'b1110 -`define BRU_OTHER 4'b1111 -`define BRU_BITS 4 -`define BRU_OP(x) x[`BRU_BITS-1:0] -`define BRU_NEG(x) x[0] -`define BRU_LESS(x) x[1] -`define BRU_SIGNED(x) x[2] -`define BRU_STATIC(x) x[3] +`define BR_EQ 4'b0000 +`define BR_NE 4'b0010 +`define BR_LTU 4'b0100 +`define BR_GEU 4'b0110 +`define BR_LT 4'b0101 +`define BR_GE 4'b0111 +`define BR_JAL 4'b1000 +`define BR_JALR 4'b1001 +`define BR_ECALL 4'b1010 +`define BR_EBREAK 4'b1011 +`define BR_MRET 4'b1100 +`define BR_SRET 4'b1101 +`define BR_DRET 4'b1110 
+`define BR_OTHER 4'b1111 +`define BR_BITS 4 +`define BR_OP(x) x[`BR_BITS-1:0] +`define BR_NEG(x) x[1] +`define BR_LESS(x) x[2] +`define BR_STATIC(x) x[3] + +`define ALU_BR_BITS 5 +`define ALU_BR_OP(x) x[`ALU_BR_BITS-1:0] +`define IS_BR_OP(x) x[4] `define LSU_LB {1'b0, `BYTEEN_SB} `define LSU_LH {1'b0, `BYTEEN_SH} @@ -262,10 +263,10 @@ /////////////////////////////////////////////////////////////////////////////// -`ifdef DBG_CORE_REQ_INFO // pc, wb, rd, wid -`define DEBUG_CORE_REQ_MDATA_WIDTH (32 + 1 + `NR_BITS + `NW_BITS) +`ifdef DBG_CORE_REQ_INFO // pc, rd, wid +`define DBG_CORE_REQ_MDATAW (32 + `NR_BITS + `NW_BITS) `else -`define DEBUG_CORE_REQ_MDATA_WIDTH 0 +`define DBG_CORE_REQ_MDATAW 0 `endif ////////////////////////// Dcache Configurable Knobs ////////////////////////// @@ -274,10 +275,10 @@ `define DCACHE_ID (((`L3_ENABLE && `L2_ENABLE) ? 2 : `L2_ENABLE ? 1 : 0) + (CORE_ID * 3) + 0) // TAG sharing enable -`define DCORE_TAG_ID_BITS `ISTAG_BITS +`define DCORE_TAG_ID_BITS `LOG2UP(`LSUQ_SIZE) // Core request tag bits -`define DCORE_TAG_WIDTH (`DEBUG_CORE_REQ_MDATA_WIDTH + `DCORE_TAG_ID_BITS) +`define DCORE_TAG_WIDTH (`DBG_CORE_REQ_MDATAW + `DCORE_TAG_ID_BITS) // DRAM request data bits `define DDRAM_LINE_WIDTH (`DBANK_LINE_SIZE * 8) @@ -312,7 +313,7 @@ `define ICORE_TAG_ID_BITS `NW_BITS // Core request tag bits -`define ICORE_TAG_WIDTH (`DEBUG_CORE_REQ_MDATA_WIDTH + `ICORE_TAG_ID_BITS) +`define ICORE_TAG_WIDTH (`DBG_CORE_REQ_MDATAW + `ICORE_TAG_ID_BITS) // DRAM request data bits `define IDRAM_LINE_WIDTH (`IBANK_LINE_SIZE * 8) diff --git a/hw/rtl/VX_execute.v b/hw/rtl/VX_execute.v index 78d2436f..f5b792f3 100644 --- a/hw/rtl/VX_execute.v +++ b/hw/rtl/VX_execute.v @@ -22,7 +22,6 @@ module VX_execute #( // inputs VX_alu_req_if alu_req_if, - VX_bru_req_if bru_req_if, VX_lsu_req_if lsu_req_if, VX_csr_req_if csr_req_if, VX_mul_req_if mul_req_if, @@ -30,10 +29,10 @@ module VX_execute #( VX_gpu_req_if gpu_req_if, // outputs + VX_csr_to_issue_if csr_to_issue_if, 
VX_branch_ctl_if branch_ctl_if, VX_warp_ctl_if warp_ctl_if, VX_exu_to_cmt_if alu_commit_if, - VX_exu_to_cmt_if bru_commit_if, VX_exu_to_cmt_if lsu_commit_if, VX_exu_to_cmt_if csr_commit_if, VX_exu_to_cmt_if mul_commit_if, @@ -43,25 +42,14 @@ module VX_execute #( output wire ebreak ); - VX_csr_to_fpu_if csr_to_fpu_if(); - VX_alu_unit #( .CORE_ID(CORE_ID) ) alu_unit ( .clk (clk), .reset (reset), .alu_req_if (alu_req_if), - .alu_commit_if (alu_commit_if) - ); - - VX_bru_unit #( - .CORE_ID(CORE_ID) - ) bru_unit ( - .clk (clk), - .reset (reset), - .bru_req_if (bru_req_if), .branch_ctl_if (branch_ctl_if), - .bru_commit_if (bru_commit_if) + .alu_commit_if (alu_commit_if) ); VX_lsu_unit #( @@ -82,7 +70,7 @@ module VX_execute #( .clk (clk), .reset (reset), .cmt_to_csr_if (cmt_to_csr_if), - .csr_to_fpu_if (csr_to_fpu_if), + .csr_to_issue_if (csr_to_issue_if), .csr_io_req_if (csr_io_req_if), .csr_io_rsp_if (csr_io_rsp_if), .csr_req_if (csr_req_if), @@ -95,8 +83,8 @@ module VX_execute #( ) mul_unit ( .clk (clk), .reset (reset), - .alu_req_if (mul_req_if), - .alu_commit_if (mul_commit_if) + .mul_req_if (mul_req_if), + .mul_commit_if (mul_commit_if) ); `else assign mul_req_if.ready = 0; @@ -112,7 +100,6 @@ module VX_execute #( .clk (clk), .reset (reset), .fpu_req_if (fpu_req_if), - .csr_to_fpu_if (csr_to_fpu_if), .fpu_commit_if (fpu_commit_if) ); `else @@ -134,9 +121,10 @@ module VX_execute #( .gpu_commit_if (gpu_commit_if) ); - assign ebreak = bru_req_if.valid - && (bru_req_if.op == `BRU_EBREAK - || bru_req_if.op == `BRU_ECALL); + assign ebreak = alu_req_if.valid + && `IS_BR_OP(alu_req_if.op) + && (`BR_OP(alu_req_if.op) == `BR_EBREAK + || `BR_OP(alu_req_if.op) == `BR_ECALL); `SCOPE_ASSIGN (scope_decode_valid, decode_if.valid); `SCOPE_ASSIGN (scope_decode_wid, decode_if.wid); diff --git a/hw/rtl/VX_fpu_unit.v b/hw/rtl/VX_fpu_unit.v index a255d2cc..c2be8803 100644 --- a/hw/rtl/VX_fpu_unit.v +++ b/hw/rtl/VX_fpu_unit.v @@ -9,59 +9,81 @@ module VX_fpu_unit #( // inputs VX_fpu_req_if 
fpu_req_if, - VX_csr_to_fpu_if csr_to_fpu_if, - + // outputs VX_fpu_to_cmt_if fpu_commit_if -); - VX_fpu_req_if fpu_req_tmp_if(); +); + localparam FPUQ_BITS = `LOG2UP(`FPUQ_SIZE); - // resolve dynamic FRM - wire [`FRM_BITS-1:0] frm, frm_tmp; - assign csr_to_fpu_if.wid = fpu_req_if.wid; - assign frm = (fpu_req_if.frm == `FRM_DYN) ? csr_to_fpu_if.frm : fpu_req_if.frm; + wire ready_in; + wire valid_out; + wire ready_out; - // use a skid buffer since fpcore has realtime backpressure - VX_elastic_buffer #( - .DATAW (`ISTAG_BITS + `NW_BITS + 32 + `FPU_BITS + `FRM_BITS + (3 * `NUM_THREADS * 32)), - .SIZE (0) - ) input_buffer ( - .clk (clk), - .reset (reset), - .valid_in (fpu_req_if.valid), - .ready_in (fpu_req_if.ready), - .data_in ({fpu_req_if.issue_tag, fpu_req_if.wid, fpu_req_if.curr_PC, fpu_req_if.op, frm, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}), - .data_out ({fpu_req_tmp_if.issue_tag, fpu_req_tmp_if.wid, fpu_req_tmp_if.curr_PC, fpu_req_tmp_if.op, frm_tmp, fpu_req_tmp_if.rs1_data, fpu_req_tmp_if.rs2_data, fpu_req_tmp_if.rs3_data}), - .ready_out (fpu_req_tmp_if.ready), - .valid_out (fpu_req_tmp_if.valid) + wire [`NW_BITS-1:0] rsp_wid; + wire [`NUM_THREADS-1:0] rsp_thread_mask; + wire [31:0] rsp_curr_PC; + wire [`NR_BITS-1:0] rsp_rd; + wire rsp_wb; + + wire has_fflags; + fflags_t [`NUM_THREADS-1:0] fflags; + wire [`NUM_THREADS-1:0][31:0] result; + + wire [FPUQ_BITS-1:0] tag_in, tag_out; + wire fpuq_full; + + wire fpuq_push = fpu_req_if.valid && fpu_req_if.ready; + wire fpuq_pop = valid_out && ready_out; + + VX_cam_buffer #( + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1), + .SIZE (`FPUQ_SIZE) + ) mul_queue ( + .clk (clk), + .reset (reset), + .acquire_slot (fpuq_push), + .write_addr (tag_in), + .read_addr (tag_out), + .release_addr (tag_out), + .write_data ({fpu_req_if.wid, fpu_req_if.thread_mask, fpu_req_if.curr_PC, fpu_req_if.rd, fpu_req_if.wb}), + .read_data ({rsp_wid, rsp_thread_mask, rsp_curr_PC, rsp_rd, rsp_wb}), + .release_slot 
(fpuq_pop), + .full (fpuq_full) ); + + wire valid_in = fpu_req_if.valid && ~fpuq_full; + + // can accept new request? + assign fpu_req_if.ready = ready_in && ~fpuq_full; `ifdef SYNTHESIS - VX_fp_fpga fp_core ( + VX_fp_fpga #( + .TAGW (FPUQ_BITS) + ) fp_core ( .clk (clk), .reset (reset), - .valid_in (fpu_req_tmp_if.valid), - .ready_in (fpu_req_tmp_if.ready), + .valid_in (valid_in), + .ready_in (ready_in), - .tag_in (fpu_req_tmp_if.issue_tag), + .tag_in (tag_in), - .op (fpu_req_tmp_if.op), - .frm (frm_tmp), + .op (fpu_req_if.op), + .frm (fpu_req_if.frm), - .dataa (fpu_req_tmp_if.rs1_data), - .datab (fpu_req_tmp_if.rs2_data), - .datac (fpu_req_tmp_if.rs3_data), - .result (fpu_commit_if.data), + .dataa (fpu_req_if.rs1_data), + .datab (fpu_req_if.rs2_data), + .datac (fpu_req_if.rs3_data), + .result (result), - .has_fflags (fpu_commit_if.has_fflags), - .fflags (fpu_commit_if.fflags), + .has_fflags (has_fflags), + .fflags (fflags), - .tag_out (fpu_commit_if.issue_tag), + .tag_out (tag_out), - .ready_out (1'b1), - .valid_out (fpu_commit_if.valid) + .ready_out (ready_out), + .valid_out (valid_out) ); `else @@ -70,33 +92,49 @@ module VX_fpu_unit #( .FMULADD (1), .FDIVSQRT (1), .FNONCOMP (1), - .FCONV (1) + .FCONV (1), + .TAGW (FPUQ_BITS) ) fp_core ( .clk (clk), .reset (reset), - .valid_in (fpu_req_tmp_if.valid), - .ready_in (fpu_req_tmp_if.ready), + .valid_in (valid_in), + .ready_in (ready_in), - .tag_in (fpu_req_tmp_if.issue_tag), + .tag_in (tag_in), - .op (fpu_req_tmp_if.op), - .frm (frm_tmp), + .op (fpu_req_if.op), + .frm (fpu_req_if.frm), - .dataa (fpu_req_tmp_if.rs1_data), - .datab (fpu_req_tmp_if.rs2_data), - .datac (fpu_req_tmp_if.rs3_data), - .result (fpu_commit_if.data), + .dataa (fpu_req_if.rs1_data), + .datab (fpu_req_if.rs2_data), + .datac (fpu_req_if.rs3_data), + .result (result), - .has_fflags (fpu_commit_if.has_fflags), - .fflags (fpu_commit_if.fflags), + .has_fflags (has_fflags), + .fflags (fflags), - .tag_out (fpu_commit_if.issue_tag), + .tag_out (tag_out), 
- .ready_out (1'b1), - .valid_out (fpu_commit_if.valid) + .ready_out (ready_out), + .valid_out (valid_out) ); `endif + wire stall_out = ~fpu_commit_if.ready && fpu_commit_if.valid; + + VX_generic_register #( + .N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + (`NUM_THREADS * `FFG_BITS)) + ) fpu_reg ( + .clk (clk), + .reset (reset), + .stall (stall_out), + .flush (1'b0), + .in ({valid_out, rsp_wid, rsp_thread_mask, rsp_curr_PC, rsp_rd, rsp_wb, result, has_fflags, fflags}), + .out ({fpu_commit_if.valid, fpu_commit_if.wid, fpu_commit_if.thread_mask, fpu_commit_if.curr_PC, fpu_commit_if.rd, fpu_commit_if.wb, fpu_commit_if.data, fpu_commit_if.has_fflags, fpu_commit_if.fflags}) + ); + + assign ready_out = ~stall_out; + endmodule \ No newline at end of file diff --git a/hw/rtl/VX_gpr_bypass.v b/hw/rtl/VX_gpr_bypass.v new file mode 100644 index 00000000..f2537301 --- /dev/null +++ b/hw/rtl/VX_gpr_bypass.v @@ -0,0 +1,53 @@ +`include "VX_platform.vh" + +module VX_gpr_bypass #( + parameter DATAW = 1, + parameter BUFFERED = 1 +) ( + input wire clk, + input wire reset, + input wire push, + input reg pop, + input wire [DATAW-1:0] data_in, + output wire [DATAW-1:0] data_out +); + reg [DATAW-1:0] buffer, buffer2; + reg use_buffer, use_buffer2; + reg delayed_push; + + always @(posedge clk) begin + if (reset) begin + delayed_push <= 0; + use_buffer <= 0; + use_buffer2 <= 0; + end else begin + delayed_push <= push; + assert(!use_buffer2 || use_buffer); + if (pop) begin + if (use_buffer) begin + buffer <= buffer2; + use_buffer <= use_buffer2; + use_buffer2 <= 0; + end + end + if (delayed_push) begin + if (use_buffer) begin + assert(!use_buffer2); // queue full! + if (pop) begin + buffer <= data_in; + end else begin + buffer2 <= data_in; + use_buffer2 <= 1; + end + use_buffer <= 1; + end else if (!pop) begin + buffer <= data_in; + use_buffer <= 1; + end + end + end + end + + assign data_out = use_buffer ? 
buffer : data_in; + +endmodule \ No newline at end of file diff --git a/hw/rtl/VX_gpr_fp_ctrl.v b/hw/rtl/VX_gpr_fp_ctrl.v index ff9eea79..54a203c0 100644 --- a/hw/rtl/VX_gpr_fp_ctrl.v +++ b/hw/rtl/VX_gpr_fp_ctrl.v @@ -16,25 +16,30 @@ module VX_gpr_fp_ctrl ( reg [`NUM_THREADS-1:0][31:0] rs1_tmp_data, rs2_tmp_data, rs3_tmp_data; reg read_rs3; + reg [`NW_BITS-1:0] rs3_wid; wire rs3_delay = gpr_read_if.valid && gpr_read_if.use_rs3 && ~read_rs3; - wire read_fire = gpr_read_if.valid && read_rs3; + wire read_fire = gpr_read_if.valid && gpr_read_if.ready_out; always @(posedge clk) begin if (reset) begin read_rs3 <= 0; + rs3_wid <= 0; end else begin if (rs3_delay) begin read_rs3 <= 1; + rs3_wid <= gpr_read_if.wid; end else if (read_fire) begin read_rs3 <= 0; end + if (read_rs3) begin + assert(rs3_wid == gpr_read_if.wid); + end end end - // backup original rs1 data - always @(posedge clk) begin - if (~gpr_read_if.use_rs3 || rs3_delay) begin + always @(posedge clk) begin + if (~read_rs3) begin rs1_tmp_data <= rs1_data; end rs2_tmp_data <= rs2_data; @@ -44,7 +49,7 @@ module VX_gpr_fp_ctrl ( // outputs wire [`NR_BITS-1:0] rs1 = read_rs3 ? 
gpr_read_if.rs3 : gpr_read_if.rs1; assign raddr1 = {gpr_read_if.wid, rs1}; - assign gpr_read_if.ready = ~rs3_delay; + assign gpr_read_if.ready_in = ~rs3_delay; assign gpr_read_if.rs1_data = rs1_tmp_data; assign gpr_read_if.rs2_data = rs2_tmp_data; assign gpr_read_if.rs3_data = rs3_tmp_data; diff --git a/hw/rtl/VX_gpr_stage.v b/hw/rtl/VX_gpr_stage.v index bcf55b56..acb2fa08 100644 --- a/hw/rtl/VX_gpr_stage.v +++ b/hw/rtl/VX_gpr_stage.v @@ -7,7 +7,7 @@ module VX_gpr_stage #( input wire reset, // inputs - VX_wb_if writeback_if, + VX_writeback_if writeback_if, // outputs VX_gpr_read_if gpr_read_if @@ -50,14 +50,14 @@ module VX_gpr_stage #( assign gpr_read_if.rs1_data = rs1_tmp_data; assign gpr_read_if.rs2_data = rs2_tmp_data; assign gpr_read_if.rs3_data = 0; - assign gpr_read_if.ready = 1; + assign gpr_read_if.ready_in = 1; - wire valid = gpr_read_if.valid; - wire use_rs3 = gpr_read_if.use_rs3; - wire [`NR_BITS-1:0] rs3 = gpr_read_if.rs3; - `UNUSED_VAR (valid); - `UNUSED_VAR (use_rs3); - `UNUSED_VAR (rs3); + `UNUSED_FIELD (gpr_read_if, valid); + `UNUSED_FIELD (gpr_read_if, use_rs3); + `UNUSED_FIELD (gpr_read_if, rs3); + `UNUSED_FIELD (gpr_read_if, ready_out); `endif + assign writeback_if.ready = 1'b1; + endmodule diff --git a/hw/rtl/VX_gpu_unit.v b/hw/rtl/VX_gpu_unit.v index 8c85f267..ee8af594 100644 --- a/hw/rtl/VX_gpu_unit.v +++ b/hw/rtl/VX_gpu_unit.v @@ -3,15 +3,15 @@ module VX_gpu_unit #( parameter CORE_ID = 0 ) ( - input wire clk, - input wire reset, + input wire clk, + input wire reset, // Input - VX_gpu_req_if gpu_req_if, + VX_gpu_req_if gpu_req_if, // Output - VX_warp_ctl_if warp_ctl_if, - VX_exu_to_cmt_if gpu_commit_if + VX_warp_ctl_if warp_ctl_if, + VX_exu_to_cmt_if gpu_commit_if ); gpu_tmc_t tmc; gpu_wspawn_t wspawn; @@ -23,15 +23,13 @@ module VX_gpu_unit #( wire is_split = (gpu_req_if.op == `GPU_SPLIT); wire is_bar = (gpu_req_if.op == `GPU_BAR); - wire gpu_req_fire = gpu_req_if.valid; - // tmc wire [`NUM_THREADS-1:0] tmc_new_mask; for (genvar i = 0; i < 
`NUM_THREADS; i++) begin assign tmc_new_mask[i] = (i < gpu_req_if.rs1_data[0]); end - assign tmc.valid = gpu_req_fire && is_tmc; + assign tmc.valid = is_tmc; assign tmc.thread_mask = tmc_new_mask; // wspawn @@ -41,7 +39,7 @@ module VX_gpu_unit #( for (genvar i = 0; i < `NUM_WARPS; i++) begin assign wspawn_wmask[i] = (i < gpu_req_if.rs1_data[0]); end - assign wspawn.valid = gpu_req_fire && is_wspawn; + assign wspawn.valid = is_wspawn; assign wspawn.wmask = wspawn_wmask; assign wspawn.pc = wspawn_pc; @@ -56,7 +54,7 @@ module VX_gpu_unit #( assign split_else_mask[i] = gpu_req_if.thread_mask[i] & ~taken; end - assign split.valid = gpu_req_fire && is_split; + assign split.valid = is_split; assign split.diverged = (| split_then_mask) && (| split_else_mask); assign split.then_mask = split_then_mask; assign split.else_mask = split_else_mask; @@ -64,23 +62,29 @@ module VX_gpu_unit #( // barrier - assign barrier.valid = is_bar && gpu_req_fire; - assign barrier.id = gpu_req_if.rs1_data[0][`NB_BITS-1:0]; - assign barrier.num_warps = (`NW_BITS+1)'(gpu_req_if.rs2_data - 1); + assign barrier.valid = is_bar; + assign barrier.id = gpu_req_if.rs1_data[0][`NB_BITS-1:0]; + assign barrier.size_m1 = (`NW_BITS)'(gpu_req_if.rs2_data - 1); // output + wire stall = ~gpu_commit_if.ready && gpu_commit_if.valid; + VX_generic_register #( - .N(1 + `ISTAG_BITS + `NW_BITS + $bits(gpu_tmc_t) + $bits(gpu_wspawn_t) + $bits(gpu_split_t) + $bits(gpu_barrier_t)) + .N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + $bits(gpu_tmc_t) + $bits(gpu_wspawn_t) + $bits(gpu_split_t) + $bits(gpu_barrier_t)) ) gpu_reg ( .clk (clk), .reset (reset), - .stall (0), + .stall (stall), .flush (0), - .in ({gpu_req_if.valid, gpu_req_if.issue_tag, gpu_req_if.wid, tmc, wspawn, split, barrier}), - .out ({gpu_commit_if.valid, gpu_commit_if.issue_tag, warp_ctl_if.wid, warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.barrier}) + .in ({gpu_req_if.valid, gpu_req_if.wid, gpu_req_if.thread_mask, 
gpu_req_if.curr_PC, gpu_req_if.rd, gpu_req_if.wb, tmc, wspawn, split, barrier}), + .out ({gpu_commit_if.valid, gpu_commit_if.wid, gpu_commit_if.thread_mask, gpu_commit_if.curr_PC, gpu_commit_if.rd, gpu_commit_if.wb, warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.barrier}) ); - assign gpu_req_if.ready = 1'b1; + assign warp_ctl_if.valid = gpu_commit_if.valid && gpu_commit_if.ready; + assign warp_ctl_if.wid = gpu_commit_if.wid; + + // can accept new request? + assign gpu_req_if.ready = ~stall; endmodule \ No newline at end of file diff --git a/hw/rtl/VX_ibuffer.v b/hw/rtl/VX_ibuffer.v new file mode 100644 index 00000000..fd8b6a8d --- /dev/null +++ b/hw/rtl/VX_ibuffer.v @@ -0,0 +1,187 @@ +`include "VX_define.vh" + +module VX_ibuffer #( + parameter CORE_ID = 0 +) ( + input wire clk, + input wire reset, + + // inputs + input wire freeze, // do not switch to another warp + VX_decode_if ibuf_enq_if, + + // outputs + VX_decode_if ibuf_deq_if +); + localparam DATAW = `NUM_THREADS + 32 + `EX_BITS + `OP_BITS + `FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1 + 1 + `NUM_REGS; + localparam SIZE = `IBUF_SIZE; + + `USE_FAST_BRAM reg [DATAW-1:0] entries [`NUM_WARPS-1:0][SIZE-1:0]; + reg [`LOG2UP(SIZE+1)-1:0] size_r [`NUM_WARPS-1:0]; + reg [`LOG2UP(SIZE):0] rd_ptr_r [`NUM_WARPS-1:0]; + reg [`LOG2UP(SIZE):0] wr_ptr_r [`NUM_WARPS-1:0]; + + wire [`NUM_WARPS-1:0] q_full; + wire [`NUM_WARPS-1:0][`LOG2UP(SIZE+1)-1:0] q_size; + wire [DATAW-1:0] q_data_in; + wire [`NUM_WARPS-1:0][DATAW-1:0] q_data_prev; + reg [`NUM_WARPS-1:0][DATAW-1:0] q_data_out; + + wire enq_fire = ibuf_enq_if.valid && ibuf_enq_if.ready; + wire deq_fire = ibuf_deq_if.valid && ibuf_deq_if.ready; + + for (genvar i = 0; i < `NUM_WARPS; ++i) begin + + wire writing = enq_fire && (i == ibuf_enq_if.wid); + wire reading = deq_fire && (i == ibuf_deq_if.wid); + + wire [`LOG2UP(SIZE-1)-1:0] rd_ptr_a = rd_ptr_r[i][`LOG2UP(SIZE-1)-1:0]; + wire [`LOG2UP(SIZE-1)-1:0] wr_ptr_a = wr_ptr_r[i][`LOG2UP(SIZE-1)-1:0]; + + 
always @(posedge clk) begin /* per-warp queue state: the head entry is held in q_data_out[i], later entries are stored in entries[i] */ + if (reset) begin + rd_ptr_r[i] <= 0; + wr_ptr_r[i] <= 0; + size_r[i] <= 0; + end else begin + if (writing) begin + if ((0 == size_r[i]) || ((1 == size_r[i]) && reading)) begin /* queue empty, or its only entry is leaving this cycle: new data becomes the head directly */ + q_data_out[i] <= q_data_in; + end else begin + entries[i][wr_ptr_a] <= q_data_in; + wr_ptr_r[i] <= wr_ptr_r[i] + 1; + end + if (!reading) begin /* size is unchanged on a simultaneous enqueue and dequeue */ + size_r[i] <= size_r[i] + 1; + end + end + if (reading) begin + if (size_r[i] != 1) begin /* more entries remain: promote the next one to the head register */ + q_data_out[i] <= q_data_prev[i]; + rd_ptr_r[i] <= rd_ptr_r[i] + 1; + end + if (!writing) begin + size_r[i] <= size_r[i] - 1; + end + end + end + end + + assign q_data_prev[i] = (wr_ptr_r[i] != rd_ptr_r[i]) ? entries[i][rd_ptr_a] : q_data_in; /* compare only this warp's pointers; comparing the whole pointer arrays made the select depend on other warps' queues */ + assign q_full[i] = (size_r[i] == SIZE); + assign q_size[i] = size_r[i]; + end + + /////////////////////////////////////////////////////////////////////////// + + reg [`NUM_WARPS-1:0] valid_table, valid_table_n; + reg [`NUM_WARPS-1:0] ready_table, ready_table_n; + reg [`LOG2UP(`NUM_WARPS+1)-1:0] active_warps; + reg [`NW_BITS-1:0] deq_wid, deq_wid_n; + reg deq_valid, deq_valid_n; + reg [DATAW-1:0] deq_instr, deq_instr_n; + + always @(*) begin /* next valid_table: a warp stays valid after a dequeue only if it held more than one entry; an enqueue always marks it valid */ + valid_table_n = valid_table; + if (deq_fire) begin + valid_table_n[ibuf_deq_if.wid] = (q_size[ibuf_deq_if.wid] != 1); + end + if (enq_fire) begin + valid_table_n[ibuf_enq_if.wid] = 1; + end + end + + always @(*) begin /* select the next warp to dequeue: first set bit of ready_table_n */ + deq_wid_n = 0; + deq_valid_n = 0; deq_instr_n = deq_instr; /* default assignment so no latch is inferred when no warp is ready */ + ready_table_n = ready_table; + if (deq_fire) begin + ready_table_n[ibuf_deq_if.wid] = (q_size[ibuf_deq_if.wid] != 1); + end + for (integer i = 0; i < `NUM_WARPS; i++) begin + if (ready_table_n[i]) begin + deq_wid_n = `NW_BITS'(i); + deq_valid_n = 1; + deq_instr_n = (deq_fire && (ibuf_deq_if.wid == `NW_BITS'(i))) ? 
q_data_prev[i] : q_data_out[i]; + ready_table_n[i] = 0; + break; + end + end + end + + wire warp_added = enq_fire && (0 == q_size[ibuf_enq_if.wid]) && (!deq_fire || ibuf_enq_if.wid != ibuf_deq_if.wid); + wire warp_removed = deq_fire && (1 == q_size[ibuf_deq_if.wid]) && (!enq_fire || ibuf_enq_if.wid != ibuf_deq_if.wid); + + always @(posedge clk) begin + if (reset) begin + valid_table <= 0; + ready_table <= 0; + deq_valid <= 0; + active_warps <= 0; + end else begin + valid_table <= valid_table_n; + ready_table <= (| ready_table_n) ? ready_table_n : valid_table_n; + + if (enq_fire && (0 == active_warps)) begin + deq_valid <= 1; + deq_wid <= ibuf_enq_if.wid; + deq_instr <= q_data_in; + end else if (!freeze) begin + deq_valid <= deq_valid_n; + deq_wid <= deq_wid_n; + deq_instr <= deq_instr_n; + end + + if (warp_added && !warp_removed) begin + active_warps <= active_warps + 1; + end else if (warp_removed && !warp_added) begin + active_warps <= active_warps - 1; + end + + begin + integer k = 0; + for (integer i = 0; i < `NUM_WARPS; i++) begin + k += 32'(q_size[i] != 0); + end + assert(k == 32'(active_warps)); + assert(~deq_fire || active_warps != 0); + end + end + end + + assign ibuf_enq_if.ready = ~q_full[ibuf_enq_if.wid]; + assign q_data_in = {ibuf_enq_if.thread_mask, + ibuf_enq_if.curr_PC, + ibuf_enq_if.ex_type, + ibuf_enq_if.ex_op, + ibuf_enq_if.frm, + ibuf_enq_if.wb, + ibuf_enq_if.rd, + ibuf_enq_if.rs1, + ibuf_enq_if.rs2, + ibuf_enq_if.rs3, + ibuf_enq_if.imm, + ibuf_enq_if.rs1_is_PC, + ibuf_enq_if.rs2_is_imm, + ibuf_enq_if.use_rs3, + ibuf_enq_if.used_regs}; + + assign ibuf_deq_if.valid = deq_valid; + assign ibuf_deq_if.wid = deq_wid; + assign {ibuf_deq_if.thread_mask, + ibuf_deq_if.curr_PC, + ibuf_deq_if.ex_type, + ibuf_deq_if.ex_op, + ibuf_deq_if.frm, + ibuf_deq_if.wb, + ibuf_deq_if.rd, + ibuf_deq_if.rs1, + ibuf_deq_if.rs2, + ibuf_deq_if.rs3, + ibuf_deq_if.imm, + ibuf_deq_if.rs1_is_PC, + ibuf_deq_if.rs2_is_imm, + ibuf_deq_if.use_rs3, + ibuf_deq_if.used_regs} = 
deq_instr; + +endmodule \ No newline at end of file diff --git a/hw/rtl/VX_icache_stage.v b/hw/rtl/VX_icache_stage.v index 9af23468..ae621785 100644 --- a/hw/rtl/VX_icache_stage.v +++ b/hw/rtl/VX_icache_stage.v @@ -46,7 +46,7 @@ module VX_icache_stage #( assign ifetch_req_if.ready = icache_req_if.ready; `ifdef DBG_CORE_REQ_INFO - assign icache_req_if.tag = {ifetch_req_if.curr_PC, 1'b0, `NR_BITS'(0), ifetch_req_if.wid, req_tag}; + assign icache_req_if.tag = {ifetch_req_if.curr_PC, `NR_BITS'(0), ifetch_req_if.wid, req_tag}; `else assign icache_req_if.tag = req_tag; `endif diff --git a/hw/rtl/VX_instr_demux.v b/hw/rtl/VX_instr_demux.v new file mode 100644 index 00000000..9674b09d --- /dev/null +++ b/hw/rtl/VX_instr_demux.v @@ -0,0 +1,233 @@ +`include "VX_define.vh" + +module VX_instr_demux ( + input wire clk, + input wire reset, + + // inputs + VX_decode_if execute_if, + VX_gpr_read_if gpr_read_if, + VX_csr_to_issue_if csr_to_issue_if, + + // outputs + VX_alu_req_if alu_req_if, + VX_lsu_req_if lsu_req_if, + VX_csr_req_if csr_req_if, + VX_mul_req_if mul_req_if, + VX_fpu_req_if fpu_req_if, + VX_gpu_req_if gpu_req_if +); + // ALU unit + + wire alu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_ALU); + wire alu_req_ready; + + wire [`NT_BITS-1:0] tid; + VX_priority_encoder #( + .N(`NUM_THREADS) + ) tid_select ( + .data_in (execute_if.thread_mask), + .data_out (tid), + `UNUSED_PIN (valid_out) + ); + + VX_skid_buffer #( + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `ALU_BR_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS) + ) alu_reg ( + .clk (clk), + .reset (reset), + .ready_in (alu_req_ready), + .valid_in (alu_req_valid), + .data_in ({execute_if.wid, execute_if.thread_mask, execute_if.curr_PC, `ALU_BR_OP(execute_if.ex_op), execute_if.imm, execute_if.rs1_is_PC, execute_if.rs2_is_imm, execute_if.rd, execute_if.wb, tid}), + .data_out ({alu_req_if.wid, alu_req_if.thread_mask, alu_req_if.curr_PC, alu_req_if.op, alu_req_if.imm, alu_req_if.rs1_is_PC, alu_req_if.rs2_is_imm, 
alu_req_if.rd, alu_req_if.wb, alu_req_if.tid}), + .ready_out (alu_req_if.ready), + .valid_out (alu_req_if.valid) + ); + + VX_gpr_bypass #( + .DATAW ((2 * `NUM_THREADS * 32)) + ) alu_bypass ( + .clk (clk), + .reset (reset), + .push (alu_req_valid && alu_req_ready), + .data_in ({gpr_read_if.rs1_data, gpr_read_if.rs2_data}), + .data_out ({alu_req_if.rs1_data, alu_req_if.rs2_data}), + .pop (alu_req_if.valid && alu_req_if.ready) + ); + + // lsu unit + + wire lsu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_LSU); + wire lsu_req_ready; + + VX_skid_buffer #( + .DATAW (`NW_BITS + `NUM_THREADS + 32 + 1 + `BYTEEN_BITS + 32 + `NR_BITS + 1) + ) lsu_reg ( + .clk (clk), + .reset (reset), + .ready_in (lsu_req_ready), + .valid_in (lsu_req_valid), + .data_in ({execute_if.wid, execute_if.thread_mask, execute_if.curr_PC, `LSU_RW(execute_if.ex_op), `LSU_BE(execute_if.ex_op), execute_if.imm, execute_if.rd, execute_if.wb}), + .data_out ({lsu_req_if.wid, lsu_req_if.thread_mask, lsu_req_if.curr_PC, lsu_req_if.rw, lsu_req_if.byteen, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb}), + .ready_out (lsu_req_if.ready), + .valid_out (lsu_req_if.valid) + ); + + VX_gpr_bypass #( + .DATAW ((2 * `NUM_THREADS * 32)) + ) lsu_bypass ( + .clk (clk), + .reset (reset), + .push (lsu_req_valid && lsu_req_ready), + .data_in ({gpr_read_if.rs1_data, gpr_read_if.rs2_data}), + .data_out ({lsu_req_if.base_addr, lsu_req_if.store_data}), + .pop (lsu_req_if.valid && lsu_req_if.ready) + ); + + // csr unit + + wire csr_req_valid = execute_if.valid && (execute_if.ex_type == `EX_CSR); + wire csr_req_ready; + + VX_skid_buffer #( + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1) + ) csr_reg ( + .clk (clk), + .reset (reset), + .ready_in (csr_req_ready), + .valid_in (csr_req_valid), + .data_in ({execute_if.wid, execute_if.thread_mask, execute_if.curr_PC, `CSR_OP(execute_if.ex_op), execute_if.imm[`CSR_ADDR_BITS-1:0], execute_if.rd, execute_if.wb, 1'b0}), + .data_out 
({csr_req_if.wid, csr_req_if.thread_mask, csr_req_if.curr_PC, csr_req_if.op, csr_req_if.csr_addr, csr_req_if.rd, csr_req_if.wb, csr_req_if.is_io}), + .ready_out (csr_req_if.ready), + .valid_out (csr_req_if.valid) + ); + + reg tmp_rs2_is_imm; + reg [`NR_BITS-1:0] tmp_rs1; + + always @(posedge clk) begin + tmp_rs2_is_imm <= execute_if.rs2_is_imm; + tmp_rs1 <= execute_if.rs1; + end + + wire [31:0] csr_req_mask = tmp_rs2_is_imm ? 32'(tmp_rs1) : gpr_read_if.rs1_data[0]; + + VX_gpr_bypass #( + .DATAW (32) + ) csr_bypass ( + .clk (clk), + .reset (reset), + .push (csr_req_valid && csr_req_ready), + .data_in (csr_req_mask), + .data_out (csr_req_if.csr_mask), + .pop (csr_req_if.valid && csr_req_if.ready) + ); + + // mul unit + +`ifdef EXT_M_ENABLE + wire mul_req_valid = execute_if.valid && (execute_if.ex_type == `EX_MUL); + wire mul_req_ready; + + VX_skid_buffer #( + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `MUL_BITS + `NR_BITS + 1) + ) mul_reg ( + .clk (clk), + .reset (reset), + .ready_in (mul_req_ready), + .valid_in (mul_req_valid), + .data_in ({execute_if.wid, execute_if.thread_mask, execute_if.curr_PC, `MUL_OP(execute_if.ex_op), execute_if.rd, execute_if.wb}), + .data_out ({mul_req_if.wid, mul_req_if.thread_mask, mul_req_if.curr_PC, mul_req_if.op, mul_req_if.rd, mul_req_if.wb}), + .ready_out (mul_req_if.ready), + .valid_out (mul_req_if.valid) + ); + + VX_gpr_bypass #( + .DATAW ((2 * `NUM_THREADS * 32)) + ) mul_bypass ( + .clk (clk), + .reset (reset), + .push (mul_req_valid && mul_req_ready), + .data_in ({gpr_read_if.rs1_data, gpr_read_if.rs2_data}), + .data_out ({mul_req_if.rs1_data, mul_req_if.rs2_data}), + .pop (mul_req_if.valid && mul_req_if.ready) + ); +`endif + + // fpu unit + +`ifdef EXT_F_ENABLE + wire fpu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_FPU); + wire fpu_req_ready; + + // resolve dynamic FRM + assign csr_to_issue_if.wid = execute_if.wid; + wire [`FRM_BITS-1:0] fpu_frm = (execute_if.frm == `FRM_DYN) ? 
csr_to_issue_if.frm : execute_if.frm; + + VX_skid_buffer #( + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `FPU_BITS + `FRM_BITS + `NR_BITS + 1) + ) fpu_reg ( + .clk (clk), + .reset (reset), + .ready_in (fpu_req_ready), + .valid_in (fpu_req_valid), + .data_in ({execute_if.wid, execute_if.thread_mask, execute_if.curr_PC, `FPU_OP(execute_if.ex_op), fpu_frm, execute_if.rd, execute_if.wb}), + .data_out ({fpu_req_if.wid, fpu_req_if.thread_mask, fpu_req_if.curr_PC, fpu_req_if.op, fpu_req_if.frm, fpu_req_if.rd, fpu_req_if.wb}), + .ready_out (fpu_req_if.ready), + .valid_out (fpu_req_if.valid) + ); + + VX_gpr_bypass #( + .DATAW ((3 * `NUM_THREADS * 32)) + ) fpu_bypass ( + .clk (clk), + .reset (reset), + .push (fpu_req_valid && fpu_req_ready), + .data_in ({gpr_read_if.rs1_data, gpr_read_if.rs2_data, gpr_read_if.rs3_data}), + .data_out ({fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}), + .pop (fpu_req_if.valid && fpu_req_if.ready) + ); +`endif + + // gpu unit + + wire gpu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_GPU); + wire gpu_req_ready; + + VX_skid_buffer #( + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `GPU_BITS + `NR_BITS + 1) + ) gpu_reg ( + .clk (clk), + .reset (reset), + .ready_in (gpu_req_ready), + .valid_in (gpu_req_valid), + .data_in ({execute_if.wid, execute_if.thread_mask, execute_if.curr_PC, `GPU_OP(execute_if.ex_op), execute_if.rd, execute_if.wb}), + .data_out ({gpu_req_if.wid, gpu_req_if.thread_mask, gpu_req_if.curr_PC, gpu_req_if.op, gpu_req_if.rd, gpu_req_if.wb}), + .ready_out (gpu_req_if.ready), + .valid_out (gpu_req_if.valid) + ); + + VX_gpr_bypass #( + .DATAW ((`NUM_THREADS * 32) + 32) + ) gpu_bypass ( + .clk (clk), + .reset (reset), + .push (gpu_req_valid && gpu_req_ready), + .data_in ({gpr_read_if.rs1_data, gpr_read_if.rs2_data[0]}), + .data_out ({gpu_req_if.rs1_data, gpu_req_if.rs2_data}), + .pop (gpu_req_if.valid && gpu_req_if.ready) + ); + + // can take next request? 
+ assign execute_if.ready = (alu_req_ready && (execute_if.ex_type == `EX_ALU)) + || (lsu_req_ready && (execute_if.ex_type == `EX_LSU)) + || (csr_req_ready && (execute_if.ex_type == `EX_CSR)) + `ifdef EXT_M_ENABLE + || (mul_req_ready && (execute_if.ex_type == `EX_MUL)) + `endif + `ifdef EXT_F_ENABLE + || (fpu_req_ready && (execute_if.ex_type == `EX_FPU)) + `endif + || (gpu_req_ready && (execute_if.ex_type == `EX_GPU)); + +endmodule \ No newline at end of file diff --git a/hw/rtl/VX_issue.v b/hw/rtl/VX_issue.v index 6dcc1f7e..bb50513b 100644 --- a/hw/rtl/VX_issue.v +++ b/hw/rtl/VX_issue.v @@ -7,53 +7,51 @@ module VX_issue #( input wire reset, VX_decode_if decode_if, - VX_wb_if writeback_if, - VX_cmt_to_issue_if cmt_to_issue_if, + VX_writeback_if writeback_if, + VX_csr_to_issue_if csr_to_issue_if, VX_alu_req_if alu_req_if, - VX_bru_req_if bru_req_if, VX_lsu_req_if lsu_req_if, VX_csr_req_if csr_req_if, VX_mul_req_if mul_req_if, VX_fpu_req_if fpu_req_if, VX_gpu_req_if gpu_req_if ); - - wire [`ISTAG_BITS-1:0] issue_tag; - wire schedule_delay; - + VX_decode_if ibuf_deq_if(); + VX_decode_if execute_if(); VX_gpr_read_if gpr_read_if(); - assign gpr_read_if.valid = decode_if.valid && ~schedule_delay; - assign gpr_read_if.wid = decode_if.wid; - assign gpr_read_if.rs1 = decode_if.rs1; - assign gpr_read_if.rs2 = decode_if.rs2; - assign gpr_read_if.rs3 = decode_if.rs3; - assign gpr_read_if.use_rs3 = decode_if.use_rs3; - wire ex_busy = (~alu_req_if.ready && (decode_if.ex_type == `EX_ALU)) - || (~bru_req_if.ready && (decode_if.ex_type == `EX_BRU)) - || (~lsu_req_if.ready && (decode_if.ex_type == `EX_LSU)) - || (~csr_req_if.ready && (decode_if.ex_type == `EX_CSR)) - `ifdef EXT_M_ENABLE - || (~mul_req_if.ready && (decode_if.ex_type == `EX_MUL)) - `endif - `ifdef EXT_F_ENABLE - || (~fpu_req_if.ready && (decode_if.ex_type == `EX_FPU)) - `endif - || (~gpu_req_if.ready && (decode_if.ex_type == `EX_GPU)); + wire scoreboard_delay; + + VX_ibuffer #( + .CORE_ID(CORE_ID) + ) ibuffer ( + .clk 
(clk), + .reset (reset), + .ibuf_enq_if (decode_if), + .ibuf_deq_if (ibuf_deq_if), + .freeze (~gpr_read_if.ready_in) + ); VX_scoreboard #( .CORE_ID(CORE_ID) ) scoreboard ( .clk (clk), .reset (reset), - .decode_if (decode_if), + .ibuf_deq_if (ibuf_deq_if), .writeback_if (writeback_if), - .cmt_to_issue_if(cmt_to_issue_if), - .ex_busy (ex_busy), - .issue_tag (issue_tag), - .schedule_delay (schedule_delay) + .exe_delay (~execute_if.ready), + .gpr_delay (~gpr_read_if.ready_in), + .delay (scoreboard_delay) ); + + assign gpr_read_if.valid = ibuf_deq_if.valid && ~scoreboard_delay; + assign gpr_read_if.wid = ibuf_deq_if.wid; + assign gpr_read_if.rs1 = ibuf_deq_if.rs1; + assign gpr_read_if.rs2 = ibuf_deq_if.rs2; + assign gpr_read_if.rs3 = ibuf_deq_if.rs3; + assign gpr_read_if.use_rs3 = ibuf_deq_if.use_rs3; + assign gpr_read_if.ready_out = execute_if.ready; VX_gpr_stage #( .CORE_ID(CORE_ID) @@ -63,72 +61,54 @@ module VX_issue #( .writeback_if (writeback_if), .gpr_read_if (gpr_read_if) ); - - VX_issue_if issue_if(); - - assign issue_if.rs1_data = gpr_read_if.rs1_data; - assign issue_if.rs2_data = gpr_read_if.rs2_data; - assign issue_if.rs3_data = gpr_read_if.rs3_data; - - wire [`NT_BITS-1:0] tid; - VX_priority_encoder #( - .N(`NUM_THREADS) - ) sel_src ( - .data_in (decode_if.thread_mask), - .data_out (tid), - `UNUSED_PIN (valid_out) - ); - - wire stall = schedule_delay || ~gpr_read_if.ready; - wire flush = stall; // clear output on stall - - VX_generic_register #( - .N(1 + `ISTAG_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + `NR_BITS + 32 + 1 + 1 + `EX_BITS + `OP_BITS + 1 + `FRM_BITS + `NT_BITS) - ) issue_reg ( - .clk (clk), - .reset (reset), - .stall (stall), - .flush (flush), - .in ({decode_if.valid, issue_tag, decode_if.wid, decode_if.thread_mask, decode_if.curr_PC, decode_if.rd, decode_if.rs1, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.ex_type, decode_if.ex_op, decode_if.wb, decode_if.frm, tid}), - .out ({issue_if.valid, issue_if.issue_tag, 
issue_if.wid, issue_if.thread_mask, issue_if.curr_PC, issue_if.rd, issue_if.rs1, issue_if.imm, issue_if.rs1_is_PC, issue_if.rs2_is_imm, issue_if.ex_type, issue_if.ex_op, issue_if.wb, issue_if.frm, issue_if.tid}) - ); - - assign decode_if.ready = issue_if.ready; - assign issue_if.ready = ~stall; - VX_issue_demux issue_demux ( - .issue_if (issue_if), - .alu_req_if (alu_req_if), - .bru_req_if (bru_req_if), - .lsu_req_if (lsu_req_if), - .csr_req_if (csr_req_if), - .mul_req_if (mul_req_if), - .fpu_req_if (fpu_req_if), - .gpu_req_if (gpu_req_if) - ); + assign execute_if.valid = ibuf_deq_if.valid && gpr_read_if.ready_in && ~scoreboard_delay; + assign execute_if.wid = ibuf_deq_if.wid; + assign execute_if.thread_mask = ibuf_deq_if.thread_mask; + assign execute_if.curr_PC = ibuf_deq_if.curr_PC; + assign execute_if.ex_type = ibuf_deq_if.ex_type; + assign execute_if.ex_op = ibuf_deq_if.ex_op; + assign execute_if.frm = ibuf_deq_if.frm; + assign execute_if.wb = ibuf_deq_if.wb; + assign execute_if.rd = ibuf_deq_if.rd; + assign execute_if.rs1 = ibuf_deq_if.rs1; + assign execute_if.imm = ibuf_deq_if.imm; + assign execute_if.rs1_is_PC = ibuf_deq_if.rs1_is_PC; + assign execute_if.rs2_is_imm = ibuf_deq_if.rs2_is_imm; + + VX_instr_demux instr_demux ( + .clk (clk), + .reset (reset), + .execute_if (execute_if), + .gpr_read_if (gpr_read_if), + .csr_to_issue_if(csr_to_issue_if), + .alu_req_if (alu_req_if), + .lsu_req_if (lsu_req_if), + .csr_req_if (csr_req_if), + .mul_req_if (mul_req_if), + .fpu_req_if (fpu_req_if), + .gpu_req_if (gpu_req_if) + ); `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin if (alu_req_if.valid && alu_req_if.ready) begin - $display("%t: Core%0d-issue: wid=%0d, PC=%0h, ex=ALU, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, alu_req_if.wid, alu_req_if.curr_PC, alu_req_if.issue_tag, alu_req_if.thread_mask, alu_req_if.rs1_data, alu_req_if.rs2_data); - end - if (bru_req_if.valid && bru_req_if.ready) begin - $display("%t: Core%0d-issue: wid=%0d, 
PC=%0h, ex=BRU, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h, offset=%0h", $time, CORE_ID, bru_req_if.wid, bru_req_if.curr_PC, bru_req_if.issue_tag, bru_req_if.thread_mask, bru_req_if.rs1_data, bru_req_if.rs2_data, bru_req_if.offset); + $display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=ALU, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, alu_req_if.wid, alu_req_if.curr_PC, alu_req_if.thread_mask, alu_req_if.rs1_data, alu_req_if.rs2_data); end if (lsu_req_if.valid && lsu_req_if.ready) begin - $display("%t: Core%0d-issue: wid=%0d, PC=%0h, ex=LSU, istag=%0d, tmask=%b, rw=%b, byteen=%b, baddr=%0h, offset=%0h, data=%0h", $time, CORE_ID, lsu_req_if.wid, lsu_req_if.curr_PC, lsu_req_if.issue_tag, lsu_req_if.thread_mask, lsu_req_if.rw, lsu_req_if.byteen, lsu_req_if.base_addr, lsu_req_if.offset, lsu_req_if.store_data); + $display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=LSU, tmask=%b, rw=%b, byteen=%b, baddr=%0h, offset=%0h, data=%0h", $time, CORE_ID, lsu_req_if.wid, lsu_req_if.curr_PC, lsu_req_if.thread_mask, lsu_req_if.rw, lsu_req_if.byteen, lsu_req_if.base_addr, lsu_req_if.offset, lsu_req_if.store_data); end if (csr_req_if.valid && csr_req_if.ready) begin - $display("%t: Core%0d-issue: wid=%0d, PC=%0h, ex=CSR, istag=%0d, tmask=%b, addr=%0h, mask=%0h", $time, CORE_ID, csr_req_if.wid, csr_req_if.curr_PC, csr_req_if.issue_tag, csr_req_if.thread_mask, csr_req_if.csr_addr, csr_req_if.csr_mask); + $display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=CSR, tmask=%b, addr=%0h, mask=%0h", $time, CORE_ID, csr_req_if.wid, csr_req_if.curr_PC, csr_req_if.thread_mask, csr_req_if.csr_addr, csr_req_if.csr_mask); end if (mul_req_if.valid && mul_req_if.ready) begin - $display("%t: Core%0d-issue: wid=%0d, PC=%0h, ex=MUL, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, mul_req_if.wid, mul_req_if.curr_PC, mul_req_if.issue_tag, mul_req_if.thread_mask, mul_req_if.rs1_data, mul_req_if.rs2_data); + $display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=MUL, tmask=%b, rs1_data=%0h, 
rs2_data=%0h", $time, CORE_ID, mul_req_if.wid, mul_req_if.curr_PC, mul_req_if.thread_mask, mul_req_if.rs1_data, mul_req_if.rs2_data); end if (fpu_req_if.valid && fpu_req_if.ready) begin - $display("%t: Core%0d-issue: wid=%0d, PC=%0h, ex=FPU, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h, rs3_data=%0h", $time, CORE_ID, fpu_req_if.wid, fpu_req_if.curr_PC, fpu_req_if.issue_tag, fpu_req_if.thread_mask, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data); + $display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=FPU, tmask=%b, rs1_data=%0h, rs2_data=%0h, rs3_data=%0h", $time, CORE_ID, fpu_req_if.wid, fpu_req_if.curr_PC, fpu_req_if.thread_mask, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data); end if (gpu_req_if.valid && gpu_req_if.ready) begin - $display("%t: Core%0d-issue: wid=%0d, PC=%0h, ex=GPU, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, gpu_req_if.wid, gpu_req_if.curr_PC, gpu_req_if.issue_tag, gpu_req_if.thread_mask, gpu_req_if.rs1_data, gpu_req_if.rs2_data); + $display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=GPU, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, gpu_req_if.wid, gpu_req_if.curr_PC, gpu_req_if.thread_mask, gpu_req_if.rs1_data, gpu_req_if.rs2_data); end end `endif diff --git a/hw/rtl/VX_issue_demux.v b/hw/rtl/VX_issue_demux.v deleted file mode 100644 index 16b0dd85..00000000 --- a/hw/rtl/VX_issue_demux.v +++ /dev/null @@ -1,102 +0,0 @@ -`include "VX_define.vh" - -module VX_issue_demux ( - // inputs - VX_issue_if issue_if, - - // outputs - VX_alu_req_if alu_req_if, - VX_bru_req_if bru_req_if, - VX_lsu_req_if lsu_req_if, - VX_csr_req_if csr_req_if, - VX_mul_req_if mul_req_if, - VX_fpu_req_if fpu_req_if, - VX_gpu_req_if gpu_req_if -); - // ALU unit - assign alu_req_if.valid = issue_if.valid && (issue_if.ex_type == `EX_ALU); - assign alu_req_if.issue_tag = issue_if.issue_tag; - assign alu_req_if.wid = issue_if.wid; - assign alu_req_if.thread_mask = issue_if.thread_mask; - assign alu_req_if.curr_PC = 
issue_if.curr_PC; - assign alu_req_if.op = `ALU_OP(issue_if.ex_op); - assign alu_req_if.rs1_is_PC = issue_if.rs1_is_PC; - assign alu_req_if.rs2_is_imm = issue_if.rs2_is_imm; - assign alu_req_if.imm = issue_if.imm; - assign alu_req_if.rs1_data = issue_if.rs1_data; - assign alu_req_if.rs2_data = issue_if.rs2_data; - - // BRU unit - assign bru_req_if.valid = issue_if.valid && (issue_if.ex_type == `EX_BRU); - assign bru_req_if.issue_tag = issue_if.issue_tag; - assign bru_req_if.wid = issue_if.wid; - assign bru_req_if.thread_mask = issue_if.thread_mask; - assign bru_req_if.curr_PC = issue_if.curr_PC; - assign bru_req_if.op = `BRU_OP(issue_if.ex_op); - assign bru_req_if.rs1_is_PC = issue_if.rs1_is_PC; - assign bru_req_if.rs1_data = issue_if.rs1_data[issue_if.tid]; - assign bru_req_if.rs2_data = issue_if.rs2_data[issue_if.tid]; - assign bru_req_if.offset = issue_if.imm; - - // LSU unit - assign lsu_req_if.valid = issue_if.valid && (issue_if.ex_type == `EX_LSU); - assign lsu_req_if.issue_tag = issue_if.issue_tag; - assign lsu_req_if.wid = issue_if.wid; - assign lsu_req_if.thread_mask = issue_if.thread_mask; - assign lsu_req_if.curr_PC = issue_if.curr_PC; - assign lsu_req_if.rw = `LSU_RW(issue_if.ex_op); - assign lsu_req_if.byteen = `LSU_BE(issue_if.ex_op); - assign lsu_req_if.base_addr = issue_if.rs1_data; - assign lsu_req_if.store_data = issue_if.rs2_data; - assign lsu_req_if.offset = issue_if.imm; - assign lsu_req_if.rd = issue_if.rd; - assign lsu_req_if.wb = issue_if.wb; - - // CSR unit - assign csr_req_if.valid = issue_if.valid && (issue_if.ex_type == `EX_CSR); - assign csr_req_if.issue_tag = issue_if.issue_tag; - assign csr_req_if.wid = issue_if.wid; - assign csr_req_if.thread_mask = issue_if.thread_mask; - assign csr_req_if.curr_PC = issue_if.curr_PC; - assign csr_req_if.op = `CSR_OP(issue_if.ex_op); - assign csr_req_if.csr_addr = issue_if.imm[`CSR_ADDR_BITS-1:0]; - assign csr_req_if.csr_mask = issue_if.rs2_is_imm ? 
32'(issue_if.rs1) : issue_if.rs1_data[0]; - assign csr_req_if.is_io = 1'b0; - - // MUL unit -`ifdef EXT_M_ENABLE - assign mul_req_if.valid = issue_if.valid && (issue_if.ex_type == `EX_MUL); - assign mul_req_if.issue_tag = issue_if.issue_tag; - assign mul_req_if.wid = issue_if.wid; - assign mul_req_if.thread_mask = issue_if.thread_mask; - assign mul_req_if.curr_PC = issue_if.curr_PC; - assign mul_req_if.op = `MUL_OP(issue_if.ex_op); - assign mul_req_if.rs1_data = issue_if.rs1_data; - assign mul_req_if.rs2_data = issue_if.rs2_data; -`endif - - // FPU unit -`ifdef EXT_F_ENABLE - assign fpu_req_if.valid = issue_if.valid && (issue_if.ex_type == `EX_FPU); - assign fpu_req_if.issue_tag = issue_if.issue_tag; - assign fpu_req_if.wid = issue_if.wid; - assign fpu_req_if.thread_mask = issue_if.thread_mask; - assign fpu_req_if.curr_PC = issue_if.curr_PC; - assign fpu_req_if.op = `FPU_OP(issue_if.ex_op); - assign fpu_req_if.frm = issue_if.frm; - assign fpu_req_if.rs1_data = issue_if.rs1_data; - assign fpu_req_if.rs2_data = issue_if.rs2_data; - assign fpu_req_if.rs3_data = issue_if.rs3_data; -`endif - - // GPU unit - assign gpu_req_if.valid = issue_if.valid && (issue_if.ex_type == `EX_GPU); - assign gpu_req_if.issue_tag = issue_if.issue_tag; - assign gpu_req_if.wid = issue_if.wid; - assign gpu_req_if.thread_mask = issue_if.thread_mask; - assign gpu_req_if.curr_PC = issue_if.curr_PC; - assign gpu_req_if.op = `GPU_OP(issue_if.ex_op); - assign gpu_req_if.rs1_data = issue_if.rs1_data; - assign gpu_req_if.rs2_data = issue_if.rs2_data[0]; - -endmodule \ No newline at end of file diff --git a/hw/rtl/VX_lsu_unit.v b/hw/rtl/VX_lsu_unit.v index 9b7ecc8f..2463bd67 100644 --- a/hw/rtl/VX_lsu_unit.v +++ b/hw/rtl/VX_lsu_unit.v @@ -18,10 +18,6 @@ module VX_lsu_unit #( // outputs VX_exu_to_cmt_if lsu_commit_if ); - - wire valid_in; - wire ready_in; - wire [`NUM_THREADS-1:0] req_thread_mask; wire req_rw; wire [`NUM_THREADS-1:0][29:0] req_addr; @@ -30,10 +26,9 @@ module VX_lsu_unit #( wire 
[`NUM_THREADS-1:0][31:0] req_data; wire [1:0] req_sext; wire [`NR_BITS-1:0] req_rd; - wire [`NW_BITS-1:0] req_wid; - wire [`ISTAG_BITS-1:0] req_issue_tag; wire req_wb; - wire [31:0] req_pc; + wire [`NW_BITS-1:0] req_wid; + wire [31:0] req_curr_PC; wire [`NUM_THREADS-1:0][31:0] full_address; for (genvar i = 0; i < `NUM_THREADS; i++) begin @@ -74,121 +69,127 @@ module VX_lsu_unit #( wire [`NUM_THREADS-1:0][31:0] req_address; `IGNORE_WARNINGS_END - // use a skid buffer because the dcache's ready signal is combinational - // use buffer size of two for stall-free execution - VX_elastic_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + `ISTAG_BITS + (`NUM_THREADS * 32) + 2 + 1 + (`NUM_THREADS * (30 + 2 + 4 + 32)) + `NR_BITS + 1 + 32), - .SIZE (2) - ) input_buffer ( - .clk (clk), - .reset (reset), - .valid_in (lsu_req_if.valid), - .ready_in (lsu_req_if.ready), - .data_in ({lsu_req_if.wid, lsu_req_if.thread_mask, lsu_req_if.issue_tag, full_address, mem_req_sext, lsu_req_if.rw, mem_req_addr, mem_req_offset, mem_req_byteen, mem_req_data, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.curr_PC}), - .data_out ({req_wid, req_thread_mask, req_issue_tag, req_address, req_sext, req_rw, req_addr, req_offset, req_byteen, req_data, req_rd, req_wb, req_pc}), - .ready_out (ready_in), - .valid_out (valid_in) + wire valid_in; + wire stall_in; + + VX_generic_register #( + .N(1 + `NW_BITS + `NUM_THREADS + 32 + 1 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 2 + (`NUM_THREADS * (30 + 2 + 4 + 32))) + ) lsu_req_reg ( + .clk (clk), + .reset (reset), + .stall (stall_in), + .flush (0), + .in ({lsu_req_if.valid, lsu_req_if.wid, lsu_req_if.thread_mask, lsu_req_if.curr_PC, lsu_req_if.rw, lsu_req_if.rd, lsu_req_if.wb, full_address, mem_req_sext, mem_req_addr, mem_req_offset, mem_req_byteen, mem_req_data}), + .out ({valid_in, req_wid, req_thread_mask, req_curr_PC, req_rw, req_rd, req_wb, req_address, req_sext, req_addr, req_offset, req_byteen, req_data}) ); - reg [`ISSUEQ_SIZE-1:0][`NUM_THREADS-1:0] mem_rsp_mask_buf; 
- reg [`ISSUEQ_SIZE-1:0][`NUM_THREADS-1:0][31:0] mem_rsp_data_prev_buf; + wire [`NW_BITS-1:0] rsp_wid; + wire [31:0] rsp_curr_PC; + wire [`NR_BITS-1:0] rsp_rd; + wire rsp_wb; + wire [`NUM_THREADS-1:0][1:0] rsp_offset; + wire [1:0] rsp_sext; + reg [`NUM_THREADS-1:0][31:0] rsp_data; - reg [`NUM_THREADS-1:0][1:0] mem_rsp_offset_buf [`ISSUEQ_SIZE-1:0]; - reg [1:0] mem_rsp_sext_buf [`ISSUEQ_SIZE-1:0]; - reg [`NW_BITS-1:0] mem_rsp_wid_buf [`ISSUEQ_SIZE-1:0]; - reg [31:0] mem_rsp_curr_PC_buf [`ISSUEQ_SIZE-1:0]; - reg [`NR_BITS-1:0] mem_rsp_rd_buf [`ISSUEQ_SIZE-1:0]; + reg [`NUM_THREADS-1:0] mem_rsp_mask[`LSUQ_SIZE-1:0]; - reg [`NUM_THREADS-1:0][31:0] mem_rsp_data_curr; + wire [`DCORE_TAG_ID_BITS-1:0] req_tag, rsp_tag; + wire lsuq_full; - wire [`ISTAG_BITS-1:0] rsp_issue_tag = dcache_rsp_if.tag[0][`ISTAG_BITS-1:0]; + wire lsuq_push = (| dcache_req_if.valid) && dcache_req_if.ready + && (0 == req_rw); // only loads - wire [`NUM_THREADS-1:0] mem_rsp_mask = mem_rsp_mask_buf [rsp_issue_tag]; - wire [`NUM_THREADS-1:0][1:0] mem_rsp_offset = mem_rsp_offset_buf [rsp_issue_tag]; - wire [1:0] mem_rsp_sext = mem_rsp_sext_buf [rsp_issue_tag]; - wire [`NUM_THREADS-1:0][31:0] mem_rsp_data_prev= mem_rsp_data_prev_buf [rsp_issue_tag]; - wire [`NW_BITS-1:0] mem_rsp_wid = mem_rsp_wid_buf [rsp_issue_tag]; - wire [31:0] mem_rsp_curr_PC = mem_rsp_curr_PC_buf [rsp_issue_tag]; - wire [`NR_BITS-1:0] mem_rsp_rd = mem_rsp_rd_buf [rsp_issue_tag]; + wire lsuq_pop_part = (| dcache_rsp_if.valid) && dcache_rsp_if.ready; + + assign rsp_tag = dcache_rsp_if.tag[0][`DCORE_TAG_ID_BITS-1:0]; - wire dcache_req_fire = (| dcache_req_if.valid) && dcache_req_if.ready; - wire dcache_rsp_fire = (| dcache_rsp_if.valid) && dcache_rsp_if.ready; + wire [`NUM_THREADS-1:0] mem_rsp_mask_upd = mem_rsp_mask[rsp_tag] & ~dcache_rsp_if.valid; - wire [`NUM_THREADS-1:0] mem_rsp_mask_n = mem_rsp_mask & ~dcache_rsp_if.valid; + wire lsuq_pop = lsuq_pop_part && (0 == mem_rsp_mask_upd); + + VX_cam_buffer #( + .DATAW (`NW_BITS + 32 + 
`NR_BITS + 1 + (`NUM_THREADS * 2) + 2), + .SIZE (`LSUQ_SIZE) + ) lsu_queue ( + .clk (clk), + .reset (reset), + .write_addr (req_tag), + .acquire_slot (lsuq_push), + .read_addr (rsp_tag), + .write_data ({req_wid, req_curr_PC, req_rd, req_wb, req_offset, req_sext}), + .read_data ({rsp_wid, rsp_curr_PC, rsp_rd, rsp_wb, rsp_offset, rsp_sext}), + .release_addr (rsp_tag), + .release_slot (lsuq_pop), + .full (lsuq_full) + ); always @(posedge clk) begin - if (dcache_req_fire && (0 == req_rw)) begin - mem_rsp_mask_buf [req_issue_tag] <= req_thread_mask; - mem_rsp_data_prev_buf [req_issue_tag] <= 0; + if (lsuq_push) begin + mem_rsp_mask[req_tag] <= req_thread_mask; end - if (dcache_rsp_fire) begin - mem_rsp_mask_buf [rsp_issue_tag] <= mem_rsp_mask_n; - mem_rsp_data_prev_buf [rsp_issue_tag] <= mem_rsp_data_curr | mem_rsp_data_prev; + if (lsuq_pop_part) begin + mem_rsp_mask[rsp_tag] <= mem_rsp_mask_upd; end end - always @(posedge clk) begin - if (dcache_req_fire && (0 == req_rw)) begin - mem_rsp_offset_buf [req_issue_tag] <= req_offset; - mem_rsp_sext_buf [req_issue_tag] <= req_sext; - mem_rsp_wid_buf [req_issue_tag] <= req_wid; - mem_rsp_curr_PC_buf [req_issue_tag] <= req_pc; - mem_rsp_rd_buf [req_issue_tag] <= req_rd; - end - end - - wire stall_in; + wire store_stall = valid_in && req_rw && stall_out; // Core Request - assign dcache_req_if.valid = {`NUM_THREADS{valid_in && ~stall_in}} & req_thread_mask; + assign dcache_req_if.valid = {`NUM_THREADS{valid_in && ~lsuq_full && ~store_stall}} & req_thread_mask; assign dcache_req_if.rw = {`NUM_THREADS{req_rw}}; assign dcache_req_if.byteen = req_byteen; assign dcache_req_if.addr = req_addr; assign dcache_req_if.data = req_data; - assign ready_in = dcache_req_if.ready && ~stall_in; - `ifdef DBG_CORE_REQ_INFO - assign dcache_req_if.tag = {req_pc, req_wb, req_rd, req_wid, req_issue_tag}; + assign dcache_req_if.tag = {req_curr_PC, req_rd, req_wid, req_tag}; `else - assign dcache_req_if.tag = req_issue_tag; + assign dcache_req_if.tag = 
req_tag; `endif + assign stall_in = ~dcache_req_if.ready || lsuq_full || store_stall; + + // Can accept new request? + assign lsu_req_if.ready = ~stall_in; + // Core Response for (genvar i = 0; i < `NUM_THREADS; i++) begin - wire [31:0] rsp_data_shifted = dcache_rsp_if.data[i] >> {mem_rsp_offset[i], 3'b0}; + wire [31:0] rsp_data_shifted = dcache_rsp_if.data[i] >> {rsp_offset[i], 3'b0}; always @(*) begin - case (mem_rsp_sext) - 1: mem_rsp_data_curr[i] = {{24{rsp_data_shifted[7]}}, rsp_data_shifted[7:0]}; - 2: mem_rsp_data_curr[i] = {{16{rsp_data_shifted[15]}}, rsp_data_shifted[15:0]}; - default: mem_rsp_data_curr[i] = rsp_data_shifted; + case (rsp_sext) + 1: rsp_data[i] = {{24{rsp_data_shifted[7]}}, rsp_data_shifted[7:0]}; + 2: rsp_data[i] = {{16{rsp_data_shifted[15]}}, rsp_data_shifted[15:0]}; + default: rsp_data[i] = rsp_data_shifted; endcase end end - reg is_load_rsp; - reg [`NUM_THREADS-1:0][31:0] load_data; - reg [`ISTAG_BITS-1:0] rsp_issue_tag_r; + wire is_store_req = valid_in && ~lsuq_full && req_rw && dcache_req_if.ready; + wire is_load_rsp = (| dcache_rsp_if.valid); - always @(posedge clk) begin - if (reset) begin - is_load_rsp <= 0; - end else begin - is_load_rsp <= dcache_rsp_fire && (0 == mem_rsp_mask_n); - load_data <= mem_rsp_data_curr | mem_rsp_data_prev; - rsp_issue_tag_r <= rsp_issue_tag; - end - end + wire stall_out = ~lsu_commit_if.ready && lsu_commit_if.valid; + wire mem_rsp_stall = is_load_rsp && is_store_req; // arbitration prioritizes stores - wire is_store_req = dcache_req_fire && req_rw; - assign stall_in = is_load_rsp && valid_in && req_rw; // LOAD has priority + wire arb_valid = is_store_req || is_load_rsp; + wire [`NW_BITS-1:0] arb_wid = is_store_req ? req_wid : rsp_wid; + wire [`NUM_THREADS-1:0] arb_thread_mask = is_store_req ? req_thread_mask : dcache_rsp_if.valid; + wire [31:0] arb_curr_PC = is_store_req ? req_curr_PC : rsp_curr_PC; + wire [`NR_BITS-1:0] arb_rd = is_store_req ? 0 : rsp_rd; + wire arb_wb = is_store_req ? 
0 : rsp_wb; - assign lsu_commit_if.valid = is_load_rsp || is_store_req; - assign lsu_commit_if.issue_tag = is_load_rsp ? rsp_issue_tag_r : req_issue_tag; - assign lsu_commit_if.data = load_data; + VX_generic_register #( + .N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)) + ) lsu_rsp_reg ( + .clk (clk), + .reset (reset), + .stall (stall_out), + .flush (1'b0), + .in ({arb_valid, arb_wid, arb_thread_mask, arb_curr_PC, arb_rd, arb_wb, rsp_data}), + .out ({lsu_commit_if.valid, lsu_commit_if.wid, lsu_commit_if.thread_mask, lsu_commit_if.curr_PC, lsu_commit_if.rd, lsu_commit_if.wb, lsu_commit_if.data}) + ); // Can accept new cache response? - assign dcache_rsp_if.ready = 1'b1; + assign dcache_rsp_if.ready = ~(stall_out || mem_rsp_stall); // scope registration `SCOPE_ASSIGN (scope_dcache_req_valid, dcache_req_if.valid); @@ -198,28 +199,23 @@ module VX_lsu_unit #( `SCOPE_ASSIGN (scope_dcache_req_data, dcache_req_if.data); `SCOPE_ASSIGN (scope_dcache_req_tag, dcache_req_if.tag); `SCOPE_ASSIGN (scope_dcache_req_ready, dcache_req_if.ready); - `SCOPE_ASSIGN (scope_dcache_req_wid, req_wid); + `SCOPE_ASSIGN (scope_dcache_req_wid, req_wid); `SCOPE_ASSIGN (scope_dcache_req_curr_PC, req_pc); `SCOPE_ASSIGN (scope_dcache_rsp_valid, dcache_rsp_if.valid); `SCOPE_ASSIGN (scope_dcache_rsp_data, dcache_rsp_if.data); `SCOPE_ASSIGN (scope_dcache_rsp_tag, dcache_rsp_if.tag); `SCOPE_ASSIGN (scope_dcache_rsp_ready, dcache_rsp_if.ready); - - `UNUSED_VAR (mem_rsp_wid) - `UNUSED_VAR (mem_rsp_curr_PC) - `UNUSED_VAR (mem_rsp_rd) - `UNUSED_VAR (req_wb) `ifdef DBG_PRINT_CORE_DCACHE always @(posedge clk) begin if ((| dcache_req_if.valid) && dcache_req_if.ready) begin $display("%t: D$%0d req: wid=%0d, PC=%0h, tmask=%b, addr=%0h, tag=%0h, rd=%0d, rw=%0b, byteen=%0h, data=%0h", - $time, CORE_ID, req_wid, req_pc, dcache_req_if.valid, req_address, dcache_req_if.tag, req_rd, dcache_req_if.rw, dcache_req_if.byteen, dcache_req_if.data); + $time, CORE_ID, req_wid, req_curr_PC, 
dcache_req_if.valid, req_address, dcache_req_if.tag, req_rd, dcache_req_if.rw, dcache_req_if.byteen, dcache_req_if.data); end if ((| dcache_rsp_if.valid) && dcache_rsp_if.ready) begin $display("%t: D$%0d rsp: valid=%b, wid=%0d, PC=%0h, tag=%0h, rd=%0d, data=%0h", - $time, CORE_ID, dcache_rsp_if.valid, mem_rsp_wid, mem_rsp_curr_PC, dcache_rsp_if.tag, mem_rsp_rd, dcache_rsp_if.data); + $time, CORE_ID, dcache_rsp_if.valid, rsp_wid, rsp_curr_PC, dcache_rsp_if.tag, rsp_rd, dcache_rsp_if.data); end end `endif diff --git a/hw/rtl/VX_mul_unit.v b/hw/rtl/VX_mul_unit.v index 86d531f9..646a387c 100644 --- a/hw/rtl/VX_mul_unit.v +++ b/hw/rtl/VX_mul_unit.v @@ -7,34 +7,52 @@ module VX_mul_unit #( input wire reset, // Inputs - VX_mul_req_if alu_req_if, + VX_mul_req_if mul_req_if, // Outputs - VX_exu_to_cmt_if alu_commit_if + VX_exu_to_cmt_if mul_commit_if ); - wire [`ISTAG_BITS-1:0] issue_tag; - wire [`MUL_BITS-1:0] alu_op; - wire [`NUM_THREADS-1:0][31:0] alu_in1, alu_in2; - wire valid_in, ready_in; - - // use a skid buffer due to MUL/DIV output arbitration adding realtime backpressure - VX_elastic_buffer #( - .DATAW (`ISTAG_BITS + `MUL_BITS + (2 * `NUM_THREADS * 32)), - .SIZE (0) - ) input_buffer ( - .clk (clk), - .reset (reset), - .valid_in (alu_req_if.valid), - .ready_in (alu_req_if.ready), - .data_in ({alu_req_if.issue_tag, alu_req_if.op, alu_req_if.rs1_data, alu_req_if.rs2_data}), - .data_out ({issue_tag, alu_op, alu_in1, alu_in2}), - .ready_out (ready_in), - .valid_out (valid_in) - ); + localparam MULQ_BITS = `LOG2UP(`MULQ_SIZE); + + wire [`MUL_BITS-1:0] alu_op = mul_req_if.op; + wire [`NUM_THREADS-1:0][31:0] alu_in1 = mul_req_if.rs1_data; + wire [`NUM_THREADS-1:0][31:0] alu_in2 = mul_req_if.rs2_data; + + wire [`NW_BITS-1:0] rsp_wid; + wire [`NUM_THREADS-1:0] rsp_thread_mask; + wire [31:0] rsp_curr_PC; + wire [`NR_BITS-1:0] rsp_rd; + wire rsp_wb; + wire [MULQ_BITS-1:0] tag_in, tag_out; + wire valid_out; + wire stall_out; + wire mulq_full; + + wire mulq_push = 
mul_req_if.valid && mul_req_if.ready; + wire mulq_pop = valid_out && ~stall_out; + + VX_cam_buffer #( + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1), + .SIZE (`MULQ_SIZE) + ) mul_queue ( + .clk (clk), + .reset (reset), + .acquire_slot (mulq_push), + .write_addr (tag_in), + .read_addr (tag_out), + .release_addr (tag_out), + .write_data ({mul_req_if.wid, mul_req_if.thread_mask, mul_req_if.curr_PC, mul_req_if.rd, mul_req_if.wb}), + .read_data ({rsp_wid, rsp_thread_mask, rsp_curr_PC, rsp_rd, rsp_wb}), + .release_slot (mulq_pop), + .full (mulq_full) + ); + + /////////////////////////////////////////////////////////////////////////// wire [`NUM_THREADS-1:0][31:0] mul_result; wire is_mulw = (alu_op == `MUL_MUL); wire is_mulw_out; + wire stall_mul; for (genvar i = 0; i < `NUM_THREADS; i++) begin @@ -51,7 +69,7 @@ module VX_mul_unit #( ) multiplier ( .clk(clk), .reset(reset), - .clk_en(1'b1), + .clk_en(~stall_mul), .dataa(mul_in1), .datab(mul_in2), .result(mul_result_tmp) @@ -60,20 +78,20 @@ module VX_mul_unit #( assign mul_result[i] = is_mulw_out ? 
mul_result_tmp[31:0] : mul_result_tmp[63:32]; end - wire [`ISTAG_BITS-1:0] mul_issue_tag; + wire [MULQ_BITS-1:0] mul_tag; wire mul_valid_out; - wire mul_fire = valid_in && ready_in && ~`IS_DIV_OP(alu_op); + wire mul_fire = mul_req_if.valid && mul_req_if.ready && ~`IS_DIV_OP(alu_op); VX_shift_register #( - .DATAW(1 + `ISTAG_BITS + 1), + .DATAW(1 + MULQ_BITS + 1), .DEPTH(`LATENCY_IMUL) ) mul_shift_reg ( .clk(clk), .reset(reset), - .enable(1'b1), - .in({mul_fire, issue_tag, is_mulw}), - .out({mul_valid_out, mul_issue_tag, is_mulw_out}) + .enable(~stall_mul), + .in({mul_fire, tag_in, is_mulw}), + .out({mul_valid_out, mul_tag, is_mulw_out}) ); /////////////////////////////////////////////////////////////////////////// @@ -81,8 +99,8 @@ module VX_mul_unit #( wire [`NUM_THREADS-1:0][31:0] div_result; wire is_div = (alu_op == `MUL_DIV || alu_op == `MUL_DIVU); wire is_signed_div = (alu_op == `MUL_DIV || alu_op == `MUL_REM); - reg [`NUM_THREADS-1:0] is_div_qual; - wire [`NUM_THREADS-1:0] is_div_out; + reg [`NUM_THREADS-1:0] is_div_qual; + wire is_div_out; wire stall_div; for (genvar i = 0; i < `NUM_THREADS; i++) begin @@ -95,8 +113,8 @@ module VX_mul_unit #( always @(*) begin if (~stall_div) begin is_div_qual[i] = is_div; - div_in1_qual = alu_in1[i]; - div_in2_qual = alu_in2[i]; + div_in1_qual = alu_in1[i]; + div_in2_qual = alu_in2[i]; if (0 == alu_in2[i]) begin div_in2_qual = 1; if (is_div) begin @@ -134,34 +152,52 @@ module VX_mul_unit #( .remainder(rem_result_tmp) ); - assign div_result[i] = is_div_out[i] ? div_result_tmp : rem_result_tmp; + assign div_result[i] = is_div_out ? 
div_result_tmp : rem_result_tmp; end - wire [`ISTAG_BITS-1:0] div_issue_tag; + wire [MULQ_BITS-1:0] div_tag; wire div_valid_out; - wire div_fire = valid_in && ready_in && `IS_DIV_OP(alu_op); + wire div_fire = mul_req_if.valid && mul_req_if.ready && `IS_DIV_OP(alu_op); VX_shift_register #( - .DATAW(1 + `ISTAG_BITS + `NUM_THREADS), + .DATAW(1 + MULQ_BITS + 1), .DEPTH(`LATENCY_IDIV + 1) ) div_shift_reg ( .clk(clk), .reset(reset), .enable(~stall_div), - .in({div_fire, issue_tag, is_div_qual}), - .out({div_valid_out, div_issue_tag, is_div_out}) + .in({div_fire, tag_in, (| is_div_qual)}), + .out({div_valid_out, div_tag, is_div_out}) ); /////////////////////////////////////////////////////////////////////////// - assign stall_div = mul_valid_out && div_valid_out; // arbitration prioritizes MUL + wire arbiter_hazard = mul_valid_out && div_valid_out; + + assign stall_out = ~mul_commit_if.ready && mul_commit_if.valid; + assign stall_mul = stall_out || mulq_full; + assign stall_div = stall_out || mulq_full + || arbiter_hazard; // arbitration prioritizes MUL + wire stall_in = stall_mul || stall_div; + + assign valid_out = mul_valid_out || div_valid_out; + assign tag_out = mul_valid_out ? mul_tag : div_tag; + + wire [`NUM_THREADS-1:0][31:0] result = mul_valid_out ? mul_result : div_result; + + VX_generic_register #( + .N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)) + ) alu_reg ( + .clk (clk), + .reset (reset), + .stall (stall_out), + .flush (0), + .in ({valid_out, rsp_wid, rsp_thread_mask, rsp_curr_PC, rsp_rd, rsp_wb, result}), + .out ({mul_commit_if.valid, mul_commit_if.wid, mul_commit_if.thread_mask, mul_commit_if.curr_PC, mul_commit_if.rd, mul_commit_if.wb, mul_commit_if.data}) + ); // can accept new request? - assign ready_in = ~stall_div; - - assign alu_commit_if.valid = mul_valid_out || div_valid_out; - assign alu_commit_if.issue_tag = mul_valid_out ? mul_issue_tag : div_issue_tag; - assign alu_commit_if.data = mul_valid_out ? 
mul_result : div_result; + assign mul_req_if.ready = ~stall_in; endmodule \ No newline at end of file diff --git a/hw/rtl/VX_pipeline.v b/hw/rtl/VX_pipeline.v index f7aa9fa3..bd8df46d 100644 --- a/hw/rtl/VX_pipeline.v +++ b/hw/rtl/VX_pipeline.v @@ -101,24 +101,22 @@ module VX_pipeline #( assign csr_io_rsp_data = csr_io_rsp_if.data; assign csr_io_rsp_if.ready = csr_io_rsp_ready; + VX_csr_to_issue_if csr_to_issue_if(); VX_cmt_to_csr_if cmt_to_csr_if(); VX_decode_if decode_if(); VX_branch_ctl_if branch_ctl_if(); VX_warp_ctl_if warp_ctl_if(); VX_ifetch_rsp_if ifetch_rsp_if(); VX_alu_req_if alu_req_if(); - VX_bru_req_if bru_req_if(); VX_lsu_req_if lsu_req_if(); VX_csr_req_if csr_req_if(); VX_mul_req_if mul_req_if(); VX_fpu_req_if fpu_req_if(); VX_gpu_req_if gpu_req_if(); - VX_wb_if writeback_if(); - VX_cmt_to_issue_if cmt_to_issue_if(); + VX_writeback_if writeback_if(); VX_wstall_if wstall_if(); VX_join_if join_if(); VX_exu_to_cmt_if alu_commit_if(); - VX_exu_to_cmt_if bru_commit_if(); VX_exu_to_cmt_if lsu_commit_if(); VX_exu_to_cmt_if csr_commit_if(); VX_exu_to_cmt_if mul_commit_if(); @@ -159,10 +157,9 @@ module VX_pipeline #( .decode_if (decode_if), .writeback_if (writeback_if), - .cmt_to_issue_if(cmt_to_issue_if), + .csr_to_issue_if(csr_to_issue_if), .alu_req_if (alu_req_if), - .bru_req_if (bru_req_if), .lsu_req_if (lsu_req_if), .csr_req_if (csr_req_if), .mul_req_if (mul_req_if), @@ -183,10 +180,10 @@ module VX_pipeline #( .csr_io_req_if (csr_io_req_if), .csr_io_rsp_if (csr_io_rsp_if), + .csr_to_issue_if(csr_to_issue_if), .cmt_to_csr_if (cmt_to_csr_if), .alu_req_if (alu_req_if), - .bru_req_if (bru_req_if), .lsu_req_if (lsu_req_if), .csr_req_if (csr_req_if), .mul_req_if (mul_req_if), @@ -196,7 +193,6 @@ module VX_pipeline #( .warp_ctl_if (warp_ctl_if), .branch_ctl_if (branch_ctl_if), .alu_commit_if (alu_commit_if), - .bru_commit_if (bru_commit_if), .lsu_commit_if (lsu_commit_if), .csr_commit_if (csr_commit_if), .mul_commit_if (mul_commit_if), @@ -213,14 +209,12 @@ 
module VX_pipeline #( .reset (reset), .alu_commit_if (alu_commit_if), - .bru_commit_if (bru_commit_if), .lsu_commit_if (lsu_commit_if), .csr_commit_if (csr_commit_if), .mul_commit_if (mul_commit_if), .fpu_commit_if (fpu_commit_if), .gpu_commit_if (gpu_commit_if), - .cmt_to_issue_if(cmt_to_issue_if), .writeback_if (writeback_if), .cmt_to_csr_if (cmt_to_csr_if) ); diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh index b12551c4..7e59af79 100644 --- a/hw/rtl/VX_platform.vh +++ b/hw/rtl/VX_platform.vh @@ -35,6 +35,10 @@ wire [$bits(x)-1:0] __``x``__ = x; \ /* verilator lint_on UNUSED */ +`define UNUSED_FIELD(x,y) /* verilator lint_off UNUSED */ \ + wire [$bits(x.y)-1:0] __``y``__ = x.y; \ + /* verilator lint_on UNUSED */ + `define UNUSED_PIN(x) /* verilator lint_off PINCONNECTEMPTY */ \ . x () \ /* verilator lint_on PINCONNECTEMPTY */ diff --git a/hw/rtl/VX_print_instr.vh b/hw/rtl/VX_print_instr.vh index f8682f44..9bc6a0fe 100644 --- a/hw/rtl/VX_print_instr.vh +++ b/hw/rtl/VX_print_instr.vh @@ -24,39 +24,40 @@ task print_ex_op; begin case (ex) `EX_ALU: begin - case (`ALU_BITS'(op)) - `ALU_ADD: $write("ADD"); - `ALU_SUB: $write("SUB"); - `ALU_SLL: $write("SLL"); - `ALU_SRL: $write("SRL"); - `ALU_SRA: $write("SRA"); - `ALU_SLT: $write("SLT"); - `ALU_SLTU: $write("SLTU"); - `ALU_XOR: $write("XOR"); - `ALU_OR: $write("OR"); - `ALU_AND: $write("AND"); - `ALU_LUI: $write("LUI"); - `ALU_AUIPC: $write("AUIPC"); - default: $write("?"); - endcase - end - `EX_BRU: begin - case (`BRU_BITS'(op)) - `BRU_EQ: $write("BEQ"); - `BRU_NE: $write("BNE"); - `BRU_LT: $write("BLT"); - `BRU_GE: $write("BGE"); - `BRU_LTU: $write("BLTU"); - `BRU_GEU: $write("BGEU"); - `BRU_JAL: $write("JAL"); - `BRU_JALR: $write("JALR"); - `BRU_ECALL: $write("ECALL"); - `BRU_EBREAK:$write("EBREAK"); - `BRU_MRET: $write("MRET"); - `BRU_SRET: $write("SRET"); - `BRU_DRET: $write("DRET"); - default: $write("?"); - endcase + if (`IS_BR_OP(op)) begin + case (`BR_BITS'(op)) + `BR_EQ: $write("BEQ"); + `BR_NE: 
$write("BNE"); + `BR_LT: $write("BLT"); + `BR_GE: $write("BGE"); + `BR_LTU: $write("BLTU"); + `BR_GEU: $write("BGEU"); + `BR_JAL: $write("JAL"); + `BR_JALR: $write("JALR"); + `BR_ECALL: $write("ECALL"); + `BR_EBREAK:$write("EBREAK"); + `BR_MRET: $write("MRET"); + `BR_SRET: $write("SRET"); + `BR_DRET: $write("DRET"); + default: $write("?"); + endcase + end else begin + case (`ALU_BITS'(op)) + `ALU_ADD: $write("ADD"); + `ALU_SUB: $write("SUB"); + `ALU_SLL: $write("SLL"); + `ALU_SRL: $write("SRL"); + `ALU_SRA: $write("SRA"); + `ALU_SLT: $write("SLT"); + `ALU_SLTU: $write("SLTU"); + `ALU_XOR: $write("XOR"); + `ALU_OR: $write("OR"); + `ALU_AND: $write("AND"); + `ALU_LUI: $write("LUI"); + `ALU_AUIPC: $write("AUIPC"); + default: $write("?"); + endcase + end end `EX_LSU: begin case (`LSU_BITS'(op)) diff --git a/hw/rtl/VX_scoreboard.v b/hw/rtl/VX_scoreboard.v index 265e9cf5..1f681db8 100644 --- a/hw/rtl/VX_scoreboard.v +++ b/hw/rtl/VX_scoreboard.v @@ -6,66 +6,56 @@ module VX_scoreboard #( input wire clk, input wire reset, - VX_decode_if decode_if, - VX_wb_if writeback_if, - VX_cmt_to_issue_if cmt_to_issue_if, - input wire ex_busy, - output wire [`ISTAG_BITS-1:0] issue_tag, - output wire schedule_delay + VX_decode_if ibuf_deq_if, + VX_writeback_if writeback_if, + input wire exe_delay, + input wire gpr_delay, + + output wire delay ); - reg [`NUM_REGS-1:0] inuse_reg_mask [`NUM_WARPS-1:0]; + reg [`NUM_THREADS-1:0] inuse_registers [(`NUM_WARPS * `NUM_REGS)-1:0]; + reg [`NUM_REGS-1:0] inuse_reg_mask [`NUM_WARPS-1:0]; - wire [`NUM_REGS-1:0] inuse_mask = inuse_reg_mask[decode_if.wid] & decode_if.reg_use_mask; - wire inuse_hazard = (inuse_mask != 0); + wire [`NUM_REGS-1:0] inuse_mask = inuse_reg_mask[ibuf_deq_if.wid] & ibuf_deq_if.used_regs; - wire issue_buf_full; - - assign schedule_delay = ex_busy || inuse_hazard || issue_buf_full; - - wire issue_fire = decode_if.valid && decode_if.ready; + assign delay = (| inuse_mask); - wire reserve_rd = issue_fire && (decode_if.wb != 0); + 
wire reserve_reg = ibuf_deq_if.valid && ibuf_deq_if.ready && (ibuf_deq_if.wb != 0); - wire release_rd = writeback_if.valid; + wire release_reg = writeback_if.valid && writeback_if.ready; + + wire [`NUM_THREADS-1:0] inuse_registers_n = inuse_registers[{writeback_if.wid, writeback_if.rd}] & ~writeback_if.thread_mask; always @(posedge clk) begin if (reset) begin - for (integer i = 0; i < `NUM_WARPS; i++) begin - inuse_reg_mask[i] <= `NUM_REGS'(0); + for (integer w = 0; w < `NUM_WARPS; w++) begin + for (integer i = 0; i < `NUM_REGS; i++) begin + inuse_registers[w * `NUM_REGS + i] <= 0; + end + inuse_reg_mask [w] <= `NUM_REGS'(0); end end else begin - if (reserve_rd) begin - inuse_reg_mask[decode_if.wid][decode_if.rd] <= 1; + if (reserve_reg) begin + inuse_registers[{ibuf_deq_if.wid, ibuf_deq_if.rd}] <= ibuf_deq_if.thread_mask; + inuse_reg_mask[ibuf_deq_if.wid][ibuf_deq_if.rd] <= 1; end - if (release_rd) begin + if (release_reg) begin assert(inuse_reg_mask[writeback_if.wid][writeback_if.rd] != 0); - inuse_reg_mask[writeback_if.wid][writeback_if.rd] <= 0; + inuse_registers[{writeback_if.wid, writeback_if.rd}] <= inuse_registers_n; + inuse_reg_mask[writeback_if.wid][writeback_if.rd] <= (| inuse_registers_n); end end end - VX_cam_buffer #( - .DATAW ($bits(issue_data_t)), - .SIZE (`ISSUEQ_SIZE), - .RPORTS (`NUM_EXS) - ) issue_table ( - .clk (clk), - .reset (reset), - .write_data ({decode_if.wid, decode_if.thread_mask, decode_if.curr_PC, decode_if.rd, decode_if.wb}), - .write_addr (issue_tag), - .acquire_slot (issue_fire), - .release_slot ({cmt_to_issue_if.alu_valid, cmt_to_issue_if.bru_valid, cmt_to_issue_if.lsu_valid, cmt_to_issue_if.csr_valid, cmt_to_issue_if.mul_valid, cmt_to_issue_if.fpu_valid, cmt_to_issue_if.gpu_valid}), - .read_addr ({cmt_to_issue_if.alu_tag, cmt_to_issue_if.bru_tag, cmt_to_issue_if.lsu_tag, cmt_to_issue_if.csr_tag, cmt_to_issue_if.mul_tag, cmt_to_issue_if.fpu_tag, cmt_to_issue_if.gpu_tag}), - .read_data ({cmt_to_issue_if.alu_data, 
cmt_to_issue_if.bru_data, cmt_to_issue_if.lsu_data, cmt_to_issue_if.csr_data, cmt_to_issue_if.mul_data, cmt_to_issue_if.fpu_data, cmt_to_issue_if.gpu_data}), - .full (issue_buf_full) - ); + // issue the instruction + assign ibuf_deq_if.ready = ~(delay || exe_delay || gpr_delay); `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin - if (decode_if.valid && ~decode_if.ready) begin - $display("%t: Core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, ib_full=%b, inuse=%b%b%b%b, ex_busy=%b", - $time, CORE_ID, decode_if.wid, decode_if.curr_PC, decode_if.rd, decode_if.wb, issue_buf_full, - inuse_mask[decode_if.rd], inuse_mask[decode_if.rs1], inuse_mask[decode_if.rs2], inuse_mask[decode_if.rs3], ex_busy); + if (ibuf_deq_if.valid && ~ibuf_deq_if.ready) begin + $display("%t: core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b, exe=%b, gpr=%b", + $time, CORE_ID, ibuf_deq_if.wid, ibuf_deq_if.curr_PC, ibuf_deq_if.rd, ibuf_deq_if.wb, + inuse_mask[ibuf_deq_if.rd], inuse_mask[ibuf_deq_if.rs1], inuse_mask[ibuf_deq_if.rs2], inuse_mask[ibuf_deq_if.rs3], exe_delay, gpr_delay); end end `endif diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh index 7b464cf5..92c3cb9e 100644 --- a/hw/rtl/VX_types.vh +++ b/hw/rtl/VX_types.vh @@ -3,14 +3,6 @@ `include "VX_define.vh" -typedef struct packed { - logic [`NW_BITS-1:0] wid; - logic [`NUM_THREADS-1:0] thread_mask; - logic [31:0] curr_PC; - logic [`NR_BITS-1:0] rd; - logic wb; -} issue_data_t; - typedef struct packed { logic is_normal; logic is_zero; @@ -53,7 +45,7 @@ typedef struct packed { typedef struct packed { logic valid; logic [`NB_BITS-1:0] id; - logic [`NW_BITS:0] num_warps; + logic [`NW_BITS-1:0] size_m1; } gpu_barrier_t; `endif \ No newline at end of file diff --git a/hw/rtl/VX_warp_sched.v b/hw/rtl/VX_warp_sched.v index a4ef9214..f48622e8 100644 --- a/hw/rtl/VX_warp_sched.v +++ b/hw/rtl/VX_warp_sched.v @@ -20,36 +20,46 @@ module VX_warp_sched #( wire [31:0] join_pc; wire [`NUM_THREADS-1:0] join_tm; - reg [`NUM_WARPS-1:0] 
warp_active; - reg [`NUM_WARPS-1:0] warp_stalled; - reg [`NUM_WARPS-1:0] visible_active; - wire update_visible_active; + reg [`NUM_WARPS-1:0] warp_active; // real active warps (updated when a warp is activated or disabled) + reg [`NUM_WARPS-1:0] warp_stalled; // asserted when a branch/gpgpu instructions are issued + reg [`NUM_WARPS-1:0] warp_ready, warp_ready_n; // enforces round-robin, barrier, and non-speculating branches - reg [`NUM_WARPS-1:0] warp_lock; + // Lock warp until instruction decode to resolve branches + reg [`NUM_WARPS-1:0] fetch_lock; reg [`NUM_THREADS-1:0] thread_masks[`NUM_WARPS-1:0]; reg [31:0] warp_pcs[`NUM_WARPS-1:0]; // barriers - reg [`NUM_WARPS-1:0] barrier_stall_mask[`NUM_BARRIERS-1:0]; - wire reached_barrier_limit; - reg [`NUM_WARPS-1:0] total_barrier_stall; - + reg [`NUM_WARPS-1:0] barrier_stall_mask[`NUM_BARRIERS-1:0]; // warps waiting on barrier + wire reached_barrier_limit; // the expected number of warps reached the barrier + // wspawn reg [31:0] use_wspawn_pc; reg [`NUM_WARPS-1:0] use_wspawn; - - wire [`NUM_THREADS-1:0] thread_mask; wire [31:0] warp_pc; wire [`NW_BITS-1:0] warp_to_schedule; wire scheduled_warp; + + wire [`NUM_WARPS-1:0] total_warp_stalled; - wire stall_out; - wire global_stall; - wire real_schedule; + reg didnt_split; - reg didnt_split; + always @(*) begin + warp_ready_n = warp_ready; + if (warp_ctl_if.valid + && warp_ctl_if.tmc.valid + && (0 == warp_ctl_if.tmc.thread_mask)) begin + warp_ready_n[warp_ctl_if.wid] = 0; + end + if (wstall_if.wstall) begin + warp_ready_n[wstall_if.wid] = 0; + end + if (scheduled_warp) begin + warp_ready_n[warp_to_schedule] = 0; + end + end always @(posedge clk) begin if (reset) begin @@ -57,42 +67,41 @@ module VX_warp_sched #( barrier_stall_mask[i] <= 0; end - use_wspawn_pc <= 0; - use_wspawn <= 0; - warp_pcs[0] <= `STARTUP_ADDR; - warp_active[0] <= 1; // Activating first warp - visible_active[0] <= 1; // Activating first warp - thread_masks[0] <= 1; // Activating first thread in first 
warp - warp_stalled <= 0; - didnt_split <= 0; - warp_lock <= 0; + use_wspawn_pc <= 0; + use_wspawn <= 0; + warp_pcs[0] <= `STARTUP_ADDR; + warp_active[0] <= 1; // Activating first warp + warp_ready[0] <= 1; // set first warp as ready + thread_masks[0] <= 1; // Activating first thread in first warp + warp_stalled <= 0; + didnt_split <= 0; + fetch_lock <= 0; for (integer i = 1; i < `NUM_WARPS; i++) begin - warp_pcs[i] <= 0; - warp_active[i] <= 0; // Activating first warp - visible_active[i] <= 0; // Activating first warp - thread_masks[i] <= 1; // Activating first thread in first warp + warp_pcs[i] <= 0; + warp_active[i] <= 0; + warp_ready[i] <= 0; + thread_masks[i] <= 0; end end else begin - if (warp_ctl_if.wspawn.valid) begin + if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin warp_active <= warp_ctl_if.wspawn.wmask; use_wspawn <= warp_ctl_if.wspawn.wmask & (~`NUM_WARPS'(1)); use_wspawn_pc <= warp_ctl_if.wspawn.pc; end - if (warp_ctl_if.barrier.valid) begin + if (warp_ctl_if.valid && warp_ctl_if.barrier.valid) begin warp_stalled[warp_ctl_if.wid] <= 0; if (reached_barrier_limit) begin barrier_stall_mask[warp_ctl_if.barrier.id] <= 0; end else begin barrier_stall_mask[warp_ctl_if.barrier.id][warp_ctl_if.wid] <= 1; end - end else if (warp_ctl_if.tmc.valid) begin + end else if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin thread_masks[warp_ctl_if.wid] <= warp_ctl_if.tmc.thread_mask; warp_stalled[warp_ctl_if.wid] <= 0; if (0 == warp_ctl_if.tmc.thread_mask) begin - warp_active[warp_ctl_if.wid] <= 0; - visible_active[warp_ctl_if.wid] <= 0; + warp_active[warp_ctl_if.wid] <= 0; end end else if (join_if.is_join && !didnt_split) begin if (!join_fall) begin @@ -100,7 +109,7 @@ module VX_warp_sched #( end thread_masks[join_if.wid] <= join_tm; didnt_split <= 0; - end else if (warp_ctl_if.split.valid) begin + end else if (warp_ctl_if.valid && warp_ctl_if.split.valid) begin warp_stalled[warp_ctl_if.wid] <= 0; if (warp_ctl_if.split.diverged) begin 
thread_masks[warp_ctl_if.wid] <= warp_ctl_if.split.then_mask; @@ -110,26 +119,19 @@ module VX_warp_sched #( end end - if (use_wspawn[warp_to_schedule] && !global_stall) begin + if (use_wspawn[warp_to_schedule] && scheduled_warp) begin use_wspawn[warp_to_schedule] <= 0; thread_masks[warp_to_schedule] <= 1; end // Stalling the scheduling of warps if (wstall_if.wstall) begin - warp_stalled[wstall_if.wid] <= 1; - visible_active[wstall_if.wid] <= 0; + warp_stalled[wstall_if.wid] <= 1; end - // Refilling active warps - if (update_visible_active) begin - visible_active <= warp_active & ~warp_stalled & ~total_barrier_stall & ~warp_lock; - end - - // Don't change state if stall - if (!global_stall && real_schedule && (thread_mask != 0)) begin - visible_active[warp_to_schedule] <= 0; - warp_pcs[warp_to_schedule] <= warp_pc + 4; + // update 'warp_ready' when a warp is scheduled (update round-robin warp schedule) + if (scheduled_warp) begin + warp_pcs[warp_to_schedule] <= warp_pc + 4; end // Branch @@ -140,38 +142,42 @@ module VX_warp_sched #( warp_stalled[branch_ctl_if.wid] <= 0; end - // Lock/Release - if (scheduled_warp && !stall_out) begin - warp_lock[warp_to_schedule] <= 1; + // Lock warp until instruction decode to resolve branches + if (scheduled_warp) begin + fetch_lock[warp_to_schedule] <= 1; end if (ifetch_rsp_if.valid && ifetch_rsp_if.ready) begin - warp_lock[ifetch_rsp_if.wid] <= 0; + fetch_lock[ifetch_rsp_if.wid] <= 0; end + + // reset 'warp_ready' when it goes to zero (reset round-robin warp schedule) + warp_ready <= (| warp_ready_n) ? 
warp_ready_n : (warp_active & ~total_warp_stalled); end end - wire [`NUM_WARPS-1:0] b_mask = barrier_stall_mask[warp_ctl_if.barrier.id][`NUM_WARPS-1:0]; - wire [`NW_BITS:0] b_count; + // calculate active barrier status +`IGNORE_WARNINGS_BEGIN + wire [`NW_BITS:0] active_barrier_count; +`IGNORE_WARNINGS_END VX_countones #( .N(`NUM_WARPS) ) barrier_count ( - .valids(b_mask), - .count (b_count) - ); + .valids(barrier_stall_mask[warp_ctl_if.barrier.id]), + .count (active_barrier_count) + ); - wire [`NW_BITS:0] count_visible_active; + wire reached_barrier_limit = (active_barrier_count[`NW_BITS-1:0] == warp_ctl_if.barrier.size_m1); - VX_countones #( - .N(`NUM_WARPS) - ) num_visible ( - .valids(visible_active), - .count (count_visible_active) - ); - - assign reached_barrier_limit = (b_count == warp_ctl_if.barrier.num_warps); + reg [`NUM_WARPS-1:0] total_barrier_stall; + always @(*) begin + total_barrier_stall = barrier_stall_mask[0]; + for (integer i = 1; i < `NUM_BARRIERS; ++i) begin + total_barrier_stall |= barrier_stall_mask[i]; + end + end - assign total_barrier_stall = barrier_stall_mask[0] | barrier_stall_mask[1] | barrier_stall_mask[2] | barrier_stall_mask[3]; + // split/join stack management wire [(1+32+`NUM_THREADS-1):0] ipdom[`NUM_WARPS-1:0]; wire [(1+32+`NUM_THREADS-1):0] q1 = {1'b1, 32'b0, thread_masks[warp_ctl_if.wid]}; @@ -180,7 +186,8 @@ module VX_warp_sched #( assign {join_fall, join_pc, join_tm} = ipdom[join_if.wid]; for (genvar i = 0; i < `NUM_WARPS; i++) begin - wire push = warp_ctl_if.split.valid + wire push = warp_ctl_if.valid + && warp_ctl_if.split.valid && warp_ctl_if.split.diverged && (i == warp_ctl_if.wid); @@ -203,46 +210,40 @@ module VX_warp_sched #( ); end + // calculate next warp schedule + wire schedule; - - wire branch_hazard = schedule - && branch_ctl_if.valid - && branch_ctl_if.taken - && (branch_ctl_if.wid == warp_to_schedule); - - assign real_schedule = schedule - && !warp_stalled[warp_to_schedule] - && 
!total_barrier_stall[warp_to_schedule] - && !warp_lock[0]; - - wire wstall_this_cycle = wstall_if.wstall && (wstall_if.wid == warp_to_schedule); // Maybe bug - - assign update_visible_active = (0 == count_visible_active) && !(stall_out || wstall_this_cycle || branch_hazard || join_if.is_join); - - assign global_stall = stall_out || wstall_this_cycle || branch_hazard || !real_schedule || join_if.is_join; - - assign scheduled_warp = !(wstall_this_cycle || branch_hazard || !real_schedule || join_if.is_join) && !reset; - - assign warp_pc = use_wspawn[warp_to_schedule] ? use_wspawn_pc : warp_pcs[warp_to_schedule]; - assign thread_mask = global_stall ? 0 : (use_wspawn[warp_to_schedule] ? `NUM_THREADS'(1) : thread_masks[warp_to_schedule]); + assign total_warp_stalled = warp_stalled | total_barrier_stall | fetch_lock; - wire [`NUM_WARPS-1:0] use_active = (count_visible_active != 0) ? visible_active : - (warp_active & ~warp_stalled & ~total_barrier_stall & ~warp_lock); + wire [`NUM_WARPS-1:0] use_ready = warp_ready & ~total_warp_stalled; - // Choosing a warp to schedule VX_fixed_arbiter #( .N(`NUM_WARPS) ) choose_schedule ( .clk (clk), .reset (reset), - .requests (use_active), + .requests (use_ready), .grant_index (warp_to_schedule), .grant_valid (schedule), `UNUSED_PIN (grant_onehot) - ); + ); - assign stall_out = ~ifetch_req_if.ready && ifetch_req_if.valid; + wire stall_out = ~ifetch_req_if.ready && ifetch_req_if.valid; + + wire branch_hazard = branch_ctl_if.valid + && branch_ctl_if.taken + && (branch_ctl_if.wid == warp_to_schedule); + + wire wstall_this_cycle = wstall_if.wstall && (wstall_if.wid == warp_to_schedule); + + wire stall = stall_out || wstall_this_cycle || branch_hazard || join_if.is_join; + + assign scheduled_warp = schedule && ~stall; + + wire [`NUM_THREADS-1:0] thread_mask = use_wspawn[warp_to_schedule] ? `NUM_THREADS'(1) : thread_masks[warp_to_schedule]; + + assign warp_pc = use_wspawn[warp_to_schedule] ? 
use_wspawn_pc : warp_pcs[warp_to_schedule]; VX_generic_register #( .N(1 + `NUM_THREADS + 32 + `NW_BITS) @@ -251,7 +252,7 @@ module VX_warp_sched #( .reset (reset), .stall (stall_out), .flush (0), - .in ({(| thread_mask), thread_mask, warp_pc, warp_to_schedule}), + .in ({scheduled_warp, thread_mask, warp_pc, warp_to_schedule}), .out ({ifetch_req_if.valid, ifetch_req_if.thread_mask, ifetch_req_if.curr_PC, ifetch_req_if.wid}) ); diff --git a/hw/rtl/VX_writeback.v b/hw/rtl/VX_writeback.v index dd5da562..5ef76750 100644 --- a/hw/rtl/VX_writeback.v +++ b/hw/rtl/VX_writeback.v @@ -8,165 +8,82 @@ module VX_writeback #( // inputs VX_exu_to_cmt_if alu_commit_if, - VX_exu_to_cmt_if bru_commit_if, VX_exu_to_cmt_if lsu_commit_if, VX_exu_to_cmt_if csr_commit_if, VX_exu_to_cmt_if mul_commit_if, VX_fpu_to_cmt_if fpu_commit_if, VX_exu_to_cmt_if gpu_commit_if, - VX_cmt_to_issue_if cmt_to_issue_if, // outputs - VX_wb_if writeback_if + VX_writeback_if writeback_if ); - reg [`ISSUEQ_SIZE-1:0] wb_valid_table, wb_valid_table_n; - reg [`ISSUEQ_SIZE-1:0][`NUM_THREADS-1:0][31:0] wb_data_table, wb_data_table_n; - reg [`ISSUEQ_SIZE-1:0][`NW_BITS-1:0] wb_wid_table, wb_wid_table_n; - reg [`ISSUEQ_SIZE-1:0][`NUM_THREADS-1:0] wb_thread_mask_table, wb_thread_mask_table_n; - reg [`ISSUEQ_SIZE-1:0][31:0] wb_curr_PC_table, wb_curr_PC_table_n; - reg [`ISSUEQ_SIZE-1:0][`NR_BITS-1:0] wb_rd_table, wb_rd_table_n; + wire alu_valid = alu_commit_if.valid && alu_commit_if.wb; + wire lsu_valid = lsu_commit_if.valid && lsu_commit_if.wb; + wire csr_valid = csr_commit_if.valid && csr_commit_if.wb; + wire mul_valid = mul_commit_if.valid && mul_commit_if.wb; + wire fpu_valid = fpu_commit_if.valid && fpu_commit_if.wb; - reg wb_valid, wb_valid_n; - reg [`NUM_THREADS-1:0][31:0] wb_data, wb_data_n; - reg [`NW_BITS-1:0] wb_wid, wb_wid_n; - reg [`NUM_THREADS-1:0] wb_thread_mask, wb_thread_mask_n; - reg [31:0] wb_curr_PC, wb_curr_PC_n; - reg [`NR_BITS-1:0] wb_rd, wb_rd_n; + VX_writeback_if writeback_tmp_if(); - reg 
[`ISTAG_BITS-1:0] wb_index; - reg [`ISTAG_BITS-1:0] wb_index_n; + assign writeback_tmp_if.valid = alu_valid ? alu_commit_if.valid : + lsu_valid ? lsu_commit_if.valid : + csr_valid ? csr_commit_if.valid : + mul_valid ? mul_commit_if.valid : + fpu_valid ? fpu_commit_if.valid : + 0; - always @(*) begin - wb_valid_table_n = wb_valid_table; - wb_wid_table_n = wb_wid_table; - wb_thread_mask_table_n = wb_thread_mask_table; - wb_curr_PC_table_n = wb_curr_PC_table; - wb_rd_table_n = wb_rd_table; - wb_data_table_n = wb_data_table; + assign writeback_tmp_if.wid = alu_valid ? alu_commit_if.wid : + lsu_valid ? lsu_commit_if.wid : + csr_valid ? csr_commit_if.wid : + mul_valid ? mul_commit_if.wid : + fpu_valid ? fpu_commit_if.wid : + 0; + + assign writeback_tmp_if.thread_mask = alu_valid ? alu_commit_if.thread_mask : + lsu_valid ? lsu_commit_if.thread_mask : + csr_valid ? csr_commit_if.thread_mask : + mul_valid ? mul_commit_if.thread_mask : + fpu_valid ? fpu_commit_if.thread_mask : + 0; - if (wb_valid) begin - wb_valid_table_n[wb_index] = 0; - end + assign writeback_tmp_if.rd = alu_valid ? alu_commit_if.rd : + lsu_valid ? lsu_commit_if.rd : + csr_valid ? csr_commit_if.rd : + mul_valid ? mul_commit_if.rd : + fpu_valid ? fpu_commit_if.rd : + 0; - if (alu_commit_if.valid) begin - wb_valid_table_n [alu_commit_if.issue_tag] = cmt_to_issue_if.alu_data.wb; - wb_thread_mask_table_n [alu_commit_if.issue_tag] = cmt_to_issue_if.alu_data.thread_mask; - wb_data_table_n [alu_commit_if.issue_tag] = alu_commit_if.data; - wb_wid_table_n [alu_commit_if.issue_tag] = cmt_to_issue_if.alu_data.wid; - wb_curr_PC_table_n [alu_commit_if.issue_tag] = cmt_to_issue_if.alu_data.curr_PC; - wb_rd_table_n [alu_commit_if.issue_tag] = cmt_to_issue_if.alu_data.rd; - end + assign writeback_tmp_if.data = alu_valid ? alu_commit_if.data : + lsu_valid ? lsu_commit_if.data : + csr_valid ? csr_commit_if.data : + mul_valid ? mul_commit_if.data : + fpu_valid ? 
fpu_commit_if.data : + 0; - if (bru_commit_if.valid) begin - wb_valid_table_n [bru_commit_if.issue_tag] = cmt_to_issue_if.bru_data.wb; - wb_thread_mask_table_n [bru_commit_if.issue_tag] = cmt_to_issue_if.bru_data.thread_mask; - wb_data_table_n [bru_commit_if.issue_tag] = bru_commit_if.data; - wb_wid_table_n [bru_commit_if.issue_tag] = cmt_to_issue_if.bru_data.wid; - wb_curr_PC_table_n [bru_commit_if.issue_tag] = cmt_to_issue_if.bru_data.curr_PC; - wb_rd_table_n [bru_commit_if.issue_tag] = cmt_to_issue_if.bru_data.rd; - end + wire stall = ~writeback_if.ready && writeback_if.valid; - if (lsu_commit_if.valid) begin - wb_valid_table_n [lsu_commit_if.issue_tag] = cmt_to_issue_if.lsu_data.wb; - wb_thread_mask_table_n [lsu_commit_if.issue_tag] = cmt_to_issue_if.lsu_data.thread_mask; - wb_data_table_n [lsu_commit_if.issue_tag] = lsu_commit_if.data; - wb_wid_table_n [lsu_commit_if.issue_tag] = cmt_to_issue_if.lsu_data.wid; - wb_curr_PC_table_n [lsu_commit_if.issue_tag] = cmt_to_issue_if.lsu_data.curr_PC; - wb_rd_table_n [lsu_commit_if.issue_tag] = cmt_to_issue_if.lsu_data.rd; - end + VX_generic_register #( + .N(1 + `NW_BITS + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * 32)) + ) wb_reg ( + .clk (clk), + .reset (reset), + .stall (stall), + .flush (0), + .in ({writeback_tmp_if.valid, writeback_tmp_if.wid, writeback_tmp_if.thread_mask, writeback_tmp_if.rd, writeback_tmp_if.data}), + .out ({writeback_if.valid, writeback_if.wid, writeback_if.thread_mask, writeback_if.rd, writeback_if.data}) + ); - if (csr_commit_if.valid) begin - wb_valid_table_n [csr_commit_if.issue_tag] = cmt_to_issue_if.csr_data.wb; - wb_thread_mask_table_n [csr_commit_if.issue_tag] = cmt_to_issue_if.csr_data.thread_mask; - wb_data_table_n [csr_commit_if.issue_tag] = csr_commit_if.data; - wb_wid_table_n [csr_commit_if.issue_tag] = cmt_to_issue_if.csr_data.wid; - wb_curr_PC_table_n [csr_commit_if.issue_tag] = cmt_to_issue_if.csr_data.curr_PC; - wb_rd_table_n [csr_commit_if.issue_tag] = 
cmt_to_issue_if.csr_data.rd; - end - - if (mul_commit_if.valid) begin - wb_valid_table_n [mul_commit_if.issue_tag] = cmt_to_issue_if.mul_data.wb; - wb_thread_mask_table_n [mul_commit_if.issue_tag] = cmt_to_issue_if.mul_data.thread_mask; - wb_data_table_n [mul_commit_if.issue_tag] = mul_commit_if.data; - wb_wid_table_n [mul_commit_if.issue_tag] = cmt_to_issue_if.mul_data.wid; - wb_curr_PC_table_n [mul_commit_if.issue_tag] = cmt_to_issue_if.mul_data.curr_PC; - wb_rd_table_n [mul_commit_if.issue_tag] = cmt_to_issue_if.mul_data.rd; - end - - if (fpu_commit_if.valid) begin - wb_valid_table_n [fpu_commit_if.issue_tag] = cmt_to_issue_if.fpu_data.wb; - wb_thread_mask_table_n [fpu_commit_if.issue_tag] = cmt_to_issue_if.fpu_data.thread_mask; - wb_data_table_n [fpu_commit_if.issue_tag] = fpu_commit_if.data; - wb_wid_table_n [fpu_commit_if.issue_tag] = cmt_to_issue_if.fpu_data.wid; - wb_curr_PC_table_n [fpu_commit_if.issue_tag] = cmt_to_issue_if.fpu_data.curr_PC; - wb_rd_table_n [fpu_commit_if.issue_tag] = cmt_to_issue_if.fpu_data.rd; - end - - if (gpu_commit_if.valid) begin - wb_valid_table_n [gpu_commit_if.issue_tag] = cmt_to_issue_if.gpu_data.wb; - wb_thread_mask_table_n [gpu_commit_if.issue_tag] = cmt_to_issue_if.gpu_data.thread_mask; - wb_data_table_n [gpu_commit_if.issue_tag] = gpu_commit_if.data; - wb_wid_table_n [gpu_commit_if.issue_tag] = cmt_to_issue_if.gpu_data.wid; - wb_curr_PC_table_n [gpu_commit_if.issue_tag] = cmt_to_issue_if.gpu_data.curr_PC; - wb_rd_table_n [gpu_commit_if.issue_tag] = cmt_to_issue_if.gpu_data.rd; - end - end - - always @(*) begin - wb_index_n = 0; - wb_valid_n = 0; - wb_thread_mask_n = {`NUM_THREADS{1'bx}}; - wb_wid_n = {`NW_BITS{1'bx}}; - wb_curr_PC_n = {32{1'bx}}; - wb_data_n = {(`NUM_THREADS * 32){1'bx}}; - for (integer i = `ISSUEQ_SIZE-1; i >= 0; i--) begin - if (wb_valid_table_n[i]) begin - wb_index_n = `ISTAG_BITS'(i); - wb_valid_n = 1; - wb_thread_mask_n= wb_thread_mask_table_n[i]; - wb_wid_n = wb_wid_table_n[i]; - wb_curr_PC_n = 
wb_curr_PC_table_n[i]; - wb_rd_n = wb_rd_table_n[i]; - wb_data_n = wb_data_table_n[i]; - end - end - end - - always @(posedge clk) begin - if (reset) begin - wb_valid_table <= 0; - wb_index <= 0; - wb_valid <= 0; - end else begin - wb_valid_table <= wb_valid_table_n; - wb_thread_mask_table <= wb_thread_mask_table_n; - wb_wid_table <= wb_wid_table_n; - wb_curr_PC_table <= wb_curr_PC_table_n; - wb_rd_table <= wb_rd_table_n; - wb_data_table <= wb_data_table_n; - - wb_index <= wb_index_n; - wb_valid <= wb_valid_n; - wb_thread_mask <= wb_thread_mask_n; - wb_wid <= wb_wid_n; - wb_curr_PC <= wb_curr_PC_n; - wb_rd <= wb_rd_n; - wb_data <= wb_data_n; - end - end - - // writeback request - assign writeback_if.valid = wb_valid; - assign writeback_if.thread_mask = wb_thread_mask; - assign writeback_if.wid = wb_wid; - assign writeback_if.curr_PC = wb_curr_PC; - assign writeback_if.rd = wb_rd; - assign writeback_if.data = wb_data; + assign alu_commit_if.ready = !stall; + assign lsu_commit_if.ready = !stall && !alu_valid; + assign csr_commit_if.ready = !stall && !alu_valid && !lsu_valid; + assign mul_commit_if.ready = !stall && !alu_valid && !lsu_valid && !csr_valid; + assign fpu_commit_if.ready = !stall && !alu_valid && !lsu_valid && !csr_valid && !mul_valid; + assign gpu_commit_if.ready = 1'b1; // special workaround to get RISC-V tests Pass/Fail status reg [31:0] last_wb_value [`NUM_REGS-1:0] /* verilator public */; always @(posedge clk) begin - if (writeback_if.valid) begin + if (writeback_if.valid && writeback_if.ready) begin last_wb_value[writeback_if.rd] <= writeback_if.data[0]; end end diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index fc68ca09..e2ef3d0f 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -106,7 +106,6 @@ module VX_bank #( `ifdef DBG_CORE_REQ_INFO /* verilator lint_off UNUSED */ wire[31:0] debug_pc_st0; - wire debug_wb_st0; wire[`NR_BITS-1:0] debug_rd_st0; wire[`NW_BITS-1:0] debug_wid_st0; wire debug_rw_st0; @@ -115,7 +114,6 
@@ module VX_bank #( wire[`UP(CORE_TAG_ID_BITS)-1:0] debug_tagid_st0; wire[31:0] debug_pc_st1e; - wire debug_wb_st1e; wire[`NR_BITS-1:0] debug_rd_st1e; wire[`NW_BITS-1:0] debug_wid_st1e; wire debug_rw_st1e; @@ -124,7 +122,6 @@ module VX_bank #( wire[`UP(CORE_TAG_ID_BITS)-1:0] debug_tagid_st1e; wire[31:0] debug_pc_st2; - wire debug_wb_st2; wire[`NR_BITS-1:0] debug_rd_st2; wire[`NW_BITS-1:0] debug_wid_st2; wire debug_rw_st2; @@ -359,7 +356,7 @@ module VX_bank #( `ifdef DBG_CORE_REQ_INFO if (WORD_SIZE != `GLOBAL_BLOCK_SIZE) begin - assign {debug_pc_st0, debug_wb_st0, debug_rd_st0, debug_wid_st0, debug_tagid_st0, debug_rw_st0, debug_byteen_st0, debug_tid_st0} = qual_inst_meta_st0; + assign {debug_pc_st0, debug_rd_st0, debug_wid_st0, debug_tagid_st0, debug_rw_st0, debug_byteen_st0, debug_tid_st0} = qual_inst_meta_st0; end `endif @@ -446,7 +443,6 @@ module VX_bank #( `ifdef DBG_CORE_REQ_INFO .debug_pc_st1e(debug_pc_st1e), - .debug_wb_st1e(debug_wb_st1e), .debug_rd_st1e(debug_rd_st1e), .debug_wid_st1e(debug_wid_st1e), .debug_tagid_st1e(debug_tagid_st1e), @@ -488,7 +484,7 @@ module VX_bank #( `ifdef DBG_CORE_REQ_INFO if (WORD_SIZE != `GLOBAL_BLOCK_SIZE) begin - assign {debug_pc_st1e, debug_wb_st1e, debug_rd_st1e, debug_wid_st1e, debug_tagid_st1e, debug_rw_st1e, debug_byteen_st1e, debug_tid_st1e} = inst_meta_st1[STAGE_1_CYCLES-1]; + assign {debug_pc_st1e, debug_rd_st1e, debug_wid_st1e, debug_tagid_st1e, debug_rw_st1e, debug_byteen_st1e, debug_tid_st1e} = inst_meta_st1[STAGE_1_CYCLES-1]; end `endif @@ -529,7 +525,7 @@ module VX_bank #( `ifdef DBG_CORE_REQ_INFO if (WORD_SIZE != `GLOBAL_BLOCK_SIZE) begin - assign {debug_pc_st2, debug_wb_st2, debug_rd_st2, debug_wid_st2, debug_tagid_st2, debug_rw_st2, debug_byteen_st2, debug_tid_st2} = inst_meta_st2; + assign {debug_pc_st2, debug_rd_st2, debug_wid_st2, debug_tagid_st2, debug_rw_st2, debug_byteen_st2, debug_tid_st2} = inst_meta_st2; end `endif @@ -740,25 +736,25 @@ module VX_bank #( `ifdef DBG_PRINT_CACHE_BANK always @(posedge 
clk) begin if ((|core_req_valid) && core_req_ready) begin - $display("%t: bank%0d:%0d core req: addr=%0h, tag=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(core_req_addr[0], BANK_ID), core_req_tag); + $display("%t: cache%0d:%0d core req: addr=%0h, tag=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(core_req_addr[0], BANK_ID), core_req_tag); end if (core_rsp_valid && core_rsp_ready) begin - $display("%t: bank%0d:%0d core rsp: tag=%0h, data=%0h", $time, CACHE_ID, BANK_ID, core_rsp_tag, core_rsp_data); + $display("%t: cache%0d:%0d core rsp: tag=%0h, data=%0h", $time, CACHE_ID, BANK_ID, core_rsp_tag, core_rsp_data); end if (dram_fill_req_valid && dram_fill_req_ready) begin - $display("%t: bank%0d:%0d dram_fill req: addr=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dram_fill_req_addr, BANK_ID)); + $display("%t: cache%0d:%0d dram_fill req: addr=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dram_fill_req_addr, BANK_ID)); end if (dram_wb_req_valid && dram_wb_req_ready) begin - $display("%t: bank%0d:%0d dram_wb req: addr=%0h, data=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dram_wb_req_addr, BANK_ID), dram_wb_req_data); + $display("%t: cache%0d:%0d dram_wb req: addr=%0h, data=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dram_wb_req_addr, BANK_ID), dram_wb_req_data); end if (dram_fill_rsp_valid && dram_fill_rsp_ready) begin - $display("%t: bank%0d:%0d dram_fill rsp: addr=%0h, data=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dram_fill_rsp_addr, BANK_ID), dram_fill_rsp_data); + $display("%t: cache%0d:%0d dram_fill rsp: addr=%0h, data=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dram_fill_rsp_addr, BANK_ID), dram_fill_rsp_data); end if (snp_req_valid && snp_req_ready) begin - $display("%t: bank%0d:%0d snp req: addr=%0h, invalidate=%0d, tag=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(snp_req_addr, BANK_ID), snp_req_invalidate, snp_req_tag); + $display("%t: cache%0d:%0d snp req: addr=%0h, invalidate=%0d, tag=%0h", $time, 
CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(snp_req_addr, BANK_ID), snp_req_invalidate, snp_req_tag); end if (snp_rsp_valid && snp_rsp_ready) begin - $display("%t: bank%0d:%0d snp rsp: tag=%0h", $time, CACHE_ID, BANK_ID, snp_rsp_tag); + $display("%t: cache%0d:%0d snp rsp: tag=%0h", $time, CACHE_ID, BANK_ID, snp_rsp_tag); end end `endif diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index 25fa81aa..63e31394 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -130,14 +130,13 @@ module VX_cache #( `ifdef DBG_CORE_REQ_INFO /* verilator lint_off UNUSED */ wire[31:0] debug_core_req_use_pc; - wire debug_core_req_wb; wire[`NR_BITS-1:0] debug_core_req_rd; wire[`NW_BITS-1:0] debug_core_req_wid; wire[`UP(CORE_TAG_ID_BITS)-1:0] debug_core_req_idx; /* verilator lint_on UNUSED */ if (WORD_SIZE != `GLOBAL_BLOCK_SIZE) begin - assign {debug_core_req_use_pc, debug_core_req_wb, debug_core_req_rd, debug_core_req_wid, debug_core_req_idx} = core_req_tag[0]; + assign {debug_core_req_use_pc, debug_core_req_rd, debug_core_req_wid, debug_core_req_idx} = core_req_tag[0]; end `endif diff --git a/hw/rtl/cache/VX_cache_miss_resrv.v b/hw/rtl/cache/VX_cache_miss_resrv.v index 6535837d..61c4bcfb 100644 --- a/hw/rtl/cache/VX_cache_miss_resrv.v +++ b/hw/rtl/cache/VX_cache_miss_resrv.v @@ -157,7 +157,7 @@ module VX_cache_miss_resrv #( `ifdef DBG_PRINT_CACHE_MSRQ always @(posedge clk) begin if (mrvq_push || mrvq_pop || increment_head || recover_state) begin - $write("%t: bank%0d:%0d msrq: push=%b pop=%b incr=%d recv=%d", $time, CACHE_ID, BANK_ID, mrvq_push, mrvq_pop, increment_head, recover_state); + $write("%t: cache%0d:%0d msrq: push=%b pop=%b incr=%d recv=%d", $time, CACHE_ID, BANK_ID, mrvq_push, mrvq_pop, increment_head, recover_state); for (integer j = 0; j < MRVQ_SIZE; j++) begin if (valid_table[j]) begin $write(" "); diff --git a/hw/rtl/cache/VX_snp_forwarder.v b/hw/rtl/cache/VX_snp_forwarder.v index 7a4f2aec..14df2031 100644 --- a/hw/rtl/cache/VX_snp_forwarder.v +++ 
b/hw/rtl/cache/VX_snp_forwarder.v @@ -65,12 +65,13 @@ module VX_snp_forwarder #( ) snp_fwd_buffer ( .clk (clk), .reset (reset), - .write_data ({snp_req_addr, snp_req_invalidate, snp_req_tag}), - .write_addr (sfq_write_addr), - .acquire_slot (sfq_acquire), - .release_slot (sfq_release), + .write_addr (sfq_write_addr), + .acquire_slot (sfq_acquire), .read_addr (sfq_read_addr), + .write_data ({snp_req_addr, snp_req_invalidate, snp_req_tag}), .read_data ({snp_rsp_addr, snp_rsp_invalidate, snp_rsp_tag}), + .release_addr (sfq_read_addr), + .release_slot (sfq_release), .full (sfq_full) ); diff --git a/hw/rtl/cache/VX_tag_data_access.v b/hw/rtl/cache/VX_tag_data_access.v index a03224fc..e636356a 100644 --- a/hw/rtl/cache/VX_tag_data_access.v +++ b/hw/rtl/cache/VX_tag_data_access.v @@ -28,7 +28,6 @@ module VX_tag_data_access #( `ifdef DBG_CORE_REQ_INFO `IGNORE_WARNINGS_BEGIN input wire[31:0] debug_pc_st1e, - input wire debug_wb_st1e, input wire[`NR_BITS-1:0] debug_rd_st1e, input wire[`NW_BITS-1:0] debug_wid_st1e, input wire[`UP(CORE_TAG_ID_BITS)-1:0] debug_tagid_st1e, @@ -217,15 +216,15 @@ module VX_tag_data_access #( if (valid_req_st1e) begin if ((| use_write_enable)) begin if (writefill_st1e) begin - $display("%t: bank%0d:%0d store-fill: wid=%0d, PC=%0h, tag=%0h, wb=%b, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, data=%0h", $time, CACHE_ID, BANK_ID, debug_wid_st1e, debug_pc_st1e, debug_tagid_st1e, debug_wb_st1e, debug_rd_st1e, dirty_st1e, writeladdr_st1e, writetag_st1e, use_write_data); + $display("%t: cache%0d:%0d store-fill: wid=%0d, PC=%0h, tag=%0h, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, data=%0h", $time, CACHE_ID, BANK_ID, debug_wid_st1e, debug_pc_st1e, debug_tagid_st1e, debug_rd_st1e, dirty_st1e, writeladdr_st1e, writetag_st1e, use_write_data); end else begin - $display("%t: bank%0d:%0d store-write: wid=%0d, PC=%0h, tag=%0h, wb=%b, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, wsel=%0d, data=%0h", $time, CACHE_ID, BANK_ID, debug_wid_st1e, debug_pc_st1e, 
debug_tagid_st1e, debug_wb_st1e, debug_rd_st1e, dirty_st1e, writeladdr_st1e, writetag_st1e, wordsel_st1e, writeword_st1e); + $display("%t: cache%0d:%0d store-write: wid=%0d, PC=%0h, tag=%0h, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, wsel=%0d, data=%0h", $time, CACHE_ID, BANK_ID, debug_wid_st1e, debug_pc_st1e, debug_tagid_st1e, debug_rd_st1e, dirty_st1e, writeladdr_st1e, writetag_st1e, wordsel_st1e, writeword_st1e); end end else if (miss_st1e) begin - $display("%t: bank%0d:%0d store-miss: wid=%0d, PC=%0h, tag=%0h, wb=%b, rd=%0d, dirty=%b", $time, CACHE_ID, BANK_ID, debug_wid_st1e, debug_pc_st1e, debug_tagid_st1e, debug_wb_st1e, debug_rd_st1e, dirty_st1e); + $display("%t: cache%0d:%0d store-miss: wid=%0d, PC=%0h, tag=%0h, rd=%0d, dirty=%b", $time, CACHE_ID, BANK_ID, debug_wid_st1e, debug_pc_st1e, debug_tagid_st1e, debug_rd_st1e, dirty_st1e); end else begin - $display("%t: bank%0d:%0d store-read: wid=%0d, PC=%0h, tag=%0h, wb=%b, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, wsel=%0d, data=%0h", $time, CACHE_ID, BANK_ID, debug_wid_st1e, debug_pc_st1e, debug_tagid_st1e, debug_wb_st1e, debug_rd_st1e, dirty_st1e, readaddr_st10, qual_read_tag_st1, wordsel_st1e, qual_read_data_st1); + $display("%t: cache%0d:%0d store-read: wid=%0d, PC=%0h, tag=%0h, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, wsel=%0d, data=%0h", $time, CACHE_ID, BANK_ID, debug_wid_st1e, debug_pc_st1e, debug_tagid_st1e, debug_rd_st1e, dirty_st1e, readaddr_st10, qual_read_tag_st1, wordsel_st1e, qual_read_data_st1); end end end diff --git a/hw/rtl/fp_cores/VX_fp_fpga.v b/hw/rtl/fp_cores/VX_fp_fpga.v index 7634622f..75f237f5 100644 --- a/hw/rtl/fp_cores/VX_fp_fpga.v +++ b/hw/rtl/fp_cores/VX_fp_fpga.v @@ -1,14 +1,16 @@ `include "VX_define.vh" `include "dspba_library_ver.sv" -module VX_fp_fpga ( +module VX_fp_fpga #( + parameter TAGW = 1 +) ( input wire clk, input wire reset, input wire valid_in, output wire ready_in, - input wire [`ISTAG_BITS-1:0] tag_in, + input wire [TAGW-1:0] tag_in, input wire 
[`FPU_BITS-1:0] op, input wire [`FRM_BITS-1:0] frm, @@ -21,7 +23,7 @@ module VX_fp_fpga ( output wire has_fflags, output fflags_t [`NUM_THREADS-1:0] fflags, - output wire [`ISTAG_BITS-1:0] tag_out, + output wire [TAGW-1:0] tag_out, input wire ready_out, output wire valid_out @@ -31,7 +33,7 @@ module VX_fp_fpga ( wire [NUM_FPC-1:0] per_core_ready_in; wire [NUM_FPC-1:0][`NUM_THREADS-1:0][31:0] per_core_result; - wire [NUM_FPC-1:0][`ISTAG_BITS-1:0] per_core_tag_out; + wire [NUM_FPC-1:0][TAGW-1:0] per_core_tag_out; wire [NUM_FPC-1:0] per_core_ready_out; wire [NUM_FPC-1:0] per_core_valid_out; @@ -62,7 +64,10 @@ module VX_fp_fpga ( endcase end - VX_fp_noncomp fp_noncomp ( + VX_fp_noncomp #( + .TAGW (TAGW), + .LANES(`NUM_THREADS) + ) fp_noncomp ( .clk (clk), .reset (reset), .valid_in (valid_in && (core_select == 0)), @@ -80,7 +85,10 @@ module VX_fp_fpga ( .valid_out (per_core_valid_out[0]) ); - VX_fp_add fp_add ( + VX_fp_add #( + .TAGW (TAGW), + .LANES(`NUM_THREADS) + ) fp_add ( .clk (clk), .reset (reset), .valid_in (valid_in && (core_select == 1)), @@ -94,7 +102,10 @@ module VX_fp_fpga ( .valid_out (per_core_valid_out[1]) ); - VX_fp_sub fp_sub ( + VX_fp_sub #( + .TAGW (TAGW), + .LANES(`NUM_THREADS) + ) fp_sub ( .clk (clk), .reset (reset), .valid_in (valid_in && (core_select == 2)), @@ -108,7 +119,10 @@ module VX_fp_fpga ( .valid_out (per_core_valid_out[2]) ); - VX_fp_mul fp_mul ( + VX_fp_mul #( + .TAGW (TAGW), + .LANES(`NUM_THREADS) + ) fp_mul ( .clk (clk), .reset (reset), .valid_in (valid_in && (core_select == 3)), @@ -122,7 +136,10 @@ module VX_fp_fpga ( .valid_out (per_core_valid_out[3]) ); - VX_fp_madd fp_madd ( + VX_fp_madd #( + .TAGW (TAGW), + .LANES(`NUM_THREADS) + ) fp_madd ( .clk (clk), .reset (reset), .valid_in (valid_in && (core_select == 4)), @@ -138,7 +155,10 @@ module VX_fp_fpga ( .valid_out (per_core_valid_out[4]) ); - VX_fp_msub fp_msub ( + VX_fp_msub #( + .TAGW (TAGW), + .LANES(`NUM_THREADS) + ) fp_msub ( .clk (clk), .reset (reset), .valid_in (valid_in 
&& (core_select == 5)), @@ -154,7 +174,10 @@ module VX_fp_fpga ( .valid_out (per_core_valid_out[5]) ); - VX_fp_div fp_div ( + VX_fp_div #( + .TAGW (TAGW), + .LANES(`NUM_THREADS) + ) fp_div ( .clk (clk), .reset (reset), .valid_in (valid_in && (core_select == 6)), @@ -168,7 +191,10 @@ module VX_fp_fpga ( .valid_out (per_core_valid_out[6]) ); - VX_fp_sqrt fp_sqrt ( + VX_fp_sqrt #( + .TAGW (TAGW), + .LANES(`NUM_THREADS) + ) fp_sqrt ( .clk (clk), .reset (reset), .valid_in (valid_in && (core_select == 7)), @@ -181,7 +207,10 @@ module VX_fp_fpga ( .valid_out (per_core_valid_out[7]) ); - VX_fp_ftoi fp_ftoi ( + VX_fp_ftoi #( + .TAGW (TAGW), + .LANES(`NUM_THREADS) + ) fp_ftoi ( .clk (clk), .reset (reset), .valid_in (valid_in && (core_select == 8)), @@ -194,7 +223,10 @@ module VX_fp_fpga ( .valid_out (per_core_valid_out[8]) ); - VX_fp_ftou fp_ftou ( + VX_fp_ftou #( + .TAGW (TAGW), + .LANES(`NUM_THREADS) + ) fp_ftou ( .clk (clk), .reset (reset), .valid_in (valid_in && (core_select == 9)), @@ -207,7 +239,10 @@ module VX_fp_fpga ( .valid_out (per_core_valid_out[9]) ); - VX_fp_itof fp_itof ( + VX_fp_itof #( + .TAGW (TAGW), + .LANES(`NUM_THREADS) + ) fp_itof ( .clk (clk), .reset (reset), .valid_in (valid_in && (core_select == 10)), @@ -220,7 +255,10 @@ module VX_fp_fpga ( .valid_out (per_core_valid_out[10]) ); - VX_fp_utof fp_utof ( + VX_fp_utof #( + .TAGW (TAGW), + .LANES(`NUM_THREADS) + ) fp_utof ( .clk (clk), .reset (reset), .valid_in (valid_in && (core_select == 11)), @@ -248,21 +286,10 @@ module VX_fp_fpga ( assign per_core_ready_out[i] = ready_out && (i == fp_index); end - wire tmp_valid = fp_valid; - wire [`ISTAG_BITS-1:0] tmp_tag = per_core_tag_out[fp_index]; - wire [`NUM_THREADS-1:0][31:0] tmp_result = per_core_result[fp_index]; - wire tmp_has_fflags = fpnew_has_fflags && (fp_index == 0); - fflags_t [`NUM_THREADS-1:0] tmp_flags = fpnew_fflags; - - VX_generic_register #( - .N(1 + `ISTAG_BITS + (`NUM_THREADS * 32) + 1 + `FFG_BITS) - ) nc_reg ( - .clk (clk), - .reset 
(reset), - .stall (stall), - .flush (1'b0), - .in ({tmp_valid, tmp_tag, tmp_result, tmp_has_fflags, tmp_fflags}), - .out ({valid_out, tag_out, result, has_fflags, fflags}) - ); + assign valid_out = fp_valid; + assign tag_out = per_core_tag_out[fp_index]; + assign result = per_core_result[fp_index]; + assign has_fflags = fpnew_has_fflags && (fp_index == 0); + assign fflags = fpnew_fflags; endmodule \ No newline at end of file diff --git a/hw/rtl/fp_cores/VX_fp_noncomp.v b/hw/rtl/fp_cores/VX_fp_noncomp.v index f83c691c..16c44dd4 100644 --- a/hw/rtl/fp_cores/VX_fp_noncomp.v +++ b/hw/rtl/fp_cores/VX_fp_noncomp.v @@ -1,25 +1,28 @@ `include "VX_define.vh" -module VX_fp_noncomp ( +module VX_fp_noncomp #( + parameter TAGW = 1, + parameter LANES = 1 +) ( input wire clk, input wire reset, output wire ready_in, input wire valid_in, - input wire [`ISTAG_BITS-1:0] tag_in, + input wire [TAGW-1:0] tag_in, input wire [`FPU_BITS-1:0] op, input wire [`FRM_BITS-1:0] frm, - input wire [`NUM_THREADS-1:0][31:0] dataa, - input wire [`NUM_THREADS-1:0][31:0] datab, - output wire [`NUM_THREADS-1:0][31:0] result, + input wire [LANES-1:0][31:0] dataa, + input wire [LANES-1:0][31:0] datab, + output wire [LANES-1:0][31:0] result, output wire has_fflags, - output fflags_t [`NUM_THREADS-1:0] fflags, + output fflags_t [LANES-1:0] fflags, - output wire [`ISTAG_BITS-1:0] tag_out, + output wire [TAGW-1:0] tag_out, input wire ready_out, output wire valid_out @@ -35,21 +38,21 @@ module VX_fp_noncomp ( SIG_NAN = 32'h00000100, QUT_NAN = 32'h00000200; - wire [`NUM_THREADS-1:0] a_sign, b_sign; - wire [`NUM_THREADS-1:0][7:0] a_exponent, b_exponent; - wire [`NUM_THREADS-1:0][22:0] a_mantissa, b_mantissa; - fp_type_t [`NUM_THREADS-1:0] a_type, b_type; + wire [LANES-1:0] a_sign, b_sign; + wire [LANES-1:0][7:0] a_exponent, b_exponent; + wire [LANES-1:0][22:0] a_mantissa, b_mantissa; + fp_type_t [LANES-1:0] a_type, b_type; - wire [`NUM_THREADS-1:0] a_smaller, ab_equal; + wire [LANES-1:0] a_smaller, ab_equal; - 
reg [`NUM_THREADS-1:0][31:0] fclass_mask; // generate a 10-bit mask for integer reg - reg [`NUM_THREADS-1:0][31:0] fminmax_res; // result of fmin/fmax - reg [`NUM_THREADS-1:0][31:0] fsgnj_res; // result of sign injection - reg [`NUM_THREADS-1:0][31:0] fcmp_res; // result of comparison - reg [`NUM_THREADS-1:0][ 4:0] fcmp_excp; // exception of comparison + reg [LANES-1:0][31:0] fclass_mask; // generate a 10-bit mask for integer reg + reg [LANES-1:0][31:0] fminmax_res; // result of fmin/fmax + reg [LANES-1:0][31:0] fsgnj_res; // result of sign injection + reg [LANES-1:0][31:0] fcmp_res; // result of comparison + reg [LANES-1:0][ 4:0] fcmp_excp; // exception of comparison // Setup - for (genvar i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < LANES; i++) begin assign a_sign[i] = dataa[i][31]; assign a_exponent[i] = dataa[i][30:23]; assign a_mantissa[i] = dataa[i][22:0]; @@ -75,7 +78,7 @@ module VX_fp_noncomp ( end // FCLASS - for (genvar i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < LANES; i++) begin always @(*) begin if (a_type[i].is_normal) begin fclass_mask[i] = a_sign[i] ? 
NEG_NORM : POS_NORM; @@ -99,7 +102,7 @@ module VX_fp_noncomp ( end // Min/Max - for (genvar i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < LANES; i++) begin always @(*) begin if (a_type[i].is_nan && b_type[i].is_nan) fminmax_res[i] = {1'b0, 8'hff, 1'b1, 22'd0}; // canonical qNaN @@ -118,7 +121,7 @@ module VX_fp_noncomp ( end // Sign Injection - for (genvar i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < LANES; i++) begin always @(*) begin case (op) `FPU_SGNJ: fsgnj_res[i] = { b_sign[i], a_exponent[i], a_mantissa[i]}; @@ -130,7 +133,7 @@ module VX_fp_noncomp ( end // Comparison - for (genvar i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < LANES; i++) begin always @(*) begin case (frm) `FRM_RNE: begin @@ -176,8 +179,8 @@ module VX_fp_noncomp ( reg tmp_valid; reg tmp_has_fflags; - fflags_t [`NUM_THREADS-1:0] tmp_fflags; - reg [`NUM_THREADS-1:0][31:0] tmp_result; + fflags_t [LANES-1:0] tmp_fflags; + reg [LANES-1:0][31:0] tmp_result; always @(*) begin case (op) @@ -191,7 +194,7 @@ module VX_fp_noncomp ( endcase end - for (genvar i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < LANES; i++) begin always @(*) begin tmp_valid = 1'b1; case (op) @@ -228,7 +231,7 @@ module VX_fp_noncomp ( assign ready_in = ~stall; VX_generic_register #( - .N(1 + `ISTAG_BITS + (`NUM_THREADS * 32) + 1 + (`NUM_THREADS * `FFG_BITS)) + .N(1 + TAGW + (LANES * 32) + 1 + (LANES * `FFG_BITS)) ) nc_reg ( .clk (clk), .reset (reset), diff --git a/hw/rtl/fp_cores/VX_fpnew.v b/hw/rtl/fp_cores/VX_fpnew.v index d90f652e..84d3268e 100644 --- a/hw/rtl/fp_cores/VX_fpnew.v +++ b/hw/rtl/fp_cores/VX_fpnew.v @@ -3,6 +3,7 @@ `include "defs_div_sqrt_mvp.sv" module VX_fpnew #( + parameter TAGW = 1, parameter FMULADD = 1, parameter FDIVSQRT = 1, parameter FNONCOMP = 1, @@ -14,7 +15,7 @@ module VX_fpnew #( input wire valid_in, output wire ready_in, - input wire [`ISTAG_BITS-1:0] tag_in, + input wire [TAGW-1:0] tag_in, input wire [`FPU_BITS-1:0] op, input wire 
[`FRM_BITS-1:0] frm, @@ -27,7 +28,7 @@ module VX_fpnew #( output wire has_fflags, output fflags_t [`NUM_THREADS-1:0] fflags, - output wire [`ISTAG_BITS-1:0] tag_out, + output wire [TAGW-1:0] tag_out, input wire ready_out, output wire valid_out @@ -66,7 +67,7 @@ module VX_fpnew #( wire fpu_ready_in, fpu_valid_in; wire fpu_ready_out, fpu_valid_out; - reg [`ISTAG_BITS-1:0] fpu_tag_in, fpu_tag_out; + reg [TAGW-1:0] fpu_tag_in, fpu_tag_out; reg [2:0][`NUM_THREADS-1:0][31:0] fpu_operands; @@ -77,9 +78,6 @@ module VX_fpnew #( wire [`NUM_THREADS-1:0][31:0] fpu_result; fpnew_pkg::status_t [0:`NUM_THREADS-1] fpu_status; - wire is_class_op, is_class_op_out; - assign is_class_op = (op == `FPU_CLASS); - reg [FOP_BITS-1:0] fpu_op; reg [`FRM_BITS-1:0] fpu_rnd; reg fpu_op_mod; @@ -136,7 +134,7 @@ module VX_fpnew #( fpnew_top #( .Features (FPU_FEATURES), .Implementation (FPU_IMPLEMENTATION), - .TagType (logic[`ISTAG_BITS+1+1-1:0]) + .TagType (logic[TAGW+1+1-1:0]) ) fpnew_core ( .clk_i (clk), .rst_ni (1'b1), @@ -148,13 +146,13 @@ module VX_fpnew #( .dst_fmt_i (fpnew_pkg::fp_format_e'(fpu_dst_fmt)), .int_fmt_i (fpnew_pkg::int_format_e'(fpu_int_fmt)), .vectorial_op_i (1'b0), - .tag_i ({fpu_tag_in, fpu_has_fflags, is_class_op}), + .tag_i ({fpu_tag_in, fpu_has_fflags}), .in_valid_i (fpu_valid_in), .in_ready_o (fpu_ready_in), .flush_i (reset), .result_o (fpu_result[0]), .status_o (fpu_status[0]), - .tag_o ({fpu_tag_out, fpu_has_fflags_out, is_class_op_out}), + .tag_o ({fpu_tag_out, fpu_has_fflags_out}), .out_valid_o (fpu_valid_out), .out_ready_i (fpu_ready_out), `UNUSED_PIN (busy_o) diff --git a/hw/rtl/fp_cores/altera/VX_fp_add.v b/hw/rtl/fp_cores/altera/VX_fp_add.v index c7c39506..eeb94556 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_add.v +++ b/hw/rtl/fp_cores/altera/VX_fp_add.v @@ -1,19 +1,22 @@ `include "VX_define.vh" -module VX_fp_add ( +module VX_fp_add #( + parameter TAGW = 1, + parameter LANES = 1 +) ( input wire clk, input wire reset, output wire ready_in, input wire valid_in, - 
input wire [`ISTAG_BITS-1:0] tag_in, + input wire [TAGW-1:0] tag_in, - input wire [`NUM_THREADS-1:0][31:0] dataa, - input wire [`NUM_THREADS-1:0][31:0] datab, - output wire [`NUM_THREADS-1:0][31:0] result, + input wire [LANES-1:0][31:0] dataa, + input wire [LANES-1:0][31:0] datab, + output wire [LANES-1:0][31:0] result, - output wire [`ISTAG_BITS-1:0] tag_out, + output wire [TAGW-1:0] tag_out, input wire ready_out, output wire valid_out @@ -22,7 +25,7 @@ module VX_fp_add ( wire enable = ~stall; assign ready_in = enable; - for (genvar i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < LANES; i++) begin twentynm_fp_mac mac_fp_wys ( // inputs .accumulate(), @@ -65,7 +68,7 @@ module VX_fp_add ( end VX_shift_register #( - .DATAW(`ISTAG_BITS + 1), + .DATAW(TAGW + 1), .DEPTH(1) ) shift_reg ( .clk(clk), diff --git a/hw/rtl/fp_cores/altera/VX_fp_div.v b/hw/rtl/fp_cores/altera/VX_fp_div.v index 54fe7e57..698782d9 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_div.v +++ b/hw/rtl/fp_cores/altera/VX_fp_div.v @@ -1,19 +1,22 @@ `include "VX_define.vh" -module VX_fp_div ( +module VX_fp_div #( + parameter TAGW = 1, + parameter LANES = 1 +) ( input wire clk, input wire reset, output wire ready_in, input wire valid_in, - input wire [`ISTAG_BITS-1:0] tag_in, + input wire [TAGW-1:0] tag_in, - input wire [`NUM_THREADS-1:0][31:0] dataa, - input wire [`NUM_THREADS-1:0][31:0] datab, - output wire [`NUM_THREADS-1:0][31:0] result, + input wire [LANES-1:0][31:0] dataa, + input wire [LANES-1:0][31:0] datab, + output wire [LANES-1:0][31:0] result, - output wire [`ISTAG_BITS-1:0] tag_out, + output wire [TAGW-1:0] tag_out, input wire ready_out, output wire valid_out @@ -22,7 +25,7 @@ module VX_fp_div ( wire enable = ~stall; assign ready_in = enable; - for (genvar i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < LANES; i++) begin acl_fp_div fdiv ( .clk (clk), .areset (1'b0), @@ -34,7 +37,7 @@ module VX_fp_div ( end VX_shift_register #( - .DATAW(`ISTAG_BITS + 1), + .DATAW(TAGW + 
1), .DEPTH(`LATENCY_FDIV) ) shift_reg ( .clk(clk), diff --git a/hw/rtl/fp_cores/altera/VX_fp_ftoi.v b/hw/rtl/fp_cores/altera/VX_fp_ftoi.v index a7ba66ae..6c2aa613 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_ftoi.v +++ b/hw/rtl/fp_cores/altera/VX_fp_ftoi.v @@ -1,18 +1,21 @@ `include "VX_define.vh" -module VX_fp_ftoi ( +module VX_fp_ftoi #( + parameter TAGW = 1, + parameter LANES = 1 +) ( input wire clk, input wire reset, output wire ready_in, input wire valid_in, - input wire [`ISTAG_BITS-1:0] tag_in, + input wire [TAGW-1:0] tag_in, - input wire [`NUM_THREADS-1:0][31:0] dataa, - output wire [`NUM_THREADS-1:0][31:0] result, + input wire [LANES-1:0][31:0] dataa, + output wire [LANES-1:0][31:0] result, - output wire [`ISTAG_BITS-1:0] tag_out, + output wire [TAGW-1:0] tag_out, input wire ready_out, output wire valid_out @@ -21,7 +24,7 @@ module VX_fp_ftoi ( wire enable = ~stall; assign ready_in = enable; - for (genvar i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < LANES; i++) begin acl_fp_ftoi ftoi ( .clk (clk), .areset (1'b0), @@ -32,7 +35,7 @@ module VX_fp_ftoi ( end VX_shift_register #( - .DATAW(`ISTAG_BITS + 1), + .DATAW(TAGW + 1), .DEPTH(`LATENCY_FTOI) ) shift_reg ( .clk(clk), diff --git a/hw/rtl/fp_cores/altera/VX_fp_ftou.v b/hw/rtl/fp_cores/altera/VX_fp_ftou.v index a0912f12..71460515 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_ftou.v +++ b/hw/rtl/fp_cores/altera/VX_fp_ftou.v @@ -1,18 +1,21 @@ `include "VX_define.vh" -module VX_fp_ftou ( +module VX_fp_ftou #( + parameter TAGW = 1, + parameter LANES = 1 +) ( input wire clk, input wire reset, output wire ready_in, input wire valid_in, - input wire [`ISTAG_BITS-1:0] tag_in, + input wire [TAGW-1:0] tag_in, - input wire [`NUM_THREADS-1:0][31:0] dataa, - output wire [`NUM_THREADS-1:0][31:0] result, + input wire [LANES-1:0][31:0] dataa, + output wire [LANES-1:0][31:0] result, - output wire [`ISTAG_BITS-1:0] tag_out, + output wire [TAGW-1:0] tag_out, input wire ready_out, output wire valid_out @@ -21,7 +24,7 @@ 
module VX_fp_ftou ( wire enable = ~stall; assign ready_in = enable; - for (genvar i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < LANES; i++) begin acl_fp_ftou ftou ( .clk (clk), .areset (1'b0), @@ -32,7 +35,7 @@ module VX_fp_ftou ( end VX_shift_register #( - .DATAW(`ISTAG_BITS + 1), + .DATAW(TAGW + 1), .DEPTH(`LATENCY_FTOU) ) shift_reg ( .clk(clk), diff --git a/hw/rtl/fp_cores/altera/VX_fp_itof.v b/hw/rtl/fp_cores/altera/VX_fp_itof.v index c95ede12..4a08ab01 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_itof.v +++ b/hw/rtl/fp_cores/altera/VX_fp_itof.v @@ -1,18 +1,21 @@ `include "VX_define.vh" -module VX_fp_itof ( +module VX_fp_itof #( + parameter TAGW = 1, + parameter LANES = 1 +) ( input wire clk, input wire reset, output wire ready_in, input wire valid_in, - input wire [`ISTAG_BITS-1:0] tag_in, + input wire [TAGW-1:0] tag_in, - input wire [`NUM_THREADS-1:0][31:0] dataa, - output wire [`NUM_THREADS-1:0][31:0] result, + input wire [LANES-1:0][31:0] dataa, + output wire [LANES-1:0][31:0] result, - output wire [`ISTAG_BITS-1:0] tag_out, + output wire [TAGW-1:0] tag_out, input wire ready_out, output wire valid_out @@ -21,7 +24,7 @@ module VX_fp_itof ( wire enable = ~stall; assign ready_in = enable; - for (genvar i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < LANES; i++) begin acl_fp_itof itof ( .clk (clk), .areset (1'b0), @@ -32,7 +35,7 @@ module VX_fp_itof ( end VX_shift_register #( - .DATAW(`ISTAG_BITS + 1), + .DATAW(TAGW + 1), .DEPTH(`LATENCY_ITOF) ) shift_reg ( .clk(clk), diff --git a/hw/rtl/fp_cores/altera/VX_fp_madd.v b/hw/rtl/fp_cores/altera/VX_fp_madd.v index 58b410d1..bef3b468 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_madd.v +++ b/hw/rtl/fp_cores/altera/VX_fp_madd.v @@ -1,22 +1,25 @@ `include "VX_define.vh" -module VX_fp_madd ( +module VX_fp_madd #( + parameter TAGW = 1, + parameter LANES = 1 +) ( input wire clk, input wire reset, output wire ready_in, input wire valid_in, - input wire [`ISTAG_BITS-1:0] tag_in, + input wire [TAGW-1:0] 
tag_in, - input wire [`NUM_THREADS-1:0][31:0] dataa, - input wire [`NUM_THREADS-1:0][31:0] datab, - input wire [`NUM_THREADS-1:0][31:0] datac, - output wire [`NUM_THREADS-1:0][31:0] result, + input wire [LANES-1:0][31:0] dataa, + input wire [LANES-1:0][31:0] datab, + input wire [LANES-1:0][31:0] datac, + output wire [LANES-1:0][31:0] result, input wire negate, - output wire [`ISTAG_BITS-1:0] tag_out, + output wire [TAGW-1:0] tag_out, input wire ready_out, output wire valid_out @@ -24,11 +27,11 @@ module VX_fp_madd ( wire enable0, enable1; assign ready_in = enable0 && enable1; - wire [`NUM_THREADS-1:0][31:0] result_st0, result_st1; - wire [`ISTAG_BITS-1:0] out_tag_st0, out_tag_st1; + wire [LANES-1:0][31:0] result_st0, result_st1; + wire [TAGW-1:0] out_tag_st0, out_tag_st1; wire in_valid_st0, out_valid_st0, out_valid_st1; - for (genvar i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < LANES; i++) begin twentynm_fp_mac mac_fp_wys0 ( // inputs .accumulate(), @@ -111,7 +114,7 @@ module VX_fp_madd ( end VX_shift_register #( - .DATAW(`ISTAG_BITS + 1 + 1), + .DATAW(TAGW + 1 + 1), .DEPTH(1) ) shift_reg0 ( .clk(clk), @@ -122,7 +125,7 @@ module VX_fp_madd ( ); VX_shift_register #( - .DATAW(`ISTAG_BITS + 1), + .DATAW(TAGW + 1), .DEPTH(1) ) shift_reg1 ( .clk(clk), diff --git a/hw/rtl/fp_cores/altera/VX_fp_msub.v b/hw/rtl/fp_cores/altera/VX_fp_msub.v index 62fb99b9..3104de3b 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_msub.v +++ b/hw/rtl/fp_cores/altera/VX_fp_msub.v @@ -1,22 +1,25 @@ `include "VX_define.vh" -module VX_fp_msub ( +module VX_fp_msub #( + parameter TAGW = 1, + parameter LANES = 1 +) ( input wire clk, input wire reset, output wire ready_in, input wire valid_in, - input wire [`ISTAG_BITS-1:0] tag_in, + input wire [TAGW-1:0] tag_in, - input wire [`NUM_THREADS-1:0][31:0] dataa, - input wire [`NUM_THREADS-1:0][31:0] datab, - input wire [`NUM_THREADS-1:0][31:0] datac, - output wire [`NUM_THREADS-1:0][31:0] result, + input wire [LANES-1:0][31:0] dataa, + input 
wire [LANES-1:0][31:0] datab, + input wire [LANES-1:0][31:0] datac, + output wire [LANES-1:0][31:0] result, input wire negate, - output wire [`ISTAG_BITS-1:0] tag_out, + output wire [TAGW-1:0] tag_out, input wire ready_out, output wire valid_out @@ -24,11 +27,11 @@ module VX_fp_msub ( wire enable0, enable1; assign ready_in = enable0 && enable1; - wire [`NUM_THREADS-1:0][31:0] result_st0, result_st1; - wire [`ISTAG_BITS-1:0] out_tag_st0, out_tag_st1; + wire [LANES-1:0][31:0] result_st0, result_st1; + wire [TAGW-1:0] out_tag_st0, out_tag_st1; wire in_valid_st0, out_valid_st0, out_valid_st1; - for (genvar i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < LANES; i++) begin twentynm_fp_mac mac_fp_wys0 ( // inputs .accumulate(), @@ -111,7 +114,7 @@ module VX_fp_msub ( end VX_shift_register #( - .DATAW(`ISTAG_BITS + 1 + 1), + .DATAW(TAGW + 1 + 1), .DEPTH(1) ) shift_reg0 ( .clk(clk), @@ -122,7 +125,7 @@ module VX_fp_msub ( ); VX_shift_register #( - .DATAW(`ISTAG_BITS + 1), + .DATAW(TAGW + 1), .DEPTH(1) ) shift_reg1 ( .clk(clk), diff --git a/hw/rtl/fp_cores/altera/VX_fp_mul.v b/hw/rtl/fp_cores/altera/VX_fp_mul.v index 8be10473..e2d00457 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_mul.v +++ b/hw/rtl/fp_cores/altera/VX_fp_mul.v @@ -1,19 +1,22 @@ `include "VX_define.vh" -module VX_fp_mul ( +module VX_fp_mul #( + parameter TAGW = 1, + parameter LANES = 1 +) ( input wire clk, input wire reset, output wire ready_in, input wire valid_in, - input wire [`ISTAG_BITS-1:0] tag_in, + input wire [TAGW-1:0] tag_in, - input wire [`NUM_THREADS-1:0][31:0] dataa, - input wire [`NUM_THREADS-1:0][31:0] datab, - output wire [`NUM_THREADS-1:0][31:0] result, + input wire [LANES-1:0][31:0] dataa, + input wire [LANES-1:0][31:0] datab, + output wire [LANES-1:0][31:0] result, - output wire [`ISTAG_BITS-1:0] tag_out, + output wire [TAGW-1:0] tag_out, input wire ready_out, output wire valid_out @@ -22,7 +25,7 @@ module VX_fp_mul ( wire enable = ~stall; assign ready_in = enable; - for (genvar i 
= 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < LANES; i++) begin twentynm_fp_mac mac_fp_wys ( // inputs .accumulate(), @@ -65,7 +68,7 @@ module VX_fp_mul ( end VX_shift_register #( - .DATAW(`ISTAG_BITS + 1), + .DATAW(TAGW + 1), .DEPTH(1) ) shift_reg ( .clk(clk), diff --git a/hw/rtl/fp_cores/altera/VX_fp_sqrt.v b/hw/rtl/fp_cores/altera/VX_fp_sqrt.v index 511b7512..784f5cb4 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_sqrt.v +++ b/hw/rtl/fp_cores/altera/VX_fp_sqrt.v @@ -1,18 +1,21 @@ `include "VX_define.vh" -module VX_fp_sqrt ( +module VX_fp_sqrt #( + parameter TAGW = 1, + parameter LANES = 1 +) ( input wire clk, input wire reset, output wire ready_in, input wire valid_in, - input wire [`ISTAG_BITS-1:0] tag_in, + input wire [TAGW-1:0] tag_in, - input wire [`NUM_THREADS-1:0][31:0] dataa, - output wire [`NUM_THREADS-1:0][31:0] result, + input wire [LANES-1:0][31:0] dataa, + output wire [LANES-1:0][31:0] result, - output wire [`ISTAG_BITS-1:0] tag_out, + output wire [TAGW-1:0] tag_out, input wire ready_out, output wire valid_out @@ -21,7 +24,7 @@ module VX_fp_sqrt ( wire enable = ~stall; assign ready_in = enable; - for (genvar i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < LANES; i++) begin acl_fp_sqrt fsqrt ( .clk (clk), .areset (1'b0), @@ -32,7 +35,7 @@ module VX_fp_sqrt ( end VX_shift_register #( - .DATAW(`ISTAG_BITS + 1), + .DATAW(TAGW + 1), .DEPTH(`LATENCY_FSQRT) ) shift_reg ( .clk(clk), diff --git a/hw/rtl/fp_cores/altera/VX_fp_sub.v b/hw/rtl/fp_cores/altera/VX_fp_sub.v index 574eac6a..f1c8ed26 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_sub.v +++ b/hw/rtl/fp_cores/altera/VX_fp_sub.v @@ -1,19 +1,22 @@ `include "VX_define.vh" -module VX_fp_sub ( +module VX_fp_sub #( + parameter TAGW = 1, + parameter LANES = 1 +) ( input wire clk, input wire reset, output wire ready_in, input wire valid_in, - input wire [`ISTAG_BITS-1:0] tag_in, + input wire [TAGW-1:0] tag_in, - input wire [`NUM_THREADS-1:0][31:0] dataa, - input wire [`NUM_THREADS-1:0][31:0] 
datab, - output wire [`NUM_THREADS-1:0][31:0] result, + input wire [LANES-1:0][31:0] dataa, + input wire [LANES-1:0][31:0] datab, + output wire [LANES-1:0][31:0] result, - output wire [`ISTAG_BITS-1:0] tag_out, + output wire [TAGW-1:0] tag_out, input wire ready_out, output wire valid_out @@ -22,7 +25,7 @@ module VX_fp_sub ( wire enable = ~stall; assign ready_in = enable; - for (genvar i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < LANES; i++) begin twentynm_fp_mac mac_fp_wys ( // inputs .accumulate(), @@ -65,7 +68,7 @@ module VX_fp_sub ( end VX_shift_register #( - .DATAW(`ISTAG_BITS + 1), + .DATAW(TAGW + 1), .DEPTH(1) ) shift_reg ( .clk(clk), diff --git a/hw/rtl/fp_cores/altera/VX_fp_utof.v b/hw/rtl/fp_cores/altera/VX_fp_utof.v index 2fb253fc..935a44fb 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_utof.v +++ b/hw/rtl/fp_cores/altera/VX_fp_utof.v @@ -1,18 +1,21 @@ `include "VX_define.vh" -module VX_fp_utof ( +module VX_fp_utof #( + parameter TAGW = 1, + parameter LANES = 1 +) ( input wire clk, input wire reset, output wire ready_in, input wire valid_in, - input wire [`ISTAG_BITS-1:0] tag_in, + input wire [TAGW-1:0] tag_in, - input wire [`NUM_THREADS-1:0][31:0] dataa, - output wire [`NUM_THREADS-1:0][31:0] result, + input wire [LANES-1:0][31:0] dataa, + output wire [LANES-1:0][31:0] result, - output wire [`ISTAG_BITS-1:0] tag_out, + output wire [TAGW-1:0] tag_out, input wire ready_out, output wire valid_out @@ -21,7 +24,7 @@ module VX_fp_utof ( wire enable = ~stall; assign ready_in = enable; - for (genvar i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < LANES; i++) begin acl_fp_utof utof ( .clk (clk), .areset (1'b0), @@ -32,7 +35,7 @@ module VX_fp_utof ( end VX_shift_register #( - .DATAW(`ISTAG_BITS + 1), + .DATAW(TAGW + 1), .DEPTH(`LATENCY_UTOF) ) shift_reg ( .clk(clk), diff --git a/hw/rtl/interfaces/VX_alu_req_if.v b/hw/rtl/interfaces/VX_alu_req_if.v index 13ec04f7..af1a80ab 100644 --- a/hw/rtl/interfaces/VX_alu_req_if.v +++ 
b/hw/rtl/interfaces/VX_alu_req_if.v @@ -5,23 +5,20 @@ interface VX_alu_req_if (); - wire valid; - wire [`ISTAG_BITS-1:0] issue_tag; -`DEBUG_BEGIN - wire [`NW_BITS-1:0] wid; + wire valid; + + wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] thread_mask; -`DEBUG_END wire [31:0] curr_PC; - - wire [`ALU_BITS-1:0] op; - + wire [`ALU_BR_BITS-1:0] op; wire rs1_is_PC; wire rs2_is_imm; - wire [31:0] imm; - + wire [`NT_BITS-1:0] tid; wire [`NUM_THREADS-1:0][31:0] rs1_data; wire [`NUM_THREADS-1:0][31:0] rs2_data; + wire [`NR_BITS-1:0] rd; + wire wb; wire ready; diff --git a/hw/rtl/interfaces/VX_bru_req_if.v b/hw/rtl/interfaces/VX_bru_req_if.v deleted file mode 100644 index 17e125ba..00000000 --- a/hw/rtl/interfaces/VX_bru_req_if.v +++ /dev/null @@ -1,29 +0,0 @@ -`ifndef VX_BRANCH_REQ_IF -`define VX_BRANCH_REQ_IF - -`include "VX_define.vh" - -interface VX_bru_req_if (); - - wire valid; - wire [`ISTAG_BITS-1:0] issue_tag; - wire [`NW_BITS-1:0] wid; -`DEBUG_BEGIN - wire [`NUM_THREADS-1:0] thread_mask; -`DEBUG_END - wire [31:0] curr_PC; - - wire [`BRU_BITS-1:0] op; - - wire rs1_is_PC; - - wire [31:0] rs1_data; - wire [31:0] rs2_data; - - wire [31:0] offset; - - wire ready; - -endinterface - -`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_cmt_to_issue_if.v b/hw/rtl/interfaces/VX_cmt_to_issue_if.v deleted file mode 100644 index 4065ba0c..00000000 --- a/hw/rtl/interfaces/VX_cmt_to_issue_if.v +++ /dev/null @@ -1,36 +0,0 @@ -`ifndef VX_CMT_TO_ISSUE_IF -`define VX_CMT_TO_ISSUE_IF - -`include "VX_define.vh" - -interface VX_cmt_to_issue_if (); - - wire alu_valid; - wire bru_valid; - wire lsu_valid; - wire csr_valid; - wire mul_valid; - wire fpu_valid; - wire gpu_valid; - - wire [`ISTAG_BITS-1:0] alu_tag; - wire [`ISTAG_BITS-1:0] bru_tag; - wire [`ISTAG_BITS-1:0] lsu_tag; - wire [`ISTAG_BITS-1:0] csr_tag; - wire [`ISTAG_BITS-1:0] mul_tag; - wire [`ISTAG_BITS-1:0] fpu_tag; - wire [`ISTAG_BITS-1:0] gpu_tag; - -`IGNORE_WARNINGS_BEGIN - issue_data_t alu_data; - 
issue_data_t bru_data; - issue_data_t lsu_data; - issue_data_t csr_data; - issue_data_t mul_data; - issue_data_t fpu_data; - issue_data_t gpu_data; -`IGNORE_WARNINGS_END - -endinterface - -`endif diff --git a/hw/rtl/interfaces/VX_csr_req_if.v b/hw/rtl/interfaces/VX_csr_req_if.v index c8de21be..9f3e1f6f 100644 --- a/hw/rtl/interfaces/VX_csr_req_if.v +++ b/hw/rtl/interfaces/VX_csr_req_if.v @@ -6,18 +6,13 @@ interface VX_csr_req_if (); wire valid; - wire [`ISTAG_BITS-1:0] issue_tag; + wire [`NW_BITS-1:0] wid; -`DEBUG_BEGIN wire [`NUM_THREADS-1:0] thread_mask; -`DEBUG_END - wire [31:0] curr_PC; - + wire [31:0] curr_PC; wire [`CSR_BITS-1:0] op; - wire [`CSR_ADDR_BITS-1:0] csr_addr; wire [31:0] csr_mask; - wire [`NR_BITS-1:0] rd; wire wb; wire is_io; @@ -26,4 +21,4 @@ interface VX_csr_req_if (); endinterface -`endif +`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_csr_rsp_if.v b/hw/rtl/interfaces/VX_csr_rsp_if.v deleted file mode 100644 index 9e141783..00000000 --- a/hw/rtl/interfaces/VX_csr_rsp_if.v +++ /dev/null @@ -1,15 +0,0 @@ -`ifndef VX_CSR_RSP_IF -`define VX_CSR_RSP_IF - -`include "VX_define.vh" - -interface VX_csr_rsp_if (); - - wire valid; - wire [`ISTAG_BITS-1:0] issue_tag; - wire [`NUM_THREADS-1:0][31:0] data; - wire ready; - -endinterface - -`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_csr_to_fpu_if.v b/hw/rtl/interfaces/VX_csr_to_issue_if.v similarity index 64% rename from hw/rtl/interfaces/VX_csr_to_fpu_if.v rename to hw/rtl/interfaces/VX_csr_to_issue_if.v index 2b1aac5a..f222370c 100644 --- a/hw/rtl/interfaces/VX_csr_to_fpu_if.v +++ b/hw/rtl/interfaces/VX_csr_to_issue_if.v @@ -1,5 +1,5 @@ -`ifndef VX_CSR_TO_FPU_IF -`define VX_CSR_TO_FPU_IF +`ifndef VX_CSR_TO_ISSUE_IF +`define VX_CSR_TO_ISSUE_IF `include "VX_define.vh" @@ -7,7 +7,7 @@ `IGNORE_WARNINGS_BEGIN `endif -interface VX_csr_to_fpu_if (); +interface VX_csr_to_issue_if (); wire [`NW_BITS-1:0] wid; wire [`FRM_BITS-1:0] frm; diff --git 
a/hw/rtl/interfaces/VX_decode_if.v b/hw/rtl/interfaces/VX_decode_if.v index 4f82f7a5..b8f11e47 100644 --- a/hw/rtl/interfaces/VX_decode_if.v +++ b/hw/rtl/interfaces/VX_decode_if.v @@ -6,29 +6,26 @@ interface VX_decode_if (); wire valid; + wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] thread_mask; wire [31:0] curr_PC; wire [`EX_BITS-1:0] ex_type; wire [`OP_BITS-1:0] ex_op; + wire [`FRM_BITS-1:0] frm; + wire wb; wire [`NR_BITS-1:0] rd; wire [`NR_BITS-1:0] rs1; wire [`NR_BITS-1:0] rs2; + wire [`NR_BITS-1:0] rs3; wire [31:0] imm; wire rs1_is_PC; - wire rs2_is_imm; - - wire [`NUM_REGS-1:0] reg_use_mask; - - // FP states - wire [`NR_BITS-1:0] rs3; + wire rs2_is_imm; wire use_rs3; - wire [`FRM_BITS-1:0] frm; - - wire wb; + wire [`NUM_REGS-1:0] used_regs; wire ready; diff --git a/hw/rtl/interfaces/VX_exu_to_cmt_if.v b/hw/rtl/interfaces/VX_exu_to_cmt_if.v index 85ec1074..8ade9994 100644 --- a/hw/rtl/interfaces/VX_exu_to_cmt_if.v +++ b/hw/rtl/interfaces/VX_exu_to_cmt_if.v @@ -5,9 +5,14 @@ interface VX_exu_to_cmt_if (); - wire valid; - wire [`ISTAG_BITS-1:0] issue_tag; - wire [`NUM_THREADS-1:0][31:0] data; + wire valid; + wire [`NW_BITS-1:0] wid; + wire [`NUM_THREADS-1:0] thread_mask; + wire [31:0] curr_PC; + wire [`NUM_THREADS-1:0][31:0] data; + wire [`NR_BITS-1:0] rd; + wire wb; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_fpu_req_if.v b/hw/rtl/interfaces/VX_fpu_req_if.v index c920710d..fcaf61cd 100644 --- a/hw/rtl/interfaces/VX_fpu_req_if.v +++ b/hw/rtl/interfaces/VX_fpu_req_if.v @@ -10,20 +10,18 @@ interface VX_fpu_req_if (); wire valid; - wire [`ISTAG_BITS-1:0] issue_tag; + wire [`NW_BITS-1:0] wid; -`DEBUG_BEGIN wire [`NUM_THREADS-1:0] thread_mask; wire [31:0] curr_PC; -`DEBUG_END - wire [`FPU_BITS-1:0] op; wire [`FRM_BITS-1:0] frm; - wire [`NUM_THREADS-1:0][31:0] rs1_data; wire [`NUM_THREADS-1:0][31:0] rs2_data; wire [`NUM_THREADS-1:0][31:0] rs3_data; - + wire [`NR_BITS-1:0] rd; + wire wb; + wire ready; endinterface diff --git 
a/hw/rtl/interfaces/VX_fpu_to_cmt_if.v b/hw/rtl/interfaces/VX_fpu_to_cmt_if.v index b4a2e015..e0b857d0 100644 --- a/hw/rtl/interfaces/VX_fpu_to_cmt_if.v +++ b/hw/rtl/interfaces/VX_fpu_to_cmt_if.v @@ -5,11 +5,16 @@ interface VX_fpu_to_cmt_if (); - wire valid; - wire [`ISTAG_BITS-1:0] issue_tag; - wire [`NUM_THREADS-1:0][31:0] data; - wire has_fflags; - fflags_t [`NUM_THREADS-1:0] fflags; + wire valid; + wire [`NW_BITS-1:0] wid; + wire [`NUM_THREADS-1:0] thread_mask; + wire [31:0] curr_PC; + wire [`NUM_THREADS-1:0][31:0] data; + wire [`NR_BITS-1:0] rd; + wire wb; + wire has_fflags; + fflags_t [`NUM_THREADS-1:0] fflags; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_gpr_read_if.v b/hw/rtl/interfaces/VX_gpr_read_if.v index 4e444ba2..c9675bee 100644 --- a/hw/rtl/interfaces/VX_gpr_read_if.v +++ b/hw/rtl/interfaces/VX_gpr_read_if.v @@ -19,7 +19,8 @@ interface VX_gpr_read_if (); wire [`NUM_THREADS-1:0][31:0] rs2_data; wire [`NUM_THREADS-1:0][31:0] rs3_data; - wire ready; + wire ready_in; + wire ready_out; endinterface diff --git a/hw/rtl/interfaces/VX_gpu_req_if.v b/hw/rtl/interfaces/VX_gpu_req_if.v index 81661138..1ef02da6 100644 --- a/hw/rtl/interfaces/VX_gpu_req_if.v +++ b/hw/rtl/interfaces/VX_gpu_req_if.v @@ -6,15 +6,15 @@ interface VX_gpu_req_if(); wire valid; - wire [`ISTAG_BITS-1:0] issue_tag; + wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] thread_mask; wire [31:0] curr_PC; - wire [`GPU_BITS-1:0] op; - wire [`NUM_THREADS-1:0][31:0] rs1_data; wire [31:0] rs2_data; + wire [`NR_BITS-1:0] rd; + wire wb; wire ready; diff --git a/hw/rtl/interfaces/VX_issue_if.v b/hw/rtl/interfaces/VX_issue_if.v deleted file mode 100644 index 21e9b658..00000000 --- a/hw/rtl/interfaces/VX_issue_if.v +++ /dev/null @@ -1,39 +0,0 @@ -`ifndef VX_ISSUE_IF -`define VX_ISSUE_IF - -`include "VX_define.vh" - -interface VX_issue_if (); - - wire valid; - wire [`ISTAG_BITS-1:0] issue_tag; - wire [`NW_BITS-1:0] wid; - wire [`NUM_THREADS-1:0] thread_mask; - wire [31:0] curr_PC; - - wire 
[`EX_BITS-1:0] ex_type; - wire [`OP_BITS-1:0] ex_op; - - wire [`FRM_BITS-1:0] frm; - - wire wb; - - wire [`NR_BITS-1:0] rd; - - wire [`NUM_THREADS-1:0][31:0] rs1_data; - wire [`NUM_THREADS-1:0][31:0] rs2_data; - wire [`NUM_THREADS-1:0][31:0] rs3_data; - - wire [`NR_BITS-1:0] rs1; - wire [31:0] imm; - - wire rs1_is_PC; - wire rs2_is_imm; - - wire [`NT_BITS-1:0] tid; - - wire ready; - -endinterface - -`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_lsu_req_if.v b/hw/rtl/interfaces/VX_lsu_req_if.v index 1e0ab4fb..9d682af6 100644 --- a/hw/rtl/interfaces/VX_lsu_req_if.v +++ b/hw/rtl/interfaces/VX_lsu_req_if.v @@ -6,9 +6,9 @@ interface VX_lsu_req_if (); wire valid; - wire [`NUM_THREADS-1:0] thread_mask; - wire [`ISTAG_BITS-1:0] issue_tag; + wire [`NW_BITS-1:0] wid; + wire [`NUM_THREADS-1:0] thread_mask; wire [31:0] curr_PC; wire rw; diff --git a/hw/rtl/interfaces/VX_mul_req_if.v b/hw/rtl/interfaces/VX_mul_req_if.v index 6cd6432f..c646116f 100644 --- a/hw/rtl/interfaces/VX_mul_req_if.v +++ b/hw/rtl/interfaces/VX_mul_req_if.v @@ -10,16 +10,15 @@ interface VX_mul_req_if (); wire valid; - wire [`ISTAG_BITS-1:0] issue_tag; -`DEBUG_BEGIN + wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] thread_mask; wire [31:0] curr_PC; -`DEBUG_END wire [`MUL_BITS-1:0] op; - wire [`NUM_THREADS-1:0][31:0] rs1_data; wire [`NUM_THREADS-1:0][31:0] rs2_data; + wire [`NR_BITS-1:0] rd; + wire wb; wire ready; diff --git a/hw/rtl/interfaces/VX_warp_ctl_if.v b/hw/rtl/interfaces/VX_warp_ctl_if.v index 1f2e422a..c7eb8a4b 100644 --- a/hw/rtl/interfaces/VX_warp_ctl_if.v +++ b/hw/rtl/interfaces/VX_warp_ctl_if.v @@ -5,6 +5,7 @@ interface VX_warp_ctl_if (); + wire valid; wire [`NW_BITS-1:0] wid; gpu_tmc_t tmc; diff --git a/hw/rtl/interfaces/VX_wb_if.v b/hw/rtl/interfaces/VX_writeback_if.v similarity index 74% rename from hw/rtl/interfaces/VX_wb_if.v rename to hw/rtl/interfaces/VX_writeback_if.v index 062377ec..466c7398 100644 --- a/hw/rtl/interfaces/VX_wb_if.v +++ 
b/hw/rtl/interfaces/VX_writeback_if.v @@ -1,9 +1,9 @@ -`ifndef VX_WB_IF -`define VX_WB_IF +`ifndef VX_WRITEBACK_IF +`define VX_WRITEBACK_IF `include "VX_define.vh" -interface VX_wb_if (); +interface VX_writeback_if (); wire valid; wire [`NUM_THREADS-1:0] thread_mask; @@ -16,6 +16,8 @@ interface VX_wb_if (); wire [`NR_BITS-1:0] rd; wire [`NUM_THREADS-1:0][31:0] data; + wire ready; + endinterface `endif diff --git a/hw/rtl/libs/VX_cam_buffer.v b/hw/rtl/libs/VX_cam_buffer.v index 4f12ee36..dd5e2296 100644 --- a/hw/rtl/libs/VX_cam_buffer.v +++ b/hw/rtl/libs/VX_cam_buffer.v @@ -1,19 +1,21 @@ `include "VX_platform.vh" module VX_cam_buffer #( - parameter DATAW = 1, - parameter SIZE = 1, - parameter RPORTS = 1, + parameter DATAW = 1, + parameter SIZE = 1, + parameter RPORTS = 1, + parameter CPORTS = 1, parameter ADDRW = `LOG2UP(SIZE) ) ( input wire clk, input wire reset, - input wire [DATAW-1:0] write_data, output wire [ADDRW-1:0] write_addr, + input wire [DATAW-1:0] write_data, input wire acquire_slot, input wire [RPORTS-1:0][ADDRW-1:0] read_addr, output reg [RPORTS-1:0][DATAW-1:0] read_data, - input wire [RPORTS-1:0] release_slot, + input wire [CPORTS-1:0][ADDRW-1:0] release_addr, + input wire [CPORTS-1:0] release_slot, output wire full ); reg [DATAW-1:0] entries [SIZE-1:0]; @@ -34,12 +36,11 @@ module VX_cam_buffer #( always @(*) begin free_slots_n = free_slots; - for (integer i = 0; i < RPORTS; i++) begin + for (integer i = 0; i < CPORTS; i++) begin if (release_slot[i]) begin - free_slots_n[read_addr[i]] = 1; - end - read_data[i] = entries[read_addr[i]]; - end + free_slots_n[release_addr[i]] = 1; + end + end if (acquire_slot) begin free_slots_n[write_addr_r] = 0; end @@ -55,15 +56,19 @@ module VX_cam_buffer #( assert(1 == free_slots[write_addr]) else $display("%t: inused slot at port %d", $time, write_addr); entries[write_addr] <= write_data; end - for (integer i = 0; i < RPORTS; i++) begin + for (integer i = 0; i < CPORTS; i++) begin if (release_slot[i]) begin - 
assert(0 == free_slots[read_addr[i]]) else $display("%t: freed slot at port %d", $time, read_addr[i]); + assert(0 == free_slots[release_addr[i]]) else $display("%t: freed slot at port %d", $time, release_addr[i]); end end free_slots <= free_slots_n; write_addr_r <= free_index; full_r <= ~free_valid; end + end + + for (genvar i = 0; i < RPORTS; i++) begin + assign read_data[i] = entries[read_addr[i]]; end assign write_addr = write_addr_r; diff --git a/hw/rtl/libs/VX_elastic_buffer.v b/hw/rtl/libs/VX_elastic_buffer.v index fbd36b36..c551fbc6 100644 --- a/hw/rtl/libs/VX_elastic_buffer.v +++ b/hw/rtl/libs/VX_elastic_buffer.v @@ -14,53 +14,25 @@ module VX_elastic_buffer #( input wire ready_out, output wire valid_out ); - if (0 == SIZE) begin + wire empty, full; - reg [DATAW-1:0] skid_buffer; - reg skid_valid; + VX_generic_queue #( + .DATAW (DATAW), + .SIZE (SIZE), + .BUFFERED (BUFFERED) + ) queue ( + .clk (clk), + .reset (reset), + .push (valid_in), + .pop (ready_out), + .data_in(data_in), + .data_out(data_out), + .empty (empty), + .full (full), + `UNUSED_PIN (size) + ); - always @(posedge clk) begin - if (reset) begin - skid_valid <= 0; - end else begin - if (valid_in && ~ready_out) begin - assert(~skid_valid); - skid_buffer <= data_in; - skid_valid <= 1; - end - if (ready_out) begin - skid_valid <= 0; - end - end - end - - assign ready_in = ready_out || ~skid_valid; - assign data_out = skid_valid ? 
skid_buffer : data_in; - assign valid_out = valid_in || skid_valid; - - end else begin - - wire empty, full; - - VX_generic_queue #( - .DATAW (DATAW), - .SIZE (SIZE), - .BUFFERED (BUFFERED) - ) queue ( - .clk (clk), - .reset (reset), - .push (valid_in), - .pop (ready_out), - .data_in(data_in), - .data_out(data_out), - .empty (empty), - .full (full), - `UNUSED_PIN (size) - ); - - assign ready_in = ~full; - assign valid_out = ~empty; - - end + assign ready_in = ~full; + assign valid_out = ~empty; endmodule \ No newline at end of file diff --git a/hw/rtl/libs/VX_generic_queue.v b/hw/rtl/libs/VX_generic_queue.v index 1383ebcf..306baaac 100644 --- a/hw/rtl/libs/VX_generic_queue.v +++ b/hw/rtl/libs/VX_generic_queue.v @@ -70,7 +70,6 @@ module VX_generic_queue #( if (writing) begin data[wr_ptr_a] <= data_in; wr_ptr_r <= wr_ptr_r + 1; - if (!reading) begin size_r <= size_r + 1; end diff --git a/hw/rtl/libs/VX_rr_arbiter.v b/hw/rtl/libs/VX_rr_arbiter.v index e5fbc746..c876590e 100644 --- a/hw/rtl/libs/VX_rr_arbiter.v +++ b/hw/rtl/libs/VX_rr_arbiter.v @@ -36,14 +36,14 @@ module VX_rr_arbiter #( end end grant_onehot_r = N'(0); - grant_onehot_r[grant_index] = 1; + grant_onehot_r[grant_table[state]] = 1; end always @(posedge clk) begin if (reset) begin state <= 0; end else begin - state <= grant_index; + state <= grant_table[state]; end end diff --git a/hw/rtl/libs/VX_skid_buffer.v b/hw/rtl/libs/VX_skid_buffer.v new file mode 100644 index 00000000..63c3e4a3 --- /dev/null +++ b/hw/rtl/libs/VX_skid_buffer.v @@ -0,0 +1,65 @@ +`include "VX_platform.vh" + +module VX_skid_buffer #( + parameter DATAW = 1 +) ( + input wire clk, + input wire reset, + input wire valid_in, + output reg ready_in, + input wire [DATAW-1:0] data_in, + output reg [DATAW-1:0] data_out, + input wire ready_out, + output reg valid_out +); + reg [DATAW-1:0] buffer; + reg use_buffer; + + wire push = valid_in && ready_in; + + always @(posedge clk) begin + if (reset) begin + use_buffer <= 0; + valid_out <= 0; + end 
else begin + if (push && (valid_out && !ready_out)) begin + assert(!use_buffer); + use_buffer <= 1; + end + if (ready_out) begin + use_buffer <= 0; + end + if (push) begin + buffer <= data_in; + end + if (!valid_out || ready_out) begin + valid_out <= valid_in || use_buffer; + data_out <= use_buffer ? buffer : data_in; + end + end + end + + assign ready_in = !use_buffer; + + /*wire empty, full; + + VX_generic_queue #( + .DATAW (DATAW), + .SIZE (2), + .BUFFERED (0) + ) queue ( + .clk (clk), + .reset (reset), + .push (valid_in), + .pop (ready_out), + .data_in(data_in), + .data_out(data_out), + .empty (empty), + .full (full), + `UNUSED_PIN (size) + ); + + assign ready_in = ~full; + assign valid_out = ~empty;*/ + +endmodule \ No newline at end of file diff --git a/hw/simulate/testbench.cpp b/hw/simulate/testbench.cpp index 18c1c887..554974a5 100644 --- a/hw/simulate/testbench.cpp +++ b/hw/simulate/testbench.cpp @@ -3,7 +3,7 @@ #include #include -#define ALL_TESTS +//#define ALL_TESTS int main(int argc, char **argv) { if (argc == 1) { diff --git a/hw/syn/quartus/project.sdc b/hw/syn/quartus/project.sdc index 61b8cba9..e06c4389 100644 --- a/hw/syn/quartus/project.sdc +++ b/hw/syn/quartus/project.sdc @@ -1,6 +1,6 @@ set_time_format -unit ns -decimal_places 3 -create_clock -name {clk} -period "300 MHz" -waveform { 0.0 1.0 } [get_ports {clk}] +create_clock -name {clk} -period "240 MHz" -waveform { 0.0 1.0 } [get_ports {clk}] derive_pll_clocks -create_base_clocks derive_clock_uncertainty