From 6c123913382cb534f1fc4fca0def773b733d99a0 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Fri, 14 Aug 2020 21:50:14 -0700 Subject: [PATCH] pipeline refactoring - fmax >= 222 mhz --- driver/tests/dogfood/Makefile | 2 +- driver/tests/dogfood/dogfood.cpp | 3 + driver/tests/dogfood/testcases.h | 4 +- hw/opae/README | 1 + hw/opae/vortex_afu.qsf | 2 +- hw/opae/vortex_afu.sv | 3 +- hw/rtl/VX_alu_unit.v | 137 ++++++++-------- hw/rtl/VX_bru_unit.v | 56 +++++++ hw/rtl/VX_cluster.v | 8 +- hw/rtl/VX_commit.v | 87 +++++++---- hw/rtl/VX_csr_arb.v | 8 +- hw/rtl/VX_csr_data.v | 34 ++-- hw/rtl/VX_csr_io_arb.v | 6 +- hw/rtl/VX_csr_unit.v | 31 ++-- hw/rtl/VX_decode.v | 58 +++---- hw/rtl/VX_define.vh | 147 +++++++----------- hw/rtl/VX_execute.v | 25 ++- hw/rtl/VX_fpu_unit.v | 60 ++++--- hw/rtl/VX_gpr_fp_ctrl.v | 33 ++-- hw/rtl/VX_gpr_ram.v | 15 +- hw/rtl/VX_gpr_stage.v | 19 ++- hw/rtl/VX_gpu_unit.v | 93 +++++------ hw/rtl/VX_icache_stage.v | 12 +- hw/rtl/VX_ipdom_stack.v | 22 +-- hw/rtl/VX_issue.v | 148 +++++++----------- hw/rtl/VX_issue_demux.v | 130 +++++++++------- hw/rtl/VX_lsu_unit.v | 170 +++++++++++--------- hw/rtl/VX_mem_arb.v | 6 +- hw/rtl/VX_mul_unit.v | 164 +++++++++++--------- hw/rtl/VX_pipeline.v | 8 +- hw/rtl/VX_platform.vh | 3 + hw/rtl/VX_print_instr.vh | 35 +++-- hw/rtl/VX_scheduler.v | 82 ---------- hw/rtl/VX_scope.vh | 40 ++--- hw/rtl/VX_scoreboard.v | 73 +++++++++ hw/rtl/VX_types.vh | 59 +++++++ hw/rtl/VX_warp_sched.v | 181 ++++++++++------------ hw/rtl/VX_writeback.v | 64 ++++---- hw/rtl/Vortex.v | 5 +- hw/rtl/cache/VX_bank.v | 36 ++--- hw/rtl/cache/VX_cache.v | 8 +- hw/rtl/cache/VX_cache_core_req_bank_sel.v | 6 +- hw/rtl/cache/VX_cache_core_rsp_merge.v | 6 +- hw/rtl/cache/VX_cache_dram_req_arb.v | 3 +- hw/rtl/cache/VX_cache_miss_resrv.v | 9 +- hw/rtl/cache/VX_snp_forwarder.v | 6 +- hw/rtl/cache/VX_snp_rsp_arb.v | 3 +- hw/rtl/cache/VX_tag_data_access.v | 19 ++- hw/rtl/fp_cores/VX_fp_fpga.v | 4 +- hw/rtl/fp_cores/VX_fp_noncomp.v | 14 +- 
hw/rtl/fp_cores/VX_fpnew.v | 7 +- hw/rtl/fp_cores/altera/VX_fp_add.v | 4 +- hw/rtl/fp_cores/altera/VX_fp_div.v | 4 +- hw/rtl/fp_cores/altera/VX_fp_ftoi.v | 4 +- hw/rtl/fp_cores/altera/VX_fp_ftou.v | 4 +- hw/rtl/fp_cores/altera/VX_fp_itof.v | 4 +- hw/rtl/fp_cores/altera/VX_fp_madd.v | 4 +- hw/rtl/fp_cores/altera/VX_fp_msub.v | 4 +- hw/rtl/fp_cores/altera/VX_fp_mul.v | 4 +- hw/rtl/fp_cores/altera/VX_fp_sqrt.v | 4 +- hw/rtl/fp_cores/altera/VX_fp_sub.v | 4 +- hw/rtl/fp_cores/altera/VX_fp_utof.v | 4 +- hw/rtl/interfaces/VX_alu_req_if.v | 14 +- hw/rtl/interfaces/VX_branch_ctl_if.v | 2 +- hw/rtl/interfaces/VX_bru_req_if.v | 29 ++++ hw/rtl/interfaces/VX_cmt_to_csr_if.v | 2 +- hw/rtl/interfaces/VX_cmt_to_issue_if.v | 3 + hw/rtl/interfaces/VX_csr_req_if.v | 4 +- hw/rtl/interfaces/VX_csr_rsp_if.v | 15 ++ hw/rtl/interfaces/VX_csr_to_fpu_if.v | 2 +- hw/rtl/interfaces/VX_decode_if.v | 3 +- hw/rtl/interfaces/VX_exu_to_cmt_if.v | 7 +- hw/rtl/interfaces/VX_fpu_req_if.v | 6 +- hw/rtl/interfaces/VX_fpu_to_cmt_if.v | 1 - hw/rtl/interfaces/VX_fpu_to_csr_if.v | 2 +- hw/rtl/interfaces/VX_gpr_read_if.v | 2 +- hw/rtl/interfaces/VX_gpu_req_if.v | 10 +- hw/rtl/interfaces/VX_ifetch_req_if.v | 2 +- hw/rtl/interfaces/VX_ifetch_rsp_if.v | 2 +- hw/rtl/interfaces/VX_issue_if.v | 39 +++++ hw/rtl/interfaces/VX_join_if.v | 2 +- hw/rtl/interfaces/VX_lsu_req_if.v | 2 +- hw/rtl/interfaces/VX_mul_req_if.v | 4 +- hw/rtl/interfaces/VX_warp_ctl_if.v | 22 +-- hw/rtl/interfaces/VX_wb_if.v | 6 +- hw/rtl/interfaces/VX_wstall_if.v | 2 +- hw/rtl/libs/VX_cam_buffer.v | 18 +-- hw/rtl/libs/VX_countones.v | 4 +- hw/rtl/libs/VX_divide.v | 31 ++-- hw/rtl/libs/VX_elastic_buffer.v | 64 +++++--- hw/rtl/libs/VX_fair_arbiter.v | 12 +- hw/rtl/libs/VX_generic_queue.v | 3 +- hw/rtl/libs/VX_matrix_arbiter.v | 15 +- hw/rtl/libs/VX_multiplier.v | 18 +-- hw/rtl/libs/VX_onehot_encooder.v | 4 +- hw/rtl/libs/VX_priority_encoder.v | 6 +- hw/rtl/libs/VX_rr_arbiter.v | 9 +- hw/rtl/libs/VX_scope.v | 1 - hw/rtl/libs/VX_shift_register.v | 
4 +- hw/syn/quartus/cache/Makefile | 4 +- hw/syn/quartus/core/Makefile | 4 +- hw/syn/quartus/pipeline/Makefile | 4 +- hw/syn/quartus/project.sdc | 2 +- hw/syn/quartus/project.tcl | 1 + hw/syn/quartus/top/Makefile | 4 +- hw/syn/quartus/vortex/Makefile | 4 +- hw/unit_tests/VX_divide_tb.v | 37 ++--- 107 files changed, 1392 insertions(+), 1239 deletions(-) create mode 100644 hw/rtl/VX_bru_unit.v delete mode 100644 hw/rtl/VX_scheduler.v create mode 100644 hw/rtl/VX_scoreboard.v create mode 100644 hw/rtl/VX_types.vh create mode 100644 hw/rtl/interfaces/VX_bru_req_if.v create mode 100644 hw/rtl/interfaces/VX_csr_rsp_if.v create mode 100644 hw/rtl/interfaces/VX_issue_if.v diff --git a/driver/tests/dogfood/Makefile b/driver/tests/dogfood/Makefile index 72fdf50b..8a3a3bd6 100644 --- a/driver/tests/dogfood/Makefile +++ b/driver/tests/dogfood/Makefile @@ -18,7 +18,7 @@ VX_SRCS = kernel.c CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors -CXXFLAGS += -I../../include +CXXFLAGS += -I../../include -I../../../hw PROJECT = dogfood diff --git a/driver/tests/dogfood/dogfood.cpp b/driver/tests/dogfood/dogfood.cpp index 1a62959f..dc7e7ff0 100644 --- a/driver/tests/dogfood/dogfood.cpp +++ b/driver/tests/dogfood/dogfood.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include "testcases.h" #include "common.h" @@ -25,6 +26,7 @@ public: this->add_test("imul", new Test_IMUL()); this->add_test("idiv", new Test_IDIV()); this->add_test("idiv-mul", new Test_IDIV_MUL()); + #ifdef EXT_F_ENABLE this->add_test("fadd", new Test_FADD()); this->add_test("fsub", new Test_FSUB()); this->add_test("fmul", new Test_FMUL()); @@ -40,6 +42,7 @@ public: this->add_test("ftou", new Test_FTOU()); this->add_test("tof", new Test_ITOF()); this->add_test("utof", new Test_UTOF()); + #endif } ~TestMngr() { diff --git a/driver/tests/dogfood/testcases.h b/driver/tests/dogfood/testcases.h index 4669c857..311efc20 100644 --- a/driver/tests/dogfood/testcases.h +++ 
b/driver/tests/dogfood/testcases.h @@ -15,8 +15,8 @@ union Float_t { }; inline bool almost_equal_eps(float a, float b, float eps = std::numeric_limits<float>::epsilon()) { - auto tolerance = std::max(std::fabs(a), std::fabs(b)) * eps; - return std::fabs(a - b) <= tolerance; + auto tolerance = std::max(fabs(a), fabs(b)) * eps; + return fabs(a - b) <= tolerance; } inline bool almost_equal_ulp(float a, float b, int32_t ulp = 4) { diff --git a/hw/opae/README b/hw/opae/README index cafa5897..1b916d3c 100644 --- a/hw/opae/README +++ b/hw/opae/README @@ -62,6 +62,7 @@ make ase # tests ./run_ase.sh build_ase_1c ../../driver/tests/basic/basic -n 256 ./run_ase.sh build_ase_1c ../../driver/tests/demo/demo -n 16 +./run_ase.sh build_ase_1c ../../driver/tests/dogfood/dogfood -n 16 ./run_ase.sh build_ase_1c ../../benchmarks/opencl/vecadd/vecadd # modify "vsim_run.tcl" to dump VCD trace diff --git a/hw/opae/vortex_afu.qsf b/hw/opae/vortex_afu.qsf index 75c1bda1..8f3f724b 100644 --- a/hw/opae/vortex_afu.qsf +++ b/hw/opae/vortex_afu.qsf @@ -1,7 +1,7 @@ # Analysis & Synthesis Assignments set_global_assignment -name VERILOG_INPUT_VERSION SYSTEMVERILOG_2009 -set_global_assignment -name ADD_PASS_THROUGH_LOGIC_TO_INFERRED_RAMS ON +# set_global_assignment -name ADD_PASS_THROUGH_LOGIC_TO_INFERRED_RAMS ON set_global_assignment -name VERILOG_MACRO QUARTUS set_global_assignment -name VERILOG_MACRO SYNTHESIS set_global_assignment -name VERILOG_MACRO NDEBUG \ No newline at end of file diff --git a/hw/opae/vortex_afu.sv b/hw/opae/vortex_afu.sv index a059e8e7..1730eedb 100644 --- a/hw/opae/vortex_afu.sv +++ b/hw/opae/vortex_afu.sv @@ -1035,8 +1035,7 @@ wire [SCOPE_DATAW+1:0] scope_data_in_ste; assign scope_data_in_st[0] = {`SCOPE_SIGNALS_DATA_LIST `SCOPE_SIGNALS_UPD_LIST, scope_changed, scope_start}; assign scope_data_in_ste = scope_data_in_st[SCOPE_SR_DEPTH-1]; -genvar i; -for (i = 1; i < SCOPE_SR_DEPTH; i++) begin +for (genvar i = 1; i < SCOPE_SR_DEPTH; i++) begin VX_generic_register #( .N 
(SCOPE_DATAW+2) ) scope_sr ( diff --git a/hw/rtl/VX_alu_unit.v b/hw/rtl/VX_alu_unit.v index 3fe6c5b0..bd3e478d 100644 --- a/hw/rtl/VX_alu_unit.v +++ b/hw/rtl/VX_alu_unit.v @@ -10,98 +10,83 @@ module VX_alu_unit #( VX_alu_req_if alu_req_if, // Outputs - VX_branch_ctl_if branch_ctl_if, - VX_exu_to_cmt_if alu_commit_if + VX_exu_to_cmt_if alu_commit_if ); - reg [`NUM_THREADS-1:0][31:0] alu_result; - wire [`NUM_THREADS-1:0][32:0] sub_result; - wire [`NUM_THREADS-1:0][32:0] shift_result; + reg [`NUM_THREADS-1:0][31:0] alu_result; + + wire [`NUM_THREADS-1:0][31:0] addsub_result; + wire [`NUM_THREADS-1:0] less_result; + wire [`NUM_THREADS-1:0][31:0] shift_result; + reg [`NUM_THREADS-1:0][31:0] misc_result; - wire [`ALU_BITS-1:0] alu_op = alu_req_if.alu_op; + wire [`ALU_BITS-1:0] alu_op = `ALU_OP(alu_req_if.op); wire [`NUM_THREADS-1:0][31:0] alu_in1 = alu_req_if.rs1_data; wire [`NUM_THREADS-1:0][31:0] alu_in2 = alu_req_if.rs2_data; - genvar i; + wire [`NUM_THREADS-1:0][31:0] alu_in1_PC = alu_req_if.rs1_is_PC ? {`NUM_THREADS{alu_req_if.curr_PC}} : alu_in1; + wire [`NUM_THREADS-1:0][31:0] alu_in2_imm = alu_req_if.rs2_is_imm ? 
{`NUM_THREADS{alu_req_if.imm}} : alu_in2; - for (i = 0; i < `NUM_THREADS; i++) begin + wire negate_add = (alu_op == `ALU_SUB); + wire signed_less = (alu_op == `ALU_SLT); + wire signed_shift = (alu_op == `ALU_SRA); - wire [32:0] sub_in1 = {(alu_op != `ALU_SLTU) & (alu_op != `ALU_BLTU) & (alu_op != `ALU_BGEU) & alu_in1[i][31], alu_in1[i]}; - wire [32:0] sub_in2 = {(alu_op != `ALU_SLTU) & (alu_op != `ALU_BLTU) & (alu_op != `ALU_BGEU) & alu_in2[i][31], alu_in2[i]}; - assign sub_result[i] = $signed(sub_in1) - $signed(sub_in2); + for (genvar i = 0; i < `NUM_THREADS; i++) begin + wire [32:0] addsub_in1 = {alu_in1_PC[i], 1'b1}; + wire [32:0] addsub_in2 = {alu_in2_imm[i], 1'b0} ^ {33{negate_add}}; + `IGNORE_WARNINGS_BEGIN + wire [32:0] addsub_addd = addsub_in1 + addsub_in2; + `IGNORE_WARNINGS_END + assign addsub_result[i] = addsub_addd[32:1]; + end - wire [32:0] shift_in1 = {(alu_op == `ALU_SRA) & alu_in1[i][31], alu_in1[i]}; - assign shift_result[i] = $signed(shift_in1) >>> alu_in2[i][4:0]; - - always @(*) begin - case (alu_op) - `ALU_SUB: alu_result[i] = sub_result[i][31:0]; - `ALU_SLL: alu_result[i] = alu_in1[i] << alu_in2[i][4:0]; - `ALU_SLT, - `ALU_SLTU: alu_result[i] = 32'(sub_result[i][32]); - `ALU_XOR: alu_result[i] = alu_in1[i] ^ alu_in2[i]; - `ALU_SRL, - `ALU_SRA: alu_result[i] = shift_result[i][31:0]; - `ALU_OR: alu_result[i] = alu_in1[i] | alu_in2[i]; - `ALU_AND: alu_result[i] = alu_in1[i] & alu_in2[i]; - default: alu_result[i] = alu_in1[i] + alu_in2[i]; // ADD, LUI, AUIPC - endcase - end + for (genvar i = 0; i < `NUM_THREADS; i++) begin + wire [32:0] less_in1 = {signed_less & alu_in1[i][31], alu_in1[i]}; + wire [32:0] less_in2 = {signed_less & alu_in2_imm[i][31], alu_in2_imm[i]}; + assign less_result[i] = $signed(less_in1) < $signed(less_in2); + end + + for (genvar i = 0; i < `NUM_THREADS; i++) begin + wire [32:0] shift_in1 = {signed_shift & alu_in1[i][31], alu_in1[i]}; + `IGNORE_WARNINGS_BEGIN + wire [32:0] shift_value = $signed(shift_in1) >>> 
alu_in2_imm[i][4:0]; + `IGNORE_WARNINGS_END + assign shift_result[i] = shift_value[31:0]; end - wire [`NT_BITS-1:0] br_result_index; - - VX_priority_encoder #( - .N(`NUM_THREADS) - ) choose_alu_result ( - .data_in (alu_req_if.thread_mask), - .data_out (br_result_index), - `UNUSED_PIN (valid_out) - ); - - wire [32:0] br_result = sub_result[br_result_index]; - wire br_sign = br_result[32]; - wire br_nzero = (| br_result[31:0]); - wire br_sign_s1; - wire br_nzero_s1; - - wire [`BR_BITS-1:0] br_op = `IS_BR_OP(alu_req_if.alu_op) ? `BR_OP(alu_req_if.alu_op) : `BR_NO; - wire [`BR_BITS-1:0] br_op_s1; - - wire [31:0] br_addr = (br_op == `BR_JALR) ? alu_req_if.rs1_data[br_result_index] : alu_req_if.curr_PC; - wire [31:0] br_dest = $signed(br_addr) + $signed(alu_req_if.offset); - - wire is_jal = (alu_op == `ALU_JAL || alu_op == `ALU_JALR); - wire [`NUM_THREADS-1:0][31:0] alu_jal_result = is_jal ? {`NUM_THREADS{alu_req_if.next_PC}} : alu_result; - - wire stall = ~alu_commit_if.ready && alu_commit_if.valid; + for (genvar i = 0; i < `NUM_THREADS; i++) begin + always @(*) begin + case (alu_op) + `ALU_AND: misc_result[i] = alu_in1[i] & alu_in2_imm[i]; + `ALU_OR: misc_result[i] = alu_in1[i] | alu_in2_imm[i]; + `ALU_XOR: misc_result[i] = alu_in1[i] ^ alu_in2_imm[i]; + //`ALU_SLL, + default: misc_result[i] = alu_in1[i] << alu_in2_imm[i][4:0]; + endcase + end + end + + for (genvar i = 0; i < `NUM_THREADS; i++) begin + always @(*) begin + case (`ALU_OP_CLASS(alu_op)) + 0: alu_result[i] = addsub_result[i]; + 1: alu_result[i] = {31'b0, less_result[i]}; + 2: alu_result[i] = shift_result[i]; + default: alu_result[i] = misc_result[i]; + endcase + end + end VX_generic_register #( - .N(1 + `NW_BITS + `ISTAG_BITS + (`NUM_THREADS * 32) + `BR_BITS + 32 + 1 + 1) + .N(1 + `ISTAG_BITS + (`NUM_THREADS * 32)) ) alu_reg ( .clk (clk), .reset (reset), - .stall (stall), + .stall (0), .flush (0), - .in ({alu_req_if.valid, alu_req_if.warp_num, alu_req_if.issue_tag, alu_jal_result, br_op, br_dest, br_sign, 
br_nzero}), - .out ({alu_commit_if.valid, branch_ctl_if.warp_num, alu_commit_if.issue_tag, alu_commit_if.data, br_op_s1, branch_ctl_if.dest, br_sign_s1, br_nzero_s1}) - ); - - reg br_taken; - always @(*) begin - case (br_op_s1) - `BR_NE: br_taken = br_nzero_s1; - `BR_EQ: br_taken = ~br_nzero_s1; - `BR_LT, - `BR_LTU: br_taken = br_sign_s1; - `BR_GE, - `BR_GEU: br_taken = ~br_sign_s1; - default: br_taken = 1'b1; - endcase - end + .in ({alu_req_if.valid, alu_req_if.issue_tag, alu_result}), + .out ({alu_commit_if.valid, alu_commit_if.issue_tag, alu_commit_if.data}) + ); - assign branch_ctl_if.valid = alu_commit_if.valid && (br_op_s1 != `BR_NO); - assign branch_ctl_if.taken = br_taken; - - assign alu_req_if.ready = ~stall; + assign alu_req_if.ready = 1'b1; endmodule \ No newline at end of file diff --git a/hw/rtl/VX_bru_unit.v b/hw/rtl/VX_bru_unit.v new file mode 100644 index 00000000..4d6bbde5 --- /dev/null +++ b/hw/rtl/VX_bru_unit.v @@ -0,0 +1,56 @@ +`include "VX_define.vh" + +module VX_bru_unit #( + parameter CORE_ID = 0 +) ( + input wire clk, + input wire reset, + + // Inputs + VX_bru_req_if bru_req_if, + + // Outputs + VX_branch_ctl_if branch_ctl_if, + VX_exu_to_cmt_if bru_commit_if +); + wire [`BRU_BITS-1:0] bru_op = bru_req_if.op; + wire bru_neg = `BRU_NEG(bru_op); + wire bru_less = `BRU_LESS(bru_op); + wire bru_signed = `BRU_SIGNED(bru_op); + wire bru_static = `BRU_STATIC(bru_op); + + wire [31:0] rs1_data = bru_req_if.rs1_data; + wire [31:0] rs2_data = bru_req_if.rs2_data; + + wire [32:0] signed_in1 = {bru_signed & rs1_data[31], rs1_data}; + wire [32:0] signed_in2 = {bru_signed & rs2_data[31], rs2_data}; + wire is_less = $signed(signed_in1) < $signed(signed_in2); + + wire is_equal = (rs1_data == rs2_data); + + wire taken = ((bru_less ? is_less : is_equal) ^ bru_neg) | bru_static; + + wire [31:0] base_addr = bru_req_if.rs1_is_PC ? 
bru_req_if.curr_PC : rs1_data; + wire [31:0] dest = base_addr + bru_req_if.offset; + + wire [31:0] jal_result = bru_req_if.curr_PC + 4; + wire [31:0] jal_result_r; + + VX_generic_register #( + .N(1 + `NW_BITS + `ISTAG_BITS + 1 + 32 + 32) + ) bru_reg ( + .clk (clk), + .reset (reset), + .stall (0), + .flush (0), + .in ({bru_req_if.valid, bru_req_if.wid, bru_req_if.issue_tag, taken, dest, jal_result}), + .out ({bru_commit_if.valid, branch_ctl_if.wid, bru_commit_if.issue_tag, branch_ctl_if.taken, branch_ctl_if.dest, jal_result_r}) + ); + + assign branch_ctl_if.valid = bru_commit_if.valid; + + assign bru_commit_if.data = {`NUM_THREADS{jal_result_r}}; + + assign bru_req_if.ready = 1'b1; + +endmodule \ No newline at end of file diff --git a/hw/rtl/VX_cluster.v b/hw/rtl/VX_cluster.v index 2633bf7d..9a9c4a8f 100644 --- a/hw/rtl/VX_cluster.v +++ b/hw/rtl/VX_cluster.v @@ -135,9 +135,7 @@ module VX_cluster #( wire [`NUM_CORES-1:0] per_core_busy; wire [`NUM_CORES-1:0] per_core_ebreak; - genvar i; - - for (i = 0; i < `NUM_CORES; i++) begin + for (genvar i = 0; i < `NUM_CORES; i++) begin VX_core #( .CORE_ID(i + (CLUSTER_ID * `NUM_CORES)) ) core ( @@ -316,7 +314,7 @@ module VX_cluster #( wire[`NUM_CORES-1:0][`DSNP_TAG_WIDTH-1:0] l2_snp_fwdin_tag; wire[`NUM_CORES-1:0] l2_snp_fwdin_ready; - for (i = 0; i < `L2NUM_REQUESTS; i = i + 2) begin + for (genvar i = 0; i < `L2NUM_REQUESTS; i = i + 2) begin assign l2_core_req_valid [i] = per_core_D_dram_req_valid[(i/2)]; assign l2_core_req_valid [i+1] = per_core_I_dram_req_valid[(i/2)]; @@ -472,7 +470,7 @@ module VX_cluster #( wire[`NUM_CORES-1:0][`DSNP_TAG_WIDTH-1:0] arb_snp_fwdin_tag; wire[`NUM_CORES-1:0] arb_snp_fwdin_ready; - for (i = 0; i < `L2NUM_REQUESTS; i = i + 2) begin + for (genvar i = 0; i < `L2NUM_REQUESTS; i = i + 2) begin assign arb_dram_req_valid [i] = per_core_D_dram_req_valid[(i/2)]; assign arb_dram_req_valid [i+1] = per_core_I_dram_req_valid[(i/2)]; diff --git a/hw/rtl/VX_commit.v b/hw/rtl/VX_commit.v index 
d4c246c3..f6f6c594 100644 --- a/hw/rtl/VX_commit.v +++ b/hw/rtl/VX_commit.v @@ -8,6 +8,7 @@ module VX_commit #( // inputs VX_exu_to_cmt_if alu_commit_if, + VX_exu_to_cmt_if bru_commit_if, VX_exu_to_cmt_if lsu_commit_if, VX_exu_to_cmt_if mul_commit_if, VX_exu_to_cmt_if csr_commit_if, @@ -22,12 +23,13 @@ module VX_commit #( // update CRSs wire [`NUM_EXS-1:0] commited_mask; - assign commited_mask = {(alu_commit_if.valid && alu_commit_if.ready), - (lsu_commit_if.valid && lsu_commit_if.ready), - (csr_commit_if.valid && csr_commit_if.ready), - (mul_commit_if.valid && mul_commit_if.ready), - (fpu_commit_if.valid && fpu_commit_if.ready), - (gpu_commit_if.valid && gpu_commit_if.ready)}; + assign commited_mask = {alu_commit_if.valid, + bru_commit_if.valid, + lsu_commit_if.valid, + csr_commit_if.valid, + mul_commit_if.valid, + fpu_commit_if.valid, + gpu_commit_if.valid}; wire [`NE_BITS:0] num_commits; @@ -38,18 +40,10 @@ module VX_commit #( .count (num_commits) ); - assign cmt_to_csr_if.valid = (| commited_mask); - assign cmt_to_csr_if.warp_num = cmt_to_issue_if.fpu_data.warp_num; - assign cmt_to_csr_if.num_commits = num_commits; - - assign cmt_to_csr_if.has_fflags = (fpu_commit_if.valid && fpu_commit_if.ready) && fpu_commit_if.has_fflags; - - integer i; - fflags_t fflags; always @(*) begin fflags = 0; - for (i = 0; i < `NUM_THREADS; i++) begin + for (integer i = 0; i < `NUM_THREADS; i++) begin if (cmt_to_issue_if.fpu_data.thread_mask[i]) begin fflags.NX |= fpu_commit_if.fflags[i].NX; fflags.UF |= fpu_commit_if.fflags[i].UF; @@ -59,18 +53,39 @@ module VX_commit #( end end end - assign cmt_to_csr_if.fflags = fflags; + + fflags_t fflags_r; + reg has_fflags_r; + reg [`NW_BITS-1:0] wid_r; + reg [`NE_BITS:0] num_commits_r; + reg csr_update_r; + + always @(posedge clk) begin + csr_update_r <= (| commited_mask); + fflags_r <= fflags; + has_fflags_r <= fpu_commit_if.valid && fpu_commit_if.has_fflags; + wid_r <= cmt_to_issue_if.fpu_data.wid; + num_commits_r <= num_commits; + end + + 
assign cmt_to_csr_if.valid = csr_update_r; + assign cmt_to_csr_if.wid = wid_r; + assign cmt_to_csr_if.num_commits = num_commits_r; + assign cmt_to_csr_if.has_fflags = has_fflags_r; + assign cmt_to_csr_if.fflags = fflags_r; // Notify issue stage - assign cmt_to_issue_if.alu_valid = alu_commit_if.valid && alu_commit_if.ready; - assign cmt_to_issue_if.lsu_valid = lsu_commit_if.valid && lsu_commit_if.ready; - assign cmt_to_issue_if.csr_valid = csr_commit_if.valid && csr_commit_if.ready; - assign cmt_to_issue_if.mul_valid = mul_commit_if.valid && mul_commit_if.ready; - assign cmt_to_issue_if.fpu_valid = fpu_commit_if.valid && fpu_commit_if.ready; - assign cmt_to_issue_if.gpu_valid = gpu_commit_if.valid && gpu_commit_if.ready; + assign cmt_to_issue_if.alu_valid = alu_commit_if.valid; + assign cmt_to_issue_if.bru_valid = bru_commit_if.valid; + assign cmt_to_issue_if.lsu_valid = lsu_commit_if.valid; + assign cmt_to_issue_if.csr_valid = csr_commit_if.valid; + assign cmt_to_issue_if.mul_valid = mul_commit_if.valid; + assign cmt_to_issue_if.fpu_valid = fpu_commit_if.valid; + assign cmt_to_issue_if.gpu_valid = gpu_commit_if.valid; assign cmt_to_issue_if.alu_tag = alu_commit_if.issue_tag; + assign cmt_to_issue_if.bru_tag = bru_commit_if.issue_tag; assign cmt_to_issue_if.lsu_tag = lsu_commit_if.issue_tag; assign cmt_to_issue_if.csr_tag = csr_commit_if.issue_tag; assign cmt_to_issue_if.mul_tag = mul_commit_if.issue_tag; @@ -84,6 +99,7 @@ module VX_commit #( .reset (reset), .alu_commit_if (alu_commit_if), + .bru_commit_if (bru_commit_if), .lsu_commit_if (lsu_commit_if), .csr_commit_if (csr_commit_if), .mul_commit_if (mul_commit_if), @@ -96,23 +112,26 @@ module VX_commit #( `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin - if (alu_commit_if.valid && alu_commit_if.ready) begin - $display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=ALU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.alu_data.warp_num, cmt_to_issue_if.alu_data.curr_PC, 
alu_commit_if.issue_tag, cmt_to_issue_if.alu_data.thread_mask, cmt_to_issue_if.alu_data.wb, cmt_to_issue_if.alu_data.rd, alu_commit_if.data); + if (alu_commit_if.valid) begin + $display("%t: Core%0d-commit: wid=%0d, PC=%0h, ex=ALU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.alu_data.wid, cmt_to_issue_if.alu_data.curr_PC, alu_commit_if.issue_tag, cmt_to_issue_if.alu_data.thread_mask, cmt_to_issue_if.alu_data.wb, cmt_to_issue_if.alu_data.rd, alu_commit_if.data); end - if (lsu_commit_if.valid && lsu_commit_if.ready) begin - $display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=LSU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.lsu_data.warp_num, cmt_to_issue_if.lsu_data.curr_PC, lsu_commit_if.issue_tag, cmt_to_issue_if.lsu_data.thread_mask, cmt_to_issue_if.lsu_data.wb, cmt_to_issue_if.lsu_data.rd, lsu_commit_if.data); + if (bru_commit_if.valid) begin + $display("%t: Core%0d-commit: wid=%0d, PC=%0h, ex=BRU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.bru_data.wid, cmt_to_issue_if.bru_data.curr_PC, bru_commit_if.issue_tag, cmt_to_issue_if.bru_data.thread_mask, cmt_to_issue_if.bru_data.wb, cmt_to_issue_if.bru_data.rd, bru_commit_if.data); end - if (csr_commit_if.valid && csr_commit_if.ready) begin - $display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=CSR, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.csr_data.warp_num, cmt_to_issue_if.csr_data.curr_PC, csr_commit_if.issue_tag, cmt_to_issue_if.csr_data.thread_mask, cmt_to_issue_if.csr_data.wb, cmt_to_issue_if.csr_data.rd, csr_commit_if.data); + if (lsu_commit_if.valid) begin + $display("%t: Core%0d-commit: wid=%0d, PC=%0h, ex=LSU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.lsu_data.wid, cmt_to_issue_if.lsu_data.curr_PC, lsu_commit_if.issue_tag, cmt_to_issue_if.lsu_data.thread_mask, cmt_to_issue_if.lsu_data.wb, cmt_to_issue_if.lsu_data.rd, 
lsu_commit_if.data); + end + if (csr_commit_if.valid) begin + $display("%t: Core%0d-commit: wid=%0d, PC=%0h, ex=CSR, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.csr_data.wid, cmt_to_issue_if.csr_data.curr_PC, csr_commit_if.issue_tag, cmt_to_issue_if.csr_data.thread_mask, cmt_to_issue_if.csr_data.wb, cmt_to_issue_if.csr_data.rd, csr_commit_if.data); end - if (mul_commit_if.valid && mul_commit_if.ready) begin - $display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=MUL, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.mul_data.warp_num, cmt_to_issue_if.mul_data.curr_PC, mul_commit_if.issue_tag, cmt_to_issue_if.mul_data.thread_mask, cmt_to_issue_if.mul_data.wb, cmt_to_issue_if.mul_data.rd, mul_commit_if.data); + if (mul_commit_if.valid) begin + $display("%t: Core%0d-commit: wid=%0d, PC=%0h, ex=MUL, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.mul_data.wid, cmt_to_issue_if.mul_data.curr_PC, mul_commit_if.issue_tag, cmt_to_issue_if.mul_data.thread_mask, cmt_to_issue_if.mul_data.wb, cmt_to_issue_if.mul_data.rd, mul_commit_if.data); end - if (fpu_commit_if.valid && fpu_commit_if.ready) begin - $display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=FPU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.fpu_data.warp_num, cmt_to_issue_if.fpu_data.curr_PC, fpu_commit_if.issue_tag, cmt_to_issue_if.fpu_data.thread_mask, cmt_to_issue_if.fpu_data.wb, cmt_to_issue_if.fpu_data.rd, fpu_commit_if.data); + if (fpu_commit_if.valid) begin + $display("%t: Core%0d-commit: wid=%0d, PC=%0h, ex=FPU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.fpu_data.wid, cmt_to_issue_if.fpu_data.curr_PC, fpu_commit_if.issue_tag, cmt_to_issue_if.fpu_data.thread_mask, cmt_to_issue_if.fpu_data.wb, cmt_to_issue_if.fpu_data.rd, fpu_commit_if.data); end - if (gpu_commit_if.valid && gpu_commit_if.ready) begin - $display("%t: Core%0d-commit: warp=%0d, 
PC=%0h, ex=GPU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.gpu_data.warp_num, cmt_to_issue_if.gpu_data.curr_PC, gpu_commit_if.issue_tag, cmt_to_issue_if.gpu_data.thread_mask, cmt_to_issue_if.gpu_data.wb, cmt_to_issue_if.gpu_data.rd, gpu_commit_if.data); + if (gpu_commit_if.valid) begin + $display("%t: Core%0d-commit: wid=%0d, PC=%0h, ex=GPU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.gpu_data.wid, cmt_to_issue_if.gpu_data.curr_PC, gpu_commit_if.issue_tag, cmt_to_issue_if.gpu_data.thread_mask, cmt_to_issue_if.gpu_data.wb, cmt_to_issue_if.gpu_data.rd, gpu_commit_if.data); end end `endif diff --git a/hw/rtl/VX_csr_arb.v b/hw/rtl/VX_csr_arb.v index ed7e7f24..4cee91ff 100644 --- a/hw/rtl/VX_csr_arb.v +++ b/hw/rtl/VX_csr_arb.v @@ -12,7 +12,7 @@ module VX_csr_arb ( VX_csr_req_if csr_req_if, // input - VX_exu_to_cmt_if csr_rsp_if, + VX_csr_rsp_if csr_rsp_if, // outputs VX_exu_to_cmt_if csr_commit_if, @@ -28,9 +28,9 @@ module VX_csr_arb ( // requests assign csr_req_if.valid = (~select_io_req) ? csr_core_req_if.valid : csr_io_req_if.valid; assign csr_req_if.issue_tag = (~select_io_req) ? csr_core_req_if.issue_tag : 0; - assign csr_req_if.warp_num = (~select_io_req) ? csr_core_req_if.warp_num : 0; + assign csr_req_if.wid = (~select_io_req) ? csr_core_req_if.wid : 0; assign csr_req_if.curr_PC = (~select_io_req) ? csr_core_req_if.curr_PC : 0; - assign csr_req_if.csr_op = (~select_io_req) ? csr_core_req_if.csr_op : (csr_io_req_if.rw ? `CSR_RW : `CSR_RS); + assign csr_req_if.op = (~select_io_req) ? csr_core_req_if.op : (csr_io_req_if.rw ? `CSR_RW : `CSR_RS); assign csr_req_if.csr_addr = (~select_io_req) ? csr_core_req_if.csr_addr : csr_io_req_if.addr; assign csr_req_if.csr_mask = (~select_io_req) ? csr_core_req_if.csr_mask : (csr_io_req_if.rw ? csr_io_req_if.data : 32'b0); assign csr_req_if.rd = (~select_io_req) ? 
csr_core_req_if.rd : 0; @@ -48,6 +48,6 @@ module VX_csr_arb ( assign csr_commit_if.issue_tag= csr_rsp_if.issue_tag; assign csr_commit_if.data = csr_rsp_if.data; - assign csr_rsp_if.ready = select_io_rsp ? csr_io_rsp_if.ready : csr_commit_if.ready; + assign csr_rsp_if.ready = select_io_rsp ? csr_io_rsp_if.ready : 1'b1; endmodule diff --git a/hw/rtl/VX_csr_data.v b/hw/rtl/VX_csr_data.v index 71971aa4..bd426ba2 100644 --- a/hw/rtl/VX_csr_data.v +++ b/hw/rtl/VX_csr_data.v @@ -9,7 +9,7 @@ module VX_csr_data #( VX_cmt_to_csr_if cmt_to_csr_if, VX_csr_to_fpu_if csr_to_fpu_if, - input wire[`NW_BITS-1:0] warp_num, + input wire[`NW_BITS-1:0] wid, input wire read_enable, input wire[`CSR_ADDR_BITS-1:0] read_addr, @@ -38,24 +38,24 @@ module VX_csr_data #( always @(posedge clk) begin if (cmt_to_csr_if.has_fflags) begin - csr_fflags[cmt_to_csr_if.warp_num] <= cmt_to_csr_if.fflags; - csr_fcsr[cmt_to_csr_if.warp_num][`FFG_BITS-1:0] <= cmt_to_csr_if.fflags; + csr_fflags[cmt_to_csr_if.wid] <= cmt_to_csr_if.fflags; + csr_fcsr[cmt_to_csr_if.wid][`FFG_BITS-1:0] <= cmt_to_csr_if.fflags; end if (write_enable) begin case (write_addr) `CSR_FFLAGS: begin - csr_fcsr[warp_num][`FFG_BITS-1:0] <= write_data[`FFG_BITS-1:0]; - csr_fflags[warp_num] <= write_data[`FFG_BITS-1:0]; + csr_fcsr[wid][`FFG_BITS-1:0] <= write_data[`FFG_BITS-1:0]; + csr_fflags[wid] <= write_data[`FFG_BITS-1:0]; end `CSR_FRM: begin - csr_fcsr[warp_num][`FFG_BITS+`FRM_BITS-1:`FFG_BITS] <= write_data[`FRM_BITS-1:0]; - csr_frm[warp_num] <= write_data[`FRM_BITS-1:0]; + csr_fcsr[wid][`FFG_BITS+`FRM_BITS-1:`FFG_BITS] <= write_data[`FRM_BITS-1:0]; + csr_frm[wid] <= write_data[`FRM_BITS-1:0]; end `CSR_FCSR: begin - csr_fcsr[warp_num] <= write_data[`FFG_BITS+`FRM_BITS-1:0]; - csr_frm[warp_num] <= write_data[`FFG_BITS+`FRM_BITS-1:`FFG_BITS]; - csr_fflags[warp_num] <= write_data[`FFG_BITS-1:0]; + csr_fcsr[wid] <= write_data[`FFG_BITS+`FRM_BITS-1:0]; + csr_frm[wid] <= write_data[`FFG_BITS+`FRM_BITS-1:`FFG_BITS]; + csr_fflags[wid] <= 
write_data[`FFG_BITS-1:0]; end `CSR_SATP: csr_satp <= write_data; @@ -79,7 +79,7 @@ module VX_csr_data #( always @(posedge clk) begin if (reset) begin - csr_cycle <= 0; + csr_cycle <= 0; csr_instret <= 0; end else begin csr_cycle <= csr_cycle + 1; @@ -91,15 +91,15 @@ module VX_csr_data #( always @(*) begin case (read_addr) - `CSR_FFLAGS : read_data = 32'(csr_fflags[warp_num]); - `CSR_FRM : read_data = 32'(csr_frm[warp_num]); - `CSR_FCSR : read_data = 32'(csr_fcsr[warp_num]); + `CSR_FFLAGS : read_data = 32'(csr_fflags[wid]); + `CSR_FRM : read_data = 32'(csr_frm[wid]); + `CSR_FCSR : read_data = 32'(csr_fcsr[wid]); - `CSR_LWID : read_data = 32'(warp_num); + `CSR_LWID : read_data = 32'(wid); `CSR_LTID , `CSR_GTID , `CSR_MHARTID , - `CSR_GWID : read_data = CORE_ID * `NUM_WARPS + 32'(warp_num); + `CSR_GWID : read_data = CORE_ID * `NUM_WARPS + 32'(wid); `CSR_GCID : read_data = CORE_ID; `CSR_NT : read_data = `NUM_THREADS; `CSR_NW : read_data = `NUM_WARPS; @@ -134,6 +134,6 @@ module VX_csr_data #( endcase end - assign csr_to_fpu_if.frm = csr_frm[csr_to_fpu_if.warp_num]; + assign csr_to_fpu_if.frm = csr_frm[csr_to_fpu_if.wid]; endmodule \ No newline at end of file diff --git a/hw/rtl/VX_csr_io_arb.v b/hw/rtl/VX_csr_io_arb.v index 5e6782da..04097c55 100644 --- a/hw/rtl/VX_csr_io_arb.v +++ b/hw/rtl/VX_csr_io_arb.v @@ -51,9 +51,7 @@ module VX_csr_io_arb #( end else begin - genvar i; - - for (i = 0; i < NUM_REQUESTS; i++) begin + for (genvar i = 0; i < NUM_REQUESTS; i++) begin assign out_csr_io_req_valid[i] = in_csr_io_req_valid && (request_id == `REQS_BITS'(i)); assign out_csr_io_req_rw[i] = in_csr_io_req_rw; assign out_csr_io_req_addr[i] = in_csr_io_req_addr; @@ -78,7 +76,7 @@ module VX_csr_io_arb #( assign out_csr_io_rsp_valid = in_csr_io_rsp_valid [bus_rsp_sel]; assign out_csr_io_rsp_data = in_csr_io_rsp_data [bus_rsp_sel]; - for (i = 0; i < NUM_REQUESTS; i++) begin + for (genvar i = 0; i < NUM_REQUESTS; i++) begin assign in_csr_io_rsp_ready[i] = out_csr_io_rsp_ready && 
(bus_rsp_sel == `REQS_BITS'(i)); end diff --git a/hw/rtl/VX_csr_unit.v b/hw/rtl/VX_csr_unit.v index 14ea5fd8..c6a66b6c 100644 --- a/hw/rtl/VX_csr_unit.v +++ b/hw/rtl/VX_csr_unit.v @@ -15,11 +15,11 @@ module VX_csr_unit #( VX_csr_req_if csr_req_if, VX_exu_to_cmt_if csr_commit_if ); - VX_csr_req_if csr_pipe_req_if(); - VX_exu_to_cmt_if csr_pipe_commit_if(); + VX_csr_req_if csr_pipe_req_if(); + VX_csr_rsp_if csr_pipe_rsp_if(); - wire select_io_req = csr_io_req_if.valid; - wire select_io_rsp; + wire select_io_req = csr_io_req_if.valid; + wire select_io_rsp; VX_csr_arb csr_arb ( .clk (clk), @@ -29,7 +29,7 @@ module VX_csr_unit #( .csr_io_req_if (csr_io_req_if), .csr_req_if (csr_pipe_req_if), - .csr_rsp_if (csr_pipe_commit_if), + .csr_rsp_if (csr_pipe_rsp_if), .csr_io_rsp_if (csr_io_rsp_if), .csr_commit_if (csr_commit_if), @@ -41,7 +41,7 @@ module VX_csr_unit #( wire [`CSR_ADDR_BITS-1:0] csr_addr_s1; wire [31:0] csr_read_data, csr_read_data_s1; wire [31:0] csr_updated_data_s1; - wire [`NW_BITS-1:0] warp_num_s1; + wire [`NW_BITS-1:0] wid_s1; VX_csr_data #( .CORE_ID(CORE_ID) @@ -56,12 +56,12 @@ module VX_csr_unit #( .write_enable (csr_we_s1), .write_data (csr_updated_data_s1[`CSR_WIDTH-1:0]), .write_addr (csr_addr_s1), - .warp_num (csr_pipe_req_if.warp_num) + .wid (csr_pipe_req_if.wid) ); wire csr_hazard = (csr_addr_s1 == csr_pipe_req_if.csr_addr) - && (warp_num_s1 == csr_pipe_req_if.warp_num) - && csr_pipe_commit_if.valid; + && (wid_s1 == csr_pipe_req_if.wid) + && csr_pipe_rsp_if.valid; wire [31:0] csr_read_data_qual = csr_hazard ? 
csr_updated_data_s1 : csr_read_data; @@ -71,7 +71,7 @@ module VX_csr_unit #( always @(*) begin csr_we_s0_unqual = 0; - case (csr_pipe_req_if.csr_op) + case (csr_pipe_req_if.op) `CSR_RW: begin csr_updated_data = csr_pipe_req_if.csr_mask; csr_we_s0_unqual = 1; @@ -90,7 +90,7 @@ module VX_csr_unit #( wire csr_we_s0 = csr_we_s0_unqual && csr_pipe_req_if.valid; - wire stall = ~csr_pipe_commit_if.ready && csr_pipe_commit_if.valid; + wire stall = ~csr_pipe_rsp_if.ready && csr_pipe_rsp_if.valid; VX_generic_register #( .N(1 + `ISTAG_BITS + `NW_BITS + 1 + `CSR_ADDR_BITS + 1 + 32 + 32) @@ -99,13 +99,12 @@ module VX_csr_unit #( .reset (reset), .stall (stall), .flush (0), - .in ({csr_pipe_req_if.valid, csr_pipe_req_if.issue_tag, csr_pipe_req_if.warp_num, csr_we_s0, csr_pipe_req_if.csr_addr, csr_pipe_req_if.is_io, csr_read_data_qual, csr_updated_data}), - .out ({csr_pipe_commit_if.valid, csr_pipe_commit_if.issue_tag, warp_num_s1, csr_we_s1, csr_addr_s1, select_io_rsp, csr_read_data_s1, csr_updated_data_s1}) + .in ({csr_pipe_req_if.valid, csr_pipe_req_if.issue_tag, csr_pipe_req_if.wid, csr_we_s0, csr_pipe_req_if.csr_addr, csr_pipe_req_if.is_io, csr_read_data_qual, csr_updated_data}), + .out ({csr_pipe_rsp_if.valid, csr_pipe_rsp_if.issue_tag, wid_s1, csr_we_s1, csr_addr_s1, select_io_rsp, csr_read_data_s1, csr_updated_data_s1}) ); - genvar i; - for (i = 0; i < `NUM_THREADS; i++) begin - assign csr_pipe_commit_if.data[i] = (csr_addr_s1 == `CSR_LTID) ? i : + for (genvar i = 0; i < `NUM_THREADS; i++) begin + assign csr_pipe_rsp_if.data[i] = (csr_addr_s1 == `CSR_LTID) ? i : (csr_addr_s1 == `CSR_GTID) ? 
(csr_read_data_s1 * `NUM_THREADS + i) : csr_read_data_s1; end diff --git a/hw/rtl/VX_decode.v b/hw/rtl/VX_decode.v index 5c71df81..3b9bd6c1 100644 --- a/hw/rtl/VX_decode.v +++ b/hw/rtl/VX_decode.v @@ -19,7 +19,7 @@ module VX_decode #( wire [31:0] instr = ifetch_rsp_if.instr; reg [`ALU_BITS-1:0] alu_op; - reg [`BR_BITS-1:0] br_op; + reg [`BRU_BITS-1:0] br_op; reg [`LSU_BITS-1:0] lsu_op; reg [`CSR_BITS-1:0] csr_op; reg [`MUL_BITS-1:0] mul_op; @@ -100,27 +100,27 @@ module VX_decode #( wire is_br = (is_btype || is_jal || is_jalr || is_jals); always @(*) begin - br_op = `BR_EQ; + br_op = `BRU_OTHER; case (opcode) `INST_B: begin case (func3) - 3'h0: br_op = `BR_EQ; - 3'h1: br_op = `BR_NE; - 3'h4: br_op = `BR_LT; - 3'h5: br_op = `BR_GE; - 3'h6: br_op = `BR_LTU; - 3'h7: br_op = `BR_GEU; + 3'h0: br_op = `BRU_EQ; + 3'h1: br_op = `BRU_NE; + 3'h4: br_op = `BRU_LT; + 3'h5: br_op = `BRU_GE; + 3'h6: br_op = `BRU_LTU; + 3'h7: br_op = `BRU_GEU; default:; endcase end - `INST_JAL: br_op = `BR_JAL; - `INST_JALR: br_op = `BR_JALR; + `INST_JAL: br_op = `BRU_JAL; + `INST_JALR: br_op = `BRU_JALR; `INST_SYS: begin - if (is_jals && u_12 == 12'h000) br_op = `BR_ECALL; - if (is_jals && u_12 == 12'h001) br_op = `BR_EBREAK; - if (is_jals && u_12 == 12'h302) br_op = `BR_MRET; - if (is_jals && u_12 == 12'h102) br_op = `BR_SRET; - if (is_jals && u_12 == 12'h7B2) br_op = `BR_DRET; + if (is_jals && u_12 == 12'h000) br_op = `BRU_ECALL; + if (is_jals && u_12 == 12'h001) br_op = `BRU_EBREAK; + if (is_jals && u_12 == 12'h302) br_op = `BRU_MRET; + if (is_jals && u_12 == 12'h102) br_op = `BRU_SRET; + if (is_jals && u_12 == 12'h7B2) br_op = `BRU_DRET; end default:; endcase @@ -292,18 +292,17 @@ module VX_decode #( VX_decode_if decode_tmp_if(); - assign decode_tmp_if.valid = ifetch_rsp_if.valid; - assign decode_tmp_if.warp_num = ifetch_rsp_if.warp_num; - assign decode_tmp_if.thread_mask= ifetch_rsp_if.thread_mask; - assign decode_tmp_if.curr_PC = ifetch_rsp_if.curr_PC; - assign decode_tmp_if.next_PC = 
ifetch_rsp_if.curr_PC + 32'h4; + assign decode_tmp_if.valid = ifetch_rsp_if.valid; + assign decode_tmp_if.wid = ifetch_rsp_if.wid; + assign decode_tmp_if.thread_mask = ifetch_rsp_if.thread_mask; + assign decode_tmp_if.curr_PC = ifetch_rsp_if.curr_PC; assign decode_tmp_if.ex_type = is_lsu ? `EX_LSU : is_csr ? `EX_CSR : is_mul ? `EX_MUL : is_fpu ? `EX_FPU : is_gpu ? `EX_GPU : - is_br ? `EX_ALU : + is_br ? `EX_BRU : (is_rtype || is_itype || is_lui || is_auipc) ? `EX_ALU : `EX_NOP; @@ -312,7 +311,7 @@ module VX_decode #( is_mul ? `OP_BITS'(mul_op) : is_fpu ? `OP_BITS'(fpu_op) : is_gpu ? `OP_BITS'(gpu_op) : - is_br ? `OP_BITS'({1'b1, br_op}) : + is_br ? `OP_BITS'(br_op) : (is_rtype || is_itype || is_lui || is_auipc) ? `OP_BITS'(alu_op) : 0; @@ -347,28 +346,28 @@ module VX_decode #( is_csr ? 32'(u_12) : src2_imm; - assign decode_tmp_if.rs1_is_PC = is_auipc; + assign decode_tmp_if.rs1_is_PC = is_auipc || is_btype || is_jal || is_jals; assign decode_tmp_if.rs2_is_imm = is_itype || is_lui || is_auipc || is_csr_imm; assign decode_tmp_if.frm = func3; assign join_if.is_join = valid_in && is_gpu && (gpu_op == `GPU_JOIN); - assign join_if.warp_num = ifetch_rsp_if.warp_num; + assign join_if.wid = ifetch_rsp_if.wid; assign wstall_if.wstall = valid_in && (is_btype || is_jal || is_jalr || (is_gpu && (gpu_op == `GPU_TMC || gpu_op == `GPU_SPLIT || gpu_op == `GPU_BAR))); - assign wstall_if.warp_num = ifetch_rsp_if.warp_num; + assign wstall_if.wid = ifetch_rsp_if.wid; wire stall = ~decode_if.ready && decode_if.valid; VX_generic_register #( - .N(1 + `NW_BITS + `NUM_THREADS + 32 + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + `EX_BITS + `OP_BITS + 1 + `NR_BITS + 1 + 1 + `FRM_BITS + `NUM_REGS) + .N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + `EX_BITS + `OP_BITS + 1 + `NR_BITS + 1 + 1 + `FRM_BITS + `NUM_REGS) ) decode_reg ( .clk (clk), .reset (reset), .stall (stall), .flush (0), - .in ({decode_tmp_if.valid, decode_tmp_if.warp_num, decode_tmp_if.thread_mask, 
decode_tmp_if.curr_PC, decode_tmp_if.next_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.ex_type, decode_tmp_if.ex_op, decode_tmp_if.wb, decode_tmp_if.rs3, decode_tmp_if.use_rs3, decode_tmp_if.frm, decode_tmp_if.reg_use_mask}), - .out ({decode_if.valid, decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.next_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.ex_type, decode_if.ex_op, decode_if.wb, decode_if.rs3, decode_if.use_rs3, decode_if.frm, decode_if.reg_use_mask}) + .in ({decode_tmp_if.valid, decode_tmp_if.wid, decode_tmp_if.thread_mask, decode_tmp_if.curr_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.ex_type, decode_tmp_if.ex_op, decode_tmp_if.wb, decode_tmp_if.rs3, decode_tmp_if.use_rs3, decode_tmp_if.frm, decode_tmp_if.reg_use_mask}), + .out ({decode_if.valid, decode_if.wid, decode_if.thread_mask, decode_if.curr_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.ex_type, decode_if.ex_op, decode_if.wb, decode_if.rs3, decode_if.use_rs3, decode_if.frm, decode_if.reg_use_mask}) ); assign ifetch_rsp_if.ready = ~stall; @@ -376,7 +375,7 @@ module VX_decode #( `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin if (decode_tmp_if.valid && ~stall) begin - $write("%t: Core%0d-Decode: warp=%0d, PC=%0h, ex=", $time, CORE_ID, decode_tmp_if.warp_num, decode_tmp_if.curr_PC); + $write("%t: Core%0d-Decode: wid=%0d, PC=%0h, ex=", $time, CORE_ID, decode_tmp_if.wid, decode_tmp_if.curr_PC); print_ex_type(decode_tmp_if.ex_type); $write(", op="); print_ex_op(decode_tmp_if.ex_type, decode_tmp_if.ex_op); @@ -386,6 +385,7 @@ module VX_decode #( // trap unsupported instructions assert(~(~stall && (decode_tmp_if.ex_type == `EX_ALU) && 
`ALU_OP(decode_tmp_if.ex_op) == `ALU_OTHER)); + assert(~(~stall && (decode_tmp_if.ex_type == `EX_BRU) && `BRU_OP(decode_tmp_if.ex_op) == `BRU_OTHER)); assert(~(~stall && (decode_tmp_if.ex_type == `EX_CSR) && `CSR_OP(decode_tmp_if.ex_op) == `CSR_OTHER)); assert(~(~stall && (decode_tmp_if.ex_type == `EX_GPU) && `GPU_OP(decode_tmp_if.ex_op) == `GPU_OTHER)); end diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index 20e02bd0..f81b06ce 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -3,7 +3,6 @@ `include "VX_platform.vh" `include "VX_config.vh" -`include "VX_scope.vh" /////////////////////////////////////////////////////////////////////////////// @@ -38,8 +37,8 @@ /////////////////////////////////////////////////////////////////////////////// -`define LATENCY_IDIV 23 -`define LATENCY_IMUL 2 +`define LATENCY_IDIV 33 +`define LATENCY_IMUL 3 `define LATENCY_FDIV 16 `define LATENCY_FSQRT 10 @@ -87,72 +86,70 @@ `define BYTEEN_BITS 3 `define BYTEEN_TYPE(x) x[1:0] -/////////////////////////////////////////////////////////////////////////////// - -`define BR_EQ 4'h0 -`define BR_NE 4'h1 -`define BR_LT 4'h2 -`define BR_GE 4'h3 -`define BR_LTU 4'h4 -`define BR_GEU 4'h5 -`define BR_JAL 4'h6 -`define BR_JALR 4'h7 -`define BR_ECALL 4'h8 -`define BR_EBREAK 4'h9 -`define BR_MRET 4'hA -`define BR_SRET 4'hB -`define BR_DRET 4'hC -`define BR_NO 4'hF -`define BR_BITS 4 +`define FRM_RNE 3'b000 // round to nearest even +`define FRM_RTZ 3'b001 // round to zero +`define FRM_RDN 3'b010 // round to -inf +`define FRM_RUP 3'b011 // round to +inf +`define FRM_RMM 3'b100 // round to nearest max magnitude +`define FRM_DYN 3'b111 // dynamic mode +`define FRM_BITS 3 /////////////////////////////////////////////////////////////////////////////// `define EX_NOP 3'h0 `define EX_ALU 3'h1 -`define EX_LSU 3'h2 -`define EX_CSR 3'h3 -`define EX_MUL 3'h4 -`define EX_FPU 3'h5 -`define EX_GPU 3'h6 +`define EX_BRU 3'h2 +`define EX_LSU 3'h3 +`define EX_CSR 3'h4 +`define EX_MUL 3'h5 +`define 
EX_FPU 3'h6 +`define EX_GPU 3'h7 `define EX_BITS 3 -`define NUM_EXS 6 +`define NUM_EXS 7 `define NE_BITS `LOG2UP(`NUM_EXS) /////////////////////////////////////////////////////////////////////////////// `define OP_BITS 5 -`define ALU_ADD 5'h00 -`define ALU_SUB 5'h01 -`define ALU_SLL 5'h02 -`define ALU_SRL 5'h03 -`define ALU_SRA 5'h04 -`define ALU_SLT 5'h05 -`define ALU_SLTU 5'h06 -`define ALU_XOR 5'h07 -`define ALU_OR 5'h08 -`define ALU_AND 5'h09 -`define ALU_LUI 5'h0A -`define ALU_AUIPC 5'h0B -`define ALU_BEQ {1'b1, `BR_EQ} -`define ALU_BNE {1'b1, `BR_NE} -`define ALU_BLT {1'b1, `BR_LT} -`define ALU_BGE {1'b1, `BR_GE} -`define ALU_BLTU {1'b1, `BR_LTU} -`define ALU_BGEU {1'b1, `BR_GEU} -`define ALU_JAL {1'b1, `BR_JAL} -`define ALU_JALR {1'b1, `BR_JALR} -`define ALU_ECALL {1'b1, `BR_ECALL} -`define ALU_EBREAK {1'b1, `BR_EBREAK} -`define ALU_MRET {1'b1, `BR_MRET} -`define ALU_SRET {1'b1, `BR_SRET} -`define ALU_DRET {1'b1, `BR_DRET} -`define ALU_OTHER 5'h1F -`define ALU_BITS 5 +`define ALU_ADD 4'b0000 +`define ALU_SUB 4'b0001 +`define ALU_LUI 4'b0010 +`define ALU_AUIPC 4'b0011 +`define ALU_SLT 4'b0100 +`define ALU_SLTU 4'b0101 +`define ALU_SRL 4'b1000 +`define ALU_SRA 4'b1001 +`define ALU_AND 4'b1100 +`define ALU_OR 4'b1101 +`define ALU_XOR 4'b1110 +`define ALU_SLL 4'b1111 +`define ALU_OTHER 4'b0111 +`define ALU_BITS 4 `define ALU_OP(x) x[`ALU_BITS-1:0] -`define BR_OP(x) x[`BR_BITS-1:0] -`define IS_BR_OP(x) x[4] +`define ALU_OP_CLASS(x) x[3:2] + +`define BRU_EQ 4'b0000 +`define BRU_NE 4'b0001 +`define BRU_LTU 4'b0010 +`define BRU_GEU 4'b0011 +`define BRU_LT 4'b0110 +`define BRU_GE 4'b0111 +`define BRU_JAL 4'b1000 +`define BRU_JALR 4'b1001 +`define BRU_ECALL 4'b1010 +`define BRU_EBREAK 4'b1011 +`define BRU_MRET 4'b1100 +`define BRU_SRET 4'b1101 +`define BRU_DRET 4'b1110 +`define BRU_OTHER 4'b1111 +`define BRU_BITS 4 +`define BRU_OP(x) x[`BRU_BITS-1:0] +`define BRU_NEG(x) x[0] +`define BRU_LESS(x) x[1] +`define BRU_SIGNED(x) x[2] +`define BRU_STATIC(x) x[3] `define 
LSU_LB {1'b0, `BYTEEN_SB} `define LSU_LH {1'b0, `BYTEEN_SH} @@ -213,14 +210,6 @@ `define FPU_BITS 5 `define FPU_OP(x) x[`FPU_BITS-1:0] -`define FRM_RNE 3'b000 // round to nearest even -`define FRM_RTZ 3'b001 // round to zero -`define FRM_RDN 3'b010 // round to -inf -`define FRM_RUP 3'b011 // round to +inf -`define FRM_RMM 3'b100 // round to nearest max magnitude -`define FRM_DYN 3'b111 // dynamic mode -`define FRM_BITS 3 - `define GPU_TMC 3'h0 `define GPU_WSPAWN 3'h1 `define GPU_SPLIT 3'h2 @@ -273,7 +262,7 @@ /////////////////////////////////////////////////////////////////////////////// -`ifdef DBG_CORE_REQ_INFO // pc, wb, rd, warp_num +`ifdef DBG_CORE_REQ_INFO // pc, wb, rd, wid `define DEBUG_CORE_REQ_MDATA_WIDTH (32 + 1 + `NR_BITS + `NW_BITS) `else `define DEBUG_CORE_REQ_MDATA_WIDTH 0 @@ -421,34 +410,6 @@ `define DRAM_TO_BYTE_ADDR(x) {x, (32-$bits(x))'(0)} -/////////////////////////////////////////////////////////////////////////////// - -typedef struct packed { - logic [`NW_BITS-1:0] warp_num; - logic [`NUM_THREADS-1:0] thread_mask; - logic [31:0] curr_PC; - logic [`NR_BITS-1:0] rd; - logic wb; -} issue_data_t; - -typedef struct packed { - logic is_normal; - logic is_zero; - logic is_subnormal; - logic is_inf; - logic is_nan; - logic is_signaling; - logic is_quiet; -} fp_type_t; - -typedef struct packed { - logic NV; // Invalid - logic DZ; // Divide by zero - logic OF; // Overflow - logic UF; // Underflow - logic NX; // Inexact -} fflags_t; - -`define FFG_BITS $bits(fflags_t) +`include "VX_types.vh" `endif diff --git a/hw/rtl/VX_execute.v b/hw/rtl/VX_execute.v index 81c76419..78d2436f 100644 --- a/hw/rtl/VX_execute.v +++ b/hw/rtl/VX_execute.v @@ -22,6 +22,7 @@ module VX_execute #( // inputs VX_alu_req_if alu_req_if, + VX_bru_req_if bru_req_if, VX_lsu_req_if lsu_req_if, VX_csr_req_if csr_req_if, VX_mul_req_if mul_req_if, @@ -32,6 +33,7 @@ module VX_execute #( VX_branch_ctl_if branch_ctl_if, VX_warp_ctl_if warp_ctl_if, VX_exu_to_cmt_if alu_commit_if, + 
VX_exu_to_cmt_if bru_commit_if, VX_exu_to_cmt_if lsu_commit_if, VX_exu_to_cmt_if csr_commit_if, VX_exu_to_cmt_if mul_commit_if, @@ -49,10 +51,19 @@ module VX_execute #( .clk (clk), .reset (reset), .alu_req_if (alu_req_if), - .branch_ctl_if (branch_ctl_if), .alu_commit_if (alu_commit_if) ); + VX_bru_unit #( + .CORE_ID(CORE_ID) + ) bru_unit ( + .clk (clk), + .reset (reset), + .bru_req_if (bru_req_if), + .branch_ctl_if (branch_ctl_if), + .bru_commit_if (bru_commit_if) + ); + VX_lsu_unit #( .CORE_ID(CORE_ID) ) lsu_unit ( @@ -116,29 +127,33 @@ module VX_execute #( VX_gpu_unit #( .CORE_ID(CORE_ID) ) gpu_unit ( + .clk (clk), + .reset (reset), .gpu_req_if (gpu_req_if), .warp_ctl_if (warp_ctl_if), .gpu_commit_if (gpu_commit_if) ); - assign ebreak = alu_req_if.valid && (alu_req_if.alu_op == `ALU_EBREAK || alu_req_if.alu_op == `ALU_ECALL); + assign ebreak = bru_req_if.valid + && (bru_req_if.op == `BRU_EBREAK + || bru_req_if.op == `BRU_ECALL); `SCOPE_ASSIGN (scope_decode_valid, decode_if.valid); - `SCOPE_ASSIGN (scope_decode_warp_num, decode_if.warp_num); + `SCOPE_ASSIGN (scope_decode_wid, decode_if.wid); `SCOPE_ASSIGN (scope_decode_curr_PC, decode_if.curr_PC); `SCOPE_ASSIGN (scope_decode_is_jal, decode_if.is_jal); `SCOPE_ASSIGN (scope_decode_rs1, decode_if.rs1); `SCOPE_ASSIGN (scope_decode_rs2, decode_if.rs2); `SCOPE_ASSIGN (scope_execute_valid, alu_req_if.valid); - `SCOPE_ASSIGN (scope_execute_warp_num, alu_req_if.warp_num); + `SCOPE_ASSIGN (scope_execute_wid, alu_req_if.wid); `SCOPE_ASSIGN (scope_execute_curr_PC, alu_req_if.curr_PC); `SCOPE_ASSIGN (scope_execute_rd, alu_req_if.rd); `SCOPE_ASSIGN (scope_execute_a, alu_req_if.rs1_data); `SCOPE_ASSIGN (scope_execute_b, alu_req_if.rs2_data); `SCOPE_ASSIGN (scope_writeback_valid, writeback_if.valid); - `SCOPE_ASSIGN (scope_writeback_warp_num, writeback_if.warp_num); + `SCOPE_ASSIGN (scope_writeback_wid, writeback_if.wid); `SCOPE_ASSIGN (scope_writeback_curr_PC, writeback_if.curr_PC); `SCOPE_ASSIGN (scope_writeback_wb, 
writeback_if.wb); `SCOPE_ASSIGN (scope_writeback_rd, writeback_if.rd); diff --git a/hw/rtl/VX_fpu_unit.v b/hw/rtl/VX_fpu_unit.v index cc8de7dc..a255d2cc 100644 --- a/hw/rtl/VX_fpu_unit.v +++ b/hw/rtl/VX_fpu_unit.v @@ -14,9 +14,27 @@ module VX_fpu_unit #( // outputs VX_fpu_to_cmt_if fpu_commit_if ); - - assign csr_to_fpu_if.warp_num = fpu_req_if.warp_num; - wire [`FRM_BITS-1:0] frm = (fpu_req_if.frm == `FRM_DYN) ? csr_to_fpu_if.frm : fpu_req_if.frm; + VX_fpu_req_if fpu_req_tmp_if(); + + // resolve dynamic FRM + wire [`FRM_BITS-1:0] frm, frm_tmp; + assign csr_to_fpu_if.wid = fpu_req_if.wid; + assign frm = (fpu_req_if.frm == `FRM_DYN) ? csr_to_fpu_if.frm : fpu_req_if.frm; + + // use a skid buffer since fpcore has realtime backpressure + VX_elastic_buffer #( + .DATAW (`ISTAG_BITS + `NW_BITS + 32 + `FPU_BITS + `FRM_BITS + (3 * `NUM_THREADS * 32)), + .SIZE (0) + ) input_buffer ( + .clk (clk), + .reset (reset), + .valid_in (fpu_req_if.valid), + .ready_in (fpu_req_if.ready), + .data_in ({fpu_req_if.issue_tag, fpu_req_if.wid, fpu_req_if.curr_PC, fpu_req_if.op, frm, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}), + .data_out ({fpu_req_tmp_if.issue_tag, fpu_req_tmp_if.wid, fpu_req_tmp_if.curr_PC, fpu_req_tmp_if.op, frm_tmp, fpu_req_tmp_if.rs1_data, fpu_req_tmp_if.rs2_data, fpu_req_tmp_if.rs3_data}), + .ready_out (fpu_req_tmp_if.ready), + .valid_out (fpu_req_tmp_if.valid) + ); `ifdef SYNTHESIS @@ -24,17 +42,17 @@ module VX_fpu_unit #( .clk (clk), .reset (reset), - .valid_in (fpu_req_if.valid), - .ready_in (fpu_req_if.ready), + .valid_in (fpu_req_tmp_if.valid), + .ready_in (fpu_req_tmp_if.ready), - .tag_in (fpu_req_if.issue_tag), + .tag_in (fpu_req_tmp_if.issue_tag), - .op (fpu_req_if.fpu_op), - .frm (frm), + .op (fpu_req_tmp_if.op), + .frm (frm_tmp), - .dataa (fpu_req_if.rs1_data), - .datab (fpu_req_if.rs2_data), - .datac (fpu_req_if.rs3_data), + .dataa (fpu_req_tmp_if.rs1_data), + .datab (fpu_req_tmp_if.rs2_data), + .datac (fpu_req_tmp_if.rs3_data), .result 
(fpu_commit_if.data), .has_fflags (fpu_commit_if.has_fflags), @@ -42,7 +60,7 @@ module VX_fpu_unit #( .tag_out (fpu_commit_if.issue_tag), - .ready_out (fpu_commit_if.ready), + .ready_out (1'b1), .valid_out (fpu_commit_if.valid) ); @@ -57,17 +75,17 @@ module VX_fpu_unit #( .clk (clk), .reset (reset), - .valid_in (fpu_req_if.valid), - .ready_in (fpu_req_if.ready), + .valid_in (fpu_req_tmp_if.valid), + .ready_in (fpu_req_tmp_if.ready), - .tag_in (fpu_req_if.issue_tag), + .tag_in (fpu_req_tmp_if.issue_tag), - .op (fpu_req_if.fpu_op), - .frm (frm), + .op (fpu_req_tmp_if.op), + .frm (frm_tmp), - .dataa (fpu_req_if.rs1_data), - .datab (fpu_req_if.rs2_data), - .datac (fpu_req_if.rs3_data), + .dataa (fpu_req_tmp_if.rs1_data), + .datab (fpu_req_tmp_if.rs2_data), + .datac (fpu_req_tmp_if.rs3_data), .result (fpu_commit_if.data), .has_fflags (fpu_commit_if.has_fflags), @@ -75,7 +93,7 @@ module VX_fpu_unit #( .tag_out (fpu_commit_if.issue_tag), - .ready_out (fpu_commit_if.ready), + .ready_out (1'b1), .valid_out (fpu_commit_if.valid) ); diff --git a/hw/rtl/VX_gpr_fp_ctrl.v b/hw/rtl/VX_gpr_fp_ctrl.v index 0e9dbe99..ff9eea79 100644 --- a/hw/rtl/VX_gpr_fp_ctrl.v +++ b/hw/rtl/VX_gpr_fp_ctrl.v @@ -10,40 +10,43 @@ module VX_gpr_fp_ctrl ( input wire [`NUM_THREADS-1:0][31:0] rs2_data, // outputs - output wire [`NW_BITS+`NR_BITS-1:0] raddr1, - + output wire [`NW_BITS+`NR_BITS-1:0] raddr1, VX_gpr_read_if gpr_read_if ); - reg [`NUM_THREADS-1:0][31:0] tmp_rs1_data; + reg [`NUM_THREADS-1:0][31:0] rs1_tmp_data, rs2_tmp_data, rs3_tmp_data; reg read_rs3; wire rs3_delay = gpr_read_if.valid && gpr_read_if.use_rs3 && ~read_rs3; - wire read_fire = gpr_read_if.valid && read_rs3; always @(posedge clk) begin if (reset) begin read_rs3 <= 0; - end else if (rs3_delay) begin - read_rs3 <= 1; - end else if (read_fire) begin - read_rs3 <= 0; - end + end else begin + if (rs3_delay) begin + read_rs3 <= 1; + end else if (read_fire) begin + read_rs3 <= 0; + end + end end // backup original rs1 data always 
@(posedge clk) begin - if (rs3_delay) begin - tmp_rs1_data <= rs1_data; + if (~gpr_read_if.use_rs3 || rs3_delay) begin + rs1_tmp_data <= rs1_data; end + rs2_tmp_data <= rs2_data; + rs3_tmp_data <= rs1_data; end // outputs - assign raddr1 = {gpr_read_if.warp_num, (read_rs3 ? gpr_read_if.rs3 : gpr_read_if.rs1)}; + wire [`NR_BITS-1:0] rs1 = read_rs3 ? gpr_read_if.rs3 : gpr_read_if.rs1; + assign raddr1 = {gpr_read_if.wid, rs1}; assign gpr_read_if.ready = ~rs3_delay; - assign gpr_read_if.rs1_data = gpr_read_if.use_rs3 ? tmp_rs1_data : rs1_data; - assign gpr_read_if.rs2_data = rs2_data; - assign gpr_read_if.rs3_data = rs1_data; + assign gpr_read_if.rs1_data = rs1_tmp_data; + assign gpr_read_if.rs2_data = rs2_tmp_data; + assign gpr_read_if.rs3_data = rs3_tmp_data; endmodule \ No newline at end of file diff --git a/hw/rtl/VX_gpr_ram.v b/hw/rtl/VX_gpr_ram.v index bceb7d01..5299e5ce 100644 --- a/hw/rtl/VX_gpr_ram.v +++ b/hw/rtl/VX_gpr_ram.v @@ -20,12 +20,7 @@ module VX_gpr_ram ( for (integer i = 0; i < `NUM_REGS; i++) begin if (i == 0) begin ram[j * `NUM_REGS + i] = {`NUM_THREADS{32'h00000000}}; // set r0 = 0 - end - `ifndef SYNTHESIS - else begin - ram[j * `NUM_REGS + i] = {`NUM_THREADS{32'hdeadbeef}}; end - `endif end end end @@ -48,8 +43,7 @@ module VX_gpr_ram ( wire [`NUM_THREADS-1:0][31:0] write_bit_mask; - integer i; - for (i = 0; i < `NUM_THREADS; i++) begin + for (integer i = 0; i < `NUM_THREADS; i++) begin assign write_bit_mask[i] = {32{~we[i]}}; end @@ -61,9 +55,8 @@ module VX_gpr_ram ( wire [`NUM_THREADS-1:0][31:0] tmp_b; `ifndef SYNTHESIS - integer j; - for (i = 0; i < `NUM_THREADS; i++) begin - for (j = 0; j < 32; j++) begin + for (integer i = 0; i < `NUM_THREADS; i++) begin + for (integer j = 0; j < 32; j++) begin assign rs1_data[i][j] = ((tmp_a[i][j] === 1'dx) || cena_1) ? 1'b0 : tmp_a[i][j]; assign rs2_data[i][j] = ((tmp_b[i][j] === 1'dx) || cena_2) ? 
1'b0 : tmp_b[i][j]; end @@ -72,7 +65,7 @@ module VX_gpr_ram ( assign rs1_data = tmp_a; assign rs2_data = tmp_b; `endif - for (i = 0; i < 'NT; i=i+4) begin + for (integer i = 0; i < 'NT; i=i+4) begin `IGNORE_WARNINGS_BEGIN rf2_32x128_wm1 first_ram ( .CENYA(), diff --git a/hw/rtl/VX_gpr_stage.v b/hw/rtl/VX_gpr_stage.v index 8a05137f..bcf55b56 100644 --- a/hw/rtl/VX_gpr_stage.v +++ b/hw/rtl/VX_gpr_stage.v @@ -21,10 +21,10 @@ module VX_gpr_stage #( VX_gpr_ram gpr_ram ( .clk (clk), .we ({`NUM_THREADS{writeback_if.valid}} & writeback_if.thread_mask), - .waddr ({writeback_if.warp_num, writeback_if.rd}), + .waddr ({writeback_if.wid, writeback_if.rd}), .wdata (writeback_if.data), .rs1 (raddr1), - .rs2 ({gpr_read_if.warp_num, gpr_read_if.rs2}), + .rs2 ({gpr_read_if.wid, gpr_read_if.rs2}), .rs1_data (rs1_data), .rs2_data (rs2_data) ); @@ -39,9 +39,16 @@ module VX_gpr_stage #( .gpr_read_if(gpr_read_if) ); `else - assign raddr1 = {gpr_read_if.warp_num, gpr_read_if.rs1}; - assign gpr_read_if.rs1_data = rs1_data; - assign gpr_read_if.rs2_data = rs2_data; + reg [`NUM_THREADS-1:0][31:0] rs1_tmp_data, rs2_tmp_data; + + always @(posedge clk) begin + rs1_tmp_data <= rs1_data; + rs2_tmp_data <= rs2_data; + end + + assign raddr1 = {gpr_read_if.wid, gpr_read_if.rs1}; + assign gpr_read_if.rs1_data = rs1_tmp_data; + assign gpr_read_if.rs2_data = rs2_tmp_data; assign gpr_read_if.rs3_data = 0; assign gpr_read_if.ready = 1; @@ -53,6 +60,4 @@ module VX_gpr_stage #( `UNUSED_VAR (rs3); `endif - assign writeback_if.ready = 1'b1; // writes are stall-free - endmodule diff --git a/hw/rtl/VX_gpu_unit.v b/hw/rtl/VX_gpu_unit.v index bde5d1c9..8c85f267 100644 --- a/hw/rtl/VX_gpu_unit.v +++ b/hw/rtl/VX_gpu_unit.v @@ -3,6 +3,9 @@ module VX_gpu_unit #( parameter CORE_ID = 0 ) ( + input wire clk, + input wire reset, + // Input VX_gpu_req_if gpu_req_if, @@ -10,74 +13,74 @@ module VX_gpu_unit #( VX_warp_ctl_if warp_ctl_if, VX_exu_to_cmt_if gpu_commit_if ); + gpu_tmc_t tmc; + gpu_wspawn_t wspawn; + 
gpu_barrier_t barrier; + gpu_split_t split; - wire is_wspawn = (gpu_req_if.gpu_op == `GPU_WSPAWN); - wire is_tmc = (gpu_req_if.gpu_op == `GPU_TMC); - wire is_split = (gpu_req_if.gpu_op == `GPU_SPLIT); - wire is_bar = (gpu_req_if.gpu_op == `GPU_BAR); + wire is_wspawn = (gpu_req_if.op == `GPU_WSPAWN); + wire is_tmc = (gpu_req_if.op == `GPU_TMC); + wire is_split = (gpu_req_if.op == `GPU_SPLIT); + wire is_bar = (gpu_req_if.op == `GPU_BAR); - wire gpu_req_fire = gpu_req_if.valid && gpu_commit_if.ready; - - assign warp_ctl_if.warp_num = gpu_req_if.warp_num; + wire gpu_req_fire = gpu_req_if.valid; // tmc - genvar i; - wire [`NUM_THREADS-1:0] tmc_new_mask; - for (i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < `NUM_THREADS; i++) begin assign tmc_new_mask[i] = (i < gpu_req_if.rs1_data[0]); end - assign warp_ctl_if.change_mask = is_tmc && gpu_req_fire; - assign warp_ctl_if.thread_mask = tmc_new_mask; - - // barrier - - assign warp_ctl_if.is_barrier = is_bar && gpu_req_fire; - assign warp_ctl_if.barrier_id = gpu_req_if.rs1_data[0][`NB_BITS-1:0]; - assign warp_ctl_if.barrier_num_warps = (`NW_BITS+1)'(gpu_req_if.rs2_data - 1); + assign tmc.valid = gpu_req_fire && is_tmc; + assign tmc.thread_mask = tmc_new_mask; // wspawn wire [31:0] wspawn_pc = gpu_req_if.rs2_data; wire [`NUM_WARPS-1:0] wspawn_wmask; - for (i = 0; i < `NUM_WARPS; i++) begin + for (genvar i = 0; i < `NUM_WARPS; i++) begin assign wspawn_wmask[i] = (i < gpu_req_if.rs1_data[0]); end - assign warp_ctl_if.wspawn = is_wspawn && gpu_req_fire; - assign warp_ctl_if.wspawn_pc = wspawn_pc; - assign warp_ctl_if.wspawn_wmask = wspawn_wmask; + assign wspawn.valid = gpu_req_fire && is_wspawn; + assign wspawn.wmask = wspawn_wmask; + assign wspawn.pc = wspawn_pc; // split - wire[`NUM_THREADS-1:0] split_new_use_mask; - wire[`NUM_THREADS-1:0] split_new_later_mask; + wire [`NUM_THREADS-1:0] split_then_mask; + wire [`NUM_THREADS-1:0] split_else_mask; - for (i = 0; i < `NUM_THREADS; i++) begin - wire curr_bool = 
(gpu_req_if.rs1_data[i] == 32'b1); - assign split_new_use_mask[i] = gpu_req_if.thread_mask[i] & (curr_bool); - assign split_new_later_mask[i] = gpu_req_if.thread_mask[i] & (!curr_bool); + for (genvar i = 0; i < `NUM_THREADS; i++) begin + wire taken = gpu_req_if.rs1_data[i][0]; + assign split_then_mask[i] = gpu_req_if.thread_mask[i] & taken; + assign split_else_mask[i] = gpu_req_if.thread_mask[i] & ~taken; end - wire [`NT_BITS:0] num_valids; + assign split.valid = gpu_req_fire && is_split; + assign split.diverged = (| split_then_mask) && (| split_else_mask); + assign split.then_mask = split_then_mask; + assign split.else_mask = split_else_mask; + assign split.pc = gpu_req_if.curr_PC + 4; - VX_countones #( - .N(`NUM_THREADS) - ) valids_counter ( - .valids(gpu_req_if.thread_mask), - .count (num_valids) - ); + // barrier - assign warp_ctl_if.is_split = is_split && (num_valids > 1) && gpu_req_fire; - assign warp_ctl_if.do_split = (split_new_use_mask != 0) && (split_new_use_mask != {`NUM_THREADS{1'b1}}); - assign warp_ctl_if.split_new_mask = split_new_use_mask; - assign warp_ctl_if.split_later_mask = split_new_later_mask; - assign warp_ctl_if.split_save_pc = gpu_req_if.next_PC; + assign barrier.valid = is_bar && gpu_req_fire; + assign barrier.id = gpu_req_if.rs1_data[0][`NB_BITS-1:0]; + assign barrier.num_warps = (`NW_BITS+1)'(gpu_req_if.rs2_data - 1); - // commit - assign gpu_commit_if.valid = gpu_req_if.valid; - assign gpu_commit_if.issue_tag = gpu_req_if.issue_tag; - assign gpu_commit_if.data = 0; - assign gpu_req_if.ready = gpu_commit_if.ready; + // output + + VX_generic_register #( + .N(1 + `ISTAG_BITS + `NW_BITS + $bits(gpu_tmc_t) + $bits(gpu_wspawn_t) + $bits(gpu_split_t) + $bits(gpu_barrier_t)) + ) gpu_reg ( + .clk (clk), + .reset (reset), + .stall (0), + .flush (0), + .in ({gpu_req_if.valid, gpu_req_if.issue_tag, gpu_req_if.wid, tmc, wspawn, split, barrier}), + .out ({gpu_commit_if.valid, gpu_commit_if.issue_tag, warp_ctl_if.wid, warp_ctl_if.tmc, 
warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.barrier}) + ); + + assign gpu_req_if.ready = 1'b1; endmodule \ No newline at end of file diff --git a/hw/rtl/VX_icache_stage.v b/hw/rtl/VX_icache_stage.v index 276a24a2..9af23468 100644 --- a/hw/rtl/VX_icache_stage.v +++ b/hw/rtl/VX_icache_stage.v @@ -25,7 +25,7 @@ module VX_icache_stage #( wire icache_req_fire = icache_req_if.valid && icache_req_if.ready; - wire [`NW_BITS-1:0] req_tag = ifetch_req_if.warp_num; + wire [`NW_BITS-1:0] req_tag = ifetch_req_if.wid; wire [`NW_BITS-1:0] rsp_tag = icache_rsp_if.tag[0][`NW_BITS-1:0]; always @(posedge clk) begin @@ -46,13 +46,13 @@ module VX_icache_stage #( assign ifetch_req_if.ready = icache_req_if.ready; `ifdef DBG_CORE_REQ_INFO - assign icache_req_if.tag = {ifetch_req_if.curr_PC, 1'b0, `NR_BITS'(0), ifetch_req_if.warp_num, req_tag}; + assign icache_req_if.tag = {ifetch_req_if.curr_PC, 1'b0, `NR_BITS'(0), ifetch_req_if.wid, req_tag}; `else assign icache_req_if.tag = req_tag; `endif assign ifetch_rsp_if.valid = icache_rsp_if.valid; - assign ifetch_rsp_if.warp_num = rsp_tag; + assign ifetch_rsp_if.wid = rsp_tag; assign ifetch_rsp_if.thread_mask = rsp_thread_mask_buf[rsp_tag]; assign ifetch_rsp_if.curr_PC = rsp_curr_PC_buf[rsp_tag]; assign ifetch_rsp_if.instr = icache_rsp_if.data[0]; @@ -61,7 +61,7 @@ module VX_icache_stage #( assign icache_rsp_if.ready = ifetch_rsp_if.ready; `SCOPE_ASSIGN (scope_icache_req_valid, icache_req_if.valid); - `SCOPE_ASSIGN (scope_icache_req_warp_num, ifetch_req_if.warp_num); + `SCOPE_ASSIGN (scope_icache_req_wid, ifetch_req_if.wid); `SCOPE_ASSIGN (scope_icache_req_addr, {icache_req_if.addr, 2'b0}); `SCOPE_ASSIGN (scope_icache_req_tag, icache_req_if.tag); `SCOPE_ASSIGN (scope_icache_req_ready, icache_req_if.ready); @@ -74,10 +74,10 @@ module VX_icache_stage #( `ifdef DBG_PRINT_CORE_ICACHE always @(posedge clk) begin if (icache_req_if.valid && icache_req_if.ready) begin - $display("%t: I$%0d req: warp=%0d, PC=%0h", $time, CORE_ID, 
ifetch_req_if.warp_num, ifetch_req_if.curr_PC); + $display("%t: I$%0d req: wid=%0d, PC=%0h", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.curr_PC); end if (icache_rsp_if.valid && icache_rsp_if.ready) begin - $display("%t: I$%0d rsp: warp=%0d, PC=%0h, instr=%0h", $time, CORE_ID, ifetch_rsp_if.warp_num, ifetch_rsp_if.curr_PC, ifetch_rsp_if.instr); + $display("%t: I$%0d rsp: wid=%0d, PC=%0h, instr=%0h", $time, CORE_ID, ifetch_rsp_if.wid, ifetch_rsp_if.curr_PC, ifetch_rsp_if.instr); end end `endif diff --git a/hw/rtl/VX_ipdom_stack.v b/hw/rtl/VX_ipdom_stack.v index d12f7d98..23ea2745 100644 --- a/hw/rtl/VX_ipdom_stack.v +++ b/hw/rtl/VX_ipdom_stack.v @@ -26,16 +26,18 @@ module VX_ipdom_stack #( always @(posedge clk) begin if (reset) begin wr_ptr <= 0; - end else if (push) begin - stack_1[wr_ptr] <= q1; - stack_2[wr_ptr] <= q2; - is_part[wr_ptr] <= 0; - rd_ptr <= wr_ptr; - wr_ptr <= wr_ptr + 1; - end else if (pop) begin - wr_ptr <= wr_ptr - DEPTH'(is_part[rd_ptr]); - rd_ptr <= rd_ptr - DEPTH'(is_part[rd_ptr]); - is_part[rd_ptr] <= 1; + end else begin + if (push) begin + stack_1[wr_ptr] <= q1; + stack_2[wr_ptr] <= q2; + is_part[wr_ptr] <= 0; + rd_ptr <= wr_ptr; + wr_ptr <= wr_ptr + 1; + end else if (pop) begin + wr_ptr <= wr_ptr - DEPTH'(is_part[rd_ptr]); + rd_ptr <= rd_ptr - DEPTH'(is_part[rd_ptr]); + is_part[rd_ptr] <= 1; + end end end diff --git a/hw/rtl/VX_issue.v b/hw/rtl/VX_issue.v index 295434cd..6dcc1f7e 100644 --- a/hw/rtl/VX_issue.v +++ b/hw/rtl/VX_issue.v @@ -11,6 +11,7 @@ module VX_issue #( VX_cmt_to_issue_if cmt_to_issue_if, VX_alu_req_if alu_req_if, + VX_bru_req_if bru_req_if, VX_lsu_req_if lsu_req_if, VX_csr_req_if csr_req_if, VX_mul_req_if mul_req_if, @@ -23,13 +24,14 @@ module VX_issue #( VX_gpr_read_if gpr_read_if(); assign gpr_read_if.valid = decode_if.valid && ~schedule_delay; - assign gpr_read_if.warp_num = decode_if.warp_num; + assign gpr_read_if.wid = decode_if.wid; assign gpr_read_if.rs1 = decode_if.rs1; assign gpr_read_if.rs2 = decode_if.rs2; 
assign gpr_read_if.rs3 = decode_if.rs3; assign gpr_read_if.use_rs3 = decode_if.use_rs3; wire ex_busy = (~alu_req_if.ready && (decode_if.ex_type == `EX_ALU)) + || (~bru_req_if.ready && (decode_if.ex_type == `EX_BRU)) || (~lsu_req_if.ready && (decode_if.ex_type == `EX_LSU)) || (~csr_req_if.ready && (decode_if.ex_type == `EX_CSR)) `ifdef EXT_M_ENABLE @@ -40,9 +42,9 @@ module VX_issue #( `endif || (~gpu_req_if.ready && (decode_if.ex_type == `EX_GPU)); - VX_scheduler #( + VX_scoreboard #( .CORE_ID(CORE_ID) - ) scheduler ( + ) scoreboard ( .clk (clk), .reset (reset), .decode_if (decode_if), @@ -62,117 +64,71 @@ module VX_issue #( .gpr_read_if (gpr_read_if) ); - VX_alu_req_if alu_req_tmp_if(); - VX_lsu_req_if lsu_req_tmp_if(); - VX_csr_req_if csr_req_tmp_if(); - VX_mul_req_if mul_req_tmp_if(); - VX_fpu_req_if fpu_req_tmp_if(); - VX_gpu_req_if gpu_req_tmp_if(); + VX_issue_if issue_if(); - VX_issue_demux issue_demux ( - .decode_if (decode_if), - .gpr_read_if(gpr_read_if), - .issue_tag (issue_tag), - .alu_req_if (alu_req_tmp_if), - .lsu_req_if (lsu_req_tmp_if), - .csr_req_if (csr_req_tmp_if), - .mul_req_if (mul_req_tmp_if), - .fpu_req_if (fpu_req_tmp_if), - .gpu_req_if (gpu_req_tmp_if) - ); + assign issue_if.rs1_data = gpr_read_if.rs1_data; + assign issue_if.rs2_data = gpr_read_if.rs2_data; + assign issue_if.rs3_data = gpr_read_if.rs3_data; - wire stall = schedule_delay || ~gpr_read_if.ready; - assign decode_if.ready = ~stall; + wire [`NT_BITS-1:0] tid; + VX_priority_encoder #( + .N(`NUM_THREADS) + ) sel_src ( + .data_in (decode_if.thread_mask), + .data_out (tid), + `UNUSED_PIN (valid_out) + ); + + wire stall = schedule_delay || ~gpr_read_if.ready; + wire flush = stall; // clear output on stall + + VX_generic_register #( + .N(1 + `ISTAG_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + `NR_BITS + 32 + 1 + 1 + `EX_BITS + `OP_BITS + 1 + `FRM_BITS + `NT_BITS) + ) issue_reg ( + .clk (clk), + .reset (reset), + .stall (stall), + .flush (flush), + .in ({decode_if.valid, issue_tag, 
decode_if.wid, decode_if.thread_mask, decode_if.curr_PC, decode_if.rd, decode_if.rs1, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.ex_type, decode_if.ex_op, decode_if.wb, decode_if.frm, tid}), + .out ({issue_if.valid, issue_if.issue_tag, issue_if.wid, issue_if.thread_mask, issue_if.curr_PC, issue_if.rd, issue_if.rs1, issue_if.imm, issue_if.rs1_is_PC, issue_if.rs2_is_imm, issue_if.ex_type, issue_if.ex_op, issue_if.wb, issue_if.frm, issue_if.tid}) + ); + + assign decode_if.ready = issue_if.ready; + assign issue_if.ready = ~stall; - VX_generic_register #( - .N(1 + `ISTAG_BITS + `NW_BITS + 32 + `NUM_THREADS + `ALU_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + 32 + 32) - ) alu_reg ( - .clk (clk), - .reset (reset), - .stall (~alu_req_if.ready), - .flush (stall && alu_req_if.ready), - .in ({alu_req_tmp_if.valid, alu_req_tmp_if.issue_tag, alu_req_tmp_if.warp_num, alu_req_tmp_if.curr_PC, alu_req_tmp_if.thread_mask, alu_req_tmp_if.alu_op, alu_req_tmp_if.rs1_data, alu_req_tmp_if.rs2_data, alu_req_tmp_if.offset, alu_req_tmp_if.next_PC}), - .out ({alu_req_if.valid, alu_req_if.issue_tag, alu_req_if.warp_num, alu_req_if.curr_PC, alu_req_if.thread_mask, alu_req_if.alu_op, alu_req_if.rs1_data, alu_req_if.rs2_data, alu_req_if.offset, alu_req_if.next_PC}) - ); - - VX_generic_register #( - .N(1 + `ISTAG_BITS + `NW_BITS + 32 + `NUM_THREADS + 1 + `BYTEEN_BITS + (`NUM_THREADS * 32) + 32 + (`NUM_THREADS * 32) + `NR_BITS + 1) - ) lsu_reg ( - .clk (clk), - .reset (reset), - .stall (~lsu_req_if.ready), - .flush (stall && lsu_req_if.ready), - .in ({lsu_req_tmp_if.valid, lsu_req_tmp_if.issue_tag, lsu_req_tmp_if.warp_num, lsu_req_tmp_if.curr_PC, lsu_req_tmp_if.thread_mask, lsu_req_tmp_if.rw, lsu_req_tmp_if.byteen, lsu_req_tmp_if.base_addr, lsu_req_tmp_if.offset, lsu_req_tmp_if.store_data, lsu_req_tmp_if.rd, lsu_req_tmp_if.wb}), - .out ({lsu_req_if.valid, lsu_req_if.issue_tag, lsu_req_if.warp_num, lsu_req_if.curr_PC, lsu_req_if.thread_mask, lsu_req_if.rw, 
lsu_req_if.byteen, lsu_req_if.base_addr, lsu_req_if.offset, lsu_req_if.store_data, lsu_req_if.rd, lsu_req_if.wb}) - ); - - VX_generic_register #( - .N(1 + `ISTAG_BITS + `NW_BITS + 32 + `NUM_THREADS + `CSR_BITS + `CSR_ADDR_BITS + 32 + 1) - ) csr_reg ( - .clk (clk), - .reset (reset), - .stall (~csr_req_if.ready), - .flush (stall && csr_req_if.ready), - .in ({csr_req_tmp_if.valid, csr_req_tmp_if.issue_tag, csr_req_tmp_if.warp_num, csr_req_tmp_if.curr_PC, csr_req_tmp_if.thread_mask, csr_req_tmp_if.csr_op, csr_req_tmp_if.csr_addr, csr_req_tmp_if.csr_mask, csr_req_tmp_if.is_io}), - .out ({csr_req_if.valid, csr_req_if.issue_tag, csr_req_if.warp_num, csr_req_if.curr_PC, csr_req_if.thread_mask, csr_req_if.csr_op, csr_req_if.csr_addr, csr_req_if.csr_mask, csr_req_if.is_io}) - ); - -`ifdef EXT_M_ENABLE - VX_generic_register #( - .N(1 + `ISTAG_BITS + `NW_BITS + 32 + `NUM_THREADS + `MUL_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32)) - ) mul_reg ( - .clk (clk), - .reset (reset), - .stall (~mul_req_if.ready), - .flush (stall && mul_req_if.ready), - .in ({mul_req_tmp_if.valid, mul_req_tmp_if.issue_tag, mul_req_tmp_if.warp_num, mul_req_tmp_if.curr_PC, mul_req_tmp_if.thread_mask, mul_req_tmp_if.mul_op, mul_req_tmp_if.rs1_data, mul_req_tmp_if.rs2_data}), - .out ({mul_req_if.valid, mul_req_if.issue_tag, mul_req_if.warp_num, mul_req_if.curr_PC, mul_req_if.thread_mask, mul_req_if.mul_op, mul_req_if.rs1_data, mul_req_if.rs2_data}) - ); -`endif - -`ifdef EXT_F_ENABLE - VX_generic_register #( - .N(1 + `ISTAG_BITS + `NW_BITS + 32 + `NUM_THREADS + `FPU_BITS + `FRM_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + (`NUM_THREADS * 32)) - ) fpu_reg ( - .clk (clk), - .reset (reset), - .stall (~fpu_req_if.ready), - .flush (stall && fpu_req_if.ready), - .in ({fpu_req_tmp_if.valid, fpu_req_tmp_if.issue_tag, fpu_req_tmp_if.warp_num, fpu_req_tmp_if.curr_PC, fpu_req_tmp_if.thread_mask, fpu_req_tmp_if.fpu_op, fpu_req_tmp_if.frm, fpu_req_tmp_if.rs1_data, fpu_req_tmp_if.rs2_data, 
fpu_req_tmp_if.rs3_data}), - .out ({fpu_req_if.valid, fpu_req_if.issue_tag, fpu_req_if.warp_num, fpu_req_if.curr_PC, fpu_req_if.thread_mask, fpu_req_if.fpu_op, fpu_req_if.frm, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}) - ); -`endif - - VX_generic_register #( - .N(1 + `ISTAG_BITS + `NW_BITS + 32 + `NUM_THREADS + `GPU_BITS + (`NUM_THREADS * 32) + 32 + 32) - ) gpu_reg ( - .clk (clk), - .reset (reset), - .stall (~gpu_req_if.ready), - .flush (stall && gpu_req_if.ready), - .in ({gpu_req_tmp_if.valid, gpu_req_tmp_if.issue_tag, gpu_req_tmp_if.warp_num, gpu_req_tmp_if.curr_PC, gpu_req_tmp_if.thread_mask, gpu_req_tmp_if.gpu_op, gpu_req_tmp_if.rs1_data, gpu_req_tmp_if.rs2_data, gpu_req_tmp_if.next_PC}), - .out ({gpu_req_if.valid, gpu_req_if.issue_tag, gpu_req_if.warp_num, gpu_req_if.curr_PC, gpu_req_if.thread_mask, gpu_req_if.gpu_op, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.next_PC}) + VX_issue_demux issue_demux ( + .issue_if (issue_if), + .alu_req_if (alu_req_if), + .bru_req_if (bru_req_if), + .lsu_req_if (lsu_req_if), + .csr_req_if (csr_req_if), + .mul_req_if (mul_req_if), + .fpu_req_if (fpu_req_if), + .gpu_req_if (gpu_req_if) ); `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin if (alu_req_if.valid && alu_req_if.ready) begin - $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=ALU, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h, offset=%0h, next_PC=%0h", $time, CORE_ID, alu_req_if.warp_num, alu_req_if.curr_PC, alu_req_if.issue_tag, alu_req_if.thread_mask, alu_req_if.rs1_data, alu_req_if.rs2_data, alu_req_if.offset, alu_req_if.next_PC); + $display("%t: Core%0d-issue: wid=%0d, PC=%0h, ex=ALU, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, alu_req_if.wid, alu_req_if.curr_PC, alu_req_if.issue_tag, alu_req_if.thread_mask, alu_req_if.rs1_data, alu_req_if.rs2_data); + end + if (bru_req_if.valid && bru_req_if.ready) begin + $display("%t: Core%0d-issue: wid=%0d, PC=%0h, ex=BRU, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h, 
offset=%0h", $time, CORE_ID, bru_req_if.wid, bru_req_if.curr_PC, bru_req_if.issue_tag, bru_req_if.thread_mask, bru_req_if.rs1_data, bru_req_if.rs2_data, bru_req_if.offset); end if (lsu_req_if.valid && lsu_req_if.ready) begin - $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=LSU, istag=%0d, tmask=%b, rw=%b, byteen=%b, baddr=%0h, offset=%0h, data=%0h", $time, CORE_ID, lsu_req_if.warp_num, lsu_req_if.curr_PC, lsu_req_if.issue_tag, lsu_req_if.thread_mask, lsu_req_if.rw, lsu_req_if.byteen, lsu_req_if.base_addr, lsu_req_if.offset, lsu_req_if.store_data); + $display("%t: Core%0d-issue: wid=%0d, PC=%0h, ex=LSU, istag=%0d, tmask=%b, rw=%b, byteen=%b, baddr=%0h, offset=%0h, data=%0h", $time, CORE_ID, lsu_req_if.wid, lsu_req_if.curr_PC, lsu_req_if.issue_tag, lsu_req_if.thread_mask, lsu_req_if.rw, lsu_req_if.byteen, lsu_req_if.base_addr, lsu_req_if.offset, lsu_req_if.store_data); end if (csr_req_if.valid && csr_req_if.ready) begin - $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=CSR, istag=%0d, tmask=%b, addr=%0h, mask=%0h", $time, CORE_ID, csr_req_if.warp_num, csr_req_if.curr_PC, csr_req_if.issue_tag, csr_req_if.thread_mask, csr_req_if.csr_addr, csr_req_if.csr_mask); + $display("%t: Core%0d-issue: wid=%0d, PC=%0h, ex=CSR, istag=%0d, tmask=%b, addr=%0h, mask=%0h", $time, CORE_ID, csr_req_if.wid, csr_req_if.curr_PC, csr_req_if.issue_tag, csr_req_if.thread_mask, csr_req_if.csr_addr, csr_req_if.csr_mask); end if (mul_req_if.valid && mul_req_if.ready) begin - $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=MUL, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, mul_req_if.warp_num, mul_req_if.curr_PC, mul_req_if.issue_tag, mul_req_if.thread_mask, mul_req_if.rs1_data, mul_req_if.rs2_data); + $display("%t: Core%0d-issue: wid=%0d, PC=%0h, ex=MUL, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, mul_req_if.wid, mul_req_if.curr_PC, mul_req_if.issue_tag, mul_req_if.thread_mask, mul_req_if.rs1_data, mul_req_if.rs2_data); end if (fpu_req_if.valid && 
fpu_req_if.ready) begin - $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=FPU, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h, rs3_data=%0h", $time, CORE_ID, fpu_req_if.warp_num, fpu_req_if.curr_PC, fpu_req_if.issue_tag, fpu_req_if.thread_mask, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data); + $display("%t: Core%0d-issue: wid=%0d, PC=%0h, ex=FPU, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h, rs3_data=%0h", $time, CORE_ID, fpu_req_if.wid, fpu_req_if.curr_PC, fpu_req_if.issue_tag, fpu_req_if.thread_mask, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data); end if (gpu_req_if.valid && gpu_req_if.ready) begin - $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=GPU, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, gpu_req_if.warp_num, gpu_req_if.curr_PC, gpu_req_if.issue_tag, gpu_req_if.thread_mask, gpu_req_if.rs1_data, gpu_req_if.rs2_data); + $display("%t: Core%0d-issue: wid=%0d, PC=%0h, ex=GPU, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, gpu_req_if.wid, gpu_req_if.curr_PC, gpu_req_if.issue_tag, gpu_req_if.thread_mask, gpu_req_if.rs1_data, gpu_req_if.rs2_data); end end `endif diff --git a/hw/rtl/VX_issue_demux.v b/hw/rtl/VX_issue_demux.v index 44c4433b..16b0dd85 100644 --- a/hw/rtl/VX_issue_demux.v +++ b/hw/rtl/VX_issue_demux.v @@ -2,12 +2,11 @@ module VX_issue_demux ( // inputs - VX_decode_if decode_if, - VX_gpr_read_if gpr_read_if, - input wire [`ISTAG_BITS-1:0] issue_tag, - + VX_issue_if issue_if, + // outputs VX_alu_req_if alu_req_if, + VX_bru_req_if bru_req_if, VX_lsu_req_if lsu_req_if, VX_csr_req_if csr_req_if, VX_mul_req_if mul_req_if, @@ -15,74 +14,89 @@ module VX_issue_demux ( VX_gpu_req_if gpu_req_if ); // ALU unit - assign alu_req_if.valid = decode_if.valid && (decode_if.ex_type == `EX_ALU); - assign alu_req_if.thread_mask = decode_if.thread_mask; - assign alu_req_if.issue_tag = issue_tag; - assign alu_req_if.warp_num = decode_if.warp_num; - assign alu_req_if.curr_PC = decode_if.curr_PC; 
- assign alu_req_if.alu_op = `ALU_OP(decode_if.ex_op); - assign alu_req_if.rs1_data = decode_if.rs1_is_PC ? {`NUM_THREADS{decode_if.curr_PC}} : gpr_read_if.rs1_data; - assign alu_req_if.rs2_data = decode_if.rs2_is_imm ? {`NUM_THREADS{decode_if.imm}} : gpr_read_if.rs2_data; - assign alu_req_if.offset = decode_if.imm; - assign alu_req_if.next_PC = decode_if.next_PC; + assign alu_req_if.valid = issue_if.valid && (issue_if.ex_type == `EX_ALU); + assign alu_req_if.issue_tag = issue_if.issue_tag; + assign alu_req_if.wid = issue_if.wid; + assign alu_req_if.thread_mask = issue_if.thread_mask; + assign alu_req_if.curr_PC = issue_if.curr_PC; + assign alu_req_if.op = `ALU_OP(issue_if.ex_op); + assign alu_req_if.rs1_is_PC = issue_if.rs1_is_PC; + assign alu_req_if.rs2_is_imm = issue_if.rs2_is_imm; + assign alu_req_if.imm = issue_if.imm; + assign alu_req_if.rs1_data = issue_if.rs1_data; + assign alu_req_if.rs2_data = issue_if.rs2_data; + + // BRU unit + assign bru_req_if.valid = issue_if.valid && (issue_if.ex_type == `EX_BRU); + assign bru_req_if.issue_tag = issue_if.issue_tag; + assign bru_req_if.wid = issue_if.wid; + assign bru_req_if.thread_mask = issue_if.thread_mask; + assign bru_req_if.curr_PC = issue_if.curr_PC; + assign bru_req_if.op = `BRU_OP(issue_if.ex_op); + assign bru_req_if.rs1_is_PC = issue_if.rs1_is_PC; + assign bru_req_if.rs1_data = issue_if.rs1_data[issue_if.tid]; + assign bru_req_if.rs2_data = issue_if.rs2_data[issue_if.tid]; + assign bru_req_if.offset = issue_if.imm; // LSU unit - assign lsu_req_if.valid = decode_if.valid && (decode_if.ex_type == `EX_LSU); - assign lsu_req_if.thread_mask = decode_if.thread_mask; - assign lsu_req_if.issue_tag = issue_tag; - assign lsu_req_if.warp_num = decode_if.warp_num; - assign lsu_req_if.curr_PC = decode_if.curr_PC; - assign lsu_req_if.rw = `LSU_RW(decode_if.ex_op); - assign lsu_req_if.byteen = `LSU_BE(decode_if.ex_op); - assign lsu_req_if.base_addr = gpr_read_if.rs1_data; - assign lsu_req_if.store_data = 
gpr_read_if.rs2_data; - assign lsu_req_if.offset = decode_if.imm; - assign lsu_req_if.rd = decode_if.rd; - assign lsu_req_if.wb = decode_if.wb; + assign lsu_req_if.valid = issue_if.valid && (issue_if.ex_type == `EX_LSU); + assign lsu_req_if.issue_tag = issue_if.issue_tag; + assign lsu_req_if.wid = issue_if.wid; + assign lsu_req_if.thread_mask = issue_if.thread_mask; + assign lsu_req_if.curr_PC = issue_if.curr_PC; + assign lsu_req_if.rw = `LSU_RW(issue_if.ex_op); + assign lsu_req_if.byteen = `LSU_BE(issue_if.ex_op); + assign lsu_req_if.base_addr = issue_if.rs1_data; + assign lsu_req_if.store_data = issue_if.rs2_data; + assign lsu_req_if.offset = issue_if.imm; + assign lsu_req_if.rd = issue_if.rd; + assign lsu_req_if.wb = issue_if.wb; // CSR unit - assign csr_req_if.valid = decode_if.valid && (decode_if.ex_type == `EX_CSR); - assign csr_req_if.issue_tag = issue_tag; - assign csr_req_if.warp_num = decode_if.warp_num; - assign csr_req_if.curr_PC = decode_if.curr_PC; - assign csr_req_if.csr_op = `CSR_OP(decode_if.ex_op); - assign csr_req_if.csr_addr = decode_if.imm[`CSR_ADDR_BITS-1:0]; - assign csr_req_if.csr_mask = decode_if.rs2_is_imm ? 32'(decode_if.rs1) : gpr_read_if.rs1_data[0]; + assign csr_req_if.valid = issue_if.valid && (issue_if.ex_type == `EX_CSR); + assign csr_req_if.issue_tag = issue_if.issue_tag; + assign csr_req_if.wid = issue_if.wid; + assign csr_req_if.thread_mask = issue_if.thread_mask; + assign csr_req_if.curr_PC = issue_if.curr_PC; + assign csr_req_if.op = `CSR_OP(issue_if.ex_op); + assign csr_req_if.csr_addr = issue_if.imm[`CSR_ADDR_BITS-1:0]; + assign csr_req_if.csr_mask = issue_if.rs2_is_imm ? 
32'(issue_if.rs1) : issue_if.rs1_data[0]; assign csr_req_if.is_io = 1'b0; // MUL unit `ifdef EXT_M_ENABLE - assign mul_req_if.valid = decode_if.valid && (decode_if.ex_type == `EX_MUL); - assign mul_req_if.issue_tag = issue_tag; - assign mul_req_if.warp_num = decode_if.warp_num; - assign mul_req_if.curr_PC = decode_if.curr_PC; - assign mul_req_if.mul_op = `MUL_OP(decode_if.ex_op); - assign mul_req_if.rs1_data = gpr_read_if.rs1_data; - assign mul_req_if.rs2_data = gpr_read_if.rs2_data; + assign mul_req_if.valid = issue_if.valid && (issue_if.ex_type == `EX_MUL); + assign mul_req_if.issue_tag = issue_if.issue_tag; + assign mul_req_if.wid = issue_if.wid; + assign mul_req_if.thread_mask = issue_if.thread_mask; + assign mul_req_if.curr_PC = issue_if.curr_PC; + assign mul_req_if.op = `MUL_OP(issue_if.ex_op); + assign mul_req_if.rs1_data = issue_if.rs1_data; + assign mul_req_if.rs2_data = issue_if.rs2_data; `endif // FPU unit `ifdef EXT_F_ENABLE - assign fpu_req_if.valid = decode_if.valid && (decode_if.ex_type == `EX_FPU); - assign fpu_req_if.issue_tag = issue_tag; - assign fpu_req_if.warp_num = decode_if.warp_num; - assign fpu_req_if.curr_PC = decode_if.curr_PC; - assign fpu_req_if.fpu_op = `FPU_OP(decode_if.ex_op); - assign fpu_req_if.frm = decode_if.frm; - assign fpu_req_if.rs1_data = gpr_read_if.rs1_data; - assign fpu_req_if.rs2_data = gpr_read_if.rs2_data; - assign fpu_req_if.rs3_data = gpr_read_if.rs3_data; + assign fpu_req_if.valid = issue_if.valid && (issue_if.ex_type == `EX_FPU); + assign fpu_req_if.issue_tag = issue_if.issue_tag; + assign fpu_req_if.wid = issue_if.wid; + assign fpu_req_if.thread_mask = issue_if.thread_mask; + assign fpu_req_if.curr_PC = issue_if.curr_PC; + assign fpu_req_if.op = `FPU_OP(issue_if.ex_op); + assign fpu_req_if.frm = issue_if.frm; + assign fpu_req_if.rs1_data = issue_if.rs1_data; + assign fpu_req_if.rs2_data = issue_if.rs2_data; + assign fpu_req_if.rs3_data = issue_if.rs3_data; `endif // GPU unit - assign gpu_req_if.valid = 
decode_if.valid && (decode_if.ex_type == `EX_GPU); - assign gpu_req_if.thread_mask = decode_if.thread_mask; - assign gpu_req_if.issue_tag = issue_tag; - assign gpu_req_if.warp_num = decode_if.warp_num; - assign gpu_req_if.curr_PC = decode_if.curr_PC; - assign gpu_req_if.gpu_op = `GPU_OP(decode_if.ex_op); - assign gpu_req_if.rs1_data = gpr_read_if.rs1_data; - assign gpu_req_if.rs2_data = gpr_read_if.rs2_data[0]; - assign gpu_req_if.next_PC = decode_if.next_PC; + assign gpu_req_if.valid = issue_if.valid && (issue_if.ex_type == `EX_GPU); + assign gpu_req_if.issue_tag = issue_if.issue_tag; + assign gpu_req_if.wid = issue_if.wid; + assign gpu_req_if.thread_mask = issue_if.thread_mask; + assign gpu_req_if.curr_PC = issue_if.curr_PC; + assign gpu_req_if.op = `GPU_OP(issue_if.ex_op); + assign gpu_req_if.rs1_data = issue_if.rs1_data; + assign gpu_req_if.rs2_data = issue_if.rs2_data[0]; endmodule \ No newline at end of file diff --git a/hw/rtl/VX_lsu_unit.v b/hw/rtl/VX_lsu_unit.v index 87acc21b..9b7ecc8f 100644 --- a/hw/rtl/VX_lsu_unit.v +++ b/hw/rtl/VX_lsu_unit.v @@ -19,24 +19,24 @@ module VX_lsu_unit #( VX_exu_to_cmt_if lsu_commit_if ); - wire use_valid; - wire [`NUM_THREADS-1:0] use_thread_mask; - wire use_req_rw; - wire [`NUM_THREADS-1:0][29:0] use_req_addr; - wire [`NUM_THREADS-1:0][1:0] use_req_offset; - wire [`NUM_THREADS-1:0][3:0] use_req_byteen; - wire [`NUM_THREADS-1:0][31:0] use_req_data; - wire [1:0] use_req_sext; - wire [`NR_BITS-1:0] use_rd; - wire [`NW_BITS-1:0] use_warp_num; - wire [`ISTAG_BITS-1:0] use_issue_tag; - wire use_wb; - wire [31:0] use_pc; + wire valid_in; + wire ready_in; - genvar i; + wire [`NUM_THREADS-1:0] req_thread_mask; + wire req_rw; + wire [`NUM_THREADS-1:0][29:0] req_addr; + wire [`NUM_THREADS-1:0][1:0] req_offset; + wire [`NUM_THREADS-1:0][3:0] req_byteen; + wire [`NUM_THREADS-1:0][31:0] req_data; + wire [1:0] req_sext; + wire [`NR_BITS-1:0] req_rd; + wire [`NW_BITS-1:0] req_wid; + wire [`ISTAG_BITS-1:0] req_issue_tag; + wire req_wb; + 
wire [31:0] req_pc; wire [`NUM_THREADS-1:0][31:0] full_address; - for (i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < `NUM_THREADS; i++) begin assign full_address[i] = lsu_req_if.base_addr[i] + lsu_req_if.offset; end @@ -63,38 +63,39 @@ module VX_lsu_unit #( endcase end - for (i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < `NUM_THREADS; i++) begin assign mem_req_addr[i] = full_address[i][31:2]; assign mem_req_offset[i] = full_address[i][1:0]; assign mem_req_byteen[i] = wmask << full_address[i][1:0]; assign mem_req_data[i] = lsu_req_if.store_data[i] << {mem_req_offset[i], 3'b0}; - end - - wire stall_in = ~dcache_req_if.ready && use_valid; - - // Can accept new request? - assign lsu_req_if.ready = ~stall_in; + end `IGNORE_WARNINGS_BEGIN - wire [`NUM_THREADS-1:0][31:0] use_address; + wire [`NUM_THREADS-1:0][31:0] req_address; `IGNORE_WARNINGS_END - VX_generic_register #( - .N(1 + `NW_BITS + `NUM_THREADS + `ISTAG_BITS + (`NUM_THREADS * 32) + 2 + 1 + (`NUM_THREADS * (30 + 2 + 4 + 32)) + `NR_BITS + 1 + 32) - ) lsu_req_reg ( - .clk (clk), - .reset (reset), - .stall (stall_in), - .flush (0), - .in ({lsu_req_if.valid, lsu_req_if.warp_num, lsu_req_if.thread_mask, lsu_req_if.issue_tag, full_address, mem_req_sext, lsu_req_if.rw, mem_req_addr, mem_req_offset, mem_req_byteen, mem_req_data, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.curr_PC}), - .out ({use_valid, use_warp_num, use_thread_mask, use_issue_tag, use_address, use_req_sext, use_req_rw, use_req_addr, use_req_offset, use_req_byteen, use_req_data, use_rd, use_wb, use_pc}) + // use a skid buffer because the dcache's ready signal is combinational + // use buffer size of two for stall-free execution + VX_elastic_buffer #( + .DATAW (`NW_BITS + `NUM_THREADS + `ISTAG_BITS + (`NUM_THREADS * 32) + 2 + 1 + (`NUM_THREADS * (30 + 2 + 4 + 32)) + `NR_BITS + 1 + 32), + .SIZE (2) + ) input_buffer ( + .clk (clk), + .reset (reset), + .valid_in (lsu_req_if.valid), + .ready_in (lsu_req_if.ready), + .data_in 
({lsu_req_if.wid, lsu_req_if.thread_mask, lsu_req_if.issue_tag, full_address, mem_req_sext, lsu_req_if.rw, mem_req_addr, mem_req_offset, mem_req_byteen, mem_req_data, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.curr_PC}), + .data_out ({req_wid, req_thread_mask, req_issue_tag, req_address, req_sext, req_rw, req_addr, req_offset, req_byteen, req_data, req_rd, req_wb, req_pc}), + .ready_out (ready_in), + .valid_out (valid_in) ); - reg [`NUM_THREADS-1:0] mem_rsp_mask_buf [`ISSUEQ_SIZE-1:0]; + reg [`ISSUEQ_SIZE-1:0][`NUM_THREADS-1:0] mem_rsp_mask_buf; + reg [`ISSUEQ_SIZE-1:0][`NUM_THREADS-1:0][31:0] mem_rsp_data_prev_buf; + reg [`NUM_THREADS-1:0][1:0] mem_rsp_offset_buf [`ISSUEQ_SIZE-1:0]; - reg [1:0] mem_rsp_sext_buf [`ISSUEQ_SIZE-1:0]; - reg [`NUM_THREADS-1:0][31:0] mem_rsp_data_all_buf [`ISSUEQ_SIZE-1:0]; - reg [`NW_BITS-1:0] mem_rsp_warp_num_buf [`ISSUEQ_SIZE-1:0]; + reg [1:0] mem_rsp_sext_buf [`ISSUEQ_SIZE-1:0]; + reg [`NW_BITS-1:0] mem_rsp_wid_buf [`ISSUEQ_SIZE-1:0]; reg [31:0] mem_rsp_curr_PC_buf [`ISSUEQ_SIZE-1:0]; reg [`NR_BITS-1:0] mem_rsp_rd_buf [`ISSUEQ_SIZE-1:0]; @@ -105,47 +106,56 @@ module VX_lsu_unit #( wire [`NUM_THREADS-1:0] mem_rsp_mask = mem_rsp_mask_buf [rsp_issue_tag]; wire [`NUM_THREADS-1:0][1:0] mem_rsp_offset = mem_rsp_offset_buf [rsp_issue_tag]; wire [1:0] mem_rsp_sext = mem_rsp_sext_buf [rsp_issue_tag]; - wire [`NUM_THREADS-1:0][31:0] mem_rsp_data_all = mem_rsp_data_all_buf [rsp_issue_tag]; - wire [`NW_BITS-1:0] mem_rsp_warp_num = mem_rsp_warp_num_buf [rsp_issue_tag]; + wire [`NUM_THREADS-1:0][31:0] mem_rsp_data_prev= mem_rsp_data_prev_buf [rsp_issue_tag]; + wire [`NW_BITS-1:0] mem_rsp_wid = mem_rsp_wid_buf [rsp_issue_tag]; wire [31:0] mem_rsp_curr_PC = mem_rsp_curr_PC_buf [rsp_issue_tag]; wire [`NR_BITS-1:0] mem_rsp_rd = mem_rsp_rd_buf [rsp_issue_tag]; - wire [`NUM_THREADS-1:0] mem_rsp_mask_n = mem_rsp_mask & ~dcache_rsp_if.valid; - wire dcache_req_fire = (| dcache_req_if.valid) && dcache_req_if.ready; wire dcache_rsp_fire = (| 
dcache_rsp_if.valid) && dcache_rsp_if.ready; + wire [`NUM_THREADS-1:0] mem_rsp_mask_n = mem_rsp_mask & ~dcache_rsp_if.valid; + always @(posedge clk) begin - if (dcache_req_fire && (0 == use_req_rw)) begin - mem_rsp_mask_buf [use_issue_tag] <= use_thread_mask; - mem_rsp_offset_buf [use_issue_tag] <= use_req_offset; - mem_rsp_sext_buf [use_issue_tag] <= use_req_sext; - mem_rsp_data_all_buf [use_issue_tag] <= 0; - mem_rsp_warp_num_buf [use_issue_tag] <= use_warp_num; - mem_rsp_curr_PC_buf [use_issue_tag] <= use_pc; - mem_rsp_rd_buf [use_issue_tag] <= use_rd; + if (dcache_req_fire && (0 == req_rw)) begin + mem_rsp_mask_buf [req_issue_tag] <= req_thread_mask; + mem_rsp_data_prev_buf [req_issue_tag] <= 0; end if (dcache_rsp_fire) begin - mem_rsp_mask_buf [rsp_issue_tag] <= mem_rsp_mask_n; - mem_rsp_data_all_buf [rsp_issue_tag] <= mem_rsp_data_all | mem_rsp_data_curr; + mem_rsp_mask_buf [rsp_issue_tag] <= mem_rsp_mask_n; + mem_rsp_data_prev_buf [rsp_issue_tag] <= mem_rsp_data_curr | mem_rsp_data_prev; end end + always @(posedge clk) begin + if (dcache_req_fire && (0 == req_rw)) begin + mem_rsp_offset_buf [req_issue_tag] <= req_offset; + mem_rsp_sext_buf [req_issue_tag] <= req_sext; + mem_rsp_wid_buf [req_issue_tag] <= req_wid; + mem_rsp_curr_PC_buf [req_issue_tag] <= req_pc; + mem_rsp_rd_buf [req_issue_tag] <= req_rd; + end + end + + wire stall_in; + // Core Request - assign dcache_req_if.valid = {`NUM_THREADS{use_valid}} & use_thread_mask; - assign dcache_req_if.rw = {`NUM_THREADS{use_req_rw}}; - assign dcache_req_if.byteen = use_req_byteen; - assign dcache_req_if.addr = use_req_addr; - assign dcache_req_if.data = use_req_data; + assign dcache_req_if.valid = {`NUM_THREADS{valid_in && ~stall_in}} & req_thread_mask; + assign dcache_req_if.rw = {`NUM_THREADS{req_rw}}; + assign dcache_req_if.byteen = req_byteen; + assign dcache_req_if.addr = req_addr; + assign dcache_req_if.data = req_data; + + assign ready_in = dcache_req_if.ready && ~stall_in; `ifdef DBG_CORE_REQ_INFO - 
assign dcache_req_if.tag = {use_pc, use_wb, use_rd, use_warp_num, use_issue_tag}; + assign dcache_req_if.tag = {req_pc, req_wb, req_rd, req_wid, req_issue_tag}; `else - assign dcache_req_if.tag = use_issue_tag; + assign dcache_req_if.tag = req_issue_tag; `endif // Core Response - for (i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < `NUM_THREADS; i++) begin wire [31:0] rsp_data_shifted = dcache_rsp_if.data[i] >> {mem_rsp_offset[i], 3'b0}; always @(*) begin case (mem_rsp_sext) @@ -156,46 +166,60 @@ module VX_lsu_unit #( end end - wire is_store_rsp = dcache_req_fire && use_req_rw; - wire is_load_rsp = (| dcache_rsp_if.valid) && (0 == mem_rsp_mask_n); + reg is_load_rsp; + reg [`NUM_THREADS-1:0][31:0] load_data; + reg [`ISTAG_BITS-1:0] rsp_issue_tag_r; - assign lsu_commit_if.valid = is_load_rsp || is_store_rsp; - assign lsu_commit_if.issue_tag = is_store_rsp ? use_issue_tag : rsp_issue_tag; - assign lsu_commit_if.data = mem_rsp_data_curr | mem_rsp_data_all; + always @(posedge clk) begin + if (reset) begin + is_load_rsp <= 0; + end else begin + is_load_rsp <= dcache_rsp_fire && (0 == mem_rsp_mask_n); + load_data <= mem_rsp_data_curr | mem_rsp_data_prev; + rsp_issue_tag_r <= rsp_issue_tag; + end + end + + wire is_store_req = dcache_req_fire && req_rw; + assign stall_in = is_load_rsp && valid_in && req_rw; // LOAD has priority + + assign lsu_commit_if.valid = is_load_rsp || is_store_req; + assign lsu_commit_if.issue_tag = is_load_rsp ? rsp_issue_tag_r : req_issue_tag; + assign lsu_commit_if.data = load_data; // Can accept new cache response? 
- assign dcache_rsp_if.ready = lsu_commit_if.ready && ~is_store_rsp; // STORE has priority + assign dcache_rsp_if.ready = 1'b1; // scope registration `SCOPE_ASSIGN (scope_dcache_req_valid, dcache_req_if.valid); - `SCOPE_ASSIGN (scope_dcache_req_addr, use_address); + `SCOPE_ASSIGN (scope_dcache_req_addr, req_address); `SCOPE_ASSIGN (scope_dcache_req_rw, dcache_req_if.rw ); `SCOPE_ASSIGN (scope_dcache_req_byteen,dcache_req_if.byteen); `SCOPE_ASSIGN (scope_dcache_req_data, dcache_req_if.data); `SCOPE_ASSIGN (scope_dcache_req_tag, dcache_req_if.tag); `SCOPE_ASSIGN (scope_dcache_req_ready, dcache_req_if.ready); - `SCOPE_ASSIGN (scope_dcache_req_warp_num, use_warp_num); - `SCOPE_ASSIGN (scope_dcache_req_curr_PC, use_pc); + `SCOPE_ASSIGN (scope_dcache_req_wid, req_wid); + `SCOPE_ASSIGN (scope_dcache_req_curr_PC, req_pc); `SCOPE_ASSIGN (scope_dcache_rsp_valid, dcache_rsp_if.valid); `SCOPE_ASSIGN (scope_dcache_rsp_data, dcache_rsp_if.data); `SCOPE_ASSIGN (scope_dcache_rsp_tag, dcache_rsp_if.tag); `SCOPE_ASSIGN (scope_dcache_rsp_ready, dcache_rsp_if.ready); - `UNUSED_VAR (mem_rsp_warp_num) + `UNUSED_VAR (mem_rsp_wid) `UNUSED_VAR (mem_rsp_curr_PC) `UNUSED_VAR (mem_rsp_rd) - `UNUSED_VAR (use_wb) + `UNUSED_VAR (req_wb) `ifdef DBG_PRINT_CORE_DCACHE always @(posedge clk) begin if ((| dcache_req_if.valid) && dcache_req_if.ready) begin - $display("%t: D$%0d req: warp=%0d, PC=%0h, tmask=%b, addr=%0h, tag=%0h, rd=%0d, rw=%0b, byteen=%0h, data=%0h", - $time, CORE_ID, use_warp_num, use_pc, dcache_req_if.valid, use_address, dcache_req_if.tag, use_rd, dcache_req_if.rw, dcache_req_if.byteen, dcache_req_if.data); + $display("%t: D$%0d req: wid=%0d, PC=%0h, tmask=%b, addr=%0h, tag=%0h, rd=%0d, rw=%0b, byteen=%0h, data=%0h", + $time, CORE_ID, req_wid, req_pc, dcache_req_if.valid, req_address, dcache_req_if.tag, req_rd, dcache_req_if.rw, dcache_req_if.byteen, dcache_req_if.data); end if ((| dcache_rsp_if.valid) && dcache_rsp_if.ready) begin - $display("%t: D$%0d rsp: valid=%b, warp=%0d, 
PC=%0h, tag=%0h, rd=%0d, data=%0h", - $time, CORE_ID, dcache_rsp_if.valid, mem_rsp_warp_num, mem_rsp_curr_PC, dcache_rsp_if.tag, mem_rsp_rd, dcache_rsp_if.data); + $display("%t: D$%0d rsp: valid=%b, wid=%0d, PC=%0h, tag=%0h, rd=%0d, data=%0h", + $time, CORE_ID, dcache_rsp_if.valid, mem_rsp_wid, mem_rsp_curr_PC, dcache_rsp_if.tag, mem_rsp_rd, dcache_rsp_if.data); end end `endif diff --git a/hw/rtl/VX_mem_arb.v b/hw/rtl/VX_mem_arb.v index edf6741c..f8c5ac37 100644 --- a/hw/rtl/VX_mem_arb.v +++ b/hw/rtl/VX_mem_arb.v @@ -83,15 +83,13 @@ module VX_mem_arb #( assign out_mem_req_data = in_mem_req_data [bus_req_sel]; assign out_mem_req_tag = {in_mem_req_tag [bus_req_sel], REQS_BITS'(bus_req_sel)}; - genvar i; - - for (i = 0; i < NUM_REQUESTS; i++) begin + for (genvar i = 0; i < NUM_REQUESTS; i++) begin assign in_mem_req_ready[i] = out_mem_req_ready && (bus_req_sel == REQS_BITS'(i)); end wire [REQS_BITS-1:0] bus_rsp_sel = out_mem_rsp_tag[REQS_BITS-1:0]; - for (i = 0; i < NUM_REQUESTS; i++) begin + for (genvar i = 0; i < NUM_REQUESTS; i++) begin assign in_mem_rsp_valid[i] = out_mem_rsp_valid && (bus_rsp_sel == REQS_BITS'(i)); assign in_mem_rsp_data[i] = out_mem_rsp_data; assign in_mem_rsp_tag[i] = out_mem_rsp_tag[REQS_BITS +: TAG_IN_WIDTH]; diff --git a/hw/rtl/VX_mul_unit.v b/hw/rtl/VX_mul_unit.v index 394ebe06..86d531f9 100644 --- a/hw/rtl/VX_mul_unit.v +++ b/hw/rtl/VX_mul_unit.v @@ -11,52 +11,36 @@ module VX_mul_unit #( // Outputs VX_exu_to_cmt_if alu_commit_if -); - - wire [`MUL_BITS-1:0] alu_op = alu_req_if.mul_op; - wire [`NUM_THREADS-1:0][31:0] alu_in1 = alu_req_if.rs1_data; - wire [`NUM_THREADS-1:0][31:0] alu_in2 = alu_req_if.rs2_data; +); + wire [`ISTAG_BITS-1:0] issue_tag; + wire [`MUL_BITS-1:0] alu_op; + wire [`NUM_THREADS-1:0][31:0] alu_in1, alu_in2; + wire valid_in, ready_in; + + // use a skid buffer due to MUL/DIV output arbitration adding realtime backpressure + VX_elastic_buffer #( + .DATAW (`ISTAG_BITS + `MUL_BITS + (2 * `NUM_THREADS * 32)), + .SIZE (0) + ) 
input_buffer ( + .clk (clk), + .reset (reset), + .valid_in (alu_req_if.valid), + .ready_in (alu_req_if.ready), + .data_in ({alu_req_if.issue_tag, alu_req_if.op, alu_req_if.rs1_data, alu_req_if.rs2_data}), + .data_out ({issue_tag, alu_op, alu_in1, alu_in2}), + .ready_out (ready_in), + .valid_out (valid_in) + ); - wire [`NUM_THREADS-1:0][31:0] mul_result, div_result; + wire [`NUM_THREADS-1:0][31:0] mul_result; + wire is_mulw = (alu_op == `MUL_MUL); + wire is_mulw_out; - wire stall_mul, stall_div; - - wire is_mul_mul = (alu_op == `MUL_MUL); - wire is_mul_mul_out; - - wire is_div_divu = (alu_op == `MUL_DIV || alu_op == `MUL_DIVU); - reg [`NUM_THREADS-1:0] is_div_divu_qual; - wire [`NUM_THREADS-1:0] is_div_divu_out; - - genvar i; - - for (i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < `NUM_THREADS; i++) begin wire [32:0] mul_in1 = {(alu_op != `MUL_MULHU) & alu_in1[i][31], alu_in1[i]}; wire [32:0] mul_in2 = {(alu_op != `MUL_MULHU && alu_op != `MUL_MULHSU) & alu_in2[i][31], alu_in2[i]}; - - reg [32:0] div_in1, div_in2; - - // handle divide by zero - always @(*) begin - is_div_divu_qual[i] = is_div_divu; - div_in1 = {(alu_op == `MUL_DIV || alu_op == `MUL_REM) & alu_in1[i][31], alu_in1[i]}; - div_in2 = {(alu_op == `MUL_DIV || alu_op == `MUL_REM) & alu_in2[i][31], alu_in2[i]}; - - if (0 == alu_in2[i]) begin - if (is_div_divu) begin - div_in1 = {1'b0, 32'hFFFFFFFF}; // quotient = (0xFFFFFFFF / 1) - div_in2 = 1; - end else begin - is_div_divu_qual[i] = 1; // remainder = (in1 / 1) - div_in2 = 1; - end - end - end - wire [63:0] mul_result_tmp; - wire [31:0] div_result_tmp; - wire [31:0] rem_result_tmp; VX_multiplier #( .WIDTHA(33), @@ -67,12 +51,71 @@ module VX_mul_unit #( ) multiplier ( .clk(clk), .reset(reset), - .clk_en(~stall_mul), + .clk_en(1'b1), .dataa(mul_in1), .datab(mul_in2), .result(mul_result_tmp) ); + assign mul_result[i] = is_mulw_out ? 
mul_result_tmp[31:0] : mul_result_tmp[63:32]; + end + + wire [`ISTAG_BITS-1:0] mul_issue_tag; + wire mul_valid_out; + + wire mul_fire = valid_in && ready_in && ~`IS_DIV_OP(alu_op); + + VX_shift_register #( + .DATAW(1 + `ISTAG_BITS + 1), + .DEPTH(`LATENCY_IMUL) + ) mul_shift_reg ( + .clk(clk), + .reset(reset), + .enable(1'b1), + .in({mul_fire, issue_tag, is_mulw}), + .out({mul_valid_out, mul_issue_tag, is_mulw_out}) + ); + + /////////////////////////////////////////////////////////////////////////// + + wire [`NUM_THREADS-1:0][31:0] div_result; + wire is_div = (alu_op == `MUL_DIV || alu_op == `MUL_DIVU); + wire is_signed_div = (alu_op == `MUL_DIV || alu_op == `MUL_REM); + reg [`NUM_THREADS-1:0] is_div_qual; + wire [`NUM_THREADS-1:0] is_div_out; + wire stall_div; + + for (genvar i = 0; i < `NUM_THREADS; i++) begin + + reg [31:0] div_in1_qual, div_in2_qual; + reg [32:0] div_in1, div_in2; + wire [31:0] div_result_tmp, rem_result_tmp; + + // handle divide by zero + always @(*) begin + if (~stall_div) begin + is_div_qual[i] = is_div; + div_in1_qual = alu_in1[i]; + div_in2_qual = alu_in2[i]; + if (0 == alu_in2[i]) begin + div_in2_qual = 1; + if (is_div) begin + div_in1_qual = 32'hFFFFFFFF; // quotient = (0xFFFFFFFF / 1) + end else begin + is_div_qual[i] = 1; // remainder = (in1 / 1) + end + end + end + end + + // latch divider inputs + always @(posedge clk) begin + if (~stall_div) begin + div_in1 <= {is_signed_div & alu_in1[i][31], div_in1_qual}; + div_in2 <= {is_signed_div & alu_in2[i][31], div_in2_qual}; + end + end + VX_divide #( .WIDTHN(33), .WIDTHD(33), @@ -90,49 +133,32 @@ module VX_mul_unit #( .quotient(div_result_tmp), .remainder(rem_result_tmp) ); - - assign mul_result[i] = is_mul_mul_out ? mul_result_tmp[31:0] : mul_result_tmp[63:32]; - assign div_result[i] = is_div_divu_out[i] ? div_result_tmp : rem_result_tmp; + + assign div_result[i] = is_div_out[i] ? 
div_result_tmp : rem_result_tmp; end - wire is_mul_fire = alu_req_if.valid && alu_req_if.ready && ~`IS_DIV_OP(alu_op); - wire is_div_fire = alu_req_if.valid && alu_req_if.ready && `IS_DIV_OP(alu_op); - - wire mul_valid_out; + wire [`ISTAG_BITS-1:0] div_issue_tag; wire div_valid_out; - wire [`ISTAG_BITS-1:0] mul_issue_tag; - wire [`ISTAG_BITS-1:0] div_issue_tag; - - VX_shift_register #( - .DATAW(1 + `ISTAG_BITS + 1), - .DEPTH(`LATENCY_IMUL) - ) mul_shift_reg ( - .clk(clk), - .reset(reset), - .enable(~stall_mul), - .in({is_mul_fire, alu_req_if.issue_tag, is_mul_mul}), - .out({mul_valid_out, mul_issue_tag, is_mul_mul_out}) - ); + wire div_fire = valid_in && ready_in && `IS_DIV_OP(alu_op); VX_shift_register #( .DATAW(1 + `ISTAG_BITS + `NUM_THREADS), - .DEPTH(`LATENCY_IDIV) + .DEPTH(`LATENCY_IDIV + 1) ) div_shift_reg ( .clk(clk), .reset(reset), .enable(~stall_div), - .in({is_div_fire, alu_req_if.issue_tag, is_div_divu_qual}), - .out({div_valid_out, div_issue_tag, is_div_divu_out}) + .in({div_fire, issue_tag, is_div_qual}), + .out({div_valid_out, div_issue_tag, is_div_out}) ); + + /////////////////////////////////////////////////////////////////////////// - wire stall_out = (~alu_commit_if.ready && alu_commit_if.valid); - assign stall_mul = stall_out; - assign stall_div = stall_out - || (mul_valid_out && div_valid_out); // arbitration prioritizes MUL + assign stall_div = mul_valid_out && div_valid_out; // arbitration prioritizes MUL // can accept new request? - assign alu_req_if.ready = ~(stall_mul || stall_div); + assign ready_in = ~stall_div; assign alu_commit_if.valid = mul_valid_out || div_valid_out; assign alu_commit_if.issue_tag = mul_valid_out ? 
mul_issue_tag : div_issue_tag; diff --git a/hw/rtl/VX_pipeline.v b/hw/rtl/VX_pipeline.v index efc66237..f7aa9fa3 100644 --- a/hw/rtl/VX_pipeline.v +++ b/hw/rtl/VX_pipeline.v @@ -107,6 +107,7 @@ module VX_pipeline #( VX_warp_ctl_if warp_ctl_if(); VX_ifetch_rsp_if ifetch_rsp_if(); VX_alu_req_if alu_req_if(); + VX_bru_req_if bru_req_if(); VX_lsu_req_if lsu_req_if(); VX_csr_req_if csr_req_if(); VX_mul_req_if mul_req_if(); @@ -117,6 +118,7 @@ module VX_pipeline #( VX_wstall_if wstall_if(); VX_join_if join_if(); VX_exu_to_cmt_if alu_commit_if(); + VX_exu_to_cmt_if bru_commit_if(); VX_exu_to_cmt_if lsu_commit_if(); VX_exu_to_cmt_if csr_commit_if(); VX_exu_to_cmt_if mul_commit_if(); @@ -157,9 +159,10 @@ module VX_pipeline #( .decode_if (decode_if), .writeback_if (writeback_if), - .cmt_to_issue_if (cmt_to_issue_if), + .cmt_to_issue_if(cmt_to_issue_if), .alu_req_if (alu_req_if), + .bru_req_if (bru_req_if), .lsu_req_if (lsu_req_if), .csr_req_if (csr_req_if), .mul_req_if (mul_req_if), @@ -183,6 +186,7 @@ module VX_pipeline #( .cmt_to_csr_if (cmt_to_csr_if), .alu_req_if (alu_req_if), + .bru_req_if (bru_req_if), .lsu_req_if (lsu_req_if), .csr_req_if (csr_req_if), .mul_req_if (mul_req_if), @@ -192,6 +196,7 @@ module VX_pipeline #( .warp_ctl_if (warp_ctl_if), .branch_ctl_if (branch_ctl_if), .alu_commit_if (alu_commit_if), + .bru_commit_if (bru_commit_if), .lsu_commit_if (lsu_commit_if), .csr_commit_if (csr_commit_if), .mul_commit_if (mul_commit_if), @@ -208,6 +213,7 @@ module VX_pipeline #( .reset (reset), .alu_commit_if (alu_commit_if), + .bru_commit_if (bru_commit_if), .lsu_commit_if (lsu_commit_if), .csr_commit_if (csr_commit_if), .mul_commit_if (mul_commit_if), diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh index dcea510d..b12551c4 100644 --- a/hw/rtl/VX_platform.vh +++ b/hw/rtl/VX_platform.vh @@ -1,6 +1,8 @@ `ifndef VX_PLATFORM `define VX_PLATFORM +`include "VX_scope.vh" + /////////////////////////////////////////////////////////////////////////////// `ifndef 
NDEBUG @@ -50,6 +52,7 @@ /////////////////////////////////////////////////////////////////////////////// `define USE_FAST_BRAM (* syn_ramstyle = "mlab" *) +`define RELAX_BRAM_RW (* syn_ramstyle = "no_rw_check" *) /////////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_print_instr.vh b/hw/rtl/VX_print_instr.vh index 9a5702e2..f8682f44 100644 --- a/hw/rtl/VX_print_instr.vh +++ b/hw/rtl/VX_print_instr.vh @@ -22,7 +22,7 @@ task print_ex_op; input [`EX_BITS-1:0] ex; input [`OP_BITS-1:0] op; begin - case (ex) + case (ex) `EX_ALU: begin case (`ALU_BITS'(op)) `ALU_ADD: $write("ADD"); @@ -37,22 +37,27 @@ task print_ex_op; `ALU_AND: $write("AND"); `ALU_LUI: $write("LUI"); `ALU_AUIPC: $write("AUIPC"); - `ALU_BEQ: $write("BEQ"); - `ALU_BNE: $write("BNE"); - `ALU_BLT: $write("BLT"); - `ALU_BGE: $write("BGE"); - `ALU_BLTU: $write("BLTU"); - `ALU_BGEU: $write("BGEU"); - `ALU_JAL: $write("JAL"); - `ALU_JALR: $write("JALR"); - `ALU_ECALL: $write("ECALL"); - `ALU_EBREAK:$write("EBREAK"); - `ALU_MRET: $write("MRET"); - `ALU_SRET: $write("SRET"); - `ALU_DRET: $write("DRET"); default: $write("?"); - endcase + endcase end + `EX_BRU: begin + case (`BRU_BITS'(op)) + `BRU_EQ: $write("BEQ"); + `BRU_NE: $write("BNE"); + `BRU_LT: $write("BLT"); + `BRU_GE: $write("BGE"); + `BRU_LTU: $write("BLTU"); + `BRU_GEU: $write("BGEU"); + `BRU_JAL: $write("JAL"); + `BRU_JALR: $write("JALR"); + `BRU_ECALL: $write("ECALL"); + `BRU_EBREAK:$write("EBREAK"); + `BRU_MRET: $write("MRET"); + `BRU_SRET: $write("SRET"); + `BRU_DRET: $write("DRET"); + default: $write("?"); + endcase + end `EX_LSU: begin case (`LSU_BITS'(op)) `LSU_LB: $write("LB"); diff --git a/hw/rtl/VX_scheduler.v b/hw/rtl/VX_scheduler.v deleted file mode 100644 index a3bfa32a..00000000 --- a/hw/rtl/VX_scheduler.v +++ /dev/null @@ -1,82 +0,0 @@ -`include "VX_define.vh" - -module VX_scheduler #( - parameter CORE_ID = 0 -) ( - input wire clk, - input wire reset, - - VX_decode_if decode_if, - VX_wb_if 
writeback_if, - VX_cmt_to_issue_if cmt_to_issue_if, - input wire ex_busy, - output wire [`ISTAG_BITS-1:0] issue_tag, - output wire schedule_delay -); - localparam CTVW = `CLOG2(`NUM_WARPS * `NUM_REGS + 1); - reg [`NUM_THREADS-1:0] inuse_registers [(`NUM_WARPS * `NUM_REGS)-1:0]; - reg [`NUM_REGS-1:0] inuse_reg_mask [`NUM_WARPS-1:0]; - - wire [`NUM_REGS-1:0] inuse_mask = inuse_reg_mask[decode_if.warp_num] & decode_if.reg_use_mask; - wire inuse_hazard = (inuse_mask != 0); - - wire issue_buf_full; - - assign schedule_delay = ex_busy || inuse_hazard || issue_buf_full; - - wire issue_fire = decode_if.valid && decode_if.ready; - - wire writeback_fire = writeback_if.valid && writeback_if.ready; - - wire acquire_rd = issue_fire && (decode_if.wb != 0); - - wire [`NUM_THREADS-1:0] inuse_registers_n = inuse_registers[{writeback_if.warp_num, writeback_if.rd}] & ~writeback_if.thread_mask; - - always @(posedge clk) begin - if (reset) begin - for (integer w = 0; w < `NUM_WARPS; w++) begin - for (integer i = 0; i < `NUM_REGS; i++) begin - inuse_registers[w * `NUM_REGS + i] <= 0; - end - inuse_reg_mask[w] <= `NUM_REGS'(0); - end - end else begin - if (acquire_rd) begin - inuse_registers[{decode_if.warp_num, decode_if.rd}] <= decode_if.thread_mask; - inuse_reg_mask[decode_if.warp_num][decode_if.rd] <= 1; - end - if (writeback_fire) begin - assert(inuse_reg_mask[writeback_if.warp_num][writeback_if.rd] != 0); - inuse_registers[{writeback_if.warp_num, writeback_if.rd}] <= inuse_registers_n; - inuse_reg_mask[writeback_if.warp_num][writeback_if.rd] <= (| inuse_registers_n); - end - end - end - - VX_cam_buffer #( - .DATAW ($bits(issue_data_t)), - .SIZE (`ISSUEQ_SIZE), - .RPORTS (`NUM_EXS) - ) issue_buffer ( - .clk (clk), - .reset (reset), - .write_data ({decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.rd, decode_if.wb}), - .write_addr (issue_tag), - .acquire_slot (issue_fire), - .release_slot ({cmt_to_issue_if.alu_valid, cmt_to_issue_if.lsu_valid, 
cmt_to_issue_if.csr_valid, cmt_to_issue_if.mul_valid, cmt_to_issue_if.fpu_valid, cmt_to_issue_if.gpu_valid}), - .read_addr ({cmt_to_issue_if.alu_tag, cmt_to_issue_if.lsu_tag, cmt_to_issue_if.csr_tag, cmt_to_issue_if.mul_tag, cmt_to_issue_if.fpu_tag, cmt_to_issue_if.gpu_tag}), - .read_data ({cmt_to_issue_if.alu_data, cmt_to_issue_if.lsu_data, cmt_to_issue_if.csr_data, cmt_to_issue_if.mul_data, cmt_to_issue_if.fpu_data, cmt_to_issue_if.gpu_data}), - .full (issue_buf_full) - ); - -`ifdef DBG_PRINT_PIPELINE - always @(posedge clk) begin - if (decode_if.valid && ~decode_if.ready) begin - $display("%t: Core%0d-stall: warp=%0d, PC=%0h, rd=%0d, wb=%0d, ib_full=%b, inuse=%b%b%b%b, ex_busy=%b", - $time, CORE_ID, decode_if.warp_num, decode_if.curr_PC, decode_if.rd, decode_if.wb, issue_buf_full, - inuse_mask[decode_if.rd], inuse_mask[decode_if.rs1], inuse_mask[decode_if.rs2], inuse_mask[decode_if.rs3], ex_busy); - end - end -`endif - -endmodule \ No newline at end of file diff --git a/hw/rtl/VX_scope.vh b/hw/rtl/VX_scope.vh index 2ceea369..29f522e3 100644 --- a/hw/rtl/VX_scope.vh +++ b/hw/rtl/VX_scope.vh @@ -15,12 +15,12 @@ scope_snp_req_invalidate, \ scope_snp_req_tag, \ scope_snp_rsp_tag, \ - scope_icache_req_warp_num, \ + scope_icache_req_wid, \ scope_icache_req_addr, \ scope_icache_req_tag, \ scope_icache_rsp_data, \ scope_icache_rsp_tag, \ - scope_dcache_req_warp_num, \ + scope_dcache_req_wid, \ scope_dcache_req_curr_PC, \ scope_dcache_req_addr, \ scope_dcache_req_rw, \ @@ -29,17 +29,17 @@ scope_dcache_req_tag, \ scope_dcache_rsp_data, \ scope_dcache_rsp_tag, \ - scope_decode_warp_num, \ + scope_decode_wid, \ scope_decode_curr_PC, \ scope_decode_is_jal, \ scope_decode_rs1, \ scope_decode_rs2, \ - scope_execute_warp_num, \ + scope_execute_wid, \ scope_execute_curr_PC, \ scope_execute_rd, \ scope_execute_a, \ scope_execute_b, \ - scope_writeback_warp_num, \ + scope_writeback_wid, \ scope_writeback_curr_PC, \ scope_writeback_wb, \ scope_writeback_rd, \ @@ -103,7 +103,7 @@ 
wire scope_snp_rsp_valid; \ wire [`VX_SNP_TAG_WIDTH-1:0] scope_snp_rsp_tag; \ wire scope_icache_req_valid; \ - wire [`NW_BITS-1:0] scope_icache_req_warp_num; \ + wire [`NW_BITS-1:0] scope_icache_req_wid; \ wire [31:0] scope_icache_req_addr; \ wire [`ICORE_TAG_WIDTH-1:0] scope_icache_req_tag; \ wire scope_icache_req_ready; \ @@ -112,7 +112,7 @@ wire [`ICORE_TAG_WIDTH-1:0] scope_icache_rsp_tag; \ wire scope_icache_rsp_ready; \ wire [`NUM_THREADS-1:0] scope_dcache_req_valid; \ - wire [`NW_BITS-1:0] scope_dcache_req_warp_num; \ + wire [`NW_BITS-1:0] scope_dcache_req_wid; \ wire [31:0] scope_dcache_req_curr_PC; \ wire [63:0] scope_dcache_req_addr; \ wire scope_dcache_req_rw; \ @@ -131,19 +131,19 @@ wire scope_exec_delay; \ wire scope_gpr_stage_delay; \ wire [`NUM_THREADS-1:0] scope_decode_valid; \ - wire [`NW_BITS-1:0] scope_decode_warp_num; \ + wire [`NW_BITS-1:0] scope_decode_wid; \ wire [31:0] scope_decode_curr_PC; \ wire scope_decode_is_jal; \ wire [`NR_BITS-1:0] scope_decode_rs1; \ wire [`NR_BITS-1:0] scope_decode_rs2; \ wire [`NUM_THREADS-1:0] scope_execute_valid; \ - wire [`NW_BITS-1:0] scope_execute_warp_num; \ + wire [`NW_BITS-1:0] scope_execute_wid; \ wire [31:0] scope_execute_curr_PC; \ wire [`NR_BITS-1:0] scope_execute_rd; \ wire [63:0] scope_execute_a; \ wire [63:0] scope_execute_b; \ wire [`NUM_THREADS-1:0] scope_writeback_valid; \ - wire [`NW_BITS-1:0] scope_writeback_warp_num; \ + wire [`NW_BITS-1:0] scope_writeback_wid; \ wire [31:0] scope_writeback_curr_PC; \ wire scope_writeback_wb; \ wire [`NR_BITS-1:0] scope_writeback_rd; \ @@ -162,7 +162,7 @@ `define SCOPE_SIGNALS_ISTAGE_IO \ output wire scope_icache_req_valid, \ - output wire [`NW_BITS-1:0] scope_icache_req_warp_num, \ + output wire [`NW_BITS-1:0] scope_icache_req_wid, \ output wire [31:0] scope_icache_req_addr, \ output wire [`ICORE_TAG_WIDTH-1:0] scope_icache_req_tag, \ output wire scope_icache_req_ready, \ @@ -173,7 +173,7 @@ `define SCOPE_SIGNALS_LSU_IO \ output wire [`NUM_THREADS-1:0] 
scope_dcache_req_valid, \ - output wire [`NW_BITS-1:0] scope_dcache_req_warp_num, \ + output wire [`NW_BITS-1:0] scope_dcache_req_wid, \ output wire [31:0] scope_dcache_req_curr_PC, \ output wire [63:0] scope_dcache_req_addr, \ output wire scope_dcache_req_rw, \ @@ -210,19 +210,19 @@ `define SCOPE_SIGNALS_BE_IO \ output wire [`NUM_THREADS-1:0] scope_decode_valid, \ - output wire [`NW_BITS-1:0] scope_decode_warp_num, \ + output wire [`NW_BITS-1:0] scope_decode_wid, \ output wire [31:0] scope_decode_curr_PC, \ output wire scope_decode_is_jal, \ output wire [`NR_BITS-1:0] scope_decode_rs1, \ output wire [`NR_BITS-1:0] scope_decode_rs2, \ output wire [`NUM_THREADS-1:0] scope_execute_valid, \ - output wire [`NW_BITS-1:0] scope_execute_warp_num, \ + output wire [`NW_BITS-1:0] scope_execute_wid, \ output wire [31:0] scope_execute_curr_PC, \ output wire [`NR_BITS-1:0] scope_execute_rd, \ output wire [63:0] scope_execute_a, \ output wire [63:0] scope_execute_b, \ output wire [`NUM_THREADS-1:0] scope_writeback_valid, \ - output wire [`NW_BITS-1:0] scope_writeback_warp_num, \ + output wire [`NW_BITS-1:0] scope_writeback_wid, \ output wire [31:0] scope_writeback_curr_PC, \ output wire scope_writeback_wb, \ output wire [`NR_BITS-1:0] scope_writeback_rd, \ @@ -230,7 +230,7 @@ `define SCOPE_SIGNALS_ISTAGE_BIND \ .scope_icache_req_valid (scope_icache_req_valid), \ - .scope_icache_req_warp_num (scope_icache_req_warp_num), \ + .scope_icache_req_wid (scope_icache_req_wid), \ .scope_icache_req_addr (scope_icache_req_addr), \ .scope_icache_req_tag (scope_icache_req_tag), \ .scope_icache_req_ready (scope_icache_req_ready), \ @@ -241,7 +241,7 @@ `define SCOPE_SIGNALS_LSU_BIND \ .scope_dcache_req_valid (scope_dcache_req_valid), \ - .scope_dcache_req_warp_num (scope_dcache_req_warp_num), \ + .scope_dcache_req_wid (scope_dcache_req_wid), \ .scope_dcache_req_curr_PC (scope_dcache_req_curr_PC), \ .scope_dcache_req_addr (scope_dcache_req_addr), \ .scope_dcache_req_rw (scope_dcache_req_rw), \ 
@@ -332,19 +332,19 @@ `define SCOPE_SIGNALS_BE_BIND \ .scope_decode_valid (scope_decode_valid), \ - .scope_decode_warp_num (scope_decode_warp_num), \ + .scope_decode_wid (scope_decode_wid), \ .scope_decode_curr_PC (scope_decode_curr_PC), \ .scope_decode_is_jal (scope_decode_is_jal), \ .scope_decode_rs1 (scope_decode_rs1), \ .scope_decode_rs2 (scope_decode_rs2), \ .scope_execute_valid (scope_execute_valid), \ - .scope_execute_warp_num (scope_execute_warp_num), \ + .scope_execute_wid (scope_execute_wid), \ .scope_execute_curr_PC (scope_execute_curr_PC), \ .scope_execute_rd (scope_execute_rd), \ .scope_execute_a (scope_execute_a), \ .scope_execute_b (scope_execute_b), \ .scope_writeback_valid (scope_writeback_valid), \ - .scope_writeback_warp_num (scope_writeback_warp_num), \ + .scope_writeback_wid (scope_writeback_wid), \ .scope_writeback_curr_PC(scope_writeback_curr_PC), \ .scope_writeback_wb (scope_writeback_wb), \ .scope_writeback_rd (scope_writeback_rd), \ diff --git a/hw/rtl/VX_scoreboard.v b/hw/rtl/VX_scoreboard.v new file mode 100644 index 00000000..265e9cf5 --- /dev/null +++ b/hw/rtl/VX_scoreboard.v @@ -0,0 +1,73 @@ +`include "VX_define.vh" + +module VX_scoreboard #( + parameter CORE_ID = 0 +) ( + input wire clk, + input wire reset, + + VX_decode_if decode_if, + VX_wb_if writeback_if, + VX_cmt_to_issue_if cmt_to_issue_if, + input wire ex_busy, + output wire [`ISTAG_BITS-1:0] issue_tag, + output wire schedule_delay +); + reg [`NUM_REGS-1:0] inuse_reg_mask [`NUM_WARPS-1:0]; + + wire [`NUM_REGS-1:0] inuse_mask = inuse_reg_mask[decode_if.wid] & decode_if.reg_use_mask; + wire inuse_hazard = (inuse_mask != 0); + + wire issue_buf_full; + + assign schedule_delay = ex_busy || inuse_hazard || issue_buf_full; + + wire issue_fire = decode_if.valid && decode_if.ready; + + wire reserve_rd = issue_fire && (decode_if.wb != 0); + + wire release_rd = writeback_if.valid; + + always @(posedge clk) begin + if (reset) begin + for (integer i = 0; i < `NUM_WARPS; i++) begin + 
inuse_reg_mask[i] <= `NUM_REGS'(0); + end + end else begin + if (reserve_rd) begin + inuse_reg_mask[decode_if.wid][decode_if.rd] <= 1; + end + if (release_rd) begin + assert(inuse_reg_mask[writeback_if.wid][writeback_if.rd] != 0); + inuse_reg_mask[writeback_if.wid][writeback_if.rd] <= 0; + end + end + end + + VX_cam_buffer #( + .DATAW ($bits(issue_data_t)), + .SIZE (`ISSUEQ_SIZE), + .RPORTS (`NUM_EXS) + ) issue_table ( + .clk (clk), + .reset (reset), + .write_data ({decode_if.wid, decode_if.thread_mask, decode_if.curr_PC, decode_if.rd, decode_if.wb}), + .write_addr (issue_tag), + .acquire_slot (issue_fire), + .release_slot ({cmt_to_issue_if.alu_valid, cmt_to_issue_if.bru_valid, cmt_to_issue_if.lsu_valid, cmt_to_issue_if.csr_valid, cmt_to_issue_if.mul_valid, cmt_to_issue_if.fpu_valid, cmt_to_issue_if.gpu_valid}), + .read_addr ({cmt_to_issue_if.alu_tag, cmt_to_issue_if.bru_tag, cmt_to_issue_if.lsu_tag, cmt_to_issue_if.csr_tag, cmt_to_issue_if.mul_tag, cmt_to_issue_if.fpu_tag, cmt_to_issue_if.gpu_tag}), + .read_data ({cmt_to_issue_if.alu_data, cmt_to_issue_if.bru_data, cmt_to_issue_if.lsu_data, cmt_to_issue_if.csr_data, cmt_to_issue_if.mul_data, cmt_to_issue_if.fpu_data, cmt_to_issue_if.gpu_data}), + .full (issue_buf_full) + ); + +`ifdef DBG_PRINT_PIPELINE + always @(posedge clk) begin + if (decode_if.valid && ~decode_if.ready) begin + $display("%t: Core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, ib_full=%b, inuse=%b%b%b%b, ex_busy=%b", + $time, CORE_ID, decode_if.wid, decode_if.curr_PC, decode_if.rd, decode_if.wb, issue_buf_full, + inuse_mask[decode_if.rd], inuse_mask[decode_if.rs1], inuse_mask[decode_if.rs2], inuse_mask[decode_if.rs3], ex_busy); + end + end +`endif + +endmodule \ No newline at end of file diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh new file mode 100644 index 00000000..7b464cf5 --- /dev/null +++ b/hw/rtl/VX_types.vh @@ -0,0 +1,59 @@ +`ifndef VX_TYPES +`define VX_TYPES + +`include "VX_define.vh" + +typedef struct packed { + logic 
[`NW_BITS-1:0] wid; + logic [`NUM_THREADS-1:0] thread_mask; + logic [31:0] curr_PC; + logic [`NR_BITS-1:0] rd; + logic wb; +} issue_data_t; + +typedef struct packed { + logic is_normal; + logic is_zero; + logic is_subnormal; + logic is_inf; + logic is_nan; + logic is_signaling; + logic is_quiet; +} fp_type_t; + +typedef struct packed { + logic NV; // Invalid + logic DZ; // Divide by zero + logic OF; // Overflow + logic UF; // Underflow + logic NX; // Inexact +} fflags_t; + +`define FFG_BITS $bits(fflags_t) + +typedef struct packed { + logic valid; + logic [`NUM_THREADS-1:0] thread_mask; +} gpu_tmc_t; + +typedef struct packed { + logic valid; + logic [`NUM_WARPS-1:0] wmask; + logic [31:0] pc; +} gpu_wspawn_t; + +typedef struct packed { + logic valid; + logic diverged; + logic [`NUM_THREADS-1:0] then_mask; + logic [`NUM_THREADS-1:0] else_mask; + logic [31:0] pc; +} gpu_split_t; + +typedef struct packed { + logic valid; + logic [`NB_BITS-1:0] id; + logic [`NW_BITS:0] num_warps; +} gpu_barrier_t; + +`endif \ No newline at end of file diff --git a/hw/rtl/VX_warp_sched.v b/hw/rtl/VX_warp_sched.v index 80eda9f7..a4ef9214 100644 --- a/hw/rtl/VX_warp_sched.v +++ b/hw/rtl/VX_warp_sched.v @@ -16,64 +16,44 @@ module VX_warp_sched #( output wire busy ); - wire update_use_wspawn; - wire update_visible_active; - - wire [(1+32+`NUM_THREADS-1):0] ipdom[`NUM_WARPS-1:0]; - wire join_fall; wire [31:0] join_pc; wire [`NUM_THREADS-1:0] join_tm; reg [`NUM_WARPS-1:0] warp_active; reg [`NUM_WARPS-1:0] warp_stalled; - - reg [`NUM_WARPS-1:0] visible_active; - wire [`NUM_WARPS-1:0] use_active; - + reg [`NUM_WARPS-1:0] visible_active; + wire update_visible_active; + reg [`NUM_WARPS-1:0] warp_lock; - wire wstall_this_cycle; - reg [`NUM_THREADS-1:0] thread_masks[`NUM_WARPS-1:0]; reg [31:0] warp_pcs[`NUM_WARPS-1:0]; // barriers - reg [`NUM_WARPS-1:0] barrier_stall_mask[`NUM_BARRIERS-1:0]; - wire [`NUM_WARPS-1:0] b_mask; - wire [`NW_BITS:0] b_count; - + reg [`NUM_WARPS-1:0] 
barrier_stall_mask[`NUM_BARRIERS-1:0]; wire reached_barrier_limit; + reg [`NUM_WARPS-1:0] total_barrier_stall; // wspawn reg [31:0] use_wspawn_pc; reg [`NUM_WARPS-1:0] use_wspawn; - wire [`NW_BITS-1:0] warp_to_schedule; - wire schedule; - + wire [`NUM_THREADS-1:0] thread_mask; - wire [`NW_BITS-1:0] warp_num; wire [31:0] warp_pc; + wire [`NW_BITS-1:0] warp_to_schedule; wire scheduled_warp; - wire hazard; + wire stall_out; wire global_stall; + wire real_schedule; - wire real_schedule; - - wire [31:0] new_pc; - - reg [`NUM_WARPS-1:0] total_barrier_stall; - - reg didnt_split; - - wire stall; + reg didnt_split; always @(posedge clk) begin - integer i; if (reset) begin - for (i = 0; i < `NUM_BARRIERS; i++) begin + for (integer i = 0; i < `NUM_BARRIERS; i++) begin barrier_stall_mask[i] <= 0; end @@ -87,92 +67,92 @@ module VX_warp_sched #( didnt_split <= 0; warp_lock <= 0; - for (i = 1; i < `NUM_WARPS; i++) begin + for (integer i = 1; i < `NUM_WARPS; i++) begin warp_pcs[i] <= 0; warp_active[i] <= 0; // Activating first warp visible_active[i] <= 0; // Activating first warp thread_masks[i] <= 1; // Activating first thread in first warp end - - end else begin - - if (warp_ctl_if.wspawn) begin - warp_active <= warp_ctl_if.wspawn_wmask; - use_wspawn <= warp_ctl_if.wspawn_wmask & (~`NUM_WARPS'(1)); - use_wspawn_pc <= warp_ctl_if.wspawn_pc; + end else begin + if (warp_ctl_if.wspawn.valid) begin + warp_active <= warp_ctl_if.wspawn.wmask; + use_wspawn <= warp_ctl_if.wspawn.wmask & (~`NUM_WARPS'(1)); + use_wspawn_pc <= warp_ctl_if.wspawn.pc; end - if (warp_ctl_if.is_barrier) begin - warp_stalled[warp_ctl_if.warp_num] <= 0; + if (warp_ctl_if.barrier.valid) begin + warp_stalled[warp_ctl_if.wid] <= 0; if (reached_barrier_limit) begin - barrier_stall_mask[warp_ctl_if.barrier_id] <= 0; + barrier_stall_mask[warp_ctl_if.barrier.id] <= 0; end else begin - barrier_stall_mask[warp_ctl_if.barrier_id][warp_ctl_if.warp_num] <= 1; + barrier_stall_mask[warp_ctl_if.barrier.id][warp_ctl_if.wid] <= 
1; end - end else if (warp_ctl_if.change_mask) begin - thread_masks[warp_ctl_if.warp_num] <= warp_ctl_if.thread_mask; - warp_stalled[warp_ctl_if.warp_num] <= 0; - if (0 == warp_ctl_if.thread_mask) begin - warp_active[warp_ctl_if.warp_num] <= 0; - visible_active[warp_ctl_if.warp_num] <= 0; + end else if (warp_ctl_if.tmc.valid) begin + thread_masks[warp_ctl_if.wid] <= warp_ctl_if.tmc.thread_mask; + warp_stalled[warp_ctl_if.wid] <= 0; + if (0 == warp_ctl_if.tmc.thread_mask) begin + warp_active[warp_ctl_if.wid] <= 0; + visible_active[warp_ctl_if.wid] <= 0; end end else if (join_if.is_join && !didnt_split) begin if (!join_fall) begin - warp_pcs[join_if.warp_num] <= join_pc; + warp_pcs[join_if.wid] <= join_pc; end - thread_masks[join_if.warp_num] <= join_tm; - didnt_split <= 0; - end else if (warp_ctl_if.is_split) begin - warp_stalled[warp_ctl_if.warp_num] <= 0; - if (warp_ctl_if.do_split) begin - thread_masks[warp_ctl_if.warp_num] <= warp_ctl_if.split_new_mask; + thread_masks[join_if.wid] <= join_tm; + didnt_split <= 0; + end else if (warp_ctl_if.split.valid) begin + warp_stalled[warp_ctl_if.wid] <= 0; + if (warp_ctl_if.split.diverged) begin + thread_masks[warp_ctl_if.wid] <= warp_ctl_if.split.then_mask; didnt_split <= 0; end else begin didnt_split <= 1; end end - if (update_use_wspawn) begin + if (use_wspawn[warp_to_schedule] && !global_stall) begin use_wspawn[warp_to_schedule] <= 0; thread_masks[warp_to_schedule] <= 1; end // Stalling the scheduling of warps if (wstall_if.wstall) begin - warp_stalled[wstall_if.warp_num] <= 1; - visible_active[wstall_if.warp_num] <= 0; + warp_stalled[wstall_if.wid] <= 1; + visible_active[wstall_if.wid] <= 0; end // Refilling active warps if (update_visible_active) begin - visible_active <= warp_active & (~warp_stalled) & (~total_barrier_stall) & ~warp_lock; + visible_active <= warp_active & ~warp_stalled & ~total_barrier_stall & ~warp_lock; end // Don't change state if stall if (!global_stall && real_schedule && (thread_mask != 0)) 
begin visible_active[warp_to_schedule] <= 0; - warp_pcs[warp_to_schedule] <= new_pc; + warp_pcs[warp_to_schedule] <= warp_pc + 4; end // Branch if (branch_ctl_if.valid) begin if (branch_ctl_if.taken) begin - warp_pcs[branch_ctl_if.warp_num] <= branch_ctl_if.dest; + warp_pcs[branch_ctl_if.wid] <= branch_ctl_if.dest; end - warp_stalled[branch_ctl_if.warp_num] <= 0; + warp_stalled[branch_ctl_if.wid] <= 0; end // Lock/Release - if (scheduled_warp && !stall) begin - warp_lock[warp_num] <= 1; + if (scheduled_warp && !stall_out) begin + warp_lock[warp_to_schedule] <= 1; end if (ifetch_rsp_if.valid && ifetch_rsp_if.ready) begin - warp_lock[ifetch_rsp_if.warp_num] <= 0; + warp_lock[ifetch_rsp_if.wid] <= 0; end - end end + wire [`NUM_WARPS-1:0] b_mask = barrier_stall_mask[warp_ctl_if.barrier.id][`NUM_WARPS-1:0]; + wire [`NW_BITS:0] b_count; + VX_countones #( .N(`NUM_WARPS) ) barrier_count ( @@ -188,26 +168,24 @@ module VX_warp_sched #( .valids(visible_active), .count (count_visible_active) ); - - assign b_mask = barrier_stall_mask[warp_ctl_if.barrier_id][`NUM_WARPS-1:0]; - assign reached_barrier_limit = (b_count == warp_ctl_if.barrier_num_warps); - - assign wstall_this_cycle = wstall_if.wstall && (wstall_if.warp_num == warp_to_schedule); // Maybe bug + assign reached_barrier_limit = (b_count == warp_ctl_if.barrier.num_warps); assign total_barrier_stall = barrier_stall_mask[0] | barrier_stall_mask[1] | barrier_stall_mask[2] | barrier_stall_mask[3]; - assign update_visible_active = (0 == count_visible_active) && !(stall || wstall_this_cycle || hazard || join_if.is_join); + wire [(1+32+`NUM_THREADS-1):0] ipdom[`NUM_WARPS-1:0]; + wire [(1+32+`NUM_THREADS-1):0] q1 = {1'b1, 32'b0, thread_masks[warp_ctl_if.wid]}; + wire [(1+32+`NUM_THREADS-1):0] q2 = {1'b0, warp_ctl_if.split.pc, warp_ctl_if.split.else_mask}; - wire [(1+32+`NUM_THREADS-1):0] q1 = {1'b1, 32'b0, thread_masks[warp_ctl_if.warp_num]}; - wire [(1+32+`NUM_THREADS-1):0] q2 = {1'b0, warp_ctl_if.split_save_pc, 
warp_ctl_if.split_later_mask}; + assign {join_fall, join_pc, join_tm} = ipdom[join_if.wid]; - assign {join_fall, join_pc, join_tm} = ipdom[join_if.warp_num]; + for (genvar i = 0; i < `NUM_WARPS; i++) begin + wire push = warp_ctl_if.split.valid + && warp_ctl_if.split.diverged + && (i == warp_ctl_if.wid); - genvar i; - for (i = 0; i < `NUM_WARPS; i++) begin - wire push = warp_ctl_if.is_split && warp_ctl_if.do_split && (i == warp_ctl_if.warp_num); - wire pop = join_if.is_join && (i == join_if.warp_num); + wire pop = join_if.is_join + && (i == join_if.wid); VX_ipdom_stack #( .WIDTH(1+32+`NUM_THREADS), @@ -217,37 +195,40 @@ module VX_warp_sched #( .reset(reset), .push (push), .pop (pop), - .d (ipdom[i]), .q1 (q1), .q2 (q2), + .d (ipdom[i]), `UNUSED_PIN (empty), `UNUSED_PIN (full) ); end - wire should_bra = (branch_ctl_if.valid && branch_ctl_if.taken && (warp_to_schedule == branch_ctl_if.warp_num)); + wire schedule; - assign hazard = should_bra && schedule; + wire branch_hazard = schedule + && branch_ctl_if.valid + && branch_ctl_if.taken + && (branch_ctl_if.wid == warp_to_schedule); - assign real_schedule = schedule && !warp_stalled[warp_to_schedule] && !total_barrier_stall[warp_to_schedule] && !warp_lock[0]; + assign real_schedule = schedule + && !warp_stalled[warp_to_schedule] + && !total_barrier_stall[warp_to_schedule] + && !warp_lock[0]; - assign global_stall = stall || wstall_this_cycle || hazard || !real_schedule || join_if.is_join; + wire wstall_this_cycle = wstall_if.wstall && (wstall_if.wid == warp_to_schedule); // Maybe bug - assign scheduled_warp = !(wstall_this_cycle || hazard || !real_schedule || join_if.is_join) && !reset; + assign update_visible_active = (0 == count_visible_active) && !(stall_out || wstall_this_cycle || branch_hazard || join_if.is_join); - wire real_use_wspawn = use_wspawn[warp_to_schedule]; + assign global_stall = stall_out || wstall_this_cycle || branch_hazard || !real_schedule || join_if.is_join; - assign warp_pc = real_use_wspawn ? 
use_wspawn_pc : warp_pcs[warp_to_schedule]; + assign scheduled_warp = !(wstall_this_cycle || branch_hazard || !real_schedule || join_if.is_join) && !reset; + + assign warp_pc = use_wspawn[warp_to_schedule] ? use_wspawn_pc : warp_pcs[warp_to_schedule]; - assign thread_mask = global_stall ? 0 : (real_use_wspawn ? `NUM_THREADS'(1) : thread_masks[warp_to_schedule]); + assign thread_mask = global_stall ? 0 : (use_wspawn[warp_to_schedule] ? `NUM_THREADS'(1) : thread_masks[warp_to_schedule]); - assign warp_num = warp_to_schedule; - - assign update_use_wspawn = use_wspawn[warp_to_schedule] && !global_stall; - - assign new_pc = warp_pc + 4; - - assign use_active = (count_visible_active != 0) ? visible_active : (warp_active & (~warp_stalled) & (~total_barrier_stall) & (~warp_lock)); + wire [`NUM_WARPS-1:0] use_active = (count_visible_active != 0) ? visible_active : + (warp_active & ~warp_stalled & ~total_barrier_stall & ~warp_lock); // Choosing a warp to schedule VX_fixed_arbiter #( @@ -261,17 +242,17 @@ module VX_warp_sched #( `UNUSED_PIN (grant_onehot) ); - assign stall = ~ifetch_req_if.ready && ifetch_req_if.valid; + assign stall_out = ~ifetch_req_if.ready && ifetch_req_if.valid; VX_generic_register #( .N(1 + `NUM_THREADS + 32 + `NW_BITS) ) fetch_reg ( .clk (clk), .reset (reset), - .stall (stall), + .stall (stall_out), .flush (0), - .in ({(| thread_mask), thread_mask, warp_pc, warp_num}), - .out ({ifetch_req_if.valid, ifetch_req_if.thread_mask, ifetch_req_if.curr_PC, ifetch_req_if.warp_num}) + .in ({(| thread_mask), thread_mask, warp_pc, warp_to_schedule}), + .out ({ifetch_req_if.valid, ifetch_req_if.thread_mask, ifetch_req_if.curr_PC, ifetch_req_if.wid}) ); assign busy = (warp_active != 0); diff --git a/hw/rtl/VX_writeback.v b/hw/rtl/VX_writeback.v index 717e9cf4..dd5da562 100644 --- a/hw/rtl/VX_writeback.v +++ b/hw/rtl/VX_writeback.v @@ -8,6 +8,7 @@ module VX_writeback #( // inputs VX_exu_to_cmt_if alu_commit_if, + VX_exu_to_cmt_if bru_commit_if, VX_exu_to_cmt_if 
lsu_commit_if, VX_exu_to_cmt_if csr_commit_if, VX_exu_to_cmt_if mul_commit_if, @@ -20,26 +21,24 @@ module VX_writeback #( ); reg [`ISSUEQ_SIZE-1:0] wb_valid_table, wb_valid_table_n; reg [`ISSUEQ_SIZE-1:0][`NUM_THREADS-1:0][31:0] wb_data_table, wb_data_table_n; - reg [`ISSUEQ_SIZE-1:0][`NW_BITS-1:0] wb_warp_num_table, wb_warp_num_table_n; + reg [`ISSUEQ_SIZE-1:0][`NW_BITS-1:0] wb_wid_table, wb_wid_table_n; reg [`ISSUEQ_SIZE-1:0][`NUM_THREADS-1:0] wb_thread_mask_table, wb_thread_mask_table_n; reg [`ISSUEQ_SIZE-1:0][31:0] wb_curr_PC_table, wb_curr_PC_table_n; reg [`ISSUEQ_SIZE-1:0][`NR_BITS-1:0] wb_rd_table, wb_rd_table_n; + reg wb_valid, wb_valid_n; reg [`NUM_THREADS-1:0][31:0] wb_data, wb_data_n; - reg [`NW_BITS-1:0] wb_warp_num, wb_warp_num_n; + reg [`NW_BITS-1:0] wb_wid, wb_wid_n; reg [`NUM_THREADS-1:0] wb_thread_mask, wb_thread_mask_n; reg [31:0] wb_curr_PC, wb_curr_PC_n; reg [`NR_BITS-1:0] wb_rd, wb_rd_n; reg [`ISTAG_BITS-1:0] wb_index; - reg [`ISTAG_BITS-1:0] wb_index_n; - - reg wb_valid; - reg wb_valid_n; + reg [`ISTAG_BITS-1:0] wb_index_n; always @(*) begin wb_valid_table_n = wb_valid_table; - wb_warp_num_table_n = wb_warp_num_table; + wb_wid_table_n = wb_wid_table; wb_thread_mask_table_n = wb_thread_mask_table; wb_curr_PC_table_n = wb_curr_PC_table; wb_rd_table_n = wb_rd_table; @@ -53,16 +52,25 @@ module VX_writeback #( wb_valid_table_n [alu_commit_if.issue_tag] = cmt_to_issue_if.alu_data.wb; wb_thread_mask_table_n [alu_commit_if.issue_tag] = cmt_to_issue_if.alu_data.thread_mask; wb_data_table_n [alu_commit_if.issue_tag] = alu_commit_if.data; - wb_warp_num_table_n [alu_commit_if.issue_tag] = cmt_to_issue_if.alu_data.warp_num; + wb_wid_table_n [alu_commit_if.issue_tag] = cmt_to_issue_if.alu_data.wid; wb_curr_PC_table_n [alu_commit_if.issue_tag] = cmt_to_issue_if.alu_data.curr_PC; wb_rd_table_n [alu_commit_if.issue_tag] = cmt_to_issue_if.alu_data.rd; end + if (bru_commit_if.valid) begin + wb_valid_table_n [bru_commit_if.issue_tag] = 
cmt_to_issue_if.bru_data.wb; + wb_thread_mask_table_n [bru_commit_if.issue_tag] = cmt_to_issue_if.bru_data.thread_mask; + wb_data_table_n [bru_commit_if.issue_tag] = bru_commit_if.data; + wb_wid_table_n [bru_commit_if.issue_tag] = cmt_to_issue_if.bru_data.wid; + wb_curr_PC_table_n [bru_commit_if.issue_tag] = cmt_to_issue_if.bru_data.curr_PC; + wb_rd_table_n [bru_commit_if.issue_tag] = cmt_to_issue_if.bru_data.rd; + end + if (lsu_commit_if.valid) begin wb_valid_table_n [lsu_commit_if.issue_tag] = cmt_to_issue_if.lsu_data.wb; wb_thread_mask_table_n [lsu_commit_if.issue_tag] = cmt_to_issue_if.lsu_data.thread_mask; wb_data_table_n [lsu_commit_if.issue_tag] = lsu_commit_if.data; - wb_warp_num_table_n [lsu_commit_if.issue_tag] = cmt_to_issue_if.lsu_data.warp_num; + wb_wid_table_n [lsu_commit_if.issue_tag] = cmt_to_issue_if.lsu_data.wid; wb_curr_PC_table_n [lsu_commit_if.issue_tag] = cmt_to_issue_if.lsu_data.curr_PC; wb_rd_table_n [lsu_commit_if.issue_tag] = cmt_to_issue_if.lsu_data.rd; end @@ -71,7 +79,7 @@ module VX_writeback #( wb_valid_table_n [csr_commit_if.issue_tag] = cmt_to_issue_if.csr_data.wb; wb_thread_mask_table_n [csr_commit_if.issue_tag] = cmt_to_issue_if.csr_data.thread_mask; wb_data_table_n [csr_commit_if.issue_tag] = csr_commit_if.data; - wb_warp_num_table_n [csr_commit_if.issue_tag] = cmt_to_issue_if.csr_data.warp_num; + wb_wid_table_n [csr_commit_if.issue_tag] = cmt_to_issue_if.csr_data.wid; wb_curr_PC_table_n [csr_commit_if.issue_tag] = cmt_to_issue_if.csr_data.curr_PC; wb_rd_table_n [csr_commit_if.issue_tag] = cmt_to_issue_if.csr_data.rd; end @@ -80,7 +88,7 @@ module VX_writeback #( wb_valid_table_n [mul_commit_if.issue_tag] = cmt_to_issue_if.mul_data.wb; wb_thread_mask_table_n [mul_commit_if.issue_tag] = cmt_to_issue_if.mul_data.thread_mask; wb_data_table_n [mul_commit_if.issue_tag] = mul_commit_if.data; - wb_warp_num_table_n [mul_commit_if.issue_tag] = cmt_to_issue_if.mul_data.warp_num; + wb_wid_table_n [mul_commit_if.issue_tag] = 
cmt_to_issue_if.mul_data.wid; wb_curr_PC_table_n [mul_commit_if.issue_tag] = cmt_to_issue_if.mul_data.curr_PC; wb_rd_table_n [mul_commit_if.issue_tag] = cmt_to_issue_if.mul_data.rd; end @@ -89,7 +97,7 @@ module VX_writeback #( wb_valid_table_n [fpu_commit_if.issue_tag] = cmt_to_issue_if.fpu_data.wb; wb_thread_mask_table_n [fpu_commit_if.issue_tag] = cmt_to_issue_if.fpu_data.thread_mask; wb_data_table_n [fpu_commit_if.issue_tag] = fpu_commit_if.data; - wb_warp_num_table_n [fpu_commit_if.issue_tag] = cmt_to_issue_if.fpu_data.warp_num; + wb_wid_table_n [fpu_commit_if.issue_tag] = cmt_to_issue_if.fpu_data.wid; wb_curr_PC_table_n [fpu_commit_if.issue_tag] = cmt_to_issue_if.fpu_data.curr_PC; wb_rd_table_n [fpu_commit_if.issue_tag] = cmt_to_issue_if.fpu_data.rd; end @@ -98,23 +106,25 @@ module VX_writeback #( wb_valid_table_n [gpu_commit_if.issue_tag] = cmt_to_issue_if.gpu_data.wb; wb_thread_mask_table_n [gpu_commit_if.issue_tag] = cmt_to_issue_if.gpu_data.thread_mask; wb_data_table_n [gpu_commit_if.issue_tag] = gpu_commit_if.data; - wb_warp_num_table_n [gpu_commit_if.issue_tag] = cmt_to_issue_if.gpu_data.warp_num; + wb_wid_table_n [gpu_commit_if.issue_tag] = cmt_to_issue_if.gpu_data.wid; wb_curr_PC_table_n [gpu_commit_if.issue_tag] = cmt_to_issue_if.gpu_data.curr_PC; wb_rd_table_n [gpu_commit_if.issue_tag] = cmt_to_issue_if.gpu_data.rd; end end - integer i; - always @(*) begin - wb_index_n = 0; - wb_valid_n = 0; - for (i = `ISSUEQ_SIZE-1; i >= 0; i--) begin + wb_index_n = 0; + wb_valid_n = 0; + wb_thread_mask_n = {`NUM_THREADS{1'bx}}; + wb_wid_n = {`NW_BITS{1'bx}}; + wb_curr_PC_n = {32{1'bx}}; + wb_data_n = {(`NUM_THREADS * 32){1'bx}}; + for (integer i = `ISSUEQ_SIZE-1; i >= 0; i--) begin if (wb_valid_table_n[i]) begin wb_index_n = `ISTAG_BITS'(i); wb_valid_n = 1; wb_thread_mask_n= wb_thread_mask_table_n[i]; - wb_warp_num_n = wb_warp_num_table_n[i]; + wb_wid_n = wb_wid_table_n[i]; wb_curr_PC_n = wb_curr_PC_table_n[i]; wb_rd_n = wb_rd_table_n[i]; wb_data_n = 
wb_data_table_n[i]; @@ -130,15 +140,15 @@ module VX_writeback #( end else begin wb_valid_table <= wb_valid_table_n; wb_thread_mask_table <= wb_thread_mask_table_n; - wb_warp_num_table <= wb_warp_num_table_n; + wb_wid_table <= wb_wid_table_n; wb_curr_PC_table <= wb_curr_PC_table_n; wb_rd_table <= wb_rd_table_n; wb_data_table <= wb_data_table_n; wb_index <= wb_index_n; - wb_valid <= wb_valid_n && writeback_if.ready; + wb_valid <= wb_valid_n; wb_thread_mask <= wb_thread_mask_n; - wb_warp_num <= wb_warp_num_n; + wb_wid <= wb_wid_n; wb_curr_PC <= wb_curr_PC_n; wb_rd <= wb_rd_n; wb_data <= wb_data_n; @@ -148,18 +158,10 @@ module VX_writeback #( // writeback request assign writeback_if.valid = wb_valid; assign writeback_if.thread_mask = wb_thread_mask; - assign writeback_if.warp_num = wb_warp_num; + assign writeback_if.wid = wb_wid; assign writeback_if.curr_PC = wb_curr_PC; assign writeback_if.rd = wb_rd; assign writeback_if.data = wb_data; - - // commit back-pressure - assign alu_commit_if.ready = 1'b1; - assign lsu_commit_if.ready = 1'b1; - assign csr_commit_if.ready = 1'b1; - assign mul_commit_if.ready = 1'b1; - assign fpu_commit_if.ready = 1'b1; - assign gpu_commit_if.ready = 1'b1; // special workaround to get RISC-V tests Pass/Fail status reg [31:0] last_wb_value [`NUM_REGS-1:0] /* verilator public */; diff --git a/hw/rtl/Vortex.v b/hw/rtl/Vortex.v index a606c15f..8122f184 100644 --- a/hw/rtl/Vortex.v +++ b/hw/rtl/Vortex.v @@ -191,8 +191,7 @@ module Vortex ( wire [`CLOG2(`NUM_CLUSTERS)-1:0] csr_io_request_id = `CLOG2(`NUM_CLUSTERS)'(csr_io_req_coreid >> `CLOG2(`NUM_CLUSTERS)); wire [`NC_BITS-1:0] per_cluster_csr_io_req_coreid = `NC_BITS'(csr_io_req_coreid); - genvar i; - for (i = 0; i < `NUM_CLUSTERS; i++) begin + for (genvar i = 0; i < `NUM_CLUSTERS; i++) begin VX_cluster #( .CLUSTER_ID(i) ) cluster ( @@ -358,7 +357,7 @@ module Vortex ( wire [`NUM_CLUSTERS-1:0][`L2SNP_TAG_WIDTH-1:0] l3_snp_fwdin_tag; wire [`NUM_CLUSTERS-1:0] l3_snp_fwdin_ready; - for (i = 0; i < 
`L3NUM_REQUESTS; i++) begin + for (genvar i = 0; i < `L3NUM_REQUESTS; i++) begin // Core Request assign l3_core_req_valid [i] = per_cluster_dram_req_valid [i]; assign l3_core_req_rw [i] = per_cluster_dram_req_rw [i]; diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index 73f955a9..fc68ca09 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -108,7 +108,7 @@ module VX_bank #( wire[31:0] debug_pc_st0; wire debug_wb_st0; wire[`NR_BITS-1:0] debug_rd_st0; - wire[`NW_BITS-1:0] debug_warp_num_st0; + wire[`NW_BITS-1:0] debug_wid_st0; wire debug_rw_st0; wire[WORD_SIZE-1:0] debug_byteen_st0; wire[`REQS_BITS-1:0] debug_tid_st0; @@ -117,7 +117,7 @@ module VX_bank #( wire[31:0] debug_pc_st1e; wire debug_wb_st1e; wire[`NR_BITS-1:0] debug_rd_st1e; - wire[`NW_BITS-1:0] debug_warp_num_st1e; + wire[`NW_BITS-1:0] debug_wid_st1e; wire debug_rw_st1e; wire[WORD_SIZE-1:0] debug_byteen_st1e; wire[`REQS_BITS-1:0] debug_tid_st1e; @@ -126,7 +126,7 @@ module VX_bank #( wire[31:0] debug_pc_st2; wire debug_wb_st2; wire[`NR_BITS-1:0] debug_rd_st2; - wire[`NW_BITS-1:0] debug_warp_num_st2; + wire[`NW_BITS-1:0] debug_wid_st2; wire debug_rw_st2; wire[WORD_SIZE-1:0] debug_byteen_st2; wire[`REQS_BITS-1:0] debug_tid_st2; @@ -271,10 +271,9 @@ module VX_bank #( wire going_to_write_st1 [STAGE_1_CYCLES-1:0]; `DEBUG_END - integer j; always @(*) begin is_fill_in_pipe = 0; - for (j = 0; j < STAGE_1_CYCLES; j++) begin + for (integer j = 0; j < STAGE_1_CYCLES; j++) begin if (is_fill_st1[j]) begin is_fill_in_pipe = 1; end @@ -360,7 +359,7 @@ module VX_bank #( `ifdef DBG_CORE_REQ_INFO if (WORD_SIZE != `GLOBAL_BLOCK_SIZE) begin - assign {debug_pc_st0, debug_wb_st0, debug_rd_st0, debug_warp_num_st0, debug_tagid_st0, debug_rw_st0, debug_byteen_st0, debug_tid_st0} = qual_inst_meta_st0; + assign {debug_pc_st0, debug_wb_st0, debug_rd_st0, debug_wid_st0, debug_tagid_st0, debug_rw_st0, debug_byteen_st0, debug_tid_st0} = qual_inst_meta_st0; end `endif @@ -375,8 +374,7 @@ module VX_bank #( .out 
({is_mrvq_st1[0] , is_snp_st1[0], snp_invalidate_st1[0], going_to_write_st1[0], valid_st1[0], addr_st1[0], wsel_st1[0], writeword_st1[0], inst_meta_st1[0], is_fill_st1[0], writedata_st1[0]}) ); - genvar i; - for (i = 1; i < STAGE_1_CYCLES; i++) begin + for (genvar i = 1; i < STAGE_1_CYCLES; i++) begin VX_generic_register #( .N(1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `UP(`WORD_SELECT_WIDTH) + `WORD_WIDTH + `REQ_INST_META_WIDTH + 1 + `BANK_LINE_WIDTH) ) s0_1_cc ( @@ -446,13 +444,13 @@ module VX_bank #( .clk (clk), .reset (reset), -`ifdef DBG_CORE_REQ_INFO + `ifdef DBG_CORE_REQ_INFO .debug_pc_st1e(debug_pc_st1e), .debug_wb_st1e(debug_wb_st1e), .debug_rd_st1e(debug_rd_st1e), - .debug_warp_num_st1e(debug_warp_num_st1e), + .debug_wid_st1e(debug_wid_st1e), .debug_tagid_st1e(debug_tagid_st1e), -`endif + `endif .stall (stall_bank_pipe), .stall_bank_pipe(stall_bank_pipe), @@ -490,7 +488,7 @@ module VX_bank #( `ifdef DBG_CORE_REQ_INFO if (WORD_SIZE != `GLOBAL_BLOCK_SIZE) begin - assign {debug_pc_st1e, debug_wb_st1e, debug_rd_st1e, debug_warp_num_st1e, debug_tagid_st1e, debug_rw_st1e, debug_byteen_st1e, debug_tid_st1e} = inst_meta_st1[STAGE_1_CYCLES-1]; + assign {debug_pc_st1e, debug_wb_st1e, debug_rd_st1e, debug_wid_st1e, debug_tagid_st1e, debug_rw_st1e, debug_byteen_st1e, debug_tid_st1e} = inst_meta_st1[STAGE_1_CYCLES-1]; end `endif @@ -531,7 +529,7 @@ module VX_bank #( `ifdef DBG_CORE_REQ_INFO if (WORD_SIZE != `GLOBAL_BLOCK_SIZE) begin - assign {debug_pc_st2, debug_wb_st2, debug_rd_st2, debug_warp_num_st2, debug_tagid_st2, debug_rw_st2, debug_byteen_st2, debug_tid_st2} = inst_meta_st2; + assign {debug_pc_st2, debug_wb_st2, debug_rd_st2, debug_wid_st2, debug_tagid_st2, debug_rw_st2, debug_byteen_st2, debug_tid_st2} = inst_meta_st2; end `endif @@ -543,10 +541,10 @@ module VX_bank #( assign mrvq_push_stall = miss_add_unqual && mrvq_full; wire miss_add = miss_add_unqual - && !mrvq_full - && !(cwbq_push_stall - || dwbq_push_stall - || dram_fill_req_stall); + && !mrvq_full + && 
!(cwbq_push_stall + || dwbq_push_stall + || dram_fill_req_stall); assign recover_mrvq_state_st2 = miss_add_unqual && is_mrvq_st2; // Doesn't need to include the stalls @@ -718,7 +716,9 @@ module VX_bank #( always @(posedge clk) begin if (reset) begin dwbq_dual_valid_sel <= 0; - end else if (dwbq_is_dwb_out && dwbq_is_snp_out && (dram_wb_req_fire || snp_rsp_fire)) begin + end else if (dwbq_is_dwb_out + && dwbq_is_snp_out + && (dram_wb_req_fire || snp_rsp_fire)) begin dwbq_dual_valid_sel <= ~dwbq_dual_valid_sel; end end diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index 8e56a942..25fa81aa 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -132,12 +132,12 @@ module VX_cache #( wire[31:0] debug_core_req_use_pc; wire debug_core_req_wb; wire[`NR_BITS-1:0] debug_core_req_rd; - wire[`NW_BITS-1:0] debug_core_req_warp_num; + wire[`NW_BITS-1:0] debug_core_req_wid; wire[`UP(CORE_TAG_ID_BITS)-1:0] debug_core_req_idx; /* verilator lint_on UNUSED */ if (WORD_SIZE != `GLOBAL_BLOCK_SIZE) begin - assign {debug_core_req_use_pc, debug_core_req_wb, debug_core_req_rd, debug_core_req_warp_num, debug_core_req_idx} = core_req_tag[0]; + assign {debug_core_req_use_pc, debug_core_req_wb, debug_core_req_rd, debug_core_req_wid, debug_core_req_idx} = core_req_tag[0]; end `endif @@ -246,10 +246,8 @@ module VX_cache #( assign dram_req_tag = dram_req_addr; assign dram_rsp_ready = (| per_bank_dram_fill_rsp_ready); - - genvar i; - for (i = 0; i < NUM_BANKS; i++) begin + for (genvar i = 0; i < NUM_BANKS; i++) begin wire [NUM_REQUESTS-1:0] curr_bank_core_req_valid; wire [NUM_REQUESTS-1:0] curr_bank_core_req_rw; wire [NUM_REQUESTS-1:0][WORD_SIZE-1:0] curr_bank_core_req_byteen; diff --git a/hw/rtl/cache/VX_cache_core_req_bank_sel.v b/hw/rtl/cache/VX_cache_core_req_bank_sel.v index fa5372f3..f00984b2 100644 --- a/hw/rtl/cache/VX_cache_core_req_bank_sel.v +++ b/hw/rtl/cache/VX_cache_core_req_bank_sel.v @@ -18,12 +18,10 @@ module VX_cache_core_req_bank_sel #( output reg 
[NUM_BANKS-1:0][NUM_REQUESTS-1:0] per_bank_valid, output wire core_req_ready ); - integer i; - if (NUM_BANKS == 1) begin always @(*) begin per_bank_valid = 0; - for (i = 0; i < NUM_REQUESTS; i++) begin + for (integer i = 0; i < NUM_REQUESTS; i++) begin per_bank_valid[0][i] = core_req_valid[i]; end end @@ -33,7 +31,7 @@ module VX_cache_core_req_bank_sel #( always @(*) begin per_bank_valid = 0; per_bank_ready_sel = {NUM_BANKS{1'b1}}; - for (i = 0; i < NUM_REQUESTS; i++) begin + for (integer i = 0; i < NUM_REQUESTS; i++) begin per_bank_valid[core_req_addr[i][`BANK_SELECT_ADDR_RNG]][i] = core_req_valid[i]; per_bank_ready_sel[core_req_addr[i][`BANK_SELECT_ADDR_RNG]] = 0; end diff --git a/hw/rtl/cache/VX_cache_core_rsp_merge.v b/hw/rtl/cache/VX_cache_core_rsp_merge.v index 4a72e717..7cf8b1c8 100644 --- a/hw/rtl/cache/VX_cache_core_rsp_merge.v +++ b/hw/rtl/cache/VX_cache_core_rsp_merge.v @@ -48,14 +48,12 @@ module VX_cache_core_rsp_merge #( wire stall = ~core_rsp_ready && (| core_rsp_valid); - integer i; - if (CORE_TAG_ID_BITS != 0) begin always @(*) begin core_rsp_valid_unqual = 0; core_rsp_data_unqual = 0; core_rsp_tag_unqual = per_bank_core_rsp_tag[main_bank_index]; - for (i = 0; i < NUM_BANKS; i++) begin + for (integer i = 0; i < NUM_BANKS; i++) begin if (per_bank_core_rsp_valid[i] && (per_bank_core_rsp_tag[i][CORE_TAG_ID_BITS-1:0] == per_bank_core_rsp_tag[main_bank_index][CORE_TAG_ID_BITS-1:0])) begin core_rsp_valid_unqual[per_bank_core_rsp_tid[i]] = 1; @@ -71,7 +69,7 @@ module VX_cache_core_rsp_merge #( core_rsp_valid_unqual = 0; core_rsp_data_unqual = 0; core_rsp_tag_unqual = 0; - for (i = 0; i < NUM_BANKS; i++) begin + for (integer i = 0; i < NUM_BANKS; i++) begin if (per_bank_core_rsp_valid[i] && !core_rsp_valid_unqual[per_bank_core_rsp_tid[i]] && ((main_bank_index == `BANK_BITS'(i)) diff --git a/hw/rtl/cache/VX_cache_dram_req_arb.v b/hw/rtl/cache/VX_cache_dram_req_arb.v index 03989a28..f3117fd2 100644 --- a/hw/rtl/cache/VX_cache_dram_req_arb.v +++ 
b/hw/rtl/cache/VX_cache_dram_req_arb.v @@ -106,8 +106,7 @@ module VX_cache_dram_req_arb #( `UNUSED_PIN (grant_onehot) ); - genvar i; - for (i = 0; i < NUM_BANKS; i++) begin + for (genvar i = 0; i < NUM_BANKS; i++) begin assign per_bank_dram_wb_req_ready[i] = dram_req_ready && (dwb_bank == `BANK_BITS'(i)); end diff --git a/hw/rtl/cache/VX_cache_miss_resrv.v b/hw/rtl/cache/VX_cache_miss_resrv.v index f7bed651..6535837d 100644 --- a/hw/rtl/cache/VX_cache_miss_resrv.v +++ b/hw/rtl/cache/VX_cache_miss_resrv.v @@ -77,9 +77,8 @@ module VX_cache_miss_resrv #( reg [MRVQ_SIZE-1:0] make_ready; reg [MRVQ_SIZE-1:0] make_ready_push; reg [MRVQ_SIZE-1:0] valid_address_match; - - genvar i; - for (i = 0; i < MRVQ_SIZE; i++) begin + + for (genvar i = 0; i < MRVQ_SIZE; i++) begin assign valid_address_match[i] = valid_table[i] ? (addr_table[i] == fill_addr_st1) : 0; assign make_ready[i] = is_fill_st1 && valid_address_match[i]; end @@ -121,7 +120,6 @@ module VX_cache_miss_resrv #( head_ptr <= 0; tail_ptr <= 0; end else begin - if (mrvq_push) begin valid_table[enqueue_index] <= 1; ready_table[enqueue_index] <= mrvq_init_ready_state; @@ -157,11 +155,10 @@ module VX_cache_miss_resrv #( end `ifdef DBG_PRINT_CACHE_MSRQ - integer j; always @(posedge clk) begin if (mrvq_push || mrvq_pop || increment_head || recover_state) begin $write("%t: bank%0d:%0d msrq: push=%b pop=%b incr=%d recv=%d", $time, CACHE_ID, BANK_ID, mrvq_push, mrvq_pop, increment_head, recover_state); - for (j = 0; j < MRVQ_SIZE; j++) begin + for (integer j = 0; j < MRVQ_SIZE; j++) begin if (valid_table[j]) begin $write(" "); if (schedule_ptr == $bits(schedule_ptr)'(j)) $write("*"); diff --git a/hw/rtl/cache/VX_snp_forwarder.v b/hw/rtl/cache/VX_snp_forwarder.v index 11d3266e..7a4f2aec 100644 --- a/hw/rtl/cache/VX_snp_forwarder.v +++ b/hw/rtl/cache/VX_snp_forwarder.v @@ -83,9 +83,7 @@ module VX_snp_forwarder #( end end - genvar i; - - for (i = 0; i < NUM_REQUESTS; i++) begin + for (genvar i = 0; i < NUM_REQUESTS; i++) begin 
assign snp_fwdout_valid[i] = snp_req_valid && snp_req_ready; assign snp_fwdout_addr[i] = snp_req_addr; assign snp_fwdout_invalidate[i] = snp_req_invalidate; @@ -110,7 +108,7 @@ module VX_snp_forwarder #( assign fwdin_valid = snp_fwdin_valid[fwdin_sel]; assign fwdin_tag = snp_fwdin_tag[fwdin_sel]; - for (i = 0; i < NUM_REQUESTS; i++) begin + for (genvar i = 0; i < NUM_REQUESTS; i++) begin assign snp_fwdin_ready[i] = fwdin_ready && (fwdin_sel == `REQS_BITS'(i)); end diff --git a/hw/rtl/cache/VX_snp_rsp_arb.v b/hw/rtl/cache/VX_snp_rsp_arb.v index e9662c34..331b73a1 100644 --- a/hw/rtl/cache/VX_snp_rsp_arb.v +++ b/hw/rtl/cache/VX_snp_rsp_arb.v @@ -34,8 +34,7 @@ module VX_snp_rsp_arb #( assign snp_rsp_valid = fsq_valid; assign snp_rsp_tag = per_bank_snp_rsp_tag[fsq_bank]; - genvar i; - for (i = 0; i < NUM_BANKS; i++) begin + for (genvar i = 0; i < NUM_BANKS; i++) begin assign per_bank_snp_rsp_ready[i] = snp_rsp_ready && (fsq_bank == `BANK_BITS'(i)); end diff --git a/hw/rtl/cache/VX_tag_data_access.v b/hw/rtl/cache/VX_tag_data_access.v index 0b1748de..a03224fc 100644 --- a/hw/rtl/cache/VX_tag_data_access.v +++ b/hw/rtl/cache/VX_tag_data_access.v @@ -30,7 +30,7 @@ module VX_tag_data_access #( input wire[31:0] debug_pc_st1e, input wire debug_wb_st1e, input wire[`NR_BITS-1:0] debug_rd_st1e, - input wire[`NW_BITS-1:0] debug_warp_num_st1e, + input wire[`NW_BITS-1:0] debug_wid_st1e, input wire[`UP(CORE_TAG_ID_BITS)-1:0] debug_tagid_st1e, `IGNORE_WARNINGS_END `endif @@ -135,8 +135,7 @@ module VX_tag_data_access #( .out ({read_valid_st1c[0], read_dirty_st1c[0], read_dirtyb_st1c[0], read_tag_st1c[0], read_data_st1c[0]}) ); - genvar i; - for (i = 1; i < STAGE_1_CYCLES-1; i++) begin + for (genvar i = 1; i < STAGE_1_CYCLES-1; i++) begin VX_generic_register #( .N(1 + 1 + BANK_LINE_SIZE + `TAG_SELECT_BITS + `BANK_LINE_WIDTH) ) s0_1_cc ( @@ -157,11 +156,11 @@ module VX_tag_data_access #( if (`WORD_SELECT_WIDTH != 0) begin wire [`WORD_WIDTH-1:0] readword = 
use_read_data_st1e[wordsel_st1e * `WORD_WIDTH +: `WORD_WIDTH]; - for (i = 0; i < WORD_SIZE; i++) begin + for (genvar i = 0; i < WORD_SIZE; i++) begin assign readword_st1e[i * 8 +: 8] = readword[i * 8 +: 8] & {8{mem_byteen_st1e[i]}}; end end else begin - for (i = 0; i < WORD_SIZE; i++) begin + for (genvar i = 0; i < WORD_SIZE; i++) begin assign readword_st1e[i * 8 +: 8] = use_read_data_st1e[i * 8 +: 8] & {8{mem_byteen_st1e[i]}}; end end @@ -176,7 +175,7 @@ module VX_tag_data_access #( && ~is_snp_st1e && ~real_writefill; - for (i = 0; i < `BANK_LINE_WORDS; i++) begin + for (genvar i = 0; i < `BANK_LINE_WORDS; i++) begin wire normal_write = ((`WORD_SELECT_WIDTH == 0) || (wordsel_st1e == `UP(`WORD_SELECT_WIDTH)'(i))) && should_write; @@ -218,15 +217,15 @@ module VX_tag_data_access #( if (valid_req_st1e) begin if ((| use_write_enable)) begin if (writefill_st1e) begin - $display("%t: bank%0d:%0d store-fill: warp=%0d, PC=%0h, tag=%0h, wb=%b, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, data=%0h", $time, CACHE_ID, BANK_ID, debug_warp_num_st1e, debug_pc_st1e, debug_tagid_st1e, debug_wb_st1e, debug_rd_st1e, dirty_st1e, writeladdr_st1e, writetag_st1e, use_write_data); + $display("%t: bank%0d:%0d store-fill: wid=%0d, PC=%0h, tag=%0h, wb=%b, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, data=%0h", $time, CACHE_ID, BANK_ID, debug_wid_st1e, debug_pc_st1e, debug_tagid_st1e, debug_wb_st1e, debug_rd_st1e, dirty_st1e, writeladdr_st1e, writetag_st1e, use_write_data); end else begin - $display("%t: bank%0d:%0d store-write: warp=%0d, PC=%0h, tag=%0h, wb=%b, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, wsel=%0d, data=%0h", $time, CACHE_ID, BANK_ID, debug_warp_num_st1e, debug_pc_st1e, debug_tagid_st1e, debug_wb_st1e, debug_rd_st1e, dirty_st1e, writeladdr_st1e, writetag_st1e, wordsel_st1e, writeword_st1e); + $display("%t: bank%0d:%0d store-write: wid=%0d, PC=%0h, tag=%0h, wb=%b, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, wsel=%0d, data=%0h", $time, CACHE_ID, BANK_ID, debug_wid_st1e, 
debug_pc_st1e, debug_tagid_st1e, debug_wb_st1e, debug_rd_st1e, dirty_st1e, writeladdr_st1e, writetag_st1e, wordsel_st1e, writeword_st1e); end end else if (miss_st1e) begin - $display("%t: bank%0d:%0d store-miss: warp=%0d, PC=%0h, tag=%0h, wb=%b, rd=%0d, dirty=%b", $time, CACHE_ID, BANK_ID, debug_warp_num_st1e, debug_pc_st1e, debug_tagid_st1e, debug_wb_st1e, debug_rd_st1e, dirty_st1e); + $display("%t: bank%0d:%0d store-miss: wid=%0d, PC=%0h, tag=%0h, wb=%b, rd=%0d, dirty=%b", $time, CACHE_ID, BANK_ID, debug_wid_st1e, debug_pc_st1e, debug_tagid_st1e, debug_wb_st1e, debug_rd_st1e, dirty_st1e); end else begin - $display("%t: bank%0d:%0d store-read: warp=%0d, PC=%0h, tag=%0h, wb=%b, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, wsel=%0d, data=%0h", $time, CACHE_ID, BANK_ID, debug_warp_num_st1e, debug_pc_st1e, debug_tagid_st1e, debug_wb_st1e, debug_rd_st1e, dirty_st1e, readaddr_st10, qual_read_tag_st1, wordsel_st1e, qual_read_data_st1); + $display("%t: bank%0d:%0d store-read: wid=%0d, PC=%0h, tag=%0h, wb=%b, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, wsel=%0d, data=%0h", $time, CACHE_ID, BANK_ID, debug_wid_st1e, debug_pc_st1e, debug_tagid_st1e, debug_wb_st1e, debug_rd_st1e, dirty_st1e, readaddr_st10, qual_read_tag_st1, wordsel_st1e, qual_read_data_st1); end end end diff --git a/hw/rtl/fp_cores/VX_fp_fpga.v b/hw/rtl/fp_cores/VX_fp_fpga.v index fdce1d67..7634622f 100644 --- a/hw/rtl/fp_cores/VX_fp_fpga.v +++ b/hw/rtl/fp_cores/VX_fp_fpga.v @@ -41,8 +41,6 @@ module VX_fp_fpga ( reg [FPC_BITS-1:0] core_select; reg fmadd_negate; - genvar i; - always @(*) begin core_select = 0; fmadd_negate = 0; @@ -246,7 +244,7 @@ module VX_fp_fpga ( .valid_out (fp_valid) ); - for (i = 0; i < NUM_FPC; i++) begin + for (genvar i = 0; i < NUM_FPC; i++) begin assign per_core_ready_out[i] = ready_out && (i == fp_index); end diff --git a/hw/rtl/fp_cores/VX_fp_noncomp.v b/hw/rtl/fp_cores/VX_fp_noncomp.v index 4ae4c047..f83c691c 100644 --- a/hw/rtl/fp_cores/VX_fp_noncomp.v +++ 
b/hw/rtl/fp_cores/VX_fp_noncomp.v @@ -48,10 +48,8 @@ module VX_fp_noncomp ( reg [`NUM_THREADS-1:0][31:0] fcmp_res; // result of comparison reg [`NUM_THREADS-1:0][ 4:0] fcmp_excp; // exception of comparison - genvar i; - // Setup - for (i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < `NUM_THREADS; i++) begin assign a_sign[i] = dataa[i][31]; assign a_exponent[i] = dataa[i][30:23]; assign a_mantissa[i] = dataa[i][22:0]; @@ -77,7 +75,7 @@ module VX_fp_noncomp ( end // FCLASS - for (i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < `NUM_THREADS; i++) begin always @(*) begin if (a_type[i].is_normal) begin fclass_mask[i] = a_sign[i] ? NEG_NORM : POS_NORM; @@ -101,7 +99,7 @@ module VX_fp_noncomp ( end // Min/Max - for (i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < `NUM_THREADS; i++) begin always @(*) begin if (a_type[i].is_nan && b_type[i].is_nan) fminmax_res[i] = {1'b0, 8'hff, 1'b1, 22'd0}; // canonical qNaN @@ -120,7 +118,7 @@ module VX_fp_noncomp ( end // Sign Injection - for (i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < `NUM_THREADS; i++) begin always @(*) begin case (op) `FPU_SGNJ: fsgnj_res[i] = { b_sign[i], a_exponent[i], a_mantissa[i]}; @@ -132,7 +130,7 @@ module VX_fp_noncomp ( end // Comparison - for (i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < `NUM_THREADS; i++) begin always @(*) begin case (frm) `FRM_RNE: begin @@ -193,7 +191,7 @@ module VX_fp_noncomp ( endcase end - for (i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < `NUM_THREADS; i++) begin always @(*) begin tmp_valid = 1'b1; case (op) diff --git a/hw/rtl/fp_cores/VX_fpnew.v b/hw/rtl/fp_cores/VX_fpnew.v index bcd27376..d90f652e 100644 --- a/hw/rtl/fp_cores/VX_fpnew.v +++ b/hw/rtl/fp_cores/VX_fpnew.v @@ -129,11 +129,9 @@ module VX_fpnew #( endcase end - genvar i; - `DISABLE_TRACING - for (i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < `NUM_THREADS; i++) begin if (0 == i) begin fpnew_top #( .Features 
(FPU_FEATURES), @@ -194,8 +192,7 @@ module VX_fpnew #( `ENABLE_TRACING assign fpu_valid_in = valid_in; - assign ready_in = fpu_ready_in - || ~valid_in; // fix + assign ready_in = fpu_ready_in; assign fpu_tag_in = tag_in; assign tag_out = fpu_tag_out; diff --git a/hw/rtl/fp_cores/altera/VX_fp_add.v b/hw/rtl/fp_cores/altera/VX_fp_add.v index e055adfa..c7c39506 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_add.v +++ b/hw/rtl/fp_cores/altera/VX_fp_add.v @@ -22,9 +22,7 @@ module VX_fp_add ( wire enable = ~stall; assign ready_in = enable; - genvar i; - - for (i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < `NUM_THREADS; i++) begin twentynm_fp_mac mac_fp_wys ( // inputs .accumulate(), diff --git a/hw/rtl/fp_cores/altera/VX_fp_div.v b/hw/rtl/fp_cores/altera/VX_fp_div.v index a0db0790..54fe7e57 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_div.v +++ b/hw/rtl/fp_cores/altera/VX_fp_div.v @@ -22,9 +22,7 @@ module VX_fp_div ( wire enable = ~stall; assign ready_in = enable; - genvar i; - - for (i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < `NUM_THREADS; i++) begin acl_fp_div fdiv ( .clk (clk), .areset (1'b0), diff --git a/hw/rtl/fp_cores/altera/VX_fp_ftoi.v b/hw/rtl/fp_cores/altera/VX_fp_ftoi.v index 3036410a..a7ba66ae 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_ftoi.v +++ b/hw/rtl/fp_cores/altera/VX_fp_ftoi.v @@ -21,9 +21,7 @@ module VX_fp_ftoi ( wire enable = ~stall; assign ready_in = enable; - genvar i; - - for (i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < `NUM_THREADS; i++) begin acl_fp_ftoi ftoi ( .clk (clk), .areset (1'b0), diff --git a/hw/rtl/fp_cores/altera/VX_fp_ftou.v b/hw/rtl/fp_cores/altera/VX_fp_ftou.v index 461d45df..a0912f12 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_ftou.v +++ b/hw/rtl/fp_cores/altera/VX_fp_ftou.v @@ -21,9 +21,7 @@ module VX_fp_ftou ( wire enable = ~stall; assign ready_in = enable; - genvar i; - - for (i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < `NUM_THREADS; i++) begin acl_fp_ftou ftou ( .clk 
(clk), .areset (1'b0), diff --git a/hw/rtl/fp_cores/altera/VX_fp_itof.v b/hw/rtl/fp_cores/altera/VX_fp_itof.v index d67749ad..c95ede12 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_itof.v +++ b/hw/rtl/fp_cores/altera/VX_fp_itof.v @@ -21,9 +21,7 @@ module VX_fp_itof ( wire enable = ~stall; assign ready_in = enable; - genvar i; - - for (i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < `NUM_THREADS; i++) begin acl_fp_itof itof ( .clk (clk), .areset (1'b0), diff --git a/hw/rtl/fp_cores/altera/VX_fp_madd.v b/hw/rtl/fp_cores/altera/VX_fp_madd.v index c7939cd7..58b410d1 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_madd.v +++ b/hw/rtl/fp_cores/altera/VX_fp_madd.v @@ -28,9 +28,7 @@ module VX_fp_madd ( wire [`ISTAG_BITS-1:0] out_tag_st0, out_tag_st1; wire in_valid_st0, out_valid_st0, out_valid_st1; - genvar i; - - for (i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < `NUM_THREADS; i++) begin twentynm_fp_mac mac_fp_wys0 ( // inputs .accumulate(), diff --git a/hw/rtl/fp_cores/altera/VX_fp_msub.v b/hw/rtl/fp_cores/altera/VX_fp_msub.v index 211c1b34..62fb99b9 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_msub.v +++ b/hw/rtl/fp_cores/altera/VX_fp_msub.v @@ -28,9 +28,7 @@ module VX_fp_msub ( wire [`ISTAG_BITS-1:0] out_tag_st0, out_tag_st1; wire in_valid_st0, out_valid_st0, out_valid_st1; - genvar i; - - for (i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < `NUM_THREADS; i++) begin twentynm_fp_mac mac_fp_wys0 ( // inputs .accumulate(), diff --git a/hw/rtl/fp_cores/altera/VX_fp_mul.v b/hw/rtl/fp_cores/altera/VX_fp_mul.v index 56633586..8be10473 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_mul.v +++ b/hw/rtl/fp_cores/altera/VX_fp_mul.v @@ -22,9 +22,7 @@ module VX_fp_mul ( wire enable = ~stall; assign ready_in = enable; - genvar i; - - for (i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < `NUM_THREADS; i++) begin twentynm_fp_mac mac_fp_wys ( // inputs .accumulate(), diff --git a/hw/rtl/fp_cores/altera/VX_fp_sqrt.v b/hw/rtl/fp_cores/altera/VX_fp_sqrt.v 
index 22649771..511b7512 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_sqrt.v +++ b/hw/rtl/fp_cores/altera/VX_fp_sqrt.v @@ -21,9 +21,7 @@ module VX_fp_sqrt ( wire enable = ~stall; assign ready_in = enable; - genvar i; - - for (i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < `NUM_THREADS; i++) begin acl_fp_sqrt fsqrt ( .clk (clk), .areset (1'b0), diff --git a/hw/rtl/fp_cores/altera/VX_fp_sub.v b/hw/rtl/fp_cores/altera/VX_fp_sub.v index f88567da..574eac6a 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_sub.v +++ b/hw/rtl/fp_cores/altera/VX_fp_sub.v @@ -22,9 +22,7 @@ module VX_fp_sub ( wire enable = ~stall; assign ready_in = enable; - genvar i; - - for (i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < `NUM_THREADS; i++) begin twentynm_fp_mac mac_fp_wys ( // inputs .accumulate(), diff --git a/hw/rtl/fp_cores/altera/VX_fp_utof.v b/hw/rtl/fp_cores/altera/VX_fp_utof.v index 601c0634..2fb253fc 100644 --- a/hw/rtl/fp_cores/altera/VX_fp_utof.v +++ b/hw/rtl/fp_cores/altera/VX_fp_utof.v @@ -21,9 +21,7 @@ module VX_fp_utof ( wire enable = ~stall; assign ready_in = enable; - genvar i; - - for (i = 0; i < `NUM_THREADS; i++) begin + for (genvar i = 0; i < `NUM_THREADS; i++) begin acl_fp_utof utof ( .clk (clk), .areset (1'b0), diff --git a/hw/rtl/interfaces/VX_alu_req_if.v b/hw/rtl/interfaces/VX_alu_req_if.v index 901a2754..13ec04f7 100644 --- a/hw/rtl/interfaces/VX_alu_req_if.v +++ b/hw/rtl/interfaces/VX_alu_req_if.v @@ -7,18 +7,22 @@ interface VX_alu_req_if (); wire valid; wire [`ISTAG_BITS-1:0] issue_tag; +`DEBUG_BEGIN + wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] thread_mask; - wire [`NW_BITS-1:0] warp_num; +`DEBUG_END wire [31:0] curr_PC; - wire [`ALU_BITS-1:0] alu_op; + wire [`ALU_BITS-1:0] op; + + wire rs1_is_PC; + wire rs2_is_imm; + + wire [31:0] imm; wire [`NUM_THREADS-1:0][31:0] rs1_data; wire [`NUM_THREADS-1:0][31:0] rs2_data; - wire [31:0] offset; - wire [31:0] next_PC; - wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_branch_ctl_if.v 
b/hw/rtl/interfaces/VX_branch_ctl_if.v index 325b2671..5e5e840a 100644 --- a/hw/rtl/interfaces/VX_branch_ctl_if.v +++ b/hw/rtl/interfaces/VX_branch_ctl_if.v @@ -6,7 +6,7 @@ interface VX_branch_ctl_if (); wire valid; - wire [`NW_BITS-1:0] warp_num; + wire [`NW_BITS-1:0] wid; wire taken; wire [31:0] dest; diff --git a/hw/rtl/interfaces/VX_bru_req_if.v b/hw/rtl/interfaces/VX_bru_req_if.v new file mode 100644 index 00000000..17e125ba --- /dev/null +++ b/hw/rtl/interfaces/VX_bru_req_if.v @@ -0,0 +1,29 @@ +`ifndef VX_BRANCH_REQ_IF +`define VX_BRANCH_REQ_IF + +`include "VX_define.vh" + +interface VX_bru_req_if (); + + wire valid; + wire [`ISTAG_BITS-1:0] issue_tag; + wire [`NW_BITS-1:0] wid; +`DEBUG_BEGIN + wire [`NUM_THREADS-1:0] thread_mask; +`DEBUG_END + wire [31:0] curr_PC; + + wire [`BRU_BITS-1:0] op; + + wire rs1_is_PC; + + wire [31:0] rs1_data; + wire [31:0] rs2_data; + + wire [31:0] offset; + + wire ready; + +endinterface + +`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_cmt_to_csr_if.v b/hw/rtl/interfaces/VX_cmt_to_csr_if.v index c35e3ecd..5cb33922 100644 --- a/hw/rtl/interfaces/VX_cmt_to_csr_if.v +++ b/hw/rtl/interfaces/VX_cmt_to_csr_if.v @@ -7,7 +7,7 @@ interface VX_cmt_to_csr_if (); wire valid; - wire [`NW_BITS-1:0] warp_num; + wire [`NW_BITS-1:0] wid; wire [`NE_BITS:0] num_commits; diff --git a/hw/rtl/interfaces/VX_cmt_to_issue_if.v b/hw/rtl/interfaces/VX_cmt_to_issue_if.v index e083a3e5..4065ba0c 100644 --- a/hw/rtl/interfaces/VX_cmt_to_issue_if.v +++ b/hw/rtl/interfaces/VX_cmt_to_issue_if.v @@ -6,6 +6,7 @@ interface VX_cmt_to_issue_if (); wire alu_valid; + wire bru_valid; wire lsu_valid; wire csr_valid; wire mul_valid; @@ -13,6 +14,7 @@ interface VX_cmt_to_issue_if (); wire gpu_valid; wire [`ISTAG_BITS-1:0] alu_tag; + wire [`ISTAG_BITS-1:0] bru_tag; wire [`ISTAG_BITS-1:0] lsu_tag; wire [`ISTAG_BITS-1:0] csr_tag; wire [`ISTAG_BITS-1:0] mul_tag; @@ -21,6 +23,7 @@ interface VX_cmt_to_issue_if (); `IGNORE_WARNINGS_BEGIN issue_data_t 
alu_data; + issue_data_t bru_data; issue_data_t lsu_data; issue_data_t csr_data; issue_data_t mul_data; diff --git a/hw/rtl/interfaces/VX_csr_req_if.v b/hw/rtl/interfaces/VX_csr_req_if.v index 1a727c7f..c8de21be 100644 --- a/hw/rtl/interfaces/VX_csr_req_if.v +++ b/hw/rtl/interfaces/VX_csr_req_if.v @@ -7,13 +7,13 @@ interface VX_csr_req_if (); wire valid; wire [`ISTAG_BITS-1:0] issue_tag; + wire [`NW_BITS-1:0] wid; `DEBUG_BEGIN wire [`NUM_THREADS-1:0] thread_mask; `DEBUG_END - wire [`NW_BITS-1:0] warp_num; wire [31:0] curr_PC; - wire [`CSR_BITS-1:0] csr_op; + wire [`CSR_BITS-1:0] op; wire [`CSR_ADDR_BITS-1:0] csr_addr; wire [31:0] csr_mask; diff --git a/hw/rtl/interfaces/VX_csr_rsp_if.v b/hw/rtl/interfaces/VX_csr_rsp_if.v new file mode 100644 index 00000000..9e141783 --- /dev/null +++ b/hw/rtl/interfaces/VX_csr_rsp_if.v @@ -0,0 +1,15 @@ +`ifndef VX_CSR_RSP_IF +`define VX_CSR_RSP_IF + +`include "VX_define.vh" + +interface VX_csr_rsp_if (); + + wire valid; + wire [`ISTAG_BITS-1:0] issue_tag; + wire [`NUM_THREADS-1:0][31:0] data; + wire ready; + +endinterface + +`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_csr_to_fpu_if.v b/hw/rtl/interfaces/VX_csr_to_fpu_if.v index c6e3b3bd..2b1aac5a 100644 --- a/hw/rtl/interfaces/VX_csr_to_fpu_if.v +++ b/hw/rtl/interfaces/VX_csr_to_fpu_if.v @@ -9,7 +9,7 @@ interface VX_csr_to_fpu_if (); - wire [`NW_BITS-1:0] warp_num; + wire [`NW_BITS-1:0] wid; wire [`FRM_BITS-1:0] frm; endinterface diff --git a/hw/rtl/interfaces/VX_decode_if.v b/hw/rtl/interfaces/VX_decode_if.v index 1fd5b96e..4f82f7a5 100644 --- a/hw/rtl/interfaces/VX_decode_if.v +++ b/hw/rtl/interfaces/VX_decode_if.v @@ -6,10 +6,9 @@ interface VX_decode_if (); wire valid; - wire [`NW_BITS-1:0] warp_num; + wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] thread_mask; wire [31:0] curr_PC; - wire [31:0] next_PC; wire [`EX_BITS-1:0] ex_type; wire [`OP_BITS-1:0] ex_op; diff --git a/hw/rtl/interfaces/VX_exu_to_cmt_if.v b/hw/rtl/interfaces/VX_exu_to_cmt_if.v index 
df5d3c14..85ec1074 100644 --- a/hw/rtl/interfaces/VX_exu_to_cmt_if.v +++ b/hw/rtl/interfaces/VX_exu_to_cmt_if.v @@ -5,10 +5,9 @@ interface VX_exu_to_cmt_if (); - wire valid; - wire [`ISTAG_BITS-1:0] issue_tag; - wire [`NUM_THREADS-1:0][31:0] data; - wire ready; + wire valid; + wire [`ISTAG_BITS-1:0] issue_tag; + wire [`NUM_THREADS-1:0][31:0] data; endinterface diff --git a/hw/rtl/interfaces/VX_fpu_req_if.v b/hw/rtl/interfaces/VX_fpu_req_if.v index 2c1b299d..c920710d 100644 --- a/hw/rtl/interfaces/VX_fpu_req_if.v +++ b/hw/rtl/interfaces/VX_fpu_req_if.v @@ -11,13 +11,13 @@ interface VX_fpu_req_if (); wire valid; wire [`ISTAG_BITS-1:0] issue_tag; + wire [`NW_BITS-1:0] wid; `DEBUG_BEGIN wire [`NUM_THREADS-1:0] thread_mask; -`DEBUG_END - wire [`NW_BITS-1:0] warp_num; wire [31:0] curr_PC; +`DEBUG_END - wire [`FPU_BITS-1:0] fpu_op; + wire [`FPU_BITS-1:0] op; wire [`FRM_BITS-1:0] frm; wire [`NUM_THREADS-1:0][31:0] rs1_data; diff --git a/hw/rtl/interfaces/VX_fpu_to_cmt_if.v b/hw/rtl/interfaces/VX_fpu_to_cmt_if.v index beb44a5a..b4a2e015 100644 --- a/hw/rtl/interfaces/VX_fpu_to_cmt_if.v +++ b/hw/rtl/interfaces/VX_fpu_to_cmt_if.v @@ -10,7 +10,6 @@ interface VX_fpu_to_cmt_if (); wire [`NUM_THREADS-1:0][31:0] data; wire has_fflags; fflags_t [`NUM_THREADS-1:0] fflags; - wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_fpu_to_csr_if.v b/hw/rtl/interfaces/VX_fpu_to_csr_if.v index 71e1e7b9..1dfae0ac 100644 --- a/hw/rtl/interfaces/VX_fpu_to_csr_if.v +++ b/hw/rtl/interfaces/VX_fpu_to_csr_if.v @@ -11,7 +11,7 @@ interface VX_fpu_to_csr_if (); wire valid; - wire [`NW_BITS-1:0] warp_num; + wire [`NW_BITS-1:0] wid; wire fflags_NV; wire fflags_DZ; diff --git a/hw/rtl/interfaces/VX_gpr_read_if.v b/hw/rtl/interfaces/VX_gpr_read_if.v index 9b24ce56..4e444ba2 100644 --- a/hw/rtl/interfaces/VX_gpr_read_if.v +++ b/hw/rtl/interfaces/VX_gpr_read_if.v @@ -7,7 +7,7 @@ interface VX_gpr_read_if (); wire valid; - wire [`NW_BITS-1:0] warp_num; + wire [`NW_BITS-1:0] wid; wire [`NR_BITS-1:0] 
rs1; wire [`NR_BITS-1:0] rs2; diff --git a/hw/rtl/interfaces/VX_gpu_req_if.v b/hw/rtl/interfaces/VX_gpu_req_if.v index 604f5903..81661138 100644 --- a/hw/rtl/interfaces/VX_gpu_req_if.v +++ b/hw/rtl/interfaces/VX_gpu_req_if.v @@ -6,17 +6,15 @@ interface VX_gpu_req_if(); wire valid; - wire [`ISTAG_BITS-1:0] issue_tag; + wire [`ISTAG_BITS-1:0] issue_tag; + wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] thread_mask; - wire [`NW_BITS-1:0] warp_num; -`DEBUG_BEGIN wire [31:0] curr_PC; -`DEBUG_END - wire [`GPU_BITS-1:0] gpu_op; + + wire [`GPU_BITS-1:0] op; wire [`NUM_THREADS-1:0][31:0] rs1_data; wire [31:0] rs2_data; - wire [31:0] next_PC; wire ready; diff --git a/hw/rtl/interfaces/VX_ifetch_req_if.v b/hw/rtl/interfaces/VX_ifetch_req_if.v index c92888eb..a2469cb3 100644 --- a/hw/rtl/interfaces/VX_ifetch_req_if.v +++ b/hw/rtl/interfaces/VX_ifetch_req_if.v @@ -7,7 +7,7 @@ interface VX_ifetch_req_if (); wire valid; wire [`NUM_THREADS-1:0] thread_mask; - wire [`NW_BITS-1:0] warp_num; + wire [`NW_BITS-1:0] wid; wire [31:0] curr_PC; wire ready; diff --git a/hw/rtl/interfaces/VX_ifetch_rsp_if.v b/hw/rtl/interfaces/VX_ifetch_rsp_if.v index b5efc4fc..f9918a03 100644 --- a/hw/rtl/interfaces/VX_ifetch_rsp_if.v +++ b/hw/rtl/interfaces/VX_ifetch_rsp_if.v @@ -7,7 +7,7 @@ interface VX_ifetch_rsp_if (); wire valid; wire [`NUM_THREADS-1:0] thread_mask; - wire [`NW_BITS-1:0] warp_num; + wire [`NW_BITS-1:0] wid; wire [31:0] curr_PC; wire [31:0] instr; wire ready; diff --git a/hw/rtl/interfaces/VX_issue_if.v b/hw/rtl/interfaces/VX_issue_if.v new file mode 100644 index 00000000..21e9b658 --- /dev/null +++ b/hw/rtl/interfaces/VX_issue_if.v @@ -0,0 +1,39 @@ +`ifndef VX_ISSUE_IF +`define VX_ISSUE_IF + +`include "VX_define.vh" + +interface VX_issue_if (); + + wire valid; + wire [`ISTAG_BITS-1:0] issue_tag; + wire [`NW_BITS-1:0] wid; + wire [`NUM_THREADS-1:0] thread_mask; + wire [31:0] curr_PC; + + wire [`EX_BITS-1:0] ex_type; + wire [`OP_BITS-1:0] ex_op; + + wire [`FRM_BITS-1:0] frm; + + wire 
wb; + + wire [`NR_BITS-1:0] rd; + + wire [`NUM_THREADS-1:0][31:0] rs1_data; + wire [`NUM_THREADS-1:0][31:0] rs2_data; + wire [`NUM_THREADS-1:0][31:0] rs3_data; + + wire [`NR_BITS-1:0] rs1; + wire [31:0] imm; + + wire rs1_is_PC; + wire rs2_is_imm; + + wire [`NT_BITS-1:0] tid; + + wire ready; + +endinterface + +`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_join_if.v b/hw/rtl/interfaces/VX_join_if.v index 15c1509b..6e96ad31 100644 --- a/hw/rtl/interfaces/VX_join_if.v +++ b/hw/rtl/interfaces/VX_join_if.v @@ -6,7 +6,7 @@ interface VX_join_if (); wire is_join; - wire [`NW_BITS-1:0] warp_num; + wire [`NW_BITS-1:0] wid; endinterface diff --git a/hw/rtl/interfaces/VX_lsu_req_if.v b/hw/rtl/interfaces/VX_lsu_req_if.v index b4b80598..1e0ab4fb 100644 --- a/hw/rtl/interfaces/VX_lsu_req_if.v +++ b/hw/rtl/interfaces/VX_lsu_req_if.v @@ -8,7 +8,7 @@ interface VX_lsu_req_if (); wire valid; wire [`NUM_THREADS-1:0] thread_mask; wire [`ISTAG_BITS-1:0] issue_tag; - wire [`NW_BITS-1:0] warp_num; + wire [`NW_BITS-1:0] wid; wire [31:0] curr_PC; wire rw; diff --git a/hw/rtl/interfaces/VX_mul_req_if.v b/hw/rtl/interfaces/VX_mul_req_if.v index 44306bde..6cd6432f 100644 --- a/hw/rtl/interfaces/VX_mul_req_if.v +++ b/hw/rtl/interfaces/VX_mul_req_if.v @@ -12,11 +12,11 @@ interface VX_mul_req_if (); wire valid; wire [`ISTAG_BITS-1:0] issue_tag; `DEBUG_BEGIN + wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] thread_mask; - wire [`NW_BITS-1:0] warp_num; wire [31:0] curr_PC; `DEBUG_END - wire [`MUL_BITS-1:0] mul_op; + wire [`MUL_BITS-1:0] op; wire [`NUM_THREADS-1:0][31:0] rs1_data; wire [`NUM_THREADS-1:0][31:0] rs2_data; diff --git a/hw/rtl/interfaces/VX_warp_ctl_if.v b/hw/rtl/interfaces/VX_warp_ctl_if.v index 50eca6b0..1f2e422a 100644 --- a/hw/rtl/interfaces/VX_warp_ctl_if.v +++ b/hw/rtl/interfaces/VX_warp_ctl_if.v @@ -5,24 +5,12 @@ interface VX_warp_ctl_if (); - wire [`NW_BITS-1:0] warp_num; + wire [`NW_BITS-1:0] wid; - wire change_mask; - wire [`NUM_THREADS-1:0] thread_mask; 
- - wire wspawn; - wire [31:0] wspawn_pc; - wire [`NUM_WARPS-1:0] wspawn_wmask; - - wire is_barrier; - wire [`NB_BITS-1:0] barrier_id; - wire [`NW_BITS:0] barrier_num_warps; - - wire is_split; - wire do_split; - wire [`NUM_THREADS-1:0] split_new_mask; - wire [`NUM_THREADS-1:0] split_later_mask; - wire [31:0] split_save_pc; + gpu_tmc_t tmc; + gpu_wspawn_t wspawn; + gpu_barrier_t barrier; + gpu_split_t split; endinterface diff --git a/hw/rtl/interfaces/VX_wb_if.v b/hw/rtl/interfaces/VX_wb_if.v index 859db75b..062377ec 100644 --- a/hw/rtl/interfaces/VX_wb_if.v +++ b/hw/rtl/interfaces/VX_wb_if.v @@ -7,16 +7,14 @@ interface VX_wb_if (); wire valid; wire [`NUM_THREADS-1:0] thread_mask; - wire [`NW_BITS-1:0] warp_num; + wire [`NW_BITS-1:0] wid; `IGNORE_WARNINGS_BEGIN wire [31:0] curr_PC; `IGNORE_WARNINGS_END wire [`NR_BITS-1:0] rd; - wire [`NUM_THREADS-1:0][31:0] data; - - wire ready; + wire [`NUM_THREADS-1:0][31:0] data; endinterface diff --git a/hw/rtl/interfaces/VX_wstall_if.v b/hw/rtl/interfaces/VX_wstall_if.v index d2bef0d8..5b4e5039 100644 --- a/hw/rtl/interfaces/VX_wstall_if.v +++ b/hw/rtl/interfaces/VX_wstall_if.v @@ -6,7 +6,7 @@ interface VX_wstall_if(); wire wstall; - wire [`NW_BITS-1:0] warp_num; + wire [`NW_BITS-1:0] wid; endinterface diff --git a/hw/rtl/libs/VX_cam_buffer.v b/hw/rtl/libs/VX_cam_buffer.v index 06379f15..4f12ee36 100644 --- a/hw/rtl/libs/VX_cam_buffer.v +++ b/hw/rtl/libs/VX_cam_buffer.v @@ -32,19 +32,17 @@ module VX_cam_buffer #( .valid_out (free_valid) ); - integer i; - always @(*) begin free_slots_n = free_slots; - if (acquire_slot) begin - free_slots_n[write_addr_r] = 0; - end - for (i = 0; i < RPORTS; i++) begin + for (integer i = 0; i < RPORTS; i++) begin if (release_slot[i]) begin free_slots_n[read_addr[i]] = 1; end read_data[i] = entries[read_addr[i]]; - end + end + if (acquire_slot) begin + free_slots_n[write_addr_r] = 0; + end end always @(posedge clk) begin @@ -54,12 +52,12 @@ module VX_cam_buffer #( write_addr_r <= ADDRW'(1'b0); end 
else begin if (acquire_slot) begin - assert(1 == free_slots[write_addr]); + assert(1 == free_slots[write_addr]) else $display("%t: inused slot at port %d", $time, write_addr); entries[write_addr] <= write_data; end - for (i = 0; i < RPORTS; i++) begin + for (integer i = 0; i < RPORTS; i++) begin if (release_slot[i]) begin - assert(0 == free_slots[read_addr[i]]); + assert(0 == free_slots[read_addr[i]]) else $display("%t: freed slot at port %d", $time, read_addr[i]); end end free_slots <= free_slots_n; diff --git a/hw/rtl/libs/VX_countones.v b/hw/rtl/libs/VX_countones.v index 6160d0df..2789014f 100644 --- a/hw/rtl/libs/VX_countones.v +++ b/hw/rtl/libs/VX_countones.v @@ -7,11 +7,9 @@ module VX_countones #( input wire [N-1:0] valids, output reg [$clog2(N):0] count ); - - integer i; always @(*) begin count = 0; - for (i = N-1; i >= 0; i = i - 1) begin + for (integer i = N-1; i >= 0; i = i - 1) begin if (valids[i]) begin count = count + 1; end diff --git a/hw/rtl/libs/VX_divide.v b/hw/rtl/libs/VX_divide.v index be13a186..9cdf6848 100644 --- a/hw/rtl/libs/VX_divide.v +++ b/hw/rtl/libs/VX_divide.v @@ -52,15 +52,6 @@ module VX_divide #( reg [WIDTHD-1:0] remainder_unqual; always @(*) begin - `ifndef SYNTHESIS - // this edge case kills verilator in some cases by causing a division - // overflow exception. 
INT_MIN / -1 (on x86) - if (numer == {1'b1, (WIDTHN-1)'(1'b0)} - && denom == {WIDTHD{1'b1}}) begin - quotient_unqual = 0; - remainder_unqual = 0; - end else - `endif begin if (NSIGNED && DSIGNED) begin quotient_unqual = $signed(numer) / $signed(denom); @@ -88,21 +79,21 @@ module VX_divide #( reg [WIDTHN-1:0] quotient_pipe [0:PIPELINE-1]; reg [WIDTHD-1:0] remainder_pipe [0:PIPELINE-1]; - genvar i; - for (i = 0; i < PIPELINE; i++) begin + for (genvar i = 0; i < PIPELINE; i++) begin always @(posedge clk) begin if (reset) begin quotient_pipe[i] <= 0; remainder_pipe[i] <= 0; - end - else if (clk_en) begin - if (i == 0) begin - quotient_pipe[i] <= quotient_unqual; - remainder_pipe[i] <= remainder_unqual; - end else begin - quotient_pipe[i] <= quotient_pipe[i-1]; - remainder_pipe[i] <= remainder_pipe[i-1]; - end + end else begin + if (clk_en) begin + if (i == 0) begin + quotient_pipe[i] <= quotient_unqual; + remainder_pipe[i] <= remainder_unqual; + end else begin + quotient_pipe[i] <= quotient_pipe[i-1]; + remainder_pipe[i] <= remainder_pipe[i-1]; + end + end end end end diff --git a/hw/rtl/libs/VX_elastic_buffer.v b/hw/rtl/libs/VX_elastic_buffer.v index c551fbc6..fbd36b36 100644 --- a/hw/rtl/libs/VX_elastic_buffer.v +++ b/hw/rtl/libs/VX_elastic_buffer.v @@ -14,25 +14,53 @@ module VX_elastic_buffer #( input wire ready_out, output wire valid_out ); - wire empty, full; + if (0 == SIZE) begin - VX_generic_queue #( - .DATAW (DATAW), - .SIZE (SIZE), - .BUFFERED (BUFFERED) - ) queue ( - .clk (clk), - .reset (reset), - .push (valid_in), - .pop (ready_out), - .data_in(data_in), - .data_out(data_out), - .empty (empty), - .full (full), - `UNUSED_PIN (size) - ); + reg [DATAW-1:0] skid_buffer; + reg skid_valid; - assign ready_in = ~full; - assign valid_out = ~empty; + always @(posedge clk) begin + if (reset) begin + skid_valid <= 0; + end else begin + if (valid_in && ~ready_out) begin + assert(~skid_valid); + skid_buffer <= data_in; + skid_valid <= 1; + end + if (ready_out) begin + 
skid_valid <= 0; + end + end + end + + assign ready_in = ready_out || ~skid_valid; + assign data_out = skid_valid ? skid_buffer : data_in; + assign valid_out = valid_in || skid_valid; + + end else begin + + wire empty, full; + + VX_generic_queue #( + .DATAW (DATAW), + .SIZE (SIZE), + .BUFFERED (BUFFERED) + ) queue ( + .clk (clk), + .reset (reset), + .push (valid_in), + .pop (ready_out), + .data_in(data_in), + .data_out(data_out), + .empty (empty), + .full (full), + `UNUSED_PIN (size) + ); + + assign ready_in = ~full; + assign valid_out = ~empty; + + end endmodule \ No newline at end of file diff --git a/hw/rtl/libs/VX_fair_arbiter.v b/hw/rtl/libs/VX_fair_arbiter.v index 2e07625b..8f9cbbff 100644 --- a/hw/rtl/libs/VX_fair_arbiter.v +++ b/hw/rtl/libs/VX_fair_arbiter.v @@ -11,7 +11,7 @@ module VX_fair_arbiter #( output wire grant_valid ); - if (N == 1) begin + if (N == 1) begin `UNUSED_VAR (clk) `UNUSED_VAR (reset) @@ -33,11 +33,13 @@ module VX_fair_arbiter #( if (reset) begin requests_use <= 0; refill_original <= 0; - end else if (refill) begin - requests_use <= refill_value; - refill_original <= refill_value; end else begin - requests_use <= update_value; + if (refill) begin + requests_use <= refill_value; + refill_original <= refill_value; + end else begin + requests_use <= update_value; + end end end diff --git a/hw/rtl/libs/VX_generic_queue.v b/hw/rtl/libs/VX_generic_queue.v index 5ee61e6e..1383ebcf 100644 --- a/hw/rtl/libs/VX_generic_queue.v +++ b/hw/rtl/libs/VX_generic_queue.v @@ -38,7 +38,6 @@ module VX_generic_queue #( end else if (reading && !writing) begin size_r <= 0; end - if (writing) begin head_r <= data_in; end @@ -146,7 +145,7 @@ module VX_generic_queue #( end bypass_r <= writing - && (empty_r || ((1 == size_r) && reading)); // empty or about to go empty + && (empty_r || ((1 == size_r) && reading)); // empty or about to go empty curr_r <= data_in; head_r <= data[reading ? 
rd_ptr_next_r : rd_ptr_r]; diff --git a/hw/rtl/libs/VX_matrix_arbiter.v b/hw/rtl/libs/VX_matrix_arbiter.v index 62414abd..7ccda7dc 100644 --- a/hw/rtl/libs/VX_matrix_arbiter.v +++ b/hw/rtl/libs/VX_matrix_arbiter.v @@ -24,11 +24,9 @@ module VX_matrix_arbiter #( reg [N-1:1] state [0:N-1]; wire [N-1:0] pri [0:N-1]; - - genvar i, j; - - for (i = 0; i < N; i++) begin - for (j = 0; j < N; j++) begin + + for (genvar i = 0; i < N; i++) begin + for (genvar j = 0; j < N; j++) begin if (j > i) begin assign pri[j][i] = requests[i] && state[i][j]; end @@ -43,13 +41,12 @@ module VX_matrix_arbiter #( assign grant_onehot[i] = requests[i] && !(| pri[i]); end - for (i = 0; i < N; i++) begin - for (j = i + 1; j < N; j++) begin + for (genvar i = 0; i < N; i++) begin + for (genvar j = i + 1; j < N; j++) begin always @(posedge clk) begin if (reset) begin state[i][j] <= 0; - end - else begin + end else begin state[i][j] <= (state[i][j] || grant_onehot[j]) && !grant_onehot[i]; end end diff --git a/hw/rtl/libs/VX_multiplier.v b/hw/rtl/libs/VX_multiplier.v index d5b7793c..5822f31b 100644 --- a/hw/rtl/libs/VX_multiplier.v +++ b/hw/rtl/libs/VX_multiplier.v @@ -50,18 +50,18 @@ module VX_multiplier #( reg [WIDTHP-1:0] result_pipe [0:PIPELINE-1]; - genvar i; - for (i = 0; i < PIPELINE; i++) begin + for (genvar i = 0; i < PIPELINE; i++) begin always @(posedge clk) begin if (reset) begin result_pipe[i] <= 0; - end - else if (clk_en) begin - if (i == 0) begin - result_pipe[i] <= result_unqual; - end else begin - result_pipe[i] <= result_pipe[i-1]; - end + end else begin + if (clk_en) begin + if (i == 0) begin + result_pipe[i] <= result_unqual; + end else begin + result_pipe[i] <= result_pipe[i-1]; + end + end end end end diff --git a/hw/rtl/libs/VX_onehot_encooder.v b/hw/rtl/libs/VX_onehot_encooder.v index 65c6dd8e..89b997ce 100644 --- a/hw/rtl/libs/VX_onehot_encooder.v +++ b/hw/rtl/libs/VX_onehot_encooder.v @@ -7,12 +7,10 @@ module VX_onehot_encoder #( output reg [`LOG2UP(N)-1:0] binary, output 
reg valid ); - integer i; - always @(*) begin valid = 1'b0; binary = `LOG2UP(N)'(0); - for (i = 0; i < N; i++) begin + for (integer i = 0; i < N; i++) begin if (onehot[i]) begin valid = 1'b1; binary = `LOG2UP(N)'(i); diff --git a/hw/rtl/libs/VX_priority_encoder.v b/hw/rtl/libs/VX_priority_encoder.v index ca5c42d4..81a93a95 100644 --- a/hw/rtl/libs/VX_priority_encoder.v +++ b/hw/rtl/libs/VX_priority_encoder.v @@ -6,13 +6,11 @@ module VX_priority_encoder #( input wire [N-1:0] data_in, output reg [`LOG2UP(N)-1:0] data_out, output reg valid_out -); - integer i; - +); always @(*) begin data_out = 0; valid_out = 0; - for (i = N-1; i >= 0; i = i - 1) begin + for (integer i = N-1; i >= 0; i = i - 1) begin if (data_in[i]) begin data_out = `LOG2UP(N)'(i); valid_out = 1; diff --git a/hw/rtl/libs/VX_rr_arbiter.v b/hw/rtl/libs/VX_rr_arbiter.v index 05910a71..e5fbc746 100644 --- a/hw/rtl/libs/VX_rr_arbiter.v +++ b/hw/rtl/libs/VX_rr_arbiter.v @@ -26,12 +26,10 @@ module VX_rr_arbiter #( reg [`CLOG2(N)-1:0] state; reg [N-1:0] grant_onehot_r; - integer i, j; - always @(*) begin - for (i = 0; i < N; i++) begin + for (integer i = 0; i < N; i++) begin grant_table[i] = `CLOG2(N)'(i); - for (j = 0; j < N; j++) begin + for (integer j = 0; j < N; j++) begin if (requests[(i+j) % N]) begin grant_table[i] = `CLOG2(N)'((i+j) % N); end @@ -44,8 +42,7 @@ module VX_rr_arbiter #( always @(posedge clk) begin if (reset) begin state <= 0; - end - else begin + end else begin state <= grant_index; end end diff --git a/hw/rtl/libs/VX_scope.v b/hw/rtl/libs/VX_scope.v index dcca50ee..1091e785 100644 --- a/hw/rtl/libs/VX_scope.v +++ b/hw/rtl/libs/VX_scope.v @@ -74,7 +74,6 @@ module VX_scope #( read_delta <= 0; data_valid <= 0; end else begin - if (bus_write) begin case (cmd_type) CMD_GET_VALID, diff --git a/hw/rtl/libs/VX_shift_register.v b/hw/rtl/libs/VX_shift_register.v index b4f2496f..a82a4607 100644 --- a/hw/rtl/libs/VX_shift_register.v +++ b/hw/rtl/libs/VX_shift_register.v @@ -16,7 +16,7 @@ module 
VX_shift_register #( always @(posedge clk) begin if (reset) begin - entries <= '0; + entries <= (DEPTH * DATAW)'(0); end else begin if (enable) begin entries <= in; @@ -28,7 +28,7 @@ module VX_shift_register #( always @(posedge clk) begin if (reset) begin - entries <= '0; + entries <= (DEPTH * DATAW)'(0); end else begin if (enable) begin entries <= {entries[DEPTH-2:0], in}; diff --git a/hw/syn/quartus/cache/Makefile b/hw/syn/quartus/cache/Makefile index 8c69066f..444a8d29 100755 --- a/hw/syn/quartus/cache/Makefile +++ b/hw/syn/quartus/cache/Makefile @@ -9,9 +9,9 @@ DEVICE = 10AX115N3F40E2SG # Executable Configuration SYN_ARGS = --parallel --read_settings_files=on -FIT_ARGS = --part=$(DEVICE) --read_settings_files=on +FIT_ARGS = --parallel --part=$(DEVICE) --read_settings_files=on ASM_ARGS = -STA_ARGS = --do_report_timing +STA_ARGS = --parallel --do_report_timing # Build targets all: $(PROJECT).sta.rpt diff --git a/hw/syn/quartus/core/Makefile b/hw/syn/quartus/core/Makefile index 31ed1051..477d454d 100644 --- a/hw/syn/quartus/core/Makefile +++ b/hw/syn/quartus/core/Makefile @@ -11,9 +11,9 @@ DEVICE = 10AX115N3F40E2SG # Executable Configuration SYN_ARGS = --parallel --read_settings_files=on -FIT_ARGS = --part=$(DEVICE) --read_settings_files=on +FIT_ARGS = --parallel --part=$(DEVICE) --read_settings_files=on ASM_ARGS = -STA_ARGS = --do_report_timing +STA_ARGS = --parallel --do_report_timing # Build targets all: $(PROJECT).sta.rpt diff --git a/hw/syn/quartus/pipeline/Makefile b/hw/syn/quartus/pipeline/Makefile index 447037ff..76bad071 100644 --- a/hw/syn/quartus/pipeline/Makefile +++ b/hw/syn/quartus/pipeline/Makefile @@ -11,9 +11,9 @@ DEVICE = 10AX115N3F40E2SG # Executable Configuration SYN_ARGS = --parallel --read_settings_files=on -FIT_ARGS = --part=$(DEVICE) --read_settings_files=on +FIT_ARGS = --parallel --part=$(DEVICE) --read_settings_files=on ASM_ARGS = -STA_ARGS = --do_report_timing +STA_ARGS = --parallel --do_report_timing # Build targets all: 
$(PROJECT).sta.rpt diff --git a/hw/syn/quartus/project.sdc b/hw/syn/quartus/project.sdc index 59686a41..61b8cba9 100644 --- a/hw/syn/quartus/project.sdc +++ b/hw/syn/quartus/project.sdc @@ -1,6 +1,6 @@ set_time_format -unit ns -decimal_places 3 -create_clock -name {clk} -period "200 MHz" -waveform { 0.0 1.0 } [get_ports {clk}] +create_clock -name {clk} -period "300 MHz" -waveform { 0.0 1.0 } [get_ports {clk}] derive_pll_clocks -create_base_clocks derive_clock_uncertainty diff --git a/hw/syn/quartus/project.tcl b/hw/syn/quartus/project.tcl index e3d1f2cc..b2164fa0 100644 --- a/hw/syn/quartus/project.tcl +++ b/hw/syn/quartus/project.tcl @@ -33,6 +33,7 @@ set_global_assignment -name TOP_LEVEL_ENTITY $opts(top) set_global_assignment -name PROJECT_OUTPUT_DIRECTORY bin set_global_assignment -name NUM_PARALLEL_PROCESSORS ALL set_global_assignment -name VERILOG_INPUT_VERSION SYSTEMVERILOG_2009 +set_global_assignment -name ADD_PASS_THROUGH_LOGIC_TO_INFERRED_RAMS OFF set_global_assignment -name VERILOG_MACRO QUARTUS set_global_assignment -name VERILOG_MACRO SYNTHESIS set_global_assignment -name VERILOG_MACRO NDEBUG diff --git a/hw/syn/quartus/top/Makefile b/hw/syn/quartus/top/Makefile index 4f8111b9..6258682f 100644 --- a/hw/syn/quartus/top/Makefile +++ b/hw/syn/quartus/top/Makefile @@ -11,9 +11,9 @@ DEVICE = 10AX115N3F40E2SG # Executable Configuration SYN_ARGS = --parallel --read_settings_files=on --set=VERILOG_MACRO=NOPAE=1 -FIT_ARGS = --part=$(DEVICE) --read_settings_files=on +FIT_ARGS = --parallel --part=$(DEVICE) --read_settings_files=on ASM_ARGS = -STA_ARGS = --do_report_timing +STA_ARGS = --parallel --do_report_timing # Build targets all: $(PROJECT).sta.rpt diff --git a/hw/syn/quartus/vortex/Makefile b/hw/syn/quartus/vortex/Makefile index 633e32cd..e424db86 100644 --- a/hw/syn/quartus/vortex/Makefile +++ b/hw/syn/quartus/vortex/Makefile @@ -11,9 +11,9 @@ DEVICE = 10AX115N3F40E2SG # Executable Configuration SYN_ARGS = --parallel --read_settings_files=on -FIT_ARGS = 
--part=$(DEVICE) --read_settings_files=on +FIT_ARGS = --parallel --part=$(DEVICE) --read_settings_files=on ASM_ARGS = -STA_ARGS = --do_report_timing +STA_ARGS = --parallel --do_report_timing # Build targets all: $(PROJECT).sta.rpt diff --git a/hw/unit_tests/VX_divide_tb.v b/hw/unit_tests/VX_divide_tb.v index 92ccfac7..a4ba539e 100644 --- a/hw/unit_tests/VX_divide_tb.v +++ b/hw/unit_tests/VX_divide_tb.v @@ -17,26 +17,23 @@ module VX_tb_divide(); wire [31:0] o_div[0:7], o_rem[0:7]; - genvar i; - generate - for (i = 0; i < 8; i++) begin - VX_divide#( - .WIDTHN(32), - .WIDTHD(32), - .WIDTHQ(32), - .WIDTHR(32), - .PIPELINE(i) - ) div( - .clock(clk), - .aclr(rst), - .clken(1'b1), - .numer(numer), - .denom(denom), - .quotient(o_div[i]), - .remainder(o_rem[i]) - ); - end - endgenerate + for (genvar i = 0; i < 8; i++) begin + VX_divide#( + .WIDTHN(32), + .WIDTHD(32), + .WIDTHQ(32), + .WIDTHR(32), + .PIPELINE(i) + ) div( + .clock(clk), + .aclr(rst), + .clken(1'b1), + .numer(numer), + .denom(denom), + .quotient(o_div[i]), + .remainder(o_rem[i]) + ); + end initial begin clk = 0; rst = 0;