From 27e95530efecb8a7f850e204f8fd570d5fc3245e Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Thu, 30 Jul 2020 03:06:01 -0700 Subject: [PATCH] pipeline optimization --- hw/rtl/VX_alu_unit.v | 69 +++++++++----------- hw/rtl/VX_config.vh | 4 +- hw/rtl/VX_decode.v | 89 +++++++++++++++---------- hw/rtl/VX_define.vh | 10 ++- hw/rtl/VX_gpr_fp_ctrl.v | 87 ++++++------------------- hw/rtl/VX_gpr_stage.v | 59 +++++------------ hw/rtl/VX_gpu_unit.v | 2 +- hw/rtl/VX_issue.v | 13 ++-- hw/rtl/VX_platform.vh | 8 +++ hw/rtl/VX_scheduler.v | 101 +++++++---------------------- hw/rtl/VX_warp_sched.v | 12 ++-- hw/rtl/VX_writeback.v | 7 -- hw/rtl/interfaces/VX_decode_if.v | 10 +-- hw/rtl/interfaces/VX_gpr_read_if.v | 5 +- hw/rtl/interfaces/VX_wb_if.v | 2 +- hw/rtl/libs/VX_cam_buffer.v | 2 +- hw/rtl/libs/VX_generic_queue.v | 6 +- hw/rtl/libs/VX_generic_stack.v | 34 ---------- hw/rtl/libs/VX_index_queue.v | 2 +- hw/unit_tests/VX_divide_tb.v | 2 +- 20 files changed, 184 insertions(+), 340 deletions(-) delete mode 100644 hw/rtl/libs/VX_generic_stack.v diff --git a/hw/rtl/VX_alu_unit.v b/hw/rtl/VX_alu_unit.v index 61867ac2..7a5dd6a3 100644 --- a/hw/rtl/VX_alu_unit.v +++ b/hw/rtl/VX_alu_unit.v @@ -46,9 +46,9 @@ module VX_alu_unit #( default: alu_result[i] = alu_in1[i] + alu_in2[i]; // ADD, LUI, AUIPC endcase end - end + end - wire [`NT_BITS-1:0] br_result_index; + wire [`NT_BITS-1:0] br_result_index, br_result_index_o; VX_priority_encoder #( .N(`NUM_THREADS) @@ -58,15 +58,35 @@ module VX_alu_unit #( `UNUSED_PIN (valid_out) ); - wire [32:0] br_result = sub_result[br_result_index]; - wire br_sign = br_result[32]; + wire [`BR_BITS-1:0] br_op = `IS_BR_OP(alu_req_if.alu_op) ? `BR_OP(alu_req_if.alu_op) : 0; + wire [`BR_BITS-1:0] br_op_o; + + wire [31:0] br_addr = (br_op == `BR_JALR) ? 
alu_req_if.rs1_data[br_result_index] : alu_req_if.curr_PC; + wire [31:0] br_dest = $signed(br_addr) + $signed(alu_req_if.offset); + + wire is_jal = (alu_op == `ALU_JAL || alu_op == `ALU_JALR); + wire [`NUM_THREADS-1:0][31:0] alu_jal_result = is_jal ? {`NUM_THREADS{alu_req_if.next_PC}} : alu_result; + + wire stall = ~alu_commit_if.ready && alu_commit_if.valid; + + VX_generic_register #( + .N(1 + `NW_BITS + `ISTAG_BITS + (`NUM_THREADS * 32) + `BR_BITS + 32 + `NT_BITS) + ) alu_reg ( + .clk (clk), + .reset (reset), + .stall (stall), + .flush (0), + .in ({alu_req_if.valid, alu_req_if.warp_num, alu_req_if.issue_tag, alu_jal_result, br_op, br_dest, br_result_index}), + .out ({alu_commit_if.valid, branch_ctl_if.warp_num, alu_commit_if.issue_tag, alu_commit_if.data, br_op_o, branch_ctl_if.dest, br_result_index_o}) + ); + + wire [31:0] br_result = alu_commit_if.data[br_result_index_o]; + wire br_sign = br_result[31]; wire br_nzero = (| br_result[31:0]); - wire [`BR_BITS-1:0] br_op = `BR_OP(alu_req_if.alu_op); - reg br_taken; always @(*) begin - case (br_op) + case (br_op_o) `BR_NE: br_taken = br_nzero; `BR_EQ: br_taken = ~br_nzero; `BR_LT, @@ -75,39 +95,10 @@ module VX_alu_unit #( `BR_GEU: br_taken = ~br_sign; default: br_taken = 1'b1; endcase - end + end - wire [31:0] br_addr = (br_op == `BR_JALR) ? alu_req_if.rs1_data[br_result_index] : alu_req_if.curr_PC; - wire [31:0] br_dest = $signed(br_addr) + $signed(alu_req_if.offset); - - wire is_jal = (alu_op == `ALU_JAL || alu_op == `ALU_JALR); - wire is_br_valid = `IS_BR_OP(alu_op) && alu_req_if.valid; - - wire [`NUM_THREADS-1:0][31:0] alu_jal_result = is_jal ? 
{`NUM_THREADS{alu_req_if.next_PC}} : alu_result; - - wire stall = ~alu_commit_if.ready && alu_commit_if.valid; - - VX_generic_register #( - .N(1 + `NW_BITS + 1 + 32) - ) branch_reg ( - .clk (clk), - .reset (reset), - .stall (stall), - .flush (0), - .in ({is_br_valid, alu_req_if.warp_num, br_taken, br_dest}), - .out ({branch_ctl_if.valid, branch_ctl_if.warp_num, branch_ctl_if.taken, branch_ctl_if.dest}) - ); - - VX_generic_register #( - .N(1 + `ISTAG_BITS + (`NUM_THREADS * 32)) - ) alu_reg ( - .clk (clk), - .reset (reset), - .stall (stall), - .flush (0), - .in ({alu_req_if.valid, alu_req_if.issue_tag, alu_jal_result}), - .out ({alu_commit_if.valid, alu_commit_if.issue_tag, alu_commit_if.data}) - ); + assign branch_ctl_if.valid = alu_commit_if.valid && (br_op_o != 0); /* qualify with the registered valid: br_op_o describes the instruction held in alu_reg, not the incoming request */ + assign branch_ctl_if.taken = br_taken; assign alu_req_if.ready = ~stall; diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 2e6929cb..0562cc3f 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -28,7 +28,7 @@ `endif `ifndef NUM_CSRS -`define NUM_CSRS 1024 +`define NUM_CSRS 64 `endif `ifndef STARTUP_ADDR @@ -57,7 +57,7 @@ `define EXT_M_ENABLE -//`define EXT_F_ENABLE +`define EXT_F_ENABLE // Configuration Values ======================================================= diff --git a/hw/rtl/VX_decode.v b/hw/rtl/VX_decode.v index 5e737e1d..ff1d839c 100644 --- a/hw/rtl/VX_decode.v +++ b/hw/rtl/VX_decode.v @@ -35,10 +35,10 @@ module VX_decode #( wire [6:0] func7 = instr[31:25]; wire [11:0] u_12 = instr[31:20]; - wire [`NR_BITS-1:0] rd = instr[11:7]; - wire [`NR_BITS-1:0] rs1 = instr[19:15]; - wire [`NR_BITS-1:0] rs2 = instr[24:20]; - wire [`NR_BITS-1:0] rs3 = instr[31:27]; + wire [4:0] rd = instr[11:7]; + wire [4:0] rs1 = instr[19:15]; + wire [4:0] rs2 = instr[24:20]; + wire [4:0] rs3 = instr[31:27]; // opcode types wire is_rtype = (opcode == `INST_R); @@ -202,7 +202,7 @@ module VX_decode #( wire is_fcvtf = is_fci && (func7 == 7'h68); // convert to float wire is_fmvcls = is_fci && (func7 ==
7'h70 || func7 == 7'h78); // move + class wire is_fr4 = is_fmadd || is_fmsub || is_fnmsub || is_fnmadd; - wire is_fpu = (is_fl || is_fs || is_fci || is_fr4); + wire is_fpu = (is_fl || is_fs || is_fci || is_fr4); always @(*) begin fpu_op = `FPU_OTHER; @@ -242,7 +242,8 @@ module VX_decode #( wire is_fcvtf = 0; wire is_fmvcls = 0; wire is_fr4 = 0; - wire is_fpu = 0; + wire is_fpu = 0; + always @(*) begin fpu_op = `FPU_OTHER; end @@ -271,6 +272,29 @@ module VX_decode #( endcase end + /////////////////////////////////////////////////////////////////////////// + + wire use_rd = (is_fl || is_fci || is_fr4) + || ((rd != 0) && (is_itype || is_rtype || is_lui || is_auipc || is_csr || is_jal || is_jalr || is_jals || is_ltype)); + + wire use_rs1 = is_fpu + || is_gpu + || ((is_jalr || is_btype || is_ltype || is_stype || is_itype || is_rtype || ~is_csr_imm || is_gpu) && (rs1 != 0)); + + wire use_rs2 = (is_fpu && ~(is_fl || (fpu_op == `FPU_SQRT) || is_fcvti || is_fcvtf || is_fmvcls)) + || (is_gpu && (gpu_op == `GPU_BAR || gpu_op == `GPU_WSPAWN)) + || ((is_btype || is_stype || is_rtype) && (rs2 != 0)); + + wire use_rs3 = is_fr4; + + wire rd_is_fp = is_fpu && ~(is_fcmp || is_fcvti || (fpu_op == `FPU_MVXW || fpu_op == `FPU_CLASS)); + wire rs1_is_fp = is_fr4 || (is_fci && ~(is_fcvtf || (fpu_op == `FPU_MVWX))); + wire rs2_is_fp = is_fs || is_fr4 || is_fci; + + wire [4:0] rs1_qual = is_lui ? 5'h0 : rs1; + + /////////////////////////////////////////////////////////////////////////// + VX_decode_if decode_tmp_if(); assign decode_tmp_if.valid = ifetch_rsp_if.valid; @@ -297,9 +321,26 @@ module VX_decode #( (is_rtype || is_itype || is_lui || is_auipc) ? `OP_BITS'(alu_op) : 0; - assign decode_tmp_if.rd = rd; - assign decode_tmp_if.rs1 = is_lui ? 
`NR_BITS'(0) : rs1; - assign decode_tmp_if.rs2 = rs2; + assign decode_tmp_if.wb = use_rd; + + `ifdef EXT_F_ENABLE + assign decode_tmp_if.rd = {rd_is_fp, rd}; + assign decode_tmp_if.rs1 = {rs1_is_fp, rs1_qual}; + assign decode_tmp_if.rs2 = {rs2_is_fp, rs2}; + assign decode_tmp_if.rs3 = {1'b1, rs3}; + `else + assign decode_tmp_if.rd = rd; + assign decode_tmp_if.rs1 = rs1_qual; + assign decode_tmp_if.rs2 = rs2; + assign decode_tmp_if.rs3 = rs3; + `endif + + assign decode_tmp_if.use_rs3 = use_rs3; + + assign decode_tmp_if.reg_use_mask = ((`NUM_REGS)'(use_rd) << decode_tmp_if.rd) /* shift by the full register id (FP select bit included) so each bit lines up with the scheduler's inuse_reg_mask, which is indexed by decode_if.rd */ + | ((`NUM_REGS)'(use_rs1) << decode_tmp_if.rs1) + | ((`NUM_REGS)'(use_rs2) << decode_tmp_if.rs2) + | ((`NUM_REGS)'(use_rs3) << decode_tmp_if.rs3); assign decode_tmp_if.imm = (is_lui || is_auipc) ? {upper_imm, 12'(0)} : (is_jal || is_jalr || is_jals) ? jalx_offset : @@ -308,26 +349,8 @@ module VX_decode #( assign decode_tmp_if.rs1_is_PC = is_auipc; assign decode_tmp_if.rs2_is_imm = is_itype || is_lui || is_auipc || is_csr_imm; - - assign decode_tmp_if.use_rs1 = is_fpu - || is_gpu - || ((is_jalr || is_btype || is_ltype || is_stype || is_itype || is_rtype || ~is_csr_imm || is_gpu) - && (decode_tmp_if.rs1 != 0)); - - assign decode_tmp_if.use_rs2 = (is_fpu && ~(is_fl || (fpu_op == `FPU_SQRT) || is_fcvti || is_fcvtf || is_fmvcls)) - || (is_gpu && (gpu_op == `GPU_BAR || gpu_op == `GPU_WSPAWN)) - || ((is_btype || is_stype || is_rtype) - && (decode_tmp_if.rs2 != 0)); - - assign decode_tmp_if.rd_is_fp = is_fpu && ~(is_fcmp || is_fcvti || (fpu_op == `FPU_MVXW || fpu_op == `FPU_CLASS)); - assign decode_tmp_if.rs1_is_fp = is_fr4 || (is_fci && ~(is_fcvtf || (fpu_op == `FPU_MVWX))); - assign decode_tmp_if.rs2_is_fp = is_fs || is_fr4 || is_fci; - assign decode_tmp_if.rs3 = rs3; - assign decode_tmp_if.use_rs3 = is_fr4; - assign decode_tmp_if.frm = func3; - - assign decode_tmp_if.wb = (is_fl || is_fci || is_fr4) - || ((rd != 0) && (is_itype || is_rtype || is_lui || is_auipc || is_csr || is_jal || is_jalr || is_jals || is_ltype)); + + assign decode_tmp_if.frm =
func3; assign join_if.is_join = in_valid && is_gpu && (gpu_op == `GPU_JOIN); assign join_if.warp_num = ifetch_rsp_if.warp_num; @@ -338,14 +361,14 @@ module VX_decode #( wire stall = ~decode_if.ready && decode_if.valid; VX_generic_register #( - .N(1 + `NW_BITS + `NUM_THREADS + 32 + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + 1 + 1 + 1 + `EX_BITS + `OP_BITS + 1 + `NR_BITS + 1 + 1 + 1 + 1 + `FRM_BITS) + .N(1 + `NW_BITS + `NUM_THREADS + 32 + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + `EX_BITS + `OP_BITS + 1 + `NR_BITS + 1 + 1 + `FRM_BITS + `NUM_REGS) ) decode_reg ( .clk (clk), .reset (reset), .stall (stall), .flush (0), - .in ({decode_tmp_if.valid, decode_tmp_if.warp_num, decode_tmp_if.thread_mask, decode_tmp_if.curr_PC, decode_tmp_if.next_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.use_rs1, decode_tmp_if.use_rs2, decode_tmp_if.ex_type, decode_tmp_if.ex_op, decode_tmp_if.wb, decode_tmp_if.rs3, decode_tmp_if.use_rs3, decode_tmp_if.rs1_is_fp, decode_tmp_if.rs2_is_fp, decode_tmp_if.rd_is_fp, decode_tmp_if.frm}), - .out ({decode_if.valid, decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.next_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.use_rs1, decode_if.use_rs2, decode_if.ex_type, decode_if.ex_op, decode_if.wb, decode_if.rs3, decode_if.use_rs3, decode_if.rs1_is_fp, decode_if.rs2_is_fp, decode_if.rd_is_fp, decode_if.frm}) + .in ({decode_tmp_if.valid, decode_tmp_if.warp_num, decode_tmp_if.thread_mask, decode_tmp_if.curr_PC, decode_tmp_if.next_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.ex_type, decode_tmp_if.ex_op, decode_tmp_if.wb, decode_tmp_if.rs3, decode_tmp_if.use_rs3, decode_tmp_if.frm, decode_tmp_if.reg_use_mask}), + .out ({decode_if.valid, decode_if.warp_num, 
decode_if.thread_mask, decode_if.curr_PC, decode_if.next_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.ex_type, decode_if.ex_op, decode_if.wb, decode_if.rs3, decode_if.use_rs3, decode_if.frm, decode_if.reg_use_mask}) ); assign ifetch_rsp_if.ready = ~stall; @@ -357,7 +380,7 @@ module VX_decode #( print_ex_type(decode_tmp_if.ex_type); $write(", op="); print_ex_op(decode_tmp_if.ex_type, decode_tmp_if.ex_op); - $write(", tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b, use_rs1=%b, use_rs2=%b, use_rs3=%b, rd_is_fp=%b, rs1_is_fp=%b, rs2_is_fp=%b, frm=", decode_tmp_if.thread_mask, decode_tmp_if.wb, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.rs3, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.use_rs1, decode_tmp_if.use_rs2, decode_tmp_if.use_rs3, decode_tmp_if.rd_is_fp,decode_tmp_if.rs1_is_fp, decode_tmp_if.rs2_is_fp); + $write(", tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b, frm=", decode_tmp_if.thread_mask, decode_tmp_if.wb, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.rs3, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm); print_frm(decode_tmp_if.frm); $write("\n"); diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index 6027223f..8e0279b5 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -7,7 +7,6 @@ /////////////////////////////////////////////////////////////////////////////// -`define QUEUE_FORCE_MLAB 1 // `define SYNTHESIS 1 // `define ASIC 1 @@ -23,7 +22,11 @@ `define REQS_BITS `LOG2UP(NUM_REQUESTS) +`ifdef EXT_F_ENABLE +`define NUM_REGS 64 +`else `define NUM_REGS 32 +`endif `define NR_BITS `LOG2UP(`NUM_REGS) @@ -33,7 +36,9 @@ `define ISTAG_BITS `LOG2UP(`ISSUEQ_SIZE) -`define LATENCY_IDIV 23 +/////////////////////////////////////////////////////////////////////////////// + +`define LATENCY_IDIV 24 
`define LATENCY_IMUL 2 `define LATENCY_FMULADD 2 @@ -408,7 +413,6 @@ typedef struct packed { logic [`NUM_THREADS-1:0] thread_mask; logic [31:0] curr_PC; logic [`NR_BITS-1:0] rd; - logic rd_is_fp; logic wb; } is_data_t; diff --git a/hw/rtl/VX_gpr_fp_ctrl.v b/hw/rtl/VX_gpr_fp_ctrl.v index fab66404..1a87a5c3 100644 --- a/hw/rtl/VX_gpr_fp_ctrl.v +++ b/hw/rtl/VX_gpr_fp_ctrl.v @@ -3,98 +3,49 @@ // control module to support multi-cycle read for fp register module VX_gpr_fp_ctrl ( - input wire clk, - input wire reset, + input wire clk, + input wire reset, - input wire [`NUM_THREADS-1:0][31:0] rs1_int_data, - input wire [`NUM_THREADS-1:0][31:0] rs2_int_data, - input wire [`NUM_THREADS-1:0][31:0] rs1_fp_data, - input wire [`NUM_THREADS-1:0][31:0] rs2_fp_data, + input wire [`NUM_THREADS-1:0][31:0] rs1_data, + input wire [`NUM_THREADS-1:0][31:0] rs2_data, // outputs - output wire [`NR_BITS-1:0] raddr1, - output wire [`NR_BITS-1:0] raddr2, + output wire [`NR_BITS-1:0] raddr1, - VX_gpr_read_if gpr_read_if + VX_gpr_read_if gpr_read_if ); - // param - localparam GPR_DELAY_WID = 1; - reg [GPR_DELAY_WID-1:0] multi_cyc_state; - reg [`NUM_THREADS-1:0][31:0] tmp_rs1_data; - reg [`NUM_THREADS-1:0][31:0] tmp_rs2_data; - reg [`NUM_THREADS-1:0][31:0] rs1_data; - reg [`NUM_THREADS-1:0][31:0] rs2_data; - reg [`NUM_THREADS-1:0][31:0] rs3_data; + reg [`NUM_THREADS-1:0][31:0] tmp_rs1_data; + reg read_rs3; - wire gpr_delay; + wire gpr_delay = gpr_read_if.valid && gpr_read_if.use_rs3 && ~read_rs3; wire gpr_fire = gpr_read_if.valid && gpr_read_if.ready; always @(posedge clk) begin if (reset) begin - multi_cyc_state <= 0; + read_rs3 <= 0; end else if (gpr_delay) begin - multi_cyc_state <= 1; + read_rs3 <= 1; end else if (gpr_fire) begin - multi_cyc_state <= 0; + read_rs3 <= 0; end end - // select rs1 data - - always @(posedge clk) begin - if (reset) begin - tmp_rs1_data <= 0; - end else begin - if (gpr_delay) begin - if (gpr_read_if.rs1_is_fp) begin - tmp_rs1_data <= rs1_fp_data; - end else begin - 
tmp_rs1_data <= rs1_int_data; - end - end - end - end - - // select rs2 data - - always @(posedge clk) begin - if(reset) begin - tmp_rs2_data <= 0; - end else begin - if (gpr_delay) begin - if (gpr_read_if.rs2_is_fp) begin - tmp_rs2_data <= rs2_fp_data; - end else begin - tmp_rs2_data <= rs2_int_data; - end - end + // backup original rs1 data + always @(posedge clk) begin + if (gpr_delay) begin + tmp_rs1_data <= rs1_data; end end // outputs - - assign gpr_delay = gpr_read_if.valid && gpr_read_if.use_rs3 && (0 == multi_cyc_state); - assign raddr1 = multi_cyc_state ? gpr_read_if.rs3 : gpr_read_if.rs1; - assign raddr2 = gpr_read_if.rs2; - - always @(*) begin - if (gpr_read_if.use_rs3) begin - rs1_data = tmp_rs1_data; - rs2_data = tmp_rs2_data; - rs3_data = rs1_fp_data; - end else begin - rs1_data = gpr_read_if.rs1_is_fp ? rs1_fp_data : rs1_int_data; - rs2_data = gpr_read_if.rs2_is_fp ? rs2_fp_data : rs2_int_data; - rs3_data = {`NUM_THREADS{32'h8000_0000}}; // default value: -0 in single fp - end - end + assign raddr1 = read_rs3 ? gpr_read_if.rs3 : gpr_read_if.rs1; assign gpr_read_if.ready = ~gpr_delay; - assign gpr_read_if.rs1_data = rs1_data; + assign gpr_read_if.rs1_data = gpr_read_if.use_rs3 ? 
tmp_rs1_data : rs1_data; assign gpr_read_if.rs2_data = rs2_data; - assign gpr_read_if.rs3_data = rs3_data; + assign gpr_read_if.rs3_data = rs1_data; endmodule \ No newline at end of file diff --git a/hw/rtl/VX_gpr_stage.v b/hw/rtl/VX_gpr_stage.v index 529945f4..06e7d344 100644 --- a/hw/rtl/VX_gpr_stage.v +++ b/hw/rtl/VX_gpr_stage.v @@ -14,79 +14,52 @@ module VX_gpr_stage #( ); `UNUSED_VAR (reset) - wire [`NUM_THREADS-1:0][31:0] rs1_int_data [`NUM_WARPS-1:0]; - wire [`NUM_THREADS-1:0][31:0] rs2_int_data [`NUM_WARPS-1:0]; + wire [`NUM_THREADS-1:0][31:0] rs1_data [`NUM_WARPS-1:0]; + wire [`NUM_THREADS-1:0][31:0] rs2_data [`NUM_WARPS-1:0]; wire [`NR_BITS-1:0] raddr1; - wire [`NR_BITS-1:0] raddr2; genvar i; for (i = 0; i < `NUM_WARPS; i++) begin - wire [`NUM_WARPS-1:0] we = writeback_if.thread_mask & {`NUM_THREADS{writeback_if.valid && ~writeback_if.rd_is_fp && (i == writeback_if.warp_num)}}; + wire [`NUM_THREADS-1:0] we = writeback_if.thread_mask + & {`NUM_THREADS{writeback_if.valid && (i == writeback_if.warp_num)}}; VX_gpr_ram gpr_int_ram ( .clk (clk), .we (we), .waddr (writeback_if.rd), .wdata (writeback_if.data), .rs1 (raddr1), - .rs2 (raddr2), - .rs1_data (rs1_int_data[i]), - .rs2_data (rs2_int_data[i]) + .rs2 (gpr_read_if.rs2), + .rs1_data (rs1_data[i]), + .rs2_data (rs2_data[i]) ); end -`ifdef EXT_F_ENABLE - - wire [`NUM_THREADS-1:0][31:0] rs1_fp_data [`NUM_WARPS-1:0]; - wire [`NUM_THREADS-1:0][31:0] rs2_fp_data [`NUM_WARPS-1:0]; - - for (i = 0; i < `NUM_WARPS; i++) begin - wire [`NUM_WARPS-1:0] we = writeback_if.thread_mask & {`NUM_THREADS{writeback_if.valid && writeback_if.rd_is_fp && (i == writeback_if.warp_num)}}; - VX_gpr_ram gpr_fp_ram ( - .clk (clk), - .we (we), - .waddr (writeback_if.rd), - .wdata (writeback_if.data), - .rs1 (raddr1), - .rs2 (raddr2), - .rs1_data (rs1_fp_data[i]), - .rs2_data (rs2_fp_data[i]) - ); - end - +`ifdef EXT_F_ENABLE VX_gpr_fp_ctrl VX_gpr_fp_ctrl ( - .clk (clk), - .reset (reset), + .clk (clk), + .reset (reset), //inputs - 
.rs1_int_data (rs1_int_data[gpr_read_if.warp_num]), - .rs2_int_data (rs2_int_data[gpr_read_if.warp_num]), - .rs1_fp_data (rs1_fp_data[gpr_read_if.warp_num]), - .rs2_fp_data (rs2_fp_data[gpr_read_if.warp_num]), + .rs1_data (rs1_data[gpr_read_if.warp_num]), + .rs2_data (rs2_data[gpr_read_if.warp_num]), // outputs - .raddr1 (raddr1), - .raddr2 (raddr2), - .gpr_read_if (gpr_read_if) + .raddr1 (raddr1), + .gpr_read_if(gpr_read_if) ); - `else assign raddr1 = gpr_read_if.rs1; - assign raddr2 = gpr_read_if.rs2; - assign gpr_read_if.rs1_data = rs1_int_data[gpr_read_if.warp_num]; - assign gpr_read_if.rs2_data = rs2_int_data[gpr_read_if.warp_num]; + assign gpr_read_if.rs1_data = rs1_data[gpr_read_if.warp_num]; + assign gpr_read_if.rs2_data = rs2_data[gpr_read_if.warp_num]; assign gpr_read_if.rs3_data = 0; assign gpr_read_if.ready = 1; wire valid = gpr_read_if.valid; - wire rs1_is_fp = gpr_read_if.rs1_is_fp; - wire rs2_is_fp = gpr_read_if.rs2_is_fp; wire use_rs3 = gpr_read_if.use_rs3; wire [`NR_BITS-1:0] rs3 = gpr_read_if.rs3; `UNUSED_VAR (valid); - `UNUSED_VAR (rs1_is_fp); - `UNUSED_VAR (rs2_is_fp); `UNUSED_VAR (use_rs3); `UNUSED_VAR (rs3); `endif diff --git a/hw/rtl/VX_gpu_unit.v b/hw/rtl/VX_gpu_unit.v index 23439d7d..bde5d1c9 100644 --- a/hw/rtl/VX_gpu_unit.v +++ b/hw/rtl/VX_gpu_unit.v @@ -53,7 +53,7 @@ module VX_gpu_unit #( wire[`NUM_THREADS-1:0] split_new_use_mask; wire[`NUM_THREADS-1:0] split_new_later_mask; - for (i = 0; i < `NUM_THREADS; i++) begin : masks_init + for (i = 0; i < `NUM_THREADS; i++) begin wire curr_bool = (gpu_req_if.rs1_data[i] == 32'b1); assign split_new_use_mask[i] = gpu_req_if.thread_mask[i] & (curr_bool); assign split_new_later_mask[i] = gpu_req_if.thread_mask[i] & (!curr_bool); diff --git a/hw/rtl/VX_issue.v b/hw/rtl/VX_issue.v index a445022f..1967df71 100644 --- a/hw/rtl/VX_issue.v +++ b/hw/rtl/VX_issue.v @@ -23,8 +23,6 @@ module VX_issue #( assign gpr_read_if.rs1 = decode_if.rs1; assign gpr_read_if.rs2 = decode_if.rs2; assign gpr_read_if.rs3 = 
decode_if.rs3; - assign gpr_read_if.rs1_is_fp = decode_if.rs1_is_fp; - assign gpr_read_if.rs2_is_fp = decode_if.rs2_is_fp; assign gpr_read_if.use_rs3 = decode_if.use_rs3; wire [`ISTAG_BITS-1:0] issue_tag, issue_tmp_tag; @@ -52,8 +50,7 @@ module VX_issue #( .mul_busy (mul_busy), .fpu_busy (fpu_busy), .gpu_busy (gpu_busy), - .issue_tag (issue_tag), - `UNUSED_PIN (is_empty) + .issue_tag (issue_tag) ); VX_gpr_stage #( @@ -72,14 +69,14 @@ module VX_issue #( wire flush = alu_req_if.ready && ~decode_if.ready; VX_generic_register #( - .N(1 + `ISTAG_BITS + `NW_BITS + `NUM_THREADS + 32 + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + 1 + 1 + 1 + `EX_BITS + `OP_BITS + 1 + `NR_BITS + 1 + 1 + 1 + `FRM_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + (`NUM_THREADS * 32)) - ) decode_reg ( + .N(1 + `ISTAG_BITS + `NW_BITS + `NUM_THREADS + 32 + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + 1 + `EX_BITS + `OP_BITS + 1 + `NR_BITS + 1 + `FRM_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + (`NUM_THREADS * 32)) + ) issue_reg ( .clk (clk), .reset (reset), .stall (stall), .flush (flush), - .in ({decode_if.valid, issue_tag, decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.next_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.use_rs1, decode_if.use_rs2, decode_if.ex_type, decode_if.ex_op, decode_if.wb, decode_if.rs3, decode_if.use_rs3, decode_if.rs1_is_fp, decode_if.rs2_is_fp, decode_if.frm, gpr_read_if.rs1_data, gpr_read_if.rs2_data, gpr_read_if.rs3_data}), - .out ({decode_tmp_if.valid, issue_tmp_tag, decode_tmp_if.warp_num, decode_tmp_if.thread_mask, decode_tmp_if.curr_PC, decode_tmp_if.next_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.use_rs1, decode_tmp_if.use_rs2, decode_tmp_if.ex_type, decode_tmp_if.ex_op, decode_tmp_if.wb, decode_tmp_if.rs3, decode_tmp_if.use_rs3, decode_tmp_if.rs1_is_fp, 
decode_tmp_if.rs2_is_fp, decode_tmp_if.frm, gpr_data_tmp_if.rs1_data, gpr_data_tmp_if.rs2_data, gpr_data_tmp_if.rs3_data}) + .in ({decode_if.valid, issue_tag, decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.next_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.ex_type, decode_if.ex_op, decode_if.wb, decode_if.rs3, decode_if.use_rs3, decode_if.frm, gpr_read_if.rs1_data, gpr_read_if.rs2_data, gpr_read_if.rs3_data}), + .out ({decode_tmp_if.valid, issue_tmp_tag, decode_tmp_if.warp_num, decode_tmp_if.thread_mask, decode_tmp_if.curr_PC, decode_tmp_if.next_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.ex_type, decode_tmp_if.ex_op, decode_tmp_if.wb, decode_tmp_if.rs3, decode_tmp_if.use_rs3, decode_tmp_if.frm, gpr_data_tmp_if.rs1_data, gpr_data_tmp_if.rs2_data, gpr_data_tmp_if.rs3_data}) ); VX_issue_demux issue_demux ( diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh index 49c74062..dcea510d 100644 --- a/hw/rtl/VX_platform.vh +++ b/hw/rtl/VX_platform.vh @@ -1,6 +1,8 @@ `ifndef VX_PLATFORM `define VX_PLATFORM +/////////////////////////////////////////////////////////////////////////////// + `ifndef NDEBUG `define DEBUG_BLOCK(x) /* verilator lint_off UNUSED */ \ x \ @@ -45,6 +47,12 @@ `define ENABLE_TRACING /* verilator tracing_on */ `define DISABLE_TRACING /* verilator tracing_off */ +/////////////////////////////////////////////////////////////////////////////// + +`define USE_FAST_BRAM (* syn_ramstyle = "mlab" *) + +/////////////////////////////////////////////////////////////////////////////// + `define CLOG2(x) $clog2(x) `define FLOG2(x) ($clog2(x) - (((1 << $clog2(x)) > (x)) ? 1 : 0)) `define LOG2UP(x) (((x) > 1) ? 
$clog2(x) : 1) diff --git a/hw/rtl/VX_scheduler.v b/hw/rtl/VX_scheduler.v index 19055311..08f65575 100644 --- a/hw/rtl/VX_scheduler.v +++ b/hw/rtl/VX_scheduler.v @@ -16,106 +16,55 @@ module VX_scheduler #( input wire mul_busy, input wire fpu_busy, input wire gpu_busy, - output wire [`ISTAG_BITS-1:0] issue_tag, - output wire is_empty + output wire [`ISTAG_BITS-1:0] issue_tag ); - localparam CTVW = `CLOG2(`NUM_WARPS * `NUM_REGS + 1); - - `ifdef EXT_F_ENABLE - localparam NREGS = (`NUM_REGS * 2); - reg inuse_table [`NUM_WARPS-1:0][NREGS-1:0]; - wire [`NR_BITS:0] read_rs1 = {decode_if.rs1_is_fp, decode_if.rs1}; - wire [`NR_BITS:0] read_rs2 = {decode_if.rs2_is_fp, decode_if.rs2}; - wire [`NR_BITS:0] read_rs3 = {1'b1, decode_if.rs3}; - wire [`NR_BITS:0] read_rd = {decode_if.rd_is_fp, decode_if.rd}; - wire [`NR_BITS:0] write_rd = {writeback_if.rd_is_fp, writeback_if.rd}; - wire rs3_inuse = inuse_table[decode_if.warp_num][read_rs3]; - `else - localparam NREGS = `NUM_REGS; - reg inuse_table [`NUM_WARPS-1:0][NREGS-1:0]; - wire [`NR_BITS-1:0] read_rs1 = decode_if.rs1; - wire [`NR_BITS-1:0] read_rs2 = decode_if.rs2; - wire [`NR_BITS-1:0] read_rd = decode_if.rd; - wire [`NR_BITS-1:0] write_rd = writeback_if.rd; - wire rs3_inuse = 0; - `endif - - reg [`NUM_THREADS-1:0] inuse_registers [`NUM_WARPS-1:0][NREGS-1:0]; - reg [CTVW-1:0] count_valid; + localparam CTVW = `CLOG2(`NUM_WARPS * `NUM_REGS + 1); + reg [`NUM_THREADS-1:0] inuse_registers [`NUM_WARPS-1:0][`NUM_REGS-1:0]; + reg [`NUM_REGS-1:0] inuse_reg_mask [`NUM_WARPS-1:0]; - wire rs1_inuse = inuse_table[decode_if.warp_num][read_rs1]; - wire rs2_inuse = inuse_table[decode_if.warp_num][read_rs2]; - wire rd_inuse = inuse_table[decode_if.warp_num][read_rd]; + wire [`NUM_REGS-1:0] inuse_mask = inuse_reg_mask[decode_if.warp_num] & decode_if.reg_use_mask; + wire inuse_hazard = (inuse_mask != 0); - wire rs1_inuse_qual = rs1_inuse && decode_if.use_rs1; - wire rs2_inuse_qual = rs2_inuse && decode_if.use_rs2; - wire rs3_inuse_qual = 
rs3_inuse && decode_if.use_rs3; - wire rd_inuse_qual = rd_inuse && decode_if.wb; - - wire inuse_valid = (rd_inuse_qual || rs1_inuse_qual || rs2_inuse_qual || rs3_inuse_qual); - - wire ex_stalled = ((gpr_busy) - || (alu_busy && (decode_if.ex_type == `EX_ALU)) + wire exu_stalled = (alu_busy && (decode_if.ex_type == `EX_ALU)) || (lsu_busy && (decode_if.ex_type == `EX_LSU)) || (csr_busy && (decode_if.ex_type == `EX_CSR)) || (mul_busy && (decode_if.ex_type == `EX_MUL)) || (fpu_busy && (decode_if.ex_type == `EX_FPU)) - || (gpu_busy && (decode_if.ex_type == `EX_GPU))); + || (gpu_busy && (decode_if.ex_type == `EX_GPU)); wire issue_buf_full; - wire stall = (ex_stalled || inuse_valid || issue_buf_full) && decode_if.valid; + wire stall = (gpr_busy || exu_stalled || inuse_hazard || issue_buf_full) && decode_if.valid; wire acquire_rd = decode_if.valid && (decode_if.wb != 0) && ~stall; wire release_rd = writeback_if.valid; - wire [`NUM_THREADS-1:0] inuse_registers_n = inuse_registers[writeback_if.warp_num][write_rd] & ~writeback_if.thread_mask; + wire [`NUM_THREADS-1:0] inuse_registers_n = inuse_registers[writeback_if.warp_num][writeback_if.rd] & ~writeback_if.thread_mask; - reg [CTVW-1:0] count_valid_next = (acquire_rd && !(release_rd && (0 == inuse_registers_n))) ? (count_valid + 1) : - (~acquire_rd && (release_rd && (0 == inuse_registers_n))) ? 
(count_valid - 1) : - count_valid; - always @(posedge clk) begin + always @(posedge clk) begin if (reset) begin integer i, w; for (w = 0; w < `NUM_WARPS; w++) begin - for (i = 0; i < NREGS; i++) begin - inuse_registers[w][i] <= 0; - inuse_table[w][i] <= 0; + for (i = 0; i < `NUM_REGS; i++) begin + inuse_registers[w][i] <= 0; end + inuse_reg_mask[w] <= 0; end - count_valid <= 0; end else begin if (acquire_rd) begin - inuse_registers[decode_if.warp_num][read_rd] <= decode_if.thread_mask; - inuse_table[decode_if.warp_num][read_rd] <= 1; + inuse_registers[decode_if.warp_num][decode_if.rd] <= decode_if.thread_mask; + inuse_reg_mask[decode_if.warp_num][decode_if.rd] <= 1; end if (release_rd) begin - assert(inuse_table[writeback_if.warp_num][write_rd] != 0); - inuse_registers[writeback_if.warp_num][write_rd] <= inuse_registers_n; - inuse_table[writeback_if.warp_num][write_rd] <= (| inuse_registers_n); + assert(inuse_reg_mask[writeback_if.warp_num][writeback_if.rd] != 0); + inuse_registers[writeback_if.warp_num][writeback_if.rd] <= inuse_registers_n; + inuse_reg_mask[writeback_if.warp_num][writeback_if.rd] <= (| inuse_registers_n); end - count_valid <= count_valid_next; end end - wire ib_acquire = decode_if.valid && ~stall; - - `DEBUG_BLOCK( - wire [`NW_BITS-1:0] cis_alu_warp_num = cmt_to_issue_if.alu_data.warp_num; - wire [`NUM_THREADS-1:0] cis_alu_thread_mask = cmt_to_issue_if.alu_data.thread_mask; - wire [31:0] cis_alu_curr_PC = cmt_to_issue_if.alu_data.curr_PC; - wire [`NR_BITS-1:0] cis_alu_rd = cmt_to_issue_if.alu_data.rd; - wire cis_alu_rd_is_fp = cmt_to_issue_if.alu_data.rd_is_fp; - wire cis_alu_wb = cmt_to_issue_if.alu_data.wb; - - wire [`NW_BITS-1:0] cis_fpu_warp_num = cmt_to_issue_if.fpu_data.warp_num; - wire [`NUM_THREADS-1:0] cis_fpu_thread_mask = cmt_to_issue_if.fpu_data.thread_mask; - wire [31:0] cis_fpu_curr_PC = cmt_to_issue_if.fpu_data.curr_PC; - wire [`NR_BITS-1:0] cis_fpu_rd = cmt_to_issue_if.fpu_data.rd; - wire cis_fpu_rd_is_fp = 
cmt_to_issue_if.fpu_data.rd_is_fp; - wire cis_fpu_wb = cmt_to_issue_if.fpu_data.wb; - ) + wire issue_fire = decode_if.valid && ~stall; VX_cam_buffer #( .DATAW ($bits(is_data_t)), @@ -124,9 +73,9 @@ module VX_scheduler #( ) issue_buffer ( .clk (clk), .reset (reset), - .write_data ({decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.rd, decode_if.rd_is_fp, decode_if.wb}), + .write_data ({decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.rd, decode_if.wb}), .write_addr (issue_tag), - .acquire_slot (ib_acquire), + .acquire_slot (issue_fire), .release_slot ({cmt_to_issue_if.alu_valid, cmt_to_issue_if.lsu_valid, cmt_to_issue_if.csr_valid, cmt_to_issue_if.mul_valid, cmt_to_issue_if.fpu_valid, cmt_to_issue_if.gpu_valid}), .read_addr ({cmt_to_issue_if.alu_tag, cmt_to_issue_if.lsu_tag, cmt_to_issue_if.csr_tag, cmt_to_issue_if.mul_tag, cmt_to_issue_if.fpu_tag, cmt_to_issue_if.gpu_tag}), .read_data ({cmt_to_issue_if.alu_data, cmt_to_issue_if.lsu_data, cmt_to_issue_if.csr_data, cmt_to_issue_if.mul_data, cmt_to_issue_if.fpu_data, cmt_to_issue_if.gpu_data}), @@ -135,14 +84,12 @@ module VX_scheduler #( assign decode_if.ready = ~stall; - assign is_empty = (0 == count_valid); - `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin if (stall) begin $display("%t: Core%0d-stall: warp=%0d, PC=%0h, rd=%0d, wb=%0d, ib_full=%b, inuse=%b%b%b%b, gpr=%b, alu=%b, lsu=%b, csr=%b, mul=%b, fpu=%b, gpu=%b", - $time, CORE_ID, decode_if.warp_num, decode_if.curr_PC, decode_if.rd, decode_if.wb, issue_buf_full, rd_inuse_qual, rs1_inuse_qual, - rs2_inuse_qual, rs3_inuse_qual, gpr_busy, alu_busy, lsu_busy, csr_busy, mul_busy, fpu_busy, gpu_busy); + $time, CORE_ID, decode_if.warp_num, decode_if.curr_PC, decode_if.rd, decode_if.wb, issue_buf_full, inuse_mask[decode_if.rd], inuse_mask[decode_if.rs1], + inuse_mask[decode_if.rs2], inuse_mask[decode_if.rs3], gpr_busy, alu_busy, lsu_busy, csr_busy, mul_busy, fpu_busy, gpu_busy); end end `endif diff --git 
a/hw/rtl/VX_warp_sched.v b/hw/rtl/VX_warp_sched.v index 00e44439..04c2d80e 100644 --- a/hw/rtl/VX_warp_sched.v +++ b/hw/rtl/VX_warp_sched.v @@ -205,24 +205,26 @@ module VX_warp_sched #( assign {join_fall, join_pc, join_tm} = ipdom[join_if.warp_num]; genvar i; - for (i = 0; i < `NUM_WARPS; i++) begin : stacks + for (i = 0; i < `NUM_WARPS; i++) begin wire correct_warp_s = (i == warp_ctl_if.warp_num); wire correct_warp_j = (i == join_if.warp_num); wire push = (warp_ctl_if.is_split && warp_ctl_if.do_split) && correct_warp_s; wire pop = join_if.is_join && correct_warp_j; - VX_generic_stack #( + VX_ipdom_stack #( .WIDTH(1+32+`NUM_THREADS), - .DEPTH($clog2(`NUM_THREADS)+1) - ) ipdom_stack( + .DEPTH(`NT_BITS+1) + ) ipdom_stack ( .clk (clk), .reset(reset), .push (push), .pop (pop), .d (ipdom[i]), .q1 (q1), - .q2 (q2) + .q2 (q2), + `UNUSED_PIN (empty), + `UNUSED_PIN (full) ); end diff --git a/hw/rtl/VX_writeback.v b/hw/rtl/VX_writeback.v index 7230502f..953db5d1 100644 --- a/hw/rtl/VX_writeback.v +++ b/hw/rtl/VX_writeback.v @@ -24,7 +24,6 @@ module VX_writeback #( reg [`NUM_THREADS-1:0] wb_thread_mask [`ISSUEQ_SIZE-1:0]; reg [31:0] wb_curr_PC [`ISSUEQ_SIZE-1:0]; reg [`NR_BITS-1:0] wb_rd [`ISSUEQ_SIZE-1:0]; - reg wb_rd_is_fp [`ISSUEQ_SIZE-1:0]; reg [`ISSUEQ_SIZE-1:0] wb_pending, wb_pending_n; reg [`ISTAG_BITS-1:0] wb_index; @@ -75,7 +74,6 @@ module VX_writeback #( wb_thread_mask [alu_commit_if.issue_tag] <= cmt_to_issue_if.alu_data.thread_mask; wb_curr_PC [alu_commit_if.issue_tag] <= cmt_to_issue_if.alu_data.curr_PC; wb_rd [alu_commit_if.issue_tag] <= cmt_to_issue_if.alu_data.rd; - wb_rd_is_fp [alu_commit_if.issue_tag] <= cmt_to_issue_if.alu_data.rd_is_fp; end if (lsu_commit_if.valid) begin wb_data [lsu_commit_if.issue_tag] <= lsu_commit_if.data; @@ -83,7 +81,6 @@ module VX_writeback #( wb_thread_mask [lsu_commit_if.issue_tag] <= cmt_to_issue_if.lsu_data.thread_mask; wb_curr_PC [lsu_commit_if.issue_tag] <= cmt_to_issue_if.lsu_data.curr_PC; wb_rd [lsu_commit_if.issue_tag] <= 
cmt_to_issue_if.lsu_data.rd; - wb_rd_is_fp [lsu_commit_if.issue_tag] <= cmt_to_issue_if.lsu_data.rd_is_fp; end if (csr_commit_if.valid) begin wb_data [csr_commit_if.issue_tag] <= csr_commit_if.data; @@ -91,7 +88,6 @@ module VX_writeback #( wb_thread_mask [csr_commit_if.issue_tag] <= cmt_to_issue_if.csr_data.thread_mask; wb_curr_PC [csr_commit_if.issue_tag] <= cmt_to_issue_if.csr_data.curr_PC; wb_rd [csr_commit_if.issue_tag] <= cmt_to_issue_if.csr_data.rd; - wb_rd_is_fp [csr_commit_if.issue_tag] <= cmt_to_issue_if.csr_data.rd_is_fp; end if (mul_commit_if.valid) begin wb_data [mul_commit_if.issue_tag] <= mul_commit_if.data; @@ -99,7 +95,6 @@ module VX_writeback #( wb_thread_mask [mul_commit_if.issue_tag] <= cmt_to_issue_if.mul_data.thread_mask; wb_curr_PC [mul_commit_if.issue_tag] <= cmt_to_issue_if.mul_data.curr_PC; wb_rd [mul_commit_if.issue_tag] <= cmt_to_issue_if.mul_data.rd; - wb_rd_is_fp [mul_commit_if.issue_tag] <= cmt_to_issue_if.mul_data.rd_is_fp; end if (fpu_commit_if.valid) begin wb_data [fpu_commit_if.issue_tag] <= fpu_commit_if.data; @@ -107,7 +102,6 @@ module VX_writeback #( wb_thread_mask [fpu_commit_if.issue_tag] <= cmt_to_issue_if.fpu_data.thread_mask; wb_curr_PC [fpu_commit_if.issue_tag] <= cmt_to_issue_if.fpu_data.curr_PC; wb_rd [fpu_commit_if.issue_tag] <= cmt_to_issue_if.fpu_data.rd; - wb_rd_is_fp [fpu_commit_if.issue_tag] <= cmt_to_issue_if.fpu_data.rd_is_fp; end wb_pending <= wb_pending_n; @@ -122,7 +116,6 @@ module VX_writeback #( assign writeback_if.thread_mask = wb_thread_mask [wb_index]; assign writeback_if.curr_PC = wb_curr_PC [wb_index]; assign writeback_if.rd = wb_rd [wb_index]; - assign writeback_if.rd_is_fp = wb_rd_is_fp [wb_index]; assign writeback_if.data = wb_data [wb_index]; // commit back-pressure diff --git a/hw/rtl/interfaces/VX_decode_if.v b/hw/rtl/interfaces/VX_decode_if.v index 1802801d..1fd5b96e 100644 --- a/hw/rtl/interfaces/VX_decode_if.v +++ b/hw/rtl/interfaces/VX_decode_if.v @@ -21,16 +21,12 @@ interface VX_decode_if (); 
wire rs1_is_PC; wire rs2_is_imm; - - wire use_rs1; - wire use_rs2; + + wire [`NUM_REGS-1:0] reg_use_mask; // FP states wire [`NR_BITS-1:0] rs3; - wire use_rs3; - wire rd_is_fp; - wire rs1_is_fp; - wire rs2_is_fp; + wire use_rs3; wire [`FRM_BITS-1:0] frm; wire wb; diff --git a/hw/rtl/interfaces/VX_gpr_read_if.v b/hw/rtl/interfaces/VX_gpr_read_if.v index 44d2475c..9b24ce56 100644 --- a/hw/rtl/interfaces/VX_gpr_read_if.v +++ b/hw/rtl/interfaces/VX_gpr_read_if.v @@ -13,10 +13,7 @@ interface VX_gpr_read_if (); wire [`NR_BITS-1:0] rs2; wire [`NR_BITS-1:0] rs3; - wire use_rs3; - - wire rs1_is_fp; - wire rs2_is_fp; + wire use_rs3; wire [`NUM_THREADS-1:0][31:0] rs1_data; wire [`NUM_THREADS-1:0][31:0] rs2_data; diff --git a/hw/rtl/interfaces/VX_wb_if.v b/hw/rtl/interfaces/VX_wb_if.v index c3e3ec34..859db75b 100644 --- a/hw/rtl/interfaces/VX_wb_if.v +++ b/hw/rtl/interfaces/VX_wb_if.v @@ -14,8 +14,8 @@ interface VX_wb_if (); `IGNORE_WARNINGS_END wire [`NR_BITS-1:0] rd; - wire rd_is_fp; wire [`NUM_THREADS-1:0][31:0] data; + wire ready; endinterface diff --git a/hw/rtl/libs/VX_cam_buffer.v b/hw/rtl/libs/VX_cam_buffer.v index ad877564..06379f15 100644 --- a/hw/rtl/libs/VX_cam_buffer.v +++ b/hw/rtl/libs/VX_cam_buffer.v @@ -12,7 +12,7 @@ module VX_cam_buffer #( output wire [ADDRW-1:0] write_addr, input wire acquire_slot, input wire [RPORTS-1:0][ADDRW-1:0] read_addr, - output reg [RPORTS-1:0][DATAW-1:0] read_data, + output reg [RPORTS-1:0][DATAW-1:0] read_data, input wire [RPORTS-1:0] release_slot, output wire full ); diff --git a/hw/rtl/libs/VX_generic_queue.v b/hw/rtl/libs/VX_generic_queue.v index 8beff584..ba4a3228 100644 --- a/hw/rtl/libs/VX_generic_queue.v +++ b/hw/rtl/libs/VX_generic_queue.v @@ -52,11 +52,7 @@ module VX_generic_queue #( end else begin // (SIZE > 1) - `ifdef QUEUE_FORCE_MLAB - (* syn_ramstyle = "mlab" *) reg [DATAW-1:0] data [SIZE-1:0]; - `else - reg [DATAW-1:0] data [SIZE-1:0]; - `endif + `USE_FAST_BRAM reg [DATAW-1:0] data [SIZE-1:0]; if (0 == BUFFERED) begin 
diff --git a/hw/rtl/libs/VX_generic_stack.v b/hw/rtl/libs/VX_generic_stack.v deleted file mode 100644 index 252d3032..00000000 --- a/hw/rtl/libs/VX_generic_stack.v +++ /dev/null @@ -1,34 +0,0 @@ - -`include "VX_platform.vh" - -module VX_generic_stack #( - parameter WIDTH = 1, - parameter DEPTH = 1 -) ( - input wire clk, - input wire reset, - input wire push, - input wire pop, - input reg [WIDTH - 1:0] q1, - input reg [WIDTH - 1:0] q2, - output wire[WIDTH - 1:0] d -); - - reg [DEPTH - 1:0] ptr; - reg [WIDTH - 1:0] stack [0:(1 << DEPTH) - 1]; - - always @(posedge clk) begin - if (reset) begin - ptr <= 0; - end else if (push) begin - stack[ptr] <= q1; - stack[ptr+1] <= q2; - ptr <= ptr + 2; - end else if (pop) begin - ptr <= ptr - 1; - end - end - - assign d = stack[ptr - 1]; - -endmodule \ No newline at end of file diff --git a/hw/rtl/libs/VX_index_queue.v b/hw/rtl/libs/VX_index_queue.v index ed8e0970..75252e56 100644 --- a/hw/rtl/libs/VX_index_queue.v +++ b/hw/rtl/libs/VX_index_queue.v @@ -15,7 +15,7 @@ module VX_index_queue #( input wire [`LOG2UP(SIZE)-1:0] read_addr, output wire [DATAW-1:0] read_data ); - reg [DATAW-1:0] data [SIZE-1:0]; + `USE_FAST_BRAM reg [DATAW-1:0] data [SIZE-1:0]; reg [SIZE-1:0] valid; reg [`LOG2UP(SIZE):0] rd_ptr, wr_ptr; diff --git a/hw/unit_tests/VX_divide_tb.v b/hw/unit_tests/VX_divide_tb.v index 1b14f526..92ccfac7 100644 --- a/hw/unit_tests/VX_divide_tb.v +++ b/hw/unit_tests/VX_divide_tb.v @@ -19,7 +19,7 @@ module VX_tb_divide(); genvar i; generate - for (i = 0; i < 8; i++) begin : div_loop + for (i = 0; i < 8; i++) begin VX_divide#( .WIDTHN(32), .WIDTHD(32),