diff --git a/hw/rtl/VX_alu_unit.v b/hw/rtl/VX_alu_unit.v index 55ea3dd4..15d029fe 100644 --- a/hw/rtl/VX_alu_unit.v +++ b/hw/rtl/VX_alu_unit.v @@ -1,6 +1,4 @@ `include "VX_define.vh" -`include "fpnew_pkg.sv" -`include "defs_div_sqrt_mvp.sv" module VX_alu_unit #( parameter CORE_ID = 0 diff --git a/hw/rtl/VX_commit.v b/hw/rtl/VX_commit.v index 457d8308..c4e4d294 100644 --- a/hw/rtl/VX_commit.v +++ b/hw/rtl/VX_commit.v @@ -11,7 +11,7 @@ module VX_commit #( VX_commit_if lsu_commit_if, VX_commit_if mul_commit_if, VX_commit_if csr_commit_if, - VX_commit_if fpu_commit_if, + VX_commit_fp_if fpu_commit_if, VX_commit_if gpu_commit_if, // outputs @@ -70,7 +70,7 @@ module VX_commit #( .fpu_commit_if (fpu_commit_if), .writeback_if (writeback_if) - ); + ); `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index ef4b9e78..95d23af3 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -63,6 +63,10 @@ `define EXT_M_ENABLE 1 `endif +`ifndef EXT_F_ENABLE +`define EXT_F_ENABLE 1 +`endif + // Configuration Values ======================================================= `define VENDOR_ID 0 @@ -71,6 +75,10 @@ // CSR Addresses ============================================================== +`define CSR_FFLAGS 12'hF01 +`define CSR_FRM 12'hF02 +`define CSR_FCSR 12'hF03 + `define CSR_VEND_ID 12'hF11 `define CSR_ARCH_ID 12'hF12 `define CSR_IMPL_ID 12'hF13 @@ -91,6 +99,16 @@ `define CSR_MISA 12'h301 +// Size of MUL Request Queue Size +`ifndef MULRQ_SIZE +`define MULRQ_SIZE 8 +`endif + +// Size of FPU Request Queue Size +`ifndef FPURQ_SIZE +`define FPURQ_SIZE 8 +`endif + // Dcache Configurable Knobs ================================================== // Size of cache in bytes @@ -407,5 +425,4 @@ `define L3PRFQ_STRIDE 0 `endif - // VX_CONFIG `endif diff --git a/hw/rtl/VX_csr_data.v b/hw/rtl/VX_csr_data.v index abe7f7e8..e9c93ab3 100644 --- a/hw/rtl/VX_csr_data.v +++ b/hw/rtl/VX_csr_data.v @@ -4,33 +4,82 @@ module VX_csr_data #( 
parameter CORE_ID = 0 ) ( input wire clk, + input wire reset, + + VX_perf_cntrs_if perf_cntrs_if, + VX_fpu_from_csr_if fpu_from_csr_if, + VX_fpu_to_csr_if fpu_to_csr_if, + + input wire[`NW_BITS-1:0] warp_num, input wire[`CSR_ADDR_SIZE-1:0] read_addr, output reg[31:0] read_data, - input wire write_enable, + input wire write_enable, `IGNORE_WARNINGS_BEGIN // We use a smaller storage for CSRs than the standard 4KB in RISC-V input wire[`CSR_ADDR_SIZE-1:0] write_addr, `IGNORE_WARNINGS_END - input wire[`CSR_WIDTH-1:0] write_data, - input wire[`NW_BITS-1:0] warp_num, - VX_perf_cntrs_if perf_cntrs_if + input wire[`CSR_WIDTH-1:0] write_data ); reg [`CSR_WIDTH-1:0] csr_table[`NUM_CSRS-1:0]; + reg [`FFG_BITS+`FRM_BITS-1:0] fflags_table [`NUM_WARPS-1:0]; + reg [`FRM_BITS-1:0] frm_table [`NUM_WARPS-1:0]; + reg [`FFG_BITS+`FRM_BITS-1:0] fcsr_table [`NUM_WARPS-1:0]; // fflags + frm + // cast address to physical CSR range wire [$clog2(`NUM_CSRS)-1:0] rd_addr, wr_addr; assign rd_addr = $size(rd_addr)'(read_addr); - assign wr_addr = $size(wr_addr)'(write_addr); + assign wr_addr = $size(wr_addr)'(write_addr); + + wire [`FFG_BITS-1:0] fflags_update; + assign fflags_update[4] = fpu_to_csr_if.fflags_NV; + assign fflags_update[3] = fpu_to_csr_if.fflags_DZ; + assign fflags_update[2] = fpu_to_csr_if.fflags_OF; + assign fflags_update[1] = fpu_to_csr_if.fflags_UF; + assign fflags_update[0] = fpu_to_csr_if.fflags_NX; + + integer i; always @(posedge clk) begin - if (write_enable) begin - csr_table[wr_addr] <= write_data; + if (reset) begin + for (i = 0; i < `NUM_WARPS; i++) begin + fflags_table[i] <= 0; + frm_table[i] <= 0; + fcsr_table[i] <= 0; + end + end else begin + if (write_enable) begin + case (write_addr) + `CSR_FFLAGS: begin + fcsr_table[warp_num][`FFG_BITS-1:0] <= write_data[`FFG_BITS-1:0]; + fflags_table[warp_num][`FFG_BITS-1:0] <= write_data[`FFG_BITS-1:0]; + end + `CSR_FRM: begin + fcsr_table[warp_num][`FFG_BITS+`FRM_BITS-1:`FFG_BITS] <= write_data[`FRM_BITS-1:0]; + 
frm_table[warp_num] <= write_data[`FRM_BITS-1:0]; + end + `CSR_FCSR: begin + fcsr_table[warp_num] <= write_data[`FFG_BITS+`FRM_BITS-1:0]; + frm_table[warp_num] <= write_data[`FFG_BITS+`FRM_BITS-1:`FFG_BITS]; + fflags_table[warp_num][`FFG_BITS-1:0] <= write_data[`FFG_BITS-1:0]; + end + default: begin + csr_table[wr_addr] <= write_data; + end + endcase + end else if (fpu_to_csr_if.valid) begin + fflags_table[fpu_to_csr_if.warp_num][`FFG_BITS-1:0] <= fflags_table[fpu_to_csr_if.warp_num][`FFG_BITS-1:0] | fflags_update; /* fflags are accrued (sticky): OR new flags in, only a CSR write may clear them */ + fcsr_table[fpu_to_csr_if.warp_num][`FFG_BITS-1:0] <= fcsr_table[fpu_to_csr_if.warp_num][`FFG_BITS-1:0] | fflags_update; /* keep the fcsr mirror of fflags in sync */ + end end end always @(*) begin case (read_addr) + `CSR_FFLAGS : read_data = 32'(fflags_table[warp_num]); + `CSR_FRM : read_data = 32'(frm_table[warp_num]); + `CSR_FCSR : read_data = 32'(fcsr_table[warp_num]); `CSR_LWID : read_data = 32'(warp_num); `CSR_GTID , `CSR_GWID : read_data = CORE_ID * `NUM_WARPS + 32'(warp_num); @@ -48,6 +97,8 @@ module VX_csr_data #( `CSR_MISA : read_data = `ISA_CODE; default : read_data = 32'(csr_table[rd_addr]); endcase - end + end + + assign fpu_from_csr_if.frm = frm_table[fpu_from_csr_if.warp_num]; endmodule diff --git a/hw/rtl/VX_csr_unit.v b/hw/rtl/VX_csr_unit.v index 96b4364c..96a7973f 100644 --- a/hw/rtl/VX_csr_unit.v +++ b/hw/rtl/VX_csr_unit.v @@ -6,7 +6,9 @@ module VX_csr_unit #( input wire clk, input wire reset, - VX_perf_cntrs_if perf_cntrs_if, + VX_perf_cntrs_if perf_cntrs_if, + + VX_fpu_from_csr_if fpu_from_csr_if, VX_fpu_to_csr_if fpu_to_csr_if, VX_csr_io_req_if csr_io_req_if, @@ -48,13 +50,16 @@ module VX_csr_unit #( .CORE_ID(CORE_ID) ) csr_data ( .clk (clk), + .reset (reset), + .perf_cntrs_if (perf_cntrs_if), + .fpu_to_csr_if (fpu_to_csr_if), + .fpu_from_csr_if(fpu_from_csr_if), .read_addr (csr_pipe_req_if.csr_addr), .read_data (csr_read_data_unqual), .write_enable (is_csr_s2), .write_data (csr_updated_data_s2[`CSR_WIDTH-1:0]), .write_addr (csr_addr_s2), - .warp_num (csr_pipe_req_if.warp_num), - .perf_cntrs_if (perf_cntrs_if) + .warp_num (csr_pipe_req_if.warp_num) ); wire csr_hazard =
(csr_addr_s2 == csr_pipe_req_if.csr_addr) diff --git a/hw/rtl/VX_decode.v b/hw/rtl/VX_decode.v index f5cc7549..96b39503 100644 --- a/hw/rtl/VX_decode.v +++ b/hw/rtl/VX_decode.v @@ -214,14 +214,14 @@ module VX_decode #( 7'h04: fpu_op = `FPU_SUB; 7'h08: fpu_op = `FPU_MUL; 7'h0C: fpu_op = `FPU_DIV; - 7'h2C: fpu_op = `FPU_SQRT; + 7'h10: fpu_op = (func3[1]) ? `FPU_SGNJX : ((func3[0]) ? `FPU_SGNJN : `FPU_SGNJ); 7'h14: fpu_op = (func3 == 3'h0) ? `FPU_MIN : `FPU_MAX; + 7'h2C: fpu_op = `FPU_SQRT; + 7'h50: fpu_op = `FPU_CMP; // wb to intReg 7'h60: fpu_op = (instr[20]) ? `FPU_CVTWUS : `FPU_CVTWS; // doesn't need rs2, and read rs1 from fpReg, WB to intReg 7'h68: fpu_op = (instr[20]) ? `FPU_CVTSWU : `FPU_CVTSW; // doesn't need rs2, and read rs1 from intReg - 7'h70: fpu_op = (func3 == 3'h0) ? `FPU_MVXW : `FPU_CLASS; // both wb to intReg - 7'h78: fpu_op = `FPU_MVWX; - 7'h50: fpu_op = `FPU_CMP; // wb to intReg - 7'h10: fpu_op = (func3[1]) ? `FPU_SGNJX : ((func3[0]) ? `FPU_SGNJN : `FPU_SGNJ); + 7'h70: fpu_op = (func3 == 3'h0) ? 
`FPU_MVXW : `FPU_CLASS; // both wb to intReg + 7'h78: fpu_op = `FPU_MVWX; default:; endcase end @@ -283,15 +283,15 @@ module VX_decode #( assign decode_tmp_if.use_rs2 = (decode_tmp_if.rs2 != 0) && (is_btype || is_stype || is_rtype || (is_gpu && (gpu_op == `GPU_BAR || gpu_op == `GPU_WSPAWN))); - - assign decode_tmp_if.rs1_is_fp = (is_fci && ((func7 != 7'h68) && (fpu_op != `FPU_MVWX)) || is_fr4); + + assign decode_tmp_if.rd_is_fp = is_fpu && ~(is_fci && ((func7 == 7'h50) || (func7 == 7'h60) || (func7 == 7'h70))); + assign decode_tmp_if.rs1_is_fp = is_fci && ((func7 != 7'h68) && (fpu_op != `FPU_MVWX)) || is_fr4; assign decode_tmp_if.rs2_is_fp = is_fs || (is_fci && ((func7 != 7'h60) && (func7 != 7'h68)) || is_fr4); assign decode_tmp_if.rs3 = rs3; assign decode_tmp_if.use_rs3 = is_fr4; assign decode_tmp_if.frm = func3; - assign decode_tmp_if.wb = (is_fpu && (is_fl || (is_fci && ((func7 != 7'h50) || (func7 != 7'h70) || (func7 != 7'h60))) || is_fr4)) - || (~is_fpu && (rd != 0) && (is_itype || is_rtype || is_lui || is_auipc || is_csr || is_jal || is_jalr || is_jals || is_ltype)); + assign decode_tmp_if.wb = is_fpu || ((rd != 0) && (is_itype || is_rtype || is_lui || is_auipc || is_csr || is_jal || is_jalr || is_jals || is_ltype)); assign join_if.is_join = in_valid && is_gpu && (gpu_op == `GPU_JOIN); assign join_if.warp_num = ifetch_rsp_if.warp_num; diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index 8751847e..841df9a3 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -25,10 +25,14 @@ `define IGNORE_WARNINGS_BEGIN /* verilator lint_off UNUSED */ \ /* verilator lint_off PINCONNECTEMPTY */ \ + /* verilator lint_off WIDTH */ \ + /* verilator lint_off UNOPTFLAT */ \ /* verilator lint_off DECLFILENAME */ `define IGNORE_WARNINGS_END /* verilator lint_on UNUSED */ \ /* verilator lint_on PINCONNECTEMPTY */ \ + /* verilator lint_on WIDTH */ \ + /* verilator lint_on UNOPTFLAT */ \ /* verilator lint_on DECLFILENAME */ `define UNUSED_VAR(x) /* verilator lint_off 
UNUSED */ \ @@ -76,9 +80,14 @@ `define CSR_WIDTH 12 -`define DIV_LATENCY 21 +`define LATENCY_IDIV 21 -`define MUL_LATENCY 2 +`define LATENCY_IMUL 2 + +`define LATENCY_FMULADD 2 +`define LATENCY_FDIVSQRT 2 +`define LATENCY_FCONV 2 +`define LATENCY_FNONCOMP 1 /////////////////////////////////////////////////////////////////////////////// @@ -93,6 +102,15 @@ `define INST_R 7'b0110011 `define INST_F 7'b0001111 `define INST_SYS 7'b1110011 + +`define INST_FL 7'b0000111 +`define INST_FS 7'b0100111 +`define INST_FCI 7'b1010011 +`define INST_FMADD 7'b1000011 +`define INST_FMSUB 7'b1000111 +`define INST_FNMSUB 7'b1001011 +`define INST_FNMADD 7'b1001111 + `define INST_GPU 7'b1101011 `define BYTEEN_SB 3'h0 @@ -150,18 +168,6 @@ `define BR_OP(x) x[`BR_BITS-1:0] `define IS_BR_OP(x) x[4] -`define MUL_MUL 3'h0 -`define MUL_MULH 3'h1 -`define MUL_MULHSU 3'h2 -`define MUL_MULHU 3'h3 -`define MUL_DIV 3'h4 -`define MUL_DIVU 3'h5 -`define MUL_REM 3'h6 -`define MUL_REMU 3'h7 -`define MUL_BITS 3 -`define MUL_OP(x) x[`MUL_BITS-1:0] -`define IS_DIV_OP(x) x[2] - `define LSU_LB {1'b0, `BYTEEN_SB} `define LSU_LH {1'b0, `BYTEEN_SH} `define LSU_LW {1'b0, `BYTEEN_SW} @@ -183,6 +189,53 @@ `define CSR_BITS 2 `define CSR_OP(x) x[`CSR_BITS-1:0] +`define MUL_MUL 3'h0 +`define MUL_MULH 3'h1 +`define MUL_MULHSU 3'h2 +`define MUL_MULHU 3'h3 +`define MUL_DIV 3'h4 +`define MUL_DIVU 3'h5 +`define MUL_REM 3'h6 +`define MUL_REMU 3'h7 +`define MUL_BITS 3 +`define MUL_OP(x) x[`MUL_BITS-1:0] +`define IS_DIV_OP(x) x[2] + +`define FPU_ADD 5'h00 +`define FPU_SUB 5'h01 +`define FPU_MUL 5'h02 +`define FPU_DIV 5'h03 +`define FPU_SQRT 5'h04 +`define FPU_MADD 5'h05 +`define FPU_MSUB 5'h06 +`define FPU_NMSUB 5'h07 +`define FPU_NMADD 5'h08 +`define FPU_SGNJ 5'h09 // FSGNJ +`define FPU_SGNJN 5'h0A // FSGNJN +`define FPU_SGNJX 5'h0B // FSGNJX +`define FPU_MIN 5'h0C // FMIN.S +`define FPU_MAX 5'h0D // FMAX.S +`define FPU_CVTWS 5'h0E // FCVT.W.S +`define FPU_CVTWUS 5'h0F // FCVT.WU.S +`define FPU_CVTSW 5'h10 // FCVT.S.W 
+`define FPU_CVTSWU 5'h11 // FCVT.S.WU +`define FPU_MVXW 5'h12 // MOV FP from fpReg to integer reg +`define FPU_MVWX 5'h13 // MOV FP from integer reg to fpReg +`define FPU_CLASS 5'h14 +`define FPU_CMP 5'h15 +`define FPU_OTHER 5'h1f +`define FPU_BITS 5 +`define FPU_OP(x) x[`FPU_BITS-1:0] + +`define FRM_RNE 3'b000 +`define FRM_RTZ 3'b001 +`define FRM_RDN 3'b010 +`define FRM_RUP 3'b011 // positive inf +`define FRM_RMM 3'b100 +`define FRM_DYN 3'b111 +`define FRM_BITS 3 +`define FFG_BITS 5 + `define GPU_TMC 3'h0 `define GPU_WSPAWN 3'h1 `define GPU_SPLIT 3'h2 @@ -194,21 +247,16 @@ `define EX_NOP 3'h0 `define EX_ALU 3'h1 -`define EX_MUL 3'h2 -`define EX_LSU 3'h3 -`define EX_CSR 3'h4 -`define EX_GPU 3'h5 +`define EX_LSU 3'h2 +`define EX_CSR 3'h3 +`define EX_MUL 3'h4 +`define EX_FPU 3'h5 +`define EX_GPU 3'h6 `define EX_BITS 3 -`define NUM_EXS 5 +`define NUM_EXS 6 `define NE_BITS `LOG2UP(`NUM_EXS) -`define WB_NO 2'h0 -`define WB_ALU 2'h1 -`define WB_MEM 2'h2 -`define WB_JAL 2'h3 -`define WB_BITS 2 - /////////////////////////////////////////////////////////////////////////////// `define ISA_CODE (0 << 0) // A - Atomic Instructions extension \ @@ -216,14 +264,14 @@ | (0 << 2) // C - Compressed extension \ | (0 << 3) // D - Double precsision floating-point extension \ | (0 << 4) // E - RV32E base ISA \ - | (0 << 5) // F - Single precsision floating-point extension \ + | (`EXT_F_ENABLE << 5) // F - Single precsision floating-point extension \ | (0 << 6) // G - Additional standard extensions present \ | (0 << 7) // H - Hypervisor mode implemented \ | (1 << 8) // I - RV32I/64I/128I base ISA \ | (0 << 9) // J - Reserved \ | (0 << 10) // K - Reserved \ | (0 << 11) // L - Tentatively reserved for Bit operations extension \ - | (1 << 12) // M - Integer Multiply/Divide extension \ + | (`EXT_M_ENABLE << 12) // M - Integer Multiply/Divide extension \ | (0 << 13) // N - User level interrupts supported \ | (0 << 14) // O - Reserved \ | (0 << 15) // P - Tentatively reserved for Packed-SIMD 
extension \ @@ -241,7 +289,7 @@ /////////////////////////////////////////////////////////////////////////////// `ifdef DBG_CORE_REQ_INFO // pc, wb, rd, warp_num -`define DEBUG_CORE_REQ_MDATA_WIDTH (32 + `WB_BITS + `NR_BITS + `NW_BITS) +`define DEBUG_CORE_REQ_MDATA_WIDTH (32 + 1 + `NR_BITS + `NW_BITS) `else `define DEBUG_CORE_REQ_MDATA_WIDTH 0 `endif @@ -492,16 +540,4 @@ task print_instr_op; end endtask -task print_wb; - input [`WB_BITS-1:0] wb; - begin - case (wb) - `WB_ALU: $write("ALU"); - `WB_MEM: $write("MEM"); - `WB_JAL: $write("JAL"); - default: $write("NO"); - endcase - end -endtask - `endif diff --git a/hw/rtl/VX_execute.v b/hw/rtl/VX_execute.v index 086227bb..6f9716b3 100644 --- a/hw/rtl/VX_execute.v +++ b/hw/rtl/VX_execute.v @@ -35,7 +35,7 @@ module VX_execute #( VX_commit_if lsu_commit_if, VX_commit_if csr_commit_if, VX_commit_if mul_commit_if, - VX_commit_if fpu_commit_if, + VX_commit_fp_if fpu_commit_if, VX_commit_if gpu_commit_if, output wire ebreak @@ -72,6 +72,7 @@ module VX_execute #( .reset (reset), .perf_cntrs_if (perf_cntrs_if), .fpu_to_csr_if (fpu_to_csr_if), + .fpu_from_csr_if(fpu_from_csr_if), .csr_io_req_if (csr_io_req_if), .csr_io_rsp_if (csr_io_rsp_if), .csr_req_if (csr_req_if), diff --git a/hw/rtl/VX_fpu_unit.v b/hw/rtl/VX_fpu_unit.v index 770dc53f..4b22850b 100644 --- a/hw/rtl/VX_fpu_unit.v +++ b/hw/rtl/VX_fpu_unit.v @@ -1,4 +1,6 @@ `include "VX_define.vh" +`include "fpnew_pkg.sv" +`include "defs_div_sqrt_mvp.sv" module VX_fpu_unit #( parameter CORE_ID = 0 @@ -12,7 +14,7 @@ module VX_fpu_unit #( VX_fpu_from_csr_if fpu_from_csr_if, // outputs - VX_commit_if fpu_commit_if, + VX_commit_fp_if fpu_commit_if, VX_fpu_to_csr_if fpu_to_csr_if ); localparam FOP_BITS = fpnew_pkg::OP_BITS; @@ -30,21 +32,21 @@ module VX_fpu_unit #( }; localparam fpnew_pkg::fpu_implementation_t FPU_IMPLEMENTATION = '{ - PipeRegs:'{'{`LATENCY_FMULADD, 0, 0, 0, 0}, // ADDMUL - '{default: `LATENCY_FDIVSQRT}, // DIVSQRT - '{default: `LATENCY_FNONCOMP}, // NONCOMP - 
'{default: `LATENCY_FCONV}}, // CONV - UnitTypes:'{'{default: fpnew_pkg::PARALLEL}, // ADDMUL - '{default: fpnew_pkg::MERGED}, // DIVSQRT - '{default: fpnew_pkg::PARALLEL}, // NONCOMP - '{default: fpnew_pkg::MERGED}}, // CONV + PipeRegs:'{'{`LATENCY_FMULADD, 0, 0, 0, 0}, // ADDMUL + '{default: `LATENCY_FDIVSQRT}, // DIVSQRT + '{default: `LATENCY_FNONCOMP}, // NONCOMP + '{default: `LATENCY_FCONV}}, // CONV + UnitTypes:'{'{default: fpnew_pkg::PARALLEL}, // ADDMUL + '{default: fpnew_pkg::MERGED}, // DIVSQRT + '{default: fpnew_pkg::PARALLEL}, // NONCOMP + '{default: fpnew_pkg::MERGED}}, // CONV PipeConfig: fpnew_pkg::DISTRIBUTED }; - wire fpu_in_ready; - wire fpu_in_valid; - wire fpu_out_ready; - wire fpu_out_valid; + wire fpu_in_ready, fpu_in_valid; + wire fpu_out_ready, fpu_out_valid; + + wire [`LOG2UP(`FPURQ_SIZE)-1:0] fpu_in_tag, fpu_out_tag; wire [2:0][`NUM_THREADS-1:0][31:0] fpu_operands; @@ -52,15 +54,13 @@ module VX_fpu_unit #( wire [FMTF_BITS-1:0] fpu_dst_fmt = fpnew_pkg::FP32; wire [FMTI_BITS-1:0] fpu_int_fmt = fpnew_pkg::INT32; - assign fpu_in_valid = (| fpu_req_if.valid); - assign fpu_operands[0] = fpu_req_if.rs1_data; - assign fpu_operands[1] = fpu_req_if.rs2_data; - assign fpu_operands[2] = fpu_req_if.rs3_data; - assign fpu_req_if.ready = fpu_in_ready; - wire [`NUM_THREADS-1:0][31:0] fpu_result; fpnew_pkg::status_t fpu_status; + assign fpu_from_csr_if.warp_num = fpu_req_if.warp_num; + wire is_dyn_rnd = &(fpu_req_if.frm); + wire [`FRM_BITS-1:0] real_frm = is_dyn_rnd ? 
fpu_from_csr_if.frm : fpu_req_if.frm; + reg [FOP_BITS-1:0] fpu_op; reg [`FRM_BITS-1:0] fpu_rnd; reg fpu_op_mod; @@ -96,10 +96,12 @@ module VX_fpu_unit #( endcase end + assign fpu_operands = {fpu_req_if.rs3_data, fpu_req_if.rs2_data, fpu_req_if.rs1_data}; + fpnew_top #( .Features (FPU_FEATURES), .Implementation (FPU_IMPLEMENTATION), - .TagType (logic) + .TagType (logic [`LOG2UP(`FPURQ_SIZE)-1:0]) ) fpnew_core ( .clk_i (clk), .rst_ni (1'b1), @@ -111,26 +113,59 @@ module VX_fpu_unit #( .dst_fmt_i (fpu_dst_fmt), .int_fmt_i (fpu_int_fmt), .vectorial_op_i (1'b1), - .tag_i (1'b0), + .tag_i (fpu_in_tag), .in_valid_i (fpu_in_valid), .in_ready_o (fpu_in_ready), .flush_i (reset), .result_o (fpu_result), .status_o (fpu_status), - `UNUSED_PIN (tag_o), + .tag_o (fpu_out_tag), .out_valid_o (fpu_out_valid), .out_ready_i (fpu_out_ready), `UNUSED_PIN (busy_o) ); - assign fpu_commit_if.valid = fpu_req_if.valid & {`NUM_THREADS{fpu_out_valid}}; - assign fpu_commit_if.data = fpu_result; - assign fpu_commit_if.wb = fpu_req_if.wb; - assign fpu_commit_if.rd = fpu_req_if.rd; - assign fpu_out_ready = fpu_commit_if.ready; + wire req_push = fpu_req_if.valid && fpu_req_if.ready; + wire req_pop = fpu_out_valid && fpu_out_ready; + wire req_full; + + wire [`NUM_THREADS-1:0] rsp_valid; + wire [`NW_BITS-1:0] rsp_warp_num; + wire [31:0] rsp_curr_PC; + wire rsp_wb; + wire [`NR_BITS-1:0] rsp_rd; + wire rsp_rd_is_fp; + + VX_index_queue #( + .DATAW (`NUM_THREADS + `NW_BITS + 32 + 1 + `NR_BITS + 1), + .SIZE (`FPURQ_SIZE) + ) fpu_req_queue ( + .clk (clk), + .reset (reset), + .write_data ({fpu_req_if.valid, fpu_req_if.warp_num, fpu_req_if.curr_PC, fpu_req_if.wb, fpu_req_if.rd, fpu_req_if.rd_is_fp}), + .write_addr (fpu_in_tag), + .push (req_push), + .full (req_full), + .pop (req_pop), + .read_addr (fpu_out_tag), + .read_data ({rsp_valid, rsp_warp_num, rsp_curr_PC, rsp_wb, rsp_rd, rsp_rd_is_fp}), + `UNUSED_PIN (empty) + ); + + assign fpu_in_valid = (| fpu_req_if.valid) && ~req_full; + assign fpu_req_if.ready 
= fpu_in_ready && ~req_full; + + assign fpu_commit_if.valid = rsp_valid & {`NUM_THREADS{fpu_out_valid}}; + assign fpu_commit_if.warp_num = rsp_warp_num; + assign fpu_commit_if.curr_PC = rsp_curr_PC; + assign fpu_commit_if.data = fpu_result; + assign fpu_commit_if.wb = rsp_wb; + assign fpu_commit_if.rd = rsp_rd; + assign fpu_commit_if.rd_is_fp = rsp_rd_is_fp; + assign fpu_out_ready = fpu_commit_if.ready; assign fpu_to_csr_if.valid = fpu_out_valid; - assign fpu_to_csr_if.warp_num = fpu_req_if.warp_num; + assign fpu_to_csr_if.warp_num = rsp_warp_num; assign fpu_to_csr_if.fflags_NV = fpu_status.NV; assign fpu_to_csr_if.fflags_DZ = fpu_status.DZ; assign fpu_to_csr_if.fflags_OF = fpu_status.OF; diff --git a/hw/rtl/VX_gpr_fp_ctrl.v b/hw/rtl/VX_gpr_fp_ctrl.v index bacc36a5..1c7407f1 100644 --- a/hw/rtl/VX_gpr_fp_ctrl.v +++ b/hw/rtl/VX_gpr_fp_ctrl.v @@ -50,7 +50,7 @@ module VX_gpr_fp_ctrl ( if (decode_if.rs1_is_fp) begin tmp_rs1_data <= rs1_fp_data; end else begin - tmp_rs1_data <= decode_if.rs1_is_PC ? {`NUM_THREADS{decode_if.curr_PC}} : rs1_int_data; + tmp_rs1_data <= decode_if.rs1_is_PC ? 
{`NUM_THREADS{decode_if.curr_PC}} : rs1_int_data; end end end diff --git a/hw/rtl/VX_gpr_stage.v b/hw/rtl/VX_gpr_stage.v index 01c9c281..37da6ed9 100644 --- a/hw/rtl/VX_gpr_stage.v +++ b/hw/rtl/VX_gpr_stage.v @@ -34,7 +34,7 @@ module VX_gpr_stage #( // Int GPRs VX_gpr_ram gpr_int_ram ( .clk (clk), - .we (we[i] & {`NUM_THREADS{~writeback_if.is_fp}}), + .we (we[i] & {`NUM_THREADS{~writeback_if.rd_is_fp}}), .waddr (writeback_if.rd), .wdata (writeback_if.data), .rs1 (raddr1), @@ -46,7 +46,7 @@ module VX_gpr_stage #( // FP GPRs VX_gpr_ram gpr_fp_ram ( .clk (clk), - .we (we[i] & {`NUM_THREADS{writeback_if.is_fp}}), + .we (we[i] & {`NUM_THREADS{writeback_if.rd_is_fp}}), .waddr (writeback_if.rd), .wdata (writeback_if.data), .rs1 (raddr1), diff --git a/hw/rtl/VX_icache_stage.v b/hw/rtl/VX_icache_stage.v index 37eef92c..e2be9f37 100644 --- a/hw/rtl/VX_icache_stage.v +++ b/hw/rtl/VX_icache_stage.v @@ -23,7 +23,7 @@ module VX_icache_stage #( wire valid_inst = (| ifetch_req_if.valid); - wire [`LOG2UP(`ICREQ_SIZE)-1:0] mrq_write_addr, mrq_read_addr, dbg_mrq_write_addr; + wire [`LOG2UP(`ICREQ_SIZE)-1:0] mrq_write_addr, mrq_read_addr; wire mrq_full; wire mrq_push = icache_req_if.valid && icache_req_if.ready; @@ -32,18 +32,18 @@ module VX_icache_stage #( assign mrq_read_addr = icache_rsp_if.tag[0][`LOG2UP(`ICREQ_SIZE)-1:0]; VX_index_queue #( - .DATAW (`LOG2UP(`ICREQ_SIZE) + 32 + `NW_BITS), + .DATAW (32 + `NW_BITS), .SIZE (`ICREQ_SIZE) ) mem_req_queue ( .clk (clk), .reset (reset), - .write_data ({mrq_write_addr, ifetch_req_if.curr_PC, ifetch_req_if.warp_num}), + .write_data ({ifetch_req_if.curr_PC, ifetch_req_if.warp_num}), .write_addr (mrq_write_addr), .push (mrq_push), .full (mrq_full), .pop (mrq_pop), .read_addr (mrq_read_addr), - .read_data ({dbg_mrq_write_addr, ifetch_rsp_if.curr_PC, ifetch_rsp_if.warp_num}), + .read_data ({ifetch_rsp_if.curr_PC, ifetch_rsp_if.warp_num}), `UNUSED_PIN (empty) ); @@ -51,9 +51,6 @@ module VX_icache_stage #( if (mrq_push) begin 
valid_threads[ifetch_req_if.warp_num] <= ifetch_req_if.valid; end - if (mrq_pop) begin - assert(mrq_read_addr == dbg_mrq_write_addr); - end end // Icache Request @@ -67,7 +64,7 @@ module VX_icache_stage #( assign ifetch_req_if.ready = !mrq_full && icache_req_if.ready; `ifdef DBG_CORE_REQ_INFO - assign icache_req_if.tag = {ifetch_req_if.curr_PC, 2'b1, 5'b0, ifetch_req_if.warp_num, mrq_write_addr}; + assign icache_req_if.tag = {ifetch_req_if.curr_PC, 1'b0, 5'b0, ifetch_req_if.warp_num, mrq_write_addr}; `else assign icache_req_if.tag = mrq_write_addr; `endif diff --git a/hw/rtl/VX_issue.v b/hw/rtl/VX_issue.v index 261a0927..3392f719 100644 --- a/hw/rtl/VX_issue.v +++ b/hw/rtl/VX_issue.v @@ -64,7 +64,7 @@ module VX_issue #( VX_fpu_req_if fpu_req_tmp_if(); VX_gpu_req_if gpu_req_tmp_if(); - VX_issue_mux issue_mux ( + VX_issue_demux issue_demux ( .decode_if (decode_if), .gpr_data_if (gpr_data_if), .alu_req_if (alu_req_tmp_if), @@ -134,14 +134,14 @@ module VX_issue #( ); VX_generic_register #( - .N(`NUM_THREADS +`NW_BITS + 32 + `FPU_BITS + 1 + `NR_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + `FRM_BITS) + .N(`NUM_THREADS +`NW_BITS + 32 + `FPU_BITS + 1 + `NR_BITS + 1 + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + `FRM_BITS) ) fpu_reg ( .clk (clk), .reset (reset), .stall (stall_fpu), .flush (flush_fpu), - .in ({fpu_req_tmp_if.valid, fpu_req_tmp_if.warp_num, fpu_req_tmp_if.curr_PC, fpu_req_tmp_if.fpu_op, fpu_req_tmp_if.wb, fpu_req_tmp_if.rd, fpu_req_tmp_if.rs1_data, fpu_req_tmp_if.rs2_data, fpu_req_tmp_if.rs3_data, fpu_req_tmp_if.frm}), - .out ({fpu_req_if.valid, fpu_req_if.warp_num, fpu_req_if.curr_PC, fpu_req_if.fpu_op, fpu_req_if.wb, fpu_req_if.rd, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data, fpu_req_if.frm}) + .in ({fpu_req_tmp_if.valid, fpu_req_tmp_if.warp_num, fpu_req_tmp_if.curr_PC, fpu_req_tmp_if.fpu_op, fpu_req_tmp_if.wb, fpu_req_tmp_if.rd, fpu_req_tmp_if.rd_is_fp, fpu_req_tmp_if.rs1_data, 
fpu_req_tmp_if.rs2_data, fpu_req_tmp_if.rs3_data, fpu_req_tmp_if.frm}), + .out ({fpu_req_if.valid, fpu_req_if.warp_num, fpu_req_if.curr_PC, fpu_req_if.fpu_op, fpu_req_if.wb, fpu_req_if.rd, fpu_req_if.rd_is_fp, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data, fpu_req_if.frm}) ); VX_generic_register #( diff --git a/hw/rtl/VX_issue_mux.v b/hw/rtl/VX_issue_mux.v deleted file mode 100644 index 68ec6159..00000000 --- a/hw/rtl/VX_issue_mux.v +++ /dev/null @@ -1,90 +0,0 @@ -`include "VX_define.vh" - -module VX_issue_mux ( - // inputs - VX_decode_if decode_if, - VX_gpr_data_if gpr_data_if, - - // outputs - VX_alu_req_if alu_req_if, - VX_lsu_req_if lsu_req_if, - VX_csr_req_if csr_req_if, - VX_mul_req_if mul_req_if, - VX_fpu_req_if fpu_req_if, - VX_gpu_req_if gpu_req_if -); - - wire[`NUM_THREADS-1:0] is_alu = {`NUM_THREADS{decode_if.ex_type == `EX_ALU}}; - wire[`NUM_THREADS-1:0] is_lsu = {`NUM_THREADS{decode_if.ex_type == `EX_LSU}}; - wire[`NUM_THREADS-1:0] is_csr = {`NUM_THREADS{decode_if.ex_type == `EX_CSR}}; - wire[`NUM_THREADS-1:0] is_mul = {`NUM_THREADS{decode_if.ex_type == `EX_MUL}}; - wire[`NUM_THREADS-1:0] is_fpu = {`NUM_THREADS{decode_if.ex_type == `EX_FPU}}; - wire[`NUM_THREADS-1:0] is_gpu = {`NUM_THREADS{decode_if.ex_type == `EX_GPU}}; - - // ALU unit - assign alu_req_if.valid = decode_if.valid & is_alu; - assign alu_req_if.warp_num = decode_if.warp_num; - assign alu_req_if.curr_PC = decode_if.curr_PC; - assign alu_req_if.alu_op = `ALU_OP(decode_if.instr_op); - assign alu_req_if.rd = decode_if.rd; - assign alu_req_if.wb = decode_if.wb; - assign alu_req_if.rs1_data = gpr_data_if.rs1_data; - assign alu_req_if.rs2_data = gpr_data_if.rs2_data; - assign alu_req_if.offset = decode_if.imm; - assign alu_req_if.next_PC = decode_if.next_PC; - - // LSU unit - assign lsu_req_if.valid = decode_if.valid & is_lsu; - assign lsu_req_if.warp_num = decode_if.warp_num; - assign lsu_req_if.curr_PC = decode_if.curr_PC; - assign lsu_req_if.base_addr = gpr_data_if.rs1_data; 
- assign lsu_req_if.store_data = gpr_data_if.rs2_data; - assign lsu_req_if.offset = decode_if.imm; - assign lsu_req_if.rw = `LSU_RW(decode_if.instr_op); - assign lsu_req_if.byteen = `LSU_BE(decode_if.instr_op); - assign lsu_req_if.rd = decode_if.rd; - assign lsu_req_if.wb = decode_if.wb; - - // CSR unit - assign csr_req_if.valid = decode_if.valid & is_csr; - assign csr_req_if.warp_num = decode_if.warp_num; - assign csr_req_if.curr_PC = decode_if.curr_PC; - assign csr_req_if.csr_op = `CSR_OP(decode_if.instr_op); - assign csr_req_if.csr_addr = decode_if.imm[`CSR_ADDR_SIZE-1:0]; - assign csr_req_if.csr_mask = decode_if.rs2_is_imm ? 32'(decode_if.rs1) : gpr_data_if.rs1_data[0]; - assign csr_req_if.rd = decode_if.rd; - assign csr_req_if.wb = decode_if.wb; - assign csr_req_if.is_io = 1'b0; - - // MUL unit - assign mul_req_if.valid = decode_if.valid & is_mul; - assign mul_req_if.warp_num = decode_if.warp_num; - assign mul_req_if.curr_PC = decode_if.curr_PC; - assign mul_req_if.mul_op = `MUL_OP(decode_if.instr_op); - assign mul_req_if.rs1_data = gpr_data_if.rs1_data; - assign mul_req_if.rs2_data = gpr_data_if.rs2_data; - assign mul_req_if.rd = decode_if.rd; - assign mul_req_if.wb = decode_if.wb; - - // FPU unit - assign fpu_req_if.valid = decode_if.valid & is_fpu; - assign fpu_req_if.warp_num = decode_if.warp_num; - assign fpu_req_if.curr_PC = decode_if.curr_PC; - assign fpu_req_if.fpu_op = `FPU_OP(decode_if.instr_op); - assign fpu_req_if.rs1_data = gpr_data_if.rs1_data; - assign fpu_req_if.rs2_data = gpr_data_if.rs2_data; - assign fpu_req_if.rs3_data = gpr_data_if.rs3_data; - assign fpu_req_if.frm = decode_if.frm; - assign fpu_req_if.rd = decode_if.rd; - assign fpu_req_if.wb = decode_if.wb; - - // GPU unit - assign gpu_req_if.valid = decode_if.valid & is_gpu; - assign gpu_req_if.warp_num = decode_if.warp_num; - assign gpu_req_if.curr_PC = decode_if.curr_PC; - assign gpu_req_if.gpu_op = `GPU_OP(decode_if.instr_op); - assign gpu_req_if.rs1_data = gpr_data_if.rs1_data; - 
assign gpu_req_if.rs2_data = gpr_data_if.rs2_data[0]; - assign gpu_req_if.next_PC = decode_if.next_PC; - -endmodule \ No newline at end of file diff --git a/hw/rtl/VX_lsu_unit.v b/hw/rtl/VX_lsu_unit.v index 886e17bd..06ea3a53 100644 --- a/hw/rtl/VX_lsu_unit.v +++ b/hw/rtl/VX_lsu_unit.v @@ -81,7 +81,7 @@ module VX_lsu_unit #( reg [`NUM_THREADS-1:0] mem_rsp_mask[`DCREQ_SIZE-1:0]; - wire [`LOG2UP(`DCREQ_SIZE)-1:0] mrq_write_addr, dbg_mrq_write_addr; + wire [`LOG2UP(`DCREQ_SIZE)-1:0] mrq_write_addr; wire [`NUM_THREADS-1:0][1:0] mem_rsp_offset; wire [`BYTEEN_BITS-1:0] core_rsp_mem_read; @@ -97,18 +97,18 @@ module VX_lsu_unit #( wire mrq_pop = mrq_pop_part && (0 == mem_rsp_mask_upd); VX_index_queue #( - .DATAW (`LOG2UP(`DCREQ_SIZE) + 32 + 1 + (`NUM_THREADS * 2) + `BYTEEN_BITS + `NR_BITS + `NW_BITS), + .DATAW (32 + 1 + (`NUM_THREADS * 2) + `BYTEEN_BITS + `NR_BITS + `NW_BITS), .SIZE (`DCREQ_SIZE) ) mem_req_queue ( .clk (clk), .reset (reset), - .write_data ({mrq_write_addr, use_pc, use_wb, use_req_offset, mem_byteen, use_rd, use_warp_num}), + .write_data ({use_pc, use_wb, use_req_offset, mem_byteen, use_rd, use_warp_num}), .write_addr (mrq_write_addr), .push (mrq_push), .full (mrq_full), .pop (mrq_pop), .read_addr (mrq_read_addr), - .read_data ({dbg_mrq_write_addr, lsu_commit_if.curr_PC, lsu_commit_if.wb, mem_rsp_offset, core_rsp_mem_read, lsu_commit_if.rd, lsu_commit_if.warp_num}), + .read_data ({lsu_commit_if.curr_PC, lsu_commit_if.wb, mem_rsp_offset, core_rsp_mem_read, lsu_commit_if.rd, lsu_commit_if.warp_num}), `UNUSED_PIN (empty) ); @@ -117,8 +117,7 @@ module VX_lsu_unit #( mem_rsp_mask[mrq_write_addr] <= use_valid; end if (mrq_pop_part) begin - mem_rsp_mask[mrq_read_addr] <= mem_rsp_mask_upd; - assert(($time < 2) || mrq_read_addr == dbg_mrq_write_addr); + mem_rsp_mask[mrq_read_addr] <= mem_rsp_mask_upd; end end diff --git a/hw/rtl/VX_mul_unit.v b/hw/rtl/VX_mul_unit.v index 4670c224..9ec58aa4 100644 --- a/hw/rtl/VX_mul_unit.v +++ b/hw/rtl/VX_mul_unit.v @@ -75,7 +75,7 
@@ module VX_mul_unit #( default: alu_result[i] = alu_in1[i] + alu_in2[i]; // ADD, LUI, AUIPC, FENCE endcase end - end + end wire stall; diff --git a/hw/rtl/VX_pipeline.v b/hw/rtl/VX_pipeline.v index ea4e2e92..23c21ab9 100644 --- a/hw/rtl/VX_pipeline.v +++ b/hw/rtl/VX_pipeline.v @@ -119,7 +119,7 @@ module VX_pipeline #( VX_commit_if lsu_commit_if(); VX_commit_if csr_commit_if(); VX_commit_if mul_commit_if(); - VX_commit_if fpu_commit_if(); + VX_commit_fp_if fpu_commit_if(); VX_commit_if gpu_commit_if(); VX_fetch #( diff --git a/hw/rtl/VX_scheduler.v b/hw/rtl/VX_scheduler.v index ec49f26a..f20b8b87 100644 --- a/hw/rtl/VX_scheduler.v +++ b/hw/rtl/VX_scheduler.v @@ -20,14 +20,17 @@ module VX_scheduler #( ); localparam CTVW = `CLOG2(`NUM_WARPS * `NUM_REGS + 1); - reg [`NUM_REGS-1:0][`NUM_THREADS-1:0] rename_table [`NUM_WARPS-1:0]; - reg [`NUM_REGS-1:0] busy_table [`NUM_WARPS-1:0]; - reg [CTVW-1:0] count_valid; + reg [`NUM_THREADS-1:0] rename_table [`NUM_WARPS-1:0][(`NUM_REGS*2)-1:0]; + reg busy_table [`NUM_WARPS-1:0][(`NUM_REGS*2)-1:0]; + reg [CTVW-1:0] count_valid; + + reg [`NR_BITS:0] read_rd = {decode_if.rd_is_fp, decode_if.rd}; + reg [`NR_BITS:0] write_rd = {writeback_if.rd_is_fp, writeback_if.rd}; - wire rs1_rename = busy_table[decode_if.warp_num][decode_if.rs1]; - wire rs2_rename = busy_table[decode_if.warp_num][decode_if.rs2]; - wire rs3_rename = busy_table[decode_if.warp_num][decode_if.rs3]; - wire rd_rename = busy_table[decode_if.warp_num][decode_if.rd]; + wire rs1_rename = busy_table[decode_if.warp_num][{decode_if.rs1_is_fp, decode_if.rs1}]; + wire rs2_rename = busy_table[decode_if.warp_num][{decode_if.rs1_is_fp, decode_if.rs2}]; + wire rs3_rename = busy_table[decode_if.warp_num][{1'b1, decode_if.rs3}]; + wire rd_rename = busy_table[decode_if.warp_num][read_rd]; wire rs1_rename_qual = rs1_rename && decode_if.use_rs1; wire rs2_rename_qual = rs2_rename && decode_if.use_rs2; @@ -50,7 +53,7 @@ module VX_scheduler #( wire release_rd = (| writeback_if.valid); - 
wire [`NUM_THREADS-1:0] valid_wb_new_mask = rename_table[writeback_if.warp_num][writeback_if.rd] & ~writeback_if.valid; + wire [`NUM_THREADS-1:0] valid_wb_new_mask = rename_table[writeback_if.warp_num][write_rd] & ~writeback_if.valid; reg [CTVW-1:0] count_valid_next = (acquire_rd && !(release_rd && (0 == valid_wb_new_mask))) ? (count_valid + 1) : (~acquire_rd && (release_rd && (0 == valid_wb_new_mask))) ? (count_valid - 1) : @@ -67,13 +70,13 @@ module VX_scheduler #( count_valid <= 0; end else begin if (acquire_rd) begin - rename_table[decode_if.warp_num][decode_if.rd] <= decode_if.valid; - busy_table[decode_if.warp_num][decode_if.rd] <= 1; + rename_table[decode_if.warp_num][read_rd] <= decode_if.valid; + busy_table[decode_if.warp_num][read_rd] <= 1; end if (release_rd) begin - assert(rename_table[writeback_if.warp_num][writeback_if.rd] != 0); - rename_table[writeback_if.warp_num][writeback_if.rd] <= valid_wb_new_mask; - busy_table[writeback_if.warp_num][writeback_if.rd] <= (| valid_wb_new_mask); + assert(rename_table[writeback_if.warp_num][write_rd] != 0); + rename_table[writeback_if.warp_num][write_rd] <= valid_wb_new_mask; + busy_table[writeback_if.warp_num][write_rd] <= (| valid_wb_new_mask); end count_valid <= count_valid_next; end diff --git a/hw/rtl/VX_scope.vh b/hw/rtl/VX_scope.vh index 4d11cd4c..2ceea369 100644 --- a/hw/rtl/VX_scope.vh +++ b/hw/rtl/VX_scope.vh @@ -145,7 +145,7 @@ wire [`NUM_THREADS-1:0] scope_writeback_valid; \ wire [`NW_BITS-1:0] scope_writeback_warp_num; \ wire [31:0] scope_writeback_curr_PC; \ - wire [`WB_BITS-1:0] scope_writeback_wb; \ + wire scope_writeback_wb; \ wire [`NR_BITS-1:0] scope_writeback_rd; \ wire [63:0] scope_writeback_data; \ wire scope_bank_valid_st0; \ @@ -224,7 +224,7 @@ output wire [`NUM_THREADS-1:0] scope_writeback_valid, \ output wire [`NW_BITS-1:0] scope_writeback_warp_num, \ output wire [31:0] scope_writeback_curr_PC, \ - output wire [`WB_BITS-1:0] scope_writeback_wb, \ + output wire scope_writeback_wb, \ output 
wire [`NR_BITS-1:0] scope_writeback_rd, \ output wire [63:0] scope_writeback_data, diff --git a/hw/rtl/VX_writeback.v b/hw/rtl/VX_writeback.v index 4b724470..3506e33a 100644 --- a/hw/rtl/VX_writeback.v +++ b/hw/rtl/VX_writeback.v @@ -10,7 +10,7 @@ module VX_writeback #( VX_commit_if alu_commit_if, VX_commit_if lsu_commit_if, VX_commit_if mul_commit_if, - VX_commit_if fpu_commit_if, + VX_commit_fp_if fpu_commit_if, VX_commit_if csr_commit_if, // outputs @@ -26,30 +26,39 @@ module VX_writeback #( VX_wb_if writeback_tmp_if(); assign writeback_tmp_if.valid = lsu_valid ? lsu_commit_if.valid : + fpu_valid ? fpu_commit_if.valid : mul_valid ? mul_commit_if.valid : alu_valid ? alu_commit_if.valid : csr_valid ? csr_commit_if.valid : 0; assign writeback_tmp_if.warp_num = lsu_valid ? lsu_commit_if.warp_num : + fpu_valid ? fpu_commit_if.warp_num : mul_valid ? mul_commit_if.warp_num : alu_valid ? alu_commit_if.warp_num : csr_valid ? csr_commit_if.warp_num : - 0; - - assign writeback_tmp_if.data = lsu_valid ? lsu_commit_if.data : - mul_valid ? mul_commit_if.data : - alu_valid ? alu_commit_if.data : - csr_valid ? csr_commit_if.data : 0; assign writeback_tmp_if.rd = lsu_valid ? lsu_commit_if.rd : + fpu_valid ? fpu_commit_if.rd : mul_valid ? mul_commit_if.rd : alu_valid ? alu_commit_if.rd : csr_valid ? csr_commit_if.rd : 0; - assign writeback_tmp_if.is_fp = fpu_valid && fpu_commit_if.ready; + assign writeback_tmp_if.rd_is_fp = lsu_valid ? 0 : + fpu_valid ? fpu_commit_if.rd_is_fp : + mul_valid ? 0 : + alu_valid ? 0 : + csr_valid ? 0 : + 0; + + assign writeback_tmp_if.data = lsu_valid ? lsu_commit_if.data : + fpu_valid ? fpu_commit_if.data : + mul_valid ? mul_commit_if.data : + alu_valid ? alu_commit_if.data : + csr_valid ? 
csr_commit_if.data : + 0; wire stall = ~writeback_if.ready && (| writeback_if.valid); @@ -60,8 +69,8 @@ module VX_writeback #( .reset (reset), .stall (stall), .flush (0), - .in ({writeback_tmp_if.valid, writeback_tmp_if.warp_num, writeback_tmp_if.rd, writeback_tmp_if.data, writeback_tmp_if.is_fp}), - .out ({writeback_if.valid, writeback_if.warp_num, writeback_if.rd, writeback_if.data, writeback_if.is_fp}) + .in ({writeback_tmp_if.valid, writeback_tmp_if.warp_num, writeback_tmp_if.rd, writeback_tmp_if.rd_is_fp, writeback_tmp_if.data}), + .out ({writeback_if.valid, writeback_if.warp_num, writeback_if.rd, writeback_if.rd_is_fp, writeback_if.data}) ); assign lsu_commit_if.ready = !stall; diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index 2980274d..c99b6fb4 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -106,7 +106,7 @@ module VX_bank #( `ifdef DBG_CORE_REQ_INFO /* verilator lint_off UNUSED */ wire[31:0] debug_use_pc_st0; - wire[`WB_BITS-1:0] debug_wb_st0; + wire debug_wb_st0; wire[`NR_BITS-1:0] debug_rd_st0; wire[`NW_BITS-1:0] debug_warp_num_st0; wire debug_rw_st0; @@ -115,7 +115,7 @@ module VX_bank #( wire[`UP(CORE_TAG_ID_BITS)-1:0] debug_tagid_st0; wire[31:0] debug_use_pc_st1e; - wire[`WB_BITS-1:0] debug_wb_st1e; + wire debug_wb_st1e; wire[`NR_BITS-1:0] debug_rd_st1e; wire[`NW_BITS-1:0] debug_warp_num_st1e; wire debug_rw_st1e; @@ -124,7 +124,7 @@ module VX_bank #( wire[`UP(CORE_TAG_ID_BITS)-1:0] debug_tagid_st1e; wire[31:0] debug_use_pc_st2; - wire[`WB_BITS-1:0] debug_wb_st2; + wire debug_wb_st2; wire[`NR_BITS-1:0] debug_rd_st2; wire[`NW_BITS-1:0] debug_warp_num_st2; wire debug_rw_st2; diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index 7184847b..2f1f3813 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -130,7 +130,7 @@ module VX_cache #( `ifdef DBG_CORE_REQ_INFO /* verilator lint_off UNUSED */ wire[31:0] debug_core_req_use_pc; - wire[`WB_BITS-1:0] debug_core_req_wb; + wire debug_core_req_wb; 
wire[`NR_BITS-1:0] debug_core_req_rd; wire[`NW_BITS-1:0] debug_core_req_warp_num; wire[`LOG2UP(CREQ_SIZE)-1:0] debug_core_req_idx; diff --git a/hw/rtl/cache/VX_snp_forwarder.v b/hw/rtl/cache/VX_snp_forwarder.v index 606570fd..0856ff0d 100644 --- a/hw/rtl/cache/VX_snp_forwarder.v +++ b/hw/rtl/cache/VX_snp_forwarder.v @@ -65,13 +65,13 @@ module VX_snp_forwarder #( ) snp_fwd_queue ( .clk (clk), .reset (reset), - .write_data ({sfq_write_addr, snp_req_addr, snp_req_invalidate, snp_req_tag}), + .write_data ({snp_req_addr, snp_req_invalidate, snp_req_tag}), .write_addr (sfq_write_addr), .push (sfq_push), .pop (sfq_pop), .full (sfq_full), .read_addr (sfq_read_addr), - .read_data ({dbg_sfq_write_addr, snp_rsp_addr, snp_rsp_invalidate, snp_rsp_tag}), + .read_data ({snp_rsp_addr, snp_rsp_invalidate, snp_rsp_tag}), `UNUSED_PIN (empty) ); @@ -81,7 +81,6 @@ module VX_snp_forwarder #( end if (fwdin_fire) begin pending_cntrs[sfq_read_addr] <= pending_cntrs[sfq_read_addr] - 1; - assert(sfq_read_addr == dbg_sfq_write_addr); end end diff --git a/hw/rtl/interfaces/VX_commit_if.v b/hw/rtl/interfaces/VX_commit_if.v index 457add5e..9bf12884 100644 --- a/hw/rtl/interfaces/VX_commit_if.v +++ b/hw/rtl/interfaces/VX_commit_if.v @@ -10,7 +10,7 @@ interface VX_commit_if (); wire [31:0] curr_PC; wire [`NUM_THREADS-1:0][31:0] data; wire [`NR_BITS-1:0] rd; - wire wb; + wire wb; wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_decode_if.v b/hw/rtl/interfaces/VX_decode_if.v index 12a45198..ee7b4bf1 100644 --- a/hw/rtl/interfaces/VX_decode_if.v +++ b/hw/rtl/interfaces/VX_decode_if.v @@ -27,8 +27,9 @@ interface VX_decode_if (); // FP states wire [`NR_BITS-1:0] rs3; wire use_rs3; + wire rd_is_fp; wire rs1_is_fp; - wire rs2_is_fp; + wire rs2_is_fp; wire [`FRM_BITS-1:0] frm; wire wb; diff --git a/hw/rtl/interfaces/VX_fpu_from_csr_if.v b/hw/rtl/interfaces/VX_fpu_from_csr_if.v index 9cf03d37..508d1a94 100644 --- a/hw/rtl/interfaces/VX_fpu_from_csr_if.v +++ 
b/hw/rtl/interfaces/VX_fpu_from_csr_if.v @@ -5,11 +5,8 @@ interface VX_fpu_from_csr_if (); -`IGNORE_WARNINGS_BEGIN - - wire [`NUM_WARPS-1:0][`FRM_BITS-1:0] frm; - -`IGNORE_WARNINGS_END + wire [`NW_BITS-1:0] warp_num; + wire [`FRM_BITS-1:0] frm; endinterface diff --git a/hw/rtl/interfaces/VX_fpu_req_if.v b/hw/rtl/interfaces/VX_fpu_req_if.v index c35f83d3..4d29fa8b 100644 --- a/hw/rtl/interfaces/VX_fpu_req_if.v +++ b/hw/rtl/interfaces/VX_fpu_req_if.v @@ -14,6 +14,7 @@ interface VX_fpu_req_if (); wire wb; wire [`NR_BITS-1:0] rd; + wire rd_is_fp; wire [`NUM_THREADS-1:0][31:0] rs1_data; wire [`NUM_THREADS-1:0][31:0] rs2_data; diff --git a/hw/rtl/interfaces/VX_fpu_to_csr_if.v b/hw/rtl/interfaces/VX_fpu_to_csr_if.v index 6d57da1d..b4471632 100644 --- a/hw/rtl/interfaces/VX_fpu_to_csr_if.v +++ b/hw/rtl/interfaces/VX_fpu_to_csr_if.v @@ -5,7 +5,6 @@ interface VX_fpu_to_csr_if (); -`IGNORE_WARNINGS_BEGIN wire valid; wire [`NW_BITS-1:0] warp_num; @@ -16,8 +15,6 @@ interface VX_fpu_to_csr_if (); wire fflags_UF; wire fflags_NX; -`IGNORE_WARNINGS_END - endinterface `endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_wb_if.v b/hw/rtl/interfaces/VX_wb_if.v index 9d7aaa7a..968fec13 100644 --- a/hw/rtl/interfaces/VX_wb_if.v +++ b/hw/rtl/interfaces/VX_wb_if.v @@ -8,8 +8,8 @@ interface VX_wb_if (); wire [`NUM_THREADS-1:0] valid; wire [`NW_BITS-1:0] warp_num; wire [`NR_BITS-1:0] rd; - wire [`NUM_THREADS-1:0][31:0] data; - wire is_fp; + wire rd_is_fp; + wire [`NUM_THREADS-1:0][31:0] data; wire ready; endinterface diff --git a/hw/simulate/Makefile b/hw/simulate/Makefile index c755d5d2..c24bea0a 100644 --- a/hw/simulate/Makefile +++ b/hw/simulate/Makefile @@ -17,7 +17,9 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE DBG_FLAGS += $(DBG_PRINT_FLAGS) DBG_FLAGS += -DDBG_CORE_REQ_INFO -INCLUDE = -I../rtl/ -I../rtl/libs -I../rtl/interfaces -I../rtl/cache -I../rtl/simulate +INCLUDE = -I../rtl/ -I../rtl/libs -I../rtl/interfaces -I../rtl/cache -I../rtl/fp_cores -I../rtl/simulate + +INCLUDE 
+= -I../rtl/fp_cores/fpnew/src/common_cells/include -I../rtl/fp_cores/fpnew/src/common_cells/src -I../rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I../rtl/fp_cores/fpnew/src SRCS = simulator.cpp testbench.cpp @@ -29,6 +31,8 @@ VF += --language 1800-2009 --assert -Wall -Wpedantic VF += -Wno-DECLFILENAME VF += --x-initial unique --x-assign unique VF += -exe $(SRCS) $(INCLUDE) +VF += -cc Vortex.v -top-module Vortex +VF += verilator.vlt DBG += -DVCD_OUTPUT $(DBG_FLAGS) DBG += -DDBG_CORE_REQ_INFO @@ -36,22 +40,22 @@ DBG += -DDBG_CORE_REQ_INFO THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))') gen-s: - verilator $(VF) -DNDEBUG -cc Vortex.v $(SINGLECORE) -CFLAGS '$(CF) -DNDEBUG $(SINGLECORE)' + verilator $(VF) -DNDEBUG $(SINGLECORE) -CFLAGS '$(CF) -DNDEBUG $(SINGLECORE)' gen-sd: - verilator $(VF) -cc Vortex.v $(SINGLECORE) -CFLAGS '$(CF) -g -O0 $(DBG) $(SINGLECORE)' --trace $(DBG) + verilator $(VF) $(SINGLECORE) -CFLAGS '$(CF) -g -O0 $(DBG) $(SINGLECORE)' --trace $(DBG) gen-st: - verilator $(VF) -DNDEBUG -cc Vortex.v $(SINGLECORE) -CFLAGS '$(CF) -DNDEBUG -O2 $(SINGLECORE)' --threads $(THREADS) + verilator $(VF) -DNDEBUG $(SINGLECORE) -CFLAGS '$(CF) -DNDEBUG -O2 $(SINGLECORE)' --threads $(THREADS) gen-m: - verilator $(VF) -DNDEBUG -cc Vortex.v $(MULTICORE) -CFLAGS '$(CF) -DNDEBUG $(MULTICORE)' + verilator $(VF) -DNDEBUG $(MULTICORE) -CFLAGS '$(CF) -DNDEBUG $(MULTICORE)' gen-md: - verilator $(VF) -cc Vortex.v $(MULTICORE) -CFLAGS '$(CF) -g -O0 $(DBG) $(MULTICORE)' --trace $(DBG) + verilator $(VF) $(MULTICORE) -CFLAGS '$(CF) -g -O0 $(DBG) $(MULTICORE)' --trace $(DBG) gen-mt: - verilator $(VF) -DNDEBUG -cc Vortex.v $(MULTICORE) -CFLAGS '$(CF) -DNDEBUG -O2 $(MULTICORE)' --threads $(THREADS) + verilator $(VF) -DNDEBUG $(MULTICORE) -CFLAGS '$(CF) -DNDEBUG -O2 $(MULTICORE)' --threads $(THREADS) build-s: gen-s (cd obj_dir && make -j -f VVortex.mk)