pipeline optimization: fixed GPR fanout delay to execute units

This commit is contained in:
Blaise Tine
2020-11-07 02:01:21 -08:00
parent af2bb3b789
commit b14007f930
13 changed files with 155 additions and 151 deletions

View File

@@ -7,7 +7,7 @@ module VX_csr_data #(
input wire reset, input wire reset,
VX_cmt_to_csr_if cmt_to_csr_if, VX_cmt_to_csr_if cmt_to_csr_if,
VX_csr_to_issue_if csr_to_issue_if, VX_csr_to_fpu_if csr_to_fpu_if,
input wire read_enable, input wire read_enable,
input wire[`CSR_ADDR_BITS-1:0] read_addr, input wire[`CSR_ADDR_BITS-1:0] read_addr,
@@ -144,6 +144,6 @@ module VX_csr_data #(
end end
assign read_data = read_data_r; assign read_data = read_data_r;
assign csr_to_issue_if.frm = csr_frm[csr_to_issue_if.wid]; assign csr_to_fpu_if.frm = csr_frm[csr_to_fpu_if.wid];
endmodule endmodule

View File

@@ -7,7 +7,7 @@ module VX_csr_unit #(
input wire reset, input wire reset,
VX_cmt_to_csr_if cmt_to_csr_if, VX_cmt_to_csr_if cmt_to_csr_if,
VX_csr_to_issue_if csr_to_issue_if, VX_csr_to_fpu_if csr_to_fpu_if,
VX_csr_io_req_if csr_io_req_if, VX_csr_io_req_if csr_io_req_if,
VX_csr_io_rsp_if csr_io_rsp_if, VX_csr_io_rsp_if csr_io_rsp_if,
@@ -47,7 +47,7 @@ module VX_csr_unit #(
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.cmt_to_csr_if (cmt_to_csr_if), .cmt_to_csr_if (cmt_to_csr_if),
.csr_to_issue_if(csr_to_issue_if), .csr_to_fpu_if (csr_to_fpu_if),
.read_enable (csr_pipe_req_if.valid), .read_enable (csr_pipe_req_if.valid),
.read_addr (csr_pipe_req_if.csr_addr), .read_addr (csr_pipe_req_if.csr_addr),
.read_wid (csr_pipe_req_if.wid), .read_wid (csr_pipe_req_if.wid),

View File

@@ -28,7 +28,6 @@ module VX_execute #(
VX_gpu_req_if gpu_req_if, VX_gpu_req_if gpu_req_if,
// outputs // outputs
VX_csr_to_issue_if csr_to_issue_if,
VX_branch_ctl_if branch_ctl_if, VX_branch_ctl_if branch_ctl_if,
VX_warp_ctl_if warp_ctl_if, VX_warp_ctl_if warp_ctl_if,
VX_exu_to_cmt_if alu_commit_if, VX_exu_to_cmt_if alu_commit_if,
@@ -41,6 +40,7 @@ module VX_execute #(
input wire busy, input wire busy,
output wire ebreak output wire ebreak
); );
VX_csr_to_fpu_if csr_to_fpu_if();
VX_alu_unit #( VX_alu_unit #(
.CORE_ID(CORE_ID) .CORE_ID(CORE_ID)
@@ -70,7 +70,7 @@ module VX_execute #(
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.cmt_to_csr_if (cmt_to_csr_if), .cmt_to_csr_if (cmt_to_csr_if),
.csr_to_issue_if(csr_to_issue_if), .csr_to_fpu_if (csr_to_fpu_if),
.csr_io_req_if (csr_io_req_if), .csr_io_req_if (csr_io_req_if),
.csr_io_rsp_if (csr_io_rsp_if), .csr_io_rsp_if (csr_io_rsp_if),
.csr_req_if (csr_req_if), .csr_req_if (csr_req_if),
@@ -105,6 +105,7 @@ module VX_execute #(
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.fpu_req_if (fpu_req_if), .fpu_req_if (fpu_req_if),
.csr_to_fpu_if (csr_to_fpu_if),
.fpu_commit_if (fpu_commit_if) .fpu_commit_if (fpu_commit_if)
); );
`else `else

View File

@@ -9,6 +9,7 @@ module VX_fpu_unit #(
// inputs // inputs
VX_fpu_req_if fpu_req_if, VX_fpu_req_if fpu_req_if,
VX_csr_to_fpu_if csr_to_fpu_if,
// outputs // outputs
VX_fpu_to_cmt_if fpu_commit_if VX_fpu_to_cmt_if fpu_commit_if
@@ -56,6 +57,10 @@ module VX_fpu_unit #(
wire valid_in = fpu_req_if.valid && ~fpuq_full; wire valid_in = fpu_req_if.valid && ~fpuq_full;
// resolve dynamic FRM
assign csr_to_fpu_if.wid = fpu_req_if.wid;
wire [`FRM_BITS-1:0] fpu_frm = (fpu_req_if.op_mod == `FRM_DYN) ? csr_to_fpu_if.frm : fpu_req_if.op_mod;
`ifdef FPU_FAST `ifdef FPU_FAST
VX_fp_fpga #( VX_fp_fpga #(
@@ -70,7 +75,7 @@ module VX_fpu_unit #(
.tag_in (tag_in), .tag_in (tag_in),
.op_type (fpu_req_if.op_type), .op_type (fpu_req_if.op_type),
.frm (fpu_req_if.frm), .frm (fpu_frm),
.dataa (fpu_req_if.rs1_data), .dataa (fpu_req_if.rs1_data),
.datab (fpu_req_if.rs2_data), .datab (fpu_req_if.rs2_data),
@@ -104,7 +109,7 @@ module VX_fpu_unit #(
.tag_in (tag_in), .tag_in (tag_in),
.op_type (fpu_req_if.op_type), .op_type (fpu_req_if.op_type),
.frm (fpu_req_if.frm), .frm (fpu_frm),
.dataa (fpu_req_if.rs1_data), .dataa (fpu_req_if.rs1_data),
.datab (fpu_req_if.rs2_data), .datab (fpu_req_if.rs2_data),

View File

@@ -42,22 +42,20 @@ module VX_gpr_bypass #(
delayed_push <= push; delayed_push <= push;
assert(!use_buffer2 || use_buffer); assert(!use_buffer2 || use_buffer);
if (pop) begin if (pop) begin
if (use_buffer) begin
buffer <= buffer2; buffer <= buffer2;
use_buffer <= use_buffer2; use_buffer <= use_buffer2;
use_buffer2 <= 0; use_buffer2 <= 0;
end end
end
if (delayed_push) begin if (delayed_push) begin
if (use_buffer) begin if (use_buffer) begin
assert(!use_buffer2); // queue full! assert(!use_buffer2); // full!
use_buffer <= 1;
if (pop) begin if (pop) begin
buffer <= data_in; buffer <= data_in;
end else begin end else begin
buffer2 <= data_in; buffer2 <= data_in;
use_buffer2 <= 1; use_buffer2 <= 1;
end end
use_buffer <= 1;
end else if (!pop) begin end else if (!pop) begin
buffer <= data_in; buffer <= data_in;
use_buffer <= 1; use_buffer <= 1;

View File

@@ -7,7 +7,6 @@ module VX_instr_demux (
// inputs // inputs
VX_decode_if execute_if, VX_decode_if execute_if,
VX_gpr_rsp_if gpr_rsp_if, VX_gpr_rsp_if gpr_rsp_if,
VX_csr_to_issue_if csr_to_issue_if,
// outputs // outputs
VX_alu_req_if alu_req_if, VX_alu_req_if alu_req_if,
@@ -34,78 +33,47 @@ module VX_instr_demux (
wire alu_req_ready; wire alu_req_ready;
wire is_br_op = `IS_BR_MOD(execute_if.op_mod); wire is_br_op = `IS_BR_MOD(execute_if.op_mod);
VX_skid_buffer #( VX_opd_collect #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `ALU_BR_BITS + 1 + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS) .INSTW (`NW_BITS + `NUM_THREADS + 32 + 32 + `ALU_BR_BITS + 1 + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS),
) alu_reg ( .OPDSW (2 * `NUM_THREADS * 32),
.PASSTHRU (1) // ALU has no backpressure
) alu_opc (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.ready_in (alu_req_ready), .ready_in (alu_req_ready),
.valid_in (alu_req_valid), .valid_in (alu_req_valid),
.data_in ({execute_if.wid, execute_if.tmask, execute_if.PC, next_PC, `ALU_BR_OP(execute_if.op_type), is_br_op, execute_if.imm, execute_if.rs1_is_PC, execute_if.rs2_is_imm, execute_if.rd, execute_if.wb, tid}), .inst_in ({execute_if.wid, execute_if.tmask, execute_if.PC, next_PC, `ALU_BR_OP(execute_if.op_type), is_br_op, execute_if.imm, execute_if.rs1_is_PC, execute_if.rs2_is_imm, execute_if.rd, execute_if.wb, tid}),
.data_out ({alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.is_br_op, alu_req_if.imm, alu_req_if.rs1_is_PC, alu_req_if.rs2_is_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid}), .opds_in ({gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}),
.data_out ({alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.is_br_op, alu_req_if.imm, alu_req_if.rs1_is_PC, alu_req_if.rs2_is_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, alu_req_if.rs1_data, alu_req_if.rs2_data}),
.ready_out (alu_req_if.ready), .ready_out (alu_req_if.ready),
.valid_out (alu_req_if.valid) .valid_out (alu_req_if.valid)
); );
VX_gpr_bypass #(
.DATAW (2 * `NUM_THREADS * 32),
.PASSTHRU (1) // ALU has no back-pressure, bypass not needed
) alu_bypass (
.clk (clk),
.reset (reset),
.push (alu_req_valid && alu_req_ready),
.data_in ({gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}),
.data_out ({alu_req_if.rs1_data, alu_req_if.rs2_data}),
.pop (alu_req_if.valid && alu_req_if.ready)
);
// lsu unit // lsu unit
wire lsu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_LSU); wire lsu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_LSU);
wire lsu_req_ready; wire lsu_req_ready;
VX_skid_buffer #( VX_opd_collect #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 1 + `BYTEEN_BITS + 32 + `NR_BITS + 1) .INSTW (`NW_BITS + `NUM_THREADS + 32 + 1 + `BYTEEN_BITS + 32 + `NR_BITS + 1),
) lsu_reg ( .OPDSW (2 * `NUM_THREADS * 32)
) lsu_opc (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.ready_in (lsu_req_ready), .ready_in (lsu_req_ready),
.valid_in (lsu_req_valid), .valid_in (lsu_req_valid),
.data_in ({execute_if.wid, execute_if.tmask, execute_if.PC, `LSU_RW(execute_if.op_type), `LSU_BE(execute_if.op_type), execute_if.imm, execute_if.rd, execute_if.wb}), .inst_in ({execute_if.wid, execute_if.tmask, execute_if.PC, `LSU_RW(execute_if.op_type), `LSU_BE(execute_if.op_type), execute_if.imm, execute_if.rd, execute_if.wb}),
.data_out ({lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.rw, lsu_req_if.byteen, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb}), .opds_in ({gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}),
.data_out ({lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.rw, lsu_req_if.byteen, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data}),
.ready_out (lsu_req_if.ready), .ready_out (lsu_req_if.ready),
.valid_out (lsu_req_if.valid) .valid_out (lsu_req_if.valid)
); );
VX_gpr_bypass #(
.DATAW ((2 * `NUM_THREADS * 32))
) lsu_bypass (
.clk (clk),
.reset (reset),
.push (lsu_req_valid && lsu_req_ready),
.data_in ({gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}),
.data_out ({lsu_req_if.base_addr, lsu_req_if.store_data}),
.pop (lsu_req_if.valid && lsu_req_if.ready)
);
// csr unit // csr unit
wire csr_req_valid = execute_if.valid && (execute_if.ex_type == `EX_CSR); wire csr_req_valid = execute_if.valid && (execute_if.ex_type == `EX_CSR);
wire csr_req_ready; wire csr_req_ready;
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1)
) csr_reg (
.clk (clk),
.reset (reset),
.ready_in (csr_req_ready),
.valid_in (csr_req_valid),
.data_in ({execute_if.wid, execute_if.tmask, execute_if.PC, `CSR_OP(execute_if.op_type), execute_if.imm[`CSR_ADDR_BITS-1:0], execute_if.rd, execute_if.wb, 1'b0}),
.data_out ({csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.op_type, csr_req_if.csr_addr, csr_req_if.rd, csr_req_if.wb, csr_req_if.is_io}),
.ready_out (csr_req_if.ready),
.valid_out (csr_req_if.valid)
);
reg tmp_rs2_is_imm; reg tmp_rs2_is_imm;
reg [`NR_BITS-1:0] tmp_rs1; reg [`NR_BITS-1:0] tmp_rs1;
@@ -116,15 +84,19 @@ module VX_instr_demux (
wire [31:0] csr_req_mask = tmp_rs2_is_imm ? 32'(tmp_rs1) : gpr_rsp_if.rs1_data[0]; wire [31:0] csr_req_mask = tmp_rs2_is_imm ? 32'(tmp_rs1) : gpr_rsp_if.rs1_data[0];
VX_gpr_bypass #( VX_opd_collect #(
.DATAW (32) .INSTW (`NW_BITS + `NUM_THREADS + 32 + `CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1),
) csr_bypass ( .OPDSW (32)
) csr_opc (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.push (csr_req_valid && csr_req_ready), .ready_in (csr_req_ready),
.data_in (csr_req_mask), .valid_in (csr_req_valid),
.data_out (csr_req_if.csr_mask), .inst_in ({execute_if.wid, execute_if.tmask, execute_if.PC, `CSR_OP(execute_if.op_type), execute_if.imm[`CSR_ADDR_BITS-1:0], execute_if.rd, execute_if.wb, 1'b0}),
.pop (csr_req_if.valid && csr_req_if.ready) .opds_in ({csr_req_mask}),
.data_out ({csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.op_type, csr_req_if.csr_addr, csr_req_if.rd, csr_req_if.wb, csr_req_if.is_io, csr_req_if.csr_mask}),
.ready_out (csr_req_if.ready),
.valid_out (csr_req_if.valid)
); );
// mul unit // mul unit
@@ -133,29 +105,20 @@ module VX_instr_demux (
wire mul_req_valid = execute_if.valid && (execute_if.ex_type == `EX_MUL); wire mul_req_valid = execute_if.valid && (execute_if.ex_type == `EX_MUL);
wire mul_req_ready; wire mul_req_ready;
VX_skid_buffer #( VX_opd_collect #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `MUL_BITS + `NR_BITS + 1) .INSTW (`NW_BITS + `NUM_THREADS + 32 + `MUL_BITS + `NR_BITS + 1),
) mul_reg ( .OPDSW (2 * `NUM_THREADS * 32)
) mul_opc (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.ready_in (mul_req_ready), .ready_in (mul_req_ready),
.valid_in (mul_req_valid), .valid_in (mul_req_valid),
.data_in ({execute_if.wid, execute_if.tmask, execute_if.PC, `MUL_OP(execute_if.op_type), execute_if.rd, execute_if.wb}), .inst_in ({execute_if.wid, execute_if.tmask, execute_if.PC, `MUL_OP(execute_if.op_type), execute_if.rd, execute_if.wb}),
.data_out ({mul_req_if.wid, mul_req_if.tmask, mul_req_if.PC, mul_req_if.op_type, mul_req_if.rd, mul_req_if.wb}), .opds_in ({gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}),
.data_out ({mul_req_if.wid, mul_req_if.tmask, mul_req_if.PC, mul_req_if.op_type, mul_req_if.rd, mul_req_if.wb, mul_req_if.rs1_data, mul_req_if.rs2_data}),
.ready_out (mul_req_if.ready), .ready_out (mul_req_if.ready),
.valid_out (mul_req_if.valid) .valid_out (mul_req_if.valid)
); );
VX_gpr_bypass #(
.DATAW ((2 * `NUM_THREADS * 32))
) mul_bypass (
.clk (clk),
.reset (reset),
.push (mul_req_valid && mul_req_ready),
.data_in ({gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}),
.data_out ({mul_req_if.rs1_data, mul_req_if.rs2_data}),
.pop (mul_req_if.valid && mul_req_if.ready)
);
`endif `endif
// fpu unit // fpu unit
@@ -164,33 +127,20 @@ module VX_instr_demux (
wire fpu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_FPU); wire fpu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_FPU);
wire fpu_req_ready; wire fpu_req_ready;
// resolve dynamic FRM VX_opd_collect #(
assign csr_to_issue_if.wid = execute_if.wid; .INSTW (`NW_BITS + `NUM_THREADS + 32 + `FPU_BITS + `MOD_BITS + `NR_BITS + 1),
wire [`FRM_BITS-1:0] fpu_frm = (execute_if.op_mod == `FRM_DYN) ? csr_to_issue_if.frm : execute_if.op_mod; .OPDSW (3 * `NUM_THREADS * 32)
) fpu_opc (
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `FPU_BITS + `FRM_BITS + `NR_BITS + 1)
) fpu_reg (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.ready_in (fpu_req_ready), .ready_in (fpu_req_ready),
.valid_in (fpu_req_valid), .valid_in (fpu_req_valid),
.data_in ({execute_if.wid, execute_if.tmask, execute_if.PC, `FPU_OP(execute_if.op_type), fpu_frm, execute_if.rd, execute_if.wb}), .inst_in ({execute_if.wid, execute_if.tmask, execute_if.PC, `FPU_OP(execute_if.op_type), execute_if.op_mod, execute_if.rd, execute_if.wb}),
.data_out ({fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.op_type, fpu_req_if.frm, fpu_req_if.rd, fpu_req_if.wb}), .opds_in ({gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}),
.data_out ({fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.op_type, fpu_req_if.op_mod, fpu_req_if.rd, fpu_req_if.wb, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}),
.ready_out (fpu_req_if.ready), .ready_out (fpu_req_if.ready),
.valid_out (fpu_req_if.valid) .valid_out (fpu_req_if.valid)
); );
VX_gpr_bypass #(
.DATAW ((3 * `NUM_THREADS * 32))
) fpu_bypass (
.clk (clk),
.reset (reset),
.push (fpu_req_valid && fpu_req_ready),
.data_in ({gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}),
.data_out ({fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}),
.pop (fpu_req_if.valid && fpu_req_if.ready)
);
`endif `endif
// gpu unit // gpu unit
@@ -198,30 +148,21 @@ module VX_instr_demux (
wire gpu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_GPU); wire gpu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_GPU);
wire gpu_req_ready; wire gpu_req_ready;
VX_skid_buffer #( VX_opd_collect #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1) .INSTW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1),
) gpu_reg ( .OPDSW (`NUM_THREADS * 32 + 32)
) gpu_opc (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.ready_in (gpu_req_ready), .ready_in (gpu_req_ready),
.valid_in (gpu_req_valid), .valid_in (gpu_req_valid),
.data_in ({execute_if.wid, execute_if.tmask, execute_if.PC, next_PC, `GPU_OP(execute_if.op_type), execute_if.rd, execute_if.wb}), .inst_in ({execute_if.wid, execute_if.tmask, execute_if.PC, next_PC, `GPU_OP(execute_if.op_type), execute_if.rd, execute_if.wb}),
.data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb}), .opds_in ({gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data[0]}),
.data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.rs1_data, gpu_req_if.rs2_data}),
.ready_out (gpu_req_if.ready), .ready_out (gpu_req_if.ready),
.valid_out (gpu_req_if.valid) .valid_out (gpu_req_if.valid)
); );
VX_gpr_bypass #(
.DATAW ((`NUM_THREADS * 32) + 32)
) gpu_bypass (
.clk (clk),
.reset (reset),
.push (gpu_req_valid && gpu_req_ready),
.data_in ({gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data[0]}),
.data_out ({gpu_req_if.rs1_data, gpu_req_if.rs2_data}),
.pop (gpu_req_if.valid && gpu_req_if.ready)
);
// can take next request? // can take next request?
assign execute_if.ready = (alu_req_ready && (execute_if.ex_type == `EX_ALU)) assign execute_if.ready = (alu_req_ready && (execute_if.ex_type == `EX_ALU))
|| (lsu_req_ready && (execute_if.ex_type == `EX_LSU)) || (lsu_req_ready && (execute_if.ex_type == `EX_LSU))

View File

@@ -10,7 +10,6 @@ module VX_issue #(
VX_decode_if decode_if, VX_decode_if decode_if,
VX_writeback_if writeback_if, VX_writeback_if writeback_if,
VX_csr_to_issue_if csr_to_issue_if,
VX_alu_req_if alu_req_if, VX_alu_req_if alu_req_if,
VX_lsu_req_if lsu_req_if, VX_lsu_req_if lsu_req_if,
@@ -91,7 +90,6 @@ module VX_issue #(
.reset (reset), .reset (reset),
.execute_if (execute_if), .execute_if (execute_if),
.gpr_rsp_if (gpr_rsp_if), .gpr_rsp_if (gpr_rsp_if),
.csr_to_issue_if(csr_to_issue_if),
.alu_req_if (alu_req_if), .alu_req_if (alu_req_if),
.lsu_req_if (lsu_req_if), .lsu_req_if (lsu_req_if),
.csr_req_if (csr_req_if), .csr_req_if (csr_req_if),
@@ -136,22 +134,22 @@ module VX_issue #(
`ifdef DBG_PRINT_PIPELINE `ifdef DBG_PRINT_PIPELINE
always @(posedge clk) begin always @(posedge clk) begin
if (alu_req_if.valid && alu_req_if.ready) begin if (alu_req_if.valid && alu_req_if.ready) begin
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=ALU, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, alu_req_if.wid, alu_req_if.PC, alu_req_if.tmask, alu_req_if.rs1_data, alu_req_if.rs2_data); $display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=ALU, tmask=%b, rd=%0d, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, alu_req_if.wid, alu_req_if.PC, alu_req_if.tmask, alu_req_if.rd, alu_req_if.rs1_data, alu_req_if.rs2_data);
end end
if (lsu_req_if.valid && lsu_req_if.ready) begin if (lsu_req_if.valid && lsu_req_if.ready) begin
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=LSU, tmask=%b, rw=%b, byteen=%b, baddr=%0h, offset=%0h, data=%0h", $time, CORE_ID, lsu_req_if.wid, lsu_req_if.PC, lsu_req_if.tmask, lsu_req_if.rw, lsu_req_if.byteen, lsu_req_if.base_addr, lsu_req_if.offset, lsu_req_if.store_data); $display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=LSU, tmask=%b, rd=%0d, rw=%b, byteen=%b, baddr=%0h, offset=%0h, data=%0h", $time, CORE_ID, lsu_req_if.wid, lsu_req_if.PC, lsu_req_if.tmask, lsu_req_if.rd, lsu_req_if.rw, lsu_req_if.byteen, lsu_req_if.base_addr, lsu_req_if.offset, lsu_req_if.store_data);
end end
if (csr_req_if.valid && csr_req_if.ready) begin if (csr_req_if.valid && csr_req_if.ready) begin
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=CSR, tmask=%b, addr=%0h, mask=%0h", $time, CORE_ID, csr_req_if.wid, csr_req_if.PC, csr_req_if.tmask, csr_req_if.csr_addr, csr_req_if.csr_mask); $display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=CSR, tmask=%b, rd=%0d, addr=%0h, mask=%0h", $time, CORE_ID, csr_req_if.wid, csr_req_if.PC, csr_req_if.tmask, csr_req_if.rd, csr_req_if.csr_addr, csr_req_if.csr_mask);
end end
if (mul_req_if.valid && mul_req_if.ready) begin if (mul_req_if.valid && mul_req_if.ready) begin
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=MUL, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, mul_req_if.wid, mul_req_if.PC, mul_req_if.tmask, mul_req_if.rs1_data, mul_req_if.rs2_data); $display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=MUL, tmask=%b, rd=%0d, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, mul_req_if.wid, mul_req_if.PC, mul_req_if.tmask, mul_req_if.rd, mul_req_if.rs1_data, mul_req_if.rs2_data);
end end
if (fpu_req_if.valid && fpu_req_if.ready) begin if (fpu_req_if.valid && fpu_req_if.ready) begin
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=FPU, tmask=%b, rs1_data=%0h, rs2_data=%0h, rs3_data=%0h", $time, CORE_ID, fpu_req_if.wid, fpu_req_if.PC, fpu_req_if.tmask, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data); $display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=FPU, tmask=%b, rd=%0d, rs1_data=%0h, rs2_data=%0h, rs3_data=%0h", $time, CORE_ID, fpu_req_if.wid, fpu_req_if.PC, fpu_req_if.tmask, fpu_req_if.rd, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data);
end end
if (gpu_req_if.valid && gpu_req_if.ready) begin if (gpu_req_if.valid && gpu_req_if.ready) begin
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=GPU, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, gpu_req_if.wid, gpu_req_if.PC, gpu_req_if.tmask, gpu_req_if.rs1_data, gpu_req_if.rs2_data); $display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=GPU, tmask=%b, rd=%0d, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, gpu_req_if.wid, gpu_req_if.PC, gpu_req_if.tmask, gpu_req_if.rd, gpu_req_if.rs1_data, gpu_req_if.rs2_data);
end end
end end
`endif `endif

62
hw/rtl/VX_opd_collect.v Normal file
View File

@@ -0,0 +1,62 @@
`include "VX_platform.vh"
// Operand collector stage.
//
// Takes an instruction bundle (inst_in) and an operand bundle (opds_in) under
// a single valid/ready handshake and presents them together on data_out,
// instruction bits in the upper part and operand bits in the lower part.
//
// Internally:
//   * skid_buffer - carries the instruction bundle and owns the input handshake
//   * gpr_bypass  - carries the operand bundle; pushed when the input handshake
//                   fires, popped when the skid buffer output is accepted
//   * pipe_reg    - registers {valid, inst, opds} in front of the output port
//
// Parameters:
//   INSTW    - bit width of inst_in
//   OPDSW    - bit width of opds_in
//   PASSTHRU - forwarded unchanged to VX_gpr_bypass
//              (NOTE(review): presumably disables operand buffering - confirm
//              against VX_gpr_bypass)
module VX_opd_collect #(
    parameter INSTW    = 1,
    parameter OPDSW    = 1,
    parameter PASSTHRU = 0
) (
    input  wire                   clk,
    input  wire                   reset,

    // input side
    input  wire                   valid_in,
    output wire                   ready_in,
    input  wire [INSTW-1:0]       inst_in,
    input  wire [OPDSW-1:0]       opds_in,

    // output side: {instruction, operands}
    output wire [INSTW+OPDSW-1:0] data_out,
    output wire                   valid_out,
    input  wire                   ready_out
);
    // signals between the buffering stage and the output register
    wire             stage_valid;
    wire             stage_ready;
    wire [INSTW-1:0] stage_inst;
    wire [OPDSW-1:0] stage_opds;

    // the output register must hold its contents while it has valid data
    // that the consumer has not accepted yet
    wire hold_out = valid_out && !ready_out;

    // the buffering stage may advance whenever the output register is not held
    assign stage_ready = !hold_out;

    // handshake events on either side of the buffering stage
    wire enq_fire = valid_in && ready_in;
    wire deq_fire = stage_valid && stage_ready;

    // instruction path
    VX_skid_buffer #(
        .DATAW (INSTW)
    ) skid_buffer (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (valid_in),
        .ready_in  (ready_in),
        .data_in   (inst_in),
        .data_out  (stage_inst),
        .valid_out (stage_valid),
        .ready_out (stage_ready)
    );

    // operand path, moved in lock-step with the instruction path
    VX_gpr_bypass #(
        .DATAW    (OPDSW),
        .PASSTHRU (PASSTHRU)
    ) gpr_bypass (
        .clk      (clk),
        .reset    (reset),
        .push     (enq_fire),
        .pop      (deq_fire),
        .data_in  (opds_in),
        .data_out (stage_opds)
    );

    // output register: valid bit, instruction and operands travel together
    VX_generic_register #(
        .N (1 + INSTW + OPDSW)
    ) pipe_reg (
        .clk   (clk),
        .reset (reset),
        .stall (hold_out),
        .flush (1'b0),
        .in    ({stage_valid, stage_inst, stage_opds}),
        .out   ({valid_out,   data_out})
    );

endmodule

View File

@@ -98,7 +98,6 @@ module VX_pipeline #(
assign csr_io_rsp_data = csr_io_rsp_if.data; assign csr_io_rsp_data = csr_io_rsp_if.data;
assign csr_io_rsp_if.ready = csr_io_rsp_ready; assign csr_io_rsp_if.ready = csr_io_rsp_ready;
VX_csr_to_issue_if csr_to_issue_if();
VX_cmt_to_csr_if cmt_to_csr_if(); VX_cmt_to_csr_if cmt_to_csr_if();
VX_decode_if decode_if(); VX_decode_if decode_if();
VX_branch_ctl_if branch_ctl_if(); VX_branch_ctl_if branch_ctl_if();
@@ -157,7 +156,6 @@ module VX_pipeline #(
.decode_if (decode_if), .decode_if (decode_if),
.writeback_if (writeback_if), .writeback_if (writeback_if),
.csr_to_issue_if(csr_to_issue_if),
.alu_req_if (alu_req_if), .alu_req_if (alu_req_if),
.lsu_req_if (lsu_req_if), .lsu_req_if (lsu_req_if),
@@ -181,7 +179,6 @@ module VX_pipeline #(
.csr_io_req_if (csr_io_req_if), .csr_io_req_if (csr_io_req_if),
.csr_io_rsp_if (csr_io_rsp_if), .csr_io_rsp_if (csr_io_rsp_if),
.csr_to_issue_if(csr_to_issue_if),
.cmt_to_csr_if (cmt_to_csr_if), .cmt_to_csr_if (cmt_to_csr_if),
.alu_req_if (alu_req_if), .alu_req_if (alu_req_if),

View File

@@ -1,5 +1,5 @@
`ifndef VX_CSR_TO_ISSUE_IF `ifndef VX_CSR_TO_FPU_IF
`define VX_CSR_TO_ISSUE_IF `define VX_CSR_TO_FPU_IF
`include "VX_define.vh" `include "VX_define.vh"
@@ -7,7 +7,7 @@
`IGNORE_WARNINGS_BEGIN `IGNORE_WARNINGS_BEGIN
`endif `endif
interface VX_csr_to_issue_if (); interface VX_csr_to_fpu_if ();
wire [`NW_BITS-1:0] wid; wire [`NW_BITS-1:0] wid;
wire [`FRM_BITS-1:0] frm; wire [`FRM_BITS-1:0] frm;

View File

@@ -15,7 +15,7 @@ interface VX_fpu_req_if ();
wire [`NUM_THREADS-1:0] tmask; wire [`NUM_THREADS-1:0] tmask;
wire [31:0] PC; wire [31:0] PC;
wire [`FPU_BITS-1:0] op_type; wire [`FPU_BITS-1:0] op_type;
wire [`FRM_BITS-1:0] frm; wire [`MOD_BITS-1:0] op_mod;
wire [`NUM_THREADS-1:0][31:0] rs1_data; wire [`NUM_THREADS-1:0][31:0] rs1_data;
wire [`NUM_THREADS-1:0][31:0] rs2_data; wire [`NUM_THREADS-1:0][31:0] rs2_data;
wire [`NUM_THREADS-1:0][31:0] rs3_data; wire [`NUM_THREADS-1:0][31:0] rs3_data;

View File

@@ -17,6 +17,8 @@ module VX_skid_buffer #(
reg valid_out_r; reg valid_out_r;
reg use_buffer; reg use_buffer;
wire push = valid_in && ready_in;
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
data_out_r <= 0; data_out_r <= 0;
@@ -24,17 +26,17 @@ module VX_skid_buffer #(
use_buffer <= 0; use_buffer <= 0;
valid_out_r <= 0; valid_out_r <= 0;
end else begin end else begin
if (valid_in && ready_in && valid_out && !ready_out) begin
assert(!use_buffer);
use_buffer <= 1;
end
if (ready_out) begin if (ready_out) begin
use_buffer <= 0; use_buffer <= 0;
end end
if (valid_in && ready_in) begin if (push) begin
buffer <= data_in; buffer <= data_in;
if (valid_out_r && !ready_out) begin
assert(!use_buffer);
use_buffer <= 1;
end end
if (!valid_out || ready_out) begin end
if (!valid_out_r || ready_out) begin
valid_out_r <= valid_in || use_buffer; valid_out_r <= valid_in || use_buffer;
data_out_r <= use_buffer ? buffer : data_in; data_out_r <= use_buffer ? buffer : data_in;
end end