diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 4abfd8fd..ab4d4742 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -55,8 +55,6 @@ `define EXT_F_ENABLE -`define IBUF_ENABLE - // Device identification `define VENDOR_ID 0 `define ARCHITECTURE_ID 0 diff --git a/hw/rtl/VX_gpr_bypass.v b/hw/rtl/VX_gpr_bypass.v index f2537301..b3d2a67c 100644 --- a/hw/rtl/VX_gpr_bypass.v +++ b/hw/rtl/VX_gpr_bypass.v @@ -2,52 +2,69 @@ module VX_gpr_bypass #( parameter DATAW = 1, - parameter BUFFERED = 1 + parameter PASSTHRU = 0 ) ( input wire clk, input wire reset, input wire push, - input reg pop, + input wire pop, input wire [DATAW-1:0] data_in, output wire [DATAW-1:0] data_out ); - reg [DATAW-1:0] buffer, buffer2; - reg use_buffer, use_buffer2; - reg delayed_push; + if (PASSTHRU) begin + reg delayed_push; + + always @(posedge clk) begin + if (reset) begin + delayed_push <= 0; + end else begin + delayed_push <= push; + assert(!delayed_push || pop); + end + end - always @(posedge clk) begin - if (reset) begin - delayed_push <= 0; - use_buffer <= 0; - use_buffer2 <= 0; - end else begin - delayed_push <= push; - assert(!use_buffer2 || use_buffer); - if (pop) begin - if (use_buffer) begin - buffer <= buffer2; - use_buffer <= use_buffer2; - use_buffer2 <= 0; - end - end - if (delayed_push) begin - if (use_buffer) begin - assert(!use_buffer2); // queue full! - if (pop) begin + assign data_out = data_in; + + end else begin + + reg [DATAW-1:0] buffer, buffer2; + reg use_buffer, use_buffer2; + reg delayed_push; + + always @(posedge clk) begin + if (reset) begin + delayed_push <= 0; + use_buffer <= 0; + use_buffer2 <= 0; + end else begin + delayed_push <= push; + assert(!use_buffer2 || use_buffer); + if (pop) begin + if (use_buffer) begin + buffer <= buffer2; + use_buffer <= use_buffer2; + use_buffer2 <= 0; + end + end + if (delayed_push) begin + if (use_buffer) begin + assert(!use_buffer2); // queue full! + if (pop) begin + buffer <= data_in; + end else begin + buffer2 <= data_in; + use_buffer2 <= 1; + end + use_buffer <= 1; + end else if (!pop) begin buffer <= data_in; - end else begin - buffer2 <= data_in; - use_buffer2 <= 1; - end - use_buffer <= 1; - end else if (!pop) begin - buffer <= data_in; - use_buffer <= 1; + use_buffer <= 1; + end end end end + + assign data_out = use_buffer ? buffer : data_in; end - assign data_out = use_buffer ? buffer : data_in; - endmodule \ No newline at end of file diff --git a/hw/rtl/VX_ibuffer.v b/hw/rtl/VX_ibuffer.v index 04eebd6e..96dc15b6 100644 --- a/hw/rtl/VX_ibuffer.v +++ b/hw/rtl/VX_ibuffer.v @@ -83,12 +83,11 @@ module VX_ibuffer #( reg [`NW_BITS-1:0] deq_wid, deq_wid_n; reg deq_valid, deq_valid_n; reg [DATAW-1:0] deq_instr, deq_instr_n; - reg deq_is_size1, deq_is_size1_n; - + always @(*) begin valid_table_n = valid_table; - if (deq_fire && deq_is_size1) begin - valid_table_n[ibuf_deq_if.wid] = 0; + if (deq_fire) begin + valid_table_n[ibuf_deq_if.wid] = (q_size[deq_wid] != SIZEW'(1)); end if (enq_fire) begin valid_table_n[ibuf_enq_if.wid] = 1; @@ -96,32 +95,35 @@ module VX_ibuffer #( end always @(*) begin - deq_valid_n = 0; - deq_wid_n = 'x; - deq_instr_n = 'x; - deq_is_size1_n = 'x; + deq_valid_n = 0; + deq_wid_n = 'x; + deq_instr_n = 'x; - schedule_table_n = schedule_table; - if (deq_fire && deq_is_size1) begin - schedule_table_n[ibuf_deq_if.wid] = 0; - end - - for (integer i = 0; i < `NUM_WARPS; i++) begin - if (schedule_table_n[i]) begin - deq_valid_n = 1; - deq_wid_n = `NW_BITS'(i); - deq_instr_n = (deq_fire && (ibuf_deq_if.wid == `NW_BITS'(i))) ? q_data_prev[i] : q_data_out[i]; - deq_is_size1_n = (~(enq_fire && ibuf_enq_if.wid == `NW_BITS'(i)) - && (((deq_fire && ibuf_deq_if.wid == `NW_BITS'(i)) && (SIZEW'(2) == q_size[i])) - || (SIZEW'(1) == q_size[i]))); - schedule_table_n[i] = 0; - break; + schedule_table_n = schedule_table; + + if (0 == num_warps) begin + deq_valid_n = enq_fire; + deq_wid_n = ibuf_enq_if.wid; + deq_instr_n = q_data_in; + end else if ((1 == num_warps) || freeze) begin + deq_valid_n = (!deq_fire || (q_size[deq_wid] != SIZEW'(1))) || enq_fire; + deq_wid_n = (!deq_fire || (q_size[deq_wid] != SIZEW'(1))) ? deq_wid : ibuf_enq_if.wid; + deq_instr_n = deq_fire ? ((q_size[deq_wid] != SIZEW'(1)) ? q_data_prev[deq_wid] : q_data_in) : q_data_out[deq_wid]; + end else begin + for (integer i = 0; i < `NUM_WARPS; i++) begin + if (schedule_table_n[i]) begin + deq_valid_n = 1; + deq_wid_n = `NW_BITS'(i); + deq_instr_n = q_data_out[i]; + schedule_table_n[i] = 0; + break; + end end - end + end end - wire warp_added = enq_fire && (0 == q_size[ibuf_enq_if.wid]) && (!deq_fire || ibuf_enq_if.wid != ibuf_deq_if.wid); - wire warp_removed = deq_fire && (1 == q_size[ibuf_deq_if.wid]) && (!enq_fire || ibuf_enq_if.wid != ibuf_deq_if.wid); + wire warp_added = enq_fire && (0 == q_size[ibuf_enq_if.wid]); + wire warp_removed = deq_fire && ~(enq_fire && ibuf_enq_if.wid == ibuf_deq_if.wid) && (1 == q_size[ibuf_deq_if.wid]); always @(posedge clk) begin if (reset) begin @@ -130,20 +132,18 @@ module VX_ibuffer #( deq_valid <= 0; num_warps <= 0; end else begin - valid_table <= valid_table_n; - schedule_table <= (| schedule_table_n) ? schedule_table_n : valid_table_n; + valid_table <= valid_table_n; - if (enq_fire && (0 == num_warps)) begin - deq_valid <= 1; - deq_wid <= ibuf_enq_if.wid; - deq_instr <= q_data_in; - deq_is_size1 <= 1; - end else if (!freeze) begin - deq_valid <= deq_valid_n; - deq_wid <= deq_wid_n; - deq_instr <= deq_instr_n; - deq_is_size1 <= deq_is_size1_n; - end + if ((| schedule_table_n)) begin + schedule_table <= schedule_table_n; + end else begin + schedule_table <= valid_table_n; + schedule_table[deq_wid_n] <= 0; + end + + deq_valid <= deq_valid_n; + deq_wid <= deq_wid_n; + deq_instr <= deq_instr_n; if (warp_added && !warp_removed) begin num_warps <= num_warps + NWARPSW'(1); @@ -151,14 +151,19 @@ module VX_ibuffer #( num_warps <= num_warps - NWARPSW'(1); end - `ifdef VERILATOR + `ifdef VERILATOR + /*if (enq_fire || deq_fire || deq_valid) begin + $display("*** %t: cur=%b(%0d), nxt=%b(%0d), enq=%b(%0d), deq=%b(%0d), nw=%0d(%0d,%0d,%0d,%0d), sched=%b, sched_n=%b", + $time, deq_valid, deq_wid, deq_valid_n, deq_wid_n, enq_fire, ibuf_enq_if.wid, deq_fire, ibuf_deq_if.wid, num_warps, size_r[0], size_r[1], size_r[2], size_r[3], schedule_table, schedule_table_n); + end*/ begin // verify 'num_warps' integer nw = 0; for (integer i = 0; i < `NUM_WARPS; i++) begin nw += 32'(q_size[i] != 0); end - assert(nw == 32'(num_warps)); - assert(~deq_fire || num_warps != 0); + assert(nw == 32'(num_warps)) else $display("%t: error: invalid num_warps: nw=%0d, ref=%0d", $time, num_warps, nw); + assert(~deq_valid || (q_size[deq_wid] != 0)) else $display("%t: error: invalid schedule: wid=%0d", $time, deq_wid); + assert(~deq_fire || (q_size[deq_wid] != 0)) else $display("%t: error: invalid dequeu: wid=%0d", $time, deq_wid); end `endif end diff --git a/hw/rtl/VX_instr_demux.v b/hw/rtl/VX_instr_demux.v index 44fb1e07..284a0e97 100644 --- a/hw/rtl/VX_instr_demux.v +++ b/hw/rtl/VX_instr_demux.v @@ -48,7 +48,8 @@ module VX_instr_demux ( ); VX_gpr_bypass #( - .DATAW ((2 * `NUM_THREADS * 32)) + .DATAW (2 * `NUM_THREADS * 32), + .PASSTHRU (1) // ALU has no back-pressure, bypass not needed ) alu_bypass ( .clk (clk), .reset (reset), @@ -231,6 +232,6 @@ module VX_instr_demux ( `ifdef EXT_F_ENABLE || (fpu_req_ready && (execute_if.ex_type == `EX_FPU)) `endif - || (gpu_req_ready && (execute_if.ex_type == `EX_GPU)); + || (gpu_req_ready && (execute_if.ex_type == `EX_GPU)); endmodule \ No newline at end of file diff --git a/hw/rtl/VX_ipdom_stack.v b/hw/rtl/VX_ipdom_stack.v index ea06bd4b..f388d3d0 100644 --- a/hw/rtl/VX_ipdom_stack.v +++ b/hw/rtl/VX_ipdom_stack.v @@ -5,15 +5,15 @@ module VX_ipdom_stack #( parameter WIDTH = 1, parameter DEPTH = 1 ) ( - input wire clk, - input wire reset, - input reg [WIDTH - 1:0] q1, - input reg [WIDTH - 1:0] q2, - output wire[WIDTH - 1:0] d, - input wire push, - input wire pop, - output wire empty, - output wire full + input wire clk, + input wire reset, + input wire [WIDTH - 1:0] q1, + input wire [WIDTH - 1:0] q2, + output wire [WIDTH - 1:0] d, + input wire push, + input wire pop, + output wire empty, + output wire full ); localparam STACK_SIZE = 2 ** DEPTH; diff --git a/hw/rtl/VX_scoreboard.v b/hw/rtl/VX_scoreboard.v index 1e31973e..3ecd2d77 100644 --- a/hw/rtl/VX_scoreboard.v +++ b/hw/rtl/VX_scoreboard.v @@ -55,7 +55,7 @@ module VX_scoreboard #( if (ibuf_deq_if.valid && ~ibuf_deq_if.ready) begin $display("%t: core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b, exe=%b, gpr=%b", $time, CORE_ID, ibuf_deq_if.wid, ibuf_deq_if.curr_PC, ibuf_deq_if.rd, ibuf_deq_if.wb, - inuse_reg_mask[ibuf_deq_if.rd], inuse_reg_mask[ibuf_deq_if.rs1], inuse_reg_mask[ibuf_deq_if.rs2], inuse_reg_mask[ibuf_deq_if.rs3], exe_delay, gpr_delay); + inuse_regs[ibuf_deq_if.rd], inuse_regs[ibuf_deq_if.rs1], inuse_regs[ibuf_deq_if.rs2], inuse_regs[ibuf_deq_if.rs3], exe_delay, gpr_delay); end end `endif diff --git a/hw/rtl/VX_writeback.v b/hw/rtl/VX_writeback.v index e34d0102..ee24cb88 100644 --- a/hw/rtl/VX_writeback.v +++ b/hw/rtl/VX_writeback.v @@ -23,42 +23,46 @@ module VX_writeback #( wire mul_valid = mul_commit_if.valid && mul_commit_if.wb; wire fpu_valid = fpu_commit_if.valid && fpu_commit_if.wb; - VX_writeback_if writeback_tmp_if(); - - assign writeback_tmp_if.valid = alu_valid ? alu_commit_if.valid : - lsu_valid ? lsu_commit_if.valid : - csr_valid ? csr_commit_if.valid : - mul_valid ? mul_commit_if.valid : - fpu_valid ? fpu_commit_if.valid : - 0; - - assign writeback_tmp_if.wid = alu_valid ? alu_commit_if.wid : - lsu_valid ? lsu_commit_if.wid : - csr_valid ? csr_commit_if.wid : - mul_valid ? mul_commit_if.wid : - fpu_valid ? fpu_commit_if.wid : - 0; + wire wb_valid; + wire [`NW_BITS-1:0] wb_wid; + wire [`NUM_THREADS-1:0] wb_thread_mask; + wire [`NR_BITS-1:0] wb_rd; + wire [`NUM_THREADS-1:0][31:0] wb_data; - assign writeback_tmp_if.thread_mask = alu_valid ? alu_commit_if.thread_mask : - lsu_valid ? lsu_commit_if.thread_mask : - csr_valid ? csr_commit_if.thread_mask : - mul_valid ? mul_commit_if.thread_mask : - fpu_valid ? fpu_commit_if.thread_mask : - 0; + assign wb_valid = alu_valid ? alu_commit_if.valid : + lsu_valid ? lsu_commit_if.valid : + csr_valid ? csr_commit_if.valid : + mul_valid ? mul_commit_if.valid : + fpu_valid ? fpu_commit_if.valid : + 0; - assign writeback_tmp_if.rd = alu_valid ? alu_commit_if.rd : - lsu_valid ? lsu_commit_if.rd : - csr_valid ? csr_commit_if.rd : - mul_valid ? mul_commit_if.rd : - fpu_valid ? fpu_commit_if.rd : - 0; + assign wb_wid = alu_valid ? alu_commit_if.wid : + lsu_valid ? lsu_commit_if.wid : + csr_valid ? csr_commit_if.wid : + mul_valid ? mul_commit_if.wid : + fpu_valid ? fpu_commit_if.wid : + 0; + + assign wb_thread_mask = alu_valid ? alu_commit_if.thread_mask : + lsu_valid ? lsu_commit_if.thread_mask : + csr_valid ? csr_commit_if.thread_mask : + mul_valid ? mul_commit_if.thread_mask : + fpu_valid ? fpu_commit_if.thread_mask : + 0; - assign writeback_tmp_if.data = alu_valid ? alu_commit_if.data : - lsu_valid ? lsu_commit_if.data : - csr_valid ? csr_commit_if.data : - mul_valid ? mul_commit_if.data : - fpu_valid ? fpu_commit_if.data : - 0; + assign wb_rd = alu_valid ? alu_commit_if.rd : + lsu_valid ? lsu_commit_if.rd : + csr_valid ? csr_commit_if.rd : + mul_valid ? mul_commit_if.rd : + fpu_valid ? fpu_commit_if.rd : + 0; + + assign wb_data = alu_valid ? alu_commit_if.data : + lsu_valid ? lsu_commit_if.data : + csr_valid ? csr_commit_if.data : + mul_valid ? mul_commit_if.data : + fpu_valid ? fpu_commit_if.data : + 0; wire stall = ~writeback_if.ready && writeback_if.valid; @@ -69,8 +73,8 @@ module VX_writeback #( .reset (reset), .stall (stall), .flush (1'b0), - .in ({writeback_tmp_if.valid, writeback_tmp_if.wid, writeback_tmp_if.thread_mask, writeback_tmp_if.rd, writeback_tmp_if.data}), - .out ({writeback_if.valid, writeback_if.wid, writeback_if.thread_mask, writeback_if.rd, writeback_if.data}) + .in ({wb_valid, wb_wid, wb_thread_mask, wb_rd, wb_data}), + .out ({writeback_if.valid, writeback_if.wid, writeback_if.thread_mask, writeback_if.rd, writeback_if.data}) ); assign alu_commit_if.ready = !stall; diff --git a/hw/simulate/simulator.cpp b/hw/simulate/simulator.cpp index ab4846cd..b4b1200e 100644 --- a/hw/simulate/simulator.cpp +++ b/hw/simulate/simulator.cpp @@ -12,6 +12,7 @@ double sc_time_stamp() { Simulator::Simulator() { // force random values for unitialized signals Verilated::randReset(2); + Verilated::randSeed(50); // Turn off assertion before reset Verilated::assertOn(false); diff --git a/hw/syn/quartus/project.tcl b/hw/syn/quartus/project.tcl index a85c1178..030bd254 100644 --- a/hw/syn/quartus/project.tcl +++ b/hw/syn/quartus/project.tcl @@ -38,6 +38,20 @@ set_global_assignment -name VERILOG_MACRO QUARTUS set_global_assignment -name VERILOG_MACRO SYNTHESIS set_global_assignment -name VERILOG_MACRO NDEBUG set_global_assignment -name MESSAGE_DISABLE 16818 +set_global_assignment -name MIN_CORE_JUNCTION_TEMP 0 +set_global_assignment -name MAX_CORE_JUNCTION_TEMP 100 +set_global_assignment -name POWER_BOARD_THERMAL_MODEL "NONE (CONSERVATIVE)" +set_global_assignment -name OPTIMIZE_HOLD_TIMING "ALL PATHS" +set_global_assignment -name OPTIMIZE_MULTI_CORNER_TIMING ON +set_global_assignment -name FITTER_EFFORT "STANDARD FIT" +set_global_assignment -name ROUTER_CLOCKING_TOPOLOGY_ANALYSIS ON +set_global_assignment -name ROUTER_LCELL_INSERTION_AND_LOGIC_DUPLICATION ON +set_global_assignment -name TIMEQUEST_DO_CCPP_REMOVAL ON +set_global_assignment -name OPTIMIZATION_TECHNIQUE SPEED +set_global_assignment -name SYNTH_TIMING_DRIVEN_SYNTHESIS ON +set_global_assignment -name TIMEQUEST_MULTICORNER_ANALYSIS ON +set_global_assignment -name POWER_USE_TA_VALUE 65 +set_global_assignment -name SEED 1 set idx 0 foreach arg $q_args_orig {