From 25f66e6490fdb74b80a4587d2bdd601c05984dc2 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sun, 19 Jul 2020 05:03:47 -0400 Subject: [PATCH] pipeline refactoring --- driver/opae/scope.cpp | 4 +- driver/rtlsim/Makefile | 7 +- hw/rtl/VX_alu_unit.v | 180 ++---- hw/rtl/VX_back_end.v | 171 ------ hw/rtl/VX_branch_unit.v | 82 +++ hw/rtl/VX_config.vh | 6 + hw/rtl/VX_core.v | 24 +- hw/rtl/VX_csr_arb.v | 71 +-- hw/rtl/VX_csr_data.v | 5 +- hw/rtl/VX_csr_io_arb.v | 3 +- hw/rtl/VX_csr_pipe.v | 99 ++-- hw/rtl/VX_dcache_arb.v | 58 +- hw/rtl/VX_decode.v | 528 ++++++++---------- hw/rtl/VX_define.vh | 309 ++++++++-- hw/rtl/VX_exec_unit.v | 147 ----- hw/rtl/VX_execute.v | 140 +++++ hw/rtl/VX_fetch.v | 129 ++--- hw/rtl/VX_front_end.v | 116 ---- hw/rtl/VX_gpr_mux.v | 88 +++ hw/rtl/VX_gpr_ram.v | 96 ++-- hw/rtl/VX_gpr_stage.v | 342 +++++------- hw/rtl/VX_gpr_wrapper.v | 60 -- hw/rtl/VX_gpu_inst.v | 88 --- hw/rtl/VX_gpu_unit.v | 79 +++ hw/rtl/VX_icache_stage.v | 46 +- hw/rtl/VX_inst_multiplex.v | 90 --- hw/rtl/VX_issue.v | 87 +++ hw/rtl/VX_lsu_unit.v | 121 ++-- hw/rtl/VX_mem_unit.v | 24 +- hw/rtl/VX_mul_unit.v | 123 ++++ hw/rtl/VX_pipeline.v | 146 ++--- hw/rtl/VX_scheduler.v | 83 --- hw/rtl/VX_scope.vh | 28 +- hw/rtl/VX_warp.v | 4 +- hw/rtl/VX_warp_sched.v | 289 ++++------ hw/rtl/VX_writeback.v | 147 +++-- hw/rtl/cache/VX_bank.v | 20 +- hw/rtl/cache/VX_cache.v | 4 +- hw/rtl/cache/VX_cache_core_rsp_merge.v | 58 +- hw/rtl/cache/VX_snp_forwarder.v | 2 +- hw/rtl/cache/VX_tag_data_access.v | 4 +- hw/rtl/interfaces/VX_alu_req_if.v | 24 + hw/rtl/interfaces/VX_backend_req_if.v | 40 -- hw/rtl/interfaces/VX_branch_rsp_if.v | 15 - hw/rtl/interfaces/VX_csr_io_req_if.v | 10 +- hw/rtl/interfaces/VX_csr_req_if.v | 21 +- hw/rtl/interfaces/VX_decode_if.v | 33 ++ hw/rtl/interfaces/VX_exec_unit_req_if.v | 47 -- hw/rtl/interfaces/VX_execute_if.v | 33 ++ hw/rtl/interfaces/VX_gpr_read_if.v | 19 - hw/rtl/interfaces/VX_gpu_inst_req_if.v | 23 - hw/rtl/interfaces/VX_gpu_req_if.v | 21 + .../{VX_inst_meta_if.v 
=> VX_ifetch_req_if.v} | 8 +- hw/rtl/interfaces/VX_ifetch_rsp_if.v | 16 + hw/rtl/interfaces/VX_jal_rsp_if.v | 15 - hw/rtl/interfaces/VX_join_if.v | 1 - hw/rtl/interfaces/VX_lsu_req_if.v | 14 +- hw/rtl/interfaces/VX_mul_req_if.v | 24 + hw/rtl/interfaces/VX_warp_ctl_if.v | 13 +- hw/rtl/interfaces/VX_wb_if.v | 7 +- hw/rtl/libs/VX_countones.v | 4 +- hw/rtl/libs/VX_divide.v | 26 +- hw/rtl/libs/VX_generic_register.v | 27 +- ...{VX_indexable_queue.v => VX_index_queue.v} | 2 +- hw/rtl/libs/VX_matrix_arbiter.v | 2 +- ..._encoder_onehot.v => VX_onehot_encooder.v} | 2 +- hw/simulate/Makefile | 12 +- hw/simulate/simulator.cpp | 23 +- hw/simulate/simulator.h | 1 + hw/simulate/testbench.cpp | 22 +- hw/unit_tests/VX_divide_tb.v | 8 +- 71 files changed, 2242 insertions(+), 2379 deletions(-) delete mode 100644 hw/rtl/VX_back_end.v create mode 100644 hw/rtl/VX_branch_unit.v delete mode 100644 hw/rtl/VX_exec_unit.v create mode 100644 hw/rtl/VX_execute.v delete mode 100644 hw/rtl/VX_front_end.v create mode 100644 hw/rtl/VX_gpr_mux.v delete mode 100644 hw/rtl/VX_gpr_wrapper.v delete mode 100644 hw/rtl/VX_gpu_inst.v create mode 100644 hw/rtl/VX_gpu_unit.v delete mode 100644 hw/rtl/VX_inst_multiplex.v create mode 100644 hw/rtl/VX_issue.v create mode 100644 hw/rtl/VX_mul_unit.v delete mode 100644 hw/rtl/VX_scheduler.v create mode 100644 hw/rtl/interfaces/VX_alu_req_if.v delete mode 100644 hw/rtl/interfaces/VX_backend_req_if.v delete mode 100644 hw/rtl/interfaces/VX_branch_rsp_if.v create mode 100644 hw/rtl/interfaces/VX_decode_if.v delete mode 100644 hw/rtl/interfaces/VX_exec_unit_req_if.v create mode 100644 hw/rtl/interfaces/VX_execute_if.v delete mode 100644 hw/rtl/interfaces/VX_gpr_read_if.v delete mode 100644 hw/rtl/interfaces/VX_gpu_inst_req_if.v create mode 100644 hw/rtl/interfaces/VX_gpu_req_if.v rename hw/rtl/interfaces/{VX_inst_meta_if.v => VX_ifetch_req_if.v} (57%) create mode 100644 hw/rtl/interfaces/VX_ifetch_rsp_if.v delete mode 100644 hw/rtl/interfaces/VX_jal_rsp_if.v create 
mode 100644 hw/rtl/interfaces/VX_mul_req_if.v rename hw/rtl/libs/{VX_indexable_queue.v => VX_index_queue.v} (98%) rename hw/rtl/libs/{VX_encoder_onehot.v => VX_onehot_encooder.v} (94%) diff --git a/driver/opae/scope.cpp b/driver/opae/scope.cpp index 6053d353..0bbe4b10 100644 --- a/driver/opae/scope.cpp +++ b/driver/opae/scope.cpp @@ -116,9 +116,9 @@ static const scope_signal_t scope_signals[] = { { NUM_THREADS, "writeback_valid" }, { 1, "schedule_delay" }, - { 1, "memory_delay" }, + { 1, "mem_delay" }, { 1, "exec_delay" }, - { 1, "gpr_stage_delay" }, + { 1, "gpr_delay" }, { 1, "busy" }, { 1, "bank_valid_st0" }, diff --git a/driver/rtlsim/Makefile b/driver/rtlsim/Makefile index d0890c4c..38fe3d60 100644 --- a/driver/rtlsim/Makefile +++ b/driver/rtlsim/Makefile @@ -13,14 +13,15 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE -#DBG_FLAGS += $(DBG_PRINT_FLAGS) +DBG_FLAGS += $(DBG_PRINT_FLAGS) DBG_FLAGS += -DDBG_CORE_REQ_INFO #CONFIGS += -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1 #CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1 -#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=2 +#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 +CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=1 -#DEBUG=1 +DEBUG=1 #AFU=1 CFLAGS += -fPIC diff --git a/hw/rtl/VX_alu_unit.v b/hw/rtl/VX_alu_unit.v index d1b30016..51cc7c58 100644 --- a/hw/rtl/VX_alu_unit.v +++ b/hw/rtl/VX_alu_unit.v @@ -1,132 +1,66 @@ `include "VX_define.vh" -module VX_alu_unit ( - input wire clk, - input wire reset, - input wire [31:0] src_a, - input wire [31:0] src_b, - input wire src_rs2, - input wire [31:0] itype_immed, - input wire [19:0] upper_immed, - input wire [4:0] alu_op, - input wire [31:0] curr_PC, - output reg [31:0] alu_result, - output reg alu_stall -); - wire[31:0] div_result_unsigned; - wire[31:0] div_result_signed; - - wire[31:0] rem_result_unsigned; - wire[31:0] rem_result_signed; - - wire[63:0] mul_result; - - wire[31:0] alu_in1 = src_a; 
- wire[31:0] alu_in2 = (src_rs2 == `RS2_IMMED) ? itype_immed : src_b; - - wire[31:0] upper_immed_s = {upper_immed, {12{1'b0}}}; +module VX_alu_unit #( + parameter CORE_ID = 0 +) ( + input wire clk, + input wire reset, - reg [7:0] inst_delay; - reg [7:0] curr_inst_delay; - - always @(*) begin - case (alu_op) - `ALU_DIV, - `ALU_DIVU, - `ALU_REM, - `ALU_REMU: inst_delay = `DIV_LATENCY; - `ALU_MUL, - `ALU_MULH, - `ALU_MULHSU, - `ALU_MULHU: inst_delay = `MUL_LATENCY; - default: inst_delay = 0; - endcase + // Inputs + VX_alu_req_if alu_req_if, + + // Outputs + VX_wb_if alu_wb_if +); + reg [`NUM_THREADS-1:0][31:0] alu_result; // reg: assigned in the always @(*) block below + wire [`NUM_THREADS-1:0][32:0] sub_result; + wire [`NUM_THREADS-1:0][32:0] shift_result; + `UNUSED_VAR (shift_result); + + wire [`ALU_BITS-1:0] alu_op = alu_req_if.alu_op; + wire [`NUM_THREADS-1:0][31:0] alu_in1 = alu_req_if.rs1_data; + wire [`NUM_THREADS-1:0][31:0] alu_in2 = alu_req_if.rs2_data; + + genvar i; + + for (i = 0; i < `NUM_THREADS; i++) begin + + wire [32:0] sub_in1 = {(alu_op != `ALU_SLTU) & alu_in1[i][31], alu_in1[i]}; + wire [32:0] sub_in2 = {(alu_op != `ALU_SLTU) & alu_in2[i][31], alu_in2[i]}; + assign sub_result[i] = $signed(sub_in1) - $signed(sub_in2); + + wire [32:0] shift_in1 = {(alu_op == `ALU_SRA) & alu_in1[i][31], alu_in1[i]}; + assign shift_result[i] = $signed(shift_in1) >>> alu_in2[i][4:0]; + + always @(*) begin + case (alu_op) + `ALU_SUB: alu_result[i] = sub_result[i][31:0]; + `ALU_SLL: alu_result[i] = alu_in1[i] << alu_in2[i][4:0]; + `ALU_SLT, + `ALU_SLTU: alu_result[i] = 32'(sub_result[i][32]); + `ALU_XOR: alu_result[i] = alu_in1[i] ^ alu_in2[i]; + `ALU_SRL, + `ALU_SRA: alu_result[i] = shift_result[i][31:0]; + `ALU_OR: alu_result[i] = alu_in1[i] | alu_in2[i]; + `ALU_AND: alu_result[i] = alu_in1[i] & alu_in2[i]; + default: alu_result[i] = alu_in1[i] + alu_in2[i]; // ADD, LUI, AUIPC + endcase + end end - wire inst_stalled = (curr_inst_delay != inst_delay); + wire stall = ~alu_wb_if.ready && (| alu_wb_if.valid); - always
@(posedge clk) begin - if (reset) begin - curr_inst_delay <= 0; - end else begin - curr_inst_delay <= inst_stalled ? (curr_inst_delay + 1) : 0; - end - end + VX_generic_register #( + .N(`NUM_THREADS + `NW_BITS + 32 + `NR_BITS + `WB_BITS + (`NUM_THREADS * 32)) + ) alu_reg ( + .clk (clk), + .reset (reset), + .stall (stall), + .flush (0), + .in ({alu_req_if.valid, alu_req_if.warp_num, alu_req_if.curr_PC, alu_req_if.rd, alu_req_if.wb, alu_result}), + .out ({alu_wb_if.valid, alu_wb_if.warp_num, alu_wb_if.curr_PC, alu_wb_if.rd, alu_wb_if.wb, alu_wb_if.data}) + ); - assign alu_stall = inst_stalled; - - always @(*) begin - case (alu_op) - `ALU_ADD: alu_result = $signed(alu_in1) + $signed(alu_in2); - `ALU_SUB: alu_result = $signed(alu_in1) - $signed(alu_in2); - `ALU_SLLA: alu_result = alu_in1 << alu_in2[4:0]; - `ALU_SLT: alu_result = ($signed(alu_in1) < $signed(alu_in2)) ? 32'h1 : 32'h0; - `ALU_SLTU: alu_result = alu_in1 < alu_in2 ? 32'h1 : 32'h0; - `ALU_XOR: alu_result = alu_in1 ^ alu_in2; - `ALU_SRL: alu_result = alu_in1 >> alu_in2[4:0]; - `ALU_SRA: alu_result = $signed(alu_in1) >>> alu_in2[4:0]; - `ALU_OR: alu_result = alu_in1 | alu_in2; - `ALU_AND: alu_result = alu_in2 & alu_in1; - `ALU_SUBU: alu_result = (alu_in1 >= alu_in2) ? 32'h0 : 32'hffffffff; - `ALU_LUI: alu_result = upper_immed_s; - `ALU_AUIPC: alu_result = $signed(curr_PC) + $signed(upper_immed_s); - `ALU_MUL: alu_result = mul_result[31:0]; - `ALU_MULH: alu_result = mul_result[63:32]; - `ALU_MULHSU: alu_result = mul_result[63:32]; - `ALU_MULHU: alu_result = mul_result[63:32]; - `ALU_DIV: alu_result = (alu_in2 == 0) ? 32'hffffffff : div_result_signed; - `ALU_DIVU: alu_result = (alu_in2 == 0) ? 32'hffffffff : div_result_unsigned; - `ALU_REM: alu_result = (alu_in2 == 0) ? alu_in1 : rem_result_signed; - `ALU_REMU: alu_result = (alu_in2 == 0) ?
alu_in1 : rem_result_unsigned; - default: alu_result = 32'h0; - endcase - end - - VX_divide #( - .WIDTHN(32), - .WIDTHD(32), - .NSIGNED(0), - .DSIGNED(0), - .PIPELINE(`DIV_LATENCY) - ) udiv ( - .clk(clk), - .reset(reset), - .numer(alu_in1), - .denom(alu_in2), - .quotient(div_result_unsigned), - .remainder(rem_result_unsigned) - ); - - VX_divide #( - .WIDTHN(32), - .WIDTHD(32), - .NSIGNED(1), - .DSIGNED(1), - .PIPELINE(`DIV_LATENCY) - ) sdiv ( - .clk(clk), - .reset(reset), - .numer(alu_in1), - .denom(alu_in2), - .quotient(div_result_signed), - .remainder(rem_result_signed) - ); - - wire [32:0] mul_dataa = {(alu_op == `ALU_MULHU) ? 1'b0 : alu_in1[31], alu_in1}; - wire [32:0] mul_datab = {(alu_op == `ALU_MULHU || alu_op == `ALU_MULHSU) ? 1'b0 : alu_in2[31], alu_in2}; - - VX_mult #( - .WIDTHA(33), - .WIDTHB(33), - .WIDTHP(64), - .SIGNED(1), - .PIPELINE(`MUL_LATENCY) - ) multiplier ( - .clk(clk), - .reset(reset), - .dataa(mul_dataa), - .datab(mul_datab), - .result(mul_result) - ); + assign alu_req_if.ready = ~stall; endmodule \ No newline at end of file diff --git a/hw/rtl/VX_back_end.v b/hw/rtl/VX_back_end.v deleted file mode 100644 index 3da014ce..00000000 --- a/hw/rtl/VX_back_end.v +++ /dev/null @@ -1,171 +0,0 @@ -`include "VX_define.vh" - -module VX_back_end #( - parameter CORE_ID = 0 -) ( - `SCOPE_SIGNALS_LSU_IO - `SCOPE_SIGNALS_BE_IO - - input wire clk, - input wire reset, - - VX_csr_io_req_if csr_io_req_if, - VX_csr_io_rsp_if csr_io_rsp_if, - - input wire schedule_delay, - - VX_cache_core_req_if dcache_req_if, - VX_cache_core_rsp_if dcache_rsp_if, - VX_jal_rsp_if jal_rsp_if, - VX_branch_rsp_if branch_rsp_if, - - VX_backend_req_if bckE_req_if, - VX_wb_if writeback_if, - - VX_warp_ctl_if warp_ctl_if, - - output wire mem_delay, - output wire exec_delay, - output wire gpr_stage_delay, - - output wire ebreak -); - - wire no_slot_mem; - wire no_slot_exec; - - - // LSU input + output - VX_lsu_req_if lsu_req_if(); - VX_wb_if mem_wb_if(); - - // Exec unit input + output - 
VX_exec_unit_req_if exec_unit_req_if(); - VX_wb_if inst_exec_wb_if(); - - // GPU unit input - VX_gpu_inst_req_if gpu_inst_req_if(); - - // CSR unit inputs - VX_csr_req_if csr_req_if(); - VX_wb_if csr_wb_if(); - wire no_slot_csr; - wire stall_gpr_csr; - - VX_gpr_stage gpr_stage ( - .clk (clk), - .reset (reset), - .schedule_delay (schedule_delay), - .writeback_if (writeback_if), - .bckE_req_if (bckE_req_if), - // New - .exec_unit_req_if (exec_unit_req_if), - .lsu_req_if (lsu_req_if), - .gpu_inst_req_if (gpu_inst_req_if), - .csr_req_if (csr_req_if), - .stall_gpr_csr (stall_gpr_csr), - // End new - .memory_delay (mem_delay), - .exec_delay (exec_delay), - .delay (gpr_stage_delay) - ); - - assign ebreak = exec_unit_req_if.is_etype && (| exec_unit_req_if.valid); - - VX_lsu_unit #( - .CORE_ID(CORE_ID) - ) lsu_unit ( - `SCOPE_SIGNALS_LSU_BIND - - .clk (clk), - .reset (reset), - .lsu_req_if (lsu_req_if), - .mem_wb_if (mem_wb_if), - .dcache_req_if (dcache_req_if), - .dcache_rsp_if (dcache_rsp_if), - .delay (mem_delay), - .no_slot_mem (no_slot_mem) - ); - - VX_exec_unit exec_unit ( - .clk (clk), - .reset (reset), - .exec_unit_req_if(exec_unit_req_if), - .inst_exec_wb_if(inst_exec_wb_if), - .jal_rsp_if (jal_rsp_if), - .branch_rsp_if (branch_rsp_if), - .delay (exec_delay), - .no_slot_exec (no_slot_exec) - ); - - VX_gpu_inst gpu_inst ( - .gpu_inst_req_if(gpu_inst_req_if), - .warp_ctl_if (warp_ctl_if) - ); - - VX_csr_req_if issued_csr_req_if(); - - VX_wb_if csr_pipe_rsp_if(); - - VX_csr_arb csr_arb ( - .clk (clk), - .reset (reset), - - .csr_pipe_stall (stall_gpr_csr), - - .csr_core_req_if (csr_req_if), - .csr_io_req_if (csr_io_req_if), - .issued_csr_req_if(issued_csr_req_if), - - .csr_pipe_rsp_if (csr_pipe_rsp_if), - .csr_wb_if (csr_wb_if), - .csr_io_rsp_if (csr_io_rsp_if) - ); - - VX_csr_pipe #( - .CORE_ID(CORE_ID) - ) csr_pipe ( - .clk (clk), - .reset (reset), - .no_slot_csr (no_slot_csr), - .csr_req_if (issued_csr_req_if), - .writeback_if (writeback_if), - .csr_wb_if 
(csr_pipe_rsp_if), - .stall_gpr_csr (stall_gpr_csr) - ); - - VX_writeback writeback ( - .clk (clk), - .reset (reset), - .mem_wb_if (mem_wb_if), - .inst_exec_wb_if(inst_exec_wb_if), - .csr_wb_if (csr_wb_if), - - .writeback_if (writeback_if), - .no_slot_mem (no_slot_mem), - .no_slot_exec (no_slot_exec), - .no_slot_csr (no_slot_csr) - ); - - `SCOPE_ASSIGN(scope_decode_valid, bckE_req_if.valid); - `SCOPE_ASSIGN(scope_decode_warp_num, bckE_req_if.warp_num); - `SCOPE_ASSIGN(scope_decode_curr_PC, bckE_req_if.curr_PC); - `SCOPE_ASSIGN(scope_decode_is_jal, bckE_req_if.is_jal); - `SCOPE_ASSIGN(scope_decode_rs1, bckE_req_if.rs1); - `SCOPE_ASSIGN(scope_decode_rs2, bckE_req_if.rs2); - - `SCOPE_ASSIGN(scope_execute_valid, exec_unit_req_if.valid); - `SCOPE_ASSIGN(scope_execute_warp_num, exec_unit_req_if.warp_num); - `SCOPE_ASSIGN(scope_execute_curr_PC, exec_unit_req_if.curr_PC); - `SCOPE_ASSIGN(scope_execute_rd, exec_unit_req_if.rd); - `SCOPE_ASSIGN(scope_execute_a, exec_unit_req_if.a_reg_data); - `SCOPE_ASSIGN(scope_execute_b, exec_unit_req_if.b_reg_data); - - `SCOPE_ASSIGN(scope_writeback_valid, writeback_if.valid); - `SCOPE_ASSIGN(scope_writeback_warp_num, writeback_if.warp_num); - `SCOPE_ASSIGN(scope_writeback_curr_PC, writeback_if.curr_PC); - `SCOPE_ASSIGN(scope_writeback_wb, writeback_if.wb); - `SCOPE_ASSIGN(scope_writeback_rd, writeback_if.rd); - `SCOPE_ASSIGN(scope_writeback_data, writeback_if.data); - -endmodule diff --git a/hw/rtl/VX_branch_unit.v b/hw/rtl/VX_branch_unit.v new file mode 100644 index 00000000..d1bd8eca --- /dev/null +++ b/hw/rtl/VX_branch_unit.v @@ -0,0 +1,82 @@ +`include "VX_define.vh" + +module VX_branch_unit #( + parameter CORE_ID = 0 +) ( + input wire clk, + input wire reset, + + // Inputs + VX_branch_req_if branch_req_if, + + // Outputs + VX_branch_rsp_if branch_rsp_if, + VX_wb_if branch_wb_if +); + + wire [`NT_BITS-1:0] br_result_index; + + VX_priority_encoder #( + .N(`NUM_THREADS) + ) choose_alu_result ( + .data_in (branch_req_if.valid), + .data_out
(br_result_index), + `UNUSED_PIN (valid_out) + ); + + wire [`BR_BITS-1:0] br_op = branch_req_if.br_op; + wire [31:0] rs1_data = branch_req_if.rs1_data[br_result_index]; + wire [31:0] rs2_data = branch_req_if.rs2_data[br_result_index]; + + wire [32:0] sub_in1 = {(br_op != `BR_LTU) & (br_op != `BR_GEU) & rs1_data[31], rs1_data}; + wire [32:0] sub_in2 = {(br_op != `BR_LTU) & (br_op != `BR_GEU) & rs2_data[31], rs2_data}; + wire [32:0] sub_res = $signed(sub_in1) - $signed(sub_in2); + + wire sub_sign = sub_res[32]; + wire sub_nzero = (| sub_res[31:0]); + + reg br_taken; + always @(*) begin + case (br_op) + `BR_NE: br_taken = sub_nzero; + `BR_EQ: br_taken = ~sub_nzero; + `BR_LT, + `BR_LTU: br_taken = sub_sign; + `BR_GE, + `BR_GEU: br_taken = ~sub_sign; + default: br_taken = 1'b1; + endcase + end + + wire in_valid = (| branch_req_if.valid); + + wire [31:0] base_addr = (br_op == `BR_JALR) ? rs1_data : branch_req_if.curr_PC; + wire [31:0] br_dest = $signed(base_addr) + $signed(branch_req_if.offset); + + wire stall = (~branch_wb_if.ready && (| branch_wb_if.valid)); + + VX_generic_register #( + .N(1 + `NW_BITS + 1 + 32) + ) rsp_reg ( + .clk (clk), + .reset (reset), + .stall (stall), + .flush (0), + .in ({in_valid, branch_req_if.warp_num, br_taken, br_dest}), + .out ({branch_rsp_if.valid, branch_rsp_if.warp_num, branch_rsp_if.taken, branch_rsp_if.dest}) + ); + + VX_generic_register #( + .N(`NUM_THREADS + `NW_BITS + 32 + `NR_BITS + `WB_BITS + (`NUM_THREADS * 32)) + ) wb_reg ( + .clk (clk), + .reset (reset), + .stall (stall), + .flush (0), + .in ({branch_req_if.valid, branch_req_if.warp_num, branch_req_if.curr_PC, branch_req_if.rd, branch_req_if.wb, {`NUM_THREADS{branch_req_if.next_PC}}}), + .out ({branch_wb_if.valid, branch_wb_if.warp_num, branch_wb_if.curr_PC, branch_wb_if.rd, branch_wb_if.wb, branch_wb_if.data}) + ); + + assign branch_req_if.ready = ~stall; + +endmodule \ No newline at end of file diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 6a5f7386..ef4b9e78
100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -59,6 +59,10 @@ `define L3_ENABLE (`NUM_CLUSTERS > 1) `endif +`ifndef EXT_M_ENABLE +`define EXT_M_ENABLE 1 +`endif + // Configuration Values ======================================================= `define VENDOR_ID 0 @@ -85,6 +89,8 @@ `define CSR_INSTR_L 12'hC02 `define CSR_INSTR_H 12'hC82 +`define CSR_MISA 12'h301 + // Dcache Configurable Knobs ================================================== // Size of cache in bytes diff --git a/hw/rtl/VX_core.v b/hw/rtl/VX_core.v index c437c398..9695526b 100644 --- a/hw/rtl/VX_core.v +++ b/hw/rtl/VX_core.v @@ -250,7 +250,7 @@ module VX_core #( assign dcache_snp_req_if.addr = snp_req_addr; assign dcache_snp_req_if.invalidate = snp_req_invalidate; assign dcache_snp_req_if.tag = snp_req_tag; - assign snp_req_ready = dcache_snp_req_if.ready; + assign snp_req_ready = dcache_snp_req_if.ready; assign snp_rsp_valid = dcache_snp_rsp_if.valid; assign snp_rsp_tag = dcache_snp_rsp_if.tag; @@ -283,18 +283,20 @@ module VX_core #( .icache_dram_rsp_if (icache_dram_rsp_if) ); - // select io address + // select io bus wire is_io_addr = ({core_dcache_req_if.addr[0], 2'b0} >= `IO_BUS_BASE_ADDR); - wire io_select = (| core_dcache_req_if.valid) ? is_io_addr : 0; + wire io_req_select = (| core_dcache_req_if.valid) ? 
is_io_addr : 0; + wire io_rsp_select = (| arb_io_rsp_if.valid); - VX_dcache_arb dcache_io_arb ( - .req_select (io_select), - .in_core_req_if (core_dcache_req_if), - .out0_core_req_if (arb_dcache_req_if), - .out1_core_req_if (arb_io_req_if), - .in0_core_rsp_if (arb_dcache_rsp_if), - .in1_core_rsp_if (arb_io_rsp_if), - .out_core_rsp_if (core_dcache_rsp_if) + VX_dcache_arb dcache_io_arb ( + .core_req_in_if (core_dcache_req_if), + .core_req_out0_if (arb_dcache_req_if), + .core_req_out1_if (arb_io_req_if), + .core_rsp_in0_if (arb_dcache_rsp_if), + .core_rsp_in1_if (arb_io_rsp_if), + .core_rsp_out_if (core_dcache_rsp_if), + .select_req (io_req_select), + .select_rsp (io_rsp_select) ); endmodule diff --git a/hw/rtl/VX_csr_arb.v b/hw/rtl/VX_csr_arb.v index 58327ee3..2db5c7a8 100644 --- a/hw/rtl/VX_csr_arb.v +++ b/hw/rtl/VX_csr_arb.v @@ -1,51 +1,54 @@ `include "VX_define.vh" module VX_csr_arb ( - input wire clk, - input wire reset, - - input wire csr_pipe_stall, + input wire clk, + input wire reset, + // inputs VX_csr_req_if csr_core_req_if, VX_csr_io_req_if csr_io_req_if, - VX_csr_req_if issued_csr_req_if, - VX_wb_if csr_pipe_rsp_if, - VX_wb_if csr_wb_if, - VX_csr_io_rsp_if csr_io_rsp_if + // output + VX_csr_req_if csr_req_if, + + // input + VX_wb_if csr_rsp_if, + + // outputs + VX_csr_io_rsp_if csr_io_rsp_if, + VX_wb_if csr_wb_if ); `UNUSED_VAR (clk) `UNUSED_VAR (reset) - wire pick_core = (| csr_core_req_if.valid); + wire core_select = ~(| csr_io_req_if.valid); - // Mux between core and io - assign issued_csr_req_if.valid = pick_core ? csr_core_req_if.valid : {`NUM_THREADS{csr_io_req_if.valid}}; - assign issued_csr_req_if.is_csr = pick_core ? csr_core_req_if.is_csr : 1'b1; - assign issued_csr_req_if.alu_op = pick_core ? csr_core_req_if.alu_op : (csr_io_req_if.rw ? `ALU_CSR_RW : `ALU_CSR_RS); - assign issued_csr_req_if.csr_addr = pick_core ? csr_core_req_if.csr_addr : csr_io_req_if.addr; - assign issued_csr_req_if.csr_immed = pick_core ? 
csr_core_req_if.csr_immed : 0; - assign issued_csr_req_if.csr_mask = pick_core ? csr_core_req_if.csr_mask : (csr_io_req_if.rw ? csr_io_req_if.data : 32'b0); - assign issued_csr_req_if.is_io = !pick_core; - assign issued_csr_req_if.warp_num = csr_core_req_if.warp_num; - assign issued_csr_req_if.rd = csr_core_req_if.rd; - assign issued_csr_req_if.wb = csr_core_req_if.wb; + // requests + assign csr_req_if.valid = core_select ? csr_core_req_if.valid : {`NUM_THREADS{csr_io_req_if.valid}}; + assign csr_req_if.warp_num = core_select ? csr_core_req_if.warp_num : 0; + assign csr_req_if.curr_PC = core_select ? csr_core_req_if.curr_PC : 0; + assign csr_req_if.csr_op = core_select ? csr_core_req_if.csr_op : (csr_io_req_if.rw ? `CSR_RW : `CSR_RS); + assign csr_req_if.csr_addr = core_select ? csr_core_req_if.csr_addr : csr_io_req_if.addr; + assign csr_req_if.csr_mask = core_select ? csr_core_req_if.csr_mask : (csr_io_req_if.rw ? csr_io_req_if.data : 32'b0); + assign csr_req_if.rd = core_select ? csr_core_req_if.rd : 0; + assign csr_req_if.wb = core_select ? 
csr_core_req_if.wb : 0; + assign csr_req_if.is_io = ~core_select; - assign csr_io_req_if.ready = !(csr_pipe_stall || pick_core); - - // Core Writeback - assign csr_wb_if.valid = csr_pipe_rsp_if.valid & {`NUM_THREADS{~csr_pipe_rsp_if.is_io}}; - assign csr_wb_if.data = csr_pipe_rsp_if.data; - assign csr_wb_if.warp_num = csr_pipe_rsp_if.warp_num; - assign csr_wb_if.rd = csr_pipe_rsp_if.rd; - assign csr_wb_if.wb = csr_pipe_rsp_if.wb; - assign csr_wb_if.curr_PC = csr_pipe_rsp_if.curr_PC; + assign csr_core_req_if.ready = csr_req_if.ready && core_select; + assign csr_io_req_if.ready = csr_req_if.ready && ~core_select; - // CSR I/O response - assign csr_io_rsp_if.valid = csr_pipe_rsp_if.valid[0] & csr_pipe_rsp_if.is_io; - assign csr_io_rsp_if.data = csr_pipe_rsp_if.data[0]; - wire x = csr_io_rsp_if.ready; - `UNUSED_VAR(x) + // responses + assign csr_io_rsp_if.valid = csr_rsp_if.valid[0] & csr_rsp_if.is_io; + assign csr_io_rsp_if.data = csr_rsp_if.data[0]; + + assign csr_wb_if.valid = csr_rsp_if.valid & {`NUM_THREADS{~csr_rsp_if.is_io}}; + assign csr_wb_if.warp_num = csr_rsp_if.warp_num; + assign csr_wb_if.curr_PC = csr_rsp_if.curr_PC; + assign csr_wb_if.data = csr_rsp_if.data; + assign csr_wb_if.rd = csr_rsp_if.rd; + assign csr_wb_if.wb = csr_rsp_if.wb; + + assign csr_rsp_if.ready = csr_rsp_if.is_io ? 
csr_io_rsp_if.ready : csr_wb_if.ready; endmodule diff --git a/hw/rtl/VX_csr_data.v b/hw/rtl/VX_csr_data.v index 7488129e..cace905f 100644 --- a/hw/rtl/VX_csr_data.v +++ b/hw/rtl/VX_csr_data.v @@ -15,7 +15,7 @@ module VX_csr_data #( `IGNORE_WARNINGS_END input wire[`CSR_WIDTH-1:0] write_data, input wire[`NW_BITS-1:0] warp_num, - input wire wb_valid + input wire notify_commit ); reg [`CSR_WIDTH-1:0] csr_table[`NUM_CSRS-1:0]; @@ -35,7 +35,7 @@ module VX_csr_data #( csr_table[wr_addr] <= write_data; end num_cycles <= num_cycles + 1; - if (wb_valid) begin + if (notify_commit) begin num_instrs <= num_instrs + 1; end end @@ -57,6 +57,7 @@ module VX_csr_data #( `CSR_VEND_ID : read_data = `VENDOR_ID; `CSR_ARCH_ID : read_data = `ARCHITECTURE_ID; `CSR_IMPL_ID : read_data = `IMPLEMENTATION_ID; + `CSR_MISA : read_data = `ISA_CODE; default : read_data = 32'(csr_table[rd_addr]); endcase end diff --git a/hw/rtl/VX_csr_io_arb.v b/hw/rtl/VX_csr_io_arb.v index bf0d7041..5e6782da 100644 --- a/hw/rtl/VX_csr_io_arb.v +++ b/hw/rtl/VX_csr_io_arb.v @@ -2,7 +2,7 @@ module VX_csr_io_arb #( parameter NUM_REQUESTS = 1, - parameter REQS_BITS = `CLOG2(NUM_REQUESTS) + parameter REQS_BITS = `LOG2UP(NUM_REQUESTS) ) ( input wire clk, input wire reset, @@ -37,6 +37,7 @@ module VX_csr_io_arb #( `UNUSED_VAR (clk) `UNUSED_VAR (reset) + `UNUSED_VAR (request_id) assign out_csr_io_req_valid = in_csr_io_req_valid; assign out_csr_io_req_rw = in_csr_io_req_rw; diff --git a/hw/rtl/VX_csr_pipe.v b/hw/rtl/VX_csr_pipe.v index aa6980ab..a329d258 100644 --- a/hw/rtl/VX_csr_pipe.v +++ b/hw/rtl/VX_csr_pipe.v @@ -3,79 +3,86 @@ module VX_csr_pipe #( parameter CORE_ID = 0 ) ( - input wire clk, - input wire reset, - input wire no_slot_csr, - VX_csr_req_if csr_req_if, - VX_wb_if writeback_if, - VX_wb_if csr_wb_if, - output wire stall_gpr_csr + input wire clk, + input wire reset, + VX_csr_req_if csr_req_if, + VX_csr_io_req_if csr_io_req_if, + VX_wb_if csr_wb_if, + VX_csr_io_rsp_if csr_io_rsp_if, + input wire notify_commit 
); + VX_csr_req_if csr_pipe_req_if(); + VX_wb_if csr_pipe_wb_if(); - wire[`NUM_THREADS-1:0] valid_s2; - wire[`NW_BITS-1:0] warp_num_s2; - wire[4:0] rd_s2; - wire[1:0] wb_s2; - wire is_csr_s2; - wire[`CSR_ADDR_SIZE-1:0] csr_addr_s2; - wire[31:0] csr_read_data_s2; - wire[31:0] csr_updated_data_s2; + VX_csr_arb csr_arb ( + .clk (clk), + .reset (reset), + .csr_core_req_if (csr_req_if), + .csr_io_req_if (csr_io_req_if), + .csr_req_if (csr_pipe_req_if), + .csr_rsp_if (csr_pipe_wb_if), + .csr_io_rsp_if (csr_io_rsp_if), + .csr_wb_if (csr_wb_if) + ); - wire[31:0] csr_read_data_unqual; - wire[31:0] csr_read_data; + wire [`CSR_ADDR_SIZE-1:0] csr_addr_s2; + wire [31:0] csr_read_data_s2; + wire [31:0] csr_updated_data_s2; + wire [31:0] csr_read_data_unqual; + + wire is_csr_s2 = (| csr_pipe_wb_if.valid); VX_csr_data #( .CORE_ID(CORE_ID) ) csr_data ( .clk (clk), .reset (reset), - .read_addr (csr_req_if.csr_addr), + .read_addr (csr_pipe_req_if.csr_addr), .read_data (csr_read_data_unqual), .write_enable (is_csr_s2), .write_data (csr_updated_data_s2[`CSR_WIDTH-1:0]), .write_addr (csr_addr_s2), - .warp_num (csr_req_if.warp_num), - .wb_valid (| writeback_if.valid) + .warp_num (csr_pipe_req_if.warp_num), + .notify_commit (notify_commit) ); - wire car_hazard = (csr_addr_s2 == csr_req_if.csr_addr) & (warp_num_s2 == csr_req_if.warp_num) & |(valid_s2) & is_csr_s2; + wire csr_hazard = (csr_addr_s2 == csr_pipe_req_if.csr_addr) + && (csr_pipe_wb_if.warp_num == csr_pipe_req_if.warp_num) + && is_csr_s2; - assign csr_read_data = car_hazard ? csr_updated_data_s2 : csr_read_data_unqual; + wire [31:0] csr_read_data = csr_hazard ? 
csr_updated_data_s2 : csr_read_data_unqual; reg [31:0] csr_updated_data; always @(*) begin - case (csr_req_if.alu_op) - `ALU_CSR_RW: csr_updated_data = csr_req_if.csr_mask; - `ALU_CSR_RS: csr_updated_data = csr_read_data | csr_req_if.csr_mask; - `ALU_CSR_RC: csr_updated_data = csr_read_data & (32'hFFFFFFFF - csr_req_if.csr_mask); - default: csr_updated_data = 32'hdeadbeef; + case (csr_pipe_req_if.csr_op) + `CSR_RW: csr_updated_data = csr_pipe_req_if.csr_mask; + `CSR_RS: csr_updated_data = csr_read_data | csr_pipe_req_if.csr_mask; + `CSR_RC: csr_updated_data = csr_read_data & (32'hFFFFFFFF - csr_pipe_req_if.csr_mask); + default: csr_updated_data = 32'hdeadbeef; endcase - end + end + + wire stall = ~csr_pipe_wb_if.ready && (| csr_pipe_wb_if.valid); VX_generic_register #( - .N(32 + 32 + 12 + 1 + 1 + 2 + 5 + (`NW_BITS-1+1) + `NUM_THREADS) - ) csr_reg_s2 ( - .clk (clk), - .reset(reset), - .stall(no_slot_csr), - .flush(1'b0), - .in ({csr_req_if.valid, csr_req_if.warp_num, csr_req_if.rd, csr_req_if.wb, csr_req_if.is_csr, csr_req_if.csr_addr, csr_req_if.is_io, csr_read_data , csr_updated_data }), - .out ({valid_s2 , warp_num_s2 , rd_s2 , wb_s2 , is_csr_s2 , csr_addr_s2 , csr_wb_if.is_io , csr_read_data_s2, csr_updated_data_s2}) + .N(`NUM_THREADS + `NW_BITS + 32 + `NR_BITS + `WB_BITS + `CSR_ADDR_SIZE + 1 + 32 + 32) + ) csr_reg ( + .clk (clk), + .reset (reset), + .stall (stall), + .flush (0), + .in ({csr_pipe_req_if.valid, csr_pipe_req_if.warp_num, csr_pipe_req_if.curr_PC, csr_pipe_req_if.rd, csr_pipe_req_if.wb, csr_pipe_req_if.csr_addr, csr_pipe_req_if.is_io, csr_read_data, csr_updated_data}), + .out ({csr_pipe_wb_if.valid, csr_pipe_wb_if.warp_num, csr_pipe_wb_if.curr_PC, csr_pipe_wb_if.rd, csr_pipe_wb_if.wb, csr_addr_s2, csr_pipe_wb_if.is_io, csr_read_data_s2, csr_updated_data_s2}) ); - assign csr_wb_if.valid = valid_s2; - assign csr_wb_if.warp_num = warp_num_s2; - assign csr_wb_if.rd = rd_s2; - assign csr_wb_if.wb = wb_s2; - genvar i; for (i = 0; i < `NUM_THREADS; i++) 
begin - assign csr_wb_if.data[i] = (csr_addr_s2 == `CSR_LTID) ? i : - (csr_addr_s2 == `CSR_GTID) ? (csr_read_data_s2 * `NUM_THREADS + i) : - csr_read_data_s2; - end + assign csr_pipe_wb_if.data[i] = (csr_addr_s2 == `CSR_LTID) ? i : + (csr_addr_s2 == `CSR_GTID) ? (csr_read_data_s2 * `NUM_THREADS + i) : + csr_read_data_s2; + end - assign stall_gpr_csr = no_slot_csr && csr_req_if.is_csr && (| csr_req_if.valid); + assign csr_pipe_req_if.ready = ~stall; endmodule diff --git a/hw/rtl/VX_dcache_arb.v b/hw/rtl/VX_dcache_arb.v index 1431be60..4d493c60 100644 --- a/hw/rtl/VX_dcache_arb.v +++ b/hw/rtl/VX_dcache_arb.v @@ -1,48 +1,50 @@ `include "VX_define.vh" module VX_dcache_arb ( - input wire req_select, - // input request - VX_cache_core_req_if in_core_req_if, + VX_cache_core_req_if core_req_in_if, // output 0 request - VX_cache_core_req_if out0_core_req_if, + VX_cache_core_req_if core_req_out0_if, // output 1 request - VX_cache_core_req_if out1_core_req_if, + VX_cache_core_req_if core_req_out1_if, // input 0 response - VX_cache_core_rsp_if in0_core_rsp_if, + VX_cache_core_rsp_if core_rsp_in0_if, // input 1 response - VX_cache_core_rsp_if in1_core_rsp_if, + VX_cache_core_rsp_if core_rsp_in1_if, // output response - VX_cache_core_rsp_if out_core_rsp_if + VX_cache_core_rsp_if core_rsp_out_if, + + // bus select + input wire select_req, + input wire select_rsp ); - assign out0_core_req_if.valid = in_core_req_if.valid & {`NUM_THREADS{~req_select}}; - assign out0_core_req_if.rw = in_core_req_if.rw; - assign out0_core_req_if.byteen = in_core_req_if.byteen; - assign out0_core_req_if.addr = in_core_req_if.addr; - assign out0_core_req_if.data = in_core_req_if.data; - assign out0_core_req_if.tag = in_core_req_if.tag; + // select request + assign core_req_out0_if.valid = core_req_in_if.valid & {`NUM_THREADS{~select_req}}; + assign core_req_out0_if.rw = core_req_in_if.rw; + assign core_req_out0_if.byteen = core_req_in_if.byteen; + assign core_req_out0_if.addr = core_req_in_if.addr; + 
assign core_req_out0_if.data = core_req_in_if.data; + assign core_req_out0_if.tag = core_req_in_if.tag; - assign out1_core_req_if.valid = in_core_req_if.valid & {`NUM_THREADS{req_select}}; - assign out1_core_req_if.rw = in_core_req_if.rw; - assign out1_core_req_if.byteen = in_core_req_if.byteen; - assign out1_core_req_if.addr = in_core_req_if.addr; - assign out1_core_req_if.data = in_core_req_if.data; - assign out1_core_req_if.tag = in_core_req_if.tag; + assign core_req_out1_if.valid = core_req_in_if.valid & {`NUM_THREADS{select_req}}; + assign core_req_out1_if.rw = core_req_in_if.rw; + assign core_req_out1_if.byteen = core_req_in_if.byteen; + assign core_req_out1_if.addr = core_req_in_if.addr; + assign core_req_out1_if.data = core_req_in_if.data; + assign core_req_out1_if.tag = core_req_in_if.tag; - assign in_core_req_if.ready = req_select ? out1_core_req_if.ready : out0_core_req_if.ready; + assign core_req_in_if.ready = select_req ? core_req_out1_if.ready : core_req_out0_if.ready; - wire rsp_select0 = (| in0_core_rsp_if.valid); - - assign out_core_rsp_if.valid = rsp_select0 ? in0_core_rsp_if.valid : in1_core_rsp_if.valid; - assign out_core_rsp_if.data = rsp_select0 ? in0_core_rsp_if.data : in1_core_rsp_if.data; - assign out_core_rsp_if.tag = rsp_select0 ? in0_core_rsp_if.tag : in1_core_rsp_if.tag; - assign in0_core_rsp_if.ready = out_core_rsp_if.ready && rsp_select0; - assign in1_core_rsp_if.ready = out_core_rsp_if.ready && !rsp_select0; + // select response + assign core_rsp_out_if.valid = select_rsp ? core_rsp_in1_if.valid : core_rsp_in0_if.valid; + assign core_rsp_out_if.data = select_rsp ? core_rsp_in1_if.data : core_rsp_in0_if.data; + assign core_rsp_out_if.tag = select_rsp ? 
core_rsp_in1_if.tag : core_rsp_in0_if.tag; + assign core_rsp_in0_if.ready = core_rsp_out_if.ready && ~select_rsp; + assign core_rsp_in1_if.ready = core_rsp_out_if.ready && select_rsp; endmodule \ No newline at end of file diff --git a/hw/rtl/VX_decode.v b/hw/rtl/VX_decode.v index deb3f44b..44a84c88 100644 --- a/hw/rtl/VX_decode.v +++ b/hw/rtl/VX_decode.v @@ -1,321 +1,279 @@ `include "VX_define.vh" -module VX_decode( - // Fetch Inputs - VX_inst_meta_if fd_inst_meta_de, +module VX_decode #( + parameter CORE_ID = 0 +) ( + input wire clk, + input wire reset, - // Outputs - VX_backend_req_if frE_to_bckE_req_if, - VX_wstall_if wstall_if, - VX_join_if join_if + // inputs + VX_ifetch_rsp_if ifetch_rsp_if, + + // outputs + VX_decode_if decode_if, + VX_wstall_if wstall_if, + VX_join_if join_if ); - wire in_valid = (| fd_inst_meta_de.valid); - wire[31:0] in_instruction = fd_inst_meta_de.instruction; - wire[31:0] in_curr_PC = fd_inst_meta_de.curr_PC; - wire[`NW_BITS-1:0] in_warp_num = fd_inst_meta_de.warp_num; + wire in_valid = (| ifetch_rsp_if.valid); + wire [31:0] instr = ifetch_rsp_if.instr; - assign frE_to_bckE_req_if.curr_PC = in_curr_PC; + reg [`ALU_BITS-1:0] alu_op; + reg [`BR_BITS-1:0] br_op; + reg [`MUL_BITS-1:0] mul_op; + wire [`LSU_BITS-1:0] lsu_op; + reg [`CSR_BITS-1:0] csr_op; + reg [`GPU_BITS-1:0] gpu_op; - wire[6:0] curr_opcode; + reg [19:0] upper_imm; + reg [31:0] jalx_offset; + reg [31:0] src2_imm; - wire is_itype; - wire is_rtype; - wire is_stype; - wire is_btype; - wire is_linst; - wire is_jal; - wire is_jalr; - wire is_lui; - wire is_auipc; - wire is_csr; - wire is_csr_immed; - wire is_etype; + wire [6:0] opcode = instr[6:0]; + wire [2:0] func3 = instr[14:12]; + wire [6:0] func7 = instr[31:25]; + wire [11:0] u_12 = instr[31:20]; - wire is_gpgpu; - wire is_wspawn; - wire is_tmc; - wire is_split; - wire is_join; - wire is_barrier; + wire [`NR_BITS-1:0] rd = instr[11:7]; + wire [`NR_BITS-1:0] rs1 = instr[19:15]; + wire [`NR_BITS-1:0] rs2 = instr[24:20]; - 
wire[2:0] func3; - wire[6:0] func7; - wire[11:0] u_12; - - wire[7:0] jal_b_19_to_12; - wire jal_b_11; - wire[9:0] jal_b_10_to_1; - wire jal_b_20; - wire jal_b_0; - wire[20:0] jal_unsigned_offset; - wire[31:0] jal_1_offset; - - wire[11:0] jalr_immed; - wire[31:0] jal_2_offset; - - wire jal_sys_cond1; - wire jal_sys_cond2; - wire jal_sys_jal; - wire[31:0] jal_sys_off; - - wire csr_cond1; - wire csr_cond2; - - wire[11:0] alu_tempp; - wire alu_shift_i; - wire[11:0] alu_shift_i_immed; - - wire[1:0] csr_type; - - reg[4:0] csr_alu; - reg[4:0] alu_op; - reg[4:0] mul_alu; - reg[19:0] temp_upper_immed; - reg temp_jal; - reg[31:0] temp_jal_offset; - reg[31:0] temp_itype_immed; - reg[2:0] temp_branch_type; - reg temp_branch_stall; - - assign frE_to_bckE_req_if.valid = fd_inst_meta_de.valid; - - assign frE_to_bckE_req_if.warp_num = in_warp_num; - - assign curr_opcode = in_instruction[6:0]; - - assign frE_to_bckE_req_if.rd = in_instruction[11:7]; - assign frE_to_bckE_req_if.rs1 = in_instruction[19:15]; - assign frE_to_bckE_req_if.rs2 = in_instruction[24:20]; - assign func3 = in_instruction[14:12]; - assign func7 = in_instruction[31:25]; - assign u_12 = in_instruction[31:20]; - - assign frE_to_bckE_req_if.next_PC = in_curr_PC + 32'h4; - - // Write Back sigal - assign is_rtype = (curr_opcode == `INST_R); - assign is_linst = (curr_opcode == `INST_L); - assign is_itype = (curr_opcode == `INST_ALU) || is_linst; - assign is_stype = (curr_opcode == `INST_S); - assign is_btype = (curr_opcode == `INST_B); - assign is_jal = (curr_opcode == `INST_JAL); - assign is_jalr = (curr_opcode == `INST_JALR); - assign is_lui = (curr_opcode == `INST_LUI); - assign is_auipc = (curr_opcode == `INST_AUIPC); - assign is_csr = (curr_opcode == `INST_SYS) && (func3 != 0); - assign is_csr_immed = is_csr && (func3[2] == 1); - - assign is_gpgpu = (curr_opcode == `INST_GPGPU); - - assign is_tmc = is_gpgpu && (func3 == 0); // Goes to BE - assign is_wspawn = is_gpgpu && (func3 == 1); // Goes to BE - assign 
is_barrier = is_gpgpu && (func3 == 4); // Goes to BE - assign is_split = is_gpgpu && (func3 == 2); // Goes to BE - assign is_join = is_gpgpu && (func3 == 3); // Doesn't go to BE - - assign join_if.is_join = is_join && in_valid; - assign join_if.warp_num = in_warp_num; - - assign frE_to_bckE_req_if.is_wspawn = is_wspawn; - assign frE_to_bckE_req_if.is_tmc = is_tmc; - assign frE_to_bckE_req_if.is_split = is_split; - assign frE_to_bckE_req_if.is_barrier = is_barrier; - - assign frE_to_bckE_req_if.csr_immed = is_csr_immed; - assign frE_to_bckE_req_if.is_csr = is_csr; - - assign frE_to_bckE_req_if.wb = (is_jal || is_jalr || is_etype) ? `WB_JAL : - is_linst ? `WB_MEM : - (is_itype || is_rtype || is_lui || is_auipc || is_csr) ? `WB_ALU : - `WB_NO; - - assign frE_to_bckE_req_if.rs2_src = (is_itype || is_stype) ? `RS2_IMMED : `RS2_REG; - - // MEM signals - assign frE_to_bckE_req_if.mem_read = (is_linst) ? func3 : `BYTE_EN_NO; - assign frE_to_bckE_req_if.mem_write = (is_stype) ? func3 : `BYTE_EN_NO; - - // UPPER IMMEDIATE + // opcode types + wire is_rtype = (opcode == `INST_R); + wire is_ltype = (opcode == `INST_L); + wire is_itype = (opcode == `INST_I); + wire is_stype = (opcode == `INST_S); + wire is_btype = (opcode == `INST_B); + wire is_jal = (opcode == `INST_JAL); + wire is_jalr = (opcode == `INST_JALR); + wire is_lui = (opcode == `INST_LUI); + wire is_auipc = (opcode == `INST_AUIPC); + wire is_jals = (opcode == `INST_SYS) && (func3 == 0); + wire is_csr = (opcode == `INST_SYS) && (func3 != 0); + wire is_gpu = (opcode == `INST_GPU); + wire is_br = (is_btype || is_jal || is_jalr || is_jals); + wire is_mul = is_rtype && (func7 == 7'h1); + + // upper immediate always @(*) begin - case (curr_opcode) - `INST_LUI: temp_upper_immed = {func7, frE_to_bckE_req_if.rs2, frE_to_bckE_req_if.rs1, func3}; - `INST_AUIPC: temp_upper_immed = {func7, frE_to_bckE_req_if.rs2, frE_to_bckE_req_if.rs1, func3}; - default: temp_upper_immed = 20'h0; - endcase // curr_opcode - end - - assign 
frE_to_bckE_req_if.upper_immed = temp_upper_immed; - - assign jal_b_19_to_12 = in_instruction[19:12]; - assign jal_b_11 = in_instruction[20]; - assign jal_b_10_to_1 = in_instruction[30:21]; - assign jal_b_20 = in_instruction[31]; - assign jal_b_0 = 1'b0; - assign jal_unsigned_offset = {jal_b_20, jal_b_19_to_12, jal_b_11, jal_b_10_to_1, jal_b_0}; - assign jal_1_offset = {{11{jal_b_20}}, jal_unsigned_offset}; - - assign jalr_immed = {func7, frE_to_bckE_req_if.rs2}; - assign jal_2_offset = {{20{jalr_immed[11]}}, jalr_immed}; - - assign jal_sys_cond1 = (func3 == 3'h0); - assign jal_sys_cond2 = (u_12 < 12'h2); - - assign jal_sys_jal = (jal_sys_cond1 && jal_sys_cond2) ? 1'b1 : 1'b0; - assign jal_sys_off = (jal_sys_cond1 && jal_sys_cond2) ? 32'hb0000000 : 32'hdeadbeef; - - // JAL - always @(*) begin - case (curr_opcode) - `INST_JAL: - begin - temp_jal = in_valid; - temp_jal_offset = jal_1_offset; - end - `INST_JALR: - begin - temp_jal = in_valid; - temp_jal_offset = jal_2_offset; - end - `INST_SYS: - begin - // $display("SYS EBREAK %h", (jal_sys_jal && in_valid)); - temp_jal = jal_sys_jal && in_valid; - temp_jal_offset = jal_sys_off; - end - default: - begin - temp_jal = 1'b0; - temp_jal_offset = 32'hdeadbeef; - end + case (opcode) + `INST_LUI: upper_imm = {func7, rs2, rs1, func3}; + `INST_AUIPC: upper_imm = {func7, rs2, rs1, func3}; + default: upper_imm = 20'h0; endcase - end + end - assign frE_to_bckE_req_if.is_jal = is_jal; - assign frE_to_bckE_req_if.jal = temp_jal; - assign frE_to_bckE_req_if.jal_offset = temp_jal_offset; - - // ecall/ebreak - assign is_etype = (curr_opcode == `INST_SYS) && jal_sys_jal; - assign frE_to_bckE_req_if.is_etype = is_etype; - - // CSR - - assign csr_cond1 = func3 != 3'h0; - assign csr_cond2 = u_12 >= 12'h2; - - assign frE_to_bckE_req_if.csr_addr = (csr_cond1 && csr_cond2) ? 
u_12 : 12'h55; - - // ITYPE IMEED - assign alu_shift_i = (func3 == 3'h1) || (func3 == 3'h5); - assign alu_shift_i_immed = {{7{1'b0}}, frE_to_bckE_req_if.rs2}; - assign alu_tempp = alu_shift_i ? alu_shift_i_immed : u_12; - - always @(*) begin - case (curr_opcode) - `INST_ALU: temp_itype_immed = {{20{alu_tempp[11]}}, alu_tempp}; - `INST_S: temp_itype_immed = {{20{func7[6]}}, func7, frE_to_bckE_req_if.rd}; - `INST_L: temp_itype_immed = {{20{u_12[11]}}, u_12}; - `INST_B: temp_itype_immed = {{20{in_instruction[31]}}, in_instruction[31], in_instruction[7], in_instruction[30:25], in_instruction[11:8]}; - default: temp_itype_immed = 32'hdeadbeef; + // JAL + wire [20:0] jal_imm = {instr[31], instr[19:12], instr[20], instr[30:21], 1'b0}; + wire [31:0] jal_offset = {{11{jal_imm[20]}}, jal_imm}; + wire [11:0] jalr_imm = {func7, rs2}; + wire [31:0] jalr_offset = {{20{jalr_imm[11]}}, jalr_imm}; + always @(*) begin + case (opcode) + `INST_JAL: jalx_offset = jal_offset; + `INST_JALR: jalx_offset = jalr_offset; + default: jalx_offset = 32'd4; endcase - end - - assign frE_to_bckE_req_if.itype_immed = temp_itype_immed; + end + // I-type immediate + wire alu_shift_i = (func3 == 3'h1) || (func3 == 3'h5); + wire [11:0] alu_shift_imm = {{7{1'b0}}, rs2}; + wire [11:0] alu_imm = alu_shift_i ? 
alu_shift_imm : u_12; always @(*) begin - case (curr_opcode) + case (opcode) + `INST_I: src2_imm = {{20{alu_imm[11]}}, alu_imm}; + `INST_S: src2_imm = {{20{func7[6]}}, func7, rd}; + `INST_L: src2_imm = {{20{u_12[11]}}, u_12}; + `INST_B: src2_imm = {{20{instr[31]}}, instr[7], instr[30:25], instr[11:8], 1'b0}; + default: src2_imm = 32'hdeadbeef; + endcase + end + + // BRANCH + always @(*) begin + br_op = `BR_OTHER; + case (opcode) `INST_B: begin - // $display("BRANCH IN DECODE"); - temp_branch_stall = in_valid; case (func3) - 3'h0: temp_branch_type = `BR_EQ; - 3'h1: temp_branch_type = `BR_NE; - 3'h4: temp_branch_type = `BR_LT; - 3'h5: temp_branch_type = `BR_GT; - 3'h6: temp_branch_type = `BR_LTU; - 3'h7: temp_branch_type = `BR_GTU; - default: temp_branch_type = `BR_NO; + 3'h0: br_op = `BR_EQ; + 3'h1: br_op = `BR_NE; + 3'h4: br_op = `BR_LT; + 3'h5: br_op = `BR_GE; + 3'h6: br_op = `BR_LTU; + 3'h7: br_op = `BR_GEU; + default:; endcase end - `INST_JAL: begin - temp_branch_type = `BR_NO; - temp_branch_stall = in_valid; - end - `INST_JALR: begin - temp_branch_type = `BR_NO; - temp_branch_stall = in_valid; - end - default: begin - temp_branch_type = `BR_NO; - temp_branch_stall = 1'b0; + `INST_JAL: br_op = `BR_JAL; + `INST_JALR: br_op = `BR_JALR; + `INST_SYS: begin + if (is_jals && u_12 == 12'h000) br_op = `BR_ECALL; + if (is_jals && u_12 == 12'h001) br_op = `BR_EBREAK; + if (is_jals && u_12 == 12'h302) br_op = `BR_MRET; + if (is_jals && u_12 == 12'h102) br_op = `BR_SRET; + if (is_jals && u_12 == 12'h7B2) br_op = `BR_DRET; end + default:; endcase end - - assign frE_to_bckE_req_if.branch_type = temp_branch_type; - - assign wstall_if.wstall = (temp_branch_stall || is_tmc || is_split || is_barrier) && in_valid; - assign wstall_if.warp_num = in_warp_num; - + + // ALU always @(*) begin - // ALU OP + alu_op = `ALU_OTHER; + if (is_lui) begin + alu_op = `ALU_LUI; + end else if (is_auipc) begin + alu_op = `ALU_AUIPC; + end else if (is_itype || is_rtype) begin + case (func3) + 3'h0: 
alu_op = (is_rtype && func7 == 7'h20) ? `ALU_SUB : `ALU_ADD; + 3'h1: alu_op = `ALU_SLL; + 3'h2: alu_op = `ALU_SLT; + 3'h3: alu_op = `ALU_SLTU; + 3'h4: alu_op = `ALU_XOR; + 3'h5: alu_op = (func7 == 7'h0) ? `ALU_SRL : `ALU_SRA; + 3'h6: alu_op = `ALU_OR; + 3'h7: alu_op = `ALU_AND; + default:; + endcase + end + end + + // MUL + always @(*) begin + mul_op = `MUL_MUL; case (func3) - 3'h0: alu_op = (curr_opcode == `INST_ALU) ? `ALU_ADD : (func7 == 7'h0 ? `ALU_ADD : `ALU_SUB); - 3'h1: alu_op = `ALU_SLLA; - 3'h2: alu_op = `ALU_SLT; - 3'h3: alu_op = `ALU_SLTU; - 3'h4: alu_op = `ALU_XOR; - 3'h5: alu_op = (func7 == 7'h0) ? `ALU_SRL : `ALU_SRA; - 3'h6: alu_op = `ALU_OR; - 3'h7: alu_op = `ALU_AND; - default: alu_op = `ALU_NO; + 3'h0: mul_op = `MUL_MUL; + 3'h1: mul_op = `MUL_MULH; + 3'h2: mul_op = `MUL_MULHSU; + 3'h3: mul_op = `MUL_MULHU; + 3'h4: mul_op = `MUL_DIV; + 3'h5: mul_op = `MUL_DIVU; + 3'h6: mul_op = `MUL_REM; + 3'h7: mul_op = `MUL_REMU; + default:; endcase end + // LSU + wire is_lsu = (is_ltype || is_stype); + assign lsu_op = {is_stype, func3}; + + // CSR + wire is_csr_imm = is_csr && (func3[2] == 1); always @(*) begin - // ALU OP + csr_op = `CSR_OTHER; + case (func3[1:0]) + 2'h1: csr_op = `CSR_RW; + 2'h2: csr_op = `CSR_RS; + 2'h3: csr_op = `CSR_RC; + default:; + endcase + end + + // GPU + always @(*) begin + gpu_op = `GPU_OTHER; case (func3) - 3'h0: mul_alu = `ALU_MUL; - 3'h1: mul_alu = `ALU_MULH; - 3'h2: mul_alu = `ALU_MULHSU; - 3'h3: mul_alu = `ALU_MULHU; - 3'h4: mul_alu = `ALU_DIV; - 3'h5: mul_alu = `ALU_DIVU; - 3'h6: mul_alu = `ALU_REM; - 3'h7: mul_alu = `ALU_REMU; - default: mul_alu = `ALU_NO; + 3'h0: gpu_op = `GPU_TMC; + 3'h1: gpu_op = `GPU_WSPAWN; + 3'h2: gpu_op = `GPU_SPLIT; + 3'h3: gpu_op = `GPU_JOIN; + 3'h4: gpu_op = `GPU_BAR; + default:; endcase end - assign csr_type = func3[1:0]; + VX_decode_if decode_tmp_if(); - always @(*) begin - case (csr_type) - 2'h1: csr_alu = `ALU_CSR_RW; - 2'h2: csr_alu = `ALU_CSR_RS; - 2'h3: csr_alu = `ALU_CSR_RC; - default: 
csr_alu = `ALU_NO; - endcase + assign decode_tmp_if.valid = ifetch_rsp_if.valid; + assign decode_tmp_if.warp_num = ifetch_rsp_if.warp_num; + assign decode_tmp_if.curr_PC = ifetch_rsp_if.curr_PC; + assign decode_tmp_if.next_PC = ifetch_rsp_if.curr_PC + 32'h4; + + assign decode_tmp_if.ex_type = is_br ? `EX_BR : + is_lsu ? `EX_LSU : + is_csr ? `EX_CSR : + is_mul ? `EX_MUL : + is_gpu ? `EX_GPU : + (is_rtype || is_itype || is_lui || is_auipc) ? `EX_ALU : + `EX_NOP; + + assign decode_tmp_if.instr_op = is_br ? `OP_BITS'(br_op) : + is_lsu ? `OP_BITS'(lsu_op) : + is_csr ? `OP_BITS'(csr_op) : + is_mul ? `OP_BITS'(mul_op) : + is_gpu ? `OP_BITS'(gpu_op) : + (is_rtype || is_itype || is_lui || is_auipc) ? `OP_BITS'(alu_op) : + 0; + + assign decode_tmp_if.rd = rd; + + assign decode_tmp_if.rs1 = is_lui ? `NR_BITS'(0) : rs1; + + assign decode_tmp_if.rs2 = rs2; + + assign decode_tmp_if.imm = (is_lui || is_auipc) ? {upper_imm, 12'(0)} : + (is_jal || is_jalr || is_jals) ? jalx_offset : + is_csr ? 32'(u_12) : + src2_imm; + + assign decode_tmp_if.rs1_is_PC = is_auipc; + + assign decode_tmp_if.rs2_is_imm = is_itype || is_lui || is_auipc || is_csr_imm; + + assign decode_tmp_if.use_rs1 = (decode_tmp_if.rs1 != 0) + && (is_jalr || is_btype || is_ltype || is_stype || is_itype || is_rtype || (is_csr && ~is_csr_imm) || is_gpu); // CSR ops read rs1 only in register form; instr[19:15] is an immediate otherwise + + assign decode_tmp_if.use_rs2 = (decode_tmp_if.rs2 != 0) + && (is_btype || is_stype || is_rtype || (is_gpu && (gpu_op == `GPU_BAR || gpu_op == `GPU_WSPAWN))); + + assign decode_tmp_if.wb = (rd == 0) ? `WB_NO : // disable writeback to r0 + (is_itype || is_rtype || is_lui || is_auipc || is_csr) ? `WB_ALU : + (is_jal || is_jalr || is_jals) ? `WB_JAL : + is_ltype ? 
`WB_MEM : + `WB_NO; + + assign join_if.is_join = is_gpu && (gpu_op == `GPU_JOIN) && in_valid; + assign join_if.warp_num = ifetch_rsp_if.warp_num; + + assign wstall_if.wstall = (is_br || is_gpu) && in_valid; + assign wstall_if.warp_num = ifetch_rsp_if.warp_num; + + wire stall = ~decode_if.ready && (| decode_if.valid); + + VX_generic_register #( + .N(`NUM_THREADS + `NW_BITS + 32 + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + 1 + 1 + 1 + `EX_BITS + `OP_BITS + `WB_BITS) + ) decode_reg ( + .clk (clk), + .reset (reset), + .stall (stall), + .flush (0), + .in ({decode_tmp_if.valid, decode_tmp_if.warp_num, decode_tmp_if.curr_PC, decode_tmp_if.next_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.use_rs1, decode_tmp_if.use_rs2, decode_tmp_if.ex_type, decode_tmp_if.instr_op, decode_tmp_if.wb}), + .out ({decode_if.valid, decode_if.warp_num, decode_if.curr_PC, decode_if.next_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.use_rs1, decode_if.use_rs2, decode_if.ex_type, decode_if.instr_op, decode_if.wb}) + ); + + assign ifetch_rsp_if.ready = ~stall; + +`ifdef DBG_PRINT_PIPELINE + always @(posedge clk) begin + if ((| decode_tmp_if.valid) && ~stall) begin + $write("%t: Core%0d-Decode: warp=%0d, PC=%0h, ex=", $time, CORE_ID, decode_tmp_if.warp_num, decode_tmp_if.curr_PC); + print_ex_type(decode_tmp_if.ex_type); + $write(", op="); + print_instr_op(decode_tmp_if.ex_type, decode_tmp_if.instr_op); + $write(", wb="); + print_wb(decode_tmp_if.wb); + $write(", rd=%0d, rs1=%0d, rs2=%0d, imm=%0h, use_pc=%b, use_imm=%b, use_rs1=%b, use_rs2=%b\n", decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.use_rs1, decode_tmp_if.use_rs2); + + // trap unsupported instructions + assert(~(~stall && (decode_tmp_if.ex_type == `EX_ALU) && 
`ALU_OP(decode_tmp_if.instr_op) == `ALU_OTHER)); + assert(~(~stall && (decode_tmp_if.ex_type == `EX_BR) && `BR_OP(decode_tmp_if.instr_op) == `BR_OTHER)); + assert(~(~stall && (decode_tmp_if.ex_type == `EX_CSR) && `CSR_OP(decode_tmp_if.instr_op) == `CSR_OTHER)); + assert(~(~stall && (decode_tmp_if.ex_type == `EX_GPU) && `GPU_OP(decode_tmp_if.instr_op) == `GPU_OTHER)); + end end +`endif - wire[4:0] temp_final_alu; - - assign temp_final_alu = is_btype ? ((frE_to_bckE_req_if.branch_type < `BR_LTU) ? `ALU_SUB : `ALU_SUBU) : - is_lui ? `ALU_LUI : - is_auipc ? `ALU_AUIPC : - is_csr ? csr_alu : - (is_stype || is_linst) ? `ALU_ADD : - alu_op; - - assign frE_to_bckE_req_if.alu_op = ((func7[0] == 1'b1) && is_rtype) ? mul_alu : temp_final_alu; - -endmodule - - - - - - - - +endmodule \ No newline at end of file diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index 21da158e..2597b298 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -64,88 +64,159 @@ `define NC_BITS `LOG2UP(`NUM_CORES) +`define NB_BITS `LOG2UP(`NUM_BARRIERS) + `define REQS_BITS `LOG2UP(NUM_REQUESTS) -`define NUM_GPRS 32 +`define NUM_REGS 32 + +`define NR_BITS `LOG2UP(`NUM_REGS) `define CSR_ADDR_SIZE 12 `define CSR_WIDTH 12 -`define DIV_LATENCY 22 +`define DIV_LATENCY 2 `define MUL_LATENCY 2 /////////////////////////////////////////////////////////////////////////////// -`define BYTE_EN_NO 3'h7 -`define BYTE_EN_SB 3'h0 -`define BYTE_EN_SH 3'h1 -`define BYTE_EN_SW 3'h2 -`define BYTE_EN_UB 3'h4 -`define BYTE_EN_UH 3'h5 -`define BYTE_EN_BITS 3 +`define INST_LUI 7'b0110111 +`define INST_AUIPC 7'b0010111 +`define INST_JAL 7'b1101111 +`define INST_JALR 7'b1100111 +`define INST_B 7'b1100011 +`define INST_L 7'b0000011 +`define INST_S 7'b0100011 +`define INST_I 7'b0010011 +`define INST_R 7'b0110011 +`define INST_F 7'b0001111 +`define INST_SYS 7'b1110011 +`define INST_GPU 7'b1101011 -/////////////////////////////////////////////////////////////////////////////// +`define OP_BITS 4 -`define INST_R 7'd051 
-`define INST_L 7'd003 -`define INST_ALU 7'd019 -`define INST_S 7'd035 -`define INST_B 7'd099 -`define INST_LUI 7'd055 -`define INST_AUIPC 7'd023 -`define INST_JAL 7'd111 -`define INST_JALR 7'd103 -`define INST_SYS 7'd115 -`define INST_GPGPU 7'd107 +`define ALU_ADD 4'h0 +`define ALU_SUB 4'h1 +`define ALU_SLL 4'h2 +`define ALU_SRL 4'h3 +`define ALU_SRA 4'h4 +`define ALU_SLT 4'h5 +`define ALU_SLTU 4'h6 +`define ALU_XOR 4'h7 +`define ALU_OR 4'h8 +`define ALU_AND 4'h9 +`define ALU_LUI 4'hA +`define ALU_AUIPC 4'hB +`define ALU_OTHER 4'hF +`define ALU_BITS 4 +`define ALU_OP(x) x[`ALU_BITS-1:0] -`define RS2_IMMED 1 -`define RS2_REG 0 +`define MUL_MUL 3'h0 +`define MUL_MULH 3'h1 +`define MUL_MULHSU 3'h2 +`define MUL_MULHU 3'h3 +`define MUL_DIV 3'h4 +`define MUL_DIVU 3'h5 +`define MUL_REM 3'h6 +`define MUL_REMU 3'h7 +`define MUL_BITS 3 +`define MUL_OP(x) x[`MUL_BITS-1:0] +`define IS_DIV_OP(x) x[2] -`define BR_NO 3'h0 -`define BR_EQ 3'h1 -`define BR_NE 3'h2 -`define BR_LT 3'h3 -`define BR_GT 3'h4 -`define BR_LTU 3'h5 -`define BR_GTU 3'h6 +`define BR_EQ 4'h0 +`define BR_NE 4'h1 +`define BR_LT 4'h2 +`define BR_GE 4'h3 +`define BR_LTU 4'h4 +`define BR_GEU 4'h5 +`define BR_JAL 4'h6 +`define BR_JALR 4'h7 +`define BR_ECALL 4'h8 +`define BR_EBREAK 4'h9 +`define BR_MRET 4'hA +`define BR_SRET 4'hB +`define BR_DRET 4'hC +`define BR_OTHER 4'hF +`define BR_BITS 4 +`define BR_OP(x) x[`BR_BITS-1:0] -`define ALU_NO 5'd15 -`define ALU_ADD 5'd00 -`define ALU_SUB 5'd01 -`define ALU_SLLA 5'd02 -`define ALU_SLT 5'd03 -`define ALU_SLTU 5'd04 -`define ALU_XOR 5'd05 -`define ALU_SRL 5'd06 -`define ALU_SRA 5'd07 -`define ALU_OR 5'd08 -`define ALU_AND 5'd09 -`define ALU_SUBU 5'd10 -`define ALU_LUI 5'd11 -`define ALU_AUIPC 5'd12 -`define ALU_CSR_RW 5'd13 -`define ALU_CSR_RS 5'd14 -`define ALU_CSR_RC 5'd15 -`define ALU_MUL 5'd16 -`define ALU_MULH 5'd17 -`define ALU_MULHSU 5'd18 -`define ALU_MULHU 5'd19 -`define ALU_DIV 5'd20 -`define ALU_DIVU 5'd21 -`define ALU_REM 5'd22 -`define ALU_REMU 5'd23 
+`define BYTEEN_SB 3'h0 +`define BYTEEN_SH 3'h1 +`define BYTEEN_SW 3'h2 +`define BYTEEN_UB 3'h4 +`define BYTEEN_UH 3'h5 +`define BYTEEN_BITS 3 +`define LSU_BITS 4 +`define LSU_RW(x) x[3] +`define LSU_BE(x) x[2:0] + +`define CSR_RW 2'h0 +`define CSR_RS 2'h1 +`define CSR_RC 2'h2 +`define CSR_OTHER 2'h3 +`define CSR_BITS 2 +`define CSR_OP(x) x[`CSR_BITS-1:0] + +`define GPU_TMC 3'h0 +`define GPU_WSPAWN 3'h1 +`define GPU_SPLIT 3'h2 +`define GPU_JOIN 3'h3 +`define GPU_BAR 3'h4 +`define GPU_OTHER 3'h7 +`define GPU_BITS 3 +`define GPU_OP(x) x[`GPU_BITS-1:0] + +`define EX_NOP 3'h0 +`define EX_ALU 3'h1 +`define EX_BR 3'h2 +`define EX_MUL 3'h3 +`define EX_LSU 3'h4 +`define EX_FPU 3'h5 +`define EX_CSR 3'h6 +`define EX_GPU 3'h7 +`define EX_BITS 3 `define WB_NO 2'h0 `define WB_ALU 2'h1 `define WB_MEM 2'h2 `define WB_JAL 2'h3 +`define WB_BITS 2 /////////////////////////////////////////////////////////////////////////////// -`ifdef DBG_CORE_REQ_INFO // pc, wb, rd, warp_num -`define DEBUG_CORE_REQ_MDATA_WIDTH (32 + 2 + 5 + `NW_BITS) +`define ISA_CODE (0 << 0) // A - Atomic Instructions extension \ + | (0 << 1) // B - Tentatively reserved for Bit operations extension \ + | (0 << 2) // C - Compressed extension \ + | (0 << 3) // D - Double precision floating-point extension \ + | (0 << 4) // E - RV32E base ISA \ + | (0 << 5) // F - Single precision floating-point extension \ + | (0 << 6) // G - Additional standard extensions present \ + | (0 << 7) // H - Hypervisor mode implemented \ + | (1 << 8) // I - RV32I/64I/128I base ISA \ + | (0 << 9) // J - Reserved \ + | (0 << 10) // K - Reserved \ + | (0 << 11) // L - Tentatively reserved for Decimal Floating-Point extension \ + | (1 << 12) // M - Integer Multiply/Divide extension \ + | (0 << 13) // N - User level interrupts supported \ + | (0 << 14) // O - Reserved \ + | (0 << 15) // P - Tentatively reserved for Packed-SIMD extension \ + | (0 << 16) // Q - Quad-precision floating-point extension \ + | (0 << 17) // R - Reserved \ + | (0 << 18) // 
S - Supervisor mode implemented \ + | (0 << 19) // T - Tentatively reserved for Transactional Memory extension \ + | (1 << 20) // U - User mode implemented \ + | (0 << 21) // V - Tentatively reserved for Vector extension \ + | (0 << 22) // W - Reserved \ + | (1 << 23) // X - Non-standard extensions present \ + | (0 << 24) // Y - Reserved \ + | (0 << 25) // Z - Reserved + +/////////////////////////////////////////////////////////////////////////////// + +`ifdef DBG_CORE_REQ_INFO // pc, wb, rd, warp_num +`define DEBUG_CORE_REQ_MDATA_WIDTH (32 + `WB_BITS + `NR_BITS + `NW_BITS) `else `define DEBUG_CORE_REQ_MDATA_WIDTH 0 `endif @@ -288,9 +359,129 @@ `define VX_DRAM_TAG_WIDTH `L3DRAM_TAG_WIDTH `define VX_SNP_TAG_WIDTH `L3SNP_TAG_WIDTH `define VX_CORE_TAG_WIDTH `L3CORE_TAG_WIDTH -`define VX_CSR_ID_WIDTH `CLOG2(`NUM_CLUSTERS * `NUM_CORES) +`define VX_CSR_ID_WIDTH `LOG2UP(`NUM_CLUSTERS * `NUM_CORES) `define DRAM_TO_BYTE_ADDR(x) {x, (32-$bits(x))'(0)} - // VX_DEFINE +/////////////////////////////////////////////////////////////////////////////// + +task print_ex_type; + input [`EX_BITS-1:0] ex; + begin + case (ex) + `EX_ALU: $write("ALU"); + `EX_BR: $write("BR"); + `EX_LSU: $write("LSU"); + `EX_CSR: $write("CSR"); + `EX_MUL: $write("MUL"); + `EX_FPU: $write("FPU"); + `EX_GPU: $write("GPU"); + default: $write("NOP"); + endcase + end +endtask + +task print_instr_op; + input [`EX_BITS-1:0] ex; + input [`OP_BITS-1:0] op; + begin + case (ex) + `EX_ALU: begin + case (`ALU_BITS'(op)) + `ALU_ADD: $write("ADD"); + `ALU_SUB: $write("SUB"); + `ALU_SLL: $write("SLL"); + `ALU_SRL: $write("SRL"); + `ALU_SRA: $write("SRA"); + `ALU_SLT: $write("SLT"); + `ALU_SLTU: $write("SLTU"); + `ALU_XOR: $write("XOR"); + `ALU_OR: $write("OR"); + `ALU_AND: $write("AND"); + `ALU_LUI: $write("LUI"); + `ALU_AUIPC: $write("AUIPC"); + default: $write("?"); + endcase + end + `EX_BR: begin + case (`BR_BITS'(op)) + `BR_EQ: $write("EQ"); + `BR_NE: $write("NE"); + `BR_LT: $write("LT"); + `BR_GE: $write("GE"); + 
`BR_LTU: $write("LTU"); + `BR_GEU: $write("GEU"); + `BR_JAL: $write("JAL"); + `BR_JALR: $write("JALR"); + `BR_ECALL: $write("ECALL"); + `BR_EBREAK: $write("EBREAK"); + `BR_MRET: $write("MRET"); + `BR_SRET: $write("SRET"); + `BR_DRET: $write("DRET"); + default: $write("?"); + endcase + end + `EX_MUL: begin + case (`MUL_BITS'(op)) + `MUL_MUL: $write("MUL"); + `MUL_MULH: $write("MULH"); + `MUL_MULHSU: $write("MULHSU"); + `MUL_MULHU: $write("MULHU"); + `MUL_DIV: $write("DIV"); + `MUL_DIVU: $write("DIVU"); + `MUL_REM: $write("REM"); + `MUL_REMU: $write("REMU"); + default: $write("?"); + endcase + end + `EX_LSU: begin + case (`LSU_BITS'(op)) + 4'b0000: $write("LB"); + 4'b0001: $write("LH"); + 4'b0010: $write("LW"); + 4'b0100: $write("LBU"); + 4'b0101: $write("LHU"); + 4'b1000: $write("SB"); + 4'b1001: $write("SH"); + 4'b1010: $write("SW"); + 4'b1100: $write("SBU"); + 4'b1101: $write("SHU"); + default: $write("?"); + endcase + end + `EX_CSR: begin + case (`CSR_BITS'(op)) + `CSR_RW: $write("CSRW"); + `CSR_RS: $write("CSRS"); + `CSR_RC: $write("CSRC"); + default: $write("?"); + endcase + end + `EX_GPU: begin + case (`GPU_BITS'(op)) + `GPU_TMC: $write("TMC"); + `GPU_WSPAWN: $write("WSPAWN"); + `GPU_SPLIT: $write("SPLIT"); + `GPU_JOIN: $write("JOIN"); + `GPU_BAR: $write("BAR"); + default: $write("?"); + endcase + end + default:; + endcase + end +endtask + +task print_wb; + input [`WB_BITS-1:0] wb; + begin + case (wb) + `WB_ALU: $write("ALU"); + `WB_MEM: $write("MEM"); + `WB_JAL: $write("JAL"); + default: $write("NO"); + endcase + end +endtask + `endif diff --git a/hw/rtl/VX_exec_unit.v b/hw/rtl/VX_exec_unit.v deleted file mode 100644 index 0b4259b5..00000000 --- a/hw/rtl/VX_exec_unit.v +++ /dev/null @@ -1,147 +0,0 @@ -`include "VX_define.vh" - -module VX_exec_unit ( - input wire clk, - input wire reset, - // Request - VX_exec_unit_req_if exec_unit_req_if, - - // Output - VX_wb_if inst_exec_wb_if, - VX_jal_rsp_if jal_rsp_if, - VX_branch_rsp_if branch_rsp_if, - - input wire 
no_slot_exec, - output wire delay -); - - wire [`NUM_THREADS-1:0][31:0] in_a_reg_data; - wire [`NUM_THREADS-1:0][31:0] in_b_reg_data; - wire [4:0] in_alu_op; - wire in_rs2_src; - wire [31:0] in_itype_immed; -`DEBUG_BEGIN - wire [2:0] in_branch_type; -`DEBUG_END - wire [19:0] in_upper_immed; - wire in_jal; - wire [31:0] in_jal_offset; - wire [31:0] in_curr_PC; - - assign in_a_reg_data = exec_unit_req_if.a_reg_data; - assign in_b_reg_data = exec_unit_req_if.b_reg_data; - assign in_alu_op = exec_unit_req_if.alu_op; - assign in_rs2_src = exec_unit_req_if.rs2_src; - assign in_itype_immed = exec_unit_req_if.itype_immed; - assign in_branch_type = exec_unit_req_if.branch_type; - assign in_upper_immed = exec_unit_req_if.upper_immed; - assign in_jal = exec_unit_req_if.jal; - assign in_jal_offset = exec_unit_req_if.jal_offset; - assign in_curr_PC = exec_unit_req_if.curr_PC; - - wire [`NUM_THREADS-1:0][31:0] alu_result; - wire [`NUM_THREADS-1:0] alu_stall; - - genvar i; - generate - for (i = 0; i < `NUM_THREADS; i++) begin - VX_alu_unit alu_unit ( - .clk (clk), - .reset (reset), - .src_a (in_a_reg_data[i]), - .src_b (in_b_reg_data[i]), - .src_rs2 (in_rs2_src), - .itype_immed (in_itype_immed), - .upper_immed (in_upper_immed), - .alu_op (in_alu_op), - .curr_PC (in_curr_PC), - .alu_result (alu_result[i]), - .alu_stall (alu_stall[i]) - ); - end - endgenerate - - wire internal_stall = (| alu_stall); - - assign delay = no_slot_exec || internal_stall; - - wire [$clog2(`NUM_THREADS)-1:0] jal_branch_use_index; - - VX_priority_encoder #( - .N(`NUM_THREADS) - ) choose_alu_result ( - .data_in (exec_unit_req_if.valid), - .data_out (jal_branch_use_index), - `UNUSED_PIN (valid_out) - ); - - wire [31:0] branch_use_alu_result = alu_result[jal_branch_use_index]; - - reg temp_branch_dir; - always @(*) - begin - case (exec_unit_req_if.branch_type) - `BR_EQ: temp_branch_dir = (branch_use_alu_result == 0); - `BR_NE: temp_branch_dir = (branch_use_alu_result != 0); - `BR_LT: temp_branch_dir = 
(branch_use_alu_result[31] != 0); - `BR_GT: temp_branch_dir = (branch_use_alu_result[31] == 0); - `BR_LTU: temp_branch_dir = (branch_use_alu_result[31] != 0); - `BR_GTU: temp_branch_dir = (branch_use_alu_result[31] == 0); - `BR_NO: temp_branch_dir = 0; - default: temp_branch_dir = 0; - endcase // in_branch_type - end - - wire [`NUM_THREADS-1:0][31:0] duplicate_PC_data; - - generate - for (i = 0; i < `NUM_THREADS; i++) begin - assign duplicate_PC_data[i] = exec_unit_req_if.next_PC; - end - endgenerate - - VX_jal_rsp_if jal_rsp_temp_if(); - VX_branch_rsp_if branch_rsp_temp_if(); - - // Actual Writeback - assign inst_exec_wb_if.rd = exec_unit_req_if.rd; - assign inst_exec_wb_if.wb = exec_unit_req_if.wb; - assign inst_exec_wb_if.valid = exec_unit_req_if.valid & {`NUM_THREADS{!internal_stall}}; - assign inst_exec_wb_if.warp_num = exec_unit_req_if.warp_num; - assign inst_exec_wb_if.data = exec_unit_req_if.jal ? duplicate_PC_data : alu_result; - assign inst_exec_wb_if.curr_PC = in_curr_PC; - - // Jal rsp - assign jal_rsp_temp_if.valid = in_jal; - assign jal_rsp_temp_if.dest = $signed(in_a_reg_data[jal_branch_use_index]) + $signed(in_jal_offset); - assign jal_rsp_temp_if.warp_num = exec_unit_req_if.warp_num; - - // Branch rsp - assign branch_rsp_temp_if.valid = (exec_unit_req_if.branch_type != `BR_NO) && (| exec_unit_req_if.valid); - assign branch_rsp_temp_if.dir = temp_branch_dir; - assign branch_rsp_temp_if.warp_num = exec_unit_req_if.warp_num; - assign branch_rsp_temp_if.dest = $signed(exec_unit_req_if.curr_PC) + ($signed(exec_unit_req_if.itype_immed) << 1); // itype_immed = branch_offset - - VX_generic_register #( - .N(33 + `NW_BITS-1 + 1) - ) jal_reg ( - .clk (clk), - .reset (reset), - .stall (1'b0), - .flush (1'b0), - .in ({jal_rsp_temp_if.valid, jal_rsp_temp_if.dest, jal_rsp_temp_if.warp_num}), - .out ({jal_rsp_if.valid , jal_rsp_if.dest , jal_rsp_if.warp_num}) - ); - - VX_generic_register #( - .N(34 + `NW_BITS-1 + 1) - ) branch_reg ( - .clk (clk), - .reset (reset), 
- .stall (1'b0), - .flush (1'b0), - .in ({branch_rsp_temp_if.valid, branch_rsp_temp_if.dir, branch_rsp_temp_if.warp_num, branch_rsp_temp_if.dest}), - .out ({branch_rsp_if.valid , branch_rsp_if.dir , branch_rsp_if.warp_num , branch_rsp_if.dest }) - ); - -endmodule : VX_exec_unit \ No newline at end of file diff --git a/hw/rtl/VX_execute.v b/hw/rtl/VX_execute.v new file mode 100644 index 00000000..7b424431 --- /dev/null +++ b/hw/rtl/VX_execute.v @@ -0,0 +1,140 @@ +`include "VX_define.vh" + +module VX_execute #( + parameter CORE_ID = 0 +) ( + `SCOPE_SIGNALS_LSU_IO + `SCOPE_SIGNALS_BE_IO + + input wire clk, + input wire reset, + + // CSR io interface + VX_csr_io_req_if csr_io_req_if, + VX_csr_io_rsp_if csr_io_rsp_if, + + // Dcache interface + VX_cache_core_req_if dcache_req_if, + VX_cache_core_rsp_if dcache_rsp_if, + + // inputs + VX_execute_if execute_if, + VX_wb_if writeback_if, + + // outputs + VX_branch_rsp_if branch_rsp_if, + VX_warp_ctl_if warp_ctl_if, + VX_wb_if alu_wb_if, + VX_wb_if branch_wb_if, + VX_wb_if lsu_wb_if, + VX_wb_if csr_wb_if, + VX_wb_if mul_wb_if, + + input wire notify_commit, + output wire ebreak +); + VX_alu_req_if alu_req_if(); + VX_branch_req_if branch_req_if(); + VX_csr_req_if csr_req_if(); + VX_lsu_req_if lsu_req_if(); + VX_mul_req_if mul_req_if(); + VX_gpu_req_if gpu_req_if(); + + VX_gpr_stage #( + .CORE_ID(CORE_ID) + ) gpr_stage ( + .clk (clk), + .reset (reset), + .writeback_if (writeback_if), + .execute_if (execute_if), + .alu_req_if (alu_req_if), + .branch_req_if (branch_req_if), + .lsu_req_if (lsu_req_if), + .csr_req_if (csr_req_if), + .mul_req_if (mul_req_if), + .gpu_req_if (gpu_req_if) + ); + + VX_alu_unit #( + .CORE_ID(CORE_ID) + ) alu_unit ( + .clk (clk), + .reset (reset), + .alu_req_if (alu_req_if), + .alu_wb_if (alu_wb_if) + ); + + VX_branch_unit #( + .CORE_ID(CORE_ID) + ) branch_unit ( + .clk (clk), + .reset (reset), + .branch_req_if (branch_req_if), + .branch_rsp_if (branch_rsp_if), + .branch_wb_if (branch_wb_if) + ); + + 
VX_lsu_unit #( + .CORE_ID(CORE_ID) + ) lsu_unit ( + `SCOPE_SIGNALS_LSU_BIND + .clk (clk), + .reset (reset), + .dcache_req_if (dcache_req_if), + .dcache_rsp_if (dcache_rsp_if), + .lsu_req_if (lsu_req_if), + .lsu_wb_if (lsu_wb_if) + ); + + VX_csr_pipe #( + .CORE_ID(CORE_ID) + ) csr_pipe ( + .clk (clk), + .reset (reset), + .csr_req_if (csr_req_if), + .csr_io_req_if (csr_io_req_if), + .csr_wb_if (csr_wb_if), + .csr_io_rsp_if (csr_io_rsp_if), + .notify_commit (notify_commit) + ); + + VX_mul_unit #( + .CORE_ID(CORE_ID) + ) mul_unit ( + .clk (clk), + .reset (reset), + .mul_req_if (mul_req_if), + .mul_wb_if (mul_wb_if) + ); + + VX_gpu_unit #( + .CORE_ID(CORE_ID) + ) gpu_unit ( + .gpu_req_if (gpu_req_if), + .warp_ctl_if (warp_ctl_if) + ); + + assign ebreak = (| branch_req_if.valid) && (branch_req_if.br_op == `BR_EBREAK || branch_req_if.br_op == `BR_ECALL); + + `SCOPE_ASSIGN(scope_decode_valid, decode_if.valid); + `SCOPE_ASSIGN(scope_decode_warp_num, decode_if.warp_num); + `SCOPE_ASSIGN(scope_decode_curr_PC, decode_if.curr_PC); + `SCOPE_ASSIGN(scope_decode_is_jal, decode_if.is_jal); + `SCOPE_ASSIGN(scope_decode_rs1, decode_if.rs1); + `SCOPE_ASSIGN(scope_decode_rs2, decode_if.rs2); + + `SCOPE_ASSIGN(scope_execute_valid, alu_req_if.valid); + `SCOPE_ASSIGN(scope_execute_warp_num, alu_req_if.warp_num); + `SCOPE_ASSIGN(scope_execute_curr_PC, alu_req_if.curr_PC); + `SCOPE_ASSIGN(scope_execute_rd, alu_req_if.rd); + `SCOPE_ASSIGN(scope_execute_a, alu_req_if.rs1_data); + `SCOPE_ASSIGN(scope_execute_b, alu_req_if.rs2_data); + + `SCOPE_ASSIGN(scope_writeback_valid, writeback_if.valid); + `SCOPE_ASSIGN(scope_writeback_warp_num, writeback_if.warp_num); + `SCOPE_ASSIGN(scope_writeback_curr_PC, writeback_if.curr_PC); + `SCOPE_ASSIGN(scope_writeback_wb, writeback_if.wb); + `SCOPE_ASSIGN(scope_writeback_rd, writeback_if.rd); + `SCOPE_ASSIGN(scope_writeback_data, writeback_if.data); + +endmodule diff --git a/hw/rtl/VX_fetch.v b/hw/rtl/VX_fetch.v index 810f0317..2f38adac 100644 --- 
a/hw/rtl/VX_fetch.v +++ b/hw/rtl/VX_fetch.v @@ -1,99 +1,56 @@ `include "VX_define.vh" -module VX_fetch ( - input wire clk, - input wire reset, - VX_wstall_if wstall_if, - VX_join_if join_if, - input wire schedule_delay, - input wire icache_stage_delay, - input wire[`NW_BITS-1:0] icache_stage_wid, - input wire icache_stage_response, - output wire busy, - VX_jal_rsp_if jal_rsp_if, - VX_branch_rsp_if branch_rsp_if, - VX_inst_meta_if fe_inst_meta_fi, - VX_warp_ctl_if warp_ctl_if +module VX_fetch #( + parameter CORE_ID = 0 +) ( + input wire clk, + input wire reset, + + // Icache interface + VX_cache_core_req_if icache_req_if, + VX_cache_core_rsp_if icache_rsp_if, + + // inputs + VX_wstall_if wstall_if, + VX_join_if join_if, + VX_branch_rsp_if branch_rsp_if, + VX_warp_ctl_if warp_ctl_if, + + // outputs + VX_ifetch_rsp_if ifetch_rsp_if, + + output wire busy ); - wire[`NUM_THREADS-1:0] thread_mask; - wire[`NW_BITS-1:0] warp_num; - wire[31:0] warp_pc; - wire scheduled_warp; + VX_ifetch_req_if ifetch_req_if(); - wire pipe_stall = schedule_delay || icache_stage_delay; - - VX_warp_sched warp_sched ( + VX_warp_sched #( + .CORE_ID(CORE_ID) + ) warp_sched ( .clk (clk), - .reset (reset), - .stall (pipe_stall), + .reset (reset), + .warp_ctl_if (warp_ctl_if), + .wstall_if (wstall_if), + .join_if (join_if), + .branch_rsp_if (branch_rsp_if), + .ifetch_req_if (ifetch_req_if), + .ifetch_rsp_if (ifetch_rsp_if), + .busy (busy) + ); - .is_barrier (warp_ctl_if.is_barrier), - .barrier_id (warp_ctl_if.barrier_id), - .num_warps (warp_ctl_if.num_warps), - .barrier_warp_num (warp_ctl_if.warp_num), + VX_icache_stage #( + .CORE_ID(CORE_ID) + ) icache_stage ( + `SCOPE_SIGNALS_ISTAGE_BIND - // Wspawn - .wspawn (warp_ctl_if.wspawn), - .wsapwn_pc (warp_ctl_if.wspawn_pc), - .wspawn_new_active(warp_ctl_if.wspawn_new_active), + .clk (clk), + .reset (reset), - // CTM - .ctm (warp_ctl_if.change_mask), - .ctm_mask (warp_ctl_if.thread_mask), - .ctm_warp_num (warp_ctl_if.warp_num), + .icache_rsp_if 
(icache_rsp_if), + .icache_req_if (icache_req_if), - // WHALT - .whalt (warp_ctl_if.whalt), - .whalt_warp_num (warp_ctl_if.warp_num), - - // Wstall - .wstall (wstall_if.wstall), - .wstall_warp_num (wstall_if.warp_num), - - // Lock/release Stuff - .icache_stage_response(icache_stage_response), - .icache_stage_wid (icache_stage_wid), - - // Join - .is_join (join_if.is_join), - .join_warp_num (join_if.warp_num), - - // Split - .is_split (warp_ctl_if.is_split), - .dont_split (warp_ctl_if.dont_split), - .split_new_mask (warp_ctl_if.split_new_mask), - .split_later_mask (warp_ctl_if.split_later_mask), - .split_save_pc (warp_ctl_if.split_save_pc), - .split_warp_num (warp_ctl_if.warp_num), - - // JAL - .jal (jal_rsp_if.valid), - .dest (jal_rsp_if.dest), - .jal_warp_num (jal_rsp_if.warp_num), - - // Branch - .branch_valid (branch_rsp_if.valid), - .branch_dir (branch_rsp_if.dir), - .branch_dest (branch_rsp_if.dest), - .branch_warp_num (branch_rsp_if.warp_num), - - // Outputs - .thread_mask (thread_mask), - .warp_num (warp_num), - .warp_pc (warp_pc), - .busy (busy), - .scheduled_warp (scheduled_warp) + .ifetch_req_if (ifetch_req_if), + .ifetch_rsp_if (ifetch_rsp_if) ); - assign fe_inst_meta_fi.warp_num = warp_num; - assign fe_inst_meta_fi.valid = thread_mask; - assign fe_inst_meta_fi.instruction = 32'h0; - assign fe_inst_meta_fi.curr_PC = warp_pc; - -`DEBUG_BEGIN - wire start_mat_add = scheduled_warp && (warp_pc == 32'h80000ed8) && (warp_num == 0); - wire end_mat_add = scheduled_warp && (warp_pc == 32'h80000fbc) && (warp_num == 0); -`DEBUG_END - endmodule \ No newline at end of file diff --git a/hw/rtl/VX_front_end.v b/hw/rtl/VX_front_end.v deleted file mode 100644 index 8ceba8df..00000000 --- a/hw/rtl/VX_front_end.v +++ /dev/null @@ -1,116 +0,0 @@ -`include "VX_define.vh" - -module VX_front_end #( - parameter CORE_ID = 0 -) ( - `SCOPE_SIGNALS_ISTAGE_IO - - input wire clk, - input wire reset, - - input wire schedule_delay, - - VX_warp_ctl_if warp_ctl_if, - - 
VX_cache_core_rsp_if icache_rsp_if, - VX_cache_core_req_if icache_req_if, - - VX_jal_rsp_if jal_rsp_if, - VX_branch_rsp_if branch_rsp_if, - - VX_backend_req_if bckE_req_if, - output wire busy -); - - VX_inst_meta_if fe_inst_meta_fi(); - VX_inst_meta_if fe_inst_meta_fi2(); - VX_inst_meta_if fe_inst_meta_id(); - - VX_backend_req_if frE_to_bckE_req_if(); - VX_inst_meta_if fd_inst_meta_de(); - - wire total_freeze = schedule_delay; - wire icache_stage_delay; - - wire[`NW_BITS-1:0] icache_stage_wid; - wire icache_stage_response; - - VX_wstall_if wstall_if(); - VX_join_if join_if(); - - VX_fetch fetch ( - .clk (clk), - .reset (reset), - .icache_stage_wid (icache_stage_wid), - .icache_stage_response(icache_stage_response), - .wstall_if (wstall_if), - .join_if (join_if), - .schedule_delay (schedule_delay), - .jal_rsp_if (jal_rsp_if), - .warp_ctl_if (warp_ctl_if), - .icache_stage_delay (icache_stage_delay), - .branch_rsp_if (branch_rsp_if), - .busy (busy), - .fe_inst_meta_fi (fe_inst_meta_fi) - ); - - VX_generic_register #( - .N(64+`NW_BITS-1+1+`NUM_THREADS) - ) f_d_reg ( - .clk (clk), - .reset (reset), - .stall (icache_stage_delay), - .flush (1'b0), - .in ({fe_inst_meta_fi.instruction, fe_inst_meta_fi.curr_PC, fe_inst_meta_fi.warp_num, fe_inst_meta_fi.valid}), - .out ({fe_inst_meta_fi2.instruction, fe_inst_meta_fi2.curr_PC, fe_inst_meta_fi2.warp_num, fe_inst_meta_fi2.valid}) - ); - - VX_icache_stage #( - .CORE_ID(CORE_ID) - ) icache_stage ( - `SCOPE_SIGNALS_ISTAGE_BIND - - .clk (clk), - .reset (reset), - .total_freeze (total_freeze), - .icache_stage_delay (icache_stage_delay), - .icache_stage_response(icache_stage_response), - .icache_stage_wid (icache_stage_wid), - .fe_inst_meta_fi (fe_inst_meta_fi2), - .fe_inst_meta_id (fe_inst_meta_id), - .icache_rsp_if (icache_rsp_if), - .icache_req_if (icache_req_if) - ); - - VX_generic_register #( - .N(64 + `NW_BITS-1 + 1 + `NUM_THREADS) - ) i_d_reg ( - .clk (clk), - .reset (reset), - .stall (total_freeze), - .flush (1'b0), - .in 
({fe_inst_meta_id.instruction, fe_inst_meta_id.curr_PC, fe_inst_meta_id.warp_num, fe_inst_meta_id.valid}), - .out ({fd_inst_meta_de.instruction, fd_inst_meta_de.curr_PC, fd_inst_meta_de.warp_num, fd_inst_meta_de.valid}) - ); - - VX_decode decode ( - .fd_inst_meta_de (fd_inst_meta_de), - .frE_to_bckE_req_if (frE_to_bckE_req_if), - .wstall_if (wstall_if), - .join_if (join_if) - ); - - VX_generic_register #( - .N(233 + `NW_BITS-1 + 1 + `NUM_THREADS) - ) d_e_reg ( - .clk (clk), - .reset (reset), - .stall (total_freeze), - .flush (1'b0), - .in ({frE_to_bckE_req_if.csr_addr, frE_to_bckE_req_if.is_jal, frE_to_bckE_req_if.is_etype, frE_to_bckE_req_if.is_csr, frE_to_bckE_req_if.csr_immed, frE_to_bckE_req_if.csr_mask, frE_to_bckE_req_if.rd, frE_to_bckE_req_if.rs1, frE_to_bckE_req_if.rs2, frE_to_bckE_req_if.alu_op, frE_to_bckE_req_if.wb, frE_to_bckE_req_if.rs2_src, frE_to_bckE_req_if.itype_immed, frE_to_bckE_req_if.mem_read, frE_to_bckE_req_if.mem_write, frE_to_bckE_req_if.branch_type, frE_to_bckE_req_if.upper_immed, frE_to_bckE_req_if.curr_PC, frE_to_bckE_req_if.jal, frE_to_bckE_req_if.jal_offset, frE_to_bckE_req_if.next_PC, frE_to_bckE_req_if.valid, frE_to_bckE_req_if.warp_num, frE_to_bckE_req_if.is_wspawn, frE_to_bckE_req_if.is_tmc, frE_to_bckE_req_if.is_split, frE_to_bckE_req_if.is_barrier}), - .out ({bckE_req_if.csr_addr , bckE_req_if.is_jal , bckE_req_if.is_etype ,bckE_req_if.is_csr , bckE_req_if.csr_immed , bckE_req_if.csr_mask , bckE_req_if.rd , bckE_req_if.rs1 , bckE_req_if.rs2 , bckE_req_if.alu_op , bckE_req_if.wb , bckE_req_if.rs2_src , bckE_req_if.itype_immed , bckE_req_if.mem_read , bckE_req_if.mem_write , bckE_req_if.branch_type , bckE_req_if.upper_immed , bckE_req_if.curr_PC , bckE_req_if.jal , bckE_req_if.jal_offset , bckE_req_if.next_PC , bckE_req_if.valid , bckE_req_if.warp_num , bckE_req_if.is_wspawn , bckE_req_if.is_tmc , bckE_req_if.is_split , bckE_req_if.is_barrier }) - ); - -endmodule - - diff --git a/hw/rtl/VX_gpr_mux.v b/hw/rtl/VX_gpr_mux.v new file 
mode 100644 index 00000000..6bec71da --- /dev/null +++ b/hw/rtl/VX_gpr_mux.v @@ -0,0 +1,88 @@ +`include "VX_define.vh" + +module VX_gpr_mux ( + // inputs + VX_execute_if execute_if, + input wire [`NUM_THREADS-1:0][31:0] rs1_data, + input wire [`NUM_THREADS-1:0][31:0] rs2_data, + + // outputs + VX_alu_req_if alu_req_if, + VX_branch_req_if branch_req_if, + VX_lsu_req_if lsu_req_if, + VX_csr_req_if csr_req_if, + VX_mul_req_if mul_req_if, + VX_gpu_req_if gpu_req_if +); + + wire[`NUM_THREADS-1:0] is_alu = {`NUM_THREADS{execute_if.ex_type == `EX_ALU}}; + wire[`NUM_THREADS-1:0] is_br = {`NUM_THREADS{execute_if.ex_type == `EX_BR}}; + wire[`NUM_THREADS-1:0] is_lsu = {`NUM_THREADS{execute_if.ex_type == `EX_LSU}}; + wire[`NUM_THREADS-1:0] is_csr = {`NUM_THREADS{execute_if.ex_type == `EX_CSR}}; + wire[`NUM_THREADS-1:0] is_mul = {`NUM_THREADS{execute_if.ex_type == `EX_MUL}}; + wire[`NUM_THREADS-1:0] is_gpu = {`NUM_THREADS{execute_if.ex_type == `EX_GPU}}; + + // ALU unit + assign alu_req_if.valid = execute_if.valid & is_alu; + assign alu_req_if.warp_num = execute_if.warp_num; + assign alu_req_if.curr_PC = execute_if.curr_PC; + assign alu_req_if.alu_op = `ALU_OP(execute_if.instr_op); + assign alu_req_if.rd = execute_if.rd; + assign alu_req_if.wb = execute_if.wb; + assign alu_req_if.rs1_data = rs1_data; + assign alu_req_if.rs2_data = rs2_data; + + // BR unit + assign branch_req_if.valid = execute_if.valid & is_br; + assign branch_req_if.warp_num = execute_if.warp_num; + assign branch_req_if.curr_PC = execute_if.curr_PC; + assign branch_req_if.br_op = `BR_OP(execute_if.instr_op); + assign branch_req_if.offset = execute_if.imm; + assign branch_req_if.next_PC = execute_if.next_PC; + assign branch_req_if.rs1_data = rs1_data; + assign branch_req_if.rs2_data = rs2_data; + assign branch_req_if.rd = execute_if.rd; + assign branch_req_if.wb = execute_if.wb; + + // LSU unit + assign lsu_req_if.valid = execute_if.valid & is_lsu; + assign lsu_req_if.warp_num = execute_if.warp_num; + assign 
lsu_req_if.curr_PC = execute_if.curr_PC; + assign lsu_req_if.base_addr = rs1_data; + assign lsu_req_if.store_data = rs2_data; + assign lsu_req_if.offset = execute_if.imm; + assign lsu_req_if.rw = `LSU_RW(execute_if.instr_op); + assign lsu_req_if.byteen = `LSU_BE(execute_if.instr_op); + assign lsu_req_if.rd = execute_if.rd; + assign lsu_req_if.wb = execute_if.wb; + + // CSR unit + assign csr_req_if.valid = execute_if.valid & is_csr; + assign csr_req_if.warp_num = execute_if.warp_num; + assign csr_req_if.curr_PC = execute_if.curr_PC; + assign csr_req_if.csr_op = `CSR_OP(execute_if.instr_op); + assign csr_req_if.csr_addr = execute_if.imm[`CSR_ADDR_SIZE-1:0]; + assign csr_req_if.csr_mask = execute_if.rs2_is_imm ? 32'(execute_if.rs1) : rs1_data[0]; + assign csr_req_if.rd = execute_if.rd; + assign csr_req_if.wb = execute_if.wb; + assign csr_req_if.is_io = 1'b0; + + // MUL unit + assign mul_req_if.valid = execute_if.valid & is_mul; + assign mul_req_if.warp_num = execute_if.warp_num; + assign mul_req_if.curr_PC = execute_if.curr_PC; + assign mul_req_if.mul_op = `MUL_OP(execute_if.instr_op); + assign mul_req_if.rs1_data = rs1_data; + assign mul_req_if.rs2_data = rs2_data; + assign mul_req_if.rd = execute_if.rd; + assign mul_req_if.wb = execute_if.wb; + + // GPU unit + assign gpu_req_if.valid = execute_if.valid & is_gpu; + assign gpu_req_if.warp_num = execute_if.warp_num; + assign gpu_req_if.next_PC = execute_if.next_PC; + assign gpu_req_if.gpu_op = `GPU_OP(execute_if.instr_op); + assign gpu_req_if.rs1_data = rs1_data; + assign gpu_req_if.rs2_data = rs2_data[0]; + +endmodule \ No newline at end of file diff --git a/hw/rtl/VX_gpr_ram.v b/hw/rtl/VX_gpr_ram.v index d30b6a91..fa3fc004 100644 --- a/hw/rtl/VX_gpr_ram.v +++ b/hw/rtl/VX_gpr_ram.v @@ -1,81 +1,75 @@ `include "VX_define.vh" module VX_gpr_ram ( - input wire clk, - input wire reset, - input wire write_ce, - VX_gpr_read_if gpr_read_if, - VX_wb_if writeback_if, + input wire clk, + input wire [`NUM_THREADS-1:0] we, + input 
wire [`NR_BITS-1:0] waddr, + input wire [`NUM_THREADS-1:0][31:0] wdata, + input wire [`NR_BITS-1:0] rs1, + input wire [`NR_BITS-1:0] rs2, + output wire [`NUM_THREADS-1:0][31:0] rs1_data, + output wire [`NUM_THREADS-1:0][31:0] rs2_data +); + `ifndef ASIC - output wire [`NUM_THREADS-1:0][`NUM_GPRS-1:0] a_reg_data, - output wire [`NUM_THREADS-1:0][`NUM_GPRS-1:0] b_reg_data -); - wire [`NUM_THREADS-1:0][`NUM_GPRS-1:0] a_reg_data_unqual; - wire [`NUM_THREADS-1:0][`NUM_GPRS-1:0] b_reg_data_unqual; + reg [`NUM_THREADS-1:0][3:0][7:0] ram [31:0]; - assign a_reg_data = (gpr_read_if.rs1 != 0) ? a_reg_data_unqual : 0; - assign b_reg_data = (gpr_read_if.rs2 != 0) ? b_reg_data_unqual : 0; + integer i; - wire [`NUM_THREADS-1:0] write_enable = writeback_if.valid & {`NUM_THREADS{write_ce && (writeback_if.wb != 0)}}; - - `ifndef ASIC - `UNUSED_VAR(reset) - - reg [`NUM_THREADS-1:0][3:0][7:0] ram[31:0]; - - wire [4:0] waddr = writeback_if.rd; - wire [`NUM_THREADS-1:0][31:0] wdata = writeback_if.data; + initial begin + // initialize r0 to 0 for every thread (ram is indexed [register][thread][byte], same as the write path) + for (i = 0; i < `NUM_THREADS; i++) begin + ram[0][i][0] = 0; + ram[0][i][1] = 0; + ram[0][i][2] = 0; + ram[0][i][3] = 0; + end + end - genvar i; - for (i = 0; i < `NUM_THREADS; i++) begin - always @(posedge clk) begin - if (write_enable[i]) begin + always @(posedge clk) begin + for (i = 0; i < `NUM_THREADS; i++) begin + if (we[i]) begin ram[waddr][i][0] <= wdata[i][07:00]; ram[waddr][i][1] <= wdata[i][15:08]; ram[waddr][i][2] <= wdata[i][23:16]; ram[waddr][i][3] <= wdata[i][31:24]; end end + assert(~(|we) || (waddr != 0)); // ensure r0 is never written! 
end - assign a_reg_data_unqual = ram[gpr_read_if.rs1]; - assign b_reg_data_unqual = ram[gpr_read_if.rs2]; + assign rs1_data = ram[rs1]; + assign rs2_data = ram[rs2]; `else - wire going_to_write = write_enable & (| writeback_if.wb_valid); - wire [`NUM_THREADS-1:0][`NUM_GPRS-1:0] write_bit_mask; + wire [`NUM_THREADS-1:0][31:0] write_bit_mask; - genvar i; + integer i; for (i = 0; i < `NUM_THREADS; i++) begin - wire local_write = write_enable & writeback_if.wb_valid[i]; - assign write_bit_mask[i] = {`NUM_GPRS{~local_write}}; + assign write_bit_mask[i] = {32{~we[i]}}; end wire cenb = 0; wire cena_1 = 0; wire cena_2 = 0; - wire [`NUM_THREADS-1:0][`NUM_GPRS-1:0] tmp_a; - wire [`NUM_THREADS-1:0][`NUM_GPRS-1:0] tmp_b; + wire [`NUM_THREADS-1:0][31:0] tmp_a; + wire [`NUM_THREADS-1:0][31:0] tmp_b; `ifndef SYNTHESIS - genvar j; + integer j; for (i = 0; i < `NUM_THREADS; i++) begin - for (j = 0; j < `NUM_GPRS; j++) begin - assign a_reg_data_unqual[i][j] = ((tmp_a[i][j] === 1'dx) || cena_1) ? 1'b0 : tmp_a[i][j]; - assign b_reg_data_unqual[i][j] = ((tmp_b[i][j] === 1'dx) || cena_2) ? 1'b0 : tmp_b[i][j]; + for (j = 0; j < 32; j++) begin + assign rs1_data[i][j] = ((tmp_a[i][j] === 1'dx) || cena_1) ? 1'b0 : tmp_a[i][j]; + assign rs2_data[i][j] = ((tmp_b[i][j] === 1'dx) || cena_2) ? 
1'b0 : tmp_b[i][j]; end end `else - assign a_reg_data_unqual = tmp_a; - assign b_reg_data_unqual = tmp_b; + assign rs1_data = tmp_a; + assign rs2_data = tmp_b; `endif - - wire [`NUM_THREADS-1:0][`NUM_GPRS-1:0] to_write = writeback_if.write_data; - - for (i = 0; i < 'NT; i=i+4) - begin + for (i = 0; i < 'NT; i=i+4) begin `IGNORE_WARNINGS_BEGIN rf2_32x128_wm1 first_ram ( .CENYA(), @@ -88,12 +82,12 @@ module VX_gpr_ram ( .SOB(), .CLKA(clk), .CENA(cena_1), - .AA(gpr_read_if.rs1[(i+3):(i)]), + .AA(rs1[(i+3):(i)]), .CLKB(clk), .CENB(cenb), .WENB(write_bit_mask[(i+3):(i)]), - .AB(writeback_if.rd[(i+3):(i)]), - .DB(to_write[(i+3):(i)]), + .AB(waddr[(i+3):(i)]), + .DB(wdata[(i+3):(i)]), .EMAA(3'b011), .EMASA(1'b0), .EMAB(3'b011), @@ -125,12 +119,12 @@ module VX_gpr_ram ( .SOB(), .CLKA(clk), .CENA(cena_2), - .AA(gpr_read_if.rs2[(i+3):(i)]), + .AA(rs2[(i+3):(i)]), .CLKB(clk), .CENB(cenb), .WENB(write_bit_mask[(i+3):(i)]), - .AB(writeback_if.rd[(i+3):(i)]), - .DB(to_write[(i+3):(i)]), + .AB(waddr[(i+3):(i)]), + .DB(wdata[(i+3):(i)]), .EMAA(3'b011), .EMASA(1'b0), .EMAB(3'b011), diff --git a/hw/rtl/VX_gpr_stage.v b/hw/rtl/VX_gpr_stage.v index e0f5f865..85c40d02 100644 --- a/hw/rtl/VX_gpr_stage.v +++ b/hw/rtl/VX_gpr_stage.v @@ -1,226 +1,172 @@ `include "VX_define.vh" -module VX_gpr_stage ( - input wire clk, - input wire reset, - input wire schedule_delay, +module VX_gpr_stage #( + parameter CORE_ID = 0 +) ( + input wire clk, + input wire reset, - input wire memory_delay, - input wire exec_delay, - input wire stall_gpr_csr, - output wire delay, + // inputs + VX_execute_if execute_if, + VX_wb_if writeback_if, - // decodee inputs - VX_backend_req_if bckE_req_if, - - // WriteBack inputs - VX_wb_if writeback_if, - - // Outputs - VX_exec_unit_req_if exec_unit_req_if, - VX_lsu_req_if lsu_req_if, - VX_gpu_inst_req_if gpu_inst_req_if, - VX_csr_req_if csr_req_if + // outputs + VX_alu_req_if alu_req_if, + VX_branch_req_if branch_req_if, + VX_lsu_req_if lsu_req_if, + VX_csr_req_if 
csr_req_if, + VX_mul_req_if mul_req_if, + VX_gpu_req_if gpu_req_if ); -`DEBUG_BEGIN - wire[31:0] curr_PC = bckE_req_if.curr_PC; - wire[2:0] branchType = bckE_req_if.branch_type; - wire is_store = (bckE_req_if.mem_write != `BYTE_EN_NO); - wire is_load = (bckE_req_if.mem_read != `BYTE_EN_NO); - wire is_jal = bckE_req_if.is_jal; -`DEBUG_END + wire [`NUM_THREADS-1:0][31:0] rs1_data_all [`NUM_WARPS-1:0]; + wire [`NUM_THREADS-1:0][31:0] rs2_data_all [`NUM_WARPS-1:0]; + wire [`NUM_THREADS-1:0][31:0] rs1_data; + wire [`NUM_THREADS-1:0][31:0] rs2_data; + wire [`NUM_THREADS-1:0][31:0] rs1_PC; + wire [`NUM_THREADS-1:0][31:0] rs2_imm; + wire [`NUM_THREADS-1:0] we [`NUM_WARPS-1:0]; - assign csr_req_if.is_io = 1'b0; // GPR only issues csr requests coming from core + genvar i; - VX_gpr_read_if gpr_read_if(); - assign gpr_read_if.rs1 = bckE_req_if.rs1; - assign gpr_read_if.rs2 = bckE_req_if.rs2; - assign gpr_read_if.warp_num = bckE_req_if.warp_num; + for (i = 0; i < `NUM_THREADS; i++) begin + assign rs1_PC[i] = execute_if.curr_PC; + assign rs2_imm[i] = execute_if.imm; + end -`ifndef ASIC - assign gpr_read_if.is_jal = bckE_req_if.is_jal; - assign gpr_read_if.curr_PC = bckE_req_if.curr_PC; -`else - assign gpr_read_if.is_jal = exec_unit_req_if.is_jal; - assign gpr_read_if.curr_PC = exec_unit_req_if.curr_PC; -`endif + assign rs1_data = execute_if.rs1_is_PC ? rs1_PC : rs1_data_all[execute_if.warp_num]; + assign rs2_data = execute_if.rs2_is_imm ? 
rs2_imm : rs2_data_all[execute_if.warp_num]; - VX_gpr_wrapper grp_wrapper ( - .clk (clk), - .reset (reset), - .writeback_if (writeback_if), - .gpr_read_if (gpr_read_if) - ); + generate + for (i = 0; i < `NUM_WARPS; i++) begin + assign we[i] = writeback_if.valid & {`NUM_THREADS{(writeback_if.wb != 0) && (i == writeback_if.warp_num)}}; + VX_gpr_ram gpr_ram ( + .clk (clk), + .we (we[i]), + .waddr (writeback_if.rd), + .wdata (writeback_if.data), + .rs1 (execute_if.rs1), + .rs2 (execute_if.rs2), + .rs1_data (rs1_data_all[i]), + .rs2_data (rs2_data_all[i]) + ); + end + endgenerate - // Outputs - VX_exec_unit_req_if exec_unit_req_temp_if(); - VX_lsu_req_if lsu_req_temp_if(); - VX_gpu_inst_req_if gpu_inst_req_temp_if(); - VX_csr_req_if csr_req_temp_if(); + VX_alu_req_if alu_req_tmp_if(); + VX_branch_req_if branch_req_tmp_if(); + VX_lsu_req_if lsu_req_tmp_if(); + VX_csr_req_if csr_req_tmp_if(); + VX_mul_req_if mul_req_tmp_if(); + VX_gpu_req_if gpu_req_tmp_if(); - VX_inst_multiplex inst_mult( - .bckE_req_if (bckE_req_if), - .gpr_read_if (gpr_read_if), - .exec_unit_req_if (exec_unit_req_temp_if), - .lsu_req_if (lsu_req_temp_if), - .gpu_inst_req_if (gpu_inst_req_temp_if), - .csr_req_if (csr_req_temp_if) - ); + VX_gpr_mux gpr_mux ( + .execute_if (execute_if), + .rs1_data (rs1_data), + .rs2_data (rs2_data), + .alu_req_if (alu_req_tmp_if), + .branch_req_if (branch_req_tmp_if), + .lsu_req_if (lsu_req_tmp_if), + .csr_req_if (csr_req_tmp_if), + .mul_req_if (mul_req_tmp_if), + .gpu_req_if (gpu_req_tmp_if) + ); -`DEBUG_BEGIN - wire is_lsu = (| lsu_req_temp_if.valid); -`DEBUG_END - wire stall_rest = 0; - wire flush_rest = schedule_delay; + wire stall_alu = ~alu_req_if.ready && (| alu_req_if.valid); + wire stall_br = ~branch_req_if.ready && (| branch_req_if.valid); + wire stall_lsu = ~lsu_req_if.ready && (| lsu_req_if.valid); + wire stall_csr = ~csr_req_if.ready && (| csr_req_if.valid); + wire stall_mul = ~mul_req_if.ready && (| mul_req_if.valid); + wire stall_gpu = ~gpu_req_if.ready && (| 
gpu_req_if.valid); - wire stall_lsu = memory_delay; - wire flush_lsu = schedule_delay && !stall_lsu; - - wire stall_exec = exec_delay; - wire flush_exec = schedule_delay && !stall_exec; - - wire stall_csr = stall_gpr_csr && bckE_req_if.is_csr && (| bckE_req_if.valid); - - assign delay = stall_lsu || stall_exec || stall_csr; - -`ifdef ASIC - wire delayed_lsu_last_cycle; - - VX_generic_register #( - .N(1) - ) delayed_reg ( - .clk (clk), - .reset (reset), - .stall (stall_rest), - .flush (stall_rest), - .in (stall_lsu), - .out (delayed_lsu_last_cycle), - `UNUSED_PIN (size) - ); - - wire [`NUM_THREADS-1:0][31:0] temp_store_data; - wire [`NUM_THREADS-1:0][31:0] temp_base_addr; // A reg data - - wire [`NUM_THREADS-1:0][31:0] real_store_data; - wire [`NUM_THREADS-1:0][31:0] real_base_addr; // A reg data - - wire store_curr_real = !delayed_lsu_last_cycle && stall_lsu; - - VX_generic_register #( - .N(`NUM_THREADS*32*2) - ) lsu_data ( - .clk (clk), - .reset (reset), - .stall (!store_curr_real), - .flush (stall_rest), - .in ({real_store_data, real_base_addr}), - .out ({temp_store_data, temp_base_addr}) - ); - - assign real_store_data = lsu_req_temp_if.store_data; - assign real_base_addr = lsu_req_temp_if.base_addr; - - assign lsu_req_if.store_data = (delayed_lsu_last_cycle) ? temp_store_data : real_store_data; - assign lsu_req_if.base_addr = (delayed_lsu_last_cycle) ? 
temp_base_addr : real_base_addr; - - VX_generic_register #( - .N(77 + `NW_BITS-1 + 1 + (`NUM_THREADS)) - ) lsu_reg ( - .clk (clk), - .reset (reset), - .stall (stall_lsu), - .flush (flush_lsu), - .in ({lsu_req_temp_if.valid, lsu_req_temp_if.curr_PC, lsu_req_temp_if.warp_num, lsu_req_temp_if.offset, lsu_req_temp_if.mem_read, lsu_req_temp_if.mem_write, lsu_req_temp_if.rd, lsu_req_temp_if.wb}), - .out ({lsu_req_if.valid , lsu_req_if.curr_PC ,lsu_req_if.warp_num , lsu_req_if.offset , lsu_req_if.mem_read , lsu_req_if.mem_write , lsu_req_if.rd , lsu_req_if.wb }) - ); - - VX_generic_register #( - .N(224 + `NW_BITS-1 + 1 + (`NUM_THREADS)) - ) exec_unit_reg ( - .clk (clk), - .reset (reset), - .stall (stall_exec), - .flush (flush_exec), - .in ({exec_unit_req_temp_if.valid, exec_unit_req_temp_if.warp_num, exec_unit_req_temp_if.curr_PC, exec_unit_req_temp_if.next_PC, exec_unit_req_temp_if.rd, exec_unit_req_temp_if.wb, exec_unit_req_temp_if.alu_op, exec_unit_req_temp_if.rs1, exec_unit_req_temp_if.rs2, exec_unit_req_temp_if.rs2_src, exec_unit_req_temp_if.itype_immed, exec_unit_req_temp_if.upper_immed, exec_unit_req_temp_if.branch_type, exec_unit_req_temp_if.is_jal, exec_unit_req_temp_if.jal, exec_unit_req_temp_if.jal_offset, exec_unit_req_temp_if.is_etype, exec_unit_req_temp_if.wspawn, exec_unit_req_temp_if.is_csr, exec_unit_req_temp_if.csr_addr, exec_unit_req_temp_if.csr_immed, exec_unit_req_temp_if.csr_mask}), - .out ({exec_unit_req_if.valid , exec_unit_req_if.warp_num , exec_unit_req_if.curr_PC , exec_unit_req_if.next_PC , exec_unit_req_if.rd , exec_unit_req_if.wb , exec_unit_req_if.alu_op , exec_unit_req_if.rs1 , exec_unit_req_if.rs2 , exec_unit_req_if.rs2_src , exec_unit_req_if.itype_immed , exec_unit_req_if.upper_immed , exec_unit_req_if.branch_type , exec_unit_req_if.is_jal , exec_unit_req_if.jal , exec_unit_req_if.jal_offset , exec_unit_req_if.is_etype , exec_unit_req_if.wspawn , exec_unit_req_if.is_csr , exec_unit_req_if.csr_addr , exec_unit_req_if.csr_immed , 
exec_unit_req_if.csr_mask }) - ); - - assign exec_unit_req_if.a_reg_data = real_base_addr; - assign exec_unit_req_if.b_reg_data = real_store_data; - - VX_generic_register #( - .N(36 + `NW_BITS-1 + 1 + (`NUM_THREADS)) - ) gpu_inst_reg ( - .clk (clk), - .reset (reset), - .stall (stall_rest), - .flush (flush_rest), - .in ({gpu_inst_req_temp_if.valid, gpu_inst_req_temp_if.warp_num, gpu_inst_req_temp_if.is_wspawn, gpu_inst_req_temp_if.is_tmc, gpu_inst_req_temp_if.is_split, gpu_inst_req_temp_if.is_barrier, gpu_inst_req_temp_if.next_PC}), - .out ({gpu_inst_req_if.valid , gpu_inst_req_if.warp_num , gpu_inst_req_if.is_wspawn , gpu_inst_req_if.is_tmc , gpu_inst_req_if.is_split , gpu_inst_req_if.is_barrier , gpu_inst_req_if.next_PC }) - ); - - assign gpu_inst_req_if.a_reg_data = real_base_addr; - assign gpu_inst_req_if.rd2 = real_store_data; - - VX_generic_register #( - .N(`NW_BITS-1 + 1 + `NUM_THREADS + 58) - ) csr_reg ( - .clk (clk), - .reset (reset), - .stall (stall_gpr_csr), - .flush (flush_rest), - .in ({csr_req_temp_if.valid, csr_req_temp_if.warp_num, csr_req_temp_if.rd, csr_req_temp_if.wb, csr_req_temp_if.alu_op, csr_req_temp_if.is_csr, csr_req_temp_if.csr_addr, csr_req_temp_if.csr_immed, csr_req_temp_if.csr_mask}), - .out ({csr_req_if.valid , csr_req_if.warp_num , csr_req_if.rd , csr_req_if.wb , csr_req_if.alu_op , csr_req_if.is_csr , csr_req_if.csr_addr , csr_req_if.csr_immed , csr_req_if.csr_mask }) - ); - - -`else - - // 341 VX_generic_register #( - .N(77 + `NW_BITS-1 + 1 + 65*(`NUM_THREADS)) + .N(`NUM_THREADS +`NW_BITS + 32 + `ALU_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + `NR_BITS + `WB_BITS) + ) alu_reg ( + .clk (clk), + .reset (reset), + .stall (stall_alu), + .flush (0), + .in ({alu_req_tmp_if.valid, alu_req_tmp_if.warp_num, alu_req_tmp_if.curr_PC, alu_req_tmp_if.alu_op, alu_req_tmp_if.rs1_data, alu_req_tmp_if.rs2_data, alu_req_tmp_if.rd, alu_req_tmp_if.wb}), + .out ({alu_req_if.valid, alu_req_if.warp_num, alu_req_if.curr_PC, alu_req_if.alu_op, 
alu_req_if.rs1_data, alu_req_if.rs2_data, alu_req_if.rd, alu_req_if.wb}) + ); + + VX_generic_register #( + .N(`NUM_THREADS +`NW_BITS + 32 + 32 + `BR_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + 32 + `NR_BITS + `WB_BITS) + ) br_reg ( + .clk (clk), + .reset (reset), + .stall (stall_br), + .flush (0), + .in ({branch_req_tmp_if.valid, branch_req_tmp_if.warp_num, branch_req_tmp_if.curr_PC, branch_req_tmp_if.next_PC, branch_req_tmp_if.br_op, branch_req_tmp_if.rs1_data, branch_req_tmp_if.rs2_data, branch_req_tmp_if.offset, branch_req_tmp_if.rd, branch_req_tmp_if.wb}), + .out ({branch_req_if.valid, branch_req_if.warp_num, branch_req_if.curr_PC, branch_req_if.next_PC, branch_req_if.br_op, branch_req_if.rs1_data, branch_req_if.rs2_data, branch_req_if.offset, branch_req_if.rd, branch_req_if.wb}) + ); + + VX_generic_register #( + .N(`NUM_THREADS + `NW_BITS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + 32 + 1 + `BYTEEN_BITS + `NR_BITS + `WB_BITS) ) lsu_reg ( .clk (clk), .reset (reset), .stall (stall_lsu), - .flush (flush_lsu), - .in ({lsu_req_temp_if.valid, lsu_req_temp_if.curr_PC, lsu_req_temp_if.warp_num, lsu_req_temp_if.store_data, lsu_req_temp_if.base_addr, lsu_req_temp_if.offset, lsu_req_temp_if.mem_read, lsu_req_temp_if.mem_write, lsu_req_temp_if.rd, lsu_req_temp_if.wb}), - .out ({lsu_req_if.valid , lsu_req_if.curr_PC , lsu_req_if.warp_num , lsu_req_if.store_data , lsu_req_if.base_addr , lsu_req_if.offset , lsu_req_if.mem_read , lsu_req_if.mem_write , lsu_req_if.rd , lsu_req_if.wb }) + .flush (0), + .in ({lsu_req_tmp_if.valid, lsu_req_tmp_if.warp_num, lsu_req_tmp_if.curr_PC, lsu_req_tmp_if.base_addr, lsu_req_tmp_if.store_data, lsu_req_tmp_if.offset, lsu_req_tmp_if.rw, lsu_req_tmp_if.byteen, lsu_req_tmp_if.rd, lsu_req_tmp_if.wb}), + .out ({lsu_req_if.valid, lsu_req_if.warp_num, lsu_req_if.curr_PC, lsu_req_if.base_addr, lsu_req_if.store_data, lsu_req_if.offset, lsu_req_if.rw, lsu_req_if.byteen, lsu_req_if.rd, lsu_req_if.wb}) ); VX_generic_register #( - .N(224 + 
`NW_BITS-1 + 1 + 65*(`NUM_THREADS)) - ) exec_unit_reg ( - .clk (clk), - .reset (reset), - .stall (stall_exec), - .flush (flush_exec), - .in ({exec_unit_req_temp_if.valid, exec_unit_req_temp_if.warp_num, exec_unit_req_temp_if.curr_PC, exec_unit_req_temp_if.next_PC, exec_unit_req_temp_if.rd, exec_unit_req_temp_if.wb, exec_unit_req_temp_if.a_reg_data, exec_unit_req_temp_if.b_reg_data, exec_unit_req_temp_if.alu_op, exec_unit_req_temp_if.rs1, exec_unit_req_temp_if.rs2, exec_unit_req_temp_if.rs2_src, exec_unit_req_temp_if.itype_immed, exec_unit_req_temp_if.upper_immed, exec_unit_req_temp_if.branch_type, exec_unit_req_temp_if.is_jal, exec_unit_req_temp_if.jal, exec_unit_req_temp_if.jal_offset, exec_unit_req_temp_if.is_etype, exec_unit_req_temp_if.wspawn, exec_unit_req_temp_if.is_csr, exec_unit_req_temp_if.csr_addr, exec_unit_req_temp_if.csr_immed, exec_unit_req_temp_if.csr_mask}), - .out ({exec_unit_req_if.valid , exec_unit_req_if.warp_num , exec_unit_req_if.curr_PC , exec_unit_req_if.next_PC , exec_unit_req_if.rd , exec_unit_req_if.wb , exec_unit_req_if.a_reg_data , exec_unit_req_if.b_reg_data , exec_unit_req_if.alu_op , exec_unit_req_if.rs1 , exec_unit_req_if.rs2 , exec_unit_req_if.rs2_src , exec_unit_req_if.itype_immed , exec_unit_req_if.upper_immed , exec_unit_req_if.branch_type , exec_unit_req_if.is_jal , exec_unit_req_if.jal , exec_unit_req_if.jal_offset , exec_unit_req_if.is_etype , exec_unit_req_if.wspawn , exec_unit_req_if.is_csr , exec_unit_req_if.csr_addr , exec_unit_req_if.csr_immed , exec_unit_req_if.csr_mask }) - ); - - VX_generic_register #( - .N(68 + `NW_BITS-1 + 1 + 33*(`NUM_THREADS)) - ) gpu_inst_reg ( - .clk (clk), - .reset (reset), - .stall (stall_rest), - .flush (flush_rest), - .in ({gpu_inst_req_temp_if.valid, gpu_inst_req_temp_if.warp_num, gpu_inst_req_temp_if.is_wspawn, gpu_inst_req_temp_if.is_tmc, gpu_inst_req_temp_if.is_split, gpu_inst_req_temp_if.is_barrier, gpu_inst_req_temp_if.next_PC, gpu_inst_req_temp_if.a_reg_data, 
gpu_inst_req_temp_if.rd2}), - .out ({gpu_inst_req_if.valid , gpu_inst_req_if.warp_num , gpu_inst_req_if.is_wspawn , gpu_inst_req_if.is_tmc , gpu_inst_req_if.is_split , gpu_inst_req_if.is_barrier , gpu_inst_req_if.next_PC , gpu_inst_req_if.a_reg_data , gpu_inst_req_if.rd2 }) - ); - - VX_generic_register #( - .N(`NW_BITS-1 + 1 + `NUM_THREADS + 58) + .N(`NUM_THREADS + `NW_BITS + 32 + `CSR_BITS + `CSR_ADDR_SIZE + 32 + 1 + `NR_BITS + `WB_BITS) ) csr_reg ( .clk (clk), .reset (reset), - .stall (stall_gpr_csr), - .flush (flush_rest), - .in ({csr_req_temp_if.valid, csr_req_temp_if.warp_num, csr_req_temp_if.rd, csr_req_temp_if.wb, csr_req_temp_if.alu_op, csr_req_temp_if.is_csr, csr_req_temp_if.csr_addr, csr_req_temp_if.csr_immed, csr_req_temp_if.csr_mask}), - .out ({csr_req_if.valid , csr_req_if.warp_num , csr_req_if.rd , csr_req_if.wb , csr_req_if.alu_op , csr_req_if.is_csr , csr_req_if.csr_addr , csr_req_if.csr_immed , csr_req_if.csr_mask }) + .stall (stall_csr), + .flush (0), + .in ({csr_req_tmp_if.valid, csr_req_tmp_if.warp_num, csr_req_tmp_if.curr_PC, csr_req_tmp_if.csr_op, csr_req_tmp_if.csr_addr, csr_req_tmp_if.csr_mask, csr_req_tmp_if.is_io, csr_req_tmp_if.rd, csr_req_tmp_if.wb}), + .out ({csr_req_if.valid, csr_req_if.warp_num, csr_req_if.curr_PC, csr_req_if.csr_op, csr_req_if.csr_addr, csr_req_if.csr_mask, csr_req_if.is_io, csr_req_if.rd, csr_req_if.wb}) ); + VX_generic_register #( + .N(`NUM_THREADS +`NW_BITS + 32 + `MUL_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + `NR_BITS + `WB_BITS) + ) mul_reg ( + .clk (clk), + .reset (reset), + .stall (stall_mul), + .flush (0), + .in ({mul_req_tmp_if.valid, mul_req_tmp_if.warp_num, mul_req_tmp_if.curr_PC, mul_req_tmp_if.mul_op, mul_req_tmp_if.rs1_data, mul_req_tmp_if.rs2_data, mul_req_tmp_if.rd, mul_req_tmp_if.wb}), + .out ({mul_req_if.valid, mul_req_if.warp_num, mul_req_if.curr_PC, mul_req_if.mul_op, mul_req_if.rs1_data, mul_req_if.rs2_data, mul_req_if.rd, mul_req_if.wb}) + ); + + VX_generic_register #( + .N(`NUM_THREADS 
+ `NW_BITS + 32 + `GPU_BITS + (`NUM_THREADS * 32) + 32) + ) gpu_reg ( + .clk (clk), + .reset (reset), + .stall (stall_gpu), + .flush (0), + .in ({gpu_req_tmp_if.valid, gpu_req_tmp_if.warp_num, gpu_req_tmp_if.next_PC, gpu_req_tmp_if.gpu_op, gpu_req_tmp_if.rs1_data, gpu_req_tmp_if.rs2_data}), + .out ({gpu_req_if.valid, gpu_req_if.warp_num, gpu_req_if.next_PC, gpu_req_if.gpu_op, gpu_req_if.rs1_data, gpu_req_if.rs2_data}) + ); + + assign execute_if.alu_ready = ~stall_alu; + assign execute_if.br_ready = ~stall_br; + assign execute_if.lsu_ready = ~stall_lsu; + assign execute_if.csr_ready = ~stall_csr; + assign execute_if.mul_ready = ~stall_mul; + assign execute_if.gpu_ready = ~stall_gpu; + + assign writeback_if.ready = 1'b1; + +`ifdef DBG_PRINT_PIPELINE + always @(posedge clk) begin + if ((| execute_if.valid)) begin + $display("%t: Core%0d-GPR: warp=%0d, PC=%0h, a=%0h, b=%0h", $time, CORE_ID, execute_if.warp_num, execute_if.curr_PC, rs1_data, rs2_data); + + // scheduler ensures the destination execute unit is ready (garanteed by the scheduler) + assert((execute_if.ex_type != `EX_ALU) || alu_req_if.ready); + assert((execute_if.ex_type != `EX_BR) || branch_req_if.ready); + assert((execute_if.ex_type != `EX_LSU) || lsu_req_if.ready); + assert((execute_if.ex_type != `EX_CSR) || csr_req_if.ready); + assert((execute_if.ex_type != `EX_MUL) || mul_req_if.ready); + assert((execute_if.ex_type != `EX_GPU) || gpu_req_if.ready); + end + end `endif -endmodule : VX_gpr_stage +endmodule diff --git a/hw/rtl/VX_gpr_wrapper.v b/hw/rtl/VX_gpr_wrapper.v deleted file mode 100644 index fe71f064..00000000 --- a/hw/rtl/VX_gpr_wrapper.v +++ /dev/null @@ -1,60 +0,0 @@ -`include "VX_define.vh" - -module VX_gpr_wrapper ( - input wire clk, - input wire reset, - VX_wb_if writeback_if, - VX_gpr_read_if gpr_read_if -); - wire [`NUM_WARPS-1:0][`NUM_THREADS-1:0][31:0] tmp_a_reg_data; - wire [`NUM_WARPS-1:0][`NUM_THREADS-1:0][31:0] tmp_b_reg_data; - wire [`NUM_THREADS-1:0][31:0] jal_data; - - genvar i; - 
generate - for (i = 0; i < `NUM_THREADS; i++) begin : jal_data_assign - assign jal_data[i] = gpr_read_if.curr_PC; - end - endgenerate - - `ifndef ASIC - assign gpr_read_if.a_reg_data = gpr_read_if.is_jal ? jal_data : tmp_a_reg_data[gpr_read_if.warp_num]; - assign gpr_read_if.b_reg_data = tmp_b_reg_data[gpr_read_if.warp_num]; - `else - - wire [`NW_BITS-1:0] old_warp_num; - - VX_generic_register #( - .N(`NW_BITS-1+1) - ) store_wn ( - .clk (clk), - .reset (reset), - .stall (1'b0), - .flush (1'b0), - .in (gpr_read_if.warp_num), - .out (old_warp_num) - ); - - assign gpr_read_if.a_reg_data = gpr_jal_if.is_jal ? jal_data : tmp_a_reg_data[old_warp_num]; - assign gpr_read_if.b_reg_data = tmp_b_reg_data[old_warp_num]; - - `endif - - generate - for (i = 0; i < `NUM_WARPS; i++) begin : warp_gprs - wire write_ce = (i == writeback_if.warp_num); - VX_gpr_ram gpr_ram( - .clk (clk), - .reset (reset), - .write_ce (write_ce), - .gpr_read_if (gpr_read_if), - .writeback_if (writeback_if), - .a_reg_data (tmp_a_reg_data[i]), - .b_reg_data (tmp_b_reg_data[i]) - ); - end - endgenerate - -endmodule - - diff --git a/hw/rtl/VX_gpu_inst.v b/hw/rtl/VX_gpu_inst.v deleted file mode 100644 index 5dbf3e57..00000000 --- a/hw/rtl/VX_gpu_inst.v +++ /dev/null @@ -1,88 +0,0 @@ -`include "VX_define.vh" - -module VX_gpu_inst ( - // Input - VX_gpu_inst_req_if gpu_inst_req_if, - - // Output - VX_warp_ctl_if warp_ctl_if -); - wire[`NUM_THREADS-1:0] curr_valids = gpu_inst_req_if.valid; - wire is_split = gpu_inst_req_if.is_split; - - wire[`NUM_THREADS-1:0] tmc_new_mask; - wire all_threads = `NUM_THREADS < gpu_inst_req_if.a_reg_data[0]; - - genvar i; - generate - for (i = 0; i < `NUM_THREADS; i++) begin : tmc_new_mask_init - assign tmc_new_mask[i] = all_threads ? 
1 : i < gpu_inst_req_if.a_reg_data[0]; - end - endgenerate - - wire valid_inst = (| curr_valids); - - assign warp_ctl_if.warp_num = gpu_inst_req_if.warp_num; - assign warp_ctl_if.change_mask = gpu_inst_req_if.is_tmc && valid_inst; - assign warp_ctl_if.thread_mask = gpu_inst_req_if.is_tmc ? tmc_new_mask : 0; - - assign warp_ctl_if.whalt = warp_ctl_if.change_mask && (0 == warp_ctl_if.thread_mask); - - wire wspawn = gpu_inst_req_if.is_wspawn && valid_inst; - wire[31:0] wspawn_pc = gpu_inst_req_if.rd2; - wire all_active = `NUM_WARPS < gpu_inst_req_if.a_reg_data[0]; - wire[`NUM_WARPS-1:0] wspawn_new_active; - - generate - for (i = 0; i < `NUM_WARPS; i++) begin : wspawn_new_active_init - assign wspawn_new_active[i] = all_active ? 1 : i < gpu_inst_req_if.a_reg_data[0]; - end - endgenerate - - assign warp_ctl_if.is_barrier = gpu_inst_req_if.is_barrier && valid_inst; - assign warp_ctl_if.barrier_id = gpu_inst_req_if.a_reg_data[0]; - -`DEBUG_BEGIN - wire[31:0] num_warps_m1 = gpu_inst_req_if.rd2 - 1; -`DEBUG_END - - assign warp_ctl_if.num_warps = num_warps_m1[$clog2(`NUM_WARPS):0]; - - assign warp_ctl_if.wspawn = wspawn; - assign warp_ctl_if.wspawn_pc = wspawn_pc; - assign warp_ctl_if.wspawn_new_active = wspawn_new_active; - - wire[`NUM_THREADS-1:0] split_new_use_mask; - wire[`NUM_THREADS-1:0] split_new_later_mask; - - generate - for (i = 0; i < `NUM_THREADS; i++) begin : masks_init - wire curr_bool = (gpu_inst_req_if.a_reg_data[i] == 32'b1); - assign split_new_use_mask[i] = curr_valids[i] & (curr_bool); - assign split_new_later_mask[i] = curr_valids[i] & (!curr_bool); - end - endgenerate - - wire[$clog2(`NUM_THREADS):0] num_valids; - - VX_countones #( - .N(`NUM_THREADS) - ) valids_counter ( - .valids(curr_valids), - .count (num_valids) - ); - - // wire[`NW_BITS-1:0] num_valids = $countones(curr_valids); - - assign warp_ctl_if.is_split = is_split && (num_valids > 1); - assign warp_ctl_if.dont_split = warp_ctl_if.is_split && ((split_new_use_mask == 0) || (split_new_use_mask == 
{`NUM_THREADS{1'b1}})); - assign warp_ctl_if.split_new_mask = split_new_use_mask; - assign warp_ctl_if.split_later_mask = split_new_later_mask; - assign warp_ctl_if.split_save_pc = gpu_inst_req_if.next_PC; - assign warp_ctl_if.split_warp_num = gpu_inst_req_if.warp_num; - - // gpu_inst_req_if.is_wspawn - // gpu_inst_req_if.is_split - // gpu_inst_req_if.is_barrier - -endmodule \ No newline at end of file
diff --git a/hw/rtl/VX_gpu_unit.v b/hw/rtl/VX_gpu_unit.v
new file mode 100644
index 00000000..292b1f23
--- /dev/null
+++ b/hw/rtl/VX_gpu_unit.v
@@ -0,0 +1,94 @@
+`include "VX_define.vh"
+
+// Warp-control unit for the GPU extension opcodes (WSPAWN, TMC, SPLIT, BAR).
+// Decodes gpu_req_if.gpu_op and drives warp_ctl_if combinationally; it has
+// no internal state and always accepts a request.
+module VX_gpu_unit #(
+    parameter CORE_ID = 0
+) (
+    // Input
+    VX_gpu_req_if   gpu_req_if,
+
+    // Output
+    VX_warp_ctl_if  warp_ctl_if
+);
+    wire [`NUM_THREADS-1:0] curr_valids = gpu_req_if.valid;
+    wire is_wspawn = (gpu_req_if.gpu_op == `GPU_WSPAWN);
+    wire is_tmc    = (gpu_req_if.gpu_op == `GPU_TMC);
+    wire is_split  = (gpu_req_if.gpu_op == `GPU_SPLIT);
+    wire is_bar    = (gpu_req_if.gpu_op == `GPU_BAR);
+
+    // TMC: thread i stays enabled when i < rs1_data[0]; a count above
+    // NUM_THREADS enables every thread
+    wire [`NUM_THREADS-1:0] tmc_new_mask;
+    wire all_threads = `NUM_THREADS < gpu_req_if.rs1_data[0];
+
+    genvar i;
+    for (i = 0; i < `NUM_THREADS; i++) begin : tmc_new_mask_init
+        assign tmc_new_mask[i] = all_threads ? 1 : i < gpu_req_if.rs1_data[0];
+    end
+
+    // the request carries an instruction when at least one thread is valid
+    wire valid_inst = (| curr_valids);
+
+    assign warp_ctl_if.warp_num    = gpu_req_if.warp_num;
+
+    assign warp_ctl_if.change_mask = is_tmc && valid_inst;
+    assign warp_ctl_if.thread_mask = is_tmc ? tmc_new_mask : 0;
+
+    // a TMC that leaves no thread enabled halts the warp
+    assign warp_ctl_if.whalt = warp_ctl_if.change_mask && (0 == warp_ctl_if.thread_mask);
+
+    // WSPAWN: warp i is activated when i < rs1_data[0] (all warps when the
+    // count exceeds NUM_WARPS); the spawn PC comes from rs2_data
+    wire wspawn = is_wspawn && valid_inst;
+    wire [31:0] wspawn_pc = gpu_req_if.rs2_data;
+    wire all_active = `NUM_WARPS < gpu_req_if.rs1_data[0];
+    wire [`NUM_WARPS-1:0] wspawn_new_active;
+
+    for (i = 0; i < `NUM_WARPS; i++) begin : wspawn_new_active_init
+        assign wspawn_new_active[i] = all_active ? 1 : i < gpu_req_if.rs1_data[0];
+    end
+
+    // BAR: the barrier id is taken from the low bits of thread 0's rs1
+    assign warp_ctl_if.is_barrier = is_bar && valid_inst;
+    assign warp_ctl_if.barrier_id = gpu_req_if.rs1_data[0][`NB_BITS-1:0];
+
+    assign warp_ctl_if.num_warps  = (`NW_BITS+1)'(gpu_req_if.rs2_data - 1);
+
+    assign warp_ctl_if.wspawn            = wspawn;
+    assign warp_ctl_if.wspawn_pc         = wspawn_pc;
+    assign warp_ctl_if.wspawn_new_active = wspawn_new_active;
+
+    // SPLIT: partition the valid threads by their predicate (rs1 == 1)
+    wire [`NUM_THREADS-1:0] split_new_use_mask;
+    wire [`NUM_THREADS-1:0] split_new_later_mask;
+
+    for (i = 0; i < `NUM_THREADS; i++) begin : masks_init
+        wire curr_bool = (gpu_req_if.rs1_data[i] == 32'b1);
+        assign split_new_use_mask[i]   = curr_valids[i] & (curr_bool);
+        assign split_new_later_mask[i] = curr_valids[i] & (!curr_bool);
+    end
+
+    wire [`NT_BITS:0] num_valids;
+
+    VX_countones #(
+        .N(`NUM_THREADS)
+    ) valids_counter (
+        .valids(curr_valids),
+        .count (num_valids)
+    );
+
+    assign warp_ctl_if.is_split = is_split && (num_valids > 1);
+
+    // Diverge only when both partitions are non-empty. Testing the taken mask
+    // against all-ones reported a split when a subset of threads was valid and
+    // all of them took the branch, leaving an empty "later" mask.
+    assign warp_ctl_if.do_split = (split_new_use_mask != 0) && (split_new_later_mask != 0);
+    assign warp_ctl_if.split_new_mask   = split_new_use_mask;
+    assign warp_ctl_if.split_later_mask = split_new_later_mask;
+    assign warp_ctl_if.split_save_pc    = gpu_req_if.next_PC;
+
+    assign gpu_req_if.ready = 1'b1; // has no stalls
+
+endmodule
\ No newline at end of file
diff --git a/hw/rtl/VX_icache_stage.v b/hw/rtl/VX_icache_stage.v index 5232f943..37eef92c 100644 --- a/hw/rtl/VX_icache_stage.v +++ b/hw/rtl/VX_icache_stage.v @@ -7,20 +7,21 @@ module VX_icache_stage #( input wire clk, input wire reset, - input wire total_freeze, - output wire icache_stage_delay, - output wire[`NW_BITS-1:0] icache_stage_wid, - output wire icache_stage_response, - VX_inst_meta_if fe_inst_meta_fi, - VX_inst_meta_if fe_inst_meta_id, + // Icache interface VX_cache_core_req_if icache_req_if, - VX_cache_core_rsp_if icache_rsp_if + VX_cache_core_rsp_if icache_rsp_if, + + // request + VX_ifetch_req_if ifetch_req_if, + + // reponse + VX_ifetch_rsp_if ifetch_rsp_if );
reg [`NUM_THREADS-1:0] valid_threads [`NUM_WARPS-1:0]; - wire valid_inst = (| fe_inst_meta_fi.valid); + wire valid_inst = (| ifetch_req_if.valid); wire [`LOG2UP(`ICREQ_SIZE)-1:0] mrq_write_addr, mrq_read_addr, dbg_mrq_write_addr; wire mrq_full; @@ -30,25 +31,25 @@ module VX_icache_stage #( assign mrq_read_addr = icache_rsp_if.tag[0][`LOG2UP(`ICREQ_SIZE)-1:0]; - VX_indexable_queue #( + VX_index_queue #( .DATAW (`LOG2UP(`ICREQ_SIZE) + 32 + `NW_BITS), .SIZE (`ICREQ_SIZE) ) mem_req_queue ( .clk (clk), .reset (reset), - .write_data ({mrq_write_addr, fe_inst_meta_fi.curr_PC, fe_inst_meta_fi.warp_num}), + .write_data ({mrq_write_addr, ifetch_req_if.curr_PC, ifetch_req_if.warp_num}), .write_addr (mrq_write_addr), .push (mrq_push), .full (mrq_full), .pop (mrq_pop), .read_addr (mrq_read_addr), - .read_data ({dbg_mrq_write_addr, fe_inst_meta_id.curr_PC, fe_inst_meta_id.warp_num}), + .read_data ({dbg_mrq_write_addr, ifetch_rsp_if.curr_PC, ifetch_rsp_if.warp_num}), `UNUSED_PIN (empty) ); always @(posedge clk) begin if (mrq_push) begin - valid_threads[fe_inst_meta_fi.warp_num] <= fe_inst_meta_fi.valid; + valid_threads[ifetch_req_if.warp_num] <= ifetch_req_if.valid; end if (mrq_pop) begin assert(mrq_read_addr == dbg_mrq_write_addr); @@ -59,29 +60,26 @@ module VX_icache_stage #( assign icache_req_if.valid = valid_inst && !mrq_full; assign icache_req_if.rw = 0; assign icache_req_if.byteen = 4'b1111; - assign icache_req_if.addr = fe_inst_meta_fi.curr_PC[31:2]; + assign icache_req_if.addr = ifetch_req_if.curr_PC[31:2]; assign icache_req_if.data = 0; // Can't accept new request - assign icache_stage_delay = mrq_full || !icache_req_if.ready; + assign ifetch_req_if.ready = !mrq_full && icache_req_if.ready; `ifdef DBG_CORE_REQ_INFO - assign icache_req_if.tag = {fe_inst_meta_fi.curr_PC, 2'b1, 5'b0, fe_inst_meta_fi.warp_num, mrq_write_addr}; + assign icache_req_if.tag = {ifetch_req_if.curr_PC, 2'b1, 5'b0, ifetch_req_if.warp_num, mrq_write_addr}; `else assign icache_req_if.tag = 
mrq_write_addr; `endif - assign fe_inst_meta_id.instruction = icache_rsp_if.valid ? icache_rsp_if.data[0] : 0; - assign fe_inst_meta_id.valid = icache_rsp_if.valid ? valid_threads[fe_inst_meta_id.warp_num] : 0; - - assign icache_stage_response = mrq_pop; - assign icache_stage_wid = fe_inst_meta_id.warp_num; + assign ifetch_rsp_if.valid = icache_rsp_if.valid ? valid_threads[ifetch_rsp_if.warp_num] : 0; + assign ifetch_rsp_if.instr = icache_rsp_if.data[0]; // Can't accept new response - assign icache_rsp_if.ready = !total_freeze; + assign icache_rsp_if.ready = ifetch_rsp_if.ready; `SCOPE_ASSIGN(scope_icache_req_valid, icache_req_if.valid); - `SCOPE_ASSIGN(scope_icache_req_warp_num, fe_inst_meta_fi.warp_num); + `SCOPE_ASSIGN(scope_icache_req_warp_num, ifetch_req_if.warp_num); `SCOPE_ASSIGN(scope_icache_req_addr, {icache_req_if.addr, 2'b0}); `SCOPE_ASSIGN(scope_icache_req_tag, icache_req_if.tag); `SCOPE_ASSIGN(scope_icache_req_ready, icache_req_if.ready); @@ -94,10 +92,10 @@ module VX_icache_stage #( `ifdef DBG_PRINT_CORE_ICACHE always @(posedge clk) begin if (icache_req_if.valid && icache_req_if.ready) begin - $display("%t: I%0d$ req: tag=%0h, pc=%0h, warp=%0d", $time, CORE_ID, mrq_write_addr, fe_inst_meta_fi.curr_PC, fe_inst_meta_fi.warp_num); + $display("%t: I$%0d req: tag=%0h, PC=%0h, warp=%0d", $time, CORE_ID, mrq_write_addr, ifetch_req_if.curr_PC, ifetch_req_if.warp_num); end if (icache_rsp_if.valid && icache_rsp_if.ready) begin - $display("%t: I%0d$ rsp: tag=%0h, pc=%0h, warp=%0d, instr=%0h", $time, CORE_ID, mrq_read_addr, fe_inst_meta_id.curr_PC, fe_inst_meta_id.warp_num, fe_inst_meta_id.instruction); + $display("%t: I$%0d rsp: tag=%0h, PC=%0h, warp=%0d, instr=%0h", $time, CORE_ID, mrq_read_addr, ifetch_rsp_if.curr_PC, ifetch_rsp_if.warp_num, ifetch_rsp_if.instr); end end `endif diff --git a/hw/rtl/VX_inst_multiplex.v b/hw/rtl/VX_inst_multiplex.v deleted file mode 100644 index 73225085..00000000 --- a/hw/rtl/VX_inst_multiplex.v +++ /dev/null @@ -1,90 +0,0 @@ 
-`include "VX_define.vh" - -module VX_inst_multiplex ( - // Inputs - VX_backend_req_if bckE_req_if, - VX_gpr_read_if gpr_read_if, - - // Outputs - VX_exec_unit_req_if exec_unit_req_if, - VX_lsu_req_if lsu_req_if, - VX_gpu_inst_req_if gpu_inst_req_if, - VX_csr_req_if csr_req_if -); - - wire[`NUM_THREADS-1:0] is_mem_mask; - wire[`NUM_THREADS-1:0] is_gpu_mask; - wire[`NUM_THREADS-1:0] is_csr_mask; - - wire is_mem = (bckE_req_if.mem_write != `BYTE_EN_NO) || (bckE_req_if.mem_read != `BYTE_EN_NO); - wire is_gpu = (bckE_req_if.is_wspawn || bckE_req_if.is_tmc || bckE_req_if.is_barrier || bckE_req_if.is_split); - wire is_csr = bckE_req_if.is_csr; - // wire is_gpu = 0; - - genvar i; - generate - for (i = 0; i < `NUM_THREADS; i++) begin : mask_init - assign is_mem_mask[i] = is_mem; - assign is_gpu_mask[i] = is_gpu; - assign is_csr_mask[i] = is_csr; - end - endgenerate - - // LSU Unit - assign lsu_req_if.valid = bckE_req_if.valid & is_mem_mask; - assign lsu_req_if.warp_num = bckE_req_if.warp_num; - assign lsu_req_if.base_addr = gpr_read_if.a_reg_data; - assign lsu_req_if.store_data = gpr_read_if.b_reg_data; - - assign lsu_req_if.offset = bckE_req_if.itype_immed; - - assign lsu_req_if.mem_read = bckE_req_if.mem_read; - assign lsu_req_if.mem_write = bckE_req_if.mem_write; - assign lsu_req_if.rd = bckE_req_if.rd; - assign lsu_req_if.wb = bckE_req_if.wb; - assign lsu_req_if.curr_PC = bckE_req_if.curr_PC; - - // Execute Unit - assign exec_unit_req_if.valid = bckE_req_if.valid & (~is_mem_mask & ~is_gpu_mask & ~is_csr_mask); - assign exec_unit_req_if.warp_num = bckE_req_if.warp_num; - assign exec_unit_req_if.curr_PC = bckE_req_if.curr_PC; - assign exec_unit_req_if.next_PC = bckE_req_if.next_PC; - assign exec_unit_req_if.rd = bckE_req_if.rd; - assign exec_unit_req_if.wb = bckE_req_if.wb; - assign exec_unit_req_if.a_reg_data = gpr_read_if.a_reg_data; - assign exec_unit_req_if.b_reg_data = gpr_read_if.b_reg_data; - assign exec_unit_req_if.alu_op = bckE_req_if.alu_op; - assign 
exec_unit_req_if.rs1 = bckE_req_if.rs1; - assign exec_unit_req_if.rs2 = bckE_req_if.rs2; - assign exec_unit_req_if.rs2_src = bckE_req_if.rs2_src; - assign exec_unit_req_if.itype_immed = bckE_req_if.itype_immed; - assign exec_unit_req_if.upper_immed = bckE_req_if.upper_immed; - assign exec_unit_req_if.branch_type = bckE_req_if.branch_type; - assign exec_unit_req_if.is_jal = bckE_req_if.is_jal; - assign exec_unit_req_if.jal = bckE_req_if.jal; - assign exec_unit_req_if.jal_offset = bckE_req_if.jal_offset; - assign exec_unit_req_if.is_etype = bckE_req_if.is_etype; - - // GPR Req - assign gpu_inst_req_if.valid = bckE_req_if.valid & is_gpu_mask; - assign gpu_inst_req_if.warp_num = bckE_req_if.warp_num; - assign gpu_inst_req_if.is_wspawn = bckE_req_if.is_wspawn; - assign gpu_inst_req_if.is_tmc = bckE_req_if.is_tmc; - assign gpu_inst_req_if.is_split = bckE_req_if.is_split; - assign gpu_inst_req_if.is_barrier = bckE_req_if.is_barrier; - assign gpu_inst_req_if.a_reg_data = gpr_read_if.a_reg_data; - assign gpu_inst_req_if.rd2 = gpr_read_if.b_reg_data[0]; - assign gpu_inst_req_if.next_PC = bckE_req_if.next_PC; - - // CSR Req - assign csr_req_if.valid = bckE_req_if.valid & is_csr_mask; - assign csr_req_if.warp_num = bckE_req_if.warp_num; - assign csr_req_if.rd = bckE_req_if.rd; - assign csr_req_if.wb = bckE_req_if.wb; - assign csr_req_if.alu_op = bckE_req_if.alu_op; - assign csr_req_if.is_csr = bckE_req_if.is_csr; - assign csr_req_if.csr_addr = bckE_req_if.csr_addr; - assign csr_req_if.csr_immed = bckE_req_if.csr_immed; - assign csr_req_if.csr_mask = bckE_req_if.csr_mask; - -endmodule \ No newline at end of file
diff --git a/hw/rtl/VX_issue.v b/hw/rtl/VX_issue.v
new file mode 100644
index 00000000..145ac917
--- /dev/null
+++ b/hw/rtl/VX_issue.v
@@ -0,0 +1,103 @@
+`include "VX_define.vh"
+
+// Issue stage: tracks destination registers with a write in flight (per warp,
+// per thread), stalls decode on a register hazard or when the targeted execute
+// unit is not ready, and forwards the decoded instruction to execute_if.
+module VX_issue #(
+    parameter CORE_ID = 0
+) (
+    input wire      clk,
+    input wire      reset,
+
+    VX_decode_if    decode_if,
+    VX_wb_if        writeback_if,
+
+    VX_execute_if   execute_if,
+
+    output wire     is_empty
+);
+    // wide enough to count all 32 registers of every warp
+    localparam CTVW = `CLOG2(`NUM_WARPS * 32 + 1);
+
+    // per warp, per register: mask of threads whose write is still pending
+    reg [31:0][`NUM_THREADS-1:0] rename_table[`NUM_WARPS-1:0];
+    reg [CTVW-1:0] count_valid; // number of registers with a pending write
+
+    wire rs1_rename = (rename_table[decode_if.warp_num][decode_if.rs1] != 0);
+    wire rs2_rename = (rename_table[decode_if.warp_num][decode_if.rs2] != 0);
+    wire rd_rename  = (rename_table[decode_if.warp_num][decode_if.rd ] != 0);
+
+    wire rs1_rename_qual = (rs1_rename) && (decode_if.use_rs1);
+    wire rs2_rename_qual = (rs2_rename) && (decode_if.use_rs2);
+    wire rd_rename_qual  = (rd_rename)  && (decode_if.wb != 0);
+
+    // register hazard: a used source or the destination is still pending
+    wire rename_valid = (| decode_if.valid) && (rs1_rename_qual || rs2_rename_qual || rd_rename_qual);
+
+    // structural hazard: the execute unit selected by ex_type is not ready
+    wire ex_stalled = (| decode_if.valid)
+                   && ((!execute_if.alu_ready && (decode_if.ex_type == `EX_ALU))
+                    || (!execute_if.br_ready  && (decode_if.ex_type == `EX_BR))
+                    || (!execute_if.lsu_ready && (decode_if.ex_type == `EX_LSU))
+                    || (!execute_if.csr_ready && (decode_if.ex_type == `EX_CSR))
+                    || (!execute_if.mul_ready && (decode_if.ex_type == `EX_MUL))
+                    || (!execute_if.gpu_ready && (decode_if.ex_type == `EX_GPU)));
+
+    wire stall = rename_valid || ex_stalled;
+
+    wire acquire_rd = (| decode_if.valid) && (decode_if.wb != 0) && (decode_if.rd != 0) && ~stall;
+
+    wire release_rd = (| writeback_if.valid) && (writeback_if.wb != 0) && (writeback_if.rd != 0);
+
+    // pending mask of the written-back register once this writeback is applied
+    wire [`NUM_THREADS-1:0] valid_wb_new_mask = rename_table[writeback_if.warp_num][writeback_if.rd] & ~writeback_if.valid;
+
+    // the writeback clears the last pending thread of its register
+    wire release_last = release_rd && (0 == valid_wb_new_mask);
+
+    // Must be a net with a continuous assignment: a 'reg' declared with an
+    // initializer is only evaluated once at time 0 and would never update.
+    wire [CTVW-1:0] count_valid_next = (acquire_rd && !release_last) ? (count_valid + 1) :
+                                       (~acquire_rd && release_last) ? (count_valid - 1) :
+                                                                       count_valid;
+
+    integer i, w;
+
+    always @(posedge clk) begin
+        if (reset) begin
+            for (w = 0; w < `NUM_WARPS; w++) begin
+                for (i = 0; i < 32; i++) begin
+                    rename_table[w][i] <= 0;
+                end
+            end
+            count_valid <= 0;
+        end else begin
+            if (acquire_rd) begin
+                rename_table[decode_if.warp_num][decode_if.rd] <= decode_if.valid;
+            end
+            if (release_rd) begin
+                assert(rename_table[writeback_if.warp_num][writeback_if.rd] != 0);
+                rename_table[writeback_if.warp_num][writeback_if.rd] <= valid_wb_new_mask;
+            end
+            count_valid <= count_valid_next;
+        end
+    end
+
+    // NOTE(review): if VX_generic_register holds its output while .stall is high,
+    // execute_if keeps presenting the previously issued instruction - confirm.
+    VX_generic_register #(
+        .N(`NUM_THREADS + `NW_BITS + 32 + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + 1 + `EX_BITS + `OP_BITS + `WB_BITS)
+    ) schedule_reg (
+        .clk   (clk),
+        .reset (reset),
+        .stall (stall),
+        .flush (0),
+        .in    ({decode_if.valid,  decode_if.warp_num,  decode_if.curr_PC,  decode_if.next_PC,  decode_if.rd,  decode_if.rs1,  decode_if.rs2,  decode_if.imm,  decode_if.rs1_is_PC,  decode_if.rs2_is_imm,  decode_if.ex_type,  decode_if.instr_op,  decode_if.wb}),
+        .out   ({execute_if.valid, execute_if.warp_num, execute_if.curr_PC, execute_if.next_PC, execute_if.rd, execute_if.rs1, execute_if.rs2, execute_if.imm, execute_if.rs1_is_PC, execute_if.rs2_is_imm, execute_if.ex_type, execute_if.instr_op, execute_if.wb})
+    );
+
+    assign decode_if.ready = ~stall;
+
+    assign is_empty = (0 == count_valid);
+
+endmodule
\ No newline at end of file
diff --git a/hw/rtl/VX_lsu_unit.v b/hw/rtl/VX_lsu_unit.v index d1296680..09add222 100644 --- a/hw/rtl/VX_lsu_unit.v +++ b/hw/rtl/VX_lsu_unit.v @@ -5,23 +5,19 @@ module VX_lsu_unit #( ) ( `SCOPE_SIGNALS_LSU_IO - input wire clk, - input wire reset, - - input wire no_slot_mem, - VX_lsu_req_if lsu_req_if, - - // Write back to GPR - VX_wb_if mem_wb_if, + input wire clk, + input wire reset, // Dcache interface VX_cache_core_req_if dcache_req_if, VX_cache_core_rsp_if dcache_rsp_if, - output wire delay -); + // inputs + VX_lsu_req_if lsu_req_if, + +
// outputs + VX_wb_if lsu_wb_if +); wire [`NUM_THREADS-1:0] use_valid; wire use_req_rw; @@ -29,28 +25,25 @@ module VX_lsu_unit #( wire [`NUM_THREADS-1:0][1:0] use_req_offset; wire [`NUM_THREADS-1:0][3:0] use_req_byteen; wire [`NUM_THREADS-1:0][31:0] use_req_data; - wire [`BYTE_EN_BITS-1:0] use_mem_read; - wire [4:0] use_rd; + wire [`BYTEEN_BITS-1:0] mem_byteen; + wire [`NR_BITS-1:0] use_rd; wire [`NW_BITS-1:0] use_warp_num; - wire [1:0] use_wb; + wire [`WB_BITS-1:0] use_wb; wire [31:0] use_pc; genvar i; - // Generate Full Addresses - wire[`NUM_THREADS-1:0][31:0] full_address; + wire [`NUM_THREADS-1:0][31:0] full_address; for (i = 0; i < `NUM_THREADS; i++) begin assign full_address[i] = lsu_req_if.base_addr[i] + lsu_req_if.offset; end - wire mem_req_rw = (lsu_req_if.mem_write != `BYTE_EN_NO); - reg [3:0] wmask; always @(*) begin - case ((mem_req_rw ? lsu_req_if.mem_write[1:0] : lsu_req_if.mem_read[1:0])) - 0: wmask = 4'b0001; - 1: wmask = 4'b0011; - default : wmask = 4'b1111; + case (lsu_req_if.byteen) + 0: wmask = 4'b0001; + 1: wmask = 4'b0011; + default: wmask = 4'b1111; endcase end @@ -64,29 +57,32 @@ module VX_lsu_unit #( assign mem_req_offset[i] = full_address[i][1:0]; assign mem_req_byteen[i] = wmask << full_address[i][1:0]; assign mem_req_data[i] = lsu_req_if.store_data[i] << {mem_req_offset[i], 3'b0}; - end + end + + // Can accept new request + wire stall = ~dcache_req_if.ready || mrq_full; + assign lsu_req_if.ready = ~stall; `IGNORE_WARNINGS_BEGIN - wire[`NUM_THREADS-1:0][31:0] use_address; + wire [`NUM_THREADS-1:0][31:0] use_address; `IGNORE_WARNINGS_END VX_generic_register #( - .N((`NUM_THREADS * 1) + (`NUM_THREADS * 32) + `BYTE_EN_BITS + 1 + (`NUM_THREADS * (30 + 2 + 4 + 32)) + 5 + `NW_BITS + 2 + 32) - ) lsu_buffer ( + .N(`NUM_THREADS + (`NUM_THREADS * 32) + `BYTEEN_BITS + 1 + (`NUM_THREADS * (30 + 2 + 4 + 32)) + `NR_BITS + `NW_BITS + `WB_BITS + 32) + ) mem_req_reg ( .clk (clk), .reset (reset), - .stall (delay), - .flush (1'b0), - .in ({lsu_req_if.valid, 
full_address, lsu_req_if.mem_read, mem_req_rw, mem_req_addr, mem_req_offset, mem_req_byteen, mem_req_data, lsu_req_if.rd, lsu_req_if.warp_num, lsu_req_if.wb, lsu_req_if.curr_PC}), - .out ({use_valid , use_address, use_mem_read , use_req_rw, use_req_addr, use_req_offset, use_req_byteen, use_req_data, use_rd , use_warp_num , use_wb , use_pc}) + .stall (stall), + .flush (0), + .in ({lsu_req_if.valid, full_address, lsu_req_if.byteen, lsu_req_if.rw, mem_req_addr, mem_req_offset, mem_req_byteen, mem_req_data, lsu_req_if.rd, lsu_req_if.warp_num, lsu_req_if.wb, lsu_req_if.curr_PC}), + .out ({use_valid , use_address, mem_byteen , use_req_rw, use_req_addr, use_req_offset, use_req_byteen, use_req_data, use_rd , use_warp_num , use_wb , use_pc}) ); - wire [`NUM_THREADS-1:0][1:0] mem_rsp_offset; - wire [`BYTE_EN_BITS-1:0] core_rsp_mem_read; - reg [`NUM_THREADS-1:0] mem_rsp_mask[`DCREQ_SIZE-1:0]; - wire [`LOG2UP(`DCREQ_SIZE)-1:0] mrq_write_addr, mrq_read_addr, dbg_mrq_write_addr; + wire [`LOG2UP(`DCREQ_SIZE)-1:0] mrq_write_addr, dbg_mrq_write_addr; + wire [`NUM_THREADS-1:0][1:0] mem_rsp_offset; + wire [`BYTEEN_BITS-1:0] core_rsp_mem_read; wire mrq_full; wire mrq_push = (| dcache_req_if.valid) && dcache_req_if.ready @@ -94,25 +90,25 @@ module VX_lsu_unit #( wire mrq_pop_part = (| dcache_rsp_if.valid) && dcache_rsp_if.ready; - assign mrq_read_addr = dcache_rsp_if.tag[0][`LOG2UP(`DCREQ_SIZE)-1:0]; + wire [`LOG2UP(`DCREQ_SIZE)-1:0] mrq_read_addr = dcache_rsp_if.tag[0][`LOG2UP(`DCREQ_SIZE)-1:0]; wire [`NUM_THREADS-1:0] mem_rsp_mask_upd = mem_rsp_mask[mrq_read_addr] & ~dcache_rsp_if.valid; wire mrq_pop = mrq_pop_part && (0 == mem_rsp_mask_upd); - VX_indexable_queue #( - .DATAW (`LOG2UP(`DCREQ_SIZE) + 32 + 2 + (`NUM_THREADS * 2) + `BYTE_EN_BITS + 5 + `NW_BITS), + VX_index_queue #( + .DATAW (`LOG2UP(`DCREQ_SIZE) + 32 + `WB_BITS + (`NUM_THREADS * 2) + `BYTEEN_BITS + `NR_BITS + `NW_BITS), .SIZE (`DCREQ_SIZE) ) mem_req_queue ( .clk (clk), .reset (reset), - .write_data ({mrq_write_addr, 
use_pc, use_wb, use_req_offset, use_mem_read, use_rd, use_warp_num}), + .write_data ({mrq_write_addr, use_pc, use_wb, use_req_offset, mem_byteen, use_rd, use_warp_num}), .write_addr (mrq_write_addr), .push (mrq_push), .full (mrq_full), .pop (mrq_pop), .read_addr (mrq_read_addr), - .read_data ({dbg_mrq_write_addr, mem_wb_unqual_if.curr_PC, mem_wb_unqual_if.wb, mem_rsp_offset, core_rsp_mem_read, mem_wb_unqual_if.rd, mem_wb_unqual_if.warp_num}), + .read_data ({dbg_mrq_write_addr, lsu_wb_if.curr_PC, lsu_wb_if.wb, mem_rsp_offset, core_rsp_mem_read, lsu_wb_if.rd, lsu_wb_if.warp_num}), `UNUSED_PIN (empty) ); @@ -127,7 +123,6 @@ module VX_lsu_unit #( end // Core Request - assign dcache_req_if.valid = use_valid & {`NUM_THREADS{~mrq_full}}; assign dcache_req_if.rw = {`NUM_THREADS{use_req_rw}}; assign dcache_req_if.byteen = use_req_byteen; @@ -140,43 +135,27 @@ module VX_lsu_unit #( assign dcache_req_if.tag = mrq_write_addr; `endif - // Can't accept new request - assign delay = mrq_full || !dcache_req_if.ready; - // Core Response - - reg [`NUM_THREADS-1:0][31:0] core_rsp_data; - wire [`NUM_THREADS-1:0][31:0] rsp_data_shifted; + reg [`NUM_THREADS-1:0][31:0] core_rsp_data; for (i = 0; i < `NUM_THREADS; i++) begin - assign rsp_data_shifted[i] = dcache_rsp_if.data[i] >> {mem_rsp_offset[i], 3'b0}; + wire [15:0] rsp_data_shifted = 16'(dcache_rsp_if.data[i] >> {mem_rsp_offset[i], 3'b0}); always @(*) begin case (core_rsp_mem_read) - `BYTE_EN_SB: core_rsp_data[i] = {{24{rsp_data_shifted[i][7]}}, rsp_data_shifted[i][7:0]}; - `BYTE_EN_SH: core_rsp_data[i] = {{16{rsp_data_shifted[i][15]}}, rsp_data_shifted[i][15:0]}; - `BYTE_EN_UB: core_rsp_data[i] = 32'(rsp_data_shifted[i][7:0]); - `BYTE_EN_UH: core_rsp_data[i] = 32'(rsp_data_shifted[i][15:0]); - default : core_rsp_data[i] = rsp_data_shifted[i]; + `BYTEEN_SB: core_rsp_data[i] = {{24{rsp_data_shifted[7]}}, rsp_data_shifted[7:0]}; + `BYTEEN_UB: core_rsp_data[i] = 32'(rsp_data_shifted[7:0]); + `BYTEEN_SH: core_rsp_data[i] = 
{{16{rsp_data_shifted[15]}}, rsp_data_shifted[15:0]}; + `BYTEEN_UH: core_rsp_data[i] = 32'(rsp_data_shifted[15:0]); + default: core_rsp_data[i] = dcache_rsp_if.data[i]; endcase end end - assign mem_wb_unqual_if.valid = dcache_rsp_if.valid; - assign mem_wb_unqual_if.data = core_rsp_data; + assign lsu_wb_if.valid = dcache_rsp_if.valid; + assign lsu_wb_if.data = core_rsp_data; - // Can't accept new response - assign dcache_rsp_if.ready = !(no_slot_mem & (|mem_wb_if.valid)); - - // From LSU to WB - localparam WB_REQ_SIZE = (`NUM_THREADS) + (`NUM_THREADS * 32) + (`NW_BITS) + (5) + (2) + 32; - VX_generic_register #(.N(WB_REQ_SIZE)) lsu_to_wb ( - .clk (clk), - .reset (reset), - .stall (no_slot_mem), - .flush (1'b0), - .in ({mem_wb_unqual_if.valid, mem_wb_unqual_if.data, mem_wb_unqual_if.warp_num, mem_wb_unqual_if.rd, mem_wb_unqual_if.wb, mem_wb_unqual_if.curr_PC}), - .out ({mem_wb_if.valid, mem_wb_if.data, mem_wb_if.warp_num, mem_wb_if.rd, mem_wb_if.wb, mem_wb_if.curr_PC}) - ); + // Can accept new cache response + assign dcache_rsp_if.ready = lsu_wb_if.ready; `SCOPE_ASSIGN(scope_dcache_req_valid, dcache_req_if.valid); `SCOPE_ASSIGN(scope_dcache_req_warp_num, use_warp_num); @@ -196,12 +175,12 @@ module VX_lsu_unit #( `ifdef DBG_PRINT_CORE_DCACHE always @(posedge clk) begin if ((| dcache_req_if.valid) && dcache_req_if.ready) begin - $display("%t: D%0d$ req: valid=%b, addr=%0h, tag=%0h, rw=%0b, pc=%0h, rd=%0d, warp=%0d, byteen=%0h, data=%0h", - $time, CORE_ID, use_valid, use_address, mrq_write_addr, use_req_rw, use_pc, use_rd, use_warp_num, use_req_byteen, use_req_data); + $display("%t: D$%0d req: valid=%b, warp=%0d, PC=%0h, addr=%0h, tag=%0h, rw=%0b, rd=%0d, byteen=%0h, data=%0h", + $time, CORE_ID, use_valid, use_warp_num, use_pc, use_address, mrq_write_addr, use_req_rw, use_rd, use_req_byteen, use_req_data); end if ((| dcache_rsp_if.valid) && dcache_rsp_if.ready) begin - $display("%t: D%0d$ rsp: valid=%b, tag=%0h, pc=%0h, rd=%0d, warp=%0d, data=%0h", - $time, CORE_ID, 
mem_wb_unqual_if.valid, mrq_read_addr, mem_wb_unqual_if.curr_PC, mem_wb_unqual_if.rd, mem_wb_unqual_if.warp_num, mem_wb_unqual_if.data); + $display("%t: D$%0d rsp: valid=%b, warp=%0d, PC=%0h, tag=%0h, rd=%0d, data=%0h", + $time, CORE_ID, lsu_wb_if.valid, lsu_wb_if.warp_num, lsu_wb_if.curr_PC, mrq_read_addr, lsu_wb_if.rd, lsu_wb_if.data); end end `endif diff --git a/hw/rtl/VX_mem_unit.v b/hw/rtl/VX_mem_unit.v index af626723..8399857a 100644 --- a/hw/rtl/VX_mem_unit.v +++ b/hw/rtl/VX_mem_unit.v @@ -40,18 +40,20 @@ module VX_mem_unit # ( .CORE_TAG_ID_BITS (`DCORE_TAG_ID_BITS) ) core_dcache_rsp_qual_if(), core_smem_rsp_if(); - // select shared memory address - wire is_smem_addr = (({core_dcache_req_if.addr[0], 2'b0} - `SHARED_MEM_BASE_ADDR) <= `SCACHE_SIZE); - wire smem_select = (| core_dcache_req_if.valid) ? is_smem_addr : 0; + // select shared memory bus + wire is_smem_addr = (({core_dcache_req_if.addr[0], 2'b0} - `SHARED_MEM_BASE_ADDR) <= `SCACHE_SIZE); + wire smem_req_select = (| core_dcache_req_if.valid) ? 
is_smem_addr : 0; + wire smem_rsp_select = (| core_smem_rsp_if.valid); - VX_dcache_arb dcache_smem_arb ( - .req_select (smem_select), - .in_core_req_if (core_dcache_req_if), - .out0_core_req_if (core_dcache_req_qual_if), - .out1_core_req_if (core_smem_req_if), - .in0_core_rsp_if (core_dcache_rsp_qual_if), - .in1_core_rsp_if (core_smem_rsp_if), - .out_core_rsp_if (core_dcache_rsp_if) + VX_dcache_arb dcache_smem_arb ( + .core_req_in_if (core_dcache_req_if), + .core_req_out0_if (core_dcache_req_qual_if), + .core_req_out1_if (core_smem_req_if), + .core_rsp_in0_if (core_dcache_rsp_qual_if), + .core_rsp_in1_if (core_smem_rsp_if), + .core_rsp_out_if (core_dcache_rsp_if), + .select_req (smem_req_select), + .select_rsp (smem_rsp_select) ); VX_cache #(
diff --git a/hw/rtl/VX_mul_unit.v b/hw/rtl/VX_mul_unit.v
new file mode 100644
index 00000000..c0a80de1
--- /dev/null
+++ b/hw/rtl/VX_mul_unit.v
@@ -0,0 +1,145 @@
+`include "VX_define.vh"
+
+// Multiply/divide unit: executes the MUL_* operations on every thread of a warp.
+//
+// Each thread gets its own pipelined VX_mult and VX_divide instance. A request
+// is held on mul_req_if (ready low) until the selected operator has had
+// MUL_LATENCY / DIV_LATENCY cycles to produce its result, which is then
+// registered onto mul_wb_if together with the request's warp/PC/rd/wb fields.
+module VX_mul_unit #(
+    parameter CORE_ID = 0
+) (
+    input wire clk,
+    input wire reset,
+
+    // Inputs
+    VX_mul_req_if   mul_req_if,
+
+    // Outputs
+    VX_wb_if        mul_wb_if
+);
+    // Written from an always block below, so it has to be a variable, not a net.
+    reg  [`NUM_THREADS-1:0][31:0] alu_result;
+    wire [`NUM_THREADS-1:0][63:0] mul_result;
+    wire [`NUM_THREADS-1:0][31:0] div_result;
+    wire [`NUM_THREADS-1:0][31:0] rem_result;
+
+    wire [`MUL_BITS-1:0]          alu_op  = mul_req_if.mul_op;
+    wire [`NUM_THREADS-1:0][31:0] alu_in1 = mul_req_if.rs1_data;
+    wire [`NUM_THREADS-1:0][31:0] alu_in2 = mul_req_if.rs2_data;
+
+    // Declared before the sequencing logic that reads it; driven further down.
+    wire stall;
+
+    genvar i;
+
+    for (i = 0; i < `NUM_THREADS; i++) begin
+
+        // Operands are widened to 33 bits so one signed operator serves both the
+        // signed and unsigned variants: bit 32 is the sign bit only when the
+        // operation treats that operand as signed, otherwise it is zero.
+        wire [32:0] mul_in1 = {(alu_op != `MUL_MULHU) & alu_in1[i][31], alu_in1[i]};
+        wire [32:0] mul_in2 = {(alu_op != `MUL_MULHU && alu_op != `MUL_MULHSU) & alu_in2[i][31], alu_in2[i]};
+
+        wire [32:0] div_in1 = {(alu_op == `MUL_DIV || alu_op == `MUL_REM) & alu_in1[i][31], alu_in1[i]};
+        wire [32:0] div_in2 = {(alu_op == `MUL_DIV || alu_op == `MUL_REM) & alu_in2[i][31], alu_in2[i]};
+
+        VX_mult #(
+            .WIDTHA(33),
+            .WIDTHB(33),
+            .WIDTHP(64),
+            .SIGNED(1),
+            .PIPELINE(`MUL_LATENCY)
+        ) multiplier (
+            .clk(clk),
+            .reset(reset),
+            .dataa(mul_in1),
+            .datab(mul_in2),
+            .result(mul_result[i])
+        );
+
+        VX_divide #(
+            .WIDTHN(33),
+            .WIDTHD(33),
+            .WIDTHQ(32),
+            .WIDTHR(32),
+            .NSIGNED(1),
+            .DSIGNED(1),
+            .PIPELINE(`DIV_LATENCY)
+        ) sdiv (
+            .clk(clk),
+            .reset(reset),
+            .numer(div_in1),
+            .denom(div_in2),
+            .quotient(div_result[i]),
+            .remainder(rem_result[i])
+        );
+
+        // Per-thread result select. Division by zero is decided per thread from
+        // that thread's own divisor: quotient is all ones, remainder is the dividend.
+        always @(*) begin
+            case (alu_op)
+                `MUL_MUL:    alu_result[i] = mul_result[i][31:0];
+                `MUL_MULH,
+                `MUL_MULHSU,
+                `MUL_MULHU:  alu_result[i] = mul_result[i][63:32];
+                `MUL_DIV,
+                `MUL_DIVU:   alu_result[i] = (alu_in2[i] == 0) ? 32'hffffffff : div_result[i];
+                `MUL_REM,
+                `MUL_REMU:   alu_result[i] = (alu_in2[i] == 0) ? alu_in1[i] : rem_result[i];
+                default:     alu_result[i] = alu_in1[i] + alu_in2[i]; // fallback for any other opcode value
+            endcase
+        end
+    end
+
+    reg        result_avail; // operator outputs hold the result of the current request
+    reg  [4:0] pending_ctr;  // cycles left until result_avail is raised
+    wire [4:0] instr_delay = `IS_DIV_OP(alu_op) ? `DIV_LATENCY : `MUL_LATENCY;
+
+    // Latency sequencer. The three states are mutually exclusive so that a
+    // countdown is only ever started by a request that has not been served yet;
+    // every request therefore waits the same number of cycles as the first one
+    // after reset.
+    always @(posedge clk) begin
+        if (reset) begin
+            result_avail <= 0;
+            pending_ctr  <= 0;
+        end else if (result_avail) begin
+            // Result ready: keep it until the output register takes it.
+            if (!stall) begin
+                result_avail <= 0;
+                pending_ctr  <= 0;
+            end
+        end else if (pending_ctr != 0) begin
+            pending_ctr <= pending_ctr - 1;
+            if (pending_ctr == 1)
+                result_avail <= 1;
+        end else if (| mul_req_if.valid) begin
+            pending_ctr <= instr_delay - 1;
+            if (instr_delay == 1)
+                result_avail <= 1;
+        end
+    end
+
+    // Hold the request while its result is still being computed.
+    wire pipeline_stall = ~result_avail && (| mul_req_if.valid);
+
+    // NOTE(review): if VX_generic_register holds its contents while stalled, a
+    // result already accepted downstream stays valid during pipeline_stall - confirm.
+    assign stall = (~mul_wb_if.ready && (| mul_wb_if.valid))
+                || pipeline_stall;
+
+    VX_generic_register #(
+        .N(`NUM_THREADS + `NW_BITS + 32 + `NR_BITS + `WB_BITS + (`NUM_THREADS * 32))
+    ) mul_reg (
+        .clk   (clk),
+        .reset (reset),
+        .stall (stall),
+        .flush (0),
+        .in    ({mul_req_if.valid, mul_req_if.warp_num, mul_req_if.curr_PC, mul_req_if.rd, mul_req_if.wb, alu_result}),
+        .out   ({mul_wb_if.valid,  mul_wb_if.warp_num,  mul_wb_if.curr_PC,  mul_wb_if.rd,  mul_wb_if.wb,  mul_wb_if.data})
+    );
+
+    assign mul_req_if.ready = ~stall;
+
+endmodule
\ No newline at end of file
diff --git a/hw/rtl/VX_pipeline.v b/hw/rtl/VX_pipeline.v index 72c6ee2d..7fee578b 100644 --- a/hw/rtl/VX_pipeline.v +++ b/hw/rtl/VX_pipeline.v @@ -58,16 +58,6 @@ module VX_pipeline #( output wire busy, output wire ebreak ); - -`DEBUG_BEGIN - wire scheduler_empty; -`DEBUG_END - - wire memory_delay; - wire exec_delay; - wire gpr_stage_delay; - wire schedule_delay; - // Dcache VX_cache_core_req_if #( .NUM_REQUESTS(`NUM_THREADS), @@ -98,7 +88,6 @@ module VX_pipeline #( .CORE_TAG_ID_BITS(`ICORE_TAG_ID_BITS) ) core_icache_rsp_if(); - // CSR I/O VX_csr_io_req_if csr_io_req_if(); assign csr_io_req_if.valid = csr_io_req_valid; @@ -112,69 +101,95 @@ module VX_pipeline #( assign csr_io_rsp_data = csr_io_rsp_if.data; assign csr_io_rsp_if.ready = csr_io_rsp_ready; - // Front-end to Back-end - VX_backend_req_if bckE_req_if(); - - // Back-end to Front-end - VX_wb_if writeback_if(); + VX_decode_if decode_if(); + VX_execute_if execute_if(); VX_branch_rsp_if branch_rsp_if(); - VX_jal_rsp_if jal_rsp_if(); - - // Warp controls VX_warp_ctl_if warp_ctl_if(); + VX_ifetch_rsp_if ifetch_rsp_if(); + VX_wb_if writeback_if(); + VX_wstall_if wstall_if(); + VX_join_if join_if(); + VX_wb_if alu_wb_if(); + VX_wb_if branch_wb_if(); + VX_wb_if lsu_wb_if(); + VX_wb_if csr_wb_if(); + VX_wb_if mul_wb_if(); - VX_front_end #( + wire notify_commit; + + VX_fetch #( .CORE_ID(CORE_ID) - ) front_end ( - `SCOPE_SIGNALS_ISTAGE_BIND - + ) fetch ( .clk (clk), .reset (reset), - .warp_ctl_if (warp_ctl_if), - .bckE_req_if (bckE_req_if), - .schedule_delay (schedule_delay), - .icache_rsp_if (core_icache_rsp_if), .icache_req_if (core_icache_req_if), - .jal_rsp_if (jal_rsp_if), + .icache_rsp_if (core_icache_rsp_if), + .wstall_if (wstall_if), + .join_if (join_if), + .warp_ctl_if (warp_ctl_if), .branch_rsp_if (branch_rsp_if), + .ifetch_rsp_if (ifetch_rsp_if), .busy (busy) ); - VX_scheduler scheduler ( + VX_decode #( + .CORE_ID(CORE_ID) + ) 
decode ( + .clk (clk), + .reset (reset), + .ifetch_rsp_if (ifetch_rsp_if), + .decode_if (decode_if), + .wstall_if (wstall_if), + .join_if (join_if) + ); + + VX_issue #( + .CORE_ID(CORE_ID) + ) issue ( + .clk (clk), + .reset (reset), + .decode_if (decode_if), + .writeback_if (writeback_if), + .execute_if (execute_if), + `UNUSED_PIN (is_empty) + ); + + VX_execute #( + .CORE_ID(CORE_ID) + ) execute ( + `SCOPE_SIGNALS_LSU_BIND + .clk (clk), + .reset (reset), + .dcache_req_if (core_dcache_req_if), + .dcache_rsp_if (core_dcache_rsp_if), + .csr_io_req_if (csr_io_req_if), + .csr_io_rsp_if (csr_io_rsp_if), + .execute_if (execute_if), + .writeback_if (writeback_if), + .warp_ctl_if (warp_ctl_if), + .branch_rsp_if (branch_rsp_if), + .alu_wb_if (alu_wb_if), + .branch_wb_if (branch_wb_if), + .lsu_wb_if (lsu_wb_if), + .csr_wb_if (csr_wb_if), + .mul_wb_if (mul_wb_if), + .notify_commit (notify_commit), + .ebreak (ebreak) + ); + + VX_writeback #( + .CORE_ID(CORE_ID) + ) writeback ( .clk (clk), .reset (reset), - .memory_delay (memory_delay), - .exec_delay (exec_delay), - .gpr_stage_delay(gpr_stage_delay), - .bckE_req_if (bckE_req_if), + .alu_wb_if (alu_wb_if), + .branch_wb_if (branch_wb_if), + .lsu_wb_if (lsu_wb_if), + .csr_wb_if (csr_wb_if), + .mul_wb_if (mul_wb_if), .writeback_if (writeback_if), - .schedule_delay (schedule_delay), - .is_empty (scheduler_empty) - ); - - VX_back_end #( - .CORE_ID(CORE_ID) - ) back_end ( - `SCOPE_SIGNALS_LSU_BIND - `SCOPE_SIGNALS_BE_BIND - - .clk (clk), - .reset (reset), - .csr_io_req_if (csr_io_req_if), - .csr_io_rsp_if (csr_io_rsp_if), - .schedule_delay (schedule_delay), - .warp_ctl_if (warp_ctl_if), - .bckE_req_if (bckE_req_if), - .jal_rsp_if (jal_rsp_if), - .branch_rsp_if (branch_rsp_if), - .dcache_req_if (core_dcache_req_if), - .dcache_rsp_if (core_dcache_rsp_if), - .writeback_if (writeback_if), - .mem_delay (memory_delay), - .exec_delay (exec_delay), - .gpr_stage_delay (gpr_stage_delay), - .ebreak (ebreak) - ); + .notify_commit (notify_commit) + 
); assign dcache_req_valid = core_dcache_req_if.valid; assign dcache_req_rw = core_dcache_req_if.rw; @@ -204,17 +219,14 @@ module VX_pipeline #( `SCOPE_ASSIGN(scope_busy, busy); `SCOPE_ASSIGN(scope_schedule_delay, schedule_delay); - `SCOPE_ASSIGN(scope_memory_delay, memory_delay); + `SCOPE_ASSIGN(scope_mem_delay, mem_delay); `SCOPE_ASSIGN(scope_exec_delay, exec_delay); - `SCOPE_ASSIGN(scope_gpr_stage_delay, gpr_stage_delay); + `SCOPE_ASSIGN(scope_gpr_stage_delay, gpr_delay); `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin - if ((| writeback_if.valid) && (writeback_if.wb != 0)) begin - $display("%t: Core%0d-WB: warp=%0d, rd=%0d, data=%0h", $time, CORE_ID, writeback_if.warp_num, writeback_if.rd, writeback_if.data); - end - if (schedule_delay || memory_delay || exec_delay || gpr_stage_delay) begin - $display("%t: Core%0d-Delay: sched=%b, mem=%b, exec=%b, gpr=%b ", $time, CORE_ID, schedule_delay, memory_delay, exec_delay, gpr_stage_delay); + if ((| execute_if.valid) && (~execute_if.alu_ready || ~execute_if.br_ready || ~execute_if.lsu_ready || ~execute_if.csr_ready || ~execute_if.mul_ready || ~execute_if.gpu_ready)) begin + $display("%t: Core%0d-stall: warp=%0d, PC=%0h, alu=%b, br=%b, lsu=%b, csr=%b, mul=%b, gpu=%b", $time, CORE_ID, execute_if.warp_num, execute_if.curr_PC, ~execute_if.alu_ready, ~execute_if.br_ready, ~execute_if.lsu_ready, ~execute_if.csr_ready, ~execute_if.mul_ready, ~execute_if.gpu_ready); end end `endif diff --git a/hw/rtl/VX_scheduler.v b/hw/rtl/VX_scheduler.v deleted file mode 100644 index 52810bcc..00000000 --- a/hw/rtl/VX_scheduler.v +++ /dev/null @@ -1,83 +0,0 @@ -`include "VX_define.vh" - -module VX_scheduler ( - input wire clk, - input wire reset, - input wire memory_delay, - input wire exec_delay, - input wire gpr_stage_delay, - - VX_backend_req_if bckE_req_if, - VX_wb_if writeback_if, - - output wire schedule_delay, - output wire is_empty -); - localparam CTVW = `CLOG2(`NUM_WARPS * 32 + 1); - - reg [31:0][`NUM_THREADS-1:0] 
rename_table[`NUM_WARPS-1:0]; - reg [CTVW-1:0] count_valid; - - wire is_store = (bckE_req_if.mem_write != `BYTE_EN_NO); - wire is_load = (bckE_req_if.mem_read != `BYTE_EN_NO); - wire is_mem = (is_store || is_load); - wire is_gpu = (bckE_req_if.is_wspawn || bckE_req_if.is_tmc || bckE_req_if.is_barrier || bckE_req_if.is_split); - wire is_csr = bckE_req_if.is_csr; - wire is_exec = !is_mem && !is_gpu && !is_csr; - - wire using_rs2 = is_store - || (bckE_req_if.rs2_src == `RS2_REG) - || bckE_req_if.is_barrier - || bckE_req_if.is_wspawn; - - wire rs1_rename = (rename_table[bckE_req_if.warp_num][bckE_req_if.rs1] != 0); - wire rs2_rename = (rename_table[bckE_req_if.warp_num][bckE_req_if.rs2] != 0); - wire rd_rename = (rename_table[bckE_req_if.warp_num][bckE_req_if.rd ] != 0); - - wire rs1_rename_qual = (rs1_rename) && (bckE_req_if.rs1 != 0); - wire rs2_rename_qual = (rs2_rename) && (bckE_req_if.rs2 != 0 && using_rs2); - wire rd_rename_qual = (rd_rename) && (bckE_req_if.rd != 0); - - wire rename_valid = rs1_rename_qual || rs2_rename_qual || rd_rename_qual; - - assign schedule_delay = (| bckE_req_if.valid) - && ((rename_valid) - || (memory_delay && is_mem) - || (gpr_stage_delay && (is_mem || is_exec)) - || (exec_delay && is_exec)); - - assign is_empty = (count_valid == 0); - - integer i, w; - - wire acquire_rd = (| bckE_req_if.valid) && (bckE_req_if.wb != 0) && (bckE_req_if.rd != 0) && !schedule_delay; - - wire release_rd = (| writeback_if.valid) && (writeback_if.wb != 0) && (writeback_if.rd != 0); - - wire [`NUM_THREADS-1:0] valid_wb_new_mask = rename_table[writeback_if.warp_num][writeback_if.rd] & ~writeback_if.valid; - - reg [CTVW-1:0] count_valid_next = (acquire_rd && !(release_rd && (0 == valid_wb_new_mask))) ? (count_valid + 1) : - (~acquire_rd && (release_rd && (0 == valid_wb_new_mask))) ? 
(count_valid - 1) : - count_valid; - - always @(posedge clk) begin - if (reset) begin - for (w = 0; w < `NUM_WARPS; w++) begin - for (i = 0; i < 32; i++) begin - rename_table[w][i] <= 0; - end - end - count_valid <= 0; - end else begin - if (acquire_rd) begin - rename_table[bckE_req_if.warp_num][bckE_req_if.rd] <= bckE_req_if.valid; - end - if (release_rd) begin - assert(rename_table[writeback_if.warp_num][writeback_if.rd] != 0); - rename_table[writeback_if.warp_num][writeback_if.rd] <= valid_wb_new_mask; - end - count_valid <= count_valid_next; - end - end - -endmodule \ No newline at end of file diff --git a/hw/rtl/VX_scope.vh b/hw/rtl/VX_scope.vh index 972286ae..4d11cd4c 100644 --- a/hw/rtl/VX_scope.vh +++ b/hw/rtl/VX_scope.vh @@ -74,7 +74,7 @@ scope_execute_valid, \ scope_writeback_valid, \ scope_schedule_delay, \ - scope_memory_delay, \ + scope_mem_delay, \ scope_exec_delay, \ scope_gpr_stage_delay, \ scope_busy, \ @@ -127,26 +127,26 @@ wire scope_busy; \ wire scope_snp_rsp_ready; \ wire scope_schedule_delay; \ - wire scope_memory_delay; \ + wire scope_mem_delay; \ wire scope_exec_delay; \ wire scope_gpr_stage_delay; \ wire [`NUM_THREADS-1:0] scope_decode_valid; \ wire [`NW_BITS-1:0] scope_decode_warp_num; \ wire [31:0] scope_decode_curr_PC; \ wire scope_decode_is_jal; \ - wire [4:0] scope_decode_rs1; \ - wire [4:0] scope_decode_rs2; \ + wire [`NR_BITS-1:0] scope_decode_rs1; \ + wire [`NR_BITS-1:0] scope_decode_rs2; \ wire [`NUM_THREADS-1:0] scope_execute_valid; \ wire [`NW_BITS-1:0] scope_execute_warp_num; \ wire [31:0] scope_execute_curr_PC; \ - wire [4:0] scope_execute_rd; \ + wire [`NR_BITS-1:0] scope_execute_rd; \ wire [63:0] scope_execute_a; \ wire [63:0] scope_execute_b; \ wire [`NUM_THREADS-1:0] scope_writeback_valid; \ wire [`NW_BITS-1:0] scope_writeback_warp_num; \ wire [31:0] scope_writeback_curr_PC; \ - wire [1:0] scope_writeback_wb; \ - wire [4:0] scope_writeback_rd; \ + wire [`WB_BITS-1:0] scope_writeback_wb; \ + wire [`NR_BITS-1:0] 
scope_writeback_rd; \ wire [63:0] scope_writeback_data; \ wire scope_bank_valid_st0; \ wire scope_bank_valid_st1; \ @@ -204,7 +204,7 @@ `define SCOPE_SIGNALS_PIPELINE_IO \ output wire scope_busy, \ output wire scope_schedule_delay, \ - output wire scope_memory_delay, \ + output wire scope_mem_delay, \ output wire scope_exec_delay, \ output wire scope_gpr_stage_delay, @@ -213,19 +213,19 @@ output wire [`NW_BITS-1:0] scope_decode_warp_num, \ output wire [31:0] scope_decode_curr_PC, \ output wire scope_decode_is_jal, \ - output wire [4:0] scope_decode_rs1, \ - output wire [4:0] scope_decode_rs2, \ + output wire [`NR_BITS-1:0] scope_decode_rs1, \ + output wire [`NR_BITS-1:0] scope_decode_rs2, \ output wire [`NUM_THREADS-1:0] scope_execute_valid, \ output wire [`NW_BITS-1:0] scope_execute_warp_num, \ output wire [31:0] scope_execute_curr_PC, \ - output wire [4:0] scope_execute_rd, \ + output wire [`NR_BITS-1:0] scope_execute_rd, \ output wire [63:0] scope_execute_a, \ output wire [63:0] scope_execute_b, \ output wire [`NUM_THREADS-1:0] scope_writeback_valid, \ output wire [`NW_BITS-1:0] scope_writeback_warp_num, \ output wire [31:0] scope_writeback_curr_PC, \ - output wire [1:0] scope_writeback_wb, \ - output wire [4:0] scope_writeback_rd, \ + output wire [`WB_BITS-1:0] scope_writeback_wb, \ + output wire [`NR_BITS-1:0] scope_writeback_rd, \ output wire [63:0] scope_writeback_data, `define SCOPE_SIGNALS_ISTAGE_BIND \ @@ -326,7 +326,7 @@ `define SCOPE_SIGNALS_PIPELINE_BIND \ .scope_busy (scope_busy), \ .scope_schedule_delay (scope_schedule_delay), \ - .scope_memory_delay (scope_memory_delay), \ + .scope_mem_delay (scope_mem_delay), \ .scope_exec_delay (scope_exec_delay), \ .scope_gpr_stage_delay (scope_gpr_stage_delay), diff --git a/hw/rtl/VX_warp.v b/hw/rtl/VX_warp.v index b780d760..55c2bb31 100644 --- a/hw/rtl/VX_warp.v +++ b/hw/rtl/VX_warp.v @@ -10,7 +10,7 @@ module VX_warp ( input wire change_mask, input wire jal, input wire[31:0] dest, - input wire branch_dir, + 
input wire branch_taken, input wire[31:0] branch_dest, input wire wspawn, input wire[31:0] wspawn_pc, @@ -44,7 +44,7 @@ module VX_warp ( always @(*) begin if (jal == 1'b1) begin temp_PC = dest; - end else if (branch_dir) begin + end else if (branch_taken) begin temp_PC = branch_dest; end else begin temp_PC = real_PC; diff --git a/hw/rtl/VX_warp_sched.v b/hw/rtl/VX_warp_sched.v index 9621d47b..09ec8564 100644 --- a/hw/rtl/VX_warp_sched.v +++ b/hw/rtl/VX_warp_sched.v @@ -1,90 +1,38 @@ `include "VX_define.vh" -module VX_warp_sched ( - input wire clk, - input wire reset, - input wire stall, +module VX_warp_sched #( + parameter CORE_ID = 0 +) ( + input wire clk, + input wire reset, - // Wspawn - input wire wspawn, - input wire[31:0] wsapwn_pc, - input wire[`NUM_WARPS-1:0] wspawn_new_active, + VX_warp_ctl_if warp_ctl_if, + VX_wstall_if wstall_if, + VX_join_if join_if, + VX_branch_rsp_if branch_rsp_if, - // CTM - input wire ctm, - input wire[`NUM_THREADS-1:0] ctm_mask, - input wire[`NW_BITS-1:0] ctm_warp_num, + VX_ifetch_rsp_if ifetch_rsp_if, + VX_ifetch_req_if ifetch_req_if, - // WHALT - input wire whalt, - input wire[`NW_BITS-1:0] whalt_warp_num, - - input wire is_barrier, -`DEBUG_BEGIN - input wire[31:0] barrier_id, -`DEBUG_END - input wire[$clog2(`NUM_WARPS):0] num_warps, - input wire[`NW_BITS-1:0] barrier_warp_num, - - // WSTALL - input wire wstall, - input wire [`NW_BITS-1:0] wstall_warp_num, - - // Split - input wire is_split, - input wire dont_split, - input wire [`NUM_THREADS-1:0] split_new_mask, - input wire [`NUM_THREADS-1:0] split_later_mask, - input wire [31:0] split_save_pc, - input wire [`NW_BITS-1:0] split_warp_num, - - // Join - input wire is_join, - input wire [`NW_BITS-1:0] join_warp_num, - - // JAL - input wire jal, - input wire [31:0] dest, - input wire [`NW_BITS-1:0] jal_warp_num, - - // Branch - input wire branch_valid, - input wire branch_dir, - input wire [31:0] branch_dest, - input wire [`NW_BITS-1:0] branch_warp_num, - - output wire 
[`NUM_THREADS-1:0] thread_mask, - output wire [`NW_BITS-1:0] warp_num, - output wire [31:0] warp_pc, - output wire busy, - output wire scheduled_warp, - - input wire [`NW_BITS-1:0] icache_stage_wid, - input wire icache_stage_response + output wire busy ); wire update_use_wspawn; wire update_visible_active; + wire scheduled_warp; - wire[(1+32+`NUM_THREADS-1):0] d[`NUM_WARPS-1:0]; + wire [(1+32+`NUM_THREADS-1):0] d[`NUM_WARPS-1:0]; - wire join_fall; - wire[31:0] join_pc; - wire[`NUM_THREADS-1:0] join_tm; + wire join_fall; + wire [31:0] join_pc; + wire [`NUM_THREADS-1:0] join_tm; -`DEBUG_BEGIN - wire in_wspawn = wspawn; - wire in_ctm = ctm; - wire in_whalt = whalt; - wire in_wstall = wstall; -`DEBUG_END - - reg[`NUM_WARPS-1:0] warp_active; - reg[`NUM_WARPS-1:0] warp_stalled; + reg [`NUM_WARPS-1:0] warp_active; + reg [`NUM_WARPS-1:0] warp_stalled; reg [`NUM_WARPS-1:0] visible_active; - wire[`NUM_WARPS-1:0] use_active; + wire [`NUM_WARPS-1:0] use_active; - reg [`NUM_WARPS-1:0] warp_lock; + reg [`NUM_WARPS-1:0] warp_lock; wire wstall_this_cycle; @@ -92,17 +40,23 @@ module VX_warp_sched ( reg [31:0] warp_pcs[`NUM_WARPS-1:0]; // barriers - reg [`NUM_WARPS-1:0] barrier_stall_mask[(`NUM_BARRIERS-1):0]; - wire reached_barrier_limit; + reg [`NUM_WARPS-1:0] barrier_stall_mask[`NUM_BARRIERS-1:0]; wire [`NUM_WARPS-1:0] b_mask; - wire [$clog2(`NUM_WARPS):0] b_count; + wire [`NW_BITS:0] b_count; - // wsapwn - reg [31:0] use_wsapwn_pc; - reg [`NUM_WARPS-1:0] use_wsapwn; + wire reached_barrier_limit; - wire [`NW_BITS-1:0] warp_to_schedule; - wire schedule; + // wspawn + reg [31:0] use_wspawn_pc; + reg [`NUM_WARPS-1:0] use_wspawn; + + wire [`NW_BITS-1:0] warp_to_schedule; + wire schedule; + + wire [`NUM_THREADS-1:0] thread_mask; + wire [`NW_BITS-1:0] warp_num; + wire [31:0] warp_pc; + wire scheduled_warp; wire hazard; wire global_stall; @@ -115,15 +69,18 @@ module VX_warp_sched ( reg didnt_split; - integer w, b; + wire stall; + + integer i; always @(posedge clk) begin if (reset) begin 
- for (b = 0; b < `NUM_BARRIERS; b=b+1) begin - barrier_stall_mask[b] <= 0; + for (i = 0; i < `NUM_BARRIERS; i++) begin + barrier_stall_mask[i] <= 0; end - use_wsapwn_pc <= 0; - use_wsapwn <= 0; + + use_wspawn_pc <= 0; + use_wspawn <= 0; warp_pcs[0] <= `STARTUP_ADDR; warp_active[0] <= 1; // Activating first warp visible_active[0] <= 1; // Activating first warp @@ -131,63 +88,62 @@ module VX_warp_sched ( warp_stalled <= 0; didnt_split <= 0; warp_lock <= 0; - // total_barrier_stall = 0; - for (w = 1; w < `NUM_WARPS; w=w+1) begin - warp_pcs[w] <= 0; - warp_active[w] <= 0; // Activating first warp - visible_active[w] <= 0; // Activating first warp - thread_masks[w] <= 1; // Activating first thread in first warp + + for (i = 1; i < `NUM_WARPS; i++) begin + warp_pcs[i] <= 0; + warp_active[i] <= 0; // Activating first warp + visible_active[i] <= 0; // Activating first warp + thread_masks[i] <= 1; // Activating first thread in first warp end end else begin - // Wsapwning warps - if (wspawn) begin - warp_active <= wspawn_new_active; - use_wsapwn_pc <= wsapwn_pc; - use_wsapwn <= wspawn_new_active & (~`NUM_WARPS'b1); + + if (warp_ctl_if.wspawn) begin + warp_active <= warp_ctl_if.wspawn_new_active; + use_wspawn_pc <= warp_ctl_if.wspawn_pc; + use_wspawn <= warp_ctl_if.wspawn_new_active & (~`NUM_WARPS'b1); end - if (is_barrier) begin - warp_stalled[barrier_warp_num] <= 0; + if (warp_ctl_if.is_barrier) begin + warp_stalled[warp_ctl_if.warp_num] <= 0; if (reached_barrier_limit) begin - barrier_stall_mask[barrier_id] <= 0; + barrier_stall_mask[warp_ctl_if.barrier_id] <= 0; end else begin - barrier_stall_mask[barrier_id][barrier_warp_num] <= 1; + barrier_stall_mask[warp_ctl_if.barrier_id][warp_ctl_if.warp_num] <= 1; end - end else if (ctm) begin - thread_masks[ctm_warp_num] <= ctm_mask; - warp_stalled[ctm_warp_num] <= 0; - end else if (is_join && !didnt_split) begin + end else if (warp_ctl_if.change_mask) begin + thread_masks[warp_ctl_if.warp_num] <= warp_ctl_if.thread_mask; + 
warp_stalled[warp_ctl_if.warp_num] <= 0; + end else if (join_if.is_join && !didnt_split) begin if (!join_fall) begin - warp_pcs[join_warp_num] <= join_pc; + warp_pcs[join_if.warp_num] <= join_pc; end - thread_masks[join_warp_num] <= join_tm; - didnt_split <= 0; - end else if (is_split) begin - warp_stalled[split_warp_num] <= 0; - if (!dont_split) begin - thread_masks[split_warp_num] <= split_new_mask; + thread_masks[join_if.warp_num] <= join_tm; + didnt_split <= 0; + end else if (warp_ctl_if.is_split) begin + warp_stalled[warp_ctl_if.warp_num] <= 0; + if (warp_ctl_if.do_split) begin + thread_masks[warp_ctl_if.warp_num] <= warp_ctl_if.split_new_mask; didnt_split <= 0; end else begin didnt_split <= 1; end end - if (whalt) begin - warp_active[whalt_warp_num] <= 0; - visible_active[whalt_warp_num] <= 0; + if (warp_ctl_if.whalt) begin + warp_active[warp_ctl_if.warp_num] <= 0; + visible_active[warp_ctl_if.warp_num] <= 0; end if (update_use_wspawn) begin - use_wsapwn[warp_to_schedule] <= 0; + use_wspawn[warp_to_schedule] <= 0; thread_masks[warp_to_schedule] <= 1; end - // Stalling the scheduling of warps - if (wstall) begin - warp_stalled[wstall_warp_num] <= 1; - visible_active[wstall_warp_num] <= 0; + if (wstall_if.wstall) begin + warp_stalled[wstall_if.warp_num] <= 1; + visible_active[wstall_if.warp_num] <= 0; end // Refilling active warps @@ -201,26 +157,20 @@ module VX_warp_sched ( warp_pcs[warp_to_schedule] <= new_pc; end - // Jal - if (jal) begin - warp_pcs[jal_warp_num] <= dest; - warp_stalled[jal_warp_num] <= 0; - end - // Branch - if (branch_valid) begin - if (branch_dir) begin - warp_pcs[branch_warp_num] <= branch_dest; + if (branch_rsp_if.valid) begin + if (branch_rsp_if.taken) begin + warp_pcs[branch_rsp_if.warp_num] <= branch_rsp_if.dest; end - warp_stalled[branch_warp_num] <= 0; + warp_stalled[branch_rsp_if.warp_num] <= 0; end // Lock/Release if (scheduled_warp && !stall) begin - warp_lock[warp_num] <= 1'b1; + warp_lock[warp_num] <= 1; end - if 
(icache_stage_response) begin - warp_lock[icache_stage_wid] <= 1'b0; + if ((| ifetch_rsp_if.valid) && ifetch_rsp_if.ready) begin + warp_lock[ifetch_rsp_if.warp_num] <= 0; end end @@ -233,7 +183,7 @@ module VX_warp_sched ( .count (b_count) ); - wire [$clog2(`NUM_WARPS):0] count_visible_active; + wire [`NW_BITS:0] count_visible_active; VX_countones #( .N(`NUM_WARPS) @@ -242,30 +192,29 @@ module VX_warp_sched ( .count (count_visible_active) ); - // assign b_count = $countones(b_mask); + assign b_mask = barrier_stall_mask[warp_ctl_if.barrier_id][`NUM_WARPS-1:0]; + + assign reached_barrier_limit = (b_count == warp_ctl_if.num_warps); - assign b_mask = barrier_stall_mask[barrier_id][`NUM_WARPS-1:0]; - assign reached_barrier_limit = b_count == (num_warps); - - assign wstall_this_cycle = wstall && (wstall_warp_num == warp_to_schedule); // Maybe bug + assign wstall_this_cycle = wstall_if.wstall && (wstall_if.warp_num == warp_to_schedule); // Maybe bug assign total_barrier_stall = barrier_stall_mask[0] | barrier_stall_mask[1] | barrier_stall_mask[2] | barrier_stall_mask[3]; - assign update_visible_active = (0 == count_visible_active) && !(stall || wstall_this_cycle || hazard || is_join); + assign update_visible_active = (0 == count_visible_active) && !(stall || wstall_this_cycle || hazard || join_if.is_join); - wire [(1+32+`NUM_THREADS-1):0] q1 = {1'b1, 32'b0, thread_masks[split_warp_num]}; - wire [(1+32+`NUM_THREADS-1):0] q2 = {1'b0, split_save_pc, split_later_mask}; + wire [(1+32+`NUM_THREADS-1):0] q1 = {1'b1, 32'b0, thread_masks[warp_ctl_if.warp_num]}; + wire [(1+32+`NUM_THREADS-1):0] q2 = {1'b0, warp_ctl_if.split_save_pc, warp_ctl_if.split_later_mask}; - assign {join_fall, join_pc, join_tm} = d[join_warp_num]; + assign {join_fall, join_pc, join_tm} = d[join_if.warp_num]; - genvar i; - generate - for (i = 0; i < `NUM_WARPS; i++) begin : stacks - wire correct_warp_s = (i == split_warp_num); - wire correct_warp_j = (i == join_warp_num); + genvar j; - wire push = (is_split && 
!dont_split) && correct_warp_s; - wire pop = is_join && correct_warp_j; + for (j = 0; j < `NUM_WARPS; j++) begin : stacks + wire correct_warp_s = (j == warp_ctl_if.warp_num); + wire correct_warp_j = (j == join_if.warp_num); + + wire push = (warp_ctl_if.is_split && warp_ctl_if.do_split) && correct_warp_s; + wire pop = join_if.is_join && correct_warp_j; VX_generic_stack #( .WIDTH(1+32+`NUM_THREADS), @@ -279,27 +228,27 @@ module VX_warp_sched ( .q1 (q1), .q2 (q2) ); - end - endgenerate + end - wire should_jal = (jal && (warp_to_schedule == jal_warp_num)); - wire should_bra = (branch_valid && branch_dir && (warp_to_schedule == branch_warp_num)); + wire should_bra = (branch_rsp_if.valid && branch_rsp_if.taken && (warp_to_schedule == branch_rsp_if.warp_num)); - assign hazard = (should_jal || should_bra) && schedule; + assign hazard = should_bra && schedule; assign real_schedule = schedule && !warp_stalled[warp_to_schedule] && !total_barrier_stall[warp_to_schedule] && !warp_lock[0]; - assign global_stall = (stall || wstall_this_cycle || hazard || !real_schedule || is_join); + assign global_stall = (stall || wstall_this_cycle || hazard || !real_schedule || join_if.is_join); - assign scheduled_warp = !(wstall_this_cycle || hazard || !real_schedule || is_join) && !reset; + assign scheduled_warp = !(wstall_this_cycle || hazard || !real_schedule || join_if.is_join) && !reset; - wire real_use_wspawn = use_wsapwn[warp_to_schedule]; + wire real_use_wspawn = use_wspawn[warp_to_schedule]; - assign warp_pc = real_use_wspawn ? use_wsapwn_pc : warp_pcs[warp_to_schedule]; + assign warp_pc = real_use_wspawn ? use_wspawn_pc : warp_pcs[warp_to_schedule]; + assign thread_mask = (global_stall) ? 0 : (real_use_wspawn ? 
`NUM_THREADS'b1 : thread_masks[warp_to_schedule]); + assign warp_num = warp_to_schedule; - assign update_use_wspawn = use_wsapwn[warp_to_schedule] && !global_stall; + assign update_use_wspawn = use_wspawn[warp_to_schedule] && !global_stall; assign new_pc = warp_pc + 4; @@ -315,13 +264,21 @@ module VX_warp_sched ( .grant_index (warp_to_schedule), .grant_valid (schedule), `UNUSED_PIN (grant_onehot) + ); + + assign stall = ~ifetch_req_if.ready && (| ifetch_req_if.valid); + + VX_generic_register #( + .N(`NUM_THREADS + 32 + `NW_BITS) + ) fetch_reg ( + .clk (clk), + .reset (reset), + .stall (stall), + .flush (0), + .in ({thread_mask, warp_pc, warp_num}), + .out ({ifetch_req_if.valid, ifetch_req_if.curr_PC, ifetch_req_if.warp_num}) ); - // always @(*) begin - // $display("WarpPC: %h",warp_pc); - // $display("real_schedule: %d, schedule: %d, warp_stalled: %d, warp_to_schedule: %d, total_barrier_stall: %d",real_schedule, schedule, warp_stalled[warp_to_schedule], warp_to_schedule, total_barrier_stall[warp_to_schedule]); - // end - - assign busy = (warp_active != 0); + assign busy = (warp_active != 0); endmodule \ No newline at end of file diff --git a/hw/rtl/VX_writeback.v b/hw/rtl/VX_writeback.v index 4baddbfb..cd642c3d 100644 --- a/hw/rtl/VX_writeback.v +++ b/hw/rtl/VX_writeback.v @@ -1,89 +1,115 @@ `include "VX_define.vh" -module VX_writeback ( - input wire clk, - input wire reset, +// Write-back stage: picks one unit result per cycle (fixed priority branch > lsu > mul > alu > csr) and registers it onto writeback_if. +module VX_writeback #( + parameter CORE_ID = 0 +) ( + input wire clk, + input wire reset, - // Mem WB info - VX_wb_if mem_wb_if, + // inputs + VX_wb_if alu_wb_if, + VX_wb_if branch_wb_if, + VX_wb_if lsu_wb_if, + VX_wb_if mul_wb_if, + VX_wb_if csr_wb_if, - // EXEC Unit WB info - VX_wb_if inst_exec_wb_if, - - // CSR Unit WB info - VX_wb_if csr_wb_if, - - // Actual WB to GPR - VX_wb_if writeback_if, - output wire no_slot_mem, - output wire no_slot_exec, - output wire no_slot_csr + // outputs + VX_wb_if writeback_if, + output wire notify_commit ); - VX_wb_if writeback_tmp_if(); + // Per-unit request flags: set when any thread of that unit's result is valid. + wire br_valid = 
(| branch_wb_if.valid); + wire lsu_valid = (| lsu_wb_if.valid); + wire mul_valid = (| mul_wb_if.valid); + wire alu_valid = (| alu_wb_if.valid); + wire csr_valid = (| csr_wb_if.valid); - wire exec_wb = (inst_exec_wb_if.wb != 0) && (| inst_exec_wb_if.valid); - wire mem_wb = (mem_wb_if.wb != 0) && (| mem_wb_if.valid); - wire csr_wb = (csr_wb_if.wb != 0) && (| csr_wb_if.valid); + VX_wb_if writeback_tmp_if(); - assign no_slot_mem = mem_wb && (exec_wb || csr_wb); - assign no_slot_csr = csr_wb && exec_wb; - assign no_slot_exec = 0; + assign writeback_tmp_if.valid = br_valid ? branch_wb_if.valid : + lsu_valid ? lsu_wb_if.valid : + mul_valid ? mul_wb_if.valid : + alu_valid ? alu_wb_if.valid : + csr_valid ? csr_wb_if.valid : + 0; - assign writeback_tmp_if.data = exec_wb ? inst_exec_wb_if.data : - csr_wb ? csr_wb_if.data : - mem_wb ? mem_wb_if.data : - 0; + assign writeback_tmp_if.warp_num = br_valid ? branch_wb_if.warp_num : + lsu_valid ? lsu_wb_if.warp_num : + mul_valid ? mul_wb_if.warp_num : + alu_valid ? alu_wb_if.warp_num : + csr_valid ? csr_wb_if.warp_num : + + 0; - assign writeback_tmp_if.valid = exec_wb ? inst_exec_wb_if.valid : - csr_wb ? csr_wb_if.valid : - mem_wb ? mem_wb_if.valid : - 0; + assign writeback_tmp_if.curr_PC = br_valid ? branch_wb_if.curr_PC : + lsu_valid ? lsu_wb_if.curr_PC : + mul_valid ? mul_wb_if.curr_PC : + alu_valid ? alu_wb_if.curr_PC : + csr_valid ? csr_wb_if.curr_PC : + 0; - assign writeback_tmp_if.rd = exec_wb ? inst_exec_wb_if.rd : - csr_wb ? csr_wb_if.rd : - mem_wb ? mem_wb_if.rd : - 0; + assign writeback_tmp_if.data = br_valid ? branch_wb_if.data : + lsu_valid ? lsu_wb_if.data : + mul_valid ? mul_wb_if.data : + alu_valid ? alu_wb_if.data : + csr_valid ? csr_wb_if.data : + 0; - assign writeback_tmp_if.wb = exec_wb ? inst_exec_wb_if.wb : - csr_wb ? csr_wb_if.wb : - mem_wb ? mem_wb_if.wb : - 0; + assign writeback_tmp_if.rd = br_valid ? branch_wb_if.rd : + lsu_valid ? lsu_wb_if.rd : + mul_valid ? mul_wb_if.rd : + alu_valid ? 
alu_wb_if.rd : + csr_valid ? csr_wb_if.rd : + 0; - assign writeback_tmp_if.warp_num = exec_wb ? inst_exec_wb_if.warp_num : - csr_wb ? csr_wb_if.warp_num : - mem_wb ? mem_wb_if.warp_num : - 0; + // wb uses the same priority order as every other field so all fields come from one unit. + assign writeback_tmp_if.wb = br_valid ? branch_wb_if.wb : + lsu_valid ? lsu_wb_if.wb : + mul_valid ? mul_wb_if.wb : + alu_valid ? alu_wb_if.wb : + csr_valid ? csr_wb_if.wb : + 0; - assign writeback_tmp_if.curr_PC = exec_wb ? inst_exec_wb_if.curr_PC : - csr_wb ? 32'hdeadbeef : - mem_wb ? mem_wb_if.curr_PC : - 32'hdeadbeef; - - wire [`NUM_THREADS-1:0][31:0] use_wb_data; + wire stall = ~writeback_if.ready && (| writeback_if.valid); VX_generic_register #( - .N(39 + `NW_BITS-1 + 1 + `NUM_THREADS*33) - ) wb_register ( - .clk (clk), - .reset(reset), - .stall(1'b0), - .flush(1'b0), - .in ({writeback_tmp_if.data, writeback_tmp_if.valid, writeback_tmp_if.rd, writeback_tmp_if.wb, writeback_tmp_if.warp_num, writeback_tmp_if.curr_PC}), - .out ({use_wb_data, writeback_if.valid, writeback_if.rd, writeback_if.wb, writeback_if.warp_num, writeback_if.curr_PC}) + .N(`NUM_THREADS + `NW_BITS + 32 + `NR_BITS + (`NUM_THREADS * 32) + `WB_BITS) + ) wb_reg ( + .clk (clk), + .reset (reset), + .stall (stall), + .flush (0), + .in ({writeback_tmp_if.valid, writeback_tmp_if.warp_num, writeback_tmp_if.curr_PC, writeback_tmp_if.rd, writeback_tmp_if.data, writeback_tmp_if.wb}), + .out ({writeback_if.valid, writeback_if.warp_num, writeback_if.curr_PC, writeback_if.rd, writeback_if.data, writeback_if.wb}) ); - reg [31:0] last_data_wb /* verilator public */; + assign branch_wb_if.ready = !stall; + assign lsu_wb_if.ready = !stall && !br_valid; + assign mul_wb_if.ready = !stall && !br_valid && !lsu_valid; + assign alu_wb_if.ready = !stall && !br_valid && !lsu_valid && !mul_valid; + assign csr_wb_if.ready = !stall && !br_valid && !lsu_valid && !mul_valid && !alu_valid; + + assign notify_commit = (| writeback_tmp_if.valid) && ~stall; + // special workaround to control RISC-V benchmarks termination on Verilator + 
reg [31:0] last_data_wb /* verilator public */; always @(posedge clk) begin - if ( (| writeback_if.valid) && (writeback_if.wb != 0) && (writeback_if.rd == 28)) begin - last_data_wb <= use_wb_data[0]; + if (notify_commit && (writeback_tmp_if.wb != 0) && (writeback_tmp_if.rd == 28)) begin + last_data_wb <= writeback_tmp_if.data[0]; end end - assign writeback_if.data = use_wb_data; +`ifdef DBG_PRINT_PIPELINE + always @(posedge clk) begin + if ((| writeback_tmp_if.valid) && ~stall) begin + $display("%t: Core%0d-WB: warp=%0d, PC=%0h, rd=%0d, wb=%0d, data=%0h", $time, CORE_ID, writeback_tmp_if.warp_num, writeback_tmp_if.curr_PC, writeback_tmp_if.rd, writeback_tmp_if.wb, writeback_tmp_if.data); + end + end +`endif -endmodule : VX_writeback +endmodule diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index 57b6efe5..2980274d 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -106,8 +106,8 @@ module VX_bank #( `ifdef DBG_CORE_REQ_INFO /* verilator lint_off UNUSED */ wire[31:0] debug_use_pc_st0; - wire[1:0] debug_wb_st0; - wire[4:0] debug_rd_st0; + wire[`WB_BITS-1:0] debug_wb_st0; + wire[`NR_BITS-1:0] debug_rd_st0; wire[`NW_BITS-1:0] debug_warp_num_st0; wire debug_rw_st0; wire[WORD_SIZE-1:0] debug_byteen_st0; @@ -115,8 +115,8 @@ module VX_bank #( wire[`UP(CORE_TAG_ID_BITS)-1:0] debug_tagid_st0; wire[31:0] debug_use_pc_st1e; - wire[1:0] debug_wb_st1e; - wire[4:0] debug_rd_st1e; + wire[`WB_BITS-1:0] debug_wb_st1e; + wire[`NR_BITS-1:0] debug_rd_st1e; wire[`NW_BITS-1:0] debug_warp_num_st1e; wire debug_rw_st1e; wire[WORD_SIZE-1:0] debug_byteen_st1e; @@ -124,8 +124,8 @@ module VX_bank #( wire[`UP(CORE_TAG_ID_BITS)-1:0] debug_tagid_st1e; wire[31:0] debug_use_pc_st2; - wire[1:0] debug_wb_st2; - wire[4:0] debug_rd_st2; + wire[`WB_BITS-1:0] debug_wb_st2; + wire[`NR_BITS-1:0] debug_rd_st2; wire[`NW_BITS-1:0] debug_warp_num_st2; wire debug_rw_st2; wire[WORD_SIZE-1:0] debug_byteen_st2; @@ -370,7 +370,7 @@ module VX_bank #( .clk (clk), .reset (reset), .stall 
(stall_bank_pipe), - .flush (1'b0), + .flush (0), .in ({qual_is_mrvq_st0, qual_is_snp_st0, qual_snp_invalidate_st0, qual_going_to_write_st0, qual_valid_st0, qual_addr_st0, qual_wsel_st0, qual_writeword_st0, qual_inst_meta_st0, qual_is_fill_st0, qual_writedata_st0}), .out ({is_mrvq_st1[0] , is_snp_st1[0], snp_invalidate_st1[0], going_to_write_st1[0], valid_st1[0], addr_st1[0], wsel_st1[0], writeword_st1[0], inst_meta_st1[0], is_fill_st1[0], writedata_st1[0]}) ); @@ -383,7 +383,7 @@ module VX_bank #( .clk (clk), .reset (reset), .stall (stall_bank_pipe), - .flush (1'b0), + .flush (0), .in ({is_mrvq_st1[i-1], is_snp_st1[i-1], snp_invalidate_st1[i-1], going_to_write_st1[i-1], valid_st1[i-1], addr_st1[i-1], wsel_st1[i-1], writeword_st1[i-1], inst_meta_st1[i-1], is_fill_st1[i-1], writedata_st1[i-1]}), .out ({is_mrvq_st1[i] , is_snp_st1[i], snp_invalidate_st1[i], going_to_write_st1[i], valid_st1[i], addr_st1[i], wsel_st1[i], writeword_st1[i], inst_meta_st1[i], is_fill_st1[i], writedata_st1[i]}) ); @@ -512,7 +512,7 @@ module VX_bank #( .clk (clk), .reset (reset), .stall (stall_bank_pipe), - .flush (1'b0), + .flush (0), .in ({mrvq_recover_ready_state_st1e, is_mrvq_st1e_st2, mrvq_init_ready_state_st1e , snp_to_mrvq_st1e, is_snp_st1e, snp_invalidate_st1e, fill_saw_dirty_st1e, is_fill_st1[STAGE_1_CYCLES-1] , qual_valid_st1e_2, addr_st1e, wsel_st1[STAGE_1_CYCLES-1], writeword_st1[STAGE_1_CYCLES-1], readword_st1e, readdata_st1e, readtag_st1e, miss_st1e, dirty_st1e, dirtyb_st1e, inst_meta_st1[STAGE_1_CYCLES-1]}), .out ({mrvq_recover_ready_state_st2 , is_mrvq_st2 , mrvq_init_ready_state_unqual_st2, snp_to_mrvq_st2 , is_snp_st2 , snp_invalidate_st2, fill_saw_dirty_st2 , is_fill_st2 , valid_st2 , addr_st2 , wsel_st2, writeword_st2 , readword_st2 , readdata_st2 , readtag_st2 , miss_st2 , dirty_st2 , dirtyb_st2, inst_meta_st2 }) ); @@ -765,4 +765,4 @@ module VX_bank #( `SCOPE_ASSIGN(scope_bank_addr_st1, `LINE_TO_BYTE_ADDR(addr_st1e, BANK_ID)); `SCOPE_ASSIGN(scope_bank_addr_st2, 
`LINE_TO_BYTE_ADDR(addr_st2, BANK_ID)); -endmodule : VX_bank +endmodule diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index 240bd304..e76430f9 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -130,8 +130,8 @@ module VX_cache #( `ifdef DBG_CORE_REQ_INFO /* verilator lint_off UNUSED */ wire[31:0] debug_core_req_use_pc; - wire[1:0] debug_core_req_wb; - wire[4:0] debug_core_req_rd; + wire[`WB_BITS-1:0] debug_core_req_wb; + wire[`NR_BITS-1:0] debug_core_req_rd; wire[`NW_BITS-1:0] debug_core_req_warp_num; wire[`LOG2UP(CREQ_SIZE)-1:0] debug_core_req_idx; /* verilator lint_on UNUSED */ diff --git a/hw/rtl/cache/VX_cache_core_rsp_merge.v b/hw/rtl/cache/VX_cache_core_rsp_merge.v index 849d8907..24e40f91 100644 --- a/hw/rtl/cache/VX_cache_core_rsp_merge.v +++ b/hw/rtl/cache/VX_cache_core_rsp_merge.v @@ -23,9 +23,9 @@ module VX_cache_core_rsp_merge #( output wire [NUM_BANKS-1:0] per_bank_core_rsp_ready, // Core Writeback - output reg [NUM_REQUESTS-1:0] core_rsp_valid, - output reg [NUM_REQUESTS-1:0][`WORD_WIDTH-1:0] core_rsp_data, - output reg [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag, + output wire [NUM_REQUESTS-1:0] core_rsp_valid, + output wire [NUM_REQUESTS-1:0][`WORD_WIDTH-1:0] core_rsp_data, + output wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag, input wire core_rsp_ready ); @@ -41,47 +41,63 @@ module VX_cache_core_rsp_merge #( `UNUSED_PIN (grant_onehot) ); - reg [NUM_BANKS-1:0] per_bank_core_rsp_pop_unqual; + reg [NUM_REQUESTS-1:0] core_rsp_valid_unqual; + reg [NUM_REQUESTS-1:0][`WORD_WIDTH-1:0] core_rsp_data_unqual; + reg [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag_unqual; + reg [NUM_BANKS-1:0] core_rsp_bank_select; - assign per_bank_core_rsp_ready = per_bank_core_rsp_pop_unqual & {NUM_BANKS{core_rsp_ready}}; + wire stall = ~core_rsp_ready; integer i; if (CORE_TAG_ID_BITS != 0) begin - assign core_rsp_tag = per_bank_core_rsp_tag[main_bank_index]; always @(*) begin - core_rsp_valid = 
0; - core_rsp_data = 0; + core_rsp_valid_unqual = 0; + core_rsp_data_unqual = 0; + core_rsp_tag_unqual = per_bank_core_rsp_tag[main_bank_index]; for (i = 0; i < NUM_BANKS; i++) begin if (per_bank_core_rsp_valid[i] && (per_bank_core_rsp_tag[i][CORE_TAG_ID_BITS-1:0] == per_bank_core_rsp_tag[main_bank_index][CORE_TAG_ID_BITS-1:0])) begin - core_rsp_valid[per_bank_core_rsp_tid[i]] = 1; - core_rsp_data[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_data[i]; - per_bank_core_rsp_pop_unqual[i] = 1; + core_rsp_valid_unqual[per_bank_core_rsp_tid[i]] = 1; + core_rsp_data_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_data[i]; + core_rsp_bank_select[i] = 1; end else begin - per_bank_core_rsp_pop_unqual[i] = 0; + core_rsp_bank_select[i] = 0; end end end end else begin always @(*) begin - core_rsp_valid = 0; - core_rsp_data = 0; - core_rsp_tag = 0; + core_rsp_valid_unqual = 0; + core_rsp_data_unqual = 0; + core_rsp_tag_unqual = 0; for (i = 0; i < NUM_BANKS; i++) begin if (per_bank_core_rsp_valid[i] - && !core_rsp_valid[per_bank_core_rsp_tid[i]] + && !core_rsp_valid_unqual[per_bank_core_rsp_tid[i]] && ((main_bank_index == `BANK_BITS'(i)) || (per_bank_core_rsp_tid[i] != per_bank_core_rsp_tid[main_bank_index]))) begin - core_rsp_valid[per_bank_core_rsp_tid[i]] = 1; - core_rsp_data[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_data[i]; - core_rsp_tag[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_tag[i]; - per_bank_core_rsp_pop_unqual[i] = 1; + core_rsp_valid_unqual[per_bank_core_rsp_tid[i]] = 1; + core_rsp_data_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_data[i]; + core_rsp_tag_unqual[per_bank_core_rsp_tid[i]] = per_bank_core_rsp_tag[i]; + core_rsp_bank_select[i] = 1; end else begin - per_bank_core_rsp_pop_unqual[i] = 0; + core_rsp_bank_select[i] = 0; end end end end + VX_generic_register #( + .N(NUM_REQUESTS + (NUM_REQUESTS *`WORD_WIDTH) + (`CORE_REQ_TAG_COUNT * CORE_TAG_WIDTH)) + ) core_wb_reg ( + .clk (clk), + .reset (reset), + .stall (stall), + .flush (0), + .in 
({core_rsp_valid_unqual, core_rsp_data_unqual, core_rsp_tag_unqual}), + .out ({core_rsp_valid, core_rsp_data, core_rsp_tag}) + ); + + assign per_bank_core_rsp_ready = core_rsp_bank_select & {NUM_BANKS{~stall}}; + endmodule diff --git a/hw/rtl/cache/VX_snp_forwarder.v b/hw/rtl/cache/VX_snp_forwarder.v index f54b6eba..606570fd 100644 --- a/hw/rtl/cache/VX_snp_forwarder.v +++ b/hw/rtl/cache/VX_snp_forwarder.v @@ -59,7 +59,7 @@ module VX_snp_forwarder #( assign sfq_push = snp_req_valid && !sfq_full && fwdout_ready; assign sfq_pop = snp_rsp_valid; - VX_indexable_queue #( + VX_index_queue #( .DATAW (`LOG2UP(SNRQ_SIZE) + 1 +`DRAM_ADDR_WIDTH+SNP_REQ_TAG_WIDTH), .SIZE (SNRQ_SIZE) ) snp_fwd_queue ( diff --git a/hw/rtl/cache/VX_tag_data_access.v b/hw/rtl/cache/VX_tag_data_access.v index 6dfb5b98..4009ac4e 100644 --- a/hw/rtl/cache/VX_tag_data_access.v +++ b/hw/rtl/cache/VX_tag_data_access.v @@ -116,7 +116,7 @@ module VX_tag_data_access #( .clk (clk), .reset (reset), .stall (stall), - .flush (1'b0), + .flush (0), .in ({qual_read_valid_st1, qual_read_dirty_st1, qual_read_dirtyb_st1, qual_read_tag_st1, qual_read_data_st1}), .out ({read_valid_st1c[0], read_dirty_st1c[0], read_dirtyb_st1c[0], read_tag_st1c[0], read_data_st1c[0]}) ); @@ -129,7 +129,7 @@ module VX_tag_data_access #( .clk (clk), .reset (reset), .stall (stall), - .flush (1'b0), + .flush (0), .in ({read_valid_st1c[i-1], read_dirty_st1c[i-1], read_dirtyb_st1c[i-1], read_tag_st1c[i-1], read_data_st1c[i-1]}), .out ({read_valid_st1c[i], read_dirty_st1c[i], read_dirtyb_st1c[i], read_tag_st1c[i], read_data_st1c[i]}) ); diff --git a/hw/rtl/interfaces/VX_alu_req_if.v b/hw/rtl/interfaces/VX_alu_req_if.v new file mode 100644 index 00000000..42e427d5 --- /dev/null +++ b/hw/rtl/interfaces/VX_alu_req_if.v @@ -0,0 +1,24 @@ +`ifndef VX_ALU_REQ_IF +`define VX_ALU_REQ_IF + +`include "VX_define.vh" + +interface VX_alu_req_if (); + + wire [`NUM_THREADS-1:0] valid; + wire [`NW_BITS-1:0] warp_num; + wire [31:0] curr_PC; + + wire 
[`ALU_BITS-1:0] alu_op; + + wire [`NUM_THREADS-1:0][31:0] rs1_data; + wire [`NUM_THREADS-1:0][31:0] rs2_data; + + wire [`NR_BITS-1:0] rd; + wire [`WB_BITS-1:0] wb; + + wire ready; + +endinterface + +`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_backend_req_if.v b/hw/rtl/interfaces/VX_backend_req_if.v deleted file mode 100644 index f153f1b6..00000000 --- a/hw/rtl/interfaces/VX_backend_req_if.v +++ /dev/null @@ -1,40 +0,0 @@ -`ifndef VX_FrE_to_BCKBE_REQ_IF -`define VX_FrE_to_BCKBE_REQ_IF - -`include "VX_define.vh" - -interface VX_backend_req_if (); - - wire [`NUM_THREADS-1:0] valid; - wire [`NW_BITS-1:0] warp_num; - wire [31:0] curr_PC; - wire [11:0] csr_addr; - wire is_csr; - wire csr_immed; - wire [31:0] csr_mask; - wire [4:0] rd; - wire [4:0] rs1; - wire [4:0] rs2; - wire [4:0] alu_op; - wire [1:0] wb; - wire rs2_src; - wire [31:0] itype_immed; - wire [`BYTE_EN_BITS-1:0] mem_read; - wire [`BYTE_EN_BITS-1:0] mem_write; - wire [2:0] branch_type; - wire [19:0] upper_immed; - wire is_etype; - wire is_jal; - wire jal; - wire [31:0] jal_offset; - wire [31:0] next_PC; - - // GPGPU stuff - wire is_wspawn; - wire is_tmc; - wire is_split; - wire is_barrier; - -endinterface - -`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_branch_rsp_if.v b/hw/rtl/interfaces/VX_branch_rsp_if.v deleted file mode 100644 index a8f01d44..00000000 --- a/hw/rtl/interfaces/VX_branch_rsp_if.v +++ /dev/null @@ -1,15 +0,0 @@ -`ifndef VX_BRANCH_RSP_IF -`define VX_BRANCH_RSP_IF - -`include "VX_define.vh" - -interface VX_branch_rsp_if (); - - wire valid; - wire dir; - wire [31:0] dest; - wire [`NW_BITS-1:0] warp_num; - -endinterface - -`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_csr_io_req_if.v b/hw/rtl/interfaces/VX_csr_io_req_if.v index ce8d2fed..716887d3 100644 --- a/hw/rtl/interfaces/VX_csr_io_req_if.v +++ b/hw/rtl/interfaces/VX_csr_io_req_if.v @@ -5,11 +5,11 @@ interface VX_csr_io_req_if (); - wire valid; - wire rw; - wire [11:0] 
addr; - wire [31:0] data; - wire ready; + wire valid; + wire [`CSR_ADDR_SIZE-1:0] addr; + wire rw; + wire [31:0] data; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_csr_req_if.v b/hw/rtl/interfaces/VX_csr_req_if.v index 6eeddf74..2956416a 100644 --- a/hw/rtl/interfaces/VX_csr_req_if.v +++ b/hw/rtl/interfaces/VX_csr_req_if.v @@ -6,17 +6,20 @@ interface VX_csr_req_if (); wire [`NUM_THREADS-1:0] valid; - wire [`NW_BITS-1:0] warp_num; - wire [4:0] rd; - wire [1:0] wb; - wire [4:0] alu_op; - wire is_csr; - wire [11:0] csr_addr; - wire csr_immed; - wire [31:0] csr_mask; + wire [`NW_BITS-1:0] warp_num; + wire [31:0] curr_PC; - wire is_io; + wire [`CSR_BITS-1:0] csr_op; + wire [`CSR_ADDR_SIZE-1:0] csr_addr; + wire [31:0] csr_mask; + + wire [`NR_BITS-1:0] rd; + wire [`WB_BITS-1:0] wb; + wire is_io; + + wire ready; + endinterface `endif diff --git a/hw/rtl/interfaces/VX_decode_if.v b/hw/rtl/interfaces/VX_decode_if.v new file mode 100644 index 00000000..e4b99dc6 --- /dev/null +++ b/hw/rtl/interfaces/VX_decode_if.v @@ -0,0 +1,33 @@ +`ifndef VX_DECODE_IF +`define VX_DECODE_IF + +`include "VX_define.vh" + +interface VX_decode_if (); + + wire [`NUM_THREADS-1:0] valid; + wire [`NW_BITS-1:0] warp_num; + wire [31:0] curr_PC; + wire [31:0] next_PC; + + wire [`EX_BITS-1:0] ex_type; + wire [`OP_BITS-1:0] instr_op; + + wire [`NR_BITS-1:0] rd; + wire [`NR_BITS-1:0] rs1; + wire [`NR_BITS-1:0] rs2; + wire [31:0] imm; + + wire rs1_is_PC; + wire rs2_is_imm; + + wire use_rs1; + wire use_rs2; + + wire [`WB_BITS-1:0] wb; + + wire ready; + +endinterface + +`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_exec_unit_req_if.v b/hw/rtl/interfaces/VX_exec_unit_req_if.v deleted file mode 100644 index 918cb456..00000000 --- a/hw/rtl/interfaces/VX_exec_unit_req_if.v +++ /dev/null @@ -1,47 +0,0 @@ -`ifndef VX_EXE_UNIT_REQ_IF -`define VX_EXE_UNIT_REQ_IF - -`include "VX_define.vh" - -interface VX_exec_unit_req_if (); - - // Meta - wire [`NUM_THREADS-1:0] valid; - wire 
[`NW_BITS-1:0] warp_num; - wire [31:0] curr_PC; - wire [31:0] next_PC; - - // Write Back Info - wire [4:0] rd; - wire [1:0] wb; - - // Data and alu op - wire [`NUM_THREADS-1:0][31:0] a_reg_data; - wire [`NUM_THREADS-1:0][31:0] b_reg_data; - wire [4:0] alu_op; - wire [4:0] rs1; - wire [4:0] rs2; - wire rs2_src; - wire [31:0] itype_immed; - wire [19:0] upper_immed; - - // Branch type - wire [2:0] branch_type; - - // Jal info - wire is_jal; - wire jal; - wire [31:0] jal_offset; - - wire is_etype; - wire wspawn; - - // CSR info - wire is_csr; - wire [11:0] csr_addr; - wire csr_immed; - wire [31:0] csr_mask; - -endinterface - -`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_execute_if.v b/hw/rtl/interfaces/VX_execute_if.v new file mode 100644 index 00000000..ae1fe68b --- /dev/null +++ b/hw/rtl/interfaces/VX_execute_if.v @@ -0,0 +1,33 @@ +`ifndef VX_EXECUTE_IF +`define VX_EXECUTE_IF + +`include "VX_define.vh" + +interface VX_execute_if(); + + wire [`NUM_THREADS-1:0] valid; + wire [`NW_BITS-1:0] warp_num; + wire [31:0] curr_PC; + wire [`EX_BITS-1:0] ex_type; + wire [`OP_BITS-1:0] instr_op; + + wire [`NR_BITS-1:0] rd; + wire [`NR_BITS-1:0] rs1; + wire [`NR_BITS-1:0] rs2; + wire [31:0] imm; + wire rs1_is_PC; + wire rs2_is_imm; + wire [31:0] next_PC; + + wire [`WB_BITS-1:0] wb; + + wire alu_ready; + wire br_ready; + wire mul_ready; + wire lsu_ready; + wire csr_ready; + wire gpu_ready; + +endinterface + +`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_gpr_read_if.v b/hw/rtl/interfaces/VX_gpr_read_if.v deleted file mode 100644 index 5673c319..00000000 --- a/hw/rtl/interfaces/VX_gpr_read_if.v +++ /dev/null @@ -1,19 +0,0 @@ -`ifndef VX_GPR_READ_IF -`define VX_GPR_READ_IF - -`include "VX_define.vh" - -interface VX_gpr_read_if (); - - wire [4:0] rs1; - wire [4:0] rs2; - wire [`NW_BITS-1:0] warp_num; - wire is_jal; - wire[31:0] curr_PC; - - wire [`NUM_THREADS-1:0][31:0] a_reg_data; - wire [`NUM_THREADS-1:0][31:0] b_reg_data; - -endinterface 
- -`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_gpu_inst_req_if.v b/hw/rtl/interfaces/VX_gpu_inst_req_if.v deleted file mode 100644 index a39800b0..00000000 --- a/hw/rtl/interfaces/VX_gpu_inst_req_if.v +++ /dev/null @@ -1,23 +0,0 @@ -`ifndef VX_GPGPU_INST_REQ_IF -`define VX_GPGPU_INST_REQ_IF - -`include "VX_define.vh" - -interface VX_gpu_inst_req_if(); - - wire [`NUM_THREADS-1:0] valid; - wire [`NW_BITS-1:0] warp_num; - wire is_wspawn; - wire is_tmc; - wire is_split; - - wire is_barrier; - - wire[31:0] next_PC; - - wire [`NUM_THREADS-1:0][31:0] a_reg_data; - wire [31:0] rd2; - -endinterface - -`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_gpu_req_if.v b/hw/rtl/interfaces/VX_gpu_req_if.v new file mode 100644 index 00000000..13adb788 --- /dev/null +++ b/hw/rtl/interfaces/VX_gpu_req_if.v @@ -0,0 +1,21 @@ +`ifndef VX_GPU_REQ_IF +`define VX_GPU_REQ_IF + +`include "VX_define.vh" + +interface VX_gpu_req_if(); + + wire [`NUM_THREADS-1:0] valid; + wire [`NW_BITS-1:0] warp_num; + wire [31:0] next_PC; + + wire [`GPU_BITS-1:0] gpu_op; + + wire [`NUM_THREADS-1:0][31:0] rs1_data; + wire [31:0] rs2_data; + + wire ready; + +endinterface + +`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_inst_meta_if.v b/hw/rtl/interfaces/VX_ifetch_req_if.v similarity index 57% rename from hw/rtl/interfaces/VX_inst_meta_if.v rename to hw/rtl/interfaces/VX_ifetch_req_if.v index 8ebfa87f..ac95eaac 100644 --- a/hw/rtl/interfaces/VX_inst_meta_if.v +++ b/hw/rtl/interfaces/VX_ifetch_req_if.v @@ -1,14 +1,14 @@ -`ifndef VX_INST_META_IF -`define VX_INST_META_IF +`ifndef VX_IFETCH_REQ_IF +`define VX_IFETCH_REQ_IF `include "VX_define.vh" -interface VX_inst_meta_if (); +interface VX_ifetch_req_if (); wire [`NUM_THREADS-1:0] valid; wire [31:0] curr_PC; wire [`NW_BITS-1:0] warp_num; - wire [31:0] instruction; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_ifetch_rsp_if.v b/hw/rtl/interfaces/VX_ifetch_rsp_if.v new file mode 100644 
index 00000000..c8e5e2d7 --- /dev/null +++ b/hw/rtl/interfaces/VX_ifetch_rsp_if.v @@ -0,0 +1,16 @@ +`ifndef VX_IFETCH_RSP_IF +`define VX_IFETCH_RSP_IF + +`include "VX_define.vh" + +interface VX_ifetch_rsp_if (); + + wire [`NUM_THREADS-1:0] valid; + wire [31:0] curr_PC; + wire [`NW_BITS-1:0] warp_num; + wire [31:0] instr; + wire ready; + +endinterface + +`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_jal_rsp_if.v b/hw/rtl/interfaces/VX_jal_rsp_if.v deleted file mode 100644 index 3f6796a0..00000000 --- a/hw/rtl/interfaces/VX_jal_rsp_if.v +++ /dev/null @@ -1,15 +0,0 @@ - -`ifndef VX_JAL_RSP_IF -`define VX_JAL_RSP_IF - -`include "VX_define.vh" - -interface VX_jal_rsp_if (); - - wire valid; - wire [31:0] dest; - wire [`NW_BITS-1:0] warp_num; - -endinterface - -`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_join_if.v b/hw/rtl/interfaces/VX_join_if.v index bc48bfee..15c1509b 100644 --- a/hw/rtl/interfaces/VX_join_if.v +++ b/hw/rtl/interfaces/VX_join_if.v @@ -1,4 +1,3 @@ - `ifndef VX_JOIN_IF `define VX_JOIN_IF diff --git a/hw/rtl/interfaces/VX_lsu_req_if.v b/hw/rtl/interfaces/VX_lsu_req_if.v index 216853cf..c333961d 100644 --- a/hw/rtl/interfaces/VX_lsu_req_if.v +++ b/hw/rtl/interfaces/VX_lsu_req_if.v @@ -1,4 +1,3 @@ - `ifndef VX_LSU_REQ_IF `define VX_LSU_REQ_IF @@ -10,12 +9,13 @@ interface VX_lsu_req_if (); wire [31:0] curr_PC; wire [`NW_BITS-1:0] warp_num; wire [`NUM_THREADS-1:0][31:0] store_data; - wire [`NUM_THREADS-1:0][31:0] base_addr; // A reg data - wire [31:0] offset; // itype_immed - wire [`BYTE_EN_BITS-1:0] mem_read; - wire [`BYTE_EN_BITS-1:0] mem_write; - wire [4:0] rd; // dest register - wire [1:0] wb; // + wire [`NUM_THREADS-1:0][31:0] base_addr; + wire [31:0] offset; + wire rw; + wire [`BYTEEN_BITS-1:0] byteen; + wire [`NR_BITS-1:0] rd; + wire [`WB_BITS-1:0] wb; + wire ready; endinterface diff --git a/hw/rtl/interfaces/VX_mul_req_if.v b/hw/rtl/interfaces/VX_mul_req_if.v new file mode 100644 index 00000000..26d175fb 
--- /dev/null +++ b/hw/rtl/interfaces/VX_mul_req_if.v @@ -0,0 +1,24 @@ +`ifndef VX_MUL_REQ_IF +`define VX_MUL_REQ_IF + +`include "VX_define.vh" + +interface VX_mul_req_if (); + + wire [`NUM_THREADS-1:0] valid; + wire [`NW_BITS-1:0] warp_num; + wire [31:0] curr_PC; + + wire [`NUM_THREADS-1:0][31:0] rs1_data; + wire [`NUM_THREADS-1:0][31:0] rs2_data; + + wire [`MUL_BITS-1:0] mul_op; + + wire [`NR_BITS-1:0] rd; + wire [`WB_BITS-1:0] wb; + + wire ready; + +endinterface + +`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_warp_ctl_if.v b/hw/rtl/interfaces/VX_warp_ctl_if.v index a8169d0c..7eef29f0 100644 --- a/hw/rtl/interfaces/VX_warp_ctl_if.v +++ b/hw/rtl/interfaces/VX_warp_ctl_if.v @@ -1,4 +1,3 @@ - `ifndef VX_WARP_CTL_IF `define VX_WARP_CTL_IF @@ -7,6 +6,7 @@ interface VX_warp_ctl_if (); wire [`NW_BITS-1:0] warp_num; + wire change_mask; wire [`NUM_THREADS-1:0] thread_mask; @@ -16,16 +16,13 @@ interface VX_warp_ctl_if (); wire whalt; - // barrier wire is_barrier; - wire [31:0] barrier_id; - wire [$clog2(`NUM_WARPS):0] num_warps; + wire [`NB_BITS-1:0] barrier_id; + wire [`NW_BITS:0] num_warps; wire is_split; - wire dont_split; -`IGNORE_WARNINGS_BEGIN - wire [`NW_BITS-1:0] split_warp_num; -`IGNORE_WARNINGS_END + wire do_split; + wire [`NUM_THREADS-1:0] split_new_mask; wire [`NUM_THREADS-1:0] split_later_mask; wire [31:0] split_save_pc; diff --git a/hw/rtl/interfaces/VX_wb_if.v b/hw/rtl/interfaces/VX_wb_if.v index 96b8acef..4fb23d73 100644 --- a/hw/rtl/interfaces/VX_wb_if.v +++ b/hw/rtl/interfaces/VX_wb_if.v @@ -6,12 +6,13 @@ interface VX_wb_if (); wire [`NUM_THREADS-1:0] valid; - wire [`NUM_THREADS-1:0][31:0] data; wire [`NW_BITS-1:0] warp_num; - wire [4:0] rd; - wire [1:0] wb; wire [31:0] curr_PC; + wire [`NUM_THREADS-1:0][31:0] data; + wire [`NR_BITS-1:0] rd; + wire [`WB_BITS-1:0] wb; wire is_io; + wire ready; endinterface diff --git a/hw/rtl/libs/VX_countones.v b/hw/rtl/libs/VX_countones.v index 6d77b934..625a969f 100644 --- 
a/hw/rtl/libs/VX_countones.v +++ b/hw/rtl/libs/VX_countones.v @@ -1,8 +1,8 @@ module VX_countones #( parameter N = 10 ) ( - input wire[N-1:0] valids, - output reg[$clog2(N):0] count + input wire [N-1:0] valids, + output reg [$clog2(N):0] count ); integer i; diff --git a/hw/rtl/libs/VX_divide.v b/hw/rtl/libs/VX_divide.v index 510abf26..d815dc39 100644 --- a/hw/rtl/libs/VX_divide.v +++ b/hw/rtl/libs/VX_divide.v @@ -3,6 +3,8 @@ module VX_divide #( parameter WIDTHN = 1, parameter WIDTHD = 1, + parameter WIDTHQ = 1, + parameter WIDTHR = 1, parameter NSIGNED = 0, parameter DSIGNED = 0, parameter PIPELINE = 0 @@ -13,18 +15,21 @@ module VX_divide #( input wire [WIDTHN-1:0] numer, input wire [WIDTHD-1:0] denom, - output wire [WIDTHN-1:0] quotient, - output wire [WIDTHD-1:0] remainder + output wire [WIDTHQ-1:0] quotient, + output wire [WIDTHR-1:0] remainder ); `ifdef QUARTUS + wire [WIDTHN-1:0] quotient_unqual; + wire [WIDTHD-1:0] remainder_unqual; + lpm_divide quartus_div ( .clock (clk), .numer (numer), .denom (denom), - .quotient (quotient), - .remain (remainder), + .quotient (quotient_unqual), + .remain (remainder_unqual), .aclr (1'b0), .clken (1'b1) ); @@ -38,6 +43,9 @@ module VX_divide #( quartus_div.lpm_hint = "MAXIMIZE_SPEED=6,LPM_REMAINDERPOSITIVE=FALSE", quartus_div.lpm_pipeline = PIPELINE; + assign quotient = quotient_unqual[WIDTHQ-1:0]; + assign remainder = remainder_unqual[WIDTHR-1:0]; + `else reg [WIDTHN-1:0] quotient_unqual; @@ -47,7 +55,7 @@ module VX_divide #( `ifndef SYNTHESIS // this edge case kills verilator in some cases by causing a division // overflow exception. 
INT_MIN / -1 (on x86) - if (numer == {1'b1, (WIDTHN-1)'(0)} + if (numer == {1'b1, (WIDTHN-1)'(1'b0)} && denom == {WIDTHD{1'b1}}) begin quotient_unqual = 0; remainder_unqual = 0; @@ -74,8 +82,8 @@ module VX_divide #( end if (PIPELINE == 0) begin - assign quotient = quotient_unqual; - assign remainder = remainder_unqual; + assign quotient = quotient_unqual[WIDTHQ-1:0]; + assign remainder = remainder_unqual[WIDTHR-1:0]; end else begin reg [WIDTHN-1:0] quotient_pipe [0:PIPELINE-1]; reg [WIDTHD-1:0] remainder_pipe [0:PIPELINE-1]; @@ -99,8 +107,8 @@ module VX_divide #( end end - assign quotient = quotient_pipe[PIPELINE-1]; - assign remainder = remainder_pipe[PIPELINE-1]; + assign quotient = quotient_pipe[PIPELINE-1][WIDTHQ-1:0]; + assign remainder = remainder_pipe[PIPELINE-1][WIDTHR-1:0]; end `endif diff --git a/hw/rtl/libs/VX_generic_register.v b/hw/rtl/libs/VX_generic_register.v index 7234b0a2..b0328372 100644 --- a/hw/rtl/libs/VX_generic_register.v +++ b/hw/rtl/libs/VX_generic_register.v @@ -11,18 +11,25 @@ module VX_generic_register #( input wire[N-1:0] in, output wire[N-1:0] out ); - reg [(N-1):0] value; + if (PASSTHRU) begin + `UNUSED_VAR (clk) + `UNUSED_VAR (reset) + `UNUSED_VAR (stall) + assign out = flush ? N'(0) : in; + end else begin + reg [(N-1):0] value; - always @(posedge clk) begin - if (reset) begin - value <= 0; - end else if (flush) begin - value <= 0; - end else if (~stall) begin - value <= in; + always @(posedge clk) begin + if (reset) begin + value <= N'(0); + end else if (~stall) begin + value <= in; + end else if (flush) begin + value <= N'(0); + end end - end - assign out = PASSTHRU ? 
in : value; + assign out = value; + end endmodule \ No newline at end of file diff --git a/hw/rtl/libs/VX_indexable_queue.v b/hw/rtl/libs/VX_index_queue.v similarity index 98% rename from hw/rtl/libs/VX_indexable_queue.v rename to hw/rtl/libs/VX_index_queue.v index 886b05f6..ce91845c 100644 --- a/hw/rtl/libs/VX_indexable_queue.v +++ b/hw/rtl/libs/VX_index_queue.v @@ -1,6 +1,6 @@ `include "VX_define.vh" -module VX_indexable_queue #( +module VX_index_queue #( parameter DATAW = 1, parameter SIZE = 1 ) ( diff --git a/hw/rtl/libs/VX_matrix_arbiter.v b/hw/rtl/libs/VX_matrix_arbiter.v index 8c467974..97b05cd7 100644 --- a/hw/rtl/libs/VX_matrix_arbiter.v +++ b/hw/rtl/libs/VX_matrix_arbiter.v @@ -56,7 +56,7 @@ module VX_matrix_arbiter #( end end - VX_encoder_onehot #( + VX_onehot_encoder #( .N(N) ) encoder ( .onehot (grant_onehot), diff --git a/hw/rtl/libs/VX_encoder_onehot.v b/hw/rtl/libs/VX_onehot_encooder.v similarity index 94% rename from hw/rtl/libs/VX_encoder_onehot.v rename to hw/rtl/libs/VX_onehot_encooder.v index a4e0e72e..06a25033 100644 --- a/hw/rtl/libs/VX_encoder_onehot.v +++ b/hw/rtl/libs/VX_onehot_encooder.v @@ -1,6 +1,6 @@ `include "VX_define.vh" -module VX_encoder_onehot #( +module VX_onehot_encoder #( parameter N = 6 ) ( input wire [N-1:0] onehot, diff --git a/hw/simulate/Makefile b/hw/simulate/Makefile index 7b665714..a3f119a4 100644 --- a/hw/simulate/Makefile +++ b/hw/simulate/Makefile @@ -1,6 +1,8 @@ +SINGLECORE += -DNUM_CLUSTERS=1 -DNUM_CORES=1 + #MULTICORE += -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1 #MULTICORE += -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1 -MULTICORE += -DNUM_CLUSTERS=1 -DNUM_CORES=2 +MULTICORE += -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 # control RTL debug print states DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_ICACHE @@ -12,7 +14,7 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE -#DBG_FLAGS += $(DBG_PRINT_FLAGS) +DBG_FLAGS += $(DBG_PRINT_FLAGS) DBG_FLAGS += 
-DDBG_CORE_REQ_INFO INCLUDE = -I../rtl/ -I../rtl/libs -I../rtl/interfaces -I../rtl/cache -I../rtl/simulate @@ -34,13 +36,13 @@ DBG += -DDBG_CORE_REQ_INFO THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))') gen-s: - verilator $(VF) -DNDEBUG -cc Vortex.v -CFLAGS '$(CF) -DNDEBUG' + verilator $(VF) -DNDEBUG -cc Vortex.v $(SINGLECORE) -CFLAGS '$(CF) -DNDEBUG $(SINGLECORE)' gen-sd: - verilator $(VF) -cc Vortex.v -CFLAGS '$(CF) -g -O0 $(DBG)' --trace $(DBG) + verilator $(VF) -cc Vortex.v $(SINGLECORE) -CFLAGS '$(CF) -g -O0 $(DBG) $(SINGLECORE)' --trace $(DBG) gen-st: - verilator $(VF) -DNDEBUG -cc Vortex.v -CFLAGS '$(CF) -DNDEBUG -O2' --threads $(THREADS) + verilator $(VF) -DNDEBUG -cc Vortex.v $(SINGLECORE) -CFLAGS '$(CF) -DNDEBUG -O2 $(SINGLECORE)' --threads $(THREADS) gen-m: verilator $(VF) -DNDEBUG -cc Vortex.v $(MULTICORE) -CFLAGS '$(CF) -DNDEBUG $(MULTICORE)' diff --git a/hw/simulate/simulator.cpp b/hw/simulate/simulator.cpp index 2976a43c..d3c164b1 100644 --- a/hw/simulate/simulator.cpp +++ b/hw/simulate/simulator.cpp @@ -13,6 +13,9 @@ Simulator::Simulator() { // force random values for unitialized signals Verilated::randReset(2); + // Turn off assertion before reset + Verilated::assertOn(false); + ram_ = nullptr; vortex_ = new VVortex(); @@ -49,6 +52,9 @@ void Simulator::reset() { vortex_->reset = 0; dram_rsp_vec_.clear(); + + // Turn on assertion after reset + Verilated::assertOn(true); } void Simulator::step() { @@ -60,6 +66,7 @@ void Simulator::step() { this->eval_dram_bus(); this->eval_io_bus(); + this->eval_csr_bus(); this->eval_snp_bus(); } @@ -157,6 +164,15 @@ void Simulator::eval_io_bus() { vortex_->io_rsp_valid = 0; } +void Simulator::eval_csr_bus() { + vortex_->csr_io_req_valid = 0; + vortex_->csr_io_req_coreid = 0; + vortex_->csr_io_req_addr = 0; + vortex_->csr_io_req_rw = 0; + vortex_->csr_io_req_data = 0; + vortex_->csr_io_rsp_ready = 1; +} + void Simulator::eval_snp_bus() { if (snp_req_active_) { if 
(vortex_->snp_rsp_valid) { @@ -241,19 +257,18 @@ bool Simulator::run() { // check riscv-tests PASSED/FAILED status #if (NUM_CLUSTERS == 1 && NUM_CORES == 1) - int status = (int)vortex_->Vortex->genblk1__DOT__cluster->genblk1__BRA__0__KET____DOT__core->pipeline->back_end->writeback->last_data_wb & 0xf; + int status = (int)vortex_->Vortex->genblk1__DOT__cluster->genblk1__BRA__0__KET____DOT__core->pipeline->writeback->last_data_wb & 0xf; #else #if (NUM_CLUSTERS == 1) - int status = (int)vortex_->Vortex->genblk1__DOT__cluster->genblk1__BRA__0__KET____DOT__core->pipeline->back_end->writeback->last_data_wb & 0xf; + int status = (int)vortex_->Vortex->genblk1__DOT__cluster->genblk1__BRA__0__KET____DOT__core->pipeline->writeback->last_data_wb & 0xf; #else - int status = (int)vortex_->Vortex->genblk2__DOT__genblk1__BRA__0__KET____DOT__cluster->genblk1__BRA__0__KET____DOT__core->pipeline->back_end->writeback->last_data_wb & 0xf; + int status = (int)vortex_->Vortex->genblk2__DOT__genblk1__BRA__0__KET____DOT__cluster->genblk1__BRA__0__KET____DOT__core->pipeline->writeback->last_data_wb & 0xf; #endif #endif return (status == 1); } - void Simulator::load_bin(const char* program_file) { if (ram_ == nullptr) return; diff --git a/hw/simulate/simulator.h b/hw/simulate/simulator.h index 27803754..297e2121 100644 --- a/hw/simulate/simulator.h +++ b/hw/simulate/simulator.h @@ -52,6 +52,7 @@ private: void eval_dram_bus(); void eval_io_bus(); + void eval_csr_bus(); void eval_snp_bus(); std::vector dram_rsp_vec_; diff --git a/hw/simulate/testbench.cpp b/hw/simulate/testbench.cpp index d037fc18..8515f1fd 100644 --- a/hw/simulate/testbench.cpp +++ b/hw/simulate/testbench.cpp @@ -5,12 +5,12 @@ int main(int argc, char **argv) { -//#define ALL_TESTS +#define ALL_TESTS #ifdef ALL_TESTS bool passed = true; std::string tests[] = { - "../../../benchmarks/riscv_tests/rv32ui-p-add.hex", + "../../../benchmarks/riscv_tests/rv32ui-p-add.hex", "../../../benchmarks/riscv_tests/rv32ui-p-addi.hex", 
"../../../benchmarks/riscv_tests/rv32ui-p-and.hex", "../../../benchmarks/riscv_tests/rv32ui-p-andi.hex", @@ -67,12 +67,14 @@ int main(int argc, char **argv) Simulator simulator; simulator.attach_ram(&ram); simulator.load_ihex(test.c_str()); - bool curr = simulator.run(); + bool status = simulator.run(); - if (curr) std::cerr << GREEN << "Test Passed: " << test << std::endl; - if (!curr) std::cerr << RED << "Test Failed: " << test << std::endl; + if (status) std::cerr << GREEN << "Test Passed: " << test << std::endl; + if (!status) std::cerr << RED << "Test Failed: " << test << std::endl; std::cerr << DEFAULT; - passed = passed && curr; + passed = passed && status; + if (!passed) + break; } std::cerr << DEFAULT << "\n***************************************\n"; @@ -95,12 +97,12 @@ int main(int argc, char **argv) Simulator simulator; simulator.attach_ram(&ram); simulator.load_ihex(test); - bool curr = simulator.run(); + bool status = simulator.run(); - if (curr) std::cerr << GREEN << "Test Passed: " << test << std::endl; - if (!curr) std::cerr << RED << "Test Failed: " << test << std::endl; + if (status) std::cerr << GREEN << "Test Passed: " << test << std::endl; + if (!status) std::cerr << RED << "Test Failed: " << test << std::endl; - return !curr; + return !status; #endif } \ No newline at end of file diff --git a/hw/unit_tests/VX_divide_tb.v b/hw/unit_tests/VX_divide_tb.v index 02e63e1f..1b14f526 100644 --- a/hw/unit_tests/VX_divide_tb.v +++ b/hw/unit_tests/VX_divide_tb.v @@ -2,13 +2,13 @@ module VX_tb_divide(); - `ifdef TRACE +`ifdef TRACE initial begin $dumpfile("trace.vcd"); $dumpvars(0,test); end - `endif +`endif reg clk; reg rst; @@ -23,6 +23,8 @@ module VX_tb_divide(); VX_divide#( .WIDTHN(32), .WIDTHD(32), + .WIDTHQ(32), + .WIDTHR(32), .PIPELINE(i) ) div( .clock(clk), @@ -157,4 +159,4 @@ module VX_tb_divide(); always #1 clk = !clk; -endmodule: VX_tb_divide \ No newline at end of file +endmodule \ No newline at end of file