diff --git a/rtl/VX_back_end.v b/rtl/VX_back_end.v index f835e132..61027635 100644 --- a/rtl/VX_back_end.v +++ b/rtl/VX_back_end.v @@ -52,12 +52,15 @@ VX_inst_mem_wb_inter VX_mem_wb(); VX_exec_unit_req_inter VX_exec_unit_req(); VX_inst_exec_wb_inter VX_inst_exec_wb(); + +// GPU unit input +VX_gpu_inst_req_inter VX_gpu_inst_req(); + VX_gpr_stage VX_gpr_stage( .clk (clk), .schedule_delay (schedule_delay), .VX_writeback_inter(VX_writeback_inter), .VX_bckE_req (VX_bckE_req), - .VX_warp_ctl (VX_warp_ctl), .VX_bckE_req_out (VX_bckE_req_out), .VX_gpr_data (VX_gpr_data) ); @@ -67,7 +70,8 @@ VX_inst_multiplex VX_inst_mult( .VX_bckE_req (VX_bckE_req_out), .VX_gpr_data (VX_gpr_data), .VX_exec_unit_req(VX_exec_unit_req), - .VX_lsu_req (VX_lsu_req) + .VX_lsu_req (VX_lsu_req), + .VX_gpu_inst_req (VX_gpu_inst_req) ); @@ -94,6 +98,12 @@ VX_execute_unit VX_execUnit( .out_csr_result (VX_csr_w_req.csr_result) ); + +VX_gpgpu_inst VX_gpgpu_inst( + .VX_gpu_inst_req(VX_gpu_inst_req), + .VX_warp_ctl (VX_warp_ctl) + ); + VX_writeback VX_wb( .VX_mem_wb (VX_mem_wb), .VX_inst_exec_wb (VX_inst_exec_wb), diff --git a/rtl/VX_decode.v b/rtl/VX_decode.v index 69497ff1..2dec804f 100644 --- a/rtl/VX_decode.v +++ b/rtl/VX_decode.v @@ -36,10 +36,11 @@ module VX_decode( wire is_e_inst; wire is_gpgpu; - // wire is_clone; - wire is_jalrs; - wire is_jmprt; wire is_wspawn; + wire is_tmc; + wire is_split; + wire is_join; + wire is_barrier; wire[2:0] func3; wire[6:0] func7; @@ -110,38 +111,26 @@ module VX_decode( assign is_e_inst = (curr_opcode == `SYS_INST) && (func3 == 0); assign is_gpgpu = (curr_opcode == `GPGPU_INST); - // assign is_clone = is_gpgpu && (func3 == 5); - assign is_jalrs = is_gpgpu && (func3 == 6); - assign is_jmprt = is_gpgpu && (func3 == 4); - assign is_wspawn = is_gpgpu && (func3 == 0); + + assign is_tmc = is_gpgpu && (func3 == 0); // Goes to BE + assign is_wspawn = is_gpgpu && (func3 == 1); // Goes to BE + assign is_barrier = is_gpgpu && (func3 == 4); // Goes to BE + assign is_split = 
is_gpgpu && (func3 == 2); // Goes to BE + assign is_join = is_gpgpu && (func3 == 3); // Doesn't go to BE + + assign VX_frE_to_bckE_req.is_wspawn = is_wspawn; + assign VX_frE_to_bckE_req.is_tmc = is_tmc; + assign VX_frE_to_bckE_req.is_split = is_split; + assign VX_frE_to_bckE_req.is_barrier = is_barrier; + assign VX_frE_to_bckE_req.csr_immed = is_csr_immed; - assign VX_frE_to_bckE_req.wspawn = is_wspawn; - // wire[`NT_M1:0] jalrs_thread_mask = 0; - // wire[`NT_M1:0] jmprt_thread_mask; - // genvar tm_i; - // generate - // for (tm_i = 0; tm_i < `NT; tm_i = tm_i + 1) begin - // assign jalrs_thread_mask[tm_i] = $signed(tm_i) <= $signed(VX_frE_to_bckE_req.b_reg_data[0]); - // end - // endgenerate - - - // genvar tm_ji; - // generate - // assign jmprt_thread_mask[0] = 1; - // for (tm_ji = 1; tm_ji < `NT; tm_ji = tm_ji + 1) begin - // assign jmprt_thread_mask[tm_ji] = 0; - // end - // endgenerate - - - assign VX_frE_to_bckE_req.wb = (is_jal || is_jalr || is_jalrs || is_e_inst) ? `WB_JAL : + assign VX_frE_to_bckE_req.wb = (is_jal || is_jalr || is_e_inst) ? `WB_JAL : is_linst ? `WB_MEM : (is_itype || is_rtype || is_lui || is_auipc || is_csr) ? 
`WB_ALU : `NO_WB; @@ -200,14 +189,6 @@ module VX_decode( temp_jal = 1'b1 && in_valid[0]; temp_jal_offset = jal_2_offset; end - `GPGPU_INST: - begin - if (is_jalrs || is_jmprt) - begin - temp_jal = 1'b1 && in_valid[0]; - temp_jal_offset = 32'h0; - end - end `SYS_INST: begin // $display("SYS EBREAK %h", (jal_sys_jal && in_valid[0]) ); @@ -293,14 +274,6 @@ module VX_decode( temp_branch_type = `NO_BRANCH; temp_branch_stall = 1'b1 && in_valid[0]; end - `GPGPU_INST: - begin - if (is_jalrs || is_jmprt) - begin - temp_branch_type = `NO_BRANCH; - temp_branch_stall = 1'b1 && in_valid[0]; - end - end default: begin temp_branch_type = `NO_BRANCH; @@ -311,7 +284,7 @@ module VX_decode( assign VX_frE_to_bckE_req.branch_type = temp_branch_type; - assign VX_wstall.wstall = temp_branch_stall && in_valid[0]; + assign VX_wstall.wstall = (temp_branch_stall || is_tmc || is_split || is_join || is_barrier) && (|in_valid); assign VX_wstall.warp_num = in_warp_num; always @(*) begin diff --git a/rtl/VX_gpgpu_inst.v b/rtl/VX_gpgpu_inst.v new file mode 100644 index 00000000..e7bc2d4b --- /dev/null +++ b/rtl/VX_gpgpu_inst.v @@ -0,0 +1,35 @@ +// Drives VX_warp_ctl from a GPGPU instruction request: a TMC request supplies a new thread mask; the wspawn outputs are tied to 0 +module VX_gpgpu_inst ( + // Input + VX_gpu_inst_req_inter VX_gpu_inst_req, + + // Output + VX_warp_ctl_inter VX_warp_ctl +); + + + // tmc_new_mask[t] is set when index t is below a_reg_data[0] + wire[`NT_M1:0] tmc_new_mask; + genvar curr_t; + for (curr_t = 0; curr_t < `NT; curr_t=curr_t+1) + begin + assign tmc_new_mask[curr_t] = curr_t < VX_gpu_inst_req.a_reg_data[0]; + end + + + assign VX_warp_ctl.warp_num = VX_gpu_inst_req.warp_num; + assign VX_warp_ctl.change_mask = (VX_gpu_inst_req.is_tmc || VX_gpu_inst_req.is_split) && (|VX_gpu_inst_req.valid); + assign VX_warp_ctl.thread_mask = VX_gpu_inst_req.is_tmc ? 
tmc_new_mask : 0; + + // Qualified by a valid TMC request, like change_mask, so a zero in a_reg_data[0] on any other cycle cannot raise ebreak + assign VX_warp_ctl.ebreak = VX_gpu_inst_req.is_tmc && (|VX_gpu_inst_req.valid) && (VX_gpu_inst_req.a_reg_data[0] == 0); + + assign VX_warp_ctl.wspawn = 0; + assign VX_warp_ctl.wspawn_pc = 0; + + + // VX_gpu_inst_req.is_wspawn + // VX_gpu_inst_req.is_split + // VX_gpu_inst_req.is_barrier + +endmodule \ No newline at end of file diff --git a/rtl/VX_gpr_stage.v b/rtl/VX_gpr_stage.v index eef624b4..188ffd48 100644 --- a/rtl/VX_gpr_stage.v +++ b/rtl/VX_gpr_stage.v @@ -11,8 +11,6 @@ module VX_gpr_stage ( // Outputs - // Warp Control - VX_warp_ctl_inter VX_warp_ctl, // Original Request 1 cycle later VX_frE_to_bckE_req_inter VX_bckE_req_out, // Data Read @@ -75,22 +73,4 @@ module VX_gpr_stage ( .VX_bckE_req (VX_bckE_req_out) ); - - // assign VX_warp_ctl.warp_num = VX_bckE_req_out.warp_num; - // assign VX_warp_ctl.wspawn = VX_bckE_req_out.wspawn; - // assign VX_warp_ctl.wspawn_pc = VX_bckE_req_out.a_reg_data[0]; - - // assign VX_warp_ctl.thread_mask = is_jalrs ? jalrs_thread_mask : jmprt_thread_mask; - // assign VX_warp_ctl.change_mask = is_jalrs || is_jmprt; - // assign VX_warp_ctl.ebreak = VX_bckE_req_out.ebreak; - - - assign VX_warp_ctl.warp_num = 0; - assign VX_warp_ctl.wspawn = 0; - assign VX_warp_ctl.wspawn_pc = 0; - - assign VX_warp_ctl.thread_mask = 0; - assign VX_warp_ctl.change_mask = 0; - assign VX_warp_ctl.ebreak = 0; - endmodule \ No newline at end of file diff --git a/rtl/VX_inst_multiplex.v b/rtl/VX_inst_multiplex.v index 4909ef21..2dbac6d9 100644 --- a/rtl/VX_inst_multiplex.v +++ b/rtl/VX_inst_multiplex.v @@ -5,16 +5,23 @@ module VX_inst_multiplex ( // Outputs VX_exec_unit_req_inter VX_exec_unit_req, - VX_lsu_req_inter VX_lsu_req + VX_lsu_req_inter VX_lsu_req, + VX_gpu_inst_req_inter VX_gpu_inst_req ); wire[`NT_M1:0] is_mem_mask; - wire is_mem = (VX_bckE_req.mem_write != `NO_MEM_WRITE) || (VX_bckE_req.mem_read != `NO_MEM_READ); + wire[`NT_M1:0] is_gpu_mask; + wire is_mem = (VX_bckE_req.mem_write != `NO_MEM_WRITE) || (VX_bckE_req.mem_read != `NO_MEM_READ); + // wire is_gpu = 
(VX_bckE_req.is_wspawn || VX_bckE_req.is_tmc || VX_bckE_req.is_barrier || VX_bckE_req.is_split); + wire is_gpu = 0; genvar currT; - for (currT = 0; currT < `NT; currT = currT + 1) assign is_mem_mask[currT] = is_mem; + for (currT = 0; currT < `NT; currT = currT + 1) begin + assign is_mem_mask[currT] = is_mem; + assign is_gpu_mask[currT] = is_gpu; + end // LSU Unit assign VX_lsu_req.valid = VX_bckE_req.valid & is_mem_mask; @@ -31,7 +38,7 @@ module VX_inst_multiplex ( // Execute Unit - assign VX_exec_unit_req.valid = VX_bckE_req.valid & (~is_mem_mask); + assign VX_exec_unit_req.valid = VX_bckE_req.valid & (~is_mem_mask & ~is_gpu_mask); assign VX_exec_unit_req.warp_num = VX_bckE_req.warp_num; assign VX_exec_unit_req.curr_PC = VX_bckE_req.curr_PC; assign VX_exec_unit_req.PC_next = VX_bckE_req.PC_next; @@ -49,7 +56,6 @@ module VX_inst_multiplex ( assign VX_exec_unit_req.jalQual = VX_bckE_req.jalQual; assign VX_exec_unit_req.jal = VX_bckE_req.jal; assign VX_exec_unit_req.jal_offset = VX_bckE_req.jal_offset; - assign VX_exec_unit_req.wspawn = VX_bckE_req.wspawn; assign VX_exec_unit_req.ebreak = VX_bckE_req.ebreak; assign VX_exec_unit_req.is_csr = VX_bckE_req.is_csr; assign VX_exec_unit_req.csr_address = VX_bckE_req.csr_address; @@ -57,4 +63,18 @@ module VX_inst_multiplex ( assign VX_exec_unit_req.csr_mask = VX_bckE_req.csr_mask; -endmodule \ No newline at end of file + // GPR Req + assign VX_gpu_inst_req.valid = VX_bckE_req.valid & is_gpu_mask; + assign VX_gpu_inst_req.warp_num = VX_bckE_req.warp_num; + assign VX_gpu_inst_req.is_wspawn = VX_bckE_req.is_wspawn; + assign VX_gpu_inst_req.is_tmc = VX_bckE_req.is_tmc; + assign VX_gpu_inst_req.is_split = VX_bckE_req.is_split; + assign VX_gpu_inst_req.is_barrier = VX_bckE_req.is_barrier; + assign VX_gpu_inst_req.a_reg_data = VX_gpr_data.a_reg_data; + assign VX_gpu_inst_req.rd2 = VX_gpr_data.b_reg_data[0]; + +endmodule + + + + diff --git a/rtl/interfaces/VX_frE_to_bckE_req_inter.v b/rtl/interfaces/VX_frE_to_bckE_req_inter.v index 
3664d903..610d3525 100644 --- a/rtl/interfaces/VX_frE_to_bckE_req_inter.v +++ b/rtl/interfaces/VX_frE_to_bckE_req_inter.v @@ -25,7 +25,6 @@ interface VX_frE_to_bckE_req_inter (); wire[31:0] curr_PC; /* verilator lint_off UNUSED */ wire ebreak; - wire wspawn; /* verilator lint_on UNUSED */ wire jalQual; wire jal; @@ -34,6 +33,12 @@ interface VX_frE_to_bckE_req_inter (); wire[`NT_M1:0] valid; wire[`NW_M1:0] warp_num; + // GPGPU stuff + wire is_wspawn; + wire is_tmc; + wire is_split; + wire is_barrier; + endinterface diff --git a/rtl/interfaces/VX_gpu_inst_req_inter.v b/rtl/interfaces/VX_gpu_inst_req_inter.v new file mode 100644 index 00000000..01793b2b --- /dev/null +++ b/rtl/interfaces/VX_gpu_inst_req_inter.v @@ -0,0 +1,22 @@ +`include "../VX_define.v" + +`ifndef VX_GPU_INST_REQ_IN +`define VX_GPU_INST_REQ_IN + +// Bundle of wires describing one GPGPU instruction request +interface VX_gpu_inst_req_inter(); + + // Request valid bits and warp number + wire[`NT_M1:0] valid; + wire[`NW_M1:0] warp_num; + + // One flag per GPGPU instruction kind + wire is_wspawn, is_tmc, is_split, is_barrier; + + // Register operand data + wire[`NT_M1:0][31:0] a_reg_data; + wire[31:0] rd2; + +endinterface + +`endif \ No newline at end of file diff --git a/rtl/interfaces/VX_warp_ctl_inter.v b/rtl/interfaces/VX_warp_ctl_inter.v index fc1931a7..9908384e 100644 --- a/rtl/interfaces/VX_warp_ctl_inter.v +++ b/rtl/interfaces/VX_warp_ctl_inter.v @@ -10,10 +10,11 @@ interface VX_warp_ctl_inter (); wire[`NW_M1:0] warp_num; wire change_mask; wire[`NT_M1:0] thread_mask; + wire wspawn; wire[31:0] wspawn_pc; - wire ebreak; + wire ebreak; endinterface diff --git a/rtl/pipe_regs/VX_d_e_reg.v b/rtl/pipe_regs/VX_d_e_reg.v index b5d92649..64b69834 100644 --- a/rtl/pipe_regs/VX_d_e_reg.v +++ b/rtl/pipe_regs/VX_d_e_reg.v @@ -18,14 +18,14 @@ module VX_d_e_reg ( wire flush = (in_branch_stall == `STALL); - VX_generic_register #(.N(237)) d_e_reg + VX_generic_register #(.N(240)) d_e_reg ( .clk (clk), .reset(reset), .stall(stall), .flush(flush), - .in ({VX_frE_to_bckE_req.csr_address, VX_frE_to_bckE_req.jalQual, VX_frE_to_bckE_req.ebreak, 
VX_frE_to_bckE_req.wspawn, VX_frE_to_bckE_req.is_csr, VX_frE_to_bckE_req.csr_immed, VX_frE_to_bckE_req.csr_mask, VX_frE_to_bckE_req.rd, VX_frE_to_bckE_req.rs1, VX_frE_to_bckE_req.rs2, VX_frE_to_bckE_req.alu_op, VX_frE_to_bckE_req.wb, VX_frE_to_bckE_req.rs2_src, VX_frE_to_bckE_req.itype_immed, VX_frE_to_bckE_req.mem_read, VX_frE_to_bckE_req.mem_write, VX_frE_to_bckE_req.branch_type, VX_frE_to_bckE_req.upper_immed, VX_frE_to_bckE_req.curr_PC, VX_frE_to_bckE_req.jal, VX_frE_to_bckE_req.jal_offset, VX_frE_to_bckE_req.PC_next, VX_frE_to_bckE_req.valid, VX_frE_to_bckE_req.warp_num}), - .out ({VX_bckE_req.csr_address , VX_bckE_req.jalQual , VX_bckE_req.ebreak , VX_bckE_req.wspawn ,VX_bckE_req.is_csr , VX_bckE_req.csr_immed , VX_bckE_req.csr_mask , VX_bckE_req.rd , VX_bckE_req.rs1 , VX_bckE_req.rs2 , VX_bckE_req.alu_op , VX_bckE_req.wb , VX_bckE_req.rs2_src , VX_bckE_req.itype_immed , VX_bckE_req.mem_read , VX_bckE_req.mem_write , VX_bckE_req.branch_type , VX_bckE_req.upper_immed , VX_bckE_req.curr_PC , VX_bckE_req.jal , VX_bckE_req.jal_offset , VX_bckE_req.PC_next , VX_bckE_req.valid , VX_bckE_req.warp_num}) + .in ({VX_frE_to_bckE_req.csr_address, VX_frE_to_bckE_req.jalQual, VX_frE_to_bckE_req.ebreak, VX_frE_to_bckE_req.is_csr, VX_frE_to_bckE_req.csr_immed, VX_frE_to_bckE_req.csr_mask, VX_frE_to_bckE_req.rd, VX_frE_to_bckE_req.rs1, VX_frE_to_bckE_req.rs2, VX_frE_to_bckE_req.alu_op, VX_frE_to_bckE_req.wb, VX_frE_to_bckE_req.rs2_src, VX_frE_to_bckE_req.itype_immed, VX_frE_to_bckE_req.mem_read, VX_frE_to_bckE_req.mem_write, VX_frE_to_bckE_req.branch_type, VX_frE_to_bckE_req.upper_immed, VX_frE_to_bckE_req.curr_PC, VX_frE_to_bckE_req.jal, VX_frE_to_bckE_req.jal_offset, VX_frE_to_bckE_req.PC_next, VX_frE_to_bckE_req.valid, VX_frE_to_bckE_req.warp_num, VX_frE_to_bckE_req.is_wspawn, VX_frE_to_bckE_req.is_tmc, VX_frE_to_bckE_req.is_split, VX_frE_to_bckE_req.is_barrier}), + .out ({VX_bckE_req.csr_address , VX_bckE_req.jalQual , VX_bckE_req.ebreak ,VX_bckE_req.is_csr , 
VX_bckE_req.csr_immed , VX_bckE_req.csr_mask , VX_bckE_req.rd , VX_bckE_req.rs1 , VX_bckE_req.rs2 , VX_bckE_req.alu_op , VX_bckE_req.wb , VX_bckE_req.rs2_src , VX_bckE_req.itype_immed , VX_bckE_req.mem_read , VX_bckE_req.mem_write , VX_bckE_req.branch_type , VX_bckE_req.upper_immed , VX_bckE_req.curr_PC , VX_bckE_req.jal , VX_bckE_req.jal_offset , VX_bckE_req.PC_next , VX_bckE_req.valid , VX_bckE_req.warp_num , VX_bckE_req.is_wspawn , VX_bckE_req.is_tmc , VX_bckE_req.is_split , VX_bckE_req.is_barrier }) ); diff --git a/rtl/results.txt b/rtl/results.txt index fd1e7156..e69de29b 100644 --- a/rtl/results.txt +++ b/rtl/results.txt @@ -1,7 +0,0 @@ -# Dynamic Instructions: 52683 -# of total cycles: 52699 -# of forwarding stalls: 0 -# of branch stalls: 0 -# CPI: 1.0003 -# time to simulate: 0 milliseconds -# GRADE: Failed on test: 4294967295 diff --git a/syn/syn.tcl b/syn/syn.tcl index 8d432e5a..bbd4ef0f 100755 --- a/syn/syn.tcl +++ b/syn/syn.tcl @@ -3,7 +3,7 @@ set link_library [concat * sc12mc_cln28hpm_base_ulvt_c35_ssg_typical_max_0p81v_ set symbol_library {} set target_library [concat sc12mc_cln28hpm_base_ulvt_c35_ssg_typical_max_0p81v_m40c.db] -set verilog_files [ list VX_wstall_inter.v VX_inst_exec_wb_inter.v VX_lsu.v VX_execute_unit.v VX_lsu_addr_gen.v VX_inst_multiplex.v VX_exec_unit_req_inter.v VX_lsu_req_inter.v VX_alu.v VX_back_end.v VX_gpr_stage.v VX_gpr_data_inter.v VX_csr_handler.v VX_decode.v VX_define.v VX_scheduler.v VX_fetch.v VX_front_end.v VX_generic_register.v VX_gpr.v VX_gpr_wrapper.v VX_one_counter.v VX_priority_encoder.v VX_warp.v VX_warp_scheduler.v VX_writeback.v Vortex.v byte_enabled_simple_dual_port_ram.v VX_branch_response_inter.v VX_csr_write_request_inter.v VX_dcache_request_inter.v VX_dcache_response_inter.v VX_frE_to_bckE_req_inter.v VX_gpr_clone_inter.v VX_gpr_jal_inter.v VX_gpr_read_inter.v VX_gpr_wspawn_inter.v VX_icache_request_inter.v VX_icache_response_inter.v VX_inst_mem_wb_inter.v VX_inst_meta_inter.v VX_jal_response_inter.v 
VX_mem_req_inter.v VX_mw_wb_inter.v VX_warp_ctl_inter.v VX_wb_inter.v VX_d_e_reg.v VX_f_d_reg.v \ +set verilog_files [ list VX_gpgpu_inst.v VX_gpu_inst_req_inter.v VX_wstall_inter.v VX_inst_exec_wb_inter.v VX_lsu.v VX_execute_unit.v VX_lsu_addr_gen.v VX_inst_multiplex.v VX_exec_unit_req_inter.v VX_lsu_req_inter.v VX_alu.v VX_back_end.v VX_gpr_stage.v VX_gpr_data_inter.v VX_csr_handler.v VX_decode.v VX_define.v VX_scheduler.v VX_fetch.v VX_front_end.v VX_generic_register.v VX_gpr.v VX_gpr_wrapper.v VX_one_counter.v VX_priority_encoder.v VX_warp.v VX_warp_scheduler.v VX_writeback.v Vortex.v byte_enabled_simple_dual_port_ram.v VX_branch_response_inter.v VX_csr_write_request_inter.v VX_dcache_request_inter.v VX_dcache_response_inter.v VX_frE_to_bckE_req_inter.v VX_gpr_clone_inter.v VX_gpr_jal_inter.v VX_gpr_read_inter.v VX_gpr_wspawn_inter.v VX_icache_request_inter.v VX_icache_response_inter.v VX_inst_mem_wb_inter.v VX_inst_meta_inter.v VX_jal_response_inter.v VX_mem_req_inter.v VX_mw_wb_inter.v VX_warp_ctl_inter.v VX_wb_inter.v VX_d_e_reg.v VX_f_d_reg.v \ ] analyze -format sverilog $verilog_files