Cleanup before integration

This commit is contained in:
felsabbagh3
2019-10-22 03:03:17 -04:00
parent b3f464dd89
commit 1bfafca896
11 changed files with 171 additions and 81 deletions

View File

@@ -24,7 +24,7 @@ MAKECPP=(cd obj_dir && make -j -f VVortex.mk)
# -LDFLAGS '-lsystemc'
VERILATOR:
echo "#define VCD_OFF" > simulate/tb_debug.h
verilator $(COMP) -cc $(FILE) $(INCLUDE) $(EXE) $(LIB) $(CF) $(LIGHTW)
verilator $(COMP) -cc $(FILE) $(INCLUDE) $(EXE) $(LIB) $(CF)
VERILATORnoWarnings:
echo "#define VCD_OFF" > simulate/tb_debug.h

View File

@@ -4,7 +4,7 @@ module VX_back_end (
input wire schedule_delay,
output wire out_mem_delay,
output wire gpr_stage_delay,
VX_jal_response_inter VX_jal_rsp,
VX_branch_response_inter VX_branch_rsp,
@@ -37,9 +37,6 @@ VX_mem_req_inter VX_exe_mem_req();
VX_mem_req_inter VX_mem_req();
VX_gpr_data_inter VX_gpr_data();
VX_frE_to_bckE_req_inter VX_bckE_req_out();
// LSU input + output
VX_lsu_req_inter VX_lsu_req();
@@ -63,18 +60,14 @@ VX_gpr_stage VX_gpr_stage(
.schedule_delay (schedule_delay),
.VX_writeback_inter(VX_writeback_temp),
.VX_bckE_req (VX_bckE_req),
.VX_bckE_req_out (VX_bckE_req_out),
.VX_gpr_data (VX_gpr_data)
);
VX_inst_multiplex VX_inst_mult(
.VX_bckE_req (VX_bckE_req_out),
.VX_gpr_data (VX_gpr_data),
// New
.VX_exec_unit_req(VX_exec_unit_req),
.VX_lsu_req (VX_lsu_req),
.VX_gpu_inst_req (VX_gpu_inst_req),
.VX_csr_req (VX_csr_req)
.VX_csr_req (VX_csr_req),
// End new
.memory_delay (out_mem_delay),
.gpr_stage_delay (gpr_stage_delay)
);

View File

@@ -58,9 +58,9 @@ module VX_execute_unit (
endgenerate
wire [`NW_M1:0] branch_use_index;
wire branch_found_valid;
VX_priority_encoder choose_alu_result(
wire [$clog2(`NT)-1:0] branch_use_index;
wire branch_found_valid;
VX_generic_priority_encoder #(.N(`NT)) choose_alu_result(
.valids(VX_exec_unit_req.valid),
.index (branch_use_index),
.found (branch_found_valid)

View File

@@ -0,0 +1,22 @@
// VX_generic_priority_encoder
//
// Combinational priority encoder over an N-bit request vector.
//
// Parameters:
//   N      - number of bits in valids.
//
// Ports:
//   valids - request bits; the lowest-numbered set bit wins.
//   index  - position of the lowest-numbered set bit of valids
//            (0 when no bit is set).
//   found  - 1 when at least one bit of valids is set, otherwise 0.
//
// NOTE(review): for N = 1, $clog2(N) is 0 and the index range becomes [-1:0];
// confirm that the tools in use accept this if a 1-bit instance is ever needed.
module VX_generic_priority_encoder
#(
    parameter N = 1
)
(
    input  wire[N-1:0]        valids,
    output reg[$clog2(N)-1:0] index,
    output reg                found
);

    integer i;

    always @(*) begin
        // Defaults cover the "no bit set" case and keep the block purely
        // combinational (every output is assigned on every evaluation).
        index = 0;
        found = 0;

        // Scan from the top bit down so the last assignment made, and
        // therefore the final result, belongs to the lowest set bit.
        // The bound must be this module's own width N: using the global
        // warp-count macro here would skip upper bits when N is larger and
        // index out of range when N is smaller.
        for (i = N-1; i >= 0; i = i - 1) begin
            if (valids[i]) begin
                index = i[$clog2(N)-1:0];
                found = 1;
            end
        end
    end

endmodule

View File

@@ -39,7 +39,9 @@ module VX_gpgpu_inst (
assign VX_warp_ctl.is_barrier = VX_gpu_inst_req.is_barrier && valid_inst;
assign VX_warp_ctl.barrier_id = VX_gpu_inst_req.a_reg_data[0];
assign VX_warp_ctl.num_warps = VX_gpu_inst_req.rd2 - 1;
wire[31:0] num_warps_m1 = VX_gpu_inst_req.rd2 - 1;
assign VX_warp_ctl.num_warps = num_warps_m1[$clog2(`NW):0];
assign VX_warp_ctl.wspawn = wspawn;
assign VX_warp_ctl.wspawn_pc = wspawn_pc;
@@ -58,11 +60,7 @@ module VX_gpgpu_inst (
end
wire[`NW_M1:0] num_valids;
VX_one_counter one_counter(
.valids (curr_valids),
.ones_found(num_valids)
);
wire[`NW_M1:0] num_valids = $countones(curr_valids);
assign VX_warp_ctl.is_split = is_split && (num_valids > 1) && (split_new_use_mask != 0) && (split_new_use_mask != {`NT{1'b1}});

View File

@@ -1,7 +1,14 @@
`include "VX_define.v"
module VX_gpr_stage (
input wire clk,
input wire reset,
input wire schedule_delay,
input wire memory_delay,
output wire gpr_stage_delay,
// inputs
// Instruction Information
VX_frE_to_bckE_req_inter VX_bckE_req,
@@ -12,16 +19,20 @@ module VX_gpr_stage (
// Outputs
// Original Request 1 cycle later
VX_frE_to_bckE_req_inter VX_bckE_req_out,
// Data Read
VX_gpr_data_inter VX_gpr_data
VX_exec_unit_req_inter VX_exec_unit_req,
VX_lsu_req_inter VX_lsu_req,
VX_gpu_inst_req_inter VX_gpu_inst_req,
VX_csr_req_inter VX_csr_req
);
wire[31:0] curr_PC = VX_bckE_req.curr_PC;
wire[2:0] branchType = VX_bckE_req.branch_type;
wire is_store = (VX_bckE_req.mem_write != `NO_MEM_WRITE);
wire is_load = (VX_bckE_req.mem_read != `NO_MEM_READ);
wire jalQual = VX_bckE_req.jalQual;
VX_gpr_read_inter VX_gpr_read();
@@ -50,28 +61,93 @@ module VX_gpr_stage (
// assign VX_bckE_req.is_csr = is_csr;
// assign VX_bckE_req_out.csr_mask = (VX_bckE_req.sr_immed == 1'b1) ? {27'h0, VX_bckE_req.rs1} : VX_gpr_data.a_reg_data[0];
wire zero_temp = 0;
// Outputs
VX_exec_unit_req_inter VX_exec_unit_req_temp();
VX_lsu_req_inter VX_lsu_req_temp();
VX_gpu_inst_req_inter VX_gpu_inst_req_temp();
VX_csr_req_inter VX_csr_req_temp();
VX_generic_register #(.N(256)) reg_data
(
.clk (clk),
.reset(reset),
.stall(zero_temp),
.flush(zero_temp),
.in ({VX_gpr_datf.a_reg_data, VX_gpr_datf.b_reg_data}),
.out ({VX_gpr_data.a_reg_data, VX_gpr_data.b_reg_data})
);
wire stall = schedule_delay;
VX_d_e_reg gpr_stage_reg(
.clk (clk),
.reset (reset),
.in_branch_stall (stall),
.in_freeze (zero_temp),
.VX_frE_to_bckE_req(VX_bckE_req),
.VX_bckE_req (VX_bckE_req_out)
VX_inst_multiplex VX_inst_mult(
.VX_bckE_req (VX_bckE_req),
.VX_gpr_data (VX_gpr_datf),
.VX_exec_unit_req(VX_exec_unit_req_temp),
.VX_lsu_req (VX_lsu_req_temp),
.VX_gpu_inst_req (VX_gpu_inst_req_temp),
.VX_csr_req (VX_csr_req_temp)
);
wire is_lsu = (|VX_lsu_req_temp.valid);
wire stall_rest = 0;
wire flush_rest = schedule_delay;
wire stall_lsu = is_lsu && memory_delay;
wire flush_lsu = schedule_delay && !stall_lsu;
assign gpr_stage_delay = stall_lsu;
VX_generic_register #(.N(308)) lsu_reg(
.clk (clk),
.reset(reset),
.stall(stall_lsu),
.flush(flush_lsu),
.in ({VX_lsu_req_temp.valid, VX_lsu_req_temp.warp_num, VX_lsu_req_temp.store_data, VX_lsu_req_temp.base_address, VX_lsu_req_temp.offset, VX_lsu_req_temp.mem_read, VX_lsu_req_temp.mem_write, VX_lsu_req_temp.rd, VX_lsu_req_temp.wb}),
.out ({VX_lsu_req.valid , VX_lsu_req.warp_num , VX_lsu_req.store_data , VX_lsu_req.base_address , VX_lsu_req.offset , VX_lsu_req.mem_read , VX_lsu_req.mem_write , VX_lsu_req.rd , VX_lsu_req.wb })
);
VX_generic_register #(.N(487)) exec_unit_reg(
.clk (clk),
.reset(reset),
.stall(stall_rest),
.flush(flush_rest),
.in ({VX_exec_unit_req_temp.valid, VX_exec_unit_req_temp.warp_num, VX_exec_unit_req_temp.curr_PC, VX_exec_unit_req_temp.PC_next, VX_exec_unit_req_temp.rd, VX_exec_unit_req_temp.wb, VX_exec_unit_req_temp.a_reg_data, VX_exec_unit_req_temp.b_reg_data, VX_exec_unit_req_temp.alu_op, VX_exec_unit_req_temp.rs1, VX_exec_unit_req_temp.rs2, VX_exec_unit_req_temp.rs2_src, VX_exec_unit_req_temp.itype_immed, VX_exec_unit_req_temp.upper_immed, VX_exec_unit_req_temp.branch_type, VX_exec_unit_req_temp.jalQual, VX_exec_unit_req_temp.jal, VX_exec_unit_req_temp.jal_offset, VX_exec_unit_req_temp.ebreak, VX_exec_unit_req_temp.wspawn, VX_exec_unit_req_temp.is_csr, VX_exec_unit_req_temp.csr_address, VX_exec_unit_req_temp.csr_immed, VX_exec_unit_req_temp.csr_mask}),
.out ({VX_exec_unit_req.valid , VX_exec_unit_req.warp_num , VX_exec_unit_req.curr_PC , VX_exec_unit_req.PC_next , VX_exec_unit_req.rd , VX_exec_unit_req.wb , VX_exec_unit_req.a_reg_data , VX_exec_unit_req.b_reg_data , VX_exec_unit_req.alu_op , VX_exec_unit_req.rs1 , VX_exec_unit_req.rs2 , VX_exec_unit_req.rs2_src , VX_exec_unit_req.itype_immed , VX_exec_unit_req.upper_immed , VX_exec_unit_req.branch_type , VX_exec_unit_req.jalQual , VX_exec_unit_req.jal , VX_exec_unit_req.jal_offset , VX_exec_unit_req.ebreak , VX_exec_unit_req.wspawn , VX_exec_unit_req.is_csr , VX_exec_unit_req.csr_address , VX_exec_unit_req.csr_immed , VX_exec_unit_req.csr_mask })
);
VX_generic_register #(.N(203)) gpu_inst_reg(
.clk (clk),
.reset(reset),
.stall(stall_rest),
.flush(flush_rest),
.in ({VX_gpu_inst_req_temp.valid, VX_gpu_inst_req_temp.warp_num, VX_gpu_inst_req_temp.is_wspawn, VX_gpu_inst_req_temp.is_tmc, VX_gpu_inst_req_temp.is_split, VX_gpu_inst_req_temp.is_barrier, VX_gpu_inst_req_temp.pc_next, VX_gpu_inst_req_temp.a_reg_data, VX_gpu_inst_req_temp.rd2}),
.out ({VX_gpu_inst_req.valid , VX_gpu_inst_req.warp_num , VX_gpu_inst_req.is_wspawn , VX_gpu_inst_req.is_tmc , VX_gpu_inst_req.is_split , VX_gpu_inst_req.is_barrier , VX_gpu_inst_req.pc_next , VX_gpu_inst_req.a_reg_data , VX_gpu_inst_req.rd2 })
);
VX_generic_register #(.N(60)) csr_reg(
.clk (clk),
.reset(reset),
.stall(stall_rest),
.flush(flush_rest),
.in ({VX_csr_req_temp.valid, VX_csr_req_temp.warp_num, VX_csr_req_temp.rd, VX_csr_req_temp.wb, VX_csr_req_temp.is_csr, VX_csr_req_temp.csr_address, VX_csr_req_temp.csr_immed, VX_csr_req_temp.csr_mask}),
.out ({VX_csr_req.valid , VX_csr_req.warp_num , VX_csr_req.rd , VX_csr_req.wb , VX_csr_req.is_csr , VX_csr_req.csr_address , VX_csr_req.csr_immed , VX_csr_req.csr_mask })
);
// wire zero_temp = 0;
// VX_generic_register #(.N(256)) reg_data
// (
// .clk (clk),
// .reset(reset),
// .stall(zero_temp),
// .flush(zero_temp),
// .in ({VX_gpr_datf.a_reg_data, VX_gpr_datf.b_reg_data}),
// .out ({VX_gpr_data.a_reg_data, VX_gpr_data.b_reg_data})
// );
// wire stall = schedule_delay;
// VX_d_e_reg gpr_stage_reg(
// .clk (clk),
// .reset (reset),
// .in_branch_stall (stall),
// .in_freeze (zero_temp),
// .VX_frE_to_bckE_req(VX_bckE_req),
// .VX_bckE_req (VX_bckE_req_out)
// );
endmodule

View File

@@ -6,6 +6,7 @@ module VX_scheduler (
input wire clk,
input wire reset,
input wire memory_delay,
input wire gpr_stage_delay,
VX_frE_to_bckE_req_inter VX_bckE_req,
VX_wb_inter VX_writeback_inter,
@@ -28,14 +29,17 @@ module VX_scheduler (
wire rs2_rename = rename_table[VX_bckE_req.rs2];
wire is_store = (VX_bckE_req.mem_write != `NO_MEM_WRITE);
wire is_load = (VX_bckE_req.mem_read != `NO_MEM_READ);
wire is_mem = is_store || is_load;
wire rs1_rename_qual = (rs1_rename && (VX_bckE_req.rs1 != 0));
wire rs2_rename_qual = (rs2_rename && (VX_bckE_req.rs2 != 0) && ((VX_bckE_req.rs2_src == `RS2_REG) || is_store));
wire rs2_rename_qual = (rs2_rename && (VX_bckE_req.rs2 != 0) && ((VX_bckE_req.rs2_src == `RS2_REG) || is_store)) || (VX_bckE_req.is_barrier) || (VX_bckE_req.is_wspawn);
wire rename_valid = rs1_rename_qual || rs2_rename_qual ;
assign schedule_delay = (rename_valid) && (|VX_bckE_req.valid) || memory_delay;
assign schedule_delay = (rename_valid) && (|VX_bckE_req.valid) || (memory_delay && (is_mem)) || (gpr_stage_delay && is_mem);
integer i;

View File

@@ -18,10 +18,10 @@ module VX_warp_scheduler (
input wire whalt,
input wire[`NW_M1:0] whalt_warp_num,
input wire is_barrier,
input wire[31:0] barrier_id,
input wire[`NW_M1:0] num_warps,
input wire[`NW_M1:0] barrier_warp_num,
input wire is_barrier,
input wire[31:0] barrier_id,
input wire[$clog2(`NW):0] num_warps,
input wire[`NW_M1:0] barrier_warp_num,
// WSTALL
input wire wstall,
@@ -86,7 +86,7 @@ module VX_warp_scheduler (
reg[`NW-1:0] barrier_stall_mask[(`NUM_BARRIERS-1):0];
wire reached_barrier_limit;
wire[`NW-1:0] curr_barrier_mask;
wire[($clog2(`NUM_BARRIERS)-1):0] curr_barrier_count;
wire[$clog2(`NW):0] curr_barrier_count;
// wspawn
reg[31:0] use_wsapwn_pc;
@@ -141,41 +141,35 @@ module VX_warp_scheduler (
end else begin
barrier_stall_mask[barrier_id][barrier_warp_num] <= 1;
end
end else if (ctm) begin
thread_masks[ctm_warp_num] <= ctm_mask;
warp_stalled[ctm_warp_num] <= 0;
end else if (is_join) begin
if (!join_fall) begin
warp_pcs[join_warp_num] <= join_pc;
end
thread_masks[join_warp_num] <= join_tm;
end else if (is_split) begin
warp_stalled[split_warp_num] <= 0;
thread_masks[split_warp_num] <= split_new_mask;
end
if (update_use_wspawn) begin
use_wsapwn[warp_to_schedule] <= 0;
end
// Halting warps
if (whalt) begin
warp_active[whalt_warp_num] <= 0;
visible_active[whalt_warp_num] <= 0;
end
// Changing thread masks
if (ctm) begin
thread_masks[ctm_warp_num] <= ctm_mask;
warp_stalled[ctm_warp_num] <= 0;
if (update_use_wspawn) begin
use_wsapwn[warp_to_schedule] <= 0;
end
// Stalling the scheduling of warps
if (wstall) begin
warp_stalled[wstall_warp_num] <= 1;
visible_active[wstall_warp_num] <= 0;
end
if (is_split) begin
warp_stalled[split_warp_num] <= 0;
thread_masks[split_warp_num] <= split_new_mask;
end
if (is_join) begin
if (!join_fall) begin
warp_pcs[join_warp_num] <= join_pc;
end
thread_masks[join_warp_num] <= join_tm;
end
// Refilling active warps
if (update_visible_active) begin
visible_active <= warp_active & (~warp_stalled) & (~total_barrier_stall);

View File

@@ -58,6 +58,7 @@ VX_warp_ctl_inter VX_warp_ctl();
wire memory_delay;
wire gpr_stage_delay;
wire schedule_delay;
@@ -78,6 +79,7 @@ VX_scheduler schedule(
.clk (clk),
.reset (reset),
.memory_delay (memory_delay),
.gpr_stage_delay (gpr_stage_delay),
.VX_bckE_req (VX_bckE_req),
.VX_writeback_inter(VX_writeback_inter),
.schedule_delay (schedule_delay)
@@ -94,7 +96,8 @@ VX_back_end vx_back_end(
.VX_dcache_rsp (VX_dcache_rsp),
.VX_dcache_req (VX_dcache_req),
.VX_writeback_inter (VX_writeback_inter),
.out_mem_delay (memory_delay)
.out_mem_delay (memory_delay),
.gpr_stage_delay (gpr_stage_delay)
);
// VX_csr_handler vx_csr_handler(

View File

@@ -18,9 +18,9 @@ interface VX_warp_ctl_inter ();
wire ebreak;
// barrier
wire is_barrier;
wire[31:0] barrier_id;
wire[`NW_M1:0] num_warps;
wire is_barrier;
wire[31:0] barrier_id;
wire[$clog2(`NW):0] num_warps;
wire is_split;
wire[`NW_M1:0] split_warp_num;

View File

@@ -3,7 +3,7 @@ set link_library [concat * sc12mc_cln28hpm_base_ulvt_c35_ssg_typical_max_0p81v_
set symbol_library {}
set target_library [concat sc12mc_cln28hpm_base_ulvt_c35_ssg_typical_max_0p81v_m40c.db]
set verilog_files [ list VX_generic_stack.v VX_join_inter.v VX_csr_wrapper.v VX_csr_req_inter.v VX_csr_wb_inter.v VX_gpgpu_inst.v VX_gpu_inst_req_inter.v VX_wstall_inter.v VX_inst_exec_wb_inter.v VX_lsu.v VX_execute_unit.v VX_lsu_addr_gen.v VX_inst_multiplex.v VX_exec_unit_req_inter.v VX_lsu_req_inter.v VX_alu.v VX_back_end.v VX_gpr_stage.v VX_gpr_data_inter.v VX_csr_handler.v VX_decode.v VX_define.v VX_scheduler.v VX_fetch.v VX_front_end.v VX_generic_register.v VX_gpr.v VX_gpr_wrapper.v VX_one_counter.v VX_priority_encoder.v VX_warp.v VX_warp_scheduler.v VX_writeback.v Vortex.v byte_enabled_simple_dual_port_ram.v VX_branch_response_inter.v VX_dcache_request_inter.v VX_dcache_response_inter.v VX_frE_to_bckE_req_inter.v VX_gpr_clone_inter.v VX_gpr_jal_inter.v VX_gpr_read_inter.v VX_gpr_wspawn_inter.v VX_icache_request_inter.v VX_icache_response_inter.v VX_inst_mem_wb_inter.v VX_inst_meta_inter.v VX_jal_response_inter.v VX_mem_req_inter.v VX_mw_wb_inter.v VX_warp_ctl_inter.v VX_wb_inter.v VX_d_e_reg.v VX_f_d_reg.v \
set verilog_files [ list VX_generic_priority_encoder.v VX_generic_stack.v VX_join_inter.v VX_csr_wrapper.v VX_csr_req_inter.v VX_csr_wb_inter.v VX_gpgpu_inst.v VX_gpu_inst_req_inter.v VX_wstall_inter.v VX_inst_exec_wb_inter.v VX_lsu.v VX_execute_unit.v VX_lsu_addr_gen.v VX_inst_multiplex.v VX_exec_unit_req_inter.v VX_lsu_req_inter.v VX_alu.v VX_back_end.v VX_gpr_stage.v VX_gpr_data_inter.v VX_csr_handler.v VX_decode.v VX_define.v VX_scheduler.v VX_fetch.v VX_front_end.v VX_generic_register.v VX_gpr.v VX_gpr_wrapper.v VX_one_counter.v VX_priority_encoder.v VX_warp.v VX_warp_scheduler.v VX_writeback.v Vortex.v byte_enabled_simple_dual_port_ram.v VX_branch_response_inter.v VX_dcache_request_inter.v VX_dcache_response_inter.v VX_frE_to_bckE_req_inter.v VX_gpr_clone_inter.v VX_gpr_jal_inter.v VX_gpr_read_inter.v VX_gpr_wspawn_inter.v VX_icache_request_inter.v VX_icache_response_inter.v VX_inst_mem_wb_inter.v VX_inst_meta_inter.v VX_jal_response_inter.v VX_mem_req_inter.v VX_mw_wb_inter.v VX_warp_ctl_inter.v VX_wb_inter.v VX_d_e_reg.v VX_f_d_reg.v \
]
analyze -format sverilog $verilog_files