8Warp 32Threads for GTCAD synthesis

This commit is contained in:
fares
2019-11-21 23:51:11 -05:00
9 changed files with 271 additions and 87 deletions

View File

@@ -1,4 +1,3 @@
`include "VX_define.v"
module VX_alu(
@@ -13,6 +12,71 @@ module VX_alu(
);
`ifdef SYN_FUNC
// Operand selection and multiplier inputs for the SYN_FUNC variant of the ALU.
// which_in2 is high when the second operand comes from the I-type immediate.
wire which_in2;
// 32-bit ALU operands after second-operand selection.
wire[31:0] ALU_in1;
wire[31:0] ALU_in2;
// 64-bit (sign- or zero-extended) copies of the operands fed to the multiplier.
wire[63:0] ALU_in1_mult;
wire[63:0] ALU_in2_mult;
// Upper immediate placed in bits [31:12] with the low 12 bits zeroed.
wire[31:0] upper_immed;
// NOTE(review): div_result and rem_result are declared here but are not driven
// or read in the visible part of this branch -- confirm whether they are needed.
wire[31:0] div_result;
wire[31:0] rem_result;
assign which_in2 = in_rs2_src == `RS2_IMMED;
assign ALU_in1 = in_1;
// Second operand: I-type immediate when selected, otherwise in_2.
assign ALU_in2 = which_in2 ? in_itype_immed : in_2;
assign upper_immed = {in_upper_immed, {12{1'b0}}};
//always @(posedge `MUL) begin
/* verilator lint_off UNUSED */
// Sign-extended 64-bit views of both operands (bit 31 replicated into [63:32]).
wire[63:0] alu_in1_signed = {{32{ALU_in1[31]}}, ALU_in1};
wire[63:0] alu_in2_signed = {{32{ALU_in2[31]}}, ALU_in2};
// Operand 1 is zero-extended for MULHU/DIVU/REMU and sign-extended otherwise.
assign ALU_in1_mult = (in_alu_op == `MULHU || in_alu_op == `DIVU || in_alu_op == `REMU) ? {32'b0, ALU_in1} : alu_in1_signed;
// Operand 2 is zero-extended for MULHU/MULHSU/DIVU/REMU and sign-extended otherwise.
assign ALU_in2_mult = (in_alu_op == `MULHU || in_alu_op == `MULHSU || in_alu_op == `DIVU || in_alu_op == `REMU) ? {32'b0, ALU_in2} : alu_in2_signed;
// Full 64-bit product; the case statement below picks the low or high word.
wire[63:0] mult_result = ALU_in1_mult * ALU_in2_mult;
/* verilator lint_on UNUSED */
// Combinational result mux for the SYN_FUNC ALU: selects out_alu_result
// according to in_alu_op.
//
// The sensitivity list is @(*) rather than an explicit list: the body also
// reads upper_immed, in_curr_PC and mult_result, which the previous list
// (in_alu_op, ALU_in1, ALU_in2) did not name. An incomplete list can make
// simulation disagree with the synthesized logic.
//
// Division by zero: DIV/DIVU return 32'hffffffff, REM/REMU return the dividend.
// Unknown opcodes produce 0.
always @(*) begin
    case(in_alu_op)
        `ADD: out_alu_result = $signed(ALU_in1) + $signed(ALU_in2);
        `SUB: out_alu_result = $signed(ALU_in1) - $signed(ALU_in2);
        `SLLA: out_alu_result = ALU_in1 << ALU_in2[4:0];
        `SLT: out_alu_result = ($signed(ALU_in1) < $signed(ALU_in2)) ? 32'h1 : 32'h0;
        `SLTU: out_alu_result = ALU_in1 < ALU_in2 ? 32'h1 : 32'h0;
        `XOR: out_alu_result = ALU_in1 ^ ALU_in2;
        `SRL: out_alu_result = ALU_in1 >> ALU_in2[4:0];
        `SRA: out_alu_result = $signed(ALU_in1) >>> ALU_in2[4:0];
        `OR: out_alu_result = ALU_in1 | ALU_in2;
        `AND: out_alu_result = ALU_in2 & ALU_in1;
        // All-zeros when in1 >= in2 (unsigned), all-ones otherwise.
        `SUBU: out_alu_result = (ALU_in1 >= ALU_in2) ? 32'h0 : 32'hffffffff;
        `LUI_ALU: out_alu_result = upper_immed;
        `AUIPC_ALU: out_alu_result = $signed(in_curr_PC) + $signed(upper_immed);
        // Multiplies share one 64-bit product; operand extension is chosen upstream.
        `MUL: out_alu_result = mult_result[31:0];
        `MULH: out_alu_result = mult_result[63:32];
        `MULHSU: out_alu_result = mult_result[63:32];
        `MULHU: out_alu_result = mult_result[63:32];
        // The outer $signed() keeps the quotient/remainder expression self-determined,
        // so it stays signed even though the other ternary arm is unsigned.
        `DIV: out_alu_result = (ALU_in2 == 0) ? 32'hffffffff : $signed($signed(ALU_in1) / $signed(ALU_in2));
        `DIVU: out_alu_result = (ALU_in2 == 0) ? 32'hffffffff : ALU_in1 / ALU_in2;
        `REM: out_alu_result = (ALU_in2 == 0) ? ALU_in1 : $signed($signed(ALU_in1) % $signed(ALU_in2));
        `REMU: out_alu_result = (ALU_in2 == 0) ? ALU_in1 : ALU_in1 % ALU_in2;
        default: out_alu_result = 32'h0;
    endcase // in_alu_op
end
`else
wire which_in2;
wire[31:0] ALU_in1;
@@ -69,7 +133,7 @@ module VX_alu(
`REMU: out_alu_result = (ALU_in2 == 0) ? ALU_in1 : ALU_in1 % ALU_in2;
default: out_alu_result = 32'h0;
endcase // in_alu_op
end
end
`endif
endmodule // VX_alu

View File

@@ -1,11 +1,10 @@
`include "./VX_define_synth.v"
`define NT 4
`define NT_M1 (`NT-1)
// NW_M1 is actually log2(NW)
//`define NW_M1 (4-1)
`define NW 8
`define NW_M1 (`CLOG2(`NW))
// Uncomment the below line if NW=1
@@ -13,6 +12,7 @@
// `define SYN 1
// `define ASIC 1
// `define SYN_FUNC 1
`define NUM_BARRIERS 4

2
rtl/VX_define_synth.v Normal file
View File

@@ -0,0 +1,2 @@
// Synthesis configuration macros.
// NOTE: syn/run_mult_synth.sh overwrites this file for each sweep point,
// so manual edits (including these comments) are lost when it runs.
// NT: thread count.
`define NT 32
// NW: warp count.
`define NW 8

View File

@@ -85,83 +85,87 @@ module VX_gpr (
wire[`NT_M1:0][31:0] to_write = (VX_writeback_inter.rd != 0) ? VX_writeback_inter.write_data : 0;
/* verilator lint_off PINCONNECTEMPTY */
rf2_32x128_wm1 first_ram (
.CENYA(),
.AYA(),
.CENYB(),
.WENYB(),
.AYB(),
.QA(temp_a),
.SOA(),
.SOB(),
.CLKA(clk),
.CENA(cena_1),
.AA(VX_gpr_read.rs1),
.CLKB(clk),
.CENB(cenb),
.WENB(write_bit_mask),
.AB(VX_writeback_inter.rd),
.DB(to_write),
.EMAA(3'b011),
.EMASA(1'b0),
.EMAB(3'b011),
.TENA(1'b1),
.TCENA(1'b0),
.TAA(5'b0),
.TENB(1'b1),
.TCENB(1'b0),
.TWENB(128'b0),
.TAB(5'b0),
.TDB(128'b0),
.RET1N(1'b1),
.SIA(2'b0),
.SEA(1'b0),
.DFTRAMBYP(1'b0),
.SIB(2'b0),
.SEB(1'b0),
.COLLDISN(1'b1)
);
/* verilator lint_on PINCONNECTEMPTY */
// Generate loop over thread lanes in steps of four: each iteration connects
// lanes [curr_base_thread+3 : curr_base_thread] to its own RAM instances.
genvar curr_base_thread;
// The bound must be the macro reference `NT (backtick). The previous text
// used an apostrophe ('NT), which is not a macro expansion and does not parse.
for (curr_base_thread = 0; curr_base_thread < `NT; curr_base_thread=curr_base_thread+4)
begin
/* verilator lint_off PINCONNECTEMPTY */
rf2_32x128_wm1 first_ram (
.CENYA(),
.AYA(),
.CENYB(),
.WENYB(),
.AYB(),
.QA(temp_a[(curr_base_thread+3):(curr_base_thread)]),
.SOA(),
.SOB(),
.CLKA(clk),
.CENA(cena_1),
.AA(VX_gpr_read.rs1[(curr_base_thread+3):(curr_base_thread)]),
.CLKB(clk),
.CENB(cenb),
.WENB(write_bit_mask[(curr_base_thread+3):(curr_base_thread)]),
.AB(VX_writeback_inter.rd[(curr_base_thread+3):(curr_base_thread)]),
.DB(to_write[(curr_base_thread+3):(curr_base_thread)]),
.EMAA(3'b011),
.EMASA(1'b0),
.EMAB(3'b011),
.TENA(1'b1),
.TCENA(1'b0),
.TAA(5'b0),
.TENB(1'b1),
.TCENB(1'b0),
.TWENB(128'b0),
.TAB(5'b0),
.TDB(128'b0),
.RET1N(1'b1),
.SIA(2'b0),
.SEA(1'b0),
.DFTRAMBYP(1'b0),
.SIB(2'b0),
.SEB(1'b0),
.COLLDISN(1'b1)
);
/* verilator lint_on PINCONNECTEMPTY */
/* verilator lint_off PINCONNECTEMPTY */
rf2_32x128_wm1 second_ram (
.CENYA(),
.AYA(),
.CENYB(),
.WENYB(),
.AYB(),
.QA(temp_b),
.SOA(),
.SOB(),
.CLKA(clk),
.CENA(cena_2),
.AA(VX_gpr_read.rs2),
.CLKB(clk),
.CENB(cenb),
.WENB(write_bit_mask),
.AB(VX_writeback_inter.rd),
.DB(to_write),
.EMAA(3'b011),
.EMASA(1'b0),
.EMAB(3'b011),
.TENA(1'b1),
.TCENA(1'b0),
.TAA(5'b0),
.TENB(1'b1),
.TCENB(1'b0),
.TWENB(128'b0),
.TAB(5'b0),
.TDB(128'b0),
.RET1N(1'b1),
.SIA(2'b0),
.SEA(1'b0),
.DFTRAMBYP(1'b0),
.SIB(2'b0),
.SEB(1'b0),
.COLLDISN(1'b1)
);
/* verilator lint_on PINCONNECTEMPTY */
/* verilator lint_off PINCONNECTEMPTY */
rf2_32x128_wm1 second_ram (
.CENYA(),
.AYA(),
.CENYB(),
.WENYB(),
.AYB(),
.QA(temp_b[(curr_base_thread+3):(curr_base_thread)]),
.SOA(),
.SOB(),
.CLKA(clk),
.CENA(cena_2),
.AA(VX_gpr_read.rs2[(curr_base_thread+3):(curr_base_thread)]),
.CLKB(clk),
.CENB(cenb),
.WENB(write_bit_mask[(curr_base_thread+3):(curr_base_thread)]),
.AB(VX_writeback_inter.rd[(curr_base_thread+3):(curr_base_thread)]),
.DB(to_write[(curr_base_thread+3):(curr_base_thread)]),
.EMAA(3'b011),
.EMASA(1'b0),
.EMAB(3'b011),
.TENA(1'b1),
.TCENA(1'b0),
.TAA(5'b0),
.TENB(1'b1),
.TCENB(1'b0),
.TWENB(128'b0),
.TAB(5'b0),
.TDB(128'b0),
.RET1N(1'b1),
.SIA(2'b0),
.SEA(1'b0),
.DFTRAMBYP(1'b0),
.SIB(2'b0),
.SEB(1'b0),
.COLLDISN(1'b1)
);
/* verilator lint_on PINCONNECTEMPTY */
end
`endif

View File

@@ -304,9 +304,15 @@ module VX_d_cache
// 0;
wire[1:0] byte_select = bank_addr[1:0];
wire[TAG_SIZE_END:TAG_SIZE_START] cache_tag = bank_addr[ADDR_TAG_END:ADDR_TAG_START];
`ifdef SYN_FUNC
wire[OFFSET_SIZE_END:OFFSET_SIZE_START] cache_offset = 0;
wire[IND_SIZE_END:IND_SIZE_START] cache_index = 0;
`else
wire[OFFSET_SIZE_END:OFFSET_SIZE_START] cache_offset = bank_addr[ADDR_OFFSET_END:ADDR_OFFSET_START];
wire[IND_SIZE_END:IND_SIZE_START] cache_index = bank_addr[ADDR_IND_END:ADDR_IND_START];
wire[TAG_SIZE_END:TAG_SIZE_START] cache_tag = bank_addr[ADDR_TAG_END:ADDR_TAG_START];
`endif
wire normal_valid_in = valid_per_bank[bank_id];

View File

@@ -7,6 +7,7 @@ SRC = \
vortex_dpi.cpp \
vortex_tb.v \
../VX_define.v \
../VX_define_synth.v \
../interfaces/VX_branch_response_inter.v \
../interfaces/VX_csr_req_inter.v \
../interfaces/VX_csr_wb_inter.v \

View File

@@ -1,7 +1,33 @@
all: syn
SCRIPT_DIR=./scripts
all: dc
syn:
dc_shell-t -f fsyn.tcl 2>&1 | tee vortex_syn.log
#syn:
#dc_shell-t -f esyn.tcl 2>&1 | tee vortex_syn.log
#dc_shell -f esyn.tcl 2>&1 | tee vortex_syn.log
#dc_shell -f $(SCRIPT_DIR)/dc/dc_script.tcl
dc:
rm -rf rpt
mkdir rpt
dc_shell -f esyn.tcl 2>&1 | tee vortex_syn.log
clean:
rm -f simv
rm -f *.vcd
rm -f *.key
rm -rf csrc/
rm -rf *.rpt
rm -rf *.log
rm -rf *.svf
rm -rf *.ddc
rm -rf results_synthesized.v
rm -rf results_synthesized.sdc
rm -rf alib-52/
rm -rf rpt/
rm -rf simv.daidir/
rm -rf encounter*
rm -rf ./synth_out

53
syn/esyn.tcl Normal file
View File

@@ -0,0 +1,53 @@
# dc_shell synthesis script for the Vortex top level.
# Flow: library setup -> analyze/elaborate/link -> constraints -> compile ->
# flatten/rename -> reports -> write netlist, SDC and DDC.
#set search_path [concat /nethome/dshim8/Desktop/GTCAD-3DPKG-v3/example/tech/cln28hpm/2d_db/ /nethome/dshim8/Desktop/GTCAD-3DPKG-v3/example/tech/cln28hpm/2d_hard_db/ ../rtl/ ../rtl/interfaces ../rtl/pipe_regs ../rtl/shared_memory ../rtl/cache ../models/memory/cln28hpm/2d_hardmacro_db]
# Directories searched for the RTL sources and memory models listed below.
set search_path [concat ../rtl/ ../rtl/interfaces ../rtl/pipe_regs ../rtl/shared_memory ../rtl/cache ../models/memory/cln28hpm/2d_hardmacro_db]
# Link and target libraries both point at the same local .db file.
set link_library [concat ./NanGate_15nm_OCL.db]
set symbol_library {}
set target_library [concat ./NanGate_15nm_OCL.db]
# Source files handed to analyze; resolved through search_path.
set verilog_files [ list VX_countones.v VX_priority_encoder_w_mask.v VX_dram_req_rsp_inter.v VX_cache_data_per_index.v VX_Cache_Bank.v VX_cache_data.v VX_d_cache.v VX_bank_valids.v VX_priority_encoder_sm.v VX_shared_memory.v VX_shared_memory_block.v VX_dmem_controller.v VX_generic_priority_encoder.v VX_generic_stack.v VX_join_inter.v VX_csr_wrapper.v VX_csr_req_inter.v VX_csr_wb_inter.v VX_gpgpu_inst.v VX_gpu_inst_req_inter.v VX_wstall_inter.v VX_inst_exec_wb_inter.v VX_lsu.v VX_execute_unit.v VX_lsu_addr_gen.v VX_inst_multiplex.v VX_exec_unit_req_inter.v VX_lsu_req_inter.v VX_alu.v VX_back_end.v VX_gpr_stage.v VX_gpr_data_inter.v VX_csr_handler.v VX_decode.v VX_define.v VX_define_synth.v VX_scheduler.v VX_fetch.v VX_front_end.v VX_generic_register.v VX_gpr.v VX_gpr_wrapper.v VX_priority_encoder.v VX_warp_scheduler.v VX_writeback.v byte_enabled_simple_dual_port_ram.v VX_branch_response_inter.v VX_dcache_request_inter.v VX_dcache_response_inter.v VX_frE_to_bckE_req_inter.v VX_gpr_clone_inter.v VX_gpr_jal_inter.v VX_gpr_read_inter.v VX_gpr_wspawn_inter.v VX_icache_request_inter.v VX_icache_response_inter.v VX_inst_mem_wb_inter.v VX_inst_meta_inter.v VX_jal_response_inter.v VX_mem_req_inter.v VX_mw_wb_inter.v VX_warp_ctl_inter.v VX_wb_inter.v VX_d_e_reg.v VX_f_d_reg.v Vortex.v VX_cache_bank_valid.v \
]
# set verilog_files [ list Vortex.v VX_countones.v VX_priority_encoder_w_mask.v VX_dram_req_rsp_inter.v cache_set.v VX_Cache_Bank.v VX_Cache_Block_DM.v VX_cache_data.v VX_d_cache.v VX_generic_pc.v VX_bank_valids.v VX_priority_encoder_sm.v VX_shared_memory.v VX_shared_memory_block.v VX_dmem_controller.v VX_generic_priority_encoder.v VX_generic_stack.v VX_join_inter.v VX_csr_wrapper.v VX_csr_req_inter.v VX_csr_wb_inter.v VX_gpgpu_inst.v VX_gpu_inst_req_inter.v VX_wstall_inter.v VX_inst_exec_wb_inter.v VX_lsu.v VX_execute_unit.v VX_lsu_addr_gen.v VX_inst_multiplex.v VX_exec_unit_req_inter.v VX_lsu_req_inter.v VX_alu.v VX_back_end.v VX_gpr_stage.v VX_gpr_data_inter.v VX_csr_handler.v VX_decode.v VX_define.v VX_scheduler.v VX_fetch.v VX_front_end.v VX_generic_register.v VX_gpr.v VX_gpr_wrapper.v VX_one_counter.v VX_priority_encoder.v VX_warp_scheduler.v VX_writeback.v byte_enabled_simple_dual_port_ram.v VX_branch_response_inter.v VX_dcache_request_inter.v VX_dcache_response_inter.v VX_frE_to_bckE_req_inter.v VX_gpr_clone_inter.v VX_gpr_jal_inter.v VX_gpr_read_inter.v VX_gpr_wspawn_inter.v VX_icache_request_inter.v VX_icache_response_inter.v VX_inst_mem_wb_inter.v VX_inst_meta_inter.v VX_jal_response_inter.v VX_mem_req_inter.v VX_mw_wb_inter.v VX_warp_ctl_inter.v VX_wb_inter.v VX_d_e_reg.v VX_f_d_reg.v \
# ]
# NOTE(review): top_level is set but the commands below name Vortex directly.
set top_level Vortex
# Read the sources as SystemVerilog, then elaborate and link the top module.
analyze -format sverilog $verilog_files
#analyze -format sverilog -error=LINT-66 $verilog_files
elaborate Vortex
link
# Clock constraint: period = 1000.0 / clk_freq = 2500 in the library's time unit.
# NOTE(review): presumably clk_freq is in GHz and the period in ps (400 MHz) -- confirm against the library units.
set clk_freq 0.4
set clk_period [expr 1000.0 / $clk_freq / 1.0]
create_clock [get_ports clk] -period $clk_period
# clk is treated as an ideal network during compile; reset is excluded from timing.
set_max_fanout 20 [get_ports clk]
set_ideal_network [get_ports clk]
set_max_fanout 20 [get_ports reset]
set_false_path -from [get_ports reset]
all_high_fanout -net -threshold 20
# set_register_merging Vortex FALSE
# set compile_seqmap_propagate_constants false
# set compile_seqmap_propagate_high_effort false
check_design
# Compile with hierarchy preserved, then flatten everything explicitly afterwards.
compile_ultra -no_autoungroup
ungroup -all -flatten
uniquify
# Apply Verilog-friendly naming rules before writing the netlist.
define_name_rules verilog -remove_internal_net_bus -remove_port_bus
change_names -rule verilog -hierarchy
# report_qor
# Reports go to stdout (the Makefile's dc target tees them into vortex_syn.log).
report_area
report_hierarchy
report_cell
report_reference
report_port
report_power
# Outputs: gate-level netlist, then constraints with the clock switched from
# ideal to propagated, then the design database.
write -hierarchy -format verilog -output Vortex.netlist.v
remove_ideal_network [get_ports clk]
set_propagated_clock [get_ports clk]
write_sdc -version 1.9 Vortex.sdc
write_file -format ddc -output Vortex.ddc
exit

28
syn/run_mult_synth.sh Normal file
View File

@@ -0,0 +1,28 @@
#!/bin/bash
# Synthesis sweep: for every (warp count, thread count) pair, regenerate
# ../rtl/VX_define_synth.v, run `make dc`, and move the resulting
# vortex_syn.log two directories up under a name encoding the configuration.
#
# The script previously used csh-style `set name = value`. Under bash that
# only overwrites the positional parameters and leaves the variable unset,
# so $cur_dir expanded to an empty string. Plain bash assignments are used now.

top_level=Vortex   # kept from the original script; not referenced below

# NOTE(review): the file name suggests a csh environment script; bash may not
# be able to source it -- confirm, or point this at an sh-compatible setup file.
source /tools/synopsys/synthesis/j201409/cshrc.syn

cur_dir=$(pwd)
echo "$cur_dir"

for number_of_warps in 2 4 8 16 32; do
    for number_of_threads in 2 4 8 16 32; do
        echo "Warp Count: $number_of_warps Thread Count: $number_of_threads Launched"

        # Overwrite the generated defines file with this sweep point.
        echo "\`define NT $number_of_threads" > ../rtl/VX_define_synth.v
        echo "\`define NW $number_of_warps" >> ../rtl/VX_define_synth.v

        # run.log is truncated by tee on every iteration; only the last run remains.
        make dc | tee run.log
        sleep 30

        moved_filename="${number_of_warps}_Warps__${number_of_threads}_threads__400MHz.log"
        mv ./vortex_syn.log "../../$moved_filename"
        sleep 30

        echo "Warp Count: $number_of_warps Thread Count: $number_of_threads Finished"
    done
done

echo "Done!"