diff --git a/ci/regression.sh b/ci/regression.sh index 13d8136d..eb00b259 100755 --- a/ci/regression.sh +++ b/ci/regression.sh @@ -15,6 +15,9 @@ set -e CONFIGS=-DEXT_M_DISABLE make -C hw/simulate CONFIGS=-DEXT_F_DISABLE make -C hw/simulate +# disable shared memory +CONFIGS=-DSM_ENABLE=0 make -C hw/simulate + # Blackbox tests ./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --perf --app=demo --args="-n1" ./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --debug --app=demo --args="-n1" diff --git a/evaluation/scripts/README.txt b/evaluation/scripts/README.txt index 79cbc558..908b0147 100644 --- a/evaluation/scripts/README.txt +++ b/evaluation/scripts/README.txt @@ -5,16 +5,19 @@ Description: Makes the build in the opae directory with the specified core exists, a make clean command is ran before the build. Script waits until the inteldev script or quartus program is finished running. -Usage: ./build.sh -c [1|2|4|8|16] [-p [y|n]] +Usage: ./build.sh -c [1|2|4|8|16] [-p perf] [-w wait] Options: -c Core count (1, 2, 4, 8, or 16). -p - Performance profiling enable (y or n). Changes the source file in the + Performance profiling enable. Changes the source file in the opae directory to include/exclude "+define+PERF_ENABLE". + -w + Wait for the build to complete + _______________________________________________________________________________ diff --git a/evaluation/scripts/build.sh b/evaluation/scripts/build.sh index abb646f1..21b9f345 100755 --- a/evaluation/scripts/build.sh +++ b/evaluation/scripts/build.sh @@ -1,10 +1,23 @@ #!/bin/bash -while getopts c:p: flag +BUILD_DIR=../../hw/syn/opae + +perf=0 +wait=0 + +while getopts c:pwh flag do case "${flag}" in c) cores=${OPTARG};; #1, 2, 4, 8, 16 - p) perf=${OPTARG};; #perf counters enable (y/n) + p) perf=1;; #perf counters enable + w) wait=1;; # wait for build to complete + h) echo "Usage: -c [-p perf] [-w wait] [-h help]" + exit 0 + ;; + \?) + echo "Invalid option: -$OPTARG" 1>&2 + exit 1 + ;; esac done @@ -13,25 +26,22 @@ if [[ ! "$cores" =~ ^(1|2|4|8|16)$ ]]; then exit 1 fi -cd ../../hw/syn/opae +cd ${BUILD_DIR} sources_file="./sources_${cores}c.txt" -if [ ${perf:0:1} = "n" ]; then - if grep -v '^ *#' ${sources_file} | grep -Fxq '+define+SYNTHESIS'; then - sed -i 's/+define+PERF_ENABLE/#+define+PERF_ENABLE/' ${sources_file} - elif ! grep -Fxq '#+define+PERF_ENABLE' ${sources_file}; then - sed -i '1s/^/#+define+PERF_ENABLE\n/' ${sources_file} - fi -elif [ ${perf:0:1} = "y" ]; then +if [ ${perf} = 1 ]; then if grep -Fxq '#+define+PERF_ENABLE' ${sources_file}; then sed -i 's/+define+PERF_ENABLE/#+define+PERF_ENABLE/' ${sources_file} elif ! grep -Fxq '+define+PERF_ENABLE' ${sources_file}; then sed -i '1s/^/+define+PERF_ENABLE\n/' ${sources_file} fi else - echo 'Invalid parameter for argument -p (y/n expected)' - exit 1 + if grep -v '^ *#' ${sources_file} | grep -Fxq '+define+SYNTHESIS'; then + sed -i 's/+define+PERF_ENABLE/#+define+PERF_ENABLE/' ${sources_file} + elif ! grep -Fxq '#+define+PERF_ENABLE' ${sources_file}; then + sed -i '1s/^/#+define+PERF_ENABLE\n/' ${sources_file} + fi fi if [ -d "./build_fpga_{$cores}c" ]; then @@ -39,12 +49,12 @@ if [ -d "./build_fpga_{$cores}c" ]; then fi make "fpga-${cores}c" -sleep 30 - -pids=($(pgrep -f "${OPAE_PLATFORM_ROOT}|quartus")) -for pid in ${pids[@]}; do - while kill -0 ${pid} 2> /dev/null; do - sleep 30 +if [ ${wait} = 1 ]; then + sleep 30 + pids=($(pgrep -f "${OPAE_PLATFORM_ROOT}|quartus")) + for pid in ${pids[@]}; do + while kill -0 ${pid} 2> /dev/null; do + sleep 30 + done done -done - +fi diff --git a/evaluation/scripts/build_all_perf.sh b/evaluation/scripts/build_all_perf.sh index ffcfa211..336bb49e 100755 --- a/evaluation/scripts/build_all_perf.sh +++ b/evaluation/scripts/build_all_perf.sh @@ -2,6 +2,6 @@ for ((i=1; i <= 16; i=i*2)); do echo "Building ${i} core build..." - ./build.sh -c ${i} -p y + ./build.sh -c ${i} -p -w echo "Done ${i} core build." done diff --git a/hw/dpi/float_dpi.cpp b/hw/dpi/float_dpi.cpp index f7d9a85c..70d49eae 100644 --- a/hw/dpi/float_dpi.cpp +++ b/hw/dpi/float_dpi.cpp @@ -26,9 +26,9 @@ extern "C" { void dpi_utof(int a, int frm, int* result, int* fflags); void dpi_fclss(int a, int* result); - void dpi_fsgnj(int a, int* result); - void dpi_fsgnjn(int a, int* result); - void dpi_fsgnjx(int a, int* result); + void dpi_fsgnj(int a, int b, int* result); + void dpi_fsgnjn(int a, int b, int* result); + void dpi_fsgnjx(int a, int b, int* result); void dpi_flt(int a, int b, int* result, int* fflags); void dpi_fle(int a, int b, int* result, int* fflags); @@ -244,21 +244,53 @@ void dpi_fmax(int a, int b, int* result, int* fflags) { } void dpi_fclss(int a, int* result) { - // TODO - *result = 0; + + int r = 0; // clear all bits + + bool fsign = (a >> 31); + uint32_t expo = (a >> 23) & 0xFF; + uint32_t fraction = a & 0x7FFFFF; + + if ((expo == 0) && (fraction == 0)) { + r = fsign ? (1 << 3) : (1 << 4); // +/- 0 + } else if ((expo == 0) && (fraction != 0)) { + r = fsign ? (1 << 2) : (1 << 5); // +/- subnormal + } else if ((expo == 0xFF) && (fraction == 0)) { + r = fsign ? (1<<0) : (1<<7); // +/- infinity + } else if ((expo == 0xFF ) && (fraction != 0)) { + if (!fsign && (fraction == 0x00400000)) { + r = (1 << 9); // quiet NaN + } else { + r = (1 << 8); // signaling NaN + } + } else { + r = fsign ? (1 << 1) : (1 << 6); // +/- normal + } + + *result = r; } -void dpi_fsgnj(int a, int* result) { - // TODO - *result = 0; +void dpi_fsgnj(int a, int b, int* result) { + + int sign = b & 0x80000000; + int r = sign | (a & 0x7FFFFFFF); + + *result = r; } -void dpi_fsgnjn(int a, int* result) { - // TODO - *result = 0; +void dpi_fsgnjn(int a, int b, int* result) { + + int sign = ~b & 0x80000000; + int r = sign | (a & 0x7FFFFFFF); + + *result = r; } -void dpi_fsgnjx(int a, int* result) { - // TODO - *result = 0; +void dpi_fsgnjx(int a, int b, int* result) { + + int sign1 = a & 0x80000000; + int sign2 = b & 0x80000000; + int r = (sign1 ^ sign2) | (a & 0x7FFFFFFF); + + *result = r; } \ No newline at end of file diff --git a/hw/dpi/float_dpi.vh b/hw/dpi/float_dpi.vh index 62f45432..bba4c891 100644 --- a/hw/dpi/float_dpi.vh +++ b/hw/dpi/float_dpi.vh @@ -18,9 +18,9 @@ import "DPI-C" context function void dpi_itof(input int a, input bit[2:0] frm, o import "DPI-C" context function void dpi_utof(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags); import "DPI-C" context function void dpi_fclss(input int a, output int result); -import "DPI-C" context function void dpi_fsgnj(input int a, output int result); -import "DPI-C" context function void dpi_fsgnjn(input int a, output int result); -import "DPI-C" context function void dpi_fsgnjx(input int a, output int result); +import "DPI-C" context function void dpi_fsgnj(input int a, input int b, output int result); +import "DPI-C" context function void dpi_fsgnjn(input int a, input int b, output int result); +import "DPI-C" context function void dpi_fsgnjx(input int a, input int b, output int result); import "DPI-C" context function void dpi_flt(input int a, input int b, output int result, output bit[4:0] fflags); import "DPI-C" context function void dpi_fle(input int a, input int b, output int result, output bit[4:0] fflags); diff --git a/hw/rtl/VX_cluster.v b/hw/rtl/VX_cluster.v index 466512e7..ea9a766b 100644 --- a/hw/rtl/VX_cluster.v +++ b/hw/rtl/VX_cluster.v @@ -120,7 +120,7 @@ module VX_cluster #( .DATA_WIDTH (32), .ADDR_WIDTH (12), .BUFFERED_REQ (1), - .BUFFERED_RSP (`NUM_CORES >= 4) + .BUFFERED_RSP (1) ) csr_arb ( .clk (clk), .reset (reset), @@ -225,7 +225,7 @@ module VX_cluster #( .DATA_WIDTH (`L2DRAM_LINE_WIDTH), .TAG_IN_WIDTH (`XDRAM_TAG_WIDTH), .TAG_OUT_WIDTH (`L2DRAM_TAG_WIDTH), - .BUFFERED_REQ (`NUM_CORES >= 4), + .BUFFERED_REQ (1), .BUFFERED_RSP (1) ) dram_arb ( .clk (clk), diff --git a/hw/rtl/VX_databus_arb.v b/hw/rtl/VX_databus_arb.v index df5ac42a..6a77f385 100644 --- a/hw/rtl/VX_databus_arb.v +++ b/hw/rtl/VX_databus_arb.v @@ -21,7 +21,7 @@ module VX_databus_arb ( localparam SMEM_ASHIFT = `CLOG2(`SHARED_MEM_BASE_ADDR_ALIGN); localparam REQ_ASHIFT = `CLOG2(`DWORD_SIZE); localparam REQ_ADDRW = 32 - REQ_ASHIFT; - localparam REQ_DATAW = REQ_ADDRW + 1 + `DWORD_SIZE + (`DWORD_SIZE*8) + `DCORE_TAG_WIDTH; + localparam REQ_DATAW = 1 + REQ_ADDRW + 1 + `DWORD_SIZE + (`DWORD_SIZE*8) + `DCORE_TAG_WIDTH; localparam RSP_DATAW = `NUM_THREADS + `NUM_THREADS * (`DWORD_SIZE*8) + `DCORE_TAG_WIDTH; // @@ -30,41 +30,42 @@ module VX_databus_arb ( for (genvar i = 0; i < `NUM_THREADS; ++i) begin - wire cache_req_ready_in; - wire smem_req_ready_in; + wire cache_req_valid_out, cache_req_ready_out; + wire is_smem_addr_in, is_smem_addr_out; // select shared memory bus - wire is_smem_addr = core_req_if.valid[i] && `SM_ENABLE - && (core_req_if.addr[i][REQ_ADDRW-1:SMEM_ASHIFT-REQ_ASHIFT] >= (32-SMEM_ASHIFT)'((`SHARED_MEM_BASE_ADDR - `SMEM_SIZE) >> SMEM_ASHIFT)) - && (core_req_if.addr[i][REQ_ADDRW-1:SMEM_ASHIFT-REQ_ASHIFT] < (32-SMEM_ASHIFT)'(`SHARED_MEM_BASE_ADDR >> SMEM_ASHIFT)); + assign is_smem_addr_in = core_req_if.valid[i] && `SM_ENABLE + && (core_req_if.addr[i][REQ_ADDRW-1:SMEM_ASHIFT-REQ_ASHIFT] >= (32-SMEM_ASHIFT)'((`SHARED_MEM_BASE_ADDR - `SMEM_SIZE) >> SMEM_ASHIFT)) + && (core_req_if.addr[i][REQ_ADDRW-1:SMEM_ASHIFT-REQ_ASHIFT] < (32-SMEM_ASHIFT)'(`SHARED_MEM_BASE_ADDR >> SMEM_ASHIFT)); VX_skid_buffer #( .DATAW (REQ_DATAW) - ) cache_out_buffer ( + ) out_buffer ( .clk (clk), .reset (reset), - .valid_in (core_req_if.valid[i] && !is_smem_addr), - .data_in ({core_req_if.addr[i], core_req_if.rw[i], core_req_if.byteen[i], core_req_if.data[i], core_req_if.tag[i]}), - .ready_in (cache_req_ready_in), - .valid_out (cache_req_if.valid[i]), - .data_out ({cache_req_if.addr[i], cache_req_if.rw[i], cache_req_if.byteen[i], cache_req_if.data[i], cache_req_if.tag[i]}), - .ready_out (cache_req_if.ready[i]) + .valid_in (core_req_if.valid[i]), + .data_in ({is_smem_addr_in, core_req_if.addr[i], core_req_if.rw[i], core_req_if.byteen[i], core_req_if.data[i], core_req_if.tag[i]}), + .ready_in (core_req_if.ready[i]), + .valid_out (cache_req_valid_out), + .data_out ({is_smem_addr_out, cache_req_if.addr[i], cache_req_if.rw[i], cache_req_if.byteen[i], cache_req_if.data[i], cache_req_if.tag[i]}), + .ready_out (cache_req_ready_out) ); - VX_skid_buffer #( - .DATAW (REQ_DATAW) - ) smem_out_buffer ( - .clk (clk), - .reset (reset), - .valid_in (core_req_if.valid[i] && is_smem_addr), - .data_in ({core_req_if.addr[i], core_req_if.rw[i], core_req_if.byteen[i], core_req_if.data[i], core_req_if.tag[i]}), - .ready_in (smem_req_ready_in), - .valid_out (smem_req_if.valid[i]), - .data_out ({smem_req_if.addr[i], smem_req_if.rw[i], smem_req_if.byteen[i], smem_req_if.data[i], smem_req_if.tag[i]}), - .ready_out (smem_req_if.ready[i]) - ); - - assign core_req_if.ready[i] = is_smem_addr ? smem_req_ready_in : cache_req_ready_in; + if (`SM_ENABLE ) begin + assign cache_req_if.valid[i] = cache_req_valid_out && ~is_smem_addr_out; + assign smem_req_if.valid[i] = cache_req_valid_out && is_smem_addr_out; + assign cache_req_ready_out = is_smem_addr_out ? smem_req_if.ready[i] : cache_req_if.ready[i]; + + assign smem_req_if.addr[i] = cache_req_if.addr[i]; + assign smem_req_if.rw[i] = cache_req_if.rw[i]; + assign smem_req_if.byteen[i] = cache_req_if.byteen[i]; + assign smem_req_if.data[i] = cache_req_if.data[i]; + assign smem_req_if.tag[i] = cache_req_if.tag[i]; + end else begin + `UNUSED_VAR (is_smem_addr_out) + assign cache_req_if.valid[i] = cache_req_valid_out; + assign cache_req_ready_out = cache_req_if.ready[i]; + end end // diff --git a/hw/rtl/VX_decode.v b/hw/rtl/VX_decode.v index f68ec116..628a5f27 100644 --- a/hw/rtl/VX_decode.v +++ b/hw/rtl/VX_decode.v @@ -1,6 +1,12 @@ `include "VX_define.vh" `include "VX_print_instr.vh" +`ifdef EXT_F_ENABLE + `define USED_REGS(f,r) used_regs[{f,r}] = 1 +`else + `define USED_REGS(f,r) used_regs[r] = 1 +`endif + module VX_decode #( parameter CORE_ID = 0 ) ( @@ -22,10 +28,12 @@ module VX_decode #( reg [`EX_BITS-1:0] ex_type; reg [`OP_BITS-1:0] op_type; reg [`MOD_BITS-1:0] op_mod; - reg [31:0] imm; - reg use_rd, use_rs1, use_rs2, use_rs3, use_PC, use_imm; + reg [4:0] rd_r, rs1_r, rs2_r, rs3_r; + reg [31:0] imm; + reg use_rd, use_PC, use_imm; reg rd_fp, rs1_fp, rs2_fp; reg is_join, is_wstall; + reg [`NUM_REGS-1:0] used_regs; wire [31:0] instr = ifetch_rsp_if.instr; wire [6:0] opcode = instr[6:0]; @@ -45,21 +53,23 @@ module VX_decode #( always @(*) begin - ex_type = `EX_NOP; + ex_type = 0; op_type = 'x; op_mod = 'x; imm = 'x; use_rd = 0; - use_rs1 = 0; - use_rs2 = 0; - use_rs3 = 0; use_PC = 0; use_imm = 0; rd_fp = 0; rs1_fp = 0; rs2_fp = 0; is_join = 0; - is_wstall = 0; + is_wstall = 0; + used_regs = 0; + rd_r = rd; + rs1_r = rs1; + rs2_r = rs2; + rs3_r = rs3; case (opcode) `INST_I: begin @@ -78,8 +88,9 @@ module VX_decode #( op_mod = 0; imm = {{20{alu_imm[11]}}, alu_imm}; use_rd = 1; - use_rs1 = 1; use_imm = 1; + `USED_REGS (1'b0, rd); + `USED_REGS (1'b0, rs1); end `INST_R: begin ex_type = `EX_ALU; @@ -113,18 +124,21 @@ module VX_decode #( endcase op_mod = 0; end - use_rd = 1; - use_rs1 = 1; - use_rs2 = 1; + use_rd = 1; + `USED_REGS (1'b0, rd); + `USED_REGS (1'b0, rs1); + `USED_REGS (1'b0, rs2); end `INST_LUI: begin ex_type = `EX_ALU; op_type = `OP_BITS'(`ALU_LUI); - op_mod = 0; + op_mod = 0; + rs1_r = 0; imm = {upper_imm, 12'(0)}; use_rd = 1; - use_rs1 = 1; - use_imm = 1; + use_imm = 1; + `USED_REGS (1'b0, rd); + `USED_REGS (1'b0, 5'b0); end `INST_AUIPC: begin ex_type = `EX_ALU; @@ -134,6 +148,7 @@ module VX_decode #( use_rd = 1; use_PC = 1; use_imm = 1; + `USED_REGS (1'b0, rd); end `INST_JAL: begin ex_type = `EX_ALU; @@ -144,6 +159,7 @@ module VX_decode #( use_PC = 1; use_imm = 1; is_wstall = 1; + `USED_REGS (1'b0, rd); end `INST_JALR: begin ex_type = `EX_ALU; @@ -151,9 +167,10 @@ module VX_decode #( op_mod = 1; imm = {{20{jalr_imm[11]}}, jalr_imm}; use_rd = 1; - use_rs1 = 1; use_imm = 1; is_wstall = 1; + `USED_REGS (1'b0, rd); + `USED_REGS (1'b0, rs1); end `INST_B: begin ex_type = `EX_ALU; @@ -168,11 +185,11 @@ module VX_decode #( endcase op_mod = 1; imm = {{20{instr[31]}}, instr[7], instr[30:25], instr[11:8], 1'b0}; - use_rs1 = 1; - use_rs2 = 1; use_PC = 1; use_imm = 1; is_wstall = 1; + `USED_REGS (1'b0, rs1); + `USED_REGS (1'b0, rs2); end `INST_SYS : begin if (func3 == 0) begin @@ -190,6 +207,7 @@ module VX_decode #( use_rd = 1; use_PC = 1; use_imm = 1; + `USED_REGS (1'b0, rd); end else begin ex_type = `EX_CSR; case (func3[1:0]) @@ -201,8 +219,10 @@ module VX_decode #( endcase imm = 32'(u_12); use_rd = 1; - use_rs1 = !func3[2]; use_imm = func3[2]; + `USED_REGS (1'b0, rd); + if (!func3[2]) + `USED_REGS (1'b0, rs1); end end `ifdef EXT_F_ENABLE @@ -212,10 +232,11 @@ module VX_decode #( ex_type = `EX_LSU; op_type = `OP_BITS'({1'b0, func3}); imm = {{20{u_12[11]}}, u_12}; - use_rd = 1; - use_rs1 = 1; - `ifdef EXT_F_ENABLE - rd_fp = (opcode == `INST_FL); + use_rd = 1; + `USED_REGS (1'b0, rs1); + `USED_REGS ((opcode == `INST_FL), rd); + `ifdef EXT_F_ENABLE + rd_fp = (opcode == `INST_FL); `endif end `ifdef EXT_F_ENABLE @@ -225,8 +246,8 @@ module VX_decode #( ex_type = `EX_LSU; op_type = `OP_BITS'({1'b1, func3}); imm = {{20{func7[6]}}, func7, rd}; - use_rs1 = 1; - use_rs2 = 1; + `USED_REGS (1'b0, rs1); + `USED_REGS ((opcode == `INST_FS), rs2); `ifdef EXT_F_ENABLE rs2_fp = (opcode == `INST_FS); `endif @@ -240,17 +261,18 @@ module VX_decode #( op_type = `OP_BITS'(opcode[3:0]); op_mod = func3; use_rd = 1; - use_rs1 = 1; - use_rs2 = 1; - use_rs3 = 1; rd_fp = 1; rs1_fp = 1; - rs2_fp = 1; + rs2_fp = 1; + `USED_REGS (1'b1, rd); + `USED_REGS (1'b1, rs1); + `USED_REGS (1'b1, rs2); + `USED_REGS (1'b1, rs3); end `INST_FCI: begin ex_type = `EX_FPU; op_mod = func3; - use_rd = 1; + use_rd = 1; case (func7) 7'h00, // FADD 7'h04, // FSUB @@ -258,55 +280,61 @@ module VX_decode #( 7'h0C: // FDIV begin op_type = `OP_BITS'(func7[3:0]); - use_rd = 1; - use_rs1 = 1; - use_rs2 = 1; rd_fp = 1; rs1_fp = 1; rs2_fp = 1; + `USED_REGS (1'b1, rd); + `USED_REGS (1'b1, rs1); + `USED_REGS (1'b1, rs2); end 7'h2C: begin op_type = `OP_BITS'(`FPU_SQRT); - use_rs1 = 1; rd_fp = 1; rs1_fp = 1; + `USED_REGS (1'b1, rd); + `USED_REGS (1'b1, rs1); end 7'h50: begin op_type = `OP_BITS'(`FPU_CMP); - use_rs1 = 1; - use_rs2 = 1; rs1_fp = 1; rs2_fp = 1; + `USED_REGS (1'b0, rd); + `USED_REGS (1'b1, rs1); + `USED_REGS (1'b1, rs2); end 7'h60: begin op_type = (instr[20]) ? `OP_BITS'(`FPU_CVTWUS) : `OP_BITS'(`FPU_CVTWS); - use_rs1 = 1; rs1_fp = 1; + `USED_REGS (1'b0, rd); + `USED_REGS (1'b1, rs1); end 7'h68: begin op_type = (instr[20]) ? `OP_BITS'(`FPU_CVTSWU) : `OP_BITS'(`FPU_CVTSW); - use_rs1 = 1; rd_fp = 1; + `USED_REGS (1'b1, rd); + `USED_REGS (1'b0, rs1); end 7'h10: begin // FSGNJ=0, FSGNJN=1, FSGNJX=2 op_type = `OP_BITS'(`FPU_MISC); op_mod = {1'b0, func3[1:0]}; - use_rs1 = 1; - use_rs2 = 1; rd_fp = 1; rs1_fp = 1; rs2_fp = 1; + `USED_REGS (1'b1, rd); + `USED_REGS (1'b1, rs1); + `USED_REGS (1'b1, rs2); end 7'h14: begin // FMIN=3, FMAX=4 op_type = `OP_BITS'(`FPU_MISC); op_mod = func3[0] ? 4 : 3; - use_rs1 = 1; - use_rs2 = 1; rd_fp = 1; rs1_fp = 1; rs2_fp = 1; + `USED_REGS (1'b1, rd); + `USED_REGS (1'b1, rs1); + `USED_REGS (1'b1, rs2); end 7'h70: begin if (func3[0]) begin @@ -316,15 +344,17 @@ module VX_decode #( // FMV.X.W=5 op_type = `OP_BITS'(`FPU_MISC); op_mod = 5; - end - use_rs1 = 1; - rs1_fp = 1; + end + rs1_fp = 1; + `USED_REGS (1'b0, rd); + `USED_REGS (1'b1, rs1); end 7'h78: begin // FMV.W.X=6 op_type = `OP_BITS'(`FPU_MISC); op_mod = 6; rd_fp = 1; + `USED_REGS (1'b1, rd); end default:; endcase @@ -335,28 +365,28 @@ module VX_decode #( case (func3) 3'h0: begin op_type = `OP_BITS'(`GPU_TMC); - use_rs1 = 1; is_wstall = 1; + `USED_REGS (1'b0, rs1); end 3'h1: begin op_type = `OP_BITS'(`GPU_WSPAWN); - use_rs1 = 1; - use_rs2 = 1; + `USED_REGS (1'b0, rs1); + `USED_REGS (1'b0, rs2); end 3'h2: begin op_type = `OP_BITS'(`GPU_SPLIT); - use_rs1 = 1; is_wstall = 1; + `USED_REGS (1'b0, rs1); end 3'h3: begin op_type = `OP_BITS'(`GPU_JOIN); is_join = 1; end 3'h4: begin - op_type = `OP_BITS'(`GPU_BAR); - use_rs1 = 1; - use_rs2 = 1; + op_type = `OP_BITS'(`GPU_BAR); is_wstall = 1; + `USED_REGS (1'b0, rs1); + `USED_REGS (1'b0, rs2); end default:; endcase @@ -366,10 +396,7 @@ module VX_decode #( end // disable write to integer register r0 - wire use_rd_qual = use_rd && (rd_fp || (rd != 0)); - - // EX_ALU needs rs1=0 for LUI operation - wire [4:0] rs1_qual = (opcode == `INST_LUI) ? 5'h0 : rs1; + wire wb = use_rd && (rd_fp || (rd_r != 0)); assign decode_if.valid = ifetch_rsp_if.valid; assign decode_if.wid = ifetch_rsp_if.wid; @@ -378,31 +405,27 @@ module VX_decode #( assign decode_if.ex_type = ex_type; assign decode_if.op_type = op_type; assign decode_if.op_mod = op_mod; - assign decode_if.wb = use_rd_qual; + assign decode_if.wb = wb; - `ifdef EXT_F_ENABLE - assign decode_if.rd = {rd_fp, rd}; - assign decode_if.rs1 = {rs1_fp, rs1_qual}; - assign decode_if.rs2 = {rs2_fp, rs2}; - assign decode_if.rs3 = {1'b1, rs3}; - `else - `UNUSED_VAR (rd_fp) - `UNUSED_VAR (rs1_fp) - `UNUSED_VAR (rs2_fp) - assign decode_if.rd = rd; - assign decode_if.rs1 = rs1_qual; - assign decode_if.rs2 = rs2; - assign decode_if.rs3 = rs3; - `endif +`ifdef EXT_F_ENABLE + assign decode_if.rd = {rd_fp, rd_r}; + assign decode_if.rs1 = {rs1_fp, rs1_r}; + assign decode_if.rs2 = {rs2_fp, rs2_r}; + assign decode_if.rs3 = {1'b1, rs3_r}; +`else + `UNUSED_VAR (rd_fp) + `UNUSED_VAR (rs1_fp) + `UNUSED_VAR (rs2_fp) + assign decode_if.rd = rd_r; + assign decode_if.rs1 = rs1_r; + assign decode_if.rs2 = rs2_r; + assign decode_if.rs3 = rs3_r; +`endif - assign decode_if.imm = imm; - assign decode_if.use_PC = use_PC; - assign decode_if.use_imm = use_imm; - - assign decode_if.used_regs = (`NUM_REGS'(use_rd) << decode_if.rd) - | (`NUM_REGS'(use_rs1) << decode_if.rs1) - | (`NUM_REGS'(use_rs2) << decode_if.rs2) - | (`NUM_REGS'(use_rs3) << decode_if.rs3); + assign decode_if.imm = imm; + assign decode_if.use_PC = use_PC; + assign decode_if.use_imm = use_imm; + assign decode_if.used_regs = used_regs; /////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_fpu_unit.v b/hw/rtl/VX_fpu_unit.v index 3b6dd20e..f0fbca65 100644 --- a/hw/rtl/VX_fpu_unit.v +++ b/hw/rtl/VX_fpu_unit.v @@ -54,7 +54,8 @@ module VX_fpu_unit #( .write_data ({fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.rd, fpu_req_if.wb}), .read_data ({rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb}), .release_slot (fpuq_pop), - .full (fpuq_full) + .full (fpuq_full), + `UNUSED_PIN (empty) ); // can accept new request? diff --git a/hw/rtl/VX_ibuffer.v b/hw/rtl/VX_ibuffer.v index a67c7221..9de55ab2 100644 --- a/hw/rtl/VX_ibuffer.v +++ b/hw/rtl/VX_ibuffer.v @@ -82,8 +82,7 @@ module VX_ibuffer #( if (writing && is_slot0) begin q_data_out[i] <= q_data_in; - end - if (pop) begin + end else if (pop) begin q_data_out[i] <= q_data_prev[i]; end end diff --git a/hw/rtl/VX_instr_demux.v b/hw/rtl/VX_instr_demux.v index 36164b94..85f0bbf5 100644 --- a/hw/rtl/VX_instr_demux.v +++ b/hw/rtl/VX_instr_demux.v @@ -38,7 +38,8 @@ module VX_instr_demux ( wire alu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_ALU); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `ALU_BITS + `MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `ALU_BITS + `MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)), + .BUFFERED (1) ) alu_buffer ( .clk (clk), .reset (reset), @@ -55,7 +56,8 @@ module VX_instr_demux ( wire lsu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_LSU); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `LSU_BITS + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32)) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `LSU_BITS + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32)), + .BUFFERED (1) ) lsu_buffer ( .clk (clk), .reset (reset), @@ -72,7 +74,8 @@ module VX_instr_demux ( wire csr_req_valid = execute_if.valid && (execute_if.ex_type == `EX_CSR); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NR_BITS + 32) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NR_BITS + 32), + .BUFFERED (1) ) csr_buffer ( .clk (clk), .reset (reset), @@ -90,7 +93,8 @@ module VX_instr_demux ( wire fpu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_FPU); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + `FPU_BITS + `MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + `FPU_BITS + `MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)), + .BUFFERED (1) ) fpu_buffer ( .clk (clk), .reset (reset), @@ -111,7 +115,8 @@ module VX_instr_demux ( wire gpu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_GPU); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32 + 32)) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32 + 32)), + .BUFFERED (1) ) gpu_buffer ( .clk (clk), .reset (reset), diff --git a/hw/rtl/VX_issue.v b/hw/rtl/VX_issue.v index 7f35602b..9ea7093b 100644 --- a/hw/rtl/VX_issue.v +++ b/hw/rtl/VX_issue.v @@ -183,19 +183,44 @@ module VX_issue #( `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin if (alu_req_if.valid && alu_req_if.ready) begin - $display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=ALU, tmask=%b, rd=%0d, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, alu_req_if.wid, alu_req_if.PC, alu_req_if.tmask, alu_req_if.rd, alu_req_if.rs1_data, alu_req_if.rs2_data); + $write("%t: core%0d-issue: wid=%0d, PC=%0h, ex=ALU, tmask=%b, rd=%0d, rs1_data=", + $time, CORE_ID, alu_req_if.wid, alu_req_if.PC, alu_req_if.tmask, alu_req_if.rd); + `PRINT_ARRAY1D(alu_req_if.rs1_data, `NUM_THREADS); + $write(", rs2_data="); + `PRINT_ARRAY1D(alu_req_if.rs2_data, `NUM_THREADS); + $write("\n"); end if (lsu_req_if.valid && lsu_req_if.ready) begin - $display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=LSU, tmask=%b, rd=%0d, baddr=%0h, offset=%0h, data=%0h", $time, CORE_ID, lsu_req_if.wid, lsu_req_if.PC, lsu_req_if.tmask, lsu_req_if.rd, lsu_req_if.base_addr, lsu_req_if.offset, lsu_req_if.store_data); + $write("%t: core%0d-issue: wid=%0d, PC=%0h, ex=LSU, tmask=%b, rd=%0d, offset=%0h, addr=", + $time, CORE_ID, lsu_req_if.wid, lsu_req_if.PC, lsu_req_if.tmask, lsu_req_if.rd, lsu_req_if.offset); + `PRINT_ARRAY1D(lsu_req_if.base_addr, `NUM_THREADS); + $write(", data="); + `PRINT_ARRAY1D(lsu_req_if.store_data, `NUM_THREADS); + $write("\n"); end if (csr_req_if.valid && csr_req_if.ready) begin - $display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=CSR, tmask=%b, rd=%0d, addr=%0h, rs1_data=%0h", $time, CORE_ID, csr_req_if.wid, csr_req_if.PC, csr_req_if.tmask, csr_req_if.rd, csr_req_if.addr, csr_req_if.rs1_data); + $write("%t: core%0d-issue: wid=%0d, PC=%0h, ex=CSR, tmask=%b, rd=%0d, addr=%0h, rs1_data=", + $time, CORE_ID, csr_req_if.wid, csr_req_if.PC, csr_req_if.tmask, csr_req_if.rd, csr_req_if.addr); + `PRINT_ARRAY1D(csr_req_if.rs1_data, `NUM_THREADS); + $write("\n"); end if (fpu_req_if.valid && fpu_req_if.ready) begin - $display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=FPU, tmask=%b, rd=%0d, rs1_data=%0h, rs2_data=%0h, rs3_data=%0h", $time, CORE_ID, fpu_req_if.wid, fpu_req_if.PC, fpu_req_if.tmask, fpu_req_if.rd, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data); + $write("%t: core%0d-issue: wid=%0d, PC=%0h, ex=FPU, tmask=%b, rd=%0d, rs1_data=", + $time, CORE_ID, fpu_req_if.wid, fpu_req_if.PC, fpu_req_if.tmask, fpu_req_if.rd); + `PRINT_ARRAY1D(fpu_req_if.rs1_data, `NUM_THREADS); + $write(", rs2_data="); + `PRINT_ARRAY1D(fpu_req_if.rs2_data, `NUM_THREADS); + $write(", rs3_data="); + `PRINT_ARRAY1D(fpu_req_if.rs3_data, `NUM_THREADS); + $write("\n"); end if (gpu_req_if.valid && gpu_req_if.ready) begin - $display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=GPU, tmask=%b, rd=%0d, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, gpu_req_if.wid, gpu_req_if.PC, gpu_req_if.tmask, gpu_req_if.rd, gpu_req_if.rs1_data, gpu_req_if.rs2_data); + $write("%t: core%0d-issue: wid=%0d, PC=%0h, ex=GPU, tmask=%b, rd=%0d, rs1_data=", + $time, CORE_ID, gpu_req_if.wid, gpu_req_if.PC, gpu_req_if.tmask, gpu_req_if.rd); + `PRINT_ARRAY1D(gpu_req_if.rs1_data, `NUM_THREADS); + $write(", rs2_data="); + `PRINT_ARRAY1D(gpu_req_if.rs2_data, `NUM_THREADS); + $write("\n"); end end `endif diff --git a/hw/rtl/VX_lsu_unit.v b/hw/rtl/VX_lsu_unit.v index 01a5f00b..6f393ed3 100644 --- a/hw/rtl/VX_lsu_unit.v +++ b/hw/rtl/VX_lsu_unit.v @@ -75,10 +75,11 @@ module VX_lsu_unit #( `UNUSED_VAR (rsp_type) reg [`LSUQ_SIZE-1:0][`NUM_THREADS-1:0] rsp_rem_mask; - reg [`NUM_THREADS-1:0] rsp_rem_mask_n; + wire [`NUM_THREADS-1:0] rsp_rem_mask_n; + wire [`NUM_THREADS-1:0] rsp_tmask; reg [`NUM_THREADS-1:0] req_sent_mask; - wire req_sent_all; + wire sent_all_ready; wire [`DCORE_TAG_ID_BITS-1:0] mbuf_waddr, mbuf_raddr; wire mbuf_full; @@ -88,18 +89,20 @@ module VX_lsu_unit #( assign req_offset[i] = req_addr[i][1:0]; end - wire mbuf_push = (| (dcache_req_if.valid & dcache_req_if.ready)) + wire [`NUM_THREADS-1:0] dcache_req_fire = dcache_req_if.valid & dcache_req_if.ready; + + wire dcache_rsp_fire = (| dcache_rsp_if.valid) && dcache_rsp_if.ready; + + wire mbuf_push = (| dcache_req_fire) && (0 == req_sent_mask) // first submission only && req_wb; // loads only - wire mbuf_pop_part = (| dcache_rsp_if.valid) && dcache_rsp_if.ready; - - wire mbuf_pop = mbuf_pop_part && (rsp_rem_mask_n == 0 || rsp_is_dup); + wire mbuf_pop = dcache_rsp_fire && (0 == rsp_rem_mask_n); assign mbuf_raddr = dcache_rsp_if.tag[`DCORE_TAG_ID_BITS-1:0]; VX_index_buffer #( - .DATAW (`NW_BITS + 32 + `NR_BITS + 1 + `LSU_BITS + (`NUM_THREADS * 2) + 1), + .DATAW (`NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `LSU_BITS + (`NUM_THREADS * 2) + 1), .SIZE (`LSUQ_SIZE) ) req_metadata ( .clk (clk), @@ -107,26 +110,34 @@ module VX_lsu_unit #( .write_addr (mbuf_waddr), .acquire_slot (mbuf_push), .read_addr (mbuf_raddr), - .write_data ({req_wid, req_pc, req_rd, req_wb, req_type, req_offset, req_is_dup}), - .read_data ({rsp_wid, rsp_pc, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup}), + .write_data ({req_wid, req_pc, req_tmask, req_rd, req_wb, req_type, req_offset, req_is_dup}), + .read_data ({rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup}), .release_addr (mbuf_raddr), .release_slot (mbuf_pop), - .full (mbuf_full) + .full (mbuf_full), + `UNUSED_PIN (empty) ); - assign req_sent_all = (&(dcache_req_if.ready | req_sent_mask | ~req_tmask)) - || (req_is_dup && dcache_req_if.ready[0]); + always @(posedge clk) begin + if (mbuf_push) begin + pending_tags[mbuf_waddr] <= req_tag; + end + end + + assign sent_all_ready = &(dcache_req_if.ready | req_sent_mask); + + wire [`NUM_THREADS-1:0] req_sent_dup = {{(`NUM_THREADS-1){dcache_req_fire[0] && req_is_dup}}, 1'b0}; always @(posedge clk) begin if (reset) begin req_sent_mask <= 0; end else begin - if (req_sent_all) + if (sent_all_ready) req_sent_mask <= 0; else - req_sent_mask <= req_sent_mask | (dcache_req_if.valid & dcache_req_if.ready); + req_sent_mask <= req_sent_mask | dcache_req_fire | req_sent_dup; end - end + end // need to hold the acquired tag index until the full request is submitted reg [`DCORE_TAG_ID_BITS-1:0] req_tag_hold; @@ -136,20 +147,21 @@ module VX_lsu_unit #( req_tag_hold <= mbuf_waddr; end + wire [`NUM_THREADS-1:0] req_tmask_dup = req_tmask & {{(`NUM_THREADS-1){~req_is_dup}}, 1'b1}; + assign rsp_rem_mask_n = rsp_rem_mask[mbuf_raddr] & ~dcache_rsp_if.valid; + always @(posedge clk) begin if (mbuf_push) begin - rsp_rem_mask[mbuf_waddr] <= req_tmask; - pending_tags[mbuf_waddr] <= req_tag; + rsp_rem_mask[mbuf_waddr] <= req_tmask_dup; end - if (mbuf_pop_part) begin + if (dcache_rsp_fire) begin rsp_rem_mask[mbuf_raddr] <= rsp_rem_mask_n; end end - wire req_ready_dep = (req_wb && ~mbuf_full) || (~req_wb && st_commit_if.ready); - - wire [`NUM_THREADS-1:0] dup_mask = {{(`NUM_THREADS-1){~req_is_dup}}, 1'b1}; + wire req_ready_dep = (req_wb && ~mbuf_full) + || (~req_wb && st_commit_if.ready); // DCache Request @@ -181,23 +193,23 @@ module VX_lsu_unit #( end end - assign dcache_req_if.valid = {`NUM_THREADS{req_valid && req_ready_dep}} & req_tmask & dup_mask & ~req_sent_mask; + assign dcache_req_if.valid = {`NUM_THREADS{req_valid && req_ready_dep}} & req_tmask_dup & ~req_sent_mask; assign dcache_req_if.rw = {`NUM_THREADS{~req_wb}}; assign dcache_req_if.addr = mem_req_addr; assign dcache_req_if.byteen = mem_req_byteen; assign dcache_req_if.data = mem_req_data; `ifdef DBG_CACHE_REQ_INFO - assign dcache_req_if.tag = {`NUM_THREADS{{req_pc, req_wid, req_tag}}}; + assign dcache_req_if.tag = {`NUM_THREADS{req_pc, req_wid, req_tag}}; `else assign dcache_req_if.tag = {`NUM_THREADS{req_tag}}; `endif - assign ready_in = req_ready_dep && req_sent_all; + assign ready_in = req_ready_dep && sent_all_ready; // send store commit - wire is_store_rsp = req_valid && ~req_wb && req_sent_all; + wire is_store_rsp = req_valid && ~req_wb && sent_all_ready; assign st_commit_if.valid = is_store_rsp; assign st_commit_if.wid = req_wid; @@ -211,7 +223,7 @@ module VX_lsu_unit #( // load response formatting reg [`NUM_THREADS-1:0][31:0] rsp_data; - wire [`NUM_THREADS-1:0] rsp_tmask; + wire [`NUM_THREADS-1:0] rsp_tmask_qual; for (genvar i = 0; i < `NUM_THREADS; i++) begin wire [31:0] src_data = (i == 0 || rsp_is_dup) ? dcache_rsp_if.data[0] : dcache_rsp_if.data[i]; @@ -234,7 +246,7 @@ module VX_lsu_unit #( end end - assign rsp_tmask = rsp_is_dup ? rsp_rem_mask[mbuf_raddr] : dcache_rsp_if.valid; + assign rsp_tmask_qual = rsp_is_dup ? rsp_tmask : dcache_rsp_if.valid; // send load commit @@ -247,15 +259,15 @@ module VX_lsu_unit #( .clk (clk), .reset (reset), .enable (!load_rsp_stall), - .data_in ({(| dcache_rsp_if.valid), rsp_wid, rsp_tmask, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}), - .data_out ({ld_commit_if.valid, ld_commit_if.wid, ld_commit_if.tmask, ld_commit_if.PC, ld_commit_if.rd, ld_commit_if.wb, ld_commit_if.data, ld_commit_if.eop}) + .data_in ({(| dcache_rsp_if.valid), rsp_wid, rsp_tmask_qual, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}), + .data_out ({ld_commit_if.valid, ld_commit_if.wid, ld_commit_if.tmask, ld_commit_if.PC, ld_commit_if.rd, ld_commit_if.wb, ld_commit_if.data, ld_commit_if.eop}) ); // Can accept new cache response? assign dcache_rsp_if.ready = ~load_rsp_stall; // scope registration - `SCOPE_ASSIGN (dcache_req_fire, dcache_req_if.valid & dcache_req_if.ready); + `SCOPE_ASSIGN (dcache_req_fire, dcache_req_fire); `SCOPE_ASSIGN (dcache_req_wid, req_wid); `SCOPE_ASSIGN (dcache_req_pc, req_pc); `SCOPE_ASSIGN (dcache_req_addr, req_addr); @@ -269,15 +281,15 @@ module VX_lsu_unit #( `ifdef DBG_PRINT_CORE_DCACHE always @(posedge clk) begin - if ((| (dcache_req_if.valid & dcache_req_if.ready))) begin + if ((| dcache_req_fire)) begin if ((| dcache_req_if.rw)) $display("%t: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, addr=%0h, tag=%0h, byteen=%0h, data=%0h", - $time, CORE_ID, req_wid, req_pc, (dcache_req_if.valid & dcache_req_if.ready), req_addr, dcache_req_if.tag, dcache_req_if.byteen, dcache_req_if.data); + $time, CORE_ID, req_wid, req_pc, dcache_req_fire, req_addr, dcache_req_if.tag, dcache_req_if.byteen, dcache_req_if.data); else $display("%t: D$%0d Rd Req: wid=%0d, PC=%0h, tmask=%b, addr=%0h, tag=%0h, byteen=%0h, rd=%0d, is_dup=%b", - $time, CORE_ID, req_wid, req_pc, (dcache_req_if.valid & dcache_req_if.ready), req_addr, dcache_req_if.tag, dcache_req_if.byteen, req_rd, req_is_dup); + $time, CORE_ID, req_wid, req_pc, dcache_req_fire, req_addr, dcache_req_if.tag, dcache_req_if.byteen, req_rd, req_is_dup); end - if ((| dcache_rsp_if.valid) && dcache_rsp_if.ready) begin + if (dcache_rsp_fire) begin $display("%t: D$%0d Rsp: valid=%b, wid=%0d, PC=%0h, tag=%0h, rd=%0d, data=%0h, is_dup=%b", $time, CORE_ID, dcache_rsp_if.valid, rsp_wid, rsp_pc, dcache_rsp_if.tag, rsp_rd, dcache_rsp_if.data, rsp_is_dup); end @@ -291,4 +303,4 @@ module VX_lsu_unit #( end `endif -endmodule +endmodule \ No newline at end of file diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh index 5ec9c74a..312d3d58 100644 --- a/hw/rtl/VX_platform.vh +++ b/hw/rtl/VX_platform.vh @@ -81,4 +81,25 @@ `define LTRIM(x,s) x[s-1:0] +`define PRINT_ARRAY1D(a, m) \ + $write("{"); \ + for (integer i = (m-1); i >= 0; --i) begin \ + if (i != (m-1)) $write(", "); \ + $write("0x%0h", a[i]); \ + end \ + $write("}"); \ + +`define PRINT_ARRAY2D(a, m, n) \ + $write("{"); \ + for (integer i = n-1; i >= 0; --i) begin \ + if (i != (n-1)) $write(", "); \ + $write("{"); \ + for (integer j = (m-1); j >= 0; --j) begin \ + if (j != (m-1)) $write(", "); \ + $write("0x%0h", a[i][j]); \ + end \ + $write("}"); \ + end \ + $write("}") + `endif \ No newline at end of file diff --git a/hw/rtl/Vortex.v b/hw/rtl/Vortex.v index 17c67db5..2f2d34e9 100644 --- a/hw/rtl/Vortex.v +++ b/hw/rtl/Vortex.v @@ -121,7 +121,7 @@ module Vortex ( .NUM_REQS (`NUM_CLUSTERS), .DATA_WIDTH (32), .ADDR_WIDTH (12), - .BUFFERED_REQ (`NUM_CLUSTERS >= 4), + .BUFFERED_REQ (1), .BUFFERED_RSP (1) ) csr_arb ( .clk (clk), @@ -228,7 +228,7 @@ module Vortex ( .TAG_IN_WIDTH (`L2DRAM_TAG_WIDTH), .TAG_OUT_WIDTH (`L3DRAM_TAG_WIDTH), .BUFFERED_REQ (1), - .BUFFERED_RSP (`NUM_CLUSTERS >= 4) + .BUFFERED_RSP (1) ) dram_arb ( .clk (clk), .reset (reset), diff --git a/hw/rtl/afu/vortex_afu.sv b/hw/rtl/afu/vortex_afu.sv index f6c0dc12..4d6cae37 100644 --- a/hw/rtl/afu/vortex_afu.sv +++ b/hw/rtl/afu/vortex_afu.sv @@ -1,14 +1,13 @@ `include "VX_define.vh" `ifndef NOPAE -import local_mem_cfg_pkg::*; `include "afu_json_info.vh" `else `include "vortex_afu.vh" +`endif /* verilator lint_off IMPORTSTAR */ import ccip_if_pkg::*; import local_mem_cfg_pkg::*; /* verilator lint_on IMPORTSTAR */ -`endif module vortex_afu #( parameter NUM_LOCAL_MEM_BANKS = 2 diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index 7629b264..5e62ab26 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -487,7 +487,8 @@ module VX_bank #( end VX_skid_buffer #( - .DATAW (CORE_TAG_WIDTH + (1 + `WORD_WIDTH + `REQS_BITS) * NUM_PORTS) + .DATAW (CORE_TAG_WIDTH + (1 + `WORD_WIDTH + `REQS_BITS) * NUM_PORTS), + .BUFFERED (NUM_BANKS == 1) ) core_rsp_req ( .clk (clk), .reset (reset), diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index 63133664..f0bf1d9d 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -168,8 +168,7 @@ module VX_cache #( .NUM_BANKS (NUM_BANKS) ) flush_ctrl ( .clk (clk), - .reset (reset), - .flush (flush), + .reset (reset || flush), .addr_out (flush_addr), .valid_out (flush_enable) ); diff --git a/hw/rtl/cache/VX_cache_core_rsp_merge.v b/hw/rtl/cache/VX_cache_core_rsp_merge.v index dad89c15..4bed779d 100644 --- a/hw/rtl/cache/VX_cache_core_rsp_merge.v +++ b/hw/rtl/cache/VX_cache_core_rsp_merge.v @@ -98,7 +98,8 @@ module VX_cache_core_rsp_merge #( wire core_rsp_valid_any = (| per_bank_core_rsp_valid); VX_skid_buffer #( - .DATAW (NUM_REQS + CORE_TAG_WIDTH + (NUM_REQS *`WORD_WIDTH)) + .DATAW (NUM_REQS + CORE_TAG_WIDTH + (NUM_REQS *`WORD_WIDTH)), + .BUFFERED (1) ) pipe_reg ( .clk (clk), .reset (reset), @@ -146,7 +147,8 @@ module VX_cache_core_rsp_merge #( for (genvar i = 0; i < NUM_REQS; i++) begin VX_skid_buffer #( - .DATAW (CORE_TAG_WIDTH + `WORD_WIDTH) + .DATAW (CORE_TAG_WIDTH + `WORD_WIDTH), + .BUFFERED (1) ) pipe_reg ( .clk (clk), .reset (reset), diff --git a/hw/rtl/cache/VX_flush_ctrl.v b/hw/rtl/cache/VX_flush_ctrl.v index 261197b6..d9b7ca9d 100644 --- a/hw/rtl/cache/VX_flush_ctrl.v +++ b/hw/rtl/cache/VX_flush_ctrl.v @@ -9,8 +9,7 @@ module VX_flush_ctrl #( parameter NUM_BANKS = 1 ) ( input wire clk, - input wire reset, - input wire flush, + input wire reset, output wire [`LINE_SELECT_BITS-1:0] addr_out, output wire valid_out ); @@ -18,7 +17,7 @@ module VX_flush_ctrl #( reg [`LINE_SELECT_BITS-1:0] flush_ctr; always @(posedge clk) begin - if (reset || flush) begin + if (reset) begin flush_enable <= 1; flush_ctr <= 0; end else begin diff --git a/hw/rtl/fp_cores/VX_fp_cvt.v b/hw/rtl/fp_cores/VX_fp_cvt.v index 4e030f98..a719ee55 100644 --- a/hw/rtl/fp_cores/VX_fp_cvt.v +++ b/hw/rtl/fp_cores/VX_fp_cvt.v @@ -3,10 +3,6 @@ /// Modified port of cast module from fpnew Libray /// reference: https://github.com/pulp-platform/fpnew -`ifndef SYNTHESIS -`include "float_dpi.vh" -`endif - module VX_fp_cvt #( parameter TAGW = 1, parameter LANES = 1 @@ -73,19 +69,19 @@ module VX_fp_cvt #( ); end - wire [LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant; // input mantissa with implicit bit - wire signed [LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent; - wire [LANES-1:0] input_sign; + wire [LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant; // input mantissa with implicit bit + wire [LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent; + wire [LANES-1:0] input_sign; for (genvar i = 0; i < LANES; ++i) begin wire [INT_MAN_WIDTH-1:0] int_mantissa; wire [INT_MAN_WIDTH-1:0] fmt_mantissa; wire fmt_sign = dataa[i][31]; wire int_sign = dataa[i][31] & is_signed; - assign int_mantissa = int_sign ? $unsigned(-dataa[i]) : dataa[i]; + assign int_mantissa = int_sign ? (-dataa[i]) : dataa[i]; assign fmt_mantissa = INT_MAN_WIDTH'({in_a_type[i].is_normal, dataa[i][MAN_BITS-1:0]}); - assign fmt_exponent[i] = $signed({1'b0, dataa[i][MAN_BITS+EXP_BITS-1:MAN_BITS]}); + assign fmt_exponent[i] = {1'b0, dataa[i][MAN_BITS+EXP_BITS-1:MAN_BITS]}; assign encoded_mant[i] = is_itof ? int_mantissa : fmt_mantissa; assign input_sign[i] = is_itof ? int_sign : fmt_sign; end @@ -115,7 +111,7 @@ module VX_fp_cvt #( wire [2:0] rnd_mode_s0; fp_type_t [LANES-1:0] in_a_type_s0; wire [LANES-1:0] input_sign_s0; - wire signed [LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent_s0; + wire [LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent_s0; wire [LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant_s0; wire [LANES-1:0][LZC_RESULT_WIDTH-1:0] renorm_shamt_s0; wire [LANES-1:0] mant_is_zero_s0; @@ -135,38 +131,93 @@ module VX_fp_cvt #( // Normalization - wire [LANES-1:0][INT_MAN_WIDTH-1:0] input_mant; // normalized input mantissa - wire signed [LANES-1:0][INT_EXP_WIDTH-1:0] input_exp; // unbiased true exponent - wire signed [LANES-1:0][INT_EXP_WIDTH-1:0] destination_exp; // re-biased exponent for destination + wire [LANES-1:0][INT_MAN_WIDTH-1:0] input_mant; // normalized input mantissa + wire [LANES-1:0][INT_EXP_WIDTH-1:0] input_exp; // unbiased true exponent + wire [LANES-1:0][INT_EXP_WIDTH-1:0] destination_exp; // re-biased exponent for destination for (genvar i = 0; i < LANES; ++i) begin `IGNORE_WARNINGS_BEGIN // Input mantissa needs to be normalized - wire signed [INT_EXP_WIDTH-1:0] fp_input_exp; - wire signed [INT_EXP_WIDTH-1:0] int_input_exp; - wire [LZC_RESULT_WIDTH:0] renorm_shamt_sgn; - - // signed form for calculations - assign renorm_shamt_sgn = $signed({1'b0, renorm_shamt_s0[i]}); + wire [INT_EXP_WIDTH-1:0] fp_input_exp; + wire [INT_EXP_WIDTH-1:0] int_input_exp; // Realign input mantissa, append zeroes if destination is wider assign input_mant[i] = encoded_mant_s0[i] << renorm_shamt_s0[i]; // Unbias exponent and compensate for shift - assign fp_input_exp = $signed(fmt_exponent_s0[i] + - (($signed({1'b0, in_a_type_s0[i].is_subnormal}) + - $signed(FMT_SHIFT_COMPENSATION - EXP_BIAS)) - - renorm_shamt_sgn)); + assign fp_input_exp = fmt_exponent_s0[i] + + {1'b0, in_a_type_s0[i].is_subnormal} + + (FMT_SHIFT_COMPENSATION - EXP_BIAS) - + {1'b0, renorm_shamt_s0[i]}; - assign int_input_exp = $signed(INT_MAN_WIDTH - 1 - renorm_shamt_sgn); + assign int_input_exp = (INT_MAN_WIDTH-1) - {1'b0, renorm_shamt_s0[i]}; - assign input_exp[i] = is_itof_s0 ? int_input_exp : fp_input_exp; + assign input_exp[i] = is_itof_s0 ? int_input_exp : fp_input_exp; // Rebias the exponent - assign destination_exp[i] = input_exp[i] + $signed(EXP_BIAS); + assign destination_exp[i] = input_exp[i] + EXP_BIAS; `IGNORE_WARNINGS_END end + // Perform adjustments to mantissa and exponent + + wire [LANES-1:0][2*INT_MAN_WIDTH:0] preshift_mant_s0; + wire [LANES-1:0][SHAMT_BITS-1:0] denorm_shamt_s0; + wire [LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s0; + wire [LANES-1:0] of_before_round_s0; + + for (genvar i = 0; i < LANES; ++i) begin + reg [2*INT_MAN_WIDTH:0] preshift_mant; // mantissa before final shift + reg [SHAMT_BITS-1:0] denorm_shamt; // shift amount for denormalization + reg [INT_EXP_WIDTH-1:0] final_exp; // after eventual adjustments + reg of_before_round; + + always @(*) begin + `IGNORE_WARNINGS_BEGIN + // Default assignment + final_exp = destination_exp[i]; // take exponent as is, only look at lower bits + preshift_mant = {input_mant[i], 33'b0}; // Place mantissa to the left of the shifter + denorm_shamt = 0; // right of mantissa + of_before_round = 1'b0; + + // Handle INT casts + if (is_itof_s0) begin + if ($signed(destination_exp[i]) >= $signed(2**EXP_BITS-1)) begin + // Overflow or infinities (for proper rounding) + final_exp = (2**EXP_BITS-2); // largest normal value + preshift_mant = ~0; // largest normal value and RS bits set + of_before_round = 1'b1; + end else if ($signed(destination_exp[i]) < $signed(-MAN_BITS)) begin + // Limit the shift to retain sticky bits + final_exp = 0; // denormal result + denorm_shamt = denorm_shamt + (2 + MAN_BITS); // to sticky + end else if ($signed(destination_exp[i]) < $signed(1)) begin + // Denormalize underflowing values + final_exp = 0; // denormal result + denorm_shamt = denorm_shamt + 1 - destination_exp[i]; // adjust right shifting + end + end else begin + if ($signed(input_exp[i]) >= $signed((MAX_INT_WIDTH-1) + unsigned_s0)) begin + // overflow: when converting to unsigned the range is larger by one + denorm_shamt = SHAMT_BITS'(0); // prevent shifting + of_before_round = 1'b1; + end else if ($signed(input_exp[i]) < $signed(-1)) begin + // underflow + denorm_shamt = MAX_INT_WIDTH + 1; // all bits go to the sticky + end else begin + // By default right shift mantissa to be an integer + denorm_shamt = (MAX_INT_WIDTH-1) - input_exp[i]; + end + end + `IGNORE_WARNINGS_END + end + + assign preshift_mant_s0[i] = preshift_mant; + assign denorm_shamt_s0[i] = denorm_shamt; + assign final_exp_s0[i] = final_exp; + assign of_before_round_s0[i] = of_before_round; + end + // Pipeline stage1 wire valid_in_s1; @@ -176,121 +227,68 @@ module VX_fp_cvt #( wire [2:0] rnd_mode_s1; fp_type_t [LANES-1:0] in_a_type_s1; wire [LANES-1:0] mant_is_zero_s1; - wire [LANES-1:0] input_sign_s1; - wire signed [LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_s1; - wire signed [LANES-1:0][INT_EXP_WIDTH-1:0] destination_exp_s1; - wire [LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_s1; - + wire [LANES-1:0] input_sign_s1; + wire [LANES-1:0][2*INT_MAN_WIDTH:0] preshift_mant_s1; + wire [LANES-1:0][SHAMT_BITS-1:0] denorm_shamt_s1; + wire [LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s1; + wire [LANES-1:0] of_before_round_s1; + VX_pipe_register #( - .DATAW (1 + TAGW + 1 + `FRM_BITS + 1 + LANES * ($bits(fp_type_t) + 1 + 1 + INT_MAN_WIDTH + 2*INT_EXP_WIDTH)), + .DATAW (1 + TAGW + 1 + 1 + `FRM_BITS + LANES * ($bits(fp_type_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + SHAMT_BITS + INT_EXP_WIDTH + 1)), .RESETW (1) ) pipe_reg1 ( .clk (clk), .reset (reset), .enable (~stall), - .data_in ({valid_in_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, in_a_type_s0, mant_is_zero_s0, input_sign_s0, input_mant, input_exp, destination_exp}), - .data_out ({valid_in_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, in_a_type_s1, mant_is_zero_s1, input_sign_s1, input_mant_s1, input_exp_s1, destination_exp_s1}) + .data_in ({valid_in_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, in_a_type_s0, mant_is_zero_s0, input_sign_s0, preshift_mant_s0, denorm_shamt_s0, final_exp_s0, of_before_round_s0}), + .data_out ({valid_in_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, in_a_type_s1, mant_is_zero_s1, input_sign_s1, preshift_mant_s1, denorm_shamt_s1, final_exp_s1, of_before_round_s1}) ); - // Casting - reg [LANES-1:0][INT_EXP_WIDTH-1:0] final_exp; // after eventual adjustments - - reg [LANES-1:0][2*INT_MAN_WIDTH:0] preshift_mant; // mantissa before final shift - wire [LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant; // mantissa from shifter, with rnd bit - wire [LANES-1:0][MAN_BITS-1:0] final_mant; // mantissa after adjustments - wire [LANES-1:0][MAX_INT_WIDTH-1:0] final_int; // integer shifted in position - - reg [LANES-1:0][SHAMT_BITS-1:0] denorm_shamt; // shift amount for denormalization - - wire [LANES-1:0][1:0] fp_round_sticky_bits, int_round_sticky_bits, round_sticky_bits; - reg [LANES-1:0] of_before_round; - - // Perform adjustments to mantissa and exponent + wire [LANES-1:0] rounded_sign; + wire [LANES-1:0][31:0] rounded_abs; // absolute value of result after rounding + wire [LANES-1:0][1:0] fp_round_sticky_bits, int_round_sticky_bits; + + // Rouding and classification + for (genvar i = 0; i < LANES; ++i) begin - always @(*) begin - `IGNORE_WARNINGS_BEGIN - // Default assignment - final_exp[i] = $unsigned(destination_exp_s1[i]); // take exponent as is, only look at lower bits - preshift_mant[i] = 65'b0; // initialize mantissa container with zeroes - denorm_shamt[i] = 0; // right of mantissa - of_before_round[i] = 1'b0; - - // Place mantissa to the left of the shifter - preshift_mant[i] = {input_mant_s1[i], 33'b0}; - - // Handle INT casts - if (is_itof_s1) begin - // Overflow or infinities (for proper rounding) - if ($signed(destination_exp_s1[i]) >= $signed(2**EXP_BITS-1)) begin - final_exp[i] = (2**EXP_BITS-2); // largest normal value - preshift_mant[i] = ~0; // largest normal value and RS bits set - of_before_round[i] = 1'b1; - // Denormalize underflowing values - end else if (($signed(destination_exp_s1[i]) < $signed(1)) - && ($signed(destination_exp_s1[i]) >= -$signed(MAN_BITS))) begin - final_exp[i] = 0; // denormal result - denorm_shamt[i] = $unsigned(denorm_shamt[i] + 1 - destination_exp_s1[i]); // adjust right shifting - // Limit the shift to retain sticky bits - end else if ($signed(destination_exp_s1[i]) < -$signed(MAN_BITS)) begin - final_exp[i] = 0; // denormal result - denorm_shamt[i] = $unsigned(denorm_shamt[i] + (2 + MAN_BITS)); // to sticky - end - end else begin - // By default right shift mantissa to be an integer - denorm_shamt[i] = (MAX_INT_WIDTH-1) - input_exp_s1[i]; - // overflow: when converting to unsigned the range is larger by one - if ($signed(input_exp_s1[i]) >= $signed(MAX_INT_WIDTH -1 + unsigned_s1)) begin - denorm_shamt[i] = SHAMT_BITS'(0); // prevent shifting - of_before_round[i] = 1'b1; - // underflow - end else if ($signed(input_exp_s1[i]) < $signed(-1)) begin - denorm_shamt[i] = MAX_INT_WIDTH + 1; // all bits go to the sticky - end - end - `IGNORE_WARNINGS_END - end + wire [2*INT_MAN_WIDTH:0] destination_mant; + wire [MAN_BITS-1:0] final_mant; // mantissa after adjustments + wire [MAX_INT_WIDTH-1:0] final_int; // integer shifted in position + wire [1:0] round_sticky_bits; + wire [31:0] fmt_pre_round_abs; + wire [31:0] pre_round_abs; // Mantissa adjustment shift - assign destination_mant[i] = preshift_mant[i] >> denorm_shamt[i]; - + assign destination_mant = preshift_mant_s1[i] >> denorm_shamt_s1[i]; + // Extract final mantissa and round bit, discard the normal bit (for FP) - assign {final_mant[i], fp_round_sticky_bits[i][1]} = destination_mant[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1]; - assign {final_int[i], int_round_sticky_bits[i][1]} = destination_mant[i][2*INT_MAN_WIDTH : 2*INT_MAN_WIDTH - (MAX_INT_WIDTH+1) + 1]; + assign {final_mant, fp_round_sticky_bits[i][1]} = destination_mant[2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1]; + assign {final_int, int_round_sticky_bits[i][1]} = destination_mant[2*INT_MAN_WIDTH : 2*INT_MAN_WIDTH - (MAX_INT_WIDTH+1) + 1]; // Collapse sticky bits - assign fp_round_sticky_bits[i][0] = (| destination_mant[i][NUM_FP_STICKY-1:0]); - assign int_round_sticky_bits[i][0] = (| destination_mant[i][NUM_INT_STICKY-1:0]); + assign fp_round_sticky_bits[i][0] = (| destination_mant[NUM_FP_STICKY-1:0]); + assign int_round_sticky_bits[i][0] = (| destination_mant[NUM_INT_STICKY-1:0]); // select RS bits for destination operation - assign round_sticky_bits[i] = is_itof_s1 ? fp_round_sticky_bits[i] : int_round_sticky_bits[i]; - end + assign round_sticky_bits = is_itof_s1 ? fp_round_sticky_bits[i] : int_round_sticky_bits[i]; - // Rouding and classification - - wire [LANES-1:0] rounded_sign; - wire [LANES-1:0][31:0] rounded_abs; // absolute value of result after rounding - - for (genvar i = 0; i < LANES; ++i) begin // Pack exponent and mantissa into proper rounding form - wire [31:0] fmt_pre_round_abs = {1'b0, final_exp[i][EXP_BITS-1:0], final_mant[i][MAN_BITS-1:0]}; - - // Sign-extend integer result - wire [31:0] ifmt_pre_round_abs = final_int[i]; + assign fmt_pre_round_abs = {1'b0, final_exp_s1[i][EXP_BITS-1:0], final_mant[MAN_BITS-1:0]}; // Select output with destination format and operation - wire [31:0] pre_round_abs = is_itof_s1 ? fmt_pre_round_abs : ifmt_pre_round_abs; + assign pre_round_abs = is_itof_s1 ? fmt_pre_round_abs : final_int; // Perform the rounding VX_fp_rounding #( .DAT_WIDTH (32) ) fp_rounding ( - .abs_value_i (pre_round_abs), - .sign_i (input_sign_s1[i]), - .round_sticky_bits_i (round_sticky_bits[i]), - .rnd_mode_i (rnd_mode_s1), - .effective_subtraction_i (1'b0), - .abs_rounded_o (rounded_abs[i]), - .sign_o (rounded_sign[i]), + .abs_value_i (pre_round_abs), + .sign_i (input_sign_s1[i]), + .round_sticky_bits_i(round_sticky_bits), + .rnd_mode_i (rnd_mode_s1), + .effective_subtraction_i(1'b0), + .abs_rounded_o (rounded_abs[i]), + .sign_o (rounded_sign[i]), `UNUSED_PIN (exact_zero_o) ); end @@ -306,23 +304,22 @@ module VX_fp_cvt #( wire [LANES-1:0] input_sign_s2; wire [LANES-1:0] rounded_sign_s2; wire [LANES-1:0][31:0] rounded_abs_s2; + wire [LANES-1:0] of_before_round_s2; VX_pipe_register #( - .DATAW (1 + TAGW + 1 + 1 + LANES * ($bits(fp_type_t) + 1 + 1 + 32 + 1)), + .DATAW (1 + TAGW + 1 + 1 + LANES * ($bits(fp_type_t) + 1 + 1 + 32 + 1 + 1)), .RESETW (1) ) pipe_reg2 ( .clk (clk), .reset (reset), .enable (~stall), - .data_in ({valid_in_s1, tag_in_s1, is_itof_s1, unsigned_s1, in_a_type_s1, mant_is_zero_s1, input_sign_s1, rounded_abs, rounded_sign}), - .data_out ({valid_in_s2, tag_in_s2, is_itof_s2, unsigned_s2, in_a_type_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2}) + .data_in ({valid_in_s1, tag_in_s1, is_itof_s1, unsigned_s1, in_a_type_s1, mant_is_zero_s1, input_sign_s1, rounded_abs, rounded_sign, of_before_round_s1}), + .data_out ({valid_in_s2, tag_in_s2, is_itof_s2, unsigned_s2, in_a_type_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2}) ); wire [LANES-1:0] of_after_round; wire [LANES-1:0] uf_after_round; - wire [LANES-1:0][31:0] fmt_result; - wire [LANES-1:0][31:0] rounded_int_res; // after possible inversion wire [LANES-1:0] rounded_int_res_zero; // after rounding @@ -335,7 +332,7 @@ module VX_fp_cvt #( assign of_after_round[i] = (rounded_abs_s2[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == ~0); // inf exp. // Negative integer result needs to be brought into two's complement - assign rounded_int_res[i] = rounded_sign_s2[i] ? $unsigned(-rounded_abs_s2[i]) : rounded_abs_s2[i]; + assign rounded_int_res[i] = rounded_sign_s2[i] ? (-rounded_abs_s2[i]) : rounded_abs_s2[i]; assign rounded_int_res_zero[i] = (rounded_int_res[i] == 0); end @@ -373,7 +370,7 @@ module VX_fp_cvt #( int_special_result[i][30:0] = 0; // alone yields 2**(31)-1 int_special_result[i][31] = ~unsigned_s2; // for unsigned casts yields 2**31 end else begin - int_special_result[i][30:0] = 2**(31) -1; // alone yields 2**(31)-1 + int_special_result[i][30:0] = 2**(31) - 1; // alone yields 2**(31)-1 int_special_result[i][31] = unsigned_s2; // for unsigned casts yields 2**31 end end @@ -381,7 +378,7 @@ module VX_fp_cvt #( // Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned) assign int_result_is_special[i] = in_a_type_s2[i].is_nan | in_a_type_s2[i].is_inf - | of_before_round[i] + | of_before_round_s2[i] | (input_sign_s2[i] & unsigned_s2 & ~rounded_int_res_zero[i]); // All integer special cases are invalid @@ -399,11 +396,11 @@ module VX_fp_cvt #( wire [31:0] fp_result, int_result; wire inexact = is_itof_s2 ? (| fp_round_sticky_bits[i]) // overflow is invalid in i2f; - : (| fp_round_sticky_bits[i]) | (~in_a_type_s2[i].is_inf & (of_before_round[i] | of_after_round[i])); + : (| fp_round_sticky_bits[i]) | (~in_a_type_s2[i].is_inf & (of_before_round_s2[i] | of_after_round[i])); - assign fp_regular_status.NV = is_itof_s2 & (of_before_round[i] | of_after_round[i]); // overflow is invalid for I2F casts + assign fp_regular_status.NV = is_itof_s2 & (of_before_round_s2[i] | of_after_round[i]); // overflow is invalid for I2F casts assign fp_regular_status.DZ = 1'b0; // no divisions - assign fp_regular_status.OF = ~is_itof_s2 & (~in_a_type_s2[i].is_inf & (of_before_round[i] | of_after_round[i])); // inf casts no OF + assign fp_regular_status.OF = ~is_itof_s2 & (~in_a_type_s2[i].is_inf & (of_before_round_s2[i] | of_after_round[i])); // inf casts no OF assign fp_regular_status.UF = uf_after_round[i] & inexact; assign fp_regular_status.NX = inexact; diff --git a/hw/rtl/fp_cores/VX_fp_div.v b/hw/rtl/fp_cores/VX_fp_div.v index 69c8e93a..7d4f5fc4 100644 --- a/hw/rtl/fp_cores/VX_fp_div.v +++ b/hw/rtl/fp_cores/VX_fp_div.v @@ -1,5 +1,9 @@ `include "VX_define.vh" +`ifndef SYNTHESIS +`include "float_dpi.vh" +`endif + module VX_fp_div #( parameter TAGW = 1, parameter LANES = 1 diff --git a/hw/rtl/fp_cores/VX_fp_fma.v b/hw/rtl/fp_cores/VX_fp_fma.v index ce7efb24..4d095823 100644 --- a/hw/rtl/fp_cores/VX_fp_fma.v +++ b/hw/rtl/fp_cores/VX_fp_fma.v @@ -1,5 +1,9 @@ `include "VX_define.vh" +`ifndef SYNTHESIS +`include "float_dpi.vh" +`endif + module VX_fp_fma #( parameter TAGW = 1, parameter LANES = 1 diff --git a/hw/rtl/fp_cores/VX_fp_sqrt.v b/hw/rtl/fp_cores/VX_fp_sqrt.v index 869da516..5aa0f134 100644 --- a/hw/rtl/fp_cores/VX_fp_sqrt.v +++ b/hw/rtl/fp_cores/VX_fp_sqrt.v @@ -1,5 +1,9 @@ `include "VX_define.vh" +`ifndef SYNTHESIS +`include "float_dpi.vh" +`endif + module VX_fp_sqrt #( parameter TAGW = 1, parameter LANES = 1 @@ -44,7 +48,7 @@ module VX_fp_sqrt #( fflags_t f; always @(*) begin - dpi_fsqrt (dataa[i], frm, r, f); + dpi_fsqrt (dataa[i], frm, r, f); end `UNUSED_VAR (f) diff --git a/hw/rtl/fp_cores/VX_fp_type.v b/hw/rtl/fp_cores/VX_fp_type.v index df202148..bdc41b86 100644 --- a/hw/rtl/fp_cores/VX_fp_type.v +++ b/hw/rtl/fp_cores/VX_fp_type.v @@ -10,7 +10,7 @@ module VX_fp_type ( ); wire is_normal = (exp_i != 8'd0) && (exp_i != 8'hff); wire is_zero = (exp_i == 8'd0) && (man_i == 23'd0); - wire is_subnormal = (exp_i == 8'd0) && !is_zero; + wire is_subnormal = (exp_i == 8'd0) && (man_i != 23'd0); wire is_inf = (exp_i == 8'hff) && (man_i == 23'd0); wire is_nan = (exp_i == 8'hff) && (man_i != 23'd0); wire is_signaling = is_nan && (man_i[22] == 1'b0); diff --git a/hw/rtl/fp_cores/VX_fpu_dpi.v b/hw/rtl/fp_cores/VX_fpu_dpi.v index c18be4ec..10dab769 100644 --- a/hw/rtl/fp_cores/VX_fpu_dpi.v +++ b/hw/rtl/fp_cores/VX_fpu_dpi.v @@ -330,9 +330,9 @@ module VX_fpu_dpi #( dpi_feq (dataa[i], datab[i], result_feq[i], fflags_feq[i]); dpi_fmin (dataa[i], datab[i], result_fmin[i], fflags_fmin[i]); dpi_fmax (dataa[i], datab[i], result_fmax[i], fflags_fmax[i]); - dpi_fsgnj (dataa[i], result_fsgnj[i]); - dpi_fsgnjn (dataa[i], result_fsgnjn[i]); - dpi_fsgnjx (dataa[i], result_fsgnjx[i]); + dpi_fsgnj (dataa[i], datab[i], result_fsgnj[i]); + dpi_fsgnjn (dataa[i], datab[i], result_fsgnjn[i]); + dpi_fsgnjx (dataa[i], datab[i], result_fsgnjx[i]); result_fmv[i] = dataa[i]; end end diff --git a/hw/rtl/libs/VX_index_buffer.v b/hw/rtl/libs/VX_index_buffer.v index fd0b7e07..62af4dbb 100644 --- a/hw/rtl/libs/VX_index_buffer.v +++ b/hw/rtl/libs/VX_index_buffer.v @@ -18,11 +18,12 @@ module VX_index_buffer #( input wire [ADDRW-1:0] release_addr, input wire release_slot, - output wire full + output wire empty, + output wire full ); reg [SIZE-1:0] free_slots, free_slots_n; reg [ADDRW-1:0] write_addr_r; - reg full_r; + reg empty_r, full_r; wire free_valid; wire [ADDRW-1:0] free_index; @@ -51,6 +52,7 @@ module VX_index_buffer #( if (reset) begin write_addr_r <= ADDRW'(1'b0); free_slots <= {SIZE{1'b1}}; + empty_r <= 1'b1; full_r <= 1'b0; end else begin if (release_slot) begin @@ -60,6 +62,7 @@ module VX_index_buffer #( write_addr_r <= free_index; end free_slots <= free_slots_n; + empty_r <= (& free_slots_n); full_r <= ~free_valid; end end @@ -81,6 +84,7 @@ module VX_index_buffer #( ); assign write_addr = write_addr_r; + assign empty = empty_r; assign full = full_r; - + endmodule \ No newline at end of file diff --git a/hw/rtl/libs/VX_skid_buffer.v b/hw/rtl/libs/VX_skid_buffer.v index 08377cfb..31d789a0 100644 --- a/hw/rtl/libs/VX_skid_buffer.v +++ b/hw/rtl/libs/VX_skid_buffer.v @@ -67,8 +67,7 @@ module VX_skid_buffer #( end else begin if (ready_out) begin use_buffer <= 0; - end - if (push && !pop) begin + end else if (push && valid_out_r) begin assert(!use_buffer); use_buffer <= 1; end @@ -81,9 +80,11 @@ module VX_skid_buffer #( always @(posedge clk) begin if (push) begin buffer <= data_in; - end - if (pop) begin - data_out_r <= use_buffer ? buffer : data_in; + end + if (pop && !use_buffer) begin + data_out_r <= data_in; + end else if (pop) begin + data_out_r <= buffer; end end diff --git a/hw/syn/opae/Makefile b/hw/syn/opae/Makefile index 097bcc97..5a871248 100644 --- a/hw/syn/opae/Makefile +++ b/hw/syn/opae/Makefile @@ -138,5 +138,4 @@ clean-fpga-32c: clean-fpga-64c: rm -rf $(FPGA_BUILD_DIR)_64c sources.txt -clean: clean-ase-1c clean-ase-2c clean-ase-4c clean-fpga-1c clean-fpga-2c clean-fpga-4c clean-fpga-8c clean-fpga-16c clean-fpga-32c clean-fpga-64c - rm sources.txt \ No newline at end of file +clean: clean-ase-1c clean-ase-2c clean-ase-4c clean-fpga-1c clean-fpga-2c clean-fpga-4c clean-fpga-8c clean-fpga-16c clean-fpga-32c clean-fpga-64c \ No newline at end of file diff --git a/hw/syn/opae/sources_16c.txt b/hw/syn/opae/sources_16c.txt index 94aeb46c..cbee87e0 100644 --- a/hw/syn/opae/sources_16c.txt +++ b/hw/syn/opae/sources_16c.txt @@ -6,7 +6,7 @@ +define+QUARTUS #+define+PERF_ENABLE -vortex_afu.json +vortex_afu16.json QI:vortex_afu.qsf C:sources.txt \ No newline at end of file diff --git a/hw/syn/opae/sources_32c.txt b/hw/syn/opae/sources_32c.txt index e1bf6649..1fc88ecd 100644 --- a/hw/syn/opae/sources_32c.txt +++ b/hw/syn/opae/sources_32c.txt @@ -2,6 +2,8 @@ +define+NUM_CLUSTERS=4 #+define+L3_ENABLE=1 ++define+GLOBAL_BLOCK_SIZE=16 + +define+SYNTHESIS +define+QUARTUS #+define+PERF_ENABLE diff --git a/hw/syn/opae/sources_64c.txt b/hw/syn/opae/sources_64c.txt index 8cc42e1b..bf267717 100644 --- a/hw/syn/opae/sources_64c.txt +++ b/hw/syn/opae/sources_64c.txt @@ -2,6 +2,8 @@ +define+NUM_CLUSTERS=8 #+define+L3_ENABLE=1 ++define+GLOBAL_BLOCK_SIZE=16 + +define+SYNTHESIS +define+QUARTUS #+define+PERF_ENABLE diff --git a/hw/syn/opae/sources_8c.txt b/hw/syn/opae/sources_8c.txt index a41c281f..baafe36a 100644 --- a/hw/syn/opae/sources_8c.txt +++ b/hw/syn/opae/sources_8c.txt @@ -6,7 +6,7 @@ +define+QUARTUS #+define+PERF_ENABLE -vortex_afu.json +vortex_afu8.json QI:vortex_afu.qsf C:sources.txt \ No newline at end of file diff --git a/hw/syn/opae/vortex_afu16.json b/hw/syn/opae/vortex_afu16.json new file mode 100644 index 00000000..04700701 --- /dev/null +++ b/hw/syn/opae/vortex_afu16.json @@ -0,0 +1,56 @@ +{ + "version": 1, + "afu-image": { + "power": 0, + "clock-frequency-high": "auto-200", + "clock-frequency-low": "auto-200", + + "cmd-mem-read": 1, + "cmd-mem-write": 2, + "cmd-run": 3, + "cmd-csr-read": 4, + "cmd-csr-write": 5, + + "mmio-cmd-type": 10, + "mmio-io-addr": 12, + "mmio-mem-addr": 14, + "mmio-data-size": 16, + "mmio-status": 18, + "mmio-scope-read": 20, + "mmio-scope-write": 22, + "mmio-csr-core": 24, + "mmio-csr-addr": 26, + "mmio-csr-data": 28, + "mmio-csr-read": 30, + + "afu-top-interface": + { + "class": "ccip_std_afu_avalon_mm", + "module-ports" : + [ + { + "class": "cci-p", + "params": + { + "clock": "uClk_usr" + } + }, + { + "class": "local-memory", + "params": + { + "clock": "uClk_usr" + } + } + ] + }, + "accelerator-clusters": + [ + { + "name": "vortex_afu", + "total-contexts": 1, + "accelerator-type-uuid": "35f9452b-25c2-434c-93d5-6f8c60db361c" + } + ] + } +} diff --git a/hw/syn/opae/vortex_afu8.json b/hw/syn/opae/vortex_afu8.json new file mode 100644 index 00000000..cef41fbf --- /dev/null +++ b/hw/syn/opae/vortex_afu8.json @@ -0,0 +1,57 @@ +{ + "version": 1, + "afu-image": { + "power": 0, + "clock-frequency-high": "auto-210", + "clock-frequency-low": "auto-210", + + "cmd-mem-read": 1, + "cmd-mem-write": 2, + "cmd-run": 3, + "cmd-csr-read": 4, + "cmd-csr-write": 5, + + "mmio-cmd-type": 10, + "mmio-io-addr": 12, + "mmio-mem-addr": 14, + "mmio-data-size": 16, + "mmio-status": 18, + "mmio-scope-read": 20, + "mmio-scope-write": 22, + "mmio-csr-core": 24, + "mmio-csr-addr": 26, + "mmio-csr-data": 28, + "mmio-csr-read": 30, + + "afu-top-interface": + { + "class": "ccip_std_afu_avalon_mm", + "module-ports" : + [ + { + "class": "cci-p", + "params": + { + "clock": "uClk_usr" + } + }, + { + "class": "local-memory", + "params": + { + "clock": "uClk_usr" + } + } + ] + }, + "accelerator-clusters": + [ + { + "name": "vortex_afu", + "total-contexts": 1, + "accelerator-type-uuid": "35f9452b-25c2-434c-93d5-6f8c60db361c" + } + ] + } + } + \ No newline at end of file diff --git a/hw/syn/quartus/Makefile b/hw/syn/quartus/Makefile new file mode 100644 index 00000000..66d95034 --- /dev/null +++ b/hw/syn/quartus/Makefile @@ -0,0 +1,37 @@ +.PHONY: unittest pipeline cache core vortex top1 top2 top4 top8 top16 top32 top64 + +unittest: + $(MAKE) -C unittest clean && $(MAKE) -C unittest > unittest/build.log 2>&1 & + +pipeline: + $(MAKE) -C pipeline clean && $(MAKE) -C pipeline > pipeline/build.log 2>&1 & + +cache: + $(MAKE) -C cache clean && $(MAKE) -C cache > cache/build.log 2>&1 & + +core: + $(MAKE) -C core clean && $(MAKE) -C core > core/build.log 2>&1 & + +vortex: + $(MAKE) -C vortex clean && $(MAKE) -C vortex > vortex/build.log 2>&1 & + +top1: + $(MAKE) -C top1 clean && $(MAKE) -C top1 > top1/build.log 2>&1 & + +top2: + $(MAKE) -C top2 clean && $(MAKE) -C top2 > top2/build.log 2>&1 & + +top4: + $(MAKE) -C top4 clean && $(MAKE) -C top4 > top4/build.log 2>&1 & + +top8: + $(MAKE) -C top8 clean && $(MAKE) -C top8 > top8/build.log 2>&1 & + +top16: + $(MAKE) -C top16 clean && $(MAKE) -C top16 > top16/build.log 2>&1 & + +top32: + $(MAKE) -C top32 clean && $(MAKE) -C top32 > top32/build.log 2>&1 & + +top64: + $(MAKE) -C top64 clean && $(MAKE) -C top64 > top64/build.log 2>&1 & \ No newline at end of file diff --git a/hw/syn/quartus/project.tcl b/hw/syn/quartus/project.tcl index fccc5439..9fa3df14 100644 --- a/hw/syn/quartus/project.tcl +++ b/hw/syn/quartus/project.tcl @@ -41,10 +41,6 @@ set_global_assignment -name VERILOG_MACRO NDEBUG set_global_assignment -name MESSAGE_DISABLE 16818 set_global_assignment -name TIMEQUEST_DO_REPORT_TIMING ON -#set_global_assignment -name ALLOW_ANY_RAM_SIZE_FOR_RECOGNITION ON -#set_global_assignment -name USE_HIGH_SPEED_ADDER ON -#set_global_assignment -name MUX_RESTRUCTURE ON - #set_global_assignment -name OPTIMIZATION_TECHNIQUE SPEED #set_global_assignment -name OPTIMIZATION_MODE "AGGRESSIVE PERFORMANCE" #set_global_assignment -name FINAL_PLACEMENT_OPTIMIZATION ALWAYS diff --git a/hw/syn/quartus/vortex/Makefile b/hw/syn/quartus/vortex/Makefile index 17e2023b..b2e90b31 100644 --- a/hw/syn/quartus/vortex/Makefile +++ b/hw/syn/quartus/vortex/Makefile @@ -1,13 +1,20 @@ +FAMILY = "Arria 10" +DEVICE = 10AX115N3F40E2SG +FPU_CORE_PATH=../../../rtl/fp_cores/altera/arria10 + +#FAMILY = "Stratix 10" +#DEVICE = 1SX280HN2F43E2VG +#FPU_CORE_PATH=../../../rtl/fp_cores/altera/stratix10 + PROJECT = Vortex TOP_LEVEL_ENTITY = Vortex -SRC_FILE = Vortex.v -FPU_INCLUDE = ../../../rtl/fp_cores;../../../rtl/fp_cores/altera/arria10;../../../rtl/fp_cores/fpnew/src;../../../rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;../../../rtl/fp_cores/fpnew/src/common_cells/include;../../../rtl/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = $(FPU_INCLUDE);../../../rtl;../../../rtl/libs;../../../rtl/interfaces;../../../rtl/cache -PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf +SRC_FILE = Vortex.sv -# Part, Family -FAMILY = "Arria 10" -DEVICE = 10AX115N3F40E2SG +RTL_DIR=../../../rtl +FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE) + +PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf # Executable Configuration SYN_ARGS = --parallel --read_settings_files=on