fixed OPAE crash, added custom bram module to control rw collision, dogfood testcase argument, optimized buffered fifo, quartus build optimization flags

This commit is contained in:
Blaise Tine
2020-10-20 05:32:55 -07:00
parent 301cc45740
commit 7529f72c5d
22 changed files with 388 additions and 300 deletions

View File

@@ -7,7 +7,7 @@ CXXFLAGS += -I../include -I$(OPAE_HOME)/include -I../../hw
LDFLAGS += -L$(OPAE_HOME)/lib
SCOPE=1
#SCOPE=1
# stack execution protection
LDFLAGS +=-z noexecstack
@@ -32,8 +32,6 @@ ASE_LIBS += -luuid -lopae-c-ase
VLSIM_LIBS += -lopae-c-vlsim
LIB_DIR=../lib
ASE_DIR = ase
VLSIM_DIR = vlsim
@@ -67,10 +65,10 @@ fpga: $(SRCS)
asesim: $(SRCS) $(ASE_DIR)
$(CXX) $(CXXFLAGS) -DUSE_ASE $(SRCS) $(LDFLAGS) $(ASE_LIBS) -o $(PROJECT_ASE)
vlsim: $(SRCS) opae-vlsim
$(CXX) $(CXXFLAGS) -L./vlsim -DUSE_VLSIM $(SRCS) $(LDFLAGS) $(VLSIM_LIBS) -o $(PROJECT_VLSIM)
vlsim: $(SRCS) vlsim-hw
$(CXX) $(CXXFLAGS) -DUSE_VLSIM $(SRCS) $(LDFLAGS) -L./vlsim $(VLSIM_LIBS) -o $(PROJECT_VLSIM)
opae-vlsim:
vlsim-hw:
$(SET_SCOPE) $(MAKE) -C vlsim
vortex.o: vortex.cpp

View File

@@ -15,8 +15,8 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
DBG_PRINT_FLAGS += -DDBG_CORE_REQ_INFO
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
DBG_FLAGS += $(DBG_PRINT_FLAGS)
DBG_FLAGS += -DDBG_CORE_REQ_INFO
#DBG_FLAGS += $(DBG_PRINT_FLAGS)
#DBG_FLAGS += -DDBG_CORE_REQ_INFO
#CONFIGS += -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1
#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1
@@ -72,9 +72,13 @@ ifdef SCOPE
SCOPE_VH = $(RTL_DIR)/scope-defs.vh
endif
# use our OPAE shim
VL_FLAGS += -DNOPAE
CFLAGS += -DNOPAE
# use DPI FPU
VL_FLAGS += -DFPU_FAST
RTL_INCLUDE += -I../../../hw/opae -I../../../hw/opae/ccip
PROJECT = libopae-c-vlsim.so

View File

@@ -31,7 +31,7 @@
fpga_result res = _expr; \
if (res == FPGA_OK) \
break; \
printf("OPAE Error: '%s' returned %d, %s!\n", \
printf("[VXDRV] Error: '%s' returned %d, %s!\n", \
#_expr, (int)res, fpgaErrStr(res)); \
return -1; \
} while (false)
@@ -118,7 +118,7 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
*value = STARTUP_ADDR;
break;
default:
fprintf(stderr, "invalid caps id: %d\n", caps_id);
fprintf(stderr, "[VXDRV] Error: invalid caps id: %d\n", caps_id);
std::abort();
return -1;
}
@@ -156,7 +156,7 @@ extern int vx_dev_open(vx_device_h* hdevice) {
fpgaDestroyProperties(&filter);
if (num_matches < 1) {
fprintf(stderr, "Accelerator %s not found!\n", AFU_ACCEL_UUID);
fprintf(stderr, "[VXDRV] Error: accelerator %s not found!\n", AFU_ACCEL_UUID);
return -1;
}
@@ -197,9 +197,10 @@ extern int vx_dev_open(vx_device_h* hdevice) {
fpgaClose(accel_handle);
return ret;
}
fprintf(stdout, "DEVCAPS: version=%d, num_cores=%d, num_warps=%d, num_threads=%d\n",
#ifndef NDEBUG
fprintf(stdout, "[VXDRV] DEVCAPS: version=%d, num_cores=%d, num_warps=%d, num_threads=%d\n",
device->implementation_id, device->num_cores, device->num_warps, device->num_threads);
#endif
}
#ifdef SCOPE
@@ -236,18 +237,18 @@ extern int vx_dev_close(vx_device_h hdevice) {
int ret = vx_get_perf(hdevice, core_id, &instrs, &cycles);
assert(ret == 0);
float IPC = (float)(double(instrs) / double(cycles));
fprintf(stdout, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs, cycles, IPC);
fprintf(stdout, "[VXDRV] PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs, cycles, IPC);
total_instrs += instrs;
total_cycles = std::max<uint64_t>(total_cycles, cycles);
}
float IPC = (float)(double(total_instrs) / double(total_cycles));
fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, total_cycles, IPC);
fprintf(stdout, "[VXDRV] PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, total_cycles, IPC);
} else {
uint64_t instrs, cycles;
int ret = vx_get_perf(hdevice, 0, &instrs, &cycles);
float IPC = (float)(double(instrs) / double(cycles));
assert(ret == 0);
fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
fprintf(stdout, "[VXDRV] PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
}
#endif
@@ -373,7 +374,7 @@ extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
CHECK_RES(fpgaReadMMIO64(device->fpga, 0, MMIO_STATUS, &data));
if (0 == data || 0 == timeout) {
if (data != 0) {
fprintf(stdout, "ready-wait timed out: status=%ld\n", data);
fprintf(stdout, "[VXDRV] ready-wait timed out: status=%ld\n", data);
}
break;
}

View File

@@ -90,16 +90,20 @@ vx_buffer_h dst_buf = nullptr;
static void show_usage() {
std::cout << "Vortex Driver Test." << std::endl;
std::cout << "Usage: [-s:testid] [-e:testid] [-k: kernel] [-n words] [-c] [-h: help]" << std::endl;
std::cout << "Usage: [-t:testid] [-s:testid] [-e:testid] [-k: kernel] [-n words] [-c] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:s:e:k:ch?")) != -1) {
while ((c = getopt(argc, argv, "n:t:s:e:k:ch?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 't':
testid_s = atoi(optarg);
testid_e = atoi(optarg);
break;
case 's':
testid_s = atoi(optarg);
break;

View File

@@ -60,9 +60,9 @@ qsub-sim
make ase
# tests
./run_ase.sh build_ase_1c ../../driver/tests/basic/basic -t1 -n1
./run_ase.sh build_ase_1c ../../driver/tests/basic/basic -n16
./run_ase.sh build_ase_1c ../../driver/tests/demo/demo -n 16
./run_ase.sh build_ase_1c ../../driver/tests/dogfood/dogfood -n1 -s4 -e4
./run_ase.sh build_ase_1c ../../driver/tests/dogfood/dogfood -n16
./run_ase.sh build_ase_1c ../../benchmarks/opencl/vecadd/vecadd
# modify "vsim_run.tcl" to dump VCD trace
@@ -97,7 +97,7 @@ kill -9 <pid>
# fixing device resource busy issue when deleting /build_ase_1c/
lsof +D build_ase_1c
# quick off cache synthesis
# kick off synthesis
make -C pipeline clean && make -C pipeline > pipeline/build.log 2>&1 &
make -C cache clean && make -C cache > cache/build.log 2>&1 &
make -C core clean && make -C core > core/build.log 2>&1 &

View File

@@ -3,7 +3,7 @@
+define+SYNTHESIS
+define+QUARTUS
+define+FPU_FAST
+define+SCOPE
#+define+SCOPE
#+define+DBG_PRINT_CORE_ICACHE
#+define+DBG_PRINT_CORE_DCACHE

View File

@@ -7,3 +7,20 @@ set_global_assignment -name VERILOG_MACRO SYNTHESIS
set_global_assignment -name VERILOG_MACRO NDEBUG
set_global_assignment -name MESSAGE_DISABLE 16818
set_global_assignment -name VERILOG_MACRO FPU_FAST
set_global_assignment -name MIN_CORE_JUNCTION_TEMP 0
set_global_assignment -name MAX_CORE_JUNCTION_TEMP 100
set_global_assignment -name POWER_BOARD_THERMAL_MODEL "NONE (CONSERVATIVE)"
set_global_assignment -name ROUTER_CLOCKING_TOPOLOGY_ANALYSIS ON
set_global_assignment -name ROUTER_LCELL_INSERTION_AND_LOGIC_DUPLICATION ON
set_global_assignment -name TIMEQUEST_DO_CCPP_REMOVAL ON
set_global_assignment -name SYNTH_TIMING_DRIVEN_SYNTHESIS ON
set_global_assignment -name TIMEQUEST_MULTICORNER_ANALYSIS ON
set_global_assignment -name POWER_USE_TA_VALUE 65
set_global_assignment -name SEED 1
set_global_assignment -name OPTIMIZE_MULTI_CORNER_TIMING ON
set_global_assignment -name FITTER_EFFORT "STANDARD FIT"
set_global_assignment -name OPTIMIZE_HOLD_TIMING "ALL PATHS"
set_global_assignment -name OPTIMIZATION_TECHNIQUE SPEED
set_global_assignment -name ROUTER_TIMING_OPTIMIZATION_LEVEL MAXIMUM
set_global_assignment -name OPTIMIZATION_MODE "AGGRESSIVE PERFORMANCE"

View File

@@ -175,8 +175,9 @@ logic [31:0] cmd_csr_wdata;
// MMIO controller ////////////////////////////////////////////////////////////
`IGNORE_WARNINGS_BEGIN
t_ccip_c0_ReqMmioHdr mmio_hdr = t_ccip_c0_ReqMmioHdr'(cp2af_sRxPort.c0.hdr);
t_ccip_c0_ReqMmioHdr mmio_hdr;
`IGNORE_WARNINGS_END
assign mmio_hdr = t_ccip_c0_ReqMmioHdr'(cp2af_sRxPort.c0.hdr);
`STATIC_ASSERT(($bits(t_ccip_c0_ReqMmioHdr)-$bits(mmio_hdr.address)) == 12, ("Oops!"))
@@ -204,9 +205,20 @@ wire [2:0] cmd_type = (cp2af_sRxPort.c0.mmioWrValid && (MMIO_CMD_TYPE == mmio_hd
reg scope_start;
`endif
// disable assertions until reset
`ifndef VERILATOR
initial begin
$assertoff;
end
`endif
always_ff @(posedge clk)
begin
if (reset) begin
`ifndef VERILATOR
$asserton; // enable assertions
`endif
mmio_tx.hdr <= 0;
mmio_tx.data <= 0;
mmio_tx.mmioRdValid <= 0;
@@ -324,6 +336,7 @@ begin
end
`endif
default: begin
mmio_tx.data <= 64'h0;
`ifdef DBG_PRINT_OPAE
$display("%t: Unknown MMIO Rd: addr=%0h", $time, mmio_hdr.address);
`endif

View File

@@ -59,8 +59,6 @@
`define EXT_F_ENABLE
`endif
//`define FPU_FAST
// Device identification
`define VENDOR_ID 0
`define ARCHITECTURE_ID 0

View File

@@ -347,7 +347,7 @@ module VX_decode #(
assign decode_if.rd = rd;
assign decode_if.rs1 = rs1_qual;
assign decode_if.rs2 = rs2;
assign decode_if.rs3 = rs3;
assign decode_if.rs3 = 0;
`endif
assign decode_if.use_rs3 = use_rs3;

View File

@@ -1,70 +0,0 @@
`include "VX_define.vh"
// control module to support multi-cycle read for fp register
// VX_gpr_fp_ctrl: sequences register-file reads for requests that need a
// third operand (rs3). The first read address (raddr1) is shared between
// rs1 and rs3, so a request with use_rs3 set is held for one extra cycle
// while raddr1 is switched from rs1 to rs3.
//
// Inputs : rs1_data / rs2_data - data returned for raddr1 and the rs2 address
//          gpr_req_if          - incoming request (valid, wid, PC, rs1/rs2/rs3, use_rs3)
// Outputs: raddr1              - {wid, rs1 or rs3} address for the first read port
//          gpr_rsp_if          - registered response (valid, wid, PC, rs1/rs2/rs3 data)
module VX_gpr_fp_ctrl (
input wire clk,
input wire reset,
input wire [`NUM_THREADS-1:0][31:0] rs1_data,
input wire [`NUM_THREADS-1:0][31:0] rs2_data,
VX_gpr_req_if gpr_req_if,
// outputs
output wire [`NW_BITS+`NR_BITS-1:0] raddr1,
VX_gpr_rsp_if gpr_rsp_if
);
// Registered response fields driven onto gpr_rsp_if.
reg [`NUM_THREADS-1:0][31:0] rsp_rs1_data, rsp_rs2_data, rsp_rs3_data;
reg rsp_valid;
reg [31:0] rsp_pc;
reg [`NW_BITS-1:0] rsp_wid;
// 1: raddr1 selects rs1 (first phase); 0: raddr1 selects rs3 (second phase).
reg read_rs1;
// A valid rs3 request in the first phase must stall for one cycle.
wire rs3_delay = gpr_req_if.valid && gpr_req_if.use_rs3 && read_rs1;
// Request is valid and the response side is ready to accept it.
wire read_fire = gpr_req_if.valid && gpr_rsp_if.ready;
always @(posedge clk) begin
if (reset) begin
rsp_valid <= 0;
rsp_pc <= 0;
rsp_rs1_data <= 0;
rsp_rs2_data <= 0;
rsp_rs3_data <= 0;
rsp_wid <= 0;
read_rs1 <= 1;
end else begin
// Phase tracking: enter the rs3 phase on a stall, return to the rs1
// phase once the request fires.
if (rs3_delay) begin
read_rs1 <= 0;
rsp_wid <= gpr_req_if.wid;
end else if (read_fire) begin
read_rs1 <= 1;
end
rsp_valid <= gpr_req_if.valid;
// This unconditional assignment comes later in the block, so it takes
// precedence over the rsp_wid assignment in the rs3_delay branch above.
rsp_wid <= gpr_req_if.wid;
rsp_pc <= gpr_req_if.PC;
// rs1 data is captured only in the first phase; in the second phase
// rs1_data carries the rs3 read and must not overwrite it.
if (read_rs1) begin
rsp_rs1_data <= (gpr_req_if.rs1 == 0) ? (`NUM_THREADS*32)'(0) : rs1_data;
end
rsp_rs2_data <= (gpr_req_if.rs2 == 0) ? (`NUM_THREADS*32)'(0) : rs2_data;
// NOTE(review): the zero check below tests gpr_req_if.rs1 although the value
// stored is the rs3 operand read through the shared port - confirm whether
// gpr_req_if.rs3 was intended.
rsp_rs3_data <= (gpr_req_if.rs1 == 0) ? (`NUM_THREADS*32)'(0) : rs1_data;
// During the rs3 phase the request must still belong to the same warp.
assert(read_rs1 || rsp_wid == gpr_req_if.wid);
end
end
// outputs
// Register index for the shared read port: rs1 in the first phase, rs3 in the second.
wire [`NR_BITS-1:0] rs1 = read_rs1 ? gpr_req_if.rs1 : gpr_req_if.rs3;
assign raddr1 = {gpr_req_if.wid, rs1};
// Back-pressure the requester for the extra rs3 read cycle.
assign gpr_req_if.ready = ~rs3_delay;
assign gpr_rsp_if.valid = rsp_valid;
assign gpr_rsp_if.wid = rsp_wid;
assign gpr_rsp_if.PC = rsp_pc;
assign gpr_rsp_if.rs1_data = rsp_rs1_data;
assign gpr_rsp_if.rs2_data = rsp_rs2_data;
assign gpr_rsp_if.rs3_data = rsp_rs3_data;
endmodule

View File

@@ -12,21 +12,24 @@ module VX_gpr_ram (
);
`ifndef ASIC
reg [`NUM_THREADS-1:0][3:0][7:0] ram [(`NUM_WARPS * `NUM_REGS)-1:0];
reg [`NUM_THREADS-1:0][3:0][7:0] mem [(`NUM_WARPS * `NUM_REGS)-1:0];
reg [`NUM_THREADS-1:0][31:0] q1, q2;
always @(posedge clk) begin
for (integer i = 0; i < `NUM_THREADS; i++) begin
if (we[i]) begin
ram[waddr][i][0] <= wdata[i][07:00];
ram[waddr][i][1] <= wdata[i][15:08];
ram[waddr][i][2] <= wdata[i][23:16];
ram[waddr][i][3] <= wdata[i][31:24];
mem[waddr][i][0] <= wdata[i][07:00];
mem[waddr][i][1] <= wdata[i][15:08];
mem[waddr][i][2] <= wdata[i][23:16];
mem[waddr][i][3] <= wdata[i][31:24];
end
end
q1 <= mem[rs1];
q2 <= mem[rs2];
end
assign rs1_data = ram[rs1];
assign rs2_data = ram[rs2];
assign rs1_data = q1;
assign rs2_data = q2;
`else

View File

@@ -15,8 +15,15 @@ module VX_gpr_stage #(
);
`UNUSED_VAR (reset)
reg rsp_valid;
reg [`NW_BITS-1:0] rsp_wid;
reg [31:0] rsp_pc;
reg rs1_is_zero, rs2_is_zero;
wire [`NUM_THREADS-1:0][31:0] rs1_data, rs2_data;
wire [`NW_BITS+`NR_BITS-1:0] raddr1;
wire [`NW_BITS+`NR_BITS-1:0] raddr1, raddr2;
assign raddr2 = {gpr_req_if.wid, gpr_req_if.rs2};
VX_gpr_ram gpr_ram (
.clk (clk),
@@ -24,60 +31,77 @@ module VX_gpr_stage #(
.waddr ({writeback_if.wid, writeback_if.rd}),
.wdata (writeback_if.data),
.rs1 (raddr1),
.rs2 ({gpr_req_if.wid, gpr_req_if.rs2}),
.rs2 (raddr2),
.rs1_data (rs1_data),
.rs2_data (rs2_data)
);
`ifdef EXT_F_ENABLE
VX_gpr_fp_ctrl VX_gpr_fp_ctrl (
.clk (clk),
.reset (reset),
.rs1_data (rs1_data),
.rs2_data (rs2_data),
.raddr1 (raddr1),
.gpr_req_if (gpr_req_if),
.gpr_rsp_if (gpr_rsp_if)
);
`else
reg [`NUM_THREADS-1:0][31:0] rsp_rs1_data, rsp_rs2_data;
reg rsp_valid;
reg [`NW_BITS-1:0] rsp_wid;
reg [31:0] rsp_pc;
always @(posedge clk) begin
if (reset) begin
rsp_valid <= 0;
rsp_wid <= 0;
rsp_pc <= 0;
rsp_rs1_data <= 0;
rsp_rs2_data <= 0;
rs1_is_zero <= 0;
rs2_is_zero <= 0;
end else begin
rsp_valid <= gpr_req_if.valid;
rsp_wid <= gpr_req_if.wid;
rsp_pc <= gpr_req_if.PC;
rsp_rs1_data <= (gpr_req_if.rs1 == 0) ? (`NUM_THREADS*32)'(0) : rs1_data;
rsp_rs2_data <= (gpr_req_if.rs2 == 0) ? (`NUM_THREADS*32)'(0) : rs2_data;
rs1_is_zero <= (0 == gpr_req_if.rs1);
rs2_is_zero <= (0 == gpr_req_if.rs2);
end
end
`ifdef EXT_F_ENABLE
reg [`NUM_THREADS-1:0][31:0] rs3_data;
reg read_rs3, save_rs3;
wire rs3_delay = gpr_req_if.valid && gpr_req_if.use_rs3 && !read_rs3;
wire read_fire = gpr_req_if.valid && gpr_rsp_if.ready;
always @(posedge clk) begin
if (reset) begin
rs3_data <= 0;
read_rs3 <= 0;
end else begin
if (rs3_delay) begin
read_rs3 <= 1;
save_rs3 <= 1;
end else if (read_fire) begin
read_rs3 <= 0;
end
if (save_rs3) begin
rs3_data <= rs1_data;
save_rs3 <= 0;
end
assert(!read_rs3 || rsp_wid == gpr_req_if.wid);
end
end
assign raddr1 = {gpr_req_if.wid, (rs3_delay ? gpr_req_if.rs3 : gpr_req_if.rs1)};
assign gpr_req_if.ready = ~rs3_delay;
assign gpr_rsp_if.rs3_data = rs3_data;
`else
assign raddr1 = {gpr_req_if.wid, gpr_req_if.rs1};
assign gpr_req_if.ready = 1;
assign gpr_rsp_if.valid = rsp_valid;
assign gpr_rsp_if.wid = rsp_wid;
assign gpr_rsp_if.PC = rsp_pc;
assign gpr_rsp_if.rs1_data = rsp_rs1_data;
assign gpr_rsp_if.rs2_data = rsp_rs2_data;
assign gpr_rsp_if.rs3_data = 0;
`UNUSED_VAR (gpr_req_if.valid);
`UNUSED_VAR (gpr_req_if.rs3);
`UNUSED_VAR (gpr_req_if.use_rs3);
`UNUSED_VAR (gpr_rsp_if.ready);
`endif
assign gpr_rsp_if.rs1_data = rs1_is_zero ? (`NUM_THREADS*32)'(0) : rs1_data;
assign gpr_rsp_if.rs2_data = rs2_is_zero ? (`NUM_THREADS*32)'(0) : rs2_data;
assign gpr_rsp_if.valid = rsp_valid;
assign gpr_rsp_if.wid = rsp_wid;
assign gpr_rsp_if.PC = rsp_pc;
assign writeback_if.ready = 1'b1;
endmodule

View File

@@ -20,13 +20,13 @@ module VX_ibuffer #(
localparam ADDRW = $clog2(SIZE);
localparam NWARPSW = $clog2(`NUM_WARPS+1);
reg [SIZEW-1:0] size_r [`NUM_WARPS-1:0];
wire [`NUM_WARPS-1:0] q_full;
wire [`NUM_WARPS-1:0][SIZEW-1:0] q_size;
wire [DATAW-1:0] q_data_in;
wire [`NUM_WARPS-1:0][DATAW-1:0] q_data_prev;
reg [`NUM_WARPS-1:0][DATAW-1:0] q_data_out;
reg [SIZEW-1:0] size_r [`NUM_WARPS-1:0];
wire enq_fire = ibuf_enq_if.valid && ibuf_enq_if.ready;
wire deq_fire = ibuf_deq_if.valid && ibuf_deq_if.ready;
@@ -36,7 +36,7 @@ module VX_ibuffer #(
wire writing = enq_fire && (i == ibuf_enq_if.wid);
wire reading = deq_fire && (i == ibuf_deq_if.wid);
wire is_slot0 = ((0 == size_r[i]) || ((1 == size_r[i]) && reading));
wire is_slot0 = (0 == size_r[i]) || ((1 == size_r[i]) && reading);
wire push = writing && !is_slot0;
wire pop = reading && (size_r[i] != 1);
@@ -48,35 +48,36 @@ module VX_ibuffer #(
.clk (clk),
.reset (reset),
.push (push),
.data_in (q_data_in),
.pop (pop),
.data_in (q_data_in),
.data_out (q_data_prev[i]),
`UNUSED_PIN (empty),
`UNUSED_PIN (full),
`UNUSED_PIN (size)
);
always @(posedge clk) begin
if (writing && is_slot0) begin
q_data_out[i] <= q_data_in;
end
if (pop) begin
q_data_out[i] <= q_data_prev[i];
end
end
always @(posedge clk) begin
if (reset) begin
size_r[i] <= 0;
end else begin
if (writing && !reading) begin
if (writing) begin
if (is_slot0) begin
q_data_out[i] <= q_data_in;
end
if (!reading) begin
size_r[i] <= size_r[i] + SIZEW'(1);
end
if (reading && !writing) begin
end
if (reading) begin
if (size_r[i] != 1) begin
q_data_out[i] <= q_data_prev[i];
end
if (!writing) begin
size_r[i] <= size_r[i] - SIZEW'(1);
end
end
end
end
assign q_full[i] = (size_r[i] == SIZE);
assign q_size[i] = size_r[i];

View File

@@ -51,8 +51,8 @@
///////////////////////////////////////////////////////////////////////////////
`define USE_FAST_BRAM (* syn_ramstyle = "mlab" *)
`define RELAXED_RW_BRAM (* syn_ramstyle = "no_rw_check" *)
`define USE_FAST_BRAM (* ramstyle="mlab" *)
`define NO_RW_RAM_CHECK (* ramstyle="no_rw_check" *)
///////////////////////////////////////////////////////////////////////////////

View File

@@ -447,6 +447,8 @@ module VX_bank #(
`ifdef DBG_CORE_REQ_INFO
if (WORD_SIZE != `GLOBAL_BLOCK_SIZE) begin
assign {debug_pc_st1, debug_rd_st1, debug_wid_st1, debug_tagid_st1, debug_rw_st1, debug_byteen_st1, debug_tid_st1} = inst_meta_st1;
end else begin
assign {debug_pc_st1, debug_rd_st1, debug_wid_st1, debug_tagid_st1, debug_rw_st1, debug_byteen_st1, debug_tid_st1} = 0;
end
`endif

View File

@@ -58,6 +58,7 @@ module VX_cache_miss_resrv #(
);
reg [`MRVQ_METADATA_WIDTH-1:0] metadata_table[MRVQ_SIZE-1:0];
reg [MRVQ_SIZE-1:0][`LINE_ADDR_WIDTH-1:0] addr_table;
reg [MRVQ_SIZE-1:0] valid_table;
reg [MRVQ_SIZE-1:0] ready_table;
reg [`LOG2UP(MRVQ_SIZE)-1:0] schedule_ptr;

View File

@@ -30,7 +30,6 @@ module VX_tag_data_store #(
input wire fill_sent
);
reg [`BANK_LINE_WORDS-1:0][WORD_SIZE-1:0][7:0] data [`BANK_LINE_COUNT-1:0];
reg [`TAG_SELECT_BITS-1:0] tag [`BANK_LINE_COUNT-1:0];
reg [`BANK_LINE_WORDS-1:0][WORD_SIZE-1:0] dirtyb[`BANK_LINE_COUNT-1:0];
reg [`BANK_LINE_COUNT-1:0] dirty;
@@ -40,7 +39,6 @@ module VX_tag_data_store #(
assign read_dirty = dirty [read_addr];
assign read_dirtyb = dirtyb [read_addr];
assign read_tag = tag [read_addr];
assign read_data = data [read_addr];
wire do_write = (| write_enable);
@@ -69,15 +67,26 @@ module VX_tag_data_store #(
if (invalidate) begin
valid[write_addr] <= 0;
end
end
end
for (integer j = 0; j < `BANK_LINE_WORDS; j++) begin
for (integer i = 0; i < WORD_SIZE; i++) begin
if (write_enable[j][i]) begin
data[write_addr][j][i] <= write_data[j * `WORD_WIDTH + i * 8 +: 8];
end
end
end
end
end
wire [(`BANK_LINE_WORDS * WORD_SIZE)-1:0] ram_wren;
assign ram_wren = write_enable & {(`BANK_LINE_WORDS * WORD_SIZE){!stall_bank_pipe}};
VX_dp_ram #(
.DATAW(`BANK_LINE_WORDS * WORD_SIZE * 8),
.SIZE(`BANK_LINE_COUNT),
.BYTEENW(`BANK_LINE_WORDS * WORD_SIZE),
.BUFFERED(0),
.RWCHECK(1)
) dp_ram (
.clk(clk),
.waddr(write_addr),
.raddr(read_addr),
.wren(ram_wren),
.rden(1'b1),
.din(write_data),
.dout(read_data)
);
endmodule

117
hw/rtl/libs/VX_dp_ram.v Normal file
View File

@@ -0,0 +1,117 @@
`include "VX_platform.vh"
// VX_dp_ram: memory array with one write port (waddr/wren/din) and one read
// port (raddr/dout), both clocked by clk.
//
// Parameters:
//   DATAW    - width of one memory word in bits
//   SIZE     - number of words
//   BYTEENW  - width of wren; when > 1 each wren bit enables one 8-bit slice
//              of the word, otherwise wren is a single whole-word write enable
//   BUFFERED - non-zero: dout is registered (updated on posedge clk when rden);
//              zero: dout is a continuous read of mem[raddr] and rden is unused
//   RWCHECK  - only used when BUFFERED is zero; under SYNTHESIS it adds a
//              registered write-data bypass on dout (see below)
//   ADDRW    - address width, derived from SIZE
//   SIZEW    - declared but not referenced inside this module
//
// NOTE(review): the byte-enable paths touch bits [BYTEENW*8-1:0] only, so they
// presumably assume DATAW == BYTEENW * 8 - confirm against instantiations.
module VX_dp_ram #(
parameter DATAW = 1,
parameter SIZE = 1,
parameter BYTEENW = 1,
parameter BUFFERED = 1,
parameter RWCHECK = 1,
parameter ADDRW = $clog2(SIZE),
parameter SIZEW = $clog2(SIZE+1)
) (
input wire clk,
input wire [ADDRW-1:0] waddr,
input wire [ADDRW-1:0] raddr,
input wire [BYTEENW-1:0] wren,
input wire rden,
input wire [DATAW-1:0] din,
output wire [DATAW-1:0] dout
);
// ---- Registered-read variant ------------------------------------------------
if (BUFFERED) begin
reg [DATAW-1:0] mem [SIZE-1:0];
reg [DATAW-1:0] dout_r;
if (BYTEENW > 1) begin
// Per-byte write enables; read data captured when rden is asserted.
always @(posedge clk) begin
for (integer i = 0; i < BYTEENW; i++) begin
if (wren[i])
mem[waddr][i * 8 +: 8] <= din[i * 8 +: 8];
end
if (rden)
dout_r <= mem[raddr];
end
end else begin
// Single write enable covering the whole word.
always @(posedge clk) begin
if (wren)
mem[waddr] <= din;
if (rden)
dout_r <= mem[raddr];
end
end
assign dout = dout_r;
// ---- Unregistered-read variants ---------------------------------------------
end else begin
`UNUSED_VAR(rden)
if (RWCHECK) begin
reg [DATAW-1:0] mem [SIZE-1:0];
if (BYTEENW > 1) begin
always @(posedge clk) begin
for (integer i = 0; i < BYTEENW; i++) begin
if (wren[i])
mem[waddr][i * 8 +: 8] <= din[i * 8 +: 8];
end
end
end else begin
always @(posedge clk) begin
if (wren)
mem[waddr] <= din;
end
end
`ifdef SYNTHESIS
// Synthesis-only bypass: din_r registers the word being written (bytes whose
// enable is off are taken from the current contents of mem[waddr]) and
// bypass_r registers whether that write targeted the read address.
reg [DATAW-1:0] din_r;
wire writing;
if (BYTEENW > 1) begin
assign writing = (| wren);
always @(posedge clk) begin
for (integer i = 0; i < BYTEENW; i++) begin
din_r[i * 8 +: 8] <= wren[i] ? din[i * 8 +: 8] : mem[waddr][i * 8 +: 8];
end
end
end else begin
assign writing = wren;
always @(posedge clk) begin
din_r <= din;
end
end
reg bypass_r;
always @(posedge clk) begin
bypass_r <= writing && (raddr == waddr);
end
// NOTE(review): bypass_r reflects the address comparison from the previous
// clock edge, while mem[raddr] uses the current raddr; the non-SYNTHESIS path
// below has no bypass at all - confirm the two paths are meant to differ.
assign dout = bypass_r ? din_r : mem[raddr];
`else
assign dout = mem[raddr];
`endif
end else begin
// No read/write collision handling: dout always reads mem[raddr] directly.
reg [DATAW-1:0] mem [SIZE-1:0];
if (BYTEENW > 1) begin
always @(posedge clk) begin
for (integer i = 0; i < BYTEENW; i++) begin
if (wren[i])
mem[waddr][i * 8 +: 8] <= din[i * 8 +: 8];
end
end
end else begin
always @(posedge clk) begin
if (wren)
mem[waddr] <= din;
end
end
assign dout = mem[raddr];
end
end
endmodule

View File

@@ -19,15 +19,10 @@ module VX_generic_queue #(
);
`STATIC_ASSERT(`ISPOW2(SIZE), ("must be 0 or power of 2!"))
always @(*) begin
assert(!pop || !empty);
assert(!push || !full);
end
if (SIZE == 1) begin
if (SIZE == 1) begin // (SIZE == 1)
reg [SIZEW-1:0] size_r;
reg [DATAW-1:0] head_r;
reg size_r;
always @(posedge clk) begin
if (reset) begin
@@ -35,8 +30,10 @@ module VX_generic_queue #(
size_r <= 0;
end else begin
if (push && !pop) begin
assert(!full);
size_r <= 1;
end else if (pop && !push) begin
assert(!empty);
size_r <= 0;
end
if (push) begin
@@ -50,62 +47,13 @@ module VX_generic_queue #(
assign full = (size_r != 0);
assign size = size_r;
end else begin // (SIZE > 1)
`ifdef QUARTUS
scfifo scfifo_component (
.clock (clk),
.data (data_in),
.rdreq (pop),
.wrreq (push),
.empty (empty),
.full (full),
.q (data_out),
.sclr (reset),
.usedw (),
.aclr (),
.almost_empty (),
.almost_full (),
.eccstatus ()
);
defparam
scfifo_component.lpm_type = "scfifo",
scfifo_component.intended_device_family = "Arria 10",
scfifo_component.lpm_numwords = SIZE,
scfifo_component.lpm_width = DATAW,
scfifo_component.lpm_widthu = $clog2(SIZE),
scfifo_component.lpm_showahead = "ON",
scfifo_component.add_ram_output_register = (BUFFERED ? "ON" : "ON"),
scfifo_component.use_eab = "ON";
reg [SIZEW-1:0] size_r;
always @(posedge clk) begin
if (reset) begin
size_r <= 0;
end else begin
if (push && !pop) begin
size_r <= size_r + SIZEW'(1);
end
if (pop && !push) begin
size_r <= size_r - SIZEW'(1);
end
end
end
assign size = size_r;
`else
`USE_FAST_BRAM reg [DATAW-1:0] data [SIZE-1:0];
if (0 == BUFFERED) begin
reg [SIZEW-1:0] size_r;
reg [ADDRW:0] rd_ptr_r;
reg [ADDRW:0] wr_ptr_r;
reg [ADDRW-1:0] used_r;
wire [ADDRW-1:0] rd_ptr_a = rd_ptr_r[ADDRW-1:0];
wire [ADDRW-1:0] wr_ptr_a = wr_ptr_r[ADDRW-1:0];
@@ -114,111 +62,127 @@ module VX_generic_queue #(
if (reset) begin
rd_ptr_r <= 0;
wr_ptr_r <= 0;
size_r <= 0;
used_r <= 0;
end else begin
if (push) begin
assert(!full);
wr_ptr_r <= wr_ptr_r + (ADDRW+1)'(1);
if (!pop) begin
size_r <= size_r + SIZEW'(1);
used_r <= used_r + ADDRW'(1);
end
end
if (pop) begin
assert(!empty);
rd_ptr_r <= rd_ptr_r + (ADDRW+1)'(1);
if (!push) begin
size_r <= size_r - SIZEW'(1);
used_r <= used_r - ADDRW'(1);
end
end
end
end
always @(posedge clk) begin
if (push) begin
data[wr_ptr_a] <= data_in;
end
end
VX_dp_ram #(
.DATAW(DATAW),
.SIZE(SIZE),
.BUFFERED(0),
.RWCHECK(1)
) dp_ram (
.clk(clk),
.waddr(wr_ptr_a),
.raddr(rd_ptr_a),
.wren(push),
.rden(pop),
.din(data_in),
.dout(data_out)
);
assign data_out = data[rd_ptr_a];
assign empty = (wr_ptr_r == rd_ptr_r);
assign full = (wr_ptr_a == rd_ptr_a) && (wr_ptr_r[ADDRW] != rd_ptr_r[ADDRW]);
assign size = size_r;
assign size = {full, used_r};
end else begin
reg [SIZEW-1:0] size_r;
reg [DATAW-1:0] head_r;
reg [DATAW-1:0] curr_r;
wire [DATAW-1:0] dout;
reg [DATAW-1:0] din_r;
reg [ADDRW-1:0] wr_ptr_r;
reg [ADDRW-1:0] rd_ptr_r;
reg [ADDRW-1:0] rd_ptr_next_r;
reg [ADDRW-1:0] rd_ptr_n_r;
reg [ADDRW-1:0] used_r;
reg empty_r;
reg full_r;
reg bypass_r;
always @(posedge clk) begin
if (reset) begin
size_r <= 0;
curr_r <= 0;
wr_ptr_r <= 0;
rd_ptr_r <= 0;
rd_ptr_next_r <= 1;
rd_ptr_n_r <= 1;
empty_r <= 1;
full_r <= 0;
used_r <= 0;
end else begin
if (push) begin
wr_ptr_r <= wr_ptr_r + ADDRW'(1);
if (!pop) begin
empty_r <= 0;
if (size_r == SIZEW'(SIZE-1)) begin
if (used_r == ADDRW'(SIZE-1)) begin
full_r <= 1;
end
size_r <= size_r + SIZEW'(1);
used_r <= used_r + ADDRW'(1);
end
end
if (pop) begin
rd_ptr_r <= rd_ptr_next_r;
rd_ptr_r <= rd_ptr_n_r;
if (SIZE > 2) begin
rd_ptr_next_r <= rd_ptr_r + ADDRW'(2);
rd_ptr_n_r <= rd_ptr_r + ADDRW'(2);
end else begin // (SIZE == 2);
rd_ptr_next_r <= ~rd_ptr_next_r;
rd_ptr_n_r <= ~rd_ptr_n_r;
end
if (!push) begin
if (size_r == SIZEW'(1)) begin
assert(rd_ptr_next_r == wr_ptr_r);
full_r <= 0;
if (used_r == ADDRW'(1)) begin
assert(rd_ptr_n_r == wr_ptr_r);
empty_r <= 1;
end;
full_r <= 0;
size_r <= size_r - SIZEW'(1);
used_r <= used_r - ADDRW'(1);
end
end
bypass_r <= push && (empty_r || ((size_r == SIZEW'(1)) && pop));
curr_r <= data_in;
end
end
always @(posedge clk) begin
if (reset) begin
head_r <= 0;
end else begin
if (push) begin
data[wr_ptr_r] <= data_in;
end
head_r <= data[pop ? rd_ptr_next_r : rd_ptr_r];
end
if (push && (empty_r || ((used_r == ADDRW'(1)) && pop))) begin
bypass_r <= 1;
din_r <= data_in;
end else if (pop)
bypass_r <= 0;
end
assign data_out = bypass_r ? curr_r : head_r;
VX_dp_ram #(
.DATAW(DATAW),
.SIZE(SIZE),
.BUFFERED(1),
.RWCHECK(0)
) dp_ram (
.clk(clk),
.waddr(wr_ptr_r),
.raddr(rd_ptr_n_r),
.wren(push),
.rden(pop),
.din(data_in),
.dout(dout)
);
assign data_out = bypass_r ? din_r : dout;
assign empty = empty_r;
assign full = full_r;
assign size = size_r;
assign size = {full_r, used_r};
end
`endif
end
endmodule

View File

@@ -43,17 +43,19 @@ set_global_assignment -name VERILOG_MACRO FPU_FAST
set_global_assignment -name MIN_CORE_JUNCTION_TEMP 0
set_global_assignment -name MAX_CORE_JUNCTION_TEMP 100
set_global_assignment -name POWER_BOARD_THERMAL_MODEL "NONE (CONSERVATIVE)"
set_global_assignment -name OPTIMIZE_HOLD_TIMING "ALL PATHS"
set_global_assignment -name OPTIMIZE_MULTI_CORNER_TIMING ON
set_global_assignment -name FITTER_EFFORT "STANDARD FIT"
set_global_assignment -name ROUTER_CLOCKING_TOPOLOGY_ANALYSIS ON
set_global_assignment -name ROUTER_LCELL_INSERTION_AND_LOGIC_DUPLICATION ON
set_global_assignment -name TIMEQUEST_DO_CCPP_REMOVAL ON
set_global_assignment -name OPTIMIZATION_TECHNIQUE SPEED
set_global_assignment -name SYNTH_TIMING_DRIVEN_SYNTHESIS ON
set_global_assignment -name TIMEQUEST_MULTICORNER_ANALYSIS ON
set_global_assignment -name POWER_USE_TA_VALUE 65
set_global_assignment -name SEED 1
set_global_assignment -name OPTIMIZE_MULTI_CORNER_TIMING ON
set_global_assignment -name FITTER_EFFORT "STANDARD FIT"
set_global_assignment -name OPTIMIZE_HOLD_TIMING "ALL PATHS"
set_global_assignment -name OPTIMIZATION_TECHNIQUE SPEED
set_global_assignment -name ROUTER_TIMING_OPTIMIZATION_LEVEL MAXIMUM
set_global_assignment -name OPTIMIZATION_MODE "AGGRESSIVE PERFORMANCE"
set idx 0
foreach arg $q_args_orig {