This commit is contained in:
Blaise Tine
2020-09-08 13:05:47 -04:00
25 changed files with 2411 additions and 2605 deletions

View File

@@ -507,6 +507,12 @@ extern int vx_start(vx_device_h hdevice) {
// start execution
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_RUN));
#ifdef SCOPE
sleep(15);
vx_scope_stop(device->fpga, 0);
exit(0);
#endif
return 0;
}

BIN
driver/tests/dogfood/kernel.bin Executable file → Normal file

Binary file not shown.

View File

@@ -131,9 +131,8 @@ void kernel_fmadd(void* arg) {
for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i];
float c = a - b;
float d = a * b + c;
dst_ptr[offset+i] = d;
float c = a * b + b;
dst_ptr[offset+i] = c;
}
}
@@ -148,9 +147,8 @@ void kernel_fmsub(void* arg) {
for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i];
float c = a - b;
float d = a * b - c;
dst_ptr[offset+i] = d;
float c = a * b - b;
dst_ptr[offset+i] = c;
}
}
@@ -165,9 +163,8 @@ void kernel_fnmadd(void* arg) {
for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i];
float c = a - b;
float d =-a * b - c;
dst_ptr[offset+i] = d;
float c =-a * b - b;
dst_ptr[offset+i] = c;
}
}
@@ -182,9 +179,8 @@ void kernel_fnmsub(void* arg) {
for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i];
float c = a - b;
float d =-a * b + c;
dst_ptr[offset+i] = d;
float c =-a * b + b;
dst_ptr[offset+i] = c;
}
}
@@ -199,11 +195,10 @@ void kernel_fnmadd_madd(void* arg) {
for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i];
float c = a - b;
float d =-a * b - c;
float e = a * b + c;
float f = d + e;
dst_ptr[offset+i] = f;
float c =-a * b - b;
float d = a * b + b;
float e = c + d;
dst_ptr[offset+i] = e;
}
}

File diff suppressed because it is too large Load Diff

BIN
driver/tests/dogfood/kernel.elf Executable file → Normal file

Binary file not shown.

View File

@@ -253,8 +253,7 @@ public:
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
auto x = a[i] - b[i];
auto ref = a[i] * b[i] + x;
auto ref = a[i] * b[i] + b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
@@ -282,8 +281,7 @@ public:
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
auto x = a[i] - b[i];
auto ref = a[i] * b[i] - x;
auto ref = a[i] * b[i] - b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
@@ -311,8 +309,7 @@ public:
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
auto x = a[i] - b[i];
auto ref = -a[i] * b[i] - x;
auto ref = -a[i] * b[i] - b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
@@ -340,8 +337,7 @@ public:
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
auto x = a[i] - b[i];
auto ref = -a[i] * b[i] + x;
auto ref = -a[i] * b[i] + b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
@@ -369,10 +365,9 @@ public:
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
auto x = a[i] - b[i];
auto y = -a[i] * b[i] - x;
auto z = a[i] * b[i] + x;
auto ref = y + z;
auto x = -a[i] * b[i] - b[i];
auto y = a[i] * b[i] + b[i];
auto ref = x + y;
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;

View File

@@ -79,6 +79,8 @@ tar -zcvf trace.vcd.tar.gz obj_dir/trace.vcd
tar -zcvf trace.vcd.tar.gz trace.vcd
tar -zcvf run.log.tar.gz run.log
tar -cvjf vortex.vcd.tar.bz2 build_ase_1c/work/vortex.vcd
tar -zcvf vortex.vcd.tar.gz build_ase_1c/work/vortex.vcd
tar -zcvf run.log.tar.gz build_ase_1c/work/run.log
# decompress VCD trace
tar -zxvf /mnt/c/Users/Blaise/Downloads/vortex.vcd.tar.gz
@@ -104,6 +106,5 @@ make -C top clean && make -C top > top/build.log 2>&1 &
200 MHz -> period = 1/(200x10^6) = 5 ns
if slack = +1.664 ns -> minimal period = 5 - 1.664 = 3.336 ns -> fmax = 1/3.336 ns ~= 300 MHz
# build rtlsim from driver tests
make -C ../../rtlsim clean && reset && make -C ../../rtlsim

View File

@@ -106,7 +106,7 @@ module ccip_std_afu #(
.NUM_LOCAL_MEM_BANKS(NUM_LOCAL_MEM_BANKS)
) vortex_afu_inst (
.clk (clk),
.SoftReset (reset_T1),
.reset (reset_T1),
.avs_writedata (avs_writedata),
.avs_readdata (avs_readdata),

View File

@@ -18,7 +18,7 @@ module vortex_afu #(
) (
// global signals
input clk,
input SoftReset,
input reset,
// IF signals between CCI and AFU
input t_if_ccip_Rx cp2af_sRxPort,
@@ -191,7 +191,7 @@ assign cmd_scope_write = cp2af_sRxPort.c0.mmioWrValid && (MMIO_SCOPE_WRITE == mm
always_ff @(posedge clk)
begin
if (SoftReset) begin
if (reset) begin
mmio_tx.hdr <= 0;
mmio_tx.data <= 0;
mmio_tx.mmioRdValid <= 0;
@@ -319,7 +319,7 @@ logic cmd_run_done;
always_ff @(posedge clk)
begin
if (SoftReset) begin
if (reset) begin
state <= STATE_IDLE;
vx_reset <= 0;
end
@@ -484,18 +484,18 @@ begin
case (state)
CMD_MEM_READ: avs_address = cci_dram_rd_req_addr;
CMD_MEM_WRITE: avs_address = cci_dram_wr_req_addr + ((DRAM_ADDR_WIDTH)'(t_cci_rdq_tag'(cci_rdq_dout)));
default: avs_address = vx_dram_req_addr[`VX_DRAM_ADDR_WIDTH-1:`VX_DRAM_ADDR_WIDTH-DRAM_ADDR_WIDTH];
default: avs_address = vx_dram_req_addr[`VX_DRAM_ADDR_WIDTH-1:`VX_DRAM_ADDR_WIDTH-DRAM_ADDR_WIDTH];
endcase
case (state)
CMD_MEM_READ: avs_byteenable = 64'hffffffffffffffff;
CMD_MEM_WRITE: avs_byteenable = 64'hffffffffffffffff;
default: avs_byteenable = vx_dram_req_byteen_;
default: avs_byteenable = vx_dram_req_byteen_;
endcase
case (state)
CMD_MEM_WRITE: avs_writedata = cci_rdq_dout[$bits(t_ccip_clData) + $bits(t_cci_rdq_tag)-1:$bits(t_cci_rdq_tag)];
default: avs_writedata = (DRAM_LINE_WIDTH)'(vx_dram_req_data) << vx_dram_req_offset;
default: avs_writedata = (DRAM_LINE_WIDTH)'(vx_dram_req_data) << vx_dram_req_offset;
endcase
end
@@ -506,7 +506,7 @@ assign cmd_write_done = (cci_dram_wr_req_ctr >= cmd_data_size);
always_ff @(posedge clk)
begin
if (SoftReset)
if (reset)
begin
mem_bank_select <= 0;
avs_burstcount <= 1;
@@ -586,7 +586,7 @@ VX_generic_queue #(
.SIZE(AVS_RD_QUEUE_SIZE)
) avs_rd_req_queue (
.clk (clk),
.reset (SoftReset),
.reset (reset),
.push (avs_rtq_push),
.data_in ({vx_dram_req_tag, vx_dram_req_offset}),
.pop (avs_rtq_pop),
@@ -608,7 +608,7 @@ VX_generic_queue #(
.SIZE(AVS_RD_QUEUE_SIZE)
) avs_rd_rsp_queue (
.clk (clk),
.reset (SoftReset),
.reset (reset),
.push (avs_rdq_push),
.data_in (avs_readdata),
.pop (avs_rdq_pop),
@@ -655,7 +655,7 @@ assign af2cp_sTxPort.c0.valid = cci_rd_req_enable && !cci_rd_req_wait;
// Send read requests to CCI
always_ff @(posedge clk)
begin
if (SoftReset) begin
if (reset) begin
cci_rd_req_addr <= 0;
cci_rd_req_ctr <= 0;
cci_rd_rsp_ctr <= 0;
@@ -716,7 +716,7 @@ VX_generic_queue #(
.SIZE(CCI_RD_QUEUE_SIZE)
) cci_rd_req_queue (
.clk (clk),
.reset (SoftReset),
.reset (reset),
.push (cci_rdq_push),
.data_in (cci_rdq_din),
.pop (cci_rdq_pop),
@@ -754,7 +754,7 @@ assign af2cp_sTxPort.c1.valid = cci_wr_req_enable && !avs_rdq_empty;
// Send write requests to CCI
always_ff @(posedge clk)
begin
if (SoftReset) begin
if (reset) begin
cci_wr_req_addr <= 0;
cci_wr_req_ctr <= 0;
cci_wr_req_enable <= 0;
@@ -818,7 +818,7 @@ assign cmd_clflush_done = (0 == snp_rsp_ctr);
always_ff @(posedge clk)
begin
if (SoftReset) begin
if (reset) begin
vx_snp_req_valid <= 0;
vx_snp_req_addr <= 0;
vx_snp_req_tag <= 0;
@@ -866,7 +866,7 @@ begin
`ifdef DBG_PRINT_OPAE
$display("%t: AFU Snp Rsp: tag=%0d, rem=%0d", $time, vx_snp_rsp_tag, snp_rsp_ctr_next);
`endif
end
end
end
end
@@ -887,7 +887,7 @@ assign cmd_csr_done = (STATE_CSR_WRITE == state) ? vx_csr_io_req_ready : vx_csr_
always_ff @(posedge clk)
begin
if (SoftReset) begin
if (reset) begin
csr_io_req_sent <= 0;
cmd_csr_rdata <= 0;
end
@@ -918,7 +918,7 @@ Vortex #() vortex (
`SCOPE_SIGNALS_EXECUTE_BIND
.clk (clk),
.reset (SoftReset | vx_reset),
.reset (reset | vx_reset),
// DRAM request
.dram_req_valid (vx_dram_req_valid),
@@ -980,6 +980,13 @@ Vortex #() vortex (
`UNUSED_PIN (ebreak)
);
always @(posedge clk) begin
if (!reset) begin
// DRAM reads should only happen during vortex execution
assert(vx_busy || !vx_dram_rd_req_enable);
end
end
// SCOPE //////////////////////////////////////////////////////////////////////
`ifdef SCOPE
@@ -1049,7 +1056,7 @@ for (genvar i = 1; i < SCOPE_SR_DEPTH; i++) begin
.N (SCOPE_DATAW+2)
) scope_sr (
.clk (clk),
.reset (SoftReset),
.reset (reset),
.stall (0),
.flush (0),
.in (scope_data_in_st[i-1]),
@@ -1064,7 +1071,7 @@ VX_scope #(
.UPDW ($bits({`SCOPE_SIGNALS_UPD_LIST}))
) scope (
.clk (clk),
.reset (SoftReset),
.reset (reset),
.start (scope_data_in_ste[0]),
.stop (0),
.changed (scope_data_in_ste[1]),

View File

@@ -18,23 +18,35 @@ module VX_commit #(
VX_writeback_if writeback_if,
VX_cmt_to_csr_if cmt_to_csr_if
);
localparam NCMTW = $clog2(`NUM_EXS*`NUM_THREADS+1);
// CSRs update
wire [`NUM_EXS-1:0] commited_mask;
assign commited_mask = {alu_commit_if.valid,
lsu_commit_if.valid,
wire [`NUM_EXS-1-1:0] exu_committed;
wire [`NUM_THREADS-1:0] lsu_committed;
wire [$clog2(`NUM_EXS-1+1)-1:0] exu_commits;
wire [$clog2(`NUM_THREADS+1)-1:0] lsu_commits;
assign exu_committed = {alu_commit_if.valid,
csr_commit_if.valid,
mul_commit_if.valid,
fpu_commit_if.valid,
gpu_commit_if.valid};
wire [$clog2(`NUM_EXS+1)-1:0] num_commits;
assign lsu_committed = {`NUM_THREADS{lsu_commit_if.valid}} & lsu_commit_if.tmask;
VX_countones #(
.N(`NUM_EXS)
) valids_counter (
.valids(commited_mask),
.count (num_commits)
.N(`NUM_EXS-1)
) exu_counter (
.valids(exu_committed),
.count (exu_commits)
);
VX_countones #(
.N(`NUM_THREADS)
) lsu_counter (
.valids(lsu_committed),
.count (lsu_commits)
);
fflags_t fflags;
@@ -54,20 +66,22 @@ module VX_commit #(
fflags_t fflags_r;
reg has_fflags_r;
reg [`NW_BITS-1:0] wid_r;
reg [$clog2(`NUM_EXS+1)-1:0] num_commits_r;
reg [$clog2(`NUM_EXS-1+1)-1:0] exu_cmt_r;
reg [$clog2(`NUM_THREADS+1)-1:0] lsu_cmt_r;
reg csr_update_r;
always @(posedge clk) begin
csr_update_r <= (| commited_mask);
fflags_r <= fflags;
has_fflags_r <= fpu_commit_if.valid && fpu_commit_if.has_fflags;
wid_r <= fpu_commit_if.wid;
num_commits_r <= (num_commits << $clog2(`NUM_THREADS));
csr_update_r <= (| exu_committed) | lsu_commit_if.valid;
fflags_r <= fflags;
has_fflags_r <= fpu_commit_if.valid && fpu_commit_if.has_fflags;
wid_r <= fpu_commit_if.wid;
exu_cmt_r <= exu_commits;
lsu_cmt_r <= lsu_commits;
end
assign cmt_to_csr_if.valid = csr_update_r;
assign cmt_to_csr_if.wid = wid_r;
assign cmt_to_csr_if.num_commits = num_commits_r;
assign cmt_to_csr_if.num_commits = {exu_cmt_r, `NT_BITS'(0)} + NCMTW'(lsu_cmt_r);
assign cmt_to_csr_if.has_fflags = has_fflags_r;
assign cmt_to_csr_if.fflags = fflags_r;

View File

@@ -59,6 +59,8 @@
`define EXT_F_ENABLE
`endif
//`define FPU_FAST
// Device identification
`define VENDOR_ID 0
`define ARCHITECTURE_ID 0

View File

@@ -51,11 +51,11 @@ module VX_fpu_unit #(
.full (fpuq_full)
);
wire valid_in = fpu_req_if.valid && ~fpuq_full;
// can accept new request?
assign fpu_req_if.ready = ready_in && ~fpuq_full;
wire valid_in = fpu_req_if.valid && ~fpuq_full;
`ifdef FPU_FAST
VX_fp_fpga #(
@@ -135,6 +135,6 @@ module VX_fpu_unit #(
.out ({fpu_commit_if.valid, fpu_commit_if.wid, fpu_commit_if.tmask, fpu_commit_if.PC, fpu_commit_if.rd, fpu_commit_if.wb, fpu_commit_if.data, fpu_commit_if.has_fflags, fpu_commit_if.fflags})
);
assign ready_out = ~stall_out;
assign ready_out = ~stall_out;
endmodule

View File

@@ -118,7 +118,7 @@ module VX_ibuffer #(
deq_valid_n = 1;
deq_wid_n = `NW_BITS'(i);
deq_instr_n = q_data_out[i];
schedule_table_n[i] = 0;
schedule_table_n[i] = 0;
break;
end
end

View File

@@ -25,12 +25,11 @@ module VX_mul_unit #(
wire [`NR_BITS-1:0] rsp_rd;
wire rsp_wb;
wire [MULQ_BITS-1:0] tag_in, tag_out;
wire valid_out;
wire stall_out;
wire valid_out, ready_out;
wire mulq_full;
wire mulq_push = mul_req_if.valid && mul_req_if.ready;
wire mulq_pop = valid_out && ~stall_out;
wire mulq_pop = valid_out && ready_out;
VX_cam_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1),
@@ -48,12 +47,18 @@ module VX_mul_unit #(
.full (mulq_full)
);
wire valid_in = mul_req_if.valid && ~mulq_full;
///////////////////////////////////////////////////////////////////////////
wire [`NUM_THREADS-1:0][31:0] mul_result;
wire [MULQ_BITS-1:0] mul_tag;
wire is_mul_in = (alu_op == `MUL_MUL);
wire is_mul_out;
wire stall_mul;
wire is_mul_out;
wire mul_valid_out;
wire mul_valid_in = valid_in && !is_div_op;
wire mul_ready_in = ready_out || ~mul_valid_out;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
@@ -71,7 +76,7 @@ module VX_mul_unit #(
.LATENCY(`LATENCY_IMUL)
) multiplier (
.clk(clk),
.enable(~stall_mul),
.enable(mul_ready_in),
.dataa(mul_in1),
.datab(mul_in2),
.result(mul_result_tmp)
@@ -80,19 +85,14 @@ module VX_mul_unit #(
assign mul_result[i] = is_mul_out ? mul_result_tmp[31:0] : mul_result_tmp[63:32];
end
wire [MULQ_BITS-1:0] mul_tag;
wire mul_valid_out;
wire mul_fire = mul_req_if.valid && mul_req_if.ready && !is_div_op;
VX_shift_register #(
.DATAW(1 + MULQ_BITS + 1),
.DEPTH(`LATENCY_IMUL)
) mul_shift_reg (
.clk(clk),
.reset(reset),
.enable(~stall_mul),
.in({mul_fire, tag_in, is_mul_in}),
.enable(mul_ready_in),
.in({mul_valid_in, tag_in, is_mul_in}),
.out({mul_valid_out, mul_tag, is_mul_out})
);
@@ -100,13 +100,13 @@ module VX_mul_unit #(
wire [`NUM_THREADS-1:0][31:0] div_result_tmp, rem_result_tmp;
wire is_div_only = (alu_op == `MUL_DIV) || (alu_op == `MUL_DIVU);
wire is_rem_op = (alu_op == `MUL_REM) || (alu_op == `MUL_REMU);
wire is_signed_div = (alu_op == `MUL_DIV) || (alu_op == `MUL_REM);
wire div_valid_in = mul_req_if.valid && is_div_op;
wire div_valid_in = valid_in && is_div_op;
wire div_ready_out = ready_out && ~mul_valid_out; // arbitration prioritizes MUL
wire div_ready_in;
wire div_ready_out;
wire div_valid_out;
wire is_div_out;
wire is_rem_op_out;
wire [MULQ_BITS-1:0] div_tag;
VX_serial_div #(
@@ -122,30 +122,25 @@ module VX_mul_unit #(
.ready_in(div_ready_in),
.valid_in(div_valid_in),
.signed_mode(is_signed_div),
.tag_in({tag_in, is_div_only}),
.tag_in({tag_in, is_rem_op}),
.numer(alu_in1),
.denom(alu_in2),
.quotient(div_result_tmp),
.remainder(rem_result_tmp),
.ready_out(div_ready_out),
.valid_out(div_valid_out),
.tag_out({div_tag, is_div_out})
.tag_out({div_tag, is_rem_op_out})
);
wire [`NUM_THREADS-1:0][31:0] div_result = is_div_out ? div_result_tmp : rem_result_tmp;
wire [`NUM_THREADS-1:0][31:0] div_result = is_rem_op_out ? rem_result_tmp : div_result_tmp;
///////////////////////////////////////////////////////////////////////////
wire arbiter_hazard = mul_valid_out && div_valid_out;
assign stall_out = ~mul_commit_if.ready && mul_commit_if.valid;
assign stall_mul = (stall_out && !is_div_op) || mulq_full;
assign div_ready_out = ~stall_out && ~arbiter_hazard; // arbitration prioritizes MUL
wire stall_in = stall_mul || ~div_ready_in;
wire stall_out = ~mul_commit_if.ready && mul_commit_if.valid;
assign ready_out = ~stall_out;
assign valid_out = mul_valid_out || div_valid_out;
assign tag_out = mul_valid_out ? mul_tag : div_tag;
wire [`NUM_THREADS-1:0][31:0] result = mul_valid_out ? mul_result : div_result;
VX_generic_register #(
@@ -160,6 +155,6 @@ module VX_mul_unit #(
);
// can accept new request?
assign mul_req_if.ready = ~stall_in;
assign mul_req_if.ready = (is_div_op ? div_ready_in : mul_ready_in) && ~mulq_full;
endmodule

View File

@@ -20,9 +20,9 @@ module VX_warp_sched #(
wire [31:0] join_pc;
wire [`NUM_THREADS-1:0] join_tm;
reg [`NUM_WARPS-1:0] active_warps; // real active warps (updated when a warp is activated or disabled)
reg [`NUM_WARPS-1:0] stalled_warps; // asserted when a branch/gpgpu instructions are issued
reg [`NUM_WARPS-1:0] active_warps, active_warps_n; // real active warps (updated when a warp is activated or disabled)
reg [`NUM_WARPS-1:0] schedule_table, schedule_table_n; // enforces round-robin, barrier, and non-speculating branches
reg [`NUM_WARPS-1:0] stalled_warps; // asserted when a branch/gpgpu instructions are issued
// Lock warp until instruction decode to resolve branches
reg [`NUM_WARPS-1:0] fetch_lock;
@@ -46,12 +46,20 @@ module VX_warp_sched #(
wire ifetch_rsp_fire = ifetch_rsp_if.valid && ifetch_rsp_if.ready;
always @(*) begin
active_warps_n = active_warps;
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
active_warps_n = warp_ctl_if.wspawn.wmask;
end
if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
active_warps_n[warp_ctl_if.wid] = (warp_ctl_if.tmc.tmask != 0);
end
end
always @(*) begin
schedule_table_n = schedule_table;
if (warp_ctl_if.valid
&& warp_ctl_if.tmc.valid
&& (0 == warp_ctl_if.tmc.tmask)) begin
schedule_table_n[warp_ctl_if.wid] = 0;
if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
schedule_table_n[warp_ctl_if.wid] = (warp_ctl_if.tmc.tmask != 0);
end
if (scheduled_warp) begin // remove scheduled warp (round-robin)
schedule_table_n[warp_to_schedule] = 0;
@@ -82,7 +90,6 @@ module VX_warp_sched #(
end
end else begin
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
active_warps <= warp_ctl_if.wspawn.wmask;
use_wspawn <= warp_ctl_if.wspawn.wmask & (~`NUM_WARPS'(1));
use_wspawn_pc <= warp_ctl_if.wspawn.pc;
end
@@ -97,9 +104,6 @@ module VX_warp_sched #(
end else if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
thread_masks[warp_ctl_if.wid] <= warp_ctl_if.tmc.tmask;
stalled_warps[warp_ctl_if.wid] <= 0;
if (0 == warp_ctl_if.tmc.tmask) begin
active_warps[warp_ctl_if.wid] <= 0;
end
end else if (join_if.valid && !didnt_split) begin
if (!join_fall) begin
warp_pcs[join_if.wid] <= join_pc;
@@ -143,8 +147,10 @@ module VX_warp_sched #(
warp_pcs[ifetch_rsp_if.wid] <= ifetch_rsp_if.PC + 4;
end
active_warps <= active_warps_n;
// reset 'schedule_table' when it goes to zero
schedule_table <= (| schedule_table_n) ? schedule_table_n : active_warps;
schedule_table <= (| schedule_table_n) ? schedule_table_n : active_warps_n;
end
end

View File

@@ -51,9 +51,9 @@ module VX_fp_addmul #(
.ax(dataa[i]),
.ay(datab[i]),
.az(),
.clk({2'b00,clk}),
.ena({2'b11,enable}),
.aclr(2'b00),
.clk({2'b00, clk}),
.ena({2'b00, enable}),
.aclr({reset, reset}),
.chainin(),
// outputs
.overflow(),
@@ -91,9 +91,9 @@ module VX_fp_addmul #(
.ax(dataa[i]),
.ay(datab[i]),
.az(),
.clk({2'b00,clk}),
.ena({2'b11,enable}),
.aclr(2'b00),
.clk({2'b00, clk}),
.ena({2'b00, enable}),
.aclr({reset, reset}),
.chainin(),
// outputs
.overflow(),
@@ -131,9 +131,9 @@ module VX_fp_addmul #(
.ax(),
.ay(datab[i]),
.az(dataa[i]),
.clk({2'b00,clk}),
.ena({2'b11,enable}),
.aclr(2'b00),
.clk({2'b00, clk}),
.ena({2'b00, enable}),
.aclr({reset, reset}),
.chainin(),
// outputs
.overflow(),

View File

@@ -32,7 +32,7 @@ module VX_fp_div #(
`ifdef QUARTUS
acl_fdiv fdiv (
.clk (clk),
.areset (1'b0),
.areset (reset),
.en (enable),
.a (dataa[i]),
.b (datab[i]),

View File

@@ -27,7 +27,7 @@ module VX_fp_fpga #(
input wire ready_out,
output wire valid_out
);
localparam NUM_FPC = 8;
localparam NUM_FPC = 7;
localparam FPC_BITS = `LOG2UP(NUM_FPC);
wire [NUM_FPC-1:0] per_core_ready_in;
@@ -40,28 +40,28 @@ module VX_fp_fpga #(
fflags_t [`NUM_THREADS-1:0] fpnew_fflags;
reg [FPC_BITS-1:0] core_select;
reg do_sub, do_mul;
reg do_sub, do_mul, do_neg;
reg is_signed;
always @(*) begin
core_select = 'x;
do_sub = 'x;
do_mul = 'x;
is_signed = 'x;
do_sub = 'x;
do_mul = 'x;
do_neg = 'x;
is_signed = 'x;
case (op_type)
`FPU_ADD: begin core_select = 1; do_mul = 0; do_sub = 0; end
`FPU_SUB: begin core_select = 1; do_mul = 0; do_sub = 1; end
`FPU_MUL: begin core_select = 1; do_mul = 1; do_sub = 0; end
`FPU_MADD: begin core_select = 2; do_sub = 0; end
`FPU_MSUB: begin core_select = 2; do_sub = 1; end
`FPU_NMADD: begin core_select = 3; do_sub = 0; end
`FPU_NMSUB: begin core_select = 3; do_sub = 1; end
`FPU_DIV: begin core_select = 4; end
`FPU_SQRT: begin core_select = 5; end
`FPU_CVTWS: begin core_select = 6; is_signed = 1; end
`FPU_CVTWUS: begin core_select = 6; is_signed = 0; end
`FPU_CVTSW: begin core_select = 7; is_signed = 1; end
`FPU_CVTSWU: begin core_select = 7; is_signed = 0; end
`FPU_MADD: begin core_select = 2; do_sub = 0; do_neg = 0; end
`FPU_MSUB: begin core_select = 2; do_sub = 1; do_neg = 0; end
`FPU_NMADD: begin core_select = 2; do_sub = 0; do_neg = 1; end
`FPU_NMSUB: begin core_select = 2; do_sub = 1; do_neg = 1; end
`FPU_DIV: begin core_select = 3; end
`FPU_SQRT: begin core_select = 4; end
`FPU_CVTWS: begin core_select = 5; is_signed = 1; end
`FPU_CVTWUS: begin core_select = 5; is_signed = 0; end
`FPU_CVTSW: begin core_select = 6; is_signed = 1; end
`FPU_CVTSWU: begin core_select = 6; is_signed = 0; end
default: begin core_select = 0; end
endcase
end
@@ -116,6 +116,7 @@ module VX_fp_fpga #(
.ready_in (per_core_ready_in[2]),
.tag_in (tag_in),
.do_sub (do_sub),
.do_neg (do_neg),
.dataa (dataa),
.datab (datab),
.datac (datac),
@@ -125,40 +126,21 @@ module VX_fp_fpga #(
.valid_out (per_core_valid_out[2])
);
VX_fp_nmadd #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_nmadd (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 3)),
.ready_in (per_core_ready_in[3]),
.tag_in (tag_in),
.do_sub (do_sub),
.dataa (dataa),
.datab (datab),
.datac (datac),
.result (per_core_result[3]),
.tag_out (per_core_tag_out[3]),
.ready_out (per_core_ready_out[3]),
.valid_out (per_core_valid_out[3])
);
VX_fp_div #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_div (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 4)),
.ready_in (per_core_ready_in[4]),
.valid_in (valid_in && (core_select == 3)),
.ready_in (per_core_ready_in[3]),
.tag_in (tag_in),
.dataa (dataa),
.datab (datab),
.result (per_core_result[4]),
.tag_out (per_core_tag_out[4]),
.ready_out (per_core_ready_out[4]),
.valid_out (per_core_valid_out[4])
.result (per_core_result[3]),
.tag_out (per_core_tag_out[3]),
.ready_out (per_core_ready_out[3]),
.valid_out (per_core_valid_out[3])
);
VX_fp_sqrt #(
@@ -167,14 +149,14 @@ module VX_fp_fpga #(
) fp_sqrt (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 5)),
.ready_in (per_core_ready_in[5]),
.valid_in (valid_in && (core_select == 4)),
.ready_in (per_core_ready_in[4]),
.tag_in (tag_in),
.dataa (dataa),
.result (per_core_result[5]),
.tag_out (per_core_tag_out[5]),
.ready_out (per_core_ready_out[5]),
.valid_out (per_core_valid_out[5])
.result (per_core_result[4]),
.tag_out (per_core_tag_out[4]),
.ready_out (per_core_ready_out[4]),
.valid_out (per_core_valid_out[4])
);
VX_fp_ftoi #(
@@ -183,32 +165,32 @@ module VX_fp_fpga #(
) fp_ftoi (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 6)),
.ready_in (per_core_ready_in[6]),
.valid_in (valid_in && (core_select == 5)),
.ready_in (per_core_ready_in[5]),
.tag_in (tag_in),
.is_signed (is_signed),
.dataa (dataa),
.result (per_core_result[6]),
.tag_out (per_core_tag_out[6]),
.ready_out (per_core_ready_out[6]),
.valid_out (per_core_valid_out[6])
.result (per_core_result[5]),
.tag_out (per_core_tag_out[5]),
.ready_out (per_core_ready_out[5]),
.valid_out (per_core_valid_out[5])
);
VX_fp_itof #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_itof (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 7)),
.ready_in (per_core_ready_in[7]),
.tag_in (tag_in),
.is_signed (is_signed),
.dataa (dataa),
.result (per_core_result[7]),
.tag_out (per_core_tag_out[7]),
.ready_out (per_core_ready_out[7]),
.valid_out (per_core_valid_out[7])
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 6)),
.ready_in (per_core_ready_in[6]),
.tag_in (tag_in),
.is_signed (is_signed),
.dataa (dataa),
.result (per_core_result[6]),
.tag_out (per_core_tag_out[6]),
.ready_out (per_core_ready_out[6]),
.valid_out (per_core_valid_out[6])
);
reg valid_out_n;
@@ -234,7 +216,7 @@ module VX_fp_fpga #(
end
end
assign ready_in = (& per_core_ready_in);
assign ready_in = per_core_ready_in[core_select];
assign valid_out = valid_out_n;
assign has_fflags = has_fflags_n;
assign tag_out = tag_out_n;

View File

@@ -39,7 +39,7 @@ module VX_fp_ftoi #(
`ifdef QUARTUS
acl_ftoi ftoi (
.clk (clk),
.areset (1'b0),
.areset (reset),
.en (enable),
.a (dataa[i]),
.q (result_s)
@@ -47,7 +47,7 @@ module VX_fp_ftoi #(
acl_ftou ftou (
.clk (clk),
.areset (1'b0),
.areset (reset),
.en (enable),
.a (dataa[i]),
.q (result_u)

View File

@@ -39,7 +39,7 @@ module VX_fp_itof #(
`ifdef QUARTUS
acl_itof itof (
.clk (clk),
.areset (1'b0),
.areset (reset),
.en (enable),
.a (dataa[i]),
.q (result_s)
@@ -47,7 +47,7 @@ module VX_fp_itof #(
acl_utof utof (
.clk (clk),
.areset (1'b0),
.areset (reset),
.en (enable),
.a (dataa[i]),
.q (result_u)

View File

@@ -17,7 +17,8 @@ module VX_fp_madd #(
input wire [TAGW-1:0] tag_in,
input wire do_sub,
input wire do_neg,
input wire [LANES-1:0][31:0] dataa,
input wire [LANES-1:0][31:0] datab,
input wire [LANES-1:0][31:0] datac,
@@ -32,7 +33,7 @@ module VX_fp_madd #(
wire stall = ~ready_out && valid_out;
wire enable = ~stall;
reg do_sub_r;
reg do_sub_r, do_neg_r;
for (genvar i = 0; i < LANES; i++) begin
@@ -50,9 +51,9 @@ module VX_fp_madd #(
.ax(datac[i]),
.ay(datab[i]),
.az(dataa[i]),
.clk({2'b00,clk}),
.ena({2'b11,enable}),
.aclr(2'b00),
.clk({2'b00, clk}),
.ena({2'b00, enable}),
.aclr({reset, reset}),
.chainin(),
// outputs
.overflow(),
@@ -90,9 +91,9 @@ module VX_fp_madd #(
.ax(datac[i]),
.ay(datab[i]),
.az(dataa[i]),
.clk({2'b00,clk}),
.ena({2'b11,enable}),
.aclr(2'b00),
.clk({2'b00, clk}),
.ena({2'b00, enable}),
.aclr({reset, reset}),
.chainin(),
// outputs
.overflow(),
@@ -126,18 +127,20 @@ module VX_fp_madd #(
end
`endif
assign result[i] = do_sub_r ? result_msub : result_madd;
wire [31:0] result_unqual = do_sub_r ? result_msub : result_madd;
assign result[i][31] = result_unqual[31] ^ do_neg_r;
assign result[i][30:0] = result_unqual[30:0];
end
VX_shift_register #(
.DATAW(TAGW + 1 + 1),
.DATAW(TAGW + 1 + 1 + 1),
.DEPTH(`LATENCY_FMADD)
) shift_reg (
.clk(clk),
.reset(reset),
.enable(enable),
.in({tag_in, valid_in, do_sub}),
.out({tag_out, valid_out, do_sub_r})
.in({tag_in, valid_in, do_sub, do_neg}),
.out({tag_out, valid_out, do_sub_r, do_neg_r})
);
assign ready_in = enable;

View File

@@ -1,197 +0,0 @@
`include "VX_define.vh"
`ifndef SYNTHESIS
`include "float_dpi.vh"
`endif
// VX_fp_nmadd: per-lane single-precision multiply-add / multiply-subtract
// followed by a second floating-point subtract stage that takes the constant
// zero as its other operand (presumably producing the negated result for the
// FNMADD/FNMSUB operations -- confirm operand order against the MAC primitive).
//
// Parameters:
//   TAGW  - width of the request tag carried alongside the data.
//   LANES - number of independent 32-bit lanes processed in parallel.
//
// Handshake:
//   The whole pipeline advances only while `enable` is high, i.e. whenever the
//   output is not being back-pressured (valid_out && !ready_out). `ready_in`
//   mirrors `enable`.
//
// Latency:
//   `do_sub` is delayed by `LATENCY_FMADD so that it lines up with the outputs
//   of the first-stage MACs; tag/valid are delayed by
//   `LATENCY_FMADD + `LATENCY_FADDMUL to cover both stages.
module VX_fp_nmadd #(
    parameter TAGW = 1,
    parameter LANES = 1
) (
    input wire clk,
    input wire reset,

    output wire ready_in,
    input wire valid_in,

    input wire [TAGW-1:0] tag_in,

    // 0: select the multiply-add result, 1: select the multiply-subtract result
    input wire do_sub,

    input wire [LANES-1:0][31:0] dataa,
    input wire [LANES-1:0][31:0] datab,
    input wire [LANES-1:0][31:0] datac,
    output wire [LANES-1:0][31:0] result,

    output wire [TAGW-1:0] tag_out,

    input wire ready_out,
    output wire valid_out
);
    // Freeze the pipeline while a valid result is waiting on the consumer.
    wire stall = ~ready_out && valid_out;
    wire enable = ~stall;

    // do_sub delayed to the output of the first (madd/msub) stage.
    reg do_sub_r;

    for (genvar i = 0; i < LANES; i++) begin

        wire [31:0] result_madd;
        wire [31:0] result_msub;

        // First-stage result, chosen by the delayed do_sub flag; feeds the
        // second (subtract) stage.
        wire [31:0] result_st0 = do_sub_r ? result_msub : result_madd;

    `ifdef QUARTUS
        // Stage 1a: multiply-add (adder_subtract = "false").
        twentynm_fp_mac mac_fp_madd (
            // inputs
            .accumulate(),
            .chainin_overflow(),
            .chainin_invalid(),
            .chainin_underflow(),
            .chainin_inexact(),
            .ax(datac[i]),
            .ay(datab[i]),
            .az(dataa[i]),
            .clk({2'b00,clk}),
            .ena({2'b11,enable}),
            .aclr(2'b00),
            .chainin(),
            // outputs
            .overflow(),
            .invalid(),
            .underflow(),
            .inexact(),
            .chainout_overflow(),
            .chainout_invalid(),
            .chainout_underflow(),
            .chainout_inexact(),
            .resulta(result_madd),
            .chainout()
        );
        defparam mac_fp_madd.operation_mode = "sp_mult_add";
        defparam mac_fp_madd.use_chainin = "false";
        defparam mac_fp_madd.adder_subtract = "false";
        defparam mac_fp_madd.ax_clock = "0";
        defparam mac_fp_madd.ay_clock = "0";
        defparam mac_fp_madd.az_clock = "0";
        defparam mac_fp_madd.output_clock = "0";
        defparam mac_fp_madd.accumulate_clock = "none";
        defparam mac_fp_madd.ax_chainin_pl_clock = "0";
        defparam mac_fp_madd.accum_pipeline_clock = "none";
        defparam mac_fp_madd.mult_pipeline_clock = "0";
        defparam mac_fp_madd.adder_input_clock = "0";
        defparam mac_fp_madd.accum_adder_clock = "none";

        // Stage 1b: multiply-subtract (adder_subtract = "true").
        // Clock-enable uses the shared `enable`; the previous `enable0` was
        // never declared in this module.
        twentynm_fp_mac mac_fp_msub (
            // inputs
            .accumulate(),
            .chainin_overflow(),
            .chainin_invalid(),
            .chainin_underflow(),
            .chainin_inexact(),
            .ax(datac[i]),
            .ay(datab[i]),
            .az(dataa[i]),
            .clk({2'b00,clk}),
            .ena({2'b11,enable}),
            .aclr(2'b00),
            .chainin(),
            // outputs
            .overflow(),
            .invalid(),
            .underflow(),
            .inexact(),
            .chainout_overflow(),
            .chainout_invalid(),
            .chainout_underflow(),
            .chainout_inexact(),
            .resulta(result_msub),
            .chainout()
        );
        defparam mac_fp_msub.operation_mode = "sp_mult_add";
        defparam mac_fp_msub.use_chainin = "false";
        defparam mac_fp_msub.adder_subtract = "true";
        defparam mac_fp_msub.ax_clock = "0";
        defparam mac_fp_msub.ay_clock = "0";
        defparam mac_fp_msub.az_clock = "0";
        defparam mac_fp_msub.output_clock = "0";
        defparam mac_fp_msub.accumulate_clock = "none";
        defparam mac_fp_msub.ax_chainin_pl_clock = "0";
        defparam mac_fp_msub.accum_pipeline_clock = "none";
        defparam mac_fp_msub.mult_pipeline_clock = "0";
        defparam mac_fp_msub.adder_input_clock = "0";
        defparam mac_fp_msub.accum_adder_clock = "none";

        // Stage 2: add/subtract of the stage-1 result against a zero operand.
        // Clock-enable uses the shared `enable`; the previous `enable1` was
        // never declared in this module.
        twentynm_fp_mac mac_fp_neg (
            // inputs
            .accumulate(),
            .chainin_overflow(),
            .chainin_invalid(),
            .chainin_underflow(),
            .chainin_inexact(),
            .ax(32'h0),
            .ay(result_st0),
            .az(),
            .clk({2'b00,clk}),
            .ena({2'b11,enable}),
            .aclr(2'b00),
            .chainin(),
            // outputs
            .overflow(),
            .invalid(),
            .underflow(),
            .inexact(),
            .chainout_overflow(),
            .chainout_invalid(),
            .chainout_underflow(),
            .chainout_inexact(),
            .resulta(result[i]),
            .chainout()
        );
        defparam mac_fp_neg.operation_mode = "sp_add";
        defparam mac_fp_neg.use_chainin = "false";
        defparam mac_fp_neg.adder_subtract = "true";
        defparam mac_fp_neg.ax_clock = "0";
        defparam mac_fp_neg.ay_clock = "0";
        defparam mac_fp_neg.az_clock = "none";
        defparam mac_fp_neg.output_clock = "0";
        defparam mac_fp_neg.accumulate_clock = "none";
        defparam mac_fp_neg.ax_chainin_pl_clock = "none";
        defparam mac_fp_neg.accum_pipeline_clock = "none";
        defparam mac_fp_neg.mult_pipeline_clock = "none";
        defparam mac_fp_neg.adder_input_clock = "0";
        defparam mac_fp_neg.accum_adder_clock = "none";
    `else
        // Simulation model: DPI calls stand in for the hard MAC blocks. The
        // first argument gives each call site a distinct per-lane index.
        always @(posedge clk) begin
            dpi_fmadd(5*LANES+i, enable, dataa[i], datab[i], datac[i], result_madd);
            dpi_fmsub(6*LANES+i, enable, dataa[i], datab[i], datac[i], result_msub);
            dpi_fsub(7*LANES+i, enable, 32'b0, result_st0, result[i]);
        end
    `endif
    end

    // Align do_sub with the stage-1 outputs.
    VX_shift_register #(
        .DATAW(1),
        .DEPTH(`LATENCY_FMADD)
    ) shift_reg0 (
        .clk(clk),
        .reset(reset),
        .enable(enable),
        .in({do_sub}),
        .out({do_sub_r})
    );

    // Carry tag and valid through both stages.
    VX_shift_register #(
        .DATAW(TAGW + 1),
        .DEPTH(`LATENCY_FMADD + `LATENCY_FADDMUL)
    ) shift_reg1 (
        .clk(clk),
        .reset(reset),
        .enable(enable),
        .in({tag_in, valid_in}),
        .out({tag_out, valid_out})
    );

    // A new request can be accepted whenever the pipeline is free to advance.
    assign ready_in = enable;

endmodule

View File

@@ -31,7 +31,7 @@ module VX_fp_sqrt #(
`ifdef QUARTUS
acl_fsqrt fsqrt (
.clk (clk),
.areset (1'b0),
.areset (reset),
.en (enable),
.a (dataa[i]),
.q (result[i])

View File

@@ -91,6 +91,7 @@ module VX_fpnew #(
fpu_operands[0] = dataa;
fpu_operands[1] = datab;
fpu_operands[2] = datac;
case (op_type)
`FPU_ADD: begin
fpu_op = fpnew_pkg::ADD;
@@ -107,23 +108,23 @@ module VX_fpnew #(
`FPU_DIV: begin fpu_op = fpnew_pkg::DIV; end
`FPU_SQRT: begin fpu_op = fpnew_pkg::SQRT; end
`FPU_MADD: begin fpu_op = fpnew_pkg::FMADD; end
`FPU_MSUB: begin fpu_op = fpnew_pkg::FMADD; fpu_op_mod = 1; end
`FPU_NMSUB: begin fpu_op = fpnew_pkg::FNMSUB; end
`FPU_MSUB: begin fpu_op = fpnew_pkg::FMADD; fpu_op_mod = 1; end
`FPU_NMADD: begin fpu_op = fpnew_pkg::FNMSUB; fpu_op_mod = 1; end
`FPU_NMSUB: begin fpu_op = fpnew_pkg::FNMSUB; end
`FPU_CVTWS: begin fpu_op = fpnew_pkg::F2I; end
`FPU_CVTWUS:begin fpu_op = fpnew_pkg::F2I; fpu_op_mod = 1; end
`FPU_CVTWUS:begin fpu_op = fpnew_pkg::F2I; fpu_op_mod = 1; end
`FPU_CVTSW: begin fpu_op = fpnew_pkg::I2F; end
`FPU_CVTSWU:begin fpu_op = fpnew_pkg::I2F; fpu_op_mod = 1; end
`FPU_CVTSWU:begin fpu_op = fpnew_pkg::I2F; fpu_op_mod = 1; end
`FPU_CLASS: begin fpu_op = fpnew_pkg::CLASSIFY; fpu_has_fflags = 0; end
`FPU_CMP: begin fpu_op = fpnew_pkg::CMP; end
`FPU_MISC: begin
case (frm)
0: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RNE; fpu_has_fflags = 0; end
1: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RTZ; fpu_has_fflags = 0; end
2: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RDN; fpu_has_fflags = 0; end
3: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RNE; end
4: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RTZ; end
default: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RUP; fpu_has_fflags = 0; end
0: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RNE; fpu_has_fflags = 0; end
1: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RTZ; fpu_has_fflags = 0; end
2: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RDN; fpu_has_fflags = 0; end
3: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RNE; end
4: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RTZ; end
default: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RUP; fpu_has_fflags = 0; end
endcase
end
default:;

View File

@@ -9,7 +9,7 @@ interface VX_cmt_to_csr_if ();
wire [`NW_BITS-1:0] wid;
wire [$clog2(`NUM_EXS+1)-1:0] num_commits;
wire [$clog2(`NUM_EXS*`NUM_THREADS+1)-1:0] num_commits;
wire has_fflags;
fflags_t fflags;