fpga fixes: warp scheduler, fnmadd, fdiv, fsqrt
This commit is contained in:
BIN
driver/tests/dogfood/kernel.bin
Executable file → Normal file
BIN
driver/tests/dogfood/kernel.bin
Executable file → Normal file
Binary file not shown.
@@ -131,9 +131,8 @@ void kernel_fmadd(void* arg) {
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
float a = src0_ptr[offset+i];
|
||||
float b = src1_ptr[offset+i];
|
||||
float c = a - b;
|
||||
float d = a * b + c;
|
||||
dst_ptr[offset+i] = d;
|
||||
float c = a * b + b;
|
||||
dst_ptr[offset+i] = c;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -148,9 +147,8 @@ void kernel_fmsub(void* arg) {
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
float a = src0_ptr[offset+i];
|
||||
float b = src1_ptr[offset+i];
|
||||
float c = a - b;
|
||||
float d = a * b - c;
|
||||
dst_ptr[offset+i] = d;
|
||||
float c = a * b - b;
|
||||
dst_ptr[offset+i] = c;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -165,9 +163,8 @@ void kernel_fnmadd(void* arg) {
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
float a = src0_ptr[offset+i];
|
||||
float b = src1_ptr[offset+i];
|
||||
float c = a - b;
|
||||
float d =-a * b - c;
|
||||
dst_ptr[offset+i] = d;
|
||||
float c =-a * b - b;
|
||||
dst_ptr[offset+i] = c;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -182,9 +179,8 @@ void kernel_fnmsub(void* arg) {
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
float a = src0_ptr[offset+i];
|
||||
float b = src1_ptr[offset+i];
|
||||
float c = a - b;
|
||||
float d =-a * b + c;
|
||||
dst_ptr[offset+i] = d;
|
||||
float c =-a * b + b;
|
||||
dst_ptr[offset+i] = c;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -199,11 +195,10 @@ void kernel_fnmadd_madd(void* arg) {
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
float a = src0_ptr[offset+i];
|
||||
float b = src1_ptr[offset+i];
|
||||
float c = a - b;
|
||||
float d =-a * b - c;
|
||||
float e = a * b + c;
|
||||
float f = d + e;
|
||||
dst_ptr[offset+i] = f;
|
||||
float c =-a * b - b;
|
||||
float d = a * b + b;
|
||||
float e = c + d;
|
||||
dst_ptr[offset+i] = e;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
BIN
driver/tests/dogfood/kernel.elf
Executable file → Normal file
BIN
driver/tests/dogfood/kernel.elf
Executable file → Normal file
Binary file not shown.
@@ -253,8 +253,7 @@ public:
|
||||
auto b = (float*)src2;
|
||||
auto c = (float*)dst;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
auto x = a[i] - b[i];
|
||||
auto ref = a[i] * b[i] + x;
|
||||
auto ref = a[i] * b[i] + b[i];
|
||||
if (!almost_equal(c[i], ref)) {
|
||||
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
|
||||
++errors;
|
||||
@@ -282,8 +281,7 @@ public:
|
||||
auto b = (float*)src2;
|
||||
auto c = (float*)dst;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
auto x = a[i] - b[i];
|
||||
auto ref = a[i] * b[i] - x;
|
||||
auto ref = a[i] * b[i] - b[i];
|
||||
if (!almost_equal(c[i], ref)) {
|
||||
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
|
||||
++errors;
|
||||
@@ -311,8 +309,7 @@ public:
|
||||
auto b = (float*)src2;
|
||||
auto c = (float*)dst;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
auto x = a[i] - b[i];
|
||||
auto ref = -a[i] * b[i] - x;
|
||||
auto ref = -a[i] * b[i] - b[i];
|
||||
if (!almost_equal(c[i], ref)) {
|
||||
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
|
||||
++errors;
|
||||
@@ -340,8 +337,7 @@ public:
|
||||
auto b = (float*)src2;
|
||||
auto c = (float*)dst;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
auto x = a[i] - b[i];
|
||||
auto ref = -a[i] * b[i] + x;
|
||||
auto ref = -a[i] * b[i] + b[i];
|
||||
if (!almost_equal(c[i], ref)) {
|
||||
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
|
||||
++errors;
|
||||
@@ -369,10 +365,9 @@ public:
|
||||
auto b = (float*)src2;
|
||||
auto c = (float*)dst;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
auto x = a[i] - b[i];
|
||||
auto y = -a[i] * b[i] - x;
|
||||
auto z = a[i] * b[i] + x;
|
||||
auto ref = y + z;
|
||||
auto x = -a[i] * b[i] - b[i];
|
||||
auto y = a[i] * b[i] + b[i];
|
||||
auto ref = x + y;
|
||||
if (!almost_equal(c[i], ref)) {
|
||||
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
|
||||
++errors;
|
||||
|
||||
@@ -80,6 +80,7 @@ tar -zcvf trace.vcd.tar.gz trace.vcd
|
||||
tar -zcvf run.log.tar.gz run.log
|
||||
tar -cvjf vortex.vcd.tar.bz2 build_ase_1c/work/vortex.vcd
|
||||
tar -zcvf vortex.vcd.tar.gz build_ase_1c/work/vortex.vcd
|
||||
tar -zcvf run.log.tar.gz build_ase_1c/work/run.log
|
||||
|
||||
# decompress VCD trace
|
||||
tar -zxvf /mnt/c/Users/Blaise/Downloads/vortex.vcd.tar.gz
|
||||
|
||||
@@ -106,7 +106,7 @@ module ccip_std_afu #(
|
||||
.NUM_LOCAL_MEM_BANKS(NUM_LOCAL_MEM_BANKS)
|
||||
) vortex_afu_inst (
|
||||
.clk (clk),
|
||||
.SoftReset (reset_T1),
|
||||
.reset (reset_T1),
|
||||
|
||||
.avs_writedata (avs_writedata),
|
||||
.avs_readdata (avs_readdata),
|
||||
|
||||
@@ -18,7 +18,7 @@ module vortex_afu #(
|
||||
) (
|
||||
// global signals
|
||||
input clk,
|
||||
input SoftReset,
|
||||
input reset,
|
||||
|
||||
// IF signals between CCI and AFU
|
||||
input t_if_ccip_Rx cp2af_sRxPort,
|
||||
@@ -191,7 +191,7 @@ assign cmd_scope_write = cp2af_sRxPort.c0.mmioWrValid && (MMIO_SCOPE_WRITE == mm
|
||||
|
||||
always_ff @(posedge clk)
|
||||
begin
|
||||
if (SoftReset) begin
|
||||
if (reset) begin
|
||||
mmio_tx.hdr <= 0;
|
||||
mmio_tx.data <= 0;
|
||||
mmio_tx.mmioRdValid <= 0;
|
||||
@@ -319,7 +319,7 @@ logic cmd_run_done;
|
||||
|
||||
always_ff @(posedge clk)
|
||||
begin
|
||||
if (SoftReset) begin
|
||||
if (reset) begin
|
||||
state <= STATE_IDLE;
|
||||
vx_reset <= 0;
|
||||
end
|
||||
@@ -484,18 +484,18 @@ begin
|
||||
case (state)
|
||||
CMD_MEM_READ: avs_address = cci_dram_rd_req_addr;
|
||||
CMD_MEM_WRITE: avs_address = cci_dram_wr_req_addr + ((DRAM_ADDR_WIDTH)'(t_cci_rdq_tag'(cci_rdq_dout)));
|
||||
default: avs_address = vx_dram_req_addr[`VX_DRAM_ADDR_WIDTH-1:`VX_DRAM_ADDR_WIDTH-DRAM_ADDR_WIDTH];
|
||||
default: avs_address = vx_dram_req_addr[`VX_DRAM_ADDR_WIDTH-1:`VX_DRAM_ADDR_WIDTH-DRAM_ADDR_WIDTH];
|
||||
endcase
|
||||
|
||||
case (state)
|
||||
CMD_MEM_READ: avs_byteenable = 64'hffffffffffffffff;
|
||||
CMD_MEM_WRITE: avs_byteenable = 64'hffffffffffffffff;
|
||||
default: avs_byteenable = vx_dram_req_byteen_;
|
||||
default: avs_byteenable = vx_dram_req_byteen_;
|
||||
endcase
|
||||
|
||||
case (state)
|
||||
CMD_MEM_WRITE: avs_writedata = cci_rdq_dout[$bits(t_ccip_clData) + $bits(t_cci_rdq_tag)-1:$bits(t_cci_rdq_tag)];
|
||||
default: avs_writedata = (DRAM_LINE_WIDTH)'(vx_dram_req_data) << vx_dram_req_offset;
|
||||
default: avs_writedata = (DRAM_LINE_WIDTH)'(vx_dram_req_data) << vx_dram_req_offset;
|
||||
endcase
|
||||
end
|
||||
|
||||
@@ -506,7 +506,7 @@ assign cmd_write_done = (cci_dram_wr_req_ctr >= cmd_data_size);
|
||||
|
||||
always_ff @(posedge clk)
|
||||
begin
|
||||
if (SoftReset)
|
||||
if (reset)
|
||||
begin
|
||||
mem_bank_select <= 0;
|
||||
avs_burstcount <= 1;
|
||||
@@ -586,7 +586,7 @@ VX_generic_queue #(
|
||||
.SIZE(AVS_RD_QUEUE_SIZE)
|
||||
) avs_rd_req_queue (
|
||||
.clk (clk),
|
||||
.reset (SoftReset),
|
||||
.reset (reset),
|
||||
.push (avs_rtq_push),
|
||||
.data_in ({vx_dram_req_tag, vx_dram_req_offset}),
|
||||
.pop (avs_rtq_pop),
|
||||
@@ -608,7 +608,7 @@ VX_generic_queue #(
|
||||
.SIZE(AVS_RD_QUEUE_SIZE)
|
||||
) avs_rd_rsp_queue (
|
||||
.clk (clk),
|
||||
.reset (SoftReset),
|
||||
.reset (reset),
|
||||
.push (avs_rdq_push),
|
||||
.data_in (avs_readdata),
|
||||
.pop (avs_rdq_pop),
|
||||
@@ -655,7 +655,7 @@ assign af2cp_sTxPort.c0.valid = cci_rd_req_enable && !cci_rd_req_wait;
|
||||
// Send read requests to CCI
|
||||
always_ff @(posedge clk)
|
||||
begin
|
||||
if (SoftReset) begin
|
||||
if (reset) begin
|
||||
cci_rd_req_addr <= 0;
|
||||
cci_rd_req_ctr <= 0;
|
||||
cci_rd_rsp_ctr <= 0;
|
||||
@@ -716,7 +716,7 @@ VX_generic_queue #(
|
||||
.SIZE(CCI_RD_QUEUE_SIZE)
|
||||
) cci_rd_req_queue (
|
||||
.clk (clk),
|
||||
.reset (SoftReset),
|
||||
.reset (reset),
|
||||
.push (cci_rdq_push),
|
||||
.data_in (cci_rdq_din),
|
||||
.pop (cci_rdq_pop),
|
||||
@@ -754,7 +754,7 @@ assign af2cp_sTxPort.c1.valid = cci_wr_req_enable && !avs_rdq_empty;
|
||||
// Send write requests to CCI
|
||||
always_ff @(posedge clk)
|
||||
begin
|
||||
if (SoftReset) begin
|
||||
if (reset) begin
|
||||
cci_wr_req_addr <= 0;
|
||||
cci_wr_req_ctr <= 0;
|
||||
cci_wr_req_enable <= 0;
|
||||
@@ -818,7 +818,7 @@ assign cmd_clflush_done = (0 == snp_rsp_ctr);
|
||||
|
||||
always_ff @(posedge clk)
|
||||
begin
|
||||
if (SoftReset) begin
|
||||
if (reset) begin
|
||||
vx_snp_req_valid <= 0;
|
||||
vx_snp_req_addr <= 0;
|
||||
vx_snp_req_tag <= 0;
|
||||
@@ -866,7 +866,7 @@ begin
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
$display("%t: AFU Snp Rsp: tag=%0d, rem=%0d", $time, vx_snp_rsp_tag, snp_rsp_ctr_next);
|
||||
`endif
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
@@ -887,7 +887,7 @@ assign cmd_csr_done = (STATE_CSR_WRITE == state) ? vx_csr_io_req_ready : vx_csr_
|
||||
|
||||
always_ff @(posedge clk)
|
||||
begin
|
||||
if (SoftReset) begin
|
||||
if (reset) begin
|
||||
csr_io_req_sent <= 0;
|
||||
cmd_csr_rdata <= 0;
|
||||
end
|
||||
@@ -918,7 +918,7 @@ Vortex #() vortex (
|
||||
`SCOPE_SIGNALS_EXECUTE_BIND
|
||||
|
||||
.clk (clk),
|
||||
.reset (SoftReset | vx_reset),
|
||||
.reset (reset | vx_reset),
|
||||
|
||||
// DRAM request
|
||||
.dram_req_valid (vx_dram_req_valid),
|
||||
@@ -980,6 +980,13 @@ Vortex #() vortex (
|
||||
`UNUSED_PIN (ebreak)
|
||||
);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (!reset) begin
|
||||
// DRAM reads should only happen during vortex execution
|
||||
assert(vx_busy || !vx_dram_rd_req_enable);
|
||||
end
|
||||
end
|
||||
|
||||
// SCOPE //////////////////////////////////////////////////////////////////////
|
||||
|
||||
`ifdef SCOPE
|
||||
@@ -1049,7 +1056,7 @@ for (genvar i = 1; i < SCOPE_SR_DEPTH; i++) begin
|
||||
.N (SCOPE_DATAW+2)
|
||||
) scope_sr (
|
||||
.clk (clk),
|
||||
.reset (SoftReset),
|
||||
.reset (reset),
|
||||
.stall (0),
|
||||
.flush (0),
|
||||
.in (scope_data_in_st[i-1]),
|
||||
@@ -1064,7 +1071,7 @@ VX_scope #(
|
||||
.UPDW ($bits({`SCOPE_SIGNALS_UPD_LIST}))
|
||||
) scope (
|
||||
.clk (clk),
|
||||
.reset (SoftReset),
|
||||
.reset (reset),
|
||||
.start (scope_data_in_ste[0]),
|
||||
.stop (0),
|
||||
.changed (scope_data_in_ste[1]),
|
||||
|
||||
@@ -59,6 +59,8 @@
|
||||
`define EXT_F_ENABLE
|
||||
`endif
|
||||
|
||||
//`define FPU_FAST
|
||||
|
||||
// Device identification
|
||||
`define VENDOR_ID 0
|
||||
`define ARCHITECTURE_ID 0
|
||||
|
||||
@@ -51,11 +51,11 @@ module VX_fpu_unit #(
|
||||
.full (fpuq_full)
|
||||
);
|
||||
|
||||
wire valid_in = fpu_req_if.valid && ~fpuq_full;
|
||||
|
||||
// can accept new request?
|
||||
assign fpu_req_if.ready = ready_in && ~fpuq_full;
|
||||
|
||||
wire valid_in = fpu_req_if.valid && ~fpuq_full;
|
||||
|
||||
`ifdef FPU_FAST
|
||||
|
||||
VX_fp_fpga #(
|
||||
@@ -135,6 +135,6 @@ module VX_fpu_unit #(
|
||||
.out ({fpu_commit_if.valid, fpu_commit_if.wid, fpu_commit_if.tmask, fpu_commit_if.PC, fpu_commit_if.rd, fpu_commit_if.wb, fpu_commit_if.data, fpu_commit_if.has_fflags, fpu_commit_if.fflags})
|
||||
);
|
||||
|
||||
assign ready_out = ~stall_out;
|
||||
assign ready_out = ~stall_out;
|
||||
|
||||
endmodule
|
||||
@@ -118,7 +118,7 @@ module VX_ibuffer #(
|
||||
deq_valid_n = 1;
|
||||
deq_wid_n = `NW_BITS'(i);
|
||||
deq_instr_n = q_data_out[i];
|
||||
schedule_table_n[i] = 0;
|
||||
schedule_table_n[i] = 0;
|
||||
break;
|
||||
end
|
||||
end
|
||||
|
||||
@@ -25,12 +25,11 @@ module VX_mul_unit #(
|
||||
wire [`NR_BITS-1:0] rsp_rd;
|
||||
wire rsp_wb;
|
||||
wire [MULQ_BITS-1:0] tag_in, tag_out;
|
||||
wire valid_out;
|
||||
wire stall_out;
|
||||
wire valid_out, ready_out;
|
||||
wire mulq_full;
|
||||
|
||||
wire mulq_push = mul_req_if.valid && mul_req_if.ready;
|
||||
wire mulq_pop = valid_out && ~stall_out;
|
||||
wire mulq_pop = valid_out && ready_out;
|
||||
|
||||
VX_cam_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1),
|
||||
@@ -48,12 +47,18 @@ module VX_mul_unit #(
|
||||
.full (mulq_full)
|
||||
);
|
||||
|
||||
wire valid_in = mul_req_if.valid && ~mulq_full;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] mul_result;
|
||||
wire [MULQ_BITS-1:0] mul_tag;
|
||||
wire is_mul_in = (alu_op == `MUL_MUL);
|
||||
wire is_mul_out;
|
||||
wire stall_mul;
|
||||
wire is_mul_out;
|
||||
|
||||
wire mul_valid_out;
|
||||
wire mul_valid_in = valid_in && !is_div_op;
|
||||
wire mul_ready_in = ready_out || ~mul_valid_out;
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
|
||||
@@ -71,7 +76,7 @@ module VX_mul_unit #(
|
||||
.LATENCY(`LATENCY_IMUL)
|
||||
) multiplier (
|
||||
.clk(clk),
|
||||
.enable(~stall_mul),
|
||||
.enable(mul_ready_in),
|
||||
.dataa(mul_in1),
|
||||
.datab(mul_in2),
|
||||
.result(mul_result_tmp)
|
||||
@@ -80,19 +85,14 @@ module VX_mul_unit #(
|
||||
assign mul_result[i] = is_mul_out ? mul_result_tmp[31:0] : mul_result_tmp[63:32];
|
||||
end
|
||||
|
||||
wire [MULQ_BITS-1:0] mul_tag;
|
||||
wire mul_valid_out;
|
||||
|
||||
wire mul_fire = mul_req_if.valid && mul_req_if.ready && !is_div_op;
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW(1 + MULQ_BITS + 1),
|
||||
.DEPTH(`LATENCY_IMUL)
|
||||
) mul_shift_reg (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
.enable(~stall_mul),
|
||||
.in({mul_fire, tag_in, is_mul_in}),
|
||||
.enable(mul_ready_in),
|
||||
.in({mul_valid_in, tag_in, is_mul_in}),
|
||||
.out({mul_valid_out, mul_tag, is_mul_out})
|
||||
);
|
||||
|
||||
@@ -100,13 +100,13 @@ module VX_mul_unit #(
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] div_result_tmp, rem_result_tmp;
|
||||
|
||||
wire is_div_only = (alu_op == `MUL_DIV) || (alu_op == `MUL_DIVU);
|
||||
wire is_rem_op = (alu_op == `MUL_REM) || (alu_op == `MUL_REMU);
|
||||
wire is_signed_div = (alu_op == `MUL_DIV) || (alu_op == `MUL_REM);
|
||||
wire div_valid_in = mul_req_if.valid && is_div_op;
|
||||
wire div_valid_in = valid_in && is_div_op;
|
||||
wire div_ready_out = ready_out && ~mul_valid_out; // arbitration prioritizes MUL
|
||||
wire div_ready_in;
|
||||
wire div_ready_out;
|
||||
wire div_valid_out;
|
||||
wire is_div_out;
|
||||
wire is_rem_op_out;
|
||||
wire [MULQ_BITS-1:0] div_tag;
|
||||
|
||||
VX_serial_div #(
|
||||
@@ -122,30 +122,25 @@ module VX_mul_unit #(
|
||||
.ready_in(div_ready_in),
|
||||
.valid_in(div_valid_in),
|
||||
.signed_mode(is_signed_div),
|
||||
.tag_in({tag_in, is_div_only}),
|
||||
.tag_in({tag_in, is_rem_op}),
|
||||
.numer(alu_in1),
|
||||
.denom(alu_in2),
|
||||
.quotient(div_result_tmp),
|
||||
.remainder(rem_result_tmp),
|
||||
.ready_out(div_ready_out),
|
||||
.valid_out(div_valid_out),
|
||||
.tag_out({div_tag, is_div_out})
|
||||
.tag_out({div_tag, is_rem_op_out})
|
||||
);
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] div_result = is_div_out ? div_result_tmp : rem_result_tmp;
|
||||
wire [`NUM_THREADS-1:0][31:0] div_result = is_rem_op_out ? rem_result_tmp : div_result_tmp;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire arbiter_hazard = mul_valid_out && div_valid_out;
|
||||
|
||||
assign stall_out = ~mul_commit_if.ready && mul_commit_if.valid;
|
||||
assign stall_mul = (stall_out && !is_div_op) || mulq_full;
|
||||
assign div_ready_out = ~stall_out && ~arbiter_hazard; // arbitration prioritizes MUL
|
||||
wire stall_in = stall_mul || ~div_ready_in;
|
||||
wire stall_out = ~mul_commit_if.ready && mul_commit_if.valid;
|
||||
assign ready_out = ~stall_out;
|
||||
|
||||
assign valid_out = mul_valid_out || div_valid_out;
|
||||
assign tag_out = mul_valid_out ? mul_tag : div_tag;
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] result = mul_valid_out ? mul_result : div_result;
|
||||
|
||||
VX_generic_register #(
|
||||
@@ -160,6 +155,6 @@ module VX_mul_unit #(
|
||||
);
|
||||
|
||||
// can accept new request?
|
||||
assign mul_req_if.ready = ~stall_in;
|
||||
assign mul_req_if.ready = (is_div_op ? div_ready_in : mul_ready_in) && ~mulq_full;
|
||||
|
||||
endmodule
|
||||
@@ -20,9 +20,9 @@ module VX_warp_sched #(
|
||||
wire [31:0] join_pc;
|
||||
wire [`NUM_THREADS-1:0] join_tm;
|
||||
|
||||
reg [`NUM_WARPS-1:0] active_warps; // real active warps (updated when a warp is activated or disabled)
|
||||
reg [`NUM_WARPS-1:0] stalled_warps; // asserted when a branch/gpgpu instructions are issued
|
||||
reg [`NUM_WARPS-1:0] active_warps, active_warps_n; // real active warps (updated when a warp is activated or disabled)
|
||||
reg [`NUM_WARPS-1:0] schedule_table, schedule_table_n; // enforces round-robin, barrier, and non-speculating branches
|
||||
reg [`NUM_WARPS-1:0] stalled_warps; // asserted when a branch/gpgpu instructions are issued
|
||||
|
||||
// Lock warp until instruction decode to resolve branches
|
||||
reg [`NUM_WARPS-1:0] fetch_lock;
|
||||
@@ -46,12 +46,20 @@ module VX_warp_sched #(
|
||||
|
||||
wire ifetch_rsp_fire = ifetch_rsp_if.valid && ifetch_rsp_if.ready;
|
||||
|
||||
always @(*) begin
|
||||
active_warps_n = active_warps;
|
||||
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
|
||||
active_warps_n = warp_ctl_if.wspawn.wmask;
|
||||
end
|
||||
if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
|
||||
active_warps_n[warp_ctl_if.wid] = (warp_ctl_if.tmc.tmask != 0);
|
||||
end
|
||||
end
|
||||
|
||||
always @(*) begin
|
||||
schedule_table_n = schedule_table;
|
||||
if (warp_ctl_if.valid
|
||||
&& warp_ctl_if.tmc.valid
|
||||
&& (0 == warp_ctl_if.tmc.tmask)) begin
|
||||
schedule_table_n[warp_ctl_if.wid] = 0;
|
||||
if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
|
||||
schedule_table_n[warp_ctl_if.wid] = (warp_ctl_if.tmc.tmask != 0);
|
||||
end
|
||||
if (scheduled_warp) begin // remove scheduled warp (round-robin)
|
||||
schedule_table_n[warp_to_schedule] = 0;
|
||||
@@ -82,7 +90,6 @@ module VX_warp_sched #(
|
||||
end
|
||||
end else begin
|
||||
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
|
||||
active_warps <= warp_ctl_if.wspawn.wmask;
|
||||
use_wspawn <= warp_ctl_if.wspawn.wmask & (~`NUM_WARPS'(1));
|
||||
use_wspawn_pc <= warp_ctl_if.wspawn.pc;
|
||||
end
|
||||
@@ -97,9 +104,6 @@ module VX_warp_sched #(
|
||||
end else if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
|
||||
thread_masks[warp_ctl_if.wid] <= warp_ctl_if.tmc.tmask;
|
||||
stalled_warps[warp_ctl_if.wid] <= 0;
|
||||
if (0 == warp_ctl_if.tmc.tmask) begin
|
||||
active_warps[warp_ctl_if.wid] <= 0;
|
||||
end
|
||||
end else if (join_if.valid && !didnt_split) begin
|
||||
if (!join_fall) begin
|
||||
warp_pcs[join_if.wid] <= join_pc;
|
||||
@@ -143,8 +147,10 @@ module VX_warp_sched #(
|
||||
warp_pcs[ifetch_rsp_if.wid] <= ifetch_rsp_if.PC + 4;
|
||||
end
|
||||
|
||||
active_warps <= active_warps_n;
|
||||
|
||||
// reset 'schedule_table' when it goes to zero
|
||||
schedule_table <= (| schedule_table_n) ? schedule_table_n : active_warps;
|
||||
schedule_table <= (| schedule_table_n) ? schedule_table_n : active_warps_n;
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
@@ -51,9 +51,9 @@ module VX_fp_addmul #(
|
||||
.ax(dataa[i]),
|
||||
.ay(datab[i]),
|
||||
.az(),
|
||||
.clk({2'b00,clk}),
|
||||
.ena({2'b11,enable}),
|
||||
.aclr(2'b00),
|
||||
.clk({2'b00, clk}),
|
||||
.ena({2'b00, enable}),
|
||||
.aclr({reset, reset}),
|
||||
.chainin(),
|
||||
// outputs
|
||||
.overflow(),
|
||||
@@ -91,9 +91,9 @@ module VX_fp_addmul #(
|
||||
.ax(dataa[i]),
|
||||
.ay(datab[i]),
|
||||
.az(),
|
||||
.clk({2'b00,clk}),
|
||||
.ena({2'b11,enable}),
|
||||
.aclr(2'b00),
|
||||
.clk({2'b00, clk}),
|
||||
.ena({2'b00, enable}),
|
||||
.aclr({reset, reset}),
|
||||
.chainin(),
|
||||
// outputs
|
||||
.overflow(),
|
||||
@@ -131,9 +131,9 @@ module VX_fp_addmul #(
|
||||
.ax(),
|
||||
.ay(datab[i]),
|
||||
.az(dataa[i]),
|
||||
.clk({2'b00,clk}),
|
||||
.ena({2'b11,enable}),
|
||||
.aclr(2'b00),
|
||||
.clk({2'b00, clk}),
|
||||
.ena({2'b00, enable}),
|
||||
.aclr({reset, reset}),
|
||||
.chainin(),
|
||||
// outputs
|
||||
.overflow(),
|
||||
|
||||
@@ -32,7 +32,7 @@ module VX_fp_div #(
|
||||
`ifdef QUARTUS
|
||||
acl_fdiv fdiv (
|
||||
.clk (clk),
|
||||
.areset (1'b0),
|
||||
.areset (reset),
|
||||
.en (enable),
|
||||
.a (dataa[i]),
|
||||
.b (datab[i]),
|
||||
|
||||
@@ -27,7 +27,7 @@ module VX_fp_fpga #(
|
||||
input wire ready_out,
|
||||
output wire valid_out
|
||||
);
|
||||
localparam NUM_FPC = 8;
|
||||
localparam NUM_FPC = 7;
|
||||
localparam FPC_BITS = `LOG2UP(NUM_FPC);
|
||||
|
||||
wire [NUM_FPC-1:0] per_core_ready_in;
|
||||
@@ -40,28 +40,28 @@ module VX_fp_fpga #(
|
||||
fflags_t [`NUM_THREADS-1:0] fpnew_fflags;
|
||||
|
||||
reg [FPC_BITS-1:0] core_select;
|
||||
reg do_sub, do_mul;
|
||||
reg do_sub, do_mul, do_neg;
|
||||
reg is_signed;
|
||||
|
||||
always @(*) begin
|
||||
core_select = 'x;
|
||||
do_sub = 'x;
|
||||
do_mul = 'x;
|
||||
is_signed = 'x;
|
||||
do_sub = 'x;
|
||||
do_mul = 'x;
|
||||
do_neg = 'x;
|
||||
is_signed = 'x;
|
||||
case (op_type)
|
||||
`FPU_ADD: begin core_select = 1; do_mul = 0; do_sub = 0; end
|
||||
`FPU_SUB: begin core_select = 1; do_mul = 0; do_sub = 1; end
|
||||
`FPU_MUL: begin core_select = 1; do_mul = 1; do_sub = 0; end
|
||||
`FPU_MADD: begin core_select = 2; do_sub = 0; end
|
||||
`FPU_MSUB: begin core_select = 2; do_sub = 1; end
|
||||
`FPU_NMADD: begin core_select = 3; do_sub = 0; end
|
||||
`FPU_NMSUB: begin core_select = 3; do_sub = 1; end
|
||||
`FPU_DIV: begin core_select = 4; end
|
||||
`FPU_SQRT: begin core_select = 5; end
|
||||
`FPU_CVTWS: begin core_select = 6; is_signed = 1; end
|
||||
`FPU_CVTWUS: begin core_select = 6; is_signed = 0; end
|
||||
`FPU_CVTSW: begin core_select = 7; is_signed = 1; end
|
||||
`FPU_CVTSWU: begin core_select = 7; is_signed = 0; end
|
||||
`FPU_MADD: begin core_select = 2; do_sub = 0; do_neg = 0; end
|
||||
`FPU_MSUB: begin core_select = 2; do_sub = 1; do_neg = 0; end
|
||||
`FPU_NMADD: begin core_select = 2; do_sub = 0; do_neg = 1; end
|
||||
`FPU_NMSUB: begin core_select = 2; do_sub = 1; do_neg = 1; end
|
||||
`FPU_DIV: begin core_select = 3; end
|
||||
`FPU_SQRT: begin core_select = 4; end
|
||||
`FPU_CVTWS: begin core_select = 5; is_signed = 1; end
|
||||
`FPU_CVTWUS: begin core_select = 5; is_signed = 0; end
|
||||
`FPU_CVTSW: begin core_select = 6; is_signed = 1; end
|
||||
`FPU_CVTSWU: begin core_select = 6; is_signed = 0; end
|
||||
default: begin core_select = 0; end
|
||||
endcase
|
||||
end
|
||||
@@ -116,6 +116,7 @@ module VX_fp_fpga #(
|
||||
.ready_in (per_core_ready_in[2]),
|
||||
.tag_in (tag_in),
|
||||
.do_sub (do_sub),
|
||||
.do_neg (do_neg),
|
||||
.dataa (dataa),
|
||||
.datab (datab),
|
||||
.datac (datac),
|
||||
@@ -125,40 +126,21 @@ module VX_fp_fpga #(
|
||||
.valid_out (per_core_valid_out[2])
|
||||
);
|
||||
|
||||
VX_fp_nmadd #(
|
||||
.TAGW (TAGW),
|
||||
.LANES(`NUM_THREADS)
|
||||
) fp_nmadd (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 3)),
|
||||
.ready_in (per_core_ready_in[3]),
|
||||
.tag_in (tag_in),
|
||||
.do_sub (do_sub),
|
||||
.dataa (dataa),
|
||||
.datab (datab),
|
||||
.datac (datac),
|
||||
.result (per_core_result[3]),
|
||||
.tag_out (per_core_tag_out[3]),
|
||||
.ready_out (per_core_ready_out[3]),
|
||||
.valid_out (per_core_valid_out[3])
|
||||
);
|
||||
|
||||
VX_fp_div #(
|
||||
.TAGW (TAGW),
|
||||
.LANES(`NUM_THREADS)
|
||||
) fp_div (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 4)),
|
||||
.ready_in (per_core_ready_in[4]),
|
||||
.valid_in (valid_in && (core_select == 3)),
|
||||
.ready_in (per_core_ready_in[3]),
|
||||
.tag_in (tag_in),
|
||||
.dataa (dataa),
|
||||
.datab (datab),
|
||||
.result (per_core_result[4]),
|
||||
.tag_out (per_core_tag_out[4]),
|
||||
.ready_out (per_core_ready_out[4]),
|
||||
.valid_out (per_core_valid_out[4])
|
||||
.result (per_core_result[3]),
|
||||
.tag_out (per_core_tag_out[3]),
|
||||
.ready_out (per_core_ready_out[3]),
|
||||
.valid_out (per_core_valid_out[3])
|
||||
);
|
||||
|
||||
VX_fp_sqrt #(
|
||||
@@ -167,14 +149,14 @@ module VX_fp_fpga #(
|
||||
) fp_sqrt (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 5)),
|
||||
.ready_in (per_core_ready_in[5]),
|
||||
.valid_in (valid_in && (core_select == 4)),
|
||||
.ready_in (per_core_ready_in[4]),
|
||||
.tag_in (tag_in),
|
||||
.dataa (dataa),
|
||||
.result (per_core_result[5]),
|
||||
.tag_out (per_core_tag_out[5]),
|
||||
.ready_out (per_core_ready_out[5]),
|
||||
.valid_out (per_core_valid_out[5])
|
||||
.result (per_core_result[4]),
|
||||
.tag_out (per_core_tag_out[4]),
|
||||
.ready_out (per_core_ready_out[4]),
|
||||
.valid_out (per_core_valid_out[4])
|
||||
);
|
||||
|
||||
VX_fp_ftoi #(
|
||||
@@ -183,32 +165,32 @@ module VX_fp_fpga #(
|
||||
) fp_ftoi (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 6)),
|
||||
.ready_in (per_core_ready_in[6]),
|
||||
.valid_in (valid_in && (core_select == 5)),
|
||||
.ready_in (per_core_ready_in[5]),
|
||||
.tag_in (tag_in),
|
||||
.is_signed (is_signed),
|
||||
.dataa (dataa),
|
||||
.result (per_core_result[6]),
|
||||
.tag_out (per_core_tag_out[6]),
|
||||
.ready_out (per_core_ready_out[6]),
|
||||
.valid_out (per_core_valid_out[6])
|
||||
.result (per_core_result[5]),
|
||||
.tag_out (per_core_tag_out[5]),
|
||||
.ready_out (per_core_ready_out[5]),
|
||||
.valid_out (per_core_valid_out[5])
|
||||
);
|
||||
|
||||
VX_fp_itof #(
|
||||
.TAGW (TAGW),
|
||||
.LANES(`NUM_THREADS)
|
||||
) fp_itof (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 7)),
|
||||
.ready_in (per_core_ready_in[7]),
|
||||
.tag_in (tag_in),
|
||||
.is_signed (is_signed),
|
||||
.dataa (dataa),
|
||||
.result (per_core_result[7]),
|
||||
.tag_out (per_core_tag_out[7]),
|
||||
.ready_out (per_core_ready_out[7]),
|
||||
.valid_out (per_core_valid_out[7])
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 6)),
|
||||
.ready_in (per_core_ready_in[6]),
|
||||
.tag_in (tag_in),
|
||||
.is_signed (is_signed),
|
||||
.dataa (dataa),
|
||||
.result (per_core_result[6]),
|
||||
.tag_out (per_core_tag_out[6]),
|
||||
.ready_out (per_core_ready_out[6]),
|
||||
.valid_out (per_core_valid_out[6])
|
||||
);
|
||||
|
||||
reg valid_out_n;
|
||||
@@ -234,7 +216,7 @@ module VX_fp_fpga #(
|
||||
end
|
||||
end
|
||||
|
||||
assign ready_in = (& per_core_ready_in);
|
||||
assign ready_in = per_core_ready_in[core_select];
|
||||
assign valid_out = valid_out_n;
|
||||
assign has_fflags = has_fflags_n;
|
||||
assign tag_out = tag_out_n;
|
||||
|
||||
@@ -39,7 +39,7 @@ module VX_fp_ftoi #(
|
||||
`ifdef QUARTUS
|
||||
acl_ftoi ftoi (
|
||||
.clk (clk),
|
||||
.areset (1'b0),
|
||||
.areset (reset),
|
||||
.en (enable),
|
||||
.a (dataa[i]),
|
||||
.q (result_s)
|
||||
@@ -47,7 +47,7 @@ module VX_fp_ftoi #(
|
||||
|
||||
acl_ftou ftou (
|
||||
.clk (clk),
|
||||
.areset (1'b0),
|
||||
.areset (reset),
|
||||
.en (enable),
|
||||
.a (dataa[i]),
|
||||
.q (result_u)
|
||||
|
||||
@@ -39,7 +39,7 @@ module VX_fp_itof #(
|
||||
`ifdef QUARTUS
|
||||
acl_itof itof (
|
||||
.clk (clk),
|
||||
.areset (1'b0),
|
||||
.areset (reset),
|
||||
.en (enable),
|
||||
.a (dataa[i]),
|
||||
.q (result_s)
|
||||
@@ -47,7 +47,7 @@ module VX_fp_itof #(
|
||||
|
||||
acl_utof utof (
|
||||
.clk (clk),
|
||||
.areset (1'b0),
|
||||
.areset (reset),
|
||||
.en (enable),
|
||||
.a (dataa[i]),
|
||||
.q (result_u)
|
||||
|
||||
@@ -17,7 +17,8 @@ module VX_fp_madd #(
|
||||
input wire [TAGW-1:0] tag_in,
|
||||
|
||||
input wire do_sub,
|
||||
|
||||
input wire do_neg,
|
||||
|
||||
input wire [LANES-1:0][31:0] dataa,
|
||||
input wire [LANES-1:0][31:0] datab,
|
||||
input wire [LANES-1:0][31:0] datac,
|
||||
@@ -32,7 +33,7 @@ module VX_fp_madd #(
|
||||
wire stall = ~ready_out && valid_out;
|
||||
wire enable = ~stall;
|
||||
|
||||
reg do_sub_r;
|
||||
reg do_sub_r, do_neg_r;
|
||||
|
||||
for (genvar i = 0; i < LANES; i++) begin
|
||||
|
||||
@@ -50,9 +51,9 @@ module VX_fp_madd #(
|
||||
.ax(datac[i]),
|
||||
.ay(datab[i]),
|
||||
.az(dataa[i]),
|
||||
.clk({2'b00,clk}),
|
||||
.ena({2'b11,enable}),
|
||||
.aclr(2'b00),
|
||||
.clk({2'b00, clk}),
|
||||
.ena({2'b00, enable}),
|
||||
.aclr({reset, reset}),
|
||||
.chainin(),
|
||||
// outputs
|
||||
.overflow(),
|
||||
@@ -90,9 +91,9 @@ module VX_fp_madd #(
|
||||
.ax(datac[i]),
|
||||
.ay(datab[i]),
|
||||
.az(dataa[i]),
|
||||
.clk({2'b00,clk}),
|
||||
.ena({2'b11,enable}),
|
||||
.aclr(2'b00),
|
||||
.clk({2'b00, clk}),
|
||||
.ena({2'b00, enable}),
|
||||
.aclr({reset, reset}),
|
||||
.chainin(),
|
||||
// outputs
|
||||
.overflow(),
|
||||
@@ -126,18 +127,20 @@ module VX_fp_madd #(
|
||||
end
|
||||
`endif
|
||||
|
||||
assign result[i] = do_sub_r ? result_msub : result_madd;
|
||||
wire [31:0] result_unqual = do_sub_r ? result_msub : result_madd;
|
||||
assign result[i][31] = result_unqual[31] ^ do_neg_r;
|
||||
assign result[i][30:0] = result_unqual[30:0];
|
||||
end
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW(TAGW + 1 + 1),
|
||||
.DATAW(TAGW + 1 + 1 + 1),
|
||||
.DEPTH(`LATENCY_FMADD)
|
||||
) shift_reg (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
.enable(enable),
|
||||
.in({tag_in, valid_in, do_sub}),
|
||||
.out({tag_out, valid_out, do_sub_r})
|
||||
.in({tag_in, valid_in, do_sub, do_neg}),
|
||||
.out({tag_out, valid_out, do_sub_r, do_neg_r})
|
||||
);
|
||||
|
||||
assign ready_in = enable;
|
||||
|
||||
@@ -1,197 +0,0 @@
|
||||
`include "VX_define.vh"
|
||||
|
||||
`ifndef SYNTHESIS
|
||||
`include "float_dpi.vh"
|
||||
`endif
|
||||
|
||||
module VX_fp_nmadd #(
|
||||
parameter TAGW = 1,
|
||||
parameter LANES = 1
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
output wire ready_in,
|
||||
input wire valid_in,
|
||||
|
||||
input wire [TAGW-1:0] tag_in,
|
||||
|
||||
input wire do_sub,
|
||||
|
||||
input wire [LANES-1:0][31:0] dataa,
|
||||
input wire [LANES-1:0][31:0] datab,
|
||||
input wire [LANES-1:0][31:0] datac,
|
||||
output wire [LANES-1:0][31:0] result,
|
||||
|
||||
output wire [TAGW-1:0] tag_out,
|
||||
|
||||
input wire ready_out,
|
||||
output wire valid_out
|
||||
);
|
||||
|
||||
wire stall = ~ready_out && valid_out;
|
||||
wire enable = ~stall;
|
||||
|
||||
reg do_sub_r;
|
||||
|
||||
// Per-lane datapath, replicated LANES times.
// For lane i, both a multiply-add and a multiply-subtract are evaluated on
// (dataa[i], datab[i], datac[i]); do_sub_r picks one of them (result_st0),
// and a final add/subtract stage combines that pick with a constant zero to
// drive result[i].
// NOTE(review): the final stage presumably computes 0 - result_st0 (negation)
// — confirm operand order for both the Quartus primitive and dpi_fsub.
for (genvar i = 0; i < LANES; i++) begin
|
||||
|
||||
// First-stage candidates: output of the multiply-add unit ...
wire [31:0] result_madd;
|
||||
// ... and of the multiply-subtract unit.
wire [31:0] result_msub;
|
||||
|
||||
// First-stage result: msub when the delayed do_sub is set, madd otherwise.
wire [31:0] result_st0 = do_sub_r ? result_msub : result_madd;
|
||||
|
||||
// Synthesis path: Intel hard floating-point DSP primitives.
`ifdef QUARTUS
|
||||
// Multiply-add unit (operation_mode "sp_mult_add", adder_subtract "false").
// Operands: ax = datac[i], ay = datab[i], az = dataa[i].
// NOTE(review): presumably computes ay * az + ax — confirm against the
// primitive's documentation.
twentynm_fp_mac mac_fp_madd (
|
||||
// inputs
|
||||
.accumulate(),
|
||||
.chainin_overflow(),
|
||||
.chainin_invalid(),
|
||||
.chainin_underflow(),
|
||||
.chainin_inexact(),
|
||||
.ax(datac[i]),
|
||||
.ay(datab[i]),
|
||||
.az(dataa[i]),
|
||||
// Only bit 0 of the 3-bit clk/ena buses carries the module clock/enable;
// the *_clock defparams below select "0" or "none".
.clk({2'b00,clk}),
|
||||
.ena({2'b11,enable}),
|
||||
.aclr(2'b00),
|
||||
.chainin(),
|
||||
// outputs
|
||||
.overflow(),
|
||||
.invalid(),
|
||||
.underflow(),
|
||||
.inexact(),
|
||||
.chainout_overflow(),
|
||||
.chainout_invalid(),
|
||||
.chainout_underflow(),
|
||||
.chainout_inexact(),
|
||||
.resulta(result_madd),
|
||||
.chainout()
|
||||
);
|
||||
defparam mac_fp_madd.operation_mode = "sp_mult_add";
|
||||
defparam mac_fp_madd.use_chainin = "false";
|
||||
defparam mac_fp_madd.adder_subtract = "false";
|
||||
defparam mac_fp_madd.ax_clock = "0";
|
||||
defparam mac_fp_madd.ay_clock = "0";
|
||||
defparam mac_fp_madd.az_clock = "0";
|
||||
defparam mac_fp_madd.output_clock = "0";
|
||||
defparam mac_fp_madd.accumulate_clock = "none";
|
||||
defparam mac_fp_madd.ax_chainin_pl_clock = "0";
|
||||
defparam mac_fp_madd.accum_pipeline_clock = "none";
|
||||
defparam mac_fp_madd.mult_pipeline_clock = "0";
|
||||
defparam mac_fp_madd.adder_input_clock = "0";
|
||||
defparam mac_fp_madd.accum_adder_clock = "none";
|
||||
|
||||
// Multiply-subtract unit: same wiring and clocking as mac_fp_madd, but with
// adder_subtract set to "true".
// NOTE(review): presumably computes ay * az - ax — confirm.
twentynm_fp_mac mac_fp_msub (
|
||||
// inputs
|
||||
.accumulate(),
|
||||
.chainin_overflow(),
|
||||
.chainin_invalid(),
|
||||
.chainin_underflow(),
|
||||
.chainin_inexact(),
|
||||
.ax(datac[i]),
|
||||
.ay(datab[i]),
|
||||
.az(dataa[i]),
|
||||
.clk({2'b00,clk}),
|
||||
.ena({2'b11,enable}),
|
||||
.aclr(2'b00),
|
||||
.chainin(),
|
||||
// outputs
|
||||
.overflow(),
|
||||
.invalid(),
|
||||
.underflow(),
|
||||
.inexact(),
|
||||
.chainout_overflow(),
|
||||
.chainout_invalid(),
|
||||
.chainout_underflow(),
|
||||
.chainout_inexact(),
|
||||
.resulta(result_msub),
|
||||
.chainout()
|
||||
);
|
||||
defparam mac_fp_msub.operation_mode = "sp_mult_add";
|
||||
defparam mac_fp_msub.use_chainin = "false";
|
||||
defparam mac_fp_msub.adder_subtract = "true";
|
||||
defparam mac_fp_msub.ax_clock = "0";
|
||||
defparam mac_fp_msub.ay_clock = "0";
|
||||
defparam mac_fp_msub.az_clock = "0";
|
||||
defparam mac_fp_msub.output_clock = "0";
|
||||
defparam mac_fp_msub.accumulate_clock = "none";
|
||||
defparam mac_fp_msub.ax_chainin_pl_clock = "0";
|
||||
defparam mac_fp_msub.accum_pipeline_clock = "none";
|
||||
defparam mac_fp_msub.mult_pipeline_clock = "0";
|
||||
defparam mac_fp_msub.adder_input_clock = "0";
|
||||
defparam mac_fp_msub.accum_adder_clock = "none";
|
||||
|
||||
// Second stage: adder-only unit (operation_mode "sp_add", adder_subtract
// "true") with ax tied to zero and ay = result_st0; az is left unconnected
// and the multiplier pipeline clock is "none". Drives result[i].
// NOTE(review): intended as a negation of result_st0; whether the primitive
// evaluates ax - ay or ay - ax is not visible here — confirm.
twentynm_fp_mac mac_fp_neg (
|
||||
// inputs
|
||||
.accumulate(),
|
||||
.chainin_overflow(),
|
||||
.chainin_invalid(),
|
||||
.chainin_underflow(),
|
||||
.chainin_inexact(),
|
||||
.ax(32'h0),
|
||||
.ay(result_st0),
|
||||
.az(),
|
||||
.clk({2'b00,clk}),
|
||||
.ena({2'b11,enable}),
|
||||
.aclr(2'b00),
|
||||
.chainin(),
|
||||
// outputs
|
||||
.overflow(),
|
||||
.invalid(),
|
||||
.underflow(),
|
||||
.inexact(),
|
||||
.chainout_overflow(),
|
||||
.chainout_invalid(),
|
||||
.chainout_underflow(),
|
||||
.chainout_inexact(),
|
||||
.resulta(result[i]),
|
||||
.chainout()
|
||||
);
|
||||
defparam mac_fp_neg.operation_mode = "sp_add";
|
||||
defparam mac_fp_neg.use_chainin = "false";
|
||||
defparam mac_fp_neg.adder_subtract = "true";
|
||||
defparam mac_fp_neg.ax_clock = "0";
|
||||
defparam mac_fp_neg.ay_clock = "0";
|
||||
defparam mac_fp_neg.az_clock = "none";
|
||||
defparam mac_fp_neg.output_clock = "0";
|
||||
defparam mac_fp_neg.accumulate_clock = "none";
|
||||
defparam mac_fp_neg.ax_chainin_pl_clock = "none";
|
||||
defparam mac_fp_neg.accum_pipeline_clock = "none";
|
||||
defparam mac_fp_neg.mult_pipeline_clock = "none";
|
||||
defparam mac_fp_neg.adder_input_clock = "0";
|
||||
defparam mac_fp_neg.accum_adder_clock = "none";
|
||||
// Non-Quartus path: the same three stages expressed as DPI calls.
`else
|
||||
// Each call receives a distinct first argument (5*LANES+i, 6*LANES+i,
// 7*LANES+i), the shared enable, the operands, and a destination signal.
// NOTE(review): the first argument is presumably a per-instance id and the
// last argument the DPI output — confirm against the DPI declarations.
// NOTE(review): result_madd / result_msub are declared as wires but appear
// as call destinations inside a procedural block — confirm tool support.
always @(posedge clk) begin
|
||||
dpi_fmadd(5*LANES+i, enable, dataa[i], datab[i], datac[i], result_madd);
|
||||
dpi_fmsub(6*LANES+i, enable, dataa[i], datab[i], datac[i], result_msub);
|
||||
dpi_fsub(7*LANES+i, enable, 32'b0, result_st0, result[i]);
|
||||
end
|
||||
`endif
|
||||
end
|
||||
|
||||
// Delay line for the 1-bit do_sub selector: DEPTH = `LATENCY_FMADD, advanced
// by the same enable as the datapath. Its output do_sub_r selects between
// result_msub and result_madd.
// NOTE(review): assumes the first-stage MAC/DPI latency equals `LATENCY_FMADD
// so the selector lines up with the results — confirm.
VX_shift_register #(
|
||||
.DATAW(1),
|
||||
.DEPTH(`LATENCY_FMADD)
|
||||
) shift_reg0 (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
.enable(enable),
|
||||
.in({do_sub}),
|
||||
.out({do_sub_r})
|
||||
);
|
||||
|
||||
// Delay line for the request tag and valid bit: {tag_in, valid_in} enters,
// {tag_out, valid_out} leaves after `LATENCY_FMADD + `LATENCY_FADDMUL stages,
// advanced by the same enable as the datapath.
// NOTE(review): the depth presumably matches first-stage latency plus the
// latency of the final add/subtract stage — confirm.
VX_shift_register #(
|
||||
.DATAW(TAGW + 1),
|
||||
.DEPTH(`LATENCY_FMADD + `LATENCY_FADDMUL)
|
||||
) shift_reg1 (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
.enable(enable),
|
||||
.in({tag_in, valid_in}),
|
||||
.out({tag_out, valid_out})
|
||||
);
|
||||
|
||||
// Accept new input exactly when the pipeline is allowed to advance
// (enable = ~stall).
assign ready_in = enable;
|
||||
|
||||
endmodule
|
||||
@@ -31,7 +31,7 @@ module VX_fp_sqrt #(
|
||||
`ifdef QUARTUS
|
||||
acl_fsqrt fsqrt (
|
||||
.clk (clk),
|
||||
.areset (1'b0),
|
||||
.areset (reset),
|
||||
.en (enable),
|
||||
.a (dataa[i]),
|
||||
.q (result[i])
|
||||
|
||||
@@ -91,6 +91,7 @@ module VX_fpnew #(
|
||||
fpu_operands[0] = dataa;
|
||||
fpu_operands[1] = datab;
|
||||
fpu_operands[2] = datac;
|
||||
|
||||
case (op_type)
|
||||
`FPU_ADD: begin
|
||||
fpu_op = fpnew_pkg::ADD;
|
||||
@@ -107,23 +108,23 @@ module VX_fpnew #(
|
||||
`FPU_DIV: begin fpu_op = fpnew_pkg::DIV; end
|
||||
`FPU_SQRT: begin fpu_op = fpnew_pkg::SQRT; end
|
||||
`FPU_MADD: begin fpu_op = fpnew_pkg::FMADD; end
|
||||
`FPU_MSUB: begin fpu_op = fpnew_pkg::FMADD; fpu_op_mod = 1; end
|
||||
`FPU_NMSUB: begin fpu_op = fpnew_pkg::FNMSUB; end
|
||||
`FPU_MSUB: begin fpu_op = fpnew_pkg::FMADD; fpu_op_mod = 1; end
|
||||
`FPU_NMADD: begin fpu_op = fpnew_pkg::FNMSUB; fpu_op_mod = 1; end
|
||||
`FPU_NMSUB: begin fpu_op = fpnew_pkg::FNMSUB; end
|
||||
`FPU_CVTWS: begin fpu_op = fpnew_pkg::F2I; end
|
||||
`FPU_CVTWUS:begin fpu_op = fpnew_pkg::F2I; fpu_op_mod = 1; end
|
||||
`FPU_CVTWUS:begin fpu_op = fpnew_pkg::F2I; fpu_op_mod = 1; end
|
||||
`FPU_CVTSW: begin fpu_op = fpnew_pkg::I2F; end
|
||||
`FPU_CVTSWU:begin fpu_op = fpnew_pkg::I2F; fpu_op_mod = 1; end
|
||||
`FPU_CVTSWU:begin fpu_op = fpnew_pkg::I2F; fpu_op_mod = 1; end
|
||||
`FPU_CLASS: begin fpu_op = fpnew_pkg::CLASSIFY; fpu_has_fflags = 0; end
|
||||
`FPU_CMP: begin fpu_op = fpnew_pkg::CMP; end
|
||||
`FPU_MISC: begin
|
||||
case (frm)
|
||||
0: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RNE; fpu_has_fflags = 0; end
|
||||
1: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RTZ; fpu_has_fflags = 0; end
|
||||
2: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RDN; fpu_has_fflags = 0; end
|
||||
3: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RNE; end
|
||||
4: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RTZ; end
|
||||
default: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RUP; fpu_has_fflags = 0; end
|
||||
0: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RNE; fpu_has_fflags = 0; end
|
||||
1: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RTZ; fpu_has_fflags = 0; end
|
||||
2: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RDN; fpu_has_fflags = 0; end
|
||||
3: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RNE; end
|
||||
4: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RTZ; end
|
||||
default: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RUP; fpu_has_fflags = 0; end
|
||||
endcase
|
||||
end
|
||||
default:;
|
||||
|
||||
Reference in New Issue
Block a user