Gather FPGA perf stats
This commit is contained in:
@@ -30,28 +30,6 @@
|
|||||||
_ret; \
|
_ret; \
|
||||||
})
|
})
|
||||||
|
|
||||||
/*#include <cstdint>
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
int _pocl_register_kernel(const char* name, const void* pfn, uint32_t num_args, uint32_t num_locals, const uint8_t* arg_types, const uint32_t* local_sizes);
|
|
||||||
void _pocl_kernel_vecadd_workgroup(uint8_t* args, uint8_t*, uint32_t, uint32_t, uint32_t);
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
namespace {
|
|
||||||
class auto_register_kernel_t {
|
|
||||||
public:
|
|
||||||
auto_register_kernel_t() {
|
|
||||||
static uint8_t arg_types[] = {1, 1, 1};
|
|
||||||
static uint32_t local_sizes[] = {};
|
|
||||||
_pocl_register_kernel("vecadd", (void*)_pocl_kernel_vecadd_workgroup, 3, 0, arg_types, local_sizes);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
static auto_register_kernel_t __x__;
|
|
||||||
}*/
|
|
||||||
|
|
||||||
int exitcode = 0;
|
int exitcode = 0;
|
||||||
cl_context context = NULL;
|
cl_context context = NULL;
|
||||||
cl_command_queue commandQueue = NULL;
|
cl_command_queue commandQueue = NULL;
|
||||||
|
|||||||
@@ -58,10 +58,10 @@ int vx_start(vx_device_h hdevice);
|
|||||||
int vx_ready_wait(vx_device_h hdevice, long long timeout);
|
int vx_ready_wait(vx_device_h hdevice, long long timeout);
|
||||||
|
|
||||||
// set device constant registers
|
// set device constant registers
|
||||||
int vx_csr_set(vx_device_h hdevice, int core, int address, int value);
|
int vx_csr_set(vx_device_h hdevice, int core, int address, unsigned value);
|
||||||
|
|
||||||
// get device constant registers
|
// get device constant registers
|
||||||
int vx_csr_get(vx_device_h hdevice, int core, int address, int* value);
|
int vx_csr_get(vx_device_h hdevice, int core, int address, unsigned* value);
|
||||||
|
|
||||||
////////////////////////////// UTILITY FUNCIONS ///////////////////////////////
|
////////////////////////////// UTILITY FUNCIONS ///////////////////////////////
|
||||||
|
|
||||||
|
|||||||
@@ -53,10 +53,10 @@
|
|||||||
typedef struct vx_device_ {
|
typedef struct vx_device_ {
|
||||||
fpga_handle fpga;
|
fpga_handle fpga;
|
||||||
size_t mem_allocation;
|
size_t mem_allocation;
|
||||||
int implementation_id;
|
unsigned implementation_id;
|
||||||
int num_cores;
|
unsigned num_cores;
|
||||||
int num_warps;
|
unsigned num_warps;
|
||||||
int num_threads;
|
unsigned num_threads;
|
||||||
} vx_device_t;
|
} vx_device_t;
|
||||||
|
|
||||||
typedef struct vx_buffer_ {
|
typedef struct vx_buffer_ {
|
||||||
@@ -181,6 +181,9 @@ extern int vx_dev_open(vx_device_h* hdevice) {
|
|||||||
fpgaClose(accel_handle);
|
fpgaClose(accel_handle);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fprintf(stdout, "DEVCAPS: version=%d, num_cores=%d, num_warps=%d, num_threads=%d\n",
|
||||||
|
device->implementation_id, device->num_cores, device->num_warps, device->num_threads);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef SCOPE
|
#ifdef SCOPE
|
||||||
@@ -208,6 +211,29 @@ extern int vx_dev_close(vx_device_h hdevice) {
|
|||||||
vx_scope_stop(device->fpga, 0);
|
vx_scope_stop(device->fpga, 0);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
{
|
||||||
|
// Dump performance stats
|
||||||
|
uint64_t instrs, cycles;
|
||||||
|
unsigned value;
|
||||||
|
|
||||||
|
int ret = 0;
|
||||||
|
ret |= vx_csr_get(hdevice, 0, CSR_INSTR_H, &value);
|
||||||
|
instrs = value;
|
||||||
|
ret |= vx_csr_get(hdevice, 0, CSR_INSTR_L, &value);
|
||||||
|
instrs = (instrs << 32) | value;
|
||||||
|
|
||||||
|
ret |= vx_csr_get(hdevice, 0, CSR_CYCLE_H, &value);
|
||||||
|
cycles = value;
|
||||||
|
ret |= vx_csr_get(hdevice, 0, CSR_CYCLE_L, &value);
|
||||||
|
cycles = (cycles << 32) | value;
|
||||||
|
|
||||||
|
float IPC = (float)(double(instrs) / double(cycles));
|
||||||
|
|
||||||
|
fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
|
||||||
|
|
||||||
|
assert(ret == 0);
|
||||||
|
}
|
||||||
|
|
||||||
fpgaClose(device->fpga);
|
fpgaClose(device->fpga);
|
||||||
|
|
||||||
free(device);
|
free(device);
|
||||||
@@ -468,7 +494,7 @@ extern int vx_start(vx_device_h hdevice) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// set device constant registers
|
// set device constant registers
|
||||||
extern int vx_csr_set(vx_device_h hdevice, int core, int address, int value) {
|
extern int vx_csr_set(vx_device_h hdevice, int core, int address, unsigned value) {
|
||||||
if (nullptr == hdevice)
|
if (nullptr == hdevice)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
@@ -488,7 +514,7 @@ extern int vx_csr_set(vx_device_h hdevice, int core, int address, int value) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// get device constant registers
|
// get device constant registers
|
||||||
extern int vx_csr_get(vx_device_h hdevice, int core, int address, int* value) {
|
extern int vx_csr_get(vx_device_h hdevice, int core, int address, unsigned* value) {
|
||||||
if (nullptr == hdevice || nullptr == value)
|
if (nullptr == hdevice || nullptr == value)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
@@ -510,7 +536,7 @@ extern int vx_csr_get(vx_device_h hdevice, int core, int address, int* value) {
|
|||||||
|
|
||||||
uint64_t value64;
|
uint64_t value64;
|
||||||
CHECK_RES(fpgaReadMMIO64(device->fpga, 0, MMIO_CSR_READ, &value64));
|
CHECK_RES(fpgaReadMMIO64(device->fpga, 0, MMIO_CSR_READ, &value64));
|
||||||
*value = (int)value64;
|
*value = (unsigned)value64;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
Binary file not shown.
@@ -881,7 +881,6 @@ assign vx_csr_io_req_rw = (STATE_CSR_WRITE == state);
|
|||||||
assign vx_csr_io_req_addr = cmd_csr_addr;
|
assign vx_csr_io_req_addr = cmd_csr_addr;
|
||||||
assign vx_csr_io_req_data = cmd_csr_wdata;
|
assign vx_csr_io_req_data = cmd_csr_wdata;
|
||||||
|
|
||||||
assign cmd_csr_rdata = vx_csr_io_rsp_data;
|
|
||||||
assign vx_csr_io_rsp_ready = 1;
|
assign vx_csr_io_rsp_ready = 1;
|
||||||
|
|
||||||
assign cmd_csr_done = (STATE_CSR_WRITE == state) ? vx_csr_io_req_ready : vx_csr_io_rsp_valid;
|
assign cmd_csr_done = (STATE_CSR_WRITE == state) ? vx_csr_io_req_ready : vx_csr_io_rsp_valid;
|
||||||
@@ -890,6 +889,7 @@ always_ff @(posedge clk)
|
|||||||
begin
|
begin
|
||||||
if (SoftReset) begin
|
if (SoftReset) begin
|
||||||
csr_io_req_sent <= 0;
|
csr_io_req_sent <= 0;
|
||||||
|
cmd_csr_rdata <= 0;
|
||||||
end
|
end
|
||||||
else begin
|
else begin
|
||||||
if (vx_csr_io_req_valid && vx_csr_io_req_ready) begin
|
if (vx_csr_io_req_valid && vx_csr_io_req_ready) begin
|
||||||
@@ -898,6 +898,11 @@ begin
|
|||||||
if (cmd_csr_done) begin
|
if (cmd_csr_done) begin
|
||||||
csr_io_req_sent <= 0;
|
csr_io_req_sent <= 0;
|
||||||
end
|
end
|
||||||
|
if ((STATE_CSR_READ == state)
|
||||||
|
&& vx_csr_io_rsp_ready
|
||||||
|
&& vx_csr_io_rsp_valid) begin
|
||||||
|
cmd_csr_rdata <= vx_csr_io_rsp_data;
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|||||||
@@ -119,7 +119,7 @@ module VX_alu_unit (
|
|||||||
VX_mult #(
|
VX_mult #(
|
||||||
.WIDTHA(33),
|
.WIDTHA(33),
|
||||||
.WIDTHB(33),
|
.WIDTHB(33),
|
||||||
.WIDTHP(64),
|
.WIDTHP(66),
|
||||||
.SIGNED(1),
|
.SIGNED(1),
|
||||||
.PIPELINE(`MUL_LATENCY)
|
.PIPELINE(`MUL_LATENCY)
|
||||||
) multiplier (
|
) multiplier (
|
||||||
|
|||||||
@@ -20,23 +20,20 @@ module VX_csr_arb (
|
|||||||
|
|
||||||
wire pick_core = (| csr_core_req_if.valid);
|
wire pick_core = (| csr_core_req_if.valid);
|
||||||
|
|
||||||
// Which request to pick
|
|
||||||
assign issued_csr_req_if.is_io = !pick_core;
|
|
||||||
|
|
||||||
// Mux between core and io
|
// Mux between core and io
|
||||||
assign issued_csr_req_if.valid = pick_core ? csr_core_req_if.valid : {`NUM_THREADS{csr_io_req_if.valid}};
|
assign issued_csr_req_if.valid = pick_core ? csr_core_req_if.valid : {`NUM_THREADS{csr_io_req_if.valid}};
|
||||||
assign issued_csr_req_if.is_csr = pick_core ? csr_core_req_if.is_csr : 1'b1;
|
assign issued_csr_req_if.is_csr = pick_core ? csr_core_req_if.is_csr : 1'b1;
|
||||||
assign issued_csr_req_if.alu_op = pick_core ? csr_core_req_if.alu_op : (csr_io_req_if.rw ? `ALU_CSR_RW : `ALU_CSR_RS);
|
assign issued_csr_req_if.alu_op = pick_core ? csr_core_req_if.alu_op : (csr_io_req_if.rw ? `ALU_CSR_RW : `ALU_CSR_RS);
|
||||||
assign issued_csr_req_if.csr_address = pick_core ? csr_core_req_if.csr_address : csr_io_req_if.addr;
|
assign issued_csr_req_if.csr_address = pick_core ? csr_core_req_if.csr_address : csr_io_req_if.addr;
|
||||||
|
assign issued_csr_req_if.csr_immed = pick_core ? csr_core_req_if.csr_immed : 0;
|
||||||
assign issued_csr_req_if.csr_mask = pick_core ? csr_core_req_if.csr_mask : (csr_io_req_if.rw ? csr_io_req_if.data : 32'b0);
|
assign issued_csr_req_if.csr_mask = pick_core ? csr_core_req_if.csr_mask : (csr_io_req_if.rw ? csr_io_req_if.data : 32'b0);
|
||||||
|
assign issued_csr_req_if.is_io = !pick_core;
|
||||||
assign csr_io_req_if.ready = !(csr_pipe_stall || pick_core);
|
|
||||||
|
|
||||||
// Core arguments
|
|
||||||
assign issued_csr_req_if.warp_num = csr_core_req_if.warp_num;
|
assign issued_csr_req_if.warp_num = csr_core_req_if.warp_num;
|
||||||
assign issued_csr_req_if.rd = csr_core_req_if.rd;
|
assign issued_csr_req_if.rd = csr_core_req_if.rd;
|
||||||
assign issued_csr_req_if.wb = csr_core_req_if.wb;
|
assign issued_csr_req_if.wb = csr_core_req_if.wb;
|
||||||
|
|
||||||
|
assign csr_io_req_if.ready = !(csr_pipe_stall || pick_core);
|
||||||
|
|
||||||
// Core Writeback
|
// Core Writeback
|
||||||
assign csr_wb_if.valid = csr_pipe_rsp_if.valid & {`NUM_THREADS{~csr_pipe_rsp_if.is_io}};
|
assign csr_wb_if.valid = csr_pipe_rsp_if.valid & {`NUM_THREADS{~csr_pipe_rsp_if.is_io}};
|
||||||
assign csr_wb_if.data = csr_pipe_rsp_if.data;
|
assign csr_wb_if.data = csr_pipe_rsp_if.data;
|
||||||
|
|||||||
@@ -38,7 +38,6 @@ module VX_csr_pipe #(
|
|||||||
.wb_valid (| writeback_if.valid)
|
.wb_valid (| writeback_if.valid)
|
||||||
);
|
);
|
||||||
|
|
||||||
// wire hazard = (csr_address_s2 == csr_req_if.csr_address) & (warp_num_s2 == csr_req_if.warp_num) & |(valid_s2) & is_csr_s2;
|
|
||||||
wire car_hazard = (csr_address_s2 == csr_req_if.csr_address) & (warp_num_s2 == csr_req_if.warp_num) & |(valid_s2) & is_csr_s2;
|
wire car_hazard = (csr_address_s2 == csr_req_if.csr_address) & (warp_num_s2 == csr_req_if.warp_num) & |(valid_s2) & is_csr_s2;
|
||||||
|
|
||||||
assign csr_read_data = car_hazard ? csr_updated_data_s2 : csr_read_data_unqual;
|
assign csr_read_data = car_hazard ? csr_updated_data_s2 : csr_read_data_unqual;
|
||||||
|
|||||||
@@ -104,7 +104,7 @@ module VX_decode(
|
|||||||
assign is_lui = (curr_opcode == `INST_LUI);
|
assign is_lui = (curr_opcode == `INST_LUI);
|
||||||
assign is_auipc = (curr_opcode == `INST_AUIPC);
|
assign is_auipc = (curr_opcode == `INST_AUIPC);
|
||||||
assign is_csr = (curr_opcode == `INST_SYS) && (func3 != 0);
|
assign is_csr = (curr_opcode == `INST_SYS) && (func3 != 0);
|
||||||
assign is_csr_immed = (is_csr) && (func3[2] == 1);
|
assign is_csr_immed = is_csr && (func3[2] == 1);
|
||||||
|
|
||||||
assign is_gpgpu = (curr_opcode == `INST_GPGPU);
|
assign is_gpgpu = (curr_opcode == `INST_GPGPU);
|
||||||
|
|
||||||
|
|||||||
@@ -30,7 +30,6 @@ module VX_gpr_stage (
|
|||||||
wire is_jal = bckE_req_if.is_jal;
|
wire is_jal = bckE_req_if.is_jal;
|
||||||
`DEBUG_END
|
`DEBUG_END
|
||||||
|
|
||||||
|
|
||||||
assign csr_req_if.is_io = 1'b0; // GPR only issues csr requests coming from core
|
assign csr_req_if.is_io = 1'b0; // GPR only issues csr requests coming from core
|
||||||
|
|
||||||
VX_gpr_read_if gpr_read_if();
|
VX_gpr_read_if gpr_read_if();
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ module VX_divide #(
|
|||||||
quartus_div.lpm_widthd = WIDTHD,
|
quartus_div.lpm_widthd = WIDTHD,
|
||||||
quartus_div.lpm_nrepresentation = NSIGNED ? "SIGNED" : "UNSIGNED",
|
quartus_div.lpm_nrepresentation = NSIGNED ? "SIGNED" : "UNSIGNED",
|
||||||
quartus_div.lpm_drepresentation = DSIGNED ? "SIGNED" : "UNSIGNED",
|
quartus_div.lpm_drepresentation = DSIGNED ? "SIGNED" : "UNSIGNED",
|
||||||
quartus_div.lpm_hint = "LPM_REMAINDERPOSITIVE=FALSE,MAXIMIZE_SPEED=9",
|
quartus_div.lpm_hint = "MAXIMIZE_SPEED=6,LPM_REMAINDERPOSITIVE=FALSE",
|
||||||
quartus_div.lpm_pipeline = PIPELINE;
|
quartus_div.lpm_pipeline = PIPELINE;
|
||||||
|
|
||||||
`else
|
`else
|
||||||
|
|||||||
@@ -23,9 +23,9 @@ module VX_mult #(
|
|||||||
.dataa (dataa),
|
.dataa (dataa),
|
||||||
.datab (datab),
|
.datab (datab),
|
||||||
.result (result),
|
.result (result),
|
||||||
|
.sclr (reset),
|
||||||
.aclr (1'b0),
|
.aclr (1'b0),
|
||||||
.clken (1'b1),
|
.clken (1'b1),
|
||||||
.sclr (1'b0),
|
|
||||||
.sum (1'b0)
|
.sum (1'b0)
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -35,7 +35,7 @@ module VX_mult #(
|
|||||||
quartus_mult.lpm_widthp = WIDTHP,
|
quartus_mult.lpm_widthp = WIDTHP,
|
||||||
quartus_mult.lpm_representation = SIGNED ? "SIGNED" : "UNSIGNED",
|
quartus_mult.lpm_representation = SIGNED ? "SIGNED" : "UNSIGNED",
|
||||||
quartus_mult.lpm_pipeline = PIPELINE,
|
quartus_mult.lpm_pipeline = PIPELINE,
|
||||||
quartus_mult.lpm_hint = "MAXIMIZE_SPEED=9";
|
quartus_mult.lpm_hint = "DEDICATED_MULTIPLIER_CIRCUITRY=YES,MAXIMIZE_SPEED=9";
|
||||||
`else
|
`else
|
||||||
|
|
||||||
wire [WIDTHP-1:0] result_unqual;
|
wire [WIDTHP-1:0] result_unqual;
|
||||||
|
|||||||
Reference in New Issue
Block a user