diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index d57de0c3..3af544c6 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -194,10 +194,14 @@ `ifndef FPU_FPNEW `ifndef FPU_DSP `ifndef FPU_DPI -`ifdef SYNTHESIS -`define FPU_DSP -`else +`ifndef SYNTHESIS +`ifndef DPI_DISABLE `define FPU_DPI +`else +`define FPU_DSP +`endif +`else +`define FPU_DSP `endif `endif `endif diff --git a/hw/rtl/Vortex.sv b/hw/rtl/Vortex.sv index 594204e4..b29f0802 100644 --- a/hw/rtl/Vortex.sv +++ b/hw/rtl/Vortex.sv @@ -181,16 +181,15 @@ module Vortex import VX_gpu_pkg::*; ( end end + wire mem_rd_req_fire = mem_req_fire && ~mem_bus_if.req_data.rw; + wire mem_wr_req_fire = mem_req_fire && mem_bus_if.req_data.rw; + always @(posedge clk) begin if (reset) begin mem_perf <= '0; - end else begin - if (mem_req_fire && ~mem_bus_if.req_data.rw) begin - mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(1); - end - if (mem_req_fire && mem_bus_if.req_data.rw) begin - mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(1); - end + end else begin + mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(mem_rd_req_fire); + mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(mem_wr_req_fire); mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads; end end diff --git a/hw/rtl/cache/VX_cache.sv b/hw/rtl/cache/VX_cache.sv index 65efc607..891512da 100644 --- a/hw/rtl/cache/VX_cache.sv +++ b/hw/rtl/cache/VX_cache.sv @@ -530,14 +530,17 @@ module VX_cache import VX_gpu_pkg::*; #( wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle; wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle; - wire [NUM_REQS-1:0] perf_core_reads_per_req = core_req_valid & core_req_ready & ~core_req_rw; - wire [NUM_REQS-1:0] perf_core_writes_per_req = core_req_valid & core_req_ready & core_req_rw; + wire [NUM_REQS-1:0] perf_core_reads_per_req; + wire [NUM_REQS-1:0] perf_core_writes_per_req; // per cycle: read misses, write misses, msrq stalls, pipeline stalls wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle; wire [`CLOG2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle; wire [`CLOG2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle; wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle; + + `BUFFER(perf_core_reads_per_req, core_req_valid & core_req_ready & ~core_req_rw); + `BUFFER(perf_core_writes_per_req, core_req_valid & core_req_ready & core_req_rw); `POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req); `POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req); @@ -560,13 +563,7 @@ module VX_cache import VX_gpu_pkg::*; #( reg [`PERF_CTR_BITS-1:0] perf_write_misses; reg [`PERF_CTR_BITS-1:0] perf_mshr_stalls; reg [`PERF_CTR_BITS-1:0] perf_mem_stalls; - reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls; - - wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle_r; - wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle_r; - - `BUFFER(perf_core_reads_per_cycle_r, perf_core_reads_per_cycle); - `BUFFER(perf_core_writes_per_cycle_r, perf_core_writes_per_cycle); + reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls; always @(posedge clk) begin if (reset) begin @@ -578,8 +575,8 @@ module VX_cache import VX_gpu_pkg::*; #( perf_mem_stalls <= '0; perf_crsp_stalls <= '0; end else begin - perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle_r); - perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle_r); + perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle); + perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle); perf_read_misses <= perf_read_misses + `PERF_CTR_BITS'(perf_read_miss_per_cycle); perf_write_misses <= perf_write_misses + `PERF_CTR_BITS'(perf_write_miss_per_cycle); perf_mshr_stalls <= perf_mshr_stalls + `PERF_CTR_BITS'(perf_mshr_stall_per_cycle); diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index 1776024f..5aba3075 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -266,8 +266,8 @@ module VX_core import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE - wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_per_cycle_r; - wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_per_cycle_r; + wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle; + wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle; wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle; wire [1:0] perf_icache_pending_read_cycle; @@ -283,7 +283,9 @@ module VX_core import VX_gpu_pkg::*; #( wire perf_icache_req_fire = icache_bus_if.req_valid & icache_bus_if.req_ready; wire perf_icache_rsp_fire = icache_bus_if.rsp_valid & icache_bus_if.rsp_ready; - wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_wr_req_fire, perf_dcache_rsp_fire; + wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r; + wire [DCACHE_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r; + wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire; for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && ~dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready; @@ -291,15 +293,15 @@ module VX_core import VX_gpu_pkg::*; #( assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready; end - `POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire); - `POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire); - `POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire); + `BUFFER(perf_dcache_rd_req_fire_r, perf_dcache_rd_req_fire); + `BUFFER(perf_dcache_wr_req_fire_r, perf_dcache_wr_req_fire); - `BUFFER(perf_dcache_rd_req_per_cycle_r, perf_dcache_rd_req_per_cycle); - `BUFFER(perf_dcache_wr_req_per_cycle_r, perf_dcache_wr_req_per_cycle); + `POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire_r); + `POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire_r); + `POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire); assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire; - assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle_r - perf_dcache_rsp_per_cycle; + assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle - perf_dcache_rsp_per_cycle; always @(posedge clk) begin if (reset) begin @@ -323,8 +325,8 @@ module VX_core import VX_gpu_pkg::*; #( perf_dcache_lat <= '0; end else begin perf_ifetches <= perf_ifetches + `PERF_CTR_BITS'(perf_icache_req_fire); - perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle_r); - perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle_r); + perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle); + perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle); perf_icache_lat <= perf_icache_lat + perf_icache_pending_reads; perf_dcache_lat <= perf_dcache_lat + perf_dcache_pending_reads; end diff --git a/hw/rtl/core/VX_issue.sv b/hw/rtl/core/VX_issue.sv index 53701cc8..8d0eaff6 100644 --- a/hw/rtl/core/VX_issue.sv +++ b/hw/rtl/core/VX_issue.sv @@ -156,13 +156,14 @@ module VX_issue #( `ifdef PERF_ENABLE reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls; + + wire decode_stall = decode_if.valid && ~decode_if.ready; + always @(posedge clk) begin if (reset) begin perf_ibf_stalls <= '0; end else begin - if (decode_if.valid && ~decode_if.ready) begin - perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(1); - end + perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(decode_stall); end end diff --git a/hw/rtl/core/VX_schedule.sv b/hw/rtl/core/VX_schedule.sv index 9008f605..f11e4324 100644 --- a/hw/rtl/core/VX_schedule.sv +++ b/hw/rtl/core/VX_schedule.sv @@ -383,13 +383,16 @@ module VX_schedule import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE reg [`PERF_CTR_BITS-1:0] perf_sched_stalls; reg [`PERF_CTR_BITS-1:0] perf_fetch_stalls; + + wire schedule_stall = schedule_if.valid && ~schedule_if.ready; + always @(posedge clk) begin if (reset) begin perf_sched_stalls <= '0; perf_fetch_stalls <= '0; end else begin - perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(!schedule_valid); - perf_fetch_stalls <= perf_fetch_stalls + `PERF_CTR_BITS'(schedule_if.valid && !schedule_if.ready); + perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(~schedule_valid); + perf_fetch_stalls <= perf_fetch_stalls + `PERF_CTR_BITS'(schedule_stall); end end diff --git a/hw/rtl/core/VX_sfu_unit.sv b/hw/rtl/core/VX_sfu_unit.sv index fd5dd59f..b531e75b 100644 --- a/hw/rtl/core/VX_sfu_unit.sv +++ b/hw/rtl/core/VX_sfu_unit.sv @@ -196,11 +196,14 @@ module VX_sfu_unit import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE reg [`PERF_CTR_BITS-1:0] perf_wctl_stalls; + + wire wctl_execute_stall = wctl_execute_if.valid && ~wctl_execute_if.ready; + always @(posedge clk) begin if (reset) begin perf_wctl_stalls <= '0; end else begin - perf_wctl_stalls <= perf_wctl_stalls + `PERF_CTR_BITS'(wctl_execute_if.valid && ~wctl_execute_if.ready); + perf_wctl_stalls <= perf_wctl_stalls + `PERF_CTR_BITS'(wctl_execute_stall); end end assign sfu_perf_if.wctl_stalls = perf_wctl_stalls; diff --git a/hw/rtl/core/VX_wctl_unit.sv b/hw/rtl/core/VX_wctl_unit.sv index 35503add..88b2f71e 100644 --- a/hw/rtl/core/VX_wctl_unit.sv +++ b/hw/rtl/core/VX_wctl_unit.sv @@ -49,7 +49,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #( wire is_join = (execute_if.data.op_type == `INST_SFU_JOIN); wire is_bar = (execute_if.data.op_type == `INST_SFU_BAR); - wire [LANE_BITS-1:0] tid; + wire [`UP(LANE_BITS)-1:0] tid; if (LANE_BITS != 0) begin assign tid = execute_if.data.tid[0 +: LANE_BITS]; end else begin diff --git a/hw/rtl/libs/VX_stream_xbar.sv b/hw/rtl/libs/VX_stream_xbar.sv index fbae5c7a..2a8e4bb4 100644 --- a/hw/rtl/libs/VX_stream_xbar.sv +++ b/hw/rtl/libs/VX_stream_xbar.sv @@ -176,8 +176,9 @@ module VX_stream_xbar #( // we have a collision when there exists a valid transfer with multiple input candicates // we count the unique duplicates each cycle. + reg [NUM_INPUTS-1:0] per_cycle_collision, per_cycle_collision_r; + wire [`CLOG2(NUM_INPUTS+1)-1:0] collision_count; reg [PERF_CTR_BITS-1:0] collisions_r; - reg [NUM_INPUTS-1:0] per_cycle_collision; always @(*) begin per_cycle_collision = 0; @@ -190,16 +191,15 @@ module VX_stream_xbar #( end end end - - wire [`CLOG2(NUM_INPUTS+1)-1:0] collision_count, collision_count_r; - `POP_COUNT(collision_count, per_cycle_collision); - `BUFFER(collision_count_r, collision_count); + + `BUFFER(per_cycle_collision_r, per_cycle_collision); + `POP_COUNT(collision_count, per_cycle_collision_r); always @(posedge clk) begin if (reset) begin collisions_r <= '0; end else begin - collisions_r <= collisions_r + PERF_CTR_BITS'(collision_count_r); + collisions_r <= collisions_r + PERF_CTR_BITS'(collision_count); end end diff --git a/hw/rtl/mem/VX_shared_mem.sv b/hw/rtl/mem/VX_shared_mem.sv index 0f1f4171..97082468 100644 --- a/hw/rtl/mem/VX_shared_mem.sv +++ b/hw/rtl/mem/VX_shared_mem.sv @@ -229,14 +229,16 @@ module VX_shared_mem import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE // per cycle: reads, writes - wire [`CLOG2(NUM_REQS+1)-1:0] perf_reads_per_cycle, perf_reads_per_cycle_r; - wire [`CLOG2(NUM_REQS+1)-1:0] perf_writes_per_cycle, perf_writes_per_cycle_r; + wire [`CLOG2(NUM_REQS+1)-1:0] perf_reads_per_cycle; + wire [`CLOG2(NUM_REQS+1)-1:0] perf_writes_per_cycle; wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle; - wire [NUM_REQS-1:0] perf_reads_per_req = req_valid & req_ready & ~req_rw; - wire [NUM_REQS-1:0] perf_writes_per_req = req_valid & req_ready & req_rw; + wire [NUM_REQS-1:0] perf_reads_per_req, perf_writes_per_req; wire [NUM_REQS-1:0] perf_crsp_stall_per_req = rsp_valid & ~rsp_ready; + `BUFFER(perf_reads_per_req, req_valid & req_ready & ~req_rw); + `BUFFER(perf_writes_per_req, req_valid & req_ready & req_rw); + `POP_COUNT(perf_reads_per_cycle, perf_reads_per_req); `POP_COUNT(perf_writes_per_cycle, perf_writes_per_req); `POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req); @@ -245,17 +247,14 @@ module VX_shared_mem import VX_gpu_pkg::*; #( reg [`PERF_CTR_BITS-1:0] perf_writes; reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls; - `BUFFER(perf_reads_per_cycle_r, perf_reads_per_cycle); - `BUFFER(perf_writes_per_cycle_r, perf_writes_per_cycle); - always @(posedge clk) begin if (reset) begin perf_reads <= '0; perf_writes <= '0; perf_crsp_stalls <= '0; end else begin - perf_reads <= perf_reads + `PERF_CTR_BITS'(perf_reads_per_cycle_r); - perf_writes <= perf_writes + `PERF_CTR_BITS'(perf_writes_per_cycle_r); + perf_reads <= perf_reads + `PERF_CTR_BITS'(perf_reads_per_cycle); + perf_writes <= perf_writes + `PERF_CTR_BITS'(perf_writes_per_cycle); perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle); end end diff --git a/tests/opencl/Makefile b/tests/opencl/Makefile index 27ef6f38..c7ba1ed7 100644 --- a/tests/opencl/Makefile +++ b/tests/opencl/Makefile @@ -37,10 +37,10 @@ run-simx: $(MAKE) -C blackscholes run-simx $(MAKE) -C transpose run-simx $(MAKE) -C convolution run-simx - $(MAKE) -C cutcp run-simx - $(MAKE) -C sgemm2 run-simx + $(MAKE) -C cutcp run-simx $(MAKE) -C vectorhypot run-simx $(MAKE) -C mri-q run-simx +# $(MAKE) -C sgemm2 run-simx run-rtlsim: $(MAKE) -C vecadd run-rtlsim