profiling timing optimization

minor update

minor update

minor update
This commit is contained in:
Blaise Tine
2023-12-13 18:04:12 -08:00
parent f5f9e3dfdb
commit c6845a4c8d
11 changed files with 64 additions and 56 deletions

View File

@@ -194,10 +194,14 @@
`ifndef FPU_FPNEW `ifndef FPU_FPNEW
`ifndef FPU_DSP `ifndef FPU_DSP
`ifndef FPU_DPI `ifndef FPU_DPI
`ifdef SYNTHESIS `ifndef SYNTHESIS
`define FPU_DSP `ifndef DPI_DISABLE
`else
`define FPU_DPI `define FPU_DPI
`else
`define FPU_DSP
`endif
`else
`define FPU_DSP
`endif `endif
`endif `endif
`endif `endif

View File

@@ -181,16 +181,15 @@ module Vortex import VX_gpu_pkg::*; (
end end
end end
wire mem_rd_req_fire = mem_req_fire && ~mem_bus_if.req_data.rw;
wire mem_wr_req_fire = mem_req_fire && mem_bus_if.req_data.rw;
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
mem_perf <= '0; mem_perf <= '0;
end else begin end else begin
if (mem_req_fire && ~mem_bus_if.req_data.rw) begin mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(mem_rd_req_fire);
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(1); mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(mem_wr_req_fire);
end
if (mem_req_fire && mem_bus_if.req_data.rw) begin
mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(1);
end
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads; mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
end end
end end

View File

@@ -530,8 +530,8 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle; wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle; wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
wire [NUM_REQS-1:0] perf_core_reads_per_req = core_req_valid & core_req_ready & ~core_req_rw; wire [NUM_REQS-1:0] perf_core_reads_per_req;
wire [NUM_REQS-1:0] perf_core_writes_per_req = core_req_valid & core_req_ready & core_req_rw; wire [NUM_REQS-1:0] perf_core_writes_per_req;
// per cycle: read misses, write misses, msrq stalls, pipeline stalls // per cycle: read misses, write misses, msrq stalls, pipeline stalls
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle; wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle;
@@ -539,6 +539,9 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle; wire [`CLOG2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle; wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
`BUFFER(perf_core_reads_per_req, core_req_valid & core_req_ready & ~core_req_rw);
`BUFFER(perf_core_writes_per_req, core_req_valid & core_req_ready & core_req_rw);
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req); `POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req);
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req); `POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req);
`POP_COUNT(perf_read_miss_per_cycle, perf_read_miss_per_bank); `POP_COUNT(perf_read_miss_per_cycle, perf_read_miss_per_bank);
@@ -562,12 +565,6 @@ module VX_cache import VX_gpu_pkg::*; #(
reg [`PERF_CTR_BITS-1:0] perf_mem_stalls; reg [`PERF_CTR_BITS-1:0] perf_mem_stalls;
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls; reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle_r;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle_r;
`BUFFER(perf_core_reads_per_cycle_r, perf_core_reads_per_cycle);
`BUFFER(perf_core_writes_per_cycle_r, perf_core_writes_per_cycle);
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
perf_core_reads <= '0; perf_core_reads <= '0;
@@ -578,8 +575,8 @@ module VX_cache import VX_gpu_pkg::*; #(
perf_mem_stalls <= '0; perf_mem_stalls <= '0;
perf_crsp_stalls <= '0; perf_crsp_stalls <= '0;
end else begin end else begin
perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle_r); perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle_r); perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
perf_read_misses <= perf_read_misses + `PERF_CTR_BITS'(perf_read_miss_per_cycle); perf_read_misses <= perf_read_misses + `PERF_CTR_BITS'(perf_read_miss_per_cycle);
perf_write_misses <= perf_write_misses + `PERF_CTR_BITS'(perf_write_miss_per_cycle); perf_write_misses <= perf_write_misses + `PERF_CTR_BITS'(perf_write_miss_per_cycle);
perf_mshr_stalls <= perf_mshr_stalls + `PERF_CTR_BITS'(perf_mshr_stall_per_cycle); perf_mshr_stalls <= perf_mshr_stalls + `PERF_CTR_BITS'(perf_mshr_stall_per_cycle);

View File

@@ -266,8 +266,8 @@ module VX_core import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_per_cycle_r; wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_per_cycle_r; wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle; wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
wire [1:0] perf_icache_pending_read_cycle; wire [1:0] perf_icache_pending_read_cycle;
@@ -283,7 +283,9 @@ module VX_core import VX_gpu_pkg::*; #(
wire perf_icache_req_fire = icache_bus_if.req_valid & icache_bus_if.req_ready; wire perf_icache_req_fire = icache_bus_if.req_valid & icache_bus_if.req_ready;
wire perf_icache_rsp_fire = icache_bus_if.rsp_valid & icache_bus_if.rsp_ready; wire perf_icache_rsp_fire = icache_bus_if.rsp_valid & icache_bus_if.rsp_ready;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_wr_req_fire, perf_dcache_rsp_fire; wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire;
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && ~dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready; assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && ~dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready;
@@ -291,15 +293,15 @@ module VX_core import VX_gpu_pkg::*; #(
assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready; assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
end end
`POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire); `BUFFER(perf_dcache_rd_req_fire_r, perf_dcache_rd_req_fire);
`POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire); `BUFFER(perf_dcache_wr_req_fire_r, perf_dcache_wr_req_fire);
`POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire_r);
`POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire_r);
`POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire); `POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire);
`BUFFER(perf_dcache_rd_req_per_cycle_r, perf_dcache_rd_req_per_cycle);
`BUFFER(perf_dcache_wr_req_per_cycle_r, perf_dcache_wr_req_per_cycle);
assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire; assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire;
assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle_r - perf_dcache_rsp_per_cycle; assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle - perf_dcache_rsp_per_cycle;
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
@@ -323,8 +325,8 @@ module VX_core import VX_gpu_pkg::*; #(
perf_dcache_lat <= '0; perf_dcache_lat <= '0;
end else begin end else begin
perf_ifetches <= perf_ifetches + `PERF_CTR_BITS'(perf_icache_req_fire); perf_ifetches <= perf_ifetches + `PERF_CTR_BITS'(perf_icache_req_fire);
perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle_r); perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle);
perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle_r); perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle);
perf_icache_lat <= perf_icache_lat + perf_icache_pending_reads; perf_icache_lat <= perf_icache_lat + perf_icache_pending_reads;
perf_dcache_lat <= perf_dcache_lat + perf_dcache_pending_reads; perf_dcache_lat <= perf_dcache_lat + perf_dcache_pending_reads;
end end

View File

@@ -156,13 +156,14 @@ module VX_issue #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls; reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
wire decode_stall = decode_if.valid && ~decode_if.ready;
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
perf_ibf_stalls <= '0; perf_ibf_stalls <= '0;
end else begin end else begin
if (decode_if.valid && ~decode_if.ready) begin perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(decode_stall);
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(1);
end
end end
end end

View File

@@ -383,13 +383,16 @@ module VX_schedule import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_sched_stalls; reg [`PERF_CTR_BITS-1:0] perf_sched_stalls;
reg [`PERF_CTR_BITS-1:0] perf_fetch_stalls; reg [`PERF_CTR_BITS-1:0] perf_fetch_stalls;
wire schedule_stall = schedule_if.valid && ~schedule_if.ready;
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
perf_sched_stalls <= '0; perf_sched_stalls <= '0;
perf_fetch_stalls <= '0; perf_fetch_stalls <= '0;
end else begin end else begin
perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(!schedule_valid); perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(~schedule_valid);
perf_fetch_stalls <= perf_fetch_stalls + `PERF_CTR_BITS'(schedule_if.valid && !schedule_if.ready); perf_fetch_stalls <= perf_fetch_stalls + `PERF_CTR_BITS'(schedule_stall);
end end
end end

View File

@@ -196,11 +196,14 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_wctl_stalls; reg [`PERF_CTR_BITS-1:0] perf_wctl_stalls;
wire wctl_execute_stall = wctl_execute_if.valid && ~wctl_execute_if.ready;
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
perf_wctl_stalls <= '0; perf_wctl_stalls <= '0;
end else begin end else begin
perf_wctl_stalls <= perf_wctl_stalls + `PERF_CTR_BITS'(wctl_execute_if.valid && ~wctl_execute_if.ready); perf_wctl_stalls <= perf_wctl_stalls + `PERF_CTR_BITS'(wctl_execute_stall);
end end
end end
assign sfu_perf_if.wctl_stalls = perf_wctl_stalls; assign sfu_perf_if.wctl_stalls = perf_wctl_stalls;

View File

@@ -49,7 +49,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
wire is_join = (execute_if.data.op_type == `INST_SFU_JOIN); wire is_join = (execute_if.data.op_type == `INST_SFU_JOIN);
wire is_bar = (execute_if.data.op_type == `INST_SFU_BAR); wire is_bar = (execute_if.data.op_type == `INST_SFU_BAR);
wire [LANE_BITS-1:0] tid; wire [`UP(LANE_BITS)-1:0] tid;
if (LANE_BITS != 0) begin if (LANE_BITS != 0) begin
assign tid = execute_if.data.tid[0 +: LANE_BITS]; assign tid = execute_if.data.tid[0 +: LANE_BITS];
end else begin end else begin

View File

@@ -176,8 +176,9 @@ module VX_stream_xbar #(
// we have a collision when there exists a valid transfer with multiple input candidates // we have a collision when there exists a valid transfer with multiple input candidates
// we count the unique duplicates each cycle. // we count the unique duplicates each cycle.
reg [NUM_INPUTS-1:0] per_cycle_collision, per_cycle_collision_r;
wire [`CLOG2(NUM_INPUTS+1)-1:0] collision_count;
reg [PERF_CTR_BITS-1:0] collisions_r; reg [PERF_CTR_BITS-1:0] collisions_r;
reg [NUM_INPUTS-1:0] per_cycle_collision;
always @(*) begin always @(*) begin
per_cycle_collision = 0; per_cycle_collision = 0;
@@ -191,15 +192,14 @@ module VX_stream_xbar #(
end end
end end
wire [`CLOG2(NUM_INPUTS+1)-1:0] collision_count, collision_count_r; `BUFFER(per_cycle_collision_r, per_cycle_collision);
`POP_COUNT(collision_count, per_cycle_collision); `POP_COUNT(collision_count, per_cycle_collision_r);
`BUFFER(collision_count_r, collision_count);
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
collisions_r <= '0; collisions_r <= '0;
end else begin end else begin
collisions_r <= collisions_r + PERF_CTR_BITS'(collision_count_r); collisions_r <= collisions_r + PERF_CTR_BITS'(collision_count);
end end
end end

View File

@@ -229,14 +229,16 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
// per cycle: reads, writes // per cycle: reads, writes
wire [`CLOG2(NUM_REQS+1)-1:0] perf_reads_per_cycle, perf_reads_per_cycle_r; wire [`CLOG2(NUM_REQS+1)-1:0] perf_reads_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_writes_per_cycle, perf_writes_per_cycle_r; wire [`CLOG2(NUM_REQS+1)-1:0] perf_writes_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle; wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
wire [NUM_REQS-1:0] perf_reads_per_req = req_valid & req_ready & ~req_rw; wire [NUM_REQS-1:0] perf_reads_per_req, perf_writes_per_req;
wire [NUM_REQS-1:0] perf_writes_per_req = req_valid & req_ready & req_rw;
wire [NUM_REQS-1:0] perf_crsp_stall_per_req = rsp_valid & ~rsp_ready; wire [NUM_REQS-1:0] perf_crsp_stall_per_req = rsp_valid & ~rsp_ready;
`BUFFER(perf_reads_per_req, req_valid & req_ready & ~req_rw);
`BUFFER(perf_writes_per_req, req_valid & req_ready & req_rw);
`POP_COUNT(perf_reads_per_cycle, perf_reads_per_req); `POP_COUNT(perf_reads_per_cycle, perf_reads_per_req);
`POP_COUNT(perf_writes_per_cycle, perf_writes_per_req); `POP_COUNT(perf_writes_per_cycle, perf_writes_per_req);
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req); `POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);
@@ -245,17 +247,14 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
reg [`PERF_CTR_BITS-1:0] perf_writes; reg [`PERF_CTR_BITS-1:0] perf_writes;
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls; reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
`BUFFER(perf_reads_per_cycle_r, perf_reads_per_cycle);
`BUFFER(perf_writes_per_cycle_r, perf_writes_per_cycle);
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
perf_reads <= '0; perf_reads <= '0;
perf_writes <= '0; perf_writes <= '0;
perf_crsp_stalls <= '0; perf_crsp_stalls <= '0;
end else begin end else begin
perf_reads <= perf_reads + `PERF_CTR_BITS'(perf_reads_per_cycle_r); perf_reads <= perf_reads + `PERF_CTR_BITS'(perf_reads_per_cycle);
perf_writes <= perf_writes + `PERF_CTR_BITS'(perf_writes_per_cycle_r); perf_writes <= perf_writes + `PERF_CTR_BITS'(perf_writes_per_cycle);
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle); perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
end end
end end

View File

@@ -38,9 +38,9 @@ run-simx:
$(MAKE) -C transpose run-simx $(MAKE) -C transpose run-simx
$(MAKE) -C convolution run-simx $(MAKE) -C convolution run-simx
$(MAKE) -C cutcp run-simx $(MAKE) -C cutcp run-simx
$(MAKE) -C sgemm2 run-simx
$(MAKE) -C vectorhypot run-simx $(MAKE) -C vectorhypot run-simx
$(MAKE) -C mri-q run-simx $(MAKE) -C mri-q run-simx
# $(MAKE) -C sgemm2 run-simx
run-rtlsim: run-rtlsim:
$(MAKE) -C vecadd run-rtlsim $(MAKE) -C vecadd run-rtlsim