Add tracking for SFU stalls

2023-12-28 12:12:11 -08:00
parent c7a81d1493
commit e217bc2c23
27 changed files with 1266 additions and 1166 deletions
--- a/runtime/common/utils.cpp
+++ b/runtime/common/utils.cpp
@@ -208,6 +208,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
  uint64_t scrb_fpu = 0;
  uint64_t scrb_lsu = 0;
  uint64_t scrb_sfu = 0;
+  uint64_t scrb_wctl = 0;
+  uint64_t scrb_csrs = 0;
  uint64_t ifetches = 0;
  uint64_t loads = 0;
  uint64_t stores = 0;
@@ -268,44 +270,69 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
      // PERF: pipeline    
      // scheduler idles
      {
-        uint64_t sched_idles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ID);
-        int idles_percent_per_core = calcAvgPercent(sched_idles_per_core, cycles_per_core);
-        if (num_cores > 1) fprintf(stream, "PERF: core%d: scheduler idles=%ld (%d%%)\n", core_id, sched_idles_per_core, idles_percent_per_core);
+        uint64_t sched_idles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ID);        
+        if (num_cores > 1) {
+          int idles_percent_per_core = calcAvgPercent(sched_idles_per_core, cycles_per_core);
+          fprintf(stream, "PERF: core%d: scheduler idle=%ld (%d%%)\n", core_id, sched_idles_per_core, idles_percent_per_core);
+        }
        sched_idles += sched_idles_per_core;
      }
      // scheduler stalls
      {
-        uint64_t sched_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ST);
-        int stalls_percent_per_core = calcAvgPercent(sched_stalls_per_core, cycles_per_core);
-        if (num_cores > 1) fprintf(stream, "PERF: core%d: scheduler stalls=%ld (%d%%)\n", core_id, sched_stalls_per_core, stalls_percent_per_core);
+        uint64_t sched_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ST);        
+        if (num_cores > 1) {
+          int stalls_percent_per_core = calcAvgPercent(sched_stalls_per_core, cycles_per_core);
+          fprintf(stream, "PERF: core%d: scheduler stalls=%ld (%d%%)\n", core_id, sched_stalls_per_core, stalls_percent_per_core);
+        }
        sched_stalls += sched_stalls_per_core;
      }
      // ibuffer_stalls
      {
-        uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST);
-        int ibuffer_percent_per_core = calcAvgPercent(ibuffer_stalls_per_core, cycles_per_core);
-        if (num_cores > 1) fprintf(stream, "PERF: core%d: ibuffer stalls=%ld (%d%%)\n", core_id, ibuffer_stalls_per_core, ibuffer_percent_per_core);
+        uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST);        
+        if (num_cores > 1) {
+          int ibuffer_percent_per_core = calcAvgPercent(ibuffer_stalls_per_core, cycles_per_core);
+          fprintf(stream, "PERF: core%d: ibuffer stalls=%ld (%d%%)\n", core_id, ibuffer_stalls_per_core, ibuffer_percent_per_core);
+        }
        ibuffer_stalls += ibuffer_stalls_per_core;
      }
-      // scrb_stalls
+      // issue_stalls
      {
        uint64_t scrb_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ST);
        uint64_t scrb_alu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ALU);
        uint64_t scrb_fpu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_FPU);
        uint64_t scrb_lsu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_LSU);
-        uint64_t scrb_sfu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU);
-        uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_sfu_per_core;
+        uint64_t scrb_sfu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU);        
        scrb_alu += scrb_alu_per_core;
        scrb_fpu += scrb_fpu_per_core;
        scrb_lsu += scrb_lsu_per_core;
        scrb_sfu += scrb_sfu_per_core;      
-        if (num_cores > 1) fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core, 
+        if (num_cores > 1) {
+          uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_sfu_per_core;
+          fprintf(stream, "PERF: core%d: issue stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core, 
          calcAvgPercent(scrb_alu_per_core, scrb_total), 
          calcAvgPercent(scrb_fpu_per_core, scrb_total),
          calcAvgPercent(scrb_lsu_per_core, scrb_total),
          calcAvgPercent(scrb_sfu_per_core, scrb_total));
+        }
        scrb_stalls += scrb_stalls_per_core;
      }
+      // sfu_stalls
+      {
+        uint64_t scrb_sfu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU);  
+        uint64_t scrb_wctl_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_WCTL);
+        uint64_t scrb_csrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_CSRS);
+        if (num_cores > 1) {
+          uint64_t sfu_total = scrb_wctl_per_core + scrb_csrs_per_core + scrb_tex_per_core + scrb_raster_per_core + scrb_om_per_core;
+          fprintf(stream, "PERF: core%d: sfu stalls=%ld (scrs=%d%%, wctl=%d%%)\n"
+            , core_id
+            , scrb_sfu_per_core            
+            , calcAvgPercent(scrb_csrs_per_core, sfu_total)
+            , calcAvgPercent(scrb_wctl_per_core, sfu_total)
+          );
+        }
+        scrb_wctl += scrb_wctl_per_core;
+        scrb_csrs += scrb_csrs_per_core;
+      }
      // PERF: memory
      // ifetches
      {
@@ -313,9 +340,11 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
        if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
        ifetches += ifetches_per_core;

-        uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LT);
-        int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core);
-        if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);      
+        uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LT);        
+        if (num_cores > 1) {
+          int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core);
+          fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
+        }
        ifetch_lat += ifetch_lat_per_core;
      }
      // loads
@@ -324,9 +353,11 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
        if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
        loads += loads_per_core;

-        uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LT);
-        int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core);
-        if (num_cores > 1) fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
+        uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LT);        
+        if (num_cores > 1) {
+          int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core);
+          fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
+        }
        load_lat += load_lat_per_core;
      }
      // stores
@@ -428,14 +459,20 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
    int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches));
    int load_avg_lat = (int)(double(load_lat) / double(loads));
    uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_sfu;
-    fprintf(stream, "PERF: scheduler idles=%ld (%d%%)\n", sched_idles, sched_idles_percent);
+    uint64_t sfu_total = scrb_wctl + scrb_csrs;
+    fprintf(stream, "PERF: scheduler idle=%ld (%d%%)\n", sched_idles, sched_idles_percent);
    fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent);
    fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
-    fprintf(stream, "PERF: scoreboard stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls,
+    fprintf(stream, "PERF: issue stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls,
      calcAvgPercent(scrb_alu, scrb_total), 
      calcAvgPercent(scrb_fpu, scrb_total),
      calcAvgPercent(scrb_lsu, scrb_total),
-      calcAvgPercent(scrb_sfu, scrb_total));
+      calcAvgPercent(scrb_sfu, scrb_total));    
+    fprintf(stream, "PERF: sfu stalls=%ld (scrs=%d%%, wctl=%d%%)\n"
+      , scrb_sfu      
+      , calcAvgPercent(scrb_csrs, sfu_total)
+      , calcAvgPercent(scrb_wctl, sfu_total)
+    );
    fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
    fprintf(stream, "PERF: loads=%ld\n", loads);
    fprintf(stream, "PERF: stores=%ld\n", stores);