性能剖析

2026-01-18 00:31:39 +08:00
parent 25c6fc04db
commit c86c2be5ff
26 changed files with 272 additions and 54 deletions
--- a/scripts/evaluation/world_model_interaction.py
+++ b/scripts/evaluation/world_model_interaction.py
@@ -56,21 +56,50 @@ class TimingRecord:
        }


-class ProfilerManager:
-    """Manages macro and micro-level profiling."""
-
-    def __init__(self, enabled: bool = False, output_dir: str = "./profile_output"):
-        self.enabled = enabled
-        self.output_dir = output_dir
-        self.macro_timings: Dict[str, List[float]] = {}
-        self.cuda_events: Dict[str, List[tuple]] = {}
-        self.memory_snapshots: List[Dict] = []
-        self.pytorch_profiler = None
-        self.current_iteration = 0
-        self.operator_stats: Dict[str, Dict] = {}
-
-        if enabled:
-            os.makedirs(output_dir, exist_ok=True)
+class ProfilerManager:
+    """Manages macro and micro-level profiling."""
+
+    def __init__(
+        self,
+        enabled: bool = False,
+        output_dir: str = "./profile_output",
+        profile_detail: str = "light",  # "light" or "full"; checked by _build_profiler_config
+    ):
+        self.enabled = enabled
+        self.output_dir = output_dir
+        self.profile_detail = profile_detail
+        self.macro_timings: Dict[str, List[float]] = {}
+        self.cuda_events: Dict[str, List[tuple]] = {}
+        self.memory_snapshots: List[Dict] = []
+        self.pytorch_profiler = None  # replaced by a torch.profiler.profile instance once one is built
+        self.current_iteration = 0
+        self.operator_stats: Dict[str, Dict] = {}  # keyed by operator name (evt.key) in _trace_handler
+        self.profiler_config = self._build_profiler_config(profile_detail)  # raises ValueError on an unknown level, even when disabled
+
+        if enabled:  # the output directory is only created when profiling is switched on
+            os.makedirs(output_dir, exist_ok=True)
+
+    def _build_profiler_config(self, profile_detail: str) -> Dict[str, Any]:
+        """Return the boolean profiler flags for "light" or "full"; raise ValueError for any other level."""
+        if profile_detail not in ("light", "full"):
+            raise ValueError(f"Unsupported profile_detail: {profile_detail}")
+        if profile_detail == "full":  # every detail flag switched on
+            return {
+                "record_shapes": True,
+                "profile_memory": True,
+                "with_stack": True,
+                "with_flops": True,
+                "with_modules": True,
+                "group_by_input_shape": True,  # consumed by prof.key_averages() in _trace_handler, not passed to torch.profiler.profile
+            }
+        return {  # "light": every detail flag switched off
+            "record_shapes": False,
+            "profile_memory": False,
+            "with_stack": False,
+            "with_flops": False,
+            "with_modules": False,
+            "group_by_input_shape": False,  # consumed by prof.key_averages() in _trace_handler, not passed to torch.profiler.profile
+        }

    @contextmanager
    def profile_section(self, name: str, sync_cuda: bool = True):
@@ -133,22 +162,22 @@ class ProfilerManager:
        if not self.enabled:
            return nullcontext()

-        self.pytorch_profiler = torch.profiler.profile(
-            activities=[
-                torch.profiler.ProfilerActivity.CPU,
-                torch.profiler.ProfilerActivity.CUDA,
-            ],
-            schedule=torch.profiler.schedule(
-                wait=wait, warmup=warmup, active=active, repeat=1
-            ),
-            on_trace_ready=self._trace_handler,
-            record_shapes=True,
-            profile_memory=True,
-            with_stack=True,
-            with_flops=True,
-            with_modules=True,
-        )
-        return self.pytorch_profiler
+        self.pytorch_profiler = torch.profiler.profile(
+            activities=[
+                torch.profiler.ProfilerActivity.CPU,
+                torch.profiler.ProfilerActivity.CUDA,
+            ],
+            schedule=torch.profiler.schedule(
+                wait=wait, warmup=warmup, active=active, repeat=1
+            ),
+            on_trace_ready=self._trace_handler,
+            record_shapes=self.profiler_config["record_shapes"],
+            profile_memory=self.profiler_config["profile_memory"],
+            with_stack=self.profiler_config["with_stack"],
+            with_flops=self.profiler_config["with_flops"],
+            with_modules=self.profiler_config["with_modules"],
+        )
+        return self.pytorch_profiler

    def _trace_handler(self, prof):
        """Handle profiler trace output."""
@@ -158,8 +187,10 @@ class ProfilerManager:
        )
        prof.export_chrome_trace(trace_path)

-        # Extract operator statistics
-        key_averages = prof.key_averages(group_by_input_shape=True)
+        # Extract operator statistics
+        key_averages = prof.key_averages(
+            group_by_input_shape=self.profiler_config["group_by_input_shape"]
+        )
        for evt in key_averages:
            op_name = evt.key
            if op_name not in self.operator_stats:
@@ -344,18 +375,22 @@ class ProfilerManager:
 # Global profiler instance
 _profiler: Optional[ProfilerManager] = None

-def get_profiler() -> ProfilerManager:
-    """Get the global profiler instance."""
-    global _profiler
-    if _profiler is None:
-        _profiler = ProfilerManager(enabled=False)
-    return _profiler
-
-def init_profiler(enabled: bool, output_dir: str) -> ProfilerManager:
-    """Initialize the global profiler."""
-    global _profiler
-    _profiler = ProfilerManager(enabled=enabled, output_dir=output_dir)
-    return _profiler
+def get_profiler() -> ProfilerManager:
+    """Return the global profiler, creating a disabled one on first use."""
+    global _profiler
+    if _profiler is None:
+        _profiler = ProfilerManager(enabled=False)  # other arguments take their defaults ("./profile_output", "light")
+    return _profiler
+
+def init_profiler(enabled: bool, output_dir: str, profile_detail: str = "light") -> ProfilerManager:
+    """Create the global profiler, replacing any existing one; profile_detail is "light" or "full"."""
+    global _profiler
+    _profiler = ProfilerManager(
+        enabled=enabled,
+        output_dir=output_dir,
+        profile_detail=profile_detail,  # defaults to "light" so existing two-argument callers keep working
+    )
+    return _profiler


 # ========== Original Functions ==========
@@ -1193,13 +1228,20 @@ def get_parser():
        default=None,
        help="Directory to save profiling results. Defaults to {savedir}/profile_output."
    )
-    parser.add_argument(
-        "--profile_iterations",
-        type=int,
-        default=3,
-        help="Number of iterations to run PyTorch profiler's active phase for operator-level analysis."
-    )
-    return parser
+    parser.add_argument(
+        "--profile_iterations",
+        type=int,
+        default=3,
+        help="Number of iterations to run PyTorch profiler's active phase for operator-level analysis."
+    )
+    parser.add_argument(
+        "--profile_detail",
+        type=str,
+        choices=["light", "full"],
+        default="light",
+        help="Profiling detail level. Use 'full' for shapes/stacks/memory/flops."
+    )
+    return parser


 if __name__ == '__main__':
@@ -1214,7 +1256,11 @@ if __name__ == '__main__':
    profile_output_dir = args.profile_output_dir
    if profile_output_dir is None:
        profile_output_dir = os.path.join(args.savedir, "profile_output")
-    init_profiler(enabled=args.profile, output_dir=profile_output_dir)
+    init_profiler(
+        enabled=args.profile,
+        output_dir=profile_output_dir,
+        profile_detail=args.profile_detail,
+    )

    rank, gpu_num = 0, 1
    run_inference(args, gpu_num, rank)
--- a/unitree_g1_pack_camera/case1/output/profile_output/profiling_report.txt
+++ b/unitree_g1_pack_camera/case1/output/profile_output/profiling_report.txt
@@ -0,0 +1,85 @@
+================================================================================
+PERFORMANCE PROFILING REPORT
+================================================================================
+
+----------------------------------------
+MACRO-LEVEL TIMING SUMMARY
+----------------------------------------
+Section                                     Count    Total(ms)      Avg(ms)   CUDA Avg(ms)
+--------------------------------------------------------------------------------------
+action_generation                              11    399707.47     36337.04       36336.85
+data_loading                                    1        52.85        52.85          52.88
+get_latent_z/encode                            22       901.39        40.97          41.01
+iteration_total                                11    836793.23     76072.11       76071.63
+load_transitions                                1         2.24         2.24           2.28
+model_loading/checkpoint                        1     11833.31     11833.31       11833.43
+model_loading/config                            1     49774.19     49774.19       49774.16
+model_to_cuda                                   1      8909.30      8909.30        8909.33
+prepare_init_input                              1        10.52        10.52          10.55
+prepare_observation                            11         5.41         0.49           0.53
+prepare_wm_observation                         11         2.12         0.19           0.22
+save_results                                   11     38668.06      3515.28        3515.32
+synthesis/conditioning_prep                    22      2916.63       132.57         132.61
+synthesis/ddim_sampling                        22    782695.01     35577.05       35576.86
+synthesis/decode_first_stage                   22     12444.31       565.65         565.70
+update_action_queues                           11         6.85         0.62           0.65
+update_state_queues                            11        17.67         1.61           1.64
+world_model_interaction                        11    398375.58     36215.96       36215.75
+--------------------------------------------------------------------------------------
+TOTAL                                               2543116.13
+
+----------------------------------------
+GPU MEMORY SUMMARY
+----------------------------------------
+Peak allocated:       17890.50 MB
+Average allocated:    16129.98 MB
+
+----------------------------------------
+TOP 30 OPERATORS BY CUDA TIME
+----------------------------------------
+Operator                                              Count     CUDA(ms)      CPU(ms)  Self CUDA(ms)
+------------------------------------------------------------------------------------------------
+ProfilerStep*                                             6    443804.16    237696.98      237689.25
+aten::linear                                         171276    112286.23     13179.82           0.00
+aten::addmm                                           81456     79537.36      3799.84       79296.37
+ampere_sgemm_128x64_tn                                26400     52052.10         0.00       52052.10
+aten::matmul                                          90468     34234.05      6281.32           0.00
+aten::_convolution                                   100242     33623.79     13105.89           0.00
+aten::mm                                              89820     33580.74      3202.22       33253.18
+aten::convolution                                    100242     33575.23     13714.47           0.00
+aten::cudnn_convolution                               98430     30932.19      8640.50       29248.12
+ampere_sgemm_32x128_tn                                42348     20394.52         0.00       20394.52
+aten::conv2d                                          42042     18115.35      5932.30           0.00
+ampere_sgemm_128x32_tn                                40938     16429.81         0.00       16429.81
+xformers::efficient_attention_forward_cutlass         24000     15222.23      2532.93       15120.44
+fmha_cutlassF_f32_aligned_64x64_rf_sm80(Attenti...    24000     15121.31         0.00       15121.31
+ampere_sgemm_64x64_tn                                 21000     14627.12         0.00       14627.12
+aten::copy_                                          231819     14504.87    127056.51       14038.39
+aten::group_norm                                      87144     12033.73     10659.57           0.00
+aten::native_group_norm                               87144     11473.40      9449.36       11002.02
+aten::conv3d                                          26400      8852.13      3365.43           0.00
+void at::native::(anonymous namespace)::Rowwise...    87144      8714.68         0.00        8714.68
+void cudnn::ops::nchwToNhwcKernel<float, float,...   169824      8525.44         0.00        8525.44
+aten::clone                                          214314      8200.26      8568.82           0.00
+void at::native::elementwise_kernel<128, 2, at:...   220440      8109.62         0.00        8109.62
+void cutlass::Kernel<cutlass_80_simt_sgemm_128x...    15000      7919.30         0.00        7919.30
+aten::_to_copy                                        12219      5963.43    122411.53           0.00
+aten::to                                              58101      5952.65    122443.72           0.00
+aten::conv1d                                          30000      5878.95      4556.48           0.00
+Memcpy HtoD (Pageable -> Device)                       6696      5856.39         0.00        5856.39
+aten::reshape                                        671772      5124.03      9636.01           0.00
+sm80_xmma_fprop_implicit_gemm_indexed_tf32f32_t...    16272      5097.70         0.00        5097.70
+
+----------------------------------------
+OPERATOR CATEGORY BREAKDOWN
+----------------------------------------
+Category                         CUDA Time(ms)   Percentage
+---------------------------------------------------------
+Other                                481950.47        41.9%
+Linear/GEMM                          342333.09        29.8%
+Convolution                          159920.77        13.9%
+Elementwise                           54682.93         4.8%
+Memory                                36883.36         3.2%
+Attention                             34736.13         3.0%
+Normalization                         32081.19         2.8%
+Activation                             6449.19         0.6%
--- a/unitree_g1_pack_camera/case1/output/profile_output_0/profiling_report.txt
+++ b/unitree_g1_pack_camera/case1/output/profile_output_0/profiling_report.txt
@@ -0,0 +1,85 @@
+================================================================================
+PERFORMANCE PROFILING REPORT
+================================================================================
+
+----------------------------------------
+MACRO-LEVEL TIMING SUMMARY
+----------------------------------------
+Section                                     Count    Total(ms)      Avg(ms)   CUDA Avg(ms)
+--------------------------------------------------------------------------------------
+action_generation                              11    394370.58     35851.87       35851.67
+data_loading                                    1        52.00        52.00          52.03
+get_latent_z/encode                            22       899.25        40.88          40.91
+iteration_total                                11    830856.07     75532.37       75531.89
+load_transitions                                1         2.11         2.11           2.16
+model_loading/checkpoint                        1     10410.48     10410.48       10410.60
+model_loading/config                            1     49460.02     49460.02       49460.01
+model_to_cuda                                   1      4398.71      4398.71        4398.74
+prepare_init_input                              1        10.26        10.26          10.29
+prepare_observation                            11         5.08         0.46           0.49
+prepare_wm_observation                         11         2.03         0.18           0.21
+save_results                                   11     40851.48      3713.77        3713.80
+synthesis/conditioning_prep                    22      2270.48       103.20         103.24
+synthesis/ddim_sampling                        22    775253.03     35238.77       35238.59
+synthesis/decode_first_stage                   22     12416.36       564.38         564.43
+update_action_queues                           11         6.27         0.57           0.60
+update_state_queues                            11        16.57         1.51           1.54
+world_model_interaction                        11    395594.93     35963.18       35962.96
+--------------------------------------------------------------------------------------
+TOTAL                                               2516875.71
+
+----------------------------------------
+GPU MEMORY SUMMARY
+----------------------------------------
+Peak allocated:       17890.50 MB
+Average allocated:    16129.98 MB
+
+----------------------------------------
+TOP 30 OPERATORS BY CUDA TIME
+----------------------------------------
+Operator                                              Count     CUDA(ms)      CPU(ms)  Self CUDA(ms)
+------------------------------------------------------------------------------------------------
+ProfilerStep*                                             6    438046.75    232814.87      232809.14
+aten::linear                                         171276    112786.01     10941.68           0.00
+aten::addmm                                           81456     79765.93      3676.25       79525.01
+ampere_sgemm_128x64_tn                                26400     52203.84         0.00       52203.84
+aten::matmul                                          90468     34345.67      5341.43           0.00
+aten::_convolution                                   100242     33699.82     12792.11           0.00
+aten::mm                                              89820     33690.79      3067.07       33361.05
+aten::convolution                                    100242     33629.44     13178.80           0.00
+aten::cudnn_convolution                               98430     31003.85      9020.54       29316.78
+ampere_sgemm_32x128_tn                                42348     20439.71         0.00       20439.71
+aten::conv2d                                          42042     18256.98      5775.15           0.00
+ampere_sgemm_128x32_tn                                40938     16493.37         0.00       16493.37
+xformers::efficient_attention_forward_cutlass         24000     15256.14      2372.78       15154.49
+fmha_cutlassF_f32_aligned_64x64_rf_sm80(Attenti...    24000     15155.37         0.00       15155.37
+ampere_sgemm_64x64_tn                                 21000     14660.16         0.00       14660.16
+aten::copy_                                          231819     13133.93    137045.31       12663.88
+aten::group_norm                                      87144     12058.55      9417.15           0.00
+aten::native_group_norm                               87144     11497.70      8394.42       11024.58
+aten::conv3d                                          26400      8909.30      3210.64           0.00
+void at::native::(anonymous namespace)::Rowwise...    87144      8732.10         0.00        8732.10
+void cudnn::ops::nchwToNhwcKernel<float, float,...   169824      8550.65         0.00        8550.65
+aten::clone                                          214314      8182.15      7704.97           0.00
+void at::native::elementwise_kernel<128, 2, at:...   220440      8122.53         0.00        8122.53
+void cutlass::Kernel<cutlass_80_simt_sgemm_128x...    15000      7959.63         0.00        7959.63
+aten::conv1d                                          30000      5921.64      4150.30           0.00
+aten::reshape                                        671772      5134.95      7968.26           0.00
+sm80_xmma_fprop_implicit_gemm_indexed_tf32f32_t...    16272      5106.25         0.00        5106.25
+void cutlass_cudnn_infer::Kernel<cutlass_tensor...     4200      4882.51         0.00        4882.51
+aten::_to_copy                                        12219      4575.90    132491.24           0.00
+aten::to                                              58101      4568.11    132512.86           0.00
+
+----------------------------------------
+OPERATOR CATEGORY BREAKDOWN
+----------------------------------------
+Category                         CUDA Time(ms)   Percentage
+---------------------------------------------------------
+Other                                473442.20        41.5%
+Linear/GEMM                          343517.32        30.1%
+Convolution                          160436.45        14.1%
+Elementwise                           54809.55         4.8%
+Attention                             34810.12         3.1%
+Memory                                34401.76         3.0%
+Normalization                         32147.89         2.8%
+Activation                             6457.30         0.6%
--- a/unitree_g1_pack_camera/case1/output/tensorboard/events.out.tfevents.1768654109.node-0.99872.0
+++ b/unitree_g1_pack_camera/case1/output/tensorboard/events.out.tfevents.1768654109.node-0.99872.0
--- a/unitree_g1_pack_camera/case1/output/tensorboard/events.out.tfevents.1768654183.node-0.100294.0
+++ b/unitree_g1_pack_camera/case1/output/tensorboard/events.out.tfevents.1768654183.node-0.100294.0
--- a/unitree_g1_pack_camera/case1/output/tensorboard/events.out.tfevents.1768664289.node-0.139960.0
+++ b/unitree_g1_pack_camera/case1/output/tensorboard/events.out.tfevents.1768664289.node-0.139960.0
--- a/unitree_g1_pack_camera/case1/output/tensorboard/events.out.tfevents.1768664457.node-0.141866.0
+++ b/unitree_g1_pack_camera/case1/output/tensorboard/events.out.tfevents.1768664457.node-0.141866.0
--- a/unitree_g1_pack_camera/case1/run_world_model_interaction_profile.sh
+++ b/unitree_g1_pack_camera/case1/run_world_model_interaction_profile.sh
@@ -22,7 +22,8 @@ dataset="unitree_g1_pack_camera"
        --guidance_rescale 0.7 \
        --perframe_ae \
        --profile \
-        --profile_iterations 3
+        --profile_iterations 3 \
+        --profile_detail full
 } 2>&1 | tee "${res_dir}/output_profile.log"

 echo ""
--- a/unitree_g1_pack_camera/case4/output/tensorboard/events.out.tfevents.1768652463.node-0.90349.0
+++ b/unitree_g1_pack_camera/case4/output/tensorboard/events.out.tfevents.1768652463.node-0.90349.0
--- a/unitree_z1_dual_arm_cleanup_pencils/case1/output/tensorboard/events.out.tfevents.1768653377.node-0.96886.0
+++ b/unitree_z1_dual_arm_cleanup_pencils/case1/output/tensorboard/events.out.tfevents.1768653377.node-0.96886.0
--- a/unitree_z1_dual_arm_cleanup_pencils/case2/output/tensorboard/events.out.tfevents.1768654066.node-0.99536.0
+++ b/unitree_z1_dual_arm_cleanup_pencils/case2/output/tensorboard/events.out.tfevents.1768654066.node-0.99536.0
--- a/unitree_z1_dual_arm_cleanup_pencils/case3/output/tensorboard/events.out.tfevents.1768654759.node-0.104943.0
+++ b/unitree_z1_dual_arm_cleanup_pencils/case3/output/tensorboard/events.out.tfevents.1768654759.node-0.104943.0
--- a/unitree_z1_dual_arm_cleanup_pencils/case4/output/tensorboard/events.out.tfevents.1768655457.node-0.113979.0
+++ b/unitree_z1_dual_arm_cleanup_pencils/case4/output/tensorboard/events.out.tfevents.1768655457.node-0.113979.0
--- a/unitree_z1_dual_arm_stackbox/case1/output/tensorboard/events.out.tfevents.1768656164.node-0.121654.0
+++ b/unitree_z1_dual_arm_stackbox/case1/output/tensorboard/events.out.tfevents.1768656164.node-0.121654.0
--- a/unitree_z1_dual_arm_stackbox/case2/output/tensorboard/events.out.tfevents.1768656804.node-0.127411.0
+++ b/unitree_z1_dual_arm_stackbox/case2/output/tensorboard/events.out.tfevents.1768656804.node-0.127411.0
--- a/unitree_z1_dual_arm_stackbox/case3/output/tensorboard/events.out.tfevents.1768657441.node-0.129215.0
+++ b/unitree_z1_dual_arm_stackbox/case3/output/tensorboard/events.out.tfevents.1768657441.node-0.129215.0
--- a/unitree_z1_dual_arm_stackbox/case4/output/tensorboard/events.out.tfevents.1768658073.node-0.129560.0
+++ b/unitree_z1_dual_arm_stackbox/case4/output/tensorboard/events.out.tfevents.1768658073.node-0.129560.0
--- a/unitree_z1_dual_arm_stackbox_v2/case1/output/tensorboard/events.out.tfevents.1768658696.node-0.130758.0
+++ b/unitree_z1_dual_arm_stackbox_v2/case1/output/tensorboard/events.out.tfevents.1768658696.node-0.130758.0
--- a/unitree_z1_dual_arm_stackbox_v2/case2/output/tensorboard/events.out.tfevents.1768659620.node-0.131213.0
+++ b/unitree_z1_dual_arm_stackbox_v2/case2/output/tensorboard/events.out.tfevents.1768659620.node-0.131213.0
--- a/unitree_z1_dual_arm_stackbox_v2/case3/output/tensorboard/events.out.tfevents.1768660542.node-0.132493.0
+++ b/unitree_z1_dual_arm_stackbox_v2/case3/output/tensorboard/events.out.tfevents.1768660542.node-0.132493.0
--- a/unitree_z1_dual_arm_stackbox_v2/case4/output/tensorboard/events.out.tfevents.1768661469.node-0.132922.0
+++ b/unitree_z1_dual_arm_stackbox_v2/case4/output/tensorboard/events.out.tfevents.1768661469.node-0.132922.0
--- a/unitree_z1_stackbox/case1/output/tensorboard/events.out.tfevents.1768662393.node-0.134204.0
+++ b/unitree_z1_stackbox/case1/output/tensorboard/events.out.tfevents.1768662393.node-0.134204.0
--- a/unitree_z1_stackbox/case2/output/tensorboard/events.out.tfevents.1768663363.node-0.135074.0
+++ b/unitree_z1_stackbox/case2/output/tensorboard/events.out.tfevents.1768663363.node-0.135074.0
--- a/unitree_z1_stackbox/case3/output/tensorboard/events.out.tfevents.1768664339.node-0.140563.0
+++ b/unitree_z1_stackbox/case3/output/tensorboard/events.out.tfevents.1768664339.node-0.140563.0
--- a/unitree_z1_stackbox/case4/output/tensorboard/events.out.tfevents.1768665320.node-0.149197.0
+++ b/unitree_z1_stackbox/case4/output/tensorboard/events.out.tfevents.1768665320.node-0.149197.0
--- a/usefal.sh
+++ b/usefal.sh
@@ -0,0 +1 @@
+python3 psnr_score_for_challenge.py --gt_video unitree_g1_pack_camera/case1/unitree_g1_pack_camera_case1.mp4 --pred_video unitree_g1_pack_camera/case1/output/inference/0_full_fs6.mp4 --output_file unitree_g1_pack_camera/case1/psnr_result.json
				`@@ -0,0 +1 @@`
				`python3 psnr_score_for_challenge.py --gt_video unitree_g1_pack_camera/case1/unitree_g1_pack_camera_case1.mp4 --pred_video unitree_g1_pack_camera/case1/output/inference/0_full_fs6.mp4 --output_file unitree_g1_pack_camera/case1/psnr_result.json`