diff --git a/scripts/evaluation/world_model_interaction.py b/scripts/evaluation/world_model_interaction.py index c87bae5..07dc8ef 100644 --- a/scripts/evaluation/world_model_interaction.py +++ b/scripts/evaluation/world_model_interaction.py @@ -56,21 +56,50 @@ class TimingRecord: } -class ProfilerManager: - """Manages macro and micro-level profiling.""" - - def __init__(self, enabled: bool = False, output_dir: str = "./profile_output"): - self.enabled = enabled - self.output_dir = output_dir - self.macro_timings: Dict[str, List[float]] = {} - self.cuda_events: Dict[str, List[tuple]] = {} - self.memory_snapshots: List[Dict] = [] - self.pytorch_profiler = None - self.current_iteration = 0 - self.operator_stats: Dict[str, Dict] = {} - - if enabled: - os.makedirs(output_dir, exist_ok=True) +class ProfilerManager: + """Manages macro and micro-level profiling.""" + + def __init__( + self, + enabled: bool = False, + output_dir: str = "./profile_output", + profile_detail: str = "light", + ): + self.enabled = enabled + self.output_dir = output_dir + self.profile_detail = profile_detail + self.macro_timings: Dict[str, List[float]] = {} + self.cuda_events: Dict[str, List[tuple]] = {} + self.memory_snapshots: List[Dict] = [] + self.pytorch_profiler = None + self.current_iteration = 0 + self.operator_stats: Dict[str, Dict] = {} + self.profiler_config = self._build_profiler_config(profile_detail) + + if enabled: + os.makedirs(output_dir, exist_ok=True) + + def _build_profiler_config(self, profile_detail: str) -> Dict[str, Any]: + """Return profiler settings based on the requested detail level.""" + if profile_detail not in ("light", "full"): + raise ValueError(f"Unsupported profile_detail: {profile_detail}") + if profile_detail == "full": + return { + "record_shapes": True, + "profile_memory": True, + "with_stack": True, + "with_flops": True, + "with_modules": True, + "group_by_input_shape": True, + } + return { + "record_shapes": False, + "profile_memory": False, + "with_stack": False, + "with_flops": False, + "with_modules": False, + "group_by_input_shape": False, + } @contextmanager def profile_section(self, name: str, sync_cuda: bool = True): @@ -133,22 +162,22 @@ class ProfilerManager: if not self.enabled: return nullcontext() - self.pytorch_profiler = torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - schedule=torch.profiler.schedule( - wait=wait, warmup=warmup, active=active, repeat=1 - ), - on_trace_ready=self._trace_handler, - record_shapes=True, - profile_memory=True, - with_stack=True, - with_flops=True, - with_modules=True, - ) - return self.pytorch_profiler + self.pytorch_profiler = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + schedule=torch.profiler.schedule( + wait=wait, warmup=warmup, active=active, repeat=1 + ), + on_trace_ready=self._trace_handler, + record_shapes=self.profiler_config["record_shapes"], + profile_memory=self.profiler_config["profile_memory"], + with_stack=self.profiler_config["with_stack"], + with_flops=self.profiler_config["with_flops"], + with_modules=self.profiler_config["with_modules"], + ) + return self.pytorch_profiler def _trace_handler(self, prof): """Handle profiler trace output.""" @@ -158,8 +187,10 @@ class ProfilerManager: ) prof.export_chrome_trace(trace_path) - # Extract operator statistics - key_averages = prof.key_averages(group_by_input_shape=True) + # Extract operator statistics + key_averages = prof.key_averages( + group_by_input_shape=self.profiler_config["group_by_input_shape"] + ) for evt in key_averages: op_name = evt.key if op_name not in self.operator_stats: @@ -344,18 +375,22 @@ class ProfilerManager: # Global profiler instance _profiler: Optional[ProfilerManager] = None -def get_profiler() -> ProfilerManager: - """Get the global profiler instance.""" - global _profiler - if _profiler is None: - _profiler = ProfilerManager(enabled=False) - return _profiler - -def init_profiler(enabled: bool, output_dir: str) -> ProfilerManager: - """Initialize the global profiler.""" - global _profiler - _profiler = ProfilerManager(enabled=enabled, output_dir=output_dir) - return _profiler +def get_profiler() -> ProfilerManager: + """Get the global profiler instance.""" + global _profiler + if _profiler is None: + _profiler = ProfilerManager(enabled=False) + return _profiler + +def init_profiler(enabled: bool, output_dir: str, profile_detail: str) -> ProfilerManager: + """Initialize the global profiler.""" + global _profiler + _profiler = ProfilerManager( + enabled=enabled, + output_dir=output_dir, + profile_detail=profile_detail, + ) + return _profiler # ========== Original Functions ========== @@ -1193,13 +1228,20 @@ def get_parser(): default=None, help="Directory to save profiling results. Defaults to {savedir}/profile_output." ) - parser.add_argument( - "--profile_iterations", - type=int, - default=3, - help="Number of iterations to run PyTorch profiler's active phase for operator-level analysis." - ) - return parser + parser.add_argument( + "--profile_iterations", + type=int, + default=3, + help="Number of iterations to run PyTorch profiler's active phase for operator-level analysis." + ) + parser.add_argument( + "--profile_detail", + type=str, + choices=["light", "full"], + default="light", + help="Profiling detail level. Use 'full' for shapes/stacks/memory/flops." + ) + return parser if __name__ == '__main__': @@ -1214,7 +1256,11 @@ if __name__ == '__main__': profile_output_dir = args.profile_output_dir if profile_output_dir is None: profile_output_dir = os.path.join(args.savedir, "profile_output") - init_profiler(enabled=args.profile, output_dir=profile_output_dir) + init_profiler( + enabled=args.profile, + output_dir=profile_output_dir, + profile_detail=args.profile_detail, + ) rank, gpu_num = 0, 1 run_inference(args, gpu_num, rank) diff --git a/unitree_g1_pack_camera/case1/output/profile_output/profiling_report.txt b/unitree_g1_pack_camera/case1/output/profile_output/profiling_report.txt new file mode 100644 index 0000000..98ac526 --- /dev/null +++ b/unitree_g1_pack_camera/case1/output/profile_output/profiling_report.txt @@ -0,0 +1,85 @@ +================================================================================ +PERFORMANCE PROFILING REPORT +================================================================================ + +---------------------------------------- +MACRO-LEVEL TIMING SUMMARY +---------------------------------------- +Section Count Total(ms) Avg(ms) CUDA Avg(ms) +-------------------------------------------------------------------------------------- +action_generation 11 399707.47 36337.04 36336.85 +data_loading 1 52.85 52.85 52.88 +get_latent_z/encode 22 901.39 40.97 41.01 +iteration_total 11 836793.23 76072.11 76071.63 +load_transitions 1 2.24 2.24 2.28 +model_loading/checkpoint 1 11833.31 11833.31 11833.43 +model_loading/config 1 49774.19 49774.19 49774.16 +model_to_cuda 1 8909.30 8909.30 8909.33 +prepare_init_input 1 10.52 10.52 10.55 +prepare_observation 11 5.41 0.49 0.53 +prepare_wm_observation 11 2.12 0.19 0.22 +save_results 11 38668.06 3515.28 3515.32 +synthesis/conditioning_prep 22 2916.63 132.57 132.61 +synthesis/ddim_sampling 22 782695.01 35577.05 35576.86 +synthesis/decode_first_stage 22 12444.31 565.65 565.70 +update_action_queues 11 6.85 0.62 0.65 +update_state_queues 11 17.67 1.61 1.64 +world_model_interaction 11 398375.58 36215.96 36215.75 +-------------------------------------------------------------------------------------- +TOTAL 2543116.13 + +---------------------------------------- +GPU MEMORY SUMMARY +---------------------------------------- +Peak allocated: 17890.50 MB +Average allocated: 16129.98 MB + +---------------------------------------- +TOP 30 OPERATORS BY CUDA TIME +---------------------------------------- +Operator Count CUDA(ms) CPU(ms) Self CUDA(ms) +------------------------------------------------------------------------------------------------ +ProfilerStep* 6 443804.16 237696.98 237689.25 +aten::linear 171276 112286.23 13179.82 0.00 +aten::addmm 81456 79537.36 3799.84 79296.37 +ampere_sgemm_128x64_tn 26400 52052.10 0.00 52052.10 +aten::matmul 90468 34234.05 6281.32 0.00 +aten::_convolution 100242 33623.79 13105.89 0.00 +aten::mm 89820 33580.74 3202.22 33253.18 +aten::convolution 100242 33575.23 13714.47 0.00 +aten::cudnn_convolution 98430 30932.19 8640.50 29248.12 +ampere_sgemm_32x128_tn 42348 20394.52 0.00 20394.52 +aten::conv2d 42042 18115.35 5932.30 0.00 +ampere_sgemm_128x32_tn 40938 16429.81 0.00 16429.81 +xformers::efficient_attention_forward_cutlass 24000 15222.23 2532.93 15120.44 +fmha_cutlassF_f32_aligned_64x64_rf_sm80(Attenti... 24000 15121.31 0.00 15121.31 +ampere_sgemm_64x64_tn 21000 14627.12 0.00 14627.12 +aten::copy_ 231819 14504.87 127056.51 14038.39 +aten::group_norm 87144 12033.73 10659.57 0.00 +aten::native_group_norm 87144 11473.40 9449.36 11002.02 +aten::conv3d 26400 8852.13 3365.43 0.00 +void at::native::(anonymous namespace)::Rowwise... 87144 8714.68 0.00 8714.68 +void cudnn::ops::nchwToNhwcKernel Device) 6696 5856.39 0.00 5856.39 +aten::reshape 671772 5124.03 9636.01 0.00 +sm80_xmma_fprop_implicit_gemm_indexed_tf32f32_t... 16272 5097.70 0.00 5097.70 + +---------------------------------------- +OPERATOR CATEGORY BREAKDOWN +---------------------------------------- +Category CUDA Time(ms) Percentage +--------------------------------------------------------- +Other 481950.47 41.9% +Linear/GEMM 342333.09 29.8% +Convolution 159920.77 13.9% +Elementwise 54682.93 4.8% +Memory 36883.36 3.2% +Attention 34736.13 3.0% +Normalization 32081.19 2.8% +Activation 6449.19 0.6% diff --git a/unitree_g1_pack_camera/case1/output/profile_output_0/profiling_report.txt b/unitree_g1_pack_camera/case1/output/profile_output_0/profiling_report.txt new file mode 100644 index 0000000..de109d2 --- /dev/null +++ b/unitree_g1_pack_camera/case1/output/profile_output_0/profiling_report.txt @@ -0,0 +1,85 @@ +================================================================================ +PERFORMANCE PROFILING REPORT +================================================================================ + +---------------------------------------- +MACRO-LEVEL TIMING SUMMARY +---------------------------------------- +Section Count Total(ms) Avg(ms) CUDA Avg(ms) +-------------------------------------------------------------------------------------- +action_generation 11 394370.58 35851.87 35851.67 +data_loading 1 52.00 52.00 52.03 +get_latent_z/encode 22 899.25 40.88 40.91 +iteration_total 11 830856.07 75532.37 75531.89 +load_transitions 1 2.11 2.11 2.16 +model_loading/checkpoint 1 10410.48 10410.48 10410.60 +model_loading/config 1 49460.02 49460.02 49460.01 +model_to_cuda 1 4398.71 4398.71 4398.74 +prepare_init_input 1 10.26 10.26 10.29 +prepare_observation 11 5.08 0.46 0.49 +prepare_wm_observation 11 2.03 0.18 0.21 +save_results 11 40851.48 3713.77 3713.80 +synthesis/conditioning_prep 22 2270.48 103.20 103.24 +synthesis/ddim_sampling 22 775253.03 35238.77 35238.59 +synthesis/decode_first_stage 22 12416.36 564.38 564.43 +update_action_queues 11 6.27 0.57 0.60 +update_state_queues 11 16.57 1.51 1.54 +world_model_interaction 11 395594.93 35963.18 35962.96 +-------------------------------------------------------------------------------------- +TOTAL 2516875.71 + +---------------------------------------- +GPU MEMORY SUMMARY +---------------------------------------- +Peak allocated: 17890.50 MB +Average allocated: 16129.98 MB + +---------------------------------------- +TOP 30 OPERATORS BY CUDA TIME +---------------------------------------- +Operator Count CUDA(ms) CPU(ms) Self CUDA(ms) +------------------------------------------------------------------------------------------------ +ProfilerStep* 6 438046.75 232814.87 232809.14 +aten::linear 171276 112786.01 10941.68 0.00 +aten::addmm 81456 79765.93 3676.25 79525.01 +ampere_sgemm_128x64_tn 26400 52203.84 0.00 52203.84 +aten::matmul 90468 34345.67 5341.43 0.00 +aten::_convolution 100242 33699.82 12792.11 0.00 +aten::mm 89820 33690.79 3067.07 33361.05 +aten::convolution 100242 33629.44 13178.80 0.00 +aten::cudnn_convolution 98430 31003.85 9020.54 29316.78 +ampere_sgemm_32x128_tn 42348 20439.71 0.00 20439.71 +aten::conv2d 42042 18256.98 5775.15 0.00 +ampere_sgemm_128x32_tn 40938 16493.37 0.00 16493.37 +xformers::efficient_attention_forward_cutlass 24000 15256.14 2372.78 15154.49 +fmha_cutlassF_f32_aligned_64x64_rf_sm80(Attenti... 24000 15155.37 0.00 15155.37 +ampere_sgemm_64x64_tn 21000 14660.16 0.00 14660.16 +aten::copy_ 231819 13133.93 137045.31 12663.88 +aten::group_norm 87144 12058.55 9417.15 0.00 +aten::native_group_norm 87144 11497.70 8394.42 11024.58 +aten::conv3d 26400 8909.30 3210.64 0.00 +void at::native::(anonymous namespace)::Rowwise... 87144 8732.10 0.00 8732.10 +void cudnn::ops::nchwToNhwcKernel&1 | tee "${res_dir}/output_profile.log" echo "" diff --git a/unitree_g1_pack_camera/case4/output/tensorboard/events.out.tfevents.1768652463.node-0.90349.0 b/unitree_g1_pack_camera/case4/output/tensorboard/events.out.tfevents.1768652463.node-0.90349.0 index a141f30..3d3ddef 100644 Binary files a/unitree_g1_pack_camera/case4/output/tensorboard/events.out.tfevents.1768652463.node-0.90349.0 and b/unitree_g1_pack_camera/case4/output/tensorboard/events.out.tfevents.1768652463.node-0.90349.0 differ diff --git a/unitree_z1_dual_arm_cleanup_pencils/case1/output/tensorboard/events.out.tfevents.1768653377.node-0.96886.0 b/unitree_z1_dual_arm_cleanup_pencils/case1/output/tensorboard/events.out.tfevents.1768653377.node-0.96886.0 new file mode 100644 index 0000000..be8daa0 Binary files /dev/null and b/unitree_z1_dual_arm_cleanup_pencils/case1/output/tensorboard/events.out.tfevents.1768653377.node-0.96886.0 differ diff --git a/unitree_z1_dual_arm_cleanup_pencils/case2/output/tensorboard/events.out.tfevents.1768654066.node-0.99536.0 b/unitree_z1_dual_arm_cleanup_pencils/case2/output/tensorboard/events.out.tfevents.1768654066.node-0.99536.0 new file mode 100644 index 0000000..26a3aa1 Binary files /dev/null and b/unitree_z1_dual_arm_cleanup_pencils/case2/output/tensorboard/events.out.tfevents.1768654066.node-0.99536.0 differ diff --git a/unitree_z1_dual_arm_cleanup_pencils/case3/output/tensorboard/events.out.tfevents.1768654759.node-0.104943.0 b/unitree_z1_dual_arm_cleanup_pencils/case3/output/tensorboard/events.out.tfevents.1768654759.node-0.104943.0 new file mode 100644 index 0000000..b62f042 Binary files /dev/null and b/unitree_z1_dual_arm_cleanup_pencils/case3/output/tensorboard/events.out.tfevents.1768654759.node-0.104943.0 differ diff --git a/unitree_z1_dual_arm_cleanup_pencils/case4/output/tensorboard/events.out.tfevents.1768655457.node-0.113979.0 b/unitree_z1_dual_arm_cleanup_pencils/case4/output/tensorboard/events.out.tfevents.1768655457.node-0.113979.0 new file mode 100644 index 0000000..072f97d Binary files /dev/null and b/unitree_z1_dual_arm_cleanup_pencils/case4/output/tensorboard/events.out.tfevents.1768655457.node-0.113979.0 differ diff --git a/unitree_z1_dual_arm_stackbox/case1/output/tensorboard/events.out.tfevents.1768656164.node-0.121654.0 b/unitree_z1_dual_arm_stackbox/case1/output/tensorboard/events.out.tfevents.1768656164.node-0.121654.0 new file mode 100644 index 0000000..738fb89 Binary files /dev/null and b/unitree_z1_dual_arm_stackbox/case1/output/tensorboard/events.out.tfevents.1768656164.node-0.121654.0 differ diff --git a/unitree_z1_dual_arm_stackbox/case2/output/tensorboard/events.out.tfevents.1768656804.node-0.127411.0 b/unitree_z1_dual_arm_stackbox/case2/output/tensorboard/events.out.tfevents.1768656804.node-0.127411.0 new file mode 100644 index 0000000..ef03061 Binary files /dev/null and b/unitree_z1_dual_arm_stackbox/case2/output/tensorboard/events.out.tfevents.1768656804.node-0.127411.0 differ diff --git a/unitree_z1_dual_arm_stackbox/case3/output/tensorboard/events.out.tfevents.1768657441.node-0.129215.0 b/unitree_z1_dual_arm_stackbox/case3/output/tensorboard/events.out.tfevents.1768657441.node-0.129215.0 new file mode 100644 index 0000000..1c67947 Binary files /dev/null and b/unitree_z1_dual_arm_stackbox/case3/output/tensorboard/events.out.tfevents.1768657441.node-0.129215.0 differ diff --git a/unitree_z1_dual_arm_stackbox/case4/output/tensorboard/events.out.tfevents.1768658073.node-0.129560.0 b/unitree_z1_dual_arm_stackbox/case4/output/tensorboard/events.out.tfevents.1768658073.node-0.129560.0 new file mode 100644 index 0000000..7d628cd Binary files /dev/null and b/unitree_z1_dual_arm_stackbox/case4/output/tensorboard/events.out.tfevents.1768658073.node-0.129560.0 differ diff --git a/unitree_z1_dual_arm_stackbox_v2/case1/output/tensorboard/events.out.tfevents.1768658696.node-0.130758.0 b/unitree_z1_dual_arm_stackbox_v2/case1/output/tensorboard/events.out.tfevents.1768658696.node-0.130758.0 new file mode 100644 index 0000000..fcf6f2b Binary files /dev/null and b/unitree_z1_dual_arm_stackbox_v2/case1/output/tensorboard/events.out.tfevents.1768658696.node-0.130758.0 differ diff --git a/unitree_z1_dual_arm_stackbox_v2/case2/output/tensorboard/events.out.tfevents.1768659620.node-0.131213.0 b/unitree_z1_dual_arm_stackbox_v2/case2/output/tensorboard/events.out.tfevents.1768659620.node-0.131213.0 new file mode 100644 index 0000000..aa8769b Binary files /dev/null and b/unitree_z1_dual_arm_stackbox_v2/case2/output/tensorboard/events.out.tfevents.1768659620.node-0.131213.0 differ diff --git a/unitree_z1_dual_arm_stackbox_v2/case3/output/tensorboard/events.out.tfevents.1768660542.node-0.132493.0 b/unitree_z1_dual_arm_stackbox_v2/case3/output/tensorboard/events.out.tfevents.1768660542.node-0.132493.0 new file mode 100644 index 0000000..604927e Binary files /dev/null and b/unitree_z1_dual_arm_stackbox_v2/case3/output/tensorboard/events.out.tfevents.1768660542.node-0.132493.0 differ diff --git a/unitree_z1_dual_arm_stackbox_v2/case4/output/tensorboard/events.out.tfevents.1768661469.node-0.132922.0 b/unitree_z1_dual_arm_stackbox_v2/case4/output/tensorboard/events.out.tfevents.1768661469.node-0.132922.0 new file mode 100644 index 0000000..1511b1a Binary files /dev/null and b/unitree_z1_dual_arm_stackbox_v2/case4/output/tensorboard/events.out.tfevents.1768661469.node-0.132922.0 differ diff --git a/unitree_z1_stackbox/case1/output/tensorboard/events.out.tfevents.1768662393.node-0.134204.0 b/unitree_z1_stackbox/case1/output/tensorboard/events.out.tfevents.1768662393.node-0.134204.0 new file mode 100644 index 0000000..61c6342 Binary files /dev/null and b/unitree_z1_stackbox/case1/output/tensorboard/events.out.tfevents.1768662393.node-0.134204.0 differ diff --git a/unitree_z1_stackbox/case2/output/tensorboard/events.out.tfevents.1768663363.node-0.135074.0 b/unitree_z1_stackbox/case2/output/tensorboard/events.out.tfevents.1768663363.node-0.135074.0 new file mode 100644 index 0000000..e01a840 Binary files /dev/null and b/unitree_z1_stackbox/case2/output/tensorboard/events.out.tfevents.1768663363.node-0.135074.0 differ diff --git a/unitree_z1_stackbox/case3/output/tensorboard/events.out.tfevents.1768664339.node-0.140563.0 b/unitree_z1_stackbox/case3/output/tensorboard/events.out.tfevents.1768664339.node-0.140563.0 new file mode 100644 index 0000000..e795ded Binary files /dev/null and b/unitree_z1_stackbox/case3/output/tensorboard/events.out.tfevents.1768664339.node-0.140563.0 differ diff --git a/unitree_z1_stackbox/case4/output/tensorboard/events.out.tfevents.1768665320.node-0.149197.0 b/unitree_z1_stackbox/case4/output/tensorboard/events.out.tfevents.1768665320.node-0.149197.0 new file mode 100644 index 0000000..d4d1a62 Binary files /dev/null and b/unitree_z1_stackbox/case4/output/tensorboard/events.out.tfevents.1768665320.node-0.149197.0 differ diff --git a/usefal.sh b/usefal.sh new file mode 100644 index 0000000..3e28604 --- /dev/null +++ b/usefal.sh @@ -0,0 +1 @@ +python3 psnr_score_for_challenge.py --gt_video unitree_g1_pack_camera/case1/unitree_g1_pack_camera_case1.mp4 --pred_video unitree_g1_pack_camera/case1/output/inference/0_full_fs6.mp4 --output_file unitree_g1_pack_camera/case1/psnr_result.json \ No newline at end of file