From 2a6068f9e4c8b5d86d0b8c7f572c601094617d99 Mon Sep 17 00:00:00 2001 From: qhy <2728290997@qq.com> Date: Tue, 10 Feb 2026 17:13:45 +0800 Subject: [PATCH] =?UTF-8?q?=E5=87=8F=E5=B0=91=E4=BA=86=E4=B8=80=E8=B7=AF?= =?UTF-8?q?=E8=A7=86=E9=A2=91vae=E8=A7=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/evaluation/world_model_interaction.py | 44 ++++++++++++------- .../case1/output.log | 20 ++++----- .../case1/run_world_model_interaction.sh | 3 +- 3 files changed, 41 insertions(+), 26 deletions(-) diff --git a/scripts/evaluation/world_model_interaction.py b/scripts/evaluation/world_model_interaction.py index 562d357..1b95a13 100644 --- a/scripts/evaluation/world_model_interaction.py +++ b/scripts/evaluation/world_model_interaction.py @@ -330,7 +330,8 @@ def image_guided_synthesis_sim_mode( timestep_spacing: str = 'uniform', guidance_rescale: float = 0.0, sim_mode: bool = True, - **kwargs) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + decode_video: bool = True, + **kwargs) -> tuple[torch.Tensor | None, torch.Tensor, torch.Tensor]: """ Performs image-guided video generation in a simulation-style mode with optional multimodal guidance (image, state, action, text). @@ -353,10 +354,13 @@ def image_guided_synthesis_sim_mode( timestep_spacing (str): Timestep sampling method in DDIM sampler. Typically "uniform" or "linspace". guidance_rescale (float): Guidance rescaling factor to mitigate overexposure from classifier-free guidance. sim_mode (bool): Whether to perform world-model interaction or decision-making using the world-model. + decode_video (bool): Whether to decode latent samples to pixel-space video. + Set to False to skip VAE decode for speed when only actions/states are needed. **kwargs: Additional arguments passed to the DDIM sampler. Returns: - batch_variants (torch.Tensor): Predicted pixel-space video frames [B, C, T, H, W]. + batch_variants (torch.Tensor | None): Predicted pixel-space video frames [B, C, T, H, W], + or None when decode_video=False. actions (torch.Tensor): Predicted action sequences [B, T, D] from diffusion decoding. states (torch.Tensor): Predicted state sequences [B, T, D] from diffusion decoding. """ @@ -409,6 +413,7 @@ def image_guided_synthesis_sim_mode( kwargs.update({"unconditional_conditioning_img_nonetext": None}) cond_mask = None cond_z0 = None + batch_variants = None if ddim_sampler is not None: samples, actions, states, intermedia = ddim_sampler.sample( S=ddim_steps, @@ -427,9 +432,10 @@ def image_guided_synthesis_sim_mode( guidance_rescale=guidance_rescale, **kwargs) - # Reconstruct from latent to pixel space - batch_images = model.decode_first_stage(samples) - batch_variants = batch_images + if decode_video: + # Reconstruct from latent to pixel space + batch_images = model.decode_first_stage(samples) + batch_variants = batch_images return batch_variants, actions, states @@ -590,7 +596,8 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None: fs=model_input_fs, timestep_spacing=args.timestep_spacing, guidance_rescale=args.guidance_rescale, - sim_mode=False) + sim_mode=False, + decode_video=not args.fast_policy_no_decode) # Update future actions in the observation queues for idx in range(len(pred_actions[0])): @@ -648,11 +655,12 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None: observation) # Save the imagen videos for decision-making - sample_tag = f"{args.dataset}-vid{sample['videoid']}-dm-fs-{fs}/itr-{itr}" - log_to_tensorboard(writer, - pred_videos_0, - sample_tag, - fps=args.save_fps) + if pred_videos_0 is not None: + sample_tag = f"{args.dataset}-vid{sample['videoid']}-dm-fs-{fs}/itr-{itr}" + log_to_tensorboard(writer, + pred_videos_0, + sample_tag, + fps=args.save_fps) # Save videos environment changes via world-model interaction sample_tag = f"{args.dataset}-vid{sample['videoid']}-wd-fs-{fs}/itr-{itr}" log_to_tensorboard(writer, @@ -661,10 +669,11 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None: fps=args.save_fps) # Save the imagen videos for decision-making - sample_video_file = f'{video_save_dir}/dm/{fs}/itr-{itr}.mp4' - save_results(pred_videos_0.cpu(), - sample_video_file, - fps=args.save_fps) + if pred_videos_0 is not None: + sample_video_file = f'{video_save_dir}/dm/{fs}/itr-{itr}.mp4' + save_results(pred_videos_0.cpu(), + sample_video_file, + fps=args.save_fps) # Save videos environment changes via world-model interaction sample_video_file = f'{video_save_dir}/wm/{fs}/itr-{itr}.mp4' save_results(pred_videos_1.cpu(), @@ -797,6 +806,11 @@ def get_parser(): action='store_true', default=False, help="not using the predicted states as comparison") + parser.add_argument( + "--fast_policy_no_decode", + action='store_true', + default=False, + help="Speed mode: policy pass only predicts actions, skip policy video decode/log/save.") parser.add_argument("--save_fps", type=int, default=8, diff --git a/unitree_z1_dual_arm_stackbox_v2/case1/output.log b/unitree_z1_dual_arm_stackbox_v2/case1/output.log index 90fa18b..af627c1 100644 --- a/unitree_z1_dual_arm_stackbox_v2/case1/output.log +++ b/unitree_z1_dual_arm_stackbox_v2/case1/output.log @@ -1,10 +1,10 @@ -2026-02-10 16:42:59.052755: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. -2026-02-10 16:42:59.102749: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered -2026-02-10 16:42:59.102803: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered -2026-02-10 16:42:59.104125: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered -2026-02-10 16:42:59.111711: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. +2026-02-10 17:03:42.057881: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. +2026-02-10 17:03:42.107520: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered +2026-02-10 17:03:42.107564: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered +2026-02-10 17:03:42.108900: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered +2026-02-10 17:03:42.116404: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. -2026-02-10 16:43:00.040735: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT +2026-02-10 17:03:43.044539: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT Global seed set to 123 INFO:mainlogger:LatentVisualDiffusion: Running in v-prediction mode INFO:unifolm_wma.models.diffusion_head.conditional_unet1d:number of parameters: 5.010531e+08 @@ -92,7 +92,7 @@ DEBUG:PIL.Image:Importing WmfImagePlugin DEBUG:PIL.Image:Importing XbmImagePlugin DEBUG:PIL.Image:Importing XpmImagePlugin DEBUG:PIL.Image:Importing XVThumbImagePlugin - 9%|▉ | 1/11 [00:40<06:41, 40.19s/it] 18%|█▊ | 2/11 [01:20<06:04, 40.45s/it] 27%|██▋ | 3/11 [02:01<05:25, 40.72s/it] 36%|███▋ | 4/11 [02:42<04:45, 40.81s/it] 45%|████▌ | 5/11 [03:23<04:04, 40.76s/it] 55%|█████▍ | 6/11 [04:03<03:22, 40.57s/it] 64%|██████▎ | 7/11 [04:43<02:41, 40.48s/it] 73%|███████▎ | 8/11 [05:24<02:01, 40.44s/it] 82%|████████▏ | 9/11 [06:04<01:20, 40.41s/it] 91%|█████████ | 10/11 [06:45<00:40, 40.44s/it] 100%|██████████| 11/11 [07:25<00:00, 40.45s/it] 100%|██████████| 11/11 [07:25<00:00, 40.51s/it] + 9%|▉ | 1/11 [00:37<06:15, 37.55s/it] 18%|█▊ | 2/11 [01:15<05:39, 37.71s/it] 27%|██▋ | 3/11 [01:53<05:03, 37.98s/it] 36%|███▋ | 4/11 [02:32<04:26, 38.13s/it] 45%|████▌ | 5/11 [03:10<03:48, 38.14s/it] 55%|█████▍ | 6/11 [03:48<03:10, 38.07s/it] 64%|██████▎ | 7/11 [04:26<02:32, 38.02s/it] 73%|███████▎ | 8/11 [05:04<01:54, 38.01s/it] 82%|████████▏ | 9/11 [05:41<01:15, 37.99s/it] 91%|█████████ | 10/11 [06:19<00:37, 37.99s/it] 100%|██████████| 11/11 [06:57<00:00, 38.00s/it] 100%|██████████| 11/11 [06:57<00:00, 38.00s/it] >>>>>>>>>>>>>>>>>>>>>>>> >>> Step 1: generating actions ... >>> Step 1: interacting with world model ... @@ -125,6 +125,6 @@ DEBUG:PIL.Image:Importing XVThumbImagePlugin >>> Step 10: interacting with world model ... >>>>>>>>>>>>>>>>>>>>>>>> -real 9m4.275s -user 9m59.012s -sys 1m24.037s +real 8m36.548s +user 9m22.484s +sys 1m21.506s diff --git a/unitree_z1_dual_arm_stackbox_v2/case1/run_world_model_interaction.sh b/unitree_z1_dual_arm_stackbox_v2/case1/run_world_model_interaction.sh index bdcbbff..0aaed8f 100644 --- a/unitree_z1_dual_arm_stackbox_v2/case1/run_world_model_interaction.sh +++ b/unitree_z1_dual_arm_stackbox_v2/case1/run_world_model_interaction.sh @@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_stackbox_v2" --n_iter 11 \ --timestep_spacing 'uniform_trailing' \ --guidance_rescale 0.7 \ - --perframe_ae + --perframe_ae \ + --fast_policy_no_decode } 2>&1 | tee "${res_dir}/output.log"