From 2a6068f9e4c8b5d86d0b8c7f572c601094617d99 Mon Sep 17 00:00:00 2001
From: qhy <2728290997@qq.com>
Date: Tue, 10 Feb 2026 17:13:45 +0800
Subject: [PATCH] =?UTF-8?q?=E5=87=8F=E5=B0=91=E4=BA=86=E4=B8=80=E8=B7=AF?=
 =?UTF-8?q?=E8=A7=86=E9=A2=91vae=E8=A7=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scripts/evaluation/world_model_interaction.py | 44 ++++++++++++-------
 .../case1/output.log                          | 20 ++++-----
 .../case1/run_world_model_interaction.sh      |  3 +-
 3 files changed, 41 insertions(+), 26 deletions(-)

diff --git a/scripts/evaluation/world_model_interaction.py b/scripts/evaluation/world_model_interaction.py
index 562d357..1b95a13 100644
--- a/scripts/evaluation/world_model_interaction.py
+++ b/scripts/evaluation/world_model_interaction.py
@@ -330,7 +330,8 @@ def image_guided_synthesis_sim_mode(
         timestep_spacing: str = 'uniform',
         guidance_rescale: float = 0.0,
         sim_mode: bool = True,
-        **kwargs) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        decode_video: bool = True,
+        **kwargs) -> tuple[torch.Tensor | None, torch.Tensor, torch.Tensor]:
     """
     Performs image-guided video generation in a simulation-style mode with optional multimodal guidance (image, state, action, text).
 
@@ -353,10 +354,13 @@ def image_guided_synthesis_sim_mode(
         timestep_spacing (str): Timestep sampling method in DDIM sampler. Typically "uniform" or "linspace".
         guidance_rescale (float): Guidance rescaling factor to mitigate overexposure from classifier-free guidance.
         sim_mode (bool): Whether to perform world-model interaction or decision-making using the world-model.
+        decode_video (bool): Whether to decode latent samples to pixel-space video.
+            Set to False to skip VAE decode for speed when only actions/states are needed.
         **kwargs: Additional arguments passed to the DDIM sampler.
 
     Returns:
-        batch_variants (torch.Tensor): Predicted pixel-space video frames [B, C, T, H, W].
+        batch_variants (torch.Tensor | None): Predicted pixel-space video frames [B, C, T, H, W],
+            or None when decode_video=False.
         actions (torch.Tensor): Predicted action sequences [B, T, D] from diffusion decoding.
         states (torch.Tensor): Predicted state sequences [B, T, D] from diffusion decoding.
     """
@@ -409,6 +413,7 @@ def image_guided_synthesis_sim_mode(
     kwargs.update({"unconditional_conditioning_img_nonetext": None})
     cond_mask = None
     cond_z0 = None
+    batch_variants = None
     if ddim_sampler is not None:
         samples, actions, states, intermedia = ddim_sampler.sample(
             S=ddim_steps,
@@ -427,9 +432,10 @@ def image_guided_synthesis_sim_mode(
             guidance_rescale=guidance_rescale,
             **kwargs)
 
-        # Reconstruct from latent to pixel space
-        batch_images = model.decode_first_stage(samples)
-        batch_variants = batch_images
+        if decode_video:
+            # Reconstruct from latent to pixel space
+            batch_images = model.decode_first_stage(samples)
+            batch_variants = batch_images
 
     return batch_variants, actions, states
 
@@ -590,7 +596,8 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
                     fs=model_input_fs,
                     timestep_spacing=args.timestep_spacing,
                     guidance_rescale=args.guidance_rescale,
-                    sim_mode=False)
+                    sim_mode=False,
+                    decode_video=not args.fast_policy_no_decode)
 
                 # Update future actions in the observation queues
                 for idx in range(len(pred_actions[0])):
@@ -648,11 +655,12 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
                                                       observation)
 
                 # Save the imagen videos for decision-making
-                sample_tag = f"{args.dataset}-vid{sample['videoid']}-dm-fs-{fs}/itr-{itr}"
-                log_to_tensorboard(writer,
-                                   pred_videos_0,
-                                   sample_tag,
-                                   fps=args.save_fps)
+                if pred_videos_0 is not None:
+                    sample_tag = f"{args.dataset}-vid{sample['videoid']}-dm-fs-{fs}/itr-{itr}"
+                    log_to_tensorboard(writer,
+                                       pred_videos_0,
+                                       sample_tag,
+                                       fps=args.save_fps)
                 # Save videos environment changes via world-model interaction
                 sample_tag = f"{args.dataset}-vid{sample['videoid']}-wd-fs-{fs}/itr-{itr}"
                 log_to_tensorboard(writer,
@@ -661,10 +669,11 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
                                    fps=args.save_fps)
 
                 # Save the imagen videos for decision-making
-                sample_video_file = f'{video_save_dir}/dm/{fs}/itr-{itr}.mp4'
-                save_results(pred_videos_0.cpu(),
-                             sample_video_file,
-                             fps=args.save_fps)
+                if pred_videos_0 is not None:
+                    sample_video_file = f'{video_save_dir}/dm/{fs}/itr-{itr}.mp4'
+                    save_results(pred_videos_0.cpu(),
+                                 sample_video_file,
+                                 fps=args.save_fps)
                 # Save videos environment changes via world-model interaction
                 sample_video_file = f'{video_save_dir}/wm/{fs}/itr-{itr}.mp4'
                 save_results(pred_videos_1.cpu(),
@@ -797,6 +806,11 @@ def get_parser():
                         action='store_true',
                         default=False,
                         help="not using the predicted states as comparison")
+    parser.add_argument(
+        "--fast_policy_no_decode",
+        action='store_true',
+        default=False,
+        help="Speed mode: policy pass only predicts actions, skip policy video decode/log/save.")
     parser.add_argument("--save_fps",
                         type=int,
                         default=8,
diff --git a/unitree_z1_dual_arm_stackbox_v2/case1/output.log b/unitree_z1_dual_arm_stackbox_v2/case1/output.log
index 90fa18b..af627c1 100644
--- a/unitree_z1_dual_arm_stackbox_v2/case1/output.log
+++ b/unitree_z1_dual_arm_stackbox_v2/case1/output.log
@@ -1,10 +1,10 @@
-2026-02-10 16:42:59.052755: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
-2026-02-10 16:42:59.102749: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
-2026-02-10 16:42:59.102803: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
-2026-02-10 16:42:59.104125: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
-2026-02-10 16:42:59.111711: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
+2026-02-10 17:03:42.057881: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
+2026-02-10 17:03:42.107520: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
+2026-02-10 17:03:42.107564: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
+2026-02-10 17:03:42.108900: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
+2026-02-10 17:03:42.116404: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
-2026-02-10 16:43:00.040735: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
+2026-02-10 17:03:43.044539: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 Global seed set to 123
 INFO:mainlogger:LatentVisualDiffusion: Running in v-prediction mode
 INFO:unifolm_wma.models.diffusion_head.conditional_unet1d:number of parameters: 5.010531e+08
@@ -92,7 +92,7 @@ DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing XbmImagePlugin
 DEBUG:PIL.Image:Importing XpmImagePlugin
 DEBUG:PIL.Image:Importing XVThumbImagePlugin
-  9%|▉         | 1/11 [00:40<06:41, 40.19s/it] 18%|█▊        | 2/11 [01:20<06:04, 40.45s/it] 27%|██▋       | 3/11 [02:01<05:25, 40.72s/it] 36%|███▋      | 4/11 [02:42<04:45, 40.81s/it] 45%|████▌     | 5/11 [03:23<04:04, 40.76s/it] 55%|█████▍    | 6/11 [04:03<03:22, 40.57s/it] 64%|██████▎   | 7/11 [04:43<02:41, 40.48s/it] 73%|███████▎  | 8/11 [05:24<02:01, 40.44s/it] 82%|████████▏ | 9/11 [06:04<01:20, 40.41s/it] 91%|█████████ | 10/11 [06:45<00:40, 40.44s/it]100%|██████████| 11/11 [07:25<00:00, 40.45s/it]100%|██████████| 11/11 [07:25<00:00, 40.51s/it]
+  9%|▉         | 1/11 [00:37<06:15, 37.55s/it] 18%|█▊        | 2/11 [01:15<05:39, 37.71s/it] 27%|██▋       | 3/11 [01:53<05:03, 37.98s/it] 36%|███▋      | 4/11 [02:32<04:26, 38.13s/it] 45%|████▌     | 5/11 [03:10<03:48, 38.14s/it] 55%|█████▍    | 6/11 [03:48<03:10, 38.07s/it] 64%|██████▎   | 7/11 [04:26<02:32, 38.02s/it] 73%|███████▎  | 8/11 [05:04<01:54, 38.01s/it] 82%|████████▏ | 9/11 [05:41<01:15, 37.99s/it] 91%|█████████ | 10/11 [06:19<00:37, 37.99s/it]100%|██████████| 11/11 [06:57<00:00, 38.00s/it]100%|██████████| 11/11 [06:57<00:00, 38.00s/it]
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 1: generating actions ...
 >>> Step 1: interacting with world model ...
@@ -125,6 +125,6 @@ DEBUG:PIL.Image:Importing XVThumbImagePlugin
 >>> Step 10: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 
-real	9m4.275s
-user	9m59.012s
-sys	1m24.037s
+real	8m36.548s
+user	9m22.484s
+sys	1m21.506s
diff --git a/unitree_z1_dual_arm_stackbox_v2/case1/run_world_model_interaction.sh b/unitree_z1_dual_arm_stackbox_v2/case1/run_world_model_interaction.sh
index bdcbbff..0aaed8f 100644
--- a/unitree_z1_dual_arm_stackbox_v2/case1/run_world_model_interaction.sh
+++ b/unitree_z1_dual_arm_stackbox_v2/case1/run_world_model_interaction.sh
@@ -20,5 +20,6 @@ dataset="unitree_z1_dual_arm_stackbox_v2"
         --n_iter 11 \
         --timestep_spacing 'uniform_trailing' \
         --guidance_rescale 0.7 \
-        --perframe_ae
+        --perframe_ae \
+        --fast_policy_no_decode
 } 2>&1 | tee "${res_dir}/output.log"