结果

Layer 3: 延迟 decode，只解码 CLIP 需要的 1 帧
- world model 调用 decode_video=False，跳过 16 帧全量 decode
- 只 decode 最后 1 帧给 CLIP embedding / observation queue
- 存 raw latent，循环结束后统一 batch decode 生成最终视频
- 每轮省 15 次 VAE decode，8 轮共省 120 次
- 跳过中间迭代的 wm tensorboard 保存以及 dm/wm mp4 保存

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-02-11 07:12:16 +00:00 · 2026-02-11 07:11:55 +00:00
3 changed files with 51 additions and 62 deletions
--- a/scripts/evaluation/world_model_interaction.py
+++ b/scripts/evaluation/world_model_interaction.py
@@ -559,6 +559,7 @@ def image_guided_synthesis_sim_mode(
        autocast_ctx = nullcontext()

    batch_variants = None
+    samples = None
    if ddim_sampler is not None:
        with autocast_ctx:
            samples, actions, states, intermedia = ddim_sampler.sample(
@@ -583,7 +584,7 @@ def image_guided_synthesis_sim_mode(
            batch_images = model.decode_first_stage(samples)
            batch_variants = batch_images

-    return batch_variants, actions, states
+    return batch_variants, actions, states, samples


 def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
@@ -693,7 +694,7 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
            sample_save_dir = f'{video_save_dir}/wm/{fs}'
            os.makedirs(sample_save_dir, exist_ok=True)
            # For collecting interaction videos
-            wm_video = []
+            wm_latent = []
            # Initialize observation queues
            cond_obs_queues = {
                "observation.images.top":
@@ -749,7 +750,7 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:

                # Use world-model in policy to generate action
                print(f'>>> Step {itr}: generating actions ...')
-                pred_videos_0, pred_actions, _ = image_guided_synthesis_sim_mode(
+                pred_videos_0, pred_actions, _, _ = image_guided_synthesis_sim_mode(
                    model,
                    sample['instruction'],
                    observation,
@@ -791,7 +792,7 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:

                # Interaction with the world-model
                print(f'>>> Step {itr}: interacting with world model ...')
-                pred_videos_1, _, pred_states = image_guided_synthesis_sim_mode(
+                pred_videos_1, _, pred_states, wm_samples = image_guided_synthesis_sim_mode(
                    model,
                    "",
                    observation,
@@ -804,12 +805,16 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
                    fs=model_input_fs,
                    text_input=False,
                    timestep_spacing=args.timestep_spacing,
-                    guidance_rescale=args.guidance_rescale)
+                    guidance_rescale=args.guidance_rescale,
+                    decode_video=False)
+
+                # Decode only the last frame for CLIP embedding in next iteration
+                last_frame_pixel = model.decode_first_stage(wm_samples[:, :, -1:, :, :])

                for idx in range(args.exe_steps):
                    observation = {
                        'observation.images.top':
-                        pred_videos_1[0][:, idx:idx + 1].permute(1, 0, 2, 3),
+                        last_frame_pixel[0, :, 0:1].permute(1, 0, 2, 3),
                        'observation.state':
                        torch.zeros_like(pred_states[0][idx:idx + 1]) if
                        args.zero_pred_state else pred_states[0][idx:idx + 1],
@@ -827,30 +832,14 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
                                       pred_videos_0,
                                       sample_tag,
                                       fps=args.save_fps)
-                # Save videos environment changes via world-model interaction
-                sample_tag = f"{args.dataset}-vid{sample['videoid']}-wd-fs-{fs}/itr-{itr}"
-                log_to_tensorboard(writer,
-                                   pred_videos_1,
-                                   sample_tag,
-                                   fps=args.save_fps)
-
-                # Save the imagen videos for decision-making
-                if pred_videos_0 is not None:
-                    sample_video_file = f'{video_save_dir}/dm/{fs}/itr-{itr}.mp4'
-                    save_results(pred_videos_0.cpu(),
-                                 sample_video_file,
-                                 fps=args.save_fps)
-                # Save videos environment changes via world-model interaction
-                sample_video_file = f'{video_save_dir}/wm/{fs}/itr-{itr}.mp4'
-                save_results(pred_videos_1.cpu(),
-                             sample_video_file,
-                             fps=args.save_fps)

                print('>' * 24)
-                # Collect the result of world-model interactions
-                wm_video.append(pred_videos_1[:, :, :args.exe_steps].cpu())
+                # Store raw latent for deferred decode
+                wm_latent.append(wm_samples[:, :, :args.exe_steps].cpu())

-            full_video = torch.cat(wm_video, dim=2)
+            # Deferred decode: batch decode all stored latents
+            full_latent = torch.cat(wm_latent, dim=2).to(device)
+            full_video = model.decode_first_stage(full_latent).cpu()
            sample_tag = f"{args.dataset}-vid{sample['videoid']}-wd-fs-{fs}/full"
            log_to_tensorboard(writer,
                               full_video,
--- a/unitree_z1_dual_arm_cleanup_pencils/case1/output.log
+++ b/unitree_z1_dual_arm_cleanup_pencils/case1/output.log
@@ -1,14 +1,14 @@
 /mnt/ASC1637/miniconda3/envs/unifolm-wma-o/lib/python3.10/site-packages/lightning_fabric/__init__.py:29: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
  __import__("pkg_resources").declare_namespace(__name__)
-2026-02-10 17:57:48.047156: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
-2026-02-10 17:57:48.050303: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
-2026-02-10 17:57:48.081710: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
-2026-02-10 17:57:48.081741: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
-2026-02-10 17:57:48.083577: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
-2026-02-10 17:57:48.091772: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
-2026-02-10 17:57:48.092045: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
+2026-02-11 06:58:19.745318: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
+2026-02-11 06:58:19.748691: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
+2026-02-11 06:58:19.782405: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
+2026-02-11 06:58:19.782465: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
+2026-02-11 06:58:19.784464: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
+2026-02-11 06:58:19.793381: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
+2026-02-11 06:58:19.794103: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
-2026-02-10 17:57:48.787960: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
+2026-02-11 06:58:20.607029: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 [rank: 0] Global seed set to 123
 /mnt/ASC1637/miniconda3/envs/unifolm-wma-o/lib/python3.10/site-packages/kornia/feature/lightglue.py:44: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
  @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
@@ -66,8 +66,31 @@ DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096

  0%|          | 0/8 [00:00<?, ?it/s]/mnt/ASC1637/miniconda3/envs/unifolm-wma-o/lib/python3.10/site-packages/torch/nn/functional.py:5501: UserWarning: Attempting to use hipBLASLt on an unsupported architecture! Overriding blas backend to hipblas (Triggered internally at ../aten/src/ATen/Context.cpp:296.)
+  proj = linear(q, w, b)

 12%|█▎        | 1/8 [01:01<07:08, 61.25s/it]
+ 25%|██▌       | 2/8 [01:58<05:53, 58.90s/it]
+ 38%|███▊      | 3/8 [02:55<04:50, 58.14s/it]
+ 50%|█████     | 4/8 [03:52<03:51, 57.79s/it]
+ 62%|██████▎   | 5/8 [04:50<02:52, 57.60s/it]
+ 75%|███████▌  | 6/8 [05:47<01:54, 57.48s/it]
+ 88%|████████▊ | 7/8 [06:44<00:57, 57.41s/it]
+100%|██████████| 8/8 [07:42<00:00, 57.36s/it]
+100%|██████████| 8/8 [07:42<00:00, 57.75s/it]
+>>> Step 0: generating actions ...
+>>> Step 0: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 1: generating actions ...
+>>> Step 1: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 2: generating actions ...
+>>> Step 2: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 3: generating actions ...
+>>> Step 3: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 4: generating actions ...
+>>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
@@ -117,30 +140,7 @@ DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
-DEBUG:PIL.Image:Importing XVThumbImagePlugin
-
- 12%|█▎        | 1/8 [01:03<07:22, 63.25s/it]
- 25%|██▌       | 2/8 [02:02<06:05, 60.93s/it]
- 38%|███▊      | 3/8 [03:01<05:00, 60.19s/it]
- 50%|█████     | 4/8 [04:01<03:59, 59.85s/it]
- 62%|██████▎   | 5/8 [05:00<02:59, 59.69s/it]
- 75%|███████▌  | 6/8 [05:59<01:59, 59.54s/it]
- 88%|████████▊ | 7/8 [06:59<00:59, 59.43s/it]
-100%|██████████| 8/8 [07:58<00:00, 59.46s/it]
-100%|██████████| 8/8 [07:58<00:00, 59.82s/it]
->>>>>>>>>>>>>>>>>>>>>>>>
->>> Step 1: generating actions ...
->>> Step 1: interacting with world model ...
->>>>>>>>>>>>>>>>>>>>>>>>
->>> Step 2: generating actions ...
->>> Step 2: interacting with world model ...
->>>>>>>>>>>>>>>>>>>>>>>>
->>> Step 3: generating actions ...
->>> Step 3: interacting with world model ...
->>>>>>>>>>>>>>>>>>>>>>>>
->>> Step 4: generating actions ...
->>> Step 4: interacting with world model ...
 DEBUG:PIL.Image:Importing SgiImagePlugin
->>> Step 5: generating actions ...
->>> Step 5: interacting with world model ...
->>>>>>>>>>>>>>>>>>>>>>>>
+DEBUG:PIL.Image:Importing SpiderImagePlugin
+DEBUG:PIL.Image:Importing SunImagePlugin
+DEBUG:PIL.Image:Importing TgaImagePlugin
--- a/unitree_z1_dual_arm_cleanup_pencils/case1/psnr_result1.json
+++ b/unitree_z1_dual_arm_cleanup_pencils/case1/psnr_result1.json
@@ -1,5 +1,5 @@
 {
    "gt_video": "/mnt/ASC1637/unifolm-world-model-action/unitree_z1_dual_arm_cleanup_pencils/case1/output/inference/unitree_z1_dual_arm_cleanup_pencils_case1_amd.mp4",
    "pred_video": "/mnt/ASC1637/unifolm-world-model-action/unitree_z1_dual_arm_cleanup_pencils/case1/output/inference/0_full_fs4.mp4",
-    "psnr": 32.442113263955434
+    "psnr": 30.34518638635329
 }