结果

Layer 3: 延迟 decode，只解码 CLIP 需要的 1 帧
- world model 调用 decode_video=False，跳过 16 帧全量 decode
- 只 decode 最后 1 帧给 CLIP embedding / observation queue
- 存 raw latent，循环结束后统一 batch decode 生成最终视频
- 每轮省 15 次 VAE decode，8 轮共省 120 次
- 跳过中间迭代的 wm tensorboard/mp4 保存

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-02-11 07:12:16 +00:00 · 2026-02-11 07:11:55 +00:00 · 2026-02-10 18:15:52 +00:00 · 2026-02-10 13:40:52 +00:00 · 2026-02-10 10:47:10 +00:00 · 2026-02-10 07:02:20 +00:00
11 changed files with 202 additions and 106 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -129,4 +129,5 @@ Experiment/checkpoint
 Experiment/log

 *.ckpt
-*.0
+*.0
+unitree_z1_dual_arm_cleanup_pencils/case1/profile_output/traces/wx-ms-w7900d-0032_742306.1770698186047591119.pt.trace.json
--- a/scripts/evaluation/world_model_interaction.py
+++ b/scripts/evaluation/world_model_interaction.py
@@ -559,6 +559,7 @@ def image_guided_synthesis_sim_mode(
        autocast_ctx = nullcontext()

    batch_variants = None
+    samples = None
    if ddim_sampler is not None:
        with autocast_ctx:
            samples, actions, states, intermedia = ddim_sampler.sample(
@@ -583,7 +584,7 @@ def image_guided_synthesis_sim_mode(
            batch_images = model.decode_first_stage(samples)
            batch_variants = batch_images

-    return batch_variants, actions, states
+    return batch_variants, actions, states, samples


 def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
@@ -625,6 +626,12 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
    # Compile hot ResBlocks for operator fusion
    apply_torch_compile(model)

+    # Fuse KV projections in attention layers (to_k + to_v → to_kv)
+    from unifolm_wma.modules.attention import CrossAttention
+    kv_count = sum(1 for m in model.modules()
+                   if isinstance(m, CrossAttention) and m.fuse_kv())
+    print(f"    ✓ KV fused: {kv_count} attention layers")
+
    # Export precision-converted checkpoint if requested
    if args.export_precision_ckpt:
        export_path = args.export_precision_ckpt
@@ -687,7 +694,7 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
            sample_save_dir = f'{video_save_dir}/wm/{fs}'
            os.makedirs(sample_save_dir, exist_ok=True)
            # For collecting interaction videos
-            wm_video = []
+            wm_latent = []
            # Initialize observation queues
            cond_obs_queues = {
                "observation.images.top":
@@ -743,7 +750,7 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:

                # Use world-model in policy to generate action
                print(f'>>> Step {itr}: generating actions ...')
-                pred_videos_0, pred_actions, _ = image_guided_synthesis_sim_mode(
+                pred_videos_0, pred_actions, _, _ = image_guided_synthesis_sim_mode(
                    model,
                    sample['instruction'],
                    observation,
@@ -785,7 +792,7 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:

                # Interaction with the world-model
                print(f'>>> Step {itr}: interacting with world model ...')
-                pred_videos_1, _, pred_states = image_guided_synthesis_sim_mode(
+                pred_videos_1, _, pred_states, wm_samples = image_guided_synthesis_sim_mode(
                    model,
                    "",
                    observation,
@@ -798,12 +805,16 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
                    fs=model_input_fs,
                    text_input=False,
                    timestep_spacing=args.timestep_spacing,
-                    guidance_rescale=args.guidance_rescale)
+                    guidance_rescale=args.guidance_rescale,
+                    decode_video=False)
+
+                # Decode only the last frame for CLIP embedding in next iteration
+                last_frame_pixel = model.decode_first_stage(wm_samples[:, :, -1:, :, :])

                for idx in range(args.exe_steps):
                    observation = {
                        'observation.images.top':
-                        pred_videos_1[0][:, idx:idx + 1].permute(1, 0, 2, 3),
+                        last_frame_pixel[0, :, 0:1].permute(1, 0, 2, 3),
                        'observation.state':
                        torch.zeros_like(pred_states[0][idx:idx + 1]) if
                        args.zero_pred_state else pred_states[0][idx:idx + 1],
@@ -821,30 +832,14 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
                                       pred_videos_0,
                                       sample_tag,
                                       fps=args.save_fps)
-                # Save videos environment changes via world-model interaction
-                sample_tag = f"{args.dataset}-vid{sample['videoid']}-wd-fs-{fs}/itr-{itr}"
-                log_to_tensorboard(writer,
-                                   pred_videos_1,
-                                   sample_tag,
-                                   fps=args.save_fps)
-
-                # Save the imagen videos for decision-making
-                if pred_videos_0 is not None:
-                    sample_video_file = f'{video_save_dir}/dm/{fs}/itr-{itr}.mp4'
-                    save_results(pred_videos_0.cpu(),
-                                 sample_video_file,
-                                 fps=args.save_fps)
-                # Save videos environment changes via world-model interaction
-                sample_video_file = f'{video_save_dir}/wm/{fs}/itr-{itr}.mp4'
-                save_results(pred_videos_1.cpu(),
-                             sample_video_file,
-                             fps=args.save_fps)

                print('>' * 24)
-                # Collect the result of world-model interactions
-                wm_video.append(pred_videos_1[:, :, :args.exe_steps].cpu())
+                # Store raw latent for deferred decode
+                wm_latent.append(wm_samples[:, :, :args.exe_steps].cpu())

-            full_video = torch.cat(wm_video, dim=2)
+            # Deferred decode: batch decode all stored latents
+            full_latent = torch.cat(wm_latent, dim=2).to(device)
+            full_video = model.decode_first_stage(full_latent).cpu()
            sample_tag = f"{args.dataset}-vid{sample['videoid']}-wd-fs-{fs}/full"
            log_to_tensorboard(writer,
                               full_video,
--- a/src/unifolm_wma/models/diffusion_head/conditional_unet1d.py
+++ b/src/unifolm_wma/models/diffusion_head/conditional_unet1d.py
@@ -567,6 +567,11 @@ class ConditionalUnet1D(nn.Module):
        # Broadcast to batch dimension in a way that's compatible with ONNX/Core ML
        timesteps = timesteps.expand(sample.shape[0])
        global_feature = self.diffusion_step_encoder(timesteps)
+        # Pre-expand global_feature once (reused in every down/mid/up block)
+        if self.use_linear_act_proj:
+            global_feature_expanded = global_feature.unsqueeze(1).expand(-1, T, -1)
+        else:
+            global_feature_expanded = global_feature.unsqueeze(1).expand(-1, 2, -1)
        (imagen_cond_down, imagen_cond_mid, imagen_cond_up
         ) = imagen_cond[0:4], imagen_cond[4], imagen_cond[5:]  #NOTE HAND CODE

@@ -603,15 +608,11 @@ class ConditionalUnet1D(nn.Module):

            if self.use_linear_act_proj:
                imagen_cond = imagen_cond.reshape(B, T, -1)
-                cur_global_feature = global_feature.unsqueeze(
-                    1).repeat_interleave(repeats=T, dim=1)
            else:
                imagen_cond = imagen_cond.permute(0, 3, 1, 2)
                imagen_cond = imagen_cond.reshape(B, 2, -1)
-                cur_global_feature = global_feature.unsqueeze(
-                    1).repeat_interleave(repeats=2, dim=1)
            cur_global_feature = torch.cat(
-                [cur_global_feature, global_cond, imagen_cond], axis=-1)
+                [global_feature_expanded, global_cond, imagen_cond], axis=-1)
            x = resnet(x, cur_global_feature)
            x = resnet2(x, cur_global_feature)
            h.append(x)
@@ -638,15 +639,11 @@ class ConditionalUnet1D(nn.Module):
        imagen_cond = rearrange(imagen_cond, '(b t) c d -> b t c d', b=B)
        if self.use_linear_act_proj:
            imagen_cond = imagen_cond.reshape(B, T, -1)
-            cur_global_feature = global_feature.unsqueeze(1).repeat_interleave(
-                repeats=T, dim=1)
        else:
            imagen_cond = imagen_cond.permute(0, 3, 1, 2)
            imagen_cond = imagen_cond.reshape(B, 2, -1)
-            cur_global_feature = global_feature.unsqueeze(1).repeat_interleave(
-                repeats=2, dim=1)
        cur_global_feature = torch.cat(
-            [cur_global_feature, global_cond, imagen_cond], axis=-1)
+            [global_feature_expanded, global_cond, imagen_cond], axis=-1)
        x = resnet(x, cur_global_feature)
        x = resnet2(x, cur_global_feature)

@@ -683,16 +680,12 @@ class ConditionalUnet1D(nn.Module):

            if self.use_linear_act_proj:
                imagen_cond = imagen_cond.reshape(B, T, -1)
-                cur_global_feature = global_feature.unsqueeze(
-                    1).repeat_interleave(repeats=T, dim=1)
            else:
                imagen_cond = imagen_cond.permute(0, 3, 1, 2)
                imagen_cond = imagen_cond.reshape(B, 2, -1)
-                cur_global_feature = global_feature.unsqueeze(
-                    1).repeat_interleave(repeats=2, dim=1)

            cur_global_feature = torch.cat(
-                [cur_global_feature, global_cond, imagen_cond], axis=-1)
+                [global_feature_expanded, global_cond, imagen_cond], axis=-1)

            x = torch.cat((x, h.pop()), dim=1)
            x = resnet(x, cur_global_feature)
--- a/src/unifolm_wma/models/samplers/ddim.py
+++ b/src/unifolm_wma/models/samplers/ddim.py
@@ -251,6 +251,13 @@ class DDIMSampler(object):
        dp_ddim_scheduler_action.set_timesteps(len(timesteps))
        dp_ddim_scheduler_state.set_timesteps(len(timesteps))
        ts = torch.empty((b, ), device=device, dtype=torch.long)
+        noise_buf = torch.empty_like(img)
+        # Pre-convert schedule arrays to inference dtype (avoid per-step .to())
+        _dtype = img.dtype
+        _alphas = (self.model.alphas_cumprod if ddim_use_original_steps else self.ddim_alphas).to(_dtype)
+        _alphas_prev = (self.model.alphas_cumprod_prev if ddim_use_original_steps else self.ddim_alphas_prev).to(_dtype)
+        _sqrt_one_minus = (self.model.sqrt_one_minus_alphas_cumprod if ddim_use_original_steps else self.ddim_sqrt_one_minus_alphas).to(_dtype)
+        _sigmas = (self.ddim_sigmas_for_original_num_steps if ddim_use_original_steps else self.ddim_sigmas).to(_dtype)
        enable_cross_attn_kv_cache(self.model)
        enable_ctx_cache(self.model)
        try:
@@ -286,6 +293,8 @@ class DDIMSampler(object):
                    x0=x0,
                    fs=fs,
                    guidance_rescale=guidance_rescale,
+                    noise_buf=noise_buf,
+                    schedule_arrays=(_alphas, _alphas_prev, _sqrt_one_minus, _sigmas),
                    **kwargs)

                img, pred_x0, model_output_action, model_output_state = outs
@@ -339,6 +348,8 @@ class DDIMSampler(object):
                      mask=None,
                      x0=None,
                      guidance_rescale=0.0,
+                      noise_buf=None,
+                      schedule_arrays=None,
                      **kwargs):
        b, *_, device = *x.shape, x.device

@@ -384,16 +395,18 @@ class DDIMSampler(object):
            e_t = score_corrector.modify_score(self.model, e_t, x, t, c,
                                               **corrector_kwargs)

-        alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
-        alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
-        sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
-        sigmas = self.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
+        if schedule_arrays is not None:
+            alphas, alphas_prev, sqrt_one_minus_alphas, sigmas = schedule_arrays
+        else:
+            alphas = (self.model.alphas_cumprod if use_original_steps else self.ddim_alphas).to(x.dtype)
+            alphas_prev = (self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev).to(x.dtype)
+            sqrt_one_minus_alphas = (self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas).to(x.dtype)
+            sigmas = (self.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas).to(x.dtype)

-        # Use 0-d tensors directly (already on device); broadcasting handles shape
-        a_t = alphas[index].to(x.dtype)
-        a_prev = alphas_prev[index].to(x.dtype)
-        sigma_t = sigmas[index].to(x.dtype)
-        sqrt_one_minus_at = sqrt_one_minus_alphas[index].to(x.dtype)
+        a_t = alphas[index]
+        a_prev = alphas_prev[index]
+        sigma_t = sigmas[index]
+        sqrt_one_minus_at = sqrt_one_minus_alphas[index]

        if self.model.parameterization != "v":
            pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
@@ -411,8 +424,12 @@ class DDIMSampler(object):

        dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t

-        noise = sigma_t * noise_like(x.shape, device,
-                                     repeat_noise) * temperature
+        if noise_buf is not None:
+            noise_buf.normal_()
+            noise = sigma_t * noise_buf * temperature
+        else:
+            noise = sigma_t * noise_like(x.shape, device,
+                                         repeat_noise) * temperature
        if noise_dropout > 0.:
            noise = torch.nn.functional.dropout(noise, p=noise_dropout)

--- a/src/unifolm_wma/modules/attention.py
+++ b/src/unifolm_wma/modules/attention.py
@@ -99,6 +99,7 @@ class CrossAttention(nn.Module):
        self.agent_action_context_len = agent_action_context_len
        self._kv_cache = {}
        self._kv_cache_enabled = False
+        self._kv_fused = False

        self.cross_attention_scale_learnable = cross_attention_scale_learnable
        if self.image_cross_attention:
@@ -116,6 +117,27 @@ class CrossAttention(nn.Module):
                self.register_parameter('alpha_caa',
                                        nn.Parameter(torch.tensor(0.)))

+    def fuse_kv(self):
+        """Fuse to_k/to_v into to_kv (2 Linear → 1). Works for all layers."""
+        k_w = self.to_k.weight  # (inner_dim, context_dim)
+        v_w = self.to_v.weight
+        self.to_kv = nn.Linear(k_w.shape[1], k_w.shape[0] * 2, bias=False)
+        self.to_kv.weight = nn.Parameter(torch.cat([k_w, v_w], dim=0))
+        del self.to_k, self.to_v
+        if self.image_cross_attention:
+            for suffix in ('_ip', '_as', '_aa'):
+                k_attr = f'to_k{suffix}'
+                v_attr = f'to_v{suffix}'
+                kw = getattr(self, k_attr).weight
+                vw = getattr(self, v_attr).weight
+                fused = nn.Linear(kw.shape[1], kw.shape[0] * 2, bias=False)
+                fused.weight = nn.Parameter(torch.cat([kw, vw], dim=0))
+                setattr(self, f'to_kv{suffix}', fused)
+                delattr(self, k_attr)
+                delattr(self, v_attr)
+        self._kv_fused = True
+        return True
+
    def forward(self, x, context=None, mask=None):
        spatial_self_attn = (context is None)
        k_ip, v_ip, out_ip = None, None, None
@@ -276,14 +298,20 @@ class CrossAttention(nn.Module):
                                    self.agent_action_context_len +
                                    self.text_context_len:, :]

-            k = self.to_k(context_ins)
-            v = self.to_v(context_ins)
-            k_ip = self.to_k_ip(context_image)
-            v_ip = self.to_v_ip(context_image)
-            k_as = self.to_k_as(context_agent_state)
-            v_as = self.to_v_as(context_agent_state)
-            k_aa = self.to_k_aa(context_agent_action)
-            v_aa = self.to_v_aa(context_agent_action)
+            if self._kv_fused:
+                k, v = self.to_kv(context_ins).chunk(2, dim=-1)
+                k_ip, v_ip = self.to_kv_ip(context_image).chunk(2, dim=-1)
+                k_as, v_as = self.to_kv_as(context_agent_state).chunk(2, dim=-1)
+                k_aa, v_aa = self.to_kv_aa(context_agent_action).chunk(2, dim=-1)
+            else:
+                k = self.to_k(context_ins)
+                v = self.to_v(context_ins)
+                k_ip = self.to_k_ip(context_image)
+                v_ip = self.to_v_ip(context_image)
+                k_as = self.to_k_as(context_agent_state)
+                v_as = self.to_v_as(context_agent_state)
+                k_aa = self.to_k_aa(context_agent_action)
+                v_aa = self.to_v_aa(context_agent_action)

            q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h),
                          (q, k, v))
@@ -304,8 +332,11 @@ class CrossAttention(nn.Module):
        else:
            if not spatial_self_attn:
                context = context[:, :self.text_context_len, :]
-            k = self.to_k(context)
-            v = self.to_v(context)
+            if self._kv_fused:
+                k, v = self.to_kv(context).chunk(2, dim=-1)
+            else:
+                k = self.to_k(context)
+                v = self.to_v(context)

            q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h),
                          (q, k, v))
--- a/src/unifolm_wma/modules/networks/wma_model.py
+++ b/src/unifolm_wma/modules/networks/wma_model.py
@@ -690,6 +690,8 @@ class WMAModel(nn.Module):
        self._ctx_cache = {}
        # fs_embed cache
        self._fs_embed_cache = None
+        # Pre-created CUDA stream for parallel action/state UNet
+        self._side_stream = torch.cuda.Stream() if not self.base_model_gen_only else None

    def forward(self,
                x: Tensor,
@@ -848,15 +850,16 @@ class WMAModel(nn.Module):

        if not self.base_model_gen_only:
            ba, _, _ = x_action.shape
+            ts_state = timesteps[:ba] if b > 1 else timesteps
+            # Run action_unet and state_unet in parallel via pre-created CUDA stream
+            s_stream = self._side_stream
+            s_stream.wait_stream(torch.cuda.current_stream())
+            with torch.cuda.stream(s_stream):
+                s_y = self.state_unet(x_state, ts_state, hs_a,
+                                      context_action[:2], **kwargs)
            a_y = self.action_unet(x_action, timesteps[:ba], hs_a,
                                   context_action[:2], **kwargs)
-            # Predict state
-            if b > 1:
-                s_y = self.state_unet(x_state, timesteps[:ba], hs_a,
-                                      context_action[:2], **kwargs)
-            else:
-                s_y = self.state_unet(x_state, timesteps, hs_a,
-                                      context_action[:2], **kwargs)
+            torch.cuda.current_stream().wait_stream(s_stream)
        else:
            a_y = torch.zeros_like(x_action)
            s_y = torch.zeros_like(x_state)
--- a/unitree_z1_dual_arm_cleanup_pencils/case1/output.log
+++ b/unitree_z1_dual_arm_cleanup_pencils/case1/output.log
@@ -1,14 +1,14 @@
 /mnt/ASC1637/miniconda3/envs/unifolm-wma-o/lib/python3.10/site-packages/lightning_fabric/__init__.py:29: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
  __import__("pkg_resources").declare_namespace(__name__)
-2026-02-09 18:39:50.119842: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
-2026-02-09 18:39:50.123128: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
-2026-02-09 18:39:50.156652: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
-2026-02-09 18:39:50.156708: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
-2026-02-09 18:39:50.158926: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
-2026-02-09 18:39:50.167779: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
-2026-02-09 18:39:50.168073: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
+2026-02-11 06:58:19.745318: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
+2026-02-11 06:58:19.748691: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
+2026-02-11 06:58:19.782405: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
+2026-02-11 06:58:19.782465: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
+2026-02-11 06:58:19.784464: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
+2026-02-11 06:58:19.793381: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
+2026-02-11 06:58:19.794103: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
 To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
-2026-02-09 18:39:50.915144: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
+2026-02-11 06:58:20.607029: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
 [rank: 0] Global seed set to 123
 /mnt/ASC1637/miniconda3/envs/unifolm-wma-o/lib/python3.10/site-packages/kornia/feature/lightglue.py:44: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
  @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
@@ -41,6 +41,7 @@ INFO:root:Loading pretrained ViT-H-14 weights (laion2b_s32b_b79k).
    ⚠ Found 601 fp32 params, converting to bf16
    ✓ All parameters converted to bfloat16
    ✓ torch.compile: 3 ResBlocks in output_blocks[5, 8, 9]
+    ✓ KV fused: 66 attention layers
 INFO:root:***** Configing Data *****
 >>> unitree_z1_stackbox: 1 data samples loaded.
 >>> unitree_z1_stackbox: data stats loaded.
@@ -65,8 +66,31 @@ DEBUG:PIL.PngImagePlugin:STREAM b'pHYs' 41 9
 DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 62 4096

  0%|          | 0/8 [00:00<?, ?it/s]/mnt/ASC1637/miniconda3/envs/unifolm-wma-o/lib/python3.10/site-packages/torch/nn/functional.py:5501: UserWarning: Attempting to use hipBLASLt on an unsupported architecture! Overriding blas backend to hipblas (Triggered internally at ../aten/src/ATen/Context.cpp:296.)
+  proj = linear(q, w, b)

 12%|█▎        | 1/8 [01:01<07:08, 61.25s/it]
+ 25%|██▌       | 2/8 [01:58<05:53, 58.90s/it]
+ 38%|███▊      | 3/8 [02:55<04:50, 58.14s/it]
+ 50%|█████     | 4/8 [03:52<03:51, 57.79s/it]
+ 62%|██████▎   | 5/8 [04:50<02:52, 57.60s/it]
+ 75%|███████▌  | 6/8 [05:47<01:54, 57.48s/it]
+ 88%|████████▊ | 7/8 [06:44<00:57, 57.41s/it]
+100%|██████████| 8/8 [07:42<00:00, 57.36s/it]
+100%|██████████| 8/8 [07:42<00:00, 57.75s/it]
+>>> Step 0: generating actions ...
+>>> Step 0: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 1: generating actions ...
+>>> Step 1: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 2: generating actions ...
+>>> Step 2: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 3: generating actions ...
+>>> Step 3: interacting with world model ...
+>>>>>>>>>>>>>>>>>>>>>>>>
+>>> Step 4: generating actions ...
+>>> Step 4: interacting with world model ...
 >>>>>>>>>>>>>>>>>>>>>>>>
 >>> Step 5: generating actions ...
 >>> Step 5: interacting with world model ...
@@ -116,30 +140,7 @@ DEBUG:PIL.Image:Importing WmfImagePlugin
 DEBUG:PIL.Image:Importing PpmImagePlugin
 DEBUG:PIL.Image:Importing PsdImagePlugin
 DEBUG:PIL.Image:Importing QoiImagePlugin
-DEBUG:PIL.Image:Importing XVThumbImagePlugin
-
- 12%|█▎        | 1/8 [01:08<07:58, 68.38s/it]
- 25%|██▌       | 2/8 [02:13<06:38, 66.48s/it]
- 38%|███▊      | 3/8 [03:18<05:29, 65.83s/it]
- 50%|█████     | 4/8 [04:23<04:22, 65.52s/it]
- 62%|██████▎   | 5/8 [05:28<03:15, 65.33s/it]
- 75%|███████▌  | 6/8 [06:33<02:10, 65.23s/it]
- 88%|████████▊ | 7/8 [07:38<01:05, 65.12s/it]
-100%|██████████| 8/8 [08:43<00:00, 65.08s/it]
-100%|██████████| 8/8 [08:43<00:00, 65.44s/it]
->>>>>>>>>>>>>>>>>>>>>>>>
->>> Step 1: generating actions ...
->>> Step 1: interacting with world model ...
->>>>>>>>>>>>>>>>>>>>>>>>
->>> Step 2: generating actions ...
->>> Step 2: interacting with world model ...
->>>>>>>>>>>>>>>>>>>>>>>>
->>> Step 3: generating actions ...
->>> Step 3: interacting with world model ...
->>>>>>>>>>>>>>>>>>>>>>>>
->>> Step 4: generating actions ...
->>> Step 4: interacting with world model ...
 DEBUG:PIL.Image:Importing SgiImagePlugin
->>> Step 5: generating actions ...
->>> Step 5: interacting with world model ...
->>>>>>>>>>>>>>>>>>>>>>>>
+DEBUG:PIL.Image:Importing SpiderImagePlugin
+DEBUG:PIL.Image:Importing SunImagePlugin
+DEBUG:PIL.Image:Importing TgaImagePlugin
--- a/unitree_z1_dual_arm_cleanup_pencils/case1/profile_output/baseline.csv
+++ b/unitree_z1_dual_arm_cleanup_pencils/case1/profile_output/baseline.csv
@@ -0,0 +1,5 @@
+itr,stack_to_device_1,policy/ddim_sampler_init,policy/image_embedding,policy/vae_encode,policy/text_conditioning,policy/projectors,policy/cond_assembly,policy/ddim_sampling,policy/vae_decode,synth_policy,update_action_queue,stack_to_device_2,wm/ddim_sampler_init,wm/image_embedding,wm/vae_encode,wm/text_conditioning,wm/projectors,wm/cond_assembly,wm/ddim_sampling,wm/vae_decode,synth_world_model,update_obs_queue,tensorboard_log,save_results,cpu_transfer,itr_total
+0,0.16,0.08,20.98,49.56,14.51,0.29,0.07,31005.48,0.00,31094.51,0.39,0.13,0.09,20.62,48.76,14.17,0.28,0.07,31011.17,775.40,31875.87,0.61,0.31,97.28,7.19,63077.50
+1,0.16,0.09,20.97,49.63,14.52,0.30,0.07,31035.49,0.00,31125.16,0.54,0.17,0.14,21.46,49.26,14.88,0.49,0.12,31047.54,777.56,31918.60,0.75,0.60,109.89,6.21,63163.18
+2,0.18,0.10,21.44,49.71,15.05,0.34,0.07,31047.64,0.00,31138.56,0.58,0.16,0.13,21.03,48.74,14.69,0.32,0.08,31036.47,776.96,31905.96,0.67,0.39,116.96,7.43,63171.90
+3,0.18,0.10,21.38,49.47,15.02,0.35,0.08,31041.05,0.00,31132.03,0.48,0.16,0.12,20.81,49.34,14.41,0.47,0.11,31051.98,777.11,31920.42,0.64,0.38,121.67,7.29,63184.26
--- a/unitree_z1_dual_arm_cleanup_pencils/case1/profile_output/baseline_summary.csv
+++ b/unitree_z1_dual_arm_cleanup_pencils/case1/profile_output/baseline_summary.csv
@@ -0,0 +1,5 @@
+stat,stack_to_device_1,policy/ddim_sampler_init,policy/image_embedding,policy/vae_encode,policy/text_conditioning,policy/projectors,policy/cond_assembly,policy/ddim_sampling,policy/vae_decode,synth_policy,update_action_queue,stack_to_device_2,wm/ddim_sampler_init,wm/image_embedding,wm/vae_encode,wm/text_conditioning,wm/projectors,wm/cond_assembly,wm/ddim_sampling,wm/vae_decode,synth_world_model,update_obs_queue,tensorboard_log,save_results,cpu_transfer,itr_total
+mean,0.17,0.09,21.19,49.59,14.78,0.32,0.07,31032.42,0.00,31122.56,0.49,0.15,0.12,20.98,49.03,14.53,0.39,0.10,31036.79,776.76,31905.21,0.67,0.42,111.45,7.03,63149.21
+std,0.01,0.01,0.22,0.09,0.26,0.03,0.00,16.13,0.00,16.88,0.07,0.01,0.02,0.31,0.28,0.27,0.09,0.02,15.83,0.82,17.84,0.05,0.11,9.19,0.48,42.08
+min,0.16,0.08,20.97,49.47,14.51,0.29,0.07,31005.48,0.00,31094.51,0.39,0.13,0.09,20.62,48.74,14.17,0.28,0.07,31011.17,775.40,31875.87,0.61,0.31,97.28,6.21,63077.50
+max,0.18,0.10,21.44,49.71,15.05,0.35,0.08,31047.64,0.00,31138.56,0.58,0.17,0.14,21.46,49.34,14.88,0.49,0.12,31051.98,777.56,31920.42,0.75,0.60,121.67,7.43,63184.26
--- a/unitree_z1_dual_arm_cleanup_pencils/case1/profile_output/profile.log
+++ b/unitree_z1_dual_arm_cleanup_pencils/case1/profile_output/profile.log
@@ -0,0 +1,45 @@
+/mnt/ASC1637/miniconda3/envs/unifolm-wma-o/lib/python3.10/site-packages/lightning_fabric/__init__.py:29: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
+  __import__("pkg_resources").declare_namespace(__name__)
+[rank: 0] Global seed set to 123
+/mnt/ASC1637/miniconda3/envs/unifolm-wma-o/lib/python3.10/site-packages/kornia/feature/lightglue.py:44: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
+  @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
+/mnt/ASC1637/miniconda3/envs/unifolm-wma-o/lib/python3.10/site-packages/open_clip/factory.py:88: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+  checkpoint = torch.load(checkpoint_path, map_location=map_location)
+/mnt/ASC1637/unifolm-world-model-action/scripts/evaluation/profile_iteration.py:168: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+  state_dict = torch.load(args.ckpt_path, map_location="cpu")
+============================================================
+PROFILE ITERATION — Loading model...
+============================================================
+AE working on z of shape (1, 4, 32, 32) = 4096 dimensions.
+    torch.compile: 3 ResBlocks in output_blocks[5, 8, 9]
+>>> Model loaded and ready.
+>>> Noise shape: [1, 4, 16, 40, 64]
+>>> DDIM steps: 50
+>>> fast_policy_no_decode: True
+============================================================
+LAYER 1: ITERATION-LEVEL PROFILING
+============================================================
+>>> unitree_z1_stackbox: 1 data samples loaded.
+>>> unitree_z1_stackbox: data stats loaded.
+>>> unitree_z1_stackbox: normalizer initiated.
+>>> unitree_z1_dual_arm_stackbox: 1 data samples loaded.
+>>> unitree_z1_dual_arm_stackbox: data stats loaded.
+>>> unitree_z1_dual_arm_stackbox: normalizer initiated.
+>>> unitree_z1_dual_arm_stackbox_v2: 1 data samples loaded.
+>>> unitree_z1_dual_arm_stackbox_v2: data stats loaded.
+>>> unitree_z1_dual_arm_stackbox_v2: normalizer initiated.
+>>> unitree_z1_dual_arm_cleanup_pencils: 1 data samples loaded.
+>>> unitree_z1_dual_arm_cleanup_pencils: data stats loaded.
+>>> unitree_z1_dual_arm_cleanup_pencils: normalizer initiated.
+>>> unitree_g1_pack_camera: 1 data samples loaded.
+>>> unitree_g1_pack_camera: data stats loaded.
+>>> unitree_g1_pack_camera: normalizer initiated.
+>>> Running 5 profiled iterations ...
+Traceback (most recent call last):
+  File "/mnt/ASC1637/unifolm-world-model-action/scripts/evaluation/profile_iteration.py", line 981, in <module>
+    main()
+  File "/mnt/ASC1637/unifolm-world-model-action/scripts/evaluation/profile_iteration.py", line 967, in main
+    all_records = run_profiled_iterations(
+  File "/mnt/ASC1637/unifolm-world-model-action/scripts/evaluation/profile_iteration.py", line 502, in run_profiled_iterations
+    sampler_type=args.sampler_type)
+AttributeError: 'Namespace' object has no attribute 'sampler_type'
--- a/unitree_z1_dual_arm_cleanup_pencils/case1/psnr_result1.json
+++ b/unitree_z1_dual_arm_cleanup_pencils/case1/psnr_result1.json
@@ -1,5 +1,5 @@
 {
    "gt_video": "/mnt/ASC1637/unifolm-world-model-action/unitree_z1_dual_arm_cleanup_pencils/case1/output/inference/unitree_z1_dual_arm_cleanup_pencils_case1_amd.mp4",
    "pred_video": "/mnt/ASC1637/unifolm-world-model-action/unitree_z1_dual_arm_cleanup_pencils/case1/output/inference/0_full_fs4.mp4",
-    "psnr": 31.802224855380352
+    "psnr": 30.34518638635329
 }
Author	SHA1	Message	Date
olivame	25c6a328ef	结果	2026-02-11 07:12:16 +00:00
olivame	1d23e5d36d	Layer 3: 延迟 decode，只解码 CLIP 需要的 1 帧 - world model 调用 decode_video=False，跳过 16 帧全量 decode - 只 decode 最后 1 帧给 CLIP embedding / observation queue - 存 raw latent，循环结束后统一 batch decode 生成最终视频 - 每轮省 15 次 VAE decode，8 轮共省 120 次 - 跳过中间迭代的 wm tensorboard/mp4 保存 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-02-11 07:11:55 +00:00
olivame	57ba85d147	KV 融合实现完成。改动总结：速度微弱提升psnr略微上升 attention.py — 3处改动： 1. __init__ 添加 _kv_fused = False 标志 2.新增 fuse_kv() 方法：将 to_k + to_v → to_kv，同时处理 _ip/_as/_aa 辅助 KV 对 2. bmm_forward 两个分支加_kv_fused 判断，用to_kv().chunk(2, dim=-1) 替代分别调用	2026-02-10 18:15:52 +00:00
olivame	2cef3e9e45	├─────┼─────────────────────────────────┼───────────────────────┼───────────────────┤ │ 1 │ CUDA Stream 预创建 │ wma_model.py │ 50次 → 0次 │ ├─────┼─────────────────────────────────┼───────────────────────┼───────────────────┤ │ 2 │ noise buffer 预分配 │ ddim.py │ 50次 alloc → 0次 │ ├─────┼─────────────────────────────────┼───────────────────────┼───────────────────┤ │ 3 │ global_feature expand提到循环外 │ conditional_unet1d.py │ ~700次 → ~100次 │ ├─────┼─────────────────────────────────┼───────────────────────┼───────────────────┤ │ 4 │ alpha/sigma dtype 预转换 │ ddim.py │ 200次 .to() → 0次 │ 效果不算特别明显	2026-02-10 13:40:52 +00:00
olivame	a09d35ae5b	- state_unet 放到一个独立的 CUDA stream 上执行 - action_unet 在默认 stream 上同时执行 - 用 wait_stream 确保两者都完成后再返回两个 1D UNet 输入完全独立，共享的 hs_a 和 context_action 都是只读的。GPU 利用率只有 ~31%，小张量 kernel 不会打满 GPU，两个 stream 可以真正并行。	2026-02-10 10:47:10 +00:00
olivame	db848bca01	profile 结果	2026-02-10 07:02:20 +00:00