轻量投影/MLP BF16

PSNR 指标反而比只量化扩散主干模型时更低，原因不明。
2026-01-18 18:26:37 +08:00
parent 2b634cde90
commit 3c0f409fcf
6 changed files with 96 additions and 48 deletions
--- a/src/unifolm_wma/models/ddpms.py
+++ b/src/unifolm_wma/models/ddpms.py
@@ -1882,6 +1882,7 @@ class LatentVisualDiffusion(LatentDiffusion):
                 dp_use_ema: bool = False,
                 pretrained_checkpoint: str | None = None,
                 decision_making_only: bool = True,
+                 projector_bf16: bool = True,
                 *args,
                 **kwargs):
        """
@@ -1907,6 +1908,7 @@ class LatentVisualDiffusion(LatentDiffusion):
            dp_use_ema: If True, maintain EMA for action UNet head.
            pretrained_checkpoint: Optional path to a pretrained checkpoint.
            decision_making_only: If True, use decision-only augmentation path.
+            projector_bf16: If True, run image/state/action projectors under BF16 autocast.
        """

        super().__init__(*args, **kwargs)
@@ -1917,6 +1919,7 @@ class LatentVisualDiffusion(LatentDiffusion):
        self.n_obs_steps_imagen = n_obs_steps_imagen
        self.n_obs_steps_acting = n_obs_steps_acting
        self.decision_making_only = decision_making_only
+        self.projector_bf16 = projector_bf16

        self._init_embedder(img_cond_stage_config, freeze_embedder)
        self._init_img_ctx_projector(image_proj_stage_config,
@@ -2025,6 +2028,28 @@ class LatentVisualDiffusion(LatentDiffusion):
        self.agent_state_pos_emb = nn.Parameter(
            torch.randn(1, self.n_obs_steps_imagen, self.global_emb_dim))

+    def _projector_forward(self, projector: nn.Module, x: Tensor,
+                           target_dtype: torch.dtype | None) -> Tensor:
+        """Run ``projector`` on ``x``, optionally under CUDA BF16 autocast.
+
+        Args:
+            projector: Module invoked as ``projector(x)``.
+            x: Input tensor; its device type is checked to decide whether
+                autocast is entered.
+            target_dtype: Dtype the result is cast to when it differs from
+                the projector's output dtype. ``None`` skips the cast.
+
+        Returns:
+            The projector output, cast to ``target_dtype`` when one is given.
+        """
+        # Autocast is entered only when all three hold: the instance flag is
+        # set, the input is on a CUDA device, and torch reports BF16 support.
+        use_bf16 = (self.projector_bf16 and x.device.type == "cuda"
+                    and torch.cuda.is_bf16_supported())
+        if use_bf16:
+            with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
+                out = projector(x)
+        else:
+            # NOTE(review): this branch does not open an autocast context, and
+            # it also does not disable one the caller may already have active.
+            out = projector(x)
+        # One-time diagnostic: printed on the first call for this instance
+        # only; the `_printed_projector_bf16` attribute marks it as done.
+        # `output=` reports the dtype before the cast to `target_dtype` below.
+        if not hasattr(self, "_printed_projector_bf16"):
+            print(
+                ">>> projector bf16 autocast: "
+                f"enabled={self.projector_bf16} "
+                f"use_bf16={use_bf16} "
+                f"input={x.dtype} "
+                f"output={out.dtype} "
+                f"target={target_dtype}")
+            self._printed_projector_bf16 = True
+        # Cast back so callers receive the dtype they asked for, regardless of
+        # what dtype the projector produced.
+        if target_dtype is not None and out.dtype != target_dtype:
+            out = out.to(dtype=target_dtype)
+        return out
+
+
    def _get_augmented_batch(
            self,
            z: Tensor,
@@ -2166,6 +2191,7 @@ class LatentVisualDiffusion(LatentDiffusion):
        null_prompt = self.get_learned_conditioning([""])
        cond_ins_emb = torch.where(prompt_mask, null_prompt,
                                   cond_ins_emb.detach())
+        target_dtype = cond_ins_emb.dtype

        # Get conditioning frames
        cond_frame_index = 0
@@ -2176,7 +2202,8 @@ class LatentVisualDiffusion(LatentDiffusion):

        cond_img = input_mask * img
        cond_img_emb = self.embedder(cond_img)
-        cond_img_emb = self.image_proj_model(cond_img_emb)
+        cond_img_emb = self._projector_forward(self.image_proj_model,
+                                               cond_img_emb, target_dtype)

        if self.model.conditioning_key == 'hybrid':
            if self.interp_mode:
@@ -2191,11 +2218,15 @@ class LatentVisualDiffusion(LatentDiffusion):
                                      repeat=z.shape[2])
            cond["c_concat"] = [img_cat_cond]

-        cond_action = self.action_projector(action)
-        cond_action_emb = self.agent_action_pos_emb + cond_action
+        cond_action = self._projector_forward(self.action_projector, action,
+                                              target_dtype)
+        cond_action_emb = self.agent_action_pos_emb.to(
+            dtype=target_dtype) + cond_action
        # Get conditioning states
-        cond_state = self.state_projector(obs_state)
-        cond_state_emb = self.agent_state_pos_emb + cond_state
+        cond_state = self._projector_forward(self.state_projector, obs_state,
+                                             target_dtype)
+        cond_state_emb = self.agent_state_pos_emb.to(
+            dtype=target_dtype) + cond_state

        if self.decision_making_only:
            is_sim_mode = False