异步保存结果

2026-02-09 21:23:00 +08:00
parent bc78815acf
commit 6630952d2b
1 changed files with 101 additions and 21 deletions
--- a/scripts/evaluation/world_model_interaction.py
+++ b/scripts/evaluation/world_model_interaction.py
@@ -11,6 +11,8 @@ import warnings
 import imageio
 import time
 import json
+import atexit
+from concurrent.futures import ThreadPoolExecutor
 from contextlib import contextmanager, nullcontext
 from dataclasses import dataclass, field, asdict
 from typing import Optional, Dict, List, Any, Mapping
@@ -21,7 +23,7 @@ from tqdm import tqdm
 from einops import rearrange, repeat
 from collections import OrderedDict
 from torch import nn
-from eval_utils import populate_queues, log_to_tensorboard
+from eval_utils import populate_queues
 from collections import deque
 from torch import Tensor
 from torch.utils.tensorboard import SummaryWriter
@@ -393,6 +395,81 @@ def init_profiler(enabled: bool, output_dir: str, profile_detail: str) -> Profil
    return _profiler


+# ========== Async I/O ==========
+_io_executor: Optional[ThreadPoolExecutor] = None
+_io_futures: List[Any] = []
+
+
+def _get_io_executor() -> ThreadPoolExecutor:
+    global _io_executor
+    if _io_executor is None:
+        _io_executor = ThreadPoolExecutor(max_workers=2)
+    return _io_executor
+
+
+def _flush_io():
+    """Wait for all pending async I/O to finish."""
+    global _io_futures
+    for fut in _io_futures:
+        try:
+            fut.result()
+        except Exception as e:
+            print(f">>> [async I/O] error: {e}")
+    _io_futures.clear()
+
+
+atexit.register(_flush_io)
+
+
+def _save_results_sync(video_cpu: Tensor, filename: str, fps: int) -> None:
+    """Synchronous save on CPU tensor (runs in background thread)."""
+    video = torch.clamp(video_cpu.float(), -1., 1.)
+    n = video.shape[0]
+    video = video.permute(2, 0, 1, 3, 4)
+    frame_grids = [
+        torchvision.utils.make_grid(framesheet, nrow=int(n), padding=0)
+        for framesheet in video
+    ]
+    grid = torch.stack(frame_grids, dim=0)
+    grid = (grid + 1.0) / 2.0
+    grid = (grid * 255).to(torch.uint8).permute(0, 2, 3, 1)
+    torchvision.io.write_video(filename,
+                               grid,
+                               fps=fps,
+                               video_codec='h264',
+                               options={'crf': '10'})
+
+
+def save_results_async(video: Tensor, filename: str, fps: int = 8) -> None:
+    """Submit video saving to background thread pool."""
+    video_cpu = video.detach().cpu()
+    fut = _get_io_executor().submit(_save_results_sync, video_cpu, filename, fps)
+    _io_futures.append(fut)
+
+
+def _log_to_tb_sync(writer, video_cpu: Tensor, tag: str, fps: int) -> None:
+    """Synchronous TensorBoard log on CPU tensor (runs in background thread)."""
+    if video_cpu.dim() == 5:
+        n = video_cpu.shape[0]
+        video = video_cpu.permute(2, 0, 1, 3, 4)
+        frame_grids = [
+            torchvision.utils.make_grid(framesheet, nrow=int(n), padding=0)
+            for framesheet in video
+        ]
+        grid = torch.stack(frame_grids, dim=0)
+        grid = (grid + 1.0) / 2.0
+        grid = grid.unsqueeze(dim=0)
+        writer.add_video(tag, grid, fps=fps)
+
+
+def log_to_tensorboard_async(writer, data: Tensor, tag: str, fps: int = 10) -> None:
+    """Submit TensorBoard logging to background thread pool."""
+    if isinstance(data, torch.Tensor) and data.dim() == 5:
+        data_cpu = data.detach().cpu()
+        fut = _get_io_executor().submit(_log_to_tb_sync, writer, data_cpu, tag, fps)
+        _io_futures.append(fut)
+
+
 # ========== Original Functions ==========
 def get_device_from_parameters(module: nn.Module) -> torch.device:
    """Get a module's device by checking one of its parameters.
@@ -1392,28 +1469,28 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:
                                cond_obs_queues = populate_queues(cond_obs_queues,
                                                                  obs_update)

-                        # Save the imagen videos for decision-making
+                        # Save the imagen videos for decision-making (async)
                        with profiler.profile_section("save_results"):
                            sample_tag = f"{args.dataset}-vid{sample['videoid']}-dm-fs-{fs}/itr-{itr}"
-                            log_to_tensorboard(writer,
+                            log_to_tensorboard_async(writer,
                                                     pred_videos_0,
                                                     sample_tag,
                                                     fps=args.save_fps)
                            # Save videos environment changes via world-model interaction
                            sample_tag = f"{args.dataset}-vid{sample['videoid']}-wd-fs-{fs}/itr-{itr}"
-                            log_to_tensorboard(writer,
+                            log_to_tensorboard_async(writer,
                                                     pred_videos_1,
                                                     sample_tag,
                                                     fps=args.save_fps)

                            # Save the imagen videos for decision-making
                            sample_video_file = f'{video_save_dir}/dm/{fs}/itr-{itr}.mp4'
-                            save_results(pred_videos_0.cpu(),
+                            save_results_async(pred_videos_0,
                                               sample_video_file,
                                               fps=args.save_fps)
                            # Save videos environment changes via world-model interaction
                            sample_video_file = f'{video_save_dir}/wm/{fs}/itr-{itr}.mp4'
-                            save_results(pred_videos_1.cpu(),
+                            save_results_async(pred_videos_1,
                                               sample_video_file,
                                               fps=args.save_fps)

@@ -1426,12 +1503,15 @@ def run_inference(args: argparse.Namespace, gpu_num: int, gpu_no: int) -> None:

            full_video = torch.cat(wm_video, dim=2)
            sample_tag = f"{args.dataset}-vid{sample['videoid']}-wd-fs-{fs}/full"
-            log_to_tensorboard(writer,
+            log_to_tensorboard_async(writer,
                                     full_video,
                                     sample_tag,
                                     fps=args.save_fps)
            sample_full_video_file = f"{video_save_dir}/../{sample['videoid']}_full_fs{fs}.mp4"
-            save_results(full_video, sample_full_video_file, fps=args.save_fps)
+            save_results_async(full_video, sample_full_video_file, fps=args.save_fps)
+
+    # Wait for all async I/O to complete before profiling report
+    _flush_io()

    # Save profiling results
    profiler.save_results()