性能剖析
This commit is contained in:
@@ -56,21 +56,50 @@ class TimingRecord:
|
||||
}
|
||||
|
||||
|
||||
class ProfilerManager:
|
||||
"""Manages macro and micro-level profiling."""
|
||||
|
||||
def __init__(self, enabled: bool = False, output_dir: str = "./profile_output"):
|
||||
self.enabled = enabled
|
||||
self.output_dir = output_dir
|
||||
self.macro_timings: Dict[str, List[float]] = {}
|
||||
self.cuda_events: Dict[str, List[tuple]] = {}
|
||||
self.memory_snapshots: List[Dict] = []
|
||||
self.pytorch_profiler = None
|
||||
self.current_iteration = 0
|
||||
self.operator_stats: Dict[str, Dict] = {}
|
||||
|
||||
if enabled:
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
class ProfilerManager:
|
||||
"""Manages macro and micro-level profiling."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
enabled: bool = False,
|
||||
output_dir: str = "./profile_output",
|
||||
profile_detail: str = "light",
|
||||
):
|
||||
self.enabled = enabled
|
||||
self.output_dir = output_dir
|
||||
self.profile_detail = profile_detail
|
||||
self.macro_timings: Dict[str, List[float]] = {}
|
||||
self.cuda_events: Dict[str, List[tuple]] = {}
|
||||
self.memory_snapshots: List[Dict] = []
|
||||
self.pytorch_profiler = None
|
||||
self.current_iteration = 0
|
||||
self.operator_stats: Dict[str, Dict] = {}
|
||||
self.profiler_config = self._build_profiler_config(profile_detail)
|
||||
|
||||
if enabled:
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
def _build_profiler_config(self, profile_detail: str) -> Dict[str, Any]:
|
||||
"""Return profiler settings based on the requested detail level."""
|
||||
if profile_detail not in ("light", "full"):
|
||||
raise ValueError(f"Unsupported profile_detail: {profile_detail}")
|
||||
if profile_detail == "full":
|
||||
return {
|
||||
"record_shapes": True,
|
||||
"profile_memory": True,
|
||||
"with_stack": True,
|
||||
"with_flops": True,
|
||||
"with_modules": True,
|
||||
"group_by_input_shape": True,
|
||||
}
|
||||
return {
|
||||
"record_shapes": False,
|
||||
"profile_memory": False,
|
||||
"with_stack": False,
|
||||
"with_flops": False,
|
||||
"with_modules": False,
|
||||
"group_by_input_shape": False,
|
||||
}
|
||||
|
||||
@contextmanager
|
||||
def profile_section(self, name: str, sync_cuda: bool = True):
|
||||
@@ -133,22 +162,22 @@ class ProfilerManager:
|
||||
if not self.enabled:
|
||||
return nullcontext()
|
||||
|
||||
self.pytorch_profiler = torch.profiler.profile(
|
||||
activities=[
|
||||
torch.profiler.ProfilerActivity.CPU,
|
||||
torch.profiler.ProfilerActivity.CUDA,
|
||||
],
|
||||
schedule=torch.profiler.schedule(
|
||||
wait=wait, warmup=warmup, active=active, repeat=1
|
||||
),
|
||||
on_trace_ready=self._trace_handler,
|
||||
record_shapes=True,
|
||||
profile_memory=True,
|
||||
with_stack=True,
|
||||
with_flops=True,
|
||||
with_modules=True,
|
||||
)
|
||||
return self.pytorch_profiler
|
||||
self.pytorch_profiler = torch.profiler.profile(
|
||||
activities=[
|
||||
torch.profiler.ProfilerActivity.CPU,
|
||||
torch.profiler.ProfilerActivity.CUDA,
|
||||
],
|
||||
schedule=torch.profiler.schedule(
|
||||
wait=wait, warmup=warmup, active=active, repeat=1
|
||||
),
|
||||
on_trace_ready=self._trace_handler,
|
||||
record_shapes=self.profiler_config["record_shapes"],
|
||||
profile_memory=self.profiler_config["profile_memory"],
|
||||
with_stack=self.profiler_config["with_stack"],
|
||||
with_flops=self.profiler_config["with_flops"],
|
||||
with_modules=self.profiler_config["with_modules"],
|
||||
)
|
||||
return self.pytorch_profiler
|
||||
|
||||
def _trace_handler(self, prof):
|
||||
"""Handle profiler trace output."""
|
||||
@@ -158,8 +187,10 @@ class ProfilerManager:
|
||||
)
|
||||
prof.export_chrome_trace(trace_path)
|
||||
|
||||
# Extract operator statistics
|
||||
key_averages = prof.key_averages(group_by_input_shape=True)
|
||||
# Extract operator statistics
|
||||
key_averages = prof.key_averages(
|
||||
group_by_input_shape=self.profiler_config["group_by_input_shape"]
|
||||
)
|
||||
for evt in key_averages:
|
||||
op_name = evt.key
|
||||
if op_name not in self.operator_stats:
|
||||
@@ -344,18 +375,22 @@ class ProfilerManager:
|
||||
# Global profiler instance
|
||||
_profiler: Optional[ProfilerManager] = None
|
||||
|
||||
def get_profiler() -> ProfilerManager:
    """Return the module-wide profiler, creating a disabled one on first use."""
    global _profiler
    if _profiler is not None:
        return _profiler
    # Nothing has been set up yet: fall back to a disabled manager.
    _profiler = ProfilerManager(enabled=False)
    return _profiler
|
||||
|
||||
def init_profiler(enabled: bool, output_dir: str) -> ProfilerManager:
    """Replace the global profiler with a newly built one and return it.

    Args:
        enabled: Forwarded to ``ProfilerManager``.
        output_dir: Forwarded to ``ProfilerManager``.
    """
    global _profiler
    manager = ProfilerManager(enabled=enabled, output_dir=output_dir)
    _profiler = manager
    return manager
|
||||
def get_profiler() -> ProfilerManager:
    """Return the module-wide profiler, creating a disabled one on first use."""
    global _profiler
    if _profiler is not None:
        return _profiler
    # Nothing has been set up yet: fall back to a disabled manager.
    _profiler = ProfilerManager(enabled=False)
    return _profiler
|
||||
|
||||
def init_profiler(
    enabled: bool,
    output_dir: str,
    profile_detail: str = "light",
) -> ProfilerManager:
    """Replace the global profiler with a newly built one and return it.

    Args:
        enabled: Forwarded to ``ProfilerManager``.
        output_dir: Forwarded to ``ProfilerManager``.
        profile_detail: Forwarded to ``ProfilerManager``. Defaults to
            ``"light"``, the same default ``ProfilerManager`` itself uses,
            so two-argument calls written before this parameter existed
            keep working.

    Returns:
        The new ``ProfilerManager``, which is also stored as the global
        instance.
    """
    global _profiler
    _profiler = ProfilerManager(
        enabled=enabled,
        output_dir=output_dir,
        profile_detail=profile_detail,
    )
    return _profiler
|
||||
|
||||
|
||||
# ========== Original Functions ==========
|
||||
@@ -1193,13 +1228,20 @@ def get_parser():
|
||||
default=None,
|
||||
help="Directory to save profiling results. Defaults to {savedir}/profile_output."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--profile_iterations",
|
||||
type=int,
|
||||
default=3,
|
||||
help="Number of iterations to run PyTorch profiler's active phase for operator-level analysis."
|
||||
)
|
||||
return parser
|
||||
parser.add_argument(
|
||||
"--profile_iterations",
|
||||
type=int,
|
||||
default=3,
|
||||
help="Number of iterations to run PyTorch profiler's active phase for operator-level analysis."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--profile_detail",
|
||||
type=str,
|
||||
choices=["light", "full"],
|
||||
default="light",
|
||||
help="Profiling detail level. Use 'full' for shapes/stacks/memory/flops."
|
||||
)
|
||||
return parser
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
@@ -1214,7 +1256,11 @@ if __name__ == '__main__':
|
||||
profile_output_dir = args.profile_output_dir
|
||||
if profile_output_dir is None:
|
||||
profile_output_dir = os.path.join(args.savedir, "profile_output")
|
||||
init_profiler(enabled=args.profile, output_dir=profile_output_dir)
|
||||
init_profiler(
|
||||
enabled=args.profile,
|
||||
output_dir=profile_output_dir,
|
||||
profile_detail=args.profile_detail,
|
||||
)
|
||||
|
||||
rank, gpu_num = 0, 1
|
||||
run_inference(args, gpu_num, rank)
|
||||
|
||||
@@ -0,0 +1,85 @@
|
||||
================================================================================
|
||||
PERFORMANCE PROFILING REPORT
|
||||
================================================================================
|
||||
|
||||
----------------------------------------
|
||||
MACRO-LEVEL TIMING SUMMARY
|
||||
----------------------------------------
|
||||
Section Count Total(ms) Avg(ms) CUDA Avg(ms)
|
||||
--------------------------------------------------------------------------------------
|
||||
action_generation 11 399707.47 36337.04 36336.85
|
||||
data_loading 1 52.85 52.85 52.88
|
||||
get_latent_z/encode 22 901.39 40.97 41.01
|
||||
iteration_total 11 836793.23 76072.11 76071.63
|
||||
load_transitions 1 2.24 2.24 2.28
|
||||
model_loading/checkpoint 1 11833.31 11833.31 11833.43
|
||||
model_loading/config 1 49774.19 49774.19 49774.16
|
||||
model_to_cuda 1 8909.30 8909.30 8909.33
|
||||
prepare_init_input 1 10.52 10.52 10.55
|
||||
prepare_observation 11 5.41 0.49 0.53
|
||||
prepare_wm_observation 11 2.12 0.19 0.22
|
||||
save_results 11 38668.06 3515.28 3515.32
|
||||
synthesis/conditioning_prep 22 2916.63 132.57 132.61
|
||||
synthesis/ddim_sampling 22 782695.01 35577.05 35576.86
|
||||
synthesis/decode_first_stage 22 12444.31 565.65 565.70
|
||||
update_action_queues 11 6.85 0.62 0.65
|
||||
update_state_queues 11 17.67 1.61 1.64
|
||||
world_model_interaction 11 398375.58 36215.96 36215.75
|
||||
--------------------------------------------------------------------------------------
|
||||
TOTAL 2543116.13
|
||||
|
||||
----------------------------------------
|
||||
GPU MEMORY SUMMARY
|
||||
----------------------------------------
|
||||
Peak allocated: 17890.50 MB
|
||||
Average allocated: 16129.98 MB
|
||||
|
||||
----------------------------------------
|
||||
TOP 30 OPERATORS BY CUDA TIME
|
||||
----------------------------------------
|
||||
Operator Count CUDA(ms) CPU(ms) Self CUDA(ms)
|
||||
------------------------------------------------------------------------------------------------
|
||||
ProfilerStep* 6 443804.16 237696.98 237689.25
|
||||
aten::linear 171276 112286.23 13179.82 0.00
|
||||
aten::addmm 81456 79537.36 3799.84 79296.37
|
||||
ampere_sgemm_128x64_tn 26400 52052.10 0.00 52052.10
|
||||
aten::matmul 90468 34234.05 6281.32 0.00
|
||||
aten::_convolution 100242 33623.79 13105.89 0.00
|
||||
aten::mm 89820 33580.74 3202.22 33253.18
|
||||
aten::convolution 100242 33575.23 13714.47 0.00
|
||||
aten::cudnn_convolution 98430 30932.19 8640.50 29248.12
|
||||
ampere_sgemm_32x128_tn 42348 20394.52 0.00 20394.52
|
||||
aten::conv2d 42042 18115.35 5932.30 0.00
|
||||
ampere_sgemm_128x32_tn 40938 16429.81 0.00 16429.81
|
||||
xformers::efficient_attention_forward_cutlass 24000 15222.23 2532.93 15120.44
|
||||
fmha_cutlassF_f32_aligned_64x64_rf_sm80(Attenti... 24000 15121.31 0.00 15121.31
|
||||
ampere_sgemm_64x64_tn 21000 14627.12 0.00 14627.12
|
||||
aten::copy_ 231819 14504.87 127056.51 14038.39
|
||||
aten::group_norm 87144 12033.73 10659.57 0.00
|
||||
aten::native_group_norm 87144 11473.40 9449.36 11002.02
|
||||
aten::conv3d 26400 8852.13 3365.43 0.00
|
||||
void at::native::(anonymous namespace)::Rowwise... 87144 8714.68 0.00 8714.68
|
||||
void cudnn::ops::nchwToNhwcKernel<float, float,... 169824 8525.44 0.00 8525.44
|
||||
aten::clone 214314 8200.26 8568.82 0.00
|
||||
void at::native::elementwise_kernel<128, 2, at:... 220440 8109.62 0.00 8109.62
|
||||
void cutlass::Kernel<cutlass_80_simt_sgemm_128x... 15000 7919.30 0.00 7919.30
|
||||
aten::_to_copy 12219 5963.43 122411.53 0.00
|
||||
aten::to 58101 5952.65 122443.72 0.00
|
||||
aten::conv1d 30000 5878.95 4556.48 0.00
|
||||
Memcpy HtoD (Pageable -> Device) 6696 5856.39 0.00 5856.39
|
||||
aten::reshape 671772 5124.03 9636.01 0.00
|
||||
sm80_xmma_fprop_implicit_gemm_indexed_tf32f32_t... 16272 5097.70 0.00 5097.70
|
||||
|
||||
----------------------------------------
|
||||
OPERATOR CATEGORY BREAKDOWN
|
||||
----------------------------------------
|
||||
Category CUDA Time(ms) Percentage
|
||||
---------------------------------------------------------
|
||||
Other 481950.47 41.9%
|
||||
Linear/GEMM 342333.09 29.8%
|
||||
Convolution 159920.77 13.9%
|
||||
Elementwise 54682.93 4.8%
|
||||
Memory 36883.36 3.2%
|
||||
Attention 34736.13 3.0%
|
||||
Normalization 32081.19 2.8%
|
||||
Activation 6449.19 0.6%
|
||||
@@ -0,0 +1,85 @@
|
||||
================================================================================
|
||||
PERFORMANCE PROFILING REPORT
|
||||
================================================================================
|
||||
|
||||
----------------------------------------
|
||||
MACRO-LEVEL TIMING SUMMARY
|
||||
----------------------------------------
|
||||
Section Count Total(ms) Avg(ms) CUDA Avg(ms)
|
||||
--------------------------------------------------------------------------------------
|
||||
action_generation 11 394370.58 35851.87 35851.67
|
||||
data_loading 1 52.00 52.00 52.03
|
||||
get_latent_z/encode 22 899.25 40.88 40.91
|
||||
iteration_total 11 830856.07 75532.37 75531.89
|
||||
load_transitions 1 2.11 2.11 2.16
|
||||
model_loading/checkpoint 1 10410.48 10410.48 10410.60
|
||||
model_loading/config 1 49460.02 49460.02 49460.01
|
||||
model_to_cuda 1 4398.71 4398.71 4398.74
|
||||
prepare_init_input 1 10.26 10.26 10.29
|
||||
prepare_observation 11 5.08 0.46 0.49
|
||||
prepare_wm_observation 11 2.03 0.18 0.21
|
||||
save_results 11 40851.48 3713.77 3713.80
|
||||
synthesis/conditioning_prep 22 2270.48 103.20 103.24
|
||||
synthesis/ddim_sampling 22 775253.03 35238.77 35238.59
|
||||
synthesis/decode_first_stage 22 12416.36 564.38 564.43
|
||||
update_action_queues 11 6.27 0.57 0.60
|
||||
update_state_queues 11 16.57 1.51 1.54
|
||||
world_model_interaction 11 395594.93 35963.18 35962.96
|
||||
--------------------------------------------------------------------------------------
|
||||
TOTAL 2516875.71
|
||||
|
||||
----------------------------------------
|
||||
GPU MEMORY SUMMARY
|
||||
----------------------------------------
|
||||
Peak allocated: 17890.50 MB
|
||||
Average allocated: 16129.98 MB
|
||||
|
||||
----------------------------------------
|
||||
TOP 30 OPERATORS BY CUDA TIME
|
||||
----------------------------------------
|
||||
Operator Count CUDA(ms) CPU(ms) Self CUDA(ms)
|
||||
------------------------------------------------------------------------------------------------
|
||||
ProfilerStep* 6 438046.75 232814.87 232809.14
|
||||
aten::linear 171276 112786.01 10941.68 0.00
|
||||
aten::addmm 81456 79765.93 3676.25 79525.01
|
||||
ampere_sgemm_128x64_tn 26400 52203.84 0.00 52203.84
|
||||
aten::matmul 90468 34345.67 5341.43 0.00
|
||||
aten::_convolution 100242 33699.82 12792.11 0.00
|
||||
aten::mm 89820 33690.79 3067.07 33361.05
|
||||
aten::convolution 100242 33629.44 13178.80 0.00
|
||||
aten::cudnn_convolution 98430 31003.85 9020.54 29316.78
|
||||
ampere_sgemm_32x128_tn 42348 20439.71 0.00 20439.71
|
||||
aten::conv2d 42042 18256.98 5775.15 0.00
|
||||
ampere_sgemm_128x32_tn 40938 16493.37 0.00 16493.37
|
||||
xformers::efficient_attention_forward_cutlass 24000 15256.14 2372.78 15154.49
|
||||
fmha_cutlassF_f32_aligned_64x64_rf_sm80(Attenti... 24000 15155.37 0.00 15155.37
|
||||
ampere_sgemm_64x64_tn 21000 14660.16 0.00 14660.16
|
||||
aten::copy_ 231819 13133.93 137045.31 12663.88
|
||||
aten::group_norm 87144 12058.55 9417.15 0.00
|
||||
aten::native_group_norm 87144 11497.70 8394.42 11024.58
|
||||
aten::conv3d 26400 8909.30 3210.64 0.00
|
||||
void at::native::(anonymous namespace)::Rowwise... 87144 8732.10 0.00 8732.10
|
||||
void cudnn::ops::nchwToNhwcKernel<float, float,... 169824 8550.65 0.00 8550.65
|
||||
aten::clone 214314 8182.15 7704.97 0.00
|
||||
void at::native::elementwise_kernel<128, 2, at:... 220440 8122.53 0.00 8122.53
|
||||
void cutlass::Kernel<cutlass_80_simt_sgemm_128x... 15000 7959.63 0.00 7959.63
|
||||
aten::conv1d 30000 5921.64 4150.30 0.00
|
||||
aten::reshape 671772 5134.95 7968.26 0.00
|
||||
sm80_xmma_fprop_implicit_gemm_indexed_tf32f32_t... 16272 5106.25 0.00 5106.25
|
||||
void cutlass_cudnn_infer::Kernel<cutlass_tensor... 4200 4882.51 0.00 4882.51
|
||||
aten::_to_copy 12219 4575.90 132491.24 0.00
|
||||
aten::to 58101 4568.11 132512.86 0.00
|
||||
|
||||
----------------------------------------
|
||||
OPERATOR CATEGORY BREAKDOWN
|
||||
----------------------------------------
|
||||
Category CUDA Time(ms) Percentage
|
||||
---------------------------------------------------------
|
||||
Other 473442.20 41.5%
|
||||
Linear/GEMM 343517.32 30.1%
|
||||
Convolution 160436.45 14.1%
|
||||
Elementwise 54809.55 4.8%
|
||||
Attention 34810.12 3.1%
|
||||
Memory 34401.76 3.0%
|
||||
Normalization 32147.89 2.8%
|
||||
Activation 6457.30 0.6%
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -22,7 +22,8 @@ dataset="unitree_g1_pack_camera"
|
||||
--guidance_rescale 0.7 \
|
||||
--perframe_ae \
|
||||
--profile \
|
||||
--profile_iterations 3
|
||||
--profile_iterations 3 \
|
||||
--profile_detail full
|
||||
} 2>&1 | tee "${res_dir}/output_profile.log"
|
||||
|
||||
echo ""
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user