From caed7f67271e975fc429fb46bef977095942296f Mon Sep 17 00:00:00 2001 From: Laith Sakka Date: Sat, 6 Apr 2024 18:57:44 +0000 Subject: [PATCH] profile pt2 compile time with strobelight (#123311) For OSS, this diff adds a decorator @compiletime_sl_profile_meta that is a no-op for non-Meta workloads. Facebook: With this diff, someone can generate a strobelight profile for pt2 compilation. Users need to set the env variable TORCH_COMPILE_SL_PROFILE=TRUE. For example: ``` TORCH_COMPILE_SL_PROFILE=TRUE buck2 run @//mode/inplace @//mode/opt //caffe2/fb/strobelight:compiletime_profile_example ``` See the sample output below, at the end of the summary. The way this works is that a unique id is generated and associated with all samples that are collected for functions decorated with compiletime_sl_profile_meta. This id can then be used to combine different strobelight profiles into one (for example, three compilation events happen in the code below). Right now the following two functions are annotated with compiletime_sl_profile_meta: bw_compiler and _compile. If compiletime_sl_profile_meta is invoked recursively, the recursive invocations are ignored and a log is printed. 
The output is: ``` Strobelight is enabled for pt2 compilation Unique user-id for this run is: 2024-04-03-13:59:49147091devvm4561.ash0.facebook.com You can use the following link to access the strobelight profile at the end of the run: https://www.internalfb.com/intern/scuba/query/?dataset=pyperf_experimental%2Fon_demand&drillstate=%7B%22purposes%22%3A[]%2C%22end%22%3A%22now%22%2C%22start%22%3A%22-30%20days%22%2C%22filterMode%22%3A%22DEFAULT%22%2C%22modifiers%22%3A[]%2C%22sampleCols%22%3A[]%2C%22cols%22%3A[%22namespace_id%22%2C%22namespace_process_id%22]%2C%22derivedCols%22%3A[]%2C%22mappedCols%22%3A[]%2C%22enumCols%22%3A[]%2C%22return_remainder%22%3Afalse%2C%22should_pivot%22%3Afalse%2C%22is_timeseries%22%3Afalse%2C%22hideEmptyColumns%22%3Afalse%2C%22timezone%22%3A%22America%2FLos_Angeles%22%2C%22compare%22%3A%22none%22%2C%22samplingRatio%22%3A%221%22%2C%22metric%22%3A%22count%22%2C%22aggregation_field%22%3A%22async_stack_complete%22%2C%22top%22%3A10000%2C%22aggregateList%22%3A[]%2C%22param_dimensions%22%3A[%7B%22dim%22%3A%22py_async_stack%22%2C%22op%22%3A%22edge%22%2C%22param%22%3A%220%22%2C%22anchor%22%3A%220%22%7D]%2C%22order%22%3A%22weight%22%2C%22order_desc%22%3Atrue%2C%22constraints%22%3A[[%7B%22column%22%3A%22run_user%22%2C%22op%22%3A%22eq%22%2C%22value%22%3A[%22[%5C%222024-04-03-13:59:49147091devvm4561.ash0.facebook.com%5C%22]%22]%7D]]%2C%22c_constraints%22%3A[[]]%2C%22b_constraints%22%3A[[]]%2C%22ignoreGroupByInComparison%22%3Afalse%7D&view=GraphProfilerView&&pool=uber&graphprofiler_filter=&graphprofiler_column_to_sort_by=exclusive the link below takes you to the collected strobelight profile 
https://www.internalfb.com/intern/scuba/query/?dataset=pyperf_experimental%2Fon_demand&drillstate=%7B%22dimensions%22%3A%5B%5D%2C%22param_dimensions%22%3A%5B%7B%22anchor%22%3A%220%22%2C%22param%22%3A%220%22%2C%22op%22%3A%22edge%22%2C%22dim%22%3A%22py_async_stack%22%7D%5D%2C%22constraints%22%3A%5B%5B%7B%22value%22%3A%5B%22%5B%5C%22-6800545191281321%5C%22%5D%22%5D%2C%22op%22%3A%22eq%22%2C%22column%22%3A%22run_id%22%7D%2C%7B%22value%22%3A%5B%22%5B%5C%222024-04-03-13%3A59%3A49147091devvm4561.ash0.facebook.com%5C%22%5D%22%5D%2C%22op%22%3A%22eq%22%2C%22column%22%3A%22run_user%22%7D%5D%5D%2C%22top%22%3A10000%2C%22end%22%3A%221712181610%22%2C%22start%22%3A%221712174410%22%7D&view=GraphProfilerView& 1 storbelight success runs out of 1 non-ignored runs. strobelight run id is: 6181728288420687 the link below takes you to the collected strobelight profile https://www.internalfb.com/intern/scuba/query/?dataset=pyperf_experimental%2Fon_demand&drillstate=%7B%22dimensions%22%3A%5B%5D%2C%22param_dimensions%22%3A%5B%7B%22anchor%22%3A%220%22%2C%22param%22%3A%220%22%2C%22op%22%3A%22edge%22%2C%22dim%22%3A%22py_async_stack%22%7D%5D%2C%22constraints%22%3A%5B%5B%7B%22value%22%3A%5B%22%5B%5C%226181728288420687%5C%22%5D%22%5D%2C%22op%22%3A%22eq%22%2C%22column%22%3A%22run_id%22%7D%2C%7B%22value%22%3A%5B%22%5B%5C%222024-04-03-13%3A59%3A49147091devvm4561.ash0.facebook.com%5C%22%5D%22%5D%2C%22op%22%3A%22eq%22%2C%22column%22%3A%22run_user%22%7D%5D%5D%2C%22top%22%3A10000%2C%22end%22%3A%221712181621%22%2C%22start%22%3A%221712174421%22%7D&view=GraphProfilerView& 2 storbelight success runs out of 2 non-ignored runs. 
strobelight run id is: -1026103682715688 the link below takes you to the collected strobelight profile https://www.internalfb.com/intern/scuba/query/?dataset=pyperf_experimental%2Fon_demand&drillstate=%7B%22dimensions%22%3A%5B%5D%2C%22param_dimensions%22%3A%5B%7B%22anchor%22%3A%220%22%2C%22param%22%3A%220%22%2C%22op%22%3A%22edge%22%2C%22dim%22%3A%22py_async_stack%22%7D%5D%2C%22constraints%22%3A%5B%5B%7B%22value%22%3A%5B%22%5B%5C%22-1026103682715688%5C%22%5D%22%5D%2C%22op%22%3A%22eq%22%2C%22column%22%3A%22run_id%22%7D%2C%7B%22value%22%3A%5B%22%5B%5C%222024-04-03-13%3A59%3A49147091devvm4561.ash0.facebook.com%5C%22%5D%22%5D%2C%22op%22%3A%22eq%22%2C%22column%22%3A%22run_user%22%7D%5D%5D%2C%22top%22%3A10000%2C%22end%22%3A%221712181647%22%2C%22start%22%3A%221712174447%22%7D&view=GraphProfilerView& 3 storbelight success runs out of 3 non-ignored runs. ``` Test Plan: Was tested on buck2 run @//mode/inplace @//mode/opt //caffe2/fb/strobelight:compiletime_profile_example This was also tested in one of the ads benchmarks ``` TORCH_COMPILE_SL_PROFILE=TRUE buck2 run mode/opt mode/inplace //pytorch/benchmark:run -- ads_mc_igctr_mc3_v0 -d cuda -t train --torchdynamo inductor ``` The results match the results reported in https://fb.workplace.com/groups/257735836456307/permalink/657458576484029 Differential Revision: D55672271 Pull Request resolved: https://github.com/pytorch/pytorch/pull/123311 Approved by: https://github.com/aorenste --- torch/_dynamo/convert_frame.py | 3 ++- torch/_inductor/compile_fx.py | 3 ++- torch/_utils_internal.py | 12 ++++++++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py index c1740f41505..62a42d2b5d3 100644 --- a/torch/_dynamo/convert_frame.py +++ b/torch/_dynamo/convert_frame.py @@ -27,7 +27,7 @@ import torch import torch._logging from torch._guards import compile_context, CompileContext, CompileId, tracing from torch._logging import structured -from 
torch._utils_internal import signpost_event +from torch._utils_internal import compiletime_sl_profile_meta, signpost_event from torch.fx.experimental.symbolic_shapes import ( ConstraintViolationError, GuardOnDataDependentSymNode, @@ -441,6 +441,7 @@ def register_bytecode_hook(hook: BytecodeHook) -> RemovableHandle: return handle +@compiletime_sl_profile_meta(phase_name="_compile") @_use_lazy_graph_module(config.use_lazy_graph_module) @maybe_cprofile def _compile( diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py index 25f7b2978aa..d03d231b44c 100644 --- a/torch/_inductor/compile_fx.py +++ b/torch/_inductor/compile_fx.py @@ -46,7 +46,7 @@ from torch._inductor.utils import BoxedBool, count_tangents from torch._logging import trace_structured from torch._ops import OpOverload from torch._subclasses.fake_tensor import FakeTensor -from torch._utils_internal import signpost_event +from torch._utils_internal import compiletime_sl_profile_meta, signpost_event from torch.fx.experimental.symbolic_shapes import free_unbacked_symbols from torch.fx.passes.fake_tensor_prop import FakeTensorProp @@ -1339,6 +1339,7 @@ def compile_fx( graph, joint_inputs, **kwargs, compiler="inductor" ) + @compiletime_sl_profile_meta(phase_name="bw_compiler") @dynamo_utils.dynamo_timed @dynamo_utils.maybe_cprofile def bw_compiler(model: torch.fx.GraphModule, example_inputs: List[torch.Tensor]): diff --git a/torch/_utils_internal.py b/torch/_utils_internal.py index 69849208814..8cf4b1826a6 100644 --- a/torch/_utils_internal.py +++ b/torch/_utils_internal.py @@ -62,6 +62,18 @@ def throw_abstract_impl_not_imported_error(opname, module, context): ) +# Meta only, act as nop otherwise. 
+def compiletime_sl_profile_meta(phase_name): + def compiletime_sl_profile_inner(function): + @functools.wraps(function) + def wrapper_function(*args, **kwargs): + return function(*args, **kwargs) + + return wrapper_function + + return compiletime_sl_profile_inner + + # Meta only, see # https://www.internalfb.com/intern/wiki/ML_Workflow_Observability/User_Guides/Adding_instrumentation_to_your_code/ #