2024-06-08 18:24:41 +00:00
|
|
|
# mypy: allow-untyped-defs
|
2024-02-13 22:31:01 +00:00
|
|
|
import functools
|
2023-04-11 20:10:35 +00:00
|
|
|
import logging
|
2018-06-20 21:45:26 +00:00
|
|
|
import os
|
2023-11-08 00:39:00 +00:00
|
|
|
import sys
|
2020-08-12 16:36:40 +00:00
|
|
|
import tempfile
|
2025-01-21 21:42:12 +00:00
|
|
|
from typing import Any, Callable, Optional, TypeVar
|
2024-10-21 01:37:41 +00:00
|
|
|
from typing_extensions import ParamSpec
|
2023-04-11 20:10:35 +00:00
|
|
|
|
2023-05-16 16:05:44 +00:00
|
|
|
import torch
|
Add compile time profiler for non fbcode targets (#126904)
This is a tool that allow profiling compile time using strobelight profiler, its a meta only tool.
but works on non-fbcode targets.
A follow up diff will unify this with caffe2/fb/strobelight/compile_time_profiler.py.
example test:
```
run python tools/strobelight/examples/compile_time_profile_example.py
```
```
python torch/utils/_strobelight/examples/compile_time_profile_example.py
strobelight_compile_time_profiler, line 61, 2024-05-23 10:49:28,101, INFO: compile time strobelight profiling enabled
strobelight_compile_time_profiler, line 93, 2024-05-23 10:49:28,102, INFO: Unique sample tag for this run is: 2024-05-23-10:49:282334638devvm4561.ash0.facebook.com
strobelight_compile_time_profiler, line 94, 2024-05-23 10:49:28,102, INFO: You can use the following link to access the strobelight profile at the end of the run: https://www.internalfb.com/intern/scuba/query/?dataset=pyperf_experimental%2Fon_demand&drillstate=%7B%22purposes%22%3A[]%2C%22end%22%3A%22now%22%2C%22start%22%3A%22-30%20days%22%2C%22filterMode%22%3A%22DEFAULT%22%2C%22modifiers%22%3A[]%2C%22sampleCols%22%3A[]%2C%22cols%22%3A[%22namespace_id%22%2C%22namespace_process_id%22]%2C%22derivedCols%22%3A[]%2C%22mappedCols%22%3A[]%2C%22enumCols%22%3A[]%2C%22return_remainder%22%3Afalse%2C%22should_pivot%22%3Afalse%2C%22is_timeseries%22%3Afalse%2C%22hideEmptyColumns%22%3Afalse%2C%22timezone%22%3A%22America%2FLos_Angeles%22%2C%22compare%22%3A%22none%22%2C%22samplingRatio%22%3A%221%22%2C%22metric%22%3A%22count%22%2C%22aggregation_field%22%3A%22async_stack_complete%22%2C%22top%22%3A10000%2C%22aggregateList%22%3A[]%2C%22param_dimensions%22%3A[%7B%22dim%22%3A%22py_async_stack%22%2C%22op%22%3A%22edge%22%2C%22param%22%3A%220%22%2C%22anchor%22%3A%220%22%7D]%2C%22order%22%3A%22weight%22%2C%22order_desc%22%3Atrue%2C%22constraints%22%3A[[%7B%22column%22%3A%22sample_tags%22%2C%22op%22%3A%22all%22%2C%22value%22%3A[%22[%5C%222024-05-23-10:49:282334638devvm4561.ash0.facebook.com%5C%22]%22]%7D]]%2C%22c_constraints%22%3A[[]]%2C%22b_constraints%22%3A[[]]%2C%22ignoreGroupByInComparison%22%3Afalse%7D&view=GraphProfilerView&&normalized=1712358002&pool=uber
strobelight_function_profiler, line 241, 2024-05-23 10:49:34,943, INFO: strobelight run id is: 3507039740348330
strobelight_function_profiler, line 243, 2024-05-23 10:50:00,907, INFO: strobelight profiling running
strobelight_function_profiler, line 224, 2024-05-23 10:50:02,741, INFO: strobelight profiling stopped
strobelight_function_profiler, line 215, 2024-05-23 10:50:06,173, INFO: Total samples: 7
strobelight_function_profiler, line 215, 2024-05-23 10:50:06,173, INFO: GraphProfiler (python stack): https://fburl.com/scuba/pyperf_experimental/on_demand/75cxdro3
strobelight_function_profiler, line 215, 2024-05-23 10:50:06,173, INFO: Icicle view (python stack): https://fburl.com/scuba/pyperf_experimental/on_demand/qsgydsee
strobelight_compile_time_profiler, line 120, 2024-05-23 10:50:06,174, INFO: 1 strobelight success runs out of 1 non-recursive compilation events.
strobelight_function_profiler, line 241, 2024-05-23 10:50:08,137, INFO: strobelight run id is: 8721740011604497
strobelight_function_profiler, line 243, 2024-05-23 10:50:34,801, INFO: strobelight profiling running
strobelight_function_profiler, line 224, 2024-05-23 10:50:36,803, INFO: strobelight profiling stopped
strobelight_function_profiler, line 215, 2024-05-23 10:50:41,289, INFO: Total samples: 3
strobelight_function_profiler, line 215, 2024-05-23 10:50:41,289, INFO: GraphProfiler (python stack): https://fburl.com/scuba/pyperf_experimental/on_demand/qmi2ucwp
strobelight_function_profiler, line 215, 2024-05-23 10:50:41,289, INFO: Icicle view (python stack): https://fburl.com/scuba/pyperf_experimental/on_demand/7fjkhs9i
strobelight_compile_time_profiler, line 120, 2024-05-23 10:50:41,289, INFO: 2 strobelight success runs out of 2 non-recursive compilation events.
strobelight_function_profiler, line 241, 2024-05-23 10:50:43,597, INFO: strobelight run id is: 1932476082259558
strobelight_function_profiler, line 243, 2024-05-23 10:51:09,791, INFO: strobelight profiling running
strobelight_function_profiler, line 224, 2024-05-23 10:51:11,883, INFO: strobelight profiling stopped
strobelight_function_profiler, line 215, 2024-05-23 10:51:16,218, INFO: Total samples: 3
strobelight_function_profiler, line 215, 2024-05-23 10:51:16,218, INFO: GraphProfiler (python stack): https://fburl.com/scuba/pyperf_experimental/on_demand/vy1ujxec
strobelight_function_profiler, line 215, 2024-05-23 10:51:16,218, INFO: Icicle view (python stack): https://fburl.com/scuba/pyperf_experimental/on_demand/2xgadviv
strobelight_compile_time_profiler, line 120, 2024-05-23 10:51:16,219, INFO: 3 strobelight success runs out of 3 non-recursive compilation events.
```
or pass TORCH_COMPILE_STROBELIGHT=TRUE for any torch compile python program.
ex running on XLNetLMHeadModel.
```
TORCH_COMPILE_STROBELIGHT=TRUE TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 time python benchmarks/dynamo/huggingface.py --ci --accuracy --timing --explain --inductor --device cuda --training --amp --only XLNetLMHeadModel
```
result:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126904
Approved by: https://github.com/aorenste
ghstack dependencies: #126444
2024-05-28 23:06:37 +00:00
|
|
|
from torch._strobelight.compile_time_profiler import StrobelightCompileTimeProfiler
|
2023-05-16 16:05:44 +00:00
|
|
|
|
2024-06-23 09:07:59 +00:00
|
|
|
|
2024-10-21 01:37:41 +00:00
|
|
|
# Type variables used to give the decorator factories in this module precise
# signatures: _T is the wrapped function's return type, _P its parameters.
_T = TypeVar("_T")
_P = ParamSpec("_P")

# Module-level logger, per the standard logging convention.
log = logging.getLogger(__name__)
|
2021-08-05 21:19:56 +00:00
|
|
|
|
2024-09-15 05:32:38 +00:00
|
|
|
# Opt-in hook: setting TORCH_COMPILE_STROBELIGHT enables strobelight
# compile-time profiling (Meta-internal tooling) for any torch.compile run.
# NOTE(review): os.environ.get returns the raw string, so ANY non-empty value
# -- including "0" or "FALSE" -- takes this branch; confirm that is the
# intended contract before tightening it.
if os.environ.get("TORCH_COMPILE_STROBELIGHT", False):
    # Imported lazily so non-profiling runs do not pay for it.
    import shutil

    # strobeclient only exists on FB infra; elsewhere we just log and move on.
    if not shutil.which("strobeclient"):
        log.info(
            "TORCH_COMPILE_STROBELIGHT is true, but seems like you are not on a FB machine."
        )
    else:
        log.info("Strobelight profiler is enabled via environment variable")
        StrobelightCompileTimeProfiler.enable()
|
|
|
|
|
|
2018-06-20 21:45:26 +00:00
|
|
|
# this arbitrary-looking assortment of functionality is provided here
|
|
|
|
|
# to have a central place for overrideable behavior. The motivating
|
|
|
|
|
# use is the FB build environment, where this source file is replaced
|
|
|
|
|
# by an equivalent.
|
|
|
|
|
|
2023-05-16 16:05:44 +00:00
|
|
|
# Compute the directory that contains the `torch` package, used by
# get_file_path() below to locate data files shipped alongside torch.
if torch._running_with_deploy():
    # __file__ is meaningless in the context of frozen torch used in torch deploy.
    # setting empty torch_parent should allow below functions to operate without crashing,
    # but it's unclear if there is a valid use case for them in the context of deploy.
    torch_parent = ""
else:
    # presumably the "shared" layout nests this file one level deeper than the
    # default layout, hence the extra dirname -- TODO confirm against the build.
    if os.path.basename(os.path.dirname(__file__)) == "shared":
        torch_parent = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
    else:
        torch_parent = os.path.dirname(os.path.dirname(__file__))
|
2018-06-20 21:45:26 +00:00
|
|
|
|
2022-07-22 02:19:50 +00:00
|
|
|
|
2018-06-20 21:45:26 +00:00
|
|
|
def get_file_path(*path_components: str) -> str:
    """Join *path_components* underneath the torch parent directory."""
    joined = os.path.join(torch_parent, *path_components)
    return joined
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_file_path_2(*path_components: str) -> str:
    """Join path components verbatim, without anchoring to the torch tree."""
    joined = os.path.join(*path_components)
    return joined
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_writable_path(path: str) -> str:
    """Return *path* if writable, else a fresh temp directory.

    The fallback directory's name ends with path's basename so callers can
    still recognize what it stands in for.
    """
    writable = os.access(path, os.W_OK)
    if not writable:
        path = tempfile.mkdtemp(suffix=os.path.basename(path))
    return path
|
|
|
|
|
|
2018-06-20 21:45:26 +00:00
|
|
|
|
|
|
|
|
def prepare_multiprocessing_environment(path: str) -> None:
    """Hook for environment-specific multiprocessing setup; no-op in OSS."""
    return None
|
|
|
|
|
|
|
|
|
|
|
2019-03-13 05:06:25 +00:00
|
|
|
def resolve_library_path(path: str) -> str:
    """Canonicalize *path*, resolving symlinks along the way."""
    resolved = os.path.realpath(path)
    return resolved
|
|
|
|
|
|
|
|
|
|
|
2023-09-21 21:04:16 +00:00
|
|
|
def throw_abstract_impl_not_imported_error(opname, module, context):
    """Raise NotImplementedError because *opname* has no fake impl registered.

    The import hint naming *module* is only included when that module has not
    been imported yet; if it is already in sys.modules the hint would be
    useless, so a shorter message is raised.
    """
    if module in sys.modules:
        # Module already imported, so suggesting an import would not help.
        # NOTE(review): this message ends abruptly ("... operator. ") -- it may
        # have lost trailing lines; confirm against the upstream source.
        raise NotImplementedError(
            f"{opname}: We could not find the fake impl for this operator. "
        )
    else:
        raise NotImplementedError(
            f"{opname}: We could not find the fake impl for this operator. "
            f"The operator specified that you may need to import the '{module}' "
            f"Python module to load the fake impl. {context}"
        )
|
2023-09-21 21:04:16 +00:00
|
|
|
|
|
|
|
|
|
2024-05-15 17:24:24 +00:00
|
|
|
# NB! This treats "skip" kwarg specially!!
def compile_time_strobelight_meta(
    phase_name: str,
) -> Callable[[Callable[_P, _T]], Callable[_P, _T]]:
    """Decorator factory that profiles a function's compile time under
    strobelight (Meta-only) when the profiler is enabled.

    When profiling is disabled the wrapped function is called straight
    through with no overhead beyond one extra stack frame.
    """

    def compile_time_strobelight_meta_inner(
        function: Callable[_P, _T],
    ) -> Callable[_P, _T]:
        @functools.wraps(function)
        def wrapper_function(*args: _P.args, **kwargs: _P.kwargs) -> _T:
            # This wrapper adds one stack frame, so keep any integer "skip"
            # frame count the callee receives honest by bumping it.
            skip = kwargs.get("skip")
            if isinstance(skip, int):
                kwargs["skip"] = skip + 1

            # Fast path: profiler off, call through unchanged.
            if not StrobelightCompileTimeProfiler.enabled:
                return function(*args, **kwargs)

            return StrobelightCompileTimeProfiler.profile_compile_time(
                function, phase_name, *args, **kwargs
            )

        return wrapper_function

    return compile_time_strobelight_meta_inner
|
2024-04-06 18:57:44 +00:00
|
|
|
|
|
|
|
|
|
2023-04-11 20:10:35 +00:00
|
|
|
# Meta only, see
|
|
|
|
|
# https://www.internalfb.com/intern/wiki/ML_Workflow_Observability/User_Guides/Adding_instrumentation_to_your_code/
|
|
|
|
|
#
|
|
|
|
|
# This will cause an event to get logged to Scuba via the signposts API. You
|
|
|
|
|
# can view samples on the API at https://fburl.com/scuba/workflow_signpost/zh9wmpqs
|
|
|
|
|
# we log to subsystem "torch", and the category and name you provide here.
|
|
|
|
|
# Each of the arguments translate into a Scuba column. We're still figuring
|
|
|
|
|
# out local conventions in PyTorch, but category should be something like
|
|
|
|
|
# "dynamo" or "inductor", and name should be a specific string describing what
|
|
|
|
|
# kind of event happened.
|
|
|
|
|
#
|
|
|
|
|
# Killswitch is at
|
|
|
|
|
# https://www.internalfb.com/intern/justknobs/?name=pytorch%2Fsignpost#event
|
2025-01-21 21:42:12 +00:00
|
|
|
def signpost_event(category: str, name: str, parameters: dict[str, Any]):
    """OSS stub for Meta's signpost API: just log the event locally."""
    log.info("%s %s: %r", category, name, parameters)
|
|
|
|
|
|
|
|
|
|
|
[Dynamo] Improve PT2 fbcode logging observability (#106932)
Summary:
https://docs.google.com/document/d/1D5K3_ELsda3tIUeSyNL_2yee-M3jVWbirqSQ5BDNvHQ/edit
This is the revamped version of D47908299.
For each frame, we will record a list of compilation metrics: e.g, backend_compile time, entire_frame_compile time, cache_size, co_filename, co_firstlineno, co_name, guards, graph input_count, graph node_count, graph op_count.
With the help of job info: mast_job_name, global_rank, we can satisfy the requirements from `Things I’ve used/wanted to use our logging to determine` in https://docs.google.com/document/d/1D5K3_ELsda3tIUeSyNL_2yee-M3jVWbirqSQ5BDNvHQ/edit (or add more metrics for this framework)
Test Plan:
```
buck2 test //caffe2/test:test_dynamo
```
Differential Revision: D48142400
Pull Request resolved: https://github.com/pytorch/pytorch/pull/106932
Approved by: https://github.com/anijain2305
2023-08-11 20:46:04 +00:00
|
|
|
def log_compilation_event(metrics):
    """OSS stub: record per-frame compilation metrics via the local logger."""
    log.info("%s", metrics)
|
|
|
|
|
|
|
|
|
|
|
2024-02-16 21:32:04 +00:00
|
|
|
def upload_graph(graph):
    """Upload a graph for internal observability; no-op in open source."""
    return None
|
|
|
|
|
|
|
|
|
|
|
2024-01-30 08:13:52 +00:00
|
|
|
def set_pytorch_distributed_envs_from_justknobs():
    """Populate distributed env vars from JustKnobs; no-op in open source."""
    return None
|
|
|
|
|
|
|
|
|
|
|
2024-02-12 17:28:14 +00:00
|
|
|
def log_export_usage(**kwargs):
    """Record torch.export usage telemetry; no-op in open source."""
    return None
|
|
|
|
|
|
|
|
|
|
|
2024-07-19 06:02:47 +00:00
|
|
|
def log_trace_structured_event(*args, **kwargs) -> None:
    """Record a structured trace event; no-op in open source."""
    return None
|
|
|
|
|
|
|
|
|
|
|
2024-09-01 19:02:09 +00:00
|
|
|
def log_cache_bypass(*args, **kwargs) -> None:
    """Record a compile-cache bypass; no-op in open source."""
    return None
|
|
|
|
|
|
|
|
|
|
|
2024-07-09 22:24:16 +00:00
|
|
|
def log_torchscript_usage(api: str, **kwargs):
    """Record TorchScript API usage; no-op in open source."""
    del api  # intentionally unused outside fbcode
    return None
|
|
|
|
|
|
|
|
|
|
|
2024-05-28 17:49:32 +00:00
|
|
|
def check_if_torch_exportable():
    """Whether the jit-trace-to-export path is available; always False in OSS."""
    return False
|
|
|
|
|
|
|
|
|
|
|
2024-10-03 22:21:29 +00:00
|
|
|
def export_training_ir_rollout_check() -> bool:
    """Whether export should use training IR; always rolled out in OSS."""
    return True
|
2024-10-03 22:21:29 +00:00
|
|
|
|
|
|
|
|
|
2024-05-28 17:49:32 +00:00
|
|
|
def log_torch_jit_trace_exportability(
    api: str,
    type_of_export: str,
    export_outcome: str,
    result: str,
):
    """Record whether a jit-traced model is exportable; no-op in open source."""
    del api, type_of_export, export_outcome, result  # unused outside fbcode
    return None
|
|
|
|
|
|
|
|
|
|
|
2024-11-01 16:13:07 +00:00
|
|
|
def justknobs_check(name: str, default: bool = True) -> bool:
    """Query a JustKnobs killswitch; in OSS this always returns *default*.

    This function can be used to killswitch functionality in FB prod,
    where you can toggle this value to False in JK without having to do
    a code push.  In OSS, everything is always turned on all the time,
    because downstream users can simply choose to not update PyTorch.
    (If more fine-grained enable/disable is needed, we could potentially
    have a map we look *name* up in to toggle behavior.  But the point
    is that it's all tied to source code in OSS, since there's no live
    server to query.)

    This is the bare minimum functionality needed for killswitches; a
    more detailed plan lives at
    https://docs.google.com/document/d/1Ukerh9_42SeGh89J-tGtecpHBPwGlkQ043pddkKb3PU/edit
    In particular, in some circumstances it may be necessary to read a
    knob once at process start and then use it consistently for the
    rest of the process.  Future functionality will codify these
    patterns into a better high level API.

    WARNING: Do NOT call this function at module import time, JK is not
    fork safe and you will break anyone who forks the process and then
    hits JK again.
    """
    return default
|
Enable TORCH_TRACE by default in all Tupperware like environments (#120915)
Summary:
This is a reimplemented version of the FB specific code in https://www.internalfb.com/diff/D54230697
The new strategy is that we unconditionally install an FB handler to trace_log logger (and always set level to DEBUG). When the first log message is emitted, we check the JK/filesystem to see if we should actually do logging. If we decide we don't do logging, we remove the handler from trace_log and are done.
build_only[github-export-checks,executorch,pytorch_benchmark,pytorch_quantization,pytorch_distributed,pytorch_distributed_gpu,pytorch_dynamo_inductor,pytorch_functorch,pytorch_fx2trt,pytorch_diff_train_tests_ads,glow_fb_pytorch_tests,training_platform,training_platform_compatibility,training_toolkit_applications,training_toolkit_examples,training_toolkit_model_optimization,dper3_pytorch,xplat_caffe2,pytorch_dev,android-pytorch-instrumentation-tests,smartpytorchgithub_first_try_merge,frl-target-determinator,f6-buck,training_platform_for_github,sigmoid_cpu,sigmoid_gpu,aiplatform_modelprocessing_for_github,accelerators_workloads_models_slimdsnn,ae_aotinductor_benchmark_test,aps_,aps_deterministic_ne_tests,dper_lib_silvertorch,torchrec,torchrec_fb,deeplearning_aot_inductor]
Test Plan:
sandcastle
```
buck2 test 'fbcode//mode/dev-nosan' fbcode//torchrec/inference/tests:test_single_gpu_executor -- --exact 'torchrec/inference/tests:test_single_gpu_executor - TorchDeployGPUTest.NestedModelSingleGPU'
buck2 test 'fbcode//mode/dev-nosan' fbcode//dper_lib/silvertorch/modules/dynamic_stats/tests:accumulators_test -- --exact 'dper_lib/silvertorch/modules/dynamic_stats/tests:accumulators_test - test_global_fixed_interval_accumulator (dper_lib.silvertorch.modules.dynamic_stats.tests.accumulators_test.GlobalFixedIntervalUnivalentAcculumatorTest)'
```
Also running a test flow with/without JK enabled
Differential Revision: D54275086
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120915
Approved by: https://github.com/yanboliang
2024-03-01 04:47:13 +00:00
|
|
|
|
|
|
|
|
|
2024-03-14 00:36:10 +00:00
|
|
|
def justknobs_getval_int(name: str) -> int:
    """Fetch an integer JustKnobs value; always 0 in OSS.

    Read the warning on justknobs_check before using.
    """
    return 0
|
|
|
|
|
|
|
|
|
|
|
2024-08-13 02:49:43 +00:00
|
|
|
def is_fb_unit_test() -> bool:
    """Whether we are running inside an FB unit test; always False in OSS."""
    return False
|
|
|
|
|
|
|
|
|
|
|
2024-02-13 22:31:01 +00:00
|
|
|
@functools.lru_cache(None)
def max_clock_rate():
    """Return the maximum SM clock rate (MHz) of GPU 0, cached for the process.

    On CUDA this queries nvidia-smi via triton; on ROCm there is no nvsmi
    equivalent yet, so a hardcoded per-architecture table is used.
    """
    if not torch.version.hip:
        from triton.testing import nvsmi

        return nvsmi(["clocks.max.sm"])[0]

    # Manually set max-clock speeds on ROCm until equivalent nvmsi
    # functionality in triton.testing or via pyamdsmi enablement. Required
    # for test_snode_runtime unit tests.
    gcn_arch = str(torch.cuda.get_device_properties(0).gcnArchName.split(":", 1)[0])
    # First match wins; order mirrors the original elif chain.
    rocm_max_clocks = (
        ("gfx94", 1700),
        ("gfx90a", 1700),
        ("gfx908", 1502),
        ("gfx11", 1700),
        ("gfx103", 1967),
        ("gfx101", 1144),
    )
    for arch_substring, mhz in rocm_max_clocks:
        if arch_substring in gcn_arch:
            return mhz
    return 1100
|
2024-02-13 22:31:01 +00:00
|
|
|
|
|
|
|
|
|
2025-01-21 21:42:12 +00:00
|
|
|
def get_mast_job_name_version() -> Optional[tuple[str, int]]:
    """Return the (MAST job name, version) pair; None outside Meta infra."""
    return None
|
|
|
|
|
|
|
|
|
|
|
2022-07-22 02:19:50 +00:00
|
|
|
# Default rendezvous endpoint used by distributed tests when no master
# address/port is configured explicitly.
TEST_MASTER_ADDR = "127.0.0.1"
TEST_MASTER_PORT = 29500
|
2021-01-29 03:27:29 +00:00
|
|
|
# USE_GLOBAL_DEPS controls whether __init__.py tries to load
# libtorch_global_deps, see Note [Global dependencies]
USE_GLOBAL_DEPS = True
# USE_RTLD_GLOBAL_WITH_LIBTORCH controls whether __init__.py tries to load
# _C.so with RTLD_GLOBAL during the call to dlopen.
USE_RTLD_GLOBAL_WITH_LIBTORCH = False
# If an op was defined in C++ and extended from Python using
# torch.library.register_fake, this controls whether we require a
# m.set_python_module("mylib.ops") call from C++ that associates
# the C++ op with a python module.
REQUIRES_SET_PYTHON_MODULE = False
|
2024-04-30 15:05:01 +00:00
|
|
|
|
|
|
|
|
|
2024-05-21 00:44:55 +00:00
|
|
|
def maybe_upload_prof_stats_to_manifold(profile_path: str) -> Optional[str]:
    """Announce and skip the fb-internal manifold upload; returns None in OSS."""
    message = "Uploading profile stats (fb-only otherwise no-op)"
    print(message)
    return None
|
2024-08-22 14:59:45 +00:00
|
|
|
|
|
|
|
|
|
2024-10-07 16:51:58 +00:00
|
|
|
def log_chromium_event_internal(
    event: dict[str, Any],
    stack: list[str],
    logger_uuid: str,
    start_time_ns: int,
):
    """Forward a chromium trace event to internal logging; no-op in OSS."""
    return None
|
2024-11-14 18:27:38 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def record_chromium_event_internal(
    event: dict[str, Any],
):
    """Record a chromium trace event internally; no-op in open source."""
    return None
|