torchelastic: change monitor_interval default to 0.1 (#124692)
This reduces the default monitor_interval for torchelastic to 0.1s, as testing shows negligible load for common use cases. Even at the extreme of 100k processes, polling uses only 45.4% of a single core. Torchelastic's monitor_interval only monitors the processes on a single worker, so under typical loads, even for huge jobs, we expect ~8 subprocesses per machine, one per GPU. As an external data point, Python's subprocess wait polls every 50usec-50ms (https://github.com/python/cpython/blob/main/Lib/subprocess.py#L2035).

## Motivation

This setting controls how frequently we poll for failed processes in elastic.

* For some jobs of note we run elastic 3 times per try, so compared with the previous default of 5 seconds we should save ~15 seconds per retry.
* @kiukchung's use case: the long default is annoying in notebooks etc., since it adds delay to shutdown when testing things.

## Results

CPU utilization is measured in cores (100% is a single core under full load).

| monitor_interval (s) | nproc-per-node | CPU util (highest observed) |
| -------------------- | -------------- | --------------------------- |
| 1.0                  | 10             | 0.2%                        |
| 0.1                  | 1              | 0.4%                        |
| 0.1                  | 10             | 0.4%                        |
| 0.01                 | 10             | 0.9%                        |
| 0.001                | 10             | 4.0%                        |
| 0.1                  | 100            | 0.5%                        |
| 0.1                  | 1000           | 2.2%                        |
| 0.1                  | 10000          | 15.7%                       |
| 0.1                  | 100000         | 45.4%                       |

## Methodology

```sh
# run command
$ LOGLEVEL=INFO torchrun --nnodes 1 --nproc-per-node 10 --monitor-interval 0.1 ~/wait.py

# wait a few seconds for all processes to start and reach steady state, then run;
# wait ~30s or 3 prints and take the highest value
$ top -b -d 10 -c | rg 'torchrun.*wait'
```

wait.py

```py
import time

time.sleep(10 * 60)
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/124692
Approved by: https://github.com/kiukchung, https://github.com/kurman
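To make the tradeoff concrete: the agent wakes up every `monitor_interval` seconds, checks whether its local worker processes are still alive, and only then reacts to failures, so a smaller interval detects failures sooner at the cost of more frequent wakeups. Below is an illustrative sketch of that polling pattern, not the actual torchelastic agent code; the `monitor` helper and the dummy workers are hypothetical.

```py
# Illustrative sketch only (not torchelastic internals): poll child processes
# every monitor_interval seconds and report the first exit code observed.
import subprocess
import sys
import time


def monitor(procs, monitor_interval=0.1):
    """Return the first worker exit code seen, polling every monitor_interval seconds."""
    while True:
        for proc in procs:
            rc = proc.poll()  # non-blocking liveness check
            if rc is not None:
                return rc  # a worker exited; a real agent would decide whether to restart
        time.sleep(monitor_interval)  # smaller interval => faster detection, more wakeups


if __name__ == "__main__":
    workers = [
        subprocess.Popen([sys.executable, "-c", "import time; time.sleep(3)"])
        for _ in range(2)
    ]
    print("first exit code:", monitor(workers))
```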
parent 03fa2421dc
commit 952a00eda7

5 changed files with 6 additions and 6 deletions
```diff
@@ -54,7 +54,7 @@ class WorkerGroupTest(unittest.TestCase):
             args=(),
             rdzv_handler=None,
             max_restarts=50,
-            monitor_interval=1,
+            monitor_interval=0.1,
         )
         worker_group = WorkerGroup(spec)
```
```diff
@@ -157,7 +157,7 @@ class SimpleElasticAgentTest(unittest.TestCase):
     def _get_worker_spec(
         self,
         max_restarts=1,
-        monitor_interval=1.0,
+        monitor_interval=0.1,
         role="test_trainer",
         local_world_size=8,
         local_addr=None,
```
```diff
@@ -70,7 +70,7 @@ def get_test_launch_config(
        nproc_per_node=nproc_per_node,
        run_id=run_id,
        rdzv_endpoint=rdzv_endpoint,
-       monitor_interval=1,
+       monitor_interval=0.1,
        rdzv_backend=rdzv_backend,
        start_method="spawn",
        max_restarts=0,
```
```diff
@@ -85,7 +85,7 @@ class WorkerSpec:
     entrypoint: Union[Callable, str, None] = None
     args: Tuple = ()
     max_restarts: int = 3
-    monitor_interval: float = 30.0
+    monitor_interval: float = 0.1
     master_port: Optional[int] = None
     master_addr: Optional[str] = None
     local_addr: Optional[str] = None
```
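A quick way to see the new agent-level default is to inspect the field defaults directly. This is a sketch, assuming a build that includes this commit and that `WorkerSpec` remains a dataclass exported from `torch.distributed.elastic.agent.server`, as the annotated defaults in the hunk above suggest:

```py
# Sketch: inspect the WorkerSpec dataclass defaults after this change.
import dataclasses

from torch.distributed.elastic.agent.server import WorkerSpec

defaults = {f.name: f.default for f in dataclasses.fields(WorkerSpec)}
print(defaults["monitor_interval"])  # expected: 0.1 (was 30.0)
print(defaults["max_restarts"])      # expected: 3
```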
```diff
@@ -75,7 +75,7 @@ class LaunchConfig:
     rdzv_configs: Dict[str, Any] = field(default_factory=dict)
     rdzv_timeout: int = -1
     max_restarts: int = 3
-    monitor_interval: float = 30
+    monitor_interval: float = 0.1
     start_method: str = "spawn"
     log_line_prefix_template: Optional[str] = None
     metrics_cfg: Dict[str, str] = field(default_factory=dict)
```
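For jobs launched from Python rather than the `torchrun` CLI, the same knob is exposed on `LaunchConfig`. Below is a rough single-node sketch assuming the `torch.distributed.launcher.api` entry points (`LaunchConfig`, `elastic_launch`); the `worker` function, run id, and rendezvous endpoint are placeholders, not part of this change:

```py
# Sketch of setting monitor_interval when launching programmatically.
from torch.distributed.launcher.api import LaunchConfig, elastic_launch


def worker():
    # placeholder for real per-process training work
    return "ok"


if __name__ == "__main__":
    config = LaunchConfig(
        min_nodes=1,
        max_nodes=1,
        nproc_per_node=2,
        rdzv_backend="c10d",
        rdzv_endpoint="localhost:29500",  # placeholder endpoint
        run_id="monitor_interval_demo",   # placeholder run id
        max_restarts=0,
        monitor_interval=0.1,  # new default; raise it if polling overhead ever matters
    )
    results = elastic_launch(config, worker)()
    print(results)  # map of rank -> return value
```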
```diff
@@ -499,7 +499,7 @@ def get_args_parser() -> ArgumentParser:
         "--monitor_interval",
         action=env,
         type=float,
-        default=5,
+        default=0.1,
         help="Interval, in seconds, to monitor the state of workers.",
     )
     parser.add_argument(
```
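On the command line the flag itself is unchanged; only its default moves from 5 to 0.1 seconds. A usage sketch (`train.py` is a placeholder script):

```sh
# Uses the new 0.1s default polling interval:
torchrun --nnodes 1 --nproc-per-node 8 train.py

# Explicitly restores the previous, less frequent polling:
torchrun --nnodes 1 --nproc-per-node 8 --monitor-interval 5 train.py
```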