Torchbench Dynamo Runner: Enable DDP for perf test and traces (#113332)

- Removes an outdated assert that prevented perf tests from running with DDP; single-node `--multiprocess` is now supported, and perf tests already wrap the model using `deepcopy_and_maybe_ddp`
- Appends the rank to trace file names so that multiple ranks do not try to create the same file
- Renames `deepcopy_and_maybe_ddp` to `deepcopy_and_maybe_parallelize` to include FSDP

Pull Request resolved: https://github.com/pytorch/pytorch/pull/113332
Approved by: https://github.com/H-Huang, https://github.com/wconstab
This commit is contained in:
Simon Fan 2024-01-12 22:41:05 +00:00 committed by PyTorch MergeBot
parent c329eddcb9
commit 4b25948ee6

View file

@ -683,7 +683,10 @@ def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
)
if args.export_profiler_trace:
name = args.profiler_trace_name + "_" + model.name + ".json"
name = args.profiler_trace_name + "_" + model.name
if hasattr(args, "rank"):
name += f"_rank_{args.rank}"
name += ".json"
name = os.path.join(torch._dynamo.config.base_dir, name)
p.export_chrome_trace(name)
median = np.median(timings, axis=0)
@ -2233,7 +2236,7 @@ class BenchmarkRunner:
return ModuleWrapPolicy(MODEL_FSDP_WRAP[model_name])
def deepcopy_and_maybe_ddp(self, model):
def deepcopy_and_maybe_parallelize(self, model):
model = self.deepcopy_model(model)
if self.args.ddp:
assert (
@ -2329,7 +2332,7 @@ class BenchmarkRunner:
inputs_fp64 = None
try:
model_fp64, inputs_fp64 = cast_to_fp64(
self.deepcopy_and_maybe_ddp(model),
self.deepcopy_and_maybe_parallelize(model),
clone_inputs(example_inputs),
)
self.init_optimizer(name, current_device, model_fp64.parameters())
@ -2363,7 +2366,7 @@ class BenchmarkRunner:
reset_rng_state()
model_copy = None
try:
model_copy = self.deepcopy_and_maybe_ddp(model)
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs)
@ -2384,7 +2387,7 @@ class BenchmarkRunner:
reset_rng_state()
model_copy = None
try:
model_copy = self.deepcopy_and_maybe_ddp(model)
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_rerun_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs)
@ -2431,7 +2434,7 @@ class BenchmarkRunner:
torch._dynamo.reset()
model_copy = None
try:
model_copy = self.deepcopy_and_maybe_ddp(model)
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
if self.args.export or self.args.export_aot_inductor:
# apply export on module directly
@ -2615,7 +2618,7 @@ class BenchmarkRunner:
model, example_inputs = self.maybe_cast(model, example_inputs)
# Use distributed wrapping as necessary
model = self.deepcopy_and_maybe_ddp(model)
model = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model.parameters())
with self.pick_grad(name, self.args.training):
@ -3409,12 +3412,6 @@ def run(runner, args, original_dir=None):
CI, args.backend, training=args.training, dynamic=args.dynamic_shapes
)
if args.ddp:
# TODO: we could also hook DDP bench up to --speedup bench, _not_ for mgpu e2e perf,
# but just to measure impact on singlenode of performing graph-breaks.
# Left it as a follow up to keep this PR isolated.
assert (
args.accuracy
), "DDP benchmark is currently only hooked up to --accuracy bench"
assert args.training, "DDP benchmark requires --training mode"
if args.no_optimize_ddp:
torch._dynamo.config.optimize_ddp = False