Torchbench Dynamo Runner: Enable DDP for perf test and traces (#113332)

- Removes an outdated assert that prevented perf tests from running with DDP; single-node `--multiprocess` is now supported, and perf tests already wrap the model using `deepcopy_and_maybe_ddp`
- Appends the rank to trace file names so that multiple ranks do not try to create the same file
- Renames `deepcopy_and_maybe_ddp` to `deepcopy_and_maybe_parallelize` to include FSDP

Pull Request resolved: https://github.com/pytorch/pytorch/pull/113332
Approved by: https://github.com/H-Huang, https://github.com/wconstab
This commit is contained in:
Simon Fan 2024-01-12 22:41:05 +00:00 committed by PyTorch MergeBot
parent c329eddcb9
commit 4b25948ee6

View file

@ -683,7 +683,10 @@ def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
)
if args.export_profiler_trace:
name = args.profiler_trace_name + "_" + model.name + ".json"
name = args.profiler_trace_name + "_" + model.name
if hasattr(args, "rank"):
name += f"_rank_{args.rank}"
name += ".json"
name = os.path.join(torch._dynamo.config.base_dir, name)
p.export_chrome_trace(name)
median = np.median(timings, axis=0)
@ -2233,7 +2236,7 @@ class BenchmarkRunner:
return ModuleWrapPolicy(MODEL_FSDP_WRAP[model_name])
def deepcopy_and_maybe_ddp(self, model):
def deepcopy_and_maybe_parallelize(self, model):
model = self.deepcopy_model(model)
if self.args.ddp:
assert (
@ -2329,7 +2332,7 @@ class BenchmarkRunner:
inputs_fp64 = None
try:
model_fp64, inputs_fp64 = cast_to_fp64(
self.deepcopy_and_maybe_ddp(model),
self.deepcopy_and_maybe_parallelize(model),
clone_inputs(example_inputs),
)
self.init_optimizer(name, current_device, model_fp64.parameters())
@ -2363,7 +2366,7 @@ class BenchmarkRunner:
reset_rng_state()
model_copy = None
try:
model_copy = self.deepcopy_and_maybe_ddp(model)
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs)
@ -2384,7 +2387,7 @@ class BenchmarkRunner:
reset_rng_state()
model_copy = None
try:
model_copy = self.deepcopy_and_maybe_ddp(model)
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_rerun_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs)
@ -2431,7 +2434,7 @@ class BenchmarkRunner:
torch._dynamo.reset()
model_copy = None
try:
model_copy = self.deepcopy_and_maybe_ddp(model)
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
if self.args.export or self.args.export_aot_inductor:
# apply export on module directly
@ -2615,7 +2618,7 @@ class BenchmarkRunner:
model, example_inputs = self.maybe_cast(model, example_inputs)
# Use distributed wrapping as necessary
model = self.deepcopy_and_maybe_ddp(model)
model = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model.parameters())
with self.pick_grad(name, self.args.training):
@ -3409,12 +3412,6 @@ def run(runner, args, original_dir=None):
CI, args.backend, training=args.training, dynamic=args.dynamic_shapes
)
if args.ddp:
# TODO: we could also hook DDP bench up to --speedup bench, _not_ for mgpu e2e perf,
# but just to measure impact on singlenode of performing graph-breaks.
# Left it as a follow up to keep this PR isolated.
assert (
args.accuracy
), "DDP benchmark is currently only hooked up to --accuracy bench"
assert args.training, "DDP benchmark requires --training mode"
if args.no_optimize_ddp:
torch._dynamo.config.optimize_ddp = False