diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py index b88d6ca8f57..87fbdaeb833 100644 --- a/benchmarks/dynamo/common.py +++ b/benchmarks/dynamo/common.py @@ -2,7 +2,6 @@ from __future__ import annotations -import abc import argparse import collections import contextlib @@ -22,13 +21,10 @@ import sys import time import weakref from contextlib import contextmanager -from pathlib import Path -from typing import Any, Callable, NamedTuple, Optional, TYPE_CHECKING -from typing_extensions import Self +from typing import Any, NamedTuple, TYPE_CHECKING from unittest.mock import MagicMock import numpy as np -import numpy.typing as npt import pandas as pd import psutil import yaml @@ -86,9 +82,7 @@ except ImportError: if TYPE_CHECKING: - from collections.abc import Generator, Mapping, Sequence - - from torch.onnx._internal.fx import diagnostics + from collections.abc import Mapping log = logging.getLogger(__name__) @@ -106,7 +100,6 @@ current_mode = "" current_dtype = "" current_quantization = "" current_settings = None -current_onnx_compiler = "" current_batch_size = None output_filename = None disable_output = False @@ -1277,170 +1270,6 @@ def speedup_experiment_ds(args, model_iter_fn, model, example_inputs): return output_str -@contextlib.contextmanager -def override_synchronize_with_onnx_iobinding(iobinding): - global synchronize - prev_synchrnoize = synchronize - try: - if iobinding is not None: - - def new_synchronize(): - iobinding.synchronize_inputs() - iobinding.synchronize_outputs() - - synchronize = new_synchronize - yield - finally: - synchronize = prev_synchrnoize - - -def speedup_experiment_onnx( - args, - model_iter_fn, - onnx_model: OnnxModel, - model, - example_inputs, - **kwargs, -): - """ - Measure speedups over eager. - - This function is responsible for the following: - 1. Creating iobinding with OnnxModel if device is CUDA, which is essential for perf measurement. - 2. Running ORT with OnnxModel. - - Writes to ./{output_filename}, which should be - `Path(self.output_dir) / f"{self.compiler}_{suite}_{self.dtype}_{self.mode}_{self.device}_{self.testing}.csv". - - TODO(bowbao): Record export time and export peak memory usage. - """ - timings = np.zeros((args.repeat, 2), np.float64) - is_correct = True - should_randomize_input = args.randomize_input - times = args.iterations_per_run - - def create_onnx_input_binded_fn(onnx_model: OnnxModel, pt_inputs, example_outputs): - # Goal is to move the iobinding creation outside of the timer function. - iobinding, outputs = onnx_model.create_iobinding(pt_inputs, example_outputs) - - def onnxrt_model_iter_fn(model, inputs, collect_outputs=True): - onnx_model.run_with_iobinding(iobinding, outputs) - if collect_outputs: - return outputs - - return onnxrt_model_iter_fn, iobinding - - def create_onnx_fn(onnx_model: OnnxModel, pt_inputs): - # NOTE: Making perf comparison fair by moving out the i/o adapting part. - # 1. Pre-adapt `pt_inputs` to `onnx_inputs` here. - # 2. Drop `onnx_outputs` to `pt_outputs` adapting. Output comparison is not part of perf measurement. - onnx_inputs = onnx_model.adapt_pt_inputs_to_onnx(pt_inputs) - - def onnxrt_model_iter_fn(model, inputs, collect_outputs=True): - return onnx_model.run_with_onnx_inputs(onnx_inputs) - - return onnxrt_model_iter_fn - - def timed_onnx(model, onnx_model: OnnxModel, inputs): - if current_device == "cpu" or onnx_model.is_cpu(): - onnxrt_model_iter_fn = create_onnx_fn(onnx_model, inputs) - iobinding = None - else: - onnxrt_model_iter_fn, iobinding = create_onnx_input_binded_fn( - onnx_model, inputs, expected_output - ) - with override_synchronize_with_onnx_iobinding(iobinding): - return timed( - model, - onnxrt_model_iter_fn, - inputs, - return_result=True, - times=times, - collect_outputs=args.collect_outputs, - ) - - # Insert ONNX warm-up - inputs = ( - randomize_input(copy.deepcopy(example_inputs)) - if should_randomize_input - else example_inputs - ) - _, expected_output = timed( - model, - model_iter_fn, - inputs, - return_result=True, - times=times, - collect_outputs=args.collect_outputs, - ) - for _ in range(2): - timed_onnx(model, onnx_model, inputs) - - for rep in range(args.repeat): - inputs = ( - randomize_input(copy.deepcopy(example_inputs)) - if should_randomize_input - else example_inputs - ) - if torch.cuda.device_count() > 1: - # Manually set correct torch.cuda.current_device to ensure torch.cuda.synchronize() works as intended. - # When there are more than 1 cuda devices, the first one is used for pytorch eager. - # The second one is used for onnx ort. - torch.cuda.set_device(0) - timings[rep, 0], expected_output = timed( - model, - model_iter_fn, - inputs, - return_result=True, - times=times, - collect_outputs=args.collect_outputs, - ) - if torch.cuda.device_count() > 1: - # Manually set correct torch.cuda.current_device to ensure torch.cuda.synchronize() works as intended. - # When there are more than 1 cuda devices, the first one is used for pytorch eager. - # The second one is used for onnx ort. - torch.cuda.set_device(1) - timings[rep, 1], actual_output = timed_onnx(model, onnx_model, inputs) - - pvalue = ttest_ind(timings[:, 0], timings[:, 1]).pvalue - median = np.median(timings, axis=0) - speedup = median[0] / median[1] - if args.dump_raw_metrics: - np.save( - f"{output_filename[:-4]}-raw_timings-{current_name}-{current_device}.npy", - timings, - ) - - headers = ["dev", "name", "batch_size", "speedup", "abs_latency"] - row = [ - current_device, - current_name, - current_batch_size, - float(speedup), - median[1] * 1000, - ] - if "compilation_latency" in kwargs: - headers = headers + ["compilation_latency", "compression_ratio"] - row.append(kwargs["compilation_latency"]) - row.append(kwargs["compression_ratio"]) - - write_outputs( - output_filename, - headers, - row, - ) - headers, data = torch._dynamo.utils.compile_times(repr="csv", aggregate=True) - assert ( - output_filename.find(".csv") > 0 - ), f"expected output_filename to be a .csv, but got {output_filename}" - write_outputs( - output_filename[:-4] + "_compilation_metrics.csv", - ["dev", "name", "batch_size"] + headers, - [current_device, current_name, current_batch_size] + data, - ) - return format_speedup(speedup, pvalue, is_correct=is_correct) - - def overhead_experiment(*args, model_iter_fn): """ Measure overheads of TorchDynamo by running with no backend (only @@ -1683,685 +1512,6 @@ def download_retry_decorator(download_fn): return wrapper -class OnnxModel(abc.ABC): - TORCH_TO_NUMPY_DTYPE = { - torch.float16: np.float16, - torch.float32: np.float32, - torch.float64: np.float64, - torch.uint8: np.uint8, - torch.int8: np.int8, - torch.int16: np.int16, - torch.int32: np.int32, - torch.int64: np.longlong, - torch.bool: np.bool_, - } - - _COMPILER_NAME: str - - def __init__( - self, - output_directory, - model, - example_inputs, - dynamic_shapes: bool, - copy_before_export: bool = False, - use_experimental_patch: bool = False, - ): - """The abstract class for exporting ONNX model. - - Args: - output_directory: output path - model: model - example_inputs: example inputs for exporting - dynamic_shapes (bool): Whether to export the model with dynamic shapes. - copy_before_export (bool,): copy before export. Defaults to False. - use_experimental_patch (bool): Whether to apply torch_onnx patch which exports - with torch.export and onnx ir. Defaults to False. - """ - model_name = current_name - self.copy_before_export = copy_before_export - self.use_experimental_patch = use_experimental_patch - # NOTE: torch_onnx patch is using OnnxModelFromTorchScript to export ONNX model. - if self.use_experimental_patch: - self._COMPILER_NAME = "torch_onnx_patch" - self.model_dir = self._generate_onnx_model_directory( - output_directory, self._COMPILER_NAME, model_name - ) - self.model_path = str( - self.model_dir / f"{model_name}_{self._COMPILER_NAME}.onnx" - ) - - def _determine_deepcopy_target_device(self): - if current_device == "cpu": - target_device = "cpu" - else: - if torch.cuda.device_count() > 1: - # Copy to another cuda device to avoid OOM. - target_device = "cuda:1" - else: - target_device = "cuda" - return target_device - - def deepcopy_model_and_inputs_to_device(self, model, example_inputs, target_device): - # Deepcopy model before export to avoid modification to baseline model. - # To avoid OOM, the model is first moved to CPU. Both models are then moved to device. - model_device = next(model.parameters()).device - model.to("cpu") - model_copy = copy.deepcopy(model).to(target_device) - model.to(model_device) - - target_device_example_inputs = tree_map_only( - torch.Tensor, lambda x: x.to(device=target_device), example_inputs - ) - - return model_copy, target_device_example_inputs - - @classmethod - def _generate_onnx_model_directory( - cls, output_directory: str, compiler_name: str, model_name: str - ) -> Path: - model_path = Path( - output_directory, - ".onnx_models", - model_name, - compiler_name, - ) - if model_path.exists() and model_path.is_dir(): - shutil.rmtree(model_path) - model_path.mkdir(parents=True, exist_ok=True) - return model_path - - @abc.abstractmethod - def format_pt_inputs(self, pt_inputs: Any) -> Sequence[torch.Tensor]: ... - - @abc.abstractmethod - def format_pt_outputs(self, pt_outputs: Any) -> Sequence[torch.Tensor]: ... - - def adapt_pt_inputs_to_onnx(self, pt_inputs) -> Mapping[str, npt.NDArray]: - pt_inputs = self.format_pt_inputs(pt_inputs) - return { - ort_input.name: pt_input.cpu().numpy() - for ort_input, pt_input in zip(self.onnx_session.get_inputs(), pt_inputs) - } - - def adapt_onnx_outputs_to_pt(self, onnx_outputs: list[npt.NDArray]) -> Any: - pt_outputs = [ - torch.from_numpy(onnx_output).to(current_device) - for onnx_output in onnx_outputs - ] - if len(pt_outputs) == 1: - return pt_outputs[0] - return pt_outputs - - def _init_ort_session(self, model_path: str): - import onnxruntime - - if current_device == "cpu": - ort_providers = ["CPUExecutionProvider"] - else: - # NOTE(bowbao): Reduce OOM by running ORT on another gpu. - # TODO(bowbao): This works to avoid OOM, but performance is surprisingly very bad. - cuda_provider_options = { - "device_id": 1 if torch.cuda.device_count() > 1 else 0, - } - ort_providers = [("CUDAExecutionProvider", cuda_provider_options)] - session_options = onnxruntime.SessionOptions() - session_options.log_severity_level = 3 # Error - - ort_session = onnxruntime.InferenceSession( - self.model_path, - providers=ort_providers, - sess_options=session_options, - ) - return ort_session - - def is_cpu(self) -> bool: - return self.onnx_session.get_providers()[0] == "CPUExecutionProvider" - - def cpu(self) -> Self: - self.onnx_session.set_providers(["CPUExecutionProvider"]) - return self - - def create_outputs(self, *example_outputs): - return tuple(torch.empty_like(x) for x in example_outputs) - - def create_iobinding(self, pt_inputs, example_outputs): - pt_inputs = self.format_pt_inputs(pt_inputs) - example_outputs = self.format_pt_outputs(example_outputs) - - iobinding = self.onnx_session.io_binding() - args = [arg.contiguous() for arg in pt_inputs] - for ort_input, arg in zip(self.onnx_session.get_inputs(), args): - # NOTE: Run ORT on another cuda device to reduce OOM. - if torch.cuda.device_count() > 1: - arg = arg.detach().to("cuda:1") - device = arg.device - iobinding.bind_input( - ort_input.name, - device.type, - device.index or 0, - self.TORCH_TO_NUMPY_DTYPE[arg.dtype], - arg.size(), - arg.data_ptr(), - ) - - outputs = self.create_outputs(*example_outputs) - for ort_output, output in zip(self.onnx_session.get_outputs(), outputs): - if torch.cuda.device_count() > 1: - output = output.detach().to("cuda:1") - device = output.device - iobinding.bind_output( - ort_output.name, - device.type, - device.index or 0, - self.TORCH_TO_NUMPY_DTYPE[output.dtype], - output.size(), - output.data_ptr(), - ) - return iobinding, outputs - - def run_with_iobinding(self, iobinding, outputs): - # 'outputs' are torch empty tensors binded to 'iobinding'. - self.onnx_session.run_with_iobinding(iobinding) - return outputs - - def run_with_onnx_inputs(self, onnx_inputs): - return self.onnx_session.run(None, onnx_inputs) - - @classmethod - def save_tensor_data(cls, numpy_tensor, output_path): - from onnx import numpy_helper - - proto_tensor = numpy_helper.from_array(numpy_tensor) - with open(output_path, "wb") as f: - f.write(proto_tensor.SerializeToString()) - - def run_and_serialize_inputs_outputs(self, pt_inputs): - test_data_dir = self.model_dir / "test_data_set_0" - test_data_dir.mkdir(parents=True, exist_ok=True) - - onnx_inputs = self.adapt_pt_inputs_to_onnx(pt_inputs) - for i, onnx_input in enumerate(onnx_inputs.values()): - self.save_tensor_data(onnx_input, str(test_data_dir / f"input_{i}.pb")) - - onnx_outputs = self.run_with_onnx_inputs(onnx_inputs) - - for i, onnx_output in enumerate(onnx_outputs): - self.save_tensor_data(onnx_output, str(test_data_dir / f"output_{i}.pb")) - - return self.adapt_onnx_outputs_to_pt(onnx_outputs) - - def run(self, pt_inputs): - # NOTE: For CUDA performance testing, use `run_with_iobinding` to exclude memory - # copying overhead for inputs/outputs between cpu and gpu. - # Otherwise perf number is inaccurate. - onnx_inputs = self.adapt_pt_inputs_to_onnx(pt_inputs) - onnx_outputs = self.run_with_onnx_inputs(onnx_inputs) - return self.adapt_onnx_outputs_to_pt(onnx_outputs) - - -class OnnxModelFromTorchScript(OnnxModel): - """TorchScript based onnx export. `torch.onnx.export` - - TODO(bowbao): - * large model export failed. - Onnx Model is larger than 2GB, but exporter makes decision based pt model size, which is - smaller than 2GB. - * OOM on slightly larger model. - Both pt model and ort inference session are on gpu. Attempt has been made to move ORT to - cuda:1, however ORT perf drop significantly. - For now running everything with batch_size 1 set in launch script. - """ - - _COMPILER_NAME = "torchscript" - - def __init__( - self, output_directory, model, example_inputs, dynamic_shapes: bool, **kwargs - ): - if dynamic_shapes: - raise NotImplementedError("NYI dynamic shapes for OnnxModelFromTorchScript") - super().__init__( - output_directory, model, example_inputs, dynamic_shapes, **kwargs - ) - self._export( - model, - example_inputs, - self.model_path, - opset_version=17, - do_constant_folding=False, - verbose=False, - ) - self.onnx_session = self._init_ort_session(self.model_path) - - def _export(self, model, example_inputs, output_path: str, /, **kwargs) -> None: - if self.copy_before_export: - # Deepcopy model before export to avoid modification to baseline model. - model, example_inputs = self.deepcopy_model_and_inputs_to_device( - model, example_inputs, self._determine_deepcopy_target_device() - ) - - # Hack for huggingface models (kwargs only). - if isinstance(example_inputs, dict): - - class WrapperModel(torch.nn.Module): - def __init__(self, model, keys): - super().__init__() - self.model = model - self.keys = keys - - def forward(self, *args): - return self.model(**dict(zip(self.keys, args))) - - model = WrapperModel(model, list(example_inputs.keys())) - - if self.use_experimental_patch: - import torch_onnx - - torch_onnx.patch_torch( - error_report=True, - profile=True, - dump_exported_program=True, - artifacts_dir=os.path.dirname(output_path), - ) - else: - # make sure the patch is not in effect - try: - import torch_onnx - - torch_onnx.unpatch_torch() - except ImportError: - pass - - torch.onnx.export( - model, - self.format_pt_inputs(example_inputs), - output_path, - **kwargs, - ) - - def format_pt_inputs(self, pt_inputs): - # NOTE(bowbao): For huggingface benchmark, pt_inputs are formatted as dictionary, - # and consumed like `model(**pt_inputs)`. - # For other benchmarks, pt_inputs are formatted as tuple and consumed - # like `model(*pt_inputs)`. - if isinstance(pt_inputs, dict): - pt_inputs = list(pt_inputs.values()) - if isinstance(pt_inputs, torch.Tensor): - pt_inputs = (pt_inputs,) - return tuple(arg.contiguous() for arg in pt_inputs) - - def format_pt_outputs(self, pt_outputs): - if isinstance(pt_outputs, torch.Tensor): - pt_outputs = (pt_outputs,) - - pt_outputs = pytree.tree_leaves(pt_outputs) - - # Hack for huggingface model outputs - try: - from transformers import modeling_outputs - except ImportError: - pass - else: - - def _to_tuple(x): - if isinstance(x, modeling_outputs.ModelOutput): - return x.to_tuple() - return x - - pt_outputs = pytree.tree_map(_to_tuple, pt_outputs) - pt_outputs = pytree.tree_leaves(pt_outputs) - - return pt_outputs - - -class OnnxModelFromDynamo(OnnxModel): - """Dynamo and Fx based export. `torch.onnx.dynamo_export`.""" - - _COMPILER_NAME = "dynamo" - - def __init__( - self, output_directory, model, example_inputs, dynamic_shapes: bool, **kwargs - ): - super().__init__( - output_directory, model, example_inputs, dynamic_shapes, **kwargs - ) - self._dynamic_shapes = dynamic_shapes - self._onnx_program = self._export(model, example_inputs, self.model_path) - # Clear the model proto to save memory. - # The model proto is saved to disk and no longer needed from `onnx_program`. - # `onnx_program` is kept for i/o adapter usage. - self._onnx_program.model_proto.Clear() - self.onnx_session = self._init_ort_session(self.model_path) - - def _export( - self, model, example_inputs, output_path: str - ) -> torch.onnx.ONNXProgram: - if self.copy_before_export: - # Deepcopy model before export to avoid modification to baseline model. - model, example_inputs = self.deepcopy_model_and_inputs_to_device( - model, example_inputs, self._determine_deepcopy_target_device() - ) - - example_args, example_kwargs = _normalize_bench_inputs(example_inputs) - options = torch.onnx.ExportOptions(dynamic_shapes=self._dynamic_shapes) - onnx_program = torch.onnx.dynamo_export( - model, *example_args, **example_kwargs, export_options=options - ) - - onnx_program.save(output_path) - return onnx_program - - def format_pt_inputs(self, pt_inputs): - pt_args, pt_kwargs = _normalize_bench_inputs(pt_inputs) - return self._onnx_program.adapt_torch_inputs_to_onnx(*pt_args, **pt_kwargs) - - def format_pt_outputs(self, pt_outputs): - return self._onnx_program.adapt_torch_outputs_to_onnx(pt_outputs) - - -class OnnxModelFromDynamoAotInline(OnnxModelFromDynamo): - """Dynamo and Fx based export, with AOT inline post export. `torch.onnx.dynamo_export`.""" - - _COMPILER_NAME = "dynamo_aot_inline" - - def _export( - self, model, example_inputs, output_path: str - ) -> torch.onnx.ONNXProgram: - if self.copy_before_export: - # Deepcopy model before export to avoid modification to baseline model. - model, example_inputs = self.deepcopy_model_and_inputs_to_device( - model, example_inputs, self._determine_deepcopy_target_device() - ) - - example_args, example_kwargs = _normalize_bench_inputs(example_inputs) - options = torch.onnx.ExportOptions(dynamic_shapes=self._dynamic_shapes) - onnx_program = torch.onnx.dynamo_export( - model, *example_args, **example_kwargs, export_options=options - ) - # Apply AOT inline post export. - # Requires onnx >= 1.15 - import onnx - import onnx.inliner - - # Workaround for inliner not supporting with models larger than 2GB. - # Save model to disk first separating out external data, - # and load back without external data for inliner to work on. - model_proto = onnx_program.model_proto - onnx.save_model(model_proto, output_path, save_as_external_data=True) - model_proto = onnx.load(output_path, load_external_data=False) - model_proto = onnx.inliner.inline_local_functions(model_proto) - onnx.save_model(model_proto, output_path) - return onnx_program - - -class OnnxModelFromDynamoAotOptimize(OnnxModelFromDynamo): - """Dynamo and Fx based export, with AOT optimize post export. `torch.onnx.dynamo_export`.""" - - _COMPILER_NAME = "dynamo_aot_optimize" - - def _export( - self, model, example_inputs, output_path: str - ) -> torch.onnx.ONNXProgram: - if self.copy_before_export: - # Deepcopy model before export to avoid modification to baseline model. - model, example_inputs = self.deepcopy_model_and_inputs_to_device( - model, example_inputs, self._determine_deepcopy_target_device() - ) - - example_args, example_kwargs = _normalize_bench_inputs(example_inputs) - options = torch.onnx.ExportOptions(dynamic_shapes=self._dynamic_shapes) - export_output = torch.onnx.dynamo_export( - model, *example_args, **example_kwargs, export_options=options - ) - - import onnx - from onnxscript.rewriter.onnxruntime import rewrite - - model_proto = rewrite(export_output.model_proto) - onnx.save_model( - model_proto, - output_path, - save_as_external_data=True, - all_tensors_to_one_file=True, - ) - - return export_output - - -class _OnnxPatch: - @classmethod - def patch_non_tensor_outputs(cls, correct_result, new_result, fp64_outputs): - """Patch non-tensor outputs to make them comparable with the correct result. - - ONNX model always returns a flat tuple of tensors, but the PyTorch model outputs - `correct_result` and `fp64_outputs` can be arbitrary types. This function normalizes - the outputs to make them comparable with the ONNX model output. - """ - try: - from transformers import modeling_outputs - except ImportError: - has_transformers = False - else: - has_transformers = True - - if has_transformers and isinstance( - correct_result, modeling_outputs.ModelOutput - ): - correct_result = correct_result.to_tuple() - fp64_outputs = fp64_outputs.to_tuple() if fp64_outputs is not None else None - elif type(correct_result).__name__ in ( - "MaskedLMOutput", - "Seq2SeqLMOutput", - "CausalLMOutputWithCrossAttentions", - "LongformerMaskedLMOutput", - "Instances", - "SquashedNormal", - "Boxes", - "Normal", - "TanhTransform", - "Foo", - "Variable", - ): - # Copied from `same` function in `torch._dynamo.utils` - correct_result = [ - value - for key in correct_result.__dict__.keys() - if (value := getattr(correct_result, key)) is not None - ] - fp64_outputs = ( - [ - value - for key in fp64_outputs.__dict__.keys() - if (value := getattr(fp64_outputs, key)) is not None - ] - if fp64_outputs is not None - else None - ) - - # Flatten nested tuple of tensors, i.e. past_key_values - correct_result = pytree.tree_leaves(correct_result) - # Hack to put results from different runs on same device. - # This is needed for ONNX CPU fallback benchmark, where PyTorch eager is run on GPU. - # Assuming outputs from a single run are always on same device! - devices = [x.device for x in correct_result if isinstance(x, torch.Tensor)] - assert devices and all( - x == devices[0] for x in devices - ), "All tensors must be on same device!" - device = devices[0] - new_result = pytree.tree_leaves(new_result) - new_result = pytree.tree_map( - lambda x: x.to(device=device) if isinstance(x, torch.Tensor) else x, - new_result, - ) - fp64_outputs = pytree.tree_leaves(fp64_outputs) - - return correct_result, new_result, fp64_outputs - - -@dataclasses.dataclass -class OnnxExportErrorRow: - device: str - model_name: str - batch_size: int - rule_id: Optional[str] = None - rule_name: Optional[str] = None - diagnostic_level: Optional[str] = None - diagnostic_message: Optional[str] = None - exception_type_name: Optional[str] = None - exception_message: Optional[str] = None - - def __post_init__(self): - assert ( - self.rule_id is not None - and self.rule_name is not None - and self.diagnostic_level is not None - and self.diagnostic_message is not None - ) or self.exception_type_name, ( - "Either rule_id, rule_name, diagnostic_level and diagnostic_message " - "must be set or exception_type_name must be set" - ) - - @property - def headers(self) -> list[str]: - return [field.name for field in dataclasses.fields(self)] - - @property - def row(self) -> list[str]: - return [getattr(self, field.name) for field in dataclasses.fields(self)] - - -class OnnxExportErrorParser: - def __init__(self, device: str, model_name: str, batch_size: int): - self.device = device - self.model_name = model_name - self.batch_size = batch_size - - def _qualified_exception_class_name(self, exception: Exception) -> str: - if exception.__class__.__module__ == "builtins": - return exception.__class__.__name__ - return f"{exception.__class__.__module__}.{exception.__class__.__name__}" - - def parse_diagnostic_context( - self, - diagnostic_context: diagnostics.DiagnosticContext, - ) -> Generator[OnnxExportErrorRow, Any, Any]: - from torch.onnx._internal.fx import diagnostics - - for diagnostic in diagnostic_context.diagnostics: - if diagnostic.level >= diagnostics.levels.ERROR: - yield OnnxExportErrorRow( - device=self.device, - model_name=self.model_name, - batch_size=self.batch_size, - rule_id=diagnostic.rule.id, - rule_name=diagnostic.rule.name, - diagnostic_level=diagnostic.level.name, - diagnostic_message=diagnostic.message, - ) - - def parse_exception(self, exception: Exception) -> OnnxExportErrorRow: - return OnnxExportErrorRow( - device=self.device, - model_name=self.model_name, - batch_size=self.batch_size, - exception_type_name=self._qualified_exception_class_name(exception), - exception_message=str(exception), - ) - - -@dataclasses.dataclass -class OnnxContext: - onnx_model: Optional[OnnxModel] = None - - -def optimize_onnx_ctx( - output_directory: str, - onnx_model_cls: type[OnnxModel], - run_n_iterations: Callable, - dynamic_shapes: bool = False, - copy_before_export: bool = False, - use_experimental_patch: bool = False, -) -> Callable: - # NOTE(bowbao): This function creates and returns the onnx version of 'run_n_iterations', - # which does the following: - # 1. Export and cache model. - # 2. Create iobinding for ORT. - # 3. Run ORT for n iterations. - # The cached model is stored in 'context' under the returned callable. - context = OnnxContext() - test_data_dumped = False - - def run_n_iterations_onnx(model, inputs, n=2): - from torch.onnx._internal import _exporter_legacy - from torch.onnx._internal.fx import diagnostics - - # NOTE(bowbao): Capture all export & ort errors and diagnostics. - # Serialize to csv, to be parsed and summarized later by '._onnx/reporter.py'. - # TODO: Accuracy mismatch is not reported here in csv. - assert ( - output_filename.find(".csv") > 0 - ), f"expected output_filename to be a .csv, but got {output_filename}" - output_error_filename = output_filename[:-4] + "_export_error.csv" - parser = OnnxExportErrorParser(current_device, current_name, current_batch_size) - try: - nonlocal context - if context.onnx_model is None: - context.onnx_model = onnx_model_cls( - output_directory, - model, - copy.deepcopy(inputs), - dynamic_shapes=dynamic_shapes, - copy_before_export=copy_before_export, - use_experimental_patch=use_experimental_patch, - ) - onnx_model = context.onnx_model - - for _ in range(n): - nonlocal test_data_dumped - if not test_data_dumped: - # Serializes inputs and outputs to .pb files for further offline analysis. - # Due to this, this function is not and should not be used for perf measurement. - outputs = onnx_model.run_and_serialize_inputs_outputs(inputs) - test_data_dumped = True - else: - outputs = onnx_model.run(inputs) - return outputs - except _exporter_legacy.OnnxExporterError as e: - # `torch.onnx.dynamo_export` raises error that encloses diagnostics. - diagnostic_context = e.onnx_program.diagnostic_context - for parsed_error in parser.parse_diagnostic_context(diagnostic_context): - write_outputs( - output_error_filename, parsed_error.headers, parsed_error.row - ) - if context.onnx_model is not None: - e.onnx_program.save_diagnostics( - f"{context.onnx_model.model_dir}/" - f"{current_onnx_compiler}_{current_name}_{current_device}.sarif" - ) - - # Check also the raw exception that caused export failure. - # Skip if it is already analyzed by diagnostics. - cause_of_exception = e.__cause__ - if not isinstance( - cause_of_exception, diagnostics.RuntimeErrorWithDiagnostic - ): - parsed_error = parser.parse_exception(cause_of_exception) - write_outputs( - output_error_filename, parsed_error.headers, parsed_error.row - ) - raise - except Exception as e: - # `torch.onnx.export` errors. - # ORT errors. - parsed_error = parser.parse_exception(e) - write_outputs(output_error_filename, parsed_error.headers, parsed_error.row) - raise - - run_n_iterations_onnx.context = context - - return run_n_iterations_onnx - - def read_batch_size_from_file(args, filename, model_name): batch_size = None if os.path.exists("benchmarks"): @@ -3095,26 +2245,6 @@ class BenchmarkRunner: if name in self.skip_accuracy_check_as_eager_non_deterministic: return record_status("pass_due_to_skip", dynamo_start_stats=start_stats) - if ( - current_onnx_compiler == "torchscript" - or current_onnx_compiler == "dynamo" - ): - # Workaround for ONNX for non-tensor outputs - ( - correct_result, - new_result, - fp64_outputs, - ) = _OnnxPatch.patch_non_tensor_outputs( - correct_result, new_result, fp64_outputs - ) - # Relax tolerance for ONNX cuda - if current_device == "cuda": - tolerance = 1e-2 - - # TODO: store correct_result into the dumped file for offline onnx model validation. - # The downside and potential problem, is that the output formats may be different. - # E.g., the output order might not match, None might be part of output, etc. - force_max_multiplier = False if ( self.args.freezing @@ -3378,10 +2508,6 @@ class BenchmarkRunner: dynamo_cache_lookup_latency ) - if experiment.func is speedup_experiment_onnx: - experiment = functools.partial( - experiment, optimized_model_iter_fn.context.onnx_model - ) backend_timings = experiment( model, example_inputs, mark="expected", **experiment_kwargs ) @@ -3552,11 +2678,6 @@ class BenchmarkRunner: f"{ok:3}/{total:3} +{frames_third_pass} frames {compilation_time:3.0f}s" ) - if experiment.func is speedup_experiment_onnx: - experiment = functools.partial( - experiment, optimized_model_iter_fn.context.onnx_model - ) - if not hasattr(model, name): model.name = name results.append(experiment(model, example_inputs, **experiment_kwargs)) @@ -4209,36 +3330,6 @@ def parse_args(args=None): group.add_argument( "--xla", action="store_true", help="Compare TorchXLA to eager PyTorch" ) - group.add_argument( - "--torchscript-onnx", - "--torchscript_onnx", - action="store_true", - help="Measure speedup with TorchScript ONNX, i.e. `torch.onnx.export`", - ) - group.add_argument( - "--torch-onnx-patch", - "--torch_onnx_patch", - action="store_true", - help="Measure speedup with dynamo ONNX patch, i.e. `torch_onnx`", - ) - group.add_argument( - "--dynamo-onnx", - "--dynamo_onnx", - action="store_true", - help="Measure speedup with Dynamo ONNX, i.e. `torch.onnx.dynamo_export`", - ) - group.add_argument( - "--dynamo-onnx-aot-inline", - "--dynamo_onnx_aot_inline", - action="store_true", - help="Measure speedup with Dynamo ONNX AOT Inline, i.e. `torch.onnx.dynamo_export`", - ) - group.add_argument( - "--dynamo-onnx-aot-optimize", - "--dynamo_onnx_aot_optimize", - action="store_true", - help="Measure speedup with Dynamo ONNX w/ ort fusions, i.e. `torch.onnx.dynamo_export`", - ) group.add_argument( "--backend", choices=torch._dynamo.list_backends(exclude_tags=None), @@ -4591,8 +3682,7 @@ def run(runner, args, original_dir=None): current_settings, \ output_filename, \ disable_output, \ - optimize_ctx, \ - current_onnx_compiler + optimize_ctx optimize_ctx = contextlib.nullcontext() if args.disable_output: @@ -4625,60 +3715,6 @@ def run(runner, args, original_dir=None): torch._dynamo.mark_dynamic = MagicMock() experiment = xla output_filename = "xla.csv" - elif args.torchscript_onnx: - optimize_ctx = functools.partial( - optimize_onnx_ctx, - args.output_directory or ".", - OnnxModelFromTorchScript, - copy_before_export=args.performance, # Accuarcy bench already did deepcopy - ) - experiment = speedup_experiment_onnx - output_filename = "torchscript_onnx.csv" - current_onnx_compiler = "torchscript" - elif args.torch_onnx_patch: - optimize_ctx = functools.partial( - optimize_onnx_ctx, - args.output_directory or ".", - OnnxModelFromTorchScript, - copy_before_export=args.performance, - use_experimental_patch=True, - ) - experiment = speedup_experiment_onnx - output_filename = "torch_onnx_patch.csv" - current_onnx_compiler = "dynamo" - elif args.dynamo_onnx: - optimize_ctx = functools.partial( - optimize_onnx_ctx, - args.output_directory or ".", - OnnxModelFromDynamo, - dynamic_shapes=args.dynamic_shapes, - copy_before_export=args.performance, - ) - experiment = speedup_experiment_onnx - output_filename = "dynamo_onnx.csv" - current_onnx_compiler = "dynamo" - elif args.dynamo_onnx_aot_inline: - optimize_ctx = functools.partial( - optimize_onnx_ctx, - args.output_directory or ".", - OnnxModelFromDynamoAotInline, - dynamic_shapes=args.dynamic_shapes, - copy_before_export=args.performance, - ) - experiment = speedup_experiment_onnx - output_filename = "dynamo_onnx_aot_inline.csv" - current_onnx_compiler = "dynamo" - elif args.dynamo_onnx_aot_optimize: - optimize_ctx = functools.partial( - optimize_onnx_ctx, - args.output_directory or ".", - OnnxModelFromDynamoAotOptimize, - dynamic_shapes=args.dynamic_shapes, - copy_before_export=args.performance, - ) - experiment = speedup_experiment_onnx - output_filename = "dynamo_onnx_aot_optimize.csv" - current_onnx_compiler = "dynamo" elif args.speedup_dynamo_ts: optimize_ctx = torch._dynamo.optimize("ts", nopython=args.nopython) experiment = speedup_experiment diff --git a/benchmarks/dynamo/runner.py b/benchmarks/dynamo/runner.py index 4d49b60ffa4..430815139c7 100755 --- a/benchmarks/dynamo/runner.py +++ b/benchmarks/dynamo/runner.py @@ -90,8 +90,6 @@ TABLE = { "inductor_max_autotune_no_cudagraphs": ( "--inference -n50 --inductor --inductor-compile-mode max-autotune-no-cudagraphs --disable-cudagraphs " ), - "torchscript-onnx": "--inference -n5 --torchscript-onnx", - "dynamo-onnx": "--inference -n5 --dynamo-onnx", }, } diff --git a/scripts/onnx/test.sh b/scripts/onnx/test.sh index f397cb63343..c3b9a2ae618 100755 --- a/scripts/onnx/test.sh +++ b/scripts/onnx/test.sh @@ -6,7 +6,6 @@ UNKNOWN=() # defaults PARALLEL=1 -export TORCH_ONNX_EXPERIMENTAL_RUNTIME_TYPE_CHECK=ERRORS while [[ $# -gt 0 ]] do @@ -48,44 +47,6 @@ if [[ "$SHARD_NUMBER" == "2" ]]; then xdoctest torch.onnx --style=google --options="+IGNORE_WHITESPACE" fi -if [[ "$SHARD_NUMBER" == "2" ]]; then - # Sanity check on torchbench w/ onnx - pip install pandas - log_folder="test/.torchbench_logs" - device="cpu" - modes=("accuracy" "performance") - compilers=("dynamo-onnx" "torchscript-onnx") - suites=("huggingface" "timm_models") - - mkdir -p "${log_folder}" - for mode in "${modes[@]}"; do - for compiler in "${compilers[@]}"; do - for suite in "${suites[@]}"; do - output_file="${log_folder}/${compiler}_${suite}_float32_inference_${device}_${mode}.csv" - bench_file="benchmarks/dynamo/${suite}.py" - bench_args=("--${mode}" --float32 "-d${device}" "--output=${output_file}" "--output-directory=${top_dir}" --inference -n5 "--${compiler}" --no-skip --dashboard --batch-size 1) - # Run only selected model for each suite to quickly validate the benchmark suite works as expected. - case "$suite" in - "torchbench") - bench_args+=(-k resnet18) - ;; - "huggingface") - bench_args+=(-k ElectraForQuestionAnswering) - ;; - "timm_models") - bench_args+=(-k lcnet_050) - ;; - *) - echo "Unknown suite: ${suite}" - exit 1 - ;; - esac - python "${top_dir}/${bench_file}" "${bench_args[@]}" - done - done - done -fi - # Our CI expects both coverage.xml and .coverage to be within test/ if [ -d .coverage ]; then mv .coverage test/.coverage