onnxruntime/tools/ci_build/build.py

3015 lines
129 KiB
Python
Raw Normal View History

2018-11-20 00:48:22 +00:00
#!/usr/bin/env python3
# Copyright (c) Microsoft Corporation. All rights reserved.
# SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
2018-11-20 00:48:22 +00:00
# Licensed under the MIT License.
import argparse
import contextlib
import json
2018-11-20 00:48:22 +00:00
import os
import platform
2018-11-20 00:48:22 +00:00
import re
import shlex
2018-11-20 00:48:22 +00:00
import shutil
import subprocess
import sys
import warnings
from pathlib import Path
2020-04-19 03:48:30 +00:00
def version_to_tuple(version: str) -> tuple:
    """Convert a dotted version string into a tuple of integers.

    The string is split on "." and every piece that ``int()`` accepts is kept,
    in order. Pieces that are not valid integers are skipped instead of raising,
    so ``"1.2rc1.3"`` yields ``(1, 3)`` and ``""`` yields ``()``.
    """
    numbers = []
    for piece in version.split("."):
        try:
            parsed = int(piece)
        except ValueError:
            # Best effort: drop pieces that are not plain integers.
            continue
        numbers.append(parsed)
    return tuple(numbers)
# Directory containing this script, with symlinks resolved by realpath().
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
# Two directory levels above SCRIPT_DIR, normalized (presumably the repository root).
REPO_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "..", ".."))
# Put <REPO_DIR>/tools/python at the front of sys.path. This must run before the
# `util` imports below (presumably resolved from that directory), which is why
# those imports sit after executable code and carry E402 suppressions.
sys.path.insert(0, os.path.join(REPO_DIR, "tools", "python"))
import util.android as android # noqa: E402
from util import get_logger, is_linux, is_macOS, is_windows, run # noqa: E402
# Module-wide logger obtained from the project's get_logger helper under the name "build".
log = get_logger("build")
class BaseError(Exception):
    """Base class for errors originating from build.py."""


class BuildError(BaseError):
    """Error from running build steps.

    Accepts any number of message parts; they are joined with newlines to form
    the exception text.
    """

    def __init__(self, *messages):
        # str() each part so that passing a non-string (an exit code, a Path,
        # a caught exception) cannot raise TypeError from join() and hide the
        # build failure that is actually being reported.
        super().__init__("\n".join(str(message) for message in messages))


class UsageError(BaseError):
    """Usage related error.

    Takes exactly one message, which becomes the exception text.
    """

    def __init__(self, message):
        super().__init__(message)
2020-04-19 03:48:30 +00:00
def _check_python_version():
required_minor_version = 8
if (sys.version_info.major, sys.version_info.minor) < (3, required_minor_version):
raise UsageError(
f"Invalid Python version. At least Python 3.{required_minor_version} is required. "
f"Actual Python version: {sys.version}"
)
def _str_to_bool(s):
"""Convert string to bool (in argparse context)."""
if s.lower() not in ["true", "false"]:
raise ValueError(f"Need bool; got {s!r}")
return {"true": True, "false": False}[s.lower()]
2020-04-19 03:48:30 +00:00
# Executed when the module is loaded: an interpreter older than the required
# version raises UsageError here, before any of the code below runs.
_check_python_version()
2018-11-20 00:48:22 +00:00
def _openvino_verify_device_type(device_read):
choices = ["CPU", "GPU", "NPU"]
[OpenVINO-EP] V3.2 Release (#9232) * model caching changes for 2021.4 Signed-off-by: Your Name <you@example.com> * changed the ov version check * Minor changes added Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Added support for external data format Starting from OpenVINO 2021.4 version, OpenVINO-EP will support onnx models with Weights saved in external file location. Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Introduced Hetero/Multi options for perf_test Enabled to use HETERO/MULTI device feature from OpenVINO-EP using the onnxruntime_perf_test tool. Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * cleaned up CMake code for older OV version support OV 2020.3 is now longer supported by OpenVINO-EP. This check is not required now. Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Add option to disable graph partitioning Added a option to diable graph partitioning during build time for OpenVINO-EP. with this option, when the model is not fully supported on OpenVINO-EP, the model fully fall backs to default CPU EP (MLAS). Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Changed the flag for diabling graph partitioning Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Fixes the flake8 check error Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Added changes for disable graph partition option Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Fixed flake8 indentation error Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> Co-authored-by: Your Name <you@example.com>
2021-10-07 23:02:19 +00:00
choices1 = [
"CPU_NO_PARTITION",
"GPU_NO_PARTITION",
"NPU_NO_PARTITION",
"NPU_NO_CPU_FALLBACK",
]
status_hetero = True
res = False
if device_read in choices:
res = True
elif device_read in choices1:
[OpenVINO-EP] V3.2 Release (#9232) * model caching changes for 2021.4 Signed-off-by: Your Name <you@example.com> * changed the ov version check * Minor changes added Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Added support for external data format Starting from OpenVINO 2021.4 version, OpenVINO-EP will support onnx models with Weights saved in external file location. Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Introduced Hetero/Multi options for perf_test Enabled to use HETERO/MULTI device feature from OpenVINO-EP using the onnxruntime_perf_test tool. Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * cleaned up CMake code for older OV version support OV 2020.3 is now longer supported by OpenVINO-EP. This check is not required now. Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Add option to disable graph partitioning Added a option to diable graph partitioning during build time for OpenVINO-EP. with this option, when the model is not fully supported on OpenVINO-EP, the model fully fall backs to default CPU EP (MLAS). Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Changed the flag for diabling graph partitioning Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Fixes the flake8 check error Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Added changes for disable graph partition option Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Fixed flake8 indentation error Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> Co-authored-by: Your Name <you@example.com>
2021-10-07 23:02:19 +00:00
res = True
elif device_read.startswith(("HETERO:", "MULTI:", "AUTO:")):
res = True
comma_separated_devices = device_read.split(":")
comma_separated_devices = comma_separated_devices[1].split(",")
if len(comma_separated_devices) < 2:
print("At least two devices required in Hetero/Multi/Auto Mode")
status_hetero = False
dev_options = ["CPU", "GPU", "NPU"]
for dev in comma_separated_devices:
if dev not in dev_options:
status_hetero = False
break
def invalid_hetero_build():
print("\nIf trying to build Hetero/Multi/Auto, specify the supported devices along with it.\n")
print("specify the keyword HETERO or MULTI or AUTO followed by the devices ")
print("in the order of priority you want to build\n")
print("The different hardware devices that can be added in HETERO or MULTI or AUTO")
print("are ['CPU','GPU','NPU'] \n")
print("An example of how to specify the hetero build type. Ex: HETERO:GPU,CPU \n")
print("An example of how to specify the MULTI build type. Ex: MULTI:GPU,CPU \n")
print("An example of how to specify the AUTO build type. Ex: AUTO:GPU,CPU \n")
sys.exit("Wrong Build Type selected")
if res is False:
print("\nYou have selected wrong configuration for the build.")
print("pick the build type for specific Hardware Device from following options: ", choices)
[OpenVINO-EP] V3.2 Release (#9232) * model caching changes for 2021.4 Signed-off-by: Your Name <you@example.com> * changed the ov version check * Minor changes added Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Added support for external data format Starting from OpenVINO 2021.4 version, OpenVINO-EP will support onnx models with Weights saved in external file location. Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Introduced Hetero/Multi options for perf_test Enabled to use HETERO/MULTI device feature from OpenVINO-EP using the onnxruntime_perf_test tool. Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * cleaned up CMake code for older OV version support OV 2020.3 is now longer supported by OpenVINO-EP. This check is not required now. Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Add option to disable graph partitioning Added a option to diable graph partitioning during build time for OpenVINO-EP. with this option, when the model is not fully supported on OpenVINO-EP, the model fully fall backs to default CPU EP (MLAS). Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Changed the flag for diabling graph partitioning Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Fixes the flake8 check error Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Added changes for disable graph partition option Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Fixed flake8 indentation error Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> Co-authored-by: Your Name <you@example.com>
2021-10-07 23:02:19 +00:00
print("(or) from the following options with graph partitioning disabled: ", choices1)
print("\n")
if not (device_read.startswith(("HETERO", "MULTI", "AUTO"))):
invalid_hetero_build()
sys.exit("Wrong Build Type selected")
if status_hetero is False:
invalid_hetero_build()
return device_read
2020-04-19 03:48:30 +00:00
def parse_arguments():
class Parser(argparse.ArgumentParser):
# override argument file line parsing behavior - allow multiple arguments per line and handle quotes
def convert_arg_line_to_args(self, arg_line):
return shlex.split(arg_line)
parser = Parser(
2020-04-19 03:48:30 +00:00
description="ONNXRuntime CI build driver.",
usage="""
2020-04-19 03:48:30 +00:00
Default behavior is --update --build --test for native architecture builds.
Default behavior is --update --build for cross-compiled builds.
The Update phase will update git submodules, and run cmake to generate makefiles.
The Build phase will build all projects.
The Test phase will run all unit tests, and optionally the ONNX tests.
Use the individual flags to only run the specified stages.
""",
# files containing arguments can be specified on the command line with "@<filename>" and the arguments within
# will be included at that point
fromfile_prefix_chars="@",
)
2018-11-20 00:48:22 +00:00
# Main arguments
parser.add_argument("--build_dir", required=True, help="Path to the build directory.")
2020-04-19 03:48:30 +00:00
parser.add_argument(
"--config",
nargs="+",
default=["Debug"],
2020-04-19 03:48:30 +00:00
choices=["Debug", "MinSizeRel", "Release", "RelWithDebInfo"],
help="Configuration(s) to build.",
)
parser.add_argument("--update", action="store_true", help="Update makefiles.")
parser.add_argument("--build", action="store_true", help="Build.")
2020-04-19 03:48:30 +00:00
parser.add_argument(
"--clean", action="store_true", help="Run 'cmake --build --target clean' for the selected config/s."
)
2020-04-19 03:48:30 +00:00
parser.add_argument(
"--parallel",
nargs="?",
const="0",
default="1",
type=int,
help="Use parallel build. The optional value specifies the maximum number of parallel jobs. "
"If the optional value is 0 or unspecified, it is interpreted as the number of CPUs.",
)
Flash Attention v2 MHA (#17227) ### Description Integrate Flash Attention V2 to PackedMultiHeadAttention, MultiHeadAttention and Attention operators. Flash Attention v2 source code is from https://github.com/Dao-AILab/flash-attention/tree/main/csrc/flash_attn/src. We did some change to remove dependency on Torch, then removed backward and bfloat16 related code. Add benchmark script (see benchmark_mha.sh) to compare different attention kernels for MultiHeadAttention operator. Current limitations for Flash Attention in PackedMultiHeadAttention, MultiHeadAttention and Attention operators: * Relative Position Bias is not supported * Different hidden size for Q and V is not supported * Only float16 is supported * Padding/attention mask is not supported * For MultiHeadAttention, when there is past or present input, bias shall be provided to activate flash attention * For Attention, past or present inputs will deactivate flash attention * Causal is not supported Some limitations (like attention mask and causal) might be removed later. Currently, Flash Attention v2 only works in Linux. For Windows, we will enable later with Cutlass 3.2. Two environment variables can be used for testing purpose: (1) `ORT_DISABLE_FLASH_ATTENTION` to disable flash attention. Default value is 0 (enable). Set it to "1" to disable it. (2) `ORT_MIN_SEQ_LEN_FLASH_ATTENTION_PACKED_QKV`. Default value is "513", which means that we only enable flash attention when sequence length is larger than 512 for packed QKV format. Set it to "0" if you want to use flash attention v2 whenever possible. ### Speedup The following result is from Standard_ND96amsr_A100_v4 VM (A100-SXM4-80GB GPU) using benchmark_mha.sh. The metric is TFLOPs per second for MultiHeadAttention operator. 
There are 3 input formats: * `Q,K,V` means separated inputs query, key and value of BxSxNH * `Q,KV` means packed KV, where key is 5D: BxSxNx2xH * `QKV` means packed QKV, where query is 5D: BxSxNx3xH Note that flash attention cannot use packed QKV format, so extra Transpose is needed. We found that TensorRT kernel is faster for sequence length <= 512 for packed QKV. The reason might be no transpose is needed for TensorRT kernel in this format. We also notice that, TensorRT kernel is faster for stable diffusion 512x512 image (see seq_len=4096, heads=8, head_dim=40 below), while flash attention v2 is faster for 1024x1024 image (see seq_len=16384, heads=8, head_dim=40 below). input format | batch size | sequence length | heads | head dim | flash_v2 (TFLOPs/s) | TensorRT (TFLOPs/s) | Memory Efficient Attention (TFLOPs/s) -- | -- | -- | -- | -- | -- | -- | -- Q,K,V | 32 | 512 | 64 | 32 | 78.1 | 60.0 | 39.3 Q,K,V | 32 | 512 | 128 | 16 | 46.8 | 44.1 | 21.7 Q,K,V | 16 | 1024 | 64 | 32 | 99.0 | 72.8 | 44.3 Q,K,V | 16 | 1024 | 128 | 16 | 54.7 | 49.2 | 23.4 Q,K,V | 8 | 2048 | 64 | 32 | 113.8 | 81.2 | 47.8 Q,K,V | 8 | 2048 | 128 | 16 | 59.7 | 51.9 | 24.7 Q,K,V | 4 | 4096 | 64 | 32 | 122.5 | 85.6 | 49.7 Q,K,V | 4 | 4096 | 128 | 16 | 62.5 | 53.3 | 25.3 Q,K,V | 2 | 8192 | 64 | 32 | 127.4 | 87.5 | 50.7 Q,K,V | 2 | 8192 | 128 | 16 | 64.0 | 54.2 | 25.6 Q,K,V | 1 | 16384 | 64 | 32 | 129.5 | 91.0 | 51.2 Q,K,V | 1 | 16384 | 128 | 16 | 64.7 | 54.5 | 25.8 Q,K,V | 1 | 4096 | 8 | 40 | 51.0 | 43.6 | 36.8 Q,K,V | 1 | 4096 | 8 | 80 | 97.7 | 77.0 | 55.5 Q,K,V | 1 | 4096 | 8 | 160 | 120.0 | 39.7 | 57.8 Q,K,V | 4 | 4096 | 8 | 40 | 89.0 | 84.4 | 49.2 Q,K,V | 4 | 4096 | 8 | 80 | 133.0 | 92.2 | 63.2 Q,K,V | 4 | 4096 | 8 | 160 | 164.8 | 42.7 | 63.8 Q,K,V | 1 | 16384 | 8 | 40 | 96.9 | 91.3 | 52.1 Q,K,V | 1 | 16384 | 8 | 80 | 142.9 | 101.5 | 65.6 Q,K,V | 1 | 16384 | 8 | 160 | 177.4 | 44.2 | 65.7 Q,K,V | 128 | 128 | 12 | 64 | 29.0 | 26.9 | 25.7 Q,K,V | 64 | 128 | 12 | 64 | 23.1 | 10.8 | 21.3 Q,K,V | 128 
| 384 | 12 | 64 | 83.5 | 60.8 | 55.7 Q,K,V | 64 | 384 | 12 | 64 | 72.6 | 40.5 | 52.8 Q,K,V | 128 | 512 | 12 | 64 | 98.9 | 77.9 | 62.1 Q,K,V | 64 | 512 | 12 | 64 | 94.7 | 75.6 | 60.4 Q,KV | 32 | 512 | 64 | 32 | 85.9 | 41.1 | 41.1 Q,KV | 32 | 512 | 128 | 16 | 47.1 | 21.6 | 21.6 Q,KV | 16 | 1024 | 64 | 32 | 104.4 | 45.8 | 45.8 Q,KV | 16 | 1024 | 128 | 16 | 54.7 | 23.6 | 23.6 Q,KV | 8 | 2048 | 64 | 32 | 116.8 | 48.5 | 48.5 Q,KV | 8 | 2048 | 128 | 16 | 59.8 | 24.7 | 24.7 Q,KV | 4 | 4096 | 64 | 32 | 124.2 | 50.1 | 50.1 Q,KV | 4 | 4096 | 128 | 16 | 62.6 | 25.3 | 25.3 Q,KV | 2 | 8192 | 64 | 32 | 128.5 | 50.8 | 50.9 Q,KV | 2 | 8192 | 128 | 16 | 64.1 | 25.6 | 25.6 Q,KV | 1 | 16384 | 64 | 32 | 129.4 | 51.2 | 51.2 Q,KV | 1 | 16384 | 128 | 16 | 64.8 | 25.8 | 25.8 Q,KV | 1 | 4096 | 8 | 40 | 67.5 | 37.7 | 37.5 Q,KV | 1 | 4096 | 8 | 80 | 101.3 | 56.7 | 56.6 Q,KV | 1 | 4096 | 8 | 160 | 124.0 | 58.6 | 58.6 Q,KV | 4 | 4096 | 8 | 40 | 90.8 | 49.8 | 49.8 Q,KV | 4 | 4096 | 8 | 80 | 135.6 | 63.8 | 63.8 Q,KV | 4 | 4096 | 8 | 160 | 166.3 | 64.5 | 64.5 Q,KV | 1 | 16384 | 8 | 40 | 97.5 | 52.3 | 52.3 Q,KV | 1 | 16384 | 8 | 80 | 143.5 | 65.9 | 65.8 Q,KV | 1 | 16384 | 8 | 160 | 178.4 | 65.9 | 65.8 Q,KV | 128 | 128 | 12 | 64 | 26.8 | 48.1 | 30.9 Q,KV | 64 | 128 | 12 | 64 | 28.0 | 38.9 | 25.0 Q,KV | 128 | 384 | 12 | 64 | 97.7 | 61.1 | 61.0 Q,KV | 64 | 384 | 12 | 64 | 89.5 | 57.8 | 57.9 Q,KV | 128 | 512 | 12 | 64 | 111.9 | 66.7 | 66.9 Q,KV | 64 | 512 | 12 | 64 | 107.2 | 64.9 | 64.8 QKV | 32 | 512 | 64 | 32 | 77.2 | 84.7 | 39.3 QKV | 32 | 512 | 128 | 16 | 43.4 | 53.1 | 20.9 QKV | 16 | 1024 | 64 | 32 | 98.8 | 87.4 | 44.6 QKV | 16 | 1024 | 128 | 16 | 52.0 | 54.1 | 23.2 QKV | 8 | 2048 | 64 | 32 | 113.1 | 89.0 | 47.9 QKV | 8 | 2048 | 128 | 16 | 58.2 | 54.6 | 24.5 QKV | 4 | 4096 | 64 | 32 | 120.6 | 89.7 | 49.7 QKV | 4 | 4096 | 128 | 16 | 61.7 | 54.6 | 25.2 QKV | 2 | 8192 | 64 | 32 | 125.9 | 89.5 | 50.7 QKV | 2 | 8192 | 128 | 16 | 63.6 | 54.8 | 25.5 QKV | 1 | 16384 | 64 | 32 | 128.5 | 92.0 | 51.2 QKV | 1 
| 16384 | 128 | 16 | 64.6 | 54.8 | 25.7 QKV | 1 | 4096 | 8 | 40 | 60.2 | **69.8** | 38.1 QKV | 1 | 4096 | 8 | 80 | 101.6 | 75.2 | 56.7 QKV | 1 | 4096 | 8 | 160 | 130.2 | 41.2 | 58.4 QKV | 4 | 4096 | 8 | 40 | 90.6 | **91.0** | 49.5 QKV | 4 | 4096 | 8 | 80 | 133.6 | 98.1 | 62.8 QKV | 4 | 4096 | 8 | 160 | 165.3 | 43.7 | 63.9 QKV | 1 | 16384 | 8 | 40 | 97.2 | 92.8 | 52.1 QKV | 1 | 16384 | 8 | 80 | 143.0 | 103.1 | 65.6 QKV | 1 | 16384 | 8 | 160 | 177.6 | 44.5 | 65.7 QKV | 128 | 128 | 12 | 64 | 31.1 | 65.9 | 27.6 QKV | 64 | 128 | 12 | 64 | 26.1 | 49.8 | 23.5 QKV | 128 | 384 | 12 | 64 | 84.6 | 88.5 | 56.1 QKV | 64 | 384 | 12 | 64 | 79.1 | 80.3 | 53.5 QKV | 128 | 512 | 12 | 64 | 97.3 | 114.2 | 62.2 QKV | 64 | 512 | 12 | 64 | 95.9 | 110.7 | 60.6 QKV | 4 | 2048 | 32 | 128 | 125.26 | 44.72 | 78.15 QKV | 4 | 4096 | 32 | 128 | 141.62 | 46.29 | 85.84 QKV | 8 | 2048 | 32 | 128 | 127.40 | 45.49 | 78.75 QKV | 8 | 4096 | 32 | 128 | 144.24 | 46.60 | 86.95 ### Known Issues NVCC uses huge memory while compiling flash attention CUDA kernel. Linux build with CUDA might fail when machine has limited memory while number of CPUs is large. Walkaround is to use a build machine with larger memory, or use argument like `--nvcc_threads 1` to limit nvcc threads in build. ### Motivation and Context Increases speed and efficiency of MHA or Packed MHA. --------- Co-authored-by: Tianlei Wu <tlwu@microsoft.com> Co-authored-by: tlwu@microsoft.com <tlwu@a100.crj0ad2y1kku1j4yxl4sj10o4e.gx.internal.cloudapp.net>
2023-08-31 20:52:21 +00:00
parser.add_argument(
"--nvcc_threads",
nargs="?",
default=-1,
type=int,
help="Maximum number of NVCC threads in each parallel job."
"If the value is unspecified, it will be computed based on available memory and number of parallel jobs.",
Flash Attention v2 MHA (#17227) ### Description Integrate Flash Attention V2 to PackedMultiHeadAttention, MultiHeadAttention and Attention operators. Flash Attention v2 source code is from https://github.com/Dao-AILab/flash-attention/tree/main/csrc/flash_attn/src. We did some change to remove dependency on Torch, then removed backward and bfloat16 related code. Add benchmark script (see benchmark_mha.sh) to compare different attention kernels for MultiHeadAttention operator. Current limitations for Flash Attention in PackedMultiHeadAttention, MultiHeadAttention and Attention operators: * Relative Position Bias is not supported * Different hidden size for Q and V is not supported * Only float16 is supported * Padding/attention mask is not supported * For MultiHeadAttention, when there is past or present input, bias shall be provided to activate flash attention * For Attention, past or present inputs will deactivate flash attention * Causal is not supported Some limitations (like attention mask and causal) might be removed later. Currently, Flash Attention v2 only works in Linux. For Windows, we will enable later with Cutlass 3.2. Two environment variables can be used for testing purpose: (1) `ORT_DISABLE_FLASH_ATTENTION` to disable flash attention. Default value is 0 (enable). Set it to "1" to disable it. (2) `ORT_MIN_SEQ_LEN_FLASH_ATTENTION_PACKED_QKV`. Default value is "513", which means that we only enable flash attention when sequence length is larger than 512 for packed QKV format. Set it to "0" if you want to use flash attention v2 whenever possible. ### Speedup The following result is from Standard_ND96amsr_A100_v4 VM (A100-SXM4-80GB GPU) using benchmark_mha.sh. The metric is TFLOPs per second for MultiHeadAttention operator. 
There are 3 input formats: * `Q,K,V` means separated inputs query, key and value of BxSxNH * `Q,KV` means packed KV, where key is 5D: BxSxNx2xH * `QKV` means packed QKV, where query is 5D: BxSxNx3xH Note that flash attention cannot use packed QKV format, so extra Transpose is needed. We found that TensorRT kernel is faster for sequence length <= 512 for packed QKV. The reason might be no transpose is needed for TensorRT kernel in this format. We also notice that, TensorRT kernel is faster for stable diffusion 512x512 image (see seq_len=4096, heads=8, head_dim=40 below), while flash attention v2 is faster for 1024x1024 image (see seq_len=16384, heads=8, head_dim=40 below). input format | batch size | sequence length | heads | head dim | flash_v2 (TFLOPs/s) | TensorRT (TFLOPs/s) | Memory Efficient Attention (TFLOPs/s) -- | -- | -- | -- | -- | -- | -- | -- Q,K,V | 32 | 512 | 64 | 32 | 78.1 | 60.0 | 39.3 Q,K,V | 32 | 512 | 128 | 16 | 46.8 | 44.1 | 21.7 Q,K,V | 16 | 1024 | 64 | 32 | 99.0 | 72.8 | 44.3 Q,K,V | 16 | 1024 | 128 | 16 | 54.7 | 49.2 | 23.4 Q,K,V | 8 | 2048 | 64 | 32 | 113.8 | 81.2 | 47.8 Q,K,V | 8 | 2048 | 128 | 16 | 59.7 | 51.9 | 24.7 Q,K,V | 4 | 4096 | 64 | 32 | 122.5 | 85.6 | 49.7 Q,K,V | 4 | 4096 | 128 | 16 | 62.5 | 53.3 | 25.3 Q,K,V | 2 | 8192 | 64 | 32 | 127.4 | 87.5 | 50.7 Q,K,V | 2 | 8192 | 128 | 16 | 64.0 | 54.2 | 25.6 Q,K,V | 1 | 16384 | 64 | 32 | 129.5 | 91.0 | 51.2 Q,K,V | 1 | 16384 | 128 | 16 | 64.7 | 54.5 | 25.8 Q,K,V | 1 | 4096 | 8 | 40 | 51.0 | 43.6 | 36.8 Q,K,V | 1 | 4096 | 8 | 80 | 97.7 | 77.0 | 55.5 Q,K,V | 1 | 4096 | 8 | 160 | 120.0 | 39.7 | 57.8 Q,K,V | 4 | 4096 | 8 | 40 | 89.0 | 84.4 | 49.2 Q,K,V | 4 | 4096 | 8 | 80 | 133.0 | 92.2 | 63.2 Q,K,V | 4 | 4096 | 8 | 160 | 164.8 | 42.7 | 63.8 Q,K,V | 1 | 16384 | 8 | 40 | 96.9 | 91.3 | 52.1 Q,K,V | 1 | 16384 | 8 | 80 | 142.9 | 101.5 | 65.6 Q,K,V | 1 | 16384 | 8 | 160 | 177.4 | 44.2 | 65.7 Q,K,V | 128 | 128 | 12 | 64 | 29.0 | 26.9 | 25.7 Q,K,V | 64 | 128 | 12 | 64 | 23.1 | 10.8 | 21.3 Q,K,V | 128 
| 384 | 12 | 64 | 83.5 | 60.8 | 55.7 Q,K,V | 64 | 384 | 12 | 64 | 72.6 | 40.5 | 52.8 Q,K,V | 128 | 512 | 12 | 64 | 98.9 | 77.9 | 62.1 Q,K,V | 64 | 512 | 12 | 64 | 94.7 | 75.6 | 60.4 Q,KV | 32 | 512 | 64 | 32 | 85.9 | 41.1 | 41.1 Q,KV | 32 | 512 | 128 | 16 | 47.1 | 21.6 | 21.6 Q,KV | 16 | 1024 | 64 | 32 | 104.4 | 45.8 | 45.8 Q,KV | 16 | 1024 | 128 | 16 | 54.7 | 23.6 | 23.6 Q,KV | 8 | 2048 | 64 | 32 | 116.8 | 48.5 | 48.5 Q,KV | 8 | 2048 | 128 | 16 | 59.8 | 24.7 | 24.7 Q,KV | 4 | 4096 | 64 | 32 | 124.2 | 50.1 | 50.1 Q,KV | 4 | 4096 | 128 | 16 | 62.6 | 25.3 | 25.3 Q,KV | 2 | 8192 | 64 | 32 | 128.5 | 50.8 | 50.9 Q,KV | 2 | 8192 | 128 | 16 | 64.1 | 25.6 | 25.6 Q,KV | 1 | 16384 | 64 | 32 | 129.4 | 51.2 | 51.2 Q,KV | 1 | 16384 | 128 | 16 | 64.8 | 25.8 | 25.8 Q,KV | 1 | 4096 | 8 | 40 | 67.5 | 37.7 | 37.5 Q,KV | 1 | 4096 | 8 | 80 | 101.3 | 56.7 | 56.6 Q,KV | 1 | 4096 | 8 | 160 | 124.0 | 58.6 | 58.6 Q,KV | 4 | 4096 | 8 | 40 | 90.8 | 49.8 | 49.8 Q,KV | 4 | 4096 | 8 | 80 | 135.6 | 63.8 | 63.8 Q,KV | 4 | 4096 | 8 | 160 | 166.3 | 64.5 | 64.5 Q,KV | 1 | 16384 | 8 | 40 | 97.5 | 52.3 | 52.3 Q,KV | 1 | 16384 | 8 | 80 | 143.5 | 65.9 | 65.8 Q,KV | 1 | 16384 | 8 | 160 | 178.4 | 65.9 | 65.8 Q,KV | 128 | 128 | 12 | 64 | 26.8 | 48.1 | 30.9 Q,KV | 64 | 128 | 12 | 64 | 28.0 | 38.9 | 25.0 Q,KV | 128 | 384 | 12 | 64 | 97.7 | 61.1 | 61.0 Q,KV | 64 | 384 | 12 | 64 | 89.5 | 57.8 | 57.9 Q,KV | 128 | 512 | 12 | 64 | 111.9 | 66.7 | 66.9 Q,KV | 64 | 512 | 12 | 64 | 107.2 | 64.9 | 64.8 QKV | 32 | 512 | 64 | 32 | 77.2 | 84.7 | 39.3 QKV | 32 | 512 | 128 | 16 | 43.4 | 53.1 | 20.9 QKV | 16 | 1024 | 64 | 32 | 98.8 | 87.4 | 44.6 QKV | 16 | 1024 | 128 | 16 | 52.0 | 54.1 | 23.2 QKV | 8 | 2048 | 64 | 32 | 113.1 | 89.0 | 47.9 QKV | 8 | 2048 | 128 | 16 | 58.2 | 54.6 | 24.5 QKV | 4 | 4096 | 64 | 32 | 120.6 | 89.7 | 49.7 QKV | 4 | 4096 | 128 | 16 | 61.7 | 54.6 | 25.2 QKV | 2 | 8192 | 64 | 32 | 125.9 | 89.5 | 50.7 QKV | 2 | 8192 | 128 | 16 | 63.6 | 54.8 | 25.5 QKV | 1 | 16384 | 64 | 32 | 128.5 | 92.0 | 51.2 QKV | 1 
| 16384 | 128 | 16 | 64.6 | 54.8 | 25.7 QKV | 1 | 4096 | 8 | 40 | 60.2 | **69.8** | 38.1 QKV | 1 | 4096 | 8 | 80 | 101.6 | 75.2 | 56.7 QKV | 1 | 4096 | 8 | 160 | 130.2 | 41.2 | 58.4 QKV | 4 | 4096 | 8 | 40 | 90.6 | **91.0** | 49.5 QKV | 4 | 4096 | 8 | 80 | 133.6 | 98.1 | 62.8 QKV | 4 | 4096 | 8 | 160 | 165.3 | 43.7 | 63.9 QKV | 1 | 16384 | 8 | 40 | 97.2 | 92.8 | 52.1 QKV | 1 | 16384 | 8 | 80 | 143.0 | 103.1 | 65.6 QKV | 1 | 16384 | 8 | 160 | 177.6 | 44.5 | 65.7 QKV | 128 | 128 | 12 | 64 | 31.1 | 65.9 | 27.6 QKV | 64 | 128 | 12 | 64 | 26.1 | 49.8 | 23.5 QKV | 128 | 384 | 12 | 64 | 84.6 | 88.5 | 56.1 QKV | 64 | 384 | 12 | 64 | 79.1 | 80.3 | 53.5 QKV | 128 | 512 | 12 | 64 | 97.3 | 114.2 | 62.2 QKV | 64 | 512 | 12 | 64 | 95.9 | 110.7 | 60.6 QKV | 4 | 2048 | 32 | 128 | 125.26 | 44.72 | 78.15 QKV | 4 | 4096 | 32 | 128 | 141.62 | 46.29 | 85.84 QKV | 8 | 2048 | 32 | 128 | 127.40 | 45.49 | 78.75 QKV | 8 | 4096 | 32 | 128 | 144.24 | 46.60 | 86.95 ### Known Issues NVCC uses huge memory while compiling flash attention CUDA kernel. Linux build with CUDA might fail when machine has limited memory while number of CPUs is large. Walkaround is to use a build machine with larger memory, or use argument like `--nvcc_threads 1` to limit nvcc threads in build. ### Motivation and Context Increases speed and efficiency of MHA or Packed MHA. --------- Co-authored-by: Tianlei Wu <tlwu@microsoft.com> Co-authored-by: tlwu@microsoft.com <tlwu@a100.crj0ad2y1kku1j4yxl4sj10o4e.gx.internal.cloudapp.net>
2023-08-31 20:52:21 +00:00
)
parser.add_argument("--test", action="store_true", help="Run unit tests.")
parser.add_argument("--skip_tests", action="store_true", help="Skip all tests.")
parser.add_argument(
"--compile_no_warning_as_error",
action="store_true",
help="Preventing warnings from being treated as errors on compile.",
)
# Training options
parser.add_argument("--enable_nvtx_profile", action="store_true", help="Enable NVTX profile in ORT.")
parser.add_argument("--enable_memory_profile", action="store_true", help="Enable memory profile in ORT.")
parser.add_argument(
"--enable_training",
action="store_true",
help="Enable full training functionality in ORT. Includes ORTModule and ORT Training APIs.",
)
parser.add_argument("--enable_training_apis", action="store_true", help="Enable ort training apis.")
parser.add_argument("--enable_training_ops", action="store_true", help="Enable training ops in inference graph.")
parser.add_argument("--enable_nccl", action="store_true", help="Enable Nccl.")
parser.add_argument("--mpi_home", help="Path to MPI installation dir")
parser.add_argument("--nccl_home", help="Path to NCCL installation dir")
parser.add_argument(
"--use_mpi", nargs="?", default=False, const=True, type=_str_to_bool, help="Disabled by default."
)
2018-11-20 00:48:22 +00:00
# enable ONNX tests
2020-04-19 03:48:30 +00:00
parser.add_argument(
"--enable_onnx_tests",
action="store_true",
2020-04-19 03:48:30 +00:00
help="""When running the Test phase, run onnx_test_running against
available test data directories.""",
)
2020-04-19 03:48:30 +00:00
parser.add_argument("--path_to_protoc_exe", help="Path to protoc exe.")
parser.add_argument("--fuzz_testing", action="store_true", help="Enable Fuzz testing of the onnxruntime.")
parser.add_argument(
"--enable_symbolic_shape_infer_tests",
action="store_true",
help="""When running the Test phase, run symbolic shape inference against
available test data directories.""",
)
# generate documentation
parser.add_argument(
"--gen_doc",
nargs="?",
const="yes",
type=str,
help="Generate documentation listing standard ONNX operators and types implemented by "
"various execution providers and contrib operator schemas. Must be used for inference builds, only!"
"Use `--gen_doc validate` to validate these match the current contents in /docs.",
)
parser.add_argument("--gen-api-doc", action="store_true", help="Generate API documentation for PyTorch frontend")
2018-11-20 00:48:22 +00:00
# CUDA related
parser.add_argument("--use_cuda", action="store_true", help="Enable CUDA.")
2020-04-19 03:48:30 +00:00
parser.add_argument(
"--cuda_version", help="The version of CUDA toolkit to use. Auto-detect if not specified. e.g. 9.0"
)
2020-04-19 03:48:30 +00:00
parser.add_argument(
"--cuda_home",
help="Path to CUDA home."
2020-04-19 03:48:30 +00:00
"Read from CUDA_HOME environment variable if --use_cuda is true and "
"--cuda_home is not specified.",
)
2020-04-19 03:48:30 +00:00
parser.add_argument(
"--cudnn_home",
help="Path to CUDNN home. "
2020-04-19 03:48:30 +00:00
"Read from CUDNN_HOME environment variable if --use_cuda is true and "
"--cudnn_home is not specified.",
)
parser.add_argument("--enable_cuda_line_info", action="store_true", help="Enable CUDA line info.")
parser.add_argument(
"--enable_cuda_nhwc_ops", action="store_true", help="Deprecated; default to enable CUDA NHWC ops in build."
)
parser.add_argument("--disable_cuda_nhwc_ops", action="store_true", help="Disable CUDA NHWC ops in build.")
2018-11-20 00:48:22 +00:00
# Python bindings
parser.add_argument("--enable_pybind", action="store_true", help="Enable Python Bindings.")
parser.add_argument("--build_wheel", action="store_true", help="Build Python Wheel.")
2020-04-19 03:48:30 +00:00
parser.add_argument(
"--wheel_name_suffix",
help="Suffix to append to created wheel names. This value is currently only used for nightly builds.",
)
parser.add_argument("--skip-keras-test", action="store_true", help="Skip tests with Keras if keras is installed")
2018-11-20 00:48:22 +00:00
# C-Sharp bindings
2020-04-19 03:48:30 +00:00
parser.add_argument(
"--build_csharp",
action="store_true",
help="Build C#.Net DLL and NuGet package. This should be only used in CI pipelines. "
"For building C# bindings and packaging them into nuget package use --build_nuget arg.",
)
parser.add_argument(
"--build_nuget",
action="store_true",
help="Build C#.Net DLL and NuGet package on the local machine. "
"Currently only Windows and Linux platforms are supported.",
)
2018-11-20 00:48:22 +00:00
parser.add_argument(
"--msbuild_extra_options",
nargs="+",
action="append",
help="Extra properties to pass to msbuild during build. "
"These are just msbuild /p: options without the leading /p:.",
)
2019-12-06 19:43:40 +00:00
# Java bindings
parser.add_argument("--build_java", action="store_true", help="Build Java bindings.")
2018-11-20 00:48:22 +00:00
# Node.js binding
parser.add_argument("--build_nodejs", action="store_true", help="Build Node.js binding and NPM package.")
# Objective-C binding
parser.add_argument("--build_objc", action="store_true", help="Build Objective-C binding.")
2018-11-20 00:48:22 +00:00
# Build a shared lib
parser.add_argument("--build_shared_lib", action="store_true", help="Build a shared library for the ONNXRuntime.")
# Build a shared lib
parser.add_argument(
"--build_apple_framework", action="store_true", help="Build a macOS/iOS framework for the ONNXRuntime."
)
2018-11-20 00:48:22 +00:00
# Build options
2020-04-19 03:48:30 +00:00
parser.add_argument(
"--cmake_extra_defines",
nargs="+",
action="append",
2020-04-19 03:48:30 +00:00
help="Extra definitions to pass to CMake during build system "
"generation. These are just CMake -D options without the leading -D.",
)
parser.add_argument("--target", help="Build a specific target, e.g. winml_dll")
# This flag is needed when :
# 1. The OS is 64 bits Windows
# 2. And the target binary is for 32 bits Windows
# 3. And the python used for running this script is 64 bits.
# But if you can get a 32 bits python, the build will run better and you won't need this flag.
2020-04-19 03:48:30 +00:00
parser.add_argument(
"--x86",
action="store_true",
help="[cross-compiling] Create Windows x86 makefiles. Requires --update and no existing cache "
"CMake setup. Delete CMakeCache.txt if needed",
)
parser.add_argument(
"--rv64",
action="store_true",
help="[cross-compiling] Create riscv64 makefiles. Requires --update and no existing cache "
"CMake setup. Delete CMakeCache.txt if needed",
)
2020-04-19 03:48:30 +00:00
parser.add_argument(
"--arm",
action="store_true",
help="[cross-compiling] Create ARM makefiles. Requires --update and no existing cache "
"CMake setup. Delete CMakeCache.txt if needed",
)
2020-04-19 03:48:30 +00:00
parser.add_argument(
"--arm64",
action="store_true",
help="[cross-compiling] Create ARM64 makefiles. Requires --update and no existing cache "
"CMake setup. Delete CMakeCache.txt if needed",
)
parser.add_argument(
"--arm64ec",
action="store_true",
help="[cross-compiling] Create ARM64EC makefiles. Requires --update and no existing cache "
"CMake setup. Delete CMakeCache.txt if needed",
)
parser.add_argument(
"--buildasx",
action="store_true",
help="[cross-compiling] Create ARM64X Binary.",
)
parser.add_argument(
"--riscv_toolchain_root",
type=str,
default="",
help="Path to RISC-V toolchain root dir. e.g. --riscv_toolchain_root=$HOME/riscv-tools/",
)
parser.add_argument(
"--riscv_qemu_path",
type=str,
default="",
help="Path to RISC-V qemu. e.g. --riscv_qemu_path=$HOME/qemu-dir/qemu-riscv64",
)
# https://gitlab.kitware.com/cmake/cmake/-/issues/25192
parser.add_argument(
"--msvc_toolset",
help="MSVC toolset to use. e.g. 14.11. It doesn't work if the version number is in the range of [14.36, 14.39]",
)
parser.add_argument("--windows_sdk_version", help="Windows SDK version to use. e.g. 10.0.19041.0")
parser.add_argument("--android", action="store_true", help="Build for Android")
parser.add_argument(
"--android_abi",
default="arm64-v8a",
choices=["armeabi-v7a", "arm64-v8a", "x86", "x86_64"],
help="Specify the target Android Application Binary Interface (ABI)",
)
parser.add_argument("--android_api", type=int, default=27, help="Android API Level, e.g. 21")
parser.add_argument(
"--android_sdk_path", type=str, default=os.environ.get("ANDROID_HOME", ""), help="Path to the Android SDK"
)
parser.add_argument(
"--android_ndk_path", type=str, default=os.environ.get("ANDROID_NDK_HOME", ""), help="Path to the Android NDK"
)
parser.add_argument(
"--android_cpp_shared",
action="store_true",
help="Build with shared libc++ instead of the default static libc++.",
)
parser.add_argument("--android_run_emulator", action="store_true", help="Start up an Android emulator if needed.")
parser.add_argument("--use_gdk", action="store_true", help="Build with the GDK toolchain.")
parser.add_argument(
"--gdk_edition",
default=os.path.normpath(os.environ.get("GameDKLatest", "")).split(os.sep)[-1], # noqa: SIM112
help="Build with a specific GDK edition. Defaults to the latest installed.",
)
2022-04-07 22:06:31 +00:00
parser.add_argument("--gdk_platform", default="Scarlett", help="Sets the GDK target platform.")
parser.add_argument("--enable_wasm_memory64", action="store_true", help="Enable WebAssembly 64bit support")
platform_group = parser.add_mutually_exclusive_group()
platform_group.add_argument("--ios", action="store_true", help="build for ios")
platform_group.add_argument("--visionos", action="store_true", help="build for visionOS")
platform_group.add_argument(
"--macos",
choices=["MacOSX", "Catalyst"],
help="Specify the target platform for macOS build. Only specify this argument when --build_apple_framework is present.",
)
2020-04-19 03:48:30 +00:00
parser.add_argument(
"--apple_sysroot", default="", help="Specify the location name of the macOS platform SDK to be used"
)
parser.add_argument(
"--ios_toolchain_file",
default="",
help="Path to ios toolchain file, or cmake/onnxruntime_ios.toolchain.cmake will be used",
)
parser.add_argument(
"--visionos_toolchain_file",
default="",
help="Path to visionos toolchain file, or cmake/onnxruntime_visionos.toolchain.cmake will be used",
)
parser.add_argument(
"--xcode_code_signing_team_id", default="", help="The development team ID used for code signing in Xcode"
)
parser.add_argument(
"--xcode_code_signing_identity", default="", help="The development identity used for code signing in Xcode"
)
parser.add_argument(
"--use_xcode",
action="store_const",
const="Xcode",
dest="cmake_generator",
help="Use Xcode as cmake generator, this is only supported on MacOS. (non Catalyst build). Equivalent to '--cmake_generator Xcode'.",
)
parser.add_argument(
"--osx_arch",
default="arm64" if platform.machine() == "arm64" else "x86_64",
choices=["arm64", "arm64e", "x86_64"],
help="Specify the Target specific architectures for macOS and iOS, This is only supported on MacOS",
)
parser.add_argument(
"--apple_deploy_target",
type=str,
help="Specify the minimum version of the target platform "
"(e.g. macOS or iOS)"
"This is only supported on MacOS",
)
Enable Address Sanitizer in CI (#19073) ### Description 1. Add two build jobs for enabling Address Sanitizer in CI. One for Windows CPU, One for Linux CPU. 2. Set default compiler flags/linker flags in build.py for normal Windows/Linux/MacOS build. This can help control compiler flags in a more centralized way. 3. All Windows binaries in our official packages will be built with "/PROFILE" flag. Symbols of onnxruntime.dll can be found at [Microsoft public symbol server](https://learn.microsoft.com/en-us/windows-hardware/drivers/debugger/microsoft-public-symbols). Limitations: 1. On Linux Address Sanitizer ignores RPATH settings in ELF binaries. Therefore once Address Sanitizer is enabled, before running tests we need to manually set LD_LIBRARY_PATH properly otherwise libonnxruntime.so may not be able to find custom ops and shared EPs. 4. On Linux we also need to set LD_PRELOAD before running some tests(if the main executable, like python, is not built with address sanitizer. On Windows we do not need to. 5. On Windows before running python tests we should manually copy address sanitizer DLL to the onnxruntime/capi directory, because python 3.8 and above has enabled "Safe DLL Search Mode" that wouldn't use the information provided by PATH env. 6. On Linux Address Sanitizer found a lot of memory leaks from our python binding code. Therefore right now we cannot enable Address Sanitizer when building ONNX Runtime with python binding. 7. Address Sanitizer itself uses a lot of memory address space and delays memory deallocations, which is easy to cause OOM issues in 32-bit applications. We cannot run all the tests in onnxruntime_test_all in 32-bit mode with Address Sanitizer due to this reason. However, we still can run individual tests in such a way. We just cannot run all of them in one process. ### Motivation and Context To catch memory issues.
2024-01-12 15:24:40 +00:00
# A 32-bit process doesn't have enough memory to run all the tests in onnxruntime_test_all.
# Mimalloc is incompatible with address sanitizer.
# Address sanitizer itself is also a memory leak checker, so when it is enabled we should disable_memleak_checker.
parser.add_argument(
Enable Address Sanitizer in CI (#19073) ### Description 1. Add two build jobs for enabling Address Sanitizer in CI. One for Windows CPU, One for Linux CPU. 2. Set default compiler flags/linker flags in build.py for normal Windows/Linux/MacOS build. This can help control compiler flags in a more centralized way. 3. All Windows binaries in our official packages will be built with "/PROFILE" flag. Symbols of onnxruntime.dll can be found at [Microsoft public symbol server](https://learn.microsoft.com/en-us/windows-hardware/drivers/debugger/microsoft-public-symbols). Limitations: 1. On Linux Address Sanitizer ignores RPATH settings in ELF binaries. Therefore once Address Sanitizer is enabled, before running tests we need to manually set LD_LIBRARY_PATH properly otherwise libonnxruntime.so may not be able to find custom ops and shared EPs. 4. On Linux we also need to set LD_PRELOAD before running some tests(if the main executable, like python, is not built with address sanitizer. On Windows we do not need to. 5. On Windows before running python tests we should manually copy address sanitizer DLL to the onnxruntime/capi directory, because python 3.8 and above has enabled "Safe DLL Search Mode" that wouldn't use the information provided by PATH env. 6. On Linux Address Sanitizer found a lot of memory leaks from our python binding code. Therefore right now we cannot enable Address Sanitizer when building ONNX Runtime with python binding. 7. Address Sanitizer itself uses a lot of memory address space and delays memory deallocations, which is easy to cause OOM issues in 32-bit applications. We cannot run all the tests in onnxruntime_test_all in 32-bit mode with Address Sanitizer due to this reason. However, we still can run individual tests in such a way. We just cannot run all of them in one process. ### Motivation and Context To catch memory issues.
2024-01-12 15:24:40 +00:00
"--enable_address_sanitizer", action="store_true", help="Enable address sanitizer. Windows/Linux/MacOS only."
)
# The following flag is mostly designed to be used in ONNX Runtime's Azure DevOps/Github build pipelines. Its main purpose is to make the built binaries pass BinSkim scan.
parser.add_argument("--use_binskim_compliant_compile_flags", action="store_true", help="Use preset compile flags.")
Enable Address Sanitizer in CI (#19073) ### Description 1. Add two build jobs for enabling Address Sanitizer in CI. One for Windows CPU, One for Linux CPU. 2. Set default compiler flags/linker flags in build.py for normal Windows/Linux/MacOS build. This can help control compiler flags in a more centralized way. 3. All Windows binaries in our official packages will be built with "/PROFILE" flag. Symbols of onnxruntime.dll can be found at [Microsoft public symbol server](https://learn.microsoft.com/en-us/windows-hardware/drivers/debugger/microsoft-public-symbols). Limitations: 1. On Linux Address Sanitizer ignores RPATH settings in ELF binaries. Therefore once Address Sanitizer is enabled, before running tests we need to manually set LD_LIBRARY_PATH properly otherwise libonnxruntime.so may not be able to find custom ops and shared EPs. 4. On Linux we also need to set LD_PRELOAD before running some tests(if the main executable, like python, is not built with address sanitizer. On Windows we do not need to. 5. On Windows before running python tests we should manually copy address sanitizer DLL to the onnxruntime/capi directory, because python 3.8 and above has enabled "Safe DLL Search Mode" that wouldn't use the information provided by PATH env. 6. On Linux Address Sanitizer found a lot of memory leaks from our python binding code. Therefore right now we cannot enable Address Sanitizer when building ONNX Runtime with python binding. 7. Address Sanitizer itself uses a lot of memory address space and delays memory deallocations, which is easy to cause OOM issues in 32-bit applications. We cannot run all the tests in onnxruntime_test_all in 32-bit mode with Address Sanitizer due to this reason. However, we still can run individual tests in such a way. We just cannot run all of them in one process. ### Motivation and Context To catch memory issues.
2024-01-12 15:24:40 +00:00
parser.add_argument(
"--disable_memleak_checker",
action="store_true",
help="Disable memory leak checker from Windows build. By default it is enabled in Windows Debug build. This option is Windows only.",
)
Create CMake option `onnxruntime_USE_VCPKG` (#21348) ### Changes 1. CMake option `onnxruntime_USE_VCPKG`. It will be used in the vcpkg port * Unit test may fail because this option leads to a mixture of unexpected external library versions. Especially ONNX, Protobuf, and Flatbuffers version can be different 2. Overhaul of `onnxruntime_external_deps.cmake` * Make `FetchContent_Declare` to try `find_package`. See https://cmake.org/cmake/help/latest/guide/using-dependencies/index.html * Relocated `FetchContent_Declare` and `FetchContent_MakeAvailable`(or `onnxruntime_fetchcontent_makeavailable`) to closer lines. It was too hard to navigate the entire file to search related sections... * Alias `IMPORTED` targets like build targets (e.g. `ONNX::onnx` --> `onnx`) ```cmake # The script uses `find_package` with the changes. # In this case, use vcpkg to search dependencies # See https://cmake.org/cmake/help/latest/guide/using-dependencies/index.html include(external/onnxruntime_external_deps.cmake) ``` 3. Create CMakePresets.json and presets to [run vcpkg in manifest mode](https://learn.microsoft.com/en-us/vcpkg/concepts/manifest-mode) * Currently, it's NOT for training build * Main triplets are `x64-windows` and `x64-osx` ```pwsh Push-Location "cmake" cmake --preset "x64-windows-vcpkg" cmake --build --preset "x64-windows-vcpkg-debug" Pop-Location ``` ```bash pushd "cmake" cmake --preset "x64-osx-vcpkg" cmake --build --preset "x64-osx-vcpkg-debug" popd ``` 4. 
Updated tools/ci_build/build.py * `--use_vcpkg` option: it needs `CMAKE_TOOLCHAIN_FILE` with [vcpkg.cmake toolchain script](https://github.com/microsoft/vcpkg/blob/master/scripts/buildsystems/vcpkg.cmake) * `--compile_no_warning_as_error` is recommended because library version differences will cause unexpected compiler warnings ```bash python ./tools/ci_build/build.py \ --compile_no_warning_as_error \ --use_vcpkg \ --cmake_extra_defines "CMAKE_TOOLCHAIN_FILE:FILEPATH=${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" \ --cmake_extra_defines "VCPKG_TARGET_TRIPLET=..." ``` 5. Created Job `Vcpkg` for Windows and macOS * Show how to setup and use vcpkg. Similar to the CMakePresets.json usage ### Motivation and Context * Help #7150 * Help https://github.com/microsoft/vcpkg/pull/36850 * https://github.com/luncliff/vcpkg-registry/pull/212 * https://github.com/microsoft/vcpkg/pull/39881 * https://github.com/luncliff/vcpkg-registry/pull/215 * https://github.com/luncliff/vcpkg-registry/pull/216 * https://github.com/luncliff/vcpkg-registry/pull/227 * https://cmake.org/cmake/help/latest/guide/using-dependencies/index.html * https://github.com/microsoft/vcpkg/blob/master/scripts/buildsystems/vcpkg.cmake ### Future Works? More feature coverage with the vcpkg supported libraries * CUDA feature support * Training feature support
2024-09-10 23:39:27 +00:00
# Dependency search with vcpkg
parser.add_argument(
"--use_vcpkg",
action="store_true",
help="Use vcpkg to search dependencies. Requires CMAKE_TOOLCHAIN_FILE for vcpkg.cmake",
)
build ONNXRuntime into WebAssembly (#6478) * Simplified version of WebAssembly support to keep most of existing data structures and add cmake using Ninja and emcmake * Clean up CMakeLists.txt and add an example to create and compute a kernel * Load a model from bytes and remove graph building steps * Add all cpu and contrib ops with mlas library * WebAssembly build with Onnxruntime C/CXX API * Use protobuf cmakefile directory instead of adding every necessary source file * Fix invalid output at example * add missing files * Change an example to use Teams model and support ort mobile format * add API for javascript * fix input releasing in _ort_run() * update API * Let onnxruntime cmake build WebAssembly with option '--wasm' * allow one-step building for wasm * Make build script working on Linux and MacOS * Fix broken build from Windows command * Enable unit test on building WebAssembly * Resolve comments * update build flags * wasm conv improvement from: 1) GemmV; 2) Depthwise direct convolution 3x3; 3) Direct convolution 3x3 * Cleaned mlas unittest. * use glob * update comments * Update baseline due to loss scale fix (#6948) * fix stream sync issue (#6954) * Enable type reduction in EyeLike, Mod, random.cc CPU kernels. (#6960) * Update EyeLike CPU kernel. * Update Mod CPU kernel. * Update Multinomial CPU kernel. * Slight improvement to Pad CPU kernel binary size. * Update RandomNormal[Like], RandomUniform[Like] CPU kernels. * Fix warning from setting multiple MSVC warning level options. (#6917) Fix warning from setting multiple MSVC warning level options. Replace an existing /Wn flag instead of always appending a new one. * MLAS: quantized GEMM update (#6916) Various updates to the int8_t GEMMs: 1) Add ARM64 udot kernel to take advantage of dot product instructions available in newer cores. Some models run 4x faster than the stock implementation we used before. 
2) Refactor the x64 kernels to share common code for AVX2(u8u8/u8s8/avxvnni) vs AVX512(u8u8/u8s8/avx512vnni) to reduce binary size. 3) Extend kernels to support per-column zero points for matrix B. This is not currently wired to an operator. * Implement QLinearAveragePool with unit tests. (#6896) Implement QLinearAveragePool with unit tests. * Attention fusion detect num_heads and hidden_size automatically (#6920) * fixed type to experimental session constructor (#6950) * fixed type to experimental session constructor Co-authored-by: David Medine <david.medine@brainproducts.com> * Update onnxruntime_perf_test.exe to accept free dimension overrides (#6962) Co-authored-by: Ori Levari <orlevari@microsoft.com> * Fix possible fd leak in NNAPI (#6966) * Release buffers for prepacked tensors (#6820) Unsolved problems: 1. One test failure was caused by a bug in Cudnn rnn kernels, when they can allocate a buffer and partially initialize it, the garbage data near tail of the buffer caused problem in some of the hardware. To attack this problem in a broader sense, should we add code in our allocators, and during a memory fuzzing test, fill an allocated buffer with garbage before returning to the caller? 2. Prepacking is used more widely than we know. For instance, Cudnn rnn kernels also cache their weights. They mix several weight tensors together into a single buffer, and never touch the original weight tensor anymore. This is the same idea with pre-pack, but they didn't override the virtual function, and they never tried to release those weight tensors, leading to memory waste. It also seems to me that there are some other kernels have similar behavior. Wonder how much memory we can save if we try to cleanup those too. 3. Turning off memory pattern planning does increase memory fragmentation, leading to out of memory error in some training test cases. 
Perhaps we can revisit the idea of pushing kernels-creation stage earlier, and then during initializer deserialization, we only avoid tracing those that will be prepacked. * Enable type reduction for Range, ReverseSequence, ScatterND, Split, and Unique CPU kernels. (#6963) * add CI * fix test in ci * fix flags for nsync in wasm build * add copyright banner * fix wasm source glob * add missing exports * resolve comments * Perf gain by make packb wide to 4 from 16 on GEMM for WASM. Remove no need direct conv in previous perf tuning. * fix buildbreak introduced from latest master merge * fix buildbreak in mlasi.h * resolve all comments except MLAS * rewrite packb related 3 functions for WASM_SCALAR seperately rather than using #ifdef in each. and other changes according to PR feedback in mlas. * More complete scalar path in sgemm from Tracy. * Fix edge case handling in depthwise conv2d kernel 3x3. where: *) support input W==1 and H==1 *) recalc in accurate pad_right and pad_bottom *) support hidden pad_right == 2 or pad_bottom == 2 when W == 1 or H==1 and no pad left/top * Add more test coverage for conv depthwise from Tracy. Fix one typo according to PR. * resolve comments * replace typedef by using * do not use throw in OrtRun() * output error message Co-authored-by: Sunghoon <35605090+hanbitmyths@users.noreply.github.com> Co-authored-by: Lei Zhang <zhang.huanning@hotmail.com> Co-authored-by: Wei-Sheng Chin <wschin@outlook.com> Co-authored-by: Tianlei Wu <tlwu@microsoft.com> Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> Co-authored-by: Tracy Sharpe <42477615+tracysh@users.noreply.github.com> Co-authored-by: David Medine <david.eric.medine@gmail.com> Co-authored-by: David Medine <david.medine@brainproducts.com> Co-authored-by: Ori Levari <ori.levari@microsoft.com> Co-authored-by: Ori Levari <orlevari@microsoft.com> Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com> Co-authored-by: Chen Fu <chenfucs@gmail.com>
2021-04-06 23:18:10 +00:00
# WebAssembly build
parser.add_argument("--build_wasm", action="store_true", help="Build for WebAssembly")
parser.add_argument("--build_wasm_static_lib", action="store_true", help="Build for WebAssembly static library")
parser.add_argument("--emsdk_version", default="3.1.59", help="Specify version of emsdk")
parser.add_argument("--enable_wasm_simd", action="store_true", help="Enable WebAssembly SIMD")
parser.add_argument("--enable_wasm_threads", action="store_true", help="Enable WebAssembly multi-threads support")
build ONNXRuntime into WebAssembly (#6478) * Simplified version of WebAssembly support to keep most of existing data structures and add cmake using Ninja and emcmake * Clean up CMakeLists.txt and add an example to create and compute a kernel * Load a model from bytes and remove graph building steps * Add all cpu and contrib ops with mlas library * WebAssembly build with Onnxruntime C/CXX API * Use protobuf cmakefile directory instead of adding every necessary source file * Fix invalid output at example * add missing files * Change an example to use Teams model and support ort mobile format * add API for javascript * fix input releasing in _ort_run() * update API * Let onnxruntime cmake build WebAssembly with option '--wasm' * allow one-step building for wasm * Make build script working on Linux and MacOS * Fix broken build from Windows command * Enable unit test on building WebAssembly * Resolve comments * update build flags * wasm conv improvement from: 1) GemmV; 2) Depthwise direct convolution 3x3; 3) Direct convolution 3x3 * Cleaned mlas unittest. * use glob * update comments * Update baseline due to loss scale fix (#6948) * fix stream sync issue (#6954) * Enable type reduction in EyeLike, Mod, random.cc CPU kernels. (#6960) * Update EyeLike CPU kernel. * Update Mod CPU kernel. * Update Multinomial CPU kernel. * Slight improvement to Pad CPU kernel binary size. * Update RandomNormal[Like], RandomUniform[Like] CPU kernels. * Fix warning from setting multiple MSVC warning level options. (#6917) Fix warning from setting multiple MSVC warning level options. Replace an existing /Wn flag instead of always appending a new one. * MLAS: quantized GEMM update (#6916) Various updates to the int8_t GEMMs: 1) Add ARM64 udot kernel to take advantage of dot product instructions available in newer cores. Some models run 4x faster than the stock implementation we used before. 
2) Refactor the x64 kernels to share common code for AVX2(u8u8/u8s8/avxvnni) vs AVX512(u8u8/u8s8/avx512vnni) to reduce binary size. 3) Extend kernels to support per-column zero points for matrix B. This is not currently wired to an operator. * Implement QLinearAveragePool with unit tests. (#6896) Implement QLinearAveragePool with unit tests. * Attention fusion detect num_heads and hidden_size automatically (#6920) * fixed type to experimental session constructor (#6950) * fixed type to experimental session constructor Co-authored-by: David Medine <david.medine@brainproducts.com> * Update onnxruntime_perf_test.exe to accept free dimension overrides (#6962) Co-authored-by: Ori Levari <orlevari@microsoft.com> * Fix possible fd leak in NNAPI (#6966) * Release buffers for prepacked tensors (#6820) Unsolved problems: 1. One test failure was caused by a bug in Cudnn rnn kernels, when they can allocate a buffer and partially initialize it, the garbage data near tail of the buffer caused problem in some of the hardware. To attack this problem in a broader sense, should we add code in our allocators, and during a memory fuzzing test, fill an allocated buffer with garbage before returning to the caller? 2. Prepacking is used more widely than we know. For instance, Cudnn rnn kernels also cache their weights. They mix several weight tensors together into a single buffer, and never touch the original weight tensor anymore. This is the same idea with pre-pack, but they didn't override the virtual function, and they never tried to release those weight tensors, leading to memory waste. It also seems to me that there are some other kernels have similar behavior. Wonder how much memory we can save if we try to cleanup those too. 3. Turning off memory pattern planning does increase memory fragmentation, leading to out of memory error in some training test cases. 
Perhaps we can revisit the idea of pushing kernels-creation stage earlier, and then during initializer deserialization, we only avoid tracing those that will be prepacked. * Enable type reduction for Range, ReverseSequence, ScatterND, Split, and Unique CPU kernels. (#6963) * add CI * fix test in ci * fix flags for nsync in wasm build * add copyright banner * fix wasm source glob * add missing exports * resolve comments * Perf gain by make packb wide to 4 from 16 on GEMM for WASM. Remove no need direct conv in previous perf tuning. * fix buildbreak introduced from latest master merge * fix buildbreak in mlasi.h * resolve all comments except MLAS * rewrite packb related 3 functions for WASM_SCALAR seperately rather than using #ifdef in each. and other changes according to PR feedback in mlas. * More complete scalar path in sgemm from Tracy. * Fix edge case handling in depthwise conv2d kernel 3x3. where: *) support input W==1 and H==1 *) recalc in accurate pad_right and pad_bottom *) support hidden pad_right == 2 or pad_bottom == 2 when W == 1 or H==1 and no pad left/top * Add more test coverage for conv depthwise from Tracy. Fix one typo according to PR. * resolve comments * replace typedef by using * do not use throw in OrtRun() * output error message Co-authored-by: Sunghoon <35605090+hanbitmyths@users.noreply.github.com> Co-authored-by: Lei Zhang <zhang.huanning@hotmail.com> Co-authored-by: Wei-Sheng Chin <wschin@outlook.com> Co-authored-by: Tianlei Wu <tlwu@microsoft.com> Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> Co-authored-by: Tracy Sharpe <42477615+tracysh@users.noreply.github.com> Co-authored-by: David Medine <david.eric.medine@gmail.com> Co-authored-by: David Medine <david.medine@brainproducts.com> Co-authored-by: Ori Levari <ori.levari@microsoft.com> Co-authored-by: Ori Levari <orlevari@microsoft.com> Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com> Co-authored-by: Chen Fu <chenfucs@gmail.com>
2021-04-06 23:18:10 +00:00
parser.add_argument(
"--disable_wasm_exception_catching", action="store_true", help="Disable exception catching in WebAssembly."
)
parser.add_argument(
"--enable_wasm_api_exception_catching", action="store_true", help="Catch exceptions at top level api."
)
parser.add_argument(
"--enable_wasm_exception_throwing_override",
action="store_true",
help="Enable exception throwing in WebAssembly, this will override default disabling exception throwing "
"behavior when disable exceptions.",
)
parser.add_argument("--wasm_run_tests_in_browser", action="store_true", help="Run WebAssembly tests in browser")
parser.add_argument(
"--enable_wasm_profiling", action="store_true", help="Enable WebAssembly profiling and preserve function names"
)
2021-04-30 05:22:52 +00:00
parser.add_argument(
"--enable_wasm_debug_info", action="store_true", help="Build WebAssembly with DWARF format debug info"
)
parser.add_argument("--wasm_malloc", help="Specify memory allocator for WebAssembly")
parser.add_argument(
"--emscripten_settings",
nargs="+",
action="append",
help="Extra emscripten settings to pass to emcc using '-s <key>=<value>' during build.",
)
build ONNXRuntime into WebAssembly (#6478) * Simplified version of WebAssembly support to keep most of existing data structures and add cmake using Ninja and emcmake * Clean up CMakeLists.txt and add an example to create and compute a kernel * Load a model from bytes and remove graph building steps * Add all cpu and contrib ops with mlas library * WebAssembly build with Onnxruntime C/CXX API * Use protobuf cmakefile directory instead of adding every necessary source file * Fix invalid output at example * add missing files * Change an example to use Teams model and support ort mobile format * add API for javascript * fix input releasing in _ort_run() * update API * Let onnxruntime cmake build WebAssembly with option '--wasm' * allow one-step building for wasm * Make build script working on Linux and MacOS * Fix broken build from Windows command * Enable unit test on building WebAssembly * Resolve comments * update build flags * wasm conv improvement from: 1) GemmV; 2) Depthwise direct convolution 3x3; 3) Direct convolution 3x3 * Cleaned mlas unittest. * use glob * update comments * Update baseline due to loss scale fix (#6948) * fix stream sync issue (#6954) * Enable type reduction in EyeLike, Mod, random.cc CPU kernels. (#6960) * Update EyeLike CPU kernel. * Update Mod CPU kernel. * Update Multinomial CPU kernel. * Slight improvement to Pad CPU kernel binary size. * Update RandomNormal[Like], RandomUniform[Like] CPU kernels. * Fix warning from setting multiple MSVC warning level options. (#6917) Fix warning from setting multiple MSVC warning level options. Replace an existing /Wn flag instead of always appending a new one. * MLAS: quantized GEMM update (#6916) Various updates to the int8_t GEMMs: 1) Add ARM64 udot kernel to take advantage of dot product instructions available in newer cores. Some models run 4x faster than the stock implementation we used before. 
2) Refactor the x64 kernels to share common code for AVX2(u8u8/u8s8/avxvnni) vs AVX512(u8u8/u8s8/avx512vnni) to reduce binary size. 3) Extend kernels to support per-column zero points for matrix B. This is not currently wired to an operator. * Implement QLinearAveragePool with unit tests. (#6896) Implement QLinearAveragePool with unit tests. * Attention fusion detect num_heads and hidden_size automatically (#6920) * fixed type to experimental session constructor (#6950) * fixed type to experimental session constructor Co-authored-by: David Medine <david.medine@brainproducts.com> * Update onnxruntime_perf_test.exe to accept free dimension overrides (#6962) Co-authored-by: Ori Levari <orlevari@microsoft.com> * Fix possible fd leak in NNAPI (#6966) * Release buffers for prepacked tensors (#6820) Unsolved problems: 1. One test failure was caused by a bug in Cudnn rnn kernels, when they can allocate a buffer and partially initialize it, the garbage data near tail of the buffer caused problem in some of the hardware. To attack this problem in a broader sense, should we add code in our allocators, and during a memory fuzzing test, fill an allocated buffer with garbage before returning to the caller? 2. Prepacking is used more widely than we know. For instance, Cudnn rnn kernels also cache their weights. They mix several weight tensors together into a single buffer, and never touch the original weight tensor anymore. This is the same idea with pre-pack, but they didn't override the virtual function, and they never tried to release those weight tensors, leading to memory waste. It also seems to me that there are some other kernels have similar behavior. Wonder how much memory we can save if we try to cleanup those too. 3. Turning off memory pattern planning does increase memory fragmentation, leading to out of memory error in some training test cases. 
Perhaps we can revisit the idea of pushing kernels-creation stage earlier, and then during initializer deserialization, we only avoid tracing those that will be prepacked. * Enable type reduction for Range, ReverseSequence, ScatterND, Split, and Unique CPU kernels. (#6963) * add CI * fix test in ci * fix flags for nsync in wasm build * add copyright banner * fix wasm source glob * add missing exports * resolve comments * Perf gain by make packb wide to 4 from 16 on GEMM for WASM. Remove no need direct conv in previous perf tuning. * fix buildbreak introduced from latest master merge * fix buildbreak in mlasi.h * resolve all comments except MLAS * rewrite packb related 3 functions for WASM_SCALAR seperately rather than using #ifdef in each. and other changes according to PR feedback in mlas. * More complete scalar path in sgemm from Tracy. * Fix edge case handling in depthwise conv2d kernel 3x3. where: *) support input W==1 and H==1 *) recalc in accurate pad_right and pad_bottom *) support hidden pad_right == 2 or pad_bottom == 2 when W == 1 or H==1 and no pad left/top * Add more test coverage for conv depthwise from Tracy. Fix one typo according to PR. * resolve comments * replace typedef by using * do not use throw in OrtRun() * output error message Co-authored-by: Sunghoon <35605090+hanbitmyths@users.noreply.github.com> Co-authored-by: Lei Zhang <zhang.huanning@hotmail.com> Co-authored-by: Wei-Sheng Chin <wschin@outlook.com> Co-authored-by: Tianlei Wu <tlwu@microsoft.com> Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> Co-authored-by: Tracy Sharpe <42477615+tracysh@users.noreply.github.com> Co-authored-by: David Medine <david.eric.medine@gmail.com> Co-authored-by: David Medine <david.medine@brainproducts.com> Co-authored-by: Ori Levari <ori.levari@microsoft.com> Co-authored-by: Ori Levari <orlevari@microsoft.com> Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com> Co-authored-by: Chen Fu <chenfucs@gmail.com>
2021-04-06 23:18:10 +00:00
# Enable onnxruntime-extensions
parser.add_argument(
"--use_extensions",
action="store_true",
help="Enable custom operators in onnxruntime-extensions, use git submodule onnxruntime-extensions "
"in path cmake/external/onnxruntime-extensions by default.",
)
parser.add_argument(
"--extensions_overridden_path",
type=str,
help="Path to pre-pulled onnxruntime-extensions, will override default onnxruntime-extensions path.",
)
2018-11-20 00:48:22 +00:00
# Arguments needed by CI
parser.add_argument("--cmake_path", default="cmake", help="Path to the CMake program.")
2020-04-19 03:48:30 +00:00
parser.add_argument(
"--ctest_path",
default="ctest",
help="Path to the CTest program. It can be an empty string. If it is empty, "
"we will use this script driving the test programs directly.",
)
2020-04-19 03:48:30 +00:00
parser.add_argument(
"--skip_submodule_sync",
action="store_true",
help="Don't do a 'git submodule update'. Makes the Update phase faster.",
)
parser.add_argument("--use_mimalloc", action="store_true", help="Use mimalloc allocator")
parser.add_argument("--use_dnnl", action="store_true", help="Build with DNNL.")
2020-11-13 04:17:54 +00:00
parser.add_argument(
"--dnnl_gpu_runtime", action="store", default="", type=str.lower, help="e.g. --dnnl_gpu_runtime ocl"
)
2020-11-13 04:17:54 +00:00
parser.add_argument(
"--dnnl_opencl_root",
action="store",
default="",
2020-11-13 04:17:54 +00:00
help="Path to OpenCL SDK. "
'e.g. --dnnl_opencl_root "C:/Program Files (x86)/IntelSWTools/sw_dev_tools/OpenCL/sdk"',
)
2020-04-19 03:48:30 +00:00
parser.add_argument(
"--use_openvino",
nargs="?",
const="CPU",
type=_openvino_verify_device_type,
help="Build with OpenVINO for specific hardware.",
)
parser.add_argument(
"--dnnl_aarch64_runtime", action="store", default="", type=str.lower, help="e.g. --dnnl_aarch64_runtime acl"
)
parser.add_argument(
"--dnnl_acl_root",
action="store",
default="",
help='Path to ACL ROOT DIR. e.g. --dnnl_acl_root "$HOME/ComputeLibrary/"',
)
parser.add_argument("--use_coreml", action="store_true", help="Build with CoreML support.")
parser.add_argument("--use_webnn", action="store_true", help="Build with WebNN support.")
parser.add_argument("--use_snpe", action="store_true", help="Build with SNPE support.")
parser.add_argument("--snpe_root", help="Path to SNPE SDK root.")
parser.add_argument("--use_nnapi", action="store_true", help="Build with NNAPI support.")
parser.add_argument("--use_vsinpu", action="store_true", help="Build with VSINPU support.")
2020-04-19 03:48:30 +00:00
parser.add_argument(
"--nnapi_min_api", type=int, help="Minimum Android API level to enable NNAPI, should be no less than 27"
)
parser.add_argument("--use_jsep", action="store_true", help="Build with JavaScript kernels.")
parser.add_argument("--use_webgpu", action="store_true", help="Build with WebGPU support.")
Add implementation of WebGPU EP (#22591) ### Description This PR adds the actual implementation of the WebGPU EP based on https://github.com/microsoft/onnxruntime/pull/22318. This change includes the following: <details> <summary><b>core framework of WebGPU EP</b></summary> - WebGPU EP factory classes for: - handling WebGPU options - creating WebGPU EP instance - creating WebGPU context - WebGPU Execution Provider classes - GPU Buffer allocator - data transfer - Buffer management classes - Buffer Manager - BufferCacheManager - DisabledCacheManager - SimpleCacheManager - LazyReleaseCacheManager - BucketCacheManager - Program classes - Program (base) - Program Cache Key - Program Manager - Shader helper classes - Shader Helper - ShaderIndicesHelper - ShaderVariableHelper - Utils - GPU Query based profiler - compute context - string utils - Miscs - Python binding webgpu support (basic) </details> <details> <summary><b>Kernel implementation</b></summary> - onnx.ai (default opset): - Elementwise (math): Abs, Neg, Floor, Ceil, Reciprocal, Sqrt, Exp, Erf, Log, Sin, Cos, Tan, Asin, Acos, Atan, Sinh, Cosh, Asinh, Acosh, Atanh, Tanh, Not, Cast - Elementwise (activation): Sigmoid, HardSigmoid, Clip, Elu, Relu, LeakyRelu, ThresholdedRelu, Gelu - Binary (math): Add, Sub, Mul, Div, Pow, Equal, Greater, GreaterOrEqual, Less, LessOrEqual - (Tensors): Shape, Reshape, Squeeze, Unsqueeze - Where - Transpose - Concat - Expand - Gather - Tile - Range - LayerNormalization - com.microsoft - FastGelu - MatMulNBits - MultiHeadAttention - RotaryEmbedding - SkipLayerNormalization - LayerNormalization - SimplifiedLayerNormalization - SkipSimplifiedLayerNormalization </details> <details> <summary><b>Build, test and CI pipeline integration</b></summary> - build works for Windows, macOS and iOS - support onnxruntime_test_all and python node test - added a new unit test for `--use_external_dawn` build flag. 
- updated MacOS pipeline to build with WebGPU support - added a new pipeline for WebGPU Windows </details> This change does not include: - Node.js binding support for WebGPU (will be a separate PR)
2024-10-30 01:29:40 +00:00
parser.add_argument("--use_external_dawn", action="store_true", help="Treat Dawn as an external dependency.")
parser.add_argument("--use_qnn", action="store_true", help="Build with QNN support.")
parser.add_argument("--qnn_home", help="Path to QNN SDK dir.")
parser.add_argument("--use_rknpu", action="store_true", help="Build with RKNPU.")
parser.add_argument("--use_preinstalled_eigen", action="store_true", help="Use pre-installed Eigen.")
parser.add_argument("--eigen_path", help="Path to pre-installed Eigen.")
parser.add_argument("--enable_msinternal", action="store_true", help="Enable for Microsoft internal builds only.")
2018-11-20 00:48:22 +00:00
parser.add_argument("--llvm_path", help="Path to llvm dir")
parser.add_argument("--use_vitisai", action="store_true", help="Build with Vitis-AI")
parser.add_argument("--use_tvm", action="store_true", help="Build with TVM")
parser.add_argument("--tvm_cuda_runtime", action="store_true", default=False, help="Build TVM with CUDA support")
parser.add_argument(
"--use_tvm_hash", action="store_true", help="Build ipp-crypto for hash generation. It is used by TVM EP only"
)
parser.add_argument("--use_tensorrt", action="store_true", help="Build with TensorRT")
parser.add_argument(
"--use_tensorrt_builtin_parser", action="store_true", default=True, help="Use TensorRT builtin parser"
)
parser.add_argument("--use_tensorrt_oss_parser", action="store_true", help="Use TensorRT OSS parser")
parser.add_argument("--tensorrt_home", help="Path to TensorRT installation dir")
parser.add_argument("--test_all_timeout", default="10800", help="Set timeout for onnxruntime_test_all")
parser.add_argument("--use_migraphx", action="store_true", help="Build with MIGraphX")
parser.add_argument("--migraphx_home", help="Path to MIGraphX installation dir")
parser.add_argument("--use_full_protobuf", action="store_true", help="Use the full protobuf library")
parser.add_argument(
"--llvm_config",
type=str,
default="",
help="Path to llvm-config.exe for LLVM built from sources. It is strongly needed for build on Windows",
)
parser.add_argument(
"--skip_onnx_tests",
action="store_true",
help="Explicitly disable all onnx related tests. Note: Use --skip_tests to skip all tests.",
)
parser.add_argument("--skip_winml_tests", action="store_true", help="Explicitly disable all WinML related tests")
parser.add_argument("--skip_nodejs_tests", action="store_true", help="Explicitly disable all Node.js binding tests")
2020-04-19 03:48:30 +00:00
parser.add_argument(
"--enable_msvc_static_runtime", action="store_true", help="Enable static linking of MSVC runtimes."
)
2020-04-19 03:48:30 +00:00
parser.add_argument(
"--cmake_generator",
Refactor web-ci pipeline and delete eager mode CI pipeline (#15416) ### Description 1. Move it to a separated pool that use the same image as [the public hosted pool](https://learn.microsoft.com/en-us/azure/devops/pipelines/agents/hosted?view=azure-devops&tabs=yaml). Also, create a beta pool which contains the next version image of the hosted pool, and add jobs in our post merge pipeline to test if the next version image will break our CI. So, usually we will have at least one week to prepare. 2. Change the cmake generator in use in our pipelines from "Ninja" to "MingW Makefile", because the latest version of cmake doesn't work with the latest version of Ninja. People who prefer Ninja could still use ninja in their local build by passing "--cmake_generator ninja" to [build.py](https://github.com/microsoft/onnxruntime/blob/main/tools/ci_build/build.py). 3. Delete eager mode CI pipeline. ### Motivation and Context I need to update the software we have in our CI build machines, and I need to resolve this incompatibility issue. In more detail, the build error I hit was: em++: error: CMakeFilesonnxruntime_mlas_test.dirC_a_work1sonnxruntimetestmlasunittesttest_activation.cpp.o: No such file or directory ("CMakeFilesonnxruntime_mlas_test.dirC_a_work1sonnxruntimetestmlasunittesttest_activation.cpp.o" was expected to be an input file, based on the commandline arguments provided) After this PR we will deprecate python 3.7 support. The eager mode CI pipeline is the last one that still use python 3.7. Then we can rework the PR #10953 made by [fs-eire](https://github.com/fs-eire) last year. Fixed [AB#14435](https://aiinfra.visualstudio.com/6a833879-cd9b-44a4-a9de-adc2d818f13c/_workitems/edit/14435)
2023-04-10 17:41:04 +00:00
choices=[
"MinGW Makefiles",
"Ninja",
Refactor web-ci pipeline and delete eager mode CI pipeline (#15416) ### Description 1. Move it to a separated pool that use the same image as [the public hosted pool](https://learn.microsoft.com/en-us/azure/devops/pipelines/agents/hosted?view=azure-devops&tabs=yaml). Also, create a beta pool which contains the next version image of the hosted pool, and add jobs in our post merge pipeline to test if the next version image will break our CI. So, usually we will have at least one week to prepare. 2. Change the cmake generator in use in our pipelines from "Ninja" to "MingW Makefile", because the latest version of cmake doesn't work with the latest version of Ninja. People who prefer Ninja could still use ninja in their local build by passing "--cmake_generator ninja" to [build.py](https://github.com/microsoft/onnxruntime/blob/main/tools/ci_build/build.py). 3. Delete eager mode CI pipeline. ### Motivation and Context I need to update the software we have in our CI build machines, and I need to resolve this incompatibility issue. In more detail, the build error I hit was: em++: error: CMakeFilesonnxruntime_mlas_test.dirC_a_work1sonnxruntimetestmlasunittesttest_activation.cpp.o: No such file or directory ("CMakeFilesonnxruntime_mlas_test.dirC_a_work1sonnxruntimetestmlasunittesttest_activation.cpp.o" was expected to be an input file, based on the commandline arguments provided) After this PR we will deprecate python 3.7 support. The eager mode CI pipeline is the last one that still use python 3.7. Then we can rework the PR #10953 made by [fs-eire](https://github.com/fs-eire) last year. Fixed [AB#14435](https://aiinfra.visualstudio.com/6a833879-cd9b-44a4-a9de-adc2d818f13c/_workitems/edit/14435)
2023-04-10 17:41:04 +00:00
"NMake Makefiles",
"NMake Makefiles JOM",
"Unix Makefiles",
"Visual Studio 17 2022",
Refactor web-ci pipeline and delete eager mode CI pipeline (#15416) ### Description 1. Move it to a separated pool that use the same image as [the public hosted pool](https://learn.microsoft.com/en-us/azure/devops/pipelines/agents/hosted?view=azure-devops&tabs=yaml). Also, create a beta pool which contains the next version image of the hosted pool, and add jobs in our post merge pipeline to test if the next version image will break our CI. So, usually we will have at least one week to prepare. 2. Change the cmake generator in use in our pipelines from "Ninja" to "MingW Makefile", because the latest version of cmake doesn't work with the latest version of Ninja. People who prefer Ninja could still use ninja in their local build by passing "--cmake_generator ninja" to [build.py](https://github.com/microsoft/onnxruntime/blob/main/tools/ci_build/build.py). 3. Delete eager mode CI pipeline. ### Motivation and Context I need to update the software we have in our CI build machines, and I need to resolve this incompatibility issue. In more detail, the build error I hit was: em++: error: CMakeFilesonnxruntime_mlas_test.dirC_a_work1sonnxruntimetestmlasunittesttest_activation.cpp.o: No such file or directory ("CMakeFilesonnxruntime_mlas_test.dirC_a_work1sonnxruntimetestmlasunittesttest_activation.cpp.o" was expected to be an input file, based on the commandline arguments provided) After this PR we will deprecate python 3.7 support. The eager mode CI pipeline is the last one that still use python 3.7. Then we can rework the PR #10953 made by [fs-eire](https://github.com/fs-eire) last year. Fixed [AB#14435](https://aiinfra.visualstudio.com/6a833879-cd9b-44a4-a9de-adc2d818f13c/_workitems/edit/14435)
2023-04-10 17:41:04 +00:00
"Xcode",
],
default=None,
help="Specify the generator that CMake invokes.",
)
parser.add_argument("--use_dml", action="store_true", help="Build with DirectML.")
2020-04-19 03:48:30 +00:00
parser.add_argument(
"--dml_path",
type=str,
default="",
help="Path to a custom DirectML installation (must have bin/, lib/, and include/ subdirectories).",
)
parser.add_argument("--use_winml", action="store_true", help="Build with WinML.")
2020-04-19 03:48:30 +00:00
parser.add_argument(
"--winml_root_namespace_override", type=str, help="Specify the namespace that WinML builds into."
)
2020-04-19 03:48:30 +00:00
parser.add_argument(
"--dml_external_project", action="store_true", help="Build with DirectML as an external project."
)
parser.add_argument(
"--use_telemetry", action="store_true", help="Only official builds can set this flag to enable telemetry."
)
parser.add_argument("--enable_wcos", action="store_true", help="Build for Windows Core OS.")
Enable Address Sanitizer in CI (#19073) ### Description 1. Add two build jobs for enabling Address Sanitizer in CI. One for Windows CPU, One for Linux CPU. 2. Set default compiler flags/linker flags in build.py for normal Windows/Linux/MacOS build. This can help control compiler flags in a more centralized way. 3. All Windows binaries in our official packages will be built with "/PROFILE" flag. Symbols of onnxruntime.dll can be found at [Microsoft public symbol server](https://learn.microsoft.com/en-us/windows-hardware/drivers/debugger/microsoft-public-symbols). Limitations: 1. On Linux Address Sanitizer ignores RPATH settings in ELF binaries. Therefore once Address Sanitizer is enabled, before running tests we need to manually set LD_LIBRARY_PATH properly otherwise libonnxruntime.so may not be able to find custom ops and shared EPs. 4. On Linux we also need to set LD_PRELOAD before running some tests(if the main executable, like python, is not built with address sanitizer. On Windows we do not need to. 5. On Windows before running python tests we should manually copy address sanitizer DLL to the onnxruntime/capi directory, because python 3.8 and above has enabled "Safe DLL Search Mode" that wouldn't use the information provided by PATH env. 6. On Linux Address Sanitizer found a lot of memory leaks from our python binding code. Therefore right now we cannot enable Address Sanitizer when building ONNX Runtime with python binding. 7. Address Sanitizer itself uses a lot of memory address space and delays memory deallocations, which is easy to cause OOM issues in 32-bit applications. We cannot run all the tests in onnxruntime_test_all in 32-bit mode with Address Sanitizer due to this reason. However, we still can run individual tests in such a way. We just cannot run all of them in one process. ### Motivation and Context To catch memory issues.
2024-01-12 15:24:40 +00:00
# Do not enable LTO when the compiler is MSVC, the flag for generating debug symbols is set to /Z7, and training
# is also enabled: both LTO and /Z7 can significantly increase the size of *.obj/*.lib files, and on Windows
# there is a 4GB per-file limit (ERROR LNK1248). We may solve the issue by splitting the big static libs into smaller
# ones. Until that refactoring work is done, avoid enabling LTO and ccache at the same time, because ccache
# needs /Z7.
parser.add_argument("--enable_lto", action="store_true", help="Enable Link Time Optimization")
parser.add_argument("--enable_transformers_tool_test", action="store_true", help="Enable transformers tool test")
parser.add_argument(
"--use_acl",
action="store_true",
help="Build with ACL for ARM architectures.",
)
parser.add_argument("--acl_home", help="Path to ACL home dir")
parser.add_argument("--acl_libs", help="Path to ACL libraries")
parser.add_argument("--use_armnn", action="store_true", help="Enable ArmNN Execution Provider.")
parser.add_argument(
"--armnn_relu", action="store_true", help="Use the Relu operator implementation from the ArmNN EP."
)
parser.add_argument(
"--armnn_bn", action="store_true", help="Use the Batch Normalization operator implementation from the ArmNN EP."
)
parser.add_argument("--armnn_home", help="Path to ArmNN home dir")
parser.add_argument("--armnn_libs", help="Path to ArmNN libraries")
parser.add_argument("--build_micro_benchmarks", action="store_true", help="Build ONNXRuntime micro-benchmarks.")
# options to reduce binary size
parser.add_argument(
"--minimal_build",
default=None,
nargs="*",
type=str.lower,
help="Create a build that only supports ORT format models. "
"See https://onnxruntime.ai/docs/tutorials/mobile/ for more information. "
"RTTI is automatically disabled in a minimal build. "
"To enable execution providers that compile kernels at runtime (e.g. NNAPI) pass 'extended' "
"as a parameter. e.g. '--minimal_build extended'. "
"To enable support for custom operators pass 'custom_ops' as a parameter. "
"e.g. '--minimal_build custom_ops'. This can be combined with an 'extended' build by passing "
"'--minimal_build extended custom_ops'",
)
parser.add_argument(
"--include_ops_by_config",
type=str,
help="Include ops from config file. See /docs/Reduced_Operator_Kernel_build.md for more information.",
)
parser.add_argument(
"--enable_reduced_operator_type_support",
action="store_true",
help="If --include_ops_by_config is specified, and the configuration file has type reduction "
"information, limit the types individual operators support where possible to further "
"reduce the build size. "
"See /docs/Reduced_Operator_Kernel_build.md for more information.",
)
parser.add_argument("--disable_contrib_ops", action="store_true", help="Disable contrib ops (reduces binary size)")
parser.add_argument(
"--disable_ml_ops", action="store_true", help="Disable traditional ML ops (reduces binary size)"
)
# Please note in our CMakeLists.txt this is already default on. But in this file we reverse it to default OFF.
parser.add_argument("--disable_rtti", action="store_true", help="Disable RTTI (reduces binary size)")
parser.add_argument(
"--disable_types",
nargs="+",
default=[],
choices=["float8", "optional", "sparsetensor"],
help="Disable selected data types (reduces binary size)",
)
parser.add_argument(
"--disable_exceptions",
action="store_true",
help="Disable exceptions to reduce binary size. Requires --minimal_build.",
)
parser.add_argument("--rocm_version", help="The version of ROCM stack to use. ")
parser.add_argument("--use_rocm", action="store_true", help="Build with ROCm")
parser.add_argument("--rocm_home", help="Path to ROCm installation dir")
Android code coverage (#6061) * Added Onnxruntime_GCOV_COVERAGE flag for Android. * Set CMAKE_SYSTEM_NAME explicitly for Android. * Added GCOV_PREFIX option to collect code coverage data. Added a new python script to generate code coverage info. Modified build pipeline to generate Android code coverage info * Added build command line option --android_coverage * Added a comment describing the GCOV environment variables * Fixed PEP8 issues. * Added --android_coverage option to the build command. * Increased Android emulator memory from 3K to 8K. * Increased Android partition-size from 2GB to 4GB to overcome no-space-left-on-device error * Removed source_dir from command line args. * Use cwd absolute path to run tests. * Added commands to output the contents of /data/local/tmp on the emulator. * Added run_adb_shell function. * Format changes. * Removed keyword argument cwd. * Removed Android in the --build_dir path. * Removed commands added for debugging. * Removed extra new-lines. * Fix MacOs build pipeline failures by uninstalling openssl before running build script. * Revert "Fix MacOs build pipeline failures by uninstalling openssl before running build script." This reverts commit 90d0568fe533e9456c20d061a2d435c8fea48266. * Change dir to the build directory where the tar file is copied. * Changed the option from --android_coverage to --code_coverage * Moved steps to generate Android code coverage to run_nnapi_code_coverage.sh * Require --android option if --code_coverage is specified. * No code coverage needed for onnx_test_runner. * Expect that the emulator is running when the script is executed. * Fixed the title in the build pipeline step. * Fixed the formatting issue. * Added a command line argument, ORT_ROOT, to run_nnapi_code_coverage.sh script Co-authored-by: Satya Jandhyala <satyajandhyala@Satyas-Mac-mini.local>
2020-12-08 18:55:02 +00:00
# Code coverage
Add LearningModelBuilder to WinML Experimental Namespace along with various Audio operators (#6623) * model building * fix build * winml adapter model building api * model building * make build * make build again * add model building with audio op * inplace and inorder fft * add ifft * works! * cleanup * add comments * switch to iterative rather than recursive and use parallelization * batched parallelization * fft->dft * cleanup * window functions * add melweightmatrix op * updates to make spectrogram test work * push latest * add onesided * cleanup * Clean up building apis and fix mel * cleanup * cleanup * naive stft * fix test output * middle c complete * 3 tones * cleanup * signal def new line * Add save functionality * Perf improvements, 10x improvement * cleanup * use bitreverse lookup table for performance * implement constant initializers for tensors * small changes * add matmul tests * merge issues * support add attribute * add tests for double data type windowfunctions and minor cleanup * stft onesided/and not tests * cleanup * cleanup * clean up * cleanup * remove threading attribute * forward declare orttypeinfo * warnings * fwd declare * fix warnings * 1 more warning * remove saving to e drive... * cleanup and fix stft test * add opset picker * small additions * add onnxruntime tests * add signed/unsigned * fix warning * fix warning * finish onnxruntime tests * make windows namespace build succeed * add experimental flag * add experimental api into nuget package * add experimental api build flag and add to windows ai nuget package * turn experimental for tests * add minimum opset version to new experimental domain * api cleanup * disable ms experimental ops test when --ms_experimental is not enabled * add macro behind flag * remove unused x * pr feedback Co-authored-by: Sheil Kumar <sheilk@microsoft.com>
2021-02-12 22:17:10 +00:00
parser.add_argument(
"--code_coverage", action="store_true", help="Generate code coverage when targeting Android (only)."
)
# lazy tensor support.
parser.add_argument(
"--enable_lazy_tensor", action="store_true", help="Enable use ORT as backend in Pytorch LazyTensor."
)
parser.add_argument("--ms_experimental", action="store_true", help="Build microsoft experimental operators.")
parser.add_argument(
"--enable_external_custom_op_schemas",
action="store_true",
help="Enable registering user defined custom operation schemas at shared library load time.\
This feature is only supported/available on Ubuntu.",
)
parser.add_argument(
"--external_graph_transformer_path", type=str, help="path to the external graph transformer dir."
)
parser.add_argument(
"--enable_cuda_profiling",
action="store_true",
help="enable cuda kernel profiling, \
cupti library must be added to PATH beforehand.",
)
parser.add_argument("--use_cann", action="store_true", help="Build with CANN")
parser.add_argument("--cann_home", help="Path to CANN installation dir")
parser.add_argument(
"--enable_rocm_profiling",
action="store_true",
help="enable rocm kernel profiling.",
)
parser.add_argument("--use_xnnpack", action="store_true", help="Enable xnnpack EP.")
parser.add_argument("--use_avx512", action="store_true", help="Enable AVX512 instructions")
parser.add_argument("--use_azure", action="store_true", help="Enable azure EP.")
parser.add_argument("--use_cache", action="store_true", help="Use compiler cache in CI")
integrate triton into ort (#15862) ### Description In some scenarios, the triton written kernels are more performant than CK or other handwritten kernels, so we implement a framework that onnxruntime can use these triton written kernels. This PR is to integrate triton into ort, so that ort can use kernels that written and compiled by triton. The main change focus on two part: 1. a build part to compile triton written kernel and combine these kernels into libonnxruntime_providers_rocm.so 2. a loader and launcher in c++, for loading and launch triton written kernels. #### Build To compile triton written kernel, add a script `tools/ci_build/compile_triton.py`. This script will dynamic load all kernel files, compile them, and generate `triton_kernel_infos.a` and `triton_kernel_infos.h`. `triton_kernel_infos.a` contains all compiled kernel instructions, this file will be combined into libonnxruntime_providers_rocm.so, using --whole-archive flag. `triton_kernel_infos.h` defines a const array that contains all the metadata for each compiled kernel. These metadata will be used for load and launch. So this header file is included by 'triton_kernel.cu' which defines load and launch functions. Add a build flag in build.py and CMakeList.txt, when building rocm provider, it will call triton_kernel build command, and generate all necessary files. #### C++ Load and Launch On c++ part, we implement load and launch functions in triton_kernel.cu and triton_kernel.h. These two files located in `providers/cuda`, and when compiling rocm, they will be hipified. so this part supports both cuda and rocm. But currently we only call triton kernel in rocm. We also implement a softmax triton op for example. Because there will generate many kernels for different input shape of softmax, we use TunableOp to select the best one. ### Motivation and Context <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. -->
2023-05-17 01:35:28 +00:00
parser.add_argument("--use_triton_kernel", action="store_true", help="Use triton compiled kernels")
parser.add_argument("--use_lock_free_queue", action="store_true", help="Use lock-free task queue for threadpool.")
integrate triton into ort (#15862) ### Description In some scenarios, the triton written kernels are more performant than CK or other handwritten kernels, so we implement a framework that onnxruntime can use these triton written kernels. This PR is to integrate triton into ort, so that ort can use kernels that written and compiled by triton. The main change focus on two part: 1. a build part to compile triton written kernel and combine these kernels into libonnxruntime_providers_rocm.so 2. a loader and launcher in c++, for loading and launch triton written kernels. #### Build To compile triton written kernel, add a script `tools/ci_build/compile_triton.py`. This script will dynamic load all kernel files, compile them, and generate `triton_kernel_infos.a` and `triton_kernel_infos.h`. `triton_kernel_infos.a` contains all compiled kernel instructions, this file will be combined into libonnxruntime_providers_rocm.so, using --whole-archive flag. `triton_kernel_infos.h` defines a const array that contains all the metadata for each compiled kernel. These metadata will be used for load and launch. So this header file is included by 'triton_kernel.cu' which defines load and launch functions. Add a build flag in build.py and CMakeList.txt, when building rocm provider, it will call triton_kernel build command, and generate all necessary files. #### C++ Load and Launch On c++ part, we implement load and launch functions in triton_kernel.cu and triton_kernel.h. These two files located in `providers/cuda`, and when compiling rocm, they will be hipified. so this part supports both cuda and rocm. But currently we only call triton kernel in rocm. We also implement a softmax triton op for example. Because there will generate many kernels for different input shape of softmax, we use TunableOp to select the best one. ### Motivation and Context <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. -->
2023-05-17 01:35:28 +00:00
if not is_windows():
parser.add_argument(
"--allow_running_as_root",
action="store_true",
help="Allow build to be run as root user. This is not allowed by default.",
)
args = parser.parse_args()
if args.android_sdk_path:
args.android_sdk_path = os.path.normpath(args.android_sdk_path)
if args.android_ndk_path:
args.android_ndk_path = os.path.normpath(args.android_ndk_path)
if args.enable_wasm_api_exception_catching:
# if we catch on api level, we don't want to catch all
args.disable_wasm_exception_catching = True
if not args.disable_wasm_exception_catching or args.enable_wasm_api_exception_catching:
# doesn't make sense to catch if no one throws
args.enable_wasm_exception_throwing_override = True
if args.cmake_generator is None and is_windows():
args.cmake_generator = "Ninja" if args.build_wasm else "Visual Studio 17 2022"
if args.enable_cuda_nhwc_ops:
warnings.warn(
"The argument '--enable_cuda_nhwc_ops' is deprecated and is default to True. ", DeprecationWarning
)
return args
2018-11-20 00:48:22 +00:00
2020-04-19 03:48:30 +00:00
def is_reduced_ops_build(args):
    """Report whether an operator config file was supplied for this build.

    Args:
        args: Parsed command-line namespace. Only the ``include_ops_by_config``
            attribute (set by ``--include_ops_by_config``) is read.

    Returns:
        bool: True when ``args.include_ops_by_config`` is anything other than
        ``None``; False when it is ``None``.
    """
    # Identity check on purpose: an explicitly passed empty string still counts
    # as "the option was given", exactly as in the original expression.
    return args.include_ops_by_config is not None
def resolve_executable_path(command_or_path):
    """Return the absolute path of an executable.

    Args:
        command_or_path: A command name or a path to an executable.

    Returns:
        The absolute path found via ``shutil.which``, or ``None`` when the
        input is ``None``, empty, or whitespace only.

    Raises:
        BuildError: If a non-blank input cannot be resolved to an executable.
    """
    # Nothing usable was given: report "no executable" instead of failing.
    if not command_or_path or not command_or_path.strip():
        return None

    located = shutil.which(command_or_path)
    if located is None:
        raise BuildError(f"Failed to resolve executable path for '{command_or_path}'.")
    return os.path.abspath(located)
2020-04-19 03:48:30 +00:00
def get_linux_distro(os_release_path="/etc/os-release"):
    """Return the ``(NAME, VERSION)`` pair declared in an os-release file.

    Blank lines, ``#`` comment lines and lines without ``=`` are skipped, so
    one such line no longer makes the whole lookup fail. Surrounding double
    or single quotes are removed from the returned values.

    Args:
        os_release_path: File to read. Defaults to ``/etc/os-release``.

    Returns:
        A tuple ``(name, version)`` of strings. A field that is absent is
        returned as ``""``; ``("", "")`` is returned when the file cannot be
        read or decoded.
    """
    dist_info = {}
    try:
        with open(os_release_path, encoding="utf-8") as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line or line.startswith("#"):
                    continue
                key, separator, value = line.partition("=")
                if not separator:
                    # Not a KEY=VALUE assignment; ignore rather than abort.
                    continue
                dist_info[key] = value
    except (OSError, ValueError):
        # ValueError also covers UnicodeDecodeError on a malformed file.
        return "", ""

    quotes = "\"'"
    return dist_info.get("NAME", "").strip(quotes), dist_info.get("VERSION", "").strip(quotes)
2020-04-19 03:48:30 +00:00
2018-11-20 00:48:22 +00:00
def get_config_build_dir(build_dir, config):
    """Return the build directory used for one configuration.

    Each configuration gets its own sub-directory of ``build_dir``, named
    after the configuration.
    """
    per_config_dir = os.path.join(build_dir, config)
    return per_config_dir
2020-04-19 03:48:30 +00:00
def run_subprocess(
    args,
    cwd=None,
    capture_stdout=False,
    dll_path=None,
    shell=False,
    env=None,
    python_path=None,
    cuda_home=None,
):
    """Run a command via ``run`` with an environment derived from ``os.environ``.

    Args:
        args: Sequence of strings forming the command line. A single string
            is rejected.
        cwd: Forwarded to ``run``.
        capture_stdout: Forwarded to ``run``.
        dll_path: Extra library directory. Prepended to ``PATH`` on Windows,
            appended to ``LD_LIBRARY_PATH`` elsewhere.
        shell: Forwarded to ``run``.
        env: Optional mapping of variables applied last, so its entries
            override everything computed here.
        python_path: Extra entry appended to ``PYTHONPATH``.
        cuda_home: CUDA root directory; its ``bin`` sub-directory is
            prepended to ``PATH`` so that our cmake file can find nvcc.

    Returns:
        Whatever ``run`` returns.

    Raises:
        ValueError: If ``args`` is a single string rather than a sequence.
    """
    if isinstance(args, str):
        raise ValueError("args should be a sequence of strings, not a string")

    my_env = os.environ.copy()

    def _prepend(name, value):
        # Put value ahead of any existing entries; create the variable if unset.
        if name in my_env:
            my_env[name] = value + os.pathsep + my_env[name]
        else:
            my_env[name] = value

    def _append(name, value):
        # Put value after any existing entries; create the variable if unset.
        if name in my_env:
            my_env[name] += os.pathsep + value
        else:
            my_env[name] = value

    if dll_path:
        if is_windows():
            _prepend("PATH", dll_path)
        else:
            _append("LD_LIBRARY_PATH", dll_path)

    # Add nvcc's folder to PATH env so that our cmake file can find nvcc.
    # Handled like the Windows dll_path case: an unset PATH is created
    # instead of raising KeyError.
    if cuda_home:
        _prepend("PATH", os.path.join(cuda_home, "bin"))

    if python_path:
        _append("PYTHONPATH", python_path)

    # Caller-supplied variables take precedence over everything above.
    if env:
        my_env.update(env)

    log.info(" ".join(args))
    return run(*args, cwd=cwd, capture_stdout=capture_stdout, shell=shell, env=my_env)
2018-11-20 00:48:22 +00:00
2020-04-19 03:48:30 +00:00
2018-11-20 00:48:22 +00:00
def update_submodules(source_dir):
    """Sync and then initialise/update all git submodules, recursively.

    Both git commands are run through ``run_subprocess`` with ``source_dir``
    as the working directory.
    """
    submodule_steps = (
        ["sync", "--recursive"],
        ["update", "--init", "--recursive"],
    )
    for step in submodule_steps:
        run_subprocess(["git", "submodule", *step], cwd=source_dir)
2020-04-19 03:48:30 +00:00
2018-11-20 00:48:22 +00:00
def setup_test_data(source_onnx_model_dir, dest_model_dir_name, build_dir, configs):
    """Create the symlink/shortcut of the onnx models dir under ``build_dir``.

    Currently there are 2 sources of onnx models: one is built into the OS
    image, the other is from {source_dir}/js/test, which is downloaded from
    onnx web.

    On Windows a directory junction is created at
    ``<build_dir>/<dest_model_dir_name>`` and another inside every
    per-configuration build directory. Elsewhere a single symlink is created
    at ``<build_dir>/<dest_model_dir_name>``. Links that already exist are
    left untouched, and nothing is linked when no source directory exists.

    Args:
        source_onnx_model_dir: Directory holding the models to expose.
        dest_model_dir_name: Name of the link to create.
        build_dir: Root build directory.
        configs: Build configurations (used on Windows only).
    """
    src_model_dir = os.path.join(build_dir, dest_model_dir_name)

    if not is_windows():
        if os.path.exists(source_onnx_model_dir) and not os.path.exists(src_model_dir):
            log.debug(f"create symlink {source_onnx_model_dir} -> {src_model_dir}")
            os.symlink(source_onnx_model_dir, src_model_dir, target_is_directory=True)
        return

    def make_shortcut(link_path, target_path):
        # mklink is a cmd.exe builtin, hence shell=True.
        log.debug(f"creating shortcut {target_path} -> {link_path}")
        run_subprocess(["mklink", "/D", "/J", link_path, target_path], shell=True)

    if os.path.exists(source_onnx_model_dir) and not os.path.exists(src_model_dir):
        make_shortcut(src_model_dir, source_onnx_model_dir)

    for config in configs:
        config_build_dir = get_config_build_dir(build_dir, config)
        os.makedirs(config_build_dir, exist_ok=True)
        dest_model_dir = os.path.join(config_build_dir, dest_model_dir_name)
        if os.path.exists(source_onnx_model_dir) and not os.path.exists(dest_model_dir):
            make_shortcut(dest_model_dir, source_onnx_model_dir)
        elif os.path.exists(src_model_dir) and not os.path.exists(dest_model_dir):
            # The source is gone; fall back to the build_dir-level link if it exists.
            make_shortcut(dest_model_dir, src_model_dir)
2020-04-19 03:48:30 +00:00
def use_dev_mode(args):
    """Decide whether the build should run in dev mode.

    Returns ``False`` when any of the following holds, and ``True`` otherwise:

    * ``--compile_no_warning_as_error``, ACL or ArmNN is requested;
    * an iOS or visionOS build is requested on a macOS host;
    * the ``SYSTEM_COLLECTIONURI`` environment variable is set to something
      other than ``https://dev.azure.com/onnxruntime/``.
    """
    if args.compile_no_warning_as_error or args.use_acl or args.use_armnn:
        return False

    targets_apple_device = args.ios or args.visionos
    if targets_apple_device and is_macOS():
        return False

    collection_uri = os.getenv("SYSTEM_COLLECTIONURI")
    return not collection_uri or collection_uri == "https://dev.azure.com/onnxruntime/"
def add_default_definition(definition_list, key, default_value):
    """Append ``key=default_value`` unless ``key`` is already defined.

    An entry counts as defining ``key`` when it starts with ``key=``.

    Args:
        definition_list: List of ``KEY=VALUE`` strings; modified in place.
        key: Definition name to look for.
        default_value: Value (a string) to use when ``key`` is missing.

    Returns:
        ``definition_list`` when ``key`` was already present; ``None`` when
        the default was appended.
    """
    prefix = key + "="
    if any(entry.startswith(prefix) for entry in definition_list):
        return definition_list
    definition_list.append(prefix + default_value)
def normalize_arg_list(nested_list):
    """Flatten one level of nesting into a new list.

    A falsy input (``None`` or an empty list) yields ``[]``.
    """
    flattened = []
    if nested_list:
        for group in nested_list:
            flattened.extend(group)
    return flattened
def number_of_parallel_jobs(args):
    """Return the number of parallel build jobs to use.

    ``args.parallel == 0`` means "use every CPU" and maps to
    ``os.cpu_count()``; any other value is returned unchanged.
    """
    requested_jobs = args.parallel
    if requested_jobs == 0:
        return os.cpu_count()
    return requested_jobs
def number_of_nvcc_threads(args):
    """Return how many threads each nvcc invocation should use.

    A non-negative ``args.nvcc_threads`` is returned as-is. Otherwise the
    value is estimated from the available memory reported by ``psutil``:

    * more than 60 GiB available -> 4 threads;
    * otherwise enough threads that each one has at least 4 GiB, given the
      number of attention ``.cu`` files expected to compile concurrently
      (never fewer than 1 thread).

    Falls back to 1 when ``psutil`` cannot be imported or does not report a
    positive integer.
    """
    explicit_threads = args.nvcc_threads
    if explicit_threads >= 0:
        return explicit_threads

    gib = 1024 * 1024 * 1024
    nvcc_threads = 1
    try:
        import psutil

        available_memory = psutil.virtual_memory().available
        reading_is_usable = isinstance(available_memory, int) and available_memory > 0
        if reading_is_usable and available_memory > 60 * gib:
            # When available memory is large enough, chance of OOM is small.
            nvcc_threads = 4
        elif reading_is_usable:
            # NVCC needs a lot of memory for the flash attention / cutlass fmha cu files.
            # Pick a thread count so that every thread gets at least 4 GB. For example,
            # Standard_NC4as_T4_v3 has 4 CPUs and 28 GB memory: with parallel=4, nvcc_threads=2
            # would mean 4 * 2 nvcc threads, which barely fits in 28 GB, so nvcc_threads=1 is used.
            memory_per_thread = 4 * gib
            concurrent_cu_files = 4 if is_windows() else 16
            fmha_parallel_jobs = min(concurrent_cu_files, number_of_parallel_jobs(args))
            nvcc_threads = max(1, int(available_memory / (memory_per_thread * fmha_parallel_jobs)))
            print(
                f"nvcc_threads={nvcc_threads} to ensure memory per thread >= 4GB for available_memory={available_memory} and fmha_parallel_jobs={fmha_parallel_jobs}"
            )
    except ImportError:
        print(
            "Failed to import psutil. Please `pip install psutil` for better estimation of nvcc threads. Use nvcc_threads=1"
        )

    return nvcc_threads
def generate_build_tree(
cmake_path,
source_dir,
build_dir,
cuda_home,
cudnn_home,
rocm_home,
mpi_home,
nccl_home,
tensorrt_home,
migraphx_home,
acl_home,
acl_libs,
armnn_home,
armnn_libs,
qnn_home,
snpe_root,
cann_home,
path_to_protoc_exe,
configs,
cmake_extra_defines,
args,
cmake_extra_args,
):
2018-11-20 00:48:22 +00:00
log.info("Generating CMake build tree")
cmake_dir = os.path.join(source_dir, "cmake")
cmake_args = [cmake_path, cmake_dir]
if not use_dev_mode(args):
cmake_args += ["--compile-no-warning-as-error"]
types_to_disable = args.disable_types
# enable/disable float 8 types
2023-12-11 03:37:29 +00:00
disable_float8_types = args.android or ("float8" in types_to_disable)
disable_optional_type = "optional" in types_to_disable
disable_sparse_tensors = "sparsetensor" in types_to_disable
cmake_args += [
"-Donnxruntime_RUN_ONNX_TESTS=" + ("ON" if args.enable_onnx_tests else "OFF"),
2020-04-19 03:48:30 +00:00
"-Donnxruntime_GENERATE_TEST_REPORTS=ON",
Update manylinux build scripts and GPU CUDA version from 11.0 to 11.1 (#7632) 1. Update manylinux build scripts. This will add [PEP600](https://www.python.org/dev/peps/pep-0600/)(manylinux2 tags) support. numpy has adopted this new feature, we should do the same. The old build script files were copied from https://github.com/pypa/manylinux, but they has been deleted and replaced in the upstream repo. The manylinux repo doesn't have a manylinux2014 branch anymore. So I'm removing the obsolete code, sync the files with the latest master. 2. Update GPU CUDA version from 11.0 to 11.1(after a discussion with PMs). 3. Delete tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda10_2. (Merged the content to tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11) 4. Modernize the cmake code of how to locate python devel files. It was suggested in https://github.com/onnx/onnx/pull/1631 . 5. Remove `onnxruntime_MSVC_STATIC_RUNTIME` and `onnxruntime_GCC_STATIC_CPP_RUNTIME` build options. Now cmake has builtin support for it. Starting from cmake 3.15, we can use `CMAKE_MSVC_RUNTIME_LIBRARY` cmake variable to choose which MSVC runtime library we want to use. 6. Update Ubuntu docker images that used in our CI build from Ubuntu 18.04 to Ubuntu 20.04. 7. Update GCC version in CUDA 11.1 pipelines from 8.x to 9.3.1 8. Split Linux GPU CI pipeline to two jobs: build the code on a CPU machine then run the tests on another GPU machines. In the past we didn't test our python packages. We only tested the pre-packed files. So we didn't catch the rpath issue in CI build. 9. Add a CentOS machine pool and test our Linux GPU build on real CentOS machines. 10. Rework ARM64 Linux GPU python packaging pipeline. Previously it uses cross-compiling therefore we must static link to C Runtime. But now have pluggable EP API and it doesn't support static link. So I changed to use qemu emulation instead. Now the build is 10x slower than before. But it is more extensible.
2021-06-03 06:36:49 +00:00
# There are two ways of locating the Python C API header files: "find_package(PythonLibs 3.5 REQUIRED)"
# and "find_package(Python 3.5 COMPONENTS Development.Module)". The first one is deprecated and
# depends on the "PYTHON_EXECUTABLE" variable; the second needs "Python_EXECUTABLE". We set both
# variables here for the best compatibility.
"-DPython_EXECUTABLE=" + sys.executable,
2020-04-19 03:48:30 +00:00
"-DPYTHON_EXECUTABLE=" + sys.executable,
Create CMake option `onnxruntime_USE_VCPKG` (#21348) ### Changes 1. CMake option `onnxruntime_USE_VCPKG`. It will be used in the vcpkg port * Unit test may fail because this option leads to a mixture of unexpected external library versions. Especially ONNX, Protobuf, and Flatbuffers version can be different 2. Overhaul of `onnxruntime_external_deps.cmake` * Make `FetchContent_Declare` to try `find_package`. See https://cmake.org/cmake/help/latest/guide/using-dependencies/index.html * Relocated `FetchContent_Declare` and `FetchContent_MakeAvailable`(or `onnxruntime_fetchcontent_makeavailable`) to closer lines. It was too hard to navigate the entire file to search related sections... * Alias `IMPORTED` targets like build targets (e.g. `ONNX::onnx` --> `onnx`) ```cmake # The script uses `find_package` with the changes. # In this case, use vcpkg to search dependencies # See https://cmake.org/cmake/help/latest/guide/using-dependencies/index.html include(external/onnxruntime_external_deps.cmake) ``` 3. Create CMakePresets.json and presets to [run vcpkg in manifest mode](https://learn.microsoft.com/en-us/vcpkg/concepts/manifest-mode) * Currently, it's NOT for training build * Main triplets are `x64-windows` and `x64-osx` ```pwsh Push-Location "cmake" cmake --preset "x64-windows-vcpkg" cmake --build --preset "x64-windows-vcpkg-debug" Pop-Location ``` ```bash pushd "cmake" cmake --preset "x64-osx-vcpkg" cmake --build --preset "x64-osx-vcpkg-debug" popd ``` 4. 
Updated tools/ci_build/build.py * `--use_vcpkg` option: it needs `CMAKE_TOOLCHAIN_FILE` with [vcpkg.cmake toolchain script](https://github.com/microsoft/vcpkg/blob/master/scripts/buildsystems/vcpkg.cmake) * `--compile_no_warning_as_error` is recommended because library version differences will cause unexpected compiler warnings ```bash python ./tools/ci_build/build.py \ --compile_no_warning_as_error \ --use_vcpkg \ --cmake_extra_defines "CMAKE_TOOLCHAIN_FILE:FILEPATH=${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake" \ --cmake_extra_defines "VCPKG_TARGET_TRIPLET=..." ``` 5. Created Job `Vcpkg` for Windows and macOS * Show how to setup and use vcpkg. Similar to the CMakePresets.json usage ### Motivation and Context * Help #7150 * Help https://github.com/microsoft/vcpkg/pull/36850 * https://github.com/luncliff/vcpkg-registry/pull/212 * https://github.com/microsoft/vcpkg/pull/39881 * https://github.com/luncliff/vcpkg-registry/pull/215 * https://github.com/luncliff/vcpkg-registry/pull/216 * https://github.com/luncliff/vcpkg-registry/pull/227 * https://cmake.org/cmake/help/latest/guide/using-dependencies/index.html * https://github.com/microsoft/vcpkg/blob/master/scripts/buildsystems/vcpkg.cmake ### Future Works? More feature coverage with the vcpkg supported libraries * CUDA feature support * Training feature support
2024-09-10 23:39:27 +00:00
"-Donnxruntime_USE_VCPKG=" + ("ON" if args.use_vcpkg else "OFF"),
"-Donnxruntime_USE_MIMALLOC=" + ("ON" if args.use_mimalloc else "OFF"),
"-Donnxruntime_ENABLE_PYTHON=" + ("ON" if args.enable_pybind else "OFF"),
2020-04-19 03:48:30 +00:00
"-Donnxruntime_BUILD_CSHARP=" + ("ON" if args.build_csharp else "OFF"),
"-Donnxruntime_BUILD_JAVA=" + ("ON" if args.build_java else "OFF"),
"-Donnxruntime_BUILD_NODEJS=" + ("ON" if args.build_nodejs else "OFF"),
"-Donnxruntime_BUILD_OBJC=" + ("ON" if args.build_objc else "OFF"),
"-Donnxruntime_BUILD_SHARED_LIB=" + ("ON" if args.build_shared_lib else "OFF"),
"-Donnxruntime_BUILD_APPLE_FRAMEWORK=" + ("ON" if args.build_apple_framework else "OFF"),
2020-04-19 03:48:30 +00:00
"-Donnxruntime_USE_DNNL=" + ("ON" if args.use_dnnl else "OFF"),
The initial PR for NNAPI EP (#4287) * Move nnapi dnnlib to subfolder * dnnlib compile settings * add nnapi buildin build.py * add onnxruntime_USE_NNAPI_BUILTIN * compile using onnxruntime_USE_NNAPI_BUILTIN * remove dnnlib from built in code * Group onnxruntime_USE_NNAPI_BUILTIN sources * add file stubs * java 32bit compile error * built in nnapi support 5-26 * init working version * initializer support * fix crash on free execution * add dynamic input support * bug fixes for dynamic input shape, add mul support, working on conv and batchnorm * Add batchnormalization, add overflow check for int64 attributes * add global average/max pool and reshape * minor changes * minor changes * add skip relu and options to use different type of memory * small bug fix for in operator relu * bug fix for nnapi * add transpose support, minor bug fix * Add transpose support * minor bug fixes, depthwise conv weight fix * fixed the bug where the onnx model input has mismatch order than the nnapi model input * add helper to add scalar operand * add separated opbuilder to handle single operator * add cast operator * fixed reshape, moved some logs to verbose * Add softmax and identity support, change shaper calling signature, and add support for int32 output * changed the way to execute the NNAPI * move NNMemory and InputOutputInfo into Model class * add limited support for input dynamic shape * add gemm support, fixed crash when allocating big array on stack * add abs/exp/floor/log/sigmoid/neg/sin/sqrt/tanh support * better dynamic input shape support; * add more check for IsOpSupportedImpl, refactored some code * some code style fix, switch to safeint * Move opbuilders to a map with single instance, minor bug fixes * add GetUniqueName for new temp tensors * change from throw std to ort_throw * build settings change and 3rd party notice update * add readme for nnapi_lib, move to ort log, add comments to public functions, clean the code * add android log sink and more logging changes, add 
new string for NnApiErrorDescription * add nnapi execution options/fp16 relax * fix a dnnlibrary build break * addressed review comments * address review comments, changed adding output for subgraph in NnapiExecutionProvider::GetCapability, minor issue fixes * formatting in build.py * more formatting fix in build.py, return fail status instead of throw in compute_func * moved android_log_sink to platform folder, minor coding style changes * addressed review comments
2020-06-26 07:02:39 +00:00
"-Donnxruntime_USE_NNAPI_BUILTIN=" + ("ON" if args.use_nnapi else "OFF"),
"-Donnxruntime_USE_VSINPU=" + ("ON" if args.use_vsinpu else "OFF"),
"-Donnxruntime_USE_RKNPU=" + ("ON" if args.use_rknpu else "OFF"),
"-Donnxruntime_USE_LLVM=" + ("ON" if args.use_tvm else "OFF"),
"-Donnxruntime_ENABLE_MICROSOFT_INTERNAL=" + ("ON" if args.enable_msinternal else "OFF"),
"-Donnxruntime_USE_VITISAI=" + ("ON" if args.use_vitisai else "OFF"),
2020-04-19 03:48:30 +00:00
"-Donnxruntime_USE_TENSORRT=" + ("ON" if args.use_tensorrt else "OFF"),
"-Donnxruntime_USE_TENSORRT_BUILTIN_PARSER="
+ ("ON" if args.use_tensorrt_builtin_parser and not args.use_tensorrt_oss_parser else "OFF"),
[TVM EP] Rename Standalone TVM (STVM) Execution Provider to TVM EP (#10260) * update java API for STVM EP. Issue is from PR#10019 * use_stvm -> use_tvm * rename stvm worktree * STVMAllocator -> TVMAllocator * StvmExecutionProviderInfo -> TvmExecutionProviderInfo * stvm -> tvm for cpu_targets. resolve onnxruntime::tvm and origin tvm namespaces conflict * STVMRunner -> TVMRunner * StvmExecutionProvider -> TvmExecutionProvider * tvm::env_vars * StvmProviderFactory -> TvmProviderFactory * rename factory funcs * StvmCPUDataTransfer -> TvmCPUDataTransfer * small clean * STVMFuncState -> TVMFuncState * USE_TVM -> NUPHAR_USE_TVM * USE_STVM -> USE_TVM * python API: providers.stvm -> providers.tvm. clean TVM_EP.md * clean build scripts #1 * clean build scripts, java frontend and others #2 * once more clean #3 * fix build of nuphar tvm test * final transfer stvm namespace to onnxruntime::tvm * rename stvm->tvm * NUPHAR_USE_TVM -> USE_NUPHAR_TVM * small fixes for correct CI tests * clean after rebase. Last renaming stvm to tvm, separate TVM and Nuphar in cmake and build files * update CUDA support for TVM EP * roll back CudaNN home check * ERROR for not positive input shape dimension instead of WARNING * update documentation for CUDA * small corrections after review * update GPU description * update GPU description * misprints were fixed * cleaned up error msgs Co-authored-by: Valery Chernov <valery.chernov@deelvin.com> Co-authored-by: KJlaccHoeUM9l <wotpricol@mail.ru> Co-authored-by: Thierry Moreau <tmoreau@octoml.ai>
2022-02-15 09:21:02 +00:00
# set vars for TVM
"-Donnxruntime_USE_TVM=" + ("ON" if args.use_tvm else "OFF"),
"-Donnxruntime_TVM_CUDA_RUNTIME=" + ("ON" if args.use_tvm and args.tvm_cuda_runtime else "OFF"),
"-Donnxruntime_TVM_USE_HASH=" + ("ON" if args.use_tvm_hash else "OFF"),
# set vars for migraphx
"-Donnxruntime_USE_MIGRAPHX=" + ("ON" if args.use_migraphx else "OFF"),
"-Donnxruntime_DISABLE_CONTRIB_OPS=" + ("ON" if args.disable_contrib_ops else "OFF"),
"-Donnxruntime_DISABLE_ML_OPS=" + ("ON" if args.disable_ml_ops else "OFF"),
"-Donnxruntime_DISABLE_RTTI="
+ ("ON" if args.disable_rtti or (args.minimal_build is not None and not args.enable_pybind) else "OFF"),
"-Donnxruntime_DISABLE_EXCEPTIONS=" + ("ON" if args.disable_exceptions else "OFF"),
# Need to use 'is not None' with minimal_build check as it could be an empty list.
"-Donnxruntime_MINIMAL_BUILD=" + ("ON" if args.minimal_build is not None else "OFF"),
"-Donnxruntime_EXTENDED_MINIMAL_BUILD="
+ ("ON" if args.minimal_build and "extended" in args.minimal_build else "OFF"),
"-Donnxruntime_MINIMAL_BUILD_CUSTOM_OPS="
+ (
"ON"
if (args.minimal_build is not None and ("custom_ops" in args.minimal_build or args.use_extensions))
else "OFF"
),
"-Donnxruntime_REDUCED_OPS_BUILD=" + ("ON" if is_reduced_ops_build(args) else "OFF"),
2020-04-19 03:48:30 +00:00
"-Donnxruntime_USE_DML=" + ("ON" if args.use_dml else "OFF"),
"-Donnxruntime_USE_WINML=" + ("ON" if args.use_winml else "OFF"),
Add LearningModelBuilder to WinML Experimental Namespace along with various Audio operators (#6623) * model building * fix build * winml adapter model building api * model building * make build * make build again * add model building with audio op * inplace and inorder fft * add ifft * works! * cleanup * add comments * switch to iterative rather than recursive and use parallelization * batched parallelization * fft->dft * cleanup * window functions * add melweightmatrix op * updates to make spectrogram test work * push latest * add onesided * cleanup * Clean up building apis and fix mel * cleanup * cleanup * naive stft * fix test output * middle c complete * 3 tones * cleanup * signal def new line * Add save functionality * Perf improvements, 10x improvement * cleanup * use bitreverse lookup table for performance * implement constant initializers for tensors * small changes * add matmul tests * merge issues * support add attribute * add tests for double data type windowfunctions and minor cleanup * stft onesided/and not tests * cleanup * cleanup * clean up * cleanup * remove threading attribute * forward declare orttypeinfo * warnings * fwd declare * fix warnings * 1 more warning * remove saving to e drive... * cleanup and fix stft test * add opset picker * small additions * add onnxruntime tests * add signed/unsigned * fix warning * fix warning * finish onnxruntime tests * make windows namespace build succeed * add experimental flag * add experimental api into nuget package * add experimental api build flag and add to windows ai nuget package * turn experimental for tests * add minimum opset version to new experimental domain * api cleanup * disable ms experimental ops test when --ms_experimental is not enabled * add macro behind flag * remove unused x * pr feedback Co-authored-by: Sheil Kumar <sheilk@microsoft.com>
2021-02-12 22:17:10 +00:00
"-Donnxruntime_BUILD_MS_EXPERIMENTAL_OPS=" + ("ON" if args.ms_experimental else "OFF"),
"-Donnxruntime_USE_TELEMETRY=" + ("ON" if args.use_telemetry else "OFF"),
2020-04-19 03:48:30 +00:00
"-Donnxruntime_ENABLE_LTO=" + ("ON" if args.enable_lto else "OFF"),
"-Donnxruntime_USE_ACL=" + ("ON" if args.use_acl else "OFF"),
"-Donnxruntime_USE_ARMNN=" + ("ON" if args.use_armnn else "OFF"),
"-Donnxruntime_ARMNN_RELU_USE_CPU=" + ("OFF" if args.armnn_relu else "ON"),
"-Donnxruntime_ARMNN_BN_USE_CPU=" + ("OFF" if args.armnn_bn else "ON"),
"-Donnxruntime_USE_JSEP=" + ("ON" if args.use_jsep else "OFF"),
"-Donnxruntime_USE_WEBGPU=" + ("ON" if args.use_webgpu else "OFF"),
Add implementation of WebGPU EP (#22591) ### Description This PR adds the actual implementation of the WebGPU EP based on https://github.com/microsoft/onnxruntime/pull/22318. This change includes the following: <details> <summary><b>core framework of WebGPU EP</b></summary> - WebGPU EP factory classes for: - handling WebGPU options - creating WebGPU EP instance - creating WebGPU context - WebGPU Execution Provider classes - GPU Buffer allocator - data transfer - Buffer management classes - Buffer Manager - BufferCacheManager - DisabledCacheManager - SimpleCacheManager - LazyReleaseCacheManager - BucketCacheManager - Program classes - Program (base) - Program Cache Key - Program Manager - Shader helper classes - Shader Helper - ShaderIndicesHelper - ShaderVariableHelper - Utils - GPU Query based profiler - compute context - string utils - Miscs - Python binding webgpu support (basic) </details> <details> <summary><b>Kernel implementation</b></summary> - onnx.ai (default opset): - Elementwise (math): Abs, Neg, Floor, Ceil, Reciprocal, Sqrt, Exp, Erf, Log, Sin, Cos, Tan, Asin, Acos, Atan, Sinh, Cosh, Asinh, Acosh, Atanh, Tanh, Not, Cast - Elementwise (activation): Sigmoid, HardSigmoid, Clip, Elu, Relu, LeakyRelu, ThresholdedRelu, Gelu - Binary (math): Add, Sub, Mul, Div, Pow, Equal, Greater, GreaterOrEqual, Less, LessOrEqual - (Tensors): Shape, Reshape, Squeeze, Unsqueeze - Where - Transpose - Concat - Expand - Gather - Tile - Range - LayerNormalization - com.microsoft - FastGelu - MatMulNBits - MultiHeadAttention - RotaryEmbedding - SkipLayerNormalization - LayerNormalization - SimplifiedLayerNormalization - SkipSimplifiedLayerNormalization </details> <details> <summary><b>Build, test and CI pipeline integration</b></summary> - build works for Windows, macOS and iOS - support onnxruntime_test_all and python node test - added a new unit test for `--use_external_dawn` build flag. 
- updated MacOS pipeline to build with WebGPU support - added a new pipeline for WebGPU Windows </details> This change does not include: - Node.js binding support for WebGPU (will be a separate PR)
2024-10-30 01:29:40 +00:00
"-Donnxruntime_USE_EXTERNAL_DAWN=" + ("ON" if args.use_external_dawn else "OFF"),
# Training related flags
"-Donnxruntime_ENABLE_NVTX_PROFILE=" + ("ON" if args.enable_nvtx_profile else "OFF"),
"-Donnxruntime_ENABLE_TRAINING=" + ("ON" if args.enable_training else "OFF"),
"-Donnxruntime_ENABLE_TRAINING_OPS=" + ("ON" if args.enable_training_ops else "OFF"),
"-Donnxruntime_ENABLE_TRAINING_APIS=" + ("ON" if args.enable_training_apis else "OFF"),
# Enable advanced computations such as AVX for some training-related ops.
"-Donnxruntime_ENABLE_CPU_FP16_OPS=" + ("ON" if args.enable_training else "OFF"),
"-Donnxruntime_USE_NCCL=" + ("ON" if args.enable_nccl else "OFF"),
"-Donnxruntime_BUILD_BENCHMARKS=" + ("ON" if args.build_micro_benchmarks else "OFF"),
"-Donnxruntime_USE_ROCM=" + ("ON" if args.use_rocm else "OFF"),
"-Donnxruntime_GCOV_COVERAGE=" + ("ON" if args.code_coverage else "OFF"),
"-Donnxruntime_USE_MPI=" + ("ON" if args.use_mpi else "OFF"),
"-Donnxruntime_ENABLE_MEMORY_PROFILE=" + ("ON" if args.enable_memory_profile else "OFF"),
"-Donnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO=" + ("ON" if args.enable_cuda_line_info else "OFF"),
"-Donnxruntime_USE_CUDA_NHWC_OPS=" + ("ON" if args.use_cuda and not args.disable_cuda_nhwc_ops else "OFF"),
"-Donnxruntime_BUILD_WEBASSEMBLY_STATIC_LIB=" + ("ON" if args.build_wasm_static_lib else "OFF"),
"-Donnxruntime_ENABLE_WEBASSEMBLY_EXCEPTION_CATCHING="
+ ("OFF" if args.disable_wasm_exception_catching else "ON"),
"-Donnxruntime_ENABLE_WEBASSEMBLY_API_EXCEPTION_CATCHING="
+ ("ON" if args.enable_wasm_api_exception_catching else "OFF"),
"-Donnxruntime_ENABLE_WEBASSEMBLY_EXCEPTION_THROWING="
+ ("ON" if args.enable_wasm_exception_throwing_override else "OFF"),
"-Donnxruntime_WEBASSEMBLY_RUN_TESTS_IN_BROWSER=" + ("ON" if args.wasm_run_tests_in_browser else "OFF"),
"-Donnxruntime_ENABLE_WEBASSEMBLY_THREADS=" + ("ON" if args.enable_wasm_threads else "OFF"),
"-Donnxruntime_ENABLE_WEBASSEMBLY_MEMORY64=" + ("ON" if args.enable_wasm_memory64 else "OFF"),
"-Donnxruntime_ENABLE_WEBASSEMBLY_DEBUG_INFO=" + ("ON" if args.enable_wasm_debug_info else "OFF"),
"-Donnxruntime_ENABLE_WEBASSEMBLY_PROFILING=" + ("ON" if args.enable_wasm_profiling else "OFF"),
"-Donnxruntime_ENABLE_LAZY_TENSOR=" + ("ON" if args.enable_lazy_tensor else "OFF"),
"-Donnxruntime_ENABLE_EXTERNAL_CUSTOM_OP_SCHEMAS="
+ ("ON" if args.enable_external_custom_op_schemas else "OFF"),
"-Donnxruntime_ENABLE_CUDA_PROFILING=" + ("ON" if args.enable_cuda_profiling else "OFF"),
"-Donnxruntime_ENABLE_ROCM_PROFILING=" + ("ON" if args.enable_rocm_profiling else "OFF"),
"-Donnxruntime_USE_XNNPACK=" + ("ON" if args.use_xnnpack else "OFF"),
"-Donnxruntime_USE_WEBNN=" + ("ON" if args.use_webnn else "OFF"),
"-Donnxruntime_USE_CANN=" + ("ON" if args.use_cann else "OFF"),
integrate triton into ort (#15862) ### Description In some scenarios, triton-written kernels are more performant than CK or other handwritten kernels, so we implement a framework that lets onnxruntime use these triton-written kernels. This PR integrates triton into ort, so that ort can use kernels that are written and compiled by triton. The main changes focus on two parts: 1. a build part to compile triton-written kernels and combine these kernels into libonnxruntime_providers_rocm.so 2. a loader and launcher in C++, for loading and launching triton-written kernels. #### Build To compile triton-written kernels, add a script `tools/ci_build/compile_triton.py`. This script dynamically loads all kernel files, compiles them, and generates `triton_kernel_infos.a` and `triton_kernel_infos.h`. `triton_kernel_infos.a` contains all compiled kernel instructions; this file is combined into libonnxruntime_providers_rocm.so using the --whole-archive flag. `triton_kernel_infos.h` defines a const array that contains all the metadata for each compiled kernel. This metadata is used for loading and launching, so this header file is included by 'triton_kernel.cu', which defines the load and launch functions. Add a build flag in build.py and CMakeLists.txt; when building the rocm provider, it calls the triton_kernel build command and generates all necessary files. #### C++ Load and Launch On the C++ side, we implement load and launch functions in triton_kernel.cu and triton_kernel.h. These two files are located in `providers/cuda`, and when compiling for rocm, they are hipified, so this part supports both cuda and rocm. Currently, however, we only call triton kernels in rocm. We also implement a softmax triton op as an example. Because many kernels are generated for the different input shapes of softmax, we use TunableOp to select the best one. ### Motivation and Context <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. -->
2023-05-17 01:35:28 +00:00
"-Donnxruntime_USE_TRITON_KERNEL=" + ("ON" if args.use_triton_kernel else "OFF"),
"-Donnxruntime_DISABLE_FLOAT8_TYPES=" + ("ON" if disable_float8_types else "OFF"),
"-Donnxruntime_DISABLE_SPARSE_TENSORS=" + ("ON" if disable_sparse_tensors else "OFF"),
"-Donnxruntime_DISABLE_OPTIONAL_TYPE=" + ("ON" if disable_optional_type else "OFF"),
2020-04-19 03:48:30 +00:00
]
if args.rv64:
add_default_definition(cmake_extra_defines, "onnxruntime_CROSS_COMPILING", "ON")
if not args.riscv_toolchain_root:
raise BuildError("The --riscv_toolchain_root option is required to build for riscv64.")
if not args.skip_tests and not args.riscv_qemu_path:
raise BuildError("The --riscv_qemu_path option is required for testing riscv64.")
cmake_args += [
"-DRISCV_TOOLCHAIN_ROOT:PATH=" + args.riscv_toolchain_root,
"-DRISCV_QEMU_PATH:PATH=" + args.riscv_qemu_path,
"-DCMAKE_TOOLCHAIN_FILE=" + os.path.join(source_dir, "cmake", "riscv64.toolchain.cmake"),
]
# By default on Windows we currently support only cross compiling for ARM/ARM64
# (no native compilation supported through this script).
if args.arm64 or args.arm64ec or args.arm:
add_default_definition(cmake_extra_defines, "onnxruntime_CROSS_COMPILING", "ON")
if args.use_extensions:
add_default_definition(cmake_extra_defines, "OPENCV_SKIP_SYSTEM_PROCESSOR_DETECTION", "ON")
if args.use_cache:
cmake_args.append("-Donnxruntime_BUILD_CACHE=ON")
if not (is_windows() and args.cmake_generator != "Ninja"):
cmake_args.append("-DCMAKE_CXX_COMPILER_LAUNCHER=ccache")
cmake_args.append("-DCMAKE_C_COMPILER_LAUNCHER=ccache")
if args.use_cuda:
cmake_args.append("-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache")
if args.use_rocm:
cmake_args.append("-DCMAKE_HIP_COMPILER_LAUNCHER=ccache")
Improve dependency management (#13523) ## Description 1. Convert some git submodules to cmake external projects 2. Update nsync from [1.23.0](https://github.com/google/nsync/releases/tag/1.23.0) to [1.25.0](https://github.com/google/nsync/releases/tag/1.25.0) 3. Update re2 from 2021-06-01 to 2022-06-01 4. Update wil from an old commit to 1.0.220914.1 tag 5. Update gtest to a newer commit so that it can optionally leverage absl/re2 for parsing command line flags. The following git submodules are deleted: 1. FP16 2. safeint 3. XNNPACK 4. cxxopts 5. dlpack 7. flatbuffers 8. googlebenchmark 9. json 10. mimalloc 11. mp11 12. pthreadpool More will come. ## Motivation and Context There are 3 ways of integrating 3rd party C/C++ libraries into ONNX Runtime: 1. Install them to a system location, then use cmake's find_package module to locate them. 2. Use git submodules 6. Use cmake's external projects(externalproject_add). At first when this project was just started, we considered both option 2 and option 3. We preferred option 2 because: 1. It's easier to handle authentication. At first this project was not open source, and it had some other non-public dependencies. If we use git submodule, ADO will handle authentication smoothly. Otherwise we need to manually pass tokens around and be very careful on not exposing them in build logs. 2. At that time, cmake fetched dependencies after "cmake" finished generating vcprojects/makefiles. So it was very difficult to make cflags consistent. Since cmake 3.11, it has a new command: FetchContent, which fetches dependencies when it generates vcprojects/makefiles just before add_subdirectories, so the parent project's variables/settings can be easily passed to the child projects. And when the project went on, we had some new concerns: 1. As we started to have more and more EPs and build configs, the number of submodules grew quickly. For more developers, most ORT submodules are not relevant to them. 
They shouldn't need to download all of them. 2. It is impossible to let two different build configs use two different versions of the same dependency. For example, right now we have protobuf 3.18.3 in the submodules. Then every EP must use the same version. Whenever we have a need to upgrade protobuf, we need to coordinate across the whole team and many external developers. I can't manage it anymore. 3. Some projects want to manage the dependencies in a different way, either because of their preference or because of compliance requirements. For example, some Microsoft teams want to use vcpkg, but we don't want to force every user of onnxruntime using vcpkg. 7. Someone wants to dynamically link to protobuf, but our build script only does static link. 8. Hard to handle security vulnerabilities. For example, whenever protobuf has a security patch, we have a lot of things to do. But if we allowed people to build ORT with a different version of protobuf without changing ORT"s source code, the customer who build ORT from source will be able to act on such things in a quicker way. They will not need to wait ORT having a patch release. 9. Every time we do a release, github will also publish a source file zip file and a source file tarball for us. But they are not usable, because they miss submodules. ### New features After this change, users will be able to: 1. Build the dependencies in the way they want, then install them to somewhere(for example, /usr or a temp folder). 2. Or download the dependencies by using cmake commands from these dependencies official website 3. Similar to the above, but use your private mirrors to migrate supply chain risks. 4. Use different versions of the dependencies, as long as our source code is compatible with them. For example, you may use you can't use protobuf 3.20.x as they need code changes in ONNX Runtime. 6. Only download the things the current build needs. 10. Avoid building external dependencies again and again in every build. 
### Breaking change The onnxruntime_PREFER_SYSTEM_LIB build option is removed; you can think of it as being ON by default from now on. If you don't like the new behavior, you can set FETCHCONTENT_TRY_FIND_PACKAGE_MODE to NEVER. Besides, for those who relied on the onnxruntime_PREFER_SYSTEM_LIB build option, please be aware that this PR changes find_package calls from Module mode to Config mode. For example, in the past, if you had installed protobuf via apt-get from Ubuntu 20.04's official repo, find_package could find and use it. After this PR, it won't, because the protobuf version provided by Ubuntu 20.04 is too old to support "config mode". This can be resolved by getting a newer version of protobuf from elsewhere.
2022-12-01 17:51:59 +00:00
# By default cmake does not check TLS/SSL certificates. Here we turn it on.
# But, in some cases you may also need to supply a CA file.
add_default_definition(cmake_extra_defines, "CMAKE_TLS_VERIFY", "ON")
add_default_definition(cmake_extra_defines, "FETCHCONTENT_QUIET", "OFF")
if args.external_graph_transformer_path:
cmake_args.append("-Donnxruntime_EXTERNAL_TRANSFORMER_SRC_PATH=" + args.external_graph_transformer_path)
2022-06-07 01:37:16 +00:00
if args.use_winml:
cmake_args.append("-Donnxruntime_BUILD_WINML_TESTS=" + ("OFF" if args.skip_winml_tests else "ON"))
if args.use_dnnl:
cmake_args.append("-Donnxruntime_DNNL_GPU_RUNTIME=" + args.dnnl_gpu_runtime)
cmake_args.append("-Donnxruntime_DNNL_OPENCL_ROOT=" + args.dnnl_opencl_root)
cmake_args.append("-Donnxruntime_DNNL_AARCH64_RUNTIME=" + args.dnnl_aarch64_runtime)
cmake_args.append("-Donnxruntime_DNNL_ACL_ROOT=" + args.dnnl_acl_root)
2022-06-07 01:37:16 +00:00
if args.build_wasm:
cmake_args.append("-Donnxruntime_ENABLE_WEBASSEMBLY_SIMD=" + ("ON" if args.enable_wasm_simd else "OFF"))
if args.use_migraphx:
cmake_args.append("-Donnxruntime_MIGRAPHX_HOME=" + migraphx_home)
if args.use_cuda:
nvcc_threads = number_of_nvcc_threads(args)
Flash Attention v2 MHA (#17227) ### Description Integrate Flash Attention V2 to PackedMultiHeadAttention, MultiHeadAttention and Attention operators. Flash Attention v2 source code is from https://github.com/Dao-AILab/flash-attention/tree/main/csrc/flash_attn/src. We did some change to remove dependency on Torch, then removed backward and bfloat16 related code. Add benchmark script (see benchmark_mha.sh) to compare different attention kernels for MultiHeadAttention operator. Current limitations for Flash Attention in PackedMultiHeadAttention, MultiHeadAttention and Attention operators: * Relative Position Bias is not supported * Different hidden size for Q and V is not supported * Only float16 is supported * Padding/attention mask is not supported * For MultiHeadAttention, when there is past or present input, bias shall be provided to activate flash attention * For Attention, past or present inputs will deactivate flash attention * Causal is not supported Some limitations (like attention mask and causal) might be removed later. Currently, Flash Attention v2 only works in Linux. For Windows, we will enable later with Cutlass 3.2. Two environment variables can be used for testing purpose: (1) `ORT_DISABLE_FLASH_ATTENTION` to disable flash attention. Default value is 0 (enable). Set it to "1" to disable it. (2) `ORT_MIN_SEQ_LEN_FLASH_ATTENTION_PACKED_QKV`. Default value is "513", which means that we only enable flash attention when sequence length is larger than 512 for packed QKV format. Set it to "0" if you want to use flash attention v2 whenever possible. ### Speedup The following result is from Standard_ND96amsr_A100_v4 VM (A100-SXM4-80GB GPU) using benchmark_mha.sh. The metric is TFLOPs per second for MultiHeadAttention operator. 
There are 3 input formats: * `Q,K,V` means separated inputs query, key and value of BxSxNH * `Q,KV` means packed KV, where key is 5D: BxSxNx2xH * `QKV` means packed QKV, where query is 5D: BxSxNx3xH Note that flash attention cannot use packed QKV format, so extra Transpose is needed. We found that TensorRT kernel is faster for sequence length <= 512 for packed QKV. The reason might be no transpose is needed for TensorRT kernel in this format. We also notice that, TensorRT kernel is faster for stable diffusion 512x512 image (see seq_len=4096, heads=8, head_dim=40 below), while flash attention v2 is faster for 1024x1024 image (see seq_len=16384, heads=8, head_dim=40 below). input format | batch size | sequence length | heads | head dim | flash_v2 (TFLOPs/s) | TensorRT (TFLOPs/s) | Memory Efficient Attention (TFLOPs/s) -- | -- | -- | -- | -- | -- | -- | -- Q,K,V | 32 | 512 | 64 | 32 | 78.1 | 60.0 | 39.3 Q,K,V | 32 | 512 | 128 | 16 | 46.8 | 44.1 | 21.7 Q,K,V | 16 | 1024 | 64 | 32 | 99.0 | 72.8 | 44.3 Q,K,V | 16 | 1024 | 128 | 16 | 54.7 | 49.2 | 23.4 Q,K,V | 8 | 2048 | 64 | 32 | 113.8 | 81.2 | 47.8 Q,K,V | 8 | 2048 | 128 | 16 | 59.7 | 51.9 | 24.7 Q,K,V | 4 | 4096 | 64 | 32 | 122.5 | 85.6 | 49.7 Q,K,V | 4 | 4096 | 128 | 16 | 62.5 | 53.3 | 25.3 Q,K,V | 2 | 8192 | 64 | 32 | 127.4 | 87.5 | 50.7 Q,K,V | 2 | 8192 | 128 | 16 | 64.0 | 54.2 | 25.6 Q,K,V | 1 | 16384 | 64 | 32 | 129.5 | 91.0 | 51.2 Q,K,V | 1 | 16384 | 128 | 16 | 64.7 | 54.5 | 25.8 Q,K,V | 1 | 4096 | 8 | 40 | 51.0 | 43.6 | 36.8 Q,K,V | 1 | 4096 | 8 | 80 | 97.7 | 77.0 | 55.5 Q,K,V | 1 | 4096 | 8 | 160 | 120.0 | 39.7 | 57.8 Q,K,V | 4 | 4096 | 8 | 40 | 89.0 | 84.4 | 49.2 Q,K,V | 4 | 4096 | 8 | 80 | 133.0 | 92.2 | 63.2 Q,K,V | 4 | 4096 | 8 | 160 | 164.8 | 42.7 | 63.8 Q,K,V | 1 | 16384 | 8 | 40 | 96.9 | 91.3 | 52.1 Q,K,V | 1 | 16384 | 8 | 80 | 142.9 | 101.5 | 65.6 Q,K,V | 1 | 16384 | 8 | 160 | 177.4 | 44.2 | 65.7 Q,K,V | 128 | 128 | 12 | 64 | 29.0 | 26.9 | 25.7 Q,K,V | 64 | 128 | 12 | 64 | 23.1 | 10.8 | 21.3 Q,K,V | 128 
| 384 | 12 | 64 | 83.5 | 60.8 | 55.7 Q,K,V | 64 | 384 | 12 | 64 | 72.6 | 40.5 | 52.8 Q,K,V | 128 | 512 | 12 | 64 | 98.9 | 77.9 | 62.1 Q,K,V | 64 | 512 | 12 | 64 | 94.7 | 75.6 | 60.4 Q,KV | 32 | 512 | 64 | 32 | 85.9 | 41.1 | 41.1 Q,KV | 32 | 512 | 128 | 16 | 47.1 | 21.6 | 21.6 Q,KV | 16 | 1024 | 64 | 32 | 104.4 | 45.8 | 45.8 Q,KV | 16 | 1024 | 128 | 16 | 54.7 | 23.6 | 23.6 Q,KV | 8 | 2048 | 64 | 32 | 116.8 | 48.5 | 48.5 Q,KV | 8 | 2048 | 128 | 16 | 59.8 | 24.7 | 24.7 Q,KV | 4 | 4096 | 64 | 32 | 124.2 | 50.1 | 50.1 Q,KV | 4 | 4096 | 128 | 16 | 62.6 | 25.3 | 25.3 Q,KV | 2 | 8192 | 64 | 32 | 128.5 | 50.8 | 50.9 Q,KV | 2 | 8192 | 128 | 16 | 64.1 | 25.6 | 25.6 Q,KV | 1 | 16384 | 64 | 32 | 129.4 | 51.2 | 51.2 Q,KV | 1 | 16384 | 128 | 16 | 64.8 | 25.8 | 25.8 Q,KV | 1 | 4096 | 8 | 40 | 67.5 | 37.7 | 37.5 Q,KV | 1 | 4096 | 8 | 80 | 101.3 | 56.7 | 56.6 Q,KV | 1 | 4096 | 8 | 160 | 124.0 | 58.6 | 58.6 Q,KV | 4 | 4096 | 8 | 40 | 90.8 | 49.8 | 49.8 Q,KV | 4 | 4096 | 8 | 80 | 135.6 | 63.8 | 63.8 Q,KV | 4 | 4096 | 8 | 160 | 166.3 | 64.5 | 64.5 Q,KV | 1 | 16384 | 8 | 40 | 97.5 | 52.3 | 52.3 Q,KV | 1 | 16384 | 8 | 80 | 143.5 | 65.9 | 65.8 Q,KV | 1 | 16384 | 8 | 160 | 178.4 | 65.9 | 65.8 Q,KV | 128 | 128 | 12 | 64 | 26.8 | 48.1 | 30.9 Q,KV | 64 | 128 | 12 | 64 | 28.0 | 38.9 | 25.0 Q,KV | 128 | 384 | 12 | 64 | 97.7 | 61.1 | 61.0 Q,KV | 64 | 384 | 12 | 64 | 89.5 | 57.8 | 57.9 Q,KV | 128 | 512 | 12 | 64 | 111.9 | 66.7 | 66.9 Q,KV | 64 | 512 | 12 | 64 | 107.2 | 64.9 | 64.8 QKV | 32 | 512 | 64 | 32 | 77.2 | 84.7 | 39.3 QKV | 32 | 512 | 128 | 16 | 43.4 | 53.1 | 20.9 QKV | 16 | 1024 | 64 | 32 | 98.8 | 87.4 | 44.6 QKV | 16 | 1024 | 128 | 16 | 52.0 | 54.1 | 23.2 QKV | 8 | 2048 | 64 | 32 | 113.1 | 89.0 | 47.9 QKV | 8 | 2048 | 128 | 16 | 58.2 | 54.6 | 24.5 QKV | 4 | 4096 | 64 | 32 | 120.6 | 89.7 | 49.7 QKV | 4 | 4096 | 128 | 16 | 61.7 | 54.6 | 25.2 QKV | 2 | 8192 | 64 | 32 | 125.9 | 89.5 | 50.7 QKV | 2 | 8192 | 128 | 16 | 63.6 | 54.8 | 25.5 QKV | 1 | 16384 | 64 | 32 | 128.5 | 92.0 | 51.2 QKV | 1 
| 16384 | 128 | 16 | 64.6 | 54.8 | 25.7 QKV | 1 | 4096 | 8 | 40 | 60.2 | **69.8** | 38.1 QKV | 1 | 4096 | 8 | 80 | 101.6 | 75.2 | 56.7 QKV | 1 | 4096 | 8 | 160 | 130.2 | 41.2 | 58.4 QKV | 4 | 4096 | 8 | 40 | 90.6 | **91.0** | 49.5 QKV | 4 | 4096 | 8 | 80 | 133.6 | 98.1 | 62.8 QKV | 4 | 4096 | 8 | 160 | 165.3 | 43.7 | 63.9 QKV | 1 | 16384 | 8 | 40 | 97.2 | 92.8 | 52.1 QKV | 1 | 16384 | 8 | 80 | 143.0 | 103.1 | 65.6 QKV | 1 | 16384 | 8 | 160 | 177.6 | 44.5 | 65.7 QKV | 128 | 128 | 12 | 64 | 31.1 | 65.9 | 27.6 QKV | 64 | 128 | 12 | 64 | 26.1 | 49.8 | 23.5 QKV | 128 | 384 | 12 | 64 | 84.6 | 88.5 | 56.1 QKV | 64 | 384 | 12 | 64 | 79.1 | 80.3 | 53.5 QKV | 128 | 512 | 12 | 64 | 97.3 | 114.2 | 62.2 QKV | 64 | 512 | 12 | 64 | 95.9 | 110.7 | 60.6 QKV | 4 | 2048 | 32 | 128 | 125.26 | 44.72 | 78.15 QKV | 4 | 4096 | 32 | 128 | 141.62 | 46.29 | 85.84 QKV | 8 | 2048 | 32 | 128 | 127.40 | 45.49 | 78.75 QKV | 8 | 4096 | 32 | 128 | 144.24 | 46.60 | 86.95 ### Known Issues NVCC uses huge memory while compiling flash attention CUDA kernel. Linux build with CUDA might fail when machine has limited memory while number of CPUs is large. Walkaround is to use a build machine with larger memory, or use argument like `--nvcc_threads 1` to limit nvcc threads in build. ### Motivation and Context Increases speed and efficiency of MHA or Packed MHA. --------- Co-authored-by: Tianlei Wu <tlwu@microsoft.com> Co-authored-by: tlwu@microsoft.com <tlwu@a100.crj0ad2y1kku1j4yxl4sj10o4e.gx.internal.cloudapp.net>
2023-08-31 20:52:21 +00:00
cmake_args.append("-Donnxruntime_NVCC_THREADS=" + str(nvcc_threads))
if not disable_float8_types and args.cuda_version:
if version_to_tuple(args.cuda_version) < (11, 8):
raise BuildError(
f"Float 8 types require CUDA>=11.8. They must be disabled on CUDA=={args.cuda_version}. "
f"Add '--disable_types float8' to your command line. See option disable_types."
)
cmake_args.append(f"-DCMAKE_CUDA_COMPILER={cuda_home}/bin/nvcc")
2022-06-07 01:37:16 +00:00
if args.use_rocm:
cmake_args.append("-Donnxruntime_ROCM_HOME=" + rocm_home)
cmake_args.append("-Donnxruntime_ROCM_VERSION=" + args.rocm_version)
if args.use_tensorrt:
cmake_args.append("-Donnxruntime_TENSORRT_HOME=" + tensorrt_home)
if args.llvm_config:
cmake_args.append("-Donnxruntime_TVM_USE_LLVM=" + args.llvm_config)
2022-06-07 01:37:16 +00:00
if args.use_cuda:
add_default_definition(cmake_extra_defines, "onnxruntime_USE_CUDA", "ON")
if args.cuda_version:
add_default_definition(cmake_extra_defines, "onnxruntime_CUDA_VERSION", args.cuda_version)
# TODO: this variable is not really needed
add_default_definition(cmake_extra_defines, "onnxruntime_CUDA_HOME", cuda_home)
if cudnn_home:
add_default_definition(cmake_extra_defines, "onnxruntime_CUDNN_HOME", cudnn_home)
if is_windows():
if args.enable_msvc_static_runtime:
add_default_definition(
cmake_extra_defines, "CMAKE_MSVC_RUNTIME_LIBRARY", "MultiThreaded$<$<CONFIG:Debug>:Debug>"
)
add_default_definition(cmake_extra_defines, "ONNX_USE_MSVC_STATIC_RUNTIME", "ON")
add_default_definition(cmake_extra_defines, "protobuf_MSVC_STATIC_RUNTIME", "ON")
# The following build option was added in ABSL 20240722.0 and it must be explicitly set
add_default_definition(cmake_extra_defines, "ABSL_MSVC_STATIC_RUNTIME", "ON")
add_default_definition(cmake_extra_defines, "gtest_force_shared_crt", "OFF")
else:
# CMAKE_MSVC_RUNTIME_LIBRARY defaults to MultiThreaded$<$<CONFIG:Debug>:Debug>DLL
add_default_definition(cmake_extra_defines, "ONNX_USE_MSVC_STATIC_RUNTIME", "OFF")
add_default_definition(cmake_extra_defines, "protobuf_MSVC_STATIC_RUNTIME", "OFF")
add_default_definition(cmake_extra_defines, "ABSL_MSVC_STATIC_RUNTIME", "OFF")
add_default_definition(cmake_extra_defines, "gtest_force_shared_crt", "ON")
Update manylinux build scripts and GPU CUDA version from 11.0 to 11.1 (#7632) 1. Update manylinux build scripts. This will add [PEP600](https://www.python.org/dev/peps/pep-0600/) (manylinux2 tags) support. numpy has adopted this new feature; we should do the same. The old build script files were copied from https://github.com/pypa/manylinux, but they have been deleted and replaced in the upstream repo. The manylinux repo doesn't have a manylinux2014 branch anymore. So I'm removing the obsolete code and syncing the files with the latest master. 2. Update GPU CUDA version from 11.0 to 11.1 (after a discussion with PMs). 3. Delete tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda10_2. (Merged the content to tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11) 4. Modernize the cmake code of how to locate python devel files. It was suggested in https://github.com/onnx/onnx/pull/1631 . 5. Remove `onnxruntime_MSVC_STATIC_RUNTIME` and `onnxruntime_GCC_STATIC_CPP_RUNTIME` build options. Now cmake has builtin support for it. Starting from cmake 3.15, we can use the `CMAKE_MSVC_RUNTIME_LIBRARY` cmake variable to choose which MSVC runtime library we want to use. 6. Update Ubuntu docker images that are used in our CI build from Ubuntu 18.04 to Ubuntu 20.04. 7. Update GCC version in CUDA 11.1 pipelines from 8.x to 9.3.1 8. Split Linux GPU CI pipeline into two jobs: build the code on a CPU machine, then run the tests on a separate GPU machine. In the past we didn't test our python packages. We only tested the pre-packed files. So we didn't catch the rpath issue in CI build. 9. Add a CentOS machine pool and test our Linux GPU build on real CentOS machines. 10. Rework ARM64 Linux GPU python packaging pipeline. Previously it used cross-compiling, so we had to statically link to the C runtime. But now we have the pluggable EP API and it doesn't support static linking. So I changed it to use qemu emulation instead. Now the build is 10x slower than before. But it is more extensible.
2021-06-03 06:36:49 +00:00
if acl_home and os.path.exists(acl_home):
cmake_args += ["-Donnxruntime_ACL_HOME=" + acl_home]
if acl_libs and os.path.exists(acl_libs):
cmake_args += ["-Donnxruntime_ACL_LIBS=" + acl_libs]
if armnn_home and os.path.exists(armnn_home):
cmake_args += ["-Donnxruntime_ARMNN_HOME=" + armnn_home]
if armnn_libs and os.path.exists(armnn_libs):
cmake_args += ["-Donnxruntime_ARMNN_LIBS=" + armnn_libs]
if mpi_home and os.path.exists(mpi_home):
if args.use_mpi:
cmake_args += ["-Donnxruntime_MPI_HOME=" + mpi_home]
else:
log.warning(
"mpi_home is supplied but use_mpi is set to false."
" Build will continue without linking MPI libraries."
)
if nccl_home and os.path.exists(nccl_home):
cmake_args += ["-Donnxruntime_NCCL_HOME=" + nccl_home]
if qnn_home and os.path.exists(qnn_home):
cmake_args += ["-Donnxruntime_QNN_HOME=" + qnn_home]
if snpe_root and os.path.exists(snpe_root):
cmake_args += ["-DSNPE_ROOT=" + snpe_root]
if cann_home and os.path.exists(cann_home):
cmake_args += ["-Donnxruntime_CANN_HOME=" + cann_home]
if args.winml_root_namespace_override:
cmake_args += ["-Donnxruntime_WINML_NAMESPACE_OVERRIDE=" + args.winml_root_namespace_override]
if args.use_openvino:
cmake_args += [
"-Donnxruntime_USE_OPENVINO=ON",
"-Donnxruntime_NPU_NO_FALLBACK=" + ("ON" if args.use_openvino == "NPU_NO_CPU_FALLBACK" else "OFF"),
"-Donnxruntime_USE_OPENVINO_GPU=" + ("ON" if args.use_openvino == "GPU" else "OFF"),
"-Donnxruntime_USE_OPENVINO_CPU=" + ("ON" if args.use_openvino == "CPU" else "OFF"),
"-Donnxruntime_USE_OPENVINO_NPU=" + ("ON" if args.use_openvino == "NPU" else "OFF"),
"-Donnxruntime_USE_OPENVINO_GPU_NP=" + ("ON" if args.use_openvino == "GPU_NO_PARTITION" else "OFF"),
"-Donnxruntime_USE_OPENVINO_CPU_NP=" + ("ON" if args.use_openvino == "CPU_NO_PARTITION" else "OFF"),
"-Donnxruntime_USE_OPENVINO_NPU_NP=" + ("ON" if args.use_openvino == "NPU_NO_PARTITION" else "OFF"),
"-Donnxruntime_USE_OPENVINO_HETERO=" + ("ON" if args.use_openvino.startswith("HETERO") else "OFF"),
"-Donnxruntime_USE_OPENVINO_DEVICE=" + (args.use_openvino),
"-Donnxruntime_USE_OPENVINO_MULTI=" + ("ON" if args.use_openvino.startswith("MULTI") else "OFF"),
"-Donnxruntime_USE_OPENVINO_AUTO=" + ("ON" if args.use_openvino.startswith("AUTO") else "OFF"),
]
2020-03-11 21:25:37 +00:00
# VitisAI and OpenVINO providers currently only support full_protobuf option.
if args.use_full_protobuf or args.use_openvino or args.use_vitisai or args.gen_doc:
cmake_args += ["-Donnxruntime_USE_FULL_PROTOBUF=ON", "-DProtobuf_USE_STATIC_LIBS=ON"]
if args.use_tvm and args.llvm_path is not None:
cmake_args += [f"-DLLVM_DIR={args.llvm_path}"]
2018-11-20 00:48:22 +00:00
if args.use_cuda and not is_windows():
nvml_stub_path = cuda_home + "/lib64/stubs"
cmake_args += ["-DCUDA_CUDA_LIBRARY=" + nvml_stub_path]
if args.use_preinstalled_eigen:
cmake_args += ["-Donnxruntime_USE_PREINSTALLED_EIGEN=ON", "-Deigen_SOURCE_PATH=" + args.eigen_path]
2018-11-20 00:48:22 +00:00
if args.nnapi_min_api:
cmake_args += ["-Donnxruntime_NNAPI_MIN_API=" + str(args.nnapi_min_api)]
if args.android:
if not args.android_ndk_path:
raise BuildError("android_ndk_path required to build for Android")
if not args.android_sdk_path:
raise BuildError("android_sdk_path required to build for Android")
2020-04-19 03:48:30 +00:00
cmake_args += [
"-DCMAKE_TOOLCHAIN_FILE="
+ os.path.join(args.android_ndk_path, "build", "cmake", "android.toolchain.cmake"),
2020-04-19 03:48:30 +00:00
"-DANDROID_PLATFORM=android-" + str(args.android_api),
"-DANDROID_ABI=" + str(args.android_abi),
"-DANDROID_MIN_SDK=" + str(args.android_api),
2020-04-19 03:48:30 +00:00
]
if args.android_cpp_shared:
cmake_args += ["-DANDROID_STL=c++_shared"]
2022-04-07 22:06:31 +00:00
if args.dml_path:
cmake_args += [
"-Donnxruntime_USE_CUSTOM_DIRECTML=ON",
"-Ddml_INCLUDE_DIR=" + os.path.join(args.dml_path, "include"),
"-Ddml_LIB_DIR=" + os.path.join(args.dml_path, "lib"),
]
if args.dml_external_project:
cmake_args += [
"-Donnxruntime_USE_CUSTOM_DIRECTML=ON",
"-Ddml_EXTERNAL_PROJECT=ON",
]
2022-04-07 22:06:31 +00:00
if args.use_gdk:
cmake_args += [
"-DCMAKE_TOOLCHAIN_FILE=" + os.path.join(source_dir, "cmake", "gdk_toolchain.cmake"),
2022-04-07 22:06:31 +00:00
"-DGDK_EDITION=" + args.gdk_edition,
"-DGDK_PLATFORM=" + args.gdk_platform,
"-Donnxruntime_BUILD_UNIT_TESTS=OFF", # gtest doesn't build for GDK
2022-04-07 22:06:31 +00:00
]
if args.use_dml and not (args.dml_path or args.dml_external_project):
raise BuildError("You must set dml_path or dml_external_project when building with the GDK.")
2022-04-07 22:06:31 +00:00
if is_macOS() and not args.android:
cmake_args += ["-DCMAKE_OSX_ARCHITECTURES=" + args.osx_arch]
2021-01-27 18:43:17 +00:00
if args.apple_deploy_target:
cmake_args += ["-DCMAKE_OSX_DEPLOYMENT_TARGET=" + args.apple_deploy_target]
# Code sign the binaries, if the code signing development identity and/or team id are provided
if args.xcode_code_signing_identity:
cmake_args += ["-DCMAKE_XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY=" + args.xcode_code_signing_identity]
if args.xcode_code_signing_team_id:
cmake_args += ["-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=" + args.xcode_code_signing_team_id]
2021-01-27 18:43:17 +00:00
if args.use_qnn:
if args.qnn_home is None or os.path.exists(args.qnn_home) is False:
raise BuildError("qnn_home=" + qnn_home + " not valid." + " qnn_home paths must be specified and valid.")
cmake_args += ["-Donnxruntime_USE_QNN=ON"]
2021-01-27 18:43:17 +00:00
if args.use_coreml:
cmake_args += ["-Donnxruntime_USE_COREML=ON"]
if args.use_webnn:
if not args.build_wasm:
raise BuildError("WebNN is only available for WebAssembly build.")
cmake_args += ["-Donnxruntime_USE_WEBNN=ON"]
if args.use_jsep and args.use_webgpu:
raise BuildError("JSEP (--use_jsep) and WebGPU (--use_webgpu) cannot be enabled at the same time.")
Add implementation of WebGPU EP (#22591) ### Description This PR adds the actual implementation of the WebGPU EP based on https://github.com/microsoft/onnxruntime/pull/22318. This change includes the following: <details> <summary><b>core framework of WebGPU EP</b></summary> - WebGPU EP factory classes for: - handling WebGPU options - creating WebGPU EP instance - creating WebGPU context - WebGPU Execution Provider classes - GPU Buffer allocator - data transfer - Buffer management classes - Buffer Manager - BufferCacheManager - DisabledCacheManager - SimpleCacheManager - LazyReleaseCacheManager - BucketCacheManager - Program classes - Program (base) - Program Cache Key - Program Manager - Shader helper classes - Shader Helper - ShaderIndicesHelper - ShaderVariableHelper - Utils - GPU Query based profiler - compute context - string utils - Miscs - Python binding webgpu support (basic) </details> <details> <summary><b>Kernel implementation</b></summary> - onnx.ai (default opset): - Elementwise (math): Abs, Neg, Floor, Ceil, Reciprocal, Sqrt, Exp, Erf, Log, Sin, Cos, Tan, Asin, Acos, Atan, Sinh, Cosh, Asinh, Acosh, Atanh, Tanh, Not, Cast - Elementwise (activation): Sigmoid, HardSigmoid, Clip, Elu, Relu, LeakyRelu, ThresholdedRelu, Gelu - Binary (math): Add, Sub, Mul, Div, Pow, Equal, Greater, GreaterOrEqual, Less, LessOrEqual - (Tensors): Shape, Reshape, Squeeze, Unsqueeze - Where - Transpose - Concat - Expand - Gather - Tile - Range - LayerNormalization - com.microsoft - FastGelu - MatMulNBits - MultiHeadAttention - RotaryEmbedding - SkipLayerNormalization - LayerNormalization - SimplifiedLayerNormalization - SkipSimplifiedLayerNormalization </details> <details> <summary><b>Build, test and CI pipeline integration</b></summary> - build works for Windows, macOS and iOS - support onnxruntime_test_all and python node test - added a new unit test for `--use_external_dawn` build flag. 
- updated MacOS pipeline to build with WebGPU support - added a new pipeline for WebGPU Windows </details> This change does not include: - Node.js binding support for WebGPU (will be a separate PR)
2024-10-30 01:29:40 +00:00
if args.use_external_dawn and not args.use_webgpu:
raise BuildError("External Dawn (--use_external_dawn) must be enabled with WebGPU (--use_webgpu).")
if args.use_snpe:
cmake_args += ["-Donnxruntime_USE_SNPE=ON"]
if args.macos or args.ios or args.visionos:
# Note: the Xcode CMake generator doesn't have good support for Mac Catalyst yet.
if args.macos == "Catalyst" and args.cmake_generator == "Xcode":
raise BuildError("Xcode CMake generator ('--cmake_generator Xcode') doesn't support Mac Catalyst build.")
if (args.ios or args.visionos or args.macos == "MacOSX") and not args.cmake_generator == "Xcode":
raise BuildError(
"iOS/MacOS framework build requires use of the Xcode CMake generator ('--cmake_generator Xcode')."
)
2021-11-18 19:31:13 +00:00
needed_args = [
args.apple_sysroot,
2021-11-18 19:31:13 +00:00
args.apple_deploy_target,
]
arg_names = [
"--apple_sysroot " + "<the location or name of the macOS platform SDK>",
"--apple_deploy_target " + "<the minimum version of the target platform>",
2021-11-18 19:31:13 +00:00
]
if not all(needed_args):
raise BuildError(
"iOS/MacOS framework build on MacOS canceled due to missing arguments: "
+ ", ".join(val for val, cond in zip(arg_names, needed_args) if not cond)
)
# note: this value is mainly used in framework_info.json file to specify the build osx type
platform_name = "macabi" if args.macos == "Catalyst" else args.apple_sysroot
2021-11-18 19:31:13 +00:00
cmake_args += [
"-Donnxruntime_BUILD_SHARED_LIB=ON",
"-DCMAKE_OSX_SYSROOT=" + args.apple_sysroot,
2021-11-18 19:31:13 +00:00
"-DCMAKE_OSX_DEPLOYMENT_TARGET=" + args.apple_deploy_target,
# we do not need protoc binary for ios cross build
"-Dprotobuf_BUILD_PROTOC_BINARIES=OFF",
"-DPLATFORM_NAME=" + platform_name,
2021-11-18 19:31:13 +00:00
]
if args.ios:
cmake_args += [
"-DCMAKE_SYSTEM_NAME=iOS",
"-DCMAKE_TOOLCHAIN_FILE="
+ (args.ios_toolchain_file if args.ios_toolchain_file else "../cmake/onnxruntime_ios.toolchain.cmake"),
]
# for catalyst build, we need to manually specify cflags for target e.g. x86_64-apple-ios14.0-macabi, etc.
# https://forums.developer.apple.com/forums/thread/122571
if args.macos == "Catalyst":
macabi_target = f"{args.osx_arch}-apple-ios{args.apple_deploy_target}-macabi"
cmake_args += [
"-DCMAKE_CXX_COMPILER_TARGET=" + macabi_target,
"-DCMAKE_C_COMPILER_TARGET=" + macabi_target,
"-DCMAKE_CC_COMPILER_TARGET=" + macabi_target,
f"-DCMAKE_CXX_FLAGS=--target={macabi_target}",
f"-DCMAKE_CXX_FLAGS_RELEASE=-O3 -DNDEBUG --target={macabi_target}",
f"-DCMAKE_C_FLAGS=--target={macabi_target}",
f"-DCMAKE_C_FLAGS_RELEASE=-O3 -DNDEBUG --target={macabi_target}",
f"-DCMAKE_CC_FLAGS=--target={macabi_target}",
f"-DCMAKE_CC_FLAGS_RELEASE=-O3 -DNDEBUG --target={macabi_target}",
]
if args.visionos:
cmake_args += [
"-DCMAKE_SYSTEM_NAME=visionOS",
"-DCMAKE_TOOLCHAIN_FILE="
+ (
args.visionos_toolchain_file
if args.visionos_toolchain_file
else "../cmake/onnxruntime_visionos.toolchain.cmake"
),
"-Donnxruntime_ENABLE_CPUINFO=OFF",
]
build ONNXRuntime into WebAssembly (#6478) * Simplified version of WebAssembly support to keep most of existing data structures and add cmake using Ninja and emcmake * Clean up CMakeLists.txt and add an example to create and compute a kernel * Load a model from bytes and remove graph building steps * Add all cpu and contrib ops with mlas library * WebAssembly build with Onnxruntime C/CXX API * Use protobuf cmakefile directory instead of adding every necessary source file * Fix invalid output at example * add missing files * Change an example to use Teams model and support ort mobile format * add API for javascript * fix input releasing in _ort_run() * update API * Let onnxruntime cmake build WebAssembly with option '--wasm' * allow one-step building for wasm * Make build script working on Linux and MacOS * Fix broken build from Windows command * Enable unit test on building WebAssembly * Resolve comments * update build flags * wasm conv improvement from: 1) GemmV; 2) Depthwise direct convolution 3x3; 3) Direct convolution 3x3 * Cleaned mlas unittest. * use glob * update comments * Update baseline due to loss scale fix (#6948) * fix stream sync issue (#6954) * Enable type reduction in EyeLike, Mod, random.cc CPU kernels. (#6960) * Update EyeLike CPU kernel. * Update Mod CPU kernel. * Update Multinomial CPU kernel. * Slight improvement to Pad CPU kernel binary size. * Update RandomNormal[Like], RandomUniform[Like] CPU kernels. * Fix warning from setting multiple MSVC warning level options. (#6917) Fix warning from setting multiple MSVC warning level options. Replace an existing /Wn flag instead of always appending a new one. * MLAS: quantized GEMM update (#6916) Various updates to the int8_t GEMMs: 1) Add ARM64 udot kernel to take advantage of dot product instructions available in newer cores. Some models run 4x faster than the stock implementation we used before. 
2) Refactor the x64 kernels to share common code for AVX2(u8u8/u8s8/avxvnni) vs AVX512(u8u8/u8s8/avx512vnni) to reduce binary size. 3) Extend kernels to support per-column zero points for matrix B. This is not currently wired to an operator. * Implement QLinearAveragePool with unit tests. (#6896) Implement QLinearAveragePool with unit tests. * Attention fusion detect num_heads and hidden_size automatically (#6920) * fixed type to experimental session constructor (#6950) * fixed type to experimental session constructor Co-authored-by: David Medine <david.medine@brainproducts.com> * Update onnxruntime_perf_test.exe to accept free dimension overrides (#6962) Co-authored-by: Ori Levari <orlevari@microsoft.com> * Fix possible fd leak in NNAPI (#6966) * Release buffers for prepacked tensors (#6820) Unsolved problems: 1. One test failure was caused by a bug in Cudnn rnn kernels, when they can allocate a buffer and partially initialize it, the garbage data near tail of the buffer caused problem in some of the hardware. To attack this problem in a broader sense, should we add code in our allocators, and during a memory fuzzing test, fill an allocated buffer with garbage before returning to the caller? 2. Prepacking is used more widely than we know. For instance, Cudnn rnn kernels also cache their weights. They mix several weight tensors together into a single buffer, and never touch the original weight tensor anymore. This is the same idea with pre-pack, but they didn't override the virtual function, and they never tried to release those weight tensors, leading to memory waste. It also seems to me that there are some other kernels have similar behavior. Wonder how much memory we can save if we try to cleanup those too. 3. Turning off memory pattern planning does increase memory fragmentation, leading to out of memory error in some training test cases. 
Perhaps we can revisit the idea of pushing kernels-creation stage earlier, and then during initializer deserialization, we only avoid tracing those that will be prepacked. * Enable type reduction for Range, ReverseSequence, ScatterND, Split, and Unique CPU kernels. (#6963) * add CI * fix test in ci * fix flags for nsync in wasm build * add copyright banner * fix wasm source glob * add missing exports * resolve comments * Perf gain by make packb wide to 4 from 16 on GEMM for WASM. Remove no need direct conv in previous perf tuning. * fix buildbreak introduced from latest master merge * fix buildbreak in mlasi.h * resolve all comments except MLAS * rewrite packb related 3 functions for WASM_SCALAR seperately rather than using #ifdef in each. and other changes according to PR feedback in mlas. * More complete scalar path in sgemm from Tracy. * Fix edge case handling in depthwise conv2d kernel 3x3. where: *) support input W==1 and H==1 *) recalc in accurate pad_right and pad_bottom *) support hidden pad_right == 2 or pad_bottom == 2 when W == 1 or H==1 and no pad left/top * Add more test coverage for conv depthwise from Tracy. Fix one typo according to PR. * resolve comments * replace typedef by using * do not use throw in OrtRun() * output error message Co-authored-by: Sunghoon <35605090+hanbitmyths@users.noreply.github.com> Co-authored-by: Lei Zhang <zhang.huanning@hotmail.com> Co-authored-by: Wei-Sheng Chin <wschin@outlook.com> Co-authored-by: Tianlei Wu <tlwu@microsoft.com> Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> Co-authored-by: Tracy Sharpe <42477615+tracysh@users.noreply.github.com> Co-authored-by: David Medine <david.eric.medine@gmail.com> Co-authored-by: David Medine <david.medine@brainproducts.com> Co-authored-by: Ori Levari <ori.levari@microsoft.com> Co-authored-by: Ori Levari <orlevari@microsoft.com> Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com> Co-authored-by: Chen Fu <chenfucs@gmail.com>
2021-04-06 23:18:10 +00:00
if args.build_wasm:
emsdk_dir = os.path.join(cmake_dir, "external", "emsdk")
emscripten_cmake_toolchain_file = os.path.join(
emsdk_dir, "upstream", "emscripten", "cmake", "Modules", "Platform", "Emscripten.cmake"
)
cmake_args += ["-DCMAKE_TOOLCHAIN_FILE=" + emscripten_cmake_toolchain_file]
build ONNXRuntime into WebAssembly (#6478) * Simplified version of WebAssembly support to keep most of existing data structures and add cmake using Ninja and emcmake * Clean up CMakeLists.txt and add an example to create and compute a kernel * Load a model from bytes and remove graph building steps * Add all cpu and contrib ops with mlas library * WebAssembly build with Onnxruntime C/CXX API * Use protobuf cmakefile directory instead of adding every necessary source file * Fix invalid output at example * add missing files * Change an example to use Teams model and support ort mobile format * add API for javascript * fix input releasing in _ort_run() * update API * Let onnxruntime cmake build WebAssembly with option '--wasm' * allow one-step building for wasm * Make build script working on Linux and MacOS * Fix broken build from Windows command * Enable unit test on building WebAssembly * Resolve comments * update build flags * wasm conv improvement from: 1) GemmV; 2) Depthwise direct convolution 3x3; 3) Direct convolution 3x3 * Cleaned mlas unittest. * use glob * update comments * Update baseline due to loss scale fix (#6948) * fix stream sync issue (#6954) * Enable type reduction in EyeLike, Mod, random.cc CPU kernels. (#6960) * Update EyeLike CPU kernel. * Update Mod CPU kernel. * Update Multinomial CPU kernel. * Slight improvement to Pad CPU kernel binary size. * Update RandomNormal[Like], RandomUniform[Like] CPU kernels. * Fix warning from setting multiple MSVC warning level options. (#6917) Fix warning from setting multiple MSVC warning level options. Replace an existing /Wn flag instead of always appending a new one. * MLAS: quantized GEMM update (#6916) Various updates to the int8_t GEMMs: 1) Add ARM64 udot kernel to take advantage of dot product instructions available in newer cores. Some models run 4x faster than the stock implementation we used before. 
2) Refactor the x64 kernels to share common code for AVX2(u8u8/u8s8/avxvnni) vs AVX512(u8u8/u8s8/avx512vnni) to reduce binary size. 3) Extend kernels to support per-column zero points for matrix B. This is not currently wired to an operator. * Implement QLinearAveragePool with unit tests. (#6896) Implement QLinearAveragePool with unit tests. * Attention fusion detect num_heads and hidden_size automatically (#6920) * fixed type to experimental session constructor (#6950) * fixed type to experimental session constructor Co-authored-by: David Medine <david.medine@brainproducts.com> * Update onnxruntime_perf_test.exe to accept free dimension overrides (#6962) Co-authored-by: Ori Levari <orlevari@microsoft.com> * Fix possible fd leak in NNAPI (#6966) * Release buffers for prepacked tensors (#6820) Unsolved problems: 1. One test failure was caused by a bug in Cudnn rnn kernels, when they can allocate a buffer and partially initialize it, the garbage data near tail of the buffer caused problem in some of the hardware. To attack this problem in a broader sense, should we add code in our allocators, and during a memory fuzzing test, fill an allocated buffer with garbage before returning to the caller? 2. Prepacking is used more widely than we know. For instance, Cudnn rnn kernels also cache their weights. They mix several weight tensors together into a single buffer, and never touch the original weight tensor anymore. This is the same idea with pre-pack, but they didn't override the virtual function, and they never tried to release those weight tensors, leading to memory waste. It also seems to me that there are some other kernels have similar behavior. Wonder how much memory we can save if we try to cleanup those too. 3. Turning off memory pattern planning does increase memory fragmentation, leading to out of memory error in some training test cases. 
Perhaps we can revisit the idea of pushing kernels-creation stage earlier, and then during initializer deserialization, we only avoid tracing those that will be prepacked. * Enable type reduction for Range, ReverseSequence, ScatterND, Split, and Unique CPU kernels. (#6963) * add CI * fix test in ci * fix flags for nsync in wasm build * add copyright banner * fix wasm source glob * add missing exports * resolve comments * Perf gain by make packb wide to 4 from 16 on GEMM for WASM. Remove no need direct conv in previous perf tuning. * fix buildbreak introduced from latest master merge * fix buildbreak in mlasi.h * resolve all comments except MLAS * rewrite packb related 3 functions for WASM_SCALAR seperately rather than using #ifdef in each. and other changes according to PR feedback in mlas. * More complete scalar path in sgemm from Tracy. * Fix edge case handling in depthwise conv2d kernel 3x3. where: *) support input W==1 and H==1 *) recalc in accurate pad_right and pad_bottom *) support hidden pad_right == 2 or pad_bottom == 2 when W == 1 or H==1 and no pad left/top * Add more test coverage for conv depthwise from Tracy. Fix one typo according to PR. * resolve comments * replace typedef by using * do not use throw in OrtRun() * output error message Co-authored-by: Sunghoon <35605090+hanbitmyths@users.noreply.github.com> Co-authored-by: Lei Zhang <zhang.huanning@hotmail.com> Co-authored-by: Wei-Sheng Chin <wschin@outlook.com> Co-authored-by: Tianlei Wu <tlwu@microsoft.com> Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> Co-authored-by: Tracy Sharpe <42477615+tracysh@users.noreply.github.com> Co-authored-by: David Medine <david.eric.medine@gmail.com> Co-authored-by: David Medine <david.medine@brainproducts.com> Co-authored-by: Ori Levari <ori.levari@microsoft.com> Co-authored-by: Ori Levari <orlevari@microsoft.com> Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com> Co-authored-by: Chen Fu <chenfucs@gmail.com>
2021-04-06 23:18:10 +00:00
if args.disable_wasm_exception_catching:
# WebAssembly unit tests require exception catching to work. If this feature is disabled, we do not build
# the unit tests.
cmake_args += [
"-Donnxruntime_BUILD_UNIT_TESTS=OFF",
]
# add default emscripten settings
emscripten_settings = normalize_arg_list(args.emscripten_settings)
# set -s MALLOC
if args.wasm_malloc is not None:
add_default_definition(emscripten_settings, "MALLOC", args.wasm_malloc)
add_default_definition(emscripten_settings, "MALLOC", "dlmalloc")
# set -s STACK_SIZE=5242880
add_default_definition(emscripten_settings, "STACK_SIZE", "5242880")
if emscripten_settings:
cmake_args += [f"-Donnxruntime_EMSCRIPTEN_SETTINGS={';'.join(emscripten_settings)}"]
# Append onnxruntime-extensions cmake options
if args.use_extensions:
cmake_args += ["-Donnxruntime_USE_EXTENSIONS=ON"]
# default path of onnxruntime-extensions: the "_deps/extensions-src" directory under each config's build dir (FetchContent)
for config in configs:
onnxruntime_extensions_path = os.path.join(build_dir, config, "_deps", "extensions-src")
onnxruntime_extensions_path = os.path.abspath(onnxruntime_extensions_path)
if args.extensions_overridden_path and os.path.exists(args.extensions_overridden_path):
# use absolute path here because onnxruntime-extensions is outside onnxruntime
onnxruntime_extensions_path = os.path.abspath(args.extensions_overridden_path)
cmake_args += ["-Donnxruntime_EXTENSIONS_OVERRIDDEN=ON"]
print("[onnxruntime-extensions] Loading onnxruntime-extensions from: ", onnxruntime_extensions_path)
else:
print("[onnxruntime-extensions] Loading onnxruntime-extensions from: FetchContent")
cmake_args += ["-Donnxruntime_EXTENSIONS_PATH=" + onnxruntime_extensions_path]
if is_reduced_ops_build(args):
operators_config_file = os.path.abspath(args.include_ops_by_config)
cmake_tool_dir = os.path.join(onnxruntime_extensions_path, "tools")
# generate _selectedoplist.cmake by operators config file
run_subprocess([sys.executable, "gen_selectedops.py", operators_config_file], cwd=cmake_tool_dir)
if path_to_protoc_exe:
cmake_args += [f"-DONNX_CUSTOM_PROTOC_EXECUTABLE={path_to_protoc_exe}"]
if args.fuzz_testing:
if not (
args.build_shared_lib
and is_windows()
and args.cmake_generator == "Visual Studio 17 2022"
and args.use_full_protobuf
):
raise BuildError("Fuzz test has only be tested with build shared libs option using MSVC on windows")
cmake_args += [
"-Donnxruntime_BUILD_UNIT_TESTS=ON",
"-Donnxruntime_FUZZ_TEST=ON",
"-Donnxruntime_USE_FULL_PROTOBUF=ON",
]
Refactor web-ci pipeline and delete eager mode CI pipeline (#15416) ### Description 1. Move it to a separated pool that use the same image as [the public hosted pool](https://learn.microsoft.com/en-us/azure/devops/pipelines/agents/hosted?view=azure-devops&tabs=yaml). Also, create a beta pool which contains the next version image of the hosted pool, and add jobs in our post merge pipeline to test if the next version image will break our CI. So, usually we will have at least one week to prepare. 2. Change the cmake generator in use in our pipelines from "Ninja" to "MingW Makefile", because the latest version of cmake doesn't work with the latest version of Ninja. People who prefer Ninja could still use ninja in their local build by passing "--cmake_generator ninja" to [build.py](https://github.com/microsoft/onnxruntime/blob/main/tools/ci_build/build.py). 3. Delete eager mode CI pipeline. ### Motivation and Context I need to update the software we have in our CI build machines, and I need to resolve this incompatibility issue. In more detail, the build error I hit was: em++: error: CMakeFilesonnxruntime_mlas_test.dirC_a_work1sonnxruntimetestmlasunittesttest_activation.cpp.o: No such file or directory ("CMakeFilesonnxruntime_mlas_test.dirC_a_work1sonnxruntimetestmlasunittesttest_activation.cpp.o" was expected to be an input file, based on the commandline arguments provided) After this PR we will deprecate python 3.7 support. The eager mode CI pipeline is the last one that still use python 3.7. Then we can rework the PR #10953 made by [fs-eire](https://github.com/fs-eire) last year. Fixed [AB#14435](https://aiinfra.visualstudio.com/6a833879-cd9b-44a4-a9de-adc2d818f13c/_workitems/edit/14435)
2023-04-10 17:41:04 +00:00
if args.enable_lazy_tensor:
import torch
cmake_args += [f"-Donnxruntime_PREBUILT_PYTORCH_PATH={os.path.dirname(torch.__file__)}"]
cmake_args += ["-D_GLIBCXX_USE_CXX11_ABI=" + str(int(torch._C._GLIBCXX_USE_CXX11_ABI))]
if args.use_azure:
add_default_definition(cmake_extra_defines, "onnxruntime_USE_AZURE", "ON")
if args.use_lock_free_queue:
add_default_definition(cmake_extra_defines, "onnxruntime_USE_LOCK_FREE_QUEUE", "ON")
Enable Address Sanitizer in CI (#19073) ### Description 1. Add two build jobs for enabling Address Sanitizer in CI. One for Windows CPU, One for Linux CPU. 2. Set default compiler flags/linker flags in build.py for normal Windows/Linux/MacOS build. This can help control compiler flags in a more centralized way. 3. All Windows binaries in our official packages will be built with "/PROFILE" flag. Symbols of onnxruntime.dll can be found at [Microsoft public symbol server](https://learn.microsoft.com/en-us/windows-hardware/drivers/debugger/microsoft-public-symbols). Limitations: 1. On Linux Address Sanitizer ignores RPATH settings in ELF binaries. Therefore once Address Sanitizer is enabled, before running tests we need to manually set LD_LIBRARY_PATH properly otherwise libonnxruntime.so may not be able to find custom ops and shared EPs. 4. On Linux we also need to set LD_PRELOAD before running some tests(if the main executable, like python, is not built with address sanitizer. On Windows we do not need to. 5. On Windows before running python tests we should manually copy address sanitizer DLL to the onnxruntime/capi directory, because python 3.8 and above has enabled "Safe DLL Search Mode" that wouldn't use the information provided by PATH env. 6. On Linux Address Sanitizer found a lot of memory leaks from our python binding code. Therefore right now we cannot enable Address Sanitizer when building ONNX Runtime with python binding. 7. Address Sanitizer itself uses a lot of memory address space and delays memory deallocations, which is easy to cause OOM issues in 32-bit applications. We cannot run all the tests in onnxruntime_test_all in 32-bit mode with Address Sanitizer due to this reason. However, we still can run individual tests in such a way. We just cannot run all of them in one process. ### Motivation and Context To catch memory issues.
2024-01-12 15:24:40 +00:00
if is_windows():
if args.use_cache:
add_default_definition(
cmake_extra_defines, "CMAKE_MSVC_DEBUG_INFORMATION_FORMAT", "$<$<CONFIG:Debug,RelWithDebInfo>:Embedded>"
)
else:
# Always enable debug info even in release build. The debug information is in separate *.pdb files that
# can be easily discarded when debug symbols are not needed. We enable it by default because many auditing
# tools need to use the symbols.
add_default_definition(cmake_extra_defines, "CMAKE_MSVC_DEBUG_INFORMATION_FORMAT", "ProgramDatabase")
if number_of_parallel_jobs(args) > 0:
# https://devblogs.microsoft.com/cppblog/improved-parallelism-in-msbuild/
# NOTE: this disables /MP if set (according to comments on blog post).
# By default, MultiProcMaxCount and CL_MPCount value are equal to the number of CPU logical processors.
# See logic around setting CL_MPCount below
cmake_args += ["-DCMAKE_VS_GLOBALS=UseMultiToolTask=true;EnforceProcessCountAcrossBuilds=true"]
Adopt linrtunner as the linting tool - take 2 (#15085) ### Description `lintrunner` is a linter runner successfully used by pytorch, onnx and onnx-script. It provides a uniform experience running linters locally and in CI. It supports all major dev systems: Windows, Linux and MacOs. The checks are enforced by the `Python format` workflow. This PR adopts `lintrunner` to onnxruntime and fixed ~2000 flake8 errors in Python code. `lintrunner` now runs all required python lints including `ruff`(replacing `flake8`), `black` and `isort`. Future lints like `clang-format` can be added. Most errors are auto-fixed by `ruff` and the fixes should be considered robust. Lints that are more complicated to fix are applied `# noqa` for now and should be fixed in follow up PRs. ### Notable changes 1. This PR **removed some suboptimal patterns**: - `not xxx in` -> `xxx not in` membership checks - bare excepts (`except:` -> `except Exception`) - unused imports The follow up PR will remove: - `import *` - mutable values as default in function definitions (`def func(a=[])`) - more unused imports - unused local variables 2. Use `ruff` to replace `flake8`. `ruff` is much (40x) faster than flake8 and is more robust. We are using it successfully in onnx and onnx-script. It also supports auto-fixing many flake8 errors. 3. Removed the legacy flake8 ci flow and updated docs. 4. The added workflow supports SARIF code scanning reports on github, example snapshot: ![image](https://user-images.githubusercontent.com/11205048/212598953-d60ce8a9-f242-4fa8-8674-8696b704604a.png) 5. Removed `onnxruntime-python-checks-ci-pipeline` as redundant ### Motivation and Context <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. --> Unified linting experience in CI and local. Replacing https://github.com/microsoft/onnxruntime/pull/14306 --------- Signed-off-by: Justin Chu <justinchu@microsoft.com>
2023-03-24 22:29:03 +00:00
cmake_args += [f"-D{define}" for define in cmake_extra_defines]
2018-11-20 00:48:22 +00:00
cmake_args += cmake_extra_args
2018-11-20 00:48:22 +00:00
2020-04-19 03:48:30 +00:00
# ADO pipelines will store the pipeline build number
# (e.g. 191101-2300.1.master) and source version in environment
# variables. If present, use these values to define the
# WinML/ORT DLL versions.
build_number = os.getenv("Build_BuildNumber") # noqa: SIM112
source_version = os.getenv("Build_SourceVersion") # noqa: SIM112
if build_number and source_version:
build_matches = re.fullmatch(r"(\d\d)(\d\d)(\d\d)(\d\d)\.(\d+)", build_number)
if build_matches:
Adopt linrtunner as the linting tool - take 2 (#15085) ### Description `lintrunner` is a linter runner successfully used by pytorch, onnx and onnx-script. It provides a uniform experience running linters locally and in CI. It supports all major dev systems: Windows, Linux and MacOs. The checks are enforced by the `Python format` workflow. This PR adopts `lintrunner` to onnxruntime and fixed ~2000 flake8 errors in Python code. `lintrunner` now runs all required python lints including `ruff`(replacing `flake8`), `black` and `isort`. Future lints like `clang-format` can be added. Most errors are auto-fixed by `ruff` and the fixes should be considered robust. Lints that are more complicated to fix are applied `# noqa` for now and should be fixed in follow up PRs. ### Notable changes 1. This PR **removed some suboptimal patterns**: - `not xxx in` -> `xxx not in` membership checks - bare excepts (`except:` -> `except Exception`) - unused imports The follow up PR will remove: - `import *` - mutable values as default in function definitions (`def func(a=[])`) - more unused imports - unused local variables 2. Use `ruff` to replace `flake8`. `ruff` is much (40x) faster than flake8 and is more robust. We are using it successfully in onnx and onnx-script. It also supports auto-fixing many flake8 errors. 3. Removed the legacy flake8 ci flow and updated docs. 4. The added workflow supports SARIF code scanning reports on github, example snapshot: ![image](https://user-images.githubusercontent.com/11205048/212598953-d60ce8a9-f242-4fa8-8674-8696b704604a.png) 5. Removed `onnxruntime-python-checks-ci-pipeline` as redundant ### Motivation and Context <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. --> Unified linting experience in CI and local. Replacing https://github.com/microsoft/onnxruntime/pull/14306 --------- Signed-off-by: Justin Chu <justinchu@microsoft.com>
2023-03-24 22:29:03 +00:00
YY = build_matches.group(2) # noqa: N806
MM = build_matches.group(3) # noqa: N806
DD = build_matches.group(4) # noqa: N806
# Get ORT major and minor number
with open(os.path.join(source_dir, "VERSION_NUMBER")) as f:
first_line = f.readline()
ort_version_matches = re.match(r"(\d+).(\d+)", first_line)
if not ort_version_matches:
raise BuildError("Couldn't read version from VERSION_FILE")
ort_major = ort_version_matches.group(1)
ort_minor = ort_version_matches.group(2)
2020-04-19 03:48:30 +00:00
# Example (BuildNumber: 20191101.1, VERSION_NUMBER starting with "1.0",
# SourceVersion: 0bce7ae6755c792eda558e5d27ded701707dc404)
# MajorPart = 1
# MinorPart = 0
# BuildPart = 19 (two-digit year)
# PrivatePart = 1101 (month and day)
# String = 1.0.20191101.1.0bce7ae
2020-04-19 03:48:30 +00:00
cmake_args += [
Adopt linrtunner as the linting tool - take 2 (#15085) ### Description `lintrunner` is a linter runner successfully used by pytorch, onnx and onnx-script. It provides a uniform experience running linters locally and in CI. It supports all major dev systems: Windows, Linux and MacOs. The checks are enforced by the `Python format` workflow. This PR adopts `lintrunner` to onnxruntime and fixed ~2000 flake8 errors in Python code. `lintrunner` now runs all required python lints including `ruff`(replacing `flake8`), `black` and `isort`. Future lints like `clang-format` can be added. Most errors are auto-fixed by `ruff` and the fixes should be considered robust. Lints that are more complicated to fix are applied `# noqa` for now and should be fixed in follow up PRs. ### Notable changes 1. This PR **removed some suboptimal patterns**: - `not xxx in` -> `xxx not in` membership checks - bare excepts (`except:` -> `except Exception`) - unused imports The follow up PR will remove: - `import *` - mutable values as default in function definitions (`def func(a=[])`) - more unused imports - unused local variables 2. Use `ruff` to replace `flake8`. `ruff` is much (40x) faster than flake8 and is more robust. We are using it successfully in onnx and onnx-script. It also supports auto-fixing many flake8 errors. 3. Removed the legacy flake8 ci flow and updated docs. 4. The added workflow supports SARIF code scanning reports on github, example snapshot: ![image](https://user-images.githubusercontent.com/11205048/212598953-d60ce8a9-f242-4fa8-8674-8696b704604a.png) 5. Removed `onnxruntime-python-checks-ci-pipeline` as redundant ### Motivation and Context <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. --> Unified linting experience in CI and local. Replacing https://github.com/microsoft/onnxruntime/pull/14306 --------- Signed-off-by: Justin Chu <justinchu@microsoft.com>
2023-03-24 22:29:03 +00:00
f"-DVERSION_MAJOR_PART={ort_major}",
f"-DVERSION_MINOR_PART={ort_minor}",
f"-DVERSION_BUILD_PART={YY}",
f"-DVERSION_PRIVATE_PART={MM}{DD}",
f"-DVERSION_STRING={ort_major}.{ort_minor}.{build_number}.{source_version[0:7]}",
2020-04-19 03:48:30 +00:00
]
for config in configs:
cflags = []
cxxflags = None
ldflags = None
cudaflags = []
if is_windows() and not args.ios and not args.android and not args.build_wasm:
njobs = number_of_parallel_jobs(args)
if args.use_cuda:
cudaflags.append("-allow-unsupported-compiler")
if njobs > 1:
if args.parallel == 0:
cflags += ["/MP"]
else:
cflags += ["/MP%d" % njobs]
Enable Address Sanitizer in CI (#19073) ### Description 1. Add two build jobs for enabling Address Sanitizer in CI. One for Windows CPU, One for Linux CPU. 2. Set default compiler flags/linker flags in build.py for normal Windows/Linux/MacOS build. This can help control compiler flags in a more centralized way. 3. All Windows binaries in our official packages will be built with "/PROFILE" flag. Symbols of onnxruntime.dll can be found at [Microsoft public symbol server](https://learn.microsoft.com/en-us/windows-hardware/drivers/debugger/microsoft-public-symbols). Limitations: 1. On Linux Address Sanitizer ignores RPATH settings in ELF binaries. Therefore once Address Sanitizer is enabled, before running tests we need to manually set LD_LIBRARY_PATH properly otherwise libonnxruntime.so may not be able to find custom ops and shared EPs. 4. On Linux we also need to set LD_PRELOAD before running some tests(if the main executable, like python, is not built with address sanitizer. On Windows we do not need to. 5. On Windows before running python tests we should manually copy address sanitizer DLL to the onnxruntime/capi directory, because python 3.8 and above has enabled "Safe DLL Search Mode" that wouldn't use the information provided by PATH env. 6. On Linux Address Sanitizer found a lot of memory leaks from our python binding code. Therefore right now we cannot enable Address Sanitizer when building ONNX Runtime with python binding. 7. Address Sanitizer itself uses a lot of memory address space and delays memory deallocations, which is easy to cause OOM issues in 32-bit applications. We cannot run all the tests in onnxruntime_test_all in 32-bit mode with Address Sanitizer due to this reason. However, we still can run individual tests in such a way. We just cannot run all of them in one process. ### Motivation and Context To catch memory issues.
2024-01-12 15:24:40 +00:00
# Setup default values for cflags/cxxflags/ldflags.
# The values set here are purely for security and compliance purposes. ONNX Runtime should work fine without these flags.
if (
(args.use_binskim_compliant_compile_flags or args.enable_address_sanitizer)
Enable Address Sanitizer in CI (#19073) ### Description 1. Add two build jobs for enabling Address Sanitizer in CI. One for Windows CPU, One for Linux CPU. 2. Set default compiler flags/linker flags in build.py for normal Windows/Linux/MacOS build. This can help control compiler flags in a more centralized way. 3. All Windows binaries in our official packages will be built with "/PROFILE" flag. Symbols of onnxruntime.dll can be found at [Microsoft public symbol server](https://learn.microsoft.com/en-us/windows-hardware/drivers/debugger/microsoft-public-symbols). Limitations: 1. On Linux Address Sanitizer ignores RPATH settings in ELF binaries. Therefore once Address Sanitizer is enabled, before running tests we need to manually set LD_LIBRARY_PATH properly otherwise libonnxruntime.so may not be able to find custom ops and shared EPs. 4. On Linux we also need to set LD_PRELOAD before running some tests(if the main executable, like python, is not built with address sanitizer. On Windows we do not need to. 5. On Windows before running python tests we should manually copy address sanitizer DLL to the onnxruntime/capi directory, because python 3.8 and above has enabled "Safe DLL Search Mode" that wouldn't use the information provided by PATH env. 6. On Linux Address Sanitizer found a lot of memory leaks from our python binding code. Therefore right now we cannot enable Address Sanitizer when building ONNX Runtime with python binding. 7. Address Sanitizer itself uses a lot of memory address space and delays memory deallocations, which is easy to cause OOM issues in 32-bit applications. We cannot run all the tests in onnxruntime_test_all in 32-bit mode with Address Sanitizer due to this reason. However, we still can run individual tests in such a way. We just cannot run all of them in one process. ### Motivation and Context To catch memory issues.
2024-01-12 15:24:40 +00:00
and not args.ios
and not args.android
and not args.build_wasm
):
if is_windows():
cflags += ["/guard:cf", "/DWIN32", "/D_WINDOWS"]
Enable Address Sanitizer in CI (#19073) ### Description 1. Add two build jobs for enabling Address Sanitizer in CI. One for Windows CPU, One for Linux CPU. 2. Set default compiler flags/linker flags in build.py for normal Windows/Linux/MacOS build. This can help control compiler flags in a more centralized way. 3. All Windows binaries in our official packages will be built with "/PROFILE" flag. Symbols of onnxruntime.dll can be found at [Microsoft public symbol server](https://learn.microsoft.com/en-us/windows-hardware/drivers/debugger/microsoft-public-symbols). Limitations: 1. On Linux Address Sanitizer ignores RPATH settings in ELF binaries. Therefore once Address Sanitizer is enabled, before running tests we need to manually set LD_LIBRARY_PATH properly otherwise libonnxruntime.so may not be able to find custom ops and shared EPs. 4. On Linux we also need to set LD_PRELOAD before running some tests(if the main executable, like python, is not built with address sanitizer. On Windows we do not need to. 5. On Windows before running python tests we should manually copy address sanitizer DLL to the onnxruntime/capi directory, because python 3.8 and above has enabled "Safe DLL Search Mode" that wouldn't use the information provided by PATH env. 6. On Linux Address Sanitizer found a lot of memory leaks from our python binding code. Therefore right now we cannot enable Address Sanitizer when building ONNX Runtime with python binding. 7. Address Sanitizer itself uses a lot of memory address space and delays memory deallocations, which is easy to cause OOM issues in 32-bit applications. We cannot run all the tests in onnxruntime_test_all in 32-bit mode with Address Sanitizer due to this reason. However, we still can run individual tests in such a way. We just cannot run all of them in one process. ### Motivation and Context To catch memory issues.
2024-01-12 15:24:40 +00:00
if not args.use_gdk:
# Target Windows 10
cflags += [
"/DWINAPI_FAMILY=100",
"/DWINVER=0x0A00",
"/D_WIN32_WINNT=0x0A00",
"/DNTDDI_VERSION=0x0A000000",
]
# The "/profile" flag implies "/DEBUG:FULL /DEBUGTYPE:cv,fixup /OPT:REF /OPT:NOICF /INCREMENTAL:NO /FIXED:NO". We set it for satisfying a Microsoft internal compliance requirement. External users
# do not need to have it.
ldflags = ["/profile", "/DYNAMICBASE"]
# Address Sanitizer libs do not have a Qspectre version, so the two cannot both be enabled.
if not args.enable_address_sanitizer:
# Also enable a special perf patch that was made for Intel Meteor Lake mobile CPUs
cflags += ["/Qspectre", "/DONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH"]
Enable Address Sanitizer in CI (#19073) ### Description 1. Add two build jobs for enabling Address Sanitizer in CI. One for Windows CPU, One for Linux CPU. 2. Set default compiler flags/linker flags in build.py for normal Windows/Linux/MacOS build. This can help control compiler flags in a more centralized way. 3. All Windows binaries in our official packages will be built with "/PROFILE" flag. Symbols of onnxruntime.dll can be found at [Microsoft public symbol server](https://learn.microsoft.com/en-us/windows-hardware/drivers/debugger/microsoft-public-symbols). Limitations: 1. On Linux Address Sanitizer ignores RPATH settings in ELF binaries. Therefore once Address Sanitizer is enabled, before running tests we need to manually set LD_LIBRARY_PATH properly otherwise libonnxruntime.so may not be able to find custom ops and shared EPs. 4. On Linux we also need to set LD_PRELOAD before running some tests(if the main executable, like python, is not built with address sanitizer. On Windows we do not need to. 5. On Windows before running python tests we should manually copy address sanitizer DLL to the onnxruntime/capi directory, because python 3.8 and above has enabled "Safe DLL Search Mode" that wouldn't use the information provided by PATH env. 6. On Linux Address Sanitizer found a lot of memory leaks from our python binding code. Therefore right now we cannot enable Address Sanitizer when building ONNX Runtime with python binding. 7. Address Sanitizer itself uses a lot of memory address space and delays memory deallocations, which is easy to cause OOM issues in 32-bit applications. We cannot run all the tests in onnxruntime_test_all in 32-bit mode with Address Sanitizer due to this reason. However, we still can run individual tests in such a way. We just cannot run all of them in one process. ### Motivation and Context To catch memory issues.
2024-01-12 15:24:40 +00:00
if config == "Release":
cflags += ["/O2", "/Ob2", "/DNDEBUG"]
elif config == "RelWithDebInfo":
cflags += ["/O2", "/Ob1", "/DNDEBUG"]
elif config == "Debug":
cflags += ["/Ob0", "/Od", "/RTC1"]
elif config == "MinSizeRel":
cflags += ["/O1", "/Ob1", "/DNDEBUG"]
if args.enable_address_sanitizer:
cflags += ["/fsanitize=address"]
Enable Address Sanitizer in CI (#19073) ### Description 1. Add two build jobs for enabling Address Sanitizer in CI. One for Windows CPU, One for Linux CPU. 2. Set default compiler flags/linker flags in build.py for normal Windows/Linux/MacOS build. This can help control compiler flags in a more centralized way. 3. All Windows binaries in our official packages will be built with "/PROFILE" flag. Symbols of onnxruntime.dll can be found at [Microsoft public symbol server](https://learn.microsoft.com/en-us/windows-hardware/drivers/debugger/microsoft-public-symbols). Limitations: 1. On Linux Address Sanitizer ignores RPATH settings in ELF binaries. Therefore once Address Sanitizer is enabled, before running tests we need to manually set LD_LIBRARY_PATH properly otherwise libonnxruntime.so may not be able to find custom ops and shared EPs. 4. On Linux we also need to set LD_PRELOAD before running some tests(if the main executable, like python, is not built with address sanitizer. On Windows we do not need to. 5. On Windows before running python tests we should manually copy address sanitizer DLL to the onnxruntime/capi directory, because python 3.8 and above has enabled "Safe DLL Search Mode" that wouldn't use the information provided by PATH env. 6. On Linux Address Sanitizer found a lot of memory leaks from our python binding code. Therefore right now we cannot enable Address Sanitizer when building ONNX Runtime with python binding. 7. Address Sanitizer itself uses a lot of memory address space and delays memory deallocations, which is easy to cause OOM issues in 32-bit applications. We cannot run all the tests in onnxruntime_test_all in 32-bit mode with Address Sanitizer due to this reason. However, we still can run individual tests in such a way. We just cannot run all of them in one process. ### Motivation and Context To catch memory issues.
2024-01-12 15:24:40 +00:00
cxxflags = cflags.copy()
if args.use_cuda:
# On Windows, nvcc passes /EHsc to the host compiler by default.
cuda_compile_flags_str = ""
for compile_flag in cflags:
if compile_flag.startswith("/D"):
cudaflags.append(compile_flag)
else:
cuda_compile_flags_str = cuda_compile_flags_str + " " + compile_flag
if len(cuda_compile_flags_str) != 0:
cudaflags.append(f'-Xcompiler="{cuda_compile_flags_str}"')
Enable Address Sanitizer in CI (#19073) ### Description 1. Add two build jobs for enabling Address Sanitizer in CI. One for Windows CPU, One for Linux CPU. 2. Set default compiler flags/linker flags in build.py for normal Windows/Linux/MacOS build. This can help control compiler flags in a more centralized way. 3. All Windows binaries in our official packages will be built with "/PROFILE" flag. Symbols of onnxruntime.dll can be found at [Microsoft public symbol server](https://learn.microsoft.com/en-us/windows-hardware/drivers/debugger/microsoft-public-symbols). Limitations: 1. On Linux Address Sanitizer ignores RPATH settings in ELF binaries. Therefore once Address Sanitizer is enabled, before running tests we need to manually set LD_LIBRARY_PATH properly otherwise libonnxruntime.so may not be able to find custom ops and shared EPs. 4. On Linux we also need to set LD_PRELOAD before running some tests(if the main executable, like python, is not built with address sanitizer. On Windows we do not need to. 5. On Windows before running python tests we should manually copy address sanitizer DLL to the onnxruntime/capi directory, because python 3.8 and above has enabled "Safe DLL Search Mode" that wouldn't use the information provided by PATH env. 6. On Linux Address Sanitizer found a lot of memory leaks from our python binding code. Therefore right now we cannot enable Address Sanitizer when building ONNX Runtime with python binding. 7. Address Sanitizer itself uses a lot of memory address space and delays memory deallocations, which is easy to cause OOM issues in 32-bit applications. We cannot run all the tests in onnxruntime_test_all in 32-bit mode with Address Sanitizer due to this reason. However, we still can run individual tests in such a way. We just cannot run all of them in one process. ### Motivation and Context To catch memory issues.
2024-01-12 15:24:40 +00:00
elif is_linux() or is_macOS():
if is_linux():
ldflags = ["-Wl,-Bsymbolic-functions", "-Wl,-z,relro", "-Wl,-z,now", "-Wl,-z,noexecstack"]
Enable Address Sanitizer in CI (#19073) ### Description 1. Add two build jobs for enabling Address Sanitizer in CI. One for Windows CPU, One for Linux CPU. 2. Set default compiler flags/linker flags in build.py for normal Windows/Linux/MacOS build. This can help control compiler flags in a more centralized way. 3. All Windows binaries in our official packages will be built with "/PROFILE" flag. Symbols of onnxruntime.dll can be found at [Microsoft public symbol server](https://learn.microsoft.com/en-us/windows-hardware/drivers/debugger/microsoft-public-symbols). Limitations: 1. On Linux Address Sanitizer ignores RPATH settings in ELF binaries. Therefore once Address Sanitizer is enabled, before running tests we need to manually set LD_LIBRARY_PATH properly otherwise libonnxruntime.so may not be able to find custom ops and shared EPs. 4. On Linux we also need to set LD_PRELOAD before running some tests(if the main executable, like python, is not built with address sanitizer. On Windows we do not need to. 5. On Windows before running python tests we should manually copy address sanitizer DLL to the onnxruntime/capi directory, because python 3.8 and above has enabled "Safe DLL Search Mode" that wouldn't use the information provided by PATH env. 6. On Linux Address Sanitizer found a lot of memory leaks from our python binding code. Therefore right now we cannot enable Address Sanitizer when building ONNX Runtime with python binding. 7. Address Sanitizer itself uses a lot of memory address space and delays memory deallocations, which is easy to cause OOM issues in 32-bit applications. We cannot run all the tests in onnxruntime_test_all in 32-bit mode with Address Sanitizer due to this reason. However, we still can run individual tests in such a way. We just cannot run all of them in one process. ### Motivation and Context To catch memory issues.
2024-01-12 15:24:40 +00:00
else:
ldflags = []
if config == "Release":
cflags = [
"-DNDEBUG",
"-Wp,-D_FORTIFY_SOURCE=2",
"-Wp,-D_GLIBCXX_ASSERTIONS",
"-fstack-protector-strong",
"-O3",
"-pipe",
]
if is_linux():
ldflags += ["-Wl,--strip-all"]
elif config == "RelWithDebInfo":
cflags = [
"-DNDEBUG",
"-Wp,-D_FORTIFY_SOURCE=2",
"-Wp,-D_GLIBCXX_ASSERTIONS",
"-fstack-protector-strong",
"-O3",
"-pipe",
"-ggdb3",
]
elif config == "Debug":
cflags = ["-ggdb3", "-O0"]
if args.enable_address_sanitizer:
cflags += ["-fsanitize=address"]
ldflags += ["-fsanitize=address"]
elif config == "MinSizeRel":
cflags = [
"-DNDEBUG",
"-Wp,-D_FORTIFY_SOURCE=2",
"-Wp,-D_GLIBCXX_ASSERTIONS",
"-fstack-protector-strong",
"-Os",
"-pipe",
"-ggdb3",
]
if is_linux() and platform.machine() == "x86_64":
# The following flags need GCC 8 or newer
cflags += ["-fstack-clash-protection"]
if not args.rv64:
cflags += ["-fcf-protection"]
Enable Address Sanitizer in CI (#19073) ### Description 1. Add two build jobs for enabling Address Sanitizer in CI. One for Windows CPU, One for Linux CPU. 2. Set default compiler flags/linker flags in build.py for normal Windows/Linux/MacOS build. This can help control compiler flags in a more centralized way. 3. All Windows binaries in our official packages will be built with "/PROFILE" flag. Symbols of onnxruntime.dll can be found at [Microsoft public symbol server](https://learn.microsoft.com/en-us/windows-hardware/drivers/debugger/microsoft-public-symbols). Limitations: 1. On Linux Address Sanitizer ignores RPATH settings in ELF binaries. Therefore once Address Sanitizer is enabled, before running tests we need to manually set LD_LIBRARY_PATH properly otherwise libonnxruntime.so may not be able to find custom ops and shared EPs. 4. On Linux we also need to set LD_PRELOAD before running some tests(if the main executable, like python, is not built with address sanitizer. On Windows we do not need to. 5. On Windows before running python tests we should manually copy address sanitizer DLL to the onnxruntime/capi directory, because python 3.8 and above has enabled "Safe DLL Search Mode" that wouldn't use the information provided by PATH env. 6. On Linux Address Sanitizer found a lot of memory leaks from our python binding code. Therefore right now we cannot enable Address Sanitizer when building ONNX Runtime with python binding. 7. Address Sanitizer itself uses a lot of memory address space and delays memory deallocations, which is easy to cause OOM issues in 32-bit applications. We cannot run all the tests in onnxruntime_test_all in 32-bit mode with Address Sanitizer due to this reason. However, we still can run individual tests in such a way. We just cannot run all of them in one process. ### Motivation and Context To catch memory issues.
2024-01-12 15:24:40 +00:00
cxxflags = cflags.copy()
if args.use_cuda:
cudaflags = cflags.copy()
if cxxflags is None and cflags is not None and len(cflags) != 0:
cxxflags = cflags.copy()
2018-11-20 00:48:22 +00:00
config_build_dir = get_config_build_dir(build_dir, config)
os.makedirs(config_build_dir, exist_ok=True)
if args.use_tvm:
os.environ["PATH"] = (
os.path.join(config_build_dir, "_deps", "tvm-build")
+ os.pathsep
+ os.path.join(config_build_dir, "_deps", "tvm-src")
+ os.pathsep
+ os.path.dirname(sys.executable)
+ os.pathsep
+ os.environ["PATH"]
)
Improve dependency management (#13523) ## Description 1. Convert some git submodules to cmake external projects 2. Update nsync from [1.23.0](https://github.com/google/nsync/releases/tag/1.23.0) to [1.25.0](https://github.com/google/nsync/releases/tag/1.25.0) 3. Update re2 from 2021-06-01 to 2022-06-01 4. Update wil from an old commit to 1.0.220914.1 tag 5. Update gtest to a newer commit so that it can optionally leverage absl/re2 for parsing command line flags. The following git submodules are deleted: 1. FP16 2. safeint 3. XNNPACK 4. cxxopts 5. dlpack 7. flatbuffers 8. googlebenchmark 9. json 10. mimalloc 11. mp11 12. pthreadpool More will come. ## Motivation and Context There are 3 ways of integrating 3rd party C/C++ libraries into ONNX Runtime: 1. Install them to a system location, then use cmake's find_package module to locate them. 2. Use git submodules 6. Use cmake's external projects(externalproject_add). At first when this project was just started, we considered both option 2 and option 3. We preferred option 2 because: 1. It's easier to handle authentication. At first this project was not open source, and it had some other non-public dependencies. If we use git submodule, ADO will handle authentication smoothly. Otherwise we need to manually pass tokens around and be very careful on not exposing them in build logs. 2. At that time, cmake fetched dependencies after "cmake" finished generating vcprojects/makefiles. So it was very difficult to make cflags consistent. Since cmake 3.11, it has a new command: FetchContent, which fetches dependencies when it generates vcprojects/makefiles just before add_subdirectories, so the parent project's variables/settings can be easily passed to the child projects. And when the project went on, we had some new concerns: 1. As we started to have more and more EPs and build configs, the number of submodules grew quickly. For more developers, most ORT submodules are not relevant to them. 
They shouldn't need to download all of them. 2. It is impossible to let two different build configs use two different versions of the same dependency. For example, right now we have protobuf 3.18.3 in the submodules. Then every EP must use the same version. Whenever we have a need to upgrade protobuf, we need to coordinate across the whole team and many external developers. I can't manage it anymore. 3. Some projects want to manage the dependencies in a different way, either because of their preference or because of compliance requirements. For example, some Microsoft teams want to use vcpkg, but we don't want to force every user of onnxruntime using vcpkg. 7. Someone wants to dynamically link to protobuf, but our build script only does static link. 8. Hard to handle security vulnerabilities. For example, whenever protobuf has a security patch, we have a lot of things to do. But if we allowed people to build ORT with a different version of protobuf without changing ORT"s source code, the customer who build ORT from source will be able to act on such things in a quicker way. They will not need to wait ORT having a patch release. 9. Every time we do a release, github will also publish a source file zip file and a source file tarball for us. But they are not usable, because they miss submodules. ### New features After this change, users will be able to: 1. Build the dependencies in the way they want, then install them to somewhere(for example, /usr or a temp folder). 2. Or download the dependencies by using cmake commands from these dependencies official website 3. Similar to the above, but use your private mirrors to migrate supply chain risks. 4. Use different versions of the dependencies, as long as our source code is compatible with them. For example, you may use you can't use protobuf 3.20.x as they need code changes in ONNX Runtime. 6. Only download the things the current build needs. 10. Avoid building external dependencies again and again in every build. 
### Breaking change The onnxruntime_PREFER_SYSTEM_LIB build option is removed you could think from now it is default ON. If you don't like the new behavior, you can set FETCHCONTENT_TRY_FIND_PACKAGE_MODE to NEVER. Besides, for who relied on the onnxruntime_PREFER_SYSTEM_LIB build option, please be aware that this PR will change find_package calls from Module mode to Config mode. For example, in the past if you have installed protobuf from apt-get from ubuntu 20.04's official repo, find_package can find it and use it. But after this PR, it won't. This is because that protobuf version provided by Ubuntu 20.04 is too old to support the "config mode". It can be resolved by getting a newer version of protobuf from somewhere.
2022-12-01 17:51:59 +00:00
preinstalled_dir = Path(build_dir) / config
Enable Address Sanitizer in CI (#19073) ### Description 1. Add two build jobs for enabling Address Sanitizer in CI. One for Windows CPU, One for Linux CPU. 2. Set default compiler flags/linker flags in build.py for normal Windows/Linux/MacOS build. This can help control compiler flags in a more centralized way. 3. All Windows binaries in our official packages will be built with "/PROFILE" flag. Symbols of onnxruntime.dll can be found at [Microsoft public symbol server](https://learn.microsoft.com/en-us/windows-hardware/drivers/debugger/microsoft-public-symbols). Limitations: 1. On Linux Address Sanitizer ignores RPATH settings in ELF binaries. Therefore once Address Sanitizer is enabled, before running tests we need to manually set LD_LIBRARY_PATH properly otherwise libonnxruntime.so may not be able to find custom ops and shared EPs. 4. On Linux we also need to set LD_PRELOAD before running some tests(if the main executable, like python, is not built with address sanitizer. On Windows we do not need to. 5. On Windows before running python tests we should manually copy address sanitizer DLL to the onnxruntime/capi directory, because python 3.8 and above has enabled "Safe DLL Search Mode" that wouldn't use the information provided by PATH env. 6. On Linux Address Sanitizer found a lot of memory leaks from our python binding code. Therefore right now we cannot enable Address Sanitizer when building ONNX Runtime with python binding. 7. Address Sanitizer itself uses a lot of memory address space and delays memory deallocations, which is easy to cause OOM issues in 32-bit applications. We cannot run all the tests in onnxruntime_test_all in 32-bit mode with Address Sanitizer due to this reason. However, we still can run individual tests in such a way. We just cannot run all of them in one process. ### Motivation and Context To catch memory issues.
2024-01-12 15:24:40 +00:00
temp_cmake_args = cmake_args.copy()
if cflags is not None and cxxflags is not None and len(cflags) != 0 and len(cxxflags) != 0:
Enable Address Sanitizer in CI (#19073) ### Description 1. Add two build jobs for enabling Address Sanitizer in CI. One for Windows CPU, One for Linux CPU. 2. Set default compiler flags/linker flags in build.py for normal Windows/Linux/MacOS build. This can help control compiler flags in a more centralized way. 3. All Windows binaries in our official packages will be built with "/PROFILE" flag. Symbols of onnxruntime.dll can be found at [Microsoft public symbol server](https://learn.microsoft.com/en-us/windows-hardware/drivers/debugger/microsoft-public-symbols). Limitations: 1. On Linux Address Sanitizer ignores RPATH settings in ELF binaries. Therefore once Address Sanitizer is enabled, before running tests we need to manually set LD_LIBRARY_PATH properly otherwise libonnxruntime.so may not be able to find custom ops and shared EPs. 4. On Linux we also need to set LD_PRELOAD before running some tests(if the main executable, like python, is not built with address sanitizer. On Windows we do not need to. 5. On Windows before running python tests we should manually copy address sanitizer DLL to the onnxruntime/capi directory, because python 3.8 and above has enabled "Safe DLL Search Mode" that wouldn't use the information provided by PATH env. 6. On Linux Address Sanitizer found a lot of memory leaks from our python binding code. Therefore right now we cannot enable Address Sanitizer when building ONNX Runtime with python binding. 7. Address Sanitizer itself uses a lot of memory address space and delays memory deallocations, which is easy to cause OOM issues in 32-bit applications. We cannot run all the tests in onnxruntime_test_all in 32-bit mode with Address Sanitizer due to this reason. However, we still can run individual tests in such a way. We just cannot run all of them in one process. ### Motivation and Context To catch memory issues.
2024-01-12 15:24:40 +00:00
temp_cmake_args += [
"-DCMAKE_C_FLAGS={}".format(" ".join(cflags)),
"-DCMAKE_CXX_FLAGS={}".format(" ".join(cxxflags)),
Enable Address Sanitizer in CI (#19073) ### Description 1. Add two build jobs for enabling Address Sanitizer in CI. One for Windows CPU, One for Linux CPU. 2. Set default compiler flags/linker flags in build.py for normal Windows/Linux/MacOS build. This can help control compiler flags in a more centralized way. 3. All Windows binaries in our official packages will be built with "/PROFILE" flag. Symbols of onnxruntime.dll can be found at [Microsoft public symbol server](https://learn.microsoft.com/en-us/windows-hardware/drivers/debugger/microsoft-public-symbols). Limitations: 1. On Linux Address Sanitizer ignores RPATH settings in ELF binaries. Therefore once Address Sanitizer is enabled, before running tests we need to manually set LD_LIBRARY_PATH properly otherwise libonnxruntime.so may not be able to find custom ops and shared EPs. 4. On Linux we also need to set LD_PRELOAD before running some tests(if the main executable, like python, is not built with address sanitizer. On Windows we do not need to. 5. On Windows before running python tests we should manually copy address sanitizer DLL to the onnxruntime/capi directory, because python 3.8 and above has enabled "Safe DLL Search Mode" that wouldn't use the information provided by PATH env. 6. On Linux Address Sanitizer found a lot of memory leaks from our python binding code. Therefore right now we cannot enable Address Sanitizer when building ONNX Runtime with python binding. 7. Address Sanitizer itself uses a lot of memory address space and delays memory deallocations, which is easy to cause OOM issues in 32-bit applications. We cannot run all the tests in onnxruntime_test_all in 32-bit mode with Address Sanitizer due to this reason. However, we still can run individual tests in such a way. We just cannot run all of them in one process. ### Motivation and Context To catch memory issues.
2024-01-12 15:24:40 +00:00
]
if cudaflags is not None and len(cudaflags) != 0:
temp_cmake_args += ["-DCMAKE_CUDA_FLAGS_INIT={}".format(" ".join(cudaflags))]
Enable Address Sanitizer in CI (#19073) ### Description 1. Add two build jobs for enabling Address Sanitizer in CI. One for Windows CPU, One for Linux CPU. 2. Set default compiler flags/linker flags in build.py for normal Windows/Linux/MacOS build. This can help control compiler flags in a more centralized way. 3. All Windows binaries in our official packages will be built with "/PROFILE" flag. Symbols of onnxruntime.dll can be found at [Microsoft public symbol server](https://learn.microsoft.com/en-us/windows-hardware/drivers/debugger/microsoft-public-symbols). Limitations: 1. On Linux Address Sanitizer ignores RPATH settings in ELF binaries. Therefore once Address Sanitizer is enabled, before running tests we need to manually set LD_LIBRARY_PATH properly otherwise libonnxruntime.so may not be able to find custom ops and shared EPs. 4. On Linux we also need to set LD_PRELOAD before running some tests(if the main executable, like python, is not built with address sanitizer. On Windows we do not need to. 5. On Windows before running python tests we should manually copy address sanitizer DLL to the onnxruntime/capi directory, because python 3.8 and above has enabled "Safe DLL Search Mode" that wouldn't use the information provided by PATH env. 6. On Linux Address Sanitizer found a lot of memory leaks from our python binding code. Therefore right now we cannot enable Address Sanitizer when building ONNX Runtime with python binding. 7. Address Sanitizer itself uses a lot of memory address space and delays memory deallocations, which is easy to cause OOM issues in 32-bit applications. We cannot run all the tests in onnxruntime_test_all in 32-bit mode with Address Sanitizer due to this reason. However, we still can run individual tests in such a way. We just cannot run all of them in one process. ### Motivation and Context To catch memory issues.
2024-01-12 15:24:40 +00:00
if ldflags is not None and len(ldflags) != 0:
temp_cmake_args += [
"-DCMAKE_EXE_LINKER_FLAGS_INIT={}".format(" ".join(ldflags)),
"-DCMAKE_MODULE_LINKER_FLAGS_INIT={}".format(" ".join(ldflags)),
"-DCMAKE_SHARED_LINKER_FLAGS_INIT={}".format(" ".join(ldflags)),
Enable Address Sanitizer in CI (#19073) ### Description 1. Add two build jobs for enabling Address Sanitizer in CI. One for Windows CPU, One for Linux CPU. 2. Set default compiler flags/linker flags in build.py for normal Windows/Linux/MacOS build. This can help control compiler flags in a more centralized way. 3. All Windows binaries in our official packages will be built with "/PROFILE" flag. Symbols of onnxruntime.dll can be found at [Microsoft public symbol server](https://learn.microsoft.com/en-us/windows-hardware/drivers/debugger/microsoft-public-symbols). Limitations: 1. On Linux Address Sanitizer ignores RPATH settings in ELF binaries. Therefore once Address Sanitizer is enabled, before running tests we need to manually set LD_LIBRARY_PATH properly otherwise libonnxruntime.so may not be able to find custom ops and shared EPs. 4. On Linux we also need to set LD_PRELOAD before running some tests(if the main executable, like python, is not built with address sanitizer. On Windows we do not need to. 5. On Windows before running python tests we should manually copy address sanitizer DLL to the onnxruntime/capi directory, because python 3.8 and above has enabled "Safe DLL Search Mode" that wouldn't use the information provided by PATH env. 6. On Linux Address Sanitizer found a lot of memory leaks from our python binding code. Therefore right now we cannot enable Address Sanitizer when building ONNX Runtime with python binding. 7. Address Sanitizer itself uses a lot of memory address space and delays memory deallocations, which is easy to cause OOM issues in 32-bit applications. We cannot run all the tests in onnxruntime_test_all in 32-bit mode with Address Sanitizer due to this reason. However, we still can run individual tests in such a way. We just cannot run all of them in one process. ### Motivation and Context To catch memory issues.
2024-01-12 15:24:40 +00:00
]
2020-04-19 03:48:30 +00:00
run_subprocess(
Adopt linrtunner as the linting tool - take 2 (#15085) ### Description `lintrunner` is a linter runner successfully used by pytorch, onnx and onnx-script. It provides a uniform experience running linters locally and in CI. It supports all major dev systems: Windows, Linux and MacOs. The checks are enforced by the `Python format` workflow. This PR adopts `lintrunner` to onnxruntime and fixed ~2000 flake8 errors in Python code. `lintrunner` now runs all required python lints including `ruff`(replacing `flake8`), `black` and `isort`. Future lints like `clang-format` can be added. Most errors are auto-fixed by `ruff` and the fixes should be considered robust. Lints that are more complicated to fix are applied `# noqa` for now and should be fixed in follow up PRs. ### Notable changes 1. This PR **removed some suboptimal patterns**: - `not xxx in` -> `xxx not in` membership checks - bare excepts (`except:` -> `except Exception`) - unused imports The follow up PR will remove: - `import *` - mutable values as default in function definitions (`def func(a=[])`) - more unused imports - unused local variables 2. Use `ruff` to replace `flake8`. `ruff` is much (40x) faster than flake8 and is more robust. We are using it successfully in onnx and onnx-script. It also supports auto-fixing many flake8 errors. 3. Removed the legacy flake8 ci flow and updated docs. 4. The added workflow supports SARIF code scanning reports on github, example snapshot: ![image](https://user-images.githubusercontent.com/11205048/212598953-d60ce8a9-f242-4fa8-8674-8696b704604a.png) 5. Removed `onnxruntime-python-checks-ci-pipeline` as redundant ### Motivation and Context <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. --> Unified linting experience in CI and local. Replacing https://github.com/microsoft/onnxruntime/pull/14306 --------- Signed-off-by: Justin Chu <justinchu@microsoft.com>
2023-03-24 22:29:03 +00:00
[
Enable Address Sanitizer in CI (#19073) ### Description 1. Add two build jobs for enabling Address Sanitizer in CI. One for Windows CPU, One for Linux CPU. 2. Set default compiler flags/linker flags in build.py for normal Windows/Linux/MacOS build. This can help control compiler flags in a more centralized way. 3. All Windows binaries in our official packages will be built with "/PROFILE" flag. Symbols of onnxruntime.dll can be found at [Microsoft public symbol server](https://learn.microsoft.com/en-us/windows-hardware/drivers/debugger/microsoft-public-symbols). Limitations: 1. On Linux Address Sanitizer ignores RPATH settings in ELF binaries. Therefore once Address Sanitizer is enabled, before running tests we need to manually set LD_LIBRARY_PATH properly otherwise libonnxruntime.so may not be able to find custom ops and shared EPs. 4. On Linux we also need to set LD_PRELOAD before running some tests(if the main executable, like python, is not built with address sanitizer. On Windows we do not need to. 5. On Windows before running python tests we should manually copy address sanitizer DLL to the onnxruntime/capi directory, because python 3.8 and above has enabled "Safe DLL Search Mode" that wouldn't use the information provided by PATH env. 6. On Linux Address Sanitizer found a lot of memory leaks from our python binding code. Therefore right now we cannot enable Address Sanitizer when building ONNX Runtime with python binding. 7. Address Sanitizer itself uses a lot of memory address space and delays memory deallocations, which is easy to cause OOM issues in 32-bit applications. We cannot run all the tests in onnxruntime_test_all in 32-bit mode with Address Sanitizer due to this reason. However, we still can run individual tests in such a way. We just cannot run all of them in one process. ### Motivation and Context To catch memory issues.
2024-01-12 15:24:40 +00:00
*temp_cmake_args,
Adopt linrtunner as the linting tool - take 2 (#15085) ### Description `lintrunner` is a linter runner successfully used by pytorch, onnx and onnx-script. It provides a uniform experience running linters locally and in CI. It supports all major dev systems: Windows, Linux and MacOs. The checks are enforced by the `Python format` workflow. This PR adopts `lintrunner` to onnxruntime and fixed ~2000 flake8 errors in Python code. `lintrunner` now runs all required python lints including `ruff`(replacing `flake8`), `black` and `isort`. Future lints like `clang-format` can be added. Most errors are auto-fixed by `ruff` and the fixes should be considered robust. Lints that are more complicated to fix are applied `# noqa` for now and should be fixed in follow up PRs. ### Notable changes 1. This PR **removed some suboptimal patterns**: - `not xxx in` -> `xxx not in` membership checks - bare excepts (`except:` -> `except Exception`) - unused imports The follow up PR will remove: - `import *` - mutable values as default in function definitions (`def func(a=[])`) - more unused imports - unused local variables 2. Use `ruff` to replace `flake8`. `ruff` is much (40x) faster than flake8 and is more robust. We are using it successfully in onnx and onnx-script. It also supports auto-fixing many flake8 errors. 3. Removed the legacy flake8 ci flow and updated docs. 4. The added workflow supports SARIF code scanning reports on github, example snapshot: ![image](https://user-images.githubusercontent.com/11205048/212598953-d60ce8a9-f242-4fa8-8674-8696b704604a.png) 5. Removed `onnxruntime-python-checks-ci-pipeline` as redundant ### Motivation and Context <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. --> Unified linting experience in CI and local. Replacing https://github.com/microsoft/onnxruntime/pull/14306 --------- Signed-off-by: Justin Chu <justinchu@microsoft.com>
2023-03-24 22:29:03 +00:00
f"-DCMAKE_BUILD_TYPE={config}",
(
f"-DCMAKE_PREFIX_PATH={build_dir}/{config}/installed"
if preinstalled_dir.exists() and not (args.arm64 or args.arm64ec or args.arm)
else ""
),
],
cwd=config_build_dir,
cuda_home=cuda_home,
)
2018-11-20 00:48:22 +00:00
def clean_targets(cmake_path, build_dir, configs):
    """Run the CMake ``clean`` target once for every requested configuration.

    Args:
        cmake_path: CMake executable; used as the first element of the command.
        build_dir: Root build directory. It is combined with each configuration
            through ``get_config_build_dir`` to obtain the directory handed to
            ``cmake --build``.
        configs: Iterable of configuration names. Each one is forwarded to
            CMake via ``--config``. An empty iterable makes this a no-op.
    """
    for config in configs:
        log.info("Cleaning targets for %s configuration", config)
        config_build_dir = get_config_build_dir(build_dir, config)
        cmd_args = [cmake_path, "--build", config_build_dir, "--config", config, "--target", "clean"]
        run_subprocess(cmd_args)
2020-04-19 03:48:30 +00:00
def build_targets(args, cmake_path, build_dir, configs, num_parallel_jobs, target=None):
    """Invoke ``cmake --build`` for every requested configuration.

    Args:
        args: Parsed command-line options. This function reads
            ``cmake_generator``, ``build_wasm``, ``android`` and, when
            ``android`` is truthy, ``android_sdk_path`` / ``android_ndk_path``.
        cmake_path: CMake executable; used as the first element of the command.
        build_dir: Root build directory. It is combined with each configuration
            through ``get_config_build_dir``.
        configs: Iterable of configuration names. An empty iterable makes this
            a no-op.
        num_parallel_jobs: Requested build parallelism. When it equals 1 no
            parallelism switches are passed to the native build tool.
        target: Optional single target name passed via ``--target``; when falsy
            no ``--target`` option is added.
    """
    for config in configs:
        log.info("Building targets for %s configuration", config)
        config_build_dir = get_config_build_dir(build_dir, config)
        cmd_args = [cmake_path, "--build", config_build_dir, "--config", config]
        if target:
            cmd_args.extend(["--target", target])

        # Switches for the native build tool; they go after a "--" separator.
        build_tool_args = []
        if num_parallel_jobs != 1:
            if is_windows() and args.cmake_generator != "Ninja" and not args.build_wasm:
                # https://github.com/Microsoft/checkedc-clang/wiki/Parallel-builds-of-clang-on-Windows suggests
                # not maxing out CL_MPCount.
                # Start with one less than num_parallel_jobs (default is num logical cores),
                # limited to a range of 1..15.
                # That gives maxcpucount projects building, using up to 15 cl.exe instances each.
                build_tool_args += [
                    f"/maxcpucount:{num_parallel_jobs}",
                    # one less than num_parallel_jobs, at least 1, up to 15
                    f"/p:CL_MPCount={min(max(num_parallel_jobs - 1, 1), 15)}",
                    # if nodeReuse is true, msbuild processes will stay around for a bit after the build completes
                    "/nodeReuse:False",
                ]
            elif args.cmake_generator == "Xcode":
                build_tool_args += [
                    "-parallelizeTargets",
                    "-jobs",
                    str(num_parallel_jobs),
                ]
            else:
                build_tool_args += [f"-j{num_parallel_jobs}"]

        # Only emit the "--" separator when there is something to put after it.
        if build_tool_args:
            cmd_args += ["--"]
            cmd_args += build_tool_args

        # Extra environment variables for the build subprocess (Android builds only).
        env = {}
        if args.android:
            env["ANDROID_SDK_ROOT"] = args.android_sdk_path
            env["ANDROID_NDK_HOME"] = args.android_ndk_path

        run_subprocess(cmd_args, env=env)
2018-11-20 00:48:22 +00:00
2020-04-19 03:48:30 +00:00
def add_dir_if_exists(directory, dir_list):
    """Append *directory* to *dir_list* when it is an existing directory.

    Args:
        directory: Path to check with ``os.path.isdir``.
        dir_list: List that is mutated in place; *directory* is appended only
            if the check succeeds. Nothing is returned.
    """
    if os.path.isdir(directory):
        dir_list.append(directory)
2018-11-20 00:48:22 +00:00
2020-04-19 03:48:30 +00:00
def setup_cuda_vars(args):
    """Resolve and validate the CUDA and cuDNN home directories.

    When ``args.use_cuda`` is falsy nothing is looked up and two empty strings
    are returned. Otherwise each path is taken from the corresponding
    command-line option, falling back to the ``CUDA_HOME`` / ``CUDNN_HOME``
    environment variable when the option is empty.

    Args:
        args: Parsed command-line options. This function reads ``use_cuda``
            and, when it is truthy, ``cuda_home`` and ``cudnn_home``.

    Returns:
        A ``(cuda_home, cudnn_home)`` tuple. On Windows ``cudnn_home`` is not
        validated, so it may be ``None`` or a non-existent path.

    Raises:
        BuildError: If ``cuda_home`` is missing or does not exist, or, on
            non-Windows hosts, if ``cudnn_home`` is missing or does not exist.
    """
    cuda_home = ""
    cudnn_home = ""

    if args.use_cuda:
        # An explicit command-line value wins over the environment variable.
        cuda_home = args.cuda_home if args.cuda_home else os.getenv("CUDA_HOME")
        cudnn_home = args.cudnn_home if args.cudnn_home else os.getenv("CUDNN_HOME")

        cuda_home_valid = cuda_home is not None and os.path.exists(cuda_home)
        cudnn_home_valid = cudnn_home is not None and os.path.exists(cudnn_home)

        # cuda_home is always required; cudnn_home is only enforced off Windows.
        if not cuda_home_valid or (not is_windows() and not cudnn_home_valid):
            raise BuildError(
                "cuda_home and cudnn_home paths must be specified and valid.",
                f"cuda_home='{cuda_home}' valid={cuda_home_valid}. cudnn_home='{cudnn_home}' valid={cudnn_home_valid}",
            )

    return cuda_home, cudnn_home
Trt execution provider (#382) * updated cmake files for trt * added trt execution provider * added trt basic test * removed trt_path action attribute * Add files via upload * Update build.py * Update trt_allocator.h * fixed issues found by reviewers * changed cast operator * added comment for custom kernel implementation * changed auto to auto& * changed to function compile APIs for TRT execution provider * changed to function compile APIs for TRT execution provider * added new DType DInt64 * adapted to the changes of onnxruntime_c_api * removed trt kernel (use function compile instead) * updated onnx-tensorrt submodule * set default memory type to TRT fused kernel * resolve merge conflict * fixed the issue that USE_CUDA conflicts with USE_TRT * construct graph by adding nodes in topological order * made changes for Windows * change buffers type * bypass HasImplementationOf check for TRT XP because TRT kernel is not registered * added domain to version info in rebuilt model proto * added trt to test option list * added DomainToVersionMap() to GraphViewer * removed Copy() * fixed broken code * format the code to clang format * used local reference to the frequently used values * fixed a couple of issues according to reviewers feedback * fixed a couple of issues according to reviewers feedback * added python binding for TRT and enable use_cuda when use_trt is on * fixed a redefinition issue * changed shared_ptr to unique_ptr on trt engines, and made a few changes required by reviewers * enabled trtexecution provider for unit tests * renamed trt to tensorrt * added tesorrt to python binding * update submodule onnx and onnx-tensorrt * made a couple of minor changes based on reviewer's feedback * added CUDA_CHECK * removed test code * fixed broken code after merge * updated onnx-tensorrt submodule * added post processing to align trt inputs/outputs with graph inputs/outputs * updated onnx submodule * added CUDA fallback for TensorRT and fixed TensorRT cmake issue * 
added ci pipeline for tensorrt and removed some redundent code from trt xp * fixed syntax issue * updated onnx-tensorrt submodule * fix trt build problem by: (#602) 1. Add additional /wd for debug build 2. Add io.h for additional targets 3. Bring back mb version of getopt * Update install_ubuntu.sh * Update linux-gpu-tensorrt-ci-pipeline.yml * Update linux-gpu-tensorrt-ci-pipeline.yml * Update run_build.sh * Update run_build.sh * Update run_build.sh * Update run_build.sh * fixed the issue that GetKernelRegistry returns nullptr * merged master to this branch * moved some data types to private * fixed tensorrt CI pipeline issue * customized test data for TensorRT pipeline * added onnx-tensorrt in json file and fixed an issue in ci script * added comments
2019-03-14 19:00:39 +00:00
def setup_cann_vars(args):
    """Resolve and validate the CANN home directory.

    When ``args.use_cann`` is falsy nothing is looked up and an empty string is
    returned. Otherwise the path is taken from ``args.cann_home``, falling back
    to the ``ASCEND_HOME_PATH`` environment variable when the option is empty.

    Args:
        args: Parsed command-line options. This function reads ``use_cann``
            and, when it is truthy, ``cann_home``.

    Returns:
        The resolved CANN home path, or ``""`` when CANN is not enabled.

    Raises:
        BuildError: If CANN is enabled and the resolved path is missing or does
            not exist.
    """
    cann_home = ""

    if args.use_cann:
        # An explicit command-line value wins over the environment variable.
        cann_home = args.cann_home if args.cann_home else os.getenv("ASCEND_HOME_PATH")

        cann_home_valid = cann_home is not None and os.path.exists(cann_home)

        if not cann_home_valid:
            raise BuildError(
                "cann_home paths must be specified and valid.",
                f"cann_home='{cann_home}' valid={cann_home_valid}.",
            )

    return cann_home
2020-04-19 03:48:30 +00:00
def setup_tensorrt_vars(args):
    """Resolve the TensorRT home directory and set the ORT_TENSORRT_* environment defaults.

    When ``args.use_tensorrt`` is falsy nothing is modified and ``""`` is returned.
    Otherwise the home directory comes from ``args.tensorrt_home`` or, failing that,
    the ``TENSORRT_HOME`` environment variable, and it must exist on disk.

    Args:
        args: Parsed build arguments; ``use_tensorrt`` and ``tensorrt_home`` are read.

    Returns:
        The resolved TensorRT home path, or ``""`` when TensorRT is not in use.

    Raises:
        BuildError: TensorRT was requested but no existing home directory was found.
    """
    if not args.use_tensorrt:
        return ""

    # An explicit command line value takes precedence over the environment.
    tensorrt_home = args.tensorrt_home or os.getenv("TENSORRT_HOME")
    tensorrt_home_valid = tensorrt_home is not None and os.path.exists(tensorrt_home)
    if not tensorrt_home_valid:
        raise BuildError(
            "tensorrt_home paths must be specified and valid.",
            f"tensorrt_home='{tensorrt_home}' valid={tensorrt_home_valid}.",
        )

    # These assignments overwrite any values already present in the environment.
    os.environ.update(
        {
            # Maximum workspace size in bytes for TensorRT (1GB = 1073741824 bytes).
            "ORT_TENSORRT_MAX_WORKSPACE_SIZE": "1073741824",
            # Maximum number of iterations to detect unsupported nodes
            # and partition the models for TensorRT.
            "ORT_TENSORRT_MAX_PARTITION_ITERATIONS": "1000",
            # Minimum subgraph node size in graph partitioning for TensorRT.
            "ORT_TENSORRT_MIN_SUBGRAPH_SIZE": "1",
            # FP16 flag.
            "ORT_TENSORRT_FP16_ENABLE": "0",
        }
    )
    return tensorrt_home
2020-04-19 03:48:30 +00:00
def setup_migraphx_vars(args):
    """Resolve the MIGraphX home directory for the build.

    The location is taken from ``args.migraphx_home`` or, failing that, the
    ``MIGRAPHX_HOME`` environment variable. Leaving both unset is accepted;
    a location that is supplied but does not exist on disk is an error.

    Args:
        args: Parsed build arguments; ``use_migraphx`` and ``migraphx_home`` are read.

    Returns:
        The resolved MIGraphX home path, or ``""`` when MIGraphX is not in use
        or no location was supplied.

    Raises:
        BuildError: A MIGraphX home was supplied but the path does not exist.
    """
    if not args.use_migraphx:
        return ""

    print(f"migraphx_home = {args.migraphx_home}")
    # Empty strings from either source are normalised to None ("not supplied").
    migraphx_home = args.migraphx_home or os.getenv("MIGRAPHX_HOME") or None

    # Only a supplied path is checked; an absent one is not treated as an error here.
    migraphx_home_valid = migraphx_home is None or os.path.exists(migraphx_home)
    if not migraphx_home_valid:
        # Report the real validity flag; the previous message interpolated the
        # "not valid" flag and therefore printed valid=True for a missing path.
        raise BuildError(
            "migraphx_home paths must be specified and valid.",
            f"migraphx_home='{migraphx_home}' valid={migraphx_home_valid}.",
        )
    return migraphx_home or ""
def setup_dml_build(args, cmake_path, build_dir, configs):
    """Validate the DirectML options and prepare the DirectML dependency.

    Does nothing unless ``args.use_dml`` is set. Otherwise:

    * rejects the combination of ``use_dml`` with ``minimal_build``;
    * if ``args.dml_path`` is set, checks that the expected DirectML files exist under it;
    * else, unless ``args.dml_external_project`` is set, builds the ``RESTORE_PACKAGES``
      target for every configuration to perform the initial NuGet setup.

    Args:
        args: Parsed command line arguments. Reads ``use_dml``, ``minimal_build``,
            ``dml_path`` and ``dml_external_project``.
        cmake_path: CMake executable used to build the restore target.
        build_dir: Root build directory, passed to ``get_config_build_dir``.
        configs: Build configurations to restore packages for.

    Raises:
        BuildError: If ``minimal_build`` is set together with ``use_dml``, or if an
            expected file is missing under ``dml_path``.
    """
    if not args.use_dml:
        return

    # Reject the incompatible option combination up front so that we never run the
    # package restore for a build that is going to be refused anyway.
    if args.minimal_build is not None:
        raise BuildError("use_dml and minimal_build may not both be set")

    if args.dml_path:
        for expected_file in ["bin/DirectML.dll", "lib/DirectML.lib", "include/DirectML.h"]:
            file_path = os.path.join(args.dml_path, expected_file)
            if not os.path.exists(file_path):
                raise BuildError("dml_path is invalid.", f"dml_path='{args.dml_path}' expected_file='{file_path}'.")
    elif not args.dml_external_project:
        for config in configs:
            # Run the RESTORE_PACKAGES target to perform the initial
            # NuGet setup.
            cmd_args = [
                cmake_path,
                "--build",
                get_config_build_dir(build_dir, config),
                "--config",
                config,
                "--target",
                "RESTORE_PACKAGES",
            ]
            run_subprocess(cmd_args)
def setup_rocm_build(args):
    """Resolve and validate the ROCm home directory for the build.

    Args:
        args: Parsed command line arguments. Reads ``use_rocm`` and, when that is
            set, ``rocm_home``.

    Returns:
        ``args.rocm_home`` when ROCm is enabled and a home directory was given,
        otherwise the empty string.

    Raises:
        BuildError: If ROCm is enabled and ``rocm_home`` names a path that does not exist.
    """
    if not args.use_rocm:
        return ""

    print(f"rocm_home = {args.rocm_home}")
    rocm_home = args.rocm_home or None

    # An unset rocm_home is accepted here; only a path that was actually given is checked.
    if rocm_home and not os.path.exists(rocm_home):
        raise BuildError(
            "rocm_home paths must be specified and valid.",
            # Reaching this point means the path does not exist, so report it as not valid.
            f"rocm_home='{rocm_home}' valid=False.",
        )

    return rocm_home or ""
def run_android_tests(args, source_dir, build_dir, config, cwd):
    """Push the native test binaries to an Android device via adb and run them.

    Only runs when ``args.android_abi`` is ``x86_64``; otherwise it logs and returns.
    When ``args.android_run_emulator`` is set, a virtual device is created and an
    emulator is started first, and stopping it is registered for when the function exits.

    Steps, in order: push the test data and binaries, run ``onnxruntime_test_all``,
    delete it from the device, optionally run the Java instrumentation tests through
    gradle, run ``onnx_test_runner`` (with ``-e nnapi`` when ``args.use_nnapi``), and
    finally run the shared library tests when ``args.build_shared_lib`` is set.

    Args:
        args: Parsed command line arguments.
        source_dir: Repository root; used to locate the ONNX backend test data and
            the gradle wrapper.
        build_dir: Not used by this function.
        config: Not used by this function.
        cwd: Build output directory for the configuration. Files are pushed relative
            to it, and its depth determines ``GCOV_PREFIX_STRIP``.
    """
    if args.android_abi != "x86_64":
        log.info(f"--android_abi ({args.android_abi}) is not x86_64, skipping running of Android tests on emulator.")
        return

    sdk_tool_paths = android.get_sdk_tool_paths(args.android_sdk_path)
    device_dir = "/data/local/tmp"

    def adb_push(src, dest, **kwargs):
        return run_subprocess([sdk_tool_paths.adb, "push", src, dest], **kwargs)

    # The variadic parameter is deliberately not called `args`, so that it does not
    # shadow the parsed command line arguments of the enclosing function.
    def adb_shell(*shell_args, **kwargs):
        return run_subprocess([sdk_tool_paths.adb, "shell", *shell_args], **kwargs)

    def run_adb_shell(cmd):
        # GCOV_PREFIX specifies the root directory for creating the runtime code
        # coverage files, and GCOV_PREFIX_STRIP specifies the depth of the directory
        # hierarchy to strip from the paths recorded at build time.
        gcov_env = ""
        if args.code_coverage:
            gcov_env = f"GCOV_PREFIX={device_dir} GCOV_PREFIX_STRIP={cwd.count(os.sep) + 1} "
        adb_shell(f"cd {device_dir} && {gcov_env}{cmd}")

    with contextlib.ExitStack() as context_stack:
        if args.android_run_emulator:
            avd_name = "ort_android"
            system_image = f"system-images;android-{args.android_api};default;{args.android_abi}"

            android.create_virtual_device(sdk_tool_paths, system_image, avd_name)
            emulator_proc = context_stack.enter_context(
                android.start_emulator(
                    sdk_tool_paths=sdk_tool_paths,
                    avd_name=avd_name,
                    extra_args=["-partition-size", "2047", "-wipe-data"],
                )
            )
            context_stack.callback(android.stop_emulator, emulator_proc)

        adb_push("testdata", device_dir, cwd=cwd)
        adb_push(os.path.join(source_dir, "cmake", "external", "onnx", "onnx", "backend", "test"), device_dir, cwd=cwd)
        adb_push("onnxruntime_test_all", device_dir, cwd=cwd)
        adb_shell(f"chmod +x {device_dir}/onnxruntime_test_all")
        adb_push("onnx_test_runner", device_dir, cwd=cwd)
        adb_shell(f"chmod +x {device_dir}/onnx_test_runner")
        run_adb_shell(f"{device_dir}/onnxruntime_test_all")

        # remove onnxruntime_test_all as it takes up a _lot_ of space and can cause insufficient storage errors
        # when we try to copy the java app to the device.
        adb_shell(f"rm {device_dir}/onnxruntime_test_all")

        if args.build_java:
            # use the gradle wrapper under <repo root>/java
            gradle_executable = os.path.join(source_dir, "java", "gradlew.bat" if is_windows() else "gradlew")
            android_test_path = os.path.join(cwd, "java", "androidtest", "android")
            run_subprocess(
                [
                    gradle_executable,
                    "--no-daemon",
                    f"-DminSdkVer={args.android_api}",
                    "clean",
                    "connectedDebugAndroidTest",
                ],
                cwd=android_test_path,
            )

        # Select the NNAPI execution provider for the model tests when it was built.
        runner_options = " -e nnapi" if args.use_nnapi else ""
        run_adb_shell(f"{device_dir}/onnx_test_runner{runner_options} {device_dir}/test")

        # run shared_lib_test if necessary
        if args.build_shared_lib:
            shared_lib_test_files = (
                "libonnxruntime.so",
                "onnxruntime_shared_lib_test",
                "libcustom_op_library.so",
                "libcustom_op_get_const_input_test_library.so",
                "onnxruntime_customopregistration_test",
            )
            for file_name in shared_lib_test_files:
                adb_push(file_name, device_dir, cwd=cwd)

            shared_lib_tests = ("onnxruntime_shared_lib_test", "onnxruntime_customopregistration_test")
            for test_name in shared_lib_tests:
                adb_shell(f"chmod +x {device_dir}/{test_name}")
            for test_name in shared_lib_tests:
                # The pushed .so files live next to the test binaries, so add that
                # directory to the loader search path.
                run_adb_shell(f"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{device_dir} {device_dir}/{test_name}")
def run_ios_tests(args, source_dir, config, cwd):
    """Run the iOS unit tests, and optionally the framework package tests, on a simulator.

    The tests are skipped (with a log message) unless ``--apple_sysroot`` names an
    ``iphonesimulator`` SDK and the host architecture equals ``--osx_arch``.

    Args:
        args: Parsed command line arguments. Reads ``apple_sysroot``, ``osx_arch``,
            ``build_shared_lib`` and ``build_apple_framework``.
        source_dir: Repository root; used to locate the helper scripts under
            ``tools/ci_build/github/apple``.
        config: Build configuration passed to ``xcodebuild`` and used in the
            framework directory name.
        cwd: Build output directory for the configuration; all commands run from it.
    """
    if "iphonesimulator" not in args.apple_sysroot.lower():
        log.info(
            f"Could not detect iphonesimulator target from --apple_sysroot ({args.apple_sysroot}), "
            "skipping running of iOS tests on simulator."
        )
        return

    host_arch = platform.machine()
    if host_arch != args.osx_arch:
        log.info(
            f"Host arch ({host_arch}) and --osx_arch ({args.osx_arch}) mismatch, "
            "skipping running of iOS tests on simulator."
        )
        return

    apple_tools_dir = os.path.join(source_dir, "tools", "ci_build", "github", "apple")

    # The helper script prints a JSON document describing the simulator to use.
    device_info_cmd = [sys.executable, os.path.join(apple_tools_dir, "get_simulator_device_info.py")]
    device_info_text = subprocess.check_output(device_info_cmd, text=True).strip()
    log.debug(f"Simulator device info:\n{device_info_text}")
    simulator_device_info = json.loads(device_info_text)

    xc_test_schemes = ["onnxruntime_test_all_xc"]
    if args.build_shared_lib:
        xc_test_schemes.extend(
            [
                "onnxruntime_shared_lib_test_xc",
                "onnxruntime_customopregistration_test_xc",
            ]
        )

    for scheme in xc_test_schemes:
        xcodebuild_cmd = [
            "xcodebuild",
            "test-without-building",
            "-project",
            "./onnxruntime.xcodeproj",
            "-configuration",
            config,
            "-scheme",
            scheme,
            "-destination",
            f"platform=iOS Simulator,id={simulator_device_info['device_udid']}",
        ]
        run_subprocess(xcodebuild_cmd, cwd=cwd)

    if not args.build_apple_framework:
        return

    package_test_py = os.path.join(apple_tools_dir, "test_apple_packages.py")
    framework_info_file = os.path.join(cwd, "framework_info.json")
    sysroot_dir_name = config + "-" + args.apple_sysroot

    # The dynamic framework is tested first, then the static one.
    framework_dirs = (
        os.path.join(cwd, sysroot_dir_name),
        os.path.join(cwd, sysroot_dir_name, "static_framework"),
    )
    for framework_dir in framework_dirs:
        run_subprocess(
            [
                sys.executable,
                package_test_py,
                "--c_framework_dir",
                framework_dir,
                "--framework_info_file",
                framework_info_file,
                "--variant",
                "Full",
                "--skip_macos_test",
            ],
            cwd=cwd,
        )
def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs):
2018-11-20 00:48:22 +00:00
for config in configs:
log.info("Running tests for %s configuration", config)
cwd = get_config_build_dir(build_dir, config)
Android code coverage (#6061) * Added Onnxruntime_GCOV_COVERAGE flag for Android. * Set CMAKE_SYSTEM_NAME explicityly for Android. * Added GCOV_PREFIX option to collect code coverage data. Added a new python script to generate code coverage info. Modified build pipeline to geneate Android code coverage info * Added build command line option --android_coverage * Added a comment describing the GCOV environment variables * Fixed PEP8 issues. * Added --android_coverage option to the build command. * Increased Android emulator memory from 3K to 8K. * Increased Android partition-size from 2GB to 4GB to overcome no-space-left-on-device error * Removed source_dir from command line args. * Use cwd absolute path to run tests. * Added commands to output the contents of /data/local/tmp on the emulator. * Added run_adb_shell function. * Format changes. * Removed keywd argument cwd. * Removed Android in the --build_dir path. * Removed commands added for debugging. * Removed exxtra new-lines. * Fix MacOs build pipeline failures by uninstalling openssl before running build script. * Revert "Fix MacOs build pipeline failures by uninstalling openssl before running build script." This reverts commit 90d0568fe533e9456c20d061a2d435c8fea48266. * Change dir to the build directory where the tar file is copied. * Changed the option from --android_coverage to --code_coverage * Moved steps to generate Android code coverage to run_nnap_code_coverage.sh * Require --android option if --code_coverage is specified. * No code coverage needed for onnx_test_runner. * Expect that the emulator is running when the script is executed. * Fixed the title in the buildpipeline step. * Fixed the formatting issue. * Added a command line argument, ORT_ROOT, to run_nnapi_code_coverage.sh script Co-authored-by: Satya Jandhyala <satyajandhyala@Satyas-Mac-mini.local>
2020-12-08 18:55:02 +00:00
cwd = os.path.abspath(cwd)
Liqun/e2e transformer test (#3540) * initial change to transformer.py * prepare e2e transformer tests * refactor transformer tests * put test python files in a flat folder * fix typo pip install transform(s) * python 3.6 * python version to 3.6 in install_ubuntu.sh * remove argparser * to use opset ver 12 * workaround loss_scale naming patch in case of loss_fn_ * assign self.loss_fn_ so it can be checked * skip a few un-needed post-process steps * fix loss_scale_input_name, clean up post process steps * skip non-frontend tests * move cpu/cuda related files to coresponding cpu/cuda folder (#3668) Co-authored-by: Weixing Zhang <wezhan@microsoft.com> * type cast for ratio is not necessary for dropout (#3682) Co-authored-by: Weixing Zhang <wezhan@microsoft.com> * thrustallocator is not needed since cub is used directly for gather now. (#3683) Co-authored-by: Weixing Zhang <wezhan@microsoft.com> * GatherND-12 Implementation (#3645) * Renamed, UT passing * Move GatherND CUDA Kerenl into onnxruntime * Merge GatherNDOpTest * Refactor Test code * Merge CPU Kernel Impl * Handle Negative Indice, Fix UT * Improve CUDA kernel to handle negative index * Minor Fixes * Preserve GatherND-1 Cuda kernel * Fix Mac build * fix UT * Fix Build * fix GatherNDOpTest.double > CUDA error cudaErrorInvalidDeviceFunction:invalid device function Co-authored-by: Sherlock Huang <bahuang@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: Peng Wang (pengwa) <pengwa@microsoft.com> * update with reviewers' comments * testBertTrainingGradientAccumulation was not using rtol and may fail occasionally with small (e-06) difference * fix merge mistakes Co-authored-by: liqun <liqun@OrtTrainingDev4.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: Weixing Zhang <weixingzhang@users.noreply.github.com> Co-authored-by: Weixing Zhang <wezhan@microsoft.com> Co-authored-by: Sherlock <baihan.huang@gmail.com> Co-authored-by: Sherlock Huang 
<bahuang@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: Peng Wang (pengwa) <pengwa@microsoft.com>
2020-04-30 19:26:38 +00:00
if args.android:
run_android_tests(args, source_dir, build_dir, config, cwd)
continue
elif args.ios:
run_ios_tests(args, source_dir, config, cwd)
continue
dll_path_list = []
if args.use_tensorrt:
dll_path_list.append(os.path.join(args.tensorrt_home, "lib"))
dll_path = None
if len(dll_path_list) > 0:
dll_path = os.pathsep.join(dll_path_list)
if not ctest_path and not is_windows():
executables = ["onnxruntime_test_all", "onnxruntime_mlas_test"]
if args.build_shared_lib:
executables.append("onnxruntime_shared_lib_test")
executables.append("onnxruntime_global_thread_pools_test")
executables.append("onnxruntime_customopregistration_test")
for exe in executables:
test_output = f"--gtest_output=xml:{cwd}/{exe}.{config}.results.xml"
run_subprocess([os.path.join(cwd, exe), test_output], cwd=cwd, dll_path=dll_path)
else:
ctest_cmd = [ctest_path, "--build-config", config, "--verbose", "--timeout", args.test_all_timeout]
2020-03-11 21:25:37 +00:00
run_subprocess(ctest_cmd, cwd=cwd, dll_path=dll_path)
2018-11-20 00:48:22 +00:00
if args.enable_pybind:
python_path = None
if args.use_tvm:
python_path = str((Path(build_dir) / config / "_deps" / "tvm-src" / "python").resolve())
# Disable python tests in a reduced build as we don't know which ops have been included and which
# models can run.
if is_reduced_ops_build(args) or args.minimal_build is not None:
return
2018-11-20 00:48:22 +00:00
if is_windows():
cwd = os.path.join(cwd, config)
run_subprocess(
[sys.executable, "onnxruntime_test_python.py"], cwd=cwd, dll_path=dll_path, python_path=python_path
)
2021-07-22 22:24:36 +00:00
if not args.disable_contrib_ops:
run_subprocess([sys.executable, "onnxruntime_test_python_sparse_matmul.py"], cwd=cwd, dll_path=dll_path)
2021-07-22 22:24:36 +00:00
if args.enable_symbolic_shape_infer_tests:
run_subprocess(
[sys.executable, "onnxruntime_test_python_symbolic_shape_infer.py"], cwd=cwd, dll_path=dll_path
)
# For CUDA or DML enabled builds test IOBinding feature
if args.use_cuda or args.use_dml:
2020-07-10 21:02:28 +00:00
log.info("Testing IOBinding feature")
run_subprocess([sys.executable, "onnxruntime_test_python_iobinding.py"], cwd=cwd, dll_path=dll_path)
if args.use_cuda:
log.info("Testing CUDA Graph feature")
run_subprocess([sys.executable, "onnxruntime_test_python_cudagraph.py"], cwd=cwd, dll_path=dll_path)
if args.use_dml:
log.info("Testing DML Graph feature")
run_subprocess([sys.executable, "onnxruntime_test_python_dmlgraph.py"], cwd=cwd, dll_path=dll_path)
if not args.disable_ml_ops and not args.use_tensorrt:
run_subprocess([sys.executable, "onnxruntime_test_python_mlops.py"], cwd=cwd, dll_path=dll_path)
if args.use_tensorrt:
run_subprocess(
[sys.executable, "onnxruntime_test_python_nested_control_flow_op.py"], cwd=cwd, dll_path=dll_path
)
2018-11-20 00:48:22 +00:00
try:
Adopt linrtunner as the linting tool - take 2 (#15085) ### Description `lintrunner` is a linter runner successfully used by pytorch, onnx and onnx-script. It provides a uniform experience running linters locally and in CI. It supports all major dev systems: Windows, Linux and MacOs. The checks are enforced by the `Python format` workflow. This PR adopts `lintrunner` to onnxruntime and fixed ~2000 flake8 errors in Python code. `lintrunner` now runs all required python lints including `ruff`(replacing `flake8`), `black` and `isort`. Future lints like `clang-format` can be added. Most errors are auto-fixed by `ruff` and the fixes should be considered robust. Lints that are more complicated to fix are applied `# noqa` for now and should be fixed in follow up PRs. ### Notable changes 1. This PR **removed some suboptimal patterns**: - `not xxx in` -> `xxx not in` membership checks - bare excepts (`except:` -> `except Exception`) - unused imports The follow up PR will remove: - `import *` - mutable values as default in function definitions (`def func(a=[])`) - more unused imports - unused local variables 2. Use `ruff` to replace `flake8`. `ruff` is much (40x) faster than flake8 and is more robust. We are using it successfully in onnx and onnx-script. It also supports auto-fixing many flake8 errors. 3. Removed the legacy flake8 ci flow and updated docs. 4. The added workflow supports SARIF code scanning reports on github, example snapshot: ![image](https://user-images.githubusercontent.com/11205048/212598953-d60ce8a9-f242-4fa8-8674-8696b704604a.png) 5. Removed `onnxruntime-python-checks-ci-pipeline` as redundant ### Motivation and Context <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. --> Unified linting experience in CI and local. Replacing https://github.com/microsoft/onnxruntime/pull/14306 --------- Signed-off-by: Justin Chu <justinchu@microsoft.com>
2023-03-24 22:29:03 +00:00
import onnx # noqa: F401
2018-11-20 00:48:22 +00:00
onnx_test = True
except ImportError as error:
log.exception(error)
log.warning("onnx is not installed. The ONNX tests will be skipped.")
2018-11-20 00:48:22 +00:00
onnx_test = False
2018-11-20 00:48:22 +00:00
if onnx_test:
# Disable python onnx tests for TensorRT and CANN EP, because many tests are
# not supported yet.
if args.use_tensorrt or args.use_cann:
return
run_subprocess(
[sys.executable, "onnxruntime_test_python_backend.py"],
cwd=cwd,
dll_path=dll_path,
python_path=python_path,
)
if not args.disable_contrib_ops:
run_subprocess(
[sys.executable, "-m", "unittest", "discover", "-s", "quantization"], cwd=cwd, dll_path=dll_path
)
if args.enable_transformers_tool_test:
import google.protobuf
import numpy
numpy_init_version = numpy.__version__
pb_init_version = google.protobuf.__version__
run_subprocess(
[
sys.executable,
"-m",
"pip",
"install",
"-r",
"requirements/transformers-test/requirements.txt",
],
cwd=SCRIPT_DIR,
)
run_subprocess([sys.executable, "-m", "pytest", "transformers"], cwd=cwd)
# Restore initial numpy/protobuf version in case other tests use it
run_subprocess([sys.executable, "-m", "pip", "install", "numpy==" + numpy_init_version])
run_subprocess([sys.executable, "-m", "pip", "install", "protobuf==" + pb_init_version])
if not args.disable_ml_ops:
run_subprocess(
[sys.executable, "onnxruntime_test_python_backend_mlops.py"], cwd=cwd, dll_path=dll_path
)
run_subprocess(
[
sys.executable,
os.path.join(source_dir, "onnxruntime", "test", "onnx", "gen_test_models.py"),
"--output_dir",
"test_models",
],
cwd=cwd,
)
2020-03-11 21:25:37 +00:00
if not args.skip_onnx_tests:
run_subprocess([os.path.join(cwd, "onnx_test_runner"), "test_models"], cwd=cwd)
if config != "Debug":
run_subprocess([sys.executable, "onnx_backend_test_series.py"], cwd=cwd, dll_path=dll_path)
if not args.skip_keras_test:
try:
Adopt linrtunner as the linting tool - take 2 (#15085) ### Description `lintrunner` is a linter runner successfully used by pytorch, onnx and onnx-script. It provides a uniform experience running linters locally and in CI. It supports all major dev systems: Windows, Linux and MacOs. The checks are enforced by the `Python format` workflow. This PR adopts `lintrunner` to onnxruntime and fixed ~2000 flake8 errors in Python code. `lintrunner` now runs all required python lints including `ruff`(replacing `flake8`), `black` and `isort`. Future lints like `clang-format` can be added. Most errors are auto-fixed by `ruff` and the fixes should be considered robust. Lints that are more complicated to fix are applied `# noqa` for now and should be fixed in follow up PRs. ### Notable changes 1. This PR **removed some suboptimal patterns**: - `not xxx in` -> `xxx not in` membership checks - bare excepts (`except:` -> `except Exception`) - unused imports The follow up PR will remove: - `import *` - mutable values as default in function definitions (`def func(a=[])`) - more unused imports - unused local variables 2. Use `ruff` to replace `flake8`. `ruff` is much (40x) faster than flake8 and is more robust. We are using it successfully in onnx and onnx-script. It also supports auto-fixing many flake8 errors. 3. Removed the legacy flake8 ci flow and updated docs. 4. The added workflow supports SARIF code scanning reports on github, example snapshot: ![image](https://user-images.githubusercontent.com/11205048/212598953-d60ce8a9-f242-4fa8-8674-8696b704604a.png) 5. Removed `onnxruntime-python-checks-ci-pipeline` as redundant ### Motivation and Context <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. --> Unified linting experience in CI and local. Replacing https://github.com/microsoft/onnxruntime/pull/14306 --------- Signed-off-by: Justin Chu <justinchu@microsoft.com>
2023-03-24 22:29:03 +00:00
import keras # noqa: F401
import onnxmltools # noqa: F401
onnxml_test = True
except ImportError:
log.warning("onnxmltools and keras are not installed. The keras tests will be skipped.")
onnxml_test = False
if onnxml_test:
run_subprocess([sys.executable, "onnxruntime_test_python_keras.py"], cwd=cwd, dll_path=dll_path)
2018-11-20 00:48:22 +00:00
2020-04-19 03:48:30 +00:00
[TVM EP] Rename Standalone TVM (STVM) Execution Provider to TVM EP (#10260) * update java API for STVM EP. Issue is from PR#10019 * use_stvm -> use_tvm * rename stvm worktree * STVMAllocator -> TVMAllocator * StvmExecutionProviderInfo -> TvmExecutionProviderInfo * stvm -> tvm for cpu_targets. resolve onnxruntime::tvm and origin tvm namespaces conflict * STVMRunner -> TVMRunner * StvmExecutionProvider -> TvmExecutionProvider * tvm::env_vars * StvmProviderFactory -> TvmProviderFactory * rename factory funcs * StvmCPUDataTransfer -> TvmCPUDataTransfer * small clean * STVMFuncState -> TVMFuncState * USE_TVM -> NUPHAR_USE_TVM * USE_STVM -> USE_TVM * python API: providers.stvm -> providers.tvm. clean TVM_EP.md * clean build scripts #1 * clean build scripts, java frontend and others #2 * once more clean #3 * fix build of nuphar tvm test * final transfer stvm namespace to onnxruntime::tvm * rename stvm->tvm * NUPHAR_USE_TVM -> USE_NUPHAR_TVM * small fixes for correct CI tests * clean after rebase. Last renaming stvm to tvm, separate TVM and Nuphar in cmake and build files * update CUDA support for TVM EP * roll back CudaNN home check * ERROR for not positive input shape dimension instead of WARNING * update documentation for CUDA * small corrections after review * update GPU description * update GPU description * misprints were fixed * cleaned up error msgs Co-authored-by: Valery Chernov <valery.chernov@deelvin.com> Co-authored-by: KJlaccHoeUM9l <wotpricol@mail.ru> Co-authored-by: Thierry Moreau <tmoreau@octoml.ai>
2022-02-15 09:21:02 +00:00
def tvm_run_python_tests(build_dir, configs):
    """Run ``onnxruntime_test_python_tvm.py`` once for every build configuration.

    Args:
        build_dir: Root build directory; each configuration lives underneath it.
        configs: Iterable of configuration names (the caller decides which ones).

    For each configuration the script is started with the current interpreter
    from the directory returned by ``get_config_build_dir``; on Windows the
    ``config`` sub-directory of that directory is used instead. The absolute
    path of ``<build_dir>/<config>/_deps/tvm-src/python`` is handed to
    ``run_subprocess`` as ``python_path`` (presumably so the TVM Python package
    is importable by the test -- confirm against ``run_subprocess``).
    """
    for config in configs:
        config_dir = get_config_build_dir(build_dir, config)
        # Windows builds place the test script one level deeper, under the config name.
        test_dir = os.path.join(config_dir, config) if is_windows() else config_dir
        tvm_python_dir = os.path.abspath(os.path.join(build_dir, config, "_deps", "tvm-src", "python"))
        test_command = [sys.executable, "onnxruntime_test_python_tvm.py"]
        run_subprocess(test_command, cwd=test_dir, python_path=tvm_python_dir)
def run_nodejs_tests(nodejs_binding_dir):
    """Run ``npm test -- --timeout=90000`` inside the Node.js binding directory.

    Args:
        nodejs_binding_dir: Directory used as the working directory for npm.

    On Windows the command is prefixed with ``cmd /c`` (presumably because npm
    is a batch script there -- confirm if this ever changes).
    """
    npm_command = ["npm", "test", "--", "--timeout=90000"]
    launcher_prefix = ["cmd", "/c"] if is_windows() else []
    run_subprocess(launcher_prefix + npm_command, cwd=nodejs_binding_dir)
def build_python_wheel(
    source_dir,
    build_dir,
    configs,
    use_cuda,
    cuda_version,
    use_rocm,
    use_migraphx,
    rocm_version,
    use_dnnl,
    use_tensorrt,
    use_openvino,
    use_tvm,
    use_vitisai,
    use_acl,
    use_armnn,
    use_dml,
    use_cann,
    use_azure,
    use_qnn,
    wheel_name_suffix,
    enable_training,
    nightly_build=False,
    default_training_package_device=False,
    use_ninja=False,
    enable_training_apis=False,
    enable_rocm_profiling=False,
):
    """Build the Python wheel for every configuration via ``setup.py bdist_wheel``.

    For each entry of ``configs`` the ``setup.py`` found in ``source_dir`` is run
    with the current interpreter. The working directory is the directory
    returned by ``get_config_build_dir``; on Windows, unless ``use_ninja`` is
    set, the ``config`` sub-directory of that directory is used instead.

    Flags forwarded to ``setup.py``:

    * Combinable flags, appended in this order when the matching argument is
      truthy: ``--nightly_build``, ``--default_training_package_device``,
      ``--wheel_name_suffix=<wheel_name_suffix>``, ``--enable_training``,
      ``--enable_training_apis``, ``--enable_rocm_profiling``.
    * Execution provider flags: only the first truthy switch wins, checked in
      the order cuda, rocm, migraphx, openvino, dnnl, tvm, vitisai, acl, armnn,
      dml, cann, qnn, azure.

      - ``use_cuda`` adds ``--wheel_name_suffix=gpu`` (also when a user suffix
        was already forwarded) and ``--cuda_version=<cuda_version>`` if set.
      - ``use_rocm`` adds ``--use_rocm`` and ``--rocm_version=<rocm_version>``
        if set.
      - ``use_dml`` adds ``--wheel_name_suffix=directml``.
      - Every other provider adds its ``--use_<name>`` flag.

    ``use_tensorrt`` is accepted but not read by this function.
    """
    # Flags that can be combined freely; order matches the order they are emitted in.
    combinable_switches = (
        (nightly_build, "--nightly_build"),
        (default_training_package_device, "--default_training_package_device"),
        (wheel_name_suffix, f"--wheel_name_suffix={wheel_name_suffix}"),
        (enable_training, "--enable_training"),
        (enable_training_apis, "--enable_training_apis"),
        (enable_rocm_profiling, "--enable_rocm_profiling"),
    )
    common_flags = [flag for enabled, flag in combinable_switches if enabled]

    # Execution providers are mutually exclusive: the first enabled one decides the flags.
    if use_cuda:
        # Assumes no other EP is enabled alongside CUDA.
        provider_flags = ["--wheel_name_suffix=gpu"]
        if cuda_version:
            provider_flags.append(f"--cuda_version={cuda_version}")
    elif use_rocm:
        provider_flags = ["--use_rocm"]
        if rocm_version:
            provider_flags.append(f"--rocm_version={rocm_version}")
    else:
        single_flag_providers = (
            (use_migraphx, "--use_migraphx"),
            (use_openvino, "--use_openvino"),
            (use_dnnl, "--use_dnnl"),
            (use_tvm, "--use_tvm"),
            (use_vitisai, "--use_vitisai"),
            (use_acl, "--use_acl"),
            (use_armnn, "--use_armnn"),
            (use_dml, "--wheel_name_suffix=directml"),
            (use_cann, "--use_cann"),
            (use_qnn, "--use_qnn"),
            (use_azure, "--use_azure"),
        )
        provider_flags = next(([flag] for enabled, flag in single_flag_providers if enabled), [])

    setup_script = os.path.join(source_dir, "setup.py")
    for config in configs:
        config_dir = get_config_build_dir(build_dir, config)
        # Non-ninja Windows builds keep their outputs in a per-config sub-directory.
        in_config_subdir = is_windows() and not use_ninja
        wheel_dir = os.path.join(config_dir, config) if in_config_subdir else config_dir
        command = [sys.executable, setup_script, "bdist_wheel", *common_flags, *provider_flags]
        run_subprocess(command, cwd=wheel_dir)
2018-11-20 00:48:22 +00:00
2020-04-19 03:48:30 +00:00
def build_nuget_package(
cmake_path,
source_dir,
build_dir,
configs,
use_cuda,
use_rocm,
use_openvino,
use_tensorrt,
use_dnnl,
use_tvm,
use_winml,
use_qnn,
enable_training_apis,
msbuild_extra_options,
):
if not (is_windows() or is_linux()):
raise BuildError(
2023-09-27 17:45:27 +00:00
"Currently csharp builds and nuget package creation is only supported on Windows and Linux platforms."
)
csharp_build_dir = os.path.join(source_dir, "csharp")
# in most cases we don't want/need to include the MAUI mobile targets, as doing so means the mobile workloads
# must be installed on the machine.
# they are only included in the Microsoft.ML.OnnxRuntime nuget package
sln = "OnnxRuntime.DesktopOnly.CSharp.sln"
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
have_exclude_mobile_targets_option = "IncludeMobileTargets=false" in msbuild_extra_options
# derive package name and execution provider based on the build args
target_name = "/t:CreatePackage"
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
execution_provider = "/p:ExecutionProvider=None"
package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime"
enable_training_tests = "/p:TrainingEnabledNativeBuild=false"
if enable_training_apis:
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
enable_training_tests = "/p:TrainingEnabledNativeBuild=true"
if use_cuda:
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.Training.Gpu"
else:
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.Training"
elif use_winml:
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
package_name = "/p:OrtPackageId=Microsoft.AI.MachineLearning"
target_name = "/t:CreateWindowsAIPackage"
elif use_openvino:
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
execution_provider = "/p:ExecutionProvider=openvino"
package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.OpenVino"
elif use_tensorrt:
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
execution_provider = "/p:ExecutionProvider=tensorrt"
package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.TensorRT"
elif use_dnnl:
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
execution_provider = "/p:ExecutionProvider=dnnl"
package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.DNNL"
elif use_cuda:
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.Gpu"
elif use_rocm:
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.ROCm"
[TVM EP] Rename Standalone TVM (STVM) Execution Provider to TVM EP (#10260) * update java API for STVM EP. Issue is from PR#10019 * use_stvm -> use_tvm * rename stvm worktree * STVMAllocator -> TVMAllocator * StvmExecutionProviderInfo -> TvmExecutionProviderInfo * stvm -> tvm for cpu_targets. resolve onnxruntime::tvm and origin tvm namespaces conflict * STVMRunner -> TVMRunner * StvmExecutionProvider -> TvmExecutionProvider * tvm::env_vars * StvmProviderFactory -> TvmProviderFactory * rename factory funcs * StvmCPUDataTransfer -> TvmCPUDataTransfer * small clean * STVMFuncState -> TVMFuncState * USE_TVM -> NUPHAR_USE_TVM * USE_STVM -> USE_TVM * python API: providers.stvm -> providers.tvm. clean TVM_EP.md * clean build scripts #1 * clean build scripts, java frontend and others #2 * once more clean #3 * fix build of nuphar tvm test * final transfer stvm namespace to onnxruntime::tvm * rename stvm->tvm * NUPHAR_USE_TVM -> USE_NUPHAR_TVM * small fixes for correct CI tests * clean after rebase. Last renaming stvm to tvm, separate TVM and Nuphar in cmake and build files * update CUDA support for TVM EP * roll back CudaNN home check * ERROR for not positive input shape dimension instead of WARNING * update documentation for CUDA * small corrections after review * update GPU description * update GPU description * misprints were fixed * cleaned up error msgs Co-authored-by: Valery Chernov <valery.chernov@deelvin.com> Co-authored-by: KJlaccHoeUM9l <wotpricol@mail.ru> Co-authored-by: Thierry Moreau <tmoreau@octoml.ai>
2022-02-15 09:21:02 +00:00
elif use_tvm:
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
execution_provider = "/p:ExecutionProvider=tvm"
package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.Tvm"
elif use_qnn:
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
execution_provider = "/p:ExecutionProvider=qnn"
package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.QNN"
elif any(map(lambda x: "OrtPackageId=" in x, msbuild_extra_options)):
pass
else:
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
# we currently only allow building with mobile targets on Windows.
# it should be possible to allow building with android targets on Linux but that requires updating the
# csproj to separate the inclusion of ios and android targets.
if is_windows() and have_exclude_mobile_targets_option is False:
# use the sln that includes the mobile targets
sln = "OnnxRuntime.CSharp.sln"
# explicitly exclude mobile targets in this case
if sln != "OnnxRuntime.CSharp.sln" and have_exclude_mobile_targets_option is False:
msbuild_extra_options.append("IncludeMobileTargets=false")
# expand extra_options to add prefix
extra_options = ["/p:" + option for option in msbuild_extra_options]
# we have to use msbuild directly if including Xamarin targets as dotnet only supports MAUI (.net6)
use_dotnet = sln != "OnnxRuntime.CSharp.sln"
if use_dotnet:
cmd_args = ["dotnet", "restore", sln, "--configfile", "NuGet.CSharp.config", *extra_options]
else:
cmd_args = ["msbuild", sln, "/t:restore", "/p:RestoreConfigFile=NuGet.CSharp.config", *extra_options]
# set build directory based on build_dir arg
native_dir = os.path.normpath(os.path.join(source_dir, build_dir))
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
ort_build_dir = "/p:OnnxRuntimeBuildDirectory=" + native_dir
run_subprocess(cmd_args, cwd=csharp_build_dir)
# build csharp bindings and create nuget package for each config
for config in configs:
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
configuration = "/p:Configuration=" + config
if not use_winml:
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
cmd_args = ["dotnet"] if use_dotnet else []
cmd_args += [
"msbuild",
sln,
configuration,
package_name,
ort_build_dir,
enable_training_tests,
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
*extra_options,
]
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
run_subprocess(cmd_args, cwd=csharp_build_dir)
else:
winml_interop_dir = os.path.join(source_dir, "csharp", "src", "Microsoft.AI.MachineLearning.Interop")
winml_interop_project = os.path.join(winml_interop_dir, "Microsoft.AI.MachineLearning.Interop.csproj")
winml_interop_project = os.path.normpath(winml_interop_project)
cmd_args = [
"dotnet",
"msbuild",
winml_interop_project,
configuration,
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
"/p:Platform=Any CPU",
ort_build_dir,
"-restore",
]
run_subprocess(cmd_args, cwd=csharp_build_dir)
if is_windows():
if not use_winml:
# user needs to make sure nuget is installed and added to the path variable
nuget_exe = "nuget.exe"
else:
# this path is setup by cmake/nuget_helpers.cmake for MSVC on Windows
nuget_exe = os.path.normpath(os.path.join(native_dir, config, "nuget_exe", "src", "nuget.exe"))
else:
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
# `dotnet pack` is used on Linux
nuget_exe = "NugetExe_not_set"
nuget_exe_arg = '/p:NugetExe="' + nuget_exe + '"'
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
cmd_args = ["dotnet"] if use_dotnet else []
cmd_args += [
"msbuild",
"OnnxRuntime.CSharp.proj",
target_name,
package_name,
configuration,
execution_provider,
ort_build_dir,
nuget_exe_arg,
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
*extra_options,
]
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
run_subprocess(cmd_args, cwd=csharp_build_dir)
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
log.info(f"nuget package was created in the {config} build output directory.")
def run_csharp_tests(source_dir, build_dir, use_cuda, use_openvino, use_tensorrt, use_dnnl, enable_training_apis):
    """Run the C# unit tests with `dotnet test`; returns immediately when not on Windows.

    The enabled build options are translated into a DefineConstants property, and the
    native build directory (build_dir resolved against source_dir) is passed as the
    OnnxRuntimeBuildDirectory property. The command runs from the "csharp" directory
    under source_dir.
    """
    # Currently the tests are only run on Windows.
    if not is_windows():
        return

    # Each enabled build option contributes one fragment (separator included) to the
    # DefineConstants value; fragments are concatenated in the order listed here.
    macro_fragments = (
        (use_openvino, "USE_OPENVINO;"),
        (use_tensorrt, "USE_TENSORRT;"),
        (use_dnnl, "USE_DNNL;"),
        (use_cuda, "USE_CUDA;"),
        (enable_training_apis, "__TRAINING_ENABLED_NATIVE_BUILD__;__ENABLE_TRAINING_APIS__"),
    )
    macros = "".join(fragment for enabled, fragment in macro_fragments if enabled)
    # When no option is enabled this stays an empty string, which is still passed on
    # the command line.
    define_constants = f'/p:DefineConstants="{macros}"' if macros else ""

    # Build directory is derived from the build_dir argument.
    native_build_dir = os.path.normpath(os.path.join(source_dir, build_dir))

    # Only unit tests run as part of the build; the pretrained models test is filtered out.
    # Add "--verbosity", "detailed" to the command if more output is required.
    test_project = "test\\Microsoft.ML.OnnxRuntime.Tests.NetCoreApp\\Microsoft.ML.OnnxRuntime.Tests.NetCoreApp.csproj"
    test_filter = "FullyQualifiedName!=Microsoft.ML.OnnxRuntime.Tests.InferenceTest.TestPreTrainedModels"
    run_subprocess(
        [
            "dotnet",
            "test",
            test_project,
            "--filter",
            test_filter,
            define_constants,
            f'/p:OnnxRuntimeBuildDirectory="{native_build_dir}"',
        ],
        cwd=os.path.join(source_dir, "csharp"),
    )
def generate_documentation(source_dir, build_dir, configs, validate):
    """Regenerate the operator documentation and optionally check it against git.

    Copies gen_contrib_doc.py and gen_opkernel_doc.py from tools/python into the build
    directory of one config (as returned by get_config_build_dir, plus a config
    subdirectory on Windows) and runs them there, writing docs/ContribOperators.md and
    docs/OperatorKernels.md under source_dir.

    Args:
        source_dir: Repository root containing the "docs" and "tools/python" directories.
        build_dir: Build root passed to get_config_build_dir.
        configs: Iterable of build configurations; only the first one yielded is used.
        validate: If true, run `git diff` on the regenerated files and fail if they
            differ from the checked-in versions.

    Raises:
        BuildError: If validation finds differences, or if `git diff` exits non-zero.
    """
    # Any single build config will do, so use the first one provided.
    config = next(iter(configs))
    cwd = get_config_build_dir(build_dir, config)
    if is_windows():
        cwd = os.path.join(cwd, config)

    contrib_op_doc_path = os.path.join(source_dir, "docs", "ContribOperators.md")
    opkernel_doc_path = os.path.join(source_dir, "docs", "OperatorKernels.md")

    # The generator scripts are executed from a copy placed in the build directory.
    shutil.copy(os.path.join(source_dir, "tools", "python", "gen_contrib_doc.py"), cwd)
    shutil.copy(os.path.join(source_dir, "tools", "python", "gen_opkernel_doc.py"), cwd)

    # limit to just com.microsoft (excludes purely internal stuff like com.microsoft.nchwc).
    run_subprocess(
        [sys.executable, "gen_contrib_doc.py", "--output_path", contrib_op_doc_path, "--domains", "com.microsoft"],
        cwd=cwd,
    )

    # we currently limit the documentation created by a build to a subset of EP's.
    # Run gen_opkernel_doc.py directly if you need/want documentation from other EPs that are enabled in the build.
    run_subprocess(
        [
            sys.executable,
            "gen_opkernel_doc.py",
            "--output_path",
            opkernel_doc_path,
            "--providers",
            "CPU",
            "CUDA",
            "DML",
        ],
        cwd=cwd,
    )

    if not validate:
        return

    def has_diff(path, regenerate_qualifiers=""):
        """Return True, after logging a warning, if `path` differs from the checked-in version."""
        # Keep the try body limited to the only call that raises CalledProcessError,
        # and chain the original error so the git failure details are not lost.
        try:
            raw_diff = subprocess.check_output(["git", "diff", "--ignore-blank-lines", path], cwd=source_dir)
        except subprocess.CalledProcessError as e:
            raise BuildError("git diff returned non-zero error code") from e

        diff = raw_diff.decode("utf-8")
        if not diff:
            return False

        log.warning(
            f"The updated document {path} is different from the checked in version. "
            f"Please regenerate the file{regenerate_qualifiers}, or copy the updated version from the "
            "CI build's published artifacts if applicable."
        )
        log.debug("diff:\n%s", diff)
        return True

    # Evaluate both files before failing so every out-of-date document gets reported.
    diff_results = [
        has_diff(opkernel_doc_path, " with CPU, CUDA and DML execution providers enabled"),
        has_diff(contrib_op_doc_path),
    ]
    if any(diff_results):
        # Output for the CI to publish the updated md files as an artifact
        print("##vso[task.setvariable variable=DocUpdateNeeded]true")
        raise BuildError("Generated documents have diffs. Check build output for details.")
2018-11-20 00:48:22 +00:00
def main():
log.debug("Command line arguments:\n {}".format(" ".join(shlex.quote(arg) for arg in sys.argv[1:]))) # noqa: G001
2018-11-20 00:48:22 +00:00
args = parse_arguments()
Flash Attention v2 MHA (#17227) ### Description Integrate Flash Attention V2 to PackedMultiHeadAttention, MultiHeadAttention and Attention operators. Flash Attention v2 source code is from https://github.com/Dao-AILab/flash-attention/tree/main/csrc/flash_attn/src. We did some change to remove dependency on Torch, then removed backward and bfloat16 related code. Add benchmark script (see benchmark_mha.sh) to compare different attention kernels for MultiHeadAttention operator. Current limitations for Flash Attention in PackedMultiHeadAttention, MultiHeadAttention and Attention operators: * Relative Position Bias is not supported * Different hidden size for Q and V is not supported * Only float16 is supported * Padding/attention mask is not supported * For MultiHeadAttention, when there is past or present input, bias shall be provided to activate flash attention * For Attention, past or present inputs will deactivate flash attention * Causal is not supported Some limitations (like attention mask and causal) might be removed later. Currently, Flash Attention v2 only works in Linux. For Windows, we will enable later with Cutlass 3.2. Two environment variables can be used for testing purpose: (1) `ORT_DISABLE_FLASH_ATTENTION` to disable flash attention. Default value is 0 (enable). Set it to "1" to disable it. (2) `ORT_MIN_SEQ_LEN_FLASH_ATTENTION_PACKED_QKV`. Default value is "513", which means that we only enable flash attention when sequence length is larger than 512 for packed QKV format. Set it to "0" if you want to use flash attention v2 whenever possible. ### Speedup The following result is from Standard_ND96amsr_A100_v4 VM (A100-SXM4-80GB GPU) using benchmark_mha.sh. The metric is TFLOPs per second for MultiHeadAttention operator. 
There are 3 input formats: * `Q,K,V` means separated inputs query, key and value of BxSxNH * `Q,KV` means packed KV, where key is 5D: BxSxNx2xH * `QKV` means packed QKV, where query is 5D: BxSxNx3xH Note that flash attention cannot use packed QKV format, so extra Transpose is needed. We found that TensorRT kernel is faster for sequence length <= 512 for packed QKV. The reason might be no transpose is needed for TensorRT kernel in this format. We also notice that, TensorRT kernel is faster for stable diffusion 512x512 image (see seq_len=4096, heads=8, head_dim=40 below), while flash attention v2 is faster for 1024x1024 image (see seq_len=16384, heads=8, head_dim=40 below). input format | batch size | sequence length | heads | head dim | flash_v2 (TFLOPs/s) | TensorRT (TFLOPs/s) | Memory Efficient Attention (TFLOPs/s) -- | -- | -- | -- | -- | -- | -- | -- Q,K,V | 32 | 512 | 64 | 32 | 78.1 | 60.0 | 39.3 Q,K,V | 32 | 512 | 128 | 16 | 46.8 | 44.1 | 21.7 Q,K,V | 16 | 1024 | 64 | 32 | 99.0 | 72.8 | 44.3 Q,K,V | 16 | 1024 | 128 | 16 | 54.7 | 49.2 | 23.4 Q,K,V | 8 | 2048 | 64 | 32 | 113.8 | 81.2 | 47.8 Q,K,V | 8 | 2048 | 128 | 16 | 59.7 | 51.9 | 24.7 Q,K,V | 4 | 4096 | 64 | 32 | 122.5 | 85.6 | 49.7 Q,K,V | 4 | 4096 | 128 | 16 | 62.5 | 53.3 | 25.3 Q,K,V | 2 | 8192 | 64 | 32 | 127.4 | 87.5 | 50.7 Q,K,V | 2 | 8192 | 128 | 16 | 64.0 | 54.2 | 25.6 Q,K,V | 1 | 16384 | 64 | 32 | 129.5 | 91.0 | 51.2 Q,K,V | 1 | 16384 | 128 | 16 | 64.7 | 54.5 | 25.8 Q,K,V | 1 | 4096 | 8 | 40 | 51.0 | 43.6 | 36.8 Q,K,V | 1 | 4096 | 8 | 80 | 97.7 | 77.0 | 55.5 Q,K,V | 1 | 4096 | 8 | 160 | 120.0 | 39.7 | 57.8 Q,K,V | 4 | 4096 | 8 | 40 | 89.0 | 84.4 | 49.2 Q,K,V | 4 | 4096 | 8 | 80 | 133.0 | 92.2 | 63.2 Q,K,V | 4 | 4096 | 8 | 160 | 164.8 | 42.7 | 63.8 Q,K,V | 1 | 16384 | 8 | 40 | 96.9 | 91.3 | 52.1 Q,K,V | 1 | 16384 | 8 | 80 | 142.9 | 101.5 | 65.6 Q,K,V | 1 | 16384 | 8 | 160 | 177.4 | 44.2 | 65.7 Q,K,V | 128 | 128 | 12 | 64 | 29.0 | 26.9 | 25.7 Q,K,V | 64 | 128 | 12 | 64 | 23.1 | 10.8 | 21.3 Q,K,V | 128 
| 384 | 12 | 64 | 83.5 | 60.8 | 55.7 Q,K,V | 64 | 384 | 12 | 64 | 72.6 | 40.5 | 52.8 Q,K,V | 128 | 512 | 12 | 64 | 98.9 | 77.9 | 62.1 Q,K,V | 64 | 512 | 12 | 64 | 94.7 | 75.6 | 60.4 Q,KV | 32 | 512 | 64 | 32 | 85.9 | 41.1 | 41.1 Q,KV | 32 | 512 | 128 | 16 | 47.1 | 21.6 | 21.6 Q,KV | 16 | 1024 | 64 | 32 | 104.4 | 45.8 | 45.8 Q,KV | 16 | 1024 | 128 | 16 | 54.7 | 23.6 | 23.6 Q,KV | 8 | 2048 | 64 | 32 | 116.8 | 48.5 | 48.5 Q,KV | 8 | 2048 | 128 | 16 | 59.8 | 24.7 | 24.7 Q,KV | 4 | 4096 | 64 | 32 | 124.2 | 50.1 | 50.1 Q,KV | 4 | 4096 | 128 | 16 | 62.6 | 25.3 | 25.3 Q,KV | 2 | 8192 | 64 | 32 | 128.5 | 50.8 | 50.9 Q,KV | 2 | 8192 | 128 | 16 | 64.1 | 25.6 | 25.6 Q,KV | 1 | 16384 | 64 | 32 | 129.4 | 51.2 | 51.2 Q,KV | 1 | 16384 | 128 | 16 | 64.8 | 25.8 | 25.8 Q,KV | 1 | 4096 | 8 | 40 | 67.5 | 37.7 | 37.5 Q,KV | 1 | 4096 | 8 | 80 | 101.3 | 56.7 | 56.6 Q,KV | 1 | 4096 | 8 | 160 | 124.0 | 58.6 | 58.6 Q,KV | 4 | 4096 | 8 | 40 | 90.8 | 49.8 | 49.8 Q,KV | 4 | 4096 | 8 | 80 | 135.6 | 63.8 | 63.8 Q,KV | 4 | 4096 | 8 | 160 | 166.3 | 64.5 | 64.5 Q,KV | 1 | 16384 | 8 | 40 | 97.5 | 52.3 | 52.3 Q,KV | 1 | 16384 | 8 | 80 | 143.5 | 65.9 | 65.8 Q,KV | 1 | 16384 | 8 | 160 | 178.4 | 65.9 | 65.8 Q,KV | 128 | 128 | 12 | 64 | 26.8 | 48.1 | 30.9 Q,KV | 64 | 128 | 12 | 64 | 28.0 | 38.9 | 25.0 Q,KV | 128 | 384 | 12 | 64 | 97.7 | 61.1 | 61.0 Q,KV | 64 | 384 | 12 | 64 | 89.5 | 57.8 | 57.9 Q,KV | 128 | 512 | 12 | 64 | 111.9 | 66.7 | 66.9 Q,KV | 64 | 512 | 12 | 64 | 107.2 | 64.9 | 64.8 QKV | 32 | 512 | 64 | 32 | 77.2 | 84.7 | 39.3 QKV | 32 | 512 | 128 | 16 | 43.4 | 53.1 | 20.9 QKV | 16 | 1024 | 64 | 32 | 98.8 | 87.4 | 44.6 QKV | 16 | 1024 | 128 | 16 | 52.0 | 54.1 | 23.2 QKV | 8 | 2048 | 64 | 32 | 113.1 | 89.0 | 47.9 QKV | 8 | 2048 | 128 | 16 | 58.2 | 54.6 | 24.5 QKV | 4 | 4096 | 64 | 32 | 120.6 | 89.7 | 49.7 QKV | 4 | 4096 | 128 | 16 | 61.7 | 54.6 | 25.2 QKV | 2 | 8192 | 64 | 32 | 125.9 | 89.5 | 50.7 QKV | 2 | 8192 | 128 | 16 | 63.6 | 54.8 | 25.5 QKV | 1 | 16384 | 64 | 32 | 128.5 | 92.0 | 51.2 QKV | 1 
| 16384 | 128 | 16 | 64.6 | 54.8 | 25.7 QKV | 1 | 4096 | 8 | 40 | 60.2 | **69.8** | 38.1 QKV | 1 | 4096 | 8 | 80 | 101.6 | 75.2 | 56.7 QKV | 1 | 4096 | 8 | 160 | 130.2 | 41.2 | 58.4 QKV | 4 | 4096 | 8 | 40 | 90.6 | **91.0** | 49.5 QKV | 4 | 4096 | 8 | 80 | 133.6 | 98.1 | 62.8 QKV | 4 | 4096 | 8 | 160 | 165.3 | 43.7 | 63.9 QKV | 1 | 16384 | 8 | 40 | 97.2 | 92.8 | 52.1 QKV | 1 | 16384 | 8 | 80 | 143.0 | 103.1 | 65.6 QKV | 1 | 16384 | 8 | 160 | 177.6 | 44.5 | 65.7 QKV | 128 | 128 | 12 | 64 | 31.1 | 65.9 | 27.6 QKV | 64 | 128 | 12 | 64 | 26.1 | 49.8 | 23.5 QKV | 128 | 384 | 12 | 64 | 84.6 | 88.5 | 56.1 QKV | 64 | 384 | 12 | 64 | 79.1 | 80.3 | 53.5 QKV | 128 | 512 | 12 | 64 | 97.3 | 114.2 | 62.2 QKV | 64 | 512 | 12 | 64 | 95.9 | 110.7 | 60.6 QKV | 4 | 2048 | 32 | 128 | 125.26 | 44.72 | 78.15 QKV | 4 | 4096 | 32 | 128 | 141.62 | 46.29 | 85.84 QKV | 8 | 2048 | 32 | 128 | 127.40 | 45.49 | 78.75 QKV | 8 | 4096 | 32 | 128 | 144.24 | 46.60 | 86.95 ### Known Issues NVCC uses huge memory while compiling flash attention CUDA kernel. Linux build with CUDA might fail when machine has limited memory while number of CPUs is large. Walkaround is to use a build machine with larger memory, or use argument like `--nvcc_threads 1` to limit nvcc threads in build. ### Motivation and Context Increases speed and efficiency of MHA or Packed MHA. --------- Co-authored-by: Tianlei Wu <tlwu@microsoft.com> Co-authored-by: tlwu@microsoft.com <tlwu@a100.crj0ad2y1kku1j4yxl4sj10o4e.gx.internal.cloudapp.net>
2023-08-31 20:52:21 +00:00
print(args)
if os.getenv("ORT_BUILD_WITH_CACHE") == "1":
args.use_cache = True
if not is_windows():
if not args.allow_running_as_root:
is_root_user = os.geteuid() == 0
if is_root_user:
raise BuildError(
"Running as root is not allowed. If you really want to do that, use '--allow_running_as_root'."
)
cmake_extra_defines = normalize_arg_list(args.cmake_extra_defines)
cross_compiling = args.arm or args.arm64 or args.arm64ec or args.android
Enable Address Sanitizer in CI (#19073) ### Description 1. Add two build jobs for enabling Address Sanitizer in CI. One for Windows CPU, One for Linux CPU. 2. Set default compiler flags/linker flags in build.py for normal Windows/Linux/MacOS build. This can help control compiler flags in a more centralized way. 3. All Windows binaries in our official packages will be built with "/PROFILE" flag. Symbols of onnxruntime.dll can be found at [Microsoft public symbol server](https://learn.microsoft.com/en-us/windows-hardware/drivers/debugger/microsoft-public-symbols). Limitations: 1. On Linux Address Sanitizer ignores RPATH settings in ELF binaries. Therefore once Address Sanitizer is enabled, before running tests we need to manually set LD_LIBRARY_PATH properly otherwise libonnxruntime.so may not be able to find custom ops and shared EPs. 4. On Linux we also need to set LD_PRELOAD before running some tests(if the main executable, like python, is not built with address sanitizer. On Windows we do not need to. 5. On Windows before running python tests we should manually copy address sanitizer DLL to the onnxruntime/capi directory, because python 3.8 and above has enabled "Safe DLL Search Mode" that wouldn't use the information provided by PATH env. 6. On Linux Address Sanitizer found a lot of memory leaks from our python binding code. Therefore right now we cannot enable Address Sanitizer when building ONNX Runtime with python binding. 7. Address Sanitizer itself uses a lot of memory address space and delays memory deallocations, which is easy to cause OOM issues in 32-bit applications. We cannot run all the tests in onnxruntime_test_all in 32-bit mode with Address Sanitizer due to this reason. However, we still can run individual tests in such a way. We just cannot run all of them in one process. ### Motivation and Context To catch memory issues.
2024-01-12 15:24:40 +00:00
if args.enable_address_sanitizer:
# Disable ONNX Runtime's builtin memory checker
args.disable_memleak_checker = True
2020-04-19 03:48:30 +00:00
# If there was no explicit argument saying what to do, default
# to update, build and test (for native builds).
if not (args.update or args.clean or args.build or args.test or args.gen_doc):
log.debug("Defaulting to running update, build [and test for native builds].")
2018-11-20 00:48:22 +00:00
args.update = True
args.build = True
if cross_compiling:
args.test = args.android_abi == "x86_64" or args.android_abi == "arm64-v8a"
else:
args.test = True
2018-11-20 00:48:22 +00:00
if args.skip_tests:
args.test = False
Trt execution provider (#382) * updated cmake files for trt * added trt execution provider * added trt basic test * removed trt_path action attribute * Add files via upload * Update build.py * Update trt_allocator.h * fixed issues found by reviewers * changed cast operator * added comment for custom kernel implementation * changed auto to auto& * changed to function compile APIs for TRT execution provider * changed to function compile APIs for TRT execution provider * added new DType DInt64 * adapted to the changes of onnxruntime_c_api * removed trt kernel (use function compile instead) * updated onnx-tensorrt submodule * set default memory type to TRT fused kernel * resolve merge conflict * fixed the issue that USE_CUDA conflicts with USE_TRT * construct graph by adding nodes in topological order * made changes for Windows * change buffers type * bypass HasImplementationOf check for TRT XP because TRT kernel is not registered * added domain to version info in rebuilt model proto * added trt to test option list * added DomainToVersionMap() to GraphViewer * removed Copy() * fixed broken code * format the code to clang format * used local reference to the frequently used values * fixed a couple of issues according to reviewers feedback * fixed a couple of issues according to reviewers feedback * added python binding for TRT and enable use_cuda when use_trt is on * fixed a redefinition issue * changed shared_ptr to unique_ptr on trt engines, and made a few changes required by reviewers * enabled trtexecution provider for unit tests * renamed trt to tensorrt * added tesorrt to python binding * update submodule onnx and onnx-tensorrt * made a couple of minor changes based on reviewer's feedback * added CUDA_CHECK * removed test code * fixed broken code after merge * updated onnx-tensorrt submodule * added post processing to align trt inputs/outputs with graph inputs/outputs * updated onnx submodule * added CUDA fallback for TensorRT and fixed TensorRT cmake issue * 
added ci pipeline for tensorrt and removed some redundent code from trt xp * fixed syntax issue * updated onnx-tensorrt submodule * fix trt build problem by: (#602) 1. Add additional /wd for debug build 2. Add io.h for additional targets 3. Bring back mb version of getopt * Update install_ubuntu.sh * Update linux-gpu-tensorrt-ci-pipeline.yml * Update linux-gpu-tensorrt-ci-pipeline.yml * Update run_build.sh * Update run_build.sh * Update run_build.sh * Update run_build.sh * fixed the issue that GetKernelRegistry returns nullptr * merged master to this branch * moved some data types to private * fixed tensorrt CI pipeline issue * customized test data for TensorRT pipeline * added onnx-tensorrt in json file and fixed an issue in ci script * added comments
2019-03-14 19:00:39 +00:00
if args.use_tensorrt:
args.use_cuda = True
if args.build_wheel or args.gen_doc or args.use_tvm or args.enable_training:
2018-11-20 00:48:22 +00:00
args.enable_pybind = True
if (
args.build_csharp
or args.build_nuget
or args.build_java
or args.build_nodejs
or (args.enable_pybind and not args.enable_training)
):
# If python bindings are enabled, we embed the shared lib in the python package.
# If training is enabled, we don't embed the shared lib in the python package since training requires
# torch interop.
args.build_shared_lib = True
2018-11-20 00:48:22 +00:00
if args.build_nuget and cross_compiling:
raise BuildError("Currently nuget package creation is not supported while cross-compiling")
if args.enable_pybind:
if args.disable_rtti:
raise BuildError("Python bindings use typeid so you can't disable RTTI")
Update manylinux build scripts and GPU CUDA version from 11.0 to 11.1 (#7632) 1. Update manylinux build scripts. This will add [PEP600](https://www.python.org/dev/peps/pep-0600/)(manylinux2 tags) support. numpy has adopted this new feature, we should do the same. The old build script files were copied from https://github.com/pypa/manylinux, but they has been deleted and replaced in the upstream repo. The manylinux repo doesn't have a manylinux2014 branch anymore. So I'm removing the obsolete code, sync the files with the latest master. 2. Update GPU CUDA version from 11.0 to 11.1(after a discussion with PMs). 3. Delete tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda10_2. (Merged the content to tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11) 4. Modernize the cmake code of how to locate python devel files. It was suggested in https://github.com/onnx/onnx/pull/1631 . 5. Remove `onnxruntime_MSVC_STATIC_RUNTIME` and `onnxruntime_GCC_STATIC_CPP_RUNTIME` build options. Now cmake has builtin support for it. Starting from cmake 3.15, we can use `CMAKE_MSVC_RUNTIME_LIBRARY` cmake variable to choose which MSVC runtime library we want to use. 6. Update Ubuntu docker images that used in our CI build from Ubuntu 18.04 to Ubuntu 20.04. 7. Update GCC version in CUDA 11.1 pipelines from 8.x to 9.3.1 8. Split Linux GPU CI pipeline to two jobs: build the code on a CPU machine then run the tests on another GPU machines. In the past we didn't test our python packages. We only tested the pre-packed files. So we didn't catch the rpath issue in CI build. 9. Add a CentOS machine pool and test our Linux GPU build on real CentOS machines. 10. Rework ARM64 Linux GPU python packaging pipeline. Previously it uses cross-compiling therefore we must static link to C Runtime. But now have pluggable EP API and it doesn't support static link. So I changed to use qemu emulation instead. Now the build is 10x slower than before. But it is more extensible.
2021-06-03 06:36:49 +00:00
if args.disable_exceptions:
raise BuildError("Python bindings require exceptions to be enabled.")
if args.minimal_build is not None:
raise BuildError("Python bindings are not supported in a minimal build.")
if args.nnapi_min_api:
if not args.use_nnapi:
raise BuildError("Using --nnapi_min_api requires --use_nnapi")
if args.nnapi_min_api < 27:
raise BuildError("--nnapi_min_api should be 27+")
if args.build_wasm_static_lib:
args.build_wasm = True
build ONNXRuntime into WebAssembly (#6478) * Simplified version of WebAssembly support to keep most of existing data structures and add cmake using Ninja and emcmake * Clean up CMakeLists.txt and add an example to create and compute a kernel * Load a model from bytes and remove graph building steps * Add all cpu and contrib ops with mlas library * WebAssembly build with Onnxruntime C/CXX API * Use protobuf cmakefile directory instead of adding every necessary source file * Fix invalid output at example * add missing files * Change an example to use Teams model and support ort mobile format * add API for javascript * fix input releasing in _ort_run() * update API * Let onnxruntime cmake build WebAssembly with option '--wasm' * allow one-step building for wasm * Make build script working on Linux and MacOS * Fix broken build from Windows command * Enable unit test on building WebAssembly * Resolve comments * update build flags * wasm conv improvement from: 1) GemmV; 2) Depthwise direct convolution 3x3; 3) Direct convolution 3x3 * Cleaned mlas unittest. * use glob * update comments * Update baseline due to loss scale fix (#6948) * fix stream sync issue (#6954) * Enable type reduction in EyeLike, Mod, random.cc CPU kernels. (#6960) * Update EyeLike CPU kernel. * Update Mod CPU kernel. * Update Multinomial CPU kernel. * Slight improvement to Pad CPU kernel binary size. * Update RandomNormal[Like], RandomUniform[Like] CPU kernels. * Fix warning from setting multiple MSVC warning level options. (#6917) Fix warning from setting multiple MSVC warning level options. Replace an existing /Wn flag instead of always appending a new one. * MLAS: quantized GEMM update (#6916) Various updates to the int8_t GEMMs: 1) Add ARM64 udot kernel to take advantage of dot product instructions available in newer cores. Some models run 4x faster than the stock implementation we used before. 
2) Refactor the x64 kernels to share common code for AVX2(u8u8/u8s8/avxvnni) vs AVX512(u8u8/u8s8/avx512vnni) to reduce binary size. 3) Extend kernels to support per-column zero points for matrix B. This is not currently wired to an operator. * Implement QLinearAveragePool with unit tests. (#6896) Implement QLinearAveragePool with unit tests. * Attention fusion detect num_heads and hidden_size automatically (#6920) * fixed type to experimental session constructor (#6950) * fixed type to experimental session constructor Co-authored-by: David Medine <david.medine@brainproducts.com> * Update onnxruntime_perf_test.exe to accept free dimension overrides (#6962) Co-authored-by: Ori Levari <orlevari@microsoft.com> * Fix possible fd leak in NNAPI (#6966) * Release buffers for prepacked tensors (#6820) Unsolved problems: 1. One test failure was caused by a bug in Cudnn rnn kernels, when they can allocate a buffer and partially initialize it, the garbage data near tail of the buffer caused problem in some of the hardware. To attack this problem in a broader sense, should we add code in our allocators, and during a memory fuzzing test, fill an allocated buffer with garbage before returning to the caller? 2. Prepacking is used more widely than we know. For instance, Cudnn rnn kernels also cache their weights. They mix several weight tensors together into a single buffer, and never touch the original weight tensor anymore. This is the same idea with pre-pack, but they didn't override the virtual function, and they never tried to release those weight tensors, leading to memory waste. It also seems to me that there are some other kernels have similar behavior. Wonder how much memory we can save if we try to cleanup those too. 3. Turning off memory pattern planning does increase memory fragmentation, leading to out of memory error in some training test cases. 
Perhaps we can revisit the idea of pushing kernels-creation stage earlier, and then during initializer deserialization, we only avoid tracing those that will be prepacked. * Enable type reduction for Range, ReverseSequence, ScatterND, Split, and Unique CPU kernels. (#6963) * add CI * fix test in ci * fix flags for nsync in wasm build * add copyright banner * fix wasm source glob * add missing exports * resolve comments * Perf gain by make packb wide to 4 from 16 on GEMM for WASM. Remove no need direct conv in previous perf tuning. * fix buildbreak introduced from latest master merge * fix buildbreak in mlasi.h * resolve all comments except MLAS * rewrite packb related 3 functions for WASM_SCALAR seperately rather than using #ifdef in each. and other changes according to PR feedback in mlas. * More complete scalar path in sgemm from Tracy. * Fix edge case handling in depthwise conv2d kernel 3x3. where: *) support input W==1 and H==1 *) recalc in accurate pad_right and pad_bottom *) support hidden pad_right == 2 or pad_bottom == 2 when W == 1 or H==1 and no pad left/top * Add more test coverage for conv depthwise from Tracy. Fix one typo according to PR. * resolve comments * replace typedef by using * do not use throw in OrtRun() * output error message Co-authored-by: Sunghoon <35605090+hanbitmyths@users.noreply.github.com> Co-authored-by: Lei Zhang <zhang.huanning@hotmail.com> Co-authored-by: Wei-Sheng Chin <wschin@outlook.com> Co-authored-by: Tianlei Wu <tlwu@microsoft.com> Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> Co-authored-by: Tracy Sharpe <42477615+tracysh@users.noreply.github.com> Co-authored-by: David Medine <david.eric.medine@gmail.com> Co-authored-by: David Medine <david.medine@brainproducts.com> Co-authored-by: Ori Levari <ori.levari@microsoft.com> Co-authored-by: Ori Levari <orlevari@microsoft.com> Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com> Co-authored-by: Chen Fu <chenfucs@gmail.com>
2021-04-06 23:18:10 +00:00
if args.build_wasm:
if not args.disable_wasm_exception_catching and args.disable_exceptions:
# When '--disable_exceptions' is set, we set '--disable_wasm_exception_catching' as well
args.disable_wasm_exception_catching = True
if args.test and args.disable_wasm_exception_catching and not args.minimal_build:
raise BuildError("WebAssembly tests need exception catching enabled to run if it's not minimal build")
if args.test and args.enable_wasm_debug_info:
# With flag --enable_wasm_debug_info, onnxruntime_test_all.wasm will be very huge (>1GB). This will fail
# Node.js when trying to load the .wasm file.
# To debug ONNX Runtime WebAssembly, use ONNX Runtime Web to debug ort-wasm.wasm in browsers.
raise BuildError("WebAssembly tests cannot be enabled with flag --enable_wasm_debug_info")
build ONNXRuntime into WebAssembly (#6478) * Simplified version of WebAssembly support to keep most of existing data structures and add cmake using Ninja and emcmake * Clean up CMakeLists.txt and add an example to create and compute a kernel * Load a model from bytes and remove graph building steps * Add all cpu and contrib ops with mlas library * WebAssembly build with Onnxruntime C/CXX API * Use protobuf cmakefile directory instead of adding every necessary source file * Fix invalid output at example * add missing files * Change an example to use Teams model and support ort mobile format * add API for javascript * fix input releasing in _ort_run() * update API * Let onnxruntime cmake build WebAssembly with option '--wasm' * allow one-step building for wasm * Make build script working on Linux and MacOS * Fix broken build from Windows command * Enable unit test on building WebAssembly * Resolve comments * update build flags * wasm conv improvement from: 1) GemmV; 2) Depthwise direct convolution 3x3; 3) Direct convolution 3x3 * Cleaned mlas unittest. * use glob * update comments * Update baseline due to loss scale fix (#6948) * fix stream sync issue (#6954) * Enable type reduction in EyeLike, Mod, random.cc CPU kernels. (#6960) * Update EyeLike CPU kernel. * Update Mod CPU kernel. * Update Multinomial CPU kernel. * Slight improvement to Pad CPU kernel binary size. * Update RandomNormal[Like], RandomUniform[Like] CPU kernels. * Fix warning from setting multiple MSVC warning level options. (#6917) Fix warning from setting multiple MSVC warning level options. Replace an existing /Wn flag instead of always appending a new one. * MLAS: quantized GEMM update (#6916) Various updates to the int8_t GEMMs: 1) Add ARM64 udot kernel to take advantage of dot product instructions available in newer cores. Some models run 4x faster than the stock implementation we used before. 
2) Refactor the x64 kernels to share common code for AVX2(u8u8/u8s8/avxvnni) vs AVX512(u8u8/u8s8/avx512vnni) to reduce binary size. 3) Extend kernels to support per-column zero points for matrix B. This is not currently wired to an operator. * Implement QLinearAveragePool with unit tests. (#6896) Implement QLinearAveragePool with unit tests. * Attention fusion detect num_heads and hidden_size automatically (#6920) * fixed type to experimental session constructor (#6950) * fixed type to experimental session constructor Co-authored-by: David Medine <david.medine@brainproducts.com> * Update onnxruntime_perf_test.exe to accept free dimension overrides (#6962) Co-authored-by: Ori Levari <orlevari@microsoft.com> * Fix possible fd leak in NNAPI (#6966) * Release buffers for prepacked tensors (#6820) Unsolved problems: 1. One test failure was caused by a bug in Cudnn rnn kernels, when they can allocate a buffer and partially initialize it, the garbage data near tail of the buffer caused problem in some of the hardware. To attack this problem in a broader sense, should we add code in our allocators, and during a memory fuzzing test, fill an allocated buffer with garbage before returning to the caller? 2. Prepacking is used more widely than we know. For instance, Cudnn rnn kernels also cache their weights. They mix several weight tensors together into a single buffer, and never touch the original weight tensor anymore. This is the same idea with pre-pack, but they didn't override the virtual function, and they never tried to release those weight tensors, leading to memory waste. It also seems to me that there are some other kernels have similar behavior. Wonder how much memory we can save if we try to cleanup those too. 3. Turning off memory pattern planning does increase memory fragmentation, leading to out of memory error in some training test cases. 
Perhaps we can revisit the idea of pushing kernels-creation stage earlier, and then during initializer deserialization, we only avoid tracing those that will be prepacked. * Enable type reduction for Range, ReverseSequence, ScatterND, Split, and Unique CPU kernels. (#6963) * add CI * fix test in ci * fix flags for nsync in wasm build * add copyright banner * fix wasm source glob * add missing exports * resolve comments * Perf gain by make packb wide to 4 from 16 on GEMM for WASM. Remove no need direct conv in previous perf tuning. * fix buildbreak introduced from latest master merge * fix buildbreak in mlasi.h * resolve all comments except MLAS * rewrite packb related 3 functions for WASM_SCALAR seperately rather than using #ifdef in each. and other changes according to PR feedback in mlas. * More complete scalar path in sgemm from Tracy. * Fix edge case handling in depthwise conv2d kernel 3x3. where: *) support input W==1 and H==1 *) recalc in accurate pad_right and pad_bottom *) support hidden pad_right == 2 or pad_bottom == 2 when W == 1 or H==1 and no pad left/top * Add more test coverage for conv depthwise from Tracy. Fix one typo according to PR. * resolve comments * replace typedef by using * do not use throw in OrtRun() * output error message Co-authored-by: Sunghoon <35605090+hanbitmyths@users.noreply.github.com> Co-authored-by: Lei Zhang <zhang.huanning@hotmail.com> Co-authored-by: Wei-Sheng Chin <wschin@outlook.com> Co-authored-by: Tianlei Wu <tlwu@microsoft.com> Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> Co-authored-by: Tracy Sharpe <42477615+tracysh@users.noreply.github.com> Co-authored-by: David Medine <david.eric.medine@gmail.com> Co-authored-by: David Medine <david.medine@brainproducts.com> Co-authored-by: Ori Levari <ori.levari@microsoft.com> Co-authored-by: Ori Levari <orlevari@microsoft.com> Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com> Co-authored-by: Chen Fu <chenfucs@gmail.com>
2021-04-06 23:18:10 +00:00
if args.wasm_malloc is not None:
# mark --wasm_malloc as deprecated
log.warning(
"Flag '--wasm_malloc=<Value>' is deprecated. Please use '--emscripten_settings MALLOC=<Value>'."
)
Android code coverage (#6061) * Added Onnxruntime_GCOV_COVERAGE flag for Android. * Set CMAKE_SYSTEM_NAME explicityly for Android. * Added GCOV_PREFIX option to collect code coverage data. Added a new python script to generate code coverage info. Modified build pipeline to geneate Android code coverage info * Added build command line option --android_coverage * Added a comment describing the GCOV environment variables * Fixed PEP8 issues. * Added --android_coverage option to the build command. * Increased Android emulator memory from 3K to 8K. * Increased Android partition-size from 2GB to 4GB to overcome no-space-left-on-device error * Removed source_dir from command line args. * Use cwd absolute path to run tests. * Added commands to output the contents of /data/local/tmp on the emulator. * Added run_adb_shell function. * Format changes. * Removed keywd argument cwd. * Removed Android in the --build_dir path. * Removed commands added for debugging. * Removed exxtra new-lines. * Fix MacOs build pipeline failures by uninstalling openssl before running build script. * Revert "Fix MacOs build pipeline failures by uninstalling openssl before running build script." This reverts commit 90d0568fe533e9456c20d061a2d435c8fea48266. * Change dir to the build directory where the tar file is copied. * Changed the option from --android_coverage to --code_coverage * Moved steps to generate Android code coverage to run_nnap_code_coverage.sh * Require --android option if --code_coverage is specified. * No code coverage needed for onnx_test_runner. * Expect that the emulator is running when the script is executed. * Fixed the title in the buildpipeline step. * Fixed the formatting issue. * Added a command line argument, ORT_ROOT, to run_nnapi_code_coverage.sh script Co-authored-by: Satya Jandhyala <satyajandhyala@Satyas-Mac-mini.local>
2020-12-08 18:55:02 +00:00
if args.code_coverage and not args.android:
raise BuildError("Using --code_coverage requires --android")
if args.gen_api_doc and len(args.config) != 1:
raise BuildError("Using --get-api-doc requires a single build config")
# Disabling unit tests for GPU on nuget creation
if args.use_openvino and args.use_openvino != "CPU" and args.build_nuget:
[OpenVINO-EP] UEP v3.1 Release with OpenVINO 2021.4 (#8892) * Add command to skip tests * Remove support for OV_2021.3_LTS and ov_2021.1 Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Removed request_id parameter from all references request_id parameter was being used with ov_2020.3 release. Starting from 2020.4 OV release, input_name paramater is being used instead to get the KernelContext_GetInput. Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Enabling CI Logs in the branch * CI Commits to enable logs * Enable CI Print * Added Imagescaler op to the supported op's list Fixes test_tiny_yolo_V2 opset 8 model to support fully on OV-EP. This model is the older variation of tiny_yolo_v2 model which has Imagescaler op. Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Added ops to fully support yolov3 model -Added changes to support yolov3 opset 10 model fully on CPU_FP32. -This also increases the operator coverage for GPU hardware. There by enabling yolov3 model on GPU with fewer subgraphs. Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Enabling tiny_yolov3 model fully on CPU ->Enabled tiny_yolov3 model fully on CPU. -> Also reduces the number of subgraphs to infer this model on GPU Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Adding GatherND op support for CPU and GPU ->This enables yolov3_pytorch model to work with fewer subgraphs on CPU and GPU Devices. Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Fixes Albert model for ISV customer ConvTranspose op was getting rejected due to a condition. Fixed it. Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Disabling this 4 cpp tests for openvino-ep These unit tests are failing with special conditions for conv_transpose op with output_shape attribute. so disabling them for now. 
Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Docker file changes for 2021.4-v3.1 * Remvoing duplicate code Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * ReduceMax No dimension supported * Fixes failing protobuf issue for docker Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Excluding openvinoep type for convtranpose test Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> * Disabled 2 Failing convtranspose tests with TensorRT EP Signed-off-by: MaajidKhan <n.maajidkhan@gmail.com> Co-authored-by: suryasidd <surya.siddharth.pemmaraju@intel.com> Co-authored-by: Aravind Gunda <aravindx.gunda@intel.com> Co-authored-by: sfatimar <sahar.fatima@intel/com>
2021-08-31 16:23:13 +00:00
args.test = False
2022-04-07 22:06:31 +00:00
# GDK builds don't support testing
if args.use_gdk:
args.test = False
# enable_training is a higher level flag that enables all training functionality.
if args.enable_training:
args.enable_training_apis = True
args.enable_training_ops = True
2018-11-20 00:48:22 +00:00
configs = set(args.config)
# setup paths and directories
Update manylinux build scripts and GPU CUDA version from 11.0 to 11.1 (#7632) 1. Update manylinux build scripts. This will add [PEP600](https://www.python.org/dev/peps/pep-0600/)(manylinux2 tags) support. numpy has adopted this new feature, we should do the same. The old build script files were copied from https://github.com/pypa/manylinux, but they has been deleted and replaced in the upstream repo. The manylinux repo doesn't have a manylinux2014 branch anymore. So I'm removing the obsolete code, sync the files with the latest master. 2. Update GPU CUDA version from 11.0 to 11.1(after a discussion with PMs). 3. Delete tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda10_2. (Merged the content to tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11) 4. Modernize the cmake code of how to locate python devel files. It was suggested in https://github.com/onnx/onnx/pull/1631 . 5. Remove `onnxruntime_MSVC_STATIC_RUNTIME` and `onnxruntime_GCC_STATIC_CPP_RUNTIME` build options. Now cmake has builtin support for it. Starting from cmake 3.15, we can use `CMAKE_MSVC_RUNTIME_LIBRARY` cmake variable to choose which MSVC runtime library we want to use. 6. Update Ubuntu docker images that used in our CI build from Ubuntu 18.04 to Ubuntu 20.04. 7. Update GCC version in CUDA 11.1 pipelines from 8.x to 9.3.1 8. Split Linux GPU CI pipeline to two jobs: build the code on a CPU machine then run the tests on another GPU machines. In the past we didn't test our python packages. We only tested the pre-packed files. So we didn't catch the rpath issue in CI build. 9. Add a CentOS machine pool and test our Linux GPU build on real CentOS machines. 10. Rework ARM64 Linux GPU python packaging pipeline. Previously it uses cross-compiling therefore we must static link to C Runtime. But now have pluggable EP API and it doesn't support static link. So I changed to use qemu emulation instead. Now the build is 10x slower than before. But it is more extensible.
2021-06-03 06:36:49 +00:00
# cmake_path and ctest_path can be None. For example, someone who only wants to run the tests does not need
# to have cmake/ctest available.
cmake_path = resolve_executable_path(args.cmake_path)
ctest_path = resolve_executable_path(args.ctest_path)
2018-11-20 00:48:22 +00:00
build_dir = args.build_dir
script_dir = os.path.realpath(os.path.dirname(__file__))
source_dir = os.path.normpath(os.path.join(script_dir, "..", ".."))
# if using cuda, setup cuda paths and env vars
cuda_home, cudnn_home = setup_cuda_vars(args)
mpi_home = args.mpi_home
nccl_home = args.nccl_home
snpe_root = args.snpe_root
acl_home = args.acl_home
acl_libs = args.acl_libs
armnn_home = args.armnn_home
armnn_libs = args.armnn_libs
qnn_home = args.qnn_home
Trt execution provider (#382) * updated cmake files for trt * added trt execution provider * added trt basic test * removed trt_path action attribute * Add files via upload * Update build.py * Update trt_allocator.h * fixed issues found by reviewers * changed cast operator * added comment for custom kernel implementation * changed auto to auto& * changed to function compile APIs for TRT execution provider * changed to function compile APIs for TRT execution provider * added new DType DInt64 * adapted to the changes of onnxruntime_c_api * removed trt kernel (use function compile instead) * updated onnx-tensorrt submodule * set default memory type to TRT fused kernel * resolve merge conflict * fixed the issue that USE_CUDA conflicts with USE_TRT * construct graph by adding nodes in topological order * made changes for Windows * change buffers type * bypass HasImplementationOf check for TRT XP because TRT kernel is not registered * added domain to version info in rebuilt model proto * added trt to test option list * added DomainToVersionMap() to GraphViewer * removed Copy() * fixed broken code * format the code to clang format * used local reference to the frequently used values * fixed a couple of issues according to reviewers feedback * fixed a couple of issues according to reviewers feedback * added python binding for TRT and enable use_cuda when use_trt is on * fixed a redefinition issue * changed shared_ptr to unique_ptr on trt engines, and made a few changes required by reviewers * enabled trtexecution provider for unit tests * renamed trt to tensorrt * added tesorrt to python binding * update submodule onnx and onnx-tensorrt * made a couple of minor changes based on reviewer's feedback * added CUDA_CHECK * removed test code * fixed broken code after merge * updated onnx-tensorrt submodule * added post processing to align trt inputs/outputs with graph inputs/outputs * updated onnx submodule * added CUDA fallback for TensorRT and fixed TensorRT cmake issue * 
added ci pipeline for tensorrt and removed some redundent code from trt xp * fixed syntax issue * updated onnx-tensorrt submodule * fix trt build problem by: (#602) 1. Add additional /wd for debug build 2. Add io.h for additional targets 3. Bring back mb version of getopt * Update install_ubuntu.sh * Update linux-gpu-tensorrt-ci-pipeline.yml * Update linux-gpu-tensorrt-ci-pipeline.yml * Update run_build.sh * Update run_build.sh * Update run_build.sh * Update run_build.sh * fixed the issue that GetKernelRegistry returns nullptr * merged master to this branch * moved some data types to private * fixed tensorrt CI pipeline issue * customized test data for TensorRT pipeline * added onnx-tensorrt in json file and fixed an issue in ci script * added comments
2019-03-14 19:00:39 +00:00
# if using tensorrt, setup tensorrt paths
tensorrt_home = setup_tensorrt_vars(args)
# if using migraphx, setup migraphx paths
migraphx_home = setup_migraphx_vars(args)
# if using rocm, setup rocm paths
rocm_home = setup_rocm_build(args)
# if using cann, setup cann paths
cann_home = setup_cann_vars(args)
if args.update or args.build:
for config in configs:
os.makedirs(get_config_build_dir(build_dir, config), exist_ok=True)
2018-11-20 00:48:22 +00:00
log.info("Build started")
2020-04-19 03:48:30 +00:00
if args.update:
if is_reduced_ops_build(args):
from reduce_op_kernels import reduce_ops
is_extended_minimal_build_or_higher = args.minimal_build is None or "extended" in args.minimal_build
for config in configs:
reduce_ops(
config_path=args.include_ops_by_config,
build_dir=get_config_build_dir(build_dir, config),
enable_type_reduction=args.enable_reduced_operator_type_support,
use_cuda=args.use_cuda,
is_extended_minimal_build_or_higher=is_extended_minimal_build_or_higher,
)
2019-01-15 18:29:00 +00:00
cmake_extra_args = []
path_to_protoc_exe = None
if args.path_to_protoc_exe:
path_to_protoc_exe = Path(args.path_to_protoc_exe)
if not path_to_protoc_exe.exists():
raise BuildError("The value to --path_to_protoc_exe is invalid.")
if not args.skip_submodule_sync:
update_submodules(source_dir)
Refactor web-ci pipeline and delete eager mode CI pipeline (#15416) ### Description 1. Move it to a separated pool that use the same image as [the public hosted pool](https://learn.microsoft.com/en-us/azure/devops/pipelines/agents/hosted?view=azure-devops&tabs=yaml). Also, create a beta pool which contains the next version image of the hosted pool, and add jobs in our post merge pipeline to test if the next version image will break our CI. So, usually we will have at least one week to prepare. 2. Change the cmake generator in use in our pipelines from "Ninja" to "MingW Makefile", because the latest version of cmake doesn't work with the latest version of Ninja. People who prefer Ninja could still use ninja in their local build by passing "--cmake_generator ninja" to [build.py](https://github.com/microsoft/onnxruntime/blob/main/tools/ci_build/build.py). 3. Delete eager mode CI pipeline. ### Motivation and Context I need to update the software we have in our CI build machines, and I need to resolve this incompatibility issue. In more detail, the build error I hit was: em++: error: CMakeFilesonnxruntime_mlas_test.dirC_a_work1sonnxruntimetestmlasunittesttest_activation.cpp.o: No such file or directory ("CMakeFilesonnxruntime_mlas_test.dirC_a_work1sonnxruntimetestmlasunittesttest_activation.cpp.o" was expected to be an input file, based on the commandline arguments provided) After this PR we will deprecate python 3.7 support. The eager mode CI pipeline is the last one that still use python 3.7. Then we can rework the PR #10953 made by [fs-eire](https://github.com/fs-eire) last year. Fixed [AB#14435](https://aiinfra.visualstudio.com/6a833879-cd9b-44a4-a9de-adc2d818f13c/_workitems/edit/14435)
2023-04-10 17:41:04 +00:00
if is_windows() and not args.build_wasm:
cpu_arch = platform.architecture()[0]
Refactor web-ci pipeline and delete eager mode CI pipeline (#15416) ### Description 1. Move it to a separated pool that use the same image as [the public hosted pool](https://learn.microsoft.com/en-us/azure/devops/pipelines/agents/hosted?view=azure-devops&tabs=yaml). Also, create a beta pool which contains the next version image of the hosted pool, and add jobs in our post merge pipeline to test if the next version image will break our CI. So, usually we will have at least one week to prepare. 2. Change the cmake generator in use in our pipelines from "Ninja" to "MingW Makefile", because the latest version of cmake doesn't work with the latest version of Ninja. People who prefer Ninja could still use ninja in their local build by passing "--cmake_generator ninja" to [build.py](https://github.com/microsoft/onnxruntime/blob/main/tools/ci_build/build.py). 3. Delete eager mode CI pipeline. ### Motivation and Context I need to update the software we have in our CI build machines, and I need to resolve this incompatibility issue. In more detail, the build error I hit was: em++: error: CMakeFilesonnxruntime_mlas_test.dirC_a_work1sonnxruntimetestmlasunittesttest_activation.cpp.o: No such file or directory ("CMakeFilesonnxruntime_mlas_test.dirC_a_work1sonnxruntimetestmlasunittesttest_activation.cpp.o" was expected to be an input file, based on the commandline arguments provided) After this PR we will deprecate python 3.7 support. The eager mode CI pipeline is the last one that still use python 3.7. Then we can rework the PR #10953 made by [fs-eire](https://github.com/fs-eire) last year. Fixed [AB#14435](https://aiinfra.visualstudio.com/6a833879-cd9b-44a4-a9de-adc2d818f13c/_workitems/edit/14435)
2023-04-10 17:41:04 +00:00
if args.cmake_generator == "Ninja":
if cpu_arch == "32bit" or args.arm or args.arm64 or args.arm64ec:
2020-04-19 03:48:30 +00:00
raise BuildError(
"To cross-compile with Ninja, load the toolset "
"environment for the target processor (e.g. Cross "
"Tools Command Prompt for VS)"
)
cmake_extra_args = ["-G", args.cmake_generator]
elif args.arm or args.arm64 or args.arm64ec:
if args.arm:
cmake_extra_args = ["-A", "ARM"]
elif args.arm64:
cmake_extra_args = ["-A", "ARM64"]
if args.buildasx:
cmake_extra_args += ["-D", "BUILD_AS_ARM64X=ARM64"]
elif args.arm64ec:
cmake_extra_args = ["-A", "ARM64EC"]
if args.buildasx:
cmake_extra_args += ["-D", "BUILD_AS_ARM64X=ARM64EC"]
cmake_extra_args += ["-G", args.cmake_generator]
2020-04-19 03:48:30 +00:00
# Cross-compiled builds cannot be tested on the host build machine,
# so override any user-requested test run.
if args.test:
log.warning(
2020-04-19 03:48:30 +00:00
"Cannot test on host build machine for cross-compiled "
"ARM(64) builds. Will skip test running after build."
)
args.test = False
else:
target_arch = platform.machine()
if target_arch == "AMD64":
if cpu_arch == "32bit" or args.x86:
target_arch = "Win32"
else:
target_arch = "x64"
host_arch = "x64"
elif target_arch == "ARM64":
host_arch = "ARM64"
else:
raise BuildError("unknown python arch")
if args.msvc_toolset:
toolset = "host=" + host_arch + ",version=" + args.msvc_toolset
else:
toolset = "host=" + host_arch
2020-04-19 03:48:30 +00:00
if args.cuda_version:
toolset += ",cuda=" + args.cuda_version
elif args.cuda_home:
toolset += ",cuda=" + args.cuda_home
if args.windows_sdk_version:
target_arch += ",version=" + args.windows_sdk_version
cmake_extra_args = ["-A", target_arch, "-T", toolset, "-G", args.cmake_generator]
if args.enable_wcos:
cmake_extra_defines.append("CMAKE_USER_MAKE_RULES_OVERRIDE=wcos_rules_override.cmake")
elif args.cmake_generator is not None:
cmake_extra_args += ["-G", args.cmake_generator]
if is_macOS():
if (
not (args.ios or args.visionos)
and args.macos != "Catalyst"
and not args.android
and args.osx_arch == "arm64"
and platform.machine() == "x86_64"
):
if args.test:
log.warning("Cannot test ARM64 build on X86_64. Will skip test running after build.")
args.test = False
build ONNXRuntime into WebAssembly (#6478) * Simplified version of WebAssembly support to keep most of existing data structures and add cmake using Ninja and emcmake * Clean up CMakeLists.txt and add an example to create and compute a kernel * Load a model from bytes and remove graph building steps * Add all cpu and contrib ops with mlas library * WebAssembly build with Onnxruntime C/CXX API * Use protobuf cmakefile directory instead of adding every necessary source file * Fix invalid output at example * add missing files * Change an example to use Teams model and support ort mobile format * add API for javascript * fix input releasing in _ort_run() * update API * Let onnxruntime cmake build WebAssembly with option '--wasm' * allow one-step building for wasm * Make build script working on Linux and MacOS * Fix broken build from Windows command * Enable unit test on building WebAssembly * Resolve comments * update build flags * wasm conv improvement from: 1) GemmV; 2) Depthwise direct convolution 3x3; 3) Direct convolution 3x3 * Cleaned mlas unittest. * use glob * update comments * Update baseline due to loss scale fix (#6948) * fix stream sync issue (#6954) * Enable type reduction in EyeLike, Mod, random.cc CPU kernels. (#6960) * Update EyeLike CPU kernel. * Update Mod CPU kernel. * Update Multinomial CPU kernel. * Slight improvement to Pad CPU kernel binary size. * Update RandomNormal[Like], RandomUniform[Like] CPU kernels. * Fix warning from setting multiple MSVC warning level options. (#6917) Fix warning from setting multiple MSVC warning level options. Replace an existing /Wn flag instead of always appending a new one. * MLAS: quantized GEMM update (#6916) Various updates to the int8_t GEMMs: 1) Add ARM64 udot kernel to take advantage of dot product instructions available in newer cores. Some models run 4x faster than the stock implementation we used before. 
2) Refactor the x64 kernels to share common code for AVX2(u8u8/u8s8/avxvnni) vs AVX512(u8u8/u8s8/avx512vnni) to reduce binary size. 3) Extend kernels to support per-column zero points for matrix B. This is not currently wired to an operator. * Implement QLinearAveragePool with unit tests. (#6896) Implement QLinearAveragePool with unit tests. * Attention fusion detect num_heads and hidden_size automatically (#6920) * fixed type to experimental session constructor (#6950) * fixed type to experimental session constructor Co-authored-by: David Medine <david.medine@brainproducts.com> * Update onnxruntime_perf_test.exe to accept free dimension overrides (#6962) Co-authored-by: Ori Levari <orlevari@microsoft.com> * Fix possible fd leak in NNAPI (#6966) * Release buffers for prepacked tensors (#6820) Unsolved problems: 1. One test failure was caused by a bug in Cudnn rnn kernels, when they can allocate a buffer and partially initialize it, the garbage data near tail of the buffer caused problem in some of the hardware. To attack this problem in a broader sense, should we add code in our allocators, and during a memory fuzzing test, fill an allocated buffer with garbage before returning to the caller? 2. Prepacking is used more widely than we know. For instance, Cudnn rnn kernels also cache their weights. They mix several weight tensors together into a single buffer, and never touch the original weight tensor anymore. This is the same idea with pre-pack, but they didn't override the virtual function, and they never tried to release those weight tensors, leading to memory waste. It also seems to me that there are some other kernels have similar behavior. Wonder how much memory we can save if we try to cleanup those too. 3. Turning off memory pattern planning does increase memory fragmentation, leading to out of memory error in some training test cases. 
Perhaps we can revisit the idea of pushing kernels-creation stage earlier, and then during initializer deserialization, we only avoid tracing those that will be prepacked. * Enable type reduction for Range, ReverseSequence, ScatterND, Split, and Unique CPU kernels. (#6963) * add CI * fix test in ci * fix flags for nsync in wasm build * add copyright banner * fix wasm source glob * add missing exports * resolve comments * Perf gain by make packb wide to 4 from 16 on GEMM for WASM. Remove no need direct conv in previous perf tuning. * fix buildbreak introduced from latest master merge * fix buildbreak in mlasi.h * resolve all comments except MLAS * rewrite packb related 3 functions for WASM_SCALAR seperately rather than using #ifdef in each. and other changes according to PR feedback in mlas. * More complete scalar path in sgemm from Tracy. * Fix edge case handling in depthwise conv2d kernel 3x3. where: *) support input W==1 and H==1 *) recalc in accurate pad_right and pad_bottom *) support hidden pad_right == 2 or pad_bottom == 2 when W == 1 or H==1 and no pad left/top * Add more test coverage for conv depthwise from Tracy. Fix one typo according to PR. * resolve comments * replace typedef by using * do not use throw in OrtRun() * output error message Co-authored-by: Sunghoon <35605090+hanbitmyths@users.noreply.github.com> Co-authored-by: Lei Zhang <zhang.huanning@hotmail.com> Co-authored-by: Wei-Sheng Chin <wschin@outlook.com> Co-authored-by: Tianlei Wu <tlwu@microsoft.com> Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> Co-authored-by: Tracy Sharpe <42477615+tracysh@users.noreply.github.com> Co-authored-by: David Medine <david.eric.medine@gmail.com> Co-authored-by: David Medine <david.medine@brainproducts.com> Co-authored-by: Ori Levari <ori.levari@microsoft.com> Co-authored-by: Ori Levari <orlevari@microsoft.com> Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com> Co-authored-by: Chen Fu <chenfucs@gmail.com>
2021-04-06 23:18:10 +00:00
if args.build_wasm:
if is_windows() and platform.architecture()[0] == "32bit":
raise BuildError("Please use a 64-bit python to run this script")
if args.build_wheel or args.enable_pybind:
raise BuildError("WASM does not support pybind")
emsdk_version = args.emsdk_version
build ONNXRuntime into WebAssembly (#6478) * Simplified version of WebAssembly support to keep most of existing data structures and add cmake using Ninja and emcmake * Clean up CMakeLists.txt and add an example to create and compute a kernel * Load a model from bytes and remove graph building steps * Add all cpu and contrib ops with mlas library * WebAssembly build with Onnxruntime C/CXX API * Use protobuf cmakefile directory instead of adding every necessary source file * Fix invalid output at example * add missing files * Change an example to use Teams model and support ort mobile format * add API for javascript * fix input releasing in _ort_run() * update API * Let onnxruntime cmake build WebAssembly with option '--wasm' * allow one-step building for wasm * Make build script working on Linux and MacOS * Fix broken build from Windows command * Enable unit test on building WebAssembly * Resolve comments * update build flags * wasm conv improvement from: 1) GemmV; 2) Depthwise direct convolution 3x3; 3) Direct convolution 3x3 * Cleaned mlas unittest. * use glob * update comments * Update baseline due to loss scale fix (#6948) * fix stream sync issue (#6954) * Enable type reduction in EyeLike, Mod, random.cc CPU kernels. (#6960) * Update EyeLike CPU kernel. * Update Mod CPU kernel. * Update Multinomial CPU kernel. * Slight improvement to Pad CPU kernel binary size. * Update RandomNormal[Like], RandomUniform[Like] CPU kernels. * Fix warning from setting multiple MSVC warning level options. (#6917) Fix warning from setting multiple MSVC warning level options. Replace an existing /Wn flag instead of always appending a new one. * MLAS: quantized GEMM update (#6916) Various updates to the int8_t GEMMs: 1) Add ARM64 udot kernel to take advantage of dot product instructions available in newer cores. Some models run 4x faster than the stock implementation we used before. 
2) Refactor the x64 kernels to share common code for AVX2(u8u8/u8s8/avxvnni) vs AVX512(u8u8/u8s8/avx512vnni) to reduce binary size. 3) Extend kernels to support per-column zero points for matrix B. This is not currently wired to an operator. * Implement QLinearAveragePool with unit tests. (#6896) Implement QLinearAveragePool with unit tests. * Attention fusion detect num_heads and hidden_size automatically (#6920) * fixed type to experimental session constructor (#6950) * fixed type to experimental session constructor Co-authored-by: David Medine <david.medine@brainproducts.com> * Update onnxruntime_perf_test.exe to accept free dimension overrides (#6962) Co-authored-by: Ori Levari <orlevari@microsoft.com> * Fix possible fd leak in NNAPI (#6966) * Release buffers for prepacked tensors (#6820) Unsolved problems: 1. One test failure was caused by a bug in Cudnn rnn kernels, when they can allocate a buffer and partially initialize it, the garbage data near tail of the buffer caused problem in some of the hardware. To attack this problem in a broader sense, should we add code in our allocators, and during a memory fuzzing test, fill an allocated buffer with garbage before returning to the caller? 2. Prepacking is used more widely than we know. For instance, Cudnn rnn kernels also cache their weights. They mix several weight tensors together into a single buffer, and never touch the original weight tensor anymore. This is the same idea with pre-pack, but they didn't override the virtual function, and they never tried to release those weight tensors, leading to memory waste. It also seems to me that there are some other kernels have similar behavior. Wonder how much memory we can save if we try to cleanup those too. 3. Turning off memory pattern planning does increase memory fragmentation, leading to out of memory error in some training test cases. 
Perhaps we can revisit the idea of pushing kernels-creation stage earlier, and then during initializer deserialization, we only avoid tracing those that will be prepacked. * Enable type reduction for Range, ReverseSequence, ScatterND, Split, and Unique CPU kernels. (#6963) * add CI * fix test in ci * fix flags for nsync in wasm build * add copyright banner * fix wasm source glob * add missing exports * resolve comments * Perf gain by make packb wide to 4 from 16 on GEMM for WASM. Remove no need direct conv in previous perf tuning. * fix buildbreak introduced from latest master merge * fix buildbreak in mlasi.h * resolve all comments except MLAS * rewrite packb related 3 functions for WASM_SCALAR seperately rather than using #ifdef in each. and other changes according to PR feedback in mlas. * More complete scalar path in sgemm from Tracy. * Fix edge case handling in depthwise conv2d kernel 3x3. where: *) support input W==1 and H==1 *) recalc in accurate pad_right and pad_bottom *) support hidden pad_right == 2 or pad_bottom == 2 when W == 1 or H==1 and no pad left/top * Add more test coverage for conv depthwise from Tracy. Fix one typo according to PR. * resolve comments * replace typedef by using * do not use throw in OrtRun() * output error message Co-authored-by: Sunghoon <35605090+hanbitmyths@users.noreply.github.com> Co-authored-by: Lei Zhang <zhang.huanning@hotmail.com> Co-authored-by: Wei-Sheng Chin <wschin@outlook.com> Co-authored-by: Tianlei Wu <tlwu@microsoft.com> Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> Co-authored-by: Tracy Sharpe <42477615+tracysh@users.noreply.github.com> Co-authored-by: David Medine <david.eric.medine@gmail.com> Co-authored-by: David Medine <david.medine@brainproducts.com> Co-authored-by: Ori Levari <ori.levari@microsoft.com> Co-authored-by: Ori Levari <orlevari@microsoft.com> Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com> Co-authored-by: Chen Fu <chenfucs@gmail.com>
2021-04-06 23:18:10 +00:00
emsdk_dir = os.path.join(source_dir, "cmake", "external", "emsdk")
emsdk_file = os.path.join(emsdk_dir, "emsdk.bat") if is_windows() else os.path.join(emsdk_dir, "emsdk")
log.info("Installing emsdk...")
run_subprocess([emsdk_file, "install", emsdk_version], cwd=emsdk_dir)
log.info("Activating emsdk...")
build ONNXRuntime into WebAssembly (#6478) * Simplified version of WebAssembly support to keep most of existing data structures and add cmake using Ninja and emcmake * Clean up CMakeLists.txt and add an example to create and compute a kernel * Load a model from bytes and remove graph building steps * Add all cpu and contrib ops with mlas library * WebAssembly build with Onnxruntime C/CXX API * Use protobuf cmakefile directory instead of adding every necessary source file * Fix invalid output at example * add missing files * Change an example to use Teams model and support ort mobile format * add API for javascript * fix input releasing in _ort_run() * update API * Let onnxruntime cmake build WebAssembly with option '--wasm' * allow one-step building for wasm * Make build script working on Linux and MacOS * Fix broken build from Windows command * Enable unit test on building WebAssembly * Resolve comments * update build flags * wasm conv improvement from: 1) GemmV; 2) Depthwise direct convolution 3x3; 3) Direct convolution 3x3 * Cleaned mlas unittest. * use glob * update comments * Update baseline due to loss scale fix (#6948) * fix stream sync issue (#6954) * Enable type reduction in EyeLike, Mod, random.cc CPU kernels. (#6960) * Update EyeLike CPU kernel. * Update Mod CPU kernel. * Update Multinomial CPU kernel. * Slight improvement to Pad CPU kernel binary size. * Update RandomNormal[Like], RandomUniform[Like] CPU kernels. * Fix warning from setting multiple MSVC warning level options. (#6917) Fix warning from setting multiple MSVC warning level options. Replace an existing /Wn flag instead of always appending a new one. * MLAS: quantized GEMM update (#6916) Various updates to the int8_t GEMMs: 1) Add ARM64 udot kernel to take advantage of dot product instructions available in newer cores. Some models run 4x faster than the stock implementation we used before. 
2) Refactor the x64 kernels to share common code for AVX2(u8u8/u8s8/avxvnni) vs AVX512(u8u8/u8s8/avx512vnni) to reduce binary size. 3) Extend kernels to support per-column zero points for matrix B. This is not currently wired to an operator. * Implement QLinearAveragePool with unit tests. (#6896) Implement QLinearAveragePool with unit tests. * Attention fusion detect num_heads and hidden_size automatically (#6920) * fixed type to experimental session constructor (#6950) * fixed type to experimental session constructor Co-authored-by: David Medine <david.medine@brainproducts.com> * Update onnxruntime_perf_test.exe to accept free dimension overrides (#6962) Co-authored-by: Ori Levari <orlevari@microsoft.com> * Fix possible fd leak in NNAPI (#6966) * Release buffers for prepacked tensors (#6820) Unsolved problems: 1. One test failure was caused by a bug in Cudnn rnn kernels, when they can allocate a buffer and partially initialize it, the garbage data near tail of the buffer caused problem in some of the hardware. To attack this problem in a broader sense, should we add code in our allocators, and during a memory fuzzing test, fill an allocated buffer with garbage before returning to the caller? 2. Prepacking is used more widely than we know. For instance, Cudnn rnn kernels also cache their weights. They mix several weight tensors together into a single buffer, and never touch the original weight tensor anymore. This is the same idea with pre-pack, but they didn't override the virtual function, and they never tried to release those weight tensors, leading to memory waste. It also seems to me that there are some other kernels have similar behavior. Wonder how much memory we can save if we try to cleanup those too. 3. Turning off memory pattern planning does increase memory fragmentation, leading to out of memory error in some training test cases. 
Perhaps we can revisit the idea of pushing kernels-creation stage earlier, and then during initializer deserialization, we only avoid tracing those that will be prepacked. * Enable type reduction for Range, ReverseSequence, ScatterND, Split, and Unique CPU kernels. (#6963) * add CI * fix test in ci * fix flags for nsync in wasm build * add copyright banner * fix wasm source glob * add missing exports * resolve comments * Perf gain by make packb wide to 4 from 16 on GEMM for WASM. Remove no need direct conv in previous perf tuning. * fix buildbreak introduced from latest master merge * fix buildbreak in mlasi.h * resolve all comments except MLAS * rewrite packb related 3 functions for WASM_SCALAR seperately rather than using #ifdef in each. and other changes according to PR feedback in mlas. * More complete scalar path in sgemm from Tracy. * Fix edge case handling in depthwise conv2d kernel 3x3. where: *) support input W==1 and H==1 *) recalc in accurate pad_right and pad_bottom *) support hidden pad_right == 2 or pad_bottom == 2 when W == 1 or H==1 and no pad left/top * Add more test coverage for conv depthwise from Tracy. Fix one typo according to PR. * resolve comments * replace typedef by using * do not use throw in OrtRun() * output error message Co-authored-by: Sunghoon <35605090+hanbitmyths@users.noreply.github.com> Co-authored-by: Lei Zhang <zhang.huanning@hotmail.com> Co-authored-by: Wei-Sheng Chin <wschin@outlook.com> Co-authored-by: Tianlei Wu <tlwu@microsoft.com> Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> Co-authored-by: Tracy Sharpe <42477615+tracysh@users.noreply.github.com> Co-authored-by: David Medine <david.eric.medine@gmail.com> Co-authored-by: David Medine <david.medine@brainproducts.com> Co-authored-by: Ori Levari <ori.levari@microsoft.com> Co-authored-by: Ori Levari <orlevari@microsoft.com> Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com> Co-authored-by: Chen Fu <chenfucs@gmail.com>
2021-04-06 23:18:10 +00:00
run_subprocess([emsdk_file, "activate", emsdk_version], cwd=emsdk_dir)
2020-04-19 03:48:30 +00:00
if args.enable_pybind and is_windows():
run_subprocess(
[sys.executable, "-m", "pip", "install", "-r", "requirements/pybind/requirements.txt"],
cwd=SCRIPT_DIR,
)
if args.use_rocm and args.rocm_version is None:
args.rocm_version = ""
if args.enable_external_custom_op_schemas and not is_linux():
raise BuildError("Registering external custom op schemas is only supported on Linux.")
2020-04-19 03:48:30 +00:00
generate_build_tree(
cmake_path,
source_dir,
build_dir,
cuda_home,
cudnn_home,
rocm_home,
mpi_home,
nccl_home,
tensorrt_home,
migraphx_home,
acl_home,
acl_libs,
armnn_home,
armnn_libs,
qnn_home,
snpe_root,
cann_home,
path_to_protoc_exe,
configs,
cmake_extra_defines,
args,
cmake_extra_args,
)
2020-04-19 03:48:30 +00:00
if args.clean:
2018-11-20 00:48:22 +00:00
clean_targets(cmake_path, build_dir, configs)
# if using DML, perform initial nuget package restore
setup_dml_build(args, cmake_path, build_dir, configs)
2020-04-19 03:48:30 +00:00
if args.build:
if args.parallel < 0:
Adopt linrtunner as the linting tool - take 2 (#15085) ### Description `lintrunner` is a linter runner successfully used by pytorch, onnx and onnx-script. It provides a uniform experience running linters locally and in CI. It supports all major dev systems: Windows, Linux and MacOs. The checks are enforced by the `Python format` workflow. This PR adopts `lintrunner` to onnxruntime and fixed ~2000 flake8 errors in Python code. `lintrunner` now runs all required python lints including `ruff`(replacing `flake8`), `black` and `isort`. Future lints like `clang-format` can be added. Most errors are auto-fixed by `ruff` and the fixes should be considered robust. Lints that are more complicated to fix are applied `# noqa` for now and should be fixed in follow up PRs. ### Notable changes 1. This PR **removed some suboptimal patterns**: - `not xxx in` -> `xxx not in` membership checks - bare excepts (`except:` -> `except Exception`) - unused imports The follow up PR will remove: - `import *` - mutable values as default in function definitions (`def func(a=[])`) - more unused imports - unused local variables 2. Use `ruff` to replace `flake8`. `ruff` is much (40x) faster than flake8 and is more robust. We are using it successfully in onnx and onnx-script. It also supports auto-fixing many flake8 errors. 3. Removed the legacy flake8 ci flow and updated docs. 4. The added workflow supports SARIF code scanning reports on github, example snapshot: ![image](https://user-images.githubusercontent.com/11205048/212598953-d60ce8a9-f242-4fa8-8674-8696b704604a.png) 5. Removed `onnxruntime-python-checks-ci-pipeline` as redundant ### Motivation and Context <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. --> Unified linting experience in CI and local. Replacing https://github.com/microsoft/onnxruntime/pull/14306 --------- Signed-off-by: Justin Chu <justinchu@microsoft.com>
2023-03-24 22:29:03 +00:00
raise BuildError(f"Invalid parallel job count: {args.parallel}")
num_parallel_jobs = number_of_parallel_jobs(args)
build_targets(args, cmake_path, build_dir, configs, num_parallel_jobs, args.target)
2018-11-20 00:48:22 +00:00
2020-04-19 03:48:30 +00:00
if args.test:
if args.enable_onnx_tests:
source_onnx_model_dir = "C:\\local\\models" if is_windows() else "/data/models"
setup_test_data(source_onnx_model_dir, "models", build_dir, configs)
run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs)
# TODO(agladyshev):
# to support Windows, we need to update .github/workflows/windows.yml
# and add to the PATH variable the following value: C:\Program Files\LLVM\bin
if args.enable_pybind and args.use_tvm and not is_windows():
[TVM EP] Rename Standalone TVM (STVM) Execution Provider to TVM EP (#10260) * update java API for STVM EP. Issue is from PR#10019 * use_stvm -> use_tvm * rename stvm worktree * STVMAllocator -> TVMAllocator * StvmExecutionProviderInfo -> TvmExecutionProviderInfo * stvm -> tvm for cpu_targets. resolve onnxruntime::tvm and origin tvm namespaces conflict * STVMRunner -> TVMRunner * StvmExecutionProvider -> TvmExecutionProvider * tvm::env_vars * StvmProviderFactory -> TvmProviderFactory * rename factory funcs * StvmCPUDataTransfer -> TvmCPUDataTransfer * small clean * STVMFuncState -> TVMFuncState * USE_TVM -> NUPHAR_USE_TVM * USE_STVM -> USE_TVM * python API: providers.stvm -> providers.tvm. clean TVM_EP.md * clean build scripts #1 * clean build scripts, java frontend and others #2 * once more clean #3 * fix build of nuphar tvm test * final transfer stvm namespace to onnxruntime::tvm * rename stvm->tvm * NUPHAR_USE_TVM -> USE_NUPHAR_TVM * small fixes for correct CI tests * clean after rebase. Last renaming stvm to tvm, separate TVM and Nuphar in cmake and build files * update CUDA support for TVM EP * roll back CudaNN home check * ERROR for not positive input shape dimension instead of WARNING * update documentation for CUDA * small corrections after review * update GPU description * update GPU description * misprints were fixed * cleaned up error msgs Co-authored-by: Valery Chernov <valery.chernov@deelvin.com> Co-authored-by: KJlaccHoeUM9l <wotpricol@mail.ru> Co-authored-by: Thierry Moreau <tmoreau@octoml.ai>
2022-02-15 09:21:02 +00:00
tvm_run_python_tests(build_dir, configs)
# run node.js binding tests
if args.build_nodejs and not args.skip_nodejs_tests:
nodejs_binding_dir = os.path.normpath(os.path.join(source_dir, "js", "node"))
run_nodejs_tests(nodejs_binding_dir)
# Build packages after running the tests.
# NOTE: if you have a test that relies on a file which only gets copied/generated during the packaging step, it
# could fail unexpectedly. Similarly, if your packaging step forgot to copy a file into the package, we won't
# know it either.
2019-01-15 18:29:00 +00:00
if args.build:
# TODO: find asan DLL and copy it to onnxruntime/capi folder when args.enable_address_sanitizer is True and
# the target OS is Windows
2019-01-15 18:29:00 +00:00
if args.build_wheel:
nightly_build = bool(os.getenv("NIGHTLY_BUILD") == "1")
default_training_package_device = bool(os.getenv("DEFAULT_TRAINING_PACKAGE_DEVICE") == "1")
build_python_wheel(
source_dir,
build_dir,
configs,
args.use_cuda,
args.cuda_version,
args.use_rocm,
args.use_migraphx,
args.rocm_version,
args.use_dnnl,
args.use_tensorrt,
args.use_openvino,
[TVM EP] Rename Standalone TVM (STVM) Execution Provider to TVM EP (#10260) * update java API for STVM EP. Issue is from PR#10019 * use_stvm -> use_tvm * rename stvm worktree * STVMAllocator -> TVMAllocator * StvmExecutionProviderInfo -> TvmExecutionProviderInfo * stvm -> tvm for cpu_targets. resolve onnxruntime::tvm and origin tvm namespaces conflict * STVMRunner -> TVMRunner * StvmExecutionProvider -> TvmExecutionProvider * tvm::env_vars * StvmProviderFactory -> TvmProviderFactory * rename factory funcs * StvmCPUDataTransfer -> TvmCPUDataTransfer * small clean * STVMFuncState -> TVMFuncState * USE_TVM -> NUPHAR_USE_TVM * USE_STVM -> USE_TVM * python API: providers.stvm -> providers.tvm. clean TVM_EP.md * clean build scripts #1 * clean build scripts, java frontend and others #2 * once more clean #3 * fix build of nuphar tvm test * final transfer stvm namespace to onnxruntime::tvm * rename stvm->tvm * NUPHAR_USE_TVM -> USE_NUPHAR_TVM * small fixes for correct CI tests * clean after rebase. Last renaming stvm to tvm, separate TVM and Nuphar in cmake and build files * update CUDA support for TVM EP * roll back CudaNN home check * ERROR for not positive input shape dimension instead of WARNING * update documentation for CUDA * small corrections after review * update GPU description * update GPU description * misprints were fixed * cleaned up error msgs Co-authored-by: Valery Chernov <valery.chernov@deelvin.com> Co-authored-by: KJlaccHoeUM9l <wotpricol@mail.ru> Co-authored-by: Thierry Moreau <tmoreau@octoml.ai>
2022-02-15 09:21:02 +00:00
args.use_tvm,
args.use_vitisai,
args.use_acl,
args.use_armnn,
args.use_dml,
args.use_cann,
args.use_azure,
args.use_qnn,
args.wheel_name_suffix,
Add new PytTrch front-end (#4815) * Add ORTTrainerOptions class for the new pytorch frontend (#4382) Add ORTTrainerOptions class and some placeholders * Add _ORTTrainerModelDesc to perform validation for model description (#4416) * Add Loss Scaler classes to the new frontend (#4306) * Add TrainStepInfo used on the new frontend API (#4256) * Add Optimizer classes to the new frontend (#4280) * Add LRScheduler implementation (#4357) * Add basic ORTTrainer API (#4435) This PR presents the public API for ORTTrainer for the short term development. It also validates and saves input parameters, which will be used in the next stages, such as building ONNX model, post processing the model and configuring the training session * Add opset_version into ORTTrainerOptions and change type of ORTTrainer.loss_fn (#4592) * Update ModelDescription and minor fix on ORTTrainer ctor (#4605) * Update ModelDescription and minor fix on ORTTrainer/ORTTrainerOptions This PR keeps the public API intact, but changes how model description is stored on the backend Currently, users creates a dict with two lists of tuples. One list called 'inputs' and each tuple has the following format tuple(name, shape). The second list is called 'outputs' and each tuple can be either tuple(name, shape) or tuple(name, shape, is_loss). With this PR, when this dict is passed in to ORTTrainer, it is fully validated as usual. However, tuples are internally replaced by namedtuples and all output tuples will have tuple(name, shape, is_loss) format instead of is_loss being optionally present. Additionally to that normalization in the internal representation (which eases coding), two internal methods were created to replace a namedtuple(name, shape) to namedtuple(name, shape, dtype) or namedtuple(name, shape, is_loss, dtype) dependeing whether the tuple is an input or output. This is necessary as ORTTRainer finds out data types of each input/output during model export to onnx. Finally, a minor fix was done on ORTTrainer. 
It could initialize ORTTrainerOptions incorrectly when options=None * Rename input name for test * Add ONNX Model Export to New Frontend (#4612) Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: Thiago Crepaldi <thiago.crepaldi@microsoft.com> * Create training session + minor improvements (#4668) Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> * Save ONNX model in file (#4671) Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> * Add eval step (#4674) Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> * Add train_step (#4677) Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> * Add LR Scheduler (#4694) Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: Thiago Crepaldi <thiago.crepaldi@microsoft.com> * Add deterministic compute tests (#4716) Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: Thiago Crepaldi <thiago.crepaldi@microsoft.com> * Add legacy vs experimental ORTTrainer accuracy comparison (#4727) Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: Thiago Crepaldi <thiago.crepaldi@microsoft.com> * Add Mixed precision/LossScaler + several fixes (#4739) Additionally to the mixed precision/loss scaler code, this PR includes: * Fix CUDA training * Add optimization_step into TrainStepInfo class * Refactor LRSCheduler to use optimization_step instead of step * Updated several default values at ORTTrainerOptions * Add initial Gradient Accumulation supported. 
Untested * Fix ONNX model post processing * Refactor unit tests * Add ONNX BERT example + minor fixes (#4757) * Fix training issue when passing ONNX file into ORTTrainer Co-authored-by: Thiago Crepaldi <thiago.crepaldi@microsoft.com> Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> * Add Dynamic Shape support (#4758) * Update DeepSpeed Zero Stage option to a separate option group (#4772) * Add support to fetches (#4777) * Add Gradient Accumulation Steps support (#4793) * Fix Dynamic Axes feature and add unit test (#4795) * Add frozen weights test (#4807) * Move new pytorch front-end to 'experimental' namespace (#4814) * Fix build Co-authored-by: Rayan-Krishnan <rayankrishnan@live.com> Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
2020-08-17 16:45:25 +00:00
args.enable_training,
nightly_build=nightly_build,
default_training_package_device=default_training_package_device,
use_ninja=(args.cmake_generator == "Ninja"),
enable_training_apis=args.enable_training_apis,
enable_rocm_profiling=args.enable_rocm_profiling,
)
Rework/cleanup the C# build infrastructure for nuget packages. (#18127) ### Description Update the C# nuget build infrastructure to make building a test nuget package more user friendly and to simplify - Remove usage of dotnet and msbuild in CIs - was temporary requirement until .net 6 MAUI was added to the released Visual Studio - remove SelectedTargets property and its usage - Add property for excluding mobile targets - generally we exclude based on the nuget package name - can now specify `/p:IncludeMobileTargets=false` on the command line to force exclusion - support building test package using build.py `--build_nuget` better - limit inclusion of xamarin targets as building with them requires a lot more infrastructure - use msbuild directly if xamarin targets are included. use dotnet otherwise. - remove quoting of property values as it doesn't appear to be necessary and breaks when msbuild is being used - add infrastructure to be able to pack the nuget package on linux with `dotnet pack` - `nuget pack` is not user friendly as-per comments in changes - requires stub csproj to provide the nuspec path - Remove netstandard1.0 targets from nuspec - we removed support from the actual bindings previously - Remove usage of nuget-staging directory when creating nuget package on linux - the nuspec file element has a fully qualified path for a source file so there is no obvious benefit to copying to a staging directory prior to packing ### Motivation and Context Address issues with 1P users trying to create test nuget packages locally. Long overdue cleanup of CI complexity.
2023-11-03 16:05:17 +00:00
if args.build_nuget:
build_nuget_package(
cmake_path,
source_dir,
build_dir,
configs,
args.use_cuda,
args.use_rocm,
args.use_openvino,
args.use_tensorrt,
args.use_dnnl,
[TVM EP] Rename Standalone TVM (STVM) Execution Provider to TVM EP (#10260) * update java API for STVM EP. Issue is from PR#10019 * use_stvm -> use_tvm * rename stvm worktree * STVMAllocator -> TVMAllocator * StvmExecutionProviderInfo -> TvmExecutionProviderInfo * stvm -> tvm for cpu_targets. resolve onnxruntime::tvm and origin tvm namespaces conflict * STVMRunner -> TVMRunner * StvmExecutionProvider -> TvmExecutionProvider * tvm::env_vars * StvmProviderFactory -> TvmProviderFactory * rename factory funcs * StvmCPUDataTransfer -> TvmCPUDataTransfer * small clean * STVMFuncState -> TVMFuncState * USE_TVM -> NUPHAR_USE_TVM * USE_STVM -> USE_TVM * python API: providers.stvm -> providers.tvm. clean TVM_EP.md * clean build scripts #1 * clean build scripts, java frontend and others #2 * once more clean #3 * fix build of nuphar tvm test * final transfer stvm namespace to onnxruntime::tvm * rename stvm->tvm * NUPHAR_USE_TVM -> USE_NUPHAR_TVM * small fixes for correct CI tests * clean after rebase. Last renaming stvm to tvm, separate TVM and Nuphar in cmake and build files * update CUDA support for TVM EP * roll back CudaNN home check * ERROR for not positive input shape dimension instead of WARNING * update documentation for CUDA * small corrections after review * update GPU description * update GPU description * misprints were fixed * cleaned up error msgs Co-authored-by: Valery Chernov <valery.chernov@deelvin.com> Co-authored-by: KJlaccHoeUM9l <wotpricol@mail.ru> Co-authored-by: Thierry Moreau <tmoreau@octoml.ai>
2022-02-15 09:21:02 +00:00
args.use_tvm,
args.use_winml,
args.use_qnn,
args.enable_training_apis,
normalize_arg_list(args.msbuild_extra_options),
)
if args.test and args.build_nuget:
run_csharp_tests(
source_dir,
build_dir,
args.use_cuda,
args.use_openvino,
args.use_tensorrt,
args.use_dnnl,
args.enable_training_apis,
)
if args.gen_doc:
# special case CI where we create the build config separately to building
if args.update and not args.build:
pass
else:
# assumes build has occurred for easier use in CI where we don't always build via build.py and need to run
# documentation generation as a separate task post-build
generate_documentation(source_dir, build_dir, configs, args.gen_doc == "validate")
2018-11-20 00:48:22 +00:00
if args.gen_api_doc and (args.build or args.test):
print("Generating Python doc for ORTModule...")
docbuild_dir = os.path.join(source_dir, "tools", "doc")
run_subprocess(
["bash", "builddoc.sh", os.path.dirname(sys.executable), source_dir, build_dir, args.config[0]],
cwd=docbuild_dir,
)
2018-11-20 00:48:22 +00:00
log.info("Build complete")
2020-04-19 03:48:30 +00:00
2018-11-20 00:48:22 +00:00
if __name__ == "__main__":
    # Script entry point: run the build and use main()'s return value as the
    # process exit status. Any BaseError raised by main() is logged and
    # turned into exit status 1.
    try:
        exit_status = main()
    except BaseError as build_error:
        log.error(str(build_error))
        exit_status = 1
    sys.exit(exit_status)