2018-11-20 00:48:22 +00:00
#!/usr/bin/env python3
# Copyright (c) Microsoft Corporation. All rights reserved.
2024-09-13 03:51:59 +00:00
# SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
2018-11-20 00:48:22 +00:00
# Licensed under the MIT License.
import argparse
2021-01-14 03:21:49 +00:00
import contextlib
2023-08-08 16:04:06 +00:00
import json
2018-11-20 00:48:22 +00:00
import os
2022-04-26 16:35:16 +00:00
import platform
2018-11-20 00:48:22 +00:00
import re
2021-08-05 16:41:17 +00:00
import shlex
2018-11-20 00:48:22 +00:00
import shutil
import subprocess
import sys
2024-11-06 17:54:55 +00:00
import warnings
2022-06-13 18:38:44 +00:00
from pathlib import Path
2020-04-19 03:48:30 +00:00
2023-11-21 13:37:48 +00:00
def version_to_tuple(version: str) -> tuple:
    """Parse a dotted version string into a tuple of its integer components.

    The string is split on "." and every piece accepted by ``int()`` is kept,
    in order. Pieces that are not valid integers (for example "rc1" or an
    empty piece) are dropped instead of raising.
    """
    numbers = []
    for piece in version.split("."):
        try:
            numbers.append(int(piece))
        except ValueError:
            # Non-numeric component: ignore it and move on to the next piece.
            continue
    return tuple(numbers)
2020-11-18 01:02:24 +00:00
# Directory that holds this script, with any symlinks in the path resolved.
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
# Two directory levels above SCRIPT_DIR, normalized.
REPO_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, os.pardir, os.pardir))

# Put <REPO_DIR>/tools/python first on the import path so the `util` imports that follow resolve.
sys.path.insert(0, os.path.join(REPO_DIR, "tools", "python"))
2020-11-18 01:02:24 +00:00
2021-01-14 03:21:49 +00:00
import util . android as android # noqa: E402
2022-04-26 16:35:16 +00:00
from util import get_logger , is_linux , is_macOS , is_windows , run # noqa: E402
2020-11-18 01:02:24 +00:00
# Module-level logger obtained from util.get_logger.
log = get_logger ( " build " )
2019-01-10 06:33:14 +00:00
class BaseError(Exception):
    """Common base class for the errors that build.py itself raises."""
2022-04-26 16:35:16 +00:00
2020-04-19 03:48:30 +00:00
2019-01-10 06:33:14 +00:00
class BuildError(BaseError):
    """Error from running build steps.

    Any number of message parts may be passed; they are joined with newlines
    to form the exception text. Parts that are not strings are converted with
    ``str()`` first, so passing e.g. a path or another exception does not
    raise ``TypeError`` while the error itself is being constructed.
    """

    def __init__(self, *messages):
        super().__init__("\n".join(str(message) for message in messages))
2020-04-19 03:48:30 +00:00
2019-01-10 06:33:14 +00:00
class UsageError(BaseError):
    """Error caused by incorrect usage of the script."""

    def __init__(self, message):
        # Exactly one message is required; it becomes the exception text.
        BaseError.__init__(self, message)
2020-04-19 03:48:30 +00:00
def _check_python_version():
    """Raise UsageError unless the running interpreter is Python 3.8 or newer.

    Only the (major, minor) pair of ``sys.version_info`` is compared; the
    full ``sys.version`` string is included in the error text.
    """
    minimum = (3, 8)
    if sys.version_info[:2] >= minimum:
        return

    raise UsageError(
        f"Invalid Python version. At least Python 3.{minimum[1]} is required. "
        f"Actual Python version: {sys.version}"
    )
2020-02-21 07:26:25 +00:00
2020-12-18 00:21:33 +00:00
def _str_to_bool(s):
    """Turn the text "true" or "false" (any letter case) into a bool.

    Intended as an argparse ``type=`` callable. Any other text raises
    ValueError.
    """
    lowered = s.lower()
    if lowered == "true":
        return True
    if lowered == "false":
        return False
    raise ValueError(f"Need bool; got {s!r}")
2020-12-18 00:21:33 +00:00
2020-04-19 03:48:30 +00:00
# Run the interpreter version check as soon as the module is loaded.
_check_python_version ( )
2018-11-20 00:48:22 +00:00
2020-11-17 00:58:43 +00:00
def _openvino_verify_device_type(device_read):
    """Validate an OpenVINO device specification and return it unchanged.

    Accepted values are a single device ("CPU", "GPU", "NPU"), one of the
    special variants listed in ``no_partition_devices`` below, or a
    combined specification of the form "HETERO:<dev>,<dev>[,...]",
    "MULTI:..." or "AUTO:..." naming at least two of the single devices.

    Any other value prints guidance to stdout and terminates the process
    through ``sys.exit("Wrong Build Type selected")``.
    """
    single_devices = ["CPU", "GPU", "NPU"]
    no_partition_devices = [
        "CPU_NO_PARTITION",
        "GPU_NO_PARTITION",
        "NPU_NO_PARTITION",
        "NPU_NO_CPU_FALLBACK",
    ]

    def exit_with_hetero_help():
        # Print the help for combined device specifications, then exit.
        print("\nIf trying to build Hetero/Multi/Auto, specify the supported devices along with it.\n")
        print("specify the keyword HETERO or MULTI or AUTO followed by the devices ")
        print("in the order of priority you want to build\n")
        print("The different hardware devices that can be added in HETERO or MULTI or AUTO")
        print("are ['CPU','GPU','NPU'] \n")
        print("An example of how to specify the hetero build type. Ex: HETERO:GPU,CPU \n")
        print("An example of how to specify the MULTI build type. Ex: MULTI:GPU,CPU \n")
        print("An example of how to specify the AUTO build type. Ex: AUTO:GPU,CPU \n")
        sys.exit("Wrong Build Type selected")

    # Plain device names need no further inspection.
    if device_read in single_devices or device_read in no_partition_devices:
        return device_read

    if not device_read.startswith(("HETERO:", "MULTI:", "AUTO:")):
        print("\nYou have selected wrong configuration for the build.")
        print("pick the build type for specific Hardware Device from following options: ", single_devices)
        print("(or) from the following options with graph partitioning disabled: ", no_partition_devices)
        print("\n")
        # The combined-mode help is shown only when the value does not even
        # begin with one of the combined-mode keywords.
        if not device_read.startswith(("HETERO", "MULTI", "AUTO")):
            exit_with_hetero_help()
        sys.exit("Wrong Build Type selected")

    # Combined specification: only the text between the first and second ":"
    # is treated as the comma-separated device list.
    requested = device_read.split(":")[1].split(",")
    devices_ok = True
    if len(requested) < 2:
        print("At least two devices required in Hetero/Multi/Auto Mode")
        devices_ok = False
    if any(dev not in single_devices for dev in requested):
        devices_ok = False

    if not devices_ok:
        exit_with_hetero_help()

    return device_read
2020-04-19 03:48:30 +00:00
def parse_arguments ( ) :
2021-12-15 16:22:15 +00:00
class Parser ( argparse . ArgumentParser ) :
# override argument file line parsing behavior - allow multiple arguments per line and handle quotes
def convert_arg_line_to_args ( self , arg_line ) :
return shlex . split ( arg_line )
parser = Parser (
2020-04-19 03:48:30 +00:00
description = " ONNXRuntime CI build driver. " ,
2021-12-15 16:22:15 +00:00
usage = """
2020-04-19 03:48:30 +00:00
Default behavior is - - update - - build - - test for native architecture builds .
Default behavior is - - update - - build for cross - compiled builds .
The Update phase will update git submodules , and run cmake to generate makefiles .
The Build phase will build all projects .
The Test phase will run all unit tests , and optionally the ONNX tests .
Use the individual flags to only run the specified stages .
2021-12-15 16:22:15 +00:00
""" ,
# files containing arguments can be specified on the command line with "@<filename>" and the arguments within
# will be included at that point
2022-04-26 16:35:16 +00:00
fromfile_prefix_chars = " @ " ,
)
2018-11-20 00:48:22 +00:00
# Main arguments
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --build_dir " , required = True , help = " Path to the build directory. " )
2020-04-19 03:48:30 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --config " ,
nargs = " + " ,
default = [ " Debug " ] ,
2020-04-19 03:48:30 +00:00
choices = [ " Debug " , " MinSizeRel " , " Release " , " RelWithDebInfo " ] ,
2022-04-26 16:35:16 +00:00
help = " Configuration(s) to build. " ,
)
parser . add_argument ( " --update " , action = " store_true " , help = " Update makefiles. " )
parser . add_argument ( " --build " , action = " store_true " , help = " Build. " )
2020-04-19 03:48:30 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --clean " , action = " store_true " , help = " Run ' cmake --build --target clean ' for the selected config/s. "
)
2020-04-19 03:48:30 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --parallel " ,
nargs = " ? " ,
const = " 0 " ,
default = " 1 " ,
type = int ,
2020-10-30 00:13:04 +00:00
help = " Use parallel build. The optional value specifies the maximum number of parallel jobs. "
2022-04-26 16:35:16 +00:00
" If the optional value is 0 or unspecified, it is interpreted as the number of CPUs. " ,
)
Flash Attention v2 MHA (#17227)
### Description
Integrate Flash Attention V2 to PackedMultiHeadAttention,
MultiHeadAttention and Attention operators.
Flash Attention v2 source code is from
https://github.com/Dao-AILab/flash-attention/tree/main/csrc/flash_attn/src.
We made some changes to remove the dependency on Torch, then removed the backward
and bfloat16 related code.
Add benchmark script (see benchmark_mha.sh) to compare different
attention kernels for MultiHeadAttention operator.
Current limitations for Flash Attention in PackedMultiHeadAttention,
MultiHeadAttention and Attention operators:
* Relative Position Bias is not supported
* Different hidden size for Q and V is not supported
* Only float16 is supported
* Padding/attention mask is not supported
* For MultiHeadAttention, when there is past or present input, bias
shall be provided to activate flash attention
* For Attention, past or present inputs will deactivate flash attention
* Causal is not supported
Some limitations (like attention mask and causal) might be removed
later.
Currently, Flash Attention v2 only works in Linux. For Windows, we will
enable later with Cutlass 3.2.
Two environment variables can be used for testing purpose:
(1) `ORT_DISABLE_FLASH_ATTENTION` to disable flash attention. Default
value is 0 (enable). Set it to "1" to disable it.
(2) `ORT_MIN_SEQ_LEN_FLASH_ATTENTION_PACKED_QKV`. Default value is
"513", which means that we only enable flash attention when sequence
length is larger than 512 for packed QKV format. Set it to "0" if you
want to use flash attention v2 whenever possible.
### Speedup
The following result is from Standard_ND96amsr_A100_v4 VM
(A100-SXM4-80GB GPU) using benchmark_mha.sh. The metric is TFLOPs per
second for MultiHeadAttention operator.
There are 3 input formats:
* `Q,K,V` means separated inputs query, key and value of BxSxNH
* `Q,KV` means packed KV, where key is 5D: BxSxNx2xH
* `QKV` means packed QKV, where query is 5D: BxSxNx3xH
Note that flash attention cannot use packed QKV format, so extra
Transpose is needed. We found that TensorRT kernel is faster for
sequence length <= 512 for packed QKV. The reason might be no transpose
is needed for TensorRT kernel in this format.
We also notice that, TensorRT kernel is faster for stable diffusion
512x512 image (see seq_len=4096, heads=8, head_dim=40 below), while
flash attention v2 is faster for 1024x1024 image (see seq_len=16384,
heads=8, head_dim=40 below).
input format | batch size | sequence length | heads | head dim |
flash_v2 (TFLOPs/s) | TensorRT (TFLOPs/s) | Memory Efficient Attention
(TFLOPs/s)
-- | -- | -- | -- | -- | -- | -- | --
Q,K,V | 32 | 512 | 64 | 32 | 78.1 | 60.0 | 39.3
Q,K,V | 32 | 512 | 128 | 16 | 46.8 | 44.1 | 21.7
Q,K,V | 16 | 1024 | 64 | 32 | 99.0 | 72.8 | 44.3
Q,K,V | 16 | 1024 | 128 | 16 | 54.7 | 49.2 | 23.4
Q,K,V | 8 | 2048 | 64 | 32 | 113.8 | 81.2 | 47.8
Q,K,V | 8 | 2048 | 128 | 16 | 59.7 | 51.9 | 24.7
Q,K,V | 4 | 4096 | 64 | 32 | 122.5 | 85.6 | 49.7
Q,K,V | 4 | 4096 | 128 | 16 | 62.5 | 53.3 | 25.3
Q,K,V | 2 | 8192 | 64 | 32 | 127.4 | 87.5 | 50.7
Q,K,V | 2 | 8192 | 128 | 16 | 64.0 | 54.2 | 25.6
Q,K,V | 1 | 16384 | 64 | 32 | 129.5 | 91.0 | 51.2
Q,K,V | 1 | 16384 | 128 | 16 | 64.7 | 54.5 | 25.8
Q,K,V | 1 | 4096 | 8 | 40 | 51.0 | 43.6 | 36.8
Q,K,V | 1 | 4096 | 8 | 80 | 97.7 | 77.0 | 55.5
Q,K,V | 1 | 4096 | 8 | 160 | 120.0 | 39.7 | 57.8
Q,K,V | 4 | 4096 | 8 | 40 | 89.0 | 84.4 | 49.2
Q,K,V | 4 | 4096 | 8 | 80 | 133.0 | 92.2 | 63.2
Q,K,V | 4 | 4096 | 8 | 160 | 164.8 | 42.7 | 63.8
Q,K,V | 1 | 16384 | 8 | 40 | 96.9 | 91.3 | 52.1
Q,K,V | 1 | 16384 | 8 | 80 | 142.9 | 101.5 | 65.6
Q,K,V | 1 | 16384 | 8 | 160 | 177.4 | 44.2 | 65.7
Q,K,V | 128 | 128 | 12 | 64 | 29.0 | 26.9 | 25.7
Q,K,V | 64 | 128 | 12 | 64 | 23.1 | 10.8 | 21.3
Q,K,V | 128 | 384 | 12 | 64 | 83.5 | 60.8 | 55.7
Q,K,V | 64 | 384 | 12 | 64 | 72.6 | 40.5 | 52.8
Q,K,V | 128 | 512 | 12 | 64 | 98.9 | 77.9 | 62.1
Q,K,V | 64 | 512 | 12 | 64 | 94.7 | 75.6 | 60.4
Q,KV | 32 | 512 | 64 | 32 | 85.9 | 41.1 | 41.1
Q,KV | 32 | 512 | 128 | 16 | 47.1 | 21.6 | 21.6
Q,KV | 16 | 1024 | 64 | 32 | 104.4 | 45.8 | 45.8
Q,KV | 16 | 1024 | 128 | 16 | 54.7 | 23.6 | 23.6
Q,KV | 8 | 2048 | 64 | 32 | 116.8 | 48.5 | 48.5
Q,KV | 8 | 2048 | 128 | 16 | 59.8 | 24.7 | 24.7
Q,KV | 4 | 4096 | 64 | 32 | 124.2 | 50.1 | 50.1
Q,KV | 4 | 4096 | 128 | 16 | 62.6 | 25.3 | 25.3
Q,KV | 2 | 8192 | 64 | 32 | 128.5 | 50.8 | 50.9
Q,KV | 2 | 8192 | 128 | 16 | 64.1 | 25.6 | 25.6
Q,KV | 1 | 16384 | 64 | 32 | 129.4 | 51.2 | 51.2
Q,KV | 1 | 16384 | 128 | 16 | 64.8 | 25.8 | 25.8
Q,KV | 1 | 4096 | 8 | 40 | 67.5 | 37.7 | 37.5
Q,KV | 1 | 4096 | 8 | 80 | 101.3 | 56.7 | 56.6
Q,KV | 1 | 4096 | 8 | 160 | 124.0 | 58.6 | 58.6
Q,KV | 4 | 4096 | 8 | 40 | 90.8 | 49.8 | 49.8
Q,KV | 4 | 4096 | 8 | 80 | 135.6 | 63.8 | 63.8
Q,KV | 4 | 4096 | 8 | 160 | 166.3 | 64.5 | 64.5
Q,KV | 1 | 16384 | 8 | 40 | 97.5 | 52.3 | 52.3
Q,KV | 1 | 16384 | 8 | 80 | 143.5 | 65.9 | 65.8
Q,KV | 1 | 16384 | 8 | 160 | 178.4 | 65.9 | 65.8
Q,KV | 128 | 128 | 12 | 64 | 26.8 | 48.1 | 30.9
Q,KV | 64 | 128 | 12 | 64 | 28.0 | 38.9 | 25.0
Q,KV | 128 | 384 | 12 | 64 | 97.7 | 61.1 | 61.0
Q,KV | 64 | 384 | 12 | 64 | 89.5 | 57.8 | 57.9
Q,KV | 128 | 512 | 12 | 64 | 111.9 | 66.7 | 66.9
Q,KV | 64 | 512 | 12 | 64 | 107.2 | 64.9 | 64.8
QKV | 32 | 512 | 64 | 32 | 77.2 | 84.7 | 39.3
QKV | 32 | 512 | 128 | 16 | 43.4 | 53.1 | 20.9
QKV | 16 | 1024 | 64 | 32 | 98.8 | 87.4 | 44.6
QKV | 16 | 1024 | 128 | 16 | 52.0 | 54.1 | 23.2
QKV | 8 | 2048 | 64 | 32 | 113.1 | 89.0 | 47.9
QKV | 8 | 2048 | 128 | 16 | 58.2 | 54.6 | 24.5
QKV | 4 | 4096 | 64 | 32 | 120.6 | 89.7 | 49.7
QKV | 4 | 4096 | 128 | 16 | 61.7 | 54.6 | 25.2
QKV | 2 | 8192 | 64 | 32 | 125.9 | 89.5 | 50.7
QKV | 2 | 8192 | 128 | 16 | 63.6 | 54.8 | 25.5
QKV | 1 | 16384 | 64 | 32 | 128.5 | 92.0 | 51.2
QKV | 1 | 16384 | 128 | 16 | 64.6 | 54.8 | 25.7
QKV | 1 | 4096 | 8 | 40 | 60.2 | **69.8** | 38.1
QKV | 1 | 4096 | 8 | 80 | 101.6 | 75.2 | 56.7
QKV | 1 | 4096 | 8 | 160 | 130.2 | 41.2 | 58.4
QKV | 4 | 4096 | 8 | 40 | 90.6 | **91.0** | 49.5
QKV | 4 | 4096 | 8 | 80 | 133.6 | 98.1 | 62.8
QKV | 4 | 4096 | 8 | 160 | 165.3 | 43.7 | 63.9
QKV | 1 | 16384 | 8 | 40 | 97.2 | 92.8 | 52.1
QKV | 1 | 16384 | 8 | 80 | 143.0 | 103.1 | 65.6
QKV | 1 | 16384 | 8 | 160 | 177.6 | 44.5 | 65.7
QKV | 128 | 128 | 12 | 64 | 31.1 | 65.9 | 27.6
QKV | 64 | 128 | 12 | 64 | 26.1 | 49.8 | 23.5
QKV | 128 | 384 | 12 | 64 | 84.6 | 88.5 | 56.1
QKV | 64 | 384 | 12 | 64 | 79.1 | 80.3 | 53.5
QKV | 128 | 512 | 12 | 64 | 97.3 | 114.2 | 62.2
QKV | 64 | 512 | 12 | 64 | 95.9 | 110.7 | 60.6
QKV | 4 | 2048 | 32 | 128 | 125.26 | 44.72 | 78.15
QKV | 4 | 4096 | 32 | 128 | 141.62 | 46.29 | 85.84
QKV | 8 | 2048 | 32 | 128 | 127.40 | 45.49 | 78.75
QKV | 8 | 4096 | 32 | 128 | 144.24 | 46.60 | 86.95
### Known Issues
NVCC uses huge memory while compiling flash attention CUDA kernel. Linux
build with CUDA might fail when machine has limited memory while number
of CPUs is large. The workaround is to use a build machine with more
memory, or to pass an argument like `--nvcc_threads 1` to limit nvcc threads in
the build.
### Motivation and Context
Increases speed and efficiency of MHA or Packed MHA.
---------
Co-authored-by: Tianlei Wu <tlwu@microsoft.com>
Co-authored-by: tlwu@microsoft.com <tlwu@a100.crj0ad2y1kku1j4yxl4sj10o4e.gx.internal.cloudapp.net>
2023-08-31 20:52:21 +00:00
parser . add_argument (
" --nvcc_threads " ,
nargs = " ? " ,
default = - 1 ,
type = int ,
2023-09-05 17:59:27 +00:00
help = " Maximum number of NVCC threads in each parallel job. "
" If the value is unspecified, it will be computed based on available memory and number of parallel jobs. " ,
Flash Attention v2 MHA (#17227)
### Description
Integrate Flash Attention V2 to PackedMultiHeadAttention,
MultiHeadAttention and Attention operators.
Flash Attention v2 source code is from
https://github.com/Dao-AILab/flash-attention/tree/main/csrc/flash_attn/src.
We made some changes to remove the dependency on Torch, then removed the backward
and bfloat16 related code.
Add benchmark script (see benchmark_mha.sh) to compare different
attention kernels for MultiHeadAttention operator.
Current limitations for Flash Attention in PackedMultiHeadAttention,
MultiHeadAttention and Attention operators:
* Relative Position Bias is not supported
* Different hidden size for Q and V is not supported
* Only float16 is supported
* Padding/attention mask is not supported
* For MultiHeadAttention, when there is past or present input, bias
shall be provided to activate flash attention
* For Attention, past or present inputs will deactivate flash attention
* Causal is not supported
Some limitations (like attention mask and causal) might be removed
later.
Currently, Flash Attention v2 only works in Linux. For Windows, we will
enable later with Cutlass 3.2.
Two environment variables can be used for testing purpose:
(1) `ORT_DISABLE_FLASH_ATTENTION` to disable flash attention. Default
value is 0 (enable). Set it to "1" to disable it.
(2) `ORT_MIN_SEQ_LEN_FLASH_ATTENTION_PACKED_QKV`. Default value is
"513", which means that we only enable flash attention when sequence
length is larger than 512 for packed QKV format. Set it to "0" if you
want to use flash attention v2 whenever possible.
### Speedup
The following result is from Standard_ND96amsr_A100_v4 VM
(A100-SXM4-80GB GPU) using benchmark_mha.sh. The metric is TFLOPs per
second for MultiHeadAttention operator.
There are 3 input formats:
* `Q,K,V` means separated inputs query, key and value of BxSxNH
* `Q,KV` means packed KV, where key is 5D: BxSxNx2xH
* `QKV` means packed QKV, where query is 5D: BxSxNx3xH
Note that flash attention cannot use packed QKV format, so extra
Transpose is needed. We found that TensorRT kernel is faster for
sequence length <= 512 for packed QKV. The reason might be no transpose
is needed for TensorRT kernel in this format.
We also notice that, TensorRT kernel is faster for stable diffusion
512x512 image (see seq_len=4096, heads=8, head_dim=40 below), while
flash attention v2 is faster for 1024x1024 image (see seq_len=16384,
heads=8, head_dim=40 below).
input format | batch size | sequence length | heads | head dim |
flash_v2 (TFLOPs/s) | TensorRT (TFLOPs/s) | Memory Efficient Attention
(TFLOPs/s)
-- | -- | -- | -- | -- | -- | -- | --
Q,K,V | 32 | 512 | 64 | 32 | 78.1 | 60.0 | 39.3
Q,K,V | 32 | 512 | 128 | 16 | 46.8 | 44.1 | 21.7
Q,K,V | 16 | 1024 | 64 | 32 | 99.0 | 72.8 | 44.3
Q,K,V | 16 | 1024 | 128 | 16 | 54.7 | 49.2 | 23.4
Q,K,V | 8 | 2048 | 64 | 32 | 113.8 | 81.2 | 47.8
Q,K,V | 8 | 2048 | 128 | 16 | 59.7 | 51.9 | 24.7
Q,K,V | 4 | 4096 | 64 | 32 | 122.5 | 85.6 | 49.7
Q,K,V | 4 | 4096 | 128 | 16 | 62.5 | 53.3 | 25.3
Q,K,V | 2 | 8192 | 64 | 32 | 127.4 | 87.5 | 50.7
Q,K,V | 2 | 8192 | 128 | 16 | 64.0 | 54.2 | 25.6
Q,K,V | 1 | 16384 | 64 | 32 | 129.5 | 91.0 | 51.2
Q,K,V | 1 | 16384 | 128 | 16 | 64.7 | 54.5 | 25.8
Q,K,V | 1 | 4096 | 8 | 40 | 51.0 | 43.6 | 36.8
Q,K,V | 1 | 4096 | 8 | 80 | 97.7 | 77.0 | 55.5
Q,K,V | 1 | 4096 | 8 | 160 | 120.0 | 39.7 | 57.8
Q,K,V | 4 | 4096 | 8 | 40 | 89.0 | 84.4 | 49.2
Q,K,V | 4 | 4096 | 8 | 80 | 133.0 | 92.2 | 63.2
Q,K,V | 4 | 4096 | 8 | 160 | 164.8 | 42.7 | 63.8
Q,K,V | 1 | 16384 | 8 | 40 | 96.9 | 91.3 | 52.1
Q,K,V | 1 | 16384 | 8 | 80 | 142.9 | 101.5 | 65.6
Q,K,V | 1 | 16384 | 8 | 160 | 177.4 | 44.2 | 65.7
Q,K,V | 128 | 128 | 12 | 64 | 29.0 | 26.9 | 25.7
Q,K,V | 64 | 128 | 12 | 64 | 23.1 | 10.8 | 21.3
Q,K,V | 128 | 384 | 12 | 64 | 83.5 | 60.8 | 55.7
Q,K,V | 64 | 384 | 12 | 64 | 72.6 | 40.5 | 52.8
Q,K,V | 128 | 512 | 12 | 64 | 98.9 | 77.9 | 62.1
Q,K,V | 64 | 512 | 12 | 64 | 94.7 | 75.6 | 60.4
Q,KV | 32 | 512 | 64 | 32 | 85.9 | 41.1 | 41.1
Q,KV | 32 | 512 | 128 | 16 | 47.1 | 21.6 | 21.6
Q,KV | 16 | 1024 | 64 | 32 | 104.4 | 45.8 | 45.8
Q,KV | 16 | 1024 | 128 | 16 | 54.7 | 23.6 | 23.6
Q,KV | 8 | 2048 | 64 | 32 | 116.8 | 48.5 | 48.5
Q,KV | 8 | 2048 | 128 | 16 | 59.8 | 24.7 | 24.7
Q,KV | 4 | 4096 | 64 | 32 | 124.2 | 50.1 | 50.1
Q,KV | 4 | 4096 | 128 | 16 | 62.6 | 25.3 | 25.3
Q,KV | 2 | 8192 | 64 | 32 | 128.5 | 50.8 | 50.9
Q,KV | 2 | 8192 | 128 | 16 | 64.1 | 25.6 | 25.6
Q,KV | 1 | 16384 | 64 | 32 | 129.4 | 51.2 | 51.2
Q,KV | 1 | 16384 | 128 | 16 | 64.8 | 25.8 | 25.8
Q,KV | 1 | 4096 | 8 | 40 | 67.5 | 37.7 | 37.5
Q,KV | 1 | 4096 | 8 | 80 | 101.3 | 56.7 | 56.6
Q,KV | 1 | 4096 | 8 | 160 | 124.0 | 58.6 | 58.6
Q,KV | 4 | 4096 | 8 | 40 | 90.8 | 49.8 | 49.8
Q,KV | 4 | 4096 | 8 | 80 | 135.6 | 63.8 | 63.8
Q,KV | 4 | 4096 | 8 | 160 | 166.3 | 64.5 | 64.5
Q,KV | 1 | 16384 | 8 | 40 | 97.5 | 52.3 | 52.3
Q,KV | 1 | 16384 | 8 | 80 | 143.5 | 65.9 | 65.8
Q,KV | 1 | 16384 | 8 | 160 | 178.4 | 65.9 | 65.8
Q,KV | 128 | 128 | 12 | 64 | 26.8 | 48.1 | 30.9
Q,KV | 64 | 128 | 12 | 64 | 28.0 | 38.9 | 25.0
Q,KV | 128 | 384 | 12 | 64 | 97.7 | 61.1 | 61.0
Q,KV | 64 | 384 | 12 | 64 | 89.5 | 57.8 | 57.9
Q,KV | 128 | 512 | 12 | 64 | 111.9 | 66.7 | 66.9
Q,KV | 64 | 512 | 12 | 64 | 107.2 | 64.9 | 64.8
QKV | 32 | 512 | 64 | 32 | 77.2 | 84.7 | 39.3
QKV | 32 | 512 | 128 | 16 | 43.4 | 53.1 | 20.9
QKV | 16 | 1024 | 64 | 32 | 98.8 | 87.4 | 44.6
QKV | 16 | 1024 | 128 | 16 | 52.0 | 54.1 | 23.2
QKV | 8 | 2048 | 64 | 32 | 113.1 | 89.0 | 47.9
QKV | 8 | 2048 | 128 | 16 | 58.2 | 54.6 | 24.5
QKV | 4 | 4096 | 64 | 32 | 120.6 | 89.7 | 49.7
QKV | 4 | 4096 | 128 | 16 | 61.7 | 54.6 | 25.2
QKV | 2 | 8192 | 64 | 32 | 125.9 | 89.5 | 50.7
QKV | 2 | 8192 | 128 | 16 | 63.6 | 54.8 | 25.5
QKV | 1 | 16384 | 64 | 32 | 128.5 | 92.0 | 51.2
QKV | 1 | 16384 | 128 | 16 | 64.6 | 54.8 | 25.7
QKV | 1 | 4096 | 8 | 40 | 60.2 | **69.8** | 38.1
QKV | 1 | 4096 | 8 | 80 | 101.6 | 75.2 | 56.7
QKV | 1 | 4096 | 8 | 160 | 130.2 | 41.2 | 58.4
QKV | 4 | 4096 | 8 | 40 | 90.6 | **91.0** | 49.5
QKV | 4 | 4096 | 8 | 80 | 133.6 | 98.1 | 62.8
QKV | 4 | 4096 | 8 | 160 | 165.3 | 43.7 | 63.9
QKV | 1 | 16384 | 8 | 40 | 97.2 | 92.8 | 52.1
QKV | 1 | 16384 | 8 | 80 | 143.0 | 103.1 | 65.6
QKV | 1 | 16384 | 8 | 160 | 177.6 | 44.5 | 65.7
QKV | 128 | 128 | 12 | 64 | 31.1 | 65.9 | 27.6
QKV | 64 | 128 | 12 | 64 | 26.1 | 49.8 | 23.5
QKV | 128 | 384 | 12 | 64 | 84.6 | 88.5 | 56.1
QKV | 64 | 384 | 12 | 64 | 79.1 | 80.3 | 53.5
QKV | 128 | 512 | 12 | 64 | 97.3 | 114.2 | 62.2
QKV | 64 | 512 | 12 | 64 | 95.9 | 110.7 | 60.6
QKV | 4 | 2048 | 32 | 128 | 125.26 | 44.72 | 78.15
QKV | 4 | 4096 | 32 | 128 | 141.62 | 46.29 | 85.84
QKV | 8 | 2048 | 32 | 128 | 127.40 | 45.49 | 78.75
QKV | 8 | 4096 | 32 | 128 | 144.24 | 46.60 | 86.95
### Known Issues
NVCC uses huge memory while compiling flash attention CUDA kernel. Linux
build with CUDA might fail when machine has limited memory while number
of CPUs is large. The workaround is to use a build machine with more
memory, or to pass an argument like `--nvcc_threads 1` to limit nvcc threads in
the build.
### Motivation and Context
Increases speed and efficiency of MHA or Packed MHA.
---------
Co-authored-by: Tianlei Wu <tlwu@microsoft.com>
Co-authored-by: tlwu@microsoft.com <tlwu@a100.crj0ad2y1kku1j4yxl4sj10o4e.gx.internal.cloudapp.net>
2023-08-31 20:52:21 +00:00
)
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --test " , action = " store_true " , help = " Run unit tests. " )
parser . add_argument ( " --skip_tests " , action = " store_true " , help = " Skip all tests. " )
2022-11-07 17:06:28 +00:00
parser . add_argument (
" --compile_no_warning_as_error " ,
action = " store_true " ,
help = " Preventing warnings from being treated as errors on compile. " ,
)
2020-04-21 03:30:24 +00:00
# Training options
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --enable_nvtx_profile " , action = " store_true " , help = " Enable NVTX profile in ORT. " )
parser . add_argument ( " --enable_memory_profile " , action = " store_true " , help = " Enable memory profile in ORT. " )
2020-05-25 05:55:24 +00:00
parser . add_argument (
2023-01-03 21:28:16 +00:00
" --enable_training " ,
action = " store_true " ,
help = " Enable full training functionality in ORT. Includes ORTModule and ORT Training APIs. " ,
2022-04-26 16:35:16 +00:00
)
2023-01-03 21:28:16 +00:00
parser . add_argument ( " --enable_training_apis " , action = " store_true " , help = " Enable ort training apis. " )
parser . add_argument ( " --enable_training_ops " , action = " store_true " , help = " Enable training ops in inference graph. " )
2023-02-07 21:47:48 +00:00
parser . add_argument ( " --enable_nccl " , action = " store_true " , help = " Enable Nccl. " )
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --mpi_home " , help = " Path to MPI installation dir " )
parser . add_argument ( " --nccl_home " , help = " Path to NCCL installation dir " )
2023-02-03 12:11:50 +00:00
parser . add_argument (
" --use_mpi " , nargs = " ? " , default = False , const = True , type = _str_to_bool , help = " Disabled by default. "
)
2020-04-09 22:31:22 +00:00
2018-11-20 00:48:22 +00:00
# enable ONNX tests
2020-04-19 03:48:30 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --enable_onnx_tests " ,
action = " store_true " ,
2020-04-19 03:48:30 +00:00
help = """ When running the Test phase, run onnx_test_running against
2022-04-26 16:35:16 +00:00
available test data directories . """ ,
)
2020-04-19 03:48:30 +00:00
parser . add_argument ( " --path_to_protoc_exe " , help = " Path to protoc exe. " )
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --fuzz_testing " , action = " store_true " , help = " Enable Fuzz testing of the onnxruntime. " )
2020-07-06 23:34:34 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --enable_symbolic_shape_infer_tests " ,
action = " store_true " ,
2020-09-18 16:31:06 +00:00
help = """ When running the Test phase, run symbolic shape inference against
2022-04-26 16:35:16 +00:00
available test data directories . """ ,
)
2019-03-27 04:58:01 +00:00
2021-03-22 17:20:33 +00:00
# generate documentation
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --gen_doc " ,
nargs = " ? " ,
const = " yes " ,
type = str ,
help = " Generate documentation listing standard ONNX operators and types implemented by "
2022-08-17 18:18:37 +00:00
" various execution providers and contrib operator schemas. Must be used for inference builds, only! "
2022-04-26 16:35:16 +00:00
" Use `--gen_doc validate` to validate these match the current contents in /docs. " ,
)
parser . add_argument ( " --gen-api-doc " , action = " store_true " , help = " Generate API documentation for PyTorch frontend " )
2021-03-22 17:20:33 +00:00
2018-11-20 00:48:22 +00:00
# CUDA related
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --use_cuda " , action = " store_true " , help = " Enable CUDA. " )
2020-04-19 03:48:30 +00:00
parser . add_argument (
2023-04-17 17:11:44 +00:00
" --cuda_version " , help = " The version of CUDA toolkit to use. Auto-detect if not specified. e.g. 9.0 "
2022-04-26 16:35:16 +00:00
)
2020-04-19 03:48:30 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --cuda_home " ,
help = " Path to CUDA home. "
2020-04-19 03:48:30 +00:00
" Read from CUDA_HOME environment variable if --use_cuda is true and "
2022-04-26 16:35:16 +00:00
" --cuda_home is not specified. " ,
)
2020-04-19 03:48:30 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --cudnn_home " ,
help = " Path to CUDNN home. "
2020-04-19 03:48:30 +00:00
" Read from CUDNN_HOME environment variable if --use_cuda is true and "
2022-04-26 16:35:16 +00:00
" --cudnn_home is not specified. " ,
)
parser . add_argument ( " --enable_cuda_line_info " , action = " store_true " , help = " Enable CUDA line info. " )
2024-11-06 17:54:55 +00:00
parser . add_argument (
" --enable_cuda_nhwc_ops " , action = " store_true " , help = " Deprecated; default to enable CUDA NHWC ops in build. "
)
parser . add_argument ( " --disable_cuda_nhwc_ops " , action = " store_true " , help = " Disable CUDA NHWC ops in build. " )
2018-11-20 00:48:22 +00:00
# Python bindings
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --enable_pybind " , action = " store_true " , help = " Enable Python Bindings. " )
parser . add_argument ( " --build_wheel " , action = " store_true " , help = " Build Python Wheel. " )
2020-04-19 03:48:30 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --wheel_name_suffix " ,
2023-04-17 17:11:44 +00:00
help = " Suffix to append to created wheel names. This value is currently only used for nightly builds. " ,
2022-04-26 16:35:16 +00:00
)
parser . add_argument ( " --skip-keras-test " , action = " store_true " , help = " Skip tests with Keras if keras is installed " )
2018-11-20 00:48:22 +00:00
# C-Sharp bindings
2020-04-19 03:48:30 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --build_csharp " ,
action = " store_true " ,
2020-08-26 19:33:48 +00:00
help = " Build C#.Net DLL and NuGet package. This should be only used in CI pipelines. "
2022-04-26 16:35:16 +00:00
" For building C# bindings and packaging them into nuget package use --build_nuget arg. " ,
)
2020-08-26 19:33:48 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --build_nuget " ,
action = " store_true " ,
2020-08-26 19:33:48 +00:00
help = " Build C#.Net DLL and NuGet package on the local machine. "
2022-04-26 16:35:16 +00:00
" Currently only Windows and Linux platforms are supported. " ,
)
2018-11-20 00:48:22 +00:00
2023-05-15 23:27:38 +00:00
parser . add_argument (
" --msbuild_extra_options " ,
nargs = " + " ,
action = " append " ,
help = " Extra properties to pass to msbuild during build. "
" These are just msbuild /p: options without the leading /p:. " ,
)
2019-12-06 19:43:40 +00:00
# Java bindings
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --build_java " , action = " store_true " , help = " Build Java bindings. " )
2018-11-20 00:48:22 +00:00
2020-05-27 20:30:22 +00:00
# Node.js binding
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --build_nodejs " , action = " store_true " , help = " Build Node.js binding and NPM package. " )
2020-05-27 20:30:22 +00:00
2021-04-27 17:06:30 +00:00
# Objective-C binding
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --build_objc " , action = " store_true " , help = " Build Objective-C binding. " )
2021-04-27 17:06:30 +00:00
2018-11-20 00:48:22 +00:00
# Build a shared lib
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --build_shared_lib " , action = " store_true " , help = " Build a shared library for the ONNXRuntime. " )
2020-03-13 23:54:55 +00:00
2021-04-15 23:47:53 +00:00
# Build a shared lib
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --build_apple_framework " , action = " store_true " , help = " Build a macOS/iOS framework for the ONNXRuntime. "
)
2021-04-15 23:47:53 +00:00
2018-11-20 00:48:22 +00:00
# Build options
2020-04-19 03:48:30 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --cmake_extra_defines " ,
nargs = " + " ,
action = " append " ,
2020-04-19 03:48:30 +00:00
help = " Extra definitions to pass to CMake during build system "
2022-04-26 16:35:16 +00:00
" generation. These are just CMake -D options without the leading -D. " ,
)
parser . add_argument ( " --target " , help = " Build a specific target, e.g. winml_dll " )
2021-07-31 00:16:37 +00:00
# This flag is needed when :
# 1. The OS is 64 bits Windows
# 2. And the target binary is for 32 bits Windows
# 3. And the python used for running this script is 64 bits.
# But if you can get a 32 bits python, the build will run better and you won't need this flag.
2020-04-19 03:48:30 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --x86 " ,
action = " store_true " ,
2021-07-31 00:16:37 +00:00
help = " [cross-compiling] Create Windows x86 makefiles. Requires --update and no existing cache "
2022-04-26 16:35:16 +00:00
" CMake setup. Delete CMakeCache.txt if needed " ,
)
2024-01-25 00:27:05 +00:00
parser . add_argument (
" --rv64 " ,
action = " store_true " ,
help = " [cross-compiling] Create riscv64 makefiles. Requires --update and no existing cache "
" CMake setup. Delete CMakeCache.txt if needed " ,
)
2020-04-19 03:48:30 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --arm " ,
action = " store_true " ,
2021-07-31 00:16:37 +00:00
help = " [cross-compiling] Create ARM makefiles. Requires --update and no existing cache "
2022-04-26 16:35:16 +00:00
" CMake setup. Delete CMakeCache.txt if needed " ,
)
2020-04-19 03:48:30 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --arm64 " ,
action = " store_true " ,
2021-07-31 00:16:37 +00:00
help = " [cross-compiling] Create ARM64 makefiles. Requires --update and no existing cache "
2022-04-26 16:35:16 +00:00
" CMake setup. Delete CMakeCache.txt if needed " ,
)
2021-03-29 22:35:30 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --arm64ec " ,
action = " store_true " ,
2021-07-31 00:16:37 +00:00
help = " [cross-compiling] Create ARM64EC makefiles. Requires --update and no existing cache "
2022-04-26 16:35:16 +00:00
" CMake setup. Delete CMakeCache.txt if needed " ,
)
2023-12-07 00:49:00 +00:00
parser . add_argument (
" --buildasx " ,
action = " store_true " ,
help = " [cross-compiling] Create ARM64X Binary. " ,
)
2024-01-25 00:27:05 +00:00
parser . add_argument (
" --riscv_toolchain_root " ,
type = str ,
default = " " ,
help = " Path to RISC-V toolchain root dir. e.g. --riscv_toolchain_root=$HOME/riscv-tools/ " ,
)
parser . add_argument (
" --riscv_qemu_path " ,
type = str ,
default = " " ,
help = " Path to RISC-V qemu. e.g. --riscv_qemu_path=$HOME/qemu-dir/qemu-riscv64 " ,
)
2024-06-18 20:52:34 +00:00
# https://gitlab.kitware.com/cmake/cmake/-/issues/25192
parser . add_argument (
" --msvc_toolset " ,
help = " MSVC toolset to use. e.g. 14.11. It doesn ' t work if the version number is in the range of [14.36, 14.39] " ,
)
2023-08-09 21:01:16 +00:00
parser . add_argument ( " --windows_sdk_version " , help = " Windows SDK version to use. e.g. 10.0.19041.0 " )
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --android " , action = " store_true " , help = " Build for Android " )
2020-09-17 22:53:14 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --android_abi " ,
default = " arm64-v8a " ,
2020-09-17 22:53:14 +00:00
choices = [ " armeabi-v7a " , " arm64-v8a " , " x86 " , " x86_64 " ] ,
2022-04-26 16:35:16 +00:00
help = " Specify the target Android Application Binary Interface (ABI) " ,
)
parser . add_argument ( " --android_api " , type = int , default = 27 , help = " Android API Level, e.g. 21 " )
parser . add_argument (
" --android_sdk_path " , type = str , default = os . environ . get ( " ANDROID_HOME " , " " ) , help = " Path to the Android SDK "
)
parser . add_argument (
" --android_ndk_path " , type = str , default = os . environ . get ( " ANDROID_NDK_HOME " , " " ) , help = " Path to the Android NDK "
)
parser . add_argument (
" --android_cpp_shared " ,
action = " store_true " ,
help = " Build with shared libc++ instead of the default static libc++. " ,
)
parser . add_argument ( " --android_run_emulator " , action = " store_true " , help = " Start up an Android emulator if needed. " )
parser . add_argument ( " --use_gdk " , action = " store_true " , help = " Build with the GDK toolchain. " )
parser . add_argument (
" --gdk_edition " ,
2023-04-17 17:11:44 +00:00
default = os . path . normpath ( os . environ . get ( " GameDKLatest " , " " ) ) . split ( os . sep ) [ - 1 ] , # noqa: SIM112
2022-04-26 16:35:16 +00:00
help = " Build with a specific GDK edition. Defaults to the latest installed. " ,
)
2022-04-07 22:06:31 +00:00
parser . add_argument ( " --gdk_platform " , default = " Scarlett " , help = " Sets the GDK target platform. " )
2024-10-25 03:21:51 +00:00
parser . add_argument ( " --enable_wasm_memory64 " , action = " store_true " , help = " Enable WebAssembly 64bit support " )
2024-04-24 01:15:07 +00:00
platform_group = parser . add_mutually_exclusive_group ( )
platform_group . add_argument ( " --ios " , action = " store_true " , help = " build for ios " )
platform_group . add_argument ( " --visionos " , action = " store_true " , help = " build for visionOS " )
platform_group . add_argument (
2024-03-20 17:55:19 +00:00
" --macos " ,
choices = [ " MacOSX " , " Catalyst " ] ,
help = " Specify the target platform for macOS build. Only specify this argument when --build_apple_framework is present. " ,
)
2020-04-19 03:48:30 +00:00
parser . add_argument (
2023-11-28 18:11:53 +00:00
" --apple_sysroot " , default = " " , help = " Specify the location name of the macOS platform SDK to be used "
2022-04-26 16:35:16 +00:00
)
2020-04-29 00:09:31 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --ios_toolchain_file " ,
default = " " ,
2023-04-17 17:11:44 +00:00
help = " Path to ios toolchain file, or cmake/onnxruntime_ios.toolchain.cmake will be used " ,
2022-04-26 16:35:16 +00:00
)
2024-04-24 01:15:07 +00:00
parser . add_argument (
" --visionos_toolchain_file " ,
default = " " ,
help = " Path to visionos toolchain file, or cmake/onnxruntime_visionos.toolchain.cmake will be used " ,
)
2020-09-17 22:53:14 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --xcode_code_signing_team_id " , default = " " , help = " The development team ID used for code signing in Xcode "
)
2021-04-15 23:47:53 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --xcode_code_signing_identity " , default = " " , help = " The development identity used for code signing in Xcode "
)
2020-04-29 00:09:31 +00:00
parser . add_argument (
2023-07-07 15:11:44 +00:00
" --use_xcode " ,
action = " store_const " ,
const = " Xcode " ,
dest = " cmake_generator " ,
2024-03-20 17:55:19 +00:00
help = " Use Xcode as cmake generator, this is only supported on MacOS. (non Catalyst build). Equivalent to ' --cmake_generator Xcode ' . " ,
2022-04-26 16:35:16 +00:00
)
2020-04-29 00:09:31 +00:00
parser . add_argument (
2020-11-30 19:22:08 +00:00
" --osx_arch " ,
default = " arm64 " if platform . machine ( ) == " arm64 " else " x86_64 " ,
2021-05-17 11:07:02 +00:00
choices = [ " arm64 " , " arm64e " , " x86_64 " ] ,
2022-04-26 16:35:16 +00:00
help = " Specify the Target specific architectures for macOS and iOS, This is only supported on MacOS " ,
)
2020-04-29 00:09:31 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --apple_deploy_target " ,
type = str ,
2020-04-29 00:09:31 +00:00
help = " Specify the minimum version of the target platform "
" (e.g. macOS or iOS) "
2022-04-26 16:35:16 +00:00
" This is only supported on MacOS " ,
)
2024-01-12 15:24:40 +00:00
# A 32-bit process doesn't have enough memory to run all the tests in onnxruntime_test_all.
# Mimalloc is incompatible with address sanitizer.
# Address sanitizer itself is also a memory leak checker, so when it is enabled we should disable_memleak_checker.
2021-10-29 13:19:13 +00:00
parser . add_argument (
2024-01-12 15:24:40 +00:00
" --enable_address_sanitizer " , action = " store_true " , help = " Enable address sanitizer. Windows/Linux/MacOS only. "
)
2024-01-29 20:45:38 +00:00
# The following flag is mostly designed to be used in ONNX Runtime's Azure DevOps/GitHub build pipelines. Its main purpose is to make the built binaries pass the BinSkim scan.
parser . add_argument ( " --use_binskim_compliant_compile_flags " , action = " store_true " , help = " Use preset compile flags. " )
2024-01-12 15:24:40 +00:00
parser . add_argument (
" --disable_memleak_checker " ,
action = " store_true " ,
help = " Disable memory leak checker from Windows build. By default it is enabled in Windows Debug build. This option is Windows only. " ,
2022-04-26 16:35:16 +00:00
)
2024-09-10 23:39:27 +00:00
# Dependency search with vcpkg
parser . add_argument (
" --use_vcpkg " ,
action = " store_true " ,
help = " Use vcpkg to search dependencies. Requires CMAKE_TOOLCHAIN_FILE for vcpkg.cmake " ,
)
2020-03-31 02:39:17 +00:00
build ONNXRuntime into WebAssembly (#6478)
* Simplified version of WebAssembly support to keep most of existing data structures and add cmake using Ninja and emcmake
* Clean up CMakeLists.txt and add an example to create and compute a kernel
* Load a model from bytes and remove graph building steps
* Add all cpu and contrib ops with mlas library
* WebAssembly build with Onnxruntime C/CXX API
* Use protobuf cmakefile directory instead of adding every necessary source file
* Fix invalid output at example
* add missing files
* Change an example to use Teams model and support ort mobile format
* add API for javascript
* fix input releasing in _ort_run()
* update API
* Let onnxruntime cmake build WebAssembly with option '--wasm'
* allow one-step building for wasm
* Make build script working on Linux and MacOS
* Fix broken build from Windows command
* Enable unit test on building WebAssembly
* Resolve comments
* update build flags
* wasm conv improvement from: 1) GemmV; 2) Depthwise direct convolution 3x3; 3) Direct convolution 3x3
* Cleaned mlas unittest.
* use glob
* update comments
* Update baseline due to loss scale fix (#6948)
* fix stream sync issue (#6954)
* Enable type reduction in EyeLike, Mod, random.cc CPU kernels. (#6960)
* Update EyeLike CPU kernel.
* Update Mod CPU kernel.
* Update Multinomial CPU kernel.
* Slight improvement to Pad CPU kernel binary size.
* Update RandomNormal[Like], RandomUniform[Like] CPU kernels.
* Fix warning from setting multiple MSVC warning level options. (#6917)
Fix warning from setting multiple MSVC warning level options. Replace an existing /Wn flag instead of always appending a new one.
* MLAS: quantized GEMM update (#6916)
Various updates to the int8_t GEMMs:
1) Add ARM64 udot kernel to take advantage of dot product instructions available in newer cores. Some models run 4x faster than the stock implementation we used before.
2) Refactor the x64 kernels to share common code for AVX2(u8u8/u8s8/avxvnni) vs AVX512(u8u8/u8s8/avx512vnni) to reduce binary size.
3) Extend kernels to support per-column zero points for matrix B. This is not currently wired to an operator.
* Implement QLinearAveragePool with unit tests. (#6896)
Implement QLinearAveragePool with unit tests.
* Attention fusion detect num_heads and hidden_size automatically (#6920)
* fixed type to experimental session constructor (#6950)
* fixed type to experimental session constructor
Co-authored-by: David Medine <david.medine@brainproducts.com>
* Update onnxruntime_perf_test.exe to accept free dimension overrides (#6962)
Co-authored-by: Ori Levari <orlevari@microsoft.com>
* Fix possible fd leak in NNAPI (#6966)
* Release buffers for prepacked tensors (#6820)
Unsolved problems:
1. One test failure was caused by a bug in Cudnn rnn kernels, when they can allocate a buffer and partially initialize it, the garbage data near tail of the buffer caused problem in some of the hardware. To attack this problem in a broader sense, should we add code in our allocators, and during a memory fuzzing test, fill an allocated buffer with garbage before returning to the caller?
2. Prepacking is used more widely than we know. For instance, Cudnn rnn kernels also cache their weights. They mix several weight tensors together into a single buffer, and never touch the original weight tensor anymore. This is the same idea with pre-pack, but they didn't override the virtual function, and they never tried to release those weight tensors, leading to memory waste. It also seems to me that there are some other kernels have similar behavior. Wonder how much memory we can save if we try to cleanup those too.
3. Turning off memory pattern planning does increase memory fragmentation, leading to out of memory error in some training test cases. Perhaps we can revisit the idea of pushing kernels-creation stage earlier, and then during initializer deserialization, we only avoid tracing those that will be prepacked.
* Enable type reduction for Range, ReverseSequence, ScatterND, Split, and Unique CPU kernels. (#6963)
* add CI
* fix test in ci
* fix flags for nsync in wasm build
* add copyright banner
* fix wasm source glob
* add missing exports
* resolve comments
* Perf gain by make packb wide to 4 from 16 on GEMM for WASM.
Remove no need direct conv in previous perf tuning.
* fix buildbreak introduced from latest master merge
* fix buildbreak in mlasi.h
* resolve all comments except MLAS
* rewrite packb related 3 functions for WASM_SCALAR separately rather than using #ifdef in each.
and other changes according to PR feedback in mlas.
* More complete scalar path in sgemm from Tracy.
* Fix edge case handling in depthwise conv2d kernel 3x3. where:
*) support input W==1 and H==1
*) recalc in accurate pad_right and pad_bottom
*) support hidden pad_right == 2 or pad_bottom == 2 when W == 1 or H==1 and no pad left/top
* Add more test coverage for conv depthwise from Tracy.
Fix one typo according to PR.
* resolve comments
* replace typedef by using
* do not use throw in OrtRun()
* output error message
Co-authored-by: Sunghoon <35605090+hanbitmyths@users.noreply.github.com>
Co-authored-by: Lei Zhang <zhang.huanning@hotmail.com>
Co-authored-by: Wei-Sheng Chin <wschin@outlook.com>
Co-authored-by: Tianlei Wu <tlwu@microsoft.com>
Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Co-authored-by: Tracy Sharpe <42477615+tracysh@users.noreply.github.com>
Co-authored-by: David Medine <david.eric.medine@gmail.com>
Co-authored-by: David Medine <david.medine@brainproducts.com>
Co-authored-by: Ori Levari <ori.levari@microsoft.com>
Co-authored-by: Ori Levari <orlevari@microsoft.com>
Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com>
Co-authored-by: Chen Fu <chenfucs@gmail.com>
2021-04-06 23:18:10 +00:00
# WebAssembly build
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --build_wasm " , action = " store_true " , help = " Build for WebAssembly " )
parser . add_argument ( " --build_wasm_static_lib " , action = " store_true " , help = " Build for WebAssembly static library " )
2024-08-22 18:21:00 +00:00
parser . add_argument ( " --emsdk_version " , default = " 3.1.59 " , help = " Specify version of emsdk " )
2022-03-22 18:55:45 +00:00
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --enable_wasm_simd " , action = " store_true " , help = " Enable WebAssembly SIMD " )
parser . add_argument ( " --enable_wasm_threads " , action = " store_true " , help = " Enable WebAssembly multi-threads support " )
2022-03-22 18:55:45 +00:00
build ONNXRuntime into WebAssembly (#6478)
* Simplified version of WebAssembly support to keep most of existing data structures and add cmake using Ninja and emcmake
* Clean up CMakeLists.txt and add an example to create and compute a kernel
* Load a model from bytes and remove graph building steps
* Add all cpu and contrib ops with mlas library
* WebAssembly build with Onnxruntime C/CXX API
* Use protobuf cmakefile directory instead of adding every necessary source file
* Fix invalid output at example
* add missing files
* Change an example to use Teams model and support ort mobile format
* add API for javascript
* fix input releasing in _ort_run()
* update API
* Let onnxruntime cmake build WebAssembly with option '--wasm'
* allow one-step building for wasm
* Make build script working on Linux and MacOS
* Fix broken build from Windows command
* Enable unit test on building WebAssembly
* Resolve comments
* update build flags
* wasm conv improvement from: 1) GemmV; 2) Depthwise direct convolution 3x3; 3) Direct convolution 3x3
* Cleaned mlas unittest.
* use glob
* update comments
* Update baseline due to loss scale fix (#6948)
* fix stream sync issue (#6954)
* Enable type reduction in EyeLike, Mod, random.cc CPU kernels. (#6960)
* Update EyeLike CPU kernel.
* Update Mod CPU kernel.
* Update Multinomial CPU kernel.
* Slight improvement to Pad CPU kernel binary size.
* Update RandomNormal[Like], RandomUniform[Like] CPU kernels.
* Fix warning from setting multiple MSVC warning level options. (#6917)
Fix warning from setting multiple MSVC warning level options. Replace an existing /Wn flag instead of always appending a new one.
* MLAS: quantized GEMM update (#6916)
Various updates to the int8_t GEMMs:
1) Add ARM64 udot kernel to take advantage of dot product instructions available in newer cores. Some models run 4x faster than the stock implementation we used before.
2) Refactor the x64 kernels to share common code for AVX2(u8u8/u8s8/avxvnni) vs AVX512(u8u8/u8s8/avx512vnni) to reduce binary size.
3) Extend kernels to support per-column zero points for matrix B. This is not currently wired to an operator.
* Implement QLinearAveragePool with unit tests. (#6896)
Implement QLinearAveragePool with unit tests.
* Attention fusion detect num_heads and hidden_size automatically (#6920)
* fixed type to experimental session constructor (#6950)
* fixed type to experimental session constructor
Co-authored-by: David Medine <david.medine@brainproducts.com>
* Update onnxruntime_perf_test.exe to accept free dimension overrides (#6962)
Co-authored-by: Ori Levari <orlevari@microsoft.com>
* Fix possible fd leak in NNAPI (#6966)
* Release buffers for prepacked tensors (#6820)
Unsolved problems:
1. One test failure was caused by a bug in Cudnn rnn kernels, when they can allocate a buffer and partially initialize it, the garbage data near tail of the buffer caused problem in some of the hardware. To attack this problem in a broader sense, should we add code in our allocators, and during a memory fuzzing test, fill an allocated buffer with garbage before returning to the caller?
2. Prepacking is used more widely than we know. For instance, Cudnn rnn kernels also cache their weights. They mix several weight tensors together into a single buffer, and never touch the original weight tensor anymore. This is the same idea with pre-pack, but they didn't override the virtual function, and they never tried to release those weight tensors, leading to memory waste. It also seems to me that there are some other kernels have similar behavior. Wonder how much memory we can save if we try to cleanup those too.
3. Turning off memory pattern planning does increase memory fragmentation, leading to out of memory error in some training test cases. Perhaps we can revisit the idea of pushing kernels-creation stage earlier, and then during initializer deserialization, we only avoid tracing those that will be prepacked.
* Enable type reduction for Range, ReverseSequence, ScatterND, Split, and Unique CPU kernels. (#6963)
* add CI
* fix test in ci
* fix flags for nsync in wasm build
* add copyright banner
* fix wasm source glob
* add missing exports
* resolve comments
* Perf gain by make packb wide to 4 from 16 on GEMM for WASM.
Remove no need direct conv in previous perf tuning.
* fix buildbreak introduced from latest master merge
* fix buildbreak in mlasi.h
* resolve all comments except MLAS
* rewrite packb related 3 functions for WASM_SCALAR separately rather than using #ifdef in each.
and other changes according to PR feedback in mlas.
* More complete scalar path in sgemm from Tracy.
* Fix edge case handling in depthwise conv2d kernel 3x3. where:
*) support input W==1 and H==1
*) recalc in accurate pad_right and pad_bottom
*) support hidden pad_right == 2 or pad_bottom == 2 when W == 1 or H==1 and no pad left/top
* Add more test coverage for conv depthwise from Tracy.
Fix one typo according to PR.
* resolve comments
* replace typedef by using
* do not use throw in OrtRun()
* output error message
Co-authored-by: Sunghoon <35605090+hanbitmyths@users.noreply.github.com>
Co-authored-by: Lei Zhang <zhang.huanning@hotmail.com>
Co-authored-by: Wei-Sheng Chin <wschin@outlook.com>
Co-authored-by: Tianlei Wu <tlwu@microsoft.com>
Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Co-authored-by: Tracy Sharpe <42477615+tracysh@users.noreply.github.com>
Co-authored-by: David Medine <david.eric.medine@gmail.com>
Co-authored-by: David Medine <david.medine@brainproducts.com>
Co-authored-by: Ori Levari <ori.levari@microsoft.com>
Co-authored-by: Ori Levari <orlevari@microsoft.com>
Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com>
Co-authored-by: Chen Fu <chenfucs@gmail.com>
2021-04-06 23:18:10 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --disable_wasm_exception_catching " , action = " store_true " , help = " Disable exception catching in WebAssembly. "
)
2022-11-28 18:24:34 +00:00
parser . add_argument (
" --enable_wasm_api_exception_catching " , action = " store_true " , help = " Catch exceptions at top level api. "
)
2021-09-10 14:09:16 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --enable_wasm_exception_throwing_override " ,
action = " store_true " ,
2021-09-10 14:09:16 +00:00
help = " Enable exception throwing in WebAssembly, this will override default disabling exception throwing "
2022-04-26 16:35:16 +00:00
" behavior when disable exceptions. " ,
)
2023-02-25 00:45:33 +00:00
parser . add_argument ( " --wasm_run_tests_in_browser " , action = " store_true " , help = " Run WebAssembly tests in browser " )
2022-03-22 18:55:45 +00:00
2021-10-12 05:04:50 +00:00
parser . add_argument (
2023-09-05 17:59:27 +00:00
" --enable_wasm_profiling " , action = " store_true " , help = " Enable WebAssembly profiling and preserve function names "
2022-04-26 16:35:16 +00:00
)
2021-04-30 05:22:52 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --enable_wasm_debug_info " , action = " store_true " , help = " Build WebAssembly with DWARF format debug info "
)
2022-03-22 18:55:45 +00:00
parser . add_argument ( " --wasm_malloc " , help = " Specify memory allocator for WebAssembly " )
2021-06-04 06:08:56 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --emscripten_settings " ,
nargs = " + " ,
action = " append " ,
help = " Extra emscripten settings to pass to emcc using ' -s <key>=<value> ' during build. " ,
)
build ONNXRuntime into WebAssembly (#6478)
* Simplified version of WebAssembly support to keep most of existing data structures and add cmake using Ninja and emcmake
* Clean up CMakeLists.txt and add an example to create and compute a kernel
* Load a model from bytes and remove graph building steps
* Add all cpu and contrib ops with mlas library
* WebAssembly build with Onnxruntime C/CXX API
* Use protobuf cmakefile directory instead of adding every necessary source file
* Fix invalid output at example
* add missing files
* Change an example to use Teams model and support ort mobile format
* add API for javascript
* fix input releasing in _ort_run()
* update API
* Let onnxruntime cmake build WebAssembly with option '--wasm'
* allow one-step building for wasm
* Make build script working on Linux and MacOS
* Fix broken build from Windows command
* Enable unit test on building WebAssembly
* Resolve comments
* update build flags
* wasm conv improvement from: 1) GemmV; 2) Depthwise direct convolution 3x3; 3) Direct convolution 3x3
* Cleaned mlas unittest.
* use glob
* update comments
* Update baseline due to loss scale fix (#6948)
* fix stream sync issue (#6954)
* Enable type reduction in EyeLike, Mod, random.cc CPU kernels. (#6960)
* Update EyeLike CPU kernel.
* Update Mod CPU kernel.
* Update Multinomial CPU kernel.
* Slight improvement to Pad CPU kernel binary size.
* Update RandomNormal[Like], RandomUniform[Like] CPU kernels.
* Fix warning from setting multiple MSVC warning level options. (#6917)
Fix warning from setting multiple MSVC warning level options. Replace an existing /Wn flag instead of always appending a new one.
* MLAS: quantized GEMM update (#6916)
Various updates to the int8_t GEMMs:
1) Add ARM64 udot kernel to take advantage of dot product instructions available in newer cores. Some models run 4x faster than the stock implementation we used before.
2) Refactor the x64 kernels to share common code for AVX2(u8u8/u8s8/avxvnni) vs AVX512(u8u8/u8s8/avx512vnni) to reduce binary size.
3) Extend kernels to support per-column zero points for matrix B. This is not currently wired to an operator.
* Implement QLinearAveragePool with unit tests. (#6896)
Implement QLinearAveragePool with unit tests.
* Attention fusion detect num_heads and hidden_size automatically (#6920)
* fixed type to experimental session constructor (#6950)
* fixed type to experimental session constructor
Co-authored-by: David Medine <david.medine@brainproducts.com>
* Update onnxruntime_perf_test.exe to accept free dimension overrides (#6962)
Co-authored-by: Ori Levari <orlevari@microsoft.com>
* Fix possible fd leak in NNAPI (#6966)
* Release buffers for prepacked tensors (#6820)
Unsolved problems:
1. One test failure was caused by a bug in Cudnn rnn kernels, when they can allocate a buffer and partially initialize it, the garbage data near tail of the buffer caused problem in some of the hardware. To attack this problem in a broader sense, should we add code in our allocators, and during a memory fuzzing test, fill an allocated buffer with garbage before returning to the caller?
2. Prepacking is used more widely than we know. For instance, Cudnn rnn kernels also cache their weights. They mix several weight tensors together into a single buffer, and never touch the original weight tensor anymore. This is the same idea with pre-pack, but they didn't override the virtual function, and they never tried to release those weight tensors, leading to memory waste. It also seems to me that there are some other kernels have similar behavior. Wonder how much memory we can save if we try to cleanup those too.
3. Turning off memory pattern planning does increase memory fragmentation, leading to out of memory error in some training test cases. Perhaps we can revisit the idea of pushing kernels-creation stage earlier, and then during initializer deserialization, we only avoid tracing those that will be prepacked.
* Enable type reduction for Range, ReverseSequence, ScatterND, Split, and Unique CPU kernels. (#6963)
* add CI
* fix test in ci
* fix flags for nsync in wasm build
* add copyright banner
* fix wasm source glob
* add missing exports
* resolve comments
* Perf gain by make packb wide to 4 from 16 on GEMM for WASM.
Remove no need direct conv in previous perf tuning.
* fix buildbreak introduced from latest master merge
* fix buildbreak in mlasi.h
* resolve all comments except MLAS
* rewrite packb related 3 functions for WASM_SCALAR separately rather than using #ifdef in each.
and other changes according to PR feedback in mlas.
* More complete scalar path in sgemm from Tracy.
* Fix edge case handling in depthwise conv2d kernel 3x3. where:
*) support input W==1 and H==1
*) recalc in accurate pad_right and pad_bottom
*) support hidden pad_right == 2 or pad_bottom == 2 when W == 1 or H==1 and no pad left/top
* Add more test coverage for conv depthwise from Tracy.
Fix one typo according to PR.
* resolve comments
* replace typedef by using
* do not use throw in OrtRun()
* output error message
Co-authored-by: Sunghoon <35605090+hanbitmyths@users.noreply.github.com>
Co-authored-by: Lei Zhang <zhang.huanning@hotmail.com>
Co-authored-by: Wei-Sheng Chin <wschin@outlook.com>
Co-authored-by: Tianlei Wu <tlwu@microsoft.com>
Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Co-authored-by: Tracy Sharpe <42477615+tracysh@users.noreply.github.com>
Co-authored-by: David Medine <david.eric.medine@gmail.com>
Co-authored-by: David Medine <david.medine@brainproducts.com>
Co-authored-by: Ori Levari <ori.levari@microsoft.com>
Co-authored-by: Ori Levari <orlevari@microsoft.com>
Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com>
Co-authored-by: Chen Fu <chenfucs@gmail.com>
2021-04-06 23:18:10 +00:00
2021-07-01 16:34:03 +00:00
# Enable onnxruntime-extensions
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --use_extensions " ,
action = " store_true " ,
2021-08-28 04:45:52 +00:00
help = " Enable custom operators in onnxruntime-extensions, use git submodule onnxruntime-extensions "
2022-04-26 16:35:16 +00:00
" in path cmake/external/onnxruntime-extensions by default. " ,
)
2021-08-28 04:45:52 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --extensions_overridden_path " ,
type = str ,
help = " Path to pre-pulled onnxruntime-extensions, will override default onnxruntime-extensions path. " ,
)
2021-07-01 16:34:03 +00:00
2018-11-20 00:48:22 +00:00
# Arguments needed by CI
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --cmake_path " , default = " cmake " , help = " Path to the CMake program. " )
2020-04-19 03:48:30 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --ctest_path " ,
default = " ctest " ,
help = " Path to the CTest program. It can be an empty string. If it is empty, "
" we will use this script driving the test programs directly. " ,
)
2020-04-19 03:48:30 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --skip_submodule_sync " ,
action = " store_true " ,
2023-04-17 17:11:44 +00:00
help = " Don ' t do a ' git submodule update ' . Makes the Update phase faster. " ,
2022-04-26 16:35:16 +00:00
)
parser . add_argument ( " --use_mimalloc " , action = " store_true " , help = " Use mimalloc allocator " )
parser . add_argument ( " --use_dnnl " , action = " store_true " , help = " Build with DNNL. " )
2020-11-13 04:17:54 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --dnnl_gpu_runtime " , action = " store " , default = " " , type = str . lower , help = " e.g. --dnnl_gpu_runtime ocl "
)
2020-11-13 04:17:54 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --dnnl_opencl_root " ,
action = " store " ,
default = " " ,
2020-11-13 04:17:54 +00:00
help = " Path to OpenCL SDK. "
2022-04-26 16:35:16 +00:00
' e.g. --dnnl_opencl_root " C:/Program Files (x86)/IntelSWTools/sw_dev_tools/OpenCL/sdk " ' ,
)
2020-04-19 03:48:30 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --use_openvino " ,
nargs = " ? " ,
2024-04-19 07:31:38 +00:00
const = " CPU " ,
2020-11-17 00:58:43 +00:00
type = _openvino_verify_device_type ,
2022-04-26 16:35:16 +00:00
help = " Build with OpenVINO for specific hardware. " ,
)
2023-12-01 17:16:44 +00:00
parser . add_argument (
" --dnnl_aarch64_runtime " , action = " store " , default = " " , type = str . lower , help = " e.g. --dnnl_aarch64_runtime acl "
)
parser . add_argument (
" --dnnl_acl_root " ,
action = " store " ,
default = " " ,
help = ' Path to ACL ROOT DIR. e.g. --dnnl_acl_root " $HOME/ComputeLibrary/ " ' ,
)
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --use_coreml " , action = " store_true " , help = " Build with CoreML support. " )
2023-05-09 04:25:10 +00:00
parser . add_argument ( " --use_webnn " , action = " store_true " , help = " Build with WebNN support. " )
2022-06-03 21:10:02 +00:00
parser . add_argument ( " --use_snpe " , action = " store_true " , help = " Build with SNPE support. " )
parser . add_argument ( " --snpe_root " , help = " Path to SNPE SDK root. " )
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --use_nnapi " , action = " store_true " , help = " Build with NNAPI support. " )
2024-06-29 04:48:34 +00:00
parser . add_argument ( " --use_vsinpu " , action = " store_true " , help = " Build with VSINPU support. " )
2020-04-19 03:48:30 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --nnapi_min_api " , type = int , help = " Minimum Android API level to enable NNAPI, should be no less than 27 "
)
2023-04-26 04:20:03 +00:00
parser . add_argument ( " --use_jsep " , action = " store_true " , help = " Build with JavaScript kernels. " )
2024-10-08 23:10:46 +00:00
parser . add_argument ( " --use_webgpu " , action = " store_true " , help = " Build with WebGPU support. " )
Add implementation of WebGPU EP (#22591)
### Description
This PR adds the actual implementation of the WebGPU EP based on
https://github.com/microsoft/onnxruntime/pull/22318.
This change includes the following:
<details>
<summary><b>core framework of WebGPU EP</b></summary>
- WebGPU EP factory classes for:
- handling WebGPU options
- creating WebGPU EP instance
- creating WebGPU context
- WebGPU Execution Provider classes
- GPU Buffer allocator
- data transfer
- Buffer management classes
- Buffer Manager
- BufferCacheManager
- DisabledCacheManager
- SimpleCacheManager
- LazyReleaseCacheManager
- BucketCacheManager
- Program classes
- Program (base)
- Program Cache Key
- Program Manager
- Shader helper classes
- Shader Helper
- ShaderIndicesHelper
- ShaderVariableHelper
- Utils
- GPU Query based profiler
- compute context
- string utils
- Miscs
- Python binding webgpu support (basic)
</details>
<details>
<summary><b>Kernel implementation</b></summary>
- onnx.ai (default opset):
- Elementwise (math): Abs, Neg, Floor, Ceil, Reciprocal, Sqrt, Exp, Erf,
Log, Sin, Cos, Tan, Asin, Acos, Atan, Sinh, Cosh, Asinh, Acosh, Atanh,
Tanh, Not, Cast
- Elementwise (activation): Sigmoid, HardSigmoid, Clip, Elu, Relu,
LeakyRelu, ThresholdedRelu, Gelu
- Binary (math): Add, Sub, Mul, Div, Pow, Equal, Greater,
GreaterOrEqual, Less, LessOrEqual
- (Tensors): Shape, Reshape, Squeeze, Unsqueeze
- Where
- Transpose
- Concat
- Expand
- Gather
- Tile
- Range
- LayerNormalization
- com.microsoft
- FastGelu
- MatMulNBits
- MultiHeadAttention
- RotaryEmbedding
- SkipLayerNormalization
- LayerNormalization
- SimplifiedLayerNormalization
- SkipSimplifiedLayerNormalization
</details>
<details>
<summary><b>Build, test and CI pipeline integration</b></summary>
- build works for Windows, macOS and iOS
- support onnxruntime_test_all and python node test
- added a new unit test for `--use_external_dawn` build flag.
- updated MacOS pipeline to build with WebGPU support
- added a new pipeline for WebGPU Windows
</details>
This change does not include:
- Node.js binding support for WebGPU (will be a separate PR)
2024-10-30 01:29:40 +00:00
parser . add_argument ( " --use_external_dawn " , action = " store_true " , help = " Treat Dawn as an external dependency. " )
2023-03-01 21:48:20 +00:00
parser . add_argument ( " --use_qnn " , action = " store_true " , help = " Build with QNN support. " )
parser . add_argument ( " --qnn_home " , help = " Path to QNN SDK dir. " )
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --use_rknpu " , action = " store_true " , help = " Build with RKNPU. " )
parser . add_argument ( " --use_preinstalled_eigen " , action = " store_true " , help = " Use pre-installed Eigen. " )
2020-03-13 23:54:55 +00:00
parser . add_argument ( " --eigen_path " , help = " Path to pre-installed Eigen. " )
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --enable_msinternal " , action = " store_true " , help = " Enable for Microsoft internal builds only. " )
2018-11-20 00:48:22 +00:00
parser . add_argument ( " --llvm_path " , help = " Path to llvm dir " )
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --use_vitisai " , action = " store_true " , help = " Build with Vitis-AI " )
parser . add_argument ( " --use_tvm " , action = " store_true " , help = " Build with TVM " )
parser . add_argument ( " --tvm_cuda_runtime " , action = " store_true " , default = False , help = " Build TVM with CUDA support " )
2022-07-13 08:48:42 +00:00
parser . add_argument (
" --use_tvm_hash " , action = " store_true " , help = " Build ipp-crypto for hash generation. It is used by TVM EP only "
)
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --use_tensorrt " , action = " store_true " , help = " Build with TensorRT " )
2023-04-05 14:53:29 +00:00
parser . add_argument (
" --use_tensorrt_builtin_parser " , action = " store_true " , default = True , help = " Use TensorRT builtin parser "
)
parser . add_argument ( " --use_tensorrt_oss_parser " , action = " store_true " , help = " Use TensorRT OSS parser " )
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --tensorrt_home " , help = " Path to TensorRT installation dir " )
2022-12-14 21:06:03 +00:00
parser . add_argument ( " --test_all_timeout " , default = " 10800 " , help = " Set timeout for onnxruntime_test_all " )
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --use_migraphx " , action = " store_true " , help = " Build with MIGraphX " )
parser . add_argument ( " --migraphx_home " , help = " Path to MIGraphX installation dir " )
parser . add_argument ( " --use_full_protobuf " , action = " store_true " , help = " Use the full protobuf library " )
2022-07-13 08:48:42 +00:00
parser . add_argument (
" --llvm_config " ,
type = str ,
default = " " ,
2023-09-05 17:59:27 +00:00
help = " Path to llvm-config.exe for LLVM built from sources. It is strongly needed for build on Windows " ,
2022-07-13 08:48:42 +00:00
)
2022-04-26 16:35:16 +00:00
parser . add_argument (
" --skip_onnx_tests " ,
action = " store_true " ,
help = " Explicitly disable all onnx related tests. Note: Use --skip_tests to skip all tests. " ,
)
parser . add_argument ( " --skip_winml_tests " , action = " store_true " , help = " Explicitly disable all WinML related tests " )
parser . add_argument ( " --skip_nodejs_tests " , action = " store_true " , help = " Explicitly disable all Node.js binding tests " )
2021-06-02 07:47:40 +00:00
2020-04-19 03:48:30 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --enable_msvc_static_runtime " , action = " store_true " , help = " Enable static linking of MSVC runtimes. "
)
2020-04-19 03:48:30 +00:00
parser . add_argument (
" --cmake_generator " ,
2023-04-10 17:41:04 +00:00
choices = [
" MinGW Makefiles " ,
2023-06-09 15:51:49 +00:00
" Ninja " ,
2023-04-10 17:41:04 +00:00
" NMake Makefiles " ,
2024-07-12 04:21:38 +00:00
" NMake Makefiles JOM " ,
2023-06-09 15:51:49 +00:00
" Unix Makefiles " ,
" Visual Studio 17 2022 " ,
2023-04-10 17:41:04 +00:00
" Xcode " ,
] ,
2023-04-12 20:47:58 +00:00
default = None ,
2023-07-07 15:11:44 +00:00
help = " Specify the generator that CMake invokes. " ,
2022-04-26 16:35:16 +00:00
)
parser . add_argument ( " --use_dml " , action = " store_true " , help = " Build with DirectML. " )
2020-04-19 03:48:30 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --dml_path " ,
type = str ,
default = " " ,
help = " Path to a custom DirectML installation (must have bin/, lib/, and include/ subdirectories). " ,
)
parser . add_argument ( " --use_winml " , action = " store_true " , help = " Build with WinML. " )
2020-04-19 03:48:30 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --winml_root_namespace_override " , type = str , help = " Specify the namespace that WinML builds into. "
)
2020-04-19 03:48:30 +00:00
parser . add_argument (
2022-06-09 01:05:11 +00:00
" --dml_external_project " , action = " store_true " , help = " Build with DirectML as an external project. "
2022-05-10 23:57:47 +00:00
)
2021-06-09 02:43:59 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --use_telemetry " , action = " store_true " , help = " Only official builds can set this flag to enable telemetry. "
)
parser . add_argument ( " --enable_wcos " , action = " store_true " , help = " Build for Windows Core OS. " )
2024-01-12 15:24:40 +00:00
# Do not enable LTO when the compiler is MSVC and the flag for generating debug symbols is set to /Z7 and training
# is also enabled. Because both LTO and /Zi could significantly increase *.obj/*.lib files' size, and on Windows
# there is a 4GB per file limit(ERROR LNK1248). We may solve the issue by splitting the big static libs to smaller
# ones. Before the refactoring work is done, we should avoid enabling LTO and ccache at the same time because ccache
# needs /Z7.
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --enable_lto " , action = " store_true " , help = " Enable Link Time Optimization " )
parser . add_argument ( " --enable_transformers_tool_test " , action = " store_true " , help = " Enable transformers tool test " )
2020-04-20 08:05:28 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --use_acl " ,
2024-09-13 03:51:59 +00:00
action = " store_true " ,
2022-04-26 16:35:16 +00:00
help = " Build with ACL for ARM architectures. " ,
)
parser . add_argument ( " --acl_home " , help = " Path to ACL home dir " )
parser . add_argument ( " --acl_libs " , help = " Path to ACL libraries " )
parser . add_argument ( " --use_armnn " , action = " store_true " , help = " Enable ArmNN Execution Provider. " )
2020-06-03 17:27:51 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --armnn_relu " , action = " store_true " , help = " Use the Relu operator implementation from the ArmNN EP. "
)
2020-07-22 05:25:58 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --armnn_bn " , action = " store_true " , help = " Use the Batch Normalization operator implementation from the ArmNN EP. "
)
parser . add_argument ( " --armnn_home " , help = " Path to ArmNN home dir " )
parser . add_argument ( " --armnn_libs " , help = " Path to ArmNN libraries " )
parser . add_argument ( " --build_micro_benchmarks " , action = " store_true " , help = " Build ONNXRuntime micro-benchmarks. " )
# options to reduce binary size
2020-10-22 16:29:44 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --minimal_build " ,
default = None ,
nargs = " * " ,
type = str . lower ,
help = " Create a build that only supports ORT format models. "
" See https://onnxruntime.ai/docs/tutorials/mobile/ for more information. "
" RTTI is automatically disabled in a minimal build. "
" To enable execution providers that compile kernels at runtime (e.g. NNAPI) pass ' extended ' "
" as a parameter. e.g. ' --minimal_build extended ' . "
" To enable support for custom operators pass ' custom_ops ' as a parameter. "
" e.g. ' --minimal_build custom_ops ' . This can be combined with an ' extended ' build by passing "
" ' --minimal_build extended custom_ops ' " ,
)
2020-10-22 16:29:44 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --include_ops_by_config " ,
type = str ,
2023-04-17 17:11:44 +00:00
help = " Include ops from config file. See /docs/Reduced_Operator_Kernel_build.md for more information. " ,
2022-04-26 16:35:16 +00:00
)
2020-06-24 03:07:53 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --enable_reduced_operator_type_support " ,
action = " store_true " ,
help = " If --include_ops_by_config is specified, and the configuration file has type reduction "
" information, limit the types individual operators support where possible to further "
" reduce the build size. "
" See /docs/Reduced_Operator_Kernel_build.md for more information. " ,
)
2020-09-04 21:59:01 +00:00
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --disable_contrib_ops " , action = " store_true " , help = " Disable contrib ops (reduces binary size) " )
parser . add_argument (
" --disable_ml_ops " , action = " store_true " , help = " Disable traditional ML ops (reduces binary size) "
)
2021-11-05 22:27:04 +00:00
# Please note in our CMakeLists.txt this is already default on. But in this file we reverse it to default OFF.
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --disable_rtti " , action = " store_true " , help = " Disable RTTI (reduces binary size) " )
2023-08-31 20:32:55 +00:00
parser . add_argument (
" --disable_types " ,
nargs = " + " ,
default = [ ] ,
choices = [ " float8 " , " optional " , " sparsetensor " ] ,
help = " Disable selected data types (reduces binary size) " ,
)
2021-04-24 00:22:31 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --disable_exceptions " ,
action = " store_true " ,
help = " Disable exceptions to reduce binary size. Requires --minimal_build. " ,
)
parser . add_argument ( " --rocm_version " , help = " The version of ROCM stack to use. " )
parser . add_argument ( " --use_rocm " , action = " store_true " , help = " Build with ROCm " )
2020-10-30 00:13:04 +00:00
parser . add_argument ( " --rocm_home " , help = " Path to ROCm installation dir " )
2020-12-08 18:55:02 +00:00
# Code coverage
2021-02-12 22:17:10 +00:00
parser . add_argument (
2024-07-22 20:37:32 +00:00
" --code_coverage " , action = " store_true " , help = " Generate code coverage when targeting Android (only). "
2022-04-26 16:35:16 +00:00
)
2022-08-22 16:40:40 +00:00
# lazy tensor support.
parser . add_argument (
" --enable_lazy_tensor " , action = " store_true " , help = " Enable use ORT as backend in Pytorch LazyTensor. "
)
2022-04-26 16:35:16 +00:00
parser . add_argument ( " --ms_experimental " , action = " store_true " , help = " Build microsoft experimental operators. " )
2022-08-22 16:40:40 +00:00
2021-08-28 18:05:21 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --enable_external_custom_op_schemas " ,
action = " store_true " ,
2021-08-28 18:05:21 +00:00
help = " Enable registering user defined custom operation schemas at shared library load time. \
2022-04-26 16:35:16 +00:00
This feature is only supported / available on Ubuntu . " ,
)
2021-11-15 16:16:20 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --external_graph_transformer_path " , type = str , help = " path to the external graph transformer dir. "
)
2021-11-15 16:16:20 +00:00
2021-11-30 06:44:50 +00:00
parser . add_argument (
2022-04-26 16:35:16 +00:00
" --enable_cuda_profiling " ,
action = " store_true " ,
help = " enable cuda kernel profiling, \
cupti library must be added to PATH beforehand . " ,
)
2022-09-22 21:53:40 +00:00
parser . add_argument ( " --use_cann " , action = " store_true " , help = " Build with CANN " )
parser . add_argument ( " --cann_home " , help = " Path to CANN installation dir " )
2021-11-30 06:44:50 +00:00
2022-08-27 02:38:03 +00:00
parser . add_argument (
" --enable_rocm_profiling " ,
action = " store_true " ,
help = " enable rocm kernel profiling. " ,
)
2022-06-03 10:22:34 +00:00
parser . add_argument ( " --use_xnnpack " , action = " store_true " , help = " Enable xnnpack EP. " )
2024-10-10 00:48:09 +00:00
parser . add_argument ( " --use_avx512 " , action = " store_true " , help = " Enable AVX512 instructions " )
2023-01-11 20:25:04 +00:00
parser . add_argument ( " --use_azure " , action = " store_true " , help = " Enable azure EP. " )
2022-06-03 10:22:34 +00:00
2022-12-15 23:19:07 +00:00
parser . add_argument ( " --use_cache " , action = " store_true " , help = " Use compiler cache in CI " )
integrate triton into ort (#15862)
### Description
In some scenarios, triton-written kernels are more performant than
CK or other handwritten kernels, so we implement a framework that
lets onnxruntime use these triton-written kernels.
This PR integrates triton into ort, so that ort can use kernels
that are written and compiled by triton.
The main changes focus on two parts:
1. a build part to compile triton written kernel and combine these
kernels into libonnxruntime_providers_rocm.so
2. a loader and launcher in c++, for loading and launch triton written
kernels.
#### Build
To compile triton written kernel, add a script
`tools/ci_build/compile_triton.py`. This script will dynamic load all
kernel files, compile them, and generate `triton_kernel_infos.a` and
`triton_kernel_infos.h`.
`triton_kernel_infos.a` contains all compiled kernel instructions, this
file will be combined into libonnxruntime_providers_rocm.so, using
--whole-archive flag.
`triton_kernel_infos.h` defines a const array that contains all the
metadata for each compiled kernel. These metadata will be used for load
and launch. So this header file is included by 'triton_kernel.cu' which
defines load and launch functions.
Add a build flag in build.py and CMakeList.txt, when building rocm
provider, it will call triton_kernel build command, and generate all
necessary files.
#### C++ Load and Launch
On c++ part, we implement load and launch functions in triton_kernel.cu
and triton_kernel.h.
These two files are located in `providers/cuda`, and when compiling for rocm
they will be hipified, so this part supports both cuda and rocm. But
currently we only call triton kernels in rocm.
We also implement a softmax triton op as an example. Because many
kernels are generated for the different input shapes of softmax, we use
TunableOp to select the best one.
### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
2023-05-17 01:35:28 +00:00
parser . add_argument ( " --use_triton_kernel " , action = " store_true " , help = " Use triton compiled kernels " )
Implement mutex-free spin lock for task queue (#14834)
Implemented "lock-free" spinlock to save CPU usage on context switching.
The change has been tested on queene service of Ads team, the lock-free
version of ort (40 threads) saves CPU usage on gen8 (128 logical
processors on 8 numa nodes) windows by nearly half, from 65% to 35%.
For 32 cores, the curve is flat:
Anubis, 32 vCPU, windows, hugging face models,
95 percentile E2E latency in ms:
model | mutex(ms) | mutex-free
--- | --- | ---
alvert_base_v2 | 34.21 | 34.09
bert_large_uncased | 116.27| 117.84
bart_base | 72.06 | 71.99
distilgpt2 | 25.43 | 25.02
vit_base_patch16_224 | 37.33 | 37.76
Anubis, 32 vCPU win, Linux, 1st party models,
95 percentile E2E latency in ms:
model | mutex(ms) | mutex-free
--- | --- | ---
deepthink_v2 | 24.35 | 22.95
bing_feeds | 36.96 | 36.48
deep_writes | 14.46 | 14.32
keypoints | 9.34 | 7.69
model11 | 1.71 | 1.66
model12 | 1.82 | 1.44
model2 | 4.21 | 3.95
model6 | 1.08 | 1.05
agiencoder | 0.99 | 0.93
geminet_transformer | 5.32 | 5.24
---------
Co-authored-by: Randy Shuai <rashuai@microsoft.com>
2023-05-19 17:12:10 +00:00
parser . add_argument ( " --use_lock_free_queue " , action = " store_true " , help = " Use lock-free task queue for threadpool. " )
integrate triton into ort (#15862)
### Description
In some scenarios, triton-written kernels are more performant than
CK or other handwritten kernels, so we implement a framework that
lets onnxruntime use these triton-written kernels.
This PR integrates triton into ort, so that ort can use kernels
that are written and compiled by triton.
The main changes focus on two parts:
1. a build part to compile triton written kernel and combine these
kernels into libonnxruntime_providers_rocm.so
2. a loader and launcher in c++, for loading and launch triton written
kernels.
#### Build
To compile triton written kernel, add a script
`tools/ci_build/compile_triton.py`. This script will dynamic load all
kernel files, compile them, and generate `triton_kernel_infos.a` and
`triton_kernel_infos.h`.
`triton_kernel_infos.a` contains all compiled kernel instructions, this
file will be combined into libonnxruntime_providers_rocm.so, using
--whole-archive flag.
`triton_kernel_infos.h` defines a const array that contains all the
metadata for each compiled kernel. These metadata will be used for load
and launch. So this header file is included by 'triton_kernel.cu' which
defines load and launch functions.
Add a build flag in build.py and CMakeList.txt, when building rocm
provider, it will call triton_kernel build command, and generate all
necessary files.
#### C++ Load and Launch
On c++ part, we implement load and launch functions in triton_kernel.cu
and triton_kernel.h.
These two files are located in `providers/cuda`, and when compiling for rocm
they will be hipified, so this part supports both cuda and rocm. But
currently we only call triton kernels in rocm.
We also implement a softmax triton op as an example. Because many
kernels are generated for the different input shapes of softmax, we use
TunableOp to select the best one.
### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
2023-05-17 01:35:28 +00:00
2023-03-27 21:46:04 +00:00
if not is_windows ( ) :
parser . add_argument (
" --allow_running_as_root " ,
action = " store_true " ,
help = " Allow build to be run as root user. This is not allowed by default. " ,
)
2022-03-30 01:42:57 +00:00
args = parser . parse_args ( )
if args . android_sdk_path :
args . android_sdk_path = os . path . normpath ( args . android_sdk_path )
if args . android_ndk_path :
args . android_ndk_path = os . path . normpath ( args . android_ndk_path )
2022-11-28 18:24:34 +00:00
if args . enable_wasm_api_exception_catching :
# if we catch on api level, we don't want to catch all
args . disable_wasm_exception_catching = True
if not args . disable_wasm_exception_catching or args . enable_wasm_api_exception_catching :
# doesn't make sense to catch if no one throws
args . enable_wasm_exception_throwing_override = True
2023-04-12 20:47:58 +00:00
if args . cmake_generator is None and is_windows ( ) :
2023-05-16 17:34:34 +00:00
args . cmake_generator = " Ninja " if args . build_wasm else " Visual Studio 17 2022 "
2023-04-12 20:47:58 +00:00
2024-11-06 17:54:55 +00:00
if args . enable_cuda_nhwc_ops :
warnings . warn (
" The argument ' --enable_cuda_nhwc_ops ' is deprecated and is default to True. " , DeprecationWarning
)
2022-03-30 01:42:57 +00:00
return args
2018-11-20 00:48:22 +00:00
2020-04-19 03:48:30 +00:00
2021-02-22 22:05:00 +00:00
def is_reduced_ops_build(args):
    """Return whether an operator-reduction config file was supplied.

    Args:
        args: Parsed command-line namespace; only ``include_ops_by_config`` is read.

    Returns:
        bool: ``True`` when ``args.include_ops_by_config`` is not ``None``.
    """
    return args.include_ops_by_config is not None
2018-12-18 21:23:32 +00:00
def resolve_executable_path(command_or_path):
    """Returns the absolute path of an executable.

    Args:
        command_or_path: A command name or a path to an executable. ``None``,
            an empty string, or a whitespace-only string means "nothing to resolve".

    Returns:
        The absolute path found by ``shutil.which``, or ``None`` when no
        command was given.

    Raises:
        BuildError: If a command was given but ``shutil.which`` cannot find it.
    """
    # Nothing was requested, so there is nothing to look up.
    if not command_or_path or not command_or_path.strip():
        return None

    executable_path = shutil.which(command_or_path)
    if executable_path is None:
        raise BuildError(f"Failed to resolve executable path for '{command_or_path}'.")
    return os.path.abspath(executable_path)
2018-12-18 21:23:32 +00:00
2020-04-19 03:48:30 +00:00
2019-12-27 20:17:06 +00:00
def get_linux_distro(os_release_path="/etc/os-release"):
    """Return the ``(NAME, VERSION)`` pair read from an os-release file.

    Args:
        os_release_path: File to read. Defaults to ``/etc/os-release``.

    Returns:
        tuple[str, str]: The ``NAME`` and ``VERSION`` values with surrounding
        double quotes removed. A missing key yields ``""``; an unreadable or
        undecodable file yields ``("", "")``.
    """
    dist_info = {}
    try:
        with open(os_release_path) as f:
            for line in f:
                entry = line.strip()
                # Blank lines, '#' comments and anything else without '=' are
                # skipped so that one such line does not discard the whole file.
                if not entry or entry.startswith("#") or "=" not in entry:
                    continue
                key, value = entry.split("=", 1)
                dist_info[key] = value
    except (OSError, ValueError):
        # OSError: file missing or unreadable.
        # ValueError: includes UnicodeDecodeError raised while reading.
        return "", ""
    return dist_info.get("NAME", "").strip('"'), dist_info.get("VERSION", "").strip('"')
2019-12-27 20:17:06 +00:00
2020-04-19 03:48:30 +00:00
2018-11-20 00:48:22 +00:00
def get_config_build_dir(build_dir, config):
    """Return the build directory used for one configuration.

    Args:
        build_dir: Root build directory.
        config: Configuration name, used as the sub-directory name.

    Returns:
        str: ``build_dir`` joined with ``config``.
    """
    # build directory per configuration
    return os.path.join(build_dir, config)
2020-04-19 03:48:30 +00:00
2022-06-04 03:00:54 +00:00
def run_subprocess(
    args,
    cwd=None,
    capture_stdout=False,
    dll_path=None,
    shell=False,
    env=None,
    python_path=None,
    cuda_home=None,
):
    """Run a command with an environment derived from the current process.

    The child environment starts as a copy of ``os.environ`` and is adjusted
    by the optional arguments before the command is passed to ``run``.

    Args:
        args: Sequence of strings forming the command line. A plain string is rejected.
        cwd: Forwarded to ``run``.
        capture_stdout: Forwarded to ``run``.
        dll_path: If set, prepended to ``PATH`` on Windows, otherwise appended
            to ``LD_LIBRARY_PATH``.
        shell: Forwarded to ``run``.
        env: Extra environment variables. Applied last, so they override
            everything computed here.
        python_path: If set, appended to ``PYTHONPATH``.
        cuda_home: If set, its ``bin`` sub-directory is prepended to ``PATH``.

    Returns:
        Whatever ``run`` returns.

    Raises:
        ValueError: If ``args`` is a string rather than a sequence of strings.
    """
    if env is None:
        env = {}
    if isinstance(args, str):
        raise ValueError("args should be a sequence of strings, not a string")

    my_env = os.environ.copy()
    if dll_path:
        if is_windows():
            if "PATH" in my_env:
                my_env["PATH"] = dll_path + os.pathsep + my_env["PATH"]
            else:
                my_env["PATH"] = dll_path
        else:
            if "LD_LIBRARY_PATH" in my_env:
                my_env["LD_LIBRARY_PATH"] += os.pathsep + dll_path
            else:
                my_env["LD_LIBRARY_PATH"] = dll_path

    # Add nvcc's folder to PATH env so that our cmake file can find nvcc
    if cuda_home:
        cuda_bin = os.path.join(cuda_home, "bin")
        # PATH may be absent from the environment; handle that the same way
        # the dll_path branch does instead of raising KeyError.
        if "PATH" in my_env:
            my_env["PATH"] = cuda_bin + os.pathsep + my_env["PATH"]
        else:
            my_env["PATH"] = cuda_bin

    if python_path:
        if "PYTHONPATH" in my_env:
            my_env["PYTHONPATH"] += os.pathsep + python_path
        else:
            my_env["PYTHONPATH"] = python_path

    # Caller-supplied variables take precedence over everything computed above.
    my_env.update(env)

    log.info(" ".join(args))
    return run(*args, cwd=cwd, capture_stdout=capture_stdout, shell=shell, env=my_env)
2018-11-20 00:48:22 +00:00
2020-04-19 03:48:30 +00:00
2018-11-20 00:48:22 +00:00
def update_submodules(source_dir):
    """Sync and update all git submodules, recursively.

    Runs ``git submodule sync --recursive`` followed by
    ``git submodule update --init --recursive`` via ``run_subprocess``.

    Args:
        source_dir: Working directory in which the git commands are run.
    """
    run_subprocess(["git", "submodule", "sync", "--recursive"], cwd=source_dir)
    run_subprocess(["git", "submodule", "update", "--init", "--recursive"], cwd=source_dir)
2020-04-19 03:48:30 +00:00
2018-11-20 00:48:22 +00:00
2022-09-07 00:33:27 +00:00
def setup_test_data(source_onnx_model_dir, dest_model_dir_name, build_dir, configs):
    """Create a symlink/shortcut to the ONNX models directory under ``build_dir``.

    On Windows a link is created at ``build_dir/dest_model_dir_name`` and also
    inside each per-configuration build directory, which is created if needed.
    On other platforms only the ``build_dir``-level symlink is created and
    ``configs`` is not used.

    Args:
        source_onnx_model_dir: Existing directory holding the ONNX models. If
            it does not exist, no link to it is created.
        dest_model_dir_name: Name of the link to create.
        build_dir: Root build directory.
        configs: Iterable of configuration names (Windows only).
    """
    # There are currently two sources of ONNX models: one is built into the OS image, the other
    # comes from {source_dir}/js/test, which is downloaded from the ONNX web site.
    src_model_dir = os.path.join(build_dir, dest_model_dir_name)

    if is_windows():
        # NOTE(review): mklink is presumably invoked with shell=True because it is a
        # cmd.exe built-in rather than a standalone executable - confirm.
        if os.path.exists(source_onnx_model_dir) and not os.path.exists(src_model_dir):
            log.debug(f"creating shortcut {source_onnx_model_dir} -> {src_model_dir}")
            run_subprocess(["mklink", "/D", "/J", src_model_dir, source_onnx_model_dir], shell=True)
        for config in configs:
            config_build_dir = get_config_build_dir(build_dir, config)
            os.makedirs(config_build_dir, exist_ok=True)
            dest_model_dir = os.path.join(config_build_dir, dest_model_dir_name)
            if os.path.exists(source_onnx_model_dir) and not os.path.exists(dest_model_dir):
                log.debug(f"creating shortcut {source_onnx_model_dir} -> {dest_model_dir}")
                run_subprocess(["mklink", "/D", "/J", dest_model_dir, source_onnx_model_dir], shell=True)
            elif os.path.exists(src_model_dir) and not os.path.exists(dest_model_dir):
                # The source directory is gone; link to the build_dir-level link instead.
                log.debug(f"creating shortcut {src_model_dir} -> {dest_model_dir}")
                run_subprocess(["mklink", "/D", "/J", dest_model_dir, src_model_dir], shell=True)
    else:
        if os.path.exists(source_onnx_model_dir) and not os.path.exists(src_model_dir):
            log.debug(f"create symlink {source_onnx_model_dir} -> {src_model_dir}")
            os.symlink(source_onnx_model_dir, src_model_dir, target_is_directory=True)
2020-04-19 03:48:30 +00:00
2020-10-07 02:03:33 +00:00
def use_dev_mode(args):
    """Decide whether the build should run in "dev mode".

    Dev mode is turned off when any of the following holds:
      * ``--compile_no_warning_as_error`` was given,
      * the ACL or ArmNN execution provider is enabled,
      * the target is iOS or visionOS and the host is macOS,
      * the ``SYSTEM_COLLECTIONURI`` environment variable is set to something
        other than ``https://dev.azure.com/onnxruntime/``.

    Args:
        args: Parsed command-line namespace. Reads ``compile_no_warning_as_error``,
            ``use_acl``, ``use_armnn``, ``ios`` and ``visionos``.

    Returns:
        bool: ``True`` if dev mode should be used, ``False`` otherwise.
    """
    if args.compile_no_warning_as_error or args.use_acl or args.use_armnn:
        return False
    if (args.ios or args.visionos) and is_macOS():
        return False

    collection_uri = os.getenv("SYSTEM_COLLECTIONURI")
    # An unset or empty variable leaves dev mode on.
    return not (collection_uri and collection_uri != "https://dev.azure.com/onnxruntime/")
2020-10-07 02:03:33 +00:00
2022-03-22 02:10:47 +00:00
def add_default_definition(definition_list, key, default_value):
    """Append ``key=default_value`` unless ``key`` is already defined.

    A key counts as defined when some entry starts with ``key + "="``.

    Args:
        definition_list: List of ``"KEY=VALUE"`` strings; modified in place.
        key: Definition name to look for.
        default_value: Value to use when ``key`` is not yet defined.

    Returns:
        list: The same ``definition_list`` object, whether or not an entry was
        appended.
    """
    prefix = key + "="
    if not any(definition.startswith(prefix) for definition in definition_list):
        definition_list.append(prefix + default_value)
    return definition_list
2021-07-31 00:16:37 +00:00
2022-03-22 18:55:45 +00:00
def normalize_arg_list(nested_list):
    """Flatten one level of nesting from a list of lists.

    Args:
        nested_list: An iterable of iterables, or a falsy value such as
            ``None`` or ``[]``.

    Returns:
        list: All inner items in order; ``[]`` when ``nested_list`` is falsy.
    """
    if not nested_list:
        return []
    return [item for inner in nested_list for item in inner]
2023-09-05 17:59:27 +00:00
def number_of_parallel_jobs(args):
    """Return the number of parallel build jobs to use.

    Args:
        args: Parsed command-line namespace; only ``parallel`` is read.

    Returns:
        int: ``args.parallel`` when it is non-zero. When it is ``0``, the CPU
        count, or ``1`` if the CPU count cannot be determined.
    """
    if args.parallel == 0:
        # os.cpu_count() returns None when the count is undetermined.
        return os.cpu_count() or 1
    return args.parallel
def number_of_nvcc_threads(args):
    """Choose how many threads nvcc should use.

    An explicit non-negative ``args.nvcc_threads`` is returned unchanged.
    Otherwise the value is estimated from the available memory reported by
    ``psutil``, falling back to ``1`` when ``psutil`` is not installed.

    Args:
        args: Parsed command-line namespace. Reads ``nvcc_threads``, and passes
            ``args`` to ``number_of_parallel_jobs`` when estimating.

    Returns:
        int: The number of nvcc threads. The estimated value is at least 1.
    """
    if args.nvcc_threads >= 0:
        return args.nvcc_threads

    gib = 1024 * 1024 * 1024
    nvcc_threads = 1
    try:
        import psutil

        available_memory = psutil.virtual_memory().available
        if isinstance(available_memory, int) and available_memory > 0:
            if available_memory > 60 * gib:
                # When available memory is large enough, chance of OOM is small.
                nvcc_threads = 4
            else:
                # NVCC needs a lot of memory to compile the flash attention / cutlass fmha cu files.
                # Here we select number of threads to ensure each thread has enough memory (>= 4 GB). For example,
                # Standard_NC4as_T4_v3 has 4 CPUs and 28 GB memory. When parallel=4 and nvcc_threads=2,
                # total nvcc threads is 4 * 2, which is barely able to build in 28 GB memory so we will use nvcc_threads=1.
                memory_per_thread = 4 * gib
                fmha_cu_files = 4 if is_windows() else 16
                # "or 1" guards against an undetermined (None) job count.
                fmha_parallel_jobs = min(fmha_cu_files, number_of_parallel_jobs(args) or 1)
                nvcc_threads = max(1, available_memory // (memory_per_thread * fmha_parallel_jobs))
                print(
                    f"nvcc_threads={nvcc_threads} to ensure memory per thread >= 4GB for available_memory={available_memory} and fmha_parallel_jobs={fmha_parallel_jobs}"
                )
    except ImportError:
        print(
            "Failed to import psutil. Please `pip install psutil` for better estimation of nvcc threads. Use nvcc_threads=1"
        )

    return nvcc_threads
2022-04-26 16:35:16 +00:00
def generate_build_tree (
cmake_path ,
source_dir ,
build_dir ,
cuda_home ,
cudnn_home ,
rocm_home ,
mpi_home ,
nccl_home ,
tensorrt_home ,
migraphx_home ,
acl_home ,
acl_libs ,
armnn_home ,
armnn_libs ,
2023-03-01 21:48:20 +00:00
qnn_home ,
2022-06-03 21:10:02 +00:00
snpe_root ,
2022-09-22 21:53:40 +00:00
cann_home ,
2022-04-26 16:35:16 +00:00
path_to_protoc_exe ,
configs ,
cmake_extra_defines ,
args ,
cmake_extra_args ,
) :
2018-11-20 00:48:22 +00:00
log . info ( " Generating CMake build tree " )
cmake_dir = os . path . join ( source_dir , " cmake " )
2022-11-07 17:06:28 +00:00
cmake_args = [ cmake_path , cmake_dir ]
if not use_dev_mode ( args ) :
cmake_args + = [ " --compile-no-warning-as-error " ]
2023-08-31 20:32:55 +00:00
types_to_disable = args . disable_types
Introduce float 8 types (#14731)
### Description
The PR implements FloatE4M3FN, FloatE5M2, FloatE4MEFNUZ, FloatE5M2FNUZ
as described in PR https://github.com/onnx/onnx/pull/4805. It uses CUDA
API to cast float/half to float8 if CUDA>=11.8, a custom implementation
if CUDA<11.8.
* It implements, Cast, QuantizeLinear, DequantizeLinear for all types on
CPU, only for types FloatE4M3FN, FloatE5M2 on CUDA.
* It extends the supported types for control flow operator, Shape,
Reshape, Identity, If, Loop, Scan, Reshape
* It implements Equal(19).
* Cast, QuantizeLinear, DequantizeLinear operators now support a
parameter `saturate` only valid for float 8 types. It is true by
default. In that case, any value out of range is converted into the
maximum float 8 value. If false, it is infinite.
* QuantizeLinear, DequantizeLinear now supports multiple scales on CUDA
(and ROCm by extension), scale = 1D tensor with one scale per channel
### Motivation and Context
Supports latest onnx version.
Fixes
[AB#15395](https://aiinfra.visualstudio.com/6a833879-cd9b-44a4-a9de-adc2d818f13c/_workitems/edit/15395)
---------
Co-authored-by: Xavier Dupre <xadupre@microsoft.com@orttrainingdev8.d32nl1ml4oruzj4qz3bqlggovf.px.internal.cloudapp.net>
Co-authored-by: Randy Shuai <rashuai@microsoft.com>
Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Co-authored-by: Scott McKay <Scott.McKay@microsoft.com>
2023-05-30 20:25:58 +00:00
# enable/disable float 8 types
2023-12-11 03:37:29 +00:00
disable_float8_types = args . android or ( " float8 " in types_to_disable )
2023-08-31 20:32:55 +00:00
disable_optional_type = " optional " in types_to_disable
disable_sparse_tensors = " sparsetensor " in types_to_disable
Introduce float 8 types (#14731)
### Description
The PR implements FloatE4M3FN, FloatE5M2, FloatE4MEFNUZ, FloatE5M2FNUZ
as described in PR https://github.com/onnx/onnx/pull/4805. It uses CUDA
API to cast float/half to float8 if CUDA>=11.8, a custom implementation
if CUDA<11.8.
* It implements, Cast, QuantizeLinear, DequantizeLinear for all types on
CPU, only for types FloatE4M3FN, FloatE5M2 on CUDA.
* It extends the supported types for control flow operator, Shape,
Reshape, Identity, If, Loop, Scan, Reshape
* It implements Equal(19).
* Cast, QuantizeLinear, DequantizeLinear operators now support a
parameter `saturate` only valid for float 8 types. It is true by
default. In that case, any value out of range is converted into the
maximum float 8 value. If false, it is infinite.
* QuantizeLinear, DequantizeLinear now supports multiple scales on CUDA
(and ROCm by extension), scale = 1D tensor with one scale per channel
### Motivation and Context
Supports latest onnx version.
Fixes
[AB#15395](https://aiinfra.visualstudio.com/6a833879-cd9b-44a4-a9de-adc2d818f13c/_workitems/edit/15395)
---------
Co-authored-by: Xavier Dupre <xadupre@microsoft.com@orttrainingdev8.d32nl1ml4oruzj4qz3bqlggovf.px.internal.cloudapp.net>
Co-authored-by: Randy Shuai <rashuai@microsoft.com>
Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Co-authored-by: Scott McKay <Scott.McKay@microsoft.com>
2023-05-30 20:25:58 +00:00
2022-11-07 17:06:28 +00:00
cmake_args + = [
2021-02-16 04:46:51 +00:00
" -Donnxruntime_RUN_ONNX_TESTS= " + ( " ON " if args . enable_onnx_tests else " OFF " ) ,
2020-04-19 03:48:30 +00:00
" -Donnxruntime_GENERATE_TEST_REPORTS=ON " ,
2021-06-03 06:36:49 +00:00
# There are two ways of locating python C API header file. "find_package(PythonLibs 3.5 REQUIRED)"
# and "find_package(Python 3.5 COMPONENTS Development.Module)". The first one is deprecated and it
# depends on the "PYTHON_EXECUTABLE" variable. The second needs "Python_EXECUTABLE". Here we set both
# of them to get the best compatibility.
" -DPython_EXECUTABLE= " + sys . executable ,
2020-04-19 03:48:30 +00:00
" -DPYTHON_EXECUTABLE= " + sys . executable ,
2024-09-10 23:39:27 +00:00
" -Donnxruntime_USE_VCPKG= " + ( " ON " if args . use_vcpkg else " OFF " ) ,
2021-12-08 01:56:58 +00:00
" -Donnxruntime_USE_MIMALLOC= " + ( " ON " if args . use_mimalloc else " OFF " ) ,
2021-02-16 04:46:51 +00:00
" -Donnxruntime_ENABLE_PYTHON= " + ( " ON " if args . enable_pybind else " OFF " ) ,
2020-04-19 03:48:30 +00:00
" -Donnxruntime_BUILD_CSHARP= " + ( " ON " if args . build_csharp else " OFF " ) ,
" -Donnxruntime_BUILD_JAVA= " + ( " ON " if args . build_java else " OFF " ) ,
2020-05-27 20:30:22 +00:00
" -Donnxruntime_BUILD_NODEJS= " + ( " ON " if args . build_nodejs else " OFF " ) ,
2021-04-27 17:06:30 +00:00
" -Donnxruntime_BUILD_OBJC= " + ( " ON " if args . build_objc else " OFF " ) ,
2021-02-16 04:46:51 +00:00
" -Donnxruntime_BUILD_SHARED_LIB= " + ( " ON " if args . build_shared_lib else " OFF " ) ,
2021-04-15 23:47:53 +00:00
" -Donnxruntime_BUILD_APPLE_FRAMEWORK= " + ( " ON " if args . build_apple_framework else " OFF " ) ,
2020-04-19 03:48:30 +00:00
" -Donnxruntime_USE_DNNL= " + ( " ON " if args . use_dnnl else " OFF " ) ,
The initial PR for NNAPI EP (#4287)
* Move nnapi dnnlib to subfolder
* dnnlib compile settings
* add nnapi buildin build.py
* add onnxruntime_USE_NNAPI_BUILTIN
* compile using onnxruntime_USE_NNAPI_BUILTIN
* remove dnnlib from built in code
* Group onnxruntime_USE_NNAPI_BUILTIN sources
* add file stubs
* java 32bit compile error
* built in nnapi support 5-26
* init working version
* initializer support
* fix crash on free execution
* add dynamic input support
* bug fixes for dynamic input shape, add mul support, working on conv and batchnorm
* Add batchnormalization, add overflow check for int64 attributes
* add global average/max pool and reshape
* minor changes
* minor changes
* add skip relu and options to use different type of memory
* small bug fix for in operator relu
* bug fix for nnapi
* add transpose support, minor bug fix
* Add transpose support
* minor bug fixes, depthwise conv weight fix
* fixed the bug where the onnx model input has mismatch order than the nnapi model input
* add helper to add scalar operand
* add separated opbuilder to handle single operator
* add cast operator
* fixed reshape, moved some logs to verbose
* Add softmax and identity support, change shaper calling signature, and add support for int32 output
* changed the way to execute the NNAPI
* move NNMemory and InputOutputInfo into Model class
* add limited support for input dynamic shape
* add gemm support, fixed crash when allocating big array on stack
* add abs/exp/floor/log/sigmoid/neg/sin/sqrt/tanh support
* better dynamic input shape support;
* add more check for IsOpSupportedImpl, refactored some code
* some code style fix, switch to safeint
* Move opbuilders to a map with single instance, minor bug fixes
* add GetUniqueName for new temp tensors
* change from throw std to ort_throw
* build settings change and 3rd party notice update
* add readme for nnapi_lib, move to ort log, add comments to public functions, clean the code
* add android log sink and more logging changes, add new string for NnApiErrorDescription
* add nnapi execution options/fp16 relax
* fix a dnnlibrary build break
* addressed review comments
* address review comments, changed adding output for subgraph in NnapiExecutionProvider::GetCapability, minor issue fixes
* formatting in build.py
* more formatting fix in build.py, return fail status instead of throw in compute_func
* moved android_log_sink to platform folder, minor coding style changes
* addressed review comments
2020-06-26 07:02:39 +00:00
" -Donnxruntime_USE_NNAPI_BUILTIN= " + ( " ON " if args . use_nnapi else " OFF " ) ,
2024-06-29 04:48:34 +00:00
" -Donnxruntime_USE_VSINPU= " + ( " ON " if args . use_vsinpu else " OFF " ) ,
Initial PR for RKNPU execution provider (#3609)
* Initial RKNPU execution provider
* Init
* Support Ops:
Conv, Relu, Clip, LeakyRelu,
MaxPool, AveragePool, GlobalAveragePool,
Concat, Softmax, BatchNormalization, Gemm,
Add, Mul, Sub,
Reshape, Squeeze, Unsqueeze,
Flatten, Transpose,
QLinearConv, DequantizeLinear
* Add rknpu unittest
* Update BUILD.md and Add RKNPU-ExecutionProvider.md
* misc code update
* fix CLIP accuracy issue.
* fix "Error: Duplicate definition of name".
* move rknpu_ddk out of onnxruntime submodule.
* remove temporary code.
* add rknpu namespace.
* update misc of node_attr_helper
* add const & comment for onnx_converter
* add const & comment for shaper
* unify variable name
Co-authored-by: dkm <dkm@rock-chips.com>
Co-authored-by: George Wu <jywu@microsoft.com>
2020-05-06 03:36:47 +00:00
" -Donnxruntime_USE_RKNPU= " + ( " ON " if args . use_rknpu else " OFF " ) ,
2022-09-07 22:11:18 +00:00
" -Donnxruntime_USE_LLVM= " + ( " ON " if args . use_tvm else " OFF " ) ,
2021-02-16 04:46:51 +00:00
" -Donnxruntime_ENABLE_MICROSOFT_INTERNAL= " + ( " ON " if args . enable_msinternal else " OFF " ) ,
2020-05-19 12:32:32 +00:00
" -Donnxruntime_USE_VITISAI= " + ( " ON " if args . use_vitisai else " OFF " ) ,
2020-04-19 03:48:30 +00:00
" -Donnxruntime_USE_TENSORRT= " + ( " ON " if args . use_tensorrt else " OFF " ) ,
2023-04-05 14:53:29 +00:00
" -Donnxruntime_USE_TENSORRT_BUILTIN_PARSER= "
+ ( " ON " if args . use_tensorrt_builtin_parser and not args . use_tensorrt_oss_parser else " OFF " ) ,
2022-02-15 09:21:02 +00:00
# set vars for TVM
" -Donnxruntime_USE_TVM= " + ( " ON " if args . use_tvm else " OFF " ) ,
" -Donnxruntime_TVM_CUDA_RUNTIME= " + ( " ON " if args . use_tvm and args . tvm_cuda_runtime else " OFF " ) ,
2022-07-13 08:48:42 +00:00
" -Donnxruntime_TVM_USE_HASH= " + ( " ON " if args . use_tvm_hash else " OFF " ) ,
2020-05-26 20:24:59 +00:00
# set vars for migraphx
" -Donnxruntime_USE_MIGRAPHX= " + ( " ON " if args . use_migraphx else " OFF " ) ,
2020-09-04 21:59:01 +00:00
" -Donnxruntime_DISABLE_CONTRIB_OPS= " + ( " ON " if args . disable_contrib_ops else " OFF " ) ,
" -Donnxruntime_DISABLE_ML_OPS= " + ( " ON " if args . disable_ml_ops else " OFF " ) ,
2022-04-26 16:35:16 +00:00
" -Donnxruntime_DISABLE_RTTI= "
+ ( " ON " if args . disable_rtti or ( args . minimal_build is not None and not args . enable_pybind ) else " OFF " ) ,
2020-09-04 21:59:01 +00:00
" -Donnxruntime_DISABLE_EXCEPTIONS= " + ( " ON " if args . disable_exceptions else " OFF " ) ,
2021-02-16 04:46:51 +00:00
# Need to use 'is not None' with minimal_build check as it could be an empty list.
2021-02-13 02:42:33 +00:00
" -Donnxruntime_MINIMAL_BUILD= " + ( " ON " if args . minimal_build is not None else " OFF " ) ,
2022-04-26 16:35:16 +00:00
" -Donnxruntime_EXTENDED_MINIMAL_BUILD= "
+ ( " ON " if args . minimal_build and " extended " in args . minimal_build else " OFF " ) ,
" -Donnxruntime_MINIMAL_BUILD_CUSTOM_OPS= "
+ (
" ON "
if ( args . minimal_build is not None and ( " custom_ops " in args . minimal_build or args . use_extensions ) )
else " OFF "
) ,
2021-02-22 22:05:00 +00:00
" -Donnxruntime_REDUCED_OPS_BUILD= " + ( " ON " if is_reduced_ops_build ( args ) else " OFF " ) ,
2020-04-19 03:48:30 +00:00
" -Donnxruntime_USE_DML= " + ( " ON " if args . use_dml else " OFF " ) ,
" -Donnxruntime_USE_WINML= " + ( " ON " if args . use_winml else " OFF " ) ,
2021-02-12 22:17:10 +00:00
" -Donnxruntime_BUILD_MS_EXPERIMENTAL_OPS= " + ( " ON " if args . ms_experimental else " OFF " ) ,
2021-02-16 04:46:51 +00:00
" -Donnxruntime_USE_TELEMETRY= " + ( " ON " if args . use_telemetry else " OFF " ) ,
2020-04-19 03:48:30 +00:00
" -Donnxruntime_ENABLE_LTO= " + ( " ON " if args . enable_lto else " OFF " ) ,
2020-04-20 08:05:28 +00:00
" -Donnxruntime_USE_ACL= " + ( " ON " if args . use_acl else " OFF " ) ,
2021-02-16 04:46:51 +00:00
" -Donnxruntime_USE_ARMNN= " + ( " ON " if args . use_armnn else " OFF " ) ,
" -Donnxruntime_ARMNN_RELU_USE_CPU= " + ( " OFF " if args . armnn_relu else " ON " ) ,
" -Donnxruntime_ARMNN_BN_USE_CPU= " + ( " OFF " if args . armnn_bn else " ON " ) ,
2023-04-26 04:20:03 +00:00
" -Donnxruntime_USE_JSEP= " + ( " ON " if args . use_jsep else " OFF " ) ,
2024-10-08 23:10:46 +00:00
" -Donnxruntime_USE_WEBGPU= " + ( " ON " if args . use_webgpu else " OFF " ) ,
Add implementation of WebGPU EP (#22591)
### Description
This PR adds the actual implementation of the WebGPU EP based on
https://github.com/microsoft/onnxruntime/pull/22318.
This change includes the following:
<details>
<summary><b>core framework of WebGPU EP</b></summary>
- WebGPU EP factory classes for:
- handling WebGPU options
- creating WebGPU EP instance
- creating WebGPU context
- WebGPU Execution Provider classes
- GPU Buffer allocator
- data transfer
- Buffer management classes
- Buffer Manager
- BufferCacheManager
- DisabledCacheManager
- SimpleCacheManager
- LazyReleaseCacheManager
- BucketCacheManager
- Program classes
- Program (base)
- Program Cache Key
- Program Manager
- Shader helper classes
- Shader Helper
- ShaderIndicesHelper
- ShaderVariableHelper
- Utils
- GPU Query based profiler
- compute context
- string utils
- Miscs
- Python binding webgpu support (basic)
</details>
<details>
<summary><b>Kernel implementation</b></summary>
- onnx.ai (default opset):
- Elementwise (math): Abs, Neg, Floor, Ceil, Reciprocal, Sqrt, Exp, Erf,
Log, Sin, Cos, Tan, Asin, Acos, Atan, Sinh, Cosh, Asinh, Acosh, Atanh,
Tanh, Not, Cast
- Elementwise (activation): Sigmoid, HardSigmoid, Clip, Elu, Relu,
LeakyRelu, ThresholdedRelu, Gelu
- Binary (math): Add, Sub, Mul, Div, Pow, Equal, Greater,
GreaterOrEqual, Less, LessOrEqual
- (Tensors): Shape, Reshape, Squeeze, Unsqueeze
- Where
- Transpose
- Concat
- Expand
- Gather
- Tile
- Range
- LayerNormalization
- com.microsoft
- FastGelu
- MatMulNBits
- MultiHeadAttention
- RotaryEmbedding
- SkipLayerNormalization
- LayerNormalization
- SimplifiedLayerNormalization
- SkipSimplifiedLayerNormalization
</details>
<details>
<summary><b>Build, test and CI pipeline integration</b></summary>
- build works for Windows, macOS and iOS
- support onnxruntime_test_all and python node test
- added a new unit test for `--use_external_dawn` build flag.
- updated MacOS pipeline to build with WebGPU support
- added a new pipeline for WebGPU Windows
</details>
This change does not include:
- Node.js binding support for WebGPU (will be a separate PR)
2024-10-30 01:29:40 +00:00
" -Donnxruntime_USE_EXTERNAL_DAWN= " + ( " ON " if args . use_external_dawn else " OFF " ) ,
2020-04-21 03:30:24 +00:00
# Training related flags
2021-02-16 04:46:51 +00:00
" -Donnxruntime_ENABLE_NVTX_PROFILE= " + ( " ON " if args . enable_nvtx_profile else " OFF " ) ,
" -Donnxruntime_ENABLE_TRAINING= " + ( " ON " if args . enable_training else " OFF " ) ,
" -Donnxruntime_ENABLE_TRAINING_OPS= " + ( " ON " if args . enable_training_ops else " OFF " ) ,
2023-01-03 21:28:16 +00:00
" -Donnxruntime_ENABLE_TRAINING_APIS= " + ( " ON " if args . enable_training_apis else " OFF " ) ,
2020-12-18 00:21:33 +00:00
# Enable advanced computations such as AVX for some training related ops.
2021-02-16 04:46:51 +00:00
" -Donnxruntime_ENABLE_CPU_FP16_OPS= " + ( " ON " if args . enable_training else " OFF " ) ,
2023-02-07 21:47:48 +00:00
" -Donnxruntime_USE_NCCL= " + ( " ON " if args . enable_nccl else " OFF " ) ,
2021-02-16 04:46:51 +00:00
" -Donnxruntime_BUILD_BENCHMARKS= " + ( " ON " if args . build_micro_benchmarks else " OFF " ) ,
2020-10-30 00:13:04 +00:00
" -Donnxruntime_USE_ROCM= " + ( " ON " if args . use_rocm else " OFF " ) ,
2024-06-19 23:14:58 +00:00
" -Donnxruntime_GCOV_COVERAGE= " + ( " ON " if args . code_coverage else " OFF " ) ,
2021-02-16 04:46:51 +00:00
" -Donnxruntime_USE_MPI= " + ( " ON " if args . use_mpi else " OFF " ) ,
" -Donnxruntime_ENABLE_MEMORY_PROFILE= " + ( " ON " if args . enable_memory_profile else " OFF " ) ,
2021-02-23 06:00:21 +00:00
" -Donnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO= " + ( " ON " if args . enable_cuda_line_info else " OFF " ) ,
2024-11-06 17:54:55 +00:00
" -Donnxruntime_USE_CUDA_NHWC_OPS= " + ( " ON " if args . use_cuda and not args . disable_cuda_nhwc_ops else " OFF " ) ,
2022-01-19 02:05:04 +00:00
" -Donnxruntime_BUILD_WEBASSEMBLY_STATIC_LIB= " + ( " ON " if args . build_wasm_static_lib else " OFF " ) ,
2022-04-26 16:35:16 +00:00
" -Donnxruntime_ENABLE_WEBASSEMBLY_EXCEPTION_CATCHING= "
+ ( " OFF " if args . disable_wasm_exception_catching else " ON " ) ,
2022-11-28 18:24:34 +00:00
" -Donnxruntime_ENABLE_WEBASSEMBLY_API_EXCEPTION_CATCHING= "
+ ( " ON " if args . enable_wasm_api_exception_catching else " OFF " ) ,
2022-04-26 16:35:16 +00:00
" -Donnxruntime_ENABLE_WEBASSEMBLY_EXCEPTION_THROWING= "
+ ( " ON " if args . enable_wasm_exception_throwing_override else " OFF " ) ,
2023-02-25 00:45:33 +00:00
" -Donnxruntime_WEBASSEMBLY_RUN_TESTS_IN_BROWSER= " + ( " ON " if args . wasm_run_tests_in_browser else " OFF " ) ,
2021-04-16 04:46:11 +00:00
" -Donnxruntime_ENABLE_WEBASSEMBLY_THREADS= " + ( " ON " if args . enable_wasm_threads else " OFF " ) ,
2024-10-25 03:21:51 +00:00
" -Donnxruntime_ENABLE_WEBASSEMBLY_MEMORY64= " + ( " ON " if args . enable_wasm_memory64 else " OFF " ) ,
2021-05-21 08:32:00 +00:00
" -Donnxruntime_ENABLE_WEBASSEMBLY_DEBUG_INFO= " + ( " ON " if args . enable_wasm_debug_info else " OFF " ) ,
2021-10-12 05:04:50 +00:00
" -Donnxruntime_ENABLE_WEBASSEMBLY_PROFILING= " + ( " ON " if args . enable_wasm_profiling else " OFF " ) ,
2022-08-22 16:40:40 +00:00
" -Donnxruntime_ENABLE_LAZY_TENSOR= " + ( " ON " if args . enable_lazy_tensor else " OFF " ) ,
2022-04-26 16:35:16 +00:00
" -Donnxruntime_ENABLE_EXTERNAL_CUSTOM_OP_SCHEMAS= "
+ ( " ON " if args . enable_external_custom_op_schemas else " OFF " ) ,
2021-11-30 06:44:50 +00:00
" -Donnxruntime_ENABLE_CUDA_PROFILING= " + ( " ON " if args . enable_cuda_profiling else " OFF " ) ,
2022-08-27 02:38:03 +00:00
" -Donnxruntime_ENABLE_ROCM_PROFILING= " + ( " ON " if args . enable_rocm_profiling else " OFF " ) ,
2022-06-03 10:22:34 +00:00
" -Donnxruntime_USE_XNNPACK= " + ( " ON " if args . use_xnnpack else " OFF " ) ,
2023-05-09 04:25:10 +00:00
" -Donnxruntime_USE_WEBNN= " + ( " ON " if args . use_webnn else " OFF " ) ,
2022-09-22 21:53:40 +00:00
" -Donnxruntime_USE_CANN= " + ( " ON " if args . use_cann else " OFF " ) ,
integrate triton into ort (#15862)
### Description
In some scenarios, the triton written kernels are more performant than
CK or other handwritten kernels, so we implement a framework that
onnxruntime can use these triton written kernels.
This PR is to integrate triton into ort, so that ort can use kernels
that are written and compiled by triton.
The main changes focus on two parts:
1. a build part to compile triton written kernel and combine these
kernels into libonnxruntime_providers_rocm.so
2. a loader and launcher in c++, for loading and launch triton written
kernels.
#### Build
To compile triton written kernel, add a script
`tools/ci_build/compile_triton.py`. This script will dynamic load all
kernel files, compile them, and generate `triton_kernel_infos.a` and
`triton_kernel_infos.h`.
`triton_kernel_infos.a` contains all compiled kernel instructions, this
file will be combined into libonnxruntime_providers_rocm.so, using
--whole-archive flag.
`triton_kernel_infos.h` defines a const array that contains all the
metadata for each compiled kernel. These metadata will be used for load
and launch. So this header file is included by 'triton_kernel.cu' which
defines load and launch functions.
Add a build flag in build.py and CMakeList.txt, when building rocm
provider, it will call triton_kernel build command, and generate all
necessary files.
#### C++ Load and Launch
On c++ part, we implement load and launch functions in triton_kernel.cu
and triton_kernel.h.
These two files located in `providers/cuda`, and when compiling rocm,
they will be hipified. so this part supports both cuda and rocm. But
currently we only call triton kernel in rocm.
We also implement a softmax triton op for example. Because there will
generate many kernels for different input shape of softmax, we use
TunableOp to select the best one.
### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
2023-05-17 01:35:28 +00:00
" -Donnxruntime_USE_TRITON_KERNEL= " + ( " ON " if args . use_triton_kernel else " OFF " ) ,
Introduce float 8 types (#14731)
### Description
The PR implements FloatE4M3FN, FloatE5M2, FloatE4M3FNUZ, FloatE5M2FNUZ
as described in PR https://github.com/onnx/onnx/pull/4805. It uses CUDA
API to cast float/half to float8 if CUDA>=11.8, a custom implementation
if CUDA<11.8.
* It implements, Cast, QuantizeLinear, DequantizeLinear for all types on
CPU, only for types FloatE4M3FN, FloatE5M2 on CUDA.
* It extends the supported types for control flow operator, Shape,
Reshape, Identity, If, Loop, Scan, Reshape
* It implements Equal(19).
* Cast, QuantizeLinear, DequantizeLinear operators now support a
parameter `saturate` only valid for float 8 types. It is true by
default. In that case, any value out of range is converted into the
maximum float 8 value. If false, it is infinite.
* QuantizeLinear, DequantizeLinear now supports multiple scales on CUDA
(and ROCm by extension), scale = 1D tensor with one scale per channel
### Motivation and Context
Supports latest onnx version.
Fixes
[AB#15395](https://aiinfra.visualstudio.com/6a833879-cd9b-44a4-a9de-adc2d818f13c/_workitems/edit/15395)
---------
Co-authored-by: Xavier Dupre <xadupre@microsoft.com@orttrainingdev8.d32nl1ml4oruzj4qz3bqlggovf.px.internal.cloudapp.net>
Co-authored-by: Randy Shuai <rashuai@microsoft.com>
Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Co-authored-by: Scott McKay <Scott.McKay@microsoft.com>
2023-05-30 20:25:58 +00:00
" -Donnxruntime_DISABLE_FLOAT8_TYPES= " + ( " ON " if disable_float8_types else " OFF " ) ,
2023-08-31 20:32:55 +00:00
" -Donnxruntime_DISABLE_SPARSE_TENSORS= " + ( " ON " if disable_sparse_tensors else " OFF " ) ,
" -Donnxruntime_DISABLE_OPTIONAL_TYPE= " + ( " ON " if disable_optional_type else " OFF " ) ,
2020-04-19 03:48:30 +00:00
]
2023-03-29 20:12:07 +00:00
2024-01-25 00:27:05 +00:00
if args . rv64 :
add_default_definition ( cmake_extra_defines , " onnxruntime_CROSS_COMPILING " , " ON " )
if not args . riscv_toolchain_root :
raise BuildError ( " The --riscv_toolchain_root option is required to build for riscv64. " )
if not args . skip_tests and not args . riscv_qemu_path :
raise BuildError ( " The --riscv_qemu_path option is required for testing riscv64. " )
cmake_args + = [
" -DRISCV_TOOLCHAIN_ROOT:PATH= " + args . riscv_toolchain_root ,
" -DRISCV_QEMU_PATH:PATH= " + args . riscv_qemu_path ,
" -DCMAKE_TOOLCHAIN_FILE= " + os . path . join ( source_dir , " cmake " , " riscv64.toolchain.cmake " ) ,
]
2023-03-29 20:12:07 +00:00
# By default on Windows we currently support only cross compiling for ARM/ARM64
# (no native compilation supported through this script).
if args . arm64 or args . arm64ec or args . arm :
add_default_definition ( cmake_extra_defines , " onnxruntime_CROSS_COMPILING " , " ON " )
if args . use_extensions :
add_default_definition ( cmake_extra_defines , " OPENCV_SKIP_SYSTEM_PROCESSOR_DETECTION " , " ON " )
2022-12-15 23:19:07 +00:00
if args . use_cache :
2023-01-06 03:19:57 +00:00
cmake_args . append ( " -Donnxruntime_BUILD_CACHE=ON " )
if not ( is_windows ( ) and args . cmake_generator != " Ninja " ) :
cmake_args . append ( " -DCMAKE_CXX_COMPILER_LAUNCHER=ccache " )
2022-12-16 08:38:12 +00:00
cmake_args . append ( " -DCMAKE_C_COMPILER_LAUNCHER=ccache " )
2023-01-06 03:19:57 +00:00
if args . use_cuda :
cmake_args . append ( " -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache " )
2023-01-28 14:34:24 +00:00
if args . use_rocm :
cmake_args . append ( " -DCMAKE_HIP_COMPILER_LAUNCHER=ccache " )
Improve dependency management (#13523)
## Description
1. Convert some git submodules to cmake external projects
2. Update nsync from
[1.23.0](https://github.com/google/nsync/releases/tag/1.23.0) to
[1.25.0](https://github.com/google/nsync/releases/tag/1.25.0)
3. Update re2 from 2021-06-01 to 2022-06-01
4. Update wil from an old commit to 1.0.220914.1 tag
5. Update gtest to a newer commit so that it can optionally leverage
absl/re2 for parsing command line flags.
The following git submodules are deleted:
1. FP16
2. safeint
3. XNNPACK
4. cxxopts
5. dlpack
7. flatbuffers
8. googlebenchmark
9. json
10. mimalloc
11. mp11
12. pthreadpool
More will come.
## Motivation and Context
There are 3 ways of integrating 3rd party C/C++ libraries into ONNX
Runtime:
1. Install them to a system location, then use cmake's find_package
module to locate them.
2. Use git submodules
6. Use cmake's external projects(externalproject_add).
At first when this project was just started, we considered both option 2
and option 3. We preferred option 2 because:
1. It's easier to handle authentication. At first this project was not
open source, and it had some other non-public dependencies. If we use
git submodule, ADO will handle authentication smoothly. Otherwise we
need to manually pass tokens around and be very careful on not exposing
them in build logs.
2. At that time, cmake fetched dependencies after "cmake" finished
generating vcprojects/makefiles. So it was very difficult to make cflags
consistent. Since cmake 3.11, it has a new command: FetchContent, which
fetches dependencies when it generates vcprojects/makefiles just before
add_subdirectories, so the parent project's variables/settings can be
easily passed to the child projects.
And when the project went on, we had some new concerns:
1. As we started to have more and more EPs and build configs, the number
of submodules grew quickly. For more developers, most ORT submodules are
not relevant to them. They shouldn't need to download all of them.
2. It is impossible to let two different build configs use two different
versions of the same dependency. For example, right now we have protobuf
3.18.3 in the submodules. Then every EP must use the same version.
Whenever we have a need to upgrade protobuf, we need to coordinate
across the whole team and many external developers. I can't manage it
anymore.
3. Some projects want to manage the dependencies in a different way,
either because of their preference or because of compliance
requirements. For example, some Microsoft teams want to use vcpkg, but
we don't want to force every user of onnxruntime using vcpkg.
7. Someone wants to dynamically link to protobuf, but our build script
only does static link.
8. Hard to handle security vulnerabilities. For example, whenever
protobuf has a security patch, we have a lot of things to do. But if we
allowed people to build ORT with a different version of protobuf without
changing ORT's source code, the customers who build ORT from source will
be able to act on such things in a quicker way. They will not need to
wait ORT having a patch release.
9. Every time we do a release, github will also publish a source file
zip file and a source file tarball for us. But they are not usable,
because they miss submodules.
### New features
After this change, users will be able to:
1. Build the dependencies in the way they want, then install them to
somewhere(for example, /usr or a temp folder).
2. Or download the dependencies by using cmake commands from these
dependencies official website
3. Similar to the above, but use your private mirrors to mitigate supply
chain risks.
4. Use different versions of the dependencies, as long as our source
code is compatible with them. For example, you can't use
protobuf 3.20.x, as it needs code changes in ONNX Runtime.
6. Only download the things the current build needs.
10. Avoid building external dependencies again and again in every build.
### Breaking change
The onnxruntime_PREFER_SYSTEM_LIB build option is removed; you can think of it as
being ON by default from now on. If you don't like the new behavior, you can set FETCHCONTENT_TRY_FIND_PACKAGE_MODE to NEVER.
Besides, for those who relied on the onnxruntime_PREFER_SYSTEM_LIB build
option, please be aware that this PR will change find_package calls from
Module mode to Config mode. For example, in the past if you have
installed protobuf from apt-get from ubuntu 20.04's official repo,
find_package can find it and use it. But after this PR, it won't. This
is because that protobuf version provided by Ubuntu 20.04 is too old to
support the "config mode". It can be resolved by getting a newer version
of protobuf from somewhere.
2022-12-01 17:51:59 +00:00
# By default cmake does not check TLS/SSL certificates. Here we turn it on.
# But, in some cases you may also need to supply a CA file.
add_default_definition ( cmake_extra_defines , " CMAKE_TLS_VERIFY " , " ON " )
add_default_definition ( cmake_extra_defines , " FETCHCONTENT_QUIET " , " OFF " )
2021-11-15 16:16:20 +00:00
if args . external_graph_transformer_path :
cmake_args . append ( " -Donnxruntime_EXTERNAL_TRANSFORMER_SRC_PATH= " + args . external_graph_transformer_path )
2022-06-07 01:37:16 +00:00
if args . use_winml :
cmake_args . append ( " -Donnxruntime_BUILD_WINML_TESTS= " + ( " OFF " if args . skip_winml_tests else " ON " ) )
if args . use_dnnl :
cmake_args . append ( " -Donnxruntime_DNNL_GPU_RUNTIME= " + args . dnnl_gpu_runtime )
cmake_args . append ( " -Donnxruntime_DNNL_OPENCL_ROOT= " + args . dnnl_opencl_root )
2023-12-01 17:16:44 +00:00
cmake_args . append ( " -Donnxruntime_DNNL_AARCH64_RUNTIME= " + args . dnnl_aarch64_runtime )
cmake_args . append ( " -Donnxruntime_DNNL_ACL_ROOT= " + args . dnnl_acl_root )
2022-06-07 01:37:16 +00:00
if args . build_wasm :
cmake_args . append ( " -Donnxruntime_ENABLE_WEBASSEMBLY_SIMD= " + ( " ON " if args . enable_wasm_simd else " OFF " ) )
if args . use_migraphx :
cmake_args . append ( " -Donnxruntime_MIGRAPHX_HOME= " + migraphx_home )
if args . use_cuda :
2023-09-05 17:59:27 +00:00
nvcc_threads = number_of_nvcc_threads ( args )
Flash Attention v2 MHA (#17227)
### Description
Integrate Flash Attention V2 to PackedMultiHeadAttention,
MultiHeadAttention and Attention operators.
Flash Attention v2 source code is from
https://github.com/Dao-AILab/flash-attention/tree/main/csrc/flash_attn/src.
We did some change to remove dependency on Torch, then removed backward
and bfloat16 related code.
Add benchmark script (see benchmark_mha.sh) to compare different
attention kernels for MultiHeadAttention operator.
Current limitations for Flash Attention in PackedMultiHeadAttention,
MultiHeadAttention and Attention operators:
* Relative Position Bias is not supported
* Different hidden size for Q and V is not supported
* Only float16 is supported
* Padding/attention mask is not supported
* For MultiHeadAttention, when there is past or present input, bias
shall be provided to activate flash attention
* For Attention, past or present inputs will deactivate flash attention
* Causal is not supported
Some limitations (like attention mask and causal) might be removed
later.
Currently, Flash Attention v2 only works in Linux. For Windows, we will
enable later with Cutlass 3.2.
Two environment variables can be used for testing purpose:
(1) `ORT_DISABLE_FLASH_ATTENTION` to disable flash attention. Default
value is 0 (enable). Set it to "1" to disable it.
(2) `ORT_MIN_SEQ_LEN_FLASH_ATTENTION_PACKED_QKV`. Default value is
"513", which means that we only enable flash attention when sequence
length is larger than 512 for packed QKV format. Set it to "0" if you
want to use flash attention v2 whenever possible.
### Speedup
The following result is from Standard_ND96amsr_A100_v4 VM
(A100-SXM4-80GB GPU) using benchmark_mha.sh. The metric is TFLOPs per
second for MultiHeadAttention operator.
There are 3 input formats:
* `Q,K,V` means separated inputs query, key and value of BxSxNH
* `Q,KV` means packed KV, where key is 5D: BxSxNx2xH
* `QKV` means packed QKV, where query is 5D: BxSxNx3xH
Note that flash attention cannot use packed QKV format, so extra
Transpose is needed. We found that TensorRT kernel is faster for
sequence length <= 512 for packed QKV. The reason might be no transpose
is needed for TensorRT kernel in this format.
We also notice that, TensorRT kernel is faster for stable diffusion
512x512 image (see seq_len=4096, heads=8, head_dim=40 below), while
flash attention v2 is faster for 1024x1024 image (see seq_len=16384,
heads=8, head_dim=40 below).
input format | batch size | sequence length | heads | head dim |
flash_v2 (TFLOPs/s) | TensorRT (TFLOPs/s) | Memory Efficient Attention
(TFLOPs/s)
-- | -- | -- | -- | -- | -- | -- | --
Q,K,V | 32 | 512 | 64 | 32 | 78.1 | 60.0 | 39.3
Q,K,V | 32 | 512 | 128 | 16 | 46.8 | 44.1 | 21.7
Q,K,V | 16 | 1024 | 64 | 32 | 99.0 | 72.8 | 44.3
Q,K,V | 16 | 1024 | 128 | 16 | 54.7 | 49.2 | 23.4
Q,K,V | 8 | 2048 | 64 | 32 | 113.8 | 81.2 | 47.8
Q,K,V | 8 | 2048 | 128 | 16 | 59.7 | 51.9 | 24.7
Q,K,V | 4 | 4096 | 64 | 32 | 122.5 | 85.6 | 49.7
Q,K,V | 4 | 4096 | 128 | 16 | 62.5 | 53.3 | 25.3
Q,K,V | 2 | 8192 | 64 | 32 | 127.4 | 87.5 | 50.7
Q,K,V | 2 | 8192 | 128 | 16 | 64.0 | 54.2 | 25.6
Q,K,V | 1 | 16384 | 64 | 32 | 129.5 | 91.0 | 51.2
Q,K,V | 1 | 16384 | 128 | 16 | 64.7 | 54.5 | 25.8
Q,K,V | 1 | 4096 | 8 | 40 | 51.0 | 43.6 | 36.8
Q,K,V | 1 | 4096 | 8 | 80 | 97.7 | 77.0 | 55.5
Q,K,V | 1 | 4096 | 8 | 160 | 120.0 | 39.7 | 57.8
Q,K,V | 4 | 4096 | 8 | 40 | 89.0 | 84.4 | 49.2
Q,K,V | 4 | 4096 | 8 | 80 | 133.0 | 92.2 | 63.2
Q,K,V | 4 | 4096 | 8 | 160 | 164.8 | 42.7 | 63.8
Q,K,V | 1 | 16384 | 8 | 40 | 96.9 | 91.3 | 52.1
Q,K,V | 1 | 16384 | 8 | 80 | 142.9 | 101.5 | 65.6
Q,K,V | 1 | 16384 | 8 | 160 | 177.4 | 44.2 | 65.7
Q,K,V | 128 | 128 | 12 | 64 | 29.0 | 26.9 | 25.7
Q,K,V | 64 | 128 | 12 | 64 | 23.1 | 10.8 | 21.3
Q,K,V | 128 | 384 | 12 | 64 | 83.5 | 60.8 | 55.7
Q,K,V | 64 | 384 | 12 | 64 | 72.6 | 40.5 | 52.8
Q,K,V | 128 | 512 | 12 | 64 | 98.9 | 77.9 | 62.1
Q,K,V | 64 | 512 | 12 | 64 | 94.7 | 75.6 | 60.4
Q,KV | 32 | 512 | 64 | 32 | 85.9 | 41.1 | 41.1
Q,KV | 32 | 512 | 128 | 16 | 47.1 | 21.6 | 21.6
Q,KV | 16 | 1024 | 64 | 32 | 104.4 | 45.8 | 45.8
Q,KV | 16 | 1024 | 128 | 16 | 54.7 | 23.6 | 23.6
Q,KV | 8 | 2048 | 64 | 32 | 116.8 | 48.5 | 48.5
Q,KV | 8 | 2048 | 128 | 16 | 59.8 | 24.7 | 24.7
Q,KV | 4 | 4096 | 64 | 32 | 124.2 | 50.1 | 50.1
Q,KV | 4 | 4096 | 128 | 16 | 62.6 | 25.3 | 25.3
Q,KV | 2 | 8192 | 64 | 32 | 128.5 | 50.8 | 50.9
Q,KV | 2 | 8192 | 128 | 16 | 64.1 | 25.6 | 25.6
Q,KV | 1 | 16384 | 64 | 32 | 129.4 | 51.2 | 51.2
Q,KV | 1 | 16384 | 128 | 16 | 64.8 | 25.8 | 25.8
Q,KV | 1 | 4096 | 8 | 40 | 67.5 | 37.7 | 37.5
Q,KV | 1 | 4096 | 8 | 80 | 101.3 | 56.7 | 56.6
Q,KV | 1 | 4096 | 8 | 160 | 124.0 | 58.6 | 58.6
Q,KV | 4 | 4096 | 8 | 40 | 90.8 | 49.8 | 49.8
Q,KV | 4 | 4096 | 8 | 80 | 135.6 | 63.8 | 63.8
Q,KV | 4 | 4096 | 8 | 160 | 166.3 | 64.5 | 64.5
Q,KV | 1 | 16384 | 8 | 40 | 97.5 | 52.3 | 52.3
Q,KV | 1 | 16384 | 8 | 80 | 143.5 | 65.9 | 65.8
Q,KV | 1 | 16384 | 8 | 160 | 178.4 | 65.9 | 65.8
Q,KV | 128 | 128 | 12 | 64 | 26.8 | 48.1 | 30.9
Q,KV | 64 | 128 | 12 | 64 | 28.0 | 38.9 | 25.0
Q,KV | 128 | 384 | 12 | 64 | 97.7 | 61.1 | 61.0
Q,KV | 64 | 384 | 12 | 64 | 89.5 | 57.8 | 57.9
Q,KV | 128 | 512 | 12 | 64 | 111.9 | 66.7 | 66.9
Q,KV | 64 | 512 | 12 | 64 | 107.2 | 64.9 | 64.8
QKV | 32 | 512 | 64 | 32 | 77.2 | 84.7 | 39.3
QKV | 32 | 512 | 128 | 16 | 43.4 | 53.1 | 20.9
QKV | 16 | 1024 | 64 | 32 | 98.8 | 87.4 | 44.6
QKV | 16 | 1024 | 128 | 16 | 52.0 | 54.1 | 23.2
QKV | 8 | 2048 | 64 | 32 | 113.1 | 89.0 | 47.9
QKV | 8 | 2048 | 128 | 16 | 58.2 | 54.6 | 24.5
QKV | 4 | 4096 | 64 | 32 | 120.6 | 89.7 | 49.7
QKV | 4 | 4096 | 128 | 16 | 61.7 | 54.6 | 25.2
QKV | 2 | 8192 | 64 | 32 | 125.9 | 89.5 | 50.7
QKV | 2 | 8192 | 128 | 16 | 63.6 | 54.8 | 25.5
QKV | 1 | 16384 | 64 | 32 | 128.5 | 92.0 | 51.2
QKV | 1 | 16384 | 128 | 16 | 64.6 | 54.8 | 25.7
QKV | 1 | 4096 | 8 | 40 | 60.2 | **69.8** | 38.1
QKV | 1 | 4096 | 8 | 80 | 101.6 | 75.2 | 56.7
QKV | 1 | 4096 | 8 | 160 | 130.2 | 41.2 | 58.4
QKV | 4 | 4096 | 8 | 40 | 90.6 | **91.0** | 49.5
QKV | 4 | 4096 | 8 | 80 | 133.6 | 98.1 | 62.8
QKV | 4 | 4096 | 8 | 160 | 165.3 | 43.7 | 63.9
QKV | 1 | 16384 | 8 | 40 | 97.2 | 92.8 | 52.1
QKV | 1 | 16384 | 8 | 80 | 143.0 | 103.1 | 65.6
QKV | 1 | 16384 | 8 | 160 | 177.6 | 44.5 | 65.7
QKV | 128 | 128 | 12 | 64 | 31.1 | 65.9 | 27.6
QKV | 64 | 128 | 12 | 64 | 26.1 | 49.8 | 23.5
QKV | 128 | 384 | 12 | 64 | 84.6 | 88.5 | 56.1
QKV | 64 | 384 | 12 | 64 | 79.1 | 80.3 | 53.5
QKV | 128 | 512 | 12 | 64 | 97.3 | 114.2 | 62.2
QKV | 64 | 512 | 12 | 64 | 95.9 | 110.7 | 60.6
QKV | 4 | 2048 | 32 | 128 | 125.26 | 44.72 | 78.15
QKV | 4 | 4096 | 32 | 128 | 141.62 | 46.29 | 85.84
QKV | 8 | 2048 | 32 | 128 | 127.40 | 45.49 | 78.75
QKV | 8 | 4096 | 32 | 128 | 144.24 | 46.60 | 86.95
### Known Issues
NVCC uses huge memory while compiling flash attention CUDA kernel. Linux
build with CUDA might fail when machine has limited memory while number
of CPUs is large. A workaround is to use a build machine with larger
memory, or use argument like `--nvcc_threads 1` to limit nvcc threads in
build.
### Motivation and Context
Increases speed and efficiency of MHA or Packed MHA.
---------
Co-authored-by: Tianlei Wu <tlwu@microsoft.com>
Co-authored-by: tlwu@microsoft.com <tlwu@a100.crj0ad2y1kku1j4yxl4sj10o4e.gx.internal.cloudapp.net>
2023-08-31 20:52:21 +00:00
cmake_args . append ( " -Donnxruntime_NVCC_THREADS= " + str ( nvcc_threads ) )
2023-11-21 13:37:48 +00:00
if not disable_float8_types and args . cuda_version :
if version_to_tuple ( args . cuda_version ) < ( 11 , 8 ) :
raise BuildError (
f " Float 8 types require CUDA>=11.8. They must be disabled on CUDA== { args . cuda_version } . "
f " Add ' --disable_types float8 ' to your command line. See option disable_types. "
)
2024-06-17 21:41:43 +00:00
cmake_args . append ( f " -DCMAKE_CUDA_COMPILER= { cuda_home } /bin/nvcc " )
2022-06-07 01:37:16 +00:00
if args . use_rocm :
cmake_args . append ( " -Donnxruntime_ROCM_HOME= " + rocm_home )
cmake_args . append ( " -Donnxruntime_ROCM_VERSION= " + args . rocm_version )
if args . use_tensorrt :
cmake_args . append ( " -Donnxruntime_TENSORRT_HOME= " + tensorrt_home )
2022-07-13 08:48:42 +00:00
if args . llvm_config :
cmake_args . append ( " -Donnxruntime_TVM_USE_LLVM= " + args . llvm_config )
2022-06-07 01:37:16 +00:00
2021-06-25 21:08:01 +00:00
if args . use_cuda :
2022-03-22 02:10:47 +00:00
add_default_definition ( cmake_extra_defines , " onnxruntime_USE_CUDA " , " ON " )
2022-06-04 03:00:54 +00:00
if args . cuda_version :
add_default_definition ( cmake_extra_defines , " onnxruntime_CUDA_VERSION " , args . cuda_version )
2021-07-31 00:16:37 +00:00
# TODO: this variable is not really needed
2022-03-22 02:10:47 +00:00
add_default_definition ( cmake_extra_defines , " onnxruntime_CUDA_HOME " , cuda_home )
2022-08-25 01:21:50 +00:00
if cudnn_home :
add_default_definition ( cmake_extra_defines , " onnxruntime_CUDNN_HOME " , cudnn_home )
2021-07-31 00:16:37 +00:00
if is_windows ( ) :
if args . enable_msvc_static_runtime :
2022-04-26 16:35:16 +00:00
add_default_definition (
cmake_extra_defines , " CMAKE_MSVC_RUNTIME_LIBRARY " , " MultiThreaded$<$<CONFIG:Debug>:Debug> "
)
2022-03-22 02:10:47 +00:00
add_default_definition ( cmake_extra_defines , " ONNX_USE_MSVC_STATIC_RUNTIME " , " ON " )
add_default_definition ( cmake_extra_defines , " protobuf_MSVC_STATIC_RUNTIME " , " ON " )
2024-10-11 03:09:13 +00:00
# The following build option was added in ABSL 20240722.0 and it must be explicitly set
add_default_definition ( cmake_extra_defines , " ABSL_MSVC_STATIC_RUNTIME " , " ON " )
2022-03-22 02:10:47 +00:00
add_default_definition ( cmake_extra_defines , " gtest_force_shared_crt " , " OFF " )
2021-07-31 00:16:37 +00:00
else :
# CMAKE_MSVC_RUNTIME_LIBRARY is default to MultiThreaded$<$<CONFIG:Debug>:Debug>DLL
2022-03-22 02:10:47 +00:00
add_default_definition ( cmake_extra_defines , " ONNX_USE_MSVC_STATIC_RUNTIME " , " OFF " )
add_default_definition ( cmake_extra_defines , " protobuf_MSVC_STATIC_RUNTIME " , " OFF " )
2024-10-11 03:09:13 +00:00
add_default_definition ( cmake_extra_defines , " ABSL_MSVC_STATIC_RUNTIME " , " OFF " )
2022-03-22 02:10:47 +00:00
add_default_definition ( cmake_extra_defines , " gtest_force_shared_crt " , " ON " )
2021-06-03 06:36:49 +00:00
2020-10-22 16:29:44 +00:00
if acl_home and os . path . exists ( acl_home ) :
cmake_args + = [ " -Donnxruntime_ACL_HOME= " + acl_home ]
if acl_libs and os . path . exists ( acl_libs ) :
cmake_args + = [ " -Donnxruntime_ACL_LIBS= " + acl_libs ]
if armnn_home and os . path . exists ( armnn_home ) :
cmake_args + = [ " -Donnxruntime_ARMNN_HOME= " + armnn_home ]
if armnn_libs and os . path . exists ( armnn_libs ) :
cmake_args + = [ " -Donnxruntime_ARMNN_LIBS= " + armnn_libs ]
2020-06-15 15:47:03 +00:00
if mpi_home and os . path . exists ( mpi_home ) :
2020-12-18 00:21:33 +00:00
if args . use_mpi :
cmake_args + = [ " -Donnxruntime_MPI_HOME= " + mpi_home ]
else :
2022-04-26 16:35:16 +00:00
log . warning (
" mpi_home is supplied but use_mpi is set to false. "
" Build will continue without linking MPI libraries. "
)
2020-06-15 15:47:03 +00:00
if nccl_home and os . path . exists ( nccl_home ) :
cmake_args + = [ " -Donnxruntime_NCCL_HOME= " + nccl_home ]
2023-03-01 21:48:20 +00:00
if qnn_home and os . path . exists ( qnn_home ) :
cmake_args + = [ " -Donnxruntime_QNN_HOME= " + qnn_home ]
2022-06-03 21:10:02 +00:00
if snpe_root and os . path . exists ( snpe_root ) :
cmake_args + = [ " -DSNPE_ROOT= " + snpe_root ]
2022-09-22 21:53:40 +00:00
if cann_home and os . path . exists ( cann_home ) :
cmake_args + = [ " -Donnxruntime_CANN_HOME= " + cann_home ]
2020-04-17 13:18:54 +00:00
if args . winml_root_namespace_override :
2022-04-26 16:35:16 +00:00
cmake_args + = [ " -Donnxruntime_WINML_NAMESPACE_OVERRIDE= " + args . winml_root_namespace_override ]
2020-08-17 16:40:31 +00:00
if args . use_openvino :
2022-04-26 16:35:16 +00:00
cmake_args + = [
" -Donnxruntime_USE_OPENVINO=ON " ,
2024-06-28 15:31:02 +00:00
" -Donnxruntime_NPU_NO_FALLBACK= " + ( " ON " if args . use_openvino == " NPU_NO_CPU_FALLBACK " else " OFF " ) ,
2024-04-19 07:31:38 +00:00
" -Donnxruntime_USE_OPENVINO_GPU= " + ( " ON " if args . use_openvino == " GPU " else " OFF " ) ,
" -Donnxruntime_USE_OPENVINO_CPU= " + ( " ON " if args . use_openvino == " CPU " else " OFF " ) ,
2024-03-22 01:44:00 +00:00
" -Donnxruntime_USE_OPENVINO_NPU= " + ( " ON " if args . use_openvino == " NPU " else " OFF " ) ,
2024-04-19 07:31:38 +00:00
" -Donnxruntime_USE_OPENVINO_GPU_NP= " + ( " ON " if args . use_openvino == " GPU_NO_PARTITION " else " OFF " ) ,
" -Donnxruntime_USE_OPENVINO_CPU_NP= " + ( " ON " if args . use_openvino == " CPU_NO_PARTITION " else " OFF " ) ,
2024-03-22 01:44:00 +00:00
" -Donnxruntime_USE_OPENVINO_NPU_NP= " + ( " ON " if args . use_openvino == " NPU_NO_PARTITION " else " OFF " ) ,
2022-04-26 16:35:16 +00:00
" -Donnxruntime_USE_OPENVINO_HETERO= " + ( " ON " if args . use_openvino . startswith ( " HETERO " ) else " OFF " ) ,
" -Donnxruntime_USE_OPENVINO_DEVICE= " + ( args . use_openvino ) ,
" -Donnxruntime_USE_OPENVINO_MULTI= " + ( " ON " if args . use_openvino . startswith ( " MULTI " ) else " OFF " ) ,
" -Donnxruntime_USE_OPENVINO_AUTO= " + ( " ON " if args . use_openvino . startswith ( " AUTO " ) else " OFF " ) ,
]
2020-03-11 21:25:37 +00:00
2024-04-19 02:39:08 +00:00
# VitisAI and OpenVINO providers currently only support full_protobuf option.
2024-05-06 22:00:13 +00:00
if args . use_full_protobuf or args . use_openvino or args . use_vitisai or args . gen_doc :
2022-04-26 16:35:16 +00:00
cmake_args + = [ " -Donnxruntime_USE_FULL_PROTOBUF=ON " , " -DProtobuf_USE_STATIC_LIBS=ON " ]
2020-02-04 03:33:14 +00:00
2022-09-07 22:11:18 +00:00
if args . use_tvm and args . llvm_path is not None :
2024-07-24 18:50:11 +00:00
cmake_args + = [ f " -DLLVM_DIR= { args . llvm_path } " ]
2018-11-20 00:48:22 +00:00
if args . use_cuda and not is_windows ( ) :
nvml_stub_path = cuda_home + " /lib64/stubs "
cmake_args + = [ " -DCUDA_CUDA_LIBRARY= " + nvml_stub_path ]
if args . use_preinstalled_eigen :
2022-04-26 16:35:16 +00:00
cmake_args + = [ " -Donnxruntime_USE_PREINSTALLED_EIGEN=ON " , " -Deigen_SOURCE_PATH= " + args . eigen_path ]
2018-11-20 00:48:22 +00:00
2020-11-16 01:04:45 +00:00
if args . nnapi_min_api :
cmake_args + = [ " -Donnxruntime_NNAPI_MIN_API= " + str ( args . nnapi_min_api ) ]
2019-07-24 20:20:05 +00:00
if args . android :
2021-03-31 01:42:18 +00:00
if not args . android_ndk_path :
raise BuildError ( " android_ndk_path required to build for Android " )
if not args . android_sdk_path :
raise BuildError ( " android_sdk_path required to build for Android " )
2020-04-19 03:48:30 +00:00
cmake_args + = [
2022-04-26 16:35:16 +00:00
" -DCMAKE_TOOLCHAIN_FILE= "
+ os . path . join ( args . android_ndk_path , " build " , " cmake " , " android.toolchain.cmake " ) ,
2020-04-19 03:48:30 +00:00
" -DANDROID_PLATFORM=android- " + str ( args . android_api ) ,
2021-05-12 23:01:25 +00:00
" -DANDROID_ABI= " + str ( args . android_abi ) ,
" -DANDROID_MIN_SDK= " + str ( args . android_api ) ,
2020-04-19 03:48:30 +00:00
]
2020-04-01 00:10:48 +00:00
2020-09-09 11:38:34 +00:00
if args . android_cpp_shared :
cmake_args + = [ " -DANDROID_STL=c++_shared " ]
2022-04-07 22:06:31 +00:00
if args . dml_path :
cmake_args + = [
" -Donnxruntime_USE_CUSTOM_DIRECTML=ON " ,
" -Ddml_INCLUDE_DIR= " + os . path . join ( args . dml_path , " include " ) ,
" -Ddml_LIB_DIR= " + os . path . join ( args . dml_path , " lib " ) ,
]
2022-04-12 18:59:00 +00:00
if args . dml_external_project :
cmake_args + = [
" -Donnxruntime_USE_CUSTOM_DIRECTML=ON " ,
" -Ddml_EXTERNAL_PROJECT=ON " ,
]
2022-04-07 22:06:31 +00:00
if args . use_gdk :
cmake_args + = [
2022-04-26 16:35:16 +00:00
" -DCMAKE_TOOLCHAIN_FILE= " + os . path . join ( source_dir , " cmake " , " gdk_toolchain.cmake " ) ,
2022-04-07 22:06:31 +00:00
" -DGDK_EDITION= " + args . gdk_edition ,
" -DGDK_PLATFORM= " + args . gdk_platform ,
2022-04-26 16:35:16 +00:00
" -Donnxruntime_BUILD_UNIT_TESTS=OFF " , # gtest doesn't build for GDK
2022-04-07 22:06:31 +00:00
]
2022-04-12 18:59:00 +00:00
if args . use_dml and not ( args . dml_path or args . dml_external_project ) :
raise BuildError ( " You must set dml_path or dml_external_project when building with the GDK. " )
2022-04-07 22:06:31 +00:00
2020-11-30 19:22:08 +00:00
if is_macOS ( ) and not args . android :
cmake_args + = [ " -DCMAKE_OSX_ARCHITECTURES= " + args . osx_arch ]
2021-01-27 18:43:17 +00:00
if args . apple_deploy_target :
cmake_args + = [ " -DCMAKE_OSX_DEPLOYMENT_TARGET= " + args . apple_deploy_target ]
2021-04-15 23:47:53 +00:00
# Code sign the binaries, if the code signing development identity and/or team id are provided
if args . xcode_code_signing_identity :
cmake_args + = [ " -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY= " + args . xcode_code_signing_identity ]
if args . xcode_code_signing_team_id :
cmake_args + = [ " -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM= " + args . xcode_code_signing_team_id ]
2021-01-27 18:43:17 +00:00
2023-03-01 21:48:20 +00:00
if args . use_qnn :
if args . qnn_home is None or os . path . exists ( args . qnn_home ) is False :
raise BuildError ( " qnn_home= " + qnn_home + " not valid. " + " qnn_home paths must be specified and valid. " )
cmake_args + = [ " -Donnxruntime_USE_QNN=ON " ]
2021-01-27 18:43:17 +00:00
if args . use_coreml :
cmake_args + = [ " -Donnxruntime_USE_COREML=ON " ]
2020-11-30 19:22:08 +00:00
2023-05-09 04:25:10 +00:00
if args . use_webnn :
if not args . build_wasm :
raise BuildError ( " WebNN is only available for WebAssembly build. " )
cmake_args + = [ " -Donnxruntime_USE_WEBNN=ON " ]
2024-10-08 23:10:46 +00:00
if args . use_jsep and args . use_webgpu :
raise BuildError ( " JSEP (--use_jsep) and WebGPU (--use_webgpu) cannot be enabled at the same time. " )
Add implementation of WebGPU EP (#22591)
### Description
This PR adds the actual implementation of the WebGPU EP based on
https://github.com/microsoft/onnxruntime/pull/22318.
This change includes the following:
<details>
<summary><b>core framework of WebGPU EP</b></summary>
- WebGPU EP factory classes for:
- handling WebGPU options
- creating WebGPU EP instance
- creating WebGPU context
- WebGPU Execution Provider classes
- GPU Buffer allocator
- data transfer
- Buffer management classes
- Buffer Manager
- BufferCacheManager
- DisabledCacheManager
- SimpleCacheManager
- LazyReleaseCacheManager
- BucketCacheManager
- Program classes
- Program (base)
- Program Cache Key
- Program Manager
- Shader helper classes
- Shader Helper
- ShaderIndicesHelper
- ShaderVariableHelper
- Utils
- GPU Query based profiler
- compute context
- string utils
- Miscs
- Python binding webgpu support (basic)
</details>
<details>
<summary><b>Kernel implementation</b></summary>
- onnx.ai (default opset):
- Elementwise (math): Abs, Neg, Floor, Ceil, Reciprocal, Sqrt, Exp, Erf,
Log, Sin, Cos, Tan, Asin, Acos, Atan, Sinh, Cosh, Asinh, Acosh, Atanh,
Tanh, Not, Cast
- Elementwise (activation): Sigmoid, HardSigmoid, Clip, Elu, Relu,
LeakyRelu, ThresholdedRelu, Gelu
- Binary (math): Add, Sub, Mul, Div, Pow, Equal, Greater,
GreaterOrEqual, Less, LessOrEqual
- (Tensors): Shape, Reshape, Squeeze, Unsqueeze
- Where
- Transpose
- Concat
- Expand
- Gather
- Tile
- Range
- LayerNormalization
- com.microsoft
- FastGelu
- MatMulNBits
- MultiHeadAttention
- RotaryEmbedding
- SkipLayerNormalization
- LayerNormalization
- SimplifiedLayerNormalization
- SkipSimplifiedLayerNormalization
</details>
<details>
<summary><b>Build, test and CI pipeline integration</b></summary>
- build works for Windows, macOS and iOS
- support onnxruntime_test_all and python node test
- added a new unit test for `--use_external_dawn` build flag.
- updated MacOS pipeline to build with WebGPU support
- added a new pipeline for WebGPU Windows
</details>
This change does not include:
- Node.js binding support for WebGPU (will be a separate PR)
2024-10-30 01:29:40 +00:00
if args . use_external_dawn and not args . use_webgpu :
raise BuildError ( " External Dawn (--use_external_dawn) must be enabled with WebGPU (--use_webgpu). " )
2022-06-03 21:10:02 +00:00
if args . use_snpe :
cmake_args + = [ " -Donnxruntime_USE_SNPE=ON " ]
2024-04-24 01:15:07 +00:00
if args . macos or args . ios or args . visionos :
2024-03-20 17:55:19 +00:00
# Note: Xcode CMake generator doesn't have a good support for Mac Catalyst yet.
if args . macos == " Catalyst " and args . cmake_generator == " Xcode " :
raise BuildError ( " Xcode CMake generator ( ' --cmake_generator Xcode ' ) doesn ' t support Mac Catalyst build. " )
2024-04-24 01:15:07 +00:00
if ( args . ios or args . visionos or args . macos == " MacOSX " ) and not args . cmake_generator == " Xcode " :
2023-11-28 18:11:53 +00:00
raise BuildError (
" iOS/MacOS framework build requires use of the Xcode CMake generator ( ' --cmake_generator Xcode ' ). "
)
2023-07-07 15:11:44 +00:00
2021-11-18 19:31:13 +00:00
needed_args = [
2023-11-28 18:11:53 +00:00
args . apple_sysroot ,
2021-11-18 19:31:13 +00:00
args . apple_deploy_target ,
]
arg_names = [
2023-11-28 18:11:53 +00:00
" --apple_sysroot " + " <the location or name of the macOS platform SDK> " ,
2023-10-06 04:07:33 +00:00
" --apple_deploy_target " + " <the minimum version of the target platform> " ,
2021-11-18 19:31:13 +00:00
]
if not all ( needed_args ) :
raise BuildError (
2023-11-28 18:11:53 +00:00
" iOS/MacOS framework build on MacOS canceled due to missing arguments: "
2022-04-26 16:35:16 +00:00
+ " , " . join ( val for val , cond in zip ( arg_names , needed_args ) if not cond )
)
2024-03-20 17:55:19 +00:00
# note: this value is mainly used in framework_info.json file to specify the build osx type
platform_name = " macabi " if args . macos == " Catalyst " else args . apple_sysroot
2021-11-18 19:31:13 +00:00
cmake_args + = [
" -Donnxruntime_BUILD_SHARED_LIB=ON " ,
2023-11-28 18:11:53 +00:00
" -DCMAKE_OSX_SYSROOT= " + args . apple_sysroot ,
2021-11-18 19:31:13 +00:00
" -DCMAKE_OSX_DEPLOYMENT_TARGET= " + args . apple_deploy_target ,
# we do not need protoc binary for ios cross build
" -Dprotobuf_BUILD_PROTOC_BINARIES=OFF " ,
2024-03-20 17:55:19 +00:00
" -DPLATFORM_NAME= " + platform_name ,
2021-11-18 19:31:13 +00:00
]
2023-11-28 18:11:53 +00:00
if args . ios :
cmake_args + = [
" -DCMAKE_SYSTEM_NAME=iOS " ,
" -DCMAKE_TOOLCHAIN_FILE= "
+ ( args . ios_toolchain_file if args . ios_toolchain_file else " ../cmake/onnxruntime_ios.toolchain.cmake " ) ,
]
2024-03-20 17:55:19 +00:00
# for catalyst build, we need to manually specify cflags for target e.g. x86_64-apple-ios14.0-macabi, etc.
# https://forums.developer.apple.com/forums/thread/122571
if args . macos == " Catalyst " :
macabi_target = f " { args . osx_arch } -apple-ios { args . apple_deploy_target } -macabi "
cmake_args + = [
" -DCMAKE_CXX_COMPILER_TARGET= " + macabi_target ,
" -DCMAKE_C_COMPILER_TARGET= " + macabi_target ,
" -DCMAKE_CC_COMPILER_TARGET= " + macabi_target ,
f " -DCMAKE_CXX_FLAGS=--target= { macabi_target } " ,
f " -DCMAKE_CXX_FLAGS_RELEASE=-O3 -DNDEBUG --target= { macabi_target } " ,
f " -DCMAKE_C_FLAGS=--target= { macabi_target } " ,
f " -DCMAKE_C_FLAGS_RELEASE=-O3 -DNDEBUG --target= { macabi_target } " ,
f " -DCMAKE_CC_FLAGS=--target= { macabi_target } " ,
f " -DCMAKE_CC_FLAGS_RELEASE=-O3 -DNDEBUG --target= { macabi_target } " ,
]
2024-04-24 01:15:07 +00:00
if args . visionos :
cmake_args + = [
" -DCMAKE_SYSTEM_NAME=visionOS " ,
" -DCMAKE_TOOLCHAIN_FILE= "
+ (
args . visionos_toolchain_file
if args . visionos_toolchain_file
else " ../cmake/onnxruntime_visionos.toolchain.cmake "
) ,
" -Donnxruntime_ENABLE_CPUINFO=OFF " ,
]
2019-07-24 20:20:05 +00:00
build ONNXRuntime into WebAssembly (#6478)
* Simplified version of WebAssembly support to keep most of existing data structures and add cmake using Ninja and emcmake
* Clean up CMakeLists.txt and add an example to create and compute a kernel
* Load a model from bytes and remove graph building steps
* Add all cpu and contrib ops with mlas library
* WebAssembly build with Onnxruntime C/CXX API
* Use protobuf cmakefile directory instead of adding every necessary source file
* Fix invalid output at example
* add missing files
* Change an example to use Teams model and support ort mobile format
* add API for javascript
* fix input releasing in _ort_run()
* update API
* Let onnxruntime cmake build WebAssembly with option '--wasm'
* allow one-step building for wasm
* Make build script working on Linux and MacOS
* Fix broken build from Windows command
* Enable unit test on building WebAssembly
* Resolve comments
* update build flags
* wasm conv improvement from: 1) GemmV; 2) Depthwise direct convolution 3x3; 3) Direct convolution 3x3
* Cleaned mlas unittest.
* use glob
* update comments
* Update baseline due to loss scale fix (#6948)
* fix stream sync issue (#6954)
* Enable type reduction in EyeLike, Mod, random.cc CPU kernels. (#6960)
* Update EyeLike CPU kernel.
* Update Mod CPU kernel.
* Update Multinomial CPU kernel.
* Slight improvement to Pad CPU kernel binary size.
* Update RandomNormal[Like], RandomUniform[Like] CPU kernels.
* Fix warning from setting multiple MSVC warning level options. (#6917)
Fix warning from setting multiple MSVC warning level options. Replace an existing /Wn flag instead of always appending a new one.
* MLAS: quantized GEMM update (#6916)
Various updates to the int8_t GEMMs:
1) Add ARM64 udot kernel to take advantage of dot product instructions available in newer cores. Some models run 4x faster than the stock implementation we used before.
2) Refactor the x64 kernels to share common code for AVX2(u8u8/u8s8/avxvnni) vs AVX512(u8u8/u8s8/avx512vnni) to reduce binary size.
3) Extend kernels to support per-column zero points for matrix B. This is not currently wired to an operator.
* Implement QLinearAveragePool with unit tests. (#6896)
Implement QLinearAveragePool with unit tests.
* Attention fusion detect num_heads and hidden_size automatically (#6920)
* fixed type to experimental session constructor (#6950)
* fixed type to experimental session constructor
Co-authored-by: David Medine <david.medine@brainproducts.com>
* Update onnxruntime_perf_test.exe to accept free dimension overrides (#6962)
Co-authored-by: Ori Levari <orlevari@microsoft.com>
* Fix possible fd leak in NNAPI (#6966)
* Release buffers for prepacked tensors (#6820)
Unsolved problems:
1. One test failure was caused by a bug in Cudnn rnn kernels, when they can allocate a buffer and partially initialize it, the garbage data near tail of the buffer caused problem in some of the hardware. To attack this problem in a broader sense, should we add code in our allocators, and during a memory fuzzing test, fill an allocated buffer with garbage before returning to the caller?
2. Prepacking is used more widely than we know. For instance, Cudnn rnn kernels also cache their weights. They mix several weight tensors together into a single buffer, and never touch the original weight tensors again. This is the same idea as pre-packing, but they didn't override the virtual function, and they never tried to release those weight tensors, leading to memory waste. It also seems to me that some other kernels have similar behavior. I wonder how much memory we could save if we tried to clean those up too.
3. Turning off memory pattern planning does increase memory fragmentation, leading to out of memory error in some training test cases. Perhaps we can revisit the idea of pushing kernels-creation stage earlier, and then during initializer deserialization, we only avoid tracing those that will be prepacked.
* Enable type reduction for Range, ReverseSequence, ScatterND, Split, and Unique CPU kernels. (#6963)
* add CI
* fix test in ci
* fix flags for nsync in wasm build
* add copyright banner
* fix wasm source glob
* add missing exports
* resolve comments
* Perf gain by make packb wide to 4 from 16 on GEMM for WASM.
Remove no need direct conv in previous perf tuning.
* fix buildbreak introduced from latest master merge
* fix buildbreak in mlasi.h
* resolve all comments except MLAS
* rewrite the 3 packb-related functions for WASM_SCALAR separately rather than using #ifdef in each,
and make other changes according to PR feedback in mlas.
* More complete scalar path in sgemm from Tracy.
* Fix edge case handling in depthwise conv2d kernel 3x3. where:
*) support input W==1 and H==1
*) recalc in accurate pad_right and pad_bottom
*) support hidden pad_right == 2 or pad_bottom == 2 when W == 1 or H==1 and no pad left/top
* Add more test coverage for conv depthwise from Tracy.
Fix one typo according to PR.
* resolve comments
* replace typedef by using
* do not use throw in OrtRun()
* output error message
Co-authored-by: Sunghoon <35605090+hanbitmyths@users.noreply.github.com>
Co-authored-by: Lei Zhang <zhang.huanning@hotmail.com>
Co-authored-by: Wei-Sheng Chin <wschin@outlook.com>
Co-authored-by: Tianlei Wu <tlwu@microsoft.com>
Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Co-authored-by: Tracy Sharpe <42477615+tracysh@users.noreply.github.com>
Co-authored-by: David Medine <david.eric.medine@gmail.com>
Co-authored-by: David Medine <david.medine@brainproducts.com>
Co-authored-by: Ori Levari <ori.levari@microsoft.com>
Co-authored-by: Ori Levari <orlevari@microsoft.com>
Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com>
Co-authored-by: Chen Fu <chenfucs@gmail.com>
2021-04-06 23:18:10 +00:00
if args . build_wasm :
emsdk_dir = os . path . join ( cmake_dir , " external " , " emsdk " )
2022-04-26 16:35:16 +00:00
emscripten_cmake_toolchain_file = os . path . join (
emsdk_dir , " upstream " , " emscripten " , " cmake " , " Modules " , " Platform " , " Emscripten.cmake "
)
cmake_args + = [ " -DCMAKE_TOOLCHAIN_FILE= " + emscripten_cmake_toolchain_file ]
build ONNXRuntime into WebAssembly (#6478)
* Simplified version of WebAssembly support to keep most of existing data structures and add cmake using Ninja and emcmake
* Clean up CMakeLists.txt and add an example to create and compute a kernel
* Load a model from bytes and remove graph building steps
* Add all cpu and contrib ops with mlas library
* WebAssembly build with Onnxruntime C/CXX API
* Use protobuf cmakefile directory instead of adding every necessary source file
* Fix invalid output at example
* add missing files
* Change an example to use Teams model and support ort mobile format
* add API for javascript
* fix input releasing in _ort_run()
* update API
* Let onnxruntime cmake build WebAssembly with option '--wasm'
* allow one-step building for wasm
* Make build script working on Linux and MacOS
* Fix broken build from Windows command
* Enable unit test on building WebAssembly
* Resolve comments
* update build flags
* wasm conv improvement from: 1) GemmV; 2) Depthwise direct convolution 3x3; 3) Direct convolution 3x3
* Cleaned mlas unittest.
* use glob
* update comments
* Update baseline due to loss scale fix (#6948)
* fix stream sync issue (#6954)
* Enable type reduction in EyeLike, Mod, random.cc CPU kernels. (#6960)
* Update EyeLike CPU kernel.
* Update Mod CPU kernel.
* Update Multinomial CPU kernel.
* Slight improvement to Pad CPU kernel binary size.
* Update RandomNormal[Like], RandomUniform[Like] CPU kernels.
* Fix warning from setting multiple MSVC warning level options. (#6917)
Fix warning from setting multiple MSVC warning level options. Replace an existing /Wn flag instead of always appending a new one.
* MLAS: quantized GEMM update (#6916)
Various updates to the int8_t GEMMs:
1) Add ARM64 udot kernel to take advantage of dot product instructions available in newer cores. Some models run 4x faster than the stock implementation we used before.
2) Refactor the x64 kernels to share common code for AVX2(u8u8/u8s8/avxvnni) vs AVX512(u8u8/u8s8/avx512vnni) to reduce binary size.
3) Extend kernels to support per-column zero points for matrix B. This is not currently wired to an operator.
* Implement QLinearAveragePool with unit tests. (#6896)
Implement QLinearAveragePool with unit tests.
* Attention fusion detect num_heads and hidden_size automatically (#6920)
* fixed type to experimental session constructor (#6950)
* fixed type to experimental session constructor
Co-authored-by: David Medine <david.medine@brainproducts.com>
* Update onnxruntime_perf_test.exe to accept free dimension overrides (#6962)
Co-authored-by: Ori Levari <orlevari@microsoft.com>
* Fix possible fd leak in NNAPI (#6966)
* Release buffers for prepacked tensors (#6820)
Unsolved problems:
1. One test failure was caused by a bug in Cudnn rnn kernels, when they can allocate a buffer and partially initialize it, the garbage data near tail of the buffer caused problem in some of the hardware. To attack this problem in a broader sense, should we add code in our allocators, and during a memory fuzzing test, fill an allocated buffer with garbage before returning to the caller?
2. Prepacking is used more widely than we know. For instance, Cudnn rnn kernels also cache their weights. They mix several weight tensors together into a single buffer, and never touch the original weight tensors again. This is the same idea as pre-packing, but they didn't override the virtual function, and they never tried to release those weight tensors, leading to memory waste. It also seems to me that some other kernels have similar behavior. I wonder how much memory we could save if we tried to clean those up too.
3. Turning off memory pattern planning does increase memory fragmentation, leading to out of memory error in some training test cases. Perhaps we can revisit the idea of pushing kernels-creation stage earlier, and then during initializer deserialization, we only avoid tracing those that will be prepacked.
* Enable type reduction for Range, ReverseSequence, ScatterND, Split, and Unique CPU kernels. (#6963)
* add CI
* fix test in ci
* fix flags for nsync in wasm build
* add copyright banner
* fix wasm source glob
* add missing exports
* resolve comments
* Perf gain by make packb wide to 4 from 16 on GEMM for WASM.
Remove no need direct conv in previous perf tuning.
* fix buildbreak introduced from latest master merge
* fix buildbreak in mlasi.h
* resolve all comments except MLAS
* rewrite the 3 packb-related functions for WASM_SCALAR separately rather than using #ifdef in each,
and make other changes according to PR feedback in mlas.
* More complete scalar path in sgemm from Tracy.
* Fix edge case handling in depthwise conv2d kernel 3x3. where:
*) support input W==1 and H==1
*) recalc in accurate pad_right and pad_bottom
*) support hidden pad_right == 2 or pad_bottom == 2 when W == 1 or H==1 and no pad left/top
* Add more test coverage for conv depthwise from Tracy.
Fix one typo according to PR.
* resolve comments
* replace typedef by using
* do not use throw in OrtRun()
* output error message
Co-authored-by: Sunghoon <35605090+hanbitmyths@users.noreply.github.com>
Co-authored-by: Lei Zhang <zhang.huanning@hotmail.com>
Co-authored-by: Wei-Sheng Chin <wschin@outlook.com>
Co-authored-by: Tianlei Wu <tlwu@microsoft.com>
Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Co-authored-by: Tracy Sharpe <42477615+tracysh@users.noreply.github.com>
Co-authored-by: David Medine <david.eric.medine@gmail.com>
Co-authored-by: David Medine <david.medine@brainproducts.com>
Co-authored-by: Ori Levari <ori.levari@microsoft.com>
Co-authored-by: Ori Levari <orlevari@microsoft.com>
Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com>
Co-authored-by: Chen Fu <chenfucs@gmail.com>
2021-04-06 23:18:10 +00:00
if args . disable_wasm_exception_catching :
# WebAssembly unittest requires exception catching to work. If this feature is disabled, we do not build
# unit test.
cmake_args + = [
" -Donnxruntime_BUILD_UNIT_TESTS=OFF " ,
]
2022-03-22 18:55:45 +00:00
# add default emscripten settings
emscripten_settings = normalize_arg_list ( args . emscripten_settings )
# set -s MALLOC
if args . wasm_malloc is not None :
2022-04-26 16:35:16 +00:00
add_default_definition ( emscripten_settings , " MALLOC " , args . wasm_malloc )
add_default_definition ( emscripten_settings , " MALLOC " , " dlmalloc " )
2022-03-22 18:55:45 +00:00
2023-05-08 23:49:47 +00:00
# set -s STACK_SIZE=5242880
add_default_definition ( emscripten_settings , " STACK_SIZE " , " 5242880 " )
2022-04-26 16:35:16 +00:00
if emscripten_settings :
2022-03-22 18:55:45 +00:00
cmake_args + = [ f " -Donnxruntime_EMSCRIPTEN_SETTINGS= { ' ; ' . join ( emscripten_settings ) } " ]
2021-08-28 04:45:52 +00:00
# Append onnxruntime-extensions cmake options
if args . use_extensions :
cmake_args + = [ " -Donnxruntime_USE_EXTENSIONS=ON " ]
# default path of onnxruntime-extensions, using git submodule
2023-02-23 03:42:36 +00:00
for config in configs :
onnxruntime_extensions_path = os . path . join ( build_dir , config , " _deps " , " extensions-src " )
onnxruntime_extensions_path = os . path . abspath ( onnxruntime_extensions_path )
if args . extensions_overridden_path and os . path . exists ( args . extensions_overridden_path ) :
# use absolute path here because onnxruntime-extensions is outside onnxruntime
onnxruntime_extensions_path = os . path . abspath ( args . extensions_overridden_path )
cmake_args + = [ " -Donnxruntime_EXTENSIONS_OVERRIDDEN=ON " ]
print ( " [onnxruntime-extensions] Loading onnxruntime-extensions from: " , onnxruntime_extensions_path )
else :
print ( " [onnxruntime-extensions] Loading onnxruntime-extensions from: FetchContent " )
2021-08-28 04:45:52 +00:00
2023-02-23 03:42:36 +00:00
cmake_args + = [ " -Donnxruntime_EXTENSIONS_PATH= " + onnxruntime_extensions_path ]
2021-08-28 04:45:52 +00:00
2023-02-23 03:42:36 +00:00
if is_reduced_ops_build ( args ) :
operators_config_file = os . path . abspath ( args . include_ops_by_config )
cmake_tool_dir = os . path . join ( onnxruntime_extensions_path , " tools " )
2021-08-28 04:45:52 +00:00
2023-02-23 03:42:36 +00:00
# generate _selectedoplist.cmake by operators config file
run_subprocess ( [ sys . executable , " gen_selectedops.py " , operators_config_file ] , cwd = cmake_tool_dir )
2021-08-28 04:45:52 +00:00
2019-03-23 00:41:21 +00:00
if path_to_protoc_exe :
2023-02-23 03:42:36 +00:00
cmake_args + = [ f " -DONNX_CUSTOM_PROTOC_EXECUTABLE= { path_to_protoc_exe } " ]
2019-03-09 01:42:20 +00:00
2020-07-06 23:34:34 +00:00
if args . fuzz_testing :
2022-04-26 16:35:16 +00:00
if not (
args . build_shared_lib
and is_windows ( )
2023-05-16 17:34:34 +00:00
and args . cmake_generator == " Visual Studio 17 2022 "
2022-04-26 16:35:16 +00:00
and args . use_full_protobuf
) :
raise BuildError ( " Fuzz test has only be tested with build shared libs option using MSVC on windows " )
2020-07-06 23:34:34 +00:00
cmake_args + = [
" -Donnxruntime_BUILD_UNIT_TESTS=ON " ,
" -Donnxruntime_FUZZ_TEST=ON " ,
2022-04-26 16:35:16 +00:00
" -Donnxruntime_USE_FULL_PROTOBUF=ON " ,
]
2020-07-06 23:34:34 +00:00
2023-04-10 17:41:04 +00:00
if args . enable_lazy_tensor :
2021-08-06 15:30:27 +00:00
import torch
2022-04-26 16:35:16 +00:00
2024-07-24 18:50:11 +00:00
cmake_args + = [ f " -Donnxruntime_PREBUILT_PYTORCH_PATH= { os . path . dirname ( torch . __file__ ) } " ]
2022-04-26 16:35:16 +00:00
cmake_args + = [ " -D_GLIBCXX_USE_CXX11_ABI= " + str ( int ( torch . _C . _GLIBCXX_USE_CXX11_ABI ) ) ]
2021-08-06 15:30:27 +00:00
2023-01-11 20:25:04 +00:00
if args . use_azure :
add_default_definition ( cmake_extra_defines , " onnxruntime_USE_AZURE " , " ON " )
CloudEP (#13855)
Implement CloudEP for hybrid inferencing.
The PR introduces zero new API, customers could configure session and
run options to do inferencing with Azure [triton
endpoint.](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-deploy-with-triton?tabs=azure-cli%2Cendpoint)
A sample configuration in Python looks like:
```
sess_opt.add_session_config_entry('cloud.endpoint_type', 'triton');
sess_opt.add_session_config_entry('cloud.uri', 'https://cloud.com');
sess_opt.add_session_config_entry('cloud.model_name', 'detection2');
sess_opt.add_session_config_entry('cloud.model_version', '7'); // optional, default 1
sess_opt.add_session_config_entry('cloud.verbose', '1'); // optional, default '0', meaning no verbose
...
run_opt.add_run_config_entry('use_cloud', '1') # 0 for local inferencing, 1 for cloud endpoint.
run_opt.add_run_config_entry('cloud.auth_key', '...')
...
sess.run(None, {'input':input_}, run_opt)
```
Co-authored-by: Randy Shuai <rashuai@microsoft.com>
2023-01-03 18:03:15 +00:00
Implement mutex-free spin lock for task queue (#14834)
Implemented "lock-free" spinlock to save CPU usage on context switching.
The change has been tested on queene service of Ads team, the lock-free
version of ort (40 threads) saves CPU usage on gen8 (128 logical
processors on 8 numa nodes) windows by nearly half, from 65% to 35%.
For 32 cores, the curve is flat:
Anubis, 32 vCPU, windows, hugging face models,
95 percentile E2E latency in ms:
model | mutex(ms) | mutex-free
--- | --- | ---
alvert_base_v2 | 34.21 | 34.09
bert_large_uncased | 116.27| 117.84
bart_base | 72.06 | 71.99
distilgpt2 | 25.43 | 25.02
vit_base_patch16_224 | 37.33 | 37.76
Anubis, 32 vCPU win, Linux, 1st party models,
95 percentile E2E latency in ms:
model | mutex(ms) | mutex-free
--- | --- | ---
deepthink_v2 | 24.35 | 22.95
bing_feeds | 36.96 | 36.48
deep_writes | 14.46 | 14.32
keypoints | 9.34 | 7.69
model11 | 1.71 | 1.66
model12 | 1.82 | 1.44
model2 | 4.21 | 3.95
model6 | 1.08 | 1.05
agiencoder | 0.99 | 0.93
geminet_transformer | 5.32 | 5.24
---------
Co-authored-by: Randy Shuai <rashuai@microsoft.com>
2023-05-19 17:12:10 +00:00
if args . use_lock_free_queue :
add_default_definition ( cmake_extra_defines , " onnxruntime_USE_LOCK_FREE_QUEUE " , " ON " )
2024-01-12 15:24:40 +00:00
if is_windows ( ) :
if args . use_cache :
add_default_definition (
cmake_extra_defines , " CMAKE_MSVC_DEBUG_INFORMATION_FORMAT " , " $<$<CONFIG:Debug,RelWithDebInfo>:Embedded> "
)
else :
# Always enable debug info even in release build. The debug information is in separated *.pdb files that
# can be easily discarded when debug symbols are not needed. We enable it by default because many auditing
# tools need to use the symbols.
add_default_definition ( cmake_extra_defines , " CMAKE_MSVC_DEBUG_INFORMATION_FORMAT " , " ProgramDatabase " )
2024-02-27 16:56:16 +00:00
if number_of_parallel_jobs ( args ) > 0 :
# https://devblogs.microsoft.com/cppblog/improved-parallelism-in-msbuild/
# NOTE: this disables /MP if set (according to comments on blog post).
# By default, MultiProcMaxCount and CL_MPCount value are equal to the number of CPU logical processors.
# See logic around setting CL_MPCount below
cmake_args + = [ " -DCMAKE_VS_GLOBALS=UseMultiToolTask=true;EnforceProcessCountAcrossBuilds=true " ]
2023-03-24 22:29:03 +00:00
cmake_args + = [ f " -D { define } " for define in cmake_extra_defines ]
2018-11-20 00:48:22 +00:00
2020-08-17 16:40:31 +00:00
cmake_args + = cmake_extra_args
2018-11-20 00:48:22 +00:00
2020-04-19 03:48:30 +00:00
# ADO pipelines will store the pipeline build number
# (e.g. 191101-2300.1.master) and source version in environment
# variables. If present, use these values to define the
2020-02-07 19:00:28 +00:00
# WinML/ORT DLL versions.
2023-04-17 17:11:44 +00:00
build_number = os . getenv ( " Build_BuildNumber " ) # noqa: SIM112
source_version = os . getenv ( " Build_SourceVersion " ) # noqa: SIM112
2020-02-07 19:00:28 +00:00
if build_number and source_version :
2022-04-26 16:35:16 +00:00
build_matches = re . fullmatch ( r " ( \ d \ d)( \ d \ d)( \ d \ d)( \ d \ d) \ .( \ d+) " , build_number )
2020-02-07 19:00:28 +00:00
if build_matches :
2023-03-24 22:29:03 +00:00
YY = build_matches . group ( 2 ) # noqa: N806
MM = build_matches . group ( 3 ) # noqa: N806
DD = build_matches . group ( 4 ) # noqa: N806
2020-03-13 23:54:55 +00:00
2020-02-07 19:00:28 +00:00
# Get ORT major and minor number
2022-04-26 16:35:16 +00:00
with open ( os . path . join ( source_dir , " VERSION_NUMBER " ) ) as f :
2020-02-07 19:00:28 +00:00
first_line = f . readline ( )
ort_version_matches = re . match ( r " ( \ d+).( \ d+) " , first_line )
if not ort_version_matches :
raise BuildError ( " Couldn ' t read version from VERSION_FILE " )
ort_major = ort_version_matches . group ( 1 )
ort_minor = ort_version_matches . group ( 2 )
2020-04-19 03:48:30 +00:00
# Example (BuildNumber: 191101-2300.1.master,
# SourceVersion: 0bce7ae6755c792eda558e5d27ded701707dc404)
2020-02-07 19:00:28 +00:00
# MajorPart = 1
# MinorPart = 0
# BuildPart = 1911
# PrivatePart = 123
# String = 191101-2300.1.master.0bce7ae
2020-04-19 03:48:30 +00:00
cmake_args + = [
2023-03-24 22:29:03 +00:00
f " -DVERSION_MAJOR_PART= { ort_major } " ,
f " -DVERSION_MINOR_PART= { ort_minor } " ,
f " -DVERSION_BUILD_PART= { YY } " ,
f " -DVERSION_PRIVATE_PART= { MM } { DD } " ,
f " -DVERSION_STRING= { ort_major } . { ort_minor } . { build_number } . { source_version [ 0 : 7 ] } " ,
2020-04-19 03:48:30 +00:00
]
2024-01-29 20:45:38 +00:00
2020-03-13 23:54:55 +00:00
for config in configs :
2024-01-29 20:45:38 +00:00
cflags = [ ]
cxxflags = None
ldflags = None
cudaflags = [ ]
if is_windows ( ) and not args . ios and not args . android and not args . build_wasm :
njobs = number_of_parallel_jobs ( args )
2024-06-19 06:14:08 +00:00
if args . use_cuda :
cudaflags . append ( " -allow-unsupported-compiler " )
2024-01-29 20:45:38 +00:00
if njobs > 1 :
if args . parallel == 0 :
cflags + = [ " /MP " ]
else :
cflags + = [ " /MP %d " % njobs ]
2024-01-12 15:24:40 +00:00
# Setup default values for cflags/cxxflags/ldflags.
# The values set here are purely for security and compliance purposes. ONNX Runtime should work fine without these flags.
if (
2024-01-29 20:45:38 +00:00
( args . use_binskim_compliant_compile_flags or args . enable_address_sanitizer )
2024-01-12 15:24:40 +00:00
and not args . ios
and not args . android
and not args . build_wasm
) :
if is_windows ( ) :
2024-10-21 22:32:14 +00:00
cflags + = [ " /guard:cf " , " /DWIN32 " , " /D_WINDOWS " ]
2024-01-12 15:24:40 +00:00
if not args . use_gdk :
# Target Windows 10
cflags + = [
" /DWINAPI_FAMILY=100 " ,
" /DWINVER=0x0A00 " ,
" /D_WIN32_WINNT=0x0A00 " ,
" /DNTDDI_VERSION=0x0A000000 " ,
]
# The "/profile" flag implies "/DEBUG:FULL /DEBUGTYPE:cv,fixup /OPT:REF /OPT:NOICF /INCREMENTAL:NO /FIXED:NO". We set it for satisfying a Microsoft internal compliance requirement. External users
# do not need to have it.
ldflags = [ " /profile " , " /DYNAMICBASE " ]
2024-01-29 20:45:38 +00:00
# Address Sanitizer libs do not have a Qspectre version, so the two cannot both be enabled.
if not args . enable_address_sanitizer :
2024-02-15 02:35:56 +00:00
# Also enable a special perf patch that was made for Intel Meteor Lake mobile CPUs
cflags + = [ " /Qspectre " , " /DONNXRUNTIME_ENABLE_INTEL_METEOR_LAKE_MOBILE_PLATFORM_PERF_PATCH " ]
2024-01-12 15:24:40 +00:00
if config == " Release " :
cflags + = [ " /O2 " , " /Ob2 " , " /DNDEBUG " ]
elif config == " RelWithDebInfo " :
cflags + = [ " /O2 " , " /Ob1 " , " /DNDEBUG " ]
elif config == " Debug " :
cflags + = [ " /Ob0 " , " /Od " , " /RTC1 " ]
elif config == " MinSizeRel " :
cflags + = [ " /O1 " , " /Ob1 " , " /DNDEBUG " ]
2024-01-29 20:45:38 +00:00
if args . enable_address_sanitizer :
cflags + = [ " /fsanitize=address " ]
2024-01-12 15:24:40 +00:00
cxxflags = cflags . copy ( )
2024-01-14 19:36:49 +00:00
if args . use_cuda :
# On Windows, nvcc passes /EHsc to the host compiler by default.
cuda_compile_flags_str = " "
for compile_flag in cflags :
if compile_flag . startswith ( " /D " ) :
cudaflags . append ( compile_flag )
else :
cuda_compile_flags_str = cuda_compile_flags_str + " " + compile_flag
if len ( cuda_compile_flags_str ) != 0 :
2024-07-24 18:50:11 +00:00
cudaflags . append ( f ' -Xcompiler= " { cuda_compile_flags_str } " ' )
2024-01-12 15:24:40 +00:00
elif is_linux ( ) or is_macOS ( ) :
if is_linux ( ) :
2024-01-14 19:36:49 +00:00
ldflags = [ " -Wl,-Bsymbolic-functions " , " -Wl,-z,relro " , " -Wl,-z,now " , " -Wl,-z,noexecstack " ]
2024-01-12 15:24:40 +00:00
else :
ldflags = [ ]
if config == " Release " :
cflags = [
" -DNDEBUG " ,
" -Wp,-D_FORTIFY_SOURCE=2 " ,
" -Wp,-D_GLIBCXX_ASSERTIONS " ,
" -fstack-protector-strong " ,
" -O3 " ,
" -pipe " ,
]
if is_linux ( ) :
ldflags + = [ " -Wl,--strip-all " ]
elif config == " RelWithDebInfo " :
cflags = [
" -DNDEBUG " ,
" -Wp,-D_FORTIFY_SOURCE=2 " ,
" -Wp,-D_GLIBCXX_ASSERTIONS " ,
" -fstack-protector-strong " ,
" -O3 " ,
" -pipe " ,
" -ggdb3 " ,
]
elif config == " Debug " :
cflags = [ " -ggdb3 " , " -O0 " ]
if args . enable_address_sanitizer :
cflags + = [ " -fsanitize=address " ]
ldflags + = [ " -fsanitize=address " ]
elif config == " MinSizeRel " :
cflags = [
" -DNDEBUG " ,
" -Wp,-D_FORTIFY_SOURCE=2 " ,
" -Wp,-D_GLIBCXX_ASSERTIONS " ,
" -fstack-protector-strong " ,
" -Os " ,
" -pipe " ,
" -ggdb3 " ,
]
if is_linux ( ) and platform . machine ( ) == " x86_64 " :
# The following flags need GCC 8 or newer
2024-01-25 00:27:05 +00:00
cflags + = [ " -fstack-clash-protection " ]
if not args . rv64 :
cflags + = [ " -fcf-protection " ]
2024-01-12 15:24:40 +00:00
cxxflags = cflags . copy ( )
2024-01-14 19:36:49 +00:00
if args . use_cuda :
cudaflags = cflags . copy ( )
2024-01-29 20:45:38 +00:00
if cxxflags is None and cflags is not None and len ( cflags ) != 0 :
cxxflags = cflags . copy ( )
2018-11-20 00:48:22 +00:00
config_build_dir = get_config_build_dir ( build_dir , config )
os . makedirs ( config_build_dir , exist_ok = True )
2022-09-07 22:11:18 +00:00
if args . use_tvm :
2022-01-27 19:31:13 +00:00
os . environ [ " PATH " ] = (
2022-04-26 16:35:16 +00:00
os . path . join ( config_build_dir , " _deps " , " tvm-build " )
+ os . pathsep
+ os . path . join ( config_build_dir , " _deps " , " tvm-src " )
+ os . pathsep
+ os . path . dirname ( sys . executable )
+ os . pathsep
+ os . environ [ " PATH " ]
)
Improve dependency management (#13523)
## Description
1. Convert some git submodules to cmake external projects
2. Update nsync from
[1.23.0](https://github.com/google/nsync/releases/tag/1.23.0) to
[1.25.0](https://github.com/google/nsync/releases/tag/1.25.0)
3. Update re2 from 2021-06-01 to 2022-06-01
4. Update wil from an old commit to 1.0.220914.1 tag
5. Update gtest to a newer commit so that it can optionally leverage
absl/re2 for parsing command line flags.
The following git submodules are deleted:
1. FP16
2. safeint
3. XNNPACK
4. cxxopts
5. dlpack
7. flatbuffers
8. googlebenchmark
9. json
10. mimalloc
11. mp11
12. pthreadpool
More will come.
## Motivation and Context
There are 3 ways of integrating 3rd party C/C++ libraries into ONNX
Runtime:
1. Install them to a system location, then use cmake's find_package
module to locate them.
2. Use git submodules
6. Use cmake's external projects(externalproject_add).
At first when this project was just started, we considered both option 2
and option 3. We preferred option 2 because:
1. It's easier to handle authentication. At first this project was not
open source, and it had some other non-public dependencies. If we use
git submodule, ADO will handle authentication smoothly. Otherwise we
need to manually pass tokens around and be very careful on not exposing
them in build logs.
2. At that time, cmake fetched dependencies after "cmake" finished
generating vcprojects/makefiles. So it was very difficult to make cflags
consistent. Since cmake 3.11, it has a new command: FetchContent, which
fetches dependencies when it generates vcprojects/makefiles just before
add_subdirectories, so the parent project's variables/settings can be
easily passed to the child projects.
And when the project went on, we had some new concerns:
1. As we started to have more and more EPs and build configs, the number
of submodules grew quickly. For most developers, most ORT submodules are
not relevant to them. They shouldn't need to download all of them.
2. It is impossible to let two different build configs use two different
versions of the same dependency. For example, right now we have protobuf
3.18.3 in the submodules. Then every EP must use the same version.
Whenever we have a need to upgrade protobuf, we need to coordinate
across the whole team and many external developers. I can't manage it
anymore.
3. Some projects want to manage the dependencies in a different way,
either because of their preference or because of compliance
requirements. For example, some Microsoft teams want to use vcpkg, but
we don't want to force every user of onnxruntime using vcpkg.
7. Someone wants to dynamically link to protobuf, but our build script
only does static link.
8. Hard to handle security vulnerabilities. For example, whenever
protobuf has a security patch, we have a lot of things to do. But if we
allowed people to build ORT with a different version of protobuf without
changing ORT's source code, customers who build ORT from source will
be able to act on such things in a quicker way. They will not need to
wait ORT having a patch release.
9. Every time we do a release, github will also publish a source file
zip file and a source file tarball for us. But they are not usable,
because they miss submodules.
### New features
After this change, users will be able to:
1. Build the dependencies in the way they want, then install them to
somewhere(for example, /usr or a temp folder).
2. Or download the dependencies by using cmake commands from these
dependencies official website
3. Similar to the above, but use your private mirrors to mitigate supply
chain risks.
4. Use different versions of the dependencies, as long as our source
code is compatible with them. For example, you can't use
protobuf 3.20.x, as it needs code changes in ONNX Runtime.
6. Only download the things the current build needs.
10. Avoid building external dependencies again and again in every build.
### Breaking change
The onnxruntime_PREFER_SYSTEM_LIB build option is removed; from now on you can think of it
as being ON by default. If you don't like the new behavior, you can set FETCHCONTENT_TRY_FIND_PACKAGE_MODE to NEVER.
Besides, for who relied on the onnxruntime_PREFER_SYSTEM_LIB build
option, please be aware that this PR will change find_package calls from
Module mode to Config mode. For example, in the past if you have
installed protobuf from apt-get from ubuntu 20.04's official repo,
find_package can find it and use it. But after this PR, it won't. This
is because that protobuf version provided by Ubuntu 20.04 is too old to
support the "config mode". It can be resolved by getting a newer version
of protobuf from somewhere.
2022-12-01 17:51:59 +00:00
preinstalled_dir = Path ( build_dir ) / config
2024-01-12 15:24:40 +00:00
temp_cmake_args = cmake_args . copy ( )
2024-01-29 20:45:38 +00:00
if cflags is not None and cxxflags is not None and len ( cflags ) != 0 and len ( cxxflags ) != 0 :
2024-01-12 15:24:40 +00:00
temp_cmake_args + = [
2024-07-24 18:50:11 +00:00
" -DCMAKE_C_FLAGS= {} " . format ( " " . join ( cflags ) ) ,
" -DCMAKE_CXX_FLAGS= {} " . format ( " " . join ( cxxflags ) ) ,
2024-01-12 15:24:40 +00:00
]
2024-01-14 19:36:49 +00:00
if cudaflags is not None and len ( cudaflags ) != 0 :
2024-07-24 18:50:11 +00:00
temp_cmake_args + = [ " -DCMAKE_CUDA_FLAGS_INIT= {} " . format ( " " . join ( cudaflags ) ) ]
2024-01-12 15:24:40 +00:00
if ldflags is not None and len ( ldflags ) != 0 :
temp_cmake_args + = [
2024-07-24 18:50:11 +00:00
" -DCMAKE_EXE_LINKER_FLAGS_INIT= {} " . format ( " " . join ( ldflags ) ) ,
" -DCMAKE_MODULE_LINKER_FLAGS_INIT= {} " . format ( " " . join ( ldflags ) ) ,
" -DCMAKE_SHARED_LINKER_FLAGS_INIT= {} " . format ( " " . join ( ldflags ) ) ,
2024-01-12 15:24:40 +00:00
]
2020-04-19 03:48:30 +00:00
run_subprocess (
2023-03-24 22:29:03 +00:00
[
2024-01-12 15:24:40 +00:00
* temp_cmake_args ,
2023-03-24 22:29:03 +00:00
f " -DCMAKE_BUILD_TYPE= { config } " ,
2024-02-20 21:40:35 +00:00
(
f " -DCMAKE_PREFIX_PATH= { build_dir } / { config } /installed "
if preinstalled_dir . exists ( ) and not ( args . arm64 or args . arm64ec or args . arm )
else " "
) ,
2022-04-26 16:35:16 +00:00
] ,
cwd = config_build_dir ,
2022-06-04 03:00:54 +00:00
cuda_home = cuda_home ,
2022-04-26 16:35:16 +00:00
)
2018-11-20 00:48:22 +00:00
def clean_targets(cmake_path, build_dir, configs):
    """Run the CMake ``clean`` target once for every build configuration.

    Args:
        cmake_path: CMake executable used to drive the underlying build tool.
        build_dir: Root build directory; the per-configuration directory is
            resolved from it with ``get_config_build_dir``.
        configs: Iterable of configuration names to clean.
    """
    for cfg in configs:
        log.info("Cleaning targets for %s configuration", cfg)
        run_subprocess(
            [
                cmake_path,
                "--build",
                get_config_build_dir(build_dir, cfg),
                "--config",
                cfg,
                "--target",
                "clean",
            ]
        )
2020-04-19 03:48:30 +00:00
2020-10-30 00:13:04 +00:00
def build_targets(args, cmake_path, build_dir, configs, num_parallel_jobs, target=None):
    """Build every requested configuration through ``cmake --build``.

    Args:
        args: Parsed command line options. Reads ``cmake_generator``,
            ``build_wasm``, ``android``, ``android_sdk_path`` and
            ``android_ndk_path``.
        cmake_path: CMake executable to invoke.
        build_dir: Root build directory; the per-configuration directory is
            resolved from it with ``get_config_build_dir``.
        configs: Iterable of configuration names to build.
        num_parallel_jobs: Requested parallelism. A value of 1 adds no
            parallel options for the native build tool.
        target: Optional single target to build instead of the default one.
    """

    def native_tool_options():
        # Options placed after "--", i.e. handed to the native build tool.
        if num_parallel_jobs == 1:
            return []

        if is_windows() and args.cmake_generator != "Ninja" and not args.build_wasm:
            # https://github.com/Microsoft/checkedc-clang/wiki/Parallel-builds-of-clang-on-Windows suggests
            # not maxing out CL_MPCount.
            # Use one less than num_parallel_jobs (default is num logical cores), limited to a range of 1..15.
            # That gives maxcpucount projects building using up to 15 cl.exe instances each.
            cl_mp_count = min(max(num_parallel_jobs - 1, 1), 15)
            return [
                f"/maxcpucount:{num_parallel_jobs}",
                f"/p:CL_MPCount={cl_mp_count}",
                # if nodeReuse is true, msbuild processes will stay around for a bit after the build completes
                "/nodeReuse:False",
            ]

        if args.cmake_generator == "Xcode":
            return ["-parallelizeTargets", "-jobs", str(num_parallel_jobs)]

        return [f"-j{num_parallel_jobs}"]

    for cfg in configs:
        log.info("Building targets for %s configuration", cfg)
        command = [cmake_path, "--build", get_config_build_dir(build_dir, cfg), "--config", cfg]
        if target:
            command += ["--target", target]

        tool_options = native_tool_options()
        if tool_options:
            command += ["--", *tool_options]

        # The Android SDK/NDK locations are passed through the environment.
        environment = (
            {
                "ANDROID_SDK_ROOT": args.android_sdk_path,
                "ANDROID_NDK_HOME": args.android_ndk_path,
            }
            if args.android
            else {}
        )
        run_subprocess(command, env=environment)
2018-11-20 00:48:22 +00:00
2020-04-19 03:48:30 +00:00
def add_dir_if_exists(directory, dir_list):
    """Append ``directory`` to ``dir_list`` when it is an existing directory.

    ``dir_list`` is modified in place; nothing is returned.
    """
    if not os.path.isdir(directory):
        return
    dir_list.append(directory)
2018-11-20 00:48:22 +00:00
2020-04-19 03:48:30 +00:00
def setup_cuda_vars(args):
    """Resolve and validate the CUDA and cuDNN install locations.

    Each location is taken from the command line option when it is set,
    otherwise from the ``CUDA_HOME`` / ``CUDNN_HOME`` environment variable.

    Returns:
        A ``(cuda_home, cudnn_home)`` tuple. Both entries are empty strings
        when CUDA is not enabled. On Windows ``cudnn_home`` is returned
        unvalidated and may be ``None`` if it was not provided.

    Raises:
        BuildError: If ``cuda_home`` does not exist, or, on non-Windows
            platforms, if ``cudnn_home`` does not exist.
    """
    if not args.use_cuda:
        return "", ""

    cuda_home = args.cuda_home or os.getenv("CUDA_HOME")
    cudnn_home = args.cudnn_home or os.getenv("CUDNN_HOME")

    cuda_home_valid = cuda_home is not None and os.path.exists(cuda_home)
    cudnn_home_valid = cudnn_home is not None and os.path.exists(cudnn_home)

    # cuDNN is only mandatory off Windows.
    locations_ok = cuda_home_valid and (is_windows() or cudnn_home_valid)
    if not locations_ok:
        raise BuildError(
            "cuda_home and cudnn_home paths must be specified and valid.",
            f"cuda_home='{cuda_home}' valid={cuda_home_valid}. cudnn_home='{cudnn_home}' valid={cudnn_home_valid}",
        )

    return cuda_home, cudnn_home
2019-03-14 19:00:39 +00:00
2022-09-22 21:53:40 +00:00
def setup_cann_vars(args):
    """Resolve and validate the CANN install location.

    The location comes from ``args.cann_home`` when set, otherwise from the
    ``ASCEND_HOME_PATH`` environment variable.

    Returns:
        The CANN home path, or an empty string when CANN is not enabled.

    Raises:
        BuildError: If CANN is enabled and the resolved path is missing or
            does not exist.
    """
    if not args.use_cann:
        return ""

    cann_home = args.cann_home or os.getenv("ASCEND_HOME_PATH")
    cann_home_valid = cann_home is not None and os.path.exists(cann_home)
    if cann_home_valid:
        return cann_home

    raise BuildError(
        "cann_home paths must be specified and valid.",
        f"cann_home='{cann_home}' valid={cann_home_valid}.",
    )
2020-04-19 03:48:30 +00:00
def setup_tensorrt_vars(args):
    """Resolve the TensorRT install location and export its default settings.

    The location comes from ``args.tensorrt_home`` when set, otherwise from
    the ``TENSORRT_HOME`` environment variable. When TensorRT is enabled and
    the location is valid, a fixed set of ``ORT_TENSORRT_*`` environment
    variables is written to ``os.environ``.

    Returns:
        The TensorRT home path, or an empty string when TensorRT is not
        enabled.

    Raises:
        BuildError: If TensorRT is enabled and the resolved path is missing
            or does not exist.
    """
    if not args.use_tensorrt:
        return ""

    tensorrt_home = args.tensorrt_home or os.getenv("TENSORRT_HOME")
    tensorrt_home_valid = tensorrt_home is not None and os.path.exists(tensorrt_home)
    if not tensorrt_home_valid:
        raise BuildError(
            "tensorrt_home paths must be specified and valid.",
            f"tensorrt_home='{tensorrt_home}' valid={tensorrt_home_valid}.",
        )

    os.environ.update(
        {
            # Maximum workspace size in bytes for TensorRT (1GB = 1073741824 bytes).
            "ORT_TENSORRT_MAX_WORKSPACE_SIZE": "1073741824",
            # Maximum number of iterations to detect unsupported nodes
            # and partition the models for TensorRT.
            "ORT_TENSORRT_MAX_PARTITION_ITERATIONS": "1000",
            # Minimum subgraph node size in graph partitioning for TensorRT.
            "ORT_TENSORRT_MIN_SUBGRAPH_SIZE": "1",
            # FP16 flag
            "ORT_TENSORRT_FP16_ENABLE": "0",
        }
    )

    return tensorrt_home
2020-04-19 03:48:30 +00:00
2020-05-26 20:24:59 +00:00
def setup_migraphx_vars(args):
    """Resolve and validate the MIGraphX install location.

    The location comes from ``args.migraphx_home`` when set, otherwise from
    the ``MIGRAPHX_HOME`` environment variable. Leaving it unset is allowed;
    only a configured path that does not exist is treated as an error.

    Returns:
        The MIGraphX home path, or an empty string when MIGraphX is not
        enabled or no location was provided.

    Raises:
        BuildError: If MIGraphX is enabled and the configured path does not
            exist.
    """
    migraphx_home = None
    if args.use_migraphx:
        print(f"migraphx_home = {args.migraphx_home}")
        migraphx_home = args.migraphx_home or os.getenv("MIGRAPHX_HOME") or None

        if migraphx_home and not os.path.exists(migraphx_home):
            # The message previously interpolated the "not valid" flag here,
            # reporting valid=True for a path that had just failed validation.
            raise BuildError(
                "migraphx_home paths must be specified and valid.",
                f"migraphx_home='{migraphx_home}' valid=False.",
            )

    return migraphx_home or ""
2020-05-26 20:24:59 +00:00
2019-10-15 13:13:07 +00:00
def setup_dml_build(args, cmake_path, build_dir, configs):
    """Prepare the DirectML dependency for a DML-enabled build.

    Does nothing unless ``args.use_dml`` is set. With ``args.dml_path`` the
    expected DirectML files are checked for existence under that directory.
    Otherwise, unless ``args.dml_external_project`` is set, the
    ``RESTORE_PACKAGES`` target is built once per configuration.

    Raises:
        BuildError: If a file expected under ``dml_path`` is missing, or if
            ``args.minimal_build`` is not ``None``.
    """
    if not args.use_dml:
        return

    if args.dml_path:
        required_files = ("bin/DirectML.dll", "lib/DirectML.lib", "include/DirectML.h")
        for relative_path in required_files:
            candidate = os.path.join(args.dml_path, relative_path)
            if os.path.exists(candidate):
                continue
            raise BuildError("dml_path is invalid.", f"dml_path='{args.dml_path}' expected_file='{candidate}'.")
    elif not args.dml_external_project:
        for cfg in configs:
            # Run the RESTORE_PACKAGES target to perform the initial
            # NuGet setup.
            run_subprocess(
                [
                    cmake_path,
                    "--build",
                    get_config_build_dir(build_dir, cfg),
                    "--config",
                    cfg,
                    "--target",
                    "RESTORE_PACKAGES",
                ]
            )

    if args.minimal_build is not None:
        raise BuildError("use_dml and minimal_build may not both be set")
2019-10-15 13:13:07 +00:00
2022-10-21 05:46:22 +00:00
def setup_rocm_build(args):
    """Resolve and validate the ROCm install location.

    The location comes from ``args.rocm_home``. Leaving it unset is allowed;
    only a configured path that does not exist is treated as an error.

    Returns:
        The ROCm home path, or an empty string when ROCm is not enabled or
        no location was provided.

    Raises:
        BuildError: If ROCm is enabled and the configured path does not exist.
    """
    rocm_home = None
    if args.use_rocm:
        print(f"rocm_home = {args.rocm_home}")
        rocm_home = args.rocm_home or None

        if rocm_home and not os.path.exists(rocm_home):
            # The message previously interpolated the "not valid" flag here,
            # reporting valid=True for a path that had just failed validation.
            raise BuildError(
                "rocm_home paths must be specified and valid.",
                f"rocm_home='{rocm_home}' valid=False.",
            )

    return rocm_home or ""
2020-10-30 00:13:04 +00:00
2021-05-04 22:39:14 +00:00
def run_android_tests(args, source_dir, build_dir, config, cwd):
    """Push the built test binaries to an Android device via adb and run them.

    The tests are skipped (with a log message) unless ``args.android_abi`` is
    ``x86_64``. When ``args.android_run_emulator`` is set, a virtual device is
    created and an emulator is started for the duration of the run and
    stopped afterwards.

    Args:
        args: Parsed command line options. Reads ``android_abi``,
            ``android_sdk_path``, ``android_api``, ``android_run_emulator``,
            ``code_coverage``, ``build_java``, ``use_nnapi`` and
            ``build_shared_lib``.
        source_dir: Repository root; used to locate the ONNX backend test
            data and the gradle wrapper.
        build_dir: Not used by this function.
        config: Not used by this function.
        cwd: Directory containing the built test binaries; host-side adb
            push commands are run from here.
    """
    if args.android_abi != "x86_64":
        log.info(f"--android_abi ({args.android_abi}) is not x86_64, skipping running of Android tests on emulator.")
        return

    sdk_tool_paths = android.get_sdk_tool_paths(args.android_sdk_path)
    device_dir = "/data/local/tmp"

    def adb_push(src, dest, **kwargs):
        return run_subprocess([sdk_tool_paths.adb, "push", src, dest], **kwargs)

    # The varargs are deliberately not called "args" so they cannot be confused
    # with the outer options namespace that run_adb_shell reads.
    def adb_shell(*shell_args, **kwargs):
        return run_subprocess([sdk_tool_paths.adb, "shell", *shell_args], **kwargs)

    def run_adb_shell(cmd):
        # GCOV_PREFIX_STRIP specifies the depth of the directory hierarchy to strip and
        # GCOV_PREFIX specifies the root directory
        # for creating the runtime code coverage files.
        if args.code_coverage:
            adb_shell(f"cd {device_dir} && GCOV_PREFIX={device_dir} GCOV_PREFIX_STRIP={cwd.count(os.sep) + 1} {cmd}")
        else:
            adb_shell(f"cd {device_dir} && {cmd}")

    with contextlib.ExitStack() as context_stack:
        if args.android_run_emulator:
            avd_name = "ort_android"
            system_image = f"system-images;android-{args.android_api};default;{args.android_abi}"

            android.create_virtual_device(sdk_tool_paths, system_image, avd_name)
            emulator_proc = context_stack.enter_context(
                android.start_emulator(
                    sdk_tool_paths=sdk_tool_paths,
                    avd_name=avd_name,
                    extra_args=["-partition-size", "2047", "-wipe-data"],
                )
            )
            # Registered on the stack so the emulator is stopped when the block exits.
            context_stack.callback(android.stop_emulator, emulator_proc)

        adb_push("testdata", device_dir, cwd=cwd)
        adb_push(
            os.path.join(source_dir, "cmake", "external", "onnx", "onnx", "backend", "test"), device_dir, cwd=cwd
        )
        adb_push("onnxruntime_test_all", device_dir, cwd=cwd)
        adb_shell(f"chmod +x {device_dir}/onnxruntime_test_all")
        adb_push("onnx_test_runner", device_dir, cwd=cwd)
        adb_shell(f"chmod +x {device_dir}/onnx_test_runner")
        run_adb_shell(f"{device_dir}/onnxruntime_test_all")

        # remove onnxruntime_test_all as it takes up a _lot_ of space and can cause insufficient storage errors
        # when we try to copy the java app to the device.
        adb_shell(f"rm {device_dir}/onnxruntime_test_all")

        if args.build_java:
            # use the gradle wrapper under <repo root>/java
            gradle_executable = os.path.join(source_dir, "java", "gradlew.bat" if is_windows() else "gradlew")
            android_test_path = os.path.join(cwd, "java", "androidtest", "android")
            run_subprocess(
                [
                    gradle_executable,
                    "--no-daemon",
                    f"-DminSdkVer={args.android_api}",
                    "clean",
                    "connectedDebugAndroidTest",
                ],
                cwd=android_test_path,
            )

        if args.use_nnapi:
            run_adb_shell(f"{device_dir}/onnx_test_runner -e nnapi {device_dir}/test")
        else:
            run_adb_shell(f"{device_dir}/onnx_test_runner {device_dir}/test")

        # run shared_lib_test if necessary
        if args.build_shared_lib:
            adb_push("libonnxruntime.so", device_dir, cwd=cwd)
            adb_push("onnxruntime_shared_lib_test", device_dir, cwd=cwd)
            adb_push("libcustom_op_library.so", device_dir, cwd=cwd)
            adb_push("libcustom_op_get_const_input_test_library.so", device_dir, cwd=cwd)
            adb_push("onnxruntime_customopregistration_test", device_dir, cwd=cwd)
            adb_shell(f"chmod +x {device_dir}/onnxruntime_shared_lib_test")
            adb_shell(f"chmod +x {device_dir}/onnxruntime_customopregistration_test")
            run_adb_shell(f"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{device_dir} {device_dir}/onnxruntime_shared_lib_test")
            run_adb_shell(
                f"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{device_dir} {device_dir}/onnxruntime_customopregistration_test"
            )
2020-09-17 22:53:14 +00:00
2020-09-29 20:53:11 +00:00
def run_ios_tests(args, source_dir, config, cwd):
    """Run the iOS xcodebuild test schemes on a simulator and, optionally, the framework package tests.

    Nothing is run (an info message is logged instead) when ``--apple_sysroot`` does not contain
    "iphonesimulator", or when the host machine architecture differs from ``--osx_arch``.

    Args:
        args: Parsed build arguments. Reads ``apple_sysroot``, ``osx_arch``, ``build_shared_lib`` and
            ``build_apple_framework``.
        source_dir: Repository root; the helper scripts are looked up under
            ``tools/ci_build/github/apple``.
        config: Build configuration name passed to ``xcodebuild -configuration``.
        cwd: Build directory for ``config``; used as the working directory of every command.
    """
    apple_tools_dir = os.path.join(source_dir, "tools", "ci_build", "github", "apple")

    if "iphonesimulator" not in args.apple_sysroot.lower():
        log.info(
            f"Could not detect iphonesimulator target from --apple_sysroot ({args.apple_sysroot}), "
            "skipping running of iOS tests on simulator."
        )
        return

    host_arch = platform.machine()
    if host_arch != args.osx_arch:
        log.info(
            f"Host arch ({host_arch}) and --osx_arch ({args.osx_arch}) mismatch, "
            "skipping running of iOS tests on simulator."
        )
        return

    # The helper script prints a JSON document describing the simulator device to use.
    raw_device_info = subprocess.check_output(
        [sys.executable, os.path.join(apple_tools_dir, "get_simulator_device_info.py")],
        text=True,
    ).strip()
    log.debug(f"Simulator device info:\n{raw_device_info}")
    device_info = json.loads(raw_device_info)

    schemes = ["onnxruntime_test_all_xc"]
    if args.build_shared_lib:
        schemes.extend(
            [
                "onnxruntime_shared_lib_test_xc",
                "onnxruntime_customopregistration_test_xc",
            ]
        )

    destination = f"platform=iOS Simulator,id={device_info['device_udid']}"
    for scheme in schemes:
        run_subprocess(
            [
                "xcodebuild",
                "test-without-building",
                "-project",
                "./onnxruntime.xcodeproj",
                "-configuration",
                config,
                "-scheme",
                scheme,
                "-destination",
                destination,
            ],
            cwd=cwd,
        )

    if not args.build_apple_framework:
        return

    package_test_py = os.path.join(apple_tools_dir, "test_apple_packages.py")
    framework_info_file = os.path.join(cwd, "framework_info.json")
    dynamic_framework_dir = os.path.join(cwd, config + "-" + args.apple_sysroot)
    static_framework_dir = os.path.join(dynamic_framework_dir, "static_framework")

    # test the dynamic framework first, then the static framework
    for framework_dir in (dynamic_framework_dir, static_framework_dir):
        run_subprocess(
            [
                sys.executable,
                package_test_py,
                "--c_framework_dir",
                framework_dir,
                "--framework_info_file",
                framework_info_file,
                "--variant",
                "Full",
                "--skip_macos_test",
            ],
            cwd=cwd,
        )
2020-09-29 20:53:11 +00:00
2020-06-07 03:28:53 +00:00
def _run_python_binding_tests(args, source_dir, build_dir, config, cwd, dll_path):
    """Run the Python binding test scripts for one build configuration.

    Returning early from this helper only ends the Python tests of the current configuration.

    Args:
        args: Parsed build arguments (execution provider and test selection flags are read from it).
        source_dir: Repository root, used to locate ``gen_test_models.py``.
        build_dir: Top-level build directory, used to locate the TVM python sources.
        config: Build configuration name.
        cwd: Absolute build directory for ``config``.
        dll_path: Extra library search path forwarded to ``run_subprocess``, or None.
    """
    python_path = None
    if args.use_tvm:
        python_path = str((Path(build_dir) / config / "_deps" / "tvm-src" / "python").resolve())

    # Disable python tests in a reduced build as we don't know which ops have been included and which
    # models can run.
    if is_reduced_ops_build(args) or args.minimal_build is not None:
        return

    if is_windows():
        cwd = os.path.join(cwd, config)

    run_subprocess(
        [sys.executable, "onnxruntime_test_python.py"], cwd=cwd, dll_path=dll_path, python_path=python_path
    )

    if not args.disable_contrib_ops:
        run_subprocess([sys.executable, "onnxruntime_test_python_sparse_matmul.py"], cwd=cwd, dll_path=dll_path)

    if args.enable_symbolic_shape_infer_tests:
        run_subprocess(
            [sys.executable, "onnxruntime_test_python_symbolic_shape_infer.py"], cwd=cwd, dll_path=dll_path
        )

    # For CUDA or DML enabled builds test IOBinding feature
    if args.use_cuda or args.use_dml:
        log.info("Testing IOBinding feature")
        run_subprocess([sys.executable, "onnxruntime_test_python_iobinding.py"], cwd=cwd, dll_path=dll_path)

    if args.use_cuda:
        log.info("Testing CUDA Graph feature")
        run_subprocess([sys.executable, "onnxruntime_test_python_cudagraph.py"], cwd=cwd, dll_path=dll_path)

    if args.use_dml:
        log.info("Testing DML Graph feature")
        run_subprocess([sys.executable, "onnxruntime_test_python_dmlgraph.py"], cwd=cwd, dll_path=dll_path)

    if not args.disable_ml_ops and not args.use_tensorrt:
        run_subprocess([sys.executable, "onnxruntime_test_python_mlops.py"], cwd=cwd, dll_path=dll_path)

    if args.use_tensorrt:
        run_subprocess(
            [sys.executable, "onnxruntime_test_python_nested_control_flow_op.py"], cwd=cwd, dll_path=dll_path
        )

    try:
        import onnx  # noqa: F401

        onnx_test = True
    except ImportError as error:
        log.exception(error)
        log.warning("onnx is not installed. The ONNX tests will be skipped.")
        onnx_test = False

    if onnx_test:
        # Disable python onnx tests for TensorRT and CANN EP, because many tests are
        # not supported yet. This also skips the keras tests below for this configuration.
        if args.use_tensorrt or args.use_cann:
            return

        run_subprocess(
            [sys.executable, "onnxruntime_test_python_backend.py"],
            cwd=cwd,
            dll_path=dll_path,
            python_path=python_path,
        )

        if not args.disable_contrib_ops:
            run_subprocess(
                [sys.executable, "-m", "unittest", "discover", "-s", "quantization"], cwd=cwd, dll_path=dll_path
            )
            if args.enable_transformers_tool_test:
                import google.protobuf
                import numpy

                # Remember the installed versions: the requirements file below may replace them.
                numpy_init_version = numpy.__version__
                pb_init_version = google.protobuf.__version__
                run_subprocess(
                    [
                        sys.executable,
                        "-m",
                        "pip",
                        "install",
                        "-r",
                        "requirements/transformers-test/requirements.txt",
                    ],
                    cwd=SCRIPT_DIR,
                )
                run_subprocess([sys.executable, "-m", "pytest", "transformers"], cwd=cwd)
                # Restore initial numpy/protobuf version in case other tests use it
                run_subprocess([sys.executable, "-m", "pip", "install", "numpy==" + numpy_init_version])
                run_subprocess([sys.executable, "-m", "pip", "install", "protobuf==" + pb_init_version])

        if not args.disable_ml_ops:
            run_subprocess(
                [sys.executable, "onnxruntime_test_python_backend_mlops.py"], cwd=cwd, dll_path=dll_path
            )

        run_subprocess(
            [
                sys.executable,
                os.path.join(source_dir, "onnxruntime", "test", "onnx", "gen_test_models.py"),
                "--output_dir",
                "test_models",
            ],
            cwd=cwd,
        )

        if not args.skip_onnx_tests:
            run_subprocess([os.path.join(cwd, "onnx_test_runner"), "test_models"], cwd=cwd)
            if config != "Debug":
                run_subprocess([sys.executable, "onnx_backend_test_series.py"], cwd=cwd, dll_path=dll_path)

    if not args.skip_keras_test:
        try:
            import keras  # noqa: F401
            import onnxmltools  # noqa: F401

            onnxml_test = True
        except ImportError:
            log.warning("onnxmltools and keras are not installed. The keras tests will be skipped.")
            onnxml_test = False
        if onnxml_test:
            run_subprocess([sys.executable, "onnxruntime_test_python_keras.py"], cwd=cwd, dll_path=dll_path)


def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs):
    """Run the native tests, and the Python binding tests when enabled, for every configuration.

    Android and iOS builds are delegated to ``run_android_tests`` / ``run_ios_tests``. Otherwise the
    native tests run either directly (no ``ctest_path`` and not on Windows) or through ctest, followed
    by the Python tests when ``args.enable_pybind`` is set.

    The Python tests can stop early for a configuration (reduced/minimal builds; TensorRT or CANN
    builds once the onnx package is importable). That early exit applies to the current configuration
    only, so the remaining configurations in ``configs`` are still tested.

    Args:
        args: Parsed build arguments.
        source_dir: Repository root.
        ctest_path: Path of the ctest executable; may be falsy.
        build_dir: Top-level build directory.
        configs: Iterable of build configuration names.
    """
    for config in configs:
        log.info("Running tests for %s configuration", config)

        cwd = os.path.abspath(get_config_build_dir(build_dir, config))

        if args.android:
            run_android_tests(args, source_dir, build_dir, config, cwd)
            continue
        if args.ios:
            run_ios_tests(args, source_dir, config, cwd)
            continue

        dll_path_list = []
        if args.use_tensorrt:
            dll_path_list.append(os.path.join(args.tensorrt_home, "lib"))
        dll_path = os.pathsep.join(dll_path_list) if dll_path_list else None

        if not ctest_path and not is_windows():
            # No ctest available: invoke the gtest executables directly and write per-test XML results.
            executables = ["onnxruntime_test_all", "onnxruntime_mlas_test"]
            if args.build_shared_lib:
                executables.extend(
                    [
                        "onnxruntime_shared_lib_test",
                        "onnxruntime_global_thread_pools_test",
                        "onnxruntime_customopregistration_test",
                    ]
                )
            for exe in executables:
                test_output = f"--gtest_output=xml:{cwd}/{exe}.{config}.results.xml"
                run_subprocess([os.path.join(cwd, exe), test_output], cwd=cwd, dll_path=dll_path)
        else:
            ctest_cmd = [ctest_path, "--build-config", config, "--verbose", "--timeout", args.test_all_timeout]
            run_subprocess(ctest_cmd, cwd=cwd, dll_path=dll_path)

        if args.enable_pybind:
            _run_python_binding_tests(args, source_dir, build_dir, config, cwd, dll_path)
2018-11-20 00:48:22 +00:00
2020-04-19 03:48:30 +00:00
2022-02-15 09:21:02 +00:00
def tvm_run_python_tests(build_dir, configs):
    """Run ``onnxruntime_test_python_tvm.py`` for every build configuration.

    Args:
        build_dir: Top-level build directory.
        configs: Iterable of build configuration names.
    """
    for config in configs:
        test_dir = get_config_build_dir(build_dir, config)
        if is_windows():
            # on Windows the test script is run from the per-config subdirectory
            test_dir = os.path.join(test_dir, config)

        tvm_python_dir = os.path.abspath(os.path.join(build_dir, config, "_deps", "tvm-src", "python"))
        run_subprocess(
            [sys.executable, "onnxruntime_test_python_tvm.py"],
            cwd=test_dir,
            python_path=tvm_python_dir,
        )
2022-01-27 19:31:13 +00:00
2020-06-11 02:16:32 +00:00
def run_nodejs_tests(nodejs_binding_dir):
    """Run ``npm test`` (with a 90000 timeout passed through to the test runner) in the Node.js binding dir.

    Args:
        nodejs_binding_dir: Directory used as the working directory for the npm command.
    """
    npm_test = ["npm", "test", "--", "--timeout=90000"]
    # On Windows the command is launched through `cmd /c`.
    command = ["cmd", "/c", *npm_test] if is_windows() else npm_test
    run_subprocess(command, cwd=nodejs_binding_dir)
2020-04-20 08:05:28 +00:00
def build_python_wheel(
    source_dir,
    build_dir,
    configs,
    use_cuda,
    cuda_version,
    use_rocm,
    use_migraphx,
    rocm_version,
    use_dnnl,
    use_tensorrt,
    use_openvino,
    use_tvm,
    use_vitisai,
    use_acl,
    use_armnn,
    use_dml,
    use_cann,
    use_azure,
    use_qnn,
    wheel_name_suffix,
    enable_training,
    nightly_build=False,
    default_training_package_device=False,
    use_ninja=False,
    enable_training_apis=False,
    enable_rocm_profiling=False,
):
    """Build a Python wheel by running ``setup.py bdist_wheel`` once per build configuration.

    The general flags may be combined freely. Of the execution-provider flags only the first enabled
    one (in the order cuda, rocm, migraphx, openvino, dnnl, tvm, vitisai, acl, armnn, dml, cann, qnn,
    azure) contributes arguments. ``use_tensorrt`` is accepted but not consulted here.

    Args:
        source_dir: Repository root containing ``setup.py``.
        build_dir: Top-level build directory.
        configs: Iterable of build configuration names.
        use_ninja: When true on Windows, the per-config subdirectory is not appended to the working dir.
        (remaining parameters): Feature switches / version strings translated into ``setup.py`` arguments.
    """
    setup_cmd = [sys.executable, os.path.join(source_dir, "setup.py"), "bdist_wheel"]

    # Any combination of the following arguments can be applied
    general_flags = (
        (nightly_build, "--nightly_build"),
        (default_training_package_device, "--default_training_package_device"),
        (wheel_name_suffix, f"--wheel_name_suffix={wheel_name_suffix}"),
        (enable_training, "--enable_training"),
        (enable_training_apis, "--enable_training_apis"),
        (enable_rocm_profiling, "--enable_rocm_profiling"),
    )
    setup_cmd.extend(flag for enabled, flag in general_flags if enabled)

    # The following line assumes no other EP is enabled
    cuda_flags = ["--wheel_name_suffix=gpu"]
    if cuda_version:
        cuda_flags.append(f"--cuda_version={cuda_version}")

    rocm_flags = ["--use_rocm"]
    if rocm_version:
        rocm_flags.append(f"--rocm_version={rocm_version}")

    # The following arguments are mutually exclusive: the first enabled entry wins.
    exclusive_ep_flags = (
        (use_cuda, cuda_flags),
        (use_rocm, rocm_flags),
        (use_migraphx, ["--use_migraphx"]),
        (use_openvino, ["--use_openvino"]),
        (use_dnnl, ["--use_dnnl"]),
        (use_tvm, ["--use_tvm"]),
        (use_vitisai, ["--use_vitisai"]),
        (use_acl, ["--use_acl"]),
        (use_armnn, ["--use_armnn"]),
        (use_dml, ["--wheel_name_suffix=directml"]),
        (use_cann, ["--use_cann"]),
        (use_qnn, ["--use_qnn"]),
        (use_azure, ["--use_azure"]),
    )
    setup_cmd.extend(next((flags for enabled, flags in exclusive_ep_flags if enabled), []))

    for config in configs:
        wheel_cwd = get_config_build_dir(build_dir, config)
        if is_windows() and not use_ninja:
            wheel_cwd = os.path.join(wheel_cwd, config)

        # hand each invocation its own list, as the original built a fresh one per configuration
        run_subprocess(list(setup_cmd), cwd=wheel_cwd)
2018-11-20 00:48:22 +00:00
2020-04-19 03:48:30 +00:00
2022-04-26 16:35:16 +00:00
def build_nuget_package(
    cmake_path,
    source_dir,
    build_dir,
    configs,
    use_cuda,
    use_rocm,
    use_openvino,
    use_tensorrt,
    use_dnnl,
    use_tvm,
    use_winml,
    use_qnn,
    enable_training_apis,
    msbuild_extra_options,
):
    """Build the C# bindings and create a nuget package for every build configuration.

    The package id, msbuild target and execution provider property are derived from the build flags.
    The solution that includes the mobile targets is only used on Windows for the default package,
    and only when the caller did not pass ``IncludeMobileTargets=false``.

    Args:
        cmake_path: Not used by this function.
        source_dir: Repository root; the C# sources live in its ``csharp`` subdirectory.
        build_dir: Native build directory, joined onto ``source_dir``.
        configs: Iterable of build configuration names.
        use_cuda, use_rocm, use_openvino, use_tensorrt, use_dnnl, use_tvm, use_winml, use_qnn,
            enable_training_apis: Build flags that select the package flavour.
        msbuild_extra_options: Extra msbuild properties in ``Name=Value`` form (without the ``/p:``
            prefix). The list is copied, so the caller's list is never modified; None is treated as
            an empty list.

    Raises:
        BuildError: If the host is neither Windows nor Linux.
    """
    if not (is_windows() or is_linux()):
        raise BuildError(
            "Currently csharp builds and nuget package creation is only supported on Windows and Linux platforms."
        )

    # Work on a private copy: an option may be appended below and must not leak back to the caller.
    extra_properties = list(msbuild_extra_options or [])

    csharp_build_dir = os.path.join(source_dir, "csharp")

    # in most cases we don't want/need to include the MAUI mobile targets, as doing so means the mobile workloads
    # must be installed on the machine.
    # they are only included in the Microsoft.ML.OnnxRuntime nuget package
    mobile_sln = "OnnxRuntime.CSharp.sln"
    sln = "OnnxRuntime.DesktopOnly.CSharp.sln"
    mobile_targets_excluded = "IncludeMobileTargets=false" in extra_properties

    # derive package name and execution provider based on the build args
    target_name = "/t:CreatePackage"
    execution_provider = "/p:ExecutionProvider=None"
    package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime"
    enable_training_tests = "/p:TrainingEnabledNativeBuild=false"

    if enable_training_apis:
        enable_training_tests = "/p:TrainingEnabledNativeBuild=true"
        if use_cuda:
            package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.Training.Gpu"
        else:
            package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.Training"
    elif use_winml:
        package_name = "/p:OrtPackageId=Microsoft.AI.MachineLearning"
        target_name = "/t:CreateWindowsAIPackage"
    elif use_openvino:
        execution_provider = "/p:ExecutionProvider=openvino"
        package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.OpenVino"
    elif use_tensorrt:
        execution_provider = "/p:ExecutionProvider=tensorrt"
        package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.TensorRT"
    elif use_dnnl:
        execution_provider = "/p:ExecutionProvider=dnnl"
        package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.DNNL"
    elif use_cuda:
        package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.Gpu"
    elif use_rocm:
        package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.ROCm"
    elif use_tvm:
        execution_provider = "/p:ExecutionProvider=tvm"
        package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.Tvm"
    elif use_qnn:
        execution_provider = "/p:ExecutionProvider=qnn"
        package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.QNN"
    elif any("OrtPackageId=" in option for option in extra_properties):
        # the caller supplied its own package id through the extra options; keep the desktop-only sln
        pass
    else:
        # we currently only allow building with mobile targets on Windows.
        # it should be possible to allow building with android targets on Linux but that requires updating the
        # csproj to separate the inclusion of ios and android targets.
        if is_windows() and not mobile_targets_excluded:
            # use the sln that include the mobile targets
            sln = mobile_sln

    # explicitly exclude mobile targets in this case
    if sln != mobile_sln and not mobile_targets_excluded:
        extra_properties.append("IncludeMobileTargets=false")

    # expand extra_options to add prefix
    extra_options = ["/p:" + option for option in extra_properties]

    # we have to use msbuild directly if including Xamarin targets as dotnet only supports MAUI (.net6)
    use_dotnet = sln != mobile_sln

    if use_dotnet:
        cmd_args = ["dotnet", "restore", sln, "--configfile", "NuGet.CSharp.config", *extra_options]
    else:
        cmd_args = ["msbuild", sln, "/t:restore", "/p:RestoreConfigFile=NuGet.CSharp.config", *extra_options]

    # set build directory based on build_dir arg
    native_dir = os.path.normpath(os.path.join(source_dir, build_dir))
    ort_build_dir = "/p:OnnxRuntimeBuildDirectory=" + native_dir

    run_subprocess(cmd_args, cwd=csharp_build_dir)

    # build csharp bindings and create nuget package for each config
    for config in configs:
        configuration = "/p:Configuration=" + config

        if not use_winml:
            cmd_args = ["dotnet"] if use_dotnet else []
            cmd_args += [
                "msbuild",
                sln,
                configuration,
                package_name,
                ort_build_dir,
                enable_training_tests,
                *extra_options,
            ]
            run_subprocess(cmd_args, cwd=csharp_build_dir)
        else:
            winml_interop_dir = os.path.join(source_dir, "csharp", "src", "Microsoft.AI.MachineLearning.Interop")
            winml_interop_project = os.path.join(winml_interop_dir, "Microsoft.AI.MachineLearning.Interop.csproj")
            winml_interop_project = os.path.normpath(winml_interop_project)
            cmd_args = [
                "dotnet",
                "msbuild",
                winml_interop_project,
                configuration,
                "/p:Platform=Any CPU",
                ort_build_dir,
                "-restore",
            ]
            run_subprocess(cmd_args, cwd=csharp_build_dir)

        if is_windows():
            if not use_winml:
                # user needs to make sure nuget is installed and added to the path variable
                nuget_exe = "nuget.exe"
            else:
                # this path is setup by cmake/nuget_helpers.cmake for MSVC on Windows
                nuget_exe = os.path.normpath(os.path.join(native_dir, config, "nuget_exe", "src", "nuget.exe"))
        else:
            # `dotnet pack` is used on Linux
            nuget_exe = "NugetExe_not_set"

        nuget_exe_arg = '/p:NugetExe="' + nuget_exe + '"'

        cmd_args = ["dotnet"] if use_dotnet else []
        cmd_args += [
            "msbuild",
            "OnnxRuntime.CSharp.proj",
            target_name,
            package_name,
            configuration,
            execution_provider,
            ort_build_dir,
            nuget_exe_arg,
            *extra_options,
        ]
        run_subprocess(cmd_args, cwd=csharp_build_dir)

        log.info(f"nuget package was created in the {config} build output directory.")
2020-08-26 19:33:48 +00:00
2023-01-03 21:28:16 +00:00
def run_csharp_tests(source_dir, build_dir, use_cuda, use_openvino, use_tensorrt, use_dnnl, enable_training_apis):
    """Run the C# unit tests with ``dotnet test``; does nothing unless running on Windows.

    Args:
        source_dir: Repository root; tests run from its ``csharp`` subdirectory.
        build_dir: Native build directory, joined onto ``source_dir``.
        use_cuda, use_openvino, use_tensorrt, use_dnnl, enable_training_apis: Build flags that are
            turned into ``DefineConstants`` macros for the test project.
    """
    # Currently only running tests on windows.
    if not is_windows():
        return

    csharp_source_dir = os.path.join(source_dir, "csharp")

    # define macros based on build args; the fragments are concatenated in this order
    macro_fragments = (
        (use_openvino, "USE_OPENVINO;"),
        (use_tensorrt, "USE_TENSORRT;"),
        (use_dnnl, "USE_DNNL;"),
        (use_cuda, "USE_CUDA;"),
        (enable_training_apis, "__TRAINING_ENABLED_NATIVE_BUILD__;__ENABLE_TRAINING_APIS__"),
    )
    macros = "".join(fragment for enabled, fragment in macro_fragments if enabled)
    define_constants = '/p:DefineConstants="' + macros + '"' if macros else ""

    # set build directory based on build_dir arg
    native_build_dir = os.path.normpath(os.path.join(source_dir, build_dir))
    ort_build_dir = '/p:OnnxRuntimeBuildDirectory="' + native_build_dir + '"'

    # Skip pretrained models test. Only run unit tests as part of the build
    # add "--verbosity", "detailed" to this command if required
    test_cmd = [
        "dotnet",
        "test",
        "test\\Microsoft.ML.OnnxRuntime.Tests.NetCoreApp\\Microsoft.ML.OnnxRuntime.Tests.NetCoreApp.csproj",
        "--filter",
        "FullyQualifiedName!=Microsoft.ML.OnnxRuntime.Tests.InferenceTest.TestPreTrainedModels",
        define_constants,
        ort_build_dir,
    ]
    run_subprocess(test_cmd, cwd=csharp_source_dir)
2020-08-26 19:33:48 +00:00
2021-06-02 07:47:40 +00:00
def generate_documentation(source_dir, build_dir, configs, validate):
    """Regenerate ``docs/ContribOperators.md`` and ``docs/OperatorKernels.md`` from one build config.

    The generator scripts are copied into the build output directory and run from there. When
    ``validate`` is true, the regenerated files are compared with the checked-in versions via
    ``git diff``.

    Args:
        source_dir: Repository root.
        build_dir: Top-level build directory.
        configs: Iterable of build configuration names; only the first one yielded is used.
        validate: Whether to fail when the regenerated documents differ from the checked-in ones.

    Raises:
        BuildError: If ``validate`` is true and a document differs, or if ``git diff`` fails.
    """
    # Any config will do: use the first one yielded by iterating configs.
    config = next(iter(configs))
    cwd = get_config_build_dir(build_dir, config)
    if is_windows():
        cwd = os.path.join(cwd, config)

    contrib_op_doc_path = os.path.join(source_dir, "docs", "ContribOperators.md")
    opkernel_doc_path = os.path.join(source_dir, "docs", "OperatorKernels.md")
    for script_name in ("gen_contrib_doc.py", "gen_opkernel_doc.py"):
        shutil.copy(os.path.join(source_dir, "tools", "python", script_name), cwd)

    # limit to just com.microsoft (excludes purely internal stuff like com.microsoft.nchwc).
    run_subprocess(
        [sys.executable, "gen_contrib_doc.py", "--output_path", contrib_op_doc_path, "--domains", "com.microsoft"],
        cwd=cwd,
    )

    # we currently limit the documentation created by a build to a subset of EP's.
    # Run get_opkernel_doc.py directly if you need/want documentation from other EPs that are enabled in the build.
    run_subprocess(
        [
            sys.executable,
            "gen_opkernel_doc.py",
            "--output_path",
            opkernel_doc_path,
            "--providers",
            "CPU",
            "CUDA",
            "DML",
        ],
        cwd=cwd,
    )

    if not validate:
        return

    def diff_file(path, regenerate_qualifiers=""):
        """Return True (after logging a warning) if ``path`` differs from the checked-in version."""
        diff = subprocess.check_output(["git", "diff", "--ignore-blank-lines", path], cwd=source_dir).decode(
            "utf-8"
        )
        if not diff:
            return False

        log.warning(
            f"The updated document {path} is different from the checked in version. "
            f"Please regenerate the file{regenerate_qualifiers}, or copy the updated version from the "
            "CI build's published artifacts if applicable."
        )
        log.debug("diff:\n" + diff)  # noqa: G003
        return True

    try:
        # Both files are always checked (no short-circuit) so that every difference gets reported.
        diff_results = [
            diff_file(opkernel_doc_path, " with CPU, CUDA and DML execution providers enabled"),
            diff_file(contrib_op_doc_path),
        ]
    except subprocess.CalledProcessError as err:
        # Chain the original error so the failing git command stays visible in the traceback.
        raise BuildError("git diff returned non-zero error code") from err

    if any(diff_results):
        # Output for the CI to publish the updated md files as an artifact
        print("##vso[task.setvariable variable=DocUpdateNeeded]true")
        raise BuildError("Generated documents have diffs. Check build output for details.")
2019-05-01 21:58:21 +00:00
2019-03-27 04:58:01 +00:00
2018-11-20 00:48:22 +00:00
def main ( ) :
2024-03-13 17:00:32 +00:00
log . debug ( " Command line arguments: \n {} " . format ( " " . join ( shlex . quote ( arg ) for arg in sys . argv [ 1 : ] ) ) ) # noqa: G001
2021-08-05 16:41:17 +00:00
2018-11-20 00:48:22 +00:00
args = parse_arguments ( )
2023-03-27 21:46:04 +00:00
Flash Attention v2 MHA (#17227)
### Description
Integrate Flash Attention V2 to PackedMultiHeadAttention,
MultiHeadAttention and Attention operators.
Flash Attention v2 source code is from
https://github.com/Dao-AILab/flash-attention/tree/main/csrc/flash_attn/src.
We did some change to remove dependency on Torch, then removed backward
and bfloat16 related code.
Add benchmark script (see benchmark_mha.sh) to compare different
attention kernels for MultiHeadAttention operator.
Current limitations for Flash Attention in PackedMultiHeadAttention,
MultiHeadAttention and Attention operators:
* Relative Position Bias is not supported
* Different hidden size for Q and V is not supported
* Only float16 is supported
* Padding/attention mask is not supported
* For MultiHeadAttention, when there is past or present input, bias
shall be provided to activate flash attention
* For Attention, past or present inputs will deactivate flash attention
* Causal is not supported
Some limitations (like attention mask and causal) might be removed
later.
Currently, Flash Attention v2 only works in Linux. For Windows, we will
enable later with Cutlass 3.2.
Two environment variables can be used for testing purpose:
(1) `ORT_DISABLE_FLASH_ATTENTION` to disable flash attention. Default
value is 0 (enable). Set it to "1" to disable it.
(2) `ORT_MIN_SEQ_LEN_FLASH_ATTENTION_PACKED_QKV`. Default value is
"513", which means that we only enable flash attention when sequence
length is larger than 512 for packed QKV format. Set it to "0" if you
want to use flash attention v2 whenever possible.
### Speedup
The following result is from Standard_ND96amsr_A100_v4 VM
(A100-SXM4-80GB GPU) using benchmark_mha.sh. The metric is TFLOPs per
second for MultiHeadAttention operator.
There are 3 input formats:
* `Q,K,V` means separated inputs query, key and value of BxSxNH
* `Q,KV` means packed KV, where key is 5D: BxSxNx2xH
* `QKV` means packed QKV, where query is 5D: BxSxNx3xH
Note that flash attention cannot use packed QKV format, so extra
Transpose is needed. We found that TensorRT kernel is faster for
sequence length <= 512 for packed QKV. The reason might be no transpose
is needed for TensorRT kernel in this format.
We also notice that, TensorRT kernel is faster for stable diffusion
512x512 image (see seq_len=4096, heads=8, head_dim=40 below), while
flash attention v2 is faster for 1024x1024 image (see seq_len=16384,
heads=8, head_dim=40 below).
input format | batch size | sequence length | heads | head dim |
flash_v2 (TFLOPs/s) | TensorRT (TFLOPs/s) | Memory Efficient Attention
(TFLOPs/s)
-- | -- | -- | -- | -- | -- | -- | --
Q,K,V | 32 | 512 | 64 | 32 | 78.1 | 60.0 | 39.3
Q,K,V | 32 | 512 | 128 | 16 | 46.8 | 44.1 | 21.7
Q,K,V | 16 | 1024 | 64 | 32 | 99.0 | 72.8 | 44.3
Q,K,V | 16 | 1024 | 128 | 16 | 54.7 | 49.2 | 23.4
Q,K,V | 8 | 2048 | 64 | 32 | 113.8 | 81.2 | 47.8
Q,K,V | 8 | 2048 | 128 | 16 | 59.7 | 51.9 | 24.7
Q,K,V | 4 | 4096 | 64 | 32 | 122.5 | 85.6 | 49.7
Q,K,V | 4 | 4096 | 128 | 16 | 62.5 | 53.3 | 25.3
Q,K,V | 2 | 8192 | 64 | 32 | 127.4 | 87.5 | 50.7
Q,K,V | 2 | 8192 | 128 | 16 | 64.0 | 54.2 | 25.6
Q,K,V | 1 | 16384 | 64 | 32 | 129.5 | 91.0 | 51.2
Q,K,V | 1 | 16384 | 128 | 16 | 64.7 | 54.5 | 25.8
Q,K,V | 1 | 4096 | 8 | 40 | 51.0 | 43.6 | 36.8
Q,K,V | 1 | 4096 | 8 | 80 | 97.7 | 77.0 | 55.5
Q,K,V | 1 | 4096 | 8 | 160 | 120.0 | 39.7 | 57.8
Q,K,V | 4 | 4096 | 8 | 40 | 89.0 | 84.4 | 49.2
Q,K,V | 4 | 4096 | 8 | 80 | 133.0 | 92.2 | 63.2
Q,K,V | 4 | 4096 | 8 | 160 | 164.8 | 42.7 | 63.8
Q,K,V | 1 | 16384 | 8 | 40 | 96.9 | 91.3 | 52.1
Q,K,V | 1 | 16384 | 8 | 80 | 142.9 | 101.5 | 65.6
Q,K,V | 1 | 16384 | 8 | 160 | 177.4 | 44.2 | 65.7
Q,K,V | 128 | 128 | 12 | 64 | 29.0 | 26.9 | 25.7
Q,K,V | 64 | 128 | 12 | 64 | 23.1 | 10.8 | 21.3
Q,K,V | 128 | 384 | 12 | 64 | 83.5 | 60.8 | 55.7
Q,K,V | 64 | 384 | 12 | 64 | 72.6 | 40.5 | 52.8
Q,K,V | 128 | 512 | 12 | 64 | 98.9 | 77.9 | 62.1
Q,K,V | 64 | 512 | 12 | 64 | 94.7 | 75.6 | 60.4
Q,KV | 32 | 512 | 64 | 32 | 85.9 | 41.1 | 41.1
Q,KV | 32 | 512 | 128 | 16 | 47.1 | 21.6 | 21.6
Q,KV | 16 | 1024 | 64 | 32 | 104.4 | 45.8 | 45.8
Q,KV | 16 | 1024 | 128 | 16 | 54.7 | 23.6 | 23.6
Q,KV | 8 | 2048 | 64 | 32 | 116.8 | 48.5 | 48.5
Q,KV | 8 | 2048 | 128 | 16 | 59.8 | 24.7 | 24.7
Q,KV | 4 | 4096 | 64 | 32 | 124.2 | 50.1 | 50.1
Q,KV | 4 | 4096 | 128 | 16 | 62.6 | 25.3 | 25.3
Q,KV | 2 | 8192 | 64 | 32 | 128.5 | 50.8 | 50.9
Q,KV | 2 | 8192 | 128 | 16 | 64.1 | 25.6 | 25.6
Q,KV | 1 | 16384 | 64 | 32 | 129.4 | 51.2 | 51.2
Q,KV | 1 | 16384 | 128 | 16 | 64.8 | 25.8 | 25.8
Q,KV | 1 | 4096 | 8 | 40 | 67.5 | 37.7 | 37.5
Q,KV | 1 | 4096 | 8 | 80 | 101.3 | 56.7 | 56.6
Q,KV | 1 | 4096 | 8 | 160 | 124.0 | 58.6 | 58.6
Q,KV | 4 | 4096 | 8 | 40 | 90.8 | 49.8 | 49.8
Q,KV | 4 | 4096 | 8 | 80 | 135.6 | 63.8 | 63.8
Q,KV | 4 | 4096 | 8 | 160 | 166.3 | 64.5 | 64.5
Q,KV | 1 | 16384 | 8 | 40 | 97.5 | 52.3 | 52.3
Q,KV | 1 | 16384 | 8 | 80 | 143.5 | 65.9 | 65.8
Q,KV | 1 | 16384 | 8 | 160 | 178.4 | 65.9 | 65.8
Q,KV | 128 | 128 | 12 | 64 | 26.8 | 48.1 | 30.9
Q,KV | 64 | 128 | 12 | 64 | 28.0 | 38.9 | 25.0
Q,KV | 128 | 384 | 12 | 64 | 97.7 | 61.1 | 61.0
Q,KV | 64 | 384 | 12 | 64 | 89.5 | 57.8 | 57.9
Q,KV | 128 | 512 | 12 | 64 | 111.9 | 66.7 | 66.9
Q,KV | 64 | 512 | 12 | 64 | 107.2 | 64.9 | 64.8
QKV | 32 | 512 | 64 | 32 | 77.2 | 84.7 | 39.3
QKV | 32 | 512 | 128 | 16 | 43.4 | 53.1 | 20.9
QKV | 16 | 1024 | 64 | 32 | 98.8 | 87.4 | 44.6
QKV | 16 | 1024 | 128 | 16 | 52.0 | 54.1 | 23.2
QKV | 8 | 2048 | 64 | 32 | 113.1 | 89.0 | 47.9
QKV | 8 | 2048 | 128 | 16 | 58.2 | 54.6 | 24.5
QKV | 4 | 4096 | 64 | 32 | 120.6 | 89.7 | 49.7
QKV | 4 | 4096 | 128 | 16 | 61.7 | 54.6 | 25.2
QKV | 2 | 8192 | 64 | 32 | 125.9 | 89.5 | 50.7
QKV | 2 | 8192 | 128 | 16 | 63.6 | 54.8 | 25.5
QKV | 1 | 16384 | 64 | 32 | 128.5 | 92.0 | 51.2
QKV | 1 | 16384 | 128 | 16 | 64.6 | 54.8 | 25.7
QKV | 1 | 4096 | 8 | 40 | 60.2 | **69.8** | 38.1
QKV | 1 | 4096 | 8 | 80 | 101.6 | 75.2 | 56.7
QKV | 1 | 4096 | 8 | 160 | 130.2 | 41.2 | 58.4
QKV | 4 | 4096 | 8 | 40 | 90.6 | **91.0** | 49.5
QKV | 4 | 4096 | 8 | 80 | 133.6 | 98.1 | 62.8
QKV | 4 | 4096 | 8 | 160 | 165.3 | 43.7 | 63.9
QKV | 1 | 16384 | 8 | 40 | 97.2 | 92.8 | 52.1
QKV | 1 | 16384 | 8 | 80 | 143.0 | 103.1 | 65.6
QKV | 1 | 16384 | 8 | 160 | 177.6 | 44.5 | 65.7
QKV | 128 | 128 | 12 | 64 | 31.1 | 65.9 | 27.6
QKV | 64 | 128 | 12 | 64 | 26.1 | 49.8 | 23.5
QKV | 128 | 384 | 12 | 64 | 84.6 | 88.5 | 56.1
QKV | 64 | 384 | 12 | 64 | 79.1 | 80.3 | 53.5
QKV | 128 | 512 | 12 | 64 | 97.3 | 114.2 | 62.2
QKV | 64 | 512 | 12 | 64 | 95.9 | 110.7 | 60.6
QKV | 4 | 2048 | 32 | 128 | 125.26 | 44.72 | 78.15
QKV | 4 | 4096 | 32 | 128 | 141.62 | 46.29 | 85.84
QKV | 8 | 2048 | 32 | 128 | 127.40 | 45.49 | 78.75
QKV | 8 | 4096 | 32 | 128 | 144.24 | 46.60 | 86.95
### Known Issues
NVCC uses huge memory while compiling flash attention CUDA kernel. Linux
build with CUDA might fail when machine has limited memory while number
of CPUs is large. Workaround is to use a build machine with larger
memory, or use argument like `--nvcc_threads 1` to limit nvcc threads in
build.
### Motivation and Context
Increases speed and efficiency of MHA or Packed MHA.
---------
Co-authored-by: Tianlei Wu <tlwu@microsoft.com>
Co-authored-by: tlwu@microsoft.com <tlwu@a100.crj0ad2y1kku1j4yxl4sj10o4e.gx.internal.cloudapp.net>
2023-08-31 20:52:21 +00:00
print ( args )
2023-04-14 15:56:59 +00:00
if os . getenv ( " ORT_BUILD_WITH_CACHE " ) == " 1 " :
args . use_cache = True
2023-03-27 21:46:04 +00:00
if not is_windows ( ) :
if not args . allow_running_as_root :
is_root_user = os . geteuid ( ) == 0
if is_root_user :
raise BuildError (
" Running as root is not allowed. If you really want to do that, use ' --allow_running_as_root ' . "
)
2022-03-22 18:55:45 +00:00
cmake_extra_defines = normalize_arg_list ( args . cmake_extra_defines )
2021-03-29 22:35:30 +00:00
cross_compiling = args . arm or args . arm64 or args . arm64ec or args . android
2019-07-24 20:20:05 +00:00
2024-01-12 15:24:40 +00:00
if args . enable_address_sanitizer :
# Disable ONNX Runtime's builtin memory checker
args . disable_memleak_checker = True
2020-04-19 03:48:30 +00:00
# If there was no explicit argument saying what to do, default
# to update, build and test (for native builds).
2022-10-27 21:20:48 +00:00
if not ( args . update or args . clean or args . build or args . test or args . gen_doc ) :
2021-02-08 21:15:31 +00:00
log . debug ( " Defaulting to running update, build [and test for native builds]. " )
2018-11-20 00:48:22 +00:00
args . update = True
args . build = True
2019-07-24 20:20:05 +00:00
if cross_compiling :
2022-04-26 16:35:16 +00:00
args . test = args . android_abi == " x86_64 " or args . android_abi == " arm64-v8a "
2019-03-09 01:42:20 +00:00
else :
args . test = True
2018-11-20 00:48:22 +00:00
2019-11-27 21:03:23 +00:00
if args . skip_tests :
args . test = False
2019-03-14 19:00:39 +00:00
if args . use_tensorrt :
args . use_cuda = True
2023-08-08 03:32:55 +00:00
if args . build_wheel or args . gen_doc or args . use_tvm or args . enable_training :
2018-11-20 00:48:22 +00:00
args . enable_pybind = True
2019-03-21 21:06:38 +00:00
2024-07-02 22:37:50 +00:00
if (
args . build_csharp
or args . build_nuget
or args . build_java
or args . build_nodejs
or ( args . enable_pybind and not args . enable_training )
) :
# If python bindings are enabled, we embed the shared lib in the python package.
# If training is enabled, we don't embed the shared lib in the python package since training requires
# torch interop.
2019-01-10 01:06:56 +00:00
args . build_shared_lib = True
2018-11-20 00:48:22 +00:00
2020-08-26 19:33:48 +00:00
if args . build_nuget and cross_compiling :
2022-04-26 16:35:16 +00:00
raise BuildError ( " Currently nuget package creation is not supported while cross-compiling " )
2020-09-04 21:59:01 +00:00
2024-02-08 17:08:41 +00:00
if args . enable_pybind :
if args . disable_rtti :
raise BuildError ( " Python bindings use typeid so you can ' t disable RTTI " )
2021-06-03 06:36:49 +00:00
2024-02-08 17:08:41 +00:00
if args . disable_exceptions :
raise BuildError ( " Python bindings require exceptions to be enabled. " )
if args . minimal_build is not None :
raise BuildError ( " Python bindings are not supported in a minimal build. " )
2020-08-26 19:33:48 +00:00
2020-11-16 01:04:45 +00:00
if args . nnapi_min_api :
if not args . use_nnapi :
raise BuildError ( " Using --nnapi_min_api requires --use_nnapi " )
if args . nnapi_min_api < 27 :
raise BuildError ( " --nnapi_min_api should be 27+ " )
2022-01-19 02:05:04 +00:00
if args . build_wasm_static_lib :
args . build_wasm = True
build ONNXRuntime into WebAssembly (#6478)
* Simplified version of WebAssembly support to keep most of existing data structures and add cmake using Ninja and emcmake
* Clean up CMakeLists.txt and add an example to create and compute a kernel
* Load a model from bytes and remove graph building steps
* Add all cpu and contrib ops with mlas library
* WebAssembly build with Onnxruntime C/CXX API
* Use protobuf cmakefile directory instead of adding every necessary source file
* Fix invalid output at example
* add missing files
* Change an example to use Teams model and support ort mobile format
* add API for javascript
* fix input releasing in _ort_run()
* update API
* Let onnxruntime cmake build WebAssembly with option '--wasm'
* allow one-step building for wasm
* Make build script working on Linux and MacOS
* Fix broken build from Windows command
* Enable unit test on building WebAssembly
* Resolve comments
* update build flags
* wasm conv improvement from: 1) GemmV; 2) Depthwise direct convolution 3x3; 3) Direct convolution 3x3
* Cleaned mlas unittest.
* use glob
* update comments
* Update baseline due to loss scale fix (#6948)
* fix stream sync issue (#6954)
* Enable type reduction in EyeLike, Mod, random.cc CPU kernels. (#6960)
* Update EyeLike CPU kernel.
* Update Mod CPU kernel.
* Update Multinomial CPU kernel.
* Slight improvement to Pad CPU kernel binary size.
* Update RandomNormal[Like], RandomUniform[Like] CPU kernels.
* Fix warning from setting multiple MSVC warning level options. (#6917)
Fix warning from setting multiple MSVC warning level options. Replace an existing /Wn flag instead of always appending a new one.
* MLAS: quantized GEMM update (#6916)
Various updates to the int8_t GEMMs:
1) Add ARM64 udot kernel to take advantage of dot product instructions available in newer cores. Some models run 4x faster than the stock implementation we used before.
2) Refactor the x64 kernels to share common code for AVX2(u8u8/u8s8/avxvnni) vs AVX512(u8u8/u8s8/avx512vnni) to reduce binary size.
3) Extend kernels to support per-column zero points for matrix B. This is not currently wired to an operator.
* Implement QLinearAveragePool with unit tests. (#6896)
Implement QLinearAveragePool with unit tests.
* Attention fusion detect num_heads and hidden_size automatically (#6920)
* fixed type to experimental session constructor (#6950)
* fixed type to experimental session constructor
Co-authored-by: David Medine <david.medine@brainproducts.com>
* Update onnxruntime_perf_test.exe to accept free dimension overrides (#6962)
Co-authored-by: Ori Levari <orlevari@microsoft.com>
* Fix possible fd leak in NNAPI (#6966)
* Release buffers for prepacked tensors (#6820)
Unsolved problems:
1. One test failure was caused by a bug in Cudnn rnn kernels, when they can allocate a buffer and partially initialize it, the garbage data near tail of the buffer caused problem in some of the hardware. To attack this problem in a broader sense, should we add code in our allocators, and during a memory fuzzing test, fill an allocated buffer with garbage before returning to the caller?
2. Prepacking is used more widely than we know. For instance, Cudnn rnn kernels also cache their weights. They mix several weight tensors together into a single buffer, and never touch the original weight tensor anymore. This is the same idea with pre-pack, but they didn't override the virtual function, and they never tried to release those weight tensors, leading to memory waste. It also seems to me that there are some other kernels have similar behavior. Wonder how much memory we can save if we try to cleanup those too.
3. Turning off memory pattern planning does increase memory fragmentation, leading to out of memory error in some training test cases. Perhaps we can revisit the idea of pushing kernels-creation stage earlier, and then during initializer deserialization, we only avoid tracing those that will be prepacked.
* Enable type reduction for Range, ReverseSequence, ScatterND, Split, and Unique CPU kernels. (#6963)
* add CI
* fix test in ci
* fix flags for nsync in wasm build
* add copyright banner
* fix wasm source glob
* add missing exports
* resolve comments
* Perf gain by make packb wide to 4 from 16 on GEMM for WASM.
Remove no need direct conv in previous perf tuning.
* fix buildbreak introduced from latest master merge
* fix buildbreak in mlasi.h
* resolve all comments except MLAS
* rewrite packb related 3 functions for WASM_SCALAR separately rather than using #ifdef in each.
and other changes according to PR feedback in mlas.
* More complete scalar path in sgemm from Tracy.
* Fix edge case handling in depthwise conv2d kernel 3x3. where:
*) support input W==1 and H==1
*) recalc in accurate pad_right and pad_bottom
*) support hidden pad_right == 2 or pad_bottom == 2 when W == 1 or H==1 and no pad left/top
* Add more test coverage for conv depthwise from Tracy.
Fix one typo according to PR.
* resolve comments
* replace typedef by using
* do not use throw in OrtRun()
* output error message
Co-authored-by: Sunghoon <35605090+hanbitmyths@users.noreply.github.com>
Co-authored-by: Lei Zhang <zhang.huanning@hotmail.com>
Co-authored-by: Wei-Sheng Chin <wschin@outlook.com>
Co-authored-by: Tianlei Wu <tlwu@microsoft.com>
Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Co-authored-by: Tracy Sharpe <42477615+tracysh@users.noreply.github.com>
Co-authored-by: David Medine <david.eric.medine@gmail.com>
Co-authored-by: David Medine <david.medine@brainproducts.com>
Co-authored-by: Ori Levari <ori.levari@microsoft.com>
Co-authored-by: Ori Levari <orlevari@microsoft.com>
Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com>
Co-authored-by: Chen Fu <chenfucs@gmail.com>
2021-04-06 23:18:10 +00:00
if args . build_wasm :
if not args . disable_wasm_exception_catching and args . disable_exceptions :
# When '--disable_exceptions' is set, we set '--disable_wasm_exception_catching' as well
args . disable_wasm_exception_catching = True
if args . test and args . disable_wasm_exception_catching and not args . minimal_build :
raise BuildError ( " WebAssembly tests need exception catching enabled to run if it ' s not minimal build " )
2021-05-21 08:32:00 +00:00
if args . test and args . enable_wasm_debug_info :
# With flag --enable_wasm_debug_info, onnxruntime_test_all.wasm will be very huge (>1GB). This will fail
# Node.js when trying to load the .wasm file.
# To debug ONNX Runtime WebAssembly, use ONNX Runtime Web to debug ort-wasm.wasm in browsers.
raise BuildError ( " WebAssembly tests cannot be enabled with flag --enable_wasm_debug_info " )
build ONNXRuntime into WebAssembly (#6478)
* Simplified version of WebAssembly support to keep most of existing data structures and add cmake using Ninja and emcmake
* Clean up CMakeLists.txt and add an example to create and compute a kernel
* Load a model from bytes and remove graph building steps
* Add all cpu and contrib ops with mlas library
* WebAssembly build with Onnxruntime C/CXX API
* Use protobuf cmakefile directory instead of adding every necessary source file
* Fix invalid output at example
* add missing files
* Change an example to use Teams model and support ort mobile format
* add API for javascript
* fix input releasing in _ort_run()
* update API
* Let onnxruntime cmake build WebAssembly with option '--wasm'
* allow one-step building for wasm
* Make build script working on Linux and MacOS
* Fix broken build from Windows command
* Enable unit test on building WebAssembly
* Resolve comments
* update build flags
* wasm conv improvement from: 1) GemmV; 2) Depthwise direct convolution 3x3; 3) Direct convolution 3x3
* Cleaned mlas unittest.
* use glob
* update comments
* Update baseline due to loss scale fix (#6948)
* fix stream sync issue (#6954)
* Enable type reduction in EyeLike, Mod, random.cc CPU kernels. (#6960)
* Update EyeLike CPU kernel.
* Update Mod CPU kernel.
* Update Multinomial CPU kernel.
* Slight improvement to Pad CPU kernel binary size.
* Update RandomNormal[Like], RandomUniform[Like] CPU kernels.
* Fix warning from setting multiple MSVC warning level options. (#6917)
Fix warning from setting multiple MSVC warning level options. Replace an existing /Wn flag instead of always appending a new one.
* MLAS: quantized GEMM update (#6916)
Various updates to the int8_t GEMMs:
1) Add ARM64 udot kernel to take advantage of dot product instructions available in newer cores. Some models run 4x faster than the stock implementation we used before.
2) Refactor the x64 kernels to share common code for AVX2(u8u8/u8s8/avxvnni) vs AVX512(u8u8/u8s8/avx512vnni) to reduce binary size.
3) Extend kernels to support per-column zero points for matrix B. This is not currently wired to an operator.
* Implement QLinearAveragePool with unit tests. (#6896)
Implement QLinearAveragePool with unit tests.
* Attention fusion detect num_heads and hidden_size automatically (#6920)
* fixed type to experimental session constructor (#6950)
* fixed type to experimental session constructor
Co-authored-by: David Medine <david.medine@brainproducts.com>
* Update onnxruntime_perf_test.exe to accept free dimension overrides (#6962)
Co-authored-by: Ori Levari <orlevari@microsoft.com>
* Fix possible fd leak in NNAPI (#6966)
* Release buffers for prepacked tensors (#6820)
Unsolved problems:
1. One test failure was caused by a bug in Cudnn rnn kernels, when they can allocate a buffer and partially initialize it, the garbage data near tail of the buffer caused problem in some of the hardware. To attack this problem in a broader sense, should we add code in our allocators, and during a memory fuzzing test, fill an allocated buffer with garbage before returning to the caller?
2. Prepacking is used more widely than we know. For instance, Cudnn rnn kernels also cache their weights. They mix several weight tensors together into a single buffer, and never touch the original weight tensor anymore. This is the same idea with pre-pack, but they didn't override the virtual function, and they never tried to release those weight tensors, leading to memory waste. It also seems to me that there are some other kernels have similar behavior. Wonder how much memory we can save if we try to cleanup those too.
3. Turning off memory pattern planning does increase memory fragmentation, leading to out of memory error in some training test cases. Perhaps we can revisit the idea of pushing kernels-creation stage earlier, and then during initializer deserialization, we only avoid tracing those that will be prepacked.
* Enable type reduction for Range, ReverseSequence, ScatterND, Split, and Unique CPU kernels. (#6963)
* add CI
* fix test in ci
* fix flags for nsync in wasm build
* add copyright banner
* fix wasm source glob
* add missing exports
* resolve comments
* Perf gain by make packb wide to 4 from 16 on GEMM for WASM.
Remove no need direct conv in previous perf tuning.
* fix buildbreak introduced from latest master merge
* fix buildbreak in mlasi.h
* resolve all comments except MLAS
* rewrite packb related 3 functions for WASM_SCALAR separately rather than using #ifdef in each.
and other changes according to PR feedback in mlas.
* More complete scalar path in sgemm from Tracy.
* Fix edge case handling in depthwise conv2d kernel 3x3. where:
*) support input W==1 and H==1
*) recalc in accurate pad_right and pad_bottom
*) support hidden pad_right == 2 or pad_bottom == 2 when W == 1 or H==1 and no pad left/top
* Add more test coverage for conv depthwise from Tracy.
Fix one typo according to PR.
* resolve comments
* replace typedef by using
* do not use throw in OrtRun()
* output error message
Co-authored-by: Sunghoon <35605090+hanbitmyths@users.noreply.github.com>
Co-authored-by: Lei Zhang <zhang.huanning@hotmail.com>
Co-authored-by: Wei-Sheng Chin <wschin@outlook.com>
Co-authored-by: Tianlei Wu <tlwu@microsoft.com>
Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Co-authored-by: Tracy Sharpe <42477615+tracysh@users.noreply.github.com>
Co-authored-by: David Medine <david.eric.medine@gmail.com>
Co-authored-by: David Medine <david.medine@brainproducts.com>
Co-authored-by: Ori Levari <ori.levari@microsoft.com>
Co-authored-by: Ori Levari <orlevari@microsoft.com>
Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com>
Co-authored-by: Chen Fu <chenfucs@gmail.com>
2021-04-06 23:18:10 +00:00
2022-03-22 18:55:45 +00:00
if args . wasm_malloc is not None :
# mark --wasm_malloc as deprecated
log . warning (
2023-04-17 17:11:44 +00:00
" Flag ' --wasm_malloc=<Value> ' is deprecated. Please use ' --emscripten_settings MALLOC=<Value> ' . "
2022-04-26 16:35:16 +00:00
)
2022-03-22 18:55:45 +00:00
2020-12-08 18:55:02 +00:00
if args . code_coverage and not args . android :
raise BuildError ( " Using --code_coverage requires --android " )
2021-03-22 17:20:33 +00:00
if args . gen_api_doc and len ( args . config ) != 1 :
2022-04-26 16:35:16 +00:00
raise BuildError ( " Using --get-api-doc requires a single build config " )
2021-03-22 17:20:33 +00:00
2023-04-26 03:59:42 +00:00
# Disabling unit tests for GPU on nuget creation
2024-04-19 07:31:38 +00:00
if args . use_openvino and args . use_openvino != " CPU " and args . build_nuget :
2021-08-31 16:23:13 +00:00
args . test = False
2022-04-07 22:06:31 +00:00
# GDK builds don't support testing
if args . use_gdk :
args . test = False
2023-02-01 01:17:26 +00:00
# enable_training is a higher level flag that enables all training functionality.
if args . enable_training :
args . enable_training_apis = True
args . enable_training_ops = True
2018-11-20 00:48:22 +00:00
configs = set ( args . config )
# setup paths and directories
2021-06-03 06:36:49 +00:00
# cmake_path and ctest_path can be None. For example, if a person only wants to run the tests, he/she doesn't need
# to have cmake/ctest.
2018-12-18 21:23:32 +00:00
cmake_path = resolve_executable_path ( args . cmake_path )
2023-04-07 00:06:59 +00:00
ctest_path = resolve_executable_path ( args . ctest_path )
2018-11-20 00:48:22 +00:00
build_dir = args . build_dir
script_dir = os . path . realpath ( os . path . dirname ( __file__ ) )
source_dir = os . path . normpath ( os . path . join ( script_dir , " .. " , " .. " ) )
# if using cuda, setup cuda paths and env vars
cuda_home , cudnn_home = setup_cuda_vars ( args )
2020-06-15 15:47:03 +00:00
mpi_home = args . mpi_home
nccl_home = args . nccl_home
2022-06-03 21:10:02 +00:00
snpe_root = args . snpe_root
2020-10-22 16:29:44 +00:00
acl_home = args . acl_home
acl_libs = args . acl_libs
armnn_home = args . armnn_home
armnn_libs = args . armnn_libs
2023-03-01 21:48:20 +00:00
qnn_home = args . qnn_home
2019-03-14 19:00:39 +00:00
# if using tensorrt, setup tensorrt paths
tensorrt_home = setup_tensorrt_vars ( args )
2020-05-26 20:24:59 +00:00
# if using migraphx, setup migraphx paths
migraphx_home = setup_migraphx_vars ( args )
2020-10-30 00:13:04 +00:00
# if using rocm, setup rocm paths
2022-10-21 05:46:22 +00:00
rocm_home = setup_rocm_build ( args )
2020-10-30 00:13:04 +00:00
2022-09-22 21:53:40 +00:00
# if using cann, setup cann paths
cann_home = setup_cann_vars ( args )
2021-07-31 00:16:37 +00:00
if args . update or args . build :
2021-12-29 03:04:20 +00:00
for config in configs :
os . makedirs ( get_config_build_dir ( build_dir , config ) , exist_ok = True )
2018-11-20 00:48:22 +00:00
log . info ( " Build started " )
2021-12-29 03:04:20 +00:00
2020-04-19 03:48:30 +00:00
if args . update :
2021-12-29 03:04:20 +00:00
if is_reduced_ops_build ( args ) :
from reduce_op_kernels import reduce_ops
2022-04-26 16:35:16 +00:00
2022-04-27 17:31:02 +00:00
is_extended_minimal_build_or_higher = args . minimal_build is None or " extended " in args . minimal_build
2021-12-29 03:04:20 +00:00
for config in configs :
reduce_ops (
config_path = args . include_ops_by_config ,
build_dir = get_config_build_dir ( build_dir , config ) ,
enable_type_reduction = args . enable_reduced_operator_type_support ,
2022-04-26 16:35:16 +00:00
use_cuda = args . use_cuda ,
2022-04-27 17:31:02 +00:00
is_extended_minimal_build_or_higher = is_extended_minimal_build_or_higher ,
2022-04-26 16:35:16 +00:00
)
2021-12-29 03:04:20 +00:00
2019-01-15 18:29:00 +00:00
cmake_extra_args = [ ]
2023-04-07 00:06:59 +00:00
path_to_protoc_exe = None
if args . path_to_protoc_exe :
path_to_protoc_exe = Path ( args . path_to_protoc_exe )
if not path_to_protoc_exe . exists ( ) :
raise BuildError ( " The value to --path_to_protoc_exe is invalid. " )
2020-05-14 17:53:37 +00:00
if not args . skip_submodule_sync :
update_submodules ( source_dir )
2023-04-10 17:41:04 +00:00
if is_windows ( ) and not args . build_wasm :
2021-07-31 00:16:37 +00:00
cpu_arch = platform . architecture ( ) [ 0 ]
2023-04-10 17:41:04 +00:00
if args . cmake_generator == " Ninja " :
2022-04-26 16:35:16 +00:00
if cpu_arch == " 32bit " or args . arm or args . arm64 or args . arm64ec :
2020-04-19 03:48:30 +00:00
raise BuildError (
" To cross-compile with Ninja, load the toolset "
" environment for the target processor (e.g. Cross "
2022-04-26 16:35:16 +00:00
" Tools Command Prompt for VS) "
)
cmake_extra_args = [ " -G " , args . cmake_generator ]
2021-03-29 22:35:30 +00:00
elif args . arm or args . arm64 or args . arm64ec :
2020-03-19 15:52:40 +00:00
if args . arm :
2022-04-26 16:35:16 +00:00
cmake_extra_args = [ " -A " , " ARM " ]
2021-03-29 22:35:30 +00:00
elif args . arm64 :
2022-04-26 16:35:16 +00:00
cmake_extra_args = [ " -A " , " ARM64 " ]
2023-12-07 00:49:00 +00:00
if args . buildasx :
cmake_extra_args + = [ " -D " , " BUILD_AS_ARM64X=ARM64 " ]
2021-03-29 22:35:30 +00:00
elif args . arm64ec :
2022-04-26 16:35:16 +00:00
cmake_extra_args = [ " -A " , " ARM64EC " ]
2023-12-07 00:49:00 +00:00
if args . buildasx :
cmake_extra_args + = [ " -D " , " BUILD_AS_ARM64X=ARM64EC " ]
2022-04-26 16:35:16 +00:00
cmake_extra_args + = [ " -G " , args . cmake_generator ]
2020-04-19 03:48:30 +00:00
# Cannot test on host build machine for cross-compiled
2024-07-22 20:37:32 +00:00
# builds (Override any user-defined behavior for test if any)
2020-03-19 15:52:40 +00:00
if args . test :
2020-12-01 18:00:06 +00:00
log . warning (
2020-04-19 03:48:30 +00:00
" Cannot test on host build machine for cross-compiled "
2022-04-26 16:35:16 +00:00
" ARM(64) builds. Will skip test running after build. "
)
2020-03-19 15:52:40 +00:00
args . test = False
2019-12-18 23:34:58 +00:00
else :
2023-04-12 00:14:54 +00:00
target_arch = platform . machine ( )
if target_arch == " AMD64 " :
if cpu_arch == " 32bit " or args . x86 :
target_arch = " Win32 "
else :
target_arch = " x64 "
host_arch = " x64 "
elif target_arch == " ARM64 " :
host_arch = " ARM64 "
else :
raise BuildError ( " unknown python arch " )
2021-07-31 00:16:37 +00:00
if args . msvc_toolset :
2023-04-12 00:14:54 +00:00
toolset = " host= " + host_arch + " ,version= " + args . msvc_toolset
2020-03-19 15:52:40 +00:00
else :
2023-04-12 00:14:54 +00:00
toolset = " host= " + host_arch
2020-04-19 03:48:30 +00:00
if args . cuda_version :
2022-04-26 16:35:16 +00:00
toolset + = " ,cuda= " + args . cuda_version
2023-05-02 01:00:47 +00:00
elif args . cuda_home :
toolset + = " ,cuda= " + args . cuda_home
2023-08-09 21:01:16 +00:00
if args . windows_sdk_version :
target_arch + = " ,version= " + args . windows_sdk_version
2023-04-12 00:14:54 +00:00
cmake_extra_args = [ " -A " , target_arch , " -T " , toolset , " -G " , args . cmake_generator ]
2020-09-16 17:46:27 +00:00
if args . enable_wcos :
2022-04-26 16:35:16 +00:00
cmake_extra_defines . append ( " CMAKE_USER_MAKE_RULES_OVERRIDE=wcos_rules_override.cmake " )
2023-12-07 00:49:00 +00:00
2023-07-07 15:11:44 +00:00
elif args . cmake_generator is not None :
2022-04-26 16:35:16 +00:00
cmake_extra_args + = [ " -G " , args . cmake_generator ]
2023-07-07 15:11:44 +00:00
if is_macOS ( ) :
2024-03-20 17:55:19 +00:00
if (
2024-04-24 01:15:07 +00:00
not ( args . ios or args . visionos )
2024-03-20 17:55:19 +00:00
and args . macos != " Catalyst "
and not args . android
and args . osx_arch == " arm64 "
and platform . machine ( ) == " x86_64 "
) :
2020-12-01 18:00:06 +00:00
if args . test :
2022-04-26 16:35:16 +00:00
log . warning ( " Cannot test ARM64 build on X86_64. Will skip test running after build. " )
2020-12-01 18:00:06 +00:00
args . test = False
2020-04-01 00:10:48 +00:00
build ONNXRuntime into WebAssembly (#6478)
* Simplified version of WebAssembly support to keep most of existing data structures and add cmake using Ninja and emcmake
* Clean up CMakeLists.txt and add an example to create and compute a kernel
* Load a model from bytes and remove graph building steps
* Add all cpu and contrib ops with mlas library
* WebAssembly build with Onnxruntime C/CXX API
* Use protobuf cmakefile directory instead of adding every necessary source file
* Fix invalid output at example
* add missing files
* Change an example to use Teams model and support ort mobile format
* add API for javascript
* fix input releasing in _ort_run()
* update API
* Let onnxruntime cmake build WebAssembly with option '--wasm'
* allow one-step building for wasm
* Make build script working on Linux and MacOS
* Fix broken build from Windows command
* Enable unit test on building WebAssembly
* Resolve comments
* update build flags
* wasm conv improvement from: 1) GemmV; 2) Depthwise direct convolution 3x3; 3) Direct convolution 3x3
* Cleaned mlas unittest.
* use glob
* update comments
* Update baseline due to loss scale fix (#6948)
* fix stream sync issue (#6954)
* Enable type reduction in EyeLike, Mod, random.cc CPU kernels. (#6960)
* Update EyeLike CPU kernel.
* Update Mod CPU kernel.
* Update Multinomial CPU kernel.
* Slight improvement to Pad CPU kernel binary size.
* Update RandomNormal[Like], RandomUniform[Like] CPU kernels.
* Fix warning from setting multiple MSVC warning level options. (#6917)
Fix warning from setting multiple MSVC warning level options. Replace an existing /Wn flag instead of always appending a new one.
* MLAS: quantized GEMM update (#6916)
Various updates to the int8_t GEMMs:
1) Add ARM64 udot kernel to take advantage of dot product instructions available in newer cores. Some models run 4x faster than the stock implementation we used before.
2) Refactor the x64 kernels to share common code for AVX2(u8u8/u8s8/avxvnni) vs AVX512(u8u8/u8s8/avx512vnni) to reduce binary size.
3) Extend kernels to support per-column zero points for matrix B. This is not currently wired to an operator.
* Implement QLinearAveragePool with unit tests. (#6896)
Implement QLinearAveragePool with unit tests.
* Attention fusion detect num_heads and hidden_size automatically (#6920)
* fixed type to experimental session constructor (#6950)
* fixed type to experimental session constructor
Co-authored-by: David Medine <david.medine@brainproducts.com>
* Update onnxruntime_perf_test.exe to accept free dimension overrides (#6962)
Co-authored-by: Ori Levari <orlevari@microsoft.com>
* Fix possible fd leak in NNAPI (#6966)
* Release buffers for prepacked tensors (#6820)
Unsolved problems:
1. One test failure was caused by a bug in Cudnn rnn kernels, when they can allocate a buffer and partially initialize it, the garbage data near tail of the buffer caused problem in some of the hardware. To attack this problem in a broader sense, should we add code in our allocators, and during a memory fuzzing test, fill an allocated buffer with garbage before returning to the caller?
2. Prepacking is used more widely than we know. For instance, Cudnn rnn kernels also cache their weights. They mix several weight tensors together into a single buffer, and never touch the original weight tensor anymore. This is the same idea with pre-pack, but they didn't override the virtual function, and they never tried to release those weight tensors, leading to memory waste. It also seems to me that there are some other kernels have similar behavior. Wonder how much memory we can save if we try to cleanup those too.
3. Turning off memory pattern planning does increase memory fragmentation, leading to out of memory error in some training test cases. Perhaps we can revisit the idea of pushing kernels-creation stage earlier, and then during initializer deserialization, we only avoid tracing those that will be prepacked.
* Enable type reduction for Range, ReverseSequence, ScatterND, Split, and Unique CPU kernels. (#6963)
* add CI
* fix test in ci
* fix flags for nsync in wasm build
* add copyright banner
* fix wasm source glob
* add missing exports
* resolve comments
* Perf gain by make packb wide to 4 from 16 on GEMM for WASM.
Remove no need direct conv in previous perf tuning.
* fix buildbreak introduced from latest master merge
* fix buildbreak in mlasi.h
* resolve all comments except MLAS
* rewrite packb related 3 functions for WASM_SCALAR separately rather than using #ifdef in each.
and other changes according to PR feedback in mlas.
* More complete scalar path in sgemm from Tracy.
* Fix edge case handling in depthwise conv2d kernel 3x3. where:
*) support input W==1 and H==1
*) recalc in accurate pad_right and pad_bottom
*) support hidden pad_right == 2 or pad_bottom == 2 when W == 1 or H==1 and no pad left/top
* Add more test coverage for conv depthwise from Tracy.
Fix one typo according to PR.
* resolve comments
* replace typedef by using
* do not use throw in OrtRun()
* output error message
Co-authored-by: Sunghoon <35605090+hanbitmyths@users.noreply.github.com>
Co-authored-by: Lei Zhang <zhang.huanning@hotmail.com>
Co-authored-by: Wei-Sheng Chin <wschin@outlook.com>
Co-authored-by: Tianlei Wu <tlwu@microsoft.com>
Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Co-authored-by: Tracy Sharpe <42477615+tracysh@users.noreply.github.com>
Co-authored-by: David Medine <david.eric.medine@gmail.com>
Co-authored-by: David Medine <david.medine@brainproducts.com>
Co-authored-by: Ori Levari <ori.levari@microsoft.com>
Co-authored-by: Ori Levari <orlevari@microsoft.com>
Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com>
Co-authored-by: Chen Fu <chenfucs@gmail.com>
2021-04-06 23:18:10 +00:00
if args . build_wasm :
2023-05-21 01:07:39 +00:00
if is_windows ( ) and platform . architecture ( ) [ 0 ] == " 32bit " :
raise BuildError ( " Please use a 64-bit python to run this script " )
if args . build_wheel or args . enable_pybind :
raise BuildError ( " WASM does not support pybind " )
2021-06-07 16:45:02 +00:00
emsdk_version = args . emsdk_version
build ONNXRuntime into WebAssembly (#6478)
* Simplified version of WebAssembly support to keep most of existing data structures and add cmake using Ninja and emcmake
* Clean up CMakeLists.txt and add an example to create and compute a kernel
* Load a model from bytes and remove graph building steps
* Add all cpu and contrib ops with mlas library
* WebAssembly build with Onnxruntime C/CXX API
* Use protobuf cmakefile directory instead of adding every necessary source file
* Fix invalid output at example
* add missing files
* Change an example to use Teams model and support ort mobile format
* add API for javascript
* fix input releasing in _ort_run()
* update API
* Let onnxruntime cmake build WebAssembly with option '--wasm'
* allow one-step building for wasm
* Make build script working on Linux and MacOS
* Fix broken build from Windows command
* Enable unit test on building WebAssembly
* Resolve comments
* update build flags
* wasm conv improvement from: 1) GemmV; 2) Depthwise direct convolution 3x3; 3) Direct convolution 3x3
* Cleaned mlas unittest.
* use glob
* update comments
* Update baseline due to loss scale fix (#6948)
* fix stream sync issue (#6954)
* Enable type reduction in EyeLike, Mod, random.cc CPU kernels. (#6960)
* Update EyeLike CPU kernel.
* Update Mod CPU kernel.
* Update Multinomial CPU kernel.
* Slight improvement to Pad CPU kernel binary size.
* Update RandomNormal[Like], RandomUniform[Like] CPU kernels.
* Fix warning from setting multiple MSVC warning level options. (#6917)
Fix warning from setting multiple MSVC warning level options. Replace an existing /Wn flag instead of always appending a new one.
* MLAS: quantized GEMM update (#6916)
Various updates to the int8_t GEMMs:
1) Add ARM64 udot kernel to take advantage of dot product instructions available in newer cores. Some models run 4x faster than the stock implementation we used before.
2) Refactor the x64 kernels to share common code for AVX2(u8u8/u8s8/avxvnni) vs AVX512(u8u8/u8s8/avx512vnni) to reduce binary size.
3) Extend kernels to support per-column zero points for matrix B. This is not currently wired to an operator.
* Implement QLinearAveragePool with unit tests. (#6896)
Implement QLinearAveragePool with unit tests.
* Attention fusion detect num_heads and hidden_size automatically (#6920)
* fixed type to experimental session constructor (#6950)
* fixed type to experimental session constructor
Co-authored-by: David Medine <david.medine@brainproducts.com>
* Update onnxruntime_perf_test.exe to accept free dimension overrides (#6962)
Co-authored-by: Ori Levari <orlevari@microsoft.com>
* Fix possible fd leak in NNAPI (#6966)
* Release buffers for prepacked tensors (#6820)
Unsolved problems:
1. One test failure was caused by a bug in Cudnn rnn kernels, when they can allocate a buffer and partially initialize it, the garbage data near tail of the buffer caused problem in some of the hardware. To attack this problem in a broader sense, should we add code in our allocators, and during a memory fuzzing test, fill an allocated buffer with garbage before returning to the caller?
2. Prepacking is used more widely than we know. For instance, Cudnn rnn kernels also cache their weights. They mix several weight tensors together into a single buffer, and never touch the original weight tensor anymore. This is the same idea with pre-pack, but they didn't override the virtual function, and they never tried to release those weight tensors, leading to memory waste. It also seems to me that there are some other kernels have similar behavior. Wonder how much memory we can save if we try to cleanup those too.
3. Turning off memory pattern planning does increase memory fragmentation, leading to out of memory error in some training test cases. Perhaps we can revisit the idea of pushing kernels-creation stage earlier, and then during initializer deserialization, we only avoid tracing those that will be prepacked.
* Enable type reduction for Range, ReverseSequence, ScatterND, Split, and Unique CPU kernels. (#6963)
* add CI
* fix test in ci
* fix flags for nsync in wasm build
* add copyright banner
* fix wasm source glob
* add missing exports
* resolve comments
* Perf gain by make packb wide to 4 from 16 on GEMM for WASM.
Remove no need direct conv in previous perf tuning.
* fix buildbreak introduced from latest master merge
* fix buildbreak in mlasi.h
* resolve all comments except MLAS
* rewrite packb related 3 functions for WASM_SCALAR separately rather than using #ifdef in each.
and other changes according to PR feedback in mlas.
* More complete scalar path in sgemm from Tracy.
* Fix edge case handling in depthwise conv2d kernel 3x3. where:
*) support input W==1 and H==1
*) recalc in accurate pad_right and pad_bottom
*) support hidden pad_right == 2 or pad_bottom == 2 when W == 1 or H==1 and no pad left/top
* Add more test coverage for conv depthwise from Tracy.
Fix one typo according to PR.
* resolve comments
* replace typedef by using
* do not use throw in OrtRun()
* output error message
Co-authored-by: Sunghoon <35605090+hanbitmyths@users.noreply.github.com>
Co-authored-by: Lei Zhang <zhang.huanning@hotmail.com>
Co-authored-by: Wei-Sheng Chin <wschin@outlook.com>
Co-authored-by: Tianlei Wu <tlwu@microsoft.com>
Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Co-authored-by: Tracy Sharpe <42477615+tracysh@users.noreply.github.com>
Co-authored-by: David Medine <david.eric.medine@gmail.com>
Co-authored-by: David Medine <david.medine@brainproducts.com>
Co-authored-by: Ori Levari <ori.levari@microsoft.com>
Co-authored-by: Ori Levari <orlevari@microsoft.com>
Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com>
Co-authored-by: Chen Fu <chenfucs@gmail.com>
2021-04-06 23:18:10 +00:00
emsdk_dir = os . path . join ( source_dir , " cmake " , " external " , " emsdk " )
emsdk_file = os . path . join ( emsdk_dir , " emsdk.bat " ) if is_windows ( ) else os . path . join ( emsdk_dir , " emsdk " )
2021-06-07 16:45:02 +00:00
log . info ( " Installing emsdk... " )
run_subprocess ( [ emsdk_file , " install " , emsdk_version ] , cwd = emsdk_dir )
log . info ( " Activating emsdk... " )
build ONNXRuntime into WebAssembly (#6478)
* Simplified version of WebAssembly support to keep most of existing data structures and add cmake using Ninja and emcmake
* Clean up CMakeLists.txt and add an example to create and compute a kernel
* Load a model from bytes and remove graph building steps
* Add all cpu and contrib ops with mlas library
* WebAssembly build with Onnxruntime C/CXX API
* Use protobuf cmakefile directory instead of adding every necessary source file
* Fix invalid output at example
* add missing files
* Change an example to use Teams model and support ort mobile format
* add API for javascript
* fix input releasing in _ort_run()
* update API
* Let onnxruntime cmake build WebAssembly with option '--wasm'
* allow one-step building for wasm
* Make build script working on Linux and MacOS
* Fix broken build from Windows command
* Enable unit test on building WebAssembly
* Resolve comments
* update build flags
* wasm conv improvement from: 1) GemmV; 2) Depthwise direct convolution 3x3; 3) Direct convolution 3x3
* Cleaned mlas unittest.
* use glob
* update comments
* Update baseline due to loss scale fix (#6948)
* fix stream sync issue (#6954)
* Enable type reduction in EyeLike, Mod, random.cc CPU kernels. (#6960)
* Update EyeLike CPU kernel.
* Update Mod CPU kernel.
* Update Multinomial CPU kernel.
* Slight improvement to Pad CPU kernel binary size.
* Update RandomNormal[Like], RandomUniform[Like] CPU kernels.
* Fix warning from setting multiple MSVC warning level options. (#6917)
Fix warning from setting multiple MSVC warning level options. Replace an existing /Wn flag instead of always appending a new one.
* MLAS: quantized GEMM update (#6916)
Various updates to the int8_t GEMMs:
1) Add ARM64 udot kernel to take advantage of dot product instructions available in newer cores. Some models run 4x faster than the stock implementation we used before.
2) Refactor the x64 kernels to share common code for AVX2(u8u8/u8s8/avxvnni) vs AVX512(u8u8/u8s8/avx512vnni) to reduce binary size.
3) Extend kernels to support per-column zero points for matrix B. This is not currently wired to an operator.
* Implement QLinearAveragePool with unit tests. (#6896)
Implement QLinearAveragePool with unit tests.
* Attention fusion detect num_heads and hidden_size automatically (#6920)
* fixed type to experimental session constructor (#6950)
* fixed type to experimental session constructor
Co-authored-by: David Medine <david.medine@brainproducts.com>
* Update onnxruntime_perf_test.exe to accept free dimension overrides (#6962)
Co-authored-by: Ori Levari <orlevari@microsoft.com>
* Fix possible fd leak in NNAPI (#6966)
* Release buffers for prepacked tensors (#6820)
Unsolved problems:
1. One test failure was caused by a bug in Cudnn rnn kernels, when they can allocate a buffer and partially initialize it, the garbage data near tail of the buffer caused problem in some of the hardware. To attack this problem in a broader sense, should we add code in our allocators, and during a memory fuzzing test, fill an allocated buffer with garbage before returning to the caller?
2. Prepacking is used more widely than we know. For instance, Cudnn rnn kernels also cache their weights. They mix several weight tensors together into a single buffer, and never touch the original weight tensor anymore. This is the same idea with pre-pack, but they didn't override the virtual function, and they never tried to release those weight tensors, leading to memory waste. It also seems to me that there are some other kernels have similar behavior. Wonder how much memory we can save if we try to cleanup those too.
3. Turning off memory pattern planning does increase memory fragmentation, leading to out of memory error in some training test cases. Perhaps we can revisit the idea of pushing kernels-creation stage earlier, and then during initializer deserialization, we only avoid tracing those that will be prepacked.
* Enable type reduction for Range, ReverseSequence, ScatterND, Split, and Unique CPU kernels. (#6963)
* add CI
* fix test in ci
* fix flags for nsync in wasm build
* add copyright banner
* fix wasm source glob
* add missing exports
* resolve comments
* Perf gain by make packb wide to 4 from 16 on GEMM for WASM.
Remove no need direct conv in previous perf tuning.
* fix buildbreak introduced from latest master merge
* fix buildbreak in mlasi.h
* resolve all comments except MLAS
* rewrite packb related 3 functions for WASM_SCALAR separately rather than using #ifdef in each.
and other changes according to PR feedback in mlas.
* More complete scalar path in sgemm from Tracy.
* Fix edge case handling in depthwise conv2d kernel 3x3. where:
*) support input W==1 and H==1
*) recalc in accurate pad_right and pad_bottom
*) support hidden pad_right == 2 or pad_bottom == 2 when W == 1 or H==1 and no pad left/top
* Add more test coverage for conv depthwise from Tracy.
Fix one typo according to PR.
* resolve comments
* replace typedef by using
* do not use throw in OrtRun()
* output error message
Co-authored-by: Sunghoon <35605090+hanbitmyths@users.noreply.github.com>
Co-authored-by: Lei Zhang <zhang.huanning@hotmail.com>
Co-authored-by: Wei-Sheng Chin <wschin@outlook.com>
Co-authored-by: Tianlei Wu <tlwu@microsoft.com>
Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Co-authored-by: Tracy Sharpe <42477615+tracysh@users.noreply.github.com>
Co-authored-by: David Medine <david.eric.medine@gmail.com>
Co-authored-by: David Medine <david.medine@brainproducts.com>
Co-authored-by: Ori Levari <ori.levari@microsoft.com>
Co-authored-by: Ori Levari <orlevari@microsoft.com>
Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com>
Co-authored-by: Chen Fu <chenfucs@gmail.com>
2021-04-06 23:18:10 +00:00
run_subprocess ( [ emsdk_file , " activate " , emsdk_version ] , cwd = emsdk_dir )
2020-04-19 03:48:30 +00:00
if args . enable_pybind and is_windows ( ) :
2024-07-22 19:39:10 +00:00
run_subprocess (
[ sys . executable , " -m " , " pip " , " install " , " -r " , " requirements/pybind/requirements.txt " ] ,
cwd = SCRIPT_DIR ,
)
2021-06-02 07:47:40 +00:00
2021-04-24 00:22:31 +00:00
if args . use_rocm and args . rocm_version is None :
args . rocm_version = " "
2021-06-02 07:47:40 +00:00
2021-08-30 17:13:47 +00:00
if args . enable_external_custom_op_schemas and not is_linux ( ) :
raise BuildError ( " Registering external custom op schemas is only supported on Linux. " )
2021-08-06 15:30:27 +00:00
2020-04-19 03:48:30 +00:00
generate_build_tree (
2022-04-26 16:35:16 +00:00
cmake_path ,
source_dir ,
build_dir ,
cuda_home ,
cudnn_home ,
rocm_home ,
mpi_home ,
nccl_home ,
tensorrt_home ,
migraphx_home ,
acl_home ,
acl_libs ,
armnn_home ,
armnn_libs ,
2023-03-01 21:48:20 +00:00
qnn_home ,
2022-06-03 21:10:02 +00:00
snpe_root ,
2022-09-22 21:53:40 +00:00
cann_home ,
2022-04-26 16:35:16 +00:00
path_to_protoc_exe ,
configs ,
cmake_extra_defines ,
args ,
cmake_extra_args ,
)
2019-01-10 06:33:14 +00:00
2020-04-19 03:48:30 +00:00
if args . clean :
2018-11-20 00:48:22 +00:00
clean_targets ( cmake_path , build_dir , configs )
2019-10-15 13:13:07 +00:00
# if using DML, perform initial nuget package restore
setup_dml_build ( args , cmake_path , build_dir , configs )
2020-04-19 03:48:30 +00:00
if args . build :
2020-10-30 00:13:04 +00:00
if args . parallel < 0 :
2023-03-24 22:29:03 +00:00
raise BuildError ( f " Invalid parallel job count: { args . parallel } " )
2023-09-05 17:59:27 +00:00
num_parallel_jobs = number_of_parallel_jobs ( args )
2020-10-30 00:13:04 +00:00
build_targets ( args , cmake_path , build_dir , configs , num_parallel_jobs , args . target )
2018-11-20 00:48:22 +00:00
2020-04-19 03:48:30 +00:00
if args . test :
2022-09-07 00:33:27 +00:00
if args . enable_onnx_tests :
source_onnx_model_dir = " C: \\ local \\ models " if is_windows ( ) else " /data/models "
2022-09-21 17:02:57 +00:00
setup_test_data ( source_onnx_model_dir , " models " , build_dir , configs )
2022-09-07 00:33:27 +00:00
2020-06-07 03:28:53 +00:00
run_onnxruntime_tests ( args , source_dir , ctest_path , build_dir , configs )
2019-10-15 13:13:07 +00:00
2022-07-19 14:05:28 +00:00
# TODO(agladyshev):
# to support Windows, we need to update .github/workflows/windows.yml
# and add the following value to the PATH variable: C:\Program Files\LLVM\bin
if args . enable_pybind and args . use_tvm and not is_windows ( ) :
2022-02-15 09:21:02 +00:00
tvm_run_python_tests ( build_dir , configs )
2022-01-27 19:31:13 +00:00
2020-06-11 02:16:32 +00:00
# run node.js binding tests
if args . build_nodejs and not args . skip_nodejs_tests :
2021-04-16 08:33:10 +00:00
nodejs_binding_dir = os . path . normpath ( os . path . join ( source_dir , " js " , " node " ) )
2020-06-11 02:16:32 +00:00
run_nodejs_tests ( nodejs_binding_dir )
2021-07-31 00:16:37 +00:00
# Build packages after running the tests.
# NOTE: if you have a test that relies on a file which only gets copied/generated during the packaging step, it
# could fail unexpectedly. Similarly, if your packaging step forgot to copy a file into the package, we won't know
# about it either.
2019-01-15 18:29:00 +00:00
if args . build :
2024-07-12 04:21:38 +00:00
# TODO: find asan DLL and copy it to onnxruntime/capi folder when args.enable_address_sanitizer is True and
# the target OS is Windows
2019-01-15 18:29:00 +00:00
if args . build_wheel :
2022-04-26 16:35:16 +00:00
nightly_build = bool ( os . getenv ( " NIGHTLY_BUILD " ) == " 1 " )
default_training_package_device = bool ( os . getenv ( " DEFAULT_TRAINING_PACKAGE_DEVICE " ) == " 1 " )
2020-04-14 16:00:13 +00:00
build_python_wheel (
source_dir ,
build_dir ,
configs ,
args . use_cuda ,
2021-04-13 23:19:42 +00:00
args . cuda_version ,
2021-04-24 00:22:31 +00:00
args . use_rocm ,
2024-07-12 04:21:38 +00:00
args . use_migraphx ,
2021-04-24 00:22:31 +00:00
args . rocm_version ,
2020-04-14 16:00:13 +00:00
args . use_dnnl ,
args . use_tensorrt ,
args . use_openvino ,
2022-02-15 09:21:02 +00:00
args . use_tvm ,
2020-05-19 12:32:32 +00:00
args . use_vitisai ,
2020-04-20 08:05:28 +00:00
args . use_acl ,
2020-06-18 14:54:14 +00:00
args . use_armnn ,
2020-09-08 21:34:09 +00:00
args . use_dml ,
2022-09-22 21:53:40 +00:00
args . use_cann ,
2023-01-11 20:25:04 +00:00
args . use_azure ,
2023-03-03 15:26:53 +00:00
args . use_qnn ,
2020-11-26 01:46:11 +00:00
args . wheel_name_suffix ,
Add new PyTorch front-end (#4815)
* Add ORTTrainerOptions class for the new pytorch frontend (#4382)
Add ORTTrainerOptions class and some placeholders
* Add _ORTTrainerModelDesc to perform validation for model description (#4416)
* Add Loss Scaler classes to the new frontend (#4306)
* Add TrainStepInfo used on the new frontend API (#4256)
* Add Optimizer classes to the new frontend (#4280)
* Add LRScheduler implementation (#4357)
* Add basic ORTTrainer API (#4435)
This PR presents the public API for ORTTrainer for the short term
development.
It also validates and saves input parameters, which will be used in the
next stages, such as building ONNX model, post processing the model and
configuring the training session
* Add opset_version into ORTTrainerOptions and change type of ORTTrainer.loss_fn (#4592)
* Update ModelDescription and minor fix on ORTTrainer ctor (#4605)
* Update ModelDescription and minor fix on ORTTrainer/ORTTrainerOptions
This PR keeps the public API intact, but changes how model description is stored on the backend
Currently, users create a dict with two lists of tuples.
One list called 'inputs' and each tuple has the following format tuple(name, shape).
The second list is called 'outputs' and each tuple can be either tuple(name, shape) or tuple(name, shape, is_loss).
With this PR, when this dict is passed in to ORTTrainer, it is fully validated as usual.
However, tuples are internally replaced by namedtuples and all output tuples will have
tuple(name, shape, is_loss) format instead of is_loss being optionally present.
In addition to that normalization in the internal representation (which eases coding),
two internal methods were created to replace a namedtuple(name, shape) with namedtuple(name, shape, dtype)
or namedtuple(name, shape, is_loss, dtype) depending on whether the tuple is an input or output.
This is necessary as ORTTrainer finds out the data types of each input/output during model export to ONNX.
Finally, a minor fix was done on ORTTrainer. It could initialize ORTTrainerOptions incorrectly when options=None
* Rename input name for test
* Add ONNX Model Export to New Frontend (#4612)
Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
Co-authored-by: Thiago Crepaldi <thiago.crepaldi@microsoft.com>
* Create training session + minor improvements (#4668)
Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
* Save ONNX model in file (#4671)
Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
* Add eval step (#4674)
Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
* Add train_step (#4677)
Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
* Add LR Scheduler (#4694)
Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
Co-authored-by: Thiago Crepaldi <thiago.crepaldi@microsoft.com>
* Add deterministic compute tests (#4716)
Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
Co-authored-by: Thiago Crepaldi <thiago.crepaldi@microsoft.com>
* Add legacy vs experimental ORTTrainer accuracy comparison (#4727)
Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
Co-authored-by: Thiago Crepaldi <thiago.crepaldi@microsoft.com>
* Add Mixed precision/LossScaler + several fixes (#4739)
Additionally to the mixed precision/loss scaler code, this PR includes:
* Fix CUDA training
* Add optimization_step into TrainStepInfo class
* Refactor LRSCheduler to use optimization_step instead of step
* Updated several default values at ORTTrainerOptions
* Add initial Gradient Accumulation supported. Untested
* Fix ONNX model post processing
* Refactor unit tests
* Add ONNX BERT example + minor fixes (#4757)
* Fix training issue when passing ONNX file into ORTTrainer
Co-authored-by: Thiago Crepaldi <thiago.crepaldi@microsoft.com>
Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
* Add Dynamic Shape support (#4758)
* Update DeepSpeed Zero Stage option to a separate option group (#4772)
* Add support to fetches (#4777)
* Add Gradient Accumulation Steps support (#4793)
* Fix Dynamic Axes feature and add unit test (#4795)
* Add frozen weights test (#4807)
* Move new pytorch front-end to 'experimental' namespace (#4814)
* Fix build
Co-authored-by: Rayan-Krishnan <rayankrishnan@live.com>
Co-authored-by: Rayan Krishnan <t-rakr@OrtDevTest2v100.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
2020-08-17 16:45:25 +00:00
args . enable_training ,
2020-04-14 16:00:13 +00:00
nightly_build = nightly_build ,
2021-05-27 05:44:20 +00:00
default_training_package_device = default_training_package_device ,
2022-04-26 16:35:16 +00:00
use_ninja = ( args . cmake_generator == " Ninja " ) ,
2023-01-03 21:28:16 +00:00
enable_training_apis = args . enable_training_apis ,
2022-10-17 02:11:20 +00:00
enable_rocm_profiling = args . enable_rocm_profiling ,
2020-04-14 16:00:13 +00:00
)
2023-11-03 16:05:17 +00:00
2020-08-26 19:33:48 +00:00
if args . build_nuget :
build_nuget_package (
2023-05-15 23:27:38 +00:00
cmake_path ,
2020-10-02 16:31:35 +00:00
source_dir ,
build_dir ,
2020-08-26 19:33:48 +00:00
configs ,
args . use_cuda ,
2023-05-15 23:27:38 +00:00
args . use_rocm ,
2020-08-26 19:33:48 +00:00
args . use_openvino ,
args . use_tensorrt ,
2021-02-18 00:13:07 +00:00
args . use_dnnl ,
2022-02-15 09:21:02 +00:00
args . use_tvm ,
2021-11-04 07:42:51 +00:00
args . use_winml ,
2023-04-29 02:33:14 +00:00
args . use_qnn ,
2023-01-03 21:28:16 +00:00
args . enable_training_apis ,
2023-05-15 23:27:38 +00:00
normalize_arg_list ( args . msbuild_extra_options ) ,
2020-08-26 19:33:48 +00:00
)
if args . test and args . build_nuget :
2022-12-05 22:54:09 +00:00
run_csharp_tests (
source_dir ,
build_dir ,
args . use_cuda ,
args . use_openvino ,
args . use_tensorrt ,
args . use_dnnl ,
2023-01-03 21:28:16 +00:00
args . enable_training_apis ,
2022-12-05 22:54:09 +00:00
)
2019-04-21 00:02:35 +00:00
2022-10-27 21:20:48 +00:00
if args . gen_doc :
# special case CI where we create the build config separately to building
if args . update and not args . build :
pass
else :
# assumes build has occurred for easier use in CI where we don't always build via build.py and need to run
# documentation generation as a separate task post-build
generate_documentation ( source_dir , build_dir , configs , args . gen_doc == " validate " )
2018-11-20 00:48:22 +00:00
2021-03-22 17:20:33 +00:00
if args . gen_api_doc and ( args . build or args . test ) :
2022-04-26 16:35:16 +00:00
print ( " Generating Python doc for ORTModule... " )
docbuild_dir = os . path . join ( source_dir , " tools " , " doc " )
run_subprocess (
[ " bash " , " builddoc.sh " , os . path . dirname ( sys . executable ) , source_dir , build_dir , args . config [ 0 ] ] ,
cwd = docbuild_dir ,
)
2021-03-22 17:20:33 +00:00
2018-11-20 00:48:22 +00:00
log . info ( " Build complete " )
2020-04-19 03:48:30 +00:00
2018-11-20 00:48:22 +00:00
if __name__ == "__main__":
    # Script entry point: run the build driver and use main()'s return value
    # as the process exit status.
    #
    # Errors derived from BaseError (the base class for errors originating
    # from build.py) are reported through the build logger and mapped to exit
    # status 1. Any other exception type is not caught here and propagates.
    try:
        sys.exit(main())
    except BaseError as e:
        log.error(str(e))
        sys.exit(1)