# Configuration for lintrunner https://github.com/suo/lintrunner
# You can install the dependencies and initialize with
#
# ```sh
# pip install -r requirements-lintrunner.txt
# lintrunner init
# ```
#
# This will install lintrunner on your system and download all the necessary
# dependencies to run linters locally.
#
# To format local changes:
#
# ```bash
# lintrunner -a
# ```
#
# To format all files:
#
# ```bash
# lintrunner -a --all-files
# ```
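#
# To run only selected linters, `--take` accepts a comma-separated list of
# linter codes (see `lintrunner --help`):
#
# ```bash
# lintrunner -a --take RUFF,CLANGFORMAT
# ```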
#
# To read more about lintrunner, see the [wiki](https://github.com/pytorch/pytorch/wiki/lintrunner).
# To update an existing linting rule or create a new one, modify this file or create a
# new adapter following the examples in https://github.com/justinchuby/lintrunner-adapters.
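#
# As a rough sketch, a new entry follows the shape below (the 'EXAMPLE' code,
# '**/*.example' pattern, and 'example_linter' adapter are placeholders only;
# substitute a real adapter before use):
#
# ```toml
# [[linter]]
# code = 'EXAMPLE'
# include_patterns = ['**/*.example']
# command = [
#   'python', '-m', 'lintrunner_adapters', 'run', 'example_linter',
#   '--', '@{{PATHSFILE}}',
# ]
# ```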
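# Determine changed files by diffing against the merge base with this ref.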
merge_base_with = 'origin/main'

[[linter]]
code = 'RUFF'
include_patterns = [
  '**/*.py',
  '**/*.pyi',
]
exclude_patterns = [
  'cmake/external/**',
  # ignore generated flatbuffers code
  'onnxruntime/core/flatbuffers/ort_flatbuffers_py/**',
  'orttraining/orttraining/python/training/optim/_ds_code_store.py',
]
command = [
  'python',
  '-m',
  'lintrunner_adapters',
  'run',
  'ruff_linter',
  '--config=pyproject.toml',
  '@{{PATHSFILE}}'
]
init_command = [
  'python',
  '-m',
  'lintrunner_adapters',
  'run',
  'pip_init',
  '--dry-run={{DRYRUN}}',
  '--requirement=requirements-lintrunner.txt',
]
is_formatter = true
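
# Note on templating: lintrunner substitutes {{PATHSFILE}} in `command` with a
# file listing the paths to lint (passed via the '@' arguments-file prefix),
# and {{DRYRUN}} in `init_command` during `lintrunner init`.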

[[linter]]
code = 'RUFF-FORMAT'
include_patterns = [
  '**/*.py',
]
exclude_patterns = [
  'cmake/**',
  'orttraining/*',
  'onnxruntime/core/flatbuffers/**',
  'orttraining/orttraining/python/training/optim/_ds_code_store.py',
]
command = [
  'python',
  '-m',
  'lintrunner_adapters',
  'run',
  'ruff_format_linter',
  '--',
  '@{{PATHSFILE}}'
]
init_command = [
  'python',
  '-m',
  'lintrunner_adapters',
  'run',
  'pip_init',
  '--dry-run={{DRYRUN}}',
  '--requirement=requirements-lintrunner.txt',
]
is_formatter = true
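
# The RUFF entry above applies Ruff's lint rules (`ruff check`); RUFF-FORMAT
# runs the Ruff formatter (`ruff format`) over the same Python sources.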

[[linter]]
code = 'RUSTFMT'
include_patterns = ['**/*.rs']
command = [
  'python',
  '-m',
  'lintrunner_adapters',
  'run',
  'rustfmt_linter',
  '--binary=rustfmt',
  '--config-path=rust/rustfmt.toml',
  '--',
  '@{{PATHSFILE}}'
]
is_formatter = true
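
# No init_command for RUSTFMT: rustfmt ships with the Rust toolchain rather
# than pip, so `lintrunner init` has nothing to install for it.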

[[linter]]
code = 'CLANGFORMAT'
include_patterns = [
  '**/*.h',
  '**/*.cc',
  '**/*.hpp',
  '**/*.cpp',
  '**/*.m',
  '**/*.mm',
]
exclude_patterns = [
  'java/**', # FIXME: Enable clang-format for Java
  'onnxruntime/contrib_ops/cuda/bert/tensorrt_fused_multihead_attention/**', # Contains data chunks
  'onnxruntime/core/flatbuffers/schema/*.fbs.h', # Generated code
  'onnxruntime/test/flatbuffers/*.fbs.h', # Generated code
  'onnxruntime/core/graph/contrib_ops/quantization_defs.cc',
  'onnxruntime/core/mlas/**', # Contains assembly code
  'onnxruntime/core/mickey/cutlass_ext/**', # CUTLASS-based libs recommend no automatic code formatting
  'onnxruntime/core/mickey/gemm/**', # CUTLASS-based libs recommend no automatic code formatting
  'winml/lib/Api.Image/shaders/**', # Contains data chunks
  'onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_launch_template.h', # Bool switches hang clang-format
  'onnxruntime/core/providers/coreml/mlprogram_test_scripts/**', # Test scripts only
]
command = [
  'python',
  '-m',
  'lintrunner_adapters',
  'run',
  'clangformat_linter',
  '--binary=clang-format',
  '--fallback',
  '--',
  '@{{PATHSFILE}}'
]
init_command = [
  'python',
  '-m',
  'lintrunner_adapters',
  'run',
  'pip_init',
  '--dry-run={{DRYRUN}}',
  '--requirement=requirements-lintrunner.txt',
]
is_formatter = true