Move reduced ops files into build directory (#10030)

In a reduced ops build, some source files get updated. This change moves the updated files into the build directory. This way, it is easier to simultaneously manage different build directories (with possibly different reduced ops configurations) based on a single source directory.
2026-05-14 20:48:00 +00:00 · 2021-12-28 19:04:20 -08:00 · 2021-12-28 19:04:20 -08:00 · 3bc91c2151
commit 3bc91c2151
parent a367f0664d
17 changed files with 173 additions and 183 deletions
--- a/.gitignore
+++ b/.gitignore
@ -60,11 +60,3 @@ onnxruntime/python/version_info.py
 .envrc
 .psenvrc
 *.csproj.user
-# exclude generated reduced kernel registration and type control
-onnxruntime/contrib_ops/cpu/cpu_contrib_kernels_reduced_ops.cc
-onnxruntime/core/providers/cpu/cpu_execution_provider_reduced_ops.cc
-orttraining/orttraining/training_ops/cpu/cpu_training_kernels_reduced_ops.cc
-onnxruntime/contrib_ops/cuda/cuda_contrib_kernels_reduced_ops.cc
-onnxruntime/core/providers/cuda/cuda_execution_provider_reduced_ops.cc
-orttraining/orttraining/training_ops/cuda/cuda_training_kernels_reduced_ops.cc
-onnxruntime/core/providers/op_kernel_type_control_overrides_reduced_types.inc
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@ -118,7 +118,6 @@ cmake_dependent_option(onnxruntime_DISABLE_EXCEPTIONS "Disable exception handlin
 option(onnxruntime_EXTENDED_MINIMAL_BUILD "onnxruntime_MINIMAL_BUILD with support for execution providers that compile kernels." OFF)
 option(onnxruntime_MINIMAL_BUILD_CUSTOM_OPS "Add custom operator kernels support to a minimal build." OFF)
 option(onnxruntime_REDUCED_OPS_BUILD "Reduced set of kernels are registered in build via modification of the kernel registration source files." OFF)
-option(onnxruntime_REDUCED_OP_TYPE_SUPPORT "Limit the types individual operators support where possible to further reduce the build size." OFF)
 option(onnxruntime_DISABLE_EXTERNAL_INITIALIZERS "Don't allow models to load external data" OFF)
 cmake_dependent_option(onnxruntime_ENABLE_ORT_FORMAT_RUNTIME_GRAPH_OPTIMIZATION
                       "Enable runtime graph optimization of ORT format models. Warning: Not yet ready for general use."
@ -390,9 +389,6 @@ endif()

 if (onnxruntime_REDUCED_OPS_BUILD)
  add_compile_definitions(REDUCED_OPS_BUILD)
-  if (onnxruntime_REDUCED_OP_TYPE_SUPPORT)
-    add_compile_definitions(REDUCED_OP_TYPE_SUPPORT)
-  endif()
 endif()

 if (onnxruntime_DISABLE_EXTERNAL_INITIALIZERS)
--- a/cmake/onnxruntime_providers.cmake
+++ b/cmake/onnxruntime_providers.cmake
@ -1,6 +1,64 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.

+# Reduced ops build helpers
+
+# In a reduced ops build, the reduction is performed by updating source files.
+# Rather than modifying the source files directly, updated versions will be
+# saved to another location in the build directory: ${op_reduction_root}.
+set(op_reduction_root "${CMAKE_BINARY_DIR}/op_reduction.generated")
+
+# Substitutes reduced ops versions of source files into a source list.
+#
+# For each source file that a reduced ops build may rewrite, if its path occurs
+# in the list variable named by `all_srcs`, the path is replaced with the path
+# of the generated copy under ${op_reduction_root}. The generated copy keeps the
+# original file's path relative to ${REPO_ROOT}.
+#
+# Arguments:
+#   all_srcs - name (not value) of the caller's source list variable; the
+#              updated list is written back to the caller's scope.
+#
+# Fails at configure time if a generated file that should be substituted does
+# not exist.
+function(substitute_op_reduction_srcs all_srcs)
+  # files that are potentially updated in a reduced ops build
+  set(original_srcs
+    "${ONNXRUNTIME_ROOT}/contrib_ops/cpu/cpu_contrib_kernels.cc"
+    "${ONNXRUNTIME_ROOT}/contrib_ops/cuda/cuda_contrib_kernels.cc"
+    "${ONNXRUNTIME_ROOT}/core/providers/cpu/cpu_execution_provider.cc"
+    "${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_execution_provider.cc"
+    "${ONNXRUNTIME_ROOT}/core/providers/op_kernel_type_control_overrides.inc"
+    "${ORTTRAINING_SOURCE_DIR}/training_ops/cpu/cpu_training_kernels.cc"
+    "${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/cuda_training_kernels.cc"
+    )
+
+  set(replacement_srcs)
+
+  foreach(original_src IN LISTS original_srcs)
+    # Only substitute files that are actually part of this source list.
+    string(FIND "${${all_srcs}}" "${original_src}" idx)
+    if(idx EQUAL "-1")
+      continue()
+    endif()
+
+    file(RELATIVE_PATH src_relative_path "${REPO_ROOT}" "${original_src}")
+    set(replacement_src "${op_reduction_root}/${src_relative_path}")
+
+    # The generated files are produced outside of CMake (see
+    # tools/ci_build/reduce_op_kernels.py). Report a missing one here with an
+    # actionable message instead of a later, generic missing source file error.
+    if(NOT EXISTS "${replacement_src}")
+      message(FATAL_ERROR
+        "Reduced op version '${replacement_src}' of '${original_src}' was not found. "
+        "Generate the op reduction files before configuring, e.g. via build.py or "
+        "tools/ci_build/reduce_op_kernels.py with --cmake_build_dir '${CMAKE_BINARY_DIR}'.")
+    endif()
+
+    message(STATUS "File '${original_src}' substituted with reduced op version '${replacement_src}'.")
+
+    string(REPLACE "${original_src}" "${replacement_src}" ${all_srcs} "${${all_srcs}}")
+
+    list(APPEND replacement_srcs "${replacement_src}")
+  endforeach()
+
+  # Give the generated files their own source group, rooted at the generated tree.
+  if(replacement_srcs)
+    source_group(TREE "${op_reduction_root}" PREFIX "op_reduction.generated" FILES ${replacement_srcs})
+  endif()
+
+  # Hand the updated list back to the caller.
+  set(${all_srcs} "${${all_srcs}}" PARENT_SCOPE)
+endfunction()
+
+# Puts the reduced ops build include directories at the front of `target`'s
+# PRIVATE include search path, so that generated files under
+# ${op_reduction_root} take precedence over the original source files.
+#
+# Arguments:
+#   target - name of an existing target to add the include directories to.
+function(add_op_reduction_include_dirs target)
+  # Each BEFORE call prepends, so the directory added last is searched first.
+  # Add the training directory first (when applicable) so that the final order
+  # is: onnxruntime, then orttraining.
+  if(onnxruntime_ENABLE_TRAINING OR onnxruntime_ENABLE_TRAINING_OPS)
+    target_include_directories(${target} BEFORE PRIVATE "${op_reduction_root}/orttraining")
+  endif()
+  target_include_directories(${target} BEFORE PRIVATE "${op_reduction_root}/onnxruntime")
+endfunction()
+
+
 file(GLOB_RECURSE onnxruntime_providers_srcs CONFIGURE_DEPENDS
  "${ONNXRUNTIME_ROOT}/core/providers/cpu/*.h"
  "${ONNXRUNTIME_ROOT}/core/providers/cpu/*.cc"
@ -45,16 +103,10 @@ file(GLOB_RECURSE onnxruntime_rocm_generated_contrib_ops_cu_srcs CONFIGURE_DEPEN
  "${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime/contrib_ops/rocm/*.cuh"
 )

-
 file(GLOB onnxruntime_providers_common_srcs CONFIGURE_DEPENDS
  "${ONNXRUNTIME_ROOT}/core/providers/*.h"
  "${ONNXRUNTIME_ROOT}/core/providers/*.cc"
-  # If we are building with reduced number of kernel registration and types,
-  # "core/providers/op_kernel_type_control_overrides_reduced_types.inc"
-  # will be generated with type specifications code.
-  # For simplicity, we inlcude both .inc files,
-  # see onnxruntime/core/providers/op_kernel_type_control.h
-  "${ONNXRUNTIME_ROOT}/core/providers/op_kernel_type_control_overrides*.inc"
+  "${ONNXRUNTIME_ROOT}/core/providers/op_kernel_type_control_overrides.inc"
 )

 if(onnxruntime_USE_NUPHAR)
@ -176,7 +228,13 @@ if (onnxruntime_ENABLE_TRAINING)
  list(APPEND onnxruntime_providers_src ${onnxruntime_providers_dlpack_srcs})
 endif()

+if (onnxruntime_REDUCED_OPS_BUILD)
+  substitute_op_reduction_srcs(onnxruntime_providers_src)
+endif()
 onnxruntime_add_static_library(onnxruntime_providers ${onnxruntime_providers_src})
+if (onnxruntime_REDUCED_OPS_BUILD)
+  add_op_reduction_include_dirs(onnxruntime_providers)
+endif()

 if (MSVC)
   target_compile_options(onnxruntime_providers PRIVATE "/bigobj")
@ -323,7 +381,13 @@ if (onnxruntime_USE_CUDA)
    list(APPEND onnxruntime_providers_cuda_src ${onnxruntime_cuda_training_ops_cc_srcs} ${onnxruntime_cuda_training_ops_cu_srcs})
  endif()

+  if (onnxruntime_REDUCED_OPS_BUILD)
+    substitute_op_reduction_srcs(onnxruntime_providers_cuda_src)
+  endif()
  onnxruntime_add_shared_library_module(onnxruntime_providers_cuda ${onnxruntime_providers_cuda_src})
+  if (onnxruntime_REDUCED_OPS_BUILD)
+    add_op_reduction_include_dirs(onnxruntime_providers_cuda)
+  endif()

  #target_compile_options(onnxruntime_providers_cuda PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler \"/analyze:stacksize 131072\">")
  if (HAS_GUARD_CF)
--- a/docs/Reduced_Operator_Kernel_build.md
+++ b/docs/Reduced_Operator_Kernel_build.md
@ -4,17 +4,23 @@ In order to reduce the compiled binary size of ONNX Runtime (ORT), the operator

 A configuration file must be created with details of the kernels that are required.

-Following that, ORT must be manually built, providing the configuration file in the `--include_ops_by_config` parameter. The build process will update the ORT kernel registration source files to exclude the unused kernels.
+Following that, ORT must be manually built, providing the configuration file in the [build.py](../tools/ci_build/build.py) `--include_ops_by_config` argument.

 See the [build instructions](https://www.onnxruntime.ai/docs/how-to/build.html#build-instructions) for more details on building ORT.

-When building ORT with a reduced set of kernel registrations, `--skip_tests` **MUST** be specified as the kernel reduction will render many of the unit tests invalid.
+The build process will generate updated ORT kernel registration and type reduction source files to exclude unused kernel implementations.
+The generated files will be under the build directory and the original source files that they are based on are not directly modified.
+When building, the generated files will be used instead of the original files.

-NOTE: The operator exclusion logic when building with an operator reduction configuration file will only disable kernel registrations each time it runs. It will NOT re-enable previously disabled kernels. If you wish to change the list of kernels included, it is best to revert the repository to a clean state (e.g. via `git reset --hard`) before building ORT again.
+The operator exclusion logic only runs during the build file generation (or "update") phase of the build process, i.e., when invoking build.py with no build phase arguments or explicitly with `--update`.
+
+Note: It is also possible to run the operator exclusion logic independently with [reduce_op_kernels.py](../tools/ci_build/reduce_op_kernels.py). This may be useful when building ORT without using build.py.
+As the generated files will go into a build directory, the build directory must be provided with the reduce_op_kernels.py `--cmake_build_dir` argument.
+This argument differs slightly from the build.py `--build_dir` argument: build.py appends an additional directory for the build configuration to its `--build_dir` value, and the resulting path is the equivalent of `--cmake_build_dir`.

 ## Creating a configuration file with the required kernels

-The script in `<ORT Root>/tools/python/create_reduced_build_config.py` should be used to create the configuration file. This file can be manually edited as needed. The configuration can be created from either ONNX or ORT format models.
+The [create_reduced_build_config.py](../tools/python/create_reduced_build_config.py) script should be used to create the configuration file. This file can be manually edited as needed. The configuration can be created from either ONNX or ORT format models.

 ```
 create_reduced_build_config.py --help
@ -35,7 +41,7 @@ optional arguments:

 ### Type reduction

-If the configuration file is created using ORT format models, the input/output types that individual operators require can be tracked if `--enable_type_reduction` is specified. This can be used to further reduce the build size if `--enable_reduced_operator_type_support` is specified when building ORT.
+If the configuration file is created using ORT format models, the input/output types that individual operators require can be tracked if the `--enable_type_reduction` argument is specified. This can be used to further reduce the build size if the build.py `--enable_reduced_operator_type_support` argument is specified when building ORT.

 ONNX format models are not guaranteed to include the required per-node type information, so cannot be used with this option.

--- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc
+++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc
@ -1,16 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.

-// If we are building with reduced number of kernel registration,
-// this file will be copied to <file_name>_reduced_ops.cc,
-// where the unused kernel registration will be commented out
-// and the "#ifndef REDUCED_OPS_BUILD" be replaced with "#ifdef REDUCED_OPS_BUILD"
-// This will prevent,
-// 1. Accidental commit of the reduced kernel registration files
-// 2. If the required ops config has changed, user has to revert the changes to
-//    the kernel registration files
-#ifndef REDUCED_OPS_BUILD
-
 #include "contrib_ops/cpu/cpu_contrib_kernels.h"
 #include "core/graph/constants.h"
 #include "core/mlas/inc/mlas.h"
@ -278,5 +268,3 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) {

 }  // namespace contrib
 }  // namespace onnxruntime
-
-#endif  // #ifndef REDUCED_OPS_BUILD
--- a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc
+++ b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc
@ -1,16 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.

-// If we are building with reduced number of kernel registration,
-// this file will be copied to <file_name>_reduced_ops.cc,
-// where the unused kernel registration will be commented out
-// and the "#ifndef REDUCED_OPS_BUILD" be replaced with "#ifdef REDUCED_OPS_BUILD"
-// This will prevent,
-// 1. Accidental commit of the reduced kernel registration files
-// 2. If the required ops config has changed, user has to revert the changes to
-//    the kernel registration files
-#ifndef REDUCED_OPS_BUILD
-
 #include "core/providers/shared_library/provider_api.h"
 #include "core/providers/cuda/cuda_common.h"

@ -214,5 +204,3 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) {
 }  // namespace cuda
 }  // namespace contrib
 }  // namespace onnxruntime
-
-#endif  // #ifndef REDUCED_OPS_BUILD
--- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
+++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
@ -1,16 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.

-// If we are building with reduced number of kernel registration,
-// this file will be copied to <file_name>_reduced_ops.cc,
-// where the unused kernel registration will be commented out
-// and the "#ifndef REDUCED_OPS_BUILD" be replaced with "#ifdef REDUCED_OPS_BUILD"
-// This will prevent,
-// 1. Accidental commit of the reduced kernel registration files
-// 2. If the required ops config has changed, user has to revert the changes to
-//    the kernel registration files
-#ifndef REDUCED_OPS_BUILD
-
 #include "core/providers/cpu/cpu_execution_provider.h"
 #include "core/framework/op_kernel.h"
 #include "core/framework/kernel_registry.h"
@ -2074,5 +2064,3 @@ std::unique_ptr<IDataTransfer> CPUExecutionProvider::GetDataTransfer() const {
  return std::make_unique<CPUDataTransfer>();
 }
 }  // namespace onnxruntime
-
-#endif  // #ifndef REDUCED_OPS_BUILD
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@ -1,16 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.

-// If we are building with reduced number of kernel registration,
-// this file will be copied to <file_name>_reduced_ops.cc,
-// where the unused kernel registration will be commented out
-// and the "#ifndef REDUCED_OPS_BUILD" be replaced with "#ifdef REDUCED_OPS_BUILD"
-// This will prevent,
-// 1. Accidental commit of the reduced kernel registration files
-// 2. If the required ops config has changed, user has to revert the changes to
-//    the kernel registration files
-#ifndef REDUCED_OPS_BUILD
-
 #include "core/providers/shared_library/provider_api.h"
 #include "core/providers/cuda/cuda_execution_provider.h"
 #include "core/providers/cuda/cuda_common.h"
@ -2363,5 +2353,3 @@ void CUDAExecutionProvider::RegisterAllocator(std::shared_ptr<AllocatorManager>
 }

 }  // namespace onnxruntime
-
-#endif  // #ifndef REDUCED_OPS_BUILD
--- a/onnxruntime/core/providers/op_kernel_type_control.h
+++ b/onnxruntime/core/providers/op_kernel_type_control.h
@ -469,17 +469,4 @@ struct EnabledTypes {
 #include "core/framework/data_types.h"  // for types that might be used in type specifications

 // all allowed type specifications should be contained in the following file
-
-// If we are building with reduced number of kernel registration and types
-// <op_kernel_type_control_overrides.inc> will be copied to
-// <core/providers/op_kernel_type_control_overrides_reduced_types.inc>,
-// where the type specifications code will be inserted,
-// This will prevent,
-// 1. Accidental commit of the modified <op_kernel_type_control_overrides.inc>
-// 2. If the required ops and types config has changed, user has to revert the changes to
-//    <op_kernel_type_control_overrides.inc>
-#ifndef REDUCED_OP_TYPE_SUPPORT
 #include "core/providers/op_kernel_type_control_overrides.inc"
-#else
-#include "core/providers/op_kernel_type_control_overrides_reduced_types.inc"
-#endif
--- a/onnxruntime/test/providers/internal_testing/internal_testing_partitioning_tests.cc
+++ b/onnxruntime/test/providers/internal_testing/internal_testing_partitioning_tests.cc
@ -1,6 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.

+#if !defined(REDUCED_OPS_BUILD)  // may not work with excluded op kernel implementations
+
 #include "core/common/logging/logging.h"
 #include "core/framework/compute_capability.h"
 #include "core/framework/utils.h"
@ -342,3 +344,5 @@ TEST(InternalTestingEP, DISABLED_TestNnapiPartitioningMlPerfModels) {

 }  // namespace test
 }  // namespace onnxruntime
+
+#endif  // !defined(REDUCED_OPS_BUILD)
--- a/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc
+++ b/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc
@ -1,6 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.

+#if !defined(REDUCED_OPS_BUILD)  // may not work with excluded op kernel implementations
+
 #include "core/common/logging/logging.h"
 #include "core/framework/utils.h"
 #include "core/session/inference_session.h"
@ -354,3 +356,5 @@ TEST(InternalTestingEP, TestOrtModelWithCompileFailure) {
 }
 }  // namespace test
 }  // namespace onnxruntime
+
+#endif  // !defined(REDUCED_OPS_BUILD)
--- a/orttraining/orttraining/training_ops/cpu/cpu_training_kernels.cc
+++ b/orttraining/orttraining/training_ops/cpu/cpu_training_kernels.cc
@ -1,16 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.

-// If we are building with reduced number of kernel registration,
-// this file will be copied to <file_name>_reduced_ops.cc,
-// where the unused kernel registration will be commented out
-// and the "#ifndef REDUCED_OPS_BUILD" be replaced with "#ifdef REDUCED_OPS_BUILD"
-// This will prevent,
-// 1. Accidental commit of the reduced kernel registration files
-// 2. If the required ops config has changed, user has to revert the changes to
-//    the kernel registration files
-#ifndef REDUCED_OPS_BUILD
-
 #include "orttraining/training_ops/cpu/cpu_training_kernels.h"
 #include "core/graph/constants.h"

@ -244,5 +234,3 @@ Status RegisterCpuTrainingKernels(KernelRegistry& kernel_registry) {

 }  // namespace contrib
 }  // namespace onnxruntime
-
-#endif  // #ifndef REDUCED_OPS_BUILD
--- a/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc
+++ b/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc
@ -1,16 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.

-// If we are building with reduced number of kernel registration,
-// this file will be copied to <file_name>_reduced_ops.cc,
-// where the unused kernel registration will be commented out
-// and the "#ifndef REDUCED_OPS_BUILD" be replaced with "#ifdef REDUCED_OPS_BUILD"
-// This will prevent,
-// 1. Accidental commit of the reduced kernel registration files
-// 2. If the required ops config has changed, user has to revert the changes to
-//    the kernel registration files
-#ifndef REDUCED_OPS_BUILD
-
 #include "core/providers/shared_library/provider_api.h"
 #include "core/providers/cuda/cuda_fwd.h"
 #include "core/providers/cuda/cuda_pch.h"
@ -468,5 +458,3 @@ Status RegisterCudaTrainingKernels(KernelRegistry& kernel_registry) {

 }  // namespace cuda
 }  // namespace onnxruntime
-
-#endif  // #ifndef REDUCED_OPS_BUILD
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@ -791,9 +791,6 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, cudnn_home
                                                     args.minimal_build or args.use_extensions))
                                                     else "OFF"),
        "-Donnxruntime_REDUCED_OPS_BUILD=" + ("ON" if is_reduced_ops_build(args) else "OFF"),
-        "-Donnxruntime_REDUCED_OP_TYPE_SUPPORT=" + (
-            "ON" if is_reduced_ops_build(args) and args.enable_reduced_operator_type_support
-            else "OFF"),
        "-Donnxruntime_ENABLE_LANGUAGE_INTEROP_OPS=" + ("ON" if args.enable_language_interop_ops else "OFF"),
        "-Donnxruntime_USE_DML=" + ("ON" if args.use_dml else "OFF"),
        "-Donnxruntime_USE_WINML=" + ("ON" if args.use_winml else "OFF"),
@ -2035,13 +2032,6 @@ def main():
    if args.skip_tests:
        args.test = False

-    if is_reduced_ops_build(args) and args.update:
-        from reduce_op_kernels import reduce_ops
-        reduce_ops(
-            config_path=args.include_ops_by_config,
-            enable_type_reduction=args.enable_reduced_operator_type_support,
-            use_cuda=args.use_cuda)
-
    if args.use_tensorrt:
        args.use_cuda = True

@ -2127,10 +2117,21 @@ def main():
    rocm_home = setup_rocm_build(args, configs)

    if args.update or args.build:
-        os.makedirs(build_dir, exist_ok=True)
+        for config in configs:
+            os.makedirs(get_config_build_dir(build_dir, config), exist_ok=True)

    log.info("Build started")
+
    if args.update:
+        if is_reduced_ops_build(args):
+            from reduce_op_kernels import reduce_ops
+            for config in configs:
+                reduce_ops(
+                    config_path=args.include_ops_by_config,
+                    build_dir=get_config_build_dir(build_dir, config),
+                    enable_type_reduction=args.enable_reduced_operator_type_support,
+                    use_cuda=args.use_cuda)
+
        cmake_extra_args = []
        path_to_protoc_exe = args.path_to_protoc_exe
        if not args.skip_submodule_sync:
--- a/tools/ci_build/github/azure-pipelines/linux-cpu-minimal-build-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-cpu-minimal-build-ci-pipeline.yml
@ -110,10 +110,6 @@ jobs:
              --enable-custom-ops
      workingDirectory: $(Build.SourcesDirectory)

-  - script: git checkout -- .
-    displayName: Discard local changes to Git repository files
-    workingDirectory: $(Build.SourcesDirectory)
-
  - task: CmdLine@2
    displayName: 3b. Build minimal onnxruntime [exceptions ENABLED, type reduction ENABLED] and run tests
    inputs:
@ -132,10 +128,6 @@ jobs:
              --enable-type-reduction
      workingDirectory: $(Build.SourcesDirectory)

-  - script: git checkout -- .
-    displayName: Discard local changes to Git repository files
-    workingDirectory: $(Build.SourcesDirectory)
-
  - task: CmdLine@2
    displayName: 4. Build minimal onnxruntime [exceptions ENABLED, type reduction ENABLED (globally allowed types)] and run tests
    inputs:
@ -159,10 +151,6 @@ jobs:
              --skip-model-tests
      workingDirectory: $(Build.SourcesDirectory)

-  - script: git checkout -- .
-    displayName: Discard local changes to Git repository files
-    workingDirectory: $(Build.SourcesDirectory)
-
  - task: CmdLine@2
    displayName: 5. Build onnxruntime minimal baseline for Android arm64-v8a and report binary size
    inputs:
@ -207,10 +195,6 @@ jobs:
          --build_id=$(Build.BuildId)
      workingDirectory: '$(Build.BinariesDirectory)'

-  - script: git checkout -- .
-    displayName: Discard local changes to Git repository files
-    workingDirectory: $(Build.SourcesDirectory)
-
  - task: CmdLine@2
    displayName: 6a. Build full onnxruntime with runtime optimizations enabled
    inputs:
@ -252,10 +236,6 @@ jobs:
              --cmake_extra_defines onnxruntime_ENABLE_ORT_FORMAT_RUNTIME_GRAPH_OPTIMIZATION=ON
      workingDirectory: $(Build.SourcesDirectory)

-  - script: git checkout -- .
-    displayName: Discard local changes to Git repository files
-    workingDirectory: $(Build.SourcesDirectory)
-
  - task: CmdLine@2
    displayName: 7a. Regular build with python and all optional features disabled. 
    inputs:
--- a/tools/ci_build/github/linux/ort_minimal/build_full_ort_and_create_ort_files.sh
+++ b/tools/ci_build/github/linux/ort_minimal/build_full_ort_and_create_ort_files.sh
@ -17,7 +17,7 @@ python3 /onnxruntime_src/tools/ci_build/op_registration_validator.py
 # Run a full build of ORT.
 # We need the ORT python package to generate the ORT format files and the required ops config files.
 # We do not run tests in this command since those are covered by other CIs.
-# We run two full builds here. One for enabling nnapi and the other for enabling coreml.
+# Both the NNAPI and CoreML EPs are enabled.
 python3 /onnxruntime_src/tools/ci_build/build.py \
    --build_dir ${BUILD_DIR} --cmake_generator Ninja \
    --config Debug \
@ -27,7 +27,7 @@ python3 /onnxruntime_src/tools/ci_build/build.py \
    --skip_tests \
    --enable_training_ops \
    --enable_pybind --cmake_extra_defines PYTHON_INCLUDE_DIR=/opt/python/cp37-cp37m/include/python3.7m PYTHON_LIBRARY=/usr/lib64/librt.so \
-    --use_nnapi\
+    --use_nnapi \
    --use_coreml

 # Run kernel def hash verification test
--- a/tools/ci_build/reduce_op_kernels.py
+++ b/tools/ci_build/reduce_op_kernels.py
@ -4,7 +4,6 @@

 import argparse
 import op_registration_utils
-import os
 import shutil
 import sys
 import typing
@ -12,14 +11,13 @@ import typing
 from logger import get_logger
 from pathlib import Path

-REDUCED_KERNEL_DEF_SUFFIX = '_reduced_ops'
-REDUCED_TYPE_CONTROL_SUFFIX = '_reduced_types'
+# directory containing the reduced op files, relative to the build directory
+OP_REDUCTION_DIR = "op_reduction.generated"

 # add the path to /tools/python so we can import the config parsing and type reduction processing
-script_path = os.path.dirname(os.path.realpath(__file__))
-ort_root = os.path.abspath(os.path.join(script_path, '..', '..', ))
-ort_tools_py_path = os.path.abspath(os.path.join(ort_root, 'tools', 'python'))
-sys.path.append(ort_tools_py_path)
+SCRIPT_DIR = Path(__file__).parent.resolve()
+ORT_ROOT = SCRIPT_DIR.parents[1]
+sys.path.append(str(ORT_ROOT / 'tools' / 'python'))

 from util import parse_config  # noqa
 from util.ort_format_model.operator_type_usage_processors import OpTypeImplFilterInterface  # noqa
@ -96,24 +94,37 @@ class _ExcludingRegistrationProcessor(op_registration_utils.RegistrationProcesso
        return True


-def _process_provider_registrations(
-        ort_root: str, use_cuda: bool,
+def _get_op_reduction_file_path(ort_root: Path, build_dir: Path, original_path: typing.Optional[Path] = None):
+    '''
+    Map a source file path to the location of its op reduction copy.
+    The op reduction root is the `OP_REDUCTION_DIR` subdirectory of `build_dir`. Below that root, a file keeps the
+    same path components that `original_path` has relative to `ort_root`.
+    :param ort_root: Root of the ONNX Runtime repository
+    :param build_dir: Path to the build directory
+    :param original_path: Path of the original file under `ort_root`, or None to get the op reduction root itself
+    :return: The op reduction file path for `original_path`, or the op reduction root if it is None
+    '''
+    generated_root = Path(build_dir) / OP_REDUCTION_DIR
+    if original_path is None:
+        return generated_root
+    return generated_root / original_path.relative_to(ort_root)
+
+
+def _generate_provider_registrations(
+        ort_root: Path, build_dir: Path, use_cuda: bool,
        required_ops: typing.Optional[dict],
        op_type_impl_filter: typing.Optional[OpTypeImplFilterInterface]):
-    '''Rewrite provider registration files.'''
-    kernel_registration_files = op_registration_utils.get_kernel_registration_files(ort_root, use_cuda)
+    '''Generate provider registration files.'''
+    kernel_registration_files = [Path(f) for f in
+                                 op_registration_utils.get_kernel_registration_files(str(ort_root), use_cuda)]

    for kernel_registration_file in kernel_registration_files:
-        if not os.path.isfile(kernel_registration_file):
-            raise ValueError('Kernel registration file {} does not exist'.format(kernel_registration_file))
+        if not kernel_registration_file.is_file():
+            raise ValueError(f'Kernel registration file does not exist: {kernel_registration_file}')

        log.info("Processing {}".format(kernel_registration_file))

-        old_path = Path(kernel_registration_file)
-        reduced_path = Path(old_path.parent, f'{old_path.stem}{REDUCED_KERNEL_DEF_SUFFIX}{old_path.suffix}')
+        reduced_path = _get_op_reduction_file_path(ort_root, build_dir, kernel_registration_file)

-        # read from original and create the reduced kernel def file (*_reduced_ops.cc),
-        # with commented out lines for any kernels that are not required
+        reduced_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # read from original and create the reduced kernel def file with commented out lines for any kernels that are
+        # not required
        with open(reduced_path, 'w') as file_to_write:
            processor = _ExcludingRegistrationProcessor(required_ops, op_type_impl_filter, file_to_write)

@ -123,29 +134,24 @@ def _process_provider_registrations(
                # error should have already been logged so just exit
                sys.exit(-1)

-        # enable the contents in the *_reduced_ops.cc
-        with open(reduced_path, 'r+') as file:
-            file_content = file.read().replace(r'#ifndef REDUCED_OPS_BUILD', r'#ifdef REDUCED_OPS_BUILD')

-        with open(reduced_path, "w") as file_to_write:
-            file_to_write.write(file_content)
-
-
-def _insert_type_control_cpp_code(ort_root: str, cpp_lines: typing.Sequence[str]):
+def _generate_type_control_overrides(ort_root: Path, build_dir: Path, cpp_lines: typing.Sequence[str]):
    '''
-    Insert the C++ code to specify operator type requirements.
+    Generate type control overrides. Insert applicable C++ code to specify operator type requirements.
    :param ort_root: Root of the ONNX Runtime repository
+    :param build_dir: Path to the build directory
    :param cpp_lines: The C++ code to insert
    '''
-    src = os.path.join(ort_root, 'onnxruntime', 'core', 'providers', 'op_kernel_type_control_overrides.inc')
-    if not os.path.exists(src) or not os.path.isfile(src):
-        log.warning('Could not find {}. Skipping generation of C++ code to reduce the types supported by operators.'
-                    .format(src))
-        return
+    src = Path(ort_root, 'onnxruntime', 'core', 'providers', 'op_kernel_type_control_overrides.inc')
+
+    if not src.is_file():
+        raise ValueError(f"Op kernel type control overrides file does not exist: {src}")
+
+    # create a copy of op_kernel_type_control_overrides.inc
+    target = _get_op_reduction_file_path(ort_root, build_dir, src)
+
+    target.parent.mkdir(parents=True, exist_ok=True)

-    # create a copy of the op_kernel_type_control_overrides.inc even the cpp_lines is empty
-    src_path = Path(src)
-    target = Path(src_path.parent, f'{src_path.stem}{REDUCED_TYPE_CONTROL_SUFFIX}{src_path.suffix}')
    shutil.copyfile(src, target)

    if cpp_lines:
@ -173,20 +179,29 @@ def _insert_type_control_cpp_code(ort_root: str, cpp_lines: typing.Sequence[str]
            raise RuntimeError('Insertion point was not found in {}'.format(target))


-def reduce_ops(config_path: str, enable_type_reduction: bool = False, use_cuda: bool = True):
+def reduce_ops(config_path: str, build_dir: str, enable_type_reduction: bool = False, use_cuda: bool = True):
    '''
    Reduce op kernel implementations.
    :param config_path: Path to configuration file that specifies the ops to include
+    :param build_dir: Path to the build directory. The op reduction files will be generated under the build directory.
    :param enable_type_reduction: Whether per operator type reduction is enabled
    :param use_cuda: Whether to reduce op kernels for the CUDA provider
    '''
+    build_dir = Path(build_dir).resolve()
+    build_dir.mkdir(parents=True, exist_ok=True)
+
    required_ops, op_type_impl_filter = parse_config(config_path, enable_type_reduction)

-    _process_provider_registrations(ort_root, use_cuda, required_ops, op_type_impl_filter)
+    # delete any existing generated files first
+    op_reduction_root = _get_op_reduction_file_path(ORT_ROOT, build_dir)
+    if op_reduction_root.is_dir():
+        log.info(f"Deleting existing op reduction file root directory: {op_reduction_root}")
+        shutil.rmtree(op_reduction_root)

-    if enable_type_reduction:
-        type_control_cpp_code = op_type_impl_filter.get_cpp_entries() if op_type_impl_filter is not None else []
-        _insert_type_control_cpp_code(ort_root, type_control_cpp_code)
+    _generate_provider_registrations(ORT_ROOT, build_dir, use_cuda, required_ops, op_type_impl_filter)
+
+    type_control_cpp_code = op_type_impl_filter.get_cpp_entries() if op_type_impl_filter is not None else []
+    _generate_type_control_overrides(ORT_ROOT, build_dir, type_control_cpp_code)


 if __name__ == "__main__":
@ -199,6 +214,19 @@ if __name__ == "__main__":
                             "Create with <ORT root>/tools/python/create_reduced_build_config.py and edit if needed. "
                             "See /docs/ONNX_Runtime_Format_Model_Usage.md for more information.")

+    parser.add_argument("--cmake_build_dir", type=str, required=True,
+                        help="Path to the build directory. "
+                             "The op reduction files will be generated under the build directory.")
+
+    parser.add_argument("--enable_type_reduction", action="store_true",
+                        help="Whether per operator type reduction is enabled.")
+
+    parser.add_argument("--use_cuda", action="store_true",
+                        help="Whether to reduce op kernels for the CUDA provider.")
+
    args = parser.parse_args()
-    config_path = os.path.abspath(args.config_path)
-    reduce_ops(config_path, enable_type_reduction=True, use_cuda=True)
+
+    reduce_ops(config_path=args.config_path,
+               build_dir=args.cmake_build_dir,
+               enable_type_reduction=args.enable_type_reduction,
+               use_cuda=args.use_cuda)