From 55c745eefdeea54ff1e527b2634289dde17ddbc5 Mon Sep 17 00:00:00 2001
From: Thiago Crepaldi <thiago.crepaldi@microsoft.com>
Date: Thu, 8 Sep 2022 15:30:44 -0400
Subject: [PATCH] Add support for ORTModule Torch cpp CUDA extension build
 within docker (#12868)

Currently, CUDA hardware is not available to the build during
`docker build`. Because of that, the CUDA extensions are not compiled
and CUDA-capable hardware ends up without CUDA support.

This PR adds an environment variable, ONNXRUNTIME_FORCE_CUDA, which
allows CUDA extensions to be compiled even when CUDA support is not
detected.
---
 .../training/ortmodule/torch_cpp_extensions/__init__.py     | 1 +
 .../ortmodule/torch_cpp_extensions/cuda/fused_ops/setup.py  | 4 +++-
 .../torch_cpp_extensions/cuda/torch_gpu_allocator/setup.py  | 4 +++-
 .../training/ortmodule/torch_cpp_extensions/install.py      | 6 +++++-
 4 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/__init__.py b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/__init__.py
index 765f33dd9a..e6b1f0fb8b 100644
--- a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/__init__.py
+++ b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/__init__.py
@@ -21,6 +21,7 @@ The following environment variables are available for the extensions setup.py
     - ORTMODULE_TORCH_CPP_DIR: ORTModule's internal
     - ONNXRUNTIME_ROCM_VERSION: ROCM version used to build ONNX Runtime package
     - ONNXRUNTIME_CUDA_VERSION: CUDA version used to build ONNX Runtime package
+    - ONNXRUNTIME_FORCE_CUDA: Build CUDA extensions even when CUDA is not detected (any non-empty value enables it)
 
 TODO: Create a generic mechanism to pass arguments from ORTModule into each extension setup.py
 TODO: Create environment variables to allow extensions to be hosted outside ONNX runtime installation folder
diff --git a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/setup.py b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/setup.py
index 71d44292d8..b73623c430 100644
--- a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/setup.py
+++ b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/setup.py
@@ -21,7 +21,9 @@ filenames = [
 use_rocm = True if os.environ["ONNXRUNTIME_ROCM_VERSION"] else False
 extra_compile_args = {"cxx": ["-O3"]}
 if not use_rocm:
-    extra_compile_args.update({"nvcc": os.environ["ONNXRUNTIME_CUDA_NVCC_EXTRA_ARGS"].split(",")})
+    nvcc_extra_args = os.environ.get("ONNXRUNTIME_CUDA_NVCC_EXTRA_ARGS", "")
+    if nvcc_extra_args:
+        extra_compile_args.update({"nvcc": nvcc_extra_args.split(",")})
 
 setup(
     name="fused_ops",
diff --git a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/torch_gpu_allocator/setup.py b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/torch_gpu_allocator/setup.py
index 169c500b57..99f6699dca 100644
--- a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/torch_gpu_allocator/setup.py
+++ b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/torch_gpu_allocator/setup.py
@@ -25,7 +25,9 @@ with fileinput.FileInput(filename, inplace=True) as file:
 
 extra_compile_args = {"cxx": ["-O3"]}
 if not use_rocm:
-    extra_compile_args.update({"nvcc": os.environ["ONNXRUNTIME_CUDA_NVCC_EXTRA_ARGS"].split(",")})
+    nvcc_extra_args = os.environ.get("ONNXRUNTIME_CUDA_NVCC_EXTRA_ARGS", "")
+    if nvcc_extra_args:
+        extra_compile_args.update({"nvcc": nvcc_extra_args.split(",")})
 
 setup(
     name="torch_gpu_allocator",
diff --git a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/install.py b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/install.py
index 6c1f805310..bb0952dea5 100644
--- a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/install.py
+++ b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/install.py
@@ -55,6 +55,10 @@ def build_torch_cpp_extensions():
     is_gpu_available = (torch.version.cuda is not None or torch.version.hip is not None) and (
         ortmodule.ONNXRUNTIME_CUDA_VERSION is not None or ortmodule.ONNXRUNTIME_ROCM_VERSION is not None
     )
+
+    # Docker builds don't have CUDA support, but building the Torch C++ CUDA extensions may be forced
+    force_cuda = bool(os.environ.get("ONNXRUNTIME_FORCE_CUDA", False))
+
     os.chdir(ortmodule.ORTMODULE_TORCH_CPP_DIR)
 
     # Extensions might leverage CUDA/ROCM versions internally
@@ -71,7 +75,7 @@ def build_torch_cpp_extensions():
     ############################################################################
     # Pytorch CPP Extensions that DO require CUDA/ROCM
     ############################################################################
-    if is_gpu_available:
+    if is_gpu_available or force_cuda:
         for ext_setup in _list_cuda_extensions():
             _install_extension(ext_setup.split(os.sep)[-2], ext_setup, ortmodule.ORTMODULE_TORCH_CPP_DIR)