From 6958f49dae3a26b3fc177fecc7aa87e177733d7e Mon Sep 17 00:00:00 2001
From: Boris Fomitchev <borisfom@users.noreply.github.com>
Date: Fri, 31 Jul 2020 23:49:23 -0700
Subject: [PATCH] Added Dockerfile and build instructions for Jetson. Also set
 CUDA arch set automatically. (#4637)

* Revert "Remove docstrigs if __ONNX_NO_DOC_STRINGS" (#4495)

This reverts commit bb4d331fa7bf1fe8d68b1527dda56e4739c80800.

* Bump version to 1.4.0 (#4496)

* Create N-1 threads in intra-op pool, given main thread now active (#4493)

Create N-1 threads in a thread pool when configured with intra-op parallelism of N. This ensures we have N active threads, given that the main thread also runs work. To avoid ambiguity on the value returned, rename ThreadPool::NumThreads method to ThreadPool::DegreeOfParallelism, and make corresponding updates in MLAS and operators.

* Conditionally compile without std::is_trivially_copyable to satisfy old GCC versions. (#4510)

* Adding CUDA arch flags for NVIDIA Jetson

Signed-off-by: Boris Fomitchev <bfomitchev@nvidia.com>

* Added Dockerfile for Jetson and instructions to build wheel and image

Signed-off-by: Boris Fomitchev <bfomitchev@nvidia.com>

* Removing guess about nvcc location

Signed-off-by: Boris Fomitchev <bfomitchev@nvidia.com>

* Restoring pip3 setuptools install order

Signed-off-by: Boris Fomitchev <bfomitchev@nvidia.com>

* Updated README with links and notes re NVIDIA Docker runtime

Signed-off-by: Boris Fomitchev <bfomitchev@nvidia.com>

* Added mention of nvidia-docker

Signed-off-by: Boris Fomitchev <bfomitchev@nvidia.com>

* Addressing code review comments

Signed-off-by: Boris Fomitchev <bfomitchev@nvidia.com>

* Addressing code review comments

Signed-off-by: Boris Fomitchev <bfomitchev@nvidia.com>

Co-authored-by: Tiago Koji Castro Shibata <ticastro@microsoft.com>
Co-authored-by: Dmitri Smirnov <yuslepukhin@users.noreply.github.com>
Co-authored-by: Tim Harris <tiharr@microsoft.com>
Co-authored-by: edgchen1 <18449977+edgchen1@users.noreply.github.com>
---
 BUILD.md                      | 29 +++++++--------------------
 cmake/CMakeLists.txt          | 28 ++++++++++++++++----------
 dockerfiles/Dockerfile.jetson | 37 +++++++++++++++++++++++++++++++++++
 dockerfiles/README.md         | 36 ++++++++++++++++++++++++++++++++++
 4 files changed, 98 insertions(+), 32 deletions(-)
 create mode 100644 dockerfiles/Dockerfile.jetson

diff --git a/BUILD.md b/BUILD.md
index e43593fbbf..839932601f 100644
--- a/BUILD.md
+++ b/BUILD.md
@@ -12,6 +12,7 @@
   * Execution Providers
     * [NVIDIA CUDA](#CUDA)
     * [NVIDIA TensorRT](#TensorRT)
+    * [NVIDIA Jetson TX1/TX2/Nano/Xavier](#nvidia-jetson-tx1tx2nanoxavier)
     * [Intel DNNL/MKL-ML](#DNNL-and-MKLML)
     * [Intel nGraph](#nGraph)
     * [Intel OpenVINO](#openvino)
@@ -210,34 +211,18 @@ Dockerfile instructions are available [here](./dockerfiles#tensorrt)
 
 ---
 
-#### Jetson TX1/TX2/Nano (ARM64 Builds)
+#### NVIDIA Jetson TX1/TX2/Nano/Xavier
 
-1. ONNX Runtime v1.2.0 or higher requires TensorRT 7 support, at this moment, the compatible TensorRT and CUDA libraries in [JetPack](https://docs.nvidia.com/jetson/jetpack/release-notes/) 4.4 is still under developer preview stage. Therefore, we suggest using ONNX Runtime v1.1.2 with JetPack 4.3 which has been validated.
-```
-git clone --single-branch --recursive --branch v1.1.2 https://github.com/Microsoft/onnxruntime
-```
-2. Indicate CUDA compiler. It's optional, cmake can automatically find the correct cuda.
+1. Indicate the CUDA compiler, or add its location to the PATH.
+CMake can't automatically find the correct nvcc if it's not in the PATH.
 ```
 export CUDACXX="/usr/local/cuda/bin/nvcc"
 ```
-3. Modify  tools/ci_build/build.py
+or:
 ```
-- "-Donnxruntime_DEV_MODE=" + ("OFF" if args.android else "ON"),
-+ "-Donnxruntime_DEV_MODE=" + ("OFF" if args.android else "OFF"),
+export PATH="/usr/local/cuda/bin:${PATH}"
 ```
-4. Modify cmake/CMakeLists.txt
-```
--  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") # M series
-+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_53,code=sm_53") # Jetson TX1/Nano
-+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_62,code=sm_62") # Jetson TX2
-```
-5. Build onnxruntime with --use_tensorrt flag
-```
-./build.sh --config Release --update --build --build_wheel --use_tensorrt --cuda_home /usr/local/cuda --cudnn_home /usr/lib/aarch64-linux-gnu --tensorrt_home /usr/lib/aarch64-linux-gnu
-
-```
-
-See [instructions](https://github.com/microsoft/onnxruntime/issues/2684#issuecomment-568548387) for additional information and tips.
+2. Follow the instructions in the [Docker README](./dockerfiles/README.md#nvidia-jetson-tx1tx2nanoxavier) to build the wheel file.
 
 ---
 
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 703afeacfe..1c0be4d4b8 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -891,17 +891,25 @@ if (onnxruntime_USE_CUDA)
   endif()
   list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${ONNXRUNTIME_CUDA_LIBRARIES})
 
-  # the following compute capabilities are deprecated in CUDA 11 Toolkit
-  if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_30,code=sm_30") # K series
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") # M series
-  endif()
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_52,code=sm_52") # M60
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60") # P series
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70") # V series
-  if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_80,code=sm_80") # A series
+  if(CMAKE_LIBRARY_ARCHITECTURE STREQUAL "aarch64-linux-gnu") # aarch64 Linux is treated as an NVIDIA Jetson target
+    string(APPEND CMAKE_CUDA_FLAGS " -gencode=arch=compute_53,code=sm_53 -gencode=arch=compute_62,code=sm_62") # Nano, TX1, TX2 (leading space keeps options separated)
+    string(APPEND CMAKE_CUDA_FLAGS " -gencode=arch=compute_72,code=sm_72") # Xavier (leading space keeps it separate from sm_62 above)
+  else()
+    # the following compute capabilities are deprecated in CUDA 11 Toolkit
+    if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11)
+      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_30,code=sm_30") # K series
+      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") # M series
+    endif()
+
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_52,code=sm_52") # M60
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60") # P series
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70") # V series
+
+    if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11)
+      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_80,code=sm_80") # A series
+    endif()
   endif()
+  
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --default-stream legacy")
   if (NOT WIN32)
     set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --expt-relaxed-constexpr --compiler-options -fPIC")
diff --git a/dockerfiles/Dockerfile.jetson b/dockerfiles/Dockerfile.jetson
new file mode 100644
index 0000000000..9998fa91e2
--- /dev/null
+++ b/dockerfiles/Dockerfile.jetson
@@ -0,0 +1,37 @@
+# syntax=docker/dockerfile:experimental
+#
+# This Dockerfile only installs a pre-built ONNX Runtime wheel inside the image.
+# Make sure the NVIDIA runtime is enabled in the Docker config, then build with (WHEEL_FILE must be inside the build context):
+#
+# sudo -H DOCKER_BUILDKIT=1 nvidia-docker build --build-arg WHEEL_FILE=<path> -f Dockerfile.jetson .
+#
+
+ARG BASE_IMAGE=nvcr.io/nvidia/l4t-base:r32.4.3
+FROM ${BASE_IMAGE} AS onnxruntime
+
+ARG WHEEL_FILE
+RUN echo "Building ONNX Runtime Docker image with ${WHEEL_FILE}..."
+
+# Ensure apt-get won't prompt for selecting options
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        software-properties-common \
+        libopenblas-dev \
+        libpython3.6-dev \
+        python3-pip \
+        python3-dev \
+        cmake \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN pip3 install --upgrade pip
+RUN pip3 install setuptools
+RUN pip3 install wheel pybind11 pytest
+
+WORKDIR /onnxruntime
+
+# copy previously built wheel into the container
+COPY ${WHEEL_FILE} .
+
+RUN pip3 install "$(basename "${WHEEL_FILE}")"
diff --git a/dockerfiles/README.md b/dockerfiles/README.md
index 9795a6cfa1..c284336c55 100644
--- a/dockerfiles/README.md
+++ b/dockerfiles/README.md
@@ -10,6 +10,7 @@
 - OpenVINO: [Dockerfile](Dockerfile.openvino), [Instructions](#openvino)
 - Nuphar: [Dockerfile](Dockerfile.nuphar), [Instructions](#nuphar)
 - ARM 32v7: [Dockerfile](Dockerfile.arm32v7), [Instructions](#arm-32v7)
+- NVIDIA Jetson TX1/TX2/Nano/Xavier: [Dockerfile](Dockerfile.jetson), [Instructions](#nvidia-jetson-tx1tx2nanoxavier)
 - ONNX-Ecosystem (CPU + Converters): [Dockerfile](https://github.com/onnx/onnx-docker/blob/master/onnx-ecosystem/Dockerfile), [Instructions](https://github.com/onnx/onnx-docker/tree/master/onnx-ecosystem)
 - ONNX Runtime Server: [Dockerfile](Dockerfile.server), [Instructions](#onnx-runtime-server)
 - MIGraphX: [Dockerfile](Dockerfile.migraphx), [Instructions](#migraphx)
@@ -246,6 +247,41 @@ The Dockerfile used in these instructions specifically targets Raspberry Pi 3/3+
     ```
 10. Test installation by following the instructions [here](https://microsoft.github.io/onnxruntime/)
 
+## NVIDIA Jetson TX1/TX2/Nano/Xavier
+
+These instructions are for [JetPack SDK 4.4](https://developer.nvidia.com/embedded/jetpack).
+Dockerfile.jetson uses [NVIDIA L4T 32.4.3](https://developer.nvidia.com/embedded/linux-tegra) as its base image.
+Other versions may require modifications to these instructions.
+The instructions assume you are on a Jetson host, in the root of a clone of the onnxruntime git project (`https://github.com/microsoft/onnxruntime`).
+
+Two-step installation is required:
+
+1. Build Python 'wheel' for ONNX Runtime on host Jetson system;
+2. Build Docker image using ONNX Runtime wheel from step 1. You can also install the wheel on the host directly.
+
+Here are the build commands for each step:
+
+1.1 Install ONNX Runtime build dependencies on Jetpack 4.4 host:
+```
+   sudo apt install -y --no-install-recommends \
+    	build-essential software-properties-common cmake libopenblas-dev \
+	libpython3.6-dev python3-pip python3-dev
+```
+1.2 Build ONNXRuntime Python wheel:
+```
+   ./build.sh --update --config Release --build --build_wheel \
+   --use_cuda --cuda_home /usr/local/cuda --cudnn_home /usr/lib/aarch64-linux-gnu
+```
+Note: You may add the --use_tensorrt and --tensorrt_home options if you wish to use NVIDIA TensorRT (support is experimental), as well as any other options supported by the [build.sh script](../build.sh).
+
+2. After the Python wheel is successfully built, use the 'find' command to locate it and have Docker install it inside a new image:
+```
+   find . -name '*.whl' -print -exec sudo -H DOCKER_BUILDKIT=1 nvidia-docker build --build-arg WHEEL_FILE={} -f ./dockerfiles/Dockerfile.jetson . \;
+```
+Note: Resulting Docker image will have ONNX Runtime installed in /usr, and ONNX Runtime wheel copied to /onnxruntime directory.
+Nothing else from ONNX Runtime source tree will be copied/installed to the image.
+
+Note: When running the container you built in Docker, please either use the 'nvidia-docker' command instead of 'docker', or use Docker command-line options to make sure the NVIDIA runtime is used and the appropriate files are mounted from the host. Otherwise, CUDA libraries won't be found. You can also [set NVIDIA runtime as default in Docker](https://github.com/dusty-nv/jetson-containers#docker-default-runtime).
 
 ## Nuphar
 *Public Preview*