From 9ddbb046a2c1a26b26ea4628f25c2f6147ae4004 Mon Sep 17 00:00:00 2001
From: Ryan Hill <ryanhill@microsoft.com>
Date: Sun, 16 May 2021 19:22:03 -0700
Subject: [PATCH] Merge related issues and code review feedback.

---
 cmake/CMakeLists.txt                          |  1 +
 include/onnxruntime/core/framework/tensor.h   |  8 ++-
 .../main/java/ai/onnxruntime/OnnxRuntime.java |  2 +
 .../contrib_ops/cuda/bert/attention.cc        |  4 +-
 .../core/framework/provider_bridge_ort.cc     | 14 ++---
 onnxruntime/core/providers/cuda/rnn/gru.cc    |  4 +-
 onnxruntime/core/providers/cuda/rnn/lstm.cc   |  4 +-
 onnxruntime/core/providers/cuda/rnn/rnn.cc    |  4 +-
 .../core/providers/cuda/tensor/reshape.cc     |  4 +-
 .../providers/shared_library/provider_api.h   | 17 -----
 .../provider_bridge_provider.cc               | 62 +++++++++----------
 11 files changed, 55 insertions(+), 69 deletions(-)
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index fb7a898085..9e7a537d01 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -193,6 +193,7 @@ if(onnxruntime_USE_VALGRIND AND NOT WIN32)
 endif()
 
 if (onnxruntime_ENABLE_NVTX_PROFILE)
+  message(WARNING "NVTX profile temporarily disabled, will be fixed soon")
 # TODO: This doesn't work with the shared cuda provider. Disabling temporarily to do a clean fix later as it wasn't trivial
 #  add_definitions(-DENABLE_NVTX_PROFILE=1)
 endif()
diff --git a/include/onnxruntime/core/framework/tensor.h b/include/onnxruntime/core/framework/tensor.h
index cea531a6e1..4b09260856 100644
--- a/include/onnxruntime/core/framework/tensor.h
+++ b/include/onnxruntime/core/framework/tensor.h
@@ -34,8 +34,12 @@ namespace onnxruntime {
 */
 class Tensor final {
  public:
-  static std::unique_ptr<Tensor> Create(MLDataType p_type, const TensorShape& shape, std::shared_ptr<IAllocator> allocator) { return std::make_unique<Tensor>(p_type, shape, allocator); }
-  static std::unique_ptr<Tensor> Create(MLDataType p_type, const TensorShape& shape, void* p_data, const OrtMemoryInfo& alloc, ptrdiff_t offset = 0) { return std::make_unique<Tensor>(p_type, shape, p_data, alloc, offset); }
+  static std::unique_ptr<Tensor> Create(MLDataType p_type, const TensorShape& shape, std::shared_ptr<IAllocator> allocator) {
+    return std::make_unique<Tensor>(p_type, shape, allocator);
+  }
+  static std::unique_ptr<Tensor> Create(MLDataType p_type, const TensorShape& shape, void* p_data, const OrtMemoryInfo& alloc, ptrdiff_t offset = 0) {
+    return std::make_unique<Tensor>(p_type, shape, p_data, alloc, offset);
+  }
 
   Tensor() = default;  // to allow creating vector<Tensor> to support seq(tensor)
 
diff --git a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java
index a6b7085e96..4e6db62ace 100644
--- a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java
+++ b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java
@@ -120,6 +120,7 @@ final class OnnxRuntime {
    * in time.
    *
    * @param file The file to remove.
+   * @param onExitOnly If true, delete the file on exit only instead of trying to delete it immediately.
    */
   private static void cleanUp(File file, boolean onExitOnly) {
     if (!file.exists()) {
@@ -146,6 +147,7 @@ final class OnnxRuntime {
    *
    * @param tempDirectory The temp directory to write the library resource to.
    * @param library The bare name of the library.
+   * @param systemLoad If System.load(..) should be called on the library vs just preparing it.
    * @throws IOException If the file failed to read or write.
    */
   private static void load(Path tempDirectory, String library, boolean systemLoad)
diff --git a/onnxruntime/contrib_ops/cuda/bert/attention.cc b/onnxruntime/contrib_ops/cuda/bert/attention.cc
index 8cc92094c4..733b98ea1c 100644
--- a/onnxruntime/contrib_ops/cuda/bert/attention.cc
+++ b/onnxruntime/contrib_ops/cuda/bert/attention.cc
@@ -1,10 +1,10 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/cuda/cuda_common.h"
-#include "core/providers/cuda/shared_inc/fpgeneric.h"
 #include "attention.h"
 #include "attention_impl.h"
+#include "core/providers/cuda/cuda_common.h"
+#include "core/providers/cuda/shared_inc/fpgeneric.h"
 
 using namespace onnxruntime::cuda;
 using namespace ::onnxruntime::common;
diff --git a/onnxruntime/core/framework/provider_bridge_ort.cc b/onnxruntime/core/framework/provider_bridge_ort.cc
index 09f665eb8a..0d26c876ab 100644
--- a/onnxruntime/core/framework/provider_bridge_ort.cc
+++ b/onnxruntime/core/framework/provider_bridge_ort.cc
@@ -1007,48 +1007,48 @@ std::unique_ptr<IAllocator> CreateCUDAPinnedAllocator(int16_t device_id, const c
 }
 
 std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Cuda(const OrtCUDAProviderOptions* provider_options) {
-  if (auto provider = s_library_cuda.Get())
+  if (auto* provider = s_library_cuda.Get())
     return provider->CreateExecutionProviderFactory(provider_options);
 
   return nullptr;
 }
 
 std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Dnnl(int use_arena) {
-  if (auto provider = s_library_dnnl.Get())
+  if (auto* provider = s_library_dnnl.Get())
     return provider->CreateExecutionProviderFactory(use_arena);
 
   return nullptr;
 }
 
 std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Tensorrt(int device_id) {
-  if (auto provider = s_library_tensorrt.Get())
+  if (auto* provider = s_library_tensorrt.Get())
     return provider->CreateExecutionProviderFactory(device_id);
 
   return nullptr;
 }
 
 std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Tensorrt(const OrtTensorRTProviderOptions* provider_options) {
-  if (auto provider = s_library_tensorrt.Get())
+  if (auto* provider = s_library_tensorrt.Get())
     return provider->CreateExecutionProviderFactory(provider_options);
 
   return nullptr;
 }
 
 std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_OpenVINO(const OrtOpenVINOProviderOptions* provider_options) {
-  if (auto provider = s_library_openvino.Get())
+  if (auto* provider = s_library_openvino.Get())
     return provider->CreateExecutionProviderFactory(provider_options);
 
   return nullptr;
 }
 
 ProviderInfo_OpenVINO* GetProviderInfo_OpenVINO() {
-  if (auto provider = s_library_openvino.Get())
+  if (auto* provider = s_library_openvino.Get())
     return reinterpret_cast<ProviderInfo_OpenVINO*>(provider->GetInfo());
   return nullptr;
 }
 
 ProviderInfo_CUDA* GetProviderInfo_CUDA() {
-  if (auto provider = s_library_cuda.Get())
+  if (auto* provider = s_library_cuda.Get())
     return reinterpret_cast<ProviderInfo_CUDA*>(provider->GetInfo());
   LOGS_DEFAULT(WARNING) << "GetProviderInfo_CUDA called, returning nullptr";
   ORT_THROW("CUDA Provider not available, can't get interface for it");
diff --git a/onnxruntime/core/providers/cuda/rnn/gru.cc b/onnxruntime/core/providers/cuda/rnn/gru.cc
index 578c1dd27b..964aebf560 100644
--- a/onnxruntime/core/providers/cuda/rnn/gru.cc
+++ b/onnxruntime/core/providers/cuda/rnn/gru.cc
@@ -17,10 +17,10 @@ namespace cuda {
       13,                                                                       \
       T,                                                                        \
       kCudaExecutionProvider,                                                   \
-      KernelDefBuilder()                                                        \
+      (*KernelDefBuilder::Create())                                             \
           .TypeConstraint("T", DataTypeImpl::GetTensorType<T>())                \
           .TypeConstraint("T1", DataTypeImpl::GetTensorType<int32_t>())         \
-          .InputMemoryType<OrtMemTypeCPUInput>(RNN_Input_Index::sequence_lens), \
+          .InputMemoryType(OrtMemTypeCPUInput, RNN_Input_Index::sequence_lens), \
       GRU<T>);
 
 #define REGISTER_KERNEL_TYPED(T)                                                \
diff --git a/onnxruntime/core/providers/cuda/rnn/lstm.cc b/onnxruntime/core/providers/cuda/rnn/lstm.cc
index ad9e2ddecc..890d15cef6 100644
--- a/onnxruntime/core/providers/cuda/rnn/lstm.cc
+++ b/onnxruntime/core/providers/cuda/rnn/lstm.cc
@@ -15,10 +15,10 @@ namespace cuda {
       13,                                                                       \
       T,                                                                        \
       kCudaExecutionProvider,                                                   \
-      KernelDefBuilder()                                                        \
+      (*KernelDefBuilder::Create())                                             \
           .TypeConstraint("T", DataTypeImpl::GetTensorType<T>())                \
           .TypeConstraint("T1", DataTypeImpl::GetTensorType<int32_t>())         \
-          .InputMemoryType<OrtMemTypeCPUInput>(RNN_Input_Index::sequence_lens), \
+          .InputMemoryType(OrtMemTypeCPUInput, RNN_Input_Index::sequence_lens), \
       LSTM<T>);
 
 #define REGISTER_KERNEL_TYPED(T)                                                \
diff --git a/onnxruntime/core/providers/cuda/rnn/rnn.cc b/onnxruntime/core/providers/cuda/rnn/rnn.cc
index b438f981c9..4bd22340ef 100644
--- a/onnxruntime/core/providers/cuda/rnn/rnn.cc
+++ b/onnxruntime/core/providers/cuda/rnn/rnn.cc
@@ -17,10 +17,10 @@ namespace cuda {
       13,                                                                       \
       T,                                                                        \
       kCudaExecutionProvider,                                                   \
-      KernelDefBuilder()                                                        \
+      (*KernelDefBuilder::Create())                                             \
           .TypeConstraint("T", DataTypeImpl::GetTensorType<T>())                \
           .TypeConstraint("T1", DataTypeImpl::GetTensorType<int32_t>())         \
-          .InputMemoryType<OrtMemTypeCPUInput>(RNN_Input_Index::sequence_lens), \
+          .InputMemoryType(OrtMemTypeCPUInput, RNN_Input_Index::sequence_lens), \
       RNN<T>);
 
 #define REGISTER_KERNEL_TYPED(T)                                                \
diff --git a/onnxruntime/core/providers/cuda/tensor/reshape.cc b/onnxruntime/core/providers/cuda/tensor/reshape.cc
index 297768fcf4..61bca5bfe7 100644
--- a/onnxruntime/core/providers/cuda/tensor/reshape.cc
+++ b/onnxruntime/core/providers/cuda/tensor/reshape.cc
@@ -11,11 +11,11 @@ ONNX_OPERATOR_KERNEL_EX(
     kOnnxDomain,
     14,
     kCudaExecutionProvider,
-    KernelDefBuilder()
+    (*KernelDefBuilder::Create())
         .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
         .TypeConstraint("shape", DataTypeImpl::GetTensorType<int64_t>())
         .Alias(0, 0)
-        .InputMemoryType<OrtMemTypeCPUInput>(1),
+        .InputMemoryType(OrtMemTypeCPUInput, 1),
     Reshape);
 
 ONNX_OPERATOR_VERSIONED_KERNEL_EX(
diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h
index e21882f5df..3c58150e9f 100644
--- a/onnxruntime/core/providers/shared_library/provider_api.h
+++ b/onnxruntime/core/providers/shared_library/provider_api.h
@@ -206,23 +206,6 @@ using NameMLValMap = std::unordered_map<std::string, OrtValue>;
 
 namespace onnxruntime {
 
-// From Tensor.h
-class BufferDeleter {
- public:
-  BufferDeleter() : alloc_(nullptr) {}
-  BufferDeleter(AllocatorPtr alloc) : alloc_(alloc) {}
-
-  void operator()(void* p) const {
-    if (alloc_)
-      alloc_->Free(p);
-  }
-
- private:
-  AllocatorPtr alloc_;
-};
-
-using BufferUniquePtr = std::unique_ptr<void, BufferDeleter>;
-
 // The function passed in will be run on provider DLL unload. This is used to free thread_local variables that are in threads we don't own
 // Since these are not destroyed when the DLL unloads we have to do it manually. Search for usage for an example.
 void RunOnUnload(std::function<void()> function);
diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
index 26ab338108..058211379d 100644
--- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
+++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
@@ -8,6 +8,35 @@
 #include <mutex>
 #include "core/providers/shared/common.h"
 
+#include "core/framework/random_generator.h"
+#include "core/providers/cpu/controlflow/if.h"
+#include "core/providers/cpu/controlflow/loop.h"
+#include "core/providers/cpu/controlflow/scan.h"
+#include "core/providers/cpu/math/einsum.h"
+#include "core/providers/cpu/object_detection/non_max_suppression.h"
+#include "core/providers/cpu/tensor/concatbase.h"
+#include "core/providers/cpu/tensor/padbase.h"
+#include "core/providers/cpu/tensor/gatherbase.h"
+#include "core/providers/cpu/tensor/slice.h"
+#include "core/providers/cpu/tensor/split.h"
+#include "core/providers/cpu/tensor/size.h"
+#include "core/providers/cpu/tensor/scatter_nd.h"
+#include "core/providers/cpu/tensor/unsqueeze.h"
+#include "core/providers/cpu/tensor/tile.h"
+
+#ifndef DISABLE_CONTRIB_OPS
+#include "contrib_ops/cpu/bert/attention_base.h"
+#include "contrib_ops/cpu/bert/bias_gelu_helper.h"
+#include "contrib_ops/cpu/bert/embed_layer_norm_helper.h"
+#include "contrib_ops/cpu/bert/longformer_attention_base.h"
+#endif
+
+#ifdef ENABLE_TRAINING
+#include "orttraining/training_ops/cpu/aten_ops/aten_op.h"
+#include "orttraining/training_ops/cpu/controlflow/group.h"
+#include "orttraining/training_ops/cpu/controlflow/yield.h"
+#endif
+
 #ifndef _Ret_notnull_
 #define _Ret_notnull_
 #endif
@@ -311,39 +340,6 @@ std::unique_ptr<OpKernelInfo> CopyOpKernelInfo(const OpKernelInfo& info) {
   return g_host->CopyOpKernelInfo(info);
 }
 
-}  // namespace onnxruntime
-
-#include "core/providers/cpu/tensor/unsqueeze.h"
-#include "core/providers/cpu/tensor/slice.h"
-#include "core/providers/cpu/tensor/split.h"
-#include "core/providers/cpu/tensor/size.h"
-#include "core/providers/cpu/tensor/scatter_nd.h"
-#include "core/providers/cpu/tensor/padbase.h"
-#include "core/providers/cpu/tensor/concatbase.h"
-#include "core/providers/cpu/tensor/gatherbase.h"
-#include "core/providers/cpu/controlflow/scan.h"
-#include "core/providers/cpu/controlflow/loop.h"
-#include "core/providers/cpu/tensor/tile.h"
-#include "core/providers/cpu/object_detection/non_max_suppression.h"
-#include "core/framework/random_generator.h"
-#include "core/providers/cpu/math/einsum.h"
-#include "core/providers/cpu/controlflow/if.h"
-
-#ifndef DISABLE_CONTRIB_OPS
-#include "contrib_ops/cpu/bert/bias_gelu_helper.h"
-#include "contrib_ops/cpu/bert/embed_layer_norm_helper.h"
-#include "contrib_ops/cpu/bert/longformer_attention_base.h"
-#include "contrib_ops/cpu/bert/attention_base.h"
-#endif
-
-#ifdef ENABLE_TRAINING
-#include "orttraining/training_ops/cpu/aten_ops/aten_op.h"
-#include "orttraining/training_ops/cpu/controlflow/group.h"
-#include "orttraining/training_ops/cpu/controlflow/yield.h"
-#endif
-
-namespace onnxruntime {
-
 namespace utils {
 template <>
 Status UnpackTensor(const ONNX_NAMESPACE::TensorProto& tensor, const void* raw_data, size_t raw_data_len, /*out*/ bool* p_data, size_t expected_size) { return g_host->UnpackTensor(tensor, raw_data, raw_data_len, p_data, expected_size); }