diff --git a/cgmanifests/cgmanifest.json b/cgmanifests/cgmanifest.json
index 91374bf86d..e4cf9747c0 100644
--- a/cgmanifests/cgmanifest.json
+++ b/cgmanifests/cgmanifest.json
@@ -171,7 +171,7 @@
             "Other": {
                "Name": "Boost",
                "Version": "1.69.0",
-               "DownloadUrl": "http://dl.bintray.com/boostorg/release/1.69.0/source/boost_1_69_0.tar.bz2"
+               "DownloadUrl": "https://boostorg.jfrog.io/artifactory/main/release/1.69.0/source/boost_1_69_0.tar.bz2"
             }
          }
       },
@@ -462,14 +462,14 @@
       },
       {
          "component": {
-           "type": "git",
-           "git": {
-             "commitHash": "e1e11e0d555c08bec08a6c7773aa777dfcaae9da",
-             "repositoryUrl": "https://github.com/dmlc/dlpack.git"
-           },
-           "comments": "dlpack"
+            "type": "git",
+            "git": {
+               "commitHash": "e1e11e0d555c08bec08a6c7773aa777dfcaae9da",
+               "repositoryUrl": "https://github.com/dmlc/dlpack.git"
+            },
+            "comments": "dlpack"
          }
       }
    ],
    "Version": 1
-}
+}
\ No newline at end of file
diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/props.xml b/csharp/src/Microsoft.ML.OnnxRuntime/props.xml
index 591b127518..c467dfa5c6 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/props.xml
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/props.xml
@@ -45,6 +45,36 @@
       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
       <Visible>false</Visible>
     </None>
+    <None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\onnxruntime_providers_shared.dll"
+          Condition="Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\onnxruntime_providers_shared.dll')">
+      <Link>onnxruntime_providers_shared.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+      <Visible>false</Visible>
+    </None>
+    <None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\onnxruntime_providers_cuda.dll"
+          Condition="Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\onnxruntime_providers_cuda.dll')">
+      <Link>onnxruntime_providers_cuda.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+      <Visible>false</Visible>
+    </None>
+    <None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\onnxruntime_providers_dnnl.dll"
+          Condition="Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\onnxruntime_providers_dnnl.dll')">
+      <Link>onnxruntime_providers_dnnl.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+      <Visible>false</Visible>
+    </None>
+    <None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\onnxruntime_providers_tensorrt.dll"
+          Condition="Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\onnxruntime_providers_tensorrt.dll')">
+      <Link>onnxruntime_providers_tensorrt.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+      <Visible>false</Visible>
+    </None>
+    <None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\onnxruntime_providers_openvino.dll"
+          Condition="Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\onnxruntime_providers_openvino.dll')">
+      <Link>onnxruntime_providers_openvino.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+      <Visible>false</Visible>
+    </None>
     <None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\dnnl.dll"
           Condition="('$(PlatformTarget)' == 'x64' OR ('$(PlatformTarget)' == 'AnyCPU' AND '$(Prefer32Bit)' != 'true')) AND
                      Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\dnnl.dll')">
diff --git a/java/src/main/native/ai_onnxruntime_OrtSession.c b/java/src/main/native/ai_onnxruntime_OrtSession.c
index 56a0cf7328..8e96227e0c 100644
--- a/java/src/main/native/ai_onnxruntime_OrtSession.c
+++ b/java/src/main/native/ai_onnxruntime_OrtSession.c
@@ -260,15 +260,19 @@ JNIEXPORT jobjectArray JNICALL Java_ai_onnxruntime_OrtSession_run
     jobject* javaOutputStrings;
     checkOrtStatus(jniEnv, api, api->AllocatorAlloc(allocator,sizeof(jobject)*numOutputs,(void**)&javaOutputStrings));
 
-    // Extract the names of the input values.
+    // Extract a C array of longs which are pointers to the input tensors.
+    // Need to convert longs to OrtValue* in case we run on non-64bit systems
+    jlong* inputTensors = (*jniEnv)->GetLongArrayElements(jniEnv,tensorArr,NULL);
+    const OrtValue** inputValues;
+    checkOrtStatus(jniEnv, api, api->AllocatorAlloc(allocator,sizeof(OrtValue*)*numInputs,(void**)&inputValues));
+
+    // Extract the names and native pointers of the input values.
     for (int i = 0; i < numInputs; i++) {
         javaInputStrings[i] = (*jniEnv)->GetObjectArrayElement(jniEnv,inputNamesArr,i);
         inputNames[i] = (*jniEnv)->GetStringUTFChars(jniEnv,javaInputStrings[i],NULL);
+        inputValues[i] = (OrtValue*)inputTensors[i];
     }
 
-    // Extract a C array of longs which are pointers to the input tensors.
-    jlong* inputTensors = (*jniEnv)->GetLongArrayElements(jniEnv,tensorArr,NULL);
-
     // Extract the names of the output values, and allocate their output array.
     OrtValue** outputValues;
     checkOrtStatus(jniEnv,api,api->AllocatorAlloc(allocator,sizeof(OrtValue*)*numOutputs,(void**)&outputValues));
@@ -281,7 +285,7 @@ JNIEXPORT jobjectArray JNICALL Java_ai_onnxruntime_OrtSession_run
     // Actually score the inputs.
     //printf("inputTensors = %p, first tensor = %p, numInputs = %ld, outputValues = %p, numOutputs = %ld\n",inputTensors,(OrtValue*)inputTensors[0],numInputs,outputValues,numOutputs);
     //ORT_API_STATUS(OrtRun, _Inout_ OrtSession* sess, _In_ OrtRunOptions* run_options, _In_ const char* const* input_names, _In_ const OrtValue* const* input, size_t input_len, _In_ const char* const* output_names, size_t output_names_len, _Out_ OrtValue** output);
-    checkOrtStatus(jniEnv,api,api->Run(session, runOptions, (const char* const*) inputNames, (const OrtValue* const*) inputTensors, numInputs, (const char* const*) outputNames, numOutputs, outputValues));
+    checkOrtStatus(jniEnv,api,api->Run(session, runOptions, (const char* const*) inputNames, (const OrtValue* const*) inputValues, numInputs, (const char* const*) outputNames, numOutputs, outputValues));
     // Release the C array of pointers to the tensors.
     (*jniEnv)->ReleaseLongArrayElements(jniEnv,tensorArr,inputTensors,JNI_ABORT);
 
@@ -307,6 +311,7 @@ JNIEXPORT jobjectArray JNICALL Java_ai_onnxruntime_OrtSession_run
 
     // Release the buffers
     checkOrtStatus(jniEnv, api, api->AllocatorFree(allocator, (void*)inputNames));
+    checkOrtStatus(jniEnv, api, api->AllocatorFree(allocator, (void*)inputValues));
     checkOrtStatus(jniEnv, api, api->AllocatorFree(allocator, (void*)outputNames));
     checkOrtStatus(jniEnv, api, api->AllocatorFree(allocator, javaInputStrings));
     checkOrtStatus(jniEnv, api, api->AllocatorFree(allocator, javaOutputStrings));
diff --git a/onnxruntime/core/framework/provider_bridge_ort.cc b/onnxruntime/core/framework/provider_bridge_ort.cc
index c295aa0b4a..1f2e61f628 100644
--- a/onnxruntime/core/framework/provider_bridge_ort.cc
+++ b/onnxruntime/core/framework/provider_bridge_ort.cc
@@ -109,9 +109,8 @@ using IndexedSubGraph_MetaDef = IndexedSubGraph::MetaDef;
 
 namespace onnxruntime {
 
-//ProviderHost* g_host{};
-
-ProviderInfo_CUDA* GetProviderInfo_CUDA();
+ProviderInfo_CUDA* TryGetProviderInfo_CUDA();
+ProviderInfo_CUDA& GetProviderInfo_CUDA();
 
 struct TensorShapeProto_Dimension_Iterator_Impl : TensorShapeProto_Dimension_Iterator {
   TensorShapeProto_Dimension_Iterator_Impl(google::protobuf::internal::RepeatedPtrIterator<const onnx::TensorShapeProto_Dimension>&& v) : v_{std::move(v)} {}
@@ -187,15 +186,15 @@ struct ProviderHostImpl : ProviderHost {
   void CPUAllocator__Free(CPUAllocator* p, void* allocation) override { return p->CPUAllocator::Free(allocation); }
 
 #ifdef USE_CUDA
-  std::unique_ptr<IAllocator> CreateCUDAAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_CUDA()->CreateCUDAAllocator(device_id, name); }
-  std::unique_ptr<IAllocator> CreateCUDAPinnedAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_CUDA()->CreateCUDAPinnedAllocator(device_id, name); }
-  std::unique_ptr<IDataTransfer> CreateGPUDataTransfer(void* stream) override { return GetProviderInfo_CUDA()->CreateGPUDataTransfer(stream); }
+  std::unique_ptr<IAllocator> CreateCUDAAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_CUDA().CreateCUDAAllocator(device_id, name); }
+  std::unique_ptr<IAllocator> CreateCUDAPinnedAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_CUDA().CreateCUDAPinnedAllocator(device_id, name); }
+  std::unique_ptr<IDataTransfer> CreateGPUDataTransfer(void* stream) override { return GetProviderInfo_CUDA().CreateGPUDataTransfer(stream); }
 
-  void cuda__Impl_Cast(void* stream, const int64_t* input_data, int32_t* output_data, size_t count) override { return GetProviderInfo_CUDA()->cuda__Impl_Cast(stream, input_data, output_data, count); }
-  void cuda__Impl_Cast(void* stream, const int32_t* input_data, int64_t* output_data, size_t count) override { return GetProviderInfo_CUDA()->cuda__Impl_Cast(stream, input_data, output_data, count); }
+  void cuda__Impl_Cast(void* stream, const int64_t* input_data, int32_t* output_data, size_t count) override { return GetProviderInfo_CUDA().cuda__Impl_Cast(stream, input_data, output_data, count); }
+  void cuda__Impl_Cast(void* stream, const int32_t* input_data, int64_t* output_data, size_t count) override { return GetProviderInfo_CUDA().cuda__Impl_Cast(stream, input_data, output_data, count); }
 
-  bool CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return GetProviderInfo_CUDA()->CudaCall_false(retCode, exprString, libName, successCode, msg); }
-  bool CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return GetProviderInfo_CUDA()->CudaCall_true(retCode, exprString, libName, successCode, msg); }
+  bool CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return GetProviderInfo_CUDA().CudaCall_false(retCode, exprString, libName, successCode, msg); }
+  bool CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return GetProviderInfo_CUDA().CudaCall_true(retCode, exprString, libName, successCode, msg); }
 #endif
 
   std::string GetEnvironmentVar(const std::string& var_name) override { return Env::Default().GetEnvironmentVar(var_name); }
@@ -1003,7 +1002,7 @@ void UnloadSharedProviders() {
 
 // Used by test code
 std::unique_ptr<IAllocator> CreateCUDAPinnedAllocator(int16_t device_id, const char* name) {
-  if (auto* info = onnxruntime::GetProviderInfo_CUDA())
+  if (auto* info = onnxruntime::TryGetProviderInfo_CUDA())
     return info->CreateCUDAPinnedAllocator(device_id, name);
 
   return nullptr;
@@ -1050,10 +1049,17 @@ ProviderInfo_OpenVINO* GetProviderInfo_OpenVINO() {
   return nullptr;
 }
 
-ProviderInfo_CUDA* GetProviderInfo_CUDA() {
+ProviderInfo_CUDA* TryGetProviderInfo_CUDA() {
   if (auto* provider = s_library_cuda.Get())
     return reinterpret_cast<ProviderInfo_CUDA*>(provider->GetInfo());
-  LOGS_DEFAULT(WARNING) << "GetProviderInfo_CUDA called, returning nullptr";
+
+  return nullptr;
+}
+
+ProviderInfo_CUDA& GetProviderInfo_CUDA() {
+  if(auto* info = TryGetProviderInfo_CUDA())
+    return *info;
+
   ORT_THROW("CUDA Provider not available, can't get interface for it");
 }
 
@@ -1063,13 +1069,13 @@ void CopyGpuToCpu(
     const size_t size,
     const OrtMemoryInfo& dst_location,
     const OrtMemoryInfo& src_location) {
-  if (auto* info = onnxruntime::GetProviderInfo_CUDA())
+  if (auto* info = onnxruntime::TryGetProviderInfo_CUDA())
     return info->CopyGpuToCpu(dst_ptr, src_ptr, size, dst_location, src_location);
   ORT_THROW("GPU-to-CPU copy is not implemented.");
 }
 
 void cudaMemcpy_HostToDevice(void* dst, const void* src, size_t count) {
-  if (auto* info = onnxruntime::GetProviderInfo_CUDA())
+  if (auto* info = onnxruntime::TryGetProviderInfo_CUDA())
     return info->cudaMemcpy_HostToDevice(dst, src, count);
   ORT_THROW("cudaMemcpy_HostToDevice is not implemented.");
 }
@@ -1077,7 +1083,7 @@ void cudaMemcpy_HostToDevice(void* dst, const void* src, size_t count) {
 #if defined(USE_CUDA) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P)
 namespace cuda {
 INcclService& INcclService::GetInstance() {
-  return GetProviderInfo_CUDA()->GetINcclService();
+  return GetProviderInfo_CUDA().GetINcclService();
 }
 }  // namespace cuda
 #endif
@@ -1147,17 +1153,17 @@ ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_CUDA, _In_ OrtSessi
 
 ORT_API_STATUS_IMPL(OrtApis::SetCurrentGpuDeviceId, _In_ int device_id) {
   API_IMPL_BEGIN
-  if (auto* info = onnxruntime::GetProviderInfo_CUDA())
+  if (auto* info = onnxruntime::TryGetProviderInfo_CUDA())
     return info->SetCurrentGpuDeviceId(device_id);
-  return CreateStatus(ORT_FAIL, "CUDA execution provider is not enabled.");
+  return CreateStatus(ORT_FAIL, "CUDA execution provider is either not enabled or not available.");
   API_IMPL_END
 }
 
 ORT_API_STATUS_IMPL(OrtApis::GetCurrentGpuDeviceId, _In_ int* device_id) {
   API_IMPL_BEGIN
-  if (auto* info = onnxruntime::GetProviderInfo_CUDA())
+  if (auto* info = onnxruntime::TryGetProviderInfo_CUDA())
     return info->GetCurrentGpuDeviceId(device_id);
-  return CreateStatus(ORT_FAIL, "CUDA execution provider is not enabled.");
+  return CreateStatus(ORT_FAIL, "CUDA execution provider is either not enabled or not available.");
   API_IMPL_END
 }
 
diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc
index 2fc7e9ae12..2f0ed2b95b 100644
--- a/onnxruntime/core/platform/windows/env.cc
+++ b/onnxruntime/core/platform/windows/env.cc
@@ -40,6 +40,7 @@ EXTERN_C IMAGE_DOS_HEADER __ImageBase;
 namespace onnxruntime {
 
 namespace {
+
 class WindowsThread : public EnvThread {
  private:
   struct Param {
diff --git a/onnxruntime/core/providers/cpu/generator/constant_of_shape.cc b/onnxruntime/core/providers/cpu/generator/constant_of_shape.cc
index f7e7033471..bba62b290e 100644
--- a/onnxruntime/core/providers/cpu/generator/constant_of_shape.cc
+++ b/onnxruntime/core/providers/cpu/generator/constant_of_shape.cc
@@ -10,7 +10,13 @@ namespace op_kernel_type_control {
 ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPE_LIST_ALL_OPSETS(
     kCpuExecutionProvider, kOnnxDomain, ConstantOfShape, Output, 0,
     ConstantOfShapeDefaultOutputTypes);
-}
+
+// pytorch converter uses ConstantOfShape with int64 to create Pad input
+// https://github.com/pytorch/pytorch/blob/044b519a80459f6787f6723c1c091a18b153d184/torch/onnx/symbolic_opset11.py#L449
+ORT_SPECIFY_OP_KERNEL_ARG_REQUIRED_TYPES_ALL_OPSETS(
+    kCpuExecutionProvider, kOnnxDomain, ConstantOfShape, Output, 0,
+    int64_t);
+}  // namespace op_kernel_type_control
 
 namespace {
 
diff --git a/onnxruntime/python/_pybind_state.py b/onnxruntime/python/_pybind_state.py
index b2feecc161..2b32641050 100644
--- a/onnxruntime/python/_pybind_state.py
+++ b/onnxruntime/python/_pybind_state.py
@@ -7,50 +7,12 @@ Ensure that dependencies are available and then load the extension module.
 """
 import os
 import platform
-import sys
 
 from . import _ld_preload  # noqa: F401
 
 if platform.system() == "Windows":
     from . import version_info
 
-    if version_info.use_cuda:
-        cuda_version_major, cuda_version_minor = version_info.cuda_version.split(".")
-        if int(cuda_version_major) < 11:
-            # Prior to CUDA 11 both major and minor version at build time/runtime have to match.
-            cuda_env_variable = f"CUDA_PATH_V{cuda_version_major}_{cuda_version_minor}"
-            if cuda_env_variable not in os.environ:
-                raise ImportError(f"CUDA Toolkit {version_info.cuda_version} not installed on the machine.")
-        else:
-            # With CUDA 11 and newer only the major version at build time/runtime has to match.
-            # Use the most recent minor version available.
-            cuda_env_variable = None
-            for i in range(9, -1, -1):
-                if f"CUDA_PATH_V{cuda_version_major}_{i}" in os.environ:
-                    cuda_env_variable = f"CUDA_PATH_V{cuda_version_major}_{i}"
-                    break
-            if not cuda_env_variable:
-                raise ImportError(f"CUDA Toolkit {cuda_version_major}.x not installed on the machine.")
-
-        cuda_bin_dir = os.path.join(os.environ[cuda_env_variable], "bin")
-        if not os.path.isfile(os.path.join(cuda_bin_dir, f"cudnn64_{version_info.cudnn_version}.dll")):
-            raise ImportError(f"cuDNN {version_info.cudnn_version} not installed in {cuda_bin_dir}.")
-
-        if sys.version_info >= (3, 8):
-            # Python 3.8 (and later) doesn't search system PATH when loading DLLs, so the CUDA location needs to be
-            # specified explicitly using the new API introduced in Python 3.8.
-            os.add_dll_directory(cuda_bin_dir)
-            cuda_root = os.path.join(cuda_bin_dir, "..", "..")
-            for root, _, files in os.walk(cuda_root):
-                for f in files:
-                    if f == "cupti.lib":
-                        os.add_dll_directory(root)
-        else:
-            # Python 3.7 (and earlier) searches directories listed in PATH variable.
-            # Make sure that the target CUDA version is at the beginning (important if multiple CUDA versions are
-            # installed on the machine.)
-            os.environ["PATH"] = cuda_bin_dir + os.pathsep + os.environ["PATH"]
-
     if version_info.vs2019 and platform.architecture()[0] == "64bit":
         if not os.path.isfile("C:\\Windows\\System32\\vcruntime140_1.dll"):
             raise ImportError(
diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
index 7b28fed2d1..9d58045143 100644
--- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
+++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
@@ -67,11 +67,11 @@ void CpuToCpuMemCpy(void* dst, const void* src, size_t num_bytes) {
 
 #ifdef USE_CUDA
 void CpuToCudaMemCpy(void* dst, const void* src, size_t num_bytes) {
-  GetProviderInfo_CUDA()->cudaMemcpy_HostToDevice(dst, src, num_bytes);
+  GetProviderInfo_CUDA().cudaMemcpy_HostToDevice(dst, src, num_bytes);
 }
 
 void CudaToCpuMemCpy(void* dst, const void* src, size_t num_bytes) {
-  GetProviderInfo_CUDA()->cudaMemcpy_DeviceToHost(dst, src, num_bytes);
+  GetProviderInfo_CUDA().cudaMemcpy_DeviceToHost(dst, src, num_bytes);
 }
 
 const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* GetCudaToHostMemCpyFunction() {
@@ -82,7 +82,7 @@ const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* GetCudaToHostMemCpy
 }
 
 bool IsCudaDeviceIdValid(const onnxruntime::logging::Logger& logger, int id) {
-  int num_devices = GetProviderInfo_CUDA()->cudaGetDeviceCount();
+  int num_devices = GetProviderInfo_CUDA().cudaGetDeviceCount();
 
   if (0 == num_devices) {
     LOGS(logger, WARNING) << "your system does not have a CUDA capable device.";
@@ -105,7 +105,7 @@ AllocatorPtr GetCudaAllocator(OrtDevice::DeviceId id) {
 
   if (id_to_allocator_map->find(id) == id_to_allocator_map->end()) {
     // TODO: Expose knobs so that users can set fields associated with OrtArenaCfg so that we can pass it to the following method
-    id_to_allocator_map->insert({id, GetProviderInfo_CUDA()->CreateCudaAllocator(id, gpu_mem_limit, arena_extend_strategy, external_allocator_info, nullptr)});
+    id_to_allocator_map->insert({id, GetProviderInfo_CUDA().CreateCudaAllocator(id, gpu_mem_limit, arena_extend_strategy, external_allocator_info, nullptr)});
   }
 
   return (*id_to_allocator_map)[id];
diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc
index e352093593..b8a43318dc 100644
--- a/onnxruntime/python/onnxruntime_pybind_state.cc
+++ b/onnxruntime/python/onnxruntime_pybind_state.cc
@@ -463,24 +463,33 @@ static void RegisterExecutionProviders(InferenceSession* sess, const std::vector
 #endif
     } else if (type == kCudaExecutionProvider) {
 #ifdef USE_CUDA
-      const auto it = provider_options_map.find(type);
-      CUDAExecutionProviderInfo info{};
-      if (it != provider_options_map.end())
-        GetProviderInfo_CUDA()->CUDAExecutionProviderInfo__FromProviderOptions(it->second, info);
-      else {
-        info.device_id = cuda_device_id;
-        info.gpu_mem_limit = gpu_mem_limit;
-        info.arena_extend_strategy = arena_extend_strategy;
-        info.cudnn_conv_algo_search = cudnn_conv_algo_search;
-        info.do_copy_in_default_stream = do_copy_in_default_stream;
-        info.external_allocator_info = external_allocator_info;
-      }
+      if(auto* cuda_provider_info = TryGetProviderInfo_CUDA())
+      {
+        const auto it = provider_options_map.find(type);
+        CUDAExecutionProviderInfo info{};
+        if (it != provider_options_map.end())
+          cuda_provider_info->CUDAExecutionProviderInfo__FromProviderOptions(it->second, info);
+        else {
+          info.device_id = cuda_device_id;
+          info.gpu_mem_limit = gpu_mem_limit;
+          info.arena_extend_strategy = arena_extend_strategy;
+          info.cudnn_conv_algo_search = cudnn_conv_algo_search;
+          info.do_copy_in_default_stream = do_copy_in_default_stream;
+          info.external_allocator_info = external_allocator_info;
+        }
 
-      // This variable is never initialized because the APIs by which is it should be initialized are deprecated, however they still
-      // exist are are in-use. Neverthless, it is used to return CUDAAllocator, hence we must try to initialize it here if we can
-      // since FromProviderOptions might contain external CUDA allocator.
-      external_allocator_info = info.external_allocator_info;
-      RegisterExecutionProvider(sess, *GetProviderInfo_CUDA()->CreateExecutionProviderFactory(info));
+        // This variable is never initialized because the APIs by which it should be initialized are deprecated; however, they still
+        // exist and are in use. Nevertheless, it is used to return CUDAAllocator, hence we must try to initialize it here if we can
+        // since FromProviderOptions might contain an external CUDA allocator.
+        external_allocator_info = info.external_allocator_info;
+        RegisterExecutionProvider(sess, *cuda_provider_info->CreateExecutionProviderFactory(info));
+      }
+      else
+      {
+        if(!Env::Default().GetEnvironmentVar("CUDA_PATH").empty()) {
+          ORT_THROW("CUDA_PATH is set but CUDA wasn't able to be loaded. Please install the correct version of CUDA and cuDNN as mentioned in the GPU requirements page, make sure they're in the PATH, and that your GPU is supported.");
+        }
+      }
 #endif
     } else if (type == kRocmExecutionProvider) {
 #ifdef USE_ROCM
@@ -1608,7 +1617,7 @@ PYBIND11_MODULE(onnxruntime_pybind11_state, m) {
   addOrtValueMethods(m);
   addIoBindingMethods(m);
 
-#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS)
+#if !defined(__APPLE__) && (!defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS))
   Ort::SessionOptions tmp_options;
   if (!InitProvidersSharedLibrary()) {
     const logging::Logger& default_logger = logging::LoggingManager::DefaultLogger();
diff --git a/onnxruntime/python/onnxruntime_pybind_state_common.h b/onnxruntime/python/onnxruntime_pybind_state_common.h
index 8dd547bbb5..ad21f8fff8 100644
--- a/onnxruntime/python/onnxruntime_pybind_state_common.h
+++ b/onnxruntime/python/onnxruntime_pybind_state_common.h
@@ -161,7 +161,8 @@ extern std::string nuphar_settings;
 #if defined(USE_CUDA) || defined(USE_ROCM)
 #ifdef USE_CUDA
 namespace onnxruntime {
-ProviderInfo_CUDA* GetProviderInfo_CUDA();
+ProviderInfo_CUDA* TryGetProviderInfo_CUDA();
+ProviderInfo_CUDA& GetProviderInfo_CUDA();
 namespace python {
 // TODO remove deprecated global config
 extern OrtCudnnConvAlgoSearch cudnn_conv_algo_search;
diff --git a/onnxruntime/python/tools/quantization/onnx_model.py b/onnxruntime/python/tools/quantization/onnx_model.py
index dc41b8efbf..fefacd1fac 100644
--- a/onnxruntime/python/tools/quantization/onnx_model.py
+++ b/onnxruntime/python/tools/quantization/onnx_model.py
@@ -112,7 +112,7 @@ class ONNXModel:
 
     def find_node_by_name(self, node_name, new_nodes_list, graph):
         '''
-        Find out if a node exists in a graph or a node is in the 
+        Find out if a node exists in a graph or a node is in the
         new set of nodes created during quantization. Return the node found.
         '''
         graph_nodes_list = list(graph.node)  #deep copy
@@ -256,6 +256,8 @@ class ONNXModel:
                 return True
         return False
 
+    # TODO: use OnnxModel.graph_topological_sort(self.model.graph) from transformers.onnx_model.
+    # Currently that breaks the OpenVINO/Linux training GPU pipeline, so hold off for the 1.8 release.
     def topological_sort(self):
         deps_count = [0]*len(self.nodes()) # dependency count of each node
         deps_to_nodes = {} # input to node indice
diff --git a/onnxruntime/python/tools/transformers/onnx_model.py b/onnxruntime/python/tools/transformers/onnx_model.py
index 45914afcb0..16358e4d81 100644
--- a/onnxruntime/python/tools/transformers/onnx_model.py
+++ b/onnxruntime/python/tools/transformers/onnx_model.py
@@ -781,7 +781,67 @@ class OnnxModel:
                             return False
         return True
 
+    @staticmethod
+    def graph_topological_sort(graph):
+        """Reorder ``graph.node`` in place so each node follows the producers of its inputs.
+
+        Initializers and graph inputs count as available from the start. Empty input
+        names (omitted optional inputs) are never recorded as dependencies, so an
+        empty output name on another node cannot release a consumer early.
+
+        Args:
+            graph: the graph to reorder (presumably an onnx GraphProto); subgraphs
+                are not visited.
+
+        Raises:
+            AssertionError: if some node never has all of its inputs resolved.
+        """
+        deps_count = [0] * len(graph.node)  # unresolved input count of each node
+        deps_to_nodes = {}  # input name -> indices of the nodes consuming it
+        sorted_nodes = []
+        for node_idx, node in enumerate(graph.node):
+            # CANNOT use len(node.input) directly because input can be optional
+            input_names = [name for name in node.input if name]
+            deps_count[node_idx] = len(input_names)
+            if not input_names:  # Constant doesn't depend on any inputs
+                sorted_nodes.append(graph.node[node_idx])
+                continue
+            for input_name in input_names:
+                deps_to_nodes.setdefault(input_name, []).append(node_idx)
+
+        # Visit each distinct initializer / graph-input name once, in sorted order.
+        available = {init.name for init in graph.initializer}
+        available.update(graph_input.name for graph_input in graph.input)
+        for input_name in sorted(available):
+            for node_idx in deps_to_nodes.get(input_name, ()):
+                deps_count[node_idx] -= 1
+                if deps_count[node_idx] == 0:
+                    sorted_nodes.append(graph.node[node_idx])
+
+        # sorted_nodes doubles as the work queue: it grows while it is scanned.
+        start = 0
+        while start < len(sorted_nodes):
+            for output in sorted_nodes[start].output:
+                for node_idx in deps_to_nodes.get(output, ()):
+                    deps_count[node_idx] -= 1
+                    if deps_count[node_idx] == 0:
+                        sorted_nodes.append(graph.node[node_idx])
+            start += 1
+
+        assert len(sorted_nodes) == len(graph.node), "Graph is not a DAG"
+        graph.ClearField('node')
+        graph.node.extend(sorted_nodes)
+
+    def topological_sort(self):
+        """Topologically sort the nodes of the main graph (self.model.graph) in place."""
+        # TODO: support graph_topological_sort() in subgraphs, e.g.
+        #     for graph in self.graphs(): self.graph_topological_sort(graph)
+        OnnxModel.graph_topological_sort(self.model.graph)
+
     def save_model_to_file(self, output_path, use_external_data_format=False):
+        logger.info(f"Sort graphs in topological order")
+        self.topological_sort()
+
         logger.info(f"Output model to {output_path}")
 
         Path(output_path).parent.mkdir(parents=True, exist_ok=True)
diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc
index 25a0e0a272..c4bdfa4d0a 100644
--- a/onnxruntime/test/framework/inference_session_test.cc
+++ b/onnxruntime/test/framework/inference_session_test.cc
@@ -66,6 +66,11 @@ struct KernelRegistryAndStatus {
 };
 }  // namespace
 namespace onnxruntime {
+
+#ifdef USE_CUDA
+ProviderInfo_CUDA& GetProviderInfo_CUDA();
+#endif
+
 class FuseAdd : public OpKernel {
  public:
   explicit FuseAdd(const OpKernelInfo& info) : OpKernel(info) {
@@ -354,7 +359,8 @@ void RunModelWithBindingMatMul(InferenceSession& session_object,
                                                                   shape,
                                                                   cpu_allocator);
 #ifdef USE_CUDA
-    cudaStream_t stream = static_cast<cudaStream_t>(static_cast<const onnxruntime::CUDAExecutionProvider*>(TestCudaExecutionProvider())->GetComputeStream());
+    cudaStream_t stream = static_cast<cudaStream_t>(gpu_provider->GetComputeStream());
+    st = GetProviderInfo_CUDA().CreateGPUDataTransfer(stream)->CopyTensor(rtensor, *cpu_tensor.get(), 0);
 #elif USE_ROCM
     hipStream_t stream = static_cast<hipStream_t>(static_cast<const onnxruntime::ROCMExecutionProvider*>(TestRocmExecutionProvider())->GetComputeStream());
 #endif
diff --git a/orttraining/orttraining/python/training/ortmodule/__init__.py b/orttraining/orttraining/python/training/ortmodule/__init__.py
index 70793d8a36..807cf60533 100644
--- a/orttraining/orttraining/python/training/ortmodule/__init__.py
+++ b/orttraining/orttraining/python/training/ortmodule/__init__.py
@@ -55,5 +55,25 @@ try:
 except:
     raise(f'PyTorch {MINIMUM_TORCH_VERSION_STR} must be installed in order to run ONNX Runtime ORTModule frontend!')
 
+# Initialize ORT's random seed from torch.initial_seed(), in case the user has
+# already set the PyTorch seed before importing ORTModule. The value is reduced
+# modulo sys.maxsize before it is passed to onnxruntime.set_seed.
+import sys
+from onnxruntime import set_seed
+set_seed((torch.initial_seed() % sys.maxsize))
+
+# Wrap torch.manual_seed and torch.cuda.manual_seed so each call also seeds ORT
+def override_torch_manual_seed(seed):
+    set_seed(seed % sys.maxsize)
+    return torch_manual_seed(seed)
+torch_manual_seed = torch.manual_seed  # keep the original so the wrapper can delegate to it
+torch.manual_seed = override_torch_manual_seed
+
+def override_torch_cuda_manual_seed(seed):
+    set_seed(seed % sys.maxsize)
+    return torch_cuda_manual_seed(seed)
+torch_cuda_manual_seed = torch.cuda.manual_seed  # keep the original so the wrapper can delegate to it
+torch.cuda.manual_seed = override_torch_cuda_manual_seed
+
 # ORTModule must be loaded only after all validation passes
 from .ortmodule import ORTModule
diff --git a/orttraining/orttraining/python/training/ortmodule/_io.py b/orttraining/orttraining/python/training/ortmodule/_io.py
index 8aaa9e908c..8699ae5792 100644
--- a/orttraining/orttraining/python/training/ortmodule/_io.py
+++ b/orttraining/orttraining/python/training/ortmodule/_io.py
@@ -284,7 +284,7 @@ def _extract_schema(data):
     elif isinstance(data, torch.Tensor):
         return _TensorStub(dtype=str(data.dtype), shape_dims=len(data.size()))
 
-    if isinstance(data, abc.Sequence):
+    if isinstance(data, abc.Sequence) and not isinstance(data, str):
         sequence_type = type(data)
         data = list(data)
         for idx in range(len(data)):
diff --git a/orttraining/orttraining/python/training/ortmodule/ortmodule.py b/orttraining/orttraining/python/training/ortmodule/ortmodule.py
index 52882c6052..de8a9acc0f 100644
--- a/orttraining/orttraining/python/training/ortmodule/ortmodule.py
+++ b/orttraining/orttraining/python/training/ortmodule/ortmodule.py
@@ -193,3 +193,18 @@ class ORTModule(torch.nn.Module):
         """Raises a NotImplementedError exception since ORTModule does not support adding modules to it"""
 
         raise NotImplementedError("ORTModule does not support adding modules to it.")
+
+    @property
+    def module(self):
+        """The original `torch.nn.Module` that this module wraps.
+
+        This property provides access to methods and properties on the original module.
+        """
+
+        # HuggingFace Trainer `save_model` method checks to see if the input model is a HuggingFace PreTrainedModel
+        # or if the model has an attribute called `module` which references a HuggingFace PreTrainedModel to save
+        # the entire context of the model so that it can be loaded using HuggingFace `from_pretrained` method.
+        # This `module` property enables HuggingFace Trainer to retrieve the underlying PreTrainedModel inside ORTModule
+        # to save and load a complete checkpoint
+
+        return self._module_metadata.original_module
diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py
index 9a8a387142..2bde15fcec 100644
--- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py
+++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py
@@ -6,7 +6,7 @@ import math
 import random
 import copy
 import torch
-from transformers import AutoConfig, BertForSequenceClassification
+from transformers import AutoConfig, BertForSequenceClassification, Trainer
 from transformers.modeling_outputs import SequenceClassifierOutput
 import pytest
 from time import sleep
@@ -15,6 +15,7 @@ from unittest.mock import patch
 from collections import OrderedDict
 from collections import namedtuple
 from inspect import signature
+import tempfile
 
 from onnxruntime.training.ortmodule import ORTModule, _utils, _io
 import _test_helpers
@@ -2777,3 +2778,46 @@ def test_load_state_dict_for_wrapped_ortmodule():
     for param_name, param_value in state_dict1.items():
         assert param_name in state_dict2
         assert torch.equal(param_value, state_dict2[param_name])
+
+def test_hf_save_pretrained():
+    device = 'cuda'
+
+    model1 = _get_bert_for_sequence_classification_model(device)
+    model1 = ORTModule(model1)
+    state_dict = model1.state_dict()
+    list(next(iter(state_dict.items())))[1] += 100
+    model1.load_state_dict(state_dict)
+
+    trainer = Trainer(model=model1)
+
+    # Assert that ORTModule has an attribute called module. This attribute is used
+    # for trainer.save_model to reference the underlying HuggingFace PreTrainedModel
+    assert hasattr(model1, "module")
+
+    # Create a temporary directory for the checkpoint from save_pretrained
+    with tempfile.TemporaryDirectory() as temporary_dir:
+        trainer.save_model(temporary_dir)
+
+        # Create a new model and compare all state dictionary values for equality
+        # to check if from_pretrained worked.
+        config = AutoConfig.from_pretrained(temporary_dir)
+        model2 = BertForSequenceClassification.from_pretrained(
+            temporary_dir, config=config,
+        ).to(device)
+        model2 = ORTModule(model2)
+
+        for p1, p2 in zip(model1.parameters(), model2.parameters()):
+            assert p1.data.ne(p2.data).sum() == 0
+
+def test_input_with_string_exception():
+    class MyStrNet(torch.nn.Module):
+        def forward(self, x, my_str):
+            if my_str.lower() == 'hello':
+                print('hi')
+            return x
+
+    model = MyStrNet()
+    model = ORTModule(model)
+    with pytest.raises(TypeError) as ex_info:
+        _ = model(torch.randn(1, 2), 'hello')
+    assert "ORTModule does not support the following model data type <class 'str'>" in str(ex_info.value)
diff --git a/server/get_boost.cmake b/server/get_boost.cmake
index 1aff8a6aa9..7943cbdd53 100644
--- a/server/get_boost.cmake
+++ b/server/get_boost.cmake
@@ -70,7 +70,7 @@ macro(DOWNLOAD_BOOST)
   include(ExternalProject)
   ExternalProject_Add(
       Boost
-      URL http://dl.bintray.com/boostorg/release/${BOOST_REQUESTED_VERSION}/source/boost_${BOOST_REQUESTED_VERSION_UNDERSCORE}.tar.bz2
+      URL https://boostorg.jfrog.io/artifactory/main/release/${BOOST_REQUESTED_VERSION}/source/boost_${BOOST_REQUESTED_VERSION_UNDERSCORE}.tar.bz2
       URL_HASH SHA256=${BOOST_SHA1}
       DOWNLOAD_DIR ${BOOST_ROOT_DIR}
       SOURCE_DIR ${BOOST_ROOT_DIR}
diff --git a/tools/ci_build/github/android/mobile_package.required_operators.config b/tools/ci_build/github/android/mobile_package.required_operators.config
index 72b47ec529..255f631ea5 100644
--- a/tools/ci_build/github/android/mobile_package.required_operators.config
+++ b/tools/ci_build/github/android/mobile_package.required_operators.config
@@ -14,8 +14,8 @@ ai.onnx;12;Abs,Add,And,ArgMax,ArgMin,AveragePool,Cast,Ceil,Clip,Concat,ConstantO
 ai.onnx;13;Abs,Add,And,ArgMax,ArgMin,AveragePool,Cast,Ceil,Clip,Concat,ConstantOfShape,Conv,ConvTranspose,Cos,CumSum,DepthToSpace,DequantizeLinear,Div,DynamicQuantizeLinear,Elu,Equal,Exp,Expand,Flatten,Floor,Gather,GatherND,Gemm,Greater,GreaterOrEqual,Identity,If,LRN,LeakyRelu,Less,LessOrEqual,Log,LogSoftmax,Loop,MatMul,Max,MaxPool,Mean,Min,Mul,Neg,NonMaxSuppression,NonZero,Not,Or,PRelu,Pad,Pow,QuantizeLinear,Range,Reciprocal,ReduceMax,ReduceMean,ReduceMin,ReduceProd,ReduceSum,Relu,Reshape,Resize,ReverseSequence,Round,ScatterND,Shape,Sigmoid,Sin,Size,Slice,Softmax,SpaceToDepth,Split,Sqrt,Squeeze,Sub,Sum,Tanh,ThresholdedRelu,Tile,TopK,Transpose,Unique,Unsqueeze,Where
 
 # other ops found in test models 
-ai.onnx;12;GlobalAveragePool,MatMulInteger,QLinearConv,QLinearMatMul
-ai.onnx;13;GlobalAveragePool,MatMulInteger,QLinearConv,QLinearMatMul
+ai.onnx;12;Erf,GlobalAveragePool,InstanceNormalization,MatMulInteger,QLinearConv,QLinearMatMul
+ai.onnx;13;Erf,GlobalAveragePool,InstanceNormalization,MatMulInteger,QLinearConv,QLinearMatMul
 
 # Control flow ops
 #  - If and Loop are covered by the tflite converter list
@@ -24,7 +24,9 @@ ai.onnx;12;Scan
 ai.onnx;13;Scan
 
 # internal ops added by optimizers
-com.microsoft;1;DynamicQuantizeMatMul,FusedConv,FusedGemm,FusedMatMul,MatMulIntegerToFloat,NhwcMaxPool,QLinearAdd,QLinearAveragePool,QLinearConv,QLinearGlobalAveragePool,QLinearMul,QLinearSigmoid
+# Note: LayerNormalization is an internal op even though it is (incorrectly) registered in the ONNX domain.
+ai.onnx;1;LayerNormalization
+com.microsoft;1;DynamicQuantizeMatMul,FusedConv,FusedGemm,FusedMatMul,Gelu,MatMulIntegerToFloat,NhwcMaxPool,QLinearAdd,QLinearAveragePool,QLinearConv,QLinearGlobalAveragePool,QLinearMul,QLinearSigmoid
 # NHWC transformer also uses this, so assuming it's valuable enough to include 
 com.microsoft;1;QLinearLeakyRelu
 
diff --git a/tools/ci_build/github/android/mobile_package.required_operators.readme.txt b/tools/ci_build/github/android/mobile_package.required_operators.readme.txt
index 707eb790a9..141e336e29 100644
--- a/tools/ci_build/github/android/mobile_package.required_operators.readme.txt
+++ b/tools/ci_build/github/android/mobile_package.required_operators.readme.txt
@@ -76,4 +76,5 @@ Other
   - SuperResolution (https://pytorch.org/tutorials/advanced/super_resolution_with_onnxruntime.html)
   - DeepLabV3 (https://pytorch.org/tutorials/beginner/deeplabv3_on_android.html)
   - EfficientNet (https://github.com/lukemelas/EfficientNet-PyTorch)
-  - SSD Mobilenet V1 and V2 (https://github.com/qfgaohao/pytorch-ssd)
\ No newline at end of file
+  - SSD Mobilenet V1 and V2 (https://github.com/qfgaohao/pytorch-ssd)
+  - Wav2Vec 2.0 (adapted from https://github.com/pytorch/ios-demo-app/blob/f2b9aa196821c136d3299b99c5dd592de1fa1776/SpeechRecognition/create_wav2vec2.py)
diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/gpu.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/gpu.yml
index 96d53cf863..967356a230 100644
--- a/tools/ci_build/github/azure-pipelines/nuget/templates/gpu.yml
+++ b/tools/ci_build/github/azure-pipelines/nuget/templates/gpu.yml
@@ -220,6 +220,8 @@ jobs:
              mkdir %%~ni\runtimes\linux-x64
              mkdir %%~ni\runtimes\linux-x64\native
              move linux-x64\linux-x64\libonnxruntime.so %%~ni\runtimes\linux-x64\native\libonnxruntime.so
+             move linux-x64\linux-x64\libonnxruntime_providers_shared.so %%~ni\runtimes\linux-x64\native\libonnxruntime_providers_shared.so
+             move linux-x64\linux-x64\libonnxruntime_providers_cuda.so %%~ni\runtimes\linux-x64\native\libonnxruntime_providers_cuda.so
              pushd %%~ni
              zip -r ..\%%~ni.zip .
              popd
diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml
index 22b1488f5e..b8075f81ab 100644
--- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml
@@ -16,7 +16,7 @@ jobs:
       "
     DoNugetPack: 'false'
     ArtifactName: 'drop-linux'
-    TimeoutInMinutes: 120
+    TimeoutInMinutes: 140
     # Enable unreleased onnx opsets in CI builds
     # This facilitates testing the implementation for the new opsets
     AllowReleasedOpsetOnly: '0'
diff --git a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt
index 747d24d208..f77e41129c 100644
--- a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt
+++ b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt
@@ -1,7 +1,7 @@
 pandas
 sklearn
 numpy==1.19.5
-transformers==v4.3.2
+transformers==v4.4.2
 tensorboard
 h5py
 wget
diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py
index 0948207c5c..35a297ab0f 100644
--- a/tools/nuget/generate_nuspec_for_native_nuget.py
+++ b/tools/nuget/generate_nuspec_for_native_nuget.py
@@ -47,7 +47,15 @@ def generate_owners(list, owners):
     list.append('<owners>' + owners + '</owners>')
 
 
-def generate_description(list, description):
+def generate_description(list, package_name):
+    description = ''
+
+    if package_name == 'Microsoft.AI.MachineLearning':
+        description = 'This package contains Windows ML binaries.'
+    elif 'Microsoft.ML.OnnxRuntime' in package_name:  # This is a Microsoft.ML.OnnxRuntime.* package
+        description = 'This package contains native shared library artifacts ' \
+                      'for all supported platforms of ONNX Runtime.'
+
     list.append('<description>' + description + '</description>')
 
 
@@ -153,8 +161,7 @@ def generate_metadata(list, args):
     generate_version(metadata_list, args.package_version)
     generate_authors(metadata_list, 'Microsoft')
     generate_owners(metadata_list, 'Microsoft')
-    generate_description(metadata_list, 'This package contains native shared library artifacts '
-                                        'for all supported platforms of ONNX Runtime.')
+    generate_description(metadata_list, args.package_name)
     generate_copyright(metadata_list, '\xc2\xa9 ' + 'Microsoft Corporation. All rights reserved.')
     generate_tags(metadata_list, 'ONNX ONNX Runtime Machine Learning')
     generate_icon_url(metadata_list, 'https://go.microsoft.com/fwlink/?linkid=2049168')
diff --git a/tools/python/register_custom_ops_pytorch_exporter.py b/tools/python/register_custom_ops_pytorch_exporter.py
index 7173c66f8b..4cd3938a3d 100644
--- a/tools/python/register_custom_ops_pytorch_exporter.py
+++ b/tools/python/register_custom_ops_pytorch_exporter.py
@@ -4,8 +4,8 @@
 # Register pytorch symbolic for export using ONNX Runtime contrib ops
 
 from torch.onnx import register_custom_op_symbolic
-from torch.onnx.symbolic_helper import parse_args
 import torch.onnx.symbolic_helper as sym_help
+from torch.onnx.symbolic_helper import parse_args, _get_tensor_dim_size, _get_tensor_sizes
 
 _onnx_opset_version = 1
 
@@ -18,16 +18,16 @@ def register_custom_op(is_ortmodule=False):
 
     # Symbolic definition
     def inverse(g, self):
-        return g.op("com.microsoft::Inverse", self)
+        return g.op("com.microsoft::Inverse", self).setType(self.type())
 
     def gelu(g, self):
-        return g.op("com.microsoft::Gelu", self)
+        return g.op("com.microsoft::Gelu", self).setType(self.type())
 
     def triu(g, self, diagonal):
-        return g.op("com.microsoft::Trilu", self, diagonal, upper_i=1)
+        return g.op("com.microsoft::Trilu", self, diagonal, upper_i=1).setType(self.type())
 
     def tril(g, self, diagonal):
-        return g.op("com.microsoft::Trilu", self, diagonal, upper_i=0)
+        return g.op("com.microsoft::Trilu", self, diagonal, upper_i=0).setType(self.type())
 
     # Op Registration
     register_custom_op_symbolic('::inverse', inverse, _onnx_opset_version)
@@ -45,8 +45,13 @@ def register_custom_op(is_ortmodule=False):
                 f'"sparse":{str(sparse).lower()}'
                 '}'
             )
-            return g.op("com.microsoft::ATenOp", weight, indices, name_s='aten::embedding',
-                        custom_attributes_json_s=custom_attributes_json)
+            output = g.op("com.microsoft::ATenOp", weight, indices, name_s='aten::embedding',
+                          custom_attributes_json_s=custom_attributes_json)
+            indices_shape = _get_tensor_sizes(indices)
+            if indices_shape is not None and hasattr(weight.type(), 'with_sizes'):
+                output_type = weight.type().with_sizes(indices_shape + [_get_tensor_dim_size(weight, 1)])
+                output.setType(output_type)
+            return output
 
         register_custom_op_symbolic('::embedding', embedding, _onnx_opset_version)