Cherry picks for release - 1.8.1 Round 2 (#8137)

* fix boost download url (#7843) * Topo sort the model before saving (#7913) * checkin toposort * review comments * revert and add TODO * Add shape inference to custom symbolic functions (#7937) **Description**: As title. **Motivation and Context** - PyTorch ONNX exporter heavily depends on ONNX shape inference to export accurate and efficient model. Custom symbolic function exports the op as contrib ops, thus exporter is unable to perform standard onnx shape inference. Models with dynamic shape inputs are affected. * Fix missing files on linux (#8066) * [Mobile package] Update required operator config with additional ops for wav2vec2. (#8079) Add some additional ops to the mobile package that are needed for the wav2vec2 model. * Add module attribute to ORTModule to support HuggingFace Trainer save_model (#8088) * Fix input schema extractor for ORTModule (#8098) * Fix 32bit Android java API crash (#8122) * Fix 32bit Android java API crash * fix code formatting * [Mobile package] Update required operator config with additional ops for newer version of Wav2Vec 2. (#8123) This is an update to https://github.com/microsoft/onnxruntime/pull/8079 The sample application motivating the original update changed to use an updated version of the model. Now, fewer ops are required. This change removes the previously added ops which are no longer needed. * Add int64 as a required type to ConstantOfShape as it's used by the pytorch converter for Pad. (#8128) It's also used pointlessly for torch.tensor.repeat (although that usage should always be able to be constant folded). 
* Update logic in props.xml to account for shared provider library changes (#8138) * Ortmodule override torch.manual_seed() (#8131) * Ortmodule override torch.manual_seed() * Fix Python Cuda loading issues (#7939) * Fix mac shared_provider warning (#8153) Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com> Co-authored-by: Ye Wang <52801275+wangyems@users.noreply.github.com> Co-authored-by: Bowen Bao <bowbao@microsoft.com> Co-authored-by: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> Co-authored-by: baijumeswani <bmeswani@microsoft.com> Co-authored-by: Thiago Crepaldi <thiago.crepaldi@microsoft.com> Co-authored-by: Scott McKay <skottmckay@gmail.com> Co-authored-by: Hariharan Seshadri <shariharan91@gmail.com> Co-authored-by: Sherlock <baihan.huang@gmail.com>
2026-06-25 02:50:42 +00:00 · 2021-06-26 11:26:29 -07:00 · 2021-06-26 11:26:29 -07:00 · 2e55002e50
commit 2e55002e50
parent b85c7e7a8c
25 changed files with 300 additions and 116 deletions
--- a/cgmanifests/cgmanifest.json
+++ b/cgmanifests/cgmanifest.json
@ -171,7 +171,7 @@
            "Other": {
               "Name": "Boost",
               "Version": "1.69.0",
-               "DownloadUrl": "http://dl.bintray.com/boostorg/release/1.69.0/source/boost_1_69_0.tar.bz2"
+               "DownloadUrl": "https://boostorg.jfrog.io/artifactory/main/release/1.69.0/source/boost_1_69_0.tar.bz2"
            }
         }
      },
@ -462,14 +462,14 @@
      },
      {
         "component": {
-           "type": "git",
-           "git": {
-             "commitHash": "e1e11e0d555c08bec08a6c7773aa777dfcaae9da",
-             "repositoryUrl": "https://github.com/dmlc/dlpack.git"
-           },
-           "comments": "dlpack"
+            "type": "git",
+            "git": {
+               "commitHash": "e1e11e0d555c08bec08a6c7773aa777dfcaae9da",
+               "repositoryUrl": "https://github.com/dmlc/dlpack.git"
+            },
+            "comments": "dlpack"
         }
      }
   ],
   "Version": 1
-}
+}
--- a/csharp/src/Microsoft.ML.OnnxRuntime/props.xml
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/props.xml
@ -45,6 +45,36 @@
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
      <Visible>false</Visible>
    </None>
+    <None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\onnxruntime_providers_shared.dll"
+          Condition="Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\onnxruntime_providers_shared.dll')">
+      <Link>onnxruntime_providers_shared.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+      <Visible>false</Visible>
+    </None>
+    <None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\onnxruntime_providers_cuda.dll"
+          Condition="Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\onnxruntime_providers_cuda.dll')">
+      <Link>onnxruntime_providers_cuda.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+      <Visible>false</Visible>
+    </None>
+    <None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\onnxruntime_providers_dnnl.dll"
+          Condition="Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\onnxruntime_providers_dnnl.dll')">
+      <Link>onnxruntime_providers_dnnl.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+      <Visible>false</Visible>
+    </None>
+    <None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\onnxruntime_providers_tensorrt.dll"
+          Condition="Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\onnxruntime_providers_tensorrt.dll')">
+      <Link>onnxruntime_providers_tensorrt.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+      <Visible>false</Visible>
+    </None>
+    <None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\onnxruntime_providers_openvino.dll"
+          Condition="Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\onnxruntime_providers_openvino.dll')">
+      <Link>onnxruntime_providers_openvino.dll</Link>
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+      <Visible>false</Visible>
+    </None>
    <None Include="$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\dnnl.dll"
          Condition="('$(PlatformTarget)' == 'x64' OR ('$(PlatformTarget)' == 'AnyCPU' AND '$(Prefer32Bit)' != 'true')) AND
                     Exists('$(MSBuildThisFileDirectory)..\..\runtimes\win-x64\native\dnnl.dll')">
--- a/java/src/main/native/ai_onnxruntime_OrtSession.c
+++ b/java/src/main/native/ai_onnxruntime_OrtSession.c
@ -260,15 +260,19 @@ JNIEXPORT jobjectArray JNICALL Java_ai_onnxruntime_OrtSession_run
    jobject* javaOutputStrings;
    checkOrtStatus(jniEnv, api, api->AllocatorAlloc(allocator,sizeof(jobject)*numOutputs,(void**)&javaOutputStrings));

-    // Extract the names of the input values.
+    // Extract a C array of longs which are pointers to the input tensors.
+    // Need to convert longs to OrtValue* in case we run on non-64bit systems
+    jlong* inputTensors = (*jniEnv)->GetLongArrayElements(jniEnv,tensorArr,NULL);
+    const OrtValue** inputValues;
+    checkOrtStatus(jniEnv, api, api->AllocatorAlloc(allocator,sizeof(OrtValue*)*numInputs,(void**)&inputValues));
+
+    // Extract the names and native pointers of the input values.
    for (int i = 0; i < numInputs; i++) {
        javaInputStrings[i] = (*jniEnv)->GetObjectArrayElement(jniEnv,inputNamesArr,i);
        inputNames[i] = (*jniEnv)->GetStringUTFChars(jniEnv,javaInputStrings[i],NULL);
+        inputValues[i] = (OrtValue*)inputTensors[i];
    }

-    // Extract a C array of longs which are pointers to the input tensors.
-    jlong* inputTensors = (*jniEnv)->GetLongArrayElements(jniEnv,tensorArr,NULL);
-
    // Extract the names of the output values, and allocate their output array.
    OrtValue** outputValues;
    checkOrtStatus(jniEnv,api,api->AllocatorAlloc(allocator,sizeof(OrtValue*)*numOutputs,(void**)&outputValues));
@ -281,7 +285,7 @@ JNIEXPORT jobjectArray JNICALL Java_ai_onnxruntime_OrtSession_run
    // Actually score the inputs.
    //printf("inputTensors = %p, first tensor = %p, numInputs = %ld, outputValues = %p, numOutputs = %ld\n",inputTensors,(OrtValue*)inputTensors[0],numInputs,outputValues,numOutputs);
    //ORT_API_STATUS(OrtRun, _Inout_ OrtSession* sess, _In_ OrtRunOptions* run_options, _In_ const char* const* input_names, _In_ const OrtValue* const* input, size_t input_len, _In_ const char* const* output_names, size_t output_names_len, _Out_ OrtValue** output);
-    checkOrtStatus(jniEnv,api,api->Run(session, runOptions, (const char* const*) inputNames, (const OrtValue* const*) inputTensors, numInputs, (const char* const*) outputNames, numOutputs, outputValues));
+    checkOrtStatus(jniEnv,api,api->Run(session, runOptions, (const char* const*) inputNames, (const OrtValue* const*) inputValues, numInputs, (const char* const*) outputNames, numOutputs, outputValues));
    // Release the C array of pointers to the tensors.
    (*jniEnv)->ReleaseLongArrayElements(jniEnv,tensorArr,inputTensors,JNI_ABORT);

@ -307,6 +311,7 @@ JNIEXPORT jobjectArray JNICALL Java_ai_onnxruntime_OrtSession_run

    // Release the buffers
    checkOrtStatus(jniEnv, api, api->AllocatorFree(allocator, (void*)inputNames));
+    checkOrtStatus(jniEnv, api, api->AllocatorFree(allocator, (void*)inputValues));
    checkOrtStatus(jniEnv, api, api->AllocatorFree(allocator, (void*)outputNames));
    checkOrtStatus(jniEnv, api, api->AllocatorFree(allocator, javaInputStrings));
    checkOrtStatus(jniEnv, api, api->AllocatorFree(allocator, javaOutputStrings));
--- a/onnxruntime/core/framework/provider_bridge_ort.cc
+++ b/onnxruntime/core/framework/provider_bridge_ort.cc
@ -109,9 +109,8 @@ using IndexedSubGraph_MetaDef = IndexedSubGraph::MetaDef;

 namespace onnxruntime {

-//ProviderHost* g_host{};
-
-ProviderInfo_CUDA* GetProviderInfo_CUDA();
+ProviderInfo_CUDA* TryGetProviderInfo_CUDA();
+ProviderInfo_CUDA& GetProviderInfo_CUDA();

 struct TensorShapeProto_Dimension_Iterator_Impl : TensorShapeProto_Dimension_Iterator {
  TensorShapeProto_Dimension_Iterator_Impl(google::protobuf::internal::RepeatedPtrIterator<const onnx::TensorShapeProto_Dimension>&& v) : v_{std::move(v)} {}
@ -187,15 +186,15 @@ struct ProviderHostImpl : ProviderHost {
  void CPUAllocator__Free(CPUAllocator* p, void* allocation) override { return p->CPUAllocator::Free(allocation); }

 #ifdef USE_CUDA
-  std::unique_ptr<IAllocator> CreateCUDAAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_CUDA()->CreateCUDAAllocator(device_id, name); }
-  std::unique_ptr<IAllocator> CreateCUDAPinnedAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_CUDA()->CreateCUDAPinnedAllocator(device_id, name); }
-  std::unique_ptr<IDataTransfer> CreateGPUDataTransfer(void* stream) override { return GetProviderInfo_CUDA()->CreateGPUDataTransfer(stream); }
+  std::unique_ptr<IAllocator> CreateCUDAAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_CUDA().CreateCUDAAllocator(device_id, name); }
+  std::unique_ptr<IAllocator> CreateCUDAPinnedAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_CUDA().CreateCUDAPinnedAllocator(device_id, name); }
+  std::unique_ptr<IDataTransfer> CreateGPUDataTransfer(void* stream) override { return GetProviderInfo_CUDA().CreateGPUDataTransfer(stream); }

-  void cuda__Impl_Cast(void* stream, const int64_t* input_data, int32_t* output_data, size_t count) override { return GetProviderInfo_CUDA()->cuda__Impl_Cast(stream, input_data, output_data, count); }
-  void cuda__Impl_Cast(void* stream, const int32_t* input_data, int64_t* output_data, size_t count) override { return GetProviderInfo_CUDA()->cuda__Impl_Cast(stream, input_data, output_data, count); }
+  void cuda__Impl_Cast(void* stream, const int64_t* input_data, int32_t* output_data, size_t count) override { return GetProviderInfo_CUDA().cuda__Impl_Cast(stream, input_data, output_data, count); }
+  void cuda__Impl_Cast(void* stream, const int32_t* input_data, int64_t* output_data, size_t count) override { return GetProviderInfo_CUDA().cuda__Impl_Cast(stream, input_data, output_data, count); }

-  bool CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return GetProviderInfo_CUDA()->CudaCall_false(retCode, exprString, libName, successCode, msg); }
-  bool CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return GetProviderInfo_CUDA()->CudaCall_true(retCode, exprString, libName, successCode, msg); }
+  bool CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return GetProviderInfo_CUDA().CudaCall_false(retCode, exprString, libName, successCode, msg); }
+  bool CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return GetProviderInfo_CUDA().CudaCall_true(retCode, exprString, libName, successCode, msg); }
 #endif

  std::string GetEnvironmentVar(const std::string& var_name) override { return Env::Default().GetEnvironmentVar(var_name); }
@ -1003,7 +1002,7 @@ void UnloadSharedProviders() {

 // Used by test code
 std::unique_ptr<IAllocator> CreateCUDAPinnedAllocator(int16_t device_id, const char* name) {
-  if (auto* info = onnxruntime::GetProviderInfo_CUDA())
+  if (auto* info = onnxruntime::TryGetProviderInfo_CUDA())
    return info->CreateCUDAPinnedAllocator(device_id, name);

  return nullptr;
@ -1050,10 +1049,17 @@ ProviderInfo_OpenVINO* GetProviderInfo_OpenVINO() {
  return nullptr;
 }

-ProviderInfo_CUDA* GetProviderInfo_CUDA() {
+ProviderInfo_CUDA* TryGetProviderInfo_CUDA() {
  if (auto* provider = s_library_cuda.Get())
    return reinterpret_cast<ProviderInfo_CUDA*>(provider->GetInfo());
-  LOGS_DEFAULT(WARNING) << "GetProviderInfo_CUDA called, returning nullptr";
+
+  return nullptr;
+}
+
+ProviderInfo_CUDA& GetProviderInfo_CUDA() {
+  if(auto* info = TryGetProviderInfo_CUDA())
+    return *info;
+
  ORT_THROW("CUDA Provider not available, can't get interface for it");
 }

@ -1063,13 +1069,13 @@ void CopyGpuToCpu(
    const size_t size,
    const OrtMemoryInfo& dst_location,
    const OrtMemoryInfo& src_location) {
-  if (auto* info = onnxruntime::GetProviderInfo_CUDA())
+  if (auto* info = onnxruntime::TryGetProviderInfo_CUDA())
    return info->CopyGpuToCpu(dst_ptr, src_ptr, size, dst_location, src_location);
  ORT_THROW("GPU-to-CPU copy is not implemented.");
 }

 void cudaMemcpy_HostToDevice(void* dst, const void* src, size_t count) {
-  if (auto* info = onnxruntime::GetProviderInfo_CUDA())
+  if (auto* info = onnxruntime::TryGetProviderInfo_CUDA())
    return info->cudaMemcpy_HostToDevice(dst, src, count);
  ORT_THROW("cudaMemcpy_HostToDevice is not implemented.");
 }
@ -1077,7 +1083,7 @@ void cudaMemcpy_HostToDevice(void* dst, const void* src, size_t count) {
 #if defined(USE_CUDA) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P)
 namespace cuda {
 INcclService& INcclService::GetInstance() {
-  return GetProviderInfo_CUDA()->GetINcclService();
+  return GetProviderInfo_CUDA().GetINcclService();
 }
 }  // namespace cuda
 #endif
@ -1147,17 +1153,17 @@ ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_CUDA, _In_ OrtSessi

 ORT_API_STATUS_IMPL(OrtApis::SetCurrentGpuDeviceId, _In_ int device_id) {
  API_IMPL_BEGIN
-  if (auto* info = onnxruntime::GetProviderInfo_CUDA())
+  if (auto* info = onnxruntime::TryGetProviderInfo_CUDA())
    return info->SetCurrentGpuDeviceId(device_id);
-  return CreateStatus(ORT_FAIL, "CUDA execution provider is not enabled.");
+  return CreateStatus(ORT_FAIL, "CUDA execution provider is either not enabled or not available.");
  API_IMPL_END
 }

 ORT_API_STATUS_IMPL(OrtApis::GetCurrentGpuDeviceId, _In_ int* device_id) {
  API_IMPL_BEGIN
-  if (auto* info = onnxruntime::GetProviderInfo_CUDA())
+  if (auto* info = onnxruntime::TryGetProviderInfo_CUDA())
    return info->GetCurrentGpuDeviceId(device_id);
-  return CreateStatus(ORT_FAIL, "CUDA execution provider is not enabled.");
+  return CreateStatus(ORT_FAIL, "CUDA execution provider is either not enabled or not available.");
  API_IMPL_END
 }

--- a/onnxruntime/core/platform/windows/env.cc
+++ b/onnxruntime/core/platform/windows/env.cc
@ -40,6 +40,7 @@ EXTERN_C IMAGE_DOS_HEADER __ImageBase;
 namespace onnxruntime {

 namespace {
+
 class WindowsThread : public EnvThread {
 private:
  struct Param {
--- a/onnxruntime/core/providers/cpu/generator/constant_of_shape.cc
+++ b/onnxruntime/core/providers/cpu/generator/constant_of_shape.cc
@ -10,7 +10,13 @@ namespace op_kernel_type_control {
 ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPE_LIST_ALL_OPSETS(
    kCpuExecutionProvider, kOnnxDomain, ConstantOfShape, Output, 0,
    ConstantOfShapeDefaultOutputTypes);
-}
+
+// pytorch converter uses ConstantOfShape with int64 to create Pad input
+// https://github.com/pytorch/pytorch/blob/044b519a80459f6787f6723c1c091a18b153d184/torch/onnx/symbolic_opset11.py#L449
+ORT_SPECIFY_OP_KERNEL_ARG_REQUIRED_TYPES_ALL_OPSETS(
+    kCpuExecutionProvider, kOnnxDomain, ConstantOfShape, Output, 0,
+    int64_t);
+}  // namespace op_kernel_type_control

 namespace {

--- a/onnxruntime/python/_pybind_state.py
+++ b/onnxruntime/python/_pybind_state.py
@ -7,50 +7,12 @@ Ensure that dependencies are available and then load the extension module.
 """
 import os
 import platform
-import sys

 from . import _ld_preload  # noqa: F401

 if platform.system() == "Windows":
    from . import version_info

-    if version_info.use_cuda:
-        cuda_version_major, cuda_version_minor = version_info.cuda_version.split(".")
-        if int(cuda_version_major) < 11:
-            # Prior to CUDA 11 both major and minor version at build time/runtime have to match.
-            cuda_env_variable = f"CUDA_PATH_V{cuda_version_major}_{cuda_version_minor}"
-            if cuda_env_variable not in os.environ:
-                raise ImportError(f"CUDA Toolkit {version_info.cuda_version} not installed on the machine.")
-        else:
-            # With CUDA 11 and newer only the major version at build time/runtime has to match.
-            # Use the most recent minor version available.
-            cuda_env_variable = None
-            for i in range(9, -1, -1):
-                if f"CUDA_PATH_V{cuda_version_major}_{i}" in os.environ:
-                    cuda_env_variable = f"CUDA_PATH_V{cuda_version_major}_{i}"
-                    break
-            if not cuda_env_variable:
-                raise ImportError(f"CUDA Toolkit {cuda_version_major}.x not installed on the machine.")
-
-        cuda_bin_dir = os.path.join(os.environ[cuda_env_variable], "bin")
-        if not os.path.isfile(os.path.join(cuda_bin_dir, f"cudnn64_{version_info.cudnn_version}.dll")):
-            raise ImportError(f"cuDNN {version_info.cudnn_version} not installed in {cuda_bin_dir}.")
-
-        if sys.version_info >= (3, 8):
-            # Python 3.8 (and later) doesn't search system PATH when loading DLLs, so the CUDA location needs to be
-            # specified explicitly using the new API introduced in Python 3.8.
-            os.add_dll_directory(cuda_bin_dir)
-            cuda_root = os.path.join(cuda_bin_dir, "..", "..")
-            for root, _, files in os.walk(cuda_root):
-                for f in files:
-                    if f == "cupti.lib":
-                        os.add_dll_directory(root)
-        else:
-            # Python 3.7 (and earlier) searches directories listed in PATH variable.
-            # Make sure that the target CUDA version is at the beginning (important if multiple CUDA versions are
-            # installed on the machine.)
-            os.environ["PATH"] = cuda_bin_dir + os.pathsep + os.environ["PATH"]
-
    if version_info.vs2019 and platform.architecture()[0] == "64bit":
        if not os.path.isfile("C:\\Windows\\System32\\vcruntime140_1.dll"):
            raise ImportError(
--- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
+++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
@ -67,11 +67,11 @@ void CpuToCpuMemCpy(void* dst, const void* src, size_t num_bytes) {

 #ifdef USE_CUDA
 void CpuToCudaMemCpy(void* dst, const void* src, size_t num_bytes) {
-  GetProviderInfo_CUDA()->cudaMemcpy_HostToDevice(dst, src, num_bytes);
+  GetProviderInfo_CUDA().cudaMemcpy_HostToDevice(dst, src, num_bytes);
 }

 void CudaToCpuMemCpy(void* dst, const void* src, size_t num_bytes) {
-  GetProviderInfo_CUDA()->cudaMemcpy_DeviceToHost(dst, src, num_bytes);
+  GetProviderInfo_CUDA().cudaMemcpy_DeviceToHost(dst, src, num_bytes);
 }

 const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* GetCudaToHostMemCpyFunction() {
@ -82,7 +82,7 @@ const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* GetCudaToHostMemCpy
 }

 bool IsCudaDeviceIdValid(const onnxruntime::logging::Logger& logger, int id) {
-  int num_devices = GetProviderInfo_CUDA()->cudaGetDeviceCount();
+  int num_devices = GetProviderInfo_CUDA().cudaGetDeviceCount();

  if (0 == num_devices) {
    LOGS(logger, WARNING) << "your system does not have a CUDA capable device.";
@ -105,7 +105,7 @@ AllocatorPtr GetCudaAllocator(OrtDevice::DeviceId id) {

  if (id_to_allocator_map->find(id) == id_to_allocator_map->end()) {
    // TODO: Expose knobs so that users can set fields associated with OrtArenaCfg so that we can pass it to the following method
-    id_to_allocator_map->insert({id, GetProviderInfo_CUDA()->CreateCudaAllocator(id, gpu_mem_limit, arena_extend_strategy, external_allocator_info, nullptr)});
+    id_to_allocator_map->insert({id, GetProviderInfo_CUDA().CreateCudaAllocator(id, gpu_mem_limit, arena_extend_strategy, external_allocator_info, nullptr)});
  }

  return (*id_to_allocator_map)[id];
--- a/onnxruntime/python/onnxruntime_pybind_state.cc
+++ b/onnxruntime/python/onnxruntime_pybind_state.cc
@ -463,24 +463,33 @@ static void RegisterExecutionProviders(InferenceSession* sess, const std::vector
 #endif
    } else if (type == kCudaExecutionProvider) {
 #ifdef USE_CUDA
-      const auto it = provider_options_map.find(type);
-      CUDAExecutionProviderInfo info{};
-      if (it != provider_options_map.end())
-        GetProviderInfo_CUDA()->CUDAExecutionProviderInfo__FromProviderOptions(it->second, info);
-      else {
-        info.device_id = cuda_device_id;
-        info.gpu_mem_limit = gpu_mem_limit;
-        info.arena_extend_strategy = arena_extend_strategy;
-        info.cudnn_conv_algo_search = cudnn_conv_algo_search;
-        info.do_copy_in_default_stream = do_copy_in_default_stream;
-        info.external_allocator_info = external_allocator_info;
-      }
+      if(auto* cuda_provider_info = TryGetProviderInfo_CUDA())
+      {
+        const auto it = provider_options_map.find(type);
+        CUDAExecutionProviderInfo info{};
+        if (it != provider_options_map.end())
+          cuda_provider_info->CUDAExecutionProviderInfo__FromProviderOptions(it->second, info);
+        else {
+          info.device_id = cuda_device_id;
+          info.gpu_mem_limit = gpu_mem_limit;
+          info.arena_extend_strategy = arena_extend_strategy;
+          info.cudnn_conv_algo_search = cudnn_conv_algo_search;
+          info.do_copy_in_default_stream = do_copy_in_default_stream;
+          info.external_allocator_info = external_allocator_info;
+        }

-      // This variable is never initialized because the APIs by which is it should be initialized are deprecated, however they still
-      // exist are are in-use. Neverthless, it is used to return CUDAAllocator, hence we must try to initialize it here if we can
-      // since FromProviderOptions might contain external CUDA allocator.
-      external_allocator_info = info.external_allocator_info;
-      RegisterExecutionProvider(sess, *GetProviderInfo_CUDA()->CreateExecutionProviderFactory(info));
+        // This variable is never initialized because the APIs by which it should be initialized are deprecated; however, they still
+        // exist and are in use. Nevertheless, it is used to return CUDAAllocator, hence we must try to initialize it here if we can,
+        // since FromProviderOptions might contain an external CUDA allocator.
+        external_allocator_info = info.external_allocator_info;
+        RegisterExecutionProvider(sess, *cuda_provider_info->CreateExecutionProviderFactory(info));
+      }
+      else
+      {
+        if(!Env::Default().GetEnvironmentVar("CUDA_PATH").empty()) {
+          ORT_THROW("CUDA_PATH is set but CUDA wasn't able to be loaded. Please install the correct version of CUDA and cuDNN as mentioned in the GPU requirements page, make sure they're in the PATH, and that your GPU is supported.");
+        }
+      }
 #endif
    } else if (type == kRocmExecutionProvider) {
 #ifdef USE_ROCM
@ -1608,7 +1617,7 @@ PYBIND11_MODULE(onnxruntime_pybind11_state, m) {
  addOrtValueMethods(m);
  addIoBindingMethods(m);

-#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS)
+#if !defined(__APPLE__) && (!defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS))
  Ort::SessionOptions tmp_options;
  if (!InitProvidersSharedLibrary()) {
    const logging::Logger& default_logger = logging::LoggingManager::DefaultLogger();
--- a/onnxruntime/python/onnxruntime_pybind_state_common.h
+++ b/onnxruntime/python/onnxruntime_pybind_state_common.h
@ -161,7 +161,8 @@ extern std::string nuphar_settings;
 #if defined(USE_CUDA) || defined(USE_ROCM)
 #ifdef USE_CUDA
 namespace onnxruntime {
-ProviderInfo_CUDA* GetProviderInfo_CUDA();
+ProviderInfo_CUDA* TryGetProviderInfo_CUDA();
+ProviderInfo_CUDA& GetProviderInfo_CUDA();
 namespace python {
 // TODO remove deprecated global config
 extern OrtCudnnConvAlgoSearch cudnn_conv_algo_search;
--- a/onnxruntime/python/tools/quantization/onnx_model.py
+++ b/onnxruntime/python/tools/quantization/onnx_model.py
@ -112,7 +112,7 @@ class ONNXModel:

    def find_node_by_name(self, node_name, new_nodes_list, graph):
        '''
-        Find out if a node exists in a graph or a node is in the 
+        Find out if a node exists in a graph or a node is in the
        new set of nodes created during quantization. Return the node found.
        '''
        graph_nodes_list = list(graph.node)  #deep copy
@ -256,6 +256,8 @@ class ONNXModel:
                return True
        return False

+    # TODO: use OnnxModel.graph_topological_sort(self.model.graph) from transformers.onnx_model instead.
+    # Currently that breaks the OpenVINO / Linux training GPU pipelines, so hold off for the 1.8 release.
    def topological_sort(self):
        deps_count = [0]*len(self.nodes()) # dependency count of each node
        deps_to_nodes = {} # input to node indice
--- a/onnxruntime/python/tools/transformers/onnx_model.py
+++ b/onnxruntime/python/tools/transformers/onnx_model.py
@ -781,7 +781,67 @@ class OnnxModel:
                            return False
        return True

+    @staticmethod
+    def graph_topological_sort(graph):
+        deps_count = [0]*len(graph.node) # dependency count of each node
+        deps_to_nodes = {} # input to node indice
+        sorted_nodes = []  # initialize sorted_nodes
+        for node_idx, node in enumerate(graph.node):
+            # CANNOT use len(node.input) directly because input can be optional
+            deps_count[node_idx] = sum(1 for _ in node.input if _ )
+            if deps_count[node_idx] == 0: # Constant doesn't depend on any inputs
+                sorted_nodes.append(graph.node[node_idx])
+                continue
+
+            for input_name in node.input:
+                if input_name not in deps_to_nodes:
+                    deps_to_nodes[input_name] = [node_idx]
+                else:
+                    deps_to_nodes[input_name].append(node_idx)
+
+        initializer_names = [init.name for init in graph.initializer]
+        graph_input_names = [input.name for input in graph.input]
+        input_names = initializer_names + graph_input_names
+        input_names.sort()
+        prev_input_name = None
+        for input_name in input_names:
+            if prev_input_name == input_name:
+                continue
+
+            prev_input_name = input_name
+            if input_name in deps_to_nodes:
+                for node_idx in deps_to_nodes[input_name]:
+                    deps_count[node_idx] = deps_count[node_idx] - 1
+                    if deps_count[node_idx] == 0:
+                        sorted_nodes.append(graph.node[node_idx])
+
+        start = 0
+        end = len(sorted_nodes)
+
+        while start < end:
+            for output in sorted_nodes[start].output:
+                if output in deps_to_nodes:
+                    for node_idx in deps_to_nodes[output]:
+                        deps_count[node_idx] = deps_count[node_idx] - 1
+                        if deps_count[node_idx] == 0:
+                            sorted_nodes.append(graph.node[node_idx])
+                            end = end + 1
+            start = start + 1
+
+        assert(end == len(graph.node)), "Graph is not a DAG"
+        graph.ClearField('node')
+        graph.node.extend(sorted_nodes)
+
+    def topological_sort(self):
+        #TODO: support graph_topological_sort() in subgraphs
+        #for graph in self.graphs():
+        #    self.graph_topological_sort(graph)
+        OnnxModel.graph_topological_sort(self.model.graph)
+
    def save_model_to_file(self, output_path, use_external_data_format=False):
+        logger.info(f"Sort graphs in topological order")
+        self.topological_sort()
+
        logger.info(f"Output model to {output_path}")

        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
--- a/onnxruntime/test/framework/inference_session_test.cc
+++ b/onnxruntime/test/framework/inference_session_test.cc
@ -66,6 +66,11 @@ struct KernelRegistryAndStatus {
 };
 }  // namespace
 namespace onnxruntime {
+
+#ifdef USE_CUDA
+ProviderInfo_CUDA& GetProviderInfo_CUDA();
+#endif
+
 class FuseAdd : public OpKernel {
 public:
  explicit FuseAdd(const OpKernelInfo& info) : OpKernel(info) {
@ -354,7 +359,8 @@ void RunModelWithBindingMatMul(InferenceSession& session_object,
                                                                  shape,
                                                                  cpu_allocator);
 #ifdef USE_CUDA
-    cudaStream_t stream = static_cast<cudaStream_t>(static_cast<const onnxruntime::CUDAExecutionProvider*>(TestCudaExecutionProvider())->GetComputeStream());
+    cudaStream_t stream = static_cast<cudaStream_t>(gpu_provider->GetComputeStream());
+    st = GetProviderInfo_CUDA().CreateGPUDataTransfer(stream)->CopyTensor(rtensor, *cpu_tensor.get(), 0);
 #elif USE_ROCM
    hipStream_t stream = static_cast<hipStream_t>(static_cast<const onnxruntime::ROCMExecutionProvider*>(TestRocmExecutionProvider())->GetComputeStream());
 #endif
--- a/orttraining/orttraining/python/training/ortmodule/init.py
+++ b/orttraining/orttraining/python/training/ortmodule/init.py
@ -55,5 +55,25 @@ try:
 except:
    raise(f'PyTorch {MINIMUM_TORCH_VERSION_STR} must be installed in order to run ONNX Runtime ORTModule frontend!')

+# Initialize ORT's random seed from torch.initial_seed() rather than a fixed
+# default, so that a seed the user set in PyTorch before importing ORTModule
+# is carried over to ORT as well.
+import sys
+from onnxruntime import set_seed
+set_seed((torch.initial_seed() % sys.maxsize))
+
+# Override torch.manual_seed and torch.cuda.manual_seed
+def override_torch_manual_seed(seed):
+    set_seed(seed % sys.maxsize)
+    return torch_manual_seed(seed)
+torch_manual_seed = torch.manual_seed
+torch.manual_seed = override_torch_manual_seed
+
+def override_torch_cuda_manual_seed(seed):
+    set_seed(seed % sys.maxsize)
+    return torch_cuda_manual_seed(seed)
+torch_cuda_manual_seed = torch.cuda.manual_seed
+torch.cuda.manual_seed = override_torch_cuda_manual_seed
+
 # ORTModule must be loaded only after all validation passes
 from .ortmodule import ORTModule
--- a/orttraining/orttraining/python/training/ortmodule/_io.py
+++ b/orttraining/orttraining/python/training/ortmodule/_io.py
@@ -284,7 +284,7 @@ def _extract_schema(data):
    elif isinstance(data, torch.Tensor):
        return _TensorStub(dtype=str(data.dtype), shape_dims=len(data.size()))

-    if isinstance(data, abc.Sequence):
+    if isinstance(data, abc.Sequence) and not isinstance(data, str):
        sequence_type = type(data)
        data = list(data)
        for idx in range(len(data)):
--- a/orttraining/orttraining/python/training/ortmodule/ortmodule.py
+++ b/orttraining/orttraining/python/training/ortmodule/ortmodule.py
@@ -193,3 +193,18 @@ class ORTModule(torch.nn.Module):
        """Raises a NotImplementedError exception since ORTModule does not support adding modules to it"""

        raise NotImplementedError("ORTModule does not support adding modules to it.")
+
+    @property
+    def module(self):
+        """The original `torch.nn.Module` that this module wraps.
+
+        This property provides access to methods and properties on the original module.
+        """
+
+        # HuggingFace Trainer `save_model` method checks to see if the input model is a HuggingFace PreTrainedModel
+        # or if the model has an attribute called `module` which references a HuggingFace PreTrainedModel to save
+        # the entire context of the model so that it can be loaded using HuggingFace `from_pretrained` method.
+        # This `module` property enables HuggingFace Trainer to retrieve the underlying PreTrainedModel inside ORTModule
+        # to save and load a complete checkpoint
+
+        return self._module_metadata.original_module
--- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py
+++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py
@@ -6,7 +6,7 @@ import math
 import random
 import copy
 import torch
-from transformers import AutoConfig, BertForSequenceClassification
+from transformers import AutoConfig, BertForSequenceClassification, Trainer
 from transformers.modeling_outputs import SequenceClassifierOutput
 import pytest
 from time import sleep
@@ -15,6 +15,7 @@ from unittest.mock import patch
 from collections import OrderedDict
 from collections import namedtuple
 from inspect import signature
+import tempfile

 from onnxruntime.training.ortmodule import ORTModule, _utils, _io
 import _test_helpers
@@ -2777,3 +2778,46 @@ def test_load_state_dict_for_wrapped_ortmodule():
    for param_name, param_value in state_dict1.items():
        assert param_name in state_dict2
        assert torch.equal(param_value, state_dict2[param_name])
+
+def test_hf_save_pretrained():
+    device = 'cuda'
+
+    model1 = _get_bert_for_sequence_classification_model(device)
+    model1 = ORTModule(model1)
+    state_dict = model1.state_dict()
+    list(next(iter(state_dict.items())))[1] += 100
+    model1.load_state_dict(state_dict)
+
+    trainer = Trainer(model=model1)
+
+    # Assert that ORTModule has an attribute called module. This attribute is used
+    # for trainer.save_model to reference the underlying HuggingFace PreTrainedModel
+    assert hasattr(model1, "module")
+
+    # Create a temporary directory for the checkpoint from save_pretrained
+    with tempfile.TemporaryDirectory() as temporary_dir:
+        trainer.save_model(temporary_dir)
+
+        # Create a new model and compare all state dictionary values for equality
+        # to check if from_pretrained worked.
+        config = AutoConfig.from_pretrained(temporary_dir)
+        model2 = BertForSequenceClassification.from_pretrained(
+            temporary_dir, config=config,
+        ).to(device)
+        model2 = ORTModule(model2)
+
+        for p1, p2 in zip(model1.parameters(), model2.parameters()):
+            assert p1.data.ne(p2.data).sum() == 0
+
+def test_input_with_string_exception():
+    class MyStrNet(torch.nn.Module):
+        def forward(self, x, my_str):
+            if my_str.lower() == 'hello':
+                print('hi')
+            return x
+
+    model = MyStrNet()
+    model = ORTModule(model)
+    with pytest.raises(TypeError) as ex_info:
+        _ = model(torch.randn(1, 2), 'hello')
+    assert "ORTModule does not support the following model data type <class 'str'>" in str(ex_info.value)
--- a/server/get_boost.cmake
+++ b/server/get_boost.cmake
@@ -70,7 +70,7 @@ macro(DOWNLOAD_BOOST)
  include(ExternalProject)
  ExternalProject_Add(
      Boost
-      URL http://dl.bintray.com/boostorg/release/${BOOST_REQUESTED_VERSION}/source/boost_${BOOST_REQUESTED_VERSION_UNDERSCORE}.tar.bz2
+      URL https://boostorg.jfrog.io/artifactory/main/release/${BOOST_REQUESTED_VERSION}/source/boost_${BOOST_REQUESTED_VERSION_UNDERSCORE}.tar.bz2
      URL_HASH SHA256=${BOOST_SHA1}
      DOWNLOAD_DIR ${BOOST_ROOT_DIR}
      SOURCE_DIR ${BOOST_ROOT_DIR}
--- a/tools/ci_build/github/android/mobile_package.required_operators.config
+++ b/tools/ci_build/github/android/mobile_package.required_operators.config
@@ -14,8 +14,8 @@ ai.onnx;12;Abs,Add,And,ArgMax,ArgMin,AveragePool,Cast,Ceil,Clip,Concat,ConstantO
 ai.onnx;13;Abs,Add,And,ArgMax,ArgMin,AveragePool,Cast,Ceil,Clip,Concat,ConstantOfShape,Conv,ConvTranspose,Cos,CumSum,DepthToSpace,DequantizeLinear,Div,DynamicQuantizeLinear,Elu,Equal,Exp,Expand,Flatten,Floor,Gather,GatherND,Gemm,Greater,GreaterOrEqual,Identity,If,LRN,LeakyRelu,Less,LessOrEqual,Log,LogSoftmax,Loop,MatMul,Max,MaxPool,Mean,Min,Mul,Neg,NonMaxSuppression,NonZero,Not,Or,PRelu,Pad,Pow,QuantizeLinear,Range,Reciprocal,ReduceMax,ReduceMean,ReduceMin,ReduceProd,ReduceSum,Relu,Reshape,Resize,ReverseSequence,Round,ScatterND,Shape,Sigmoid,Sin,Size,Slice,Softmax,SpaceToDepth,Split,Sqrt,Squeeze,Sub,Sum,Tanh,ThresholdedRelu,Tile,TopK,Transpose,Unique,Unsqueeze,Where

 # other ops found in test models 
-ai.onnx;12;GlobalAveragePool,MatMulInteger,QLinearConv,QLinearMatMul
-ai.onnx;13;GlobalAveragePool,MatMulInteger,QLinearConv,QLinearMatMul
+ai.onnx;12;Erf,GlobalAveragePool,InstanceNormalization,MatMulInteger,QLinearConv,QLinearMatMul
+ai.onnx;13;Erf,GlobalAveragePool,InstanceNormalization,MatMulInteger,QLinearConv,QLinearMatMul

 # Control flow ops
 #  - If and Loop are covered by the tflite converter list
@@ -24,7 +24,9 @@ ai.onnx;12;Scan
 ai.onnx;13;Scan

 # internal ops added by optimizers
-com.microsoft;1;DynamicQuantizeMatMul,FusedConv,FusedGemm,FusedMatMul,MatMulIntegerToFloat,NhwcMaxPool,QLinearAdd,QLinearAveragePool,QLinearConv,QLinearGlobalAveragePool,QLinearMul,QLinearSigmoid
+# Note: LayerNormalization is an internal op even though it is (incorrectly) registered in the ONNX domain.
+ai.onnx;1;LayerNormalization
+com.microsoft;1;DynamicQuantizeMatMul,FusedConv,FusedGemm,FusedMatMul,Gelu,MatMulIntegerToFloat,NhwcMaxPool,QLinearAdd,QLinearAveragePool,QLinearConv,QLinearGlobalAveragePool,QLinearMul,QLinearSigmoid
 # NHWC transformer also uses this, so assuming it's valuable enough to include 
 com.microsoft;1;QLinearLeakyRelu

--- a/tools/ci_build/github/android/mobile_package.required_operators.readme.txt
+++ b/tools/ci_build/github/android/mobile_package.required_operators.readme.txt
@@ -76,4 +76,5 @@ Other
  - SuperResolution (https://pytorch.org/tutorials/advanced/super_resolution_with_onnxruntime.html)
  - DeepLabV3 (https://pytorch.org/tutorials/beginner/deeplabv3_on_android.html)
  - EfficientNet (https://github.com/lukemelas/EfficientNet-PyTorch)
-  - SSD Mobilenet V1 and V2 (https://github.com/qfgaohao/pytorch-ssd)
+  - SSD Mobilenet V1 and V2 (https://github.com/qfgaohao/pytorch-ssd)
+  - Wav2Vec 2.0 (adapted from https://github.com/pytorch/ios-demo-app/blob/f2b9aa196821c136d3299b99c5dd592de1fa1776/SpeechRecognition/create_wav2vec2.py)
--- a/tools/ci_build/github/azure-pipelines/nuget/templates/gpu.yml
+++ b/tools/ci_build/github/azure-pipelines/nuget/templates/gpu.yml
@@ -220,6 +220,8 @@ jobs:
             mkdir %%~ni\runtimes\linux-x64
             mkdir %%~ni\runtimes\linux-x64\native
             move linux-x64\linux-x64\libonnxruntime.so %%~ni\runtimes\linux-x64\native\libonnxruntime.so
+             move linux-x64\linux-x64\libonnxruntime_providers_shared.so %%~ni\runtimes\linux-x64\native\libonnxruntime_providers_shared.so
+             move linux-x64\linux-x64\libonnxruntime_providers_cuda.so %%~ni\runtimes\linux-x64\native\libonnxruntime_providers_cuda.so
             pushd %%~ni
             zip -r ..\%%~ni.zip .
             popd
--- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml
@@ -16,7 +16,7 @@ jobs:
      "
    DoNugetPack: 'false'
    ArtifactName: 'drop-linux'
-    TimeoutInMinutes: 120
+    TimeoutInMinutes: 140
    # Enable unreleased onnx opsets in CI builds
    # This facilitates testing the implementation for the new opsets
    AllowReleasedOpsetOnly: '0'
--- a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt
+++ b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt
@@ -1,7 +1,7 @@
 pandas
 sklearn
 numpy==1.19.5
-transformers==v4.3.2
+transformers==v4.4.2
 tensorboard
 h5py
 wget
--- a/tools/nuget/generate_nuspec_for_native_nuget.py
+++ b/tools/nuget/generate_nuspec_for_native_nuget.py
@@ -47,7 +47,15 @@ def generate_owners(list, owners):
    list.append('<owners>' + owners + '</owners>')


-def generate_description(list, description):
+def generate_description(list, package_name):
+    description = ''
+
+    if package_name == 'Microsoft.AI.MachineLearning':
+        description = 'This package contains Windows ML binaries.'
+    elif 'Microsoft.ML.OnnxRuntime' in package_name:  # This is a Microsoft.ML.OnnxRuntime.* package
+        description = 'This package contains native shared library artifacts ' \
+                      'for all supported platforms of ONNX Runtime.'
+
    list.append('<description>' + description + '</description>')


@@ -153,8 +161,7 @@ def generate_metadata(list, args):
    generate_version(metadata_list, args.package_version)
    generate_authors(metadata_list, 'Microsoft')
    generate_owners(metadata_list, 'Microsoft')
-    generate_description(metadata_list, 'This package contains native shared library artifacts '
-                                        'for all supported platforms of ONNX Runtime.')
+    generate_description(metadata_list, args.package_name)
    generate_copyright(metadata_list, '\xc2\xa9 ' + 'Microsoft Corporation. All rights reserved.')
    generate_tags(metadata_list, 'ONNX ONNX Runtime Machine Learning')
    generate_icon_url(metadata_list, 'https://go.microsoft.com/fwlink/?linkid=2049168')
--- a/tools/python/register_custom_ops_pytorch_exporter.py
+++ b/tools/python/register_custom_ops_pytorch_exporter.py
@@ -4,8 +4,8 @@
 # Register pytorch symbolic for export using ONNX Runtime contrib ops

 from torch.onnx import register_custom_op_symbolic
-from torch.onnx.symbolic_helper import parse_args
 import torch.onnx.symbolic_helper as sym_help
+from torch.onnx.symbolic_helper import parse_args, _get_tensor_dim_size, _get_tensor_sizes

 _onnx_opset_version = 1

@@ -18,16 +18,16 @@ def register_custom_op(is_ortmodule=False):

    # Symbolic definition
    def inverse(g, self):
-        return g.op("com.microsoft::Inverse", self)
+        return g.op("com.microsoft::Inverse", self).setType(self.type())

    def gelu(g, self):
-        return g.op("com.microsoft::Gelu", self)
+        return g.op("com.microsoft::Gelu", self).setType(self.type())

    def triu(g, self, diagonal):
-        return g.op("com.microsoft::Trilu", self, diagonal, upper_i=1)
+        return g.op("com.microsoft::Trilu", self, diagonal, upper_i=1).setType(self.type())

    def tril(g, self, diagonal):
-        return g.op("com.microsoft::Trilu", self, diagonal, upper_i=0)
+        return g.op("com.microsoft::Trilu", self, diagonal, upper_i=0).setType(self.type())

    # Op Registration
    register_custom_op_symbolic('::inverse', inverse, _onnx_opset_version)
@@ -45,8 +45,13 @@ def register_custom_op(is_ortmodule=False):
                f'"sparse":{str(sparse).lower()}'
                '}'
            )
-            return g.op("com.microsoft::ATenOp", weight, indices, name_s='aten::embedding',
-                        custom_attributes_json_s=custom_attributes_json)
+            output = g.op("com.microsoft::ATenOp", weight, indices, name_s='aten::embedding',
+                          custom_attributes_json_s=custom_attributes_json)
+            indices_shape = _get_tensor_sizes(indices)
+            if indices_shape is not None and hasattr(weight.type(), 'with_sizes'):
+                output_type = weight.type().with_sizes(indices_shape + [_get_tensor_dim_size(weight, 1)])
+                output.setType(output_type)
+            return output

        register_custom_op_symbolic('::embedding', embedding, _onnx_opset_version)