ATen Fallback for Inference (#11597)

* aten op for inference

* fix build error

* move some code to training only

* remove domain from operator name

* move aten_op_executor ext out from ortmodule

* add pipeline

* add exec mode

* fix script

* fix ut script

* fix test pipeline

* failure test

* rollback

* bugfix

* resolve comments

* enable aten for python build only

* fix win build

* use target_compile_definitions

* support io binding

* turn off aten by default

* fix ut

Co-authored-by: Vincent Wang <weicwang@microsoft.com>
Co-authored-by: zhijxu <zhijxu@microsoft.com>
This commit is contained in:
Vincent Wang 2022-06-09 16:07:30 +08:00 committed by GitHub
parent 927bac0f86
commit 5ecfaef042
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
44 changed files with 755 additions and 267 deletions

View file

@ -184,6 +184,9 @@ option(onnxruntime_ENABLE_CUDA_PROFILING "Enable CUDA kernel profiling" OFF)
option(onnxruntime_ENABLE_CPUINFO "Enable cpuinfo" ON)
# ATen fallback support
option(onnxruntime_ENABLE_ATEN "Enable ATen fallback" OFF)
if (onnxruntime_USE_CUDA)
set(onnxruntime_DISABLE_RTTI OFF)
endif()
@ -906,7 +909,7 @@ if (onnxruntime_ENABLE_CPUINFO)
else()
# if xnnpack is enabled in a wasm build it needs clog from cpuinfo, but we won't internally use cpuinfo
# so we don't set CPUINFO_SUPPORTED in the CXX flags below.
if (onnxruntime_BUILD_WEBASSEMBLY AND NOT onnxruntime_USE_XNNPACK)
if (onnxruntime_BUILD_WEBASSEMBLY AND NOT onnxruntime_USE_XNNPACK)
set(CPUINFO_SUPPORTED FALSE)
else()
set(CPUINFO_SUPPORTED TRUE)
@ -937,8 +940,8 @@ if (CPUINFO_SUPPORTED)
set(IOS ON CACHE INTERNAL "")
set(IOS_ARCH "${CMAKE_OSX_ARCHITECTURES}" CACHE INTERNAL "")
endif()
# if this is a wasm build with xnnpack (only type of wasm build where cpuinfo is involved)
# if this is a wasm build with xnnpack (only type of wasm build where cpuinfo is involved)
# we do not use cpuinfo in ORT code, so don't define CPUINFO_SUPPORTED.
if (NOT onnxruntime_BUILD_WEBASSEMBLY)
string(APPEND CMAKE_CXX_FLAGS " -DCPUINFO_SUPPORTED")
@ -985,8 +988,8 @@ include(eigen)
set(onnxruntime_EXTERNAL_LIBRARIES onnx onnx_proto ${PROTOBUF_LIB} re2::re2)
if(NOT onnxruntime_DISABLE_ABSEIL)
set(ABSEIL_LIBS absl::inlined_vector absl::flat_hash_set
absl::flat_hash_map absl::node_hash_set absl::node_hash_map absl::base absl::throw_delegate absl::raw_hash_set
set(ABSEIL_LIBS absl::inlined_vector absl::flat_hash_set
absl::flat_hash_map absl::node_hash_set absl::node_hash_map absl::base absl::throw_delegate absl::raw_hash_set
absl::hash absl::city absl::low_level_hash absl::raw_logging_internal)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${ABSEIL_LIBS})
else()
@ -1301,7 +1304,7 @@ function(onnxruntime_configure_target target_name)
set_target_properties(${target_name} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELWITHDEBINFO TRUE)
set_target_properties(${target_name} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_MINSIZEREL TRUE)
endif()
# Keep BinSkim happy
if(MSVC AND NOT onnxruntime_target_platform MATCHES "ARM")
target_link_options(${target_name} PRIVATE "/CETCOMPAT")
@ -1542,7 +1545,7 @@ if (onnxruntime_USE_XNNPACK)
set(XNNPACK_DIR external/XNNPACK)
set(XNNPACK_INCLUDE_DIR ${XNNPACK_DIR}/include)
set(XNNPACK_USE_SYSTEM_LIBS ON CACHE INTERNAL "")
set(XNNPACK_USE_SYSTEM_LIBS ON CACHE INTERNAL "")
set(XNNPACK_BUILD_TESTS OFF CACHE INTERNAL "")
set(XNNPACK_BUILD_BENCHMARKS OFF CACHE INTERNAL "")
set(FP16_BUILD_TESTS OFF CACHE INTERNAL "")
@ -1550,14 +1553,14 @@ if (onnxruntime_USE_XNNPACK)
set(CLOG_SOURCE_DIR "${PYTORCH_CPUINFO_DIR}/deps/clog")
set(CPUINFO_SOURCE_DIR ${PYTORCH_CPUINFO_DIR})
add_subdirectory(external/FP16)
add_subdirectory(external/FP16)
add_subdirectory(external/pthreadpool)
add_subdirectory(external/XNNPACK)
set_target_properties(fp16 PROPERTIES FOLDER "External/Xnnpack")
set_target_properties(pthreadpool PROPERTIES FOLDER "External/Xnnpack")
set_target_properties(XNNPACK PROPERTIES FOLDER "External/Xnnpack")
set(onnxruntime_EXTERNAL_LIBRARIES_XNNPACK XNNPACK pthreadpool)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${onnxruntime_EXTERNAL_LIBRARIES_XNNPACK})
@ -1570,19 +1573,19 @@ if (onnxruntime_USE_XNNPACK)
message("Adding WebAssembly Source Files to XNNPACK")
set(wasm_src_patterns "${XNNPACK_DIR}/src/wasm-*.c"
"${XNNPACK_DIR}/src/*-wasm-*.c"
"${XNNPACK_DIR}/src/*-wasm.c")
"${XNNPACK_DIR}/src/*-wasm.c")
set(wasm32_asm_src_patterns "${XNNPACK_DIR}/src/wasm_shr_*.S")
file(GLOB_RECURSE XNNPACK_WASM_MICROKERNEL_SRCS CONFIGURE_DEPENDS ${wasm_src_patterns})
file(GLOB_RECURSE XNNPACK_WASM32_ASM_MICROKERNEL_SRCS CONFIGURE_DEPENDS ${wasm32_asm_src_patterns})
message(DEBUG "XNNPACK_WASM_MICROKERNEL_SRCS:${XNNPACK_WASM_MICROKERNEL_SRCS}")
message(DEBUG "XNNPACK_WASM32_ASM_MICROKERNEL_SRCS:${XNNPACK_WASM32_ASM_MICROKERNEL_SRCS}")
target_sources(XNNPACK PRIVATE ${XNNPACK_WASM_MICROKERNEL_SRCS}
${XNNPACK_WASM32_ASM_MICROKERNEL_SRCS})
if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD)
if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD)
target_compile_options(XNNPACK PRIVATE "-msimd128")
set(wasmsimd_src_patterns "${XNNPACK_DIR}/src/wasmsimd-*.c"
@ -1591,7 +1594,7 @@ if (onnxruntime_USE_XNNPACK)
file(GLOB_RECURSE XNNPACK_WASMSIMD_MICROKERNEL_SRCS CONFIGURE_DEPENDS ${wasmsimd_src_patterns})
message(DEBUG "XNNPACK_WASMSIMD_MICROKERNEL_SRCS:${XNNPACK_WASMSIMD_MICROKERNEL_SRCS}")
target_sources(XNNPACK PRIVATE ${XNNPACK_WASMSIMD_MICROKERNEL_SRCS})
endif()
endif()
@ -2033,10 +2036,14 @@ if (onnxruntime_ENABLE_TRAINING)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES tensorboard)
endif()
if (onnxruntime_ENABLE_TRAINING)
set(onnxruntime_ENABLE_ATEN ON)
endif()
set(ONNXRUNTIME_TARGETS onnxruntime_common onnxruntime_graph onnxruntime_framework onnxruntime_util onnxruntime_providers onnxruntime_optimizer onnxruntime_session onnxruntime_mlas onnxruntime_flatbuffers)
if (onnxruntime_USE_NUPHAR_TVM)
list(APPEND ONNXRUNTIME_TARGETS onnxruntime_codegen_tvm)
list(APPEND ONNXRUNTIME_TARGETS onnxruntime_codegen_tvm)
endif()
if (onnxruntime_ENABLE_EAGER_MODE)
if (NOT onnxruntime_ENABLE_TRAINING OR NOT onnxruntime_ENABLE_PYTHON)
@ -2205,4 +2212,3 @@ if (onnxruntime_ENABLE_EXTERNAL_CUSTOM_OP_SCHEMAS)
COMMENT "Installing protobuf"
)
endif()

View file

@ -57,8 +57,9 @@ if (onnxruntime_ENABLE_TRAINING OR onnxruntime_ENABLE_TRAINING_OPS)
target_include_directories(onnxruntime_framework PUBLIC ${MPI_CXX_INCLUDE_DIRS})
endif()
endif()
if (onnxruntime_ENABLE_TRAINING)
if (onnxruntime_ENABLE_ATEN)
# DLPack is a header-only dependency
target_compile_definitions(onnxruntime_framework PRIVATE ENABLE_ATEN)
set(DLPACK_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/external/dlpack/include)
target_include_directories(onnxruntime_framework PRIVATE ${DLPACK_INCLUDE_DIR})
endif()

View file

@ -126,11 +126,15 @@ if (WIN32)
set_target_properties(onnxruntime_graph PROPERTIES
STATIC_LIBRARY_FLAGS "${onnxruntime_graph_static_library_flags}")
if (NOT onnxruntime_DISABLE_EXCEPTIONS)
if (NOT onnxruntime_DISABLE_EXCEPTIONS)
target_compile_options(onnxruntime_graph PRIVATE
/EHsc # exception handling - C++ may throw, extern "C" will not
)
endif()
endif()
endif()
if (onnxruntime_ENABLE_ATEN)
target_compile_definitions(onnxruntime_graph PRIVATE ENABLE_ATEN)
endif()
if (NOT onnxruntime_BUILD_SHARED_LIB)

View file

@ -165,6 +165,13 @@ set(onnxruntime_providers_src ${onnxruntime_providers_common_srcs} ${onnxruntime
# disable contrib ops conditionally
if(NOT onnxruntime_DISABLE_CONTRIB_OPS)
if (NOT onnxruntime_ENABLE_ATEN)
list(REMOVE_ITEM onnxruntime_cpu_contrib_ops_srcs
"${ONNXRUNTIME_ROOT}/contrib_ops/cpu/aten_ops/aten_op.h"
"${ONNXRUNTIME_ROOT}/contrib_ops/cpu/aten_ops/aten_op.cc"
"${ONNXRUNTIME_ROOT}/contrib_ops/cpu/aten_ops/aten_op_executor.cc"
)
endif()
# add using ONNXRUNTIME_ROOT so they show up under the 'contrib_ops' folder in Visual Studio
source_group(TREE ${ONNXRUNTIME_ROOT} FILES ${onnxruntime_cpu_contrib_ops_srcs})
list(APPEND onnxruntime_providers_src ${onnxruntime_cpu_contrib_ops_srcs})
@ -195,13 +202,21 @@ if (onnxruntime_ENABLE_TRAINING_OPS)
"${ORTTRAINING_SOURCE_DIR}/training_ops/cpu/gist/*.h"
"${ORTTRAINING_SOURCE_DIR}/training_ops/cpu/tensorboard/*.cc"
"${ORTTRAINING_SOURCE_DIR}/training_ops/cpu/tensorboard/*.h"
"${ORTTRAINING_SOURCE_DIR}/training_ops/cpu/aten_ops/*.cc"
"${ORTTRAINING_SOURCE_DIR}/training_ops/cpu/aten_ops/*.h"
)
list(REMOVE_ITEM onnxruntime_providers_src ${onnxruntime_cpu_full_training_only_srcs})
endif()
if (onnxruntime_ENABLE_ATEN)
file(GLOB_RECURSE onnxruntime_providers_dlpack_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/core/dlpack/dlpack_converter.cc"
"${ONNXRUNTIME_ROOT}/core/dlpack/dlpack_converter.h"
)
set(onnxruntime_providers_dlpack_srcs ${onnxruntime_providers_dlpack_srcs})
source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_dlpack_srcs})
list(APPEND onnxruntime_providers_src ${onnxruntime_providers_dlpack_srcs})
endif()
if (onnxruntime_ENABLE_TRAINING)
file(GLOB_RECURSE onnxruntime_cpu_training_ops_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/training_ops/cpu/*.h"
@ -222,15 +237,6 @@ if (onnxruntime_ENABLE_TRAINING)
source_group(TREE ${ORTTRAINING_ROOT}/ FILES ${onnxruntime_cpu_training_ops_srcs})
list(APPEND onnxruntime_providers_src ${onnxruntime_cpu_training_ops_srcs})
# todo: put this in core/framework and enabled only for training
file(GLOB_RECURSE onnxruntime_providers_dlpack_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/core/dlpack/dlpack_converter.cc"
"${ONNXRUNTIME_ROOT}/core/dlpack/dlpack_converter.h"
)
set(onnxruntime_providers_dlpack_srcs ${onnxruntime_providers_dlpack_srcs})
source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_dlpack_srcs})
list(APPEND onnxruntime_providers_src ${onnxruntime_providers_dlpack_srcs})
endif()
if (onnxruntime_REDUCED_OPS_BUILD)
@ -277,6 +283,13 @@ if (onnxruntime_ENABLE_TRAINING OR onnxruntime_ENABLE_TRAINING_OPS)
target_include_directories(onnxruntime_providers PRIVATE ${ORTTRAINING_ROOT})
endif()
if (onnxruntime_ENABLE_ATEN)
target_compile_definitions(onnxruntime_providers PRIVATE ENABLE_ATEN)
# DLPack is a header-only dependency
set(DLPACK_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/external/dlpack/include)
target_include_directories(onnxruntime_providers PRIVATE ${DLPACK_INCLUDE_DIR})
endif()
if (onnxruntime_ENABLE_TRAINING)
add_dependencies(onnxruntime_providers tensorboard)
onnxruntime_add_include_to_target(onnxruntime_providers tensorboard)
@ -287,10 +300,6 @@ if (onnxruntime_ENABLE_TRAINING)
if (onnxruntime_USE_NCCL OR onnxruntime_USE_MPI)
target_include_directories(onnxruntime_providers PUBLIC ${MPI_CXX_INCLUDE_DIRS})
endif()
# DLPack is a header-only dependency
set(DLPACK_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/external/dlpack/include)
target_include_directories(onnxruntime_providers PRIVATE ${DLPACK_INCLUDE_DIR})
endif()
install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/cpu DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers)
@ -342,7 +351,7 @@ if (onnxruntime_USE_CUDA)
"${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_pch.h"
"${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_pch.cc"
)
# The shared_library files are in a separate list since they use precompiled headers, and the above files have them disabled.
file(GLOB_RECURSE onnxruntime_providers_cuda_shared_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h"
@ -358,6 +367,11 @@ if (onnxruntime_USE_CUDA)
# disable contrib ops conditionally
if(NOT onnxruntime_DISABLE_CONTRIB_OPS)
if (NOT onnxruntime_ENABLE_ATEN)
list(REMOVE_ITEM onnxruntime_cuda_contrib_ops_cc_srcs
"${ONNXRUNTIME_ROOT}/contrib_ops/cuda/aten_ops/aten_op.cc"
)
endif()
# add using ONNXRUNTIME_ROOT so they show up under the 'contrib_ops' folder in Visual Studio
source_group(TREE ${ONNXRUNTIME_ROOT} FILES ${onnxruntime_cuda_contrib_ops_cc_srcs} ${onnxruntime_cuda_contrib_ops_cu_srcs})
list(APPEND onnxruntime_providers_cuda_src ${onnxruntime_cuda_contrib_ops_cc_srcs} ${onnxruntime_cuda_contrib_ops_cu_srcs})
@ -481,10 +495,10 @@ if (onnxruntime_USE_CUDA)
"${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_pch.cc"
)
# minimize the Windows includes.
# minimize the Windows includes.
# this avoids an issue with CUDA 11.6 where 'small' is defined in the windows and cuda headers.
target_compile_definitions(onnxruntime_providers_cuda PRIVATE "WIN32_LEAN_AND_MEAN")
# disable a warning from the CUDA headers about unreferenced local functions
#target_compile_options(onnxruntime_providers_cuda PRIVATE /wd4505)
if (onnxruntime_USE_NUPHAR_TVM)
@ -509,6 +523,10 @@ if (onnxruntime_USE_CUDA)
message(FATAL_ERROR "onnxruntime_providers_cuda unknown platform, need to specify shared library exports for it")
endif()
if (onnxruntime_ENABLE_ATEN)
target_compile_definitions(onnxruntime_providers_cuda PRIVATE ENABLE_ATEN)
endif()
install(TARGETS onnxruntime_providers_cuda
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
@ -758,12 +776,12 @@ if (onnxruntime_USE_OPENVINO)
# Header paths
find_package(InferenceEngine REQUIRED)
find_package(ngraph REQUIRED)
if (OPENVINO_2022_1)
find_package(OpenVINO REQUIRED COMPONENTS Runtime ONNX)
list (OV_20_LIBS openvino::frontend::onnx openvino::runtime)
endif()
if (WIN32)
unset(CMAKE_MAP_IMPORTED_CONFIG_RELWITHDEBINFO)
endif()
@ -1297,6 +1315,11 @@ if (onnxruntime_USE_ROCM)
# disable contrib ops conditionally
if(NOT onnxruntime_DISABLE_CONTRIB_OPS)
if (NOT onnxruntime_ENABLE_ATEN)
list(REMOVE_ITEM onnxruntime_rocm_contrib_ops_cc_srcs
"${ONNXRUNTIME_ROOT}/contrib_ops/rocm/aten_ops/aten_op.cc"
)
endif()
# add using ONNXRUNTIME_ROOT so they show up under the 'contrib_ops' folder in Visual Studio
source_group(TREE ${ONNXRUNTIME_ROOT} FILES ${onnxruntime_rocm_contrib_ops_cc_srcs} ${onnxruntime_rocm_contrib_ops_cu_srcs})
list(APPEND onnxruntime_providers_rocm_src ${onnxruntime_rocm_contrib_ops_cc_srcs} ${onnxruntime_rocm_contrib_ops_cu_srcs})
@ -1404,6 +1427,10 @@ if (onnxruntime_USE_ROCM)
message(FATAL_ERROR "onnxruntime_providers_rocm unknown platform, need to specify shared library exports for it")
endif()
if (onnxruntime_ENABLE_ATEN)
target_compile_definitions(onnxruntime_providers_rocm PRIVATE ENABLE_ATEN)
endif()
install(TARGETS onnxruntime_providers_rocm
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
@ -1499,4 +1526,3 @@ if (NOT onnxruntime_BUILD_SHARED_LIB)
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()

View file

@ -51,7 +51,7 @@ if(MSVC)
target_compile_options(onnxruntime_pybind11_state PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /utf-8>" "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/utf-8>")
if(onnxruntime_ENABLE_TRAINING)
target_compile_options(onnxruntime_pybind11_state PRIVATE "/bigobj")
endif()
endif()
endif()
if(HAS_CAST_FUNCTION_TYPE)
target_compile_options(onnxruntime_pybind11_state PRIVATE "-Wno-cast-function-type")
@ -98,9 +98,13 @@ else()
set(ONNXRUNTIME_SO_LINK_FLAG "-DEF:${ONNXRUNTIME_ROOT}/python/pybind.def")
endif()
if (onnxruntime_ENABLE_ATEN)
target_compile_definitions(onnxruntime_pybind11_state PRIVATE ENABLE_ATEN)
target_include_directories(onnxruntime_pybind11_state PRIVATE ${PROJECT_SOURCE_DIR}/external/dlpack/include)
endif()
if (onnxruntime_ENABLE_TRAINING)
target_include_directories(onnxruntime_pybind11_state PRIVATE ${ORTTRAINING_ROOT})
target_include_directories(onnxruntime_pybind11_state PRIVATE ${PROJECT_SOURCE_DIR}/external/dlpack/include)
target_link_libraries(onnxruntime_pybind11_state PRIVATE onnxruntime_training)
endif()
@ -128,7 +132,7 @@ if (onnxruntime_ENABLE_EAGER_MODE)
set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_util.cpp" PROPERTIES COMPILE_FLAGS -Wno-unused-parameter)
set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/python/orttraining_python_module.cc" PROPERTIES COMPILE_FLAGS -Wno-unused-parameter)
endif()
if (MSVC)
if (MSVC)
target_compile_options(onnxruntime_pybind11_state PRIVATE "/wd4100" "/wd4324" "/wd4458" "/wd4127" "/wd4193" "/wd4624" "/wd4702")
target_compile_options(onnxruntime_pybind11_state PRIVATE "/bigobj" "/wd4275" "/wd4244" "/wd4267" "/wd4067")
endif()
@ -306,7 +310,7 @@ if (onnxruntime_ENABLE_TRAINING)
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/torch_cpp_extensions/*.py"
)
file(GLOB onnxruntime_python_ortmodule_torch_cpp_ext_aten_op_executor_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/torch_cpp_extensions/cpu/aten_op_executor/*"
"${ONNXRUNTIME_ROOT}/python/torch_cpp_extensions/aten_op_executor/*"
)
file(GLOB onnxruntime_python_ortmodule_torch_cpp_ext_torch_interop_utils_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/*"

View file

@ -466,8 +466,8 @@ endif()
if(onnxruntime_USE_NUPHAR)
# the test case under nuphar_tvm is only to verify some basic tvm show case, which is already out of date
# it doesn't have relationship to nuphar directly. consider we have an official tvm execution provider now,
# keep those test cases doesn't bring any value now.
# keep those test cases doesn't bring any value now.
list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/framework/nuphar/*)
list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_nuphar)
list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_nuphar)
@ -699,11 +699,11 @@ endif()
set(test_all_args)
if (onnxruntime_USE_TENSORRT)
# TRT EP CI takes much longer time when updating to TRT 8.2
# So, we only run trt ep and exclude other eps to reduce CI test time.
# So, we only run trt ep and exclude other eps to reduce CI test time.
#
# The test names of model tests were using sequential number in the past.
# This PR https://github.com/microsoft/onnxruntime/pull/10220 (Please see ExpandModelName function in model_tests.cc for more details)
# made test name contain the "ep" and "model path" information, so we can easily filter the tests using cuda ep or other ep with *cpu__* or *xxx__*.
# This PR https://github.com/microsoft/onnxruntime/pull/10220 (Please see ExpandModelName function in model_tests.cc for more details)
# made test name contain the "ep" and "model path" information, so we can easily filter the tests using cuda ep or other ep with *cpu__* or *xxx__*.
list(APPEND test_all_args "--gtest_filter=-*cpu__*:*cuda__*" )
endif ()
@ -714,7 +714,7 @@ AddTest(
onnx_test_runner_common ${onnxruntime_test_providers_libs} ${onnxruntime_test_common_libs}
onnx_test_data_proto nlohmann_json::nlohmann_json
DEPENDS ${all_dependencies}
TEST_ARGS ${test_all_args}
TEST_ARGS ${test_all_args}
)
if (MSVC)
# The warning means the type of two integral values around a binary operator is narrow than their result.
@ -755,6 +755,10 @@ if (onnxruntime_BUILD_WEBASSEMBLY)
endif()
endif()
if (onnxruntime_ENABLE_ATEN)
target_compile_definitions(onnxruntime_test_all PRIVATE ENABLE_ATEN)
endif()
set(test_data_target onnxruntime_test_all)
onnxruntime_add_static_library(onnx_test_data_proto ${TEST_SRC_DIR}/proto/tml.proto)

View file

@ -212,7 +212,7 @@ class OpKernelContext {
const OrtValue* GetImplicitInputMLValue(int index) const;
OrtValue* GetOutputMLValue(int index);
#ifdef ENABLE_TRAINING
#ifdef ENABLE_ATEN
Status SetOutputMLValue(int index, const OrtValue& ort_value);
#endif

View file

@ -0,0 +1,76 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "contrib_ops/cpu/aten_ops/aten_op.h"
#include "core/dlpack/dlpack_converter.h"
#include "core/framework/op_kernel_context_internal.h"
#include "contrib_ops/cpu/aten_ops/aten_op_executor.h"
namespace onnxruntime {
namespace contrib {
ONNX_OPERATOR_KERNEL_EX(ATen, kPytorchAtenDomain, 1, kCpuExecutionProvider,
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::AllTensorAndSequenceTensorTypes()), ATen);
Status ATen::Compute(OpKernelContext* p_ctx) const {
auto* p_ctx_internal = static_cast<OpKernelContextInternal*>(p_ctx);
size_t input_size = static_cast<size_t>(p_ctx_internal->InputCount());
size_t output_size = static_cast<size_t>(p_ctx_internal->OutputCount());
std::unique_ptr<DLManagedTensor*[]> dlpack_inputs = std::make_unique<DLManagedTensor*[]>(input_size);
std::unique_ptr<DLManagedTensor*[]> dlpack_outputs = std::make_unique<DLManagedTensor*[]>(output_size);
for (size_t i = 0; i < input_size; ++i) {
const OrtValue* p_ort_value = p_ctx_internal->GetInputMLValue(static_cast<int>(i));
if (!p_ort_value) {
dlpack_inputs[i] = nullptr;
} else {
OrtValue ort_value = *p_ort_value;
dlpack_inputs[i] = dlpack::OrtValueToDlpack(ort_value);
}
}
aten_ops::ATenOperatorExecutor::Instance()(op_name_, overload_name_, input_size, dlpack_inputs.get(), output_size,
dlpack_outputs.get());
for (size_t i = 0; i < output_size; ++i) {
ORT_RETURN_IF_ERROR(
p_ctx_internal->SetOutputMLValue(static_cast<int>(i), dlpack::DlpackToOrtValue(dlpack_outputs[i])));
}
return Status::OK();
}
#ifdef ENABLE_TRAINING
bool IsATenOperatorExecutorInitialized() { return aten_ops::ATenOperatorExecutor::Instance().IsInitialized(); }
Status ExecuteReduceSumATen(OpKernelContext* p_ctx, const gsl::span<const int64_t>& axes, bool keepdims) {
ORT_ENFORCE(aten_ops::ATenOperatorExecutor::Instance().IsInitialized() && !axes.empty());
size_t input_size = 4;
std::unique_ptr<DLManagedTensor*[]> dlpack_inputs = std::make_unique<DLManagedTensor*[]>(input_size);
auto* p_ctx_internal = static_cast<OpKernelContextInternal*>(p_ctx);
OrtValue ort_value = *p_ctx_internal->GetInputMLValue(0);
dlpack_inputs[0] = dlpack::OrtValueToDlpack(ort_value);
OrtValue axes_tensor;
OrtValue keepdims_tensor;
TensorShapeVector axes_tensor_shape(1, static_cast<int64_t>(axes.size()));
TensorShapeVector keepdims_tensor_shape(1, 1);
auto ml_tensor = DataTypeImpl::GetType<Tensor>();
OrtMemoryInfo info("Cpu", OrtDeviceAllocator);
auto axes_tensor_obj = std::make_unique<Tensor>(DataTypeImpl::GetType<int64_t>(), axes_tensor_shape,
const_cast<void*>(reinterpret_cast<const void*>(&axes[0])), info);
axes_tensor.Init(axes_tensor_obj.release(), ml_tensor, ml_tensor->GetDeleteFunc());
auto keepdims_tensor_obj = std::make_unique<Tensor>(DataTypeImpl::GetType<bool>(), keepdims_tensor_shape,
reinterpret_cast<void*>(&keepdims), info);
keepdims_tensor.Init(keepdims_tensor_obj.release(), ml_tensor, ml_tensor->GetDeleteFunc());
dlpack_inputs[1] = dlpack::OrtValueToDlpack(axes_tensor);
dlpack_inputs[2] = dlpack::OrtValueToDlpack(keepdims_tensor);
dlpack_inputs[3] = nullptr;
DLManagedTensor* dlpack_output = nullptr;
aten_ops::ATenOperatorExecutor::Instance()("sum", "dim_IntList", input_size, dlpack_inputs.get(), 1, &dlpack_output);
ORT_ENFORCE(dlpack_output);
ORT_RETURN_IF_ERROR(p_ctx_internal->SetOutputMLValue(0, dlpack::DlpackToOrtValue(dlpack_output)));
return Status::OK();
}
#endif
} // namespace contrib
} // namespace onnxruntime

View file

@ -22,8 +22,10 @@ class ATen : public OpKernel {
std::string overload_name_;
};
#ifdef ENABLE_TRAINING
bool IsATenOperatorExecutorInitialized();
Status ExecuteReduceSumATen(OpKernelContext* p_ctx, const gsl::span<const int64_t>& axes, bool keepdims);
#endif
} // namespace contrib
} // namespace onnxruntime

View file

@ -11,8 +11,9 @@ namespace contrib {
namespace aten_ops {
typedef bool (*IsTensorArgumentFunc)(const char* op_name, const char* overload_name, size_t index);
typedef std::vector<DLManagedTensor*> (*ExecuteATenOperatorFunc)(const char* op_name, const char* overload_name,
const std::vector<DLManagedTensor*>& dlpacks);
typedef void (*ExecuteATenOperatorFunc)(const char* op_name, const char* overload_name, size_t input_size,
DLManagedTensor** dlpack_inputs, size_t output_size,
DLManagedTensor** dlpack_outputs);
class ATenOperatorExecutor {
public:
@ -34,10 +35,11 @@ class ATenOperatorExecutor {
return p_is_tensor_argument_func_(op_name.c_str(), overload_name.c_str(), index);
}
std::vector<DLManagedTensor*> operator()(const std::string& op_name, const std::string& overload_name,
const std::vector<DLManagedTensor*>& dlpacks) {
void operator()(const std::string& op_name, const std::string& overload_name, size_t input_size,
DLManagedTensor** dlpack_inputs, size_t output_size, DLManagedTensor** dlpack_outputs) {
ORT_ENFORCE(p_execute_aten_op_func_, "ATenOperatorExecutor is not initialized.");
return p_execute_aten_op_func_(op_name.c_str(), overload_name.c_str(), dlpacks);
p_execute_aten_op_func_(op_name.c_str(), overload_name.c_str(), input_size, dlpack_inputs, output_size,
dlpack_outputs);
}
private:

View file

@ -112,6 +112,10 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1,
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Inverse);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Trilu);
#ifdef ENABLE_ATEN
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kPytorchAtenDomain, 1, ATen);
#endif
template <>
KernelCreateInfo BuildKernelCreateInfo<void>() {
KernelCreateInfo info;
@ -249,6 +253,10 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, double, SkipLayerNormalization)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Inverse)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Trilu)>,
#ifdef ENABLE_ATEN
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kPytorchAtenDomain, 1, ATen)>,
#endif
};
for (auto& function_table_entry : function_table) {

View file

@ -0,0 +1,18 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "contrib_ops/cpu/aten_ops/aten_op.h"
namespace onnxruntime {
namespace contrib {
namespace cuda {
ONNX_OPERATOR_KERNEL_EX(
ATen, kPytorchAtenDomain, 1, kCudaExecutionProvider,
(*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::AllTensorAndSequenceTensorTypes()),
onnxruntime::contrib::ATen);
} // namespace cuda
} // namespace contrib
} // namespace onnxruntime

View file

@ -97,6 +97,10 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BFloat16, FusedMatMul);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, BFloat16_float_BFloat16, LayerNormalization);
#ifdef ENABLE_ATEN
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kPytorchAtenDomain, 1, ATen);
#endif
template <>
KernelCreateInfo BuildKernelCreateInfo<void>() {
KernelCreateInfo info;
@ -195,6 +199,10 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BFloat16, FusedMatMul)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, BFloat16_float_BFloat16, LayerNormalization)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, FusedConv)>,
#ifdef ENABLE_ATEN
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kPytorchAtenDomain, 1, ATen)>,
#endif
};
for (auto& function_table_entry : function_table) {

View file

@ -96,6 +96,10 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16, FusedMatMul);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, BFloat16_float_BFloat16, LayerNormalization);
#ifdef ENABLE_ATEN
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kPytorchAtenDomain, 1, ATen);
#endif
template <>
KernelCreateInfo BuildKernelCreateInfo<void>() {
KernelCreateInfo info;
@ -194,6 +198,10 @@ Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16, FusedMatMul)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, BFloat16_float_BFloat16, LayerNormalization)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, FusedConv)>,
#ifdef ENABLE_ATEN
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kPytorchAtenDomain, 1, ATen)>,
#endif
};
for (auto& function_table_entry : function_table) {

View file

@ -38,18 +38,23 @@ IExecutionFrame::IExecutionFrame(const OrtValueNameIdxMap& ort_value_idx_map,
IExecutionFrame::~IExecutionFrame() = default;
#ifdef ENABLE_TRAINING
#ifdef ENABLE_ATEN
Status IExecutionFrame::SetOutputMLValue(int index, const OrtValue& ort_value) {
int ort_value_idx = GetNodeIdxToMLValueIdx(index);
if (ort_value_idx == NodeIndexInfo::kInvalidEntry || static_cast<size_t>(ort_value_idx) >= all_values_size_) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "invalid index ", ort_value_idx);
}
ORT_ENFORCE(!all_values_[ort_value_idx].IsAllocated());
all_values_[ort_value_idx] = ort_value;
if (all_values_[ort_value_idx].IsAllocated()) {
ORT_RETURN_IF_ERROR(CopyTensor(ort_value.Get<Tensor>(), *all_values_[ort_value_idx].GetMutable<Tensor>()));
} else {
all_values_[ort_value_idx] = ort_value;
}
return Status::OK();
}
#endif
#ifdef ENABLE_TRAINING
void IExecutionFrame::UpdateFeeds(const std::vector<int>& feed_mlvalue_idxs, const std::vector<OrtValue>& feeds) {
ORT_ENFORCE(feed_mlvalue_idxs.size() == feeds.size());

View file

@ -51,9 +51,12 @@ class IExecutionFrame {
const OrtValue* GetNodeInputOrOutputMLValue(int index) const;
OrtValue* GetMutableNodeInputOrOutputMLValue(int index);
#ifdef ENABLE_TRAINING
#ifdef ENABLE_ATEN
// Override the index-th output with ort_value
Status SetOutputMLValue(int index, const OrtValue& ort_value);
#endif
#ifdef ENABLE_TRAINING
void UpdateFeeds(const std::vector<int>& feed_mlvalue_idxs, const std::vector<OrtValue>& feeds);
void UpdateFetches(const std::vector<int>& fetch_mlvalue_idxs, const std::vector<OrtValue>& fetches,
const std::unordered_map<int, OrtValue>& initializers);
@ -73,7 +76,7 @@ class IExecutionFrame {
/**
* write the output values to the 'fetches' vector
* Don't access the values after SessionState is destroyed
* Don't access the values after SessionState is destroyed
*/
Status GetOutputs(std::vector<OrtValue>& fetches);

View file

@ -208,7 +208,7 @@ OrtValue* OpKernelContext::GetOutputMLValue(int index) {
return execution_frame_->GetMutableNodeInputOrOutputMLValue(output_arg_index);
}
#ifdef ENABLE_TRAINING
#ifdef ENABLE_ATEN
Status OpKernelContext::SetOutputMLValue(int index, const OrtValue& ort_value) {
if (index < 0 || index >= OutputCount()) {
return Status(common::ONNXRUNTIME, common::FAIL,

View file

@ -53,7 +53,7 @@ class OpKernelContextInternal : public OpKernelContext {
return OpKernelContext::GetOutputMLValue(index);
}
#ifdef ENABLE_TRAINING
#ifdef ENABLE_ATEN
Status SetOutputMLValue(int index, const OrtValue& ort_value) {
return OpKernelContext::SetOutputMLValue(index, ort_value);
}

View file

@ -21,7 +21,10 @@
#include "core/framework/TensorSeq.h"
#ifdef ENABLE_TRAINING
#include "core/framework/orttraining_partial_executor.h"
#include "orttraining/training_ops/cpu/aten_ops/aten_op_executor.h"
#endif
#ifdef ENABLE_ATEN
#include "contrib_ops/cpu/aten_ops/aten_op_executor.h"
#endif
namespace ONNX_NAMESPACE {
@ -787,8 +790,9 @@ bool IsInputOnCpu(const Node& node, const KernelCreateInfo* p_kci, size_t index)
return true;
}
#ifdef ENABLE_TRAINING
if (node.GetExecutionProviderType() == kCudaExecutionProvider && node.OpType() == "ATen" && node.Domain() == kPytorchAtenDomain) {
#ifdef ENABLE_ATEN
if (node.GetExecutionProviderType() == kCudaExecutionProvider && node.OpType() == "ATen" &&
node.Domain() == kPytorchAtenDomain) {
const auto& attrs = node.GetAttributes();
ORT_ENFORCE(utils::HasString(attrs.at("operator")));
std::string op_name = attrs.at("operator").s();

View file

@ -2510,6 +2510,24 @@ This op functions in much the same was as Dropout-11 and Dropout-13 do, execpt t
}
});
#ifdef ENABLE_ATEN
ONNX_CONTRIB_OPERATOR_SCHEMA(ATen)
.SetDomain(kPytorchAtenDomain)
.SinceVersion(1)
.SetSupportLevel(OpSchema::SupportType::EXPERIMENTAL)
.SetDoc("ATen")
.Input(0, "inputs", "ATen Op inputs.", "T", OpSchema::Variadic,
/*is_homogeneous*/ false,
/*min_arity*/ 1)
.Output(0, "outputs", "ATen Op outputs.", "T", OpSchema::Variadic,
/*is_homogeneous*/ false,
/*min_arity*/ 1)
.Attr("operator", "Name of ATen operator.", AttributeProto::STRING)
.Attr("overload_name", "Overload name of ATen operator.", AttributeProto::STRING, false)
.TypeConstraint("T", OpSchema::all_tensor_types_with_bfloat(),
"Allow inputs and outputs to be any kind of tensor.");
#endif
#ifndef _OPSCHEMA_LIB_
// Register the NCHWc schemas if supported by the platform.
if (MlasNchwcGetBlockSize() > 1) {

View file

@ -32,10 +32,12 @@
#include "contrib_ops/cpu/bert/embed_layer_norm_helper.h"
#include "contrib_ops/cpu/bert/longformer_attention_base.h"
#include "contrib_ops/cpu/transformers/beam_search.h"
#ifdef ENABLE_ATEN
#include "contrib_ops/cpu/aten_ops/aten_op.h"
#endif
#endif
#ifdef ENABLE_TRAINING
#include "orttraining/training_ops/cpu/aten_ops/aten_op.h"
#include "orttraining/training_ops/cpu/controlflow/group.h"
#include "orttraining/training_ops/cpu/controlflow/record.h"
#include "orttraining/training_ops/cpu/controlflow/wait.h"
@ -173,10 +175,13 @@ struct ProviderHostCPUImpl : ProviderHostCPU {
void BeamSearch__Init(contrib::transformers::BeamSearch* p, const OpKernelInfo& info) override { p->contrib::transformers::BeamSearch::Init(info); }
virtual Status BeamSearch__Compute(const contrib::transformers::BeamSearch* p, OpKernelContext* ctx) { return p->contrib::transformers::BeamSearch::Compute(ctx); }
virtual Status BeamSearch__SetupSubgraphExecutionInfo(contrib::transformers::BeamSearch* p, const SessionState& session_state, const std::string& attribute_name, const SessionState& subgraph_session_state) override { return p->contrib::transformers::BeamSearch::SetupSubgraphExecutionInfo(session_state, attribute_name, subgraph_session_state); }
#ifdef ENABLE_ATEN
Status ATen__Compute(const contrib::ATen* p, OpKernelContext* p_ctx) override { return p->ATen::Compute(p_ctx); }
#endif
#endif
#ifdef ENABLE_TRAINING
Status ATen__Compute(const contrib::ATen* p, OpKernelContext* p_ctx) override { return p->ATen::Compute(p_ctx); }
void contrib__record_event_in_tensor(const Tensor& event_id_tensor) override { return contrib::record_event_in_tensor(event_id_tensor); }
void contrib__wait_event_in_tensor(const Tensor& event_id_tensor) override { return contrib::wait_event_in_tensor(event_id_tensor); }
Status contrib__Group__Compute(const contrib::Group* p, OpKernelContext* context) override { return p->Group::Compute(context); }

View file

@ -134,10 +134,13 @@ struct ProviderHostCPU {
virtual void BeamSearch__Init(contrib::transformers::BeamSearch* p, const OpKernelInfo& info) = 0;
virtual Status BeamSearch__Compute(const contrib::transformers::BeamSearch* p, OpKernelContext* ctx) = 0;
virtual Status BeamSearch__SetupSubgraphExecutionInfo(contrib::transformers::BeamSearch* p, const SessionState& session_state, const std::string& attribute_name, const SessionState& subgraph_session_state) = 0;
#ifdef ENABLE_ATEN
virtual Status ATen__Compute(const contrib::ATen* p, OpKernelContext* p_ctx) = 0;
#endif
#endif
#ifdef ENABLE_TRAINING
virtual Status ATen__Compute(const contrib::ATen* p, OpKernelContext* p_ctx) = 0;
virtual void contrib__record_event_in_tensor(const Tensor& event_id_tensor) = 0;
virtual void contrib__wait_event_in_tensor(const Tensor& event_id_tensor) = 0;
virtual Status contrib__Group__Compute(const contrib::Group* p, OpKernelContext* context) = 0;

View file

@ -9,7 +9,7 @@
#include "core/providers/cuda/math/binary_elementwise_ops.h"
#include "core/providers/cuda/math/unary_elementwise_ops_impl.h"
#ifdef ENABLE_TRAINING
#include "orttraining/training_ops/cpu/aten_ops/aten_op.h"
#include "contrib_ops/cpu/aten_ops/aten_op.h"
#endif
using namespace onnxruntime::common;

View file

@ -31,10 +31,12 @@
#include "contrib_ops/cpu/bert/embed_layer_norm_helper.h"
#include "contrib_ops/cpu/bert/longformer_attention_base.h"
#include "contrib_ops/cpu/transformers/beam_search.h"
#ifdef ENABLE_ATEN
#include "contrib_ops/cpu/aten_ops/aten_op.h"
#endif
#endif
#ifdef ENABLE_TRAINING
#include "orttraining/training_ops/cpu/aten_ops/aten_op.h"
#include "orttraining/training_ops/cpu/controlflow/group.h"
#include "orttraining/training_ops/cpu/controlflow/yield.h"
@ -541,6 +543,10 @@ void BeamSearch::Init(const OpKernelInfo& info) { g_host_cpu.BeamSearch__Init(th
Status BeamSearch::Compute(OpKernelContext* ctx) const { return g_host_cpu.BeamSearch__Compute(this, ctx); }
Status BeamSearch::SetupSubgraphExecutionInfo(const SessionState& session_state, const std::string& attribute_name, const SessionState& subgraph_session_state) { return g_host_cpu.BeamSearch__SetupSubgraphExecutionInfo(this, session_state, attribute_name, subgraph_session_state); }
} // namespace transformers
#ifdef ENABLE_ATEN
Status ATen::Compute(OpKernelContext* p_ctx) const { return g_host_cpu.ATen__Compute(this, p_ctx); }
#endif
} // namespace contrib
#endif
@ -567,7 +573,6 @@ Status Scan<9>::SetupSubgraphExecutionInfo(const SessionState& session_state, co
#ifdef ENABLE_TRAINING
namespace contrib {
Status ATen::Compute(OpKernelContext* p_ctx) const { return g_host_cpu.ATen__Compute(this, p_ctx); }
Status Group::Compute(OpKernelContext* context) const { return g_host_cpu.contrib__Group__Compute(this, context); }
Status PassThrough::Compute(OpKernelContext* context) const { return g_host_cpu.contrib__PassThrough__Compute(this, context); }
Status YieldOp::Compute(OpKernelContext* context) const { return g_host_cpu.contrib__YieldOp__Compute(this, context); }

View file

@ -30,6 +30,10 @@
#include "core/session/provider_bridge_ort.h"
#include "core/providers/tensorrt/tensorrt_provider_options.h"
#ifdef ENABLE_ATEN
#include "contrib_ops/cpu/aten_ops/aten_op_executor.h"
#endif
// Explicitly provide a definition for the static const var 'GPU' in the OrtDevice struct,
// GCC 4.x doesn't seem to define this and it breaks the pipelines based on CentOS as it uses
// GCC 4.x.
@ -1010,6 +1014,19 @@ void addGlobalMethods(py::module& m, Environment& env) {
arena_extend_strategy = strategy;
});
#endif
#ifdef ENABLE_ATEN
m.def("register_aten_op_executor",
[](const std::string& is_tensor_argument_address_str, const std::string& aten_op_executor_address_str) -> void {
size_t is_tensor_argument_address_int, aten_op_executor_address_int;
ORT_THROW_IF_ERROR(
ParseStringWithClassicLocale(is_tensor_argument_address_str, is_tensor_argument_address_int));
ORT_THROW_IF_ERROR(ParseStringWithClassicLocale(aten_op_executor_address_str, aten_op_executor_address_int));
void* p_is_tensor_argument = reinterpret_cast<void*>(is_tensor_argument_address_int);
void* p_aten_op_executor = reinterpret_cast<void*>(aten_op_executor_address_int);
contrib::aten_ops::ATenOperatorExecutor::Instance().Initialize(p_is_tensor_argument, p_aten_op_executor);
});
#endif
}
void addObjectMethods(py::module& m, Environment& env, ExecutionProviderRegistrationFn ep_registration_fn) {
@ -1236,7 +1253,7 @@ Applies to session load, initialization, etc. Default is 0.)pbdoc")
const OrtValue* ml_value = ml_value_pyobject.attr(PYTHON_ORTVALUE_NATIVE_OBJECT_ATTR).cast<OrtValue*>();
ORT_THROW_IF_ERROR(options->AddInitializer(name, ml_value));
})
.def("add_external_initializers", [](PySessionOptions* options, py::list& names,
.def("add_external_initializers", [](PySessionOptions* options, py::list& names,
const py::list& ort_values) -> void {
#if !defined(ORT_MINIMAL_BUILD) && !defined(DISABLE_EXTERNAL_INITIALIZERS)
const auto init_num = ort_values.size();

View file

@ -192,19 +192,19 @@ class SymbolicShapeInference:
"SkipLayerNormalization": self._infer_SkipLayerNormalization,
}
self.aten_op_dispatcher_ = {
"aten::embedding": self._infer_Gather,
"aten::bitwise_or": self._infer_aten_bitwise_or,
"aten::diagonal": self._infer_aten_diagonal,
"aten::max_pool2d_with_indices": self._infer_aten_pool2d,
"aten::max": self._infer_aten_minmax,
"aten::min": self._infer_aten_minmax,
"aten::multinomial": self._infer_aten_multinomial,
"aten::unfold": self._infer_aten_unfold,
"aten::argmax": self._infer_aten_argmax,
"aten::avg_pool2d": self._infer_aten_pool2d,
"aten::_adaptive_avg_pool2d": self._infer_aten_pool2d,
"aten::binary_cross_entropy_with_logits": self._infer_aten_bce,
"aten::numpy_T": self._infer_Transpose,
"embedding": self._infer_Gather,
"bitwise_or": self._infer_aten_bitwise_or,
"diagonal": self._infer_aten_diagonal,
"max_pool2d_with_indices": self._infer_aten_pool2d,
"max": self._infer_aten_minmax,
"min": self._infer_aten_minmax,
"multinomial": self._infer_aten_multinomial,
"unfold": self._infer_aten_unfold,
"argmax": self._infer_aten_argmax,
"avg_pool2d": self._infer_aten_pool2d,
"_adaptive_avg_pool2d": self._infer_aten_pool2d,
"binary_cross_entropy_with_logits": self._infer_aten_bce,
"numpy_T": self._infer_Transpose,
}
self.run_ = True
self.suggested_merge_ = {}

View file

@ -1,8 +1,8 @@
from onnxruntime.capi import _pybind_state as C
import threading
from functools import wraps
from onnxruntime.capi import _pybind_state as _C
def run_once_aten_op_executor(f):
"""
@ -28,6 +28,6 @@ def run_once_aten_op_executor(f):
def load_aten_op_executor_cpp_extension():
from onnxruntime.training.ortmodule.torch_cpp_extensions import aten_op_executor
C.register_aten_op_executor(
_C.register_aten_op_executor(
str(aten_op_executor.is_tensor_argument_address()), str(aten_op_executor.execute_aten_operator_address())
)

View file

@ -175,13 +175,13 @@ bool IsTensorArgument(const char* op_name, const char* overload_name, size_t ind
return aten_op.elem_kinds[index] == c10::TypeKind::TensorType;
}
std::vector<DLManagedTensor*> ExecuteATenOperator(const char* op_name, const char* overload_name,
const std::vector<DLManagedTensor*>& dlpacks) {
void ExecuteATenOperator(const char* op_name, const char* overload_name, size_t input_size,
DLManagedTensor** dlpack_inputs, size_t output_size, DLManagedTensor** dlpack_outputs) {
const auto& aten_op = ATenOperatorCache::Instance().GetOperator(op_name, overload_name);
TORCH_INTERNAL_ASSERT(dlpacks.size() == aten_op.argument_size);
TORCH_INTERNAL_ASSERT(input_size == aten_op.argument_size);
std::vector<c10::IValue> arguments;
for (size_t i = 0; i < dlpacks.size(); i++) {
arguments.emplace_back(aten_op.ToIValueArgument(dlpacks[i], i));
for (size_t i = 0; i < input_size; ++i) {
arguments.emplace_back(aten_op.ToIValueArgument(dlpack_inputs[i], i));
}
torch::jit::Stack stack;
@ -191,8 +191,7 @@ std::vector<DLManagedTensor*> ExecuteATenOperator(const char* op_name, const cha
#ifndef TORCH_VERSION_PREEQ
#define TORCH_VERSION_PREEQ(x, y) \
((TORCH_VERSION_MAJOR == (x) && TORCH_VERSION_MINOR >= (y)) || \
(TORCH_VERSION_MAJOR > (x)))
((TORCH_VERSION_MAJOR == (x) && TORCH_VERSION_MINOR >= (y)) || (TORCH_VERSION_MAJOR > (x)))
#endif
// pull request https://github.com/pytorch/pytorch/pull/63414 introduced
@ -207,13 +206,12 @@ std::vector<DLManagedTensor*> ExecuteATenOperator(const char* op_name, const cha
aten_op.op->getOperation()(&stack);
#endif
std::vector<DLManagedTensor*> result;
for (const auto& ret : torch::jit::pop(stack, aten_op.return_size)) {
TORCH_INTERNAL_ASSERT(output_size == aten_op.return_size);
size_t output_index = 0;
for (const auto& ret : torch::jit::pop(stack, output_size)) {
const auto& tensor = ret.toTensor();
result.emplace_back(at::toDLPack(tensor.is_contiguous() ? tensor : tensor.contiguous()));
dlpack_outputs[output_index++] = at::toDLPack(tensor.is_contiguous() ? tensor : tensor.contiguous());
}
return result;
}
size_t is_tensor_argument_address() { return reinterpret_cast<size_t>(&IsTensorArgument); }

View file

@ -4,7 +4,8 @@
# --------------------------------------------------------------------------
import os
from setuptools import setup, Extension
from setuptools import setup
from torch.utils import cpp_extension
filename = os.path.join(os.path.dirname(__file__), "aten_op_executor.cc")

View file

@ -0,0 +1,37 @@
import threading
from functools import wraps
import torch
from onnxruntime.capi import _pybind_state as _C
from .aten_op_executor import execute_aten_operator_address, is_tensor_argument_address
def run_once_aten_op_executor(f):
"""
Decorator to run a function only once.
:param f: function to be run only once during execution time despite the number of calls
:return: The original function with the params passed to it if it hasn't already been run before
"""
@wraps(f)
def aten_op_executor_wrapper(*args, **kwargs):
if not aten_op_executor_wrapper.has_run:
with aten_op_executor_wrapper.lock:
if not aten_op_executor_wrapper.has_run:
aten_op_executor_wrapper.has_run = True
return f(*args, **kwargs)
aten_op_executor_wrapper.lock = threading.Lock()
aten_op_executor_wrapper.has_run = False
return aten_op_executor_wrapper
@run_once_aten_op_executor
def load_aten_op_executor_cpp_extension():
_C.register_aten_op_executor(str(is_tensor_argument_address()), str(execute_aten_operator_address()))
def init_aten_op_executor():
load_aten_op_executor_cpp_extension()

View file

@ -0,0 +1,19 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
import os
from setuptools import setup
from torch.utils import cpp_extension
filename = os.path.join(os.path.dirname(__file__), "aten_op_executor/aten_op_executor.cc")
setup(
name="ort_torch_ext",
version="1.0",
ext_modules=[cpp_extension.CppExtension(name="ort_torch_ext.aten_op_executor", sources=[filename])],
packages=["ort_torch_ext"],
cmdclass={"build_ext": cpp_extension.BuildExtension},
)

View file

@ -0,0 +1,131 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import io
import unittest
import numpy as np
import onnx
import torch
from onnx import TensorProto, helper
from ort_torch_ext import init_aten_op_executor
from torch.onnx import export
import onnxruntime as ort
class OrtOpTests(unittest.TestCase):
def test_aten_embedding(self):
class NeuralNetEmbedding(torch.nn.Module):
def __init__(self, num_embeddings, embedding_dim, hidden_size):
super(NeuralNetEmbedding, self).__init__()
self.embedding = torch.nn.Embedding(num_embeddings, embedding_dim)
self.linear = torch.nn.Linear(embedding_dim, hidden_size)
def forward(self, input):
embedding_result = self.embedding(input)
return embedding_result, self.linear(embedding_result)
N, num_embeddings, embedding_dim, hidden_size = 64, 32, 128, 128
model = NeuralNetEmbedding(num_embeddings, embedding_dim, hidden_size)
with torch.no_grad():
x = torch.randint(high=num_embeddings, size=(N,), dtype=torch.int64)
dynamic_axes = {"x": {0: "x_dim0"}, "y": {0: "y_dim0", 1: "y_dim1"}}
f = io.BytesIO()
export(
model,
x,
f=f,
input_names=["x"],
output_names=["y"],
dynamic_axes=dynamic_axes,
opset_version=14,
)
exported_model = onnx.load_model_from_string(f.getvalue())
# PyTorch exporter emitting ATen op is still under development. Currently convert it manually for testing.
for node in exported_model.graph.node:
if node.op_type == "Gather":
node.domain = "org.pytorch.aten"
node.op_type = "ATen"
attr = node.attribute.add()
attr.name = "operator"
attr.type = 3
attr.s = "embedding".encode()
exported_model.graph.node.append(
helper.make_node(
"Constant",
[],
["padding_idx"],
value=helper.make_tensor("padding_idx", TensorProto.INT64, (), [-1]),
)
)
exported_model.graph.node.append(
helper.make_node(
"Constant",
[],
["scale_grad_by_freq"],
value=helper.make_tensor("scale_grad_by_freq", TensorProto.BOOL, (), [False]),
)
)
exported_model.graph.node.append(
helper.make_node(
"Constant",
[],
["sparse"],
value=helper.make_tensor("sparse", TensorProto.BOOL, (), [False]),
)
)
node.input.append("padding_idx")
node.input.append("scale_grad_by_freq")
node.input.append("sparse")
exported_model.graph.value_info.append(
helper.make_value_info(
name=node.output[0],
type_proto=helper.make_tensor_type_proto(
elem_type=TensorProto.FLOAT, shape=[node.output[0] + "_dim0", node.output[0] + "_dim1"]
),
)
)
break
# The ONNX graph to run contains ATen Op.
assert any(node.op_type == "ATen" for node in exported_model.graph.node)
init_aten_op_executor()
# Run w/o IO binding.
for _ in range(8):
x = torch.randint(high=num_embeddings, size=(N,), dtype=torch.int64)
pt_y1, pt_y2 = model(x)
session = ort.InferenceSession(exported_model.SerializeToString(), providers=["CPUExecutionProvider"])
ort_y1, ort_y2 = session.run([], {"x": x.numpy()})
np.testing.assert_almost_equal(ort_y1, pt_y1.detach().numpy())
np.testing.assert_almost_equal(ort_y2, pt_y2.detach().numpy())
# Run w/ IO binding.
for _ in range(8):
x = torch.randint(high=num_embeddings, size=(N,), dtype=torch.int64)
ort_x = ort.OrtValue.ortvalue_from_numpy(x.detach().numpy(), "cpu")
pt_y1, pt_y2 = model(x)
np_y1 = np.zeros(tuple(pt_y1.size()), dtype=np.float32)
np_y2 = np.zeros(tuple(pt_y2.size()), dtype=np.float32)
ort_y1 = ort.OrtValue.ortvalue_from_numpy(np_y1, "cpu")
ort_y2 = ort.OrtValue.ortvalue_from_numpy(np_y2, "cpu")
session = ort.InferenceSession(exported_model.SerializeToString(), providers=["CPUExecutionProvider"])
io_binding = session.io_binding()
io_binding.bind_ortvalue_input(exported_model.graph.input[0].name, ort_x)
io_binding.bind_ortvalue_output(exported_model.graph.output[0].name, ort_y1)
io_binding.bind_ortvalue_output(exported_model.graph.output[1].name, ort_y2)
session.run_with_iobinding(io_binding)
np.testing.assert_almost_equal(np_y1, pt_y1.detach().numpy())
np.testing.assert_almost_equal(np_y2, pt_y2.detach().numpy())
if __name__ == "__main__":
unittest.main()

View file

@ -3384,24 +3384,6 @@ Return true if all elements are true and false otherwise.
}
});
#ifdef ENABLE_TRAINING
ONNX_CONTRIB_OPERATOR_SCHEMA(ATen)
.SetDomain(kPytorchAtenDomain)
.SinceVersion(1)
.SetSupportLevel(OpSchema::SupportType::EXPERIMENTAL)
.SetDoc("ATen")
.Input(0, "inputs", "ATen Op inputs.", "T", OpSchema::Variadic,
/*is_homogeneous*/ false,
/*min_arity*/ 1)
.Output(0, "outputs", "ATen Op outputs.", "T", OpSchema::Variadic,
/*is_homogeneous*/ false,
/*min_arity*/ 1)
.Attr("operator", "Name of ATen operator.", AttributeProto::STRING)
.Attr("overload_name", "Overload name of ATen operator.", AttributeProto::STRING, false)
.TypeConstraint("T", OpSchema::all_tensor_types_with_bfloat(),
"Allow inputs and outputs to be any kind of tensor.");
#endif
ONNX_CONTRIB_OPERATOR_SCHEMA(PythonOp)
.SetDomain(kMSDomain)
.SinceVersion(1)

View file

@ -21,9 +21,6 @@
#include "orttraining/core/framework/ortmodule_graph_builder.h"
#include "orttraining/core/graph/gradient_definition_registry.h"
#include "python/onnxruntime_pybind_mlvalue.h"
#include "orttraining/training_ops/cpu/aten_ops/aten_op_executor.h"
#include "orttraining/python/orttraining_pybind_common.h"
#ifdef ENABLE_TRAINING_TORCH_INTEROP
@ -555,16 +552,6 @@ for every transfered tensor.
m.def("get_mpi_context_world_size", []() -> int { return MPIContext::GetInstance().GetWorldSize(); });
#endif
m.def("register_aten_op_executor",
[](const std::string& is_tensor_argument_address_str, const std::string& aten_op_executor_address_str) -> void {
size_t is_tensor_argument_address_int, aten_op_executor_address_int;
ORT_THROW_IF_ERROR(
ParseStringWithClassicLocale(is_tensor_argument_address_str, is_tensor_argument_address_int));
ORT_THROW_IF_ERROR(ParseStringWithClassicLocale(aten_op_executor_address_str, aten_op_executor_address_int));
void* p_is_tensor_argument = reinterpret_cast<void*>(is_tensor_argument_address_int);
void* p_aten_op_executor = reinterpret_cast<void*>(aten_op_executor_address_int);
contrib::aten_ops::ATenOperatorExecutor::Instance().Initialize(p_is_tensor_argument, p_aten_op_executor);
});
m.def("register_forward_runner", [](py::object obj) -> void {
#ifdef ENABLE_TRAINING_TORCH_INTEROP
auto& pool = onnxruntime::language_interop_ops::torch::OrtTorchFunctionPool::GetInstance();

View file

@ -25,6 +25,7 @@
# 'is_tensor' is optional, if not present, the default is False.
import json
from onnxruntime.capi import _pybind_state as C
@ -85,7 +86,7 @@ def register_gradient(domain, name, *attributes):
# For ATen op, we need to provide op_name and overload name.
@register_gradient("org.pytorch.aten", "ATen", "aten::embedding", "")
@register_gradient("org.pytorch.aten", "ATen", "embedding", "")
def embedding_gradient():
return [
("Constant", [], ["Const_0"], {"value": {"value": 0, "dtype": "int", "is_tensor": True}}),
@ -95,12 +96,12 @@ def embedding_gradient():
("ATen", "org.pytorch.aten"),
["GO(0)", "I(1)", "Gather_X_0", "I(2)", "I(3)", "I(4)"],
["GI(0)"],
{"operator": {"value": "aten::embedding_backward", "dtype": "string"}},
{"operator": {"value": "embedding_backward", "dtype": "string"}},
),
]
@register_gradient("org.pytorch.aten", "ATen", "aten::diagonal", "")
@register_gradient("org.pytorch.aten", "ATen", "diagonal", "")
def diagonal_gradient():
return [
("Shape", ["I(0)"], ["Shape_X"]),
@ -108,19 +109,19 @@ def diagonal_gradient():
("ATen", "org.pytorch.aten"),
["GO(0)", "Shape_X", "I(1)", "I(2)", "I(3)"],
["GI(0)"],
{"operator": {"value": "aten::diagonal_backward", "dtype": "string"}},
{"operator": {"value": "diagonal_backward", "dtype": "string"}},
),
]
@register_gradient("org.pytorch.aten", "ATen", "aten::max_pool2d_with_indices", "")
@register_gradient("org.pytorch.aten", "ATen", "max_pool2d_with_indices", "")
def max_pool2d_gradient():
return [
(
("ATen", "org.pytorch.aten"),
["GO(0)", "I(0)", "I(1)", "I(2)", "I(3)", "I(4)", "I(5)", "O(1)"],
["GI(0)"],
{"operator": {"value": "aten::max_pool2d_with_indices_backward", "dtype": "string"}},
{"operator": {"value": "max_pool2d_with_indices_backward", "dtype": "string"}},
),
]
@ -143,8 +144,8 @@ def minmax_gradient():
]
min_gradient = register_gradient("org.pytorch.aten", "ATen", "aten::min", "")(minmax_gradient)
max_gradient = register_gradient("org.pytorch.aten", "ATen", "aten::max", "")(minmax_gradient)
min_gradient = register_gradient("org.pytorch.aten", "ATen", "min", "")(minmax_gradient)
max_gradient = register_gradient("org.pytorch.aten", "ATen", "max", "")(minmax_gradient)
def minmax_dim_gradient():
@ -161,16 +162,16 @@ def minmax_dim_gradient():
("ATen", "org.pytorch.aten"),
["GO(0)", "I(1)", "O(1)", "Shape_X", "I(2)"],
["GI(0)"],
{"operator": {"value": "aten::value_selecting_reduction_backward", "dtype": "string"}},
{"operator": {"value": "value_selecting_reduction_backward", "dtype": "string"}},
),
]
min_dim_gradient = register_gradient("org.pytorch.aten", "ATen", "aten::min", "dim")(minmax_dim_gradient)
max_dim_gradient = register_gradient("org.pytorch.aten", "ATen", "aten::max", "dim")(minmax_dim_gradient)
min_dim_gradient = register_gradient("org.pytorch.aten", "ATen", "min", "dim")(minmax_dim_gradient)
max_dim_gradient = register_gradient("org.pytorch.aten", "ATen", "max", "dim")(minmax_dim_gradient)
@register_gradient("org.pytorch.aten", "ATen", "aten::unfold", "")
@register_gradient("org.pytorch.aten", "ATen", "unfold", "")
def unfold_gradient():
return [
("Shape", ["I(0)"], ["Shape_X"]),
@ -178,58 +179,58 @@ def unfold_gradient():
("ATen", "org.pytorch.aten"),
["GO(0)", "Shape_X", "I(1)", "I(2)", "I(3)"],
["GI(0)"],
{"operator": {"value": "aten::unfold_backward", "dtype": "string"}},
{"operator": {"value": "unfold_backward", "dtype": "string"}},
),
]
@register_gradient("org.pytorch.aten", "ATen", "aten::avg_pool2d", "")
@register_gradient("org.pytorch.aten", "ATen", "avg_pool2d", "")
def avg_pool2d_gradient():
return [
(
("ATen", "org.pytorch.aten"),
["GO(0)", "I(0)", "I(1)", "I(2)", "I(3)", "I(4)", "I(5)", "I(6)"],
["GI(0)"],
{"operator": {"value": "aten::avg_pool2d_backward", "dtype": "string"}},
{"operator": {"value": "avg_pool2d_backward", "dtype": "string"}},
),
]
@register_gradient("org.pytorch.aten", "ATen", "aten::_adaptive_avg_pool2d", "")
@register_gradient("org.pytorch.aten", "ATen", "_adaptive_avg_pool2d", "")
def adaptive_avg_pool2d_gradient():
return [
(
("ATen", "org.pytorch.aten"),
["GO(0)", "I(0)"],
["GI(0)"],
{"operator": {"value": "aten::_adaptive_avg_pool2d_backward", "dtype": "string"}},
{"operator": {"value": "_adaptive_avg_pool2d_backward", "dtype": "string"}},
),
]
CustomGradientRegistry.register_custom_stop_gradient_edges([0], "org.pytorch.aten", "ATen", "aten::argmax", "")
CustomGradientRegistry.register_custom_stop_gradient_edges([0], "org.pytorch.aten", "ATen", "aten::multinomial", "")
CustomGradientRegistry.register_custom_stop_gradient_edges([0], "org.pytorch.aten", "ATen", "argmax", "")
CustomGradientRegistry.register_custom_stop_gradient_edges([0], "org.pytorch.aten", "ATen", "multinomial", "")
@register_gradient("org.pytorch.aten", "ATen", "aten::binary_cross_entropy_with_logits", "")
@register_gradient("org.pytorch.aten", "ATen", "binary_cross_entropy_with_logits", "")
def binary_cross_entropy_with_logits_gradient():
return [
(
("ATen", "org.pytorch.aten"),
["GO(0)", "I(0)", "I(1)", "I(2)", "I(3)", "I(4)"],
["GI(0)"],
{"operator": {"value": "aten::binary_cross_entropy_with_logits_backward", "dtype": "string"}},
{"operator": {"value": "binary_cross_entropy_with_logits_backward", "dtype": "string"}},
),
]
@register_gradient("org.pytorch.aten", "ATen", "aten::numpy_T", "")
@register_gradient("org.pytorch.aten", "ATen", "numpy_T", "")
def numpy_T_gradient():
return [
(
("ATen", "org.pytorch.aten"),
["GO(0)"],
["GI(0)"],
{"operator": {"value": "aten::numpy_T", "dtype": "string"}},
{"operator": {"value": "numpy_T", "dtype": "string"}},
),
]

View file

@ -3,10 +3,10 @@
# Licensed under the MIT License.
# --------------------------------------------------------------------------
from torch.onnx import register_custom_op_symbolic
from torch.onnx.symbolic_helper import parse_args, _get_tensor_dim_size, _get_tensor_sizes
import torch.onnx.symbolic_helper as sym_help
import torch
import torch.onnx.symbolic_helper as sym_help
from torch.onnx import register_custom_op_symbolic
from torch.onnx.symbolic_helper import _get_tensor_dim_size, _get_tensor_sizes, parse_args
class CustomOpSymbolicRegistry:
@ -73,7 +73,7 @@ def nll_loss(g, self, target, weight, reduction, ignore_index):
@register_symbolic("embedding")
def embedding(g, weight, indices, padding_idx, scale_grad_by_freq, sparse):
output = g.op(
"org.pytorch.aten::ATen", weight, indices, padding_idx, scale_grad_by_freq, sparse, operator_s="aten::embedding"
"org.pytorch.aten::ATen", weight, indices, padding_idx, scale_grad_by_freq, sparse, operator_s="embedding"
)
indices_shape = _get_tensor_sizes(indices)
if indices_shape is not None and hasattr(weight.type(), "with_sizes"):
@ -84,19 +84,19 @@ def embedding(g, weight, indices, padding_idx, scale_grad_by_freq, sparse):
@register_symbolic("bitwise_or")
def bitwise_or(g, self, other):
return g.op("org.pytorch.aten::ATen", self, other, operator_s="aten::bitwise_or", overload_name_s="Tensor")
return g.op("org.pytorch.aten::ATen", self, other, operator_s="bitwise_or", overload_name_s="Tensor")
@register_symbolic("diagonal")
def diagonal(g, self, offset, dim1, dim2):
return g.op("org.pytorch.aten::ATen", self, offset, dim1, dim2, operator_s="aten::diagonal")
return g.op("org.pytorch.aten::ATen", self, offset, dim1, dim2, operator_s="diagonal")
@register_symbolic("multinomial")
def multinomial(g, self, num_samples, replacement=False, generator=None):
if generator is not None and not sym_help._is_none(generator):
raise RuntimeError("Unsupported: ONNX does not support generator for multinomial")
return g.op("org.pytorch.aten::ATen", self, num_samples, replacement, generator, operator_s="aten::multinomial")
return g.op("org.pytorch.aten::ATen", self, num_samples, replacement, generator, operator_s="multinomial")
@register_symbolic("max_pool2d")
@ -112,7 +112,7 @@ def max_pool2d(g, self, kernel_size, stride, padding, dilation, ceil_mode):
padding,
dilation,
ceil_mode,
operator_s="aten::max_pool2d_with_indices",
operator_s="max_pool2d_with_indices",
outputs=2,
)[0]
@ -121,38 +121,34 @@ def max_pool2d(g, self, kernel_size, stride, padding, dilation, ceil_mode):
def max(g, self, dim_or_y=None, keepdim=None):
# torch.max(input), returns the max value in the tensor
if dim_or_y is None and keepdim is None:
return g.op("org.pytorch.aten::ATen", self, operator_s="aten::max")
return g.op("org.pytorch.aten::ATen", self, operator_s="max")
# torch.max(input, other)
if keepdim is None:
return g.op("Max", self, dim_or_y)
# torch.max(input, dim, keepdim), returns (max_values, max_indices)
return g.op(
"org.pytorch.aten::ATen", self, dim_or_y, keepdim, operator_s="aten::max", overload_name_s="dim", outputs=2
)
return g.op("org.pytorch.aten::ATen", self, dim_or_y, keepdim, operator_s="max", overload_name_s="dim", outputs=2)
@register_symbolic("min")
def min(g, self, dim_or_y=None, keepdim=None):
# torch.min(input), returns the min value in the tensor
if dim_or_y is None and keepdim is None:
return g.op("org.pytorch.aten::ATen", self, operator_s="aten::min")
return g.op("org.pytorch.aten::ATen", self, operator_s="min")
# torch.min(input, other)
if keepdim is None:
return g.op("Min", self, dim_or_y)
# torch.min(input, dim, keepdim), returns (min_values, min_indices)
return g.op(
"org.pytorch.aten::ATen", self, dim_or_y, keepdim, operator_s="aten::min", overload_name_s="dim", outputs=2
)
return g.op("org.pytorch.aten::ATen", self, dim_or_y, keepdim, operator_s="min", overload_name_s="dim", outputs=2)
@register_symbolic("unfold")
def unfold(g, input, dimension, size, step):
return g.op("org.pytorch.aten::ATen", input, dimension, size, step, operator_s="aten::unfold")
return g.op("org.pytorch.aten::ATen", input, dimension, size, step, operator_s="unfold")
@register_symbolic("argmax")
def argmax(g, input, dim, keepdim):
return g.op("org.pytorch.aten::ATen", input, dim, keepdim, operator_s="aten::argmax")
return g.op("org.pytorch.aten::ATen", input, dim, keepdim, operator_s="argmax")
@register_symbolic("avg_pool2d")
@ -169,13 +165,13 @@ def avg_pool2d(g, self, kernel_size, stride, padding, ceil_mode, count_include_p
ceil_mode,
count_include_pad,
divisor_override,
operator_s="aten::avg_pool2d",
operator_s="avg_pool2d",
)
@register_symbolic("adaptive_avg_pool2d")
def adaptive_avg_pool2d(g, self, output_size):
return g.op("org.pytorch.aten::ATen", self, output_size, operator_s="aten::_adaptive_avg_pool2d")
return g.op("org.pytorch.aten::ATen", self, output_size, operator_s="_adaptive_avg_pool2d")
@register_symbolic("binary_cross_entropy_with_logits")
@ -191,7 +187,7 @@ def binary_cross_entropy_with_logits(g, self, target, weight, pos_weight, reduct
weight,
pos_weight,
reduction,
operator_s="aten::binary_cross_entropy_with_logits",
operator_s="binary_cross_entropy_with_logits",
)
from torch.onnx.symbolic_opset12 import binary_cross_entropy_with_logits as bce
@ -209,7 +205,7 @@ def numpy_T(g, self):
else:
# if we don't have dim information we cannot
# output a permute so use ATen instead
return g.op("com.microsoft::ATenOp", self, name_s="aten::numpy_T")
return g.op("org.pytorch.aten::ATen", self, operator_s="numpy_T")
@register_symbolic("squeeze")

View file

@ -1,68 +0,0 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "orttraining/training_ops/cpu/aten_ops/aten_op.h"
#include "core/dlpack/dlpack_converter.h"
#include "core/framework/op_kernel_context_internal.h"
#include "orttraining/training_ops/cpu/aten_ops/aten_op_executor.h"
namespace onnxruntime {
namespace contrib {
// CPU registration of the ATen fallback kernel: domain kPytorchAtenDomain,
// available since opset version 1. The single type constraint "T" admits every
// tensor and sequence-of-tensor type, so any ATen operator can be routed
// through this one kernel.
ONNX_OPERATOR_KERNEL_EX(ATen, kPytorchAtenDomain, 1, kCpuExecutionProvider,
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::AllTensorAndSequenceTensorTypes()),
ATen);
// Executes the wrapped ATen operator identified by op_name_/overload_name_:
// every ORT input is converted to a DLPack tensor (nullptr keeps the slot for
// an absent optional input), the pack is dispatched through the
// ATenOperatorExecutor singleton, and each returned DLPack tensor is written
// back as the corresponding ORT output.
Status ATen::Compute(OpKernelContext* p_ctx) const {
auto* p_ctx_internal = static_cast<OpKernelContextInternal*>(p_ctx);
std::vector<DLManagedTensor*> dlpacks;
for (int i = 0; i < p_ctx_internal->InputCount(); i++) {
const OrtValue* p_ort_value = p_ctx_internal->GetInputMLValue(i);
if (!p_ort_value) {
// Absent optional input: keep positional alignment with a null entry.
dlpacks.emplace_back(nullptr);
} else {
// Copy the OrtValue so OrtValueToDlpack can take it as a non-const value.
OrtValue ort_value = *p_ort_value;
dlpacks.emplace_back(dlpack::OrtValueToDlpack(ort_value));
}
}
auto result = aten_ops::ATenOperatorExecutor::Instance()(op_name_, overload_name_, dlpacks);
// One output per returned DLPack tensor; DlpackToOrtValue adopts ownership.
for (size_t i = 0; i < result.size(); i++) {
ORT_RETURN_IF_ERROR(p_ctx_internal->SetOutputMLValue(static_cast<int>(i), dlpack::DlpackToOrtValue(result[i])));
}
return Status::OK();
}
// Reports whether an ATen executor callback has been registered with the
// ATenOperatorExecutor singleton (i.e. whether the fallback path is usable).
bool IsATenOperatorExecutorInitialized() {
  auto& executor = aten_ops::ATenOperatorExecutor::Instance();
  return executor.IsInitialized();
}
// Dispatches a ReduceSum to PyTorch's aten::sum.dim_IntList through the ATen
// executor. Input 0 plus two synthesized CPU tensors (the axes list and the
// keepdims flag) are packed as DLPack tensors; a trailing nullptr marks the
// optional dtype argument as absent. Requires the executor to be initialized
// and axes to be non-empty.
Status ExecuteReduceSumATen(OpKernelContext* p_ctx, const gsl::span<const int64_t>& axes, bool keepdims) {
ORT_ENFORCE(aten_ops::ATenOperatorExecutor::Instance().IsInitialized() && !axes.empty());
std::vector<DLManagedTensor*> dlpacks;
auto* p_ctx_internal = static_cast<OpKernelContextInternal*>(p_ctx);
OrtValue ort_value = *p_ctx_internal->GetInputMLValue(0);
dlpacks.emplace_back(dlpack::OrtValueToDlpack(ort_value));
OrtValue axes_tensor;
OrtValue keepdims_tensor;
// 1-D shapes: axes tensor holds axes.size() int64 values, keepdims a single bool.
TensorShapeVector axes_tensor_shape(1, static_cast<int64_t>(axes.size()));
TensorShapeVector keepdims_tensor_shape(1, 1);
auto ml_tensor = DataTypeImpl::GetType<Tensor>();
OrtMemoryInfo info("Cpu", OrtDeviceAllocator);
// NOTE(review): both tensors wrap caller-owned memory (the axes span and the
// local keepdims parameter) without copying — presumably safe because the
// executor call below completes before this function returns; confirm the
// executor does not retain the DLPack tensors past the call.
auto axes_tensor_obj = std::make_unique<Tensor>(DataTypeImpl::GetType<int64_t>(), axes_tensor_shape,
const_cast<void*>(reinterpret_cast<const void*>(&axes[0])), info);
axes_tensor.Init(axes_tensor_obj.release(), ml_tensor, ml_tensor->GetDeleteFunc());
auto keepdims_tensor_obj = std::make_unique<Tensor>(DataTypeImpl::GetType<bool>(), keepdims_tensor_shape, reinterpret_cast<void*>(&keepdims), info);
keepdims_tensor.Init(keepdims_tensor_obj.release(), ml_tensor, ml_tensor->GetDeleteFunc());
dlpacks.emplace_back(dlpack::OrtValueToDlpack(axes_tensor));
dlpacks.emplace_back(dlpack::OrtValueToDlpack(keepdims_tensor));
// Trailing nullptr: the optional dtype argument of aten::sum.dim_IntList is unset.
dlpacks.emplace_back(nullptr);
auto result = aten_ops::ATenOperatorExecutor::Instance()("aten::sum", "dim_IntList", dlpacks);
ORT_RETURN_IF_ERROR(p_ctx_internal->SetOutputMLValue(0, dlpack::DlpackToOrtValue(result[0])));
return Status::OK();
}
} // namespace contrib
} // namespace onnxruntime

View file

@ -98,7 +98,6 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Recv)
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, RecordEvent);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, WaitEvent);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, YieldOp);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kPytorchAtenDomain, 1, ATen);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, SummaryScalar);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, SummaryHistogram);
@ -208,7 +207,6 @@ Status RegisterCpuTrainingKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, RecordEvent)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, WaitEvent)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, YieldOp)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kPytorchAtenDomain, 1, ATen)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, SummaryScalar)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, SummaryHistogram)>,

View file

@ -1,16 +0,0 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "orttraining/training_ops/cpu/aten_ops/aten_op.h"
#include "core/providers/cuda/cuda_fwd.h"
namespace onnxruntime {
namespace cuda {
// CUDA registration of the ATen fallback kernel. It reuses the contrib (CPU)
// Compute implementation (onnxruntime::contrib::ATen); "T" admits all tensor
// and sequence-of-tensor types.
ONNX_OPERATOR_KERNEL_EX(ATen, kPytorchAtenDomain, 1, kCudaExecutionProvider,
(*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::AllTensorAndSequenceTensorTypes()),
onnxruntime::contrib::ATen);
} // namespace cuda
} // namespace onnxruntime
} // namespace onnxruntime

View file

@ -220,7 +220,6 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, Adas
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, RecordEvent);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, WaitEvent);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, YieldOp);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kPytorchAtenDomain, 1, ATen);
#ifdef ENABLE_TRAINING_TORCH_INTEROP
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, PythonOp);
@ -443,7 +442,6 @@ Status RegisterCudaTrainingKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, RecordEvent)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, WaitEvent)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, YieldOp)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kPytorchAtenDomain, 1, ATen)>,
#ifdef ENABLE_TRAINING_TORCH_INTEROP
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, PythonOp)>,

View file

@ -195,7 +195,6 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, Adas
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, RecordEvent);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, WaitEvent);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, YieldOp);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kPytorchAtenDomain, 1, ATen);
#ifdef ENABLE_TRAINING_TORCH_INTEROP
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, PythonOp);
@ -392,7 +391,6 @@ Status RegisterRocmTrainingKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, RecordEvent)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, WaitEvent)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, YieldOp)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kPytorchAtenDomain, 1, ATen)>,
#ifdef ENABLE_TRAINING_TORCH_INTEROP
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, PythonOp)>,

View file

@ -0,0 +1,77 @@
# CI pipeline for the ATen fallback feature: builds ORT with
# onnxruntime_ENABLE_ATEN=ON inside a manylinux docker image, installs the
# resulting wheel plus the torch C++ extensions, runs the ATen op tests, and
# finally re-runs the build with --test.
# BUGFIX: the leading indentation of this document had been stripped, which is
# structurally invalid YAML; conventional azure-pipelines indentation restored.
jobs:
- job: Linux_Build
  timeoutInMinutes: 120
  workspace:
    clean: all
  pool: Linux-CPU-2019
  steps:
  - checkout: self
    clean: true
    submodules: recursive

  - task: NodeTool@0
    inputs:
      versionSpec: '12.16.3'

  # Build (or pull) the docker image used by both build and test steps.
  - template: templates/get-docker-image-steps.yml
    parameters:
      Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_aten_cpu
      Context: tools/ci_build/github/linux/docker
      DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )"
      Repository: onnxruntimecpubuildaten

  - task: CmdLine@2
    displayName: 'build'
    inputs:
      script: |
        mkdir -p $HOME/.onnx
        docker run --rm \
          --volume /data/onnx:/data/onnx:ro \
          --volume $(Build.SourcesDirectory):/onnxruntime_src \
          --volume $(Build.BinariesDirectory):/build \
          --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
          -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \
          -e NIGHTLY_BUILD \
          -e BUILD_BUILDNUMBER \
          onnxruntimecpubuildaten \
          /opt/python/cp38-cp38/bin/python3 /onnxruntime_src/tools/ci_build/build.py \
            --build_dir /build --cmake_generator Ninja \
            --config Release \
            --skip_submodule_sync \
            --build_shared_lib \
            --parallel \
            --build_wheel \
            --skip_tests \
            --cmake_extra_defines onnxruntime_ENABLE_ATEN=ON
      workingDirectory: $(Build.SourcesDirectory)

  - task: CmdLine@2
    displayName: 'install ort_torch_ext and launch test'
    inputs:
      script: |
        mkdir -p $HOME/.onnx
        docker run --rm \
          --volume /data/onnx:/data/onnx:ro \
          --volume $(Build.SourcesDirectory):/onnxruntime_src \
          --volume $(Build.BinariesDirectory):/build \
          --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \
          -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \
          -e NIGHTLY_BUILD \
          -e BUILD_BUILDNUMBER \
          onnxruntimecpubuildaten \
          bash -c "rm -rf /build/Release/onnxruntime /build/Release/pybind11 && \
            /opt/python/cp38-cp38/bin/python3 -m pip install /build/Release/dist/*.whl && \
            /opt/python/cp38-cp38/bin/python3 -m pip install /onnxruntime_src/onnxruntime/python/torch_cpp_extensions && \
            /opt/python/cp38-cp38/bin/python3 /onnxruntime_src/onnxruntime/test/python/contrib_ops/aten_op_tests.py && \
            /opt/python/cp38-cp38/bin/python3 /onnxruntime_src/tools/ci_build/build.py \
              --build_dir /build --cmake_generator Ninja \
              --config Release \
              --skip_submodule_sync \
              --build_shared_lib \
              --parallel \
              --build_wheel \
              --test \
              --cmake_extra_defines onnxruntime_ENABLE_ATEN=ON"
      workingDirectory: $(Build.SourcesDirectory)

  - template: templates/clean-agent-build-directory-step.yml

View file

@ -0,0 +1,10 @@
# Build image for the ATen-fallback CPU CI, based on the manylinux2014 toolchain.
FROM quay.io/pypa/manylinux2014_x86_64:latest
ADD scripts /tmp/scripts
# Install CentOS packages plus the ATen-specific build deps, then remove the scripts.
RUN cd /tmp/scripts && /tmp/scripts/manylinux/install_centos.sh && /tmp/scripts/manylinux/install_deps_aten.sh && rm -rf /tmp/scripts
# Create a non-root build user; BUILD_UID can be overridden to match the host user.
ARG BUILD_UID=1001
ARG BUILD_USER=onnxruntimedev
RUN adduser --uid $BUILD_UID $BUILD_USER
WORKDIR /home/$BUILD_USER
USER $BUILD_USER

View file

@ -0,0 +1,112 @@
#!/bin/bash
# Installs build/test dependencies for the ATen-fallback CI image.
# -e: abort on first failure; -x: echo every command for CI logs.
set -e -x
# Development tools and libraries
yum -y install \
graphviz
# Download a file from internet
# GetFile <uri> <path> [force=false] [download_retries=5] [retry_wait_time_seconds=30]
# Fetch <uri> into <path>. A local <uri> is simply copied. A remote one is
# downloaded with aria2c when available, otherwise curl with retries. An
# existing <path> is left untouched unless force=true.
function GetFile {
  local uri=$1
  local path=$2
  local force=${3:-false}
  local download_retries=${4:-5}
  local retry_wait_time_seconds=${5:-30}

  # Destination already present: keep it unless the caller forces a refresh.
  if [[ -f $path ]] && [[ $force = false ]]; then
    echo "File '$path' already exists. Skipping download"
    return 0
  fi
  if [[ -f $path ]]; then
    rm -rf $path
  fi

  # A source that exists on disk is copied rather than downloaded.
  if [[ -f $uri ]]; then
    echo "'$uri' is a file path, copying file to '$path'"
    cp $uri $path
    return $?
  fi

  echo "Downloading $uri"
  # Use aria2c if available, otherwise use curl
  if command -v aria2c > /dev/null; then
    aria2c -q -d $(dirname $path) -o $(basename $path) "$uri"
  else
    curl "$uri" -sSL --retry $download_retries --retry-delay $retry_wait_time_seconds --create-dirs -o "$path" --fail
  fi
  return $?
}
# Choose the Python interpreters to provision: conda's python when a conda
# install is present, otherwise the manylinux CPython builds.
if [ ! -d "/opt/conda/bin" ]; then
  PYTHON_EXES=("/opt/python/cp37-cp37m/bin/python3.7" "/opt/python/cp38-cp38/bin/python3.8" "/opt/python/cp39-cp39/bin/python3.9")
else
  PYTHON_EXES=("/opt/conda/bin/python")
fi

os_major_version=$(cat /etc/redhat-release | tr -dc '0-9.'|cut -d \. -f1)
SYS_LONG_BIT=$(getconf LONG_BIT)
mkdir -p /tmp/src
GLIBC_VERSION=$(getconf GNU_LIBC_VERSION | cut -f 2 -d \.)
DISTRIBUTOR=$(lsb_release -i -s)

# 64-bit CentOS keeps libraries under lib64.
if [[ "$DISTRIBUTOR" = "CentOS" && $SYS_LONG_BIT = "64" ]]; then
  LIBDIR="lib64"
else
  LIBDIR="lib"
fi

cd /tmp/src

echo "Installing azcopy"
mkdir -p /tmp/azcopy
GetFile https://aka.ms/downloadazcopy-v10-linux /tmp/azcopy/azcopy.tar.gz
tar --strip 1 -xf /tmp/azcopy/azcopy.tar.gz -C /tmp/azcopy
cp /tmp/azcopy/azcopy /usr/bin

echo "Installing Ninja"
GetFile https://github.com/ninja-build/ninja/archive/v1.10.0.tar.gz /tmp/src/ninja-linux.tar.gz
tar -zxf ninja-linux.tar.gz
cd ninja-1.10.0
cmake -Bbuild-cmake -H.
cmake --build build-cmake
mv ./build-cmake/ninja /usr/bin

echo "Installing Node.js"
GetFile https://nodejs.org/dist/v16.14.2/node-v16.14.2-linux-x64.tar.gz /tmp/src/node-v16.14.2-linux-x64.tar.gz
tar --strip 1 -xf /tmp/src/node-v16.14.2-linux-x64.tar.gz -C /usr

cd /tmp/src
GetFile https://downloads.gradle-dn.com/distributions/gradle-6.3-bin.zip /tmp/src/gradle-6.3-bin.zip
unzip /tmp/src/gradle-6.3-bin.zip
mv /tmp/src/gradle-6.3 /usr/local/gradle

# Build protobuf from source only when protoc is not already on PATH.
# (${0/%.../...} rewrites this script's own path to its sibling script.)
if ! [ -x "$(command -v protoc)" ]; then
  source ${0/%install_deps_aten\.sh/..\/install_protobuf.sh}
fi

export ONNX_ML=1
export CMAKE_ARGS="-DONNX_GEN_PB_TYPE_STUBS=OFF -DONNX_WERROR=OFF"

for PYTHON_EXE in "${PYTHON_EXES[@]}"
do
  ${PYTHON_EXE} -m pip install -r ${0/%install_deps_aten\.sh/requirements\.txt}
  # BUGFIX: the original wrote "if ![[ ... ]]"; "![[" (no space) is parsed as a
  # single, nonexistent command name, so under "set -e" the script aborted here.
  # The negated equality test belongs inside [[ ]] as "!=".
  if [[ ${PYTHON_EXE} != "/opt/python/cp310-cp310/bin/python3.10" ]]; then
    ${PYTHON_EXE} -m pip install -r ${0/%install_deps_aten\.sh/..\/training\/ortmodule\/stage1\/requirements_torch_cpu\/requirements.txt}
  else
    # Python 3.10 has no pinned requirements file yet; install torch directly.
    ${PYTHON_EXE} -m pip install torch==1.11.0
  fi
done

cd /tmp/src
GetFile 'https://sourceware.org/pub/valgrind/valgrind-3.16.1.tar.bz2' /tmp/src/valgrind-3.16.1.tar.bz2
tar -jxvf valgrind-3.16.1.tar.bz2
cd valgrind-3.16.1
./configure --prefix=/usr --libdir=/usr/lib64 --enable-only64bit --enable-tls
make -j$(getconf _NPROCESSORS_ONLN)
make install

cd /
rm -rf /tmp/src