mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-30 23:18:20 +00:00
Merge remote-tracking branch 'origin/master' into scmckay/UpdateCudaInfoInBuildMd
This commit is contained in:
commit
4dd1e50aa7
8 changed files with 94 additions and 41 deletions
|
|
@ -110,6 +110,13 @@ else()
|
|||
add_definitions(-DUSE_OPENMP)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
|
||||
#For Mac compliance
|
||||
message("Adding flags for Mac builds")
|
||||
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector-strong")
|
||||
endif()
|
||||
|
||||
find_package(PNG)
|
||||
set(ENABLE_DATE_TESTING OFF CACHE BOOL "" FORCE)
|
||||
set(USE_SYSTEM_TZ_DB ON CACHE BOOL "" FORCE)
|
||||
|
|
|
|||
13
cmake/external/mkldnn.cmake
vendored
13
cmake/external/mkldnn.cmake
vendored
|
|
@ -11,6 +11,9 @@ set(MKLDNN_INCLUDE_DIR ${MKLDNN_INSTALL}/include)
|
|||
# patch for mkldnn_sgemm thread safety bug.
|
||||
# it can be removed once a fix is available in a validated mkldnn release version.
|
||||
set(MKLDNN_PATCH_COMMAND1 git apply ${CMAKE_SOURCE_DIR}/patches/mkldnn/mkldnn_sgemm.patch)
|
||||
set(MKLDNN_PATCH_COMMAND2 git apply ${CMAKE_SOURCE_DIR}/patches/mkldnn/platform.cmake.patch)
|
||||
# discard prior changes due to patching in mkldnn source to unblock incremental builds.
|
||||
set(MKLDNN_PATCH_DISCARD_COMMAND cd ${MKLDNN_SOURCE} && git checkout -- .)
|
||||
|
||||
if(WIN32)
|
||||
set(MKLDNN_SHARED_LIB mkldnn.dll)
|
||||
|
|
@ -20,15 +23,17 @@ if(WIN32)
|
|||
set(MKLML_SHARED_LIB mklml.dll)
|
||||
set(IOMP5MD_SHARED_LIB libiomp5md.dll)
|
||||
endif()
|
||||
set(MKLDNN_PATCH_COMMAND2 "")
|
||||
else()
|
||||
set(MKLDNN_SHARED_LIB libmkldnn.so.0)
|
||||
if (APPLE)
|
||||
set(MKLDNN_SHARED_LIB libmkldnn.0.dylib)
|
||||
else()
|
||||
set(MKLDNN_SHARED_LIB libmkldnn.so.0)
|
||||
endif()
|
||||
if(onnxruntime_USE_MKLML)
|
||||
set(DOWNLOAD_MKLML ${MKLDNN_SOURCE}/scripts/prepare_mkl.sh)
|
||||
set(MKLML_SHARED_LIB libmklml_intel.so)
|
||||
set(IOMP5MD_SHARED_LIB libiomp5.so)
|
||||
endif()
|
||||
set(MKLDNN_PATCH_COMMAND2 git apply ${CMAKE_SOURCE_DIR}/patches/mkldnn/platform.cmake.patch)
|
||||
endif()
|
||||
|
||||
if(NOT onnxruntime_USE_MKLDNN OR EXISTS ${MKLDNN_SOURCE}/external)
|
||||
|
|
@ -39,7 +44,7 @@ ExternalProject_Add(project_mkldnn
|
|||
PREFIX mkl-dnn
|
||||
GIT_REPOSITORY ${MKLDNN_URL}
|
||||
GIT_TAG ${MKLDNN_TAG}
|
||||
PATCH_COMMAND ${DOWNLOAD_MKLML} COMMAND ${MKLDNN_PATCH_COMMAND1} COMMAND ${MKLDNN_PATCH_COMMAND2}
|
||||
PATCH_COMMAND ${DOWNLOAD_MKLML} COMMAND ${MKLDNN_PATCH_DISCARD_COMMAND} COMMAND ${MKLDNN_PATCH_COMMAND1} COMMAND ${MKLDNN_PATCH_COMMAND2}
|
||||
SOURCE_DIR ${MKLDNN_SOURCE}
|
||||
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL}
|
||||
)
|
||||
|
|
|
|||
|
|
@ -75,13 +75,16 @@ add_dependencies(onnxruntime_pybind11_state ${onnxruntime_pybind11_state_depende
|
|||
if (MSVC)
|
||||
# if MSVC, pybind11 looks for release version of python lib (pybind11/detail/common.h undefs _DEBUG)
|
||||
target_link_libraries(onnxruntime_pybind11_state ${onnxruntime_pybind11_state_libs} ${onnxruntime_EXTERNAL_LIBRARIES} ${PYTHON_LIBRARY_RELEASE} ${ONNXRUNTIME_SO_LINK_FLAG})
|
||||
elseif (APPLE)
|
||||
set_target_properties(onnxruntime_pybind11_state PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
|
||||
target_link_libraries(onnxruntime_pybind11_state ${onnxruntime_pybind11_state_libs} ${onnxruntime_EXTERNAL_LIBRARIES} ${ONNXRUNTIME_SO_LINK_FLAG})
|
||||
set_target_properties(onnxruntime_pybind11_state PROPERTIES
|
||||
INSTALL_RPATH "@loader_path"
|
||||
BUILD_WITH_INSTALL_RPATH TRUE
|
||||
INSTALL_RPATH_USE_LINK_PATH FALSE)
|
||||
else()
|
||||
target_link_libraries(onnxruntime_pybind11_state ${onnxruntime_pybind11_state_libs} ${onnxruntime_EXTERNAL_LIBRARIES} ${PYTHON_LIBRARY} ${ONNXRUNTIME_SO_LINK_FLAG})
|
||||
if (APPLE)
|
||||
set_target_properties(onnxruntime_pybind11_state PROPERTIES INSTALL_RPATH "@loader_path")
|
||||
else()
|
||||
set_target_properties(onnxruntime_pybind11_state PROPERTIES LINK_FLAGS "-Xlinker -rpath=\$ORIGIN")
|
||||
endif()
|
||||
set_target_properties(onnxruntime_pybind11_state PROPERTIES LINK_FLAGS "-Xlinker -rpath=\$ORIGIN")
|
||||
endif()
|
||||
|
||||
set_target_properties(onnxruntime_pybind11_state PROPERTIES PREFIX "")
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
# ONNX Runtime High Level Design
|
||||
|
||||
This document outlines the high level design of
|
||||
ONNXRuntime - a high performance, cross platform engine.
|
||||
ONNX Runtime - a high performance, cross platform engine.
|
||||
|
||||
## Key objectives
|
||||
* Maximally and automatically leverage the custom accelerators and runtimes
|
||||
|
|
@ -10,8 +10,8 @@ available on disparate platforms.
|
|||
runtimes. We call this abstraction an [execution
|
||||
provider](../include/onnxruntime/core/framework/execution_provider.h). It defines and exposes a set of
|
||||
its capabilities to ONNXRuntime: a set of single or fused nodes it can
|
||||
execute, its memory allocator and more. Custom accelerators and runtimes are
|
||||
instances of execution provider.
|
||||
execute, its memory allocator, and more. Custom accelerators and runtimes are
|
||||
instances of execution providers.
|
||||
* We don't expect that an execution provider can always run an ONNX model fully
|
||||
on its device. This means that ONNXRuntime must be able to execute a single
|
||||
model in a heterogeneous environment involving multiple execution providers.
|
||||
|
|
@ -35,46 +35,45 @@ provider using the GetCapability() API.
|
|||
|
||||

|
||||
|
||||
*Note: TensorRT and nGraph support in the works.*
|
||||
*Note: TensorRT and nGraph support is in progress.*
|
||||
|
||||
### More about partitioning
|
||||
ONNXRuntime partitions a model graph based on the available execution providers
|
||||
into subgraphs, each for a distinct provider respectively. ONNXRuntime provides
|
||||
a default execution provider that is used for fallback execution for the
|
||||
ONNXRuntime partitions a model graph into subgraphs based on the available execution providers, one for each distinct provider. ONNXRuntime provides
|
||||
a default execution provider that is used as the fallback execution for the
|
||||
operators that cannot be pushed onto the more specialized but more efficient
|
||||
execution providers. Intuitively we probably want to push computation to the
|
||||
specialized execution providers as much as possible.
|
||||
execution providers. Intuitively we want to push computation to more
|
||||
specialized execution providers whenever possible.
|
||||
|
||||
We use a simple graph partitioning technique. The available execution providers
|
||||
will be considered in a specific order, and each will be assigned the maximal
|
||||
subgraphs (possibly more than one) that it is able to handle. The
|
||||
ONNXRuntime-provided default execution provider will be the last one to be
|
||||
ONNXRuntime-provided default execution provider will be the last one
|
||||
considered, and it ensures completeness. More sophisticated optimizations can be
|
||||
considered in the future (or can even be implemented as a composite execution
|
||||
provider).
|
||||
|
||||
Conceptually, each partition is reduced to a single fused operator. It is
|
||||
created by invoking the execution provider's Compile() method and wrap it as a
|
||||
created by invoking the execution provider's Compile() method and wrapping it as a
|
||||
custom operator. Currently we support only synchronous mode of execution. An execution
|
||||
provider exposes its memory allocator, which is used to allocate the input
|
||||
tensors for the execution provider. The rewriting and partitioning transform the
|
||||
initial model graph into a new graph composed with operators assigned to either
|
||||
initial model graph into a new graph composed of operators assigned to either
|
||||
the default execution provider or other registered execution
|
||||
providers. ONNXRuntime execution engine is responsible for running this graph.
|
||||
providers. The ONNXRuntime execution engine is responsible for running this graph.
|
||||
|
||||
## Key design decisions
|
||||
* Multiple threads should be able to inovke the Run() method on the same
|
||||
* Multiple threads can invoke the Run() method on the same
|
||||
inference session object. See [API doc](C_API.md) for more details.
|
||||
* To facilitate the above the Compute() function of all kernels is const
|
||||
* To facilitate this, the Compute() function of all kernels is const
|
||||
implying the kernels are stateless.
|
||||
* We call implementations of the operators by execution providers as
|
||||
* Implementations of the operators by execution providers are called
|
||||
kernels. Each execution provider supports a subset of the (ONNX)
|
||||
operators/kernels.
|
||||
* ONNXRuntime runtime guarantees that all operators are supported by the default
|
||||
* The ONNXRuntime runtime guarantees that all operators are supported by the default
|
||||
execution provider.
|
||||
* Tensor representation: ONNXRuntime will utilize a standard representation for
|
||||
the tensor runtime values. The execution providers can internally use a
|
||||
different representation, if they choose to, but it is their responsibility to
|
||||
different representation if they choose to, but it is their responsibility to
|
||||
convert the values from/to the standard representation at the boundaries of
|
||||
their subgraph.
|
||||
|
||||
|
|
|
|||
|
|
@ -34,14 +34,20 @@ REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceSumSquare, 1);
|
|||
REGISTER_UNARY_ELEMENTWISE_KERNEL(ArgMax, 1);
|
||||
REGISTER_UNARY_ELEMENTWISE_KERNEL(ArgMin, 1);
|
||||
|
||||
// When all reduce axes are located at the tail of the dims (a quite common case), the transpose and extra
|
||||
// copy can be skipped to improve performance, if requested via check_no_transpose = true.
|
||||
// return value: true means transposedInputData is not created/copied, input tensor data could
|
||||
// be used directly as a row-major matrix [block_size, blocks], where blocks is the
|
||||
// size of each reduce.
|
||||
template <typename T>
|
||||
void PrepareForReduce(OpKernelContext* ctx,
|
||||
bool PrepareForReduce(OpKernelContext* ctx,
|
||||
std::vector<T>& transposedInputData,
|
||||
Tensor** reducedTensor,
|
||||
int64_t& block_size,
|
||||
int64_t& blocks,
|
||||
const std::vector<int64_t>& axes_,
|
||||
bool keepdims_) {
|
||||
bool keepdims_,
|
||||
bool check_no_transpose = false) {
|
||||
const Tensor* input_tensor_ptr = ctx->Input<Tensor>(0);
|
||||
ONNXRUNTIME_ENFORCE(input_tensor_ptr != nullptr);
|
||||
const Tensor& input = *input_tensor_ptr;
|
||||
|
|
@ -51,8 +57,6 @@ void PrepareForReduce(OpKernelContext* ctx,
|
|||
ONNXRUNTIME_ENFORCE(axe >= 0 && axe < (int64_t)ndim, "Axis attribute out of range");
|
||||
}
|
||||
|
||||
transposedInputData.resize(input.Shape().Size(), 0);
|
||||
|
||||
std::vector<int64_t> axes = axes_;
|
||||
if (axes.empty()) {
|
||||
// This is the default case for non-arg kind reductions. Reduce on all dimensions.
|
||||
|
|
@ -62,6 +66,13 @@ void PrepareForReduce(OpKernelContext* ctx,
|
|||
|
||||
std::sort(axes.begin(), axes.end());
|
||||
|
||||
// If all reduced axes are located at the tail of the input shape, the copy can be skipped if requested
|
||||
bool need_copy = true;
|
||||
if (axes.size() <= ndim && axes.front() == static_cast<int64_t>(ndim - axes.size())
|
||||
&& axes.back() == static_cast<int64_t>(ndim) - 1) {
|
||||
need_copy = false;
|
||||
}
|
||||
|
||||
vector<bool> keep_axis(ndim, true);
|
||||
for (auto i : axes) {
|
||||
keep_axis[i] = false;
|
||||
|
|
@ -96,7 +107,6 @@ void PrepareForReduce(OpKernelContext* ctx,
|
|||
}
|
||||
|
||||
const T* from_data = input.template Data<T>();
|
||||
T* to_data = &transposedInputData[0];
|
||||
size_t count = input.Shape().Size();
|
||||
|
||||
//set to-be-reduced axes to one. squeeze if keepdims_ is false
|
||||
|
|
@ -117,9 +127,15 @@ void PrepareForReduce(OpKernelContext* ctx,
|
|||
block_size = input.Shape().Size() / first_dim;
|
||||
blocks = first_dim;
|
||||
|
||||
if (!need_copy && check_no_transpose) {
|
||||
return true;
|
||||
}
|
||||
|
||||
transposedInputData.resize(input.Shape().Size(), 0);
|
||||
T* to_data = &transposedInputData[0];
|
||||
if (num_axes < 2 || n_shared_idxs == num_axes) {
|
||||
memcpy(to_data, from_data, count * sizeof(T));
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
|
||||
int itr_axes = num_axes - n_shared_idxs;
|
||||
|
|
@ -178,6 +194,7 @@ void PrepareForReduce(OpKernelContext* ctx,
|
|||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
|
@ -272,12 +289,22 @@ Status ReduceMean<T>::Compute(OpKernelContext* ctx) const {
|
|||
std::vector<T> transposedInputData;
|
||||
int64_t block_size, blocks;
|
||||
Tensor* reduced;
|
||||
PrepareForReduce<T>(ctx, transposedInputData, &reduced, block_size, blocks, axes_, keepdims_);
|
||||
bool no_transpose = PrepareForReduce<T>(ctx, transposedInputData, &reduced, block_size, blocks, axes_, keepdims_, true);
|
||||
|
||||
T* output_data = reduced->template MutableData<T>();
|
||||
|
||||
EigenVectorMap<T> out_vec(output_data, block_size);
|
||||
out_vec = ConstEigenMatrixMap<T>(&transposedInputData[0], block_size, blocks).rowwise().mean();
|
||||
if (no_transpose) {
|
||||
const T* input_data = ctx->Input<Tensor>(0)->template Data<T>();
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int64_t i = 0; i < block_size; ++i) {
|
||||
output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).mean();
|
||||
}
|
||||
}
|
||||
else {
|
||||
EigenVectorMap<T> out_vec(output_data, block_size);
|
||||
out_vec = ConstEigenMatrixMap<T>(&transposedInputData[0], block_size, blocks).rowwise().mean();
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
|
@ -317,12 +344,22 @@ Status ReduceSum<T>::Compute(OpKernelContext* ctx) const {
|
|||
std::vector<T> transposedInputData;
|
||||
int64_t block_size, blocks;
|
||||
Tensor* reduced;
|
||||
PrepareForReduce<T>(ctx, transposedInputData, &reduced, block_size, blocks, axes_, keepdims_);
|
||||
bool no_transpose = PrepareForReduce<T>(ctx, transposedInputData, &reduced, block_size, blocks, axes_, keepdims_, true);
|
||||
|
||||
T* output_data = reduced->template MutableData<T>();
|
||||
|
||||
EigenVectorMap<T> out_vec(output_data, block_size);
|
||||
out_vec = ConstEigenMatrixMap<T>(&transposedInputData[0], block_size, blocks).rowwise().sum();
|
||||
if (no_transpose) {
|
||||
const T* input_data = ctx->Input<Tensor>(0)->template Data<T>();
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int64_t i = 0; i < block_size; ++i) {
|
||||
output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).sum();
|
||||
}
|
||||
}
|
||||
else {
|
||||
EigenVectorMap<T> out_vec(output_data, block_size);
|
||||
out_vec = ConstEigenMatrixMap<T>(&transposedInputData[0], block_size, blocks).rowwise().sum();
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
|
|
|||
2
setup.py
2
setup.py
|
|
@ -25,6 +25,8 @@ except ImportError:
|
|||
# Additional binaries
|
||||
if platform.system() == 'Linux':
|
||||
libs = ['onnxruntime_pybind11_state.so', 'libmkldnn.so.0', 'libmklml_intel.so', 'libiomp5.so']
|
||||
elif platform.system() == "Darwin":
|
||||
libs = ['onnxruntime_pybind11_state.so', 'libmkldnn.0.dylib'] # TODO add libmklml and libiomp5 later.
|
||||
else:
|
||||
libs = ['onnxruntime_pybind11_state.pyd', 'mkldnn.dll', 'mklml.dll', 'libiomp5md.dll']
|
||||
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ jobs:
|
|||
pool: Linux-CPU
|
||||
|
||||
steps:
|
||||
- script: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory)'
|
||||
- script: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory) -x "--use_mklml"'
|
||||
displayName: 'Command Line Script'
|
||||
env:
|
||||
AZURE_BLOB_KEY: $(onnxruntime-storage-key)
|
||||
|
|
|
|||
|
|
@ -38,6 +38,6 @@ else
|
|||
--config Debug Release --build_shared_lib \
|
||||
--skip_submodule_sync \
|
||||
--enable_pybind \
|
||||
--parallel --use_mkldnn --use_mklml --build_shared_lib $BUILD_EXTR_PAR
|
||||
--parallel --use_mkldnn --build_shared_lib $BUILD_EXTR_PAR
|
||||
/home/onnxruntimedev/Release/onnx_test_runner /data/onnx
|
||||
fi
|
||||
|
|
|
|||
Loading…
Reference in a new issue