Merge remote-tracking branch 'origin/master' into scmckay/UpdateCudaInfoInBuildMd

This commit is contained in:
Scott McKay 2018-11-29 14:53:11 +10:00
commit 4dd1e50aa7
8 changed files with 94 additions and 41 deletions

View file

@ -110,6 +110,13 @@ else()
add_definitions(-DUSE_OPENMP)
endif()
endif()
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
# Runs only when CMAKE_SYSTEM_NAME matches "Darwin": appends -fstack-protector-strong
# to the global C++ compiler flags, so the flag applies to C++ compilation that
# picks up CMAKE_CXX_FLAGS after this point.
# NOTE(review): the original comment said only "For Mac compliance"; which
# compliance requirement this flag satisfies is not stated here - confirm.
message("Adding flags for Mac builds")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstack-protector-strong")
endif()
find_package(PNG)
set(ENABLE_DATE_TESTING OFF CACHE BOOL "" FORCE)
set(USE_SYSTEM_TZ_DB ON CACHE BOOL "" FORCE)

View file

@ -11,6 +11,9 @@ set(MKLDNN_INCLUDE_DIR ${MKLDNN_INSTALL}/include)
# patch for mkldnn_sgemm thread safety bug.
# it can be removed once a fix is available in a validated mkldnn release version.
set(MKLDNN_PATCH_COMMAND1 git apply ${CMAKE_SOURCE_DIR}/patches/mkldnn/mkldnn_sgemm.patch)
set(MKLDNN_PATCH_COMMAND2 git apply ${CMAKE_SOURCE_DIR}/patches/mkldnn/platform.cmake.patch)
# discard prior changes due to patching in mkldnn source to unblock incremental builds.
set(MKLDNN_PATCH_DISCARD_COMMAND cd ${MKLDNN_SOURCE} && git checkout -- .)
if(WIN32)
set(MKLDNN_SHARED_LIB mkldnn.dll)
@ -20,15 +23,17 @@ if(WIN32)
set(MKLML_SHARED_LIB mklml.dll)
set(IOMP5MD_SHARED_LIB libiomp5md.dll)
endif()
set(MKLDNN_PATCH_COMMAND2 "")
else()
set(MKLDNN_SHARED_LIB libmkldnn.so.0)
if (APPLE)
set(MKLDNN_SHARED_LIB libmkldnn.0.dylib)
else()
set(MKLDNN_SHARED_LIB libmkldnn.so.0)
endif()
if(onnxruntime_USE_MKLML)
set(DOWNLOAD_MKLML ${MKLDNN_SOURCE}/scripts/prepare_mkl.sh)
set(MKLML_SHARED_LIB libmklml_intel.so)
set(IOMP5MD_SHARED_LIB libiomp5.so)
endif()
set(MKLDNN_PATCH_COMMAND2 git apply ${CMAKE_SOURCE_DIR}/patches/mkldnn/platform.cmake.patch)
endif()
if(NOT onnxruntime_USE_MKLDNN OR EXISTS ${MKLDNN_SOURCE}/external)
@ -39,7 +44,7 @@ ExternalProject_Add(project_mkldnn
PREFIX mkl-dnn
GIT_REPOSITORY ${MKLDNN_URL}
GIT_TAG ${MKLDNN_TAG}
PATCH_COMMAND ${DOWNLOAD_MKLML} COMMAND ${MKLDNN_PATCH_COMMAND1} COMMAND ${MKLDNN_PATCH_COMMAND2}
PATCH_COMMAND ${DOWNLOAD_MKLML} COMMAND ${MKLDNN_PATCH_DISCARD_COMMAND} COMMAND ${MKLDNN_PATCH_COMMAND1} COMMAND ${MKLDNN_PATCH_COMMAND2}
SOURCE_DIR ${MKLDNN_SOURCE}
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL}
)

View file

@ -75,13 +75,16 @@ add_dependencies(onnxruntime_pybind11_state ${onnxruntime_pybind11_state_depende
if (MSVC)
# if MSVC, pybind11 looks for release version of python lib (pybind11/detail/common.h undefs _DEBUG)
target_link_libraries(onnxruntime_pybind11_state ${onnxruntime_pybind11_state_libs} ${onnxruntime_EXTERNAL_LIBRARIES} ${PYTHON_LIBRARY_RELEASE} ${ONNXRUNTIME_SO_LINK_FLAG})
elseif (APPLE)
set_target_properties(onnxruntime_pybind11_state PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
target_link_libraries(onnxruntime_pybind11_state ${onnxruntime_pybind11_state_libs} ${onnxruntime_EXTERNAL_LIBRARIES} ${ONNXRUNTIME_SO_LINK_FLAG})
set_target_properties(onnxruntime_pybind11_state PROPERTIES
INSTALL_RPATH "@loader_path"
BUILD_WITH_INSTALL_RPATH TRUE
INSTALL_RPATH_USE_LINK_PATH FALSE)
else()
target_link_libraries(onnxruntime_pybind11_state ${onnxruntime_pybind11_state_libs} ${onnxruntime_EXTERNAL_LIBRARIES} ${PYTHON_LIBRARY} ${ONNXRUNTIME_SO_LINK_FLAG})
if (APPLE)
set_target_properties(onnxruntime_pybind11_state PROPERTIES INSTALL_RPATH "@loader_path")
else()
set_target_properties(onnxruntime_pybind11_state PROPERTIES LINK_FLAGS "-Xlinker -rpath=\$ORIGIN")
endif()
set_target_properties(onnxruntime_pybind11_state PROPERTIES LINK_FLAGS "-Xlinker -rpath=\$ORIGIN")
endif()
set_target_properties(onnxruntime_pybind11_state PROPERTIES PREFIX "")

View file

@ -1,7 +1,7 @@
# ONNX Runtime High Level Design
This document outlines the high level design of
ONNXRuntime - a high performance, cross platform engine.
ONNX Runtime - a high performance, cross platform engine.
## Key objectives
* Maximally and automatically leverage the custom accelerators and runtimes
@ -10,8 +10,8 @@ available on disparate platforms.
runtimes. We call this abstraction an [execution
provider](../include/onnxruntime/core/framework/execution_provider.h). It defines and exposes a set of
its capabilities to ONNXRuntime: a set of single or fused nodes it can
execute, its memory allocator and more. Custom accelerators and runtimes are
instances of execution provider.
execute, its memory allocator, and more. Custom accelerators and runtimes are
instances of execution providers.
* We don't expect that an execution provider can always run an ONNX model fully
on its device. This means that ONNXRuntime must be able to execute a single
model in a heterogeneous environment involving multiple execution providers.
@ -35,46 +35,45 @@ provider using the GetCapability() API.
![ONNXRuntime high level system architecture](https://azurecomcdn.azureedge.net/mediahandler/acomblog/media/Default/blog/228d22d3-6e3e-48b1-811c-1d48353f031c.png)
*Note: TensorRT and nGraph support in the works.*
*Note: TensorRT and nGraph support are in progress*
### More about partitioning
ONNXRuntime partitions a model graph based on the available execution providers
into subgraphs, each for a distinct provider respectively. ONNXRuntime provides
a default execution provider that is used for fallback execution for the
ONNXRuntime partitions a model graph into subgraphs based on the available execution providers, one for each distinct provider. ONNXRuntime provides
a default execution provider that is used as the fallback execution for the
operators that cannot be pushed onto the more specialized but more efficient
execution providers. Intuitively we probably want to push computation to the
specialized execution providers as much as possible.
execution providers. Intuitively we want to push computation to more
specialized execution providers whenever possible.
We use a simple graph partitioning technique. The available execution providers
will be considered in a specific order, and each will be assigned the maximal
subgraphs (possibly more than one) that it is able to handle. The
ONNXRuntime-provided default execution provider will be the last one to be
ONNXRuntime-provided default execution provider will be the last one
considered, and it ensures completeness. More sophisticated optimizations can be
considered in the future (or can even be implemented as a composite execution
provider).
Conceptually, each partition is reduced to a single fused operator. It is
created by invoking the execution provider's Compile() method and wrap it as a
created by invoking the execution provider's Compile() method and wrapping it as a
custom operator. Currently we support only the synchronous mode of execution. An execution
provider exposes its memory allocator, which is used to allocate the input
tensors for the execution provider. The rewriting and partitioning transform the
initial model graph into a new graph composed with operators assigned to either
initial model graph into a new graph composed of operators assigned to either
the default execution provider or other registered execution
providers. ONNXRuntime execution engine is responsible for running this graph.
providers. The ONNXRuntime execution engine is responsible for running this graph.
## Key design decisions
* Multiple threads should be able to inovke the Run() method on the same
* Multiple threads can invoke the Run() method on the same
inference session object. See [API doc](C_API.md) for more details.
* To facilitate the above the Compute() function of all kernels is const
* To facilitate this, the Compute() function of all kernels is const,
implying the kernels are stateless.
* We call implementations of the operators by execution providers as
* Implementations of the operators by execution providers are called
kernels. Each execution provider supports a subset of the (ONNX)
operators/kernels.
* ONNXRuntime runtime guarantees that all operators are supported by the default
* The ONNXRuntime runtime guarantees that all operators are supported by the default
execution provider.
* Tensor representation: ONNXRuntime will utilize a standard representation for
the tensor runtime values. The execution providers can internally use a
different representation, if they choose to, but it is their responsibility to
different representation if they choose to, but it is their responsibility to
convert the values from/to the standard representation at the boundaries of
their subgraph.

View file

@ -34,14 +34,20 @@ REGISTER_UNARY_ELEMENTWISE_KERNEL(ReduceSumSquare, 1);
REGISTER_UNARY_ELEMENTWISE_KERNEL(ArgMax, 1);
REGISTER_UNARY_ELEMENTWISE_KERNEL(ArgMin, 1);
// When all reduced axes are located at the tail of the dims (a quite common case), the transpose and
// extra copy can be skipped to improve performance, if required by check_no_transpose = true.
// Return value: true means transposedInputData is not created/copied, and the input tensor data can
// be used directly as a row-major matrix [block_size, blocks], where blocks is the
// size of each reduce.
template <typename T>
void PrepareForReduce(OpKernelContext* ctx,
bool PrepareForReduce(OpKernelContext* ctx,
std::vector<T>& transposedInputData,
Tensor** reducedTensor,
int64_t& block_size,
int64_t& blocks,
const std::vector<int64_t>& axes_,
bool keepdims_) {
bool keepdims_,
bool check_no_transpose = false) {
const Tensor* input_tensor_ptr = ctx->Input<Tensor>(0);
ONNXRUNTIME_ENFORCE(input_tensor_ptr != nullptr);
const Tensor& input = *input_tensor_ptr;
@ -51,8 +57,6 @@ void PrepareForReduce(OpKernelContext* ctx,
ONNXRUNTIME_ENFORCE(axe >= 0 && axe < (int64_t)ndim, "Axis attribute out of range");
}
transposedInputData.resize(input.Shape().Size(), 0);
std::vector<int64_t> axes = axes_;
if (axes.empty()) {
// This is the default case for non-arg kind reductions. Reduce on all dimensions.
@ -62,6 +66,13 @@ void PrepareForReduce(OpKernelContext* ctx,
std::sort(axes.begin(), axes.end());
// If all reduced axes are located at the tail of the input shape, then the copy can be skipped if required
bool need_copy = true;
if (axes.size() <= ndim && axes.front() == static_cast<int64_t>(ndim - axes.size())
&& axes.back() == static_cast<int64_t>(ndim) - 1) {
need_copy = false;
}
vector<bool> keep_axis(ndim, true);
for (auto i : axes) {
keep_axis[i] = false;
@ -96,7 +107,6 @@ void PrepareForReduce(OpKernelContext* ctx,
}
const T* from_data = input.template Data<T>();
T* to_data = &transposedInputData[0];
size_t count = input.Shape().Size();
//set to-be-reduced axes to one. squeeze if keepdims_ is false
@ -117,9 +127,15 @@ void PrepareForReduce(OpKernelContext* ctx,
block_size = input.Shape().Size() / first_dim;
blocks = first_dim;
if (!need_copy && check_no_transpose) {
return true;
}
transposedInputData.resize(input.Shape().Size(), 0);
T* to_data = &transposedInputData[0];
if (num_axes < 2 || n_shared_idxs == num_axes) {
memcpy(to_data, from_data, count * sizeof(T));
return;
return false;
}
int itr_axes = num_axes - n_shared_idxs;
@ -178,6 +194,7 @@ void PrepareForReduce(OpKernelContext* ctx,
}
}
}
return false;
}
template <typename T>
@ -272,12 +289,22 @@ Status ReduceMean<T>::Compute(OpKernelContext* ctx) const {
std::vector<T> transposedInputData;
int64_t block_size, blocks;
Tensor* reduced;
PrepareForReduce<T>(ctx, transposedInputData, &reduced, block_size, blocks, axes_, keepdims_);
bool no_transpose = PrepareForReduce<T>(ctx, transposedInputData, &reduced, block_size, blocks, axes_, keepdims_, true);
T* output_data = reduced->template MutableData<T>();
EigenVectorMap<T> out_vec(output_data, block_size);
out_vec = ConstEigenMatrixMap<T>(&transposedInputData[0], block_size, blocks).rowwise().mean();
if (no_transpose) {
const T* input_data = ctx->Input<Tensor>(0)->template Data<T>();
#pragma omp parallel for
for (int64_t i = 0; i < block_size; ++i) {
output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).mean();
}
}
else {
EigenVectorMap<T> out_vec(output_data, block_size);
out_vec = ConstEigenMatrixMap<T>(&transposedInputData[0], block_size, blocks).rowwise().mean();
}
return Status::OK();
}
@ -317,12 +344,22 @@ Status ReduceSum<T>::Compute(OpKernelContext* ctx) const {
std::vector<T> transposedInputData;
int64_t block_size, blocks;
Tensor* reduced;
PrepareForReduce<T>(ctx, transposedInputData, &reduced, block_size, blocks, axes_, keepdims_);
bool no_transpose = PrepareForReduce<T>(ctx, transposedInputData, &reduced, block_size, blocks, axes_, keepdims_, true);
T* output_data = reduced->template MutableData<T>();
EigenVectorMap<T> out_vec(output_data, block_size);
out_vec = ConstEigenMatrixMap<T>(&transposedInputData[0], block_size, blocks).rowwise().sum();
if (no_transpose) {
const T* input_data = ctx->Input<Tensor>(0)->template Data<T>();
#pragma omp parallel for
for (int64_t i = 0; i < block_size; ++i) {
output_data[i] = ConstEigenVectorMap<T>(input_data + (i * blocks), blocks).sum();
}
}
else {
EigenVectorMap<T> out_vec(output_data, block_size);
out_vec = ConstEigenMatrixMap<T>(&transposedInputData[0], block_size, blocks).rowwise().sum();
}
return Status::OK();
}

View file

@ -25,6 +25,8 @@ except ImportError:
# Additional binaries
if platform.system() == 'Linux':
libs = ['onnxruntime_pybind11_state.so', 'libmkldnn.so.0', 'libmklml_intel.so', 'libiomp5.so']
elif platform.system() == "Darwin":
libs = ['onnxruntime_pybind11_state.so', 'libmkldnn.0.dylib'] # TODO add libmklml and libiomp5 later.
else:
libs = ['onnxruntime_pybind11_state.pyd', 'mkldnn.dll', 'mklml.dll', 'libiomp5md.dll']

View file

@ -5,7 +5,7 @@ jobs:
pool: Linux-CPU
steps:
- script: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory)'
- script: 'tools/ci_build/github/linux/run_dockerbuild.sh -o ubuntu16.04 -d cpu -r $(Build.BinariesDirectory) -x "--use_mklml"'
displayName: 'Command Line Script'
env:
AZURE_BLOB_KEY: $(onnxruntime-storage-key)

View file

@ -38,6 +38,6 @@ else
--config Debug Release --build_shared_lib \
--skip_submodule_sync \
--enable_pybind \
--parallel --use_mkldnn --use_mklml --build_shared_lib $BUILD_EXTR_PAR
--parallel --use_mkldnn --build_shared_lib $BUILD_EXTR_PAR
/home/onnxruntimedev/Release/onnx_test_runner /data/onnx
fi