Merge remote-tracking branch 'upstream/master' into tmp_merge

This commit is contained in:
Justin Stoecker 2021-12-02 10:32:14 -08:00
commit 2e9dc8a0e2
25 changed files with 528 additions and 166 deletions

View file

@ -962,7 +962,7 @@ if (WIN32)
# issued by thrust nonstandard extension used: nameless struct/union
list(APPEND ORT_WARNING_FLAGS "/wd4201")
# warning C4800: Implicit conversion from 'X' to bool. Possible information loss
if (onnxruntime_USE_OPENVINO)
if (onnxruntime_USE_OPENVINO OR onnxruntime_ENABLE_EAGER_MODE)
list(APPEND ORT_WARNING_FLAGS "/wd4800")
else()
list(APPEND ORT_WARNING_FLAGS "/w34800")

View file

@ -10,14 +10,14 @@ source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_eager_srcs})
add_library(onnxruntime_eager ${onnxruntime_eager_srcs})
if(MSVC AND onnxruntime_ENABLE_EAGER_MODE)
set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_aten.cpp" PROPERTIES COMPILE_FLAGS "/wd4100" COMPILE_FLAGS "/wd4458")
set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_aten.cpp" PROPERTIES COMPILE_FLAGS "/wd4100 /wd4458")
set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_customops.g.cpp" PROPERTIES COMPILE_FLAGS "/wd4100")
set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_backends.cpp" PROPERTIES COMPILE_FLAGS "/wd4100")
set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_hooks.cpp" PROPERTIES COMPILE_FLAGS "/wd4100")
set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_eager.cpp" PROPERTIES COMPILE_FLAGS "/wd4100")
set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_log.cpp" PROPERTIES COMPILE_FLAGS "/wd4100" COMPILE_FLAGS "/wd4324")
set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_log.cpp" PROPERTIES COMPILE_FLAGS "/wd4100 /wd4324")
set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_guard.cpp" PROPERTIES COMPILE_FLAGS "/wd4100")
set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_tensor.cpp" PROPERTIES COMPILE_FLAGS "/wd4100" COMPILE_FLAGS "/wd4458" COMPILE_FLAGS "/wd4127")
set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_tensor.cpp" PROPERTIES COMPILE_FLAGS "/wd4100 /wd4458 /wd4127")
set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_ops.cpp" PROPERTIES COMPILE_FLAGS "/wd4100")
set_source_files_properties("${ORTTRAINING_ROOT}/orttraining/eager/ort_util.cpp" PROPERTIES COMPILE_FLAGS "/wd4100")
endif()

View file

@ -1106,6 +1106,9 @@ if (onnxruntime_USE_ROCM)
# Generate GPU code for GFX9 Generation
list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx906 --amdgpu-target=gfx908)
if (ROCM_VERSION_DEV_INT GREATER_EQUAL 50000)
list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx90a)
endif()
#onnxruntime_add_shared_library_module(onnxruntime_providers_rocm ${onnxruntime_providers_rocm_src})
hip_add_library(onnxruntime_providers_rocm MODULE ${onnxruntime_providers_rocm_src})

View file

@ -4,10 +4,10 @@
<OrtPackageId Condition="'$(OrtPackageId)' == ''">Microsoft.ML.OnnxRuntime</OrtPackageId>
</PropertyGroup>
<!-- only include the Xamarin mobile targets for the main ORT package,
<!-- only include the Xamarin mobile targets if we're building an ORT package,
and only if the mobile workloads are installed -->
<Choose>
<When Condition="'$(OrtPackageId)' == 'Microsoft.ML.OnnxRuntime' AND Exists('$(MSBuildExtensionsPath)\Xamarin\Android') AND Exists('$(MSBuildExtensionsPath)\Xamarin\iOS')">
<When Condition="('$(OrtPackageId)' == 'Microsoft.ML.OnnxRuntime' OR '$(OrtPackageId)' == 'Microsoft.ML.OnnxRuntime.Gpu') AND Exists('$(MSBuildExtensionsPath)\Xamarin\Android') AND Exists('$(MSBuildExtensionsPath)\Xamarin\iOS')">
<PropertyGroup>
<TargetFrameworks>netstandard1.1;netstandard2.0;xamarinios10;monoandroid11.0;net5.0;netcoreapp3.1</TargetFrameworks>
</PropertyGroup>

View file

@ -17,104 +17,109 @@ NOTE: Operators used to manipulate dimensions and indices will support int32 and
|Operator|Opsets|
|--------|------|
|**ai.onnx**||
|ai.onnx:Abs|12, 13|
|ai.onnx:Add|12, 13|
|ai.onnx:And|12, 13|
|ai.onnx:ArgMax|12, 13|
|ai.onnx:ArgMin|12, 13|
|ai.onnx:AveragePool|12, 13|
|ai.onnx:Cast|12, 13|
|ai.onnx:Ceil|12, 13|
|ai.onnx:Clip|12, 13|
|ai.onnx:Concat|12, 13|
|ai.onnx:ConstantOfShape|12, 13|
|ai.onnx:Conv|12, 13|
|ai.onnx:ConvTranspose|12, 13|
|ai.onnx:Cos|12, 13|
|ai.onnx:CumSum|12, 13|
|ai.onnx:DepthToSpace|12, 13|
|ai.onnx:DequantizeLinear|12, 13|
|ai.onnx:Div|12, 13|
|ai.onnx:DynamicQuantizeLinear|12, 13|
|ai.onnx:Elu|12, 13|
|ai.onnx:Equal|12, 13|
|ai.onnx:Exp|12, 13|
|ai.onnx:Expand|12, 13|
|ai.onnx:Flatten|12, 13|
|ai.onnx:Floor|12, 13|
|ai.onnx:Gather|12, 13|
|ai.onnx:GatherND|12, 13|
|ai.onnx:Gemm|12, 13|
|ai.onnx:GlobalAveragePool|12, 13|
|ai.onnx:Greater|12, 13|
|ai.onnx:GreaterOrEqual|12, 13|
|ai.onnx:Identity|12, 13|
|ai.onnx:If|12, 13|
|ai.onnx:LRN|12, 13|
|ai.onnx:LeakyRelu|12, 13|
|ai.onnx:Less|12, 13|
|ai.onnx:LessOrEqual|12, 13|
|ai.onnx:Log|12, 13|
|ai.onnx:LogSoftmax|12, 13|
|ai.onnx:Loop|12, 13|
|ai.onnx:MatMul|12, 13|
|ai.onnx:MatMulInteger|12, 13|
|ai.onnx:Max|12, 13|
|ai.onnx:MaxPool|12, 13|
|ai.onnx:Mean|12, 13|
|ai.onnx:Min|12, 13|
|ai.onnx:Mul|12, 13|
|ai.onnx:Neg|12, 13|
|ai.onnx:NonMaxSuppression|12, 13|
|ai.onnx:NonZero|12, 13|
|ai.onnx:Not|12, 13|
|ai.onnx:Or|12, 13|
|ai.onnx:PRelu|12, 13|
|ai.onnx:Pad|12, 13|
|ai.onnx:Pow|12, 13|
|ai.onnx:QLinearConv|12, 13|
|ai.onnx:QLinearMatMul|12, 13|
|ai.onnx:QuantizeLinear|12, 13|
|ai.onnx:Range|12, 13|
|ai.onnx:Reciprocal|12, 13|
|ai.onnx:ReduceMax|12, 13|
|ai.onnx:ReduceMean|12, 13|
|ai.onnx:ReduceMin|12, 13|
|ai.onnx:ReduceProd|12, 13|
|ai.onnx:ReduceSum|12, 13|
|ai.onnx:Relu|12, 13|
|ai.onnx:Reshape|12, 13|
|ai.onnx:Resize|12, 13|
|ai.onnx:ReverseSequence|12, 13|
|ai.onnx:Round|12, 13|
|ai.onnx:Scan|12, 13|
|ai.onnx:ScatterND|12, 13|
|ai.onnx:Shape|12, 13|
|ai.onnx:Sigmoid|12, 13|
|ai.onnx:Sin|12, 13|
|ai.onnx:Size|12, 13|
|ai.onnx:Slice|12, 13|
|ai.onnx:Softmax|12, 13|
|ai.onnx:SpaceToDepth|12, 13|
|ai.onnx:Split|12, 13|
|ai.onnx:Sqrt|12, 13|
|ai.onnx:Squeeze|12, 13|
|ai.onnx:Sub|12, 13|
|ai.onnx:Sum|12, 13|
|ai.onnx:Tanh|12, 13|
|ai.onnx:ThresholdedRelu|12, 13|
|ai.onnx:Tile|12, 13|
|ai.onnx:TopK|12, 13|
|ai.onnx:Transpose|12, 13|
|ai.onnx:Unique|12, 13|
|ai.onnx:Unsqueeze|12, 13|
|ai.onnx:Where|12, 13|
|ai.onnx:Abs|12, 13, 14, 15|
|ai.onnx:Add|12, 13, 14, 15|
|ai.onnx:And|12, 13, 14, 15|
|ai.onnx:ArgMax|12, 13, 14, 15|
|ai.onnx:ArgMin|12, 13, 14, 15|
|ai.onnx:AveragePool|12, 13, 14, 15|
|ai.onnx:Cast|12, 13, 14, 15|
|ai.onnx:Ceil|12, 13, 14, 15|
|ai.onnx:Clip|12, 13, 14, 15|
|ai.onnx:Concat|12, 13, 14, 15|
|ai.onnx:ConstantOfShape|12, 13, 14, 15|
|ai.onnx:Conv|12, 13, 14, 15|
|ai.onnx:ConvTranspose|12, 13, 14, 15|
|ai.onnx:Cos|12, 13, 14, 15|
|ai.onnx:CumSum|12, 13, 14, 15|
|ai.onnx:DepthToSpace|12, 13, 14, 15|
|ai.onnx:DequantizeLinear|12, 13, 14, 15|
|ai.onnx:Div|12, 13, 14, 15|
|ai.onnx:DynamicQuantizeLinear|12, 13, 14, 15|
|ai.onnx:Elu|12, 13, 14, 15|
|ai.onnx:Equal|12, 13, 14, 15|
|ai.onnx:Erf|12, 13, 14, 15|
|ai.onnx:Exp|12, 13, 14, 15|
|ai.onnx:Expand|12, 13, 14, 15|
|ai.onnx:Flatten|12, 13, 14, 15|
|ai.onnx:Floor|12, 13, 14, 15|
|ai.onnx:Gather|12, 13, 14, 15|
|ai.onnx:GatherND|12, 13, 14, 15|
|ai.onnx:Gemm|12, 13, 14, 15|
|ai.onnx:GlobalAveragePool|12, 13, 14, 15|
|ai.onnx:Greater|12, 13, 14, 15|
|ai.onnx:GreaterOrEqual|12, 13, 14, 15|
|ai.onnx:HardSigmoid|12, 13, 14, 15|
|ai.onnx:Identity|12, 13, 14, 15|
|ai.onnx:If|12, 13, 14, 15|
|ai.onnx:InstanceNormalization|12, 13, 14, 15|
|ai.onnx:LRN|12, 13, 14, 15|
|ai.onnx:LayerNormalization|1|
|ai.onnx:LeakyRelu|12, 13, 14, 15|
|ai.onnx:Less|12, 13, 14, 15|
|ai.onnx:LessOrEqual|12, 13, 14, 15|
|ai.onnx:Log|12, 13, 14, 15|
|ai.onnx:LogSoftmax|12, 13, 14, 15|
|ai.onnx:Loop|12, 13, 14, 15|
|ai.onnx:MatMul|12, 13, 14, 15|
|ai.onnx:MatMulInteger|12, 13, 14, 15|
|ai.onnx:Max|12, 13, 14, 15|
|ai.onnx:MaxPool|12, 13, 14, 15|
|ai.onnx:Mean|12, 13, 14, 15|
|ai.onnx:Min|12, 13, 14, 15|
|ai.onnx:Mul|12, 13, 14, 15|
|ai.onnx:Neg|12, 13, 14, 15|
|ai.onnx:NonMaxSuppression|12, 13, 14, 15|
|ai.onnx:NonZero|12, 13, 14, 15|
|ai.onnx:Not|12, 13, 14, 15|
|ai.onnx:Or|12, 13, 14, 15|
|ai.onnx:PRelu|12, 13, 14, 15|
|ai.onnx:Pad|12, 13, 14, 15|
|ai.onnx:Pow|12, 13, 14, 15|
|ai.onnx:QLinearConv|12, 13, 14, 15|
|ai.onnx:QLinearMatMul|12, 13, 14, 15|
|ai.onnx:QuantizeLinear|12, 13, 14, 15|
|ai.onnx:Range|12, 13, 14, 15|
|ai.onnx:Reciprocal|12, 13, 14, 15|
|ai.onnx:ReduceMax|12, 13, 14, 15|
|ai.onnx:ReduceMean|12, 13, 14, 15|
|ai.onnx:ReduceMin|12, 13, 14, 15|
|ai.onnx:ReduceProd|12, 13, 14, 15|
|ai.onnx:ReduceSum|12, 13, 14, 15|
|ai.onnx:Relu|12, 13, 14, 15|
|ai.onnx:Reshape|12, 13, 14, 15|
|ai.onnx:Resize|12, 13, 14, 15|
|ai.onnx:ReverseSequence|12, 13, 14, 15|
|ai.onnx:Round|12, 13, 14, 15|
|ai.onnx:Scan|12, 13, 14, 15|
|ai.onnx:ScatterND|12, 13, 14, 15|
|ai.onnx:Shape|12, 13, 14, 15|
|ai.onnx:Sigmoid|12, 13, 14, 15|
|ai.onnx:Sin|12, 13, 14, 15|
|ai.onnx:Size|12, 13, 14, 15|
|ai.onnx:Slice|12, 13, 14, 15|
|ai.onnx:Softmax|12, 13, 14, 15|
|ai.onnx:SpaceToDepth|12, 13, 14, 15|
|ai.onnx:Split|12, 13, 14, 15|
|ai.onnx:Sqrt|12, 13, 14, 15|
|ai.onnx:Squeeze|12, 13, 14, 15|
|ai.onnx:Sub|12, 13, 14, 15|
|ai.onnx:Sum|12, 13, 14, 15|
|ai.onnx:Tanh|12, 13, 14, 15|
|ai.onnx:ThresholdedRelu|12, 13, 14, 15|
|ai.onnx:Tile|12, 13, 14, 15|
|ai.onnx:TopK|12, 13, 14, 15|
|ai.onnx:Transpose|12, 13, 14, 15|
|ai.onnx:Unique|12, 13, 14, 15|
|ai.onnx:Unsqueeze|12, 13, 14, 15|
|ai.onnx:Where|12, 13, 14, 15|
|||
|**com.microsoft**||
|com.microsoft:DynamicQuantizeMatMul|1|
|com.microsoft:FusedConv|1|
|com.microsoft:FusedGemm|1|
|com.microsoft:FusedMatMul|1|
|com.microsoft:Gelu|1|
|com.microsoft:MatMulIntegerToFloat|1|
|com.microsoft:NhwcMaxPool|1|
|com.microsoft:QLinearAdd|1|

View file

@ -21,7 +21,7 @@ import numpy
from onnxruntime.datasets import get_example
example2 = get_example("logreg_iris.onnx")
sess = rt.InferenceSession(example2)
sess = rt.InferenceSession(example2, providers=rt.get_available_providers())
input_name = sess.get_inputs()[0].name
output_name = sess.get_outputs()[0].name

View file

@ -72,7 +72,7 @@ with open("pipeline_vectorize.onnx", "wb") as f:
import onnxruntime as rt
from onnxruntime.capi.onnxruntime_pybind11_state import InvalidArgument
sess = rt.InferenceSession("pipeline_vectorize.onnx")
sess = rt.InferenceSession("pipeline_vectorize.onnx", providers=rt.get_available_providers())
import numpy
inp, out = sess.get_inputs()[0], sess.get_outputs()[0]

View file

@ -21,7 +21,7 @@ from onnxruntime.datasets import get_example
# The model is available on github `onnx...test_sigmoid <https://github.com/onnx/onnx/tree/master/onnx/backend/test/data/node/test_sigmoid>`_.
example1 = get_example("sigmoid.onnx")
sess = rt.InferenceSession(example1)
sess = rt.InferenceSession(example1, providers=rt.get_available_providers())
#########################
# Let's see the input name and shape.

View file

@ -31,8 +31,8 @@ print("producer_version={}".format(model.producer_version))
#############################
# With *ONNX Runtime*:
from onnxruntime import InferenceSession
sess = InferenceSession(example)
import onnxruntime as rt
sess = rt.InferenceSession(example, providers=rt.get_available_providers())
meta = sess.get_modelmeta()
print("custom_metadata_map={}".format(meta.custom_metadata_map))

View file

@ -35,7 +35,7 @@ def change_ir_version(filename, ir_version=6):
example1 = get_example("mul_1.onnx")
onnx_model = change_ir_version(example1)
onnx_model_str = onnx_model.SerializeToString()
sess = rt.InferenceSession(onnx_model_str)
sess = rt.InferenceSession(onnx_model_str, providers=rt.get_available_providers())
input_name = sess.get_inputs()[0].name
x = numpy.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=numpy.float32)
@ -48,7 +48,7 @@ print(res)
options = rt.SessionOptions()
options.enable_profiling = True
sess_profile = rt.InferenceSession(onnx_model_str, options)
sess_profile = rt.InferenceSession(onnx_model_str, options, providers=rt.get_available_providers())
input_name = sess.get_inputs()[0].name
x = numpy.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=numpy.float32)

View file

@ -64,7 +64,7 @@ with open("logreg_iris.onnx", "wb") as f:
# its input and output.
import onnxruntime as rt
sess = rt.InferenceSession("logreg_iris.onnx")
sess = rt.InferenceSession("logreg_iris.onnx", providers=rt.get_available_providers())
print("input name='{}' and shape={}".format(
sess.get_inputs()[0].name, sess.get_inputs()[0].shape))
@ -180,7 +180,7 @@ with open("rf_iris.onnx", "wb") as f:
###################################
# We compare.
sess = rt.InferenceSession("rf_iris.onnx")
sess = rt.InferenceSession("rf_iris.onnx", providers=rt.get_available_providers())
def sess_predict_proba_rf(x):
return sess.run([prob_name], {input_name: x.astype(numpy.float32)})[0]
@ -204,7 +204,7 @@ for n_trees in range(5, 51, 5):
onx = convert_sklearn(rf, initial_types=initial_type)
with open("rf_iris_%d.onnx" % n_trees, "wb") as f:
f.write(onx.SerializeToString())
sess = rt.InferenceSession("rf_iris_%d.onnx" % n_trees)
sess = rt.InferenceSession("rf_iris_%d.onnx" % n_trees, providers=rt.get_available_providers())
def sess_predict_proba_loop(x):
return sess.run([prob_name], {input_name: x.astype(numpy.float32)})[0]
tsk = speed("loop(X_test, rf.predict_proba, 100)", number=5, repeat=5)

View file

@ -82,7 +82,7 @@ for this machine learning model.
import numpy
import onnxruntime as rt
sess = rt.InferenceSession("logreg_iris.onnx")
sess = rt.InferenceSession("logreg_iris.onnx", providers=rt.get_available_providers())
input_name = sess.get_inputs()[0].name
pred_onx = sess.run(None, {input_name: X_test.astype(numpy.float32)})[0]
print(pred_onx)
@ -97,7 +97,7 @@ by specifying its name into a list.
import numpy
import onnxruntime as rt
sess = rt.InferenceSession("logreg_iris.onnx")
sess = rt.InferenceSession("logreg_iris.onnx", providers=rt.get_available_providers())
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name
pred_onx = sess.run([label_name], {input_name: X_test.astype(numpy.float32)})[0]

View file

@ -131,7 +131,6 @@ Status ScatterNDBase::PrepareForCompute(OpKernelContext* context, Prepare& p) co
element_counts[i] = input_strides[i];
}
int64_t err_indice = 0;
p.element_bytes = input_tensor->DataType()->Size();
p.element_to_copy = input_shape.SizeFromDimension(last_indice_dimension);
p.bytes_to_copy = p.element_bytes * p.element_to_copy;
@ -150,13 +149,23 @@ Status ScatterNDBase::PrepareForCompute(OpKernelContext* context, Prepare& p) co
for (int64_t i = 0; i < offset_count; ++i) {
for (int64_t j = 0; j < last_indice_dimension; ++j) {
auto indice = *(indice_offset + i * last_indice_dimension + j);
if (indice < 0 || indice >= input_shape[j]) {
err_indice = indice;
if (indice >= 0) {
if (indice >= input_shape[j]) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "invalid indice found, indice = ", indice);
}
} else {
if (indice < -input_shape[j]) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "invalid indice found, indice = ", indice);
} else {
indice += input_shape[j];
}
}
p.element_offsets[i] += indice * element_counts[j];
}
}
return err_indice == 0 ? Status::OK() : ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "invalid indice found, indice = ", err_indice);
return Status::OK();
}
Status ScatterND::Compute(OpKernelContext* context) const {

View file

@ -5,6 +5,7 @@
#include "instance_norm_impl.h"
#include "core/providers/cpu/nn/instance_norm_helper.h"
#include "core/providers/cpu/nn/batch_norm_helper.h"
#include "core/providers/cuda/math/unary_elementwise_ops_impl.h"
namespace onnxruntime {
namespace cuda {
@ -45,10 +46,10 @@ Status InstanceNorm<T>::ComputeInternal(OpKernelContext* p_op_kernel_context) co
const TensorShape& x_shape = X->Shape();
Tensor* Y = p_op_kernel_context->Output(0, x_shape);
auto y_data = reinterpret_cast<CudaT*>(Y->template MutableData<T>());
auto x_data = reinterpret_cast<const CudaT*>(X->template Data<T>());
auto scale_data = reinterpret_cast<const CudaT*>(scale->template Data<T>());
auto bias_data = reinterpret_cast<const CudaT*>(bias->template Data<T>());
auto* y_data = reinterpret_cast<CudaT*>(Y->template MutableData<T>());
const auto* x_data = reinterpret_cast<const CudaT*>(X->template Data<T>());
const auto* scale_data = reinterpret_cast<const CudaT*>(scale->template Data<T>());
const auto* bias_data = reinterpret_cast<const CudaT*>(bias->template Data<T>());
const auto& x_dims = x_shape.GetDims();
const int64_t N = x_dims[0];
@ -160,5 +161,150 @@ Status InstanceNorm<T>::ComputeInternal(OpKernelContext* p_op_kernel_context) co
return Status::OK();
}
// Specialization of InstanceNorm::ComputeInternal for MLFloat16 input.
//
// Reads X (input 0), scale (input 1) and bias (input 2), validates them with
// InstanceNormHelper::ValidateInputs, and writes the result to output 0, which
// has the same shape as X.
//
// Two paths are taken depending on the batch size N = x_dims[0]:
//  * N == 1: a single cudnnBatchNormalizationForwardTraining call produces Y
//    directly; scale and bias are first cast to float scratch buffers.
//  * otherwise: cudnnBatchNormalizationForwardTraining is used only to obtain
//    the per-(n, c) mean and variance in float scratch buffers, and
//    InstanceNormImpl<CudaT, float> then computes the final output.
//
// Returns Status::OK() on success; validation, descriptor, CUDA and cuDNN
// failures are propagated through the *_RETURN_IF_ERROR macros.
template <>
Status InstanceNorm<MLFloat16>::ComputeInternal(OpKernelContext* p_op_kernel_context) const {
typedef typename ToCudaType<MLFloat16>::MappedType CudaT;
const Tensor* X = p_op_kernel_context->Input<Tensor>(0);
const Tensor* scale = p_op_kernel_context->Input<Tensor>(1);
const Tensor* bias = p_op_kernel_context->Input<Tensor>(2);
ORT_RETURN_IF_ERROR(InstanceNormHelper::ValidateInputs(X, scale, bias));
const TensorShape& x_shape = X->Shape();
// The output tensor has exactly the shape of the input
Tensor* Y = p_op_kernel_context->Output(0, x_shape);
// View the MLFloat16 tensor buffers through the mapped CUDA type
auto* y_data = reinterpret_cast<CudaT*>(Y->template MutableData<MLFloat16>());
const auto* x_data = reinterpret_cast<const CudaT*>(X->template Data<MLFloat16>());
const auto* scale_data = reinterpret_cast<const CudaT*>(scale->template Data<MLFloat16>());
const auto* bias_data = reinterpret_cast<const CudaT*>(bias->template Data<MLFloat16>());
const auto& x_dims = x_shape.GetDims();
const int64_t N = x_dims[0];
const int64_t C = x_dims[1];
// alpha / beta blending factors passed to cuDNN below
const auto one = Consts<CudaT>::One;
const auto zero = Consts<CudaT>::Zero;
if (N == 1) {
// when N == 1, we can treat it as spatial batch normalization in training
// as the mean/variance would be computed from input
CudnnTensor data_desc;
std::vector<int64_t> new_dims;
BatchNormHelper::NormalizeDims(x_shape, new_dims);
ORT_RETURN_IF_ERROR(data_desc.Set(new_dims, CudnnTensor::GetDataType<CudaT>()));
CudnnTensor stats_desc;
ORT_RETURN_IF_ERROR(stats_desc.Set(data_desc, CUDNN_BATCHNORM_SPATIAL));
// For half input data type, alpha, beta, scale, bias need to be float type.
// alpha, beta will be of type float as the Consts struct specialization
// for MLFloat16 type takes care of that. Only convert the scale and bias to float.
auto scale_data_fp32 = GetScratchBuffer<float>(C);
Impl_Cast<CudaT, float>(Stream(), scale_data, scale_data_fp32.get(), C);
auto bias_data_fp32 = GetScratchBuffer<float>(C);
Impl_Cast<CudaT, float>(Stream(), bias_data, bias_data_fp32.get(), C);
// The running and saved mean/variance arguments are passed as nullptr;
// only the normalized output written to y_data is used.
CUDNN_RETURN_IF_ERROR(cudnnBatchNormalizationForwardTraining(
CudnnHandle(),
CUDNN_BATCHNORM_SPATIAL,
&one,
&zero,
data_desc,
x_data,
data_desc,
y_data,
stats_desc,
scale_data_fp32.get(),
bias_data_fp32.get(),
1.0f,
nullptr,
nullptr,
epsilon_,
nullptr,
nullptr));
} else {
// we use cudnnBatchNormalizationForwardTraining to compute mean/variance
// so collapsing NC into channel
auto input_count = x_shape.Size(); // N * C * H * W
auto stats_count = x_shape.SizeToDimension(2); // N * C
auto image_size = input_count / stats_count;
CudnnTensor data_desc;
ORT_RETURN_IF_ERROR(data_desc.Set(std::array<int64_t, 4>{1, stats_count, image_size, 1},
CudnnTensor::GetDataType<CudaT>()));
// stats_desc needs to be of 'float' type even for float16 input as the "stats" are of float type
CudnnTensor stats_desc;
ORT_RETURN_IF_ERROR(stats_desc.Set(std::array<int64_t, 4>{1, stats_count, 1, 1},
CudnnTensor::GetDataType<float>()));
// For half input data type, we need to allocate some "intermediate"
// float buffers for CuDNN to use.
const size_t stats_byte_count = stats_count * sizeof(float);
// Mean & Variance are inputs & outputs and must be initialized to zero to work properly
auto mean = GetScratchBuffer<float>(stats_count);
CUDA_RETURN_IF_ERROR(cudaMemsetAsync(mean.get(), 0, stats_byte_count, Stream()));
auto variance = GetScratchBuffer<float>(stats_count);
CUDA_RETURN_IF_ERROR(cudaMemsetAsync(variance.get(), 0, stats_byte_count, Stream()));
// We must set the scale & bias inputs to zero as they are inputs to the calculation
auto unused_scale = GetScratchBuffer<float>(stats_count);
CUDA_RETURN_IF_ERROR(cudaMemsetAsync(unused_scale.get(), 0, stats_byte_count, Stream()));
auto unused_bias = GetScratchBuffer<float>(stats_count);
CUDA_RETURN_IF_ERROR(cudaMemsetAsync(unused_bias.get(), 0, stats_byte_count, Stream()));
// first, compute mean and variance per-instance per-channel using cudnnBatchNorm training
// (CUDNN_BN_MIN_EPSILON is passed to this call; the operator's own epsilon_
// is handed to InstanceNormImpl below)
CUDNN_RETURN_IF_ERROR(cudnnBatchNormalizationForwardTraining(
CudnnHandle(),
CUDNN_BATCHNORM_SPATIAL,
&one,
&zero,
data_desc,
x_data,
data_desc,
y_data, // use y temporarily, would be rewritten later
stats_desc,
unused_scale.get(),
unused_bias.get(),
1.0f,
mean.get(),
variance.get(),
CUDNN_BN_MIN_EPSILON,
nullptr,
nullptr));
// Y = scale * (x - mean) / sqrt (variance + epsilon) + B
// X/Y is (N,C,H,W)
// scale/bias is (1,C,1,1)
// mean/stddev is (N,C,1,1)
// NOTE cudnnBatchNormalization computes unbiased variance sum((Xi - mean)^2) / (count - 1)
// and it needs to be corrected with (count - 1) / count
fast_divmod fdm_HW(gsl::narrow_cast<int>(image_size));
fast_divmod fdm_C(gsl::narrow_cast<int>(C));
// The InstanceNormImpl kernel handles the mean/variance in float32, so no casting required here
InstanceNormImpl<CudaT, float>(
Stream(),
x_data,
scale_data,
bias_data,
mean.get(),
variance.get(),
(image_size - 1.0) / image_size,
static_cast<double>(epsilon_),
fdm_HW,
fdm_C,
y_data,
input_count);
}
return Status::OK();
}
} // namespace cuda
} // namespace onnxruntime

View file

@ -7,18 +7,18 @@
namespace onnxruntime {
namespace cuda {
template <typename T>
template <typename T1, typename T2>
__global__ void _InstanceNormKernel(
const T* input_data,
const T* scale,
const T* bias,
const T* mean,
const T* variance,
const T1* __restrict__ input_data,
const T1* __restrict__ scale,
const T1* __restrict__ bias,
const T2* __restrict__ mean,
const T2* __restrict__ variance,
const double variance_correction,
const double epsilon,
const fast_divmod fdm_HW,
const fast_divmod fdm_C,
T* output_data,
T1* __restrict__ output_data,
const CUDA_LONG N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
int nc = fdm_HW.div(id);
@ -26,34 +26,35 @@ __global__ void _InstanceNormKernel(
fdm_C.divmod(nc, n, c);
// Y = scale * (x - mean) / sqrt (std * std + epsilon) + B
output_data[id] = scale[c] * (input_data[id] - mean[nc]) / _Sqrt(variance[nc] * (T)variance_correction + (T)epsilon) + bias[c];
output_data[id] = scale[c] * (input_data[id] - (T1)mean[nc]) / _Sqrt((T1)variance[nc] * (T1)variance_correction + (T1)epsilon) + bias[c];
}
template <typename T>
template <typename T1, typename T2>
void InstanceNormImpl(
cudaStream_t stream,
const T* input_data,
const T* scale,
const T* bias,
const T* mean,
const T* variance,
const T1* input_data,
const T1* scale,
const T1* bias,
const T2* mean,
const T2* variance,
const double variance_correction,
const double epsilon,
const fast_divmod& fdm_HW,
const fast_divmod& fdm_C,
T* output_data,
T1* output_data,
size_t N) {
int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
_InstanceNormKernel<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
_InstanceNormKernel<T1, T2><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
input_data, scale, bias, mean, variance, variance_correction, epsilon, fdm_HW, fdm_C, output_data, (CUDA_LONG)N);
}
#define SPECIALIZED_IMPL(T) \
template void InstanceNormImpl<T>(cudaStream_t stream, const T* input_data, const T* scale, const T* bias, const T* mean, const T* stddev, const double variance_correction, const double epsilon, const fast_divmod& fdm_HW, const fast_divmod& fdm_C, T* output_data, size_t count);
#define SPECIALIZED_IMPL(T1, T2) \
template void InstanceNormImpl<T1, T2>(cudaStream_t stream, const T1* input_data, const T1* scale, const T1* bias, const T2* mean, const T2* stddev, const double variance_correction, const double epsilon, const fast_divmod& fdm_HW, const fast_divmod& fdm_C, T1* output_data, size_t count);
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double)
SPECIALIZED_IMPL(half)
SPECIALIZED_IMPL(float, float)
SPECIALIZED_IMPL(double, double)
// When the input data type is float16, the means and variances will flow in as float32 (special case)
SPECIALIZED_IMPL(half, float)
} // namespace cuda
} // namespace onnxruntime

View file

@ -6,19 +6,19 @@
namespace onnxruntime {
namespace cuda {
template <typename T>
template <typename T1, typename T2>
void InstanceNormImpl(
cudaStream_t stream,
const T* input_data,
const T* scale,
const T* bias,
const T* mean,
const T* variance,
const T1* input_data,
const T1* scale,
const T1* bias,
const T2* mean,
const T2* variance,
const double variance_correction,
const double epsilon,
const fast_divmod& fdm_HW,
const fast_divmod& fdm_C,
T* output_data,
T1* output_data,
size_t count);
} // namespace cuda

View file

@ -34,11 +34,19 @@ __global__ void _ScatterNDKernel(
// This would have been an error in the CPU kernel, but throwing in the CUDA EP
// is hard. This is the approach taken by other frameworks for out of bound indices
// in their corresponding GPU backends as well.
if (index < 0)
index = 0;
// index >= -dim_value && index < dim_value
else if (index >= dim_value)
index = dim_value - 1;
if (index >= 0) {
if (index >= dim_value) {
index = dim_value - 1;
}
} else {
if (index < -dim_value) {
index = 0;
} else {
index += dim_value;
}
}
data_offset += (index * element_count_dim);
}

View file

@ -41,11 +41,11 @@ TEST(InstanceNormalizationOpTest, InstanceNorm) {
-0.14644464F, -0.82262872F, -0.66852817F, 1.63760153F,
-1.65898662F, 0.27618144F, 0.64840618F, 0.734399F};
test.AddOutput<float>("Y", input_dims, expected_output);
#if defined(OPENVINO_CONFIG_MYRIAD) //Disabling this test on MYRIADX temporarily due to a bug
#if defined(OPENVINO_CONFIG_MYRIAD) //Disabling this test on MYRIADX temporarily due to a bug
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider});
#else
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
#endif
#endif
}
TEST(InstanceNormalizationOpTest, InstanceNormBatch1) {
@ -58,12 +58,10 @@ TEST(InstanceNormalizationOpTest, InstanceNormBatch1) {
vector<int64_t> input_dims = {1, 3, 4};
test.AddInput<float>("input", input_dims, input);
// vector<float> scale = {2.1F, 0.1F, 1.F};
vector<float> scale = {1.0F, 1.0F, 1.F};
vector<int64_t> scale_dims = {3};
test.AddInput<float>("scale", scale_dims, scale);
// vector<float> B = {2.3F, 1.5F, 0.F};
vector<float> B = {0.0F, 0.0F, 0.F};
vector<int64_t> B_dims = {3};
test.AddInput<float>("B", B_dims, B);
@ -72,13 +70,150 @@ TEST(InstanceNormalizationOpTest, InstanceNormBatch1) {
1.46688162F, -0.98600774F, -0.79911913F, 0.31824524F,
0.57370438F, 0.42193634F, 0.6525492F, -1.64818992F};
test.AddOutput<float>("Y", input_dims, expected_output);
#if defined(OPENVINO_CONFIG_MYRIAD) //Disabling this test on MYRIADX temporarily due to a bug
#if defined(OPENVINO_CONFIG_MYRIAD) //Disabling this test on MYRIADX temporarily due to a bug
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider});
#else
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
#endif
#endif
}
// InstanceNormalization on a {2, 3, 4} float input whose second batch entry
// repeats the first. With unit scale and zero bias, the expected output also
// repeats the same 12 values for both batch entries.
TEST(InstanceNormalizationOpTest, InstanceNormBatch2) {
  OpTester tester("InstanceNormalization");
  tester.AddAttribute("epsilon", 0.3F);

  // Input tensor: 2 batch entries x 3 channels x 4 elements.
  vector<int64_t> x_dims = {2, 3, 4};
  vector<float> x_values = {3.1513367F, 9.283596F, 1.4546119F, 5.4617004F,
                            8.519701F, 1.2382338F, 1.7930176F, 5.1099434F,
                            7.9195533F, 7.638727F, 8.065445F, 3.8082376F,
                            3.1513367F, 9.283596F, 1.4546119F, 5.4617004F,
                            8.519701F, 1.2382338F, 1.7930176F, 5.1099434F,
                            7.9195533F, 7.638727F, 8.065445F, 3.8082376F};
  tester.AddInput<float>("input", x_dims, x_values);

  // Per-channel scale of one.
  vector<int64_t> scale_shape = {3};
  vector<float> scale_values = {1.0F, 1.0F, 1.F};
  tester.AddInput<float>("scale", scale_shape, scale_values);

  // Per-channel bias of zero.
  vector<int64_t> bias_shape = {3};
  vector<float> bias_values = {0.0F, 0.0F, 0.F};
  tester.AddInput<float>("B", bias_shape, bias_values);

  vector<float> y_expected = {-0.56495477F, 1.48930046F, -1.13334329F, 0.20899761F,
                              1.46688162F, -0.98600774F, -0.79911913F, 0.31824524F,
                              0.57370438F, 0.42193634F, 0.6525492F, -1.64818992F,
                              -0.56495477F, 1.48930046F, -1.13334329F, 0.20899761F,
                              1.46688162F, -0.98600774F, -0.79911913F, 0.31824524F,
                              0.57370438F, 0.42193634F, 0.6525492F, -1.64818992F};
  tester.AddOutput<float>("Y", x_dims, y_expected);

#if defined(OPENVINO_CONFIG_MYRIAD) //Disabling this test on MYRIADX temporarily due to a bug
  tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider});
#else
  tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
#endif
}
// Only CUDA kernel has float 16 support
#ifdef USE_CUDA
// float16 InstanceNormalization on a {1, 3, 4} input with unit scale and zero
// bias. All values are authored as float and converted to MLFloat16 before
// being handed to the tester.
TEST(InstanceNormalizationOpTest, InstanceNormBatch1_fp16) {
  OpTester test("InstanceNormalization");
  test.AddAttribute("epsilon", 0.3F);

  vector<float> input = {3.1513367F, 9.283596F, 1.4546119F, 5.4617004F,
                         8.519701F, 1.2382338F, 1.7930176F, 5.1099434F,
                         7.9195533F, 7.638727F, 8.065445F, 3.8082376F};
  vector<int64_t> input_dims = {1, 3, 4};

  vector<float> scale = {1.0F, 1.0F, 1.F};
  vector<int64_t> scale_dims = {3};

  vector<float> B = {0.0F, 0.0F, 0.F};
  vector<int64_t> B_dims = {3};

  vector<float> expected_output = {-0.56495477F, 1.48930046F, -1.13334329F, 0.20899761F,
                                   1.46688162F, -0.98600774F, -0.79911913F, 0.31824524F,
                                   0.57370438F, 0.42193634F, 0.6525492F, -1.64818992F};

  // Element count of the {1, 3, 4} input / output tensors.
  constexpr size_t input_size = 1 * 3 * 4;

  vector<MLFloat16> input_fp16(input_size);
  vector<MLFloat16> scale_fp16(3);
  vector<MLFloat16> B_fp16(3);
  vector<MLFloat16> expected_output_fp16(input_size);

  ConvertFloatToMLFloat16(input.data(), input_fp16.data(), input_size);
  ConvertFloatToMLFloat16(scale.data(), scale_fp16.data(), 3);
  ConvertFloatToMLFloat16(B.data(), B_fp16.data(), 3);
  ConvertFloatToMLFloat16(expected_output.data(), expected_output_fp16.data(), input_size);

  test.AddInput<MLFloat16>("X", input_dims, input_fp16);
  // Use the declared dims vectors (previously unused) instead of repeating {3}.
  test.AddInput<MLFloat16>("scale", scale_dims, scale_fp16);
  test.AddInput<MLFloat16>("B", B_dims, B_fp16);
  test.AddOutput<MLFloat16>("Y", input_dims, expected_output_fp16);

#if defined(OPENVINO_CONFIG_MYRIAD) //Disabling this test on MYRIADX temporarily due to a bug
  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider});
#else
  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
#endif
}
// float16 InstanceNormalization on a {2, 3, 4} input whose second batch entry
// repeats the first, with unit scale and zero bias. All values are authored as
// float and converted to MLFloat16 before being handed to the tester.
TEST(InstanceNormalizationOpTest, InstanceNormBatch2_fp16) {
  OpTester test("InstanceNormalization");
  test.AddAttribute("epsilon", 0.3F);

  vector<float> input = {3.1513367F, 9.283596F, 1.4546119F, 5.4617004F,
                         8.519701F, 1.2382338F, 1.7930176F, 5.1099434F,
                         7.9195533F, 7.638727F, 8.065445F, 3.8082376F,
                         3.1513367F, 9.283596F, 1.4546119F, 5.4617004F,
                         8.519701F, 1.2382338F, 1.7930176F, 5.1099434F,
                         7.9195533F, 7.638727F, 8.065445F, 3.8082376F};
  vector<int64_t> input_dims = {2, 3, 4};

  vector<float> scale = {1.0F, 1.0F, 1.F};
  vector<int64_t> scale_dims = {3};

  vector<float> B = {0.0F, 0.0F, 0.F};
  vector<int64_t> B_dims = {3};

  vector<float> expected_output = {-0.56495477F, 1.48930046F, -1.13334329F, 0.20899761F,
                                   1.46688162F, -0.98600774F, -0.79911913F, 0.31824524F,
                                   0.57370438F, 0.42193634F, 0.6525492F, -1.64818992F,
                                   -0.56495477F, 1.48930046F, -1.13334329F, 0.20899761F,
                                   1.46688162F, -0.98600774F, -0.79911913F, 0.31824524F,
                                   0.57370438F, 0.42193634F, 0.6525492F, -1.64818992F};

  // Element count of the {2, 3, 4} input / output tensors.
  constexpr size_t input_size = 2 * 3 * 4;

  vector<MLFloat16> input_fp16(input_size);
  vector<MLFloat16> scale_fp16(3);
  vector<MLFloat16> B_fp16(3);
  vector<MLFloat16> expected_output_fp16(input_size);

  ConvertFloatToMLFloat16(input.data(), input_fp16.data(), input_size);
  ConvertFloatToMLFloat16(scale.data(), scale_fp16.data(), 3);
  ConvertFloatToMLFloat16(B.data(), B_fp16.data(), 3);
  ConvertFloatToMLFloat16(expected_output.data(), expected_output_fp16.data(), input_size);

  test.AddInput<MLFloat16>("X", input_dims, input_fp16);
  // Use the declared dims vectors (previously unused) instead of repeating {3}.
  test.AddInput<MLFloat16>("scale", scale_dims, scale_fp16);
  test.AddInput<MLFloat16>("B", B_dims, B_fp16);
  test.AddOutput<MLFloat16>("Y", input_dims, expected_output_fp16);

#if defined(OPENVINO_CONFIG_MYRIAD) //Disabling this test on MYRIADX temporarily due to a bug
  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider});
#else
  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
#endif
}
#endif
TEST(InstanceNormalizationOpTest, InstanceNorm_2) {
OpTester test("InstanceNormalization");
test.AddAttribute("epsilon", 0.3F);
@ -119,7 +254,7 @@ TEST(InstanceNormalizationOpTest, InstanceNorm_2) {
1.88028F, 2.353724F, -0.25549555F,
2.0837004F, 2.8466992F, 2.0773761F};
test.AddOutput<float>("Y", input_dims, expected_output);
#if defined(OPENVINO_CONFIG_MYRIAD) //Disabling this test on MYRIADX temporarily due to a bug
#if defined(OPENVINO_CONFIG_MYRIAD) //Disabling this test on MYRIADX temporarily due to a bug
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider});
#else
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});

View file

@ -39,6 +39,15 @@ TEST(ScatterNDOpTest, ScatterND_matrice_int64_int64) {
test.Run();
}
// ScatterND (opset 11) on a 2x2 int64 matrix. The first index tuple is
// (0, 0); the second is written with negative coordinates (-1, -1).
TEST(ScatterNDOpTest, ScatterND_matrice_int64_int64_neg_indices) {
  const std::vector<int64_t> matrix_shape = {2, 2};
  const std::vector<int64_t> data_values = {1LL, 1LL, 2LL, 2LL};
  const std::vector<int64_t> index_values = {0LL, 0LL, -1LL, -1LL};
  const std::vector<int64_t> update_shape = {2};
  const std::vector<int64_t> update_values = {0LL, 3LL};
  const std::vector<int64_t> expected_values = {0LL, 1LL, 2LL, 3LL};

  OpTester tester("ScatterND", 11);
  tester.AddInput<int64_t>("data", matrix_shape, data_values);
  tester.AddInput<int64_t>("indices", matrix_shape, index_values);
  tester.AddInput<int64_t>("updates", update_shape, update_values);
  tester.AddOutput<int64_t>("output", matrix_shape, expected_values);
  tester.Run();
}
TEST(ScatterNDOpTest, ScatterND_matrice_string_int64) {
OpTester test1("ScatterND", 11);
test1.AddInput<std::string>("data", {2,2,2}, {"egg","dance","bob","air","smart","terry","laugh","kite"});
@ -55,6 +64,22 @@ TEST(ScatterNDOpTest, ScatterND_matrice_string_int64) {
test2.Run();
}
// ScatterND (opset 11) on string tensors with negative entries in the
// indices input. Two independent cases run back to back: a {2,2,2} tensor
// updated by slices, then a {3,3} tensor updated element by element.
TEST(ScatterNDOpTest, ScatterND_matrice_string_int64_neg_indices) {
  // Case 1: slice updates into a {2,2,2} tensor.
  const std::vector<int64_t> cube_shape = {2, 2, 2};
  const std::vector<int64_t> slice_shape = {2, 1, 2};
  const std::vector<std::string> cube_data = {"egg", "dance", "bob", "air", "smart", "terry", "laugh", "kite"};
  const std::vector<int64_t> slice_indices = {0, -1, -1, 0};
  const std::vector<std::string> slice_updates = {"air", "bob", "terry", "smart"};
  const std::vector<std::string> cube_expected = {"egg", "dance", "air", "bob", "terry", "smart", "laugh", "kite"};

  OpTester slice_case("ScatterND", 11);
  slice_case.AddInput<std::string>("data", cube_shape, cube_data);
  slice_case.AddInput<int64_t>("indices", slice_shape, slice_indices);
  slice_case.AddInput<std::string>("updates", slice_shape, slice_updates);
  slice_case.AddOutput<std::string>("output", cube_shape, cube_expected);
  slice_case.Run();

  // Case 2: single-element updates into a {3,3} tensor.
  const std::vector<int64_t> grid_shape = {3, 3};
  const std::vector<int64_t> grid_index_shape = {3, 2};
  const std::vector<int64_t> grid_update_shape = {3};
  const std::vector<std::string> grid_data = {"egg", "", "air", "", "terry", "smart", "laugh", "", "hop"};
  const std::vector<int64_t> grid_indices = {-1, -2, 1, 0, 0, -2};
  const std::vector<std::string> grid_updates = {"kite", "bob", "dance"};
  const std::vector<std::string> grid_expected = {"egg", "dance", "air", "bob", "terry", "smart", "laugh", "kite", "hop"};

  OpTester element_case("ScatterND", 11);
  element_case.AddInput<std::string>("data", grid_shape, grid_data);
  element_case.AddInput<int64_t>("indices", grid_index_shape, grid_indices);
  element_case.AddInput<std::string>("updates", grid_update_shape, grid_updates);
  element_case.AddOutput<std::string>("output", grid_shape, grid_expected);
  element_case.Run();
}
TEST(ScatterNDOpTest, ScatterND_slice_float_int64_t) {
OpTester test("ScatterND", 11);
test.AddInput<float>("data", {2,2}, {0.0f,0.1f,0.1f,0.1f});
@ -76,14 +101,14 @@ TEST(ScatterNDOpTest, ScatterND_slice_double_int64_t) {
TEST(ScatterNDOpTest, ScatterND_3tensor_int64) {
OpTester test1("ScatterND", 11);
test1.AddInput<int64_t>("data", {2,2,2}, {0LL,1LL,1LL,1LL,1LL,1LL,6LL,7LL});
test1.AddInput<int64_t>("indices", {2,2}, {0LL,1LL,1LL,0LL});
test1.AddInput<int64_t>("indices", {2,2}, {0LL,1LL,-1LL,0LL});
test1.AddInput<int64_t>("updates", {2,2}, {2LL,3LL,4LL,5LL});
test1.AddOutput<int64_t>("output", {2,2,2}, {0LL,1LL,2LL,3LL,4LL,5LL,6LL,7LL});
test1.Run();
OpTester test2("ScatterND", 11);
test2.AddInput<int8_t>("data", {2,2,2}, {0,0,2,3,4,0,6,7});
test2.AddInput<int64_t>("indices", {2,3}, {0,0,1,1,0,1});
test2.AddInput<int64_t>("indices", {2,3}, {0,0,1,-1,0,-1});
test2.AddInput<int8_t>("updates", {2}, {1,5});
test2.AddOutput<int8_t>("output", {2,2,2}, {0,1,2,3,4,5,6,7});
test2.Run();
@ -142,7 +167,7 @@ TEST(ScatterNDOpTest, ScatterND_batched_3tensor_int64) {
OpTester test2("ScatterND", 11);
test2.AddInput<uint32_t>("data", {2,2,2}, {0,0,2,0,4,0,0,7});
test2.AddInput<int64_t>("indices", {2,2,3}, {0,0,1,1,0,1,0,1,1,1,1,0});
test2.AddInput<int64_t>("indices", {2,2,3}, {0,0,-1,-1,0,-1,0,1,-1,1,-1,0});
test2.AddInput<uint32_t>("updates", {2,2}, {1,5,3,6});
test2.AddOutput<uint32_t>("output", {2,2,2}, {0,1,2,3,4,5,6,7});
test2.Run();

View file

@ -68,7 +68,7 @@ class TestInferenceSessionKeras(unittest.TestCase):
# runtime
content = converted_model.SerializeToString()
rt = onnxrt.InferenceSession(content)
rt = onnxrt.InferenceSession(content, providers=onnxrt.get_available_providers())
input = {rt.get_inputs()[0].name: x}
actual_rt = rt.run(None, input)
self.assertEqual(len(actual_rt), 1)

View file

@ -1887,6 +1887,7 @@ TEST(CApiTest, TestPerSessionCustomThreadPoolHooks) {
// Preventing resize transformer issue:
// https://github.com/microsoft/onnxruntime/issues/9857
#ifndef REDUCED_OPS_BUILD
TEST(CApiTest, crop_and_resize) {
std::vector<float> input_value_0;
input_value_0.resize(2 * 36 * 36 * 3);
@ -1921,6 +1922,7 @@ TEST(CApiTest, crop_and_resize) {
ASSERT_EQ(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, output_type_shape.GetElementType());
ASSERT_EQ(output_shape, output_type_shape.GetShape());
}
#endif
} // namespace TestPerSessionCustomThreadHooks
#endif

View file

@ -658,6 +658,34 @@ def test_gradient_correctness():
_test_helpers.assert_values_are_close(ort_prediction, pt_prediction)
_test_helpers.assert_gradients_match_and_reset_gradient(ort_model, pt_model)
@pytest.mark.parametrize("device", ['cpu', 'cuda'])
@pytest.mark.parametrize("indices", ([[ 2, 3, -1, -1],[0, 1, -1, -1]],
                                     [[ 2, 3, 4, 4],[ 0, 1, 4, 4]]))
def test_scatternd_correctness(device, indices):
    """Compare ORTModule against PyTorch for an indexed in-place assignment.

    The module's forward does ``rerouted_output[dispatch_mask] = expert_output``.
    The ``indices`` parametrization supplies the same index pattern twice: once
    with negative entries (-1) and once with the non-negative value 4, which
    addresses the same row of the 5-row ``rerouted_output`` tensor.
    """
    class NeuralNetScatterND(torch.nn.Module):
        def __init__(self):
            super(NeuralNetScatterND, self).__init__()

        def forward(self, rerouted_output, dispatch_mask, expert_output):
            # In-place indexed assignment; the mutated input is returned.
            rerouted_output[dispatch_mask] = expert_output
            return rerouted_output

    pt_model = NeuralNetScatterND().to(device)
    ort_model = ORTModule(copy.deepcopy(pt_model))

    def run_step(model, rerouted_output, dispatch_mask, expert_output):
        prediction = model(rerouted_output, dispatch_mask, expert_output)
        return prediction

    rerouted_output = torch.tensor([[0.],[0.],[0.],[0.],[0.]], device=device)
    dispatch_mask = torch.tensor(indices, device=device)
    expert_output = torch.tensor([[[0.3817],[0.9625],[0.9625],[0.9625]],[[0.3817],[0.9625],[0.9625],[0.9625]]], device=device)

    # forward() writes into its first argument. Give each model its own copy so
    # the ORT run starts from the pristine zeros rather than from a tensor the
    # PyTorch run has already scattered into (which would mask a faulty scatter).
    pt_prediction = run_step(pt_model, rerouted_output.clone(), dispatch_mask, expert_output)
    ort_prediction = run_step(ort_model, rerouted_output.clone(), dispatch_mask, expert_output)
    _test_helpers.assert_values_are_close(ort_prediction, pt_prediction, atol=1e-5)
@pytest.mark.parametrize("use_fp16", [False, True])
@pytest.mark.parametrize("input_requires_grad", [False, True])
def test_gradient_correctness_conv1d(use_fp16, input_requires_grad):

View file

@ -163,11 +163,11 @@ def main():
input_mask = np.ones((batch, sq_length), dtype=np.int64)
# Do forward using the original model.
sess = ort.InferenceSession(model_file_path)
sess = ort.InferenceSession(model_file_path, providers=ort.get_available_providers())
result = sess.run(None, {'input1': input_ids, 'input2': segment_ids, 'input3': input_mask})
# Do forward using the new model.
new_sess = ort.InferenceSession(new_model_file_path)
new_sess = ort.InferenceSession(new_model_file_path, providers=ort.get_available_providers())
new_result = new_sess.run(None, {'input1': input_ids, 'input2': segment_ids, 'input3': input_mask})
# Compare the outcomes from the two models.

View file

@ -298,11 +298,11 @@ segment_ids = np.random.randint(low=0, high=2, size=(batch, sq_length), dtype=np
input_mask = np.ones((batch, sq_length), dtype=np.int64)
# Do forward using the original model.
sess = ort.InferenceSession(input_model_name)
sess = ort.InferenceSession(input_model_name, providers=ort.get_available_providers())
result = sess.run(None, {'input1': input_ids, 'input2': segment_ids, 'input3': input_mask})
# Do forward using the new model.
new_sess = ort.InferenceSession(output_model_name)
new_sess = ort.InferenceSession(output_model_name, providers=ort.get_available_providers())
new_result = new_sess.run(None, {'input1': input_ids, 'input2': segment_ids, 'input3': input_mask})
# Compare the outcomes from the two models.

View file

@ -528,7 +528,7 @@ def main():
is_model_exported = False
import onnxruntime as ort
sess = ort.InferenceSession(onnx_path)
sess = ort.InferenceSession(onnx_path, providers=ort.get_available_providers())
result = sess.run(None, {'input1': input_ids.cpu().numpy(), 'input2': segment_ids.cpu().numpy(), 'input3': input_mask.cpu().numpy()})
print('---ORT result---')