From 318b82ca7e6e73ed46fc4d09e97466181ef35a9f Mon Sep 17 00:00:00 2001 From: Edward Chen <18449977+edgchen1@users.noreply.github.com> Date: Thu, 4 Feb 2021 14:52:37 -0800 Subject: [PATCH 01/41] Cast Op performance fix. (#6509) Update CPU Cast implementation to fix performance regressions. Update Cast unit tests for more coverage. --- .../onnxruntime/core/framework/data_types.h | 44 +--- .../onnxruntime/core/platform/threadpool.h | 2 +- onnxruntime/core/framework/data_types.cc | 4 + .../core/optimizer/conv_activation_fusion.cc | 2 +- .../core/providers/cpu/tensor/cast_op.cc | 234 +++++++++-------- .../core/providers/op_kernel_type_control.h | 15 +- .../test/providers/cpu/tensor/cast_op_test.cc | 188 ++++++++++++++ .../providers/cpu/tensor/tensor_op_test.cc | 239 ------------------ .../test/providers/provider_test_utils.cc | 16 +- 9 files changed, 348 insertions(+), 396 deletions(-) create mode 100644 onnxruntime/test/providers/cpu/tensor/cast_op_test.cc diff --git a/include/onnxruntime/core/framework/data_types.h b/include/onnxruntime/core/framework/data_types.h index 4e7f3c6e60..c71376d316 100644 --- a/include/onnxruntime/core/framework/data_types.h +++ b/include/onnxruntime/core/framework/data_types.h @@ -59,50 +59,10 @@ struct MLFloat16 { explicit MLFloat16(uint16_t x) : val(x) {} explicit MLFloat16(float f); - // Taken from https://stackoverflow.com/a/60047308/12627730 - float AsFloat(uint32_t x) const { - float out = 0.0f; - std::memcpy(&out, &x, sizeof(x)); - return out; - } - - // Taken from https://stackoverflow.com/a/60047308/12627730 - uint32_t AsUint(float x) const { - uint32_t out = 0; - std::memcpy(&out, &x, sizeof(x)); - return out; - } - - float HalfToFloat(const uint16_t x) const { - uint16_t half = x; - if (endian::native == endian::big) { - // Taken from https://stackoverflow.com/a/2182184/12627730 - half = (x >> 8) | (x << 8); - } - - // Taken from https://stackoverflow.com/a/60047308/12627730 - // IEEE-754 16-bit floating-point format (without 
infinity): 1-5-10, exp-15, +-131008.0, +-6.1035156E-5, - // +-5.9604645E-8, 3.311 digits - const uint32_t e = (half & 0x7C00) >> 10; // exponent - const uint32_t m = (half & 0x03FF) << 13; // mantissa - // evil log2 bit hack to count leading zeros in denormalized format - const uint32_t v = AsUint(static_cast(m)) >> 23; - uint32_t full = (half & 0x8000) << 16 | (e != 0) * ((e + 112) << 23 | m) | - ((e == 0) & (m != 0)) * ((v - 37) << 23 | ((m << (150 - v)) & 0x007FE000)); // sign : normalized : denormalized - - if (endian::native == endian::big) { - // Taken from https://stackoverflow.com/a/2182184/12627730 - full = ((full >> 24) & 0xff) | // move byte 3 to byte 0 - ((full << 8) & 0xff0000) | // move byte 1 to byte 2 - ((full >> 8) & 0xff00) | // move byte 2 to byte 1 - ((full << 24) & 0xff000000); // byte 0 to byte 3 - } - - return AsFloat(full); - } + float ToFloat() const; operator float() const { - return HalfToFloat(val); + return ToFloat(); } }; diff --git a/include/onnxruntime/core/platform/threadpool.h b/include/onnxruntime/core/platform/threadpool.h index 126dd133fa..899a74f9c4 100644 --- a/include/onnxruntime/core/platform/threadpool.h +++ b/include/onnxruntime/core/platform/threadpool.h @@ -281,7 +281,7 @@ class ThreadPool { /** * Tries to call the given function in parallel, with calls split into (num_batches) batches. *\param num_batches If it is zero, it will be replaced to the value of DegreeOfParallelism(). - *\param fn A std::function or STL style functor with signature of "void f(int32_t);" + *\param fn A std::function or STL style functor with signature of "void f(std::ptrdiff_t);" * Pitfall: Caller should cap `num_batches` to a reasonable value based on the cost of `fn` and the value of `total`. *For example, if fn is as simple as: int sum=0; fn = [&](int i){sum +=i;} and `total` is 100, then num_batches should *be just 1. 
diff --git a/onnxruntime/core/framework/data_types.cc b/onnxruntime/core/framework/data_types.cc index ce5112f1da..2bc32d3cdf 100644 --- a/onnxruntime/core/framework/data_types.cc +++ b/onnxruntime/core/framework/data_types.cc @@ -25,6 +25,10 @@ namespace onnxruntime { MLFloat16::MLFloat16(float f) : val{math::floatToHalf(f)} {} +float MLFloat16::ToFloat() const { + return math::halfToFloat(val); +} + // Return the MLDataType used for a generic Tensor template <> MLDataType DataTypeImpl::GetType() { diff --git a/onnxruntime/core/optimizer/conv_activation_fusion.cc b/onnxruntime/core/optimizer/conv_activation_fusion.cc index 1444fb9653..e99a4399ea 100644 --- a/onnxruntime/core/optimizer/conv_activation_fusion.cc +++ b/onnxruntime/core/optimizer/conv_activation_fusion.cc @@ -49,7 +49,7 @@ static bool GetClipConstantMinMax(const Graph& graph, const Node& node, float& m // value = static_cast(*i.data()); // break; case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: - value = math::halfToFloat(i.data()->val); + value = math::halfToFloat(i.data()->val); break; default: ORT_THROW("Unexpected data type for Clip input of ", initializer->data_type()); diff --git a/onnxruntime/core/providers/cpu/tensor/cast_op.cc b/onnxruntime/core/providers/cpu/tensor/cast_op.cc index 191b777a40..eb016febc3 100644 --- a/onnxruntime/core/providers/cpu/tensor/cast_op.cc +++ b/onnxruntime/core/providers/cpu/tensor/cast_op.cc @@ -2,8 +2,8 @@ // Licensed under the MIT License. 
#include -#include -#include +#include +#include #include "boost/mp11.hpp" @@ -18,15 +18,13 @@ #include "core/providers/op_kernel_type_control.h" #include "core/util/math_cpuonly.h" +#include "Eigen/src/Core/arch/Default/BFloat16.h" #include "Eigen/src/Core/arch/Default/Half.h" #if defined(_M_AMD64) #include "core/mlas/inc/mlas.h" #endif -using namespace ONNX_NAMESPACE; -using namespace boost::mp11; - namespace onnxruntime { namespace op_kernel_type_control { @@ -56,20 +54,15 @@ using EnabledSrcTypes = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST_ALL_OPSETS(kCpuExecu using EnabledDstTypes = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST_ALL_OPSETS(kCpuExecutionProvider, kOnnxDomain, Cast, Output, 0); -using IndirectCastTypes = TypeList; - -template -using IsDirectCastType = mp_not>; - -template -using AreAllDirectCastTypes = mp_all...>; - // string cast helpers +// Note: when C++17 is available, use functions -// handle floating point input separately +// handle floating point output separately template typename std::enable_if::value, void>::type CastToString(const SrcType& input, std::string& output) { + static_assert(sizeof(SrcType) <= sizeof(double), + "largest supported floating point type is double"); if (std::isnan(input)) { output = "NaN"; } else if (std::isinf(input)) { @@ -79,19 +72,49 @@ CastToString(const SrcType& input, std::string& output) { output = "INF"; } } else { - // setprecision to 8 to match numpy default behavior - std::ostringstream convert; - convert << std::setprecision(8) << input; - output = convert.str(); + // set precision to 8 to match numpy default behavior + constexpr const char* format = "%.8g"; + const double value = static_cast(input); + + char static_buffer[256]; + std::unique_ptr dynamic_buffer{}; + + gsl::span buffer_span = gsl::make_span(static_buffer); + + auto snprintf_result = std::snprintf(buffer_span.data(), buffer_span.size(), format, value); + ORT_ENFORCE(snprintf_result > 0, "snprintf() failed with return value: ", snprintf_result); + + 
// include trailing '\0' + const size_t required_buffer_size = gsl::narrow_cast(snprintf_result) + 1; + + if (required_buffer_size > buffer_span.size()) { + // didn't get it all, allocate a bigger buffer and retry + dynamic_buffer = onnxruntime::make_unique(required_buffer_size); + buffer_span = gsl::make_span(dynamic_buffer.get(), required_buffer_size); + snprintf_result = std::snprintf(buffer_span.data(), buffer_span.size(), format, value); + ORT_ENFORCE( + snprintf_result > 0 && + gsl::narrow_cast(snprintf_result) == buffer_span.size() - 1, + "Failed to write value with snprintf()."); + } + + output.assign(buffer_span.data(), required_buffer_size - 1); } } template typename std::enable_if::value, void>::type CastToString(const SrcType& input, std::string& output) { - std::ostringstream convert; - convert << input; - output = convert.str(); + output = std::to_string(input); +} + +// overloads for MLFloat16 and BFloat16 +void CastToString(const MLFloat16& input, std::string& output) { + CastToString(static_cast(input), output); +} + +void CastToString(const BFloat16& input, std::string& output) { + CastToString(static_cast(input), output); } template @@ -118,115 +141,121 @@ CastFromString(const std::string& input, DstType& output) { output = gsl::narrow_cast(std::stoll(input)); } -// generic scalar X -> Y -template -struct ScalarDirectCaster { - void Cast(const SrcType& in, DstType& out) const { - out = static_cast(in); - } +// overloads for MLFloat16 and BFloat16 +void CastFromString(const std::string& input, MLFloat16& output) { + float intermediate; + CastFromString(input, intermediate); + output = static_cast(intermediate); +} + +void CastFromString(const std::string& input, BFloat16& output) { + float intermediate; + CastFromString(input, intermediate); + output = static_cast(intermediate); +} + +// type that is usable with Eigen cast +template +struct EigenCastType { + using type = T; }; -// scalar X -> string -template -struct ScalarDirectCaster { - void 
Cast(const SrcType& in, std::string& out) const { - CastToString(in, out); - } +// ORT float16 types don't support casting, so map them to Eigen ones + +template <> +struct EigenCastType { + using type = Eigen::half; }; -// scalar string -> X -template -struct ScalarDirectCaster { - void Cast(const std::string& in, DstType& out) const { - CastFromString(in, out); - } -}; - -// helper for indirect cast types -template -struct ScalarIndirectCaster { - void Cast(const SrcType& in, DstType& out) const { - IntermediateType intermediate; - ScalarDirectCaster{}.Cast(in, intermediate); - ScalarDirectCaster{}.Cast(intermediate, out); - } -}; - -template -struct ScalarCaster; - -template -struct ScalarCaster< - SrcType, DstType, - typename std::enable_if::value>::type> { - void Cast(const SrcType& in, DstType& out) const { - ScalarDirectCaster{}.Cast(in, out); - } -}; - -template -struct ScalarCaster< - SrcType, DstType, - typename std::enable_if::value>::type> { - void Cast(const SrcType& in, DstType& out) const { - ScalarIndirectCaster{}.Cast(in, out); - } +template <> +struct EigenCastType { + using type = Eigen::bfloat16; }; // generic tensor X -> Y -template +template struct TensorCaster { - void Cast(const Tensor& in, Tensor& out, const TensorShape& shape) const { + void Cast(const OpKernelContext&, const Tensor& in, Tensor& out, const TensorShape& shape) const { + using SrcEigenCastType = typename EigenCastType::type; + using DstEigenCastType = typename EigenCastType::type; + const std::ptrdiff_t shape_size = gsl::narrow(shape.Size()); - const auto in_vector = ConstEigenVectorMap(in.Data(), shape_size); - auto out_vector = EigenVectorMap(out.MutableData(), shape_size); - out_vector = in_vector.unaryExpr([](const SrcType& in_scalar) { - DstType out_scalar; - ScalarCaster{}.Cast(in_scalar, out_scalar); - return out_scalar; - }); + const auto in_vector = + ConstEigenVectorMap(reinterpret_cast(in.Data()), shape_size); + auto out_vector = + 
EigenVectorMap(reinterpret_cast(out.MutableData()), shape_size); + out_vector = in_vector.template cast(); } }; -template -void CastStringTensor(const Tensor& in, Tensor& out, const TensorShape& shape) { - static_assert(std::is_same::value || std::is_same::value, - "Either SrcType or DstType must be std::string."); - const std::ptrdiff_t shape_size = gsl::narrow(shape.Size()); - const auto in_data = in.DataAsSpan(); - const auto out_data = out.MutableDataAsSpan(); - for (std::ptrdiff_t i = 0; i < shape_size; ++i) { - ScalarCaster{}.Cast(in_data[i], out_data[i]); - } -} - // tensor X -> string template struct TensorCaster { - void Cast(const Tensor& in, Tensor& out, const TensorShape& shape) const { - CastStringTensor(in, out, shape); + void Cast(const OpKernelContext&, const Tensor& in, Tensor& out, const TensorShape& shape) const { + const std::ptrdiff_t shape_size = gsl::narrow(shape.Size()); + const auto* in_data = in.Data(); + auto* out_data = out.MutableData(); + for (std::ptrdiff_t i = 0; i < shape_size; ++i) { + CastToString(in_data[i], out_data[i]); + } } }; // tensor string -> X template struct TensorCaster { - void Cast(const Tensor& in, Tensor& out, const TensorShape& shape) const { - CastStringTensor(in, out, shape); + void Cast(const OpKernelContext&, const Tensor& in, Tensor& out, const TensorShape& shape) const { + const std::ptrdiff_t shape_size = gsl::narrow(shape.Size()); + const auto* in_data = in.Data(); + auto* out_data = out.MutableData(); + for (std::ptrdiff_t i = 0; i < shape_size; ++i) { + CastFromString(in_data[i], out_data[i]); + } } }; #if defined(_M_AMD64) +// specializations to use optimized and Windows x64-specific +// MlasConvertHalfToFloatBuffer() routine for MLFloat16 -> float conversion + +template +void CastMLFloat16ThroughFloat( + const OpKernelContext& context, const Tensor& in, Tensor& out, const TensorShape& shape) { + // use optimized MLFloat16 -> float, then float -> DstType + AllocatorPtr allocator; + 
ORT_THROW_IF_ERROR(context.GetTempSpaceAllocator(&allocator)); + auto intermediate_buffer = IAllocator::MakeUniquePtr(allocator, gsl::narrow(shape.Size())); + Tensor intermediate_tensor{DataTypeImpl::GetType(), shape, intermediate_buffer.get(), allocator->Info()}; + TensorCaster{}.Cast(context, in, intermediate_tensor, shape); + TensorCaster{}.Cast(context, intermediate_tensor, out, shape); +} + +// tensor MLFloat16 -> X +template +struct TensorCaster { + void Cast(const OpKernelContext& context, const Tensor& in, Tensor& out, const TensorShape& shape) const { + CastMLFloat16ThroughFloat(context, in, out, shape); + } +}; + // tensor MLFloat16 -> float template <> struct TensorCaster { - void Cast(const Tensor& in, Tensor& out, const TensorShape& shape) const { + void Cast(const OpKernelContext&, const Tensor& in, Tensor& out, const TensorShape& shape) const { auto out_data = out.MutableData(); auto in_data = in.Data(); const size_t shape_size = gsl::narrow(shape.Size()); MlasConvertHalfToFloatBuffer(&in_data[0].val, out_data, shape_size); } }; + +// tensor MLFloat16 -> string +template <> +struct TensorCaster { + void Cast(const OpKernelContext& context, const Tensor& in, Tensor& out, const TensorShape& shape) const { + CastMLFloat16ThroughFloat(context, in, out, shape); + } +}; #endif class Cast final : public OpKernel { @@ -246,17 +275,18 @@ class Cast final : public OpKernel { template struct Dispatcher { - void operator()(const Tensor& src, Tensor& dst, const TensorShape& shape) { - TensorCaster{}.Cast(src, dst, shape); + void operator()(const OpKernelContext& context, const Tensor& src, Tensor& dst, const TensorShape& shape) { + TensorCaster{}.Cast(context, src, dst, shape); } }; template struct SrcDispatcher { - void operator()(int32_t to, const Tensor& src, Tensor& dst, const TensorShape& shape) { - using DstTypes = mp_remove_if_q>; + void operator()( + int32_t to, const OpKernelContext& context, const Tensor& src, Tensor& dst, const TensorShape& shape) { + 
using DstTypes = boost::mp11::mp_remove_if_q>; utils::MLTypeCallDispatcherFromTypeList dispatcher{to}; - dispatcher.template InvokeWithLeadingTemplateArgs>(src, dst, shape); + dispatcher.template InvokeWithLeadingTemplateArgs>(context, src, dst, shape); } }; @@ -278,7 +308,7 @@ Status Cast::Compute(OpKernelContext* context) const { } utils::MLTypeCallDispatcherFromTypeList dispatcher{from}; - dispatcher.Invoke(to_, *X, *Y, shape); + dispatcher.Invoke(to_, *context, *X, *Y, shape); return Status::OK(); } diff --git a/onnxruntime/core/providers/op_kernel_type_control.h b/onnxruntime/core/providers/op_kernel_type_control.h index 72c78165d6..c61c0381d4 100644 --- a/onnxruntime/core/providers/op_kernel_type_control.h +++ b/onnxruntime/core/providers/op_kernel_type_control.h @@ -273,20 +273,25 @@ struct EnabledTypes { * * In MyProvider provider's implementation of MyOp kernel: * + * namespace onnxruntime { + * namespace op_kernel_type_control { * // specify supported types, i.e., the full set of types that can be enabled * ORT_SPECIFY_OP_KERNEL_ARG_SUPPORTED_TYPES( * MyProvider, DomainContainingMyOp, MyOp, Input, 0, * int, float, double); + * } // namespace op_kernel_type_control + * } // namespace onnxruntime + * + * // ... * * // get enabled types * using MyOpFirstInputEnabledTypes = - * ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST(MyProvider, DomainContainingMyOp, MyOp, Input, 0) + * ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST(MyProvider, DomainContainingMyOp, MyOp, Input, 0); * - * ... + * // ... * - * // in the implementation, we can dispatch to the enabled types - * utils::MLTypeCallDispatcherFromTypeList dispatcher{firstInputRuntimeType}; - * ... 
+ * // use MLTypeCallDispatcher to dispatch to implementations for enabled types + * using Dispatcher = onnxruntime::utils::MLTypeCallDispatcherFromTypeList; */ // all allowed type specifications should be contained in the following file diff --git a/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc b/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc new file mode 100644 index 0000000000..609f62cddf --- /dev/null +++ b/onnxruntime/test/providers/cpu/tensor/cast_op_test.cc @@ -0,0 +1,188 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include + +#include "boost/mp11.hpp" + +#include "gsl/gsl" + +#include "gtest/gtest.h" + +#include "core/framework/data_types_internal.h" + +#include "test/common/cuda_op_test_utils.h" +#include "test/providers/provider_test_utils.h" + +namespace onnxruntime { +namespace test { + +template +int GetMinRequiredCudaComputeCapability() { + return 0; +} + +template <> +int GetMinRequiredCudaComputeCapability() { + return 530; +} + +template <> +int GetMinRequiredCudaComputeCapability() { + return 800; +} + +template +void TestCastOp(gsl::span input, + gsl::span output, + const std::vector& dimensions, + OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess, + const std::string& expected_failure_string = "") { + OpTester test("Cast", 13); + test.AddAttribute("to", utils::ToTensorProtoElementType()); + test.AddInput("input", dimensions, input.data(), input.size()); + test.AddOutput("output", dimensions, output.data(), output.size()); + + std::unordered_set excluded_provider_types{kTensorrtExecutionProvider}; + const auto min_required_cuda_compute_capability = + std::max(GetMinRequiredCudaComputeCapability(), GetMinRequiredCudaComputeCapability()); + if (!HasCudaEnvironment(min_required_cuda_compute_capability)) { + excluded_provider_types.insert(kCudaExecutionProvider); + } + + test.Run(expect_result, expected_failure_string, excluded_provider_types); +} 
+ +template +using RequiresCastThroughFloat = + boost::mp11::mp_any< + std::is_same, + std::is_same>; + +template +using AnyRequireCastThroughFloat = boost::mp11::mp_any...>; + +template +typename std::enable_if::value>::type +CastSpan(gsl::span src, gsl::span dst) { + std::transform( + src.begin(), src.end(), dst.begin(), + [](SrcType s) { + return static_cast(static_cast(s)); + }); +} + +template +typename std::enable_if::value>::type +CastSpan(gsl::span src, gsl::span dst) { + std::transform( + src.begin(), src.end(), dst.begin(), + [](SrcType s) { + return static_cast(s); + }); +} + +template +std::vector CastedValues(gsl::span src) { + std::vector result(src.size()); + CastSpan(src, gsl::make_span(result)); + return result; +} + +struct CastNonStringTester { + template + void operator()(const std::pair&) { + SCOPED_TRACE( + onnxruntime::MakeString( + "Cast from type ", utils::ToTensorProtoElementType(), + " to type ", utils::ToTensorProtoElementType())); + + const std::vector input_int_values{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + const TensorShape shape{2, 3, 2, 2}; + const size_t size = gsl::narrow(shape.Size()); + ASSERT_EQ(input_int_values.size(), size); + + auto input_buffer = onnxruntime::make_unique(size); + auto input_span = gsl::make_span(input_buffer.get(), size); + CastSpan(gsl::make_span(input_int_values), input_span); + + auto output_buffer = onnxruntime::make_unique(size); + auto output_span = gsl::make_span(output_buffer.get(), size); + CastSpan(input_span, output_span); + + TestCastOp(input_span, output_span, shape.GetDims()); + } +}; + +using CastNonStringTypes = + boost::mp11::mp_list< + bool, + float, double, + uint8_t, uint16_t, uint32_t, uint64_t, + int8_t, int16_t, int32_t, int64_t, + MLFloat16, BFloat16>; + +TEST(CastOpTest, NonStringTypes) { + boost::mp11::mp_for_each>( + CastNonStringTester{}); +} + +TEST(CastOpTest, FromString) { + const std::vector shape{2, 2, 2}; + const std::vector 
string_data = {"-inf", "+INF", "0.9767611", "0.28280696", + "-0.12019656", "5.0", "NaN", "nan"}; + const std::vector float_output = {-(std::numeric_limits::infinity()), std::numeric_limits::infinity(), + 0.9767611f, 0.28280696f, + -0.12019656f, 5.0f, NAN, NAN}; + TestCastOp(gsl::make_span(string_data), gsl::make_span(float_output), shape); + + const std::vector float16_string_data = {"-inf", "+INF", "0.5", "0.25", + "0.0", "-1.0", "-1.5", "NaN"}; + const std::vector float16_output = + CastedValues( + gsl::make_span( + std::vector{ + -std::numeric_limits::infinity(), std::numeric_limits::infinity(), 0.5f, 0.25f, + 0.0f, -1.0f, -1.5f, NAN})); + TestCastOp(gsl::make_span(float16_string_data), gsl::make_span(float16_output), shape); + + const std::vector int_16_string_data = {"0", "1", "2", "3", "4", "5", "-32768", "32767"}; + const std::vector int_16_output = {0, 1, 2, 3, 4, 5, SHRT_MIN, SHRT_MAX}; + TestCastOp(gsl::make_span(int_16_string_data), gsl::make_span(int_16_output), shape); + + const std::vector int_64_string_data = {"0", "1", "2", "3", "4", "5", "-9223372036854775808", "9223372036854775807"}; + const std::vector int_64_output = {0, 1, 2, 3, 4, 5, LLONG_MIN, LLONG_MAX}; + TestCastOp(gsl::make_span(int_64_string_data), gsl::make_span(int_64_output), shape); +} + +TEST(CastOpTest, ToString) { + const std::vector shape{2, 2, 2}; + const std::vector float_input = {NAN, -1.f, 0.0391877927f, 0.296140194f, -0.120196559f, 5.0f, + -std::numeric_limits::infinity(), + std::numeric_limits::infinity()}; + + // float output precision is 8, so the expected output differs slightly from the input due to that + const std::vector string_output = {"NaN", "-1", "0.039187793", "0.29614019", + "-0.12019656", "5", "-INF", "INF"}; + TestCastOp(gsl::make_span(float_input), gsl::make_span(string_output), shape); + + const std::vector float16_input = + CastedValues( + gsl::make_span( + std::vector{ + -std::numeric_limits::infinity(), std::numeric_limits::infinity(), 0.5f, 0.25f, + 
0.0f, -1.0f, -1.5f, NAN})); + const std::vector float16_string_output = {"-INF", "INF", "0.5", "0.25", + "0", "-1", "-1.5", "NaN"}; + TestCastOp(gsl::make_span(float16_input), gsl::make_span(float16_string_output), shape); + + const std::vector int_string_data = {"0", "1", "2", "3", "4", "5", "6", "7"}; + const std::vector int_16_input = {0, 1, 2, 3, 4, 5, 6, 7}; + TestCastOp(gsl::make_span(int_16_input), gsl::make_span(int_string_data), shape); +} + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/tensor/tensor_op_test.cc b/onnxruntime/test/providers/cpu/tensor/tensor_op_test.cc index 0f6a75a3d6..279d82654b 100644 --- a/onnxruntime/test/providers/cpu/tensor/tensor_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/tensor_op_test.cc @@ -84,245 +84,6 @@ TEST(TensorOpTest, ShapeTest3D) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //TensorRT: volume of dimensions is not consistent with weights size } -template -void TestCastOp(const std::initializer_list& input, - const std::initializer_list& output, - const std::vector& dimensions, - int64_t toType, - ExpectResult expect_result = ExpectResult::kExpectSuccess, - const std::string& expected_failure_string = "") { - OpTester test("Cast", 9); - test.AddAttribute("to", toType); - test.AddInput("input", dimensions, input); - test.AddOutput("output", dimensions, output); - test.Run(expect_result, expected_failure_string, {kTensorrtExecutionProvider}); -} - -template -void TestCastFromSrc() { - std::initializer_list input_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - const std::vector shape{3, 2, 2}; - - auto float_output = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; - TestCastOp(input_data, float_output, shape, TensorProto::FLOAT); - - auto double_output = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0}; - TestCastOp(input_data, double_output, shape, TensorProto::DOUBLE); - - auto 
bool_output = {false, true, true, true, true, true, true, true, true, true, true, true}; - TestCastOp(input_data, bool_output, shape, TensorProto::BOOL); - - const std::initializer_list uint8_t_output{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - TestCastOp(input_data, uint8_t_output, shape, TensorProto::UINT8); - - const std::initializer_list uint16_t_output{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - TestCastOp(input_data, uint16_t_output, shape, TensorProto::UINT16); - - const std::initializer_list uint32_t_output{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - TestCastOp(input_data, uint32_t_output, shape, TensorProto::UINT32); - - const std::initializer_list uint64_t_output{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - TestCastOp(input_data, uint64_t_output, shape, TensorProto::UINT64); - - const std::initializer_list int16_t_output{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - TestCastOp(input_data, int16_t_output, shape, TensorProto::INT16); - - const std::initializer_list int32_t_output{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - TestCastOp(input_data, int32_t_output, shape, TensorProto::INT32); - - const std::initializer_list int64_t_output{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - TestCastOp(input_data, int64_t_output, shape, TensorProto::INT64); -}; - -TEST(TensorOpTest, Cast) { - TestCastFromSrc(); - TestCastFromSrc(); - TestCastFromSrc(); - TestCastFromSrc(); - TestCastFromSrc(); - TestCastFromSrc(); - TestCastFromSrc(); - TestCastFromSrc(); - TestCastFromSrc(); - TestCastFromSrc(); -} - -TEST(TensorOpTest, CastFromBool) { - auto bool_data = {false, true, true, true, true, true, true, true, true, true, false, true}; - const std::vector shape{3, 2, 2}; - - const std::initializer_list float_output = {0.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 1.0f}; - TestCastOp(bool_data, float_output, shape, TensorProto::FLOAT); - - const std::initializer_list double_output = {0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0}; - TestCastOp(bool_data, double_output, 
shape, TensorProto::DOUBLE); - - auto bool_output = {false, true, true, true, true, true, true, true, true, true, false, true}; - TestCastOp(bool_data, bool_output, shape, TensorProto::BOOL); - - const std::initializer_list uint8_t_output{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1}; - TestCastOp(bool_data, uint8_t_output, shape, TensorProto::UINT8); - - const std::initializer_list uint16_t_output{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1}; - TestCastOp(bool_data, uint16_t_output, shape, TensorProto::UINT16); - - const std::initializer_list uint32_t_output{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1}; - TestCastOp(bool_data, uint32_t_output, shape, TensorProto::UINT32); - - const std::initializer_list uint64_t_output{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1}; - TestCastOp(bool_data, uint64_t_output, shape, TensorProto::UINT64); - - const std::initializer_list int16_t_output{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1}; - TestCastOp(bool_data, int16_t_output, shape, TensorProto::INT16); - - const std::initializer_list int32_t_output{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1}; - TestCastOp(bool_data, int32_t_output, shape, TensorProto::INT32); - - const std::initializer_list int64_t_output{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1}; - TestCastOp(bool_data, int64_t_output, shape, TensorProto::INT64); - - const std::initializer_list float16_output{ - MLFloat16(math::floatToHalf(0.0f)), - MLFloat16(math::floatToHalf(1.0f)), - MLFloat16(math::floatToHalf(1.0f)), - MLFloat16(math::floatToHalf(1.0f)), - MLFloat16(math::floatToHalf(1.0f)), - MLFloat16(math::floatToHalf(1.0f)), - MLFloat16(math::floatToHalf(1.0f)), - MLFloat16(math::floatToHalf(1.0f)), - MLFloat16(math::floatToHalf(1.0f)), - MLFloat16(math::floatToHalf(1.0f)), - MLFloat16(math::floatToHalf(0.0f)), - MLFloat16(math::floatToHalf(1.0f))}; - TestCastOp(bool_data, float16_output, shape, TensorProto::FLOAT16); -} - -TEST(TensorOpTest, CastToFloat16) { - const std::vector shape{3, 2, 2}; - std::initializer_list float_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 
7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; - const std::initializer_list float16_output{ - MLFloat16(math::floatToHalf(0.0f)), - MLFloat16(math::floatToHalf(1.0f)), - MLFloat16(math::floatToHalf(2.0f)), - MLFloat16(math::floatToHalf(3.0f)), - MLFloat16(math::floatToHalf(4.0f)), - MLFloat16(math::floatToHalf(5.0f)), - MLFloat16(math::floatToHalf(6.0f)), - MLFloat16(math::floatToHalf(7.0f)), - MLFloat16(math::floatToHalf(8.0f)), - MLFloat16(math::floatToHalf(9.0f)), - MLFloat16(math::floatToHalf(10.0f)), - MLFloat16(math::floatToHalf(11.0f))}; - - TestCastOp(float_data, float16_output, shape, TensorProto::FLOAT16); - - std::initializer_list uint8_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - TestCastOp(uint8_t_data, float16_output, shape, TensorProto::FLOAT16); - - std::initializer_list uint16_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - TestCastOp(uint16_t_data, float16_output, shape, TensorProto::FLOAT16); - - std::initializer_list uint32_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - TestCastOp(uint32_t_data, float16_output, shape, TensorProto::FLOAT16); - - std::initializer_list uint64_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - TestCastOp(uint64_t_data, float16_output, shape, TensorProto::FLOAT16); - - std::initializer_list int8_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - TestCastOp(int8_t_data, float16_output, shape, TensorProto::FLOAT16); - - std::initializer_list int16_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - TestCastOp(int16_t_data, float16_output, shape, TensorProto::FLOAT16); - - std::initializer_list int32_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - TestCastOp(int32_t_data, float16_output, shape, TensorProto::FLOAT16); - - std::initializer_list int64_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - TestCastOp(int64_t_data, float16_output, shape, TensorProto::FLOAT16); -} - -TEST(TensorOpTest, CastFromFloat16) { - const std::vector shape{3, 2, 2}; - const std::initializer_list float_output = {0.0f, 1.0f, 2.0f, 3.0f, 
4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; - const std::initializer_list input = { - MLFloat16(math::floatToHalf(0.0f)), - MLFloat16(math::floatToHalf(1.0f)), - MLFloat16(math::floatToHalf(2.0f)), - MLFloat16(math::floatToHalf(3.0f)), - MLFloat16(math::floatToHalf(4.0f)), - MLFloat16(math::floatToHalf(5.0f)), - MLFloat16(math::floatToHalf(6.0f)), - MLFloat16(math::floatToHalf(7.0f)), - MLFloat16(math::floatToHalf(8.0f)), - MLFloat16(math::floatToHalf(9.0f)), - MLFloat16(math::floatToHalf(10.0f)), - MLFloat16(math::floatToHalf(11.0f))}; - - TestCastOp(input, float_output, shape, TensorProto::FLOAT); - - auto bool_data = {false, true, true, true, true, true, true, true, true, true, true, true}; - TestCastOp(input, bool_data, shape, TensorProto::BOOL); - - std::initializer_list uint8_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - TestCastOp(input, uint8_t_data, shape, TensorProto::UINT8); - - std::initializer_list uint16_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - TestCastOp(input, uint16_t_data, shape, TensorProto::UINT16); - - std::initializer_list uint32_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - TestCastOp(input, uint32_t_data, shape, TensorProto::UINT32); - - std::initializer_list uint64_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - TestCastOp(input, uint64_t_data, shape, TensorProto::UINT64); - - std::initializer_list int8_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - TestCastOp(input, int8_t_data, shape, TensorProto::INT8); - - std::initializer_list int16_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - TestCastOp(input, int16_t_data, shape, TensorProto::INT16); - - std::initializer_list int32_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - TestCastOp(input, int32_t_data, shape, TensorProto::INT32); - - std::initializer_list int64_t_data = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - TestCastOp(input, int64_t_data, shape, TensorProto::INT64); -} - -TEST(TensorOpTest, CastFromString) { - const std::vector shape{2, 2, 2}; - 
std::initializer_list string_data = {"-inf", "+INF", "0.9767611", "0.28280696", - "-0.12019656", "5.0", "NaN", "nan"}; - const std::initializer_list float_output = {-(std::numeric_limits::infinity()), std::numeric_limits::infinity(), - 0.9767611f, 0.28280696f, - -0.12019656f, 5.0f, NAN, NAN}; - TestCastOp(string_data, float_output, shape, TensorProto::FLOAT); - - std::initializer_list int_16_string_data = {"0", "1", "2", "3", "4", "5", "-32768", "32767"}; - const std::initializer_list int_16_output = {0, 1, 2, 3, 4, 5, SHRT_MIN, SHRT_MAX}; - TestCastOp(int_16_string_data, int_16_output, shape, TensorProto::INT16); - - std::initializer_list int_64_string_data = {"0", "1", "2", "3", "4", "5", "-9223372036854775808", "9223372036854775807"}; - const std::initializer_list int_64_output = {0, 1, 2, 3, 4, 5, LLONG_MIN, LLONG_MAX}; - TestCastOp(int_64_string_data, int_64_output, shape, TensorProto::INT64); -} - -TEST(TensorOpTest, CastToString) { - const std::vector shape{2, 2, 2}; - const std::initializer_list float_input = {NAN, -1.f, 0.0391877927f, 0.296140194f, -0.120196559f, 5.0f, - -std::numeric_limits::infinity(), - std::numeric_limits::infinity()}; - - // float output precision is 8, so the expected output differs slightly from the input due to that - std::initializer_list string_output = {"NaN", "-1", "0.039187793", "0.29614019", - "-0.12019656", "5", "-INF", "INF"}; - TestCastOp(float_input, string_output, shape, TensorProto::STRING); - - std::initializer_list int_string_data = {"0", "1", "2", "3", "4", "5", "6", "7"}; - const std::initializer_list int_16_input = {0, 1, 2, 3, 4, 5, 6, 7}; - TestCastOp(int_16_input, int_string_data, shape, TensorProto::STRING); -} - void MeanVarianceNormalizationFunctionDefaultPerChannel() { const int64_t N = 2, C = 2, H = 2, W = 3; diff --git a/onnxruntime/test/providers/provider_test_utils.cc b/onnxruntime/test/providers/provider_test_utils.cc index 506661a83b..38d3eb4163 100644 --- 
a/onnxruntime/test/providers/provider_test_utils.cc +++ b/onnxruntime/test/providers/provider_test_utils.cc @@ -251,9 +251,11 @@ void Check(const OpTester::Data& expected_data, threshold = 0.005f; #endif for (int i = 0; i < size; ++i) { - if (std::isinf(f_expected[i])) // Test infinity for equality - EXPECT_EQ(f_expected[i], f_output[i]) << "i:" << i; - else { + if (std::isnan(f_expected[i])) { + EXPECT_TRUE(std::isnan(f_output[i])) << "Expected NaN. i:" << i << ", provider_type: " << provider_type; + } else if (std::isinf(f_expected[i])) { // Test infinity for equality + EXPECT_EQ(f_expected[i], f_output[i]) << "Expected infinity. i:" << i << ", provider_type: " << provider_type; + } else { // the default for existing tests EXPECT_NEAR(f_expected[i], f_output[i], threshold) << "i:" << i << ", provider_type: " << provider_type; @@ -284,9 +286,11 @@ void Check(const OpTester::Data& expected_data, /// XXX: May need to adjust threshold as BFloat is coarse float threshold = 0.001f; for (int i = 0; i < size; ++i) { - if (std::isinf(f_expected[i])) // Test infinity for equality - EXPECT_EQ(f_expected[i], f_output[i]); - else { + if (std::isnan(f_expected[i])) { + EXPECT_TRUE(std::isnan(f_output[i])) << "Expected NaN. i:" << i << ", provider_type: " << provider_type; + } else if (std::isinf(f_expected[i])) { // Test infinity for equality + EXPECT_EQ(f_expected[i], f_output[i]) << "Expected infinity. i:" << i << ", provider_type: " << provider_type; + } else { // the default for existing tests const float max_value = fmax(fabs(f_expected[i]), fabs(f_output[i])); if (max_value != 0) { // max_value = 0 means output and expected are 0s. From 5fc377f21e7988b2d392a489393aeccab79d4d1c Mon Sep 17 00:00:00 2001 From: Jesse Benson Date: Tue, 12 Jan 2021 18:00:33 -0800 Subject: [PATCH 02/41] Partial updating of ROCM reduction code.
--- .../cuda/reduction/reduction_functions.h | 2 +- .../rocm/reduction/reduction_functions.cu | 485 +++++++++++++++++- .../rocm/reduction/reduction_functions.h | 104 ++++ .../providers/rocm/reduction/reduction_ops.cc | 41 +- .../providers/rocm/reduction/reduction_ops.h | 64 ++- .../rocm/reduction/reduction_utils.cuh | 28 + 6 files changed, 698 insertions(+), 26 deletions(-) diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_functions.h b/onnxruntime/core/providers/cuda/reduction/reduction_functions.h index 8d5da0381e..69988862aa 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_functions.h +++ b/onnxruntime/core/providers/cuda/reduction/reduction_functions.h @@ -12,7 +12,7 @@ namespace cuda { namespace detail { size_t compute_reduce_matrix_columns_intermediate_buffer_size( int element_size, int num_rows, int num_cols); -} +} // namespace detail /** * Computes the size in bytes of the intermediate buffer needed by reduce_matrix_columns(). diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_functions.cu b/onnxruntime/core/providers/rocm/reduction/reduction_functions.cu index e6abc5e8da..cd55592330 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_functions.cu +++ b/onnxruntime/core/providers/rocm/reduction/reduction_functions.cu @@ -1,12 +1,16 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
+#include "core/providers/rocm/reduction/reduction_functions.h" + #include + #include #include -#include "core/providers/rocm/cu_inc/common.cuh" +#include "core/common/common.h" #include "core/providers/rocm/atomic/common.cuh" -#include "core/providers/rocm/reduction/reduction_functions.h" +#include "core/providers/rocm/cu_inc/common.cuh" +#include "core/providers/rocm/shared_inc/rocm_utils.h" #include "core/providers/rocm/reduction/reduction_utils.cuh" #define NUM_ELEMENTS_PER_THREAD 4 @@ -19,6 +23,481 @@ namespace onnxruntime { namespace rocm { +namespace detail { +constexpr auto MAX_NUM_ELEMENTS_PER_THREAD = 4; +constexpr auto MAX_NUM_WARPS_PER_BLOCK = 8; +constexpr auto MAX_NUM_BLOCKS_IN_GRID_ROW = 256; +constexpr auto MAX_NUM_GRID_ROWS = 32768; + +dim3 compute_block_dim(int num_cols) { + const int x = GPU_WARP_SIZE; + const int y = std::min(MAX_NUM_WARPS_PER_BLOCK, std::max(1, num_cols / (MAX_NUM_ELEMENTS_PER_THREAD * x))); + return dim3(x, y); +} + +std::pair compute_grid_and_block_dims(int num_rows, int num_cols) { + const auto block_dim = compute_block_dim(num_cols); + const auto grid_x = + std::min( + MAX_NUM_BLOCKS_IN_GRID_ROW, + std::max(1, num_cols / (MAX_NUM_ELEMENTS_PER_THREAD * block_dim.x * block_dim.y))); + const auto grid_y = std::min(MAX_NUM_GRID_ROWS, num_rows); + const dim3 grid_dim(grid_x, grid_y); + return {grid_dim, block_dim}; +} + +uintptr_t round_up_to_aligned(uintptr_t original, size_t alignment) { + assert((alignment & (alignment - 1)) == 0); + const size_t alignment_mask = ~(alignment - 1); + return (original + alignment - 1) & alignment_mask; +} + +/** + * call_reduce_matrix_columns() intermediate buffer layout + * + * Given buffer element type TBuf, the intermediate buffer layout looks like this: + * + * ----- + * m * num_blocks_per_row * sizeof(TBuf) bytes for block reductions per row + * alignment padding bytes as needed + * m * sizeof(int) bytes for block done counts per row + * ----- + */ + +size_t 
compute_reduce_matrix_columns_intermediate_buffer_size( + int element_size, int num_rows, int num_cols) { + ORT_ENFORCE(element_size >= 0 && num_rows >= 0 && num_cols >= 0); + + const auto grid_dim = compute_grid_and_block_dims(num_rows, num_cols).first; + + size_t buffer_size{}; + + // at the beginning, for sizing purposes, assume we are aligned + buffer_size += static_cast(num_rows) * grid_dim.x * element_size; + + buffer_size = round_up_to_aligned(buffer_size, alignof(int)); + buffer_size += static_cast(num_rows) * sizeof(int); + + // add padding to give us room to align + buffer_size += alignof(max_align_t) - 1; + + return buffer_size; +} + +template +Status get_reduction_buffers( + int num_rows, int num_cols, void* buffer, size_t buffer_size, + TBuf*& block_reductions_buffer, int*& block_done_counts_buffer) { + const auto grid_dim = compute_grid_and_block_dims(num_rows, num_cols).first; + + const uintptr_t begin_addr = reinterpret_cast(buffer); + const uintptr_t block_reductions_addr = + round_up_to_aligned(begin_addr, alignof(TBuf)); + const uintptr_t block_done_counts_buffer_addr = + round_up_to_aligned( + block_reductions_addr + static_cast(num_rows) * grid_dim.x * sizeof(TBuf), alignof(int)); + const uintptr_t end_addr = + block_done_counts_buffer_addr + static_cast(num_rows) * sizeof(int); + const size_t required_size = end_addr - begin_addr; + + ORT_RETURN_IF_NOT( + required_size <= buffer_size, + "Buffer size is too small (", buffer_size, " bytes). 
", + "At least ", required_size, " bytes are needed from the given base address (", buffer, ")."); + + block_reductions_buffer = reinterpret_cast(block_reductions_addr); + block_done_counts_buffer = reinterpret_cast(block_done_counts_buffer_addr); + + return Status::OK(); +} + +template +__device__ void reduce_all( + const int num_elements, const TIn* const input, TOut* const output, + TBuf* const block_reductions_buffer, int* const block_done_count_buffer) { + extern __shared__ unsigned char shared_memory_bytes[]; + TBuf* shared_memory = reinterpret_cast(shared_memory_bytes); + // Thread-level indices: + // Linear index of thread in block. + const int tid_in_block = threadIdx.y * blockDim.x + threadIdx.x; + // Total number of threads in a 2-D block. + const int num_threads_in_block = blockDim.x * blockDim.y; + + // Warp-level indices: + // Warp index of thread. + const int wid_in_block = tid_in_block / GPU_WARP_SIZE; + // Lane index of thread. + const int lid_in_block = tid_in_block % GPU_WARP_SIZE; + // Warp count per block. + const int num_warps_in_block = num_threads_in_block / GPU_WARP_SIZE; + + // Grid-level indices: + // Linear index of block in grid row. + const int bid_in_grid_row = blockIdx.x; + // Linear index of thread in grid row. + const int tid_in_grid_row = bid_in_grid_row * (blockDim.x * blockDim.y) + tid_in_block; + // Total number of blocks in a grid row. + const int num_blocks_in_grid_row = gridDim.x; + // Total number of threads in a grid row with 2-D blocks. + const int num_threads_in_grid_row = num_blocks_in_grid_row * num_threads_in_block; + + const auto write_result = [&output, &num_elements](const TOut result) { + // Compilation time if-else branch controlled by template argument can be + // optimized out, so there will be no branch in real computation phase. 
+ if (DivideResultBySize) { + output[0] = TFinalOp()(result / TOut(num_elements)); + } else { + output[0] = TFinalOp()(result); + } + }; + + // Thread-level reduction (storage change: global memory -> register). + // One thread reduces MAX_NUM_ELEMENTS_PER_THREAD elements to a thread register + // in one iteration. + TBuf value = 0; + for (int id = tid_in_grid_row; id < num_elements; id += MAX_NUM_ELEMENTS_PER_THREAD * num_threads_in_grid_row) { + TIn v[MAX_NUM_ELEMENTS_PER_THREAD]; + +#pragma unroll + for (int i = 0; i < MAX_NUM_ELEMENTS_PER_THREAD; i++) { + const int offset = id + i * num_threads_in_grid_row; + if (offset < num_elements) { + v[i] = input[offset]; + } + } + +#pragma unroll + for (int i = 0; i < MAX_NUM_ELEMENTS_PER_THREAD; i++) { + const int offset = id + i * num_threads_in_grid_row; + if (offset < num_elements) { + value += TOp()(TBuf(v[i])); + } + } + } + +#if __ROCM_ARCH__ >= 700 + __syncwarp(); +#else + __syncthreads(); +#endif + + // Warp-level reduction (storage change: register -> register). + // The values in a warp will be summed up to a scalar. After warp-level + // reduction, each block holds num_warps_in_block values in the shared memory. +#pragma unroll + for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) { + value += WARP_SHFL_DOWN(value, stride); + } + + // Return early if only one warp is used for reduction. + // Given a fixed amount of threads, we prefer threads over warps over blocks so that we never have cases such as + // 1. two blocks and each of them has only 1 warp (32 threads). + // 2. two warps and each of them has only 2 threads. + if (num_warps_in_block == 1) { + if (tid_in_grid_row == 0) { + write_result(value); + } + return; + } + + if (lid_in_block == 0) { + shared_memory[wid_in_block] = value; + } + + __syncthreads(); + + // Block-level reduction (storage change: shared memory -> global memory). + // The values in a block will be summed up to a scalar. 
+ // Note that the values are stored in the shared memory. + // Here we assume that the size of shared_memory is smaller + // than num_warps_in_block, so we just keep halving the number + // of threads in each iteration. Our assumption is always true because + // the size of shared_memory equals to the number of warps. +#pragma unroll + for (int stride = MAX_NUM_WARPS_PER_BLOCK / 2; stride > 0; stride /= 2) { + if (tid_in_block + stride < num_warps_in_block) { + shared_memory[tid_in_block] += shared_memory[tid_in_block + stride]; + } + __syncthreads(); + } + + // Return early if only one block is used for reduction. + if (num_blocks_in_grid_row == 1) { + if (tid_in_grid_row == 0) { + write_result(shared_memory[0]); + } + return; + } + + if (tid_in_block == 0) { + block_reductions_buffer[bid_in_grid_row] = shared_memory[0]; + } + + __threadfence(); + __syncthreads(); + + // Grid-level reduction. We use the last block to sum up values + // stored in the global block_reductions_buffer. + __shared__ bool is_last_block_done; + + if (tid_in_block == 0) { + const int count = atomicAdd(block_done_count_buffer, 1); + is_last_block_done = (count == (num_blocks_in_grid_row - 1)); + } + + // All threads in each block see if they belong the last active block + // (i.e., the value of is_last_block_done). + __syncthreads(); + + // Only the block which saw that count equals to num_blocks_in_grid_row - 1 can + // enter the following block. + if (is_last_block_done) { + const int pow2_bound = least_pow2_bound(num_blocks_in_grid_row); + for (int stride = pow2_bound / 2; stride > 0; stride /= 2) { + if (tid_in_block < stride && tid_in_block + stride < num_blocks_in_grid_row) { + block_reductions_buffer[tid_in_block] += block_reductions_buffer[tid_in_block + stride]; + } + __syncthreads(); + } + + // The first thread in the last block assigns the final output. 
+ if (tid_in_block == 0) { + write_result(block_reductions_buffer[0]); + } + } +} + +template +__global__ void reduce_matrix_columns_kernel( + const int num_rows, const int num_cols, const TIn* const input, TOut* const output, + TBuf* const block_reductions_buffer, int* const block_done_counts_buffer) { + const int num_blocks_in_grid_row = gridDim.x; + const int row_id_in_grid = blockIdx.y; + const int num_grid_rows = gridDim.y; + + // one row per iteration + // row_id is int64_t to avoid int overflow in offset calculations + for (int64_t row_id = row_id_in_grid; row_id < num_rows; row_id += num_grid_rows) { + const TIn* const row_data = input + row_id * num_cols; + TOut* const row_output = output + row_id; + TBuf* const row_block_reductions_buffer = block_reductions_buffer + row_id * num_blocks_in_grid_row; + int* const row_block_done_counts_buffer = block_done_counts_buffer + row_id; + + reduce_all( + num_cols, row_data, row_output, + row_block_reductions_buffer, row_block_done_counts_buffer); + } +} + +template +Status call_reduce_matrix_columns( + const TIn* input, TOut* output, const int num_rows, const int num_cols, void* buffer, size_t buffer_size) { + ORT_ENFORCE(num_rows >= 0 && num_cols >= 0); + + using TBuf = AccumulationType_t; + + const auto grid_and_block_dims = compute_grid_and_block_dims(num_rows, num_cols); + const dim3& grid_dim = grid_and_block_dims.first; + const dim3& block_dim = grid_and_block_dims.second; + + TBuf* block_reductions_buffer; + int* block_done_counts_buffer; + ORT_RETURN_IF_ERROR(get_reduction_buffers( + num_rows, num_cols, buffer, buffer_size, + block_reductions_buffer, block_done_counts_buffer)); + + // If more than one block is used per grid row, then inter-block reduction is needed. 
+ if (grid_dim.x > 1) { + HIP_RETURN_IF_ERROR(hipMemsetAsync(block_done_counts_buffer, 0, num_rows * sizeof(int))); + } + + const int shared_mem_size = sizeof(TBuf) * block_dim.x * block_dim.y / GPU_WARP_SIZE; + hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_matrix_columns_kernel), + grid_dim, block_dim, shared_mem_size, 0, + num_rows, num_cols, input, output, block_reductions_buffer, block_done_counts_buffer); + + return Status::OK(); +} +} // namespace detail + +template +Status reduce_sum( + const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { + return detail::call_reduce_matrix_columns( + input, output, 1, size, buffer, buffer_size); +} + +template +Status reduce_square_sum( + const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { + return detail::call_reduce_matrix_columns( + input, output, 1, size, buffer, buffer_size); +} + +template +Status reduce_l2_norm( + const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { + return detail::call_reduce_matrix_columns( + input, output, 1, size, buffer, buffer_size); +} + +template +Status reduce_mean( + const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { + return detail::call_reduce_matrix_columns( + input, output, 1, size, buffer, buffer_size); +} + +#define INSTANTIATE_REDUCE_SUM(TIn, TOut) \ + template Status reduce_sum(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) +INSTANTIATE_REDUCE_SUM(half, float); +INSTANTIATE_REDUCE_SUM(float, float); +INSTANTIATE_REDUCE_SUM(double, double); +#undef INSTANTIATE_REDUCE_SUM + +#define INSTANTIATE_REDUCE_SQUARE_SUM(TIn, TOut) \ + template Status reduce_square_sum(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) +INSTANTIATE_REDUCE_SQUARE_SUM(half, float); +INSTANTIATE_REDUCE_SQUARE_SUM(float, float); +INSTANTIATE_REDUCE_SQUARE_SUM(double, double); +#undef INSTANTIATE_REDUCE_SQUARE_SUM + +#define INSTANTIATE_REDUCE_L2_NORM(TIn, TOut) \ 
+ template Status reduce_l2_norm(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) +INSTANTIATE_REDUCE_L2_NORM(half, float); +INSTANTIATE_REDUCE_L2_NORM(float, float); +INSTANTIATE_REDUCE_L2_NORM(double, double); +#undef INSTANTIATE_REDUCE_L2_NORM + +#define INSTANTIATE_REDUCE_MEAN(TIn, TOut) \ + template Status reduce_mean(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) +INSTANTIATE_REDUCE_MEAN(half, float); +INSTANTIATE_REDUCE_MEAN(float, float); +INSTANTIATE_REDUCE_MEAN(double, double); +#undef INSTANTIATE_REDUCE_MEAN + +namespace detail { +template +__global__ void reduce_matrix_rows_kernel(const TIn* input, TOut* output, int m, int n) { + constexpr int x_load_count_per_thread = 1; + constexpr int y_load_count_per_thread = 4; + const int t_count_x_in_grid = blockDim.x * gridDim.x; + const int t_count_y_in_grid = blockDim.y * gridDim.y; + const int x_grid_stride = t_count_x_in_grid * x_load_count_per_thread; + const int y_grid_stride = t_count_y_in_grid * y_load_count_per_thread; + const int tid_x_in_grid = threadIdx.x + blockDim.x * blockIdx.x; + const int tid_y_in_grid = threadIdx.y + blockDim.y * blockIdx.y; + const int tid_in_block = threadIdx.x + blockDim.x * threadIdx.y; + + // Shape is blockDim.y-by-blockDim.x and element type is TBuf. + extern __shared__ unsigned char shared_memory_bytes[]; + TBuf* shared_memory = reinterpret_cast(shared_memory_bytes); + + // to prevent int overflow in index calculation for input size m*n + const int64_t n_int64 = static_cast(n); + + for (int col = tid_x_in_grid; col < n; col += x_grid_stride) { + shared_memory[tid_in_block] = TBuf(0.0f); + TBuf sum = TBuf(0.0f); + // This loop loads multiple blockDim.y-by-blockDim.x sub-tensors from the input. + for (int row = tid_y_in_grid; row < m; row += y_grid_stride) { + // Thread-level reduction. Each thread loads y_load_count_per_thread values + // and aggregates them.
+#pragma unroll(y_load_count_per_thread) + for (int row_inner = 0; row_inner < y_load_count_per_thread; ++row_inner) { + int row_final = row + row_inner * t_count_y_in_grid; + int col_final = col; + if (row_final < m && col_final < n) { + sum += TBuf(input[row_final * n_int64 + col_final]); + } + } + } + // Write thread-level reduction result into shared memory. + shared_memory[tid_in_block] = sum; + + // Wait all threads to finish their thread-level reductions. + __syncthreads(); + +// This loop conducts reduction on elements stored in shared memory. +// Each block reduces blockDim.y-by-blockDim.x tensor to 1-by-blockDim.x tensor. +#pragma unroll(4) + for (int stride = blockDim.y / 2; stride > 0; stride /= 2) { + if (threadIdx.y < stride) { + shared_memory[tid_in_block] += shared_memory[tid_in_block + stride * blockDim.x]; + } + __syncthreads(); + } + + if (threadIdx.y == 0) { + atomic_add(output + col, TOut(shared_memory[threadIdx.x])); + } + } +} + +template +Status call_reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, bool reset_initial_output) { + ORT_ENFORCE(m >= 0 && n >= 0); + + if (reset_initial_output) { + HIP_RETURN_IF_ERROR(hipMemsetAsync(output, 0, n * sizeof(TOut))); + } + + constexpr int max_num_threads_in_block = 512; + constexpr int max_num_blocks_in_grid = 512; + constexpr int load_count_per_thread = 4; + + const int block_x_dim = least_pow2_bound(std::max(1, std::min(n, GPU_WARP_SIZE))); + const int block_y_dim = least_pow2_bound(std::max(1, std::min(max_num_threads_in_block / block_x_dim, m / load_count_per_thread))); + const int grid_x_dim = std::max(1, std::min(n / block_x_dim, max_num_blocks_in_grid)); + const int grid_y_dim = std::max(1, std::min(max_num_blocks_in_grid / grid_x_dim, m / block_y_dim / 4)); + + const dim3 grid(grid_x_dim, grid_y_dim, 1); + const dim3 block(block_x_dim, block_y_dim, 1); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_matrix_rows_kernel), + grid, block, block.y * block.x * sizeof(TBuf), 0, + input, 
output, m, n); + + return Status::OK(); +} +} // namespace detail + +template +Status reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, bool reset_initial_output) { + using TBuf = AccumulationType_t; + return detail::call_reduce_matrix_rows(input, output, m, n, reset_initial_output); +} + +#define INSTANTIATE_REDUCE_MATRIX_ROWS(T) \ + template Status reduce_matrix_rows(const T* input, T* output, int m, int n, bool reset_initial_output) +INSTANTIATE_REDUCE_MATRIX_ROWS(half); +INSTANTIATE_REDUCE_MATRIX_ROWS(float); +INSTANTIATE_REDUCE_MATRIX_ROWS(double); +#undef INSTANTIATE_REDUCE_MATRIX_ROWS + +template +Status reduce_matrix_columns(const TIn* input, TOut* output, int m, int n, void* buffer, size_t buffer_size) { + return detail::call_reduce_matrix_columns( + input, output, m, n, buffer, buffer_size); +} + +#define INSTANTIATE_REDUCE_MATRIX_COLUMNS(T) \ + template Status reduce_matrix_columns(const T* input, T* output, int m, int n, void* buffer, size_t buffer_size) +INSTANTIATE_REDUCE_MATRIX_COLUMNS(half); +INSTANTIATE_REDUCE_MATRIX_COLUMNS(float); +INSTANTIATE_REDUCE_MATRIX_COLUMNS(double); +#undef INSTANTIATE_REDUCE_MATRIX_COLUMNS + + + + + + + +// +// TODO: DELETE EVERYTHING BELOW +// + std::pair compute_block_size(int size) { int x = GPU_WARP_SIZE; int y = std::min(NUM_WARPS_PER_BLOCK, std::max(1, size / (NUM_ELEMENTS_PER_THREAD * GPU_WARP_SIZE))); @@ -37,7 +516,7 @@ int compute_reduction_buffer_size(int element_size, int size) { template __global__ void reduce_all_kernel(const int size, const TIn * data, TOut* output, TOut* buffer) { - HIP_DYNAMIC_SHARED( unsigned char, shared_memory_) + extern __shared__ unsigned char shared_memory_[]; TOut* shared_memory = reinterpret_cast(shared_memory_); // Thread-level indexes: // Linear index of thread in block. 
diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_functions.h b/onnxruntime/core/providers/rocm/reduction/reduction_functions.h index 5bc2df04c0..3b8c796b42 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_functions.h +++ b/onnxruntime/core/providers/rocm/reduction/reduction_functions.h @@ -2,11 +2,115 @@ // Licensed under the MIT License. #pragma once + #include "core/providers/rocm/rocm_common.h" +#include "core/providers/rocm/shared_inc/accumulation_type.h" namespace onnxruntime { namespace rocm { +namespace detail { +size_t compute_reduce_matrix_columns_intermediate_buffer_size( + int element_size, int num_rows, int num_cols); +} // namespace detail + +/** + * Computes the size in bytes of the intermediate buffer needed by reduce_matrix_columns(). + * @tparam TIn The input data type. + * @param m The number of matrix rows. + * @param n The number of matrix columns. + * @return The size of the intermediate buffer. + */ +template +size_t compute_reduce_matrix_columns_buffer_size(int m, int n) { + using TBuf = AccumulationType_t; + return detail::compute_reduce_matrix_columns_intermediate_buffer_size( + sizeof(TBuf), m, n); +} + +/** + * Computes the size in bytes of the intermediate buffer needed by the reduce_x() functions. + * @tparam TIn The input data type. + * @param size The number of elements. + * @return The size of the intermediate buffer. + */ +template +size_t compute_reduction_buffer_size(int size) { + using TBuf = AccumulationType_t; + return detail::compute_reduce_matrix_columns_intermediate_buffer_size( + sizeof(TBuf), 1, size); +} + +/** Computes the sum of the given elements. */ +template +Status reduce_sum(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); + +/** Computes the sum of the squares of the given elements. */ +template +Status reduce_square_sum(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); + +/** Computes the L2 norm of the given elements. 
*/ +template +Status reduce_l2_norm(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); + +/** Computes the mean of the given elements. */ +template +Status reduce_mean(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); + +enum class ApplicableMatrixReduction { + // can use reduce_matrix_rows() + Rows, + // can use reduce_matrix_columns() + Columns, + // no optimized matrix reduction function applies + None, +}; + +/** + * Determines whether a MIOpen reduction can be computed by an optimized matrix reduction function. + * @param miopen_reduce_op The MIOpen reduction op type. + * @param dims The input dimensions. + * @param axes The reduction axes. + * @param[out] m If matrix reduction is possible, the number of matrix rows to use. + * @param[out] n If matrix reduction is possible, the number of matrix columns to use. + * @return The type of matrix reduction that can be done. + */ +ApplicableMatrixReduction get_applicable_matrix_reduction( + const miopenReduceTensorOp_t miopen_reduce_op, + const std::vector& dims, const std::vector& axes, + int& m, int& n); + +/** + * Reduces the rows in a row-major matrix to a single row containing the sum of each column. + * @param input The input data. + * @param output The output data. + * @param m The number of matrix rows. + * @param n The number of matrix columns. + * @param reset_initial_output Whether to reset (i.e., zero) the output values first. + */ +template +Status reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, bool reset_initial_output /* TODO: = true*/); + +/** + * Reduces the columns in a row-major matrix to a single column containing the sum of each row. + * @param input The input data. + * @param output The output data. + * @param m The number of matrix rows. + * @param n The number of matrix columns. + * @param buffer The intermediate buffer. + * @param buffer_size The size of the intermediate buffer in bytes.
+ */ +template +Status reduce_matrix_columns(const TIn* input, TOut* output, int m, int n, void* buffer, size_t buffer_size); + + + + + +// +// TODO: DELETE EVERYTHING BELOW +// + int compute_reduction_buffer_size(int element_size, int size); template diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc index cb66a50764..b7f742eca7 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc @@ -1,14 +1,16 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "reduction_ops.h" +#include "core/providers/rocm/reduction/reduction_ops.h" + +#include "core/framework/data_types_internal.h" +#include "core/framework/op_kernel_context_internal.h" #include "core/providers/common.h" +#include "core/providers/cpu/tensor/utils.h" #include "core/providers/rocm/miopen_common.h" -#include "core/providers/rocm/math/unary_elementwise_ops_impl.h" #include "core/providers/rocm/math/binary_elementwise_ops_impl.h" #include "core/providers/rocm/math/binary_elementwise_ops.h" -#include "core/providers/cpu/tensor/utils.h" -#include "core/framework/op_kernel_context_internal.h" +#include "core/providers/rocm/math/unary_elementwise_ops_impl.h" using namespace onnxruntime::common; namespace onnxruntime { @@ -76,7 +78,7 @@ namespace rocm { KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ name); -// CUDA ArgMax/ArgMin doesn't have OpSet12 implementation (with select_last_index attr), keep it in OpSet11 for now. +// ROCM ArgMax/ArgMin doesn't have OpSet12 implementation (with select_last_index attr), keep it in OpSet11 for now. 
#define REGISTER_KERNEL_TYPED_11(name, T) \ ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ name, \ @@ -95,6 +97,35 @@ namespace rocm { KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ name); +// Register with the latest version 13 +#define REGISTER_KERNEL_TYPED_13(name, T) \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + name, \ + kOnnxDomain, \ + 1, 10, \ + T, \ + kRocmExecutionProvider, \ + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + name); \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \ + name, \ + kOnnxDomain, \ + 11, 12, \ + T, \ + kRocmExecutionProvider, \ + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + name); \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + name, \ + kOnnxDomain, \ + 13, \ + T, \ + kRocmExecutionProvider, \ + KernelDefBuilder() \ + .InputMemoryType(1) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + name); + static bool is_matrix_row_reduction( const miopenReduceTensorOp_t miopen_reduce_op, const int m, diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.h b/onnxruntime/core/providers/rocm/reduction/reduction_ops.h index a2a953e25f..fcd97d3fbe 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.h +++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.h @@ -11,15 +11,33 @@ namespace onnxruntime { namespace rocm { enum miopenReduceTensorOp_t { - MIOPEN_REDUCE_TENSOR_MAX, + MIOPEN_REDUCE_TENSOR_ADD, + MIOPEN_REDUCE_TENSOR_MUL, MIOPEN_REDUCE_TENSOR_MIN, + MIOPEN_REDUCE_TENSOR_MAX, + MIOPEN_REDUCE_TENSOR_AVG, MIOPEN_REDUCE_TENSOR_NORM1, MIOPEN_REDUCE_TENSOR_NORM2, - MIOPEN_REDUCE_TENSOR_AVG, - MIOPEN_REDUCE_TENSOR_MUL, - MIOPEN_REDUCE_TENSOR_ADD }; +enum miopenReduceTensorIndices_t { + MIOPEN_REDUCE_TENSOR_NO_INDICES, + MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES, +}; + +namespace ReductionOps { + +// Implementation that holds the core logic of reduction op processing +// `input_shape_override` is the input shape for compute 
purposes (if provided) + +template +Tensor ReduceCompute(ROCMExecutionProvider& rocm_ep, miopenReduceTensorOp_t miopen_reduce_op, AllocatorPtr allocator, + const Tensor& input, const std::vector& axes, + bool keep_dims, bool calculate_log, bool calculate_sqt, bool log_sum_exp, + bool fast_reduction, const TensorShape* input_shape_override = nullptr); + +} // namespace ReductionOps + // Holds some metadata that will be used during actual reduction op compute time struct PrepareReduceMetadata { int64_t input_count; @@ -30,24 +48,15 @@ struct PrepareReduceMetadata { std::vector squeezed_output_dims; std::vector input_dims_miopen; std::vector output_dims_miopen; + + // + // TODO: delete these fields + // int64_t rank; int64_t stride; bool contiguous_axes; }; -Status PrepareForReduce(const Tensor* X, - bool keepdims, - const std::vector& axes, - PrepareReduceMetadata& prepare_reduce_metadata, - const TensorShape* input_shape_override = nullptr); - -template -Status ReduceComputeCore(const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata, - /*out*/ Tensor& output, miopenReduceTensorOp_t miopen_reduce_op, - const std::vector& axes, - bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction, - const TensorShape* input_shape_override = nullptr); - template class ReduceKernel : public RocmKernel, public ReduceKernelBase { protected: @@ -217,5 +226,26 @@ class ReduceLogSumExp final : public ReduceKernel { } }; + +Status PrepareForReduce(const Tensor* X, + bool keepdims, + const std::vector& axes, + PrepareReduceMetadata& prepare_reduce_metadata, + const TensorShape* input_shape_override = nullptr); + +template +Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata, + /*out*/ Tensor& output, miopenReduceTensorOp_t miopen_reduce_op, + const std::vector& axes, + bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction, + const TensorShape* 
input_shape_override = nullptr); + +template +Status ReduceComputeCore(const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata, + /*out*/ Tensor& output, miopenReduceTensorOp_t miopen_reduce_op, + const std::vector& axes, + bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction, + const TensorShape* input_shape_override = nullptr); + } // namespace rocm } // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_utils.cuh b/onnxruntime/core/providers/rocm/reduction/reduction_utils.cuh index 808756b295..9d9d210f3f 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_utils.cuh +++ b/onnxruntime/core/providers/rocm/reduction/reduction_utils.cuh @@ -18,6 +18,34 @@ __forceinline__ __host__ __device__ int least_pow2_bound(int value) { return static_cast(++value_); } +struct Square2 { + template + __forceinline__ __device__ T operator()(const T& value) { + return value * value; + } +}; + +struct Sqrt2 { + template + __forceinline__ __device__ T operator()(const T& value) { + return _Sqrt(value); + } +}; + +struct Identity2 { + template + __forceinline__ __device__ T operator()(const T& value) { + return value; + } +}; + + + +// +// TODO: DELETE EVERYTHING BELOW +// TODO: RENAME STRUCTS ABOVE (no '2') +// + template struct Cast { __forceinline__ __device__ TAccumulated operator()(const TValue& value) { From c4b6559be93621f919086788fdcd9969a841d69b Mon Sep 17 00:00:00 2001 From: Jesse Benson Date: Tue, 12 Jan 2021 18:18:57 -0800 Subject: [PATCH 03/41] Update reduction_all.cu --- .../training_ops/rocm/reduction/reduction_all.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cu b/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cu index 346e093bd4..d6477ebf08 100644 --- a/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cu +++ 
b/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cu @@ -12,12 +12,12 @@ namespace onnxruntime { namespace rocm { -template +template __global__ void ScalarSqrtKernel(Tin* input, Tout* output) { *output = (Tout)_Sqrt(*input); } -template +template void ScalarSqrt(Tin* input, Tout* output) { hipLaunchKernelGGL(ScalarSqrtKernel, dim3(1), dim3(1), 0, 0, input, output); } @@ -61,7 +61,7 @@ __global__ void MultiTensorReduceKernel(ChunkGroup<1> chunk_group, TOut* output) const int wid = threadIdx.x / GPU_WARP_SIZE; // Shape is 2 x warp_count_in_block. - HIP_DYNAMIC_SHARED( unsigned char, shared_memory_) + extern __shared__ unsigned char shared_memory_[]; TBuf* shared_memory = reinterpret_cast(shared_memory_); if (lid == 0) { @@ -79,7 +79,7 @@ __global__ void MultiTensorReduceKernel(ChunkGroup<1> chunk_group, TOut* output) } if (threadIdx.x == 0) { - atomic_add(w_norm, TOutOp()(shared_memory[0])); + atomic_add(w_norm, TOutOp()(TOut(shared_memory[0]))); } } @@ -100,7 +100,7 @@ void MultiTensorReduce(ChunkGroup<1> chunk_group, TOut* output) { template void MultiTensorReduceL2::operator()(ChunkGroup<1> chunk_group, TOut* output) { using TBuf = AccumulationType_t; - MultiTensorReduce, Cast>(chunk_group, output); + MultiTensorReduce(chunk_group, output); } #define INSTANTIATE_MULTI_TENSOR_REDUCTION_L2_FUNCTOR(TIn, TOut) \ From 554184bcc493ccd6b26d3b6666d898d8d5243947 Mon Sep 17 00:00:00 2001 From: Jesse Benson Date: Thu, 14 Jan 2021 10:45:03 -0800 Subject: [PATCH 04/41] Add reduce template parameters. 
--- .../providers/rocm/reduction/reduction_ops.cc | 22 ++++++------- .../providers/rocm/reduction/reduction_ops.h | 33 ++++++++++--------- .../rocm/reduction/reduction_ops.cc | 6 ++-- 3 files changed, 31 insertions(+), 30 deletions(-) diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc index b7f742eca7..0c981f35e7 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc @@ -153,14 +153,14 @@ static bool is_matrix_row_reduction( // TODO ReduceKernel::ReduceKernelShared() is still used by some other training classes though it's not used here - this should be refactored. template -template +template Status ReduceKernel::ReduceKernelShared( const T* X, const TensorShape& input_shape, OutT* Y, const TensorShape& /*output_shape*/, miopenReduceTensorOp_t miopen_reduce_op, - std::vector /*output_dims*/) const { + std::vector& /*output_dims*/) const { typedef typename ToHipType::MappedType HipT; const auto rank = input_shape.NumDimensions(); @@ -191,7 +191,7 @@ template Status ReduceKernel::ReduceKernelShared( double* Y, const TensorShape& output_shape, miopenReduceTensorOp_t miopen_reduce_op, - std::vector output_dims) const; + std::vector& output_dims) const; template Status ReduceKernel::ReduceKernelShared( const float* X, @@ -199,7 +199,7 @@ template Status ReduceKernel::ReduceKernelShared( float* Y, const TensorShape& output_shape, miopenReduceTensorOp_t miopen_reduce_op, - std::vector output_dims) const; + std::vector& output_dims) const; template Status ReduceKernel::ReduceKernelShared( const MLFloat16* X, @@ -207,7 +207,7 @@ template Status ReduceKernel::ReduceKernelShared( MLFloat16* Y, const TensorShape& output_shape, miopenReduceTensorOp_t miopen_reduce_op, - std::vector output_dims) const; + std::vector& output_dims) const; // `input_shape_override` (if provided) is the input shape for compute purposes 
Status PrepareForReduce(const Tensor* X, @@ -307,8 +307,8 @@ Status PrepareForReduce(const Tensor* X, } // `input_shape_override` is the input shape for compute purposes (if provided) -template -Status ReduceComputeCore(const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata, +template +Status ReduceComputeCore(ROCMExecutionProvider& /*rocm_ep*/, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata, /*out*/ Tensor& output, miopenReduceTensorOp_t miopen_reduce_op, const std::vector& axes, bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction, @@ -361,7 +361,7 @@ Status ReduceComputeCore(const Tensor& input, PrepareReduceMetadata& prepare_red } template -template +template Status ReduceKernel::ComputeImpl(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const { const Tensor* X = ctx->Input(0); @@ -373,13 +373,13 @@ Status ReduceKernel::ComputeImpl(OpKernelContext* ctx, miopenR Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims); const bool fast_reduction = fast_reduction_ && !ctx->GetUseDeterministicCompute(); - return ReduceComputeCore(*X, prepare_reduce_metadata, *Y, miopen_reduce_op, axes_, - calculate_log_, calculate_sqt_, log_sum_exp_, fast_reduction); + return ReduceComputeCore(*rocm_ep_, *X, prepare_reduce_metadata, *Y, miopen_reduce_op, axes_, + calculate_log_, calculate_sqt_, log_sum_exp_, fast_reduction); } template <> template <> -Status ReduceKernel::ComputeImpl(OpKernelContext* /*ctx*/, miopenReduceTensorOp_t /*miopen_reduce_op*/) const { +Status ReduceKernel::ComputeImpl(OpKernelContext* /*ctx*/, miopenReduceTensorOp_t /*miopen_reduce_op*/) const { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, Node().OpType(), " is not supported"); } diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.h b/onnxruntime/core/providers/rocm/reduction/reduction_ops.h index fcd97d3fbe..936735aab7 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.h +++ 
b/onnxruntime/core/providers/rocm/reduction/reduction_ops.h @@ -68,23 +68,29 @@ class ReduceKernel : public RocmKernel, public ReduceKernelBase(static_cast(info.GetExecutionProvider())); + } - template + // Only Max Min need to set ReduceTensorIndices MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES as per miopen library manual + // Only Max Min will have indices output, need to set the indices to nullptr for other ops + template Status ComputeImpl(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const; // Used by ReduceSumTraining which will have axes as input - template + template Status ComputeImplEx(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const; - template + template Status ReduceKernelShared( const T* X, const TensorShape& input_shape, OutT* Y, const TensorShape& output_shape, miopenReduceTensorOp_t miopen_reduce_op, - std::vector output_dims) const; + std::vector& output_dims) const; using ReduceKernelBase::axes_; using ReduceKernelBase::keepdims_; @@ -94,8 +100,11 @@ class ReduceKernel : public RocmKernel, public ReduceKernelBase @@ -104,7 +113,7 @@ class ArgMax final : public ReduceKernel { ArgMax(const OpKernelInfo& info) : ReduceKernel(info) {} Status ComputeInternal(OpKernelContext* ctx) const override { - return ComputeImpl(ctx, MIOPEN_REDUCE_TENSOR_MAX); + return ComputeImpl(ctx, MIOPEN_REDUCE_TENSOR_MAX); } }; @@ -114,7 +123,7 @@ class ArgMin final : public ReduceKernel { ArgMin(const OpKernelInfo& info) : ReduceKernel(info) {} Status ComputeInternal(OpKernelContext* ctx) const override { - return ComputeImpl(ctx, MIOPEN_REDUCE_TENSOR_MIN); + return ComputeImpl(ctx, MIOPEN_REDUCE_TENSOR_MIN); } }; @@ -226,7 +235,6 @@ class ReduceLogSumExp final : public ReduceKernel { } }; - Status PrepareForReduce(const Tensor* X, bool keepdims, const std::vector& axes, @@ -240,12 +248,5 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool 
fast_reduction, const TensorShape* input_shape_override = nullptr); -template -Status ReduceComputeCore(const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata, - /*out*/ Tensor& output, miopenReduceTensorOp_t miopen_reduce_op, - const std::vector& axes, - bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction, - const TensorShape* input_shape_override = nullptr); - } // namespace rocm } // namespace onnxruntime \ No newline at end of file diff --git a/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc b/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc index fcfc0cafa9..3b9832d48a 100644 --- a/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc +++ b/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc @@ -32,7 +32,7 @@ REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, double) // REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, int32_t) template -template +template Status ReduceKernel::ComputeImplEx(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const { const Tensor* X = ctx->Input(0); @@ -59,8 +59,8 @@ Status ReduceKernel::ComputeImplEx(OpKernelContext* ctx, miope Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims); const bool fast_reduction = fast_reduction_ && !ctx->GetUseDeterministicCompute(); - return ReduceComputeCore(*X, prepare_reduce_metadata, *Y, miopen_reduce_op, axes, - calculate_log_, calculate_sqt_, log_sum_exp_, fast_reduction); + return ReduceComputeCore(*rocm_ep_, *X, prepare_reduce_metadata, *Y, miopen_reduce_op, axes, + calculate_log_, calculate_sqt_, log_sum_exp_, fast_reduction); } } // namespace rocm From 4c1db50df52c889bccc538a8dd83afec4e4bb095 Mon Sep 17 00:00:00 2001 From: Jesse Benson Date: Thu, 14 Jan 2021 13:57:18 -0800 Subject: [PATCH 05/41] miopen common --- .../core/providers/rocm/miopen_common.cc | 26 +++++++- .../core/providers/rocm/miopen_common.h | 1 + .../providers/rocm/reduction/reduction_ops.cc | 
2 + .../providers/rocm/reduction/reduction_ops.h | 63 +++++++++++++------ 4 files changed, 71 insertions(+), 21 deletions(-) diff --git a/onnxruntime/core/providers/rocm/miopen_common.cc b/onnxruntime/core/providers/rocm/miopen_common.cc index 8f2054e4ee..6c18b202a7 100644 --- a/onnxruntime/core/providers/rocm/miopen_common.cc +++ b/onnxruntime/core/providers/rocm/miopen_common.cc @@ -41,16 +41,36 @@ Status MiopenTensor::Set(const std::vector& input_dims, miopenDataType_ return Status::OK(); } +Status MiopenTensor::Set(const MiopenTensor& x_desc, miopenBatchNormMode_t mode) { + ORT_RETURN_IF_ERROR(CreateTensorIfNeeded()); + MIOPEN_RETURN_IF_ERROR(miopenDeriveBNTensorDescriptor(tensor_, x_desc, mode)); + return Status::OK(); +} + template miopenDataType_t MiopenTensor::GetDataType() { - ORT_THROW("miopen engine currently supports only single/half precision data types."); + ORT_THROW("miopen engine currently supports only single/half/int32/int8 precision data types."); +} + +template<> +miopenDataType_t MiopenTensor::GetDataType() { + return miopenFloat; } template <> -miopenDataType_t MiopenTensor::GetDataType() { return miopenFloat; } +miopenDataType_t MiopenTensor::GetDataType() { + return miopenHalf; +} template <> -miopenDataType_t MiopenTensor::GetDataType() { return miopenHalf; } +miopenDataType_t MiopenTensor::GetDataType() { + return miopenInt32; +} + +template <> +miopenDataType_t MiopenTensor::GetDataType() { + return miopenInt8; +} template <> const float Consts::One = 1; diff --git a/onnxruntime/core/providers/rocm/miopen_common.h b/onnxruntime/core/providers/rocm/miopen_common.h index b71f5413e9..73d865dcfd 100644 --- a/onnxruntime/core/providers/rocm/miopen_common.h +++ b/onnxruntime/core/providers/rocm/miopen_common.h @@ -16,6 +16,7 @@ class MiopenTensor final { public: MiopenTensor(); ~MiopenTensor(); + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(MiopenTensor); Status Set(const std::vector& input_dims, miopenDataType_t dataType); Status Set(const 
MiopenTensor& x_desc, miopenBatchNormMode_t mode); diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc index 0c981f35e7..38f855eaee 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc @@ -162,6 +162,8 @@ Status ReduceKernel::ReduceKernelShared( miopenReduceTensorOp_t miopen_reduce_op, std::vector& /*output_dims*/) const { typedef typename ToHipType::MappedType HipT; + //typedef typename ToHipType::MappedType HipOutT; + //miopenDataType_t miopen_type_X = MiopenTensor::GetDataType(); const auto rank = input_shape.NumDimensions(); // Block of fast matrix row reduction. diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.h b/onnxruntime/core/providers/rocm/reduction/reduction_ops.h index 936735aab7..f402851db1 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.h +++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.h @@ -10,21 +10,6 @@ namespace onnxruntime { namespace rocm { -enum miopenReduceTensorOp_t { - MIOPEN_REDUCE_TENSOR_ADD, - MIOPEN_REDUCE_TENSOR_MUL, - MIOPEN_REDUCE_TENSOR_MIN, - MIOPEN_REDUCE_TENSOR_MAX, - MIOPEN_REDUCE_TENSOR_AVG, - MIOPEN_REDUCE_TENSOR_NORM1, - MIOPEN_REDUCE_TENSOR_NORM2, -}; - -enum miopenReduceTensorIndices_t { - MIOPEN_REDUCE_TENSOR_NO_INDICES, - MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES, -}; - namespace ReductionOps { // Implementation that holds the core logic of reduction op processing @@ -133,7 +118,8 @@ class ReduceL1 final : public ReduceKernel { ReduceL1(const OpKernelInfo& info) : ReduceKernel(info) {} Status ComputeInternal(OpKernelContext* ctx) const override { - return ComputeImpl(ctx, MIOPEN_REDUCE_TENSOR_NORM1); + //return ComputeImpl(ctx, MIOPEN_REDUCE_TENSOR_NORM1); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "MIOpen does not yet support reduce norm1."); } }; @@ -143,7 +129,8 @@ class ReduceL2 final : public 
ReduceKernel { ReduceL2(const OpKernelInfo& info) : ReduceKernel(info) {} Status ComputeInternal(OpKernelContext* ctx) const override { - return ComputeImpl(ctx, MIOPEN_REDUCE_TENSOR_NORM2); + //return ComputeImpl(ctx, MIOPEN_REDUCE_TENSOR_NORM2); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "MIOpen does not yet support reduce norm2."); } }; @@ -163,7 +150,8 @@ class ReduceMean final : public ReduceKernel { ReduceMean(const OpKernelInfo& info) : ReduceKernel(info) {} Status ComputeInternal(OpKernelContext* ctx) const override { - return ComputeImpl(ctx, MIOPEN_REDUCE_TENSOR_AVG); + //return ComputeImpl(ctx, MIOPEN_REDUCE_TENSOR_AVG); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "MIOpen does not yet support reduce avg."); } }; @@ -248,5 +236,44 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction, const TensorShape* input_shape_override = nullptr); +// ROCM's reduction descriptor miopenReduceTensorDescriptor_t is a pointer so +// it's safer to wrap it with automatically memory deleter as MiopenReduceDescriptor. +// An implicit caster from MiopenReduceDescriptor to miopenReduceTensorDescriptor_t +// is implemented below, so ROCM can seamlessly work. 
+class MiopenReduceDescriptor final { + public: + MiopenReduceDescriptor() : desc_(nullptr) { + } + + ~MiopenReduceDescriptor() { + if (desc_ != nullptr) { + miopenDestroyReduceTensorDescriptor(desc_); + desc_ = nullptr; + } + } + + MiopenReduceDescriptor(const MiopenReduceDescriptor&) = delete; + MiopenReduceDescriptor& operator=(const MiopenReduceDescriptor&) = delete; + + Status Set(miopenReduceTensorOp_t op, miopenDataType_t type, miopenReduceTensorIndices_t indices) { + if (!desc_) + MIOPEN_RETURN_IF_ERROR(miopenCreateReduceTensorDescriptor(&desc_)); + + MIOPEN_RETURN_IF_ERROR(miopenSetReduceTensorDescriptor( + desc_, + op, + type, + MIOPEN_PROPAGATE_NAN, + indices, + MIOPEN_32BIT_INDICES)); // currently only the 32-bit (unsigned int) type is supported. + return Status::OK(); + } + + operator miopenReduceTensorDescriptor_t() const { return desc_; } + + private: + miopenReduceTensorDescriptor_t desc_; +}; + } // namespace rocm } // namespace onnxruntime \ No newline at end of file From 196132925ece5590acb6d39510ae4b19009ed221 Mon Sep 17 00:00:00 2001 From: Jesse Benson Date: Thu, 14 Jan 2021 17:29:23 -0800 Subject: [PATCH 06/41] Reuse CUDA's reduction_functions.cc --- .../providers/rocm/reduction/reduction_ops.cc | 54 +++++++++++++++++-- tools/ci_build/amd_hipify.py | 1 - 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc index 38f855eaee..74df6924f9 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc @@ -187,7 +187,7 @@ Status ReduceKernel::ReduceKernelShared( return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "reduction1 is not supported"); } -template Status ReduceKernel::ReduceKernelShared( +template Status ReduceKernel::ReduceKernelShared( const double* X, const TensorShape& input_shape, double* Y, @@ -195,7 +195,7 @@ template Status 
ReduceKernel::ReduceKernelShared( miopenReduceTensorOp_t miopen_reduce_op, std::vector& output_dims) const; -template Status ReduceKernel::ReduceKernelShared( +template Status ReduceKernel::ReduceKernelShared( const float* X, const TensorShape& input_shape, float* Y, @@ -203,7 +203,7 @@ template Status ReduceKernel::ReduceKernelShared( miopenReduceTensorOp_t miopen_reduce_op, std::vector& output_dims) const; -template Status ReduceKernel::ReduceKernelShared( +template Status ReduceKernel::ReduceKernelShared( const MLFloat16* X, const TensorShape& input_shape, MLFloat16* Y, @@ -385,6 +385,54 @@ Status ReduceKernel::ComputeImpl return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, Node().OpType(), " is not supported"); } +namespace ReductionOps { + +template +Tensor ReduceCompute(ROCMExecutionProvider& rocm_ep, miopenReduceTensorOp_t miopen_reduce_op, AllocatorPtr allocator, + const Tensor& input, const std::vector& axes, + bool keep_dims, bool calculate_log, bool calculate_sqt, bool log_sum_exp, + bool fast_reduction, const TensorShape* input_shape_override) { + PrepareReduceMetadata prepare_reduce_metadata; + auto status = PrepareForReduce(&input, + keep_dims, + axes, + prepare_reduce_metadata, + input_shape_override); + + if (!status.IsOK()) { + ORT_THROW(ONNXRUNTIME, FAIL, "Failed to perform reduce op: ", status.ErrorMessage()); + } + + Tensor output(input.DataType(), prepare_reduce_metadata.squeezed_output_dims, allocator); + + status = ReduceComputeCore(rocm_ep, input, prepare_reduce_metadata, output, miopen_reduce_op, axes, + calculate_log, calculate_sqt, log_sum_exp, fast_reduction, input_shape_override); + + if (!status.IsOK()) { + ORT_THROW(ONNXRUNTIME, FAIL, "Failed to perform reduce op: ", status.ErrorMessage()); + } + + return output; +} + +// Explicit template instantiation (needed to be used in einsum_auxiliary_ops.cc) + +template Tensor ReduceCompute( + ROCMExecutionProvider& rocm_ep, miopenReduceTensorOp_t miopen_reduce_op, + AllocatorPtr allocator, + const 
Tensor& input, const std::vector& axes, + bool keep_dims, bool calculate_log, bool calculate_sqt, bool log_sum_exp, + bool fast_reduction, const TensorShape* input_shape_override); + +template Tensor ReduceCompute( + ROCMExecutionProvider& rocm_ep, miopenReduceTensorOp_t miopen_reduce_op, + AllocatorPtr allocator, + const Tensor& input, const std::vector& axes, + bool keep_dims, bool calculate_log, bool calculate_sqt, bool log_sum_exp, + bool fast_reduction, const TensorShape* input_shape_override); + +} // namespace ReductionOps + #define REGISTER_KERNEL_HFD(name) \ REGISTER_KERNEL_TYPED(name, MLFloat16) \ REGISTER_KERNEL_TYPED(name, float) \ diff --git a/tools/ci_build/amd_hipify.py b/tools/ci_build/amd_hipify.py index 6b498b025c..4eb0bdd446 100644 --- a/tools/ci_build/amd_hipify.py +++ b/tools/ci_build/amd_hipify.py @@ -131,7 +131,6 @@ provider_excluded_files = [ 'object_detection/roialign.h', 'object_detection/roialign_impl.cu', 'object_detection/roialign_impl.h', - 'reduction/reduction_functions.cc', 'reduction/reduction_functions.cu', 'reduction/reduction_functions.h', 'reduction/reduction_ops.cc', From a28ddb85b63c465388fa60faab15fa778a20ff60 Mon Sep 17 00:00:00 2001 From: Jesse Benson Date: Thu, 14 Jan 2021 18:17:06 -0800 Subject: [PATCH 07/41] Reduction ops. --- .../rocm/reduction/reduction_all.cc | 32 ++++--- .../rocm/reduction/reduction_ops.cc | 92 +++++++++++++++++++ .../github/pai/pai-excluded-tests.txt | 4 + 3 files changed, 114 insertions(+), 14 deletions(-) diff --git a/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cc b/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cc index 17dad9e9c0..c054b96816 100644 --- a/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cc +++ b/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cc @@ -64,32 +64,36 @@ Status ReduceAllL2::ComputeInternal(OpKernelContext* ctx) const { // alternate path only for deterministic compute .. 
typedef AccumulationType_t HipTAcc; - // find scratch buffer size needed by 'reduce_square_sum' for each tensor - int scratch_size = 0; + // find reduction buffer size needed by 'reduce_square_sum' for each tensor + size_t reduction_buffer_size = 0; for (int i = 0; i < total_tensor_count; ++i) { - scratch_size = std::max(scratch_size, compute_reduction_buffer_size(sizeof(HipTAcc), tensor_sizes[i])); + reduction_buffer_size = + std::max(reduction_buffer_size, compute_reduction_buffer_size(tensor_sizes[i])); } - // enlarge scratch buffer size for 'reduce_sum' over tensor square norms - scratch_size = std::max(scratch_size, compute_reduction_buffer_size(sizeof(HipTAcc), total_tensor_count)); - - // add head room for final output and square norms of each tensor - scratch_size += (1 + total_tensor_count) * sizeof(HipTAcc); + // enlarge reduction buffer size for 'reduce_sum' over tensor square norms + reduction_buffer_size = + std::max(reduction_buffer_size, compute_reduction_buffer_size(total_tensor_count)); // create GPU scratch space and zero target for each tensor square norm - auto scratch_buffer = GetScratchBuffer(scratch_size); - HIP_RETURN_IF_ERROR(hipMemsetAsync(scratch_buffer.get(), 0, sizeof(HipTAcc) * (1 + total_tensor_count))); + auto reduction_buffer = GetScratchBuffer(reduction_buffer_size); - HipTAcc* p_global_sqnorm = reinterpret_cast(scratch_buffer.get()); + // buffer for final output and square norms of each tensor + auto results_buffer = GetScratchBuffer(1 + total_tensor_count); + + HIP_RETURN_IF_ERROR(hipMemsetAsync(results_buffer.get(), 0, sizeof(HipTAcc) * (1 + total_tensor_count))); + + HipTAcc* p_global_sqnorm = results_buffer.get(); HipTAcc* p_tensor_sqnorm = p_global_sqnorm + 1; - HipTAcc* p_reduce_buffer = p_tensor_sqnorm + total_tensor_count; // perform reduction l2norm = sqrt[sum(tensor[i][j]**2)] for i,j over all tensor elements for (int i = 0; i < total_tensor_count; ++i) { HipTIn* p_tensor_i = 
reinterpret_cast(grouped_tensor_pointers[i][0]); - reduce_square_sum(p_tensor_i, p_tensor_sqnorm + i, tensor_sizes[i], p_reduce_buffer); + ORT_RETURN_IF_ERROR(reduce_square_sum( + p_tensor_i, p_tensor_sqnorm + i, tensor_sizes[i], reduction_buffer.get(), reduction_buffer_size)); } - reduce_sum(p_tensor_sqnorm, p_global_sqnorm, total_tensor_count, p_reduce_buffer); + ORT_RETURN_IF_ERROR(reduce_sum( + p_tensor_sqnorm, p_global_sqnorm, total_tensor_count, reduction_buffer.get(), reduction_buffer_size)); ScalarSqrt(p_global_sqnorm, p_output); } diff --git a/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc b/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc index 3b9832d48a..2e4d64e347 100644 --- a/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc +++ b/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc @@ -63,5 +63,97 @@ Status ReduceKernel::ComputeImplEx(OpKernelContext* ctx, miope calculate_log_, calculate_sqt_, log_sum_exp_, fast_reduction); } +template <> +template <> +Status ReduceKernel::ComputeImplEx(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const { + typedef typename ToHipType::MappedType HipT; + + const Tensor* X = ctx->Input(0); + + //override the attribute value with the input value for reduction_axes + const Tensor* axes_tensor = ctx->Input(1); + ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, "An axes tensor must be a vector tensor."); + auto nDims = static_cast(axes_tensor->Shape()[0]); + const auto* data = axes_tensor->template Data(); + std::vector axes(data, data + nDims); + + // empty axes and no-op + if (axes.empty() && noop_with_empty_axes_) { + auto* Y = ctx->Output(0, X->Shape()); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), hipMemcpyDeviceToDevice)); + return Status::OK(); + } + + PrepareReduceMetadata prepare_reduce_metadata; + + ORT_RETURN_IF_ERROR(PrepareForReduce(X, + keepdims_, + 
axes, + prepare_reduce_metadata)); + + Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims); + + int64_t input_count = prepare_reduce_metadata.input_count; + int64_t output_count = prepare_reduce_metadata.output_count; + std::vector& input_dims_miopen = prepare_reduce_metadata.input_dims_miopen; + std::vector& output_dims_miopen = prepare_reduce_metadata.output_dims_miopen; + + // special case when there is a dim value of 0 in the shape. + if (input_count == 0) { + assert(Y->Shape().Size() == 0); + return Status::OK(); + } + + // miopenReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case + if (input_count == output_count) { + if (Y->template MutableData() != X->template Data()) { + HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), input_count * sizeof(int32_t), hipMemcpyDeviceToDevice)); + } + return Status::OK(); + } + + // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. 
+ // Therefore zeroing out the memory is required + HIP_RETURN_IF_ERROR(hipMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes())); + + size_t indices_bytes = 0; + size_t workspace_bytes = 0; + MiopenTensor input_tensor; + MiopenTensor output_tensor; + MiopenReduceDescriptor reduce_desc; + + miopenDataType_t miopen_type_X = miopenFloat; + IAllocatorUniquePtr temp_X = GetScratchBuffer(input_count); + Impl_Cast(reinterpret_cast(X->template Data()), temp_X.get(), X->Shape().Size()); + + ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES)); + ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X)); + ORT_RETURN_IF_ERROR(output_tensor.Set(output_dims_miopen, miopen_type_X)); + MIOPEN_RETURN_IF_ERROR(miopenGetReductionIndicesSize(MiopenHandle(), reduce_desc, input_tensor, output_tensor, &indices_bytes)); + MIOPEN_RETURN_IF_ERROR(miopenGetReductionWorkspaceSize(MiopenHandle(), reduce_desc, input_tensor, output_tensor, &workspace_bytes)); + IAllocatorUniquePtr indices_miopen = GetScratchBuffer(indices_bytes); + IAllocatorUniquePtr workspace_miopen = GetScratchBuffer(workspace_bytes); + + const auto one = Consts::One; + const auto zero = Consts::Zero; + auto temp_Y = GetScratchBuffer(output_count); + MIOPEN_RETURN_IF_ERROR(miopenReduceTensor(MiopenHandle(), + reduce_desc, + indices_miopen.get(), + indices_bytes, + workspace_miopen.get(), + workspace_bytes, + &one, + input_tensor, + temp_X.get(), + &zero, + output_tensor, + temp_Y.get())); + + Impl_Cast(temp_Y.get(), Y->template MutableData(), output_count); + + return Status::OK(); +} + } // namespace rocm } // namespace onnxruntime \ No newline at end of file diff --git a/tools/ci_build/github/pai/pai-excluded-tests.txt b/tools/ci_build/github/pai/pai-excluded-tests.txt index 50318f1ee2..b8477d363d 100644 --- a/tools/ci_build/github/pai/pai-excluded-tests.txt +++ b/tools/ci_build/github/pai/pai-excluded-tests.txt @@ -32,6 +32,7 @@ 
ReductionOpTest.ReduceL1_do_not_keep_dims_2 ReductionOpTest.ReduceL1_keepdims ReductionOpTest.ReduceL1 ReductionOpTest.ReduceL1_int32 +ReductionOpTest.ReduceL10DTensor ReductionOpTest.ReduceL2_default_axes_keepdims ReductionOpTest.ReduceL2_default_axes_do_not_keep_dims ReductionOpTest.ReduceL2_do_not_keepdims @@ -39,6 +40,7 @@ ReductionOpTest.ReduceL2_do_not_keepdims_2 ReductionOpTest.ReduceL2_keepdims ReductionOpTest.ReduceL2 ReductionOpTest.ReduceL2_int32 +ReductionOpTest.ReduceL20DTensor ReductionOpTest.ReduceLogSum ReductionOpTest.ReduceLogSum_samesize ReductionOpTest.ReduceLogSum_do_not_keepdims_2 @@ -76,6 +78,8 @@ ReductionOpTest.ReduceMean_keepdims_double ReductionOpTest.ReduceMean ReductionOpTest.ReduceMean_double ReductionOpTest.ReduceMean_int32 +ReductionOpTest.ReduceMean0DTensor +ReductionOpTest.ReduceMean0DTensor_double ReductionOpTest.ReduceMin_default_axes_keepdims ReductionOpTest.ReduceMin_default_axes_do_not_keep_dims ReductionOpTest.ReduceMin_default_axes_do_not_keep_dims_2D From 0b147702af9de6ddede6f139223454b545baacd9 Mon Sep 17 00:00:00 2001 From: Jesse Benson Date: Fri, 15 Jan 2021 11:57:51 -0800 Subject: [PATCH 08/41] Update remaining reduction ops to use MIOpen. double datatype is not supported, so disable those typed kernels. 
--- .../rocm/reduction/reduction_functions.cu | 358 +-------- .../rocm/reduction/reduction_functions.h | 27 +- .../providers/rocm/reduction/reduction_ops.cc | 743 ++++++++++++++---- .../providers/rocm/reduction/reduction_ops.h | 7 - .../rocm/reduction/reduction_utils.cuh | 49 +- .../providers/rocm/rocm_execution_provider.cc | 112 +-- .../cuda/loss/softmaxcrossentropy_impl.cc | 8 +- .../loss/softmax_cross_entropy_loss_impl.cc | 58 +- .../rocm/loss/softmaxcrossentropy_impl.cc | 61 +- .../training_ops/rocm/optimizer/lamb.cc | 48 +- .../rocm/reduction/reduction_all.cu | 2 +- .../rocm/reduction/reduction_ops.cc | 4 +- .../rocm/rocm_training_kernels.cc | 6 +- .../github/pai/pai-excluded-tests.txt | 64 +- 14 files changed, 752 insertions(+), 795 deletions(-) diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_functions.cu b/onnxruntime/core/providers/rocm/reduction/reduction_functions.cu index cd55592330..d6e1ee4181 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_functions.cu +++ b/onnxruntime/core/providers/rocm/reduction/reduction_functions.cu @@ -323,28 +323,28 @@ Status call_reduce_matrix_columns( template Status reduce_sum( const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { - return detail::call_reduce_matrix_columns( + return detail::call_reduce_matrix_columns( input, output, 1, size, buffer, buffer_size); } template Status reduce_square_sum( const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { - return detail::call_reduce_matrix_columns( + return detail::call_reduce_matrix_columns( input, output, 1, size, buffer, buffer_size); } template Status reduce_l2_norm( const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { - return detail::call_reduce_matrix_columns( + return detail::call_reduce_matrix_columns( input, output, 1, size, buffer, buffer_size); } template Status reduce_mean( const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { - 
return detail::call_reduce_matrix_columns( + return detail::call_reduce_matrix_columns( input, output, 1, size, buffer, buffer_size); } @@ -477,7 +477,7 @@ INSTANTIATE_REDUCE_MATRIX_ROWS(double); template Status reduce_matrix_columns(const TIn* input, TOut* output, int m, int n, void* buffer, size_t buffer_size) { - return detail::call_reduce_matrix_columns( + return detail::call_reduce_matrix_columns( input, output, m, n, buffer, buffer_size); } @@ -488,353 +488,5 @@ INSTANTIATE_REDUCE_MATRIX_COLUMNS(float); INSTANTIATE_REDUCE_MATRIX_COLUMNS(double); #undef INSTANTIATE_REDUCE_MATRIX_COLUMNS - - - - - - -// -// TODO: DELETE EVERYTHING BELOW -// - -std::pair compute_block_size(int size) { - int x = GPU_WARP_SIZE; - int y = std::min(NUM_WARPS_PER_BLOCK, std::max(1, size / (NUM_ELEMENTS_PER_THREAD * GPU_WARP_SIZE))); - return std::make_pair(x, y); -} - -int compute_grid_size(int size) { - const auto block = compute_block_size(size); - return std::min(MAX_NUM_BLOCKS, std::max(1, size / (NUM_ELEMENTS_PER_THREAD * block.first * block.second))); -} - -int compute_reduction_buffer_size(int element_size, int size) { - const int num_blocks = compute_grid_size(size); - return static_cast(num_blocks * element_size + sizeof(int)); -} - -template -__global__ void reduce_all_kernel(const int size, const TIn * data, TOut* output, TOut* buffer) { - extern __shared__ unsigned char shared_memory_[]; - TOut* shared_memory = reinterpret_cast(shared_memory_); - // Thread-level indexes: - // Linear index of thread in block. - const int tid_in_block = threadIdx.y * blockDim.x + threadIdx.x; - // Total number of threads in a 2-D block. - const int num_threads_in_block = blockDim.x * blockDim.y; - - // Warp-level indexes: - // Warp index of thread. - const int wid_in_block = tid_in_block / GPU_WARP_SIZE; - // Lane index of thread. - const int lid_in_block = tid_in_block % GPU_WARP_SIZE; - // Warp count per block. 
- const int num_warps_in_block = num_threads_in_block / GPU_WARP_SIZE; - - // Grid-level indexes: - // Linear index of block in grid. - const int bid_in_grid = blockIdx.x + blockIdx.y * gridDim.x; - // Linear index of thread in grid. - const int tid_in_grid = bid_in_grid * (blockDim.x * blockDim.y) + tid_in_block; - // Total number of blocks in a 2-D grid. - const int num_blocks_in_grid = gridDim.x * gridDim.y; - // Total number of threads in a 2-D grid with 2-D blocks. - const int num_threads_in_grid = num_blocks_in_grid * num_threads_in_block; - - // Thread-level reduction (storage change: global memory -> register). - // One thread reduces NUM_ELEMENTS_PER_THREAD elements to a thread register - // in one iteration. - TOut value = 0; - for (int id = tid_in_grid; id < size; id += NUM_ELEMENTS_PER_THREAD * num_threads_in_grid) { - TOut v[NUM_ELEMENTS_PER_THREAD]; - - #pragma unroll - for (int i = 0; i < NUM_ELEMENTS_PER_THREAD; i++) { - int offset = id + i * num_threads_in_grid; - if (offset < size) { - v[i] = TOut(TOp()(data[offset])); - } else { - v[i] = TOut(0.0f); - } - } - - #pragma unroll - for (int i = 0; i < NUM_ELEMENTS_PER_THREAD; i++) { - value += v[i]; - } - } - - __syncthreads(); - - // Warp-level reduction (storage change: register -> register). - // The values in a warp will be summed up to a scalar. After warp-level - // reduction, each block holds num_warps_in_block values in the shared memory. - TOut value_ = value; -#pragma unroll - for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) { - value_ += WARP_SHFL_DOWN(value_, stride); - } - - // Return early if only one warp is used for reduction. - // Given a fixed amount of threads, we perfer threads over warps over blocks so that we never have cases such as - // 1. two blocks and each of them has only 1 warp (32 threads). - // 2. two warps and each of them has only 2 threads. 
- if (num_warps_in_block == 1) { - if (tid_in_grid == 0) { - // Compilation time if-else branch controlled by template argument can be - // optimized out, so there will be no branch in real computation phase. - if (DivideResultBySize) { - output[0] = TFinalOp()(value_ / TOut(size)); - } else { - output[0] = TFinalOp()(value_); - } - } - return; - } - - if (lid_in_block == 0) { - shared_memory[wid_in_block] = value_; - } - - __syncthreads(); - - // Block-level reduction (storage change: shared memory -> global memory). - // The values in a block will be summed up to a scalar. - // Note that the values are stored in the shared memory. - // Here we assume that the size of shared_memory is smaller - // than num_warps_in_block, so we just keep halving the number - // of threads in each iteartion. Our assumption is always true because - // the size of shared_memory equals to the number of warps. -#pragma unroll - for (int stride = NUM_WARPS_PER_BLOCK / 2; stride > 0; stride /= 2) { - if (tid_in_block + stride < num_warps_in_block) { - shared_memory[tid_in_block] += shared_memory[tid_in_block + stride]; - } - __syncthreads(); - } - - // Return early if only one block is used for reduction. - if (num_blocks_in_grid == 1) { - if (tid_in_grid == 0) { - // Compilation time if-else branch controlled by template argument can be - // optimized out, so there will be no branch in real computation phase. - if (DivideResultBySize) { - output[0] = TFinalOp()(shared_memory[0] / TOut(size)); - } else { - output[0] = TFinalOp()(shared_memory[0]); - } - } - return; - } - - if (tid_in_block == 0) { - buffer[bid_in_grid] = shared_memory[0]; - } - - __threadfence(); - __syncthreads(); - - // Grid-level reduciton. We use the last block to sum up values - // stored in the global buffer. 
- __shared__ bool is_last_block_done; - - if (tid_in_block == 0) { - int* p_lock = reinterpret_cast(buffer + num_blocks_in_grid); - int count = atomicAdd(p_lock, 1); - is_last_block_done = (count == (num_blocks_in_grid - 1)); - } - - // All threads in each block see if they belong the last active block - // (i.e., the value of is_last_block_done). - __syncthreads(); - - // Only the block which saw that count equals to num_blocks_in_grid - 1 can - // enter the following block. - if (is_last_block_done) { - const int pow2_bound = least_pow2_bound(num_blocks_in_grid); - for (int stride = pow2_bound / 2; stride > 0; stride /= 2) { - if (tid_in_block < stride && tid_in_block + stride < num_blocks_in_grid) { - buffer[tid_in_block] += buffer[tid_in_block + stride]; - } - __syncthreads(); - } - - // The first thread in the last block assigns the final output. - if (tid_in_block == 0) { - // Compilation time if-else branch controlled by template argument can be - // optimized out, so there will be no branch in real computation phase. - if (DivideResultBySize) { - output[0] = TFinalOp()(buffer[0] / TOut(size)); - } else { - output[0] = TFinalOp()(buffer[0]); - } - } - } -} - -template -void call_reduce_all_kernel(const TIn *data, TOut *output, int size, TOut *buffer) { - const auto block_size = compute_block_size(size); - const int num_blocks = compute_grid_size(size); - const dim3 block(block_size.first, block_size.second, 1); - const dim3 grid(num_blocks, 1, 1); - - // If more than one blocks are used, then inter-blocks reduction is needed. 
- if (num_blocks != 1) { - HIP_CALL_THROW(hipMemsetAsync(buffer + num_blocks, 0, sizeof(int))); - } - - const int shared_mem_size = sizeof(TOut) * block_size.first * block_size.second / GPU_WARP_SIZE; - hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_all_kernel), dim3(grid), dim3(block), shared_mem_size, 0, size, data, output, buffer); -} - -template -void reduce_sum(const TIn* data, TOut* output, int size, TOut* buffer) { - call_reduce_all_kernel, Identity, false>( - data, output, size, buffer); -} - -template -void reduce_square_sum(const TIn* data, TOut* output, int size, TOut* buffer) { - call_reduce_all_kernel, Identity, false>( - data, output, size, buffer); -} - -template -void reduce_l2_norm(const TIn* data, TOut* output, int size, TOut* buffer) { - call_reduce_all_kernel, Sqrt, false>( - data, output, size, buffer); -} - -template -void reduce_mean(const TIn* data, TOut* output, int size, TOut* buffer) { - call_reduce_all_kernel, Identity, true>( - data, output, size, buffer); -} - -template void reduce_sum( - const half* data, float* output, int size, float* buffer); -template void reduce_sum( - const float* data, float* output, int size, float* buffer); -template void reduce_sum( - const double* data, double* output, int size, double* buffer); - -template void reduce_square_sum( - const half* data, float* output, int size, float* buffer); -template void reduce_square_sum( - const float* data, float* output, int size, float* buffer); -template void reduce_square_sum( - const double* data, double* output, int size, double* buffer); - -template void reduce_l2_norm( - const half* data, float* output, int size, float* buffer); -template void reduce_l2_norm( - const float* data, float* output, int size, float* buffer); -template void reduce_l2_norm( - const double* data, double* output, int size, double* buffer); - -template void reduce_mean( - const half* data, float* output, int size, float* buffer); -template void reduce_mean( - const float* data, float* output, 
int size, float* buffer); -template void reduce_mean( - const double* data, double* output, int size, double* buffer); - -template -__global__ void reduce_matrix_rows_kernel(const TIn* input, TOut* output, int m, int n) { - constexpr int x_load_count_per_thread = 1; - constexpr int y_load_count_per_thread = 4; - const int t_count_x_in_grid = blockDim.x * gridDim.x; - const int t_count_y_in_grid = blockDim.y * gridDim.y; - const int x_grid_stride = t_count_x_in_grid * x_load_count_per_thread; - const int y_grid_stride = t_count_y_in_grid * y_load_count_per_thread; - const int tid_x_in_grid = threadIdx.x + blockDim.x * blockIdx.x; - const int tid_y_in_grid = threadIdx.y + blockDim.y * blockIdx.y; - const int tid_in_block = threadIdx.x + blockDim.x * threadIdx.y; - - // Shape is blockDim.y-by-blockDim.x and element type is TBuf. - HIP_DYNAMIC_SHARED( unsigned char, shared_memory_) - TBuf* shared_memory = reinterpret_cast(shared_memory_); - - // to prevent int overflow in index calculation for input size m*n - const int64_t n_int64 = static_cast(n); - - for (int col = tid_x_in_grid; col < n; col += x_grid_stride) { - shared_memory[tid_in_block] = TBuf(0.0f); - TBuf sum = TBuf(0.0f); - - // This loops load multiple blockDim.y-by-blockDim.x sub-tensors from the input. - for (int row = tid_y_in_grid; row < m; row += y_grid_stride) { - // Thread-level reduction. Each thread loads y_load_count_per_thread values - // and aggregrate them. -#pragma unroll(y_load_count_per_thread) - for (int row_inner = 0; row_inner < y_load_count_per_thread; ++row_inner) { - int row_final = row + row_inner * t_count_y_in_grid; - int col_final = col; - if (row_final < m && col_final < n) { - sum += TBuf(input[row_final * n_int64 + col_final]); - } - } - } - - // Write thread-level reduction result into shared memory. - shared_memory[tid_in_block] = sum; - - // Wait all threads to finish their thread-level reductions. 
- __syncthreads(); - -// This loop conducts reduction on elements stored in shared memory. -// Each block reduces blockDim.y-by-blockDim.x tensor to 1-by-blockDim.x tensor. -#pragma unroll(4) - for (int stride = blockDim.y / 2; stride > 0; stride /= 2) { - if (threadIdx.y < stride) { - shared_memory[tid_in_block] += shared_memory[tid_in_block + stride * blockDim.x]; - } - __syncthreads(); - } - - if (threadIdx.y == 0) { - atomic_add(output + col, TOut(shared_memory[threadIdx.x])); - } - } -} - -// This function reduces the given input tensor along all but the last axis. -// For example, [N, C, H, W]-tensor may lead to a output [W]-tensor. -// It's implementation is in reduction_ops.cu and called in reduction_ops.cc. -template -void call_reduce_matrix_rows(const TIn* input, TOut* output, int m, int n) { - constexpr int max_num_threads_in_block = 512; - constexpr int max_num_blocks_in_grid = 512; - constexpr int load_count_per_thread = 4; - - const int block_x_dim = least_pow2_bound(std::max(1, std::min(n, GPU_WARP_SIZE))); - const int block_y_dim = least_pow2_bound(std::max(1, std::min(max_num_threads_in_block / block_x_dim, m / load_count_per_thread))); - const int grid_x_dim = std::max(1, std::min(n / block_x_dim, max_num_blocks_in_grid)); - const int grid_y_dim = std::max(1, std::min(max_num_blocks_in_grid / grid_x_dim, m / block_y_dim / 4)); - - const dim3 grid(grid_x_dim, grid_y_dim, 1); - const dim3 block(block_x_dim, block_y_dim, 1); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_matrix_rows_kernel), dim3(grid), dim3(block), block.y * block.x * sizeof(TBuf), 0, - input, output, m, n); -} - -template -void reduce_matrix_rows(const TIn* data, TOut* output, int m, int n) { - call_reduce_matrix_rows(data, output, m, n); -} - -template <> -void reduce_matrix_rows(const half* data, half* output, int m, int n) { - call_reduce_matrix_rows(data, output, m, n); -} - -template void reduce_matrix_rows( - const float* data, float* output, int m, int n); -template void 
reduce_matrix_rows( - const double* data, double* output, int m, int n); - } // namespace rocm } // namespace onnxruntime diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_functions.h b/onnxruntime/core/providers/rocm/reduction/reduction_functions.h index 3b8c796b42..2f677c3c4b 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_functions.h +++ b/onnxruntime/core/providers/rocm/reduction/reduction_functions.h @@ -89,7 +89,7 @@ ApplicableMatrixReduction get_applicable_matrix_reduction( * @param reset_initial_output Whether to reset (i.e., zero) the output values first. */ template -Status reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, bool reset_initial_output /* TODO: = true*/); +Status reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, bool reset_initial_output = true); /** * Reduces the columns in a row-major matrix to a single column containing the sum of each row. @@ -103,30 +103,5 @@ Status reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, bool res template Status reduce_matrix_columns(const TIn* input, TOut* output, int m, int n, void* buffer, size_t buffer_size); - - - - -// -// TODO: DELETE EVERYTHING BELOW -// - -int compute_reduction_buffer_size(int element_size, int size); - -template -void reduce_sum(const TIn* input, TOut* output, int size, TOut* buffer); - -template -void reduce_square_sum(const TIn* input, TOut* output, int size, TOut* buffer); - -template -void reduce_l2_norm(const TIn* input, TOut* output, int size, TOut* buffer); - -template -void reduce_mean(const TIn* data, TOut* output, int size, TOut* buffer); - -template -void reduce_matrix_rows(const TIn* data, TOut* output, int m, int n); - } // namespace rocm } // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc index 74df6924f9..c16ced3e80 100644 --- 
a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc @@ -126,31 +126,6 @@ namespace rocm { .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ name); -static bool is_matrix_row_reduction( - const miopenReduceTensorOp_t miopen_reduce_op, - const int m, - const int n, - const size_t rank, - std::vector axes) { - if (m < 1) - return false; - - if (n < 1) - return false; - - if (rank < 2) - return false; - - if (miopen_reduce_op != MIOPEN_REDUCE_TENSOR_ADD) - return false; - - //empty axes, default reduction - if (axes.size() < 1) - return false; - - return true; -} - // TODO ReduceKernel::ReduceKernelShared() is still used by some other training classes though it's not used here - this should be refactored. template template @@ -158,42 +133,189 @@ Status ReduceKernel::ReduceKernelShared( const T* X, const TensorShape& input_shape, OutT* Y, - const TensorShape& /*output_shape*/, - miopenReduceTensorOp_t miopen_reduce_op, - std::vector& /*output_dims*/) const { - typedef typename ToHipType::MappedType HipT; - //typedef typename ToHipType::MappedType HipOutT; - //miopenDataType_t miopen_type_X = MiopenTensor::GetDataType(); - const auto rank = input_shape.NumDimensions(); - - // Block of fast matrix row reduction. - // It relies on new atomicAdd for half type, so old hip can't use it. - const auto stride = input_shape[input_shape.NumDimensions() - 1]; - const auto reduction_size = input_shape.Size() / stride; - if (fast_reduction_ && reduction_size <= std::numeric_limits::max() && stride <= std::numeric_limits::max() && - is_matrix_row_reduction(miopen_reduce_op, - static_cast(reduction_size), - static_cast(stride), rank, axes_)) { - reduce_matrix_rows( - reinterpret_cast(X), - reinterpret_cast(Y), - static_cast(reduction_size), - static_cast(stride)); - return Status::OK(); - } - - // TODO: miOpen doesn't support reduction op as CUDNN. 
Two options: - // 1) implement reduction ops by ourselves 2) ask AMD to support same reduction functionality as CUDNN. - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "reduction1 is not supported"); -} - -template Status ReduceKernel::ReduceKernelShared( - const double* X, - const TensorShape& input_shape, - double* Y, const TensorShape& output_shape, miopenReduceTensorOp_t miopen_reduce_op, - std::vector& output_dims) const; + std::vector& output_dims) const { + typedef typename ToHipType::MappedType HipT; + typedef typename ToHipType::MappedType HipOutT; + miopenDataType_t miopen_type_X = MiopenTensor::GetDataType(); + const auto rank = input_shape.NumDimensions(); + + // Block of fast matrix reduction. + if (fast_reduction_) { + int m{}, n{}; + const auto applicable_matrix_reduction = get_applicable_matrix_reduction( + miopen_reduce_op, input_shape.GetDims(), axes_, m, n); + switch (applicable_matrix_reduction) { + case ApplicableMatrixReduction::Rows: { + return reduce_matrix_rows( + reinterpret_cast(X), + reinterpret_cast(Y), + m, n, false); + } + case ApplicableMatrixReduction::Columns: + // don't call reduce_matrix_columns() since it will reset initial output data + default: + break; + } + } + + const auto& input_dims = input_shape.GetDims(); + int64_t input_count = input_shape.Size(); + IAllocatorUniquePtr temp_X; + if (ReduceTensorIndices == MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES && std::is_same::value) { + // ArgMax/ArgMin with FP16 are not supported by miopen, so convert input to fp32 then call miopen + temp_X = GetScratchBuffer(input_count); + miopen_type_X = miopenFloat; + Impl_Cast(reinterpret_cast(X), temp_X.get(), input_shape.Size()); + } + + // MIOpen requires at least 3D input, so pad 1s if needed + std::vector input_dims_miopen = input_dims; + std::vector output_dims_miopen = output_dims; + if (rank < 3) { + std::vector pads(3 - rank, 1); + input_dims_miopen.insert(input_dims_miopen.end(), pads.begin(), pads.end()); + 
output_dims_miopen.insert(output_dims_miopen.end(), pads.begin(), pads.end()); + } + + MiopenReduceDescriptor reduce_desc; + if (std::is_same::value) + ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, MiopenTensor::GetDataType(), ReduceTensorIndices)); + else + ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, ReduceTensorIndices)); + const auto one = Consts::One; + const auto zero = Consts::Zero; + MiopenTensor input_tensor; + MiopenTensor output_tensor; + ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X)); + ORT_RETURN_IF_ERROR(output_tensor.Set(output_dims_miopen, miopen_type_X)); + size_t workspace_bytes = 0; + MIOPEN_RETURN_IF_ERROR(miopenGetReductionWorkspaceSize(MiopenHandle(), reduce_desc, input_tensor, output_tensor, &workspace_bytes)); + auto workspace_rocm = GetScratchBuffer(workspace_bytes); + + size_t indices_bytes = 0; + MIOPEN_RETURN_IF_ERROR(miopenGetReductionIndicesSize(MiopenHandle(), reduce_desc, input_tensor, output_tensor, &indices_bytes)); + auto indices_rocm = GetScratchBuffer(indices_bytes); + + // need to allocate a separate buffer for ArgMin/ArgMax comparison output + auto output_count = output_shape.Size(); + + if (ReduceTensorIndices == MIOPEN_REDUCE_TENSOR_NO_INDICES) { + IAllocatorUniquePtr<T> input_data_buffer(nullptr, [](T*) {}); // owns the squared input for the rest of this branch; a pointer taken from a temporary buffer would dangle + if (calculate_sqt_) { + input_data_buffer = GetScratchBuffer<T>(input_count); + fast_divmod tmp_div; + Impl_Mul(static_cast(SimpleBroadcast::NoBroadcast), nullptr, + reinterpret_cast(X), nullptr, + reinterpret_cast(X), nullptr, + tmp_div, tmp_div, + reinterpret_cast<HipT*>(input_data_buffer.get()), input_count); + } else if (log_sum_exp_) { + // Reduce max -- Max/Min will output indices data + MiopenReduceDescriptor reduce_max_desc; + ORT_RETURN_IF_ERROR(reduce_max_desc.Set(MIOPEN_REDUCE_TENSOR_MAX, miopen_type_X, MIOPEN_REDUCE_TENSOR_NO_INDICES)); + size_t indices_bytes_max = 0; + MIOPEN_RETURN_IF_ERROR(miopenGetReductionIndicesSize(MiopenHandle(), reduce_max_desc, input_tensor, output_tensor, &indices_bytes_max)); +
auto indices_rocm_max = GetScratchBuffer(indices_bytes_max); + MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( + MiopenHandle(), reduce_max_desc, indices_rocm_max.get(), indices_bytes_max, workspace_rocm.get(), workspace_bytes, + &one, input_tensor, reinterpret_cast(X), + &zero, output_tensor, reinterpret_cast(Y))); + + // Exp(X-ReduceMax); both scratch buffers stay owned by named locals until the work below has been issued + const TensorShape rhs_shape(output_dims); + auto exp_result_buffer = GetScratchBuffer(input_count); + auto log_sum_result_buffer = GetScratchBuffer(output_count); + BinaryElementwisePreparation prepare; + ORT_RETURN_IF_ERROR(prepare.BinaryElementwiseBroadcastPrepareHelper(input_shape, rhs_shape, input_shape)); + Impl_Sub(prepare.output_rank_or_simple_broadcast, + &prepare.lhs_padded_strides, + reinterpret_cast(X), + &prepare.rhs_padded_strides, + reinterpret_cast(Y), + &prepare.fdm_output_strides, + prepare.fdm_H, prepare.fdm_C, + reinterpret_cast(exp_result_buffer.get()), input_count); + + Impl_Exp(reinterpret_cast(exp_result_buffer.get()), + reinterpret_cast(exp_result_buffer.get()), + input_count); + + // ReduceSum + MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( + MiopenHandle(), reduce_desc, indices_rocm.get(), indices_bytes, workspace_rocm.get(), workspace_bytes, + &one, input_tensor, exp_result_buffer.get(), + &zero, output_tensor, reinterpret_cast(log_sum_result_buffer.get()))); + + // Log(Sum) + Impl_Log(reinterpret_cast(log_sum_result_buffer.get()), + reinterpret_cast(log_sum_result_buffer.get()), + output_count); + + // Log + ReduceMax + fast_divmod tmp_div; + Impl_Add(static_cast(SimpleBroadcast::NoBroadcast), nullptr, + reinterpret_cast(log_sum_result_buffer.get()), nullptr, + reinterpret_cast(Y), nullptr, + tmp_div, tmp_div, + reinterpret_cast(Y), output_count); + + return Status::OK(); + } + if (calculate_sqt_) { + MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( + MiopenHandle(), reduce_desc, indices_rocm.get(), indices_bytes, workspace_rocm.get(), workspace_bytes, + &one, input_tensor, input_data_buffer.get(), + &zero, output_tensor, reinterpret_cast(Y))); + } else { + // miopenReduceTensor for ReduceSum has issue if input and output has same
size, we just need to copy the data for this case + if (input_count == output_count) { + if (reinterpret_cast(Y) != reinterpret_cast(X)) { + HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y, X, input_count * sizeof(T), hipMemcpyDeviceToDevice)); + } + } else { + MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( + MiopenHandle(), reduce_desc, indices_rocm.get(), indices_bytes, workspace_rocm.get(), workspace_bytes, + &one, input_tensor, reinterpret_cast(X), + &zero, output_tensor, reinterpret_cast(Y))); + } + } + } else { // For ArgMax & ArgMin ops, use the indicies as the output with int64 type + if (temp_X) { + auto temp_output = GetScratchBuffer(output_count); + MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( + MiopenHandle(), reduce_desc, indices_rocm.get(), indices_bytes, workspace_rocm.get(), workspace_bytes, + &one, input_tensor, temp_X.get(), + &zero, output_tensor, temp_output.get())); + } else { + auto temp_output = GetScratchBuffer(output_count); + MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( + MiopenHandle(), reduce_desc, indices_rocm.get(), indices_bytes, workspace_rocm.get(), workspace_bytes, + &one, input_tensor, reinterpret_cast(X), + &zero, output_tensor, temp_output.get())); + } + + // MIOpen reduction index is uint32_t for now, cast it to int64_t according to ONNX spec + Impl_Cast(reinterpret_cast(indices_rocm.get()), reinterpret_cast(Y), output_count); + } + + if (calculate_log_) { + Impl_Log(reinterpret_cast(Y), + reinterpret_cast(Y), + output_count); + } + + return Status::OK(); +} + +// template Status ReduceKernel::ReduceKernelShared( +// const double* X, +// const TensorShape& input_shape, +// double* Y, +// const TensorShape& output_shape, +// miopenReduceTensorOp_t miopen_reduce_op, +// std::vector& output_dims) const; template Status ReduceKernel::ReduceKernelShared( const float* X, @@ -220,50 +342,26 @@ Status PrepareForReduce(const Tensor* X, ORT_ENFORCE(nullptr != X); const TensorShape& input_shape = input_shape_override ? 
*input_shape_override : X->Shape(); - int64_t rank = static_cast(input_shape.NumDimensions()); - prepare_reduce_metadata.rank = rank; + const int64_t rank = gsl::narrow(input_shape.NumDimensions()); prepare_reduce_metadata.input_count = input_shape.Size(); - prepare_reduce_metadata.stride = (rank > 0) ? input_shape[input_shape.NumDimensions() - 1] : 1; - prepare_reduce_metadata.contiguous_axes = false; if (rank > 8) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "miopen only supports up to 8-D tensors in reduction"); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "MIOpen only supports up to 8-D tensors in reduction"); } const auto& input_dims = input_shape.GetDims(); std::vector reduced(rank, false); prepare_reduce_metadata.output_dims.reserve(input_dims.size()); if (axes.size() > 0) { - int64_t reduced_axis; - std::vector reduced_axes(axes.size()); prepare_reduce_metadata.output_dims = input_dims; - for (size_t i = 0; i < axes.size(); i++) { - reduced_axis = axes[i]; - const int64_t axis = HandleNegativeAxis(reduced_axis, rank); + for (auto axis : axes) { + axis = HandleNegativeAxis(axis, rank); ORT_ENFORCE(input_dims[axis] != 0, "Can't reduce on dim with value of 0 if 'keepdims' is false. " "Invalid output shape would be produced. input_shape:", input_shape); prepare_reduce_metadata.output_dims[axis] = 1; reduced[axis] = true; - reduced_axes[i] = axis; - } - - bool contiguous_axes = true; - std::sort(reduced_axes.begin(), reduced_axes.end()); - for (size_t i = 0; i < reduced_axes.size(); i++) { - if (reduced_axes[i] != i) { - contiguous_axes = false; - break; - } - } - int64_t stride = 1; - if (contiguous_axes) { - for (size_t s = rank - 1; s >= reduced_axes.size(); s--) { - stride *= input_dims[s]; - } - prepare_reduce_metadata.stride = stride; - prepare_reduce_metadata.contiguous_axes = true; } } else { // no axes provided (i.e.) 
default axes => reduce on all dims @@ -290,7 +388,7 @@ Status PrepareForReduce(const Tensor* X, // so the result is just a scalar, we keep 'squeezed_output_dims' empty (i.e.) no-op } - // miopen requires at least 3D input, so pad 1s if needed + // MIOpen requires at least 3D input, so pad 1s if needed prepare_reduce_metadata.input_dims_miopen = input_dims; prepare_reduce_metadata.output_dims_miopen = prepare_reduce_metadata.output_dims; if (rank < 3) { @@ -301,30 +399,24 @@ Status PrepareForReduce(const Tensor* X, prepare_reduce_metadata.output_count = TensorShape(prepare_reduce_metadata.output_dims).Size(); - if (prepare_reduce_metadata.rank == 0) { - prepare_reduce_metadata.rank = 1; - } - return Status::OK(); } // `input_shape_override` is the input shape for compute purposes (if provided) template -Status ReduceComputeCore(ROCMExecutionProvider& /*rocm_ep*/, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata, +Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata, /*out*/ Tensor& output, miopenReduceTensorOp_t miopen_reduce_op, const std::vector& axes, bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction, const TensorShape* input_shape_override) { typedef typename ToHipType::MappedType HipT; - // const TensorShape& input_shape = input_shape_override ? *input_shape_override : input.Shape(); + const TensorShape& input_shape = input_shape_override ? 
*input_shape_override : input.Shape(); int64_t input_count = prepare_reduce_metadata.input_count; int64_t output_count = prepare_reduce_metadata.output_count; - // std::vector& output_dims = prepare_reduce_metadata.output_dims; - // std::vector& input_dims_miopen = prepare_reduce_metadata.input_dims_miopen; - // std::vector& output_dims_miopen = prepare_reduce_metadata.output_dims_miopen; - int64_t rank = prepare_reduce_metadata.rank; - int64_t stride = prepare_reduce_metadata.stride; + std::vector& output_dims = prepare_reduce_metadata.output_dims; + std::vector& input_dims_miopen = prepare_reduce_metadata.input_dims_miopen; + std::vector& output_dims_miopen = prepare_reduce_metadata.output_dims_miopen; // special case when there is a dim value of 0 in the shape. if (input_count == 0) { @@ -332,57 +424,426 @@ Status ReduceComputeCore(ROCMExecutionProvider& /*rocm_ep*/, const Tensor& input return Status::OK(); } + // Block of fast matrix reduction. + if (fast_reduction) { + int m{}, n{}; + const auto applicable_matrix_reduction = get_applicable_matrix_reduction( + miopen_reduce_op, input_shape.GetDims(), axes, m, n); + switch (applicable_matrix_reduction) { + case ApplicableMatrixReduction::Rows: { + return reduce_matrix_rows( + reinterpret_cast(input.template Data()), + reinterpret_cast(output.template MutableData()), + m, n); + } + case ApplicableMatrixReduction::Columns: { + const auto buffer_size_bytes = compute_reduce_matrix_columns_buffer_size(m, n); + auto buffer = rocm_ep.GetScratchBuffer(buffer_size_bytes); + return reduce_matrix_columns( + reinterpret_cast(input.template Data()), + reinterpret_cast(output.template MutableData()), + m, n, buffer.get(), buffer_size_bytes); + } + default: + break; + } + } + // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. 
// Therefore zeroing out the memory is required HIP_RETURN_IF_ERROR(hipMemsetAsync(output.MutableDataRaw(), 0, output.SizeInBytes())); - // Block of fast matrix row reduction. - // It relies on new atomicAdd for half type, so old CUDA can't use it. - const auto reduction_size = input_count / stride; - if (!std::is_same::value && !std::is_same::value) { - if (fast_reduction && reduction_size <= std::numeric_limits::max() && stride <= std::numeric_limits::max() && - prepare_reduce_metadata.contiguous_axes && - is_matrix_row_reduction(miopen_reduce_op, static_cast(reduction_size), static_cast(stride), rank, axes)) { - reduce_matrix_rows( - reinterpret_cast(input.template Data()), - reinterpret_cast(output.template MutableData()), - static_cast(reduction_size), - static_cast(stride)); + IAllocatorUniquePtr temp_X; + miopenDataType_t miopen_type_X = MiopenTensor::GetDataType(); + + if (ReduceTensorIndices == MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES && std::is_same::value) { + // ArgMax/ArgMin with FP16 are not supported by miopen, so convert input to fp32 then call miopen + temp_X = rocm_ep.GetScratchBuffer(input_count); + miopen_type_X = miopenFloat; + Impl_Cast(reinterpret_cast(input.template Data()), temp_X.get(), input_shape.Size()); + } + + MiopenReduceDescriptor reduce_desc; + if (std::is_same::value) { + ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, MiopenTensor::GetDataType(), ReduceTensorIndices)); + } else { + ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, ReduceTensorIndices)); + } + + const auto one = Consts::One; + const auto zero = Consts::Zero; + MiopenTensor input_tensor; + MiopenTensor output_tensor; + ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X)); + ORT_RETURN_IF_ERROR(output_tensor.Set(output_dims_miopen, miopen_type_X)); + size_t workspace_bytes = 0; + MIOPEN_RETURN_IF_ERROR(miopenGetReductionWorkspaceSize(rocm_ep.PerThreadMiopenHandle(), reduce_desc, + input_tensor, output_tensor, 
&workspace_bytes)); + auto workspace_rocm = rocm_ep.GetScratchBuffer(workspace_bytes); + + size_t indices_bytes = 0; + MIOPEN_RETURN_IF_ERROR(miopenGetReductionIndicesSize(rocm_ep.PerThreadMiopenHandle(), reduce_desc, + input_tensor, output_tensor, &indices_bytes)); + auto indices_rocm = rocm_ep.GetScratchBuffer(indices_bytes); + + if (ReduceTensorIndices == MIOPEN_REDUCE_TENSOR_NO_INDICES) { + IAllocatorUniquePtr input_data_buffer(nullptr, [](T*) {}); + HipT* input_data = nullptr; + if (calculate_sqt) { + input_data_buffer = rocm_ep.GetScratchBuffer(input_count); + input_data = reinterpret_cast(input_data_buffer.get()); + fast_divmod tmp_div; + Impl_Mul(static_cast(SimpleBroadcast::NoBroadcast), nullptr, + reinterpret_cast(input.template Data()), nullptr, + reinterpret_cast(input.template Data()), nullptr, + tmp_div, tmp_div, + input_data, input_count); + } else if (log_sum_exp) { + // miopenReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case + // This happens when the input is Scalar + if (input_count == output_count) { + if (output.template MutableData() != input.template Data()) { + HIP_RETURN_IF_ERROR(hipMemcpyAsync(output.template MutableData(), input.template Data(), input_count * sizeof(T), hipMemcpyDeviceToDevice)); + } + } else { + // Reduce max -- Max/Min will output indices data + MiopenReduceDescriptor reduce_max_desc; + miopenDataType_t miopen_reduce_max_type = miopen_type_X; + if ((std::is_same::value)) { + miopen_reduce_max_type = miopenFloat; + } + ORT_RETURN_IF_ERROR(reduce_max_desc.Set(MIOPEN_REDUCE_TENSOR_MAX, miopen_reduce_max_type, MIOPEN_REDUCE_TENSOR_NO_INDICES)); + size_t indices_bytes_max = 0; + MIOPEN_RETURN_IF_ERROR(miopenGetReductionIndicesSize(rocm_ep.PerThreadMiopenHandle(), reduce_max_desc, + input_tensor, output_tensor, &indices_bytes_max)); + auto indices_rocm_max = rocm_ep.GetScratchBuffer(indices_bytes); + MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( + 
rocm_ep.PerThreadMiopenHandle(), reduce_max_desc, indices_rocm_max.get(), indices_bytes_max, + workspace_rocm.get(), workspace_bytes, + &one, input_tensor, reinterpret_cast(input.template Data()), + &zero, output_tensor, reinterpret_cast(output.template MutableData()))); + } + + // Exp(X-ReduceMax) + const TensorShape output_shape(output_dims); + auto exp_result_buffer = rocm_ep.GetScratchBuffer(input_count); + auto exp_result = exp_result_buffer.get(); + auto log_sum_result_buffer = rocm_ep.GetScratchBuffer(output_count); + auto log_sum_result = log_sum_result_buffer.get(); + BinaryElementwisePreparation prepare; + ORT_RETURN_IF_ERROR(prepare.BinaryElementwiseBroadcastPrepareHelper(input_shape, output_shape, input_shape)); + Impl_Sub(prepare.output_rank_or_simple_broadcast, + &prepare.lhs_padded_strides, + reinterpret_cast(input.template Data()), + &prepare.rhs_padded_strides, + reinterpret_cast(output.template MutableData()), + &prepare.fdm_output_strides, + prepare.fdm_H, prepare.fdm_C, + reinterpret_cast(exp_result), input_count); + + Impl_Exp(reinterpret_cast(exp_result), + reinterpret_cast(exp_result), + input_count); + + // miopenReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case + // This happens when the input is Scalar. We do not need to add anything in this case. 
+ if (input_count == output_count) { + HIP_RETURN_IF_ERROR(hipMemcpyAsync(reinterpret_cast(log_sum_result), exp_result, input_count * sizeof(T), hipMemcpyDeviceToDevice)); + } else { + // ReduceSum + MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( + rocm_ep.PerThreadMiopenHandle(), reduce_desc, indices_rocm.get(), indices_bytes, + workspace_rocm.get(), workspace_bytes, + &one, input_tensor, exp_result, + &zero, output_tensor, reinterpret_cast(log_sum_result))); + } + + // Log(Sum) + Impl_Log(reinterpret_cast(log_sum_result), + reinterpret_cast(log_sum_result), + output_count); + + // Log + ReduceMax + fast_divmod tmp_div; + Impl_Add(static_cast(SimpleBroadcast::NoBroadcast), nullptr, + reinterpret_cast(log_sum_result), nullptr, + reinterpret_cast(output.template MutableData()), nullptr, + tmp_div, tmp_div, + reinterpret_cast(output.template MutableData()), output_count); + return Status::OK(); } - } - - if (input_count == output_count) { - if (output.template MutableData() != input.template Data()) { - HIP_RETURN_IF_ERROR(hipMemcpyAsync(output.template MutableData(), input.template Data(), input_count * sizeof(T), hipMemcpyDeviceToDevice)); + if (calculate_sqt) { + // miopenReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case + // This happens when the input is Scalar. We do not need to add anything in this case. 
+ if (input_count == output_count) { + HIP_RETURN_IF_ERROR(hipMemcpyAsync(reinterpret_cast(output.template MutableData()), input_data, input_count * sizeof(T), hipMemcpyDeviceToDevice)); + } else { + MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( + rocm_ep.PerThreadMiopenHandle(), reduce_desc, indices_rocm.get(), indices_bytes, + workspace_rocm.get(), workspace_bytes, + &one, input_tensor, input_data, + &zero, output_tensor, reinterpret_cast(output.template MutableData()))); + } + } else { + // miopenReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case + if (input_count == output_count) { + if (output.template MutableData() != input.template Data()) { + HIP_RETURN_IF_ERROR(hipMemcpyAsync(output.template MutableData(), input.template Data(), input_count * sizeof(T), hipMemcpyDeviceToDevice)); + } + } else { + MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( + rocm_ep.PerThreadMiopenHandle(), reduce_desc, indices_rocm.get(), indices_bytes, + workspace_rocm.get(), workspace_bytes, + &one, input_tensor, reinterpret_cast(input.template Data()), + &zero, output_tensor, reinterpret_cast(output.template MutableData()))); + } } - return Status::OK(); + } else { // For ArgMax & ArgMin ops, use the indicies as the output with int64 type + if (temp_X) { + auto temp_output = rocm_ep.GetScratchBuffer(output_count); + MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( + rocm_ep.PerThreadMiopenHandle(), reduce_desc, indices_rocm.get(), indices_bytes, + workspace_rocm.get(), workspace_bytes, + &one, input_tensor, temp_X.get(), + &zero, output_tensor, temp_output.get())); + } else { + auto temp_output = rocm_ep.GetScratchBuffer(output_count); + MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( + rocm_ep.PerThreadMiopenHandle(), reduce_desc, indices_rocm.get(), indices_bytes, + workspace_rocm.get(), workspace_bytes, + &one, input_tensor, reinterpret_cast(input.template Data()), + &zero, output_tensor, temp_output.get())); + } + + // MIOpen 
reduction index is uint32_t for now, cast it to int64_t according to ONNX spec + Impl_Cast(reinterpret_cast(indices_rocm.get()), output.template MutableData(), output_count); } - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "reduction2 is not supported"); + if (calculate_log) { + Impl_Log(reinterpret_cast(output.template MutableData()), + reinterpret_cast(output.template MutableData()), + output_count); + } + + return Status::OK(); } + template template Status ReduceKernel::ComputeImpl(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const { const Tensor* X = ctx->Input(0); + std::vector axes; + + size_t num_inputs = ctx->InputCount(); + if (num_inputs == 2) { + //override the attribute value with the input value for reduction_axes + const Tensor* axes_tensor = ctx->Input(1); + ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null"); + ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, "An axes tensor must be a vector tensor."); + auto nDims = static_cast(axes_tensor->Shape()[0]); + const auto* data = axes_tensor->template Data(); + axes.assign(data, data + nDims); + } else { + axes.assign(axes_.begin(), axes_.end()); + } + + // empty axes and no-op + if (axes.empty() && noop_with_empty_axes_) { + auto* Y = ctx->Output(0, X->Shape()); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), hipMemcpyDeviceToDevice)); + return Status::OK(); + } PrepareReduceMetadata prepare_reduce_metadata; ORT_RETURN_IF_ERROR(PrepareForReduce(X, keepdims_, - axes_, + axes, prepare_reduce_metadata)); Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims); const bool fast_reduction = fast_reduction_ && !ctx->GetUseDeterministicCompute(); - return ReduceComputeCore(*rocm_ep_, *X, prepare_reduce_metadata, *Y, miopen_reduce_op, axes_, + return ReduceComputeCore(*rocm_ep_, *X, prepare_reduce_metadata, *Y, miopen_reduce_op, axes, calculate_log_, calculate_sqt_, log_sum_exp_, fast_reduction); } template <> 
template <> -Status ReduceKernel::ComputeImpl(OpKernelContext* /*ctx*/, miopenReduceTensorOp_t /*miopen_reduce_op*/) const { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, Node().OpType(), " is not supported"); +Status ReduceKernel::ComputeImpl(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const { + typedef typename ToHipType::MappedType HipT; + + const Tensor* X = ctx->Input(0); + std::vector axes; + + size_t num_inputs = ctx->InputCount(); + if (num_inputs == 2) { + //override the attribute value with the input value for reduction_axes + const Tensor* axes_tensor = ctx->Input(1); + ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null"); + ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, "An axes tensor must be a vector tensor."); + auto nDims = static_cast(axes_tensor->Shape()[0]); + const auto* data = axes_tensor->template Data(); + axes.assign(data, data + nDims); + } else { + axes.assign(axes_.begin(), axes_.end()); + } + + // empty axes and no-op + if (axes.empty() && noop_with_empty_axes_) { + auto* Y = ctx->Output(0, X->Shape()); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), hipMemcpyDeviceToDevice)); + return Status::OK(); + } + + PrepareReduceMetadata prepare_reduce_metadata; + + ORT_RETURN_IF_ERROR(PrepareForReduce(X, + keepdims_, + axes, + prepare_reduce_metadata)); + + Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims); + + int64_t input_count = prepare_reduce_metadata.input_count; + int64_t output_count = prepare_reduce_metadata.output_count; + std::vector& input_dims_miopen = prepare_reduce_metadata.input_dims_miopen; + std::vector& output_dims_miopen = prepare_reduce_metadata.output_dims_miopen; + + // special case when there is a dim value of 0 in the shape. 
+ if (input_count == 0) { + assert(Y->Shape().Size() == 0); + return Status::OK(); + } + + // miopenReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case + if (input_count == output_count) { + if (Y->template MutableData() != X->template Data()) { + HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), input_count * sizeof(int32_t), hipMemcpyDeviceToDevice)); + } + return Status::OK(); + } + + // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. + // Therefore zeroing out the memory is required + HIP_RETURN_IF_ERROR(hipMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes())); + + size_t indices_bytes = 0; + size_t workspace_bytes = 0; + MiopenTensor input_tensor; + MiopenTensor output_tensor; + MiopenReduceDescriptor reduce_desc; + + miopenDataType_t miopen_type_X = miopenFloat; + IAllocatorUniquePtr temp_X = GetScratchBuffer(input_count); + Impl_Cast(reinterpret_cast(X->template Data()), temp_X.get(), X->Shape().Size()); + + ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES)); + ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X)); + ORT_RETURN_IF_ERROR(output_tensor.Set(output_dims_miopen, miopen_type_X)); + MIOPEN_RETURN_IF_ERROR(miopenGetReductionIndicesSize(MiopenHandle(), reduce_desc, input_tensor, output_tensor, &indices_bytes)); + MIOPEN_RETURN_IF_ERROR(miopenGetReductionWorkspaceSize(MiopenHandle(), reduce_desc, input_tensor, output_tensor, &workspace_bytes)); + IAllocatorUniquePtr indices_rocm = GetScratchBuffer(indices_bytes); + IAllocatorUniquePtr workspace_rocm = GetScratchBuffer(workspace_bytes); + + const auto one = Consts::One; + const auto zero = Consts::Zero; + auto temp_Y = GetScratchBuffer(output_count); + MIOPEN_RETURN_IF_ERROR(miopenReduceTensor(MiopenHandle(), + reduce_desc, + indices_rocm.get(), + 
indices_bytes, + workspace_rocm.get(), + workspace_bytes, + &one, + input_tensor, + temp_X.get(), + &zero, + output_tensor, + temp_Y.get())); + + Impl_Cast(temp_Y.get(), Y->template MutableData(), output_count); + + return Status::OK(); +} + +template <> +template <> +Status ReduceKernel::ComputeImpl(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const { + typedef typename ToHipType::MappedType HipT; + + const Tensor* X = ctx->Input(0); + PrepareReduceMetadata prepare_reduce_metadata; + + ORT_RETURN_IF_ERROR(PrepareForReduce(X, + keepdims_, + axes_, + prepare_reduce_metadata)); + + Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims); + + int64_t input_count = prepare_reduce_metadata.input_count; + int64_t output_count = prepare_reduce_metadata.output_count; + std::vector& input_dims_miopen = prepare_reduce_metadata.input_dims_miopen; + std::vector& output_dims_miopen = prepare_reduce_metadata.output_dims_miopen; + + // special case when there is a dim value of 0 in the shape. + if (input_count == 0) { + assert(Y->Shape().Size() == 0); + return Status::OK(); + } + + // miopenReduceTensor has issue if input and output has same size, we just need to copy the data for this case + auto* const dst = Y->template MutableData(); + const auto* const src = X->template Data(); + if (input_count == output_count) { + if (src != dst) { + HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst, src, input_count * sizeof(int8_t), hipMemcpyDeviceToDevice)); + } + return Status::OK(); + } + + // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. 
+ // Therefore zeroing out the memory is required + HIP_RETURN_IF_ERROR(hipMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes())); + + size_t indices_bytes = 0; + size_t workspace_bytes = 0; + MiopenTensor input_tensor; + MiopenTensor output_tensor; + MiopenReduceDescriptor reduce_desc; + + miopenDataType_t miopen_type_X = miopenFloat; + IAllocatorUniquePtr temp_X = GetScratchBuffer(input_count); + Impl_Cast(reinterpret_cast(src), temp_X.get(), X->Shape().Size()); + + ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES)); + ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X)); + ORT_RETURN_IF_ERROR(output_tensor.Set(output_dims_miopen, miopen_type_X)); + MIOPEN_RETURN_IF_ERROR(miopenGetReductionIndicesSize(MiopenHandle(), reduce_desc, input_tensor, output_tensor, &indices_bytes)); + MIOPEN_RETURN_IF_ERROR(miopenGetReductionWorkspaceSize(MiopenHandle(), reduce_desc, input_tensor, output_tensor, &workspace_bytes)); + IAllocatorUniquePtr indices_rocm = GetScratchBuffer(indices_bytes); + IAllocatorUniquePtr workspace_rocm = GetScratchBuffer(workspace_bytes); + + const auto one = Consts::One; + const auto zero = Consts::Zero; + auto temp_Y = GetScratchBuffer(output_count); + MIOPEN_RETURN_IF_ERROR(miopenReduceTensor(MiopenHandle(), + reduce_desc, + indices_rocm.get(), + indices_bytes, + workspace_rocm.get(), + workspace_bytes, + &one, + input_tensor, + temp_X.get(), + &zero, + output_tensor, + temp_Y.get())); + + Impl_Cast(temp_Y.get(), dst, output_count); + + return Status::OK(); } namespace ReductionOps { @@ -424,24 +885,24 @@ template Tensor ReduceCompute( bool keep_dims, bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction, const TensorShape* input_shape_override); -template Tensor ReduceCompute( - ROCMExecutionProvider& rocm_ep, miopenReduceTensorOp_t miopen_reduce_op, - AllocatorPtr allocator, - const Tensor& input, const std::vector& axes, - bool keep_dims, bool 
calculate_log, bool calculate_sqt, bool log_sum_exp, - bool fast_reduction, const TensorShape* input_shape_override); +// template Tensor ReduceCompute( +// ROCMExecutionProvider& rocm_ep, miopenReduceTensorOp_t miopen_reduce_op, +// AllocatorPtr allocator, +// const Tensor& input, const std::vector& axes, +// bool keep_dims, bool calculate_log, bool calculate_sqt, bool log_sum_exp, +// bool fast_reduction, const TensorShape* input_shape_override); } // namespace ReductionOps #define REGISTER_KERNEL_HFD(name) \ REGISTER_KERNEL_TYPED(name, MLFloat16) \ - REGISTER_KERNEL_TYPED(name, float) \ - REGISTER_KERNEL_TYPED(name, double) + REGISTER_KERNEL_TYPED(name, float) + // REGISTER_KERNEL_TYPED(name, double) #define REGISTER_KERNEL_HFD_11(name) \ REGISTER_KERNEL_TYPED_11(name, MLFloat16) \ - REGISTER_KERNEL_TYPED_11(name, float) \ - REGISTER_KERNEL_TYPED_11(name, double) + REGISTER_KERNEL_TYPED_11(name, float) + // REGISTER_KERNEL_TYPED_11(name, double) REGISTER_KERNEL_HFD_11(ArgMax) REGISTER_KERNEL_HFD_11(ArgMin) @@ -450,22 +911,27 @@ REGISTER_KERNEL_HFD(ReduceL2) REGISTER_KERNEL_TYPED_12(ReduceMax, MLFloat16) REGISTER_KERNEL_TYPED_12(ReduceMax, float) -REGISTER_KERNEL_TYPED_12(ReduceMax, double) +// REGISTER_KERNEL_TYPED_12(ReduceMax, double) REGISTER_KERNEL_TYPED_12(ReduceMax, int32_t) REGISTER_KERNEL_TYPED_12(ReduceMax, int8_t) -REGISTER_KERNEL_TYPED_12(ReduceMax, uint8_t) +// REGISTER_KERNEL_TYPED_12(ReduceMax, uint8_t) REGISTER_KERNEL_HFD(ReduceMean) REGISTER_KERNEL_TYPED_12(ReduceMin, MLFloat16) REGISTER_KERNEL_TYPED_12(ReduceMin, float) -REGISTER_KERNEL_TYPED_12(ReduceMin, double) +// REGISTER_KERNEL_TYPED_12(ReduceMin, double) REGISTER_KERNEL_TYPED_12(ReduceMin, int32_t) REGISTER_KERNEL_TYPED_12(ReduceMin, int8_t) -REGISTER_KERNEL_TYPED_12(ReduceMin, uint8_t) +// REGISTER_KERNEL_TYPED_12(ReduceMin, uint8_t) REGISTER_KERNEL_HFD(ReduceProd) -REGISTER_KERNEL_HFD(ReduceSum) + +REGISTER_KERNEL_TYPED_13(ReduceSum, MLFloat16) +REGISTER_KERNEL_TYPED_13(ReduceSum, 
float) +// REGISTER_KERNEL_TYPED_13(ReduceSum, double) +REGISTER_KERNEL_TYPED_13(ReduceSum, int32_t) + REGISTER_KERNEL_HFD(ReduceLogSum) REGISTER_KERNEL_HFD(ReduceSumSquare) REGISTER_KERNEL_HFD(ReduceLogSumExp) @@ -478,7 +944,6 @@ REGISTER_KERNEL_INT32(ReduceL2) REGISTER_KERNEL_INT32(ReduceMean) REGISTER_KERNEL_INT32(ReduceProd) -REGISTER_KERNEL_INT32(ReduceSum) } // namespace rocm -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.h b/onnxruntime/core/providers/rocm/reduction/reduction_ops.h index f402851db1..3cacb4367e 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.h +++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.h @@ -33,13 +33,6 @@ struct PrepareReduceMetadata { std::vector squeezed_output_dims; std::vector input_dims_miopen; std::vector output_dims_miopen; - - // - // TODO: delete these fields - // - int64_t rank; - int64_t stride; - bool contiguous_axes; }; template diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_utils.cuh b/onnxruntime/core/providers/rocm/reduction/reduction_utils.cuh index 9d9d210f3f..4ac5710d10 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_utils.cuh +++ b/onnxruntime/core/providers/rocm/reduction/reduction_utils.cuh @@ -18,65 +18,22 @@ __forceinline__ __host__ __device__ int least_pow2_bound(int value) { return static_cast(++value_); } -struct Square2 { +struct Square { template __forceinline__ __device__ T operator()(const T& value) { return value * value; } }; -struct Sqrt2 { - template - __forceinline__ __device__ T operator()(const T& value) { - return _Sqrt(value); - } -}; - -struct Identity2 { - template - __forceinline__ __device__ T operator()(const T& value) { - return value; - } -}; - - - -// -// TODO: DELETE EVERYTHING BELOW -// TODO: RENAME STRUCTS ABOVE (no '2') -// - -template -struct Cast { - __forceinline__ __device__ TAccumulated operator()(const 
TValue& value) { - return TAccumulated(value); - } -}; - -template -struct Square { - __forceinline__ __device__ TAccumulated operator()(const TValue& value) { - return TAccumulated(value) * TAccumulated(value); - } -}; - -template -struct Abs { - __forceinline__ __device__ TAccumulated operator()(const TValue& value) { - TAccumulated value_ = TAccumulated(value); - return value_ > TAccumulated(0) ? value_ : -value_; - } -}; - -template struct Sqrt { + template __forceinline__ __device__ T operator()(const T& value) { return _Sqrt(value); } }; -template struct Identity { + template __forceinline__ __device__ T operator()(const T& value) { return value; } diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc index 6df7546236..dbaa7c3a20 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc @@ -1167,47 +1167,47 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) { // BuildKernelCreateInfo, // BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + // BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + // BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + // BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + // BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + // BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + // BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + // BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + // BuildKernelCreateInfo, 
BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + // BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + // BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + // BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + // BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -1337,10 +1337,10 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) { // opset 11 BuildKernelCreateInfo, - BuildKernelCreateInfo, + // BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + // BuildKernelCreateInfo, BuildKernelCreateInfo, // BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -1355,41 +1355,41 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) { // BuildKernelCreateInfo, // BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + // BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + // BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + // BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + // BuildKernelCreateInfo, BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + // BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + // 
BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + // BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + // BuildKernelCreateInfo, BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, @@ -1451,18 +1451,18 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -1620,39 +1620,39 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) { // BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, 
+ BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, }; diff --git a/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cc b/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cc index 6e93b8795b..ce45bd8c7b 100644 --- a/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cc +++ b/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cc @@ -73,15 +73,13 @@ Status SoftmaxCrossEntropy::ComputeInternal(OpKernelContext* ctx) const { std::vector output_dims(2, 1); Tensor* Y = ctx->Output(0, TensorShape({})); // Sum((label * log(softmax)) using Reduction - ReduceKernelShared( + return ReduceKernelShared( temp_X.get(), logit_reshape, Y->template MutableData(), TensorShape({}), CUDNN_REDUCE_TENSOR_ADD, output_dims); - - return Status::OK(); } template @@ -199,15 +197,13 @@ Status SparseSoftmaxCrossEntropy::ComputeInternal(OpKernelContext* ctx) // ReduceSum on loss_per_sample std::vector output_dims(1, 1); - ReduceKernelShared( + return ReduceKernelShared( tmp_loss_sample.get(), label_reshape, total_loss_data, TensorShape({}), CUDNN_REDUCE_TENSOR_ADD, output_dims); - - return Status::OK(); } template diff --git a/orttraining/orttraining/training_ops/rocm/loss/softmax_cross_entropy_loss_impl.cc b/orttraining/orttraining/training_ops/rocm/loss/softmax_cross_entropy_loss_impl.cc index fe112e7d77..c7681e2115 100644 --- 
a/orttraining/orttraining/training_ops/rocm/loss/softmax_cross_entropy_loss_impl.cc +++ b/orttraining/orttraining/training_ops/rocm/loss/softmax_cross_entropy_loss_impl.cc @@ -113,16 +113,17 @@ Status SoftmaxCrossEntropyLoss::ComputeInternal(OpKernelContext* ctx) co auto normalize_factor_data = GetScratchBuffer(1); if (reduction_ == ReductionType::MEAN) { // Compute buffer size in byte for reduction APIs. - const auto buffer_size = static_cast( - compute_reduction_buffer_size( - static_cast(sizeof(T)), static_cast(N_D))); + const auto buffer_size = + compute_reduction_buffer_size(static_cast(N_D)); // Allocate reduction buffer whose size is buffer_size bytes. IAllocatorUniquePtr reduction_buffer = GetScratchBuffer( buffer_size); - reduce_sum(weight_data_nd_data, - normalize_factor_data.get(), - static_cast(N_D), - reinterpret_cast(reduction_buffer.get())); + ORT_RETURN_IF_ERROR(reduce_sum( + weight_data_nd_data, + normalize_factor_data.get(), + static_cast(N_D), + reduction_buffer.get(), + buffer_size)); } else { const T normalize_factor = static_cast(1); HIP_RETURN_IF_ERROR(hipMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), hipMemcpyHostToDevice)); @@ -153,26 +154,14 @@ Status SoftmaxCrossEntropyLoss::ComputeInternal(OpKernelContext* ctx) co if (reduction_ != ReductionType::NONE) { // ReduceSum on loss_per_sample - // std::vector output_dims(1, 1); - // ReduceKernelShared( - // tmp_loss_sample_buffer, - // label_reshape, - // total_loss_data, - // TensorShape({}), - // MIOPEN_REDUCE_TENSOR_ADD, - // output_dims); - // Compute buffer size in byte for reduction APIs. - const auto tmp_buffer_size = static_cast( - compute_reduction_buffer_size( - static_cast(sizeof(T)), static_cast(N_D))); - // Allocate reduction buffer whose size is buffer_size bytes. 
- IAllocatorUniquePtr tmp_reduction_buffer = GetScratchBuffer( - tmp_buffer_size); - reduce_sum(tmp_loss_sample_buffer, - total_loss_data, - static_cast(N_D), - reinterpret_cast(tmp_reduction_buffer.get())); - return Status::OK(); + std::vector output_dims(1, 1); + ReduceKernelShared( + tmp_loss_sample_buffer, + label_reshape, + total_loss_data, + TensorShape({}), + MIOPEN_REDUCE_TENSOR_ADD, + output_dims); } return Status::OK(); @@ -225,16 +214,17 @@ Status SoftmaxCrossEntropyLossGrad::ComputeInternal(OpKernelContext* ctx auto normalize_factor_data = GetScratchBuffer(1); if (reduction_ == ReductionType::MEAN) { // Compute buffer size in byte for reduction APIs. - const auto buffer_size = static_cast( - compute_reduction_buffer_size( - static_cast(sizeof(T)), static_cast(N_D))); + const auto buffer_size = + compute_reduction_buffer_size(static_cast(N_D)); // Allocate reduction buffer whose size is buffer_size bytes. IAllocatorUniquePtr reduction_buffer = GetScratchBuffer( buffer_size); - reduce_sum(weight_data_nd_data, - normalize_factor_data.get(), - static_cast(N_D), - reinterpret_cast(reduction_buffer.get())); + ORT_RETURN_IF_ERROR(reduce_sum( + weight_data_nd_data, + normalize_factor_data.get(), + static_cast(N_D), + reduction_buffer.get(), + buffer_size)); } else { const T normalize_factor = static_cast(1); HIP_RETURN_IF_ERROR(hipMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), hipMemcpyHostToDevice)); diff --git a/orttraining/orttraining/training_ops/rocm/loss/softmaxcrossentropy_impl.cc b/orttraining/orttraining/training_ops/rocm/loss/softmaxcrossentropy_impl.cc index b89f98d4ab..7cc017f360 100644 --- a/orttraining/orttraining/training_ops/rocm/loss/softmaxcrossentropy_impl.cc +++ b/orttraining/orttraining/training_ops/rocm/loss/softmaxcrossentropy_impl.cc @@ -73,7 +73,7 @@ Status SoftmaxCrossEntropy::ComputeInternal(OpKernelContext* ctx) const { std::vector output_dims(2, 1); Tensor* Y = ctx->Output(0, TensorShape({})); // Sum((label * 
log(softmax)) using Reduction - return ReduceKernelShared( + return ReduceKernelShared( temp_X.get(), logit_reshape, Y->template MutableData(), @@ -173,16 +173,17 @@ Status SparseSoftmaxCrossEntropy::ComputeInternal(OpKernelContext* ctx) hipMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), hipMemcpyHostToDevice); } else { // Compute buffer size in byte for reduction APIs. - const auto buffer_size = static_cast( - compute_reduction_buffer_size( - static_cast(sizeof(T)), static_cast(N))); + const auto buffer_size = + compute_reduction_buffer_size(static_cast(N)); // Allocate reduction buffer whose size is buffer_size bytes. IAllocatorUniquePtr reduction_buffer = GetScratchBuffer( buffer_size); - reduce_sum(weight_data, - normalize_factor_data.get(), - static_cast(N), - reinterpret_cast(reduction_buffer.get())); + ORT_RETURN_IF_ERROR(reduce_sum( + weight_data, + normalize_factor_data.get(), + static_cast(N), + reduction_buffer.get(), + buffer_size)); } } @@ -194,28 +195,15 @@ Status SparseSoftmaxCrossEntropy::ComputeInternal(OpKernelContext* ctx) N, D); - // Compute buffer size in byte for reduction APIs. - const auto tmp_buffer_size = static_cast( - compute_reduction_buffer_size( - static_cast(sizeof(T)), static_cast(N))); - // Allocate reduction buffer whose size is buffer_size bytes. 
- IAllocatorUniquePtr tmp_reduction_buffer = GetScratchBuffer( - tmp_buffer_size); - reduce_sum(tmp_loss_sample.get(), - total_loss_data, - static_cast(N), - reinterpret_cast(tmp_reduction_buffer.get())); - return Status::OK(); - // ReduceSum on loss_per_sample - // std::vector output_dims(1, 1); - // return ReduceKernelShared( - // tmp_loss_sample.get(), - // label_reshape, - // total_loss_data, - // TensorShape({}), - // MIOPEN_REDUCE_TENSOR_ADD, - // output_dims); + std::vector output_dims(1, 1); + return ReduceKernelShared( + tmp_loss_sample.get(), + label_reshape, + total_loss_data, + TensorShape({}), + MIOPEN_REDUCE_TENSOR_ADD, + output_dims); } template @@ -260,16 +248,17 @@ Status SparseSoftmaxCrossEntropyGrad::ComputeInternal(OpKernelContext* c hipMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), hipMemcpyHostToDevice); } else { // Compute buffer size in byte for reduction APIs. - const auto buffer_size = static_cast( - compute_reduction_buffer_size( - static_cast(sizeof(T)), static_cast(N))); + const auto buffer_size = + compute_reduction_buffer_size(static_cast(N)); // Allocate reduction buffer whose size is buffer_size bytes. 
IAllocatorUniquePtr reduction_buffer = GetScratchBuffer( buffer_size); - reduce_sum(weight_data, - normalize_factor_data.get(), - static_cast(N), - reinterpret_cast(reduction_buffer.get())); + ORT_RETURN_IF_ERROR(reduce_sum( + weight_data, + normalize_factor_data.get(), + static_cast(N), + reduction_buffer.get(), + buffer_size)); } } diff --git a/orttraining/orttraining/training_ops/rocm/optimizer/lamb.cc b/orttraining/orttraining/training_ops/rocm/optimizer/lamb.cc index 2c51b6ec9c..e7ecdfcde0 100644 --- a/orttraining/orttraining/training_ops/rocm/optimizer/lamb.cc +++ b/orttraining/orttraining/training_ops/rocm/optimizer/lamb.cc @@ -69,7 +69,7 @@ std::vector> GenerateLambExtraAliasMapping() { REGISTER_LAMB_KERNEL_TYPED(float, float, MLFloat16, float, MLFloat16, MLFloat16) REGISTER_LAMB_KERNEL_TYPED(float, float, MLFloat16, float, float, MLFloat16) REGISTER_LAMB_KERNEL_TYPED(float, float, float, float, float, MLFloat16) -REGISTER_LAMB_KERNEL_TYPED(double, double, double, double, double, MLFloat16) +// REGISTER_LAMB_KERNEL_TYPED(double, double, double, double, double, MLFloat16) // REGISTER_LAMB_KERNEL_TYPED(MLFloat16, float, MLFloat16, MLFloat16, MLFloat16, MLFloat16) // REGISTER_LAMB_KERNEL_TYPED(MLFloat16, float, MLFloat16, MLFloat16, float, MLFloat16) REGISTER_LAMB_KERNEL_TYPED(MLFloat16, float, MLFloat16, float, MLFloat16, MLFloat16) @@ -204,8 +204,10 @@ Status launch_lamb_compute_direction( for (int i = 0; i < group_count; ++i) { if (tensor_sizes[i] > max_tensor_size) { // For the first iteration (indexed by 0), the update count should be 2. - const float alpha_correction = do_bias_correction ? onnxruntime::contrib::compute_bias_correction_coefficient(alphas[i], update_count) : 1.f; - const float beta_correction = do_bias_correction ? onnxruntime::contrib::compute_bias_correction_coefficient(betas[i], update_count) : 1.f; + const float alpha_correction = + do_bias_correction ? 
onnxruntime::contrib::compute_bias_correction_coefficient(alphas[i], update_count) : 1.f; + const float beta_correction = + do_bias_correction ? onnxruntime::contrib::compute_bias_correction_coefficient(betas[i], update_count) : 1.f; LambComputeDirection( p_ws[i], @@ -274,7 +276,7 @@ Status launch_lamb_reduction( std::vector& p_d_norms, std::vector& p_ws, std::vector& p_ds, - HipTNorm* reduction_buffer, + void* reduction_buffer, size_t reduction_buffer_size) { ORT_ENFORCE(group_count == static_cast(tensor_sizes.size())); @@ -293,16 +295,18 @@ Status launch_lamb_reduction( const int max_tensor_size = compute_max_tensor_size_per_launch(4); for (int i = 0; i < group_count; ++i) { if (tensor_sizes[i] > max_tensor_size) { - reduce_square_sum( + ORT_RETURN_IF_ERROR(reduce_square_sum( p_ws[i], p_w_norms[i], tensor_sizes[i], - reduction_buffer); - reduce_square_sum( + reduction_buffer, + reduction_buffer_size)); + ORT_RETURN_IF_ERROR(reduce_square_sum( p_ds[i], p_d_norms[i], tensor_sizes[i], - reduction_buffer); + reduction_buffer, + reduction_buffer_size)); } else { std::vector ptrs(tensor_count_per_group); ptrs[0] = const_cast(p_ws[i]); // weight tensor @@ -333,7 +337,7 @@ Status launch_lamb_reduction( buckets, reducer, kernel, - reinterpret_cast(reduction_buffer), + reduction_buffer, reduction_buffer_size); } @@ -414,8 +418,7 @@ Status launch_lamb_update( LambStage2; LambStage2 lamb_stage2; - launch_multi_tensor_functor< - tensor_count_per_group, LambStage2>( + launch_multi_tensor_functor( 2048 * 32, tensor_sizes_in_bucket, buckets, @@ -544,11 +547,9 @@ Status LambOptimizer::Compute max_tensor_size = std::max(max_tensor_size, static_cast(w.Shape().Size())); } - const size_t buffer_size = [&]() { + const size_t reduction_buffer_size = [&]() { // Allocate a buffer in byte for reduction API calls. 
- size_t rbs = static_cast( - compute_reduction_buffer_size( - static_cast(sizeof(T2)), max_tensor_size)); + size_t rbs = compute_reduction_buffer_size(max_tensor_size); // Enlarge reduction buffer to accomodate multi-tensor reduction kernel as well const int tensor_group_size = 4; // w, d, w_norm, d_norm @@ -559,9 +560,8 @@ Status LambOptimizer::Compute return rbs; }(); - // Allocate reduction buffer whose size is buffer_size bytes. - IAllocatorUniquePtr reduction_buffer = GetScratchBuffer(buffer_size); - HipT2* reduction_data = reinterpret_cast(reduction_buffer.get()); + // Allocate reduction buffer whose size is reduction_buffer_size bytes. + IAllocatorUniquePtr reduction_buffer = GetScratchBuffer(reduction_buffer_size); // Input tensors' pointers. std::vector p_ws(group_count); @@ -641,7 +641,7 @@ Status LambOptimizer::Compute p_w_mixed_precision_news[group_index] = w_mixed_precision_new != nullptr ? reinterpret_cast(w_mixed_precision_new->template MutableData()) : nullptr; } - launch_lamb_compute_direction( + ORT_RETURN_IF_ERROR(launch_lamb_compute_direction( step_data ? 
*step_data : 0, group_count, loss_scale_data, @@ -653,7 +653,7 @@ Status LambOptimizer::Compute alpha_, beta_, lambda_, epsilon_, max_norm_clip_, do_bias_correction_); - launch_lamb_reduction( + ORT_RETURN_IF_ERROR(launch_lamb_reduction( *this, group_count, tensor_sizes, @@ -661,10 +661,10 @@ Status LambOptimizer::Compute p_d_norms, p_ws, p_ds, - reduction_data, - buffer_size); + reduction_buffer.get(), + reduction_buffer_size)); - launch_lamb_update( + ORT_RETURN_IF_ERROR(launch_lamb_update( group_count, eta_data, ratio_min_, @@ -676,7 +676,7 @@ Status LambOptimizer::Compute p_ds, p_w_news, p_g_news, - p_w_mixed_precision_news); + p_w_mixed_precision_news)); if (step_tensor) { Tensor* step_tensor_new = ctx->Output(0, step_tensor->Shape()); diff --git a/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cu b/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cu index d6477ebf08..b6bd48bd1d 100644 --- a/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cu +++ b/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cu @@ -100,7 +100,7 @@ void MultiTensorReduce(ChunkGroup<1> chunk_group, TOut* output) { template void MultiTensorReduceL2::operator()(ChunkGroup<1> chunk_group, TOut* output) { using TBuf = AccumulationType_t; - MultiTensorReduce(chunk_group, output); + MultiTensorReduce(chunk_group, output); } #define INSTANTIATE_MULTI_TENSOR_REDUCTION_L2_FUNCTOR(TIn, TOut) \ diff --git a/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc b/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc index 2e4d64e347..d7a5e4477f 100644 --- a/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc +++ b/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc @@ -28,8 +28,8 @@ namespace rocm { REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, MLFloat16) REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, float) -REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, double) -// 
REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, int32_t) +// REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, double) +REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, int32_t) template template diff --git a/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc b/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc index d76b62ded5..fc5e6d32e7 100644 --- a/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc +++ b/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc @@ -139,8 +139,8 @@ Status RegisterRocmTrainingKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -159,7 +159,7 @@ Status RegisterRocmTrainingKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + // BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/tools/ci_build/github/pai/pai-excluded-tests.txt b/tools/ci_build/github/pai/pai-excluded-tests.txt index b8477d363d..1aaeba32ea 100644 --- a/tools/ci_build/github/pai/pai-excluded-tests.txt +++ b/tools/ci_build/github/pai/pai-excluded-tests.txt @@ -41,30 +41,14 @@ ReductionOpTest.ReduceL2_keepdims ReductionOpTest.ReduceL2 ReductionOpTest.ReduceL2_int32 ReductionOpTest.ReduceL20DTensor -ReductionOpTest.ReduceLogSum -ReductionOpTest.ReduceLogSum_samesize -ReductionOpTest.ReduceLogSum_do_not_keepdims_2 -ReductionOpTest.ReduceLogSumAxes01 -ReductionOpTest.ReduceLogSum0DTensor -ReductionOpTest.ReduceLogSumExp_default_axes_keepdims ReductionOpTest.ReduceLogSumExp_default_axes_keepdims_double -ReductionOpTest.ReduceLogSumExp_default_axes_do_not_keep_dims ReductionOpTest.ReduceLogSumExp_default_axes_do_not_keep_dims_double 
-ReductionOpTest.ReduceLogSumExp_do_not_keepdims ReductionOpTest.ReduceLogSumExp_do_not_keepdims_double -ReductionOpTest.ReduceLogSumExp_do_not_keepdims_2 ReductionOpTest.ReduceLogSumExp_do_not_keepdims_2_double -ReductionOpTest.ReduceLogSumExp_keepdims ReductionOpTest.ReduceLogSumExp_keepdims_double -ReductionOpTest.ReduceLogSumExp ReductionOpTest.ReduceLogSumExp_double -ReductionOpTest.ReduceMax_default_axes_keepdims -ReductionOpTest.ReduceMax_default_axes_do_not_keep_dims -ReductionOpTest.ReduceMax_do_not_keepdims -ReductionOpTest.ReduceMax_do_not_keepdims_2 -ReductionOpTest.ReduceMax_keepdims -ReductionOpTest.ReduceMax ReductionOpTest.ReduceMax_int32 +ReductionOpTest.ReduceMax_int8 ReductionOpTest.ReduceMean_default_axes_keepdims ReductionOpTest.ReduceMean_default_axes_keepdims_double ReductionOpTest.ReduceMean_default_axes_do_not_keep_dims @@ -80,54 +64,10 @@ ReductionOpTest.ReduceMean_double ReductionOpTest.ReduceMean_int32 ReductionOpTest.ReduceMean0DTensor ReductionOpTest.ReduceMean0DTensor_double -ReductionOpTest.ReduceMin_default_axes_keepdims -ReductionOpTest.ReduceMin_default_axes_do_not_keep_dims -ReductionOpTest.ReduceMin_default_axes_do_not_keep_dims_2D -ReductionOpTest.ReduceMin_do_not_keepdims -ReductionOpTest.ReduceMin_do_not_keepdims_2 -ReductionOpTest.ReduceMin_keepdims -ReductionOpTest.ReduceMin ReductionOpTest.ReduceMin_int32 -ReductionOpTest.ReduceSum +ReductionOpTest.ReduceMin_int8 ReductionOpTest.ReduceSum_double -ReductionOpTest.ReduceSum_axes01 -ReductionOpTest.ReduceSum_axes02 -ReductionOpTest.ReduceSum_int32 -ReductionOpTest.ReduceSum_default_axes_keepdims -ReductionOpTest.ReduceSum_default_axes_do_not_keep_dims -ReductionOpTest.ReduceSum_do_not_keepdims -ReductionOpTest.ReduceSum_do_not_keepdims_2 -ReductionOpTest.ReduceSum_keepdims -ReductionOpTest.ReduceSumSquare ReductionOpTest.ReduceSumSquare_double -ReductionOpTest.ReduceSumSquare_default_axes_keepdims -ReductionOpTest.ReduceSumSquare_default_axes_do_not_keep_dims 
-ReductionOpTest.ReduceSumSquare_do_not_keepdims -ReductionOpTest.ReduceSumSquare_do_not_keepdims_2 -ReductionOpTest.ReduceSumSquare_keepdims -ReductionOpTest.ReduceSumSquare0DTensor -ReductionOpTest.ReduceSumTraining_default_axes_keepdims -ReductionOpTest.ReduceSumTraining_axes_not_initializer -ReductionOpTest.ReduceSumTraining_do_not_keepdims -ReductionOpTest.ReduceSumTraining_neg_axis -ReductionOpTest.ReduceProd_default_axes_keepdims -ReductionOpTest.ReduceProd_default_axes_do_not_keep_dims -ReductionOpTest.ReduceProd_do_not_keepdims -ReductionOpTest.ReduceProd_do_not_keepdims_2 -ReductionOpTest.ReduceProd_keepdims -ReductionOpTest.ReduceProd -ReductionOpTest.ReduceProd_int32 -ReductionOpTest.ArgMax -ReductionOpTest.ArgMax_Double_Type -ReductionOpTest.ArgMax_do_not_keepdims -ReductionOpTest.ArgMax_do_not_keepdims_2 -ReductionOpTest.ArgMax2D -ReductionOpTest.ArgMax2D_dim1 -ReductionOpTest.ArgMin -ReductionOpTest.ArgMin_Double_Type -ReductionOpTest.ArgMin_Double_Precision -ReductionOpTest.ArgMin_do_not_keepdims -ReductionOpTest.ArgMin_do_not_keepdims_2 ReductionOpTest.ReduceInfMax ReductionOpTest.ReduceInfMin ReductionOpTest.ReduceInfSum From 21a47ec8d985f8fd723a2baff842315f1b2d9a56 Mon Sep 17 00:00:00 2001 From: Jesse Benson Date: Fri, 15 Jan 2021 12:28:41 -0800 Subject: [PATCH 09/41] Disable a couple more unsupported tests. 
--- tools/ci_build/github/pai/pai-excluded-tests.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/ci_build/github/pai/pai-excluded-tests.txt b/tools/ci_build/github/pai/pai-excluded-tests.txt index 1aaeba32ea..98b0269869 100644 --- a/tools/ci_build/github/pai/pai-excluded-tests.txt +++ b/tools/ci_build/github/pai/pai-excluded-tests.txt @@ -5,6 +5,7 @@ OptimizerTest.AdamOptimizerMixPrecision_FP16Weight_NoClipNorm_Test OptimizerTest.AdamOptimizerMixPrecision_FP16Weight_ClipNorm_Test OptimizerTest.AdamOptimizerMixPrecisionTestFloatEta OptimizerTest.AdamOptimizerMixPrecisionTest_Gradient +OptimizerTest.LambOptimizerTestExternalBaselineDouble OptimizerTest.LambOptimizerTest5DTensorMixPrecision32_16 OptimizerTest.LambOptimizerTestSimpleBaselineMixPrecision32_16 OptimizerTest.LambOptimizerTestBaselineMixPrecision32_16 @@ -15,6 +16,7 @@ CudaKernelTest.SoftmaxCrossEntropy_TinySizeTensor CudaKernelTest.SoftmaxCrossEntropy_SmallSizeTensor CudaKernelTest.SoftmaxCrossEntropy_MediumSizeTensor CudaKernelTest.SoftmaxCrossEntropy_LargeSizeTensor +CudaKernelTest.SparseSoftmaxCrossEntropy_LargeSizeTensor CudaKernelTest.NegativeLogLikelihoodLoss_TinySizeTensor CudaKernelTest.NegativeLogLikelihoodLoss_SmallSizeTensor CudaKernelTest.NegativeLogLikelihoodLoss_MediumSizeTensor From 5d8792705b062b8386fc58470f509f9696ab0380 Mon Sep 17 00:00:00 2001 From: Jesse Benson Date: Fri, 15 Jan 2021 14:50:11 -0800 Subject: [PATCH 10/41] Code formatting. 
--- .../providers/rocm/reduction/reduction_ops.cc | 139 +++++++++--------- .../loss/softmax_cross_entropy_loss_impl.cc | 20 +-- 2 files changed, 79 insertions(+), 80 deletions(-) diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc index c16ced3e80..2cbe727ed6 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc @@ -207,10 +207,10 @@ Status ReduceKernel::ReduceKernelShared( input_data = reinterpret_cast(GetScratchBuffer(input_count).get()); fast_divmod tmp_div; Impl_Mul(static_cast(SimpleBroadcast::NoBroadcast), nullptr, - reinterpret_cast(X), nullptr, - reinterpret_cast(X), nullptr, - tmp_div, tmp_div, - input_data, input_count); + reinterpret_cast(X), nullptr, + reinterpret_cast(X), nullptr, + tmp_div, tmp_div, + input_data, input_count); } else if (log_sum_exp_) { // Reduce max -- Max/Min will output indices data MiopenReduceDescriptor reduce_max_desc; @@ -230,17 +230,17 @@ Status ReduceKernel::ReduceKernelShared( BinaryElementwisePreparation prepare; ORT_RETURN_IF_ERROR(prepare.BinaryElementwiseBroadcastPrepareHelper(input_shape, rhs_shape, input_shape)); Impl_Sub(prepare.output_rank_or_simple_broadcast, - &prepare.lhs_padded_strides, - reinterpret_cast(X), - &prepare.rhs_padded_strides, - reinterpret_cast(Y), - &prepare.fdm_output_strides, - prepare.fdm_H, prepare.fdm_C, - reinterpret_cast(exp_result), input_count); + &prepare.lhs_padded_strides, + reinterpret_cast(X), + &prepare.rhs_padded_strides, + reinterpret_cast(Y), + &prepare.fdm_output_strides, + prepare.fdm_H, prepare.fdm_C, + reinterpret_cast(exp_result), input_count); Impl_Exp(reinterpret_cast(exp_result), - reinterpret_cast(exp_result), - input_count); + reinterpret_cast(exp_result), + input_count); // ReduceSum MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( @@ -250,16 +250,16 @@ Status ReduceKernel::ReduceKernelShared( // Log(Sum) 
Impl_Log(reinterpret_cast(log_sum_result), - reinterpret_cast(log_sum_result), - output_count); + reinterpret_cast(log_sum_result), + output_count); // Log + ReduceMax fast_divmod tmp_div; Impl_Add(static_cast(SimpleBroadcast::NoBroadcast), nullptr, - reinterpret_cast(log_sum_result), nullptr, - reinterpret_cast(Y), nullptr, - tmp_div, tmp_div, - reinterpret_cast(Y), output_count); + reinterpret_cast(log_sum_result), nullptr, + reinterpret_cast(Y), nullptr, + tmp_div, tmp_div, + reinterpret_cast(Y), output_count); return Status::OK(); } @@ -302,8 +302,8 @@ Status ReduceKernel::ReduceKernelShared( if (calculate_log_) { Impl_Log(reinterpret_cast(Y), - reinterpret_cast(Y), - output_count); + reinterpret_cast(Y), + output_count); } return Status::OK(); @@ -478,12 +478,12 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr ORT_RETURN_IF_ERROR(output_tensor.Set(output_dims_miopen, miopen_type_X)); size_t workspace_bytes = 0; MIOPEN_RETURN_IF_ERROR(miopenGetReductionWorkspaceSize(rocm_ep.PerThreadMiopenHandle(), reduce_desc, - input_tensor, output_tensor, &workspace_bytes)); + input_tensor, output_tensor, &workspace_bytes)); auto workspace_rocm = rocm_ep.GetScratchBuffer(workspace_bytes); size_t indices_bytes = 0; MIOPEN_RETURN_IF_ERROR(miopenGetReductionIndicesSize(rocm_ep.PerThreadMiopenHandle(), reduce_desc, - input_tensor, output_tensor, &indices_bytes)); + input_tensor, output_tensor, &indices_bytes)); auto indices_rocm = rocm_ep.GetScratchBuffer(indices_bytes); if (ReduceTensorIndices == MIOPEN_REDUCE_TENSOR_NO_INDICES) { @@ -494,10 +494,10 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr input_data = reinterpret_cast(input_data_buffer.get()); fast_divmod tmp_div; Impl_Mul(static_cast(SimpleBroadcast::NoBroadcast), nullptr, - reinterpret_cast(input.template Data()), nullptr, - reinterpret_cast(input.template Data()), nullptr, - tmp_div, tmp_div, - input_data, input_count); + 
reinterpret_cast(input.template Data()), nullptr, + reinterpret_cast(input.template Data()), nullptr, + tmp_div, tmp_div, + input_data, input_count); } else if (log_sum_exp) { // miopenReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case // This happens when the input is Scalar @@ -515,7 +515,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr ORT_RETURN_IF_ERROR(reduce_max_desc.Set(MIOPEN_REDUCE_TENSOR_MAX, miopen_reduce_max_type, MIOPEN_REDUCE_TENSOR_NO_INDICES)); size_t indices_bytes_max = 0; MIOPEN_RETURN_IF_ERROR(miopenGetReductionIndicesSize(rocm_ep.PerThreadMiopenHandle(), reduce_max_desc, - input_tensor, output_tensor, &indices_bytes_max)); + input_tensor, output_tensor, &indices_bytes_max)); auto indices_rocm_max = rocm_ep.GetScratchBuffer(indices_bytes); MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( rocm_ep.PerThreadMiopenHandle(), reduce_max_desc, indices_rocm_max.get(), indices_bytes_max, @@ -533,17 +533,17 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr BinaryElementwisePreparation prepare; ORT_RETURN_IF_ERROR(prepare.BinaryElementwiseBroadcastPrepareHelper(input_shape, output_shape, input_shape)); Impl_Sub(prepare.output_rank_or_simple_broadcast, - &prepare.lhs_padded_strides, - reinterpret_cast(input.template Data()), - &prepare.rhs_padded_strides, - reinterpret_cast(output.template MutableData()), - &prepare.fdm_output_strides, - prepare.fdm_H, prepare.fdm_C, - reinterpret_cast(exp_result), input_count); + &prepare.lhs_padded_strides, + reinterpret_cast(input.template Data()), + &prepare.rhs_padded_strides, + reinterpret_cast(output.template MutableData()), + &prepare.fdm_output_strides, + prepare.fdm_H, prepare.fdm_C, + reinterpret_cast(exp_result), input_count); Impl_Exp(reinterpret_cast(exp_result), - reinterpret_cast(exp_result), - input_count); + reinterpret_cast(exp_result), + input_count); // miopenReduceTensor for 
ReduceSum has issue if input and output has same size, we just need to copy the data for this case // This happens when the input is Scalar. We do not need to add anything in this case. @@ -560,16 +560,16 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr // Log(Sum) Impl_Log(reinterpret_cast(log_sum_result), - reinterpret_cast(log_sum_result), - output_count); + reinterpret_cast(log_sum_result), + output_count); // Log + ReduceMax fast_divmod tmp_div; Impl_Add(static_cast(SimpleBroadcast::NoBroadcast), nullptr, - reinterpret_cast(log_sum_result), nullptr, - reinterpret_cast(output.template MutableData()), nullptr, - tmp_div, tmp_div, - reinterpret_cast(output.template MutableData()), output_count); + reinterpret_cast(log_sum_result), nullptr, + reinterpret_cast(output.template MutableData()), nullptr, + tmp_div, tmp_div, + reinterpret_cast(output.template MutableData()), output_count); return Status::OK(); } @@ -622,14 +622,13 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr if (calculate_log) { Impl_Log(reinterpret_cast(output.template MutableData()), - reinterpret_cast(output.template MutableData()), - output_count); + reinterpret_cast(output.template MutableData()), + output_count); } return Status::OK(); } - template template Status ReduceKernel::ComputeImpl(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const { @@ -750,17 +749,17 @@ Status ReduceKernel::ComputeImpl const auto zero = Consts::Zero; auto temp_Y = GetScratchBuffer(output_count); MIOPEN_RETURN_IF_ERROR(miopenReduceTensor(MiopenHandle(), - reduce_desc, - indices_rocm.get(), - indices_bytes, - workspace_rocm.get(), - workspace_bytes, - &one, - input_tensor, - temp_X.get(), - &zero, - output_tensor, - temp_Y.get())); + reduce_desc, + indices_rocm.get(), + indices_bytes, + workspace_rocm.get(), + workspace_bytes, + &one, + input_tensor, + temp_X.get(), + &zero, + output_tensor, + temp_Y.get())); Impl_Cast(temp_Y.get(), 
Y->template MutableData(), output_count); @@ -829,17 +828,17 @@ Status ReduceKernel::ComputeImpl( const auto zero = Consts::Zero; auto temp_Y = GetScratchBuffer(output_count); MIOPEN_RETURN_IF_ERROR(miopenReduceTensor(MiopenHandle(), - reduce_desc, - indices_rocm.get(), - indices_bytes, - workspace_rocm.get(), - workspace_bytes, - &one, - input_tensor, - temp_X.get(), - &zero, - output_tensor, - temp_Y.get())); + reduce_desc, + indices_rocm.get(), + indices_bytes, + workspace_rocm.get(), + workspace_bytes, + &one, + input_tensor, + temp_X.get(), + &zero, + output_tensor, + temp_Y.get())); Impl_Cast(temp_Y.get(), dst, output_count); @@ -897,12 +896,12 @@ template Tensor ReduceCompute( #define REGISTER_KERNEL_HFD(name) \ REGISTER_KERNEL_TYPED(name, MLFloat16) \ REGISTER_KERNEL_TYPED(name, float) - // REGISTER_KERNEL_TYPED(name, double) +// REGISTER_KERNEL_TYPED(name, double) #define REGISTER_KERNEL_HFD_11(name) \ REGISTER_KERNEL_TYPED_11(name, MLFloat16) \ REGISTER_KERNEL_TYPED_11(name, float) - // REGISTER_KERNEL_TYPED_11(name, double) +// REGISTER_KERNEL_TYPED_11(name, double) REGISTER_KERNEL_HFD_11(ArgMax) REGISTER_KERNEL_HFD_11(ArgMin) diff --git a/orttraining/orttraining/training_ops/rocm/loss/softmax_cross_entropy_loss_impl.cc b/orttraining/orttraining/training_ops/rocm/loss/softmax_cross_entropy_loss_impl.cc index c7681e2115..d4b62fc0f3 100644 --- a/orttraining/orttraining/training_ops/rocm/loss/softmax_cross_entropy_loss_impl.cc +++ b/orttraining/orttraining/training_ops/rocm/loss/softmax_cross_entropy_loss_impl.cc @@ -11,16 +11,16 @@ namespace onnxruntime { namespace rocm { -#define REGISTER_KERNEL_VERSIONED_TYPED_TWO_TYPES(Class, T, Tin, domain, startver, endver) \ - ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_EX( \ - Class, \ - domain, \ - startver, endver, \ - T, Tin, \ - kRocmExecutionProvider, \ - KernelDefBuilder() \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("Tin", DataTypeImpl::GetTensorType()), \ +#define 
REGISTER_KERNEL_VERSIONED_TYPED_TWO_TYPES(Class, T, Tin, domain, startver, endver) \ + ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_EX( \ + Class, \ + domain, \ + startver, endver, \ + T, Tin, \ + kRocmExecutionProvider, \ + KernelDefBuilder() \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("Tin", DataTypeImpl::GetTensorType()), \ Class); #define REGISTER_KERNEL_TYPED_TWO_TYPES(Class, T, Tin, domain, version) \ From 86ac11af1a04bd5c60e7b28e725ea66a9dadcc7c Mon Sep 17 00:00:00 2001 From: Jesse Benson Date: Tue, 19 Jan 2021 12:51:25 -0800 Subject: [PATCH 11/41] Delete ROCM-specific reduction code that is identical to CUDA reduction code. --- .../rocm/reduction/reduction_functions.cu | 23 +- .../rocm/reduction/reduction_functions.h | 107 ------- .../providers/rocm/reduction/reduction_ops.h | 6 +- .../rocm/reduction/reduction_utils.cuh | 43 --- .../loss/softmax_cross_entropy_loss_impl.cc | 272 ---------------- .../rocm/loss/softmaxcrossentropy_impl.cc | 294 ------------------ .../rocm/reduction/reduction_all.cu | 116 ------- .../rocm/reduction/reduction_ops.cc | 10 +- tools/ci_build/amd_hipify.py | 5 - 9 files changed, 13 insertions(+), 863 deletions(-) delete mode 100644 onnxruntime/core/providers/rocm/reduction/reduction_functions.h delete mode 100644 onnxruntime/core/providers/rocm/reduction/reduction_utils.cuh delete mode 100644 orttraining/orttraining/training_ops/rocm/loss/softmax_cross_entropy_loss_impl.cc delete mode 100644 orttraining/orttraining/training_ops/rocm/loss/softmaxcrossentropy_impl.cc delete mode 100644 orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cu diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_functions.cu b/onnxruntime/core/providers/rocm/reduction/reduction_functions.cu index d6e1ee4181..8089d8e5e4 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_functions.cu +++ b/onnxruntime/core/providers/rocm/reduction/reduction_functions.cu @@ -13,13 +13,6 @@ #include 
"core/providers/rocm/shared_inc/rocm_utils.h" #include "core/providers/rocm/reduction/reduction_utils.cuh" -#define NUM_ELEMENTS_PER_THREAD 4 -#define NUM_WARPS_PER_BLOCK 8 -#define MAX_NUM_BLOCKS 256 - -#define ALL_ONE_MASK 0xFFFFFFFF -#define ONE_MASK 0x00000001 - namespace onnxruntime { namespace rocm { @@ -115,7 +108,7 @@ template (shared_memory_bytes); // Thread-level indices: // Linear index of thread in block. @@ -175,11 +168,7 @@ __device__ void reduce_all( } } -#if __ROCM_ARCH__ >= 700 - __syncwarp(); -#else __syncthreads(); -#endif // Warp-level reduction (storage change: register -> register). // The values in a warp will be summed up to a scalar. After warp-level @@ -312,9 +301,8 @@ Status call_reduce_matrix_columns( } const int shared_mem_size = sizeof(TBuf) * block_dim.x * block_dim.y / GPU_WARP_SIZE; - hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_matrix_columns_kernel), - grid_dim, block_dim, shared_mem_size, 0, - num_rows, num_cols, input, output, block_reductions_buffer, block_done_counts_buffer); + hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_matrix_columns_kernel), dim3(grid_dim), dim3(block_dim), shared_mem_size, 0, + num_rows, num_cols, input, output, block_reductions_buffer, block_done_counts_buffer); return Status::OK(); } @@ -390,7 +378,7 @@ __global__ void reduce_matrix_rows_kernel(const TIn* input, TOut* output, int m, const int tid_in_block = threadIdx.x + blockDim.x * threadIdx.y; // Shape is blockDim.y-by-blockDim.x and element type is TBuf. 
- extern __shared__ unsigned char shared_memory_bytes[]; + HIP_DYNAMIC_SHARED( unsigned char, shared_memory_bytes) TBuf* shared_memory = reinterpret_cast(shared_memory_bytes); // to prevent int overflow in index calculation for input size m*n @@ -454,8 +442,7 @@ Status call_reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, boo const dim3 grid(grid_x_dim, grid_y_dim, 1); const dim3 block(block_x_dim, block_y_dim, 1); - hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_matrix_rows_kernel), - grid, block, block.y * block.x * sizeof(TBuf), 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_matrix_rows_kernel), dim3(grid), dim3(block), block.y * block.x * sizeof(TBuf), 0, input, output, m, n); return Status::OK(); diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_functions.h b/onnxruntime/core/providers/rocm/reduction/reduction_functions.h deleted file mode 100644 index 2f677c3c4b..0000000000 --- a/onnxruntime/core/providers/rocm/reduction/reduction_functions.h +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include "core/providers/rocm/rocm_common.h" -#include "core/providers/rocm/shared_inc/accumulation_type.h" - -namespace onnxruntime { -namespace rocm { - -namespace detail { -size_t compute_reduce_matrix_columns_intermediate_buffer_size( - int element_size, int num_rows, int num_cols); -} // namespace detail - -/** - * Computes the size in bytes of the intermediate buffer needed by reduce_matrix_columns(). - * @tparam TIn The input data type. - * @param m The number of matrix rows. - * @param n The number of matrix columns. - * @return The size of the intermediate buffer. 
- */ -template -size_t compute_reduce_matrix_columns_buffer_size(int m, int n) { - using TBuf = AccumulationType_t; - return detail::compute_reduce_matrix_columns_intermediate_buffer_size( - sizeof(TBuf), m, n); -} - -/** - * Computes the size in bytes of the intermediate buffer needed by the reduce_x() functions. - * @tparam TIn The input data type. - * @param size The number of elements. - * @return The size of the intermediate buffer. - */ -template -size_t compute_reduction_buffer_size(int size) { - using TBuf = AccumulationType_t; - return detail::compute_reduce_matrix_columns_intermediate_buffer_size( - sizeof(TBuf), 1, size); -} - -/** Computes the sum of the given elements. */ -template -Status reduce_sum(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); - -/** Computes the sum of the squares of the given elements. */ -template -Status reduce_square_sum(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); - -/** Computes the L2 norm of the given elements. */ -template -Status reduce_l2_norm(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); - -/** Computes the mean of the given elements. */ -template -Status reduce_mean(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); - -enum class ApplicableMatrixReduction { - // can use reduce_matrix_rows() - Rows, - // can use reduce_matrix_columns() - Columns, - // no optimized matrix reduction function applies - None, -}; - -/** - * Determines whether a cuDNN reduction can be computed by an optimized matrix reduction function. - * @param miopen_reduce_op The MIOpen reduction op type. - * @param dims The input dimensions. - * @param axes The reduction axes. - * @param[out] m If matrix reduction is possible, the number of matrix rows to use. - * @param[out] n If matrix reduction is possible, the number of matrix columns to use. - * @return The type of matrix reduction that can be done. 
- */ -ApplicableMatrixReduction get_applicable_matrix_reduction( - const miopenReduceTensorOp_t miopen_reduce_op, - const std::vector& dims, const std::vector& axes, - int& m, int& n); - -/** - * Reduces the rows in a row-major matrix to a single row containing the sum of each column. - * @param input The input data. - * @param output The output data. - * @param m The number of matrix rows. - * @param n The number of matrix columns. - * @param reset_initial_output Whether to reset (i.e., zero) the output values first. - */ -template -Status reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, bool reset_initial_output = true); - -/** - * Reduces the columns in a row-major matrix to a single column containing the sum of each row. - * @param input The input data. - * @param output The output data. - * @param m The number of matrix rows. - * @param n The number of matrix columns. - * @param buffer The intermediate buffer. - * @param buffer_size The size of the intermediate buffer in bytes. 
- */ -template -Status reduce_matrix_columns(const TIn* input, TOut* output, int m, int n, void* buffer, size_t buffer_size); - -} // namespace rocm -} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.h b/onnxruntime/core/providers/rocm/reduction/reduction_ops.h index 3cacb4367e..eb3c9d9e45 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.h +++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.h @@ -3,8 +3,8 @@ #pragma once #include "core/common/optional.h" -#include "core/providers/cpu/reduction/reduction_ops.h" #include "core/providers/rocm/rocm_kernel.h" +#include "core/providers/cpu/reduction/reduction_ops.h" #include "core/providers/rocm/reduction/reduction_functions.h" namespace onnxruntime { @@ -61,7 +61,7 @@ class ReduceKernel : public RocmKernel, public ReduceKernelBase Status ComputeImplEx(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const; - template + template Status ReduceKernelShared( const T* X, const TensorShape& input_shape, @@ -269,4 +269,4 @@ class MiopenReduceDescriptor final { }; } // namespace rocm -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_utils.cuh b/onnxruntime/core/providers/rocm/reduction/reduction_utils.cuh deleted file mode 100644 index 4ac5710d10..0000000000 --- a/onnxruntime/core/providers/rocm/reduction/reduction_utils.cuh +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#pragma once -#include "core/providers/rocm/cu_inc/common.cuh" - -namespace onnxruntime { -namespace rocm { - -__forceinline__ __host__ __device__ int least_pow2_bound(int value) { - unsigned int value_ = static_cast(value); - --value_; - value_ |= value_ >> 1; - value_ |= value_ >> 2; - value_ |= value_ >> 4; - value_ |= value_ >> 8; - value_ |= value_ >> 16; - return static_cast(++value_); -} - -struct Square { - template - __forceinline__ __device__ T operator()(const T& value) { - return value * value; - } -}; - -struct Sqrt { - template - __forceinline__ __device__ T operator()(const T& value) { - return _Sqrt(value); - } -}; - -struct Identity { - template - __forceinline__ __device__ T operator()(const T& value) { - return value; - } -}; - -} // namespace rocm -} // namespace onnxruntime diff --git a/orttraining/orttraining/training_ops/rocm/loss/softmax_cross_entropy_loss_impl.cc b/orttraining/orttraining/training_ops/rocm/loss/softmax_cross_entropy_loss_impl.cc deleted file mode 100644 index d4b62fc0f3..0000000000 --- a/orttraining/orttraining/training_ops/rocm/loss/softmax_cross_entropy_loss_impl.cc +++ /dev/null @@ -1,272 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#include "core/providers/rocm/math/softmax.h" -#include "core/providers/rocm/reduction/reduction_functions.h" -#include "core/providers/rocm/tensor/transpose.h" -#include "core/providers/cpu/controlflow/scan_utils.h" -#include "orttraining/training_ops/cpu/loss/softmax_cross_entropy_loss.h" -#include "orttraining/training_ops/rocm/loss/softmax_cross_entropy_loss_impl.h" - -namespace onnxruntime { -namespace rocm { - -#define REGISTER_KERNEL_VERSIONED_TYPED_TWO_TYPES(Class, T, Tin, domain, startver, endver) \ - ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_EX( \ - Class, \ - domain, \ - startver, endver, \ - T, Tin, \ - kRocmExecutionProvider, \ - KernelDefBuilder() \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("Tin", DataTypeImpl::GetTensorType()), \ - Class); - -#define REGISTER_KERNEL_TYPED_TWO_TYPES(Class, T, Tin, domain, version) \ - ONNX_OPERATOR_TWO_TYPED_KERNEL_EX( \ - Class, \ - domain, \ - version, \ - T, Tin, \ - kRocmExecutionProvider, \ - KernelDefBuilder() \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("Tin", DataTypeImpl::GetTensorType()), \ - Class); - -template -Status SoftmaxCrossEntropyLoss::ComputeInternal(OpKernelContext* ctx) const { - const Tensor& logit = *ctx->Input(0); - const Tensor& label = *ctx->Input(1); - const TensorShape logit_shape{logit.Shape()}; - const TensorShape label_shape{label.Shape()}; - onnxruntime::contrib::VerifyLogitWeightAndLabelShape(logit_shape, label_shape, - OpKernel::Node().InputDefs().size() == 3 ? &(*(ctx->Input(2))).Shape() : nullptr); - - // N_D = N * D1 * D2...D*K - int64_t N_D; - int64_t C; - onnxruntime::contrib::GetNDCFromLogitAndLabelShape(logit_shape, label_shape, N_D, C); - const TensorShape logit_reshape({N_D, C}); - const TensorShape label_reshape({N_D}); - Tensor* total_loss = ctx->Output(0, reduction_ == ReductionType::NONE ? 
TensorShape(label.Shape()) : TensorShape({})); - T* total_loss_data = total_loss->template MutableData(); - T* tmp_loss_sample_buffer = nullptr; - IAllocatorUniquePtr tmp_loss_sample; - if (reduction_ == ReductionType::NONE) { - tmp_loss_sample_buffer = total_loss_data; - } else { - tmp_loss_sample = GetScratchBuffer(N_D); - tmp_loss_sample_buffer = tmp_loss_sample.get(); - } - - const T* logit_data = logit.template Data(); - const Tin* label_data = label.template Data(); - - T* log_prob_data = nullptr; - Tensor* log_prob = nullptr; - IAllocatorUniquePtr log_prob_scratch_buffer; - if (ctx->OutputCount() > 1) { - log_prob = ctx->Output(1, logit_shape); - log_prob_data = log_prob->template MutableData(); - } else { - log_prob_scratch_buffer = GetScratchBuffer(logit_shape.Size()); - log_prob_data = log_prob_scratch_buffer.get(); - } - - OrtValue transpose_output; - Tensor transpose_tensor; - std::vector new_shape; - std::vector permutations; - AllocatorPtr alloc; - const OpKernelInfo& info = OpKernel::Info(); - - // Transpose logit from [N, C, D1, D2 .. 
Dk] to [N, D1, D2...Dk, C] - if (logit_shape.NumDimensions() > 2) { - ORT_RETURN_IF_ERROR(ctx->GetTempSpaceAllocator(&alloc)); - onnxruntime::contrib::GetPermutationAndShape(true, logit_shape, new_shape, permutations); - transpose_output = scan::detail::AllocateTensorInMLValue(logit.DataType(), new_shape, alloc); - ORT_RETURN_IF_ERROR(rocm::Transpose::DoTranspose(rocm::Transpose(info), permutations, logit, *transpose_output.GetMutable())); - logit_data = (*transpose_output.GetMutable()).template Data(); - } - - // calculate logsoftmax - auto status = SoftMaxComputeHelper(logit_data, - logit_reshape, - log_prob_data, - MiopenHandle(), - 1); - ORT_RETURN_IF_ERROR(status); - - const T* weight_data = nullptr; - if (OpKernel::Node().InputDefs().size() == 3) { - const Tensor& weight = *ctx->Input(2); - weight_data = weight.template Data(); - } - - IAllocatorUniquePtr weight_data_nd = GetScratchBuffer(N_D); - T* weight_data_nd_data = weight_data_nd.get(); - HIP_RETURN_IF_ERROR(hipMemsetAsync(weight_data_nd_data, 0, N_D * sizeof(T))); - ComputeWeightsSoftmaxCrossEntropyImpl(label_data, weight_data, N_D, C, ignore_index_, weight_data_nd_data); - - auto normalize_factor_data = GetScratchBuffer(1); - if (reduction_ == ReductionType::MEAN) { - // Compute buffer size in byte for reduction APIs. - const auto buffer_size = - compute_reduction_buffer_size(static_cast(N_D)); - // Allocate reduction buffer whose size is buffer_size bytes. 
- IAllocatorUniquePtr reduction_buffer = GetScratchBuffer( - buffer_size); - ORT_RETURN_IF_ERROR(reduce_sum( - weight_data_nd_data, - normalize_factor_data.get(), - static_cast(N_D), - reduction_buffer.get(), - buffer_size)); - } else { - const T normalize_factor = static_cast(1); - HIP_RETURN_IF_ERROR(hipMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), hipMemcpyHostToDevice)); - } - - SoftmaxCrossEntropyLossImpl(log_prob_data, - label_data, - weight_data_nd_data, - normalize_factor_data.get(), - N_D, - C, - ignore_index_, - tmp_loss_sample_buffer); - - // Transpose log probability from [N, D1, D2...Dk, C] to [N, C, D1, D2 .. Dk]. - if (logit_shape.NumDimensions() > 2 && log_prob != nullptr) { - TensorShape log_prob_shape = new_shape; - new_shape.clear(); - permutations.clear(); - onnxruntime::contrib::GetPermutationAndShape(false, log_prob_shape, new_shape, permutations); - auto* transposed_data = (*transpose_output.GetMutable()).template MutableData(); - transpose_output.GetMutable()->Reshape(log_prob->Shape()); - log_prob->Reshape(log_prob_shape); - ORT_RETURN_IF_ERROR(rocm::Transpose::DoTranspose(rocm::Transpose(info), permutations, *log_prob, *transpose_output.GetMutable())); - HIP_RETURN_IF_ERROR(hipMemcpyAsync(log_prob_data, transposed_data, sizeof(T) * logit_shape.Size(), hipMemcpyDeviceToDevice)); - log_prob->Reshape(new_shape); - } - - if (reduction_ != ReductionType::NONE) { - // ReduceSum on loss_per_sample - std::vector output_dims(1, 1); - ReduceKernelShared( - tmp_loss_sample_buffer, - label_reshape, - total_loss_data, - TensorShape({}), - MIOPEN_REDUCE_TENSOR_ADD, - output_dims); - } - - return Status::OK(); -} - -template -Status SoftmaxCrossEntropyLossGrad::ComputeInternal(OpKernelContext* ctx) const { - const Tensor& dY = *ctx->Input(0); - const Tensor& log_prob = *ctx->Input(1); - const Tensor& label = *ctx->Input(2); - const TensorShape probability_shape{log_prob.Shape()}; - const TensorShape label_shape{label.Shape()}; - 
onnxruntime::contrib::VerifyLogitWeightAndLabelShape(probability_shape, label_shape, - OpKernel::Node().InputDefs().size() == 4 ? &(*(ctx->Input(3))).Shape() : nullptr); - - // N_D = N * D1 * D2...D*K - int64_t N_D; - int64_t C; - onnxruntime::contrib::GetNDCFromLogitAndLabelShape(probability_shape, label_shape, N_D, C); - Tensor* d_logit = ctx->Output(0, probability_shape); - const T* dY_data = dY.template Data(); - const T* log_prob_data = log_prob.template Data(); - const Tin* label_data = label.template Data(); - T* d_logit_data = d_logit->template MutableData(); - const T* weight_data = nullptr; - OrtValue transpose_output; - std::vector new_shape; - std::vector permutations; - AllocatorPtr alloc; - const OpKernelInfo& info = OpKernel::Info(); - - // Transpose logit from [N, C, D1, D2 .. Dk] to [N, D1, D2...Dk, C] - if (probability_shape.NumDimensions() > 2) { - ORT_RETURN_IF_ERROR(ctx->GetTempSpaceAllocator(&alloc)); - onnxruntime::contrib::GetPermutationAndShape(true, probability_shape, new_shape, permutations); - transpose_output = scan::detail::AllocateTensorInMLValue(log_prob.DataType(), new_shape, alloc); - ORT_RETURN_IF_ERROR(rocm::Transpose::DoTranspose(rocm::Transpose(info), permutations, log_prob, *transpose_output.GetMutable())); - log_prob_data = (*transpose_output.GetMutable()).template Data(); - } - - if (OpKernel::Node().InputDefs().size() == 4) { - const Tensor& weight = *ctx->Input(3); - weight_data = weight.template Data(); - } - - IAllocatorUniquePtr weight_data_nd = GetScratchBuffer(N_D); - T* weight_data_nd_data = weight_data_nd.get(); - HIP_RETURN_IF_ERROR(hipMemsetAsync(weight_data_nd_data, 0, N_D * sizeof(T))); - ComputeWeightsSoftmaxCrossEntropyImpl(label_data, weight_data, N_D, C, ignore_index_, weight_data_nd_data); - auto normalize_factor_data = GetScratchBuffer(1); - if (reduction_ == ReductionType::MEAN) { - // Compute buffer size in byte for reduction APIs. 
- const auto buffer_size = - compute_reduction_buffer_size(static_cast(N_D)); - // Allocate reduction buffer whose size is buffer_size bytes. - IAllocatorUniquePtr reduction_buffer = GetScratchBuffer( - buffer_size); - ORT_RETURN_IF_ERROR(reduce_sum( - weight_data_nd_data, - normalize_factor_data.get(), - static_cast(N_D), - reduction_buffer.get(), - buffer_size)); - } else { - const T normalize_factor = static_cast(1); - HIP_RETURN_IF_ERROR(hipMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), hipMemcpyHostToDevice)); - } - - SoftmaxCrossEntropyLossGradImpl(dY_data, - log_prob_data, - label_data, - weight_data_nd_data, - normalize_factor_data.get(), - N_D, - C, - ReductionType::NONE == reduction_, - d_logit_data); - - // Transpose logit from [N, D1, D2...Dk, C] to [N, C, D1, D2 .. Dk] - if (probability_shape.NumDimensions() > 2) { - TensorShape logit_shape = new_shape; - new_shape.clear(); - permutations.clear(); - onnxruntime::contrib::GetPermutationAndShape(false, logit_shape, new_shape, permutations); - transpose_output.GetMutable()->Reshape(d_logit->Shape()); - d_logit->Reshape(logit_shape); - ORT_RETURN_IF_ERROR(rocm::Transpose::DoTranspose(rocm::Transpose(info), permutations, *d_logit, *transpose_output.GetMutable())); - auto* transposed_data = (*transpose_output.GetMutable()).template Data(); - HIP_RETURN_IF_ERROR(hipMemcpyAsync(d_logit_data, transposed_data, sizeof(T) * probability_shape.Size(), hipMemcpyDeviceToDevice)); - d_logit->Reshape(new_shape); - } - - return Status::OK(); -} - -#define SPECIALIZED_VERSIONED_COMPUTE_SPARSE(Class, T, Tin, domain, startver, endvar) \ - REGISTER_KERNEL_VERSIONED_TYPED_TWO_TYPES(Class, T, Tin, domain, startver, endvar) - -#define SPECIALIZED_COMPUTE_SPARSE(Class, T, Tin, domain, version) \ - REGISTER_KERNEL_TYPED_TWO_TYPES(Class, T, Tin, domain, version) \ - template Status Class::ComputeInternal(OpKernelContext* ctx) const; - -SPECIALIZED_VERSIONED_COMPUTE_SPARSE(SoftmaxCrossEntropyLoss, float, 
int64_t, kOnnxDomain, 12, 12) -SPECIALIZED_COMPUTE_SPARSE(SoftmaxCrossEntropyLoss, float, int64_t, kOnnxDomain, 13) -SPECIALIZED_COMPUTE_SPARSE(SoftmaxCrossEntropyLossGrad, float, int64_t, kMSDomain, 1) - -} // namespace rocm -} // namespace onnxruntime diff --git a/orttraining/orttraining/training_ops/rocm/loss/softmaxcrossentropy_impl.cc b/orttraining/orttraining/training_ops/rocm/loss/softmaxcrossentropy_impl.cc deleted file mode 100644 index 7cc017f360..0000000000 --- a/orttraining/orttraining/training_ops/rocm/loss/softmaxcrossentropy_impl.cc +++ /dev/null @@ -1,294 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/providers/rocm/reduction/reduction_functions.h" -#include "core/providers/rocm/math/softmax.h" -#include "orttraining/training_ops/rocm/loss/softmaxcrossentropy_impl.h" - -namespace onnxruntime { -namespace rocm { -#define REGISTER_KERNEL_TYPED(Class, T, domain, version) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - Class, \ - domain, \ - version, \ - T, \ - kRocmExecutionProvider, \ - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Class); - -#define REGISTER_KERNEL_TYPED_TWO_TYPES(Class, T, Tin, domain, version) \ - ONNX_OPERATOR_TWO_TYPED_KERNEL_EX( \ - Class, \ - domain, \ - version, \ - T, Tin, \ - kRocmExecutionProvider, \ - KernelDefBuilder() \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("Tin", DataTypeImpl::GetTensorType()), \ - Class); - -template -Status SoftmaxCrossEntropy::ComputeInternal(OpKernelContext* ctx) const { - const Tensor& logit = *ctx->Input(0); - const Tensor& label = *ctx->Input(1); - - const TensorShape logit_shape{logit.Shape()}; - const TensorShape label_shape{label.Shape()}; - ORT_ENFORCE(label_shape == logit_shape, "The shape in logits and labels is not identical"); - - int64_t N = logit_shape.SizeToDimension(logit_shape.NumDimensions() - 1); - int64_t D = logit_shape[logit_shape.NumDimensions() - 
1]; - const TensorShape logit_reshape({N, D}); - - Tensor* log_prob = ctx->Output(1, logit_shape); - - const T* logit_data = logit.template Data(); - const T* label_data = label.template Data(); - T* log_prob_data = log_prob->template MutableData(); - - // calculate logsoftmax - auto status = SoftMaxComputeHelper(logit_data, - logit_reshape, - log_prob_data, - MiopenHandle(), - 1 /*axis default*/); - ORT_RETURN_IF_ERROR(status); - - size_t normalize_factor = N; - if (reduction_ == ReductionType::SUM) { - normalize_factor = static_cast(1); - } - - // calculate (label * log(softmax)) for each element - IAllocatorUniquePtr temp_X = GetScratchBuffer(N * D); - SoftMaxCrossEntropyImpl( - log_prob_data, // logsoftmax result - label_data, // label - normalize_factor, // normalize_factor - temp_X.get(), // -(label * log(softmax)) - N * D); - - std::vector output_dims(2, 1); - Tensor* Y = ctx->Output(0, TensorShape({})); - // Sum((label * log(softmax)) using Reduction - return ReduceKernelShared( - temp_X.get(), - logit_reshape, - Y->template MutableData(), - TensorShape({}), - MIOPEN_REDUCE_TENSOR_ADD, - output_dims); -} - -template -Status SoftmaxCrossEntropyGrad::ComputeInternal(OpKernelContext* ctx) const { - const Tensor& dY = *ctx->Input(0); - const Tensor& log_prob = *ctx->Input(1); - const Tensor& label = *ctx->Input(2); - - const TensorShape probability_shape{log_prob.Shape()}; - const TensorShape label_shape{label.Shape()}; - ORT_ENFORCE(label_shape == probability_shape, "The shape in probability and label is not identical"); - - int64_t N = probability_shape.SizeToDimension(probability_shape.NumDimensions() - 1); - int64_t ND = probability_shape.Size(); - - Tensor* d_logits = ctx->Output(0, probability_shape); - - const T* dY_data = dY.template Data(); - const T* log_prob_data = log_prob.template Data(); - const T* label_data = label.template Data(); - - size_t normalize_factor = N; - if (reduction_ == ReductionType::SUM) { - normalize_factor = static_cast(1); - } 
- - T* d_logits_data = d_logits->template MutableData(); - - SoftMaxCrossEntropyGradImpl( - dY_data, // Dy - log_prob_data, // log(pi) - label_data, // Label - normalize_factor, // normalize_factor - d_logits_data, // gradient - ND); - - return Status::OK(); -} - -template -Status SparseSoftmaxCrossEntropy::ComputeInternal(OpKernelContext* ctx) const { - const Tensor& logit = *ctx->Input(0); - const Tensor& label = *ctx->Input(1); - - const TensorShape logit_shape{logit.Shape()}; - const TensorShape label_shape{label.Shape()}; - ORT_ENFORCE(logit_shape.NumDimensions() == label_shape.NumDimensions() + 1, - "logits_shape must be (1 + label_shape)"); - for (size_t i = 0; i < label_shape.NumDimensions(); i++) { - ORT_ENFORCE(label_shape[i] == logit_shape[i], "The shape in logits and labels does not match"); - } - - int64_t N = label_shape.Size(); - int64_t D = logit_shape[logit_shape.NumDimensions() - 1]; - const TensorShape logit_reshape({N, D}); - const TensorShape label_reshape({N}); - - IAllocatorUniquePtr tmp_loss_sample = GetScratchBuffer(N); - Tensor* total_loss = ctx->Output(0, TensorShape({})); - Tensor* log_prob = ctx->Output(1, logit_shape); - - const T* logit_data = logit.template Data(); - const Tin* label_data = label.template Data(); - T* total_loss_data = total_loss->template MutableData(); - T* log_prob_data = log_prob->template MutableData(); - - // calculate logsoftmax - auto status = SoftMaxComputeHelper(logit_data, - logit_reshape, - log_prob_data, - MiopenHandle(), - 1 /*axis default*/); - ORT_RETURN_IF_ERROR(status); - - // calculate (label * log(softmax)) for each sample - const T* weight_data = nullptr; - if (OpKernel::Node().InputDefs().size() == 3) { - const Tensor& weight = *ctx->Input(2); - const TensorShape weight_shape{weight.Shape()}; - ORT_ENFORCE(weight_shape == label_shape, "The shape in weights and labels is different"); - weight_data = weight.template Data(); - } - - auto normalize_factor_data = GetScratchBuffer(1); - if (reduction_ 
== ReductionType::SUM) { - const T normalize_factor = static_cast(1); - hipMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), hipMemcpyHostToDevice); - } else if (reduction_ == ReductionType::MEAN) { - if (weight_data == nullptr) { - const T normalize_factor = static_cast(N); - hipMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), hipMemcpyHostToDevice); - } else { - // Compute buffer size in byte for reduction APIs. - const auto buffer_size = - compute_reduction_buffer_size(static_cast(N)); - // Allocate reduction buffer whose size is buffer_size bytes. - IAllocatorUniquePtr reduction_buffer = GetScratchBuffer( - buffer_size); - ORT_RETURN_IF_ERROR(reduce_sum( - weight_data, - normalize_factor_data.get(), - static_cast(N), - reduction_buffer.get(), - buffer_size)); - } - } - - SparseSoftmaxCrossEntropyImpl(log_prob_data, - label_data, - weight_data, - normalize_factor_data.get(), - tmp_loss_sample.get(), - N, - D); - - // ReduceSum on loss_per_sample - std::vector output_dims(1, 1); - return ReduceKernelShared( - tmp_loss_sample.get(), - label_reshape, - total_loss_data, - TensorShape({}), - MIOPEN_REDUCE_TENSOR_ADD, - output_dims); -} - -template -Status SparseSoftmaxCrossEntropyGrad::ComputeInternal(OpKernelContext* ctx) const { - const Tensor& dY = *ctx->Input(0); - const Tensor& log_prob = *ctx->Input(1); - const Tensor& label = *ctx->Input(2); - - const TensorShape probability_shape{log_prob.Shape()}; - const TensorShape label_shape{label.Shape()}; - ORT_ENFORCE(probability_shape.NumDimensions() == label_shape.NumDimensions() + 1, - "probability_shape must be (1 + label_shape)"); - for (size_t i = 0; i < label_shape.NumDimensions(); i++) { - ORT_ENFORCE(label_shape[i] == probability_shape[i], "The shape in probability and labels does not match"); - } - - int64_t N = label_shape.Size(); - int64_t D = probability_shape[probability_shape.NumDimensions() - 1]; - - Tensor* d_logit = ctx->Output(0, probability_shape); - - 
const T* dY_data = dY.template Data(); - const T* log_prob_data = log_prob.template Data(); - const Tin* label_data = label.template Data(); - T* d_logit_data = d_logit->template MutableData(); - - const T* weight_data = nullptr; - if (OpKernel::Node().InputDefs().size() == 4) { - const Tensor& weight = *ctx->Input(3); - const TensorShape weight_shape{weight.Shape()}; - ORT_ENFORCE(weight_shape == label_shape, "The shape in weights and labels is different"); - weight_data = weight.template Data(); - } - - auto normalize_factor_data = GetScratchBuffer(1); - if (reduction_ == ReductionType::SUM) { - const T normalize_factor = static_cast(1); - hipMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), hipMemcpyHostToDevice); - } else if (reduction_ == ReductionType::MEAN) { - if (weight_data == nullptr) { - const T normalize_factor = static_cast(N); - hipMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), hipMemcpyHostToDevice); - } else { - // Compute buffer size in byte for reduction APIs. - const auto buffer_size = - compute_reduction_buffer_size(static_cast(N)); - // Allocate reduction buffer whose size is buffer_size bytes. 
- IAllocatorUniquePtr reduction_buffer = GetScratchBuffer( - buffer_size); - ORT_RETURN_IF_ERROR(reduce_sum( - weight_data, - normalize_factor_data.get(), - static_cast(N), - reduction_buffer.get(), - buffer_size)); - } - } - - SparseSoftmaxCrossEntropyGradImpl(dY_data, - log_prob_data, - label_data, - weight_data, - normalize_factor_data.get(), - d_logit_data, - N, - D); - - return Status::OK(); -} - -#define SPECIALIZED_COMPUTE(Class, T, domain, version) \ - REGISTER_KERNEL_TYPED(Class, T, domain, version) \ - template Status Class::ComputeInternal(OpKernelContext* ctx) const; - -SPECIALIZED_COMPUTE(SoftmaxCrossEntropy, float, kMSDomain, 1) -SPECIALIZED_COMPUTE(SoftmaxCrossEntropyGrad, float, kMSDomain, 1) - -#define SPECIALIZED_COMPUTE_SPARSE(Class, T, Tin, domain, version) \ - REGISTER_KERNEL_TYPED_TWO_TYPES(Class, T, Tin, domain, version) \ - template Status Class::ComputeInternal(OpKernelContext* ctx) const; - -// SPECIALIZED_COMPUTE_SPARSE(SparseSoftmaxCrossEntropy, float, int32_t, kOnnxDomain, 9) -SPECIALIZED_COMPUTE_SPARSE(SparseSoftmaxCrossEntropy, float, int64_t, kOnnxDomain, 9) -// SPECIALIZED_COMPUTE_SPARSE(SparseSoftmaxCrossEntropyGrad, float, int32_t, kOnnxDomain, 9) -SPECIALIZED_COMPUTE_SPARSE(SparseSoftmaxCrossEntropyGrad, float, int64_t, kOnnxDomain, 9) - -} // namespace rocm -} // namespace onnxruntime diff --git a/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cu b/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cu deleted file mode 100644 index b6bd48bd1d..0000000000 --- a/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cu +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#include -#include "orttraining/training_ops/rocm/reduction/reduction_all.h" -#include "core/providers/rocm/cu_inc/common.cuh" -#include "core/providers/rocm/rocm_common.h" -#include "core/providers/rocm/atomic/common.cuh" -#include "core/providers/rocm/reduction/reduction_utils.cuh" -#include "core/providers/rocm/shared_inc/accumulation_type.h" - -namespace onnxruntime { -namespace rocm { - -template -__global__ void ScalarSqrtKernel(Tin* input, Tout* output) { - *output = (Tout)_Sqrt(*input); -} - -template -void ScalarSqrt(Tin* input, Tout* output) { - hipLaunchKernelGGL(ScalarSqrtKernel, dim3(1), dim3(1), 0, 0, input, output); -} - -template void ScalarSqrt(float* input, float* output); -template void ScalarSqrt(half* input, half* output); -template void ScalarSqrt(float* input, half* output); - -template -__launch_bounds__(ChunkGroup<1>::thread_count_per_block) -__global__ void MultiTensorReduceKernel(ChunkGroup<1> chunk_group, TOut* output) { - const int group_index = chunk_group.block_index_to_tensor_group_index[blockIdx.x]; - const int tensor_size = chunk_group.tensor_sizes[group_index]; - const int chunk_size = chunk_group.chunk_size; - const int chunk_start = chunk_group.block_index_to_chunk_start_index[blockIdx.x]; - const TIn* w = reinterpret_cast(chunk_group.tensor_ptrs[0][group_index]) + chunk_start; - TOut* w_norm = output; - - TBuf w_sum = TBuf(0.f); - constexpr int load_count_per_thread = 4; - for (int i = threadIdx.x; i < chunk_size && i + chunk_start < tensor_size; i += blockDim.x * load_count_per_thread) { -#pragma unroll - for (int j = 0; j < load_count_per_thread; ++j) { - const int index_in_chunk = i + j * blockDim.x; - const int index_in_tensor = chunk_start + index_in_chunk; - if (index_in_chunk < chunk_size && index_in_tensor < tensor_size) { - const TBuf w_element = TBuf(w[index_in_chunk]); - w_sum += TInOp()(w_element); - } - } - } - - // Thread count in a block must be a multiple of GPU_WARP_SIZE. 
-#pragma unroll - for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) { - w_sum += WARP_SHFL_DOWN(w_sum, stride); - } - - const int warp_count_in_block = blockDim.x / GPU_WARP_SIZE; - const int lid = threadIdx.x % GPU_WARP_SIZE; - const int wid = threadIdx.x / GPU_WARP_SIZE; - - // Shape is 2 x warp_count_in_block. - extern __shared__ unsigned char shared_memory_[]; - TBuf* shared_memory = reinterpret_cast(shared_memory_); - - if (lid == 0) { - shared_memory[wid] = w_sum; - } - - __syncthreads(); - -#pragma unroll - for (int stride = warp_count_in_block / 2; stride > 0; stride /= 2) { - if (threadIdx.x < stride) { - shared_memory[threadIdx.x] += shared_memory[threadIdx.x + stride]; - } - __syncthreads(); - } - - if (threadIdx.x == 0) { - atomic_add(w_norm, TOutOp()(TOut(shared_memory[0]))); - } -} - -template -void MultiTensorReduce(ChunkGroup<1> chunk_group, TOut* output) { - // thread count per block. - constexpr int thread_count = ChunkGroup<1>::thread_count_per_block; - // shared memory's size per block. - const int shared_memory_size = thread_count / GPU_WARP_SIZE * sizeof(TBuf); - - // Enforce assumptions used inside this reduction ROCM kernel. 
- ORT_ENFORCE(thread_count % GPU_WARP_SIZE == 0); - ORT_ENFORCE((thread_count & (thread_count - 1)) == 0); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(MultiTensorReduceKernel), dim3(chunk_group.chunk_count), dim3(thread_count), shared_memory_size, 0, chunk_group, output); -} - -template -void MultiTensorReduceL2::operator()(ChunkGroup<1> chunk_group, TOut* output) { - using TBuf = AccumulationType_t; - MultiTensorReduce(chunk_group, output); -} - -#define INSTANTIATE_MULTI_TENSOR_REDUCTION_L2_FUNCTOR(TIn, TOut) \ - template void MultiTensorReduceL2::operator()(ChunkGroup<1> chunk_group, TOut* output); - -INSTANTIATE_MULTI_TENSOR_REDUCTION_L2_FUNCTOR(double, float) -INSTANTIATE_MULTI_TENSOR_REDUCTION_L2_FUNCTOR(float, float) -INSTANTIATE_MULTI_TENSOR_REDUCTION_L2_FUNCTOR(half, float) -INSTANTIATE_MULTI_TENSOR_REDUCTION_L2_FUNCTOR(float, half) -INSTANTIATE_MULTI_TENSOR_REDUCTION_L2_FUNCTOR(half, half) - -} // namespace rocm -} // namespace onnxruntime \ No newline at end of file diff --git a/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc b/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc index d7a5e4477f..2bafe92209 100644 --- a/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc +++ b/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc @@ -131,17 +131,17 @@ Status ReduceKernel::ComputeImplEx indices_miopen = GetScratchBuffer(indices_bytes); - IAllocatorUniquePtr workspace_miopen = GetScratchBuffer(workspace_bytes); + IAllocatorUniquePtr indices_rocm = GetScratchBuffer(indices_bytes); + IAllocatorUniquePtr workspace_rocm = GetScratchBuffer(workspace_bytes); const auto one = Consts::One; const auto zero = Consts::Zero; auto temp_Y = GetScratchBuffer(output_count); MIOPEN_RETURN_IF_ERROR(miopenReduceTensor(MiopenHandle(), reduce_desc, - indices_miopen.get(), + indices_rocm.get(), indices_bytes, - workspace_miopen.get(), + workspace_rocm.get(), workspace_bytes, &one, input_tensor, @@ -156,4 +156,4 @@ 
Status ReduceKernel::ComputeImplEx Date: Thu, 21 Jan 2021 11:02:34 -0800 Subject: [PATCH 12/41] Fix scratch buffer early free. --- .../core/providers/cuda/reduction/reduction_ops.cc | 10 +++++++--- .../core/providers/rocm/reduction/reduction_ops.cc | 10 +++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc index 4568c0785d..c4d6bc11c0 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc @@ -202,9 +202,11 @@ Status ReduceKernel::ReduceKernelShared( auto output_count = output_shape.Size(); if (ReduceTensorIndices == CUDNN_REDUCE_TENSOR_NO_INDICES) { + IAllocatorUniquePtr input_data_buffer(nullptr, [](T*) {}); CudaT* input_data = nullptr; if (calculate_sqt_) { - input_data = reinterpret_cast(GetScratchBuffer(input_count).get()); + input_data_buffer = GetScratchBuffer(input_count); + input_data = reinterpret_cast(input_data_buffer.get()); fast_divmod tmp_div; Impl_Mul(static_cast(SimpleBroadcast::NoBroadcast), nullptr, reinterpret_cast(X), nullptr, @@ -225,8 +227,10 @@ Status ReduceKernel::ReduceKernelShared( // Exp(X-ReduceMax) const TensorShape rhs_shape(output_dims); - auto exp_result = GetScratchBuffer(input_count).get(); - auto log_sum_result = GetScratchBuffer(output_count).get(); + auto exp_result_buffer = GetScratchBuffer(input_count); + auto exp_result = exp_result_buffer.get(); + auto log_sum_result_buffer = GetScratchBuffer(output_count); + auto log_sum_result = log_sum_result_buffer.get(); BinaryElementwisePreparation prepare; ORT_RETURN_IF_ERROR(prepare.BinaryElementwiseBroadcastPrepareHelper(input_shape, rhs_shape, input_shape)); Impl_Sub(prepare.output_rank_or_simple_broadcast, diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc index 
2cbe727ed6..43e4023d19 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc @@ -202,9 +202,11 @@ Status ReduceKernel::ReduceKernelShared( auto output_count = output_shape.Size(); if (ReduceTensorIndices == MIOPEN_REDUCE_TENSOR_NO_INDICES) { + IAllocatorUniquePtr input_data_buffer(nullptr, [](T*) {}); HipT* input_data = nullptr; if (calculate_sqt_) { - input_data = reinterpret_cast(GetScratchBuffer(input_count).get()); + input_data_buffer = GetScratchBuffer(input_count); + input_data = reinterpret_cast(input_data_buffer.get()); fast_divmod tmp_div; Impl_Mul(static_cast(SimpleBroadcast::NoBroadcast), nullptr, reinterpret_cast(X), nullptr, @@ -225,8 +227,10 @@ Status ReduceKernel::ReduceKernelShared( // Exp(X-ReduceMax) const TensorShape rhs_shape(output_dims); - auto exp_result = GetScratchBuffer(input_count).get(); - auto log_sum_result = GetScratchBuffer(output_count).get(); + auto exp_result_buffer = GetScratchBuffer(input_count); + auto exp_result = exp_result_buffer.get(); + auto log_sum_result_buffer = GetScratchBuffer(output_count); + auto log_sum_result = log_sum_result_buffer.get(); BinaryElementwisePreparation prepare; ORT_RETURN_IF_ERROR(prepare.BinaryElementwiseBroadcastPrepareHelper(input_shape, rhs_shape, input_shape)); Impl_Sub(prepare.output_rank_or_simple_broadcast, From a9e4d70b508becf51430bde292fc67533cd3e848 Mon Sep 17 00:00:00 2001 From: Jesse Benson Date: Tue, 26 Jan 2021 09:02:34 -0800 Subject: [PATCH 13/41] Fix merge conflict. 
--- orttraining/orttraining/training_ops/rocm/optimizer/lamb.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/orttraining/orttraining/training_ops/rocm/optimizer/lamb.cc b/orttraining/orttraining/training_ops/rocm/optimizer/lamb.cc index e7ecdfcde0..f45b5b1a85 100644 --- a/orttraining/orttraining/training_ops/rocm/optimizer/lamb.cc +++ b/orttraining/orttraining/training_ops/rocm/optimizer/lamb.cc @@ -651,7 +651,7 @@ Status LambOptimizer::Compute p_ds, p_m1_news, p_m2_news, alpha_, beta_, lambda_, epsilon_, max_norm_clip_, - do_bias_correction_); + do_bias_correction_)); ORT_RETURN_IF_ERROR(launch_lamb_reduction( *this, From 3c441849635cec342da5906720742a6c9632a41a Mon Sep 17 00:00:00 2001 From: Jesse Benson Date: Wed, 3 Feb 2021 11:31:48 -0800 Subject: [PATCH 14/41] Pick up changes from: https://github.com/microsoft/onnxruntime/pull/6490 --- .../providers/rocm/reduction/reduction_ops.cc | 38 +++++++++++-------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc index 43e4023d19..5f177e1ecc 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc @@ -604,24 +604,30 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr } } } else { // For ArgMax & ArgMin ops, use the indicies as the output with int64 type - if (temp_X) { - auto temp_output = rocm_ep.GetScratchBuffer(output_count); - MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( - rocm_ep.PerThreadMiopenHandle(), reduce_desc, indices_rocm.get(), indices_bytes, - workspace_rocm.get(), workspace_bytes, - &one, input_tensor, temp_X.get(), - &zero, output_tensor, temp_output.get())); + // miopenReduceTensor has issue if input and output has same size, which will happen if the axis to be reduced has dim value of 1. 
+ // the output is zeros of the output size + if (input_count == output_count) { + HIP_RETURN_IF_ERROR(hipMemsetAsync(output.template MutableData(), static_cast(0), output_count * sizeof(int64_t))); } else { - auto temp_output = rocm_ep.GetScratchBuffer(output_count); - MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( - rocm_ep.PerThreadMiopenHandle(), reduce_desc, indices_rocm.get(), indices_bytes, - workspace_rocm.get(), workspace_bytes, - &one, input_tensor, reinterpret_cast(input.template Data()), - &zero, output_tensor, temp_output.get())); - } + if (temp_X) { + auto temp_output = rocm_ep.GetScratchBuffer(output_count); + MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( + rocm_ep.PerThreadMiopenHandle(), reduce_desc, indices_rocm.get(), indices_bytes, + workspace_rocm.get(), workspace_bytes, + &one, input_tensor, temp_X.get(), + &zero, output_tensor, temp_output.get())); + } else { + auto temp_output = rocm_ep.GetScratchBuffer(output_count); + MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( + rocm_ep.PerThreadMiopenHandle(), reduce_desc, indices_rocm.get(), indices_bytes, + workspace_rocm.get(), workspace_bytes, + &one, input_tensor, reinterpret_cast(input.template Data()), + &zero, output_tensor, temp_output.get())); + } - // MIOpen reduction index is uint32_t for now, cast it to int64_t according to ONNX spec - Impl_Cast(reinterpret_cast(indices_rocm.get()), output.template MutableData(), output_count); + // MIOpen reduction index is uint32_t for now, cast it to int64_t according to ONNX spec + Impl_Cast(reinterpret_cast(indices_rocm.get()), output.template MutableData(), output_count); + } } if (calculate_log) { From d914e29fe1f165ce7055ae402306f72473a5fd59 Mon Sep 17 00:00:00 2001 From: Jesse Benson Date: Thu, 4 Feb 2021 12:06:48 -0800 Subject: [PATCH 15/41] Reuse reduction_functions.cu --- .../rocm/reduction/reduction_functions.cu | 479 ------------------ tools/ci_build/amd_hipify.py | 1 - 2 files changed, 480 deletions(-) delete mode 100644 
onnxruntime/core/providers/rocm/reduction/reduction_functions.cu diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_functions.cu b/onnxruntime/core/providers/rocm/reduction/reduction_functions.cu deleted file mode 100644 index 8089d8e5e4..0000000000 --- a/onnxruntime/core/providers/rocm/reduction/reduction_functions.cu +++ /dev/null @@ -1,479 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/providers/rocm/reduction/reduction_functions.h" - -#include - -#include -#include -#include "core/common/common.h" -#include "core/providers/rocm/atomic/common.cuh" -#include "core/providers/rocm/cu_inc/common.cuh" -#include "core/providers/rocm/shared_inc/rocm_utils.h" -#include "core/providers/rocm/reduction/reduction_utils.cuh" - -namespace onnxruntime { -namespace rocm { - -namespace detail { -constexpr auto MAX_NUM_ELEMENTS_PER_THREAD = 4; -constexpr auto MAX_NUM_WARPS_PER_BLOCK = 8; -constexpr auto MAX_NUM_BLOCKS_IN_GRID_ROW = 256; -constexpr auto MAX_NUM_GRID_ROWS = 32768; - -dim3 compute_block_dim(int num_cols) { - const int x = GPU_WARP_SIZE; - const int y = std::min(MAX_NUM_WARPS_PER_BLOCK, std::max(1, num_cols / (MAX_NUM_ELEMENTS_PER_THREAD * x))); - return dim3(x, y); -} - -std::pair compute_grid_and_block_dims(int num_rows, int num_cols) { - const auto block_dim = compute_block_dim(num_cols); - const auto grid_x = - std::min( - MAX_NUM_BLOCKS_IN_GRID_ROW, - std::max(1, num_cols / (MAX_NUM_ELEMENTS_PER_THREAD * block_dim.x * block_dim.y))); - const auto grid_y = std::min(MAX_NUM_GRID_ROWS, num_rows); - const dim3 grid_dim(grid_x, grid_y); - return {grid_dim, block_dim}; -} - -uintptr_t round_up_to_aligned(uintptr_t original, size_t alignment) { - assert((alignment & (alignment - 1)) == 0); - const size_t alignment_mask = ~(alignment - 1); - return (original + alignment - 1) & alignment_mask; -} - -/** - * call_reduce_matrix_columns() intermediate buffer layout - * - * Given 
buffer element type TBuf, the intermediate buffer layout looks like this: - * - * ----- - * m * num_blocks_per_row * sizeof(TBuf) bytes for block reductions per row - * alignment padding bytes as needed - * m * sizeof(int) bytes for block done counts per row - * ----- - */ - -size_t compute_reduce_matrix_columns_intermediate_buffer_size( - int element_size, int num_rows, int num_cols) { - ORT_ENFORCE(element_size >= 0 && num_rows >= 0 && num_cols >= 0); - - const auto grid_dim = compute_grid_and_block_dims(num_rows, num_cols).first; - - size_t buffer_size{}; - - // at the beginning, for sizing purposes, assume we are aligned - buffer_size += static_cast(num_rows) * grid_dim.x * element_size; - - buffer_size = round_up_to_aligned(buffer_size, alignof(int)); - buffer_size += static_cast(num_rows) * sizeof(int); - - // add padding to give us room to align - buffer_size += alignof(max_align_t) - 1; - - return buffer_size; -} - -template -Status get_reduction_buffers( - int num_rows, int num_cols, void* buffer, size_t buffer_size, - TBuf*& block_reductions_buffer, int*& block_done_counts_buffer) { - const auto grid_dim = compute_grid_and_block_dims(num_rows, num_cols).first; - - const uintptr_t begin_addr = reinterpret_cast(buffer); - const uintptr_t block_reductions_addr = - round_up_to_aligned(begin_addr, alignof(TBuf)); - const uintptr_t block_done_counts_buffer_addr = - round_up_to_aligned( - block_reductions_addr + static_cast(num_rows) * grid_dim.x * sizeof(TBuf), alignof(int)); - const uintptr_t end_addr = - block_done_counts_buffer_addr + static_cast(num_rows) * sizeof(int); - const size_t required_size = end_addr - begin_addr; - - ORT_RETURN_IF_NOT( - required_size <= buffer_size, - "Buffer size is too small (", buffer_size, " bytes). 
", - "At least ", required_size, " bytes are needed from the given base address (", buffer, ")."); - - block_reductions_buffer = reinterpret_cast(block_reductions_addr); - block_done_counts_buffer = reinterpret_cast(block_done_counts_buffer_addr); - - return Status::OK(); -} - -template -__device__ void reduce_all( - const int num_elements, const TIn* const input, TOut* const output, - TBuf* const block_reductions_buffer, int* const block_done_count_buffer) { - HIP_DYNAMIC_SHARED( unsigned char, shared_memory_bytes) - TBuf* shared_memory = reinterpret_cast(shared_memory_bytes); - // Thread-level indices: - // Linear index of thread in block. - const int tid_in_block = threadIdx.y * blockDim.x + threadIdx.x; - // Total number of threads in a 2-D block. - const int num_threads_in_block = blockDim.x * blockDim.y; - - // Warp-level indices: - // Warp index of thread. - const int wid_in_block = tid_in_block / GPU_WARP_SIZE; - // Lane index of thread. - const int lid_in_block = tid_in_block % GPU_WARP_SIZE; - // Warp count per block. - const int num_warps_in_block = num_threads_in_block / GPU_WARP_SIZE; - - // Grid-level indices: - // Linear index of block in grid row. - const int bid_in_grid_row = blockIdx.x; - // Linear index of thread in grid row. - const int tid_in_grid_row = bid_in_grid_row * (blockDim.x * blockDim.y) + tid_in_block; - // Total number of blocks in a grid row. - const int num_blocks_in_grid_row = gridDim.x; - // Total number of threads in a grid row with 2-D blocks. - const int num_threads_in_grid_row = num_blocks_in_grid_row * num_threads_in_block; - - const auto write_result = [&output, &num_elements](const TOut result) { - // Compilation time if-else branch controlled by template argument can be - // optimized out, so there will be no branch in real computation phase. 
- if (DivideResultBySize) { - output[0] = TFinalOp()(result / TOut(num_elements)); - } else { - output[0] = TFinalOp()(result); - } - }; - - // Thread-level reduction (storage change: global memory -> register). - // One thread reduces MAX_NUM_ELEMENTS_PER_THREAD elements to a thread register - // in one iteration. - TBuf value = 0; - for (int id = tid_in_grid_row; id < num_elements; id += MAX_NUM_ELEMENTS_PER_THREAD * num_threads_in_grid_row) { - TIn v[MAX_NUM_ELEMENTS_PER_THREAD]; - -#pragma unroll - for (int i = 0; i < MAX_NUM_ELEMENTS_PER_THREAD; i++) { - const int offset = id + i * num_threads_in_grid_row; - if (offset < num_elements) { - v[i] = input[offset]; - } - } - -#pragma unroll - for (int i = 0; i < MAX_NUM_ELEMENTS_PER_THREAD; i++) { - const int offset = id + i * num_threads_in_grid_row; - if (offset < num_elements) { - value += TOp()(TBuf(v[i])); - } - } - } - - __syncthreads(); - - // Warp-level reduction (storage change: register -> register). - // The values in a warp will be summed up to a scalar. After warp-level - // reduction, each block holds num_warps_in_block values in the shared memory. -#pragma unroll - for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) { - value += WARP_SHFL_DOWN(value, stride); - } - - // Return early if only one warp is used for reduction. - // Given a fixed amount of threads, we prefer threads over warps over blocks so that we never have cases such as - // 1. two blocks and each of them has only 1 warp (32 threads). - // 2. two warps and each of them has only 2 threads. - if (num_warps_in_block == 1) { - if (tid_in_grid_row == 0) { - write_result(value); - } - return; - } - - if (lid_in_block == 0) { - shared_memory[wid_in_block] = value; - } - - __syncthreads(); - - // Block-level reduction (storage change: shared memory -> global memory). - // The values in a block will be summed up to a scalar. - // Note that the values are stored in the shared memory. 
- // Here we assume that the size of shared_memory is smaller - // than num_warps_in_block, so we just keep halving the number - // of threads in each iteration. Our assumption is always true because - // the size of shared_memory equals to the number of warps. -#pragma unroll - for (int stride = MAX_NUM_WARPS_PER_BLOCK / 2; stride > 0; stride /= 2) { - if (tid_in_block + stride < num_warps_in_block) { - shared_memory[tid_in_block] += shared_memory[tid_in_block + stride]; - } - __syncthreads(); - } - - // Return early if only one block is used for reduction. - if (num_blocks_in_grid_row == 1) { - if (tid_in_grid_row == 0) { - write_result(shared_memory[0]); - } - return; - } - - if (tid_in_block == 0) { - block_reductions_buffer[bid_in_grid_row] = shared_memory[0]; - } - - __threadfence(); - __syncthreads(); - - // Grid-level reduction. We use the last block to sum up values - // stored in the global block_reductions_buffer. - __shared__ bool is_last_block_done; - - if (tid_in_block == 0) { - const int count = atomicAdd(block_done_count_buffer, 1); - is_last_block_done = (count == (num_blocks_in_grid_row - 1)); - } - - // All threads in each block see if they belong the last active block - // (i.e., the value of is_last_block_done). - __syncthreads(); - - // Only the block which saw that count equals to num_blocks_in_grid_row - 1 can - // enter the following block. - if (is_last_block_done) { - const int pow2_bound = least_pow2_bound(num_blocks_in_grid_row); - for (int stride = pow2_bound / 2; stride > 0; stride /= 2) { - if (tid_in_block < stride && tid_in_block + stride < num_blocks_in_grid_row) { - block_reductions_buffer[tid_in_block] += block_reductions_buffer[tid_in_block + stride]; - } - __syncthreads(); - } - - // The first thread in the last block assigns the final output. 
- if (tid_in_block == 0) { - write_result(block_reductions_buffer[0]); - } - } -} - -template -__global__ void reduce_matrix_columns_kernel( - const int num_rows, const int num_cols, const TIn* const input, TOut* const output, - TBuf* const block_reductions_buffer, int* const block_done_counts_buffer) { - const int num_blocks_in_grid_row = gridDim.x; - const int row_id_in_grid = blockIdx.y; - const int num_grid_rows = gridDim.y; - - // one row per iteration - // row_id is int64_t to avoid int overflow in offset calculations - for (int64_t row_id = row_id_in_grid; row_id < num_rows; row_id += num_grid_rows) { - const TIn* const row_data = input + row_id * num_cols; - TOut* const row_output = output + row_id; - TBuf* const row_block_reductions_buffer = block_reductions_buffer + row_id * num_blocks_in_grid_row; - int* const row_block_done_counts_buffer = block_done_counts_buffer + row_id; - - reduce_all( - num_cols, row_data, row_output, - row_block_reductions_buffer, row_block_done_counts_buffer); - } -} - -template -Status call_reduce_matrix_columns( - const TIn* input, TOut* output, const int num_rows, const int num_cols, void* buffer, size_t buffer_size) { - ORT_ENFORCE(num_rows >= 0 && num_cols >= 0); - - using TBuf = AccumulationType_t; - - const auto grid_and_block_dims = compute_grid_and_block_dims(num_rows, num_cols); - const dim3& grid_dim = grid_and_block_dims.first; - const dim3& block_dim = grid_and_block_dims.second; - - TBuf* block_reductions_buffer; - int* block_done_counts_buffer; - ORT_RETURN_IF_ERROR(get_reduction_buffers( - num_rows, num_cols, buffer, buffer_size, - block_reductions_buffer, block_done_counts_buffer)); - - // If more than one block is used per grid row, then inter-block reduction is needed. 
- if (grid_dim.x > 1) { - HIP_RETURN_IF_ERROR(hipMemsetAsync(block_done_counts_buffer, 0, num_rows * sizeof(int))); - } - - const int shared_mem_size = sizeof(TBuf) * block_dim.x * block_dim.y / GPU_WARP_SIZE; - hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_matrix_columns_kernel), dim3(grid_dim), dim3(block_dim), shared_mem_size, 0, - num_rows, num_cols, input, output, block_reductions_buffer, block_done_counts_buffer); - - return Status::OK(); -} -} // namespace detail - -template -Status reduce_sum( - const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { - return detail::call_reduce_matrix_columns( - input, output, 1, size, buffer, buffer_size); -} - -template -Status reduce_square_sum( - const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { - return detail::call_reduce_matrix_columns( - input, output, 1, size, buffer, buffer_size); -} - -template -Status reduce_l2_norm( - const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { - return detail::call_reduce_matrix_columns( - input, output, 1, size, buffer, buffer_size); -} - -template -Status reduce_mean( - const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { - return detail::call_reduce_matrix_columns( - input, output, 1, size, buffer, buffer_size); -} - -#define INSTANTIATE_REDUCE_SUM(TIn, TOut) \ - template Status reduce_sum(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) -INSTANTIATE_REDUCE_SUM(half, float); -INSTANTIATE_REDUCE_SUM(float, float); -INSTANTIATE_REDUCE_SUM(double, double); -#undef INSTANTIATE_REDUCE_SUM - -#define INSTANTIATE_REDUCE_SQUARE_SUM(TIn, TOut) \ - template Status reduce_square_sum(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) -INSTANTIATE_REDUCE_SQUARE_SUM(half, float); -INSTANTIATE_REDUCE_SQUARE_SUM(float, float); -INSTANTIATE_REDUCE_SQUARE_SUM(double, double); -#undef INSTANTIATE_REDUCE_SQUARE_SUM - -#define 
INSTANTIATE_REDUCE_L2_NORM(TIn, TOut) \ - template Status reduce_l2_norm(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) -INSTANTIATE_REDUCE_L2_NORM(half, float); -INSTANTIATE_REDUCE_L2_NORM(float, float); -INSTANTIATE_REDUCE_L2_NORM(double, double); -#undef INSTANTIATE_REDUCE_L2_NORM - -#define INSTANTIATE_REDUCE_MEAN(TIn, TOut) \ - template Status reduce_mean(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) -INSTANTIATE_REDUCE_MEAN(half, float); -INSTANTIATE_REDUCE_MEAN(float, float); -INSTANTIATE_REDUCE_MEAN(double, double); -#undef INSTANTIATE_REDUCE_MEAN - -namespace detail { -template -__global__ void reduce_matrix_rows_kernel(const TIn* input, TOut* output, int m, int n) { - constexpr int x_load_count_per_thread = 1; - constexpr int y_load_count_per_thread = 4; - const int t_count_x_in_grid = blockDim.x * gridDim.x; - const int t_count_y_in_grid = blockDim.y * gridDim.y; - const int x_grid_stride = t_count_x_in_grid * x_load_count_per_thread; - const int y_grid_stride = t_count_y_in_grid * y_load_count_per_thread; - const int tid_x_in_grid = threadIdx.x + blockDim.x * blockIdx.x; - const int tid_y_in_grid = threadIdx.y + blockDim.y * blockIdx.y; - const int tid_in_block = threadIdx.x + blockDim.x * threadIdx.y; - - // Shape is blockDim.y-by-blockDim.x and element type is TBuf. - HIP_DYNAMIC_SHARED( unsigned char, shared_memory_bytes) - TBuf* shared_memory = reinterpret_cast(shared_memory_bytes); - - // to prevent int overflow in index calculation for input size m*n - const int64_t n_int64 = static_cast(n); - - for (int col = tid_x_in_grid; col < n; col += x_grid_stride) { - shared_memory[tid_in_block] = TBuf(0.0f); - TBuf sum = TBuf(0.0f); - // This loops load multiple blockDim.y-by-blockDim.x sub-tensors from the input. - for (int row = tid_y_in_grid; row < m; row += y_grid_stride) { - // Thread-level reduction. Each thread loads y_load_count_per_thread values - // and aggregrate them. 
-#pragma unroll(y_load_count_per_thread) - for (int row_inner = 0; row_inner < y_load_count_per_thread; ++row_inner) { - int row_final = row + row_inner * t_count_y_in_grid; - int col_final = col; - if (row_final < m && col_final < n) { - sum += TBuf(input[row_final * n_int64 + col_final]); - } - } - } - // Write thread-level reduction result into shared memory. - shared_memory[tid_in_block] = sum; - - // Wait all threads to finish their thread-level reductions. - __syncthreads(); - -// This loop conducts reduction on elements stored in shared memory. -// Each block reduces blockDim.y-by-blockDim.x tensor to 1-by-blockDim.x tensor. -#pragma unroll(4) - for (int stride = blockDim.y / 2; stride > 0; stride /= 2) { - if (threadIdx.y < stride) { - shared_memory[tid_in_block] += shared_memory[tid_in_block + stride * blockDim.x]; - } - __syncthreads(); - } - - if (threadIdx.y == 0) { - atomic_add(output + col, TOut(shared_memory[threadIdx.x])); - } - } -} - -template -Status call_reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, bool reset_initial_output) { - ORT_ENFORCE(m >= 0 && n >= 0); - - if (reset_initial_output) { - HIP_RETURN_IF_ERROR(hipMemsetAsync(output, 0, n * sizeof(TOut))); - } - - constexpr int max_num_threads_in_block = 512; - constexpr int max_num_blocks_in_grid = 512; - constexpr int load_count_per_thread = 4; - - const int block_x_dim = least_pow2_bound(std::max(1, std::min(n, GPU_WARP_SIZE))); - const int block_y_dim = least_pow2_bound(std::max(1, std::min(max_num_threads_in_block / block_x_dim, m / load_count_per_thread))); - const int grid_x_dim = std::max(1, std::min(n / block_x_dim, max_num_blocks_in_grid)); - const int grid_y_dim = std::max(1, std::min(max_num_blocks_in_grid / grid_x_dim, m / block_y_dim / 4)); - - const dim3 grid(grid_x_dim, grid_y_dim, 1); - const dim3 block(block_x_dim, block_y_dim, 1); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_matrix_rows_kernel), dim3(grid), dim3(block), block.y * block.x * sizeof(TBuf), 
0, - input, output, m, n); - - return Status::OK(); -} -} // namespace detail - -template -Status reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, bool reset_initial_output) { - using TBuf = AccumulationType_t; - return detail::call_reduce_matrix_rows(input, output, m, n, reset_initial_output); -} - -#define INSTANTIATE_REDUCE_MATRIX_ROWS(T) \ - template Status reduce_matrix_rows(const T* input, T* output, int m, int n, bool reset_initial_output) -INSTANTIATE_REDUCE_MATRIX_ROWS(half); -INSTANTIATE_REDUCE_MATRIX_ROWS(float); -INSTANTIATE_REDUCE_MATRIX_ROWS(double); -#undef INSTANTIATE_REDUCE_MATRIX_ROWS - -template -Status reduce_matrix_columns(const TIn* input, TOut* output, int m, int n, void* buffer, size_t buffer_size) { - return detail::call_reduce_matrix_columns( - input, output, m, n, buffer, buffer_size); -} - -#define INSTANTIATE_REDUCE_MATRIX_COLUMNS(T) \ - template Status reduce_matrix_columns(const T* input, T* output, int m, int n, void* buffer, size_t buffer_size) -INSTANTIATE_REDUCE_MATRIX_COLUMNS(half); -INSTANTIATE_REDUCE_MATRIX_COLUMNS(float); -INSTANTIATE_REDUCE_MATRIX_COLUMNS(double); -#undef INSTANTIATE_REDUCE_MATRIX_COLUMNS - -} // namespace rocm -} // namespace onnxruntime diff --git a/tools/ci_build/amd_hipify.py b/tools/ci_build/amd_hipify.py index 7c2d12c859..92b422142f 100644 --- a/tools/ci_build/amd_hipify.py +++ b/tools/ci_build/amd_hipify.py @@ -131,7 +131,6 @@ provider_excluded_files = [ 'object_detection/roialign.h', 'object_detection/roialign_impl.cu', 'object_detection/roialign_impl.h', - 'reduction/reduction_functions.cu', 'reduction/reduction_ops.cc', 'reduction/reduction_ops.h', 'rnn/cudnn_rnn_base.cc', From 4e61e254ecf3c6de5388efb3678208c127d3104e Mon Sep 17 00:00:00 2001 From: Prasanth Pulavarthi Date: Thu, 4 Feb 2021 15:28:39 -0800 Subject: [PATCH 16/41] Update link in readme (#6537) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 
b03aafe95e..83e18e4449 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ [![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-ci-pipeline?label=Linux+CPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=86) [![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-gpu-ci-pipeline?label=Linux+GPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=84) -**ONNX Runtime** is a cross-platform **inferencing and training accelerator** compatible with many popular ML/DNN frameworks, including PyTorch, TensorFlow/Keras, scikit-learn, and more. **[aka.ms/onnxruntime](https://aka.ms/onnxruntime)** +**ONNX Runtime** is a cross-platform **inferencing and training accelerator** compatible with many popular ML/DNN frameworks, including PyTorch, TensorFlow/Keras, scikit-learn, and more. **[onnxruntime.ai](https://onnxruntime.ai)** Many users can benefit from ONNX Runtime, including those looking to: From 615acf156c87f809e3aac635fbde56af12de2e90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= Date: Fri, 5 Feb 2021 01:10:11 +0100 Subject: [PATCH 17/41] remove keras example from python documentation (#6574) --- docs/python/examples/plot_dl_keras.py | 94 --------------------------- docs/python/requirements.txt | 2 - tools/doc/rename_folders.py | 2 + 3 files changed, 2 insertions(+), 96 deletions(-) delete mode 100644 docs/python/examples/plot_dl_keras.py diff --git a/docs/python/examples/plot_dl_keras.py b/docs/python/examples/plot_dl_keras.py deleted file mode 100644 index 949ee895e5..0000000000 --- a/docs/python/examples/plot_dl_keras.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. - -""" - -.. 
_l-example-backend-api-tensorflow: - -ONNX Runtime for Keras -====================== - -The following demonstrates how to compute the predictions -of a pretrained deep learning model obtained from -`keras `_ -with *onnxruntime*. The conversion requires -`keras `_, -`tensorflow `_, -`keras-onnx `_, -`onnxmltools `_ -but then only *onnxruntime* is required -to compute the predictions. -""" -import os -if not os.path.exists('dense121.onnx'): - from keras.applications.densenet import DenseNet121 - model = DenseNet121(include_top=True, weights='imagenet') - - from keras2onnx import convert_keras - onx = convert_keras(model, 'dense121.onnx') - with open("dense121.onnx", "wb") as f: - f.write(onx.SerializeToString()) - -################################## -# Let's load an image (source: wikipedia). - -from keras.preprocessing.image import array_to_img, img_to_array, load_img -img = load_img('Sannosawa1.jpg') -ximg = img_to_array(img) - -import matplotlib.pyplot as plt -plt.imshow(ximg / 255) -plt.axis('off') - -############################################# -# Let's load the model with onnxruntime. -import onnxruntime as rt -from onnxruntime.capi.onnxruntime_pybind11_state import InvalidGraph - -try: - sess = rt.InferenceSession('dense121.onnx') - ok = True -except (InvalidGraph, TypeError, RuntimeError) as e: - # Probably a mismatch between onnxruntime and onnx version. - print(e) - ok = False - -if ok: - print("The model expects input shape:", sess.get_inputs()[0].shape) - print("image shape:", ximg.shape) - -####################################### -# Let's resize the image. - -if ok: - from skimage.transform import resize - import numpy - - ximg224 = resize(ximg / 255, (224, 224, 3), anti_aliasing=True) - ximg = ximg224[numpy.newaxis, :, :, :] - ximg = ximg.astype(numpy.float32) - - print("new shape:", ximg.shape) - -################################## -# Let's compute the output. 
- -if ok: - input_name = sess.get_inputs()[0].name - res = sess.run(None, {input_name: ximg}) - prob = res[0] - print(prob.ravel()[:10]) # Too big to be displayed. - - -################################## -# Let's get more comprehensive results. - -if ok: - from keras.applications.densenet import decode_predictions - decoded = decode_predictions(prob) - - import pandas - df = pandas.DataFrame(decoded[0], columns=["class_id", "name", "P"]) - print(df) - - diff --git a/docs/python/requirements.txt b/docs/python/requirements.txt index da66f486e8..8bfb4e15bd 100644 --- a/docs/python/requirements.txt +++ b/docs/python/requirements.txt @@ -1,5 +1,3 @@ -keras -keras-onnx sphinx sphinx-gallery pyquickhelper diff --git a/tools/doc/rename_folders.py b/tools/doc/rename_folders.py index bb3fd854cd..d19fb86482 100644 --- a/tools/doc/rename_folders.py +++ b/tools/doc/rename_folders.py @@ -23,6 +23,8 @@ def rename_folder(root): renamed.append((r, name, into)) full_src = os.path.join(r, name) full_into = os.path.join(r, into) + if os.path.exists(full_into): + raise RuntimeError("%r already exists, previous documentation should be removed.") print("rename %r" % full_src) os.rename(full_src, full_into) From 89627a81787c514ca9af4022857c330327408b94 Mon Sep 17 00:00:00 2001 From: Yulong Wang Date: Thu, 4 Feb 2021 17:07:06 -0800 Subject: [PATCH 18/41] [Node.js binding] support NPM v7+ (#6559) --- nodejs/package-lock.json | 7108 +++++++++++++++++++++++++++++++++----- nodejs/package.json | 40 +- 2 files changed, 6333 insertions(+), 815 deletions(-) diff --git a/nodejs/package-lock.json b/nodejs/package-lock.json index 6a41c2223c..e2a6a26e21 100644 --- a/nodejs/package-lock.json +++ b/nodejs/package-lock.json @@ -1,8 +1,5444 @@ { "name": "onnxruntime", "version": "1.6.0", - "lockfileVersion": 1, + "lockfileVersion": 2, "requires": true, + "packages": { + "": { + "name": "onnxruntime", + "version": "1.6.0", + "hasInstallScript": true, + "license": "MIT", + "os": [ + "win32", + "darwin", + 
"linux" + ], + "dependencies": { + "prebuild-install": "^6.0.0" + }, + "devDependencies": { + "@types/fs-extra": "^9.0.6", + "@types/klaw-sync": "^6.0.0", + "@types/minimist": "1.2.1", + "@types/mocha": "^8.2.0", + "@types/tar-stream": "^2.2.0", + "@typescript-eslint/eslint-plugin": "^4.14.2", + "@typescript-eslint/parser": "^4.14.2", + "clang-format": "^1.5.0", + "cmake-js": "^6.1.0", + "eslint": "^7.19.0", + "eslint-plugin-import": "^2.22.1", + "eslint-plugin-jsdoc": "^31.6.0", + "eslint-plugin-prefer-arrow": "^1.2.3", + "fs-extra": "^9.1.0", + "globby": "^11.0.2", + "jsonc": "^2.0.0", + "klaw-sync": "^6.0.0", + "minimist": "^1.2.5", + "mocha": "^8.2.1", + "node-addon-api": "^3.1.0", + "node-pre-gyp-github": "^1.4.3", + "onnx-proto": "^4.0.4", + "tar-stream": "^2.2.0", + "typedoc": "^0.20.20", + "typescript": "^4.1.3" + } + }, + "node_modules/@babel/code-frame": { + "version": "7.10.4", + "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.10.4.tgz", + "integrity": "sha1-Fo2ho26Q2miujUnA8bSMfGJJITo=", + "dev": true, + "dependencies": { + "@babel/highlight": "^7.10.4" + } + }, + "node_modules/@babel/helper-validator-identifier": { + "version": "7.10.4", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.10.4.tgz", + "integrity": "sha1-p4x6clHgH2FlEtMbEK3PUq2l4NI=", + "dev": true + }, + "node_modules/@babel/highlight": { + "version": "7.10.4", + "resolved": "https://registry.npmjs.org/@babel/highlight/-/highlight-7.10.4.tgz", + "integrity": "sha1-fRvf1ldTU4+r5sOFls23bZrGAUM=", + "dev": true, + "dependencies": { + "@babel/helper-validator-identifier": "^7.10.4", + "chalk": "^2.0.0", + "js-tokens": "^4.0.0" + } + }, + "node_modules/@eslint/eslintrc": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/@eslint/eslintrc/-/eslintrc-0.3.0.tgz", + "integrity": "sha512-1JTKgrOKAHVivSvOYw+sJOunkBjUOvjqWk1DPja7ZFhIS2mX/4EgTT8M7eTK9jrKhL/FvXXEbQwIs3pg1xp3dg==", + "dev": true, + 
"dependencies": { + "ajv": "^6.12.4", + "debug": "^4.1.1", + "espree": "^7.3.0", + "globals": "^12.1.0", + "ignore": "^4.0.6", + "import-fresh": "^3.2.1", + "js-yaml": "^3.13.1", + "lodash": "^4.17.20", + "minimatch": "^3.0.4", + "strip-json-comments": "^3.1.1" + }, + "engines": { + "node": "^10.12.0 || >=12.0.0" + } + }, + "node_modules/@eslint/eslintrc/node_modules/strip-json-comments": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-3.1.1.tgz", + "integrity": "sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==", + "dev": true, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/@nodelib/fs.scandir": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.3.tgz", + "integrity": "sha1-Olgr21OATGum0UZXnEblITDPSjs=", + "dev": true, + "dependencies": { + "@nodelib/fs.stat": "2.0.3", + "run-parallel": "^1.1.9" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/@nodelib/fs.stat": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/@nodelib/fs.stat/-/fs.stat-2.0.3.tgz", + "integrity": "sha1-NNxfTKu8cg9OYPdadH5+zWwXW9M=", + "dev": true, + "engines": { + "node": ">= 8" + } + }, + "node_modules/@nodelib/fs.walk": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/@nodelib/fs.walk/-/fs.walk-1.2.4.tgz", + "integrity": "sha1-ARuSAqcKY2bkNspcBlhEUoqwSXY=", + "dev": true, + "dependencies": { + "@nodelib/fs.scandir": "2.1.3", + "fastq": "^1.6.0" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/@octokit/rest": { + "version": "15.18.3", + "resolved": "https://registry.npmjs.org/@octokit/rest/-/rest-15.18.3.tgz", + "integrity": "sha1-/07Lt4TKKGxAzB1WirztptmbNvw=", + "dev": true, + "dependencies": { + "before-after-hook": "^1.1.0", + "btoa-lite": "^1.0.0", + "debug": "^3.1.0", + "http-proxy-agent": 
"^2.1.0", + "https-proxy-agent": "^2.2.0", + "lodash": "^4.17.4", + "node-fetch": "^2.1.1", + "universal-user-agent": "^2.0.0", + "url-template": "^2.0.8" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/@octokit/rest/node_modules/debug": { + "version": "3.2.6", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz", + "integrity": "sha1-6D0X3hbYp++3cX7b5fsQE17uYps=", + "dev": true, + "dependencies": { + "ms": "^2.1.1" + } + }, + "node_modules/@protobufjs/aspromise": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz", + "integrity": "sha1-m4sMxmPWaafY9vXQiToU00jzD78=", + "dev": true + }, + "node_modules/@protobufjs/base64": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz", + "integrity": "sha1-TIVzDlm5ofHzSQR9vyQpYDS7JzU=", + "dev": true + }, + "node_modules/@protobufjs/codegen": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.4.tgz", + "integrity": "sha1-fvN/DQEPsCitGtWXIuUG2SYoFcs=", + "dev": true + }, + "node_modules/@protobufjs/eventemitter": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.0.tgz", + "integrity": "sha1-NVy8mLr61ZePntCV85diHx0Ga3A=", + "dev": true + }, + "node_modules/@protobufjs/fetch": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.0.tgz", + "integrity": "sha1-upn7WYYUr2VwDBYZ/wbUVLDYTEU=", + "dev": true, + "dependencies": { + "@protobufjs/aspromise": "^1.1.1", + "@protobufjs/inquire": "^1.1.0" + } + }, + "node_modules/@protobufjs/float": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz", + "integrity": "sha1-Xp4avctz/Ap8uLKR33jIy9l7h9E=", + "dev": true + }, + "node_modules/@protobufjs/inquire": { + "version": "1.1.0", + "resolved": 
"https://registry.npmjs.org/@protobufjs/inquire/-/inquire-1.1.0.tgz", + "integrity": "sha1-/yAOPnzyQp4tyvwRQIKOjMY48Ik=", + "dev": true + }, + "node_modules/@protobufjs/path": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz", + "integrity": "sha1-bMKyDFya1q0NzP0hynZz2Nf79o0=", + "dev": true + }, + "node_modules/@protobufjs/pool": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz", + "integrity": "sha1-Cf0V8tbTq/qbZbw2ZQbWrXhG/1Q=", + "dev": true + }, + "node_modules/@protobufjs/utf8": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz", + "integrity": "sha1-p3c2C1s5oaLlEG+OhY8v0tBgxXA=", + "dev": true + }, + "node_modules/@types/fs-extra": { + "version": "9.0.6", + "resolved": "https://registry.npmjs.org/@types/fs-extra/-/fs-extra-9.0.6.tgz", + "integrity": "sha512-ecNRHw4clCkowNOBJH1e77nvbPxHYnWIXMv1IAoG/9+MYGkgoyr3Ppxr7XYFNL41V422EDhyV4/4SSK8L2mlig==", + "dev": true, + "dependencies": { + "@types/node": "*" + } + }, + "node_modules/@types/json-schema": { + "version": "7.0.7", + "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.7.tgz", + "integrity": "sha512-cxWFQVseBm6O9Gbw1IWb8r6OS4OhSt3hPZLkFApLjM8TEXROBuQGLAH2i2gZpcXdLBIrpXuTDhH7Vbm1iXmNGA==", + "dev": true + }, + "node_modules/@types/json5": { + "version": "0.0.29", + "resolved": "https://registry.npmjs.org/@types/json5/-/json5-0.0.29.tgz", + "integrity": "sha1-7ihweulOEdK4J7y+UnC86n8+ce4=", + "dev": true + }, + "node_modules/@types/klaw-sync": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/@types/klaw-sync/-/klaw-sync-6.0.0.tgz", + "integrity": "sha1-/ws2YB76qhCdUTxM7RCTEf0GujY=", + "dev": true, + "dependencies": { + "@types/node": "*" + } + }, + "node_modules/@types/long": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/@types/long/-/long-4.0.1.tgz", + "integrity": 
"sha1-RZxl+hhn2v5qjzIsTFFpVmPMVek=", + "dev": true + }, + "node_modules/@types/minimist": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/@types/minimist/-/minimist-1.2.1.tgz", + "integrity": "sha512-fZQQafSREFyuZcdWFAExYjBiCL7AUCdgsk80iO0q4yihYYdcIiH28CcuPTGFgLOCC8RlW49GSQxdHwZP+I7CNg==", + "dev": true + }, + "node_modules/@types/mocha": { + "version": "8.2.0", + "resolved": "https://registry.npmjs.org/@types/mocha/-/mocha-8.2.0.tgz", + "integrity": "sha512-/Sge3BymXo4lKc31C8OINJgXLaw+7vL1/L1pGiBNpGrBiT8FQiaFpSYV0uhTaG4y78vcMBTMFsWaHDvuD+xGzQ==", + "dev": true + }, + "node_modules/@types/node": { + "version": "14.11.8", + "resolved": "https://registry.npmjs.org/@types/node/-/node-14.11.8.tgz", + "integrity": "sha1-/iAS8jVeTOCLykSus6u7Ic+I0z8=", + "dev": true + }, + "node_modules/@types/tar-stream": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@types/tar-stream/-/tar-stream-2.2.0.tgz", + "integrity": "sha512-sRTpT180sVigzD4SiCWJQQrqcdkWnmscWvx+cXvAoPtXbLFC5+QmKi2xwRcPe4iRu0GcVl1qTeJKUTS5hULfrw==", + "dev": true, + "dependencies": { + "@types/node": "*" + } + }, + "node_modules/@typescript-eslint/eslint-plugin": { + "version": "4.14.2", + "resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-4.14.2.tgz", + "integrity": "sha512-uMGfG7GFYK/nYutK/iqYJv6K/Xuog/vrRRZX9aEP4Zv1jsYXuvFUMDFLhUnc8WFv3D2R5QhNQL3VYKmvLS5zsQ==", + "dev": true, + "dependencies": { + "@typescript-eslint/experimental-utils": "4.14.2", + "@typescript-eslint/scope-manager": "4.14.2", + "debug": "^4.1.1", + "functional-red-black-tree": "^1.0.1", + "lodash": "^4.17.15", + "regexpp": "^3.0.0", + "semver": "^7.3.2", + "tsutils": "^3.17.1" + }, + "engines": { + "node": "^10.12.0 || >=12.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + }, + "peerDependencies": { + "@typescript-eslint/parser": "^4.0.0", + "eslint": "^5.0.0 || ^6.0.0 || ^7.0.0" + }, + "peerDependenciesMeta": 
{ + "typescript": { + "optional": true + } + } + }, + "node_modules/@typescript-eslint/eslint-plugin/node_modules/lru-cache": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz", + "integrity": "sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==", + "dev": true, + "dependencies": { + "yallist": "^4.0.0" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/@typescript-eslint/eslint-plugin/node_modules/semver": { + "version": "7.3.4", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.4.tgz", + "integrity": "sha512-tCfb2WLjqFAtXn4KEdxIhalnRtoKFN7nAwj0B3ZXCbQloV2tq5eDbcTmT68JJD3nRJq24/XgxtQKFIpQdtvmVw==", + "dev": true, + "dependencies": { + "lru-cache": "^6.0.0" + }, + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/@typescript-eslint/eslint-plugin/node_modules/yallist": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz", + "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==", + "dev": true + }, + "node_modules/@typescript-eslint/experimental-utils": { + "version": "4.14.2", + "resolved": "https://registry.npmjs.org/@typescript-eslint/experimental-utils/-/experimental-utils-4.14.2.tgz", + "integrity": "sha512-mV9pmET4C2y2WlyHmD+Iun8SAEqkLahHGBkGqDVslHkmoj3VnxnGP4ANlwuxxfq1BsKdl/MPieDbohCEQgKrwA==", + "dev": true, + "dependencies": { + "@types/json-schema": "^7.0.3", + "@typescript-eslint/scope-manager": "4.14.2", + "@typescript-eslint/types": "4.14.2", + "@typescript-eslint/typescript-estree": "4.14.2", + "eslint-scope": "^5.0.0", + "eslint-utils": "^2.0.0" + }, + "engines": { + "node": "^10.12.0 || >=12.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + }, + "peerDependencies": { + "eslint": "*" + } + }, + "node_modules/@typescript-eslint/parser": 
{ + "version": "4.14.2", + "resolved": "https://registry.npmjs.org/@typescript-eslint/parser/-/parser-4.14.2.tgz", + "integrity": "sha512-ipqSP6EuUsMu3E10EZIApOJgWSpcNXeKZaFeNKQyzqxnQl8eQCbV+TSNsl+s2GViX2d18m1rq3CWgnpOxDPgHg==", + "dev": true, + "dependencies": { + "@typescript-eslint/scope-manager": "4.14.2", + "@typescript-eslint/types": "4.14.2", + "@typescript-eslint/typescript-estree": "4.14.2", + "debug": "^4.1.1" + }, + "engines": { + "node": "^10.12.0 || >=12.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + }, + "peerDependencies": { + "eslint": "^5.0.0 || ^6.0.0 || ^7.0.0" + }, + "peerDependenciesMeta": { + "typescript": { + "optional": true + } + } + }, + "node_modules/@typescript-eslint/scope-manager": { + "version": "4.14.2", + "resolved": "https://registry.npmjs.org/@typescript-eslint/scope-manager/-/scope-manager-4.14.2.tgz", + "integrity": "sha512-cuV9wMrzKm6yIuV48aTPfIeqErt5xceTheAgk70N1V4/2Ecj+fhl34iro/vIssJlb7XtzcaD07hWk7Jk0nKghg==", + "dev": true, + "dependencies": { + "@typescript-eslint/types": "4.14.2", + "@typescript-eslint/visitor-keys": "4.14.2" + }, + "engines": { + "node": "^8.10.0 || ^10.13.0 || >=11.10.1" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + } + }, + "node_modules/@typescript-eslint/types": { + "version": "4.14.2", + "resolved": "https://registry.npmjs.org/@typescript-eslint/types/-/types-4.14.2.tgz", + "integrity": "sha512-LltxawRW6wXy4Gck6ZKlBD05tCHQUj4KLn4iR69IyRiDHX3d3NCAhO+ix5OR2Q+q9bjCrHE/HKt+riZkd1At8Q==", + "dev": true, + "engines": { + "node": "^8.10.0 || ^10.13.0 || >=11.10.1" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + } + }, + "node_modules/@typescript-eslint/typescript-estree": { + "version": "4.14.2", + "resolved": "https://registry.npmjs.org/@typescript-eslint/typescript-estree/-/typescript-estree-4.14.2.tgz", + "integrity": 
"sha512-ESiFl8afXxt1dNj8ENEZT12p+jl9PqRur+Y19m0Z/SPikGL6rqq4e7Me60SU9a2M28uz48/8yct97VQYaGl0Vg==", + "dev": true, + "dependencies": { + "@typescript-eslint/types": "4.14.2", + "@typescript-eslint/visitor-keys": "4.14.2", + "debug": "^4.1.1", + "globby": "^11.0.1", + "is-glob": "^4.0.1", + "lodash": "^4.17.15", + "semver": "^7.3.2", + "tsutils": "^3.17.1" + }, + "engines": { + "node": "^10.12.0 || >=12.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + }, + "peerDependenciesMeta": { + "typescript": { + "optional": true + } + } + }, + "node_modules/@typescript-eslint/typescript-estree/node_modules/lru-cache": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz", + "integrity": "sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==", + "dev": true, + "dependencies": { + "yallist": "^4.0.0" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/@typescript-eslint/typescript-estree/node_modules/semver": { + "version": "7.3.4", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.4.tgz", + "integrity": "sha512-tCfb2WLjqFAtXn4KEdxIhalnRtoKFN7nAwj0B3ZXCbQloV2tq5eDbcTmT68JJD3nRJq24/XgxtQKFIpQdtvmVw==", + "dev": true, + "dependencies": { + "lru-cache": "^6.0.0" + }, + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/@typescript-eslint/typescript-estree/node_modules/yallist": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz", + "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==", + "dev": true + }, + "node_modules/@typescript-eslint/visitor-keys": { + "version": "4.14.2", + "resolved": "https://registry.npmjs.org/@typescript-eslint/visitor-keys/-/visitor-keys-4.14.2.tgz", + "integrity": 
"sha512-KBB+xLBxnBdTENs/rUgeUKO0UkPBRs2vD09oMRRIkj5BEN8PX1ToXV532desXfpQnZsYTyLLviS7JrPhdL154w==", + "dev": true, + "dependencies": { + "@typescript-eslint/types": "4.14.2", + "eslint-visitor-keys": "^2.0.0" + }, + "engines": { + "node": "^8.10.0 || ^10.13.0 || >=11.10.1" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + } + }, + "node_modules/@ungap/promise-all-settled": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@ungap/promise-all-settled/-/promise-all-settled-1.1.2.tgz", + "integrity": "sha512-sL/cEvJWAnClXw0wHk85/2L0G6Sj8UB0Ctc1TEMbKSsmpRosqhwj9gWgFRZSrBr2f9tiXISwNhCPmlfqUqyb9Q==", + "dev": true + }, + "node_modules/acorn": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-7.4.1.tgz", + "integrity": "sha512-nQyp0o1/mNdbTO1PO6kHkwSrmgZ0MT/jCCpNiwbUjGoRN4dlBhqJtoQuCnEOKzgTVwg0ZWiCoQy6SxMebQVh8A==", + "dev": true, + "bin": { + "acorn": "bin/acorn" + }, + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/acorn-jsx": { + "version": "5.3.1", + "resolved": "https://registry.npmjs.org/acorn-jsx/-/acorn-jsx-5.3.1.tgz", + "integrity": "sha512-K0Ptm/47OKfQRpNQ2J/oIN/3QYiK6FwW+eJbILhsdxh2WTLdl+30o8aGdTbm5JbffpFFAg/g+zi1E+jvJha5ng==", + "dev": true, + "peerDependencies": { + "acorn": "^6.0.0 || ^7.0.0 || ^8.0.0" + } + }, + "node_modules/agent-base": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-4.3.0.tgz", + "integrity": "sha1-gWXwHENgCbzK0LHRIvBe13Dvxu4=", + "dev": true, + "dependencies": { + "es6-promisify": "^5.0.0" + }, + "engines": { + "node": ">= 4.0.0" + } + }, + "node_modules/ajv": { + "version": "6.12.6", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", + "integrity": "sha1-uvWmLoArB9l3A0WG+MO69a3ybfQ=", + "dev": true, + "dependencies": { + "fast-deep-equal": "^3.1.1", + "fast-json-stable-stringify": "^2.0.0", + "json-schema-traverse": "^0.4.1", + "uri-js": "^4.2.2" + } + }, + 
"node_modules/ansi": { + "version": "0.3.1", + "resolved": "https://registry.npmjs.org/ansi/-/ansi-0.3.1.tgz", + "integrity": "sha1-DELU+xcWDVqa8eSEus4cZpIsGyE=", + "dev": true + }, + "node_modules/ansi-colors": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/ansi-colors/-/ansi-colors-4.1.1.tgz", + "integrity": "sha512-JoX0apGbHaUJBNl6yF+p6JAFYZ666/hhCGKN5t9QFjbJQKUU/g8MNbFDbvfrgKXvI1QpZplPOnwIo99lX/AAmA==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/ansi-regex": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-2.1.1.tgz", + "integrity": "sha1-w7M6te42DYbg5ijwRorn7yfWVN8=", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/ansi-styles": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-3.2.1.tgz", + "integrity": "sha1-QfuyAkPlCxK+DwS43tvwdSDOhB0=", + "dev": true, + "dependencies": { + "color-convert": "^1.9.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/anymatch": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.1.tgz", + "integrity": "sha512-mM8522psRCqzV+6LhomX5wgp25YVibjh8Wj23I5RPkPppSVSjyKD2A2mBJmWGa+KN7f2D6LNh9jkBCeyLktzjg==", + "dev": true, + "dependencies": { + "normalize-path": "^3.0.0", + "picomatch": "^2.0.4" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/aproba": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/aproba/-/aproba-1.2.0.tgz", + "integrity": "sha1-aALmJk79GMeQobDVF/DyYnvyyUo=" + }, + "node_modules/are-we-there-yet": { + "version": "1.1.5", + "resolved": "https://registry.npmjs.org/are-we-there-yet/-/are-we-there-yet-1.1.5.tgz", + "integrity": "sha1-SzXClE8GKov82mZBB2A1D+nd/CE=", + "dependencies": { + "delegates": "^1.0.0", + "readable-stream": "^2.0.6" + } + }, + "node_modules/argparse": { + "version": "1.0.10", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz", + "integrity": 
"sha1-vNZ5HqWuCXJeF+WtmIE0zUCz2RE=", + "dev": true, + "dependencies": { + "sprintf-js": "~1.0.2" + } + }, + "node_modules/array-includes": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/array-includes/-/array-includes-3.1.1.tgz", + "integrity": "sha1-zdZ+aFK9+cEhVGB4ZzIlXtJFk0g=", + "dev": true, + "dependencies": { + "define-properties": "^1.1.3", + "es-abstract": "^1.17.0", + "is-string": "^1.0.5" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/array-union": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/array-union/-/array-union-2.1.0.tgz", + "integrity": "sha1-t5hCCtvrHego2ErNii4j0+/oXo0=", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/array.prototype.flat": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/array.prototype.flat/-/array.prototype.flat-1.2.3.tgz", + "integrity": "sha1-DegrQmsDGNv9uUAInjiwQ9N/bHs=", + "dev": true, + "dependencies": { + "define-properties": "^1.1.3", + "es-abstract": "^1.17.0-next.1" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/asn1": { + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/asn1/-/asn1-0.2.4.tgz", + "integrity": "sha1-jSR136tVO7M+d7VOWeiAu4ziMTY=", + "dev": true, + "dependencies": { + "safer-buffer": "~2.1.0" + } + }, + "node_modules/assert-plus": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz", + "integrity": "sha1-8S4PPF13sLHN2RRpQuTpbB5N1SU=", + "dev": true, + "engines": { + "node": ">=0.8" + } + }, + "node_modules/astral-regex": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/astral-regex/-/astral-regex-2.0.0.tgz", + "integrity": "sha512-Z7tMw1ytTXt5jqMcOP+OQteU1VuNK9Y02uuJtKQ1Sv69jXQKKg5cibLwGJow8yzZP+eAc18EmLGPal0bp36rvQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/async": { + "version": "1.5.2", + "resolved": "https://registry.npmjs.org/async/-/async-1.5.2.tgz", + "integrity": 
"sha1-7GphrlZIDAw8skHJVhjiCJL5Zyo=", + "dev": true + }, + "node_modules/asynckit": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", + "integrity": "sha1-x57Zf380y48robyXkLzDZkdLS3k=", + "dev": true + }, + "node_modules/at-least-node": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/at-least-node/-/at-least-node-1.0.0.tgz", + "integrity": "sha1-YCzUtG6EStTv/JKoARo8RuAjjcI=", + "dev": true, + "engines": { + "node": ">= 4.0.0" + } + }, + "node_modules/aws-sign2": { + "version": "0.7.0", + "resolved": "https://registry.npmjs.org/aws-sign2/-/aws-sign2-0.7.0.tgz", + "integrity": "sha1-tG6JCTSpWR8tL2+G1+ap8bP+dqg=", + "dev": true, + "engines": { + "node": "*" + } + }, + "node_modules/aws4": { + "version": "1.10.1", + "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.10.1.tgz", + "integrity": "sha1-4eguTz6Zniz9YbFhKA0WoRH4ZCg=", + "dev": true + }, + "node_modules/balanced-match": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz", + "integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c=", + "dev": true + }, + "node_modules/base64-js": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.3.1.tgz", + "integrity": "sha1-WOzoy3XdB+ce0IxzarxfrE2/jfE=" + }, + "node_modules/bcrypt-pbkdf": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/bcrypt-pbkdf/-/bcrypt-pbkdf-1.0.2.tgz", + "integrity": "sha1-pDAdOJtqQ/m2f/PKEaP2Y342Dp4=", + "dev": true, + "dependencies": { + "tweetnacl": "^0.14.3" + } + }, + "node_modules/before-after-hook": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/before-after-hook/-/before-after-hook-1.4.0.tgz", + "integrity": "sha1-K2vyPcpPMuYo/SdHwQo3x0pLSE0=", + "dev": true + }, + "node_modules/big-integer": { + "version": "1.6.48", + "resolved": "https://registry.npmjs.org/big-integer/-/big-integer-1.6.48.tgz", + "integrity": "sha1-j9iL0WMsukocjD49cVnwi7lbS54=", + "dev": 
true, + "engines": { + "node": ">=0.6" + } + }, + "node_modules/binary": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/binary/-/binary-0.3.0.tgz", + "integrity": "sha1-n2BVO8XOjDOG87VTz/R0Yq3sqnk=", + "dev": true, + "dependencies": { + "buffers": "~0.1.1", + "chainsaw": "~0.1.0" + } + }, + "node_modules/binary-extensions": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.2.0.tgz", + "integrity": "sha512-jDctJ/IVQbZoJykoeHbhXpOlNBqGNcwXJKJog42E5HDPUwQTSdjCHdihjj0DlnheQ7blbT6dHOafNAiS8ooQKA==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/bl": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/bl/-/bl-4.0.3.tgz", + "integrity": "sha1-EtYoetwpCA4ipwXldksqlSLNxIk=", + "dependencies": { + "buffer": "^5.5.0", + "inherits": "^2.0.4", + "readable-stream": "^3.4.0" + } + }, + "node_modules/bl/node_modules/readable-stream": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.0.tgz", + "integrity": "sha1-M3u9o63AcGvT4CRCaihtS0sskZg=", + "dependencies": { + "inherits": "^2.0.3", + "string_decoder": "^1.1.1", + "util-deprecate": "^1.0.1" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/bluebird": { + "version": "3.4.7", + "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.4.7.tgz", + "integrity": "sha1-9y12C+Cbf3bQjtj66Ysomo0F+rM=", + "dev": true + }, + "node_modules/brace-expansion": { + "version": "1.1.11", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", + "integrity": "sha1-PH/L9SnYcibz0vUrlm/1Jx60Qd0=", + "dev": true, + "dependencies": { + "balanced-match": "^1.0.0", + "concat-map": "0.0.1" + } + }, + "node_modules/braces": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.2.tgz", + "integrity": "sha1-NFThpGLujVmeI23zNs2epPiv4Qc=", + "dev": true, + "dependencies": { + "fill-range": "^7.0.1" + }, + "engines": 
{ + "node": ">=8" + } + }, + "node_modules/browser-stdout": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/browser-stdout/-/browser-stdout-1.3.1.tgz", + "integrity": "sha1-uqVZ7hTO1zRSIputcyZGfGH6vWA=", + "dev": true + }, + "node_modules/btoa-lite": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/btoa-lite/-/btoa-lite-1.0.0.tgz", + "integrity": "sha1-M3dm2hWAEhD92VbCLpxokaudAzc=", + "dev": true + }, + "node_modules/buffer": { + "version": "5.6.0", + "resolved": "https://registry.npmjs.org/buffer/-/buffer-5.6.0.tgz", + "integrity": "sha1-oxdJ3H2B2E2wir+Te2uMQDP2J4Y=", + "dependencies": { + "base64-js": "^1.0.2", + "ieee754": "^1.1.4" + } + }, + "node_modules/buffer-indexof-polyfill": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/buffer-indexof-polyfill/-/buffer-indexof-polyfill-1.0.2.tgz", + "integrity": "sha1-0nMhNcWZnGSyd/z5savjSYJUcpw=", + "dev": true, + "engines": { + "node": ">=0.10" + } + }, + "node_modules/buffer-shims": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/buffer-shims/-/buffer-shims-1.0.0.tgz", + "integrity": "sha1-mXjOMXOIxkmth5MCjDR37wRKi1E=", + "dev": true + }, + "node_modules/buffers": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/buffers/-/buffers-0.1.1.tgz", + "integrity": "sha1-skV5w77U1tOWru5tmorn9Ugqt7s=", + "dev": true, + "engines": { + "node": ">=0.2.0" + } + }, + "node_modules/callsites": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", + "integrity": "sha1-s2MKvYlDQy9Us/BRkjjjPNffL3M=", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/camelcase": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-2.1.1.tgz", + "integrity": "sha1-fB0W1nmhu+WcoCys7PsBHiAfWh8=", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/caseless": { + "version": "0.12.0", + "resolved": 
"https://registry.npmjs.org/caseless/-/caseless-0.12.0.tgz", + "integrity": "sha1-G2gcIf+EAzyCZUMJBolCDRhxUdw=", + "dev": true + }, + "node_modules/chainsaw": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/chainsaw/-/chainsaw-0.1.0.tgz", + "integrity": "sha1-XqtQsor+WAdNDVgpE4iCi15fvJg=", + "dev": true, + "dependencies": { + "traverse": ">=0.3.0 <0.4" + } + }, + "node_modules/chalk": { + "version": "2.4.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-2.4.2.tgz", + "integrity": "sha1-zUJUFnelQzPPVBpJEIwUMrRMlCQ=", + "dev": true, + "dependencies": { + "ansi-styles": "^3.2.1", + "escape-string-regexp": "^1.0.5", + "supports-color": "^5.3.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/chokidar": { + "version": "3.4.3", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.4.3.tgz", + "integrity": "sha512-DtM3g7juCXQxFVSNPNByEC2+NImtBuxQQvWlHunpJIS5Ocr0lG306cC7FCi7cEA0fzmybPUIl4txBIobk1gGOQ==", + "dev": true, + "dependencies": { + "anymatch": "~3.1.1", + "braces": "~3.0.2", + "fsevents": "~2.1.2", + "glob-parent": "~5.1.0", + "is-binary-path": "~2.1.0", + "is-glob": "~4.0.1", + "normalize-path": "~3.0.0", + "readdirp": "~3.5.0" + }, + "engines": { + "node": ">= 8.10.0" + }, + "optionalDependencies": { + "fsevents": "~2.1.2" + } + }, + "node_modules/chownr": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz", + "integrity": "sha1-b8nXtC0ypYNZYzdmbn0ICE2izGs=" + }, + "node_modules/clang-format": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/clang-format/-/clang-format-1.5.0.tgz", + "integrity": "sha512-C1LucFX7E+ABVYcPEbBHM4PYQ2+WInXsqsLpFlQ9cmRfSbk7A7b1I06h/nE4bQ3MsyEkb31jY2gC0Dtc76b4IA==", + "dev": true, + "dependencies": { + "async": "^1.5.2", + "glob": "^7.0.0", + "resolve": "^1.1.6" + }, + "bin": { + "check-clang-format": "bin/check-clang-format.js", + "clang-format": "index.js", + "git-clang-format": "bin/git-clang-format" + } + }, + 
"node_modules/cliui": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/cliui/-/cliui-3.2.0.tgz", + "integrity": "sha1-EgYBU3qRbSmUD5NNo7SNWFo5IT0=", + "dev": true, + "dependencies": { + "string-width": "^1.0.1", + "strip-ansi": "^3.0.1", + "wrap-ansi": "^2.0.0" + } + }, + "node_modules/cmake-js": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/cmake-js/-/cmake-js-6.1.0.tgz", + "integrity": "sha1-vsc4G1jUVKzuCdT7AEcVOgBQY6Y=", + "dev": true, + "dependencies": { + "debug": "^4", + "fs-extra": "^5.0.0", + "is-iojs": "^1.0.1", + "lodash": "^4", + "memory-stream": "0", + "npmlog": "^1.2.0", + "rc": "^1.2.7", + "request": "^2.54.0", + "semver": "^5.0.3", + "splitargs": "0", + "tar": "^4", + "unzipper": "^0.8.13", + "url-join": "0", + "which": "^1.0.9", + "yargs": "^3.6.0" + }, + "bin": { + "cmake-js": "bin/cmake-js" + }, + "engines": { + "node": ">= 10.0.0" + } + }, + "node_modules/cmake-js/node_modules/are-we-there-yet": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/are-we-there-yet/-/are-we-there-yet-1.0.6.tgz", + "integrity": "sha1-otKMkxAqpsyWJFomy5VN4G7FPww=", + "dev": true, + "dependencies": { + "delegates": "^1.0.0", + "readable-stream": "^2.0.0 || ^1.1.13" + } + }, + "node_modules/cmake-js/node_modules/fs-extra": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-5.0.0.tgz", + "integrity": "sha1-QU0BEM3QZwVzTQVWUsVBEmDDGr0=", + "dev": true, + "dependencies": { + "graceful-fs": "^4.1.2", + "jsonfile": "^4.0.0", + "universalify": "^0.1.0" + } + }, + "node_modules/cmake-js/node_modules/gauge": { + "version": "1.2.7", + "resolved": "https://registry.npmjs.org/gauge/-/gauge-1.2.7.tgz", + "integrity": "sha1-6c7FSD09TuDvRLYKfZnkk14TbZM=", + "dev": true, + "dependencies": { + "ansi": "^0.3.0", + "has-unicode": "^2.0.0", + "lodash.pad": "^4.1.0", + "lodash.padend": "^4.1.0", + "lodash.padstart": "^4.1.0" + } + }, + "node_modules/cmake-js/node_modules/npmlog": { + "version": "1.2.1", 
+ "resolved": "https://registry.npmjs.org/npmlog/-/npmlog-1.2.1.tgz", + "integrity": "sha1-KOe+YZYJtT960d0wChDWTXFiaLY=", + "dev": true, + "dependencies": { + "ansi": "~0.3.0", + "are-we-there-yet": "~1.0.0", + "gauge": "~1.2.0" + } + }, + "node_modules/code-point-at": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/code-point-at/-/code-point-at-1.1.0.tgz", + "integrity": "sha1-DQcLTQQ6W+ozovGkDi7bPZpMz3c=", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/color-convert": { + "version": "1.9.3", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.3.tgz", + "integrity": "sha1-u3GFBpDh8TZWfeYp0tVHHe2kweg=", + "dev": true, + "dependencies": { + "color-name": "1.1.3" + } + }, + "node_modules/color-name": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz", + "integrity": "sha1-p9BVi9icQveV3UIyj3QIMcpTvCU=", + "dev": true + }, + "node_modules/colors": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/colors/-/colors-1.4.0.tgz", + "integrity": "sha512-a+UqTh4kgZg/SlGvfbzDHpgRu7AAQOmmqRHJnxhRZICKFUT91brVhNNt58CMWU9PsBbv3PDCZUHbVxuDiH2mtA==", + "dev": true, + "engines": { + "node": ">=0.1.90" + } + }, + "node_modules/combined-stream": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", + "integrity": "sha1-w9RaizT9cwYxoRCoolIGgrMdWn8=", + "dev": true, + "dependencies": { + "delayed-stream": "~1.0.0" + }, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/commander": { + "version": "2.20.3", + "resolved": "https://registry.npmjs.org/commander/-/commander-2.20.3.tgz", + "integrity": "sha1-/UhehMA+tIgcIHIrpIA16FMa6zM=", + "dev": true + }, + "node_modules/comment-parser": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/comment-parser/-/comment-parser-1.1.1.tgz", + "integrity": "sha512-vue7cRi1ZO5/72FJ+wZ5+siTSBlUv3ZksTk8bWD2IkaA6obitzMZP3yI65azTJLckwmi8lxfPP5Sd9oGuZ8e2g==", + 
"dev": true, + "engines": { + "node": ">= 10.0.0" + } + }, + "node_modules/concat-map": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", + "integrity": "sha1-2Klr13/Wjfd5OnMDajug1UBdR3s=", + "dev": true + }, + "node_modules/console-control-strings": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/console-control-strings/-/console-control-strings-1.1.0.tgz", + "integrity": "sha1-PXz0Rk22RG6mRL9LOVB/mFEAjo4=" + }, + "node_modules/contains-path": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/contains-path/-/contains-path-0.1.0.tgz", + "integrity": "sha1-/ozxhP9mcLa67wGp1IYaXL7EEgo=", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/core-util-is": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", + "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=" + }, + "node_modules/cross-spawn": { + "version": "6.0.5", + "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-6.0.5.tgz", + "integrity": "sha1-Sl7Hxk364iw6FBJNus3uhG2Ay8Q=", + "dev": true, + "dependencies": { + "nice-try": "^1.0.4", + "path-key": "^2.0.1", + "semver": "^5.5.0", + "shebang-command": "^1.2.0", + "which": "^1.2.9" + }, + "engines": { + "node": ">=4.8" + } + }, + "node_modules/dashdash": { + "version": "1.14.1", + "resolved": "https://registry.npmjs.org/dashdash/-/dashdash-1.14.1.tgz", + "integrity": "sha1-hTz6D3y+L+1d4gMmuN1YEDX24vA=", + "dev": true, + "dependencies": { + "assert-plus": "^1.0.0" + }, + "engines": { + "node": ">=0.10" + } + }, + "node_modules/debug": { + "version": "4.3.1", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.1.tgz", + "integrity": "sha512-doEwdvm4PCeK4K3RQN2ZC2BYUBaxwLARCqZmMjtF8a51J2Rb0xpVloFRnCODwqjpwnAoao4pelN8l3RJdv3gRQ==", + "dev": true, + "dependencies": { + "ms": "2.1.2" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": 
true + } + } + }, + "node_modules/decamelize": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/decamelize/-/decamelize-1.2.0.tgz", + "integrity": "sha1-9lNNFRSCabIDUue+4m9QH5oZEpA=", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/decompress-response": { + "version": "4.2.1", + "resolved": "https://registry.npmjs.org/decompress-response/-/decompress-response-4.2.1.tgz", + "integrity": "sha1-QUAjzHowLaJc4uyC0NUjjMr9iYY=", + "dependencies": { + "mimic-response": "^2.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/deep-extend": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/deep-extend/-/deep-extend-0.6.0.tgz", + "integrity": "sha1-xPp8lUBKF6nD6Mp+FTcxK3NjMKw=", + "engines": { + "node": ">=4.0.0" + } + }, + "node_modules/deep-is": { + "version": "0.1.3", + "resolved": "https://registry.npmjs.org/deep-is/-/deep-is-0.1.3.tgz", + "integrity": "sha1-s2nW+128E+7PUk+RsHD+7cNXzzQ=", + "dev": true + }, + "node_modules/define-properties": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/define-properties/-/define-properties-1.1.3.tgz", + "integrity": "sha1-z4jabL7ib+bbcJT2HYcMvYTO6fE=", + "dev": true, + "dependencies": { + "object-keys": "^1.0.12" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/delayed-stream": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", + "integrity": "sha1-3zrhmayt+31ECqrgsp4icrJOxhk=", + "dev": true, + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/delegates": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delegates/-/delegates-1.0.0.tgz", + "integrity": "sha1-hMbhWbgZBP3KWaDvRM2HDTElD5o=" + }, + "node_modules/detect-libc": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-1.0.3.tgz", + "integrity": "sha1-+hN8S9aY7fVc1c0CrFWfkaTEups=", + "bin": { + "detect-libc": "bin/detect-libc.js" + }, + "engines": { + 
"node": ">=0.10" + } + }, + "node_modules/diff": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/diff/-/diff-4.0.2.tgz", + "integrity": "sha512-58lmxKSA4BNyLz+HHMUzlOEpg09FV+ev6ZMe3vJihgdxzgcwZ8VoEEPmALCZG9LmqfVoNMMKpttIYTVG6uDY7A==", + "dev": true, + "engines": { + "node": ">=0.3.1" + } + }, + "node_modules/dir-glob": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/dir-glob/-/dir-glob-3.0.1.tgz", + "integrity": "sha1-Vtv3PZkqSpO6FYT0U0Bj/S5BcX8=", + "dev": true, + "dependencies": { + "path-type": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/dir-glob/node_modules/path-type": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/path-type/-/path-type-4.0.0.tgz", + "integrity": "sha1-hO0BwKe6OAr+CdkKjBgNzZ0DBDs=", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/doctrine": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/doctrine/-/doctrine-3.0.0.tgz", + "integrity": "sha1-rd6+rXKmV023g2OdyHoSF3OXOWE=", + "dev": true, + "dependencies": { + "esutils": "^2.0.2" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/duplexer2": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/duplexer2/-/duplexer2-0.1.4.tgz", + "integrity": "sha1-ixLauHjA1p4+eJEFFmKjL8a93ME=", + "dev": true, + "dependencies": { + "readable-stream": "^2.0.2" + } + }, + "node_modules/ecc-jsbn": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/ecc-jsbn/-/ecc-jsbn-0.1.2.tgz", + "integrity": "sha1-OoOpBOVDUyh4dMVkt1SThoSamMk=", + "dev": true, + "dependencies": { + "jsbn": "~0.1.0", + "safer-buffer": "^2.1.0" + } + }, + "node_modules/emoji-regex": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", + "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", + "dev": true + }, + "node_modules/end-of-stream": { + "version": "1.4.4", + "resolved": 
"https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.4.tgz", + "integrity": "sha1-WuZKX0UFe682JuwU2gyl5LJDHrA=", + "dependencies": { + "once": "^1.4.0" + } + }, + "node_modules/enquirer": { + "version": "2.3.6", + "resolved": "https://registry.npmjs.org/enquirer/-/enquirer-2.3.6.tgz", + "integrity": "sha512-yjNnPr315/FjS4zIsUxYguYUPP2e1NK4d7E7ZOLiyYCcbFBiTMyID+2wvm2w6+pZ/odMA7cRkjhsPbltwBOrLg==", + "dev": true, + "dependencies": { + "ansi-colors": "^4.1.1" + }, + "engines": { + "node": ">=8.6" + } + }, + "node_modules/error-ex": { + "version": "1.3.2", + "resolved": "https://registry.npmjs.org/error-ex/-/error-ex-1.3.2.tgz", + "integrity": "sha1-tKxAZIEH/c3PriQvQovqihTU8b8=", + "dev": true, + "dependencies": { + "is-arrayish": "^0.2.1" + } + }, + "node_modules/es-abstract": { + "version": "1.17.7", + "resolved": "https://registry.npmjs.org/es-abstract/-/es-abstract-1.17.7.tgz", + "integrity": "sha1-pN5hsvZpifx0IWdsHLl4dXOs5Uw=", + "dev": true, + "dependencies": { + "es-to-primitive": "^1.2.1", + "function-bind": "^1.1.1", + "has": "^1.0.3", + "has-symbols": "^1.0.1", + "is-callable": "^1.2.2", + "is-regex": "^1.1.1", + "object-inspect": "^1.8.0", + "object-keys": "^1.1.1", + "object.assign": "^4.1.1", + "string.prototype.trimend": "^1.0.1", + "string.prototype.trimstart": "^1.0.1" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-to-primitive": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/es-to-primitive/-/es-to-primitive-1.2.1.tgz", + "integrity": "sha1-5VzUyc3BiLzvsDs2bHNjI/xciYo=", + "dev": true, + "dependencies": { + "is-callable": "^1.1.4", + "is-date-object": "^1.0.1", + "is-symbol": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es6-promise": { + "version": "4.2.8", + "resolved": "https://registry.npmjs.org/es6-promise/-/es6-promise-4.2.8.tgz", + "integrity": "sha1-TrIVlMlyvEBVPSduUQU5FD21Pgo=", + "dev": true + }, + "node_modules/es6-promisify": { + "version": "5.0.0", + "resolved": 
"https://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz", + "integrity": "sha1-UQnWLz5W6pZ8S2NQWu8IKRyKUgM=", + "dev": true, + "dependencies": { + "es6-promise": "^4.0.3" + } + }, + "node_modules/escape-string-regexp": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz", + "integrity": "sha1-G2HAViGQqN/2rjuyzwIAyhMLhtQ=", + "dev": true, + "engines": { + "node": ">=0.8.0" + } + }, + "node_modules/eslint": { + "version": "7.19.0", + "resolved": "https://registry.npmjs.org/eslint/-/eslint-7.19.0.tgz", + "integrity": "sha512-CGlMgJY56JZ9ZSYhJuhow61lMPPjUzWmChFya71Z/jilVos7mR/jPgaEfVGgMBY5DshbKdG8Ezb8FDCHcoMEMg==", + "dev": true, + "dependencies": { + "@babel/code-frame": "^7.0.0", + "@eslint/eslintrc": "^0.3.0", + "ajv": "^6.10.0", + "chalk": "^4.0.0", + "cross-spawn": "^7.0.2", + "debug": "^4.0.1", + "doctrine": "^3.0.0", + "enquirer": "^2.3.5", + "eslint-scope": "^5.1.1", + "eslint-utils": "^2.1.0", + "eslint-visitor-keys": "^2.0.0", + "espree": "^7.3.1", + "esquery": "^1.2.0", + "esutils": "^2.0.2", + "file-entry-cache": "^6.0.0", + "functional-red-black-tree": "^1.0.1", + "glob-parent": "^5.0.0", + "globals": "^12.1.0", + "ignore": "^4.0.6", + "import-fresh": "^3.0.0", + "imurmurhash": "^0.1.4", + "is-glob": "^4.0.0", + "js-yaml": "^3.13.1", + "json-stable-stringify-without-jsonify": "^1.0.1", + "levn": "^0.4.1", + "lodash": "^4.17.20", + "minimatch": "^3.0.4", + "natural-compare": "^1.4.0", + "optionator": "^0.9.1", + "progress": "^2.0.0", + "regexpp": "^3.1.0", + "semver": "^7.2.1", + "strip-ansi": "^6.0.0", + "strip-json-comments": "^3.1.0", + "table": "^6.0.4", + "text-table": "^0.2.0", + "v8-compile-cache": "^2.0.3" + }, + "bin": { + "eslint": "bin/eslint.js" + }, + "engines": { + "node": "^10.12.0 || >=12.0.0" + }, + "funding": { + "url": "https://opencollective.com/eslint" + } + }, + "node_modules/eslint-import-resolver-node": { + "version": "0.3.4", + "resolved": 
"https://registry.npmjs.org/eslint-import-resolver-node/-/eslint-import-resolver-node-0.3.4.tgz", + "integrity": "sha1-hf+oGULCUBLYIxCW3fZ5wDBCxxc=", + "dev": true, + "dependencies": { + "debug": "^2.6.9", + "resolve": "^1.13.1" + } + }, + "node_modules/eslint-import-resolver-node/node_modules/debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha1-XRKFFd8TT/Mn6QpMk/Tgd6U2NB8=", + "dev": true, + "dependencies": { + "ms": "2.0.0" + } + }, + "node_modules/eslint-import-resolver-node/node_modules/ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=", + "dev": true + }, + "node_modules/eslint-module-utils": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/eslint-module-utils/-/eslint-module-utils-2.6.0.tgz", + "integrity": "sha1-V569CU9Wr3eX0ZyYZsnJSGYpv6Y=", + "dev": true, + "dependencies": { + "debug": "^2.6.9", + "pkg-dir": "^2.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/eslint-module-utils/node_modules/debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha1-XRKFFd8TT/Mn6QpMk/Tgd6U2NB8=", + "dev": true, + "dependencies": { + "ms": "2.0.0" + } + }, + "node_modules/eslint-module-utils/node_modules/ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=", + "dev": true + }, + "node_modules/eslint-plugin-import": { + "version": "2.22.1", + "resolved": "https://registry.npmjs.org/eslint-plugin-import/-/eslint-plugin-import-2.22.1.tgz", + "integrity": "sha1-CJbH5qDPRBCaLZe5WQPCu2iddwI=", + "dev": true, + "dependencies": { + "array-includes": "^3.1.1", + "array.prototype.flat": "^1.2.3", + "contains-path": "^0.1.0", + "debug": "^2.6.9", + "doctrine": "1.5.0", + "eslint-import-resolver-node": "^0.3.4", + "eslint-module-utils": "^2.6.0", + 
"has": "^1.0.3", + "minimatch": "^3.0.4", + "object.values": "^1.1.1", + "read-pkg-up": "^2.0.0", + "resolve": "^1.17.0", + "tsconfig-paths": "^3.9.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/eslint-plugin-import/node_modules/debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha1-XRKFFd8TT/Mn6QpMk/Tgd6U2NB8=", + "dev": true, + "dependencies": { + "ms": "2.0.0" + } + }, + "node_modules/eslint-plugin-import/node_modules/doctrine": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/doctrine/-/doctrine-1.5.0.tgz", + "integrity": "sha1-N53Ocw9hZvds76TmcHoVmwLFpvo=", + "dev": true, + "dependencies": { + "esutils": "^2.0.2", + "isarray": "^1.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/eslint-plugin-import/node_modules/ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=", + "dev": true + }, + "node_modules/eslint-plugin-jsdoc": { + "version": "31.6.0", + "resolved": "https://registry.npmjs.org/eslint-plugin-jsdoc/-/eslint-plugin-jsdoc-31.6.0.tgz", + "integrity": "sha512-kYhdW+BXHij9n12oHvAC27oDHKEFITz1YJP/C0NPtb+gsGJWxejh5B6dEmmj6oLYOsmNvuCVkdIcqYOyabP2QA==", + "dev": true, + "dependencies": { + "comment-parser": "1.1.1", + "debug": "^4.3.1", + "jsdoctypeparser": "^9.0.0", + "lodash": "^4.17.20", + "regextras": "^0.7.1", + "semver": "^7.3.4", + "spdx-expression-parse": "^3.0.1" + }, + "engines": { + "node": ">=10" + }, + "peerDependencies": { + "eslint": "^6.0.0 || ^7.0.0" + } + }, + "node_modules/eslint-plugin-jsdoc/node_modules/lru-cache": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz", + "integrity": "sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==", + "dev": true, + "dependencies": { + "yallist": "^4.0.0" + }, + "engines": { + "node": ">=10" + } + }, + 
"node_modules/eslint-plugin-jsdoc/node_modules/semver": { + "version": "7.3.4", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.4.tgz", + "integrity": "sha512-tCfb2WLjqFAtXn4KEdxIhalnRtoKFN7nAwj0B3ZXCbQloV2tq5eDbcTmT68JJD3nRJq24/XgxtQKFIpQdtvmVw==", + "dev": true, + "dependencies": { + "lru-cache": "^6.0.0" + }, + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/eslint-plugin-jsdoc/node_modules/yallist": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz", + "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==", + "dev": true + }, + "node_modules/eslint-plugin-prefer-arrow": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/eslint-plugin-prefer-arrow/-/eslint-plugin-prefer-arrow-1.2.3.tgz", + "integrity": "sha512-J9I5PKCOJretVuiZRGvPQxCbllxGAV/viI20JO3LYblAodofBxyMnZAJ+WGeClHgANnSJberTNoFWWjrWKBuXQ==", + "dev": true, + "peerDependencies": { + "eslint": ">=2.0.0" + } + }, + "node_modules/eslint-scope": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-5.1.1.tgz", + "integrity": "sha1-54blmmbLkrP2wfsNUIqrF0hI9Iw=", + "dev": true, + "dependencies": { + "esrecurse": "^4.3.0", + "estraverse": "^4.1.1" + }, + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/eslint-utils": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/eslint-utils/-/eslint-utils-2.1.0.tgz", + "integrity": "sha512-w94dQYoauyvlDc43XnGB8lU3Zt713vNChgt4EWwhXAP2XkBvndfxF0AgIqKOOasjPIPzj9JqgwkwbCYD0/V3Zg==", + "dev": true, + "dependencies": { + "eslint-visitor-keys": "^1.1.0" + }, + "engines": { + "node": ">=6" + }, + "funding": { + "url": "https://github.com/sponsors/mysticatea" + } + }, + "node_modules/eslint-utils/node_modules/eslint-visitor-keys": { + "version": "1.3.0", + "resolved": 
"https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-1.3.0.tgz", + "integrity": "sha512-6J72N8UNa462wa/KFODt/PJ3IU60SDpC3QXC1Hjc1BXXpfL2C9R5+AU7jhe0F6GREqVMh4Juu+NY7xn+6dipUQ==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/eslint-visitor-keys": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-2.0.0.tgz", + "integrity": "sha512-QudtT6av5WXels9WjIM7qz1XD1cWGvX4gGXvp/zBn9nXG02D0utdU3Em2m/QjTnrsk6bBjmCygl3rmj118msQQ==", + "dev": true, + "engines": { + "node": ">=10" + } + }, + "node_modules/eslint/node_modules/ansi-regex": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.0.tgz", + "integrity": "sha512-bY6fj56OUQ0hU1KjFNDQuJFezqKdrAyFdIevADiqrWHwSlbmBNMHp5ak2f40Pm8JTFyM2mqxkG6ngkHO11f/lg==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/eslint/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/eslint/node_modules/chalk": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.0.tgz", + "integrity": "sha512-qwx12AxXe2Q5xQ43Ac//I6v5aXTipYrSESdOgzrN+9XjgEpyjpKuvSGaN4qE93f7TQTlerQQ8S+EQ0EyDoVL1A==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/eslint/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": 
"sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/eslint/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/eslint/node_modules/cross-spawn": { + "version": "7.0.3", + "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz", + "integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==", + "dev": true, + "dependencies": { + "path-key": "^3.1.0", + "shebang-command": "^2.0.0", + "which": "^2.0.1" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/eslint/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/eslint/node_modules/lru-cache": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz", + "integrity": "sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==", + "dev": true, + "dependencies": { + "yallist": "^4.0.0" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/eslint/node_modules/path-key": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz", + "integrity": "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/eslint/node_modules/semver": { + "version": "7.3.4", + "resolved": 
"https://registry.npmjs.org/semver/-/semver-7.3.4.tgz", + "integrity": "sha512-tCfb2WLjqFAtXn4KEdxIhalnRtoKFN7nAwj0B3ZXCbQloV2tq5eDbcTmT68JJD3nRJq24/XgxtQKFIpQdtvmVw==", + "dev": true, + "dependencies": { + "lru-cache": "^6.0.0" + }, + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/eslint/node_modules/shebang-command": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz", + "integrity": "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==", + "dev": true, + "dependencies": { + "shebang-regex": "^3.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/eslint/node_modules/shebang-regex": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/shebang-regex/-/shebang-regex-3.0.0.tgz", + "integrity": "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/eslint/node_modules/strip-ansi": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.0.tgz", + "integrity": "sha512-AuvKTrTfQNYNIctbR1K/YGTR1756GycPsg7b9bdV9Duqur4gv6aKqHXah67Z8ImS7WEz5QVcOtlfW2rZEugt6w==", + "dev": true, + "dependencies": { + "ansi-regex": "^5.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/eslint/node_modules/strip-json-comments": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-3.1.1.tgz", + "integrity": "sha1-MfEoGzgyYwQ0gxwxDAHMzajL4AY=", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/eslint/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + 
"dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/eslint/node_modules/which": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", + "integrity": "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==", + "dev": true, + "dependencies": { + "isexe": "^2.0.0" + }, + "bin": { + "node-which": "bin/node-which" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/eslint/node_modules/yallist": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz", + "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==", + "dev": true + }, + "node_modules/espree": { + "version": "7.3.1", + "resolved": "https://registry.npmjs.org/espree/-/espree-7.3.1.tgz", + "integrity": "sha512-v3JCNCE64umkFpmkFGqzVKsOT0tN1Zr+ueqLZfpV1Ob8e+CEgPWa+OxCoGH3tnhimMKIaBm4m/vaRpJ/krRz2g==", + "dev": true, + "dependencies": { + "acorn": "^7.4.0", + "acorn-jsx": "^5.3.1", + "eslint-visitor-keys": "^1.3.0" + }, + "engines": { + "node": "^10.12.0 || >=12.0.0" + } + }, + "node_modules/espree/node_modules/eslint-visitor-keys": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-1.3.0.tgz", + "integrity": "sha512-6J72N8UNa462wa/KFODt/PJ3IU60SDpC3QXC1Hjc1BXXpfL2C9R5+AU7jhe0F6GREqVMh4Juu+NY7xn+6dipUQ==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/esprima": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.1.tgz", + "integrity": "sha1-E7BM2z5sXRnfkatph6hpVhmwqnE=", + "dev": true, + "bin": { + "esparse": "bin/esparse.js", + "esvalidate": "bin/esvalidate.js" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/esquery": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/esquery/-/esquery-1.3.1.tgz", + "integrity": 
"sha1-t4tYKKqOIU4p+3TE1bdS4cAz2lc=", + "dev": true, + "dependencies": { + "estraverse": "^5.1.0" + }, + "engines": { + "node": ">=0.10" + } + }, + "node_modules/esquery/node_modules/estraverse": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.2.0.tgz", + "integrity": "sha1-MH30JUfmzHMk088DwVXVzbjFOIA=", + "dev": true, + "engines": { + "node": ">=4.0" + } + }, + "node_modules/esrecurse": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/esrecurse/-/esrecurse-4.3.0.tgz", + "integrity": "sha1-eteWTWeauyi+5yzsY3WLHF0smSE=", + "dev": true, + "dependencies": { + "estraverse": "^5.2.0" + }, + "engines": { + "node": ">=4.0" + } + }, + "node_modules/esrecurse/node_modules/estraverse": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.2.0.tgz", + "integrity": "sha1-MH30JUfmzHMk088DwVXVzbjFOIA=", + "dev": true, + "engines": { + "node": ">=4.0" + } + }, + "node_modules/estraverse": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-4.3.0.tgz", + "integrity": "sha1-OYrT88WiSUi+dyXoPRGn3ijNvR0=", + "dev": true, + "engines": { + "node": ">=4.0" + } + }, + "node_modules/esutils": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.3.tgz", + "integrity": "sha1-dNLrTeC42hKTcRkQ1Qd1ubcQ72Q=", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/execa": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/execa/-/execa-1.0.0.tgz", + "integrity": "sha1-xiNqW7TfbW8V6I5/AXeYIWdJ3dg=", + "dev": true, + "dependencies": { + "cross-spawn": "^6.0.0", + "get-stream": "^4.0.0", + "is-stream": "^1.1.0", + "npm-run-path": "^2.0.0", + "p-finally": "^1.0.0", + "signal-exit": "^3.0.0", + "strip-eof": "^1.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/expand-template": { + "version": "2.0.3", + "resolved": 
"https://registry.npmjs.org/expand-template/-/expand-template-2.0.3.tgz", + "integrity": "sha1-bhSz/O4POmNA7LV9LokYaSBSpHw=", + "engines": { + "node": ">=6" + } + }, + "node_modules/extend": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz", + "integrity": "sha1-+LETa0Bx+9jrFAr/hYsQGewpFfo=", + "dev": true + }, + "node_modules/extsprintf": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/extsprintf/-/extsprintf-1.3.0.tgz", + "integrity": "sha1-lpGEQOMEGnpBT4xS48V06zw+HgU=", + "dev": true, + "engines": [ + "node >=0.6.0" + ] + }, + "node_modules/fast-deep-equal": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz", + "integrity": "sha1-On1WtVnWy8PrUSMlJE5hmmXGxSU=", + "dev": true + }, + "node_modules/fast-glob": { + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-3.2.4.tgz", + "integrity": "sha1-0grvv5lXk4Pn88xmUpFYybmFVNM=", + "dev": true, + "dependencies": { + "@nodelib/fs.stat": "^2.0.2", + "@nodelib/fs.walk": "^1.2.3", + "glob-parent": "^5.1.0", + "merge2": "^1.3.0", + "micromatch": "^4.0.2", + "picomatch": "^2.2.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/fast-json-stable-stringify": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz", + "integrity": "sha1-h0v2nG9ATCtdmcSBNBOZ/VWJJjM=", + "dev": true + }, + "node_modules/fast-levenshtein": { + "version": "2.0.6", + "resolved": "https://registry.npmjs.org/fast-levenshtein/-/fast-levenshtein-2.0.6.tgz", + "integrity": "sha1-PYpcZog6FqMMqGQ+hR8Zuqd5eRc=", + "dev": true + }, + "node_modules/fast-safe-stringify": { + "version": "2.0.7", + "resolved": "https://registry.npmjs.org/fast-safe-stringify/-/fast-safe-stringify-2.0.7.tgz", + "integrity": "sha1-EkqohYmSYfaK7bQqfAgN6dpgh0M=", + "dev": true + }, + "node_modules/fastq": { + "version": "1.8.0", + "resolved": 
"https://registry.npmjs.org/fastq/-/fastq-1.8.0.tgz", + "integrity": "sha1-VQ4fn1m7xl/hhctqm02VNXEH9IE=", + "dev": true, + "dependencies": { + "reusify": "^1.0.4" + } + }, + "node_modules/file-entry-cache": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-6.0.0.tgz", + "integrity": "sha512-fqoO76jZ3ZnYrXLDRxBR1YvOvc0k844kcOg40bgsPrE25LAb/PDqTY+ho64Xh2c8ZXgIKldchCFHczG2UVRcWA==", + "dev": true, + "dependencies": { + "flat-cache": "^3.0.4" + }, + "engines": { + "node": "^10.12.0 || >=12.0.0" + } + }, + "node_modules/fill-range": { + "version": "7.0.1", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.0.1.tgz", + "integrity": "sha1-GRmmp8df44ssfHflGYU12prN2kA=", + "dev": true, + "dependencies": { + "to-regex-range": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/find-up": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-2.1.0.tgz", + "integrity": "sha1-RdG35QbHF93UgndaK3eSCjwMV6c=", + "dev": true, + "dependencies": { + "locate-path": "^2.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/flat": { + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/flat/-/flat-5.0.2.tgz", + "integrity": "sha512-b6suED+5/3rTpUBdG1gupIl8MPFCAMA0QXwmljLhvCUKcUvdE4gWky9zpuGCcXHOsz4J9wPGNWq6OKpmIzz3hQ==", + "dev": true, + "bin": { + "flat": "cli.js" + } + }, + "node_modules/flat-cache": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/flat-cache/-/flat-cache-3.0.4.tgz", + "integrity": "sha512-dm9s5Pw7Jc0GvMYbshN6zchCA9RgQlzzEZX3vylR9IqFfS8XciblUXOKfW6SiuJ0e13eDYZoZV5wdrev7P3Nwg==", + "dev": true, + "dependencies": { + "flatted": "^3.1.0", + "rimraf": "^3.0.2" + }, + "engines": { + "node": "^10.12.0 || >=12.0.0" + } + }, + "node_modules/flat-cache/node_modules/rimraf": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz", + "integrity": 
"sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==", + "dev": true, + "dependencies": { + "glob": "^7.1.3" + }, + "bin": { + "rimraf": "bin.js" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/flatted": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.1.1.tgz", + "integrity": "sha512-zAoAQiudy+r5SvnSw3KJy5os/oRJYHzrzja/tBDqrZtNhUw8bt6y8OBzMWcjWr+8liV8Eb6yOhw8WZ7VFZ5ZzA==", + "dev": true + }, + "node_modules/forever-agent": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/forever-agent/-/forever-agent-0.6.1.tgz", + "integrity": "sha1-+8cfDEGt6zf5bFd60e1C2P2sypE=", + "dev": true, + "engines": { + "node": "*" + } + }, + "node_modules/form-data": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-2.3.3.tgz", + "integrity": "sha1-3M5SwF9kTymManq5Nr1yTO/786Y=", + "dev": true, + "dependencies": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.6", + "mime-types": "^2.1.12" + }, + "engines": { + "node": ">= 0.12" + } + }, + "node_modules/fs-constants": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz", + "integrity": "sha1-a+Dem+mYzhavivwkSXue6bfM2a0=" + }, + "node_modules/fs-extra": { + "version": "9.1.0", + "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-9.1.0.tgz", + "integrity": "sha512-hcg3ZmepS30/7BSFqRvoo3DOMQu7IjqxO5nCDt+zM9XWjb33Wg7ziNT+Qvqbuc3+gWpzO02JubVyk2G4Zvo1OQ==", + "dev": true, + "dependencies": { + "at-least-node": "^1.0.0", + "graceful-fs": "^4.2.0", + "jsonfile": "^6.0.1", + "universalify": "^2.0.0" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/fs-extra/node_modules/jsonfile": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-6.0.1.tgz", + "integrity": "sha1-mJZsuiFDeMjIS4LghZB7QL9hQXk=", + "dev": true, + "dependencies": { + "graceful-fs": "^4.1.6", + 
"universalify": "^1.0.0" + } + }, + "node_modules/fs-extra/node_modules/jsonfile/node_modules/universalify": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/universalify/-/universalify-1.0.0.tgz", + "integrity": "sha512-rb6X1W158d7pRQBg5gkR8uPaSfiids68LTJQYOtEUhoJUWBdaQHsuT/EUduxXYxcrt4r5PJ4fuHW1MHT6p0qug==", + "dev": true, + "engines": { + "node": ">= 10.0.0" + } + }, + "node_modules/fs-extra/node_modules/universalify": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/universalify/-/universalify-2.0.0.tgz", + "integrity": "sha512-hAZsKq7Yy11Zu1DE0OzWjw7nnLZmJZYTDZZyEFHZdUhV8FkH5MCfoU1XMaxXovpyW5nq5scPqq0ZDP9Zyl04oQ==", + "dev": true, + "engines": { + "node": ">= 10.0.0" + } + }, + "node_modules/fs-minipass": { + "version": "1.2.7", + "resolved": "https://registry.npmjs.org/fs-minipass/-/fs-minipass-1.2.7.tgz", + "integrity": "sha1-zP+FcIQef+QmVpPaiJNsVa7X98c=", + "dev": true, + "dependencies": { + "minipass": "^2.6.0" + } + }, + "node_modules/fs.realpath": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", + "integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8=", + "dev": true + }, + "node_modules/fsevents": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.1.3.tgz", + "integrity": "sha512-Auw9a4AxqWpa9GUfj370BMPzzyncfBABW8Mab7BGWBYDj4Isgq+cDKtx0i6u9jcX9pQDnswsaaOTgTmA5pEjuQ==", + "deprecated": "\"Please update to latest v2.3 or v2.2\"", + "dev": true, + "hasInstallScript": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/fstream": { + "version": "1.0.12", + "resolved": "https://registry.npmjs.org/fstream/-/fstream-1.0.12.tgz", + "integrity": "sha1-Touo7i1Ivk99DeUFRVVI6uWTIEU=", + "dev": true, + "dependencies": { + "graceful-fs": "^4.1.2", + "inherits": "~2.0.0", + "mkdirp": ">=0.5 0", + "rimraf": "2" + }, + "engines": { + "node": ">=0.6" + } + }, + 
"node_modules/function-bind": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.1.tgz", + "integrity": "sha1-pWiZ0+o8m6uHS7l3O3xe3pL0iV0=", + "dev": true + }, + "node_modules/functional-red-black-tree": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/functional-red-black-tree/-/functional-red-black-tree-1.0.1.tgz", + "integrity": "sha1-GwqzvVU7Kg1jmdKcDj6gslIHgyc=", + "dev": true + }, + "node_modules/gauge": { + "version": "2.7.4", + "resolved": "https://registry.npmjs.org/gauge/-/gauge-2.7.4.tgz", + "integrity": "sha1-LANAXHU4w51+s3sxcCLjJfsBi/c=", + "dependencies": { + "aproba": "^1.0.3", + "console-control-strings": "^1.0.0", + "has-unicode": "^2.0.0", + "object-assign": "^4.1.0", + "signal-exit": "^3.0.0", + "string-width": "^1.0.1", + "strip-ansi": "^3.0.1", + "wide-align": "^1.1.0" + } + }, + "node_modules/get-caller-file": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz", + "integrity": "sha1-T5RBKoLbMvNuOwuXQfipf+sDH34=", + "dev": true, + "engines": { + "node": "6.* || 8.* || >= 10.*" + } + }, + "node_modules/get-stream": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-4.1.0.tgz", + "integrity": "sha1-wbJVV189wh1Zv8ec09K0axw6VLU=", + "dev": true, + "dependencies": { + "pump": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/getpass": { + "version": "0.1.7", + "resolved": "https://registry.npmjs.org/getpass/-/getpass-0.1.7.tgz", + "integrity": "sha1-Xv+OPmhNVprkyysSgmBOi6YhSfo=", + "dev": true, + "dependencies": { + "assert-plus": "^1.0.0" + } + }, + "node_modules/github-from-package": { + "version": "0.0.0", + "resolved": "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz", + "integrity": "sha1-l/tdlr/eiXMxPyDoKI75oWf6ZM4=" + }, + "node_modules/glob": { + "version": "7.1.6", + "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.6.tgz", 
+ "integrity": "sha1-FB8zuBp8JJLhJVlDB0gMRmeSeKY=", + "dev": true, + "dependencies": { + "fs.realpath": "^1.0.0", + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "^3.0.4", + "once": "^1.3.0", + "path-is-absolute": "^1.0.0" + }, + "engines": { + "node": "*" + } + }, + "node_modules/glob-parent": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.1.tgz", + "integrity": "sha1-tsHvQXxOVmPqSY8cRa+saRa7wik=", + "dev": true, + "dependencies": { + "is-glob": "^4.0.1" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/globals": { + "version": "12.4.0", + "resolved": "https://registry.npmjs.org/globals/-/globals-12.4.0.tgz", + "integrity": "sha1-oYgTV2pBsAokqX5/gVkYwuGZJfg=", + "dev": true, + "dependencies": { + "type-fest": "^0.8.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/globby": { + "version": "11.0.2", + "resolved": "https://registry.npmjs.org/globby/-/globby-11.0.2.tgz", + "integrity": "sha512-2ZThXDvvV8fYFRVIxnrMQBipZQDr7MxKAmQK1vujaj9/7eF0efG7BPUKJ7jP7G5SLF37xKDXvO4S/KKLj/Z0og==", + "dev": true, + "dependencies": { + "array-union": "^2.1.0", + "dir-glob": "^3.0.1", + "fast-glob": "^3.1.1", + "ignore": "^5.1.4", + "merge2": "^1.3.0", + "slash": "^3.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/globby/node_modules/ignore": { + "version": "5.1.8", + "resolved": "https://registry.npmjs.org/ignore/-/ignore-5.1.8.tgz", + "integrity": "sha1-8VCotQo0KJsz4i9YiavU2AFvDlc=", + "dev": true, + "engines": { + "node": ">= 4" + } + }, + "node_modules/graceful-fs": { + "version": "4.2.4", + "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.4.tgz", + "integrity": "sha1-Ila94U02MpWMRl68ltxGfKB6Kfs=", + "dev": true + }, + "node_modules/growl": { + "version": "1.10.5", + "resolved": "https://registry.npmjs.org/growl/-/growl-1.10.5.tgz", + "integrity": "sha1-8nNdwig2dPpnR4sQGBBZNVw2nl4=", + 
"dev": true, + "engines": { + "node": ">=4.x" + } + }, + "node_modules/handlebars": { + "version": "4.7.6", + "resolved": "https://registry.npmjs.org/handlebars/-/handlebars-4.7.6.tgz", + "integrity": "sha1-1MBcG6+Q6ZRfd6pop6IZqkp9904=", + "dev": true, + "dependencies": { + "minimist": "^1.2.5", + "neo-async": "^2.6.0", + "source-map": "^0.6.1", + "uglify-js": "^3.1.4", + "wordwrap": "^1.0.0" + }, + "bin": { + "handlebars": "bin/handlebars" + }, + "engines": { + "node": ">=0.4.7" + }, + "optionalDependencies": { + "uglify-js": "^3.1.4" + } + }, + "node_modules/har-schema": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/har-schema/-/har-schema-2.0.0.tgz", + "integrity": "sha1-qUwiJOvKwEeCoNkDVSHyRzW37JI=", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/har-validator": { + "version": "5.1.5", + "resolved": "https://registry.npmjs.org/har-validator/-/har-validator-5.1.5.tgz", + "integrity": "sha1-HwgDufjLIMD6E4It8ezds2veHv0=", + "dev": true, + "dependencies": { + "ajv": "^6.12.3", + "har-schema": "^2.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/has": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/has/-/has-1.0.3.tgz", + "integrity": "sha1-ci18v8H2qoJB8W3YFOAR4fQeh5Y=", + "dev": true, + "dependencies": { + "function-bind": "^1.1.1" + }, + "engines": { + "node": ">= 0.4.0" + } + }, + "node_modules/has-flag": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-3.0.0.tgz", + "integrity": "sha1-tdRU3CGZriJWmfNGfloH87lVuv0=", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/has-symbols": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.0.1.tgz", + "integrity": "sha1-n1IUdYpEGWxAbZvXbOv4HsLdMeg=", + "dev": true, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/has-unicode": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/has-unicode/-/has-unicode-2.0.1.tgz", + 
"integrity": "sha1-4Ob+aijPUROIVeCG0Wkedx3iqLk=" + }, + "node_modules/he": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/he/-/he-1.2.0.tgz", + "integrity": "sha1-hK5l+n6vsWX922FWauFLrwVmTw8=", + "dev": true, + "bin": { + "he": "bin/he" + } + }, + "node_modules/hosted-git-info": { + "version": "2.8.8", + "resolved": "https://registry.npmjs.org/hosted-git-info/-/hosted-git-info-2.8.8.tgz", + "integrity": "sha1-dTm9S8Hg4KiVgVouAmJCCxKFhIg=", + "dev": true + }, + "node_modules/http-proxy-agent": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-2.1.0.tgz", + "integrity": "sha1-5IIb7vWyFCogJr1zkm/lN2McVAU=", + "dev": true, + "dependencies": { + "agent-base": "4", + "debug": "3.1.0" + }, + "engines": { + "node": ">= 4.5.0" + } + }, + "node_modules/http-proxy-agent/node_modules/debug": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.1.0.tgz", + "integrity": "sha1-W7WgZyYotkFJVmuhaBnmFRjGcmE=", + "dev": true, + "dependencies": { + "ms": "2.0.0" + } + }, + "node_modules/http-proxy-agent/node_modules/ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=", + "dev": true + }, + "node_modules/http-signature": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/http-signature/-/http-signature-1.2.0.tgz", + "integrity": "sha1-muzZJRFHcvPZW2WmCruPfBj7rOE=", + "dev": true, + "dependencies": { + "assert-plus": "^1.0.0", + "jsprim": "^1.2.2", + "sshpk": "^1.7.0" + }, + "engines": { + "node": ">=0.8", + "npm": ">=1.3.7" + } + }, + "node_modules/https-proxy-agent": { + "version": "2.2.4", + "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-2.2.4.tgz", + "integrity": "sha1-TuenN6vZJniik9mzShr00NCMeHs=", + "dev": true, + "dependencies": { + "agent-base": "^4.3.0", + "debug": "^3.1.0" + }, + "engines": { + "node": ">= 4.5.0" + } + }, + 
"node_modules/https-proxy-agent/node_modules/debug": { + "version": "3.2.6", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz", + "integrity": "sha1-6D0X3hbYp++3cX7b5fsQE17uYps=", + "dev": true, + "dependencies": { + "ms": "^2.1.1" + } + }, + "node_modules/ieee754": { + "version": "1.1.13", + "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.1.13.tgz", + "integrity": "sha1-7BaFWOlaoYH9h9N/VcMrvLZwi4Q=" + }, + "node_modules/ignore": { + "version": "4.0.6", + "resolved": "https://registry.npmjs.org/ignore/-/ignore-4.0.6.tgz", + "integrity": "sha1-dQ49tYYgh7RzfrrIIH/9HvJ7Jfw=", + "dev": true, + "engines": { + "node": ">= 4" + } + }, + "node_modules/import-fresh": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.2.1.tgz", + "integrity": "sha1-Yz/2GFBueTr1rJG/SLcmd+FcvmY=", + "dev": true, + "dependencies": { + "parent-module": "^1.0.0", + "resolve-from": "^4.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/imurmurhash": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/imurmurhash/-/imurmurhash-0.1.4.tgz", + "integrity": "sha1-khi5srkoojixPcT7a21XbyMUU+o=", + "dev": true, + "engines": { + "node": ">=0.8.19" + } + }, + "node_modules/inflight": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", + "integrity": "sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk=", + "dev": true, + "dependencies": { + "once": "^1.3.0", + "wrappy": "1" + } + }, + "node_modules/inherits": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", + "integrity": "sha1-D6LGT5MpF8NDOg3tVTY6rjdBa3w=" + }, + "node_modules/ini": { + "version": "1.3.8", + "resolved": "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz", + "integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==" + }, + "node_modules/interpret": { + "version": "1.4.0", + "resolved": 
"https://registry.npmjs.org/interpret/-/interpret-1.4.0.tgz", + "integrity": "sha1-Zlq4vE2iendKQFhOgS4+D6RbGh4=", + "dev": true, + "engines": { + "node": ">= 0.10" + } + }, + "node_modules/invert-kv": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/invert-kv/-/invert-kv-1.0.0.tgz", + "integrity": "sha1-EEqOSqym09jNFXqO+L+rLXo//bY=", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-arrayish": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.2.1.tgz", + "integrity": "sha1-d8mYQFJ6qOyxqLppe4BkWnqSap0=", + "dev": true + }, + "node_modules/is-binary-path": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-2.1.0.tgz", + "integrity": "sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw==", + "dev": true, + "dependencies": { + "binary-extensions": "^2.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/is-callable": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/is-callable/-/is-callable-1.2.2.tgz", + "integrity": "sha1-x8ZxXNItTdtI0+GZcCI6zquwgNk=", + "dev": true, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/is-date-object": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/is-date-object/-/is-date-object-1.0.2.tgz", + "integrity": "sha1-vac28s2P0G0yhE53Q7+nSUw7/X4=", + "dev": true, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/is-extglob": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", + "integrity": "sha1-qIwCU1eR8C7TfHahueqXc8gz+MI=", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-fullwidth-code-point": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-1.0.0.tgz", + "integrity": "sha1-754xOG8DGn8NZDr4L95QxFfvAMs=", + "dependencies": { + "number-is-nan": "^1.0.0" + 
}, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-glob": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.1.tgz", + "integrity": "sha1-dWfb6fL14kZ7x3q4PEopSCQHpdw=", + "dev": true, + "dependencies": { + "is-extglob": "^2.1.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-iojs": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/is-iojs/-/is-iojs-1.1.0.tgz", + "integrity": "sha1-TBEDO11dlNbqs3dd7cm+fQCDJfE=", + "dev": true + }, + "node_modules/is-negative-zero": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/is-negative-zero/-/is-negative-zero-2.0.0.tgz", + "integrity": "sha1-lVOxIbD6wohp2p7UWeIMdUN4hGE=", + "dev": true, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/is-number": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz", + "integrity": "sha1-dTU0W4lnNNX4DE0GxQlVUnoU8Ss=", + "dev": true, + "engines": { + "node": ">=0.12.0" + } + }, + "node_modules/is-plain-obj": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-plain-obj/-/is-plain-obj-2.1.0.tgz", + "integrity": "sha512-YWnfyRwxL/+SsrWYfOpUtz5b3YD+nyfkHvjbcanzk8zgyO4ASD67uVMRt8k5bM4lLMDnXfriRhOpemw+NfT1eA==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/is-regex": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.1.tgz", + "integrity": "sha1-xvmKrMVG9s7FRooHt7FTq1ZKV7k=", + "dev": true, + "dependencies": { + "has-symbols": "^1.0.1" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/is-stream": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-1.1.0.tgz", + "integrity": "sha1-EtSj3U5o4Lec6428hBc66A2RykQ=", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-string": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/is-string/-/is-string-1.0.5.tgz", + "integrity": 
"sha1-QEk+0ZjvP/R3uMf5L2ROyCpc06Y=", + "dev": true, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/is-symbol": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/is-symbol/-/is-symbol-1.0.3.tgz", + "integrity": "sha1-OOEBS55jKb4N6dJKQU/XRB7GGTc=", + "dev": true, + "dependencies": { + "has-symbols": "^1.0.1" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/is-typedarray": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-typedarray/-/is-typedarray-1.0.0.tgz", + "integrity": "sha1-5HnICFjfDBsR3dppQPlgEfzaSpo=", + "dev": true + }, + "node_modules/isarray": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", + "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=" + }, + "node_modules/isexe": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz", + "integrity": "sha1-6PvzdNxVb/iUehDcsFctYz8s+hA=", + "dev": true + }, + "node_modules/isstream": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/isstream/-/isstream-0.1.2.tgz", + "integrity": "sha1-R+Y/evVa+m+S4VAOaQ64uFKcCZo=", + "dev": true + }, + "node_modules/js-tokens": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", + "integrity": "sha1-GSA/tZmR35jjoocFDUZHzerzJJk=", + "dev": true + }, + "node_modules/js-yaml": { + "version": "3.14.0", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.14.0.tgz", + "integrity": "sha1-p6NBcPJqIbsWJCTYray0ETpp5II=", + "dev": true, + "dependencies": { + "argparse": "^1.0.7", + "esprima": "^4.0.0" + }, + "bin": { + "js-yaml": "bin/js-yaml.js" + } + }, + "node_modules/jsbn": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/jsbn/-/jsbn-0.1.1.tgz", + "integrity": "sha1-peZUwuWi3rXyAdls77yoDA7y9RM=", + "dev": true + }, + "node_modules/jsdoctypeparser": { + "version": "9.0.0", + "resolved": "https://registry.npmjs.org/jsdoctypeparser/-/jsdoctypeparser-9.0.0.tgz", + 
"integrity": "sha512-jrTA2jJIL6/DAEILBEh2/w9QxCuwmvNXIry39Ay/HVfhE3o2yVV0U44blYkqdHA/OKloJEqvJy0xU+GSdE2SIw==", + "dev": true, + "bin": { + "jsdoctypeparser": "bin/jsdoctypeparser" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/json-parse-better-errors": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/json-parse-better-errors/-/json-parse-better-errors-1.0.2.tgz", + "integrity": "sha1-u4Z8+zRQ5pEHwTHRxRS6s9yLyqk=", + "dev": true + }, + "node_modules/json-schema": { + "version": "0.2.3", + "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.2.3.tgz", + "integrity": "sha1-tIDIkuWaLwWVTOcnvT8qTogvnhM=", + "dev": true + }, + "node_modules/json-schema-traverse": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", + "integrity": "sha1-afaofZUTq4u4/mO9sJecRI5oRmA=", + "dev": true + }, + "node_modules/json-stable-stringify-without-jsonify": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/json-stable-stringify-without-jsonify/-/json-stable-stringify-without-jsonify-1.0.1.tgz", + "integrity": "sha1-nbe1lJatPzz+8wp1FC0tkwrXJlE=", + "dev": true + }, + "node_modules/json-stringify-safe": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/json-stringify-safe/-/json-stringify-safe-5.0.1.tgz", + "integrity": "sha1-Epai1Y/UXxmg9s4B1lcB4sc1tus=", + "dev": true + }, + "node_modules/json5": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/json5/-/json5-1.0.1.tgz", + "integrity": "sha1-d5+wAYYE+oVOrL9iUhgNg1Q+Pb4=", + "dev": true, + "dependencies": { + "minimist": "^1.2.0" + }, + "bin": { + "json5": "lib/cli.js" + } + }, + "node_modules/jsonc": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/jsonc/-/jsonc-2.0.0.tgz", + "integrity": "sha1-niolEA0WSpu4ZMV1F1Y3F/qIJVE=", + "dev": true, + "dependencies": { + "fast-safe-stringify": "^2.0.6", + "graceful-fs": "^4.1.15", + "mkdirp": "^0.5.1", + "parse-json": 
"^4.0.0", + "strip-bom": "^4.0.0", + "strip-json-comments": "^3.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/jsonc/node_modules/parse-json": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-4.0.0.tgz", + "integrity": "sha1-vjX1Qlvh9/bHRxhPmKeIy5lHfuA=", + "dev": true, + "dependencies": { + "error-ex": "^1.3.1", + "json-parse-better-errors": "^1.0.1" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/jsonc/node_modules/strip-bom": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-4.0.0.tgz", + "integrity": "sha1-nDUFwdtFvO3KPZz3oW9cWqOQGHg=", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/jsonc/node_modules/strip-json-comments": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-3.1.1.tgz", + "integrity": "sha1-MfEoGzgyYwQ0gxwxDAHMzajL4AY=", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/jsonfile": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-4.0.0.tgz", + "integrity": "sha1-h3Gq4HmbZAdrdmQPygWPnBDjPss=", + "dev": true, + "dependencies": { + "graceful-fs": "^4.1.6" + } + }, + "node_modules/jsprim": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/jsprim/-/jsprim-1.4.1.tgz", + "integrity": "sha1-MT5mvB5cwG5Di8G3SZwuXFastqI=", + "dev": true, + "engines": [ + "node >=0.6.0" + ], + "dependencies": { + "assert-plus": "1.0.0", + "extsprintf": "1.3.0", + "json-schema": "0.2.3", + "verror": "1.10.0" + } + }, + "node_modules/klaw-sync": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/klaw-sync/-/klaw-sync-6.0.0.tgz", + "integrity": "sha1-H9LP1W67YlAYERTwpYEWcJnCsow=", + "dev": true, + "dependencies": { + "graceful-fs": "^4.1.11" + } + }, + "node_modules/lcid": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/lcid/-/lcid-1.0.0.tgz", + "integrity": 
"sha1-MIrMr6C8SDo4Z7S28rlQYlHRuDU=", + "dev": true, + "dependencies": { + "invert-kv": "^1.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/levn": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/levn/-/levn-0.4.1.tgz", + "integrity": "sha512-+bT2uH4E5LGE7h/n3evcS/sQlJXCpIp6ym8OWJ5eV6+67Dsql/LaaT7qJBAt2rzfoa/5QBGBhxDix1dMt2kQKQ==", + "dev": true, + "dependencies": { + "prelude-ls": "^1.2.1", + "type-check": "~0.4.0" + }, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/listenercount": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/listenercount/-/listenercount-1.0.1.tgz", + "integrity": "sha1-hMinKrWcRyUyFIDJdeZQg0LnCTc=", + "dev": true + }, + "node_modules/load-json-file": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/load-json-file/-/load-json-file-2.0.0.tgz", + "integrity": "sha1-eUfkIUmvgNaWy/eXvKq8/h/inKg=", + "dev": true, + "dependencies": { + "graceful-fs": "^4.1.2", + "parse-json": "^2.2.0", + "pify": "^2.0.0", + "strip-bom": "^3.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/locate-path": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-2.0.0.tgz", + "integrity": "sha1-K1aLJl7slExtnA3pw9u7ygNUzY4=", + "dev": true, + "dependencies": { + "p-locate": "^2.0.0", + "path-exists": "^3.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/lodash": { + "version": "4.17.20", + "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.20.tgz", + "integrity": "sha1-tEqbYpe8tpjxxRo1RaKzs2jVnFI=", + "dev": true + }, + "node_modules/lodash.pad": { + "version": "4.5.1", + "resolved": "https://registry.npmjs.org/lodash.pad/-/lodash.pad-4.5.1.tgz", + "integrity": "sha1-QzCUmoM6fI2iLMIPaibE1Z3runA=", + "dev": true + }, + "node_modules/lodash.padend": { + "version": "4.6.1", + "resolved": "https://registry.npmjs.org/lodash.padend/-/lodash.padend-4.6.1.tgz", + "integrity": "sha1-U8y6BH0G4VjTEfRdpiX05J5vFm4=", + 
"dev": true + }, + "node_modules/lodash.padstart": { + "version": "4.6.1", + "resolved": "https://registry.npmjs.org/lodash.padstart/-/lodash.padstart-4.6.1.tgz", + "integrity": "sha1-0uPuv/DZ05rVD1y9G1KnvOa7YRs=", + "dev": true + }, + "node_modules/log-symbols": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/log-symbols/-/log-symbols-4.0.0.tgz", + "integrity": "sha512-FN8JBzLx6CzeMrB0tg6pqlGU1wCrXW+ZXGH481kfsBqer0hToTIiHdjH4Mq8xJUbvATujKCvaREGWpGUionraA==", + "dev": true, + "dependencies": { + "chalk": "^4.0.0" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/log-symbols/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/log-symbols/node_modules/chalk": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.0.tgz", + "integrity": "sha512-qwx12AxXe2Q5xQ43Ac//I6v5aXTipYrSESdOgzrN+9XjgEpyjpKuvSGaN4qE93f7TQTlerQQ8S+EQ0EyDoVL1A==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/log-symbols/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/log-symbols/node_modules/color-name": { + "version": "1.1.4", + "resolved": 
"https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/log-symbols/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/log-symbols/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/long": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/long/-/long-4.0.0.tgz", + "integrity": "sha1-mntxz7fTYaGU6lVSQckvdGjVvyg=", + "dev": true + }, + "node_modules/lru-cache": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", + "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==", + "dev": true, + "dependencies": { + "yallist": "^3.0.2" + } + }, + "node_modules/lunr": { + "version": "2.3.9", + "resolved": "https://registry.npmjs.org/lunr/-/lunr-2.3.9.tgz", + "integrity": "sha512-zTU3DaZaF3Rt9rhN3uBMGQD3dD2/vFQqnvZCDv4dl5iOzq2IZQqTxu90r4E5J+nP70J3ilqVCrbho2eWaeW8Ow==", + "dev": true + }, + "node_modules/macos-release": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/macos-release/-/macos-release-2.4.1.tgz", + "integrity": "sha1-ZAM9Dsal5jdRVadLGh66jlCYIKw=", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/marked": { + "version": "1.2.9", + "resolved": 
"https://registry.npmjs.org/marked/-/marked-1.2.9.tgz", + "integrity": "sha512-H8lIX2SvyitGX+TRdtS06m1jHMijKN/XjfH6Ooii9fvxMlh8QdqBfBDkGUpMWH2kQNrtixjzYUa3SH8ROTgRRw==", + "dev": true, + "bin": { + "marked": "bin/marked" + }, + "engines": { + "node": ">= 8.16.2" + } + }, + "node_modules/memory-stream": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/memory-stream/-/memory-stream-0.0.3.tgz", + "integrity": "sha1-6+jdHDuLw4wOeUHp3dWuvmtN6D8=", + "dev": true, + "dependencies": { + "readable-stream": "~1.0.26-2" + } + }, + "node_modules/memory-stream/node_modules/isarray": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-0.0.1.tgz", + "integrity": "sha1-ihis/Kmo9Bd+Cav8YDiTmwXR7t8=", + "dev": true + }, + "node_modules/memory-stream/node_modules/readable-stream": { + "version": "1.0.34", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-1.0.34.tgz", + "integrity": "sha1-Elgg40vIQtLyqq+v5MKRbuMsFXw=", + "dev": true, + "dependencies": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.1", + "isarray": "0.0.1", + "string_decoder": "~0.10.x" + } + }, + "node_modules/memory-stream/node_modules/string_decoder": { + "version": "0.10.31", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz", + "integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=", + "dev": true + }, + "node_modules/merge2": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/merge2/-/merge2-1.4.1.tgz", + "integrity": "sha1-Q2iJL4hekHRVpv19xVwMnUBJkK4=", + "dev": true, + "engines": { + "node": ">= 8" + } + }, + "node_modules/micromatch": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.2.tgz", + "integrity": "sha1-T8sJmb+fvC/L3SEvbWKbmlbDklk=", + "dev": true, + "dependencies": { + "braces": "^3.0.1", + "picomatch": "^2.0.5" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/mime-db": { + "version": "1.44.0", + "resolved": 
"https://registry.npmjs.org/mime-db/-/mime-db-1.44.0.tgz", + "integrity": "sha1-+hHF6wrKEzS0Izy01S8QxaYnL5I=", + "dev": true, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/mime-types": { + "version": "2.1.27", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.27.tgz", + "integrity": "sha1-R5SfmOJ56lMRn1ci4PNOUpvsAJ8=", + "dev": true, + "dependencies": { + "mime-db": "1.44.0" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/mimic-response": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-2.1.0.tgz", + "integrity": "sha1-0Tdj019hPQnsN+uzC6wEacDuj0M=", + "engines": { + "node": ">=8" + } + }, + "node_modules/minimatch": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz", + "integrity": "sha1-UWbihkV/AzBgZL5Ul+jbsMPTIIM=", + "dev": true, + "dependencies": { + "brace-expansion": "^1.1.7" + }, + "engines": { + "node": "*" + } + }, + "node_modules/minimist": { + "version": "1.2.5", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.5.tgz", + "integrity": "sha1-Z9ZgFLZqaoqqDAg8X9WN9OTpdgI=" + }, + "node_modules/minipass": { + "version": "2.9.0", + "resolved": "https://registry.npmjs.org/minipass/-/minipass-2.9.0.tgz", + "integrity": "sha1-5xN2Ln0+Mv7YAxFc+T4EvKn8yaY=", + "dev": true, + "dependencies": { + "safe-buffer": "^5.1.2", + "yallist": "^3.0.0" + } + }, + "node_modules/minizlib": { + "version": "1.3.3", + "resolved": "https://registry.npmjs.org/minizlib/-/minizlib-1.3.3.tgz", + "integrity": "sha1-IpDeloGKNMKVUcio0wEha9Zahh0=", + "dev": true, + "dependencies": { + "minipass": "^2.9.0" + } + }, + "node_modules/mkdirp": { + "version": "0.5.5", + "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.5.tgz", + "integrity": "sha1-2Rzv1i0UNsoPQWIOJRKI1CAJne8=", + "dev": true, + "dependencies": { + "minimist": "^1.2.5" + }, + "bin": { + "mkdirp": "bin/cmd.js" + } + }, + "node_modules/mkdirp-classic": { + 
"version": "0.5.3", + "resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz", + "integrity": "sha1-+hDJEVzG2IZb4iG6R+6b7XhgERM=" + }, + "node_modules/mocha": { + "version": "8.2.1", + "resolved": "https://registry.npmjs.org/mocha/-/mocha-8.2.1.tgz", + "integrity": "sha512-cuLBVfyFfFqbNR0uUKbDGXKGk+UDFe6aR4os78XIrMQpZl/nv7JYHcvP5MFIAb374b2zFXsdgEGwmzMtP0Xg8w==", + "dev": true, + "dependencies": { + "@ungap/promise-all-settled": "1.1.2", + "ansi-colors": "4.1.1", + "browser-stdout": "1.3.1", + "chokidar": "3.4.3", + "debug": "4.2.0", + "diff": "4.0.2", + "escape-string-regexp": "4.0.0", + "find-up": "5.0.0", + "glob": "7.1.6", + "growl": "1.10.5", + "he": "1.2.0", + "js-yaml": "3.14.0", + "log-symbols": "4.0.0", + "minimatch": "3.0.4", + "ms": "2.1.2", + "nanoid": "3.1.12", + "serialize-javascript": "5.0.1", + "strip-json-comments": "3.1.1", + "supports-color": "7.2.0", + "which": "2.0.2", + "wide-align": "1.1.3", + "workerpool": "6.0.2", + "yargs": "13.3.2", + "yargs-parser": "13.1.2", + "yargs-unparser": "2.0.0" + }, + "bin": { + "_mocha": "bin/_mocha", + "mocha": "bin/mocha" + }, + "engines": { + "node": ">= 10.12.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/mochajs" + } + }, + "node_modules/mocha/node_modules/ansi-regex": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-4.1.0.tgz", + "integrity": "sha1-i5+PCM8ay4Q3Vqg5yox+MWjFGZc=", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/mocha/node_modules/cliui": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/cliui/-/cliui-5.0.0.tgz", + "integrity": "sha1-3u/P2y6AB4SqNPRvoI4GhRx7u8U=", + "dev": true, + "dependencies": { + "string-width": "^3.1.0", + "strip-ansi": "^5.2.0", + "wrap-ansi": "^5.1.0" + } + }, + "node_modules/mocha/node_modules/debug": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.2.0.tgz", + "integrity": 
"sha512-IX2ncY78vDTjZMFUdmsvIRFY2Cf4FnD0wRs+nQwJU8Lu99/tPFdb0VybiiMTPe3I6rQmwsqQqRBvxU+bZ/I8sg==", + "deprecated": "Debug versions >=3.2.0 <3.2.7 || >=4 <4.3.1 have a low-severity ReDos regression when used in a Node.js environment. It is recommended you upgrade to 3.2.7 or 4.3.1. (https://github.com/visionmedia/debug/issues/797)", + "dev": true, + "dependencies": { + "ms": "2.1.2" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, + "node_modules/mocha/node_modules/emoji-regex": { + "version": "7.0.3", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-7.0.3.tgz", + "integrity": "sha1-kzoEBShgyF6DwSJHnEdIqOTHIVY=", + "dev": true + }, + "node_modules/mocha/node_modules/escape-string-regexp": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz", + "integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/mocha/node_modules/find-up": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-5.0.0.tgz", + "integrity": "sha512-78/PXT1wlLLDgTzDs7sjq9hzz0vXD+zn+7wypEe4fXQxCmdmqfGsEPQxmiCSQI3ajFV91bVSsvNtrJRiW6nGng==", + "dev": true, + "dependencies": { + "locate-path": "^6.0.0", + "path-exists": "^4.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/mocha/node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/mocha/node_modules/is-fullwidth-code-point": { + 
"version": "2.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz", + "integrity": "sha1-o7MKXE8ZkYMWeqq5O+764937ZU8=", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/mocha/node_modules/locate-path": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-6.0.0.tgz", + "integrity": "sha512-iPZK6eYjbxRu3uB4/WZ3EsEIMJFMqAoopl3R+zuq0UjcAm/MO6KCweDgPfP3elTztoKP3KtnVHxTn2NHBSDVUw==", + "dev": true, + "dependencies": { + "p-locate": "^5.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/mocha/node_modules/p-limit": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz", + "integrity": "sha512-TYOanM3wGwNGsZN2cVTYPArw454xnXj5qmWF1bEoAc4+cU/ol7GVh7odevjp1FNHduHc3KZMcFduxU5Xc6uJRQ==", + "dev": true, + "dependencies": { + "yocto-queue": "^0.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/mocha/node_modules/p-locate": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-5.0.0.tgz", + "integrity": "sha512-LaNjtRWUBY++zB5nE/NwcaoMylSPk+S+ZHNB1TzdbMJMny6dynpAGt7X/tl/QYq3TIeE6nxHppbo2LGymrG5Pw==", + "dev": true, + "dependencies": { + "p-limit": "^3.0.2" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/mocha/node_modules/p-try": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/p-try/-/p-try-2.2.0.tgz", + "integrity": "sha512-R4nPAVTAU0B9D35/Gk3uJf/7XYbQcyohSKdvAxIRSNghFl4e71hVoGnBNQz9cWaXxO2I10KTC+3jMdvvoKw6dQ==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/mocha/node_modules/path-exists": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-4.0.0.tgz", + 
"integrity": "sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/mocha/node_modules/string-width": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-3.1.0.tgz", + "integrity": "sha1-InZ74htirxCBV0MG9prFG2IgOWE=", + "dev": true, + "dependencies": { + "emoji-regex": "^7.0.1", + "is-fullwidth-code-point": "^2.0.0", + "strip-ansi": "^5.1.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/mocha/node_modules/strip-ansi": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-5.2.0.tgz", + "integrity": "sha1-jJpTb+tq/JYr36WxBKUJHBrZwK4=", + "dev": true, + "dependencies": { + "ansi-regex": "^4.1.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/mocha/node_modules/strip-json-comments": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-3.1.1.tgz", + "integrity": "sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==", + "dev": true, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/mocha/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/mocha/node_modules/which": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", + "integrity": "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==", + "dev": true, + "dependencies": { + "isexe": "^2.0.0" + }, + "bin": { + "node-which": 
"bin/node-which" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/mocha/node_modules/wrap-ansi": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-5.1.0.tgz", + "integrity": "sha1-H9H2cjXVttD+54EFYAG/tpTAOwk=", + "dev": true, + "dependencies": { + "ansi-styles": "^3.2.0", + "string-width": "^3.0.0", + "strip-ansi": "^5.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/mocha/node_modules/y18n": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/y18n/-/y18n-4.0.0.tgz", + "integrity": "sha1-le+U+F7MgdAHwmThkKEg8KPIVms=", + "dev": true + }, + "node_modules/mocha/node_modules/yargs": { + "version": "13.3.2", + "resolved": "https://registry.npmjs.org/yargs/-/yargs-13.3.2.tgz", + "integrity": "sha1-rX/+/sGqWVZayRX4Lcyzipwxot0=", + "dev": true, + "dependencies": { + "cliui": "^5.0.0", + "find-up": "^3.0.0", + "get-caller-file": "^2.0.1", + "require-directory": "^2.1.1", + "require-main-filename": "^2.0.0", + "set-blocking": "^2.0.0", + "string-width": "^3.0.0", + "which-module": "^2.0.0", + "y18n": "^4.0.0", + "yargs-parser": "^13.1.2" + } + }, + "node_modules/mocha/node_modules/yargs/node_modules/find-up": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-3.0.0.tgz", + "integrity": "sha512-1yD6RmLI1XBfxugvORwlck6f75tYL+iR0jqwsOrOxMZyGYqUuDhJ0l4AXdO1iX/FTs9cBAMEk1gWSEx1kSbylg==", + "dev": true, + "dependencies": { + "locate-path": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/mocha/node_modules/yargs/node_modules/locate-path": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-3.0.0.tgz", + "integrity": "sha512-7AO748wWnIhNqAuaty2ZWHkQHRSNfPVIsPIfwEOWO22AmaoVrWavlOcMR5nzTLNYvp36X220/maaRsrec1G65A==", + "dev": true, + "dependencies": { + "p-locate": "^3.0.0", + "path-exists": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/mocha/node_modules/yargs/node_modules/p-limit": { + 
"version": "2.3.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.3.0.tgz", + "integrity": "sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==", + "dev": true, + "dependencies": { + "p-try": "^2.0.0" + }, + "engines": { + "node": ">=6" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/mocha/node_modules/yargs/node_modules/p-locate": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-3.0.0.tgz", + "integrity": "sha512-x+12w/To+4GFfgJhBEpiDcLozRJGegY+Ei7/z0tSLkMmxGZNybVMSfWj9aJn8Z5Fc7dBUNJOOVgPv2H7IwulSQ==", + "dev": true, + "dependencies": { + "p-limit": "^2.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/mocha/node_modules/yargs/node_modules/path-exists": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", + "integrity": "sha1-zg6+ql94yxiSXqfYENe1mwEP1RU=", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/ms": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", + "integrity": "sha1-0J0fNXtEP0kzgqjrPM0YOHKuYAk=", + "dev": true + }, + "node_modules/nanoid": { + "version": "3.1.12", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.1.12.tgz", + "integrity": "sha512-1qstj9z5+x491jfiC4Nelk+f8XBad7LN20PmyWINJEMRSf3wcAjAWysw1qaA8z6NSKe2sjq1hRSDpBH5paCb6A==", + "dev": true, + "bin": { + "nanoid": "bin/nanoid.cjs" + }, + "engines": { + "node": "^10 || ^12 || >=13.7" + } + }, + "node_modules/napi-build-utils": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/napi-build-utils/-/napi-build-utils-1.0.2.tgz", + "integrity": "sha1-sf3cCyxG44Cgt6dvmE3UfEGhOAY=" + }, + "node_modules/natural-compare": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz", + "integrity": "sha1-Sr6/7tdUHywnrPspvbvRXI1bpPc=", + "dev": true + }, + 
"node_modules/neo-async": { + "version": "2.6.2", + "resolved": "https://registry.npmjs.org/neo-async/-/neo-async-2.6.2.tgz", + "integrity": "sha1-tKr7k+OustgXTKU88WOrfXMIMF8=", + "dev": true + }, + "node_modules/nice-try": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/nice-try/-/nice-try-1.0.5.tgz", + "integrity": "sha1-ozeKdpbOfSI+iPybdkvX7xCJ42Y=", + "dev": true + }, + "node_modules/node-abi": { + "version": "2.19.1", + "resolved": "https://registry.npmjs.org/node-abi/-/node-abi-2.19.1.tgz", + "integrity": "sha1-aqMlYdCl4v22gQ2MJWQbZXqM6oU=", + "dependencies": { + "semver": "^5.4.1" + } + }, + "node_modules/node-addon-api": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-3.1.0.tgz", + "integrity": "sha512-flmrDNB06LIl5lywUz7YlNGZH/5p0M7W28k8hzd9Lshtdh1wshD2Y+U4h9LD6KObOy1f+fEVdgprPrEymjM5uw==", + "dev": true + }, + "node_modules/node-fetch": { + "version": "2.6.1", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.6.1.tgz", + "integrity": "sha1-BFvTI2Mfdu0uK1VXM5RBa2OaAFI=", + "dev": true, + "engines": { + "node": "4.x || >=6.0.0" + } + }, + "node_modules/node-pre-gyp-github": { + "version": "1.4.3", + "resolved": "https://registry.npmjs.org/node-pre-gyp-github/-/node-pre-gyp-github-1.4.3.tgz", + "integrity": "sha1-VhmZmEMhbfaHDLLaukbBOCWAaMU=", + "dev": true, + "dependencies": { + "@octokit/rest": "^15.9.5", + "commander": "^2.17.0", + "mime-types": "^2.1.19" + }, + "bin": { + "node-pre-gyp-github": "bin/node-pre-gyp-github.js" + } + }, + "node_modules/noop-logger": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/noop-logger/-/noop-logger-0.1.1.tgz", + "integrity": "sha1-lKKxYzxPExdVMAfYlm/Q6EG2pMI=" + }, + "node_modules/normalize-package-data": { + "version": "2.5.0", + "resolved": "https://registry.npmjs.org/normalize-package-data/-/normalize-package-data-2.5.0.tgz", + "integrity": "sha1-5m2xg4sgDB38IzIl0SyzZSDiNKg=", + "dev": true, + "dependencies": { + 
"hosted-git-info": "^2.1.4", + "resolve": "^1.10.0", + "semver": "2 || 3 || 4 || 5", + "validate-npm-package-license": "^3.0.1" + } + }, + "node_modules/normalize-path": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-3.0.0.tgz", + "integrity": "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/npm-run-path": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/npm-run-path/-/npm-run-path-2.0.2.tgz", + "integrity": "sha1-NakjLfo11wZ7TLLd8jV7GHFTbF8=", + "dev": true, + "dependencies": { + "path-key": "^2.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/npmlog": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/npmlog/-/npmlog-4.1.2.tgz", + "integrity": "sha1-CKfyqL9zRgR3mp76StXMcXq7lUs=", + "dependencies": { + "are-we-there-yet": "~1.1.2", + "console-control-strings": "~1.1.0", + "gauge": "~2.7.3", + "set-blocking": "~2.0.0" + } + }, + "node_modules/number-is-nan": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/number-is-nan/-/number-is-nan-1.0.1.tgz", + "integrity": "sha1-CXtgK1NCKlIsGvuHkDGDNpQaAR0=", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/oauth-sign": { + "version": "0.9.0", + "resolved": "https://registry.npmjs.org/oauth-sign/-/oauth-sign-0.9.0.tgz", + "integrity": "sha1-R6ewFrqmi1+g7PPe4IqFxnmsZFU=", + "dev": true, + "engines": { + "node": "*" + } + }, + "node_modules/object-assign": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", + "integrity": "sha1-IQmtx5ZYh8/AXLvUQsrIv7s2CGM=", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/object-inspect": { + "version": "1.8.0", + "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.8.0.tgz", + "integrity": "sha1-34B+Xs9TpgnMa/6T6sPMe+WzqdA=", + "dev": true + }, + 
"node_modules/object-keys": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/object-keys/-/object-keys-1.1.1.tgz", + "integrity": "sha1-HEfyct8nfzsdrwYWd9nILiMixg4=", + "dev": true, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/object.assign": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/object.assign/-/object.assign-4.1.1.tgz", + "integrity": "sha1-MDhnpmbN1Bk27N7fsfjz4ypHjN0=", + "dev": true, + "dependencies": { + "define-properties": "^1.1.3", + "es-abstract": "^1.18.0-next.0", + "has-symbols": "^1.0.1", + "object-keys": "^1.1.1" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/object.assign/node_modules/es-abstract": { + "version": "1.18.0-next.1", + "resolved": "https://registry.npmjs.org/es-abstract/-/es-abstract-1.18.0-next.1.tgz", + "integrity": "sha1-bjoKS9pxflAjqzuOkL7DYQjSLGg=", + "dev": true, + "dependencies": { + "es-to-primitive": "^1.2.1", + "function-bind": "^1.1.1", + "has": "^1.0.3", + "has-symbols": "^1.0.1", + "is-callable": "^1.2.2", + "is-negative-zero": "^2.0.0", + "is-regex": "^1.1.1", + "object-inspect": "^1.8.0", + "object-keys": "^1.1.1", + "object.assign": "^4.1.1", + "string.prototype.trimend": "^1.0.1", + "string.prototype.trimstart": "^1.0.1" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/object.values": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/object.values/-/object.values-1.1.1.tgz", + "integrity": "sha1-aKmezeNWt+kpWjxeDOMdyMlT3l4=", + "dev": true, + "dependencies": { + "define-properties": "^1.1.3", + "es-abstract": "^1.17.0-next.1", + "function-bind": "^1.1.1", + "has": "^1.0.3" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/once": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", + "integrity": "sha1-WDsap3WWHUsROsF9nFC6753Xa9E=", + "dependencies": { + "wrappy": "1" + } + }, + "node_modules/onigasm": { + "version": "2.2.5", + "resolved": 
"https://registry.npmjs.org/onigasm/-/onigasm-2.2.5.tgz", + "integrity": "sha512-F+th54mPc0l1lp1ZcFMyL/jTs2Tlq4SqIHKIXGZOR/VkHkF9A7Fr5rRr5+ZG/lWeRsyrClLYRq7s/yFQ/XhWCA==", + "dev": true, + "dependencies": { + "lru-cache": "^5.1.1" + } + }, + "node_modules/onnx-proto": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/onnx-proto/-/onnx-proto-4.0.4.tgz", + "integrity": "sha1-JDGiW+4lFI6RWQbdoGh6r+O54EQ=", + "dev": true, + "dependencies": { + "protobufjs": "^6.8.8" + } + }, + "node_modules/optionator": { + "version": "0.9.1", + "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.1.tgz", + "integrity": "sha512-74RlY5FCnhq4jRxVUPKDaRwrVNXMqsGsiW6AJw4XK8hmtm10wC0ypZBLw5IIp85NZMr91+qd1RvvENwg7jjRFw==", + "dev": true, + "dependencies": { + "deep-is": "^0.1.3", + "fast-levenshtein": "^2.0.6", + "levn": "^0.4.1", + "prelude-ls": "^1.2.1", + "type-check": "^0.4.0", + "word-wrap": "^1.2.3" + }, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/os-locale": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/os-locale/-/os-locale-1.4.0.tgz", + "integrity": "sha1-IPnxeuKe00XoveWDsT0gCYA8FNk=", + "dev": true, + "dependencies": { + "lcid": "^1.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/os-name": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/os-name/-/os-name-3.1.0.tgz", + "integrity": "sha1-3sGdlmKW4c1i1wGlpm7h3ernCAE=", + "dev": true, + "dependencies": { + "macos-release": "^2.2.0", + "windows-release": "^3.1.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/p-finally": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/p-finally/-/p-finally-1.0.0.tgz", + "integrity": "sha1-P7z7FbiZpEEjs0ttzBi3JDNqLK4=", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/p-limit": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-1.3.0.tgz", + "integrity": "sha1-uGvV8MJWkJEcdZD8v8IBDVSzzLg=", + "dev": true, + 
"dependencies": { + "p-try": "^1.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/p-locate": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-2.0.0.tgz", + "integrity": "sha1-IKAQOyIqcMj9OcwuWAaA893l7EM=", + "dev": true, + "dependencies": { + "p-limit": "^1.1.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/p-try": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/p-try/-/p-try-1.0.0.tgz", + "integrity": "sha1-y8ec26+P1CKOE/Yh8rGiN8GyB7M=", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/parent-module": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz", + "integrity": "sha1-aR0nCeeMefrjoVZiJFLQB2LKqqI=", + "dev": true, + "dependencies": { + "callsites": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/parse-json": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-2.2.0.tgz", + "integrity": "sha1-9ID0BDTvgHQfhGkJn43qGPVaTck=", + "dev": true, + "dependencies": { + "error-ex": "^1.2.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/path-exists": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", + "integrity": "sha1-zg6+ql94yxiSXqfYENe1mwEP1RU=", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/path-is-absolute": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", + "integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18=", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/path-key": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/path-key/-/path-key-2.0.1.tgz", + "integrity": "sha1-QRyttXTFoUDTpLGRDUDYDMn0C0A=", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/path-parse": { + "version": "1.0.6", + "resolved": 
"https://registry.npmjs.org/path-parse/-/path-parse-1.0.6.tgz", + "integrity": "sha1-1i27VnlAXXLEc37FhgDp3c8G0kw=", + "dev": true + }, + "node_modules/path-type": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/path-type/-/path-type-2.0.0.tgz", + "integrity": "sha1-8BLMuEFbcJb8LaoQVMPXI4lZTHM=", + "dev": true, + "dependencies": { + "pify": "^2.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/performance-now": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/performance-now/-/performance-now-2.1.0.tgz", + "integrity": "sha1-Ywn04OX6kT7BxpMHrjZLSzd8nns=", + "dev": true + }, + "node_modules/picomatch": { + "version": "2.2.2", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.2.2.tgz", + "integrity": "sha1-IfMz6ba46v8CRo9RRupAbTRfTa0=", + "dev": true, + "engines": { + "node": ">=8.6" + } + }, + "node_modules/pify": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "integrity": "sha1-7RQaasBDqEnqWISY59yosVMw6Qw=", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/pkg-dir": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/pkg-dir/-/pkg-dir-2.0.0.tgz", + "integrity": "sha1-9tXREJ4Z1j7fQo4L1X4Sd3YVM0s=", + "dev": true, + "dependencies": { + "find-up": "^2.1.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/prebuild-install": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/prebuild-install/-/prebuild-install-6.0.0.tgz", + "integrity": "sha512-h2ZJ1PXHKWZpp1caLw0oX9sagVpL2YTk+ZwInQbQ3QqNd4J03O6MpFNmMTJlkfgPENWqe5kP0WjQLqz5OjLfsw==", + "dependencies": { + "detect-libc": "^1.0.3", + "expand-template": "^2.0.3", + "github-from-package": "0.0.0", + "minimist": "^1.2.3", + "mkdirp-classic": "^0.5.3", + "napi-build-utils": "^1.0.1", + "node-abi": "^2.7.0", + "noop-logger": "^0.1.1", + "npmlog": "^4.0.1", + "pump": "^3.0.0", + "rc": "^1.2.7", + "simple-get": "^3.0.3", + "tar-fs": "^2.0.0", + 
"tunnel-agent": "^0.6.0", + "which-pm-runs": "^1.0.0" + }, + "bin": { + "prebuild-install": "bin.js" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/prelude-ls": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/prelude-ls/-/prelude-ls-1.2.1.tgz", + "integrity": "sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g==", + "dev": true, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/process-nextick-args": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz", + "integrity": "sha1-eCDZsWEgzFXKmud5JoCufbptf+I=" + }, + "node_modules/progress": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz", + "integrity": "sha1-foz42PW48jnBvGi+tOt4Vn1XLvg=", + "dev": true, + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/protobufjs": { + "version": "6.10.1", + "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-6.10.1.tgz", + "integrity": "sha1-5qSE3Y8EspYp6QUzROOXDMzxPNI=", + "dev": true, + "dependencies": { + "@protobufjs/aspromise": "^1.1.2", + "@protobufjs/base64": "^1.1.2", + "@protobufjs/codegen": "^2.0.4", + "@protobufjs/eventemitter": "^1.1.0", + "@protobufjs/fetch": "^1.1.0", + "@protobufjs/float": "^1.0.2", + "@protobufjs/inquire": "^1.1.0", + "@protobufjs/path": "^1.1.2", + "@protobufjs/pool": "^1.1.0", + "@protobufjs/utf8": "^1.1.0", + "@types/long": "^4.0.1", + "@types/node": "^13.7.0", + "long": "^4.0.0" + }, + "bin": { + "pbjs": "bin/pbjs", + "pbts": "bin/pbts" + } + }, + "node_modules/protobufjs/node_modules/@types/node": { + "version": "13.13.25", + "resolved": "https://registry.npmjs.org/@types/node/-/node-13.13.25.tgz", + "integrity": "sha1-do1nErFQmadEgSqSuE/96PThPPE=", + "dev": true + }, + "node_modules/psl": { + "version": "1.8.0", + "resolved": "https://registry.npmjs.org/psl/-/psl-1.8.0.tgz", + "integrity": "sha1-kyb4vPsBOtzABf3/BWrM4CDlHCQ=", 
+ "dev": true + }, + "node_modules/pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha1-tKIRaBW94vTh6mAjVOjHVWUQemQ=", + "dependencies": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } + }, + "node_modules/punycode": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.1.1.tgz", + "integrity": "sha1-tYsBCsQMIsVldhbI0sLALHv0eew=", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/qs": { + "version": "6.5.2", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.5.2.tgz", + "integrity": "sha1-yzroBuh0BERYTvFUzo7pjUA/PjY=", + "dev": true, + "engines": { + "node": ">=0.6" + } + }, + "node_modules/randombytes": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/randombytes/-/randombytes-2.1.0.tgz", + "integrity": "sha512-vYl3iOX+4CKUWuxGi9Ukhie6fsqXqS9FE2Zaic4tNFD2N2QQaXOMFbuKK4QmDHC0JO6B1Zp41J0LpT0oR68amQ==", + "dev": true, + "dependencies": { + "safe-buffer": "^5.1.0" + } + }, + "node_modules/rc": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/rc/-/rc-1.2.8.tgz", + "integrity": "sha1-zZJL9SAKB1uDwYjNa54hG3/A0+0=", + "dependencies": { + "deep-extend": "^0.6.0", + "ini": "~1.3.0", + "minimist": "^1.2.0", + "strip-json-comments": "~2.0.1" + }, + "bin": { + "rc": "cli.js" + } + }, + "node_modules/read-pkg": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/read-pkg/-/read-pkg-2.0.0.tgz", + "integrity": "sha1-jvHAYjxqbbDcZxPEv6xGMysjaPg=", + "dev": true, + "dependencies": { + "load-json-file": "^2.0.0", + "normalize-package-data": "^2.3.2", + "path-type": "^2.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/read-pkg-up": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/read-pkg-up/-/read-pkg-up-2.0.0.tgz", + "integrity": "sha1-a3KoBImE4MQeeVEP1en6mbO1Sb4=", + "dev": true, + "dependencies": { + "find-up": "^2.0.0", + "read-pkg": "^2.0.0" + }, + "engines": { + "node": 
">=4" + } + }, + "node_modules/readable-stream": { + "version": "2.3.7", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.7.tgz", + "integrity": "sha1-Hsoc9xGu+BTAT2IlKjamL2yyO1c=", + "dependencies": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.3", + "isarray": "~1.0.0", + "process-nextick-args": "~2.0.0", + "safe-buffer": "~5.1.1", + "string_decoder": "~1.1.1", + "util-deprecate": "~1.0.1" + } + }, + "node_modules/readdirp": { + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.5.0.tgz", + "integrity": "sha512-cMhu7c/8rdhkHXWsY+osBhfSy0JikwpHK/5+imo+LpeasTF8ouErHrlYkwT0++njiyuDvc7OFY5T3ukvZ8qmFQ==", + "dev": true, + "dependencies": { + "picomatch": "^2.2.1" + }, + "engines": { + "node": ">=8.10.0" + } + }, + "node_modules/rechoir": { + "version": "0.6.2", + "resolved": "https://registry.npmjs.org/rechoir/-/rechoir-0.6.2.tgz", + "integrity": "sha1-hSBLVNuoLVdC4oyWdW70OvUOM4Q=", + "dev": true, + "dependencies": { + "resolve": "^1.1.6" + }, + "engines": { + "node": ">= 0.10" + } + }, + "node_modules/regexpp": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/regexpp/-/regexpp-3.1.0.tgz", + "integrity": "sha1-IG0K0KVkjP+9uK5GQ489xRyfeOI=", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/regextras": { + "version": "0.7.1", + "resolved": "https://registry.npmjs.org/regextras/-/regextras-0.7.1.tgz", + "integrity": "sha1-vpVxnV9D+e8Ln6B62Jt8YGmVo7I=", + "dev": true, + "engines": { + "node": ">=0.1.14" + } + }, + "node_modules/request": { + "version": "2.88.2", + "resolved": "https://registry.npmjs.org/request/-/request-2.88.2.tgz", + "integrity": "sha1-1zyRhzHLWofaBH4gcjQUb2ZNErM=", + "dev": true, + "dependencies": { + "aws-sign2": "~0.7.0", + "aws4": "^1.8.0", + "caseless": "~0.12.0", + "combined-stream": "~1.0.6", + "extend": "~3.0.2", + "forever-agent": "~0.6.1", + "form-data": "~2.3.2", + "har-validator": "~5.1.3", + "http-signature": "~1.2.0", + 
"is-typedarray": "~1.0.0", + "isstream": "~0.1.2", + "json-stringify-safe": "~5.0.1", + "mime-types": "~2.1.19", + "oauth-sign": "~0.9.0", + "performance-now": "^2.1.0", + "qs": "~6.5.2", + "safe-buffer": "^5.1.2", + "tough-cookie": "~2.5.0", + "tunnel-agent": "^0.6.0", + "uuid": "^3.3.2" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/require-directory": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", + "integrity": "sha1-jGStX9MNqxyXbiNE/+f3kqam30I=", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/require-from-string": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/require-from-string/-/require-from-string-2.0.2.tgz", + "integrity": "sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/require-main-filename": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/require-main-filename/-/require-main-filename-2.0.0.tgz", + "integrity": "sha1-0LMp7MfMD2Fkn2IhW+aa9UqomJs=", + "dev": true + }, + "node_modules/resolve": { + "version": "1.17.0", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.17.0.tgz", + "integrity": "sha1-sllBtUloIxzC0bt2p5y38sC/hEQ=", + "dev": true, + "dependencies": { + "path-parse": "^1.0.6" + } + }, + "node_modules/resolve-from": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-4.0.0.tgz", + "integrity": "sha1-SrzYUq0y3Xuqv+m0DgCjbbXzkuY=", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/reusify": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/reusify/-/reusify-1.0.4.tgz", + "integrity": "sha1-kNo4Kx4SbvwCFG6QhFqI2xKSXXY=", + "dev": true, + "engines": { + "iojs": ">=1.0.0", + "node": ">=0.10.0" + } + }, + "node_modules/rimraf": { + "version": "2.7.1", + "resolved": 
"https://registry.npmjs.org/rimraf/-/rimraf-2.7.1.tgz", + "integrity": "sha1-NXl/E6f9rcVmFCwp1PB8ytSD4+w=", + "dev": true, + "dependencies": { + "glob": "^7.1.3" + }, + "bin": { + "rimraf": "bin.js" + } + }, + "node_modules/run-parallel": { + "version": "1.1.9", + "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.1.9.tgz", + "integrity": "sha1-yd06fPn0ssS2JE4XOm7YZuYd1nk=", + "dev": true + }, + "node_modules/safe-buffer": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", + "integrity": "sha1-mR7GnSluAxN0fVm9/St0XDX4go0=" + }, + "node_modules/safer-buffer": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", + "integrity": "sha1-RPoWGwGHuVSd2Eu5GAL5vYOFzWo=", + "dev": true + }, + "node_modules/semver": { + "version": "5.7.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz", + "integrity": "sha1-qVT5Ma66UI0we78Gnv8MAclhFvc=", + "bin": { + "semver": "bin/semver" + } + }, + "node_modules/serialize-javascript": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/serialize-javascript/-/serialize-javascript-5.0.1.tgz", + "integrity": "sha512-SaaNal9imEO737H2c05Og0/8LUXG7EnsZyMa8MzkmuHoELfT6txuj0cMqRj6zfPKnmQ1yasR4PCJc8x+M4JSPA==", + "dev": true, + "dependencies": { + "randombytes": "^2.1.0" + } + }, + "node_modules/set-blocking": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz", + "integrity": "sha1-BF+XgtARrppoA93TgrJDkrPYkPc=" + }, + "node_modules/setimmediate": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/setimmediate/-/setimmediate-1.0.5.tgz", + "integrity": "sha1-KQy7Iy4waULX1+qbg3Mqt4VvgoU=", + "dev": true + }, + "node_modules/shebang-command": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-1.2.0.tgz", + "integrity": "sha1-RKrGW2lbAzmJaMOfNj/uXer98eo=", + "dev": true, + 
"dependencies": { + "shebang-regex": "^1.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/shebang-regex": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/shebang-regex/-/shebang-regex-1.0.0.tgz", + "integrity": "sha1-2kL0l0DAtC2yypcoVxyxkMmO/qM=", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/shelljs": { + "version": "0.8.4", + "resolved": "https://registry.npmjs.org/shelljs/-/shelljs-0.8.4.tgz", + "integrity": "sha1-3naE/ut2f4cWsyYHiooAh1iQ48I=", + "dev": true, + "dependencies": { + "glob": "^7.0.0", + "interpret": "^1.0.0", + "rechoir": "^0.6.2" + }, + "bin": { + "shjs": "bin/shjs" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/shiki": { + "version": "0.2.7", + "resolved": "https://registry.npmjs.org/shiki/-/shiki-0.2.7.tgz", + "integrity": "sha512-bwVc7cdtYYHEO9O+XJ8aNOskKRfaQd5Y4ovLRfbQkmiLSUaR+bdlssbZUUhbQ0JAFMYcTcJ5tjG5KtnufttDHQ==", + "dev": true, + "dependencies": { + "onigasm": "^2.2.5", + "shiki-languages": "^0.2.7", + "shiki-themes": "^0.2.7", + "vscode-textmate": "^5.2.0" + } + }, + "node_modules/shiki-languages": { + "version": "0.2.7", + "resolved": "https://registry.npmjs.org/shiki-languages/-/shiki-languages-0.2.7.tgz", + "integrity": "sha512-REmakh7pn2jCn9GDMRSK36oDgqhh+rSvJPo77sdWTOmk44C5b0XlYPwJZcFOMJWUZJE0c7FCbKclw4FLwUKLRw==", + "dev": true, + "dependencies": { + "vscode-textmate": "^5.2.0" + } + }, + "node_modules/shiki-themes": { + "version": "0.2.7", + "resolved": "https://registry.npmjs.org/shiki-themes/-/shiki-themes-0.2.7.tgz", + "integrity": "sha512-ZMmboDYw5+SEpugM8KGUq3tkZ0vXg+k60XX6NngDK7gc1Sv6YLUlanpvG3evm57uKJvfXsky/S5MzSOTtYKLjA==", + "dev": true, + "dependencies": { + "json5": "^2.1.0", + "vscode-textmate": "^5.2.0" + } + }, + "node_modules/shiki-themes/node_modules/json5": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/json5/-/json5-2.2.0.tgz", + "integrity": 
"sha512-f+8cldu7X/y7RAJurMEJmdoKXGB/X550w2Nr3tTbezL6RwEE/iMcm+tZnXeoZtKuOq6ft8+CqzEkrIgx1fPoQA==", + "dev": true, + "dependencies": { + "minimist": "^1.2.5" + }, + "bin": { + "json5": "lib/cli.js" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/signal-exit": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.3.tgz", + "integrity": "sha1-oUEMLt2PB3sItOJTyOrPyvBXRhw=" + }, + "node_modules/simple-concat": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/simple-concat/-/simple-concat-1.0.1.tgz", + "integrity": "sha1-9Gl2CCujXCJj8cirXt/ibEHJVS8=" + }, + "node_modules/simple-get": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/simple-get/-/simple-get-3.1.0.tgz", + "integrity": "sha1-tFvgYkNeUNFZVAtXYgLO7EC5xrM=", + "dependencies": { + "decompress-response": "^4.2.0", + "once": "^1.3.1", + "simple-concat": "^1.0.0" + } + }, + "node_modules/slash": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", + "integrity": "sha1-ZTm+hwwWWtvVJAIg2+Nh8bxNRjQ=", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/slice-ansi": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/slice-ansi/-/slice-ansi-4.0.0.tgz", + "integrity": "sha512-qMCMfhY040cVHT43K9BFygqYbUPFZKHOg7K73mtTWJRb8pyP3fzf4Ixd5SzdEJQ6MRUg/WBnOLxghZtKKurENQ==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.0.0", + "astral-regex": "^2.0.0", + "is-fullwidth-code-point": "^3.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/slice-ansi?sponsor=1" + } + }, + "node_modules/slice-ansi/node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + 
"node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/slice-ansi/node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/slice-ansi/node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/slice-ansi/node_modules/is-fullwidth-code-point": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", + "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/source-map": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha1-dHIq8y6WFOnCh6jQu95IteLxomM=", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/spdx-correct": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/spdx-correct/-/spdx-correct-3.1.1.tgz", + "integrity": "sha1-3s6BrJweZxPl99G28X1Gj6U9iak=", + "dev": true, + "dependencies": { + "spdx-expression-parse": "^3.0.0", + "spdx-license-ids": "^3.0.0" + } + }, + "node_modules/spdx-exceptions": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/spdx-exceptions/-/spdx-exceptions-2.3.0.tgz", + "integrity": "sha1-PyjOGnegA3JoPq3kpDMYNSeiFj0=", + "dev": true + }, + "node_modules/spdx-expression-parse": { + "version": "3.0.1", + "resolved": 
"https://registry.npmjs.org/spdx-expression-parse/-/spdx-expression-parse-3.0.1.tgz", + "integrity": "sha1-z3D1BILu/cmOPOCmgz5KU87rpnk=", + "dev": true, + "dependencies": { + "spdx-exceptions": "^2.1.0", + "spdx-license-ids": "^3.0.0" + } + }, + "node_modules/spdx-license-ids": { + "version": "3.0.6", + "resolved": "https://registry.npmjs.org/spdx-license-ids/-/spdx-license-ids-3.0.6.tgz", + "integrity": "sha1-yAdXODwoq/cpZ0SZjLwQaui4VM4=", + "dev": true + }, + "node_modules/splitargs": { + "version": "0.0.7", + "resolved": "https://registry.npmjs.org/splitargs/-/splitargs-0.0.7.tgz", + "integrity": "sha1-/p965lc3GzOxDLgNoUPPgknPazs=", + "dev": true + }, + "node_modules/sprintf-js": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz", + "integrity": "sha1-BOaSb2YolTVPPdAVIDYzuFcpfiw=", + "dev": true + }, + "node_modules/sshpk": { + "version": "1.16.1", + "resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.16.1.tgz", + "integrity": "sha1-+2YcC+8ps520B2nuOfpwCT1vaHc=", + "dev": true, + "dependencies": { + "asn1": "~0.2.3", + "assert-plus": "^1.0.0", + "bcrypt-pbkdf": "^1.0.0", + "dashdash": "^1.12.0", + "ecc-jsbn": "~0.1.1", + "getpass": "^0.1.1", + "jsbn": "~0.1.0", + "safer-buffer": "^2.0.2", + "tweetnacl": "~0.14.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/string_decoder": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", + "integrity": "sha1-nPFhG6YmhdcDCunkujQUnDrwP8g=", + "dependencies": { + "safe-buffer": "~5.1.0" + } + }, + "node_modules/string-width": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-1.0.2.tgz", + "integrity": "sha1-EYvfW4zcUaKn5w0hHgfisLmxB9M=", + "dependencies": { + "code-point-at": "^1.0.0", + "is-fullwidth-code-point": "^1.0.0", + "strip-ansi": "^3.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/string.prototype.trimend": { + "version": 
"1.0.1", + "resolved": "https://registry.npmjs.org/string.prototype.trimend/-/string.prototype.trimend-1.0.1.tgz", + "integrity": "sha1-hYEqa4R6wAInD1gIFGBkyZX7aRM=", + "dev": true, + "dependencies": { + "define-properties": "^1.1.3", + "es-abstract": "^1.17.5" + } + }, + "node_modules/string.prototype.trimstart": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/string.prototype.trimstart/-/string.prototype.trimstart-1.0.1.tgz", + "integrity": "sha1-FK9tnzSwU/fPyJty+PLuFLkDmlQ=", + "dev": true, + "dependencies": { + "define-properties": "^1.1.3", + "es-abstract": "^1.17.5" + } + }, + "node_modules/strip-ansi": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-3.0.1.tgz", + "integrity": "sha1-ajhfuIU9lS1f8F0Oiq+UJ43GPc8=", + "dependencies": { + "ansi-regex": "^2.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/strip-bom": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-3.0.0.tgz", + "integrity": "sha1-IzTBjpx1n3vdVv3vfprj1YjmjtM=", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/strip-eof": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/strip-eof/-/strip-eof-1.0.0.tgz", + "integrity": "sha1-u0P/VZim6wXYm1n80SnJgzE2Br8=", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/strip-json-comments": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-2.0.1.tgz", + "integrity": "sha1-PFMZQukIwml8DsNEhYwobHygpgo=", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/supports-color": { + "version": "5.5.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.5.0.tgz", + "integrity": "sha1-4uaaRKyHcveKHsCzW2id9lMO/I8=", + "dev": true, + "dependencies": { + "has-flag": "^3.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/table": { + "version": "6.0.7", + "resolved": 
"https://registry.npmjs.org/table/-/table-6.0.7.tgz", + "integrity": "sha512-rxZevLGTUzWna/qBLObOe16kB2RTnnbhciwgPbMMlazz1yZGVEgnZK762xyVdVznhqxrfCeBMmMkgOOaPwjH7g==", + "dev": true, + "dependencies": { + "ajv": "^7.0.2", + "lodash": "^4.17.20", + "slice-ansi": "^4.0.0", + "string-width": "^4.2.0" + }, + "engines": { + "node": ">=10.0.0" + } + }, + "node_modules/table/node_modules/ajv": { + "version": "7.0.4", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-7.0.4.tgz", + "integrity": "sha512-xzzzaqgEQfmuhbhAoqjJ8T/1okb6gAzXn/eQRNpAN1AEUoHJTNF9xCDRTtf/s3SKldtZfa+RJeTs+BQq+eZ/sw==", + "dev": true, + "dependencies": { + "fast-deep-equal": "^3.1.1", + "json-schema-traverse": "^1.0.0", + "require-from-string": "^2.0.2", + "uri-js": "^4.2.2" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/epoberezkin" + } + }, + "node_modules/table/node_modules/ansi-regex": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.0.tgz", + "integrity": "sha512-bY6fj56OUQ0hU1KjFNDQuJFezqKdrAyFdIevADiqrWHwSlbmBNMHp5ak2f40Pm8JTFyM2mqxkG6ngkHO11f/lg==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/table/node_modules/is-fullwidth-code-point": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", + "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/table/node_modules/json-schema-traverse": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", + "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==", + "dev": true + }, + "node_modules/table/node_modules/string-width": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.0.tgz", + 
"integrity": "sha512-zUz5JD+tgqtuDjMhwIg5uFVV3dtqZ9yQJlZVfq4I01/K5Paj5UHj7VyrQOJvzawSVlKpObApbfD0Ed6yJc+1eg==", + "dev": true, + "dependencies": { + "emoji-regex": "^8.0.0", + "is-fullwidth-code-point": "^3.0.0", + "strip-ansi": "^6.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/table/node_modules/strip-ansi": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.0.tgz", + "integrity": "sha512-AuvKTrTfQNYNIctbR1K/YGTR1756GycPsg7b9bdV9Duqur4gv6aKqHXah67Z8ImS7WEz5QVcOtlfW2rZEugt6w==", + "dev": true, + "dependencies": { + "ansi-regex": "^5.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/tar": { + "version": "4.4.13", + "resolved": "https://registry.npmjs.org/tar/-/tar-4.4.13.tgz", + "integrity": "sha1-Q7NkvFKIjVVSmGN7ENYHkCVKtSU=", + "dev": true, + "dependencies": { + "chownr": "^1.1.1", + "fs-minipass": "^1.2.5", + "minipass": "^2.8.6", + "minizlib": "^1.2.1", + "mkdirp": "^0.5.0", + "safe-buffer": "^5.1.2", + "yallist": "^3.0.3" + }, + "engines": { + "node": ">=4.5" + } + }, + "node_modules/tar-fs": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.0.tgz", + "integrity": "sha1-0c3RIatGXuDrnM3i01BJ0/Pa8NU=", + "dependencies": { + "chownr": "^1.1.1", + "mkdirp-classic": "^0.5.2", + "pump": "^3.0.0", + "tar-stream": "^2.0.0" + } + }, + "node_modules/tar-stream": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-2.2.0.tgz", + "integrity": "sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==", + "dependencies": { + "bl": "^4.0.3", + "end-of-stream": "^1.4.1", + "fs-constants": "^1.0.0", + "inherits": "^2.0.3", + "readable-stream": "^3.1.1" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/tar-stream/node_modules/readable-stream": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.0.tgz", + "integrity": 
"sha1-M3u9o63AcGvT4CRCaihtS0sskZg=", + "dependencies": { + "inherits": "^2.0.3", + "string_decoder": "^1.1.1", + "util-deprecate": "^1.0.1" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/text-table": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/text-table/-/text-table-0.2.0.tgz", + "integrity": "sha1-f17oI66AUgfACvLfSoTsP8+lcLQ=", + "dev": true + }, + "node_modules/to-regex-range": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz", + "integrity": "sha1-FkjESq58jZiKMmAY7XL1tN0DkuQ=", + "dev": true, + "dependencies": { + "is-number": "^7.0.0" + }, + "engines": { + "node": ">=8.0" + } + }, + "node_modules/tough-cookie": { + "version": "2.5.0", + "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-2.5.0.tgz", + "integrity": "sha1-zZ+yoKodWhK0c72fuW+j3P9lreI=", + "dev": true, + "dependencies": { + "psl": "^1.1.28", + "punycode": "^2.1.1" + }, + "engines": { + "node": ">=0.8" + } + }, + "node_modules/traverse": { + "version": "0.3.9", + "resolved": "https://registry.npmjs.org/traverse/-/traverse-0.3.9.tgz", + "integrity": "sha1-cXuPIgzAu3tE5AUUwisui7xw2Lk=", + "dev": true + }, + "node_modules/tsconfig-paths": { + "version": "3.9.0", + "resolved": "https://registry.npmjs.org/tsconfig-paths/-/tsconfig-paths-3.9.0.tgz", + "integrity": "sha1-CYVHpsREiAfo/Ljq4IEGTumjyQs=", + "dev": true, + "dependencies": { + "@types/json5": "^0.0.29", + "json5": "^1.0.1", + "minimist": "^1.2.0", + "strip-bom": "^3.0.0" + } + }, + "node_modules/tslib": { + "version": "1.14.1", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-1.14.1.tgz", + "integrity": "sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg==", + "dev": true + }, + "node_modules/tsutils": { + "version": "3.20.0", + "resolved": "https://registry.npmjs.org/tsutils/-/tsutils-3.20.0.tgz", + "integrity": 
"sha512-RYbuQuvkhuqVeXweWT3tJLKOEJ/UUw9GjNEZGWdrLLlM+611o1gwLHBpxoFJKKl25fLprp2eVthtKs5JOrNeXg==", + "dev": true, + "dependencies": { + "tslib": "^1.8.1" + }, + "engines": { + "node": ">= 6" + }, + "peerDependencies": { + "typescript": ">=2.8.0 || >= 3.2.0-dev || >= 3.3.0-dev || >= 3.4.0-dev || >= 3.5.0-dev || >= 3.6.0-dev || >= 3.6.0-beta || >= 3.7.0-dev || >= 3.7.0-beta" + } + }, + "node_modules/tunnel-agent": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz", + "integrity": "sha1-J6XeoGs2sEoKmWZ3SykIaPD8QP0=", + "dependencies": { + "safe-buffer": "^5.0.1" + }, + "engines": { + "node": "*" + } + }, + "node_modules/tweetnacl": { + "version": "0.14.5", + "resolved": "https://registry.npmjs.org/tweetnacl/-/tweetnacl-0.14.5.tgz", + "integrity": "sha1-WuaBd/GS1EViadEIr6k/+HQ/T2Q=", + "dev": true + }, + "node_modules/type-check": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/type-check/-/type-check-0.4.0.tgz", + "integrity": "sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew==", + "dev": true, + "dependencies": { + "prelude-ls": "^1.2.1" + }, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/type-fest": { + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.8.1.tgz", + "integrity": "sha1-CeJJ696FHTseSNJ8EFREZn8XuD0=", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/typedoc": { + "version": "0.20.20", + "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.20.20.tgz", + "integrity": "sha512-qXB40ttDGaqv6q6UIiAVqOpX/GlXoBur0lB4g9fePoYjfwa6OsPkoYufLtsjEaBB0EokShR2aIoI5GX4RB83cw==", + "dev": true, + "dependencies": { + "colors": "^1.4.0", + "fs-extra": "^9.1.0", + "handlebars": "^4.7.6", + "lodash": "^4.17.20", + "lunr": "^2.3.9", + "marked": "^1.2.8", + "minimatch": "^3.0.0", + "progress": "^2.0.3", + "shelljs": "^0.8.4", + "shiki": "^0.2.7", + "typedoc-default-themes": "^0.12.7" + 
}, + "bin": { + "typedoc": "bin/typedoc" + }, + "engines": { + "node": ">= 10.8.0" + }, + "peerDependencies": { + "typescript": "3.9.x || 4.0.x || 4.1.x" + } + }, + "node_modules/typedoc-default-themes": { + "version": "0.12.7", + "resolved": "https://registry.npmjs.org/typedoc-default-themes/-/typedoc-default-themes-0.12.7.tgz", + "integrity": "sha512-0XAuGEqID+gon1+fhi4LycOEFM+5Mvm2PjwaiVZNAzU7pn3G2DEpsoXnFOPlLDnHY6ZW0BY0nO7ur9fHOFkBLQ==", + "dev": true, + "engines": { + "node": ">= 8" + } + }, + "node_modules/typescript": { + "version": "4.1.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.1.3.tgz", + "integrity": "sha512-B3ZIOf1IKeH2ixgHhj6la6xdwR9QrLC5d1VKeCSY4tvkqhF2eqd9O7txNlS0PO3GrBAFIdr3L1ndNwteUbZLYg==", + "dev": true, + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=4.2.0" + } + }, + "node_modules/uglify-js": { + "version": "3.11.2", + "resolved": "https://registry.npmjs.org/uglify-js/-/uglify-js-3.11.2.tgz", + "integrity": "sha1-n1AyVUQnPCeyDlht7xQOdybFJeo=", + "dev": true, + "optional": true, + "bin": { + "uglifyjs": "bin/uglifyjs" + }, + "engines": { + "node": ">=0.8.0" + } + }, + "node_modules/universal-user-agent": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/universal-user-agent/-/universal-user-agent-2.1.0.tgz", + "integrity": "sha1-Wr+8wDahukkMuUH4/WjEbTZp6OQ=", + "dev": true, + "dependencies": { + "os-name": "^3.0.0" + } + }, + "node_modules/universalify": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/universalify/-/universalify-0.1.2.tgz", + "integrity": "sha1-tkb2m+OULavOzJ1mOcgNwQXvqmY=", + "dev": true, + "engines": { + "node": ">= 4.0.0" + } + }, + "node_modules/unzipper": { + "version": "0.8.14", + "resolved": "https://registry.npmjs.org/unzipper/-/unzipper-0.8.14.tgz", + "integrity": "sha1-reBSTNL8FNEbjeJYviL50kfT95s=", + "dev": true, + "dependencies": { + "big-integer": "^1.6.17", + "binary": "~0.3.0", + "bluebird": "~3.4.1", + 
"buffer-indexof-polyfill": "~1.0.0", + "duplexer2": "~0.1.4", + "fstream": "~1.0.10", + "listenercount": "~1.0.1", + "readable-stream": "~2.1.5", + "setimmediate": "~1.0.4" + } + }, + "node_modules/unzipper/node_modules/process-nextick-args": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-1.0.7.tgz", + "integrity": "sha1-FQ4gt1ZZCtP5EJPyWk8q2L/zC6M=", + "dev": true + }, + "node_modules/unzipper/node_modules/readable-stream": { + "version": "2.1.5", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.1.5.tgz", + "integrity": "sha1-ZvqLcg4UOLNkaB8q0aY8YYRIydA=", + "dev": true, + "dependencies": { + "buffer-shims": "^1.0.0", + "core-util-is": "~1.0.0", + "inherits": "~2.0.1", + "isarray": "~1.0.0", + "process-nextick-args": "~1.0.6", + "string_decoder": "~0.10.x", + "util-deprecate": "~1.0.1" + } + }, + "node_modules/unzipper/node_modules/string_decoder": { + "version": "0.10.31", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz", + "integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=", + "dev": true + }, + "node_modules/uri-js": { + "version": "4.4.0", + "resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.0.tgz", + "integrity": "sha1-qnFCYd55PoqCNHp7zJznTobyhgI=", + "dev": true, + "dependencies": { + "punycode": "^2.1.0" + } + }, + "node_modules/url-join": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/url-join/-/url-join-0.0.1.tgz", + "integrity": "sha1-HbSK1CLTQCRpqH99l73r/k+x48g=", + "dev": true + }, + "node_modules/url-template": { + "version": "2.0.8", + "resolved": "https://registry.npmjs.org/url-template/-/url-template-2.0.8.tgz", + "integrity": "sha1-/FZaPMy/93MMd19WQflVV5FDnyE=", + "dev": true + }, + "node_modules/util-deprecate": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", + "integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=" + }, + 
"node_modules/uuid": { + "version": "3.4.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-3.4.0.tgz", + "integrity": "sha1-sj5DWK+oogL+ehAK8fX4g/AgB+4=", + "dev": true, + "bin": { + "uuid": "bin/uuid" + } + }, + "node_modules/v8-compile-cache": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/v8-compile-cache/-/v8-compile-cache-2.1.1.tgz", + "integrity": "sha1-VLw83UMxe8qR413K8wWxpyN950U=", + "dev": true + }, + "node_modules/validate-npm-package-license": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/validate-npm-package-license/-/validate-npm-package-license-3.0.4.tgz", + "integrity": "sha1-/JH2uce6FchX9MssXe/uw51PQQo=", + "dev": true, + "dependencies": { + "spdx-correct": "^3.0.0", + "spdx-expression-parse": "^3.0.0" + } + }, + "node_modules/verror": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/verror/-/verror-1.10.0.tgz", + "integrity": "sha1-OhBcoXBTr1XW4nDB+CiGguGNpAA=", + "dev": true, + "engines": [ + "node >=0.6.0" + ], + "dependencies": { + "assert-plus": "^1.0.0", + "core-util-is": "1.0.2", + "extsprintf": "^1.2.0" + } + }, + "node_modules/vscode-textmate": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/vscode-textmate/-/vscode-textmate-5.2.0.tgz", + "integrity": "sha512-Uw5ooOQxRASHgu6C7GVvUxisKXfSgW4oFlO+aa+PAkgmH89O3CXxEEzNRNtHSqtXFTl0nAC1uYj0GMSH27uwtQ==", + "dev": true + }, + "node_modules/which": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/which/-/which-1.3.1.tgz", + "integrity": "sha1-pFBD1U9YBTFtqNYvn1CRjT2nCwo=", + "dev": true, + "dependencies": { + "isexe": "^2.0.0" + }, + "bin": { + "which": "bin/which" + } + }, + "node_modules/which-module": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/which-module/-/which-module-2.0.0.tgz", + "integrity": "sha1-2e8H3Od7mQK4o6j6SzHD4/fm6Ho=", + "dev": true + }, + "node_modules/which-pm-runs": { + "version": "1.0.0", + "resolved": 
"https://registry.npmjs.org/which-pm-runs/-/which-pm-runs-1.0.0.tgz", + "integrity": "sha1-Zws6+8VS4LVd9rd4DKdGFfI60cs=" + }, + "node_modules/wide-align": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/wide-align/-/wide-align-1.1.3.tgz", + "integrity": "sha1-rgdOa9wMFKQx6ATmJFScYzsABFc=", + "dependencies": { + "string-width": "^1.0.2 || 2" + } + }, + "node_modules/window-size": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/window-size/-/window-size-0.1.4.tgz", + "integrity": "sha1-+OGqHuWlPsW/FR/6CXQqatdpeHY=", + "dev": true, + "bin": { + "window-size": "cli.js" + }, + "engines": { + "node": ">= 0.10.0" + } + }, + "node_modules/windows-release": { + "version": "3.3.3", + "resolved": "https://registry.npmjs.org/windows-release/-/windows-release-3.3.3.tgz", + "integrity": "sha1-HBACfHIldD7sa4nfFg1kwuApOZk=", + "dev": true, + "dependencies": { + "execa": "^1.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/word-wrap": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.3.tgz", + "integrity": "sha512-Hz/mrNwitNRh/HUAtM/VT/5VH+ygD6DV7mYKZAtHOrbs8U7lvPS6xf7EJKMF0uW1KJCl0H701g3ZGus+muE5vQ==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/wordwrap": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-1.0.0.tgz", + "integrity": "sha1-J1hIEIkUVqQXHI0CJkQa3pDLyus=", + "dev": true + }, + "node_modules/workerpool": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/workerpool/-/workerpool-6.0.2.tgz", + "integrity": "sha512-DSNyvOpFKrNusaaUwk+ej6cBj1bmhLcBfj80elGk+ZIo5JSkq+unB1dLKEOcNfJDZgjGICfhQ0Q5TbP0PvF4+Q==", + "dev": true + }, + "node_modules/wrap-ansi": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-2.1.0.tgz", + "integrity": "sha1-2Pw9KE3QV5T+hJc8rs3Rz4JP3YU=", + "dev": true, + "dependencies": { + "string-width": "^1.0.1", + "strip-ansi": "^3.0.1" + }, + 
"engines": { + "node": ">=0.10.0" + } + }, + "node_modules/wrappy": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", + "integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=" + }, + "node_modules/y18n": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/y18n/-/y18n-3.2.1.tgz", + "integrity": "sha1-bRX7qITAhnnA136I53WegR4H+kE=", + "dev": true + }, + "node_modules/yallist": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz", + "integrity": "sha1-27fa+b/YusmrRev2ArjLrQ1dCP0=", + "dev": true + }, + "node_modules/yargs": { + "version": "3.32.0", + "resolved": "https://registry.npmjs.org/yargs/-/yargs-3.32.0.tgz", + "integrity": "sha1-AwiOnr+edWtpdRYR0qXvWRSCyZU=", + "dev": true, + "dependencies": { + "camelcase": "^2.0.1", + "cliui": "^3.0.3", + "decamelize": "^1.1.1", + "os-locale": "^1.4.0", + "string-width": "^1.0.1", + "window-size": "^0.1.4", + "y18n": "^3.2.0" + } + }, + "node_modules/yargs-parser": { + "version": "13.1.2", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-13.1.2.tgz", + "integrity": "sha1-Ew8JcC667vJlDVTObj5XBvek+zg=", + "dev": true, + "dependencies": { + "camelcase": "^5.0.0", + "decamelize": "^1.2.0" + } + }, + "node_modules/yargs-parser/node_modules/camelcase": { + "version": "5.3.1", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-5.3.1.tgz", + "integrity": "sha1-48mzFWnhBoEd8kL3FXJaH0xJQyA=", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/yargs-unparser": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/yargs-unparser/-/yargs-unparser-2.0.0.tgz", + "integrity": "sha512-7pRTIA9Qc1caZ0bZ6RYRGbHJthJWuakf+WmHK0rVeLkNrrGhfoabBNdue6kdINI6r4if7ocq9aD/n7xwKOdzOA==", + "dev": true, + "dependencies": { + "camelcase": "^6.0.0", + "decamelize": "^4.0.0", + "flat": "^5.0.2", + "is-plain-obj": "^2.1.0" + }, + "engines": { + "node": ">=10" + } + }, + 
"node_modules/yargs-unparser/node_modules/camelcase": { + "version": "6.2.0", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-6.2.0.tgz", + "integrity": "sha512-c7wVvbw3f37nuobQNtgsgG9POC9qMbNuMQmTCqZv23b6MIz0fcYpBiOlv9gEN/hdLdnZTDQhg6e9Dq5M1vKvfg==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/yargs-unparser/node_modules/decamelize": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/decamelize/-/decamelize-4.0.0.tgz", + "integrity": "sha512-9iE1PgSik9HeIIw2JO94IidnE3eBoQrFJ3w7sFuzSX4DpmZ3v5sZpUiV5Swcf6mQEF+Y0ru8Neo+p+nyh2J+hQ==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/yocto-queue": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-0.1.0.tgz", + "integrity": "sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + } + }, "dependencies": { "@babel/code-frame": { "version": "7.10.4", @@ -30,6 +5466,32 @@ "js-tokens": "^4.0.0" } }, + "@eslint/eslintrc": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/@eslint/eslintrc/-/eslintrc-0.3.0.tgz", + "integrity": "sha512-1JTKgrOKAHVivSvOYw+sJOunkBjUOvjqWk1DPja7ZFhIS2mX/4EgTT8M7eTK9jrKhL/FvXXEbQwIs3pg1xp3dg==", + "dev": true, + "requires": { + "ajv": "^6.12.4", + "debug": "^4.1.1", + "espree": "^7.3.0", + "globals": "^12.1.0", + "ignore": "^4.0.6", + "import-fresh": "^3.2.1", + "js-yaml": "^3.13.1", + "lodash": "^4.17.20", + "minimatch": "^3.0.4", + "strip-json-comments": "^3.1.1" + }, + "dependencies": { + "strip-json-comments": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-3.1.1.tgz", + "integrity": 
"sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==", + "dev": true + } + } + }, "@nodelib/fs.scandir": { "version": "2.1.3", "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.3.tgz", @@ -148,25 +5610,19 @@ "integrity": "sha1-p3c2C1s5oaLlEG+OhY8v0tBgxXA=", "dev": true }, - "@types/eslint-visitor-keys": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/@types/eslint-visitor-keys/-/eslint-visitor-keys-1.0.0.tgz", - "integrity": "sha1-HuMNeVRMqE1o1LPNsK9PIFZj3S0=", - "dev": true - }, "@types/fs-extra": { - "version": "8.1.1", - "resolved": "https://registry.npmjs.org/@types/fs-extra/-/fs-extra-8.1.1.tgz", - "integrity": "sha1-HknyLQmqRuGbUcCwE8tj0NkjoGg=", + "version": "9.0.6", + "resolved": "https://registry.npmjs.org/@types/fs-extra/-/fs-extra-9.0.6.tgz", + "integrity": "sha512-ecNRHw4clCkowNOBJH1e77nvbPxHYnWIXMv1IAoG/9+MYGkgoyr3Ppxr7XYFNL41V422EDhyV4/4SSK8L2mlig==", "dev": true, "requires": { "@types/node": "*" } }, "@types/json-schema": { - "version": "7.0.6", - "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.6.tgz", - "integrity": "sha1-9MfsQ+gbMZqYFRFQMXCfJph4kfA=", + "version": "7.0.7", + "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.7.tgz", + "integrity": "sha512-cxWFQVseBm6O9Gbw1IWb8r6OS4OhSt3hPZLkFApLjM8TEXROBuQGLAH2i2gZpcXdLBIrpXuTDhH7Vbm1iXmNGA==", "dev": true }, "@types/json5": { @@ -191,15 +5647,15 @@ "dev": true }, "@types/minimist": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/@types/minimist/-/minimist-1.2.0.tgz", - "integrity": "sha1-aaI6OtKcrwCX8G7aWbNh7i8GOfY=", + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/@types/minimist/-/minimist-1.2.1.tgz", + "integrity": "sha512-fZQQafSREFyuZcdWFAExYjBiCL7AUCdgsk80iO0q4yihYYdcIiH28CcuPTGFgLOCC8RlW49GSQxdHwZP+I7CNg==", "dev": true }, "@types/mocha": { - "version": "7.0.2", - "resolved": 
"https://registry.npmjs.org/@types/mocha/-/mocha-7.0.2.tgz", - "integrity": "sha1-sX8Wz5M1l+ENbXjq4yUeaSzosM4=", + "version": "8.2.0", + "resolved": "https://registry.npmjs.org/@types/mocha/-/mocha-8.2.0.tgz", + "integrity": "sha512-/Sge3BymXo4lKc31C8OINJgXLaw+7vL1/L1pGiBNpGrBiT8FQiaFpSYV0uhTaG4y78vcMBTMFsWaHDvuD+xGzQ==", "dev": true }, "@types/node": { @@ -209,84 +5665,168 @@ "dev": true }, "@types/tar-stream": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/@types/tar-stream/-/tar-stream-2.1.0.tgz", - "integrity": "sha1-iEscvmw1/0WcBaXrqGtAaAWUPvY=", + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@types/tar-stream/-/tar-stream-2.2.0.tgz", + "integrity": "sha512-sRTpT180sVigzD4SiCWJQQrqcdkWnmscWvx+cXvAoPtXbLFC5+QmKi2xwRcPe4iRu0GcVl1qTeJKUTS5hULfrw==", "dev": true, "requires": { "@types/node": "*" } }, "@typescript-eslint/eslint-plugin": { - "version": "2.34.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-2.34.0.tgz", - "integrity": "sha1-b4zopGx96kpvHRcdK7j7rm2sK+k=", + "version": "4.14.2", + "resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-4.14.2.tgz", + "integrity": "sha512-uMGfG7GFYK/nYutK/iqYJv6K/Xuog/vrRRZX9aEP4Zv1jsYXuvFUMDFLhUnc8WFv3D2R5QhNQL3VYKmvLS5zsQ==", "dev": true, "requires": { - "@typescript-eslint/experimental-utils": "2.34.0", + "@typescript-eslint/experimental-utils": "4.14.2", + "@typescript-eslint/scope-manager": "4.14.2", + "debug": "^4.1.1", "functional-red-black-tree": "^1.0.1", + "lodash": "^4.17.15", "regexpp": "^3.0.0", + "semver": "^7.3.2", "tsutils": "^3.17.1" + }, + "dependencies": { + "lru-cache": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz", + "integrity": "sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==", + "dev": true, + "requires": { + "yallist": "^4.0.0" + } + }, + "semver": { + "version": "7.3.4", + 
"resolved": "https://registry.npmjs.org/semver/-/semver-7.3.4.tgz", + "integrity": "sha512-tCfb2WLjqFAtXn4KEdxIhalnRtoKFN7nAwj0B3ZXCbQloV2tq5eDbcTmT68JJD3nRJq24/XgxtQKFIpQdtvmVw==", + "dev": true, + "requires": { + "lru-cache": "^6.0.0" + } + }, + "yallist": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz", + "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==", + "dev": true + } } }, "@typescript-eslint/experimental-utils": { - "version": "2.34.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/experimental-utils/-/experimental-utils-2.34.0.tgz", - "integrity": "sha1-01JLZEzbQO687KZ/jPPkzJyPmA8=", + "version": "4.14.2", + "resolved": "https://registry.npmjs.org/@typescript-eslint/experimental-utils/-/experimental-utils-4.14.2.tgz", + "integrity": "sha512-mV9pmET4C2y2WlyHmD+Iun8SAEqkLahHGBkGqDVslHkmoj3VnxnGP4ANlwuxxfq1BsKdl/MPieDbohCEQgKrwA==", "dev": true, "requires": { "@types/json-schema": "^7.0.3", - "@typescript-eslint/typescript-estree": "2.34.0", + "@typescript-eslint/scope-manager": "4.14.2", + "@typescript-eslint/types": "4.14.2", + "@typescript-eslint/typescript-estree": "4.14.2", "eslint-scope": "^5.0.0", "eslint-utils": "^2.0.0" } }, "@typescript-eslint/parser": { - "version": "2.34.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/parser/-/parser-2.34.0.tgz", - "integrity": "sha1-UCUmMMoxloVCDpo5ygX+GFola8g=", + "version": "4.14.2", + "resolved": "https://registry.npmjs.org/@typescript-eslint/parser/-/parser-4.14.2.tgz", + "integrity": "sha512-ipqSP6EuUsMu3E10EZIApOJgWSpcNXeKZaFeNKQyzqxnQl8eQCbV+TSNsl+s2GViX2d18m1rq3CWgnpOxDPgHg==", "dev": true, "requires": { - "@types/eslint-visitor-keys": "^1.0.0", - "@typescript-eslint/experimental-utils": "2.34.0", - "@typescript-eslint/typescript-estree": "2.34.0", - "eslint-visitor-keys": "^1.1.0" + "@typescript-eslint/scope-manager": "4.14.2", + "@typescript-eslint/types": 
"4.14.2", + "@typescript-eslint/typescript-estree": "4.14.2", + "debug": "^4.1.1" } }, - "@typescript-eslint/typescript-estree": { - "version": "2.34.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/typescript-estree/-/typescript-estree-2.34.0.tgz", - "integrity": "sha1-FK62NTs57wcyzH8bgoUpSTfPN9U=", + "@typescript-eslint/scope-manager": { + "version": "4.14.2", + "resolved": "https://registry.npmjs.org/@typescript-eslint/scope-manager/-/scope-manager-4.14.2.tgz", + "integrity": "sha512-cuV9wMrzKm6yIuV48aTPfIeqErt5xceTheAgk70N1V4/2Ecj+fhl34iro/vIssJlb7XtzcaD07hWk7Jk0nKghg==", "dev": true, "requires": { + "@typescript-eslint/types": "4.14.2", + "@typescript-eslint/visitor-keys": "4.14.2" + } + }, + "@typescript-eslint/types": { + "version": "4.14.2", + "resolved": "https://registry.npmjs.org/@typescript-eslint/types/-/types-4.14.2.tgz", + "integrity": "sha512-LltxawRW6wXy4Gck6ZKlBD05tCHQUj4KLn4iR69IyRiDHX3d3NCAhO+ix5OR2Q+q9bjCrHE/HKt+riZkd1At8Q==", + "dev": true + }, + "@typescript-eslint/typescript-estree": { + "version": "4.14.2", + "resolved": "https://registry.npmjs.org/@typescript-eslint/typescript-estree/-/typescript-estree-4.14.2.tgz", + "integrity": "sha512-ESiFl8afXxt1dNj8ENEZT12p+jl9PqRur+Y19m0Z/SPikGL6rqq4e7Me60SU9a2M28uz48/8yct97VQYaGl0Vg==", + "dev": true, + "requires": { + "@typescript-eslint/types": "4.14.2", + "@typescript-eslint/visitor-keys": "4.14.2", "debug": "^4.1.1", - "eslint-visitor-keys": "^1.1.0", - "glob": "^7.1.6", + "globby": "^11.0.1", "is-glob": "^4.0.1", "lodash": "^4.17.15", "semver": "^7.3.2", "tsutils": "^3.17.1" }, "dependencies": { + "lru-cache": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz", + "integrity": "sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==", + "dev": true, + "requires": { + "yallist": "^4.0.0" + } + }, "semver": { - "version": "7.3.2", - "resolved": 
"https://registry.npmjs.org/semver/-/semver-7.3.2.tgz", - "integrity": "sha1-YElisFK4HtB4aq6EOJ/7pw/9OTg=", + "version": "7.3.4", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.4.tgz", + "integrity": "sha512-tCfb2WLjqFAtXn4KEdxIhalnRtoKFN7nAwj0B3ZXCbQloV2tq5eDbcTmT68JJD3nRJq24/XgxtQKFIpQdtvmVw==", + "dev": true, + "requires": { + "lru-cache": "^6.0.0" + } + }, + "yallist": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz", + "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==", "dev": true } } }, + "@typescript-eslint/visitor-keys": { + "version": "4.14.2", + "resolved": "https://registry.npmjs.org/@typescript-eslint/visitor-keys/-/visitor-keys-4.14.2.tgz", + "integrity": "sha512-KBB+xLBxnBdTENs/rUgeUKO0UkPBRs2vD09oMRRIkj5BEN8PX1ToXV532desXfpQnZsYTyLLviS7JrPhdL154w==", + "dev": true, + "requires": { + "@typescript-eslint/types": "4.14.2", + "eslint-visitor-keys": "^2.0.0" + } + }, + "@ungap/promise-all-settled": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@ungap/promise-all-settled/-/promise-all-settled-1.1.2.tgz", + "integrity": "sha512-sL/cEvJWAnClXw0wHk85/2L0G6Sj8UB0Ctc1TEMbKSsmpRosqhwj9gWgFRZSrBr2f9tiXISwNhCPmlfqUqyb9Q==", + "dev": true + }, "acorn": { "version": "7.4.1", "resolved": "https://registry.npmjs.org/acorn/-/acorn-7.4.1.tgz", - "integrity": "sha1-/q7SVZc9LndVW4PbwIhRpsY1IPo=", + "integrity": "sha512-nQyp0o1/mNdbTO1PO6kHkwSrmgZ0MT/jCCpNiwbUjGoRN4dlBhqJtoQuCnEOKzgTVwg0ZWiCoQy6SxMebQVh8A==", "dev": true }, "acorn-jsx": { "version": "5.3.1", "resolved": "https://registry.npmjs.org/acorn-jsx/-/acorn-jsx-5.3.1.tgz", - "integrity": "sha1-/IZh4Rt6wVOcR9v+oucrOvNNJns=", - "dev": true + "integrity": "sha512-K0Ptm/47OKfQRpNQ2J/oIN/3QYiK6FwW+eJbILhsdxh2WTLdl+30o8aGdTbm5JbffpFFAg/g+zi1E+jvJha5ng==", + "dev": true, + "requires": {} }, "agent-base": { "version": "4.3.0", @@ -316,28 +5856,11 @@ "dev": true }, "ansi-colors": { - 
"version": "3.2.3", - "resolved": "https://registry.npmjs.org/ansi-colors/-/ansi-colors-3.2.3.tgz", - "integrity": "sha1-V9NbhoboUeLMBMQD8cACA5dqGBM=", + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/ansi-colors/-/ansi-colors-4.1.1.tgz", + "integrity": "sha512-JoX0apGbHaUJBNl6yF+p6JAFYZ666/hhCGKN5t9QFjbJQKUU/g8MNbFDbvfrgKXvI1QpZplPOnwIo99lX/AAmA==", "dev": true }, - "ansi-escapes": { - "version": "4.3.1", - "resolved": "https://registry.npmjs.org/ansi-escapes/-/ansi-escapes-4.3.1.tgz", - "integrity": "sha1-pcR8xDGB8fOP/XB2g3cA05VSKmE=", - "dev": true, - "requires": { - "type-fest": "^0.11.0" - }, - "dependencies": { - "type-fest": { - "version": "0.11.0", - "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.11.0.tgz", - "integrity": "sha1-l6vwhyMQ/tiKXEZrJWgVdhReM/E=", - "dev": true - } - } - }, "ansi-regex": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-2.1.1.tgz", @@ -355,7 +5878,7 @@ "anymatch": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.1.tgz", - "integrity": "sha1-xV7PAhheJGklk5kxDBc84xIzsUI=", + "integrity": "sha512-mM8522psRCqzV+6LhomX5wgp25YVibjh8Wj23I5RPkPppSVSjyKD2A2mBJmWGa+KN7f2D6LNh9jkBCeyLktzjg==", "dev": true, "requires": { "normalize-path": "^3.0.0", @@ -428,9 +5951,9 @@ "dev": true }, "astral-regex": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/astral-regex/-/astral-regex-1.0.0.tgz", - "integrity": "sha1-bIw/uCfdQ+45GPJ7gngqt2WKb9k=", + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/astral-regex/-/astral-regex-2.0.0.tgz", + "integrity": "sha512-Z7tMw1ytTXt5jqMcOP+OQteU1VuNK9Y02uuJtKQ1Sv69jXQKKg5cibLwGJow8yzZP+eAc18EmLGPal0bp36rvQ==", "dev": true }, "async": { @@ -506,9 +6029,9 @@ } }, "binary-extensions": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.1.0.tgz", - "integrity": "sha1-MPpAyef+B9vIlWeM0ocCTeokHdk=", + "version": "2.2.0", + 
"resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.2.0.tgz", + "integrity": "sha512-jDctJ/IVQbZoJykoeHbhXpOlNBqGNcwXJKJog42E5HDPUwQTSdjCHdihjj0DlnheQ7blbT6dHOafNAiS8ooQKA==", "dev": true }, "bl": { @@ -635,26 +6158,20 @@ "supports-color": "^5.3.0" } }, - "chardet": { - "version": "0.7.0", - "resolved": "https://registry.npmjs.org/chardet/-/chardet-0.7.0.tgz", - "integrity": "sha1-kAlISfCTfy7twkJdDSip5fDLrZ4=", - "dev": true - }, "chokidar": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.3.0.tgz", - "integrity": "sha1-EsBxRmjFWAD2WeJi1JYql/r1VKY=", + "version": "3.4.3", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.4.3.tgz", + "integrity": "sha512-DtM3g7juCXQxFVSNPNByEC2+NImtBuxQQvWlHunpJIS5Ocr0lG306cC7FCi7cEA0fzmybPUIl4txBIobk1gGOQ==", "dev": true, "requires": { "anymatch": "~3.1.1", "braces": "~3.0.2", - "fsevents": "~2.1.1", + "fsevents": "~2.1.2", "glob-parent": "~5.1.0", "is-binary-path": "~2.1.0", "is-glob": "~4.0.1", "normalize-path": "~3.0.0", - "readdirp": "~3.2.0" + "readdirp": "~3.5.0" } }, "chownr": { @@ -663,9 +6180,9 @@ "integrity": "sha1-b8nXtC0ypYNZYzdmbn0ICE2izGs=" }, "clang-format": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/clang-format/-/clang-format-1.4.0.tgz", - "integrity": "sha1-HuLxBjfrW7C9fQuCyUmvaOhINn4=", + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/clang-format/-/clang-format-1.5.0.tgz", + "integrity": "sha512-C1LucFX7E+ABVYcPEbBHM4PYQ2+WInXsqsLpFlQ9cmRfSbk7A7b1I06h/nE4bQ3MsyEkb31jY2gC0Dtc76b4IA==", "dev": true, "requires": { "async": "^1.5.2", @@ -673,21 +6190,6 @@ "resolve": "^1.1.6" } }, - "cli-cursor": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/cli-cursor/-/cli-cursor-3.1.0.tgz", - "integrity": "sha1-JkMFp65JDR0Dvwybp8kl0XU68wc=", - "dev": true, - "requires": { - "restore-cursor": "^3.1.0" - } - }, - "cli-width": { - "version": "3.0.0", - "resolved": 
"https://registry.npmjs.org/cli-width/-/cli-width-3.0.0.tgz", - "integrity": "sha1-ovSEN6LKqaIkNueUvwceyeYc7fY=", - "dev": true - }, "cliui": { "version": "3.2.0", "resolved": "https://registry.npmjs.org/cliui/-/cliui-3.2.0.tgz", @@ -789,6 +6291,12 @@ "integrity": "sha1-p9BVi9icQveV3UIyj3QIMcpTvCU=", "dev": true }, + "colors": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/colors/-/colors-1.4.0.tgz", + "integrity": "sha512-a+UqTh4kgZg/SlGvfbzDHpgRu7AAQOmmqRHJnxhRZICKFUT91brVhNNt58CMWU9PsBbv3PDCZUHbVxuDiH2mtA==", + "dev": true + }, "combined-stream": { "version": "1.0.8", "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", @@ -805,9 +6313,9 @@ "dev": true }, "comment-parser": { - "version": "0.7.6", - "resolved": "https://registry.npmjs.org/comment-parser/-/comment-parser-0.7.6.tgz", - "integrity": "sha1-DnQ6U8jmRsiZoTI9sx9s0zexDxI=", + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/comment-parser/-/comment-parser-1.1.1.tgz", + "integrity": "sha512-vue7cRi1ZO5/72FJ+wZ5+siTSBlUv3ZksTk8bWD2IkaA6obitzMZP3yI65azTJLckwmi8lxfPP5Sd9oGuZ8e2g==", "dev": true }, "concat-map": { @@ -855,9 +6363,9 @@ } }, "debug": { - "version": "4.2.0", - "resolved": "https://registry.npmjs.org/debug/-/debug-4.2.0.tgz", - "integrity": "sha1-fxUPk5IOlMWPVXTC/QGjEQ7/5/E=", + "version": "4.3.1", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.1.tgz", + "integrity": "sha512-doEwdvm4PCeK4K3RQN2ZC2BYUBaxwLARCqZmMjtF8a51J2Rb0xpVloFRnCODwqjpwnAoao4pelN8l3RJdv3gRQ==", "dev": true, "requires": { "ms": "2.1.2" @@ -914,9 +6422,9 @@ "integrity": "sha1-+hN8S9aY7fVc1c0CrFWfkaTEups=" }, "diff": { - "version": "3.5.0", - "resolved": "https://registry.npmjs.org/diff/-/diff-3.5.0.tgz", - "integrity": "sha1-gAwN0eCov7yVg1wgKtIg/jF+WhI=", + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/diff/-/diff-4.0.2.tgz", + "integrity": 
"sha512-58lmxKSA4BNyLz+HHMUzlOEpg09FV+ev6ZMe3vJihgdxzgcwZ8VoEEPmALCZG9LmqfVoNMMKpttIYTVG6uDY7A==", "dev": true }, "dir-glob": { @@ -967,7 +6475,7 @@ "emoji-regex": { "version": "8.0.0", "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", - "integrity": "sha1-6Bj9ac5cz8tARZT4QpY79TFkzDc=", + "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", "dev": true }, "end-of-stream": { @@ -978,6 +6486,15 @@ "once": "^1.4.0" } }, + "enquirer": { + "version": "2.3.6", + "resolved": "https://registry.npmjs.org/enquirer/-/enquirer-2.3.6.tgz", + "integrity": "sha512-yjNnPr315/FjS4zIsUxYguYUPP2e1NK4d7E7ZOLiyYCcbFBiTMyID+2wvm2w6+pZ/odMA7cRkjhsPbltwBOrLg==", + "dev": true, + "requires": { + "ansi-colors": "^4.1.1" + } + }, "error-ex": { "version": "1.3.2", "resolved": "https://registry.npmjs.org/error-ex/-/error-ex-1.3.2.tgz", @@ -1039,84 +6556,153 @@ "dev": true }, "eslint": { - "version": "6.8.0", - "resolved": "https://registry.npmjs.org/eslint/-/eslint-6.8.0.tgz", - "integrity": "sha1-YiYtZylzn5J1cjgkMC+yJ8jJP/s=", + "version": "7.19.0", + "resolved": "https://registry.npmjs.org/eslint/-/eslint-7.19.0.tgz", + "integrity": "sha512-CGlMgJY56JZ9ZSYhJuhow61lMPPjUzWmChFya71Z/jilVos7mR/jPgaEfVGgMBY5DshbKdG8Ezb8FDCHcoMEMg==", "dev": true, "requires": { "@babel/code-frame": "^7.0.0", + "@eslint/eslintrc": "^0.3.0", "ajv": "^6.10.0", - "chalk": "^2.1.0", - "cross-spawn": "^6.0.5", + "chalk": "^4.0.0", + "cross-spawn": "^7.0.2", "debug": "^4.0.1", "doctrine": "^3.0.0", - "eslint-scope": "^5.0.0", - "eslint-utils": "^1.4.3", - "eslint-visitor-keys": "^1.1.0", - "espree": "^6.1.2", - "esquery": "^1.0.1", + "enquirer": "^2.3.5", + "eslint-scope": "^5.1.1", + "eslint-utils": "^2.1.0", + "eslint-visitor-keys": "^2.0.0", + "espree": "^7.3.1", + "esquery": "^1.2.0", "esutils": "^2.0.2", - "file-entry-cache": "^5.0.1", + "file-entry-cache": "^6.0.0", "functional-red-black-tree": "^1.0.1", "glob-parent": "^5.0.0", 
"globals": "^12.1.0", "ignore": "^4.0.6", "import-fresh": "^3.0.0", "imurmurhash": "^0.1.4", - "inquirer": "^7.0.0", "is-glob": "^4.0.0", "js-yaml": "^3.13.1", "json-stable-stringify-without-jsonify": "^1.0.1", - "levn": "^0.3.0", - "lodash": "^4.17.14", + "levn": "^0.4.1", + "lodash": "^4.17.20", "minimatch": "^3.0.4", - "mkdirp": "^0.5.1", "natural-compare": "^1.4.0", - "optionator": "^0.8.3", + "optionator": "^0.9.1", "progress": "^2.0.0", - "regexpp": "^2.0.1", - "semver": "^6.1.2", - "strip-ansi": "^5.2.0", - "strip-json-comments": "^3.0.1", - "table": "^5.2.3", + "regexpp": "^3.1.0", + "semver": "^7.2.1", + "strip-ansi": "^6.0.0", + "strip-json-comments": "^3.1.0", + "table": "^6.0.4", "text-table": "^0.2.0", "v8-compile-cache": "^2.0.3" }, "dependencies": { "ansi-regex": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-4.1.0.tgz", - "integrity": "sha1-i5+PCM8ay4Q3Vqg5yox+MWjFGZc=", + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.0.tgz", + "integrity": "sha512-bY6fj56OUQ0hU1KjFNDQuJFezqKdrAyFdIevADiqrWHwSlbmBNMHp5ak2f40Pm8JTFyM2mqxkG6ngkHO11f/lg==", "dev": true }, - "eslint-utils": { - "version": "1.4.3", - "resolved": "https://registry.npmjs.org/eslint-utils/-/eslint-utils-1.4.3.tgz", - "integrity": "sha1-dP7HxU0Hdrb2fgJRBAtYBlZOmB8=", + "ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", "dev": true, "requires": { - "eslint-visitor-keys": "^1.1.0" + "color-convert": "^2.0.1" } }, - "regexpp": { + "chalk": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.0.tgz", + "integrity": "sha512-qwx12AxXe2Q5xQ43Ac//I6v5aXTipYrSESdOgzrN+9XjgEpyjpKuvSGaN4qE93f7TQTlerQQ8S+EQ0EyDoVL1A==", + "dev": true, + "requires": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + } 
+ }, + "color-convert": { "version": "2.0.1", - "resolved": "https://registry.npmjs.org/regexpp/-/regexpp-2.0.1.tgz", - "integrity": "sha1-jRnTHPYySCtYkEn4KB+T28uk0H8=", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "requires": { + "color-name": "~1.1.4" + } + }, + "color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "cross-spawn": { + "version": "7.0.3", + "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz", + "integrity": "sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==", + "dev": true, + "requires": { + "path-key": "^3.1.0", + "shebang-command": "^2.0.0", + "which": "^2.0.1" + } + }, + "has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true + }, + "lru-cache": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz", + "integrity": "sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==", + "dev": true, + "requires": { + "yallist": "^4.0.0" + } + }, + "path-key": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/path-key/-/path-key-3.1.1.tgz", + "integrity": "sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==", "dev": true }, "semver": { - "version": "6.3.0", - "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz", - "integrity": "sha1-7gpkyK9ejO6mdoexM3YeG+y9HT0=", + "version": "7.3.4", + 
"resolved": "https://registry.npmjs.org/semver/-/semver-7.3.4.tgz", + "integrity": "sha512-tCfb2WLjqFAtXn4KEdxIhalnRtoKFN7nAwj0B3ZXCbQloV2tq5eDbcTmT68JJD3nRJq24/XgxtQKFIpQdtvmVw==", + "dev": true, + "requires": { + "lru-cache": "^6.0.0" + } + }, + "shebang-command": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz", + "integrity": "sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA==", + "dev": true, + "requires": { + "shebang-regex": "^3.0.0" + } + }, + "shebang-regex": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/shebang-regex/-/shebang-regex-3.0.0.tgz", + "integrity": "sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A==", "dev": true }, "strip-ansi": { - "version": "5.2.0", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-5.2.0.tgz", - "integrity": "sha1-jJpTb+tq/JYr36WxBKUJHBrZwK4=", + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.0.tgz", + "integrity": "sha512-AuvKTrTfQNYNIctbR1K/YGTR1756GycPsg7b9bdV9Duqur4gv6aKqHXah67Z8ImS7WEz5QVcOtlfW2rZEugt6w==", "dev": true, "requires": { - "ansi-regex": "^4.1.0" + "ansi-regex": "^5.0.0" } }, "strip-json-comments": { @@ -1124,6 +6710,30 @@ "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-3.1.1.tgz", "integrity": "sha1-MfEoGzgyYwQ0gxwxDAHMzajL4AY=", "dev": true + }, + "supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "requires": { + "has-flag": "^4.0.0" + } + }, + "which": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", + "integrity": 
"sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==", + "dev": true, + "requires": { + "isexe": "^2.0.0" + } + }, + "yallist": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz", + "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==", + "dev": true } } }, @@ -1230,33 +6840,52 @@ } }, "eslint-plugin-jsdoc": { - "version": "24.0.6", - "resolved": "https://registry.npmjs.org/eslint-plugin-jsdoc/-/eslint-plugin-jsdoc-24.0.6.tgz", - "integrity": "sha1-Ng91t7eaZOtvBy3p83MiWIV4q/A=", + "version": "31.6.0", + "resolved": "https://registry.npmjs.org/eslint-plugin-jsdoc/-/eslint-plugin-jsdoc-31.6.0.tgz", + "integrity": "sha512-kYhdW+BXHij9n12oHvAC27oDHKEFITz1YJP/C0NPtb+gsGJWxejh5B6dEmmj6oLYOsmNvuCVkdIcqYOyabP2QA==", "dev": true, "requires": { - "comment-parser": "^0.7.4", - "debug": "^4.1.1", - "jsdoctypeparser": "^6.1.0", - "lodash": "^4.17.15", - "regextras": "^0.7.0", - "semver": "^6.3.0", - "spdx-expression-parse": "^3.0.0" + "comment-parser": "1.1.1", + "debug": "^4.3.1", + "jsdoctypeparser": "^9.0.0", + "lodash": "^4.17.20", + "regextras": "^0.7.1", + "semver": "^7.3.4", + "spdx-expression-parse": "^3.0.1" }, "dependencies": { + "lru-cache": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz", + "integrity": "sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==", + "dev": true, + "requires": { + "yallist": "^4.0.0" + } + }, "semver": { - "version": "6.3.0", - "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.0.tgz", - "integrity": "sha1-7gpkyK9ejO6mdoexM3YeG+y9HT0=", + "version": "7.3.4", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.4.tgz", + "integrity": "sha512-tCfb2WLjqFAtXn4KEdxIhalnRtoKFN7nAwj0B3ZXCbQloV2tq5eDbcTmT68JJD3nRJq24/XgxtQKFIpQdtvmVw==", + "dev": true, + "requires": { + "lru-cache": "^6.0.0" + } 
+ }, + "yallist": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz", + "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==", "dev": true } } }, "eslint-plugin-prefer-arrow": { - "version": "1.2.2", - "resolved": "https://registry.npmjs.org/eslint-plugin-prefer-arrow/-/eslint-plugin-prefer-arrow-1.2.2.tgz", - "integrity": "sha1-DG0lprlMs+ARCiPRKXYK9YYO224=", - "dev": true + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/eslint-plugin-prefer-arrow/-/eslint-plugin-prefer-arrow-1.2.3.tgz", + "integrity": "sha512-J9I5PKCOJretVuiZRGvPQxCbllxGAV/viI20JO3LYblAodofBxyMnZAJ+WGeClHgANnSJberTNoFWWjrWKBuXQ==", + "dev": true, + "requires": {} }, "eslint-scope": { "version": "5.1.1", @@ -1271,27 +6900,43 @@ "eslint-utils": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/eslint-utils/-/eslint-utils-2.1.0.tgz", - "integrity": "sha1-0t5eA0JOcH3BDHQGjd7a5wh0Gyc=", + "integrity": "sha512-w94dQYoauyvlDc43XnGB8lU3Zt713vNChgt4EWwhXAP2XkBvndfxF0AgIqKOOasjPIPzj9JqgwkwbCYD0/V3Zg==", "dev": true, "requires": { "eslint-visitor-keys": "^1.1.0" + }, + "dependencies": { + "eslint-visitor-keys": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-1.3.0.tgz", + "integrity": "sha512-6J72N8UNa462wa/KFODt/PJ3IU60SDpC3QXC1Hjc1BXXpfL2C9R5+AU7jhe0F6GREqVMh4Juu+NY7xn+6dipUQ==", + "dev": true + } } }, "eslint-visitor-keys": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-1.3.0.tgz", - "integrity": "sha1-MOvR73wv3/AcOk8VEESvJfqwUj4=", + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-2.0.0.tgz", + "integrity": "sha512-QudtT6av5WXels9WjIM7qz1XD1cWGvX4gGXvp/zBn9nXG02D0utdU3Em2m/QjTnrsk6bBjmCygl3rmj118msQQ==", "dev": true }, "espree": { - "version": "6.2.1", - "resolved": 
"https://registry.npmjs.org/espree/-/espree-6.2.1.tgz", - "integrity": "sha1-d/xy4f10SiBSwg84pbV1gy6Cc0o=", + "version": "7.3.1", + "resolved": "https://registry.npmjs.org/espree/-/espree-7.3.1.tgz", + "integrity": "sha512-v3JCNCE64umkFpmkFGqzVKsOT0tN1Zr+ueqLZfpV1Ob8e+CEgPWa+OxCoGH3tnhimMKIaBm4m/vaRpJ/krRz2g==", "dev": true, "requires": { - "acorn": "^7.1.1", - "acorn-jsx": "^5.2.0", - "eslint-visitor-keys": "^1.1.0" + "acorn": "^7.4.0", + "acorn-jsx": "^5.3.1", + "eslint-visitor-keys": "^1.3.0" + }, + "dependencies": { + "eslint-visitor-keys": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-1.3.0.tgz", + "integrity": "sha512-6J72N8UNa462wa/KFODt/PJ3IU60SDpC3QXC1Hjc1BXXpfL2C9R5+AU7jhe0F6GREqVMh4Juu+NY7xn+6dipUQ==", + "dev": true + } } }, "esprima": { @@ -1372,17 +7017,6 @@ "integrity": "sha1-+LETa0Bx+9jrFAr/hYsQGewpFfo=", "dev": true }, - "external-editor": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/external-editor/-/external-editor-3.1.0.tgz", - "integrity": "sha1-ywP3QL764D6k0oPK7SdBqD8zVJU=", - "dev": true, - "requires": { - "chardet": "^0.7.0", - "iconv-lite": "^0.4.24", - "tmp": "^0.0.33" - } - }, "extsprintf": { "version": "1.3.0", "resolved": "https://registry.npmjs.org/extsprintf/-/extsprintf-1.3.0.tgz", @@ -1436,22 +7070,13 @@ "reusify": "^1.0.4" } }, - "figures": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/figures/-/figures-3.2.0.tgz", - "integrity": "sha1-YlwYvSk8YE3EqN2y/r8MiDQXRq8=", - "dev": true, - "requires": { - "escape-string-regexp": "^1.0.5" - } - }, "file-entry-cache": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-5.0.1.tgz", - "integrity": "sha1-yg9u+m3T1WEzP7FFFQZcL6/fQ5w=", + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-6.0.0.tgz", + "integrity": 
"sha512-fqoO76jZ3ZnYrXLDRxBR1YvOvc0k844kcOg40bgsPrE25LAb/PDqTY+ho64Xh2c8ZXgIKldchCFHczG2UVRcWA==", "dev": true, "requires": { - "flat-cache": "^2.0.1" + "flat-cache": "^3.0.4" } }, "fill-range": { @@ -1473,29 +7098,25 @@ } }, "flat": { - "version": "4.1.1", - "resolved": "https://registry.npmjs.org/flat/-/flat-4.1.1.tgz", - "integrity": "sha1-o5IFnMOCiB/5hkL12k3eCpWfMJs=", - "dev": true, - "requires": { - "is-buffer": "~2.0.3" - } + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/flat/-/flat-5.0.2.tgz", + "integrity": "sha512-b6suED+5/3rTpUBdG1gupIl8MPFCAMA0QXwmljLhvCUKcUvdE4gWky9zpuGCcXHOsz4J9wPGNWq6OKpmIzz3hQ==", + "dev": true }, "flat-cache": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/flat-cache/-/flat-cache-2.0.1.tgz", - "integrity": "sha1-XSltbwS9pEpGMKMBQTvbwuwIXsA=", + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/flat-cache/-/flat-cache-3.0.4.tgz", + "integrity": "sha512-dm9s5Pw7Jc0GvMYbshN6zchCA9RgQlzzEZX3vylR9IqFfS8XciblUXOKfW6SiuJ0e13eDYZoZV5wdrev7P3Nwg==", "dev": true, "requires": { - "flatted": "^2.0.0", - "rimraf": "2.6.3", - "write": "1.0.3" + "flatted": "^3.1.0", + "rimraf": "^3.0.2" }, "dependencies": { "rimraf": { - "version": "2.6.3", - "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.3.tgz", - "integrity": "sha1-stEE/g2Psnz54KHNqCYt04M8bKs=", + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz", + "integrity": "sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==", "dev": true, "requires": { "glob": "^7.1.3" @@ -1504,9 +7125,9 @@ } }, "flatted": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/flatted/-/flatted-2.0.2.tgz", - "integrity": "sha1-RXWyHivO50NKqb5mL0t7X5wrUTg=", + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.1.1.tgz", + "integrity": "sha512-zAoAQiudy+r5SvnSw3KJy5os/oRJYHzrzja/tBDqrZtNhUw8bt6y8OBzMWcjWr+8liV8Eb6yOhw8WZ7VFZ5ZzA==", "dev": 
true }, "forever-agent": { @@ -1532,15 +7153,15 @@ "integrity": "sha1-a+Dem+mYzhavivwkSXue6bfM2a0=" }, "fs-extra": { - "version": "9.0.1", - "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-9.0.1.tgz", - "integrity": "sha1-kQ2gBiQ3ukw5/t2GPxZ1zP78ufw=", + "version": "9.1.0", + "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-9.1.0.tgz", + "integrity": "sha512-hcg3ZmepS30/7BSFqRvoo3DOMQu7IjqxO5nCDt+zM9XWjb33Wg7ziNT+Qvqbuc3+gWpzO02JubVyk2G4Zvo1OQ==", "dev": true, "requires": { "at-least-node": "^1.0.0", "graceful-fs": "^4.2.0", "jsonfile": "^6.0.1", - "universalify": "^1.0.0" + "universalify": "^2.0.0" }, "dependencies": { "jsonfile": { @@ -1551,12 +7172,20 @@ "requires": { "graceful-fs": "^4.1.6", "universalify": "^1.0.0" + }, + "dependencies": { + "universalify": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/universalify/-/universalify-1.0.0.tgz", + "integrity": "sha512-rb6X1W158d7pRQBg5gkR8uPaSfiids68LTJQYOtEUhoJUWBdaQHsuT/EUduxXYxcrt4r5PJ4fuHW1MHT6p0qug==", + "dev": true + } } }, "universalify": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/universalify/-/universalify-1.0.0.tgz", - "integrity": "sha1-thodoXPoQ1sv48Z9Kbmt+FlL0W0=", + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/universalify/-/universalify-2.0.0.tgz", + "integrity": "sha512-hAZsKq7Yy11Zu1DE0OzWjw7nnLZmJZYTDZZyEFHZdUhV8FkH5MCfoU1XMaxXovpyW5nq5scPqq0ZDP9Zyl04oQ==", "dev": true } } @@ -1684,9 +7313,9 @@ } }, "globby": { - "version": "11.0.1", - "resolved": "https://registry.npmjs.org/globby/-/globby-11.0.1.tgz", - "integrity": "sha1-mivxB6Bo8//qvEmtcCx57ejP01c=", + "version": "11.0.2", + "resolved": "https://registry.npmjs.org/globby/-/globby-11.0.2.tgz", + "integrity": "sha512-2ZThXDvvV8fYFRVIxnrMQBipZQDr7MxKAmQK1vujaj9/7eF0efG7BPUKJ7jP7G5SLF37xKDXvO4S/KKLj/Z0og==", "dev": true, "requires": { "array-union": "^2.1.0", @@ -1778,12 +7407,6 @@ "integrity": "sha1-hK5l+n6vsWX922FWauFLrwVmTw8=", "dev": true }, - "highlight.js": { 
- "version": "10.4.1", - "resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-10.4.1.tgz", - "integrity": "sha512-yR5lWvNz7c85OhVAEAeFhVCc/GV4C30Fjzc/rCP0aCWzc1UUOPUk55dK/qdwTZHBvMZo+eZ2jpk62ndX/xMFlg==", - "dev": true - }, "hosted-git-info": { "version": "2.8.8", "resolved": "https://registry.npmjs.org/hosted-git-info/-/hosted-git-info-2.8.8.tgz", @@ -1849,15 +7472,6 @@ } } }, - "iconv-lite": { - "version": "0.4.24", - "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz", - "integrity": "sha1-ICK0sl+93CHS9SSXSkdKr+czkIs=", - "dev": true, - "requires": { - "safer-buffer": ">= 2.1.2 < 3" - } - }, "ieee754": { "version": "1.1.13", "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.1.13.tgz", @@ -1905,110 +7519,6 @@ "resolved": "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz", "integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==" }, - "inquirer": { - "version": "7.3.3", - "resolved": "https://registry.npmjs.org/inquirer/-/inquirer-7.3.3.tgz", - "integrity": "sha1-BNF2sq8Er8FXqD/XwQDpjuCq0AM=", - "dev": true, - "requires": { - "ansi-escapes": "^4.2.1", - "chalk": "^4.1.0", - "cli-cursor": "^3.1.0", - "cli-width": "^3.0.0", - "external-editor": "^3.0.3", - "figures": "^3.0.0", - "lodash": "^4.17.19", - "mute-stream": "0.0.8", - "run-async": "^2.4.0", - "rxjs": "^6.6.0", - "string-width": "^4.1.0", - "strip-ansi": "^6.0.0", - "through": "^2.3.6" - }, - "dependencies": { - "ansi-regex": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.0.tgz", - "integrity": "sha1-OIU59VF5vzkznIGvMKZU1p+Hy3U=", - "dev": true - }, - "ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha1-7dgDYornHATIWuegkG7a00tkiTc=", - "dev": true, - "requires": { - "color-convert": "^2.0.1" - } - }, - "chalk": { - "version": "4.1.0", - "resolved": 
"https://registry.npmjs.org/chalk/-/chalk-4.1.0.tgz", - "integrity": "sha1-ThSHCmGNni7dl92DRf2dncMVZGo=", - "dev": true, - "requires": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" - } - }, - "color-convert": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", - "integrity": "sha1-ctOmjVmMm9s68q0ehPIdiWq9TeM=", - "dev": true, - "requires": { - "color-name": "~1.1.4" - } - }, - "color-name": { - "version": "1.1.4", - "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", - "integrity": "sha1-wqCah6y95pVD3m9j+jmVyCbFNqI=", - "dev": true - }, - "has-flag": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", - "integrity": "sha1-lEdx/ZyByBJlxNaUGGDaBrtZR5s=", - "dev": true - }, - "is-fullwidth-code-point": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", - "integrity": "sha1-8Rb4Bk/pCz94RKOJl8C3UFEmnx0=", - "dev": true - }, - "string-width": { - "version": "4.2.0", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.0.tgz", - "integrity": "sha1-lSGCxGzHssMT0VluYjmSvRY7crU=", - "dev": true, - "requires": { - "emoji-regex": "^8.0.0", - "is-fullwidth-code-point": "^3.0.0", - "strip-ansi": "^6.0.0" - } - }, - "strip-ansi": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.0.tgz", - "integrity": "sha1-CxVx3XZpzNTz4G4U7x7tJiJa5TI=", - "dev": true, - "requires": { - "ansi-regex": "^5.0.0" - } - }, - "supports-color": { - "version": "7.2.0", - "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", - "integrity": "sha1-G33NyzK4E4gBs+R4umpRyqiWSNo=", - "dev": true, - "requires": { - "has-flag": "^4.0.0" - } - } - } - }, "interpret": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/interpret/-/interpret-1.4.0.tgz", @@ -2030,18 +7540,12 @@ "is-binary-path": { 
"version": "2.1.0", "resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-2.1.0.tgz", - "integrity": "sha1-6h9/O4DwZCNug0cPhsCcJU+0Wwk=", + "integrity": "sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw==", "dev": true, "requires": { "binary-extensions": "^2.0.0" } }, - "is-buffer": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-2.0.4.tgz", - "integrity": "sha1-PlcvI8hBGlz9lVfISeNmXgspBiM=", - "dev": true - }, "is-callable": { "version": "1.2.2", "resolved": "https://registry.npmjs.org/is-callable/-/is-callable-1.2.2.tgz", @@ -2095,6 +7599,12 @@ "integrity": "sha1-dTU0W4lnNNX4DE0GxQlVUnoU8Ss=", "dev": true }, + "is-plain-obj": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-plain-obj/-/is-plain-obj-2.1.0.tgz", + "integrity": "sha512-YWnfyRwxL/+SsrWYfOpUtz5b3YD+nyfkHvjbcanzk8zgyO4ASD67uVMRt8k5bM4lLMDnXfriRhOpemw+NfT1eA==", + "dev": true + }, "is-regex": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.1.1.tgz", @@ -2171,9 +7681,9 @@ "dev": true }, "jsdoctypeparser": { - "version": "6.1.0", - "resolved": "https://registry.npmjs.org/jsdoctypeparser/-/jsdoctypeparser-6.1.0.tgz", - "integrity": "sha1-rPuTbCYwDZjxQFywPiCwZ0jlEqg=", + "version": "9.0.0", + "resolved": "https://registry.npmjs.org/jsdoctypeparser/-/jsdoctypeparser-9.0.0.tgz", + "integrity": "sha512-jrTA2jJIL6/DAEILBEh2/w9QxCuwmvNXIry39Ay/HVfhE3o2yVV0U44blYkqdHA/OKloJEqvJy0xU+GSdE2SIw==", "dev": true }, "json-parse-better-errors": { @@ -2293,13 +7803,13 @@ } }, "levn": { - "version": "0.3.0", - "resolved": "https://registry.npmjs.org/levn/-/levn-0.3.0.tgz", - "integrity": "sha1-OwmSTt+fCDwEkP3UwLxEIeBHZO4=", + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/levn/-/levn-0.4.1.tgz", + "integrity": "sha512-+bT2uH4E5LGE7h/n3evcS/sQlJXCpIp6ym8OWJ5eV6+67Dsql/LaaT7qJBAt2rzfoa/5QBGBhxDix1dMt2kQKQ==", "dev": true, "requires": { - "prelude-ls": 
"~1.1.2", - "type-check": "~0.3.2" + "prelude-ls": "^1.2.1", + "type-check": "~0.4.0" } }, "listenercount": { @@ -2355,12 +7865,63 @@ "dev": true }, "log-symbols": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/log-symbols/-/log-symbols-3.0.0.tgz", - "integrity": "sha1-86CFFqXeqJMzan3uFNGKHP2rd8Q=", + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/log-symbols/-/log-symbols-4.0.0.tgz", + "integrity": "sha512-FN8JBzLx6CzeMrB0tg6pqlGU1wCrXW+ZXGH481kfsBqer0hToTIiHdjH4Mq8xJUbvATujKCvaREGWpGUionraA==", "dev": true, "requires": { - "chalk": "^2.4.2" + "chalk": "^4.0.0" + }, + "dependencies": { + "ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "requires": { + "color-convert": "^2.0.1" + } + }, + "chalk": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.0.tgz", + "integrity": "sha512-qwx12AxXe2Q5xQ43Ac//I6v5aXTipYrSESdOgzrN+9XjgEpyjpKuvSGaN4qE93f7TQTlerQQ8S+EQ0EyDoVL1A==", + "dev": true, + "requires": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + } + }, + "color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "requires": { + "color-name": "~1.1.4" + } + }, + "color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": 
"sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true + }, + "supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "requires": { + "has-flag": "^4.0.0" + } + } } }, "long": { @@ -2369,10 +7930,19 @@ "integrity": "sha1-mntxz7fTYaGU6lVSQckvdGjVvyg=", "dev": true }, + "lru-cache": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", + "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==", + "dev": true, + "requires": { + "yallist": "^3.0.2" + } + }, "lunr": { "version": "2.3.9", "resolved": "https://registry.npmjs.org/lunr/-/lunr-2.3.9.tgz", - "integrity": "sha1-GLEjFCgyM33W6WTfGlp3B7JdNeE=", + "integrity": "sha512-zTU3DaZaF3Rt9rhN3uBMGQD3dD2/vFQqnvZCDv4dl5iOzq2IZQqTxu90r4E5J+nP70J3ilqVCrbho2eWaeW8Ow==", "dev": true }, "macos-release": { @@ -2382,9 +7952,9 @@ "dev": true }, "marked": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/marked/-/marked-1.0.0.tgz", - "integrity": "sha1-01eEJFoEhx5ZiKSR4ohnNi6UFpM=", + "version": "1.2.9", + "resolved": "https://registry.npmjs.org/marked/-/marked-1.2.9.tgz", + "integrity": "sha512-H8lIX2SvyitGX+TRdtS06m1jHMijKN/XjfH6Ooii9fvxMlh8QdqBfBDkGUpMWH2kQNrtixjzYUa3SH8ROTgRRw==", "dev": true }, "memory-stream": { @@ -2453,12 +8023,6 @@ "mime-db": "1.44.0" } }, - "mimic-fn": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/mimic-fn/-/mimic-fn-2.1.0.tgz", - "integrity": "sha1-ftLCzMyvhNP/y3pptXcR/CCDQBs=", - "dev": true - }, "mimic-response": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-2.1.0.tgz", @@ -2501,6 +8065,7 @@ "version": "0.5.5", "resolved": 
"https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.5.tgz", "integrity": "sha1-2Rzv1i0UNsoPQWIOJRKI1CAJne8=", + "dev": true, "requires": { "minimist": "^1.2.5" } @@ -2511,35 +8076,36 @@ "integrity": "sha1-+hDJEVzG2IZb4iG6R+6b7XhgERM=" }, "mocha": { - "version": "7.2.0", - "resolved": "https://registry.npmjs.org/mocha/-/mocha-7.2.0.tgz", - "integrity": "sha1-AcwiewDYdase7QOnUQZonP7VpgQ=", + "version": "8.2.1", + "resolved": "https://registry.npmjs.org/mocha/-/mocha-8.2.1.tgz", + "integrity": "sha512-cuLBVfyFfFqbNR0uUKbDGXKGk+UDFe6aR4os78XIrMQpZl/nv7JYHcvP5MFIAb374b2zFXsdgEGwmzMtP0Xg8w==", "dev": true, "requires": { - "ansi-colors": "3.2.3", + "@ungap/promise-all-settled": "1.1.2", + "ansi-colors": "4.1.1", "browser-stdout": "1.3.1", - "chokidar": "3.3.0", - "debug": "3.2.6", - "diff": "3.5.0", - "escape-string-regexp": "1.0.5", - "find-up": "3.0.0", - "glob": "7.1.3", + "chokidar": "3.4.3", + "debug": "4.2.0", + "diff": "4.0.2", + "escape-string-regexp": "4.0.0", + "find-up": "5.0.0", + "glob": "7.1.6", "growl": "1.10.5", "he": "1.2.0", - "js-yaml": "3.13.1", - "log-symbols": "3.0.0", + "js-yaml": "3.14.0", + "log-symbols": "4.0.0", "minimatch": "3.0.4", - "mkdirp": "0.5.5", - "ms": "2.1.1", - "node-environment-flags": "1.0.6", - "object.assign": "4.1.0", - "strip-json-comments": "2.0.1", - "supports-color": "6.0.0", - "which": "1.3.1", + "ms": "2.1.2", + "nanoid": "3.1.12", + "serialize-javascript": "5.0.1", + "strip-json-comments": "3.1.1", + "supports-color": "7.2.0", + "which": "2.0.2", "wide-align": "1.1.3", + "workerpool": "6.0.2", "yargs": "13.3.2", "yargs-parser": "13.1.2", - "yargs-unparser": "1.6.0" + "yargs-unparser": "2.0.0" }, "dependencies": { "ansi-regex": { @@ -2560,12 +8126,12 @@ } }, "debug": { - "version": "3.2.6", - "resolved": "https://registry.npmjs.org/debug/-/debug-3.2.6.tgz", - "integrity": "sha1-6D0X3hbYp++3cX7b5fsQE17uYps=", + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.2.0.tgz", + "integrity": 
"sha512-IX2ncY78vDTjZMFUdmsvIRFY2Cf4FnD0wRs+nQwJU8Lu99/tPFdb0VybiiMTPe3I6rQmwsqQqRBvxU+bZ/I8sg==", "dev": true, "requires": { - "ms": "^2.1.1" + "ms": "2.1.2" } }, "emoji-regex": { @@ -2574,28 +8140,27 @@ "integrity": "sha1-kzoEBShgyF6DwSJHnEdIqOTHIVY=", "dev": true }, + "escape-string-regexp": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz", + "integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==", + "dev": true + }, "find-up": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/find-up/-/find-up-3.0.0.tgz", - "integrity": "sha1-SRafHXmTQwZG2mHsxa41XCHJe3M=", + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-5.0.0.tgz", + "integrity": "sha512-78/PXT1wlLLDgTzDs7sjq9hzz0vXD+zn+7wypEe4fXQxCmdmqfGsEPQxmiCSQI3ajFV91bVSsvNtrJRiW6nGng==", "dev": true, "requires": { - "locate-path": "^3.0.0" + "locate-path": "^6.0.0", + "path-exists": "^4.0.0" } }, - "glob": { - "version": "7.1.3", - "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.3.tgz", - "integrity": "sha1-OWCDLT8VdBCDQtr9OmezMsCWnfE=", - "dev": true, - "requires": { - "fs.realpath": "^1.0.0", - "inflight": "^1.0.4", - "inherits": "2", - "minimatch": "^3.0.4", - "once": "^1.3.0", - "path-is-absolute": "^1.0.0" - } + "has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true }, "is-fullwidth-code-point": { "version": "2.0.0", @@ -2603,66 +8168,43 @@ "integrity": "sha1-o7MKXE8ZkYMWeqq5O+764937ZU8=", "dev": true }, - "js-yaml": { - "version": "3.13.1", - "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.13.1.tgz", - "integrity": "sha1-r/FRswv9+o5J4F2iLnQV6d+jeEc=", - "dev": true, - "requires": { - "argparse": "^1.0.7", - "esprima": "^4.0.0" - } - }, 
"locate-path": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-3.0.0.tgz", - "integrity": "sha1-2+w7OrdZdYBxtY/ln8QYca8hQA4=", + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-6.0.0.tgz", + "integrity": "sha512-iPZK6eYjbxRu3uB4/WZ3EsEIMJFMqAoopl3R+zuq0UjcAm/MO6KCweDgPfP3elTztoKP3KtnVHxTn2NHBSDVUw==", "dev": true, "requires": { - "p-locate": "^3.0.0", - "path-exists": "^3.0.0" - } - }, - "ms": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.1.tgz", - "integrity": "sha1-MKWGTrPrsKZvLr5tcnrwagnYbgo=", - "dev": true - }, - "object.assign": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/object.assign/-/object.assign-4.1.0.tgz", - "integrity": "sha1-lovxEA15Vrs8oIbwBvhGs7xACNo=", - "dev": true, - "requires": { - "define-properties": "^1.1.2", - "function-bind": "^1.1.1", - "has-symbols": "^1.0.0", - "object-keys": "^1.0.11" + "p-locate": "^5.0.0" } }, "p-limit": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.3.0.tgz", - "integrity": "sha1-PdM8ZHohT9//2DWTPrCG2g3CHbE=", + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz", + "integrity": "sha512-TYOanM3wGwNGsZN2cVTYPArw454xnXj5qmWF1bEoAc4+cU/ol7GVh7odevjp1FNHduHc3KZMcFduxU5Xc6uJRQ==", "dev": true, "requires": { - "p-try": "^2.0.0" + "yocto-queue": "^0.1.0" } }, "p-locate": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-3.0.0.tgz", - "integrity": "sha1-Mi1poFwCZLJZl9n0DNiokasAZKQ=", + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-5.0.0.tgz", + "integrity": "sha512-LaNjtRWUBY++zB5nE/NwcaoMylSPk+S+ZHNB1TzdbMJMny6dynpAGt7X/tl/QYq3TIeE6nxHppbo2LGymrG5Pw==", "dev": true, "requires": { - "p-limit": "^2.0.0" + "p-limit": "^3.0.2" } }, "p-try": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/p-try/-/p-try-2.2.0.tgz", - "integrity": 
"sha1-yyhoVA4xPWHeWPr741zpAE1VQOY=", + "integrity": "sha512-R4nPAVTAU0B9D35/Gk3uJf/7XYbQcyohSKdvAxIRSNghFl4e71hVoGnBNQz9cWaXxO2I10KTC+3jMdvvoKw6dQ==", + "dev": true + }, + "path-exists": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-4.0.0.tgz", + "integrity": "sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w==", "dev": true }, "string-width": { @@ -2685,13 +8227,28 @@ "ansi-regex": "^4.1.0" } }, + "strip-json-comments": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-3.1.1.tgz", + "integrity": "sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==", + "dev": true + }, "supports-color": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-6.0.0.tgz", - "integrity": "sha1-ds/nQs8fQbubHCmtAwaMBbTA5Ao=", + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", "dev": true, "requires": { - "has-flag": "^3.0.0" + "has-flag": "^4.0.0" + } + }, + "which": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", + "integrity": "sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==", + "dev": true, + "requires": { + "isexe": "^2.0.0" } }, "wrap-ansi": { @@ -2727,6 +8284,51 @@ "which-module": "^2.0.0", "y18n": "^4.0.0", "yargs-parser": "^13.1.2" + }, + "dependencies": { + "find-up": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-3.0.0.tgz", + "integrity": "sha512-1yD6RmLI1XBfxugvORwlck6f75tYL+iR0jqwsOrOxMZyGYqUuDhJ0l4AXdO1iX/FTs9cBAMEk1gWSEx1kSbylg==", + "dev": true, + "requires": { + "locate-path": "^3.0.0" + } + }, + "locate-path": { + "version": "3.0.0", + 
"resolved": "https://registry.npmjs.org/locate-path/-/locate-path-3.0.0.tgz", + "integrity": "sha512-7AO748wWnIhNqAuaty2ZWHkQHRSNfPVIsPIfwEOWO22AmaoVrWavlOcMR5nzTLNYvp36X220/maaRsrec1G65A==", + "dev": true, + "requires": { + "p-locate": "^3.0.0", + "path-exists": "^3.0.0" + } + }, + "p-limit": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.3.0.tgz", + "integrity": "sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==", + "dev": true, + "requires": { + "p-try": "^2.0.0" + } + }, + "p-locate": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-3.0.0.tgz", + "integrity": "sha512-x+12w/To+4GFfgJhBEpiDcLozRJGegY+Ei7/z0tSLkMmxGZNybVMSfWj9aJn8Z5Fc7dBUNJOOVgPv2H7IwulSQ==", + "dev": true, + "requires": { + "p-limit": "^2.0.0" + } + }, + "path-exists": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", + "integrity": "sha1-zg6+ql94yxiSXqfYENe1mwEP1RU=", + "dev": true + } } } } @@ -2737,10 +8339,10 @@ "integrity": "sha1-0J0fNXtEP0kzgqjrPM0YOHKuYAk=", "dev": true }, - "mute-stream": { - "version": "0.0.8", - "resolved": "https://registry.npmjs.org/mute-stream/-/mute-stream-0.0.8.tgz", - "integrity": "sha1-FjDEKyJR/4HiooPelqVJfqkuXg0=", + "nanoid": { + "version": "3.1.12", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.1.12.tgz", + "integrity": "sha512-1qstj9z5+x491jfiC4Nelk+f8XBad7LN20PmyWINJEMRSf3wcAjAWysw1qaA8z6NSKe2sjq1hRSDpBH5paCb6A==", "dev": true }, "napi-build-utils": { @@ -2775,21 +8377,11 @@ } }, "node-addon-api": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-3.0.2.tgz", - "integrity": "sha1-BLx7g/2EW6eFu26uJbyFfh73VoE=", + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-3.1.0.tgz", + "integrity": 
"sha512-flmrDNB06LIl5lywUz7YlNGZH/5p0M7W28k8hzd9Lshtdh1wshD2Y+U4h9LD6KObOy1f+fEVdgprPrEymjM5uw==", "dev": true }, - "node-environment-flags": { - "version": "1.0.6", - "resolved": "https://registry.npmjs.org/node-environment-flags/-/node-environment-flags-1.0.6.tgz", - "integrity": "sha1-owrBNiH299Z0JgpU3t4EjDmCwIg=", - "dev": true, - "requires": { - "object.getownpropertydescriptors": "^2.0.3", - "semver": "^5.7.0" - } - }, "node-fetch": { "version": "2.6.1", "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.6.1.tgz", @@ -2827,7 +8419,7 @@ "normalize-path": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-3.0.0.tgz", - "integrity": "sha1-Dc1p/yOhybEf0JeDFmRKA4ghamU=", + "integrity": "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==", "dev": true }, "npm-run-path": { @@ -2912,16 +8504,6 @@ } } }, - "object.getownpropertydescriptors": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/object.getownpropertydescriptors/-/object.getownpropertydescriptors-2.1.0.tgz", - "integrity": "sha1-Npvx+VktiridcS3O1cuBx8U1Jkk=", - "dev": true, - "requires": { - "define-properties": "^1.1.3", - "es-abstract": "^1.17.0-next.1" - } - }, "object.values": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/object.values/-/object.values-1.1.1.tgz", @@ -2942,13 +8524,13 @@ "wrappy": "1" } }, - "onetime": { - "version": "5.1.2", - "resolved": "https://registry.npmjs.org/onetime/-/onetime-5.1.2.tgz", - "integrity": "sha1-0Oluu1awdHbfHdnEgG5SN5hcpF4=", + "onigasm": { + "version": "2.2.5", + "resolved": "https://registry.npmjs.org/onigasm/-/onigasm-2.2.5.tgz", + "integrity": "sha512-F+th54mPc0l1lp1ZcFMyL/jTs2Tlq4SqIHKIXGZOR/VkHkF9A7Fr5rRr5+ZG/lWeRsyrClLYRq7s/yFQ/XhWCA==", "dev": true, "requires": { - "mimic-fn": "^2.1.0" + "lru-cache": "^5.1.1" } }, "onnx-proto": { @@ -2961,17 +8543,17 @@ } }, "optionator": { - "version": "0.8.3", - "resolved": 
"https://registry.npmjs.org/optionator/-/optionator-0.8.3.tgz", - "integrity": "sha1-hPodA2/p08fiHZmIS2ARZ+yPtJU=", + "version": "0.9.1", + "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.1.tgz", + "integrity": "sha512-74RlY5FCnhq4jRxVUPKDaRwrVNXMqsGsiW6AJw4XK8hmtm10wC0ypZBLw5IIp85NZMr91+qd1RvvENwg7jjRFw==", "dev": true, "requires": { - "deep-is": "~0.1.3", - "fast-levenshtein": "~2.0.6", - "levn": "~0.3.0", - "prelude-ls": "~1.1.2", - "type-check": "~0.3.2", - "word-wrap": "~1.2.3" + "deep-is": "^0.1.3", + "fast-levenshtein": "^2.0.6", + "levn": "^0.4.1", + "prelude-ls": "^1.2.1", + "type-check": "^0.4.0", + "word-wrap": "^1.2.3" } }, "os-locale": { @@ -2993,12 +8575,6 @@ "windows-release": "^3.1.0" } }, - "os-tmpdir": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/os-tmpdir/-/os-tmpdir-1.0.2.tgz", - "integrity": "sha1-u+Z0BseaqFxc/sdm/lc0VV36EnQ=", - "dev": true - }, "p-finally": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/p-finally/-/p-finally-1.0.0.tgz", @@ -3108,15 +8684,15 @@ } }, "prebuild-install": { - "version": "5.3.5", - "resolved": "https://registry.npmjs.org/prebuild-install/-/prebuild-install-5.3.5.tgz", - "integrity": "sha1-5+ceQlKYeF6p0i1PlY26zPi7Dhs=", + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/prebuild-install/-/prebuild-install-6.0.0.tgz", + "integrity": "sha512-h2ZJ1PXHKWZpp1caLw0oX9sagVpL2YTk+ZwInQbQ3QqNd4J03O6MpFNmMTJlkfgPENWqe5kP0WjQLqz5OjLfsw==", "requires": { "detect-libc": "^1.0.3", "expand-template": "^2.0.3", "github-from-package": "0.0.0", "minimist": "^1.2.3", - "mkdirp": "^0.5.1", + "mkdirp-classic": "^0.5.3", "napi-build-utils": "^1.0.1", "node-abi": "^2.7.0", "noop-logger": "^0.1.1", @@ -3130,9 +8706,9 @@ } }, "prelude-ls": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/prelude-ls/-/prelude-ls-1.1.2.tgz", - "integrity": "sha1-IZMqVJ9eUv/ZqCf1cOBL5iqX2lQ=", + "version": "1.2.1", + "resolved": 
"https://registry.npmjs.org/prelude-ls/-/prelude-ls-1.2.1.tgz", + "integrity": "sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g==", "dev": true }, "process-nextick-args": { @@ -3202,6 +8778,15 @@ "integrity": "sha1-yzroBuh0BERYTvFUzo7pjUA/PjY=", "dev": true }, + "randombytes": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/randombytes/-/randombytes-2.1.0.tgz", + "integrity": "sha512-vYl3iOX+4CKUWuxGi9Ukhie6fsqXqS9FE2Zaic4tNFD2N2QQaXOMFbuKK4QmDHC0JO6B1Zp41J0LpT0oR68amQ==", + "dev": true, + "requires": { + "safe-buffer": "^5.1.0" + } + }, "rc": { "version": "1.2.8", "resolved": "https://registry.npmjs.org/rc/-/rc-1.2.8.tgz", @@ -3249,12 +8834,12 @@ } }, "readdirp": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.2.0.tgz", - "integrity": "sha1-wwwzNSsSyW37S4lUIaSf1alZODk=", + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.5.0.tgz", + "integrity": "sha512-cMhu7c/8rdhkHXWsY+osBhfSy0JikwpHK/5+imo+LpeasTF8ouErHrlYkwT0++njiyuDvc7OFY5T3ukvZ8qmFQ==", "dev": true, "requires": { - "picomatch": "^2.0.4" + "picomatch": "^2.2.1" } }, "rechoir": { @@ -3312,6 +8897,12 @@ "integrity": "sha1-jGStX9MNqxyXbiNE/+f3kqam30I=", "dev": true }, + "require-from-string": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/require-from-string/-/require-from-string-2.0.2.tgz", + "integrity": "sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==", + "dev": true + }, "require-main-filename": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/require-main-filename/-/require-main-filename-2.0.0.tgz", @@ -3333,16 +8924,6 @@ "integrity": "sha1-SrzYUq0y3Xuqv+m0DgCjbbXzkuY=", "dev": true }, - "restore-cursor": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/restore-cursor/-/restore-cursor-3.1.0.tgz", - "integrity": "sha1-OfZ8VLOnpYzqUjbZXPADQjljH34=", - "dev": true, - "requires": 
{ - "onetime": "^5.1.0", - "signal-exit": "^3.0.2" - } - }, "reusify": { "version": "1.0.4", "resolved": "https://registry.npmjs.org/reusify/-/reusify-1.0.4.tgz", @@ -3358,27 +8939,12 @@ "glob": "^7.1.3" } }, - "run-async": { - "version": "2.4.1", - "resolved": "https://registry.npmjs.org/run-async/-/run-async-2.4.1.tgz", - "integrity": "sha1-hEDsz5nqPnC9QJ1JqriOEMGJpFU=", - "dev": true - }, "run-parallel": { "version": "1.1.9", "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.1.9.tgz", "integrity": "sha1-yd06fPn0ssS2JE4XOm7YZuYd1nk=", "dev": true }, - "rxjs": { - "version": "6.6.3", - "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-6.6.3.tgz", - "integrity": "sha1-jKhGNcTaqQDA05Z6buesYCce5VI=", - "dev": true, - "requires": { - "tslib": "^1.9.0" - } - }, "safe-buffer": { "version": "5.1.2", "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", @@ -3395,6 +8961,15 @@ "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz", "integrity": "sha1-qVT5Ma66UI0we78Gnv8MAclhFvc=" }, + "serialize-javascript": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/serialize-javascript/-/serialize-javascript-5.0.1.tgz", + "integrity": "sha512-SaaNal9imEO737H2c05Og0/8LUXG7EnsZyMa8MzkmuHoELfT6txuj0cMqRj6zfPKnmQ1yasR4PCJc8x+M4JSPA==", + "dev": true, + "requires": { + "randombytes": "^2.1.0" + } + }, "set-blocking": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz", @@ -3432,6 +9007,48 @@ "rechoir": "^0.6.2" } }, + "shiki": { + "version": "0.2.7", + "resolved": "https://registry.npmjs.org/shiki/-/shiki-0.2.7.tgz", + "integrity": "sha512-bwVc7cdtYYHEO9O+XJ8aNOskKRfaQd5Y4ovLRfbQkmiLSUaR+bdlssbZUUhbQ0JAFMYcTcJ5tjG5KtnufttDHQ==", + "dev": true, + "requires": { + "onigasm": "^2.2.5", + "shiki-languages": "^0.2.7", + "shiki-themes": "^0.2.7", + "vscode-textmate": "^5.2.0" + } + }, + "shiki-languages": { + "version": "0.2.7", + "resolved": 
"https://registry.npmjs.org/shiki-languages/-/shiki-languages-0.2.7.tgz", + "integrity": "sha512-REmakh7pn2jCn9GDMRSK36oDgqhh+rSvJPo77sdWTOmk44C5b0XlYPwJZcFOMJWUZJE0c7FCbKclw4FLwUKLRw==", + "dev": true, + "requires": { + "vscode-textmate": "^5.2.0" + } + }, + "shiki-themes": { + "version": "0.2.7", + "resolved": "https://registry.npmjs.org/shiki-themes/-/shiki-themes-0.2.7.tgz", + "integrity": "sha512-ZMmboDYw5+SEpugM8KGUq3tkZ0vXg+k60XX6NngDK7gc1Sv6YLUlanpvG3evm57uKJvfXsky/S5MzSOTtYKLjA==", + "dev": true, + "requires": { + "json5": "^2.1.0", + "vscode-textmate": "^5.2.0" + }, + "dependencies": { + "json5": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/json5/-/json5-2.2.0.tgz", + "integrity": "sha512-f+8cldu7X/y7RAJurMEJmdoKXGB/X550w2Nr3tTbezL6RwEE/iMcm+tZnXeoZtKuOq6ft8+CqzEkrIgx1fPoQA==", + "dev": true, + "requires": { + "minimist": "^1.2.5" + } + } + } + }, "signal-exit": { "version": "3.0.3", "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.3.tgz", @@ -3459,20 +9076,44 @@ "dev": true }, "slice-ansi": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/slice-ansi/-/slice-ansi-2.1.0.tgz", - "integrity": "sha1-ys12k0YaY3pXiNkqfdT7oGjoFjY=", + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/slice-ansi/-/slice-ansi-4.0.0.tgz", + "integrity": "sha512-qMCMfhY040cVHT43K9BFygqYbUPFZKHOg7K73mtTWJRb8pyP3fzf4Ixd5SzdEJQ6MRUg/WBnOLxghZtKKurENQ==", "dev": true, "requires": { - "ansi-styles": "^3.2.0", - "astral-regex": "^1.0.0", - "is-fullwidth-code-point": "^2.0.0" + "ansi-styles": "^4.0.0", + "astral-regex": "^2.0.0", + "is-fullwidth-code-point": "^3.0.0" }, "dependencies": { + "ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "requires": { + "color-convert": "^2.0.1" + } + }, + "color-convert": { + "version": 
"2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "requires": { + "color-name": "~1.1.4" + } + }, + "color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, "is-fullwidth-code-point": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz", - "integrity": "sha1-o7MKXE8ZkYMWeqq5O+764937ZU8=", + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", + "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==", "dev": true } } @@ -3544,6 +9185,14 @@ "tweetnacl": "~0.14.0" } }, + "string_decoder": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", + "integrity": "sha1-nPFhG6YmhdcDCunkujQUnDrwP8g=", + "requires": { + "safe-buffer": "~5.1.0" + } + }, "string-width": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/string-width/-/string-width-1.0.2.tgz", @@ -3574,14 +9223,6 @@ "es-abstract": "^1.17.5" } }, - "string_decoder": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", - "integrity": "sha1-nPFhG6YmhdcDCunkujQUnDrwP8g=", - "requires": { - "safe-buffer": "~5.1.0" - } - }, "strip-ansi": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-3.0.1.tgz", @@ -3617,53 +9258,65 @@ } }, "table": { - "version": "5.4.6", - "resolved": "https://registry.npmjs.org/table/-/table-5.4.6.tgz", - "integrity": "sha1-EpLRlQDOP4YFOwXw6Ofko7shB54=", + "version": "6.0.7", + 
"resolved": "https://registry.npmjs.org/table/-/table-6.0.7.tgz", + "integrity": "sha512-rxZevLGTUzWna/qBLObOe16kB2RTnnbhciwgPbMMlazz1yZGVEgnZK762xyVdVznhqxrfCeBMmMkgOOaPwjH7g==", "dev": true, "requires": { - "ajv": "^6.10.2", - "lodash": "^4.17.14", - "slice-ansi": "^2.1.0", - "string-width": "^3.0.0" + "ajv": "^7.0.2", + "lodash": "^4.17.20", + "slice-ansi": "^4.0.0", + "string-width": "^4.2.0" }, "dependencies": { - "ansi-regex": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-4.1.0.tgz", - "integrity": "sha1-i5+PCM8ay4Q3Vqg5yox+MWjFGZc=", - "dev": true + "ajv": { + "version": "7.0.4", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-7.0.4.tgz", + "integrity": "sha512-xzzzaqgEQfmuhbhAoqjJ8T/1okb6gAzXn/eQRNpAN1AEUoHJTNF9xCDRTtf/s3SKldtZfa+RJeTs+BQq+eZ/sw==", + "dev": true, + "requires": { + "fast-deep-equal": "^3.1.1", + "json-schema-traverse": "^1.0.0", + "require-from-string": "^2.0.2", + "uri-js": "^4.2.2" + } }, - "emoji-regex": { - "version": "7.0.3", - "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-7.0.3.tgz", - "integrity": "sha1-kzoEBShgyF6DwSJHnEdIqOTHIVY=", + "ansi-regex": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.0.tgz", + "integrity": "sha512-bY6fj56OUQ0hU1KjFNDQuJFezqKdrAyFdIevADiqrWHwSlbmBNMHp5ak2f40Pm8JTFyM2mqxkG6ngkHO11f/lg==", "dev": true }, "is-fullwidth-code-point": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz", - "integrity": "sha1-o7MKXE8ZkYMWeqq5O+764937ZU8=", + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", + "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==", + "dev": true + }, + "json-schema-traverse": { + "version": "1.0.0", + "resolved": 
"https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", + "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==", "dev": true }, "string-width": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-3.1.0.tgz", - "integrity": "sha1-InZ74htirxCBV0MG9prFG2IgOWE=", + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.0.tgz", + "integrity": "sha512-zUz5JD+tgqtuDjMhwIg5uFVV3dtqZ9yQJlZVfq4I01/K5Paj5UHj7VyrQOJvzawSVlKpObApbfD0Ed6yJc+1eg==", "dev": true, "requires": { - "emoji-regex": "^7.0.1", - "is-fullwidth-code-point": "^2.0.0", - "strip-ansi": "^5.1.0" + "emoji-regex": "^8.0.0", + "is-fullwidth-code-point": "^3.0.0", + "strip-ansi": "^6.0.0" } }, "strip-ansi": { - "version": "5.2.0", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-5.2.0.tgz", - "integrity": "sha1-jJpTb+tq/JYr36WxBKUJHBrZwK4=", + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.0.tgz", + "integrity": "sha512-AuvKTrTfQNYNIctbR1K/YGTR1756GycPsg7b9bdV9Duqur4gv6aKqHXah67Z8ImS7WEz5QVcOtlfW2rZEugt6w==", "dev": true, "requires": { - "ansi-regex": "^4.1.0" + "ansi-regex": "^5.0.0" } } } @@ -3695,9 +9348,9 @@ } }, "tar-stream": { - "version": "2.1.4", - "resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-2.1.4.tgz", - "integrity": "sha1-xPsaEesNopuJOlslR2OXui0FO/o=", + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-2.2.0.tgz", + "integrity": "sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==", "requires": { "bl": "^4.0.3", "end-of-stream": "^1.4.1", @@ -3724,21 +9377,6 @@ "integrity": "sha1-f17oI66AUgfACvLfSoTsP8+lcLQ=", "dev": true }, - "through": { - "version": "2.3.8", - "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz", - "integrity": "sha1-DdTJ/6q8NXlgsbckEV1+Doai4fU=", 
- "dev": true - }, - "tmp": { - "version": "0.0.33", - "resolved": "https://registry.npmjs.org/tmp/-/tmp-0.0.33.tgz", - "integrity": "sha1-bTQzWIl2jSGyvNoKonfO07G/rfk=", - "dev": true, - "requires": { - "os-tmpdir": "~1.0.2" - } - }, "to-regex-range": { "version": "5.0.1", "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz", @@ -3779,13 +9417,13 @@ "tslib": { "version": "1.14.1", "resolved": "https://registry.npmjs.org/tslib/-/tslib-1.14.1.tgz", - "integrity": "sha1-zy04vcNKE0vK8QkcQfZhni9nLQA=", + "integrity": "sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg==", "dev": true }, "tsutils": { - "version": "3.17.1", - "resolved": "https://registry.npmjs.org/tsutils/-/tsutils-3.17.1.tgz", - "integrity": "sha1-7XGZF/EcoN7lhicrKsSeAVot11k=", + "version": "3.20.0", + "resolved": "https://registry.npmjs.org/tsutils/-/tsutils-3.20.0.tgz", + "integrity": "sha512-RYbuQuvkhuqVeXweWT3tJLKOEJ/UUw9GjNEZGWdrLLlM+611o1gwLHBpxoFJKKl25fLprp2eVthtKs5JOrNeXg==", "dev": true, "requires": { "tslib": "^1.8.1" @@ -3806,12 +9444,12 @@ "dev": true }, "type-check": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/type-check/-/type-check-0.3.2.tgz", - "integrity": "sha1-WITKtRLPHTVeP7eE8wgEsrUg23I=", + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/type-check/-/type-check-0.4.0.tgz", + "integrity": "sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew==", "dev": true, "requires": { - "prelude-ls": "~1.1.2" + "prelude-ls": "^1.2.1" } }, "type-fest": { @@ -3821,49 +9459,34 @@ "dev": true }, "typedoc": { - "version": "0.17.8", - "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.17.8.tgz", - "integrity": "sha1-lrZ+lFSqeFO/xNyaVciget/VR44=", + "version": "0.20.20", + "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.20.20.tgz", + "integrity": 
"sha512-qXB40ttDGaqv6q6UIiAVqOpX/GlXoBur0lB4g9fePoYjfwa6OsPkoYufLtsjEaBB0EokShR2aIoI5GX4RB83cw==", "dev": true, "requires": { - "fs-extra": "^8.1.0", + "colors": "^1.4.0", + "fs-extra": "^9.1.0", "handlebars": "^4.7.6", - "highlight.js": "^10.0.0", - "lodash": "^4.17.15", - "lunr": "^2.3.8", - "marked": "1.0.0", + "lodash": "^4.17.20", + "lunr": "^2.3.9", + "marked": "^1.2.8", "minimatch": "^3.0.0", "progress": "^2.0.3", "shelljs": "^0.8.4", - "typedoc-default-themes": "^0.10.2" - }, - "dependencies": { - "fs-extra": { - "version": "8.1.0", - "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-8.1.0.tgz", - "integrity": "sha1-SdQ8RaiM2Wd2aMt74bRu/bjS4cA=", - "dev": true, - "requires": { - "graceful-fs": "^4.2.0", - "jsonfile": "^4.0.0", - "universalify": "^0.1.0" - } - } + "shiki": "^0.2.7", + "typedoc-default-themes": "^0.12.7" } }, "typedoc-default-themes": { - "version": "0.10.2", - "resolved": "https://registry.npmjs.org/typedoc-default-themes/-/typedoc-default-themes-0.10.2.tgz", - "integrity": "sha1-dDOAqAr+YsXvksob1KvirFlr5NI=", - "dev": true, - "requires": { - "lunr": "^2.3.8" - } + "version": "0.12.7", + "resolved": "https://registry.npmjs.org/typedoc-default-themes/-/typedoc-default-themes-0.12.7.tgz", + "integrity": "sha512-0XAuGEqID+gon1+fhi4LycOEFM+5Mvm2PjwaiVZNAzU7pn3G2DEpsoXnFOPlLDnHY6ZW0BY0nO7ur9fHOFkBLQ==", + "dev": true }, "typescript": { - "version": "3.9.7", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-3.9.7.tgz", - "integrity": "sha1-mNYApevcOPQMsndSLxLcgA6eJfo=", + "version": "4.1.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.1.3.tgz", + "integrity": "sha512-B3ZIOf1IKeH2ixgHhj6la6xdwR9QrLC5d1VKeCSY4tvkqhF2eqd9O7txNlS0PO3GrBAFIdr3L1ndNwteUbZLYg==", "dev": true }, "uglify-js": { @@ -3993,6 +9616,12 @@ "extsprintf": "^1.2.0" } }, + "vscode-textmate": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/vscode-textmate/-/vscode-textmate-5.2.0.tgz", + "integrity": 
"sha512-Uw5ooOQxRASHgu6C7GVvUxisKXfSgW4oFlO+aa+PAkgmH89O3CXxEEzNRNtHSqtXFTl0nAC1uYj0GMSH27uwtQ==", + "dev": true + }, "which": { "version": "1.3.1", "resolved": "https://registry.npmjs.org/which/-/which-1.3.1.tgz", @@ -4039,7 +9668,7 @@ "word-wrap": { "version": "1.2.3", "resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.3.tgz", - "integrity": "sha1-YQY29rH3A4kb00dxzLF/uTtHB5w=", + "integrity": "sha512-Hz/mrNwitNRh/HUAtM/VT/5VH+ygD6DV7mYKZAtHOrbs8U7lvPS6xf7EJKMF0uW1KJCl0H701g3ZGus+muE5vQ==", "dev": true }, "wordwrap": { @@ -4048,6 +9677,12 @@ "integrity": "sha1-J1hIEIkUVqQXHI0CJkQa3pDLyus=", "dev": true }, + "workerpool": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/workerpool/-/workerpool-6.0.2.tgz", + "integrity": "sha512-DSNyvOpFKrNusaaUwk+ej6cBj1bmhLcBfj80elGk+ZIo5JSkq+unB1dLKEOcNfJDZgjGICfhQ0Q5TbP0PvF4+Q==", + "dev": true + }, "wrap-ansi": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-2.1.0.tgz", @@ -4063,15 +9698,6 @@ "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", "integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=" }, - "write": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/write/-/write-1.0.3.tgz", - "integrity": "sha1-CADhRSO5I6OH5BUSPIZWFqrg9cM=", - "dev": true, - "requires": { - "mkdirp": "^0.5.1" - } - }, "y18n": { "version": "3.2.1", "resolved": "https://registry.npmjs.org/y18n/-/y18n-3.2.1.tgz", @@ -4118,144 +9744,36 @@ } }, "yargs-unparser": { - "version": "1.6.0", - "resolved": "https://registry.npmjs.org/yargs-unparser/-/yargs-unparser-1.6.0.tgz", - "integrity": "sha1-7yXCx2n/a9CeSw+dfGBfsnhG6p8=", + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/yargs-unparser/-/yargs-unparser-2.0.0.tgz", + "integrity": "sha512-7pRTIA9Qc1caZ0bZ6RYRGbHJthJWuakf+WmHK0rVeLkNrrGhfoabBNdue6kdINI6r4if7ocq9aD/n7xwKOdzOA==", "dev": true, "requires": { - "flat": "^4.1.0", - "lodash": "^4.17.15", - "yargs": "^13.3.0" + "camelcase": "^6.0.0", + 
"decamelize": "^4.0.0", + "flat": "^5.0.2", + "is-plain-obj": "^2.1.0" }, "dependencies": { - "ansi-regex": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-4.1.0.tgz", - "integrity": "sha1-i5+PCM8ay4Q3Vqg5yox+MWjFGZc=", + "camelcase": { + "version": "6.2.0", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-6.2.0.tgz", + "integrity": "sha512-c7wVvbw3f37nuobQNtgsgG9POC9qMbNuMQmTCqZv23b6MIz0fcYpBiOlv9gEN/hdLdnZTDQhg6e9Dq5M1vKvfg==", "dev": true }, - "cliui": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/cliui/-/cliui-5.0.0.tgz", - "integrity": "sha1-3u/P2y6AB4SqNPRvoI4GhRx7u8U=", - "dev": true, - "requires": { - "string-width": "^3.1.0", - "strip-ansi": "^5.2.0", - "wrap-ansi": "^5.1.0" - } - }, - "emoji-regex": { - "version": "7.0.3", - "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-7.0.3.tgz", - "integrity": "sha1-kzoEBShgyF6DwSJHnEdIqOTHIVY=", - "dev": true - }, - "find-up": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/find-up/-/find-up-3.0.0.tgz", - "integrity": "sha1-SRafHXmTQwZG2mHsxa41XCHJe3M=", - "dev": true, - "requires": { - "locate-path": "^3.0.0" - } - }, - "is-fullwidth-code-point": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz", - "integrity": "sha1-o7MKXE8ZkYMWeqq5O+764937ZU8=", - "dev": true - }, - "locate-path": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-3.0.0.tgz", - "integrity": "sha1-2+w7OrdZdYBxtY/ln8QYca8hQA4=", - "dev": true, - "requires": { - "p-locate": "^3.0.0", - "path-exists": "^3.0.0" - } - }, - "p-limit": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.3.0.tgz", - "integrity": "sha1-PdM8ZHohT9//2DWTPrCG2g3CHbE=", - "dev": true, - "requires": { - "p-try": "^2.0.0" - } - }, - "p-locate": { - "version": "3.0.0", - "resolved": 
"https://registry.npmjs.org/p-locate/-/p-locate-3.0.0.tgz", - "integrity": "sha1-Mi1poFwCZLJZl9n0DNiokasAZKQ=", - "dev": true, - "requires": { - "p-limit": "^2.0.0" - } - }, - "p-try": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/p-try/-/p-try-2.2.0.tgz", - "integrity": "sha1-yyhoVA4xPWHeWPr741zpAE1VQOY=", - "dev": true - }, - "string-width": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-3.1.0.tgz", - "integrity": "sha1-InZ74htirxCBV0MG9prFG2IgOWE=", - "dev": true, - "requires": { - "emoji-regex": "^7.0.1", - "is-fullwidth-code-point": "^2.0.0", - "strip-ansi": "^5.1.0" - } - }, - "strip-ansi": { - "version": "5.2.0", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-5.2.0.tgz", - "integrity": "sha1-jJpTb+tq/JYr36WxBKUJHBrZwK4=", - "dev": true, - "requires": { - "ansi-regex": "^4.1.0" - } - }, - "wrap-ansi": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-5.1.0.tgz", - "integrity": "sha1-H9H2cjXVttD+54EFYAG/tpTAOwk=", - "dev": true, - "requires": { - "ansi-styles": "^3.2.0", - "string-width": "^3.0.0", - "strip-ansi": "^5.0.0" - } - }, - "y18n": { + "decamelize": { "version": "4.0.0", - "resolved": "https://registry.npmjs.org/y18n/-/y18n-4.0.0.tgz", - "integrity": "sha1-le+U+F7MgdAHwmThkKEg8KPIVms=", + "resolved": "https://registry.npmjs.org/decamelize/-/decamelize-4.0.0.tgz", + "integrity": "sha512-9iE1PgSik9HeIIw2JO94IidnE3eBoQrFJ3w7sFuzSX4DpmZ3v5sZpUiV5Swcf6mQEF+Y0ru8Neo+p+nyh2J+hQ==", "dev": true - }, - "yargs": { - "version": "13.3.2", - "resolved": "https://registry.npmjs.org/yargs/-/yargs-13.3.2.tgz", - "integrity": "sha1-rX/+/sGqWVZayRX4Lcyzipwxot0=", - "dev": true, - "requires": { - "cliui": "^5.0.0", - "find-up": "^3.0.0", - "get-caller-file": "^2.0.1", - "require-directory": "^2.1.1", - "require-main-filename": "^2.0.0", - "set-blocking": "^2.0.0", - "string-width": "^3.0.0", - "which-module": "^2.0.0", - "y18n": "^4.0.0", - 
"yargs-parser": "^13.1.2" - } } } + }, + "yocto-queue": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-0.1.0.tgz", + "integrity": "sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==", + "dev": true } } } diff --git a/nodejs/package.json b/nodejs/package.json index a1129a8577..a400656bf6 100644 --- a/nodejs/package.json +++ b/nodejs/package.json @@ -40,33 +40,33 @@ "author": "fs-eire", "license": "MIT", "devDependencies": { - "@types/fs-extra": "^8.1.0", + "@types/fs-extra": "^9.0.6", "@types/klaw-sync": "^6.0.0", - "@types/minimist": "1.2.0", - "@types/mocha": "^7.0.2", - "@types/tar-stream": "^2.1.0", - "@typescript-eslint/eslint-plugin": "^2.29.0", - "@typescript-eslint/parser": "^2.29.0", - "clang-format": "^1.4.0", + "@types/minimist": "1.2.1", + "@types/mocha": "^8.2.0", + "@types/tar-stream": "^2.2.0", + "@typescript-eslint/eslint-plugin": "^4.14.2", + "@typescript-eslint/parser": "^4.14.2", + "clang-format": "^1.5.0", "cmake-js": "^6.1.0", - "eslint": "^6.8.0", - "eslint-plugin-import": "^2.20.2", - "eslint-plugin-jsdoc": "^24.0.0", - "eslint-plugin-prefer-arrow": "^1.2.0", - "fs-extra": "^9.0.0", - "globby": "^11.0.0", + "eslint": "^7.19.0", + "eslint-plugin-import": "^2.22.1", + "eslint-plugin-jsdoc": "^31.6.0", + "eslint-plugin-prefer-arrow": "^1.2.3", + "fs-extra": "^9.1.0", + "globby": "^11.0.2", "jsonc": "^2.0.0", "klaw-sync": "^6.0.0", "minimist": "^1.2.5", - "mocha": "^7.1.1", - "node-addon-api": "^3.0.0", + "mocha": "^8.2.1", + "node-addon-api": "^3.1.0", "node-pre-gyp-github": "^1.4.3", "onnx-proto": "^4.0.4", - "tar-stream": "^2.1.4", - "typedoc": "^0.17.3", - "typescript": "^3.8.3" + "tar-stream": "^2.2.0", + "typedoc": "^0.20.20", + "typescript": "^4.1.3" }, "dependencies": { - "prebuild-install": "^5.3.5" + "prebuild-install": "^6.0.0" } -} \ No newline at end of file +} From c49d1dbc4b71cd3e0f3531504c0c0d805aa45cd2 Mon Sep 17 00:00:00 2001 From: Scott 
McKay Date: Fri, 5 Feb 2021 11:08:23 +1000 Subject: [PATCH 19/41] Add type reduction support to Slice and Transpose (#6547) * Add type reduction support to Slice and Transpose --- .../core/framework/data_types_internal.h | 2 +- .../core/providers/cpu/tensor/slice.cc | 181 ++++++++++++------ onnxruntime/core/providers/cpu/tensor/slice.h | 16 +- .../core/providers/cpu/tensor/transpose.cc | 101 +++++++--- .../core/providers/cpu/tensor/transpose.h | 7 +- .../core/providers/op_kernel_type_control.h | 57 +----- .../providers/op_kernel_type_control_utils.h | 50 +++++ .../providers/cpu/tensor/transpose_test.cc | 7 +- onnxruntime/test/testdata/mnist.readme.txt | 2 +- .../operator_type_usage_processors.py | 28 ++- 10 files changed, 286 insertions(+), 165 deletions(-) create mode 100644 onnxruntime/core/providers/op_kernel_type_control_utils.h diff --git a/include/onnxruntime/core/framework/data_types_internal.h b/include/onnxruntime/core/framework/data_types_internal.h index c195deaee0..7dfd5da467 100644 --- a/include/onnxruntime/core/framework/data_types_internal.h +++ b/include/onnxruntime/core/framework/data_types_internal.h @@ -116,7 +116,7 @@ constexpr ONNX_NAMESPACE::TensorProto_DataType ToTensorProtoElementType(__VA_ARGS__); \ break; \ case ONNX_NAMESPACE::TensorProto_DataType_UINT8: \ - function(__VA_ARGS__); \ + function(__VA_ARGS__); \ break; \ case ONNX_NAMESPACE::TensorProto_DataType_INT16: \ function(__VA_ARGS__); \ diff --git a/onnxruntime/core/providers/cpu/tensor/slice.cc b/onnxruntime/core/providers/cpu/tensor/slice.cc index a95d6a5510..52784823f5 100644 --- a/onnxruntime/core/providers/cpu/tensor/slice.cc +++ b/onnxruntime/core/providers/cpu/tensor/slice.cc @@ -4,6 +4,8 @@ #include "core/providers/cpu/tensor/slice.h" #include "core/providers/cpu/tensor/utils.h" #include "core/providers/common.h" +#include "core/providers/op_kernel_type_control.h" +#include "core/providers/op_kernel_type_control_utils.h" #include #include @@ -11,40 +13,28 @@ using namespace 
::onnxruntime::common; using namespace std; namespace onnxruntime { -ONNX_CPU_OPERATOR_VERSIONED_KERNEL( - Slice, - 1, 9, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::AllTensorTypes()), - Slice1); +namespace op_kernel_type_control { +// we're using one set of types for all opsets +ORT_SPECIFY_OP_KERNEL_ARG_SUPPORTED_TYPES_ALL_OPSETS( + kCpuExecutionProvider, kOnnxDomain, Slice, Input, 0, + ORT_OP_KERNEL_TYPE_CTRL_ALL_TENSOR_DATA_TYPES); -ONNX_CPU_OPERATOR_VERSIONED_KERNEL( - Slice, - 10, 10, - KernelDefBuilder() - .TypeConstraint("T", DataTypeImpl::AllTensorTypes()) - .TypeConstraint("Tind", {DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType()}), - Slice10); +ORT_SPECIFY_OP_KERNEL_ARG_SUPPORTED_TYPES_ALL_OPSETS( + kCpuExecutionProvider, kOnnxDomain, Slice, Input, 1, int32_t, int64_t); +} // namespace op_kernel_type_control -ONNX_CPU_OPERATOR_VERSIONED_KERNEL( - Slice, - 11, - 12, - KernelDefBuilder() - .TypeConstraint("T", DataTypeImpl::AllTensorTypes()) - .TypeConstraint("Tind", {DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType()}), - Slice10); - -ONNX_CPU_OPERATOR_KERNEL( - Slice, - 13, - KernelDefBuilder() - .TypeConstraint("T", DataTypeImpl::AllTensorTypes()) - .TypeConstraint("Tind", {DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType()}), - Slice10); namespace { +using EnabledDataTypes = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST_ALL_OPSETS(kCpuExecutionProvider, kOnnxDomain, + Slice, Input, 0); +using EnabledIndicesTypes = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST_ALL_OPSETS(kCpuExecutionProvider, kOnnxDomain, + Slice, Input, 1); + +const std::vector dataTypeConstraints = + BuildKernelDefConstraintsFunctorFromTypeList{}(); + +const std::vector indicesTypeConstraints = + BuildKernelDefConstraintsFunctorFromTypeList{}(); + // std::clamp doesn't exist until C++17 so create a local version template const T& clamp(const T& v, const T& lo, const T& hi) { @@ -54,6 +44,37 @@ const T& clamp(const T& v, const T& lo, const T& hi) { } } // 
namespace +ONNX_CPU_OPERATOR_VERSIONED_KERNEL( + Slice, + 1, 9, + KernelDefBuilder().TypeConstraint("T", dataTypeConstraints), + Slice1); + +ONNX_CPU_OPERATOR_VERSIONED_KERNEL( + Slice, + 10, 10, + KernelDefBuilder() + .TypeConstraint("T", dataTypeConstraints) + .TypeConstraint("Tind", indicesTypeConstraints), + Slice10); + +ONNX_CPU_OPERATOR_VERSIONED_KERNEL( + Slice, + 11, + 12, + KernelDefBuilder() + .TypeConstraint("T", dataTypeConstraints) + .TypeConstraint("Tind", indicesTypeConstraints), + Slice10); + +ONNX_CPU_OPERATOR_KERNEL( + Slice, + 13, + KernelDefBuilder() + .TypeConstraint("T", dataTypeConstraints) + .TypeConstraint("Tind", indicesTypeConstraints), + Slice10); + // Check if it's possible to combine innermost dimensions so we copy larger blocks. // Sets flattened_output_dims to nullptr if it is not. // Updates starts and steps to match flattened_output_dims if it is. @@ -218,19 +239,21 @@ Status SliceBase::PrepareForCompute(const std::vector& raw_starts, } // Slice V10 & DynamicSlice -void SliceBase::FillVectorsFromInput(const Tensor& start_tensor, - const Tensor& ends_tensor, - const Tensor* axes_tensor, - const Tensor* steps_tensor, - std::vector& input_starts, - std::vector& input_ends, - std::vector& input_axes, - std::vector& input_steps) { - ORT_ENFORCE(start_tensor.Shape().NumDimensions() == 1, "Starts must be a 1-D array"); - ORT_ENFORCE(ends_tensor.Shape().NumDimensions() == 1, "Ends must be a 1-D array"); - ORT_ENFORCE(start_tensor.Shape() == ends_tensor.Shape(), "Starts and ends shape mismatch"); - ORT_ENFORCE(nullptr == axes_tensor || start_tensor.Shape() == axes_tensor->Shape(), "Starts and axes shape mismatch"); - ORT_ENFORCE(nullptr == steps_tensor || start_tensor.Shape() == steps_tensor->Shape(), "Starts and steps shape mismatch"); +Status SliceBase::FillVectorsFromInput(const Tensor& start_tensor, + const Tensor& ends_tensor, + const Tensor* axes_tensor, + const Tensor* steps_tensor, + std::vector& input_starts, + std::vector& 
input_ends, + std::vector& input_axes, + std::vector& input_steps) { + ORT_RETURN_IF_NOT(start_tensor.Shape().NumDimensions() == 1, "Starts must be a 1-D array"); + ORT_RETURN_IF_NOT(ends_tensor.Shape().NumDimensions() == 1, "Ends must be a 1-D array"); + ORT_RETURN_IF_NOT(start_tensor.Shape() == ends_tensor.Shape(), "Starts and ends shape mismatch"); + ORT_RETURN_IF_NOT(nullptr == axes_tensor || start_tensor.Shape() == axes_tensor->Shape(), + "Starts and axes shape mismatch"); + ORT_RETURN_IF_NOT(nullptr == steps_tensor || start_tensor.Shape() == steps_tensor->Shape(), + "Starts and steps shape mismatch"); const auto& size = start_tensor.Shape().Size(); input_starts.resize(size); @@ -241,7 +264,11 @@ void SliceBase::FillVectorsFromInput(const Tensor& start_tensor, if (nullptr != steps_tensor) input_steps.resize(size); - if (start_tensor.IsDataType()) { + // check for type reduction of supported indices types + constexpr bool int32_enabled = utils::HasType(); + constexpr bool int64_enabled = utils::HasType(); + + if (int32_enabled && start_tensor.IsDataType()) { std::copy(start_tensor.Data(), start_tensor.Data() + size, input_starts.begin()); std::copy(ends_tensor.Data(), ends_tensor.Data() + size, input_ends.begin()); if (nullptr != axes_tensor) @@ -251,7 +278,7 @@ void SliceBase::FillVectorsFromInput(const Tensor& start_tensor, std::copy(steps_tensor->Data(), steps_tensor->Data() + size, input_steps.begin()); } - else if (start_tensor.IsDataType()) { + else if (int64_enabled && start_tensor.IsDataType()) { std::copy(start_tensor.Data(), start_tensor.Data() + size, input_starts.begin()); std::copy(ends_tensor.Data(), ends_tensor.Data() + size, input_ends.begin()); if (nullptr != axes_tensor) @@ -261,10 +288,13 @@ void SliceBase::FillVectorsFromInput(const Tensor& start_tensor, std::copy(steps_tensor->Data(), steps_tensor->Data() + size, input_steps.begin()); } - // should not reach this as no kernel is registered for this condition to be triggered - just an 
additional safety check else { - ORT_THROW("Data type for starts and ends inputs' need to be int32_t or int64_t, but instead got ", start_tensor.DataType()); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "Data type for starts and ends inputs' is not supported in this build. Got ", + start_tensor.DataType()); } + + return Status::OK(); } template @@ -304,22 +334,40 @@ static Status SliceImpl(OpKernelContext* ctx, flattened_input_dims.back() = compute_metadata.p_flattened_output_dims_->back(); TensorShape input_shape(std::move(flattened_input_dims)); - auto input_iterator = SliceIterator(input_tensor, input_shape, compute_metadata.starts_, *compute_metadata.p_flattened_output_dims_, compute_metadata.steps_); + auto input_iterator = SliceIterator(input_tensor, input_shape, compute_metadata.starts_, + *compute_metadata.p_flattened_output_dims_, compute_metadata.steps_); create_output(input_iterator); } else { - auto input_iterator = SliceIterator(input_tensor, compute_metadata.starts_, compute_metadata.output_dims_, compute_metadata.steps_); + auto input_iterator = SliceIterator(input_tensor, compute_metadata.starts_, compute_metadata.output_dims_, + compute_metadata.steps_); create_output(input_iterator); } return Status::OK(); } +template +static inline bool CallSliceImplIfEnabled(OpKernelContext* ctx, + const Tensor& input_tensor, + SliceOp::PrepareForComputeMetadata& compute_metadata, + Status& status) { + constexpr bool enabled = utils::HasTypeWithSameSize(); + if (enabled) { + status = SliceImpl(ctx, input_tensor, compute_metadata); + } + + return enabled; +} + Status SliceBase::Compute(OpKernelContext* ctx) const { const auto* input_tensor_ptr = ctx->Input(0); - ORT_ENFORCE(input_tensor_ptr != nullptr, "Missing input tensor to be processed"); const auto& input_tensor = *input_tensor_ptr; const auto& input_dimensions = input_tensor.Shape().GetDims(); - if (input_dimensions.empty()) return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Cannot slice scalars"); + + 
if (input_dimensions.empty()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Cannot slice scalars"); + } + SliceOp::PrepareForComputeMetadata compute_metadata(input_dimensions); // Slice V10 & DynamicSlice @@ -328,8 +376,9 @@ Status SliceBase::Compute(OpKernelContext* ctx) const { std::vector input_ends; std::vector input_axes; std::vector input_steps; - FillVectorsFromInput(*ctx->Input(1), *ctx->Input(2), ctx->Input(3), - ctx->Input(4), input_starts, input_ends, input_axes, input_steps); + ORT_RETURN_IF_ERROR(FillVectorsFromInput(*ctx->Input(1), *ctx->Input(2), + ctx->Input(3), ctx->Input(4), + input_starts, input_ends, input_axes, input_steps)); ORT_RETURN_IF_ERROR(PrepareForCompute(input_starts, input_ends, input_axes, input_steps, compute_metadata)); } @@ -340,28 +389,38 @@ Status SliceBase::Compute(OpKernelContext* ctx) const { Status status = Status::OK(); + bool supported = false; if (input_tensor.IsDataTypeString()) { - status = SliceImpl(ctx, input_tensor, compute_metadata); + if (utils::HasType()) { + supported = true; + status = SliceImpl(ctx, input_tensor, compute_metadata); + } } else { const auto element_size = input_tensor.DataType()->Size(); - + // call SliceImpl switch (element_size) { case sizeof(uint32_t): - status = SliceImpl(ctx, input_tensor, compute_metadata); + supported = CallSliceImplIfEnabled(ctx, input_tensor, compute_metadata, status); break; case sizeof(uint64_t): - status = SliceImpl(ctx, input_tensor, compute_metadata); + supported = CallSliceImplIfEnabled(ctx, input_tensor, compute_metadata, status); break; case sizeof(uint16_t): - status = SliceImpl(ctx, input_tensor, compute_metadata); + supported = CallSliceImplIfEnabled(ctx, input_tensor, compute_metadata, status); break; case sizeof(uint8_t): - status = SliceImpl(ctx, input_tensor, compute_metadata); + supported = CallSliceImplIfEnabled(ctx, input_tensor, compute_metadata, status); break; default: - ORT_THROW("Unsupported input data type of ", 
input_tensor.DataType()); + // leave 'supported' as false + break; } } + + if (!supported) { + status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported input data type of ", input_tensor.DataType()); + } + return status; } diff --git a/onnxruntime/core/providers/cpu/tensor/slice.h b/onnxruntime/core/providers/cpu/tensor/slice.h index 7febbf84d2..be69df1b38 100644 --- a/onnxruntime/core/providers/cpu/tensor/slice.h +++ b/onnxruntime/core/providers/cpu/tensor/slice.h @@ -44,14 +44,14 @@ class SliceBase { SliceOp::PrepareForComputeMetadata& compute_metadata); // Slice V10 & DynamicSlice - static void FillVectorsFromInput(const Tensor& start_tensor, - const Tensor& ends_tensor, - const Tensor* axes_tensor, - const Tensor* steps_tensor, - std::vector& input_starts, - std::vector& input_ends, - std::vector& input_axes, - std::vector& input_steps); + static Status FillVectorsFromInput(const Tensor& start_tensor, + const Tensor& ends_tensor, + const Tensor* axes_tensor, + const Tensor* steps_tensor, + std::vector& input_starts, + std::vector& input_ends, + std::vector& input_axes, + std::vector& input_steps); protected: SliceBase(const OpKernelInfo& info, bool dynamic = false) diff --git a/onnxruntime/core/providers/cpu/tensor/transpose.cc b/onnxruntime/core/providers/cpu/tensor/transpose.cc index 24770b235b..482dd019c7 100644 --- a/onnxruntime/core/providers/cpu/tensor/transpose.cc +++ b/onnxruntime/core/providers/cpu/tensor/transpose.cc @@ -2,12 +2,30 @@ // Licensed under the MIT License. 
#include "core/providers/cpu/tensor/transpose.h" + #include "core/framework/utils.h" #include "core/mlas/inc/mlas.h" +#include "core/providers/op_kernel_type_control.h" +#include "core/providers/op_kernel_type_control_utils.h" #include "utils.h" namespace onnxruntime { +namespace op_kernel_type_control { +// we're using one set of types for all opsets +ORT_SPECIFY_OP_KERNEL_ARG_SUPPORTED_TYPES_ALL_OPSETS( + kCpuExecutionProvider, kOnnxDomain, Transpose, Input, 0, + ORT_OP_KERNEL_TYPE_CTRL_ALL_TENSOR_DATA_TYPES); +} // namespace op_kernel_type_control + +namespace { +// reduce the supported types with any global or op specific lists +using EnabledDataTypes = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST_ALL_OPSETS(kCpuExecutionProvider, kOnnxDomain, + Transpose, Input, 0); + +const std::vector dataTypeConstraints = BuildKernelDefConstraintsFunctorFromTypeList{}(); +} // namespace + /* A permutation [a,b,c,...] indicates that - The 0-th dimension of the output corresponds to the a-th dimension of input - The 1-st dimension of the output corresponds to the b-th dimension of input @@ -152,42 +170,54 @@ inline void CopyPrim(uint8_t* target, const uint8_t* source) { // The function does not check num_axes > 0 but this is expected. 
template -static void TypedDoTransposeEltWise(int64_t num_axes, const std::vector& target_dims, size_t num_blocks, +static bool TypedDoTransposeEltWise(int64_t num_axes, const std::vector& target_dims, size_t num_blocks, const std::vector& stride, const uint8_t* source, uint8_t* target) { - MultiIndex mindex; - IncrementIndexAndComputeOffsetSetup(mindex, num_axes, target_dims, stride, sizeof(T)); + constexpr bool enabled = utils::HasTypeWithSameSize(); - const uint8_t* local_source = source; - uint8_t* target_end = target + sizeof(T) * num_blocks; - for (; target != target_end; target += sizeof(T)) { - ORT_ENFORCE((local_source >= source) && (local_source < source + sizeof(T) * num_blocks)); - CopyPrim(target, local_source); - IncrementIndexAndComputeOffset(mindex, local_source); + if (enabled) { + MultiIndex mindex; + IncrementIndexAndComputeOffsetSetup(mindex, num_axes, target_dims, stride, sizeof(T)); + + const uint8_t* local_source = source; + uint8_t* target_end = target + sizeof(T) * num_blocks; + for (; target != target_end; target += sizeof(T)) { + ORT_ENFORCE((local_source >= source) && (local_source < source + sizeof(T) * num_blocks)); + CopyPrim(target, local_source); + IncrementIndexAndComputeOffset(mindex, local_source); + } } + + return enabled; } // DoTransposeEltWise: specialization of DoTranspose for the num_elts_in_block=1 case. // copies source tensor to target, transposing elements. // The stride vector indicates the transposition. 
-void DoTransposeEltWise(int64_t num_axes, const std::vector& target_dims, size_t num_blocks, - const std::vector& stride, const uint8_t* source, uint8_t* target, - size_t element_size) { +Status DoTransposeEltWise(int64_t num_axes, const std::vector& target_dims, size_t num_blocks, + const std::vector& stride, const uint8_t* source, uint8_t* target, + size_t element_size) { + bool enabled = false; switch (element_size) { case sizeof(uint64_t): - TypedDoTransposeEltWise(num_axes, target_dims, num_blocks, stride, source, target); + enabled = TypedDoTransposeEltWise(num_axes, target_dims, num_blocks, stride, source, target); break; case sizeof(uint32_t): - TypedDoTransposeEltWise(num_axes, target_dims, num_blocks, stride, source, target); + enabled = TypedDoTransposeEltWise(num_axes, target_dims, num_blocks, stride, source, target); break; case sizeof(uint16_t): - TypedDoTransposeEltWise(num_axes, target_dims, num_blocks, stride, source, target); + enabled = TypedDoTransposeEltWise(num_axes, target_dims, num_blocks, stride, source, target); break; case sizeof(uint8_t): - TypedDoTransposeEltWise(num_axes, target_dims, num_blocks, stride, source, target); + enabled = TypedDoTransposeEltWise(num_axes, target_dims, num_blocks, stride, source, target); break; default: - assert(false); + // leave enabled as false + break; } + + return enabled ? Status::OK() + : ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Transpose of element size not supported in this build. 
Size=", + element_size); } static void DoTransposeEltWise(int64_t num_axes, const std::vector& target_dims, size_t num_blocks, @@ -243,17 +273,25 @@ static Status DoUntypedTranspose(const std::vector& permutations, const } } + Status status = Status::OK(); + if (is_string_type) { - const auto* input_data = input.template Data(); - auto* output_data = output.template MutableData(); - if (1 == prefix_blocksize) { - DoTransposeSingleBlock(suffix_blocksize, input_data, output_data); - } else if (1 == suffix_blocksize) { - DoTransposeEltWise(num_axes_in_prefix, output.Shape().GetDims(), prefix_blocksize, stride, - input_data, output_data); + constexpr bool string_enabled = utils::HasType(); + + if (string_enabled) { + const auto* input_data = input.template Data(); + auto* output_data = output.template MutableData(); + if (1 == prefix_blocksize) { + DoTransposeSingleBlock(suffix_blocksize, input_data, output_data); + } else if (1 == suffix_blocksize) { + DoTransposeEltWise(num_axes_in_prefix, output.Shape().GetDims(), prefix_blocksize, stride, + input_data, output_data); + } else { + DoTransposeImpl(num_axes_in_prefix, output.Shape().GetDims(), prefix_blocksize, suffix_blocksize, stride, + input_data, output_data); + } } else { - DoTransposeImpl(num_axes_in_prefix, output.Shape().GetDims(), prefix_blocksize, suffix_blocksize, stride, - input_data, output_data); + status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Transpose of std::string is not supported in this build."); } } else { const auto* input_data = reinterpret_cast(input.DataRaw()); @@ -261,15 +299,16 @@ static Status DoUntypedTranspose(const std::vector& permutations, const if (1 == prefix_blocksize) { DoTransposeSingleBlock(suffix_blocksize, input_data, output_data, element_size); } else if (1 == suffix_blocksize) { - DoTransposeEltWise(num_axes_in_prefix, output.Shape().GetDims(), prefix_blocksize, stride, - input_data, output_data, element_size); + // this may return a failed status if the data size is not 
supported in this build + status = DoTransposeEltWise(num_axes_in_prefix, output.Shape().GetDims(), prefix_blocksize, stride, + input_data, output_data, element_size); } else { DoTransposeImpl(num_axes_in_prefix, output.Shape().GetDims(), prefix_blocksize, suffix_blocksize, stride, input_data, output_data, element_size); } } - return Status::OK(); + return status; } /* @@ -686,13 +725,13 @@ ONNX_CPU_OPERATOR_VERSIONED_KERNEL( Transpose, 1, 12, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::AllTensorTypes()), + KernelDefBuilder().TypeConstraint("T", dataTypeConstraints), Transpose); ONNX_CPU_OPERATOR_KERNEL( Transpose, 13, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::AllTensorTypes()), + KernelDefBuilder().TypeConstraint("T", dataTypeConstraints), Transpose); } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/tensor/transpose.h b/onnxruntime/core/providers/cpu/tensor/transpose.h index 341975d475..c003b2e8f2 100644 --- a/onnxruntime/core/providers/cpu/tensor/transpose.h +++ b/onnxruntime/core/providers/cpu/tensor/transpose.h @@ -16,9 +16,10 @@ namespace onnxruntime { */ bool IsTransposeReshape(const std::vector& perm, const std::vector& input_dims); -void DoTransposeEltWise(int64_t num_axes, const std::vector& target_dims, size_t num_blocks, - const std::vector& stride, const uint8_t* source, uint8_t* target, - size_t element_size); +// Public function for element-wise transpose, primarily to unit test any out of bounds access +Status DoTransposeEltWise(int64_t num_axes, const std::vector& target_dims, size_t num_blocks, + const std::vector& stride, const uint8_t* source, uint8_t* target, + size_t element_size); class TransposeBase { public: diff --git a/onnxruntime/core/providers/op_kernel_type_control.h b/onnxruntime/core/providers/op_kernel_type_control.h index c61c0381d4..d70ca7600b 100644 --- a/onnxruntime/core/providers/op_kernel_type_control.h +++ b/onnxruntime/core/providers/op_kernel_type_control.h @@ -62,14 +62,15 @@ 
struct GlobalAllowed {}; } // namespace tags // optionally holds a list of types associated with a tag class -// if types are defined, the data member 'types' should contain them in a type list -// otherwise, if no types are defined (distinct from an empty list of types), there should be no data member 'types' +// if types are defined, the type alias member called 'types' should contain them in a type list +// (e.g. using something like std::tuple or a boost::mp11::mp_list) +// otherwise, if no types are defined (distinct from an empty list of types), there should be no 'types' type alias // see the tags in onnxruntime::op_kernel_type_control::tags for intended uses template struct TypesHolder {}; /** - * Provides a type list of enabled types via the 'types' data member. + * Provides a type list of enabled types via the 'types' type alias member. * Enabled types are the set intersection of supported and allowed types. * * @tparam SupportedTypesHolder A 'TypesHolder' with a list of supported types. 
@@ -84,15 +85,15 @@ struct EnabledTypes { template using GetTypesMember = typename T::types; - // checks whether T has data member 'types' + // checks whether T has a type alias member called 'types' template using HasTypesMember = boost::mp11::mp_valid; static_assert(HasTypesMember::value, - "SupportedTypesHolder must have a 'types' data member."); + "SupportedTypesHolder must have a type alias called 'types'."); // the allowed type lists to consider - // for each element of AllowedTypesHolders, get and include a 'types' data member if present + // for each element of AllowedTypesHolders, get and include the 'types' type alias member if present using AllowedTypesMembers = boost::mp11::mp_transform< GetTypesMember, @@ -105,17 +106,10 @@ struct EnabledTypes { boost::mp11::mp_push_front>; static_assert(boost::mp11::mp_all_of::value, - "All 'types' data members must be type lists."); - - // converts type list L into a type set (type list with unique elements) - template - using MakeSet = - boost::mp11::mp_apply< - boost::mp11::mp_set_push_back, - boost::mp11::mp_append>, L>>; + "All 'types' type aliases must be type lists."); // type lists converted to type sets - using TypeSetsToConsider = boost::mp11::mp_transform; + using TypeSetsToConsider = boost::mp11::mp_transform; public: using types = boost::mp11::mp_apply; @@ -237,37 +231,6 @@ struct EnabledTypes { ::onnxruntime::op_kernel_type_control::kAllOpSets, \ ArgDirection, ArgIndex) -/** - * std::tuple type with the enabled types for a given Op kernel argument. - * - * @param OpProvider The Op provider. - * @param OpDomain The Op domain. - * @param OpName The Op name. - * @param OpSet The opset to use for the supported types list. - * @param ArgDirection Direction of the given Op kernel argument - Input or Output. - * @param ArgIndex Index of the given Op kernel argument. 
- */ -#define ORT_OP_KERNEL_ARG_ENABLED_TYPE_TUPLE( \ - OpProvider, OpDomain, OpName, OpSet, ArgDirection, ArgIndex) \ - ::boost::mp11::mp_rename< \ - ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST( \ - OpProvider, OpDomain, OpName, OpSet, ArgDirection, ArgIndex, SupportedTypeList), \ - std::tuple> - -/** - * std::tuple type with the enabled types for a given Op kernel argument that is valid for all opsets. - * - * @param OpProvider The Op provider. - * @param OpDomain The Op domain. - * @param OpName The Op name. - * @param ArgDirection Direction of the given Op kernel argument - Input or Output. - * @param ArgIndex Index of the given Op kernel argument. - */ -#define ORT_OP_KERNEL_ARG_ENABLED_TYPE_TUPLE_ALL_OPSETS( \ - OpProvider, OpDomain, OpName, ArgDirection, ArgIndex) \ - ORT_OP_KERNEL_ARG_ENABLED_TYPE_TUPLE(OpProvider, OpDomain, OpName, \ - ::onnxruntime::op_kernel_type_control::kAllOpSets, \ - ArgDirection, ArgIndex) /** * Usage example: * @@ -277,7 +240,7 @@ struct EnabledTypes { * namespace op_kernel_type_control { * // specify supported types, i.e., the full set of types that can be enabled * ORT_SPECIFY_OP_KERNEL_ARG_SUPPORTED_TYPES( - * MyProvider, DomainContainingMyOp, MyOp, Input, 0, + * MyProvider, DomainContainingMyOp, MyOp, OpSet, Input, 0, * int, float, double); * } // namespace op_kernel_type_control * } // namespace onnxruntime diff --git a/onnxruntime/core/providers/op_kernel_type_control_utils.h b/onnxruntime/core/providers/op_kernel_type_control_utils.h new file mode 100644 index 0000000000..3e1354d43a --- /dev/null +++ b/onnxruntime/core/providers/op_kernel_type_control_utils.h @@ -0,0 +1,50 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "boost/mp11.hpp" + +#include "core/framework/data_types.h" + +namespace onnxruntime { +namespace utils { +/** +* Check if the set of types contains the specified type. 
+*/ +template +constexpr bool HasType() { + static_assert(boost::mp11::mp_is_set::value, "TypeSet must be a type set."); + + return boost::mp11::mp_set_contains::value; +} + +template +using SizeOfT = boost::mp11::mp_size_t; + +/** +* Check if the set of types contains a type with the same size as T. +* +* @remarks e.g. will return true if T is int32_t and the list contains any 4 byte type (i.e. sizeof(int32_t)) +* such as int32_t, uint32_t or float. +*/ +template +constexpr bool HasTypeWithSameSize() { + static_assert(boost::mp11::mp_is_set::value, "TypeSet must be a type set."); + + using EnabledTypeSizes = boost::mp11::mp_unique>; + return boost::mp11::mp_set_contains>::value; +} + +} // namespace utils +} // namespace onnxruntime + +/** Data types that are used in DataTypeImpl::AllTensorTypes() +*/ +#define ORT_OP_KERNEL_TYPE_CTRL_ALL_TENSOR_DATA_TYPES \ + bool, \ + float, double, \ + uint8_t, uint16_t, uint32_t, uint64_t, \ + int8_t, int16_t, int32_t, int64_t, \ + MLFloat16, BFloat16, \ + std::string diff --git a/onnxruntime/test/providers/cpu/tensor/transpose_test.cc b/onnxruntime/test/providers/cpu/tensor/transpose_test.cc index ae2fed48f9..f929c781e2 100644 --- a/onnxruntime/test/providers/cpu/tensor/transpose_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/transpose_test.cc @@ -5,6 +5,7 @@ #include "test/providers/provider_test_utils.h" #include "test/providers/compare_provider_test_utils.h" #include "core/providers/cpu/tensor/transpose.h" +#include "test/util/include/asserts.h" namespace onnxruntime { namespace test { @@ -560,9 +561,9 @@ TEST(TransposeOpTest, DoTransposeEltWise) { 13.0f, 15.0f, 14.0f, 16.0f, 17.0f, 17.0f}; - DoTransposeEltWise(input_shape.size(), input_shape, 16, - stride, (uint8_t*)input_vals_end.data(), (uint8_t*)target.data(), - sizeof(float)); + ASSERT_STATUS_OK(DoTransposeEltWise(input_shape.size(), input_shape, 16, + stride, (uint8_t*)input_vals_end.data(), (uint8_t*)target.data(), + sizeof(float))); for (size_t i = 0; i < 
input_vals_end.size(); ++i) { ASSERT_TRUE(target[i] == expected_vals3[i]); } diff --git a/onnxruntime/test/testdata/mnist.readme.txt b/onnxruntime/test/testdata/mnist.readme.txt index 8d3bd63e37..cc370ab542 100644 --- a/onnxruntime/test/testdata/mnist.readme.txt +++ b/onnxruntime/test/testdata/mnist.readme.txt @@ -1,4 +1,4 @@ -The mnist model is used in a multiple tests for minimal/mobile builds in both ONNX and ORT formats. +The mnist model is used in multiple tests for minimal/mobile builds in both ONNX and ORT formats. We also save both ONNX and ORT format versions of the model with level 1 (aka 'basic') optimizations applied. - mnist.level1_opt.onnx makes sure the required operators for this model are automatically included in diff --git a/tools/python/util/ort_format_model/operator_type_usage_processors.py b/tools/python/util/ort_format_model/operator_type_usage_processors.py index 3c179e3355..8367900532 100644 --- a/tools/python/util/ort_format_model/operator_type_usage_processors.py +++ b/tools/python/util/ort_format_model/operator_type_usage_processors.py @@ -109,13 +109,18 @@ class DefaultTypeUsageProcessor(TypeUsageProcessor): def process_node(self, node: fbs.Node, value_name_to_typeinfo: dict): for i in self._input_types.keys(): if i >= node.InputsLength(): - raise RuntimeError('Node has {} inputs. Tracker for {} incorrectly configured as it requires {}.' - .format(node.InputsLength(), self.name, i)) - - type_str = value_name_to_typestr(node.Inputs(i), value_name_to_typeinfo) - self._input_types[i].add(type_str) + # Some operators have fewer inputs in earlier versions where data that was as an attribute + # become an input in later versions to allow it to be dynamically provided. Allow for that. + # e.g. Slice-1 had attributes for the indices, and Slice-10 moved those to be inputs + # raise RuntimeError('Node has {} outputs. Tracker for {} incorrectly configured as it requires {}.' 
+ # .format(node.OutputsLength(), self.name, o)) + pass + else: + type_str = value_name_to_typestr(node.Inputs(i), value_name_to_typeinfo) + self._input_types[i].add(type_str) for o in self._output_types.keys(): + # Don't know of any ops where the number of outputs changed across versions, so require a valid length if o >= node.OutputsLength(): raise RuntimeError('Node has {} outputs. Tracker for {} incorrectly configured as it requires {}.' .format(node.OutputsLength(), self.name, o)) @@ -127,7 +132,7 @@ class DefaultTypeUsageProcessor(TypeUsageProcessor): if 0 not in self._input_types.keys(): # currently all standard typed registrations are for input 0. # custom registrations can be handled by operator specific processors (e.g. OneHotProcessor below). - raise RuntimeError('Expected typed registration to use type from input 0.') + raise RuntimeError('Expected typed registration to use type from input 0. Node:{}'.format(self.name)) return type_in_registration in self._input_types[0] @@ -254,8 +259,8 @@ def _create_operator_type_usage_processors(): # - some known large kernels # # Ops we are ignoring currently so as not to produce meaningless/unused output: - # - Implementation is not type specific: - # If, Loop, Reshape, Scan, Shape, Squeeze, Unsqueeze + # - Implementation is type agnostic: + # DynamicQuantizeMatMul, If, Loop, Reshape, Scan, Shape, Squeeze, Unsqueeze # - Only one type supported in the ORT implementation: # FusedConv, FusedGemm, FusedMatMul, TransposeMatMul # - Implementation does not have any significant type specific code: @@ -264,7 +269,7 @@ def _create_operator_type_usage_processors(): 'DequantizeLinear', 'Div', 'Equal', 'Exp', 'Expand', 'Gemm', 'Greater', 'Less', 'MatMul', 'Max', 'Min', 'Mul', 'NonMaxSuppression', 'NonZero', 'Pad', 'Range', 'Relu', 'Resize', - 'Sigmoid', 'Slice', 'Softmax', 'Split', 'Sub', 'Tile', 'TopK', 'Transpose'] + 'Sigmoid', 'Softmax', 'Split', 'Sub', 'Tile', 'TopK', 'Transpose'] internal_ops = ['QLinearAdd', 
'QLinearMul'] @@ -286,12 +291,15 @@ def _create_operator_type_usage_processors(): # # Operators that require custom handling # - add(DefaultTypeUsageProcessor('ai.onnx', 'Cast', inputs=[0], outputs=[0])) # track input0 and output0 + + # Cast switches on types of input 0 and output 0 + add(DefaultTypeUsageProcessor('ai.onnx', 'Cast', inputs=[0], outputs=[0])) # Operators that switch on the type of input 0 and 1 add(DefaultTypeUsageProcessor('ai.onnx', 'Gather', inputs=[0, 1])) add(DefaultTypeUsageProcessor('ai.onnx', 'GatherElements', inputs=[0, 1])) add(DefaultTypeUsageProcessor('ai.onnx', 'Pow', inputs=[0, 1])) + add(DefaultTypeUsageProcessor('ai.onnx', 'Slice', inputs=[0, 1])) # Operators that switch on output type add(DefaultTypeUsageProcessor('ai.onnx', 'ConstantOfShape', inputs=[], outputs=[0])) From f14c621c103e1880aa12abd5d148c98318741f5f Mon Sep 17 00:00:00 2001 From: Hariharan Seshadri Date: Fri, 5 Feb 2021 09:44:27 +0530 Subject: [PATCH 20/41] Tile perf enhancements - continued (#6561) --- onnxruntime/core/providers/cpu/tensor/tile.cc | 75 +++++++++++++--- onnxruntime/core/providers/cpu/tensor/tile.h | 16 +++- .../core/providers/cuda/tensor/tile.cc | 90 ++++++++++++++----- .../core/providers/cuda/tensor/tile_impl.cu | 36 +++++++- .../core/providers/cuda/tensor/tile_impl.h | 9 ++ .../test/providers/cpu/tensor/tile_op_test.cc | 24 +++++ 6 files changed, 211 insertions(+), 39 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/tile.cc b/onnxruntime/core/providers/cpu/tensor/tile.cc index 9f26e4e88a..73f5122aeb 100644 --- a/onnxruntime/core/providers/cpu/tensor/tile.cc +++ b/onnxruntime/core/providers/cpu/tensor/tile.cc @@ -100,20 +100,32 @@ Status TileCoreForFixedSizeTypes(const Tensor& input_tensor, Tensor& output_tens } namespace TileOp { -// Find the first non-1 repeat and check the input shape to the left of that dimension, -// if the dim values are 1, then the tiling logic is essentially copying the input buffer -// multiple times. 
The number of times can be computed as the product of the repeat values. +// Find the first non-1 repeat and check the input shape to the left of that dimension: +// 1) If the dim values to the left are all 1s (or don't exist), then the tiling logic is essentially copying the input buffer +// multiple times. The number of times can be computed as the product of the repeat values. (OR) +// 2) Allow at-most one non-1 dim value to the left (for the batch dimension), in this case, the sub-tensor at each batch index +// is copied multiple times. This is still faster because it avoids other Tile operator's machinery. bool IsTileMemcpy(const TensorShape& input_shape, const int64_t* repeats, size_t rank, - /*out*/ size_t& num_of_copies) { + /*out*/ bool& is_batched_memcpy, + /*out*/ size_t& num_of_elements_per_batch, + /*out*/ size_t& num_of_copies_per_batch, + /*out*/ size_t& num_of_batch_copies) { for (int64_t i = static_cast(rank) - 1; i >= 0; --i) { if (repeats[i] != 1) { if (input_shape.SizeToDimension(i) == 1) { - num_of_copies = 1; + num_of_copies_per_batch = 1; for (int64_t j = 0; j <= i; ++j) { - num_of_copies *= repeats[j]; + num_of_copies_per_batch *= repeats[j]; } + is_batched_memcpy = false; + return true; + } else if (i == 1) { // else check if the previous dim is just the batch dim + num_of_elements_per_batch = static_cast(input_shape.SizeFromDimension(1)); + num_of_copies_per_batch = repeats[i]; + num_of_batch_copies = repeats[0]; + is_batched_memcpy = true; return true; } else { break; @@ -166,20 +178,57 @@ Status Tile::Compute(OpKernelContext* ctx) const { return Status::OK(); } - size_t num_of_copies = 1; - if (TileOp::IsTileMemcpy(input_shape, repeats, input_rank, num_of_copies)) { + bool is_batched_memcpy = false; + size_t num_of_elements_per_batch = 1; + size_t num_of_copies_per_batch = 1; + size_t num_of_batch_copies = 1; + if (TileOp::IsTileMemcpy(input_shape, + repeats, + input_rank, + is_batched_memcpy, + num_of_elements_per_batch, + 
num_of_copies_per_batch, + num_of_batch_copies)) { // TODO: Handle string copies when the kernel eventually supports string type. // For now, it shouldn't throw in the enforce as the kernel doesn't claim string support ORT_ENFORCE(!input_tensor.IsDataType(), "Tile doesn't support string type yet"); int8_t* output_data_casted = reinterpret_cast(output_tensor.MutableDataRaw()); + const int8_t* input_data_casted = reinterpret_cast(input_tensor.DataRaw()); const void* input_data_raw = input_tensor.DataRaw(); - size_t tensor_size_in_bytes = input_tensor.SizeInBytes(); - // TODO: Add multi-threading logic if num_of_copies is large enough - for (size_t i = 0; i < num_of_copies; ++i) { - memcpy(static_cast(output_data_casted), input_data_raw, tensor_size_in_bytes); - output_data_casted += tensor_size_in_bytes; + if (!is_batched_memcpy) { + size_t copy_bytes = input_tensor.SizeInBytes(); + // TODO: Add multi-threading logic if num_of_copies_per_batch is large enough + for (size_t i = 0; i < num_of_copies_per_batch; ++i) { + memcpy(static_cast(output_data_casted), input_data_raw, copy_bytes); + output_data_casted += copy_bytes; + } + } else { + size_t copy_bytes = num_of_elements_per_batch * input_tensor.DataType()->Size(); + size_t batch_count = static_cast(input_tensor.Shape()[0]); // The tensor is atleast 1-D- this is safe + + // TODO: Multi-thread if needed + for (size_t batch = 0; batch < batch_count; ++batch) { + for (size_t i = 0; i < num_of_copies_per_batch; ++i) { + memcpy(static_cast(output_data_casted), static_cast(input_data_casted), copy_bytes); + output_data_casted += copy_bytes; + } + input_data_casted += copy_bytes; + } + + // Now account for batch dim repeat + if (num_of_batch_copies > 1) { + // reset some values + output_data_casted = reinterpret_cast(output_tensor.MutableDataRaw()); + copy_bytes *= num_of_copies_per_batch * batch_count; + int8_t* copy_ptr = output_data_casted + copy_bytes; + + for (size_t i = 1; i < num_of_batch_copies; ++i) { + 
memcpy(static_cast(copy_ptr), static_cast(output_data_casted), copy_bytes); + copy_ptr += copy_bytes; + } + } } return Status::OK(); diff --git a/onnxruntime/core/providers/cpu/tensor/tile.h b/onnxruntime/core/providers/cpu/tensor/tile.h index 74d8ee7ded..093ee74969 100644 --- a/onnxruntime/core/providers/cpu/tensor/tile.h +++ b/onnxruntime/core/providers/cpu/tensor/tile.h @@ -14,10 +14,24 @@ namespace TileOp { // repeats: [1, 200, 1] // output shape: [1, 200, 256 * 50] +// As a slight extension, it also supports "batched" multiple copies of the input data buffer +// (`is_batched_memcpy` will be set to true) +// E.g.: input_shape: [5, 1, 256 * 50] +// repeats: [1, 200, 1] +// output shape: [5, 200, 256 * 50] + +// Repeating the batch is also supported +// E.g.: input_shape: [5, 1, 256 * 50] +// repeats: [2, 200, 1] +// output shape: [10, 200, 256 * 50] + bool IsTileMemcpy(const TensorShape& input_shape, const int64_t* repeats, size_t rank, - /*out*/ size_t& num_of_copies); + /*out*/ bool& is_batched_memcpy, + /*out*/ size_t& num_of_elements_per_batch, + /*out*/ size_t& num_of_copies_per_batch, + /*out*/ size_t& num_of_batch_copies); } // namespace TileOp struct Tile : OpKernel { diff --git a/onnxruntime/core/providers/cuda/tensor/tile.cc b/onnxruntime/core/providers/cuda/tensor/tile.cc index 5aac0d0430..7b68b19ce3 100644 --- a/onnxruntime/core/providers/cuda/tensor/tile.cc +++ b/onnxruntime/core/providers/cuda/tensor/tile.cc @@ -76,31 +76,73 @@ Status Tile::ComputeInternal(OpKernelContext* ctx) const { return Status::OK(); } - size_t num_of_copies = 1; - if (TileOp::IsTileMemcpy(input_shape, repeats, rank, num_of_copies)) { - if (input_tensor.IsDataType() || - input_tensor.IsDataType()) { - TileMemcpyImpl( - reinterpret_cast::MappedType*>(input_data), - input_shape.Size(), - reinterpret_cast::MappedType*>(output_data), - output_shape.Size()); - } else if (input_tensor.IsDataType() || - input_tensor.IsDataType()) { - TileMemcpyImpl( - 
reinterpret_cast::MappedType*>(input_data), - input_shape.Size(), - reinterpret_cast::MappedType*>(output_data), - output_shape.Size()); - } else if (input_tensor.IsDataType()) { - TileMemcpyImpl( - reinterpret_cast::MappedType*>(input_data), - input_shape.Size(), - reinterpret_cast::MappedType*>(output_data), - output_shape.Size()); + bool is_batched_memcpy = false; + size_t num_of_elements_per_batch = 1; + size_t num_of_copies_per_batch = 1; + size_t num_of_batch_copies = 1; + if (TileOp::IsTileMemcpy(input_shape, + repeats, + rank, + is_batched_memcpy, + num_of_elements_per_batch, + num_of_copies_per_batch, + num_of_batch_copies)) { + if (!is_batched_memcpy) { + if (input_tensor.IsDataType() || + input_tensor.IsDataType()) { + TileMemcpyImpl( + reinterpret_cast::MappedType*>(input_data), + input_shape.Size(), + reinterpret_cast::MappedType*>(output_data), + output_shape.Size()); + } else if (input_tensor.IsDataType() || + input_tensor.IsDataType()) { + TileMemcpyImpl( + reinterpret_cast::MappedType*>(input_data), + input_shape.Size(), + reinterpret_cast::MappedType*>(output_data), + output_shape.Size()); + } else if (input_tensor.IsDataType()) { + TileMemcpyImpl( + reinterpret_cast::MappedType*>(input_data), + input_shape.Size(), + reinterpret_cast::MappedType*>(output_data), + output_shape.Size()); + } else { + // Won't hit this as the kernel doesn't claim support for any type that will trigger this + ORT_THROW("Tile doesn't have an implementation yet for the type: ", input_tensor.DataType()); + } } else { - // Won't hit this as the kernel doesn't claim support for any type that will trigger this - ORT_THROW("Tile doesn't have an implementation yet for the type: ", input_tensor.DataType()); + if (input_tensor.IsDataType() || + input_tensor.IsDataType()) { + TileBatchedMemcpyImpl( + reinterpret_cast::MappedType*>(input_data), + num_of_elements_per_batch, + input_shape[0], // The tensor is atleast 1-D- this is safe + 
fast_divmod(static_cast(num_of_elements_per_batch * num_of_copies_per_batch)), + reinterpret_cast::MappedType*>(output_data), + output_shape.Size()); + } else if (input_tensor.IsDataType() || + input_tensor.IsDataType()) { + TileBatchedMemcpyImpl( + reinterpret_cast::MappedType*>(input_data), + num_of_elements_per_batch, + input_shape[0], // The tensor is atleast 1-D- this is safe + fast_divmod(static_cast(num_of_elements_per_batch * num_of_copies_per_batch)), + reinterpret_cast::MappedType*>(output_data), + output_shape.Size()); + } else if (input_tensor.IsDataType()) { + TileBatchedMemcpyImpl( + reinterpret_cast::MappedType*>(input_data), + num_of_elements_per_batch, + input_shape[0], // The tensor is atleast 1-D- this is safe + fast_divmod(static_cast(num_of_elements_per_batch * num_of_copies_per_batch)), + reinterpret_cast::MappedType*>(output_data), + output_shape.Size()); + } else { + // Won't hit this as the kernel doesn't claim support for any type that will trigger this + ORT_THROW("Tile doesn't have an implementation yet for the type: ", input_tensor.DataType()); + } } return Status::OK(); diff --git a/onnxruntime/core/providers/cuda/tensor/tile_impl.cu b/onnxruntime/core/providers/cuda/tensor/tile_impl.cu index 33696d1a26..a66db85a2f 100644 --- a/onnxruntime/core/providers/cuda/tensor/tile_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/tile_impl.cu @@ -67,9 +67,43 @@ void TileMemcpyImpl( input_data, num_input_elements, output_data, (CUDA_LONG)num_output_elements); } +template +__global__ void _TileBatchedMemcpyKernel( + const T* input_data, + const size_t num_of_elements_per_input_batch, + const size_t num_input_batch_count, + const fast_divmod num_of_elements_per_output_batch, + T* output_data, + const size_t N) { + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); + CUDA_LONG batch_idx = 0; + CUDA_LONG element_idx = 0; + num_of_elements_per_output_batch.divmod(id, batch_idx, element_idx); + output_data[id] = input_data[(batch_idx % 
num_input_batch_count) * num_of_elements_per_input_batch + (element_idx % num_of_elements_per_input_batch)]; +} + +template +void TileBatchedMemcpyImpl( + const T* input_data, + const size_t num_of_elements_per_input_batch, + const size_t num_input_batch_count, + const fast_divmod& num_of_elements_per_output_batch, + T* output_data, + const size_t num_output_elements) { + int blocksPerGrid = (int)(ceil(static_cast(num_output_elements) / GridDim::maxThreadsPerBlock)); + _TileBatchedMemcpyKernel<<>>( + input_data, + num_of_elements_per_input_batch, + num_input_batch_count, + num_of_elements_per_output_batch, + output_data, + (CUDA_LONG)num_output_elements); +} + #define SPECIALIZED_IMPL(T) \ template void TileImpl(const size_t shape_rank, const TArray& fdm_input_shape, const TArray& input_stride, const T* input_data, const TArray& fdm_output_strides, T* output_data, const size_t N); \ - template void TileMemcpyImpl(const T* input_data, const size_t num_input_elements, T* output_data, const size_t num_output_elements); + template void TileMemcpyImpl(const T* input_data, const size_t num_input_elements, T* output_data, const size_t num_output_elements); \ + template void TileBatchedMemcpyImpl(const T* input_data, const size_t num_of_elements_per_input_batch, const size_t num_input_batch_count, const fast_divmod& num_of_elements_per_output_batch, T* output_data, const size_t num_output_elements); SPECIALIZED_IMPL(float) SPECIALIZED_IMPL(double) diff --git a/onnxruntime/core/providers/cuda/tensor/tile_impl.h b/onnxruntime/core/providers/cuda/tensor/tile_impl.h index cfe5391073..27404c8d39 100644 --- a/onnxruntime/core/providers/cuda/tensor/tile_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/tile_impl.h @@ -25,5 +25,14 @@ void TileMemcpyImpl( T* output_data, const size_t num_output_elements); +template +void TileBatchedMemcpyImpl( + const T* input_data, + const size_t num_of_elements_per_input_batch, + const size_t num_input_batch_count, + const fast_divmod& 
num_of_elements_per_output_batch, + T* output_data, + const size_t num_output_elements); + } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/tensor/tile_op_test.cc b/onnxruntime/test/providers/cpu/tensor/tile_op_test.cc index 4d5c2384f7..e7636fbba1 100644 --- a/onnxruntime/test/providers/cpu/tensor/tile_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/tile_op_test.cc @@ -48,10 +48,21 @@ void RunTestWrapper() { RunTest({111, 112, 113, 122, 123, 124}, {2, 1, 3}, {1, 1, 1}, {3}, {111, 112, 113, 122, 123, 124}, {2, 1, 3}); // TileWhichIsBasicallyCopiesOfInputBuffer - 1 + // This will trigger the MemCpy optimization path RunTest({111, 112, 113}, {1, 1, 3}, {2, 2, 1}, {3}, {111, 112, 113, 111, 112, 113, 111, 112, 113, 111, 112, 113}, {2, 2, 3}); // TileWhichIsBasicallyCopiesOfInputBuffer - 2 + // This will trigger the MemCpy optimization path RunTest({111, 112, 113}, {1, 1, 3}, {3, 1, 1}, {3}, {111, 112, 113, 111, 112, 113, 111, 112, 113}, {3, 1, 3}); + + // TileWhichIsBasicallyCopiesOfInputBuffer - 3 (batch > 1 and batch_repeat == 1) + // This will trigger the (Batched) MemCpy optimization path + RunTest({111, 112, 113, 11, 12, 13}, {2, 1, 3}, {1, 2, 1}, {3}, {111, 112, 113, 111, 112, 113, 11, 12, 13, 11, 12, 13}, {2, 2, 3}); + + // TileWhichIsBasicallyCopiesOfInputBuffer - 3 (batch > 1 and batch_repeat > 1) + // This will trigger the (Batched) MemCpy optimization path + RunTest({111, 112, 113, 11, 12, 13}, {2, 1, 3}, {2, 2, 1}, {3}, + {111, 112, 113, 111, 112, 113, 11, 12, 13, 11, 12, 13, 111, 112, 113, 111, 112, 113, 11, 12, 13, 11, 12, 13}, {4, 2, 3}); } template <> @@ -78,10 +89,23 @@ void RunTestWrapper() { RunTest({true, false, true, false, true, true}, {2, 1, 3}, {1, 1, 1}, {3}, {true, false, true, false, true, true}, {2, 1, 3}); // TileWhichIsBasicallyCopiesOfInputBuffer - 1 + // This will trigger the MemCpy optimization path RunTest({true, false, true}, {1, 1, 3}, {2, 2, 1}, {3}, {true, false, true, true, false, 
true, true, false, true, true, false, true}, {2, 2, 3}); // TileWhichIsBasicallyCopiesOfInputBuffer - 2 + // This will trigger the MemCpy optimization path RunTest({true, false, true}, {1, 1, 3}, {3, 1, 1}, {3}, {true, false, true, true, false, true, true, false, true}, {3, 1, 3}); + + // TileWhichIsBasicallyCopiesOfInputBuffer - 3 (batch > 1 and batch_repeat == 1) + // This will trigger the (Batched) MemCpy optimization path + RunTest({true, false, true, true, false, true}, {2, 1, 3}, {1, 2, 1}, {3}, + {true, false, true, true, false, true, true, false, true, true, false, true}, {2, 2, 3}); + + // TileWhichIsBasicallyCopiesOfInputBuffer - 3 (batch > 1 and batch_repeat > 1) + // This will trigger the (Batched) MemCpy optimization path + RunTest({true, false, true, true, false, true}, {2, 1, 3}, {2, 2, 1}, {3}, + {true, false, true, true, false, true, true, false, true, true, false, true, true, false, true, true, false, true, true, false, true, true, false, true}, + {4, 2, 3}); } TEST(TensorOpTest, TileFloatType) { From c5d2538314d17e2351a0f002ce8a70fede435930 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Fri, 5 Feb 2021 15:10:54 +1000 Subject: [PATCH 21/41] Add more kernels that have typed registrations to the operators we track type usage for. 
(#6565) --- .../operator_type_usage_processors.py | 32 +++++++++++++------ 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/tools/python/util/ort_format_model/operator_type_usage_processors.py b/tools/python/util/ort_format_model/operator_type_usage_processors.py index 8367900532..716fd4c765 100644 --- a/tools/python/util/ort_format_model/operator_type_usage_processors.py +++ b/tools/python/util/ort_format_model/operator_type_usage_processors.py @@ -260,16 +260,31 @@ def _create_operator_type_usage_processors(): # # Ops we are ignoring currently so as not to produce meaningless/unused output: # - Implementation is type agnostic: - # DynamicQuantizeMatMul, If, Loop, Reshape, Scan, Shape, Squeeze, Unsqueeze + # ai.onnx: If, Loop, Reshape, Scan, Shape, Squeeze, Unsqueeze + # com.microsoft: DynamicQuantizeMatMul, MatMulIntegerToFloat # - Only one type supported in the ORT implementation: - # FusedConv, FusedGemm, FusedMatMul, TransposeMatMul + # com.microsoft: FusedConv, FusedGemm, FusedMatMul, TransposeMatMul # - Implementation does not have any significant type specific code: - # Concat, Flatten, Not, QLinearConv, Reshape, Shape, Squeeze, Unsqueeze - default_processor_onnx_ops = ['Add', 'AveragePool', 'BatchNormalization', 'Clip', 'Conv', - 'DequantizeLinear', 'Div', 'Equal', 'Exp', 'Expand', - 'Gemm', 'Greater', 'Less', 'MatMul', 'Max', 'Min', 'Mul', - 'NonMaxSuppression', 'NonZero', 'Pad', 'Range', 'Relu', 'Resize', - 'Sigmoid', 'Softmax', 'Split', 'Sub', 'Tile', 'TopK', 'Transpose'] + # ai.onnx: Concat, Flatten, Not, QLinearConv, Reshape, Shape, Squeeze, Unsqueeze + # + default_processor_onnx_ops = ['Abs', 'Add', 'ArgMax', 'ArgMin', 'AveragePool', + 'BatchNormalization', 'BitShift', + 'Ceil', 'Clip', 'Conv', 'CumSum', + 'DequantizeLinear', 'Div', + 'Equal', 'Exp', 'Expand', + 'Floor', + 'Gemm', 'Greater', + 'IsNaN', + 'Less', 'Log', 'LogSoftmax', 'LpNormalization', + 'MatMul', 'Max', 'Min', 'Mul', + 'Neg', 'NonMaxSuppression', 'NonZero', + 'Pad', + 
'Range', 'Reciprocal', 'ReduceL1', 'ReduceL2', 'ReduceLogSum', 'ReduceLogSumExp', + 'ReduceMax', 'ReduceMean', 'ReduceMin', 'ReduceProd', 'ReduceSum', 'ReduceSumSquare', + 'Relu', 'Resize', 'RoiAlign', 'Round', + 'Sigmoid', 'Sin', 'Softmax', 'Split', 'Sqrt', 'Sub', + 'Tanh', 'Tile', 'TopK', 'Transpose', + 'Where'] internal_ops = ['QLinearAdd', 'QLinearMul'] @@ -303,7 +318,6 @@ def _create_operator_type_usage_processors(): # Operators that switch on output type add(DefaultTypeUsageProcessor('ai.onnx', 'ConstantOfShape', inputs=[], outputs=[0])) - add(DefaultTypeUsageProcessor('com.microsoft', 'DynamicQuantizeMatMul', inputs=[], outputs=[0])) # Random generator ops produce new data so we track the output type onnx_random_ops = ['RandomNormal', 'RandomNormalLike', 'RandomUniform', 'RandomUniformLike', 'Multinomial'] From 3b376da37c0d720991ea45b68a95e0b0853b9037 Mon Sep 17 00:00:00 2001 From: Edward Chen <18449977+edgchen1@users.noreply.github.com> Date: Thu, 4 Feb 2021 23:22:22 -0800 Subject: [PATCH 22/41] Enable type reduction for Gather CPU kernel. (#6579) * Enable type reduction in Gather. 
--- .../core/providers/cpu/tensor/gather.cc | 32 +++++++++++++------ .../core/providers/op_kernel_type_control.h | 8 ++--- .../providers/op_kernel_type_control_utils.h | 4 +-- 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/gather.cc b/onnxruntime/core/providers/cpu/tensor/gather.cc index fdf62b6302..b0fd5dcdf9 100644 --- a/onnxruntime/core/providers/cpu/tensor/gather.cc +++ b/onnxruntime/core/providers/cpu/tensor/gather.cc @@ -5,17 +5,31 @@ #include "core/providers/cpu/tensor/gather.h" #include "core/common/common.h" #include "core/platform/threadpool.h" +#include "core/providers/op_kernel_type_control.h" +#include "core/providers/op_kernel_type_control_utils.h" namespace onnxruntime { +namespace op_kernel_type_control { +ORT_SPECIFY_OP_KERNEL_ARG_SUPPORTED_TYPES_ALL_OPSETS( + kCpuExecutionProvider, kOnnxDomain, Gather, Input, 1, int32_t, int64_t); +} + +namespace { +using EnabledIndexTypes = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST_ALL_OPSETS( + kCpuExecutionProvider, kOnnxDomain, Gather, Input, 1); + +const auto index_type_constraints = + BuildKernelDefConstraintsFunctorFromTypeList{}(); +} // namespace + ONNX_CPU_OPERATOR_VERSIONED_KERNEL( Gather, 1, 10, KernelDefBuilder() .TypeConstraint("T", DataTypeImpl::AllTensorTypes()) - .TypeConstraint("Tind", std::vector{DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType()}), + .TypeConstraint("Tind", index_type_constraints), Gather); ONNX_CPU_OPERATOR_VERSIONED_KERNEL( @@ -24,8 +38,7 @@ ONNX_CPU_OPERATOR_VERSIONED_KERNEL( 12, KernelDefBuilder() .TypeConstraint("T", DataTypeImpl::AllTensorTypes()) - .TypeConstraint("Tind", std::vector{DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType()}), + .TypeConstraint("Tind", index_type_constraints), Gather); ONNX_CPU_OPERATOR_KERNEL( @@ -33,8 +46,7 @@ ONNX_CPU_OPERATOR_KERNEL( 13, KernelDefBuilder() .TypeConstraint("T", DataTypeImpl::AllTensorTypes()) - .TypeConstraint("Tind", 
std::vector{DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType()}), + .TypeConstraint("Tind", index_type_constraints), Gather); Status GatherBase::PrepareForCompute(OpKernelContext* context, Prepare& p) const { @@ -132,16 +144,18 @@ Status Gather::Compute(OpKernelContext* context) const { concurrency::ThreadPool* tp = context->GetOperatorThreadPool(); - if (p.indices_tensor->IsDataType()) { + if (utils::HasTypeWithSameSize() && + p.indices_tensor->IsDataType()) { return GatherCopyData(p.indices_tensor, src_base, dst_base, is_string_type, element_bytes, block_size, M, N, data_batch_bytes, gathered_batch_bytes, input_data_shape, p.axis, tp); } - if (p.indices_tensor->IsDataType()) { + if (utils::HasTypeWithSameSize() && + p.indices_tensor->IsDataType()) { return GatherCopyData(p.indices_tensor, src_base, dst_base, is_string_type, element_bytes, block_size, M, N, data_batch_bytes, gathered_batch_bytes, input_data_shape, p.axis, tp); } - return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "Type for Tind not supported yet in Gather."); + return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "Gather Tind type not supported in this build."); } } // namespace onnxruntime diff --git a/onnxruntime/core/providers/op_kernel_type_control.h b/onnxruntime/core/providers/op_kernel_type_control.h index d70ca7600b..a9030838ac 100644 --- a/onnxruntime/core/providers/op_kernel_type_control.h +++ b/onnxruntime/core/providers/op_kernel_type_control.h @@ -20,7 +20,7 @@ * - Enabled types are the types that are supported in the actual, compiled implementation. They are obtained from the * intersection of supported and allowed types. * - * The types are associated with an Op kernel argument. It is also possible to specify a global list of allowed types. + * The types are associated with an Op argument. It is also possible to specify a global list of allowed types. * * Use of these utilities is optional. 
They are useful for cases where one registered Op kernel handles multiple types. * @@ -239,8 +239,8 @@ struct EnabledTypes { * namespace onnxruntime { * namespace op_kernel_type_control { * // specify supported types, i.e., the full set of types that can be enabled - * ORT_SPECIFY_OP_KERNEL_ARG_SUPPORTED_TYPES( - * MyProvider, DomainContainingMyOp, MyOp, OpSet, Input, 0, + * ORT_SPECIFY_OP_KERNEL_ARG_SUPPORTED_TYPES_ALL_OPSETS( + * MyProvider, DomainContainingMyOp, MyOp, Input, 0, * int, float, double); * } // namespace op_kernel_type_control * } // namespace onnxruntime @@ -249,7 +249,7 @@ struct EnabledTypes { * * // get enabled types * using MyOpFirstInputEnabledTypes = - * ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST(MyProvider, DomainContainingMyOp, MyOp, Input, 0); + * ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST_ALL_OPSETS(MyProvider, DomainContainingMyOp, MyOp, Input, 0); * * // ... * diff --git a/onnxruntime/core/providers/op_kernel_type_control_utils.h b/onnxruntime/core/providers/op_kernel_type_control_utils.h index 3e1354d43a..e79bcdb8ba 100644 --- a/onnxruntime/core/providers/op_kernel_type_control_utils.h +++ b/onnxruntime/core/providers/op_kernel_type_control_utils.h @@ -24,8 +24,8 @@ using SizeOfT = boost::mp11::mp_size_t; /** * Check if the set of types contains a type with the same size as T. -* -* @remarks e.g. will return true if T is int32_t and the list contains any 4 byte type (i.e. sizeof(int32_t)) +* +* @remarks e.g. will return true if T is int32_t and the list contains any 4 byte type (i.e. sizeof(int32_t)) * such as int32_t, uint32_t or float. 
*/ template From f2ce3aae1398490e483f1853d6d41dda4f856c8f Mon Sep 17 00:00:00 2001 From: Chun-Wei Chen Date: Fri, 5 Feb 2021 09:30:49 -0800 Subject: [PATCH 23/41] add set_model_dir and update ONNX (#6119) --- cgmanifests/submodules/cgmanifest.json | 2 +- cmake/external/onnx | 2 +- onnxruntime/core/graph/graph.cc | 3 +++ .../github/linux/docker/scripts/manylinux/requirements.txt | 2 +- tools/ci_build/github/linux/docker/scripts/requirements.txt | 2 +- 5 files changed, 7 insertions(+), 4 deletions(-) diff --git a/cgmanifests/submodules/cgmanifest.json b/cgmanifests/submodules/cgmanifest.json index 4d61c1f44a..3f894e4689 100644 --- a/cgmanifests/submodules/cgmanifest.json +++ b/cgmanifests/submodules/cgmanifest.json @@ -242,7 +242,7 @@ "component": { "type": "git", "git": { - "commitHash": "174de7d086a768cba29374a56a7461eff87cfdb3", + "commitHash": "237926eab41de21fb9addc4b03b751fd6a3343ec", "repositoryUrl": "https://github.com/onnx/onnx" }, "comments": "git submodule at cmake/external/onnx" diff --git a/cmake/external/onnx b/cmake/external/onnx index 174de7d086..237926eab4 160000 --- a/cmake/external/onnx +++ b/cmake/external/onnx @@ -1 +1 @@ -Subproject commit 174de7d086a768cba29374a56a7461eff87cfdb3 +Subproject commit 237926eab41de21fb9addc4b03b751fd6a3343ec diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 038aaa41e7..e9b5225fa5 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -2265,6 +2265,9 @@ Status Graph::VerifyNodeAndOpMatch(const ResolveOptions& options) { ctx.set_ir_version(gsl::narrow_cast(IrVersion())); ctx.set_opset_imports(DomainToVersionMap()); ctx.set_schema_registry(schema_registry_.get()); + // Set the parent directory of model path to load external tensors if exist + ctx.set_model_dir(ToMBString(ModelPath().ParentPath().ToPathString())); + LexicalScopeContext lsc; lsc.output_names.insert(resolve_context_.inputs_and_initializers.cbegin(), diff --git 
a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt index 6812cd3953..9587e53554 100644 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt @@ -3,7 +3,7 @@ mypy pytest setuptools>=41.4.0 wheel -git+http://github.com/onnx/onnx.git@174de7d086a768cba29374a56a7461eff87cfdb3#egg=onnx +git+http://github.com/onnx/onnx.git@237926eab41de21fb9addc4b03b751fd6a3343ec#egg=onnx protobuf sympy==1.1.1 flake8 diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt index aa33c41908..b5eec6ca4a 100644 --- a/tools/ci_build/github/linux/docker/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/requirements.txt @@ -4,7 +4,7 @@ mypy pytest setuptools>=41.4.0 wheel -git+http://github.com/onnx/onnx.git@174de7d086a768cba29374a56a7461eff87cfdb3#egg=onnx +git+http://github.com/onnx/onnx.git@237926eab41de21fb9addc4b03b751fd6a3343ec#egg=onnx argparse sympy==1.1.1 flake8 From 68193e28dedfb0d3847d924c550a4d74646e2bed Mon Sep 17 00:00:00 2001 From: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com> Date: Fri, 5 Feb 2021 12:14:55 -0800 Subject: [PATCH 24/41] Let execution fall back to CPU EP if Compile of a partition on current EP fails (#6580) * Let exccution fall back to CPU EP if compile of a partition fails * Removed debugging logs * Addressed CR comments --- include/onnxruntime/core/graph/graph.h | 9 +- .../core/framework/graph_partitioner.cc | 61 ++++++------ onnxruntime/core/graph/graph.cc | 29 +++++- .../internal_testing_tests.cc | 97 +++++++++++++++++++ 4 files changed, 162 insertions(+), 34 deletions(-) diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index 8ce8e4ff2f..e2d57cfda4 100644 --- a/include/onnxruntime/core/graph/graph.h +++ 
b/include/onnxruntime/core/graph/graph.h @@ -921,11 +921,18 @@ class Graph { @returns Node with fused subgraph. @remarks As a new Graph instance for the fused nodes is not created, a GraphViewer can be constructed with the IndexedSubGraph information to provide a view of the subgraph. The original nodes are left in place - while this is in use. + while this is in use. Call FinalizeFuseSubGraph to remove them once the fused replacement node is fully created. */ Node& BeginFuseSubGraph(const IndexedSubGraph& sub_graph, const std::string& fused_node_name); + /** + If we have BeginFuseSubGraph, but somehow hit errors, such as Compile of an EP failed on the sub_graph. + We can call CancelFuseSubGraph to undo the changes of BeginFuseSubGraph + @param fused_node The fused node and its function body to be removed from the graph + */ + void CancelFuseSubGraph(const Node& fused_node); + void FinalizeFuseSubGraph(const IndexedSubGraph& sub_graph, Node& fused_node); #endif diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc index 9abb3cb6f3..9a17ffe285 100644 --- a/onnxruntime/core/framework/graph_partitioner.cc +++ b/onnxruntime/core/framework/graph_partitioner.cc @@ -448,42 +448,43 @@ static Status PartitionOrtFormatModelImpl(Graph& graph, FuncManager& func_mgr, nodes_and_viewers.push_back(IExecutionProvider::FusedNodeAndGraph{fused_node, *viewers.back()}); } - std::vector node_compute_funcs; - node_compute_funcs.reserve(nodes_and_viewers.size()); - - ORT_RETURN_IF_ERROR(current_ep.Compile(nodes_and_viewers, node_compute_funcs)); - - if (node_compute_funcs.size() != nodes_and_viewers.size()) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, type, " did not return correct number of compiled functions"); - } - - for (size_t j = 0, end = nodes_and_viewers.size(); j < end; j++) { + // We will compile the fused nodes one by one, and fuse the subgraph if successful.
+ // If a compilation fails we undo the fusion and leave the original nodes available for other EPs to take + for (size_t j = 0, end = nodes_and_viewers.size(); j < end; ++j) { Node& node = nodes_and_viewers[j].fused_node; + std::vector single_node_compute_func; + auto status = current_ep.Compile({nodes_and_viewers[j]}, single_node_compute_func); + if (!status.IsOK()) { + // There is compile error with the nodes_and_viewer[j], remove the fused_node and function from the graph + LOGS_DEFAULT(ERROR) << "EP: " << current_ep.Type() << " has Compile error: " << status.ErrorMessage(); + graph.CancelFuseSubGraph(node); + } else { + ORT_RETURN_IF(single_node_compute_func.empty(), "single_node_compute_func should have 1 elements"); + ORT_RETURN_IF_ERROR(func_mgr.AddFuncInfo(node.Name(), std::move(single_node_compute_func[0]))); - ORT_RETURN_IF_ERROR(func_mgr.AddFuncInfo(node.Name(), std::move(node_compute_funcs[j]))); + const auto& cur_capability = capabilities[j]; + const IndexedSubGraph& indexed_sub_graph = *cur_capability->sub_graph; + const IndexedSubGraph::MetaDef& metadef = *indexed_sub_graph.GetMetaDef(); - const auto& cur_capability = capabilities[j]; - const IndexedSubGraph& indexed_sub_graph = *cur_capability->sub_graph; - const IndexedSubGraph::MetaDef& metadef = *indexed_sub_graph.GetMetaDef(); + KernelDefBuilder builder; + BuildFusedKernelDef(builder, metadef, type); + auto kernel_def = builder.Build(); - KernelDefBuilder builder; - BuildFusedKernelDef(builder, metadef, type); - auto kernel_def = builder.Build(); + // save hash so SessionState can find the kernel. each kernel name should be unique + if (compiled_kernel_hashes.insert({metadef.name, kernel_def->GetHash()}).second == false) { + ORT_THROW("Existing entry in compiled kernel hashes for ", metadef.name, + ". Execution Provider must generate unique names across the entire model."); + } - // save hash so SessionState can find the kernel. 
each kernel name should be unique - if (compiled_kernel_hashes.insert({metadef.name, kernel_def->GetHash()}).second == false) { - ORT_THROW("Existing entry in compiled kernel hashes for ", metadef.name, - ". Execution Provider must generate unique names across the entire model."); + ORT_RETURN_IF_ERROR(fused_kernel_registry.Register( + KernelCreateInfo(std::move(kernel_def), static_cast( + [](const OpKernelInfo& info) -> OpKernel* { + return new FunctionKernel(info); + })))); + + // now that we're done compiling we can remove the original nodes from the Graph and wire in the new one + graph.FinalizeFuseSubGraph(indexed_sub_graph, node); } - - ORT_RETURN_IF_ERROR(fused_kernel_registry.Register( - KernelCreateInfo(std::move(kernel_def), static_cast( - [](const OpKernelInfo& info) -> OpKernel* { - return new FunctionKernel(info); - })))); - - // now that we're done compiling we can remove the original nodes from the Graph and wire in the new one - graph.FinalizeFuseSubGraph(indexed_sub_graph, node); } return Status::OK(); diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index e9b5225fa5..8f72f43c0e 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -3392,6 +3392,31 @@ Node& Graph::BeginFuseSubGraph(const IndexedSubGraph& sub_graph, const std::stri return node; } +void Graph::CancelFuseSubGraph(const Node& fused_node) { + auto node_idx = fused_node.Index(); + if (!GetNode(node_idx)) + return; + + if (fused_node.NodeType() != Node::Type::Fused) + return; + +#if !defined(ORT_MINIMAL_BUILD) + // Remove the function body from function container + const auto* fused_node_func = fused_node.GetFunctionBody(); + auto it = std::find_if( + function_container_.begin(), function_container_.end(), + [fused_node_func](const std::unique_ptr& func) { + return func.get() == fused_node_func; + }); + if (it != function_container_.end()) { + function_container_.erase(it); + } +#endif + + // Remove the fused_node + 
RemoveNode(node_idx); +} + void Graph::FinalizeFuseSubGraph(const IndexedSubGraph& sub_graph, Node& fused_node) { const auto* func_meta_def = sub_graph.GetMetaDef(); ORT_ENFORCE(nullptr != func_meta_def); @@ -3432,9 +3457,7 @@ void Graph::FinalizeFuseSubGraph(const IndexedSubGraph& sub_graph, Node& fused_n if (it != input_indexes.cend()) { AddEdge(producer_idx, new_node_idx, src_idx, it->second); } - } - else - { + } else { int dst_implicit_input_idx = dst_idx - (int)node->InputDefs().size(); ORT_ENFORCE(dst_implicit_input_idx < (int)node->ImplicitInputDefs().size()); auto it = input_indexes.find(node->ImplicitInputDefs()[dst_implicit_input_idx]->Name()); diff --git a/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc b/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc index d752d57959..aba2ed93c3 100644 --- a/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc +++ b/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc @@ -254,5 +254,102 @@ TEST(InternalTestingEP, TestModelWithSubgraph) { feeds); } +// A custom InternalTestingEP extension +// This is to testing execution fall back to CPU EP if Compile fails, for ORT format +// This EP will take an additional compile_failure_ops +// If in Compile() any nodes in the partition is also in compile_failure_ops +// The Compile will fail +class CompileFailureTestExecutionProvider : public InternalTestingExecutionProvider { + public: + CompileFailureTestExecutionProvider(const std::unordered_set& supported_ops, + const std::unordered_set& compile_failure_ops); + virtual ~CompileFailureTestExecutionProvider() = default; + + Status Compile(const std::vector& fused_nodes, + std::vector& node_compute_funcs) override; + + private: + std::unordered_set compile_failure_ops_; +}; + +CompileFailureTestExecutionProvider::CompileFailureTestExecutionProvider( + const std::unordered_set& supported_ops, + const std::unordered_set& compile_failure_ops) + : 
InternalTestingExecutionProvider(supported_ops), + compile_failure_ops_(compile_failure_ops) {} + +Status CompileFailureTestExecutionProvider::Compile(const std::vector& fused_nodes, + std::vector& node_compute_funcs) { + for (const auto& fused_node_and_graph : fused_nodes) { + // If any nodes in this partition is also in compile_failure_ops_, the Compile will fail + const onnxruntime::GraphViewer& graph_viewer(fused_node_and_graph.filtered_graph); + for (const auto& node : graph_viewer.Nodes()) { + if (compile_failure_ops_.find(node.OpType()) != compile_failure_ops_.end()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "CompileFailureTestExecutionProvider::Compile failed for node: ", node.Name()); + } + } + } + + return InternalTestingExecutionProvider::Compile(fused_nodes, node_compute_funcs); +} + +TEST(InternalTestingEP, TestOrtModelWithCompileFailure) { + // In the test file, there are 2 Conv and 1 Gemm nodes, all disconnected + // So we should have 3 partitions be taken by InternalTestingExecutionProvider/CompileFailureTestExecutionProvider + // But CompileFailureTestExecutionProvider will fail the Compile for partition contains "Gemm" node + // This is to test the model initialization won't fail and Gemm node will not be replaced by the fused_node + const ORTCHAR_T* ort_model_path = ORT_TSTR("testdata/mnist.ort"); + + const std::unordered_set& supported_ops{"Conv", "Gemm"}; + const std::unordered_set& compile_failure_ops{"Gemm"}; + + // Use InternalTestingExecutionProvider + // We should have 3 partitions taken by the EP + // 2 Conv and 1 Gemm + { + InferenceSessionWrapper session(SessionOptions(), GetEnvironment()); + ASSERT_STATUS_OK(session.RegisterExecutionProvider( + onnxruntime::make_unique(supported_ops))); + ASSERT_STATUS_OK(session.Load(ort_model_path)); + ASSERT_STATUS_OK(session.Initialize()); + + int num_replaced_nodes = CountAndValidateAssignedNodes( + session.GetGraph(), supported_ops, session.GetSessionState().GetFuncMgr()); + + 
ASSERT_EQ(num_replaced_nodes, 3); + } + + // Use CompileFailureTestExecutionProvider which will fail Compile on "Gemm" + // We should have 2 partitions taken by the EP + // 2 Conv + { + InferenceSessionWrapper session(SessionOptions(), GetEnvironment()); + ASSERT_STATUS_OK(session.RegisterExecutionProvider( + onnxruntime::make_unique(supported_ops, compile_failure_ops))); + ASSERT_STATUS_OK(session.Load(ort_model_path)); + ASSERT_STATUS_OK(session.Initialize()); + + // 2 Conv nodes shoule be replaced with fused nodes + const auto& graph = session.GetGraph(); + int num_replaced_nodes = CountAndValidateAssignedNodes( + session.GetGraph(), {"Conv"}, session.GetSessionState().GetFuncMgr()); + + ASSERT_EQ(num_replaced_nodes, 2); + + // The Gemm node should still not have been replaced + int count_compile_failure_nodes = 0; + for (const auto& node : graph.Nodes()) { + if (compile_failure_ops.find(node.OpType()) != compile_failure_ops.end()) + count_compile_failure_nodes++; + } + ASSERT_EQ(count_compile_failure_nodes, 1); + + // Execute the session, since the last node is Gemm, and its input 0 is all 0s + // So the result should be the bias initializer of the Gemm node + ExecuteMnist(session, true /* enable_custom_ep */); + } +} + } // namespace test } // namespace onnxruntime From 973c3917a68036c6627fd908f106b68153271596 Mon Sep 17 00:00:00 2001 From: sfatimar <64512376+sfatimar@users.noreply.github.com> Date: Sat, 6 Feb 2021 01:48:02 +0530 Subject: [PATCH 25/41] OpenVino add build_shared_lib flag in the build command (#6560) * Dockerfile changes to add build_shared_lib 2021_1 indendation changes * csharp shared library Co-authored-by: sfatimar --- BUILD.md | 4 ++-- dockerfiles/Dockerfile.openvino | 2 +- dockerfiles/Dockerfile.openvino-csharp | 4 ++-- .../core/providers/openvino/ov_versions/capability_2021_1.cc | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/BUILD.md b/BUILD.md index bb09229e11..72a936b6b9 100644 --- a/BUILD.md +++ b/BUILD.md @@ 
-396,13 +396,13 @@ Note that OpenVINO is built as a [shared provider library](#Execution-Provider-S ##### Windows ``` -.\build.bat --config RelWithDebInfo --use_openvino +.\build.bat --config RelWithDebInfo --use_openvino --build_shared_lib ``` *Note: The default Windows CMake Generator is Visual Studio 2017, but you can also use the newer Visual Studio 2019 by passing `--cmake_generator "Visual Studio 16 2019"` to `.\build.bat`* ##### Linux ``` -./build.sh --config RelWithDebInfo --use_openvino +./build.sh --config RelWithDebInfo --use_openvino --build_shared_lib ``` --use_openvino: Builds the OpenVINO Execution Provider in ONNX Runtime. diff --git a/dockerfiles/Dockerfile.openvino b/dockerfiles/Dockerfile.openvino index 01ac9baab2..03625511e3 100644 --- a/dockerfiles/Dockerfile.openvino +++ b/dockerfiles/Dockerfile.openvino @@ -84,7 +84,7 @@ RUN apt update && apt -y install --no-install-recommends apt-transport-https ca- git clone --recursive -b ${ONNXRUNTIME_BRANCH} ${ONNXRUNTIME_REPO} && \ /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh && \ cd onnxruntime/cmake/external/onnx && python3 setup.py install && \ - cd ${MY_ROOT}/onnxruntime && ./build.sh --config Release --update --build --parallel --use_openvino ${DEVICE} --build_wheel && \ + cd ${MY_ROOT}/onnxruntime && ./build.sh --config Release --update --build --parallel --use_openvino ${DEVICE} --build_shared_lib --build_wheel && \ pip install build/Linux/Release/dist/*-linux_x86_64.whl && \ cd ${MY_ROOT}/ && rm -rf onnxruntime && cd /opt && rm -rf v1.0.22.zip && cd ${MY_ROOT} &&\ apt remove -y cmake && cd /usr/share/python-wheels/ && rm -rf *.whl &&\ diff --git a/dockerfiles/Dockerfile.openvino-csharp b/dockerfiles/Dockerfile.openvino-csharp index deb8be1b7c..b84d89a0b7 100644 --- a/dockerfiles/Dockerfile.openvino-csharp +++ b/dockerfiles/Dockerfile.openvino-csharp @@ -102,7 +102,7 @@ RUN apt update && \ git clone --recursive -b ${ONNXRUNTIME_BRANCH} ${ONNXRUNTIME_REPO} && \ /bin/sh 
onnxruntime/dockerfiles/scripts/install_common_deps.sh && \ cd onnxruntime/cmake/external/onnx && python3 setup.py install && \ - cd ${MY_ROOT}/onnxruntime && ./build.sh --config Release --update --build --parallel --use_openvino ${DEVICE} --build_nuget && \ + cd ${MY_ROOT}/onnxruntime && ./build.sh --config Release --update --build --parallel --use_openvino ${DEVICE} --build_nuget --build_shared_lib && \ mv ${MY_ROOT}/onnxruntime/build/Linux/Release/nuget-artifacts ${MY_ROOT} && \ # Clean-up unnecessary files rm -rf ${MY_ROOT}/cmake* /opt/cmake ${MY_ROOT}/onnxruntime && \ @@ -111,4 +111,4 @@ RUN apt update && \ apt remove -y git && apt autoremove -y && apt remove -y cmake && \ cd /usr/lib/ && rm -rf python2.7 python3.6 python3.8 && cd && rm -rf .cache && \ cd /usr/share/python-wheels/ && rm -rf *.whl - \ No newline at end of file + diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability_2021_1.cc b/onnxruntime/core/providers/openvino/ov_versions/capability_2021_1.cc index d6459c4c3a..be8e2204f0 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability_2021_1.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/capability_2021_1.cc @@ -272,8 +272,8 @@ static bool IsUnsupportedOpMode(const Node* node, const GraphViewer& graph_viewe return true; } } else if (optype == "Max" || optype == "Min" || optype == "Mean" || optype == "Sum") { - if (GetInputCount(node, initializers) == 1) - return true; + if (GetInputCount(node, initializers) == 1) + return true; if (optype == "Max" || optype == "Min") { for (size_t i = 0; i < node->InputDefs().size(); i++) { auto dtype = node->InputDefs()[i]->TypeAsProto()->tensor_type().elem_type(); From 299ace0759192557e97e14d03c08f0334c84e9d5 Mon Sep 17 00:00:00 2001 From: Weixing Zhang Date: Fri, 5 Feb 2021 15:48:18 -0800 Subject: [PATCH 26/41] Support to allow user to specify compute stream per session (#3723) * Support to allow user to specify compute stream per session Create computation cuda stream 
explicitly rather than use default legacy stream or per-thread default stream. remove some redundant cudaStreamSynchronize fix gpt2 model test failures don't use default stream in nccl either. add stream synchronization in OnRunEnd() using cub::DeviceScan::InclusiveSum which can be called with stream specified. fix topK failure due to latest rebase fix tensorrt support user specified stream add user_stream support in tensorrt EP use same stream for both tensorrt and CUDA EP. fix ScatterND specify stream for adasum and p2p kernels. fix loop fix CApiTest.custom_op_handler fix CApiTest.varied_input_custom_op_handler change for cudaMemcpyFromSymbol improve provider options for user specified compute stream * add changes for ROCM EP * fix GatherGrad UT for ROCM EP * clean code and fix NonMaxSuppression * use default stream for ROCM now * fix CApiTest.custom_op_handler:OrtFormatCustomOpTests.ConvertOnnxModelToOrt * fix tensorrt ut: CApiTest.io_binding_cuda Co-authored-by: Weixing Zhang --- cmake/CMakeLists.txt | 4 +- .../core/framework/execution_provider.h | 3 + .../core/session/onnxruntime_c_api.h | 17 ++++ .../core/session/onnxruntime_cxx_api.h | 1 + .../core/session/onnxruntime_cxx_inline.h | 5 + .../cuda/activation/activations.cc | 1 + .../cuda/activation/activations_impl.cu | 5 +- .../cuda/activation/activations_impl.h | 1 + .../contrib_ops/cuda/bert/attention.cc | 1 + .../contrib_ops/cuda/bert/attention_impl.cu | 4 +- .../contrib_ops/cuda/bert/attention_impl.h | 1 + .../contrib_ops/cuda/bert/embed_layer_norm.cc | 1 + .../cuda/bert/embed_layer_norm_impl.cu | 5 +- .../cuda/bert/embed_layer_norm_impl.h | 3 +- .../contrib_ops/cuda/bert/fast_gelu.cc | 2 +- .../cuda/bert/longformer_attention.cc | 1 + .../cuda/bert/longformer_attention_impl.cu | 4 +- .../cuda/bert/longformer_attention_impl.h | 1 + .../contrib_ops/cuda/bert/skip_layer_norm.cc | 1 + .../cuda/bert/skip_layer_norm_impl.cu | 4 +- .../cuda/bert/skip_layer_norm_impl.h | 1 + onnxruntime/contrib_ops/cuda/fused_conv.cc
| 2 +- onnxruntime/contrib_ops/cuda/inverse.cc | 52 +++++----- onnxruntime/contrib_ops/cuda/layer_norm.cc | 2 +- .../contrib_ops/cuda/layer_norm_impl.cu | 5 +- .../contrib_ops/cuda/layer_norm_impl.h | 1 + .../contrib_ops/cuda/math/bias_softmax.cc | 10 +- .../contrib_ops/cuda/math/bias_softmax.h | 2 + .../cuda/math/bias_softmax_impl.cu | 27 +++--- .../cuda/math/binary_elementwise_ops.cc | 1 + .../cuda/math/binary_elementwise_ops_impl.cu | 6 +- .../cuda/math/binary_elementwise_ops_impl.h | 1 + .../contrib_ops/cuda/math/complex_mul.cc | 1 + .../contrib_ops/cuda/math/complex_mul_impl.cu | 8 +- .../contrib_ops/cuda/math/complex_mul_impl.h | 1 + onnxruntime/contrib_ops/cuda/math/fft_ops.cc | 4 +- .../contrib_ops/cuda/math/fft_ops_impl.cu | 6 +- .../contrib_ops/cuda/math/fft_ops_impl.h | 2 +- .../quantization/attention_quantization.cc | 2 + .../attention_quantization_impl.cu | 8 +- .../attention_quantization_impl.cuh | 2 +- onnxruntime/contrib_ops/cuda/tensor/crop.cc | 1 + .../contrib_ops/cuda/tensor/crop_impl.cu | 5 +- .../contrib_ops/cuda/tensor/crop_impl.h | 1 + .../contrib_ops/cuda/tensor/image_scaler.cc | 3 +- .../cuda/tensor/image_scaler_impl.cu | 7 +- .../cuda/tensor/image_scaler_impl.h | 1 + .../contrib_ops/rocm/math/bias_softmax.cc | 10 +- .../contrib_ops/rocm/math/bias_softmax.h | 2 + .../rocm/math/bias_softmax_impl.cu | 27 +++--- .../core/framework/provider_bridge_ort.cc | 31 ++++-- .../core/providers/cpu/controlflow/loop.cc | 18 ++-- .../core/providers/cpu/controlflow/loop.h | 4 +- .../math/einsum_utils/einsum_auxiliary_ops.cc | 4 +- .../math/einsum_utils/einsum_auxiliary_ops.h | 8 +- .../einsum_compute_preprocessor.cc | 2 +- .../einsum_typed_compute_processor.cc | 4 +- .../providers/cuda/activation/activations.cc | 1 + .../cuda/activation/activations_impl.cu | 5 +- .../cuda/activation/activations_impl.h | 1 + .../core/providers/cuda/controlflow/loop.cc | 7 +- .../cuda/cu_inc/binary_elementwise_impl.cuh | 20 ++-- .../cuda/cu_inc/unary_elementwise_impl.cuh | 3 
+- .../cuda/cu_inc/variadic_elementwise_impl.cuh | 3 +- .../providers/cuda/cuda_execution_provider.cc | 37 ++++++- .../providers/cuda/cuda_execution_provider.h | 19 ++-- .../cuda/cuda_execution_provider_info.h | 2 + onnxruntime/core/providers/cuda/cuda_kernel.h | 6 +- .../providers/cuda/cuda_provider_factory.cc | 3 +- onnxruntime/core/providers/cuda/cuda_utils.cu | 10 +- onnxruntime/core/providers/cuda/fpgeneric.cu | 12 +-- .../cuda/generator/constant_of_shape.cc | 2 +- .../core/providers/cuda/generator/range.cc | 10 +- .../providers/cuda/generator/range_impl.cu | 6 +- .../providers/cuda/generator/range_impl.h | 2 +- .../core/providers/cuda/gpu_data_transfer.cc | 25 ++--- .../core/providers/cuda/gpu_data_transfer.h | 3 +- .../cuda/math/binary_elementwise_ops.cc | 19 ++-- .../cuda/math/binary_elementwise_ops.h | 3 +- .../cuda/math/binary_elementwise_ops_impl.cu | 18 ++-- .../cuda/math/binary_elementwise_ops_impl.h | 3 + onnxruntime/core/providers/cuda/math/clip.cc | 8 +- .../core/providers/cuda/math/clip_impl.cu | 18 ++-- .../core/providers/cuda/math/clip_impl.h | 2 +- .../core/providers/cuda/math/cumsum.cc | 14 +-- .../core/providers/cuda/math/cumsum_impl.cu | 10 +- .../core/providers/cuda/math/cumsum_impl.h | 1 + .../math/einsum_utils/einsum_auxiliary_ops.cc | 9 +- .../math/einsum_utils/einsum_auxiliary_ops.h | 4 +- .../einsum_auxiliary_ops_diagonal.cu | 5 +- .../einsum_auxiliary_ops_diagonal.h | 1 + onnxruntime/core/providers/cuda/math/gemm.cc | 3 +- .../providers/cuda/math/matmul_integer.cc | 7 +- .../providers/cuda/math/matmul_integer.cu | 17 ++-- .../providers/cuda/math/matmul_integer.cuh | 7 +- .../core/providers/cuda/math/softmax.cc | 16 ++-- .../core/providers/cuda/math/softmax.h | 3 +- .../core/providers/cuda/math/softmax_impl.cu | 28 +++--- .../core/providers/cuda/math/topk_impl.cu | 27 +++--- .../cuda/math/unary_elementwise_ops.cc | 1 + .../cuda/math/unary_elementwise_ops_impl.cu | 11 ++- .../cuda/math/unary_elementwise_ops_impl.h | 2 + 
.../cuda/math/variadic_elementwise_ops.cc | 23 +++-- .../cuda/math/variadic_elementwise_ops.h | 6 +- .../math/variadic_elementwise_ops_impl.cu | 8 ++ .../cuda/math/variadic_elementwise_ops_impl.h | 2 + .../providers/cuda/multi_tensor/common.cuh | 7 +- .../core/providers/cuda/nn/batch_norm.cc | 8 +- onnxruntime/core/providers/cuda/nn/conv.cc | 9 +- onnxruntime/core/providers/cuda/nn/conv.h | 3 +- onnxruntime/core/providers/cuda/nn/dropout.h | 9 +- .../core/providers/cuda/nn/dropout_impl.cu | 4 +- .../core/providers/cuda/nn/dropout_impl.h | 1 + .../core/providers/cuda/nn/instance_norm.cc | 1 + .../providers/cuda/nn/instance_norm_impl.cu | 5 +- .../providers/cuda/nn/instance_norm_impl.h | 1 + .../providers/cuda/nn/max_pool_with_index.cu | 4 +- .../providers/cuda/nn/max_pool_with_index.h | 1 + onnxruntime/core/providers/cuda/nn/pool.cc | 5 +- onnxruntime/core/providers/cuda/nn/shrink.cc | 2 +- .../core/providers/cuda/nn/shrink_impl.cu | 5 +- .../core/providers/cuda/nn/shrink_impl.h | 1 + .../object_detection/non_max_suppression.cc | 4 +- .../non_max_suppression_impl.cu | 42 ++++---- .../non_max_suppression_impl.h | 1 + .../cuda/object_detection/roialign.cc | 1 + .../cuda/object_detection/roialign_impl.cu | 4 +- .../cuda/object_detection/roialign_impl.h | 1 + .../cuda/reduction/reduction_functions.cu | 48 +++++----- .../cuda/reduction/reduction_functions.h | 12 +-- .../providers/cuda/reduction/reduction_ops.cc | 83 +++++++++------- .../core/providers/cuda/rnn/cudnn_rnn_base.cc | 19 ++-- .../core/providers/cuda/rnn/rnn_impl.cu | 32 ++++--- .../core/providers/cuda/rnn/rnn_impl.h | 12 ++- .../providers/cuda/shared_inc/cuda_utils.h | 4 +- .../providers/cuda/shared_inc/fpgeneric.h | 14 +-- .../core/providers/cuda/tensor/cast_op.cc | 1 + .../core/providers/cuda/tensor/compress.cc | 21 +++- .../providers/cuda/tensor/compress_impl.cu | 29 +++--- .../providers/cuda/tensor/compress_impl.h | 8 +- .../core/providers/cuda/tensor/concat.cc | 3 +- 
.../core/providers/cuda/tensor/concat_impl.cu | 11 ++- .../core/providers/cuda/tensor/concat_impl.h | 3 +- .../core/providers/cuda/tensor/expand.cc | 1 + .../core/providers/cuda/tensor/expand_impl.cu | 21 ++-- .../core/providers/cuda/tensor/expand_impl.h | 1 + .../core/providers/cuda/tensor/eye_like.cc | 3 +- .../providers/cuda/tensor/eye_like_impl.cu | 4 +- .../providers/cuda/tensor/eye_like_impl.h | 1 + .../core/providers/cuda/tensor/flatten.cc | 2 +- .../core/providers/cuda/tensor/gather.cc | 1 + .../providers/cuda/tensor/gather_elements.cc | 1 + .../cuda/tensor/gather_elements_impl.cu | 9 +- .../cuda/tensor/gather_elements_impl.h | 1 + .../core/providers/cuda/tensor/gather_impl.cu | 9 +- .../core/providers/cuda/tensor/gather_impl.h | 1 + .../core/providers/cuda/tensor/gather_nd.cc | 15 ++- .../core/providers/cuda/tensor/gather_nd.h | 1 + .../providers/cuda/tensor/gather_nd_impl.cu | 9 +- .../providers/cuda/tensor/gather_nd_impl.h | 3 + .../core/providers/cuda/tensor/identity_op.h | 4 +- .../providers/cuda/tensor/nonzero_impl.cu | 40 ++++---- .../core/providers/cuda/tensor/nonzero_impl.h | 8 +- .../core/providers/cuda/tensor/nonzero_op.cc | 13 +-- .../core/providers/cuda/tensor/onehot.cc | 8 +- .../core/providers/cuda/tensor/onehot.cu | 8 +- .../core/providers/cuda/tensor/onehot.h | 2 + onnxruntime/core/providers/cuda/tensor/pad.cc | 3 +- .../core/providers/cuda/tensor/pad_impl.cu | 9 +- .../core/providers/cuda/tensor/pad_impl.h | 1 + .../providers/cuda/tensor/quantize_linear.cc | 4 +- .../providers/cuda/tensor/quantize_linear.cu | 24 ++--- .../providers/cuda/tensor/quantize_linear.cuh | 4 +- .../core/providers/cuda/tensor/resize_impl.cu | 96 +++++++++++-------- .../core/providers/cuda/tensor/resize_impl.h | 1 + .../providers/cuda/tensor/reverse_sequence.cc | 3 +- .../cuda/tensor/reverse_sequence_impl.cu | 6 +- .../cuda/tensor/reverse_sequence_impl.h | 1 + .../providers/cuda/tensor/scatter_elements.cc | 7 +- .../cuda/tensor/scatter_elements_impl.cu | 20 ++-- 
.../cuda/tensor/scatter_elements_impl.h | 1 + .../core/providers/cuda/tensor/scatter_nd.cc | 3 +- .../providers/cuda/tensor/scatter_nd_impl.cu | 9 +- .../providers/cuda/tensor/scatter_nd_impl.h | 1 + .../core/providers/cuda/tensor/slice.cc | 15 ++- .../core/providers/cuda/tensor/slice.h | 3 +- .../core/providers/cuda/tensor/slice_impl.cu | 15 +-- .../core/providers/cuda/tensor/slice_impl.h | 9 +- .../core/providers/cuda/tensor/split.cc | 3 +- .../core/providers/cuda/tensor/split_impl.cu | 11 ++- .../core/providers/cuda/tensor/split_impl.h | 3 +- .../core/providers/cuda/tensor/squeeze.cc | 2 +- .../core/providers/cuda/tensor/tile.cc | 11 ++- .../core/providers/cuda/tensor/tile_impl.cu | 15 +-- .../core/providers/cuda/tensor/tile_impl.h | 3 + .../core/providers/cuda/tensor/transpose.cc | 22 +++-- .../core/providers/cuda/tensor/transpose.h | 1 + .../providers/cuda/tensor/transpose_impl.cu | 30 +++--- .../providers/cuda/tensor/transpose_impl.h | 6 +- .../core/providers/cuda/tensor/unsqueeze.cc | 2 +- .../core/providers/cuda/tensor/upsample.cc | 5 +- .../providers/cuda/tensor/upsample_impl.cu | 12 ++- .../providers/cuda/tensor/upsample_impl.h | 3 +- .../core/providers/cuda/tensor/where.cc | 1 + .../core/providers/cuda/tensor/where_impl.cu | 8 +- .../core/providers/cuda/tensor/where_impl.h | 1 + onnxruntime/core/providers/rocm/fpgeneric.cu | 8 +- .../core/providers/rocm/gpu_data_transfer.cc | 35 ++++--- .../core/providers/rocm/gpu_data_transfer.h | 3 +- onnxruntime/core/providers/rocm/math/gemm.cc | 3 +- .../core/providers/rocm/math/softmax.cc | 11 ++- .../core/providers/rocm/math/softmax_impl.cu | 28 +++--- .../providers/rocm/reduction/reduction_ops.cc | 83 +++++++++------- .../providers/rocm/rocm_execution_provider.cc | 27 +++++- .../providers/rocm/rocm_execution_provider.h | 26 ++++- .../rocm/rocm_execution_provider_info.h | 3 + onnxruntime/core/providers/rocm/rocm_kernel.h | 4 +- onnxruntime/core/providers/rocm/rocm_utils.cu | 10 +- 
.../providers/rocm/shared_inc/fpgeneric.h | 12 +-- .../providers/rocm/tensor/gather_nd_impl.cu | 9 +- .../core/providers/rocm/tensor/transpose.cc | 22 +++-- .../core/providers/rocm/tensor/transpose.h | 1 + .../providers/shared_library/provider_api.h | 2 +- .../provider_bridge_provider.cc | 4 +- .../shared_library/provider_interfaces.h | 10 +- .../tensorrt/tensorrt_execution_provider.cc | 53 +++++++--- .../tensorrt/tensorrt_execution_provider.h | 10 ++ .../tensorrt/tensorrt_provider_factory.cc | 31 ++++-- onnxruntime/core/session/inference_session.cc | 5 + onnxruntime/core/session/onnxruntime_c_api.cc | 10 ++ onnxruntime/core/session/ort_apis.h | 2 + .../python/onnxruntime_pybind_state.cc | 11 ++- .../test/framework/inference_session_test.cc | 3 +- onnxruntime/test/onnx/main.cc | 22 ++++- onnxruntime/test/perftest/ort_test_session.cc | 4 +- .../cuda/reduction_functions_test.cc | 11 ++- onnxruntime/test/shared_lib/test_inference.cc | 3 + onnxruntime/test/shared_lib/utils.cc | 1 + onnxruntime/test/util/default_providers.cc | 5 +- .../test/training_ops/cuda/cuda_utils_test.cc | 2 +- .../cuda/activation/activations_grad.cc | 6 +- .../cuda/activation/activations_grad_impl.cu | 5 +- .../cuda/activation/activations_grad_impl.h | 3 +- .../cuda/activation/bias_gelu_grad.cc | 4 +- .../cuda/activation/bias_gelu_grad.h | 1 + .../cuda/activation/bias_gelu_grad_impl.cu | 5 +- .../cuda/activation/bias_gelu_grad_impl.h | 1 + .../cuda/collective/adasum_kernels.cc | 8 +- .../cuda/collective/nccl_kernels.cc | 21 ++-- .../training_ops/cuda/communication/recv.cc | 8 +- .../training_ops/cuda/communication/send.cc | 8 +- .../loss/softmax_cross_entropy_loss_impl.cc | 27 +++--- .../loss/softmax_cross_entropy_loss_impl.cu | 14 ++- .../loss/softmax_cross_entropy_loss_impl.h | 3 + .../cuda/loss/softmaxcrossentropy_impl.cc | 24 +++-- .../cuda/loss/softmaxcrossentropy_impl.cu | 20 ++-- .../cuda/loss/softmaxcrossentropy_impl.h | 4 + .../training_ops/cuda/math/div_grad.cc | 6 ++ 
.../training_ops/cuda/math/div_grad_impl.cu | 56 ++++++----- .../training_ops/cuda/math/div_grad_impl.h | 4 + .../training_ops/cuda/math/isfinite.cc | 5 +- .../training_ops/cuda/math/isfinite.cu | 12 +-- .../training_ops/cuda/math/isfinite.h | 4 +- .../cuda/math/mixed_precision_scale.cc | 1 + .../cuda/math/mixed_precision_scale.cu | 4 +- .../cuda/math/mixed_precision_scale.h | 1 + .../training_ops/cuda/math/scale.cc | 1 + .../training_ops/cuda/math/scale.cu | 4 +- .../training_ops/cuda/math/scale.h | 1 + .../training_ops/cuda/math/softmax_grad.cc | 10 +- .../training_ops/cuda/math/softmax_grad.h | 2 +- .../cuda/math/softmax_grad_impl.cu | 28 +++--- .../training_ops/cuda/nn/dropout.cc | 12 ++- .../training_ops/cuda/nn/dropout_impl.cu | 12 ++- .../training_ops/cuda/nn/dropout_impl.h | 2 + .../training_ops/cuda/nn/layer_norm.cc | 4 +- .../training_ops/cuda/nn/layer_norm_impl.cu | 13 +-- .../training_ops/cuda/nn/layer_norm_impl.h | 1 + .../training_ops/cuda/optimizer/adam.cc | 11 ++- .../training_ops/cuda/optimizer/adam.cu | 6 +- .../training_ops/cuda/optimizer/adam.h | 1 + .../training_ops/cuda/optimizer/common.h | 4 +- .../cuda/optimizer/gradient_control.cc | 5 +- .../cuda/optimizer/gradient_control.cu | 4 +- .../cuda/optimizer/gradient_control.h | 1 + .../training_ops/cuda/optimizer/lamb.cc | 28 ++++-- .../training_ops/cuda/optimizer/lamb.cu | 22 +++-- .../training_ops/cuda/optimizer/lamb.h | 5 + .../training_ops/cuda/optimizer/sg.cc | 1 + .../training_ops/cuda/optimizer/sg.cu | 4 +- .../training_ops/cuda/optimizer/sg.h | 1 + .../training_ops/cuda/reduction/all.cc | 5 +- .../training_ops/cuda/reduction/all.cu | 6 +- .../training_ops/cuda/reduction/all.h | 2 +- .../cuda/reduction/reduction_all.cc | 14 +-- .../cuda/reduction/reduction_all.cu | 26 ++--- .../cuda/reduction/reduction_all.h | 4 +- .../cuda/reduction/reduction_ops.cc | 12 +-- .../training_ops/cuda/tensor/concat.cc | 5 +- .../cuda/tensor/gather_elements_grad.cc | 9 +- 
.../cuda/tensor/gather_elements_grad_impl.h | 1 + .../training_ops/cuda/tensor/gather_grad.cc | 18 ++-- .../cuda/tensor/gather_grad_impl.cu | 67 ++++++++----- .../cuda/tensor/gather_grad_impl.h | 1 + .../cuda/tensor/gather_nd_grad.cc | 13 ++- .../cuda/tensor/gather_nd_grad_impl.cu | 5 +- .../cuda/tensor/gather_nd_grad_impl.h | 1 + .../training_ops/cuda/tensor/slice_grad.cc | 5 +- .../training_ops/cuda/tensor/split.cc | 3 +- .../training_ops/cuda/tensor/view.cc | 2 +- .../training_ops/rocm/math/softmax_grad.cc | 7 +- .../rocm/math/softmax_grad_impl.cu | 28 +++--- .../training_ops/rocm/optimizer/adam.cc | 11 ++- .../training_ops/rocm/optimizer/adam.cu | 6 +- .../training_ops/rocm/optimizer/lamb.cc | 29 ++++-- .../rocm/reduction/reduction_all.cc | 14 +-- .../rocm/reduction/reduction_ops.cc | 12 +-- .../training_ops/rocm/tensor/gather_grad.cc | 2 +- .../rocm/tensor/gather_grad_impl.cu | 11 ++- .../rocm/tensor/gather_nd_grad_impl.cu | 5 +- 320 files changed, 1876 insertions(+), 1109 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 7d2a7f2bea..d7e5e2b9e3 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1198,9 +1198,9 @@ if (onnxruntime_USE_CUDA) endif() endif() endif() - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --default-stream legacy") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") if (NOT WIN32) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --expt-relaxed-constexpr --compiler-options -fPIC") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --compiler-options -fPIC") endif() # Options passed to cudafe set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe \"--diag_suppress=bad_friend_decl\"") diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h index 6069ed4839..a2454997bc 100644 --- a/include/onnxruntime/core/framework/execution_provider.h +++ b/include/onnxruntime/core/framework/execution_provider.h @@ -165,6 +165,9 @@ 
class IExecutionProvider { */ virtual common::Status OnSessionInitializationEnd() { return Status::OK(); } + virtual common::Status SetComputeStream(void*) { return Status::OK(); } + virtual void* GetComputeStream() const { return nullptr; } + void InsertAllocator(AllocatorPtr allocator); void ReplaceAllocator(AllocatorPtr allocator); // TODO: temparary sulotion, need to unify the interface in EP and AllocatorManager diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 3b5c4af359..b0985608fc 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -266,8 +266,19 @@ typedef struct OrtCUDAProviderOptions { size_t cuda_mem_limit; // default cuda memory limitation to maximum finite value of size_t. int arena_extend_strategy; // default area extend strategy to KNextPowerOfTwo. int do_copy_in_default_stream; + int has_user_compute_stream; + void* user_compute_stream; } OrtCUDAProviderOptions; +/// +/// Options for the TensorRT provider that are passed to SessionOptionsAppendExecutionProvider_TensorRT +/// +typedef struct OrtTensorRTProviderOptions { + int device_id; + int has_user_compute_stream; + void* user_compute_stream; +} OrtTensorRTProviderOptions; + /// /// Options for the OpenVINO provider that are passed to SessionOptionsAppendExecutionProvider_OpenVINO /// @@ -1146,6 +1157,12 @@ struct OrtApi { */ ORT_API2_STATUS(ModelMetadataGetGraphDescription, _In_ const OrtModelMetadata* model_metadata, _Inout_ OrtAllocator* allocator, _Outptr_ char** value); + /** + * Append TensorRT execution provider to the session options + * If TensorRT is not available (due to a non TensorRT enabled build), this function will return failure. 
+ */ + ORT_API2_STATUS(SessionOptionsAppendExecutionProvider_TensorRT, + _In_ OrtSessionOptions* options, _In_ const OrtTensorRTProviderOptions* tensorrt_options); }; /* diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index d5aa79a79d..be43d9cd21 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -326,6 +326,7 @@ struct SessionOptions : Base { SessionOptions& AppendExecutionProvider_CUDA(const OrtCUDAProviderOptions& provider_options); SessionOptions& AppendExecutionProvider_OpenVINO(const OrtOpenVINOProviderOptions& provider_options); + SessionOptions& AppendExecutionProvider_TensorRT(const OrtTensorRTProviderOptions& provider_options); }; struct ModelMetadata : Base { diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index a5ce8219f6..a818c3c691 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -490,6 +490,11 @@ inline SessionOptions& SessionOptions::AppendExecutionProvider_CUDA(const OrtCUD return *this; } +inline SessionOptions& SessionOptions::AppendExecutionProvider_TensorRT(const OrtTensorRTProviderOptions& provider_options) { + ThrowOnError(GetApi().SessionOptionsAppendExecutionProvider_TensorRT(p_, &provider_options)); + return *this; +} + inline SessionOptions& SessionOptions::AppendExecutionProvider_OpenVINO(const OrtOpenVINOProviderOptions& provider_options) { ThrowOnError(GetApi().SessionOptionsAppendExecutionProvider_OpenVINO(p_, &provider_options)); return *this; diff --git a/onnxruntime/contrib_ops/cuda/activation/activations.cc b/onnxruntime/contrib_ops/cuda/activation/activations.cc index 45bda90b1e..6a26e0f6c3 100644 --- a/onnxruntime/contrib_ops/cuda/activation/activations.cc +++ 
b/onnxruntime/contrib_ops/cuda/activation/activations.cc @@ -29,6 +29,7 @@ namespace cuda { ORT_RETURN_IF_ERROR(UnaryElementwise::Prepare(context, &p)); \ Ctx##x func_ctx = MakeFuncCtx(); \ Impl_##x::MappedType>( \ + Stream(), \ reinterpret_cast::MappedType*>(p.input_tensor->template Data()), \ reinterpret_cast::MappedType*>(p.output_tensor->template MutableData()), \ &func_ctx, p.output_tensor->Shape().Size()); \ diff --git a/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu b/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu index 62601a1c69..7988ecd42f 100644 --- a/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu +++ b/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu @@ -45,14 +45,15 @@ struct OP_Gelu : public CtxGelu { #define UNARY_ACTIVATION_IMPL(name) \ UNARY_ACTIVATION_IMPL_DECLARATION(name) { \ - UnaryElementWiseImpl(input_data, \ + UnaryElementWiseImpl(stream, \ + input_data, \ output_data, \ *reinterpret_cast*>(func_ctx), \ count); \ } #define SPECIALIZED_UNARY_ACTIVATION_IMPL(name, T) \ - template void Impl_##name(const T* input_data, T* output_data, const Ctx##name* func_ctx, size_t count); + template void Impl_##name(cudaStream_t stream, const T* input_data, T* output_data, const Ctx##name* func_ctx, size_t count); #define SPECIALIZED_UNARY_ACTIVATIONL_HFD(name) \ SPECIALIZED_UNARY_ACTIVATION_IMPL(name, half) \ diff --git a/onnxruntime/contrib_ops/cuda/activation/activations_impl.h b/onnxruntime/contrib_ops/cuda/activation/activations_impl.h index 95ea6d5af6..56ece01e46 100644 --- a/onnxruntime/contrib_ops/cuda/activation/activations_impl.h +++ b/onnxruntime/contrib_ops/cuda/activation/activations_impl.h @@ -22,6 +22,7 @@ typedef onnxruntime::cuda::CtxNull CtxGelu; #define UNARY_ACTIVATION_IMPL_DECLARATION(name) \ template \ void Impl_##name( \ + cudaStream_t stream, \ const T* input_data, \ T* output_data, \ const Ctx##name* func_ctx, \ diff --git a/onnxruntime/contrib_ops/cuda/bert/attention.cc 
b/onnxruntime/contrib_ops/cuda/bert/attention.cc index 25a23a5111..ce9147ad1b 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/attention.cc @@ -88,6 +88,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { auto temp_buffer = GetScratchBuffer(workSpaceSize); if (!LaunchAttentionKernel( device_prop, + Stream(), reinterpret_cast(gemm_buffer.get()), nullptr == mask_index ? nullptr : mask_index->template Data(), nullptr == mask_index ? nullptr : &(mask_index->Shape().GetDims()), diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu index 00f92b4f1c..a342168c6d 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu @@ -148,6 +148,7 @@ bool QkvToContext( bool LaunchAttentionKernel( const cudaDeviceProp& prop, + cudaStream_t stream, const void* input, const int* mask_index, const std::vector* mask_index_dims, @@ -163,9 +164,6 @@ bool LaunchAttentionKernel( int past_sequence_length, const void* past, void* present) { - // use default stream - const cudaStream_t stream = nullptr; - if (element_size == 2) { return QkvToContext(prop, cublas, stream, batch_size, sequence_length, num_heads, head_size, element_size, diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_impl.h b/onnxruntime/contrib_ops/cuda/bert/attention_impl.h index c51c007290..30f03b8668 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_impl.h +++ b/onnxruntime/contrib_ops/cuda/bert/attention_impl.h @@ -20,6 +20,7 @@ size_t GetAttentionWorkspaceSize( bool LaunchAttentionKernel( const cudaDeviceProp& prop, // Device Properties + cudaStream_t stream, // cuda stream const void* input, // Input tensor const int* mask_index, // Attention mask raw data or index (end position of each sequence, or end positions and start positions). NULL means no mask. 
const std::vector* mask_index_dims, // Mask index shape diff --git a/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm.cc b/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm.cc index 8adffa85ed..e975181d29 100644 --- a/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm.cc +++ b/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm.cc @@ -61,6 +61,7 @@ Status EmbedLayerNorm::ComputeInternal(OpKernelContext* context) const { size_t element_size = sizeof(T); if (!LaunchEmbedLayerNormKernel( + Stream(), output->template MutableData(), mask_index->template MutableData(), input_ids->template Data(), diff --git a/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu b/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu index 9e856e2e35..ad005e40e0 100644 --- a/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu @@ -173,6 +173,7 @@ bool EmbedSkipLayerNorm( } bool LaunchEmbedLayerNormKernel( + cudaStream_t stream, void* output, void* mask_index, const int* input_ids, @@ -188,10 +189,8 @@ bool LaunchEmbedLayerNormKernel( int batch_size, int sequence_length, const size_t element_size) { - const cudaStream_t stream = nullptr; // default stream - if (nullptr == input_mask) { - if (!CUDA_CALL(cudaMemsetAsync(mask_index, 0, sizeof(int) * batch_size))) + if (!CUDA_CALL(cudaMemsetAsync(mask_index, 0, sizeof(int) * batch_size, stream))) return false; } else if (!ComputeMaskIndex(stream, sequence_length, batch_size, input_mask, static_cast(mask_index))) { return false; diff --git a/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.h b/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.h index 18648e6799..6977fd3e8e 100644 --- a/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.h +++ b/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.h @@ -6,7 +6,8 @@ namespace onnxruntime { namespace contrib { namespace cuda { -bool LaunchEmbedLayerNormKernel(void* output, // output tensor +bool 
LaunchEmbedLayerNormKernel(cudaStream_t stream, + void* output, // output tensor void* mask_index, // output mask index const int* input_ids, // input word IDs const int* segment_ids, // input segment IDs diff --git a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc index 642ef3458c..8e4bfb1c84 100644 --- a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc +++ b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc @@ -47,7 +47,7 @@ Status FastGelu::ComputeInternal(OpKernelContext* context) const { int64_t bias_length = (nullptr == bias) ? 0 : bias->Shape().Size(); typedef typename ToCudaType::MappedType CudaT; if (!LaunchFastGeluKernel(GetDeviceProp(), - nullptr, + Stream(), static_cast(input_length), static_cast(bias_length), reinterpret_cast(input->template Data()), diff --git a/onnxruntime/contrib_ops/cuda/bert/longformer_attention.cc b/onnxruntime/contrib_ops/cuda/bert/longformer_attention.cc index ef2eecb1ec..9ec5298c2b 100644 --- a/onnxruntime/contrib_ops/cuda/bert/longformer_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/longformer_attention.cc @@ -111,6 +111,7 @@ Status LongformerAttention::ComputeInternal(OpKernelContext* context) const { auto workspace_buffer = GetScratchBuffer(workSpaceSize); if (!LaunchLongformerAttentionKernel( device_prop, + Stream(), reinterpret_cast(gemm_buffer.get()), reinterpret_cast(mask->template Data()), reinterpret_cast(global_gemm_buffer.get()), diff --git a/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.cu index fd9637dfc9..191a979fc9 100644 --- a/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.cu @@ -814,6 +814,7 @@ bool LongformerQkvToContext( bool LaunchLongformerAttentionKernel( const cudaDeviceProp& prop, + cudaStream_t stream, const void* input, const void* attention_mask, const void* global_input, @@ -828,9 +829,6 @@ 
bool LaunchLongformerAttentionKernel( void* workspace, cublasHandle_t& cublas, const size_t element_size) { - // use default stream - const cudaStream_t stream = nullptr; - if (element_size == 2) { return LongformerQkvToContext(prop, cublas, stream, batch_size, sequence_length, num_heads, head_size, window, element_size, diff --git a/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.h b/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.h index 632f6d6e5c..c08461e800 100644 --- a/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.h +++ b/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.h @@ -18,6 +18,7 @@ size_t GetLongformerAttentionWorkspaceSize( bool LaunchLongformerAttentionKernel( const cudaDeviceProp& device_prop, // Device Properties + cudaStream_t stream, // CUDA stream const void* input, // Input tensor const void* attention_mask, // Attention mask with shape (B, S) const void* global_input, // Global attention input, or nullptr when max_num_global == 0. 
diff --git a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc index f8f6c2ad49..b8238f7690 100644 --- a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc @@ -93,6 +93,7 @@ Status SkipLayerNorm::ComputeInternal(OpKernelContext* ctx) const { size_t element_size = sizeof(T); if (!LaunchSkipLayerNormKernel( + Stream(), output->template MutableData(), input->template Data(), skip->template Data(), diff --git a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.cu b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.cu index 9c11ff85e0..a7b6aabe52 100644 --- a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.cu @@ -100,6 +100,7 @@ bool ComputeSkipLayerNorm( } bool LaunchSkipLayerNormKernel( + cudaStream_t stream, void* output, const void* input, const void* skip, @@ -110,9 +111,6 @@ bool LaunchSkipLayerNormKernel( int hidden_size, int element_count, size_t element_size) { - // use default stream - const cudaStream_t stream = nullptr; - if (element_size == 2) { return ComputeSkipLayerNorm( stream, diff --git a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.h b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.h index 308242c010..0148231f2b 100644 --- a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.h +++ b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.h @@ -8,6 +8,7 @@ namespace contrib { namespace cuda { bool LaunchSkipLayerNormKernel( + cudaStream_t stream, void* output, // output tensor const void* input, // input tensor const void* skip, // skip tensor diff --git a/onnxruntime/contrib_ops/cuda/fused_conv.cc b/onnxruntime/contrib_ops/cuda/fused_conv.cc index 0e24032c48..6cce365871 100644 --- a/onnxruntime/contrib_ops/cuda/fused_conv.cc +++ b/onnxruntime/contrib_ops/cuda/fused_conv.cc @@ -90,7 +90,7 @@ class FusedConv : public 
onnxruntime::cuda::Conv { Base::s_.y_data, beta, Base::s_.y_tensor, Base::s_.y_data)); } if (Base::s_.post_slicing_required) { - onnxruntime::cuda::SliceOutUnwantedOutputSection(Base::s_.y_data, Base::s_.y_dims_with_adjusted_pads, Base::s_.Y->MutableDataRaw(), + onnxruntime::cuda::SliceOutUnwantedOutputSection(this->Stream(), Base::s_.y_data, Base::s_.y_dims_with_adjusted_pads, Base::s_.Y->MutableDataRaw(), Base::s_.y_dims, Base::s_.slice_starts, Base::s_.slice_ends, Base::s_.slice_axes, Base::s_.element_size); } return Status::OK(); diff --git a/onnxruntime/contrib_ops/cuda/inverse.cc b/onnxruntime/contrib_ops/cuda/inverse.cc index 546fc105de..f762b09d9f 100644 --- a/onnxruntime/contrib_ops/cuda/inverse.cc +++ b/onnxruntime/contrib_ops/cuda/inverse.cc @@ -35,22 +35,24 @@ ONNX_OPERATOR_KERNEL_EX( namespace inverse_internal { template -Status ComputeMatrixOffsets(T* workspace_data, size_t num_batches, size_t rows, IAllocatorUniquePtr& matrix_ptrs) { +Status ComputeMatrixOffsets(cudaStream_t stream, T* workspace_data, size_t num_batches, size_t rows, IAllocatorUniquePtr& matrix_ptrs) { std::vector cuda_ptrs; const size_t matrix_size = rows * rows; for (size_t i = 0; i < num_batches; ++i) { cuda_ptrs.push_back(workspace_data); workspace_data += matrix_size; } - CUDA_RETURN_IF_ERROR(cudaMemcpy(matrix_ptrs.get(), cuda_ptrs.data(), sizeof(T*) * num_batches, - cudaMemcpyHostToDevice)); + + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(matrix_ptrs.get(), cuda_ptrs.data(), sizeof(T*) * num_batches, + cudaMemcpyHostToDevice, stream)); return Status::OK(); } -Status CheckForSingularity(const IAllocatorUniquePtr& info, const std::unique_ptr& info_cpu, size_t num_batches) { +Status CheckForSingularity(cudaStream_t stream, const IAllocatorUniquePtr& info, const std::unique_ptr& info_cpu, size_t num_batches) { // Let's check if any of the info values is non-zero - CUDA_RETURN_IF_ERROR(cudaMemcpy(info_cpu.get(), info.get(), sizeof(int) * num_batches, - cudaMemcpyDeviceToHost)); + // 
cudaMemcpyAsync from device memory to pageable host memory will return only once the copy has completed. + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(info_cpu.get(), info.get(), sizeof(int) * num_batches, + cudaMemcpyDeviceToHost, stream)); for (size_t i = 0; i < num_batches; ++i) { if (info_cpu[i] != 0) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Matrix is singular at batch:", i); @@ -63,7 +65,7 @@ Status CheckForSingularity(const IAllocatorUniquePtr& info, const std::uniq template struct Inverse::ComputeImpl { - Status operator()(Inverse::CublasHandle cublas_h, const Inverse* inst, const Tensor& input, Tensor& output, + Status operator()(cudaStream_t stream, Inverse::CublasHandle cublas_h, const Inverse* inst, const Tensor& input, Tensor& output, const IAllocatorUniquePtr& info, const IAllocatorUniquePtr& pivots, size_t num_batches, size_t rows) const { using namespace onnxruntime::cuda; @@ -79,52 +81,52 @@ struct Inverse::ComputeImpl { IAllocatorUniquePtr input_workspace = inst->GetScratchBuffer(input_count); if (std::is_same::value) { // Convert from MLFloat16(half) to float - Impl_Cast(reinterpret_cast(input.Data()), input_workspace.get(), input_count); + Impl_Cast(stream, reinterpret_cast(input.Data()), input_workspace.get(), input_count); } else { - CUDA_RETURN_IF_ERROR(cudaMemcpy(input_workspace.get(), input.Data(), sizeof(float) * input_count, - cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input_workspace.get(), input.Data(), sizeof(float) * input_count, + cudaMemcpyDeviceToDevice, stream)); } IAllocatorUniquePtr matrix_ptrs = inst->GetScratchBuffer(n_batches); - ORT_RETURN_IF_ERROR(ComputeMatrixOffsets(input_workspace.get(), num_batches, rows, matrix_ptrs)); + ORT_RETURN_IF_ERROR(ComputeMatrixOffsets(stream, input_workspace.get(), num_batches, rows, matrix_ptrs)); // Do LU factorization CUBLAS_RETURN_IF_ERROR(cublasSgetrfBatched(cublas_h, dim, matrix_ptrs.get(), dim, pivots.get(), info.get(), n_batches)); - 
ORT_RETURN_IF_ERROR(CheckForSingularity(info, info_cpu, num_batches)); + ORT_RETURN_IF_ERROR(CheckForSingularity(stream, info, info_cpu, num_batches)); // Need to compute ptrs for output buffers // Output for MLFloat IAllocatorUniquePtr output_ptrs = inst->GetScratchBuffer(n_batches); if (std::is_same::value) { IAllocatorUniquePtr ml_float_output = inst->GetScratchBuffer(input_count); - ORT_RETURN_IF_ERROR(ComputeMatrixOffsets(ml_float_output.get(), num_batches, rows, output_ptrs)); + ORT_RETURN_IF_ERROR(ComputeMatrixOffsets(stream, ml_float_output.get(), num_batches, rows, output_ptrs)); // Do the inverse CUBLAS_RETURN_IF_ERROR(cublasSgetriBatched(cublas_h, dim, matrix_ptrs.get(), dim, pivots.get(), output_ptrs.get(), dim, info.get(), n_batches)); - ORT_RETURN_IF_ERROR(CheckForSingularity(info, info_cpu, num_batches)); + ORT_RETURN_IF_ERROR(CheckForSingularity(stream, info, info_cpu, num_batches)); // Copy the result to output with casting - Impl_Cast(ml_float_output.get(), reinterpret_cast(output.MutableData()), input_count); + Impl_Cast(stream, ml_float_output.get(), reinterpret_cast(output.MutableData()), input_count); // We are done here } else { - ORT_RETURN_IF_ERROR(ComputeMatrixOffsets(output.MutableData(), num_batches, rows, output_ptrs)); + ORT_RETURN_IF_ERROR(ComputeMatrixOffsets(stream, output.MutableData(), num_batches, rows, output_ptrs)); // Do the inverse CUBLAS_RETURN_IF_ERROR(cublasSgetriBatched(cublas_h, dim, matrix_ptrs.get(), dim, pivots.get(), output_ptrs.get(), dim, info.get(), n_batches)); - ORT_RETURN_IF_ERROR(CheckForSingularity(info, info_cpu, num_batches)); + ORT_RETURN_IF_ERROR(CheckForSingularity(stream, info, info_cpu, num_batches)); // We are done here } } else if (std::is_same::value) { IAllocatorUniquePtr input_workspace = inst->GetScratchBuffer(static_cast(input_count)); - CUDA_RETURN_IF_ERROR(cudaMemcpy(input_workspace.get(), input.Data(), sizeof(double) * input_count, - cudaMemcpyDeviceToDevice)); + 
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input_workspace.get(), input.Data(), sizeof(double) * input_count, + cudaMemcpyDeviceToDevice, stream)); IAllocatorUniquePtr matrix_ptrs = inst->GetScratchBuffer(n_batches); - ORT_RETURN_IF_ERROR(ComputeMatrixOffsets(input_workspace.get(), num_batches, rows, matrix_ptrs)); + ORT_RETURN_IF_ERROR(ComputeMatrixOffsets(stream, input_workspace.get(), num_batches, rows, matrix_ptrs)); // Do LU factorization CUBLAS_RETURN_IF_ERROR(cublasDgetrfBatched(cublas_h, dim, matrix_ptrs.get(), dim, pivots.get(), info.get(), n_batches)); - ORT_RETURN_IF_ERROR(CheckForSingularity(info, info_cpu, num_batches)); + ORT_RETURN_IF_ERROR(CheckForSingularity(stream, info, info_cpu, num_batches)); // Need to compute ptrs for output buffers IAllocatorUniquePtr output_ptrs = inst->GetScratchBuffer(n_batches); - ORT_RETURN_IF_ERROR(ComputeMatrixOffsets(output.MutableData(), num_batches, rows, output_ptrs)); + ORT_RETURN_IF_ERROR(ComputeMatrixOffsets(stream, output.MutableData(), num_batches, rows, output_ptrs)); CUBLAS_RETURN_IF_ERROR(cublasDgetriBatched(cublas_h, dim, matrix_ptrs.get(), dim, pivots.get(), output_ptrs.get(), dim, info.get(), n_batches)); - ORT_RETURN_IF_ERROR(CheckForSingularity(info, info_cpu, num_batches)); + ORT_RETURN_IF_ERROR(CheckForSingularity(stream, info, info_cpu, num_batches)); // We are done here } else { ORT_THROW("Type is not supported"); @@ -148,11 +150,11 @@ Status Inverse::ComputeInternal(OpKernelContext* ctx) const { } IAllocatorUniquePtr info = GetScratchBuffer(num_batches); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(info.get(), 0, num_batches)); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(info.get(), 0, num_batches, Stream())); IAllocatorUniquePtr pivots = GetScratchBuffer(rows * num_batches); utils::MLTypeCallDispatcherRet t_disp(input->GetElementType()); - return t_disp.Invoke(Base::CublasHandle(), this, *input, *output, info, pivots, num_batches, rows); + return t_disp.Invoke(Stream(), Base::CublasHandle(), this, *input, 
*output, info, pivots, num_batches, rows); } } // namespace cuda diff --git a/onnxruntime/contrib_ops/cuda/layer_norm.cc b/onnxruntime/contrib_ops/cuda/layer_norm.cc index 3a864bc7b7..12f37f36a0 100644 --- a/onnxruntime/contrib_ops/cuda/layer_norm.cc +++ b/onnxruntime/contrib_ops/cuda/layer_norm.cc @@ -98,7 +98,7 @@ Status LayerNorm::ComputeInternal(OpKernelContext* ctx) const inv_var_data = reinterpret_cast(var->template MutableData()); } - HostApplyLayerNorm(GetDeviceProp(), Y_data, mean_data, inv_var_data, X_data, n1, n2, epsilon_, scale_data, bias_data); + HostApplyLayerNorm(GetDeviceProp(), Stream(), Y_data, mean_data, inv_var_data, X_data, n1, n2, epsilon_, scale_data, bias_data); return Status::OK(); } diff --git a/onnxruntime/contrib_ops/cuda/layer_norm_impl.cu b/onnxruntime/contrib_ops/cuda/layer_norm_impl.cu index 0d2d6fd2e2..46e8fa2900 100644 --- a/onnxruntime/contrib_ops/cuda/layer_norm_impl.cu +++ b/onnxruntime/contrib_ops/cuda/layer_norm_impl.cu @@ -350,6 +350,7 @@ __global__ void cuApplyLayerNorm( template void HostApplyLayerNorm( const cudaDeviceProp& prop, + cudaStream_t stream, T* output, U* mean, U* invvar, @@ -367,7 +368,7 @@ void HostApplyLayerNorm( const dim3 blocks(1, std::min(n1, maxGridY), 1); int nshared = threads.y > 1 ? 
threads.y * sizeof(U) + (threads.y / 2) * sizeof(U) : 0; - cuApplyLayerNorm<<>>( + cuApplyLayerNorm<<>>( output, mean, invvar, @@ -378,7 +379,7 @@ void HostApplyLayerNorm( } #define LAYERNORM_LINEAR_IMPL(T, U, simplified) \ - template void HostApplyLayerNorm(const cudaDeviceProp& prop, T* output, U* mean, U* invvar, const T* input, int n1, int n2, \ + template void HostApplyLayerNorm(const cudaDeviceProp& prop, cudaStream_t stream, T* output, U* mean, U* invvar, const T* input, int n1, int n2, \ double epsilon, const T* gamma, const T* beta); LAYERNORM_LINEAR_IMPL(float, float, true) diff --git a/onnxruntime/contrib_ops/cuda/layer_norm_impl.h b/onnxruntime/contrib_ops/cuda/layer_norm_impl.h index 039b7700a6..1705d99915 100644 --- a/onnxruntime/contrib_ops/cuda/layer_norm_impl.h +++ b/onnxruntime/contrib_ops/cuda/layer_norm_impl.h @@ -32,6 +32,7 @@ namespace cuda { template void HostApplyLayerNorm( const cudaDeviceProp& prop, + cudaStream_t stream, T* output, U* mean, U* invvar, diff --git a/onnxruntime/contrib_ops/cuda/math/bias_softmax.cc b/onnxruntime/contrib_ops/cuda/math/bias_softmax.cc index 71d8679319..d9d30055dd 100644 --- a/onnxruntime/contrib_ops/cuda/math/bias_softmax.cc +++ b/onnxruntime/contrib_ops/cuda/math/bias_softmax.cc @@ -15,6 +15,7 @@ namespace cuda { template void DispatchBiasSoftmaxForwardImpl( + cudaStream_t stream, Tensor* output_tensor, const Tensor* input_tensor, const Tensor* input_bias_tensor, @@ -25,6 +26,7 @@ void DispatchBiasSoftmaxForwardImpl( template void DispatchBiasSoftMaxForwardViaDnnLibraryImpl( + cudaStream_t stream, cudnnHandle_t cudaDnnHandle, int element_count, int batch_count, @@ -64,12 +66,12 @@ Status BiasSoftmax::ComputeInternal(OpKernelContext* ctx) const { // expect thread blocks can fill SM at high occupancy without overflowing registers utils::MLTypeCallDispatcher t_disp(X->GetElementType()); - t_disp.Invoke(Y, X, B, D, N, D, broadcast_size); + t_disp.Invoke(Stream(), Y, X, B, D, N, D, broadcast_size); } else { // 
need to fallback to add kernel + CUDA DNN library softmax call :/ utils::MLTypeCallDispatcher t_disp(X->GetElementType()); - t_disp.Invoke(CudnnHandle(), D, N, broadcast_axis, softmax_axis, X_shape, X, B_shape, B, Y); + t_disp.Invoke(Stream(), CudnnHandle(), D, N, broadcast_axis, softmax_axis, X_shape, X, B_shape, B, Y); } return Status::OK(); @@ -77,6 +79,7 @@ Status BiasSoftmax::ComputeInternal(OpKernelContext* ctx) const { template void DispatchBiasSoftmaxForward::operator()( + cudaStream_t stream, Tensor* output, const Tensor* input, const Tensor* input_bias, @@ -85,6 +88,7 @@ void DispatchBiasSoftmaxForward::operator()( int batch_stride, int bias_broadcast_size_per_batch) { DispatchBiasSoftmaxForwardImpl( + stream, output, input, input_bias, @@ -96,6 +100,7 @@ void DispatchBiasSoftmaxForward::operator()( template void DispatchBiasSoftMaxForwardViaDnnLibrary::operator()( + cudaStream_t stream, cudnnHandle_t cudaDnnHandle, int element_count, int batch_count, @@ -107,6 +112,7 @@ void DispatchBiasSoftMaxForwardViaDnnLibrary::operator()( const onnxruntime::Tensor* B, onnxruntime::Tensor* Y) { DispatchBiasSoftMaxForwardViaDnnLibraryImpl( + stream, cudaDnnHandle, element_count, batch_count, diff --git a/onnxruntime/contrib_ops/cuda/math/bias_softmax.h b/onnxruntime/contrib_ops/cuda/math/bias_softmax.h index 5bbc7266a3..03baec8d35 100644 --- a/onnxruntime/contrib_ops/cuda/math/bias_softmax.h +++ b/onnxruntime/contrib_ops/cuda/math/bias_softmax.h @@ -13,6 +13,7 @@ namespace cuda { template struct DispatchBiasSoftmaxForward { void operator()( + cudaStream_t stream, Tensor* output, const Tensor* input, const Tensor* input_bias, @@ -25,6 +26,7 @@ struct DispatchBiasSoftmaxForward { template struct DispatchBiasSoftMaxForwardViaDnnLibrary { void operator()( + cudaStream_t stream, cudnnHandle_t cudaDnnHandle, int element_count, int batch_count, diff --git a/onnxruntime/contrib_ops/cuda/math/bias_softmax_impl.cu b/onnxruntime/contrib_ops/cuda/math/bias_softmax_impl.cu index 
959a2d191c..27b2363219 100644 --- a/onnxruntime/contrib_ops/cuda/math/bias_softmax_impl.cu +++ b/onnxruntime/contrib_ops/cuda/math/bias_softmax_impl.cu @@ -127,6 +127,7 @@ __global__ void BiasSoftmaxWarpForward( template void DispatchBiasSoftmaxForwardImpl( + cudaStream_t stream, Tensor* output_tensor, const Tensor* input_tensor, const Tensor* input_bias_tensor, @@ -167,47 +168,47 @@ void DispatchBiasSoftmaxForwardImpl( switch (log2_elements) { case 0: // 1 BiasSoftmaxWarpForward - <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); + <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 1: // 2 BiasSoftmaxWarpForward - <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); + <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 2: // 4 BiasSoftmaxWarpForward - <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); + <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 3: // 8 BiasSoftmaxWarpForward - <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); + <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 4: // 16 BiasSoftmaxWarpForward - <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); + <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 5: // 32 BiasSoftmaxWarpForward - <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); + <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); 
break; case 6: // 64 BiasSoftmaxWarpForward - <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); + <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 7: // 128 BiasSoftmaxWarpForward - <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); + <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 8: // 256 BiasSoftmaxWarpForward - <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); + <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 9: // 512 BiasSoftmaxWarpForward - <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); + <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 10: // 1024 BiasSoftmaxWarpForward - <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); + <<>>(output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; default: break; @@ -216,6 +217,7 @@ void DispatchBiasSoftmaxForwardImpl( #define SPECIALIZED_BIAS_SOFTMAX_IMPL(T) \ template void DispatchBiasSoftmaxForwardImpl( \ + cudaStream_t stream, \ Tensor * output_tensor, \ const Tensor* input_tensor, \ const Tensor* input_bias_tensor, \ @@ -232,6 +234,7 @@ SPECIALIZED_BIAS_SOFTMAX_IMPL(MLFloat16) // note: This is an unhappy path! There is no performance benefit for the fusion. 
template void DispatchBiasSoftMaxForwardViaDnnLibraryImpl( + cudaStream_t stream, cudnnHandle_t cudaDnnHandle, int element_count, int batch_count, @@ -278,6 +281,7 @@ void DispatchBiasSoftMaxForwardViaDnnLibraryImpl( // invoke elementwise add with broadcast kernel ::onnxruntime::cuda::BinaryElementWiseImpl( + stream, (int32_t)X_shape.NumDimensions(), &lhs_padded_strides, X_data, @@ -311,6 +315,7 @@ void DispatchBiasSoftMaxForwardViaDnnLibraryImpl( #define SPECIALIZED_BIAS_SOFTMAX_IMPL_VIA_DNN(T) \ template void DispatchBiasSoftMaxForwardViaDnnLibraryImpl( \ + cudaStream_t stream, \ cudnnHandle_t cudaDnnHandle, \ int element_count, \ int batch_count, \ diff --git a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.cc b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.cc index a96e576b7d..5f85223a6b 100644 --- a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.cc +++ b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.cc @@ -25,6 +25,7 @@ namespace cuda { BinaryElementwisePreparation prepare; \ ORT_RETURN_IF_ERROR(Prepare(context, &prepare)); \ Impl_##x::MappedType>( \ + Stream(), \ prepare.output_rank_or_simple_broadcast, \ &prepare.lhs_padded_strides, \ reinterpret_cast::MappedType*>(prepare.lhs_tensor->template Data()), \ diff --git a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.cu b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.cu index c6b977ddbe..01791ed94c 100644 --- a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.cu +++ b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.cu @@ -20,7 +20,8 @@ namespace cuda { #define CONTRIB_BINARY_ELEMENTWISE_IMPL(name) \ CONTRIB_BINARY_ELEMENTWISE_IMPL_DECLARATION(name) { \ - BinaryElementWiseImpl(output_rank_or_simple_broadcast, \ + BinaryElementWiseImpl(stream, \ + output_rank_or_simple_broadcast, \ lhs_padded_strides, \ lhs_data, \ rhs_padded_strides, \ @@ -34,7 +35,8 @@ namespace cuda { } #define 
CONTRIB_SPECIALIZED_BINARY_ELEMENTWISE_IMPL(x, T) \ - template void Impl_##x(int32_t output_rank, \ + template void Impl_##x(cudaStream_t stream, \ + int32_t output_rank, \ const TArray* lhs_padded_strides, \ const T* lhs_data, \ const TArray* rhs_padded_strides, \ diff --git a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.h b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.h index bb2af2f55a..6ff4233278 100644 --- a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.h +++ b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.h @@ -20,6 +20,7 @@ namespace cuda { #define CONTRIB_BINARY_ELEMENTWISE_IMPL_DECLARATION(name) \ template \ void Impl_##name( \ + cudaStream_t stream, \ int32_t output_rank_or_simple_broadcast, \ const TArray* lhs_padded_strides, \ const T* lhs_data, \ diff --git a/onnxruntime/contrib_ops/cuda/math/complex_mul.cc b/onnxruntime/contrib_ops/cuda/math/complex_mul.cc index 70d286ae0d..9584e8de3c 100644 --- a/onnxruntime/contrib_ops/cuda/math/complex_mul.cc +++ b/onnxruntime/contrib_ops/cuda/math/complex_mul.cc @@ -42,6 +42,7 @@ Status ComplexMul::ComputeInternal(OpKernelContext* context) const { BinaryElementwisePreparation prepare; ORT_RETURN_IF_ERROR(Prepare(context, &prepare)); ComplexMul_Impl::MappedType>( + Stream(), prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast::MappedType*>(prepare.lhs_tensor->template Data()), diff --git a/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.cu b/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.cu index 0004cf9433..fdbc986b89 100644 --- a/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.cu +++ b/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.cu @@ -90,6 +90,7 @@ __global__ void _ElementWiseWithStrideTwo( template void ComplexMul_Impl( + cudaStream_t stream, int32_t output_rank_or_simple_broadcast, const TArray* lhs_padded_strides, const T* lhs_data, @@ -110,7 +111,7 @@ void ComplexMul_Impl( CUDA_LONG N = 
static_cast(count); if (lhs_padded_strides && rhs_padded_strides && lhs_padded_strides->Size() && rhs_padded_strides->Size()) - _ElementWiseWithStrideTwo<<>>( + _ElementWiseWithStrideTwo<<>>( output_rank_or_simple_broadcast, *lhs_padded_strides, lhs_data, @@ -123,7 +124,7 @@ void ComplexMul_Impl( rhs_size, is_conj); else if (lhs_padded_strides && lhs_padded_strides->Size()) - _ElementWiseWithStrideTwo<<>>( + _ElementWiseWithStrideTwo<<>>( output_rank_or_simple_broadcast, *lhs_padded_strides, lhs_data, @@ -136,7 +137,7 @@ void ComplexMul_Impl( rhs_size, is_conj); else - _ElementWiseWithStrideTwo<<>>( + _ElementWiseWithStrideTwo<<>>( output_rank_or_simple_broadcast, *lhs_padded_strides, lhs_data, @@ -152,6 +153,7 @@ void ComplexMul_Impl( #define SPECIALIZE_STACKEDCOMPLEXMUL_IMPL(T) \ template void ComplexMul_Impl( \ + cudaStream_t stream, \ int32_t output_rank_or_simple_broadcast, \ const TArray* lhs_padded_strides, \ const T* lhs_data, \ diff --git a/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.h b/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.h index d48eea9a9f..dae66d8325 100644 --- a/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.h +++ b/onnxruntime/contrib_ops/cuda/math/complex_mul_impl.h @@ -13,6 +13,7 @@ using namespace ::onnxruntime::cuda; template void ComplexMul_Impl( + cudaStream_t stream, int32_t output_rank_or_simple_broadcast, const TArray* lhs_padded_strides, const T* lhs_data, diff --git a/onnxruntime/contrib_ops/cuda/math/fft_ops.cc b/onnxruntime/contrib_ops/cuda/math/fft_ops.cc index 3c60644d70..c685882e92 100644 --- a/onnxruntime/contrib_ops/cuda/math/fft_ops.cc +++ b/onnxruntime/contrib_ops/cuda/math/fft_ops.cc @@ -127,11 +127,11 @@ Status FFTBase::DoFFT(OpKernelContext* context, const Tensor* X, bool complex Tensor* Y = const_cast(context)->Output(0, TensorShape(output_dims)); auto* x_data = reinterpret_cast(X->template Data()); auto* y_data = reinterpret_cast(Y->template MutableData()); - + 
CUFFT_RETURN_IF_ERROR(cufftSetStream(plan_info.plan, Stream())); CUFFT_RETURN_IF_ERROR(cufftXtExec(plan_info.plan, const_cast(x_data), y_data, inverse ? CUFFT_INVERSE : CUFFT_FORWARD)); if (inverse) { - PostProcess(signal_dims, output_size, y_data); + PostProcess(Stream(), signal_dims, output_size, y_data); } return Status::OK(); diff --git a/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.cu b/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.cu index 20d6272628..c1f4a088e0 100644 --- a/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.cu +++ b/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.cu @@ -27,14 +27,14 @@ __global__ void _Normalize( } template -void PostProcess(const std::vector& signal_dims, int64_t N, T* output_data) { +void PostProcess(cudaStream_t stream, const std::vector& signal_dims, int64_t N, T* output_data) { int64_t scale = std::accumulate(signal_dims.begin(), signal_dims.end(), 1ll, std::multiplies()); int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); - _Normalize<<>>(output_data, N, static_cast(scale)); + _Normalize<<>>(output_data, N, static_cast(scale)); } #define SPECIALIZED_IMPL(T) \ - template void PostProcess(const std::vector& signal_dims, int64_t N, T* output_data); + template void PostProcess(cudaStream_t stream, const std::vector& signal_dims, int64_t N, T* output_data); SPECIALIZED_IMPL(float) SPECIALIZED_IMPL(double) diff --git a/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.h b/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.h index 8a7f7789c0..2312acd5d3 100644 --- a/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.h +++ b/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.h @@ -12,7 +12,7 @@ namespace contrib { namespace cuda { template -void PostProcess(const std::vector& signal_dims, int64_t N, T* output_data); +void PostProcess(cudaStream_t stream, const std::vector& signal_dims, int64_t N, T* output_data); } // namespace cuda } // namespace contrib diff --git 
a/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc b/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc index 67d51b53d5..5833e2fcee 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc +++ b/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc @@ -158,6 +158,7 @@ Status QAttention::ComputeInternal(OpKernelContext* context) const { } // scale back and bias CudaDequantizeWithBias( + Stream(), gemm_buffer_quantized.get(), reinterpret_cast(bias->template Data()), reinterpret_cast(gemm_buffer.get()), @@ -172,6 +173,7 @@ Status QAttention::ComputeInternal(OpKernelContext* context) const { auto temp_buffer = GetScratchBuffer(workSpaceSize); if (!LaunchAttentionKernel( GetDeviceProp(), + Stream(), reinterpret_cast(gemm_buffer.get()), nullptr == mask_index ? nullptr : mask_index->template Data(), nullptr == mask_index ? nullptr : &(mask_index->Shape().GetDims()), diff --git a/onnxruntime/contrib_ops/cuda/quantization/attention_quantization_impl.cu b/onnxruntime/contrib_ops/cuda/quantization/attention_quantization_impl.cu index 42791ae795..168c8a6f42 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/attention_quantization_impl.cu +++ b/onnxruntime/contrib_ops/cuda/quantization/attention_quantization_impl.cu @@ -31,10 +31,10 @@ __global__ void DequantizeLinearKernel(const int32_t* quantize, const T* bias, T } template -Status CudaDequantizeWithBias(const int32_t* quantize, const T* bias, T* output, T scale, int m, int n) { +Status CudaDequantizeWithBias(cudaStream_t stream, const int32_t* quantize, const T* bias, T* output, T scale, int m, int n) { int blocksPerGrid = static_cast(CeilDiv(m * n, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); CUDA_LONG N = static_cast(m * n); - DequantizeLinearKernel<<>>( + DequantizeLinearKernel<<>>( quantize, bias, output, @@ -44,8 +44,8 @@ Status CudaDequantizeWithBias(const int32_t* quantize, const T* bias, T* output, return Status::OK(); } 
-template Status CudaDequantizeWithBias(const int32_t* quantize, const float* bias, float* output, float scale, int m, int n); -template Status CudaDequantizeWithBias(const int32_t* quantize, const half* bias, half* output, half scale, int m, int n); +template Status CudaDequantizeWithBias(cudaStream_t stream, const int32_t* quantize, const float* bias, float* output, float scale, int m, int n); +template Status CudaDequantizeWithBias(cudaStream_t stream, const int32_t* quantize, const half* bias, half* output, half scale, int m, int n); } // namespace cuda } // namespace contrib diff --git a/onnxruntime/contrib_ops/cuda/quantization/attention_quantization_impl.cuh b/onnxruntime/contrib_ops/cuda/quantization/attention_quantization_impl.cuh index dc0ba262fa..b1aa2b9226 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/attention_quantization_impl.cuh +++ b/onnxruntime/contrib_ops/cuda/quantization/attention_quantization_impl.cuh @@ -8,7 +8,7 @@ namespace onnxruntime { namespace contrib { namespace cuda { template -Status CudaDequantizeWithBias(const int32_t* quantize, const Tin* bias, Tin* output, Tin scale, int m, int n); +Status CudaDequantizeWithBias(cudaStream_t stream, const int32_t* quantize, const Tin* bias, Tin* output, Tin scale, int m, int n); } // namespace cuda } // namespace contrib diff --git a/onnxruntime/contrib_ops/cuda/tensor/crop.cc b/onnxruntime/contrib_ops/cuda/tensor/crop.cc index 66e022e3c4..76495c8b23 100644 --- a/onnxruntime/contrib_ops/cuda/tensor/crop.cc +++ b/onnxruntime/contrib_ops/cuda/tensor/crop.cc @@ -56,6 +56,7 @@ Status Crop::ComputeInternal(OpKernelContext* context) const { fast_divmod fdm_YHW(gsl::narrow_cast((bottomLimit - topBorder) * (rightLimit - leftBorder))); CropImpl( + Stream(), reinterpret_cast(X->template Data()), gsl::narrow_cast(leftBorder), gsl::narrow_cast(topBorder), diff --git a/onnxruntime/contrib_ops/cuda/tensor/crop_impl.cu b/onnxruntime/contrib_ops/cuda/tensor/crop_impl.cu index c69c274cce..e407164e37 
100644 --- a/onnxruntime/contrib_ops/cuda/tensor/crop_impl.cu +++ b/onnxruntime/contrib_ops/cuda/tensor/crop_impl.cu @@ -31,6 +31,7 @@ __global__ void _CropKernel( template void CropImpl( + cudaStream_t stream, const T* input_data, const int src_start_x, const int src_start_y, @@ -41,12 +42,12 @@ void CropImpl( T* output_data, const size_t N) { int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); - _CropKernel<<>>( + _CropKernel<<>>( input_data, src_start_x, src_start_y, src_w, src_hw, fdm_dst_w, fdm_dst_hw, output_data, (CUDA_LONG)N); } #define SPECIALIZED_IMPL(T) \ - template void CropImpl(const T* input_data, const int src_start_x, const int src_start_y, const int src_w, const int src_hw, const fast_divmod& fdm_dst_w, const fast_divmod& fdm_dst_hw, T* output_data, const size_t N); + template void CropImpl(cudaStream_t stream, const T* input_data, const int src_start_x, const int src_start_y, const int src_w, const int src_hw, const fast_divmod& fdm_dst_w, const fast_divmod& fdm_dst_hw, T* output_data, const size_t N); SPECIALIZED_IMPL(float) SPECIALIZED_IMPL(double) diff --git a/onnxruntime/contrib_ops/cuda/tensor/crop_impl.h b/onnxruntime/contrib_ops/cuda/tensor/crop_impl.h index 07ffb64d2d..8eb649a48c 100644 --- a/onnxruntime/contrib_ops/cuda/tensor/crop_impl.h +++ b/onnxruntime/contrib_ops/cuda/tensor/crop_impl.h @@ -12,6 +12,7 @@ using namespace onnxruntime::cuda; template void CropImpl( + cudaStream_t stream, const T* input_data, const int src_start_x, const int src_start_y, diff --git a/onnxruntime/contrib_ops/cuda/tensor/image_scaler.cc b/onnxruntime/contrib_ops/cuda/tensor/image_scaler.cc index bf1f33e84a..fb592a5cf3 100644 --- a/onnxruntime/contrib_ops/cuda/tensor/image_scaler.cc +++ b/onnxruntime/contrib_ops/cuda/tensor/image_scaler.cc @@ -30,7 +30,7 @@ ImageScaler::ImageScaler(const OpKernelInfo& info) : CudaKernel(info) { ORT_ENFORCE(info.GetAttrs("bias", bias_).IsOK()); b_data_ = GetScratchBuffer(bias_.size()); - 
CUDA_CALL_THROW(cudaMemcpy(b_data_.get(), bias_.data(), sizeof(float) * bias_.size(), cudaMemcpyHostToDevice)); + CUDA_CALL_THROW(cudaMemcpyAsync(b_data_.get(), bias_.data(), sizeof(float) * bias_.size(), cudaMemcpyHostToDevice, Stream())); } template @@ -53,6 +53,7 @@ Status ImageScaler::ComputeInternal(OpKernelContext* context) const { typedef typename ToCudaType::MappedType CudaT; ImageScalerImpl( + Stream(), reinterpret_cast(X->template Data()), scale_, b_data_.get(), diff --git a/onnxruntime/contrib_ops/cuda/tensor/image_scaler_impl.cu b/onnxruntime/contrib_ops/cuda/tensor/image_scaler_impl.cu index d0eb35d267..a63cd4755c 100644 --- a/onnxruntime/contrib_ops/cuda/tensor/image_scaler_impl.cu +++ b/onnxruntime/contrib_ops/cuda/tensor/image_scaler_impl.cu @@ -30,6 +30,7 @@ __global__ void _ImageScalerKernel( template void ImageScalerImpl( + cudaStream_t stream, const T* input_data, const float scale, const float* bias_data, @@ -40,17 +41,17 @@ void ImageScalerImpl( fast_divmod fdm_HW((int)(dims[2] * dims[3])); fast_divmod fdm_C; if (dims[0] == 1) { - _ImageScalerKernel<<>>( + _ImageScalerKernel<<>>( input_data, scale, bias_data, fdm_C, fdm_HW, output_data, N); } else { fdm_C = fast_divmod((int)dims[1]); - _ImageScalerKernel<<>>( + _ImageScalerKernel<<>>( input_data, scale, bias_data, fdm_C, fdm_HW, output_data, N); } } #define SPECIALIZED_IMPL(T) \ - template void ImageScalerImpl(const T* input_data, const float scale, const float* bias_data, const int64_t dims[4], T* output_data, const size_t N); + template void ImageScalerImpl(cudaStream_t stream, const T* input_data, const float scale, const float* bias_data, const int64_t dims[4], T* output_data, const size_t N); SPECIALIZED_IMPL(float) SPECIALIZED_IMPL(double) diff --git a/onnxruntime/contrib_ops/cuda/tensor/image_scaler_impl.h b/onnxruntime/contrib_ops/cuda/tensor/image_scaler_impl.h index c014870894..7194041a71 100644 --- a/onnxruntime/contrib_ops/cuda/tensor/image_scaler_impl.h +++ 
b/onnxruntime/contrib_ops/cuda/tensor/image_scaler_impl.h @@ -10,6 +10,7 @@ namespace cuda { template void ImageScalerImpl( + cudaStream_t stream, const T* input_data, const float scale, const float* bias_data, diff --git a/onnxruntime/contrib_ops/rocm/math/bias_softmax.cc b/onnxruntime/contrib_ops/rocm/math/bias_softmax.cc index 9d318a3edf..954474aa09 100644 --- a/onnxruntime/contrib_ops/rocm/math/bias_softmax.cc +++ b/onnxruntime/contrib_ops/rocm/math/bias_softmax.cc @@ -15,6 +15,7 @@ namespace rocm { template void DispatchBiasSoftmaxForwardImpl( + hipStream_t stream, Tensor* output_tensor, const Tensor* input_tensor, const Tensor* input_bias_tensor, @@ -25,6 +26,7 @@ void DispatchBiasSoftmaxForwardImpl( template void DispatchBiasSoftMaxForwardViaDnnLibraryImpl( + hipStream_t stream, miopenHandle_t miopenHandle, int element_count, int batch_count, @@ -67,12 +69,12 @@ Status BiasSoftmax::ComputeInternal(OpKernelContext* ctx) const { // expect thread blocks can fill SM at high occupancy without overflowing registers utils::MLTypeCallDispatcher t_disp(X->GetElementType()); - t_disp.Invoke(Y, X, B, D, N, D, broadcast_size); + t_disp.Invoke(Stream(), Y, X, B, D, N, D, broadcast_size); } else { // need to fallback to add kernel + CUDA DNN library softmax call :/ utils::MLTypeCallDispatcher t_disp(X->GetElementType()); - t_disp.Invoke(MiopenHandle(), D, N, broadcast_axis, softmax_axis, X_shape, X, B_shape, B, Y); + t_disp.Invoke(Stream(), MiopenHandle(), D, N, broadcast_axis, softmax_axis, X_shape, X, B_shape, B, Y); } return Status::OK(); @@ -80,6 +82,7 @@ Status BiasSoftmax::ComputeInternal(OpKernelContext* ctx) const { template void DispatchBiasSoftmaxForward::operator()( + hipStream_t stream, Tensor* output, const Tensor* input, const Tensor* input_bias, @@ -88,6 +91,7 @@ void DispatchBiasSoftmaxForward::operator()( int batch_stride, int bias_broadcast_size_per_batch) { DispatchBiasSoftmaxForwardImpl( + stream, output, input, input_bias, @@ -99,6 +103,7 @@ void 
DispatchBiasSoftmaxForward::operator()( template void DispatchBiasSoftMaxForwardViaDnnLibrary::operator()( + hipStream_t stream, miopenHandle_t miopenHandle, int element_count, int batch_count, @@ -110,6 +115,7 @@ void DispatchBiasSoftMaxForwardViaDnnLibrary::operator()( const onnxruntime::Tensor* B, onnxruntime::Tensor* Y) { DispatchBiasSoftMaxForwardViaDnnLibraryImpl( + stream, miopenHandle, element_count, batch_count, diff --git a/onnxruntime/contrib_ops/rocm/math/bias_softmax.h b/onnxruntime/contrib_ops/rocm/math/bias_softmax.h index 04bc4d93b0..602ac5fafb 100644 --- a/onnxruntime/contrib_ops/rocm/math/bias_softmax.h +++ b/onnxruntime/contrib_ops/rocm/math/bias_softmax.h @@ -13,6 +13,7 @@ namespace rocm { template struct DispatchBiasSoftmaxForward { void operator()( + hipStream_t stream, Tensor* output, const Tensor* input, const Tensor* input_bias, @@ -25,6 +26,7 @@ struct DispatchBiasSoftmaxForward { template struct DispatchBiasSoftMaxForwardViaDnnLibrary { void operator()( + hipStream_t stream, miopenHandle_t miopenHandle, int element_count, int batch_count, diff --git a/onnxruntime/contrib_ops/rocm/math/bias_softmax_impl.cu b/onnxruntime/contrib_ops/rocm/math/bias_softmax_impl.cu index e8aad12e68..6bbc2c98d9 100644 --- a/onnxruntime/contrib_ops/rocm/math/bias_softmax_impl.cu +++ b/onnxruntime/contrib_ops/rocm/math/bias_softmax_impl.cu @@ -128,6 +128,7 @@ __global__ void BiasSoftmaxWarpForward( template void DispatchBiasSoftmaxForwardImpl( + hipStream_t stream, Tensor* output_tensor, const Tensor* input_tensor, const Tensor* input_bias_tensor, @@ -168,47 +169,47 @@ void DispatchBiasSoftmaxForwardImpl( // Launch code would be more elegant if C++ supported FOR CONSTEXPR switch (log2_elements) { case 0: // 1 - hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, stream, output, input, input_bias, element_count, batch_count, 
batch_stride, bias_broadcast_size_per_batch); break; case 1: // 2 - hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, stream, output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 2: // 4 - hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, stream, output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 3: // 8 - hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, stream, output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 4: // 16 - hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, stream, output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 5: // 32 - hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, stream, output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 6: // 64 - hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, stream, output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 7: // 128 - 
hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, stream, output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 8: // 256 - hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, stream, output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 9: // 512 - hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, stream, output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; case 10: // 1024 - hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(BiasSoftmaxWarpForward), dim3(blocks), dim3(threads), 0, stream, output, input, input_bias, element_count, batch_count, batch_stride, bias_broadcast_size_per_batch); break; default: @@ -218,6 +219,7 @@ void DispatchBiasSoftmaxForwardImpl( #define SPECIALIZED_BIAS_SOFTMAX_IMPL(T) \ template void DispatchBiasSoftmaxForwardImpl( \ + hipStream_t stream, \ Tensor * output_tensor, \ const Tensor* input_tensor, \ const Tensor* input_bias_tensor, \ @@ -234,6 +236,7 @@ SPECIALIZED_BIAS_SOFTMAX_IMPL(MLFloat16) // note: This is an unhappy path! There is no performance benefit for the fusion. 
template void DispatchBiasSoftMaxForwardViaDnnLibraryImpl( + hipStream_t stream, miopenHandle_t miopenHandle, int element_count, int batch_count, @@ -278,6 +281,7 @@ void DispatchBiasSoftMaxForwardViaDnnLibraryImpl( // invoke elementwise add with broadcast kernel ::onnxruntime::rocm::BinaryElementWiseImpl( + stream, (int32_t)X_shape.NumDimensions(), &lhs_padded_strides, X_data, @@ -311,6 +315,7 @@ void DispatchBiasSoftMaxForwardViaDnnLibraryImpl( #define SPECIALIZED_BIAS_SOFTMAX_IMPL_VIA_DNN(T) \ template void DispatchBiasSoftMaxForwardViaDnnLibraryImpl( \ + hipStream_t stream, \ miopenHandle_t miopenHandle, \ int element_count, \ int batch_count, \ diff --git a/onnxruntime/core/framework/provider_bridge_ort.cc b/onnxruntime/core/framework/provider_bridge_ort.cc index 5e6a1f85eb..a12b5989d7 100644 --- a/onnxruntime/core/framework/provider_bridge_ort.cc +++ b/onnxruntime/core/framework/provider_bridge_ort.cc @@ -160,14 +160,16 @@ struct ProviderHostImpl : ProviderHost { return onnxruntime::make_unique(device_id, name); } - std::unique_ptr CreateGPUDataTransfer() override { return onnxruntime::make_unique(); } - - void cuda__Impl_Cast(const int64_t* input_data, int32_t* output_data, size_t count) override { - return cuda::Impl_Cast(input_data, output_data, count); + std::unique_ptr CreateGPUDataTransfer(void* stream) override { + return onnxruntime::make_unique(static_cast(stream)); } - void cuda__Impl_Cast(const int32_t* input_data, int64_t* output_data, size_t count) override { - return cuda::Impl_Cast(input_data, output_data, count); + void cuda__Impl_Cast(void* stream, const int64_t* input_data, int32_t* output_data, size_t count) override { + return cuda::Impl_Cast(static_cast(stream), input_data, output_data, count); + } + + void cuda__Impl_Cast(void* stream, const int32_t* input_data, int64_t* output_data, size_t count) override { + return cuda::Impl_Cast(static_cast(stream), input_data, output_data, count); } bool CudaCall_false(int retCode, const char* 
exprString, const char* libName, int successCode, const char* msg) override { return CudaCall(cudaError(retCode), exprString, libName, cudaError(successCode), msg); } @@ -684,6 +686,13 @@ std::shared_ptr CreateExecutionProviderFactory_Tensor return nullptr; } +std::shared_ptr CreateExecutionProviderFactory_Tensorrt(const OrtTensorRTProviderOptions* provider_options) { + if (auto provider = s_library_tensorrt.Get()) + return provider->CreateExecutionProviderFactory(provider_options); + + return nullptr; +} + std::shared_ptr CreateExecutionProviderFactory_OpenVINO(const OrtOpenVINOProviderOptions* provider_options) { if (auto provider = s_library_openvino.Get()) return provider->CreateExecutionProviderFactory(provider_options); @@ -719,6 +728,16 @@ ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_Tensorrt, _In_ OrtS return nullptr; } +ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_TensorRT, _In_ OrtSessionOptions* options, _In_ const OrtTensorRTProviderOptions* tensorrt_options) { + auto factory = onnxruntime::CreateExecutionProviderFactory_Tensorrt(tensorrt_options); + if (!factory) { + return OrtApis::CreateStatus(ORT_FAIL, "SessionOptionsAppendExecutionProvider_Tensorrt: Failed to load shared library"); + } + + options->provider_factories.push_back(factory); + return nullptr; +} + ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_OpenVINO, _In_ OrtSessionOptions* options, _In_ const OrtOpenVINOProviderOptions* provider_options) { auto factory = onnxruntime::CreateExecutionProviderFactory_OpenVINO(provider_options); if (!factory) { diff --git a/onnxruntime/core/providers/cpu/controlflow/loop.cc b/onnxruntime/core/providers/cpu/controlflow/loop.cc index f352f28746..23d8ab6427 100644 --- a/onnxruntime/core/providers/cpu/controlflow/loop.cc +++ b/onnxruntime/core/providers/cpu/controlflow/loop.cc @@ -178,7 +178,8 @@ class LoopImpl { LoopImpl(OpKernelContextInternal& context, const SessionState& session_state, const 
Loop::Info& info, - const Loop::ConcatOutput& concat_output_func); + const Loop::ConcatOutput& concat_output_func, + void* stream); // Initialize by validating all the inputs, and allocating the output tensors Status Initialize(); @@ -211,9 +212,11 @@ class LoopImpl { std::vector> loop_output_tensors_; const Loop::ConcatOutput& concat_output_func_; + void* stream_; }; -static Status ConcatenateCpuOutput(std::vector& per_iteration_output, +static Status ConcatenateCpuOutput(void* /*stream*/, + std::vector& per_iteration_output, void* output, size_t output_size_in_bytes) { const auto& first_output = per_iteration_output.front().Get(); const auto& per_iteration_shape = first_output.Shape(); @@ -253,6 +256,7 @@ Loop::Loop(const OpKernelInfo& info) : IControlFlowKernel(info) { ORT_IGNORE_RETURN_VALUE(proto); concat_output_func_ = ConcatenateCpuOutput; + stream_ = nullptr; } // we need this to be in the .cc so 'unique_ptr info_' can be handled @@ -345,7 +349,7 @@ Status Loop::Compute(OpKernelContext* ctx) const { ORT_ENFORCE(session_state, "Subgraph SessionState was not found for 'body' attribute."); ORT_ENFORCE(feeds_fetches_manager_, "CreateFeedsFetchesManager must be called prior to execution of graph."); - LoopImpl loop_impl{*ctx_internal, *session_state, *info_, concat_output_func_}; + LoopImpl loop_impl{*ctx_internal, *session_state, *info_, concat_output_func_, stream_}; auto status = loop_impl.Initialize(); ORT_RETURN_IF_ERROR(status); @@ -358,12 +362,14 @@ Status Loop::Compute(OpKernelContext* ctx) const { LoopImpl::LoopImpl(OpKernelContextInternal& context, const SessionState& session_state, const Loop::Info& subgraph_info, - const Loop::ConcatOutput& concat_output_func) + const Loop::ConcatOutput& concat_output_func, + void* stream) : context_(context), session_state_(session_state), info_(subgraph_info), implicit_inputs_(context_.GetImplicitInputs()), - concat_output_func_(concat_output_func) { + concat_output_func_(concat_output_func), + stream_(stream) { 
auto* max_trip_count_tensor = context.Input(0); max_trip_count_ = max_trip_count_tensor ? *max_trip_count_tensor->Data() : INT64_MAX; @@ -457,7 +463,7 @@ Status LoopImpl::ConcatenateLoopOutput(std::vector& per_iteration_outp TensorShape output_shape{dims}; Tensor* output = context_.Output(output_index, output_shape); - ORT_RETURN_IF_ERROR(concat_output_func_(per_iteration_output, output->MutableDataRaw(), output->SizeInBytes())); + ORT_RETURN_IF_ERROR(concat_output_func_(stream_, per_iteration_output, output->MutableDataRaw(), output->SizeInBytes())); return Status::OK(); } diff --git a/onnxruntime/core/providers/cpu/controlflow/loop.h b/onnxruntime/core/providers/cpu/controlflow/loop.h index 958dd162aa..73f863b22d 100644 --- a/onnxruntime/core/providers/cpu/controlflow/loop.h +++ b/onnxruntime/core/providers/cpu/controlflow/loop.h @@ -29,17 +29,19 @@ class Loop : public controlflow::IControlFlowKernel { // function to concatenate the OrtValue instances from each Loop iteration into a single output buffer. // @param per_iteration_output OrtValue instances from each iteration. Never empty. All should have the same shape. // @param output Pre-allocated output buffer. On device specific to the ExecutionProvider running the Loop node. - using ConcatOutput = std::function& per_iteration_output, + using ConcatOutput = std::function& per_iteration_output, void* output, size_t output_size_in_bytes)>; protected: // derived class can provide implementation for handling concatenation of Loop output on a different device void SetConcatOutputFunc(const ConcatOutput& concat_output_func) { concat_output_func_ = concat_output_func; } + void SetComputeStream(void* stream) { stream_ = stream; } private: // Info and FeedsFetchesManager re-used for each subgraph execution. 
std::unique_ptr info_; std::unique_ptr feeds_fetches_manager_; ConcatOutput concat_output_func_; + void* stream_; }; } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc index c70b32fc95..af83133cee 100644 --- a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc +++ b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.cc @@ -14,7 +14,7 @@ namespace DeviceHelpers { namespace CpuDeviceHelpers { // CPU specific Data copy helper -Status DataCopy(const Tensor& input, Tensor& output) { +Status DataCopy(const Tensor& input, Tensor& output, void* /*einsum_cuda_assets*/) { ORT_ENFORCE(output.SizeInBytes() == input.SizeInBytes(), "Einsum op: The candidate output does not match the actual output's shape"); // There are no string tensors in Einsum's case - so safely use memcpy @@ -156,7 +156,7 @@ static std::unique_ptr DiagonalInnermostDims(const Tensor& input, return output; } -std::unique_ptr Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator) { +std::unique_ptr Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator, void* /*einsum_cuda_assets*/) { const auto& input_shape = input.Shape(); const auto& input_dims = input_shape.GetDims(); auto rank = static_cast(input_dims.size()); diff --git a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.h b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.h index baaf6821d7..d5f96a79b9 100644 --- a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.h +++ b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_auxiliary_ops.h @@ -23,7 +23,7 @@ namespace EinsumOp { namespace DeviceHelpers { // Data copy op - Copies raw data from the source tensor's buffer to the destination tensor's buffer -using DataCopy = std::function; +using DataCopy = 
std::function; // Transpose op - Transposes given input based on data in `permutation` using Transpose = std::function& permutation, const Tensor& input, @@ -54,12 +54,12 @@ using ReduceSum = std::function(const Tensor& input, int64_t dim_1, int64_t dim_2, - AllocatorPtr allocator)>; + AllocatorPtr allocator, void* einsum_cuda_assets)>; // These are CPU specific device helper implementations namespace CpuDeviceHelpers { -Status DataCopy(const Tensor& input, Tensor& output); +Status DataCopy(const Tensor& input, Tensor& output, void* einsum_cuda_assets); Status Transpose(const std::vector& permutation, const Tensor& input, Tensor& output, const TensorShape* input_shape_override, void* einsum_cuda_assets); @@ -76,7 +76,7 @@ Tensor ReduceSum(const Tensor& input, const std::vector& reduce_axes, const TensorShape* input_shape_override, concurrency::ThreadPool* tp, void* einsum_cuda_assets); -std::unique_ptr Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator); +std::unique_ptr Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator, void* einsum_cuda_assets); } // namespace CpuDeviceHelpers diff --git a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_compute_preprocessor.cc b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_compute_preprocessor.cc index 91c700568a..88ef7738b2 100644 --- a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_compute_preprocessor.cc +++ b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_compute_preprocessor.cc @@ -440,7 +440,7 @@ Status EinsumComputePreprocessor::PreprocessInputs() { preprocessed = device_diagonal_func_(preprocessed ? 
*preprocessed : *inputs_[input_iter], subscript_indices_to_input_index[subscript_index], dim_index_in_preprocessed_input, - allocator_); + allocator_, einsum_ep_assets_); } ++dim_index_in_original_input; } diff --git a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc index 1c55b9d7c8..0d6bb37ba3 100644 --- a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc +++ b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_typed_compute_processor.cc @@ -58,13 +58,13 @@ void EinsumTypedComputeProcessor::FinalizeOutput(const Tensor& candidate_outp // into the buffer of the actual output given to us by the execution frame // We need to do this because the buffer owned by the output tensor of the op could be user provided buffer - auto status = device_data_copy_func_(*candidate_output_transposed, output); + auto status = device_data_copy_func_(*candidate_output_transposed, output, einsum_ep_assets_); ORT_ENFORCE(status.IsOK(), "Einsum op: Could not copy the intermediate output's buffer into the op's output buffer. Error: ", status.ErrorMessage()); } else { // Copy the output candidate into the op's output - auto status = device_data_copy_func_(candidate_output, output); + auto status = device_data_copy_func_(candidate_output, output, einsum_ep_assets_); ORT_ENFORCE(status.IsOK(), "Einsum op: Could not copy the intermediate output's buffer into the op's output buffer. 
Error: ", status.ErrorMessage()); } diff --git a/onnxruntime/core/providers/cuda/activation/activations.cc b/onnxruntime/core/providers/cuda/activation/activations.cc index cb985fd024..5015c40109 100644 --- a/onnxruntime/core/providers/cuda/activation/activations.cc +++ b/onnxruntime/core/providers/cuda/activation/activations.cc @@ -38,6 +38,7 @@ namespace cuda { ORT_RETURN_IF_ERROR(UnaryElementwise::Prepare(context, &p)); \ Ctx##x func_ctx = MakeFuncCtx(); \ Impl_##x::MappedType>( \ + Stream(), \ reinterpret_cast::MappedType*>(p.input_tensor->template Data()), \ reinterpret_cast::MappedType*>(p.output_tensor->template MutableData()), \ &func_ctx, p.output_tensor->Shape().Size()); \ diff --git a/onnxruntime/core/providers/cuda/activation/activations_impl.cu b/onnxruntime/core/providers/cuda/activation/activations_impl.cu index bd7a4f7bf8..2ff5a4748f 100644 --- a/onnxruntime/core/providers/cuda/activation/activations_impl.cu +++ b/onnxruntime/core/providers/cuda/activation/activations_impl.cu @@ -84,14 +84,15 @@ struct OP_ThresholdedRelu : public CtxThresholdedRelu { #define UNARY_ACTIVATION_IMPL(name) \ UNARY_ACTIVATION_IMPL_DECLARATION(name) { \ - UnaryElementWiseImpl(input_data, \ + UnaryElementWiseImpl(stream, \ + input_data, \ output_data, \ *reinterpret_cast*>(func_ctx), \ count); \ } #define SPECIALIZED_UNARY_ACTIVATION_IMPL(name, T) \ - template void Impl_##name(const T* input_data, T* output_data, const Ctx##name* func_ctx, size_t count); + template void Impl_##name(cudaStream_t stream, const T* input_data, T* output_data, const Ctx##name* func_ctx, size_t count); #if CUDA_VERSION >= 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) #define SPECIALIZED_UNARY_ACTIVATION_IMPL_BF16(name) SPECIALIZED_UNARY_ACTIVATION_IMPL(name, nv_bfloat16) diff --git a/onnxruntime/core/providers/cuda/activation/activations_impl.h b/onnxruntime/core/providers/cuda/activation/activations_impl.h index a3a39df63b..53359ae7a7 100644 --- 
a/onnxruntime/core/providers/cuda/activation/activations_impl.h +++ b/onnxruntime/core/providers/cuda/activation/activations_impl.h @@ -48,6 +48,7 @@ typedef CtxAlpha CtxThresholdedRelu; #define UNARY_ACTIVATION_IMPL_DECLARATION(name) \ template \ void Impl_##name( \ + cudaStream_t stream, \ const T* input_data, \ T* output_data, \ const Ctx##name* func_ctx, \ diff --git a/onnxruntime/core/providers/cuda/controlflow/loop.cc b/onnxruntime/core/providers/cuda/controlflow/loop.cc index 12779aca3d..5d430ecfa6 100644 --- a/onnxruntime/core/providers/cuda/controlflow/loop.cc +++ b/onnxruntime/core/providers/cuda/controlflow/loop.cc @@ -51,7 +51,7 @@ ONNX_OPERATOR_KERNEL_EX(Loop, .TypeConstraint("V", DataTypeImpl::AllFixedSizeTensorTypes()), Loop); -static Status ConcatenateGpuOutput(std::vector& per_iteration_output, +static Status ConcatenateGpuOutput(void* stream, std::vector& per_iteration_output, void* output, ptrdiff_t output_size_in_bytes) { const auto& first_output = per_iteration_output.front().Get(); const auto& per_iteration_shape = first_output.Shape(); @@ -68,8 +68,8 @@ static Status ConcatenateGpuOutput(std::vector& per_iteration_output, " Expected:", per_iteration_shape, " Got:", iteration_data.Shape()); } - CUDA_RETURN_IF_ERROR(cudaMemcpy(cur_output, iteration_data.DataRaw(), bytes_per_iteration, - cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(cur_output, iteration_data.DataRaw(), bytes_per_iteration, + cudaMemcpyDeviceToDevice, static_cast(stream))); cur_output = static_cast((static_cast(cur_output) + bytes_per_iteration)); } @@ -82,6 +82,7 @@ static Status ConcatenateGpuOutput(std::vector& per_iteration_output, Loop::Loop(const OpKernelInfo& info) : onnxruntime::Loop(info) { SetConcatOutputFunc(ConcatenateGpuOutput); + SetComputeStream(static_cast(info.GetExecutionProvider()->GetComputeStream())); } Status Loop::Compute(OpKernelContext* ctx) const { diff --git a/onnxruntime/core/providers/cuda/cu_inc/binary_elementwise_impl.cuh 
b/onnxruntime/core/providers/cuda/cu_inc/binary_elementwise_impl.cuh index 379b09fe39..069cf0658d 100644 --- a/onnxruntime/core/providers/cuda/cu_inc/binary_elementwise_impl.cuh +++ b/onnxruntime/core/providers/cuda/cu_inc/binary_elementwise_impl.cuh @@ -180,6 +180,7 @@ __global__ void _BinaryElementWiseRhsPerChannelBatchN( template void BinaryElementWiseNoBroadcastImpl( + cudaStream_t stream, const T1* lhs_data, const T2* rhs_data, T* output_data, @@ -190,7 +191,7 @@ void BinaryElementWiseNoBroadcastImpl( int blocksPerGrid = static_cast(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); CUDA_LONG N = static_cast(count); - _BinaryElementWiseSimple<<>>( + _BinaryElementWiseSimple<<>>( lhs_data, rhs_data, output_data, @@ -200,6 +201,7 @@ void BinaryElementWiseNoBroadcastImpl( template void BinaryElementWiseImpl( + cudaStream_t stream, int32_t output_rank_or_simple_broadcast, const TArray* lhs_padded_strides, const T1* lhs_data, @@ -217,14 +219,14 @@ void BinaryElementWiseImpl( int blocksPerGrid = static_cast(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); CUDA_LONG N = static_cast(count); if (output_rank_or_simple_broadcast == static_cast(SimpleBroadcast::NoBroadcast)) { - _BinaryElementWiseSimple<<>>( + _BinaryElementWiseSimple<<>>( lhs_data, rhs_data, output_data, func, N); } else if (output_rank_or_simple_broadcast == static_cast(SimpleBroadcast::LeftScalar)) { - _BinaryElementWiseSimple<<>>( + _BinaryElementWiseSimple<<>>( lhs_data, rhs_data, output_data, @@ -232,14 +234,14 @@ void BinaryElementWiseImpl( N); } else if (output_rank_or_simple_broadcast == static_cast(SimpleBroadcast::RightScalar)) { _BinaryElementWiseSimple<<>>( + GridDim::maxElementsPerThread><<>>( lhs_data, rhs_data, output_data, func, N); } else if (output_rank_or_simple_broadcast == static_cast(SimpleBroadcast::RightPerChannelBatch1)) { - _BinaryElementWiseRhsPerChannelBatch1<<>>( + _BinaryElementWiseRhsPerChannelBatch1<<>>( lhs_data, 
rhs_data, fdm_H, @@ -247,7 +249,7 @@ void BinaryElementWiseImpl( func, N); } else if (output_rank_or_simple_broadcast == static_cast(SimpleBroadcast::RightPerChannelBatchN)) { - _BinaryElementWiseRhsPerChannelBatchN<<>>( + _BinaryElementWiseRhsPerChannelBatchN<<>>( lhs_data, rhs_data, fdm_H, @@ -257,7 +259,7 @@ void BinaryElementWiseImpl( N); } else { if (lhs_padded_strides && rhs_padded_strides && lhs_padded_strides->Size() && rhs_padded_strides->Size()) - _BinaryElementWise<<>>( + _BinaryElementWise<<>>( output_rank_or_simple_broadcast, *lhs_padded_strides, lhs_data, @@ -268,7 +270,7 @@ void BinaryElementWiseImpl( func, N); else if (lhs_padded_strides && lhs_padded_strides->Size()) - _BinaryElementWise<<>>( + _BinaryElementWise<<>>( output_rank_or_simple_broadcast, *lhs_padded_strides, lhs_data, @@ -279,7 +281,7 @@ void BinaryElementWiseImpl( func, N); else if (rhs_padded_strides && rhs_padded_strides->Size()) - _BinaryElementWise<<>>( + _BinaryElementWise<<>>( output_rank_or_simple_broadcast, TArray(), // lhs is not computed, so no need to deference lhs_padded_strides lhs_data, diff --git a/onnxruntime/core/providers/cuda/cu_inc/unary_elementwise_impl.cuh b/onnxruntime/core/providers/cuda/cu_inc/unary_elementwise_impl.cuh index 04d04e2488..66113a1dff 100644 --- a/onnxruntime/core/providers/cuda/cu_inc/unary_elementwise_impl.cuh +++ b/onnxruntime/core/providers/cuda/cu_inc/unary_elementwise_impl.cuh @@ -39,6 +39,7 @@ __global__ void _UnaryElementWise( template void UnaryElementWiseImpl( + cudaStream_t stream, const InT* input_data, OutT* output_data, const FuncT& func, @@ -49,7 +50,7 @@ void UnaryElementWiseImpl( int blocksPerGrid = static_cast(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); CUDA_LONG N = static_cast(count); _UnaryElementWise - <<>>( + <<>>( input_data, output_data, func, diff --git a/onnxruntime/core/providers/cuda/cu_inc/variadic_elementwise_impl.cuh 
b/onnxruntime/core/providers/cuda/cu_inc/variadic_elementwise_impl.cuh index cd4269542a..e3a3c6a969 100644 --- a/onnxruntime/core/providers/cuda/cu_inc/variadic_elementwise_impl.cuh +++ b/onnxruntime/core/providers/cuda/cu_inc/variadic_elementwise_impl.cuh @@ -62,6 +62,7 @@ __global__ void VariadicElementWiseNoBroadcastInputBatchKernel( // - inputs and output have N elements template void VariadicElementWiseNoBroadcastInputBatchImpl( + cudaStream_t stream, Func func, size_t N, TArray inputs, @@ -70,7 +71,7 @@ void VariadicElementWiseNoBroadcastInputBatchImpl( constexpr int32_t threads_per_block = GridDim::maxThreadsPerBlock; const int32_t blocks_per_grid = static_cast(CeilDiv(N, elements_per_thread * threads_per_block)); VariadicElementWiseNoBroadcastInputBatchKernel - <<>>(func, N, inputs, output); + <<>>(func, N, inputs, output); } } // namespace cuda diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 17cd8c1336..ff542b78cb 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -59,10 +59,15 @@ ONNX_OPERATOR_KERNEL_EX( } // namespace cuda -CUDAExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy) { +CUDAExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id, cudaStream_t stream, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy) { CUDA_CALL_THROW(cudaSetDevice(device_id)); + stream_ = stream; + CUBLAS_CALL_THROW(cublasCreate(&cublas_handle_)); + CUBLAS_CALL_THROW(cublasSetStream(cublas_handle_, stream)); + CUDNN_CALL_THROW(cudnnCreate(&cudnn_handle_)); + CUDNN_CALL_THROW(cudnnSetStream(cudnn_handle_, stream)); AllocatorCreationInfo default_memory_info( [](OrtDevice::DeviceId id) { @@ -103,6 +108,12 @@ CUDAExecutionProvider::CUDAExecutionProvider(const 
CUDAExecutionProviderInfo& in // must wait GPU idle, otherwise cudaGetDeviceProperties might fail CUDA_CALL_THROW(cudaDeviceSynchronize()); CUDA_CALL_THROW(cudaGetDeviceProperties(&device_prop_, info_.device_id)); + if (info.has_user_compute_stream) { + external_stream_ = true; + stream_ = static_cast(info.user_compute_stream); + } else { + CUDA_CALL_THROW(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); + } size_t free = 0; size_t total = 0; @@ -136,6 +147,10 @@ CUDAExecutionProvider::~CUDAExecutionProvider() { ORT_IGNORE_RETURN_VALUE(cache->erase(this)); } } + + if (!external_stream_ && stream_) { + CUDA_CALL(cudaStreamDestroy(stream_)); + } } CUDAExecutionProvider::PerThreadContext& CUDAExecutionProvider::GetPerThreadContext() const { @@ -156,7 +171,7 @@ CUDAExecutionProvider::PerThreadContext& CUDAExecutionProvider::GetPerThreadCont // get or create a context if (context_state_.retired_context_pool.empty()) { - context = std::make_shared(info_.device_id, info_.cuda_mem_limit, info_.arena_extend_strategy); + context = std::make_shared(info_.device_id, static_cast(GetComputeStream()), info_.cuda_mem_limit, info_.arena_extend_strategy); } else { context = context_state_.retired_context_pool.back(); context_state_.retired_context_pool.pop_back(); @@ -254,10 +269,24 @@ Status CUDAExecutionProvider::OnRunStart() { Status CUDAExecutionProvider::OnRunEnd() { // record deferred release event on default stream, and release per_thread_context auto current_deferred_release_event = GetPerThreadContext().GetCurrentDeferredReleaseEvent(); - CUDA_RETURN_IF_ERROR(cudaEventRecord(current_deferred_release_event, nullptr)); + CUDA_RETURN_IF_ERROR(cudaEventRecord(current_deferred_release_event, static_cast(GetComputeStream()))); + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(static_cast(GetComputeStream()))); ReleasePerThreadContext(); std::lock_guard lock(deferred_release_cpu_ptr_mutex_); deferred_release_cpu_ptr_[current_deferred_release_event].recorded = true; + + 
return Status::OK(); +} + +Status CUDAExecutionProvider::SetComputeStream(void* stream) { + if (stream != stream_) { + if (stream_) { + CUDA_RETURN_IF_ERROR(cudaStreamDestroy(stream_)); + } + + external_stream_ = true; + stream_ = static_cast(stream); + } return Status::OK(); } @@ -1878,7 +1907,7 @@ static bool CastNeedFallbackToCPU(const onnxruntime::Node& node) { } std::unique_ptr CUDAExecutionProvider::GetDataTransfer() const { - return onnxruntime::make_unique(info_.do_copy_in_default_stream); + return onnxruntime::make_unique(static_cast(GetComputeStream()), info_.do_copy_in_default_stream); } std::vector> diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h index 2f55ee6bbb..f44a341e30 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h @@ -13,8 +13,8 @@ #include "core/platform/ort_mutex.h" #include "core/providers/cuda/cuda_execution_provider_info.h" #include "core/providers/cuda/cuda_pch.h" -#include "core/providers/cuda/gpu_data_transfer.h" #include "core/providers/cuda/shared_inc/cuda_utils.h" +#include "core/providers/cuda/shared_inc/cuda_call.h" namespace onnxruntime { @@ -37,6 +37,10 @@ class CUDAExecutionProvider : public IExecutionProvider { return nullptr; } + Status SetComputeStream(void* stream) override; + + void* GetComputeStream() const override { return static_cast(stream_); } + cublasHandle_t PerThreadCublasHandle() { return GetPerThreadContext().CublasHandle(); } @@ -80,6 +84,8 @@ class CUDAExecutionProvider : public IExecutionProvider { private: CUDAExecutionProviderInfo info_; cudaDeviceProp device_prop_; + bool external_stream_ = false; + cudaStream_t stream_ = nullptr; struct DeferredReleaseCPUPtrs { bool recorded = false; std::vector cpu_ptrs; @@ -90,7 +96,7 @@ class CUDAExecutionProvider : public IExecutionProvider { class PerThreadContext final { public: - 
PerThreadContext(OrtDevice::DeviceId device_id, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy); + PerThreadContext(OrtDevice::DeviceId device_id, cudaStream_t stream, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy); ~PerThreadContext(); cublasHandle_t CublasHandle() const { @@ -111,23 +117,23 @@ class CUDAExecutionProvider : public IExecutionProvider { if (!constant_ones_float_) { constant_ones_float_ = cuda::CreateConstantOnes(); } - return reinterpret_cast(constant_ones_float_->GetBuffer(count)); + return reinterpret_cast(constant_ones_float_->GetBuffer(stream_, count)); } else if (std::is_same::value) { if (!constant_ones_double_) { constant_ones_double_ = cuda::CreateConstantOnes(); } - return reinterpret_cast(constant_ones_double_->GetBuffer(count)); + return reinterpret_cast(constant_ones_double_->GetBuffer(stream_, count)); } else if (std::is_same::value) { if (!constant_ones_half_) { constant_ones_half_ = cuda::CreateConstantOnes(); } - return reinterpret_cast(constant_ones_half_->GetBuffer(count)); + return reinterpret_cast(constant_ones_half_->GetBuffer(stream_, count)); #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 } else if (std::is_same::value) { if (!constant_ones_bfloat16_) { constant_ones_bfloat16_ = cuda::CreateConstantOnes(); } - return reinterpret_cast(constant_ones_bfloat16_->GetBuffer(count)); + return reinterpret_cast(constant_ones_bfloat16_->GetBuffer(stream_, count)); #endif } else { return nullptr; @@ -139,6 +145,7 @@ class CUDAExecutionProvider : public IExecutionProvider { } private: + cudaStream_t stream_ = nullptr; cublasHandle_t cublas_handle_ = nullptr; cudnnHandle_t cudnn_handle_ = nullptr; diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h index 5ba2d07b9c..0b1e7bfe86 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h +++ 
b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h @@ -18,6 +18,8 @@ struct CUDAExecutionProviderInfo { ArenaExtendStrategy arena_extend_strategy{ArenaExtendStrategy::kNextPowerOfTwo}; OrtCudnnConvAlgoSearch cudnn_conv_algo_search{OrtCudnnConvAlgoSearch::EXHAUSTIVE}; bool do_copy_in_default_stream{true}; + bool has_user_compute_stream{false}; + void* user_compute_stream{nullptr}; static CUDAExecutionProviderInfo FromProviderOptions(const ProviderOptions& options); static ProviderOptions ToProviderOptions(const CUDAExecutionProviderInfo& info); diff --git a/onnxruntime/core/providers/cuda/cuda_kernel.h b/onnxruntime/core/providers/cuda/cuda_kernel.h index 12fccc9f19..8dd0a5b781 100644 --- a/onnxruntime/core/providers/cuda/cuda_kernel.h +++ b/onnxruntime/core/providers/cuda/cuda_kernel.h @@ -59,7 +59,9 @@ class CudaKernel : public OpKernel { provider_->AddDeferredReleaseCPUPtr(p); } - const cudaDeviceProp& GetDeviceProp() const { return provider_->GetDeviceProp(); }; + const cudaDeviceProp& GetDeviceProp() const { return provider_->GetDeviceProp(); } + + inline cudaStream_t Stream() const { return static_cast(provider_->GetComputeStream()); } // To support cudaMemcpyAsync, the cpu memory should be allocated in pinned memory // and it can only be released after the copy has finished @@ -94,7 +96,7 @@ class CudaKernel : public OpKernel { Status CopyToGpu() { if (cpu_pinned_copy_) { gpu_copy_ = op_kernel_->GetScratchBuffer(count_); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(gpu_copy_.get(), cpu_pinned_copy_.get(), count_ * sizeof(T), cudaMemcpyHostToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(gpu_copy_.get(), cpu_pinned_copy_.get(), count_ * sizeof(T), cudaMemcpyHostToDevice, op_kernel_->Stream())); op_kernel_->AddDeferredReleaseCPUPtr(cpu_pinned_copy_.release()); } return Status::OK(); diff --git a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc index ef3c7a4269..6a5e8fd8a0 100644 --- 
a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc +++ b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc @@ -57,7 +57,8 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_CUDA, info.arena_extend_strategy = static_cast(cuda_options->arena_extend_strategy); info.cudnn_conv_algo_search = cuda_options->cudnn_conv_algo_search; info.do_copy_in_default_stream = cuda_options->do_copy_in_default_stream; - + info.has_user_compute_stream = cuda_options->has_user_compute_stream; + info.user_compute_stream = cuda_options->user_compute_stream; options->provider_factories.push_back(onnxruntime::CreateExecutionProviderFactory_CUDA(info)); return nullptr; diff --git a/onnxruntime/core/providers/cuda/cuda_utils.cu b/onnxruntime/core/providers/cuda/cuda_utils.cu index a0ee56c6a3..c9cf75ef29 100644 --- a/onnxruntime/core/providers/cuda/cuda_utils.cu +++ b/onnxruntime/core/providers/cuda/cuda_utils.cu @@ -27,11 +27,11 @@ __global__ void _Fill( } template -void Fill(T* output, T value, int64_t count) { +void Fill(cudaStream_t stream, T* output, T value, int64_t count) { int blocksPerGrid = static_cast(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); CUDA_LONG N = static_cast(count); _Fill - <<>>(output, value, N); + <<>>(output, value, N); } template class ConstantBufferImpl : public IConstantBuffer { @@ -43,7 +43,7 @@ class ConstantBufferImpl : public IConstantBuffer { cudaFree(buffer_); } - virtual const T* GetBuffer(size_t count) { + virtual const T* GetBuffer(cudaStream_t stream, size_t count) { if (count > count_) { if (buffer_) { cudaFree(buffer_); @@ -52,7 +52,7 @@ class ConstantBufferImpl : public IConstantBuffer { CUDA_CALL_THROW(cudaMalloc(&buffer_, count * sizeof(T))); count_ = count; - Fill(buffer_, val_, count); + Fill(stream, buffer_, val_, count); } return buffer_; } @@ -76,7 +76,7 @@ template std::unique_ptr> CreateConstantOnes(T * output, T value, int64_t count); + template void Fill(cudaStream_t stream, T 
* output, T value, int64_t count); SPECIALIZED_FILL(int8_t) SPECIALIZED_FILL(int16_t) diff --git a/onnxruntime/core/providers/cuda/fpgeneric.cu b/onnxruntime/core/providers/cuda/fpgeneric.cu index 8f03004d89..695c6038c0 100644 --- a/onnxruntime/core/providers/cuda/fpgeneric.cu +++ b/onnxruntime/core/providers/cuda/fpgeneric.cu @@ -65,30 +65,30 @@ __global__ void CopyVectorBFloat16(const nv_bfloat16* x, int incx, nv_bfloat16* } // namespace -cublasStatus_t cublasTransposeHelper(cublasHandle_t, cublasOperation_t, cublasOperation_t, int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int) { +cublasStatus_t cublasTransposeHelper(cudaStream_t stream, cublasHandle_t, cublasOperation_t, cublasOperation_t, int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int) { if (C != A) { dim3 dimGrid((n + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, (m + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, 1); dim3 dimBlock(TRANS_TILE_DIM, BLOCK_ROWS, 1); - transposeNoOverlap<<>>(C, A, n, m); + transposeNoOverlap<<>>(C, A, n, m); } else { return CUBLAS_STATUS_NOT_SUPPORTED; } return CUBLAS_STATUS_SUCCESS; } -cublasStatus_t cublasCopyHelper(cublasHandle_t, int n, const half* x, int incx, half* y, int incy) { +cublasStatus_t cublasCopyHelper(cudaStream_t stream, cublasHandle_t, int n, const half* x, int incx, half* y, int incy) { dim3 dimGrid((unsigned int)(n + COPY_BLOCK_DIM - 1) / COPY_BLOCK_DIM, 1, 1); dim3 dimBlock(COPY_BLOCK_DIM, 1, 1); - CopyVectorHalf<<>>(x, incx, y, incy, n); + CopyVectorHalf<<>>(x, incx, y, incy, n); return CUBLAS_STATUS_SUCCESS; } #if CUDA_VERSION >= 11000 -cublasStatus_t cublasCopyHelper(cublasHandle_t, int n, const nv_bfloat16* x, int incx, nv_bfloat16* y, int incy) { +cublasStatus_t cublasCopyHelper(cudaStream_t stream, cublasHandle_t, int n, const nv_bfloat16* x, int incx, nv_bfloat16* y, int incy) { dim3 dimGrid((unsigned int)(n + COPY_BLOCK_DIM - 1) / COPY_BLOCK_DIM, 1, 1); dim3 dimBlock(COPY_BLOCK_DIM, 
1, 1); - CopyVectorBFloat16<<>>(x, incx, y, incy, n); + CopyVectorBFloat16<<>>(x, incx, y, incy, n); return CUBLAS_STATUS_SUCCESS; } diff --git a/onnxruntime/core/providers/cuda/generator/constant_of_shape.cc b/onnxruntime/core/providers/cuda/generator/constant_of_shape.cc index f598a0337b..052fdf5dfe 100644 --- a/onnxruntime/core/providers/cuda/generator/constant_of_shape.cc +++ b/onnxruntime/core/providers/cuda/generator/constant_of_shape.cc @@ -32,7 +32,7 @@ Status ConstantOfShape::ComputeInternal(OpKernelContext* ctx) const { #define CASE(TYPE) \ case sizeof(TYPE): \ if (size > 0) { \ - cuda::Fill(reinterpret_cast(output_data), *(reinterpret_cast(value_ptr)), size); \ + cuda::Fill(Stream(), reinterpret_cast(output_data), *(reinterpret_cast(value_ptr)), size); \ } \ break; diff --git a/onnxruntime/core/providers/cuda/generator/range.cc b/onnxruntime/core/providers/cuda/generator/range.cc index cc57d9d2a5..bc9c94c120 100644 --- a/onnxruntime/core/providers/cuda/generator/range.cc +++ b/onnxruntime/core/providers/cuda/generator/range.cc @@ -30,7 +30,7 @@ ONNX_OPERATOR_KERNEL_EX( Range); template -static Status ComputeRange(OpKernelContext* ctx) { +static Status ComputeRange(cudaStream_t stream, OpKernelContext* ctx) { const auto& start_tensor = *ctx->Input(0); const auto& limit_tensor = *ctx->Input(1); const auto* delta_tensor_ptr = ctx->Input(2); @@ -71,7 +71,7 @@ static Status ComputeRange(OpKernelContext* ctx) { T* y = ctx->Output(0, shape)->template MutableData(); if (count > 0) { - if (!RangeImpl(start, delta, count, y)) { + if (!RangeImpl(stream, start, delta, count, y)) { CUDA_CALL(cudaGetLastError()); return Status(common::ONNXRUNTIME, common::FAIL); } @@ -84,8 +84,8 @@ namespace cuda_range_internal { template struct CallCudaRangeImpl { - Status operator()(OpKernelContext* ctx) const { - return ComputeRange(ctx); + Status operator()(cudaStream_t stream, OpKernelContext* ctx) const { + return ComputeRange(stream, ctx); } }; @@ -100,7 +100,7 @@ Status 
Range::ComputeInternal(OpKernelContext* ctx) const { utils::MLTypeCallDispatcherRet t_disp(input_tensor->GetElementType()); - return t_disp.Invoke(ctx); + return t_disp.Invoke(Stream(), ctx); } } // namespace cuda diff --git a/onnxruntime/core/providers/cuda/generator/range_impl.cu b/onnxruntime/core/providers/cuda/generator/range_impl.cu index 4756350392..ede8146478 100644 --- a/onnxruntime/core/providers/cuda/generator/range_impl.cu +++ b/onnxruntime/core/providers/cuda/generator/range_impl.cu @@ -22,15 +22,15 @@ __global__ void RangeKernel(const T start, const T delta, const int count, T* ou } template -bool RangeImpl(const T start, const T delta, const int count, T* output) { +bool RangeImpl(cudaStream_t stream, const T start, const T delta, const int count, T* output) { constexpr int block_size = 256; int grid_size = (count + block_size - 1) / block_size; - RangeKernel<<>>(start, delta, count, output); + RangeKernel<<>>(start, delta, count, output); return CUDA_CALL(cudaPeekAtLastError()); } #define SPECIALIZED_IMPL(T) \ - template bool RangeImpl(const T start, const T delta, const int count, T* output); + template bool RangeImpl(cudaStream_t stream, const T start, const T delta, const int count, T* output); SPECIALIZED_IMPL(int16_t) SPECIALIZED_IMPL(int32_t) diff --git a/onnxruntime/core/providers/cuda/generator/range_impl.h b/onnxruntime/core/providers/cuda/generator/range_impl.h index 684978d544..608c65223a 100644 --- a/onnxruntime/core/providers/cuda/generator/range_impl.h +++ b/onnxruntime/core/providers/cuda/generator/range_impl.h @@ -9,7 +9,7 @@ namespace cuda { template -bool RangeImpl(const T start, const T delta, const int count, T* output); +bool RangeImpl(cudaStream_t stream, const T start, const T delta, const int count, T* output); } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/gpu_data_transfer.cc b/onnxruntime/core/providers/cuda/gpu_data_transfer.cc index 6618688087..cd83b3d612 100644 --- 
a/onnxruntime/core/providers/cuda/gpu_data_transfer.cc +++ b/onnxruntime/core/providers/cuda/gpu_data_transfer.cc @@ -9,12 +9,13 @@ // so we leave it as optional, in case user need the previous behavior // a full fix to BFC arena is being looked at, and once it's in, we can revert this change namespace onnxruntime { -GPUDataTransfer::GPUDataTransfer(bool do_copy_in_default_stream) { +GPUDataTransfer::GPUDataTransfer(cudaStream_t stream, bool do_copy_in_default_stream) { // create streams, default is nullptr - streams_[kCudaStreamDefault] = nullptr; + do_copy_in_default_stream_ = do_copy_in_default_stream; + streams_[kCudaStreamDefault] = stream; if (do_copy_in_default_stream) { - streams_[kCudaStreamCopyIn] = nullptr; - streams_[kCudaStreamCopyOut] = nullptr; + streams_[kCudaStreamCopyIn] = stream; + streams_[kCudaStreamCopyOut] = stream; } else { CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyIn], cudaStreamNonBlocking)); CUDA_CALL_THROW(cudaStreamCreateWithFlags(&streams_[kCudaStreamCopyOut], cudaStreamNonBlocking)); @@ -22,10 +23,10 @@ GPUDataTransfer::GPUDataTransfer(bool do_copy_in_default_stream) { } GPUDataTransfer::~GPUDataTransfer() { - if (streams_[kCudaStreamCopyIn] != nullptr) { + if (!do_copy_in_default_stream_ && streams_[kCudaStreamCopyIn] != nullptr) { CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyIn])); } - if (streams_[kCudaStreamCopyOut] != nullptr) { + if (!do_copy_in_default_stream_ && streams_[kCudaStreamCopyOut] != nullptr) { CUDA_CALL(cudaStreamDestroy(streams_[kCudaStreamCopyOut])); } } @@ -46,24 +47,26 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int e if (dst_device.Type() == OrtDevice::GPU) { if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) { // copy from pinned memory to GPU, this is non-blocking - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, streams_[exec_queue_id])); + 
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, GetStream(exec_queue_id))); } else if (src_device.Type() == OrtDevice::GPU) { // copying between GPU, this is non-blocking // Copy only if the two addresses are different. if (dst_data != src_data) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToDevice, streams_[kCudaStreamDefault])); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToDevice, GetStream(kCudaStreamDefault))); } } else { // copy from other CPU memory to GPU, this is blocking - CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyHostToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyHostToDevice, GetStream(kCudaStreamDefault))); + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(GetStream(kCudaStreamDefault))); } } else if (src_device.Type() == OrtDevice::GPU) { if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED) { // copying from GPU to pinned memory, this is non-blocking - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, streams_[exec_queue_id])); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, GetStream(exec_queue_id))); } else { // copying from GPU to CPU memory, this is blocking - CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyDeviceToHost)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDeviceToHost, GetStream(kCudaStreamDefault))); + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(GetStream(kCudaStreamDefault))); } } else { // copying between cpu memory diff --git a/onnxruntime/core/providers/cuda/gpu_data_transfer.h b/onnxruntime/core/providers/cuda/gpu_data_transfer.h index 055e2a90fd..f8eeb5fa97 100644 --- a/onnxruntime/core/providers/cuda/gpu_data_transfer.h +++ b/onnxruntime/core/providers/cuda/gpu_data_transfer.h 
@@ -17,7 +17,7 @@ enum CUDAStreamType : int { class GPUDataTransfer : public IDataTransfer { public: - GPUDataTransfer(bool do_copy_in_default_stream = true); + GPUDataTransfer(cudaStream_t stream, bool do_copy_in_default_stream = true); ~GPUDataTransfer(); bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override; @@ -32,6 +32,7 @@ class GPUDataTransfer : public IDataTransfer { } private: + bool do_copy_in_default_stream_; cudaStream_t streams_[kTotalCudaStreams]; }; diff --git a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc index c733fb85df..52bf4c0a41 100644 --- a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc +++ b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc @@ -145,6 +145,7 @@ Status BinaryElementwise::Prepare(OpKernelContext* context, Bin BinaryElementwisePreparation prepare; \ ORT_RETURN_IF_ERROR(Prepare(context, &prepare)); \ Impl_##x::MappedType>( \ + Stream(), \ prepare.output_rank_or_simple_broadcast, \ &prepare.lhs_padded_strides, \ reinterpret_cast::MappedType*>(prepare.lhs_tensor->template Data()), \ @@ -315,12 +316,13 @@ ONNX_OPERATOR_KERNEL_EX( namespace pow12_internal { template -Status DispatchOnFirstArg(const BinaryElementwisePreparation& prepare) { +Status DispatchOnFirstArg(cudaStream_t stream, const BinaryElementwisePreparation& prepare) { namespace on = ONNX_NAMESPACE; Status s; switch (prepare.rhs_tensor->GetElementType()) { case on::TensorProto_DataType_INT32: ImplT1_Pow::MappedType, typename ToCudaType::MappedType>( + stream, prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast::MappedType*>(prepare.lhs_tensor->template Data()), @@ -334,6 +336,7 @@ Status DispatchOnFirstArg(const BinaryElementwisePreparation& prepare) { break; case on::TensorProto_DataType_INT64: ImplT1_Pow::MappedType, typename ToCudaType::MappedType>( + stream, 
prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast::MappedType*>(prepare.lhs_tensor->template Data()), @@ -347,6 +350,7 @@ Status DispatchOnFirstArg(const BinaryElementwisePreparation& prepare) { break; case on::TensorProto_DataType_FLOAT: ImplT1_Pow::MappedType, typename ToCudaType::MappedType>( + stream, prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast::MappedType*>(prepare.lhs_tensor->template Data()), @@ -360,6 +364,7 @@ Status DispatchOnFirstArg(const BinaryElementwisePreparation& prepare) { break; case on::TensorProto_DataType_DOUBLE: ImplT1_Pow::MappedType, typename ToCudaType::MappedType>( + stream, prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast::MappedType*>(prepare.lhs_tensor->template Data()), @@ -373,6 +378,7 @@ Status DispatchOnFirstArg(const BinaryElementwisePreparation& prepare) { break; case on::TensorProto_DataType_FLOAT16: ImplT1_Pow::MappedType, typename ToCudaType::MappedType>( + stream, prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast::MappedType*>(prepare.lhs_tensor->template Data()), @@ -402,19 +408,19 @@ Status Pow::ComputeInternal(OpKernelContext* context) const { switch (prepare.lhs_tensor->GetElementType()) { case on::TensorProto_DataType_INT32: - s = DispatchOnFirstArg(prepare); + s = DispatchOnFirstArg(Stream(), prepare); break; case on::TensorProto_DataType_INT64: - s = DispatchOnFirstArg(prepare); + s = DispatchOnFirstArg(Stream(), prepare); break; case on::TensorProto_DataType_FLOAT: - s = DispatchOnFirstArg(prepare); + s = DispatchOnFirstArg(Stream(), prepare); break; case on::TensorProto_DataType_DOUBLE: - s = DispatchOnFirstArg(prepare); + s = DispatchOnFirstArg(Stream(), prepare); break; case on::TensorProto_DataType_FLOAT16: - s = DispatchOnFirstArg(prepare); + s = DispatchOnFirstArg(Stream(), prepare); break; default: s = ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, 
"Unsupported X type: ", @@ -431,6 +437,7 @@ Status CompareFunction::CompareMethod(OpKernelContext* context, ImplCo ORT_RETURN_IF_ERROR(Prepare(context, &prepare)); Impl_Compare( + Stream(), prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast(prepare.lhs_tensor->template Data()), diff --git a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.h b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.h index 4651004b91..f4c1675aaf 100644 --- a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.h +++ b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.h @@ -219,7 +219,8 @@ class CompareFunction : public BinaryElementwise { public: CompareFunction(const OpKernelInfo& info) : BinaryElementwise(info) {} - typedef void (*ImplCompare)(int32_t output_rank_or_simple_broadcast, + typedef void (*ImplCompare)(cudaStream_t stream, + int32_t output_rank_or_simple_broadcast, const TArray* lhs_padded_strides, const CudaT* lhs_data, const TArray* rhs_padded_strides, diff --git a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops_impl.cu b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops_impl.cu index f0cb62faaa..8dc09b7fbc 100644 --- a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops_impl.cu +++ b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops_impl.cu @@ -12,7 +12,8 @@ namespace cuda { #define BINARY_ELEMENTWISE_IMPL(name) \ BINARY_ELEMENTWISE_IMPL_DECLARATION(name) { \ - BinaryElementWiseImpl(output_rank_or_simple_broadcast, \ + BinaryElementWiseImpl(stream, \ + output_rank_or_simple_broadcast, \ lhs_padded_strides, \ lhs_data, \ rhs_padded_strides, \ @@ -27,7 +28,8 @@ namespace cuda { #define BINARY_ELEMENTWISE_IMPL_T1(name) \ BINARY_ELEMENTWISE_IMPL_DECLARATION_T1(name) { \ - BinaryElementWiseImpl(output_rank_or_simple_broadcast, \ + BinaryElementWiseImpl(stream, \ + output_rank_or_simple_broadcast, \ lhs_padded_strides, \ lhs_data, \ rhs_padded_strides, \ @@ -42,7 
+44,8 @@ namespace cuda { #define BINARY_ELEMENTWISE_IMPL_T2(name) \ BINARY_ELEMENTWISE_IMPL_DECLARATION_T2(name) { \ - BinaryElementWiseImpl(output_rank_or_simple_broadcast, \ + BinaryElementWiseImpl(stream, \ + output_rank_or_simple_broadcast, \ lhs_padded_strides, \ lhs_data, \ rhs_padded_strides, \ @@ -56,19 +59,22 @@ namespace cuda { } #define SPECIALIZED_BINARY_ELEMENTWISE_IMPL(x, T) \ - template void Impl_##x(int32_t output_rank, \ + template void Impl_##x(cudaStream_t stream, \ + int32_t output_rank, \ const TArray* lhs_padded_strides, const T* lhs_data, \ const TArray* rhs_padded_strides, const T* rhs_data, \ const TArray* fdm_output_strides, const fast_divmod& fdm_H, const fast_divmod& fdm_C, T* output_data, size_t count); #define SPECIALIZED_BINARY_ELEMENTWISE_IMPL_T1(x, T, T1) \ - template void ImplT1_##x(int32_t output_rank, \ + template void ImplT1_##x(cudaStream_t stream, \ + int32_t output_rank, \ const TArray* lhs_padded_strides, const T* lhs_data, \ const TArray* rhs_padded_strides, const T1* rhs_data, \ const TArray* fdm_output_strides, const fast_divmod& fdm_H, const fast_divmod& fdm_C, T* output_data, size_t count); #define SPECIALIZED_BINARY_ELEMENTWISE_IMPL_T2(x, T, T1, T2) \ - template void ImplT2_##x(int32_t output_rank, \ + template void ImplT2_##x(cudaStream_t stream, \ + int32_t output_rank, \ const TArray* lhs_padded_strides, const T1* lhs_data, \ const TArray* rhs_padded_strides, const T2* rhs_data, \ const TArray* fdm_output_strides, const fast_divmod& fdm_H, const fast_divmod& fdm_C, T* output_data, size_t count); diff --git a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops_impl.h b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops_impl.h index dbc7e89a03..c9a8c0f1d3 100644 --- a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops_impl.h +++ b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops_impl.h @@ -34,6 +34,7 @@ namespace cuda { #define BINARY_ELEMENTWISE_IMPL_DECLARATION(name) \ template 
\ void Impl_##name( \ + cudaStream_t stream, \ int32_t output_rank_or_simple_broadcast, \ const TArray* lhs_padded_strides, \ const T* lhs_data, \ @@ -52,6 +53,7 @@ BINARY_OPS() #define BINARY_ELEMENTWISE_IMPL_DECLARATION_T1(name) \ template \ void ImplT1_##name( \ + cudaStream_t stream, \ int32_t output_rank_or_simple_broadcast, \ const TArray* lhs_padded_strides, \ const T* lhs_data, \ @@ -68,6 +70,7 @@ BINARY_ELEMENTWISE_IMPL_DECLARATION_T1(Pow); #define BINARY_ELEMENTWISE_IMPL_DECLARATION_T2(name) \ template \ void ImplT2_##name( \ + cudaStream_t stream, \ int32_t output_rank_or_simple_broadcast, \ const TArray* lhs_padded_strides, \ const T1* lhs_data, \ diff --git a/onnxruntime/core/providers/cuda/math/clip.cc b/onnxruntime/core/providers/cuda/math/clip.cc index 6189f703f2..c6d41f4cb7 100644 --- a/onnxruntime/core/providers/cuda/math/clip.cc +++ b/onnxruntime/core/providers/cuda/math/clip.cc @@ -62,7 +62,7 @@ Status Clip_6::ComputeInternal(OpKernelContext* ctx) const { if (count > 0) { auto* y_data = Y->template MutableData(); const auto* x_data = X.template Data(); - ClipImpl(x_data, y_data, this->min_, this->max_, count); + ClipImpl(Stream(), x_data, y_data, this->min_, this->max_, count); } return Status::OK(); } @@ -91,7 +91,7 @@ struct LowMax { template struct Clip::ComputeImpl { - void operator()(const Tensor* X, const Tensor* min, const Tensor* max, Tensor* Y) const { + void operator()(cudaStream_t stream, const Tensor* X, const Tensor* min, const Tensor* max, Tensor* Y) const { auto min_val = clip_internal::LowMax::low(); auto max_val = clip_internal::LowMax::max(); @@ -110,7 +110,7 @@ struct Clip::ComputeImpl { if (count > 0) { auto* y_data = Y->template MutableData(); const auto* x_data = X->template Data(); - ClipImpl(x_data, y_data, min_val, max_val, count); + ClipImpl(stream, x_data, y_data, min_val, max_val, count); } } }; @@ -124,7 +124,7 @@ Status Clip::ComputeInternal(OpKernelContext* ctx) const { utils::MLTypeCallDispatcher 
t_disp(X->GetElementType()); - t_disp.Invoke(X, min, max, Y); + t_disp.Invoke(Stream(), X, min, max, Y); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/math/clip_impl.cu b/onnxruntime/core/providers/cuda/math/clip_impl.cu index 3af9283bd8..4164038972 100644 --- a/onnxruntime/core/providers/cuda/math/clip_impl.cu +++ b/onnxruntime/core/providers/cuda/math/clip_impl.cu @@ -13,24 +13,24 @@ __global__ void _Clip(const T* input, T* output, T min, T max, size_t N) { } template -void ClipImpl(const T* input_data, T* output_data, T min, T max, size_t count) { +void ClipImpl(cudaStream_t stream, const T* input_data, T* output_data, T min, T max, size_t count) { typedef typename ToCudaType::MappedType CudaT; int blocksPerGrid = (int)(ceil(static_cast(count) / GridDim::maxThreadsPerBlock)); - _Clip<<>>(reinterpret_cast(input_data), + _Clip<<>>(reinterpret_cast(input_data), reinterpret_cast(output_data), *reinterpret_cast(&min), *reinterpret_cast(&max), count); } -template void ClipImpl(const float* input_data, float* output_data, float min, float max, size_t count); -template void ClipImpl(const double* input_data, double* output_data, double min, double max, size_t count); -template void ClipImpl(const MLFloat16* input_data, MLFloat16* output_data, MLFloat16 min, MLFloat16 max, size_t count); -template void ClipImpl(const int8_t* input_data, int8_t* output_data, int8_t min, int8_t max, size_t count); -template void ClipImpl(const uint8_t* input_data, uint8_t* output_data, uint8_t min, uint8_t max, size_t count); -template void ClipImpl(const int64_t* input_data, int64_t* output_data, int64_t min, int64_t max, size_t count); -template void ClipImpl(const uint64_t* input_data, uint64_t* output_data, uint64_t min, uint64_t max, size_t count); +template void ClipImpl(cudaStream_t stream, const float* input_data, float* output_data, float min, float max, size_t count); +template void ClipImpl(cudaStream_t stream, const double* input_data, double* output_data, 
double min, double max, size_t count); +template void ClipImpl(cudaStream_t stream, const MLFloat16* input_data, MLFloat16* output_data, MLFloat16 min, MLFloat16 max, size_t count); +template void ClipImpl(cudaStream_t stream, const int8_t* input_data, int8_t* output_data, int8_t min, int8_t max, size_t count); +template void ClipImpl(cudaStream_t stream, const uint8_t* input_data, uint8_t* output_data, uint8_t min, uint8_t max, size_t count); +template void ClipImpl(cudaStream_t stream, const int64_t* input_data, int64_t* output_data, int64_t min, int64_t max, size_t count); +template void ClipImpl(cudaStream_t stream, const uint64_t* input_data, uint64_t* output_data, uint64_t min, uint64_t max, size_t count); } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/math/clip_impl.h b/onnxruntime/core/providers/cuda/math/clip_impl.h index b497a44888..6aefa7b90b 100644 --- a/onnxruntime/core/providers/cuda/math/clip_impl.h +++ b/onnxruntime/core/providers/cuda/math/clip_impl.h @@ -10,7 +10,7 @@ namespace onnxruntime { namespace cuda { template -void ClipImpl(const T* input_data, T* output_data, T min, T max, size_t count); +void ClipImpl(cudaStream_t stream, const T* input_data, T* output_data, T min, T max, size_t count); } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/math/cumsum.cc b/onnxruntime/core/providers/cuda/math/cumsum.cc index e0daf9e980..9541eec6da 100644 --- a/onnxruntime/core/providers/cuda/math/cumsum.cc +++ b/onnxruntime/core/providers/cuda/math/cumsum.cc @@ -77,7 +77,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const { fast_divmod fast_divmod_input_stride_along_axis(static_cast(input_stride_along_axis)); if (input->IsDataType()) { - CumSumImpl(reinterpret_cast::MappedType*>(input->Data()), + CumSumImpl(Stream(), reinterpret_cast::MappedType*>(input->Data()), fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis, 
reinterpret_cast::MappedType*>(output.MutableData()), @@ -85,7 +85,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const { exclusive_, reverse_); } else if (input->IsDataType()) { - CumSumImpl(reinterpret_cast::MappedType*>(input->Data()), + CumSumImpl(Stream(), reinterpret_cast::MappedType*>(input->Data()), fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis, reinterpret_cast::MappedType*>(output.MutableData()), @@ -93,7 +93,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const { exclusive_, reverse_); } else if (input->IsDataType()) { - CumSumImpl(reinterpret_cast::MappedType*>(input->Data()), + CumSumImpl(Stream(), reinterpret_cast::MappedType*>(input->Data()), fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis, reinterpret_cast::MappedType*>(output.MutableData()), @@ -101,7 +101,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const { exclusive_, reverse_); } else if (input->IsDataType()) { - CumSumImpl(reinterpret_cast::MappedType*>(input->Data()), + CumSumImpl(Stream(), reinterpret_cast::MappedType*>(input->Data()), fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis, reinterpret_cast::MappedType*>(output.MutableData()), @@ -109,7 +109,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const { exclusive_, reverse_); } else if (input->IsDataType()) { - CumSumImpl(reinterpret_cast::MappedType*>(input->Data()), + CumSumImpl(Stream(), reinterpret_cast::MappedType*>(input->Data()), fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis, reinterpret_cast::MappedType*>(output.MutableData()), @@ -117,7 +117,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const { exclusive_, reverse_); } else if (input->IsDataType()) { - CumSumImpl(reinterpret_cast::MappedType*>(input->Data()), + CumSumImpl(Stream(), reinterpret_cast::MappedType*>(input->Data()), fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis, 
reinterpret_cast::MappedType*>(output.MutableData()), @@ -125,7 +125,7 @@ Status CumSum::ComputeInternal(OpKernelContext* ctx) const { exclusive_, reverse_); } else if (input->IsDataType()) { - CumSumImpl(reinterpret_cast::MappedType*>(input->Data()), + CumSumImpl(Stream(), reinterpret_cast::MappedType*>(input->Data()), fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis, reinterpret_cast::MappedType*>(output.MutableData()), diff --git a/onnxruntime/core/providers/cuda/math/cumsum_impl.cu b/onnxruntime/core/providers/cuda/math/cumsum_impl.cu index 901bf8d2be..8a657dd9dc 100644 --- a/onnxruntime/core/providers/cuda/math/cumsum_impl.cu +++ b/onnxruntime/core/providers/cuda/math/cumsum_impl.cu @@ -71,6 +71,7 @@ __global__ void _CumSumKernel( template void CumSumImpl( + cudaStream_t stream, const T* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, @@ -81,7 +82,7 @@ void CumSumImpl( if (output_size > 0) { int blocksPerGrid = static_cast((output_size + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock); - _CumSumKernel<<>>(input_data, + _CumSumKernel<<>>(input_data, input_dim_along_axis, input_stride_along_axis, output_data, @@ -92,6 +93,7 @@ void CumSumImpl( } template void CumSumImpl( + cudaStream_t stream, const int32_t* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, @@ -101,6 +103,7 @@ template void CumSumImpl( bool reverse); template void CumSumImpl( + cudaStream_t stream, const int64_t* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, @@ -110,6 +113,7 @@ template void CumSumImpl( bool reverse); template void CumSumImpl( + cudaStream_t stream, const uint32_t* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, @@ -119,6 +123,7 @@ template void CumSumImpl( bool reverse); template void CumSumImpl( + cudaStream_t stream, const uint64_t* input_data, 
const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, @@ -128,6 +133,7 @@ template void CumSumImpl( bool reverse); template void CumSumImpl( + cudaStream_t stream, const float* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, @@ -137,6 +143,7 @@ template void CumSumImpl( bool reverse); template void CumSumImpl( + cudaStream_t stream, const double* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, @@ -146,6 +153,7 @@ template void CumSumImpl( bool reverse); template void CumSumImpl( + cudaStream_t stream, const half* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, diff --git a/onnxruntime/core/providers/cuda/math/cumsum_impl.h b/onnxruntime/core/providers/cuda/math/cumsum_impl.h index f64a863ec9..ad77f748b0 100644 --- a/onnxruntime/core/providers/cuda/math/cumsum_impl.h +++ b/onnxruntime/core/providers/cuda/math/cumsum_impl.h @@ -11,6 +11,7 @@ namespace cuda { template void CumSumImpl( + cudaStream_t stream, const T* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, diff --git a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.cc b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.cc index b1da3135f9..4d3fd9d83b 100644 --- a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.cc +++ b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.cc @@ -12,14 +12,15 @@ namespace DeviceHelpers { namespace CudaDeviceHelpers { // CUDA EP specific Data copy helper -Status DataCopy(const Tensor& input, Tensor& output) { +Status DataCopy(const Tensor& input, Tensor& output, void* einsum_cuda_assets) { ORT_ENFORCE(output.SizeInBytes() == input.SizeInBytes(), "Einsum op: The candidate output does not match the actual output's shape"); // There are no string tensors in Einsum's case - so 
safely use memcpy // TODO: Currently, triggers copy on stream 0, investigate if we can still do that // *if* the kernel is launched in a different stream CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output.MutableDataRaw(), input.DataRaw(), input.SizeInBytes(), - cudaMemcpyDeviceToDevice)); + cudaMemcpyDeviceToDevice, + static_cast(static_cast(einsum_cuda_assets)->cuda_ep_->GetComputeStream()))); return Status::OK(); } @@ -28,6 +29,7 @@ Status DataCopy(const Tensor& input, Tensor& output) { Status Transpose(const std::vector& permutation, const Tensor& input, Tensor& output, const TensorShape* input_shape_override, void* einsum_cuda_assets) { return cuda::Transpose::DoTranspose(static_cast(einsum_cuda_assets)->cuda_ep_->GetDeviceProp(), + static_cast(static_cast(einsum_cuda_assets)->cuda_ep_->GetComputeStream()), static_cast(einsum_cuda_assets)->cublas_handle_, permutation, input, output, input_shape_override); } @@ -79,7 +81,7 @@ Tensor ReduceSum(const Tensor& input, const std::vector& reduce_axes, } // CUDA EP specific Diagonal helper -std::unique_ptr Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator) { +std::unique_ptr Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator, void* einsum_cuda_assets) { const auto& input_shape = input.Shape(); const auto& input_dims = input_shape.GetDims(); auto rank = static_cast(input_dims.size()); @@ -117,6 +119,7 @@ std::unique_ptr Diagonal(const Tensor& input, int64_t dim_1, int64_t dim } DiagonalImpl( + static_cast(static_cast(einsum_cuda_assets)->cuda_ep_->GetComputeStream()), input.DataRaw(), input.Shape().GetDims().size(), first_dim, diff --git a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.h b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.h index 90aa863a87..797f8f301e 100644 --- a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.h +++ 
b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops.h @@ -38,7 +38,7 @@ namespace CudaDeviceHelpers { Status Transpose(const std::vector& permutation, const Tensor& input, Tensor& output, const TensorShape* input_shape_override, void* einsum_cuda_assets); -Status DataCopy(const Tensor& input, Tensor& output); +Status DataCopy(const Tensor& input, Tensor& output, void* einsum_cuda_assets); template Status MatMul(const T* input_1_data, const T* input_2_data, T* output_data, @@ -52,7 +52,7 @@ Tensor ReduceSum(const Tensor& input, const std::vector& reduce_axes, const TensorShape* input_shape_override, concurrency::ThreadPool* /*tp*/, void* einsum_cuda_assets); -std::unique_ptr Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator); +std::unique_ptr Diagonal(const Tensor& input, int64_t dim_1, int64_t dim_2, AllocatorPtr allocator, void* einsum_cuda_assets); } // namespace CudaDeviceHelpers diff --git a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops_diagonal.cu b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops_diagonal.cu index 6b73ae3117..d84396cc21 100644 --- a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops_diagonal.cu +++ b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops_diagonal.cu @@ -47,6 +47,7 @@ __global__ void _DiagonalKernel( } void DiagonalImpl( + cudaStream_t stream, const void* input_data, const int64_t input_rank, const int64_t dim_1, @@ -61,14 +62,14 @@ void DiagonalImpl( switch (element_size) { case sizeof(int32_t): - _DiagonalKernel<<>>( + _DiagonalKernel<<>>( reinterpret_cast::MappedType*>(input_data), input_rank, dim_1, dim_2, input_strides, reinterpret_cast::MappedType*>(output_data), output_strides, output_size); break; case sizeof(int64_t): - _DiagonalKernel<<>>( + _DiagonalKernel<<>>( reinterpret_cast::MappedType*>(input_data), input_rank, dim_1, dim_2, input_strides, 
reinterpret_cast::MappedType*>(output_data), output_strides, output_size); diff --git a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops_diagonal.h b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops_diagonal.h index 483978e663..f0d8416809 100644 --- a/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops_diagonal.h +++ b/onnxruntime/core/providers/cuda/math/einsum_utils/einsum_auxiliary_ops_diagonal.h @@ -10,6 +10,7 @@ namespace onnxruntime { namespace cuda { void DiagonalImpl( + cudaStream_t stream, const void* input_data, const int64_t input_rank, const int64_t dim_1, diff --git a/onnxruntime/core/providers/cuda/math/gemm.cc b/onnxruntime/core/providers/cuda/math/gemm.cc index 03819891e9..79eeb5eaf2 100644 --- a/onnxruntime/core/providers/cuda/math/gemm.cc +++ b/onnxruntime/core/providers/cuda/math/gemm.cc @@ -86,6 +86,7 @@ Status Gemm::ComputeInternal(OpKernelContext* ctx) const { if (b_shape.Size() == 1) { // if B is (), (1,) or (1, 1), broadcast the scalar CUBLAS_RETURN_IF_ERROR(cublasCopyHelper( + Stream(), CublasHandle(), M * N, b_data, @@ -118,7 +119,7 @@ Status Gemm::ComputeInternal(OpKernelContext* ctx) const { out_data, N, device_prop)); } else { // B is (M, N), no broadcast needed. 
- CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(out_data, b_data, M * N * sizeof(T), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(out_data, b_data, M * N * sizeof(T), cudaMemcpyDeviceToDevice, Stream())); } } diff --git a/onnxruntime/core/providers/cuda/math/matmul_integer.cc b/onnxruntime/core/providers/cuda/math/matmul_integer.cc index 5c7cb81320..89e738fca0 100644 --- a/onnxruntime/core/providers/cuda/math/matmul_integer.cc +++ b/onnxruntime/core/providers/cuda/math/matmul_integer.cc @@ -70,19 +70,20 @@ Status MatMulInteger::ComputeInternal(OpKernelContext* ctx) cons IAllocatorUniquePtr a_row_buf; if (b_offset != 0) { a_row_buf = GetScratchBuffer(helper.OutputShape().Size() / helper.N()); - ORT_RETURN_IF_ERROR(ReduceRowSumOnMatrixA(a_ptr, a_row_buf.get(), b_offset, helper)); + ORT_RETURN_IF_ERROR(ReduceRowSumOnMatrixA(Stream(), a_ptr, a_row_buf.get(), b_offset, helper)); } IAllocatorUniquePtr b_col_buf; if (a_offset != 0) { b_col_buf = GetScratchBuffer(helper.OutputShape().Size() / helper.M()); - ORT_RETURN_IF_ERROR(ReduceColSumOnMatrixB(b_ptr, b_col_buf.get(), a_offset, helper)); + ORT_RETURN_IF_ERROR(ReduceColSumOnMatrixB(Stream(), b_ptr, b_col_buf.get(), a_offset, helper)); } int alpha = 1; int beta = 0; if (a_offset != 0 || b_offset != 0) { - OffsetOutput(a_row_buf.get(), + OffsetOutput(Stream(), + a_row_buf.get(), b_col_buf.get(), output_ptr, a_offset, diff --git a/onnxruntime/core/providers/cuda/math/matmul_integer.cu b/onnxruntime/core/providers/cuda/math/matmul_integer.cu index 267cf198c9..f6a9d6488b 100644 --- a/onnxruntime/core/providers/cuda/math/matmul_integer.cu +++ b/onnxruntime/core/providers/cuda/math/matmul_integer.cu @@ -26,9 +26,9 @@ __global__ void ReduceRowSumOnMatrixAKernel(const int8_t* matrix, int32_t* row_s } } -Status ReduceRowSumOnMatrixA(const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper) { +Status ReduceRowSumOnMatrixA(cudaStream_t stream, const int8_t* matrix, int32_t* 
row_sum, const int8_t offset, const MatMulComputeHelper& helper) { for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) { - ReduceRowSumOnMatrixAKernel(GridDim::maxThreadsPerBlock)><<(helper.M()), GridDim::maxThreadsPerBlock, 0>>>(matrix + helper.LeftOffsets()[batch], + ReduceRowSumOnMatrixAKernel(GridDim::maxThreadsPerBlock)><<(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>(matrix + helper.LeftOffsets()[batch], row_sum + batch * helper.M(), offset, static_cast(helper.K())); @@ -54,9 +54,9 @@ __global__ void ReduceColSumOnMatrixBKernel(const int8_t* matrix, int32_t* col_s } } -Status ReduceColSumOnMatrixB(const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper) { +Status ReduceColSumOnMatrixB(cudaStream_t stream, const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper) { for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) { - ReduceColSumOnMatrixBKernel(GridDim::maxThreadsPerBlock)><<(helper.N()), GridDim::maxThreadsPerBlock, 0>>>(matrix + helper.RightOffsets()[batch], + ReduceColSumOnMatrixBKernel(GridDim::maxThreadsPerBlock)><<(helper.N()), GridDim::maxThreadsPerBlock, 0, stream>>>(matrix + helper.RightOffsets()[batch], col_sum + batch * helper.N(), offset, static_cast(helper.K()), @@ -92,7 +92,8 @@ __global__ void ComputeOffsetOfMatrixB(const int32_t* row_sum, } } -Status OffsetOutput(const int32_t* row_sum, +Status OffsetOutput(cudaStream_t stream, + const int32_t* row_sum, const int32_t* col_sum, int32_t* output, const int8_t a_offset, @@ -100,7 +101,7 @@ Status OffsetOutput(const int32_t* row_sum, const MatMulComputeHelper& helper) { if (a_offset && b_offset) { for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) { - ComputeOffsetOfMatrixAB<<(helper.M()), GridDim::maxThreadsPerBlock, 0>>>( + ComputeOffsetOfMatrixAB<<(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>( row_sum + batch * helper.M(), col_sum + batch * 
helper.N(), output + helper.OutputOffsets()[batch], @@ -109,14 +110,14 @@ Status OffsetOutput(const int32_t* row_sum, } } else if (a_offset) { for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) { - ComputeOffsetOfMatrixA<<(helper.M()), GridDim::maxThreadsPerBlock, 0>>>( + ComputeOffsetOfMatrixA<<(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>( col_sum + batch * helper.N(), output + helper.OutputOffsets()[batch], static_cast(helper.N())); } } else if (b_offset) { for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) { - ComputeOffsetOfMatrixB<<(helper.M()), GridDim::maxThreadsPerBlock, 0>>>( + ComputeOffsetOfMatrixB<<(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>( row_sum + batch * helper.M(), output + helper.OutputOffsets()[batch], static_cast(helper.N())); diff --git a/onnxruntime/core/providers/cuda/math/matmul_integer.cuh b/onnxruntime/core/providers/cuda/math/matmul_integer.cuh index e22bbf4d24..e6dc24fc08 100644 --- a/onnxruntime/core/providers/cuda/math/matmul_integer.cuh +++ b/onnxruntime/core/providers/cuda/math/matmul_integer.cuh @@ -11,9 +11,10 @@ namespace onnxruntime { namespace cuda { -Status ReduceRowSumOnMatrixA(const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper); -Status ReduceColSumOnMatrixB(const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper); -Status OffsetOutput(const int32_t* row_sum, +Status ReduceRowSumOnMatrixA(cudaStream_t stream, const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper); +Status ReduceColSumOnMatrixB(cudaStream_t stream, const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper); +Status OffsetOutput(cudaStream_t stream, + const int32_t* row_sum, const int32_t* col_sum, int32_t* output, const int8_t a_offset, diff --git a/onnxruntime/core/providers/cuda/math/softmax.cc 
b/onnxruntime/core/providers/cuda/math/softmax.cc index 09753b66a2..7efeb6425e 100644 --- a/onnxruntime/core/providers/cuda/math/softmax.cc +++ b/onnxruntime/core/providers/cuda/math/softmax.cc @@ -13,6 +13,7 @@ namespace cuda { template Status SoftMaxComputeHelper( + cudaStream_t stream, const T* X, const TensorShape& input_shape, T* Y, @@ -28,7 +29,7 @@ Status SoftMaxComputeHelper( // cudnnSoftmaxForward/Backward is not optimal implementation. // TODO: remove cudnn path completely in the future. if (D <= 1024 && D * sizeof(T) <= 4096) { - dispatch_softmax_forward, is_log_softmax>(Y_data, X_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); + dispatch_softmax_forward, is_log_softmax>(stream, Y_data, X_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); return Status::OK(); } @@ -50,8 +51,8 @@ Status SoftMaxComputeHelper( } #define SPECIALIZED_SOFTMAX_HELPER_IMPL(T) \ - template Status SoftMaxComputeHelper(const T* input, const TensorShape& shape, T* Y, cudnnHandle_t handle, int64_t axis); \ - template Status SoftMaxComputeHelper(const T* input, const TensorShape& shape, T* Y, cudnnHandle_t handle, int64_t axis); + template Status SoftMaxComputeHelper(cudaStream_t stream, const T* input, const TensorShape& shape, T* Y, cudnnHandle_t handle, int64_t axis); \ + template Status SoftMaxComputeHelper(cudaStream_t stream, const T* input, const TensorShape& shape, T* Y, cudnnHandle_t handle, int64_t axis); SPECIALIZED_SOFTMAX_HELPER_IMPL(float) SPECIALIZED_SOFTMAX_HELPER_IMPL(double) @@ -62,6 +63,7 @@ SPECIALIZED_SOFTMAX_HELPER_IMPL(MLFloat16) #define SPECIALIZED_SOFTMAX_HELPER_IMPL_BFloat16(is_log_softmax) \ template <> \ Status SoftMaxComputeHelper( \ + cudaStream_t stream, \ const BFloat16* X, \ const TensorShape& input_shape, \ BFloat16* Y, \ @@ -73,7 +75,7 @@ SPECIALIZED_SOFTMAX_HELPER_IMPL(MLFloat16) auto Y_data = reinterpret_cast(Y); \ auto X_data = reinterpret_cast(X); \ dispatch_softmax_forward, is_log_softmax>( \ - Y_data, 
X_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); \ + stream, Y_data, X_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); \ return Status::OK(); \ } @@ -183,6 +185,7 @@ Status Softmax::ComputeInternal(OpKernelContext* ctx) const { // Perform the transpose ORT_RETURN_IF_ERROR(Transpose::DoTranspose(cuda_ep_->GetDeviceProp(), + Stream(), CublasHandle(), permutation, *X, temp_input)); transposed_input = std::move(temp_input); @@ -208,11 +211,11 @@ Status Softmax::ComputeInternal(OpKernelContext* ctx) const { Status status; if (log_softmax_) { - status = SoftMaxComputeHelper(X_data, *compute_input_shape, Y_data, CudnnHandle(), + status = SoftMaxComputeHelper(Stream(), X_data, *compute_input_shape, Y_data, CudnnHandle(), is_transpose_required ? static_cast(rank) - 1 : static_cast(axis)); } else { - status = SoftMaxComputeHelper(X_data, *compute_input_shape, Y_data, CudnnHandle(), + status = SoftMaxComputeHelper(Stream(), X_data, *compute_input_shape, Y_data, CudnnHandle(), is_transpose_required ? 
static_cast(rank) - 1 : static_cast(axis)); } @@ -227,6 +230,7 @@ Status Softmax::ComputeInternal(OpKernelContext* ctx) const { } // Perform the transpose to get the axes back to the original ordering ORT_RETURN_IF_ERROR(Transpose::DoTranspose(cuda_ep_->GetDeviceProp(), + Stream(), CublasHandle(), reverse_permutation, intermediate_output, *Y)); } diff --git a/onnxruntime/core/providers/cuda/math/softmax.h b/onnxruntime/core/providers/cuda/math/softmax.h index 772920de88..3af26690dc 100644 --- a/onnxruntime/core/providers/cuda/math/softmax.h +++ b/onnxruntime/core/providers/cuda/math/softmax.h @@ -11,6 +11,7 @@ namespace cuda { template Status SoftMaxComputeHelper( + cudaStream_t stream, const T* input, const TensorShape& shape, T* Y, @@ -18,7 +19,7 @@ Status SoftMaxComputeHelper( int64_t axis); template -void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); +void dispatch_softmax_forward(cudaStream_t stream, output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); template class Softmax final : public CudaKernel { diff --git a/onnxruntime/core/providers/cuda/math/softmax_impl.cu b/onnxruntime/core/providers/cuda/math/softmax_impl.cu index f4658e93fe..80a680963f 100644 --- a/onnxruntime/core/providers/cuda/math/softmax_impl.cu +++ b/onnxruntime/core/providers/cuda/math/softmax_impl.cu @@ -135,7 +135,7 @@ __global__ void softmax_warp_forward(output_t* dst, const input_t* src, int batc } template -void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count) { +void dispatch_softmax_forward(cudaStream_t stream, output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count) { if (softmax_elements == 0) { return; } else { @@ -159,47 +159,47 @@ void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_ele switch 
(log2_elements) { case 0: // 1 softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); + <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 1: // 2 softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); + <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 2: // 4 softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); + <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 3: // 8 softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); + <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 4: // 16 softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); + <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 5: // 32 softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); + <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 6: // 64 softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); + <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 7: // 128 softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); + <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 8: // 256 softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); + <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 9: // 512 softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); + <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 10: // 1024 softmax_warp_forward - <<>>(dst, src, batch_count, 
softmax_elements_stride, softmax_elements); + <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); break; default: break; @@ -208,8 +208,8 @@ void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_ele } #define SPECIALIZED_SOFTMAX_IMPL(input_t, output_t, acc_t) \ -template void dispatch_softmax_forward(output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); \ -template void dispatch_softmax_forward(output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); +template void dispatch_softmax_forward(cudaStream_t stream, output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); \ +template void dispatch_softmax_forward(cudaStream_t stream, output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); SPECIALIZED_SOFTMAX_IMPL(float, float, float) SPECIALIZED_SOFTMAX_IMPL(half, half, float) diff --git a/onnxruntime/core/providers/cuda/math/topk_impl.cu b/onnxruntime/core/providers/cuda/math/topk_impl.cu index 2ba3ac4086..db609b9aa9 100644 --- a/onnxruntime/core/providers/cuda/math/topk_impl.cu +++ b/onnxruntime/core/providers/cuda/math/topk_impl.cu @@ -419,23 +419,24 @@ __global__ void ExcludeOutput(int64_t* output_i, int64_t K, int64_t dimension) { template Status TopKImpl(const CudaKernel* kernel, const T* input_x, T* output_v, int64_t* output_i, const TArray& elem_nums, size_t size, int32_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension) { typedef typename ToCudaType::MappedType CudaT; + cudaStream_t stream = kernel->Stream(); const CudaT* input_x_ptr = reinterpret_cast(input_x); CudaT* output_v_ptr = reinterpret_cast(output_v); auto aligned_K = ALIGN(K); auto aligned_dimension = ALIGN(dimension); if (aligned_dimension <= GridDim::maxThreadsPerBlock) { - BitonicTopK<<)>>>(input_x_ptr, output_v_ptr, output_i, 
elem_nums, size, axis, K, aligned_K, largest, sorted, dimension, aligned_dimension, NumericLimits::Lowest(), NumericLimits::Max()); + BitonicTopK<<), stream>>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, aligned_K, largest, sorted, dimension, aligned_dimension, NumericLimits::Lowest(), NumericLimits::Max()); } else if (K <= BT*16 || 0 == sorted) { auto XPT = static_cast(ceil(static_cast(dimension) / GridDim::maxThreadsPerBlock)); if (BT*2 >= K || 0 == sorted) { - RadixTopK<<>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits::Lowest(), NumericLimits::Max()); + RadixTopK<<>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits::Lowest(), NumericLimits::Max()); } else if (BT*4>=K) { - RadixTopK<<>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits::Lowest(), NumericLimits::Max()); + RadixTopK<<>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits::Lowest(), NumericLimits::Max()); } else if (BT*8>=K) { - RadixTopK<<>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits::Lowest(), NumericLimits::Max()); + RadixTopK<<>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits::Lowest(), NumericLimits::Max()); } else { - RadixTopK<<>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits::Lowest(), NumericLimits::Max()); + RadixTopK<<>>(input_x_ptr, output_v_ptr, output_i, elem_nums, size, axis, K, largest, sorted, dimension, XPT, NumericLimits::Lowest(), NumericLimits::Max()); } } else { auto input_key_buffer = kernel->GetScratchBuffer(dimension); @@ -447,21 +448,21 @@ Status TopKImpl(const CudaKernel* kernel, const T* input_x, T* output_v, int64_t auto* 
input_value = input_value_buffer.get(); auto* output_value = output_value_buffer.get(); size_t temp_bytes = 0; - CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, input_key, output_key, input_value, output_value, dimension)); + CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, input_key, output_key, input_value, output_value, dimension, 0, sizeof(T)*8, stream)); auto temp_storage_buffer = kernel->GetScratchBuffer(temp_bytes); auto* temp_storage = temp_storage_buffer.get(); auto blocks_per_grid_D = (int)(ceil(static_cast(dimension) / BT)); auto blocks_per_grid_K = (int)(ceil(static_cast(K) / BT)); for (int64_t i = 0; i < N; i++) { - FillInput<<>>(input_x_ptr, input_key, input_value, elem_nums, size, axis, K, i, dimension); - CUDA_RETURN_IF_ERROR(1 == largest ? cub::DeviceRadixSort::SortPairsDescending(temp_storage, temp_bytes, input_key, output_key, input_value, output_value, dimension) - : cub::DeviceRadixSort::SortPairs(temp_storage, temp_bytes, input_key, output_key, input_value, output_value, dimension)); + FillInput<<>>(input_x_ptr, input_key, input_value, elem_nums, size, axis, K, i, dimension); + CUDA_RETURN_IF_ERROR(1 == largest ? 
cub::DeviceRadixSort::SortPairsDescending(temp_storage, temp_bytes, input_key, output_key, input_value, output_value, dimension, 0, sizeof(T)*8, stream) + : cub::DeviceRadixSort::SortPairs(temp_storage, temp_bytes, input_key, output_key, input_value, output_value, dimension, 0, sizeof(T)*8, stream)); if (1 == sorted) { - FillOutput<<>>(output_key, output_value, output_v_ptr, output_i, elem_nums, size, axis, K, i, dimension); + FillOutput<<>>(output_key, output_value, output_v_ptr, output_i, elem_nums, size, axis, K, i, dimension); } else { //reorder by ascending index - ExcludeOutput<<>>(output_value, K, dimension); - CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairs(temp_storage, temp_bytes, output_value, input_value, output_key, input_key, dimension)); - FillOutput<<>>(input_key, input_value, output_v_ptr, output_i, elem_nums, size, axis, K, i, dimension); + ExcludeOutput<<>>(output_value, K, dimension); + CUDA_RETURN_IF_ERROR(cub::DeviceRadixSort::SortPairs(temp_storage, temp_bytes, output_value, input_value, output_key, input_key, dimension, 0, sizeof(T)*8, stream)); + FillOutput<<>>(input_key, input_value, output_v_ptr, output_i, elem_nums, size, axis, K, i, dimension); } } } diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc index deae967c84..53220ae131 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops.cc @@ -50,6 +50,7 @@ Status UnaryElementwise::Prepare(OpKernelContext* context, UnaryElementwisePrepa UnaryElementwisePreparation p; \ ORT_RETURN_IF_ERROR(UnaryElementwise::Prepare(context, &p)); \ Impl_##x( \ + Stream(), \ reinterpret_cast::MappedType*>(p.input_tensor->template Data()), \ reinterpret_cast::MappedType*>(p.output_tensor->template MutableData()), \ p.output_tensor->Shape().Size()); \ diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu 
b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu index fe60e66856..5b5102938d 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.cu @@ -19,14 +19,15 @@ namespace cuda { #define UNARY_ELEMENTWISE_IMPL(name) \ UNARY_ELEMENTWISE_IMPL_DECLARATION(name) { \ - UnaryElementWiseImpl(input_data, \ + UnaryElementWiseImpl(stream, \ + input_data, \ output_data, \ OP_##name(), \ count); \ } #define SPECIALIZED_UNARY_ELEMENTWISE_IMPL(name, T) \ - template void Impl_##name(const T* input_data, T* output_data, size_t count); + template void Impl_##name(cudaStream_t stream, const T* input_data, T* output_data, size_t count); #define UNARY_OP_NAME_EXPR(name, expr) \ OP(name, expr) \ @@ -116,17 +117,19 @@ struct OP_Cast { template void Impl_Cast( + cudaStream_t stream, const InT* input_data, OutT* output_data, size_t count) { - UnaryElementWiseImpl(input_data, + UnaryElementWiseImpl(stream, + input_data, output_data, OP_Cast(), count); } #define SPECIALIZED_CAST_IMPL2(InT, OutT) \ - template void Impl_Cast(const InT* input_data, OutT* output_data, size_t count); + template void Impl_Cast(cudaStream_t stream, const InT* input_data, OutT* output_data, size_t count); #if CUDA_VERSION >= 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) #define SPECIALIZED_CAST_IMPL2_BF16(T) SPECIALIZED_CAST_IMPL2(T, nv_bfloat16) diff --git a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.h b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.h index 81123c46bf..2b28886386 100644 --- a/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.h +++ b/onnxruntime/core/providers/cuda/math/unary_elementwise_ops_impl.h @@ -29,6 +29,7 @@ namespace cuda { #define UNARY_ELEMENTWISE_IMPL_DECLARATION(name) \ template \ void Impl_##name( \ + cudaStream_t stream, \ const T* input_data, \ T* output_data, \ size_t count) @@ -39,6 +40,7 @@ UNARY_OPS() 
template void Impl_Cast( + cudaStream_t stream, const InT* input_data, OutT* output_data, size_t count); diff --git a/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops.cc b/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops.cc index beb3e829f3..7db97d0ed1 100644 --- a/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops.cc +++ b/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops.cc @@ -17,7 +17,7 @@ namespace cuda { template template Status VariadicElementwiseOp:: - NoBroadcastBatchImplDispatchTarget::operator()(const InputTensorVector& inputs, Tensor& output) const { + NoBroadcastBatchImplDispatchTarget::operator()(cudaStream_t stream, const InputTensorVector& inputs, Tensor& output) const { assert(inputs.size() > 1); using CudaT = typename ToCudaType::MappedType; @@ -30,7 +30,7 @@ Status VariadicElementwiseOp CudaT* output_data = reinterpret_cast(output.template MutableData()); Impl_NoBroadcastInputBatch( - input_data_batch, output_data, output.Shape().Size()); + stream, input_data_batch, output_data, output.Shape().Size()); return Status::OK(); } @@ -39,13 +39,14 @@ Status VariadicElementwiseOp template template Status VariadicElementwiseOp:: - BinaryImplDispatchTarget::operator()(const Tensor& lhs, const Tensor& rhs, Tensor& output) const { + BinaryImplDispatchTarget::operator()(cudaStream_t stream, const Tensor& lhs, const Tensor& rhs, Tensor& output) const { using CudaT = typename ToCudaType::MappedType; BinaryElementwisePreparation prepare; ORT_RETURN_IF_ERROR(BinaryElementwiseBroadcastPrepare(&lhs, &rhs, &output, &prepare)); Impl_General( + stream, prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast(prepare.lhs_tensor->template Data()), @@ -64,17 +65,18 @@ Status VariadicElementwiseOp template template Status VariadicElementwiseOp:: - GeneralImplDispatchTarget::operator()(const InputTensorVector& inputs, Tensor& output) const { + GeneralImplDispatchTarget::operator()(cudaStream_t 
stream, const InputTensorVector& inputs, Tensor& output) const { assert(inputs.size() > 1); using CudaT = typename ToCudaType::MappedType; - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output.MutableDataRaw(), 0, output.SizeInBytes())); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output.MutableDataRaw(), 0, output.SizeInBytes(), stream)); BinaryElementwisePreparation prepare; ORT_RETURN_IF_ERROR(BinaryElementwiseBroadcastPrepare(&output, &inputs[0].get(), &output, &prepare)); Impl_Add( + stream, prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast(prepare.lhs_tensor->template Data()), @@ -90,6 +92,7 @@ Status VariadicElementwiseOp ORT_RETURN_IF_ERROR(BinaryElementwiseBroadcastPrepare(&output, &inputs[index].get(), &output, &prepare)); Impl_General( + stream, prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast(prepare.lhs_tensor->template Data()), @@ -132,7 +135,7 @@ Status VariadicElementwiseOp if (first_input_tensor.DataRaw() != output_tensor.DataRaw()) { CUDA_RETURN_IF_ERROR(cudaMemcpyAsync( output_tensor.MutableDataRaw(), first_input_tensor.DataRaw(), first_input_tensor.SizeInBytes(), - cudaMemcpyDeviceToDevice)); + cudaMemcpyDeviceToDevice, Stream())); } return Status::OK(); @@ -152,14 +155,14 @@ Status VariadicElementwiseOp // special case for no broadcasting and 2 inputs if (input_count == 2) { utils::MLTypeCallDispatcherRet dispatcher(element_type); - ORT_RETURN_IF_ERROR(dispatcher.Invoke(input_tensors[0], input_tensors[1], output_tensor)); + ORT_RETURN_IF_ERROR(dispatcher.Invoke(Stream(), input_tensors[0], input_tensors[1], output_tensor)); return Status::OK(); } utils::MLTypeCallDispatcherRet dispatcher( element_type); - ORT_RETURN_IF_ERROR(dispatcher.Invoke(input_tensors, output_tensor)); + ORT_RETURN_IF_ERROR(dispatcher.Invoke(Stream(), input_tensors, output_tensor)); return Status::OK(); } @@ -177,7 +180,7 @@ Status VariadicElementwiseOp // special case for 2 inputs if (input_count == 2) { 
utils::MLTypeCallDispatcherRet dispatcher(element_type); - ORT_RETURN_IF_ERROR(dispatcher.Invoke(input_tensors[0], input_tensors[1], output_tensor)); + ORT_RETURN_IF_ERROR(dispatcher.Invoke(Stream(), input_tensors[0], input_tensors[1], output_tensor)); return Status::OK(); } @@ -186,7 +189,7 @@ Status VariadicElementwiseOp { utils::MLTypeCallDispatcherRet dispatcher( element_type); - ORT_RETURN_IF_ERROR(dispatcher.Invoke(input_tensors, output_tensor)); + ORT_RETURN_IF_ERROR(dispatcher.Invoke(Stream(), input_tensors, output_tensor)); } return Status::OK(); diff --git a/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops.h b/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops.h index 101e8389cd..42d83f81ad 100644 --- a/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops.h +++ b/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops.h @@ -24,17 +24,17 @@ class VariadicElementwiseOp : public CudaKernel { template struct NoBroadcastBatchImplDispatchTarget { - Status operator()(const InputTensorVector& inputs, Tensor& output) const; + Status operator()(cudaStream_t stream, const InputTensorVector& inputs, Tensor& output) const; }; template struct BinaryImplDispatchTarget { - Status operator()(const Tensor& lhs, const Tensor& rhs, Tensor& output) const; + Status operator()(cudaStream_t stream, const Tensor& lhs, const Tensor& rhs, Tensor& output) const; }; template struct GeneralImplDispatchTarget { - Status operator()(const InputTensorVector& inputs, Tensor& output) const; + Status operator()(cudaStream_t stream, const InputTensorVector& inputs, Tensor& output) const; }; }; diff --git a/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops_impl.cu b/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops_impl.cu index ad975f85c3..da1f228e5b 100644 --- a/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops_impl.cu +++ b/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops_impl.cu @@ -20,6 +20,7 @@ struct 
VariadicElementwiseOpTraits; using ScalarComputeFunctor = OP_##ImplName; \ \ static void ComputeFn( \ + cudaStream_t stream, \ int32_t output_rank_or_simple_broadcast, \ const TArray* lhs_padded_strides, \ const T* lhs_data, \ @@ -31,6 +32,7 @@ struct VariadicElementwiseOpTraits; T* output_data, \ size_t count) { \ Impl_##ImplName( \ + stream, \ output_rank_or_simple_broadcast, \ lhs_padded_strides, \ lhs_data, \ @@ -52,6 +54,7 @@ DEFINE_TRAITS(variadic_elementwise_ops::Max, Max) template void Impl_General( + cudaStream_t stream, int32_t output_rank_or_simple_broadcast, const TArray* lhs_padded_strides, const T* lhs_data, @@ -63,6 +66,7 @@ void Impl_General( T* output_data, size_t count) { VariadicElementwiseOpTraits::ComputeFn( + stream, output_rank_or_simple_broadcast, lhs_padded_strides, lhs_data, @@ -77,12 +81,14 @@ void Impl_General( template void Impl_NoBroadcastInputBatch( + cudaStream_t stream, InputBatchArray input_data_batch, T* output_data, size_t count) { VariadicElementWiseNoBroadcastInputBatchImpl< T, typename VariadicElementwiseOpTraits::ScalarComputeFunctor, k_max_input_batch_size>( + stream, typename VariadicElementwiseOpTraits::ScalarComputeFunctor{}, count, input_data_batch, @@ -91,6 +97,7 @@ void Impl_NoBroadcastInputBatch( #define SPECIALIZE_IMPL(T, VariadicElementwiseOpTag) \ template void Impl_General( \ + cudaStream_t stream, \ int32_t output_rank_or_simple_broadcast, \ const TArray* lhs_padded_strides, \ const T* lhs_data, \ @@ -103,6 +110,7 @@ void Impl_NoBroadcastInputBatch( size_t count); \ \ template void Impl_NoBroadcastInputBatch( \ + cudaStream_t stream, \ InputBatchArray input_data_batch, \ T * output_data, \ size_t count); diff --git a/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops_impl.h b/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops_impl.h index 39806f0ccc..72316332b1 100644 --- a/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops_impl.h +++ 
b/onnxruntime/core/providers/cuda/math/variadic_elementwise_ops_impl.h @@ -12,6 +12,7 @@ namespace cuda { template void Impl_General( + cudaStream_t stream, int32_t output_rank_or_simple_broadcast, const TArray* lhs_padded_strides, const T* lhs_data, @@ -30,6 +31,7 @@ using InputBatchArray = TArray; template void Impl_NoBroadcastInputBatch( + cudaStream_t stream, InputBatchArray input_data_batch, T* output_data, size_t count); diff --git a/onnxruntime/core/providers/cuda/multi_tensor/common.cuh b/onnxruntime/core/providers/cuda/multi_tensor/common.cuh index 6c779ad501..84cd10ad24 100644 --- a/onnxruntime/core/providers/cuda/multi_tensor/common.cuh +++ b/onnxruntime/core/providers/cuda/multi_tensor/common.cuh @@ -73,6 +73,7 @@ int compute_max_tensor_size_per_launch(int element_count_per_thread) { template void launch_multi_tensor_functor( + cudaStream_t stream, const int chunk_size, std::vector& tensor_sizes, std::vector>& grouped_tensor_pointers, @@ -121,7 +122,7 @@ void launch_multi_tensor_functor( chunk_group.chunk_count = block_index; if (block_index == chunk_group.max_block_count) { - multipleTensorKernel(chunk_group, std::forward(kernelParams)...); + multipleTensorKernel(stream, chunk_group, std::forward(kernelParams)...); block_index = 0; } } @@ -129,7 +130,7 @@ void launch_multi_tensor_functor( // After ++tensor_group_index, tensor_group_index becomes the count of tensor group in chunk_group. ++tensor_group_index; if (tensor_group_index == chunk_group.max_tensor_group_count) { - multipleTensorKernel(chunk_group, std::forward(kernelParams)...); + multipleTensorKernel(stream, chunk_group, std::forward(kernelParams)...); block_index = 0; tensor_group_index = 0; } @@ -138,7 +139,7 @@ void launch_multi_tensor_functor( // This round of processing tensor group is finished. // All the groups remain in chunk group should be processed right now. 
if (block_index != 0) { - multipleTensorKernel(chunk_group, std::forward(kernelParams)...); + multipleTensorKernel(stream, chunk_group, std::forward(kernelParams)...); block_index = 0; tensor_group_index = 0; } diff --git a/onnxruntime/core/providers/cuda/nn/batch_norm.cc b/onnxruntime/core/providers/cuda/nn/batch_norm.cc index a1af24a4ef..db312fdd2c 100644 --- a/onnxruntime/core/providers/cuda/nn/batch_norm.cc +++ b/onnxruntime/core/providers/cuda/nn/batch_norm.cc @@ -81,10 +81,10 @@ Status BatchNorm::ComputeInternal(OpKernelContext* p_op_kernel_context) const auto f_B = GetScratchBuffer(C); auto f_mean = GetScratchBuffer(C); auto f_var = GetScratchBuffer(C); - Impl_Cast(scale_data, f_scale.get(), C); - Impl_Cast(b_data, f_B.get(), C); - Impl_Cast(mean_data, f_mean.get(), C); - Impl_Cast(var_data, f_var.get(), C); + Impl_Cast(Stream(), scale_data, f_scale.get(), C); + Impl_Cast(Stream(), b_data, f_B.get(), C); + Impl_Cast(Stream(), mean_data, f_mean.get(), C); + Impl_Cast(Stream(), var_data, f_var.get(), C); CUDNN_RETURN_IF_ERROR(cudnnBatchNormalizationForwardInference( CudnnHandle(), diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc index 20ada62308..f61f93fab5 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.cc +++ b/onnxruntime/core/providers/cuda/nn/conv.cc @@ -34,7 +34,8 @@ REGISTER_KERNEL_TYPED(float) REGISTER_KERNEL_TYPED(double) REGISTER_KERNEL_TYPED(MLFloat16) -Status SliceOutUnwantedOutputSection(const void* input_data, +Status SliceOutUnwantedOutputSection(cudaStream_t stream, + const void* input_data, const std::vector& input_dims, void* output_data, const std::vector& output_dims, @@ -49,7 +50,7 @@ Status SliceOutUnwantedOutputSection(const void* input_data, // As a sanity check, ensure that the slice operator's output shape matches with the expected output shape ORT_ENFORCE(compute_metadata.output_dims_ == output_dims); - return SliceCuda::Impl(input_data, input_dims, output_data, 
compute_metadata, element_size); + return SliceCuda::Impl(stream, input_data, input_dims, output_data, compute_metadata, element_size); } template @@ -195,7 +196,7 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const s_.b_zero = nullptr; } CUDA_CALL_THROW(cudaMalloc(&s_.b_zero, malloc_size)); - CUDA_CALL_THROW(cudaMemset(s_.b_zero, 0, malloc_size)); + CUDA_CALL_THROW(cudaMemsetAsync(s_.b_zero, 0, malloc_size, Stream())); } if (!s_.cached_benchmark_results.contains(x_dims_cudnn)) { @@ -306,7 +307,7 @@ Status Conv::ComputeInternal(OpKernelContext* context) const { // To deal with asymmetric padding, we may have over-padded on one or both sides of the spatial dimensions // This may have lead to extra results that are unnecessary and hence we slice that off here if (s_.post_slicing_required) { - SliceOutUnwantedOutputSection(s_.y_data, s_.y_dims_with_adjusted_pads, s_.Y->MutableDataRaw(), + SliceOutUnwantedOutputSection(Stream(), s_.y_data, s_.y_dims_with_adjusted_pads, s_.Y->MutableDataRaw(), s_.y_dims, s_.slice_starts, s_.slice_ends, s_.slice_axes, s_.element_size); } return Status::OK(); diff --git a/onnxruntime/core/providers/cuda/nn/conv.h b/onnxruntime/core/providers/cuda/nn/conv.h index e39bda1c59..04f9865a1a 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.h +++ b/onnxruntime/core/providers/cuda/nn/conv.h @@ -189,7 +189,8 @@ class Conv : public CudaKernel { constexpr static auto kDefaultConvAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; }; -Status SliceOutUnwantedOutputSection(const void* input_data, +Status SliceOutUnwantedOutputSection(cudaStream_t stream, + const void* input_data, const std::vector& input_dims, void* output_data, const std::vector& output_dims, diff --git a/onnxruntime/core/providers/cuda/nn/dropout.h b/onnxruntime/core/providers/cuda/nn/dropout.h index 47cf5ce511..5e38a587df 100644 --- a/onnxruntime/core/providers/cuda/nn/dropout.h +++ b/onnxruntime/core/providers/cuda/nn/dropout.h @@ -22,6 +22,7 @@ 
struct GetRatioDataImpl { template struct DropoutComputeImpl { void operator()(const cudaDeviceProp& prop, + cudaStream_t stream, const int64_t N, const float ratio_data, PhiloxGenerator& generator, @@ -33,7 +34,7 @@ struct DropoutComputeImpl { const CudaT* X_data = reinterpret_cast(X.template Data()); CudaT* Y_data = reinterpret_cast(Y.template MutableData()); - DropoutKernelImpl(prop, N, ratio_data, generator, X_data, Y_data, mask_data); + DropoutKernelImpl(prop, stream, N, ratio_data, generator, X_data, Y_data, mask_data); } }; @@ -81,12 +82,12 @@ Status Dropout::ComputeInternal(OpKernelContext* context) const { const void* X_data = X->DataRaw(); void* Y_data = Y->MutableDataRaw(); if (Y_data != X_data) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y_data, X_data, X->SizeInBytes(), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y_data, X_data, X->SizeInBytes(), cudaMemcpyDeviceToDevice, Stream())); } // If mask is requested, return all 1s. if (mask != nullptr) { - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(mask->MutableData(), true, N * sizeof(bool))); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(mask->MutableData(), true, N * sizeof(bool), Stream())); } return Status::OK(); @@ -106,7 +107,7 @@ Status Dropout::ComputeInternal(OpKernelContext* context) const { #else utils::MLTypeCallDispatcher t_disp(X->GetElementType()); #endif - t_disp.Invoke(GetDeviceProp(), N, ratio_data, generator, *X, *Y, mask_data); + t_disp.Invoke(GetDeviceProp(), Stream(), N, ratio_data, generator, *X, *Y, mask_data); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/nn/dropout_impl.cu b/onnxruntime/core/providers/cuda/nn/dropout_impl.cu index ded4a87c40..47d73aa450 100644 --- a/onnxruntime/core/providers/cuda/nn/dropout_impl.cu +++ b/onnxruntime/core/providers/cuda/nn/dropout_impl.cu @@ -69,6 +69,7 @@ __global__ void DropoutKernel( template void DropoutKernelImpl( const cudaDeviceProp& prop, + cudaStream_t stream, const int64_t N, const float ratio, PhiloxGenerator& 
generator, @@ -83,12 +84,13 @@ void DropoutKernelImpl( const uint64_t counter_offset = static_cast(((N - 1) / (block_size * grid_size * UNROLL) + 1) * UNROLL); auto seeds = generator.NextPhiloxSeeds(counter_offset); - DropoutKernel<<>>(N, ratio, seeds, X_data, Y_data, mask_data); + DropoutKernel<<>>(N, ratio, seeds, X_data, Y_data, mask_data); } #define SPECIALIZED_DROPOUT_IMPL(T) \ template void DropoutKernelImpl( \ const cudaDeviceProp& prop, \ + cudaStream_t stream, \ const int64_t N, \ const float ratio, \ PhiloxGenerator& generator, \ diff --git a/onnxruntime/core/providers/cuda/nn/dropout_impl.h b/onnxruntime/core/providers/cuda/nn/dropout_impl.h index 5c52af1318..37e16710e6 100644 --- a/onnxruntime/core/providers/cuda/nn/dropout_impl.h +++ b/onnxruntime/core/providers/cuda/nn/dropout_impl.h @@ -11,6 +11,7 @@ namespace cuda { template void DropoutKernelImpl( const cudaDeviceProp& prop, + cudaStream_t stream, const int64_t N, const float ratio, PhiloxGenerator& generator, diff --git a/onnxruntime/core/providers/cuda/nn/instance_norm.cc b/onnxruntime/core/providers/cuda/nn/instance_norm.cc index 1bd1d236f7..8945d639fc 100644 --- a/onnxruntime/core/providers/cuda/nn/instance_norm.cc +++ b/onnxruntime/core/providers/cuda/nn/instance_norm.cc @@ -135,6 +135,7 @@ Status InstanceNorm::ComputeInternal(OpKernelContext* p_op_kernel_context) co fast_divmod fdm_C(gsl::narrow_cast(C)); InstanceNormImpl( + Stream(), x_data, scale_data, bias_data, diff --git a/onnxruntime/core/providers/cuda/nn/instance_norm_impl.cu b/onnxruntime/core/providers/cuda/nn/instance_norm_impl.cu index 98cf179601..c0af3d0580 100644 --- a/onnxruntime/core/providers/cuda/nn/instance_norm_impl.cu +++ b/onnxruntime/core/providers/cuda/nn/instance_norm_impl.cu @@ -31,6 +31,7 @@ __global__ void _InstanceNormKernel( template void InstanceNormImpl( + cudaStream_t stream, const T* input_data, const T* scale, const T* bias, @@ -43,12 +44,12 @@ void InstanceNormImpl( T* output_data, size_t N) { int 
blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); - _InstanceNormKernel<<>>( + _InstanceNormKernel<<>>( input_data, scale, bias, mean, variance, variance_correction, epsilon, fdm_HW, fdm_C, output_data, (CUDA_LONG)N); } #define SPECIALIZED_IMPL(T) \ - template void InstanceNormImpl(const T* input_data, const T* scale, const T* bias, const T* mean, const T* stddev, const double variance_correction, const double epsilon, const fast_divmod& fdm_HW, const fast_divmod& fdm_C, T* output_data, size_t count); + template void InstanceNormImpl(cudaStream_t stream, const T* input_data, const T* scale, const T* bias, const T* mean, const T* stddev, const double variance_correction, const double epsilon, const fast_divmod& fdm_HW, const fast_divmod& fdm_C, T* output_data, size_t count); SPECIALIZED_IMPL(float) SPECIALIZED_IMPL(double) diff --git a/onnxruntime/core/providers/cuda/nn/instance_norm_impl.h b/onnxruntime/core/providers/cuda/nn/instance_norm_impl.h index 5748746db6..cda9684416 100644 --- a/onnxruntime/core/providers/cuda/nn/instance_norm_impl.h +++ b/onnxruntime/core/providers/cuda/nn/instance_norm_impl.h @@ -8,6 +8,7 @@ namespace cuda { template void InstanceNormImpl( + cudaStream_t stream, const T* input_data, const T* scale, const T* bias, diff --git a/onnxruntime/core/providers/cuda/nn/max_pool_with_index.cu b/onnxruntime/core/providers/cuda/nn/max_pool_with_index.cu index 02335cad1d..2409ee12e3 100644 --- a/onnxruntime/core/providers/cuda/nn/max_pool_with_index.cu +++ b/onnxruntime/core/providers/cuda/nn/max_pool_with_index.cu @@ -88,6 +88,7 @@ __global__ void MaxPoolWithIndexKernel( template void MaxPoolWithIndex( + cudaStream_t stream, const TensorShape& input_shape, const TensorShape& output_shape, const std::vector& kernel_shape, @@ -130,7 +131,7 @@ void MaxPoolWithIndex( fast_divmod fdm_d(static_cast(pooled_depth)); int blocksPerGrid = (int)((output_size + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock); - 
MaxPoolWithIndexKernel<<>>( + MaxPoolWithIndexKernel<<>>( batchs, channels, height, @@ -164,6 +165,7 @@ void MaxPoolWithIndex( #define INSTANTIATEMAXPOOLWITHINDEX(T) \ template void MaxPoolWithIndex( \ + cudaStream_t stream, \ const TensorShape& input_shape, \ const TensorShape& output_shape, \ const std::vector& kernel_shape, \ diff --git a/onnxruntime/core/providers/cuda/nn/max_pool_with_index.h b/onnxruntime/core/providers/cuda/nn/max_pool_with_index.h index ec796c3d95..3c2420b45b 100644 --- a/onnxruntime/core/providers/cuda/nn/max_pool_with_index.h +++ b/onnxruntime/core/providers/cuda/nn/max_pool_with_index.h @@ -9,6 +9,7 @@ namespace onnxruntime { namespace cuda { template void MaxPoolWithIndex( + cudaStream_t stream, const TensorShape& input_shape, const TensorShape& output_shape, const std::vector& kernel_shape, diff --git a/onnxruntime/core/providers/cuda/nn/pool.cc b/onnxruntime/core/providers/cuda/nn/pool.cc index 367930d9b3..af9eeb5381 100644 --- a/onnxruntime/core/providers/cuda/nn/pool.cc +++ b/onnxruntime/core/providers/cuda/nn/pool.cc @@ -187,9 +187,9 @@ Status Pool::ComputeInternal(OpKernelContext* context) const { IAllocatorUniquePtr temp_X = GetScratchBuffer(input_count); auto temp_Y = GetScratchBuffer(output_count); - Impl_Cast(reinterpret_cast(x_data), temp_X.get(), input_count); + Impl_Cast(Stream(), reinterpret_cast(x_data), temp_X.get(), input_count); CUDNN_RETURN_IF_ERROR(cudnnPoolingForward(CudnnHandle(), pooling_desc, &alpha, x_tensor, temp_X.get(), &beta, y_tensor, temp_Y.get())); - Impl_Cast(temp_Y.get(), y_data, output_count); + Impl_Cast(Stream(), temp_Y.get(), y_data, output_count); } else { const auto alpha = Consts::One; const auto beta = Consts::Zero; @@ -239,6 +239,7 @@ Status Pool>::ComputeInternal(OpKernelContext* context) const { if (nullptr != I || !this->pool_attrs_.default_dilations) { auto i_data = nullptr == I ? 
nullptr : I->template MutableData(); MaxPoolWithIndex( + this->Stream(), x_shape, TensorShape(y_dims), kernel_shape, diff --git a/onnxruntime/core/providers/cuda/nn/shrink.cc b/onnxruntime/core/providers/cuda/nn/shrink.cc index 09eb264b74..cd8d9e2cf3 100644 --- a/onnxruntime/core/providers/cuda/nn/shrink.cc +++ b/onnxruntime/core/providers/cuda/nn/shrink.cc @@ -33,7 +33,7 @@ Status Shrink::ComputeInternal(OpKernelContext* p_op_kernel_context) const { Tensor* Y = p_op_kernel_context->Output(0, x_shape); auto* y_data = reinterpret_cast(Y->template MutableData()); - ShrinkImpl(x_data, bias_, lambd_, y_data, x_size); + ShrinkImpl(Stream(), x_data, bias_, lambd_, y_data, x_size); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/nn/shrink_impl.cu b/onnxruntime/core/providers/cuda/nn/shrink_impl.cu index 867822561c..4883c1dd69 100644 --- a/onnxruntime/core/providers/cuda/nn/shrink_impl.cu +++ b/onnxruntime/core/providers/cuda/nn/shrink_impl.cu @@ -51,18 +51,19 @@ __global__ void _ShrinkKernel( template void ShrinkImpl( + cudaStream_t stream, const T* input_data, const float bias, const float lambda, T* output_data, size_t N) { int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); - _ShrinkKernel<<>>( + _ShrinkKernel<<>>( input_data, bias, lambda, output_data, (CUDA_LONG)N); } #define SPECIALIZED_IMPL(T) \ - template void ShrinkImpl(const T* input_data, const float bias, const float lambda, T* output_data, size_t N); + template void ShrinkImpl(cudaStream_t stream, const T* input_data, const float bias, const float lambda, T* output_data, size_t N); SPECIALIZED_IMPL(float) SPECIALIZED_IMPL(double) diff --git a/onnxruntime/core/providers/cuda/nn/shrink_impl.h b/onnxruntime/core/providers/cuda/nn/shrink_impl.h index 05b7183a89..0b324315f7 100644 --- a/onnxruntime/core/providers/cuda/nn/shrink_impl.h +++ b/onnxruntime/core/providers/cuda/nn/shrink_impl.h @@ -8,6 +8,7 @@ namespace cuda { template void ShrinkImpl( + cudaStream_t 
stream, const T* input_data, const float bias, const float lambda, diff --git a/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.cc b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.cc index b62575f71a..b75a09b20b 100644 --- a/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.cc +++ b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.cc @@ -66,6 +66,7 @@ Status NonMaxSuppression::ComputeInternal(OpKernelContext* ctx) const { auto* h_number_selected = static_cast(h_number_selected_ptr.get()); ORT_RETURN_IF_ERROR(NonMaxSuppressionImpl( + Stream(), [this](size_t bytes) { return GetScratchBuffer(bytes); }, pc, GetCenterPointBox(), @@ -120,7 +121,8 @@ Status NonMaxSuppression::ComputeInternal(OpKernelContext* ctx) const { concat_sizes_range_gpu.CopyToGpu(); input_ptr.CopyToGpu(); - ORT_RETURN_IF_ERROR(ConcatImpl(sizeof(int64_t), + ORT_RETURN_IF_ERROR(ConcatImpl(Stream(), + sizeof(int64_t), num_elements, last_dim, concat_sizes_gpu.GpuPtr(), diff --git a/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.cu b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.cu index 270b8283db..28cc457c62 100644 --- a/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.cu +++ b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.cu @@ -228,7 +228,8 @@ __global__ void NormalizeOutput(const int num_elements, const int* original, int } } -Status NmsGpu(std::function(size_t)> allocator, +Status NmsGpu(cudaStream_t stream, + std::function(size_t)> allocator, const int64_t center_point_box, const float* d_sorted_boxes_float_ptr, const int num_boxes, @@ -249,7 +250,7 @@ Status NmsGpu(std::function(size_t)> allocator, auto* d_nms_mask = static_cast(d_nms_mask_ptr.get()); int blocksPerGrid = (int)(ceil(static_cast(max_nms_mask_size) / GridDim::maxThreadsPerBlock)); - SetZero<<>>(max_nms_mask_size, d_nms_mask); + 
SetZero<<>>(max_nms_mask_size, d_nms_mask); int* d_delete_mask = d_nms_mask; int* h_selected_count = h_nkeep; @@ -264,7 +265,7 @@ Status NmsGpu(std::function(size_t)> allocator, thread_block.x = kNmsBlockDim; thread_block.y = kNmsBlockDim; thread_block.z = 1; - NMSKernel<<>>(center_point_box, + NMSKernel<<>>(center_point_box, d_sorted_boxes, num_boxes, iou_threshold, @@ -277,9 +278,9 @@ Status NmsGpu(std::function(size_t)> allocator, auto* d_indices = static_cast(d_indices_ptr.get()); blocksPerGrid = (int)(ceil(static_cast(num_boxes) / GridDim::maxThreadsPerBlock)); - Iota<<>>(num_boxes, 0, d_indices); + Iota<<>>(num_boxes, 0, d_indices); - NMSReduce<<<1, 1024, bit_mask_len * sizeof(int)>>>(d_delete_mask, bit_mask_len, num_boxes, max_boxes, d_selected_boxes); + NMSReduce<<<1, 1024, bit_mask_len * sizeof(int), stream>>>(d_delete_mask, bit_mask_len, num_boxes, max_boxes, d_selected_boxes); size_t flagged_buffer_size = 0; CUDA_RETURN_IF_ERROR(cub::DeviceSelect::Flagged(static_cast(nullptr), // temp_storage @@ -288,7 +289,8 @@ Status NmsGpu(std::function(size_t)> allocator, static_cast(nullptr), // selection flag static_cast(nullptr), // selected items static_cast(nullptr), // num_selected - num_boxes)); + num_boxes, + stream)); IAllocatorUniquePtr d_cub_scratch_buffer_ptr{allocator(flagged_buffer_size)}; auto* d_cub_scratch_buffer = static_cast(d_cub_scratch_buffer_ptr.get()); @@ -301,8 +303,10 @@ Status NmsGpu(std::function(size_t)> allocator, d_indices, // input d_selected_boxes, // selection flag d_selected_indices, // selected items - d_num_selected, num_boxes)); - CUDA_RETURN_IF_ERROR(cudaMemcpy(h_selected_count, d_num_selected, sizeof(int), cudaMemcpyDeviceToHost)); + d_num_selected, num_boxes, stream)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(h_selected_count, d_num_selected, sizeof(int), cudaMemcpyDeviceToHost, stream)); + // cudaStreamSynchronize is needed since the value of h_selected_count will be used by host after this function. 
+ CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); return Status::OK(); } @@ -320,6 +324,7 @@ struct DeviceGreaterThan { } // namespace Status NonMaxSuppressionImpl( + cudaStream_t stream, std::function(size_t)> allocator, const PrepareContext& pc, const int64_t center_point_box, @@ -346,8 +351,8 @@ Status NonMaxSuppressionImpl( static_cast(nullptr), // input indices static_cast(nullptr), // sorted indices num_boxes, // num items - 0, 8 * sizeof(float) // sort all bits - )); + 0, 8 * sizeof(float), // sort all bits + stream)); // allocate temporary memory IAllocatorUniquePtr d_cub_sort_buffer_ptr{allocator(cub_sort_temp_storage_bytes)}; @@ -365,7 +370,7 @@ Status NonMaxSuppressionImpl( // create sequense of indices int blocksPerGrid = (int)(ceil(static_cast(num_boxes) / GridDim::maxThreadsPerBlock)); - Iota<<>>(num_boxes, 0, d_indices); + Iota<<>>(num_boxes, 0, d_indices); CUDA_RETURN_IF_ERROR(cudaGetLastError()); // sort scores @@ -378,23 +383,25 @@ Status NonMaxSuppressionImpl( d_sorted_indices, num_boxes, 0, - 8 * sizeof(float) // sort all bits - )); + 8 * sizeof(float), // sort all bits + stream)); // pick sorted scores const Box* original_boxes = reinterpret_cast(boxes_data); Box* sorted_boxes = reinterpret_cast(d_sorted_boxes); - IndexMultiSelect<<>>(num_boxes, d_sorted_indices, original_boxes, sorted_boxes); + IndexMultiSelect<<>>(num_boxes, d_sorted_indices, original_boxes, sorted_boxes); CUDA_RETURN_IF_ERROR(cudaGetLastError()); // STEP 2. 
filter boxes by scores int limited_num_boxes = num_boxes; if (pc.score_threshold_ != nullptr) { + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); thrust::device_ptr sorted_scores_device_ptr(d_sorted_scores); limited_num_boxes = thrust::count_if( sorted_scores_device_ptr, sorted_scores_device_ptr + num_boxes, DeviceGreaterThan(score_threshold)); + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(0)); CUDA_RETURN_IF_ERROR(cudaGetLastError()); if (limited_num_boxes == 0) { @@ -404,7 +411,8 @@ Status NonMaxSuppressionImpl( } // STEP 3. launch NMS kernels - ORT_RETURN_IF_ERROR(NmsGpu(allocator, + ORT_RETURN_IF_ERROR(NmsGpu(stream, + allocator, center_point_box, d_sorted_boxes, limited_num_boxes, @@ -424,8 +432,8 @@ Status NonMaxSuppressionImpl( auto* d_normalized_output_indices = static_cast(d_normalized_output_indices_ptr.get()); blocksPerGrid = (int)(ceil(static_cast(num_to_keep) / GridDim::maxThreadsPerBlock)); - IndexMultiSelect<<>>(num_to_keep, d_selected_indices, d_sorted_indices, d_output_indices); - NormalizeOutput<<>>(num_to_keep, d_output_indices, d_normalized_output_indices, batch_index, class_index); + IndexMultiSelect<<>>(num_to_keep, d_selected_indices, d_sorted_indices, d_output_indices); + NormalizeOutput<<>>(num_to_keep, d_output_indices, d_normalized_output_indices, batch_index, class_index); CUDA_RETURN_IF_ERROR(cudaGetLastError()); selected_indices = std::move(d_normalized_output_indices_ptr); diff --git a/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.h b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.h index 493c115e52..648420125c 100644 --- a/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.h +++ b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression_impl.h @@ -14,6 +14,7 @@ namespace onnxruntime { namespace cuda { Status NonMaxSuppressionImpl( + cudaStream_t stream, std::function(size_t)> allocator, const PrepareContext& pc, const int64_t 
center_point_box, diff --git a/onnxruntime/core/providers/cuda/object_detection/roialign.cc b/onnxruntime/core/providers/cuda/object_detection/roialign.cc index 5ca757382f..513b82f6aa 100644 --- a/onnxruntime/core/providers/cuda/object_detection/roialign.cc +++ b/onnxruntime/core/providers/cuda/object_detection/roialign.cc @@ -45,6 +45,7 @@ Status RoiAlign::ComputeInternal(OpKernelContext* context) const { if (output_size > 0) { RoiAlignImpl( + Stream(), output_size, // num threads reinterpret_cast::MappedType*>(X_ptr->template Data()), ToCudaType::FromFloat(this->spatial_scale_), diff --git a/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu b/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu index 45a35b291e..937007f57b 100644 --- a/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu +++ b/onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu @@ -174,6 +174,7 @@ __global__ void RoIAlignForward( template void RoiAlignImpl( + cudaStream_t stream, const int64_t nthreads, const T* bottom_data, const T spatial_scale, @@ -189,7 +190,7 @@ void RoiAlignImpl( const bool is_mode_avg, const int64_t* batch_indices_ptr) { int blocksPerGrid = (int)(ceil(static_cast(nthreads) / GridDim::maxThreadsPerBlock)); - RoIAlignForward<<>>( + RoIAlignForward<<>>( nthreads, bottom_data, spatial_scale, @@ -208,6 +209,7 @@ void RoiAlignImpl( #define SPECIALIZED_IMPL(T) \ template void RoiAlignImpl( \ + cudaStream_t stream, \ const int64_t nthreads, \ const T* bottom_data, \ const T spatial_scale, \ diff --git a/onnxruntime/core/providers/cuda/object_detection/roialign_impl.h b/onnxruntime/core/providers/cuda/object_detection/roialign_impl.h index 712771bdf6..312c35a93a 100644 --- a/onnxruntime/core/providers/cuda/object_detection/roialign_impl.h +++ b/onnxruntime/core/providers/cuda/object_detection/roialign_impl.h @@ -12,6 +12,7 @@ namespace cuda { template void RoiAlignImpl( + cudaStream_t stream, const int64_t nthreads, const T* 
bottom_data, const T spatial_scale, diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu index ad8533b00c..6ac4e64900 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu +++ b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu @@ -284,7 +284,7 @@ __global__ void reduce_matrix_columns_kernel( template Status call_reduce_matrix_columns( - const TIn* input, TOut* output, const int num_rows, const int num_cols, void* buffer, size_t buffer_size) { + cudaStream_t stream, const TIn* input, TOut* output, const int num_rows, const int num_cols, void* buffer, size_t buffer_size) { ORT_ENFORCE(num_rows >= 0 && num_cols >= 0); using TBuf = AccumulationType_t; @@ -301,12 +301,12 @@ Status call_reduce_matrix_columns( // If more than one block is used per grid row, then inter-block reduction is needed. if (grid_dim.x > 1) { - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(block_done_counts_buffer, 0, num_rows * sizeof(int))); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(block_done_counts_buffer, 0, num_rows * sizeof(int), stream)); } const int shared_mem_size = sizeof(TBuf) * block_dim.x * block_dim.y / GPU_WARP_SIZE; reduce_matrix_columns_kernel - <<>>( + <<>>( num_rows, num_cols, input, output, block_reductions_buffer, block_done_counts_buffer); return Status::OK(); @@ -315,41 +315,41 @@ Status call_reduce_matrix_columns( template Status reduce_sum( - const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { + cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { return detail::call_reduce_matrix_columns( - input, output, 1, size, buffer, buffer_size); + stream, input, output, 1, size, buffer, buffer_size); } template Status reduce_square_sum( - const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { + cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, 
size_t buffer_size) { return detail::call_reduce_matrix_columns( - input, output, 1, size, buffer, buffer_size); + stream, input, output, 1, size, buffer, buffer_size); } template Status reduce_l2_norm( - const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { + cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { return detail::call_reduce_matrix_columns( - input, output, 1, size, buffer, buffer_size); + stream, input, output, 1, size, buffer, buffer_size); } template Status reduce_mean( - const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { + cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) { return detail::call_reduce_matrix_columns( - input, output, 1, size, buffer, buffer_size); + stream, input, output, 1, size, buffer, buffer_size); } #define INSTANTIATE_REDUCE_SUM(TIn, TOut) \ - template Status reduce_sum(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) + template Status reduce_sum(cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) INSTANTIATE_REDUCE_SUM(half, float); INSTANTIATE_REDUCE_SUM(float, float); INSTANTIATE_REDUCE_SUM(double, double); #undef INSTANTIATE_REDUCE_SUM #define INSTANTIATE_REDUCE_SQUARE_SUM(TIn, TOut) \ - template Status reduce_square_sum(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) + template Status reduce_square_sum(cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) INSTANTIATE_REDUCE_SQUARE_SUM(half, float); INSTANTIATE_REDUCE_SQUARE_SUM(float, float); INSTANTIATE_REDUCE_SQUARE_SUM(double, double); @@ -359,14 +359,14 @@ INSTANTIATE_REDUCE_SQUARE_SUM(nv_bfloat16, float); #undef INSTANTIATE_REDUCE_SQUARE_SUM #define INSTANTIATE_REDUCE_L2_NORM(TIn, TOut) \ - template Status reduce_l2_norm(const TIn* input, TOut* output, int size, void* buffer, size_t 
buffer_size) + template Status reduce_l2_norm(cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) INSTANTIATE_REDUCE_L2_NORM(half, float); INSTANTIATE_REDUCE_L2_NORM(float, float); INSTANTIATE_REDUCE_L2_NORM(double, double); #undef INSTANTIATE_REDUCE_L2_NORM #define INSTANTIATE_REDUCE_MEAN(TIn, TOut) \ - template Status reduce_mean(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) + template Status reduce_mean(cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) INSTANTIATE_REDUCE_MEAN(half, float); INSTANTIATE_REDUCE_MEAN(float, float); INSTANTIATE_REDUCE_MEAN(double, double); @@ -431,11 +431,11 @@ __global__ void reduce_matrix_rows_kernel(const TIn* input, TOut* output, int m, } template -Status call_reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, bool reset_initial_output) { +Status call_reduce_matrix_rows(cudaStream_t stream, const TIn* input, TOut* output, int m, int n, bool reset_initial_output) { ORT_ENFORCE(m >= 0 && n >= 0); if (reset_initial_output) { - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output, 0, n * sizeof(TOut))); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output, 0, n * sizeof(TOut), stream)); } constexpr int max_num_threads_in_block = 512; @@ -450,7 +450,7 @@ Status call_reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, boo const dim3 grid(grid_x_dim, grid_y_dim, 1); const dim3 block(block_x_dim, block_y_dim, 1); - reduce_matrix_rows_kernel<<>>( + reduce_matrix_rows_kernel<<>>( input, output, m, n); return Status::OK(); @@ -458,13 +458,13 @@ Status call_reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, boo } // namespace detail template -Status reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, bool reset_initial_output) { +Status reduce_matrix_rows(cudaStream_t stream, const TIn* input, TOut* output, int m, int n, bool reset_initial_output) { using TBuf = AccumulationType_t; - 
return detail::call_reduce_matrix_rows(input, output, m, n, reset_initial_output); + return detail::call_reduce_matrix_rows(stream, input, output, m, n, reset_initial_output); } #define INSTANTIATE_REDUCE_MATRIX_ROWS(T) \ - template Status reduce_matrix_rows(const T* input, T* output, int m, int n, bool reset_initial_output) + template Status reduce_matrix_rows(cudaStream_t stream, const T* input, T* output, int m, int n, bool reset_initial_output) INSTANTIATE_REDUCE_MATRIX_ROWS(half); INSTANTIATE_REDUCE_MATRIX_ROWS(float); INSTANTIATE_REDUCE_MATRIX_ROWS(double); @@ -474,13 +474,13 @@ INSTANTIATE_REDUCE_MATRIX_ROWS(nv_bfloat16); #undef INSTANTIATE_REDUCE_MATRIX_ROWS template -Status reduce_matrix_columns(const TIn* input, TOut* output, int m, int n, void* buffer, size_t buffer_size) { +Status reduce_matrix_columns(cudaStream_t stream, const TIn* input, TOut* output, int m, int n, void* buffer, size_t buffer_size) { return detail::call_reduce_matrix_columns( - input, output, m, n, buffer, buffer_size); + stream, input, output, m, n, buffer, buffer_size); } #define INSTANTIATE_REDUCE_MATRIX_COLUMNS(T) \ - template Status reduce_matrix_columns(const T* input, T* output, int m, int n, void* buffer, size_t buffer_size) + template Status reduce_matrix_columns(cudaStream_t stream, const T* input, T* output, int m, int n, void* buffer, size_t buffer_size) INSTANTIATE_REDUCE_MATRIX_COLUMNS(half); INSTANTIATE_REDUCE_MATRIX_COLUMNS(float); INSTANTIATE_REDUCE_MATRIX_COLUMNS(double); diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_functions.h b/onnxruntime/core/providers/cuda/reduction/reduction_functions.h index 69988862aa..965de5a2bd 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_functions.h +++ b/onnxruntime/core/providers/cuda/reduction/reduction_functions.h @@ -43,19 +43,19 @@ size_t compute_reduction_buffer_size(int size) { /** Computes the sum of the given elements. 
*/ template -Status reduce_sum(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); +Status reduce_sum(cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); /** Computes the sum of the squares of the given elements. */ template -Status reduce_square_sum(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); +Status reduce_square_sum(cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); /** Computes the L2 norm of the given elements. */ template -Status reduce_l2_norm(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); +Status reduce_l2_norm(cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); /** Computes the mean of the given elements. */ template -Status reduce_mean(const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); +Status reduce_mean(cudaStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size); enum class ApplicableMatrixReduction { // can use reduce_matrix_rows() @@ -89,7 +89,7 @@ ApplicableMatrixReduction get_applicable_matrix_reduction( * @param reset_initial_output Whether to reset (i.e., zero) the output values first. */ template -Status reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, bool reset_initial_output = true); +Status reduce_matrix_rows(cudaStream_t stream, const TIn* input, TOut* output, int m, int n, bool reset_initial_output = true); /** * Reduces the columns in a row-major matrix to a single column containing the sum of each row. @@ -101,7 +101,7 @@ Status reduce_matrix_rows(const TIn* input, TOut* output, int m, int n, bool res * @param buffer_size The size of the intermediate buffer in bytes. 
*/ template -Status reduce_matrix_columns(const TIn* input, TOut* output, int m, int n, void* buffer, size_t buffer_size); +Status reduce_matrix_columns(cudaStream_t stream, const TIn* input, TOut* output, int m, int n, void* buffer, size_t buffer_size); } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc index c4d6bc11c0..612dee590a 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc @@ -149,6 +149,7 @@ Status ReduceKernel::ReduceKernelShared( switch (applicable_matrix_reduction) { case ApplicableMatrixReduction::Rows: { return reduce_matrix_rows( + Stream(), reinterpret_cast(X), reinterpret_cast(Y), m, n, false); @@ -167,7 +168,7 @@ Status ReduceKernel::ReduceKernelShared( // ArgMax/ArgMin with FP16 are not supported by cudnn, so convert input to fp32 then call cudnn temp_X = GetScratchBuffer(input_count); cudnn_type_X = CUDNN_DATA_FLOAT; - Impl_Cast(reinterpret_cast(X), temp_X.get(), input_shape.Size()); + Impl_Cast(Stream(), reinterpret_cast(X), temp_X.get(), input_shape.Size()); } // CUDNN requires at least 3D input, so pad 1s if needed @@ -208,7 +209,7 @@ Status ReduceKernel::ReduceKernelShared( input_data_buffer = GetScratchBuffer(input_count); input_data = reinterpret_cast(input_data_buffer.get()); fast_divmod tmp_div; - Impl_Mul(static_cast(SimpleBroadcast::NoBroadcast), nullptr, + Impl_Mul(Stream(), static_cast(SimpleBroadcast::NoBroadcast), nullptr, reinterpret_cast(X), nullptr, reinterpret_cast(X), nullptr, tmp_div, tmp_div, @@ -233,7 +234,8 @@ Status ReduceKernel::ReduceKernelShared( auto log_sum_result = log_sum_result_buffer.get(); BinaryElementwisePreparation prepare; ORT_RETURN_IF_ERROR(prepare.BinaryElementwiseBroadcastPrepareHelper(input_shape, rhs_shape, input_shape)); - Impl_Sub(prepare.output_rank_or_simple_broadcast, + 
Impl_Sub(Stream(), + prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast(X), &prepare.rhs_padded_strides, @@ -242,7 +244,7 @@ Status ReduceKernel::ReduceKernelShared( prepare.fdm_H, prepare.fdm_C, reinterpret_cast(exp_result), input_count); - Impl_Exp(reinterpret_cast(exp_result), + Impl_Exp(Stream(), reinterpret_cast(exp_result), reinterpret_cast(exp_result), input_count); @@ -253,13 +255,13 @@ Status ReduceKernel::ReduceKernelShared( &zero, output_tensor, reinterpret_cast(log_sum_result))); // Log(Sum) - Impl_Log(reinterpret_cast(log_sum_result), + Impl_Log(Stream(), reinterpret_cast(log_sum_result), reinterpret_cast(log_sum_result), output_count); // Log + ReduceMax fast_divmod tmp_div; - Impl_Add(static_cast(SimpleBroadcast::NoBroadcast), nullptr, + Impl_Add(Stream(), static_cast(SimpleBroadcast::NoBroadcast), nullptr, reinterpret_cast(log_sum_result), nullptr, reinterpret_cast(Y), nullptr, tmp_div, tmp_div, @@ -276,7 +278,7 @@ Status ReduceKernel::ReduceKernelShared( // cudnnReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case if (input_count == output_count) { if (reinterpret_cast(Y) != reinterpret_cast(X)) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y, X, input_count * sizeof(T), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y, X, input_count * sizeof(T), cudaMemcpyDeviceToDevice, Stream())); } } else { CUDNN_RETURN_IF_ERROR(cudnnReduceTensor( @@ -301,11 +303,11 @@ Status ReduceKernel::ReduceKernelShared( } // CUDA reduction index is uint32_t for now, cast it to int64_t according to ONNX spec - Impl_Cast(reinterpret_cast(indices_cuda.get()), reinterpret_cast(Y), output_count); + Impl_Cast(Stream(), reinterpret_cast(indices_cuda.get()), reinterpret_cast(Y), output_count); } if (calculate_log_) { - Impl_Log(reinterpret_cast(Y), + Impl_Log(Stream(), reinterpret_cast(Y), reinterpret_cast(Y), output_count); } @@ -421,7 +423,7 @@ Status 
ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr std::vector& output_dims = prepare_reduce_metadata.output_dims; std::vector& input_dims_cudnn = prepare_reduce_metadata.input_dims_cudnn; std::vector& output_dims_cudnn = prepare_reduce_metadata.output_dims_cudnn; - + cudaStream_t stream = static_cast(cuda_ep.GetComputeStream()); // special case when there is a dim value of 0 in the shape. if (input_count == 0) { assert(output.Shape().Size() == 0); @@ -436,6 +438,7 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr switch (applicable_matrix_reduction) { case ApplicableMatrixReduction::Rows: { return reduce_matrix_rows( + stream, reinterpret_cast(input.template Data()), reinterpret_cast(output.template MutableData()), m, n); @@ -444,6 +447,7 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr const auto buffer_size_bytes = compute_reduce_matrix_columns_buffer_size(m, n); auto buffer = cuda_ep.GetScratchBuffer(buffer_size_bytes); return reduce_matrix_columns( + stream, reinterpret_cast(input.template Data()), reinterpret_cast(output.template MutableData()), m, n, buffer.get(), buffer_size_bytes); @@ -455,7 +459,7 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. 
// Therefore zeroing out the memory is required - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output.MutableDataRaw(), 0, output.SizeInBytes())); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output.MutableDataRaw(), 0, output.SizeInBytes(), stream)); IAllocatorUniquePtr temp_X; cudnnDataType_t cudnn_type_X = CudnnTensor::GetDataType(); @@ -464,7 +468,7 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr // ArgMax/ArgMin with FP16 are not supported by cudnn, so convert input to fp32 then call cudnn temp_X = cuda_ep.GetScratchBuffer(input_count); cudnn_type_X = CUDNN_DATA_FLOAT; - Impl_Cast(reinterpret_cast(input.template Data()), temp_X.get(), input_shape.Size()); + Impl_Cast(stream, reinterpret_cast(input.template Data()), temp_X.get(), input_shape.Size()); } CudnnReduceDescriptor reduce_desc; @@ -497,7 +501,8 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr input_data_buffer = cuda_ep.GetScratchBuffer(input_count); input_data = reinterpret_cast(input_data_buffer.get()); fast_divmod tmp_div; - Impl_Mul(static_cast(SimpleBroadcast::NoBroadcast), nullptr, + Impl_Mul(stream, + static_cast(SimpleBroadcast::NoBroadcast), nullptr, reinterpret_cast(input.template Data()), nullptr, reinterpret_cast(input.template Data()), nullptr, tmp_div, tmp_div, @@ -507,7 +512,7 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr // This happens when the input is Scalar if (input_count == output_count) { if (output.template MutableData() != input.template Data()) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output.template MutableData(), input.template Data(), input_count * sizeof(T), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output.template MutableData(), input.template Data(), input_count * sizeof(T), cudaMemcpyDeviceToDevice, stream)); } } else { // Reduce max -- Max/Min will output indices data @@ -536,7 +541,8 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const 
Tensor& input, Pr auto log_sum_result = log_sum_result_buffer.get(); BinaryElementwisePreparation prepare; ORT_RETURN_IF_ERROR(prepare.BinaryElementwiseBroadcastPrepareHelper(input_shape, output_shape, input_shape)); - Impl_Sub(prepare.output_rank_or_simple_broadcast, + Impl_Sub(stream, + prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast(input.template Data()), &prepare.rhs_padded_strides, @@ -545,14 +551,15 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr prepare.fdm_H, prepare.fdm_C, reinterpret_cast(exp_result), input_count); - Impl_Exp(reinterpret_cast(exp_result), + Impl_Exp(stream, + reinterpret_cast(exp_result), reinterpret_cast(exp_result), input_count); // cudnnReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case // This happens when the input is Scalar. We do not need to add anything in this case. if (input_count == output_count) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(reinterpret_cast(log_sum_result), exp_result, input_count * sizeof(T), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(reinterpret_cast(log_sum_result), exp_result, input_count * sizeof(T), cudaMemcpyDeviceToDevice, stream)); } else { // ReduceSum CUDNN_RETURN_IF_ERROR(cudnnReduceTensor( @@ -563,13 +570,13 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr } // Log(Sum) - Impl_Log(reinterpret_cast(log_sum_result), + Impl_Log(stream, reinterpret_cast(log_sum_result), reinterpret_cast(log_sum_result), output_count); // Log + ReduceMax fast_divmod tmp_div; - Impl_Add(static_cast(SimpleBroadcast::NoBroadcast), nullptr, + Impl_Add(stream, static_cast(SimpleBroadcast::NoBroadcast), nullptr, reinterpret_cast(log_sum_result), nullptr, reinterpret_cast(output.template MutableData()), nullptr, tmp_div, tmp_div, @@ -581,7 +588,7 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr // 
cudnnReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case // This happens when the input is Scalar. We do not need to add anything in this case. if (input_count == output_count) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(reinterpret_cast(output.template MutableData()), input_data, input_count * sizeof(T), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(reinterpret_cast(output.template MutableData()), input_data, input_count * sizeof(T), cudaMemcpyDeviceToDevice, stream)); } else { CUDNN_RETURN_IF_ERROR(cudnnReduceTensor( cuda_ep.PerThreadCudnnHandle(), reduce_desc, indices_cuda.get(), indices_bytes, @@ -593,7 +600,7 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr // cudnnReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case if (input_count == output_count) { if (output.template MutableData() != input.template Data()) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output.template MutableData(), input.template Data(), input_count * sizeof(T), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output.template MutableData(), input.template Data(), input_count * sizeof(T), cudaMemcpyDeviceToDevice, stream)); } } else { CUDNN_RETURN_IF_ERROR(cudnnReduceTensor( @@ -603,7 +610,8 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr &zero, output_tensor, reinterpret_cast(output.template MutableData()))); } } - } else { // For ArgMax & ArgMin ops, use the indicies as the output with int64 type + } else { + // For ArgMax & ArgMin ops, use the indicies as the output with int64 type // cudnnReduceTensor has issue if input and output has same size, which will happen if the axis to be reduced has dim value of 1. 
// the output is zeros of the output size if (input_count == output_count) { @@ -626,12 +634,13 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr } // CUDA reduction index is uint32_t for now, cast it to int64_t according to ONNX spec - Impl_Cast(reinterpret_cast(indices_cuda.get()), output.template MutableData(), output_count); + Impl_Cast(stream, reinterpret_cast(indices_cuda.get()), output.template MutableData(), output_count); } } if (calculate_log) { - Impl_Log(reinterpret_cast(output.template MutableData()), + Impl_Log(stream, + reinterpret_cast(output.template MutableData()), reinterpret_cast(output.template MutableData()), output_count); } @@ -661,7 +670,7 @@ Status ReduceKernel::ComputeImpl(OpKernelContext* ctx, cudnnRe // empty axes and no-op if (axes.empty() && noop_with_empty_axes_) { auto* Y = ctx->Output(0, X->Shape()); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), cudaMemcpyDeviceToDevice, Stream())); return Status::OK(); } @@ -700,7 +709,7 @@ Status ReduceKernel::ComputeImpl(OpKernelContext* ctx, cudnnRe if (axes.empty() && noop_with_empty_axes_) { \ auto* Y = ctx->Output(0, X->Shape()); \ CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), \ - cudaMemcpyDeviceToDevice)); \ + cudaMemcpyDeviceToDevice, Stream())); \ return Status::OK(); \ } \ \ @@ -722,12 +731,12 @@ Status ReduceKernel::ComputeImpl(OpKernelContext* ctx, cudnnRe if (input_count == output_count) { \ if (Y->template MutableData() != X->template Data()) { \ CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), \ - input_count * sizeof(T), cudaMemcpyDeviceToDevice)); \ + input_count * sizeof(T), cudaMemcpyDeviceToDevice, Stream())); \ } \ return Status::OK(); \ } \ \ - 
CUDA_RETURN_IF_ERROR(cudaMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes())); \ + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes(), Stream())); \ \ size_t indices_bytes = 0; \ size_t workspace_bytes = 0; \ @@ -737,7 +746,7 @@ Status ReduceKernel::ComputeImpl(OpKernelContext* ctx, cudnnRe \ cudnnDataType_t cudnn_type_X = CUDNN_DATA_FLOAT; \ IAllocatorUniquePtr temp_X = GetScratchBuffer(input_count); \ - Impl_Cast(reinterpret_cast(X->template Data()), temp_X.get(), X->Shape().Size()); \ + Impl_Cast(Stream(), reinterpret_cast(X->template Data()), temp_X.get(), X->Shape().Size()); \ \ ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, cudnn_type_X, CUDNN_REDUCE_TENSOR_FLATTENED_INDICES)); \ ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_cudnn, cudnn_type_X)); \ @@ -756,7 +765,7 @@ Status ReduceKernel::ComputeImpl(OpKernelContext* ctx, cudnnRe workspace_cuda.get(), workspace_bytes, &one, input_tensor, temp_X.get(), \ &zero, output_tensor, temp_Y.get())); \ \ - Impl_Cast(temp_Y.get(), reinterpret_cast(Y->template MutableData()), output_count); \ + Impl_Cast(Stream(), temp_Y.get(), reinterpret_cast(Y->template MutableData()), output_count); \ \ return Status::OK(); \ } @@ -788,7 +797,7 @@ Status ReduceKernel::ComputeImpl if (axes.empty() && noop_with_empty_axes_) { auto* Y = ctx->Output(0, X->Shape()); CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), - X->SizeInBytes(), cudaMemcpyDeviceToDevice)); + X->SizeInBytes(), cudaMemcpyDeviceToDevice, Stream())); return Status::OK(); } @@ -810,7 +819,7 @@ Status ReduceKernel::ComputeImpl if (input_count == output_count) { if (Y->template MutableData() != X->template Data()) { CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), - input_count * sizeof(BFloat16), cudaMemcpyDeviceToDevice)); + input_count * sizeof(BFloat16), cudaMemcpyDeviceToDevice, Stream())); } return Status::OK(); } @@ -821,13 +830,13 @@ Status 
ReduceKernel::ComputeImpl get_applicable_matrix_reduction(cudnn_reduce_op, X->Shape().GetDims(), axes, m, n); switch (applicable_matrix_reduction) { case ApplicableMatrixReduction::Rows: { - return reduce_matrix_rows(reinterpret_cast(X->template Data()), + return reduce_matrix_rows(Stream(), reinterpret_cast(X->template Data()), reinterpret_cast(Y->template MutableData()), m, n); } case ApplicableMatrixReduction::Columns: { const auto buffer_size_bytes = compute_reduce_matrix_columns_buffer_size(m, n); auto buffer = cuda_ep_->GetScratchBuffer(buffer_size_bytes); - return reduce_matrix_columns(reinterpret_cast(X->template Data()), + return reduce_matrix_columns(Stream(), reinterpret_cast(X->template Data()), reinterpret_cast(Y->template MutableData()), m, n, buffer.get(), buffer_size_bytes); } @@ -836,7 +845,7 @@ Status ReduceKernel::ComputeImpl } } - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes())); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes(), Stream())); size_t indices_bytes = 0; size_t workspace_bytes = 0; @@ -846,7 +855,7 @@ Status ReduceKernel::ComputeImpl cudnnDataType_t cudnn_type_X = CUDNN_DATA_FLOAT; IAllocatorUniquePtr temp_X = GetScratchBuffer(input_count); - Impl_Cast(reinterpret_cast(X->template Data()), temp_X.get(), + Impl_Cast(Stream(), reinterpret_cast(X->template Data()), temp_X.get(), X->Shape().Size()); ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, cudnn_type_X, CUDNN_REDUCE_TENSOR_FLATTENED_INDICES)); @@ -866,7 +875,7 @@ Status ReduceKernel::ComputeImpl workspace_cuda.get(), workspace_bytes, &one, input_tensor, temp_X.get(), &zero, output_tensor, temp_Y.get())); - Impl_Cast(temp_Y.get(), reinterpret_cast(Y->template MutableData()), output_count); + Impl_Cast(Stream(), temp_Y.get(), reinterpret_cast(Y->template MutableData()), output_count); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc 
b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc index 24d1f4e3c7..01c237e0f5 100644 --- a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc +++ b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc @@ -34,7 +34,7 @@ void CudnnRnnBase::SetWeightBias(const cudnnHandle_t handle, cudnnGetFilterNdDescriptor(filter_desc, 3, &dt, &tf, &numDims, matDims.data()); int count = matDims[0] * matDims[1] * matDims[2]; - cudaMemcpyAsync(mem_offset, pos + offset, count * sizeof(T), cudaMemcpyDeviceToDevice); + CUDA_CALL_THROW(cudaMemcpyAsync(mem_offset, pos + offset, count * sizeof(T), cudaMemcpyDeviceToDevice, Stream())); offset += count; } template @@ -190,7 +190,8 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { if (reverse_) { // reverse input data x_reversed_data = GetScratchBuffer(seq_length * batch_size * input_size); - ReverseBySequence(gsl::narrow_cast(seq_length), + ReverseBySequence(Stream(), + gsl::narrow_cast(seq_length), gsl::narrow_cast(batch_size), gsl::narrow_cast(input_size), reinterpret_cast(x_data), @@ -331,14 +332,16 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { y_reorganized_data = GetScratchBuffer(output_size); if (reverse_) { //reverse output data - ReverseBySequence(gsl::narrow_cast(seq_length), + ReverseBySequence(Stream(), + gsl::narrow_cast(seq_length), gsl::narrow_cast(batch_size), gsl::narrow_cast(hidden_size_), reinterpret_cast(y_data), reinterpret_cast(y_reorganized_data.get()), output_size); } else { - ReorderBidirectionalDataInSequence(gsl::narrow_cast(seq_length), + ReorderBidirectionalDataInSequence(Stream(), + gsl::narrow_cast(seq_length), gsl::narrow_cast(batch_size), gsl::narrow_cast(hidden_size_), reinterpret_cast(y_data), @@ -348,7 +351,7 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { if (Y != nullptr) { // User specified this optional output, so need to copy the reversed data to orignial place - cudaMemcpyAsync(y_data, y_reorganized_data.get(), output_size * 
sizeof(T), cudaMemcpyDeviceToDevice); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(y_data, y_reorganized_data.get(), output_size * sizeof(T), cudaMemcpyDeviceToDevice, Stream())); } else { y_data = y_reorganized_data.get(); } @@ -363,7 +366,8 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { CudaAsyncBuffer sequence_lens_buffer(this, batch_size); memcpy(sequence_lens_buffer.CpuPtr(), sequence_lens_data, batch_size * sizeof(int32_t)); ORT_RETURN_IF_ERROR(sequence_lens_buffer.CopyToGpu()); - RnnMaskImpl(gsl::narrow_cast(num_directions_), + RnnMaskImpl(Stream(), + gsl::narrow_cast(num_directions_), gsl::narrow_cast(seq_length), gsl::narrow_cast(batch_size), gsl::narrow_cast(hidden_size_), @@ -386,7 +390,8 @@ void CudnnRnnBase::SetZeroSequences(const int64_t zero_seq_index_cache_size, CudaAsyncBuffer zero_seq_index_cache_async_buffer(this, zero_seq_index_cache_size); memcpy(zero_seq_index_cache_async_buffer.CpuPtr(), zero_seq_index_cache.data(), zero_seq_index_cache_size * sizeof(int32_t)); ORT_THROW_IF_ERROR(zero_seq_index_cache_async_buffer.CopyToGpu()); - MaskZeroSequences(gsl::narrow_cast(hidden_size_), + MaskZeroSequences(Stream(), + gsl::narrow_cast(hidden_size_), reinterpret_cast(y_data), reinterpret_cast(y_h_data), reinterpret_cast(y_c_data), diff --git a/onnxruntime/core/providers/cuda/rnn/rnn_impl.cu b/onnxruntime/core/providers/cuda/rnn/rnn_impl.cu index 930c3a4ddd..d485855ddb 100644 --- a/onnxruntime/core/providers/cuda/rnn/rnn_impl.cu +++ b/onnxruntime/core/providers/cuda/rnn/rnn_impl.cu @@ -22,7 +22,8 @@ __global__ void _ReverseBySequenceKernel(const int32_t seq_length, } template -void ReverseBySequence(const int32_t seq_length, +void ReverseBySequence(cudaStream_t stream, + const int32_t seq_length, const int32_t batch_size, const int32_t input_or_hidden_size, const T* data, @@ -32,7 +33,7 @@ void ReverseBySequence(const int32_t seq_length, int32_t block_size = batch_size * input_or_hidden_size; fast_divmod div_batch_block(block_size); int 
blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); - _ReverseBySequenceKernel<<>>( + _ReverseBySequenceKernel<<>>( seq_length, block_size, div_batch_block, data, reversed_data, (CUDA_LONG)N); } @@ -61,7 +62,8 @@ __global__ void _BidirectionalDataKernel(const int32_t seq_length, } template -void ReorderBidirectionalDataInSequence(const int32_t seq_length, +void ReorderBidirectionalDataInSequence(cudaStream_t stream, + const int32_t seq_length, const int32_t batch_size, const int32_t hidden_size, const T* data, @@ -74,7 +76,7 @@ void ReorderBidirectionalDataInSequence(const int32_t seq_length, fast_divmod div_output_block(hidden_size); int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); - _BidirectionalDataKernel<<>>( + _BidirectionalDataKernel<<>>( seq_length, batch_size, hidden_size, seq_block_size, div_seq_block, div_output_block, data, reordered_data, (CUDA_LONG)N); @@ -116,7 +118,8 @@ __global__ void _RnnMaskKernel(const int32_t seq_length, } template -void RnnMaskImpl(const int32_t num_directions, +void RnnMaskImpl(cudaStream_t stream, + const int32_t num_directions, const int32_t seq_length, const int32_t batch_size, const int32_t hidden_size, @@ -128,7 +131,7 @@ void RnnMaskImpl(const int32_t num_directions, fast_divmod div_dir_block(batch_size * hidden_size); fast_divmod div_batch_block(hidden_size); int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); - _RnnMaskKernel<<>>( + _RnnMaskKernel<<>>( seq_length, batch_size, hidden_size, sequence_lens, div_seq_block, div_dir_block, div_batch_block, y_output_data, y_h_output_data, (CUDA_LONG)N); } @@ -164,19 +167,21 @@ __global__ void _MaskZeroSequences(const int32_t hidden_size, } template -void MaskZeroSequences(const int32_t hidden_size, +void MaskZeroSequences(cudaStream_t stream, + const int32_t hidden_size, T* y_output_data, T* y_h_output_data, T* y_c_output_data, const int32_t* zeor_seq_index_cache, const size_t N) { int 
blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); - _MaskZeroSequences<<>>( + _MaskZeroSequences<<>>( hidden_size, y_output_data, y_h_output_data, y_c_output_data, zeor_seq_index_cache, (CUDA_LONG)N); } #define SPECIALIZED_RNN_IMPL(T) \ - template void RnnMaskImpl(const int32_t num_directions, \ + template void RnnMaskImpl(cudaStream_t stream, \ + const int32_t num_directions, \ const int32_t seq_length, \ const int32_t batch_size, \ const int32_t hidden_size, \ @@ -184,19 +189,22 @@ void MaskZeroSequences(const int32_t hidden_size, T* y_output_data, \ T* y_h_output_data, \ const size_t N); \ - template void ReverseBySequence(const int32_t seq_length, \ + template void ReverseBySequence(cudaStream_t stream, \ + const int32_t seq_length, \ const int32_t batch_size, \ const int32_t hidden_size, \ const T* data, \ T* reversed_data, \ const size_t N); \ - template void ReorderBidirectionalDataInSequence(const int32_t seq_length, \ + template void ReorderBidirectionalDataInSequence(cudaStream_t stream,\ + const int32_t seq_length, \ const int32_t batch_size, \ const int32_t hidden_size,\ const T* data, \ T* reordered_data, \ const size_t N); \ -template void MaskZeroSequences(const int32_t hidden_size, \ +template void MaskZeroSequences(cudaStream_t stream, \ + const int32_t hidden_size, \ T* y_output_data, \ T* y_h_output_data, \ T* y_c_output_data, \ diff --git a/onnxruntime/core/providers/cuda/rnn/rnn_impl.h b/onnxruntime/core/providers/cuda/rnn/rnn_impl.h index 78ceabf23b..0c00c2d2a9 100644 --- a/onnxruntime/core/providers/cuda/rnn/rnn_impl.h +++ b/onnxruntime/core/providers/cuda/rnn/rnn_impl.h @@ -9,7 +9,8 @@ namespace onnxruntime { namespace cuda { template -void ReverseBySequence(const int32_t seq_length, +void ReverseBySequence(cudaStream_t stream, + const int32_t seq_length, const int32_t batch_size, const int32_t input_or_hidden_size, const T* data, @@ -17,7 +18,8 @@ void ReverseBySequence(const int32_t seq_length, const size_t N); 
template -void ReorderBidirectionalDataInSequence(const int32_t seq_length, +void ReorderBidirectionalDataInSequence(cudaStream_t stream, + const int32_t seq_length, const int32_t batch_size, const int32_t hidden_size, const T* data, @@ -25,7 +27,8 @@ void ReorderBidirectionalDataInSequence(const int32_t seq_length, const size_t N); template -void RnnMaskImpl(const int32_t num_directions, +void RnnMaskImpl(cudaStream_t stream, + const int32_t num_directions, const int32_t seq_length, const int32_t batch_size, const int32_t hidden_size, @@ -35,7 +38,8 @@ void RnnMaskImpl(const int32_t num_directions, const size_t N); template -void MaskZeroSequences(const int32_t hidden_size, +void MaskZeroSequences(cudaStream_t stream, + const int32_t hidden_size, T* y_output_data, T* y_h_output_data, T* y_c_output_data, diff --git a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h index 1efd51d1ba..483934990b 100644 --- a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h +++ b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h @@ -34,14 +34,14 @@ template class IConstantBuffer { public: virtual ~IConstantBuffer(){}; - virtual const T* GetBuffer(size_t count) = 0; + virtual const T* GetBuffer(cudaStream_t stream, size_t count) = 0; }; template std::unique_ptr> CreateConstantOnes(); template -void Fill(T* output, T value, int64_t count); +void Fill(cudaStream_t stream, T* output, T value, int64_t count); /* This is a utility wrapper for arbitrary type array diff --git a/onnxruntime/core/providers/cuda/shared_inc/fpgeneric.h b/onnxruntime/core/providers/cuda/shared_inc/fpgeneric.h index 90fdd2aea5..6b46550fae 100644 --- a/onnxruntime/core/providers/cuda/shared_inc/fpgeneric.h +++ b/onnxruntime/core/providers/cuda/shared_inc/fpgeneric.h @@ -417,24 +417,24 @@ inline cublasStatus_t cublasGemmStridedBatchedHelper(cublasHandle_t handle, #endif // transpose using geam -inline cublasStatus_t 
cublasTransposeHelper(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const float* alpha, const float* A, int lda, const float* beta, const float* B, int ldb, float* C, int ldc) { +inline cublasStatus_t cublasTransposeHelper(cudaStream_t, cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const float* alpha, const float* A, int lda, const float* beta, const float* B, int ldb, float* C, int ldc) { return cublasSgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); } -inline cublasStatus_t cublasTransposeHelper(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const double* alpha, const double* A, int lda, const double* beta, const double* B, int ldb, double* C, int ldc) { +inline cublasStatus_t cublasTransposeHelper(cudaStream_t, cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, const double* alpha, const double* A, int lda, const double* beta, const double* B, int ldb, double* C, int ldc) { return cublasDgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); } -cublasStatus_t cublasTransposeHelper(cublasHandle_t, cublasOperation_t, cublasOperation_t, int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int); +cublasStatus_t cublasTransposeHelper(cudaStream_t, cublasHandle_t, cublasOperation_t, cublasOperation_t, int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int); // copy -inline cublasStatus_t cublasCopyHelper(cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy) { +inline cublasStatus_t cublasCopyHelper(cudaStream_t, cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy) { return cublasScopy(handle, n, x, incx, y, incy); } -inline cublasStatus_t cublasCopyHelper(cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy) { +inline 
cublasStatus_t cublasCopyHelper(cudaStream_t, cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy) { return cublasDcopy(handle, n, x, incx, y, incy); } -cublasStatus_t cublasCopyHelper(cublasHandle_t handle, int n, const half* x, int incx, half* y, int incy); +cublasStatus_t cublasCopyHelper(cudaStream_t stream, cublasHandle_t handle, int n, const half* x, int incx, half* y, int incy); #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 -cublasStatus_t cublasCopyHelper(cublasHandle_t handle, int n, const nv_bfloat16* x, int incx, nv_bfloat16* y, int incy); +cublasStatus_t cublasCopyHelper(cudaStream_t stream, cublasHandle_t handle, int n, const nv_bfloat16* x, int incx, nv_bfloat16* y, int incy); #endif diff --git a/onnxruntime/core/providers/cuda/tensor/cast_op.cc b/onnxruntime/core/providers/cuda/tensor/cast_op.cc index 156597e3a0..fe5146c17c 100644 --- a/onnxruntime/core/providers/cuda/tensor/cast_op.cc +++ b/onnxruntime/core/providers/cuda/tensor/cast_op.cc @@ -71,6 +71,7 @@ Status Cast::ComputeInternal(OpKernelContext* context) const { case TP_TYPE: \ if (count > 0) { \ Impl_Cast::MappedType>( \ + Stream(), \ x_data, \ reinterpret_cast::MappedType*>(Y->template MutableData()), \ count); \ diff --git a/onnxruntime/core/providers/cuda/tensor/compress.cc b/onnxruntime/core/providers/cuda/tensor/compress.cc index 56445f8333..91dd3f5222 100644 --- a/onnxruntime/core/providers/cuda/tensor/compress.cc +++ b/onnxruntime/core/providers/cuda/tensor/compress.cc @@ -52,10 +52,24 @@ Status Compress::ComputeInternal(OpKernelContext* ctx) const { auto condition_cumulative_sum_buffer = GetScratchBuffer(valid_condition_length); auto condition_cumulative_sum = condition_cumulative_sum_buffer.get(); - PrefixSumImpl(reinterpret_cast(condition_data), condition_cumulative_sum, valid_condition_length); + size_t temp_storage_bytes = 0; + CUDA_RETURN_IF_ERROR(CompressCalcPrefixSumTempStorageBytes(Stream(), + reinterpret_cast(condition_data), + 
condition_cumulative_sum, + static_cast(valid_condition_length), + temp_storage_bytes)); + auto temp_buffer = GetScratchBuffer(temp_storage_bytes); + auto d_temp_storage = temp_buffer.get(); + CUDA_RETURN_IF_ERROR(CompressInclusivePrefixSum(Stream(), + d_temp_storage, + temp_storage_bytes, + reinterpret_cast(condition_data), + condition_cumulative_sum, + static_cast(valid_condition_length))); + // cudaMemcpyAsync from device memory to pageable host memory will return only once the copy has completed. int32_t positive_condition_count = 0; - CUDA_RETURN_IF_ERROR(cudaMemcpy(&positive_condition_count, condition_cumulative_sum + valid_condition_length - 1, sizeof(int32_t), cudaMemcpyDeviceToHost)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(&positive_condition_count, condition_cumulative_sum + valid_condition_length - 1, sizeof(int32_t), cudaMemcpyDeviceToHost, Stream())); std::vector output_dims(input_dimensions); if (has_axis_) { @@ -80,7 +94,8 @@ Status Compress::ComputeInternal(OpKernelContext* ctx) const { } } - ORT_RETURN_IF_ERROR(CompressImpl(element_bytes, + ORT_RETURN_IF_ERROR(CompressImpl(Stream(), + element_bytes, gsl::narrow_cast(valid_condition_length), gsl::narrow_cast(axis_right_stride), has_axis_ ? 
gsl::narrow_cast(input_dimensions[axis]) diff --git a/onnxruntime/core/providers/cuda/tensor/compress_impl.cu b/onnxruntime/core/providers/cuda/tensor/compress_impl.cu index 58d4102936..6f936e5965 100644 --- a/onnxruntime/core/providers/cuda/tensor/compress_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/compress_impl.cu @@ -3,21 +3,25 @@ #include "core/providers/cuda/cu_inc/common.cuh" #include "core/providers/cuda/cuda_common.h" -#include "compress_impl.h" + //TODO:fix the warnings #ifdef _MSC_VER #pragma warning(disable : 4244) #endif -#include -#include + +#include "core/providers/cuda/tensor/compress_impl.h" +#include namespace onnxruntime { namespace cuda { -void PrefixSumImpl(const int8_t* condition_data, - int32_t* condition_cumulative_sum, - const size_t length) { - thrust::inclusive_scan(thrust::device, condition_data, condition_data + length, condition_cumulative_sum); +cudaError_t CompressCalcPrefixSumTempStorageBytes(cudaStream_t stream, const int8_t* condition_data, int* condition_cumulative_sum, int length, size_t& temp_storage_bytes) { + return cub::DeviceScan::InclusiveSum( + nullptr, temp_storage_bytes, condition_data, condition_cumulative_sum, length, stream); +} +cudaError_t CompressInclusivePrefixSum(cudaStream_t stream, void* d_temp_storage, size_t temp_storage_bytes, const int8_t* condition_data, int* condition_cumulative_sum, int length) { + return cub::DeviceScan::InclusiveSum( + d_temp_storage, temp_storage_bytes, condition_data, condition_cumulative_sum, length, stream); } template @@ -44,7 +48,8 @@ __global__ void _CompressKernel(const int32_t valid_condition_length, } } -Status CompressImpl(const size_t element_bytes, +Status CompressImpl(cudaStream_t stream, + const size_t element_bytes, const int32_t valid_condition_length, const int32_t axis_right_stride, const int32_t input_axis_dim_length, @@ -62,7 +67,7 @@ Status CompressImpl(const size_t element_bytes, switch (element_bytes) { case sizeof(int8_t): - _CompressKernel<<>>( + 
_CompressKernel<<>>( valid_condition_length, axis_right_stride_div, input_axis_included_stride_div, @@ -74,7 +79,7 @@ Status CompressImpl(const size_t element_bytes, (CUDA_LONG)N); break; case sizeof(int16_t): - _CompressKernel<<>>( + _CompressKernel<<>>( valid_condition_length, axis_right_stride_div, input_axis_included_stride_div, @@ -86,7 +91,7 @@ Status CompressImpl(const size_t element_bytes, (CUDA_LONG)N); break; case sizeof(int32_t): - _CompressKernel<<>>( + _CompressKernel<<>>( valid_condition_length, axis_right_stride_div, input_axis_included_stride_div, @@ -98,7 +103,7 @@ Status CompressImpl(const size_t element_bytes, (CUDA_LONG)N); break; case sizeof(int64_t): - _CompressKernel<<>>( + _CompressKernel<<>>( valid_condition_length, axis_right_stride_div, input_axis_included_stride_div, diff --git a/onnxruntime/core/providers/cuda/tensor/compress_impl.h b/onnxruntime/core/providers/cuda/tensor/compress_impl.h index 08005944cc..3397841476 100644 --- a/onnxruntime/core/providers/cuda/tensor/compress_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/compress_impl.h @@ -9,11 +9,11 @@ namespace onnxruntime { namespace cuda { -void PrefixSumImpl(const int8_t* condition_data, - int32_t* condition_cumulative_sum, - const size_t length); +cudaError_t CompressCalcPrefixSumTempStorageBytes(cudaStream_t stream, const int8_t* condition_data, int* condition_cumulative_sum, int length, size_t& temp_storage_bytes); +cudaError_t CompressInclusivePrefixSum(cudaStream_t stream, void* d_temp_storage, size_t temp_storage_bytes, const int8_t* condition_data, int* condition_cumulative_sum, int length); -Status CompressImpl(const size_t element_bytes, +Status CompressImpl(cudaStream_t stream, + const size_t element_bytes, const int32_t valid_condition_length, const int32_t axis_right_stride, const int32_t input_axis_dim_length, diff --git a/onnxruntime/core/providers/cuda/tensor/concat.cc b/onnxruntime/core/providers/cuda/tensor/concat.cc index 309d9c8243..dfeace4b7e 100644 --- 
a/onnxruntime/core/providers/cuda/tensor/concat.cc +++ b/onnxruntime/core/providers/cuda/tensor/concat.cc @@ -77,7 +77,8 @@ Status Concat::ComputeInternal(OpKernelContext* ctx) const { int block_size_inside_axis_dim = static_cast(p.output_axis_pitch / p.output_tensor->Shape()[p.axis]); int block_size_including_axis_dim = static_cast(p.output_axis_pitch); auto element_bytes = p.output_tensor->DataType()->Size(); - ORT_RETURN_IF_ERROR(ConcatImpl(element_bytes, + ORT_RETURN_IF_ERROR(ConcatImpl(Stream(), + element_bytes, block_size_including_axis_dim, block_size_inside_axis_dim, concat_sizes_gpu.GpuPtr(), diff --git a/onnxruntime/core/providers/cuda/tensor/concat_impl.cu b/onnxruntime/core/providers/cuda/tensor/concat_impl.cu index 2a24efe9ca..6047f12189 100644 --- a/onnxruntime/core/providers/cuda/tensor/concat_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/concat_impl.cu @@ -38,7 +38,8 @@ __global__ void _ConcatKernel(const fast_divmod block_size_including_axis_dim_di output_data[id] = reinterpret_cast(input_ptr[input_index])[input_pos]; } -Status ConcatImpl(const size_t element_bytes, +Status ConcatImpl(cudaStream_t stream, + const size_t element_bytes, const int block_size_including_axis_dim, const int block_size_inside_axis_dim, const int64_t* concat_sizes, @@ -54,7 +55,7 @@ Status ConcatImpl(const size_t element_bytes, switch (element_bytes) { case sizeof(int8_t): - _ConcatKernel<<>>( + _ConcatKernel<<>>( block_size_including_axis_dim_div, block_size_inside_axis_dim_div, concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping, reinterpret_cast(output_data), @@ -62,7 +63,7 @@ Status ConcatImpl(const size_t element_bytes, (CUDA_LONG)N); break; case sizeof(int16_t): - _ConcatKernel<<>>( + _ConcatKernel<<>>( block_size_including_axis_dim_div, block_size_inside_axis_dim_div, concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping, reinterpret_cast(output_data), @@ -70,7 +71,7 @@ Status ConcatImpl(const size_t element_bytes, 
(CUDA_LONG)N); break; case sizeof(int32_t): - _ConcatKernel<<>>( + _ConcatKernel<<>>( block_size_including_axis_dim_div, block_size_inside_axis_dim_div, concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping, reinterpret_cast(output_data), @@ -78,7 +79,7 @@ Status ConcatImpl(const size_t element_bytes, (CUDA_LONG)N); break; case sizeof(int64_t): - _ConcatKernel<<>>( + _ConcatKernel<<>>( block_size_including_axis_dim_div, block_size_inside_axis_dim_div, concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping, reinterpret_cast(output_data), diff --git a/onnxruntime/core/providers/cuda/tensor/concat_impl.h b/onnxruntime/core/providers/cuda/tensor/concat_impl.h index 110bf5bf32..2a3b6ba9f9 100644 --- a/onnxruntime/core/providers/cuda/tensor/concat_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/concat_impl.h @@ -9,7 +9,8 @@ namespace onnxruntime { namespace cuda { -Status ConcatImpl(const size_t element_bytes, +Status ConcatImpl(cudaStream_t stream, + const size_t element_bytes, const int block_size_including_axis_dim, const int block_size_inside_axis_dim, const int64_t* concat_sizes, diff --git a/onnxruntime/core/providers/cuda/tensor/expand.cc b/onnxruntime/core/providers/cuda/tensor/expand.cc index a4040261d4..8ee8e3df91 100644 --- a/onnxruntime/core/providers/cuda/tensor/expand.cc +++ b/onnxruntime/core/providers/cuda/tensor/expand.cc @@ -98,6 +98,7 @@ Status Expand::ComputeInternal(OpKernelContext* ctx) const { } return ExpandImpl( + Stream(), input_data_tensor.DataType()->Size(), gsl::narrow_cast(output_shape.Size()), gsl::narrow_cast(input_data_tensor.Shape().Size()), diff --git a/onnxruntime/core/providers/cuda/tensor/expand_impl.cu b/onnxruntime/core/providers/cuda/tensor/expand_impl.cu index 79a7dababa..fe7716696a 100644 --- a/onnxruntime/core/providers/cuda/tensor/expand_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/expand_impl.cu @@ -22,11 +22,11 @@ __global__ void _FillFromDataPtrKernel(T* output_data, const T* 
input_data, CUDA } template -void FillFromDataPtr(T* output_data, const T* input_data, int64_t count) { +void FillFromDataPtr(cudaStream_t stream, T* output_data, const T* input_data, int64_t count) { int blocksPerGrid = gsl::narrow_cast(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); CUDA_LONG N = static_cast(count); _FillFromDataPtrKernel - <<>>(output_data, input_data, N); + <<>>(output_data, input_data, N); } template @@ -89,10 +89,11 @@ __global__ void ExpandKernel( } } -Status ExpandByFill(const size_t element_size, const int N, const void* input_data, void* output_data) { +Status ExpandByFill(cudaStream_t stream, const size_t element_size, const int N, const void* input_data, void* output_data) { #define EXPAND_FILL_ON(TYPE) \ case sizeof(TYPE): \ - FillFromDataPtr(reinterpret_cast(output_data), \ + FillFromDataPtr(stream, \ + reinterpret_cast(output_data), \ reinterpret_cast(input_data), \ static_cast(N)); \ break @@ -109,6 +110,7 @@ Status ExpandByFill(const size_t element_size, const int N, const void* input_da } Status Expand2D( + cudaStream_t stream, const size_t element_size, const int N, const void* input_data, @@ -118,7 +120,7 @@ Status Expand2D( const int input_view_stride1) { #define EXPAND2D_ON(TYPE) \ case sizeof(TYPE): \ - ExpandKernel2D<<>>( \ + ExpandKernel2D<<>>( \ N, reinterpret_cast(input_data), reinterpret_cast(output_data), \ fdm_output_stride0, input_view_stride0, input_view_stride1); \ break @@ -136,6 +138,7 @@ Status Expand2D( } Status ExpandImpl( + cudaStream_t stream, const size_t element_size, const int N_output, const int N_input, @@ -146,12 +149,12 @@ Status ExpandImpl( const int rank = static_cast(output_strides.Size()); if (rank == 1) { if (N_input == N_output) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, input_data, N_output * element_size, cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, input_data, N_output * element_size, cudaMemcpyDeviceToDevice, 
stream)); } else { // N_input == 1 - return ExpandByFill(element_size, N_output, input_data, output_data); + return ExpandByFill(stream, element_size, N_output, input_data, output_data); } } else if (rank == 2) { - return Expand2D(element_size, N_output, input_data, output_data, + return Expand2D(stream, element_size, N_output, input_data, output_data, output_strides[0], static_cast(input_strides[0]), static_cast(input_strides[1])); @@ -162,7 +165,7 @@ Status ExpandImpl( #define EXPAND_ON(TYPE) \ case sizeof(TYPE): \ ExpandKernel \ - <<>>( \ + <<>>( \ rank, N_output, reinterpret_cast(input_data), reinterpret_cast(output_data), \ output_strides, input_strides); \ break diff --git a/onnxruntime/core/providers/cuda/tensor/expand_impl.h b/onnxruntime/core/providers/cuda/tensor/expand_impl.h index 27d5d69d9c..e64c601323 100644 --- a/onnxruntime/core/providers/cuda/tensor/expand_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/expand_impl.h @@ -12,6 +12,7 @@ namespace onnxruntime { namespace cuda { Status ExpandImpl( + cudaStream_t stream, const size_t element_size, const int N_output, const int N_input, diff --git a/onnxruntime/core/providers/cuda/tensor/eye_like.cc b/onnxruntime/core/providers/cuda/tensor/eye_like.cc index 61ae265d60..82ea145da9 100644 --- a/onnxruntime/core/providers/cuda/tensor/eye_like.cc +++ b/onnxruntime/core/providers/cuda/tensor/eye_like.cc @@ -35,6 +35,7 @@ ONNX_OPERATOR_KERNEL_EX( #define TYPED_FUNCTION_CALL(T) \ EyeLikeImpl::MappedType>( \ + Stream(), \ offset, \ dim1 + 1, \ reinterpret_cast::MappedType*>(T2->template MutableData()), \ @@ -52,7 +53,7 @@ Status EyeLike::ComputeInternal(OpKernelContext* context) const { // set output tensor shape same as input tensor and set all values to zero auto* T2 = context->Output(0, input_dims); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(T2->MutableDataRaw(), 0, T2->SizeInBytes())); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(T2->MutableDataRaw(), 0, T2->SizeInBytes(), Stream())); auto dim0 = input_dims[0]; 
auto dim1 = input_dims[1]; diff --git a/onnxruntime/core/providers/cuda/tensor/eye_like_impl.cu b/onnxruntime/core/providers/cuda/tensor/eye_like_impl.cu index 8f1216e43c..a3e588a288 100644 --- a/onnxruntime/core/providers/cuda/tensor/eye_like_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/eye_like_impl.cu @@ -23,6 +23,7 @@ __global__ void _EyeLikeKernel( template void EyeLikeImpl( + cudaStream_t stream, size_t offset, size_t stripe, T* output_data, @@ -31,11 +32,12 @@ void EyeLikeImpl( int blocksPerGrid = (int)(ceil(static_cast(diag_count) / block_size)); CUDA_LONG N = static_cast(diag_count); - _EyeLikeKernel<<>>(offset, stripe, output_data, N); + _EyeLikeKernel<<>>(offset, stripe, output_data, N); } #define SPECIALIZED_IMPL(T) \ template void EyeLikeImpl( \ + cudaStream_t stream, \ size_t offset, \ size_t stripe, \ T* output_data, \ diff --git a/onnxruntime/core/providers/cuda/tensor/eye_like_impl.h b/onnxruntime/core/providers/cuda/tensor/eye_like_impl.h index f95ca63782..db06a2d3ea 100644 --- a/onnxruntime/core/providers/cuda/tensor/eye_like_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/eye_like_impl.h @@ -12,6 +12,7 @@ namespace cuda { template void EyeLikeImpl( + cudaStream_t stream, size_t offset, // offset of first element in diagnal size_t stripe, // stripe, here it's width + 1 T* output_data, // output buffer diff --git a/onnxruntime/core/providers/cuda/tensor/flatten.cc b/onnxruntime/core/providers/cuda/tensor/flatten.cc index 0ac18a5dff..7f36a5fdb0 100644 --- a/onnxruntime/core/providers/cuda/tensor/flatten.cc +++ b/onnxruntime/core/providers/cuda/tensor/flatten.cc @@ -66,7 +66,7 @@ Status Flatten::ComputeInternal(OpKernelContext* ctx) const { void* target = Y->MutableDataRaw(); if (target != source) { CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target, source, X_shape.Size() * X->DataType()->Size(), - cudaMemcpyDeviceToDevice)); + cudaMemcpyDeviceToDevice, Stream())); } return Status::OK(); diff --git 
a/onnxruntime/core/providers/cuda/tensor/gather.cc b/onnxruntime/core/providers/cuda/tensor/gather.cc index d857c5719a..f648a59e06 100644 --- a/onnxruntime/core/providers/cuda/tensor/gather.cc +++ b/onnxruntime/core/providers/cuda/tensor/gather.cc @@ -77,6 +77,7 @@ Status Gather::ComputeInternal(OpKernelContext* context) const { if (p.indices_tensor->IsDataType() || p.indices_tensor->IsDataType()) { GatherImpl( + Stream(), input_block_size, indices_max, divmod_output_block_size, diff --git a/onnxruntime/core/providers/cuda/tensor/gather_elements.cc b/onnxruntime/core/providers/cuda/tensor/gather_elements.cc index 82da342db7..6ada2248f7 100644 --- a/onnxruntime/core/providers/cuda/tensor/gather_elements.cc +++ b/onnxruntime/core/providers/cuda/tensor/gather_elements.cc @@ -77,6 +77,7 @@ Status GatherElements::ComputeInternal(OpKernelContext* context) const { if (indices_tensor->IsDataType() || indices_tensor->IsDataType()) { GatherElementsImpl( + Stream(), input_rank, input_tensor->DataRaw(), input_dims[axis], diff --git a/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu b/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu index fc4cc644c1..87920a7fcb 100644 --- a/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu @@ -83,6 +83,7 @@ __global__ void _GatherElementsKernel( } void GatherElementsImpl( + cudaStream_t stream, const int64_t rank, const void* input_data, const int64_t input_dim_along_axis, @@ -103,7 +104,7 @@ void GatherElementsImpl( switch (element_size) { case sizeof(int8_t): { using CudaType = typename ToCudaType::MappedType; - _GatherElementsKernel<<>>( + _GatherElementsKernel<<>>( rank, reinterpret_cast(input_data), input_dim_along_axis, input_strides, indices_data, indices_size, index_element_size, indices_strides, axis, reinterpret_cast(output_data)); @@ -111,7 +112,7 @@ void GatherElementsImpl( case sizeof(int16_t): { using CudaType = typename 
ToCudaType::MappedType; - _GatherElementsKernel<<>>( + _GatherElementsKernel<<>>( rank, reinterpret_cast(input_data), input_dim_along_axis, input_strides, indices_data, indices_size, index_element_size, indices_strides, axis, reinterpret_cast(output_data)); @@ -119,7 +120,7 @@ void GatherElementsImpl( case sizeof(int32_t): { using CudaType = typename ToCudaType::MappedType; - _GatherElementsKernel<<>>( + _GatherElementsKernel<<>>( rank, reinterpret_cast(input_data), input_dim_along_axis, input_strides, indices_data, indices_size, index_element_size, indices_strides, axis, reinterpret_cast(output_data)); @@ -127,7 +128,7 @@ void GatherElementsImpl( case sizeof(int64_t): { using CudaType = typename ToCudaType::MappedType; - _GatherElementsKernel<<>>( + _GatherElementsKernel<<>>( rank, reinterpret_cast(input_data), input_dim_along_axis, input_strides, indices_data, indices_size, index_element_size, indices_strides, axis, reinterpret_cast(output_data)); diff --git a/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.h b/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.h index 1caaea647c..920415678b 100644 --- a/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/gather_elements_impl.h @@ -10,6 +10,7 @@ namespace onnxruntime { namespace cuda { void GatherElementsImpl( + cudaStream_t stream, const int64_t rank, // both inputs have same rank and this is validated in the main Compute const void* input_data, const int64_t input_dim_along_axis, diff --git a/onnxruntime/core/providers/cuda/tensor/gather_impl.cu b/onnxruntime/core/providers/cuda/tensor/gather_impl.cu index aa42e6d6fb..2fb91e7ce5 100644 --- a/onnxruntime/core/providers/cuda/tensor/gather_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/gather_impl.cu @@ -52,6 +52,7 @@ __global__ void _GatherKernel( } void GatherImpl( + cudaStream_t stream, const int64_t input_block_size, const int64_t indices_max, const fast_divmod& 
output_block_size, @@ -68,28 +69,28 @@ void GatherImpl( switch (element_size) { case sizeof(int8_t): { using CudaType = typename ToCudaType::MappedType; - _GatherKernel<<>>( + _GatherKernel<<>>( input_block_size, indices_max, output_block_size, block_size, indices_data, index_element_size, reinterpret_cast(input_data), reinterpret_cast(output_data), (CUDA_LONG)N); } break; case sizeof(int16_t): { using CudaType = typename ToCudaType::MappedType; - _GatherKernel<<>>( + _GatherKernel<<>>( input_block_size, indices_max, output_block_size, block_size, indices_data, index_element_size, reinterpret_cast(input_data), reinterpret_cast(output_data), (CUDA_LONG)N); } break; case sizeof(int32_t): { using CudaType = typename ToCudaType::MappedType; - _GatherKernel<<>>( + _GatherKernel<<>>( input_block_size, indices_max, output_block_size, block_size, indices_data, index_element_size, reinterpret_cast(input_data), reinterpret_cast(output_data), (CUDA_LONG)N); } break; case sizeof(int64_t): { using CudaType = typename ToCudaType::MappedType; - _GatherKernel<<>>( + _GatherKernel<<>>( input_block_size, indices_max, output_block_size, block_size, indices_data, index_element_size, reinterpret_cast(input_data), reinterpret_cast(output_data), (CUDA_LONG)N); diff --git a/onnxruntime/core/providers/cuda/tensor/gather_impl.h b/onnxruntime/core/providers/cuda/tensor/gather_impl.h index 11af5c3888..03fd1dee46 100644 --- a/onnxruntime/core/providers/cuda/tensor/gather_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/gather_impl.h @@ -9,6 +9,7 @@ namespace onnxruntime { namespace cuda { void GatherImpl( + cudaStream_t stream, const int64_t input_block_size, const int64_t indices_max, const fast_divmod& output_block_size, diff --git a/onnxruntime/core/providers/cuda/tensor/gather_nd.cc b/onnxruntime/core/providers/cuda/tensor/gather_nd.cc index 209fd57eca..1fd4b3f89e 100644 --- a/onnxruntime/core/providers/cuda/tensor/gather_nd.cc +++ b/onnxruntime/core/providers/cuda/tensor/gather_nd.cc 
@@ -40,6 +40,7 @@ Status CheckBatchDimensionsMatch( template Status GatherNDBase::PrepareCompute( + cudaStream_t stream, const int64_t batch_dims, const TensorShape& input_shape, const TensorShape& indices_shape, @@ -70,13 +71,14 @@ Status GatherNDBase::PrepareCompute( sizes_from_slice_dims_buffer.get(), sizes_from_slice_dims.data(), sizes_from_slice_dims.size() * sizeof(int64_t), - cudaMemcpyHostToDevice)); + cudaMemcpyHostToDevice, stream)); input_slice_offsets_buffer = GetScratchBuffer(num_slices); TArray input_dims(input_shape.GetDims()); ComputeSliceOffsetsImpl( + stream, batch_dims, input_dims, num_slices, @@ -145,13 +147,15 @@ REGISTER_KERNEL_VERSIONED_TYPED_GATHER_ND(int64_t, 12, 12) template struct GatherNDComputeImpl { - void operator()(const int64_t num_slices, + void operator()(cudaStream_t stream, + const int64_t num_slices, const int64_t slice_size, const void* const kernel_input_data, void* const kernel_output_data, int64_t* const input_slice_offsets_data) const { typedef typename ToCudaType::MappedType CudaT; - GatherNDImpl(num_slices, kernel_input_data, + GatherNDImpl(stream, + num_slices, kernel_input_data, kernel_output_data, slice_size, input_slice_offsets_data); } @@ -191,14 +195,15 @@ Status GatherND::ComputeInternal(OpKernelContext* context) const { int64_t num_slices; int64_t slice_size; IAllocatorUniquePtr input_slice_offsets_buffer; - ORT_RETURN_IF_ERROR(PrepareCompute(batch_dims_, input_shape, indices_shape, indices_tensor, + ORT_RETURN_IF_ERROR(PrepareCompute(Stream(), + batch_dims_, input_shape, indices_shape, indices_tensor, num_slices, slice_size, input_slice_offsets_buffer)); const void* const kernel_input_data = input_tensor->DataRaw(); void* const kernel_output_data = output_tensor->MutableDataRaw(); utils::MLTypeCallDispatcher t_disp(input_tensor->GetElementType()); - t_disp.Invoke(num_slices, slice_size, kernel_input_data, kernel_output_data, input_slice_offsets_buffer.get()); + t_disp.Invoke(Stream(), num_slices, slice_size, 
kernel_input_data, kernel_output_data, input_slice_offsets_buffer.get()); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/tensor/gather_nd.h b/onnxruntime/core/providers/cuda/tensor/gather_nd.h index 56414fad69..527a4b8c54 100644 --- a/onnxruntime/core/providers/cuda/tensor/gather_nd.h +++ b/onnxruntime/core/providers/cuda/tensor/gather_nd.h @@ -23,6 +23,7 @@ class GatherNDBase : public CudaKernel { protected: template Status PrepareCompute( + cudaStream_t stream, const int64_t batch_dims, const TensorShape& input_shape, const TensorShape& indices_shape, diff --git a/onnxruntime/core/providers/cuda/tensor/gather_nd_impl.cu b/onnxruntime/core/providers/cuda/tensor/gather_nd_impl.cu index c0323acaec..3f0275547c 100644 --- a/onnxruntime/core/providers/cuda/tensor/gather_nd_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/gather_nd_impl.cu @@ -52,6 +52,7 @@ __global__ void _GatherNDKernel( template void ComputeSliceOffsetsImpl( + cudaStream_t stream, const int64_t batch_dims, const TArray input_dims, const size_t num_slices, @@ -62,7 +63,7 @@ void ComputeSliceOffsetsImpl( const TIndex* const indices_data, // num_slices * num_slice_dims elements int64_t* const input_slice_offsets_data) { // num_slices elements const unsigned int blocks_per_grid = static_cast(CeilDiv(num_slices, GridDim::maxThreadsPerBlock)); - _ComputeSliceOffsetsKernel<<>>( + _ComputeSliceOffsetsKernel<<>>( batch_dims, input_dims, num_slices, @@ -76,18 +77,20 @@ void ComputeSliceOffsetsImpl( template void GatherNDImpl( + cudaStream_t stream, const size_t num_slices, const void* input_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data) { const unsigned int blocks_per_grid = static_cast(CeilDiv(num_slices * slice_size, GridDim::maxThreadsPerBlock)); - _GatherNDKernel<<>>( + _GatherNDKernel<<>>( num_slices, static_cast(input_data), static_cast(output_data), slice_size, input_slice_offsets_data); } #define 
SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(TIndex) \ template void ComputeSliceOffsetsImpl( \ + cudaStream_t stream, \ const int64_t batch_dims, \ const TArray input_dims, \ const size_t num_slices, \ @@ -99,7 +102,7 @@ void GatherNDImpl( int64_t* const input_slice_offsets_data); #define SPECIALIZED_IMPL(T) \ - template void GatherNDImpl(const size_t num_slices, const void* input_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data); + template void GatherNDImpl(cudaStream_t stream, const size_t num_slices, const void* input_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data); SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(int32_t) SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(int64_t) diff --git a/onnxruntime/core/providers/cuda/tensor/gather_nd_impl.h b/onnxruntime/core/providers/cuda/tensor/gather_nd_impl.h index e989fb330a..828f6ab6af 100644 --- a/onnxruntime/core/providers/cuda/tensor/gather_nd_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/gather_nd_impl.h @@ -9,6 +9,7 @@ namespace cuda { template void ComputeSliceOffsetsImpl( + cudaStream_t stream, const int64_t batch_dims, const TArray input_dims, const size_t num_slices, @@ -21,6 +22,7 @@ void ComputeSliceOffsetsImpl( template void GatherNDImpl( + cudaStream_t stream, const size_t num_slices, const void* input_data, void* output_data, @@ -30,6 +32,7 @@ void GatherNDImpl( #ifdef ENABLE_TRAINING template void GatherNDGradImpl( + cudaStream_t stream, const size_t num_slices, const void* update_data, void* output_data, diff --git a/onnxruntime/core/providers/cuda/tensor/identity_op.h b/onnxruntime/core/providers/cuda/tensor/identity_op.h index 3a6c48b31c..f00bb6414d 100644 --- a/onnxruntime/core/providers/cuda/tensor/identity_op.h +++ b/onnxruntime/core/providers/cuda/tensor/identity_op.h @@ -25,7 +25,7 @@ class IdentityOp final : public CudaKernel { void* target = Y->MutableDataRaw(X_type); //If source and target pointers are not equal, we need to 
copy the data. if (target != source) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target, source, X->Shape().Size() * X->DataType()->Size(), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target, source, X->Shape().Size() * X->DataType()->Size(), cudaMemcpyDeviceToDevice, Stream())); } if (is_dropout) { @@ -39,7 +39,7 @@ class IdentityOp final : public CudaKernel { void* mask_data = mask->MutableDataRaw(); // In 'test'/'inference' mode, there are no input values dropped out // so fill the buffer with 0/false - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(mask_data, 0, mask->SizeInBytes())); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(mask_data, 0, mask->SizeInBytes(), Stream())); } } diff --git a/onnxruntime/core/providers/cuda/tensor/nonzero_impl.cu b/onnxruntime/core/providers/cuda/tensor/nonzero_impl.cu index 1ac1ae79e9..90be2b8b27 100644 --- a/onnxruntime/core/providers/cuda/tensor/nonzero_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/nonzero_impl.cu @@ -17,15 +17,15 @@ int NonZeroCalcBlockCount(int64_t x_size) { } cudaError_t NonZeroCalcPrefixSumTempStorageBytes( - int* prefix_counts, int number_of_blocks, size_t& temp_storage_bytes) { + cudaStream_t stream, int* prefix_counts, int number_of_blocks, size_t& temp_storage_bytes) { temp_storage_bytes = 0; - return cub::DeviceScan::InclusiveSum(nullptr, temp_storage_bytes, prefix_counts, prefix_counts, number_of_blocks); + return cub::DeviceScan::InclusiveSum(nullptr, temp_storage_bytes, prefix_counts, prefix_counts, number_of_blocks, stream); } cudaError_t NonZeroInclusivePrefixSum( - void* d_temp_storage, size_t temp_storage_bytes, int* prefix_counts, int number_of_blocks) { + cudaStream_t stream, void* d_temp_storage, size_t temp_storage_bytes, int* prefix_counts, int number_of_blocks) { return cub::DeviceScan::InclusiveSum( - d_temp_storage, temp_storage_bytes, prefix_counts, prefix_counts, number_of_blocks); + d_temp_storage, temp_storage_bytes, prefix_counts, prefix_counts, number_of_blocks, 
stream); } template @@ -70,37 +70,37 @@ __global__ void NonZeroOutputPositionsKernel( } template -cudaError_t NonZeroCountEachBlock(const InputT* x, int64_t x_size, int* count_in_blocks) { +cudaError_t NonZeroCountEachBlock(cudaStream_t stream, const InputT* x, int64_t x_size, int* count_in_blocks) { int num_blocks = NonZeroCalcBlockCount(x_size); - NonZeroCountEachBlockKernel<<>>( + NonZeroCountEachBlockKernel<<>>( x, x_size, count_in_blocks); return cudaSuccess; } template cudaError_t NonZeroOutputPositions( - const InputT* x, int64_t x_size, int x_rank, const TArray& x_strides, + cudaStream_t stream, const InputT* x, int64_t x_size, int x_rank, const TArray& x_strides, const int* prefix_counts, int nonzero_elements, int64_t* results) { int num_blocks = NonZeroCalcBlockCount(x_size); - NonZeroOutputPositionsKernel<<>>( + NonZeroOutputPositionsKernel<<>>( x, x_size, x_rank, x_strides, prefix_counts, nonzero_elements, results); return cudaSuccess; } -template cudaError_t NonZeroCountEachBlock(const bool*, int64_t, int*); -template cudaError_t NonZeroCountEachBlock(const uint8_t*, int64_t, int*); -template cudaError_t NonZeroCountEachBlock(const int64_t*, int64_t, int*); -template cudaError_t NonZeroCountEachBlock(const int32_t*, int64_t, int*); -template cudaError_t NonZeroCountEachBlock(const float*, int64_t, int*); -template cudaError_t NonZeroCountEachBlock(const half*, int64_t, int*); +template cudaError_t NonZeroCountEachBlock(cudaStream_t stream, const bool*, int64_t, int*); +template cudaError_t NonZeroCountEachBlock(cudaStream_t stream, const uint8_t*, int64_t, int*); +template cudaError_t NonZeroCountEachBlock(cudaStream_t stream, const int64_t*, int64_t, int*); +template cudaError_t NonZeroCountEachBlock(cudaStream_t stream, const int32_t*, int64_t, int*); +template cudaError_t NonZeroCountEachBlock(cudaStream_t stream, const float*, int64_t, int*); +template cudaError_t NonZeroCountEachBlock(cudaStream_t stream, const half*, int64_t, int*); -template 
cudaError_t NonZeroOutputPositions(const bool*, int64_t, int, const TArray&, const int*, int, int64_t*); -template cudaError_t NonZeroOutputPositions(const uint8_t*, int64_t, int, const TArray&, const int*, int, int64_t*); -template cudaError_t NonZeroOutputPositions(const int64_t*, int64_t, int, const TArray&, const int*, int, int64_t*); -template cudaError_t NonZeroOutputPositions(const int32_t*, int64_t, int, const TArray&, const int*, int, int64_t*); -template cudaError_t NonZeroOutputPositions(const float*, int64_t, int, const TArray&, const int*, int, int64_t*); -template cudaError_t NonZeroOutputPositions(const half*, int64_t, int, const TArray&, const int*, int, int64_t*); +template cudaError_t NonZeroOutputPositions(cudaStream_t stream, const bool*, int64_t, int, const TArray&, const int*, int, int64_t*); +template cudaError_t NonZeroOutputPositions(cudaStream_t stream, const uint8_t*, int64_t, int, const TArray&, const int*, int, int64_t*); +template cudaError_t NonZeroOutputPositions(cudaStream_t stream, const int64_t*, int64_t, int, const TArray&, const int*, int, int64_t*); +template cudaError_t NonZeroOutputPositions(cudaStream_t stream, const int32_t*, int64_t, int, const TArray&, const int*, int, int64_t*); +template cudaError_t NonZeroOutputPositions(cudaStream_t stream, const float*, int64_t, int, const TArray&, const int*, int, int64_t*); +template cudaError_t NonZeroOutputPositions(cudaStream_t stream, const half*, int64_t, int, const TArray&, const int*, int, int64_t*); } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/nonzero_impl.h b/onnxruntime/core/providers/cuda/tensor/nonzero_impl.h index 7d55e83133..dfbe433bd5 100644 --- a/onnxruntime/core/providers/cuda/tensor/nonzero_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/nonzero_impl.h @@ -10,19 +10,19 @@ namespace cuda { int NonZeroCalcBlockCount(int64_t x_size); -cudaError_t NonZeroCalcPrefixSumTempStorageBytes(int* prefix_counts, int 
number_of_blocks, size_t& ); +cudaError_t NonZeroCalcPrefixSumTempStorageBytes(cudaStream_t stream, int* prefix_counts, int number_of_blocks, size_t& ); -cudaError_t NonZeroInclusivePrefixSum(void* d_temp_storage, size_t temp_storage_bytes, int* prefix_counts, int number_of_blocks); +cudaError_t NonZeroInclusivePrefixSum(cudaStream_t stream, void* d_temp_storage, size_t temp_storage_bytes, int* prefix_counts, int number_of_blocks); // count nonzero elements in each block into counts_in_blocks, // the counts_in_blocks buffer is pre-allocated on gpu first. template -cudaError_t NonZeroCountEachBlock(const InputT* x, int64_t x_size, int* counts_in_blocks); +cudaError_t NonZeroCountEachBlock(cudaStream_t stream, const InputT* x, int64_t x_size, int* counts_in_blocks); // output nonzero positions using input x and prefix_counts for each blocks template cudaError_t NonZeroOutputPositions( - const InputT *x, int64_t x_size, int x_rank, const TArray& x_strides, + cudaStream_t stream, const InputT *x, int64_t x_size, int x_rank, const TArray& x_strides, const int* prefix_counts, int nonzero_elements, int64_t* results); } // namespace cuda diff --git a/onnxruntime/core/providers/cuda/tensor/nonzero_op.cc b/onnxruntime/core/providers/cuda/tensor/nonzero_op.cc index 67cab1e5df..992fe5dfab 100644 --- a/onnxruntime/core/providers/cuda/tensor/nonzero_op.cc +++ b/onnxruntime/core/providers/cuda/tensor/nonzero_op.cc @@ -66,17 +66,18 @@ Status NonZero::ComputeInternal(OpKernelContext* context) const { const int number_of_blocks = NonZeroCalcBlockCount(x_size); auto prefix_buffer = GetScratchBuffer(number_of_blocks); int* prefix_counts = prefix_buffer.get(); - CUDA_RETURN_IF_ERROR(NonZeroCountEachBlock(x_data, x_size, prefix_counts)); + CUDA_RETURN_IF_ERROR(NonZeroCountEachBlock(Stream(), x_data, x_size, prefix_counts)); size_t temp_storage_bytes = 0; - CUDA_RETURN_IF_ERROR(NonZeroCalcPrefixSumTempStorageBytes(prefix_counts, number_of_blocks, temp_storage_bytes)); + 
CUDA_RETURN_IF_ERROR(NonZeroCalcPrefixSumTempStorageBytes(Stream(), prefix_counts, number_of_blocks, temp_storage_bytes)); auto temp_buffer = GetScratchBuffer(temp_storage_bytes); auto d_temp_storage = temp_buffer.get(); - CUDA_RETURN_IF_ERROR(NonZeroInclusivePrefixSum(d_temp_storage, temp_storage_bytes, prefix_counts, number_of_blocks)); + CUDA_RETURN_IF_ERROR(NonZeroInclusivePrefixSum(Stream(), d_temp_storage, temp_storage_bytes, prefix_counts, number_of_blocks)); - CUDA_RETURN_IF_ERROR(cudaMemcpy( + // cudaMemcpyAsync from device memory to pageable host memory will return only once the copy has completed. + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync( &nonzero_elements, prefix_counts + number_of_blocks - 1, - sizeof(int), cudaMemcpyDeviceToHost)); + sizeof(int), cudaMemcpyDeviceToHost, Stream())); TArray fdm_x_strides(x_rank); TensorPitches x_strides(x_dims); @@ -87,7 +88,7 @@ Status NonZero::ComputeInternal(OpKernelContext* context) const { auto* output_tensor = context->Output(0, {x_rank, nonzero_elements}); ORT_ENFORCE(output_tensor, "failed to get first output!"); CUDA_RETURN_IF_ERROR(NonZeroOutputPositions( - x_data, x_size, x_rank, fdm_x_strides, + Stream(), x_data, x_size, x_rank, fdm_x_strides, prefix_counts, nonzero_elements, output_tensor->template MutableData())); } else { context->Output(0, {x_rank, nonzero_elements}); diff --git a/onnxruntime/core/providers/cuda/tensor/onehot.cc b/onnxruntime/core/providers/cuda/tensor/onehot.cc index de68c6b752..7847a2309b 100644 --- a/onnxruntime/core/providers/cuda/tensor/onehot.cc +++ b/onnxruntime/core/providers/cuda/tensor/onehot.cc @@ -66,8 +66,9 @@ Status OneHotOp::ComputeInternal(OpKernelContext* auto* output_data = reinterpret_cast(output->MutableData()); if (values_data[0] == CudaT_Out(0.f)) { - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output->MutableDataRaw(), 0, output->SizeInBytes())); - OneHotWithZeroOffValueImpl(indices_data, + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output->MutableDataRaw(), 0, 
output->SizeInBytes(), Stream())); + OneHotWithZeroOffValueImpl(Stream(), + indices_data, fdm_suffix, depth_val, values_data[1], @@ -77,7 +78,8 @@ Status OneHotOp::ComputeInternal(OpKernelContext* } const fast_divmod fdm_depth_suffix(gsl::narrow_cast(depth_val * suffix_dim_size)); - OneHotImpl(indices_data, fdm_depth_suffix, fdm_suffix, depth_val, + OneHotImpl(Stream(), + indices_data, fdm_depth_suffix, fdm_suffix, depth_val, values_data[1], values_data[0], output_data, diff --git a/onnxruntime/core/providers/cuda/tensor/onehot.cu b/onnxruntime/core/providers/cuda/tensor/onehot.cu index 88cf5576dc..1fb8dbe8b8 100644 --- a/onnxruntime/core/providers/cuda/tensor/onehot.cu +++ b/onnxruntime/core/providers/cuda/tensor/onehot.cu @@ -56,6 +56,7 @@ __global__ void _OneHotWithZeroOffValueImpl( template void OneHotImpl( + cudaStream_t stream, const in_type* indices_data, const fast_divmod fdm_depth_suffix, const fast_divmod fdm_suffix, @@ -66,7 +67,7 @@ void OneHotImpl( size_t count) { int blocksPerGrid = (int)(ceil(static_cast(count) / GridDim::maxThreadsPerBlock)); CUDA_LONG N = static_cast(count); - _OneHotImpl<<>>( + _OneHotImpl<<>>( indices_data, fdm_depth_suffix, fdm_suffix, @@ -79,6 +80,7 @@ void OneHotImpl( template void OneHotWithZeroOffValueImpl( + cudaStream_t stream, const in_type* indices_data, const fast_divmod fdm_suffix, const int64_t depth_val, @@ -87,7 +89,7 @@ void OneHotWithZeroOffValueImpl( size_t count) { int blocksPerGrid = (int)(ceil(static_cast(count) / GridDim::maxThreadsPerBlock)); CUDA_LONG N = static_cast(count); - _OneHotWithZeroOffValueImpl<<>>( + _OneHotWithZeroOffValueImpl<<>>( indices_data, fdm_suffix, depth_val, @@ -98,6 +100,7 @@ void OneHotWithZeroOffValueImpl( #define SPECIALIZED_OneHotImpl(in_type, out_type) \ template void OneHotImpl( \ + cudaStream_t stream, \ const in_type* indices_data, \ const fast_divmod fdm_depth_suffix, \ const fast_divmod fdm_suffix, \ @@ -115,6 +118,7 @@ SPECIALIZED_OneHotImpl(int32_t, half) #define 
SPECIALIZED_OneHotWithZeroOffValueImpl(in_type, out_type) \ template void OneHotWithZeroOffValueImpl( \ + cudaStream_t stream, \ const in_type* indices_data, \ const fast_divmod fdm_suffix, \ const int64_t depth_val, \ diff --git a/onnxruntime/core/providers/cuda/tensor/onehot.h b/onnxruntime/core/providers/cuda/tensor/onehot.h index 55d7a961e0..fff0acd1f1 100644 --- a/onnxruntime/core/providers/cuda/tensor/onehot.h +++ b/onnxruntime/core/providers/cuda/tensor/onehot.h @@ -11,6 +11,7 @@ namespace cuda { template void OneHotImpl( + cudaStream_t stream, const in_type* indices, const fast_divmod fdm_depth_suffix, const fast_divmod fdm_suffix, @@ -22,6 +23,7 @@ void OneHotImpl( template void OneHotWithZeroOffValueImpl( + cudaStream_t stream, const in_type* indices, const fast_divmod fdm_suffix, const int64_t depth_val, diff --git a/onnxruntime/core/providers/cuda/tensor/pad.cc b/onnxruntime/core/providers/cuda/tensor/pad.cc index e870306c31..2e344ebb7e 100644 --- a/onnxruntime/core/providers/cuda/tensor/pad.cc +++ b/onnxruntime/core/providers/cuda/tensor/pad.cc @@ -123,7 +123,7 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { CUDA_RETURN_IF_ERROR(cudaMemcpyAsync( output_tensor.template MutableData(), input_tensor.template Data(), sizeof(typename ToCudaType::MappedType) * output_shape.Size(), - cudaMemcpyDeviceToDevice, 0)); + cudaMemcpyDeviceToDevice, Stream())); return Status::OK(); } @@ -134,6 +134,7 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { } PadImpl( + Stream(), dimension_count, input_dims, input_strides, diff --git a/onnxruntime/core/providers/cuda/tensor/pad_impl.cu b/onnxruntime/core/providers/cuda/tensor/pad_impl.cu index 400189b535..2e1820f198 100644 --- a/onnxruntime/core/providers/cuda/tensor/pad_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/pad_impl.cu @@ -69,6 +69,7 @@ __global__ void _PadKernel( template void PadImpl( + cudaStream_t stream, const size_t shape_rank, const TArray& input_dims, const TArray& 
input_strides, @@ -86,17 +87,17 @@ void PadImpl( int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); switch (pad_mode) { case 0: - _PadKernel<<>>( + _PadKernel<<>>( shape_rank, input_dims, input_strides, lower_pads, upper_pads, pad_value, input_data, fdm_output_strides, output_data, N); break; case 1: - _PadKernel<<>>( + _PadKernel<<>>( shape_rank, input_dims, input_strides, lower_pads, upper_pads, pad_value, input_data, fdm_output_strides, output_data, N); break; case 2: - _PadKernel<<>>( + _PadKernel<<>>( shape_rank, input_dims, input_strides, lower_pads, upper_pads, pad_value, input_data, fdm_output_strides, output_data, N); break; @@ -104,7 +105,7 @@ void PadImpl( } #define SPECIALIZED_IMPL(T) \ - template void PadImpl(const size_t shape_rank, const TArray& input_dims, const TArray& input_strides, const TArray& lower_pads, const TArray& upper_pads, const T pad_value, const int pad_mode, const T* input_data, const TArray& fdm_output_strides, T* output_data, const size_t N); + template void PadImpl(cudaStream_t stream, const size_t shape_rank, const TArray& input_dims, const TArray& input_strides, const TArray& lower_pads, const TArray& upper_pads, const T pad_value, const int pad_mode, const T* input_data, const TArray& fdm_output_strides, T* output_data, const size_t N); SPECIALIZED_IMPL(float) SPECIALIZED_IMPL(double) diff --git a/onnxruntime/core/providers/cuda/tensor/pad_impl.h b/onnxruntime/core/providers/cuda/tensor/pad_impl.h index 68365512d8..8be69dcb1f 100644 --- a/onnxruntime/core/providers/cuda/tensor/pad_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/pad_impl.h @@ -10,6 +10,7 @@ namespace cuda { template void PadImpl( + cudaStream_t stream, const size_t shape_rank, const TArray& input_dims, const TArray& input_strides, diff --git a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc index 3f321fadb0..17fafa0af5 100644 --- 
a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc +++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc @@ -32,7 +32,7 @@ Status QuantizeLinear::ComputeInternal(OpKernelContext* ctx) const { const CudaU* scale = reinterpret_cast(y_scale.template Data()); const auto num_of_elements = x_shape.Size(); - CudaQuantizeLinear(input, output, scale, zero_point, num_of_elements); + CudaQuantizeLinear(Stream(), input, output, scale, zero_point, num_of_elements); return Status::OK(); } @@ -59,7 +59,7 @@ Status DequantizeLinear::ComputeInternal(OpKernelContext* ctx) const { const CudaU* scale = reinterpret_cast(y_scale.template Data()); const auto num_of_elements = x_shape.Size(); - CudaDequantizeLinear(input, output, scale, zero_point, num_of_elements); + CudaDequantizeLinear(Stream(), input, output, scale, zero_point, num_of_elements); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu index 8f31ea9e01..ff300e4bda 100644 --- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu +++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu @@ -44,12 +44,12 @@ __global__ void QuantizeLinearKernel(const InT* input, OutT* output, const InT* } template -Status CudaQuantizeLinear(const InT* input, OutT* output, const InT* scale, const OutT* zero_point, size_t num_of_element) { +Status CudaQuantizeLinear(cudaStream_t stream, const InT* input, OutT* output, const InT* scale, const OutT* zero_point, size_t num_of_element) { if (num_of_element <= 0) return Status::OK(); int blocksPerGrid = static_cast(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); - QuantizeLinearKernel<<>>( + QuantizeLinearKernel<<>>( input, output, scale, @@ -75,12 +75,12 @@ __global__ void DequantizeLinearKernel(const InT* input, OutT* output, const Out } template -Status CudaDequantizeLinear(const InT* input, OutT* output, const OutT* scale, const 
InT* zero_point, size_t num_of_element) { +Status CudaDequantizeLinear(cudaStream_t stream, const InT* input, OutT* output, const OutT* scale, const InT* zero_point, size_t num_of_element) { if (num_of_element <= 0) return Status::OK(); int blocksPerGrid = static_cast(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); - DequantizeLinearKernel<<>>( + DequantizeLinearKernel<<>>( input, output, scale, @@ -89,15 +89,15 @@ Status CudaDequantizeLinear(const InT* input, OutT* output, const OutT* scale, c return Status::OK(); } -template Status CudaQuantizeLinear(const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element); -template Status CudaQuantizeLinear(const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element); -template Status CudaQuantizeLinear(const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element); -template Status CudaQuantizeLinear(const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element); +template Status CudaQuantizeLinear(cudaStream_t stream, const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element); +template Status CudaQuantizeLinear(cudaStream_t stream, const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element); +template Status CudaQuantizeLinear(cudaStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element); +template Status CudaQuantizeLinear(cudaStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element); -template Status CudaDequantizeLinear(const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element); -template Status CudaDequantizeLinear(const uint8_t* input, float* 
output, const float* scale, const uint8_t* zero_point, size_t num_of_element); -template Status CudaDequantizeLinear(const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element); -template Status CudaDequantizeLinear(const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element); +template Status CudaDequantizeLinear(cudaStream_t stream, const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element); +template Status CudaDequantizeLinear(cudaStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element); +template Status CudaDequantizeLinear(cudaStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element); +template Status CudaDequantizeLinear(cudaStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element); } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cuh b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cuh index 5d140981d6..b6773de316 100644 --- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cuh +++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cuh @@ -12,10 +12,10 @@ namespace onnxruntime { namespace cuda { template -Status CudaQuantizeLinear(const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element); +Status CudaQuantizeLinear(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element); template -Status CudaDequantizeLinear(const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element); +Status CudaDequantizeLinear(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element); } // namespace cuda } // namespace 
onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu index c93e8accd2..36d138f107 100644 --- a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu @@ -42,23 +42,29 @@ __device__ CudaFunctionNearestPixel func_NearestPixel_ROUND_PREFER_CEIL = Neares __device__ CudaFunctionNearestPixel func_NearestPixel_FLOOR = NearestPixel_FLOOR; __device__ CudaFunctionNearestPixel func_NearestPixel_CEIL = NearestPixel_CEIL; -CudaFunctionNearestPixel GetDeviceNearstPixelFunction(ResizeNearestMode nearest_mode) { +CudaFunctionNearestPixel GetDeviceNearstPixelFunction(cudaStream_t stream, ResizeNearestMode nearest_mode) { static bool already_copied = false; static std::mutex s_mutext; static CudaFunctionNearestPixel s_nearest_pixel[ResizeNearestMode::NearestModeCount]; if (!already_copied) { std::lock_guard lock(s_mutext); if (!already_copied) { - CUDA_CALL(cudaMemcpyFromSymbol(&s_nearest_pixel[ResizeNearestMode::SIMPLE], - func_NearestPixel_SIMPLE, sizeof(CudaFunctionNearestPixel))); - CUDA_CALL(cudaMemcpyFromSymbol(&s_nearest_pixel[ResizeNearestMode::ROUND_PREFER_FLOOR], - func_NearestPixel_ROUND_PREFER_FLOOR, sizeof(CudaFunctionNearestPixel))); - CUDA_CALL(cudaMemcpyFromSymbol(&s_nearest_pixel[ResizeNearestMode::ROUND_PREFER_CEIL], - func_NearestPixel_ROUND_PREFER_CEIL, sizeof(CudaFunctionNearestPixel))); - CUDA_CALL(cudaMemcpyFromSymbol(&s_nearest_pixel[ResizeNearestMode::FLOOR], - func_NearestPixel_FLOOR, sizeof(CudaFunctionNearestPixel))); - CUDA_CALL(cudaMemcpyFromSymbol(&s_nearest_pixel[ResizeNearestMode::CEIL], - func_NearestPixel_CEIL, sizeof(CudaFunctionNearestPixel))); + CUDA_CALL(cudaMemcpyFromSymbolAsync(&s_nearest_pixel[ResizeNearestMode::SIMPLE], + func_NearestPixel_SIMPLE, sizeof(CudaFunctionNearestPixel), + 0, cudaMemcpyDeviceToHost, stream)); + 
CUDA_CALL(cudaMemcpyFromSymbolAsync(&s_nearest_pixel[ResizeNearestMode::ROUND_PREFER_FLOOR], + func_NearestPixel_ROUND_PREFER_FLOOR, sizeof(CudaFunctionNearestPixel), + 0, cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaMemcpyFromSymbolAsync(&s_nearest_pixel[ResizeNearestMode::ROUND_PREFER_CEIL], + func_NearestPixel_ROUND_PREFER_CEIL, sizeof(CudaFunctionNearestPixel), + 0, cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaMemcpyFromSymbolAsync(&s_nearest_pixel[ResizeNearestMode::FLOOR], + func_NearestPixel_FLOOR, sizeof(CudaFunctionNearestPixel), + 0, cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaMemcpyFromSymbolAsync(&s_nearest_pixel[ResizeNearestMode::CEIL], + func_NearestPixel_CEIL, sizeof(CudaFunctionNearestPixel), + 0, cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaStreamSynchronize(stream)); already_copied = true; } } @@ -105,25 +111,32 @@ __device__ CudaFunctionOriginalCoordinate func_TransformCoordinate_ALIGN_CORNERS __device__ CudaFunctionOriginalCoordinate func_TransformCoordinate_TF_HALF_PIXEL_FOR_NN = TransformCoordinate_TF_HALF_PIXEL_FOR_NN; __device__ CudaFunctionOriginalCoordinate func_TransformCoordinate_TF_CROP_AND_RESIZE = TransformCoordinate_TF_CROP_AND_RESIZE; -CudaFunctionOriginalCoordinate GetDeviceOriginalCoordinateFunc(ResizeCoordinateTransformationMode coordinate_transform_mode) { +CudaFunctionOriginalCoordinate GetDeviceOriginalCoordinateFunc(cudaStream_t stream, ResizeCoordinateTransformationMode coordinate_transform_mode) { static bool already_copied = false; static std::mutex s_mutext; static CudaFunctionOriginalCoordinate s_coordinate_tranforms[ResizeCoordinateTransformationMode::CoordinateTransformationModeCount]; if (!already_copied) { std::lock_guard lock(s_mutext); if (!already_copied) { - CUDA_CALL(cudaMemcpyFromSymbol(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::HALF_PIXEL], - func_TransformCoordinate_HALF_PIXEL, sizeof(CudaFunctionOriginalCoordinate))); - 
CUDA_CALL(cudaMemcpyFromSymbol(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::ASYMMETRIC], - func_TransformCoordinate_ASYMMETRIC, sizeof(CudaFunctionOriginalCoordinate))); - CUDA_CALL(cudaMemcpyFromSymbol(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::PYTORCH_HALF_PIXEL], - func_TransformCoordinate_PYTORCH_HALF_PIXEL, sizeof(CudaFunctionOriginalCoordinate))); - CUDA_CALL(cudaMemcpyFromSymbol(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::ALIGN_CORNERS], - func_TransformCoordinate_ALIGN_CORNERS, sizeof(CudaFunctionOriginalCoordinate))); - CUDA_CALL(cudaMemcpyFromSymbol(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::TF_HALF_PIXEL_FOR_NN], - func_TransformCoordinate_TF_HALF_PIXEL_FOR_NN, sizeof(CudaFunctionOriginalCoordinate))); - CUDA_CALL(cudaMemcpyFromSymbol(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE], - func_TransformCoordinate_TF_CROP_AND_RESIZE, sizeof(CudaFunctionOriginalCoordinate))); + CUDA_CALL(cudaMemcpyFromSymbolAsync(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::HALF_PIXEL], + func_TransformCoordinate_HALF_PIXEL, sizeof(CudaFunctionOriginalCoordinate), + 0, cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaMemcpyFromSymbolAsync(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::ASYMMETRIC], + func_TransformCoordinate_ASYMMETRIC, sizeof(CudaFunctionOriginalCoordinate), + 0, cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaMemcpyFromSymbolAsync(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::PYTORCH_HALF_PIXEL], + func_TransformCoordinate_PYTORCH_HALF_PIXEL, sizeof(CudaFunctionOriginalCoordinate), + 0, cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaMemcpyFromSymbolAsync(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::ALIGN_CORNERS], + func_TransformCoordinate_ALIGN_CORNERS, sizeof(CudaFunctionOriginalCoordinate), + 0, cudaMemcpyDeviceToHost, stream)); + 
CUDA_CALL(cudaMemcpyFromSymbolAsync(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::TF_HALF_PIXEL_FOR_NN], + func_TransformCoordinate_TF_HALF_PIXEL_FOR_NN, sizeof(CudaFunctionOriginalCoordinate), + 0, cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaMemcpyFromSymbolAsync(&s_coordinate_tranforms[ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE], + func_TransformCoordinate_TF_CROP_AND_RESIZE, sizeof(CudaFunctionOriginalCoordinate), + 0, cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaStreamSynchronize(stream)); already_copied = true; } } @@ -591,6 +604,7 @@ size_t CalcResizeBufferSize(const onnxruntime::UpsampleMode upsample_mode, template void ResizeNearestImpl( + cudaStream_t stream, const int rank, TArray& input_shape, TArray& output_shape, @@ -611,7 +625,7 @@ void ResizeNearestImpl( unsigned int blocksPerGrid = static_cast(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); bool could2d = rank >= 2 && - transform_coordinate != GetDeviceOriginalCoordinateFunc(ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE) && + transform_coordinate != GetDeviceOriginalCoordinateFunc(stream, ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE) && std::all_of(scales_vals.Data(), scales_vals.Data() + (rank - 2), [](float v) { return v == 1.0; }); if (could2d) { int64_t output_height = output_shape[rank - 2]; @@ -619,7 +633,7 @@ void ResizeNearestImpl( fast_divmod div_output_image = (rank > 2) ? 
output_div_pitches[rank - 3] : fast_divmod(static_cast(output_height * output_width)); int blocksPerDimsMappingGrid = static_cast(ceil((output_height + output_width) / 32.0)); - _ResizeNearestMappingKernel2D<<>>( + _ResizeNearestMappingKernel2D<<>>( static_cast(input_shape[rank - 2]), static_cast(input_shape[rank - 1]), static_cast(output_height), static_cast(output_width), scales_vals[rank - 2], scales_vals[rank - 1], @@ -628,7 +642,7 @@ void ResizeNearestImpl( extrapolation_enabled, transform_coordinate, calc_nearest_pixel, dims_mapping); if (extrapolation_enabled) { - _ResizeNearestKernel2D<<>>( + _ResizeNearestKernel2D<<>>( output_height, output_width, input_shape[rank - 2] * input_shape[rank - 1], static_cast(input_shape[rank - 1]), div_output_image, output_div_pitches[rank - 2], @@ -636,7 +650,7 @@ void ResizeNearestImpl( extrapolation_value, dims_mapping); } else { - _ResizeNearestKernel2D<<>>( + _ResizeNearestKernel2D<<>>( output_height, output_width, input_shape[rank - 2] * input_shape[rank - 1], static_cast(input_shape[rank - 1]), div_output_image, output_div_pitches[rank - 2], @@ -649,14 +663,14 @@ void ResizeNearestImpl( int64_t total_dim_sum = std::accumulate(output_shape.Data(), output_shape.Data() + rank, (int64_t)0); int blocksPerDimsMappingGrid = (int)(ceil(static_cast(total_dim_sum) / 32)); - _ResizeNearestMappingKernel<<>>( + _ResizeNearestMappingKernel<<>>( rank, input_shape, output_shape, scales_vals, roi_vals, total_dim_sum, extrapolation_enabled, transform_coordinate, calc_nearest_pixel, reinterpret_cast(dims_mapping), reinterpret_cast(reinterpret_cast(dims_mapping) + rank)); - _ResizeNearestKernel<<>>( + _ResizeNearestKernel<<>>( rank, input_strides, output_div_pitches, input_data, output_data, N, extrapolation_value, @@ -667,6 +681,7 @@ void ResizeNearestImpl( template void ResizeImpl( + cudaStream_t stream, const UpsampleMode upsample_mode, const int rank, TArray& input_shape, @@ -688,15 +703,15 @@ void ResizeImpl( bool isSame = 
std::all_of(scales_vals.Data(), scales_vals.Data() + rank, [](float v) { return v == 1.0f; }) && (coordinate_transform_mode != ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE); if (isSame) { - cudaMemcpyAsync(output_data, input_data, N * sizeof(T), cudaMemcpyDeviceToDevice); + cudaMemcpyAsync(output_data, input_data, N * sizeof(T), cudaMemcpyDeviceToDevice, stream); return; } - CudaFunctionOriginalCoordinate transform_coordinate = GetDeviceOriginalCoordinateFunc(coordinate_transform_mode); - CudaFunctionNearestPixel calc_nearest_pixel = GetDeviceNearstPixelFunction(nearest_mode); + CudaFunctionOriginalCoordinate transform_coordinate = GetDeviceOriginalCoordinateFunc(stream, coordinate_transform_mode); + CudaFunctionNearestPixel calc_nearest_pixel = GetDeviceNearstPixelFunction(stream, nearest_mode); if (upsample_mode == UpsampleMode::NN) { ResizeNearestImpl( - rank, input_shape, output_shape, input_strides, output_div_pitches, + stream, rank, input_shape, output_shape, input_strides, output_div_pitches, scales_vals, roi_vals, input_data, output_data, N, extrapolation_enabled, extrapolation_value, cubic_coeff_a, transform_coordinate, calc_nearest_pixel, @@ -734,7 +749,7 @@ void ResizeImpl( switch (upsample_mode) { case UpsampleMode::LINEAR: if (is_2D) { - _ResizeBilinearCoordinateMapping<<>>( + _ResizeBilinearCoordinateMapping<<>>( input_shape[rank - 2], input_shape[rank - 1], output_height, output_width, scales_vals[rank - 2], scales_vals[rank - 1], @@ -742,7 +757,7 @@ void ResizeImpl( roi_vals[rank - 1], roi_vals[rank - 1 + rank], output_height + output_width, extrapolation_enabled, transform_coordinate, reinterpret_cast(dims_mapping)); - _ResizeBilinearKernel<<>>( + _ResizeBilinearKernel<<>>( input_shape[rank - 2], input_shape[rank - 1], output_height, output_width, output_div_pitches[rank - 2], div_output_image, @@ -750,7 +765,7 @@ void ResizeImpl( reinterpret_cast(dims_mapping)); return; } else if (is_3D) { - _ResizeTrilinearCoordinateMapping<<>>( + 
_ResizeTrilinearCoordinateMapping<<>>( input_shape[rank - 3] , input_shape[rank - 2], input_shape[rank - 1], output_depth, output_height, output_width, scales_vals[rank - 3], scales_vals[rank - 2], scales_vals[rank - 1], @@ -759,7 +774,7 @@ void ResizeImpl( roi_vals[rank - 1], roi_vals[rank - 1 + rank], output_depth + output_height + output_width, extrapolation_enabled, transform_coordinate, reinterpret_cast(dims_mapping)); - _ResizeTrilinearKernel<<>>( + _ResizeTrilinearKernel<<>>( input_shape[rank - 3], input_shape[rank - 2], input_shape[rank - 1], output_depth, output_height, output_width, output_div_pitches[rank - 3], output_div_pitches[rank - 2], div_output_image, @@ -772,7 +787,7 @@ void ResizeImpl( case UpsampleMode::CUBIC: if (is_2D) { - _ResizeCubicCoordinateMapping<<>>( + _ResizeCubicCoordinateMapping<<>>( input_shape[rank - 2], input_shape[rank - 1], output_height, output_width, scales_vals[rank - 2], scales_vals[rank - 1], @@ -781,7 +796,7 @@ void ResizeImpl( output_height + output_width, extrapolation_enabled, cubic_coeff_a, exclude_outside, transform_coordinate, reinterpret_cast(dims_mapping)); - _ResizeBiCubicKernel<<>>( + _ResizeBiCubicKernel<<>>( input_shape[rank - 2], input_shape[rank - 1], output_height, output_width, output_div_pitches[rank - 2], div_output_image, @@ -794,6 +809,7 @@ void ResizeImpl( #define SPECIALIZED_IMPL(T) \ template void ResizeImpl( \ + cudaStream_t stream, \ const UpsampleMode upsample_mode, \ const int rank, \ TArray& input_shape, \ diff --git a/onnxruntime/core/providers/cuda/tensor/resize_impl.h b/onnxruntime/core/providers/cuda/tensor/resize_impl.h index c82616d644..c2359c260c 100644 --- a/onnxruntime/core/providers/cuda/tensor/resize_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/resize_impl.h @@ -16,6 +16,7 @@ size_t CalcResizeBufferSize(const onnxruntime::UpsampleMode upsample_mode, template void ResizeImpl( + cudaStream_t stream, const onnxruntime::UpsampleMode upsample_mode, const int rank, TArray& 
input_shape, diff --git a/onnxruntime/core/providers/cuda/tensor/reverse_sequence.cc b/onnxruntime/core/providers/cuda/tensor/reverse_sequence.cc index f51d99c549..7f70c57503 100644 --- a/onnxruntime/core/providers/cuda/tensor/reverse_sequence.cc +++ b/onnxruntime/core/providers/cuda/tensor/reverse_sequence.cc @@ -20,8 +20,9 @@ ONNX_OPERATOR_KERNEL_EX( ReverseSequenceOp); #define ReverseSequenceCallCudaImplTypeAs(T, TEqual) \ - if (X.IsDataType()) { \ + if (X.IsDataType()) { \ CUDA_RETURN_IF_ERROR(ReverseSequenceCudaImpl( \ + Stream(), \ reinterpret_cast::MappedType*>(X.template Data()), \ seq_lengths.Data(), \ reinterpret_cast::MappedType*>(Y.template MutableData()), \ diff --git a/onnxruntime/core/providers/cuda/tensor/reverse_sequence_impl.cu b/onnxruntime/core/providers/cuda/tensor/reverse_sequence_impl.cu index b7de4c3323..4d37b6a206 100644 --- a/onnxruntime/core/providers/cuda/tensor/reverse_sequence_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/reverse_sequence_impl.cu @@ -52,6 +52,7 @@ __global__ void ReverseSequenceImplKernel( template cudaError_t ReverseSequenceCudaImpl( + cudaStream_t stream, const T* x_data, const int64_t* seq_len_data, T* y_data, @@ -66,11 +67,11 @@ cudaError_t ReverseSequenceCudaImpl( int blocksPerGrid = CeilDiv(group_count, GridDim::maxThreadsPerBlock); if (time_major) { - ReverseSequenceImplKernel<<>>( + ReverseSequenceImplKernel<<>>( x_data, seq_len_data, y_data, batch_size, max_seq_len, element_size, group_count, fdm_grouped_stride_0, fdm_grouped_stride_1); } else { - ReverseSequenceImplKernel<<>>( + ReverseSequenceImplKernel<<>>( x_data, seq_len_data, y_data, batch_size, max_seq_len, element_size, group_count, fdm_grouped_stride_0, fdm_grouped_stride_1); } @@ -79,6 +80,7 @@ cudaError_t ReverseSequenceCudaImpl( #define InstantiateReverseSequenceImpl(T) \ template cudaError_t ReverseSequenceCudaImpl( \ + cudaStream_t stream, \ const T* x_data, \ const int64_t* seq_len_data, \ T* y_data, \ diff --git 
a/onnxruntime/core/providers/cuda/tensor/reverse_sequence_impl.h b/onnxruntime/core/providers/cuda/tensor/reverse_sequence_impl.h index bc0973b50d..15268be59e 100644 --- a/onnxruntime/core/providers/cuda/tensor/reverse_sequence_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/reverse_sequence_impl.h @@ -11,6 +11,7 @@ namespace cuda { template cudaError_t ReverseSequenceCudaImpl( + cudaStream_t stream, const T* x_data, const int64_t* seq_len_data, T* y_data, diff --git a/onnxruntime/core/providers/cuda/tensor/scatter_elements.cc b/onnxruntime/core/providers/cuda/tensor/scatter_elements.cc index 5a31a472aa..ab04fb150a 100755 --- a/onnxruntime/core/providers/cuda/tensor/scatter_elements.cc +++ b/onnxruntime/core/providers/cuda/tensor/scatter_elements.cc @@ -49,7 +49,8 @@ ONNX_OPERATOR_KERNEL_EX( template struct ScatterElements::ComputeImpl { - Status operator()(const Tensor* data_tensor, + Status operator()(cudaStream_t stream, + const Tensor* data_tensor, const Tensor* updates_tensor, const Tensor* indices_tensor, Tensor* output_tensor, @@ -69,6 +70,7 @@ struct ScatterElements::ComputeImpl { if (utils::IsPrimitiveDataType(Tin_type)) { const int32_t* indices_data = indices_tensor->template Data(); return ScatterElementsImpl( + stream, rank, reinterpret_cast(input_data), input_data_size, @@ -84,6 +86,7 @@ struct ScatterElements::ComputeImpl { } else if (utils::IsPrimitiveDataType(Tin_type)) { const int64_t* indices_data = indices_tensor->template Data(); return ScatterElementsImpl( + stream, rank, reinterpret_cast(input_data), input_data_size, @@ -163,7 +166,7 @@ Status ScatterElements::ComputeInternal(OpKernelContext* context) const { utils::MLTypeCallDispatcherRet t_disp(data_tensor->GetElementType()); - return t_disp.Invoke(data_tensor, updates_tensor, indices_tensor, output_tensor, rank, + return t_disp.Invoke(Stream(), data_tensor, updates_tensor, indices_tensor, output_tensor, rank, input_data_size, buffer_input_dims, buffer_input_strides, indices_size, 
buffer_indices_dims, fdm_indices_strides, axis); } diff --git a/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.cu b/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.cu index 03e4d6afbf..c4536ba112 100755 --- a/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.cu @@ -145,6 +145,7 @@ static int CompactInputIndicesDims( template Status ScatterElementsImpl2D( + cudaStream_t stream, const T* input_data, const std::vector& input_dims, const Tin* indices_data, @@ -157,12 +158,12 @@ Status ScatterElementsImpl2D( int blocksPerGrid = gsl::narrow_cast(CeilDiv(indices_size, GridDim::maxThreadsPerBlock)); fast_divmod indices_stride_row(static_cast(indices_dims[1])); if (axis == 0) { - _ScatterElementsKernel2D<<>>( + _ScatterElementsKernel2D<<>>( gsl::narrow_cast(input_dims[0]), input_data, indices_data, indices_size, indices_stride_row, updates, input_dims[1], output_data, func); } else { - _ScatterElementsKernel2D<<>>( + _ScatterElementsKernel2D<<>>( gsl::narrow_cast(input_dims[1]), input_data, indices_data, indices_size, indices_stride_row, updates, input_dims[1], output_data, func); @@ -172,6 +173,7 @@ Status ScatterElementsImpl2D( template Status ScatterElementsImplInternal( + cudaStream_t stream, const int rank, const T* input_data, const int64_t input_size, @@ -186,7 +188,7 @@ Status ScatterElementsImplInternal( T* output_data, const FuncT& func) { if (input_data != output_data) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, input_data, input_size * sizeof(T), cudaMemcpyDeviceToDevice, 0)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, input_data, input_size * sizeof(T), cudaMemcpyDeviceToDevice, stream)); } if (indices_size > 0) { @@ -196,12 +198,12 @@ Status ScatterElementsImplInternal( rank, axis, buffer_input_dims.Data(), buffer_indices_dims.Data(), eff_input_dims, eff_indices_dims); if (eff_input_dims.size() == 2) { return ScatterElementsImpl2D( - 
input_data, eff_input_dims, indices_data, indices_size, eff_indices_dims, updates, new_axis, output_data, + stream, input_data, eff_input_dims, indices_data, indices_size, eff_indices_dims, updates, new_axis, output_data, func); } int blocksPerGrid = gsl::narrow_cast(CeilDiv(indices_size, GridDim::maxThreadsPerBlock)); - _ScatterElementsKernel<<>>( + _ScatterElementsKernel<<>>( rank, input_data, buffer_input_dims, buffer_input_strides, indices_data, indices_size, buffer_indices_dims, fdm_indices_strides, updates, axis, output_data, func); @@ -218,6 +220,7 @@ struct Func_Assignment { template Status ScatterElementsImpl( + cudaStream_t stream, const int rank, const T* input_data, const int64_t input_size, @@ -230,13 +233,14 @@ Status ScatterElementsImpl( const T* updates, const int axis, T* output_data) { - return ScatterElementsImplInternal(rank, input_data, input_size, buffer_input_dims, + return ScatterElementsImplInternal(stream, rank, input_data, input_size, buffer_input_dims, buffer_input_strides, indices_data, indices_size, buffer_indices_dims, fdm_indices_strides, updates, axis, output_data, Func_Assignment()); } #define SCATTER_ELEMENTS_SPECIALIZED_TINDEX_IMPL(T, TIndex) \ template Status ScatterElementsImpl( \ + cudaStream_t stream, \ const int rank, \ const T* input_data, \ const int64_t input_size, \ @@ -278,6 +282,7 @@ struct Func_AtomicAdd { template Status GatherElementsGradImpl( + cudaStream_t stream, const int rank, TArray& buffer_input_dims, TArray& buffer_input_strides, @@ -290,7 +295,7 @@ Status GatherElementsGradImpl( T* output_data) { // Give output_data as the input_data parameter by intention, // to skip input_data copy, which is not applicable for GatherElementsGrad. 
- return ScatterElementsImplInternal(rank, output_data, 0, + return ScatterElementsImplInternal(stream, rank, output_data, 0, buffer_input_dims, buffer_input_strides, indices_data, indices_size, buffer_indices_dims, fdm_indices_strides, updates, axis, output_data, Func_AtomicAdd()); @@ -298,6 +303,7 @@ Status GatherElementsGradImpl( #define GATHER_ELEMENTS_GRAD_SPECIALIZED_TINDEX_IMPL(T, TIndex) \ template Status GatherElementsGradImpl( \ + cudaStream_t stream, \ const int rank, \ TArray& buffer_input_dims, \ TArray& buffer_input_strides, \ diff --git a/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.h b/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.h index 5eea6ab808..8f4e676042 100755 --- a/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.h @@ -11,6 +11,7 @@ namespace cuda { template Status ScatterElementsImpl( + cudaStream_t stream, const int rank, const T* input_data, const int64_t input_size, diff --git a/onnxruntime/core/providers/cuda/tensor/scatter_nd.cc b/onnxruntime/core/providers/cuda/tensor/scatter_nd.cc index 07dd5df43b..d5f632a1bf 100644 --- a/onnxruntime/core/providers/cuda/tensor/scatter_nd.cc +++ b/onnxruntime/core/providers/cuda/tensor/scatter_nd.cc @@ -48,7 +48,7 @@ Status ScatterND::ComputeInternal(OpKernelContext* context) const { if (input_data != output_data) { // TODO: Run benchmarks to determine if a dedicated kernel doing data copy will be faster than invoking cudaMemcpy ? 
- cudaMemcpyAsync(output_data, input_data, input_tensor->SizeInBytes(), cudaMemcpyDeviceToDevice); + cudaMemcpyAsync(output_data, input_data, input_tensor->SizeInBytes(), cudaMemcpyDeviceToDevice, Stream()); } // Bail out early @@ -71,6 +71,7 @@ Status ScatterND::ComputeInternal(OpKernelContext* context) const { element_counts_and_input_dims_gpu.CopyToGpu(); ORT_RETURN_IF_ERROR(ScatterNDImpl( + Stream(), output_data, element_size, indices_shape.Size() / static_cast(last_index_dimension), diff --git a/onnxruntime/core/providers/cuda/tensor/scatter_nd_impl.cu b/onnxruntime/core/providers/cuda/tensor/scatter_nd_impl.cu index 213e8d9ed2..0651049a5f 100644 --- a/onnxruntime/core/providers/cuda/tensor/scatter_nd_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/scatter_nd_impl.cu @@ -52,6 +52,7 @@ __global__ void _ScatterNDKernel( } Status ScatterNDImpl( + cudaStream_t stream, void* output_data, const size_t element_size, const size_t num_indices, @@ -68,7 +69,7 @@ Status ScatterNDImpl( switch (element_size) { case sizeof(int8_t): - _ScatterNDKernel<<>>( + _ScatterNDKernel<<>>( reinterpret_cast(output_data), num_indices, indices_data, @@ -79,7 +80,7 @@ Status ScatterNDImpl( break; case sizeof(int16_t): - _ScatterNDKernel<<>>( + _ScatterNDKernel<<>>( reinterpret_cast(output_data), num_indices, indices_data, @@ -90,7 +91,7 @@ Status ScatterNDImpl( break; case sizeof(int32_t): - _ScatterNDKernel<<>>( + _ScatterNDKernel<<>>( reinterpret_cast(output_data), num_indices, indices_data, @@ -101,7 +102,7 @@ Status ScatterNDImpl( break; case sizeof(int64_t): - _ScatterNDKernel<<>>( + _ScatterNDKernel<<>>( reinterpret_cast(output_data), num_indices, indices_data, diff --git a/onnxruntime/core/providers/cuda/tensor/scatter_nd_impl.h b/onnxruntime/core/providers/cuda/tensor/scatter_nd_impl.h index de9bad886d..874d275f94 100644 --- a/onnxruntime/core/providers/cuda/tensor/scatter_nd_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/scatter_nd_impl.h @@ -9,6 +9,7 @@ namespace 
onnxruntime { namespace cuda { Status ScatterNDImpl( + cudaStream_t stream, void* output_data, const size_t element_size, const size_t num_indices, diff --git a/onnxruntime/core/providers/cuda/tensor/slice.cc b/onnxruntime/core/providers/cuda/tensor/slice.cc index d23a686c6f..cf4d7ad75c 100644 --- a/onnxruntime/core/providers/cuda/tensor/slice.cc +++ b/onnxruntime/core/providers/cuda/tensor/slice.cc @@ -83,7 +83,8 @@ REGISTER_V13_TYPED_SLICE(int32_t) REGISTER_V13_TYPED_SLICE(int64_t) REGISTER_V13_TYPED_SLICE(float) -static Status SliceImpCore(const void* input_data, void* output_data, +static Status SliceImpCore(cudaStream_t stream, + const void* input_data, void* output_data, size_t element_size, size_t dimension_count, const TArray& starts_buffer, const TArray& steps_buffer, const TArray& input_strides, const TArray& output_strides, @@ -92,7 +93,8 @@ static Status SliceImpCore(const void* input_data, void* output_data, return Status::OK(); } - return SliceImpl(element_size, + return SliceImpl(stream, + element_size, gsl::narrow_cast(dimension_count), starts_buffer, steps_buffer, @@ -146,7 +148,8 @@ static Status ComputeSliceStrides(const TensorShape& input_shape, return Status::OK(); } -Status Impl(const void* input_data, +Status Impl(cudaStream_t stream, + const void* input_data, const TensorShape& input_shape, void* output_data, SliceOp::PrepareForComputeMetadata& compute_metadata, @@ -163,7 +166,8 @@ Status Impl(const void* input_data, TensorShape output_shape(compute_metadata.output_dims_); - ORT_RETURN_IF_ERROR(SliceImpCore(input_data, + ORT_RETURN_IF_ERROR(SliceImpCore(stream, + input_data, output_data, element_size, gsl::narrow_cast(dimension_count), @@ -237,7 +241,8 @@ Status Slice::CallSliceImp(size_t element_size, size_t dimension_count, const auto* input_tensor = ctx->Input(0); auto* output_tensor = ctx->Output(0, output_shape); - return SliceImpCore(input_tensor->DataRaw(), + return SliceImpCore(Stream(), + input_tensor->DataRaw(), 
output_tensor->MutableDataRaw(), element_size, gsl::narrow_cast(dimension_count), diff --git a/onnxruntime/core/providers/cuda/tensor/slice.h b/onnxruntime/core/providers/cuda/tensor/slice.h index 8bbd5158b3..b43cbfee78 100644 --- a/onnxruntime/core/providers/cuda/tensor/slice.h +++ b/onnxruntime/core/providers/cuda/tensor/slice.h @@ -11,7 +11,8 @@ namespace cuda { namespace SliceCuda { -Status Impl(const void* input_data, +Status Impl(cudaStream_t stream, + const void* input_data, const TensorShape& input_shape, void* output_data, SliceOp::PrepareForComputeMetadata& prepare_metadata, diff --git a/onnxruntime/core/providers/cuda/tensor/slice_impl.cu b/onnxruntime/core/providers/cuda/tensor/slice_impl.cu index 5a74018852..f8d8a75ed4 100644 --- a/onnxruntime/core/providers/cuda/tensor/slice_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/slice_impl.cu @@ -61,7 +61,8 @@ __global__ void _SliceKernel(const TArray starts, } } -Status SliceImpl(const size_t element_size, +Status SliceImpl(cudaStream_t stream, + const size_t element_size, const int32_t dimension_count, const TArray& starts, const TArray& steps, @@ -70,11 +71,12 @@ Status SliceImpl(const size_t element_size, const void* input_data, void* output_data, const size_t N) { - return SliceImplEx(element_size, dimension_count, starts, steps, input_strides, output_strides, input_data, + return SliceImplEx(stream, element_size, dimension_count, starts, steps, input_strides, output_strides, input_data, output_data, N); } -Status SliceImplGrad(const size_t element_size, +Status SliceImplGrad(cudaStream_t stream, + const size_t element_size, const int32_t dimension_count, const TArray& starts, const TArray& steps, @@ -83,14 +85,14 @@ Status SliceImplGrad(const size_t element_size, const void* input_data, void* output_data, const size_t N) { - return SliceImplEx(element_size, dimension_count, starts, steps, input_strides, output_strides, input_data, + return SliceImplEx(stream, element_size, dimension_count, starts, 
steps, input_strides, output_strides, input_data, output_data, N); } #define HANDLE_DIMS(ELEMENT_TYPE, DIMS) \ case DIMS: { \ _SliceKernel \ - <<>>( \ + <<>>( \ starts, steps, input_strides, output_strides, \ reinterpret_cast::MappedType*>(input_data), \ reinterpret_cast::MappedType*>(output_data), \ @@ -112,7 +114,8 @@ Status SliceImplGrad(const size_t element_size, } break template -Status SliceImplEx(const size_t element_size, +Status SliceImplEx(cudaStream_t stream, + const size_t element_size, const int32_t dimension_count, const TArray& starts, const TArray& steps, diff --git a/onnxruntime/core/providers/cuda/tensor/slice_impl.h b/onnxruntime/core/providers/cuda/tensor/slice_impl.h index 33c6ae3e4c..d691d60f81 100644 --- a/onnxruntime/core/providers/cuda/tensor/slice_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/slice_impl.h @@ -9,7 +9,8 @@ namespace onnxruntime { namespace cuda { template -Status SliceImplEx(const size_t element_size, +Status SliceImplEx(cudaStream_t stream, + const size_t element_size, const int32_t dimension_count, const TArray& starts, const TArray& steps, @@ -19,7 +20,8 @@ Status SliceImplEx(const size_t element_size, void* output_data, const size_t N); -Status SliceImpl(const size_t element_size, +Status SliceImpl(cudaStream_t stream, + const size_t element_size, const int32_t dimension_count, const TArray& starts, const TArray& steps, @@ -29,7 +31,8 @@ Status SliceImpl(const size_t element_size, void* output_data, const size_t N); -Status SliceImplGrad(const size_t element_size, +Status SliceImplGrad(cudaStream_t stream, + const size_t element_size, const int32_t dimension_count, const TArray& starts, const TArray& steps, diff --git a/onnxruntime/core/providers/cuda/tensor/split.cc b/onnxruntime/core/providers/cuda/tensor/split.cc index 539f8016a6..708e3c0cfd 100644 --- a/onnxruntime/core/providers/cuda/tensor/split.cc +++ b/onnxruntime/core/providers/cuda/tensor/split.cc @@ -103,7 +103,8 @@ Status 
Split::ComputeInternal(OpKernelContext* ctx) const { axis_dimension_input_output_mapping_gpu.CopyToGpu(); size_t element_size = input_tensor->DataType()->Size(); - ORT_RETURN_IF_ERROR(SplitImpl(element_size, + ORT_RETURN_IF_ERROR(SplitImpl(Stream(), + element_size, block_size_including_axis_dim, block_size_inside_axis_dim, split_sizes_gpu.GpuPtr(), diff --git a/onnxruntime/core/providers/cuda/tensor/split_impl.cu b/onnxruntime/core/providers/cuda/tensor/split_impl.cu index 2c87126aa1..f1565428d6 100644 --- a/onnxruntime/core/providers/cuda/tensor/split_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/split_impl.cu @@ -39,7 +39,8 @@ __global__ void _SplitKernel(const fast_divmod block_size_including_axis_dim_div reinterpret_cast(output_ptr[output_index])[output_pos] = input_data[id]; } -Status SplitImpl(const size_t element_size, +Status SplitImpl(cudaStream_t stream, + const size_t element_size, const int block_size_including_axis_dim, const int block_size_inside_axis_dim, const int64_t* split_sizes, @@ -56,7 +57,7 @@ Status SplitImpl(const size_t element_size, switch (element_size) { case sizeof(int8_t): - _SplitKernel<<>>( + _SplitKernel<<>>( block_size_including_axis_dim_div, block_size_inside_axis_dim_div, split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs, reinterpret_cast::MappedType*>(input_data), @@ -64,7 +65,7 @@ Status SplitImpl(const size_t element_size, (CUDA_LONG)N); break; case sizeof(int16_t): - _SplitKernel<<>>( + _SplitKernel<<>>( block_size_including_axis_dim_div, block_size_inside_axis_dim_div, split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs, reinterpret_cast::MappedType*>(input_data), @@ -72,7 +73,7 @@ Status SplitImpl(const size_t element_size, (CUDA_LONG)N); break; case sizeof(int32_t): - _SplitKernel<<>>( + _SplitKernel<<>>( block_size_including_axis_dim_div, block_size_inside_axis_dim_div, split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs, 
reinterpret_cast::MappedType*>(input_data), @@ -80,7 +81,7 @@ Status SplitImpl(const size_t element_size, (CUDA_LONG)N); break; case sizeof(int64_t): - _SplitKernel<<>>( + _SplitKernel<<>>( block_size_including_axis_dim_div, block_size_inside_axis_dim_div, split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs, reinterpret_cast::MappedType*>(input_data), diff --git a/onnxruntime/core/providers/cuda/tensor/split_impl.h b/onnxruntime/core/providers/cuda/tensor/split_impl.h index fa07a68fb5..a8fde02549 100644 --- a/onnxruntime/core/providers/cuda/tensor/split_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/split_impl.h @@ -9,7 +9,8 @@ namespace onnxruntime { namespace cuda { -Status SplitImpl(const size_t element_size, +Status SplitImpl(cudaStream_t stream, + const size_t element_size, const int block_size_including_axis_dim, const int block_size_inside_axis_dim, const int64_t* split_sizes, diff --git a/onnxruntime/core/providers/cuda/tensor/squeeze.cc b/onnxruntime/core/providers/cuda/tensor/squeeze.cc index b6cd7317ea..3cd5eab410 100644 --- a/onnxruntime/core/providers/cuda/tensor/squeeze.cc +++ b/onnxruntime/core/providers/cuda/tensor/squeeze.cc @@ -68,7 +68,7 @@ Status Squeeze::ComputeInternal(OpKernelContext* ctx) const { auto count = X->Shape().Size(); auto element_bytes = X->DataType()->Size(); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output, input, count * element_bytes, cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output, input, count * element_bytes, cudaMemcpyDeviceToDevice, Stream())); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/tensor/tile.cc b/onnxruntime/core/providers/cuda/tensor/tile.cc index 7b68b19ce3..7d70baa3a9 100644 --- a/onnxruntime/core/providers/cuda/tensor/tile.cc +++ b/onnxruntime/core/providers/cuda/tensor/tile.cc @@ -72,7 +72,7 @@ Status Tile::ComputeInternal(OpKernelContext* ctx) const { // Repeat tensor has all 1s in it if (output_shape == input_shape) { - 
cudaMemcpyAsync(output_tensor.MutableDataRaw(), input_tensor.DataRaw(), input_tensor.SizeInBytes(), cudaMemcpyDeviceToDevice); + cudaMemcpyAsync(output_tensor.MutableDataRaw(), input_tensor.DataRaw(), input_tensor.SizeInBytes(), cudaMemcpyDeviceToDevice, Stream()); return Status::OK(); } @@ -91,6 +91,7 @@ Status Tile::ComputeInternal(OpKernelContext* ctx) const { if (input_tensor.IsDataType() || input_tensor.IsDataType()) { TileMemcpyImpl( + Stream(), reinterpret_cast::MappedType*>(input_data), input_shape.Size(), reinterpret_cast::MappedType*>(output_data), @@ -98,12 +99,14 @@ Status Tile::ComputeInternal(OpKernelContext* ctx) const { } else if (input_tensor.IsDataType() || input_tensor.IsDataType()) { TileMemcpyImpl( + Stream(), reinterpret_cast::MappedType*>(input_data), input_shape.Size(), reinterpret_cast::MappedType*>(output_data), output_shape.Size()); } else if (input_tensor.IsDataType()) { TileMemcpyImpl( + Stream(), reinterpret_cast::MappedType*>(input_data), input_shape.Size(), reinterpret_cast::MappedType*>(output_data), @@ -116,6 +119,7 @@ Status Tile::ComputeInternal(OpKernelContext* ctx) const { if (input_tensor.IsDataType() || input_tensor.IsDataType()) { TileBatchedMemcpyImpl( + Stream(), reinterpret_cast::MappedType*>(input_data), num_of_elements_per_batch, input_shape[0], // The tensor is atleast 1-D- this is safe @@ -125,6 +129,7 @@ Status Tile::ComputeInternal(OpKernelContext* ctx) const { } else if (input_tensor.IsDataType() || input_tensor.IsDataType()) { TileBatchedMemcpyImpl( + Stream(), reinterpret_cast::MappedType*>(input_data), num_of_elements_per_batch, input_shape[0], // The tensor is atleast 1-D- this is safe @@ -133,6 +138,7 @@ Status Tile::ComputeInternal(OpKernelContext* ctx) const { output_shape.Size()); } else if (input_tensor.IsDataType()) { TileBatchedMemcpyImpl( + Stream(), reinterpret_cast::MappedType*>(input_data), num_of_elements_per_batch, input_shape[0], // The tensor is atleast 1-D- this is safe @@ -169,6 +175,7 @@ 
Status Tile::ComputeInternal(OpKernelContext* ctx) const { if (input_tensor.IsDataType() || input_tensor.IsDataType()) { TileImpl( + Stream(), rank, fdm_input_shape, input_strides, @@ -179,6 +186,7 @@ Status Tile::ComputeInternal(OpKernelContext* ctx) const { } else if (input_tensor.IsDataType() || input_tensor.IsDataType()) { TileImpl( + Stream(), rank, fdm_input_shape, input_strides, @@ -188,6 +196,7 @@ Status Tile::ComputeInternal(OpKernelContext* ctx) const { output_tensor.Shape().Size()); } else if (input_tensor.IsDataType()) { TileImpl( + Stream(), rank, fdm_input_shape, input_strides, diff --git a/onnxruntime/core/providers/cuda/tensor/tile_impl.cu b/onnxruntime/core/providers/cuda/tensor/tile_impl.cu index a66db85a2f..d5b6cc931e 100644 --- a/onnxruntime/core/providers/cuda/tensor/tile_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/tile_impl.cu @@ -32,6 +32,7 @@ __global__ void _TileKernel( template void TileImpl( + cudaStream_t stream, const size_t shape_rank, const TArray& fdm_input_shape, const TArray& input_stride, @@ -40,7 +41,7 @@ void TileImpl( T* output_data, const size_t N) { int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); - _TileKernel<<>>( + _TileKernel<<>>( shape_rank, fdm_input_shape, input_stride, input_data, fdm_output_strides, output_data, (CUDA_LONG)N); } @@ -58,12 +59,13 @@ __global__ void _TileMemcpyKernel( template void TileMemcpyImpl( + cudaStream_t stream, const T* input_data, const size_t num_input_elements, T* output_data, const size_t num_output_elements) { int blocksPerGrid = (int)(ceil(static_cast(num_output_elements) / GridDim::maxThreadsPerBlock)); - _TileMemcpyKernel<<>>( + _TileMemcpyKernel<<>>( input_data, num_input_elements, output_data, (CUDA_LONG)num_output_elements); } @@ -84,6 +86,7 @@ __global__ void _TileBatchedMemcpyKernel( template void TileBatchedMemcpyImpl( + cudaStream_t stream, const T* input_data, const size_t num_of_elements_per_input_batch, const size_t 
num_input_batch_count, @@ -91,7 +94,7 @@ void TileBatchedMemcpyImpl( T* output_data, const size_t num_output_elements) { int blocksPerGrid = (int)(ceil(static_cast(num_output_elements) / GridDim::maxThreadsPerBlock)); - _TileBatchedMemcpyKernel<<>>( + _TileBatchedMemcpyKernel<<>>( input_data, num_of_elements_per_input_batch, num_input_batch_count, @@ -101,9 +104,9 @@ void TileBatchedMemcpyImpl( } #define SPECIALIZED_IMPL(T) \ - template void TileImpl(const size_t shape_rank, const TArray& fdm_input_shape, const TArray& input_stride, const T* input_data, const TArray& fdm_output_strides, T* output_data, const size_t N); \ - template void TileMemcpyImpl(const T* input_data, const size_t num_input_elements, T* output_data, const size_t num_output_elements); \ - template void TileBatchedMemcpyImpl(const T* input_data, const size_t num_of_elements_per_input_batch, const size_t num_input_batch_count, const fast_divmod& num_of_elements_per_output_batch, T* output_data, const size_t num_output_elements); + template void TileImpl(cudaStream_t stream, const size_t shape_rank, const TArray& fdm_input_shape, const TArray& input_stride, const T* input_data, const TArray& fdm_output_strides, T* output_data, const size_t N); \ + template void TileMemcpyImpl(cudaStream_t stream, const T* input_data, const size_t num_input_elements, T* output_data, const size_t num_output_elements); \ + template void TileBatchedMemcpyImpl(cudaStream_t stream, const T* input_data, const size_t num_of_elements_per_input_batch, const size_t num_input_batch_count, const fast_divmod& num_of_elements_per_output_batch, T* output_data, const size_t num_output_elements); SPECIALIZED_IMPL(float) SPECIALIZED_IMPL(double) diff --git a/onnxruntime/core/providers/cuda/tensor/tile_impl.h b/onnxruntime/core/providers/cuda/tensor/tile_impl.h index 27404c8d39..a612beabcf 100644 --- a/onnxruntime/core/providers/cuda/tensor/tile_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/tile_impl.h @@ -10,6 +10,7 @@ 
namespace cuda { template void TileImpl( + cudaStream_t stream, const size_t shape_rank, const TArray& input_shape, const TArray& input_strides, @@ -20,6 +21,7 @@ void TileImpl( template void TileMemcpyImpl( + cudaStream_t stream, const T* input_data, const size_t num_input_elements, T* output_data, @@ -27,6 +29,7 @@ void TileMemcpyImpl( template void TileBatchedMemcpyImpl( + cudaStream_t stream, const T* input_data, const size_t num_of_elements_per_input_batch, const size_t num_input_batch_count, diff --git a/onnxruntime/core/providers/cuda/tensor/transpose.cc b/onnxruntime/core/providers/cuda/tensor/transpose.cc index 8a90b60cd9..f5fb7c0147 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose.cc +++ b/onnxruntime/core/providers/cuda/tensor/transpose.cc @@ -54,14 +54,15 @@ static std::tuple TryTransposeWithCublas(const std::vector& pe } template -Status TransposeWithCublas(cublasHandle_t cublas_handle, const Tensor& input, Tensor& output, int M, int N) { +Status TransposeWithCublas(cudaStream_t stream, cublasHandle_t cublas_handle, const Tensor& input, Tensor& output, int M, int N) { typedef typename ToCudaType::MappedType CudaT; CudaT one = ToCudaType::FromFloat(1.0f); CudaT zero = ToCudaType::FromFloat(0.0f); const CudaT* input_data = reinterpret_cast(input.Data()); CudaT* output_data = reinterpret_cast(output.MutableData()); CUBLAS_RETURN_IF_ERROR( - cublasTransposeHelper(cublas_handle, + cublasTransposeHelper(stream, + cublas_handle, CUBLAS_OP_T, CUBLAS_OP_T, M, N, &one, input_data, @@ -76,10 +77,11 @@ Status TransposeWithCublas(cublasHandle_t cublas_handle, const Tensor& input, Te Status Transpose::DoTranspose(const Transpose& transpose_kernel, const std::vector& permutations, const Tensor& input, Tensor& output) { - return Transpose::DoTranspose(transpose_kernel.GetDeviceProp(), transpose_kernel.CublasHandle(), permutations, input, output); + return Transpose::DoTranspose(transpose_kernel.GetDeviceProp(), transpose_kernel.Stream(), 
transpose_kernel.CublasHandle(), permutations, input, output); } Status Transpose::DoTranspose(const cudaDeviceProp& prop, + cudaStream_t stream, const cublasHandle_t cublas_handle, const std::vector& permutations, const Tensor& input, Tensor& output, const TensorShape* input_shape_override) { @@ -96,11 +98,11 @@ Status Transpose::DoTranspose(const cudaDeviceProp& prop, int N = std::get<1>(mn); if (M != 0 && N != 0) { if (element_type == utils::GetONNXTensorElementDataType()) { - return TransposeWithCublas(cublas_handle, input, output, M, N); + return TransposeWithCublas(stream, cublas_handle, input, output, M, N); } else if (element_type == utils::GetONNXTensorElementDataType()) { - return TransposeWithCublas(cublas_handle, input, output, M, N); + return TransposeWithCublas(stream, cublas_handle, input, output, M, N); } else { - return TransposeWithCublas(cublas_handle, input, output, M, N); + return TransposeWithCublas(stream, cublas_handle, input, output, M, N); } } } @@ -162,14 +164,14 @@ Status Transpose::DoTranspose(const cudaDeviceProp& prop, size_t element_size = input.DataType()->Size(); if (CanDoTranspose3D(new_rank, new_input_dims, new_permutations)) { - return Transpose3DImpl(element_size, input_shape, tmp_input_strides, + return Transpose3DImpl(stream, element_size, input_shape, tmp_input_strides, input.DataRaw(), output.MutableDataRaw(), output.Shape().Size()); } else if (CanDoTranspose4D(prop, element_size, new_rank, new_input_dims, new_permutations)) { TArray tmp_output_strides(new_rank); for (auto i = 0; i < new_rank; i++) { tmp_output_strides[i] = new_output_strides[new_permutations[i]]; } - return Transpose4DImpl(element_size, input_shape, tmp_input_strides, input.DataRaw(), + return Transpose4DImpl(stream, element_size, input_shape, tmp_input_strides, input.DataRaw(), tmp_output_strides, output.MutableDataRaw(), gsl::narrow(output.Shape().Size())); } @@ -184,7 +186,7 @@ Status Transpose::DoTranspose(const cudaDeviceProp& prop, output_strides[i] 
= fast_divmod(gsl::narrow_cast(new_output_strides[i])); } - auto status = TransposeImpl(element_size, new_rank, input_strides, input.DataRaw(), + auto status = TransposeImpl(stream, element_size, new_rank, input_strides, input.DataRaw(), output_strides, output.MutableDataRaw(), gsl::narrow(output.Shape().Size())); return status; @@ -208,7 +210,7 @@ Status Transpose::ComputeInternal(OpKernelContext* ctx) const { TensorShape output_shape{output_dims}; Tensor* Y = ctx->Output(0, output_shape); - return DoTranspose(this->GetDeviceProp(), this->CublasHandle(), *p_perm, X, *Y); + return DoTranspose(this->GetDeviceProp(), this->Stream(), this->CublasHandle(), *p_perm, X, *Y); } } // namespace cuda diff --git a/onnxruntime/core/providers/cuda/tensor/transpose.h b/onnxruntime/core/providers/cuda/tensor/transpose.h index c9cd83e5a8..c9b41e9102 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose.h +++ b/onnxruntime/core/providers/cuda/tensor/transpose.h @@ -23,6 +23,7 @@ class Transpose final : public CudaKernel, public TransposeBase { // `input_shape_override` (if provided) overrides the shape of `input` for compute purposes static Status DoTranspose(const cudaDeviceProp& prop, + cudaStream_t stream, const cublasHandle_t cublas_handle, const std::vector& permutations, const Tensor& input, Tensor& output, const TensorShape* input_shape_override = nullptr); diff --git a/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu b/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu index 3e7b860fec..10611c9cd9 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/transpose_impl.cu @@ -40,7 +40,7 @@ bool CanDoTranspose3D(int32_t rank, return false; } -Status Transpose3DImpl(size_t element_size, +Status Transpose3DImpl(cudaStream_t stream, size_t element_size, const TArray& input_shape, const TArray& input_strides, const void* input_data, void* output_data, int64_t N) { dim3 block_size(TILE_DIM, TILE_DIM); 
@@ -48,25 +48,25 @@ Status Transpose3DImpl(size_t element_size, switch (element_size) { case sizeof(int8_t): - Transpose3DKernel<<>>( + Transpose3DKernel<<>>( input_shape, input_strides, reinterpret_cast::MappedType*>(input_data), reinterpret_cast::MappedType*>(output_data)); break; case sizeof(int16_t): - Transpose3DKernel<<>>( + Transpose3DKernel<<>>( input_shape, input_strides, reinterpret_cast::MappedType*>(input_data), reinterpret_cast::MappedType*>(output_data)); break; case sizeof(int32_t): - Transpose3DKernel<<>>( + Transpose3DKernel<<>>( input_shape, input_strides, reinterpret_cast::MappedType*>(input_data), reinterpret_cast::MappedType*>(output_data)); break; case sizeof(int64_t): - Transpose3DKernel<<>>( + Transpose3DKernel<<>>( input_shape, input_strides, reinterpret_cast::MappedType*>(input_data), reinterpret_cast::MappedType*>(output_data)); @@ -129,7 +129,7 @@ bool CanDoTranspose4D(const cudaDeviceProp& prop, return false; } -Status Transpose4DImpl(size_t element_size, const TArray& input_shape, const TArray& input_strides, const void* input_data, +Status Transpose4DImpl(cudaStream_t stream, size_t element_size, const TArray& input_shape, const TArray& input_strides, const void* input_data, const TArray& output_strides, void* output_data, int N) { unsigned int num_elements_per_thread = 4 * sizeof(int) / static_cast(element_size); // int4 is used in the kernel to access data. 
dim3 block_size(static_cast(input_shape[3] / num_elements_per_thread), static_cast(input_shape[2])); @@ -137,22 +137,22 @@ Status Transpose4DImpl(size_t element_size, const TArray& input_shape, switch (element_size) { case sizeof(int8_t): - Transpose4DKernel<<>>( + Transpose4DKernel<<>>( input_strides, input_data, output_strides, output_data, N / num_elements_per_thread); break; case sizeof(int16_t): - Transpose4DKernel<<>>( + Transpose4DKernel<<>>( input_strides, input_data, output_strides, output_data, N / num_elements_per_thread); break; case sizeof(int32_t): - Transpose4DKernel<<>>( + Transpose4DKernel<<>>( input_strides, input_data, output_strides, output_data, N / num_elements_per_thread); break; case sizeof(int64_t): - Transpose4DKernel<<>>( + Transpose4DKernel<<>>( input_strides, input_data, output_strides, output_data, N / num_elements_per_thread); break; @@ -184,12 +184,12 @@ __global__ void TransposeKernel(int32_t shape_rank, const TArray input_ output_data[id] = input_data[input_index]; } -Status TransposeImpl(size_t element_size, int32_t shape_rank, const TArray& input_strides, +Status TransposeImpl(cudaStream_t stream, size_t element_size, int32_t shape_rank, const TArray& input_strides, const void* input_data, const TArray& fdm_output_strides, void* output_data, int N) { int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); switch (element_size) { case sizeof(int8_t): - TransposeKernel<<>>( + TransposeKernel<<>>( shape_rank, input_strides, reinterpret_cast::MappedType*>(input_data), fdm_output_strides, @@ -197,7 +197,7 @@ Status TransposeImpl(size_t element_size, int32_t shape_rank, const TArray<<>>( + TransposeKernel<<>>( shape_rank, input_strides, reinterpret_cast::MappedType*>(input_data), fdm_output_strides, @@ -205,7 +205,7 @@ Status TransposeImpl(size_t element_size, int32_t shape_rank, const TArray<<>>( + TransposeKernel<<>>( shape_rank, input_strides, reinterpret_cast::MappedType*>(input_data), fdm_output_strides, @@ 
-213,7 +213,7 @@ Status TransposeImpl(size_t element_size, int32_t shape_rank, const TArray<<>>( + TransposeKernel<<>>( shape_rank, input_strides, reinterpret_cast::MappedType*>(input_data), fdm_output_strides, diff --git a/onnxruntime/core/providers/cuda/tensor/transpose_impl.h b/onnxruntime/core/providers/cuda/tensor/transpose_impl.h index 5ea7f6e8ce..1a4d469776 100644 --- a/onnxruntime/core/providers/cuda/tensor/transpose_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/transpose_impl.h @@ -9,16 +9,16 @@ namespace onnxruntime { namespace cuda { bool CanDoTranspose3D(int32_t rank, const std::vector& input_dims, const std::vector& permutations); -Status Transpose3DImpl(size_t element_size, const TArray& input_shape, const TArray& input_strides, const void* input_data, +Status Transpose3DImpl(cudaStream_t stream, size_t element_size, const TArray& input_shape, const TArray& input_strides, const void* input_data, void* output_data, int64_t N); bool CanDoTranspose4D(const cudaDeviceProp& prop, size_t element_size, int32_t rank, const std::vector& input_dims, const std::vector& permutations); -Status Transpose4DImpl(size_t element_size, const TArray& input_shape, const TArray& input_strides, const void* input_data, +Status Transpose4DImpl(cudaStream_t stream, size_t element_size, const TArray& input_shape, const TArray& input_strides, const void* input_data, const TArray& output_strides, void* output_data, int N); -Status TransposeImpl(size_t element_size, int32_t shape_rank, const TArray& input_strides, +Status TransposeImpl(cudaStream_t stream, size_t element_size, int32_t shape_rank, const TArray& input_strides, const void* input_data, const TArray& fdm_output_strides, void* output_data, int N); } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/unsqueeze.cc b/onnxruntime/core/providers/cuda/tensor/unsqueeze.cc index c528322d0e..9d293b8821 100644 --- a/onnxruntime/core/providers/cuda/tensor/unsqueeze.cc +++ 
b/onnxruntime/core/providers/cuda/tensor/unsqueeze.cc @@ -50,7 +50,7 @@ Status Unsqueeze::ComputeInternal(OpKernelContext* ctx) const { auto count = p.input_tensor->Shape().Size(); auto element_bytes = p.input_tensor->DataType()->Size(); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output, input, count * element_bytes, cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output, input, count * element_bytes, cudaMemcpyDeviceToDevice, Stream())); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/tensor/upsample.cc b/onnxruntime/core/providers/cuda/tensor/upsample.cc index a7fa0a6a94..2fec80ab1f 100644 --- a/onnxruntime/core/providers/cuda/tensor/upsample.cc +++ b/onnxruntime/core/providers/cuda/tensor/upsample.cc @@ -87,7 +87,7 @@ Status Upsample::BaseCompute(OpKernelContext* context, size_t temp_buffer_size = CalcResizeBufferSize(mode_, output_dims); auto dims_mapping_buffer = GetScratchBuffer(temp_buffer_size); void* dims_mapping = reinterpret_cast(dims_mapping_buffer.get()); - ResizeImpl(mode_, (int)rank, input_shape, output_shape, + ResizeImpl(Stream(), mode_, (int)rank, input_shape, output_shape, input_strides, output_div_pitches, scales_vals, roi_vals, reinterpret_cast(X->template Data()), reinterpret_cast(Y->template MutableData()), @@ -102,7 +102,8 @@ Status Upsample::BaseCompute(OpKernelContext* context, scales_div[i] = fast_divmod(gsl::narrow_cast(ceil(scales[i]))); } - UpampleImpl(mode_, + UpampleImpl(Stream(), + mode_, rank, (UpsampleMode::LINEAR == mode_) ? (rank == 2 ? 
X_dims[0] : X_dims[2]) : 0, input_strides, diff --git a/onnxruntime/core/providers/cuda/tensor/upsample_impl.cu b/onnxruntime/core/providers/cuda/tensor/upsample_impl.cu index 7409e707cc..83d83ef9d8 100644 --- a/onnxruntime/core/providers/cuda/tensor/upsample_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/upsample_impl.cu @@ -149,7 +149,8 @@ __global__ void _UpampleBilinear2DInputKernel(const int64_t input_dim0, } template -void UpampleImpl(const onnxruntime::UpsampleMode upsample_mode, +void UpampleImpl(cudaStream_t stream, + const onnxruntime::UpsampleMode upsample_mode, const size_t rank, const int64_t input_dim2, const TArray& input_pitches, @@ -160,22 +161,23 @@ void UpampleImpl(const onnxruntime::UpsampleMode upsample_mode, const size_t N) { int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); if (onnxruntime::UpsampleMode::NN == upsample_mode) { - _UpampleNearestKernel<<>>( + _UpampleNearestKernel<<>>( rank, input_pitches, output_div_pitches, scales_div, input_data, output_data, N); } else if (onnxruntime::UpsampleMode::LINEAR == upsample_mode && rank == 4) { - _UpampleBilinear4DInputKernel<<>>( + _UpampleBilinear4DInputKernel<<>>( input_dim2, input_pitches, output_div_pitches, scales_div, input_data, output_data, N); } else if (onnxruntime::UpsampleMode::LINEAR == upsample_mode && rank == 2) { - _UpampleBilinear2DInputKernel<<>>( + _UpampleBilinear2DInputKernel<<>>( input_dim2, input_pitches, output_div_pitches, scales_div, input_data, output_data, N); } } #define SPECIALIZED_IMPL(T) \ - template void UpampleImpl(const onnxruntime::UpsampleMode upsample_mode, \ + template void UpampleImpl(cudaStream_t stream, \ + const onnxruntime::UpsampleMode upsample_mode, \ const size_t rank, \ const int64_t input_dim2, \ const TArray& input_pitches, \ diff --git a/onnxruntime/core/providers/cuda/tensor/upsample_impl.h b/onnxruntime/core/providers/cuda/tensor/upsample_impl.h index a431f5d61c..32376c198d 100644 --- 
a/onnxruntime/core/providers/cuda/tensor/upsample_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/upsample_impl.h @@ -11,7 +11,8 @@ namespace onnxruntime { namespace cuda { template -void UpampleImpl(const onnxruntime::UpsampleMode upsample_mode, +void UpampleImpl(cudaStream_t stream, + const onnxruntime::UpsampleMode upsample_mode, const size_t rank, const int64_t input_dim2, const TArray& input_pitches, diff --git a/onnxruntime/core/providers/cuda/tensor/where.cc b/onnxruntime/core/providers/cuda/tensor/where.cc index 2b765789f8..ba85c2cd4c 100644 --- a/onnxruntime/core/providers/cuda/tensor/where.cc +++ b/onnxruntime/core/providers/cuda/tensor/where.cc @@ -174,6 +174,7 @@ Status Where::ComputeInternal(OpKernelContext* context) const { ORT_RETURN_IF_ERROR(prepare.TernaryElementwiseBroadcastPrepareHelper(condition_shape, X_shape, Y_shape, output_shape)); WhereImpl( + Stream(), prepare.output_rank_or_simple_broadcast, prepare.a_index_type, prepare.a_padded_strides, diff --git a/onnxruntime/core/providers/cuda/tensor/where_impl.cu b/onnxruntime/core/providers/cuda/tensor/where_impl.cu index 319007c359..0f2d4a4543 100644 --- a/onnxruntime/core/providers/cuda/tensor/where_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/where_impl.cu @@ -119,7 +119,7 @@ __global__ void _TenaryElementWiseSimple( Y_INDEX_TYPE, \ GridDim::maxThreadsPerBlock, \ GridDim::maxElementsPerThread> \ - <<>>(cond_data, \ + <<>>(cond_data, \ x_data, \ y_data, \ output_data, \ @@ -150,7 +150,7 @@ __global__ void _TenaryElementWiseSimple( Y_INDEX_TYPE, \ GridDim::maxThreadsPerBlock, \ GridDim::maxElementsPerThread> \ - <<>>(output_rank_or_simple_broadcast, \ + <<>>(output_rank_or_simple_broadcast, \ cond_padded_strides, \ cond_data, \ x_padded_strides, \ @@ -182,6 +182,7 @@ __global__ void _TenaryElementWiseSimple( template void WhereImpl( + cudaStream_t stream, size_t output_rank_or_simple_broadcast, BroadcastIndexType cond_index_type, const TArray& cond_padded_strides, @@ -212,7 +213,8 @@ 
void WhereImpl( } #define SPECIALIZED_IMPL(T) \ - template void WhereImpl(size_t output_rank_or_simple_broadcast, \ + template void WhereImpl(cudaStream_t stream, \ + size_t output_rank_or_simple_broadcast, \ BroadcastIndexType cond_index_type, \ const TArray& cond_padded_strides, \ const bool* cond_data, \ diff --git a/onnxruntime/core/providers/cuda/tensor/where_impl.h b/onnxruntime/core/providers/cuda/tensor/where_impl.h index 24cf54f351..e560c4717c 100644 --- a/onnxruntime/core/providers/cuda/tensor/where_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/where_impl.h @@ -11,6 +11,7 @@ namespace cuda { template void WhereImpl( + cudaStream_t stream, size_t output_rank_or_simple_broadcast, BroadcastIndexType cond_index_type, const TArray& cond_padded_strides, diff --git a/onnxruntime/core/providers/rocm/fpgeneric.cu b/onnxruntime/core/providers/rocm/fpgeneric.cu index 072bd17cff..c53934c688 100644 --- a/onnxruntime/core/providers/rocm/fpgeneric.cu +++ b/onnxruntime/core/providers/rocm/fpgeneric.cu @@ -46,21 +46,21 @@ __global__ void CopyVectorHalf(const half* x, int incx, half* y, int incy, int n } // namespace -rocblas_status rocblasTransposeHelper(rocblas_handle, rocblas_operation , rocblas_operation , int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int) { +rocblas_status rocblasTransposeHelper(hipStream_t stream, rocblas_handle, rocblas_operation , rocblas_operation , int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int) { if (C != A) { dim3 dimGrid((n + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, (m + TRANS_TILE_DIM - 1) / TRANS_TILE_DIM, 1); dim3 dimBlock(TRANS_TILE_DIM, BLOCK_ROWS, 1); - hipLaunchKernelGGL(transposeNoOverlap, dim3(dimGrid), dim3(dimBlock), 0, 0, C, A, n, m); + hipLaunchKernelGGL(transposeNoOverlap, dim3(dimGrid), dim3(dimBlock), 0, stream, C, A, n, m); } else { return rocblas_status_not_implemented; } return rocblas_status_success; } -rocblas_status 
rocblasCopyHelper(rocblas_handle, int n, const half* x, int incx, half* y, int incy) { +rocblas_status rocblasCopyHelper(hipStream_t stream, rocblas_handle, int n, const half* x, int incx, half* y, int incy) { dim3 dimGrid((unsigned int)(n + COPY_BLOCK_DIM - 1) / COPY_BLOCK_DIM, 1, 1); dim3 dimBlock(COPY_BLOCK_DIM, 1, 1); - hipLaunchKernelGGL(CopyVectorHalf, dim3(dimGrid), dim3(dimBlock), 0, 0, x, incx, y, incy, n); + hipLaunchKernelGGL(CopyVectorHalf, dim3(dimGrid), dim3(dimBlock), 0, stream, x, incx, y, incy, n); return rocblas_status_success; } diff --git a/onnxruntime/core/providers/rocm/gpu_data_transfer.cc b/onnxruntime/core/providers/rocm/gpu_data_transfer.cc index 23111395f0..83986e4819 100644 --- a/onnxruntime/core/providers/rocm/gpu_data_transfer.cc +++ b/onnxruntime/core/providers/rocm/gpu_data_transfer.cc @@ -5,16 +5,25 @@ #include "rocm_common.h" namespace onnxruntime { -GPUDataTransfer::GPUDataTransfer() { - // create streams, default is nullptr - streams_[kHipStreamDefault] = nullptr; - HIP_CALL_THROW(hipStreamCreateWithFlags(&streams_[kHipStreamCopyIn], hipStreamNonBlocking)); - HIP_CALL_THROW(hipStreamCreateWithFlags(&streams_[kHipStreamCopyOut], hipStreamNonBlocking)); +GPUDataTransfer::GPUDataTransfer(hipStream_t stream, bool do_copy_in_default_stream) { + do_copy_in_default_stream_ = do_copy_in_default_stream; + streams_[kHipStreamDefault] = stream; + if (do_copy_in_default_stream) { + streams_[kHipStreamCopyIn] = stream; + streams_[kHipStreamCopyOut] = stream; + } else { + HIP_CALL_THROW(hipStreamCreateWithFlags(&streams_[kHipStreamCopyIn], hipStreamNonBlocking)); + HIP_CALL_THROW(hipStreamCreateWithFlags(&streams_[kHipStreamCopyOut], hipStreamNonBlocking)); + } } GPUDataTransfer::~GPUDataTransfer() { - HIP_CALL(hipStreamDestroy(streams_[kHipStreamCopyIn])); - HIP_CALL(hipStreamDestroy(streams_[kHipStreamCopyOut])); + if (!do_copy_in_default_stream_ && streams_[kHipStreamCopyIn] != nullptr) { + 
HIP_CALL(hipStreamDestroy(streams_[kHipStreamCopyIn])); + } + if (!do_copy_in_default_stream_ && streams_[kHipStreamCopyOut] != nullptr) { + HIP_CALL(hipStreamDestroy(streams_[kHipStreamCopyOut])); + } } bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const { @@ -33,24 +42,26 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int e if (dst_device.Type() == OrtDevice::GPU) { if (src_device.Type() == OrtDevice::CPU && src_device.MemType() == OrtDevice::MemType::CUDA_PINNED) { // copy from pinned memory to GPU, this is non-blocking - HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyHostToDevice, streams_[exec_queue_id])); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyHostToDevice, GetStream(exec_queue_id))); } else if (src_device.Type() == OrtDevice::GPU) { // copying between GPU, this is non-blocking // Copy only if the two addresses are different. if (dst_data != src_data) { - HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToDevice, streams_[kHipStreamDefault])); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToDevice, GetStream(kHipStreamDefault))); } } else { // copy from other CPU memory to GPU, this is blocking - HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyHostToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyHostToDevice, GetStream(kHipStreamDefault))); + HIP_RETURN_IF_ERROR(hipStreamSynchronize(GetStream(kHipStreamDefault))); } } else if (src_device.Type() == OrtDevice::GPU) { if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED) { // copying from GPU to pinned memory, this is non-blocking - HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, streams_[exec_queue_id])); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, 
hipMemcpyDeviceToHost, GetStream(exec_queue_id))); } else { // copying from GPU to CPU memory, this is blocking - HIP_RETURN_IF_ERROR(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToHost)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, GetStream(kHipStreamDefault))); + HIP_RETURN_IF_ERROR(hipStreamSynchronize(GetStream(kHipStreamDefault))); } } else { // copying between cpu memory diff --git a/onnxruntime/core/providers/rocm/gpu_data_transfer.h b/onnxruntime/core/providers/rocm/gpu_data_transfer.h index 8001fa48bc..9c07968d19 100644 --- a/onnxruntime/core/providers/rocm/gpu_data_transfer.h +++ b/onnxruntime/core/providers/rocm/gpu_data_transfer.h @@ -17,7 +17,7 @@ enum HIPStreamType : int { class GPUDataTransfer : public IDataTransfer { public: - GPUDataTransfer(); + GPUDataTransfer(hipStream_t stream, bool do_copy_in_default_stream = true); ~GPUDataTransfer(); bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override; @@ -32,6 +32,7 @@ class GPUDataTransfer : public IDataTransfer { } private: + bool do_copy_in_default_stream_; hipStream_t streams_[kTotalHipStreams]; }; diff --git a/onnxruntime/core/providers/rocm/math/gemm.cc b/onnxruntime/core/providers/rocm/math/gemm.cc index 413744d595..b571e7930f 100644 --- a/onnxruntime/core/providers/rocm/math/gemm.cc +++ b/onnxruntime/core/providers/rocm/math/gemm.cc @@ -83,6 +83,7 @@ Status Gemm::ComputeInternal(OpKernelContext* ctx) const { if (b_shape.Size() == 1) { // if B is (), (1,) or (1, 1), broadcast the scalar ROCBLAS_RETURN_IF_ERROR(rocblasCopyHelper( + Stream(), RocblasHandle(), M * N, b_data, @@ -115,7 +116,7 @@ Status Gemm::ComputeInternal(OpKernelContext* ctx) const { out_data, N)); } else { // B is (M, N), no broadcast needed. 
- HIP_RETURN_IF_ERROR(hipMemcpyAsync(out_data, b_data, M * N * sizeof(T), hipMemcpyDeviceToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(out_data, b_data, M * N * sizeof(T), hipMemcpyDeviceToDevice, Stream())); } } diff --git a/onnxruntime/core/providers/rocm/math/softmax.cc b/onnxruntime/core/providers/rocm/math/softmax.cc index 64fb4cb53c..5d66c742e6 100644 --- a/onnxruntime/core/providers/rocm/math/softmax.cc +++ b/onnxruntime/core/providers/rocm/math/softmax.cc @@ -12,6 +12,7 @@ namespace rocm { template Status SoftMaxComputeHelper( + hipStream_t stream, const T* X, const TensorShape& input_shape, T* Y, @@ -29,7 +30,7 @@ Status SoftMaxComputeHelper( // miopenSoftmaxForward/Backward is not optimal implementation. // TODO: remove miopen path completely in the future. if (D <= 1024 && D * sizeof(T) <= 4096) { - dispatch_softmax_forward, is_log_softmax>(Y_data, X_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); + dispatch_softmax_forward, is_log_softmax>(stream, Y_data, X_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); return Status::OK(); } @@ -51,8 +52,8 @@ Status SoftMaxComputeHelper( } #define SPECIALIZED_SOFTMAX_HELPER_IMPL(T) \ - template Status SoftMaxComputeHelper(const T* input, const TensorShape& shape, T* Y, miopenHandle_t handle, int64_t axis); \ - template Status SoftMaxComputeHelper(const T* input, const TensorShape& shape, T* Y, miopenHandle_t handle, int64_t axis); + template Status SoftMaxComputeHelper(hipStream_t stream, const T* input, const TensorShape& shape, T* Y, miopenHandle_t handle, int64_t axis); \ + template Status SoftMaxComputeHelper(hipStream_t stream, const T* input, const TensorShape& shape, T* Y, miopenHandle_t handle, int64_t axis); SPECIALIZED_SOFTMAX_HELPER_IMPL(float) // SPECIALIZED_SOFTMAX_HELPER_IMPL(double) @@ -119,9 +120,9 @@ Status Softmax::ComputeInternal(OpKernelContext* ctx) const { return Status::OK(); if (log_softmax_) { - return SoftMaxComputeHelper(X_data, input_shape, 
Y_data, MiopenHandle(), axis_); + return SoftMaxComputeHelper(Stream(), X_data, input_shape, Y_data, MiopenHandle(), axis_); } else { - return SoftMaxComputeHelper(X_data, input_shape, Y_data, MiopenHandle(), axis_); + return SoftMaxComputeHelper(Stream(), X_data, input_shape, Y_data, MiopenHandle(), axis_); } } diff --git a/onnxruntime/core/providers/rocm/math/softmax_impl.cu b/onnxruntime/core/providers/rocm/math/softmax_impl.cu index 94f8e4fc54..2b079949e0 100644 --- a/onnxruntime/core/providers/rocm/math/softmax_impl.cu +++ b/onnxruntime/core/providers/rocm/math/softmax_impl.cu @@ -136,7 +136,7 @@ __global__ void softmax_warp_forward(output_t* dst, const input_t* src, int batc } template -void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count) { +void dispatch_softmax_forward(hipStream_t stream, output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count) { if (softmax_elements == 0) { return; } else { @@ -160,37 +160,37 @@ void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_ele // Launch code would be more elegant if C++ supported FOR CONSTEXPR switch (log2_elements) { case 0: // 1 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, 0, dst, src, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 1: // 2 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, 0, dst, src, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 2: // 4 - 
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, 0, dst, src, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 3: // 8 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, 0, dst, src, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 4: // 16 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, 0, dst, src, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 5: // 32 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, 0, dst, src, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 6: // 64 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, 0, dst, src, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 7: // 128 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, 0, dst, src, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 
0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 8: // 256 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, 0, dst, src, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 9: // 512 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, 0, dst, src, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements); break; case 10: // 1024 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, 0, dst, src, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_forward), dim3(blocks), dim3(threads), 0, stream, dst, src, batch_count, softmax_elements_stride, softmax_elements); break; default: break; @@ -199,8 +199,8 @@ void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_ele } #define SPECIALIZED_SOFTMAX_IMPL(input_t, output_t, acc_t) \ -template void dispatch_softmax_forward(output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); \ -template void dispatch_softmax_forward(output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); +template void dispatch_softmax_forward(hipStream_t stream, output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); \ +template void dispatch_softmax_forward(hipStream_t stream, output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); SPECIALIZED_SOFTMAX_IMPL(float, float, float) 
SPECIALIZED_SOFTMAX_IMPL(half, half, float) diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc index 5f177e1ecc..55aac97d7a 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc @@ -149,6 +149,7 @@ Status ReduceKernel::ReduceKernelShared( switch (applicable_matrix_reduction) { case ApplicableMatrixReduction::Rows: { return reduce_matrix_rows( + Stream(), reinterpret_cast(X), reinterpret_cast(Y), m, n, false); @@ -167,7 +168,7 @@ Status ReduceKernel::ReduceKernelShared( // ArgMax/ArgMin with FP16 are not supported by miopen, so convert input to fp32 then call miopen temp_X = GetScratchBuffer(input_count); miopen_type_X = miopenFloat; - Impl_Cast(reinterpret_cast(X), temp_X.get(), input_shape.Size()); + Impl_Cast(Stream(), reinterpret_cast(X), temp_X.get(), input_shape.Size()); } // MIOpen requires at least 3D input, so pad 1s if needed @@ -208,7 +209,8 @@ Status ReduceKernel::ReduceKernelShared( input_data_buffer = GetScratchBuffer(input_count); input_data = reinterpret_cast(input_data_buffer.get()); fast_divmod tmp_div; - Impl_Mul(static_cast(SimpleBroadcast::NoBroadcast), nullptr, + Impl_Mul(Stream(), + static_cast(SimpleBroadcast::NoBroadcast), nullptr, reinterpret_cast(X), nullptr, reinterpret_cast(X), nullptr, tmp_div, tmp_div, @@ -233,7 +235,8 @@ Status ReduceKernel::ReduceKernelShared( auto log_sum_result = log_sum_result_buffer.get(); BinaryElementwisePreparation prepare; ORT_RETURN_IF_ERROR(prepare.BinaryElementwiseBroadcastPrepareHelper(input_shape, rhs_shape, input_shape)); - Impl_Sub(prepare.output_rank_or_simple_broadcast, + Impl_Sub(Stream(), + prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast(X), &prepare.rhs_padded_strides, @@ -242,7 +245,8 @@ Status ReduceKernel::ReduceKernelShared( prepare.fdm_H, prepare.fdm_C, reinterpret_cast(exp_result), 
input_count); - Impl_Exp(reinterpret_cast(exp_result), + Impl_Exp(Stream(), + reinterpret_cast(exp_result), reinterpret_cast(exp_result), input_count); @@ -253,13 +257,15 @@ Status ReduceKernel::ReduceKernelShared( &zero, output_tensor, reinterpret_cast(log_sum_result))); // Log(Sum) - Impl_Log(reinterpret_cast(log_sum_result), + Impl_Log(Stream(), + reinterpret_cast(log_sum_result), reinterpret_cast(log_sum_result), output_count); // Log + ReduceMax fast_divmod tmp_div; - Impl_Add(static_cast(SimpleBroadcast::NoBroadcast), nullptr, + Impl_Add(Stream(), + static_cast(SimpleBroadcast::NoBroadcast), nullptr, reinterpret_cast(log_sum_result), nullptr, reinterpret_cast(Y), nullptr, tmp_div, tmp_div, @@ -276,7 +282,7 @@ Status ReduceKernel::ReduceKernelShared( // miopenReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case if (input_count == output_count) { if (reinterpret_cast(Y) != reinterpret_cast(X)) { - HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y, X, input_count * sizeof(T), hipMemcpyDeviceToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y, X, input_count * sizeof(T), hipMemcpyDeviceToDevice, Stream())); } } else { MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( @@ -301,11 +307,12 @@ Status ReduceKernel::ReduceKernelShared( } // MIOpen reduction index is uint32_t for now, cast it to int64_t according to ONNX spec - Impl_Cast(reinterpret_cast(indices_rocm.get()), reinterpret_cast(Y), output_count); + Impl_Cast(Stream(), reinterpret_cast(indices_rocm.get()), reinterpret_cast(Y), output_count); } if (calculate_log_) { - Impl_Log(reinterpret_cast(Y), + Impl_Log(Stream(), + reinterpret_cast(Y), reinterpret_cast(Y), output_count); } @@ -421,7 +428,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr std::vector& output_dims = prepare_reduce_metadata.output_dims; std::vector& input_dims_miopen = prepare_reduce_metadata.input_dims_miopen; std::vector& output_dims_miopen = 
prepare_reduce_metadata.output_dims_miopen; - + hipStream_t stream = static_cast(rocm_ep.GetComputeStream()); // special case when there is a dim value of 0 in the shape. if (input_count == 0) { assert(output.Shape().Size() == 0); @@ -436,6 +443,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr switch (applicable_matrix_reduction) { case ApplicableMatrixReduction::Rows: { return reduce_matrix_rows( + stream, reinterpret_cast(input.template Data()), reinterpret_cast(output.template MutableData()), m, n); @@ -444,6 +452,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr const auto buffer_size_bytes = compute_reduce_matrix_columns_buffer_size(m, n); auto buffer = rocm_ep.GetScratchBuffer(buffer_size_bytes); return reduce_matrix_columns( + stream, reinterpret_cast(input.template Data()), reinterpret_cast(output.template MutableData()), m, n, buffer.get(), buffer_size_bytes); @@ -455,7 +464,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. 
// Therefore zeroing out the memory is required - HIP_RETURN_IF_ERROR(hipMemsetAsync(output.MutableDataRaw(), 0, output.SizeInBytes())); + HIP_RETURN_IF_ERROR(hipMemsetAsync(output.MutableDataRaw(), 0, output.SizeInBytes(), stream)); IAllocatorUniquePtr temp_X; miopenDataType_t miopen_type_X = MiopenTensor::GetDataType(); @@ -464,7 +473,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr // ArgMax/ArgMin with FP16 are not supported by miopen, so convert input to fp32 then call miopen temp_X = rocm_ep.GetScratchBuffer(input_count); miopen_type_X = miopenFloat; - Impl_Cast(reinterpret_cast(input.template Data()), temp_X.get(), input_shape.Size()); + Impl_Cast(stream, reinterpret_cast(input.template Data()), temp_X.get(), input_shape.Size()); } MiopenReduceDescriptor reduce_desc; @@ -497,7 +506,8 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr input_data_buffer = rocm_ep.GetScratchBuffer(input_count); input_data = reinterpret_cast(input_data_buffer.get()); fast_divmod tmp_div; - Impl_Mul(static_cast(SimpleBroadcast::NoBroadcast), nullptr, + Impl_Mul(stream, + static_cast(SimpleBroadcast::NoBroadcast), nullptr, reinterpret_cast(input.template Data()), nullptr, reinterpret_cast(input.template Data()), nullptr, tmp_div, tmp_div, @@ -507,7 +517,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr // This happens when the input is Scalar if (input_count == output_count) { if (output.template MutableData() != input.template Data()) { - HIP_RETURN_IF_ERROR(hipMemcpyAsync(output.template MutableData(), input.template Data(), input_count * sizeof(T), hipMemcpyDeviceToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(output.template MutableData(), input.template Data(), input_count * sizeof(T), hipMemcpyDeviceToDevice, stream)); } } else { // Reduce max -- Max/Min will output indices data @@ -536,7 +546,8 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& 
input, Pr auto log_sum_result = log_sum_result_buffer.get(); BinaryElementwisePreparation prepare; ORT_RETURN_IF_ERROR(prepare.BinaryElementwiseBroadcastPrepareHelper(input_shape, output_shape, input_shape)); - Impl_Sub(prepare.output_rank_or_simple_broadcast, + Impl_Sub(stream, + prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, reinterpret_cast(input.template Data()), &prepare.rhs_padded_strides, @@ -545,14 +556,15 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr prepare.fdm_H, prepare.fdm_C, reinterpret_cast(exp_result), input_count); - Impl_Exp(reinterpret_cast(exp_result), + Impl_Exp(stream, + reinterpret_cast(exp_result), reinterpret_cast(exp_result), input_count); // miopenReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case // This happens when the input is Scalar. We do not need to add anything in this case. if (input_count == output_count) { - HIP_RETURN_IF_ERROR(hipMemcpyAsync(reinterpret_cast(log_sum_result), exp_result, input_count * sizeof(T), hipMemcpyDeviceToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(reinterpret_cast(log_sum_result), exp_result, input_count * sizeof(T), hipMemcpyDeviceToDevice, stream)); } else { // ReduceSum MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( @@ -563,13 +575,15 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr } // Log(Sum) - Impl_Log(reinterpret_cast(log_sum_result), + Impl_Log(stream, + reinterpret_cast(log_sum_result), reinterpret_cast(log_sum_result), output_count); // Log + ReduceMax fast_divmod tmp_div; - Impl_Add(static_cast(SimpleBroadcast::NoBroadcast), nullptr, + Impl_Add(stream, + static_cast(SimpleBroadcast::NoBroadcast), nullptr, reinterpret_cast(log_sum_result), nullptr, reinterpret_cast(output.template MutableData()), nullptr, tmp_div, tmp_div, @@ -581,7 +595,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr // 
miopenReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case // This happens when the input is Scalar. We do not need to add anything in this case. if (input_count == output_count) { - HIP_RETURN_IF_ERROR(hipMemcpyAsync(reinterpret_cast(output.template MutableData()), input_data, input_count * sizeof(T), hipMemcpyDeviceToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(reinterpret_cast(output.template MutableData()), input_data, input_count * sizeof(T), hipMemcpyDeviceToDevice, stream)); } else { MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( rocm_ep.PerThreadMiopenHandle(), reduce_desc, indices_rocm.get(), indices_bytes, @@ -593,7 +607,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr // miopenReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case if (input_count == output_count) { if (output.template MutableData() != input.template Data()) { - HIP_RETURN_IF_ERROR(hipMemcpyAsync(output.template MutableData(), input.template Data(), input_count * sizeof(T), hipMemcpyDeviceToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(output.template MutableData(), input.template Data(), input_count * sizeof(T), hipMemcpyDeviceToDevice, stream)); } } else { MIOPEN_RETURN_IF_ERROR(miopenReduceTensor( @@ -607,7 +621,7 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr // miopenReduceTensor has issue if input and output has same size, which will happen if the axis to be reduced has dim value of 1. 
// the output is zeros of the output size if (input_count == output_count) { - HIP_RETURN_IF_ERROR(hipMemsetAsync(output.template MutableData(), static_cast(0), output_count * sizeof(int64_t))); + HIP_RETURN_IF_ERROR(hipMemsetAsync(output.template MutableData(), static_cast(0), output_count * sizeof(int64_t), stream)); } else { if (temp_X) { auto temp_output = rocm_ep.GetScratchBuffer(output_count); @@ -626,12 +640,13 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr } // MIOpen reduction index is uint32_t for now, cast it to int64_t according to ONNX spec - Impl_Cast(reinterpret_cast(indices_rocm.get()), output.template MutableData(), output_count); + Impl_Cast(stream, reinterpret_cast(indices_rocm.get()), output.template MutableData(), output_count); } } if (calculate_log) { - Impl_Log(reinterpret_cast(output.template MutableData()), + Impl_Log(stream, + reinterpret_cast(output.template MutableData()), reinterpret_cast(output.template MutableData()), output_count); } @@ -661,7 +676,7 @@ Status ReduceKernel::ComputeImpl(OpKernelContext* ctx, miopenR // empty axes and no-op if (axes.empty() && noop_with_empty_axes_) { auto* Y = ctx->Output(0, X->Shape()); - HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), hipMemcpyDeviceToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), hipMemcpyDeviceToDevice, Stream())); return Status::OK(); } @@ -701,7 +716,7 @@ Status ReduceKernel::ComputeImpl // empty axes and no-op if (axes.empty() && noop_with_empty_axes_) { auto* Y = ctx->Output(0, X->Shape()); - HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), hipMemcpyDeviceToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), hipMemcpyDeviceToDevice, Stream())); return Status::OK(); } @@ -728,14 +743,14 @@ Status 
ReduceKernel::ComputeImpl // miopenReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case if (input_count == output_count) { if (Y->template MutableData() != X->template Data()) { - HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), input_count * sizeof(int32_t), hipMemcpyDeviceToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), input_count * sizeof(int32_t), hipMemcpyDeviceToDevice, Stream())); } return Status::OK(); } // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. // Therefore zeroing out the memory is required - HIP_RETURN_IF_ERROR(hipMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes())); + HIP_RETURN_IF_ERROR(hipMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes(), Stream())); size_t indices_bytes = 0; size_t workspace_bytes = 0; @@ -745,7 +760,7 @@ Status ReduceKernel::ComputeImpl miopenDataType_t miopen_type_X = miopenFloat; IAllocatorUniquePtr temp_X = GetScratchBuffer(input_count); - Impl_Cast(reinterpret_cast(X->template Data()), temp_X.get(), X->Shape().Size()); + Impl_Cast(Stream(), reinterpret_cast(X->template Data()), temp_X.get(), X->Shape().Size()); ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES)); ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X)); @@ -771,7 +786,7 @@ Status ReduceKernel::ComputeImpl output_tensor, temp_Y.get())); - Impl_Cast(temp_Y.get(), Y->template MutableData(), output_count); + Impl_Cast(Stream(), temp_Y.get(), Y->template MutableData(), output_count); return Status::OK(); } @@ -807,14 +822,14 @@ Status ReduceKernel::ComputeImpl( const auto* const src = X->template Data(); if (input_count == output_count) { if (src != dst) { - HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst, src, input_count * sizeof(int8_t), hipMemcpyDeviceToDevice)); + 
HIP_RETURN_IF_ERROR(hipMemcpyAsync(dst, src, input_count * sizeof(int8_t), hipMemcpyDeviceToDevice, Stream())); } return Status::OK(); } // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. // Therefore zeroing out the memory is required - HIP_RETURN_IF_ERROR(hipMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes())); + HIP_RETURN_IF_ERROR(hipMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes(), Stream())); size_t indices_bytes = 0; size_t workspace_bytes = 0; @@ -824,7 +839,7 @@ Status ReduceKernel::ComputeImpl( miopenDataType_t miopen_type_X = miopenFloat; IAllocatorUniquePtr temp_X = GetScratchBuffer(input_count); - Impl_Cast(reinterpret_cast(src), temp_X.get(), X->Shape().Size()); + Impl_Cast(Stream(), reinterpret_cast(src), temp_X.get(), X->Shape().Size()); ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES)); ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X)); @@ -850,7 +865,7 @@ Status ReduceKernel::ComputeImpl( output_tensor, temp_Y.get())); - Impl_Cast(temp_Y.get(), dst, output_count); + Impl_Cast(Stream(), temp_Y.get(), dst, output_count); return Status::OK(); } diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc index dbaa7c3a20..eccd3afc02 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc @@ -58,10 +58,14 @@ ONNX_OPERATOR_KERNEL_EX( } // namespace rocm -ROCMExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id, size_t hip_mem_limit, ArenaExtendStrategy arena_extend_strategy) { +ROCMExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id, hipStream_t stream, size_t hip_mem_limit, ArenaExtendStrategy arena_extend_strategy) { HIP_CALL_THROW(hipSetDevice(device_id)); + stream_ = stream; + 
ROCBLAS_CALL_THROW(rocblas_create_handle(&rocblas_handle_)); + ROCBLAS_CALL_THROW(rocblas_set_stream(rocblas_handle_, stream)); MIOPEN_CALL_THROW(miopenCreate(&miopen_handle_)); + MIOPEN_CALL_THROW(miopenSetStream(miopen_handle_, stream)); AllocatorCreationInfo default_memory_info( [](OrtDevice::DeviceId id) { @@ -104,6 +108,16 @@ ROCMExecutionProvider::ROCMExecutionProvider(const ROCMExecutionProviderInfo& in HIP_CALL_THROW(hipDeviceSynchronize()); HIP_CALL_THROW(hipGetDeviceProperties(&device_prop_, info_.device_id)); + if (info.has_user_compute_stream) { + external_stream_ = true; + stream_ = static_cast(info.user_compute_stream); + } else { + // HIP_CALL_THROW(hipStreamCreateWithFlags(&stream_, hipStreamNonBlocking)); + // TODO: use default stream now due to failures of FusedMatMulOpTest. + // Will check with AMD to verify whether ROCBlas can run with specified stream. + stream_ = nullptr; + } + size_t free = 0; size_t total = 0; HIP_CALL_THROW(hipMemGetInfo(&free, &total)); @@ -168,6 +182,10 @@ ROCMExecutionProvider::~ROCMExecutionProvider() { ORT_IGNORE_RETURN_VALUE(cache->erase(this)); } } + + if (!external_stream_ && stream_) { + HIP_CALL(hipStreamDestroy(stream_)); + } } ROCMExecutionProvider::PerThreadContext& ROCMExecutionProvider::GetPerThreadContext() const { @@ -188,7 +206,7 @@ ROCMExecutionProvider::PerThreadContext& ROCMExecutionProvider::GetPerThreadCont // get or create a context if (context_state_.retired_context_pool.empty()) { - context = std::make_shared(info_.device_id, info_.hip_mem_limit, info_.arena_extend_strategy); + context = std::make_shared(info_.device_id, static_cast(GetComputeStream()), info_.hip_mem_limit, info_.arena_extend_strategy); } else { context = context_state_.retired_context_pool.back(); context_state_.retired_context_pool.pop_back(); @@ -286,7 +304,8 @@ Status ROCMExecutionProvider::OnRunStart() { Status ROCMExecutionProvider::OnRunEnd() { // record deferred release event on default stream, and release 
per_thread_context auto current_deferred_release_event = GetPerThreadContext().GetCurrentDeferredReleaseEvent(); - HIP_RETURN_IF_ERROR(hipEventRecord(current_deferred_release_event, nullptr)); + HIP_RETURN_IF_ERROR(hipEventRecord(current_deferred_release_event, static_cast(GetComputeStream()))); + HIP_RETURN_IF_ERROR(hipStreamSynchronize(static_cast(GetComputeStream()))); ReleasePerThreadContext(); std::lock_guard lock(deferred_release_cpu_ptr_mutex_); deferred_release_cpu_ptr_[current_deferred_release_event].recorded = true; @@ -1710,7 +1729,7 @@ static bool CastNeedFallbackToCPU(const onnxruntime::Node& node) { } std::unique_ptr ROCMExecutionProvider::GetDataTransfer() const { - return onnxruntime::make_unique(); + return onnxruntime::make_unique(static_cast(GetComputeStream()), info_.do_copy_in_default_stream); } std::vector> diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.h b/onnxruntime/core/providers/rocm/rocm_execution_provider.h index 8859ec42f3..5360f93444 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.h +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.h @@ -15,6 +15,7 @@ #include "core/providers/rocm/rocm_execution_provider_info.h" #include "core/providers/rocm/rocm_pch.h" #include "core/providers/rocm/shared_inc/rocm_utils.h" +#include "core/providers/rocm/shared_inc/rocm_call.h" namespace onnxruntime { @@ -37,6 +38,20 @@ class ROCMExecutionProvider : public IExecutionProvider { return nullptr; } + Status SetComputeStream(void* stream) override { + if (stream != stream_) { + if (stream_) { + HIP_CALL(hipStreamDestroy(stream_)); + } + + external_stream_ = true; + stream_ = static_cast(stream); + } + return Status::OK(); + } + + void* GetComputeStream() const override { return static_cast(stream_); } + rocblas_handle PerThreadRocblasHandle() { return GetPerThreadContext().RocblasHandle(); } @@ -77,6 +92,8 @@ class ROCMExecutionProvider : public IExecutionProvider { private: ROCMExecutionProviderInfo 
info_; hipDeviceProp_t device_prop_; + bool external_stream_ = false; + hipStream_t stream_ = nullptr; struct DeferredReleaseCPUPtrs { bool recorded = false; @@ -88,7 +105,7 @@ class ROCMExecutionProvider : public IExecutionProvider { class PerThreadContext final { public: - PerThreadContext(OrtDevice::DeviceId device_id, size_t hip_mem_limit, ArenaExtendStrategy arena_extend_strategy); + PerThreadContext(OrtDevice::DeviceId device_id, hipStream_t stream, size_t hip_mem_limit, ArenaExtendStrategy arena_extend_strategy); ~PerThreadContext(); rocblas_handle RocblasHandle() const { @@ -109,17 +126,17 @@ class ROCMExecutionProvider : public IExecutionProvider { if (!constant_ones_float_) { constant_ones_float_ = rocm::CreateConstantOnes(); } - return reinterpret_cast(constant_ones_float_->GetBuffer(count)); + return reinterpret_cast(constant_ones_float_->GetBuffer(stream_, count)); } else if (std::is_same::value) { if (!constant_ones_double_) { constant_ones_double_ = rocm::CreateConstantOnes(); } - return reinterpret_cast(constant_ones_double_->GetBuffer(count)); + return reinterpret_cast(constant_ones_double_->GetBuffer(stream_, count)); } else if (std::is_same::value) { if (!constant_ones_half_) { constant_ones_half_ = rocm::CreateConstantOnes(); } - return reinterpret_cast(constant_ones_half_->GetBuffer(count)); + return reinterpret_cast(constant_ones_half_->GetBuffer(stream_, count)); } else { return nullptr; } @@ -130,6 +147,7 @@ class ROCMExecutionProvider : public IExecutionProvider { } private: + hipStream_t stream_ = nullptr; rocblas_handle rocblas_handle_ = nullptr; miopenHandle_t miopen_handle_ = nullptr; diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider_info.h b/onnxruntime/core/providers/rocm/rocm_execution_provider_info.h index 4bf12499d9..3c2383a467 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider_info.h +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider_info.h @@ -15,6 +15,9 @@ struct 
ROCMExecutionProviderInfo { OrtDevice::DeviceId device_id{0}; size_t hip_mem_limit{std::numeric_limits::max()}; ArenaExtendStrategy arena_extend_strategy{ArenaExtendStrategy::kNextPowerOfTwo}; + bool do_copy_in_default_stream{true}; + bool has_user_compute_stream{false}; + void* user_compute_stream{nullptr}; static ROCMExecutionProviderInfo FromProviderOptions(const ProviderOptions& options); static ProviderOptions ToProviderOptions(const ROCMExecutionProviderInfo& info); diff --git a/onnxruntime/core/providers/rocm/rocm_kernel.h b/onnxruntime/core/providers/rocm/rocm_kernel.h index f70d7136f4..6c63ded1dd 100644 --- a/onnxruntime/core/providers/rocm/rocm_kernel.h +++ b/onnxruntime/core/providers/rocm/rocm_kernel.h @@ -58,6 +58,8 @@ class RocmKernel : public OpKernel { const hipDeviceProp_t& GetDeviceProp() const { return provider_->GetDeviceProp(); }; + inline hipStream_t Stream() const { return static_cast(provider_->GetComputeStream()); } + // To support hipMemcpyAsync, the cpu memory should be allocated in pinned memory // and it can only be released after the copy has finished template @@ -91,7 +93,7 @@ class RocmKernel : public OpKernel { Status CopyToGpu() { if (cpu_pinned_copy_) { gpu_copy_ = op_kernel_->GetScratchBuffer(count_); - HIP_RETURN_IF_ERROR(hipMemcpyAsync(gpu_copy_.get(), cpu_pinned_copy_.get(), count_ * sizeof(T), hipMemcpyHostToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(gpu_copy_.get(), cpu_pinned_copy_.get(), count_ * sizeof(T), hipMemcpyHostToDevice, op_kernel_->Stream())); op_kernel_->AddDeferredReleaseCPUPtr(cpu_pinned_copy_.release()); } return Status::OK(); diff --git a/onnxruntime/core/providers/rocm/rocm_utils.cu b/onnxruntime/core/providers/rocm/rocm_utils.cu index 923f5e64fd..3acbe88015 100644 --- a/onnxruntime/core/providers/rocm/rocm_utils.cu +++ b/onnxruntime/core/providers/rocm/rocm_utils.cu @@ -27,10 +27,10 @@ __global__ void _Fill( } template -void Fill(T* output, T value, int64_t count) { +void Fill(hipStream_t stream, T* 
output, T value, int64_t count) { int blocksPerGrid = static_cast(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); HIP_LONG N = static_cast(count); - hipLaunchKernelGGL(HIP_KERNEL_NAME(_Fill), dim3(blocksPerGrid), dim3(GridDim::maxThreadsPerBlock), 0, 0, output, value, N); + hipLaunchKernelGGL(HIP_KERNEL_NAME(_Fill), dim3(blocksPerGrid), dim3(GridDim::maxThreadsPerBlock), 0, stream, output, value, N); } template class ConstantBufferImpl : public IConstantBuffer { @@ -42,7 +42,7 @@ class ConstantBufferImpl : public IConstantBuffer { hipFree(buffer_); } - virtual const T* GetBuffer(size_t count) { + virtual const T* GetBuffer(hipStream_t stream, size_t count) { if (count > count_) { if (buffer_) { hipFree(buffer_); @@ -51,7 +51,7 @@ class ConstantBufferImpl : public IConstantBuffer { HIP_CALL_THROW(hipMalloc(&buffer_, count * sizeof(T))); count_ = count; - Fill(buffer_, val_, count); + Fill(stream, buffer_, val_, count); } return buffer_; } @@ -72,7 +72,7 @@ template std::unique_ptr> CreateConstantOnes(); template std::unique_ptr> CreateConstantOnes(); #define SPECIALIZED_FILL(T) \ - template void Fill(T * output, T value, int64_t count); + template void Fill(hipStream_t stream, T * output, T value, int64_t count); SPECIALIZED_FILL(int8_t) SPECIALIZED_FILL(int16_t) diff --git a/onnxruntime/core/providers/rocm/shared_inc/fpgeneric.h b/onnxruntime/core/providers/rocm/shared_inc/fpgeneric.h index c61d57ddb9..3fb52c2421 100644 --- a/onnxruntime/core/providers/rocm/shared_inc/fpgeneric.h +++ b/onnxruntime/core/providers/rocm/shared_inc/fpgeneric.h @@ -206,19 +206,19 @@ inline rocblas_status rocblasGemmStridedBatchedHelper(rocblas_handle handle, } // transpose using geam -inline rocblas_status rocblasTransposeHelper(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb, int m, int n, const float* alpha, const float* A, int lda, const float* beta, const float* B, int ldb, float* C, int ldc) { +inline rocblas_status 
rocblasTransposeHelper(hipStream_t /*stream*/, rocblas_handle handle, rocblas_operation transa, rocblas_operation transb, int m, int n, const float* alpha, const float* A, int lda, const float* beta, const float* B, int ldb, float* C, int ldc) { return rocblas_sgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); } -inline rocblas_status rocblasTransposeHelper(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb, int m, int n, const double* alpha, const double* A, int lda, const double* beta, const double* B, int ldb, double* C, int ldc) { +inline rocblas_status rocblasTransposeHelper(hipStream_t /*stream*/, rocblas_handle handle, rocblas_operation transa, rocblas_operation transb, int m, int n, const double* alpha, const double* A, int lda, const double* beta, const double* B, int ldb, double* C, int ldc) { return rocblas_dgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); } -rocblas_status rocblasTransposeHelper(rocblas_handle, rocblas_operation , rocblas_operation , int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int); +rocblas_status rocblasTransposeHelper(hipStream_t stream, rocblas_handle, rocblas_operation , rocblas_operation , int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int); // copy -inline rocblas_status rocblasCopyHelper(rocblas_handle handle, int n, const float* x, int incx, float* y, int incy) { +inline rocblas_status rocblasCopyHelper(hipStream_t /*stream*/, rocblas_handle handle, int n, const float* x, int incx, float* y, int incy) { return rocblas_scopy(handle, n, x, incx, y, incy); } -inline rocblas_status rocblasCopyHelper(rocblas_handle handle, int n, const double* x, int incx, double* y, int incy) { +inline rocblas_status rocblasCopyHelper(hipStream_t /*stream*/, rocblas_handle handle, int n, const double* x, int incx, double* y, int incy) { return rocblas_dcopy(handle, n, x, incx, y, incy); } 
-rocblas_status rocblasCopyHelper(rocblas_handle handle, int n, const half* x, int incx, half* y, int incy); +rocblas_status rocblasCopyHelper(hipStream_t stream, rocblas_handle handle, int n, const half* x, int incx, half* y, int incy); diff --git a/onnxruntime/core/providers/rocm/tensor/gather_nd_impl.cu b/onnxruntime/core/providers/rocm/tensor/gather_nd_impl.cu index 332e9befa3..5d70cbe7d4 100644 --- a/onnxruntime/core/providers/rocm/tensor/gather_nd_impl.cu +++ b/onnxruntime/core/providers/rocm/tensor/gather_nd_impl.cu @@ -52,6 +52,7 @@ __global__ void _GatherNDKernel( template void ComputeSliceOffsetsImpl( + hipStream_t stream, const int64_t batch_dims, const TArray input_dims, const size_t num_slices, @@ -62,7 +63,7 @@ void ComputeSliceOffsetsImpl( const TIndex* const indices_data, // num_slices * num_slice_dims elements int64_t* const input_slice_offsets_data) { // num_slices elements const auto blocks_per_grid = CeilDiv(num_slices, GridDim::maxThreadsPerBlock); - hipLaunchKernelGGL(_ComputeSliceOffsetsKernel, dim3(blocks_per_grid), dim3(GridDim::maxThreadsPerBlock), 0, 0, + hipLaunchKernelGGL(_ComputeSliceOffsetsKernel, dim3(blocks_per_grid), dim3(GridDim::maxThreadsPerBlock), 0, stream, batch_dims, input_dims, num_slices, @@ -76,18 +77,20 @@ void ComputeSliceOffsetsImpl( template void GatherNDImpl( + hipStream_t stream, const size_t num_slices, const void* input_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data) { const auto blocks_per_grid = CeilDiv(num_slices * slice_size, GridDim::maxThreadsPerBlock); - hipLaunchKernelGGL(HIP_KERNEL_NAME(_GatherNDKernel), dim3(blocks_per_grid), dim3(GridDim::maxThreadsPerBlock), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(_GatherNDKernel), dim3(blocks_per_grid), dim3(GridDim::maxThreadsPerBlock), 0, stream, num_slices, static_cast(input_data), static_cast(output_data), slice_size, input_slice_offsets_data); } #define SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(TIndex) \ template void 
ComputeSliceOffsetsImpl( \ + hipStream_t stream, \ const int64_t batch_dims, \ const TArray input_dims, \ const size_t num_slices, \ @@ -99,7 +102,7 @@ void GatherNDImpl( int64_t* const input_slice_offsets_data); #define SPECIALIZED_IMPL(T) \ - template void GatherNDImpl(const size_t num_slices, const void* input_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data); + template void GatherNDImpl(hipStream_t stream, const size_t num_slices, const void* input_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data); SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(int32_t) SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(int64_t) diff --git a/onnxruntime/core/providers/rocm/tensor/transpose.cc b/onnxruntime/core/providers/rocm/tensor/transpose.cc index 9b59b3acf6..38b2a9cef1 100644 --- a/onnxruntime/core/providers/rocm/tensor/transpose.cc +++ b/onnxruntime/core/providers/rocm/tensor/transpose.cc @@ -54,14 +54,15 @@ static std::tuple TryTransposeWithRocblas(const std::vector& p } template -Status TransposeWithRocblas(rocblas_handle rocblas_handle, const Tensor& input, Tensor& output, int M, int N) { +Status TransposeWithRocblas(hipStream_t stream, rocblas_handle rocblas_handle, const Tensor& input, Tensor& output, int M, int N) { typedef typename ToHipType::MappedType HipT; HipT one = ToHipType::FromFloat(1.0f); HipT zero = ToHipType::FromFloat(0.0f); const HipT* input_data = reinterpret_cast(input.Data()); HipT* output_data = reinterpret_cast(output.MutableData()); ROCBLAS_RETURN_IF_ERROR( - rocblasTransposeHelper(rocblas_handle, + rocblasTransposeHelper(stream, + rocblas_handle, rocblas_operation_transpose, rocblas_operation_transpose, M, N, &one, input_data, @@ -76,10 +77,11 @@ Status TransposeWithRocblas(rocblas_handle rocblas_handle, const Tensor& input, Status Transpose::DoTranspose(const Transpose& transpose_kernel, const std::vector& permutations, const Tensor& input, Tensor& output) { - return 
Transpose::DoTranspose(transpose_kernel.GetDeviceProp(), transpose_kernel.RocblasHandle(), permutations, input, output); + return Transpose::DoTranspose(transpose_kernel.GetDeviceProp(), transpose_kernel.Stream(), transpose_kernel.RocblasHandle(), permutations, input, output); } Status Transpose::DoTranspose(const hipDeviceProp_t& prop, + hipStream_t stream, const rocblas_handle rocblas_handle, const std::vector& permutations, const Tensor& input, Tensor& output, const TensorShape* input_shape_override) { @@ -96,11 +98,11 @@ Status Transpose::DoTranspose(const hipDeviceProp_t& prop, int N = std::get<1>(mn); if (M != 0 && N != 0) { if (element_type == utils::GetONNXTensorElementDataType()) { - return TransposeWithRocblas(rocblas_handle, input, output, M, N); + return TransposeWithRocblas(stream, rocblas_handle, input, output, M, N); } else if (element_type == utils::GetONNXTensorElementDataType()) { - return TransposeWithRocblas(rocblas_handle, input, output, M, N); + return TransposeWithRocblas(stream, rocblas_handle, input, output, M, N); } else { - return TransposeWithRocblas(rocblas_handle, input, output, M, N); + return TransposeWithRocblas(stream, rocblas_handle, input, output, M, N); } } } @@ -162,14 +164,14 @@ Status Transpose::DoTranspose(const hipDeviceProp_t& prop, size_t element_size = input.DataType()->Size(); if (CanDoTranspose3D(new_rank, new_input_dims, new_permutations)) { - return Transpose3DImpl(element_size, input_shape, tmp_input_strides, + return Transpose3DImpl(stream, element_size, input_shape, tmp_input_strides, input.DataRaw(), output.MutableDataRaw(), output.Shape().Size()); } else if (CanDoTranspose4D(prop, element_size, new_rank, new_input_dims, new_permutations)) { TArray tmp_output_strides(new_rank); for (auto i = 0; i < new_rank; i++) { tmp_output_strides[i] = new_output_strides[new_permutations[i]]; } - return Transpose4DImpl(element_size, input_shape, tmp_input_strides, input.DataRaw(), + return Transpose4DImpl(stream, element_size, 
input_shape, tmp_input_strides, input.DataRaw(), tmp_output_strides, output.MutableDataRaw(), output.Shape().Size()); } @@ -184,7 +186,7 @@ Status Transpose::DoTranspose(const hipDeviceProp_t& prop, output_strides[i] = fast_divmod(gsl::narrow_cast(new_output_strides[i])); } - auto status = TransposeImpl(element_size, new_rank, input_strides, input.DataRaw(), + auto status = TransposeImpl(stream, element_size, new_rank, input_strides, input.DataRaw(), output_strides, output.MutableDataRaw(), output.Shape().Size()); return status; @@ -208,7 +210,7 @@ Status Transpose::ComputeInternal(OpKernelContext* ctx) const { TensorShape output_shape{output_dims}; Tensor* Y = ctx->Output(0, output_shape); - return DoTranspose(this->GetDeviceProp(), this->RocblasHandle(), *p_perm, X, *Y); + return DoTranspose(this->GetDeviceProp(), this->Stream(), this->RocblasHandle(), *p_perm, X, *Y); } } // namespace rocm diff --git a/onnxruntime/core/providers/rocm/tensor/transpose.h b/onnxruntime/core/providers/rocm/tensor/transpose.h index 08b7fd3436..81410fac72 100644 --- a/onnxruntime/core/providers/rocm/tensor/transpose.h +++ b/onnxruntime/core/providers/rocm/tensor/transpose.h @@ -23,6 +23,7 @@ class Transpose final : public RocmKernel, public TransposeBase { // `input_shape_override` (if provided) overrides the shape of `input` for compute purposes static Status DoTranspose(const hipDeviceProp_t& prop, + hipStream_t stream, const rocblas_handle rocblas_handle, const std::vector& permutations, const Tensor& input, Tensor& output, const TensorShape* input_shape_override = nullptr); diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h index 306c5f945e..2313eb02b8 100644 --- a/onnxruntime/core/providers/shared_library/provider_api.h +++ b/onnxruntime/core/providers/shared_library/provider_api.h @@ -204,7 +204,7 @@ std::unique_ptr CreateCPUAllocator(const OrtMemoryInfo& memory_info) std::unique_ptr 
CreateCUDAAllocator(int16_t device_id, const char* name); std::unique_ptr CreateCUDAPinnedAllocator(int16_t device_id, const char* name); -std::unique_ptr CreateGPUDataTransfer(); +std::unique_ptr CreateGPUDataTransfer(void* stream); std::string GetEnvironmentVar(const std::string& var_name); diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc index aa8540f5b9..fbb061472c 100644 --- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc +++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc @@ -163,8 +163,8 @@ std::unique_ptr CreateCUDAPinnedAllocator(int16_t device_id, const c return g_host->CreateCUDAPinnedAllocator(device_id, name); } -std::unique_ptr CreateGPUDataTransfer() { - return g_host->CreateGPUDataTransfer(); +std::unique_ptr CreateGPUDataTransfer(void* stream) { + return g_host->CreateGPUDataTransfer(stream); } #endif diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 5babda61ce..474d133420 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -4,6 +4,10 @@ // Public wrappers around internal ort interfaces (currently) // In the future the internal implementations could derive from these to remove the need for the wrapper implementations +#ifdef USE_TENSORRT +#include +#endif + #define PROVIDER_DISALLOW_ALL(TypeName) \ TypeName() = delete; \ TypeName(const TypeName&) = delete; \ @@ -127,10 +131,10 @@ struct ProviderHost { #ifdef USE_TENSORRT virtual std::unique_ptr CreateCUDAAllocator(int16_t device_id, const char* name) = 0; virtual std::unique_ptr CreateCUDAPinnedAllocator(int16_t device_id, const char* name) = 0; - virtual std::unique_ptr CreateGPUDataTransfer() = 0; + virtual std::unique_ptr CreateGPUDataTransfer(void* 
stream) = 0; - virtual void cuda__Impl_Cast(const int64_t* input_data, int32_t* output_data, size_t count) = 0; - virtual void cuda__Impl_Cast(const int32_t* input_data, int64_t* output_data, size_t count) = 0; + virtual void cuda__Impl_Cast(void* stream, const int64_t* input_data, int32_t* output_data, size_t count) = 0; + virtual void cuda__Impl_Cast(void* stream, const int32_t* input_data, int64_t* output_data, size_t count) = 0; virtual bool CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) = 0; virtual bool CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) = 0; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 1b420ccd66..ea2d88a749 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -1,6 +1,5 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
- #include #include #include @@ -270,16 +269,18 @@ namespace onnxruntime { namespace cuda { template <> void Impl_Cast( + cudaStream_t stream, const int64_t* input_data, int32_t* output_data, size_t count) { - return g_host->cuda__Impl_Cast(input_data, output_data, count); + return g_host->cuda__Impl_Cast(static_cast(stream), input_data, output_data, count); } template <> void Impl_Cast( + cudaStream_t stream, const int32_t* input_data, int64_t* output_data, size_t count) { - return g_host->cuda__Impl_Cast(input_data, output_data, count); + return g_host->cuda__Impl_Cast(static_cast(stream), input_data, output_data, count); } } // namespace cuda @@ -373,6 +374,12 @@ TensorrtLogger& GetTensorrtLogger() { TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProviderInfo& info) : IExecutionProvider{onnxruntime::kTensorrtExecutionProvider, true}, device_id_(info.device_id) { CUDA_CALL_THROW(cudaSetDevice(device_id_)); + if (info.has_user_compute_stream) { + external_stream_ = true; + stream_ = static_cast(info.user_compute_stream); + } else { + CUDA_CALL_THROW(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); + } // Get environment variables const std::string max_partition_iterations_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kMaxPartitionIterations); @@ -438,7 +445,11 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv } } -TensorrtExecutionProvider::~TensorrtExecutionProvider() {} +TensorrtExecutionProvider::~TensorrtExecutionProvider() { + if (!external_stream_ && stream_) { + CUDA_CALL(cudaStreamDestroy(stream_)); + } +} AllocatorPtr TensorrtExecutionProvider::GetAllocator(int id, OrtMemType mem_type) const { if (mem_type == OrtMemTypeDefault) { @@ -472,7 +483,24 @@ void TensorrtExecutionProvider::RegisterAllocator(std::shared_ptr TensorrtExecutionProvider::GetDataTransfer() const { - return onnxruntime::CreateGPUDataTransfer(); + return 
onnxruntime::CreateGPUDataTransfer(static_cast(GetComputeStream())); +} + +Status TensorrtExecutionProvider::OnRunEnd() { + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(static_cast(GetComputeStream()))); + return Status::OK(); +} + +Status TensorrtExecutionProvider::SetComputeStream(void* stream) { + if (stream != stream_) { + if (stream_) { + CUDA_RETURN_IF_ERROR(cudaStreamDestroy(stream_)); + } + + external_stream_ = true; + stream_ = static_cast(stream); + } + return Status::OK(); } // Convert GraphViewer graph to GraphProto @@ -1158,7 +1186,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& fuse }; // Create compute function - compute_info.compute_func = [](FunctionState state, const OrtCustomOpApi* api, OrtKernelContext* context) { + compute_info.compute_func = [this](FunctionState state, const OrtCustomOpApi* api, OrtKernelContext* context) { Ort::CustomOpApi ort{*api}; TensorrtFuncState* trt_state = reinterpret_cast(state); std::lock_guard lock(*(trt_state->tensorrt_mu_ptr)); @@ -1176,6 +1204,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& fuse std::unordered_map dimension_update; std::unordered_map> tensor_shape_values; nvinfer1::IOptimizationProfile* trt_profile = nullptr; + cudaStream_t stream = static_cast(this->GetComputeStream()); // Load serialized engine const std::string cache_path = GetCachePath(trt_state->engine_cache_path, trt_state->trt_node_name_with_precision); @@ -1240,7 +1269,8 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& fuse switch (tensor_type) { case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: { int32_t* input = new int32_t[shape_size]; - CUDA_RETURN_IF_ERROR(cudaMemcpy(input, ort.GetTensorData(input_tensor), shape_size * sizeof(int32_t), cudaMemcpyDeviceToHost)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input, ort.GetTensorData(input_tensor), shape_size * sizeof(int32_t), cudaMemcpyDeviceToHost, stream)); + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); for (int j = 
0; j < shape_size; ++j) { tensor_shape_values[input_name][j] = input[j]; } @@ -1249,7 +1279,8 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& fuse } case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: { int64_t* input = new int64_t[shape_size]; - CUDA_RETURN_IF_ERROR(cudaMemcpy(input, ort.GetTensorData(input_tensor), shape_size * sizeof(int64_t), cudaMemcpyDeviceToHost)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input, ort.GetTensorData(input_tensor), shape_size * sizeof(int64_t), cudaMemcpyDeviceToHost, stream)); + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); for (int j = 0; j < shape_size; ++j) { tensor_shape_values[input_name][j] = static_cast(input[j]); } @@ -1515,7 +1546,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& fuse } scratch_buffers.push_back(IAllocator::MakeUniquePtr(alloc, input_dim_size * sizeof(int32_t))); buffers[binding_index] = scratch_buffers.back().get(); - cuda::Impl_Cast(input_tensor_ptr, reinterpret_cast(buffers[binding_index]), input_dim_size); + cuda::Impl_Cast(stream, input_tensor_ptr, reinterpret_cast(buffers[binding_index]), input_dim_size); } break; } @@ -1639,7 +1670,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& fuse } // Run TRT inference - if (!trt_context->enqueueV2(&buffers[0], nullptr, nullptr)) { + if (!trt_context->enqueueV2(&buffers[0], stream, nullptr)) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "TensorRT EP execution context enqueue failed."); } @@ -1655,7 +1686,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& fuse if (output_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64) { auto output_tensor_ptr = ort.GetTensorMutableData(output_tensor[i]); if (output_tensor_ptr != nullptr) { - cuda::Impl_Cast(reinterpret_cast(buffers[binding_index]), output_tensor_ptr, output_dim_sizes[i]); + cuda::Impl_Cast(stream, reinterpret_cast(buffers[binding_index]), output_tensor_ptr, output_dim_sizes[i]); } } } diff --git 
a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index 7ca9e8c1fe..5bc13bcab3 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -67,6 +67,8 @@ using unique_pointer = std::unique_ptr; // Information needed to construct trt execution providers. struct TensorrtExecutionProviderInfo { int device_id{0}; + bool has_user_compute_stream{false}; + void* user_compute_stream{nullptr}; }; // Information to construct kernel function state. @@ -116,7 +118,15 @@ class TensorrtExecutionProvider : public IExecutionProvider { void RegisterAllocator(std::shared_ptr allocator_manager) override; + Status OnRunEnd() override; + + Status SetComputeStream(void* stream) override; + + void* GetComputeStream() const override { return static_cast(stream_); } + private: + bool external_stream_ = false; + cudaStream_t stream_ = nullptr; int max_partition_iterations_ = 1000; int min_subgraph_size_ = 1; size_t max_workspace_size_ = 1 << 30; // 1GB diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc index f6b569b0a0..66bc8e517f 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc @@ -13,30 +13,43 @@ namespace onnxruntime { void Shutdown_DeleteRegistry(); struct TensorrtProviderFactory : IExecutionProviderFactory { - TensorrtProviderFactory(int device_id) : device_id_(device_id) {} + TensorrtProviderFactory(const TensorrtExecutionProviderInfo& info) : info_{info} {} ~TensorrtProviderFactory() override {} std::unique_ptr CreateProvider() override; private: - int device_id_; + TensorrtExecutionProviderInfo info_; }; std::unique_ptr TensorrtProviderFactory::CreateProvider() { - TensorrtExecutionProviderInfo info; - 
info.device_id = device_id_; - return onnxruntime::make_unique(info); + return onnxruntime::make_unique(info_); } std::shared_ptr CreateExecutionProviderFactory_Tensorrt(int device_id) { - return std::make_shared(device_id); + TensorrtExecutionProviderInfo info; + info.device_id = device_id; + return std::make_shared(info); +} + +std::shared_ptr CreateExecutionProviderFactory_Tensorrt(const TensorrtExecutionProviderInfo& info) { + return std::make_shared(info); } struct Tensorrt_Provider : Provider { std::shared_ptr CreateExecutionProviderFactory(int device_id) override { - //TODO: This is apparently a bug. The consructor parameter is create-arena-flag, not the device-id - // Will be fixed by PR #2850 - return std::make_shared(device_id); + TensorrtExecutionProviderInfo info; + info.device_id = device_id; + return std::make_shared(info); + } + + std::shared_ptr CreateExecutionProviderFactory(const void* provider_options) override { + auto& options = *reinterpret_cast(provider_options); + TensorrtExecutionProviderInfo info; + info.device_id = options.device_id; + info.has_user_compute_stream = options.has_user_compute_stream; + info.user_compute_stream = options.user_compute_stream; + return std::make_shared(info); } void Shutdown() override { diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index aa89b84c40..65316cc686 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -429,6 +429,11 @@ common::Status InferenceSession::RegisterExecutionProvider(std::unique_ptrSetComputeStream(trt_ep->GetComputeStream()); + } } VLOGS(*session_logger_, 1) << "Adding execution provider of type: " << provider_type; diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 2ee7949503..3bc51f11da 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ 
-1850,6 +1850,15 @@ ORT_API(void, OrtApis::ReleaseArenaCfg, _Frees_ptr_opt_ OrtArenaCfg* ptr) { delete ptr; } +#if defined(ORT_MINIMAL_BUILD) +ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_TensorRT, + _In_ OrtSessionOptions* options, _In_ const OrtTensorRTProviderOptions* tensorrt_options) { + ORT_UNUSED_PARAMETER(options); + ORT_UNUSED_PARAMETER(tensorrt_options); + return CreateStatus(ORT_FAIL, "TensorRT execution provider is not enabled."); +} +#endif + static constexpr OrtApiBase ort_api_base = { &OrtApis::GetApi, &OrtApis::GetVersionString, @@ -2084,6 +2093,7 @@ static constexpr OrtApi ort_api_1_to_7 = { // Version 7 - In development, feel free to add/remove/rearrange here &OrtApis::ModelMetadataGetGraphDescription, + &OrtApis::SessionOptionsAppendExecutionProvider_TensorRT, }; // Assert to do a limited check to ensure Version 1 of OrtApi never changes (will detect an addition or deletion but not if they cancel out each other) diff --git a/onnxruntime/core/session/ort_apis.h b/onnxruntime/core/session/ort_apis.h index fa8527d7c9..2418ff8909 100644 --- a/onnxruntime/core/session/ort_apis.h +++ b/onnxruntime/core/session/ort_apis.h @@ -255,4 +255,6 @@ ORT_API_STATUS_IMPL(SetGlobalDenormalAsZero, _Inout_ OrtThreadingOptions* option ORT_API_STATUS_IMPL(CreateArenaCfg, _In_ size_t max_mem, int arena_extend_strategy, int initial_chunk_size_bytes, int max_dead_bytes_per_chunk, _Outptr_ OrtArenaCfg** out); ORT_API(void, ReleaseArenaCfg, _Frees_ptr_opt_ OrtArenaCfg*); +ORT_API_STATUS_IMPL(SessionOptionsAppendExecutionProvider_TensorRT, + _In_ OrtSessionOptions* options, _In_ const OrtTensorRTProviderOptions* tensorrt_options); } // namespace OrtApis diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 32c7e0f317..f08af799b9 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -188,7 +188,7 @@ std::string nuphar_settings; 
const OrtDevice::DeviceType OrtDevice::GPU; namespace onnxruntime { -std::shared_ptr CreateExecutionProviderFactory_Tensorrt(int device_id); +std::shared_ptr CreateExecutionProviderFactory_Tensorrt(const OrtTensorRTProviderOptions* params); std::shared_ptr CreateExecutionProviderFactory_MIGraphX(int device_id); std::shared_ptr CreateExecutionProviderFactory_Dnnl(int use_arena); std::shared_ptr CreateExecutionProviderFactory_OpenVINO(const OrtOpenVINOProviderOptions* params); @@ -501,7 +501,8 @@ static void RegisterExecutionProviders(InferenceSession* sess, const std::vector sess->GetSessionOptions().enable_cpu_mem_arena)); } else if (type == kTensorrtExecutionProvider) { #ifdef USE_TENSORRT - RegisterExecutionProvider(sess, *onnxruntime::CreateExecutionProviderFactory_Tensorrt(0)); + OrtTensorRTProviderOptions params{0, 0, nullptr}; + RegisterExecutionProvider(sess, *onnxruntime::CreateExecutionProviderFactory_Tensorrt(&params)); #endif } else if (type == kMIGraphXExecutionProvider) { #ifdef USE_MIGRAPHX @@ -845,7 +846,11 @@ void addGlobalMethods(py::module& m, Environment& env) { onnxruntime::CreateExecutionProviderFactory_OpenVINO(openvino_device_type, false, "", 8), #endif #ifdef USE_TENSORRT - onnxruntime::CreateExecutionProviderFactory_Tensorrt(0), + onnxruntime::CreateExecutionProviderFactory_Tensorrt( + [&]() { + TensorrtExecutionProviderInfo info{}; + return info; + }()), #endif #ifdef USE_MIGRAPHX onnxruntime::CreateExecutionProviderFactory_MIGraphX(0), diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index f436e3020e..4db59b3bc8 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -330,7 +330,8 @@ void RunModelWithBindingMatMul(InferenceSession& session_object, std::unique_ptr cpu_tensor = onnxruntime::make_unique(element_type, shape, cpu_allocator); - st = GPUDataTransfer().CopyTensor(rtensor, *cpu_tensor.get(),
0); + cudaStream_t stream = static_cast(static_cast(TestCudaExecutionProvider())->GetComputeStream()); + st = GPUDataTransfer(stream).CopyTensor(rtensor, *cpu_tensor.get(), 0); ASSERT_TRUE(st.IsOK()); OrtValue ml_value; ml_value.Init(cpu_tensor.release(), diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index 5c424a6104..4e4830dd08 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -304,8 +304,22 @@ int real_main(int argc, char* argv[], Ort::Env& env) { if (enable_tensorrt) { #ifdef USE_TENSORRT - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Tensorrt(sf, device_id)); - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(sf, device_id)); + OrtTensorRTProviderOptions tensorrt_options{ + 0, + 0, + nullptr}; + + OrtCUDAProviderOptions cuda_options{ + 0, + OrtCudnnConvAlgoSearch::EXHAUSTIVE, + std::numeric_limits::max(), + 0, + true, + 0, + nullptr}; + + sf.AppendExecutionProvider_TensorRT(tensorrt_options); + sf.AppendExecutionProvider_CUDA(cuda_options); #else fprintf(stderr, "TensorRT is not supported in this build"); return -1; @@ -328,7 +342,9 @@ int real_main(int argc, char* argv[], Ort::Env& env) { OrtCudnnConvAlgoSearch::EXHAUSTIVE, std::numeric_limits::max(), 0, - true}; + true, + 0, + nullptr}; sf.AppendExecutionProvider_CUDA(cuda_options); #else fprintf(stderr, "CUDA is not supported in this build"); diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index a573f05691..94a0a97e11 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -46,7 +46,9 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device static_cast(performance_test_config.run_config.cudnn_conv_algo), std::numeric_limits::max(), 0, - !performance_test_config.run_config.do_cuda_copy_in_separate_stream}; + !performance_test_config.run_config.do_cuda_copy_in_separate_stream, + 
0, + nullptr}; session_options.AppendExecutionProvider_CUDA(cuda_options); #else ORT_THROW("CUDA is not supported in this build\n"); diff --git a/onnxruntime/test/providers/cuda/reduction_functions_test.cc b/onnxruntime/test/providers/cuda/reduction_functions_test.cc index bfd384b563..23a37a5e80 100644 --- a/onnxruntime/test/providers/cuda/reduction_functions_test.cc +++ b/onnxruntime/test/providers/cuda/reduction_functions_test.cc @@ -71,18 +71,21 @@ void TestReduceRowToScalarApis(int size, float relative_error_tolerance = 1e-4f) cudaMemcpy(device_input.get(), input.data(), size * sizeof(float), cudaMemcpyHostToDevice); ASSERT_STATUS_OK(reduce_sum( + 0, device_input.get(), device_output_sum.get(), size, buffer.get(), buffer_size_in_bytes)); ASSERT_STATUS_OK(reduce_square_sum( + 0, device_input.get(), device_output_square_sum.get(), size, buffer.get(), buffer_size_in_bytes)); ASSERT_STATUS_OK(reduce_mean( + 0, device_input.get(), device_output_mean.get(), size, @@ -121,11 +124,11 @@ void TestReduceRowsToRow(int m, int n, bool reset_initial_output, float relative if (!reset_initial_output) { // manually initialize output data - Fill(d_out.get(), initial_value, n); + Fill(0, d_out.get(), initial_value, n); } ASSERT_STATUS_OK(reduce_matrix_rows( - d_in.get(), d_out.get(), + 0, d_in.get(), d_out.get(), m, n, reset_initial_output)); @@ -164,6 +167,7 @@ void TestReduceColumnsToColumn(int m, int n, float relative_error_tolerance = 1e auto d_buffer = AllocateDeviceMemory(buffer_size_in_bytes); ASSERT_STATUS_OK(reduce_matrix_columns( + 0, d_in.get(), d_out.get(), m, n, d_buffer.get(), buffer_size_in_bytes)); @@ -223,6 +227,7 @@ TEST(ReductionFunctionsTest, BufferOffsets) { cudaMemcpy(d_input.get(), input.data(), m * n * sizeof(double), cudaMemcpyHostToDevice); ASSERT_STATUS_OK(reduce_matrix_columns( + 0, d_input.get(), d_output.get(), m, n, d_buffer.get() + buffer_offset, @@ -250,7 +255,7 @@ TEST(ReductionFunctionsTest, InvalidBufferSize) { cudaMemcpy(d_input.get(), 
input.data(), m * n * sizeof(float), cudaMemcpyHostToDevice); const auto status = - reduce_matrix_columns(d_input.get(), d_output.get(), m, n, d_buffer.get(), buffer_size_in_bytes); + reduce_matrix_columns(0, d_input.get(), d_output.get(), m, n, d_buffer.get(), buffer_size_in_bytes); ASSERT_FALSE(status.IsOK()); } diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index 786eccced9..4116cb8f48 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -724,6 +724,9 @@ TEST(CApiTest, io_binding_cuda) { Ort::Value bound_y = Ort::Value::CreateTensor(info_cuda, reinterpret_cast(output_data.get()), expected_y.size(), expected_y_shape.data(), expected_y_shape.size()); + // Synchronize to make sure the copy on default stream is done since TensorRT isn't using default stream. + cudaStreamSynchronize(nullptr); + Ort::IoBinding binding(session); binding.BindInput("X", bound_x); binding.BindOutput("Y", bound_y); diff --git a/onnxruntime/test/shared_lib/utils.cc b/onnxruntime/test/shared_lib/utils.cc index b27c9e8228..cfa7c7139b 100644 --- a/onnxruntime/test/shared_lib/utils.cc +++ b/onnxruntime/test/shared_lib/utils.cc @@ -27,6 +27,7 @@ void MyCustomKernel::Compute(OrtKernelContext* context) { // Do computation #ifdef USE_CUDA cuda_add(size, out, X, Y); + cudaStreamSynchronize(nullptr); #else for (int64_t i = 0; i < size; i++) { out[i] = X[i] + Y[i]; diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index a0d69269a2..897d14ef79 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -25,7 +25,7 @@ std::shared_ptr CreateExecutionProviderFactory_OpenVI std::shared_ptr CreateExecutionProviderFactory_Nuphar(bool, const char*); std::shared_ptr CreateExecutionProviderFactory_Nnapi(uint32_t); std::shared_ptr CreateExecutionProviderFactory_Rknpu(); -std::shared_ptr
CreateExecutionProviderFactory_Tensorrt(int device_id); +std::shared_ptr CreateExecutionProviderFactory_Tensorrt(const OrtTensorRTProviderOptions* params); std::shared_ptr CreateExecutionProviderFactory_MIGraphX(int device_id); std::shared_ptr CreateExecutionProviderFactory_ACL(int use_arena); std::shared_ptr CreateExecutionProviderFactory_ArmNN(int use_arena); @@ -43,7 +43,8 @@ std::unique_ptr DefaultCpuExecutionProvider(bool enable_aren std::unique_ptr DefaultTensorrtExecutionProvider() { #ifdef USE_TENSORRT - if (auto factory = CreateExecutionProviderFactory_Tensorrt(0)) + OrtTensorRTProviderOptions params{0, 0, nullptr}; + if (auto factory = CreateExecutionProviderFactory_Tensorrt(&params)) return factory->CreateProvider(); #endif return nullptr; diff --git a/orttraining/orttraining/test/training_ops/cuda/cuda_utils_test.cc b/orttraining/orttraining/test/training_ops/cuda/cuda_utils_test.cc index 3a4142db6d..51141b3eb4 100644 --- a/orttraining/orttraining/test/training_ops/cuda/cuda_utils_test.cc +++ b/orttraining/orttraining/test/training_ops/cuda/cuda_utils_test.cc @@ -31,7 +31,7 @@ void TestFillCorrectness(size_t num_elements, TElement value) { std::unique_ptr buffer{ reinterpret_cast(raw_buffer)}; - Fill(buffer.get(), value, num_elements); + Fill(nullptr, buffer.get(), value, num_elements); auto cpu_buffer = onnxruntime::make_unique(num_elements); CUDA_CALL_THROW(cudaMemcpy(cpu_buffer.get(), buffer.get(), num_elements * sizeof(TElement), cudaMemcpyKind::cudaMemcpyDeviceToHost)); diff --git a/orttraining/orttraining/training_ops/cuda/activation/activations_grad.cc b/orttraining/orttraining/training_ops/cuda/activation/activations_grad.cc index 7e4f344f44..1049079082 100644 --- a/orttraining/orttraining/training_ops/cuda/activation/activations_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/activation/activations_grad.cc @@ -24,13 +24,13 @@ namespace cuda { Status x::ComputeInternal(OpKernelContext* context) const { \ BinaryElementwisePreparation prepare;
\ ORT_RETURN_IF_ERROR(Prepare(context, &prepare)); \ - CudaAsyncBuffer func_ctx(this, MakeFuncCtx(), 1); \ - if (!std::is_same::value) ORT_RETURN_IF_ERROR(func_ctx.CopyToGpu()); \ + Ctx##x func_ctx = MakeFuncCtx(); \ Impl_##x::MappedType>( \ + Stream(), \ reinterpret_cast::MappedType*>(prepare.lhs_tensor->template Data()), \ reinterpret_cast::MappedType*>(prepare.rhs_tensor->template Data()), \ reinterpret_cast::MappedType*>(prepare.output_tensor->template MutableData()), \ - func_ctx.GpuPtr(), prepare.output_tensor->Shape().Size()); \ + &func_ctx, prepare.output_tensor->Shape().Size()); \ return Status::OK(); \ } diff --git a/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.cu index caa38cac0d..2e7e3bacc2 100644 --- a/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.cu @@ -33,14 +33,15 @@ struct OP_ReluGrad : public CtxReluGrad { #define BINARY_ELEMENTWISE_IMPL(name) \ BINARY_ELEMENTWISE_IMPL_DECLARATION(name) { \ - BinaryElementWiseNoBroadcastImpl(lhs_data, rhs_data, \ + BinaryElementWiseNoBroadcastImpl(stream, \ + lhs_data, rhs_data, \ output_data, \ *reinterpret_cast*>(func_ctx), \ count); \ } #define SPECIALIZED_BINARY_ELEMENTWISE_IMPL(name, T) \ - template void Impl_##name(const T* lhs_data, const T* rhs_data, T* output_data, const Ctx##name* func_ctx, size_t count); + template void Impl_##name(cudaStream_t stream, const T* lhs_data, const T* rhs_data, T* output_data, const Ctx##name* func_ctx, size_t count); #define SPECIALIZED_BINARY_ELEMENTWISE_IMPL_HFD(x) \ SPECIALIZED_BINARY_ELEMENTWISE_IMPL(x, half) \ diff --git a/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.h b/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.h index bc5e292652..da23cb595b 100644 --- 
a/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.h +++ b/orttraining/orttraining/training_ops/cuda/activation/activations_grad_impl.h @@ -18,7 +18,8 @@ typedef onnxruntime::cuda::CtxNull CtxReluGrad; #define BINARY_ELEMENTWISE_IMPL_DECLARATION(name) \ template \ - void Impl_##name(const T* lhs_data, \ + void Impl_##name(cudaStream_t stream, \ + const T* lhs_data, \ const T* rhs_data, \ T* output_data, \ const Ctx##name* func_ctx, \ diff --git a/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad.cc b/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad.cc index e219f97951..b948c5c77c 100644 --- a/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad.cc @@ -39,12 +39,14 @@ ONNX_OPERATOR_KERNEL_EX( template template void BiasGeluGrad_dX::KernelLaunchDispatcher::operator()( + cudaStream_t stream, int64_t input_size, int64_t bias_size, const Tensor& dY, const Tensor& X, const Tensor& B, Tensor& dX) const { using CudaT = typename ToCudaType::MappedType; LaunchBiasGeluGradDxKernel( + stream, input_size, bias_size, reinterpret_cast(dY.template Data()), reinterpret_cast(X.template Data()), @@ -78,7 +80,7 @@ Status BiasGeluGrad_dX::ComputeInternal(OpKernelContext* co KernelLaunchDispatcher, ALL_IEEE_FLOAT_DATA_TYPES> dispatcher{X->GetElementType()}; - dispatcher.Invoke(input_size, bias_size, *dY, *X, *B, *dX); + dispatcher.Invoke(Stream(), input_size, bias_size, *dY, *X, *B, *dX); return Status::OK(); } diff --git a/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad.h b/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad.h index 695dd85b64..695739d1fd 100644 --- a/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad.h +++ b/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad.h @@ -18,6 +18,7 @@ class BiasGeluGrad_dX : public CudaKernel { template struct 
KernelLaunchDispatcher { void operator()( + cudaStream_t stream, int64_t input_size, int64_t bias_size, const Tensor& dY, const Tensor& X, const Tensor& B, Tensor& dX) const; diff --git a/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.cu index 2007036db7..d6fae84ca4 100644 --- a/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.cu @@ -56,6 +56,7 @@ __global__ void BiasGeluGradDxKernel(int64_t bias_size, const T* dY, const T* X, template void LaunchBiasGeluGradDxKernel( + cudaStream_t stream, int64_t input_size, int64_t bias_size, const T* dY, const T* X, const T* B, T* dX) { // given a 2D grid of blocks: @@ -70,13 +71,13 @@ void LaunchBiasGeluGradDxKernel( const dim3 grid_dim{static_cast(grid_width), static_cast(grid_height)}; BiasGeluGradDxKernel - <<>>(bias_size, dY, X, B, dX); + <<>>(bias_size, dY, X, B, dX); } // explicit instantiations #define SPECIALIZED_BIAS_GELU_GRAD_IMPL(T, GeluComputationMode) \ template void LaunchBiasGeluGradDxKernel( \ - int64_t input_size, int64_t bias_size, \ + cudaStream_t stream, int64_t input_size, int64_t bias_size, \ const T* dY, const T* X, const T* B, T* dX) SPECIALIZED_BIAS_GELU_GRAD_IMPL(half, gelu_computation_mode::Default); diff --git a/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.h b/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.h index 6625bff938..a2edbb1749 100644 --- a/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.h +++ b/orttraining/orttraining/training_ops/cuda/activation/bias_gelu_grad_impl.h @@ -14,6 +14,7 @@ namespace cuda { // - input_size % bias_size == 0 template void LaunchBiasGeluGradDxKernel( + cudaStream_t stream, int64_t input_size, int64_t bias_size, const T* dY, const T* X, const T* B, T* dX); diff --git 
a/orttraining/orttraining/training_ops/cuda/collective/adasum_kernels.cc b/orttraining/orttraining/training_ops/cuda/collective/adasum_kernels.cc index d43bce09ca..053d5fee1a 100644 --- a/orttraining/orttraining/training_ops/cuda/collective/adasum_kernels.cc +++ b/orttraining/orttraining/training_ops/cuda/collective/adasum_kernels.cc @@ -36,8 +36,8 @@ Status AdasumAllReduce::ComputeInternal(OpKernelContext* context) const { for (int i = 0; i < num_tensors; ++i) { const Tensor* x_tensor = context->Input(i); - CUDA_CALL(cudaMemcpy((uint8_t*)data_buffer_ptr.get() + tensor_offsets[i], x_tensor->DataRaw(), - tensor_sizes[i], cudaMemcpyDeviceToHost)); + CUDA_CALL(cudaMemcpyAsync((uint8_t*)data_buffer_ptr.get() + tensor_offsets[i], x_tensor->DataRaw(), + tensor_sizes[i], cudaMemcpyDeviceToHost, Stream())); } auto recv_buffer = allocator->Alloc(total_recv_buffer_len); @@ -52,8 +52,8 @@ Status AdasumAllReduce::ComputeInternal(OpKernelContext* context) const { for (int i = 0; i < num_tensors; i++) { Tensor* y_tensor = context->Output(i, context->Input(i)->Shape()); - CUDA_CALL(cudaMemcpy(y_tensor->MutableDataRaw(), (uint8_t*)data_buffer + tensor_offsets[i], - tensor_sizes[i], cudaMemcpyHostToDevice)); + CUDA_CALL(cudaMemcpyAsync(y_tensor->MutableDataRaw(), (uint8_t*)data_buffer + tensor_offsets[i], + tensor_sizes[i], cudaMemcpyHostToDevice, Stream())); } return Status::OK(); } diff --git a/orttraining/orttraining/training_ops/cuda/collective/nccl_kernels.cc b/orttraining/orttraining/training_ops/cuda/collective/nccl_kernels.cc index 41775ae30e..7bd7dabdbc 100644 --- a/orttraining/orttraining/training_ops/cuda/collective/nccl_kernels.cc +++ b/orttraining/orttraining/training_ops/cuda/collective/nccl_kernels.cc @@ -10,7 +10,6 @@ NcclAllReduce::NcclAllReduce(const OpKernelInfo& info) : NcclKernel(info) { } Status NcclAllReduce::ComputeInternal(OpKernelContext* context) const { - cudaStream_t stream = nullptr; // Default stream ncclComm_t comm = nccl_->Comm(group_type_); const 
void* input_data = context->Input(0)->DataRaw(); @@ -32,7 +31,7 @@ Status NcclAllReduce::ComputeInternal(OpKernelContext* context) const { ncclDataType_t dtype = GetNcclDataType(onnx_type); #ifdef ORT_USE_NCCL - NCCL_RETURN_IF_ERROR(ncclAllReduce(input_data, output_data, input_count, dtype, ncclSum, comm, stream)); + NCCL_RETURN_IF_ERROR(ncclAllReduce(input_data, output_data, input_count, dtype, ncclSum, comm, Stream())); #endif return Status::OK(); } @@ -41,7 +40,6 @@ NcclAllGather::NcclAllGather(const OpKernelInfo& info) : NcclKernel(info) { } Status NcclAllGather::ComputeInternal(OpKernelContext* context) const { - cudaStream_t stream = nullptr; // Default stream ncclComm_t comm = nccl_->Comm(group_type_); const int rank = nccl_->Rank(group_type_); const int size = nccl_->Size(group_type_); @@ -86,7 +84,7 @@ Status NcclAllGather::ComputeInternal(OpKernelContext* context) const { ORT_ENFORCE(offset + tensor_bytes <= rank_end, "A single rank must be responsible for the entire tensor."); void* fusion_data_at_offset = (int8_t*)fusion_data + offset; const void* input_data = input_tensor->DataRaw(); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(fusion_data_at_offset, input_data, tensor_bytes, cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(fusion_data_at_offset, input_data, tensor_bytes, cudaMemcpyDeviceToDevice, Stream())); } offset += tensor_bytes; @@ -95,7 +93,7 @@ Status NcclAllGather::ComputeInternal(OpKernelContext* context) const { // AllGather. const void* fusion_data_rank_offset = (const int8_t*)fusion_data + rank_start; #ifdef ORT_USE_NCCL - NCCL_RETURN_IF_ERROR(ncclAllGather(fusion_data_rank_offset, fusion_data, rank_count, dtype, comm, stream)); + NCCL_RETURN_IF_ERROR(ncclAllGather(fusion_data_rank_offset, fusion_data, rank_count, dtype, comm, Stream())); #endif // Copy AllGather results to outputs. 
@@ -113,12 +111,12 @@ Status NcclAllGather::ComputeInternal(OpKernelContext* context) const { if (offset < rank_start || offset >= rank_end) { void* output_data = output_tensor->MutableDataRaw(); const void* fusion_data_at_offset = (const int8_t*)fusion_data + offset; - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, fusion_data_at_offset, tensor_bytes, cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, fusion_data_at_offset, tensor_bytes, cudaMemcpyDeviceToDevice, Stream())); } else { const void* input_data = input_tensor->DataRaw(); void* output_data = output_tensor->MutableDataRaw(); if (input_data != output_data) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, input_data, tensor_bytes, cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, input_data, tensor_bytes, cudaMemcpyDeviceToDevice, Stream())); } } @@ -132,7 +130,6 @@ NcclReduceScatter::NcclReduceScatter(const OpKernelInfo& info) : NcclKernel(info } Status NcclReduceScatter::ComputeInternal(OpKernelContext* context) const { - cudaStream_t stream = nullptr; // Default stream ncclComm_t comm = nccl_->Comm(group_type_); const int rank = nccl_->Rank(group_type_); const int size = nccl_->Size(group_type_); @@ -174,7 +171,7 @@ Status NcclReduceScatter::ComputeInternal(OpKernelContext* context) const { void* fusion_data_at_offset = (int8_t*)fusion_data + offset; const void* input_data = input_tensor->DataRaw(); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(fusion_data_at_offset, input_data, tensor_bytes, cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(fusion_data_at_offset, input_data, tensor_bytes, cudaMemcpyDeviceToDevice, Stream())); offset += tensor_bytes; } @@ -182,7 +179,7 @@ Status NcclReduceScatter::ComputeInternal(OpKernelContext* context) const { // ReduceScatter. 
void* fusion_data_rank_offset = (int8_t*)fusion_data + rank_start; #ifdef ORT_USE_NCCL - NCCL_RETURN_IF_ERROR(ncclReduceScatter(fusion_data, fusion_data_rank_offset, rank_count, dtype, ncclSum, comm, stream)); + NCCL_RETURN_IF_ERROR(ncclReduceScatter(fusion_data, fusion_data_rank_offset, rank_count, dtype, ncclSum, comm, Stream())); #endif // Copy this rank's ReduceScatter results to outputs. offset = 0; @@ -200,12 +197,12 @@ Status NcclReduceScatter::ComputeInternal(OpKernelContext* context) const { ORT_ENFORCE(offset + tensor_bytes <= rank_end, "A single rank must be responsible for the entire tensor."); void* output_data = output_tensor->MutableDataRaw(); const void* fusion_data_at_offset = (const int8_t*)fusion_data + offset; - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, fusion_data_at_offset, tensor_bytes, cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, fusion_data_at_offset, tensor_bytes, cudaMemcpyDeviceToDevice, Stream())); } else { const void* input_data = input_tensor->DataRaw(); void* output_data = output_tensor->MutableDataRaw(); if (input_data != output_data) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, input_data, tensor_bytes, cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, input_data, tensor_bytes, cudaMemcpyDeviceToDevice, Stream())); } } diff --git a/orttraining/orttraining/training_ops/cuda/communication/recv.cc b/orttraining/orttraining/training_ops/cuda/communication/recv.cc index 1c42b0b8d2..fb9b383cf0 100644 --- a/orttraining/orttraining/training_ops/cuda/communication/recv.cc +++ b/orttraining/orttraining/training_ops/cuda/communication/recv.cc @@ -89,11 +89,11 @@ void Recv::ReceiveData( assert(tensor_offset_in_bytes + tensor->SizeInBytes() <= aggregated_aligned_tensor_bytes); // Copy data out from buffer. 
#if defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) - CUDA_CALL(cudaMemcpy(tensor->MutableDataRaw(), buffer.get() + tensor_offset_in_bytes, - tensor->SizeInBytes(), cudaMemcpyDeviceToDevice)); + CUDA_CALL(cudaMemcpyAsync(tensor->MutableDataRaw(), buffer.get() + tensor_offset_in_bytes, + tensor->SizeInBytes(), cudaMemcpyDeviceToDevice, Stream())); #else - CUDA_CALL(cudaMemcpy(tensor->MutableDataRaw(), buffer.get() + tensor_offset_in_bytes, - tensor->SizeInBytes(), cudaMemcpyHostToDevice)); + CUDA_CALL(cudaMemcpyAsync(tensor->MutableDataRaw(), buffer.get() + tensor_offset_in_bytes, + tensor->SizeInBytes(), cudaMemcpyHostToDevice, Stream())); #endif #ifndef NDEBUG diff --git a/orttraining/orttraining/training_ops/cuda/communication/send.cc b/orttraining/orttraining/training_ops/cuda/communication/send.cc index 72981ce040..6a5bc71fd3 100644 --- a/orttraining/orttraining/training_ops/cuda/communication/send.cc +++ b/orttraining/orttraining/training_ops/cuda/communication/send.cc @@ -66,11 +66,11 @@ void Send::SendData( #endif #if defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) - CUDA_CALL(cudaMemcpy(buffer.get() + tensor_offsets_in_bytes[i], tensor->DataRaw(), - tensor_sizes_in_bytes[i], cudaMemcpyDeviceToDevice)); + CUDA_CALL(cudaMemcpyAsync(buffer.get() + tensor_offsets_in_bytes[i], tensor->DataRaw(), + tensor_sizes_in_bytes[i], cudaMemcpyDeviceToDevice, Stream())); #else - CUDA_CALL(cudaMemcpy(buffer.get() + tensor_offsets_in_bytes[i], tensor->DataRaw(), - tensor_sizes_in_bytes[i], cudaMemcpyDeviceToHost)); + CUDA_CALL(cudaMemcpyAsync(buffer.get() + tensor_offsets_in_bytes[i], tensor->DataRaw(), + tensor_sizes_in_bytes[i], cudaMemcpyDeviceToHost, Stream())); #endif } diff --git a/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.cc b/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.cc index 51d2f7bfbb..5ff75fb54b 100644 --- a/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.cc +++ 
b/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.cc @@ -92,7 +92,8 @@ Status SoftmaxCrossEntropyLoss::ComputeInternal(OpKernelContext* ctx) co } // calculate logsoftmax - auto status = SoftMaxComputeHelper(logit_data, + auto status = SoftMaxComputeHelper(Stream(), + logit_data, logit_reshape, log_prob_data, CudnnHandle(), @@ -107,8 +108,8 @@ Status SoftmaxCrossEntropyLoss::ComputeInternal(OpKernelContext* ctx) co IAllocatorUniquePtr weight_data_nd = GetScratchBuffer(N_D); T* weight_data_nd_data = weight_data_nd.get(); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(weight_data_nd_data, 0, N_D * sizeof(T))); - ComputeWeightsSoftmaxCrossEntropyImpl(label_data, weight_data, N_D, C, ignore_index_, weight_data_nd_data); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(weight_data_nd_data, 0, N_D * sizeof(T), Stream())); + ComputeWeightsSoftmaxCrossEntropyImpl(Stream(), label_data, weight_data, N_D, C, ignore_index_, weight_data_nd_data); auto normalize_factor_data = GetScratchBuffer(1); if (reduction_ == ReductionType::MEAN) { @@ -119,6 +120,7 @@ Status SoftmaxCrossEntropyLoss::ComputeInternal(OpKernelContext* ctx) co IAllocatorUniquePtr reduction_buffer = GetScratchBuffer( buffer_size); ORT_RETURN_IF_ERROR(reduce_sum( + Stream(), weight_data_nd_data, normalize_factor_data.get(), static_cast(N_D), @@ -126,10 +128,11 @@ Status SoftmaxCrossEntropyLoss::ComputeInternal(OpKernelContext* ctx) co buffer_size)); } else { const T normalize_factor = static_cast(1); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice, Stream())); } - SoftmaxCrossEntropyLossImpl(log_prob_data, + SoftmaxCrossEntropyLossImpl(Stream(), + log_prob_data, label_data, weight_data_nd_data, normalize_factor_data.get(), @@ -148,7 +151,7 @@ Status SoftmaxCrossEntropyLoss::ComputeInternal(OpKernelContext* 
ctx) co transpose_output.GetMutable()->Reshape(log_prob->Shape()); log_prob->Reshape(log_prob_shape); ORT_RETURN_IF_ERROR(cuda::Transpose::DoTranspose(cuda::Transpose(info), permutations, *log_prob, *transpose_output.GetMutable())); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(log_prob_data, transposed_data, sizeof(T) * logit_shape.Size(), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(log_prob_data, transposed_data, sizeof(T) * logit_shape.Size(), cudaMemcpyDeviceToDevice, Stream())); log_prob->Reshape(new_shape); } @@ -209,8 +212,8 @@ Status SoftmaxCrossEntropyLossGrad::ComputeInternal(OpKernelContext* ctx IAllocatorUniquePtr weight_data_nd = GetScratchBuffer(N_D); T* weight_data_nd_data = weight_data_nd.get(); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(weight_data_nd_data, 0, N_D * sizeof(T))); - ComputeWeightsSoftmaxCrossEntropyImpl(label_data, weight_data, N_D, C, ignore_index_, weight_data_nd_data); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(weight_data_nd_data, 0, N_D * sizeof(T), Stream())); + ComputeWeightsSoftmaxCrossEntropyImpl(Stream(), label_data, weight_data, N_D, C, ignore_index_, weight_data_nd_data); auto normalize_factor_data = GetScratchBuffer(1); if (reduction_ == ReductionType::MEAN) { // Compute buffer size in byte for reduction APIs. 
@@ -220,6 +223,7 @@ Status SoftmaxCrossEntropyLossGrad::ComputeInternal(OpKernelContext* ctx IAllocatorUniquePtr reduction_buffer = GetScratchBuffer( buffer_size); ORT_RETURN_IF_ERROR(reduce_sum( + Stream(), weight_data_nd_data, normalize_factor_data.get(), static_cast(N_D), @@ -227,10 +231,11 @@ Status SoftmaxCrossEntropyLossGrad::ComputeInternal(OpKernelContext* ctx buffer_size)); } else { const T normalize_factor = static_cast(1); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice, Stream())); } - SoftmaxCrossEntropyLossGradImpl(dY_data, + SoftmaxCrossEntropyLossGradImpl(Stream(), + dY_data, log_prob_data, label_data, weight_data_nd_data, @@ -250,7 +255,7 @@ Status SoftmaxCrossEntropyLossGrad::ComputeInternal(OpKernelContext* ctx d_logit->Reshape(logit_shape); ORT_RETURN_IF_ERROR(cuda::Transpose::DoTranspose(cuda::Transpose(info), permutations, *d_logit, *transpose_output.GetMutable())); auto* transposed_data = (*transpose_output.GetMutable()).template Data(); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(d_logit_data, transposed_data, sizeof(T) * probability_shape.Size(), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(d_logit_data, transposed_data, sizeof(T) * probability_shape.Size(), cudaMemcpyDeviceToDevice, Stream())); d_logit->Reshape(new_shape); } diff --git a/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.cu b/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.cu index aeda6fee4c..02bd7ebf25 100644 --- a/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.cu @@ -25,6 +25,7 @@ __global__ void _ComputeWeightsSoftmaxCrossEntropy( template void ComputeWeightsSoftmaxCrossEntropyImpl( 
+ cudaStream_t stream, const Tin* label, const T* weight, size_t count, @@ -35,7 +36,7 @@ void ComputeWeightsSoftmaxCrossEntropyImpl( CUDA_LONG N_D = static_cast(count); CUDA_LONG C = static_cast(label_depth); CUDA_LONG II = static_cast(ignore_index); - _ComputeWeightsSoftmaxCrossEntropy<<>>( + _ComputeWeightsSoftmaxCrossEntropy<<>>( weight_data_nd, label, weight, @@ -65,6 +66,7 @@ __global__ void _WeightedSoftmaxCrossEntropyLoss( template void SoftmaxCrossEntropyLossImpl( + cudaStream_t stream, const T* log_prob, const Tin* label, const T* weight, @@ -77,7 +79,7 @@ void SoftmaxCrossEntropyLossImpl( CUDA_LONG N_D = static_cast(count); CUDA_LONG C = static_cast(label_depth); CUDA_LONG II = static_cast(ignore_index); - _WeightedSoftmaxCrossEntropyLoss<<>>( + _WeightedSoftmaxCrossEntropyLoss<<>>( log_prob, label, weight, @@ -90,6 +92,7 @@ void SoftmaxCrossEntropyLossImpl( #define SPECIALIZED_IMPL_SoftMaxEntropyLossImpl(T, Tin) \ template void SoftmaxCrossEntropyLossImpl( \ + cudaStream_t stream, \ const T* log_prob, \ const Tin* label, \ const T* weight, \ @@ -154,6 +157,7 @@ __global__ void _WeightedReductionNoneSoftmaxCrossEntropyLossGrad( template void SoftmaxCrossEntropyLossGradImpl( + cudaStream_t stream, const T* dY, const T* log_prob, const Tin* label, @@ -167,7 +171,7 @@ void SoftmaxCrossEntropyLossGradImpl( CUDA_LONG C = static_cast(label_depth); int blocksPerGrid = (int)(ceil(static_cast(N_D * C) / GridDim::maxThreadsPerBlock)); if (reduction_none) { - _WeightedReductionNoneSoftmaxCrossEntropyLossGrad<<>>( + _WeightedReductionNoneSoftmaxCrossEntropyLossGrad<<>>( dY, log_prob, label, @@ -177,7 +181,7 @@ void SoftmaxCrossEntropyLossGradImpl( N_D, C); } else { - _WeightedSoftmaxCrossEntropyLossGrad<<>>( + _WeightedSoftmaxCrossEntropyLossGrad<<>>( dY, log_prob, label, @@ -191,6 +195,7 @@ void SoftmaxCrossEntropyLossGradImpl( #define SPECIALIZED_IMPL_SoftMaxEntropyLossGradImpl(T, Tin) \ template void SoftmaxCrossEntropyLossGradImpl( \ + cudaStream_t stream, \ 
const T* dY, \ const T* log_prob, \ const Tin* label, \ @@ -206,6 +211,7 @@ SPECIALIZED_IMPL_SoftMaxEntropyLossGradImpl(float, int64_t) #define SPECIALIZED_IMPL_ComputeWeightsSoftmaxCrossEntropyImpl(T, Tin) \ template void ComputeWeightsSoftmaxCrossEntropyImpl( \ + cudaStream_t stream, \ const Tin* label, \ const T* weight, \ size_t count, \ diff --git a/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.h b/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.h index 2333d7d593..d368fe9fbd 100644 --- a/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.h +++ b/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.h @@ -12,6 +12,7 @@ namespace cuda { template void SoftmaxCrossEntropyLossImpl( + cudaStream_t stream, const T* log_prob, const Tin* label, const T* weight, @@ -23,6 +24,7 @@ void SoftmaxCrossEntropyLossImpl( template void SoftmaxCrossEntropyLossGradImpl( + cudaStream_t stream, const T* dY, const T* log_prob, const Tin* label, @@ -35,6 +37,7 @@ void SoftmaxCrossEntropyLossGradImpl( template void ComputeWeightsSoftmaxCrossEntropyImpl( + cudaStream_t stream, const Tin* label, const T* weight, size_t count, diff --git a/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cc b/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cc index ce45bd8c7b..441a39d21d 100644 --- a/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cc +++ b/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cc @@ -49,7 +49,8 @@ Status SoftmaxCrossEntropy::ComputeInternal(OpKernelContext* ctx) const { T* log_prob_data = log_prob->template MutableData(); // calculate logsoftmax - auto status = SoftMaxComputeHelper(logit_data, + auto status = SoftMaxComputeHelper(Stream(), + logit_data, logit_reshape, log_prob_data, CudnnHandle(), @@ -64,6 +65,7 @@ Status 
SoftmaxCrossEntropy::ComputeInternal(OpKernelContext* ctx) const { // calculate (label * log(softmax)) for each element IAllocatorUniquePtr temp_X = GetScratchBuffer(N * D); SoftMaxCrossEntropyImpl( + Stream(), log_prob_data, // logsoftmax result label_data, // label normalize_factor, // normalize_factor @@ -109,6 +111,7 @@ Status SoftmaxCrossEntropyGrad::ComputeInternal(OpKernelContext* ctx) const { T* d_logits_data = d_logits->template MutableData(); SoftMaxCrossEntropyGradImpl( + Stream(), dY_data, // Dy log_prob_data, // log(pi) label_data, // Label @@ -147,7 +150,8 @@ Status SparseSoftmaxCrossEntropy::ComputeInternal(OpKernelContext* ctx) T* log_prob_data = log_prob->template MutableData(); // calculate logsoftmax - auto status = SoftMaxComputeHelper(logit_data, + auto status = SoftMaxComputeHelper(Stream(), + logit_data, logit_reshape, log_prob_data, CudnnHandle(), @@ -166,11 +170,11 @@ Status SparseSoftmaxCrossEntropy::ComputeInternal(OpKernelContext* ctx) auto normalize_factor_data = GetScratchBuffer(1); if (reduction_ == ReductionType::SUM) { const T normalize_factor = static_cast(1); - cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice, Stream())); } else if (reduction_ == ReductionType::MEAN) { if (weight_data == nullptr) { const T normalize_factor = static_cast(N); - cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice, Stream())); } else { // Compute buffer size in byte for reduction APIs. 
const auto buffer_size = @@ -179,6 +183,7 @@ Status SparseSoftmaxCrossEntropy::ComputeInternal(OpKernelContext* ctx) IAllocatorUniquePtr reduction_buffer = GetScratchBuffer( buffer_size); ORT_RETURN_IF_ERROR(reduce_sum( + Stream(), weight_data, normalize_factor_data.get(), static_cast(N), @@ -187,7 +192,8 @@ Status SparseSoftmaxCrossEntropy::ComputeInternal(OpKernelContext* ctx) } } - SparseSoftmaxCrossEntropyImpl(log_prob_data, + SparseSoftmaxCrossEntropyImpl(Stream(), + log_prob_data, label_data, weight_data, normalize_factor_data.get(), @@ -241,11 +247,11 @@ Status SparseSoftmaxCrossEntropyGrad::ComputeInternal(OpKernelContext* c auto normalize_factor_data = GetScratchBuffer(1); if (reduction_ == ReductionType::SUM) { const T normalize_factor = static_cast(1); - cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice, Stream())); } else if (reduction_ == ReductionType::MEAN) { if (weight_data == nullptr) { const T normalize_factor = static_cast(N); - cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice, Stream())); } else { // Compute buffer size in byte for reduction APIs. 
const auto buffer_size = @@ -254,6 +260,7 @@ Status SparseSoftmaxCrossEntropyGrad::ComputeInternal(OpKernelContext* c IAllocatorUniquePtr reduction_buffer = GetScratchBuffer( buffer_size); ORT_RETURN_IF_ERROR(reduce_sum( + Stream(), weight_data, normalize_factor_data.get(), static_cast(N), @@ -262,7 +269,8 @@ Status SparseSoftmaxCrossEntropyGrad::ComputeInternal(OpKernelContext* c } } - SparseSoftmaxCrossEntropyGradImpl(dY_data, + SparseSoftmaxCrossEntropyGradImpl(Stream(), + dY_data, log_prob_data, label_data, weight_data, diff --git a/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cu b/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cu index cbb430418b..a9165fd9ed 100644 --- a/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cu @@ -22,6 +22,7 @@ __global__ void _SoftMaxCrossEntropy( template void SoftMaxCrossEntropyImpl( + cudaStream_t stream, const T* log_prob, const T* label, size_t normalize_factor, @@ -30,7 +31,7 @@ void SoftMaxCrossEntropyImpl( int blocksPerGrid = (int)(ceil(static_cast(count) / GridDim::maxThreadsPerBlock)); CUDA_LONG N = static_cast(count); CUDA_LONG NORMALIZE_FACTOR = static_cast(normalize_factor); - _SoftMaxCrossEntropy<<>>( + _SoftMaxCrossEntropy<<>>( log_prob, label, NORMALIZE_FACTOR, @@ -40,6 +41,7 @@ void SoftMaxCrossEntropyImpl( #define SPECIALIZED_IMPL_SoftMaxEntropyImpl(T) \ template void SoftMaxCrossEntropyImpl( \ + cudaStream_t stream, \ const T* log_prob, \ const T* label, \ size_t normalize_factor, \ @@ -62,6 +64,7 @@ __global__ void _SoftMaxCrossEntropyGrad( template void SoftMaxCrossEntropyGradImpl( + cudaStream_t stream, const T* dY, const T* log_prob, const T* label, @@ -71,7 +74,7 @@ void SoftMaxCrossEntropyGradImpl( int blocksPerGrid = (int)(ceil(static_cast(count) / GridDim::maxThreadsPerBlock)); CUDA_LONG N = static_cast(count); CUDA_LONG NORMALIZE_FACTOR = 
static_cast(normalize_factor); - _SoftMaxCrossEntropyGrad<<>>( + _SoftMaxCrossEntropyGrad<<>>( dY, log_prob, label, @@ -82,6 +85,7 @@ void SoftMaxCrossEntropyGradImpl( #define SPECIALIZED_IMPL_SoftMaxEntropyGradImpl(T) \ template void SoftMaxCrossEntropyGradImpl( \ + cudaStream_t stream, \ const T* dY, \ const T* log_prob, \ const T* label, \ @@ -128,6 +132,7 @@ __global__ void _WeightedSparseSoftmaxCrossEntropy( template void SparseSoftmaxCrossEntropyImpl( + cudaStream_t stream, const T* log_prob, const Tin* label, const T* weight, @@ -139,7 +144,7 @@ void SparseSoftmaxCrossEntropyImpl( CUDA_LONG N = static_cast(count); CUDA_LONG D = static_cast(label_depth); if (weight) { - _WeightedSparseSoftmaxCrossEntropy<<>>( + _WeightedSparseSoftmaxCrossEntropy<<>>( log_prob, label, weight, @@ -148,7 +153,7 @@ void SparseSoftmaxCrossEntropyImpl( N, D); } else { - _SparseSoftmaxCrossEntropy<<>>( + _SparseSoftmaxCrossEntropy<<>>( log_prob, label, normalize_factor, @@ -160,6 +165,7 @@ void SparseSoftmaxCrossEntropyImpl( #define SPECIALIZED_IMPL_SparseSoftMaxEntropyImpl(T, Tin) \ template void SparseSoftmaxCrossEntropyImpl( \ + cudaStream_t stream, \ const T* log_prob, \ const Tin* label, \ const T* weight, \ @@ -212,6 +218,7 @@ __global__ void _WeightedSparseSoftmaxCrossEntropyGrad( template void SparseSoftmaxCrossEntropyGradImpl( + cudaStream_t stream, const T* dY, const T* log_prob, const Tin* label, @@ -224,7 +231,7 @@ void SparseSoftmaxCrossEntropyGradImpl( CUDA_LONG D = static_cast(label_depth); int blocksPerGrid = (int)(ceil(static_cast(N * D) / GridDim::maxThreadsPerBlock)); if (weight) { - _WeightedSparseSoftmaxCrossEntropyGrad<<>>( + _WeightedSparseSoftmaxCrossEntropyGrad<<>>( dY, log_prob, label, @@ -234,7 +241,7 @@ void SparseSoftmaxCrossEntropyGradImpl( N, D); } else { - _SparseSoftmaxCrossEntropyGrad<<>>( + _SparseSoftmaxCrossEntropyGrad<<>>( dY, log_prob, label, @@ -247,6 +254,7 @@ void SparseSoftmaxCrossEntropyGradImpl( #define 
SPECIALIZED_IMPL_SparseSoftMaxEntropyGradImpl(T, Tin) \ template void SparseSoftmaxCrossEntropyGradImpl( \ + cudaStream_t stream, \ const T* dY, \ const T* log_prob, \ const Tin* label, \ diff --git a/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.h b/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.h index 6345f738a9..d41718d276 100644 --- a/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.h +++ b/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.h @@ -11,6 +11,7 @@ namespace cuda { template void SoftMaxCrossEntropyImpl( + cudaStream_t stream, const T* log_prob, const T* label, size_t normalize_factor, @@ -19,6 +20,7 @@ void SoftMaxCrossEntropyImpl( template void SoftMaxCrossEntropyGradImpl( + cudaStream_t stream, const T* dY, const T* log_prob, const T* label, @@ -28,6 +30,7 @@ void SoftMaxCrossEntropyGradImpl( template void SparseSoftmaxCrossEntropyImpl( + cudaStream_t stream, const T* log_prob, const Tin* label, const T* weight, @@ -38,6 +41,7 @@ void SparseSoftmaxCrossEntropyImpl( template void SparseSoftmaxCrossEntropyGradImpl( + cudaStream_t stream, const T* dY, const T* log_prob, const Tin* label, diff --git a/orttraining/orttraining/training_ops/cuda/math/div_grad.cc b/orttraining/orttraining/training_ops/cuda/math/div_grad.cc index 1f2d8abb50..75477dcc93 100644 --- a/orttraining/orttraining/training_ops/cuda/math/div_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/math/div_grad.cc @@ -67,6 +67,7 @@ Status DivGrad::ComputeInternal(OpKernelContext* context) const { switch (prepare.output_rank_or_simple_broadcast) { case static_cast(SimpleBroadcast::NoBroadcast): ImplDivGradSimple( + Stream(), SimpleBroadcast::NoBroadcast, prepare_a_data, prepare_b_data, @@ -84,6 +85,7 @@ Status DivGrad::ComputeInternal(OpKernelContext* context) const { } ImplDivGradSimple( + Stream(), SimpleBroadcast::LeftScalar, prepare_a_data, prepare_b_data, @@ -112,6 +114,7 @@ Status 
DivGrad::ComputeInternal(OpKernelContext* context) const { temp_db_data = temp_db_allocator.get(); } ImplDivGradSimple( + Stream(), SimpleBroadcast::RightScalar, prepare_a_data, prepare_b_data, @@ -143,6 +146,7 @@ Status DivGrad::ComputeInternal(OpKernelContext* context) const { if (prepare.output_rank_or_simple_broadcast == static_cast(SimpleBroadcast::RightPerChannelBatch1)) { // lhs(1,C,H) and rhs (C,1) ImplDivGradRhsPerChannelBatch1( + Stream(), prepare_a_data, prepare_b_data, prepare_dy_data, @@ -153,6 +157,7 @@ Status DivGrad::ComputeInternal(OpKernelContext* context) const { } else { // lhs(N,C,H) and rhs (C,1) ImplDivGradRhsPerChannelBatchN( + Stream(), prepare_a_data, prepare_b_data, prepare_dy_data, @@ -197,6 +202,7 @@ Status DivGrad::ComputeInternal(OpKernelContext* context) const { } ImplDivGrad( + Stream(), prepare.output_rank_or_simple_broadcast, &prepare.lhs_padded_strides, prepare_a_data, diff --git a/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.cu index 527f396093..1e64b1e110 100644 --- a/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.cu @@ -260,6 +260,7 @@ __global__ void _DivGrad_B( template void ImplDivGradSimple( + cudaStream_t stream, SimpleBroadcast simpleBroadcast, const T* a_data, const T* b_data, @@ -274,7 +275,7 @@ void ImplDivGradSimple( case SimpleBroadcast::NoBroadcast: // a, b and dy has the same shape: a_is_scalar = false, b_is_scalar = false if (da_output_data && db_output_data) - _DivGradSimple<<>>( + _DivGradSimple<<>>( a_data, b_data, dy_data, @@ -282,13 +283,13 @@ void ImplDivGradSimple( db_output_data, N); else if (da_output_data) - _DivGradSimple_A<<>>( + _DivGradSimple_A<<>>( b_data, dy_data, da_output_data, N); else - _DivGradSimple_B<<>>( + _DivGradSimple_B<<>>( a_data, b_data, dy_data, @@ -298,7 +299,7 @@ void ImplDivGradSimple( case 
SimpleBroadcast::LeftScalar: // a is a scalar, b and dy has the same shape if (da_output_data && db_output_data) - _DivGradSimple<<>>( + _DivGradSimple<<>>( a_data, b_data, dy_data, @@ -306,13 +307,13 @@ void ImplDivGradSimple( db_output_data, N); else if (da_output_data) - _DivGradSimple_A<<>>( + _DivGradSimple_A<<>>( b_data, dy_data, da_output_data, N); else - _DivGradSimple_B<<>>( + _DivGradSimple_B<<>>( a_data, b_data, dy_data, @@ -322,7 +323,7 @@ void ImplDivGradSimple( case SimpleBroadcast::RightScalar: // b is a scalar, a and dy has the same shape if (da_output_data && db_output_data) - _DivGradSimple<<>>( + _DivGradSimple<<>>( a_data, b_data, dy_data, @@ -330,13 +331,13 @@ void ImplDivGradSimple( db_output_data, N); else if (da_output_data) - _DivGradSimple_A<<>>( + _DivGradSimple_A<<>>( b_data, dy_data, da_output_data, N); else - _DivGradSimple_B<<>>( + _DivGradSimple_B<<>>( a_data, b_data, dy_data, @@ -350,6 +351,7 @@ void ImplDivGradSimple( template void ImplDivGradRhsPerChannelBatch1( + cudaStream_t stream, const T* a_data, const T* b_data, const T* dy_data, @@ -360,7 +362,7 @@ void ImplDivGradRhsPerChannelBatch1( int blocksPerGrid = (int)(ceil(static_cast(count) / GridDim::maxThreadsPerBlock)); CUDA_LONG N = static_cast(count); if (da_output_data && db_output_data) - _DivGradRhsPerChannelBatch1<<>>( + _DivGradRhsPerChannelBatch1<<>>( a_data, b_data, dy_data, @@ -369,14 +371,14 @@ void ImplDivGradRhsPerChannelBatch1( db_output_data, N); else if (da_output_data) - _DivGradRhsPerChannelBatch1_A<<>>( + _DivGradRhsPerChannelBatch1_A<<>>( b_data, dy_data, fdm_H, da_output_data, N); else - _DivGradRhsPerChannelBatch1_B<<>>( + _DivGradRhsPerChannelBatch1_B<<>>( a_data, b_data, dy_data, @@ -387,6 +389,7 @@ void ImplDivGradRhsPerChannelBatch1( template void ImplDivGradRhsPerChannelBatchN( + cudaStream_t stream, const T* a_data, const T* b_data, const T* dy_data, @@ -399,7 +402,7 @@ void ImplDivGradRhsPerChannelBatchN( CUDA_LONG N = static_cast(count); if 
(da_output_data && db_output_data) - _DivGradRhsPerChannelBatchN<<>>( + _DivGradRhsPerChannelBatchN<<>>( a_data, b_data, dy_data, @@ -409,7 +412,7 @@ void ImplDivGradRhsPerChannelBatchN( db_output_data, N); else if (da_output_data) - _DivGradRhsPerChannelBatchN_A<<>>( + _DivGradRhsPerChannelBatchN_A<<>>( b_data, dy_data, fdm_H, @@ -417,7 +420,7 @@ void ImplDivGradRhsPerChannelBatchN( da_output_data, N); else - _DivGradRhsPerChannelBatchN_B<<>>( + _DivGradRhsPerChannelBatchN_B<<>>( a_data, b_data, dy_data, @@ -429,6 +432,7 @@ void ImplDivGradRhsPerChannelBatchN( template void ImplDivGrad( + cudaStream_t stream, int32_t output_rank, const TArray* a_padded_strides, const T* a_data, @@ -443,7 +447,7 @@ void ImplDivGrad( CUDA_LONG N = static_cast(count); if (a_padded_strides && a_padded_strides->Size() && b_padded_strides && b_padded_strides->Size()) { if (da_output_data && db_output_data) - _DivGrad<<>>( + _DivGrad<<>>( output_rank, *a_padded_strides, a_data, @@ -455,7 +459,7 @@ void ImplDivGrad( db_output_data, N); else if (da_output_data) - _DivGrad_A<<>>( + _DivGrad_A<<>>( output_rank, *b_padded_strides, b_data, @@ -464,7 +468,7 @@ void ImplDivGrad( da_output_data, N); else - _DivGrad_B<<>>( + _DivGrad_B<<>>( output_rank, *a_padded_strides, a_data, @@ -476,7 +480,7 @@ void ImplDivGrad( N); } else if (a_padded_strides && a_padded_strides->Size()) { if (da_output_data && db_output_data) - _DivGrad<<>>( + _DivGrad<<>>( output_rank, *a_padded_strides, a_data, @@ -488,7 +492,7 @@ void ImplDivGrad( db_output_data, N); else if (da_output_data) - _DivGrad_A<<>>( + _DivGrad_A<<>>( output_rank, *b_padded_strides, b_data, @@ -497,7 +501,7 @@ void ImplDivGrad( da_output_data, N); else - _DivGrad_B<<>>( + _DivGrad_B<<>>( output_rank, *a_padded_strides, a_data, @@ -509,7 +513,7 @@ void ImplDivGrad( N); } else { if (da_output_data && db_output_data) - _DivGrad<<>>( + _DivGrad<<>>( output_rank, *a_padded_strides, a_data, @@ -521,7 +525,7 @@ void ImplDivGrad( db_output_data, N); 
else if (da_output_data) - _DivGrad_A<<>>( + _DivGrad_A<<>>( output_rank, *b_padded_strides, b_data, @@ -530,7 +534,7 @@ void ImplDivGrad( da_output_data, N); else - _DivGrad_B<<>>( + _DivGrad_B<<>>( output_rank, *a_padded_strides, a_data, @@ -545,6 +549,7 @@ void ImplDivGrad( #define SPECIALIZED_DIV_GRAD_IMPL(T) \ template void ImplDivGrad( \ + cudaStream_t stream, \ int32_t output_rank, \ const TArray* a_padded_strides, \ const T* a_data, \ @@ -556,6 +561,7 @@ void ImplDivGrad( T* da_output_data, \ T* db_output_data); \ template void ImplDivGradRhsPerChannelBatch1( \ + cudaStream_t stream, \ const T* a_data, \ const T* b_data, \ const T* dy_data, \ @@ -564,6 +570,7 @@ void ImplDivGrad( T* da_output_data, \ T* db_output_data); \ template void ImplDivGradRhsPerChannelBatchN( \ + cudaStream_t stream, \ const T* a_data, \ const T* b_data, \ const T* dy_data, \ @@ -573,6 +580,7 @@ void ImplDivGrad( T* da_output_data, \ T* db_output_data); \ template void ImplDivGradSimple( \ + cudaStream_t stream, \ SimpleBroadcast simpleBroadcast, \ const T* a_data, \ const T* b_data, \ diff --git a/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.h b/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.h index 68a59ca06a..947535277f 100644 --- a/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.h +++ b/orttraining/orttraining/training_ops/cuda/math/div_grad_impl.h @@ -9,6 +9,7 @@ namespace onnxruntime { namespace cuda { template void ImplDivGradSimple( + cudaStream_t stream, SimpleBroadcast simpleBroadcast, const T* a_data, const T* b_data, @@ -19,6 +20,7 @@ void ImplDivGradSimple( template void ImplDivGradRhsPerChannelBatch1( + cudaStream_t stream, const T* a_data, const T* b_data, const T* dy_data, @@ -29,6 +31,7 @@ void ImplDivGradRhsPerChannelBatch1( template void ImplDivGradRhsPerChannelBatchN( + cudaStream_t stream, const T* a_data, const T* b_data, const T* dy_data, @@ -40,6 +43,7 @@ void ImplDivGradRhsPerChannelBatchN( template void 
ImplDivGrad( + cudaStream_t stream, int32_t output_rank, const TArray* a_padded_strides, const T* a_data, diff --git a/orttraining/orttraining/training_ops/cuda/math/isfinite.cc b/orttraining/orttraining/training_ops/cuda/math/isfinite.cc index d42dd7372f..07ba929cc2 100644 --- a/orttraining/orttraining/training_ops/cuda/math/isfinite.cc +++ b/orttraining/orttraining/training_ops/cuda/math/isfinite.cc @@ -26,6 +26,7 @@ Status IsFiniteOp::ComputeInternal(OpKernelContext* context) const { const Tensor& input = *context->Input(0); Tensor& output = *context->Output(0, input.Shape()); IsFinite( + Stream(), reinterpret_cast(input.Data()), output.MutableData(), input.Shape().Size()); @@ -59,7 +60,7 @@ Status IsAllFiniteOp::ComputeInternal(OpKernelContext* context) const { // to false if any value in any tensor is non-finite. Tensor& output = *context->Output(0, {}); auto* output_data = reinterpret_cast::MappedType*>(output.template MutableData()); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output_data, int(true), sizeof(bool))); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output_data, int(true), sizeof(bool), Stream())); std::vector> grouped_tensor_pointers(total_tensor_count); std::vector tensor_sizes(total_tensor_count); @@ -76,7 +77,7 @@ Status IsAllFiniteOp::ComputeInternal(OpKernelContext* context) const { // Check if all values are finite and write true to output. // Otherwise, false will be written. 
launch_multi_tensor_functor<1, TFunctor>( - 2048 * 32, tensor_sizes, grouped_tensor_pointers, functor, output_data); + Stream(), 2048 * 32, tensor_sizes, grouped_tensor_pointers, functor, output_data); return Status::OK(); } diff --git a/orttraining/orttraining/training_ops/cuda/math/isfinite.cu b/orttraining/orttraining/training_ops/cuda/math/isfinite.cu index 5b85a7099e..95fd7d1a4e 100644 --- a/orttraining/orttraining/training_ops/cuda/math/isfinite.cu +++ b/orttraining/orttraining/training_ops/cuda/math/isfinite.cu @@ -15,14 +15,14 @@ __global__ void _IsFinite(const TSrc* input, bool* output, CUDA_LONG N) { } template -void IsFinite(const TSrc* input, bool* output, size_t count) { +void IsFinite(cudaStream_t stream, const TSrc* input, bool* output, size_t count) { int blocksPerGrid = (int)(ceil(static_cast(count) / GridDim::maxThreadsPerBlock)); CUDA_LONG N = static_cast(count); - _IsFinite<<>>(input, output, N); + _IsFinite<<>>(input, output, N); } #define SPECIALIZE_ISFINITE_IMPL(T) \ -template void IsFinite(const T* input, bool* output, size_t count); +template void IsFinite(cudaStream_t stream, const T* input, bool* output, size_t count); SPECIALIZE_ISFINITE_IMPL(half) SPECIALIZE_ISFINITE_IMPL(float) @@ -53,14 +53,14 @@ __global__ void IsAllFiniteMultiTensorImpl(ChunkGroup<1> chunks, bool* output) { } template -void IsAllFiniteFunctor::operator()(ChunkGroup<1> chunks, bool* output) { +void IsAllFiniteFunctor::operator()(cudaStream_t stream, ChunkGroup<1> chunks, bool* output) { const int block_count = chunks.chunk_count; const int thread_count = ChunkGroup<1>::thread_count_per_block; - IsAllFiniteMultiTensorImpl<<>>(chunks, output); + IsAllFiniteMultiTensorImpl<<>>(chunks, output); } #define INSTANTIATE_ISALLFINITE_FUNCTOR(T) \ - template void IsAllFiniteFunctor::operator()(ChunkGroup<1> chunks, bool* output); + template void IsAllFiniteFunctor::operator()(cudaStream_t stream, ChunkGroup<1> chunks, bool* output); INSTANTIATE_ISALLFINITE_FUNCTOR(half) 
INSTANTIATE_ISALLFINITE_FUNCTOR(float) diff --git a/orttraining/orttraining/training_ops/cuda/math/isfinite.h b/orttraining/orttraining/training_ops/cuda/math/isfinite.h index 45aaa070da..44e9a7a50a 100644 --- a/orttraining/orttraining/training_ops/cuda/math/isfinite.h +++ b/orttraining/orttraining/training_ops/cuda/math/isfinite.h @@ -19,7 +19,7 @@ class IsFiniteOp final : public CudaKernel { }; template -void IsFinite(const TSrc* input, bool* output, size_t N); +void IsFinite(cudaStream_t stream, const TSrc* input, bool* output, size_t N); template class IsAllFiniteOp final : public CudaKernel { @@ -32,7 +32,7 @@ class IsAllFiniteOp final : public CudaKernel { template struct IsAllFiniteFunctor { - void operator()(ChunkGroup<1> chunks, bool* output); + void operator()(cudaStream_t stream, ChunkGroup<1> chunks, bool* output); }; } // namespace cuda diff --git a/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.cc b/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.cc index 88f800b96b..1ab0a00307 100644 --- a/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.cc +++ b/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.cc @@ -102,6 +102,7 @@ Status MixedPrecisionScale::ComputeInternal(OpKernelContext* context) cons #define CASE(TP_TYPE, DstT) \ case TP_TYPE: \ Impl_MixedPrecisionScale::MappedType>( \ + Stream(), \ x_data, \ scale_data, \ reinterpret_cast::MappedType*>(y_data), \ diff --git a/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.cu b/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.cu index a4c46b12aa..b86641c091 100644 --- a/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.cu +++ b/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.cu @@ -24,13 +24,14 @@ __global__ void _MixedPrecisionScale( template void Impl_MixedPrecisionScale( + cudaStream_t stream, const SrcT* input_data, const float* scale_data, 
DstT* output_data, size_t count){ int blocksPerGrid = static_cast(CeilDiv(count, GridDim::maxThreadsPerBlock)); CUDA_LONG N = static_cast(count); - _MixedPrecisionScale<<>>( + _MixedPrecisionScale<<>>( input_data, scale_data, output_data, @@ -39,6 +40,7 @@ void Impl_MixedPrecisionScale( #define SPECIALIZE_MIXEDPRECISIONSCALE_IMPL(SrcT, DstT) \ template void Impl_MixedPrecisionScale( \ + cudaStream_t stream, \ const SrcT* input_data, \ const float* scale_data, \ DstT* output_data, \ diff --git a/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.h b/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.h index f63f5431ae..b5400cc30e 100644 --- a/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.h +++ b/orttraining/orttraining/training_ops/cuda/math/mixed_precision_scale.h @@ -10,6 +10,7 @@ namespace cuda { template void Impl_MixedPrecisionScale( + cudaStream_t stream, const SrcT* input_data, const float* scale_data, DstT* output_data, diff --git a/orttraining/orttraining/training_ops/cuda/math/scale.cc b/orttraining/orttraining/training_ops/cuda/math/scale.cc index a525cd8df2..7fe37d00f7 100644 --- a/orttraining/orttraining/training_ops/cuda/math/scale.cc +++ b/orttraining/orttraining/training_ops/cuda/math/scale.cc @@ -47,6 +47,7 @@ Status Scale::ComputeInternal(OpKernelContext* context) const { auto lhs_tensor = context->Input(0); auto output_tensor = context->Output(0, lhs_tensor->Shape()); Impl_Scale( + Stream(), reinterpret_cast(lhs_tensor->template Data()), scale_value, reinterpret_cast(output_tensor->template MutableData()), diff --git a/orttraining/orttraining/training_ops/cuda/math/scale.cu b/orttraining/orttraining/training_ops/cuda/math/scale.cu index 7d9cce529b..b132665039 100644 --- a/orttraining/orttraining/training_ops/cuda/math/scale.cu +++ b/orttraining/orttraining/training_ops/cuda/math/scale.cu @@ -36,13 +36,14 @@ __global__ void _Scale( template void Impl_Scale( + cudaStream_t stream, const T* 
input_data, const float scale_value, T* output_data, size_t count) { int blocksPerGrid = static_cast(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); CUDA_LONG N = static_cast(count); - _Scale<<>>( + _Scale<<>>( input_data, static_cast(scale_value), output_data, @@ -51,6 +52,7 @@ void Impl_Scale( #define SPECIALIZE_SCALE_IMPL(T) \ template void Impl_Scale( \ + cudaStream_t stream, \ const T* input_data, \ const float scale_value, \ T* output_data, \ diff --git a/orttraining/orttraining/training_ops/cuda/math/scale.h b/orttraining/orttraining/training_ops/cuda/math/scale.h index b0ecd26962..020f4efbdb 100644 --- a/orttraining/orttraining/training_ops/cuda/math/scale.h +++ b/orttraining/orttraining/training_ops/cuda/math/scale.h @@ -18,6 +18,7 @@ struct GetScaleValueImpl { template void Impl_Scale( + cudaStream_t stream, const T* input_data, const float scale_value, T* output_data, diff --git a/orttraining/orttraining/training_ops/cuda/math/softmax_grad.cc b/orttraining/orttraining/training_ops/cuda/math/softmax_grad.cc index 270397c4d9..74976d0639 100644 --- a/orttraining/orttraining/training_ops/cuda/math/softmax_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/math/softmax_grad.cc @@ -13,6 +13,7 @@ namespace cuda { template Status SoftMaxGradComputeHelper( + cudaStream_t stream, const T* dY, const TensorShape& input_shape, const T* Y, @@ -33,7 +34,7 @@ Status SoftMaxGradComputeHelper( if (D <= 1024 && D * sizeof(T) <= 4096) { dispatch_softmax_backward, is_log_softmax>( - dX_data, dY_data, Y_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); + stream, dX_data, dY_data, Y_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); return Status::OK(); } @@ -65,6 +66,7 @@ Status SoftMaxGradComputeHelper( #define SPECIALIZED_SOFTMAXGRAD_HELPER_IMPL_BFloat16(is_log_softmax) \ template <> \ Status SoftMaxGradComputeHelper( \ + cudaStream_t stream, \ const BFloat16* dY, \ const TensorShape& input_shape, \ 
const BFloat16* Y, \ @@ -79,7 +81,7 @@ Status SoftMaxGradComputeHelper( auto Y_data = reinterpret_cast(Y); \ auto dX_data = reinterpret_cast(dX); \ dispatch_softmax_backward, is_log_softmax>( \ - dX_data, dY_data, Y_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); \ + stream, dX_data, dY_data, Y_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); \ return Status::OK(); \ } @@ -117,9 +119,9 @@ Status SoftmaxGrad::ComputeInternal(OpKernelContext* ctx) const { T* dX_data = dX->template MutableData(); if (log_softmax_) { - return SoftMaxGradComputeHelper(dY_data, input_shape, Y_data, dX_data, CudnnHandle(), axis_); + return SoftMaxGradComputeHelper(Stream(), dY_data, input_shape, Y_data, dX_data, CudnnHandle(), axis_); } else { - return SoftMaxGradComputeHelper(dY_data, input_shape, Y_data, dX_data, CudnnHandle(), axis_); + return SoftMaxGradComputeHelper(Stream(), dY_data, input_shape, Y_data, dX_data, CudnnHandle(), axis_); } } diff --git a/orttraining/orttraining/training_ops/cuda/math/softmax_grad.h b/orttraining/orttraining/training_ops/cuda/math/softmax_grad.h index 31e396b0c0..4e50cf2cf4 100644 --- a/orttraining/orttraining/training_ops/cuda/math/softmax_grad.h +++ b/orttraining/orttraining/training_ops/cuda/math/softmax_grad.h @@ -9,7 +9,7 @@ namespace onnxruntime { namespace cuda { template -void dispatch_softmax_backward(output_t* grad_input, const input_t* grad, const input_t* output, int softmax_elements, int softmax_elements_stride, int batch_count); +void dispatch_softmax_backward(cudaStream_t stream, output_t* grad_input, const input_t* grad, const input_t* output, int softmax_elements, int softmax_elements_stride, int batch_count); template class SoftmaxGrad final : public CudaKernel { diff --git a/orttraining/orttraining/training_ops/cuda/math/softmax_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/math/softmax_grad_impl.cu index 3b1bf2e508..f3e2fe4d39 100644 --- 
a/orttraining/orttraining/training_ops/cuda/math/softmax_grad_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/math/softmax_grad_impl.cu @@ -121,7 +121,7 @@ __global__ void softmax_warp_backward(output_t* gradInput, const input_t* grad, } template -void dispatch_softmax_backward(output_t* grad_input, const input_t* grad, const input_t* output, int softmax_elements, int softmax_elements_stride, int batch_count) { +void dispatch_softmax_backward(cudaStream_t stream, output_t* grad_input, const input_t* grad, const input_t* output, int softmax_elements, int softmax_elements_stride, int batch_count) { if (softmax_elements == 0) { return; } else { @@ -145,47 +145,47 @@ void dispatch_softmax_backward(output_t* grad_input, const input_t* grad, const switch (log2_elements) { case 0: // 1 softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 1: // 2 softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 2: // 4 softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 3: // 8 softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 4: // 16 softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 5: // 32 softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, 
softmax_elements_stride, softmax_elements); + <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 6: // 64 softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 7: // 128 softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 8: // 256 softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 9: // 512 softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 10: // 1024 softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; default: break; @@ -194,8 +194,8 @@ void dispatch_softmax_backward(output_t* grad_input, const input_t* grad, const } #define SPECIALIZED_SOFTMAX_GRAD_IMPL(input_t, output_t, acc_t) \ -template void dispatch_softmax_backward(input_t * grad_input, const output_t* grad, const output_t* output, int softmax_elements, int softmax_elements_stride, int batch_count); \ -template void dispatch_softmax_backward(input_t * grad_input, const output_t* grad, const output_t* output, int softmax_elements, int softmax_elements_stride, int batch_count); +template void dispatch_softmax_backward(cudaStream_t stream, input_t * grad_input, const output_t* grad, const output_t* output, int softmax_elements, int softmax_elements_stride, int 
batch_count); \ +template void dispatch_softmax_backward(cudaStream_t stream, input_t * grad_input, const output_t* grad, const output_t* output, int softmax_elements, int softmax_elements_stride, int batch_count); SPECIALIZED_SOFTMAX_GRAD_IMPL(float, float, float) SPECIALIZED_SOFTMAX_GRAD_IMPL(half, half, float) diff --git a/orttraining/orttraining/training_ops/cuda/nn/dropout.cc b/orttraining/orttraining/training_ops/cuda/nn/dropout.cc index 688223c336..b67eb70426 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/dropout.cc +++ b/orttraining/orttraining/training_ops/cuda/nn/dropout.cc @@ -37,7 +37,8 @@ REGISTER_GRADIENT_KERNEL(DropoutGrad) template struct DropoutGradComputeImpl { - void operator()(const int64_t N, + void operator()(cudaStream_t stream, + const int64_t N, const Tensor& dY, const bool* mask_data, const float ratio_data, @@ -46,7 +47,7 @@ struct DropoutGradComputeImpl { const CudaT* dY_data = reinterpret_cast(dY.template Data()); CudaT* dX_data = reinterpret_cast(dX.template MutableData()); - DropoutGradientKernelImpl(N, dY_data, mask_data, ratio_data, dX_data); + DropoutGradientKernelImpl(stream, N, dY_data, mask_data, ratio_data, dX_data); } }; @@ -79,7 +80,7 @@ Status DropoutGrad::ComputeInternal(OpKernelContext* context) const { auto dX = context->Output(0, shape); utils::MLTypeCallDispatcher t_disp(dY->GetElementType()); - t_disp.Invoke(N, *dY, mask_data, ratio_data, *dX); + t_disp.Invoke(Stream(), N, *dY, mask_data, ratio_data, *dX); return Status::OK(); } @@ -100,6 +101,7 @@ ONNX_OPERATOR_KERNEL_EX( template struct BiasDropoutComputeImpl { Status operator()(const cudaDeviceProp& prop, + cudaStream_t stream, const int64_t N, const fast_divmod fdm_dim, const float ratio_data, @@ -124,7 +126,7 @@ struct BiasDropoutComputeImpl { CudaT* Y_data = reinterpret_cast(Y.template MutableData()); - BiasDropoutKernelImpl(prop, N, fdm_dim, ratio_data, generator, X_data, bias_data, residual_data, Y_data, mask_data); + BiasDropoutKernelImpl(prop, 
stream, N, fdm_dim, ratio_data, generator, X_data, bias_data, residual_data, Y_data, mask_data); return Status::OK(); } @@ -185,7 +187,7 @@ Status BiasDropout::ComputeInternal(OpKernelContext* context) const { PhiloxGenerator& generator = generator_ ? *generator_ : PhiloxGenerator::Default(); utils::MLTypeCallDispatcherRet t_disp(X->GetElementType()); - return t_disp.Invoke(GetDeviceProp(), N, fdm_dim, ratio_data, generator, *X, *bias, residual, *Y, mask_data); + return t_disp.Invoke(GetDeviceProp(), Stream(), N, fdm_dim, ratio_data, generator, *X, *bias, residual, *Y, mask_data); } } // namespace cuda diff --git a/orttraining/orttraining/training_ops/cuda/nn/dropout_impl.cu b/orttraining/orttraining/training_ops/cuda/nn/dropout_impl.cu index b7960c5151..eed291e1d7 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/dropout_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/nn/dropout_impl.cu @@ -43,6 +43,7 @@ __global__ void DropoutGradientKernel( template void DropoutGradientKernelImpl( + cudaStream_t stream, const int64_t N, const T* dY_data, const bool* mask_data, @@ -50,18 +51,19 @@ void DropoutGradientKernelImpl( T* dX_data) { if (ratio == 0.0f) { if (dY_data != dX_data) { - CUDA_CALL_THROW(cudaMemcpyAsync(dX_data, dY_data, N * sizeof(T), cudaMemcpyDeviceToDevice)); + CUDA_CALL_THROW(cudaMemcpyAsync(dX_data, dY_data, N * sizeof(T), cudaMemcpyDeviceToDevice, stream)); } } else { const float scale = 1.f / (1.f - ratio); const int blocksPerGrid = static_cast(CeilDiv(N, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); DropoutGradientKernel - <<>>(N, dY_data, mask_data, scale, dX_data); + <<>>(N, dY_data, mask_data, scale, dX_data); } } #define SPECIALIZED_DROPOUT_GRAD_IMPL(T) \ template void DropoutGradientKernelImpl( \ + cudaStream_t stream, \ const int64_t N, \ const T* dY_data, \ const bool* mask_data, \ @@ -131,6 +133,7 @@ __global__ void BiasDropoutKernel( template void BiasDropoutKernelImpl( const cudaDeviceProp& prop, + 
cudaStream_t stream, const int64_t N, const fast_divmod fdm_dim, const float ratio, @@ -149,15 +152,16 @@ void BiasDropoutKernelImpl( auto seeds = generator.NextPhiloxSeeds(counter_offset); if (residual_data == nullptr) { - BiasDropoutKernel<<>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data); + BiasDropoutKernel<<>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data); } else { - BiasDropoutKernel<<>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data); + BiasDropoutKernel<<>>(N, fdm_dim, ratio, seeds, X_data, bias_data, residual_data, Y_data, mask_data); } } #define SPECIALIZED_BIAS_DROPOUT_IMPL(T) \ template void BiasDropoutKernelImpl( \ const cudaDeviceProp& prop, \ + cudaStream_t stream, \ const int64_t N, \ const fast_divmod fdm_dim, \ const float ratio, \ diff --git a/orttraining/orttraining/training_ops/cuda/nn/dropout_impl.h b/orttraining/orttraining/training_ops/cuda/nn/dropout_impl.h index 09444662af..8dbf3f9655 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/dropout_impl.h +++ b/orttraining/orttraining/training_ops/cuda/nn/dropout_impl.h @@ -10,6 +10,7 @@ namespace cuda { template void DropoutGradientKernelImpl( + cudaStream_t stream, const int64_t N, const T* dY_data, const bool* mask_data, @@ -19,6 +20,7 @@ void DropoutGradientKernelImpl( template void BiasDropoutKernelImpl( const cudaDeviceProp& prop, + cudaStream_t stream, const int64_t N, const fast_divmod fdm_dim, const float ratio, diff --git a/orttraining/orttraining/training_ops/cuda/nn/layer_norm.cc b/orttraining/orttraining/training_ops/cuda/nn/layer_norm.cc index 3695c029c8..64262a1ed1 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/layer_norm.cc +++ b/orttraining/orttraining/training_ops/cuda/nn/layer_norm.cc @@ -95,7 +95,7 @@ Status LayerNormGrad::ComputeInternal(OpKernelContext* p_op_ke auto part_grad_gamma = GetScratchBuffer(part_size * n2); auto part_grad_beta = 
GetScratchBuffer(part_size * n2); - HostLayerNormGradient(GetDeviceProp(), Y_grad_data, X_data, reinterpret_cast(NULL), + HostLayerNormGradient(GetDeviceProp(), Stream(), Y_grad_data, X_data, reinterpret_cast(NULL), scale_data, reinterpret_cast(NULL), mean_data, inv_std_var_data, n1, n2, X_grad_data, scale_grad_data, bias_grad_data, part_grad_gamma.get(), part_grad_beta.get(), part_size); @@ -144,7 +144,7 @@ Status InvertibleLayerNormGrad::ComputeInternal(OpKernelContext* p_op_kern auto part_grad_gamma = GetScratchBuffer(part_size * n2); auto part_grad_beta = GetScratchBuffer(part_size * n2); - HostLayerNormGradient(GetDeviceProp(), Y_grad_data, reinterpret_cast(NULL), Y_data, + HostLayerNormGradient(GetDeviceProp(), Stream(), Y_grad_data, reinterpret_cast(NULL), Y_data, scale_data, bias_data, reinterpret_cast(NULL), inv_std_var_data, n1, n2, X_grad_data, scale_grad_data, bias_grad_data, part_grad_gamma.get(), part_grad_beta.get(), part_size); diff --git a/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.cu b/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.cu index 00bdc2d525..99f818ff56 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.cu @@ -439,6 +439,7 @@ __global__ void cuComputeGradInput( template void HostLayerNormGradient( const cudaDeviceProp& prop, + cudaStream_t stream, const T* dout, const T* input, const T* output, @@ -464,7 +465,7 @@ void HostLayerNormGradient( const int nshared2 = nshared2_a > nshared2_b ? 
nshared2_a : nshared2_b; if (mean == nullptr && !simplified) { // use_mean == false, simplified == false -> Inverted Layer Norm - cuComputePartGradGammaBeta<<>>( + cuComputePartGradGammaBeta<<>>( dout, input, output, @@ -478,7 +479,7 @@ void HostLayerNormGradient( } else { // use_mean == true, simplified == false -> Layer Norm // use_mean == true, simplified == true -> Simplified Layer Norm - cuComputePartGradGammaBeta<<>>( + cuComputePartGradGammaBeta<<>>( dout, input, output, @@ -493,7 +494,7 @@ void HostLayerNormGradient( const dim3 threads3(warp_size, 8, 1); const dim3 blocks3((n2 + threads2.x - 1) / threads2.x, 1, 1); const int nshared3 = threads3.x * threads3.y * sizeof(U); - cuComputeGradGammaBeta<<>>( + cuComputeGradGammaBeta<<>>( part_grad_gamma, part_grad_beta, part_size, @@ -507,7 +508,7 @@ void HostLayerNormGradient( int nshared = threads1.y > 1 ? threads1.y * threads1.x * sizeof(U) : 0; if (mean == nullptr && !simplified) { - cuComputeGradInput<<>>( + cuComputeGradInput<<>>( dout, input, output, @@ -518,7 +519,7 @@ void HostLayerNormGradient( n1, n2, grad_input); } else { - cuComputeGradInput<<>>( + cuComputeGradInput<<>>( dout, input, output, @@ -532,7 +533,7 @@ void HostLayerNormGradient( } #define LAYERNORMGRAD_IMPL(T, U, simplified) \ - template void HostLayerNormGradient(const cudaDeviceProp& prop, const T* dout, const T* input, const T* output, \ + template void HostLayerNormGradient(const cudaDeviceProp& prop, cudaStream_t stream, const T* dout, const T* input, const T* output, \ const T* gamma, const T* beta, const U* mean, const U* invvar, int64_t n1, int64_t n2, \ T* grad_input, T* grad_gamma, T* grad_beta, U* part_grad_gamma, U* part_grad_beta, const int part_size); diff --git a/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.h b/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.h index 4722ab6126..a8d5e4e9d6 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.h +++ 
b/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.h @@ -31,6 +31,7 @@ namespace cuda { template void HostLayerNormGradient( const cudaDeviceProp& prop, + cudaStream_t stream, const T* dout, const T* input, const T* output, diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/adam.cc b/orttraining/orttraining/training_ops/cuda/optimizer/adam.cc index 3045a549b3..03fdcc3685 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/adam.cc +++ b/orttraining/orttraining/training_ops/cuda/optimizer/adam.cc @@ -115,20 +115,20 @@ Status AdamOptimizer: if (do_update_tensor != nullptr) { const bool do_update = *(do_update_tensor->template Data()); if (!do_update) { - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(M1, NM1)); - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(M2, NM2)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(Stream(), M1, NM1)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(Stream(), M2, NM2)); if (S_in != S_out) { *(S_out) = *(S_in); } if (NW != nullptr) { - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(W, *NW)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(Stream(), W, *NW)); } if (NG != nullptr) { - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(G, *NG)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(Stream(), G, *NG)); } if (W_MIXED_FP != nullptr && NW_MIXED_FP != nullptr) { - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(*W_MIXED_FP, *NW_MIXED_FP)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(Stream(), *W_MIXED_FP, *NW_MIXED_FP)); } return Status::OK(); @@ -136,6 +136,7 @@ Status AdamOptimizer: } AdamOptimizerImpl( + Stream(), reinterpret_cast(ETA.template Data()), *S_in, reinterpret_cast(W.template Data()), diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/adam.cu b/orttraining/orttraining/training_ops/cuda/optimizer/adam.cu index d892cd446d..b054485c46 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/adam.cu +++ b/orttraining/orttraining/training_ops/cuda/optimizer/adam.cu @@ -139,6 +139,7 @@ __global__ void _AdamOptimizer_mode1( 
template void AdamOptimizerImpl( + cudaStream_t stream, const T1* eta, const T2 update_count, const T3* weights, @@ -176,7 +177,7 @@ void AdamOptimizerImpl( // bias correction is applied on learning rate, // weight decay is applied after weight is updated. if (weight_decay_mode == 0) { - _AdamOptimizer_mode0<<>>( + _AdamOptimizer_mode0<<>>( eta, weights, grads, @@ -200,7 +201,7 @@ void AdamOptimizerImpl( N); } else if (weight_decay_mode == 1) { - _AdamOptimizer_mode1<<>>( + _AdamOptimizer_mode1<<>>( eta, weights, grads, @@ -230,6 +231,7 @@ void AdamOptimizerImpl( #define SPECIALIZED_AdamOptimizerImpl(T1, T2, T3, T4, T_GRAD, T_GRAD_NORM, T_MIXED_PRECISION_FP) \ template void AdamOptimizerImpl( \ + cudaStream_t stream, \ const T1* eta, \ const T2 update_count, \ const T3* weights, \ diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/adam.h b/orttraining/orttraining/training_ops/cuda/optimizer/adam.h index f979056e38..4ebb6e41c8 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/adam.h +++ b/orttraining/orttraining/training_ops/cuda/optimizer/adam.h @@ -10,6 +10,7 @@ namespace cuda { template void AdamOptimizerImpl( + cudaStream_t stream, const T1* eta, const T2 update_count, const T3* weights, diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/common.h b/orttraining/orttraining/training_ops/cuda/optimizer/common.h index 72ebf81f91..9cd86b42b3 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/common.h +++ b/orttraining/orttraining/training_ops/cuda/optimizer/common.h @@ -9,11 +9,11 @@ namespace onnxruntime { namespace cuda { template -Status CopyIfNotSameBuffer(const Tensor& source_tensor, Tensor& target_tensor) { +Status CopyIfNotSameBuffer(cudaStream_t stream, const Tensor& source_tensor, Tensor& target_tensor) { const T* source = source_tensor.template Data(); T* target = target_tensor.template MutableData(); if (target != source) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target, source, 
source_tensor.SizeInBytes(), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target, source, source_tensor.SizeInBytes(), cudaMemcpyDeviceToDevice, stream)); } return Status::OK(); } diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.cc b/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.cc index 80d9b442fd..a8c4de6f5f 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.cc +++ b/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.cc @@ -42,7 +42,7 @@ Status ZeroGradient::ComputeInternal(OpKernelContext* ctx) const { CUDA_RETURN_IF_ERROR(cudaMemsetAsync( zero_gradient.template MutableData(), 0, - zero_gradient.Shape().Size() * sizeof(T))); + zero_gradient.Shape().Size() * sizeof(T), Stream())); return Status::OK(); } @@ -75,12 +75,13 @@ Status InPlaceAccumulator::ComputeInternal(OpKernelContext* ctx) cons if (do_update_tensor) { const bool do_update = *(do_update_tensor->template Data()); if (!do_update) { - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(left_addee_buffer, accumulation_output)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(Stream(), left_addee_buffer, accumulation_output)); return Status::OK(); } } InPlaceAccumulatorImpl( + Stream(), reinterpret_cast(left_addee_buffer.template Data()), reinterpret_cast(right_addee_buffer.template Data()), reinterpret_cast(accumulation_output.template MutableData()), diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.cu b/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.cu index b6c49a5acb..1d83bb166c 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.cu +++ b/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.cu @@ -21,13 +21,14 @@ __global__ void _InPlaceAccumulator( template void InPlaceAccumulatorImpl( + cudaStream_t stream, const T* gradient_buffer, const T_GRAD* gradient, T* accumulated_gradient, size_t 
count) { int blocksPerGrid = (int)(ceil(static_cast(count) / GridDim::maxThreadsPerBlock)); CUDA_LONG N = static_cast(count); - _InPlaceAccumulator<<>>( + _InPlaceAccumulator<<>>( gradient_buffer, gradient, accumulated_gradient, @@ -36,6 +37,7 @@ void InPlaceAccumulatorImpl( #define SPECIALIZED_IMPL_InPlaceAccumulator(T, T_GRAD) \ template void InPlaceAccumulatorImpl( \ + cudaStream_t stream, \ const T* gradient_buffer, \ const T_GRAD* gradient, \ T* accumulated_gradient, \ diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.h b/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.h index 7f54d8bbce..c2a4f8e234 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.h +++ b/orttraining/orttraining/training_ops/cuda/optimizer/gradient_control.h @@ -25,6 +25,7 @@ class InPlaceAccumulator final : public CudaKernel { // Implementation can be found in cuda file, optimizers_impl.cu template void InPlaceAccumulatorImpl( + cudaStream_t stream, const T* gradient_buffer, const T_GRAD* gradient, T* accumulated_gradient, diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc index e27903e89d..ded60d2373 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc +++ b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cc @@ -117,6 +117,7 @@ void check_inputs_and_outputs( template Status copy_inputs_to_outputs( + cudaStream_t stream, OpKernelContext* ctx, const int non_grouped_input_count, const int non_grouped_output_count, @@ -155,16 +156,16 @@ Status copy_inputs_to_outputs( w_mixed_precision_new->SetByteOffset(w_mixed_precision->ByteOffset()); if (w_new) { - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(w, *w_new)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(stream, w, *w_new)); } if (g_new) { - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(g, *g_new)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(stream, g, *g_new)); } - 
ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(m1, m1_new)); - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(m2, m2_new)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(stream, m1, m1_new)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(stream, m2, m2_new)); if (w_mixed_precision_new) { - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(*w_mixed_precision, *w_mixed_precision_new)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(stream, *w_mixed_precision, *w_mixed_precision_new)); } } @@ -173,6 +174,7 @@ Status copy_inputs_to_outputs( template Status launch_lamb_compute_direction( + cudaStream_t stream, const int64_t update_count, const int group_count, const CudaT2* p_loss_scale, @@ -221,6 +223,7 @@ Status launch_lamb_compute_direction( do_bias_correction ? onnxruntime::contrib::compute_bias_correction_coefficient(betas[i], update_count) : 1.f; LambComputeDirection( + stream, p_ws[i], p_gs[i], p_m1s[i], @@ -268,6 +271,7 @@ Status launch_lamb_compute_direction( LambStage1 lamb_stage1; launch_multi_tensor_functor( + stream, 2048 * 32, tensor_sizes_in_buckets[key], buckets[key], @@ -299,6 +303,7 @@ Status launch_lamb_reduction( constexpr int tensor_count_per_group = 4; + cudaStream_t stream = kernel.Stream(); // Bucketize tensor groups by the associated optimizer configuration. // If two tensor groups use different "alpha", they should be put into two distinct buckets. 
std::vector> buckets; @@ -307,12 +312,14 @@ Status launch_lamb_reduction( for (int i = 0; i < group_count; ++i) { if (tensor_sizes[i] > max_tensor_size) { ORT_RETURN_IF_ERROR(reduce_square_sum( + stream, p_ws[i], p_w_norms[i], tensor_sizes[i], reduction_buffer, reduction_buffer_size)); ORT_RETURN_IF_ERROR(reduce_square_sum( + stream, p_ds[i], p_d_norms[i], tensor_sizes[i], @@ -343,6 +350,7 @@ Status launch_lamb_reduction( typedef LambMultiTensorReductionFunctor TReducer; TReducer reducer; launch_multi_tensor_functor( + stream, 2048 * 32, tensor_sizes_in_buckets, buckets, @@ -357,6 +365,7 @@ Status launch_lamb_reduction( template Status launch_lamb_update( + cudaStream_t stream, const int group_count, const CudaT1* eta, const float ratio_min, @@ -389,6 +398,7 @@ Status launch_lamb_update( for (int i = 0; i < group_count; ++i) { if (tensor_sizes[i] > max_tensor_size) { LambUpdate( + stream, eta, ratio_min, ratio_max, @@ -430,6 +440,7 @@ Status launch_lamb_update( LambStage2 lamb_stage2; launch_multi_tensor_functor( + stream, 2048 * 32, tensor_sizes_in_bucket, buckets, @@ -503,6 +514,7 @@ Status LambOptimizer::Compute auto update_signal = *update_signal_tensor->template Data(); if (!update_signal) { return copy_inputs_to_outputs( + Stream(), ctx, non_grouped_input_count, non_grouped_output_count, @@ -539,14 +551,14 @@ Status LambOptimizer::Compute // and T2=float. IAllocatorUniquePtr d_norm_buffer = GetScratchBuffer(group_count); CudaT2* d_norm_data = reinterpret_cast(d_norm_buffer.get()); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(d_norm_data, 0, group_count * sizeof(T2))); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(d_norm_data, 0, group_count * sizeof(T2), Stream())); // Allocate buffer for reduction computation of weight tensor. // The i-th weight's norm is stored at the i-th element. // We reduce type T2 tensor to type T2 scalar. An example is that T2=float. 
IAllocatorUniquePtr w_norm_buffer = GetScratchBuffer(group_count); CudaT2* w_norm_data = reinterpret_cast(w_norm_buffer.get()); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(w_norm_data, 0, group_count * sizeof(T2))); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(w_norm_data, 0, group_count * sizeof(T2), Stream())); // Find the max size of updated weight tensors. int max_tensor_size = 0; @@ -652,6 +664,7 @@ Status LambOptimizer::Compute } ORT_RETURN_IF_ERROR(launch_lamb_compute_direction( + Stream(), step_data ? *step_data : 0, group_count, loss_scale_data, @@ -675,6 +688,7 @@ Status LambOptimizer::Compute reduction_buffer_size)); ORT_RETURN_IF_ERROR(launch_lamb_update( + Stream(), group_count, eta_data, ratio_min_, diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cu b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cu index b8c8171509..5ebc2fff49 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cu +++ b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cu @@ -110,6 +110,7 @@ __global__ void _LambComputeDirectionImpl( template void LambComputeDirection( + cudaStream_t stream, const T1* weights, const T2* grads, const T3* moment_1, @@ -130,7 +131,7 @@ void LambComputeDirection( int blocksPerGrid = (int)(ceil(static_cast(count) / GridDim::maxThreadsPerBlock)); CUDA_LONG N = static_cast(count); - _LambComputeDirectionImpl<<>>( + _LambComputeDirectionImpl<<>>( weights, grads, moment_1, @@ -152,6 +153,7 @@ void LambComputeDirection( #define SPECIALIZED_LAMB_COMPUTE_DIRECTION(T1, T2, T3, T_GRAD_NORM) \ template void LambComputeDirection( \ + cudaStream_t stream, \ const T1* weights, \ const T2* grads, \ const T3* moment_1, \ @@ -256,6 +258,7 @@ __global__ void _LambUpdateImpl( template void LambUpdate( + cudaStream_t stream, const T1* eta, const float ratio_min, const float ratio_max, @@ -270,7 +273,7 @@ void LambUpdate( int blocksPerGrid = (int)(ceil(static_cast(count) / GridDim::maxThreadsPerBlock)); CUDA_LONG N = 
static_cast(count); - _LambUpdateImpl<<>>( + _LambUpdateImpl<<>>( eta, ratio_min, ratio_max, @@ -286,6 +289,7 @@ void LambUpdate( #define INSTANTIATE_LAMB_UPDATE(T1, T2, T3, T_MIXED_PRECISION_FP) \ template void LambUpdate( \ + cudaStream_t stream, \ const T1* eta, \ const float ratio_min, \ const float ratio_max, \ @@ -356,6 +360,7 @@ __global__ void LambMultiTensorComputeDirectionImpl( template void LambMultiTensorComputeDirectionFunctor::operator()( + cudaStream_t stream, ChunkGroup<6> chunk_group, const T1* loss_scale, const T_GRAD_NORM* g_norm, @@ -369,7 +374,7 @@ void LambMultiTensorComputeDirectionFunctor::operator() const int thread_count = ChunkGroup<6>::thread_count_per_block; const int block_count = chunk_group.chunk_count; - LambMultiTensorComputeDirectionImpl<<>>( + LambMultiTensorComputeDirectionImpl<<>>( chunk_group, loss_scale, g_norm, @@ -384,6 +389,7 @@ void LambMultiTensorComputeDirectionFunctor::operator() #define INSTANTIATE_LAMB_STAGE1_MULTI_TENSOR_FUNCTOR(T1, T2, T3, T_GRAD_NORM) \ template void LambMultiTensorComputeDirectionFunctor::operator()( \ + cudaStream_t stream, \ ChunkGroup<6> chunk_group, \ const T1* loss_scale, \ const T_GRAD_NORM* g_norm, \ @@ -445,6 +451,7 @@ __global__ void LambMultiTensorUpdateImpl( template void LambMultiTensorUpdateFunctor::operator()( + cudaStream_t stream, ChunkGroup<7> chunk_group, const T1* eta, const float ratio_min, @@ -452,7 +459,7 @@ void LambMultiTensorUpdateFunctor::operator()( const int thread_count = ChunkGroup<7>::thread_count_per_block; const int block_count = chunk_group.chunk_count; - LambMultiTensorUpdateImpl<<>>( + LambMultiTensorUpdateImpl<<>>( chunk_group, eta, ratio_min, @@ -461,6 +468,7 @@ void LambMultiTensorUpdateFunctor::operator()( #define INSTANTIATE_LAMB_MULTI_TENSOR_UPDATE_FUNCTOR(T1, T2, T3, T_MIXED_PRECISION_FP) \ template void LambMultiTensorUpdateFunctor::operator()( \ + cudaStream_t stream, \ ChunkGroup<7> chunk_group, \ const T1* eta, \ const float ratio_min, \ @@ -616,7 
+624,7 @@ CudaKernel::CudaAsyncBuffer compute_tensor_rang } template -void LambMultiTensorReductionFunctor::operator()(ChunkGroup<4> chunk_group, const CudaKernel& kernel, void* reduction_buffer, size_t reduction_buffer_size) { +void LambMultiTensorReductionFunctor::operator()(cudaStream_t stream, ChunkGroup<4> chunk_group, const CudaKernel& kernel, void* reduction_buffer, size_t reduction_buffer_size) { // thread count per block. constexpr int thread_count = ChunkGroup<4>::thread_count_per_block; // shared memory's size per block. @@ -636,12 +644,12 @@ void LambMultiTensorReductionFunctor::operator() TOut2* d_buffer = reinterpret_cast(w_buffer + num_blocks); auto sync_range_and_lock = compute_tensor_range_and_lock(chunk_group, kernel); - LambMultiTensorReductionImpl<<>>( + LambMultiTensorReductionImpl<<>>( chunk_group, w_buffer, d_buffer, sync_range_and_lock.GpuPtr()); } #define INSTANTIATE_LAMB_MULTI_TENSOR_REDUCTION_FUNCTOR(TIn1, TIn2, TOut1, TOut2, TBuf) \ - template void LambMultiTensorReductionFunctor::operator()(ChunkGroup<4> chunk_group, const CudaKernel& kernel, void* reduction_buffer, size_t reduction_buffer_size); + template void LambMultiTensorReductionFunctor::operator()(cudaStream_t stream, ChunkGroup<4> chunk_group, const CudaKernel& kernel, void* reduction_buffer, size_t reduction_buffer_size); INSTANTIATE_LAMB_MULTI_TENSOR_REDUCTION_FUNCTOR(float, float, float, float, float) INSTANTIATE_LAMB_MULTI_TENSOR_REDUCTION_FUNCTOR(double, double, double, double, double) diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.h b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.h index 7882a94759..d5bf742a1b 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.h +++ b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.h @@ -49,6 +49,7 @@ class LambOptimizer final : public CudaKernel { // of this. 
template void LambComputeDirection( + cudaStream_t stream, const T1* weights, const T2* grads, const T3* moment_1, @@ -73,6 +74,7 @@ void LambComputeDirection( // of this. template void LambUpdate( + cudaStream_t stream, const T1* eta, const float ratio_min, const float ratio_max, @@ -106,6 +108,7 @@ void LambUpdate( template struct LambMultiTensorComputeDirectionFunctor { void operator()( + cudaStream_t stream, ChunkGroup<6> chunk_group, const T1* loss_scale, const T_GRAD_NORM* grad_norm, @@ -134,6 +137,7 @@ struct LambMultiTensorComputeDirectionFunctor { template struct LambMultiTensorReductionFunctor { void operator()( + cudaStream_t stream, ChunkGroup<4> chunk_group, const CudaKernel& kernel, void* reduction_buffer, @@ -183,6 +187,7 @@ struct LambMultiTensorSyncRangeAndLock { template struct LambMultiTensorUpdateFunctor { void operator()( + cudaStream_t stream, ChunkGroup<7> chunk_group, const T1* eta, const float ratio_min, diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/sg.cc b/orttraining/orttraining/training_ops/cuda/optimizer/sg.cc index 2501faa82c..048f1da8e7 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/sg.cc +++ b/orttraining/orttraining/training_ops/cuda/optimizer/sg.cc @@ -30,6 +30,7 @@ Status SGDOptimizer::ComputeInternal(OpKernelContext* ctx) const { ORT_ENFORCE(W.Shape() == G.Shape()); SGDOptimizerImpl( + Stream(), ETA.template Data(), W.template Data(), G.template Data(), diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/sg.cu b/orttraining/orttraining/training_ops/cuda/optimizer/sg.cu index addfed2f7b..aeab19d5eb 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/sg.cu +++ b/orttraining/orttraining/training_ops/cuda/optimizer/sg.cu @@ -31,6 +31,7 @@ __global__ void _SGDOptimizer( template void SGDOptimizerImpl( + cudaStream_t stream, const T* eta, const T* weights, const T* gradients, @@ -39,7 +40,7 @@ void SGDOptimizerImpl( size_t count) { int blocksPerGrid = 
(int)(ceil(static_cast(count) / GridDim::maxThreadsPerBlock)); CUDA_LONG N = static_cast(count); - _SGDOptimizer<<>>( + _SGDOptimizer<<>>( eta, weights, gradients, @@ -50,6 +51,7 @@ void SGDOptimizerImpl( #define SPECIALIZED_IMPL__SGDOptimizerImpl(T) \ template void SGDOptimizerImpl( \ + cudaStream_t stream, \ const T* eta, \ const T* weights, \ const T* gradients, \ diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/sg.h b/orttraining/orttraining/training_ops/cuda/optimizer/sg.h index 99d81f6984..80d47a8fa0 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/sg.h +++ b/orttraining/orttraining/training_ops/cuda/optimizer/sg.h @@ -10,6 +10,7 @@ namespace cuda { template void SGDOptimizerImpl( + cudaStream_t stream, const T* eta, const T* weights, const T* gradients, diff --git a/orttraining/orttraining/training_ops/cuda/reduction/all.cc b/orttraining/orttraining/training_ops/cuda/reduction/all.cc index 3e8741a9d6..9e1c282667 100644 --- a/orttraining/orttraining/training_ops/cuda/reduction/all.cc +++ b/orttraining/orttraining/training_ops/cuda/reduction/all.cc @@ -25,11 +25,14 @@ Status All::ComputeInternal(OpKernelContext* ctx) const { ORT_ENFORCE(size <= std::numeric_limits::max(), "Number of reduced elements (", size, ") exceeds the max allowed value (", std::numeric_limits::max(), ")."); + // TODO: LaunchAllKernel is implemented with thrust, which always uses default CUDA stream. 
+ CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(Stream())); LaunchAllKernel( + Stream(), input.Data(), static_cast(size), output.MutableData()); - + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(0)); return Status::OK(); } diff --git a/orttraining/orttraining/training_ops/cuda/reduction/all.cu b/orttraining/orttraining/training_ops/cuda/reduction/all.cu index 64c62523d5..678d01893d 100644 --- a/orttraining/orttraining/training_ops/cuda/reduction/all.cu +++ b/orttraining/orttraining/training_ops/cuda/reduction/all.cu @@ -21,13 +21,13 @@ __global__ void assign_false(bool* ptr) { } template<> -void LaunchAllKernel(const bool* data, const int size, bool* output) { +void LaunchAllKernel(cudaStream_t stream, const bool* data, const int size, bool* output) { if(thrust::all_of(thrust::device, data, data + size, thrust::identity())) { - assign_true<<<1, 1, 0>>>(output); + assign_true<<<1, 1, 0, stream>>>(output); } else { - assign_false<<<1, 1, 0>>>(output); + assign_false<<<1, 1, 0, stream>>>(output); } } diff --git a/orttraining/orttraining/training_ops/cuda/reduction/all.h b/orttraining/orttraining/training_ops/cuda/reduction/all.h index 7e687cc7f9..f15f3fdff5 100644 --- a/orttraining/orttraining/training_ops/cuda/reduction/all.h +++ b/orttraining/orttraining/training_ops/cuda/reduction/all.h @@ -16,7 +16,7 @@ class All final : public CudaKernel { }; template -void LaunchAllKernel(const T* data, const int size, bool* output); +void LaunchAllKernel(cudaStream_t stream, const T* data, const int size, bool* output); } // namespace cuda } // namespace onnxruntime diff --git a/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cc b/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cc index 5d90e9936b..654b915ffe 100644 --- a/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cc +++ b/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cc @@ -44,7 +44,7 @@ Status ReduceAllL2::ComputeInternal(OpKernelContext* ctx) const { // 
Allocate output tensor. Tensor* output = ctx->Output(0, {}); CudaTOut* p_output = reinterpret_cast(output->template MutableData()); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(p_output, 0, sizeof(CudaTOut))); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(p_output, 0, sizeof(CudaTOut), Stream())); const bool deterministic = ctx->GetUseDeterministicCompute(); @@ -54,12 +54,12 @@ Status ReduceAllL2::ComputeInternal(OpKernelContext* ctx) const { // Check if all values are finite and write true to deviceOutput. // Otherwise, false will be written. - launch_multi_tensor_functor<1, TFunctor>( + launch_multi_tensor_functor<1, TFunctor>(Stream(), 2048 * 32, tensor_sizes, grouped_tensor_pointers, functor, p_output); // *p_output is the squared sum of all elements. // Let's take a sqrt to get the actual L2-norm. - ScalarSqrt(p_output, p_output); + ScalarSqrt(Stream(), p_output, p_output); } else { // alternate path only for deterministic compute .. typedef AccumulationType_t CudaTAcc; @@ -81,7 +81,7 @@ Status ReduceAllL2::ComputeInternal(OpKernelContext* ctx) const { // buffer for final output and square norms of each tensor auto results_buffer = GetScratchBuffer(1 + total_tensor_count); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(results_buffer.get(), 0, sizeof(CudaTAcc) * (1 + total_tensor_count))); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(results_buffer.get(), 0, sizeof(CudaTAcc) * (1 + total_tensor_count), Stream())); CudaTAcc* p_global_sqnorm = results_buffer.get(); CudaTAcc* p_tensor_sqnorm = p_global_sqnorm + 1; @@ -90,11 +90,11 @@ Status ReduceAllL2::ComputeInternal(OpKernelContext* ctx) const { for (int i = 0; i < total_tensor_count; ++i) { CudaTIn* p_tensor_i = reinterpret_cast(grouped_tensor_pointers[i][0]); ORT_RETURN_IF_ERROR(reduce_square_sum( - p_tensor_i, p_tensor_sqnorm + i, tensor_sizes[i], reduction_buffer.get(), reduction_buffer_size)); + Stream(), p_tensor_i, p_tensor_sqnorm + i, tensor_sizes[i], reduction_buffer.get(), reduction_buffer_size)); } 
ORT_RETURN_IF_ERROR(reduce_sum( - p_tensor_sqnorm, p_global_sqnorm, total_tensor_count, reduction_buffer.get(), reduction_buffer_size)); - ScalarSqrt(p_global_sqnorm, p_output); + Stream(), p_tensor_sqnorm, p_global_sqnorm, total_tensor_count, reduction_buffer.get(), reduction_buffer_size)); + ScalarSqrt(Stream(), p_global_sqnorm, p_output); } return Status::OK(); diff --git a/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cu b/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cu index adde87d307..16603e1ade 100644 --- a/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cu +++ b/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cu @@ -17,16 +17,16 @@ __global__ void ScalarSqrtKernel(Tin* input, Tout* output) { } template -void ScalarSqrt(Tin* input, Tout* output) { - ScalarSqrtKernel<<<1, 1, 0>>>(input, output); -} +void ScalarSqrt(cudaStream_t stream, Tin* input, Tout* output) { + ScalarSqrtKernel<<<1, 1, 0, stream>>>(input, output); +}; -template void ScalarSqrt(float* input, float* output); -template void ScalarSqrt(half* input, half* output); -template void ScalarSqrt(float* input, half* output); +template void ScalarSqrt(cudaStream_t stream, float* input, float* output); +template void ScalarSqrt(cudaStream_t stream, half* input, half* output); +template void ScalarSqrt(cudaStream_t stream, float* input, half* output); #if CUDA_VERSION >= 11000 && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) -template void ScalarSqrt(nv_bfloat16* input, nv_bfloat16* output); -template void ScalarSqrt(float* input, nv_bfloat16* output); +template void ScalarSqrt(cudaStream_t stream, nv_bfloat16* input, nv_bfloat16* output); +template void ScalarSqrt(cudaStream_t stream, float* input, nv_bfloat16* output); #endif template @@ -87,7 +87,7 @@ __global__ void MultiTensorReduceKernel(ChunkGroup<1> chunk_group, TOut* output) } template -void MultiTensorReduce(ChunkGroup<1> chunk_group, TOut* output) { +void 
MultiTensorReduce(cudaStream_t stream, ChunkGroup<1> chunk_group, TOut* output) { // thread count per block. constexpr int thread_count = ChunkGroup<1>::thread_count_per_block; // shared memory's size per block. @@ -97,17 +97,17 @@ void MultiTensorReduce(ChunkGroup<1> chunk_group, TOut* output) { ORT_ENFORCE(thread_count % GPU_WARP_SIZE == 0); ORT_ENFORCE((thread_count & (thread_count - 1)) == 0); - MultiTensorReduceKernel<<>>(chunk_group, output); + MultiTensorReduceKernel<<>>(chunk_group, output); } template -void MultiTensorReduceL2::operator()(ChunkGroup<1> chunk_group, TOut* output) { +void MultiTensorReduceL2::operator()(cudaStream_t stream, ChunkGroup<1> chunk_group, TOut* output) { using TBuf = AccumulationType_t; - MultiTensorReduce(chunk_group, output); + MultiTensorReduce(stream, chunk_group, output); } #define INSTANTIATE_MULTI_TENSOR_REDUCTION_L2_FUNCTOR(TIn, TOut) \ - template void MultiTensorReduceL2::operator()(ChunkGroup<1> chunk_group, TOut* output); + template void MultiTensorReduceL2::operator()(cudaStream_t stream, ChunkGroup<1> chunk_group, TOut* output); INSTANTIATE_MULTI_TENSOR_REDUCTION_L2_FUNCTOR(double, float) INSTANTIATE_MULTI_TENSOR_REDUCTION_L2_FUNCTOR(float, float) diff --git a/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.h b/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.h index f3ea5130eb..7de6e2ee9b 100644 --- a/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.h +++ b/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.h @@ -18,11 +18,11 @@ class ReduceAllL2 final : public CudaKernel { template struct MultiTensorReduceL2 { - void operator()(ChunkGroup<1> chunk_group, TOut* output); + void operator()(cudaStream_t stream, ChunkGroup<1> chunk_group, TOut* output); }; template -void ScalarSqrt(Tin* input, Tout* output); +void ScalarSqrt(cudaStream_t stream, Tin* input, Tout* output); } // namespace cuda } // namespace onnxruntime diff --git 
a/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.cc b/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.cc index deb316a562..adfaeff30f 100644 --- a/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.cc +++ b/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.cc @@ -47,7 +47,7 @@ Status ReduceKernel::ComputeImplEx(OpKernelContext* ctx, cudnn // empty axes and no-op if (axes.empty() && noop_with_empty_axes_) { auto* Y = ctx->Output(0, X->Shape()); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), cudaMemcpyDeviceToDevice, Stream())); return Status::OK(); } @@ -80,7 +80,7 @@ Status ReduceKernel::ComputeImplExOutput(0, X->Shape()); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), cudaMemcpyDeviceToDevice, Stream())); return Status::OK(); } @@ -107,14 +107,14 @@ Status ReduceKernel::ComputeImplExtemplate MutableData() != X->template Data()) { - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), input_count * sizeof(int32_t), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData(), X->template Data(), input_count * sizeof(int32_t), cudaMemcpyDeviceToDevice, Stream())); } return Status::OK(); } // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. 
// Therefore zeroing out the memory is required - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes())); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes(), Stream())); size_t indices_bytes = 0; size_t workspace_bytes = 0; @@ -124,7 +124,7 @@ Status ReduceKernel::ComputeImplEx temp_X = GetScratchBuffer(input_count); - Impl_Cast(reinterpret_cast(X->template Data()), temp_X.get(), X->Shape().Size()); + Impl_Cast(Stream(), reinterpret_cast(X->template Data()), temp_X.get(), X->Shape().Size()); ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, cudnn_type_X, CUDNN_REDUCE_TENSOR_FLATTENED_INDICES)); ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_cudnn, cudnn_type_X)); @@ -150,7 +150,7 @@ Status ReduceKernel::ComputeImplEx(temp_Y.get(), Y->template MutableData(), output_count); + Impl_Cast(Stream(), temp_Y.get(), Y->template MutableData(), output_count); return Status::OK(); } diff --git a/orttraining/orttraining/training_ops/cuda/tensor/concat.cc b/orttraining/orttraining/training_ops/cuda/tensor/concat.cc index 3185f3eb6f..0404c1fa4e 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/concat.cc +++ b/orttraining/orttraining/training_ops/cuda/tensor/concat.cc @@ -60,7 +60,8 @@ Status ConcatTraining::ComputeInternal(OpKernelContext* ctx) const { int block_size_inside_axis_dim = static_cast(p.output_axis_pitch / p.output_tensor->Shape()[p.axis]); int block_size_including_axis_dim = static_cast(p.output_axis_pitch); auto element_bytes = p.output_tensor->DataType()->Size(); - ORT_RETURN_IF_ERROR(ConcatImpl(element_bytes, + ORT_RETURN_IF_ERROR(ConcatImpl(Stream(), + element_bytes, block_size_including_axis_dim, block_size_inside_axis_dim, concat_sizes_gpu.GpuPtr(), @@ -71,7 +72,7 @@ Status ConcatTraining::ComputeInternal(OpKernelContext* ctx) const { p.output_num_elements)); Tensor* output_1_tensor = ctx->Output(1, {input_count}); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_1_tensor->template 
MutableData(), concat_sizes_gpu.GpuPtr(), input_count * sizeof(int64_t), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_1_tensor->template MutableData(), concat_sizes_gpu.GpuPtr(), input_count * sizeof(int64_t), cudaMemcpyDeviceToDevice, Stream())); return Status::OK(); } diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad.cc b/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad.cc index eb24e3eb57..983960a2df 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad.cc @@ -24,7 +24,8 @@ ONNX_OPERATOR_KERNEL_EX( template struct GatherElementsGrad::ComputeImpl { - Status operator()(const Tensor* dY, + Status operator()(cudaStream_t stream, + const Tensor* dY, const Tensor* indices_tensor, Tensor* dX, const int rank, @@ -42,6 +43,7 @@ struct GatherElementsGrad::ComputeImpl { if (utils::IsPrimitiveDataType(Tin_type)) { const int32_t* indices_data = indices_tensor->template Data(); return GatherElementsGradImpl( + stream, rank, buffer_output_dims, buffer_input_strides, @@ -55,6 +57,7 @@ struct GatherElementsGrad::ComputeImpl { } else if (utils::IsPrimitiveDataType(Tin_type)) { const int64_t* indices_data = indices_tensor->template Data(); return GatherElementsGradImpl( + stream, rank, buffer_output_dims, buffer_input_strides, @@ -113,7 +116,7 @@ Status GatherElementsGrad::ComputeInternal(OpKernelContext* context) const { int rank = static_cast(output_dims.size()); Tensor* dX = context->Output(0, data_shape); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(dX->MutableDataRaw(), 0, dX->SizeInBytes())); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(dX->MutableDataRaw(), 0, dX->SizeInBytes(), Stream())); TArray buffer_output_dims(output_dims); TensorPitches input_strides(output_dims); @@ -128,7 +131,7 @@ Status GatherElementsGrad::ComputeInternal(OpKernelContext* context) const { utils::MLTypeCallDispatcherRet 
t_disp(dY->GetElementType()); - return t_disp.Invoke(dY, indices_tensor, dX, rank, + return t_disp.Invoke(Stream(), dY, indices_tensor, dX, rank, buffer_output_dims, buffer_input_strides, indices_size, buffer_indices_dims, fdm_indices_strides, axis); } diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad_impl.h b/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad_impl.h index 713fe3f7bc..c6873c301a 100755 --- a/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad_impl.h +++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_elements_grad_impl.h @@ -11,6 +11,7 @@ namespace cuda { template Status GatherElementsGradImpl( + cudaStream_t stream, const int rank, TArray& buffer_input_dims, TArray& buffer_input_strides, diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_grad.cc b/orttraining/orttraining/training_ops/cuda/tensor/gather_grad.cc index 41607cb88d..9a89cda8c1 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/gather_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_grad.cc @@ -36,6 +36,7 @@ ONNX_OPERATOR_KERNEL_EX( namespace { template Status CallGatherGradImpl( + cudaStream_t stream, const CudaScratchBufferAllocator& allocator, int64_t num_gathered_per_index, int64_t gather_dimension_size, int64_t num_batches, const Tensor& dY, const Tensor& gathered_indices, @@ -49,6 +50,7 @@ Status CallGatherGradImpl( const SafeInt num_gathered_indices{gathered_indices.Shape().Size()}; GatherGradImpl( + stream, allocator, reinterpret_cast(dY_data), indices_data, @@ -63,6 +65,7 @@ Status CallGatherGradImpl( template Status DispatchToGatherGradImplByTindex( + cudaStream_t stream, MLDataType tindex_data_type, const CudaScratchBufferAllocator& allocator, int64_t num_gathered_per_index, int64_t gather_dimension_size, int64_t num_batches, @@ -70,16 +73,17 @@ Status DispatchToGatherGradImplByTindex( Tensor& dX) { if (utils::IsPrimitiveDataType(tindex_data_type)) 
{ return CallGatherGradImpl( - allocator, num_gathered_per_index, gather_dimension_size, num_batches, dY, gathered_indices, dX); + stream, allocator, num_gathered_per_index, gather_dimension_size, num_batches, dY, gathered_indices, dX); } else if (utils::IsPrimitiveDataType(tindex_data_type)) { return CallGatherGradImpl( - allocator, num_gathered_per_index, gather_dimension_size, num_batches, dY, gathered_indices, dX); + stream, allocator, num_gathered_per_index, gather_dimension_size, num_batches, dY, gathered_indices, dX); } return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "GatherGrad unsupported TIndex type: ", tindex_data_type); } Status DispatchToGatherGradImpl( + cudaStream_t stream, MLDataType t_data_type, MLDataType tindex_data_type, const CudaScratchBufferAllocator& allocator, int64_t num_gathered_per_index, int64_t gather_dimension_size, int64_t num_batches, @@ -87,14 +91,14 @@ Status DispatchToGatherGradImpl( Tensor& dX) { if (utils::IsPrimitiveDataType(t_data_type)) { return DispatchToGatherGradImplByTindex( - tindex_data_type, allocator, num_gathered_per_index, gather_dimension_size, num_batches, dY, gathered_indices, dX); + stream, tindex_data_type, allocator, num_gathered_per_index, gather_dimension_size, num_batches, dY, gathered_indices, dX); } else if (utils::IsPrimitiveDataType(t_data_type)) { return DispatchToGatherGradImplByTindex( - tindex_data_type, allocator, num_gathered_per_index, gather_dimension_size, num_batches, dY, gathered_indices, dX); + stream, tindex_data_type, allocator, num_gathered_per_index, gather_dimension_size, num_batches, dY, gathered_indices, dX); #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 } else if (utils::IsPrimitiveDataType(t_data_type)) { return DispatchToGatherGradImplByTindex( - tindex_data_type, allocator, num_gathered_per_index, gather_dimension_size, num_batches, dY, gathered_indices, dX); + stream, tindex_data_type, allocator, num_gathered_per_index, gather_dimension_size, num_batches, dY, gathered_indices, 
dX); #endif } @@ -109,7 +113,7 @@ Status GatherGrad::ComputeInternal(OpKernelContext* context) const { const Tensor* dY = context->Input(2); Tensor* dX = context->Output(0, X_shape); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(dX->MutableDataRaw(), 0, dX->SizeInBytes())); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(dX->MutableDataRaw(), 0, dX->SizeInBytes(), Stream())); if (gathered_indices->Shape().Size() == 0) { // nothing else to do @@ -125,7 +129,7 @@ Status GatherGrad::ComputeInternal(OpKernelContext* context) const { const int64_t num_batches = X_shape.SizeToDimension(axis); return DispatchToGatherGradImpl( - t_type, tindex_type, CudaScratchBufferAllocator{*this}, + Stream(), t_type, tindex_type, CudaScratchBufferAllocator{*this}, num_gathered_per_index, gather_dimension_size, num_batches, *dY, *gathered_indices, *dX); } diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.cu index 60713126ae..9c0537c81d 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.cu @@ -51,6 +51,7 @@ __global__ void CopyKernel(TOutputIterator dst, TInputIterator src, int64_t leng // get sorted dX and dY indices, ordered by dX indices template void GetSortedIndices( + cudaStream_t stream, const CudaScratchBufferAllocator& allocator, const TIndex* dX_indices, GatheredIndexIndex_t num_gathered_indices, @@ -58,7 +59,7 @@ void GetSortedIndices( IAllocatorUniquePtr& dY_indices_sorted_out) { auto dY_indices = allocator.GetScratchBuffer(num_gathered_indices); CopyKernel<<>>( + GridDim::maxThreadsPerBlock, 0, stream>>>( dY_indices.get(), cub::CountingInputIterator{0}, num_gathered_indices); auto dX_indices_sorted = allocator.GetScratchBuffer(num_gathered_indices); @@ -69,14 +70,14 @@ void GetSortedIndices( nullptr, temp_storage_size_bytes, dX_indices, dX_indices_sorted.get(), dY_indices.get(), 
dY_indices_sorted.get(), - num_gathered_indices)); + num_gathered_indices, 0, sizeof(TIndex)*8, stream)); auto temp_storage = allocator.GetScratchBuffer(temp_storage_size_bytes); CUDA_CALL_THROW(cub::DeviceRadixSort::SortPairs( temp_storage.get(), temp_storage_size_bytes, dX_indices, dX_indices_sorted.get(), dY_indices.get(), dY_indices_sorted.get(), - num_gathered_indices)); + num_gathered_indices, 0, sizeof(TIndex)*8, stream)); dX_indices_sorted_out = std::move(dX_indices_sorted); dY_indices_sorted_out = std::move(dY_indices_sorted); @@ -84,18 +85,19 @@ void GetSortedIndices( template IAllocatorUniquePtr GetOffsetsFromCounts( + cudaStream_t stream, const CudaScratchBufferAllocator& allocator, const T* counts, int32_t num_counts) { auto offsets = allocator.GetScratchBuffer(num_counts); size_t temp_storage_size_bytes = 0; CUDA_CALL_THROW(cub::DeviceScan::ExclusiveSum( nullptr, temp_storage_size_bytes, - counts, offsets.get(), num_counts)); + counts, offsets.get(), num_counts, stream)); auto temp_storage = allocator.GetScratchBuffer(temp_storage_size_bytes); CUDA_CALL_THROW(cub::DeviceScan::ExclusiveSum( temp_storage.get(), temp_storage_size_bytes, - counts, offsets.get(), num_counts)); + counts, offsets.get(), num_counts, stream)); return offsets; } @@ -157,6 +159,7 @@ __global__ void DirectSumKernel( // directly sum gathered dY values into the corresponding dX value template void DirectSumImpl( + cudaStream_t stream, const TIndex* dX_indices_sorted, const TIndex* dY_indices_sorted, const T* dY_data, @@ -168,7 +171,7 @@ void DirectSumImpl( dim3 block(GPU_WARP_SIZE, 4); dim3 grid(CeilDiv(num_gathered_indices, 4), CeilDiv(num_gathered_per_index, 128)); - DirectSumKernel<<>>( + DirectSumKernel<<>>( dX_indices_sorted, dY_indices_sorted, dY_data, @@ -299,6 +302,7 @@ __global__ void ComputeSegmentSumsAndScatterKernel( // the corresponding dX value template void PartialSumsImpl( + cudaStream_t stream, const CudaScratchBufferAllocator& allocator, const TIndex* 
dX_indices_sorted, const TIndex* dY_indices_sorted, @@ -317,29 +321,30 @@ void PartialSumsImpl( auto per_segment_partial_segment_counts = allocator.GetScratchBuffer(num_segments); { const auto blocks_per_grid = CeilDiv(num_gathered_indices, GridDim::maxThreadsPerBlock); - ComputePerSegmentPartialSegmentCountsKernel<<>>( + ComputePerSegmentPartialSegmentCountsKernel<<>>( per_segment_partial_segment_counts.get(), segment_offsets, num_segments, num_gathered_indices); } // compute partial segment offsets per segment auto per_segment_partial_segment_offsets = GetOffsetsFromCounts( - allocator, per_segment_partial_segment_counts.get(), num_segments); + stream, allocator, per_segment_partial_segment_counts.get(), num_segments); SegmentIndex_t host_num_partial_segments = 0; { SegmentIndex_t last_segment_partial_segment_offset = 0, last_segment_partial_segment_count = 0; // CPU/GPU sync! - CUDA_CALL_THROW(cudaMemcpy( + CUDA_CALL_THROW(cudaMemcpyAsync( &last_segment_partial_segment_offset, &per_segment_partial_segment_offsets.get()[num_segments - 1], - sizeof(SegmentIndex_t), cudaMemcpyDeviceToHost)); + sizeof(SegmentIndex_t), cudaMemcpyDeviceToHost, stream)); // CPU/GPU sync! 
- CUDA_CALL_THROW(cudaMemcpy( + CUDA_CALL_THROW(cudaMemcpyAsync( &last_segment_partial_segment_count, &per_segment_partial_segment_counts.get()[num_segments - 1], - sizeof(SegmentIndex_t), cudaMemcpyDeviceToHost)); + sizeof(SegmentIndex_t), cudaMemcpyDeviceToHost, stream)); + CUDA_CALL_THROW(cudaStreamSynchronize(stream)); host_num_partial_segments = last_segment_partial_segment_offset + last_segment_partial_segment_count; } @@ -348,7 +353,7 @@ void PartialSumsImpl( auto partial_segment_offsets = allocator.GetScratchBuffer(host_num_partial_segments); { const auto blocks_per_grid = CeilDiv(num_segments, GridDim::maxThreadsPerBlock); - ComputePartialSegmentOffsetsKernel<<>>( + ComputePartialSegmentOffsetsKernel<<>>( partial_segment_offsets.get(), per_segment_partial_segment_counts.get(), per_segment_partial_segment_offsets.get(), @@ -369,7 +374,7 @@ void PartialSumsImpl( const dim3 blocks_per_grid( CeilDiv(host_num_partial_segments * num_gathered_per_index_warp_size_multiple, threads_per_block), num_batches); - ComputePartialSegmentSumsKernel<<>>( + ComputePartialSegmentSumsKernel<<>>( dY_indices_sorted, dY_data, num_gathered_indices, @@ -385,7 +390,7 @@ void PartialSumsImpl( const dim3 blocks_per_grid( CeilDiv(num_segments * num_gathered_per_index_warp_size_multiple, threads_per_block), num_batches); - ComputeSegmentSumsAndScatterKernel<<>>( + ComputeSegmentSumsAndScatterKernel<<>>( dX_indices_sorted, dX_data, num_gathered_per_index, @@ -402,6 +407,7 @@ void PartialSumsImpl( template void Impl( + cudaStream_t stream, const CudaScratchBufferAllocator& allocator, const T* dY_data, const TIndex* dX_indices, @@ -412,6 +418,7 @@ void Impl( T* dX_data) { IAllocatorUniquePtr dX_indices_sorted, dY_indices_sorted; GetSortedIndices( + stream, allocator, dX_indices, num_gathered_indices, dX_indices_sorted, dY_indices_sorted); @@ -425,17 +432,18 @@ void Impl( CUDA_CALL_THROW(cub::DeviceRunLengthEncode::Encode( nullptr, temp_storage_size_bytes, dX_indices_sorted.get(), 
cub::DiscardOutputIterator{}, segment_counts.get(), - num_segments.get(), num_gathered_indices)); + num_segments.get(), num_gathered_indices, stream)); auto temp_storage = allocator.GetScratchBuffer(temp_storage_size_bytes); CUDA_CALL_THROW(cub::DeviceRunLengthEncode::Encode( temp_storage.get(), temp_storage_size_bytes, dX_indices_sorted.get(), cub::DiscardOutputIterator{}, segment_counts.get(), - num_segments.get(), num_gathered_indices)); + num_segments.get(), num_gathered_indices, stream)); // CPU/GPU sync! - CUDA_CALL_THROW(cudaMemcpy( - &host_num_segments, num_segments.get(), sizeof(SegmentIndex_t), cudaMemcpyDeviceToHost)); + CUDA_CALL_THROW(cudaMemcpyAsync( + &host_num_segments, num_segments.get(), sizeof(SegmentIndex_t), cudaMemcpyDeviceToHost, stream)); + CUDA_CALL_THROW(cudaStreamSynchronize(stream)); } // get largest segment size and use that to select implementation @@ -446,30 +454,32 @@ void Impl( size_t temp_storage_size_bytes = 0; CUDA_CALL_THROW(cub::DeviceReduce::Max( nullptr, temp_storage_size_bytes, - segment_counts.get(), max_segment_count.get(), host_num_segments)); + segment_counts.get(), max_segment_count.get(), host_num_segments, stream)); auto temp_storage = allocator.GetScratchBuffer(temp_storage_size_bytes); CUDA_CALL_THROW(cub::DeviceReduce::Max( temp_storage.get(), temp_storage_size_bytes, - segment_counts.get(), max_segment_count.get(), host_num_segments)); + segment_counts.get(), max_segment_count.get(), host_num_segments, stream)); // CPU/GPU sync! 
- CUDA_CALL_THROW(cudaMemcpy( - &host_max_segment_count, max_segment_count.get(), sizeof(GatheredIndexIndex_t), cudaMemcpyDeviceToHost)); + CUDA_CALL_THROW(cudaMemcpyAsync( + &host_max_segment_count, max_segment_count.get(), sizeof(GatheredIndexIndex_t), cudaMemcpyDeviceToHost, stream)); + CUDA_CALL_THROW(cudaStreamSynchronize(stream)); } constexpr GatheredIndexIndex_t kMaxSegmentSizeThreshold = 32; if (host_max_segment_count <= kMaxSegmentSizeThreshold) { DirectSumImpl( - dX_indices_sorted.get(), dY_indices_sorted.get(), + stream, dX_indices_sorted.get(), dY_indices_sorted.get(), dY_data, dX_data, num_gathered_indices, num_gathered_per_index, gather_dimension_size, num_batches); } else { auto segment_offsets = GetOffsetsFromCounts( - allocator, segment_counts.get(), host_num_segments); + stream, allocator, segment_counts.get(), host_num_segments); segment_counts.reset(); PartialSumsImpl( + stream, allocator, dX_indices_sorted.get(), dY_indices_sorted.get(), dY_data, dX_data, @@ -482,6 +492,7 @@ void Impl( // doesn't perform well if there are many duplicate values in dX_indices template void Impl_Simplified( + cudaStream_t stream, const CudaScratchBufferAllocator& allocator, const T* dY_data, const TIndex* dX_indices, @@ -492,6 +503,7 @@ void Impl_Simplified( T* dX_data) { IAllocatorUniquePtr dX_indices_sorted, dY_indices_sorted; GetSortedIndices( + stream, allocator, dX_indices, num_gathered_indices, dX_indices_sorted, dY_indices_sorted); @@ -499,7 +511,7 @@ void Impl_Simplified( dim3 block(GPU_WARP_SIZE, 4); dim3 grid(CeilDiv(num_gathered_indices, 4), CeilDiv(num_gathered_per_index, 128)); - DirectSumKernel<<>>( + DirectSumKernel<<>>( dX_indices_sorted.get(), dY_indices_sorted.get(), dY_data, @@ -514,6 +526,7 @@ void Impl_Simplified( template void GatherGradImpl( + cudaStream_t stream, const CudaScratchBufferAllocator& allocator, const T* dY_data, const TIndex* dX_indices, @@ -523,6 +536,7 @@ void GatherGradImpl( const int64_t num_batches, T* dX_data) { 
gather_grad_internal::Impl( + stream, allocator, dY_data, dX_indices, num_gathered_indices, gather_dimension_size, num_gathered_per_index, num_batches, @@ -531,6 +545,7 @@ void GatherGradImpl( #define SPECIALIZED(T, TIndex) \ template void GatherGradImpl( \ + cudaStream_t stream, \ const CudaScratchBufferAllocator& allocator, \ const T* dY_data, \ const TIndex* dX_indices, \ diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.h b/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.h index 4a174da99f..a792e08f10 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.h +++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.h @@ -28,6 +28,7 @@ using GatheredIndexIndex_t = int32_t; template void GatherGradImpl( + cudaStream_t stream, const CudaScratchBufferAllocator& allocator, const T* dY_data, const TIndex* dX_indices, diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc index b7dad4963b..90ea1bca1b 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc @@ -36,13 +36,15 @@ REGISTER_KERNEL_TYPED_GATHER_ND_GRAD(int64_t) template struct GatherNDGradComputeImpl { - void operator()(const int64_t num_slices, + void operator()(cudaStream_t stream, + const int64_t num_slices, const int64_t slice_size, const void* const kernel_input_data, void* const kernel_output_data, int64_t* const input_slice_offsets_data) const { typedef typename ToCudaType::MappedType CudaT; - GatherNDGradImpl(num_slices, kernel_input_data, + GatherNDGradImpl(stream, + num_slices, kernel_input_data, kernel_output_data, slice_size, input_slice_offsets_data); } @@ -82,20 +84,21 @@ Status GatherNDGrad::ComputeInternal(OpKernelContext* context) const { auto output_tensor = context->Output(0, input_shape); // TODO this memset can be 
expensive, a sparse tensor representation would help here - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output_tensor->MutableDataRaw(), 0, output_tensor->SizeInBytes())); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(output_tensor->MutableDataRaw(), 0, output_tensor->SizeInBytes(), Stream())); // Compute int64_t num_slices; int64_t slice_size; IAllocatorUniquePtr input_slice_offsets_buffer; - ORT_RETURN_IF_ERROR(PrepareCompute(batch_dims_, input_shape, indices_shape, indices_tensor, + ORT_RETURN_IF_ERROR(PrepareCompute(Stream(), + batch_dims_, input_shape, indices_shape, indices_tensor, num_slices, slice_size, input_slice_offsets_buffer)); const void* const kernel_input_data = update_tensor->DataRaw(); void* const kernel_output_data = output_tensor->MutableDataRaw(); utils::MLTypeCallDispatcher t_disp(update_tensor->GetElementType()); - t_disp.Invoke(num_slices, slice_size, kernel_input_data, kernel_output_data, input_slice_offsets_buffer.get()); + t_disp.Invoke(Stream(), num_slices, slice_size, kernel_input_data, kernel_output_data, input_slice_offsets_buffer.get()); return Status::OK(); } diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad_impl.cu index d733887af9..8eddbd21a8 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad_impl.cu @@ -26,18 +26,19 @@ __global__ void _GatherNDGradKernel( template void GatherNDGradImpl( + cudaStream_t stream, const size_t num_slices, const void* update_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data) { const unsigned int blocks_per_grid = static_cast(CeilDiv(num_slices * slice_size, GridDim::maxThreadsPerBlock)); - _GatherNDGradKernel<<>>( + _GatherNDGradKernel<<>>( num_slices, static_cast(update_data), static_cast(output_data), slice_size, input_slice_offsets_data); } #define SPECIALIZED_GRAD_IMPL(T) 
\ - template void GatherNDGradImpl(const size_t num_slices, const void* update_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data) + template void GatherNDGradImpl(cudaStream_t stream, const size_t num_slices, const void* update_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data) SPECIALIZED_GRAD_IMPL(float); #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad_impl.h b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad_impl.h index 3b19e758e2..e00a3ed410 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad_impl.h +++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad_impl.h @@ -9,6 +9,7 @@ namespace cuda { template void GatherNDGradImpl( + cudaStream_t stream, const size_t num_slices, const void* update_data, void* output_data, diff --git a/orttraining/orttraining/training_ops/cuda/tensor/slice_grad.cc b/orttraining/orttraining/training_ops/cuda/tensor/slice_grad.cc index 553d67bcc3..7004cbfb70 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/slice_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/tensor/slice_grad.cc @@ -52,8 +52,9 @@ Status SliceGrad::CallSliceImp(size_t element_size, size_t dimension_count, cons const TArray& output_strides, OpKernelContext* ctx, const TensorShape& output_shape) const { Tensor* gradient_out_tensor = GetOutputGradientTensor(ctx); - CUDA_RETURN_IF_ERROR(cudaMemsetAsync(gradient_out_tensor->MutableDataRaw(), 0, gradient_out_tensor->SizeInBytes())); - return SliceImplGrad(element_size, + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(gradient_out_tensor->MutableDataRaw(), 0, gradient_out_tensor->SizeInBytes(), Stream())); + return SliceImplGrad(Stream(), + element_size, gsl::narrow_cast(dimension_count), starts_buffer, steps_buffer, diff --git a/orttraining/orttraining/training_ops/cuda/tensor/split.cc 
b/orttraining/orttraining/training_ops/cuda/tensor/split.cc index 4a30b785a6..37a18c603f 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/split.cc +++ b/orttraining/orttraining/training_ops/cuda/tensor/split.cc @@ -82,7 +82,8 @@ Status SplitTraining::ComputeInternal(OpKernelContext* ctx) const { axis_dimension_input_output_mapping_gpu.CopyToGpu(); size_t element_size = input_tensor->DataType()->Size(); - ORT_RETURN_IF_ERROR(SplitImpl(element_size, + ORT_RETURN_IF_ERROR(SplitImpl(Stream(), + element_size, block_size_including_axis_dim, block_size_inside_axis_dim, split_sizes_gpu.GpuPtr(), diff --git a/orttraining/orttraining/training_ops/cuda/tensor/view.cc b/orttraining/orttraining/training_ops/cuda/tensor/view.cc index af6c140101..6d5d9da000 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/view.cc +++ b/orttraining/orttraining/training_ops/cuda/tensor/view.cc @@ -76,7 +76,7 @@ Status View::ComputeInternal(OpKernelContext* context) const { // View output is not sharing the underlaying buffer of input, copy instead const void* source = static_cast(X_data) + y_byte_offsets[i]; void* target = Y->MutableDataRaw(); - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target, source, Y->SizeInBytes(), cudaMemcpyDeviceToDevice)); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target, source, Y->SizeInBytes(), cudaMemcpyDeviceToDevice, Stream())); } else { Y->SetByteOffset(y_byte_offsets[i]); } diff --git a/orttraining/orttraining/training_ops/rocm/math/softmax_grad.cc b/orttraining/orttraining/training_ops/rocm/math/softmax_grad.cc index 766540b437..00039b7c42 100644 --- a/orttraining/orttraining/training_ops/rocm/math/softmax_grad.cc +++ b/orttraining/orttraining/training_ops/rocm/math/softmax_grad.cc @@ -13,6 +13,7 @@ namespace rocm { template Status SoftMaxGradComputeHelper( + hipStream_t stream, const T* dY, const TensorShape& input_shape, const T* Y, @@ -33,7 +34,7 @@ Status SoftMaxGradComputeHelper( if (D <= 1024 && D * sizeof(T) <= 4096) { 
dispatch_softmax_backward, is_log_softmax>( - dX_data, dY_data, Y_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); + stream, dX_data, dY_data, Y_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); return Status::OK(); } @@ -90,9 +91,9 @@ Status SoftmaxGrad::ComputeInternal(OpKernelContext* ctx) const { T* dX_data = dX->template MutableData(); if (log_softmax_) { - return SoftMaxGradComputeHelper(dY_data, input_shape, Y_data, dX_data, MiopenHandle(), axis_); + return SoftMaxGradComputeHelper(Stream(), dY_data, input_shape, Y_data, dX_data, MiopenHandle(), axis_); } else { - return SoftMaxGradComputeHelper(dY_data, input_shape, Y_data, dX_data, MiopenHandle(), axis_); + return SoftMaxGradComputeHelper(Stream(), dY_data, input_shape, Y_data, dX_data, MiopenHandle(), axis_); } } diff --git a/orttraining/orttraining/training_ops/rocm/math/softmax_grad_impl.cu b/orttraining/orttraining/training_ops/rocm/math/softmax_grad_impl.cu index c9c60c0706..2781435170 100644 --- a/orttraining/orttraining/training_ops/rocm/math/softmax_grad_impl.cu +++ b/orttraining/orttraining/training_ops/rocm/math/softmax_grad_impl.cu @@ -120,7 +120,7 @@ __global__ void softmax_warp_backward(output_t* gradInput, const input_t* grad, } template -void dispatch_softmax_backward(output_t* grad_input, const input_t* grad, const input_t* output, int softmax_elements, int softmax_elements_stride, int batch_count) { +void dispatch_softmax_backward(hipStream_t stream, output_t* grad_input, const input_t* grad, const input_t* output, int softmax_elements, int softmax_elements_stride, int batch_count) { if (softmax_elements == 0) { return; } else { @@ -144,37 +144,37 @@ void dispatch_softmax_backward(output_t* grad_input, const input_t* grad, const // Launch code would be more elegant if C++ supported FOR CONSTEXPR switch (log2_elements) { case 0: // 1 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, 0, grad_input, grad, 
output, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, stream, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 1: // 2 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, 0, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, stream, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 2: // 4 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, 0, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, stream, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 3: // 8 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, 0, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, stream, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 4: // 16 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, 0, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, stream, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 5: // 32 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, 0, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + 
hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, stream, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 6: // 64 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, 0, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, stream, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 7: // 128 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, 0, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, stream, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 8: // 256 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, 0, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, stream, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 9: // 512 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, 0, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, stream, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; case 10: // 1024 - hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), dim3(threads), 0, 0, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); + hipLaunchKernelGGL(HIP_KERNEL_NAME(softmax_warp_backward), dim3(blocks), 
dim3(threads), 0, stream, grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); break; default: break; @@ -183,8 +183,8 @@ void dispatch_softmax_backward(output_t* grad_input, const input_t* grad, const } #define SPECIALIZED_SOFTMAX_GRAD_IMPL(input_t, output_t, acc_t) \ -template void dispatch_softmax_backward(input_t * grad_input, const output_t* grad, const output_t* output, int softmax_elements, int softmax_elements_stride, int batch_count); \ -template void dispatch_softmax_backward(input_t * grad_input, const output_t* grad, const output_t* output, int softmax_elements, int softmax_elements_stride, int batch_count); +template void dispatch_softmax_backward(hipStream_t stream, input_t * grad_input, const output_t* grad, const output_t* output, int softmax_elements, int softmax_elements_stride, int batch_count); \ +template void dispatch_softmax_backward(hipStream_t stream, input_t * grad_input, const output_t* grad, const output_t* output, int softmax_elements, int softmax_elements_stride, int batch_count); SPECIALIZED_SOFTMAX_GRAD_IMPL(float, float, float) SPECIALIZED_SOFTMAX_GRAD_IMPL(half, half, float) diff --git a/orttraining/orttraining/training_ops/rocm/optimizer/adam.cc b/orttraining/orttraining/training_ops/rocm/optimizer/adam.cc index 2d378bff5b..6c554ce76e 100644 --- a/orttraining/orttraining/training_ops/rocm/optimizer/adam.cc +++ b/orttraining/orttraining/training_ops/rocm/optimizer/adam.cc @@ -103,20 +103,20 @@ Status AdamOptimizer: if (do_update_tensor != nullptr) { const bool do_update = *(do_update_tensor->template Data()); if (!do_update) { - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(M1, NM1)); - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(M2, NM2)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(Stream(), M1, NM1)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(Stream(), M2, NM2)); if (S_in != S_out) { *(S_out) = *(S_in); } if (NW != nullptr) { - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(W, *NW)); + 
ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(Stream(), W, *NW)); } if (NG != nullptr) { - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(G, *NG)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(Stream(), G, *NG)); } if (W_MIXED_FP != nullptr && NW_MIXED_FP != nullptr) { - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(*W_MIXED_FP, *NW_MIXED_FP)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(Stream(), *W_MIXED_FP, *NW_MIXED_FP)); } return Status::OK(); @@ -124,6 +124,7 @@ Status AdamOptimizer: } AdamOptimizerImpl( + Stream(), reinterpret_cast(ETA.template Data()), *S_in, reinterpret_cast(W.template Data()), diff --git a/orttraining/orttraining/training_ops/rocm/optimizer/adam.cu b/orttraining/orttraining/training_ops/rocm/optimizer/adam.cu index cfda9ddbca..aa05bf86d0 100644 --- a/orttraining/orttraining/training_ops/rocm/optimizer/adam.cu +++ b/orttraining/orttraining/training_ops/rocm/optimizer/adam.cu @@ -139,6 +139,7 @@ __global__ void _AdamOptimizer_mode1( template void AdamOptimizerImpl( + hipStream_t stream, const T1* eta, const T2 update_count, const T3* weights, @@ -176,7 +177,7 @@ void AdamOptimizerImpl( // bias correction is applied on learning rate, // weight decay is applied after weight is updated. 
if (weight_decay_mode == 0) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(_AdamOptimizer_mode0), dim3(blocksPerGrid), dim3(GridDim::maxThreadsPerBlock), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(_AdamOptimizer_mode0), dim3(blocksPerGrid), dim3(GridDim::maxThreadsPerBlock), 0, stream, eta, weights, grads, @@ -199,7 +200,7 @@ void AdamOptimizerImpl( N); } else if (weight_decay_mode == 1) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(_AdamOptimizer_mode1), dim3(blocksPerGrid), dim3(GridDim::maxThreadsPerBlock), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(_AdamOptimizer_mode1), dim3(blocksPerGrid), dim3(GridDim::maxThreadsPerBlock), 0, stream, eta, weights, grads, @@ -229,6 +230,7 @@ void AdamOptimizerImpl( #define SPECIALIZED_AdamOptimizerImpl(T1, T2, T3, T4, T_GRAD, T_GRAD_NORM, T_MIXED_PRECISION_FP) \ template void AdamOptimizerImpl( \ + hipStream_t stream, \ const T1* eta, \ const T2 update_count, \ const T3* weights, \ diff --git a/orttraining/orttraining/training_ops/rocm/optimizer/lamb.cc b/orttraining/orttraining/training_ops/rocm/optimizer/lamb.cc index f45b5b1a85..0809149ec5 100644 --- a/orttraining/orttraining/training_ops/rocm/optimizer/lamb.cc +++ b/orttraining/orttraining/training_ops/rocm/optimizer/lamb.cc @@ -106,6 +106,7 @@ void check_inputs_and_outputs( template Status copy_inputs_to_outputs( + hipStream_t stream, OpKernelContext* ctx, const int non_grouped_input_count, const int non_grouped_output_count, @@ -144,16 +145,16 @@ Status copy_inputs_to_outputs( w_mixed_precision_new->SetByteOffset(w_mixed_precision->ByteOffset()); if (w_new) { - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(w, *w_new)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(stream, w, *w_new)); } if (g_new) { - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(g, *g_new)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(stream, g, *g_new)); } - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(m1, m1_new)); - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(m2, m2_new)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(stream, m1, m1_new)); + 
ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(stream, m2, m2_new)); if (w_mixed_precision_new) { - ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(*w_mixed_precision, *w_mixed_precision_new)); + ORT_RETURN_IF_ERROR(CopyIfNotSameBuffer(stream, *w_mixed_precision, *w_mixed_precision_new)); } } @@ -162,6 +163,7 @@ Status copy_inputs_to_outputs( template Status launch_lamb_compute_direction( + hipStream_t stream, const int64_t update_count, const int group_count, const HipT2* p_loss_scale, @@ -210,6 +212,7 @@ Status launch_lamb_compute_direction( do_bias_correction ? onnxruntime::contrib::compute_bias_correction_coefficient(betas[i], update_count) : 1.f; LambComputeDirection( + stream, p_ws[i], p_gs[i], p_m1s[i], @@ -257,6 +260,7 @@ Status launch_lamb_compute_direction( LambStage1 lamb_stage1; launch_multi_tensor_functor( + stream, 2048 * 32, tensor_sizes_in_buckets[key], buckets[key], @@ -287,7 +291,7 @@ Status launch_lamb_reduction( ORT_ENFORCE(group_count == static_cast(p_ds.size())); constexpr int tensor_count_per_group = 4; - + hipStream_t stream = kernel.Stream(); // Bucketize tensor groups by the associated optimizer configuration. // If two tensor groups use different "alpha", they should be put into two distinct buckets. 
std::vector> buckets; @@ -296,12 +300,14 @@ Status launch_lamb_reduction( for (int i = 0; i < group_count; ++i) { if (tensor_sizes[i] > max_tensor_size) { ORT_RETURN_IF_ERROR(reduce_square_sum( + stream, p_ws[i], p_w_norms[i], tensor_sizes[i], reduction_buffer, reduction_buffer_size)); ORT_RETURN_IF_ERROR(reduce_square_sum( + stream, p_ds[i], p_d_norms[i], tensor_sizes[i], @@ -332,6 +338,7 @@ Status launch_lamb_reduction( typedef LambMultiTensorReductionFunctor TReducer; TReducer reducer; launch_multi_tensor_functor( + stream, 2048 * 32, tensor_sizes_in_buckets, buckets, @@ -346,6 +353,7 @@ Status launch_lamb_reduction( template Status launch_lamb_update( + hipStream_t stream, const int group_count, const HipT1* eta, const float ratio_min, @@ -378,6 +386,7 @@ Status launch_lamb_update( for (int i = 0; i < group_count; ++i) { if (tensor_sizes[i] > max_tensor_size) { LambUpdate( + stream, eta, ratio_min, ratio_max, @@ -419,6 +428,7 @@ Status launch_lamb_update( LambStage2 lamb_stage2; launch_multi_tensor_functor( + stream, 2048 * 32, tensor_sizes_in_bucket, buckets, @@ -493,6 +503,7 @@ Status LambOptimizer::Compute auto update_signal = *update_signal_tensor->template Data(); if (!update_signal) { return copy_inputs_to_outputs( + Stream(), ctx, non_grouped_input_count, non_grouped_output_count, @@ -529,14 +540,14 @@ Status LambOptimizer::Compute // and T2=float. IAllocatorUniquePtr d_norm_buffer = GetScratchBuffer(group_count); HipT2* d_norm_data = reinterpret_cast(d_norm_buffer.get()); - HIP_RETURN_IF_ERROR(hipMemsetAsync(d_norm_data, 0, group_count * sizeof(T2))); + HIP_RETURN_IF_ERROR(hipMemsetAsync(d_norm_data, 0, group_count * sizeof(T2), Stream())); // Allocate buffer for reduction computation of weight tensor. // The i-th weight's norm is stored at the i-th element. // We reduce type T2 tensor to type T2 scalar. An example is that T2=float. 
IAllocatorUniquePtr w_norm_buffer = GetScratchBuffer(group_count); HipT2* w_norm_data = reinterpret_cast(w_norm_buffer.get()); - HIP_RETURN_IF_ERROR(hipMemsetAsync(w_norm_data, 0, group_count * sizeof(T2))); + HIP_RETURN_IF_ERROR(hipMemsetAsync(w_norm_data, 0, group_count * sizeof(T2), Stream())); // Find the max size of updated weight tensors. int max_tensor_size = 0; @@ -642,6 +653,7 @@ Status LambOptimizer::Compute } ORT_RETURN_IF_ERROR(launch_lamb_compute_direction( + Stream(), step_data ? *step_data : 0, group_count, loss_scale_data, @@ -665,6 +677,7 @@ Status LambOptimizer::Compute reduction_buffer_size)); ORT_RETURN_IF_ERROR(launch_lamb_update( + Stream(), group_count, eta_data, ratio_min_, diff --git a/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cc b/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cc index c054b96816..6fae7135b7 100644 --- a/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cc +++ b/orttraining/orttraining/training_ops/rocm/reduction/reduction_all.cc @@ -44,7 +44,7 @@ Status ReduceAllL2::ComputeInternal(OpKernelContext* ctx) const { // Allocate output tensor. Tensor* output = ctx->Output(0, {}); HipTOut* p_output = reinterpret_cast(output->template MutableData()); - HIP_RETURN_IF_ERROR(hipMemsetAsync(p_output, 0, sizeof(HipTOut))); + HIP_RETURN_IF_ERROR(hipMemsetAsync(p_output, 0, sizeof(HipTOut), Stream())); // bool deterministic = ctx->GetUseDeterministicCompute(); bool deterministic = true; @@ -55,11 +55,11 @@ Status ReduceAllL2::ComputeInternal(OpKernelContext* ctx) const { // Check if all values are finite and write true to deviceOutput. // Otherwise, false will be written. launch_multi_tensor_functor<1, TFunctor>( - 2048 * 32, tensor_sizes, grouped_tensor_pointers, functor, p_output); + Stream(), 2048 * 32, tensor_sizes, grouped_tensor_pointers, functor, p_output); // *p_output is the squared sum of all elements. // Let's take a sqrt to get the actual L2-norm. 
- ScalarSqrt(p_output, p_output); + ScalarSqrt(Stream(), p_output, p_output); } else { // alternate path only for deterministic compute .. typedef AccumulationType_t HipTAcc; @@ -81,7 +81,7 @@ Status ReduceAllL2::ComputeInternal(OpKernelContext* ctx) const { // buffer for final output and square norms of each tensor auto results_buffer = GetScratchBuffer(1 + total_tensor_count); - HIP_RETURN_IF_ERROR(hipMemsetAsync(results_buffer.get(), 0, sizeof(HipTAcc) * (1 + total_tensor_count))); + HIP_RETURN_IF_ERROR(hipMemsetAsync(results_buffer.get(), 0, sizeof(HipTAcc) * (1 + total_tensor_count), Stream())); HipTAcc* p_global_sqnorm = results_buffer.get(); HipTAcc* p_tensor_sqnorm = p_global_sqnorm + 1; @@ -90,11 +90,11 @@ Status ReduceAllL2::ComputeInternal(OpKernelContext* ctx) const { for (int i = 0; i < total_tensor_count; ++i) { HipTIn* p_tensor_i = reinterpret_cast(grouped_tensor_pointers[i][0]); ORT_RETURN_IF_ERROR(reduce_square_sum( - p_tensor_i, p_tensor_sqnorm + i, tensor_sizes[i], reduction_buffer.get(), reduction_buffer_size)); + Stream(), p_tensor_i, p_tensor_sqnorm + i, tensor_sizes[i], reduction_buffer.get(), reduction_buffer_size)); } ORT_RETURN_IF_ERROR(reduce_sum( - p_tensor_sqnorm, p_global_sqnorm, total_tensor_count, reduction_buffer.get(), reduction_buffer_size)); - ScalarSqrt(p_global_sqnorm, p_output); + Stream(), p_tensor_sqnorm, p_global_sqnorm, total_tensor_count, reduction_buffer.get(), reduction_buffer_size)); + ScalarSqrt(Stream(), p_global_sqnorm, p_output); } return Status::OK(); diff --git a/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc b/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc index 2bafe92209..c628efb013 100644 --- a/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc +++ b/orttraining/orttraining/training_ops/rocm/reduction/reduction_ops.cc @@ -47,7 +47,7 @@ Status ReduceKernel::ComputeImplEx(OpKernelContext* ctx, miope // empty axes and no-op if (axes.empty() && 
noop_with_empty_axes_) { auto* Y = ctx->Output(0, X->Shape()); - HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), hipMemcpyDeviceToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), hipMemcpyDeviceToDevice, Stream())); return Status::OK(); } @@ -80,7 +80,7 @@ Status ReduceKernel::ComputeImplExOutput(0, X->Shape()); - HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), hipMemcpyDeviceToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), X->SizeInBytes(), hipMemcpyDeviceToDevice, Stream())); return Status::OK(); } @@ -107,14 +107,14 @@ Status ReduceKernel::ComputeImplExtemplate MutableData() != X->template Data()) { - HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), input_count * sizeof(int32_t), hipMemcpyDeviceToDevice)); + HIP_RETURN_IF_ERROR(hipMemcpyAsync(Y->template MutableData(), X->template Data(), input_count * sizeof(int32_t), hipMemcpyDeviceToDevice, Stream())); } return Status::OK(); } // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. 
// Therefore zeroing out the memory is required - HIP_RETURN_IF_ERROR(hipMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes())); + HIP_RETURN_IF_ERROR(hipMemsetAsync(Y->MutableDataRaw(), 0, Y->SizeInBytes(), Stream())); size_t indices_bytes = 0; size_t workspace_bytes = 0; @@ -124,7 +124,7 @@ Status ReduceKernel::ComputeImplEx temp_X = GetScratchBuffer(input_count); - Impl_Cast(reinterpret_cast(X->template Data()), temp_X.get(), X->Shape().Size()); + Impl_Cast(Stream(), reinterpret_cast(X->template Data()), temp_X.get(), X->Shape().Size()); ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES)); ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X)); @@ -150,7 +150,7 @@ Status ReduceKernel::ComputeImplEx(temp_Y.get(), Y->template MutableData(), output_count); + Impl_Cast(Stream(), temp_Y.get(), Y->template MutableData(), output_count); return Status::OK(); } diff --git a/orttraining/orttraining/training_ops/rocm/tensor/gather_grad.cc b/orttraining/orttraining/training_ops/rocm/tensor/gather_grad.cc index ef1fa2121e..d4ff6c37d8 100644 --- a/orttraining/orttraining/training_ops/rocm/tensor/gather_grad.cc +++ b/orttraining/orttraining/training_ops/rocm/tensor/gather_grad.cc @@ -93,7 +93,7 @@ Status GatherGrad::ComputeInternal(OpKernelContext* context) const { const Tensor* grad = context->Input(2); Tensor* output = context->Output(0, data_shape); - HIP_RETURN_IF_ERROR(hipMemsetAsync(output->MutableDataRaw(), 0, output->SizeInBytes())); + HIP_RETURN_IF_ERROR(hipMemsetAsync(output->MutableDataRaw(), 0, output->SizeInBytes(), Stream())); MLDataType T_type = grad->DataType(); MLDataType Tin_type = indices->DataType(); diff --git a/orttraining/orttraining/training_ops/rocm/tensor/gather_grad_impl.cu b/orttraining/orttraining/training_ops/rocm/tensor/gather_grad_impl.cu index 14599791b4..7263591b48 100644 --- a/orttraining/orttraining/training_ops/rocm/tensor/gather_grad_impl.cu +++ 
b/orttraining/orttraining/training_ops/rocm/tensor/gather_grad_impl.cu @@ -131,12 +131,13 @@ void GatherGradImpl( ) { // allocate intermediate buffers auto original_indices = rocm_kernel.template GetScratchBuffer(num_indices); + hipStream_t stream = rocm_kernel.Stream(); // initialize original_indices with [0, num_indices) { const auto blocks_per_grid = CeilDiv(num_indices, GridDim::maxThreadsPerBlock); hipcub::CountingInputIterator counting_input(Tin{}); - hipLaunchKernelGGL(_Iota, dim3(blocks_per_grid), dim3(GridDim::maxThreadsPerBlock), 0, 0, + hipLaunchKernelGGL(_Iota, dim3(blocks_per_grid), dim3(GridDim::maxThreadsPerBlock), 0, stream, counting_input, num_indices, original_indices.get()); } @@ -149,7 +150,7 @@ void GatherGradImpl( nullptr, sort_temp_storage_size_bytes, indices_data, indices_data_sorted.get(), original_indices.get(), original_indices_sorted.get(), - num_indices)); + num_indices, 0, sizeof(Tin)*8, stream)); auto sort_temp_storage = rocm_kernel.GetScratchBuffer(sort_temp_storage_size_bytes); @@ -157,13 +158,13 @@ void GatherGradImpl( sort_temp_storage.get(), sort_temp_storage_size_bytes, indices_data, indices_data_sorted.get(), original_indices.get(), original_indices_sorted.get(), - num_indices)); + num_indices, 0, sizeof(Tin)*8, stream)); dim3 block(GPU_WARP_SIZE, 4); dim3 grid(CeilDiv(num_indices, 4), CeilDiv(stride, GridDim::maxElementsPerThread * GPU_WARP_SIZE)); if (param_itrs == 1) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(_GatherAxis0GradImpl), dim3(grid), dim3(block), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(_GatherAxis0GradImpl), dim3(grid), dim3(block), 0, stream, indices_data_sorted.get(), original_indices_sorted.get(), grad_data, @@ -172,7 +173,7 @@ void GatherGradImpl( num_inputs, stride); } else { - hipLaunchKernelGGL(HIP_KERNEL_NAME(_GatherGradImpl), dim3(grid), dim3(block), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(_GatherGradImpl), dim3(grid), dim3(block), 0, stream, indices_data_sorted.get(), original_indices_sorted.get(), 
grad_data, diff --git a/orttraining/orttraining/training_ops/rocm/tensor/gather_nd_grad_impl.cu b/orttraining/orttraining/training_ops/rocm/tensor/gather_nd_grad_impl.cu index 10270bc7c4..8f924df170 100644 --- a/orttraining/orttraining/training_ops/rocm/tensor/gather_nd_grad_impl.cu +++ b/orttraining/orttraining/training_ops/rocm/tensor/gather_nd_grad_impl.cu @@ -24,18 +24,19 @@ __global__ void _GatherNDGradKernel( template void GatherNDGradImpl( + hipStream_t stream, const size_t num_slices, const void* update_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data) { const auto blocks_per_grid = CeilDiv(num_slices * slice_size, GridDim::maxThreadsPerBlock); - hipLaunchKernelGGL(HIP_KERNEL_NAME(_GatherNDGradKernel), dim3(blocks_per_grid), dim3(GridDim::maxThreadsPerBlock), 0, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(_GatherNDGradKernel), dim3(blocks_per_grid), dim3(GridDim::maxThreadsPerBlock), 0, stream, num_slices, static_cast(update_data), static_cast(output_data), slice_size, input_slice_offsets_data); } #define SPECIALIZED_GRAD_IMPL(T) \ - template void GatherNDGradImpl(const size_t num_slices, const void* update_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data) + template void GatherNDGradImpl(hipStream_t stream, const size_t num_slices, const void* update_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data) SPECIALIZED_GRAD_IMPL(float); SPECIALIZED_GRAD_IMPL(half); From 82229c8e61062ef17e57b9ca4f1189f3b589cfcb Mon Sep 17 00:00:00 2001 From: Ye Wang <52801275+wangyems@users.noreply.github.com> Date: Fri, 5 Feb 2021 16:48:22 -0800 Subject: [PATCH 27/41] Support no bias in layernorm and skiplayernorm op (#6554) * add noBias attribute in layernorm * skip bias in skiplayernorm * fix * fix cuda tests * add tests * fix windows build * fix win build issue * review comments --- onnxruntime/contrib_ops/cpu/layer_norm.cc | 6 ++- 
.../contrib_ops/cpu/skip_layer_norm.cc | 26 +++++---- .../contrib_ops/cuda/bert/layer_norm.cuh | 8 +-- .../contrib_ops/cuda/bert/skip_layer_norm.cc | 20 +++---- onnxruntime/contrib_ops/cuda/layer_norm.cc | 4 +- .../core/graph/contrib_ops/contrib_defs.cc | 4 +- .../test/contrib_ops/layer_norm_op_test.cc | 23 ++++++++ .../test/contrib_ops/layer_norm_test.cc | 15 ++++-- .../test/contrib_ops/skiplayernorm_op_test.cc | 51 ++++++++++++++++-- .../test/testdata/layernorm_no_bias.onnx | Bin 0 -> 422 bytes 10 files changed, 122 insertions(+), 35 deletions(-) create mode 100644 onnxruntime/test/testdata/layernorm_no_bias.onnx diff --git a/onnxruntime/contrib_ops/cpu/layer_norm.cc b/onnxruntime/contrib_ops/cpu/layer_norm.cc index c3c26e4684..8d043635e9 100644 --- a/onnxruntime/contrib_ops/cpu/layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/layer_norm.cc @@ -31,7 +31,7 @@ namespace contrib { KernelDefBuilder() \ .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ LayerNorm); - + REGISTER_KERNEL_TYPED(float) REGISTER_KERNEL_TYPED(double) @@ -50,7 +50,7 @@ Status LayerNorm::Compute(OpKernelContext* p_ctx) const { const Tensor* bias = p_ctx->Input(2); auto X_data = X->template Data(); auto scale_data = scale->template Data(); - auto bias_data = simplified ? nullptr : bias->template Data(); + auto bias_data = (simplified || nullptr == bias) ? 
nullptr : bias->template Data(); const TensorShape& x_shape = X->Shape(); const int64_t axis = HandleNegativeAxis(axis_, x_shape.NumDimensions()); @@ -124,6 +124,8 @@ Status LayerNorm::Compute(OpKernelContext* p_ctx) const { for (int64_t h = 0; h < norm_size; h++) { if (simplified) { p_output[h] = p_input[h] / mean_square * scale_data[h]; + } else if (nullptr == bias){ + p_output[h] = (p_input[h] - mean) / mean_square * scale_data[h]; } else { p_output[h] = (p_input[h] - mean) / mean_square * scale_data[h] + bias_data[h]; } diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 29d244d64b..b84635d082 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -61,14 +61,16 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { "Last dimension of gamma and input does not match"); } - const auto& beta_dims = beta->Shape().GetDims(); - if (beta_dims.size() != 1) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "beta is expected to have 1 dimension, got ", beta_dims.size()); - } - if (beta_dims[0] != input_dims[2]) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Last dimension of beta and input does not match"); + if (nullptr != beta) { + const auto& beta_dims = beta->Shape().GetDims(); + if (beta_dims.size() != 1) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "beta is expected to have 1 dimension, got ", beta_dims.size()); + } + if (beta_dims[0] != input_dims[2]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Last dimension of beta and input does not match"); + } } if (nullptr != bias) { @@ -91,7 +93,7 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { const T* input_data = input->Data(); const T* skip_data = skip->Data(); const T* gamma_data = gamma->Data(); - const T* beta_data = beta->Data(); + const T* beta_data = beta == nullptr ? 
nullptr : beta->Data(); const T* bias_data = bias == nullptr ? nullptr : bias->Data(); T* output_data = output->MutableData(); @@ -119,7 +121,11 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon_); for (int64_t h = 0; h < hidden_size; h++) { - p_output[h] = (p_output[h] - mean) / mean_square * gamma_data[h] + beta_data[h]; + if (nullptr == beta_data) { + p_output[h] = (p_output[h] - mean) / mean_square * gamma_data[h]; + } else { + p_output[h] = (p_output[h] - mean) / mean_square * gamma_data[h] + beta_data[h]; + } } }, 0); diff --git a/onnxruntime/contrib_ops/cuda/bert/layer_norm.cuh b/onnxruntime/contrib_ops/cuda/bert/layer_norm.cuh index 16c1469f22..e3fef4c681 100644 --- a/onnxruntime/contrib_ops/cuda/bert/layer_norm.cuh +++ b/onnxruntime/contrib_ops/cuda/bert/layer_norm.cuh @@ -1,7 +1,7 @@ /* The implementation of this file is based on bert plugins in TensorRT demo: https://github.com/NVIDIA/TensorRT/tree/release/5.1/demo/BERT/ - + Copyright 2019 NVIDIA Corporation Licensed under the Apache License, Version 2.0 (the "License"); @@ -79,7 +79,7 @@ struct KeyValuePairSum { template __device__ inline void LayerNorm( - const cub::KeyValuePair& thread_data, const int ld, const int offset, const T* beta, + const cub::KeyValuePair& thread_data, const int ld, const int offset, const T* beta, const T* gamma, const T epsilon, T* output) { // Assuming thread_data is already divided by ld @@ -101,7 +101,7 @@ __device__ inline void LayerNorm( const int idx = offset + i; const T val = output[idx]; const T g(gamma[i]); - const T b(beta[i]); + const T b = (nullptr == beta) ? (T)0 : beta[i]; output[idx] = g * (val - mu) * rsigma + b; } } @@ -129,7 +129,7 @@ __device__ inline void LayerNormSmall(const T val, const cub::KeyValuePair if (threadIdx.x < ld) { const T g(gamma[threadIdx.x]); - const T b(beta[threadIdx.x]); + const T b = (nullptr == beta) ? 
(T)0 : beta[threadIdx.x]; output[idx] = g * (val - mu) * rsigma + b; } } diff --git a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc index b8238f7690..2af85ca89f 100644 --- a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc @@ -65,14 +65,16 @@ Status SkipLayerNorm::ComputeInternal(OpKernelContext* ctx) const { "Last dimension of gamma and input does not match"); } - const auto& beta_dims = beta->Shape().GetDims(); - if (beta_dims.size() != 1) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "beta is expected to have 1 dimension, got ", beta_dims.size()); - } - if (beta_dims[0] != input_dims[2]) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Last dimension of beta and input does not match"); + if (nullptr != beta) { + const auto& beta_dims = beta->Shape().GetDims(); + if (beta_dims.size() != 1) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "beta is expected to have 1 dimension, got ", beta_dims.size()); + } + if (beta_dims[0] != input_dims[2]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Last dimension of beta and input does not match"); + } } if (nullptr != bias) { @@ -98,7 +100,7 @@ Status SkipLayerNorm::ComputeInternal(OpKernelContext* ctx) const { input->template Data(), skip->template Data(), gamma->template Data(), - beta->template Data(), + beta != nullptr ? beta->template Data() : nullptr, bias != nullptr ? 
bias->template Data() : nullptr, epsilon_, hidden_size, diff --git a/onnxruntime/contrib_ops/cuda/layer_norm.cc b/onnxruntime/contrib_ops/cuda/layer_norm.cc index 12f37f36a0..bbd292ffa9 100644 --- a/onnxruntime/contrib_ops/cuda/layer_norm.cc +++ b/onnxruntime/contrib_ops/cuda/layer_norm.cc @@ -59,7 +59,7 @@ Status LayerNorm::ComputeInternal(OpKernelContext* ctx) const auto X_data = reinterpret_cast(X->template Data()); auto scale_data = reinterpret_cast(scale->template Data()); - auto bias_data = simplified ? nullptr: reinterpret_cast(bias->template Data()); + auto bias_data = (simplified || (nullptr == bias)) ? nullptr: reinterpret_cast(bias->template Data()); const TensorShape& x_shape = X->Shape(); const int64_t axis = HandleNegativeAxis(axis_, x_shape.NumDimensions()); @@ -91,7 +91,7 @@ Status LayerNorm::ComputeInternal(OpKernelContext* ctx) const mean_data = reinterpret_cast(mean->template MutableData()); } } - + Tensor* var = ctx->Output(output_index, TensorShape(mean_inv_std_var_dim)); CudaU* inv_var_data = nullptr; if (var != nullptr) { diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index 0112963692..54bcbd2b09 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -546,7 +546,7 @@ GELU (Gaussian Error Linear Unit) approximation: Y=0.5*X*(1+tanh(0.797885*X+0.03 .Input(0, "input", "3D input tensor with shape (batch_size, sequence_length, hidden_size)", "T") .Input(1, "skip", "3D skip tensor with shape (batch_size, sequence_length, hidden_size)", "T") .Input(2, "gamma", "1D input tensor with shape (hidden_size)", "T") - .Input(3, "beta", "1D skip tensor with shape (hidden_size", "T") + .Input(3, "beta", "1D skip tensor with shape (hidden_size", "T", OpSchema::Optional) .Input(4, "bias", "1D bias tensor with shape (hidden_size", "T", OpSchema::Optional) .Output(0, "output", "3D output tensor with shape (batch_size, 
sequence_length, hidden_size)", "T") .Output(1, "mean", "Saved mean used during training to speed up gradient computation", "U", OpSchema::Optional) @@ -2085,7 +2085,7 @@ Example 4: .AllowUncheckedAttributes() .Input(0, "X", "Input data tensor from the previous layer.", "T") .Input(1, "scale", "Scale tensor.", "T") - .Input(2, "B", "Bias tensor.", "T") + .Input(2, "B", "Bias tensor.", "T", OpSchema::Optional) .Output(0, "Y", "Output data tensor.", "T") .Output(1, "mean", "Saved mean used during training to speed up gradient computation", "U", OpSchema::Optional) .Output(2, "inv_std_var", "Saved inverse standard variance used during training to speed up gradient computation.", "U", OpSchema::Optional) diff --git a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc index c8c62434d2..23dd469228 100644 --- a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc +++ b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc @@ -43,5 +43,28 @@ TEST(LayerNormTest, BERTLayerNorm) { tester.Run(); } +TEST(LayerNormTest, BERTLayerNorm_NoBias) { + OpTester tester("LayerNormalization", 1 /*opset_version*/); + tester.AddAttribute("axis", -1); + tester.AddAttribute("epsilon", 1e-12f); + + // create rand inputs + RandomValueGenerator random{}; + + std::vector X_dims{4, 128}; + std::vector X_data = random.Uniform(X_dims, 0.0f, 1.0f); + tester.AddInput("X", X_dims, X_data); + + std::vector scale_dims{128}; + std::vector scale_data = random.Uniform(scale_dims, 0.0f, 1.0f); + tester.AddInput("Scale", scale_dims, scale_data); + + tester.AddMissingOptionalInput(); + + tester.AddReferenceOutputs("testdata/layernorm_no_bias.onnx"); + + tester.Run(); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/layer_norm_test.cc b/onnxruntime/test/contrib_ops/layer_norm_test.cc index b963efa8d6..abe39ac4ff 100644 --- a/onnxruntime/test/contrib_ops/layer_norm_test.cc +++ 
b/onnxruntime/test/contrib_ops/layer_norm_test.cc @@ -29,7 +29,8 @@ static void TestLayerNorm(const std::vector& x_dims, const std::string& op, optional epsilon, int64_t axis = -1, - int64_t keep_dims = 1) { + int64_t keep_dims = 1, + bool no_bias = false) { const std::vector& n_x_m_dims = x_dims; std::vector n_dims, m_dims; ASSERT_TRUE(SplitDims(n_x_m_dims, axis, n_dims, m_dims).IsOK()); @@ -41,7 +42,7 @@ static void TestLayerNorm(const std::vector& x_dims, ASSERT_NE(keep_dims, 0); const std::vector& stats_dims = keep_dims ? n_and_ones_dims : n_dims; - + CompareOpTester test(op.c_str()); test.AddAttribute("axis", axis); test.AddAttribute("keep_dims", keep_dims); @@ -57,7 +58,7 @@ static void TestLayerNorm(const std::vector& x_dims, test.AddInput("X", n_x_m_dims, X_data); test.AddInput("scale", m_dims, scale_data, true); - if (op.compare(SIMPLIFIED_LAYER_NORM_OP) != 0) { + if (op.compare(SIMPLIFIED_LAYER_NORM_OP) != 0 && no_bias == false) { test.AddInput("B", m_dims, B_data, true); } @@ -99,6 +100,14 @@ TEST(CudaKernelTest, LayerNorm_LargeSizeTensor) { TestLayerNorm(X_dims, LAYER_NORM_OP, k_epsilon_default); } +TEST(CudaKernelTest, LayerNorm_MidSizeTensor_NoBias) { + std::vector X_dims{8, 80, 768}; + const int64_t axis = -1; + const int64_t keep_dims = 1; + const bool no_bias = true; + TestLayerNorm(X_dims, LAYER_NORM_OP, k_epsilon_default, axis, keep_dims, no_bias); +} + TEST(CudaKernelTest, SimplifiedLayerNorm_SmallSizeTensor) { const std::vector X_dims{4, 20, 128}; TestLayerNorm(X_dims, SIMPLIFIED_LAYER_NORM_OP, k_epsilon_default); diff --git a/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc b/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc index 4b53234216..d571f43628 100644 --- a/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc +++ b/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc @@ -21,7 +21,8 @@ static void RunTest( int batch_size, int sequence_length, int hidden_size, - bool use_float16 = false) { + bool use_float16 = false, + bool 
no_beta = false) { // Input and output shapes // Input 0 - input: (batch_size, sequence_length, hidden_size) // Input 1 - skip : (batch_size, sequence_length, hidden_size) @@ -40,7 +41,11 @@ static void RunTest( test.AddInput("input", input_dims, input_data); test.AddInput("skip", skip_dims, skip_data); test.AddInput("gamma", gamma_dims, gamma_data); - test.AddInput("beta", beta_dims, beta_data); + if (!no_beta) { + test.AddInput("beta", beta_dims, beta_data); + } else { + test.AddMissingOptionalInput(); + } test.AddAttribute("epsilon", epsilon); if (!bias_data.empty()) { test.AddInput("bias", bias_dims, bias_data); @@ -53,7 +58,11 @@ static void RunTest( test.AddInput("input", input_dims, ToFloat16(input_data)); test.AddInput("skip", skip_dims, ToFloat16(skip_data)); test.AddInput("gamma", gamma_dims, ToFloat16(gamma_data)); - test.AddInput("beta", beta_dims, ToFloat16(beta_data)); + if (!no_beta) { + test.AddInput("beta", beta_dims, ToFloat16(beta_data)); + } else { + test.AddMissingOptionalInput(); + } test.AddAttribute("epsilon", epsilon); if (!bias_data.empty()) { test.AddInput("bias", bias_dims, ToFloat16(bias_data)); @@ -138,6 +147,42 @@ TEST(SkipLayerNormTest, SkipLayerNormBatch1_Float16) { true); } +TEST(SkipLayerNormTest, SkipLayerNormBatch1_NoBeta) { + int batch_size = 1; + int sequence_length = 2; + int hidden_size = 4; + + std::vector input_data = { + 0.8f, -0.5f, 0.0f, 1.f, + 0.5f, 0.2f, 0.3f, -0.6f}; + + std::vector skip_data = { + 0.1f, -0.2f, 0.3f, 1.0f, + 0.5f, 0.1f, 0.4f, 1.6f}; + + std::vector gamma_data = { + 0.3f, 0.2f, 4.0f, 2.2f}; + + std::vector beta_data = {}; + + std::vector output_data = { + 0.08433859348297119f, -0.27090578377246857f, -1.32897164821624756f, 3.0924152374267578f, + 0.26111652255058289f, -0.31333980560302734f, -0.69631003737449646f, 1.9148544311523438f}; + + RunTest(input_data, + skip_data, + gamma_data, + beta_data, + std::vector(), + output_data, + epsilon_, + batch_size, + sequence_length, + hidden_size, + false, + 
true); +} + TEST(SkipLayerNormTest, SkipLayerNormBatch2) { int batch_size = 2; int sequence_length = 2; diff --git a/onnxruntime/test/testdata/layernorm_no_bias.onnx b/onnxruntime/test/testdata/layernorm_no_bias.onnx new file mode 100644 index 0000000000000000000000000000000000000000..3c8b5740c247821915fa4048638c60ab4de941f6 GIT binary patch literal 422 zcmd;J6=KRQwwlSzWx>T5AtV@-no^pa>YJLF7oT5RQczl=#Dx&j65(P=tVk_(_zwe& z3mDnClz=+9Fm(#C2A3wq=PNM-DK0542#Qm2$Brrlf#O15rTJ98*%bL?MFtLTtr_MI}H{mS7;wB@J@`R9J{LC9@2q$0f6j zOAIKPoS2i!1rZctj8tOwEzJ?)@=2^rEmH8yFUr*thFi-m!K!2u=&c7qqo znLRz)QPQAL6yp-%U=$MIViRKV%g;-N(NW@1YlQf?cz_BxU=m3{U6HtSIWY-v0RZM~ Bb{_x$ literal 0 HcmV?d00001 From b5bd14fc9fddb9df41405b0661a39c026f7316db Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Fri, 5 Feb 2021 16:58:37 -0800 Subject: [PATCH 28/41] Update GPU packaging pipelines to cuda11 and fix the other build break issues (#6585) Update gpu packaging pipelines to CUDA11 In the next release we will use CUDA 11. And our CUDA 11 build suddenly became broken because recently CentOS 7 posted an update of glibc. The version of glibc was changed from 2.17-317.el7 to 2.17-322.el7_9. But the newer one isn't compatible with CUDA 11. We have to downgrade it. 
--- .../c-api-packaging-pipelines.yml | 8 +++--- .../java-api-packaging-pipelines-gpu.yml | 4 +-- .../azure-pipelines/linux-gpu-ci-pipeline.yml | 16 ++++++----- .../linux-gpu-cuda-11-pipeline.yml | 19 ++++++------- .../linux-multi-gpu-ci-pipeline.yml | 6 ++-- .../azure-pipelines/nuget/templates/gpu.yml | 14 +++++----- .../nuget/templates/test_linux.yml | 6 ++-- .../templates/py-packaging-stage.yml | 28 +++++++++---------- .../docker/Dockerfile.manylinux2014_cuda11 | 2 +- .../scripts/manylinux/install_centos.sh | 2 +- 10 files changed, 52 insertions(+), 53 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/c-api-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-packaging-pipelines.yml index 16e44b3d1f..5cea2b2a3f 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-packaging-pipelines.yml @@ -44,15 +44,15 @@ jobs: Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11 Context: tools/ci_build/github/linux/docker DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" - Repository: onnxruntimegpubuild + Repository: onnxruntimecuda11build - task: CmdLine@2 inputs: script: | mkdir -p $HOME/.onnx - docker run --gpus all -e NVIDIA_VISIBLE_DEVICES=all --rm --volume /data/onnx:/data/onnx:ro --volume $(Build.SourcesDirectory):/onnxruntime_src --volume $(Build.BinariesDirectory):/build \ - --volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimegpubuild \ + docker run --gpus all -e CC=/opt/rh/devtoolset-8/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-8/root/usr/bin/c++ -e CFLAGS="-g -O3" -e CXXFLAGS="-g -O3" -e NVIDIA_VISIBLE_DEVICES=all --rm --volume /data/onnx:/data/onnx:ro --volume $(Build.SourcesDirectory):/onnxruntime_src --volume $(Build.BinariesDirectory):/build \ + --volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD 
onnxruntimecuda11build \ /opt/python/cp37-cp37m/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release \ - --skip_submodule_sync --parallel --build_shared_lib --use_cuda --cuda_version=$(CUDA_VERSION) --cuda_home=/usr/local/cuda-$(CUDA_VERSION) --cudnn_home=/usr/local/cuda-$(CUDA_VERSION) + --skip_submodule_sync --parallel --build_shared_lib --use_cuda --cuda_version=$(CUDA_VERSION) --cuda_home=/usr/local/cuda-$(CUDA_VERSION) --cudnn_home=/usr/local/cuda-$(CUDA_VERSION) --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-8/root/usr/bin/cc workingDirectory: $(Build.SourcesDirectory) - template: templates/c-api-artifacts-package-and-publish-steps-posix.yml diff --git a/tools/ci_build/github/azure-pipelines/java-api-packaging-pipelines-gpu.yml b/tools/ci_build/github/azure-pipelines/java-api-packaging-pipelines-gpu.yml index dbcf2cfadf..9e8bce6823 100644 --- a/tools/ci_build/github/azure-pipelines/java-api-packaging-pipelines-gpu.yml +++ b/tools/ci_build/github/azure-pipelines/java-api-packaging-pipelines-gpu.yml @@ -24,7 +24,7 @@ jobs: inputs: script: | mkdir -p $HOME/.onnx - docker run --gpus all -e NVIDIA_VISIBLE_DEVICES=all --rm --volume /data/onnx:/data/onnx:ro --volume $(Build.SourcesDirectory):/onnxruntime_src --volume $(Build.BinariesDirectory):/build --volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimegpubuild /opt/python/cp37-cp37m/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_submodule_sync --parallel --build_java --build_shared_lib --use_cuda --cuda_version=10.2 --cuda_home=/usr/local/cuda-10.2 --cudnn_home=/usr/local/cuda-10.2 + docker run --gpus all -e CC=/opt/rh/devtoolset-8/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-8/root/usr/bin/c++ -e CFLAGS="-g -O3" -e CXXFLAGS="-g -O3" -e NVIDIA_VISIBLE_DEVICES=all --rm --volume /data/onnx:/data/onnx:ro --volume $(Build.SourcesDirectory):/onnxruntime_src 
--volume $(Build.BinariesDirectory):/build --volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimegpubuild /opt/python/cp37-cp37m/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_submodule_sync --parallel --build_java --build_shared_lib --use_cuda --cuda_version=10.2 --cuda_home=/usr/local/cuda-10.2 --cudnn_home=/usr/local/cuda-10.2 workingDirectory: $(Build.SourcesDirectory) - template: templates/java-api-artifacts-package-and-publish-steps-posix.yml parameters: @@ -269,5 +269,5 @@ jobs: - task: CmdLine@2 inputs: script: | - docker run --gpus all -e NVIDIA_VISIBLE_DEVICES=all --rm --volume /data/onnx:/data/onnx:ro --volume $(Build.SourcesDirectory):/onnxruntime_src --volume $(Build.BinariesDirectory):/build -e NIGHTLY_BUILD onnxruntimegpubuild /onnxruntime_src/tools/ci_build/github/linux/java_linux_final_test.sh -v $(OnnxRuntimeVersion) -r /build + docker run --gpus all -e CC=/opt/rh/devtoolset-8/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-8/root/usr/bin/c++ -e CFLAGS="-g -O3" -e CXXFLAGS="-g -O3" -e NVIDIA_VISIBLE_DEVICES=all --rm --volume /data/onnx:/data/onnx:ro --volume $(Build.SourcesDirectory):/onnxruntime_src --volume $(Build.BinariesDirectory):/build -e NIGHTLY_BUILD onnxruntimegpubuild /onnxruntime_src/tools/ci_build/github/linux/java_linux_final_test.sh -v $(OnnxRuntimeVersion) -r /build workingDirectory: $(Build.BinariesDirectory)/final-jar diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index c5b586307c..bb00301dd9 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -1,6 +1,6 @@ jobs: - job: Linux_Build - timeoutInMinutes: 120 + timeoutInMinutes: 180 workspace: clean: all pool: Linux-GPU-CUDA10 @@ -11,15 +11,16 @@ jobs: - template: 
templates/get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda10_2 + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11 Context: tools/ci_build/github/linux/docker DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" - Repository: onnxruntimegpubuild + Repository: onnxruntimecuda11build + - task: CmdLine@2 inputs: script: | mkdir -p $HOME/.onnx - docker run --gpus all --rm \ + docker run --gpus all -e CC=/opt/rh/devtoolset-8/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-8/root/usr/bin/c++ -e CFLAGS="-g -O3" -e CXXFLAGS="-g -O3" --rm \ --volume /data/onnx:/data/onnx:ro \ --volume $(Build.SourcesDirectory):/onnxruntime_src \ --volume $(Build.BinariesDirectory):/build \ @@ -28,7 +29,7 @@ jobs: -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \ -e NIGHTLY_BUILD \ -e BUILD_BUILDNUMBER \ - onnxruntimegpubuild \ + onnxruntimecuda11build \ /opt/python/cp37-cp37m/bin/python3 /onnxruntime_src/tools/ci_build/build.py \ --build_dir /build --cmake_generator Ninja \ --config Release \ @@ -36,10 +37,11 @@ jobs: --build_shared_lib \ --parallel \ --build_wheel \ - --enable_onnx_tests --use_cuda --cuda_version=10.2 --cuda_home=/usr/local/cuda-10.2 --cudnn_home=/usr/local/cuda-10.2 \ + --enable_onnx_tests --use_cuda --cuda_version=11.0 --cuda_home=/usr/local/cuda-11.0 --cudnn_home=/usr/local/cuda-11.0 \ --enable_pybind --build_java --build_nodejs \ - --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=52 PYTHON_INCLUDE_DIR=/opt/python/cp37-cp37m/include/python3.7m PYTHON_LIBRARY=/usr/lib64/librt.so + --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-8/root/usr/bin/cc PYTHON_INCLUDE_DIR=/opt/python/cp37-cp37m/include/python3.7m PYTHON_LIBRARY=/usr/lib64/librt.so CMAKE_CUDA_ARCHITECTURES=52 workingDirectory: $(Build.SourcesDirectory) + - task: PublishTestResults@2 displayName: 'Publish unit test results' inputs: diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-cuda-11-pipeline.yml 
b/tools/ci_build/github/azure-pipelines/linux-gpu-cuda-11-pipeline.yml index 3607c128c4..71ead2ae29 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-cuda-11-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-cuda-11-pipeline.yml @@ -11,16 +11,15 @@ jobs: - template: templates/get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11 + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda10_2 Context: tools/ci_build/github/linux/docker DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" - Repository: onnxruntimecuda11build - + Repository: onnxruntimegpubuild - task: CmdLine@2 inputs: script: | mkdir -p $HOME/.onnx - docker run --gpus all --rm \ + docker run --gpus all -e CC=/opt/rh/devtoolset-8/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-8/root/usr/bin/c++ -e CFLAGS="-g -O3" -e CXXFLAGS="-g -O3" --rm \ --volume /data/onnx:/data/onnx:ro \ --volume $(Build.SourcesDirectory):/onnxruntime_src \ --volume $(Build.BinariesDirectory):/build \ @@ -29,20 +28,18 @@ jobs: -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \ -e NIGHTLY_BUILD \ -e BUILD_BUILDNUMBER \ - onnxruntimecuda11build \ + onnxruntimegpubuild \ /opt/python/cp37-cp37m/bin/python3 /onnxruntime_src/tools/ci_build/build.py \ --build_dir /build --cmake_generator Ninja \ - --config Debug Release \ + --config Release \ --skip_submodule_sync \ --build_shared_lib \ --parallel \ --build_wheel \ - --use_openmp \ - --enable_onnx_tests --use_cuda --cuda_version=11.0 --cuda_home=/usr/local/cuda-11.0 --cudnn_home=/usr/local/cuda-11.0 \ + --enable_onnx_tests --use_cuda --cuda_version=10.2 --cuda_home=/usr/local/cuda-10.2 --cudnn_home=/usr/local/cuda-10.2 \ --enable_pybind --build_java --build_nodejs \ - --cmake_extra_defines PYTHON_INCLUDE_DIR=/opt/python/cp37-cp37m/include/python3.7m PYTHON_LIBRARY=/usr/lib64/librt.so CMAKE_CUDA_ARCHITECTURES=52 + --cmake_extra_defines 
CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-8/root/usr/bin/cc PYTHON_INCLUDE_DIR=/opt/python/cp37-cp37m/include/python3.7m PYTHON_LIBRARY=/usr/lib64/librt.so CMAKE_CUDA_ARCHITECTURES=52 workingDirectory: $(Build.SourcesDirectory) - - task: PublishTestResults@2 displayName: 'Publish unit test results' inputs: @@ -57,4 +54,4 @@ jobs: - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 displayName: 'Clean Agent Directories' - condition: always() + condition: always() \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/linux-multi-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-multi-gpu-ci-pipeline.yml index cf002db4d5..4cb9de15a8 100644 --- a/tools/ci_build/github/azure-pipelines/linux-multi-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-multi-gpu-ci-pipeline.yml @@ -24,7 +24,7 @@ jobs: inputs: script: | mkdir -p $HOME/.onnx - docker run --gpus all --rm \ + docker run --gpus all -e CC=/opt/rh/devtoolset-8/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-8/root/usr/bin/c++ -e CFLAGS="-g -O3" -e CXXFLAGS="-g -O3" --rm \ --volume /data/onnx:/data/onnx:ro \ --volume $(Build.SourcesDirectory):/onnxruntime_src \ --volume $(Build.BinariesDirectory):/build \ @@ -35,14 +35,14 @@ jobs: onnxruntimegpubuild \ $(PythonManylinuxDir)/bin/python3 /onnxruntime_src/tools/ci_build/build.py \ --build_dir /build --cmake_generator Ninja \ - --config Debug Release \ + --config Release \ --skip_submodule_sync \ --build_shared_lib \ --parallel \ --build_wheel \ --enable_onnx_tests --use_cuda --cuda_version=10.2 --cuda_home=/usr/local/cuda-10.2 --cudnn_home=/usr/local/cuda-10.2 \ --enable_pybind --build_java --build_nodejs --enable_multi_device_test \ - --cmake_extra_defines PYTHON_INCLUDE_DIR=$(PythonManylinuxIncludeDir) PYTHON_LIBRARY=/usr/lib64/librt.so + --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-8/root/usr/bin/cc PYTHON_INCLUDE_DIR=$(PythonManylinuxIncludeDir) 
PYTHON_LIBRARY=/usr/lib64/librt.so workingDirectory: $(Build.SourcesDirectory) - task: PublishTestResults@2 diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/gpu.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/gpu.yml index 6db69a0fc8..80f3443125 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/gpu.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/gpu.yml @@ -8,7 +8,7 @@ jobs: AgentPool : 'Win-GPU-2019' ArtifactName: 'drop-nuget' JobName: 'Windows_CI_GPU_CUDA_Dev' - BuildCommand: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --enable_onnx_tests --use_telemetry --cmake_generator "Visual Studio 16 2019" --use_cuda --cuda_version=10.2 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2" --cudnn_home="C:\local\cudnn-10.2-windows10-x64-v8.0.3.33\cuda" + BuildCommand: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --enable_onnx_tests --use_telemetry --cmake_generator "Visual Studio 16 2019" --use_cuda --cuda_version=11.0 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0" --cudnn_home="C:\local\cudnn-11.0-windows-x64-v8.0.2.39\cuda" BuildArch: 'x64' msbuildArchitecture: 'amd64' EnvSetupScript: 'setup_env_cuda.bat' @@ -17,7 +17,7 @@ jobs: DoNugetPack : 'true' DoCompliance: 'false' DoEsrp: ${{ parameters.DoEsrp }} - CudaVersion: '10.2' + CudaVersion: '11.0' OrtPackageId: 'Microsoft.ML.OnnxRuntime.Gpu' NuPackScript: | msbuild $(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj /p:Configuration=RelWithDebInfo /t:CreatePackage /p:OrtPackageId=Microsoft.ML.OnnxRuntime.Gpu @@ -40,7 +40,7 @@ jobs: DoNugetPack : 'true' DoCompliance: 'false' DoEsrp: ${{ parameters.DoEsrp }} - CudaVersion: '10.2' + CudaVersion: '11.0' OrtPackageId: 'Microsoft.ML.OnnxRuntime.DirectML' NuPackScript: | msbuild $(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj /p:Configuration=RelWithDebInfo /t:CreatePackage 
/p:OrtPackageId=Microsoft.ML.OnnxRuntime.DirectML @@ -81,16 +81,16 @@ jobs: - template: ../../templates/linux-set-variables-and-download.yml - template: ../../templates/get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda10_2 + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11 Context: tools/ci_build/github/linux/docker DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" - Repository: onnxruntimegpubuild + Repository: onnxruntimecuda11build - task: CmdLine@2 inputs: script: | mkdir -p $HOME/.onnx - docker run --gpus all -e NVIDIA_VISIBLE_DEVICES=all --rm --volume /data/onnx:/data/onnx:ro --volume $(Build.SourcesDirectory):/onnxruntime_src --volume $(Build.BinariesDirectory):/build --volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimegpubuild \ - /bin/bash -c "/opt/python/cp37-cp37m/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_submodule_sync --parallel --build_shared_lib --use_cuda --cuda_version=10.2 --cuda_home=/usr/local/cuda-10.2 --cudnn_home=/usr/local/cuda-10.2 --enable_onnx_tests && cd /build/Release && make install DESTDIR=/build/linux-x64" + docker run --gpus all -e CC=/opt/rh/devtoolset-8/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-8/root/usr/bin/c++ -e CFLAGS="-g -O3" -e CXXFLAGS="-g -O3" -e NVIDIA_VISIBLE_DEVICES=all --rm --volume /data/onnx:/data/onnx:ro --volume $(Build.SourcesDirectory):/onnxruntime_src --volume $(Build.BinariesDirectory):/build --volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecuda11build \ + /bin/bash -c "/opt/python/cp37-cp37m/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_submodule_sync --parallel --build_shared_lib --use_cuda --cuda_version=11.0 --cuda_home=/usr/local/cuda-11.0 --cudnn_home=/usr/local/cuda-11.0 
--enable_onnx_tests --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-8/root/usr/bin/cc && cd /build/Release && make install DESTDIR=/build/linux-x64" - script: | set -e -x mv $(Build.BinariesDirectory)/linux-x64/usr/local/lib64 $(Build.BinariesDirectory)/linux-x64/linux-x64 diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml index f2e6db75fe..0ca5d7f355 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml @@ -60,10 +60,10 @@ jobs: - ${{ if eq(parameters['TestGPU'], 'true') }}: - template: ../../templates/get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda10_2 + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11 Context: tools/ci_build/github/linux/docker DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" - Repository: onnxruntimegpubuild + Repository: onnxruntimecuda11build - script: | set -e -x @@ -72,7 +72,7 @@ jobs: $(Build.BinariesDirectory) \ nuget-artifact \ $(NuGetPackageVersionNumber) \ - onnxruntimegpubuild + onnxruntimecuda11build displayName: 'Run Package Test GPU (x64)' env: OnnxRuntimeBuildDirectory: $(Build.BinariesDirectory) diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index 0cd66131b6..7cf6dbf07b 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -216,16 +216,16 @@ stages: - template: get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda10_2 + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11 Context: 
tools/ci_build/github/linux/docker DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" - Repository: onnxruntimegpubuild + Repository: onnxruntimecuda11build - task: CmdLine@2 inputs: script: | mkdir -p $HOME/.onnx - docker run --gpus all --rm \ + docker run --gpus all -e CC=/opt/rh/devtoolset-8/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-8/root/usr/bin/c++ -e CFLAGS="-g -O3" -e CXXFLAGS="-g -O3" --rm \ --volume /data/onnx:/data/onnx:ro \ --volume $(Build.SourcesDirectory):/onnxruntime_src \ --volume $(Build.BinariesDirectory):/build \ @@ -233,16 +233,16 @@ stages: --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ -e NIGHTLY_BUILD \ -e BUILD_BUILDNUMBER \ - onnxruntimegpubuild \ + onnxruntimecuda11build \ $(PythonManylinuxDir)/bin/python3 /onnxruntime_src/tools/ci_build/build.py \ --build_dir /build --cmake_generator Ninja \ --config Release \ --skip_submodule_sync \ --parallel \ --build_wheel \ - --enable_onnx_tests --use_cuda --cuda_version=10.2 --cuda_home=/usr/local/cuda-10.2 --cudnn_home=/usr/local/cuda-10.2 \ + --enable_onnx_tests --use_cuda --cuda_version=11.0 --cuda_home=/usr/local/cuda-11.0 --cudnn_home=/usr/local/cuda-11.0 \ ${{ parameters.build_py_parameters }} \ - --cmake_extra_defines PYTHON_INCLUDE_DIR=$(PythonManylinuxIncludeDir) PYTHON_LIBRARY=/usr/lib64/librt.so + --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-8/root/usr/bin/cc PYTHON_INCLUDE_DIR=$(PythonManylinuxIncludeDir) PYTHON_LIBRARY=/usr/lib64/librt.so workingDirectory: $(Build.SourcesDirectory) - task: CopyFiles@2 @@ -289,19 +289,19 @@ stages: - template: get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_gpu + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11 Context: tools/ci_build/github/linux/docker DockerBuildArgs: >- --build-arg PYTHON_VERSION=$(PythonVersion) --build-arg INSTALL_DEPS_EXTRA_ARGS=-t --build-arg BUILD_UID=$(id -u) - Repository: onnxruntimegpubuild + Repository: 
onnxruntimecuda11build - task: CmdLine@2 inputs: script: | mkdir -p $HOME/.onnx - docker run --rm --gpus all \ + docker run --rm --gpus all -e CC=/opt/rh/devtoolset-8/root/usr/bin/cc -e CXX=/opt/rh/devtoolset-8/root/usr/bin/c++ -e CFLAGS="-g -O3" -e CXXFLAGS="-g -O3" \ --volume /data/onnx:/data/onnx:ro \ --volume $(Build.SourcesDirectory):/onnxruntime_src \ --volume $(Build.BinariesDirectory):/build \ @@ -310,7 +310,7 @@ stages: -e NVIDIA_VISIBLE_DEVICES=all \ -e NIGHTLY_BUILD \ -e BUILD_BUILDNUMBER \ - onnxruntimegpubuild \ + onnxruntimecuda11build \ $(PythonManylinuxDir)/bin/python3 /onnxruntime_src/tools/ci_build/build.py \ --build_dir /build \ --config Release \ @@ -319,8 +319,8 @@ stages: --build_wheel \ --enable_onnx_tests \ ${{ parameters.build_py_parameters }} \ - --cmake_extra_defines PYTHON_INCLUDE_DIR=$(PythonManylinuxIncludeDir) PYTHON_LIBRARY=/usr/lib64/librt.so \ - --use_cuda --cuda_version=10.2 --cuda_home=/usr/local/cuda-10.2 --cudnn_home=/usr/local/cuda-10.2 + --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-8/root/usr/bin/cc PYTHON_INCLUDE_DIR=$(PythonManylinuxIncludeDir) PYTHON_LIBRARY=/usr/lib64/librt.so \ + --use_cuda --cuda_version=11.0 --cuda_home=/usr/local/cuda-11.0 --cudnn_home=/usr/local/cuda-11.0 workingDirectory: $(Build.SourcesDirectory) - task: CopyFiles@2 @@ -654,7 +654,7 @@ stages: pool: 'Win-GPU-2019' timeoutInMinutes: 240 variables: - CUDA_VERSION: '10.2' + CUDA_VERSION: '11.0' buildArch: x64 EnvSetupScript: setup_env_cuda.bat GDN_CODESIGN_TARGETDIRECTORY: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\dist' @@ -716,7 +716,7 @@ stages: --parallel --use_cuda --cuda_version=$(CUDA_VERSION) --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$(CUDA_VERSION)" - --cudnn_home="C:\local\cudnn-$(CUDA_VERSION)-windows10-x64-v8.0.3.33\cuda" + --cudnn_home="C:\local\cudnn-$(CUDA_VERSION)-windows-x64-v8.0.2.39\cuda" $(TelemetryOption) workingDirectory: '$(Build.BinariesDirectory)' diff --git 
a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11 index f26e199e70..52252bb208 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11 +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_cuda11 @@ -17,7 +17,7 @@ ENV LD_LIBRARY_PATH $DEVTOOLSET_ROOTPATH/usr/lib64:$DEVTOOLSET_ROOTPATH/usr/lib: ENV PKG_CONFIG_PATH /usr/local/lib/pkgconfig COPY manylinux2014_build_scripts /manylinux2014_build_scripts -RUN bash /manylinux2014_build_scripts/build.sh 9 && rm -r manylinux2014_build_scripts +RUN bash /manylinux2014_build_scripts/build.sh 8 && rm -r manylinux2014_build_scripts && yum downgrade -y glibc-2.17-317.el7 glibc-common-2.17-317.el7 glibc-devel-2.17-317.el7 glibc-headers-2.17-317.el7 ENV SSL_CERT_FILE=/opt/_internal/certs.pem diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/install_centos.sh b/tools/ci_build/github/linux/docker/scripts/manylinux/install_centos.sh index 5bed6e5778..28ddc78cd1 100755 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/install_centos.sh +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/install_centos.sh @@ -22,6 +22,6 @@ fi yum install -y java-1.8.0-openjdk-devel #If the /opt/python folder exists, we assume this is the manylinux docker image -if [ ! -d "/opt/python/cp35-cp35m" ]; then +if [ ! -d "/opt/python/cp37-cp37m" ]; then yum install -y ccache gcc gcc-c++ python3 python3-devel python3-pip fi From ccfd90291bda29492b420037ad57237b0470798b Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Sat, 6 Feb 2021 11:33:29 +1000 Subject: [PATCH 29/41] Remove condition from ORT_RETURN_IF[_NOT] macro output. (#6563) Remove condition from ORT_RETURN_IF[_NOT] macro output as repeating the condition doesn't add much value compared to the explicit error message, and the error message includes the file and line anyway so it's easy enough to find the condition if needed. 
Update the few places where the macros were used without an explicit error message to provide an explicit error message. Saves 12.5KB in a minimal MinSizeRel build with all DNN ops, 16KB in full release build. --- .../onnxruntime/core/common/code_location.h | 4 +-- include/onnxruntime/core/common/common.h | 17 ++++------ .../passes/op_ir_creator/tensor/slice.cc | 4 +-- .../passes/op_ir_creator/tensor/split.cc | 3 +- .../core/framework/data_transfer_utils.h | 14 +++----- .../core/framework/tensorprotoutils.cc | 2 +- .../core/optimizer/computation_reduction.cc | 4 +-- onnxruntime/core/platform/posix/env.cc | 13 ++++---- onnxruntime/core/platform/windows/env.cc | 6 ++-- .../core/providers/cpu/math/matmul_helper.h | 16 ++++++---- .../object_detection/non_max_suppression.cc | 6 ++-- .../core/providers/cpu/tensor/concat.cc | 2 +- .../object_detection/non_max_suppression.cc | 6 ++-- .../core/providers/cuda/tensor/gather_nd.cc | 24 +++++++------- .../providers/dnnl/subgraph/dnnl_kernel.h | 12 ++++--- .../nnapi_builtin/builders/op_builder.cc | 4 +-- .../nuphar/compiler/traverse_shape_infer.cc | 3 +- .../x86/op_ir_creator/tensor/slice.cc | 2 +- .../compiler/x86/op_ir_creator/tensor/tile.cc | 5 +-- onnxruntime/core/session/inference_session.cc | 2 +- .../test/contrib_ops/layer_norm_test.cc | 3 +- .../test/optimizer/initializer_test.cc | 11 ++++--- .../core/framework/checkpointing.cc | 6 ++-- .../framework/protobuf_message_sequence.h | 15 +++++---- .../graph/adasum_optimizer_graph_builder.cc | 2 +- .../allreduce_optimizer_graph_builder.cc | 6 ++-- .../core/graph/mixed_precision_transformer.cc | 32 ++++++++++--------- .../graph/optimizer/adam_optimizer_builder.cc | 12 +++---- .../graph/optimizer/lamb_optimizer_builder.cc | 8 ++--- .../core/graph/optimizer_builder.cc | 4 +-- .../core/graph/optimizer_graph_builder.cc | 5 +-- .../core/graph/pipeline_transformer.cc | 4 +-- .../graph/zero_optimizer_graph_builder.cc | 4 +-- .../core/session/training_session.cc | 14 ++++---- 
.../orttraining/models/runner/data_loader.cc | 6 ++-- .../orttraining/models/runner/data_loader.h | 2 +- .../models/runner/training_runner.cc | 6 ++-- .../models/runner/training_util.cc | 5 +-- .../test/model/data_loader_test.cc | 2 +- .../test/training_ops/cuda/layer_norm_test.cc | 3 +- .../training_ops/cpu/tensor/gather_nd_grad.cc | 2 +- .../cuda/tensor/gather_nd_grad.cc | 6 ++-- 42 files changed, 156 insertions(+), 151 deletions(-) diff --git a/include/onnxruntime/core/common/code_location.h b/include/onnxruntime/core/common/code_location.h index ff6506c9a7..2fdb2d3a41 100644 --- a/include/onnxruntime/core/common/code_location.h +++ b/include/onnxruntime/core/common/code_location.h @@ -19,7 +19,7 @@ struct CodeLocation { */ CodeLocation(const char* file_path, const int line, const char* func) : file_and_path{file_path}, line_num{line}, function{func} { - } + } /** @param file_path Usually the value of __FILE__ @@ -29,7 +29,7 @@ struct CodeLocation { */ CodeLocation(const char* file_path, const int line, const char* func, const std::vector& stacktrace) : file_and_path{file_path}, line_num{line}, function{func}, stacktrace(stacktrace) { - } + } std::string FileNoPath() const { // assuming we always have work to do, so not trying to avoid creating a new string if diff --git a/include/onnxruntime/core/common/common.h b/include/onnxruntime/core/common/common.h index 2d9dc34e34..6394c8f387 100644 --- a/include/onnxruntime/core/common/common.h +++ b/include/onnxruntime/core/common/common.h @@ -193,19 +193,16 @@ void LogRuntimeError(uint32_t session_id, const common::Status& status, const ch ::onnxruntime::MakeString(__VA_ARGS__)) // Check condition. if met, return status. -#define ORT_RETURN_IF(condition, ...) \ - if (condition) { \ - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, \ - "Satisfied, but should not be: " #condition "\n", \ - ORT_WHERE.ToString(), ::onnxruntime::MakeString(__VA_ARGS__)); \ +#define ORT_RETURN_IF(condition, ...) 
\ + if (condition) { \ + return ::onnxruntime::common::Status(::onnxruntime::common::ONNXRUNTIME, \ + ::onnxruntime::common::FAIL, \ + ::onnxruntime::MakeString(ORT_WHERE.ToString(), " ", __VA_ARGS__)); \ } // Check condition. if not met, return status. -#define ORT_RETURN_IF_NOT(condition, ...) \ - if (!(condition)) { \ - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Not satisfied: " #condition "\n", \ - ORT_WHERE.ToString(), ::onnxruntime::MakeString(__VA_ARGS__)); \ - } +#define ORT_RETURN_IF_NOT(condition, ...) \ + ORT_RETURN_IF(!(condition), __VA_ARGS__) // Macros to disable the copy and/or move ctor and assignment methods // These are usually placed in the private: declarations for a class. diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/slice.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/slice.cc index 2d9cac14fa..6a016580c4 100644 --- a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/slice.cc +++ b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/slice.cc @@ -20,7 +20,7 @@ Status SliceCommon(const tvm::Array& inputs, const std::vector& ends, const std::vector& axes1, const std::vector& steps1) { - ORT_RETURN_IF_NOT(nullptr != node.InputDefs()[0]); + ORT_RETURN_IF_NOT(nullptr != node.InputDefs()[0], "nullptr == node.InputDefs()[0]"); std::vector axes; if (axes1.size() == 0) { @@ -60,7 +60,7 @@ Status GENERIC_OP_IR_CREATOR_CLASS(Slice)::Evaluate( std::vector starts, ends, steps; ORT_RETURN_IF_ERROR(info.GetAttrs("starts", starts)); ORT_RETURN_IF_ERROR(info.GetAttrs("ends", ends)); - ORT_RETURN_IF_NOT(starts.size() == ends.size()); + ORT_RETURN_IF_NOT(starts.size() == ends.size(), "starts.size() != ends.size()"); auto axes = info.GetAttrsOrDefault("axes"); diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/split.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/split.cc index 7a190b5617..ec52d98b5b 100644 --- a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/split.cc +++ 
b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/split.cc @@ -33,7 +33,8 @@ Status GENERIC_OP_IR_CREATOR_CLASS(Split)::Evaluate( // check split sizes for (size_t i = 0; i < node.OutputDefs().size(); ++i) { - ORT_RETURN_IF_NOT(split_sizes[i] == ShapeValue(node.OutputDefs()[i], gsl::narrow(axis))); + ORT_RETURN_IF_NOT(split_sizes[i] == ShapeValue(node.OutputDefs()[i], gsl::narrow(axis)), + "split_sizes[i] != ShapeValue(node.OutputDefs()[i], axis)"); } } else { diff --git a/onnxruntime/core/framework/data_transfer_utils.h b/onnxruntime/core/framework/data_transfer_utils.h index 4f08481d61..7b19e8318b 100644 --- a/onnxruntime/core/framework/data_transfer_utils.h +++ b/onnxruntime/core/framework/data_transfer_utils.h @@ -25,10 +25,8 @@ inline Status CopyTensorDataToByteSpan( const DataTransferManager& data_transfer_manager, const Tensor& src_tensor, const OrtMemoryInfo& dst_alloc_info, gsl::span dst_span) { - ORT_RETURN_IF_NOT( - src_tensor.SizeInBytes() == static_cast(dst_span.size_bytes())); - Tensor dst_tensor{ - src_tensor.DataType(), src_tensor.Shape(), dst_span.data(), dst_alloc_info}; + ORT_RETURN_IF_NOT(src_tensor.SizeInBytes() == static_cast(dst_span.size_bytes()), "src size != dst size"); + Tensor dst_tensor{src_tensor.DataType(), src_tensor.Shape(), dst_span.data(), dst_alloc_info}; ORT_RETURN_IF_ERROR(data_transfer_manager.CopyTensor(src_tensor, dst_tensor)); return Status::OK(); } @@ -51,11 +49,9 @@ common::Status CopyTensorDataToSpan( #if !defined(__GNUC__) || __GNUC__ >= 5 static_assert(std::is_trivially_copyable::value, "Element type must be trivially copyable."); #endif - ORT_RETURN_IF_NOT(src_tensor.DataType() == DataTypeImpl::GetType()); - ORT_RETURN_IF_NOT( - src_tensor.SizeInBytes() == static_cast(dst_span.size_bytes())); - Tensor dst_tensor{ - src_tensor.DataType(), src_tensor.Shape(), dst_span.data(), dst_alloc_info}; + ORT_RETURN_IF_NOT(src_tensor.DataType() == DataTypeImpl::GetType(), "Data type mismatch"); + 
ORT_RETURN_IF_NOT(src_tensor.SizeInBytes() == static_cast(dst_span.size_bytes()), "src size != dst size"); + Tensor dst_tensor{src_tensor.DataType(), src_tensor.Shape(), dst_span.data(), dst_alloc_info}; ORT_RETURN_IF_ERROR(data_transfer_manager.CopyTensor(src_tensor, dst_tensor)); return Status::OK(); } diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index 8d396aa071..e9a39e5f74 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -187,7 +187,7 @@ namespace utils { Status UnpackTensorWithExternalData(const ONNX_NAMESPACE::TensorProto& tensor, \ const ORTCHAR_T* tensor_proto_dir, size_t expected_size, \ /*out*/ T* p_data) { \ - ORT_RETURN_IF(nullptr == p_data); \ + ORT_RETURN_IF(nullptr == p_data, "nullptr == p_data"); \ \ std::unique_ptr unpacked_tensor; \ SafeInt tensor_byte_size = 0; \ diff --git a/onnxruntime/core/optimizer/computation_reduction.cc b/onnxruntime/core/optimizer/computation_reduction.cc index f661af1e11..946e3419b2 100644 --- a/onnxruntime/core/optimizer/computation_reduction.cc +++ b/onnxruntime/core/optimizer/computation_reduction.cc @@ -176,7 +176,7 @@ static Status SimpleHandler(Graph& graph, Node& gathernd_node, Node& target_node */ static Status BinaryElementwiseHandler(Graph& graph, Node& gathernd_node, Node& target_node) { int target_node_input_index = GetValidInputForGatherND(target_node); - ORT_RETURN_IF_NOT(target_node_input_index != -1); + ORT_RETURN_IF(target_node_input_index == -1, "Invalid target node index"); return SwapGatherNDWithTargetNode(graph, gathernd_node, target_node, target_node_input_index); } @@ -202,7 +202,7 @@ static Status BinaryElementwiseHandler(Graph& graph, Node& gathernd_node, Node& */ static Status MatMulHandler(Graph& graph, Node& gathernd_node, Node& target_node) { int target_node_input_index = GetValidInputForGatherND(target_node); - ORT_RETURN_IF_NOT(target_node_input_index == 0); + 
ORT_RETURN_IF_NOT(target_node_input_index == 0, "target_node_input_index != 0"); return SwapGatherNDWithTargetNode(graph, gathernd_node, target_node, target_node_input_index); } diff --git a/onnxruntime/core/platform/posix/env.cc b/onnxruntime/core/platform/posix/env.cc index 399a0e3b8c..424cf1b5f5 100644 --- a/onnxruntime/core/platform/posix/env.cc +++ b/onnxruntime/core/platform/posix/env.cc @@ -255,9 +255,9 @@ class PosixEnv : public Env { Status ReadFileIntoBuffer(const ORTCHAR_T* file_path, FileOffsetType offset, size_t length, gsl::span buffer) const override { - ORT_RETURN_IF_NOT(file_path); - ORT_RETURN_IF_NOT(offset >= 0); - ORT_RETURN_IF_NOT(length <= buffer.size()); + ORT_RETURN_IF_NOT(file_path, "file_path == nullptr"); + ORT_RETURN_IF_NOT(offset >= 0, "offset < 0"); + ORT_RETURN_IF_NOT(length <= buffer.size(), "length > buffer.size()"); ScopedFileDescriptor file_descriptor{open(file_path, O_RDONLY)}; if (!file_descriptor.IsValid()) { @@ -300,8 +300,8 @@ class PosixEnv : public Env { Status MapFileIntoMemory(const ORTCHAR_T* file_path, FileOffsetType offset, size_t length, MappedMemoryPtr& mapped_memory) const override { - ORT_RETURN_IF_NOT(file_path); - ORT_RETURN_IF_NOT(offset >= 0); + ORT_RETURN_IF_NOT(file_path, "file_path == nullptr"); + ORT_RETURN_IF_NOT(offset >= 0, "offset < 0"); ScopedFileDescriptor file_descriptor{open(file_path, O_RDONLY)}; if (!file_descriptor.IsValid()) { @@ -377,8 +377,7 @@ class PosixEnv : public Env { common::Status DeleteFolder(const PathString& path) const override { const auto result = nftw( path.c_str(), &nftw_remove, 32, FTW_DEPTH | FTW_PHYS); - ORT_RETURN_IF_NOT( - result == 0, "DeleteFolder(): nftw() failed with error: ", result); + ORT_RETURN_IF_NOT(result == 0, "DeleteFolder(): nftw() failed with error: ", result); return Status::OK(); } diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc index 4b2f701bb1..ef9d8920c1 100644 --- 
a/onnxruntime/core/platform/windows/env.cc +++ b/onnxruntime/core/platform/windows/env.cc @@ -231,9 +231,9 @@ class WindowsEnv : public Env { Status ReadFileIntoBuffer(_In_z_ const ORTCHAR_T* const file_path, const FileOffsetType offset, const size_t length, const gsl::span buffer) const override { - ORT_RETURN_IF_NOT(file_path); - ORT_RETURN_IF_NOT(offset >= 0); - ORT_RETURN_IF_NOT(length <= buffer.size()); + ORT_RETURN_IF_NOT(file_path, "file_path == nullptr"); + ORT_RETURN_IF_NOT(offset >= 0, "offset < 0"); + ORT_RETURN_IF_NOT(length <= buffer.size(), "length > buffer.size()"); #if WINVER >= _WIN32_WINNT_WIN8 wil::unique_hfile file_handle{ CreateFile2(file_path, GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, NULL)}; diff --git a/onnxruntime/core/providers/cpu/math/matmul_helper.h b/onnxruntime/core/providers/cpu/math/matmul_helper.h index e711db1cb7..d4d1b8f8bb 100644 --- a/onnxruntime/core/providers/cpu/math/matmul_helper.h +++ b/onnxruntime/core/providers/cpu/math/matmul_helper.h @@ -32,7 +32,7 @@ class MatMulComputeHelper { size_t left_num_dims = left_shape.NumDimensions(); size_t right_num_dims = right_shape.NumDimensions(); - ORT_RETURN_IF_NOT(left_num_dims >= 1 && right_num_dims >= 1); + ORT_RETURN_IF_NOT(left_num_dims >= 1 && right_num_dims >= 1, "left_num_dims and right_num_dims must be >= 1"); // Special cases below for right_shape being 2D and left_shape > 2D by flattening left_shape to 2D // Note that padding 1s in front of the right_shape can be flattened too @@ -115,23 +115,25 @@ class MatMulComputeHelper { } if (!has_1D_input) { - ORT_RETURN_IF_NOT(K_ == right_shape[transb ? right_num_dims - 1 : right_num_dims - 2], "MatMul dimension mismatch"); + ORT_RETURN_IF_NOT(K_ == right_shape[transb ? 
right_num_dims - 1 : right_num_dims - 2], + "MatMul dimension mismatch"); // left (...M x K), right (...K x N), output (...M x N) - ORT_RETURN_IF_NOT(num_dims_with_pad == num_output_dims); + ORT_RETURN_IF_NOT(num_dims_with_pad == num_output_dims, "num_dims_with_pad != num_output_dims"); output_dims[num_output_dims - 2] = M_; output_dims[num_output_dims - 1] = N_; } else { if (num_output_dims == 0) { // for left and right being both vector, output is scalar thus no shape - ORT_RETURN_IF_NOT(M_ == 1 && N_ == 1); + ORT_RETURN_IF_NOT(M_ == 1 && N_ == 1, "M_ == 1 && N_ == 1 was false"); } else { if (left_num_dims == 1) { - ORT_RETURN_IF_NOT(num_dims_with_pad - 1 == num_output_dims); - ORT_RETURN_IF_NOT(K_ == right_shape[transb ? right_num_dims - 1 : right_num_dims - 2], "MatMul dimension mismatch"); + ORT_RETURN_IF_NOT(num_dims_with_pad - 1 == num_output_dims, "num_dims_with_pad - 1 != num_output_dims"); + ORT_RETURN_IF_NOT(K_ == right_shape[transb ? right_num_dims - 1 : right_num_dims - 2], + "MatMul dimension mismatch"); // left (K), right (...K,N), output (...N) output_dims[num_output_dims - 1] = N_; } else { - ORT_RETURN_IF_NOT(num_dims_with_pad - 2 == num_output_dims); + ORT_RETURN_IF_NOT(num_dims_with_pad - 2 == num_output_dims, "num_dims_with_pad - 2 != num_output_dims"); ORT_RETURN_IF_NOT(K_ == right_shape[0], "MatMul dimension mismatch"); // left(...K), right (K), output (...), already assigned } diff --git a/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc b/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc index 7fb7327063..fca1a20bc1 100644 --- a/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc +++ b/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc @@ -116,15 +116,13 @@ Status NonMaxSuppressionBase::GetThresholdsFromInputs(const PrepareContext& pc, Status NonMaxSuppression::Compute(OpKernelContext* ctx) const { PrepareContext pc; - auto ret = PrepareCompute(ctx, pc); - 
ORT_RETURN_IF_NOT(ret.IsOK(), ret.ErrorMessage()); + ORT_RETURN_IF_ERROR(PrepareCompute(ctx, pc)); int64_t max_output_boxes_per_class = 0; float iou_threshold = .0f; float score_threshold = .0f; - ret = GetThresholdsFromInputs(pc, max_output_boxes_per_class, iou_threshold, score_threshold); - ORT_RETURN_IF_NOT(ret.IsOK(), ret.ErrorMessage()); + ORT_RETURN_IF_ERROR(GetThresholdsFromInputs(pc, max_output_boxes_per_class, iou_threshold, score_threshold)); if (0 == max_output_boxes_per_class) { ctx->Output(0, {0, 3}); diff --git a/onnxruntime/core/providers/cpu/tensor/concat.cc b/onnxruntime/core/providers/cpu/tensor/concat.cc index 8fd49ffdf7..96531e71d3 100644 --- a/onnxruntime/core/providers/cpu/tensor/concat.cc +++ b/onnxruntime/core/providers/cpu/tensor/concat.cc @@ -138,7 +138,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, auto& data_n = *data_n_ptr; // Type sanity check (Make sure we are working on homogeneous types) - ORT_RETURN_IF_NOT(data_n.DataType() == p.output_tensor->DataType()); + ORT_RETURN_IF_NOT(data_n.DataType() == p.output_tensor->DataType(), "Data type mismatch"); // The input_axis_pitch is the number of elements to add to move to the next split axis in the input // Can handle stacking as well (as the "new dummy dimension" in the input is of unit value). 
diff --git a/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.cc b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.cc index b75a09b20b..5c7545399c 100644 --- a/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.cc +++ b/onnxruntime/core/providers/cuda/object_detection/non_max_suppression.cc @@ -33,15 +33,13 @@ ONNX_OPERATOR_KERNEL_EX( Status NonMaxSuppression::ComputeInternal(OpKernelContext* ctx) const { PrepareContext pc; - auto ret = PrepareCompute(ctx, pc); - ORT_RETURN_IF_NOT(ret.IsOK(), ret.ErrorMessage()); + ORT_RETURN_IF_ERROR(PrepareCompute(ctx, pc)); int64_t max_output_boxes_per_class = 0; float iou_threshold = .0f; float score_threshold = .0f; - ret = GetThresholdsFromInputs(pc, max_output_boxes_per_class, iou_threshold, score_threshold); - ORT_RETURN_IF_NOT(ret.IsOK(), ret.ErrorMessage()); + ORT_RETURN_IF_ERROR(GetThresholdsFromInputs(pc, max_output_boxes_per_class, iou_threshold, score_threshold)); if (0 == pc.num_boxes_ || 0 == max_output_boxes_per_class) { ctx->Output(0, {0, 3}); diff --git a/onnxruntime/core/providers/cuda/tensor/gather_nd.cc b/onnxruntime/core/providers/cuda/tensor/gather_nd.cc index 1fd4b3f89e..fd452d37e3 100644 --- a/onnxruntime/core/providers/cuda/tensor/gather_nd.cc +++ b/onnxruntime/core/providers/cuda/tensor/gather_nd.cc @@ -112,17 +112,19 @@ Status GatherNDBase::PrepareCompute( GatherND); #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 -#define GATHER_ND_T_TENSOR_TYPES {DataTypeImpl::GetTensorType(), \ - DataTypeImpl::GetTensorType(), \ - DataTypeImpl::GetTensorType(), \ - DataTypeImpl::GetTensorType(), \ - DataTypeImpl::GetTensorType()} +#define GATHER_ND_T_TENSOR_TYPES \ + { DataTypeImpl::GetTensorType(), \ + DataTypeImpl::GetTensorType(), \ + DataTypeImpl::GetTensorType(), \ + DataTypeImpl::GetTensorType(), \ + DataTypeImpl::GetTensorType() } #define GATHER_ND_T_DATA_TYPES float, MLFloat16, double, int64_t, BFloat16 #else -#define GATHER_ND_T_TENSOR_TYPES 
{DataTypeImpl::GetTensorType(), \ - DataTypeImpl::GetTensorType(), \ - DataTypeImpl::GetTensorType(), \ - DataTypeImpl::GetTensorType()} +#define GATHER_ND_T_TENSOR_TYPES \ + { DataTypeImpl::GetTensorType(), \ + DataTypeImpl::GetTensorType(), \ + DataTypeImpl::GetTensorType(), \ + DataTypeImpl::GetTensorType() } #define GATHER_ND_T_DATA_TYPES float, MLFloat16, double, int64_t #endif @@ -165,8 +167,8 @@ template Status GatherND::ComputeInternal(OpKernelContext* context) const { auto input_tensor = context->Input(0); auto indices_tensor = context->Input(1); - ORT_RETURN_IF_NOT(input_tensor != nullptr); - ORT_RETURN_IF_NOT(indices_tensor != nullptr); + ORT_RETURN_IF_NOT(input_tensor != nullptr, "input_tensor == nullptr"); + ORT_RETURN_IF_NOT(indices_tensor != nullptr, "indices_tensor == nullptr"); auto input_shape = input_tensor->Shape(); auto indices_shape = indices_tensor->Shape(); diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_kernel.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_kernel.h index 43ac739812..9bf0b376b6 100644 --- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_kernel.h +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_kernel.h @@ -52,7 +52,8 @@ class DnnlKernel { } Status GetIntsAttr(const Provider_AttributeProto& proto, std::vector& values) { - ORT_RETURN_IF_NOT(proto.type() == ::ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INTS); + ORT_RETURN_IF_NOT(proto.type() == ::ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INTS, + "proto.type() != AttributeProto_AttributeType_INTS"); values.reserve(proto.ints_size()); for (int i = 0; i < proto.ints_size(); i++) { values.push_back(proto.ints(i)); @@ -61,18 +62,21 @@ class DnnlKernel { } Status GetIntAttr(const Provider_AttributeProto& proto, int64_t& value) { - ORT_RETURN_IF_NOT(proto.type() == ::ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT); + ORT_RETURN_IF_NOT(proto.type() == 
::ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT, + "proto.type() != AttributeProto_AttributeType_INT"); value = proto.i(); return Status::OK(); } Status GetFloatAttr(const Provider_AttributeProto& proto, float& value) { - ORT_RETURN_IF_NOT(proto.type() == ::ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_FLOAT); + ORT_RETURN_IF_NOT(proto.type() == ::ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_FLOAT, + "proto.type() != AttributeProto_AttributeType_FLOAT"); value = proto.f(); return Status::OK(); } Status GetStringAttr(const Provider_AttributeProto& proto, std::string& value) { - ORT_RETURN_IF_NOT(proto.type() == ::ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_STRING); + ORT_RETURN_IF_NOT(proto.type() == ::ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_STRING, + "proto.type() != AttributeProto_AttributeType_STRING"); value = proto.s(); return Status::OK(); } diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc index 111d3ed68e..e1a438262c 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc @@ -125,9 +125,9 @@ Status TransposeNCHWToNHWC(ModelBuilder& model_builder, } // Convert the input from nchw to nhwc +// Caller should ensure input is currently in nchw format using ModelBuilder::IsOperandNHWC Status GetNHWCInput(ModelBuilder& model_builder, const Node& node, size_t input_index, std::string& input) { const auto& nchw_input = node.InputDefs()[input_index]->Name(); - ORT_RETURN_IF(model_builder.IsOperandNHWC(input)); if (!model_builder.GetNHWCOperand(nchw_input, input)) { input = model_builder.GetUniqueName(nchw_input + "_nchw_to_nhwc"); ORT_RETURN_IF_ERROR(TransposeNCHWToNHWC(model_builder, nchw_input, input)); @@ 
-136,9 +136,9 @@ Status GetNHWCInput(ModelBuilder& model_builder, const Node& node, size_t input_ } // Convert the input from nhwc to nchw +// Caller should ensure input is currently in nhwc format using ModelBuilder::IsOperandNHWC Status GetNCHWInput(ModelBuilder& model_builder, const Node& node, size_t input_index, std::string& input) { const auto& nhwc_input = node.InputDefs()[input_index]->Name(); - ORT_RETURN_IF_NOT(model_builder.IsOperandNHWC(input)); if (!model_builder.GetNCHWOperand(nhwc_input, input)) { input = model_builder.GetUniqueName(nhwc_input + "_nhwc_to_nchw"); ORT_RETURN_IF_ERROR(TransposeNHWCToNCHW(model_builder, nhwc_input, input)); diff --git a/onnxruntime/core/providers/nuphar/compiler/traverse_shape_infer.cc b/onnxruntime/core/providers/nuphar/compiler/traverse_shape_infer.cc index 2205ba516c..509efcc7cf 100644 --- a/onnxruntime/core/providers/nuphar/compiler/traverse_shape_infer.cc +++ b/onnxruntime/core/providers/nuphar/compiler/traverse_shape_infer.cc @@ -56,7 +56,8 @@ static Status CreateOutputs( if (shape[d] > 0) { output_shape[d] = DimExpr(shape[d]); } else { - ORT_RETURN_IF_NOT(shape_proto->dim_size() > d && utils::HasDimParam(shape_proto->dim(d))); + ORT_RETURN_IF_NOT(shape_proto->dim_size() > d && utils::HasDimParam(shape_proto->dim(d)), + "shape_proto->dim_size() > d && utils::HasDimParam(shape_proto->dim(d) was false"); output_shape[d] = DimExpr(shape_proto->dim(d).dim_param()); } } diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/tensor/slice.cc b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/tensor/slice.cc index 2d618cd282..6d3a32eb7d 100644 --- a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/tensor/slice.cc +++ b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/tensor/slice.cc @@ -40,7 +40,7 @@ Status NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(Slice)::Evaluate( std::vector starts, ends, axes, steps; ORT_RETURN_IF_ERROR(info.GetAttrs("starts", starts)); 
ORT_RETURN_IF_ERROR(info.GetAttrs("ends", ends)); - ORT_RETURN_IF_NOT(starts.size() == ends.size()); + ORT_RETURN_IF_NOT(starts.size() == ends.size(), "starts.size() != ends.size()"); axes = info.GetAttrsOrDefault("axes"); slice_params.push_back(starts); slice_params.push_back(ends); diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/tensor/tile.cc b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/tensor/tile.cc index 2841206eea..e96fb06f87 100644 --- a/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/tensor/tile.cc +++ b/onnxruntime/core/providers/nuphar/compiler/x86/op_ir_creator/tensor/tile.cc @@ -21,8 +21,9 @@ Status NUPHAR_TVM_X86_OP_IR_CREATOR_CLASS(Tile)::Evaluate( OpNodeProtoHelper info(&ctx); NupharCodeGenCtx* ctx_nuphar = Promote(&ctx_codegen); const auto* repeats = ctx_nuphar->GetOrtInitializerTensor(node.InputDefs()[1]->Name()); - ORT_RETURN_IF_NOT(repeats != nullptr); - ORT_RETURN_IF_NOT(repeats->Shape().Size() == gsl::narrow(inputs[0]->shape.size())); + ORT_RETURN_IF_NOT(repeats != nullptr, "repeats == nullptr"); + ORT_RETURN_IF_NOT(repeats->Shape().Size() == gsl::narrow(inputs[0]->shape.size()), + "repeats->Shape().Size() != inputs[0]->shape.size()"); const int64_t* repeats_data = repeats->Data(); const auto repeats_vector = std::vector(repeats_data, repeats_data + inputs[0]->shape.size()); tvm::Tensor Y = tvm_codegen::Tile(inputs[0], repeats_vector, node.Name() + "_Tile"); diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 65316cc686..3e98121fd8 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -1009,7 +1009,7 @@ Status InferenceSession::LoadOrtModel(std::function load_ort_format_mo // Verify the ort_format_model_bytes_ is a valid InferenceSessionBuffer before we access the data flatbuffers::Verifier verifier(ort_format_model_bytes_.data(), ort_format_model_bytes_.size()); - 
ORT_RETURN_IF_NOT(fbs::VerifyInferenceSessionBuffer(verifier)); + ORT_RETURN_IF_NOT(fbs::VerifyInferenceSessionBuffer(verifier), "ORT model verification failed."); const auto* fbs_session = fbs::GetInferenceSession(ort_format_model_bytes_.data()); ORT_RETURN_IF(nullptr == fbs_session, "InferenceSession is null. Invalid ORT format model."); diff --git a/onnxruntime/test/contrib_ops/layer_norm_test.cc b/onnxruntime/test/contrib_ops/layer_norm_test.cc index abe39ac4ff..1b3ab0d091 100644 --- a/onnxruntime/test/contrib_ops/layer_norm_test.cc +++ b/onnxruntime/test/contrib_ops/layer_norm_test.cc @@ -18,7 +18,8 @@ static Status SplitDims( const std::vector& dims, int64_t axis, std::vector& n_dims, std::vector& m_dims) { if (axis < 0) axis += dims.size(); - ORT_RETURN_IF_NOT(0 <= axis && static_cast(axis) <= dims.size()); + ORT_RETURN_IF_NOT(0 <= axis && static_cast(axis) <= dims.size(), + "0 <= axis && axis <= dims.size() was false"); const auto boundary = dims.begin() + axis; n_dims.assign(dims.begin(), boundary); m_dims.assign(boundary, dims.end()); diff --git a/onnxruntime/test/optimizer/initializer_test.cc b/onnxruntime/test/optimizer/initializer_test.cc index cf76328f75..3e7c4eb9de 100644 --- a/onnxruntime/test/optimizer/initializer_test.cc +++ b/onnxruntime/test/optimizer/initializer_test.cc @@ -24,7 +24,8 @@ Status WriteExternalDataFile(gsl::span data, const PathString& path, Sc std::vector data_bytes(data.size_bytes()); ORT_RETURN_IF_ERROR(onnxruntime::utils::WriteLittleEndian(data, gsl::make_span(data_bytes))); std::ofstream out{path, std::ios::binary | std::ios::trunc}; - ORT_RETURN_IF_NOT(out && out.write(data_bytes.data(), data_bytes.size())); + ORT_RETURN_IF_NOT(out && out.write(data_bytes.data(), data_bytes.size()), + "out && out.write(data_bytes.data(), data_bytes.size()) was false"); file_deleter = ScopedFileDeleter{path}; return Status::OK(); } @@ -44,10 +45,10 @@ void SetTensorProtoExternalData( TEST(OptimizerInitializerTest, LoadExternalData) { const 
std::vector tensor_data = []() { - std::vector tensor_data(100); - std::iota(tensor_data.begin(), tensor_data.end(), 0); - return tensor_data; - }(); + std::vector tensor_data(100); + std::iota(tensor_data.begin(), tensor_data.end(), 0); + return tensor_data; + }(); const gsl::span tensor_data_span = gsl::make_span(tensor_data); const auto tensor_data_dir_path = Path::Parse(ToPathString(".")); const auto tensor_data_dir_relative_path = Path::Parse(ToPathString("OptimizerInitializerTest_LoadExternalData.bin")); diff --git a/orttraining/orttraining/core/framework/checkpointing.cc b/orttraining/orttraining/core/framework/checkpointing.cc index 4dfd521e91..b7886bb89f 100644 --- a/orttraining/orttraining/core/framework/checkpointing.cc +++ b/orttraining/orttraining/core/framework/checkpointing.cc @@ -49,7 +49,7 @@ Status SaveRuntimeTensor( const PathString& relative_data_path, std::ofstream& data_file, ONNX_NAMESPACE::TensorProto& tensor_proto) { - ORT_RETURN_IF(tensor.DataType() == DataTypeImpl::GetType()); + ORT_RETURN_IF(tensor.DataType() == DataTypeImpl::GetType(), "tensor.DataType() is std::string"); VLOGS_DEFAULT(1) << "Saving tensor " << tensor_name; @@ -143,7 +143,7 @@ Status SaveRuntimeTensors( for (const auto& tensor_name : ordered_tensor_names) { const OrtValue& ort_value = ort_values.at(tensor_name); - ORT_RETURN_IF_NOT(ort_value.IsTensor()); + ORT_RETURN_IF_NOT(ort_value.IsTensor(), "ort_value.IsTensor() was false"); const Tensor& tensor = ort_value.Get(); tensor_data_buffer.resize(tensor.SizeInBytes()); @@ -242,7 +242,7 @@ Status UpdateTensorsExternalDataLocations( auto location_it = std::find_if( external_data.begin(), external_data.end(), [](ONNX_NAMESPACE::StringStringEntryProto& kvp) { return kvp.key() == "location"; }); - ORT_RETURN_IF_NOT(location_it != external_data.end()); + ORT_RETURN_IF_NOT(location_it != external_data.end(), "location_it == external_data.end()"); // TODO is the encoding correct? 
https://github.com/onnx/onnx/issues/2392 location_it->set_value(ToMBString(external_data_path)); diff --git a/orttraining/orttraining/core/framework/protobuf_message_sequence.h b/orttraining/orttraining/core/framework/protobuf_message_sequence.h index 098f524fea..4f0c641f6b 100644 --- a/orttraining/orttraining/core/framework/protobuf_message_sequence.h +++ b/orttraining/orttraining/core/framework/protobuf_message_sequence.h @@ -48,17 +48,17 @@ Status WriteProtoMessageSequence( // message count const auto message_count = messages.size(); - ORT_RETURN_IF_NOT(message_count <= k_max_size); + ORT_RETURN_IF_NOT(message_count <= k_max_size, "message_count > k_max_size"); coded_output.WriteVarint32(static_cast(message_count)); for (const auto& message : messages) { // message size const auto message_size = message.ByteSizeLong(); - ORT_RETURN_IF_NOT(message_size <= k_max_size); + ORT_RETURN_IF_NOT(message_size <= k_max_size, "message_size > k_max_size"); coded_output.WriteVarint32(static_cast(message_size)); // message bytes - ORT_RETURN_IF_NOT(message.SerializeToCodedStream(&coded_output)); + ORT_RETURN_IF_NOT(message.SerializeToCodedStream(&coded_output), "message.SerializeToCodedStream failed"); } return Status::OK(); @@ -86,18 +86,19 @@ Status ReadProtoMessageSequence( // message count int message_count; - ORT_RETURN_IF_NOT(coded_input.ReadVarintSizeAsInt(&message_count)); + ORT_RETURN_IF_NOT(coded_input.ReadVarintSizeAsInt(&message_count), "coded_input.ReadVarintSizeAsInt failed"); std::vector result(message_count); for (auto& message : result) { // message size int message_size; - ORT_RETURN_IF_NOT(coded_input.ReadVarintSizeAsInt(&message_size)); + ORT_RETURN_IF_NOT(coded_input.ReadVarintSizeAsInt(&message_size), "coded_input.ReadVarintSizeAsInt failed"); // message bytes const auto message_limit = coded_input.PushLimit(message_size); - ORT_RETURN_IF_NOT(message.ParseFromCodedStream(&coded_input)); -
ORT_RETURN_IF_NOT(coded_input.CheckEntireMessageConsumedAndPopLimit(message_limit)); + ORT_RETURN_IF_NOT(message.ParseFromCodedStream(&coded_input), "message.ParseFromCodedStream failed"); + ORT_RETURN_IF_NOT(coded_input.CheckEntireMessageConsumedAndPopLimit(message_limit), + "coded_input.CheckEntireMessageConsumedAndPopLimit failed"); } messages = std::move(result); diff --git a/orttraining/orttraining/core/graph/adasum_optimizer_graph_builder.cc b/orttraining/orttraining/core/graph/adasum_optimizer_graph_builder.cc index 05aad1d166..27a2e77021 100644 --- a/orttraining/orttraining/core/graph/adasum_optimizer_graph_builder.cc +++ b/orttraining/orttraining/core/graph/adasum_optimizer_graph_builder.cc @@ -180,7 +180,7 @@ Status AdasumOptimizerGraphBuilder::BuildInternal( // add gradient scaling ArgDef fused_gradient_argdef; const auto total_num_accumulations = opt_graph_config_.gradient_accumulation_steps; - ORT_RETURN_IF_NOT(total_num_accumulations > 0); + ORT_RETURN_IF_NOT(total_num_accumulations > 0, "total_num_accumulations <= 0"); auto scale_divisor = total_num_accumulations; //If Adasum GPU hierarchical reduce is used, then divide gradients by local size. 
diff --git a/orttraining/orttraining/core/graph/allreduce_optimizer_graph_builder.cc b/orttraining/orttraining/core/graph/allreduce_optimizer_graph_builder.cc index f73e55099f..e5bd5233f5 100644 --- a/orttraining/orttraining/core/graph/allreduce_optimizer_graph_builder.cc +++ b/orttraining/orttraining/core/graph/allreduce_optimizer_graph_builder.cc @@ -34,7 +34,7 @@ static Status AddNcclAllReduceForGradients( input_gradient_argdef, allreduce_outputs, {ONNX_NAMESPACE::MakeAttribute("group_type", - static_cast(WorkerGroupType::DataParallel))}, + static_cast(WorkerGroupType::DataParallel))}, "NcclAllReduce")}); gradient_argdefs = allreduce_outputs; @@ -56,7 +56,7 @@ AllreduceOptimizerGraphBuilder::AllreduceOptimizerGraphBuilder( "Allreduce optimizer graph builder can only be used for distributed training."); if (opt_graph_config.use_nccl) { ORT_ENFORCE(IsNcclAvailable(), "Distributed training with NCCL is not supported, as NCCL is not enabled in this build."); - } else if(!opt_graph_config.use_nccl && opt_graph_config.adasum_reduction_type == AdasumReductionType::None){ + } else if (!opt_graph_config.use_nccl && opt_graph_config.adasum_reduction_type == AdasumReductionType::None) { ORT_THROW("Performing Allreduce is only supported using NCCL."); } } @@ -78,7 +78,7 @@ Status AllreduceOptimizerGraphBuilder::BuildInternal( std::vector output_gradient_argdef; const auto total_num_accumulations = opt_graph_config_.gradient_accumulation_steps * opt_graph_config_.data_parallel_group_size; - ORT_RETURN_IF_NOT(total_num_accumulations > 0); + ORT_RETURN_IF_NOT(total_num_accumulations > 0, "total_num_accumulations <= 0"); const float scale = 1.0f / total_num_accumulations; ORT_RETURN_IF_ERROR(AddGradientScalingNodes(nodearg_name_generator, scale, gradient_argdefs, output_gradient_argdef, graph_defs, opt_graph_config_.AllReduceDataType())); diff --git a/orttraining/orttraining/core/graph/mixed_precision_transformer.cc 
b/orttraining/orttraining/core/graph/mixed_precision_transformer.cc index a41732723e..b683c61cfe 100644 --- a/orttraining/orttraining/core/graph/mixed_precision_transformer.cc +++ b/orttraining/orttraining/core/graph/mixed_precision_transformer.cc @@ -157,8 +157,9 @@ static Status CastNodeArg(onnxruntime::Graph& graph, std::string output_name = graph.GenerateNodeArgName(arg->Name()); const std::string cast_node_name = graph.GenerateNodeName("cast_" + output_name); - output_name += (elem_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT ? "_fp32" : - elem_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 ? "_fp16" : "_bf16"); + output_name += (elem_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT + ? "_fp32" + : (elem_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 ? "_fp16" : "_bf16")); NodeArg& output = graph.GetOrCreateNodeArg(output_name, &type_proto); // Create Cast node @@ -174,7 +175,8 @@ static Status CastNodeArg(onnxruntime::Graph& graph, producer_node->OutputDefs()[producer_node_arg_index] != arg) { producer_node_arg_index++; } - ORT_RETURN_IF_NOT(producer_node_arg_index != static_cast(producer_node->OutputDefs().size())); + ORT_RETURN_IF_NOT(producer_node_arg_index != static_cast(producer_node->OutputDefs().size()), + "producer_node_arg_index == producer_node->OutputDefs().size()"); } // Update consumer @@ -215,7 +217,8 @@ static Status CastNodeArg(onnxruntime::Graph& graph, } } else { // Make sure it is not one of graph outputs, otherwise, graph outputs need to be updated. - ORT_RETURN_IF_NOT(std::find(graph.GetOutputs().cbegin(), graph.GetOutputs().cend(), arg) == graph.GetOutputs().cend()); + ORT_RETURN_IF_NOT(std::find(graph.GetOutputs().cbegin(), graph.GetOutputs().cend(), arg) == graph.GetOutputs().cend(), + arg->Name(), " is a graph output"); } // Update producer @@ -372,9 +375,9 @@ Status TransformConstants(Graph& graph, ORT_RETURN_IF_ERROR( CastNodeArg(graph, stage1_fp32_node_args, - p_loss_subgraph != nullptr ? 
- p_loss_subgraph->GetFP32NodeArgs() : - std::unordered_map>(), + p_loss_subgraph != nullptr + ? p_loss_subgraph->GetFP32NodeArgs() + : std::unordered_map>(), tensor, mixed_precision_type)); } @@ -386,7 +389,7 @@ Status TransformConstants(Graph& graph, // as SparseSoftmaxCrossEntropy where FP32 precision is required. // Converts fp16/bf16 tensor --> Op --> fp16/bf16 tensor to // fp16/bf16 tensor --> Cast --> fp32 tensor --> Op --> fp32 tensor --> Cast --> fp16/bf16 tensor -Status TransformStage2(Graph& graph, +Status TransformStage2(Graph& graph, ONNX_NAMESPACE::TensorProto_DataType mixed_precision_type, const std::unordered_map>& loss_subgraph_fp32_node_args = {}) { // This pass does not require topological sort order: okay to visit nodes in any order. @@ -546,7 +549,7 @@ Status TransformGraphForMixedPrecision(Graph& graph, bool layernorm_stash_as_fp32) { //Only fp16 and bfloat16 supported for now. ORT_ENFORCE(mixed_precision_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 || - mixed_precision_type == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16); + mixed_precision_type == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16); // Stag 0: Initialize loss subgraph. LossSubgraph loss_subgraph(graph); @@ -613,15 +616,14 @@ Status TransformGraphForMixedPrecision(Graph& graph, for (const auto& kv : mixed_precision_initializers) { const ONNX_NAMESPACE::TensorProto* tensor_proto = kv.second; Initializer initializer(*tensor_proto, graph.ModelPath()); - ONNX_NAMESPACE::TensorProto weight_tensor_proto = mixed_precision_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 ? - initializer.ToFP16(kv.first) : initializer.ToBFloat16(kv.first); + ONNX_NAMESPACE::TensorProto weight_tensor_proto = mixed_precision_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 ? 
initializer.ToFP16(kv.first) : initializer.ToBFloat16(kv.first); graph.AddInitializedTensor(weight_tensor_proto); } //set layernorm stash type - for (auto& node : graph.Nodes()){ - if (!node.OpType().compare("LayerNormalization")){ - node.AddAttribute("stash_type", static_cast(layernorm_stash_as_fp32 ? ONNX_NAMESPACE::TensorProto_DataType_FLOAT : mixed_precision_type)); + for (auto& node : graph.Nodes()) { + if (!node.OpType().compare("LayerNormalization")) { + node.AddAttribute("stash_type", static_cast(layernorm_stash_as_fp32 ? ONNX_NAMESPACE::TensorProto_DataType_FLOAT : mixed_precision_type)); } } @@ -633,7 +635,7 @@ Status TransformGraphForMixedPrecision(Graph& graph, auto& attributes = node.GetMutableAttributes(); auto* element_type = &(attributes.find("element_types")->second); int ints_size = element_type->ints_size(); - for(int i = 0; i < ints_size; ++i){ + for (int i = 0; i < ints_size; ++i) { if (element_type->ints(i) == static_cast(ONNX_NAMESPACE::TensorProto_DataType_FLOAT)) { element_type->set_ints(i, static_cast(mixed_precision_type)); // Need to resolve and populate the new type through the graph. diff --git a/orttraining/orttraining/core/graph/optimizer/adam_optimizer_builder.cc b/orttraining/orttraining/core/graph/optimizer/adam_optimizer_builder.cc index abe0d8efeb..90dc30f2c1 100644 --- a/orttraining/orttraining/core/graph/optimizer/adam_optimizer_builder.cc +++ b/orttraining/orttraining/core/graph/optimizer/adam_optimizer_builder.cc @@ -71,17 +71,15 @@ Status AdamOptimizerBuilder::Build( // Get shape of weight tensor. 
std::vector weight_dims; - ORT_RETURN_IF_NOT( - weight_argdefs[i].type_proto && - weight_argdefs[i].type_proto->has_tensor_type() && - weight_argdefs[i].type_proto->tensor_type().has_shape()); + ORT_RETURN_IF_NOT(weight_argdefs[i].type_proto && + weight_argdefs[i].type_proto->has_tensor_type() && + weight_argdefs[i].type_proto->tensor_type().has_shape(), + "weight_argsdefs[", i, "] did not have tensor with shape"); for (const auto& dim : weight_argdefs[i].type_proto->tensor_type().shape().dim()) { weight_dims.push_back(dim.dim_value()); } - const auto element_type = opt_configs[i].use_mixed_precision_moments ? - ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16 : - ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT; + const auto element_type = opt_configs[i].use_mixed_precision_moments ? ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16 : ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT; // Add first- and second-order momentums to input list. 
for (const auto& moments_prefix : MOMENTS_PREFIXES) { const std::string gradient_moment_name = moments_prefix + "_" + weight_name; diff --git a/orttraining/orttraining/core/graph/optimizer/lamb_optimizer_builder.cc b/orttraining/orttraining/core/graph/optimizer/lamb_optimizer_builder.cc index cb29e411b0..609bdc0930 100644 --- a/orttraining/orttraining/core/graph/optimizer/lamb_optimizer_builder.cc +++ b/orttraining/orttraining/core/graph/optimizer/lamb_optimizer_builder.cc @@ -186,10 +186,10 @@ Status LambOptimizerBuilder::Build( const TypeProto* const weight_type_proto = weight_argdefs[i].type_proto; const TypeProto* const gradient_type_proto = gradient_argdefs[i].type_proto; std::vector weight_dims; - ORT_RETURN_IF_NOT( - weight_argdefs[i].type_proto && - weight_argdefs[i].type_proto->has_tensor_type() && - weight_argdefs[i].type_proto->tensor_type().has_shape()); + ORT_RETURN_IF_NOT(weight_argdefs[i].type_proto && + weight_argdefs[i].type_proto->has_tensor_type() && + weight_argdefs[i].type_proto->tensor_type().has_shape(), + "weight_argsdefs[", i, "] did not have tensor with shape"); for (const auto& dim : weight_argdefs[i].type_proto->tensor_type().shape().dim()) { weight_dims.push_back(dim.dim_value()); } diff --git a/orttraining/orttraining/core/graph/optimizer_builder.cc b/orttraining/orttraining/core/graph/optimizer_builder.cc index b0c68b786f..7861d2e90a 100644 --- a/orttraining/orttraining/core/graph/optimizer_builder.cc +++ b/orttraining/orttraining/core/graph/optimizer_builder.cc @@ -20,10 +20,10 @@ Status IsMatchingTypeAndShape( const onnxruntime::Tensor& tensor, const int32_t element_type, const std::vector& expected_shape_dims) { - ORT_RETURN_IF_NOT(tensor.GetElementType() == element_type); + ORT_RETURN_IF_NOT(tensor.GetElementType() == element_type, "Type mismatch"); const TensorShape& tensor_shape = tensor.Shape(); TensorShape expected_shape(expected_shape_dims); - ORT_RETURN_IF_NOT(tensor_shape == expected_shape, "Mismatch: expected:[", 
tensor_shape, "], actual:[", expected_shape, "]"); + ORT_RETURN_IF_NOT(tensor_shape == expected_shape, "Mismatch: expected:[", tensor_shape, "], actual:[", expected_shape, "]"); return Status::OK(); } diff --git a/orttraining/orttraining/core/graph/optimizer_graph_builder.cc b/orttraining/orttraining/core/graph/optimizer_graph_builder.cc index f3f697e3ca..bcfae2d0c4 100644 --- a/orttraining/orttraining/core/graph/optimizer_graph_builder.cc +++ b/orttraining/orttraining/core/graph/optimizer_graph_builder.cc @@ -275,8 +275,9 @@ Status OptimizerGraphBuilder::AddDirectWeightUpdate( const std::vector& opt_configs, GraphAugmenter::GraphDefs& graph_defs, std::unordered_map>& weight_to_opt_mapping) { - ORT_RETURN_IF_NOT(weight_argdefs.size() == gradient_argdefs.size()); - ORT_RETURN_IF_NOT(weight_argdefs.size() == opt_configs.size()); + ORT_RETURN_IF_NOT(weight_argdefs.size() == gradient_argdefs.size(), + "weight_argdefs.size() != gradient_argdefs.size()"); + ORT_RETURN_IF_NOT(weight_argdefs.size() == opt_configs.size(), "weight_argdefs.size() != opt_configs.size()"); std::vector output_weight_argdefs; std::vector output_gradient_argdefs; diff --git a/orttraining/orttraining/core/graph/pipeline_transformer.cc b/orttraining/orttraining/core/graph/pipeline_transformer.cc index 4f66d3f055..cc95404ce2 100644 --- a/orttraining/orttraining/core/graph/pipeline_transformer.cc +++ b/orttraining/orttraining/core/graph/pipeline_transformer.cc @@ -770,7 +770,7 @@ void SetDataDependency( Graph& graph, Node& postponed_node, // node should happen after computing dependent_args. 
const std::vector& dependent_node_args // extra data-dependency to add to "postponed_node" - ) { +) { // "postponed_node"'s original inputs + "dependent_args" std::vector pass_through_inputs; // the mirror of "postponed_node"'s original inputs + "dependent_args" @@ -1667,7 +1667,7 @@ Status VerifyAssignment(const std::vector& stages, auto cs = graph.GetConsumerNodes(arg->Name()); for (const Node* c : cs) { const int outgoing_stage = stages.at(c->Index()); - ORT_RETURN_IF_NOT(node_stage <= outgoing_stage); + ORT_RETURN_IF_NOT(node_stage <= outgoing_stage, "node_stage > outgoing_stage"); } } } diff --git a/orttraining/orttraining/core/graph/zero_optimizer_graph_builder.cc b/orttraining/orttraining/core/graph/zero_optimizer_graph_builder.cc index e7565dfc7a..48a8a55c83 100644 --- a/orttraining/orttraining/core/graph/zero_optimizer_graph_builder.cc +++ b/orttraining/orttraining/core/graph/zero_optimizer_graph_builder.cc @@ -83,7 +83,7 @@ static Status AddL2NormNcclAllReduce( {norm_squared}, {allreduce_output}, {ONNX_NAMESPACE::MakeAttribute("group_type", - static_cast(WorkerGroupType::DataParallel))}, + static_cast(WorkerGroupType::DataParallel))}, allreduce_output.name)}); // Sqrt the reduced L2 norm. 
@@ -454,7 +454,7 @@ Status ZeROOptimizerGraphBuilder::BuildInternal( // add gradient scaling ArgDef fused_gradient_argdef; const auto total_num_accumulations = opt_graph_config_.gradient_accumulation_steps * opt_graph_config_.data_parallel_group_size; - ORT_RETURN_IF_NOT(total_num_accumulations > 0); + ORT_RETURN_IF_NOT(total_num_accumulations > 0, "total_num_accumulations <= 0"); const float scale = 1.0f / total_num_accumulations; ORT_RETURN_IF_ERROR(AddGradientScalingNodes(nodearg_name_generator, scale, gradient_argdefs, fused_gradient_argdef, graph_defs, opt_graph_config_.AllReduceDataType(), false)); diff --git a/orttraining/orttraining/core/session/training_session.cc b/orttraining/orttraining/core/session/training_session.cc index d1a290b426..9fe5c1bd72 100644 --- a/orttraining/orttraining/core/session/training_session.cc +++ b/orttraining/orttraining/core/session/training_session.cc @@ -57,7 +57,7 @@ Status SetupOptimizerParams( OptimizerGraphConfig& opt_graph_config_result, std::unordered_map& opt_node_configs_result, std::unordered_map& weight_name_map_after_graph_transform) { - ORT_RETURN_IF_NOT(config.optimizer_config.has_value()); + ORT_RETURN_IF_NOT(config.optimizer_config.has_value(), "config.optimizer_config.has_value() was false"); const auto& optimizer_config = config.optimizer_config.value(); // This is the mapping from the new weight name to the original weight name @@ -354,7 +354,7 @@ Status TrainingSession::ConfigureForTraining( config.distributed_config.horizontal_parallel_size, config.distributed_config.pipeline_parallel_size}); #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE) - MemoryInfo::SetLocalRank(config.distributed_config.world_rank); + MemoryInfo::SetLocalRank(config.distributed_config.world_rank); #endif #ifdef USE_MPI @@ -798,7 +798,7 @@ Status TrainingSession::ApplyModelParallelTransformationsToMainGraph(std::unorde GraphTransformerManager graph_transformation_mgr{1}; std::vector> transformers_to_register; - // 
Creating the CPU EP here to be used to get the + // Creating the CPU EP here to be used to get the // CPU allocator for partitioning the optimizer state by column. std::unique_ptr cpu_execution_provider = onnxruntime::make_unique(CPUExecutionProviderInfo()); @@ -863,7 +863,7 @@ Status TrainingSession::ConfigureLossFunction( loss_graph_builder_ = LossFunctionBuilder::Build(loss_function_info_value.op_def.type); - ORT_RETURN_IF_NOT(loss_graph_builder_); + ORT_RETURN_IF_NOT(loss_graph_builder_, "loss_graph_builder_ == nullptr"); } try { @@ -1094,7 +1094,7 @@ common::Status TrainingSession::GetOptimizerState(std::unordered_mapsecond; opt_state_tensors.erase(it); - } + } } return Status::OK(); } @@ -1107,7 +1107,7 @@ common::Status TrainingSession::GetModelState(std::unordered_mapsecond; fp_weights.erase(it); } - } + } } model_state_tensors["full_precision"] = fp_weights; diff --git a/orttraining/orttraining/models/runner/data_loader.cc b/orttraining/orttraining/models/runner/data_loader.cc index 756541b6af..48904d0300 100644 --- a/orttraining/orttraining/models/runner/data_loader.cc +++ b/orttraining/orttraining/models/runner/data_loader.cc @@ -75,8 +75,8 @@ DataLoader::DataLoader(const MapStringToString& input_name_map, Status DataLoader::InitializeDataSetIndex(size_t initial_data_set_index) { if (initial_data_set_index == active_file_index_) return Status::OK(); - ORT_RETURN_IF_NOT(!is_preloaded_); - ORT_RETURN_IF_NOT(initial_data_set_index < NumShards()); + ORT_RETURN_IF(is_preloaded_, "is_preloaded_ was true"); + ORT_RETURN_IF_NOT(initial_data_set_index < NumShards(), "initial_data_set_index >= NumShards()"); active_file_index_ = initial_data_set_index; @@ -99,7 +99,7 @@ std::shared_ptr DataLoader::MoveToNextDataSet() { } Status DataLoader::InitialPreLoadAsync() { - ORT_RETURN_IF_NOT(!is_preloaded_); + ORT_RETURN_IF(is_preloaded_, "is_preloaded_ was true"); for (size_t i = 0; i < std::min(max_num_files_preload_, NumShards()); ++i) { const auto data_set_index = 
(active_file_index_ + i) % NumShards(); diff --git a/orttraining/orttraining/models/runner/data_loader.h b/orttraining/orttraining/models/runner/data_loader.h index d375ebc42f..5fd53d346b 100644 --- a/orttraining/orttraining/models/runner/data_loader.h +++ b/orttraining/orttraining/models/runner/data_loader.h @@ -162,7 +162,7 @@ class SingleDataLoader : public IDataLoader { : data_set_(single_data_set), input_tensor_names_(input_tensor_names) {} Status InitializeDataSetIndex(size_t initial_data_set_index) override { - ORT_RETURN_IF_NOT(initial_data_set_index == 0); + ORT_RETURN_IF_NOT(initial_data_set_index == 0, "initial_data_set_index != 0"); return Status::OK(); } diff --git a/orttraining/orttraining/models/runner/training_runner.cc b/orttraining/orttraining/models/runner/training_runner.cc index de9b1b4bbb..c3df039a9c 100644 --- a/orttraining/orttraining/models/runner/training_runner.cc +++ b/orttraining/orttraining/models/runner/training_runner.cc @@ -1288,7 +1288,7 @@ constexpr const char* k_loss_scaler_state = "loss_scaler_state"; template Status FromString(const std::string& s, T& t) { std::istringstream i{s}; - ORT_RETURN_IF_NOT(i >> t && i.eof()); + ORT_RETURN_IF_NOT(i >> t && i.eof(), "i >> t && i.eof() was false"); return Status::OK(); } } // namespace @@ -1315,7 +1315,7 @@ Status TrainingRunner::LoadCheckpointProperties( const std::unordered_map& properties) { auto load_property = [&properties](const char* name, auto& val) { auto prop_it = properties.find(name); - ORT_RETURN_IF_NOT(prop_it != properties.end()); + ORT_RETURN_IF_NOT(prop_it != properties.end(), "prop_it == properties.end()"); ORT_RETURN_IF_ERROR(FromString(prop_it->second, val)); return Status::OK(); }; @@ -1329,7 +1329,7 @@ Status TrainingRunner::LoadCheckpointProperties( if (loss_scaler_) { auto prop_it = properties.find(property_names::k_loss_scaler_state); - ORT_RETURN_IF_NOT(prop_it != properties.end()); + ORT_RETURN_IF_NOT(prop_it != properties.end(), "prop_it == 
properties.end()"); ORT_RETURN_IF_ERROR(loss_scaler_->LoadFromString(prop_it->second)); } diff --git a/orttraining/orttraining/models/runner/training_util.cc b/orttraining/orttraining/models/runner/training_util.cc index d2d122c036..66086dae43 100644 --- a/orttraining/orttraining/models/runner/training_util.cc +++ b/orttraining/orttraining/models/runner/training_util.cc @@ -76,7 +76,7 @@ size_t DataSet::TotalBatch(size_t batch_size) const { // see input_to_dimension_mapping in bert/main.cc for example, and training_utils.h for more explanation common::Status DataSet::GetTensorDimensionsFromInputs(const std::map>& input_to_dimension_mapping, MapStringToString& mapped_dimensions) const { - if (input_to_dimension_mapping.size() == 0) return Status::OK(); + if (input_to_dimension_mapping.size() == 0) return Status::OK(); for (size_t input_index = 0; input_index < NumInputs(); ++input_index) { std::string input_name = GetInputName(input_index); @@ -212,7 +212,8 @@ std::string LossScaler::SaveToString() const { Status LossScaler::LoadFromString(const std::string& input) { std::istringstream s{input}; - ORT_RETURN_IF_NOT((s >> loss_scale_ >> stable_steps_) && s.eof()); + ORT_RETURN_IF_NOT((s >> loss_scale_ >> stable_steps_) && s.eof(), + "(s >> loss_scale_ >> stable_steps_) && s.eof() was false"); return Status::OK(); } diff --git a/orttraining/orttraining/test/model/data_loader_test.cc b/orttraining/orttraining/test/model/data_loader_test.cc index 30b7876451..18f955f9ed 100644 --- a/orttraining/orttraining/test/model/data_loader_test.cc +++ b/orttraining/orttraining/test/model/data_loader_test.cc @@ -25,7 +25,7 @@ Status WriteInputDataFile( const std::vector& sample_tensor_names, const uint32_t tensor_data_value) { const uint32_t num_features = static_cast(sample_tensor_names.size()); - ORT_RETURN_IF_NOT(num_samples > 0 && num_features > 0); + ORT_RETURN_IF_NOT(num_samples > 0 && num_features > 0, "num_samples > 0 && num_features > 0 was false"); // feature tensors have 
dimension of {1} and data value of tensor_data_value std::vector> samples; diff --git a/orttraining/orttraining/test/training_ops/cuda/layer_norm_test.cc b/orttraining/orttraining/test/training_ops/cuda/layer_norm_test.cc index 4e14b20d8d..16aecafaad 100644 --- a/orttraining/orttraining/test/training_ops/cuda/layer_norm_test.cc +++ b/orttraining/orttraining/test/training_ops/cuda/layer_norm_test.cc @@ -25,7 +25,8 @@ static Status SplitDims( const std::vector& dims, int64_t axis, std::vector& n_dims, std::vector& m_dims) { if (axis < 0) axis += dims.size(); - ORT_RETURN_IF_NOT(0 <= axis && static_cast(axis) <= dims.size()); + ORT_RETURN_IF_NOT(0 <= axis && static_cast(axis) <= dims.size(), + "0 <= axis && axis <= dims.size() was false"); const auto boundary = dims.begin() + axis; n_dims.assign(dims.begin(), boundary); m_dims.assign(boundary, dims.end()); diff --git a/orttraining/orttraining/training_ops/cpu/tensor/gather_nd_grad.cc b/orttraining/orttraining/training_ops/cpu/tensor/gather_nd_grad.cc index 1df6dd20e6..a186287bfa 100644 --- a/orttraining/orttraining/training_ops/cpu/tensor/gather_nd_grad.cc +++ b/orttraining/orttraining/training_ops/cpu/tensor/gather_nd_grad.cc @@ -73,7 +73,7 @@ Status GatherNDGrad::Compute(OpKernelContext* context) const { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "indices tensor data type not supported"); } - ORT_RETURN_IF_NOT(nullptr == p.input_str_base); + ORT_RETURN_IF_NOT(nullptr == p.input_str_base, "nullptr != p.input_str_base"); utils::MLTypeCallDispatcher t_disp(update_tensor->GetElementType()); t_disp.Invoke(p, update_tensor); diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc index 90ea1bca1b..7e7acd2974 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_nd_grad.cc @@ -55,9 +55,9 @@ Status 
GatherNDGrad::ComputeInternal(OpKernelContext* context) const { auto shape_tensor = context->Input(0); auto indices_tensor = context->Input(1); auto update_tensor = context->Input(2); - ORT_RETURN_IF_NOT(shape_tensor != nullptr); - ORT_RETURN_IF_NOT(indices_tensor != nullptr); - ORT_RETURN_IF_NOT(update_tensor != nullptr); + ORT_RETURN_IF(shape_tensor == nullptr, "shape_tensor != nullptr"); + ORT_RETURN_IF(indices_tensor == nullptr, "indices_tensor != nullptr"); + ORT_RETURN_IF(update_tensor == nullptr, "update_tensor != nullptr"); auto indices_shape = indices_tensor->Shape(); auto update_shape = update_tensor->Shape(); From 115e16b37b6f5a57dd8a39432b78d6e7e2c63293 Mon Sep 17 00:00:00 2001 From: Chun-Wei Chen Date: Fri, 5 Feb 2021 17:34:08 -0800 Subject: [PATCH 30/41] ort_test_utils: skip creating input if it is an initializer (#6544) --- tools/python/ort_test_dir_utils.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tools/python/ort_test_dir_utils.py b/tools/python/ort_test_dir_utils.py index 0cde75d6ca..08fce24750 100644 --- a/tools/python/ort_test_dir_utils.py +++ b/tools/python/ort_test_dir_utils.py @@ -21,7 +21,7 @@ def _get_numpy_type(model_info, name): raise ValueError("{} was not found in the model info.".format(name)) -def _create_missing_input_data(model_inputs, name_input_map, symbolic_dim_values_map): +def _create_missing_input_data(model_inputs, name_input_map, symbolic_dim_values_map, initializer_set): """ Update name_input_map with random input for any missing values in the model inputs. 
@@ -32,7 +32,10 @@ def _create_missing_input_data(model_inputs, name_input_map, symbolic_dim_values for input in model_inputs: if input.name in name_input_map and name_input_map[input.name] is not None: continue - + # skip if the input has already exists in initializer + # models whose ir_version < 4 can have input same as initializer; no need to create input data + if input.name in initializer_set: + continue input_type = input.type.WhichOneof('value') if input_type != 'tensor_type': raise ValueError('Unsupported model. Need to handle input type of {}'.format(input_type)) @@ -126,9 +129,10 @@ def create_test_dir(model_path, root_path, test_name, if not symbolic_dim_values_map: symbolic_dim_values_map = {} - - _create_missing_input_data(model_inputs, name_input_map, symbolic_dim_values_map) - + initializer_set = set() + for initializer in onnx.load(model_path).graph.initializer: + initializer_set.add(initializer.name) + _create_missing_input_data(model_inputs, name_input_map, symbolic_dim_values_map, initializer_set) save_data("input", name_input_map, model_inputs) # save expected output data if provided. run model to create if not. From dda5a6207275b330c1571d13f27a3270dcf6e54f Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Fri, 5 Feb 2021 18:07:03 -0800 Subject: [PATCH 31/41] Fix updated Doxygen errors. (#6588) --- .../Tensors/Tensor.cs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/Tensors/Tensor.cs b/csharp/src/Microsoft.ML.OnnxRuntime/Tensors/Tensor.cs index 21613a2fab..9dc3c256ce 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/Tensors/Tensor.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/Tensors/Tensor.cs @@ -922,6 +922,12 @@ namespace Microsoft.ML.OnnxRuntime.Tensors return GetTriangle(offset, upper: true); } + /// + /// Implementation method for GetTriangle, GetLowerTriangle, GetUpperTriangle + /// + /// Offset of diagonal to set in returned tensor. 
+ /// true for upper triangular and false otherwise + /// public Tensor GetTriangle(int offset, bool upper) { if (Rank < 2) @@ -1158,8 +1164,16 @@ namespace Microsoft.ML.OnnxRuntime.Tensors } } + /// + /// Always fixed size Tensor + /// + /// always true public bool IsFixedSize => true; + /// + /// Tensor is not readonly + /// + /// always false public bool IsReadOnly => false; int IList.Add(object value) @@ -1566,6 +1580,11 @@ namespace Microsoft.ML.OnnxRuntime.Tensors #endregion + /// + /// Get a string representation of Tensor + /// + /// + /// public string GetArrayString(bool includeWhitespace = true) { var builder = new StringBuilder(); From af9dfa7a4d3c2b83bdccc57c76e3e502cbcc70ae Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Fri, 5 Feb 2021 18:09:27 -0800 Subject: [PATCH 32/41] Remove docs that have been migrated to https://onnxruntime.ai/docs (#6225) --- README.md | 8 +- .../README.md | 172 ---------- .../README.md | 169 ---------- docs/AddingCustomOp.md | 31 -- docs/AddingExecutionProvider.md | 37 --- docs/CSharp_API.md | 309 ------------------ docs/C_API.md | 77 ----- docs/ExportPyTorchCustomOps.md | 186 ----------- docs/InferenceHighLevelDesign.md | 135 -------- docs/Java_API.md | 79 ----- docs/ONNX_Runtime_Graph_Optimizations.md | 148 --------- docs/ONNX_Runtime_Perf_Tuning.md | 179 ---------- docs/PyOp.md | 136 -------- docs/WinRT_API.md | 37 --- .../ACL-ExecutionProvider.md | 21 -- .../ArmNN-ExecutionProvider.md | 22 -- .../DNNL-ExecutionProvider.md | 35 -- .../DirectML-ExecutionProvider.md | 126 ------- .../MIGraphX-ExecutionProvider.md | 35 -- docs/execution_providers/MKL-DNN-Subgraphs.md | 65 ---- .../NNAPI-ExecutionProvider.md | 21 -- .../Nuphar-ExecutionProvider.md | 170 ---------- .../OpenVINO-ExecutionProvider.md | 284 ---------------- docs/execution_providers/README.md | 64 ---- .../RKNPU-ExecutionProvider.md | 70 ---- .../TensorRT-ExecutionProvider.md | 114 ------- .../Vitis-AI-ExecutionProvider.md | 118 ------- samples/README.md 
| 113 ------- samples/c_cxx/MNIST/ReadMe.md | 66 ---- 29 files changed, 5 insertions(+), 3022 deletions(-) delete mode 100644 csharp/sample/Microsoft.ML.OnnxRuntime.FasterRcnnSample/README.md delete mode 100644 csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample/README.md delete mode 100644 docs/AddingCustomOp.md delete mode 100644 docs/AddingExecutionProvider.md delete mode 100644 docs/CSharp_API.md delete mode 100644 docs/C_API.md delete mode 100644 docs/ExportPyTorchCustomOps.md delete mode 100644 docs/InferenceHighLevelDesign.md delete mode 100644 docs/Java_API.md delete mode 100644 docs/ONNX_Runtime_Graph_Optimizations.md delete mode 100644 docs/ONNX_Runtime_Perf_Tuning.md delete mode 100644 docs/PyOp.md delete mode 100644 docs/WinRT_API.md delete mode 100644 docs/execution_providers/ACL-ExecutionProvider.md delete mode 100644 docs/execution_providers/ArmNN-ExecutionProvider.md delete mode 100644 docs/execution_providers/DNNL-ExecutionProvider.md delete mode 100644 docs/execution_providers/DirectML-ExecutionProvider.md delete mode 100644 docs/execution_providers/MIGraphX-ExecutionProvider.md delete mode 100644 docs/execution_providers/MKL-DNN-Subgraphs.md delete mode 100644 docs/execution_providers/NNAPI-ExecutionProvider.md delete mode 100644 docs/execution_providers/Nuphar-ExecutionProvider.md delete mode 100644 docs/execution_providers/OpenVINO-ExecutionProvider.md delete mode 100644 docs/execution_providers/README.md delete mode 100644 docs/execution_providers/RKNPU-ExecutionProvider.md delete mode 100644 docs/execution_providers/TensorRT-ExecutionProvider.md delete mode 100644 docs/execution_providers/Vitis-AI-ExecutionProvider.md delete mode 100644 samples/README.md delete mode 100644 samples/c_cxx/MNIST/ReadMe.md diff --git a/README.md b/README.md index 83e18e4449..ac2445a477 100644 --- a/README.md +++ b/README.md @@ -8,17 +8,19 @@ [![Build 
Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-ci-pipeline?label=Linux+CPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=86) [![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-gpu-ci-pipeline?label=Linux+GPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=84) -**ONNX Runtime** is a cross-platform **inferencing and training accelerator** compatible with many popular ML/DNN frameworks, including PyTorch, TensorFlow/Keras, scikit-learn, and more. **[onnxruntime.ai](https://onnxruntime.ai)** +**ONNX Runtime** is a cross-platform **inference and training machine-learning accelerator** compatible with deep learning frameworks, PyTorch and TensorFlow/Keras, as well as classical machine learning libraries such as scikit-learn, and more. **[aka.ms/onnxruntime](https://aka.ms/onnxruntime)** +ONNX Runtime uses the portable [ONNX](https://onnx.ai) computation graph format, backed by execution providers optimized for operating systems, drivers and hardware. Many users can benefit from ONNX Runtime, including those looking to: + * Improve inference performance for a wide variety of ML models * Reduce time and cost of training large models * Train in Python but deploy into a C#/C++/Java app * Run on different hardware and operating systems * Support models created in several different frameworks -[ONNX Runtime inferencing](./onnxruntime) APIs are stable and production-ready since the [1.0 release](https://github.com/microsoft/onnxruntime/releases/tag/v1.0.0) in October 2019 and can enable faster customer experiences and lower costs. +[ONNX Runtime inference](./onnxruntime) APIs are stable and production-ready since the [1.0 release](https://github.com/microsoft/onnxruntime/releases/tag/v1.0.0) in October 2019 and can enable faster customer experiences and lower costs. 
[ONNX Runtime training](./orttraining) feature was introduced in May 2020 in preview. This feature supports acceleration of PyTorch training on multi-node NVIDIA GPUs for transformer models. Additional updates for this feature are coming soon. @@ -40,7 +42,7 @@ Many users can benefit from ONNX Runtime, including those looking to: [Frequently Asked Questions](./docs/FAQ.md) -## Inferencing: Start +## Inference To use ONNX Runtime, refer to the table on [aka.ms/onnxruntime](https://aka.ms/onnxruntime) for instructions for different build combinations. diff --git a/csharp/sample/Microsoft.ML.OnnxRuntime.FasterRcnnSample/README.md b/csharp/sample/Microsoft.ML.OnnxRuntime.FasterRcnnSample/README.md deleted file mode 100644 index 5b042da0c6..0000000000 --- a/csharp/sample/Microsoft.ML.OnnxRuntime.FasterRcnnSample/README.md +++ /dev/null @@ -1,172 +0,0 @@ -# C# Sample: Faster R-CNN - -The sample walks through how to run a pretrained Faster R-CNN object detection ONNX model using the ONNX Runtime C# API. - -The source code for this sample is available [here](Program.cs). - -## Prerequisites - -To run this sample, you'll need the following things: - -1. Install [.NET Core 3.1](https://dotnet.microsoft.com/download/dotnet-core/3.1) or higher for you OS (Mac, Windows or Linux). -2. Download the [Faster R-CNN](https://github.com/onnx/models/blob/master/vision/object_detection_segmentation/faster-rcnn/model/FasterRCNN-10.onnx) ONNX model to your local system. -3. Download [this demo image](demo.jpg) to test the model. You can also use any image you like. - -## Getting Started - -Now we have everything set up, we can start adding code to run the model on the image. We'll do this in the main method of the program for simplicity. 
- -### Read paths - -Firstly, let's read the path to the model, path to the image we want to test, and path to the output image: - -```cs -string modelFilePath = args[0]; -string imageFilePath = args[1]; -string outImageFilePath = args[2]; -``` - -### Read image - -Next, we will read the image in using the cross-platform image library [ImageSharp](https://www.nuget.org/packages/SixLabors.ImageSharp): - -```cs -using Image image = Image.Load(imageFilePath, out IImageFormat format); -``` - -Note, we're specifically reading the `Rgb24` type so we can efficiently preprocess the image in a later step. - -### Resize image - -Next, we will resize the image to the appropriate size that the model is expecting; it is recommended to resize the image such that both height and width are within the range of [800, 1333]. - -```cs -float ratio = 800f / Math.Min(image.Width, image.Height); -using Stream imageStream = new MemoryStream(); -image.Mutate(x => x.Resize((int)(ratio * image.Width), (int)(ratio * image.Height))); -image.Save(imageStream, format); -``` - -### Preprocess image - -Next, we will preprocess the image according to the [requirements of the model](https://github.com/onnx/models/tree/master/vision/object_detection_segmentation/faster-rcnn#preprocessing-steps): - -```cs -var paddedHeight = (int)(Math.Ceiling(image.Height / 32f) * 32f); -var paddedWidth = (int)(Math.Ceiling(image.Width / 32f) * 32f); -Tensor input = new DenseTensor(new[] { 3, paddedHeight, paddedWidth }); -var mean = new[] { 102.9801f, 115.9465f, 122.7717f }; -for (int y = paddedHeight - image.Height; y < image.Height; y++) -{ - Span pixelSpan = image.GetPixelRowSpan(y); - for (int x = paddedWidth - image.Width; x < image.Width; x++) - { - input[0, y, x] = pixelSpan[x].B - mean[0]; - input[1, y, x] = pixelSpan[x].G - mean[1]; - input[2, y, x] = pixelSpan[x].R - mean[2]; - } -} -``` - -Here, we're creating a Tensor of the required size `(channels, paddedHeight, paddedWidth)`, accessing the pixel 
values, preprocessing them and finally assigning them to the tensor at the appropriate indicies. - -### Setup inputs - -Next, we will create the inputs to the model: - -```cs -var inputs = new List -{ - NamedOnnxValue.CreateFromTensor("image", input) -}; -``` - -To check the input node names for an ONNX model, you can use [Netron](https://github.com/lutzroeder/netron) to visualise the model and see input/output names. In this case, this model has `image` as the input node name. - -### Run inference - -Next, we will create an inference session and run the input through it: - -```cs -using var session = new InferenceSession(modelFilePath); -using IDisposableReadOnlyCollection results = session.Run(inputs); -``` - -### Postprocess output - -Next, we will need to postprocess the output to get boxes and associated label and confidence scores for each box: - -```cs -var resultsArray = results.ToArray(); -float[] boxes = resultsArray[0].AsEnumerable().ToArray(); -long[] labels = resultsArray[1].AsEnumerable().ToArray(); -float[] confidences = resultsArray[2].AsEnumerable().ToArray(); -var predictions = new List(); -var minConfidence = 0.7f; -for (int i = 0; i < boxes.Length - 4; i += 4) -{ - var index = i / 4; - if (confidences[index] >= minConfidence) - { - predictions.Add(new Prediction - { - Box = new Box(boxes[i], boxes[i + 1], boxes[i + 2], boxes[i + 3]), - Label = LabelMap.Labels[labels[index]], - Confidence = confidences[index] - }); - } -} -``` - -Note, we're only taking boxes that have a confidence above 0.7 to remove false positives. 
- -### View prediction - -Next, we'll draw the boxes and associated labels and confidence scores on the image to see how the model went: - -```cs -using var outputImage = File.OpenWrite(outImageFilePath); -Font font = SystemFonts.CreateFont("Arial", 16); -foreach (var p in predictions) -{ - image.Mutate(x => - { - x.DrawLines(Color.Red, 2f, new PointF[] { - - new PointF(p.Box.Xmin, p.Box.Ymin), - new PointF(p.Box.Xmax, p.Box.Ymin), - - new PointF(p.Box.Xmax, p.Box.Ymin), - new PointF(p.Box.Xmax, p.Box.Ymax), - - new PointF(p.Box.Xmax, p.Box.Ymax), - new PointF(p.Box.Xmin, p.Box.Ymax), - - new PointF(p.Box.Xmin, p.Box.Ymax), - new PointF(p.Box.Xmin, p.Box.Ymin) - }); - x.DrawText($"{p.Label}, {p.Confidence:0.00}", font, Color.White, new PointF(p.Box.Xmin, p.Box.Ymin)); - }); -} -image.Save(outputImage, format); -``` - -For each box prediction, we're using ImageSharp to draw red lines to create the boxes, and drawing the label and confidence text. - -## Running the program - -Now the program is created, we can run it will the following command: - -``` -dotnet run [path-to-model] [path-to-image] [path-to-output-image] -``` - -e.g. running: - -``` -dotnet run ~/Downloads/FasterRCNN-10.onnx ~/Downloads/demo.jpg ~/Downloads/out.jpg -``` - -detects the following objects in the image: - -![](out.jpg) \ No newline at end of file diff --git a/csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample/README.md b/csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample/README.md deleted file mode 100644 index 7e72547624..0000000000 --- a/csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample/README.md +++ /dev/null @@ -1,169 +0,0 @@ -# C# Sample: ResNet50 v2 - -The sample walks through how to run a pretrained ResNet50 v2 ONNX model using the Onnx Runtime C# API. - -The source code for this sample is available [here](Program.cs). - -## Prerequisites - -To run this sample, you'll need the following things: - -1. 
Install [.NET Core 3.1](https://dotnet.microsoft.com/download/dotnet-core/3.1) or higher for your OS (Mac, Windows or Linux). -2. Download the [ResNet50 v2](https://github.com/onnx/models/blob/master/vision/classification/resnet/model/resnet50-v2-7.onnx) ONNX model to your local system. -3. Download [this picture of a dog](dog.jpeg) to test the model. You can also use any image you like. - -## Getting Started - -Now we have everything set up, we can start adding code to run the model on the image. We'll do this in the main method of the program for simplicity. - -### Read paths - -Firstly, let's read the path to the model and path to the image we want to test in through program arguments: - -```cs -string modelFilePath = args[0]; -string imageFilePath = args[1]; -``` - -### Read image - -Next, we will read the image in using the cross-platform image library [ImageSharp](https://www.nuget.org/packages/SixLabors.ImageSharp): - -```cs -using Image<Rgb24> image = Image.Load<Rgb24>(imageFilePath, out IImageFormat format); -``` - -Note, we're specifically reading the `Rgb24` type so we can efficiently preprocess the image in a later step. - -### Resize image - -Next, we will resize the image to the appropriate size that the model is expecting; 224 pixels by 224 pixels: - -```cs -using Stream imageStream = new MemoryStream(); -image.Mutate(x => -{ - x.Resize(new ResizeOptions - { - Size = new Size(224, 224), - Mode = ResizeMode.Crop - }); -}); -image.Save(imageStream, format); -``` - -Note, we're doing a centered crop resize to preserve aspect ratio. 
- -### Preprocess image - -Next, we will preprocess the image according to the [requirements of the model](https://github.com/onnx/models/tree/master/vision/classification/resnet#preprocessing): - -```cs -Tensor<float> input = new DenseTensor<float>(new[] { 1, 3, 224, 224 }); -var mean = new[] { 0.485f, 0.456f, 0.406f }; -var stddev = new[] { 0.229f, 0.224f, 0.225f }; -for (int y = 0; y < image.Height; y++) -{ - Span<Rgb24> pixelSpan = image.GetPixelRowSpan(y); - for (int x = 0; x < image.Width; x++) - { - input[0, 0, y, x] = ((pixelSpan[x].R / 255f) - mean[0]) / stddev[0]; - input[0, 1, y, x] = ((pixelSpan[x].G / 255f) - mean[1]) / stddev[1]; - input[0, 2, y, x] = ((pixelSpan[x].B / 255f) - mean[2]) / stddev[2]; - } -} -``` - -Here, we're creating a Tensor of the required size `(batch-size, channels, height, width)`, accessing the pixel values, preprocessing them and finally assigning them to the tensor at the appropriate indices. - -### Setup inputs - -Next, we will create the inputs to the model: - -```cs -var inputs = new List<NamedOnnxValue> -{ - NamedOnnxValue.CreateFromTensor("data", input) -}; -``` - -To check the input node names for an ONNX model, you can use [Netron](https://github.com/lutzroeder/netron) to visualise the model and see input/output names. In this case, this model has `data` as the input node name. - -### Run inference - -Next, we will create an inference session and run the input through it: - -```cs -using var session = new InferenceSession(modelFilePath); -using IDisposableReadOnlyCollection<DisposableNamedOnnxValue> results = session.Run(inputs); -``` - -### Postprocess output - -Next, we will need to postprocess the output to get the softmax vector, as this is not handled by the model itself: - -```cs -IEnumerable<float> output = results.First().AsEnumerable<float>(); -float sum = output.Sum(x => (float)Math.Exp(x)); -IEnumerable<float> softmax = output.Select(x => (float)Math.Exp(x) / sum); -``` - -Other models may apply a Softmax node before the output, in which case you won't need this step. 
Again, you can use Netron to see the model outputs. - -### Extract top 10 - -Next, we will extract the top 10 class predictions: - -```cs -IEnumerable top10 = softmax.Select((x, i) => new Prediction { Label = LabelMap.Labels[i], Confidence = x }) - .OrderByDescending(x => x.Confidence) - .Take(10); -``` - -### Print results - -Next, we will print the top 10 results to the console: - -```cs -Console.WriteLine("Top 10 predictions for ResNet50 v2..."); -Console.WriteLine("--------------------------------------------------------------"); -foreach (var t in top10) -{ - Console.WriteLine($"Label: {t.Label}, Confidence: {t.Confidence}"); -} -``` - -## Running the program - -Now the program is created, we can run it will the following command: - -``` -dotnet run [path-to-model] [path-to-image] -``` - -e.g. - -``` -dotnet run ~/Downloads/resnet50-v2-7.onnx ~/Downloads/dog.jpeg -``` - -Running this on the following image: - -![](dog.jpeg) - -We get the following output: - -``` -Top 10 predictions for ResNet50 v2... --------------------------------------------------------------- -Label: Golden Retriever, Confidence: 0.9212826 -Label: Kuvasz, Confidence: 0.026514154 -Label: Clumber Spaniel, Confidence: 0.012455719 -Label: Labrador Retriever, Confidence: 0.004103844 -Label: Saluki, Confidence: 0.0033182495 -Label: Flat-Coated Retriever, Confidence: 0.0032045357 -Label: English Setter, Confidence: 0.002513516 -Label: Brittany, Confidence: 0.0023459378 -Label: Cocker Spaniels, Confidence: 0.0019343802 -Label: Sussex Spaniel, Confidence: 0.0019247672 -``` diff --git a/docs/AddingCustomOp.md b/docs/AddingCustomOp.md deleted file mode 100644 index 952aad3412..0000000000 --- a/docs/AddingCustomOp.md +++ /dev/null @@ -1,31 +0,0 @@ -Adding a new op -=============== - -## A new op can be written and registered with ONNXRuntime in the following 3 ways -### 1. 
Using the custom op API in the C/C++ APIs (onnxruntime_c_api.h) -* Create an OrtCustomOpDomain with the domain name used by the custom ops -* Create an OrtCustomOp structure for each op and add them to the OrtCustomOpDomain with OrtCustomOpDomain_Add -* Call OrtAddCustomOpDomain to add the custom domain of ops to the session options -See [this](../onnxruntime/test/shared_lib/test_inference.cc) for examples of MyCustomOp and SliceCustomOp that use the C++ helper API (onnxruntime_cxx_api.h). -You can also compile the custom ops into a shared library and use that to run a model via the C++ API. The same test file contains an example. -The source code for a sample custom op shared library containing two custom kernels is [here](../onnxruntime/test/testdata/custom_op_library/custom_op_library.cc). -See [this](../onnxruntime/test/python/onnxruntime_test_python.py) for an example called testRegisterCustomOpsLibrary that uses the Python API -to register a shared library that contains custom op kernels. -Currently, the only supported Execution Providers (EPs) for custom ops registered via this approach are the `CUDA` and the `CPU` EPs. - -Note that when a model is run on GPU, onnxruntime inserts a MemcpyToHost op before a CPU custom op and appends a MemcpyFromHost op after it, so that the tensor(s) are accessible throughout the call; no extra effort is required from the custom op developer in this case. - -To facilitate custom operator development, sharing and release, please check the [onnxruntime custom operator library](https://github.com/microsoft/ort-customops) project for more information. - -### 2. Using RegisterCustomRegistry API -* Implement your kernel and schema (if required) using the OpKernel and OpSchema APIs (headers are in the include folder). -* Create a CustomRegistry object and register your kernel and schema with this registry. -* Register the custom registry with ONNXRuntime using RegisterCustomRegistry API. 
- -See -[this](../onnxruntime/test/framework/local_kernel_registry_test.cc) for an example. - -### 3. Contributing the op to ONNXRuntime -This is mostly meant for ops that are in the process of being proposed to ONNX. This way you don't have to wait for an approval from the ONNX team -if the op is required in production today. -See [this](../onnxruntime/contrib_ops) for an example. diff --git a/docs/AddingExecutionProvider.md b/docs/AddingExecutionProvider.md deleted file mode 100644 index 3cadc45f90..0000000000 --- a/docs/AddingExecutionProvider.md +++ /dev/null @@ -1,37 +0,0 @@ -# Adding a new execution provider - -* Create a folder under onnxruntime/core/providers -* Create a folder under include/onnxruntime/core/providers, it should have the same name as the folder created in the first step. -* Create a new class, which must inherit from [IExecutionProvider](../include/onnxruntime/core/framework/execution_provider.h). The source code should be put in 'onnxruntime/core/providers/[your_provider_name]' -* Create a new header file under include/onnxruntime/core/providers/[your_provider_name]. The file should provide one function for creating an OrtProviderFactoryInterface. You may use 'include/onnxruntime/core/providers/cpu/cpu_provider_factory.h' as a template. You don't need to provide a function for creating MemoryInfo. -* Put a symbols.txt under 'onnxruntime/core/providers/[your_provider_name]'. The file should contain all the function names that would be exported from your provider. Usually, just a single function for creating provider factory is enough. -* Add your provider in onnxruntime_providers.cmake. Build it as a static lib. -* Add one line in cmake/onnxruntime.cmake, to the 'target_link_libraries' function call. Put your provider there. 
- - -Examples: - - * [CPU Execution - Provider](../onnxruntime/core/providers/cpu/cpu_execution_provider.h) - * [CUDA Execution - Provider](../onnxruntime/core/providers/cuda/cuda_execution_provider.h) - * [DNNL Execution - Provider](../onnxruntime/core/providers/dnnl/dnnl_execution_provider.h) - - -# Using the execution provider -1. Create a factory for that provider, by using the c function you exported in 'symbols.txt' -2. Put the provider factory into session options -3. Create session from that session option -e.g. - -```c - OrtEnv* env; - OrtInitialize(ORT_LOGGING_LEVEL_WARNING, "test", &env) - OrtSessionOptions* session_option = OrtCreateSessionOptions(); - OrtProviderFactoryInterface** factory; - OrtCreateCUDAExecutionProviderFactory(0, &factory); - OrtSessionOptionsAppendExecutionProvider(session_option, factory); - OrtReleaseObject(factory); - OrtCreateSession(env, model_path, session_option, &session); -``` diff --git a/docs/CSharp_API.md b/docs/CSharp_API.md deleted file mode 100644 index 38ba2a1678..0000000000 --- a/docs/CSharp_API.md +++ /dev/null @@ -1,309 +0,0 @@ -# ONNX Runtime C# API -The ONNX runtime provides a C# .Net binding for running inference on ONNX models in any of the .Net standard platforms. The API is .Net standard 1.1 compliant for maximum portability. This document describes the API. - -## NuGet Package -The Microsoft.ML.OnnxRuntime Nuget package includes the precompiled binaries for ONNX runtime, and includes libraries for Windows and Linux platforms with X64 CPUs. The APIs conform to .Net Standard 1.1. - -## Sample Code - -The unit tests contain several examples of loading models, inspecting input/output node shapes and types, as well as constructing tensors for scoring. 
- -* [../csharp/test/Microsoft.ML.OnnxRuntime.Tests/InferenceTest.cs#L166](../csharp/test/Microsoft.ML.OnnxRuntime.Tests/InferenceTest.cs#L166) - -## Getting Started -Here is simple tutorial for getting started with running inference on an existing ONNX model for a given input data. The model is typically trained using any of the well-known training frameworks and exported into the ONNX format. To start scoring using the model, open a session using the `InferenceSession` class, passing in the file path to the model as a parameter. - -```cs -var session = new InferenceSession("model.onnx"); -``` - -Once a session is created, you can execute queries using the `Run` method of the `InferenceSession` object. Currently, only `Tensor` type of input and outputs are supported. The results of the `Run` method are represented as a collection of .Net `Tensor` objects (as defined in [System.Numerics.Tensor](https://www.nuget.org/packages/System.Numerics.Tensors)). - -```cs -Tensor t1, t2; // let's say data is fed into the Tensor objects -var inputs = new List() - { - NamedOnnxValue.CreateFromTensor("name1", t1), - NamedOnnxValue.CreateFromTensor("name2", t2) - }; -using (var results = session.Run(inputs)) -{ - // manipulate the results -} -``` - -You can load your input data into Tensor objects in several ways. A simple example is to create the Tensor from arrays. - -```cs -float[] sourceData; // assume your data is loaded into a flat float array -int[] dimensions; // and the dimensions of the input is stored here -Tensor t1 = new DenseTensor(sourceData, dimensions); -``` - -Here is a [complete sample code](../csharp/sample/Microsoft.ML.OnnxRuntime.InferenceSample) that runs inference on a pretrained model. - -## Reuse input/output tensor buffers - -In some scenarios, you may want to reuse input/output tensors. This often happens when you want to chain 2 models (ie. feed one's output as input to another), or want to accelerate inference speed during multiple inference runs. 
- -### Chaining: Feed model A's output(s) as input(s) to model B - -```cs -InferenceSession session1, session2; // let's say 2 sessions are initialized - -Tensor t1; // let's say data is fed into the Tensor objects -var inputs1 = new List() - { - NamedOnnxValue.CreateFromTensor("name1", t1) - }; -// session1 inference -using (var outputs1 = session1.Run(inputs1)) -{ - // get intermediate value - var input2 = outputs1.First(); - - // modify the name of the ONNX value - input2.Name = "name2"; - - // create input list for session2 - var inputs2 = new List() { input2 }; - - // session2 inference - using (var results = session2.Run(inputs2)) - { - // manipulate the results - } -} -``` - -### Multiple inference runs with fixed sized input(s) and output(s) - -If the model has fixed-size inputs and outputs of numeric tensors, you can use "FixedBufferOnnxValue" to accelerate the inference speed. By using "FixedBufferOnnxValue", the container objects only need to be allocated/disposed one time during multiple InferenceSession.Run() calls. This avoids some overhead which may be beneficial for smaller models where the time is noticeable in the overall running time. - -An example can be found at `TestReusingFixedBufferOnnxValueNonStringTypeMultiInferences()`: -* [../csharp/test/Microsoft.ML.OnnxRuntime.Tests/InferenceTest.cs#L1047](../csharp/test/Microsoft.ML.OnnxRuntime.Tests/InferenceTest.cs#L1047) - -## Running on GPU (Optional) -If using the GPU package, simply use the appropriate SessionOptions when creating an InferenceSession. - -```cs -int gpuDeviceId = 0; // The GPU device ID to execute on -var session = new InferenceSession("model.onnx", SessionOptions.MakeSessionOptionWithCudaProvider(gpuDeviceId)); -``` - -## API Reference - -### OrtEnv -```cs -class OrtEnv -``` -Holds some methods which can be used to tune the ONNX Runtime's runtime environment - -#### Constructor -No public constructor available. 
- -#### Methods -```cs -static OrtEnv Instance(); -``` -Returns an instance of the singleton class `OrtEnv`. - -```cs -void EnableTelemetryEvents(); -``` -Enables platform-specific telemetry collection where applicable. Please see [Privacy](./Privacy.md) for more details. - -```cs -void DisableTelemetryEvents(); -``` -Disables platform-specific telemetry collection. Please see [Privacy](./Privacy.md) for more details. - -### InferenceSession -```cs -class InferenceSession: IDisposable -``` - -The runtime representation of an ONNX model - -#### Constructor -```cs -InferenceSession(string modelPath); -InferenceSession(string modelPath, SessionOptions options); -``` - -#### Properties -```cs -IReadOnlyDictionary InputMetadata; -``` -Data types and shapes of the input nodes of the model. - -```cs -IReadOnlyDictionary OutputMetadata; -``` -Data types and shapes of the output nodes of the model. - -#### Methods -```cs -IDisposableReadOnlyCollection Run(IReadOnlyCollection inputs); -``` -Runs the model with the given input data to compute all the output nodes and returns the output node values. Both input and output are collections of NamedOnnxValue, which in turn is a name-value pair of string names and Tensor values. The outputs are the IDisposable variant of NamedOnnxValue, since they wrap some unmanaged objects. - -```cs -IDisposableReadOnlyCollection Run(IReadOnlyCollection inputs, IReadOnlyCollection desiredOutputNodes); -``` -Runs the model on given inputs for the given output nodes only. - -### System.Numerics.Tensor -The primary .Net object that is used for holding input-output of the model inference. Details on this newly introduced data type can be found in its [open-source implementation](https://github.com/dotnet/corefx/tree/master/src/System.Numerics.Tensors). The binaries are available as a [.Net NuGet package](https://www.nuget.org/packages/System.Numerics.Tensors). 
- -### NamedOnnxValue -```cs -class NamedOnnxValue; -``` -Represents a name-value pair of string names and any type of value that ONNX runtime supports as input-output data. Currently, only Tensor objects are supported as input-output values. - -#### Constructor -No public constructor available. - -#### Properties -```cs -string Name; // get or set the name -``` - -#### Methods -```cs -static NamedOnnxValue CreateFromTensor(string name, Tensor); -``` -Creates a NamedOnnxValue from a name and a Tensor object. - -```cs -Tensor AsTensor(); -``` -Accesses the value as a Tensor. Returns null if the value is not a Tensor. - -### DisposableNamedOnnxValue -```cs -class DisposableNamedOnnxValue: NamedOnnxValue, IDisposable; -``` -This is a disposable variant of NamedOnnxValue, used for holding output values which contain objects allocated in unmanaged memory. - -### FixedBufferOnnxValue -```cs -class FixedBufferOnnxValue: IDisposable; -``` -Class `FixedBufferOnnxValue` makes it possible to pin the tensor buffer. This helps to minimize overhead within each inference run. - -`FixedBufferOnnxValue` can be used as either input or output. However, if used as output, it has to be a numeric tensor. - -`FixedBufferOnnxValue` implements `IDisposable`, so make sure it gets disposed after use. -#### Methods -```cs -static FixedBufferOnnxValue CreateFromTensor(Tensor); -``` -Creates a FixedBufferOnnxValue from a Tensor object. - - -### IDisposableReadOnlyCollection -```cs -interface IDisposableReadOnlyCollection: IReadOnlyCollection, IDisposable -``` -Collection interface to hold disposable values. Used for output of Run method. - -### SessionOptions -```cs -class SessionOptions: IDisposable; -``` -A collection of properties to be set for configuring the OnnxRuntime session - -#### Constructor -```cs -SessionOptions(); -``` -Constructs a SessionOptions with all options at default/unset values. 
- -#### Properties -```cs -static SessionOptions Default; //read-only -``` -Accessor to the default static option object - -#### Methods -```cs -SetSessionGraphOptimizationLevel(GraphOptimizationLevel graph_transformer_level); -``` -See [ONNX_Runtime_Graph_Optimizations.md] for more details. - -```cs -SetSessionExecutionMode(ExecutionMode execution_mode); -``` - * ORT_SEQUENTIAL - execute operators in the graph sequentially. - * ORT_PARALLEL - execute operators in the graph in parallel. -See [ONNX_Runtime_Perf_Tuning.md] for more details. - -### NodeMetadata -Container of metadata for a model graph node, used for communicating the shape and type of the input and output nodes. - -#### Properties -```cs -int[] Dimensions; -``` -Read-only shape of the node, when the node is a Tensor. Undefined if the node is not a Tensor. - -```cs -System.Type ElementType; -``` -Type of the elements of the node, when node is a Tensor. Undefined for non-Tensor nodes. - -```cs -bool IsTensor; -``` -Whether the node is a Tensor - -### Exceptions -```cs -class OnnxRuntimeException: Exception; -``` - -The type of Exception that is thrown in most of the error conditions related to Onnx Runtime. - -### ModelMetadata -```cs -class ModelMetadata -``` -Encapsulates some metadata about the ONNX model. - -#### Constructor -No public constructor available. - -The `ModelMetadata` instance for an ONNX model may be obtained by querying the `ModelMetadata` property of an `InferenceSession` instance. - -#### Properties -```cs -string ProducerName; -``` -Holds the producer name of the ONNX model. - -```cs -string GraphName; -``` -Holds the graph name of the ONNX model. - -```cs -string Domain; -``` -Holds the opset domain of the ONNX model. - -```cs -string Description; -``` -Holds the description of the ONNX model. - -```cs -long Version; -``` -Holds the version of the ONNX model. 
- -```cs -Dictionary CustomMetadataMap; -``` -Holds a dictionary containing key-value pairs of custom metadata held by the ONNX model. diff --git a/docs/C_API.md b/docs/C_API.md deleted file mode 100644 index 3d6999046c..0000000000 --- a/docs/C_API.md +++ /dev/null @@ -1,77 +0,0 @@ -# C API - -## Features - -* Creating an InferenceSession from an on-disk model file and a set of SessionOptions. -* Registering customized loggers. -* Registering customized allocators. -* Registering predefined providers and set the priority order. ONNXRuntime has a set of predefined execution providers, like CUDA, DNNL. User can register providers to their InferenceSession. The order of registration indicates the preference order as well. -* Running a model with inputs. These inputs must be in CPU memory, not GPU. If the model has multiple outputs, user can specify which outputs they want. -* Converting an in-memory ONNX Tensor encoded in protobuf format to a pointer that can be used as model input. -* Setting the thread pool size for each session. -* Setting graph optimization level for each session. -* Dynamically loading custom ops. [Instructions](/docs/AddingCustomOp.md) -* Ability to load a model from a byte array. See ```OrtCreateSessionFromArray``` in [onnxruntime_c_api.h](/include/onnxruntime/core/session/onnxruntime_c_api.h). -* **Global/shared threadpools:** By default each session creates its own set of threadpools. In situations where multiple -sessions need to be created (to infer different models) in the same process, you end up with several threadpools created -by each session. In order to address this inefficiency we introduce a new feature called global/shared threadpools. -The basic idea here is to share a set of global threadpools across multiple sessions. Typical usage of this feature -is as follows - * Populate ```ThreadingOptions```. Use the value of 0 for ORT to pick the defaults. 
- * Create env using ```CreateEnvWithGlobalThreadPools()``` - * Create session and call ```DisablePerSessionThreads()``` on the session options object - * Call ```Run()``` as usual -* **Share allocator(s) between sessions:** - * *Description*: This feature allows multiple sessions in the same process to use the same allocator(s). - * *Scenario*: You've several sessions in the same process and see high memory usage. One of the reasons for this is as follows. Each session creates its own CPU allocator which is arena based by default. [ORT implements](onnxruntime/core/framework/bfc_arena.h) a simplified version of an arena allocator that is based on [Doug Lea's best-first with coalescing algorithm](http://gee.cs.oswego.edu/dl/html/malloc.html). Each allocator lives in its own session. It allocates a large region of memory during init time and thereafter it chunks, coalesces and extends this initial region as per allocation/deallocation demands. Overtime the arena ends up with unused chunks of memory per session. Moreover, the memory allocated by the arena is never returned to the system; once allocated it always remains allocated. All these factors add up when using multiple sessions (each with its own arena) thereby increasing the overall memory consumption of the process. Hence it becomes important to share the arena allocator between sessions. - * *Usage*: - * Create and register a shared allocator with the env using the ```CreateAndRegisterAllocator``` API. This allocator is then reused by all sessions that use the same env instance unless a session -chooses to override this by setting ```session_state.use_env_allocators``` to "0". - * Set ```session.use_env_allocators``` to "1" for each session that wants to use the env registered allocators. - * See test ```TestSharedAllocatorUsingCreateAndRegisterAllocator``` in - onnxruntime/test/shared_lib/test_inference.cc for an example. 
- * Configuring *OrtArenaCfg*: - * Default values for these configs can be found in the [BFCArena class](onnxruntime/core/framework/bfc_arena.h). - * ```initial_chunk_size_bytes```: This is the size of the region that the arena allocates first. Chunks are handed over to allocation requests from this region. If the logs show that the arena is getting extended a lot more than expected, you're better off choosing a big enough initial size for this. - * ```max_mem```: This is the maximum amount of memory the arena allocates. If a chunk cannot be serviced by any existing region, the arena extends itself by allocating one more region depending on available memory (max_mem - allocated_so_far). An error is returned if available memory is less than the requested extension. - * ```arena_extend_strategy```: This can take only 2 values currently: kSameAsRequested or kNextPowerOfTwo. As the name suggests kNextPowerOfTwo (the default) extends the arena by a power of 2, while kSameAsRequested extends by a size that is the same as the allocation request each time. kSameAsRequested is suited for more advanced configurations where you know the expected memory usage in advance. - * ```max_dead_bytes_per_chunk```: This controls whether a chunk is split to service an allocation request. Currently if the difference between the chunk size and requested size is less than this value, the chunk is not split. This has the potential to waste memory by keeping a part of the chunk unused (hence called dead bytes) throughout the process thereby increasing the memory usage (until this chunk is returned to the arena). - -* **Share initializer(s) between sessions:** - * *Description*: This feature allows a user to share the same instance of an initializer across -multiple sessions. - * *Scenario*: You've several models that use the same set of initializers except the last few layers of the model and you load these models in the same process. 
When every model (session) creates a separate instance of the same initializer, it leads to excessive and wasteful memory usage since in this case it's the same initializer. You want to optimize memory usage while having the flexibility to allocate the initializers (possibly even store them in shared memory). - * *Example Usage*: Use the ```AddInitializer``` API to add a pre-allocated initializer to session options before calling ```CreateSession```. Use the same instance of session options to create several sessions allowing the initializer(s) to be shared between the sessions. See [C API sample usage (TestSharingOfInitializer)](../onnxruntime/test/shared_lib/test_inference.cc) and [C# API sample usage (TestWeightSharingBetweenSessions)](../csharp/test/Microsoft.ML.OnnxRuntime.Tests/InferenceTest.cs). - -## Usage Overview - -1. Include [onnxruntime_c_api.h](/include/onnxruntime/core/session/onnxruntime_c_api.h). -2. Call OrtCreateEnv -3. Create Session: OrtCreateSession(env, model_uri, nullptr,...) - - Optionally add more execution providers (e.g. for CUDA use OrtSessionOptionsAppendExecutionProvider_CUDA) -4. Create Tensor - 1) OrtCreateMemoryInfo - 2) OrtCreateTensorWithDataAsOrtValue -5. OrtRun - -## Sample code - -The example below shows a sample run using the SqueezeNet model from ONNX model zoo, including dynamically reading model inputs, outputs, shape and type information, as well as running a sample vector and fetching the resulting class probabilities for inspection. - -* [../csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp](../csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp) - -## Deployment - -### Windows 10 - -Your installer should put the onnxruntime.dll into the same folder as your application. 
Your application can either use [load-time dynamic linking](https://docs.microsoft.com/en-us/windows/win32/dlls/using-load-time-dynamic-linking) or [run-time dynamic linking](https://docs.microsoft.com/en-us/windows/win32/dlls/using-run-time-dynamic-linking) to bind to the dll. - -#### Dynamic Link Library Search Order - -This is an important article on how Windows finds supporting dlls: [Dynamic Link Library Search Order](https://docs.microsoft.com/en-us/windows/win32/dlls/dynamic-link-library-search-order). - -There are some cases where the app is not directly consuming the onnxruntime but instead calling into a DLL that is consuming the onnxruntime. People building these DLLs that consume the onnxruntime need to take care with folder structures. Do not modify the system %path% variable to add your folders. This can conflict with other software on the machine that is also using the onnxruntime. Instead place your DLL and the onnxruntime DLL in the same folder and use [run-time dynamic linking](https://docs.microsoft.com/en-us/windows/win32/dlls/using-run-time-dynamic-linking) to bind explicitly to that copy. You can use code like that in [GetModulePath()](https://github.com/microsoft/Windows-Machine-Learning/blob/master/Samples/SampleSharedLib/SampleSharedLib/FileHelper.cpp) to find out what folder your dll is loaded from. - -## Telemetry - -To turn on/off telemetry collection on official Windows builds, please use Enable/DisableTelemetryEvents() in the C API. See the [Privacy](./Privacy.md) page for more information on telemetry collection and Microsoft's privacy policy. diff --git a/docs/ExportPyTorchCustomOps.md b/docs/ExportPyTorchCustomOps.md deleted file mode 100644 index 20835ad76f..0000000000 --- a/docs/ExportPyTorchCustomOps.md +++ /dev/null @@ -1,186 +0,0 @@ - -## Export of Custom Ops for ONNX Runtime - -This document explains the process of exporting PyTorch models with custom ONNX Runtime ops. 
-The aim is to export a PyTorch model with operators that are not supported in ONNX, and extend ONNX Runtime to support these custom ops. - -Currently, a torch op can be exported as a custom operator using our custom op (symbolic) registration API. We can -use this API to register custom ONNX Runtime ops under "com.microsoft" domain. - - -### 1. Exporting a Custom Op - -In this example, we take Inverse operator as an example. To enable export of ```torch.inverse```, a symbolic function -can be created and registered as part of custom ops: - -```python -from torch.onnx import register_custom_op_symbolic - -def my_inverse(g, self): - return g.op("com.microsoft::Inverse", self) - -# register_custom_op_symbolic('::inverse', my_inverse, ) -register_custom_op_symbolic('::inverse', my_inverse, 1) - -``` -`````` is a part of the torch operator name. For standard torch operators, namespace can be omitted. - -```com.microsoft``` should be used as the custom opset domain for ONNX Runtime ops. You can choose the custom opset -version during op registration. - -All symbolics for ONNX Runtime custom ops are defined in ``tools/python/register_custom_ops_pytorch_exporter.py``. -If you are adding a symbolic function for a new custom op, add the function to this file. - - -### 2. Extending ONNX Runtime with Custom Ops -The next step is to add op schema and kernel implementation in ONNX Runtime. -Consider the Inverse custom op as an example added in: -https://github.com/microsoft/onnxruntime/pull/3485 - - -Custom op schema and shape inference function should be added in ```onnxruntime/core/graph/contrib_ops/contrib_defs.cc ``` -using ```ONNX_CONTRIB_OPERATOR_SCHEMA```. - -```c++ -ONNX_CONTRIB_OPERATOR_SCHEMA(Inverse) - .SetDomain(kMSDomain) // kMSDomain = "com.microsoft" - .SinceVersion(1) // Same version used at op (symbolic) registration - ... 
-``` - -To comply with ONNX guideline for new operators, a new operator should have complete reference implementation tests and -shape inference tests. - -Reference implementation python tests should be added in: -``onnxruntime/test/python/contrib_ops`` -E.g.: ``onnxruntime/test/python/contrib_ops/onnx_test_trilu.py`` - -Shape inference C++ tests should be added in: -``onnxruntime/test/contrib_ops`` -E.g.: ``onnxruntime/test/contrib_ops/trilu_shape_inference_test.cc`` - -The operator kernel should be implemented using ```Compute``` function -under contrib namespace in ```onnxruntime/contrib_ops/cpu/.cc``` -for CPU and ```onnxruntime/contrib_ops/cuda/.cc``` for CUDA. - -```c -namespace onnxruntime { -namespace contrib { - -class Inverse final : public OpKernel { - public: - explicit Inverse(const OpKernelInfo& info) : OpKernel(info) {} - Status Compute(OpKernelContext* ctx) const override; - - private: - ... -}; - -ONNX_OPERATOR_KERNEL_EX( - Inverse, - kMSDomain, - 1, - kCpuExecutionProvider, - KernelDefBuilder() - .TypeConstraint("T", BuildKernelDefConstraints()), - Inverse); - -Status Inverse::Compute(OpKernelContext* ctx) const { -... // kernel implementation -} - -} // namespace contrib -} // namespace onnxruntime - -``` - -Operator kernel should be registered in ```onnxruntime/contrib_ops/cpu_contrib_kernels.cc``` -for CPU and ```onnxruntime/contrib_ops/cuda_contrib_kernels.cc``` for CUDA. - -Now you should be able to build and install ONNX Runtime to start using your custom op. 
- -##### ONNX Runtime Tests - -ONNX Runtime custom op kernel tests should be added in: ```onnxruntime/test/contrib_ops/<operator>_test.cc ``` - -```c++ -namespace onnxruntime { -namespace test { - -// Add a comprehensive set of unit tests for custom op kernel implementation - -TEST(InverseContribOpTest, two_by_two_float) { - OpTester test("Inverse", 1, kMSDomain); // custom opset version and domain - test.AddInput<float>("X", {2, 2}, {4, 7, 2, 6}); - test.AddOutput<float>("Y", {2, 2}, {0.6f, -0.7f, -0.2f, 0.4f}); - test.Run(); -} - -... - -} // namespace test -} // namespace onnxruntime - -``` - - -### 3. Test model Export End to End - -Once the custom op is registered in the exporter and implemented in ONNX Runtime, you should be able to -export it as part of your ONNX model and run it with ONNX Runtime. - -Below you can find a sample script for exporting and running the inverse operator as part of a model. -The exported model includes a combination of ONNX standard ops and the custom ops. - -This test also compares the output of PyTorch model with ONNX Runtime outputs to test both the operator export and -implementation. - -```python -import torch -import onnxruntime -import io -import numpy - - -class CustomInverse(torch.nn.Module): - def forward(self, x): - return torch.inverse(x) + x - -x = torch.randn(3, 3) - -# Export model to ONNX -f = io.BytesIO() -torch.onnx.export(CustomInverse(), (x,), f) - -model = CustomInverse() -pt_outputs = model(x) - -# Run the exported model with ONNX Runtime -ort_sess = onnxruntime.InferenceSession(f.getvalue()) -ort_inputs = dict((ort_sess.get_inputs()[i].name, input.cpu().numpy()) for i, input in enumerate((x,))) -ort_outputs = ort_sess.run(None, ort_inputs) - -# Validate PyTorch and ONNX Runtime results -numpy.testing.assert_allclose(pt_outputs.cpu().numpy(), ort_outputs[0], rtol=1e-03, atol=1e-05) -``` - -By default, the opset version will be set to ``1`` for custom opsets.
If you'd like to export your -custom op to a higher opset version, you can specify the custom opset domain and version using -the ``custom_opsets argument`` when calling the export API. Note that this is different than the opset -version associated with default ```ONNX``` domain. - -``` -torch.onnx.export(CustomInverse(), (x,), f, custom_opsets={"com.microsoft": 5}) -``` - -Note that you can export a custom op to any version >= the opset version used at registration. - -We have a set of tests for export and output validation of ONNX models with ONNX Runtime custom ops in -``tools/test/test_test_custom_ops_pytorch_exporter.py``. If you're adding a new custom operator, please -make sure to include tests in this file. - -You can run these tests using the command: - -``` -PYTHONPATH= pytest -v test_custom_ops_pytorch_exporter.py -``` diff --git a/docs/InferenceHighLevelDesign.md b/docs/InferenceHighLevelDesign.md deleted file mode 100644 index d5f0e73c1c..0000000000 --- a/docs/InferenceHighLevelDesign.md +++ /dev/null @@ -1,135 +0,0 @@ -# ONNX Runtime High Level Design - -This document outlines the high level design of -ONNX Runtime - a high performance, cross platform engine. - -## Key objectives -* Maximally and automatically leverage the custom accelerators and runtimes -available on disparate platforms. -* Provide the right abstraction and runtime support for custom accelerators and -runtimes. We call this abstraction an [execution -provider](../include/onnxruntime/core/framework/execution_provider.h). It defines and exposes a set of -its capabilities to ONNXRuntime: a set of single or fused nodes it can -execute, its memory allocator, and more. Custom accelerators and runtimes are -instances of execution providers. -* We don't expect that an execution provider can always run an ONNX model fully -on its device. This means that ONNXRuntime must be able to execute a single -model in a heterogeneous environment involving multiple execution providers. 
-* Provide support for high-level optimizations that can be expressed as -model-to-model transformations via a [graph-transformation -API](../include/onnxruntime/core/optimizer/graph_transformer.h). Such -transformations fall into two categories: global transformations, those that -require analysis and transformation of the entire graph, and local -transformations, which can be captured as simple (algebraic) [rewriting -rules](../include/onnxruntime/core/optimizer/rewrite_rule.h). - -## High-level system architecture -The flow is quite simple. Starting from an ONNX model, ONNXRuntime first -converts the model graph into its in-memory graph representation. It then -applies a number of graph transformations that a) perform a set of provider -independent optimizations such as cast transformations between float16 and float32, and b) partition the -graph into a set of subgraphs based on the available execution providers. Each -subgraph is assigned to an execution provider. We ensure that a subgraph can be -executed by an execution provider by querying the capability of the execution -provider using the GetCapability() API. - -![ONNXRuntime high level system architecture](https://azurecomcdn.azureedge.net/mediahandler/acomblog/media/Default/blog/228d22d3-6e3e-48b1-811c-1d48353f031c.png) - -### More about partitioning -ONNXRuntime partitions a model graph into subgraphs based on the available execution providers, one for each distinct provider. ONNXRuntime provides -a default execution provider that is used as the fallback execution for the -operators that cannot be pushed onto the more specialized but more efficient -execution providers. Intuitively we want to push computation to more -specialized execution providers whenever possible. - -We use a simple graph partitioning technique. The available execution providers -will be considered in a specific order, and each will be assigned the maximal -subgraphs (possibly more than one) that it is able to handle.
The -ONNXRuntime-provided default execution provider will be the last one -considered, and it ensures completeness. More sophisticated optimizations can be -considered in the future (or can even be implemented as a composite execution -provider). - -Conceptually, each partition is reduced to a single fused operator. It is -created by invoking the execution provider's Compile() method and wraps it as a -custom operator. Currently we support only synchronous mode of execution. An execution -provider exposes its memory allocator, which is used to allocate the input -tensors for the execution provider. The rewriting and partitioning transform the -initial model graph into a new graph composed of operators assigned to either -the default execution provider or other registered execution -providers. The ONNXRuntime execution engine is responsible for running this graph. - -## Key design decisions -* Multiple threads can invoke the Run() method on the same -inference session object. See [API doc](C_API.md) for more details. -* To facilitate this, the Compute() function of all kernels is const -implying the kernels are stateless. -* Implementations of the operators by execution providers are called -kernels. Each execution provider supports a subset of the (ONNX) -operators/kernels. -* The ONNX Runtime guarantees that all operators are supported by the default -execution provider. -* Tensor representation: ONNXRuntime will utilize a standard representation for -the tensor runtime values. The execution providers can internally use a -different representation if they choose to, but it is their responsibility to -convert the values from/to the standard representation at the boundaries of -their subgraph. 
- -## Extensibility Options -* [Add a custom operator/kernel](AddingCustomOp.md) -* [Add an execution provider](AddingExecutionProvider.md) -* [Add a new graph -transform](../include/onnxruntime/core/optimizer/graph_transformer.h) -* [Add a new rewrite rule](../include/onnxruntime/core/optimizer/rewrite_rule.h) - -## The ONNX Runtime and Windows OS integration - -The ONNX runtime shipped with the Windows operating system in build 1809 (RS5). The runtime was embedded inside the Windows.AI.MachineLearning.dll and was exposed via that WinRT API (WinML for short). It includes CPU support and a DirectML execution provider for GPU support. Since then it has continued to ship in every version of Windows. - -Starting with the ONNX Runtime 1.2 release we are bringing a new layered architecture to the ONNX Runtime and Windows ML. -*Note: This feature is preview as of the 1.2 release* - -The high level design looks like this - -![ONNX + WinML layered architecture](images/layered-architecture.png) - -You can see we replaced the embedded ONNX runtime with the new ONNXRuntime.dll. With this new approach customers have flexibility on which API they choose to use and on how they want to distribute the binaries. - -### API choice - -Developers can now choose which API works best for their scenario. - -||WinRT|C API| -|--|--|--| -|Type system| Integration with Windows RT types| Platform neutral types| -|Language support| Language support via WinRT Projections| Language support via per language projections| -|Tensorization| Accepts VideoFrames and converts to tensors (support for CPU and GPU)| Accepts tensors| - -### Distribution choice - -You can also choose to use runtimes included in the Windows OS, or use the redist nuget to ship the runtime with the app. 
- -|Distribution|Inbox|App NuGet| -|--|--|--| -|Disk footprint| Included in the OS| Included in the App| -|Servicing fixes| Serviced by OS updates| Serviced by the App| -|Execution Providers| CPU & DirectML EP | App chosen EP| -|Compatibility testing| Tested with OS flights against supported GPUs and CPUs | App performs compatibility testing| -|Opset| Refreshed in OS updates| App chooses| - - -### Using the NuGet WinRT API with other C-API distributions -The WinRT API NuGet is distributed with a curated build of the OnnxRuntime engine. App developers may wish to use the WinRT API, but find themselves limited to the functionality provided by the curated OnnxRuntime engine distributed as part of the WinRT API NuGet package. This can happen because the OnnxRuntime engine shipped with the WinRT API NuGet package only contains the CPU and DML execution providers. - -App developers may additionally wish to use a custom build-from-source version of the OnnxRuntime engine as well, or use a prebuilt version of the OnnxRuntime engine from another distribution source like the Microsoft.ML.OnnxRuntime.MKLML distribution. - -To enable this, the WinRT API NuGet has been made to be compatible with a set of OnnxRuntime engines that ship in different NuGet packages. - -Please refer to the following list of distributions with compatible OnnxRuntime engines. -- [Microsoft.ML.OnnxRuntime](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime) -- [Microsoft.ML.OnnxRuntime.DirectML](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime.DirectML/) -- [Microsoft.ML.OnnxRuntime.MKLML](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime.MKLML) - -Note that compatible distributions must match in release version. - -In order to use compatible engines, replace the onnxruntime.dll with the desired engine binary and its associated binaries.
\ No newline at end of file diff --git a/docs/Java_API.md b/docs/Java_API.md deleted file mode 100644 index 72978e0770..0000000000 --- a/docs/Java_API.md +++ /dev/null @@ -1,79 +0,0 @@ -# ONNX Runtime Java API -The ONNX runtime provides a Java binding for running inference on ONNX models on a JVM, using Java 8 or newer. - -Release artifacts are published to Maven Central for use as a dependency in most Java build tools. The artifacts are built with support for some popular platforms. - -![Version Shield](https://img.shields.io/maven-central/v/com.microsoft.onnxruntime/onnxruntime) - -| Artifact | Description | Supported Platforms | -|-----------|-------------|---------------------| -| [com.microsoft.onnxruntime:onnxruntime](https://search.maven.org/artifact/com.microsoft.onnxruntime/onnxruntime) | CPU | Windows x64, Linux x64, macOS x64 | -| [com.microsoft.onnxruntime:onnxruntime_gpu](https://search.maven.org/artifact/com.microsoft.onnxruntime/onnxruntime_gpu) | GPU (CUDA) | Windows x64, Linux x64 | - -For building locally, please see the [Java API development documentation](../java/README.md) for more details. - -For customization of the loading mechanism of the shared library, please see [advanced loading instructions](../java/README.md#advanced-loading). - -## API Reference - -The Javadoc is available [here](https://javadoc.io/doc/com.microsoft.onnxruntime/onnxruntime). - -## Sample Code - -An example implementation is located in -[src/test/java/sample/ScoreMNIST.java](../java/src/test/java/sample/ScoreMNIST.java). -Once compiled the sample code expects the following arguments `ScoreMNIST -<path-to-mnist-model> <path-to-mnist> <scikit-learn-flag>`. MNIST is expected -to be in libsvm format. If the optional scikit-learn flag is supplied the model -is expected to be produced by skl2onnx (so expects a flat feature vector, and -produces a structured output), otherwise the model is expected to be a CNN from -pytorch (expecting a `[1][1][28][28]` input, producing a vector of -probabilities).
Two example models are provided in [testdata](../java/testdata), -`cnn_mnist_pytorch.onnx` and `lr_mnist_scikit.onnx`. The first is a LeNet5 style -CNN trained using PyTorch, the second is a logistic regression trained using scikit-learn. - -The unit tests contain several examples of loading models, inspecting input/output node shapes and types, as well as constructing tensors for scoring. - -* [../java/src/test/java/ai/onnxruntime/InferenceTest.java#L66](../java/src/test/java/ai/onnxruntime/InferenceTest.java#L66) - -## Getting Started -Here is a simple tutorial for getting started with running inference on an existing ONNX model for given input data. The model is typically trained using any of the well-known training frameworks and exported into the ONNX format. -Note the code presented below uses syntax available from Java 10 onwards. The Java 8 syntax is similar but more verbose. -To start a scoring session, first create the `OrtEnvironment`, then open a session using the `OrtSession` class, passing in the file path to the model as a parameter. - - var env = OrtEnvironment.getEnvironment(); - var session = env.createSession("model.onnx",new OrtSession.SessionOptions()); - -Once a session is created, you can execute queries using the `run` method of the `OrtSession` object. -At the moment we support `OnnxTensor` inputs, and models can produce `OnnxTensor`, `OnnxSequence` or `OnnxMap` outputs. The latter two are more likely when scoring models produced by frameworks like scikit-learn. -The run call expects a `Map<String,OnnxTensor>` where the keys match input node names stored in the model. These can be viewed by calling `session.getInputNames()` or `session.getInputInfo()` on an instantiated session. -The run call produces a `Result` object, which contains a `Map<String,OnnxValue>` representing the output. The `Result` object is `AutoCloseable` and can be used in a try-with-resources statement to -prevent references from leaking out.
Once the `Result` object is closed, all its child `OnnxValue`s are closed too. - - OnnxTensor t1,t2; - var inputs = Map.of("name1",t1,"name2",t2); - try (var results = session.run(inputs)) { - // manipulate the results - } - -You can load your input data into OnnxTensor objects in several ways. The most efficient way is to use a `java.nio.Buffer`, but it's possible to use multidimensional arrays too. If constructed using arrays the arrays must not be ragged. - - FloatBuffer sourceData; // assume your data is loaded into a FloatBuffer - long[] dimensions; // and the dimensions of the input are stored here - var tensorFromBuffer = OnnxTensor.createTensor(env,sourceData,dimensions); - - float[][] sourceArray = new float[28][28]; // assume your data is loaded into a float array - var tensorFromArray = OnnxTensor.createTensor(env,sourceArray); - -Here is a [complete sample program](../java/src/test/java/sample/ScoreMNIST.java) that runs inference on a pretrained MNIST model. - -## Running on a GPU or with another provider (Optional) -To enable other execution providers like GPUs simply turn on the appropriate flag on SessionOptions when creating an OrtSession. - - int gpuDeviceId = 0; // The GPU device ID to execute on - var sessionOptions = new OrtSession.SessionOptions(); - sessionOptions.addCUDA(gpuDeviceId); - var session = environment.createSession("model.onnx", sessionOptions); - -The execution providers are preferred in the order they were enabled. - diff --git a/docs/ONNX_Runtime_Graph_Optimizations.md b/docs/ONNX_Runtime_Graph_Optimizations.md deleted file mode 100644 index 9b23d30fed..0000000000 --- a/docs/ONNX_Runtime_Graph_Optimizations.md +++ /dev/null @@ -1,148 +0,0 @@ -# Graph Optimizations in ONNX Runtime - -ONNX Runtime provides various graph optimizations to improve model performance.
Graph optimizations are essentially graph-level transformations, ranging from small graph simplifications and node eliminations to more complex node fusions and layout optimizations. - -Graph optimizations are divided in several categories (or *levels*) based on their complexity and functionality. They can be performed either *online* or *offline*. In online mode, the optimizations are done before performing the inference, while in offline mode, the runtime saves the optimized graph to disk. ONNX Runtime provides Python, C#, C++, and C APIs to enable different optimization levels and to choose between offline vs. online mode. - -Below we provide details on the optimization levels, the online/offline mode, and the various APIs to control them. - -## Graph Optimization Levels - -Graph optimizations are divided in three levels: -* Basic -* Extended -* Layout Optimizations - -The optimizations belonging to one level are performed after the optimizations of the previous level have been applied (e.g., extended optimizations are applied after basic optimizations have been applied). - -**All optimizations are enabled by default.** - -### Basic Graph Optimizations - -These are semantics-preserving graph rewrites which remove redundant nodes and redundant computation. They run before graph partitioning and thus apply to all the execution providers. Available basic graph optimizations are as follows: - -* Constant Folding: Statically computes parts of the graph that rely only on constant initializers. This eliminates the need to compute them during runtime. - -* Redundant node eliminations: Remove all redundant nodes without changing the graph structure. The following such optimizations are currently supported: - * Identity Elimination - * Slice Elimination - * Unsqueeze Elimination - * Dropout Elimination - -* Semantics-preserving node fusions : Fuse/fold multiple nodes into a single node. For example, Conv Add fusion folds the Add operator as the bias of the Conv operator. 
The following such optimizations are currently supported: - * Conv Add Fusion - * Conv Mul Fusion - * Conv BatchNorm Fusion - * Relu Clip Fusion - * Reshape Fusion - -### Extended Graph Optimizations - -These optimizations include complex node fusions. They are run after graph partitioning and are only applied to the nodes assigned to the CPU or CUDA execution provider. Available extended graph optimizations are as follows: - -| Optimization | Execution Provider | Comment | -|---------------------------------|--------------------|-----------------------------------------------------------------------------| -| GEMM Activation Fusion | cpu | | -| Matmul Add Fusion | cpu | | -| Conv Activation Fusion | cpu | | -| GELU Fusion | cpu or cuda | | -| Layer Normalization Fusion | cpu or cuda | | -| BERT Embedding Layer Fusion | cpu or cuda | Fuse BERT embedding layer, layer normalization and attention mask length | -| Attention Fusion | cpu or cuda | Attention mask has approximation in cuda execution provider | -| Skip Layer Normalization Fusion | cpu or cuda | Fuse bias of fully connected layer, skip connection and layer normalization | -| Bias GELU Fusion | cpu or cuda | Fuse bias of fully connected layer and GELU activation | -| GELU Approximation | cuda | Erf is approximated by a formula using tanh function | - -To optimize inference performance of BERT model, approximation is used in GELU approximation and Attention fusion for cuda execution provider. There might be slight difference in result. The impact on accuracy could be neglected based on our evaluation: F1 score for a BERT model on SQuAD v1.1 is almost same (87.05 vs 87.03). - -GELU approximation is disabled by default. - -### Layout Optimizations - -These optimizations change the data layout for applicable nodes to achieve higher performance improvements. They are run after graph partitioning and are only applied to nodes assigned to CPU execution provider. 
Available layout optimizations are as follows: - -* NCHWc Optimizer: Optimizes the graph by using NCHWc layout instead of NCHW layout. - -## Online/Offline Mode - -All optimizations can be performed either online or offline. In online mode, when initializing an inference session, we also apply all enabled graph optimizations before performing model inference. Applying all optimizations each time we initiate a session can add overhead to the model startup time (especially for complex models), which can be critical in production scenarios. This is where the offline mode can bring a lot of benefit. In offline mode, after performing graph optimizations, ONNX Runtime serializes the resulting model to disk. Subsequently, when new inference sessions are created for this model, we can instead use the already optimized model to reduce startup time. - -**Notes**: - -* When running in offline mode, make sure to use the exact same options (e.g., execution providers, optimization level) and hardware as the target machine that the model inference will run on (e.g., you cannot run a model pre-optimized for a GPU execution provider on a machine that is equipped only with CPU). -* When layout optimizations are enabled, the offline mode can only be used on compatible hardware to the environment when the offline model is saved. For example, if model has layout optimized for AVX2, the offline model would require CPUs that support AVX2. - -## Usage - -### General Note -**Levels**: -ONNX Runtime defines the `GraphOptimizationLevel` enum to determine which of the aforementioned optimization levels will be enabled. Choosing a level enables the optimizations of that level, as well as the optimizations of all preceding levels. For example, enabling Extended optimizations, also enables Basic optimizations. 
The mapping of these levels to the enum is as follows: - -* GraphOptimizationLevel::ORT_DISABLE_ALL -> Disables all optimizations -* GraphOptimizationLevel::ORT_ENABLE_BASIC -> Enables basic optimizations -* GraphOptimizationLevel::ORT_ENABLE_EXTENDED -> Enables basic and extended optimizations -* GraphOptimizationLevel::ORT_ENABLE_ALL -> Enables all available optimizations including layout optimizations - -**Online/Offline Mode**: -To enable serialization of the optimized model to disk, set the SessionOptions option `optimized_model_path` to the desired path where the optimized model will be stored. - -### Python API Usage -```python -import onnxruntime as rt - -sess_options = rt.SessionOptions() - -# Set graph optimization level -sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_EXTENDED - -# To enable model serialization after graph optimization set this -sess_options.optimized_model_filepath = "<model_output_path/optimized_model.onnx>" - -session = rt.InferenceSession("<model_path>", sess_options) -``` - -### C API Example: -```c - const OrtApi* g_ort = OrtGetApi(ORT_API_VERSION); - OrtEnv* env; - g_ort->CreateEnv(ORT_LOGGING_LEVEL_WARNING, "test", &env); - OrtSessionOptions* session_options; - g_ort->CreateSessionOptions(&session_options); - - // Set graph optimization level - g_ort->SetSessionGraphOptimizationLevel(session_options, ORT_ENABLE_EXTENDED); - - // To enable model serialization after graph optimization set this - const wchar_t* optimized_model_path = L"optimized_model_path"; - g_ort->SetOptimizedModelFilePath(session_options, optimized_model_path); - - OrtSession* session; - const wchar_t* model_path = L"model_path"; - g_ort->CreateSession(env, model_path, session_options, &session); -``` - -### C# API Example: -```c# -SessionOptions so = new SessionOptions(); - -// Set graph optimization level -so.GraphOptimizationLevel = GraphOptimizationLevel.ORT_ENABLE_EXTENDED; - -// To enable model serialization after graph optimization set this -so.OptimizedModelFilePath = 
"model_output_path\optimized_model.onnx" - -var session = new InferenceSession(modelPath, so); -``` - -### C++ API Example: -```c++ -Ort::SessionOptions session_options; - -// Set graph optimization level -session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); - -// To enable model serialization after graph optimization set this -session_options.SetOptimizedModelFilePath("optimized_file_path"); - -auto session_ = Ort::Session(env, "model_file_path", session_options); -``` diff --git a/docs/ONNX_Runtime_Perf_Tuning.md b/docs/ONNX_Runtime_Perf_Tuning.md deleted file mode 100644 index c7040b8a0e..0000000000 --- a/docs/ONNX_Runtime_Perf_Tuning.md +++ /dev/null @@ -1,179 +0,0 @@ -# ONNX Runtime Performance Tuning - -ONNX Runtime gives high performance across a range of hardware options by providing "Execution Providers" to interface to different execution environments. See: [design overview](./HighLevelDesign.md), [supported execution providers](../README.md#supported-accelerators). - -Along with this flexibility comes decisions for tuning and usage. For each model running with each execution provider, there are settings that can be tuned (e.g. thread number, wait policy, etc) to improve performance. - -This document covers basic tools and knobs that can be leveraged to find the best performance for your model and hardware. 
- -**Topics** -* [Performance Tuning Tools](#Performance-Tuning-Tools) -* [Using different Execution Providers](#Using-different-Execution-Providers) -* [Which Execution Provider will provide the best performance?](#Which-Execution-Provider-will-provide-the-best-performance) -* [Tuning performance for specific Execution Providers](#Tuning-performance-for-specific-Execution-Providers) -* [Troubleshooting model performance issues](#Troubleshooting-model-performance-issues) -*** - -## Performance Tuning Tools -The [ONNX Go Live "OLive" tool](https://github.com/microsoft/OLive) is an easy-to-use pipeline for converting models to ONNX and optimizing performance with ONNX Runtime. The tool can help identify the optimal runtime configuration to get the best performance on the target hardware for the model. -As a quickstart, please see the notebooks: [Python](https://github.com/microsoft/OLive/blob/master/notebook/Convert_Models_and_Tune_Performance_with_OLive_Python_SDK.ipynb), [Docker images](https://github.com/microsoft/OLive/blob/master/notebook/Convert_Models_and_Tune_Performance_with_OLive_Docker_Images.ipynb) - - -### Profiling and Performance Report - -The onnxruntime_perf_test.exe tool (available from the build drop) can be used to test various knobs. Please find the usage instructions using `onnxruntime_perf_test.exe -h`. - -You can enable ONNX Runtime latency profiling in code: - -```python -import onnxruntime as rt - -sess_options = rt.SessionOptions() -sess_options.enable_profiling = True -``` -If you are using the onnxruntime_perf_test.exe tool, you can add `-p [profile_file]` to enable performance profiling. - -In both cases, you will get a JSON file which contains the detailed performance data (threading, latency of each operator, etc). 
This file is a standard performance tracing file, and to view it in a user friendly way, you can open it by using chrome://tracing: -* Open chrome browser -* Type chrome://tracing in the address bar -* Load the generated JSON file - -## Using different Execution Providers -To learn more about different Execution Providers, see [docs/execution_providers](./execution_providers). - -### Python API -Official Python packages on Pypi only support the default CPU (MLAS) and default GPU (CUDA) execution providers. For other execution providers, you need to build from source. Please refer to the [build instructions](../BUILD.md). The recommended instructions build the wheel with debug info in parallel. - -For example: - -`DNNL: ./build.sh --config RelWithDebInfo --use_dnnl --build_wheel --parallel` - -`CUDA: ./build.sh --config RelWithDebInfo --use_cuda --build_wheel --parallel` - - -### C and C# API -Official release (nuget package) supports default (MLAS) and MKL-ML for CPU, and CUDA for GPU. For other execution providers, you need to build from source. Append `--build_csharp` to the instructions to build both C# and C packages. - -For example: - -`DNNL: ./build.sh --config RelWithDebInfo --use_dnnl --build_csharp --parallel` - -`CUDA: ./build.sh --config RelWithDebInfo --use_cuda --build_csharp --parallel` - -In order to use DNNL, CUDA, or TensorRT execution provider, you need to call the C API OrtSessionOptionsAppendExecutionProvider.
Here is an example for the CUDA execution provider: - -C API Example: -```c - const OrtApi* g_ort = OrtGetApi(ORT_API_VERSION); - OrtEnv* env; - g_ort->CreateEnv(ORT_LOGGING_LEVEL_WARNING, "test", &env); - OrtSessionOptions* session_options; - g_ort->CreateSessionOptions(&session_options); - OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0); - OrtSession* session; - g_ort->CreateSession(env, model_path, session_options, &session); -``` - -C# API Example: -```c# -SessionOptions so = new SessionOptions(); -so.GraphOptimizationLevel = GraphOptimizationLevel.ORT_ENABLE_EXTENDED; -so.AppendExecutionProvider_CUDA(0); -var session = new InferenceSession(modelPath, so); -``` - -Python API Example: -```python -import onnxruntime as rt - -so = rt.SessionOptions() -so.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL -session = rt.InferenceSession(model, sess_options=so) -session.set_providers(['CUDAExecutionProvider']) -``` - -## Which Execution Provider will provide the best performance? -Performance is dependent on the specific model you're trying to run, the session and run options you've selected, and of course, your specific hardware target. Below you'll find some more information that may be helpful to select the right Execution Provider. - -### CUDA (Default GPU) or CPU? -The CPU version of ONNX Runtime provides a complete implementation of all operators in the ONNX spec. This ensures that your ONNX-compliant model can execute successfully. In order to keep the binary size small, common data types are supported for the ops. If you are using an uncommon data type that is not supported, you can file an issue and/or contribute a PR (see examples - [PR #2112](https://github.com/microsoft/onnxruntime/pull/2112), [PR #2034](https://github.com/microsoft/onnxruntime/pull/2034), [PR #1565](https://github.com/microsoft/onnxruntime/pull/1565)). Please make sure you provide details on usage justification.
- -Additionally, not all CUDA kernels are implemented, as these have been prioritized on an as-needed basis. This means that if your model contains operators that do not have a CUDA implementation, it will fall back to CPU. Switching between CPU and GPU can cause significant performance impact. If you require a specific operator that is not currently supported, please consider [contributing](./../CONTRIBUTING.md) and/or [file an issue](https://github.com/microsoft/onnxruntime/issues) clearly describing your use case and share your model if possible. - -### TensorRT or CUDA? -TensorRT and CUDA are separate execution providers for ONNX Runtime. On the same hardware, TensorRT will generally provide better performance; however, this depends on the specific model and whether the operators in the model can be supported by TensorRT. In cases where TensorRT cannot handle the subgraph(s), it will fall back to CUDA. Note that the TensorRT EP may depend on a different version of CUDA than the CUDA EP. - -### TensorRT/CUDA or DirectML? -DirectML is the hardware-accelerated DirectX 12 library for machine learning on Windows and supports all DirectX 12 capable devices (Nvidia, Intel, AMD). This means that if you are targeting Windows GPUs, using the DirectML Execution Provider is likely your best bet. This can be used with both the ONNX Runtime as well as [WinML APIs](./WinRT_API.md). - -## Tuning performance for specific Execution Providers - -### Thread management -* If ORT is built with OpenMP, use the OpenMP env variable to control the number of intra op num threads. -* If ORT is not built with OpenMP, use the appropriate ORT API to control intra op num threads. -* Inter op num threads (used only when parallel execution is enabled) is not affected by OpenMP settings and should -always be set using the ORT APIs. - -### Default CPU Execution Provider (MLAS) -The default execution provider uses different knobs to control the thread number. 
- -For the default CPU execution provider, you can try following knobs in the Python API: -```python -import onnxruntime as rt - -sess_options = rt.SessionOptions() - -sess_options.intra_op_num_threads = 2 -sess_options.execution_mode = rt.ExecutionMode.ORT_SEQUENTIAL -sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL -``` - -* Thread Count - * `sess_options.intra_op_num_threads = 2` controls the number of threads to use to run the model -* Sequential vs Parallel Execution - * `sess_options.execution_mode = rt.ExecutionMode.ORT_SEQUENTIAL` controls whether the operators in the graph run sequentially or in parallel. Usually when a model has many branches, setting this option to false will provide better performance. - * When `sess_options.execution_mode = rt.ExecutionMode.ORT_PARALLEL`, you can set `sess_options.inter_op_num_threads` to control the -number of threads used to parallelize the execution of the graph (across nodes). - -* sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL. Default is already ORT_ENABLE_ALL(99). Please see [onnxruntime_c_api.h](../include/onnxruntime/core/session/onnxruntime_c_api.h#L241) (enum GraphOptimizationLevel) for the full list of all optimization levels. For details regarding available optimizations and usage please refer to the [Graph Optimizations Doc](../docs/ONNX_Runtime_Graph_Optimizations.md). - -### MKL_DNN/MKL_ML Execution Provider -MKL_DNN and MKL_ML depend on openmp for parallelization. For those execution providers, we need to use the openmp environment variable to tune the performance. 
- -The most widely used environment variables are: - -* OMP_NUM_THREADS=n - * Controls the thread pool size - -* OMP_WAIT_POLICY=PASSIVE/ACTIVE - * Controls whether thread spinning is enabled - * PASSIVE is also called throughput mode and will yield CPU after finishing current task - * ACTIVE will not yield CPU, instead it will have a while loop to check whether the next task is ready - * Use PASSIVE if your CPU usage already high, and use ACTIVE when you want to trade CPU with latency - -## Using and configuring shared arena based allocator to reduce memory consumption between multiple sessions -See `Share allocator(s) between sessions` section in [C API documentation](C_API.md). - -## Troubleshooting model performance issues -The answers below are troubleshooting suggestions based on common previous user-filed issues and questions. This list is by no means exhaustive and there is a lot of case-by-case fluctuation depending on the model and specific usage scenario. Please use this information to guide your troubleshooting, search through previously filed issues for related topics, and/or file a new issue if your problem is still not resolved. - -### Performance Troubleshooting Checklist -Here is a list of things to check through when assessing performance issues. -* Are you using OpenMP? OpenMP will parallelize some of the code for potential performance improvements. This is not recommended for running on single threads. -* Have you enabled all [graph optimizations](./ONNX_Runtime_Graph_Optimizations.md)? The official published packages do enable all by default, but when building from source, check that these are enabled in your build. -* Have you searched through prior filed [Github issues](https://github.com/microsoft/onnxruntime/issues) to see if your problem has been discussed previously? Please do this before filing new issues. -* If using CUDA or TensorRT, do you have the right versions of the dependent libraries installed? 
- -### I need help performance tuning for BERT models. -For BERT models, sometimes ONNX Runtime cannot apply the best optimization due to reasons such as framework version updates. We recommend trying out the [BERT optimization tool](https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/bert), which reflects the latest changes in graph pattern matching and model conversions, and a set of [notebooks](https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/bert/notebooks) to help get started. - -### Why is the model graph not optimized even with graph_optimization_level set to ORT_ENABLE_ALL? -The ONNX model from IR_VERSION 4 only treats initializers that appear in graph input as non-constant. This may fail some of the graph optimizations, like const folding, operator fusion and etc. Move initializers out of graph inputs if there is no need to override them, by either re-generating the model with latest exporter/converter or with the tool [remove_initializer_from_input.py](./../tools/python/remove_initializer_from_input.py). - -### Why is my model running slower on GPU than CPU? -Depending on which execution provider you're using, it may not have full support for all the operators in your model. Fallback to CPU ops can cause hits in performance speed. Moreover even if an op is implemented by the CUDA execution provider, it may not necessarily assign/place the op to the CUDA EP due to performance reasons. To see the placement decided by ORT, turn on verbose logging and look at the console output. - -### My converted Tensorflow model is slow - why? -NCHW and NHWC are two different memory layout for 4-D tensors. - -Most TensorFlow operations used by a CNN support both NHWC and NCHW data format. The Tensorflow team suggests that on GPU NCHW is faster but on CPU NHWC is sometimes faster in Tensorflow. However, ONNX only supports NCHW. 
As a result, if the original model is in NHWC format, when the model is converted extra transposes may be added. The [tensorflow-onnx](https://github.com/onnx/tensorflow-onnx) and [keras-onnx](https://github.com/onnx/keras-onnx) converters do remove many of these transposes, but if this doesn't help sufficiently, consider retraining the model using NCHW. diff --git a/docs/PyOp.md b/docs/PyOp.md deleted file mode 100644 index a494362205..0000000000 --- a/docs/PyOp.md +++ /dev/null @@ -1,136 +0,0 @@ -# Python Operator - -**Deprecation Note: This feature is deprecated and no longer supported, please refer to [onnxruntime_customops](https://github.com/microsoft/ort-customops) project for this function.** - -The Python Operator provides the capability to easily invoke any custom Python code within a single node of an ONNX graph using ONNX Runtime. This can be useful for quicker experimentation when a model requires operators that are not officially supported in ONNX and ONNX Runtime, particularly if there is already a Python implementation for the required functionality. This should be used with discretion in production scenarios, and all security or other risks should be considered beforehand. - -## Design Overview -The feature can be found under [onnxruntime/core/language_interop_ops](../onnxruntime/core/language_interop_ops). -Here is a chart of calling sequence: -
-onnxruntime                        python capi                         script
-     |                                  |                                 |
-     | ------------------------------>  |                                 |
-     |       call with tensor(s)        | ------------------------------> |
-     |                                  |         call with numpy(s)      | 
-     |                                  |                                 | compute
-     |                                  | <------------------------------ |
-     | <------------------------------  |           return numpys(s)      |
-     |         return tensor(s)         |                                 |
-
- -## How to Use -### Step 1 -Build onnxruntime with `--config Release --enable_language_interop_ops --build_wheel` and pip install the latest wheel file. - -### Step 2 -Create an onnx model containing Python operator nodes: -```python -ad1_node = helper.make_node('Add', ['A','B'], ['S']) -mul_node = helper.make_node('Mul', ['C','D'], ['P']) -py1_node = helper.make_node(op_type = 'PyOp', #required, must be 'PyOp' - inputs = ['S','P'], #required - outputs = ['L','M','N'], #required - domain = 'pyopmulti_1', #required, must be unique - input_types = [TensorProto.FLOAT, TensorProto.FLOAT], #required - output_types = [TensorProto.FLOAT, TensorProto.FLOAT, TensorProto.FLOAT], #required - module = 'mymodule', #required - class_name = 'Multi_1', #required - compute = 'compute', #optional, 'compute' by default - W1 = '5', W2 = '7', W3 = '9') #optional, must all be strings -ad2_node = helper.make_node('Add', ['L','M'], ['H']) -py2_node = helper.make_node('PyOp',['H','N','E'],['O','W'], domain = 'pyopmulti_2', - input_types = [TensorProto.FLOAT, TensorProto.FLOAT, TensorProto.FLOAT], - output_types = [TensorProto.FLOAT, TensorProto.FLOAT], - module = 'mymodule', class_name = 'Multi_2') -sub_node = helper.make_node('Sub', ['O','W'], ['F']) -graph = helper.make_graph([ad1_node,mul_node,py1_node,ad2_node,py2_node,sub_node], 'multi_pyop_graph', [A,B,C,D,E], [F]) -model = helper.make_model(graph, producer_name = 'pyop_model') -onnx.save(model, './model.onnx') -``` -### Step 3 -Implement mymodule.py: -```python -class Multi_1: - def __init__(self, W1, W2, W3): - self.W1 = int(W1) - self.W2 = int(W2) - self.W3 = int(W3) - def compute(self, S, P): - ret = S + P - return ret + self.W1, ret + self.W2, ret + self.W3 -class Multi_2: - def compute(self, *kwargs): - return sum(kwargs[0:-1]), sum(kwargs[1:]) -``` -### Step 4 -Copy mymodule.py into Python sys.path, then run the model with onnxruntime python API. On Windows, please set PYTHONHOME beforehand. 
It should point to directory where the python is installed, such as C:\Python37 or C:\ProgramData\Anaconda3\envs\myconda1 if it is in conda. - -## Supported Data Types -* TensorProto.BOOL -* TensorProto.UINT8 -* TensorProto.UINT16 -* TensorProto.UINT32 -* TensorProto.INT16 -* TensorProto.INT32 -* TensorProto.FLOAT -* TensorProto.DOUBLE - -## Limitations -* Inferencing and compiling environments must be installed with same version of python. -* On Windows, `--config Debug` has known issues. Please build with `--config RelWithDebInfo` if debugging symbols are needed. -* Due to Python C API restrictions, multi-threading is disabled so Python operators will run sequentially. - -## Test Coverage -The operator has been tested on multiple platforms, with or without conda: - -Platform | Python 3.5 | Python 3.6 | Python 3.7 ------------ | ------------| ----------- | ----------- -Windows | (conda) passed | (conda) passed | passed -Linux | (conda) passed | (conda) passed | passed -Mac | (conda) passed | (conda) passed | (conda) passed - -## Example -Developers could resort to PyOp during model conversion for missing operators: -```python -import os -import numpy as np -from onnx import * -from skl2onnx import convert_sklearn -from skl2onnx.common.data_types import FloatTensorType -from skl2onnx.common.utils import check_input_and_output_numbers - -X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]],dtype=np.single) -nmf = NMF(n_components=2, init='random', random_state=0) -W = np.array(nmf.fit_transform(X), dtype=np.single) - -def calculate_sklearn_nmf_output_shapes(operator): - check_input_and_output_numbers(operator, output_count_range=1, input_count_range=1) - operator.outputs[0].type.shape = operator.inputs[0].type.shape - -def convert_nmf(scope, operator, container): - ws = [str(w) for w in W.flatten()] - attrs = {'W':'|'.join(ws)} - container.add_node(op_type='PyOp', name='nmf', inputs=['X'], outputs=['variable'], - op_version=10, op_domain='MyDomain', 
module='mymodule', class_name='MyNmf', - input_types=[TensorProto.FLOAT], output_types=[TensorProto.FLOAT], **attrs) - -custom_shape_calculators = {type(nmf): calculate_sklearn_nmf_output_shapes} -custom_conversion_functions = {type(nmf): convert_nmf} -initial_types = [('X', FloatTensorType([6,2]))] -onx = convert_sklearn(nmf, '', initial_types, '', None, custom_conversion_functions, custom_shape_calculators) -with th open("model.onnx", "wb") as f: - f.write(onx.SerializeToString()) -``` -mymodule.py: -```python -import numpy as np -class MyNmf: - def __init__(self,W): - A = [] - for w in W.split('|'): - A.append(float(w)) - self.__W = np.array(A,dtype=np.single).reshape(6,2) - def compute(self,X): - return self.__W -``` diff --git a/docs/WinRT_API.md b/docs/WinRT_API.md deleted file mode 100644 index 566381bd9f..0000000000 --- a/docs/WinRT_API.md +++ /dev/null @@ -1,37 +0,0 @@ -# Windows Machine Learning WinRT API - -New in the ONNX Runtime Nuget package is the ability to use the full [WinML API](https://docs.microsoft.com/en-us/windows/ai/windows-ml/api-reference). - -This allows scenarios such as passing a [Windows.Media.VideoFrame](https://docs.microsoft.com/en-us/uwp/api/Windows.Media.VideoFrame) from your connected camera directly into the runtime for realtime inference. - -The WinML API is a WinRT API that shipped inside the Windows OS starting with build 1809 (RS5) in the Windows.AI.MachineLearning namespace. It embedded a version of the ONNX Runtime. - -Many customers have asked for a way to use this offering as an application redistributable package. - -With our new [layered architecture](InferenceHighLevelDesign.md#the-onnx-runtime-and-windows-os-integration) you can now do this, with some limitations. The WinML APIs have been lifted and mirrored into the Microsoft.AI.MachineLearning namespace in the redistributable. 
- -## NuGet Package - -The Microsoft.AI.MachineLearning [Nuget package](https://www.nuget.org/packages/Microsoft.AI.MachineLearning/) includes the precompiled binaries for using the ONNX runtime with the WinRT API. Support is compiled directly into *onnxruntime.dll* - -Note: As of the 1.3 release, you can use all of the CPU and GPU functionality from these binaries. - -## Sample Code - -Any code already written for the Windows.AI.MachineLearning API can be easily modified to run against the Microsoft.ML.OnnxRuntime package. All types originally referenced by inbox customers via the Windows namespace will need to be updated to now use the Microsoft namespace. Check out these [existing samples](https://github.com/microsoft/Windows-Machine-Learning/tree/master/Samples/SqueezeNetObjectDetection/Desktop/cpp) in github. - -## Deciding on whether to use WinML in the Windows SDK or the Redist -To detect if a particular OS version of Windows has the WinML APIs, use the [IsApiContractPresent](https://docs.microsoft.com/en-us/uwp/api/windows.foundation.metadata.apiinformation.isapicontractpresent) method. This can be called from either UWP or native apps. - -If the OS does not have the runtime you need you can switch to use the redist binaries instead. - -|Release|API contract version| -|--|--| -|Windows OS 1809| 1| -|Windows OS 1903| 2| -|Windows OS 1909| 2| -|ORT release 1.2| 3| -|ORT release 1.3| 3| -|ORT release 1.4| 3| - -See [here](https://docs.microsoft.com/en-us/windows/ai/windows-ml/onnx-versions) for more about opsets and ONNX version details in Windows OS distributions. 
diff --git a/docs/execution_providers/ACL-ExecutionProvider.md b/docs/execution_providers/ACL-ExecutionProvider.md deleted file mode 100644 index 40b9a49693..0000000000 --- a/docs/execution_providers/ACL-ExecutionProvider.md +++ /dev/null @@ -1,21 +0,0 @@ -## ACL Execution Provider - -[Arm Compute Library](https://github.com/ARM-software/ComputeLibrary) is an open source inference engine maintained by Arm and Linaro companies. The integration of ACL as an execution provider (EP) into ONNX Runtime accelerates performance of ONNX model workloads across Armv8 cores. - -### Build ACL execution provider -For build instructions, please see the [BUILD page](../../BUILD.md#ARM-Compute-Library). - -### Using the ACL execution provider -#### C/C++ -``` -Ort::Env env = Ort::Env{ORT_LOGGING_LEVEL_ERROR, "Default"}; -Ort::SessionOptions sf; -bool enable_cpu_mem_arena = true; -Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ACL(sf, enable_cpu_mem_arena)); -``` -The C API details are [here](../C_API.md#c-api). - -### Performance Tuning -For performance tuning, please see guidance on this page: [ONNX Runtime Perf Tuning](../ONNX_Runtime_Perf_Tuning.md) - -When/if using [onnxruntime_perf_test](../../onnxruntime/test/perftest), use the flag -e acl diff --git a/docs/execution_providers/ArmNN-ExecutionProvider.md b/docs/execution_providers/ArmNN-ExecutionProvider.md deleted file mode 100644 index 35f2fc2da8..0000000000 --- a/docs/execution_providers/ArmNN-ExecutionProvider.md +++ /dev/null @@ -1,22 +0,0 @@ -## ArmNN Execution Provider - -[ArmNN](https://github.com/ARM-software/armnn) is an open source inference engine maintained by Arm and Linaro companies. The integration of ArmNN as an execution provider (EP) into ONNX Runtime accelerates performance of ONNX model workloads across Armv8 cores. - -### Build ArmNN execution provider -For build instructions, please see the [BUILD page](../../BUILD.md#ArmNN). 
- -### Using the ArmNN execution provider -#### C/C++ -To use ArmNN as execution provider for inferencing, please register it as below. -``` -Ort::Env env = Ort::Env{ORT_LOGGING_LEVEL_ERROR, "Default"}; -Ort::SessionOptions sf; -bool enable_cpu_mem_arena = true; -Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ArmNN(sf, enable_cpu_mem_arena)); -``` -The C API details are [here](../C_API.md#c-api). - -### Performance Tuning -For performance tuning, please see guidance on this page: [ONNX Runtime Perf Tuning](../ONNX_Runtime_Perf_Tuning.md) - -When/if using [onnxruntime_perf_test](../../onnxruntime/test/perftest), use the flag -e armnn diff --git a/docs/execution_providers/DNNL-ExecutionProvider.md b/docs/execution_providers/DNNL-ExecutionProvider.md deleted file mode 100644 index 0952cccbe1..0000000000 --- a/docs/execution_providers/DNNL-ExecutionProvider.md +++ /dev/null @@ -1,35 +0,0 @@ -# DNNL Execution Provider - -Intel® Math Kernel Library for Deep Neural Networks (Intel® DNNL) is an open-source performance library for deep-learning applications. The library accelerates deep-learning applications and frameworks on Intel® architecture and Intel® Processor Graphics Architecture. Intel DNNL contains vectorized and threaded building blocks that you can use to implement deep neural networks (DNN) with C and C++ interfaces. For more, please see the DNNL documentation on (https://intel.github.io/mkl-dnn/). - -Intel and Microsoft have developed DNNL Execution Provider (EP) for ONNX Runtime to accelerate performance of ONNX Runtime using Intel® Math Kernel Library for Deep Neural Networks (Intel® DNNL) optimized primitives. - -For information on how DNNL optimizes subgraphs, see [Subgraph Optimization](./MKL-DNN-Subgraphs.md) - -## Build -For build instructions, please see the [BUILD page](../../BUILD.md#dnnl-and-mklml). 
- -## Supported OS -* Ubuntu 16.04 -* Windows 10 -* Mac OS X - -## Supported backend -* CPU - -## Using the DNNL Execution Provider -### C/C++ -The DNNLExecutionProvider execution provider needs to be registered with ONNX Runtime to enable in the inference session. -``` -Ort::Env env = Ort::Env{ORT_LOGGING_LEVEL_ERROR, "Default"}; -Ort::SessionOptions sf; -bool enable_cpu_mem_arena = true; -Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Dnnl(sf, enable_cpu_mem_arena)); -``` -The C API details are [here](../C_API.md#c-api). - -### Python -When using the python wheel from the ONNX Runtime built with DNNL execution provider, it will be automatically prioritized over the CPU execution provider. Python APIs details are [here](https://aka.ms/onnxruntime-python). - -## Performance Tuning -For performance tuning, please see guidance on this page: [ONNX Runtime Perf Tuning](../ONNX_Runtime_Perf_Tuning.md) diff --git a/docs/execution_providers/DirectML-ExecutionProvider.md b/docs/execution_providers/DirectML-ExecutionProvider.md deleted file mode 100644 index 3a1c7b5292..0000000000 --- a/docs/execution_providers/DirectML-ExecutionProvider.md +++ /dev/null @@ -1,126 +0,0 @@ -# DirectML Execution Provider - -DirectML is a high-performance, hardware-accelerated DirectX 12 library for machine learning on Windows. DirectML provides GPU acceleration for common machine learning tasks across a broad range of supported hardware and drivers. - -When used standalone, the DirectML API is a low-level DirectX 12 library and is suitable for high-performance, low-latency applications such as frameworks, games, and other real-time applications. The seamless interoperability of DirectML with Direct3D 12 as well as its low overhead and conformance across hardware makes DirectML ideal for accelerating machine learning when both high performance is desired, and the reliability and predictability of results across hardware is critical. 
- -The *DirectML Execution Provider* is an optional component of ONNX Runtime that uses DirectML to accelerate inference of ONNX models. The DirectML execution provider is capable of greatly improving evaluation time of models using commodity GPU hardware, without sacrificing broad hardware support or requiring vendor-specific extensions to be installed. - -The DirectML Execution Provider currently uses DirectML version 1.4.0. - -## Table of contents - -- [DirectML Execution Provider](#directml-execution-provider) - - [Table of contents](#table-of-contents) - - [Minimum requirements](#minimum-requirements) - - [Building from source](#building-from-source) - - [Using the DirectML execution provider](#using-the-directml-execution-provider) - - [`OrtSessionOptionsAppendExecutionProvider_DML` function](#ortsessionoptionsappendexecutionprovider_dml-function) - - [`OrtSessionOptionsAppendExecutionProviderEx_DML` function](#ortsessionoptionsappendexecutionproviderex_dml-function) - - [ONNX opset support](#onnx-opset-support) - - [Multi-threading and supported session options](#multi-threading-and-supported-session-options) - - [Samples](#samples) - - [Performance best practices](#performance-best-practices) - - [See also](#see-also) - -## Minimum requirements - -The DirectML execution provider requires any DirectX 12 capable device. Almost all commercially-available graphics cards released in the last several years support DirectX 12. Examples of compatible hardware include: - -* NVIDIA Kepler (GTX 600 series) and above -* AMD GCN 1st Gen (Radeon HD 7000 series) and above -* Intel Haswell (4th-gen core) HD Integrated Graphics and above - -DirectML is compatible with Windows 10, version 1709 (10.0.16299; RS3, "Fall Creators Update") and newer. - - - -## Building from source - -For general information about building onnxruntime, see [BUILD.md](../../BUILD.md). - -Requirements for building the DirectML execution provider: -1. 
Visual Studio 2017 toolchain (see [cmake configuration instructions](../../BUILD.md)) -2. [The Windows 10 SDK (10.0.18362.0) for Windows 10, version 1903](https://developer.microsoft.com/en-us/windows/downloads/windows-10-sdk) (or newer) - -To build onnxruntime with the DML EP included, supply the `--use_dml` parameter to `build.bat`. e.g. - - build.bat --config RelWithDebInfo --build_shared_lib --parallel --use_dml - -The DirectML execution provider supports building for both x64 (default) and x86 architectures. - -Note that building onnxruntime with the DirectML execution provider enabled causes the the DirectML redistributable package to be automatically downloaded as part of the build. Its use is governed by a license whose text may be found as part of the NuGet package. - - - -## Using the DirectML execution provider - -When using the [C API](../C_API.md) with a DML-enabled build of onnxruntime (see [Building from source](#building-from-source)), the DirectML execution provider can be enabled using one of the two factory functions included in `include/onnxruntime/core/providers/dml/dml_provider_factory.h`. - -### `OrtSessionOptionsAppendExecutionProvider_DML` function - - Creates a DirectML Execution Provider which executes on the hardware adapter with the given `device_id`, also known as the adapter index. The device ID corresponds to the enumeration order of hardware adapters as given by [IDXGIFactory::EnumAdapters](https://docs.microsoft.com/windows/win32/api/dxgi/nf-dxgi-idxgifactory-enumadapters). A `device_id` of 0 always corresponds to the default adapter, which is typically the primary display GPU installed on the system. A negative `device_id` is invalid. 
- - OrtStatus* OrtSessionOptionsAppendExecutionProvider_DML( - _In_ OrtSessionOptions* options, - int device_id - ); - -### `OrtSessionOptionsAppendExecutionProviderEx_DML` function - -Creates a DirectML Execution Provider using the given DirectML device, and which executes work on the supplied D3D12 command queue. The DirectML device and D3D12 command queue must have the same parent [ID3D12Device](https://docs.microsoft.com/windows/win32/api/d3d12/nn-d3d12-id3d12device), or an error will be returned. The D3D12 command queue must be of type `DIRECT` or `COMPUTE` (see [D3D12_COMMAND_LIST_TYPE](https://docs.microsoft.com/windows/win32/api/d3d12/ne-d3d12-d3d12_command_list_type)). If this function succeeds, the inference session once created will maintain a strong reference on both the `dml_device` and `command_queue` objects. - - OrtStatus* OrtSessionOptionsAppendExecutionProviderEx_DML( - _In_ OrtSessionOptions* options, - _In_ IDMLDevice* dml_device, - _In_ ID3D12CommandQueue* cmd_queue - ); - -**See Also** - -[DMLCreateDevice function](https://docs.microsoft.com/windows/win32/api/directml/nf-directml-dmlcreatedevice) -[ID3D12Device::CreateCommandQueue method](https://docs.microsoft.com/windows/win32/api/d3d12/nf-d3d12-id3d12device-createcommandqueue) -[Direct3D 12 programming guide](https://docs.microsoft.com/windows/win32/direct3d12/directx-12-programming-guide) - -### ONNX opset support - -The DirectML execution provider currently supports ONNX opset 11 ([ONNX v1.6](https://github.com/onnx/onnx/releases/tag/v1.6.0)). Evaluating models which require a higher opset version is not supported, and may produce unexpected results. - -### Multi-threading and supported session options - -The DirectML execution provider does not support the use of memory pattern optimizations or parallel execution in onnxruntime. When supplying session options during InferenceSession creation, these options must be disabled or an error will be returned. 
- -If using the onnxruntime C API, you must call `DisableMemPattern` and `SetSessionExecutionMode` functions to set the options required by the DirectML execution provider. - -See [onnxruntime\include\onnxruntime\core\session\onnxruntime_c_api.h](../../include/onnxruntime/core/session/onnxruntime_c_api.h). - - OrtStatus*(ORT_API_CALL* DisableMemPattern)(_Inout_ OrtSessionOptions* options)NO_EXCEPTION; - - OrtStatus*(ORT_API_CALL* SetSessionExecutionMode)(_Inout_ OrtSessionOptions* options, ExecutionMode execution_mode)NO_EXCEPTION; - -If creating the onnxruntime InferenceSession object directly, you must set the appropriate fields on the `onnxruntime::SessionOptions` struct. Specifically, `execution_mode` must be set to `ExecutionMode::ORT_SEQUENTIAL`, and `enable_mem_pattern` must be `false`. - -Additionally, as the DirectML execution provider does not support parallel execution, it does not support multi-threaded calls to `Run` on the same inference session. That is, if an inference session using the DirectML execution provider, only one thread may call `Run` at a time. Multiple threads are permitted to call `Run` simultaneously if they operate on different inference session objects. - -## Samples - -A complete sample of onnxruntime using the DirectML execution provider can be found under [samples/c_cxx/fns_candy_style_transfer](../../samples/c_cxx/fns_candy_style_transfer). - -## Performance best practices -The DirectML execution provider works most efficiently when tensor shapes are known at the time a session is created. This provides a few performance benefits: -1) Because constant folding can occur more often, there may be fewer CPU / GPU copies and stalls during evaluations. -2) More initialization work occurs when sessions are created rather than during the first evaluation. -3) Weights may be pre-processed within DirectML, enabling more efficient algorithms to be used. -4) Graph optimization occurs within DirectML. 
For example, Concat operators may be removed, and more optimal tensor layouts may be used for the input and output of operators. - -Normally when the shapes of model inputs are known during session creation, the shapes for the rest of the model are inferred by OnnxRuntime when a session is created. However if a model input contains a free dimension (such as for batch size), steps must be taken to retain the above performance benefits. - -In this case, there are three options: -- Edit the model to replace an input's free dimension (specified through ONNX using "dim_param") with a fixed size (specified through ONNX using "dim_value"). -- Specify values of named dimensions within model inputs when creating the session using the OnnxRuntime *AddFreeDimensionOverrideByName* ABI. -- Edit the model to ensure that an input's free dimension has a [denotation](https://github.com/onnx/onnx/blob/master/docs/DimensionDenotation.md) (such as "DATA_BATCH," or a custom denotation). Then when creating the session, specify the dimension size for each denotation. This can be done using the OnnxRuntime *AddFreeDimensionOverride* ABI. - - -## See also - -[DirectML documentation \(docs.microsoft.com\)](https://docs.microsoft.com/en-us/windows/win32/direct3d12/dml) diff --git a/docs/execution_providers/MIGraphX-ExecutionProvider.md b/docs/execution_providers/MIGraphX-ExecutionProvider.md deleted file mode 100644 index 4d4974eb50..0000000000 --- a/docs/execution_providers/MIGraphX-ExecutionProvider.md +++ /dev/null @@ -1,35 +0,0 @@ -# MIGraphX Execution Provider - -ONNX Runtime's [MIGraphX](https://github.com/ROCmSoftwarePlatform/AMDMIGraphX/) execution provider uses AMD's Deep Learning graph optimization engine to accelerate ONNX model on AMD GPUs. - -## Build -For build instructions, please see the [BUILD page](../../BUILD.md#AMD-MIGraphX). 
- -## Using the MIGraphX execution provider -### C/C++ -``` -Ort::Env env = Ort::Env{ORT_LOGGING_LEVEL_ERROR, "Default"}; -Ort::SessionOptions sf; -int device_id = 0; -Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MiGraphX(sf, device_id)); -``` -You can check [here](https://github.com/scxiao/ort_test/tree/master/char_rnn) for a specific c/c++ program. - -The C API details are [here](../C_API.md#c-api). - -### Python -When using the Python wheel from the ONNX Runtime build with MIGraphX execution provider, it will be automatically -prioritized over the default GPU or CPU execution providers. There is no need to separately register the execution -provider. Python APIs details are [here](../python/api_summary.rst#api-summary). - -You can check [here](https://github.com/scxiao/ort_test/tree/master/python/run_onnx) for a python script to run an -model on either the CPU or MIGraphX Execution Provider. - -## Performance Tuning -For performance tuning, please see guidance on this page: [ONNX Runtime Perf Tuning](../ONNX_Runtime_Perf_Tuning.md) - -When/if using [onnxruntime_perf_test](../../onnxruntime/test/perftest#onnxruntime-performance-test), use the flag `-e migraphx` - -## Configuring environment variables -MIGraphX providers an environment variable ORT_MIGRAPHX_FP16_ENABLE to enable the FP16 mode. - diff --git a/docs/execution_providers/MKL-DNN-Subgraphs.md b/docs/execution_providers/MKL-DNN-Subgraphs.md deleted file mode 100644 index 8bb451ddcb..0000000000 --- a/docs/execution_providers/MKL-DNN-Subgraphs.md +++ /dev/null @@ -1,65 +0,0 @@ -# Subgraph Optimization - -DNNL uses blocked layout (example: nhwc with channels blocked by 16 – nChw16c) to take advantage of vector operations using AVX512. To get best performance, we avoid reorders (example. Nchw16c to nchw) and propagate blocked layout to next primitive. - -Subgraph optimization achieves this in the following steps. -1. Parses ONNX Runtime graph and creates an Internal Representation of subgraph.. 
-2. Subgraph Operator (DnnlFuncKernel) iterates through DNNL nodes and creates a vector of DNNL kernels. -3. Compute Function of DnnlFuncKernel iterates and binds data to DNNL primitives in the vector and submits the vector for execution. - - -## Subgraph (IR) Internal Representation -DnnlExecutionProvider::GetCapability() parses the ONNX model graph and creates an IR (Internal Representation) of subgraphs of DNNL operators. -Each subgraph contains a vector of DnnlNodes, inputs, outputs and attributes for all its DnnlNodes. There can be attributes with the same name, so we prefix attribute names with the Node name and its index. -The unique id for a subgraph is set as an attribute. - -DnnlNode has an index to its inputs and outputs and a pointer to its parent nodes. DnnlNode directly reads blocked memory from its parent to avoid data reordering. - -

- - -## Subgraph Classes -Primitives like DnnlConv, DnnlPool, etc. are derived from the DnnlKernel base class. - -The following UML diagram captures the Subgraph classes. - -

- - -## Subgraph Execution - -The DnnlExecutionProvider::Compute() function creates a DnnlFuncKernel and calls its Compute function. - - -The DnnlFuncKernel::Compute function creates a SubgraphPrimitive pool and adds the object to a map. - -The SubgraphPrimitive constructor calls the following member functions -``` -SubgraphPrimitive::CreatePrimitives() - for (auto& mklnode : mklnodes) { - if (mklnode.name == "Conv") { - kernel.reset(new DnnlConv()); - kernels.push_back(kernel); - } else if (mklnode.name == "BatchNormalization-Relu") { - kernel.reset(new DnnlBatchNorm()); - context_.kernels.push_back(kernel); - } else if (mklnode.name == "MaxPool") { - kernel.reset(new DnnlPool()); - context_.kernels.push_back(kernel); - } - . - . - . -``` -In the CreatePrimitives method, we iterate over the DnnlNodes, create DnnlKernel objects, and add the DNNL primitives to a vector. It also reads attributes. This is done only once, at first iteration. - -``` -SubgraphPrimitive::Compute() - for (auto& kernel : kernels) { - kernel->Bind(input_tensors, output_tensors); - } - stream->submit(net); -``` - -In the SubgraphPrimitive::Compute() method, we iterate through the DNNL kernels and bind input data. Then we submit the vector of primitives to the DNNL stream. - diff --git a/docs/execution_providers/NNAPI-ExecutionProvider.md b/docs/execution_providers/NNAPI-ExecutionProvider.md deleted file mode 100644 index 0a96086768..0000000000 --- a/docs/execution_providers/NNAPI-ExecutionProvider.md +++ /dev/null @@ -1,21 +0,0 @@ -# NNAPI Execution Provider - -[Android Neural Networks API (NNAPI)](https://developer.android.com/ndk/guides/neuralnetworks) is a unified interface to CPU, GPU, and NN accelerators on Android. - -## Minimum requirements - -The NNAPI EP requires Android devices with Android 8.1 or higher; it is recommended to use Android devices with Android 9 or higher to achieve optimal performance. - -## Build NNAPI EP - -For build instructions, please see the [BUILD page](../../BUILD.md#Android-NNAPI-Execution-Provider).
- -## Using NNAPI EP in C/C++ - -``` -Ort::Env env = Ort::Env{ORT_LOGGING_LEVEL_ERROR, "Default"}; -Ort::SessionOptions sf; -Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Nnapi(sf)); -Ort::Session session(env, model_path, sf); -``` -The C API details are [here](../C_API.md#c-api). diff --git a/docs/execution_providers/Nuphar-ExecutionProvider.md b/docs/execution_providers/Nuphar-ExecutionProvider.md deleted file mode 100644 index 8c7e0a8feb..0000000000 --- a/docs/execution_providers/Nuphar-ExecutionProvider.md +++ /dev/null @@ -1,170 +0,0 @@ -# Nuphar Execution Provider (preview) - -NUPHAR stands for Neural-network Unified Preprocessing Heterogeneous Architecture. As an execution provider in the ONNX Runtime, it is built on top of [TVM](https://github.com/dmlc/tvm) and [LLVM](https://llvm.org) to accelerate ONNX models by compiling nodes in subgraphs into optimized functions via JIT. It also provides JIT caching to save compilation time at runtime. - -Developers can tap into the power of Nuphar through ONNX Runtime to accelerate inferencing of ONNX models. The Nuphar execution provider comes with a common ONNX to TVM lowering [library](../../onnxruntime/core/codegen) that can potentially be reused by other execution providers to leverage TVM. With the Nuphar execution provider, the ONNX Runtime delivers better inferencing performance on the same hardware compared to generic X64 CPU acceleration, especially for quantized recurrent neural networks. Various products at Microsoft have seen up to a 5x improvement in performance with no loss of accuracy, by running quantized LSTMs via the Nuphar execution provider in the ONNX Runtime. - -## Build -For build instructions, please see the [BUILD page](../../BUILD.md#nuphar). 
- -## Using the Nuphar execution provider -### C/C++ -Ort::Env env = Ort::Env{ORT_LOGGING_LEVEL_ERROR, "Default"}; -Ort::SessionOptions sf; -Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Nuphar(sf, /*allow_unaligned_buffers*/ 1, "")); -Ort::Session session(env, model_path, sf); - -### Python -You can use the Nuphar execution provider via the python wheel from the ONNX Runtime build. The Nuphar execution provider will be automatically prioritized over the default CPU execution providers, thus no need to separately register the execution provider. Python APIs details are [here](../python/api_summary.rst#api-summary). - -## Performance and Accuracy Testing -You can test your ONNX model's performance with [onnxruntime_perf_test](../../onnxruntime/test/perftest/README.md), or test accuracy with [onnx_test_runner](../../onnxruntime/test/onnx/README.txt). To run these tools with the Nuphar execution provider, please pass `-e nuphar` in command line options. - -Please note that Nuphar uses TVM thread pool and parallel schedule for multi-thread inference performance. When building with OpenMP or MKLML, TVM thread pool would use gomp or iomp as its implementation; otherwise, TVM creates its own thread pool. Because of this, the current default parallel schedule policy is: -- Default to on for USE_OPENMP or USE_MKLML. User can use OMP_NUM_THREADS/MKL_NUM_THREADS to control TVM thread pool, as well as TVM_NUM_THREADS -- Default to off for none of above. User can use TVM_NUM_THREADS to control TVM thread pool. - -This choice is to ensure to get ideal performance with the different build options. When build with USE_OPENMP or USE_MKLML, users would have to avoid thread confliction from OpenMP or MKL with their inference invocations anyway, so parallel schedule is enable to leverage existing thread pool. When not building with gomp or iomp, TVM thread pool is turned off to avoid confliction with user threads. 
If needed, user can set env or settings with [NUPHAR_PARALLEL_MIN_WORKLOADS](../../onnxruntime/core/providers/nuphar/common/nuphar_settings.cc#L61) to 0 to disable parallel schedule, or to some non-zero value to enable parallel schedule. The non-zero value indicates the minimal number of elements being computed per thread when parallel schedule would be turned on. - -## Model Conversion and Quantization -You may use Python script [model_editor.py](../../onnxruntime/core/providers/nuphar/scripts/model_editor.py) to turn LSTM/GRU/RNN ops to Scan ops for a given model, and then use [model_quantizer.py](../../onnxruntime/core/providers/nuphar/scripts/model_quantizer.py) to quantize MatMul ops into MatMulInteger ops. - -We use dynamic per-row quantization for inputs of LSTM MatMul, so MatMul becomes three parts: quantization, MatMulInteger and dequantization. Weights for MatMulInteger are statically quantized per-column to int8. We have observed good speed-up and no loss of accuracy with this quantization scheme inside Scan for various LSTM models. - -To convert models with LSTM/GRU/RNN ops to Scan ops: -``` -python model_editor.py --input /path/to/input/model --output /path/to/output/model --mode to_scan -``` - -To quantize MatMul ops to MatMulInteger ops (use option --only_for_scan to only quantize MatMuls inside Scan): -``` -python model_quantizer.py --input /path/to/input/model --output /path/to/output/model --only_for_scan -``` - -As an experiment, you may test conversion and quantization on [the BiDAF model](https://github.com/onnx/models/tree/master/bidaf) from the ONNX model zoo. This model has 5 bidirectional LSTM ops, and long sequence lengths. Our test shows that the quantized model has comparable accuracy of F1 76.24, EM 68.08, vs. floating point model accuracy of F1 76.20, EM 68.11. 
- -Speed-up in this model is ~20% on Intel Xeon E5-1620v4 (Note that AVX2 is required for Nuphar int8 GEMV performance), when comparing CPU execution provider with the floating point model with LSTM ops, vs. the Nuphar execution provider with quantized MatMulInteger inside Scan ops. Profile shows that most of the cost is in input projection outside of Scan ops, which uses MKL SGEMM. It's worth noting that MKL int8 GEMM is about the same speed as SGEMM in this model, so quantization of SGEMMs outside of Scan won't help performance. We are looking at ways to speedup int8 GEMM for better performance on quantized models. - -## JIT caching -You may cache JIT binaries to reduce model loading time spent in JIT, using [create_shared.cmd](../../onnxruntime/core/providers/nuphar/scripts/create_shared.cmd) on Windows with Visual Studio 2017, or [create_shared.sh](../../onnxruntime/core/providers/nuphar/scripts/create_shared.sh) on Linux with gcc. - -Windows -``` -REM You need to have Visual Studio 2017 for compile and link. Optionally, you can save model checksum to the output dll with FCIV tool from https://support.microsoft.com/en-us/help/841290 -set NUPHAR_CACHE_PATH=\path\to\jit\cache -REM Then run Nuphar inference from either onnx_test_runner or onnxruntime_perf_test, or whatever inference using C++ or Python -REM JIT object files would be saved to \path\to\jit\cache\ -create_shared.cmd \path\to\jit\cache\NUPHAR_CACHE_VERSION [optional_model_file_for_checksum] [optional_output_dll_name] -REM If checksum is embedded in dll, set NUPHAR_CACHE_MODEL_CHECKSUM to FCIV output for the model to inference to pass checksum verification at runtime -REM Checksum verification failure will cause Nuphar to fallback to JIT instead of loading binary from cache -REM Run Nuphar inference again with cached JIT dll -``` - -Linux -``` -# You need to have GCC of the same version Nuphar is built with, for compile and link. 
Optionally, you can save model checksum to jit.so with md5sum -export NUPHAR_CACHE_PATH=/path/to/jit/cache -# Then run Nuphar inference from either onnx_test_runner or onnxruntime_perf_test, or whatever inference using C++ or Python -# JIT object files would be saved to /path/to/jit/cache/ -create_shared.sh -c /path/to/jit/cache/NUPHAR_CACHE_VERSION [-m optional_model_file_for_checksum] [-o optional_output_so_name] -# If checksum is embedded in dll, set NUPHAR_CACHE_MODEL_CHECKSUM to md5sum output for the model to inference to pass checksum verification at runtime -# Checksum verification failure will cause Nuphar to fallback to JIT instead of loading binary from cache -# run Nuphar inference again with cached JIT dll -``` - - -## Debugging - -### NGEMM -NGEMM (Nuphar GEMM) is an optimized low-precision GEMM implementation based on compiler techniques. -Please refer to our paper for more details of NGEMM: ["NGEMM: Optimizing GEMM for Deep Learning via Compiler-based Techniques"](https://arxiv.org/abs/1910.00178). - -#### NGEMM Tiling / Permutation Configuration -NGEMM has default tiling parameters, but users can overwrite them through environment variables: -* NUPHAR_IGEMM_TILE_M / NUPHAR_IGEMM_TILE_N / NUPHAR_IGEMM_TILE_K - - These 3 parameters are the tiling sizes for the corresponding dimensions of GEMM ([M x K] x [K x N]). - Setting them to different values will generate GEMM with different tiling sizes. - -* NUPHAR_IGEMM_PERMUTE - - This enviornment variable is to control the loop permutation in GEMM. - The default is to not apply any loop permutation. Other options are "inner/outer/all",referring to apply permutations to only inner tile loops / only outer loops / both inner and outer loops, respectively. 
- - -There are several [environment variables](../../onnxruntime/core/codegen/common/settings.h) to dump debug information during code generation, plus [some more environment variables](../../onnxruntime/core/providers/nuphar/common/nuphar_settings.h) to dump/control the Nuphar execution provider. You can set environment variables prior to inference to dump debug info to the console. To list some most useful ones: -* CODEGEN_DUMP_LOWER - - Dumps the lowered function from TVM. - - Set it to "verbose" to dump all nodes, or node op_type to dump specific nodes. You may use "concise" to dump just the op_type of nodes. - -* CODEGEN_DUMP_MODULE - - Dumps compiled binary. - - Set it to "ll" to dumps LLVM bit code, "asm" to dumps assembly. - -* CODEGEN_DUMP_SCHEDULE - - Dumps the schedule used in TVM nodes, like compute_root/compute_inline/compute_at. - - Set it to "verbose" to dump all nodes, or node op_type to dump specific nodes. You may use "concise" to dump just the op_type of nodes. - -* NUPHAR_DUMP_PARTITION - - Dumps nodes in each partition. - - Set it to "1" to dump partitions. - -## Settings -When there are conflicts of environment variables running Nuphar in multiple processes, user can specify settings string when creating the Nuphar execution provider. The string comprises of comma separated key:value pairs. Keys should be lower cased environment variable names as shown above, and separated from corresponding values with colon. For example, the equivalent string of setting environment variables of NUPHAR_CACHE_PATH/NUPHAR_CACHE_MODEL_CHECKSUM would be "nuphar_cache_path:, nuphar_cache_model_checksum:". 
- -* Using in C/C++ - -Settings string could be specified when creating execution provider to specify JIT cache path, as well as model checksum: - -``` -OrtStatus* status = OrtSessionOptionsAppendExecutionProvider_Nuphar(session_options, 1, "nuphar_cache_path:/path/to/cache, nuphar_cache_model_checksum:")); -``` - -* Using in C# - -Settings string could be specified when creating session options: - -``` -SessionOptions.MakeSessionOptionWithNupharProvider("nuphar_cache_path:/path/to/cache, nuphar_cache_model_checksum:") -``` - -* Using in Python - -Settings string should be passed in before InferenceSession is created, as providers are not currently exposed yet. Here's an example in Python to set cache path and model checksum: - -``` -nuphar_settings = 'nuphar_cache_path:{}, nuphar_cache_model_checksum:{}'.format(cache_dir, model_checksum) -onnxruntime.capi._pybind_state.set_nuphar_settings(nuphar_settings) -sess = onnxruntime.InferenceSession(model_path) -``` - -## Known issues -* ONNX shape inference dependency - - To save runtime JIT cost, Nuphar requires models to have shape inference information from ONNX after model is loaded. Some nodes in ONNX can generate dynamic output tensor shapes from input data value, i.e. ConstantOfShape, Tile, Slice in opset 10, Compress, etc. Those ops may block ONNX shape inference and make the part of graph after such nodes not runnable in Nuphar. - - User may use Python script [symbolic_shape_infer.py](../../onnxruntime/python/tools/symbolic_shape_infer.py) to run symbolic shape inference in ONNX model. This script adds output tensor shapes in the model in graph.value_info field, by doing symbolic dimension computation using sympy when there are Shape ops in model. Besides, running symbolic shape inference on ONNX model would make the graph more readable. 
Note that when using [model_editor.py](../../onnxruntime/core/providers/nuphar/scripts/model_editor.py) to convert models with LSTM/GRU/RNN to Scan, the resulting model may have incomplete shape inference. Running symbolic_shape_infer.py is needed to get the Scan ops in the model to run in Nuphar. Besides, please note that quantization should be the last step, after verified accuracy and performance of the edited floating point model. - - In addition, user may also manually add shapes to graph.value_info using [onnx.helper.make_tensor_value_info](https://github.com/onnx/onnx/blob/v1.5.0/onnx/helper.py#L290) with model specific knowledge. For example, if you have Hardmax output casted to bool as Compress input condition, then the unknown dimension of the output of Compress is actually 1. - -* Performance benchmark - - Current Nuphar's speed-up in quantized RNNs is optimized for AVX2, when running in single thread and batch size is 1. To help understand RNN performance in different configurations, please use Python script [rnn_benchmark.py](../../onnxruntime/core/providers/nuphar/scripts/rnn_benchmark.py). For older X64 CPUs that do not support AVX2, quantized model may have worse performance than non-quantized ones. - -* Patches to TVM - - There are some changes/bug fixes in TVM for Nuphar to work properly. We are in the process of contributing them back to TVM, but for now patches are used in [our forked TVM](https://github.com/microsoft/onnxruntime-tvm). 
To build cleanly from scratch, please run following commands before running build.bat or build.sh: -``` -git submodule sync -git submodule foreach --recursive git stash -git submodule foreach --recursive git clean -fd -git submodule update --init --recursive -``` diff --git a/docs/execution_providers/OpenVINO-ExecutionProvider.md b/docs/execution_providers/OpenVINO-ExecutionProvider.md deleted file mode 100644 index 346e694a3f..0000000000 --- a/docs/execution_providers/OpenVINO-ExecutionProvider.md +++ /dev/null @@ -1,284 +0,0 @@ -# OpenVINO Execution Provider - -OpenVINO Execution Provider enables deep learning inference on Intel CPUs, Intel integrated GPUs and Intel® MovidiusTM Vision Processing Units (VPUs). Please refer to [this](https://software.intel.com/en-us/openvino-toolkit/hardware) page for details on the Intel hardware supported. - -### Build -For build instructions, please see the [BUILD page](../../BUILD.md#openvino). - -## Runtime configuration options ---- - -OpenVINO EP can be configured with certain options at runtime that control the behavior of the EP. 
These options can be set as key-value pairs as below:- - -### Python API -Key-Value pairs for config options can be set using the Session.set_providers API as follows:- - -``` -session = onnxruntime.InferenceSession(, options) -session.set_providers(['OpenVINOExecutionProvider'], [{Key1 : Value1, Key2 : Value2, ...}]) -``` -*Note that this causes the InferenceSession to be re-initialized, which may cause model recompilation and hardware re-initialization* - -### C/C++ API -All the options shown below are passed to SessionOptionsAppendExecutionProvider_OpenVINO() API and populated in the struct OrtOpenVINOProviderOptions in an example shown below, for example for CPU device type:- - -``` -OrtOpenVINOProviderOptions options; -options.device_type = "CPU_FP32"; -options.enable_vpu_fast_compile = 0; -options.device_id = ""; -options.num_of_threads = 8; -SessionOptionsAppendExecutionProvider_OpenVINO(session_options, &options); -``` - -### Available configuration options -The following table lists all the available configuration options and the Key-Value pairs to set them:- - -| **Key** | **Key type** | **Allowable Values** | **Value type** | **Description** | -| --- | --- | --- | --- | --- | -| device_type | string | CPU_FP32, GPU_FP32, GPU_FP16, MYRIAD_FP16, VAD-M_FP16, VAD-F_FP32, Any valid Hetero combination, Any valid Multi-Device combination | string | Overrides the accelerator hardware type and precision with these values at runtime. If this option is not explicitly set, default hardware and precision specified during build time is used. | -| device_id | string | Any valid OpenVINO device ID | string | Selects a particular hardware device for inference. The list of valid OpenVINO device IDs available on a platform can be obtained either by Python API (`onnxruntime.capi._pybind_state.get_available_openvino_device_ids()`) or by [OpenVINO C/C++ API](https://docs.openvinotoolkit.org/latest/classInferenceEngine_1_1Core.html#acb212aa879e1234f51b845d2befae41c).
If this option is not explicitly set, an arbitrary free device will be automatically selected by OpenVINO runtime.| -| enable_vpu_fast_compile | string | True/False | boolean | This option is only available for MYRIAD_FP16 VPU devices. During initialization of the VPU device with compiled model, Fast-compile may be optionally enabled to speeds up the model's compilation to VPU device specific format. This in-turn speeds up model initialization time. However, enabling this option may slowdown inference due to some of the optimizations not being fully applied, so caution is to be exercised while enabling this option. | -| num_of_threads | string | Any unsigned positive number other than 0 | size_t | Overrides the accelerator default value of number of threads with this value at runtime. If this option is not explicitly set, default value of 8 is used during build time. This option when set actually makes those number of free InferRequests made available in the pool so that each thread has a separate InferRequest available thus enabling Multi-threading during inference. Note: This option is not to set the num_of_threads for inferencing, it is to just set number of free InferRequests that should be made available. | - -Valid Hetero or Multi-Device combination's: -HETERO:,,... -MULTI:,,... -The can be any of these devices from this list ['CPU','GPU','MYRIAD','FPGA','HDDL'] - -A minimum of two DEVICE_TYPE'S should be specified for a valid HETERO or Multi-Device Build. - -Example: -HETERO:MYRIAD,CPU HETERO:HDDL,GPU,CPU MULTI:MYRIAD,GPU,CPU - -## Other configuration settings -### Onnxruntime Graph Optimization level -OpenVINO backend performs both hardware dependent as well as independent optimizations to the graph to infer it with on the target hardware with best possible performance. In most of the cases it has been observed that passing in the graph from the input model as is would lead to best possible optimizations by OpenVINO. 
For this reason, it is advised to turn off high level optimizations performed by ONNX Runtime before handing the graph over to OpenVINO backend. This can be done using Session options as shown below:- - -### Python API -``` -options = onnxruntime.SessionOptions() -options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL -sess = onnxruntime.InferenceSession(, options) -``` - -### C/C++ API -``` -SessionOptions::SetGraphOptimizationLevel(ORT_DISABLE_ALL); -``` - -### Deprecated: Dynamic device type selection -**Note: This API has been deprecated. Please use the mechanism mentioned above to set the 'device-type' option.** -When ONNX Runtime is built with OpenVINO Execution Provider, a target hardware option needs to be provided. This build time option becomes the default target harware the EP schedules inference on. However, this target may be overriden at runtime to schedule inference on a different hardware as shown below. - -Note. This dynamic hardware selection is optional. The EP falls back to the build-time default selection if no dynamic hardware option value is specified. - -### Python API -``` -import onnxruntime -onnxruntime.capi._pybind_state.set_openvino_device("") -# Create session after this -``` -*This property persists and gets applied to new sessions until it is explicity unset. To unset, assign a null string ("").* - -### C/C++ API - -Append the settings string "" to the EP settings string. Example shown below for the CPU_FP32 option: -``` -std::string settings_str; -... -settings_str.append("CPU_FP32"); -Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_OpenVINO(sf, settings_str.c_str())); -``` - -## ONNX Layers supported using OpenVINO - -The table below shows the ONNX layers supported and validated using OpenVINO Execution Provider.The below table also lists the Intel hardware support for each of the layers. CPU refers to Intel® -Atom, Core, and Xeon processors. GPU refers to the Intel Integrated Graphics. 
VPU refers to USB based Intel® MovidiusTM -VPUs as well as Intel® Vision accelerator Design with Intel Movidius TM MyriadX VPU. - -| **ONNX Layers** | **CPU** | **GPU** | **VPU** | -| --- | --- | --- | --- | -| Abs | Yes | Yes | No | -| Acos | Yes | No | No | -| Acosh | Yes | No | No | -| Add | Yes | Yes | Yes | -| ArgMax | Yes | Yes | Yes | -| ArgMin | Yes | No | Yes | -| Asin | Yes | Yes | No | -| Asinh | Yes | Yes | No | -| Atan | Yes | Yes | No | -| Atanh | Yes | No | No | -| AveragePool | Yes | Yes | Yes | -| BatchNormalization | Yes | Yes | Yes | -| Cast | Yes | Yes | Yes | -| Ceil | No | Yes | No | -| Clip | Yes | Yes | Yes | -| Concat | Yes | Yes | Yes | -| Constant | Yes | Yes | Yes | -| ConstantOfShape | Yes | Yes | Yes | -| Conv | Yes | Yes | Yes | -| ConvTranspose | Yes | Yes | Yes | -| Cos | Yes | No | No | -| Cosh | Yes | No | No | -| DepthToSpace | Yes | Yes | Yes | -| Div | Yes | Yes | Yes | -| Dropout | Yes | Yes | Yes | -| Elu | Yes | Yes | Yes | -| Equal | Yes | Yes | Yes | -| Erf | Yes | Yes | Yes | -| Exp | Yes | Yes | Yes | -| Expand | No | No | Yes | -| Flatten | Yes | Yes | Yes | -| Floor | Yes | Yes | Yes | -| Gather | Yes | Yes | Yes | -| GatherND | No | No | Yes | -| Gemm | Yes | Yes | Yes | -| GlobalAveragePool | Yes | Yes | Yes | -| GlobalLpPool | Yes | Yes | No | -| HardSigmoid | Yes | Yes | No | -| Identity | Yes | Yes | Yes | -| InstanceNormalization | Yes | Yes | Yes | -| LeakyRelu | Yes | Yes | Yes | -| Less | Yes | Yes | Yes | -| Log | Yes | Yes | Yes | -| LRN | Yes | Yes | Yes | -| MatMul | Yes | Yes | Yes | -| Max | Yes | Yes | Yes | -| MaxPool | Yes | Yes | Yes | -| Mean | Yes | Yes | Yes | -| Min | Yes | Yes | Yes | -| Mul | Yes | Yes | Yes | -| Neg | Yes | Yes | Yes | -| NonMaxSuppression | No | No | Yes | -| NonZero | Yes | No | Yes | -| Not | Yes | Yes | Yes | -| OneHot | Yes | Yes | Yes | -| Pad | Yes | Yes | Yes | -| Pow | Yes | Yes | Yes | -| PRelu | Yes | Yes | Yes | -| Reciprocal | Yes | Yes | Yes | -| Range | No | No 
| Yes | -| ReduceLogSum | Yes | No | Yes | -| ReduceMax | Yes | Yes | Yes | -| ReduceMean | Yes | Yes | Yes | -| ReduceMin | Yes | Yes | Yes | -| ReduceProd | Yes | No | No | -| ReduceSum | Yes | Yes | Yes | -| ReduceSumSquare | Yes | No | Yes | -| Relu | Yes | Yes | Yes | -| Reshape | Yes | Yes | Yes | -| Resize | Yes | No | Yes | -| RoiAlign | No | No | Yes | -| Scatter | No | No | Yes | -| ScatterElements | No | No | Yes | -| Selu | Yes | Yes | No | -| Shape | Yes | Yes | Yes | -| Sigmoid | Yes | Yes | Yes | -| Sign | Yes | No | No | -| SinFloat | No | No | Yes | -| Sinh | Yes | No | No | -| Slice | Yes | Yes | Yes | -| Softmax | Yes | Yes | Yes | -| Softsign | Yes | No | No | -| SpaceToDepth | Yes | Yes | Yes | -| Split | Yes | Yes | Yes | -| Sqrt | Yes | Yes | Yes | -| Squeeze | Yes | Yes | Yes | -| Sub | Yes | Yes | Yes | -| Sum | Yes | Yes | Yes | -| Tan | Yes | Yes | No | -| Tanh | Yes | Yes | Yes | -| TopK | Yes | Yes | Yes | -| Transpose | Yes | Yes | Yes | -| Unsqueeze | Yes | Yes | Yes | -| Upsample | Yes | No | No | -| Where | No | No | Yes | - -## Topology Support - -Below topologies from ONNX open model zoo are fully supported on OpenVINO Execution Provider and many more are supported through sub-graph partitioning - -## Image Classification Networks - -| **MODEL NAME** | **CPU** | **GPU** | **VPU** | **FPGA** | -| --- | --- | --- | --- | --- | -| bvlc_alexnet | Yes | Yes | Yes | Yes* | -| bvlc_googlenet | Yes | Yes | Yes | Yes* | -| bvlc_reference_caffenet | Yes | Yes | Yes | Yes* | -| bvlc_reference_rcnn_ilsvrc13 | Yes | Yes | Yes | Yes* | -| emotion ferplus | Yes | Yes | Yes | Yes* | -| densenet121 | Yes | Yes | Yes | Yes* | -| inception_v1 | Yes | Yes | Yes | Yes* | -| inception_v2 | Yes | Yes | Yes | Yes* | -| mobilenetv2 | Yes | Yes | Yes | Yes* | -| resnet18v1 | Yes | Yes | Yes | Yes* | -| resnet34v1 | Yes | Yes | Yes | Yes* | -| resnet101v1 | Yes | Yes | Yes | Yes* | -| resnet152v1 | Yes | Yes | Yes | Yes* | -| resnet18v2 | Yes | Yes | Yes | 
Yes* | -| resnet34v2 | Yes | Yes | Yes | Yes* | -| resnet101v2 | Yes | Yes | Yes | Yes* | -| resnet152v2 | Yes | Yes | Yes | Yes* | -| resnet50 | Yes | Yes | Yes | Yes* | -| resnet50v2 | Yes | Yes | Yes | Yes* | -| shufflenet | Yes | Yes | Yes | Yes* | -| squeezenet1.1 | Yes | Yes | Yes | Yes* | -| vgg19 | Yes | Yes | Yes | Yes* | -| vgg16 | Yes | Yes | Yes | Yes* | -| zfnet512 | Yes | Yes | Yes | Yes* | -| arcface | Yes | Yes | Yes | Yes* | - - -## Image Recognition Networks -| **MODEL NAME** | **CPU** | **GPU** | **VPU** | **FPGA** | -| --- | --- | --- | --- | --- | -| mnist | Yes | Yes | Yes | Yes* | - -## Object Detection Networks -| **MODEL NAME** | **CPU** | **GPU** | **VPU** | **FPGA** | -| --- | --- | --- | --- | --- | -| tiny_yolov2 | Yes | Yes | Yes | Yes* | - -## Image Manipulation Networks -| **MODEL NAME** | **CPU** | **GPU** | **VPU** | **FPGA** | -| --- | --- | --- | --- | --- | -| mosaic | Yes | No | No | No* | -| candy | Yes | No | No | No* | -| rain_princess | Yes | No | No | No* | -| pointilism | Yes | No | No | No* | -| udnie | Yes | No | No | No* | - -*FPGA only runs in HETERO mode wherein the layers that are not supported on FPGA fall back to OpenVINO CPU. - -## Inferencing on FP16 Models -FP16 models can be inferenced on a VPU with device_type = "MYRIAD_FP16" and on GPU with -device_type = "GPU_FP16" - -## CSharp API - -To use csharp api for openvino execution provider create a custom nuget package. Follow the instructions [here](../../BUILD.md##build-nuget-packages) to install prerequisites for nuget creation. Once prerequisites are installed follow the instructions to [build openvino](../../BUILD.md#openvino) and add an extra flag `--build_nuget` to create nuget packages. Two nuget packages will be created Microsoft.ML.OnnxRuntime.Managed and Microsoft.ML.OnnxRuntime.Openvino. 
- -## Multi-threading for OpenVINO EP - -OpenVINO Execution Provider enables thread-safe deep learning inference - -## Heterogeneous Execution for OpenVINO EP - -The heterogeneous Execution enables computing for inference on one network on several devices. Purposes to execute networks in heterogeneous mode - -To utilize accelerators power and calculate heaviest parts of network on accelerator and execute not supported layers on fallback devices like CPU -To utilize all available hardware more efficiently during one inference - -For more information on Heterogeneous plugin of OpenVINO, please refer to the following -[documentation](https://docs.openvinotoolkit.org/latest/openvino_docs_IE_DG_supported_plugins_HETERO.html). - -## Multi-Device Execution for OpenVINO EP - -Multi-Device plugin automatically assigns inference requests to available computational devices to execute the requests in parallel. Potential gains are as follows - -Improved throughput that multiple devices can deliver (compared to single-device execution) -More consistent performance, since the devices can now share the inference burden (so that if one device is becoming too busy, another device can take more of the load) - -For more information on Multi-Device plugin of OpenVINO, please refer to the following -[documentation](https://docs.openvinotoolkit.org/latest/openvino_docs_IE_DG_supported_plugins_MULTI.html#introducing_multi_device_execution). diff --git a/docs/execution_providers/README.md b/docs/execution_providers/README.md deleted file mode 100644 index dccaaff747..0000000000 --- a/docs/execution_providers/README.md +++ /dev/null @@ -1,64 +0,0 @@ -# Introduction - -ONNX Runtime is capable of working with different HW acceleration libraries to execute the ONNX models on the hardware platform. ONNX Runtime supports an extensible framework, called **Execution Providers** (EP), to integrate with the HW specific libraries. 
This interface gives application developers the flexibility to deploy their ONNX models in different environments in the cloud and the edge and optimize the execution by taking advantage of the compute capabilities of the platform. - -

Executing ONNX models across different HW environments

- -ONNX Runtime works with the execution provider(s) using the `GetCapability()` interface to allocate specific nodes or sub-graphs for execution by the EP library in supported hardware. The EP libraries that are preinstalled in the execution environment process and execute the ONNX sub-graph on the hardware. This architecture abstracts out the details of the hardware specific libraries that are essential to optimizing the execution of deep neural networks across hardware platforms like CPU, GPU, FPGA or specialized NPUs. - -

ONNX Runtime GetCapability()

- -ONNX Runtime supports many different execution providers today. Some of the EPs are in GA and used in live service. Many are released in preview to enable developers to develop and customize their application using the different options. - -### Adding an Execution Provider - -Developers of specialized HW acceleration solutions can integrate with ONNX Runtime to execute ONNX models on their stack. To create an EP to interface with ONNX Runtime you must first identify a unique name for the EP. Follow the steps outlined [here](../AddingExecutionProvider.md) to integrate your code in the repo. - -### Building ONNX Runtime package with EPs - -The ONNX Runtime package can be built with any combination of the EPs along with the default CPU execution provider. **Note** that if multiple EPs are combined into the same ONNX Runtime package then all the dependent libraries must be present in the execution environment. The steps for producing the ONNX Runtime package with different EPs are documented [here](../../BUILD.md#execution-providers). - -### APIs for Execution Provider - -The same ONNX Runtime API is used across all EPs. This provides the consistent interface for applications to run with different HW acceleration platforms. The APIs to set EP options are available across Python, C/C++/C#, Java and node.js. **Note** we are updating our API support to get parity across all language bindings and will update specifics here. - - `get_providers`: Return list of registered execution providers. - `get_provider_options`: Return the registered execution providers' configurations. - `set_providers`: Register the given list of execution providers. The underlying session is re-created. - The list of providers is ordered by Priority. For example ['CUDAExecutionProvider', 'CPUExecutionProvider'] - means execute a node using CUDAExecutionProvider if capable, otherwise execute using CPUExecutionProvider.
- -### Using Execution Providers - -``` python -import onnxruntime as rt - -#define the priority order for the execution providers -# prefer CUDA Execution Provider over CPU Execution Provider -EP_list = ['CUDAExecutionProvider', 'CPUExecutionProvider'] - -# initialize the model.onnx -sess = rt.InferenceSession("model.onnx", providers=EP_list) - -# get the outputs metadata as a list of :class:`onnxruntime.NodeArg` -output_name = sess.get_outputs()[0].name - -# get the inputs metadata as a list of :class:`onnxruntime.NodeArg` -input_name = sess.get_inputs()[0].name - -# inference run using image_data as the input to the model -detections = sess.run([output_name], {input_name: image_data})[0] - -print("Output shape:", detections.shape) - -# Process the image to mark the inference points -image = post.image_postprocess(original_image, input_size, detections) -image = Image.fromarray(image) -image.save("kite-with-objects.jpg") - -# Update EP priority to only CPUExecutionProvider -sess.set_providers('CPUExecutionProvider') - -cpu_detection = sess.run(...) - -``` diff --git a/docs/execution_providers/RKNPU-ExecutionProvider.md b/docs/execution_providers/RKNPU-ExecutionProvider.md deleted file mode 100644 index 7b1431aa6b..0000000000 --- a/docs/execution_providers/RKNPU-ExecutionProvider.md +++ /dev/null @@ -1,70 +0,0 @@ -# RKNPU Execution Provider (preview) -RKNPU DDK is an advanced interface to access Rockchip NPU. RKNPU Execution Provider enables deep learning inference on Rockchip NPU via RKNPU DDK. - -## Supported platforms - -* RK1808 Linux - -*Note: RK3399Pro platform is not supported.* - - -## Build -For build instructions, please see the [BUILD page](../../BUILD.md#RKNPU). - -## Usage -### C/C++ -To use RKNPU as execution provider for inferencing, please register it as below. 
-``` -Ort::Env env = Ort::Env{ORT_LOGGING_LEVEL_ERROR, "Default"}; -Ort::SessionOptions sf; -Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_RKNPU(sf)); -Ort::Session session(env, model_path, sf); -``` -The C API details are [here](../C_API.md#c-api). - - -## Supported Operators - -The table below shows the ONNX Ops supported using RKNPU Execution Provider and the mapping between ONNX Ops and RKNPU Ops. - -| **ONNX Ops** | **RKNPU Ops** | -| --- | --- | -| Add | ADD | -| Mul | MULTIPLY | -| Conv | CONV2D | -| QLinearConv | CONV2D | -| Gemm | FULLCONNECT | -| Softmax | SOFTMAX | -| AveragePool | POOL | -| GlobalAveragePool | POOL | -| MaxPool | POOL | -| GlobalMaxPool | POOL | -| LeakyRelu | LEAKY_RELU | -| Concat | CONCAT | -| BatchNormalization | BATCH_NORM | -| Reshape | RESHAPE | -| Flatten | RESHAPE | -| Squeeze | RESHAPE | -| Unsqueeze | RESHAPE | -| Transpose | PERMUTE | -| Relu | RELU | -| Sub | SUBTRACT | -| Clip(0~6)| RELU6 | -| DequantizeLinear | DATACONVERT | -| Clip | CLIP | - - -## Supported Models - -Below Models are supported from ONNX open model zoo using RKNPU Execution Provider - -### Image Classification -- squeezenet -- mobilenetv2-1.0 -- resnet50v1 -- resnet50v2 -- inception_v2 - -### Object Detection -- ssd -- yolov3 \ No newline at end of file diff --git a/docs/execution_providers/TensorRT-ExecutionProvider.md b/docs/execution_providers/TensorRT-ExecutionProvider.md deleted file mode 100644 index 8d26966e3f..0000000000 --- a/docs/execution_providers/TensorRT-ExecutionProvider.md +++ /dev/null @@ -1,114 +0,0 @@ -# TensorRT Execution Provider - -The TensorRT execution provider in the ONNX Runtime makes use of NVIDIA's [TensorRT](https://developer.nvidia.com/tensorrt) Deep Learning inferencing engine to accelerate ONNX model in their family of GPUs. Microsoft and NVIDIA worked closely to integrate the TensorRT execution provider with ONNX Runtime. 
- -With the TensorRT execution provider, the ONNX Runtime delivers better inferencing performance on the same hardware compared to generic GPU acceleration. - -## Build -For build instructions, please see the [BUILD page](../../BUILD.md#TensorRT). - -The TensorRT execution provider for ONNX Runtime is built and tested with TensorRT 7.1.3.4. - -## Using the TensorRT execution provider -### C/C++ -``` -Ort::Env env = Ort::Env{ORT_LOGGING_LEVEL_ERROR, "Default"}; -Ort::SessionOptions sf; -int device_id = 0; -Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Tensorrt(sf, device_id)); -Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(sf, device_id)); -Ort::Session session(env, model_path, sf); -``` -The C API details are [here](../C_API.md#c-api). - -#### Shape Inference for TensorRT Subgraphs -If some operators in the model are not supported by TensorRT, ONNX Runtime will partition the graph and only send supported subgraphs to TensorRT execution provider. Because TensorRT requires that all inputs of the subgraphs have shape specified, ONNX Runtime will throw error if there is no input shape info. In this case please run shape inference for the entire model first by running script [here](https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/symbolic_shape_infer.py). - -#### Sample -This example shows how to run Faster R-CNN model on TensorRT execution provider, - -First, download Faster R-CNN onnx model from onnx model zoo [here](https://github.com/onnx/models/tree/master/vision/object_detection_segmentation/faster-rcnn). 
- -Second, infer shapes in the model by running shape inference script [here](https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/symbolic_shape_infer.py), -``` -python symbolic_shape_infer.py --input /path/to/onnx/model/model.onnx --output /path/to/onnx/model/new_model.onnx --auto_merge -``` - -Third, replace original model with the new model and run onnx_test_runner tool under ONNX Runtime build directory, -``` -./onnx_test_runner -e tensorrt /path/to/onnx/model/ -``` - -### Python -When using the Python wheel from the ONNX Runtime build with TensorRT execution provider, it will be automatically prioritized over the default GPU or CPU execution providers. There is no need to separately register the execution provider. Python APIs details are . - -#### Sample -Please see [this Notebook](../python/notebooks/onnx-inference-byoc-gpu-cpu-aks.ipynb) for an example of running a model on GPU using ONNX Runtime through Azure Machine Learning Services. - -## Performance Tuning -For performance tuning, please see guidance on this page: [ONNX Runtime Perf Tuning](../ONNX_Runtime_Perf_Tuning.md) - -When/if using [onnxruntime_perf_test](../../onnxruntime/test/perftest#onnxruntime-performance-test), use the flag `-e tensorrt` - -## Configuring environment variables -There are several environment variables for TensorRT execution provider. - -* ORT_TENSORRT_MAX_WORKSPACE_SIZE: maximum workspace size for TensorRT engine. Default value: 1073741824 (1GB). - -* ORT_TENSORRT_MAX_PARTITION_ITERATIONS: maximum number of iterations allowed in model partitioning for TensorRT. If target model can't be successfully partitioned when the maximum number of iterations is reached, the whole model will fall back to other execution providers such as CUDA or CPU. Default value: 1000. - -* ORT_TENSORRT_MIN_SUBGRAPH_SIZE: minimum node size in a subgraph after partitioning. Subgraphs with smaller size will fall back to other execution providers. Default value: 1. 
- -* ORT_TENSORRT_FP16_ENABLE: Enable FP16 mode in TensorRT. 1: enabled, 0: disabled. Default value: 0. - -* ORT_TENSORRT_INT8_ENABLE: Enable INT8 mode in TensorRT. 1: enabled, 0: disabled. Default value: 0. - -* ORT_TENSORRT_INT8_CALIBRATION_TABLE_NAME: Specify INT8 calibration table file name. By default the name is "INT8_calibration_table". - -* ORT_TENSORRT_INT8_USE_NATIVE_CALIBRATION_TABLE: Select what calibration table is used. If 1, native TensorRT generated calibration table is used; if 0, ONNXRUNTIME tool generated calibration table is used. Default value: 0. -**Note: Please copy up-to-date calibration table file to ORT_TENSORRT_CACHE_PATH before inference. Calibration table is specific to models and calibration data sets. Whenever new calibration table is generated, old file in the path should be cleaned up or be replaced. - -* ORT_TENSORRT_ENGINE_CACHE_ENABLE: Enable TensorRT engine caching. The purpose of using engine caching is to save engine build time in the cases that TensorRT may take long time to optimize and build engine. Engine will be cached after it's built at the first time so that next time when inference session is created the engine can be loaded directly from cache. In order to validate that the loaded engine is usable for current inference, engine profile is also cached and loaded along with engine. If current input shapes are in the range of the engine profile, that means the loaded engine can be safely used. Otherwise if input shapes are out of range, profile cache will be updated to cover the new shape and engine will be recreated based on the new profile (and also refreshed in the engine cache). Note each engine is created for specific settings such as precision (FP32/FP16/INT8 etc), workspace, profiles etc, and specific GPUs and it's not portable, so it's essential to make sure those settings are not changing, otherwise the engines need to be rebuilt and cached again. 1: enabled, 0: disabled. Default value: 0. 
-**Warning: Please clean up any old engine and profile cache files (.engine and .profile) if any of the following changes:** - - Model changes (if there are any changes to the model topology, opset version etc.) - - ORT version changes (i.e. moving from ORT version 1.4 to 1.5) - - TensorRT version changes (i.e. moving from TensorRT 7.0 to 7.1) - - Hardware changes. (Engine and profile files are not portable and optimized for specific Nvidia hardware) - -* ORT_TENSORRT_ENGINE_CACHE_PATH: This variable is deprecated. Please use ORT_TENSORRT_CACHE_PATH instead. - -* ORT_TENSORRT_CACHE_PATH: Specify path for TensorRT engine and profile files if ORT_TENSORRT_ENGINE_CACHE_ENABLE is 1, or path for INT8 calibration table file if ORT_TENSORRT_INT8_ENABLE is 1. - -* ORT_TENSORRT_DUMP_SUBGRAPHS: Dumps the subgraphs that are transformed into TRT engines in onnx format to the filesystem. This can help debugging subgraphs, e.g. by using `trtexec --onnx my_model.onnx` and check the outputs of the parser. 1: enabled, 0: disabled. Default value: 0. - -One can override default values by setting environment variables ORT_TENSORRT_MAX_WORKSPACE_SIZE, ORT_TENSORRT_MAX_PARTITION_ITERATIONS, ORT_TENSORRT_MIN_SUBGRAPH_SIZE, ORT_TENSORRT_FP16_ENABLE, ORT_TENSORRT_INT8_ENABLE, ORT_TENSORRT_INT8_CALIBRATION_TABLE_NAME, ORT_TENSORRT_INT8_USE_NATIVE_CALIBRATION_TABLE, ORT_TENSORRT_ENGINE_CACHE_ENABLE, ORT_TENSORRT_CACHE_PATH and ORT_TENSORRT_DUMP_SUBGRAPHS. -e.g. 
on Linux - -### override default max workspace size to 2GB -export ORT_TENSORRT_MAX_WORKSPACE_SIZE=2147483648 - -### override default maximum number of iterations to 10 -export ORT_TENSORRT_MAX_PARTITION_ITERATIONS=10 - -### override default minimum subgraph node size to 5 -export ORT_TENSORRT_MIN_SUBGRAPH_SIZE=5 - -### Enable FP16 mode in TensorRT -export ORT_TENSORRT_FP16_ENABLE=1 - -### Enable INT8 mode in TensorRT -export ORT_TENSORRT_INT8_ENABLE=1 - -### Use native TensorRT calibration table -export ORT_TENSORRT_INT8_USE_NATIVE_CALIBRATION_TABLE=1 - -### Enable TensorRT engine caching -export ORT_TENSORRT_ENGINE_CACHE_ENABLE=1 -* Please Note warning above. This feature is experimental. Engine cache files must be invalidated if there are any changes to the model, ORT version, TensorRT version or if the -underlying hardware changes. Engine files are not portable across devices. - -### Specify TensorRT cache path -export ORT_TENSORRT_CACHE_PATH="/path/to/cache" - -### Dump out subgraphs to run on TensorRT -export ORT_TENSORRT_DUMP_SUBGRAPHS=1 diff --git a/docs/execution_providers/Vitis-AI-ExecutionProvider.md b/docs/execution_providers/Vitis-AI-ExecutionProvider.md deleted file mode 100644 index 0063d6bfb6..0000000000 --- a/docs/execution_providers/Vitis-AI-ExecutionProvider.md +++ /dev/null @@ -1,118 +0,0 @@ -

- -

- -# Vitis-AI Execution Provider - -[Vitis-AI](https://github.com/Xilinx/Vitis-AI) is Xilinx's development stack for hardware-accelerated AI inference on Xilinx platforms, including both edge devices and Alveo cards. It consists of optimized IP, tools, libraries, models, and example designs. It is designed with high efficiency and ease of use in mind, unleashing the full potential of AI acceleration on Xilinx FPGA and ACAP. - -The current Vitis-AI execution provider inside ONNXRuntime enables acceleration of Neural Network model inference using DPUv1. DPUv1 is a hardware accelerator for Convolutional Neural Networks (CNN) on top of the Xilinx [Alveo](https://www.xilinx.com/products/boards-and-kits/alveo.html) platform and targets U200 and U250 accelerator cards. - -On this page you will find information on how to [build](#Build) ONNXRuntime with Vitis-AI and on how to [get started](#Getting-started) with an example. - -## Build - -For building ONNXRuntime with the Vitis-AI execution provider, you will have to setup the hardware environment and build the docker, see [build steps](#Hardware-setup-and-docker-build). - -### System requirements - -The following table lists system requirements for running docker containers as well as Alveo cards. - - -| **Component** | **Requirement** | -|-----------------------------------------------------|------------------------------------------------------------| -| Motherboard | PCI Express 3\.0\-compliant with one dual\-width x16 slot | -| System Power Supply | 225W | -| Operating System | Ubuntu 16\.04, 18\.04 | -| | CentOS 7\.4, 7\.5 | -| | RHEL 7\.4, 7\.5 | -| CPU | Intel i3/i5/i7/i9/Xeon 64-bit CPU | -| GPU \(Optional to accelerate quantization\) | NVIDIA GPU with a compute capability > 3.0 | -| CUDA Driver \(Optional to accelerate quantization\) | nvidia\-410 | -| FPGA | Xilinx Alveo U200 or U250 | -| Docker Version | 19\.03\.1 | - -### Hardware setup and docker build - -1. 
Clone the Vitis AI repository: - ``` - git clone https://github.com/xilinx/vitis-ai - ``` -2. Install the Docker, and add the user to the docker group. Link the user to docker installation instructions from the following docker's website: - * https://docs.docker.com/install/linux/docker-ce/ubuntu/ - * https://docs.docker.com/install/linux/docker-ce/centos/ - * https://docs.docker.com/install/linux/linux-postinstall/ -3. Any GPU instructions will have to be separated from Vitis AI. -4. Set up Vitis AI to target Alveo cards. To target Alveo cards with Vitis AI for machine learning workloads, you must install the following software components: - * Xilinx Runtime (XRT) - * Alveo Deployment Shells (DSAs) - * Xilinx Resource Manager (XRM) (xbutler) - * Xilinx Overlaybins (Accelerators to Dynamically Load - binary programming files) - - While it is possible to install all of these software components individually, a script has been provided to automatically install them at once. To do so: - * Run the following commands: - ``` - cd Vitis-AI/alveo/packages - sudo su - ./install.sh - ``` - * Power cycle the system. -5. Build and start the ONNXRuntime Vitis-AI Docker Container. - ``` - cd {onnxruntime-root}/dockerfiles - docker build -t onnxruntime-vitisai -f Dockerfile.vitisai . - ./scripts/docker_run_vitisai.sh - ``` - - Setup inside container - ``` - source /opt/xilinx/xrt/setup.sh - conda activate vitis-ai-tensorflow - ``` - -## Getting started - -### On-the-fly quantization - -Usually, to be able to accelerate inference of Neural Network models with Vitis-AI DPU accelerators, those models need to be quantized upfront. In the ONNXRuntime Vitis-AI execution provider we make use of on-the-fly quantization to remove this additional preprocessing step.
In this flow, one doesn't need to quantize his/her model upfront but can make use of the typical inference execution calls (InferenceSession.run) to quantize the model on-the-fly using the first N inputs that are provided (see more information below). This will set up and calibrate the Vitis-AI DPU and from that point onwards inference will be accelerated for all next inputs. - -### Config/Settings - -A couple of environment variables can be used to customize the Vitis-AI execution provider. - -| **Environment Variable** | **Default if unset** | **Explanation** | -|----------------------------|---------------------------|---------------------------------------------------------| -| PX_QUANT_SIZE | 128 | The number of inputs that will be used for quantization (necessary for Vitis-AI acceleration) | -| PX_BUILD_DIR | Use the on-the-fly quantization flow | Loads the quantization and compilation information from the provided build directory and immediately starts Vitis-AI hardware acceleration. This configuration can be used if the model has been executed before using on-the-fly quantization during which the quantization and compilation information was cached in a build directory. | - -### Samples - -When using python, you can base yourself on the following example: - -``` -# Import pyxir before onnxruntime -import pyxir -import pyxir.frontend.onnx -import pyxir.contrib.dpuv1.dpuv1 - -import onnxruntime - -# Add other imports -# ... - -# Load inputs and do preprocessing -# ... - -# Create an inference session using the Vitis-AI execution provider -session = onnxruntime.InferenceSession('[model_file].onnx', None,["VitisAIExecutionProvider"]) - -# First N (default = 128) inputs are used for quantization calibration and will -# be executed on the CPU -# This config can be changed by setting the 'PX_QUANT_SIZE' (e.g. export PX_QUANT_SIZE=64) -input_name = [...]
-outputs = [session.run([], {input_name: calib_inputs[i]})[0] for i in range(128)] - -# Afterwards, computations will be accelerated on the FPGA -input_data = [...] -result = session.run([], {input_name: input_data}) -``` diff --git a/samples/README.md b/samples/README.md deleted file mode 100644 index 7d479c5d25..0000000000 --- a/samples/README.md +++ /dev/null @@ -1,113 +0,0 @@ -# ONNX Runtime Samples and Tutorials - -Here you will find various samples, tutorials, and reference implementations for using ONNX Runtime. -For a list of available dockerfiles and published images to help with getting started, see [this page](../dockerfiles/README.md). - -**General** -* [Python](#Python) -* [C#](#C) -* [C/C++](#CC) -* [Java](#Java) -* [Node.js](#Nodejs) - -**Integrations** -* [Azure Machine Learning](#azure-machine-learning) -* [Azure IoT Edge](#azure-iot-edge) -* [Azure Media Services](#azure-media-services) -* [Azure SQL Edge and Managed Instance](#azure-sql) -* [Windows Machine Learning](#windows-machine-learning) -* [ML.NET](#mlnet) -* [Huggingface](#huggingface) - -*** -# General - -## Python -**Inference only** -* [Basic](https://microsoft.github.io/onnxruntime/python/tutorial.html) -* [Resnet50](https://github.com/onnx/onnx-docker/blob/master/onnx-ecosystem/inference_demos/resnet50_modelzoo_onnxruntime_inference.ipynb) -* [ONNX-Ecosystem Docker image samples](https://github.com/onnx/onnx-docker/tree/master/onnx-ecosystem/inference_demos) -* [ONNX Runtime Server: SSD Single Shot MultiBox Detector](https://github.com/onnx/tutorials/blob/master/tutorials/OnnxRuntimeServerSSDModel.ipynb) -* [NUPHAR EP samples](../docs/python/notebooks/onnxruntime-nuphar-tutorial.ipynb) - -**Inference with model conversion** -* [SKL tutorials](http://onnx.ai/sklearn-onnx/index_tutorial.html) -* [Keras - Basic](https://microsoft.github.io/onnxruntime/python/auto_examples/plot_dl_keras.html#sphx-glr-auto-examples-plot-dl-keras-py) -* [SSD Mobilenet 
(Tensorflow)](https://github.com/onnx/tensorflow-onnx/blob/master/tutorials/ConvertingSSDMobilenetToONNX.ipynb) -* [BERT-SQuAD (PyTorch) on CPU](../onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb) -* [BERT-SQuAD (PyTorch) on GPU](../onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb) -* [BERT-SQuAD (Keras)](../onnxruntime/python/tools/transformers/notebooks/Tensorflow_Keras_Bert-Squad_OnnxRuntime_CPU.ipynb) -* [BERT-SQuAD (Tensorflow)](https://github.com/onnx/tensorflow-onnx/blob/master/tutorials/BertTutorial.ipynb) -* [GPT2 (PyTorch)](../onnxruntime/python/tools/transformers/notebooks/Inference_GPT2_with_OnnxRuntime_on_CPU.ipynb) -* [EfficientDet (Tensorflow)](https://github.com/onnx/tensorflow-onnx/blob/master/tutorials/efficientdet.ipynb) -* [EfficientNet-Edge (Tensorflow)](https://github.com/onnx/tensorflow-onnx/blob/master/tutorials/efficientnet-edge.ipynb) -* [EfficientNet-Lite (Tensorflow)](https://github.com/onnx/tensorflow-onnx/blob/master/tutorials/efficientnet-lite.ipynb) -* [EfficientNet(Keras)](https://github.com/onnx/keras-onnx/blob/master/tutorial/TensorFlow_Keras_EfficientNet.ipynb) -* [MNIST (Keras)](https://github.com/onnx/keras-onnx/blob/master/tutorial/TensorFlow_Keras_MNIST.ipynb) - -**Quantization** -* [BERT Quantization on CPU](../onnxruntime/python/tools/quantization/notebooks/Bert-GLUE_OnnxRuntime_quantization.ipynb) - -**Other** -* [Running ONNX model tests](../docs/Model_Test.md) -* [Common Errors with explanations](https://microsoft.github.io/onnxruntime/python/auto_examples/plot_common_errors.html#sphx-glr-auto-examples-plot-common-errors-py) - -## C# -* [Inference Tutorial](../docs/CSharp_API.md#getting-started) -* [ResNet50 v2 Tutorial](../csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample) -* [Faster R-CNN Tutorial](../csharp/sample/Microsoft.ML.OnnxRuntime.FasterRcnnSample) - -## C/C++ -* [C: 
SqueezeNet](../csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp) -* [C++: model-explorer](./c_cxx/model-explorer) - single and batch processing -* [C++: SqueezeNet](../csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/CXX_Api_Sample.cpp) -* [C++: MNIST](./c_cxx/MNIST) - -## Java -* [Inference Tutorial](../docs/Java_API.md#getting-started) -* [MNIST inference](../java/src/test/java/sample/ScoreMNIST.java) - -## Node.js - -* [Inference with Nodejs](./nodejs) - ---- -# Integrations - -## Azure Machine Learning - -**Inference and deploy through AzureML** - -*For additional information on training in AzureML, please see [AzureML Training Notebooks](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/training)* -* Inferencing on **CPU** using [ONNX Model Zoo](https://github.com/onnx/models) models: - * [Facial Expression Recognition](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/onnx/onnx-inference-facial-expression-recognition-deploy.ipynb) - * [MNIST Handwritten Digits](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/onnx/onnx-inference-mnist-deploy.ipynb) - * [Resnet50 Image Classification](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/onnx/onnx-modelzoo-aml-deploy-resnet50.ipynb) -* Inferencing on **CPU** with **PyTorch** model training: - * [MNIST](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/onnx/onnx-train-pytorch-aml-deploy-mnist.ipynb) - * [BERT](../onnxruntime/python/tools/transformers/notebooks/Inference_Bert_with_OnnxRuntime_on_AzureML.ipynb) -* Inferencing on **CPU** with model conversion for existing (CoreML) model: - * [TinyYolo](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/deployment/onnx/onnx-convert-aml-deploy-tinyyolo.ipynb) -* Inferencing on **GPU** with **TensorRT** Execution
Provider (AKS): - * [FER+](../docs/python/notebooks/onnx-inference-byoc-gpu-cpu-aks.ipynb) - -## Azure IoT Edge -**Inference and Deploy with Azure IoT Edge** - * [Intel OpenVINO](http://aka.ms/onnxruntime-openvino) - * [NVIDIA TensorRT on Jetson Nano (ARM64)](http://aka.ms/onnxruntime-arm64) - * [ONNX Runtime with Azure ML](https://github.com/Azure-Samples/onnxruntime-iot-edge/blob/master/AzureML-OpenVINO/README.md) - -## Azure Media Services -[Video Analysis through Azure Media Services using Yolov3 to build an IoT Edge module for object detection](https://github.com/Azure/live-video-analytics/tree/master/utilities/video-analysis/yolov3-onnx) - -## Azure SQL -[Deploy ONNX model in Azure SQL Edge](https://docs.microsoft.com/en-us/azure/azure-sql-edge/deploy-onnx) - -## Windows Machine Learning -[Examples of inferencing with ONNX Runtime through Windows Machine Learning](https://docs.microsoft.com/en-us/windows/ai/windows-ml/tools-and-samples#samples) - -## ML.NET -[Object Detection with ONNX Runtime in ML.NET](https://docs.microsoft.com/en-us/dotnet/machine-learning/tutorials/object-detection-onnx) - -## Huggingface -[Export Transformer models](https://github.com/huggingface/transformers/blob/master/notebooks/04-onnx-export.ipynb) diff --git a/samples/c_cxx/MNIST/ReadMe.md b/samples/c_cxx/MNIST/ReadMe.md deleted file mode 100644 index 1716f054ec..0000000000 --- a/samples/c_cxx/MNIST/ReadMe.md +++ /dev/null @@ -1,66 +0,0 @@ -# MNIST Sample - Number recognition - -This sample uses the MNIST model from the Model Zoo: https://github.com/onnx/models/tree/master/vision/classification/mnist - -![Screenshot](Screenshot.png) - -## Requirements - -Compiled Onnxruntime.dll / lib (link to instructions on how to build dll) -Windows Visual Studio Compiler (cl.exe) - -## Build - -Run 'build.bat' in this directory to call cl.exe to generate MNIST.exe -Then just run MNIST.exe - -## How to use it - -Just draw a number with the left mouse button (or use touch) in the box on the
left side. After releasing the mouse button the model will be run and the outputs of the model will be displayed. Note that when drawing numbers requiring multiple drawing strokes, the model will be run at the end of each stroke with probably wrong predictions (but it's amusing to see and avoids needing to press a 'run model' button). - -To clear the image, click the right mouse button anywhere. - -## How it works - -A single Ort::Env is created globally to initialize the runtime. -https://github.com/microsoft/onnxruntime/blob/521dc757984fbf9770d0051997178fbb9565cd52/samples/c_cxx/MNIST/MNIST.cpp#L12 - -The MNIST structure abstracts away all of the interaction with the Onnx Runtime, creating the tensors, and running the model. - -WWinMain is the Windows entry point, it creates the main window. - -WndProc is the window procedure for the window, handling the mouse input and drawing the graphics - -### Preprocessing the data - -MNIST's input is a {1,1,28,28} shaped float tensor, which is basically a 28x28 floating point grayscale image (0.0 = background, 1.0 = foreground). - -The sample stores the image in a 32-bit per pixel windows DIB section, since that's easy to draw into and draw to the screen for windows. The DIB is created here: -https://github.com/microsoft/onnxruntime/blob/521dc757984fbf9770d0051997178fbb9565cd52/samples/c_cxx/MNIST/MNIST.cpp#L109-L121 - -The function to convert the DIB data and write it into the model's input tensor: -https://github.com/microsoft/onnxruntime/blob/521dc757984fbf9770d0051997178fbb9565cd52/samples/c_cxx/MNIST/MNIST.cpp#L77-L92 - -### Postprocessing the output - -MNIST's output is a simple {1,10} float tensor that holds the likelihood weights per number. The number with the highest value is the model's best guess.
- -The MNIST structure uses std::max_element to do this and stores it in result_: -https://github.com/microsoft/onnxruntime/blob/521dc757984fbf9770d0051997178fbb9565cd52/samples/c_cxx/MNIST/MNIST.cpp#L31 - -To make things more interesting, the window painting handler graphs the probabilities and shows the weights here: -https://github.com/microsoft/onnxruntime/blob/521dc757984fbf9770d0051997178fbb9565cd52/samples/c_cxx/MNIST/MNIST.cpp#L164-L183 - -### The Ort::Session - -1. Creation: The Ort::Session is created inside the MNIST structure here: -https://github.com/microsoft/onnxruntime/blob/521dc757984fbf9770d0051997178fbb9565cd52/samples/c_cxx/MNIST/MNIST.cpp#L43 - -2. Setup inputs & outputs: The input & output tensors are created here: -https://github.com/microsoft/onnxruntime/blob/521dc757984fbf9770d0051997178fbb9565cd52/samples/c_cxx/MNIST/MNIST.cpp#L19-L23 -In this usage, we're providing the memory location for the data instead of having Ort allocate the buffers. This is simpler in this case since the buffers are small and can just be fixed members of the MNIST struct. - -3. 
Run: Running the session is done in the Run() method: -https://github.com/microsoft/onnxruntime/blob/521dc757984fbf9770d0051997178fbb9565cd52/samples/c_cxx/MNIST/MNIST.cpp#L25-L33 - - From b50b0a89aac7d464c55978861213151efa8f4494 Mon Sep 17 00:00:00 2001 From: George Nash Date: Tue, 2 Feb 2021 10:00:10 -0800 Subject: [PATCH 33/41] Fix build failure when building with --build_wheel on Windows This resolves issue #6536 Signed-off-by: George Nash --- orttraining/orttraining/python/orttraining_pybind_state.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/orttraining/orttraining/python/orttraining_pybind_state.cc b/orttraining/orttraining/python/orttraining_pybind_state.cc index 515be8dce1..9c31640771 100644 --- a/orttraining/orttraining/python/orttraining_pybind_state.cc +++ b/orttraining/orttraining/python/orttraining_pybind_state.cc @@ -228,13 +228,13 @@ TrainingConfigurationResult ConfigureSessionForTraining( config.graph_transformer_config.number_recompute_layers = parameters.number_recompute_layers; if (!parameters.model_after_graph_transforms_path.empty()) { - config.model_after_graph_transforms_path = parameters.model_after_graph_transforms_path; + config.model_after_graph_transforms_path = ToPathString(parameters.model_after_graph_transforms_path); } if (!parameters.model_with_gradient_graph_path.empty()) { - config.model_with_gradient_graph_path = parameters.model_with_gradient_graph_path; + config.model_with_gradient_graph_path = ToPathString(parameters.model_with_gradient_graph_path); } if (!parameters.model_with_training_graph_path.empty()) { - config.model_with_training_graph_path = parameters.model_with_training_graph_path; + config.model_with_training_graph_path = ToPathString(parameters.model_with_training_graph_path); } training::PipelineTrainingSession::TrainingConfigurationResult config_result{}; From b57a7f4de3e60557d8774a5803fc40bd55c23180 Mon Sep 17 00:00:00 2001 From: Ryan Lai Date: Fri, 5 Feb 2021 13:06:02 -0800 
Subject: [PATCH 34/41] Delay load dxcore in winml model tests --- cmake/winml_unittests.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/winml_unittests.cmake b/cmake/winml_unittests.cmake index 3a2620c65e..b3ebaff1bf 100644 --- a/cmake/winml_unittests.cmake +++ b/cmake/winml_unittests.cmake @@ -279,6 +279,9 @@ if(NOT onnxruntime_ENABLE_MEMLEAK_CHECKER) SOURCES ${winml_test_model_src} LIBS winml_test_common ${winml_test_model_libs} ) + if (EXISTS ${dxcore_header}) + target_delayload(winml_test_model ext-ms-win-dxcore-l1-*.dll) + endif() target_precompiled_header(winml_test_model testPch.h) endif() From dbe31361bc111e83fbf1a8c9560f2b8394ae8938 Mon Sep 17 00:00:00 2001 From: Adam Pocock Date: Fri, 5 Feb 2021 14:55:34 -0500 Subject: [PATCH 35/41] Fix build.gradle so it always targets Java 8 class files. --- java/build.gradle | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/java/build.gradle b/java/build.gradle index 21af04353b..d519cdcded 100644 --- a/java/build.gradle +++ b/java/build.gradle @@ -78,6 +78,23 @@ spotless { compileJava { dependsOn spotlessJava options.compilerArgs += ["-h", "${project.buildDir}/headers/"] + if (!JavaVersion.current().isJava8()) { + // Ensures only methods present in Java 8 are used + options.compilerArgs.addAll(['--release', '8']) + // Gradle versions before 6.6 require that these flags are unset when using "-release" + java.sourceCompatibility = null + java.targetCompatibility = null + } +} + +compileTestJava { + if (!JavaVersion.current().isJava8()) { + // Ensures only methods present in Java 8 are used + options.compilerArgs.addAll(['--release', '8']) + // Gradle versions before 6.6 require that these flags are unset when using "-release" + java.sourceCompatibility = null + java.targetCompatibility = null + } } sourceSets.test { From d18aa45b4622c569a1ecaf601b5d0514f5fea645 Mon Sep 17 00:00:00 2001 From: Jesse Benson Date: Fri, 5 Feb 2021 18:07:30 -0800 Subject: [PATCH 36/41] Enable more ROCM ops 
that are sharing CUDA code. Some are needed for Turing NLG models. --- .../providers/rocm/rocm_execution_provider.cc | 54 +++++++++---------- .../rocm/rocm_training_kernels.cc | 6 +-- tools/ci_build/amd_hipify.py | 8 --- .../github/pai/pai-excluded-tests.txt | 9 ---- 4 files changed, 30 insertions(+), 47 deletions(-) diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc index eccd3afc02..e07de1fc34 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc @@ -570,8 +570,8 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kO class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 2, 10, float, Pad); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 2, 10, double, Pad); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 2, 10, MLFloat16, Pad); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, 4, Reshape); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 5, 12, Reshape); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, 4, Reshape_1); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, 12, Shape); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, 12, Size); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 6, 12, Tile); @@ -1255,12 +1255,12 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) { // BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, - // BuildKernelCreateInfo, BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, @@ -1512,13 +1512,13 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) { // BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, @@ -1584,21 +1584,21 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) { // BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, diff --git a/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc b/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc index 
fc5e6d32e7..5497666ada 100644 --- a/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc +++ b/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc @@ -235,9 +235,9 @@ Status RegisterRocmTrainingKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // P2P communication operators. #if defined(ORT_USE_NCCL) || defined(USE_MPI) diff --git a/tools/ci_build/amd_hipify.py b/tools/ci_build/amd_hipify.py index 92b422142f..39ce3df555 100644 --- a/tools/ci_build/amd_hipify.py +++ b/tools/ci_build/amd_hipify.py @@ -178,11 +178,6 @@ provider_excluded_files = [ 'tensor/reverse_sequence.h', 'tensor/reverse_sequence_impl.cu', 'tensor/reverse_sequence_impl.h', - 'tensor/size.cc', - 'tensor/tile.cc', - 'tensor/tile.h', - 'tensor/tile_impl.cu', - 'tensor/tile_impl.h', 'tensor/transpose.cc', 'tensor/transpose.h', 'tensor/upsample.cc', @@ -238,9 +233,6 @@ training_ops_excluded_files = [ 'math/div_grad.h', 'math/div_grad_impl.cu', 'math/div_grad_impl.h', - 'math/scale.cc', - 'math/scale.cu', - 'math/scale.h', 'math/softmax_grad_impl.cu', 'math/softmax_grad.cc', 'nn/batch_norm_grad.cc', diff --git a/tools/ci_build/github/pai/pai-excluded-tests.txt b/tools/ci_build/github/pai/pai-excluded-tests.txt index 98b0269869..fa0583728e 100644 --- a/tools/ci_build/github/pai/pai-excluded-tests.txt +++ b/tools/ci_build/github/pai/pai-excluded-tests.txt @@ -20,13 +20,6 @@ CudaKernelTest.SparseSoftmaxCrossEntropy_LargeSizeTensor CudaKernelTest.NegativeLogLikelihoodLoss_TinySizeTensor CudaKernelTest.NegativeLogLikelihoodLoss_SmallSizeTensor CudaKernelTest.NegativeLogLikelihoodLoss_MediumSizeTensor -CudaKernelTest.ReduceSum_SmallTensorTrailingAxes -CudaKernelTest.ReduceSum_MidTensorTrailingAxes -CudaKernelTest.ReduceSum_LargeTensorTrailingAxes 
-CudaKernelTest.ScaleHalfHalfScaleUp -CudaKernelTest.ScaleHalfInt64ScaleUp -CudaKernelTest.ScaleHalfHalfScaleDown -CudaKernelTest.ScaleHalfInt64ScaleDown ReductionOpTest.ReductionVariationTest ReductionOpTest.ReduceL1_default_axes_keepdims ReductionOpTest.ReduceL1_do_not_keep_dims @@ -106,8 +99,6 @@ GradientCheckerTest.SubGrad GradientCheckerTest.MulGrad GradientCheckerTest.MatMulGrad GradientCheckerTest.ReduceMeanGrad -GradientCheckerTest.ReduceSumGrad -GradientCheckerTest.ReduceLogSumExpGrad GradientCheckerTest.ReduceL2Grad GradientCheckerTest.SoftmaxCrossEntropyGrad GradientCheckerTest.ExpandGrad From c86c21e002115ded3aad590b0abaf02688c2f446 Mon Sep 17 00:00:00 2001 From: Weixing Zhang Date: Sat, 6 Feb 2021 15:54:29 -0800 Subject: [PATCH 37/41] Generate error when an explicit stream argument is not provided in the <<<...>>> kernel launch syntax (#6599) * Generate error when an explicit stream argument is not provided in the <<<...>>> kernel launch syntax Co-authored-by: Weixing Zhang --- cmake/CMakeLists.txt | 3 +++ onnxruntime/test/shared_lib/cuda_ops.cu | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index d7e5e2b9e3..f101f17480 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1199,6 +1199,9 @@ if (onnxruntime_USE_CUDA) endif() endif() set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") + if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --Werror default-stream-launch") + endif() if (NOT WIN32) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --compiler-options -fPIC") endif() diff --git a/onnxruntime/test/shared_lib/cuda_ops.cu b/onnxruntime/test/shared_lib/cuda_ops.cu index 9a3fedf3d3..4d3e10543b 100644 --- a/onnxruntime/test/shared_lib/cuda_ops.cu +++ b/onnxruntime/test/shared_lib/cuda_ops.cu @@ -15,7 +15,7 @@ __global__ void cuda_add_impl(int64_t N, float* O, const float* X, const float* } void cuda_add(int64_t N, 
float* O, const float* X, const float* Y) { - cuda_add_impl<<<1, 256>>>(N, O, X, Y); + cuda_add_impl<<<1, 256, 0, 0>>>(N, O, X, Y); } template @@ -28,7 +28,7 @@ __global__ void cuda_slice_impl(const T* X , int64_t from, int64_t to, T* Y) { template void cuda_slice(const T* X, int64_t from, int64_t to, T* Y) { - cuda_slice_impl<<<1, 256>>>(X, from, to, Y); + cuda_slice_impl<<<1, 256, 0, 0>>>(X, from, to, Y); } template void cuda_slice(const float*, int64_t, int64_t, float*); From 190b90a6829b458c32533790ea30cd6238b71d4b Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Mon, 8 Feb 2021 07:11:26 +1000 Subject: [PATCH 38/41] Fix some coding conventions issues (#6583) Fix some coding conventions issues Use #define for types that Cast supports --- .../core/providers/cpu/tensor/cast_op.cc | 30 ++++++------------- .../core/providers/cpu/tensor/transpose.cc | 6 ++-- 2 files changed, 12 insertions(+), 24 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/cast_op.cc b/onnxruntime/core/providers/cpu/tensor/cast_op.cc index eb016febc3..f82fa8f060 100644 --- a/onnxruntime/core/providers/cpu/tensor/cast_op.cc +++ b/onnxruntime/core/providers/cpu/tensor/cast_op.cc @@ -16,6 +16,7 @@ #include "core/framework/op_kernel.h" #include "core/providers/cpu/tensor/utils.h" #include "core/providers/op_kernel_type_control.h" +#include "core/providers/op_kernel_type_control_utils.h" #include "core/util/math_cpuonly.h" #include "Eigen/src/Core/arch/Default/BFloat16.h" @@ -31,21 +32,11 @@ namespace op_kernel_type_control { // we're using one set of types for all opsets of Cast ORT_SPECIFY_OP_KERNEL_ARG_SUPPORTED_TYPES_ALL_OPSETS( kCpuExecutionProvider, kOnnxDomain, Cast, Input, 0, - bool, - float, double, - uint8_t, uint16_t, uint32_t, uint64_t, - int8_t, int16_t, int32_t, int64_t, - MLFloat16, BFloat16, - std::string); + ORT_OP_KERNEL_TYPE_CTRL_ALL_TENSOR_DATA_TYPES); ORT_SPECIFY_OP_KERNEL_ARG_SUPPORTED_TYPES_ALL_OPSETS( kCpuExecutionProvider, kOnnxDomain, Cast, Output, 0, - bool, - 
float, double, - uint8_t, uint16_t, uint32_t, uint64_t, - int8_t, int16_t, int32_t, int64_t, - MLFloat16, BFloat16, - std::string); + ORT_OP_KERNEL_TYPE_CTRL_ALL_TENSOR_DATA_TYPES); } // namespace op_kernel_type_control namespace { @@ -313,11 +304,8 @@ Status Cast::Compute(OpKernelContext* context) const { return Status::OK(); } -const std::vector castSrcTypeConstraints = - BuildKernelDefConstraintsFunctorFromTypeList{}(); - -const std::vector castDstTypeConstraints = - BuildKernelDefConstraintsFunctorFromTypeList{}(); +const std::vector src_type_constraints = BuildKernelDefConstraintsFunctorFromTypeList{}(); +const std::vector dst_type_constraints = BuildKernelDefConstraintsFunctorFromTypeList{}(); } // namespace @@ -326,8 +314,8 @@ ONNX_CPU_OPERATOR_VERSIONED_KERNEL( 6, 12, KernelDefBuilder() - .TypeConstraint("T1", castSrcTypeConstraints) - .TypeConstraint("T2", castDstTypeConstraints) + .TypeConstraint("T1", src_type_constraints) + .TypeConstraint("T2", dst_type_constraints) .MayInplace(0, 0), // allocation planner will check input and output sizes match before inplacing Cast); @@ -335,8 +323,8 @@ ONNX_CPU_OPERATOR_KERNEL( Cast, 13, KernelDefBuilder() - .TypeConstraint("T1", castSrcTypeConstraints) - .TypeConstraint("T2", castDstTypeConstraints) + .TypeConstraint("T1", src_type_constraints) + .TypeConstraint("T2", dst_type_constraints) .MayInplace(0, 0), // allocation planner will check input and output sizes match before inplacing Cast); diff --git a/onnxruntime/core/providers/cpu/tensor/transpose.cc b/onnxruntime/core/providers/cpu/tensor/transpose.cc index 482dd019c7..2c3f2d6478 100644 --- a/onnxruntime/core/providers/cpu/tensor/transpose.cc +++ b/onnxruntime/core/providers/cpu/tensor/transpose.cc @@ -23,7 +23,7 @@ namespace { using EnabledDataTypes = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST_ALL_OPSETS(kCpuExecutionProvider, kOnnxDomain, Transpose, Input, 0); -const std::vector dataTypeConstraints = BuildKernelDefConstraintsFunctorFromTypeList{}(); +const 
std::vector type_constraints = BuildKernelDefConstraintsFunctorFromTypeList{}(); } // namespace /* A permutation [a,b,c,...] indicates that @@ -725,13 +725,13 @@ ONNX_CPU_OPERATOR_VERSIONED_KERNEL( Transpose, 1, 12, - KernelDefBuilder().TypeConstraint("T", dataTypeConstraints), + KernelDefBuilder().TypeConstraint("T", type_constraints), Transpose); ONNX_CPU_OPERATOR_KERNEL( Transpose, 13, - KernelDefBuilder().TypeConstraint("T", dataTypeConstraints), + KernelDefBuilder().TypeConstraint("T", type_constraints), Transpose); } // namespace onnxruntime From 19c130f561de5144a0c97d71373d7232938a202e Mon Sep 17 00:00:00 2001 From: Edward Chen <18449977+edgchen1@users.noreply.github.com> Date: Sun, 7 Feb 2021 13:20:53 -0800 Subject: [PATCH 39/41] Reduce CastMLFloat16ThroughFloat size (Scott's suggested changes), fix unused function warning. (#6597) --- .../core/providers/cpu/tensor/cast_op.cc | 96 +++++++++---------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/cast_op.cc b/onnxruntime/core/providers/cpu/tensor/cast_op.cc index f82fa8f060..99c438ef8d 100644 --- a/onnxruntime/core/providers/cpu/tensor/cast_op.cc +++ b/onnxruntime/core/providers/cpu/tensor/cast_op.cc @@ -45,6 +45,9 @@ using EnabledSrcTypes = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST_ALL_OPSETS(kCpuExecu using EnabledDstTypes = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST_ALL_OPSETS(kCpuExecutionProvider, kOnnxDomain, Cast, Output, 0); +template +using IsOrtFloat16Type = boost::mp11::mp_contains, T>; + // string cast helpers // Note: when C++17 is available, use functions @@ -94,17 +97,14 @@ CastToString(const SrcType& input, std::string& output) { } template -typename std::enable_if::value, void>::type +typename std::enable_if::value, void>::type CastToString(const SrcType& input, std::string& output) { output = std::to_string(input); } -// overloads for MLFloat16 and BFloat16 -void CastToString(const MLFloat16& input, std::string& output) { - 
CastToString(static_cast(input), output); -} - -void CastToString(const BFloat16& input, std::string& output) { +template +typename std::enable_if::value, void>::type +CastToString(const SrcType& input, std::string& output) { CastToString(static_cast(input), output); } @@ -132,17 +132,12 @@ CastFromString(const std::string& input, DstType& output) { output = gsl::narrow_cast(std::stoll(input)); } -// overloads for MLFloat16 and BFloat16 -void CastFromString(const std::string& input, MLFloat16& output) { +template +typename std::enable_if::value, void>::type +CastFromString(const std::string& input, DstType& output) { float intermediate; CastFromString(input, intermediate); - output = static_cast(intermediate); -} - -void CastFromString(const std::string& input, BFloat16& output) { - float intermediate; - CastFromString(input, intermediate); - output = static_cast(intermediate); + output = static_cast(intermediate); } // type that is usable with Eigen cast @@ -151,7 +146,7 @@ struct EigenCastType { using type = T; }; -// ORT float16 types don't support casting, so map them to Eigen ones +// ORT float16 types don't support Eigen cast, so map them to Eigen ones template <> struct EigenCastType { @@ -166,7 +161,7 @@ struct EigenCastType { // generic tensor X -> Y template struct TensorCaster { - void Cast(const OpKernelContext&, const Tensor& in, Tensor& out, const TensorShape& shape) const { + void Cast(const OpKernelContext&, const TensorShape& shape, const Tensor& in, Tensor& out) const { using SrcEigenCastType = typename EigenCastType::type; using DstEigenCastType = typename EigenCastType::type; @@ -182,7 +177,7 @@ struct TensorCaster { // tensor X -> string template struct TensorCaster { - void Cast(const OpKernelContext&, const Tensor& in, Tensor& out, const TensorShape& shape) const { + void Cast(const OpKernelContext&, const TensorShape& shape, const Tensor& in, Tensor& out) const { const std::ptrdiff_t shape_size = gsl::narrow(shape.Size()); const auto* 
in_data = in.Data(); auto* out_data = out.MutableData(); @@ -195,7 +190,7 @@ struct TensorCaster { // tensor string -> X template struct TensorCaster { - void Cast(const OpKernelContext&, const Tensor& in, Tensor& out, const TensorShape& shape) const { + void Cast(const OpKernelContext&, const TensorShape& shape, const Tensor& in, Tensor& out) const { const std::ptrdiff_t shape_size = gsl::narrow(shape.Size()); const auto* in_data = in.Data(); auto* out_data = out.MutableData(); @@ -209,30 +204,10 @@ struct TensorCaster { // specializations to use optimized and Windows x64-specific // MlasConvertHalfToFloatBuffer() routine for MLFloat16 -> float conversion -template -void CastMLFloat16ThroughFloat( - const OpKernelContext& context, const Tensor& in, Tensor& out, const TensorShape& shape) { - // use optimized MLFloat16 -> float, then float -> DstType - AllocatorPtr allocator; - ORT_THROW_IF_ERROR(context.GetTempSpaceAllocator(&allocator)); - auto intermediate_buffer = IAllocator::MakeUniquePtr(allocator, gsl::narrow(shape.Size())); - Tensor intermediate_tensor{DataTypeImpl::GetType(), shape, intermediate_buffer.get(), allocator->Info()}; - TensorCaster{}.Cast(context, in, intermediate_tensor, shape); - TensorCaster{}.Cast(context, intermediate_tensor, out, shape); -} - -// tensor MLFloat16 -> X -template -struct TensorCaster { - void Cast(const OpKernelContext& context, const Tensor& in, Tensor& out, const TensorShape& shape) const { - CastMLFloat16ThroughFloat(context, in, out, shape); - } -}; - // tensor MLFloat16 -> float template <> struct TensorCaster { - void Cast(const OpKernelContext&, const Tensor& in, Tensor& out, const TensorShape& shape) const { + void Cast(const OpKernelContext&, const TensorShape& shape, const Tensor& in, Tensor& out) const { auto out_data = out.MutableData(); auto in_data = in.Data(); const size_t shape_size = gsl::narrow(shape.Size()); @@ -240,11 +215,36 @@ struct TensorCaster { } }; +Tensor GetIntermediateMLFloat16ToFloatTensor( + 
const OpKernelContext& context, const TensorShape& shape, const Tensor& in) { + AllocatorPtr allocator; + ORT_THROW_IF_ERROR(context.GetTempSpaceAllocator(&allocator)); + Tensor out{DataTypeImpl::GetType(), shape, allocator}; + TensorCaster{}.Cast(context, shape, in, out); + return out; +} + +template +void CastMLFloat16ThroughFloatTensor( + const OpKernelContext& context, const TensorShape& shape, const Tensor& in, Tensor& out) { + // use optimized MLFloat16 -> float, then float -> DstType + Tensor intermediate_tensor = GetIntermediateMLFloat16ToFloatTensor(context, shape, in); + TensorCaster{}.Cast(context, shape, intermediate_tensor, out); +} + +// tensor MLFloat16 -> X +template +struct TensorCaster { + void Cast(const OpKernelContext& context, const TensorShape& shape, const Tensor& in, Tensor& out) const { + CastMLFloat16ThroughFloatTensor(context, shape, in, out); + } +}; + // tensor MLFloat16 -> string template <> struct TensorCaster { - void Cast(const OpKernelContext& context, const Tensor& in, Tensor& out, const TensorShape& shape) const { - CastMLFloat16ThroughFloat(context, in, out, shape); + void Cast(const OpKernelContext& context, const TensorShape& shape, const Tensor& in, Tensor& out) const { + CastMLFloat16ThroughFloatTensor(context, shape, in, out); } }; #endif @@ -266,18 +266,18 @@ class Cast final : public OpKernel { template struct Dispatcher { - void operator()(const OpKernelContext& context, const Tensor& src, Tensor& dst, const TensorShape& shape) { - TensorCaster{}.Cast(context, src, dst, shape); + void operator()(const OpKernelContext& context, const TensorShape& shape, const Tensor& src, Tensor& dst) { + TensorCaster{}.Cast(context, shape, src, dst); } }; template struct SrcDispatcher { void operator()( - int32_t to, const OpKernelContext& context, const Tensor& src, Tensor& dst, const TensorShape& shape) { + int32_t to, const OpKernelContext& context, const TensorShape& shape, const Tensor& src, Tensor& dst) { using DstTypes = 
boost::mp11::mp_remove_if_q>; utils::MLTypeCallDispatcherFromTypeList dispatcher{to}; - dispatcher.template InvokeWithLeadingTemplateArgs>(context, src, dst, shape); + dispatcher.template InvokeWithLeadingTemplateArgs>(context, shape, src, dst); } }; @@ -299,7 +299,7 @@ Status Cast::Compute(OpKernelContext* context) const { } utils::MLTypeCallDispatcherFromTypeList dispatcher{from}; - dispatcher.Invoke(to_, *context, *X, *Y, shape); + dispatcher.Invoke(to_, *context, shape, *X, *Y); return Status::OK(); } From 1dd920fa7cac2d2ce8a0b497f2f32bd4a2b099fc Mon Sep 17 00:00:00 2001 From: nietras Date: Mon, 8 Feb 2021 05:09:30 +0100 Subject: [PATCH 40/41] Fix TensorRT unnecessary file cache operations (#6601) * Fix TensorRT unnecessary file cache operations * fix compile --- .../tensorrt/tensorrt_execution_provider.cc | 56 ++++++++++--------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index ea2d88a749..4e065fa6a4 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -1210,34 +1210,36 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& fuse const std::string cache_path = GetCachePath(trt_state->engine_cache_path, trt_state->trt_node_name_with_precision); const std::string engine_cache_path = cache_path + ".engine"; const std::string profile_cache_path = cache_path + ".profile"; - std::ifstream engine_file(engine_cache_path, std::ios::binary | std::ios::in); - std::ifstream profile_file(profile_cache_path, std::ios::binary | std::ios::in); - if (engine_file && profile_file && (trt_state->engine_cache_enable && trt_engine == nullptr)) { - // Deserialize profile - shape_ranges = DeserializeProfile(profile_file); - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + profile_cache_path; - // 
Deserialize engine - trt_state->context->reset(); - trt_state->engine->reset(); - engine_file.seekg(0, std::ios::end); - int engine_size = engine_file.tellg(); - engine_file.seekg(0, std::ios::beg); - std::unique_ptr engine_buf{new char[engine_size]}; - engine_file.read((char*)engine_buf.get(), engine_size); - auto runtime_ = trt_state->runtime; - *(trt_state->engine) = tensorrt_ptr::unique_pointer( - runtime_->deserializeCudaEngine(engine_buf.get(), engine_size, nullptr)); - if (trt_state->engine->get() == nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine."); + if ((trt_state->engine_cache_enable && trt_engine == nullptr)) { + std::ifstream engine_file(engine_cache_path, std::ios::binary | std::ios::in); + std::ifstream profile_file(profile_cache_path, std::ios::binary | std::ios::in); + if (engine_file && profile_file) { + // Deserialize profile + shape_ranges = DeserializeProfile(profile_file); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + profile_cache_path; + // Deserialize engine + trt_state->context->reset(); + trt_state->engine->reset(); + engine_file.seekg(0, std::ios::end); + int engine_size = engine_file.tellg(); + engine_file.seekg(0, std::ios::beg); + std::unique_ptr engine_buf{new char[engine_size]}; + engine_file.read((char*)engine_buf.get(), engine_size); + auto runtime_ = trt_state->runtime; + *(trt_state->engine) = tensorrt_ptr::unique_pointer( + runtime_->deserializeCudaEngine(engine_buf.get(), engine_size, nullptr)); + if (trt_state->engine->get() == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine."); + } + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path; + trt_engine = trt_state->engine->get(); + *(trt_state->context) = tensorrt_ptr::unique_pointer( + trt_state->engine->get()->createExecutionContext()); + if (trt_state->context->get() == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to 
create context."); + } + trt_context = trt_state->context->get(); } - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path; - trt_engine = trt_state->engine->get(); - *(trt_state->context) = tensorrt_ptr::unique_pointer( - trt_state->engine->get()->createExecutionContext()); - if (trt_state->context->get() == nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to create context."); - } - trt_context = trt_state->context->get(); } for (int i = 0, end = num_inputs; i < end; ++i) { From 67ef6b1aa63ae518b98da27086fa312cf0442c57 Mon Sep 17 00:00:00 2001 From: Pranav Sharma Date: Mon, 8 Feb 2021 08:59:18 -0800 Subject: [PATCH 41/41] [Mult-GPU inferencing] Add new API to get/set device id. Set correct device id in cuda allocator. (#6592) --- .../core/session/onnxruntime_c_api.h | 11 ++++++++ .../core/providers/cuda/cuda_allocator.cc | 17 +++++++++++ .../core/providers/cuda/cuda_allocator.h | 1 + .../providers/cuda/cuda_provider_factory.cc | 28 +++++++++++++++++++ onnxruntime/core/session/onnxruntime_c_api.cc | 13 +++++++++ onnxruntime/core/session/ort_apis.h | 2 ++ 6 files changed, 72 insertions(+) diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index b0985608fc..3a602b195a 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -1163,6 +1163,17 @@ struct OrtApi { */ ORT_API2_STATUS(SessionOptionsAppendExecutionProvider_TensorRT, _In_ OrtSessionOptions* options, _In_ const OrtTensorRTProviderOptions* tensorrt_options); + + /** + * Set the current device id of the GPU execution provider (cuda/tensorrt/rocm). The device id should be less + * than the total number of devices available. Using this API makes sense only when doing multi-GPU inferencing. 
+ */ + ORT_API2_STATUS(SetCurrentGpuDeviceId, _In_ int device_id); + + /** + * Get the current device id of the GPU execution provider (cuda/tensorrt/rocm). + */ + ORT_API2_STATUS(GetCurrentGpuDeviceId, _In_ int* device_id); }; /* diff --git a/onnxruntime/core/providers/cuda/cuda_allocator.cc b/onnxruntime/core/providers/cuda/cuda_allocator.cc index d8eacc6ef1..03f98349c6 100644 --- a/onnxruntime/core/providers/cuda/cuda_allocator.cc +++ b/onnxruntime/core/providers/cuda/cuda_allocator.cc @@ -32,7 +32,23 @@ void CUDAAllocator::CheckDevice(bool throw_when_fail) const { #endif } +void CUDAAllocator::SetDevice(bool throw_when_fail) const { + int current_device; + auto cuda_err = cudaGetDevice(&current_device); + if (cuda_err == cudaSuccess) { + int allocator_device_id = Info().id; + if (current_device != allocator_device_id) { + cuda_err = cudaSetDevice(allocator_device_id); + } + } + + if (cuda_err != cudaSuccess && throw_when_fail) { + CUDA_CALL_THROW(cuda_err); + } +} + void* CUDAAllocator::Alloc(size_t size) { + SetDevice(true); CheckDevice(true); void* p = nullptr; if (size > 0) { @@ -43,6 +59,7 @@ void* CUDAAllocator::Alloc(size_t size) { } void CUDAAllocator::Free(void* p) { + SetDevice(false); CheckDevice(false); // ignore CUDA failure when free cudaFree(p); // do not throw error since it's OK for cudaFree to fail during shutdown } diff --git a/onnxruntime/core/providers/cuda/cuda_allocator.h b/onnxruntime/core/providers/cuda/cuda_allocator.h index 6c40a0d0af..9bbb5e2b74 100644 --- a/onnxruntime/core/providers/cuda/cuda_allocator.h +++ b/onnxruntime/core/providers/cuda/cuda_allocator.h @@ -20,6 +20,7 @@ class CUDAAllocator : public IAllocator { private: void CheckDevice(bool throw_when_fail) const; + void SetDevice(bool throw_when_fail) const; }; //TODO: add a default constructor diff --git a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc index 6a5e8fd8a0..1300703839 100644 --- 
a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc +++ b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc @@ -63,3 +63,31 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_CUDA, return nullptr; } + +ORT_API_STATUS_IMPL(OrtApis::SetCurrentGpuDeviceId, _In_ int device_id) { + int num_devices; + auto cuda_err = cudaGetDeviceCount(&num_devices); + if (cuda_err != cudaSuccess) { + return CreateStatus(ORT_FAIL, "Failed to set device id since cudaGetDeviceCount failed."); + } + + if (device_id >= num_devices) { + std::ostringstream ostr; + ostr << "Invalid device id. Device id should be less than total number of devices (" << num_devices << ")"; + return CreateStatus(ORT_INVALID_ARGUMENT, ostr.str().c_str()); + } + + cuda_err = cudaSetDevice(device_id); + if (cuda_err != cudaSuccess) { + return CreateStatus(ORT_FAIL, "Failed to set device id."); + } + return nullptr; +} + +ORT_API_STATUS_IMPL(OrtApis::GetCurrentGpuDeviceId, _In_ int* device_id) { + auto cuda_err = cudaGetDevice(device_id); + if (cuda_err != cudaSuccess) { + return CreateStatus(ORT_FAIL, "Failed to get device id."); + } + return nullptr; +} diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 3bc51f11da..3fa6b01567 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -1823,6 +1823,16 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_CUDA, ORT_UNUSED_PARAMETER(cuda_options); return CreateStatus(ORT_FAIL, "CUDA execution provider is not enabled."); } + +ORT_API_STATUS_IMPL(OrtApis::SetCurrentGpuDeviceId, _In_ int device_id) { + ORT_UNUSED_PARAMETER(device_id); + return CreateStatus(ORT_FAIL, "CUDA execution provider is not enabled."); +} + +ORT_API_STATUS_IMPL(OrtApis::GetCurrentGpuDeviceId, _In_ int* device_id) { + ORT_UNUSED_PARAMETER(device_id); + return CreateStatus(ORT_FAIL, "CUDA execution provider is not enabled."); +} #endif 
#if defined(ORT_MINIMAL_BUILD) @@ -2093,7 +2103,10 @@ static constexpr OrtApi ort_api_1_to_7 = { // Version 7 - In development, feel free to add/remove/rearrange here &OrtApis::ModelMetadataGetGraphDescription, + &OrtApis::SessionOptionsAppendExecutionProvider_TensorRT, + &OrtApis::SetCurrentGpuDeviceId, + &OrtApis::GetCurrentGpuDeviceId, }; // Assert to do a limited check to ensure Version 1 of OrtApi never changes (will detect an addition or deletion but not if they cancel out each other) diff --git a/onnxruntime/core/session/ort_apis.h b/onnxruntime/core/session/ort_apis.h index 2418ff8909..10ab7328f3 100644 --- a/onnxruntime/core/session/ort_apis.h +++ b/onnxruntime/core/session/ort_apis.h @@ -257,4 +257,6 @@ ORT_API_STATUS_IMPL(CreateArenaCfg, _In_ size_t max_mem, int arena_extend_strate ORT_API(void, ReleaseArenaCfg, _Frees_ptr_opt_ OrtArenaCfg*); ORT_API_STATUS_IMPL(SessionOptionsAppendExecutionProvider_TensorRT, _In_ OrtSessionOptions* options, _In_ const OrtTensorRTProviderOptions* tensorrt_options); +ORT_API_STATUS_IMPL(SetCurrentGpuDeviceId, _In_ int device_id); +ORT_API_STATUS_IMPL(GetCurrentGpuDeviceId, _In_ int* device_id); } // namespace OrtApis