Support multiple dynamic inputs in custom ops (#6666)

This commit is contained in:
Hariharan Seshadri 2021-02-16 20:54:30 -08:00 committed by GitHub
parent 02c7873b0e
commit aa2622efb2
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 181 additions and 20 deletions

View file

@ -101,6 +101,9 @@ common::Status CreateCustomRegistry(const std::vector<OrtCustomOpDomain*>& op_do
for (const auto& domain : op_domains) {
// Create an OpSchema for each op and register them
// Container to hold type template parameters
std::unordered_map<const OrtCustomOp*, std::vector<std::string>> type_constraint_ids;
#if !defined(ORT_MINIMAL_BUILD)
// Domain is not empty - add it to the DomainToVersion ONNX map
// If domain is empty, it is assumed to be part of the ONNX domain
@ -119,25 +122,36 @@ common::Status CreateCustomRegistry(const std::vector<OrtCustomOpDomain*>& op_do
for (const auto* op : domain->custom_ops_) {
ONNX_NAMESPACE::OpSchema schema(op->GetName(op), "custom op registered at runtime", 0);
size_t type_id_counter = 0;
auto input_count = op->GetInputTypeCount(op);
for (size_t i = 0; i < input_count; i++) {
auto type = op->GetInputType(op, i);
schema.Input(i, "Input" + std::to_string(i), "",
ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED == type
? "T"
: DataTypeImpl::ToString(onnxruntime::DataTypeImpl::TensorTypeFromONNXEnum(type)));
if (ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED == type) { // Dynamic typed input
schema.Input(i, "Input" + std::to_string(i), "", "T" + std::to_string(type_id_counter));
schema.TypeConstraint("T" + std::to_string(type_id_counter), DataTypeImpl::ToString(DataTypeImpl::AllTensorTypes()), "all types");
type_constraint_ids[op].push_back("T" + std::to_string(type_id_counter++));
} else {
schema.Input(i, "Input" + std::to_string(i), "", DataTypeImpl::ToString(onnxruntime::DataTypeImpl::TensorTypeFromONNXEnum(type)));
}
}
auto output_count = op->GetOutputTypeCount(op);
for (size_t i = 0; i < output_count; i++) {
auto type = op->GetOutputType(op, i);
schema.Output(i, "Output" + std::to_string(i), "",
ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED == type
? "T"
: DataTypeImpl::ToString(onnxruntime::DataTypeImpl::TensorTypeFromONNXEnum(type)));
if (ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED == type) { // Dynamic typed output
ORT_ENFORCE(type_id_counter == 1,
"There must be one (and only one) dynamic typed input to the custom op. "
"Its type info at runtime will be used to infer the type info of this dynamic typed output "
"which is required for the success of the model loading step. "
"More than one dynamic typed inputs are currently not supported as differing types at runtime means the output type "
"cannot be inferred without which model loading cannot proceed.");
schema.Output(i, "Output" + std::to_string(i), "", "T0");
} else {
schema.Output(i, "Output" + std::to_string(i), "", DataTypeImpl::ToString(onnxruntime::DataTypeImpl::TensorTypeFromONNXEnum(type)));
}
}
schema.TypeConstraint("T", DataTypeImpl::ToString(DataTypeImpl::AllTensorTypes()), "all types");
schema.SetDomain(domain->domain_);
schema.SinceVersion(1);
schema.AllowUncheckedAttributes();
@ -149,14 +163,31 @@ common::Status CreateCustomRegistry(const std::vector<OrtCustomOpDomain*>& op_do
1 /* baseline opset version */,
1000 /* opset version */));
#else
// For a minimal build, we may not need any of the ONNX schema stuff but we still need to track
// the type template parameters to be used during the kernel def building step below
for (const auto* op : domain->custom_ops_) {
size_t type_id_counter = 0;
auto input_count = op->GetInputTypeCount(op);
for (size_t i = 0; i < input_count; i++) {
auto type = op->GetInputType(op, i);
if (ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED == type) { // Dynamic typed input
type_constraint_ids[op].push_back("T" + std::to_string(type_id_counter++));
}
}
}
#endif
// create the KernelDef for each op and register it
for (const auto* op : domain->custom_ops_) {
KernelDefBuilder def_builder;
def_builder.SetName(op->GetName(op))
.SetDomain(domain->domain_)
.SinceVersion(1)
.TypeConstraint("T", DataTypeImpl::AllTensorTypes());
.SinceVersion(1);
for (auto& id : type_constraint_ids[op]) {
def_builder.TypeConstraint(id, DataTypeImpl::AllTensorTypes());
}
if (const char* provider_type = op->GetExecutionProviderType(op)) {
def_builder.Provider(provider_type);
@ -178,4 +209,3 @@ common::Status CreateCustomRegistry(const std::vector<OrtCustomOpDomain*>& op_do
} // namespace onnxruntime
#endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS)

View file

@ -7,29 +7,34 @@
using namespace std;
__global__ void cuda_add_impl(int64_t N, float* O, const float* X, const float* Y) {
template <typename T1, typename T2, typename T3>
__global__ void cuda_add_impl(int64_t N, T3* O, const T1* X, const T2* Y) {
auto offset = threadIdx.x;
if (offset < N) {
O[offset] = Y[offset] + X[offset];
}
}
void cuda_add(int64_t N, float* O, const float* X, const float* Y) {
template <typename T1, typename T2, typename T3>
void cuda_add(int64_t N, T3* O, const T1* X, const T2* Y) {
cuda_add_impl<<<1, 256, 0, 0>>>(N, O, X, Y);
}
template<typename T>
__global__ void cuda_slice_impl(const T* X , int64_t from, int64_t to, T* Y) {
template <typename T>
__global__ void cuda_slice_impl(const T* X, int64_t from, int64_t to, T* Y) {
auto offset = threadIdx.x;
if (offset >= from && offset < to) {
Y[offset - from] = X[offset];
}
}
template<typename T>
template <typename T>
void cuda_slice(const T* X, int64_t from, int64_t to, T* Y) {
cuda_slice_impl<T><<<1, 256, 0, 0>>>(X, from, to, Y);
cuda_slice_impl<T><<<1, 256, 0, 0>>>(X, from, to, Y);
}
template void cuda_slice(const float*, int64_t, int64_t, float*);
template void cuda_slice(const double*, int64_t, int64_t, double*);
template void cuda_add(int64_t, float*, const float*, const float*);
template void cuda_add(int64_t, float*, const float*, const double*);

View file

@ -164,6 +164,7 @@ static constexpr PATH_TYPE OVERRIDABLE_INITIALIZER_MODEL_URI = TSTR("testdata/ov
static constexpr PATH_TYPE NAMED_AND_ANON_DIM_PARAM_URI = TSTR("testdata/capi_symbolic_dims.onnx");
static constexpr PATH_TYPE MODEL_WITH_CUSTOM_MODEL_METADATA = TSTR("testdata/model_with_valid_ort_config_json.onnx");
static constexpr PATH_TYPE VARIED_INPUT_CUSTOM_OP_MODEL_URI = TSTR("testdata/VariedInputCustomOp.onnx");
static constexpr PATH_TYPE VARIED_INPUT_CUSTOM_OP_MODEL_URI_2 = TSTR("testdata/foo_3.onnx");
#ifdef ENABLE_LANGUAGE_INTEROP_OPS
static constexpr PATH_TYPE PYOP_FLOAT_MODEL_URI = TSTR("testdata/pyop_1.onnx");
@ -382,6 +383,66 @@ TEST(CApiTest, varied_input_custom_op_handler) {
#endif
}
TEST(CApiTest, multiple_varied_input_custom_op_handler) {
#ifdef USE_CUDA
MyCustomOpMultipleDynamicInputs custom_op{onnxruntime::kCudaExecutionProvider};
#else
MyCustomOpMultipleDynamicInputs custom_op{onnxruntime::kCpuExecutionProvider};
#endif
Ort::CustomOpDomain custom_op_domain("");
custom_op_domain.Add(&custom_op);
Ort::SessionOptions session_options;
#ifdef USE_CUDA
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
#endif
session_options.Add(custom_op_domain);
Ort::Session session(*ort_env, VARIED_INPUT_CUSTOM_OP_MODEL_URI_2, session_options);
Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);
std::vector<Ort::Value> ort_inputs;
std::vector<const char*> input_names;
// input 0 (float type)
input_names.emplace_back("X");
std::vector<float> input_0_data = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
std::vector<int64_t> input_0_dims = {3, 2};
ort_inputs.emplace_back(
Ort::Value::CreateTensor<float>(info, const_cast<float*>(input_0_data.data()),
input_0_data.size(), input_0_dims.data(), input_0_dims.size()));
// input 1 (double type)
input_names.emplace_back("W");
std::vector<double> input_1_data = {2, 3, 4, 5, 6, 7};
std::vector<int64_t> input_1_dims = {3, 2};
ort_inputs.emplace_back(
Ort::Value::CreateTensor<double>(info, const_cast<double*>(input_1_data.data()),
input_1_data.size(), input_1_dims.data(), input_1_dims.size()));
// Run
const char* output_name = "Y";
auto ort_outputs = session.Run(Ort::RunOptions{}, input_names.data(), ort_inputs.data(), ort_inputs.size(),
&output_name, 1);
ASSERT_EQ(ort_outputs.size(), 1u);
// Validate results
std::vector<int64_t> y_dims = {3, 2};
std::vector<float> values_y = {3.f, 5.f, 7.f, 9.f, 11.f, 13.f};
auto type_info = ort_outputs[0].GetTensorTypeAndShapeInfo();
ASSERT_EQ(type_info.GetShape(), y_dims);
size_t total_len = type_info.GetElementCount();
ASSERT_EQ(values_y.size(), total_len);
float* f = ort_outputs[0].GetTensorMutableData<float>();
for (size_t i = 0; i != total_len; ++i) {
ASSERT_EQ(values_y[i], f[i]);
}
}
// Tests registration of a custom op of the same name for both CPU and CUDA EPs
#ifdef USE_CUDA
TEST(CApiTest, RegisterCustomOpForCPUAndCUDA) {

View file

@ -5,7 +5,8 @@
#ifdef USE_CUDA
#include <cuda_runtime.h>
void cuda_add(int64_t, float*, const float*, const float*);
template <typename T1, typename T2, typename T3>
void cuda_add(int64_t, T3*, const T1*, const T2*);
#endif
void MyCustomKernel::Compute(OrtKernelContext* context) {
@ -34,3 +35,33 @@ void MyCustomKernel::Compute(OrtKernelContext* context) {
}
#endif
}
void MyCustomKernelMultipleDynamicInputs::Compute(OrtKernelContext* context) {
// Setup inputs
const OrtValue* input_X = ort_.KernelContext_GetInput(context, 0);
const OrtValue* input_Y = ort_.KernelContext_GetInput(context, 1);
// Even though this kernel backs an operator where-in both inputs can be any type and need not be homogeneous
// as a proof-of-concept, support the case where-in the first input is of float type and the second input
// is of double type. Users need to extend this logic to handle any arbitrary type should the need arise.
const float* X = ort_.GetTensorData<float>(input_X);
const double* Y = ort_.GetTensorData<double>(input_Y);
// Setup output
OrtTensorDimensions dimensions(ort_, input_X);
OrtValue* output = ort_.KernelContext_GetOutput(context, 0, dimensions.data(), dimensions.size());
float* out = ort_.GetTensorMutableData<float>(output);
OrtTensorTypeAndShapeInfo* output_info = ort_.GetTensorTypeAndShape(output);
int64_t size = ort_.GetTensorShapeElementCount(output_info);
ort_.ReleaseTensorTypeAndShapeInfo(output_info);
// Do computation
#ifdef USE_CUDA
cuda_add(size, out, X, Y);
cudaStreamSynchronize(nullptr);
#else
for (int64_t i = 0; i < size; i++) {
out[i] = static_cast<float>(X[i] + Y[i]);
}
#endif
}

View file

@ -35,7 +35,10 @@ struct MyCustomOp : Ort::CustomOpBase<MyCustomOp, MyCustomKernel> {
const char* GetExecutionProviderType() const { return provider_; };
size_t GetInputTypeCount() const { return 2; };
ONNXTensorElementDataType GetInputType(size_t /*index*/) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; };
ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
// Both the inputs need to be necessarily of float type
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
};
size_t GetOutputTypeCount() const { return 1; };
ONNXTensorElementDataType GetOutputType(size_t /*index*/) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; };
@ -43,3 +46,34 @@ struct MyCustomOp : Ort::CustomOpBase<MyCustomOp, MyCustomKernel> {
private:
const char* provider_;
};
struct MyCustomKernelMultipleDynamicInputs {
MyCustomKernelMultipleDynamicInputs(Ort::CustomOpApi ort, const OrtKernelInfo* /*info*/) : ort_(ort) {
}
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
};
struct MyCustomOpMultipleDynamicInputs : Ort::CustomOpBase<MyCustomOpMultipleDynamicInputs, MyCustomKernelMultipleDynamicInputs> {
explicit MyCustomOpMultipleDynamicInputs(const char* provider) : provider_(provider) {}
void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const { return new MyCustomKernelMultipleDynamicInputs(api, info); };
const char* GetName() const { return "Foo"; };
const char* GetExecutionProviderType() const { return provider_; };
size_t GetInputTypeCount() const { return 2; };
ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
// Both the inputs are dynamic typed (i.e.) they can be any type and need not be
// homogeneous
return ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED;
};
size_t GetOutputTypeCount() const { return 1; };
ONNXTensorElementDataType GetOutputType(size_t /*index*/) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; };
private:
const char* provider_;
};

Binary file not shown.

BIN
onnxruntime/test/testdata/foo_3.onnx vendored Normal file

Binary file not shown.