diff --git a/onnxruntime/core/session/custom_ops.cc b/onnxruntime/core/session/custom_ops.cc index 187e69b54d..bc4feafa58 100644 --- a/onnxruntime/core/session/custom_ops.cc +++ b/onnxruntime/core/session/custom_ops.cc @@ -101,6 +101,9 @@ common::Status CreateCustomRegistry(const std::vector& op_do for (const auto& domain : op_domains) { // Create an OpSchema for each op and register them + // Container to hold type template parameters + std::unordered_map> type_constraint_ids; + #if !defined(ORT_MINIMAL_BUILD) // Domain is not empty - add it to the DomainToVersion ONNX map // If domain is empty, it is assumed to be part of the ONNX domain @@ -119,25 +122,36 @@ common::Status CreateCustomRegistry(const std::vector& op_do for (const auto* op : domain->custom_ops_) { ONNX_NAMESPACE::OpSchema schema(op->GetName(op), "custom op registered at runtime", 0); + size_t type_id_counter = 0; auto input_count = op->GetInputTypeCount(op); for (size_t i = 0; i < input_count; i++) { auto type = op->GetInputType(op, i); - schema.Input(i, "Input" + std::to_string(i), "", - ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED == type - ? "T" - : DataTypeImpl::ToString(onnxruntime::DataTypeImpl::TensorTypeFromONNXEnum(type))); + if (ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED == type) { // Dynamic typed input + schema.Input(i, "Input" + std::to_string(i), "", "T" + std::to_string(type_id_counter)); + schema.TypeConstraint("T" + std::to_string(type_id_counter), DataTypeImpl::ToString(DataTypeImpl::AllTensorTypes()), "all types"); + type_constraint_ids[op].push_back("T" + std::to_string(type_id_counter++)); + } else { + schema.Input(i, "Input" + std::to_string(i), "", DataTypeImpl::ToString(onnxruntime::DataTypeImpl::TensorTypeFromONNXEnum(type))); + } } auto output_count = op->GetOutputTypeCount(op); for (size_t i = 0; i < output_count; i++) { auto type = op->GetOutputType(op, i); - schema.Output(i, "Output" + std::to_string(i), "", - ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED == type - ? "T" - : DataTypeImpl::ToString(onnxruntime::DataTypeImpl::TensorTypeFromONNXEnum(type))); + if (ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED == type) { // Dynamic typed output + ORT_ENFORCE(type_id_counter == 1, + "There must be one (and only one) dynamic typed input to the custom op. " + "Its type info at runtime will be used to infer the type info of this dynamic typed output " + "which is required for the success of the model loading step. " + "More than one dynamic typed inputs are currently not supported as differing types at runtime means the output type " + "cannot be inferred without which model loading cannot proceed."); + + schema.Output(i, "Output" + std::to_string(i), "", "T0"); + } else { + schema.Output(i, "Output" + std::to_string(i), "", DataTypeImpl::ToString(onnxruntime::DataTypeImpl::TensorTypeFromONNXEnum(type))); + } } - schema.TypeConstraint("T", DataTypeImpl::ToString(DataTypeImpl::AllTensorTypes()), "all types"); schema.SetDomain(domain->domain_); schema.SinceVersion(1); schema.AllowUncheckedAttributes(); @@ -149,14 +163,31 @@ common::Status CreateCustomRegistry(const std::vector& op_do 1 /* baseline opset version */, 1000 /* opset version */)); +#else + // For a minimal build, we may not need any of the ONNX schema stuff but we still need to track + // the type template parameters to be used during the kernel def building step below + for (const auto* op : domain->custom_ops_) { + size_t type_id_counter = 0; + auto input_count = op->GetInputTypeCount(op); + for (size_t i = 0; i < input_count; i++) { + auto type = op->GetInputType(op, i); + if (ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED == type) { // Dynamic typed input + type_constraint_ids[op].push_back("T" + std::to_string(type_id_counter++)); + } + } + } #endif + // create the KernelDef for each op and register it for (const auto* op : domain->custom_ops_) { KernelDefBuilder def_builder; def_builder.SetName(op->GetName(op)) .SetDomain(domain->domain_) - .SinceVersion(1) - .TypeConstraint("T", DataTypeImpl::AllTensorTypes()); + .SinceVersion(1); + + for (auto& id : type_constraint_ids[op]) { + def_builder.TypeConstraint(id, DataTypeImpl::AllTensorTypes()); + } if (const char* provider_type = op->GetExecutionProviderType(op)) { def_builder.Provider(provider_type); @@ -178,4 +209,3 @@ common::Status CreateCustomRegistry(const std::vector& op_do } // namespace onnxruntime #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS) - diff --git a/onnxruntime/test/shared_lib/cuda_ops.cu b/onnxruntime/test/shared_lib/cuda_ops.cu index 4d3e10543b..f1c11a00ff 100644 --- a/onnxruntime/test/shared_lib/cuda_ops.cu +++ b/onnxruntime/test/shared_lib/cuda_ops.cu @@ -7,29 +7,34 @@ using namespace std; -__global__ void cuda_add_impl(int64_t N, float* O, const float* X, const float* Y) { +template +__global__ void cuda_add_impl(int64_t N, T3* O, const T1* X, const T2* Y) { auto offset = threadIdx.x; if (offset < N) { O[offset] = Y[offset] + X[offset]; } } -void cuda_add(int64_t N, float* O, const float* X, const float* Y) { +template +void cuda_add(int64_t N, T3* O, const T1* X, const T2* Y) { cuda_add_impl<<<1, 256, 0, 0>>>(N, O, X, Y); } -template -__global__ void cuda_slice_impl(const T* X , int64_t from, int64_t to, T* Y) { +template +__global__ void cuda_slice_impl(const T* X, int64_t from, int64_t to, T* Y) { auto offset = threadIdx.x; if (offset >= from && offset < to) { Y[offset - from] = X[offset]; } } -template +template void cuda_slice(const T* X, int64_t from, int64_t to, T* Y) { - cuda_slice_impl<<<1, 256, 0, 0>>>(X, from, to, Y); + cuda_slice_impl<<<1, 256, 0, 0>>>(X, from, to, Y); } template void cuda_slice(const float*, int64_t, int64_t, float*); template void cuda_slice(const double*, int64_t, int64_t, double*); + +template void cuda_add(int64_t, float*, const float*, const float*); +template void cuda_add(int64_t, float*, const float*, const double*); diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index 4116cb8f48..cdb755ff7b 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -164,6 +164,7 @@ static constexpr PATH_TYPE OVERRIDABLE_INITIALIZER_MODEL_URI = TSTR("testdata/ov static constexpr PATH_TYPE NAMED_AND_ANON_DIM_PARAM_URI = TSTR("testdata/capi_symbolic_dims.onnx"); static constexpr PATH_TYPE MODEL_WITH_CUSTOM_MODEL_METADATA = TSTR("testdata/model_with_valid_ort_config_json.onnx"); static constexpr PATH_TYPE VARIED_INPUT_CUSTOM_OP_MODEL_URI = TSTR("testdata/VariedInputCustomOp.onnx"); +static constexpr PATH_TYPE VARIED_INPUT_CUSTOM_OP_MODEL_URI_2 = TSTR("testdata/foo_3.onnx"); #ifdef ENABLE_LANGUAGE_INTEROP_OPS static constexpr PATH_TYPE PYOP_FLOAT_MODEL_URI = TSTR("testdata/pyop_1.onnx"); @@ -382,6 +383,66 @@ TEST(CApiTest, varied_input_custom_op_handler) { #endif } +TEST(CApiTest, multiple_varied_input_custom_op_handler) { +#ifdef USE_CUDA + MyCustomOpMultipleDynamicInputs custom_op{onnxruntime::kCudaExecutionProvider}; +#else + MyCustomOpMultipleDynamicInputs custom_op{onnxruntime::kCpuExecutionProvider}; +#endif + + Ort::CustomOpDomain custom_op_domain(""); + custom_op_domain.Add(&custom_op); + + Ort::SessionOptions session_options; + +#ifdef USE_CUDA + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); +#endif + + session_options.Add(custom_op_domain); + Ort::Session session(*ort_env, VARIED_INPUT_CUSTOM_OP_MODEL_URI_2, session_options); + + Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault); + + std::vector ort_inputs; + std::vector input_names; + + // input 0 (float type) + input_names.emplace_back("X"); + std::vector input_0_data = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f}; + std::vector input_0_dims = {3, 2}; + ort_inputs.emplace_back( + Ort::Value::CreateTensor(info, const_cast(input_0_data.data()), + input_0_data.size(), input_0_dims.data(), input_0_dims.size())); + + // input 1 (double type) + input_names.emplace_back("W"); + std::vector input_1_data = {2, 3, 4, 5, 6, 7}; + std::vector input_1_dims = {3, 2}; + ort_inputs.emplace_back( + Ort::Value::CreateTensor(info, const_cast(input_1_data.data()), + input_1_data.size(), input_1_dims.data(), input_1_dims.size())); + + // Run + const char* output_name = "Y"; + auto ort_outputs = session.Run(Ort::RunOptions{}, input_names.data(), ort_inputs.data(), ort_inputs.size(), + &output_name, 1); + ASSERT_EQ(ort_outputs.size(), 1u); + + // Validate results + std::vector y_dims = {3, 2}; + std::vector values_y = {3.f, 5.f, 7.f, 9.f, 11.f, 13.f}; + auto type_info = ort_outputs[0].GetTensorTypeAndShapeInfo(); + ASSERT_EQ(type_info.GetShape(), y_dims); + size_t total_len = type_info.GetElementCount(); + ASSERT_EQ(values_y.size(), total_len); + + float* f = ort_outputs[0].GetTensorMutableData(); + for (size_t i = 0; i != total_len; ++i) { + ASSERT_EQ(values_y[i], f[i]); + } +} + // Tests registration of a custom op of the same name for both CPU and CUDA EPs #ifdef USE_CUDA TEST(CApiTest, RegisterCustomOpForCPUAndCUDA) { diff --git a/onnxruntime/test/shared_lib/utils.cc b/onnxruntime/test/shared_lib/utils.cc index cfa7c7139b..97cda33b1a 100644 --- a/onnxruntime/test/shared_lib/utils.cc +++ b/onnxruntime/test/shared_lib/utils.cc @@ -5,7 +5,8 @@ #ifdef USE_CUDA #include -void cuda_add(int64_t, float*, const float*, const float*); +template +void cuda_add(int64_t, T3*, const T1*, const T2*); #endif void MyCustomKernel::Compute(OrtKernelContext* context) { @@ -34,3 +35,33 @@ void MyCustomKernel::Compute(OrtKernelContext* context) { } #endif } + +void MyCustomKernelMultipleDynamicInputs::Compute(OrtKernelContext* context) { + // Setup inputs + const OrtValue* input_X = ort_.KernelContext_GetInput(context, 0); + const OrtValue* input_Y = ort_.KernelContext_GetInput(context, 1); + // Even though this kernel backs an operator where-in both inputs can be any type and need not be homogeneous + // as a proof-of-concept, support the case where-in the first input is of float type and the second input + // is of double type. Users need to extend this logic to handle any arbitrary type should the need arise. + const float* X = ort_.GetTensorData(input_X); + const double* Y = ort_.GetTensorData(input_Y); + + // Setup output + OrtTensorDimensions dimensions(ort_, input_X); + OrtValue* output = ort_.KernelContext_GetOutput(context, 0, dimensions.data(), dimensions.size()); + float* out = ort_.GetTensorMutableData(output); + + OrtTensorTypeAndShapeInfo* output_info = ort_.GetTensorTypeAndShape(output); + int64_t size = ort_.GetTensorShapeElementCount(output_info); + ort_.ReleaseTensorTypeAndShapeInfo(output_info); + + // Do computation +#ifdef USE_CUDA + cuda_add(size, out, X, Y); + cudaStreamSynchronize(nullptr); +#else + for (int64_t i = 0; i < size; i++) { + out[i] = static_cast(X[i] + Y[i]); + } +#endif +} \ No newline at end of file diff --git a/onnxruntime/test/shared_lib/utils.h b/onnxruntime/test/shared_lib/utils.h index 560ead8708..aa934b69b6 100644 --- a/onnxruntime/test/shared_lib/utils.h +++ b/onnxruntime/test/shared_lib/utils.h @@ -35,7 +35,10 @@ struct MyCustomOp : Ort::CustomOpBase { const char* GetExecutionProviderType() const { return provider_; }; size_t GetInputTypeCount() const { return 2; }; - ONNXTensorElementDataType GetInputType(size_t /*index*/) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; }; + ONNXTensorElementDataType GetInputType(size_t /*index*/) const { + // Both the inputs need to be necessarily of float type + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; + }; size_t GetOutputTypeCount() const { return 1; }; ONNXTensorElementDataType GetOutputType(size_t /*index*/) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; }; @@ -43,3 +46,34 @@ struct MyCustomOp : Ort::CustomOpBase { private: const char* provider_; }; + +struct MyCustomKernelMultipleDynamicInputs { + MyCustomKernelMultipleDynamicInputs(Ort::CustomOpApi ort, const OrtKernelInfo* /*info*/) : ort_(ort) { + } + + void Compute(OrtKernelContext* context); + + private: + Ort::CustomOpApi ort_; +}; + +struct MyCustomOpMultipleDynamicInputs : Ort::CustomOpBase { + explicit MyCustomOpMultipleDynamicInputs(const char* provider) : provider_(provider) {} + + void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const { return new MyCustomKernelMultipleDynamicInputs(api, info); }; + const char* GetName() const { return "Foo"; }; + const char* GetExecutionProviderType() const { return provider_; }; + + size_t GetInputTypeCount() const { return 2; }; + ONNXTensorElementDataType GetInputType(size_t /*index*/) const { + // Both the inputs are dynamic typed (i.e.) they can be any type and need not be + // homogeneous + return ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; + }; + + size_t GetOutputTypeCount() const { return 1; }; + ONNXTensorElementDataType GetOutputType(size_t /*index*/) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; }; + + private: + const char* provider_; +}; \ No newline at end of file diff --git a/onnxruntime/test/testdata/foo_1.onnx.ort b/onnxruntime/test/testdata/foo_1.onnx.ort index 84b8f93da4..dd7ff5658c 100644 Binary files a/onnxruntime/test/testdata/foo_1.onnx.ort and b/onnxruntime/test/testdata/foo_1.onnx.ort differ diff --git a/onnxruntime/test/testdata/foo_3.onnx b/onnxruntime/test/testdata/foo_3.onnx new file mode 100644 index 0000000000..3293a799e4 Binary files /dev/null and b/onnxruntime/test/testdata/foo_3.onnx differ