diff --git a/onnxruntime/core/session/custom_ops.cc b/onnxruntime/core/session/custom_ops.cc
index 187e69b54d..bc4feafa58 100644
--- a/onnxruntime/core/session/custom_ops.cc
+++ b/onnxruntime/core/session/custom_ops.cc
@@ -101,6 +101,9 @@ common::Status CreateCustomRegistry(const std::vector<OrtCustomOpDomain*>& op_do
   for (const auto& domain : op_domains) {
     // Create an OpSchema for each op and register them
 
+    // Container to hold type template parameters
+    std::unordered_map<const OrtCustomOp*, std::vector<std::string>> type_constraint_ids;
+
 #if !defined(ORT_MINIMAL_BUILD)
     // Domain is not empty - add it to the DomainToVersion ONNX map
     // If domain is empty, it is assumed to be part of the ONNX domain
@@ -119,25 +122,36 @@ common::Status CreateCustomRegistry(const std::vector<OrtCustomOpDomain*>& op_do
     for (const auto* op : domain->custom_ops_) {
       ONNX_NAMESPACE::OpSchema schema(op->GetName(op), "custom op registered at runtime", 0);
 
+      size_t type_id_counter = 0;
       auto input_count = op->GetInputTypeCount(op);
       for (size_t i = 0; i < input_count; i++) {
         auto type = op->GetInputType(op, i);
-        schema.Input(i, "Input" + std::to_string(i), "",
-                     ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED == type
-                         ? "T"
-                         : DataTypeImpl::ToString(onnxruntime::DataTypeImpl::TensorTypeFromONNXEnum(type)));
+        if (ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED == type) {  // Dynamic typed input
+          schema.Input(i, "Input" + std::to_string(i), "", "T" + std::to_string(type_id_counter));
+          schema.TypeConstraint("T" + std::to_string(type_id_counter), DataTypeImpl::ToString(DataTypeImpl::AllTensorTypes()), "all types");
+          type_constraint_ids[op].push_back("T" + std::to_string(type_id_counter++));
+        } else {
+          schema.Input(i, "Input" + std::to_string(i), "", DataTypeImpl::ToString(onnxruntime::DataTypeImpl::TensorTypeFromONNXEnum(type)));
+        }
       }
 
       auto output_count = op->GetOutputTypeCount(op);
       for (size_t i = 0; i < output_count; i++) {
         auto type = op->GetOutputType(op, i);
-        schema.Output(i, "Output" + std::to_string(i), "",
-                      ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED == type
-                          ? "T"
-                          : DataTypeImpl::ToString(onnxruntime::DataTypeImpl::TensorTypeFromONNXEnum(type)));
+        if (ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED == type) {  // Dynamic typed output
+          ORT_ENFORCE(type_id_counter == 1,
+                      "There must be one (and only one) dynamic typed input to the custom op. "
+                      "Its type info at runtime will be used to infer the type info of this dynamic typed output "
+                      "which is required for the success of the model loading step. "
+                      "More than one dynamic typed inputs are currently not supported as differing types at runtime means the output type "
+                      "cannot be inferred without which model loading cannot proceed.");
+
+          schema.Output(i, "Output" + std::to_string(i), "", "T0");
+        } else {
+          schema.Output(i, "Output" + std::to_string(i), "", DataTypeImpl::ToString(onnxruntime::DataTypeImpl::TensorTypeFromONNXEnum(type)));
+        }
       }
 
-      schema.TypeConstraint("T", DataTypeImpl::ToString(DataTypeImpl::AllTensorTypes()), "all types");
       schema.SetDomain(domain->domain_);
       schema.SinceVersion(1);
       schema.AllowUncheckedAttributes();
@@ -149,14 +163,31 @@ common::Status CreateCustomRegistry(const std::vector<OrtCustomOpDomain*>& op_do
                                               1 /* baseline opset version */,
                                               1000 /* opset version */));
 
+#else
+    // For a minimal build, we may not need any of the ONNX schema stuff but we still need to track
+    // the type template parameters to be used during the kernel def building step below
+    for (const auto* op : domain->custom_ops_) {
+      size_t type_id_counter = 0;
+      auto input_count = op->GetInputTypeCount(op);
+      for (size_t i = 0; i < input_count; i++) {
+        auto type = op->GetInputType(op, i);
+        if (ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED == type) {  // Dynamic typed input
+          type_constraint_ids[op].push_back("T" + std::to_string(type_id_counter++));
+        }
+      }
+    }
 #endif
+
     // create the KernelDef for each op and register it
     for (const auto* op : domain->custom_ops_) {
       KernelDefBuilder def_builder;
       def_builder.SetName(op->GetName(op))
           .SetDomain(domain->domain_)
-          .SinceVersion(1)
-          .TypeConstraint("T", DataTypeImpl::AllTensorTypes());
+          .SinceVersion(1);
+
+      for (auto& id : type_constraint_ids[op]) {
+        def_builder.TypeConstraint(id, DataTypeImpl::AllTensorTypes());
+      }
 
       if (const char* provider_type = op->GetExecutionProviderType(op)) {
         def_builder.Provider(provider_type);
@@ -178,4 +209,3 @@ common::Status CreateCustomRegistry(const std::vector<OrtCustomOpDomain*>& op_do
 
 }  // namespace onnxruntime
 #endif  // !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS)
-
diff --git a/onnxruntime/test/shared_lib/cuda_ops.cu b/onnxruntime/test/shared_lib/cuda_ops.cu
index 4d3e10543b..f1c11a00ff 100644
--- a/onnxruntime/test/shared_lib/cuda_ops.cu
+++ b/onnxruntime/test/shared_lib/cuda_ops.cu
@@ -7,29 +7,34 @@
 
 using namespace std;
 
-__global__ void cuda_add_impl(int64_t N, float* O, const float* X, const float* Y) {
+template <typename T1, typename T2, typename T3>
+__global__ void cuda_add_impl(int64_t N, T3* O, const T1* X, const T2* Y) {
   auto offset = threadIdx.x;
   if (offset < N) {
     O[offset] = Y[offset] + X[offset];
   }
 }
 
-void cuda_add(int64_t N, float* O, const float* X, const float* Y) {
+template <typename T1, typename T2, typename T3>
+void cuda_add(int64_t N, T3* O, const T1* X, const T2* Y) {
   cuda_add_impl<<<1, 256, 0, 0>>>(N, O, X, Y);
 }
 
-template<typename T>
-__global__ void cuda_slice_impl(const T* X , int64_t from, int64_t to, T* Y) {
+template <typename T>
+__global__ void cuda_slice_impl(const T* X, int64_t from, int64_t to, T* Y) {
   auto offset = threadIdx.x;
   if (offset >= from && offset < to) {
     Y[offset - from] = X[offset];
   }
 }
 
-template<typename T>
+template <typename T>
 void cuda_slice(const T* X, int64_t from, int64_t to, T* Y) {
-    cuda_slice_impl<T><<<1, 256, 0, 0>>>(X, from, to, Y);
+  cuda_slice_impl<T><<<1, 256, 0, 0>>>(X, from, to, Y);
 }
 
 template void cuda_slice(const float*, int64_t, int64_t, float*);
 template void cuda_slice(const double*, int64_t, int64_t, double*);
+
+template void cuda_add(int64_t, float*, const float*, const float*);
+template void cuda_add(int64_t, float*, const float*, const double*);
diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc
index 4116cb8f48..cdb755ff7b 100644
--- a/onnxruntime/test/shared_lib/test_inference.cc
+++ b/onnxruntime/test/shared_lib/test_inference.cc
@@ -164,6 +164,7 @@ static constexpr PATH_TYPE OVERRIDABLE_INITIALIZER_MODEL_URI = TSTR("testdata/ov
 static constexpr PATH_TYPE NAMED_AND_ANON_DIM_PARAM_URI = TSTR("testdata/capi_symbolic_dims.onnx");
 static constexpr PATH_TYPE MODEL_WITH_CUSTOM_MODEL_METADATA = TSTR("testdata/model_with_valid_ort_config_json.onnx");
 static constexpr PATH_TYPE VARIED_INPUT_CUSTOM_OP_MODEL_URI = TSTR("testdata/VariedInputCustomOp.onnx");
+static constexpr PATH_TYPE VARIED_INPUT_CUSTOM_OP_MODEL_URI_2 = TSTR("testdata/foo_3.onnx");
 
 #ifdef ENABLE_LANGUAGE_INTEROP_OPS
 static constexpr PATH_TYPE PYOP_FLOAT_MODEL_URI = TSTR("testdata/pyop_1.onnx");
@@ -382,6 +383,66 @@ TEST(CApiTest, varied_input_custom_op_handler) {
 #endif
 }
 
+TEST(CApiTest, multiple_varied_input_custom_op_handler) {
+#ifdef USE_CUDA
+  MyCustomOpMultipleDynamicInputs custom_op{onnxruntime::kCudaExecutionProvider};
+#else
+  MyCustomOpMultipleDynamicInputs custom_op{onnxruntime::kCpuExecutionProvider};
+#endif
+
+  Ort::CustomOpDomain custom_op_domain("");
+  custom_op_domain.Add(&custom_op);
+
+  Ort::SessionOptions session_options;
+
+#ifdef USE_CUDA
+  Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
+#endif
+
+  session_options.Add(custom_op_domain);
+  Ort::Session session(*ort_env, VARIED_INPUT_CUSTOM_OP_MODEL_URI_2, session_options);
+
+  Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);
+
+  std::vector<Ort::Value> ort_inputs;
+  std::vector<const char*> input_names;
+
+  // input 0 (float type)
+  input_names.emplace_back("X");
+  std::vector<float> input_0_data = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
+  std::vector<int64_t> input_0_dims = {3, 2};
+  ort_inputs.emplace_back(
+      Ort::Value::CreateTensor<float>(info, const_cast<float*>(input_0_data.data()),
+                                      input_0_data.size(), input_0_dims.data(), input_0_dims.size()));
+
+  // input 1 (double type)
+  input_names.emplace_back("W");
+  std::vector<double> input_1_data = {2, 3, 4, 5, 6, 7};
+  std::vector<int64_t> input_1_dims = {3, 2};
+  ort_inputs.emplace_back(
+      Ort::Value::CreateTensor<double>(info, const_cast<double*>(input_1_data.data()),
+                                       input_1_data.size(), input_1_dims.data(), input_1_dims.size()));
+
+  // Run
+  const char* output_name = "Y";
+  auto ort_outputs = session.Run(Ort::RunOptions{}, input_names.data(), ort_inputs.data(), ort_inputs.size(),
+                                 &output_name, 1);
+  ASSERT_EQ(ort_outputs.size(), 1u);
+
+  // Validate results
+  std::vector<int64_t> y_dims = {3, 2};
+  std::vector<float> values_y = {3.f, 5.f, 7.f, 9.f, 11.f, 13.f};
+  auto type_info = ort_outputs[0].GetTensorTypeAndShapeInfo();
+  ASSERT_EQ(type_info.GetShape(), y_dims);
+  size_t total_len = type_info.GetElementCount();
+  ASSERT_EQ(values_y.size(), total_len);
+
+  float* f = ort_outputs[0].GetTensorMutableData<float>();
+  for (size_t i = 0; i != total_len; ++i) {
+    ASSERT_EQ(values_y[i], f[i]);
+  }
+}
+
 // Tests registration of a custom op of the same name for both CPU and CUDA EPs
 #ifdef USE_CUDA
 TEST(CApiTest, RegisterCustomOpForCPUAndCUDA) {
diff --git a/onnxruntime/test/shared_lib/utils.cc b/onnxruntime/test/shared_lib/utils.cc
index cfa7c7139b..97cda33b1a 100644
--- a/onnxruntime/test/shared_lib/utils.cc
+++ b/onnxruntime/test/shared_lib/utils.cc
@@ -5,7 +5,8 @@
 
 #ifdef USE_CUDA
 #include <cuda_runtime.h>
-void cuda_add(int64_t, float*, const float*, const float*);
+template <typename T1, typename T2, typename T3>
+void cuda_add(int64_t, T3*, const T1*, const T2*);
 #endif
 
 void MyCustomKernel::Compute(OrtKernelContext* context) {
@@ -34,3 +35,33 @@ void MyCustomKernel::Compute(OrtKernelContext* context) {
   }
 #endif
 }
+
+void MyCustomKernelMultipleDynamicInputs::Compute(OrtKernelContext* context) {
+  // Setup inputs
+  const OrtValue* input_X = ort_.KernelContext_GetInput(context, 0);
+  const OrtValue* input_Y = ort_.KernelContext_GetInput(context, 1);
+  // Even though this kernel backs an operator where-in both inputs can be any type and need not be homogeneous
+  // as a proof-of-concept, support the case where-in the first input is of float type and the second input
+  // is of double type. Users need to extend this logic to handle any arbitrary type should the need arise.
+  const float* X = ort_.GetTensorData<float>(input_X);
+  const double* Y = ort_.GetTensorData<double>(input_Y);
+
+  // Setup output
+  OrtTensorDimensions dimensions(ort_, input_X);
+  OrtValue* output = ort_.KernelContext_GetOutput(context, 0, dimensions.data(), dimensions.size());
+  float* out = ort_.GetTensorMutableData<float>(output);
+
+  OrtTensorTypeAndShapeInfo* output_info = ort_.GetTensorTypeAndShape(output);
+  int64_t size = ort_.GetTensorShapeElementCount(output_info);
+  ort_.ReleaseTensorTypeAndShapeInfo(output_info);
+
+  // Do computation
+#ifdef USE_CUDA
+  cuda_add(size, out, X, Y);
+  cudaStreamSynchronize(nullptr);
+#else
+  for (int64_t i = 0; i < size; i++) {
+    out[i] = static_cast<float>(X[i] + Y[i]);
+  }
+#endif
+}
\ No newline at end of file
diff --git a/onnxruntime/test/shared_lib/utils.h b/onnxruntime/test/shared_lib/utils.h
index 560ead8708..aa934b69b6 100644
--- a/onnxruntime/test/shared_lib/utils.h
+++ b/onnxruntime/test/shared_lib/utils.h
@@ -35,7 +35,10 @@ struct MyCustomOp : Ort::CustomOpBase<MyCustomOp, MyCustomKernel> {
   const char* GetExecutionProviderType() const { return provider_; };
 
   size_t GetInputTypeCount() const { return 2; };
-  ONNXTensorElementDataType GetInputType(size_t /*index*/) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; };
+  ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
+    // Both the inputs need to be necessarily of float type
+    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+  };
 
   size_t GetOutputTypeCount() const { return 1; };
   ONNXTensorElementDataType GetOutputType(size_t /*index*/) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; };
@@ -43,3 +46,34 @@ struct MyCustomOp : Ort::CustomOpBase<MyCustomOp, MyCustomKernel> {
  private:
   const char* provider_;
 };
+
+struct MyCustomKernelMultipleDynamicInputs {
+  MyCustomKernelMultipleDynamicInputs(Ort::CustomOpApi ort, const OrtKernelInfo* /*info*/) : ort_(ort) {
+  }
+
+  void Compute(OrtKernelContext* context);
+
+ private:
+  Ort::CustomOpApi ort_;
+};
+
+struct MyCustomOpMultipleDynamicInputs : Ort::CustomOpBase<MyCustomOpMultipleDynamicInputs, MyCustomKernelMultipleDynamicInputs> {
+  explicit MyCustomOpMultipleDynamicInputs(const char* provider) : provider_(provider) {}
+
+  void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const { return new MyCustomKernelMultipleDynamicInputs(api, info); };
+  const char* GetName() const { return "Foo"; };
+  const char* GetExecutionProviderType() const { return provider_; };
+
+  size_t GetInputTypeCount() const { return 2; };
+  ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
+    // Both the inputs are dynamic typed (i.e.) they can be any type and need not be
+    // homogeneous
+    return ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED;
+  };
+
+  size_t GetOutputTypeCount() const { return 1; };
+  ONNXTensorElementDataType GetOutputType(size_t /*index*/) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; };
+
+ private:
+  const char* provider_;
+};
\ No newline at end of file
diff --git a/onnxruntime/test/testdata/foo_1.onnx.ort b/onnxruntime/test/testdata/foo_1.onnx.ort
index 84b8f93da4..dd7ff5658c 100644
Binary files a/onnxruntime/test/testdata/foo_1.onnx.ort and b/onnxruntime/test/testdata/foo_1.onnx.ort differ
diff --git a/onnxruntime/test/testdata/foo_3.onnx b/onnxruntime/test/testdata/foo_3.onnx
new file mode 100644
index 0000000000..3293a799e4
Binary files /dev/null and b/onnxruntime/test/testdata/foo_3.onnx differ